TensorBoard
Safetensors
English
long_speech_qwen2audio
FastLongSpeech / trainer_state.json
guoshoutao's picture
Upload folder using huggingface_hub
567160a verified
raw
history blame
283 kB
{
"best_metric": 0.443807452917099,
"best_model_checkpoint": "CTCLLMs_self_tokenizer/checkpoints/LongSpeech_CTC-Shrink_augment_data_self_tokenizer_addMLS_projector_restore/checkpoint-30000",
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 31479,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006353441977191143,
"grad_norm": 45.06840896606445,
"learning_rate": 3.597883597883598e-06,
"loss": 72.1477,
"step": 20
},
{
"epoch": 0.0012706883954382287,
"grad_norm": 56.45563507080078,
"learning_rate": 7.830687830687831e-06,
"loss": 71.8917,
"step": 40
},
{
"epoch": 0.001906032593157343,
"grad_norm": 62.59088897705078,
"learning_rate": 1.1851851851851853e-05,
"loss": 71.7764,
"step": 60
},
{
"epoch": 0.0025413767908764573,
"grad_norm": 75.64707946777344,
"learning_rate": 1.6084656084656086e-05,
"loss": 70.9277,
"step": 80
},
{
"epoch": 0.003176720988595572,
"grad_norm": 73.5933837890625,
"learning_rate": 2.031746031746032e-05,
"loss": 68.0688,
"step": 100
},
{
"epoch": 0.003812065186314686,
"grad_norm": 77.9434814453125,
"learning_rate": 2.4550264550264552e-05,
"loss": 65.4844,
"step": 120
},
{
"epoch": 0.004447409384033801,
"grad_norm": 81.92144775390625,
"learning_rate": 2.8783068783068785e-05,
"loss": 61.2486,
"step": 140
},
{
"epoch": 0.005082753581752915,
"grad_norm": 91.82105255126953,
"learning_rate": 3.3015873015873014e-05,
"loss": 55.9783,
"step": 160
},
{
"epoch": 0.005718097779472029,
"grad_norm": 103.17108917236328,
"learning_rate": 3.724867724867725e-05,
"loss": 51.7487,
"step": 180
},
{
"epoch": 0.006353441977191144,
"grad_norm": 98.97240447998047,
"learning_rate": 4.148148148148148e-05,
"loss": 45.0213,
"step": 200
},
{
"epoch": 0.006988786174910258,
"grad_norm": 81.4900894165039,
"learning_rate": 4.5714285714285716e-05,
"loss": 38.3125,
"step": 220
},
{
"epoch": 0.007624130372629372,
"grad_norm": 71.47420501708984,
"learning_rate": 4.9947089947089946e-05,
"loss": 33.2395,
"step": 240
},
{
"epoch": 0.008259474570348486,
"grad_norm": 63.618309020996094,
"learning_rate": 5.417989417989419e-05,
"loss": 28.4421,
"step": 260
},
{
"epoch": 0.008894818768067601,
"grad_norm": 58.004974365234375,
"learning_rate": 5.841269841269842e-05,
"loss": 25.048,
"step": 280
},
{
"epoch": 0.009530162965786714,
"grad_norm": 46.489200592041016,
"learning_rate": 6.264550264550265e-05,
"loss": 21.9312,
"step": 300
},
{
"epoch": 0.01016550716350583,
"grad_norm": 37.90148162841797,
"learning_rate": 6.687830687830688e-05,
"loss": 19.0696,
"step": 320
},
{
"epoch": 0.010800851361224944,
"grad_norm": 36.47368240356445,
"learning_rate": 7.111111111111112e-05,
"loss": 17.0151,
"step": 340
},
{
"epoch": 0.011436195558944057,
"grad_norm": 32.80181884765625,
"learning_rate": 7.534391534391536e-05,
"loss": 15.5522,
"step": 360
},
{
"epoch": 0.012071539756663172,
"grad_norm": 25.543760299682617,
"learning_rate": 7.957671957671958e-05,
"loss": 14.1982,
"step": 380
},
{
"epoch": 0.012706883954382287,
"grad_norm": 22.31871223449707,
"learning_rate": 8.380952380952382e-05,
"loss": 13.2314,
"step": 400
},
{
"epoch": 0.0133422281521014,
"grad_norm": 18.374950408935547,
"learning_rate": 8.804232804232805e-05,
"loss": 12.4637,
"step": 420
},
{
"epoch": 0.013977572349820515,
"grad_norm": 18.497610092163086,
"learning_rate": 9.227513227513229e-05,
"loss": 11.9765,
"step": 440
},
{
"epoch": 0.01461291654753963,
"grad_norm": 14.529912948608398,
"learning_rate": 9.650793650793651e-05,
"loss": 11.2678,
"step": 460
},
{
"epoch": 0.015248260745258743,
"grad_norm": 12.937056541442871,
"learning_rate": 0.00010074074074074073,
"loss": 10.6223,
"step": 480
},
{
"epoch": 0.015883604942977858,
"grad_norm": 12.284934043884277,
"learning_rate": 0.00010497354497354497,
"loss": 10.189,
"step": 500
},
{
"epoch": 0.016518949140696973,
"grad_norm": 9.824132919311523,
"learning_rate": 0.0001092063492063492,
"loss": 9.8138,
"step": 520
},
{
"epoch": 0.017154293338416088,
"grad_norm": 8.129488945007324,
"learning_rate": 0.00011343915343915343,
"loss": 9.4242,
"step": 540
},
{
"epoch": 0.017789637536135203,
"grad_norm": 9.27999496459961,
"learning_rate": 0.00011767195767195766,
"loss": 9.1365,
"step": 560
},
{
"epoch": 0.018424981733854314,
"grad_norm": 5.250537872314453,
"learning_rate": 0.00012190476190476193,
"loss": 8.8276,
"step": 580
},
{
"epoch": 0.01906032593157343,
"grad_norm": 5.430091381072998,
"learning_rate": 0.00012613756613756615,
"loss": 8.5892,
"step": 600
},
{
"epoch": 0.019695670129292544,
"grad_norm": 3.3930234909057617,
"learning_rate": 0.0001303703703703704,
"loss": 8.3652,
"step": 620
},
{
"epoch": 0.02033101432701166,
"grad_norm": 2.841287136077881,
"learning_rate": 0.00013460317460317462,
"loss": 8.1527,
"step": 640
},
{
"epoch": 0.020966358524730774,
"grad_norm": 2.188707113265991,
"learning_rate": 0.00013883597883597885,
"loss": 7.9891,
"step": 660
},
{
"epoch": 0.02160170272244989,
"grad_norm": 2.6337716579437256,
"learning_rate": 0.0001430687830687831,
"loss": 7.8345,
"step": 680
},
{
"epoch": 0.022237046920169,
"grad_norm": 1.7390124797821045,
"learning_rate": 0.00014730158730158732,
"loss": 7.6817,
"step": 700
},
{
"epoch": 0.022872391117888115,
"grad_norm": 1.6422362327575684,
"learning_rate": 0.00015153439153439154,
"loss": 7.5748,
"step": 720
},
{
"epoch": 0.02350773531560723,
"grad_norm": 1.6876453161239624,
"learning_rate": 0.0001557671957671958,
"loss": 7.3896,
"step": 740
},
{
"epoch": 0.024143079513326345,
"grad_norm": 1.230586290359497,
"learning_rate": 0.00016,
"loss": 7.3337,
"step": 760
},
{
"epoch": 0.02477842371104546,
"grad_norm": 1.2059415578842163,
"learning_rate": 0.00016423280423280424,
"loss": 7.2545,
"step": 780
},
{
"epoch": 0.025413767908764574,
"grad_norm": 1.5651260614395142,
"learning_rate": 0.00016846560846560849,
"loss": 7.1927,
"step": 800
},
{
"epoch": 0.02604911210648369,
"grad_norm": 2.234393358230591,
"learning_rate": 0.0001726984126984127,
"loss": 7.1617,
"step": 820
},
{
"epoch": 0.0266844563042028,
"grad_norm": 1.6703732013702393,
"learning_rate": 0.00017693121693121696,
"loss": 7.093,
"step": 840
},
{
"epoch": 0.027319800501921915,
"grad_norm": 0.796870231628418,
"learning_rate": 0.00018116402116402118,
"loss": 7.0105,
"step": 860
},
{
"epoch": 0.02795514469964103,
"grad_norm": 1.0919573307037354,
"learning_rate": 0.0001853968253968254,
"loss": 6.9911,
"step": 880
},
{
"epoch": 0.028590488897360145,
"grad_norm": 1.3225408792495728,
"learning_rate": 0.00018962962962962965,
"loss": 6.9353,
"step": 900
},
{
"epoch": 0.02922583309507926,
"grad_norm": 0.9445711970329285,
"learning_rate": 0.00019386243386243388,
"loss": 6.9075,
"step": 920
},
{
"epoch": 0.029861177292798375,
"grad_norm": 1.0021796226501465,
"learning_rate": 0.0001980952380952381,
"loss": 6.8545,
"step": 940
},
{
"epoch": 0.030496521490517486,
"grad_norm": 1.147709608078003,
"learning_rate": 0.00019999993595464,
"loss": 6.8145,
"step": 960
},
{
"epoch": 0.0311318656882366,
"grad_norm": 1.4438824653625488,
"learning_rate": 0.00019999949134260042,
"loss": 6.7156,
"step": 980
},
{
"epoch": 0.031767209885955716,
"grad_norm": 1.4000093936920166,
"learning_rate": 0.0001999986232924222,
"loss": 6.6363,
"step": 1000
},
{
"epoch": 0.031767209885955716,
"eval_loss": 6.87591028213501,
"eval_runtime": 46.4669,
"eval_samples_per_second": 58.17,
"eval_steps_per_second": 29.096,
"step": 1000
},
{
"epoch": 0.03240255408367483,
"grad_norm": 2.151993989944458,
"learning_rate": 0.00019999733180778103,
"loss": 6.5176,
"step": 1020
},
{
"epoch": 0.033037898281393946,
"grad_norm": 1.611135721206665,
"learning_rate": 0.00019999561689414561,
"loss": 6.4132,
"step": 1040
},
{
"epoch": 0.03367324247911306,
"grad_norm": 2.1010184288024902,
"learning_rate": 0.00019999347855877755,
"loss": 6.2465,
"step": 1060
},
{
"epoch": 0.034308586676832176,
"grad_norm": 1.5021122694015503,
"learning_rate": 0.0001999909168107314,
"loss": 6.1662,
"step": 1080
},
{
"epoch": 0.03494393087455129,
"grad_norm": 1.4672967195510864,
"learning_rate": 0.0001999879316608547,
"loss": 6.0509,
"step": 1100
},
{
"epoch": 0.035579275072270405,
"grad_norm": 1.4146413803100586,
"learning_rate": 0.0001999845231217877,
"loss": 5.9012,
"step": 1120
},
{
"epoch": 0.03621461926998951,
"grad_norm": 1.252382755279541,
"learning_rate": 0.00019998069120796358,
"loss": 5.815,
"step": 1140
},
{
"epoch": 0.03684996346770863,
"grad_norm": 1.6317933797836304,
"learning_rate": 0.0001999764359356082,
"loss": 5.771,
"step": 1160
},
{
"epoch": 0.03748530766542774,
"grad_norm": 1.2354493141174316,
"learning_rate": 0.0001999717573227401,
"loss": 5.6189,
"step": 1180
},
{
"epoch": 0.03812065186314686,
"grad_norm": 1.1442275047302246,
"learning_rate": 0.0001999666553891704,
"loss": 5.5078,
"step": 1200
},
{
"epoch": 0.03875599606086597,
"grad_norm": 1.3596833944320679,
"learning_rate": 0.0001999611301565027,
"loss": 5.4507,
"step": 1220
},
{
"epoch": 0.03939134025858509,
"grad_norm": 1.5420782566070557,
"learning_rate": 0.00019995518164813315,
"loss": 5.3225,
"step": 1240
},
{
"epoch": 0.0400266844563042,
"grad_norm": 2.335935354232788,
"learning_rate": 0.00019994880988925007,
"loss": 5.3398,
"step": 1260
},
{
"epoch": 0.04066202865402332,
"grad_norm": 1.2030448913574219,
"learning_rate": 0.00019994201490683406,
"loss": 5.2367,
"step": 1280
},
{
"epoch": 0.04129737285174243,
"grad_norm": 1.1881422996520996,
"learning_rate": 0.00019993479672965783,
"loss": 5.2073,
"step": 1300
},
{
"epoch": 0.04193271704946155,
"grad_norm": 1.2961896657943726,
"learning_rate": 0.00019992715538828609,
"loss": 5.157,
"step": 1320
},
{
"epoch": 0.04256806124718066,
"grad_norm": 0.9343932271003723,
"learning_rate": 0.00019991909091507525,
"loss": 5.0156,
"step": 1340
},
{
"epoch": 0.04320340544489978,
"grad_norm": 0.9654686450958252,
"learning_rate": 0.00019991060334417364,
"loss": 5.054,
"step": 1360
},
{
"epoch": 0.04383874964261889,
"grad_norm": 1.4537482261657715,
"learning_rate": 0.00019990169271152098,
"loss": 4.9824,
"step": 1380
},
{
"epoch": 0.044474093840338,
"grad_norm": 1.0155112743377686,
"learning_rate": 0.00019989235905484853,
"loss": 4.8496,
"step": 1400
},
{
"epoch": 0.045109438038057115,
"grad_norm": 0.8903729915618896,
"learning_rate": 0.00019988260241367875,
"loss": 4.8407,
"step": 1420
},
{
"epoch": 0.04574478223577623,
"grad_norm": 1.0020333528518677,
"learning_rate": 0.00019987242282932518,
"loss": 4.7753,
"step": 1440
},
{
"epoch": 0.046380126433495344,
"grad_norm": 1.2074095010757446,
"learning_rate": 0.0001998618203448923,
"loss": 4.6939,
"step": 1460
},
{
"epoch": 0.04701547063121446,
"grad_norm": 2.5281686782836914,
"learning_rate": 0.00019985079500527527,
"loss": 4.6567,
"step": 1480
},
{
"epoch": 0.047650814828933574,
"grad_norm": 1.257580280303955,
"learning_rate": 0.00019983934685715982,
"loss": 4.5615,
"step": 1500
},
{
"epoch": 0.04828615902665269,
"grad_norm": 1.5581581592559814,
"learning_rate": 0.00019982747594902203,
"loss": 4.6081,
"step": 1520
},
{
"epoch": 0.048921503224371804,
"grad_norm": 1.029440999031067,
"learning_rate": 0.0001998151823311281,
"loss": 4.491,
"step": 1540
},
{
"epoch": 0.04955684742209092,
"grad_norm": 0.9729529023170471,
"learning_rate": 0.0001998024660555342,
"loss": 4.4692,
"step": 1560
},
{
"epoch": 0.050192191619810034,
"grad_norm": 1.1230270862579346,
"learning_rate": 0.00019978932717608613,
"loss": 4.3839,
"step": 1580
},
{
"epoch": 0.05082753581752915,
"grad_norm": 1.048663854598999,
"learning_rate": 0.0001997757657484192,
"loss": 4.3907,
"step": 1600
},
{
"epoch": 0.051462880015248263,
"grad_norm": 1.2080233097076416,
"learning_rate": 0.000199761781829958,
"loss": 4.3147,
"step": 1620
},
{
"epoch": 0.05209822421296738,
"grad_norm": 1.1026450395584106,
"learning_rate": 0.000199747375479916,
"loss": 4.2496,
"step": 1640
},
{
"epoch": 0.052733568410686486,
"grad_norm": 1.037937879562378,
"learning_rate": 0.00019973254675929554,
"loss": 4.2614,
"step": 1660
},
{
"epoch": 0.0533689126084056,
"grad_norm": 1.1000276803970337,
"learning_rate": 0.00019971729573088742,
"loss": 4.1367,
"step": 1680
},
{
"epoch": 0.054004256806124716,
"grad_norm": 1.4259387254714966,
"learning_rate": 0.0001997016224592706,
"loss": 4.1126,
"step": 1700
},
{
"epoch": 0.05463960100384383,
"grad_norm": 1.2918739318847656,
"learning_rate": 0.00019968552701081203,
"loss": 4.0945,
"step": 1720
},
{
"epoch": 0.055274945201562946,
"grad_norm": 1.0148296356201172,
"learning_rate": 0.00019966900945366634,
"loss": 3.9981,
"step": 1740
},
{
"epoch": 0.05591028939928206,
"grad_norm": 1.4177788496017456,
"learning_rate": 0.0001996520698577755,
"loss": 3.9247,
"step": 1760
},
{
"epoch": 0.056545633597001176,
"grad_norm": 1.1384249925613403,
"learning_rate": 0.00019963470829486858,
"loss": 3.9204,
"step": 1780
},
{
"epoch": 0.05718097779472029,
"grad_norm": 1.2175607681274414,
"learning_rate": 0.0001996169248384615,
"loss": 3.9023,
"step": 1800
},
{
"epoch": 0.057816321992439405,
"grad_norm": 1.7040660381317139,
"learning_rate": 0.0001995987195638565,
"loss": 3.8349,
"step": 1820
},
{
"epoch": 0.05845166619015852,
"grad_norm": 1.4229464530944824,
"learning_rate": 0.0001995800925481421,
"loss": 3.7969,
"step": 1840
},
{
"epoch": 0.059087010387877635,
"grad_norm": 1.1412523984909058,
"learning_rate": 0.0001995610438701925,
"loss": 3.6494,
"step": 1860
},
{
"epoch": 0.05972235458559675,
"grad_norm": 1.3119606971740723,
"learning_rate": 0.00019954157361066764,
"loss": 3.6137,
"step": 1880
},
{
"epoch": 0.06035769878331586,
"grad_norm": 1.260469675064087,
"learning_rate": 0.0001995216818520123,
"loss": 3.5703,
"step": 1900
},
{
"epoch": 0.06099304298103497,
"grad_norm": 1.6222745180130005,
"learning_rate": 0.00019950136867845627,
"loss": 3.4526,
"step": 1920
},
{
"epoch": 0.06162838717875409,
"grad_norm": 1.399109125137329,
"learning_rate": 0.00019948063417601369,
"loss": 3.4467,
"step": 1940
},
{
"epoch": 0.0622637313764732,
"grad_norm": 1.1804718971252441,
"learning_rate": 0.00019945947843248276,
"loss": 3.3017,
"step": 1960
},
{
"epoch": 0.06289907557419232,
"grad_norm": 1.1146492958068848,
"learning_rate": 0.0001994379015374455,
"loss": 3.2564,
"step": 1980
},
{
"epoch": 0.06353441977191143,
"grad_norm": 1.3201006650924683,
"learning_rate": 0.00019941590358226713,
"loss": 3.2076,
"step": 2000
},
{
"epoch": 0.06353441977191143,
"eval_loss": 3.1886417865753174,
"eval_runtime": 45.0925,
"eval_samples_per_second": 59.943,
"eval_steps_per_second": 29.983,
"step": 2000
},
{
"epoch": 0.06416976396963055,
"grad_norm": 1.4352892637252808,
"learning_rate": 0.00019939348466009588,
"loss": 3.1246,
"step": 2020
},
{
"epoch": 0.06480510816734966,
"grad_norm": 1.4391227960586548,
"learning_rate": 0.0001993706448658625,
"loss": 3.1187,
"step": 2040
},
{
"epoch": 0.06544045236506878,
"grad_norm": 1.2951711416244507,
"learning_rate": 0.0001993473842962798,
"loss": 3.0175,
"step": 2060
},
{
"epoch": 0.06607579656278789,
"grad_norm": 1.559552550315857,
"learning_rate": 0.00019932370304984255,
"loss": 2.8894,
"step": 2080
},
{
"epoch": 0.066711140760507,
"grad_norm": 1.2822929620742798,
"learning_rate": 0.00019929960122682655,
"loss": 2.8483,
"step": 2100
},
{
"epoch": 0.06734648495822612,
"grad_norm": 1.4227052927017212,
"learning_rate": 0.00019927507892928873,
"loss": 2.8691,
"step": 2120
},
{
"epoch": 0.06798182915594524,
"grad_norm": 1.643660306930542,
"learning_rate": 0.00019925013626106633,
"loss": 2.8578,
"step": 2140
},
{
"epoch": 0.06861717335366435,
"grad_norm": 1.1360414028167725,
"learning_rate": 0.00019922477332777664,
"loss": 2.7094,
"step": 2160
},
{
"epoch": 0.06925251755138347,
"grad_norm": 1.224853277206421,
"learning_rate": 0.00019919899023681658,
"loss": 2.6953,
"step": 2180
},
{
"epoch": 0.06988786174910258,
"grad_norm": 1.093682885169983,
"learning_rate": 0.00019917278709736212,
"loss": 2.6255,
"step": 2200
},
{
"epoch": 0.0705232059468217,
"grad_norm": 1.238864779472351,
"learning_rate": 0.00019914616402036796,
"loss": 2.5893,
"step": 2220
},
{
"epoch": 0.07115855014454081,
"grad_norm": 1.1016559600830078,
"learning_rate": 0.00019911912111856688,
"loss": 2.4743,
"step": 2240
},
{
"epoch": 0.07179389434225993,
"grad_norm": 1.12881600856781,
"learning_rate": 0.00019909165850646941,
"loss": 2.5057,
"step": 2260
},
{
"epoch": 0.07242923853997903,
"grad_norm": 1.216238021850586,
"learning_rate": 0.00019906377630036338,
"loss": 2.4624,
"step": 2280
},
{
"epoch": 0.07306458273769814,
"grad_norm": 1.1429589986801147,
"learning_rate": 0.00019903547461831323,
"loss": 2.3835,
"step": 2300
},
{
"epoch": 0.07369992693541726,
"grad_norm": 0.9367678165435791,
"learning_rate": 0.00019900675358015967,
"loss": 2.3971,
"step": 2320
},
{
"epoch": 0.07433527113313637,
"grad_norm": 1.0869677066802979,
"learning_rate": 0.00019897761330751922,
"loss": 2.3241,
"step": 2340
},
{
"epoch": 0.07497061533085549,
"grad_norm": 0.958840548992157,
"learning_rate": 0.0001989480539237835,
"loss": 2.2828,
"step": 2360
},
{
"epoch": 0.0756059595285746,
"grad_norm": 0.9724891781806946,
"learning_rate": 0.00019891807555411884,
"loss": 2.2858,
"step": 2380
},
{
"epoch": 0.07624130372629372,
"grad_norm": 1.045828104019165,
"learning_rate": 0.00019888767832546572,
"loss": 2.2949,
"step": 2400
},
{
"epoch": 0.07687664792401283,
"grad_norm": 1.0283712148666382,
"learning_rate": 0.0001988568623665383,
"loss": 2.2034,
"step": 2420
},
{
"epoch": 0.07751199212173195,
"grad_norm": 1.0930371284484863,
"learning_rate": 0.00019882562780782376,
"loss": 2.2283,
"step": 2440
},
{
"epoch": 0.07814733631945106,
"grad_norm": 0.892132580280304,
"learning_rate": 0.00019879397478158177,
"loss": 2.1872,
"step": 2460
},
{
"epoch": 0.07878268051717018,
"grad_norm": 1.0107035636901855,
"learning_rate": 0.00019876190342184402,
"loss": 2.1874,
"step": 2480
},
{
"epoch": 0.07941802471488929,
"grad_norm": 1.1195555925369263,
"learning_rate": 0.00019872941386441358,
"loss": 2.0823,
"step": 2500
},
{
"epoch": 0.0800533689126084,
"grad_norm": 1.2803888320922852,
"learning_rate": 0.0001986965062468643,
"loss": 2.0905,
"step": 2520
},
{
"epoch": 0.08068871311032752,
"grad_norm": 1.0955703258514404,
"learning_rate": 0.00019866318070854033,
"loss": 2.0645,
"step": 2540
},
{
"epoch": 0.08132405730804664,
"grad_norm": 1.117477297782898,
"learning_rate": 0.00019862943739055536,
"loss": 2.0259,
"step": 2560
},
{
"epoch": 0.08195940150576575,
"grad_norm": 0.9660820960998535,
"learning_rate": 0.0001985952764357923,
"loss": 1.9881,
"step": 2580
},
{
"epoch": 0.08259474570348486,
"grad_norm": 0.9186820983886719,
"learning_rate": 0.0001985606979889023,
"loss": 1.9571,
"step": 2600
},
{
"epoch": 0.08323008990120398,
"grad_norm": 1.1236801147460938,
"learning_rate": 0.00019852570219630445,
"loss": 1.9506,
"step": 2620
},
{
"epoch": 0.0838654340989231,
"grad_norm": 0.9719575047492981,
"learning_rate": 0.0001984902892061851,
"loss": 1.9359,
"step": 2640
},
{
"epoch": 0.08450077829664221,
"grad_norm": 1.3401118516921997,
"learning_rate": 0.00019845445916849704,
"loss": 1.9707,
"step": 2660
},
{
"epoch": 0.08513612249436132,
"grad_norm": 0.980446457862854,
"learning_rate": 0.00019841821223495916,
"loss": 1.88,
"step": 2680
},
{
"epoch": 0.08577146669208044,
"grad_norm": 1.178143858909607,
"learning_rate": 0.00019838154855905552,
"loss": 1.8629,
"step": 2700
},
{
"epoch": 0.08640681088979955,
"grad_norm": 0.9232170581817627,
"learning_rate": 0.00019834446829603494,
"loss": 1.8467,
"step": 2720
},
{
"epoch": 0.08704215508751867,
"grad_norm": 1.7343891859054565,
"learning_rate": 0.00019830697160291017,
"loss": 1.8194,
"step": 2740
},
{
"epoch": 0.08767749928523778,
"grad_norm": 0.878983199596405,
"learning_rate": 0.0001982690586384573,
"loss": 1.8232,
"step": 2760
},
{
"epoch": 0.0883128434829569,
"grad_norm": 1.0917317867279053,
"learning_rate": 0.00019823072956321513,
"loss": 1.7668,
"step": 2780
},
{
"epoch": 0.088948187680676,
"grad_norm": 1.0753387212753296,
"learning_rate": 0.00019819198453948443,
"loss": 1.7968,
"step": 2800
},
{
"epoch": 0.08958353187839511,
"grad_norm": 1.0904388427734375,
"learning_rate": 0.00019815282373132718,
"loss": 1.7834,
"step": 2820
},
{
"epoch": 0.09021887607611423,
"grad_norm": 0.9622576236724854,
"learning_rate": 0.00019811324730456607,
"loss": 1.7773,
"step": 2840
},
{
"epoch": 0.09085422027383334,
"grad_norm": 0.8677240610122681,
"learning_rate": 0.0001980732554267836,
"loss": 1.7322,
"step": 2860
},
{
"epoch": 0.09148956447155246,
"grad_norm": 1.0953987836837769,
"learning_rate": 0.0001980328482673215,
"loss": 1.7123,
"step": 2880
},
{
"epoch": 0.09212490866927157,
"grad_norm": 1.0277127027511597,
"learning_rate": 0.00019799202599727998,
"loss": 1.7558,
"step": 2900
},
{
"epoch": 0.09276025286699069,
"grad_norm": 1.1174383163452148,
"learning_rate": 0.000197950788789517,
"loss": 1.7222,
"step": 2920
},
{
"epoch": 0.0933955970647098,
"grad_norm": 0.9651451706886292,
"learning_rate": 0.00019790913681864747,
"loss": 1.6652,
"step": 2940
},
{
"epoch": 0.09403094126242892,
"grad_norm": 0.9669461250305176,
"learning_rate": 0.00019786707026104265,
"loss": 1.6381,
"step": 2960
},
{
"epoch": 0.09466628546014803,
"grad_norm": 0.9406834244728088,
"learning_rate": 0.0001978245892948293,
"loss": 1.6276,
"step": 2980
},
{
"epoch": 0.09530162965786715,
"grad_norm": 0.9768303632736206,
"learning_rate": 0.0001977816940998889,
"loss": 1.6071,
"step": 3000
},
{
"epoch": 0.09530162965786715,
"eval_loss": 1.5878759622573853,
"eval_runtime": 45.9166,
"eval_samples_per_second": 58.868,
"eval_steps_per_second": 29.445,
"step": 3000
},
{
"epoch": 0.09593697385558626,
"grad_norm": 1.083208441734314,
"learning_rate": 0.00019773838485785702,
"loss": 1.6341,
"step": 3020
},
{
"epoch": 0.09657231805330538,
"grad_norm": 0.9333330988883972,
"learning_rate": 0.00019769466175212244,
"loss": 1.5931,
"step": 3040
},
{
"epoch": 0.0972076622510245,
"grad_norm": 0.9718533754348755,
"learning_rate": 0.00019765052496782638,
"loss": 1.5735,
"step": 3060
},
{
"epoch": 0.09784300644874361,
"grad_norm": 1.2169800996780396,
"learning_rate": 0.00019760597469186184,
"loss": 1.5507,
"step": 3080
},
{
"epoch": 0.09847835064646272,
"grad_norm": 0.9822967648506165,
"learning_rate": 0.00019756101111287257,
"loss": 1.5784,
"step": 3100
},
{
"epoch": 0.09911369484418184,
"grad_norm": 0.9830970168113708,
"learning_rate": 0.0001975156344212525,
"loss": 1.5473,
"step": 3120
},
{
"epoch": 0.09974903904190095,
"grad_norm": 0.8926035761833191,
"learning_rate": 0.00019746984480914484,
"loss": 1.5141,
"step": 3140
},
{
"epoch": 0.10038438323962007,
"grad_norm": 0.8814927339553833,
"learning_rate": 0.00019742364247044125,
"loss": 1.5164,
"step": 3160
},
{
"epoch": 0.10101972743733918,
"grad_norm": 0.8626115322113037,
"learning_rate": 0.00019737702760078105,
"loss": 1.4495,
"step": 3180
},
{
"epoch": 0.1016550716350583,
"grad_norm": 1.0857669115066528,
"learning_rate": 0.00019733000039755036,
"loss": 1.511,
"step": 3200
},
{
"epoch": 0.10229041583277741,
"grad_norm": 0.8834457397460938,
"learning_rate": 0.00019728256105988132,
"loss": 1.4764,
"step": 3220
},
{
"epoch": 0.10292576003049653,
"grad_norm": 0.8241048455238342,
"learning_rate": 0.00019723470978865118,
"loss": 1.4253,
"step": 3240
},
{
"epoch": 0.10356110422821564,
"grad_norm": 0.9844352006912231,
"learning_rate": 0.00019718644678648158,
"loss": 1.4595,
"step": 3260
},
{
"epoch": 0.10419644842593476,
"grad_norm": 0.8982945084571838,
"learning_rate": 0.00019713777225773745,
"loss": 1.3535,
"step": 3280
},
{
"epoch": 0.10483179262365386,
"grad_norm": 1.2204469442367554,
"learning_rate": 0.0001970886864085263,
"loss": 1.4283,
"step": 3300
},
{
"epoch": 0.10546713682137297,
"grad_norm": 1.0676652193069458,
"learning_rate": 0.00019703918944669754,
"loss": 1.3858,
"step": 3320
},
{
"epoch": 0.10610248101909209,
"grad_norm": 1.17191743850708,
"learning_rate": 0.00019698928158184116,
"loss": 1.4426,
"step": 3340
},
{
"epoch": 0.1067378252168112,
"grad_norm": 0.9601316452026367,
"learning_rate": 0.00019693896302528716,
"loss": 1.3621,
"step": 3360
},
{
"epoch": 0.10737316941453032,
"grad_norm": 0.9755037426948547,
"learning_rate": 0.00019688823399010463,
"loss": 1.3901,
"step": 3380
},
{
"epoch": 0.10800851361224943,
"grad_norm": 1.0111849308013916,
"learning_rate": 0.0001968370946911007,
"loss": 1.3748,
"step": 3400
},
{
"epoch": 0.10864385780996855,
"grad_norm": 0.8471179604530334,
"learning_rate": 0.00019678554534481978,
"loss": 1.3227,
"step": 3420
},
{
"epoch": 0.10927920200768766,
"grad_norm": 0.9206441640853882,
"learning_rate": 0.0001967335861695426,
"loss": 1.3493,
"step": 3440
},
{
"epoch": 0.10991454620540678,
"grad_norm": 1.055109977722168,
"learning_rate": 0.0001966812173852852,
"loss": 1.3549,
"step": 3460
},
{
"epoch": 0.11054989040312589,
"grad_norm": 0.995614767074585,
"learning_rate": 0.00019662843921379816,
"loss": 1.3468,
"step": 3480
},
{
"epoch": 0.111185234600845,
"grad_norm": 0.8873100876808167,
"learning_rate": 0.0001965752518785655,
"loss": 1.3129,
"step": 3500
},
{
"epoch": 0.11182057879856412,
"grad_norm": 0.9802286624908447,
"learning_rate": 0.00019652165560480383,
"loss": 1.34,
"step": 3520
},
{
"epoch": 0.11245592299628324,
"grad_norm": 0.9177120923995972,
"learning_rate": 0.00019646765061946133,
"loss": 1.3321,
"step": 3540
},
{
"epoch": 0.11309126719400235,
"grad_norm": 1.0982646942138672,
"learning_rate": 0.00019641323715121692,
"loss": 1.292,
"step": 3560
},
{
"epoch": 0.11372661139172147,
"grad_norm": 1.1567240953445435,
"learning_rate": 0.00019635841543047918,
"loss": 1.3052,
"step": 3580
},
{
"epoch": 0.11436195558944058,
"grad_norm": 0.8516421914100647,
"learning_rate": 0.00019630318568938528,
"loss": 1.3189,
"step": 3600
},
{
"epoch": 0.1149972997871597,
"grad_norm": 0.9710924029350281,
"learning_rate": 0.00019624754816180022,
"loss": 1.2644,
"step": 3620
},
{
"epoch": 0.11563264398487881,
"grad_norm": 0.9252649545669556,
"learning_rate": 0.00019619150308331572,
"loss": 1.2517,
"step": 3640
},
{
"epoch": 0.11626798818259793,
"grad_norm": 0.973948061466217,
"learning_rate": 0.0001961350506912493,
"loss": 1.2651,
"step": 3660
},
{
"epoch": 0.11690333238031704,
"grad_norm": 0.9078177213668823,
"learning_rate": 0.000196078191224643,
"loss": 1.2089,
"step": 3680
},
{
"epoch": 0.11753867657803616,
"grad_norm": 0.8456325531005859,
"learning_rate": 0.0001960209249242628,
"loss": 1.2503,
"step": 3700
},
{
"epoch": 0.11817402077575527,
"grad_norm": 1.2014869451522827,
"learning_rate": 0.00019596325203259722,
"loss": 1.2287,
"step": 3720
},
{
"epoch": 0.11880936497347439,
"grad_norm": 0.903296172618866,
"learning_rate": 0.0001959051727938566,
"loss": 1.1999,
"step": 3740
},
{
"epoch": 0.1194447091711935,
"grad_norm": 0.9159349799156189,
"learning_rate": 0.00019584668745397182,
"loss": 1.2077,
"step": 3760
},
{
"epoch": 0.12008005336891261,
"grad_norm": 1.0457518100738525,
"learning_rate": 0.00019578779626059332,
"loss": 1.2395,
"step": 3780
},
{
"epoch": 0.12071539756663172,
"grad_norm": 0.8328551650047302,
"learning_rate": 0.0001957284994630902,
"loss": 1.2039,
"step": 3800
},
{
"epoch": 0.12135074176435083,
"grad_norm": 0.9112881422042847,
"learning_rate": 0.00019566879731254902,
"loss": 1.1987,
"step": 3820
},
{
"epoch": 0.12198608596206995,
"grad_norm": 2.0256752967834473,
"learning_rate": 0.00019560869006177262,
"loss": 1.1923,
"step": 3840
},
{
"epoch": 0.12262143015978906,
"grad_norm": 0.9714537262916565,
"learning_rate": 0.00019554817796527943,
"loss": 1.1726,
"step": 3860
},
{
"epoch": 0.12325677435750818,
"grad_norm": 0.8522310256958008,
"learning_rate": 0.00019548726127930198,
"loss": 1.1985,
"step": 3880
},
{
"epoch": 0.12389211855522729,
"grad_norm": 0.8728988766670227,
"learning_rate": 0.00019542594026178612,
"loss": 1.1662,
"step": 3900
},
{
"epoch": 0.1245274627529464,
"grad_norm": 0.9155168533325195,
"learning_rate": 0.00019536421517238973,
"loss": 1.1529,
"step": 3920
},
{
"epoch": 0.12516280695066553,
"grad_norm": 1.05704665184021,
"learning_rate": 0.0001953020862724817,
"loss": 1.1415,
"step": 3940
},
{
"epoch": 0.12579815114838463,
"grad_norm": 0.7793872952461243,
"learning_rate": 0.0001952395538251408,
"loss": 1.1387,
"step": 3960
},
{
"epoch": 0.12643349534610376,
"grad_norm": 0.9358331561088562,
"learning_rate": 0.00019517661809515465,
"loss": 1.1816,
"step": 3980
},
{
"epoch": 0.12706883954382286,
"grad_norm": 0.8175097107887268,
"learning_rate": 0.00019511327934901846,
"loss": 1.126,
"step": 4000
},
{
"epoch": 0.12706883954382286,
"eval_loss": 1.1081569194793701,
"eval_runtime": 128.6977,
"eval_samples_per_second": 21.003,
"eval_steps_per_second": 10.505,
"step": 4000
},
{
"epoch": 0.127704183741542,
"grad_norm": 0.9568232893943787,
"learning_rate": 0.000195049537854934,
"loss": 1.1002,
"step": 4020
},
{
"epoch": 0.1283395279392611,
"grad_norm": 0.9011651277542114,
"learning_rate": 0.00019498539388280848,
"loss": 1.129,
"step": 4040
},
{
"epoch": 0.1289748721369802,
"grad_norm": 1.045811653137207,
"learning_rate": 0.00019492084770425327,
"loss": 1.0945,
"step": 4060
},
{
"epoch": 0.12961021633469932,
"grad_norm": 0.8668608069419861,
"learning_rate": 0.00019485589959258292,
"loss": 1.0601,
"step": 4080
},
{
"epoch": 0.13024556053241843,
"grad_norm": 0.9976728558540344,
"learning_rate": 0.00019479054982281393,
"loss": 1.1127,
"step": 4100
},
{
"epoch": 0.13088090473013755,
"grad_norm": 0.9135074019432068,
"learning_rate": 0.00019472479867166354,
"loss": 1.0708,
"step": 4120
},
{
"epoch": 0.13151624892785665,
"grad_norm": 0.8302998542785645,
"learning_rate": 0.0001946586464175486,
"loss": 1.0925,
"step": 4140
},
{
"epoch": 0.13215159312557578,
"grad_norm": 0.9594709277153015,
"learning_rate": 0.0001945920933405844,
"loss": 1.0879,
"step": 4160
},
{
"epoch": 0.13278693732329488,
"grad_norm": 1.3145122528076172,
"learning_rate": 0.00019452513972258352,
"loss": 1.0706,
"step": 4180
},
{
"epoch": 0.133422281521014,
"grad_norm": 1.0521440505981445,
"learning_rate": 0.00019445778584705452,
"loss": 1.1089,
"step": 4200
},
{
"epoch": 0.13405762571873311,
"grad_norm": 1.1046104431152344,
"learning_rate": 0.00019439003199920088,
"loss": 1.0965,
"step": 4220
},
{
"epoch": 0.13469296991645224,
"grad_norm": 1.1228617429733276,
"learning_rate": 0.00019432187846591967,
"loss": 1.0747,
"step": 4240
},
{
"epoch": 0.13532831411417134,
"grad_norm": 0.8399156332015991,
"learning_rate": 0.00019425332553580044,
"loss": 1.0239,
"step": 4260
},
{
"epoch": 0.13596365831189047,
"grad_norm": 0.9118017554283142,
"learning_rate": 0.00019418437349912385,
"loss": 1.0557,
"step": 4280
},
{
"epoch": 0.13659900250960957,
"grad_norm": 1.1154282093048096,
"learning_rate": 0.00019411502264786069,
"loss": 1.0846,
"step": 4300
},
{
"epoch": 0.1372343467073287,
"grad_norm": 0.8457648158073425,
"learning_rate": 0.00019404527327567035,
"loss": 1.0438,
"step": 4320
},
{
"epoch": 0.1378696909050478,
"grad_norm": 0.9336498975753784,
"learning_rate": 0.0001939751256778998,
"loss": 1.0403,
"step": 4340
},
{
"epoch": 0.13850503510276693,
"grad_norm": 0.9318077564239502,
"learning_rate": 0.0001939045801515822,
"loss": 1.0375,
"step": 4360
},
{
"epoch": 0.13914037930048603,
"grad_norm": 0.9146689176559448,
"learning_rate": 0.0001938336369954358,
"loss": 1.0394,
"step": 4380
},
{
"epoch": 0.13977572349820516,
"grad_norm": 1.2244622707366943,
"learning_rate": 0.00019376229650986245,
"loss": 1.0305,
"step": 4400
},
{
"epoch": 0.14041106769592426,
"grad_norm": 0.9721834659576416,
"learning_rate": 0.00019369055899694652,
"loss": 1.0133,
"step": 4420
},
{
"epoch": 0.1410464118936434,
"grad_norm": 0.8538774251937866,
"learning_rate": 0.00019361842476045356,
"loss": 1.0272,
"step": 4440
},
{
"epoch": 0.1416817560913625,
"grad_norm": 0.7733943462371826,
"learning_rate": 0.000193545894105829,
"loss": 1.0328,
"step": 4460
},
{
"epoch": 0.14231710028908162,
"grad_norm": 1.0937755107879639,
"learning_rate": 0.00019347296734019683,
"loss": 1.0501,
"step": 4480
},
{
"epoch": 0.14295244448680072,
"grad_norm": 0.8855345845222473,
"learning_rate": 0.00019339964477235836,
"loss": 0.9979,
"step": 4500
},
{
"epoch": 0.14358778868451985,
"grad_norm": 0.9113184213638306,
"learning_rate": 0.0001933259267127909,
"loss": 0.967,
"step": 4520
},
{
"epoch": 0.14422313288223895,
"grad_norm": 0.9671328663825989,
"learning_rate": 0.00019325181347364643,
"loss": 1.016,
"step": 4540
},
{
"epoch": 0.14485847707995805,
"grad_norm": 0.8655368685722351,
"learning_rate": 0.00019317730536875022,
"loss": 1.0005,
"step": 4560
},
{
"epoch": 0.14549382127767718,
"grad_norm": 0.8673165440559387,
"learning_rate": 0.00019310240271359967,
"loss": 0.9697,
"step": 4580
},
{
"epoch": 0.14612916547539628,
"grad_norm": 1.0993086099624634,
"learning_rate": 0.00019302710582536276,
"loss": 0.9832,
"step": 4600
},
{
"epoch": 0.1467645096731154,
"grad_norm": 1.1561827659606934,
"learning_rate": 0.00019295141502287687,
"loss": 0.9603,
"step": 4620
},
{
"epoch": 0.1473998538708345,
"grad_norm": 1.0052567720413208,
"learning_rate": 0.00019287533062664733,
"loss": 0.9808,
"step": 4640
},
{
"epoch": 0.14803519806855364,
"grad_norm": 0.9202858209609985,
"learning_rate": 0.00019279885295884618,
"loss": 0.9564,
"step": 4660
},
{
"epoch": 0.14867054226627274,
"grad_norm": 0.8606549501419067,
"learning_rate": 0.0001927219823433106,
"loss": 0.9936,
"step": 4680
},
{
"epoch": 0.14930588646399187,
"grad_norm": 0.9188569784164429,
"learning_rate": 0.00019264471910554183,
"loss": 0.9833,
"step": 4700
},
{
"epoch": 0.14994123066171097,
"grad_norm": 0.7773941159248352,
"learning_rate": 0.0001925670635727035,
"loss": 0.9272,
"step": 4720
},
{
"epoch": 0.1505765748594301,
"grad_norm": 0.8689327836036682,
"learning_rate": 0.00019248901607362047,
"loss": 0.9462,
"step": 4740
},
{
"epoch": 0.1512119190571492,
"grad_norm": 0.800255298614502,
"learning_rate": 0.00019241057693877725,
"loss": 0.9222,
"step": 4760
},
{
"epoch": 0.15184726325486833,
"grad_norm": 0.9326597452163696,
"learning_rate": 0.0001923317465003168,
"loss": 0.961,
"step": 4780
},
{
"epoch": 0.15248260745258743,
"grad_norm": 1.072416067123413,
"learning_rate": 0.00019225252509203888,
"loss": 0.9464,
"step": 4800
},
{
"epoch": 0.15311795165030656,
"grad_norm": 0.9187152981758118,
"learning_rate": 0.0001921729130493989,
"loss": 0.9461,
"step": 4820
},
{
"epoch": 0.15375329584802566,
"grad_norm": 0.8737976551055908,
"learning_rate": 0.00019209291070950633,
"loss": 0.8771,
"step": 4840
},
{
"epoch": 0.1543886400457448,
"grad_norm": 0.9321054220199585,
"learning_rate": 0.0001920125184111233,
"loss": 0.9179,
"step": 4860
},
{
"epoch": 0.1550239842434639,
"grad_norm": 0.7673978209495544,
"learning_rate": 0.00019193173649466322,
"loss": 0.8711,
"step": 4880
},
{
"epoch": 0.15565932844118302,
"grad_norm": 1.0326552391052246,
"learning_rate": 0.00019185056530218923,
"loss": 0.9494,
"step": 4900
},
{
"epoch": 0.15629467263890212,
"grad_norm": 0.8184536695480347,
"learning_rate": 0.0001917690051774129,
"loss": 0.9201,
"step": 4920
},
{
"epoch": 0.15693001683662125,
"grad_norm": 0.8319898247718811,
"learning_rate": 0.0001916870564656926,
"loss": 0.9167,
"step": 4940
},
{
"epoch": 0.15756536103434035,
"grad_norm": 1.0563160181045532,
"learning_rate": 0.0001916047195140323,
"loss": 0.8993,
"step": 4960
},
{
"epoch": 0.15820070523205948,
"grad_norm": 0.8466194868087769,
"learning_rate": 0.00019152199467107974,
"loss": 0.9198,
"step": 4980
},
{
"epoch": 0.15883604942977858,
"grad_norm": 1.1115593910217285,
"learning_rate": 0.00019143888228712527,
"loss": 0.8749,
"step": 5000
},
{
"epoch": 0.15883604942977858,
"eval_loss": 0.8843944668769836,
"eval_runtime": 127.8707,
"eval_samples_per_second": 21.139,
"eval_steps_per_second": 10.573,
"step": 5000
},
{
"epoch": 0.1594713936274977,
"grad_norm": 0.9679493308067322,
"learning_rate": 0.00019135538271410022,
"loss": 0.9212,
"step": 5020
},
{
"epoch": 0.1601067378252168,
"grad_norm": 0.8485816121101379,
"learning_rate": 0.0001912714963055754,
"loss": 0.9054,
"step": 5040
},
{
"epoch": 0.16074208202293594,
"grad_norm": 1.0210843086242676,
"learning_rate": 0.0001911872234167597,
"loss": 0.917,
"step": 5060
},
{
"epoch": 0.16137742622065504,
"grad_norm": 1.0072481632232666,
"learning_rate": 0.00019110256440449844,
"loss": 0.9014,
"step": 5080
},
{
"epoch": 0.16201277041837414,
"grad_norm": 0.9833612442016602,
"learning_rate": 0.00019101751962727204,
"loss": 0.891,
"step": 5100
},
{
"epoch": 0.16264811461609327,
"grad_norm": 1.0564861297607422,
"learning_rate": 0.0001909320894451943,
"loss": 0.8581,
"step": 5120
},
{
"epoch": 0.16328345881381237,
"grad_norm": 1.1205075979232788,
"learning_rate": 0.0001908462742200111,
"loss": 0.8884,
"step": 5140
},
{
"epoch": 0.1639188030115315,
"grad_norm": 0.9841699004173279,
"learning_rate": 0.0001907600743150986,
"loss": 0.8815,
"step": 5160
},
{
"epoch": 0.1645541472092506,
"grad_norm": 0.852820098400116,
"learning_rate": 0.00019067349009546197,
"loss": 0.8594,
"step": 5180
},
{
"epoch": 0.16518949140696973,
"grad_norm": 0.8630360960960388,
"learning_rate": 0.00019058652192773372,
"loss": 0.8653,
"step": 5200
},
{
"epoch": 0.16582483560468883,
"grad_norm": 1.0112591981887817,
"learning_rate": 0.00019049917018017207,
"loss": 0.8715,
"step": 5220
},
{
"epoch": 0.16646017980240796,
"grad_norm": 0.9182717204093933,
"learning_rate": 0.00019041143522265948,
"loss": 0.8875,
"step": 5240
},
{
"epoch": 0.16709552400012706,
"grad_norm": 1.190596103668213,
"learning_rate": 0.0001903233174267012,
"loss": 0.9027,
"step": 5260
},
{
"epoch": 0.1677308681978462,
"grad_norm": 0.8345910310745239,
"learning_rate": 0.00019023481716542342,
"loss": 0.8819,
"step": 5280
},
{
"epoch": 0.1683662123955653,
"grad_norm": 0.8964826464653015,
"learning_rate": 0.00019014593481357192,
"loss": 0.845,
"step": 5300
},
{
"epoch": 0.16900155659328442,
"grad_norm": 1.1423965692520142,
"learning_rate": 0.0001900566707475104,
"loss": 0.8463,
"step": 5320
},
{
"epoch": 0.16963690079100352,
"grad_norm": 0.895899772644043,
"learning_rate": 0.00018996702534521888,
"loss": 0.8631,
"step": 5340
},
{
"epoch": 0.17027224498872265,
"grad_norm": 1.0254230499267578,
"learning_rate": 0.00018987699898629208,
"loss": 0.8489,
"step": 5360
},
{
"epoch": 0.17090758918644175,
"grad_norm": 0.9370276927947998,
"learning_rate": 0.00018978659205193794,
"loss": 0.8822,
"step": 5380
},
{
"epoch": 0.17154293338416088,
"grad_norm": 1.1030024290084839,
"learning_rate": 0.00018969580492497577,
"loss": 0.8834,
"step": 5400
},
{
"epoch": 0.17217827758187998,
"grad_norm": 0.9148856997489929,
"learning_rate": 0.00018960463798983494,
"loss": 0.8198,
"step": 5420
},
{
"epoch": 0.1728136217795991,
"grad_norm": 0.8851357102394104,
"learning_rate": 0.00018951309163255288,
"loss": 0.8077,
"step": 5440
},
{
"epoch": 0.1734489659773182,
"grad_norm": 0.9701651334762573,
"learning_rate": 0.00018942116624077386,
"loss": 0.8687,
"step": 5460
},
{
"epoch": 0.17408431017503734,
"grad_norm": 0.9508700966835022,
"learning_rate": 0.00018932886220374696,
"loss": 0.8764,
"step": 5480
},
{
"epoch": 0.17471965437275644,
"grad_norm": 0.9914870858192444,
"learning_rate": 0.00018923617991232466,
"loss": 0.8157,
"step": 5500
},
{
"epoch": 0.17535499857047557,
"grad_norm": 1.010511040687561,
"learning_rate": 0.00018914311975896117,
"loss": 0.839,
"step": 5520
},
{
"epoch": 0.17599034276819467,
"grad_norm": 0.8063015937805176,
"learning_rate": 0.00018904968213771065,
"loss": 0.8308,
"step": 5540
},
{
"epoch": 0.1766256869659138,
"grad_norm": 0.8653827905654907,
"learning_rate": 0.00018895586744422564,
"loss": 0.8304,
"step": 5560
},
{
"epoch": 0.1772610311636329,
"grad_norm": 1.0596357583999634,
"learning_rate": 0.00018886167607575532,
"loss": 0.8346,
"step": 5580
},
{
"epoch": 0.177896375361352,
"grad_norm": 1.0251786708831787,
"learning_rate": 0.00018876710843114398,
"loss": 0.8639,
"step": 5600
},
{
"epoch": 0.17853171955907113,
"grad_norm": 0.8897235989570618,
"learning_rate": 0.00018867216491082905,
"loss": 0.8286,
"step": 5620
},
{
"epoch": 0.17916706375679023,
"grad_norm": 0.8118072748184204,
"learning_rate": 0.00018857684591683967,
"loss": 0.8597,
"step": 5640
},
{
"epoch": 0.17980240795450936,
"grad_norm": 0.8698698878288269,
"learning_rate": 0.0001884811518527949,
"loss": 0.7894,
"step": 5660
},
{
"epoch": 0.18043775215222846,
"grad_norm": 0.8228470087051392,
"learning_rate": 0.00018838508312390192,
"loss": 0.8302,
"step": 5680
},
{
"epoch": 0.1810730963499476,
"grad_norm": 1.1411319971084595,
"learning_rate": 0.00018828864013695448,
"loss": 0.8313,
"step": 5700
},
{
"epoch": 0.1817084405476667,
"grad_norm": 0.8076447248458862,
"learning_rate": 0.00018819182330033103,
"loss": 0.798,
"step": 5720
},
{
"epoch": 0.18234378474538582,
"grad_norm": 0.8669622540473938,
"learning_rate": 0.00018809463302399304,
"loss": 0.7911,
"step": 5740
},
{
"epoch": 0.18297912894310492,
"grad_norm": 0.8435181975364685,
"learning_rate": 0.0001879970697194833,
"loss": 0.7951,
"step": 5760
},
{
"epoch": 0.18361447314082405,
"grad_norm": 1.1023324728012085,
"learning_rate": 0.00018789913379992418,
"loss": 0.8253,
"step": 5780
},
{
"epoch": 0.18424981733854315,
"grad_norm": 0.9319256544113159,
"learning_rate": 0.00018780082568001585,
"loss": 0.7625,
"step": 5800
},
{
"epoch": 0.18488516153626228,
"grad_norm": 0.8259923458099365,
"learning_rate": 0.00018770214577603443,
"loss": 0.8079,
"step": 5820
},
{
"epoch": 0.18552050573398138,
"grad_norm": 0.8953514695167542,
"learning_rate": 0.00018760309450583043,
"loss": 0.7647,
"step": 5840
},
{
"epoch": 0.1861558499317005,
"grad_norm": 0.8347587585449219,
"learning_rate": 0.00018750367228882685,
"loss": 0.8089,
"step": 5860
},
{
"epoch": 0.1867911941294196,
"grad_norm": 0.9788545966148376,
"learning_rate": 0.00018740387954601742,
"loss": 0.7737,
"step": 5880
},
{
"epoch": 0.18742653832713874,
"grad_norm": 0.9509750008583069,
"learning_rate": 0.00018730371669996478,
"loss": 0.8073,
"step": 5900
},
{
"epoch": 0.18806188252485784,
"grad_norm": 0.9388551115989685,
"learning_rate": 0.0001872031841747988,
"loss": 0.7585,
"step": 5920
},
{
"epoch": 0.18869722672257697,
"grad_norm": 0.8342726826667786,
"learning_rate": 0.00018710228239621476,
"loss": 0.8025,
"step": 5940
},
{
"epoch": 0.18933257092029607,
"grad_norm": 1.0455151796340942,
"learning_rate": 0.00018700101179147134,
"loss": 0.7603,
"step": 5960
},
{
"epoch": 0.1899679151180152,
"grad_norm": 0.820931077003479,
"learning_rate": 0.00018689937278938915,
"loss": 0.7972,
"step": 5980
},
{
"epoch": 0.1906032593157343,
"grad_norm": 0.8494334816932678,
"learning_rate": 0.00018679736582034867,
"loss": 0.7663,
"step": 6000
},
{
"epoch": 0.1906032593157343,
"eval_loss": 0.7605160474777222,
"eval_runtime": 45.0866,
"eval_samples_per_second": 59.951,
"eval_steps_per_second": 29.987,
"step": 6000
},
{
"epoch": 0.19123860351345343,
"grad_norm": 0.9915199279785156,
"learning_rate": 0.00018669499131628847,
"loss": 0.7911,
"step": 6020
},
{
"epoch": 0.19187394771117253,
"grad_norm": 1.009752869606018,
"learning_rate": 0.00018659739550293418,
"loss": 0.7791,
"step": 6040
},
{
"epoch": 0.19250929190889166,
"grad_norm": 1.008296012878418,
"learning_rate": 0.00018649430555384115,
"loss": 0.7741,
"step": 6060
},
{
"epoch": 0.19314463610661076,
"grad_norm": 0.9730678200721741,
"learning_rate": 0.0001863908493530077,
"loss": 0.8028,
"step": 6080
},
{
"epoch": 0.19377998030432986,
"grad_norm": 0.8386117815971375,
"learning_rate": 0.0001862870273385091,
"loss": 0.789,
"step": 6100
},
{
"epoch": 0.194415324502049,
"grad_norm": 0.8517867922782898,
"learning_rate": 0.00018618283994996954,
"loss": 0.7472,
"step": 6120
},
{
"epoch": 0.1950506686997681,
"grad_norm": 0.8791770339012146,
"learning_rate": 0.00018607828762856046,
"loss": 0.7871,
"step": 6140
},
{
"epoch": 0.19568601289748722,
"grad_norm": 0.9248822331428528,
"learning_rate": 0.00018597337081699848,
"loss": 0.762,
"step": 6160
},
{
"epoch": 0.19632135709520632,
"grad_norm": 0.8059686422348022,
"learning_rate": 0.00018586808995954367,
"loss": 0.7345,
"step": 6180
},
{
"epoch": 0.19695670129292545,
"grad_norm": 0.7610188126564026,
"learning_rate": 0.00018576244550199758,
"loss": 0.7478,
"step": 6200
},
{
"epoch": 0.19759204549064455,
"grad_norm": 0.7763079404830933,
"learning_rate": 0.00018565643789170144,
"loss": 0.7552,
"step": 6220
},
{
"epoch": 0.19822738968836368,
"grad_norm": 1.1734811067581177,
"learning_rate": 0.00018555006757753418,
"loss": 0.7645,
"step": 6240
},
{
"epoch": 0.19886273388608278,
"grad_norm": 0.7641186714172363,
"learning_rate": 0.00018544333500991053,
"loss": 0.7267,
"step": 6260
},
{
"epoch": 0.1994980780838019,
"grad_norm": 0.8322380781173706,
"learning_rate": 0.00018533624064077922,
"loss": 0.7601,
"step": 6280
},
{
"epoch": 0.200133422281521,
"grad_norm": 0.9059064388275146,
"learning_rate": 0.00018522878492362096,
"loss": 0.7716,
"step": 6300
},
{
"epoch": 0.20076876647924013,
"grad_norm": 0.7728195786476135,
"learning_rate": 0.00018512096831344653,
"loss": 0.7435,
"step": 6320
},
{
"epoch": 0.20140411067695924,
"grad_norm": 0.9880885481834412,
"learning_rate": 0.00018501279126679495,
"loss": 0.7378,
"step": 6340
},
{
"epoch": 0.20203945487467836,
"grad_norm": 0.8192346096038818,
"learning_rate": 0.00018490425424173138,
"loss": 0.7376,
"step": 6360
},
{
"epoch": 0.20267479907239747,
"grad_norm": 1.175627589225769,
"learning_rate": 0.0001847953576978453,
"loss": 0.7672,
"step": 6380
},
{
"epoch": 0.2033101432701166,
"grad_norm": 0.7959802746772766,
"learning_rate": 0.0001846861020962486,
"loss": 0.7331,
"step": 6400
},
{
"epoch": 0.2039454874678357,
"grad_norm": 0.8343777060508728,
"learning_rate": 0.0001845764878995735,
"loss": 0.7142,
"step": 6420
},
{
"epoch": 0.20458083166555482,
"grad_norm": 0.9900172352790833,
"learning_rate": 0.00018446651557197066,
"loss": 0.7819,
"step": 6440
},
{
"epoch": 0.20521617586327393,
"grad_norm": 1.111018180847168,
"learning_rate": 0.00018435618557910725,
"loss": 0.7226,
"step": 6460
},
{
"epoch": 0.20585152006099305,
"grad_norm": 0.9301121830940247,
"learning_rate": 0.00018424549838816492,
"loss": 0.7295,
"step": 6480
},
{
"epoch": 0.20648686425871215,
"grad_norm": 0.894797146320343,
"learning_rate": 0.0001841344544678378,
"loss": 0.7199,
"step": 6500
},
{
"epoch": 0.20712220845643128,
"grad_norm": 1.041779637336731,
"learning_rate": 0.0001840230542883306,
"loss": 0.7213,
"step": 6520
},
{
"epoch": 0.20775755265415038,
"grad_norm": 0.9267428517341614,
"learning_rate": 0.00018391129832135659,
"loss": 0.7463,
"step": 6540
},
{
"epoch": 0.2083928968518695,
"grad_norm": 0.8043299913406372,
"learning_rate": 0.00018379918704013556,
"loss": 0.6909,
"step": 6560
},
{
"epoch": 0.20902824104958861,
"grad_norm": 0.8037667870521545,
"learning_rate": 0.0001836867209193918,
"loss": 0.7307,
"step": 6580
},
{
"epoch": 0.20966358524730772,
"grad_norm": 0.9795257449150085,
"learning_rate": 0.00018357390043535228,
"loss": 0.7625,
"step": 6600
},
{
"epoch": 0.21029892944502684,
"grad_norm": 1.0763206481933594,
"learning_rate": 0.0001834607260657443,
"loss": 0.7457,
"step": 6620
},
{
"epoch": 0.21093427364274595,
"grad_norm": 0.8083770275115967,
"learning_rate": 0.00018334719828979373,
"loss": 0.7398,
"step": 6640
},
{
"epoch": 0.21156961784046507,
"grad_norm": 0.8648799657821655,
"learning_rate": 0.00018323331758822299,
"loss": 0.7392,
"step": 6660
},
{
"epoch": 0.21220496203818418,
"grad_norm": 1.322874903678894,
"learning_rate": 0.0001831190844432488,
"loss": 0.767,
"step": 6680
},
{
"epoch": 0.2128403062359033,
"grad_norm": 0.8415853977203369,
"learning_rate": 0.00018300449933858034,
"loss": 0.7123,
"step": 6700
},
{
"epoch": 0.2134756504336224,
"grad_norm": 0.8832991123199463,
"learning_rate": 0.00018288956275941713,
"loss": 0.7329,
"step": 6720
},
{
"epoch": 0.21411099463134153,
"grad_norm": 0.8079715967178345,
"learning_rate": 0.00018277427519244692,
"loss": 0.6988,
"step": 6740
},
{
"epoch": 0.21474633882906063,
"grad_norm": 0.9029518365859985,
"learning_rate": 0.00018265863712584377,
"loss": 0.6943,
"step": 6760
},
{
"epoch": 0.21538168302677976,
"grad_norm": 0.9082062244415283,
"learning_rate": 0.0001825426490492658,
"loss": 0.7517,
"step": 6780
},
{
"epoch": 0.21601702722449886,
"grad_norm": 0.9031996726989746,
"learning_rate": 0.00018242631145385329,
"loss": 0.7108,
"step": 6800
},
{
"epoch": 0.216652371422218,
"grad_norm": 0.9114848375320435,
"learning_rate": 0.00018230962483222648,
"loss": 0.7151,
"step": 6820
},
{
"epoch": 0.2172877156199371,
"grad_norm": 0.8056477308273315,
"learning_rate": 0.00018219258967848355,
"loss": 0.7154,
"step": 6840
},
{
"epoch": 0.21792305981765622,
"grad_norm": 0.9029595255851746,
"learning_rate": 0.0001820752064881985,
"loss": 0.728,
"step": 6860
},
{
"epoch": 0.21855840401537532,
"grad_norm": 0.9304366707801819,
"learning_rate": 0.00018195747575841905,
"loss": 0.7298,
"step": 6880
},
{
"epoch": 0.21919374821309445,
"grad_norm": 1.2549713850021362,
"learning_rate": 0.00018183939798766452,
"loss": 0.7166,
"step": 6900
},
{
"epoch": 0.21982909241081355,
"grad_norm": 0.8609549403190613,
"learning_rate": 0.0001817209736759238,
"loss": 0.7222,
"step": 6920
},
{
"epoch": 0.22046443660853268,
"grad_norm": 0.9668901562690735,
"learning_rate": 0.00018160220332465315,
"loss": 0.706,
"step": 6940
},
{
"epoch": 0.22109978080625178,
"grad_norm": 0.9426187872886658,
"learning_rate": 0.00018148308743677407,
"loss": 0.7549,
"step": 6960
},
{
"epoch": 0.2217351250039709,
"grad_norm": 1.0274590253829956,
"learning_rate": 0.00018136362651667123,
"loss": 0.7118,
"step": 6980
},
{
"epoch": 0.22237046920169,
"grad_norm": 1.0056123733520508,
"learning_rate": 0.00018124382107019028,
"loss": 0.7284,
"step": 7000
},
{
"epoch": 0.22237046920169,
"eval_loss": 0.6820850968360901,
"eval_runtime": 44.1137,
"eval_samples_per_second": 61.274,
"eval_steps_per_second": 30.648,
"step": 7000
},
{
"epoch": 0.22300581339940914,
"grad_norm": 1.01372492313385,
"learning_rate": 0.0001811236716046358,
"loss": 0.7306,
"step": 7020
},
{
"epoch": 0.22364115759712824,
"grad_norm": 0.8217781782150269,
"learning_rate": 0.000181003178628769,
"loss": 0.7216,
"step": 7040
},
{
"epoch": 0.22427650179484737,
"grad_norm": 0.9484082460403442,
"learning_rate": 0.00018088234265280573,
"loss": 0.7164,
"step": 7060
},
{
"epoch": 0.22491184599256647,
"grad_norm": 1.2144994735717773,
"learning_rate": 0.0001807672312378185,
"loss": 0.7248,
"step": 7080
},
{
"epoch": 0.22554719019028557,
"grad_norm": 0.9574259519577026,
"learning_rate": 0.00018064572788467363,
"loss": 0.689,
"step": 7100
},
{
"epoch": 0.2261825343880047,
"grad_norm": 0.7626876831054688,
"learning_rate": 0.00018052998338935085,
"loss": 0.748,
"step": 7120
},
{
"epoch": 0.2268178785857238,
"grad_norm": 0.8534376621246338,
"learning_rate": 0.00018040781461538648,
"loss": 0.6947,
"step": 7140
},
{
"epoch": 0.22745322278344293,
"grad_norm": 1.0029544830322266,
"learning_rate": 0.00018028530536233676,
"loss": 0.7319,
"step": 7160
},
{
"epoch": 0.22808856698116203,
"grad_norm": 0.925713300704956,
"learning_rate": 0.00018016245614895518,
"loss": 0.7092,
"step": 7180
},
{
"epoch": 0.22872391117888116,
"grad_norm": 0.8006899952888489,
"learning_rate": 0.00018003926749543488,
"loss": 0.6879,
"step": 7200
},
{
"epoch": 0.22935925537660026,
"grad_norm": 0.8886255025863647,
"learning_rate": 0.00017991573992340616,
"loss": 0.6784,
"step": 7220
},
{
"epoch": 0.2299945995743194,
"grad_norm": 0.8108293414115906,
"learning_rate": 0.00017979187395593459,
"loss": 0.7094,
"step": 7240
},
{
"epoch": 0.2306299437720385,
"grad_norm": 1.0475900173187256,
"learning_rate": 0.00017966767011751858,
"loss": 0.696,
"step": 7260
},
{
"epoch": 0.23126528796975762,
"grad_norm": 0.9214044809341431,
"learning_rate": 0.0001795431289340872,
"loss": 0.7125,
"step": 7280
},
{
"epoch": 0.23190063216747672,
"grad_norm": 0.996101975440979,
"learning_rate": 0.00017941825093299802,
"loss": 0.6635,
"step": 7300
},
{
"epoch": 0.23253597636519585,
"grad_norm": 0.9577082991600037,
"learning_rate": 0.00017929303664303482,
"loss": 0.6753,
"step": 7320
},
{
"epoch": 0.23317132056291495,
"grad_norm": 1.0278524160385132,
"learning_rate": 0.00017916748659440533,
"loss": 0.7024,
"step": 7340
},
{
"epoch": 0.23380666476063408,
"grad_norm": 0.758007287979126,
"learning_rate": 0.00017904160131873906,
"loss": 0.6877,
"step": 7360
},
{
"epoch": 0.23444200895835318,
"grad_norm": 0.8926889300346375,
"learning_rate": 0.00017891538134908502,
"loss": 0.7123,
"step": 7380
},
{
"epoch": 0.2350773531560723,
"grad_norm": 0.8747749924659729,
"learning_rate": 0.00017878882721990936,
"loss": 0.656,
"step": 7400
},
{
"epoch": 0.2357126973537914,
"grad_norm": 1.012324333190918,
"learning_rate": 0.00017866193946709327,
"loss": 0.6885,
"step": 7420
},
{
"epoch": 0.23634804155151054,
"grad_norm": 0.7973082065582275,
"learning_rate": 0.00017853471862793068,
"loss": 0.6627,
"step": 7440
},
{
"epoch": 0.23698338574922964,
"grad_norm": 0.8259735107421875,
"learning_rate": 0.00017840716524112582,
"loss": 0.6861,
"step": 7460
},
{
"epoch": 0.23761872994694877,
"grad_norm": 0.7817295789718628,
"learning_rate": 0.00017827927984679113,
"loss": 0.6808,
"step": 7480
},
{
"epoch": 0.23825407414466787,
"grad_norm": 0.8139945864677429,
"learning_rate": 0.00017815106298644495,
"loss": 0.6891,
"step": 7500
},
{
"epoch": 0.238889418342387,
"grad_norm": 1.0507733821868896,
"learning_rate": 0.00017802251520300906,
"loss": 0.6936,
"step": 7520
},
{
"epoch": 0.2395247625401061,
"grad_norm": 0.929937481880188,
"learning_rate": 0.0001778936370408066,
"loss": 0.687,
"step": 7540
},
{
"epoch": 0.24016010673782523,
"grad_norm": 1.0632777214050293,
"learning_rate": 0.00017776442904555962,
"loss": 0.6656,
"step": 7560
},
{
"epoch": 0.24079545093554433,
"grad_norm": 1.1247339248657227,
"learning_rate": 0.00017763489176438686,
"loss": 0.6645,
"step": 7580
},
{
"epoch": 0.24143079513326343,
"grad_norm": 0.8897901773452759,
"learning_rate": 0.00017750502574580135,
"loss": 0.6832,
"step": 7600
},
{
"epoch": 0.24206613933098256,
"grad_norm": 0.9285283088684082,
"learning_rate": 0.00017737483153970816,
"loss": 0.6841,
"step": 7620
},
{
"epoch": 0.24270148352870166,
"grad_norm": 0.8733476400375366,
"learning_rate": 0.00017724430969740196,
"loss": 0.6567,
"step": 7640
},
{
"epoch": 0.2433368277264208,
"grad_norm": 0.9532790184020996,
"learning_rate": 0.0001771134607715649,
"loss": 0.6795,
"step": 7660
},
{
"epoch": 0.2439721719241399,
"grad_norm": 1.0881035327911377,
"learning_rate": 0.00017698228531626398,
"loss": 0.693,
"step": 7680
},
{
"epoch": 0.24460751612185902,
"grad_norm": 1.0936851501464844,
"learning_rate": 0.00017685078388694897,
"loss": 0.6852,
"step": 7700
},
{
"epoch": 0.24524286031957812,
"grad_norm": 1.0439817905426025,
"learning_rate": 0.0001767189570404499,
"loss": 0.6746,
"step": 7720
},
{
"epoch": 0.24587820451729725,
"grad_norm": 0.8599082231521606,
"learning_rate": 0.00017658680533497477,
"loss": 0.6719,
"step": 7740
},
{
"epoch": 0.24651354871501635,
"grad_norm": 0.9633190035820007,
"learning_rate": 0.00017645432933010712,
"loss": 0.7091,
"step": 7760
},
{
"epoch": 0.24714889291273548,
"grad_norm": 0.8989465236663818,
"learning_rate": 0.00017632152958680378,
"loss": 0.6649,
"step": 7780
},
{
"epoch": 0.24778423711045458,
"grad_norm": 0.8468721508979797,
"learning_rate": 0.00017618840666739228,
"loss": 0.6789,
"step": 7800
},
{
"epoch": 0.2484195813081737,
"grad_norm": 0.8482181429862976,
"learning_rate": 0.00017605496113556882,
"loss": 0.6902,
"step": 7820
},
{
"epoch": 0.2490549255058928,
"grad_norm": 0.8012595176696777,
"learning_rate": 0.00017592119355639544,
"loss": 0.6733,
"step": 7840
},
{
"epoch": 0.24969026970361194,
"grad_norm": 0.8117650151252747,
"learning_rate": 0.00017578710449629804,
"loss": 0.6916,
"step": 7860
},
{
"epoch": 0.25032561390133107,
"grad_norm": 0.9711939096450806,
"learning_rate": 0.00017565269452306364,
"loss": 0.6701,
"step": 7880
},
{
"epoch": 0.25096095809905017,
"grad_norm": 0.8234876394271851,
"learning_rate": 0.00017551796420583833,
"loss": 0.62,
"step": 7900
},
{
"epoch": 0.25159630229676927,
"grad_norm": 0.8263707756996155,
"learning_rate": 0.00017538967420545803,
"loss": 0.6907,
"step": 7920
},
{
"epoch": 0.25223164649448837,
"grad_norm": 1.2548505067825317,
"learning_rate": 0.00017525432085959138,
"loss": 0.6644,
"step": 7940
},
{
"epoch": 0.2528669906922075,
"grad_norm": 1.1948567628860474,
"learning_rate": 0.00017511864885660835,
"loss": 0.6609,
"step": 7960
},
{
"epoch": 0.25350233488992663,
"grad_norm": 0.9310169219970703,
"learning_rate": 0.0001749826587709989,
"loss": 0.6757,
"step": 7980
},
{
"epoch": 0.25413767908764573,
"grad_norm": 0.8832531571388245,
"learning_rate": 0.00017484635117859983,
"loss": 0.6552,
"step": 8000
},
{
"epoch": 0.25413767908764573,
"eval_loss": 0.6333429217338562,
"eval_runtime": 44.406,
"eval_samples_per_second": 60.87,
"eval_steps_per_second": 30.446,
"step": 8000
},
{
"epoch": 0.25477302328536483,
"grad_norm": 0.7624004483222961,
"learning_rate": 0.00017470972665659245,
"loss": 0.6567,
"step": 8020
},
{
"epoch": 0.255408367483084,
"grad_norm": 0.9134401082992554,
"learning_rate": 0.00017457278578350002,
"loss": 0.6681,
"step": 8040
},
{
"epoch": 0.2560437116808031,
"grad_norm": 0.9597674608230591,
"learning_rate": 0.00017443552913918534,
"loss": 0.6818,
"step": 8060
},
{
"epoch": 0.2566790558785222,
"grad_norm": 0.961934506893158,
"learning_rate": 0.00017429795730484836,
"loss": 0.6833,
"step": 8080
},
{
"epoch": 0.2573144000762413,
"grad_norm": 0.9118033647537231,
"learning_rate": 0.00017416007086302367,
"loss": 0.6607,
"step": 8100
},
{
"epoch": 0.2579497442739604,
"grad_norm": 0.8447214961051941,
"learning_rate": 0.00017402187039757805,
"loss": 0.6409,
"step": 8120
},
{
"epoch": 0.25858508847167955,
"grad_norm": 1.010040044784546,
"learning_rate": 0.0001738833564937079,
"loss": 0.6761,
"step": 8140
},
{
"epoch": 0.25922043266939865,
"grad_norm": 0.8686466217041016,
"learning_rate": 0.00017374452973793693,
"loss": 0.6575,
"step": 8160
},
{
"epoch": 0.25985577686711775,
"grad_norm": 1.0445839166641235,
"learning_rate": 0.00017360539071811356,
"loss": 0.667,
"step": 8180
},
{
"epoch": 0.26049112106483685,
"grad_norm": 1.1015607118606567,
"learning_rate": 0.00017346594002340843,
"loss": 0.6468,
"step": 8200
},
{
"epoch": 0.261126465262556,
"grad_norm": 1.4550483226776123,
"learning_rate": 0.00017332617824431204,
"loss": 0.6642,
"step": 8220
},
{
"epoch": 0.2617618094602751,
"grad_norm": 0.8968580961227417,
"learning_rate": 0.000173186105972632,
"loss": 0.6695,
"step": 8240
},
{
"epoch": 0.2623971536579942,
"grad_norm": 0.9802786111831665,
"learning_rate": 0.00017304572380149078,
"loss": 0.6516,
"step": 8260
},
{
"epoch": 0.2630324978557133,
"grad_norm": 0.8785617351531982,
"learning_rate": 0.00017290503232532305,
"loss": 0.6857,
"step": 8280
},
{
"epoch": 0.26366784205343247,
"grad_norm": 0.8675135970115662,
"learning_rate": 0.00017276403213987323,
"loss": 0.6493,
"step": 8300
},
{
"epoch": 0.26430318625115157,
"grad_norm": 0.8159687519073486,
"learning_rate": 0.0001726227238421929,
"loss": 0.6445,
"step": 8320
},
{
"epoch": 0.26493853044887067,
"grad_norm": 0.8598359823226929,
"learning_rate": 0.00017248110803063833,
"loss": 0.6515,
"step": 8340
},
{
"epoch": 0.26557387464658977,
"grad_norm": 1.0304324626922607,
"learning_rate": 0.00017233918530486792,
"loss": 0.6431,
"step": 8360
},
{
"epoch": 0.2662092188443089,
"grad_norm": 0.933110773563385,
"learning_rate": 0.0001722040749834389,
"loss": 0.6958,
"step": 8380
},
{
"epoch": 0.266844563042028,
"grad_norm": 0.9690568447113037,
"learning_rate": 0.0001720615555046345,
"loss": 0.5922,
"step": 8400
},
{
"epoch": 0.26747990723974713,
"grad_norm": 0.9293822646141052,
"learning_rate": 0.0001719187308881687,
"loss": 0.6407,
"step": 8420
},
{
"epoch": 0.26811525143746623,
"grad_norm": 0.8957870602607727,
"learning_rate": 0.00017177560173881846,
"loss": 0.662,
"step": 8440
},
{
"epoch": 0.2687505956351854,
"grad_norm": 1.0288225412368774,
"learning_rate": 0.0001716321686626503,
"loss": 0.6395,
"step": 8460
},
{
"epoch": 0.2693859398329045,
"grad_norm": 0.838657021522522,
"learning_rate": 0.00017148843226701764,
"loss": 0.6313,
"step": 8480
},
{
"epoch": 0.2700212840306236,
"grad_norm": 0.8575971722602844,
"learning_rate": 0.00017134439316055834,
"loss": 0.6655,
"step": 8500
},
{
"epoch": 0.2706566282283427,
"grad_norm": 0.9840354919433594,
"learning_rate": 0.00017120005195319195,
"loss": 0.6646,
"step": 8520
},
{
"epoch": 0.27129197242606184,
"grad_norm": 0.8279704451560974,
"learning_rate": 0.00017105540925611737,
"loss": 0.6259,
"step": 8540
},
{
"epoch": 0.27192731662378095,
"grad_norm": 1.0609900951385498,
"learning_rate": 0.00017091046568180996,
"loss": 0.6561,
"step": 8560
},
{
"epoch": 0.27256266082150005,
"grad_norm": 0.890514612197876,
"learning_rate": 0.0001707652218440193,
"loss": 0.6324,
"step": 8580
},
{
"epoch": 0.27319800501921915,
"grad_norm": 0.9357948303222656,
"learning_rate": 0.0001706196783577663,
"loss": 0.6116,
"step": 8600
},
{
"epoch": 0.27383334921693825,
"grad_norm": 0.9577456116676331,
"learning_rate": 0.0001704738358393407,
"loss": 0.6764,
"step": 8620
},
{
"epoch": 0.2744686934146574,
"grad_norm": 0.834900438785553,
"learning_rate": 0.0001703276949062985,
"loss": 0.6324,
"step": 8640
},
{
"epoch": 0.2751040376123765,
"grad_norm": 0.8283354043960571,
"learning_rate": 0.00017018125617745933,
"loss": 0.6187,
"step": 8660
},
{
"epoch": 0.2757393818100956,
"grad_norm": 0.854200541973114,
"learning_rate": 0.00017003452027290373,
"loss": 0.6294,
"step": 8680
},
{
"epoch": 0.2763747260078147,
"grad_norm": 0.8695046901702881,
"learning_rate": 0.00016988748781397064,
"loss": 0.6377,
"step": 8700
},
{
"epoch": 0.27701007020553386,
"grad_norm": 0.7802212238311768,
"learning_rate": 0.00016974015942325475,
"loss": 0.6051,
"step": 8720
},
{
"epoch": 0.27764541440325297,
"grad_norm": 1.0842890739440918,
"learning_rate": 0.00016959253572460382,
"loss": 0.6352,
"step": 8740
},
{
"epoch": 0.27828075860097207,
"grad_norm": 0.8472367525100708,
"learning_rate": 0.0001694446173431161,
"loss": 0.5907,
"step": 8760
},
{
"epoch": 0.27891610279869117,
"grad_norm": 0.8548029661178589,
"learning_rate": 0.0001692964049051376,
"loss": 0.6434,
"step": 8780
},
{
"epoch": 0.2795514469964103,
"grad_norm": 0.9771581888198853,
"learning_rate": 0.00016914789903825945,
"loss": 0.6381,
"step": 8800
},
{
"epoch": 0.2801867911941294,
"grad_norm": 0.9199798703193665,
"learning_rate": 0.0001689991003713154,
"loss": 0.6589,
"step": 8820
},
{
"epoch": 0.2808221353918485,
"grad_norm": 1.0753369331359863,
"learning_rate": 0.00016885000953437894,
"loss": 0.6413,
"step": 8840
},
{
"epoch": 0.2814574795895676,
"grad_norm": 1.0925753116607666,
"learning_rate": 0.00016870062715876075,
"loss": 0.6234,
"step": 8860
},
{
"epoch": 0.2820928237872868,
"grad_norm": 1.0023586750030518,
"learning_rate": 0.00016855095387700598,
"loss": 0.6104,
"step": 8880
},
{
"epoch": 0.2827281679850059,
"grad_norm": 0.9077417254447937,
"learning_rate": 0.00016840099032289162,
"loss": 0.602,
"step": 8900
},
{
"epoch": 0.283363512182725,
"grad_norm": 0.8238940238952637,
"learning_rate": 0.00016825073713142374,
"loss": 0.6157,
"step": 8920
},
{
"epoch": 0.2839988563804441,
"grad_norm": 1.111948847770691,
"learning_rate": 0.000168100194938835,
"loss": 0.6092,
"step": 8940
},
{
"epoch": 0.28463420057816324,
"grad_norm": 1.0630967617034912,
"learning_rate": 0.0001679493643825816,
"loss": 0.5904,
"step": 8960
},
{
"epoch": 0.28526954477588234,
"grad_norm": 0.8827186822891235,
"learning_rate": 0.00016779824610134092,
"loss": 0.6166,
"step": 8980
},
{
"epoch": 0.28590488897360145,
"grad_norm": 0.9229192137718201,
"learning_rate": 0.00016764684073500866,
"loss": 0.6178,
"step": 9000
},
{
"epoch": 0.28590488897360145,
"eval_loss": 0.5966877341270447,
"eval_runtime": 44.6044,
"eval_samples_per_second": 60.599,
"eval_steps_per_second": 30.311,
"step": 9000
},
{
"epoch": 0.28654023317132055,
"grad_norm": 0.8136707544326782,
"learning_rate": 0.00016749514892469615,
"loss": 0.6366,
"step": 9020
},
{
"epoch": 0.2871755773690397,
"grad_norm": 0.8175415992736816,
"learning_rate": 0.00016734317131272762,
"loss": 0.6177,
"step": 9040
},
{
"epoch": 0.2878109215667588,
"grad_norm": 0.929182767868042,
"learning_rate": 0.00016719090854263753,
"loss": 0.646,
"step": 9060
},
{
"epoch": 0.2884462657644779,
"grad_norm": 0.9779849052429199,
"learning_rate": 0.0001670383612591678,
"loss": 0.6362,
"step": 9080
},
{
"epoch": 0.289081609962197,
"grad_norm": 0.8542407751083374,
"learning_rate": 0.00016688553010826506,
"loss": 0.6076,
"step": 9100
},
{
"epoch": 0.2897169541599161,
"grad_norm": 0.8885607719421387,
"learning_rate": 0.00016673241573707804,
"loss": 0.6055,
"step": 9120
},
{
"epoch": 0.29035229835763526,
"grad_norm": 0.876097559928894,
"learning_rate": 0.0001665790187939546,
"loss": 0.6196,
"step": 9140
},
{
"epoch": 0.29098764255535436,
"grad_norm": 1.0198227167129517,
"learning_rate": 0.0001664253399284393,
"loss": 0.6374,
"step": 9160
},
{
"epoch": 0.29162298675307347,
"grad_norm": 0.8938513994216919,
"learning_rate": 0.00016627137979127033,
"loss": 0.6254,
"step": 9180
},
{
"epoch": 0.29225833095079257,
"grad_norm": 0.7427443861961365,
"learning_rate": 0.00016611713903437692,
"loss": 0.6099,
"step": 9200
},
{
"epoch": 0.2928936751485117,
"grad_norm": 0.9959378242492676,
"learning_rate": 0.00016596261831087661,
"loss": 0.648,
"step": 9220
},
{
"epoch": 0.2935290193462308,
"grad_norm": 1.048519253730774,
"learning_rate": 0.00016580781827507242,
"loss": 0.6292,
"step": 9240
},
{
"epoch": 0.2941643635439499,
"grad_norm": 0.858858585357666,
"learning_rate": 0.00016565273958245002,
"loss": 0.6252,
"step": 9260
},
{
"epoch": 0.294799707741669,
"grad_norm": 0.8437022566795349,
"learning_rate": 0.00016549738288967514,
"loss": 0.6188,
"step": 9280
},
{
"epoch": 0.2954350519393882,
"grad_norm": 0.8608834743499756,
"learning_rate": 0.00016534174885459056,
"loss": 0.6509,
"step": 9300
},
{
"epoch": 0.2960703961371073,
"grad_norm": 1.083897590637207,
"learning_rate": 0.00016518583813621357,
"loss": 0.6193,
"step": 9320
},
{
"epoch": 0.2967057403348264,
"grad_norm": 0.9606235027313232,
"learning_rate": 0.0001650296513947329,
"loss": 0.6287,
"step": 9340
},
{
"epoch": 0.2973410845325455,
"grad_norm": 1.0519804954528809,
"learning_rate": 0.00016487318929150617,
"loss": 0.6097,
"step": 9360
},
{
"epoch": 0.29797642873026464,
"grad_norm": 1.3490453958511353,
"learning_rate": 0.000164716452489057,
"loss": 0.6043,
"step": 9380
},
{
"epoch": 0.29861177292798374,
"grad_norm": 1.1292142868041992,
"learning_rate": 0.00016455944165107207,
"loss": 0.5896,
"step": 9400
},
{
"epoch": 0.29924711712570284,
"grad_norm": 0.9570278525352478,
"learning_rate": 0.00016440215744239865,
"loss": 0.6087,
"step": 9420
},
{
"epoch": 0.29988246132342194,
"grad_norm": 0.8570756316184998,
"learning_rate": 0.00016424460052904137,
"loss": 0.6036,
"step": 9440
},
{
"epoch": 0.3005178055211411,
"grad_norm": 0.9214951395988464,
"learning_rate": 0.00016408677157815974,
"loss": 0.6519,
"step": 9460
},
{
"epoch": 0.3011531497188602,
"grad_norm": 1.1580623388290405,
"learning_rate": 0.00016392867125806504,
"loss": 0.5991,
"step": 9480
},
{
"epoch": 0.3017884939165793,
"grad_norm": 1.1025846004486084,
"learning_rate": 0.00016377030023821782,
"loss": 0.6416,
"step": 9500
},
{
"epoch": 0.3024238381142984,
"grad_norm": 0.8918984532356262,
"learning_rate": 0.00016361165918922477,
"loss": 0.6165,
"step": 9520
},
{
"epoch": 0.30305918231201756,
"grad_norm": 0.8747968673706055,
"learning_rate": 0.000163452748782836,
"loss": 0.6094,
"step": 9540
},
{
"epoch": 0.30369452650973666,
"grad_norm": 0.7480270862579346,
"learning_rate": 0.0001632935696919422,
"loss": 0.5987,
"step": 9560
},
{
"epoch": 0.30432987070745576,
"grad_norm": 0.8854328393936157,
"learning_rate": 0.00016313412259057178,
"loss": 0.6514,
"step": 9580
},
{
"epoch": 0.30496521490517486,
"grad_norm": 1.0659030675888062,
"learning_rate": 0.00016297440815388802,
"loss": 0.5796,
"step": 9600
},
{
"epoch": 0.305600559102894,
"grad_norm": 0.9668769240379333,
"learning_rate": 0.00016281442705818618,
"loss": 0.6147,
"step": 9620
},
{
"epoch": 0.3062359033006131,
"grad_norm": 0.939028263092041,
"learning_rate": 0.00016265417998089068,
"loss": 0.6241,
"step": 9640
},
{
"epoch": 0.3068712474983322,
"grad_norm": 0.8955005407333374,
"learning_rate": 0.00016249366760055222,
"loss": 0.5832,
"step": 9660
},
{
"epoch": 0.3075065916960513,
"grad_norm": 0.7991370558738708,
"learning_rate": 0.00016233289059684492,
"loss": 0.5799,
"step": 9680
},
{
"epoch": 0.3081419358937704,
"grad_norm": 0.8115846514701843,
"learning_rate": 0.00016217184965056336,
"loss": 0.6109,
"step": 9700
},
{
"epoch": 0.3087772800914896,
"grad_norm": 0.7488042712211609,
"learning_rate": 0.00016201054544361977,
"loss": 0.6166,
"step": 9720
},
{
"epoch": 0.3094126242892087,
"grad_norm": 0.8463062644004822,
"learning_rate": 0.00016184897865904123,
"loss": 0.5779,
"step": 9740
},
{
"epoch": 0.3100479684869278,
"grad_norm": 1.083001732826233,
"learning_rate": 0.00016168714998096654,
"loss": 0.6175,
"step": 9760
},
{
"epoch": 0.3106833126846469,
"grad_norm": 0.8545092940330505,
"learning_rate": 0.00016152506009464357,
"loss": 0.6104,
"step": 9780
},
{
"epoch": 0.31131865688236604,
"grad_norm": 0.9297589063644409,
"learning_rate": 0.00016136270968642618,
"loss": 0.5831,
"step": 9800
},
{
"epoch": 0.31195400108008514,
"grad_norm": 0.7775977253913879,
"learning_rate": 0.0001612000994437714,
"loss": 0.6001,
"step": 9820
},
{
"epoch": 0.31258934527780424,
"grad_norm": 0.943267822265625,
"learning_rate": 0.0001610372300552366,
"loss": 0.6089,
"step": 9840
},
{
"epoch": 0.31322468947552334,
"grad_norm": 0.8398995399475098,
"learning_rate": 0.0001608741022104763,
"loss": 0.5929,
"step": 9860
},
{
"epoch": 0.3138600336732425,
"grad_norm": 1.0078269243240356,
"learning_rate": 0.00016071071660023954,
"loss": 0.6215,
"step": 9880
},
{
"epoch": 0.3144953778709616,
"grad_norm": 0.9710105657577515,
"learning_rate": 0.0001605470739163669,
"loss": 0.5983,
"step": 9900
},
{
"epoch": 0.3151307220686807,
"grad_norm": 0.8864800333976746,
"learning_rate": 0.00016038317485178734,
"loss": 0.5812,
"step": 9920
},
{
"epoch": 0.3157660662663998,
"grad_norm": 0.9775105118751526,
"learning_rate": 0.0001602190201005156,
"loss": 0.5899,
"step": 9940
},
{
"epoch": 0.31640141046411896,
"grad_norm": 0.8554601669311523,
"learning_rate": 0.00016005461035764902,
"loss": 0.5989,
"step": 9960
},
{
"epoch": 0.31703675466183806,
"grad_norm": 0.8149896264076233,
"learning_rate": 0.0001598899463193647,
"loss": 0.6383,
"step": 9980
},
{
"epoch": 0.31767209885955716,
"grad_norm": 1.1985602378845215,
"learning_rate": 0.00015972502868291652,
"loss": 0.604,
"step": 10000
},
{
"epoch": 0.31767209885955716,
"eval_loss": 0.5633410811424255,
"eval_runtime": 44.2566,
"eval_samples_per_second": 61.076,
"eval_steps_per_second": 30.549,
"step": 10000
},
{
"epoch": 0.31830744305727626,
"grad_norm": 0.9848890900611877,
"learning_rate": 0.0001595598581466322,
"loss": 0.5741,
"step": 10020
},
{
"epoch": 0.3189427872549954,
"grad_norm": 1.0653225183486938,
"learning_rate": 0.00015939443540991034,
"loss": 0.6154,
"step": 10040
},
{
"epoch": 0.3195781314527145,
"grad_norm": 0.8440039157867432,
"learning_rate": 0.0001592287611732175,
"loss": 0.6077,
"step": 10060
},
{
"epoch": 0.3202134756504336,
"grad_norm": 0.8706631660461426,
"learning_rate": 0.00015906283613808508,
"loss": 0.6143,
"step": 10080
},
{
"epoch": 0.3208488198481527,
"grad_norm": 1.0338808298110962,
"learning_rate": 0.00015889666100710659,
"loss": 0.5697,
"step": 10100
},
{
"epoch": 0.3214841640458719,
"grad_norm": 0.8499680757522583,
"learning_rate": 0.00015873023648393448,
"loss": 0.5968,
"step": 10120
},
{
"epoch": 0.322119508243591,
"grad_norm": 1.0106873512268066,
"learning_rate": 0.00015856356327327724,
"loss": 0.5657,
"step": 10140
},
{
"epoch": 0.3227548524413101,
"grad_norm": 0.9771645665168762,
"learning_rate": 0.00015839664208089634,
"loss": 0.5989,
"step": 10160
},
{
"epoch": 0.3233901966390292,
"grad_norm": 0.9425153136253357,
"learning_rate": 0.0001582294736136035,
"loss": 0.6314,
"step": 10180
},
{
"epoch": 0.3240255408367483,
"grad_norm": 1.1419885158538818,
"learning_rate": 0.0001580620585792572,
"loss": 0.6137,
"step": 10200
},
{
"epoch": 0.32466088503446744,
"grad_norm": 0.8356417417526245,
"learning_rate": 0.00015789439768676032,
"loss": 0.6189,
"step": 10220
},
{
"epoch": 0.32529622923218654,
"grad_norm": 0.9876666069030762,
"learning_rate": 0.00015772649164605648,
"loss": 0.6069,
"step": 10240
},
{
"epoch": 0.32593157342990564,
"grad_norm": 1.0510075092315674,
"learning_rate": 0.0001575583411681276,
"loss": 0.5996,
"step": 10260
},
{
"epoch": 0.32656691762762474,
"grad_norm": 0.91109299659729,
"learning_rate": 0.00015738994696499055,
"loss": 0.5996,
"step": 10280
},
{
"epoch": 0.3272022618253439,
"grad_norm": 0.8995181322097778,
"learning_rate": 0.00015722130974969421,
"loss": 0.5798,
"step": 10300
},
{
"epoch": 0.327837606023063,
"grad_norm": 1.1067475080490112,
"learning_rate": 0.00015705243023631652,
"loss": 0.5983,
"step": 10320
},
{
"epoch": 0.3284729502207821,
"grad_norm": 1.0324633121490479,
"learning_rate": 0.00015688330913996135,
"loss": 0.6011,
"step": 10340
},
{
"epoch": 0.3291082944185012,
"grad_norm": 1.0662481784820557,
"learning_rate": 0.0001567139471767556,
"loss": 0.6254,
"step": 10360
},
{
"epoch": 0.32974363861622036,
"grad_norm": 0.9539555907249451,
"learning_rate": 0.00015654434506384607,
"loss": 0.6176,
"step": 10380
},
{
"epoch": 0.33037898281393946,
"grad_norm": 0.7341588139533997,
"learning_rate": 0.00015637450351939637,
"loss": 0.5852,
"step": 10400
},
{
"epoch": 0.33101432701165856,
"grad_norm": 0.9077139496803284,
"learning_rate": 0.00015620442326258414,
"loss": 0.609,
"step": 10420
},
{
"epoch": 0.33164967120937766,
"grad_norm": 1.083999752998352,
"learning_rate": 0.00015603410501359766,
"loss": 0.5768,
"step": 10440
},
{
"epoch": 0.3322850154070968,
"grad_norm": 0.9190422296524048,
"learning_rate": 0.000155863549493633,
"loss": 0.5845,
"step": 10460
},
{
"epoch": 0.3329203596048159,
"grad_norm": 1.0731889009475708,
"learning_rate": 0.000155692757424891,
"loss": 0.5988,
"step": 10480
},
{
"epoch": 0.333555703802535,
"grad_norm": 0.9898316264152527,
"learning_rate": 0.00015552172953057407,
"loss": 0.5918,
"step": 10500
},
{
"epoch": 0.3341910480002541,
"grad_norm": 1.135695219039917,
"learning_rate": 0.00015535046653488322,
"loss": 0.5882,
"step": 10520
},
{
"epoch": 0.3348263921979733,
"grad_norm": 1.0453022718429565,
"learning_rate": 0.000155178969163015,
"loss": 0.609,
"step": 10540
},
{
"epoch": 0.3354617363956924,
"grad_norm": 0.9859703183174133,
"learning_rate": 0.00015500723814115835,
"loss": 0.5899,
"step": 10560
},
{
"epoch": 0.3360970805934115,
"grad_norm": 1.031168818473816,
"learning_rate": 0.00015483527419649163,
"loss": 0.5987,
"step": 10580
},
{
"epoch": 0.3367324247911306,
"grad_norm": 1.1591908931732178,
"learning_rate": 0.00015466307805717951,
"loss": 0.6191,
"step": 10600
},
{
"epoch": 0.33736776898884974,
"grad_norm": 0.8246921896934509,
"learning_rate": 0.00015449065045236977,
"loss": 0.6098,
"step": 10620
},
{
"epoch": 0.33800311318656884,
"grad_norm": 0.8392571210861206,
"learning_rate": 0.0001543179921121904,
"loss": 0.5675,
"step": 10640
},
{
"epoch": 0.33863845738428794,
"grad_norm": 0.8678343892097473,
"learning_rate": 0.00015414510376774633,
"loss": 0.5721,
"step": 10660
},
{
"epoch": 0.33927380158200704,
"grad_norm": 0.8436061143875122,
"learning_rate": 0.00015397198615111653,
"loss": 0.5703,
"step": 10680
},
{
"epoch": 0.33990914577972614,
"grad_norm": 0.9926438927650452,
"learning_rate": 0.00015379863999535074,
"loss": 0.6049,
"step": 10700
},
{
"epoch": 0.3405444899774453,
"grad_norm": 1.098764419555664,
"learning_rate": 0.00015362506603446637,
"loss": 0.6007,
"step": 10720
},
{
"epoch": 0.3411798341751644,
"grad_norm": 1.052038311958313,
"learning_rate": 0.00015345126500344554,
"loss": 0.5865,
"step": 10740
},
{
"epoch": 0.3418151783728835,
"grad_norm": 0.8772541880607605,
"learning_rate": 0.00015327723763823188,
"loss": 0.6066,
"step": 10760
},
{
"epoch": 0.3424505225706026,
"grad_norm": 0.7938296794891357,
"learning_rate": 0.00015310298467572733,
"loss": 0.5467,
"step": 10780
},
{
"epoch": 0.34308586676832176,
"grad_norm": 1.0938440561294556,
"learning_rate": 0.00015292850685378915,
"loss": 0.5916,
"step": 10800
},
{
"epoch": 0.34372121096604086,
"grad_norm": 0.8460657000541687,
"learning_rate": 0.00015275380491122672,
"loss": 0.603,
"step": 10820
},
{
"epoch": 0.34435655516375996,
"grad_norm": 0.8238389492034912,
"learning_rate": 0.00015257887958779854,
"loss": 0.5808,
"step": 10840
},
{
"epoch": 0.34499189936147906,
"grad_norm": 0.8064368367195129,
"learning_rate": 0.0001524037316242088,
"loss": 0.5862,
"step": 10860
},
{
"epoch": 0.3456272435591982,
"grad_norm": 1.2068203687667847,
"learning_rate": 0.00015222836176210467,
"loss": 0.5694,
"step": 10880
},
{
"epoch": 0.3462625877569173,
"grad_norm": 0.9752914309501648,
"learning_rate": 0.00015205277074407266,
"loss": 0.5367,
"step": 10900
},
{
"epoch": 0.3468979319546364,
"grad_norm": 0.9989959597587585,
"learning_rate": 0.00015187695931363602,
"loss": 0.5712,
"step": 10920
},
{
"epoch": 0.3475332761523555,
"grad_norm": 0.8734492659568787,
"learning_rate": 0.00015170092821525114,
"loss": 0.6029,
"step": 10940
},
{
"epoch": 0.3481686203500747,
"grad_norm": 0.8759735822677612,
"learning_rate": 0.00015152467819430458,
"loss": 0.5676,
"step": 10960
},
{
"epoch": 0.3488039645477938,
"grad_norm": 0.8554444909095764,
"learning_rate": 0.00015134820999711,
"loss": 0.5664,
"step": 10980
},
{
"epoch": 0.3494393087455129,
"grad_norm": 0.730451762676239,
"learning_rate": 0.00015117152437090482,
"loss": 0.5735,
"step": 11000
},
{
"epoch": 0.3494393087455129,
"eval_loss": 0.5449489951133728,
"eval_runtime": 44.9152,
"eval_samples_per_second": 60.18,
"eval_steps_per_second": 30.101,
"step": 11000
},
{
"epoch": 0.350074652943232,
"grad_norm": 0.7964712381362915,
"learning_rate": 0.00015099462206384718,
"loss": 0.5943,
"step": 11020
},
{
"epoch": 0.35070999714095114,
"grad_norm": 0.809177577495575,
"learning_rate": 0.00015081750382501277,
"loss": 0.5986,
"step": 11040
},
{
"epoch": 0.35134534133867024,
"grad_norm": 0.9207815527915955,
"learning_rate": 0.00015064017040439148,
"loss": 0.559,
"step": 11060
},
{
"epoch": 0.35198068553638934,
"grad_norm": 0.9813947677612305,
"learning_rate": 0.0001504626225528845,
"loss": 0.5529,
"step": 11080
},
{
"epoch": 0.35261602973410844,
"grad_norm": 0.9409967660903931,
"learning_rate": 0.00015028486102230105,
"loss": 0.5725,
"step": 11100
},
{
"epoch": 0.3532513739318276,
"grad_norm": 0.9317089319229126,
"learning_rate": 0.000150106886565355,
"loss": 0.5568,
"step": 11120
},
{
"epoch": 0.3538867181295467,
"grad_norm": 1.025341510772705,
"learning_rate": 0.00014992869993566194,
"loss": 0.5555,
"step": 11140
},
{
"epoch": 0.3545220623272658,
"grad_norm": 1.0014809370040894,
"learning_rate": 0.00014975030188773585,
"loss": 0.5922,
"step": 11160
},
{
"epoch": 0.3551574065249849,
"grad_norm": 0.9769735336303711,
"learning_rate": 0.00014957169317698593,
"loss": 0.583,
"step": 11180
},
{
"epoch": 0.355792750722704,
"grad_norm": 0.8555041551589966,
"learning_rate": 0.0001493928745597134,
"loss": 0.5609,
"step": 11200
},
{
"epoch": 0.35642809492042316,
"grad_norm": 0.9463367462158203,
"learning_rate": 0.0001492138467931084,
"loss": 0.5783,
"step": 11220
},
{
"epoch": 0.35706343911814226,
"grad_norm": 0.9429970979690552,
"learning_rate": 0.00014903461063524661,
"loss": 0.5934,
"step": 11240
},
{
"epoch": 0.35769878331586136,
"grad_norm": 1.4683854579925537,
"learning_rate": 0.00014885516684508612,
"loss": 0.5939,
"step": 11260
},
{
"epoch": 0.35833412751358046,
"grad_norm": 0.825720489025116,
"learning_rate": 0.00014867551618246428,
"loss": 0.5685,
"step": 11280
},
{
"epoch": 0.3589694717112996,
"grad_norm": 1.001832127571106,
"learning_rate": 0.00014849565940809432,
"loss": 0.5837,
"step": 11300
},
{
"epoch": 0.3596048159090187,
"grad_norm": 0.9406988024711609,
"learning_rate": 0.00014831559728356234,
"loss": 0.5864,
"step": 11320
},
{
"epoch": 0.3602401601067378,
"grad_norm": 0.7483388185501099,
"learning_rate": 0.00014813533057132393,
"loss": 0.5991,
"step": 11340
},
{
"epoch": 0.3608755043044569,
"grad_norm": 0.8849460482597351,
"learning_rate": 0.00014795486003470093,
"loss": 0.5821,
"step": 11360
},
{
"epoch": 0.3615108485021761,
"grad_norm": 0.7930045127868652,
"learning_rate": 0.00014777418643787836,
"loss": 0.5395,
"step": 11380
},
{
"epoch": 0.3621461926998952,
"grad_norm": 0.9285226464271545,
"learning_rate": 0.000147593310545901,
"loss": 0.5713,
"step": 11400
},
{
"epoch": 0.3627815368976143,
"grad_norm": 1.0233609676361084,
"learning_rate": 0.00014741223312467026,
"loss": 0.5875,
"step": 11420
},
{
"epoch": 0.3634168810953334,
"grad_norm": 1.033948302268982,
"learning_rate": 0.00014723095494094092,
"loss": 0.5993,
"step": 11440
},
{
"epoch": 0.36405222529305253,
"grad_norm": 0.9479451179504395,
"learning_rate": 0.00014704947676231784,
"loss": 0.571,
"step": 11460
},
{
"epoch": 0.36468756949077163,
"grad_norm": 0.7781844735145569,
"learning_rate": 0.0001468677993572528,
"loss": 0.5503,
"step": 11480
},
{
"epoch": 0.36532291368849074,
"grad_norm": 0.9249241352081299,
"learning_rate": 0.00014668592349504101,
"loss": 0.574,
"step": 11500
},
{
"epoch": 0.36595825788620984,
"grad_norm": 0.9108446836471558,
"learning_rate": 0.00014650384994581824,
"loss": 0.557,
"step": 11520
},
{
"epoch": 0.366593602083929,
"grad_norm": 1.0099608898162842,
"learning_rate": 0.0001463215794805573,
"loss": 0.5605,
"step": 11540
},
{
"epoch": 0.3672289462816481,
"grad_norm": 0.8376953601837158,
"learning_rate": 0.00014613911287106467,
"loss": 0.538,
"step": 11560
},
{
"epoch": 0.3678642904793672,
"grad_norm": 0.8893873691558838,
"learning_rate": 0.00014595645088997757,
"loss": 0.5606,
"step": 11580
},
{
"epoch": 0.3684996346770863,
"grad_norm": 1.1310006380081177,
"learning_rate": 0.00014577359431076046,
"loss": 0.5612,
"step": 11600
},
{
"epoch": 0.36913497887480545,
"grad_norm": 0.8577033281326294,
"learning_rate": 0.00014559054390770167,
"loss": 0.5688,
"step": 11620
},
{
"epoch": 0.36977032307252455,
"grad_norm": 0.9386855959892273,
"learning_rate": 0.00014540730045591044,
"loss": 0.5614,
"step": 11640
},
{
"epoch": 0.37040566727024365,
"grad_norm": 0.9492216110229492,
"learning_rate": 0.00014522386473131332,
"loss": 0.5878,
"step": 11660
},
{
"epoch": 0.37104101146796276,
"grad_norm": 0.853327751159668,
"learning_rate": 0.00014504023751065115,
"loss": 0.5568,
"step": 11680
},
{
"epoch": 0.37167635566568186,
"grad_norm": 0.7977784872055054,
"learning_rate": 0.00014485641957147553,
"loss": 0.5428,
"step": 11700
},
{
"epoch": 0.372311699863401,
"grad_norm": 1.1006829738616943,
"learning_rate": 0.00014467241169214567,
"loss": 0.559,
"step": 11720
},
{
"epoch": 0.3729470440611201,
"grad_norm": 1.08724045753479,
"learning_rate": 0.0001444882146518251,
"loss": 0.5642,
"step": 11740
},
{
"epoch": 0.3735823882588392,
"grad_norm": 1.0295459032058716,
"learning_rate": 0.00014430382923047831,
"loss": 0.5969,
"step": 11760
},
{
"epoch": 0.3742177324565583,
"grad_norm": 1.1096023321151733,
"learning_rate": 0.00014411925620886742,
"loss": 0.5678,
"step": 11780
},
{
"epoch": 0.3748530766542775,
"grad_norm": 0.9315259456634521,
"learning_rate": 0.000143934496368549,
"loss": 0.5728,
"step": 11800
},
{
"epoch": 0.3754884208519966,
"grad_norm": 0.9581449031829834,
"learning_rate": 0.00014374955049187066,
"loss": 0.5485,
"step": 11820
},
{
"epoch": 0.3761237650497157,
"grad_norm": 1.472161054611206,
"learning_rate": 0.00014356441936196776,
"loss": 0.5931,
"step": 11840
},
{
"epoch": 0.3767591092474348,
"grad_norm": 1.0234733819961548,
"learning_rate": 0.00014337910376276011,
"loss": 0.5635,
"step": 11860
},
{
"epoch": 0.37739445344515393,
"grad_norm": 0.9299212694168091,
"learning_rate": 0.00014319360447894862,
"loss": 0.5802,
"step": 11880
},
{
"epoch": 0.37802979764287303,
"grad_norm": 0.853388786315918,
"learning_rate": 0.00014300792229601198,
"loss": 0.5645,
"step": 11900
},
{
"epoch": 0.37866514184059213,
"grad_norm": 0.9909472465515137,
"learning_rate": 0.0001428220580002034,
"loss": 0.5451,
"step": 11920
},
{
"epoch": 0.37930048603831124,
"grad_norm": 0.8121063113212585,
"learning_rate": 0.00014263601237854716,
"loss": 0.5514,
"step": 11940
},
{
"epoch": 0.3799358302360304,
"grad_norm": 0.9053930044174194,
"learning_rate": 0.00014244978621883543,
"loss": 0.5371,
"step": 11960
},
{
"epoch": 0.3805711744337495,
"grad_norm": 1.0551111698150635,
"learning_rate": 0.00014226338030962475,
"loss": 0.5862,
"step": 11980
},
{
"epoch": 0.3812065186314686,
"grad_norm": 0.8897386193275452,
"learning_rate": 0.0001420767954402329,
"loss": 0.5439,
"step": 12000
},
{
"epoch": 0.3812065186314686,
"eval_loss": 0.5259391665458679,
"eval_runtime": 45.0289,
"eval_samples_per_second": 60.028,
"eval_steps_per_second": 30.025,
"step": 12000
},
{
"epoch": 0.3818418628291877,
"grad_norm": 0.8436812162399292,
"learning_rate": 0.00014189003240073535,
"loss": 0.5684,
"step": 12020
},
{
"epoch": 0.38247720702690685,
"grad_norm": 1.2769359350204468,
"learning_rate": 0.0001417030919819621,
"loss": 0.5483,
"step": 12040
},
{
"epoch": 0.38311255122462595,
"grad_norm": 0.8915470838546753,
"learning_rate": 0.0001415159749754942,
"loss": 0.5674,
"step": 12060
},
{
"epoch": 0.38374789542234505,
"grad_norm": 1.1026362180709839,
"learning_rate": 0.00014132868217366044,
"loss": 0.5868,
"step": 12080
},
{
"epoch": 0.38438323962006415,
"grad_norm": 0.92413729429245,
"learning_rate": 0.00014114121436953402,
"loss": 0.5602,
"step": 12100
},
{
"epoch": 0.3850185838177833,
"grad_norm": 0.8880215287208557,
"learning_rate": 0.0001409535723569291,
"loss": 0.563,
"step": 12120
},
{
"epoch": 0.3856539280155024,
"grad_norm": 0.7865646481513977,
"learning_rate": 0.00014076575693039767,
"loss": 0.5731,
"step": 12140
},
{
"epoch": 0.3862892722132215,
"grad_norm": 0.8817760348320007,
"learning_rate": 0.00014057776888522583,
"loss": 0.5205,
"step": 12160
},
{
"epoch": 0.3869246164109406,
"grad_norm": 0.7473212480545044,
"learning_rate": 0.0001403896090174307,
"loss": 0.5494,
"step": 12180
},
{
"epoch": 0.3875599606086597,
"grad_norm": 0.9429736137390137,
"learning_rate": 0.0001402012781237571,
"loss": 0.551,
"step": 12200
},
{
"epoch": 0.38819530480637887,
"grad_norm": 0.9144492149353027,
"learning_rate": 0.00014001277700167382,
"loss": 0.529,
"step": 12220
},
{
"epoch": 0.388830649004098,
"grad_norm": 0.8465405702590942,
"learning_rate": 0.00013982410644937057,
"loss": 0.566,
"step": 12240
},
{
"epoch": 0.3894659932018171,
"grad_norm": 0.8520842790603638,
"learning_rate": 0.00013963526726575446,
"loss": 0.61,
"step": 12260
},
{
"epoch": 0.3901013373995362,
"grad_norm": 0.8384197354316711,
"learning_rate": 0.00013944626025044673,
"loss": 0.563,
"step": 12280
},
{
"epoch": 0.39073668159725533,
"grad_norm": 0.9083155989646912,
"learning_rate": 0.00013925708620377927,
"loss": 0.5433,
"step": 12300
},
{
"epoch": 0.39137202579497443,
"grad_norm": 1.0582692623138428,
"learning_rate": 0.00013906774592679116,
"loss": 0.5368,
"step": 12320
},
{
"epoch": 0.39200736999269353,
"grad_norm": 0.8538171648979187,
"learning_rate": 0.00013887824022122537,
"loss": 0.5217,
"step": 12340
},
{
"epoch": 0.39264271419041263,
"grad_norm": 0.8264597058296204,
"learning_rate": 0.00013868856988952556,
"loss": 0.5564,
"step": 12360
},
{
"epoch": 0.3932780583881318,
"grad_norm": 0.8192921280860901,
"learning_rate": 0.00013849873573483222,
"loss": 0.6058,
"step": 12380
},
{
"epoch": 0.3939134025858509,
"grad_norm": 0.8523415923118591,
"learning_rate": 0.00013830873856097964,
"loss": 0.5565,
"step": 12400
},
{
"epoch": 0.39454874678357,
"grad_norm": 1.0821831226348877,
"learning_rate": 0.00013811857917249253,
"loss": 0.5617,
"step": 12420
},
{
"epoch": 0.3951840909812891,
"grad_norm": 0.8053098917007446,
"learning_rate": 0.00013792825837458225,
"loss": 0.579,
"step": 12440
},
{
"epoch": 0.39581943517900825,
"grad_norm": 0.9511120319366455,
"learning_rate": 0.00013773777697314378,
"loss": 0.5417,
"step": 12460
},
{
"epoch": 0.39645477937672735,
"grad_norm": 1.0273131132125854,
"learning_rate": 0.00013754713577475213,
"loss": 0.582,
"step": 12480
},
{
"epoch": 0.39709012357444645,
"grad_norm": 1.0347099304199219,
"learning_rate": 0.00013735633558665893,
"loss": 0.5679,
"step": 12500
},
{
"epoch": 0.39772546777216555,
"grad_norm": 1.0762611627578735,
"learning_rate": 0.00013716537721678907,
"loss": 0.5483,
"step": 12520
},
{
"epoch": 0.3983608119698847,
"grad_norm": 1.4243688583374023,
"learning_rate": 0.00013697426147373721,
"loss": 0.5558,
"step": 12540
},
{
"epoch": 0.3989961561676038,
"grad_norm": 0.7539466023445129,
"learning_rate": 0.00013678298916676445,
"loss": 0.5404,
"step": 12560
},
{
"epoch": 0.3996315003653229,
"grad_norm": 0.7736854553222656,
"learning_rate": 0.00013659156110579476,
"loss": 0.5578,
"step": 12580
},
{
"epoch": 0.400266844563042,
"grad_norm": 0.9489171504974365,
"learning_rate": 0.0001363999781014117,
"loss": 0.5668,
"step": 12600
},
{
"epoch": 0.40090218876076117,
"grad_norm": 0.9692643880844116,
"learning_rate": 0.00013621783146979094,
"loss": 0.5663,
"step": 12620
},
{
"epoch": 0.40153753295848027,
"grad_norm": 1.0705336332321167,
"learning_rate": 0.00013602594865967435,
"loss": 0.5293,
"step": 12640
},
{
"epoch": 0.40217287715619937,
"grad_norm": 1.0149205923080444,
"learning_rate": 0.00013583391330117533,
"loss": 0.5348,
"step": 12660
},
{
"epoch": 0.40280822135391847,
"grad_norm": 0.9088581204414368,
"learning_rate": 0.00013564172620744906,
"loss": 0.5677,
"step": 12680
},
{
"epoch": 0.4034435655516376,
"grad_norm": 1.1513986587524414,
"learning_rate": 0.00013544938819229306,
"loss": 0.569,
"step": 12700
},
{
"epoch": 0.40407890974935673,
"grad_norm": 0.8725998401641846,
"learning_rate": 0.00013525690007014406,
"loss": 0.5692,
"step": 12720
},
{
"epoch": 0.40471425394707583,
"grad_norm": 1.0663046836853027,
"learning_rate": 0.00013506426265607425,
"loss": 0.567,
"step": 12740
},
{
"epoch": 0.40534959814479493,
"grad_norm": 0.9139559864997864,
"learning_rate": 0.00013487147676578812,
"loss": 0.5465,
"step": 12760
},
{
"epoch": 0.40598494234251403,
"grad_norm": 1.3140777349472046,
"learning_rate": 0.00013467854321561878,
"loss": 0.5407,
"step": 12780
},
{
"epoch": 0.4066202865402332,
"grad_norm": 0.8671903610229492,
"learning_rate": 0.00013448546282252458,
"loss": 0.5303,
"step": 12800
},
{
"epoch": 0.4072556307379523,
"grad_norm": 0.692545473575592,
"learning_rate": 0.00013429223640408578,
"loss": 0.5333,
"step": 12820
},
{
"epoch": 0.4078909749356714,
"grad_norm": 1.1087654829025269,
"learning_rate": 0.00013409886477850087,
"loss": 0.5493,
"step": 12840
},
{
"epoch": 0.4085263191333905,
"grad_norm": 0.9659181833267212,
"learning_rate": 0.00013390534876458319,
"loss": 0.5902,
"step": 12860
},
{
"epoch": 0.40916166333110965,
"grad_norm": 0.7794270515441895,
"learning_rate": 0.00013371168918175754,
"loss": 0.5647,
"step": 12880
},
{
"epoch": 0.40979700752882875,
"grad_norm": 0.910505473613739,
"learning_rate": 0.00013351788685005662,
"loss": 0.5752,
"step": 12900
},
{
"epoch": 0.41043235172654785,
"grad_norm": 0.9549837112426758,
"learning_rate": 0.00013332394259011758,
"loss": 0.5424,
"step": 12920
},
{
"epoch": 0.41106769592426695,
"grad_norm": 1.2679826021194458,
"learning_rate": 0.00013312985722317862,
"loss": 0.5285,
"step": 12940
},
{
"epoch": 0.4117030401219861,
"grad_norm": 0.8822807669639587,
"learning_rate": 0.0001329356315710753,
"loss": 0.5662,
"step": 12960
},
{
"epoch": 0.4123383843197052,
"grad_norm": 0.8247064352035522,
"learning_rate": 0.0001327412664562373,
"loss": 0.5338,
"step": 12980
},
{
"epoch": 0.4129737285174243,
"grad_norm": 0.8655696511268616,
"learning_rate": 0.0001325467627016849,
"loss": 0.5563,
"step": 13000
},
{
"epoch": 0.4129737285174243,
"eval_loss": 0.5103311538696289,
"eval_runtime": 44.4811,
"eval_samples_per_second": 60.767,
"eval_steps_per_second": 30.395,
"step": 13000
},
{
"epoch": 0.4136090727151434,
"grad_norm": 1.1745620965957642,
"learning_rate": 0.00013235212113102532,
"loss": 0.5432,
"step": 13020
},
{
"epoch": 0.41424441691286257,
"grad_norm": 1.375957727432251,
"learning_rate": 0.0001321573425684494,
"loss": 0.5518,
"step": 13040
},
{
"epoch": 0.41487976111058167,
"grad_norm": 1.2425376176834106,
"learning_rate": 0.00013196242783872805,
"loss": 0.5667,
"step": 13060
},
{
"epoch": 0.41551510530830077,
"grad_norm": 0.9375765919685364,
"learning_rate": 0.00013176737776720876,
"loss": 0.5629,
"step": 13080
},
{
"epoch": 0.41615044950601987,
"grad_norm": 0.9392895698547363,
"learning_rate": 0.00013157219317981217,
"loss": 0.5577,
"step": 13100
},
{
"epoch": 0.416785793703739,
"grad_norm": 0.9028527140617371,
"learning_rate": 0.00013137687490302844,
"loss": 0.5358,
"step": 13120
},
{
"epoch": 0.41742113790145813,
"grad_norm": 0.9373983144760132,
"learning_rate": 0.00013118142376391381,
"loss": 0.5517,
"step": 13140
},
{
"epoch": 0.41805648209917723,
"grad_norm": 1.3339825868606567,
"learning_rate": 0.00013098584059008725,
"loss": 0.5512,
"step": 13160
},
{
"epoch": 0.41869182629689633,
"grad_norm": 0.7137243747711182,
"learning_rate": 0.00013079012620972663,
"loss": 0.5464,
"step": 13180
},
{
"epoch": 0.41932717049461543,
"grad_norm": 1.1450612545013428,
"learning_rate": 0.00013059428145156555,
"loss": 0.564,
"step": 13200
},
{
"epoch": 0.4199625146923346,
"grad_norm": 1.2148438692092896,
"learning_rate": 0.00013039830714488965,
"loss": 0.5555,
"step": 13220
},
{
"epoch": 0.4205978588900537,
"grad_norm": 1.277346134185791,
"learning_rate": 0.00013020220411953304,
"loss": 0.5898,
"step": 13240
},
{
"epoch": 0.4212332030877728,
"grad_norm": 1.0933984518051147,
"learning_rate": 0.00013000597320587492,
"loss": 0.553,
"step": 13260
},
{
"epoch": 0.4218685472854919,
"grad_norm": 0.7297493815422058,
"learning_rate": 0.00012980961523483616,
"loss": 0.5626,
"step": 13280
},
{
"epoch": 0.42250389148321105,
"grad_norm": 0.8859849572181702,
"learning_rate": 0.00012961313103787548,
"loss": 0.5455,
"step": 13300
},
{
"epoch": 0.42313923568093015,
"grad_norm": 0.9647216200828552,
"learning_rate": 0.00012941652144698608,
"loss": 0.5157,
"step": 13320
},
{
"epoch": 0.42377457987864925,
"grad_norm": 0.9097155332565308,
"learning_rate": 0.00012921978729469222,
"loss": 0.542,
"step": 13340
},
{
"epoch": 0.42440992407636835,
"grad_norm": 1.0074721574783325,
"learning_rate": 0.0001290229294140456,
"loss": 0.5319,
"step": 13360
},
{
"epoch": 0.4250452682740875,
"grad_norm": 0.7759230732917786,
"learning_rate": 0.0001288259486386218,
"loss": 0.4939,
"step": 13380
},
{
"epoch": 0.4256806124718066,
"grad_norm": 0.8912795782089233,
"learning_rate": 0.00012862884580251675,
"loss": 0.5276,
"step": 13400
},
{
"epoch": 0.4263159566695257,
"grad_norm": 1.090395450592041,
"learning_rate": 0.00012843162174034332,
"loss": 0.5227,
"step": 13420
},
{
"epoch": 0.4269513008672448,
"grad_norm": 0.8524248003959656,
"learning_rate": 0.00012823427728722762,
"loss": 0.5438,
"step": 13440
},
{
"epoch": 0.42758664506496397,
"grad_norm": 1.209073543548584,
"learning_rate": 0.0001280368132788056,
"loss": 0.5495,
"step": 13460
},
{
"epoch": 0.42822198926268307,
"grad_norm": 0.9301733374595642,
"learning_rate": 0.00012783923055121945,
"loss": 0.5411,
"step": 13480
},
{
"epoch": 0.42885733346040217,
"grad_norm": 0.916028618812561,
"learning_rate": 0.000127641529941114,
"loss": 0.5674,
"step": 13500
},
{
"epoch": 0.42949267765812127,
"grad_norm": 0.9181066751480103,
"learning_rate": 0.00012744371228563334,
"loss": 0.5522,
"step": 13520
},
{
"epoch": 0.4301280218558404,
"grad_norm": 1.2208302021026611,
"learning_rate": 0.0001272457784224171,
"loss": 0.5428,
"step": 13540
},
{
"epoch": 0.4307633660535595,
"grad_norm": 0.8382121920585632,
"learning_rate": 0.00012704772918959706,
"loss": 0.5347,
"step": 13560
},
{
"epoch": 0.4313987102512786,
"grad_norm": 0.7942314147949219,
"learning_rate": 0.0001268495654257934,
"loss": 0.5455,
"step": 13580
},
{
"epoch": 0.43203405444899773,
"grad_norm": 1.0586442947387695,
"learning_rate": 0.00012665128797011138,
"loss": 0.5588,
"step": 13600
},
{
"epoch": 0.4326693986467169,
"grad_norm": 0.9026583433151245,
"learning_rate": 0.00012645289766213764,
"loss": 0.5448,
"step": 13620
},
{
"epoch": 0.433304742844436,
"grad_norm": 1.107459545135498,
"learning_rate": 0.0001262643231052632,
"loss": 0.5226,
"step": 13640
},
{
"epoch": 0.4339400870421551,
"grad_norm": 0.7181698679924011,
"learning_rate": 0.00012606571515198816,
"loss": 0.5587,
"step": 13660
},
{
"epoch": 0.4345754312398742,
"grad_norm": 0.850642740726471,
"learning_rate": 0.0001258669968259726,
"loss": 0.5514,
"step": 13680
},
{
"epoch": 0.4352107754375933,
"grad_norm": 0.9803110957145691,
"learning_rate": 0.00012567811294990802,
"loss": 0.5612,
"step": 13700
},
{
"epoch": 0.43584611963531245,
"grad_norm": 0.8320556282997131,
"learning_rate": 0.00012547918181770158,
"loss": 0.5464,
"step": 13720
},
{
"epoch": 0.43648146383303155,
"grad_norm": 0.9645776152610779,
"learning_rate": 0.0001252801427963731,
"loss": 0.5394,
"step": 13740
},
{
"epoch": 0.43711680803075065,
"grad_norm": 0.981066107749939,
"learning_rate": 0.00012508099672873401,
"loss": 0.5518,
"step": 13760
},
{
"epoch": 0.43775215222846975,
"grad_norm": 0.950231671333313,
"learning_rate": 0.00012488174445804905,
"loss": 0.5628,
"step": 13780
},
{
"epoch": 0.4383874964261889,
"grad_norm": 0.7942489981651306,
"learning_rate": 0.00012468238682803256,
"loss": 0.5682,
"step": 13800
},
{
"epoch": 0.439022840623908,
"grad_norm": 0.9598709940910339,
"learning_rate": 0.0001244829246828451,
"loss": 0.5398,
"step": 13820
},
{
"epoch": 0.4396581848216271,
"grad_norm": 0.9328323602676392,
"learning_rate": 0.0001242833588670898,
"loss": 0.5465,
"step": 13840
},
{
"epoch": 0.4402935290193462,
"grad_norm": 0.9036662578582764,
"learning_rate": 0.00012408369022580865,
"loss": 0.5307,
"step": 13860
},
{
"epoch": 0.44092887321706536,
"grad_norm": 1.1593483686447144,
"learning_rate": 0.0001238839196044792,
"loss": 0.5838,
"step": 13880
},
{
"epoch": 0.44156421741478447,
"grad_norm": 0.9283963441848755,
"learning_rate": 0.0001236840478490107,
"loss": 0.5112,
"step": 13900
},
{
"epoch": 0.44219956161250357,
"grad_norm": 1.1374804973602295,
"learning_rate": 0.00012348407580574068,
"loss": 0.5616,
"step": 13920
},
{
"epoch": 0.44283490581022267,
"grad_norm": 0.8757379055023193,
"learning_rate": 0.00012328400432143143,
"loss": 0.5409,
"step": 13940
},
{
"epoch": 0.4434702500079418,
"grad_norm": 0.9971847534179688,
"learning_rate": 0.00012308383424326617,
"loss": 0.5573,
"step": 13960
},
{
"epoch": 0.4441055942056609,
"grad_norm": 0.8985651135444641,
"learning_rate": 0.00012288356641884567,
"loss": 0.5602,
"step": 13980
},
{
"epoch": 0.44474093840338,
"grad_norm": 0.8877219557762146,
"learning_rate": 0.0001226832016961846,
"loss": 0.5418,
"step": 14000
},
{
"epoch": 0.44474093840338,
"eval_loss": 0.49767744541168213,
"eval_runtime": 45.8378,
"eval_samples_per_second": 58.969,
"eval_steps_per_second": 29.495,
"step": 14000
},
{
"epoch": 0.4453762826010991,
"grad_norm": 0.9760685563087463,
"learning_rate": 0.00012248274092370795,
"loss": 0.5386,
"step": 14020
},
{
"epoch": 0.4460116267988183,
"grad_norm": 0.9159601330757141,
"learning_rate": 0.00012228218495024734,
"loss": 0.5658,
"step": 14040
},
{
"epoch": 0.4466469709965374,
"grad_norm": 0.9726976752281189,
"learning_rate": 0.00012208153462503764,
"loss": 0.5619,
"step": 14060
},
{
"epoch": 0.4472823151942565,
"grad_norm": 0.8647946715354919,
"learning_rate": 0.00012188079079771311,
"loss": 0.5312,
"step": 14080
},
{
"epoch": 0.4479176593919756,
"grad_norm": 0.8291323781013489,
"learning_rate": 0.00012167995431830404,
"loss": 0.5555,
"step": 14100
},
{
"epoch": 0.44855300358969474,
"grad_norm": 1.1393893957138062,
"learning_rate": 0.00012147902603723302,
"loss": 0.5368,
"step": 14120
},
{
"epoch": 0.44918834778741384,
"grad_norm": 0.9214714169502258,
"learning_rate": 0.00012127800680531129,
"loss": 0.5312,
"step": 14140
},
{
"epoch": 0.44982369198513295,
"grad_norm": 0.7314972877502441,
"learning_rate": 0.00012107689747373533,
"loss": 0.5306,
"step": 14160
},
{
"epoch": 0.45045903618285205,
"grad_norm": 0.9739118218421936,
"learning_rate": 0.00012087569889408308,
"loss": 0.5474,
"step": 14180
},
{
"epoch": 0.45109438038057115,
"grad_norm": 1.1331558227539062,
"learning_rate": 0.00012067441191831035,
"loss": 0.5251,
"step": 14200
},
{
"epoch": 0.4517297245782903,
"grad_norm": 0.9672099947929382,
"learning_rate": 0.00012047303739874733,
"loss": 0.5638,
"step": 14220
},
{
"epoch": 0.4523650687760094,
"grad_norm": 0.9430161118507385,
"learning_rate": 0.00012027157618809488,
"loss": 0.5473,
"step": 14240
},
{
"epoch": 0.4530004129737285,
"grad_norm": 0.9385126233100891,
"learning_rate": 0.00012007002913942092,
"loss": 0.5305,
"step": 14260
},
{
"epoch": 0.4536357571714476,
"grad_norm": 1.2930362224578857,
"learning_rate": 0.00011986839710615689,
"loss": 0.5264,
"step": 14280
},
{
"epoch": 0.45427110136916676,
"grad_norm": 1.098981499671936,
"learning_rate": 0.00011966668094209401,
"loss": 0.5945,
"step": 14300
},
{
"epoch": 0.45490644556688586,
"grad_norm": 1.016724944114685,
"learning_rate": 0.00011946488150137987,
"loss": 0.5423,
"step": 14320
},
{
"epoch": 0.45554178976460497,
"grad_norm": 1.3441358804702759,
"learning_rate": 0.00011926299963851455,
"loss": 0.5311,
"step": 14340
},
{
"epoch": 0.45617713396232407,
"grad_norm": 0.8672164678573608,
"learning_rate": 0.00011906103620834721,
"loss": 0.5377,
"step": 14360
},
{
"epoch": 0.4568124781600432,
"grad_norm": 0.8844342231750488,
"learning_rate": 0.00011885899206607243,
"loss": 0.5539,
"step": 14380
},
{
"epoch": 0.4574478223577623,
"grad_norm": 1.0755807161331177,
"learning_rate": 0.00011865686806722647,
"loss": 0.5489,
"step": 14400
},
{
"epoch": 0.4580831665554814,
"grad_norm": 0.8909132480621338,
"learning_rate": 0.00011845466506768379,
"loss": 0.5492,
"step": 14420
},
{
"epoch": 0.4587185107532005,
"grad_norm": 0.7222205996513367,
"learning_rate": 0.00011826249982356501,
"loss": 0.5452,
"step": 14440
},
{
"epoch": 0.4593538549509197,
"grad_norm": 0.8589527606964111,
"learning_rate": 0.00011806014523563623,
"loss": 0.5553,
"step": 14460
},
{
"epoch": 0.4599891991486388,
"grad_norm": 0.8546582460403442,
"learning_rate": 0.00011785771417377567,
"loss": 0.518,
"step": 14480
},
{
"epoch": 0.4606245433463579,
"grad_norm": 0.7938315272331238,
"learning_rate": 0.00011765520749515795,
"loss": 0.5732,
"step": 14500
},
{
"epoch": 0.461259887544077,
"grad_norm": 1.030897617340088,
"learning_rate": 0.000117452626057278,
"loss": 0.5293,
"step": 14520
},
{
"epoch": 0.46189523174179614,
"grad_norm": 0.9275230765342712,
"learning_rate": 0.00011724997071794722,
"loss": 0.5453,
"step": 14540
},
{
"epoch": 0.46253057593951524,
"grad_norm": 0.8049765825271606,
"learning_rate": 0.00011704724233528997,
"loss": 0.5237,
"step": 14560
},
{
"epoch": 0.46316592013723434,
"grad_norm": 0.9411914348602295,
"learning_rate": 0.00011684444176773994,
"loss": 0.5529,
"step": 14580
},
{
"epoch": 0.46380126433495344,
"grad_norm": 1.0553874969482422,
"learning_rate": 0.0001166415698740364,
"loss": 0.5107,
"step": 14600
},
{
"epoch": 0.4644366085326726,
"grad_norm": 1.1203105449676514,
"learning_rate": 0.00011643862751322072,
"loss": 0.5503,
"step": 14620
},
{
"epoch": 0.4650719527303917,
"grad_norm": 0.9356998801231384,
"learning_rate": 0.00011623561554463263,
"loss": 0.5388,
"step": 14640
},
{
"epoch": 0.4657072969281108,
"grad_norm": 1.0603325366973877,
"learning_rate": 0.00011603253482790657,
"loss": 0.5379,
"step": 14660
},
{
"epoch": 0.4663426411258299,
"grad_norm": 0.7650070786476135,
"learning_rate": 0.00011582938622296818,
"loss": 0.5175,
"step": 14680
},
{
"epoch": 0.466977985323549,
"grad_norm": 1.1926647424697876,
"learning_rate": 0.00011562617059003044,
"loss": 0.5558,
"step": 14700
},
{
"epoch": 0.46761332952126816,
"grad_norm": 0.9466400742530823,
"learning_rate": 0.00011542288878959025,
"loss": 0.5288,
"step": 14720
},
{
"epoch": 0.46824867371898726,
"grad_norm": 1.036163091659546,
"learning_rate": 0.0001152195416824247,
"loss": 0.5322,
"step": 14740
},
{
"epoch": 0.46888401791670636,
"grad_norm": 0.8458572626113892,
"learning_rate": 0.00011501613012958729,
"loss": 0.5358,
"step": 14760
},
{
"epoch": 0.46951936211442546,
"grad_norm": 0.789557695388794,
"learning_rate": 0.00011481265499240455,
"loss": 0.5067,
"step": 14780
},
{
"epoch": 0.4701547063121446,
"grad_norm": 0.845371425151825,
"learning_rate": 0.00011460911713247222,
"loss": 0.5433,
"step": 14800
},
{
"epoch": 0.4707900505098637,
"grad_norm": 0.8561549782752991,
"learning_rate": 0.00011440551741165156,
"loss": 0.5362,
"step": 14820
},
{
"epoch": 0.4714253947075828,
"grad_norm": 0.921575665473938,
"learning_rate": 0.00011420185669206582,
"loss": 0.5093,
"step": 14840
},
{
"epoch": 0.4720607389053019,
"grad_norm": 0.9392147660255432,
"learning_rate": 0.0001139981358360966,
"loss": 0.5419,
"step": 14860
},
{
"epoch": 0.4726960831030211,
"grad_norm": 0.859464168548584,
"learning_rate": 0.00011379435570638002,
"loss": 0.5329,
"step": 14880
},
{
"epoch": 0.4733314273007402,
"grad_norm": 0.9370890259742737,
"learning_rate": 0.00011359051716580331,
"loss": 0.516,
"step": 14900
},
{
"epoch": 0.4739667714984593,
"grad_norm": 0.8993077278137207,
"learning_rate": 0.00011338662107750098,
"loss": 0.4785,
"step": 14920
},
{
"epoch": 0.4746021156961784,
"grad_norm": 0.7652683854103088,
"learning_rate": 0.00011318266830485119,
"loss": 0.5348,
"step": 14940
},
{
"epoch": 0.47523745989389754,
"grad_norm": 1.0513384342193604,
"learning_rate": 0.00011297865971147217,
"loss": 0.5181,
"step": 14960
},
{
"epoch": 0.47587280409161664,
"grad_norm": 0.8159809112548828,
"learning_rate": 0.00011277459616121851,
"loss": 0.5368,
"step": 14980
},
{
"epoch": 0.47650814828933574,
"grad_norm": 1.0844529867172241,
"learning_rate": 0.00011257047851817748,
"loss": 0.5497,
"step": 15000
},
{
"epoch": 0.47650814828933574,
"eval_loss": 0.4893677234649658,
"eval_runtime": 45.7511,
"eval_samples_per_second": 59.081,
"eval_steps_per_second": 29.551,
"step": 15000
},
{
"epoch": 0.47714349248705484,
"grad_norm": 0.7700105309486389,
"learning_rate": 0.0001123663076466655,
"loss": 0.5354,
"step": 15020
},
{
"epoch": 0.477778836684774,
"grad_norm": 0.872631847858429,
"learning_rate": 0.0001121620844112242,
"loss": 0.5243,
"step": 15040
},
{
"epoch": 0.4784141808824931,
"grad_norm": 1.1037932634353638,
"learning_rate": 0.0001119578096766171,
"loss": 0.5412,
"step": 15060
},
{
"epoch": 0.4790495250802122,
"grad_norm": 0.9620169997215271,
"learning_rate": 0.00011175348430782579,
"loss": 0.5137,
"step": 15080
},
{
"epoch": 0.4796848692779313,
"grad_norm": 0.7465859055519104,
"learning_rate": 0.0001115491091700461,
"loss": 0.5213,
"step": 15100
},
{
"epoch": 0.48032021347565046,
"grad_norm": 0.7287941575050354,
"learning_rate": 0.00011134468512868479,
"loss": 0.5184,
"step": 15120
},
{
"epoch": 0.48095555767336956,
"grad_norm": 0.9596436023712158,
"learning_rate": 0.00011114021304935558,
"loss": 0.5471,
"step": 15140
},
{
"epoch": 0.48159090187108866,
"grad_norm": 0.869172215461731,
"learning_rate": 0.00011093569379787563,
"loss": 0.5074,
"step": 15160
},
{
"epoch": 0.48222624606880776,
"grad_norm": 1.0704097747802734,
"learning_rate": 0.00011073112824026191,
"loss": 0.544,
"step": 15180
},
{
"epoch": 0.48286159026652686,
"grad_norm": 0.896312415599823,
"learning_rate": 0.00011052651724272736,
"loss": 0.5261,
"step": 15200
},
{
"epoch": 0.483496934464246,
"grad_norm": 1.010606288909912,
"learning_rate": 0.00011032186167167741,
"loss": 0.5112,
"step": 15220
},
{
"epoch": 0.4841322786619651,
"grad_norm": 0.980171263217926,
"learning_rate": 0.00011011716239370625,
"loss": 0.5414,
"step": 15240
},
{
"epoch": 0.4847676228596842,
"grad_norm": 0.7417489290237427,
"learning_rate": 0.00010991242027559301,
"loss": 0.5019,
"step": 15260
},
{
"epoch": 0.4854029670574033,
"grad_norm": 0.9232955574989319,
"learning_rate": 0.0001097076361842984,
"loss": 0.5293,
"step": 15280
},
{
"epoch": 0.4860383112551225,
"grad_norm": 0.8391673564910889,
"learning_rate": 0.00010950281098696072,
"loss": 0.5397,
"step": 15300
},
{
"epoch": 0.4866736554528416,
"grad_norm": 1.0795869827270508,
"learning_rate": 0.00010929794555089239,
"loss": 0.5293,
"step": 15320
},
{
"epoch": 0.4873089996505607,
"grad_norm": 0.9179370403289795,
"learning_rate": 0.00010909304074357627,
"loss": 0.5089,
"step": 15340
},
{
"epoch": 0.4879443438482798,
"grad_norm": 0.9346722960472107,
"learning_rate": 0.0001088880974326618,
"loss": 0.4981,
"step": 15360
},
{
"epoch": 0.48857968804599894,
"grad_norm": 0.9835326075553894,
"learning_rate": 0.00010868311648596157,
"loss": 0.52,
"step": 15380
},
{
"epoch": 0.48921503224371804,
"grad_norm": 0.8709509968757629,
"learning_rate": 0.0001084780987714475,
"loss": 0.5507,
"step": 15400
},
{
"epoch": 0.48985037644143714,
"grad_norm": 1.0125563144683838,
"learning_rate": 0.00010827304515724719,
"loss": 0.5522,
"step": 15420
},
{
"epoch": 0.49048572063915624,
"grad_norm": 0.9726683497428894,
"learning_rate": 0.00010806795651164026,
"loss": 0.5195,
"step": 15440
},
{
"epoch": 0.4911210648368754,
"grad_norm": 0.9348143935203552,
"learning_rate": 0.0001078628337030547,
"loss": 0.5376,
"step": 15460
},
{
"epoch": 0.4917564090345945,
"grad_norm": 1.247452735900879,
"learning_rate": 0.00010765767760006308,
"loss": 0.5238,
"step": 15480
},
{
"epoch": 0.4923917532323136,
"grad_norm": 1.2584036588668823,
"learning_rate": 0.00010745248907137906,
"loss": 0.539,
"step": 15500
},
{
"epoch": 0.4930270974300327,
"grad_norm": 0.9565659165382385,
"learning_rate": 0.00010724726898585353,
"loss": 0.546,
"step": 15520
},
{
"epoch": 0.49366244162775186,
"grad_norm": 0.9646620750427246,
"learning_rate": 0.000107042018212471,
"loss": 0.5094,
"step": 15540
},
{
"epoch": 0.49429778582547096,
"grad_norm": 0.7045026421546936,
"learning_rate": 0.00010683673762034594,
"loss": 0.5708,
"step": 15560
},
{
"epoch": 0.49493313002319006,
"grad_norm": 1.1588184833526611,
"learning_rate": 0.00010663142807871911,
"loss": 0.5681,
"step": 15580
},
{
"epoch": 0.49556847422090916,
"grad_norm": 0.8272905349731445,
"learning_rate": 0.00010642609045695382,
"loss": 0.5239,
"step": 15600
},
{
"epoch": 0.4962038184186283,
"grad_norm": 0.9670738577842712,
"learning_rate": 0.00010622072562453234,
"loss": 0.486,
"step": 15620
},
{
"epoch": 0.4968391626163474,
"grad_norm": 0.8635004162788391,
"learning_rate": 0.00010601533445105205,
"loss": 0.5419,
"step": 15640
},
{
"epoch": 0.4974745068140665,
"grad_norm": 1.0769212245941162,
"learning_rate": 0.00010580991780622196,
"loss": 0.5252,
"step": 15660
},
{
"epoch": 0.4981098510117856,
"grad_norm": 0.9688665270805359,
"learning_rate": 0.00010560447655985894,
"loss": 0.5559,
"step": 15680
},
{
"epoch": 0.4987451952095048,
"grad_norm": 0.9587375521659851,
"learning_rate": 0.00010539901158188398,
"loss": 0.5136,
"step": 15700
},
{
"epoch": 0.4993805394072239,
"grad_norm": 0.870891273021698,
"learning_rate": 0.0001051935237423186,
"loss": 0.5274,
"step": 15720
},
{
"epoch": 0.500015883604943,
"grad_norm": 1.1741816997528076,
"learning_rate": 0.00010498801391128108,
"loss": 0.5274,
"step": 15740
},
{
"epoch": 0.5006512278026621,
"grad_norm": 1.074429988861084,
"learning_rate": 0.00010478248295898285,
"loss": 0.5049,
"step": 15760
},
{
"epoch": 0.5012865720003812,
"grad_norm": 0.7894431352615356,
"learning_rate": 0.00010457693175572483,
"loss": 0.5141,
"step": 15780
},
{
"epoch": 0.5019219161981003,
"grad_norm": 0.8638029098510742,
"learning_rate": 0.00010437136117189356,
"loss": 0.5053,
"step": 15800
},
{
"epoch": 0.5025572603958194,
"grad_norm": 0.9749894142150879,
"learning_rate": 0.00010416577207795776,
"loss": 0.5319,
"step": 15820
},
{
"epoch": 0.5031926045935385,
"grad_norm": 0.9491709470748901,
"learning_rate": 0.00010396016534446451,
"loss": 0.4968,
"step": 15840
},
{
"epoch": 0.5038279487912577,
"grad_norm": 0.880732536315918,
"learning_rate": 0.00010375454184203555,
"loss": 0.5292,
"step": 15860
},
{
"epoch": 0.5044632929889767,
"grad_norm": 1.22807776927948,
"learning_rate": 0.00010354890244136361,
"loss": 0.5228,
"step": 15880
},
{
"epoch": 0.5050986371866959,
"grad_norm": 0.8567366003990173,
"learning_rate": 0.00010334324801320881,
"loss": 0.558,
"step": 15900
},
{
"epoch": 0.505733981384415,
"grad_norm": 0.8203198909759521,
"learning_rate": 0.00010313757942839482,
"loss": 0.5061,
"step": 15920
},
{
"epoch": 0.5063693255821341,
"grad_norm": 0.9894897937774658,
"learning_rate": 0.00010293189755780535,
"loss": 0.5322,
"step": 15940
},
{
"epoch": 0.5070046697798533,
"grad_norm": 1.0645695924758911,
"learning_rate": 0.0001027262032723803,
"loss": 0.536,
"step": 15960
},
{
"epoch": 0.5076400139775723,
"grad_norm": 0.9940254092216492,
"learning_rate": 0.0001025204974431121,
"loss": 0.5211,
"step": 15980
},
{
"epoch": 0.5082753581752915,
"grad_norm": 0.7856065630912781,
"learning_rate": 0.00010231478094104216,
"loss": 0.5137,
"step": 16000
},
{
"epoch": 0.5082753581752915,
"eval_loss": 0.48191481828689575,
"eval_runtime": 44.2211,
"eval_samples_per_second": 61.125,
"eval_steps_per_second": 30.574,
"step": 16000
},
{
"epoch": 0.5089107023730106,
"grad_norm": 0.9363443851470947,
"learning_rate": 0.00010210905463725703,
"loss": 0.5426,
"step": 16020
},
{
"epoch": 0.5095460465707297,
"grad_norm": 0.8720065355300903,
"learning_rate": 0.0001019033194028848,
"loss": 0.525,
"step": 16040
},
{
"epoch": 0.5101813907684488,
"grad_norm": 0.9192999005317688,
"learning_rate": 0.00010169757610909131,
"loss": 0.5265,
"step": 16060
},
{
"epoch": 0.510816734966168,
"grad_norm": 1.089529037475586,
"learning_rate": 0.00010149182562707657,
"loss": 0.5148,
"step": 16080
},
{
"epoch": 0.511452079163887,
"grad_norm": 0.8161883354187012,
"learning_rate": 0.00010128606882807106,
"loss": 0.5441,
"step": 16100
},
{
"epoch": 0.5120874233616062,
"grad_norm": 0.8635348081588745,
"learning_rate": 0.00010108030658333192,
"loss": 0.4981,
"step": 16120
},
{
"epoch": 0.5127227675593252,
"grad_norm": 0.9366866946220398,
"learning_rate": 0.00010087453976413943,
"loss": 0.5155,
"step": 16140
},
{
"epoch": 0.5133581117570444,
"grad_norm": 0.8161008954048157,
"learning_rate": 0.00010066876924179321,
"loss": 0.5178,
"step": 16160
},
{
"epoch": 0.5139934559547635,
"grad_norm": 1.2926280498504639,
"learning_rate": 0.00010046299588760855,
"loss": 0.5409,
"step": 16180
},
{
"epoch": 0.5146288001524826,
"grad_norm": 0.9963902235031128,
"learning_rate": 0.00010025722057291273,
"loss": 0.514,
"step": 16200
},
{
"epoch": 0.5152641443502017,
"grad_norm": 0.7572094202041626,
"learning_rate": 0.0001000514441690414,
"loss": 0.5142,
"step": 16220
},
{
"epoch": 0.5158994885479208,
"grad_norm": 0.7842695713043213,
"learning_rate": 9.984566754733471e-05,
"loss": 0.5419,
"step": 16240
},
{
"epoch": 0.5165348327456399,
"grad_norm": 0.8259790539741516,
"learning_rate": 9.96398915791338e-05,
"loss": 0.5053,
"step": 16260
},
{
"epoch": 0.5171701769433591,
"grad_norm": 0.7848758697509766,
"learning_rate": 9.943411713577707e-05,
"loss": 0.5129,
"step": 16280
},
{
"epoch": 0.5178055211410781,
"grad_norm": 0.9001737236976624,
"learning_rate": 9.922834508859636e-05,
"loss": 0.5095,
"step": 16300
},
{
"epoch": 0.5184408653387973,
"grad_norm": 1.2547895908355713,
"learning_rate": 9.90225763089135e-05,
"loss": 0.5402,
"step": 16320
},
{
"epoch": 0.5190762095365165,
"grad_norm": 1.0412747859954834,
"learning_rate": 9.881681166803634e-05,
"loss": 0.5039,
"step": 16340
},
{
"epoch": 0.5197115537342355,
"grad_norm": 0.8408613204956055,
"learning_rate": 9.861105203725533e-05,
"loss": 0.5256,
"step": 16360
},
{
"epoch": 0.5203468979319547,
"grad_norm": 0.7325016856193542,
"learning_rate": 9.840529828783965e-05,
"loss": 0.5055,
"step": 16380
},
{
"epoch": 0.5209822421296737,
"grad_norm": 1.3417218923568726,
"learning_rate": 9.819955129103355e-05,
"loss": 0.5336,
"step": 16400
},
{
"epoch": 0.5216175863273929,
"grad_norm": 0.8016658425331116,
"learning_rate": 9.799381191805272e-05,
"loss": 0.5285,
"step": 16420
},
{
"epoch": 0.522252930525112,
"grad_norm": 0.7678484916687012,
"learning_rate": 9.778808104008059e-05,
"loss": 0.5243,
"step": 16440
},
{
"epoch": 0.5228882747228311,
"grad_norm": 1.0348572731018066,
"learning_rate": 9.760293123314227e-05,
"loss": 0.5305,
"step": 16460
},
{
"epoch": 0.5235236189205502,
"grad_norm": 0.891635537147522,
"learning_rate": 9.739721889566509e-05,
"loss": 0.5258,
"step": 16480
},
{
"epoch": 0.5241589631182694,
"grad_norm": 0.9525818824768066,
"learning_rate": 9.719151757941184e-05,
"loss": 0.5405,
"step": 16500
},
{
"epoch": 0.5247943073159884,
"grad_norm": 0.8067079186439514,
"learning_rate": 9.698582815540476e-05,
"loss": 0.5058,
"step": 16520
},
{
"epoch": 0.5254296515137076,
"grad_norm": 0.8525674939155579,
"learning_rate": 9.678015149461577e-05,
"loss": 0.5429,
"step": 16540
},
{
"epoch": 0.5260649957114266,
"grad_norm": 0.9794461727142334,
"learning_rate": 9.65744884679627e-05,
"loss": 0.5106,
"step": 16560
},
{
"epoch": 0.5267003399091458,
"grad_norm": 0.8107161521911621,
"learning_rate": 9.636883994630567e-05,
"loss": 0.5124,
"step": 16580
},
{
"epoch": 0.5273356841068649,
"grad_norm": 0.8728024959564209,
"learning_rate": 9.61632068004434e-05,
"loss": 0.5483,
"step": 16600
},
{
"epoch": 0.527971028304584,
"grad_norm": 1.0132850408554077,
"learning_rate": 9.595758990110948e-05,
"loss": 0.55,
"step": 16620
},
{
"epoch": 0.5286063725023031,
"grad_norm": 1.0854065418243408,
"learning_rate": 9.575199011896869e-05,
"loss": 0.5022,
"step": 16640
},
{
"epoch": 0.5292417167000223,
"grad_norm": 1.06479012966156,
"learning_rate": 9.555668697368233e-05,
"loss": 0.4932,
"step": 16660
},
{
"epoch": 0.5298770608977413,
"grad_norm": 1.1619220972061157,
"learning_rate": 9.535112307403999e-05,
"loss": 0.5377,
"step": 16680
},
{
"epoch": 0.5305124050954605,
"grad_norm": 1.1277661323547363,
"learning_rate": 9.514557885961573e-05,
"loss": 0.5267,
"step": 16700
},
{
"epoch": 0.5311477492931795,
"grad_norm": 1.0196537971496582,
"learning_rate": 9.494005520076655e-05,
"loss": 0.5203,
"step": 16720
},
{
"epoch": 0.5317830934908987,
"grad_norm": 0.9534218907356262,
"learning_rate": 9.473455296776239e-05,
"loss": 0.5177,
"step": 16740
},
{
"epoch": 0.5324184376886179,
"grad_norm": 0.9330717325210571,
"learning_rate": 9.45290730307826e-05,
"loss": 0.55,
"step": 16760
},
{
"epoch": 0.5330537818863369,
"grad_norm": 0.9290218949317932,
"learning_rate": 9.43236162599119e-05,
"loss": 0.5301,
"step": 16780
},
{
"epoch": 0.533689126084056,
"grad_norm": 0.9842971563339233,
"learning_rate": 9.411818352513715e-05,
"loss": 0.4928,
"step": 16800
},
{
"epoch": 0.5343244702817751,
"grad_norm": 0.9267326593399048,
"learning_rate": 9.391277569634329e-05,
"loss": 0.5443,
"step": 16820
},
{
"epoch": 0.5349598144794943,
"grad_norm": 0.9270855784416199,
"learning_rate": 9.370739364330982e-05,
"loss": 0.5132,
"step": 16840
},
{
"epoch": 0.5355951586772134,
"grad_norm": 0.9786942601203918,
"learning_rate": 9.35020382357071e-05,
"loss": 0.5229,
"step": 16860
},
{
"epoch": 0.5362305028749325,
"grad_norm": 0.8397322297096252,
"learning_rate": 9.329671034309269e-05,
"loss": 0.5248,
"step": 16880
},
{
"epoch": 0.5368658470726516,
"grad_norm": 0.9696868062019348,
"learning_rate": 9.30914108349076e-05,
"loss": 0.5635,
"step": 16900
},
{
"epoch": 0.5375011912703708,
"grad_norm": 1.1376127004623413,
"learning_rate": 9.28861405804727e-05,
"loss": 0.548,
"step": 16920
},
{
"epoch": 0.5381365354680898,
"grad_norm": 0.9028751254081726,
"learning_rate": 9.268090044898489e-05,
"loss": 0.5253,
"step": 16940
},
{
"epoch": 0.538771879665809,
"grad_norm": 0.7549586296081543,
"learning_rate": 9.247569130951365e-05,
"loss": 0.5119,
"step": 16960
},
{
"epoch": 0.539407223863528,
"grad_norm": 1.002920150756836,
"learning_rate": 9.227051403099715e-05,
"loss": 0.5383,
"step": 16980
},
{
"epoch": 0.5400425680612472,
"grad_norm": 0.7857794761657715,
"learning_rate": 9.206536948223862e-05,
"loss": 0.4943,
"step": 17000
},
{
"epoch": 0.5400425680612472,
"eval_loss": 0.47516322135925293,
"eval_runtime": 44.9681,
"eval_samples_per_second": 60.109,
"eval_steps_per_second": 30.066,
"step": 17000
},
{
"epoch": 0.5406779122589663,
"grad_norm": 0.8384699821472168,
"learning_rate": 9.186025853190276e-05,
"loss": 0.5005,
"step": 17020
},
{
"epoch": 0.5413132564566854,
"grad_norm": 0.859467089176178,
"learning_rate": 9.1655182048512e-05,
"loss": 0.486,
"step": 17040
},
{
"epoch": 0.5419486006544045,
"grad_norm": 0.9178836345672607,
"learning_rate": 9.145014090044276e-05,
"loss": 0.4866,
"step": 17060
},
{
"epoch": 0.5425839448521237,
"grad_norm": 1.5116227865219116,
"learning_rate": 9.12451359559219e-05,
"loss": 0.5103,
"step": 17080
},
{
"epoch": 0.5432192890498427,
"grad_norm": 0.8251123428344727,
"learning_rate": 9.104016808302297e-05,
"loss": 0.5403,
"step": 17100
},
{
"epoch": 0.5438546332475619,
"grad_norm": 0.8845348358154297,
"learning_rate": 9.08352381496625e-05,
"loss": 0.5295,
"step": 17120
},
{
"epoch": 0.5444899774452809,
"grad_norm": 0.8761606812477112,
"learning_rate": 9.063034702359643e-05,
"loss": 0.5175,
"step": 17140
},
{
"epoch": 0.5451253216430001,
"grad_norm": 0.8992062211036682,
"learning_rate": 9.042549557241629e-05,
"loss": 0.5211,
"step": 17160
},
{
"epoch": 0.5457606658407193,
"grad_norm": 1.0609464645385742,
"learning_rate": 9.022068466354573e-05,
"loss": 0.5231,
"step": 17180
},
{
"epoch": 0.5463960100384383,
"grad_norm": 1.1660939455032349,
"learning_rate": 9.001591516423664e-05,
"loss": 0.5097,
"step": 17200
},
{
"epoch": 0.5470313542361575,
"grad_norm": 0.8982824683189392,
"learning_rate": 8.981118794156556e-05,
"loss": 0.499,
"step": 17220
},
{
"epoch": 0.5476666984338765,
"grad_norm": 0.9423658847808838,
"learning_rate": 8.960650386243009e-05,
"loss": 0.5023,
"step": 17240
},
{
"epoch": 0.5483020426315957,
"grad_norm": 0.781741738319397,
"learning_rate": 8.940186379354505e-05,
"loss": 0.5098,
"step": 17260
},
{
"epoch": 0.5489373868293148,
"grad_norm": 0.9678505063056946,
"learning_rate": 8.919726860143895e-05,
"loss": 0.5005,
"step": 17280
},
{
"epoch": 0.5495727310270339,
"grad_norm": 0.9400302171707153,
"learning_rate": 8.899271915245028e-05,
"loss": 0.537,
"step": 17300
},
{
"epoch": 0.550208075224753,
"grad_norm": 0.8072425127029419,
"learning_rate": 8.878821631272384e-05,
"loss": 0.5073,
"step": 17320
},
{
"epoch": 0.5508434194224722,
"grad_norm": 0.9000498652458191,
"learning_rate": 8.858376094820701e-05,
"loss": 0.5014,
"step": 17340
},
{
"epoch": 0.5514787636201912,
"grad_norm": 0.9222893118858337,
"learning_rate": 8.837935392464621e-05,
"loss": 0.5216,
"step": 17360
},
{
"epoch": 0.5521141078179104,
"grad_norm": 0.8468360304832458,
"learning_rate": 8.817499610758316e-05,
"loss": 0.5282,
"step": 17380
},
{
"epoch": 0.5527494520156294,
"grad_norm": 0.7120311260223389,
"learning_rate": 8.797068836235116e-05,
"loss": 0.5277,
"step": 17400
},
{
"epoch": 0.5533847962133486,
"grad_norm": 0.880155622959137,
"learning_rate": 8.776643155407154e-05,
"loss": 0.523,
"step": 17420
},
{
"epoch": 0.5540201404110677,
"grad_norm": 1.023587703704834,
"learning_rate": 8.756222654764996e-05,
"loss": 0.508,
"step": 17440
},
{
"epoch": 0.5546554846087868,
"grad_norm": 0.8903362154960632,
"learning_rate": 8.735807420777262e-05,
"loss": 0.5165,
"step": 17460
},
{
"epoch": 0.5552908288065059,
"grad_norm": 0.7317694425582886,
"learning_rate": 8.715397539890287e-05,
"loss": 0.4672,
"step": 17480
},
{
"epoch": 0.5559261730042251,
"grad_norm": 1.0228464603424072,
"learning_rate": 8.694993098527723e-05,
"loss": 0.5112,
"step": 17500
},
{
"epoch": 0.5565615172019441,
"grad_norm": 0.7797629237174988,
"learning_rate": 8.674594183090199e-05,
"loss": 0.477,
"step": 17520
},
{
"epoch": 0.5571968613996633,
"grad_norm": 0.8488342761993408,
"learning_rate": 8.654200879954945e-05,
"loss": 0.4993,
"step": 17540
},
{
"epoch": 0.5578322055973823,
"grad_norm": 0.8529194593429565,
"learning_rate": 8.63381327547542e-05,
"loss": 0.5293,
"step": 17560
},
{
"epoch": 0.5584675497951015,
"grad_norm": 0.9537157416343689,
"learning_rate": 8.613431455980955e-05,
"loss": 0.5047,
"step": 17580
},
{
"epoch": 0.5591028939928206,
"grad_norm": 0.8697558045387268,
"learning_rate": 8.593055507776393e-05,
"loss": 0.5293,
"step": 17600
},
{
"epoch": 0.5597382381905397,
"grad_norm": 0.8306463360786438,
"learning_rate": 8.5726855171417e-05,
"loss": 0.5075,
"step": 17620
},
{
"epoch": 0.5603735823882589,
"grad_norm": 0.8880159258842468,
"learning_rate": 8.55232157033163e-05,
"loss": 0.5149,
"step": 17640
},
{
"epoch": 0.561008926585978,
"grad_norm": 0.9390746355056763,
"learning_rate": 8.531963753575334e-05,
"loss": 0.5196,
"step": 17660
},
{
"epoch": 0.561644270783697,
"grad_norm": 0.968285322189331,
"learning_rate": 8.511612153076015e-05,
"loss": 0.5229,
"step": 17680
},
{
"epoch": 0.5622796149814162,
"grad_norm": 0.9114767909049988,
"learning_rate": 8.491266855010548e-05,
"loss": 0.5008,
"step": 17700
},
{
"epoch": 0.5629149591791353,
"grad_norm": 0.9089644551277161,
"learning_rate": 8.470927945529123e-05,
"loss": 0.4848,
"step": 17720
},
{
"epoch": 0.5635503033768544,
"grad_norm": 0.7264979481697083,
"learning_rate": 8.450595510754877e-05,
"loss": 0.5155,
"step": 17740
},
{
"epoch": 0.5641856475745736,
"grad_norm": 0.9070448875427246,
"learning_rate": 8.430269636783534e-05,
"loss": 0.524,
"step": 17760
},
{
"epoch": 0.5648209917722926,
"grad_norm": 0.9725968241691589,
"learning_rate": 8.40995040968303e-05,
"loss": 0.4925,
"step": 17780
},
{
"epoch": 0.5654563359700118,
"grad_norm": 0.8976007103919983,
"learning_rate": 8.389637915493162e-05,
"loss": 0.4937,
"step": 17800
},
{
"epoch": 0.5660916801677308,
"grad_norm": 0.9926420450210571,
"learning_rate": 8.369332240225214e-05,
"loss": 0.5181,
"step": 17820
},
{
"epoch": 0.56672702436545,
"grad_norm": 0.852676272392273,
"learning_rate": 8.349033469861598e-05,
"loss": 0.5175,
"step": 17840
},
{
"epoch": 0.5673623685631691,
"grad_norm": 0.8739320635795593,
"learning_rate": 8.328741690355487e-05,
"loss": 0.4805,
"step": 17860
},
{
"epoch": 0.5679977127608882,
"grad_norm": 0.9660511016845703,
"learning_rate": 8.308456987630449e-05,
"loss": 0.5063,
"step": 17880
},
{
"epoch": 0.5686330569586073,
"grad_norm": 0.9321526288986206,
"learning_rate": 8.288179447580088e-05,
"loss": 0.4994,
"step": 17900
},
{
"epoch": 0.5692684011563265,
"grad_norm": 1.0359587669372559,
"learning_rate": 8.267909156067685e-05,
"loss": 0.5279,
"step": 17920
},
{
"epoch": 0.5699037453540455,
"grad_norm": 0.9722701907157898,
"learning_rate": 8.247646198925813e-05,
"loss": 0.5061,
"step": 17940
},
{
"epoch": 0.5705390895517647,
"grad_norm": 0.854860782623291,
"learning_rate": 8.227390661956006e-05,
"loss": 0.4827,
"step": 17960
},
{
"epoch": 0.5711744337494837,
"grad_norm": 0.8997724652290344,
"learning_rate": 8.207142630928362e-05,
"loss": 0.4978,
"step": 17980
},
{
"epoch": 0.5718097779472029,
"grad_norm": 0.9234896898269653,
"learning_rate": 8.186902191581205e-05,
"loss": 0.4982,
"step": 18000
},
{
"epoch": 0.5718097779472029,
"eval_loss": 0.469827800989151,
"eval_runtime": 44.8258,
"eval_samples_per_second": 60.3,
"eval_steps_per_second": 30.161,
"step": 18000
},
{
"epoch": 0.572445122144922,
"grad_norm": 0.8457797169685364,
"learning_rate": 8.166669429620712e-05,
"loss": 0.5263,
"step": 18020
},
{
"epoch": 0.5730804663426411,
"grad_norm": 0.8909218907356262,
"learning_rate": 8.146444430720545e-05,
"loss": 0.5045,
"step": 18040
},
{
"epoch": 0.5737158105403602,
"grad_norm": 0.950072705745697,
"learning_rate": 8.126227280521503e-05,
"loss": 0.5247,
"step": 18060
},
{
"epoch": 0.5743511547380794,
"grad_norm": 0.9507225751876831,
"learning_rate": 8.106018064631148e-05,
"loss": 0.4851,
"step": 18080
},
{
"epoch": 0.5749864989357985,
"grad_norm": 1.0232789516448975,
"learning_rate": 8.085816868623436e-05,
"loss": 0.5457,
"step": 18100
},
{
"epoch": 0.5756218431335176,
"grad_norm": 1.0967813730239868,
"learning_rate": 8.065623778038377e-05,
"loss": 0.52,
"step": 18120
},
{
"epoch": 0.5762571873312367,
"grad_norm": 0.7866876125335693,
"learning_rate": 8.045438878381649e-05,
"loss": 0.5117,
"step": 18140
},
{
"epoch": 0.5768925315289558,
"grad_norm": 0.9325518012046814,
"learning_rate": 8.025262255124248e-05,
"loss": 0.5415,
"step": 18160
},
{
"epoch": 0.577527875726675,
"grad_norm": 0.8899424076080322,
"learning_rate": 8.005093993702133e-05,
"loss": 0.4947,
"step": 18180
},
{
"epoch": 0.578163219924394,
"grad_norm": 1.0050842761993408,
"learning_rate": 7.984934179515843e-05,
"loss": 0.4863,
"step": 18200
},
{
"epoch": 0.5787985641221132,
"grad_norm": 0.836564302444458,
"learning_rate": 7.964782897930158e-05,
"loss": 0.5055,
"step": 18220
},
{
"epoch": 0.5794339083198322,
"grad_norm": 1.032029628753662,
"learning_rate": 7.944640234273724e-05,
"loss": 0.4919,
"step": 18240
},
{
"epoch": 0.5800692525175514,
"grad_norm": 0.854015588760376,
"learning_rate": 7.92450627383869e-05,
"loss": 0.5108,
"step": 18260
},
{
"epoch": 0.5807045967152705,
"grad_norm": 1.0629216432571411,
"learning_rate": 7.904381101880364e-05,
"loss": 0.5312,
"step": 18280
},
{
"epoch": 0.5813399409129896,
"grad_norm": 0.8146398067474365,
"learning_rate": 7.884264803616827e-05,
"loss": 0.5203,
"step": 18300
},
{
"epoch": 0.5819752851107087,
"grad_norm": 1.1307437419891357,
"learning_rate": 7.864157464228593e-05,
"loss": 0.5325,
"step": 18320
},
{
"epoch": 0.5826106293084279,
"grad_norm": 0.9609930515289307,
"learning_rate": 7.844059168858241e-05,
"loss": 0.5034,
"step": 18340
},
{
"epoch": 0.5832459735061469,
"grad_norm": 0.8615232110023499,
"learning_rate": 7.823970002610048e-05,
"loss": 0.522,
"step": 18360
},
{
"epoch": 0.5838813177038661,
"grad_norm": 1.014160394668579,
"learning_rate": 7.803890050549641e-05,
"loss": 0.5104,
"step": 18380
},
{
"epoch": 0.5845166619015851,
"grad_norm": 1.015424370765686,
"learning_rate": 7.78381939770363e-05,
"loss": 0.4887,
"step": 18400
},
{
"epoch": 0.5851520060993043,
"grad_norm": 1.0072382688522339,
"learning_rate": 7.763758129059243e-05,
"loss": 0.5242,
"step": 18420
},
{
"epoch": 0.5857873502970234,
"grad_norm": 1.122096300125122,
"learning_rate": 7.743706329563971e-05,
"loss": 0.5408,
"step": 18440
},
{
"epoch": 0.5864226944947425,
"grad_norm": 0.8347269296646118,
"learning_rate": 7.723664084125218e-05,
"loss": 0.5112,
"step": 18460
},
{
"epoch": 0.5870580386924616,
"grad_norm": 0.9214980006217957,
"learning_rate": 7.703631477609926e-05,
"loss": 0.5111,
"step": 18480
},
{
"epoch": 0.5876933828901808,
"grad_norm": 0.8427157402038574,
"learning_rate": 7.683608594844218e-05,
"loss": 0.5199,
"step": 18500
},
{
"epoch": 0.5883287270878998,
"grad_norm": 0.8485844731330872,
"learning_rate": 7.663595520613054e-05,
"loss": 0.5193,
"step": 18520
},
{
"epoch": 0.588964071285619,
"grad_norm": 0.8761444687843323,
"learning_rate": 7.643592339659848e-05,
"loss": 0.5044,
"step": 18540
},
{
"epoch": 0.589599415483338,
"grad_norm": 0.9373889565467834,
"learning_rate": 7.623599136686133e-05,
"loss": 0.493,
"step": 18560
},
{
"epoch": 0.5902347596810572,
"grad_norm": 0.9052358269691467,
"learning_rate": 7.603615996351184e-05,
"loss": 0.516,
"step": 18580
},
{
"epoch": 0.5908701038787764,
"grad_norm": 0.7757846117019653,
"learning_rate": 7.583643003271668e-05,
"loss": 0.5043,
"step": 18600
},
{
"epoch": 0.5915054480764954,
"grad_norm": 0.7769386172294617,
"learning_rate": 7.563680242021285e-05,
"loss": 0.5005,
"step": 18620
},
{
"epoch": 0.5921407922742146,
"grad_norm": 0.7892422080039978,
"learning_rate": 7.543727797130413e-05,
"loss": 0.4982,
"step": 18640
},
{
"epoch": 0.5927761364719337,
"grad_norm": 1.0471646785736084,
"learning_rate": 7.524782606964114e-05,
"loss": 0.5139,
"step": 18660
},
{
"epoch": 0.5934114806696528,
"grad_norm": 0.7995429039001465,
"learning_rate": 7.504850521939017e-05,
"loss": 0.4736,
"step": 18680
},
{
"epoch": 0.5940468248673719,
"grad_norm": 0.9799679517745972,
"learning_rate": 7.484929002382169e-05,
"loss": 0.5033,
"step": 18700
},
{
"epoch": 0.594682169065091,
"grad_norm": 0.8607106804847717,
"learning_rate": 7.465018132649311e-05,
"loss": 0.498,
"step": 18720
},
{
"epoch": 0.5953175132628101,
"grad_norm": 0.9690695405006409,
"learning_rate": 7.445117997051085e-05,
"loss": 0.4898,
"step": 18740
},
{
"epoch": 0.5959528574605293,
"grad_norm": 1.331871747970581,
"learning_rate": 7.425228679852684e-05,
"loss": 0.5044,
"step": 18760
},
{
"epoch": 0.5965882016582483,
"grad_norm": 0.9347879886627197,
"learning_rate": 7.405350265273492e-05,
"loss": 0.5088,
"step": 18780
},
{
"epoch": 0.5972235458559675,
"grad_norm": 0.8495462536811829,
"learning_rate": 7.385482837486725e-05,
"loss": 0.5078,
"step": 18800
},
{
"epoch": 0.5978588900536865,
"grad_norm": 1.318202257156372,
"learning_rate": 7.365626480619081e-05,
"loss": 0.5014,
"step": 18820
},
{
"epoch": 0.5984942342514057,
"grad_norm": 1.0349724292755127,
"learning_rate": 7.345781278750368e-05,
"loss": 0.531,
"step": 18840
},
{
"epoch": 0.5991295784491248,
"grad_norm": 1.047760248184204,
"learning_rate": 7.326938745831322e-05,
"loss": 0.4925,
"step": 18860
},
{
"epoch": 0.5997649226468439,
"grad_norm": 0.874220073223114,
"learning_rate": 7.307115537865903e-05,
"loss": 0.5056,
"step": 18880
},
{
"epoch": 0.600400266844563,
"grad_norm": 0.738158106803894,
"learning_rate": 7.287303732658328e-05,
"loss": 0.4938,
"step": 18900
},
{
"epoch": 0.6010356110422822,
"grad_norm": 0.8721213936805725,
"learning_rate": 7.267503414099758e-05,
"loss": 0.5074,
"step": 18920
},
{
"epoch": 0.6016709552400012,
"grad_norm": 0.7241856455802917,
"learning_rate": 7.247714666032724e-05,
"loss": 0.5045,
"step": 18940
},
{
"epoch": 0.6023062994377204,
"grad_norm": 1.0385938882827759,
"learning_rate": 7.227937572250761e-05,
"loss": 0.5313,
"step": 18960
},
{
"epoch": 0.6029416436354395,
"grad_norm": 1.8555858135223389,
"learning_rate": 7.208172216498046e-05,
"loss": 0.4989,
"step": 18980
},
{
"epoch": 0.6035769878331586,
"grad_norm": 0.9453182816505432,
"learning_rate": 7.188418682469064e-05,
"loss": 0.5146,
"step": 19000
},
{
"epoch": 0.6035769878331586,
"eval_loss": 0.46334323287010193,
"eval_runtime": 44.8428,
"eval_samples_per_second": 60.277,
"eval_steps_per_second": 30.15,
"step": 19000
},
{
"epoch": 0.6042123320308778,
"grad_norm": 0.9362254738807678,
"learning_rate": 7.168677053808237e-05,
"loss": 0.5148,
"step": 19020
},
{
"epoch": 0.6048476762285968,
"grad_norm": 1.19162917137146,
"learning_rate": 7.148947414109572e-05,
"loss": 0.4954,
"step": 19040
},
{
"epoch": 0.605483020426316,
"grad_norm": 0.9854863286018372,
"learning_rate": 7.129229846916318e-05,
"loss": 0.5173,
"step": 19060
},
{
"epoch": 0.6061183646240351,
"grad_norm": 0.8435449600219727,
"learning_rate": 7.109524435720597e-05,
"loss": 0.5154,
"step": 19080
},
{
"epoch": 0.6067537088217542,
"grad_norm": 0.920364260673523,
"learning_rate": 7.08983126396306e-05,
"loss": 0.5092,
"step": 19100
},
{
"epoch": 0.6073890530194733,
"grad_norm": 1.2439565658569336,
"learning_rate": 7.070150415032527e-05,
"loss": 0.511,
"step": 19120
},
{
"epoch": 0.6080243972171924,
"grad_norm": 0.7429732084274292,
"learning_rate": 7.050481972265648e-05,
"loss": 0.4787,
"step": 19140
},
{
"epoch": 0.6086597414149115,
"grad_norm": 0.6966003179550171,
"learning_rate": 7.03082601894653e-05,
"loss": 0.5237,
"step": 19160
},
{
"epoch": 0.6092950856126307,
"grad_norm": 0.8211964964866638,
"learning_rate": 7.011182638306402e-05,
"loss": 0.5349,
"step": 19180
},
{
"epoch": 0.6099304298103497,
"grad_norm": 0.9803711771965027,
"learning_rate": 6.991551913523253e-05,
"loss": 0.5369,
"step": 19200
},
{
"epoch": 0.6105657740080689,
"grad_norm": 0.9161061644554138,
"learning_rate": 6.971933927721479e-05,
"loss": 0.4993,
"step": 19220
},
{
"epoch": 0.611201118205788,
"grad_norm": 0.9608227014541626,
"learning_rate": 6.952328763971537e-05,
"loss": 0.4837,
"step": 19240
},
{
"epoch": 0.6118364624035071,
"grad_norm": 0.9438381195068359,
"learning_rate": 6.932736505289592e-05,
"loss": 0.479,
"step": 19260
},
{
"epoch": 0.6124718066012262,
"grad_norm": 1.571315884590149,
"learning_rate": 6.91315723463716e-05,
"loss": 0.5417,
"step": 19280
},
{
"epoch": 0.6131071507989453,
"grad_norm": 0.8187804818153381,
"learning_rate": 6.893591034920763e-05,
"loss": 0.5189,
"step": 19300
},
{
"epoch": 0.6137424949966644,
"grad_norm": 0.7617794871330261,
"learning_rate": 6.87403798899157e-05,
"loss": 0.468,
"step": 19320
},
{
"epoch": 0.6143778391943836,
"grad_norm": 0.8723959922790527,
"learning_rate": 6.85449817964506e-05,
"loss": 0.5044,
"step": 19340
},
{
"epoch": 0.6150131833921026,
"grad_norm": 0.7760429382324219,
"learning_rate": 6.834971689620659e-05,
"loss": 0.4922,
"step": 19360
},
{
"epoch": 0.6156485275898218,
"grad_norm": 0.925581693649292,
"learning_rate": 6.815458601601392e-05,
"loss": 0.5079,
"step": 19380
},
{
"epoch": 0.6162838717875408,
"grad_norm": 0.8069369792938232,
"learning_rate": 6.795958998213535e-05,
"loss": 0.4995,
"step": 19400
},
{
"epoch": 0.61691921598526,
"grad_norm": 1.3501884937286377,
"learning_rate": 6.77647296202627e-05,
"loss": 0.4906,
"step": 19420
},
{
"epoch": 0.6175545601829792,
"grad_norm": 0.9078099131584167,
"learning_rate": 6.75700057555132e-05,
"loss": 0.4983,
"step": 19440
},
{
"epoch": 0.6181899043806982,
"grad_norm": 0.7792625427246094,
"learning_rate": 6.737541921242619e-05,
"loss": 0.4869,
"step": 19460
},
{
"epoch": 0.6188252485784174,
"grad_norm": 0.8952593803405762,
"learning_rate": 6.718097081495947e-05,
"loss": 0.4975,
"step": 19480
},
{
"epoch": 0.6194605927761365,
"grad_norm": 0.9192362427711487,
"learning_rate": 6.698666138648593e-05,
"loss": 0.5059,
"step": 19500
},
{
"epoch": 0.6200959369738556,
"grad_norm": 0.8911659121513367,
"learning_rate": 6.679249174978997e-05,
"loss": 0.5014,
"step": 19520
},
{
"epoch": 0.6207312811715747,
"grad_norm": 0.9853730201721191,
"learning_rate": 6.659846272706406e-05,
"loss": 0.4935,
"step": 19540
},
{
"epoch": 0.6213666253692938,
"grad_norm": 1.3485686779022217,
"learning_rate": 6.640457513990527e-05,
"loss": 0.5061,
"step": 19560
},
{
"epoch": 0.6220019695670129,
"grad_norm": 0.8757696747779846,
"learning_rate": 6.621082980931179e-05,
"loss": 0.4869,
"step": 19580
},
{
"epoch": 0.6226373137647321,
"grad_norm": 1.0088223218917847,
"learning_rate": 6.601722755567937e-05,
"loss": 0.5138,
"step": 19600
},
{
"epoch": 0.6232726579624511,
"grad_norm": 0.94034343957901,
"learning_rate": 6.582376919879798e-05,
"loss": 0.5159,
"step": 19620
},
{
"epoch": 0.6239080021601703,
"grad_norm": 0.834994375705719,
"learning_rate": 6.563045555784826e-05,
"loss": 0.4862,
"step": 19640
},
{
"epoch": 0.6245433463578894,
"grad_norm": 1.2617956399917603,
"learning_rate": 6.543728745139802e-05,
"loss": 0.5112,
"step": 19660
},
{
"epoch": 0.6251786905556085,
"grad_norm": 0.8542491793632507,
"learning_rate": 6.524426569739892e-05,
"loss": 0.5234,
"step": 19680
},
{
"epoch": 0.6258140347533276,
"grad_norm": 1.162606120109558,
"learning_rate": 6.505139111318277e-05,
"loss": 0.4772,
"step": 19700
},
{
"epoch": 0.6264493789510467,
"grad_norm": 1.0025289058685303,
"learning_rate": 6.48586645154583e-05,
"loss": 0.5212,
"step": 19720
},
{
"epoch": 0.6270847231487658,
"grad_norm": 1.0566537380218506,
"learning_rate": 6.466608672030763e-05,
"loss": 0.5556,
"step": 19740
},
{
"epoch": 0.627720067346485,
"grad_norm": 1.0380536317825317,
"learning_rate": 6.447365854318266e-05,
"loss": 0.4827,
"step": 19760
},
{
"epoch": 0.628355411544204,
"grad_norm": 1.0499038696289062,
"learning_rate": 6.42813807989019e-05,
"loss": 0.5316,
"step": 19780
},
{
"epoch": 0.6289907557419232,
"grad_norm": 0.7457720637321472,
"learning_rate": 6.408925430164669e-05,
"loss": 0.5055,
"step": 19800
},
{
"epoch": 0.6296260999396422,
"grad_norm": 1.2990676164627075,
"learning_rate": 6.389727986495813e-05,
"loss": 0.5068,
"step": 19820
},
{
"epoch": 0.6302614441373614,
"grad_norm": 0.9500844478607178,
"learning_rate": 6.370545830173332e-05,
"loss": 0.4889,
"step": 19840
},
{
"epoch": 0.6308967883350806,
"grad_norm": 0.7668824195861816,
"learning_rate": 6.351379042422199e-05,
"loss": 0.5314,
"step": 19860
},
{
"epoch": 0.6315321325327996,
"grad_norm": 0.9457335472106934,
"learning_rate": 6.332227704402321e-05,
"loss": 0.4898,
"step": 19880
},
{
"epoch": 0.6321674767305188,
"grad_norm": 0.8252271413803101,
"learning_rate": 6.31309189720818e-05,
"loss": 0.5045,
"step": 19900
},
{
"epoch": 0.6328028209282379,
"grad_norm": 0.9943385720252991,
"learning_rate": 6.29397170186849e-05,
"loss": 0.5243,
"step": 19920
},
{
"epoch": 0.633438165125957,
"grad_norm": 1.1582151651382446,
"learning_rate": 6.27582205051849e-05,
"loss": 0.5331,
"step": 19940
},
{
"epoch": 0.6340735093236761,
"grad_norm": 0.9436770677566528,
"learning_rate": 6.256732531103176e-05,
"loss": 0.4903,
"step": 19960
},
{
"epoch": 0.6347088535213952,
"grad_norm": 0.8253883123397827,
"learning_rate": 6.237658862190583e-05,
"loss": 0.4934,
"step": 19980
},
{
"epoch": 0.6353441977191143,
"grad_norm": 0.8770557641983032,
"learning_rate": 6.21860112454631e-05,
"loss": 0.5202,
"step": 20000
},
{
"epoch": 0.6353441977191143,
"eval_loss": 0.45828375220298767,
"eval_runtime": 44.5614,
"eval_samples_per_second": 60.658,
"eval_steps_per_second": 30.34,
"step": 20000
},
{
"epoch": 0.6359795419168335,
"grad_norm": 1.2218546867370605,
"learning_rate": 6.19955939886849e-05,
"loss": 0.5171,
"step": 20020
},
{
"epoch": 0.6366148861145525,
"grad_norm": 0.8330618143081665,
"learning_rate": 6.180533765787468e-05,
"loss": 0.4863,
"step": 20040
},
{
"epoch": 0.6372502303122717,
"grad_norm": 1.0419652462005615,
"learning_rate": 6.162474393506114e-05,
"loss": 0.5427,
"step": 20060
},
{
"epoch": 0.6378855745099908,
"grad_norm": 0.9472757577896118,
"learning_rate": 6.143480372643493e-05,
"loss": 0.5245,
"step": 20080
},
{
"epoch": 0.6385209187077099,
"grad_norm": 0.7603405117988586,
"learning_rate": 6.12450268183886e-05,
"loss": 0.4964,
"step": 20100
},
{
"epoch": 0.639156262905429,
"grad_norm": 0.8776742219924927,
"learning_rate": 6.105541401451404e-05,
"loss": 0.4966,
"step": 20120
},
{
"epoch": 0.6397916071031481,
"grad_norm": 0.8271143436431885,
"learning_rate": 6.086596611770831e-05,
"loss": 0.5119,
"step": 20140
},
{
"epoch": 0.6404269513008672,
"grad_norm": 1.1509547233581543,
"learning_rate": 6.067668393017007e-05,
"loss": 0.5031,
"step": 20160
},
{
"epoch": 0.6410622954985864,
"grad_norm": 0.8693366050720215,
"learning_rate": 6.048756825339643e-05,
"loss": 0.4986,
"step": 20180
},
{
"epoch": 0.6416976396963054,
"grad_norm": 0.949834942817688,
"learning_rate": 6.029861988817935e-05,
"loss": 0.4921,
"step": 20200
},
{
"epoch": 0.6423329838940246,
"grad_norm": 0.9004225730895996,
"learning_rate": 6.010983963460233e-05,
"loss": 0.5023,
"step": 20220
},
{
"epoch": 0.6429683280917438,
"grad_norm": 0.7829142808914185,
"learning_rate": 5.9921228292037026e-05,
"loss": 0.507,
"step": 20240
},
{
"epoch": 0.6436036722894628,
"grad_norm": 1.1816707849502563,
"learning_rate": 5.973278665913985e-05,
"loss": 0.4926,
"step": 20260
},
{
"epoch": 0.644239016487182,
"grad_norm": 0.881648063659668,
"learning_rate": 5.9544515533848614e-05,
"loss": 0.4885,
"step": 20280
},
{
"epoch": 0.644874360684901,
"grad_norm": 0.9568135738372803,
"learning_rate": 5.9356415713379145e-05,
"loss": 0.515,
"step": 20300
},
{
"epoch": 0.6455097048826202,
"grad_norm": 0.9377472400665283,
"learning_rate": 5.9168487994221834e-05,
"loss": 0.4886,
"step": 20320
},
{
"epoch": 0.6461450490803393,
"grad_norm": 0.9032811522483826,
"learning_rate": 5.898073317213837e-05,
"loss": 0.5064,
"step": 20340
},
{
"epoch": 0.6467803932780584,
"grad_norm": 0.9788734316825867,
"learning_rate": 5.879315204215836e-05,
"loss": 0.4698,
"step": 20360
},
{
"epoch": 0.6474157374757775,
"grad_norm": 1.0353432893753052,
"learning_rate": 5.860574539857584e-05,
"loss": 0.5227,
"step": 20380
},
{
"epoch": 0.6480510816734966,
"grad_norm": 0.8998845815658569,
"learning_rate": 5.84185140349461e-05,
"loss": 0.5132,
"step": 20400
},
{
"epoch": 0.6486864258712157,
"grad_norm": 0.8317026495933533,
"learning_rate": 5.82314587440821e-05,
"loss": 0.468,
"step": 20420
},
{
"epoch": 0.6493217700689349,
"grad_norm": 0.7740748524665833,
"learning_rate": 5.80445803180514e-05,
"loss": 0.5119,
"step": 20440
},
{
"epoch": 0.6499571142666539,
"grad_norm": 1.0922515392303467,
"learning_rate": 5.78578795481725e-05,
"loss": 0.5284,
"step": 20460
},
{
"epoch": 0.6505924584643731,
"grad_norm": 0.8265649676322937,
"learning_rate": 5.76713572250117e-05,
"loss": 0.5095,
"step": 20480
},
{
"epoch": 0.6512278026620922,
"grad_norm": 1.0644861459732056,
"learning_rate": 5.748501413837963e-05,
"loss": 0.5028,
"step": 20500
},
{
"epoch": 0.6518631468598113,
"grad_norm": 0.9139828681945801,
"learning_rate": 5.729885107732808e-05,
"loss": 0.4814,
"step": 20520
},
{
"epoch": 0.6524984910575304,
"grad_norm": 0.7917624115943909,
"learning_rate": 5.7112868830146416e-05,
"loss": 0.4772,
"step": 20540
},
{
"epoch": 0.6531338352552495,
"grad_norm": 0.7677121162414551,
"learning_rate": 5.692706818435836e-05,
"loss": 0.519,
"step": 20560
},
{
"epoch": 0.6537691794529686,
"grad_norm": 0.8412395715713501,
"learning_rate": 5.674144992671882e-05,
"loss": 0.501,
"step": 20580
},
{
"epoch": 0.6544045236506878,
"grad_norm": 1.014061689376831,
"learning_rate": 5.655601484321022e-05,
"loss": 0.5122,
"step": 20600
},
{
"epoch": 0.6550398678484068,
"grad_norm": 1.0746990442276,
"learning_rate": 5.6370763719039375e-05,
"loss": 0.4969,
"step": 20620
},
{
"epoch": 0.655675212046126,
"grad_norm": 0.9021841883659363,
"learning_rate": 5.6185697338634304e-05,
"loss": 0.4771,
"step": 20640
},
{
"epoch": 0.6563105562438452,
"grad_norm": 0.8193987607955933,
"learning_rate": 5.600081648564056e-05,
"loss": 0.5143,
"step": 20660
},
{
"epoch": 0.6569459004415642,
"grad_norm": 1.152421474456787,
"learning_rate": 5.581612194291814e-05,
"loss": 0.4873,
"step": 20680
},
{
"epoch": 0.6575812446392834,
"grad_norm": 0.8709347248077393,
"learning_rate": 5.5631614492538217e-05,
"loss": 0.5199,
"step": 20700
},
{
"epoch": 0.6582165888370024,
"grad_norm": 0.827723503112793,
"learning_rate": 5.544729491577967e-05,
"loss": 0.4917,
"step": 20720
},
{
"epoch": 0.6588519330347216,
"grad_norm": 1.5408345460891724,
"learning_rate": 5.526316399312579e-05,
"loss": 0.5562,
"step": 20740
},
{
"epoch": 0.6594872772324407,
"grad_norm": 0.731490433216095,
"learning_rate": 5.507922250426118e-05,
"loss": 0.4927,
"step": 20760
},
{
"epoch": 0.6601226214301598,
"grad_norm": 0.950702428817749,
"learning_rate": 5.4895471228068185e-05,
"loss": 0.5115,
"step": 20780
},
{
"epoch": 0.6607579656278789,
"grad_norm": 0.8342424631118774,
"learning_rate": 5.471191094262369e-05,
"loss": 0.4856,
"step": 20800
},
{
"epoch": 0.661393309825598,
"grad_norm": 0.9297844767570496,
"learning_rate": 5.4528542425196004e-05,
"loss": 0.4896,
"step": 20820
},
{
"epoch": 0.6620286540233171,
"grad_norm": 0.7558259963989258,
"learning_rate": 5.434536645224126e-05,
"loss": 0.4895,
"step": 20840
},
{
"epoch": 0.6626639982210363,
"grad_norm": 1.2116395235061646,
"learning_rate": 5.416238379940035e-05,
"loss": 0.507,
"step": 20860
},
{
"epoch": 0.6632993424187553,
"grad_norm": 0.913467526435852,
"learning_rate": 5.39795952414955e-05,
"loss": 0.5137,
"step": 20880
},
{
"epoch": 0.6639346866164745,
"grad_norm": 0.868238627910614,
"learning_rate": 5.3797001552527184e-05,
"loss": 0.5185,
"step": 20900
},
{
"epoch": 0.6645700308141936,
"grad_norm": 1.0668286085128784,
"learning_rate": 5.361460350567062e-05,
"loss": 0.5158,
"step": 20920
},
{
"epoch": 0.6652053750119127,
"grad_norm": 0.795097291469574,
"learning_rate": 5.3432401873272655e-05,
"loss": 0.4985,
"step": 20940
},
{
"epoch": 0.6658407192096318,
"grad_norm": 0.6949301958084106,
"learning_rate": 5.325039742684839e-05,
"loss": 0.4722,
"step": 20960
},
{
"epoch": 0.6664760634073509,
"grad_norm": 0.7859952449798584,
"learning_rate": 5.3068590937077945e-05,
"loss": 0.4933,
"step": 20980
},
{
"epoch": 0.66711140760507,
"grad_norm": 0.8529000282287598,
"learning_rate": 5.288698317380334e-05,
"loss": 0.5098,
"step": 21000
},
{
"epoch": 0.66711140760507,
"eval_loss": 0.45643100142478943,
"eval_runtime": 44.6378,
"eval_samples_per_second": 60.554,
"eval_steps_per_second": 30.288,
"step": 21000
},
{
"epoch": 0.6677467518027892,
"grad_norm": 0.9853639602661133,
"learning_rate": 5.270557490602499e-05,
"loss": 0.4715,
"step": 21020
},
{
"epoch": 0.6683820960005082,
"grad_norm": 0.8387131690979004,
"learning_rate": 5.2524366901898566e-05,
"loss": 0.5128,
"step": 21040
},
{
"epoch": 0.6690174401982274,
"grad_norm": 0.8610044717788696,
"learning_rate": 5.234335992873176e-05,
"loss": 0.5424,
"step": 21060
},
{
"epoch": 0.6696527843959466,
"grad_norm": 0.8878015279769897,
"learning_rate": 5.216255475298109e-05,
"loss": 0.4734,
"step": 21080
},
{
"epoch": 0.6702881285936656,
"grad_norm": 1.0038951635360718,
"learning_rate": 5.198195214024848e-05,
"loss": 0.4879,
"step": 21100
},
{
"epoch": 0.6709234727913848,
"grad_norm": 0.9256641864776611,
"learning_rate": 5.1801552855278126e-05,
"loss": 0.527,
"step": 21120
},
{
"epoch": 0.6715588169891038,
"grad_norm": 0.7668296098709106,
"learning_rate": 5.162135766195337e-05,
"loss": 0.5161,
"step": 21140
},
{
"epoch": 0.672194161186823,
"grad_norm": 0.7756738066673279,
"learning_rate": 5.144136732329323e-05,
"loss": 0.5265,
"step": 21160
},
{
"epoch": 0.6728295053845421,
"grad_norm": 0.9279829859733582,
"learning_rate": 5.1261582601449285e-05,
"loss": 0.4814,
"step": 21180
},
{
"epoch": 0.6734648495822612,
"grad_norm": 1.1274375915527344,
"learning_rate": 5.108200425770255e-05,
"loss": 0.5061,
"step": 21200
},
{
"epoch": 0.6741001937799803,
"grad_norm": 1.082535982131958,
"learning_rate": 5.090263305246006e-05,
"loss": 0.5081,
"step": 21220
},
{
"epoch": 0.6747355379776995,
"grad_norm": 1.0355536937713623,
"learning_rate": 5.0723469745251725e-05,
"loss": 0.5044,
"step": 21240
},
{
"epoch": 0.6753708821754185,
"grad_norm": 0.9309506416320801,
"learning_rate": 5.054451509472728e-05,
"loss": 0.5241,
"step": 21260
},
{
"epoch": 0.6760062263731377,
"grad_norm": 0.818247377872467,
"learning_rate": 5.0365769858652735e-05,
"loss": 0.5034,
"step": 21280
},
{
"epoch": 0.6766415705708567,
"grad_norm": 0.8921930193901062,
"learning_rate": 5.0187234793907447e-05,
"loss": 0.5089,
"step": 21300
},
{
"epoch": 0.6772769147685759,
"grad_norm": 0.9915839433670044,
"learning_rate": 5.000891065648087e-05,
"loss": 0.5049,
"step": 21320
},
{
"epoch": 0.677912258966295,
"grad_norm": 0.8783996105194092,
"learning_rate": 4.983079820146922e-05,
"loss": 0.5314,
"step": 21340
},
{
"epoch": 0.6785476031640141,
"grad_norm": 0.8735405802726746,
"learning_rate": 4.96528981830724e-05,
"loss": 0.5036,
"step": 21360
},
{
"epoch": 0.6791829473617332,
"grad_norm": 0.9674988389015198,
"learning_rate": 4.947521135459072e-05,
"loss": 0.5269,
"step": 21380
},
{
"epoch": 0.6798182915594523,
"grad_norm": 0.9271227717399597,
"learning_rate": 4.9297738468421896e-05,
"loss": 0.5061,
"step": 21400
},
{
"epoch": 0.6804536357571714,
"grad_norm": 0.7828012704849243,
"learning_rate": 4.912048027605759e-05,
"loss": 0.4978,
"step": 21420
},
{
"epoch": 0.6810889799548906,
"grad_norm": 1.3417547941207886,
"learning_rate": 4.8943437528080385e-05,
"loss": 0.5326,
"step": 21440
},
{
"epoch": 0.6817243241526096,
"grad_norm": 0.8963372707366943,
"learning_rate": 4.876661097416066e-05,
"loss": 0.4989,
"step": 21460
},
{
"epoch": 0.6823596683503288,
"grad_norm": 0.893553614616394,
"learning_rate": 4.859000136305329e-05,
"loss": 0.4859,
"step": 21480
},
{
"epoch": 0.682995012548048,
"grad_norm": 1.2325243949890137,
"learning_rate": 4.8413609442594445e-05,
"loss": 0.5037,
"step": 21500
},
{
"epoch": 0.683630356745767,
"grad_norm": 0.8049502372741699,
"learning_rate": 4.8237435959698706e-05,
"loss": 0.509,
"step": 21520
},
{
"epoch": 0.6842657009434862,
"grad_norm": 1.2289927005767822,
"learning_rate": 4.8061481660355534e-05,
"loss": 0.5128,
"step": 21540
},
{
"epoch": 0.6849010451412052,
"grad_norm": 0.8123481869697571,
"learning_rate": 4.7885747289626284e-05,
"loss": 0.5031,
"step": 21560
},
{
"epoch": 0.6855363893389244,
"grad_norm": 0.8852875232696533,
"learning_rate": 4.771023359164116e-05,
"loss": 0.4875,
"step": 21580
},
{
"epoch": 0.6861717335366435,
"grad_norm": 0.8462742567062378,
"learning_rate": 4.753494130959586e-05,
"loss": 0.4787,
"step": 21600
},
{
"epoch": 0.6868070777343626,
"grad_norm": 0.99876868724823,
"learning_rate": 4.7359871185748485e-05,
"loss": 0.5116,
"step": 21620
},
{
"epoch": 0.6874424219320817,
"grad_norm": 0.9393181204795837,
"learning_rate": 4.718502396141656e-05,
"loss": 0.4878,
"step": 21640
},
{
"epoch": 0.6880777661298009,
"grad_norm": 0.8426542282104492,
"learning_rate": 4.701040037697364e-05,
"loss": 0.4897,
"step": 21660
},
{
"epoch": 0.6887131103275199,
"grad_norm": 0.938210666179657,
"learning_rate": 4.683600117184631e-05,
"loss": 0.492,
"step": 21680
},
{
"epoch": 0.6893484545252391,
"grad_norm": 0.8325148820877075,
"learning_rate": 4.666182708451114e-05,
"loss": 0.4842,
"step": 21700
},
{
"epoch": 0.6899837987229581,
"grad_norm": 0.8813055753707886,
"learning_rate": 4.648787885249136e-05,
"loss": 0.491,
"step": 21720
},
{
"epoch": 0.6906191429206773,
"grad_norm": 1.0838825702667236,
"learning_rate": 4.631415721235389e-05,
"loss": 0.4732,
"step": 21740
},
{
"epoch": 0.6912544871183964,
"grad_norm": 0.7203667163848877,
"learning_rate": 4.614066289970609e-05,
"loss": 0.4692,
"step": 21760
},
{
"epoch": 0.6918898313161155,
"grad_norm": 1.181038737297058,
"learning_rate": 4.596739664919287e-05,
"loss": 0.5177,
"step": 21780
},
{
"epoch": 0.6925251755138346,
"grad_norm": 0.9107904434204102,
"learning_rate": 4.579435919449332e-05,
"loss": 0.5186,
"step": 21800
},
{
"epoch": 0.6931605197115537,
"grad_norm": 0.8281117081642151,
"learning_rate": 4.5621551268317686e-05,
"loss": 0.4848,
"step": 21820
},
{
"epoch": 0.6937958639092728,
"grad_norm": 0.9180241227149963,
"learning_rate": 4.545759700573378e-05,
"loss": 0.4979,
"step": 21840
},
{
"epoch": 0.694431208106992,
"grad_norm": 0.912675678730011,
"learning_rate": 4.5285238763954426e-05,
"loss": 0.5124,
"step": 21860
},
{
"epoch": 0.695066552304711,
"grad_norm": 0.8163600564002991,
"learning_rate": 4.5113112206520056e-05,
"loss": 0.5205,
"step": 21880
},
{
"epoch": 0.6957018965024302,
"grad_norm": 0.7308365702629089,
"learning_rate": 4.494121806228392e-05,
"loss": 0.5208,
"step": 21900
},
{
"epoch": 0.6963372407001494,
"grad_norm": 0.7426006197929382,
"learning_rate": 4.476955705911504e-05,
"loss": 0.48,
"step": 21920
},
{
"epoch": 0.6969725848978684,
"grad_norm": 0.9886866807937622,
"learning_rate": 4.459812992389526e-05,
"loss": 0.5483,
"step": 21940
},
{
"epoch": 0.6976079290955876,
"grad_norm": 0.9653937816619873,
"learning_rate": 4.44269373825162e-05,
"loss": 0.4613,
"step": 21960
},
{
"epoch": 0.6982432732933066,
"grad_norm": 0.8184491991996765,
"learning_rate": 4.425598015987602e-05,
"loss": 0.5212,
"step": 21980
},
{
"epoch": 0.6988786174910258,
"grad_norm": 0.9365077614784241,
"learning_rate": 4.408525897987645e-05,
"loss": 0.4868,
"step": 22000
},
{
"epoch": 0.6988786174910258,
"eval_loss": 0.45187339186668396,
"eval_runtime": 44.7631,
"eval_samples_per_second": 60.385,
"eval_steps_per_second": 30.203,
"step": 22000
},
{
"epoch": 0.6995139616887449,
"grad_norm": 0.9188706874847412,
"learning_rate": 4.391477456541983e-05,
"loss": 0.4991,
"step": 22020
},
{
"epoch": 0.700149305886464,
"grad_norm": 0.8599129319190979,
"learning_rate": 4.374452763840584e-05,
"loss": 0.5184,
"step": 22040
},
{
"epoch": 0.7007846500841831,
"grad_norm": 0.8643587827682495,
"learning_rate": 4.357451891972854e-05,
"loss": 0.4966,
"step": 22060
},
{
"epoch": 0.7014199942819023,
"grad_norm": 0.9123074412345886,
"learning_rate": 4.340474912927332e-05,
"loss": 0.5068,
"step": 22080
},
{
"epoch": 0.7020553384796213,
"grad_norm": 0.8422294855117798,
"learning_rate": 4.323521898591394e-05,
"loss": 0.4753,
"step": 22100
},
{
"epoch": 0.7026906826773405,
"grad_norm": 0.8830937743186951,
"learning_rate": 4.306592920750931e-05,
"loss": 0.4837,
"step": 22120
},
{
"epoch": 0.7033260268750595,
"grad_norm": 0.8540763854980469,
"learning_rate": 4.289688051090054e-05,
"loss": 0.4733,
"step": 22140
},
{
"epoch": 0.7039613710727787,
"grad_norm": 0.8622573614120483,
"learning_rate": 4.272807361190797e-05,
"loss": 0.5003,
"step": 22160
},
{
"epoch": 0.7045967152704978,
"grad_norm": 0.9827342629432678,
"learning_rate": 4.2559509225328e-05,
"loss": 0.5333,
"step": 22180
},
{
"epoch": 0.7052320594682169,
"grad_norm": 0.8439646363258362,
"learning_rate": 4.239118806493013e-05,
"loss": 0.4778,
"step": 22200
},
{
"epoch": 0.705867403665936,
"grad_norm": 0.9348493814468384,
"learning_rate": 4.222311084345405e-05,
"loss": 0.4806,
"step": 22220
},
{
"epoch": 0.7065027478636552,
"grad_norm": 1.0671905279159546,
"learning_rate": 4.2055278272606404e-05,
"loss": 0.4978,
"step": 22240
},
{
"epoch": 0.7071380920613742,
"grad_norm": 1.2363934516906738,
"learning_rate": 4.188769106305787e-05,
"loss": 0.5089,
"step": 22260
},
{
"epoch": 0.7077734362590934,
"grad_norm": 0.9339464902877808,
"learning_rate": 4.1720349924440295e-05,
"loss": 0.4796,
"step": 22280
},
{
"epoch": 0.7084087804568124,
"grad_norm": 0.873092770576477,
"learning_rate": 4.155325556534345e-05,
"loss": 0.4931,
"step": 22300
},
{
"epoch": 0.7090441246545316,
"grad_norm": 0.7866622805595398,
"learning_rate": 4.138640869331215e-05,
"loss": 0.501,
"step": 22320
},
{
"epoch": 0.7096794688522507,
"grad_norm": 1.0133357048034668,
"learning_rate": 4.121981001484334e-05,
"loss": 0.481,
"step": 22340
},
{
"epoch": 0.7103148130499698,
"grad_norm": 0.9386391043663025,
"learning_rate": 4.105346023538292e-05,
"loss": 0.5303,
"step": 22360
},
{
"epoch": 0.710950157247689,
"grad_norm": 0.7917353510856628,
"learning_rate": 4.088736005932289e-05,
"loss": 0.4993,
"step": 22380
},
{
"epoch": 0.711585501445408,
"grad_norm": 0.9757121801376343,
"learning_rate": 4.0721510189998266e-05,
"loss": 0.5102,
"step": 22400
},
{
"epoch": 0.7122208456431272,
"grad_norm": 1.2196959257125854,
"learning_rate": 4.055591132968432e-05,
"loss": 0.5045,
"step": 22420
},
{
"epoch": 0.7128561898408463,
"grad_norm": 1.0833863019943237,
"learning_rate": 4.039056417959328e-05,
"loss": 0.5136,
"step": 22440
},
{
"epoch": 0.7134915340385654,
"grad_norm": 0.7548487186431885,
"learning_rate": 4.02254694398716e-05,
"loss": 0.4864,
"step": 22460
},
{
"epoch": 0.7141268782362845,
"grad_norm": 1.0435632467269897,
"learning_rate": 4.006062780959697e-05,
"loss": 0.4866,
"step": 22480
},
{
"epoch": 0.7147622224340037,
"grad_norm": 0.7469571828842163,
"learning_rate": 3.9896039986775256e-05,
"loss": 0.4825,
"step": 22500
},
{
"epoch": 0.7153975666317227,
"grad_norm": 0.8732174634933472,
"learning_rate": 3.9731706668337585e-05,
"loss": 0.4905,
"step": 22520
},
{
"epoch": 0.7160329108294419,
"grad_norm": 0.8761599063873291,
"learning_rate": 3.956762855013749e-05,
"loss": 0.4831,
"step": 22540
},
{
"epoch": 0.7166682550271609,
"grad_norm": 0.9746137261390686,
"learning_rate": 3.940380632694781e-05,
"loss": 0.5111,
"step": 22560
},
{
"epoch": 0.7173035992248801,
"grad_norm": 0.9219092726707458,
"learning_rate": 3.924024069245782e-05,
"loss": 0.4908,
"step": 22580
},
{
"epoch": 0.7179389434225992,
"grad_norm": 1.0305086374282837,
"learning_rate": 3.907693233927038e-05,
"loss": 0.5215,
"step": 22600
},
{
"epoch": 0.7185742876203183,
"grad_norm": 0.7786363363265991,
"learning_rate": 3.891388195889882e-05,
"loss": 0.4792,
"step": 22620
},
{
"epoch": 0.7192096318180374,
"grad_norm": 0.8930706977844238,
"learning_rate": 3.875109024176413e-05,
"loss": 0.4908,
"step": 22640
},
{
"epoch": 0.7198449760157566,
"grad_norm": 1.0214048624038696,
"learning_rate": 3.858855787719209e-05,
"loss": 0.5102,
"step": 22660
},
{
"epoch": 0.7204803202134756,
"grad_norm": 0.9279896020889282,
"learning_rate": 3.842628555341018e-05,
"loss": 0.4772,
"step": 22680
},
{
"epoch": 0.7211156644111948,
"grad_norm": 1.6357091665267944,
"learning_rate": 3.826427395754482e-05,
"loss": 0.5041,
"step": 22700
},
{
"epoch": 0.7217510086089138,
"grad_norm": 0.8421345949172974,
"learning_rate": 3.8102523775618325e-05,
"loss": 0.5082,
"step": 22720
},
{
"epoch": 0.722386352806633,
"grad_norm": 0.9193027019500732,
"learning_rate": 3.794103569254624e-05,
"loss": 0.485,
"step": 22740
},
{
"epoch": 0.7230216970043521,
"grad_norm": 0.8045080304145813,
"learning_rate": 3.777981039213411e-05,
"loss": 0.5182,
"step": 22760
},
{
"epoch": 0.7236570412020712,
"grad_norm": 0.8535903692245483,
"learning_rate": 3.7618848557074804e-05,
"loss": 0.4796,
"step": 22780
},
{
"epoch": 0.7242923853997904,
"grad_norm": 0.8225564360618591,
"learning_rate": 3.745815086894565e-05,
"loss": 0.4812,
"step": 22800
},
{
"epoch": 0.7249277295975094,
"grad_norm": 0.8030312657356262,
"learning_rate": 3.729771800820539e-05,
"loss": 0.481,
"step": 22820
},
{
"epoch": 0.7255630737952286,
"grad_norm": 0.992080569267273,
"learning_rate": 3.713755065419133e-05,
"loss": 0.4768,
"step": 22840
},
{
"epoch": 0.7261984179929477,
"grad_norm": 0.9184660911560059,
"learning_rate": 3.698563821122103e-05,
"loss": 0.5044,
"step": 22860
},
{
"epoch": 0.7268337621906668,
"grad_norm": 0.8250758647918701,
"learning_rate": 3.6825990545007096e-05,
"loss": 0.5095,
"step": 22880
},
{
"epoch": 0.7274691063883859,
"grad_norm": 1.0519983768463135,
"learning_rate": 3.666661038300353e-05,
"loss": 0.4944,
"step": 22900
},
{
"epoch": 0.7281044505861051,
"grad_norm": 0.789730966091156,
"learning_rate": 3.650749840009022e-05,
"loss": 0.4574,
"step": 22920
},
{
"epoch": 0.7287397947838241,
"grad_norm": 0.8896093368530273,
"learning_rate": 3.6356591030872534e-05,
"loss": 0.5,
"step": 22940
},
{
"epoch": 0.7293751389815433,
"grad_norm": 0.7810101509094238,
"learning_rate": 3.6198003934005195e-05,
"loss": 0.5053,
"step": 22960
},
{
"epoch": 0.7300104831792623,
"grad_norm": 0.883144199848175,
"learning_rate": 3.603968700049657e-05,
"loss": 0.514,
"step": 22980
},
{
"epoch": 0.7306458273769815,
"grad_norm": 0.7069016695022583,
"learning_rate": 3.588164090072441e-05,
"loss": 0.522,
"step": 23000
},
{
"epoch": 0.7306458273769815,
"eval_loss": 0.4499790668487549,
"eval_runtime": 45.0673,
"eval_samples_per_second": 59.977,
"eval_steps_per_second": 30.0,
"step": 23000
},
{
"epoch": 0.7312811715747006,
"grad_norm": 1.0385907888412476,
"learning_rate": 3.5723866303919554e-05,
"loss": 0.489,
"step": 23020
},
{
"epoch": 0.7319165157724197,
"grad_norm": 0.8796695470809937,
"learning_rate": 3.556636387816317e-05,
"loss": 0.4963,
"step": 23040
},
{
"epoch": 0.7325518599701388,
"grad_norm": 0.9427993893623352,
"learning_rate": 3.540913429038407e-05,
"loss": 0.4601,
"step": 23060
},
{
"epoch": 0.733187204167858,
"grad_norm": 0.8525741100311279,
"learning_rate": 3.525217820635564e-05,
"loss": 0.5034,
"step": 23080
},
{
"epoch": 0.733822548365577,
"grad_norm": 0.8755898475646973,
"learning_rate": 3.5095496290693155e-05,
"loss": 0.509,
"step": 23100
},
{
"epoch": 0.7344578925632962,
"grad_norm": 1.0328361988067627,
"learning_rate": 3.4939089206851025e-05,
"loss": 0.4994,
"step": 23120
},
{
"epoch": 0.7350932367610152,
"grad_norm": 1.130226969718933,
"learning_rate": 3.478295761711986e-05,
"loss": 0.4848,
"step": 23140
},
{
"epoch": 0.7357285809587344,
"grad_norm": 0.733567476272583,
"learning_rate": 3.4627102182623696e-05,
"loss": 0.5123,
"step": 23160
},
{
"epoch": 0.7363639251564535,
"grad_norm": 1.1062750816345215,
"learning_rate": 3.447152356331721e-05,
"loss": 0.4767,
"step": 23180
},
{
"epoch": 0.7369992693541726,
"grad_norm": 0.9558404684066772,
"learning_rate": 3.431622241798305e-05,
"loss": 0.4832,
"step": 23200
},
{
"epoch": 0.7376346135518917,
"grad_norm": 0.8974496722221375,
"learning_rate": 3.416119940422877e-05,
"loss": 0.4818,
"step": 23220
},
{
"epoch": 0.7382699577496109,
"grad_norm": 1.2721449136734009,
"learning_rate": 3.400645517848427e-05,
"loss": 0.5102,
"step": 23240
},
{
"epoch": 0.73890530194733,
"grad_norm": 1.0408607721328735,
"learning_rate": 3.385199039599902e-05,
"loss": 0.4784,
"step": 23260
},
{
"epoch": 0.7395406461450491,
"grad_norm": 0.9826887845993042,
"learning_rate": 3.369780571083909e-05,
"loss": 0.5039,
"step": 23280
},
{
"epoch": 0.7401759903427682,
"grad_norm": 0.8110315799713135,
"learning_rate": 3.354390177588454e-05,
"loss": 0.5034,
"step": 23300
},
{
"epoch": 0.7408113345404873,
"grad_norm": 0.8513306975364685,
"learning_rate": 3.339027924282673e-05,
"loss": 0.509,
"step": 23320
},
{
"epoch": 0.7414466787382065,
"grad_norm": 0.8255580067634583,
"learning_rate": 3.323693876216529e-05,
"loss": 0.4678,
"step": 23340
},
{
"epoch": 0.7420820229359255,
"grad_norm": 1.1336640119552612,
"learning_rate": 3.30838809832056e-05,
"loss": 0.4848,
"step": 23360
},
{
"epoch": 0.7427173671336447,
"grad_norm": 0.8720375895500183,
"learning_rate": 3.2931106554056005e-05,
"loss": 0.4929,
"step": 23380
},
{
"epoch": 0.7433527113313637,
"grad_norm": 1.0169090032577515,
"learning_rate": 3.277861612162498e-05,
"loss": 0.5066,
"step": 23400
},
{
"epoch": 0.7439880555290829,
"grad_norm": 1.2800534963607788,
"learning_rate": 3.262641033161843e-05,
"loss": 0.4964,
"step": 23420
},
{
"epoch": 0.744623399726802,
"grad_norm": 0.819925844669342,
"learning_rate": 3.2474489828537046e-05,
"loss": 0.509,
"step": 23440
},
{
"epoch": 0.7452587439245211,
"grad_norm": 0.8024299144744873,
"learning_rate": 3.232285525567343e-05,
"loss": 0.4922,
"step": 23460
},
{
"epoch": 0.7458940881222402,
"grad_norm": 1.1049789190292358,
"learning_rate": 3.217150725510946e-05,
"loss": 0.4907,
"step": 23480
},
{
"epoch": 0.7465294323199594,
"grad_norm": 1.0818272829055786,
"learning_rate": 3.2020446467713516e-05,
"loss": 0.4806,
"step": 23500
},
{
"epoch": 0.7471647765176784,
"grad_norm": 0.6681995391845703,
"learning_rate": 3.18696735331379e-05,
"loss": 0.4504,
"step": 23520
},
{
"epoch": 0.7478001207153976,
"grad_norm": 0.8827902674674988,
"learning_rate": 3.171918908981595e-05,
"loss": 0.5081,
"step": 23540
},
{
"epoch": 0.7484354649131166,
"grad_norm": 1.0249037742614746,
"learning_rate": 3.156899377495938e-05,
"loss": 0.5297,
"step": 23560
},
{
"epoch": 0.7490708091108358,
"grad_norm": 1.0797147750854492,
"learning_rate": 3.141908822455574e-05,
"loss": 0.4701,
"step": 23580
},
{
"epoch": 0.749706153308555,
"grad_norm": 0.724281907081604,
"learning_rate": 3.126947307336551e-05,
"loss": 0.4608,
"step": 23600
},
{
"epoch": 0.750341497506274,
"grad_norm": 0.7410632967948914,
"learning_rate": 3.1120148954919485e-05,
"loss": 0.4747,
"step": 23620
},
{
"epoch": 0.7509768417039931,
"grad_norm": 1.0309559106826782,
"learning_rate": 3.09711165015162e-05,
"loss": 0.534,
"step": 23640
},
{
"epoch": 0.7516121859017123,
"grad_norm": 0.9060602784156799,
"learning_rate": 3.0822376344219105e-05,
"loss": 0.4709,
"step": 23660
},
{
"epoch": 0.7522475300994313,
"grad_norm": 0.9018211364746094,
"learning_rate": 3.067392911285395e-05,
"loss": 0.5084,
"step": 23680
},
{
"epoch": 0.7528828742971505,
"grad_norm": 1.1375420093536377,
"learning_rate": 3.0525775436006107e-05,
"loss": 0.5023,
"step": 23700
},
{
"epoch": 0.7535182184948696,
"grad_norm": 0.8034165501594543,
"learning_rate": 3.0377915941017955e-05,
"loss": 0.4947,
"step": 23720
},
{
"epoch": 0.7541535626925887,
"grad_norm": 1.0958040952682495,
"learning_rate": 3.0230351253986143e-05,
"loss": 0.5009,
"step": 23740
},
{
"epoch": 0.7547889068903079,
"grad_norm": 0.8740959763526917,
"learning_rate": 3.0083081999759067e-05,
"loss": 0.4942,
"step": 23760
},
{
"epoch": 0.7554242510880269,
"grad_norm": 0.8798695206642151,
"learning_rate": 2.993610880193406e-05,
"loss": 0.4676,
"step": 23780
},
{
"epoch": 0.7560595952857461,
"grad_norm": 0.9538172483444214,
"learning_rate": 2.9789432282854822e-05,
"loss": 0.4441,
"step": 23800
},
{
"epoch": 0.7566949394834651,
"grad_norm": 0.9560829401016235,
"learning_rate": 2.9643053063608917e-05,
"loss": 0.4995,
"step": 23820
},
{
"epoch": 0.7573302836811843,
"grad_norm": 1.0306763648986816,
"learning_rate": 2.9496971764024884e-05,
"loss": 0.5042,
"step": 23840
},
{
"epoch": 0.7579656278789034,
"grad_norm": 0.9823128581047058,
"learning_rate": 2.9351189002669788e-05,
"loss": 0.5274,
"step": 23860
},
{
"epoch": 0.7586009720766225,
"grad_norm": 0.8448672890663147,
"learning_rate": 2.920570539684665e-05,
"loss": 0.4713,
"step": 23880
},
{
"epoch": 0.7592363162743416,
"grad_norm": 0.8830504417419434,
"learning_rate": 2.9060521562591624e-05,
"loss": 0.5069,
"step": 23900
},
{
"epoch": 0.7598716604720608,
"grad_norm": 0.9051734805107117,
"learning_rate": 2.891563811467154e-05,
"loss": 0.48,
"step": 23920
},
{
"epoch": 0.7605070046697798,
"grad_norm": 0.8309674859046936,
"learning_rate": 2.877105566658136e-05,
"loss": 0.5141,
"step": 23940
},
{
"epoch": 0.761142348867499,
"grad_norm": 0.8684896230697632,
"learning_rate": 2.863398169962057e-05,
"loss": 0.4518,
"step": 23960
},
{
"epoch": 0.761777693065218,
"grad_norm": 0.959536075592041,
"learning_rate": 2.8489987960934184e-05,
"loss": 0.483,
"step": 23980
},
{
"epoch": 0.7624130372629372,
"grad_norm": 1.3519070148468018,
"learning_rate": 2.8353474370325594e-05,
"loss": 0.5062,
"step": 24000
},
{
"epoch": 0.7624130372629372,
"eval_loss": 0.4479082524776459,
"eval_runtime": 44.6533,
"eval_samples_per_second": 60.533,
"eval_steps_per_second": 30.278,
"step": 24000
},
{
"epoch": 0.7630483814606563,
"grad_norm": 0.8832095861434937,
"learning_rate": 2.8210071659529526e-05,
"loss": 0.5204,
"step": 24020
},
{
"epoch": 0.7636837256583754,
"grad_norm": 0.793205738067627,
"learning_rate": 2.8066972936216017e-05,
"loss": 0.5037,
"step": 24040
},
{
"epoch": 0.7643190698560945,
"grad_norm": 0.8483644127845764,
"learning_rate": 2.79241788063227e-05,
"loss": 0.4812,
"step": 24060
},
{
"epoch": 0.7649544140538137,
"grad_norm": 1.50220787525177,
"learning_rate": 2.7781689874497406e-05,
"loss": 0.501,
"step": 24080
},
{
"epoch": 0.7655897582515327,
"grad_norm": 0.8091638684272766,
"learning_rate": 2.7639506744095766e-05,
"loss": 0.4932,
"step": 24100
},
{
"epoch": 0.7662251024492519,
"grad_norm": 0.9171321392059326,
"learning_rate": 2.74976300171784e-05,
"loss": 0.5,
"step": 24120
},
{
"epoch": 0.766860446646971,
"grad_norm": 0.9392116069793701,
"learning_rate": 2.7356060294508502e-05,
"loss": 0.5075,
"step": 24140
},
{
"epoch": 0.7674957908446901,
"grad_norm": 0.9384047389030457,
"learning_rate": 2.7214798175549395e-05,
"loss": 0.4893,
"step": 24160
},
{
"epoch": 0.7681311350424093,
"grad_norm": 0.7760775685310364,
"learning_rate": 2.707384425846178e-05,
"loss": 0.5267,
"step": 24180
},
{
"epoch": 0.7687664792401283,
"grad_norm": 0.8666489720344543,
"learning_rate": 2.6933199140101285e-05,
"loss": 0.5201,
"step": 24200
},
{
"epoch": 0.7694018234378475,
"grad_norm": 0.9711599946022034,
"learning_rate": 2.679286341601609e-05,
"loss": 0.4923,
"step": 24220
},
{
"epoch": 0.7700371676355666,
"grad_norm": 0.9399335980415344,
"learning_rate": 2.6652837680444153e-05,
"loss": 0.5281,
"step": 24240
},
{
"epoch": 0.7706725118332857,
"grad_norm": 0.8116670250892639,
"learning_rate": 2.651312252631083e-05,
"loss": 0.5111,
"step": 24260
},
{
"epoch": 0.7713078560310048,
"grad_norm": 0.873943030834198,
"learning_rate": 2.6373718545226445e-05,
"loss": 0.471,
"step": 24280
},
{
"epoch": 0.7719432002287239,
"grad_norm": 0.9560205340385437,
"learning_rate": 2.623462632748359e-05,
"loss": 0.5101,
"step": 24300
},
{
"epoch": 0.772578544426443,
"grad_norm": 1.011898159980774,
"learning_rate": 2.6095846462054763e-05,
"loss": 0.4906,
"step": 24320
},
{
"epoch": 0.7732138886241622,
"grad_norm": 1.0334892272949219,
"learning_rate": 2.595737953658982e-05,
"loss": 0.4905,
"step": 24340
},
{
"epoch": 0.7738492328218812,
"grad_norm": 0.6994766592979431,
"learning_rate": 2.581922613741352e-05,
"loss": 0.4794,
"step": 24360
},
{
"epoch": 0.7744845770196004,
"grad_norm": 0.9781257510185242,
"learning_rate": 2.5681386849523003e-05,
"loss": 0.4871,
"step": 24380
},
{
"epoch": 0.7751199212173194,
"grad_norm": 1.0443729162216187,
"learning_rate": 2.5543862256585393e-05,
"loss": 0.5133,
"step": 24400
},
{
"epoch": 0.7757552654150386,
"grad_norm": 0.8841618299484253,
"learning_rate": 2.5406652940935217e-05,
"loss": 0.4865,
"step": 24420
},
{
"epoch": 0.7763906096127577,
"grad_norm": 0.8439558148384094,
"learning_rate": 2.5269759483571954e-05,
"loss": 0.4908,
"step": 24440
},
{
"epoch": 0.7770259538104768,
"grad_norm": 0.9146759510040283,
"learning_rate": 2.5133182464157734e-05,
"loss": 0.4934,
"step": 24460
},
{
"epoch": 0.777661298008196,
"grad_norm": 0.7785593867301941,
"learning_rate": 2.499692246101466e-05,
"loss": 0.4857,
"step": 24480
},
{
"epoch": 0.7782966422059151,
"grad_norm": 0.9240188002586365,
"learning_rate": 2.4860980051122474e-05,
"loss": 0.4958,
"step": 24500
},
{
"epoch": 0.7789319864036341,
"grad_norm": 1.0593191385269165,
"learning_rate": 2.4725355810116103e-05,
"loss": 0.5077,
"step": 24520
},
{
"epoch": 0.7795673306013533,
"grad_norm": 0.8705240488052368,
"learning_rate": 2.4590050312283263e-05,
"loss": 0.4792,
"step": 24540
},
{
"epoch": 0.7802026747990723,
"grad_norm": 0.8610863089561462,
"learning_rate": 2.4455064130561944e-05,
"loss": 0.4949,
"step": 24560
},
{
"epoch": 0.7808380189967915,
"grad_norm": 1.152521014213562,
"learning_rate": 2.432039783653799e-05,
"loss": 0.5076,
"step": 24580
},
{
"epoch": 0.7814733631945107,
"grad_norm": 0.8608033657073975,
"learning_rate": 2.4186052000442806e-05,
"loss": 0.4759,
"step": 24600
},
{
"epoch": 0.7821087073922297,
"grad_norm": 1.1664726734161377,
"learning_rate": 2.4052027191150762e-05,
"loss": 0.4941,
"step": 24620
},
{
"epoch": 0.7827440515899489,
"grad_norm": 0.8805221915245056,
"learning_rate": 2.3918323976176883e-05,
"loss": 0.4797,
"step": 24640
},
{
"epoch": 0.783379395787668,
"grad_norm": 0.7699743509292603,
"learning_rate": 2.3784942921674512e-05,
"loss": 0.4903,
"step": 24660
},
{
"epoch": 0.7840147399853871,
"grad_norm": 0.9498074650764465,
"learning_rate": 2.365188459243274e-05,
"loss": 0.4679,
"step": 24680
},
{
"epoch": 0.7846500841831062,
"grad_norm": 0.815447986125946,
"learning_rate": 2.351914955187412e-05,
"loss": 0.5114,
"step": 24700
},
{
"epoch": 0.7852854283808253,
"grad_norm": 0.984866738319397,
"learning_rate": 2.3386738362052353e-05,
"loss": 0.4725,
"step": 24720
},
{
"epoch": 0.7859207725785444,
"grad_norm": 1.0802818536758423,
"learning_rate": 2.3254651583649735e-05,
"loss": 0.4684,
"step": 24740
},
{
"epoch": 0.7865561167762636,
"grad_norm": 0.8058573007583618,
"learning_rate": 2.3122889775974887e-05,
"loss": 0.4847,
"step": 24760
},
{
"epoch": 0.7871914609739826,
"grad_norm": 0.8836669921875,
"learning_rate": 2.2991453496960447e-05,
"loss": 0.4859,
"step": 24780
},
{
"epoch": 0.7878268051717018,
"grad_norm": 0.7214009165763855,
"learning_rate": 2.2860343303160535e-05,
"loss": 0.4816,
"step": 24800
},
{
"epoch": 0.7884621493694208,
"grad_norm": 0.8268193006515503,
"learning_rate": 2.2729559749748575e-05,
"loss": 0.4674,
"step": 24820
},
{
"epoch": 0.78909749356714,
"grad_norm": 0.7158612608909607,
"learning_rate": 2.2599103390514766e-05,
"loss": 0.465,
"step": 24840
},
{
"epoch": 0.7897328377648591,
"grad_norm": 0.8904339671134949,
"learning_rate": 2.246897477786396e-05,
"loss": 0.5024,
"step": 24860
},
{
"epoch": 0.7903681819625782,
"grad_norm": 0.8315703272819519,
"learning_rate": 2.2339174462813127e-05,
"loss": 0.4609,
"step": 24880
},
{
"epoch": 0.7910035261602973,
"grad_norm": 0.8962224721908569,
"learning_rate": 2.2209702994989045e-05,
"loss": 0.4906,
"step": 24900
},
{
"epoch": 0.7916388703580165,
"grad_norm": 0.9301977753639221,
"learning_rate": 2.208056092262616e-05,
"loss": 0.5216,
"step": 24920
},
{
"epoch": 0.7922742145557355,
"grad_norm": 0.8634437918663025,
"learning_rate": 2.1951748792563985e-05,
"loss": 0.5031,
"step": 24940
},
{
"epoch": 0.7929095587534547,
"grad_norm": 0.8985020518302917,
"learning_rate": 2.1823267150244964e-05,
"loss": 0.4709,
"step": 24960
},
{
"epoch": 0.7935449029511737,
"grad_norm": 1.1470792293548584,
"learning_rate": 2.16951165397122e-05,
"loss": 0.5224,
"step": 24980
},
{
"epoch": 0.7941802471488929,
"grad_norm": 0.919326663017273,
"learning_rate": 2.1567297503606987e-05,
"loss": 0.5004,
"step": 25000
},
{
"epoch": 0.7941802471488929,
"eval_loss": 0.44602036476135254,
"eval_runtime": 44.8391,
"eval_samples_per_second": 60.282,
"eval_steps_per_second": 30.152,
"step": 25000
},
{
"epoch": 0.7948155913466121,
"grad_norm": 1.1010879278182983,
"learning_rate": 2.1439810583166587e-05,
"loss": 0.5077,
"step": 25020
},
{
"epoch": 0.7954509355443311,
"grad_norm": 0.8573036789894104,
"learning_rate": 2.131900612258364e-05,
"loss": 0.4973,
"step": 25040
},
{
"epoch": 0.7960862797420503,
"grad_norm": 0.8931069374084473,
"learning_rate": 2.1198502345256165e-05,
"loss": 0.4972,
"step": 25060
},
{
"epoch": 0.7967216239397694,
"grad_norm": 1.239161491394043,
"learning_rate": 2.107198160794136e-05,
"loss": 0.4981,
"step": 25080
},
{
"epoch": 0.7973569681374885,
"grad_norm": 0.9950107336044312,
"learning_rate": 2.0945795083658447e-05,
"loss": 0.506,
"step": 25100
},
{
"epoch": 0.7979923123352076,
"grad_norm": 0.7783673405647278,
"learning_rate": 2.0819943306732082e-05,
"loss": 0.4763,
"step": 25120
},
{
"epoch": 0.7986276565329267,
"grad_norm": 0.912331223487854,
"learning_rate": 2.0694426810069345e-05,
"loss": 0.4622,
"step": 25140
},
{
"epoch": 0.7992630007306458,
"grad_norm": 0.8284201622009277,
"learning_rate": 2.0569246125157658e-05,
"loss": 0.513,
"step": 25160
},
{
"epoch": 0.799898344928365,
"grad_norm": 1.1468638181686401,
"learning_rate": 2.0444401782062518e-05,
"loss": 0.4719,
"step": 25180
},
{
"epoch": 0.800533689126084,
"grad_norm": 1.0985773801803589,
"learning_rate": 2.0319894309425146e-05,
"loss": 0.4871,
"step": 25200
},
{
"epoch": 0.8011690333238032,
"grad_norm": 1.1010768413543701,
"learning_rate": 2.0195724234460322e-05,
"loss": 0.5459,
"step": 25220
},
{
"epoch": 0.8018043775215223,
"grad_norm": 0.9938257336616516,
"learning_rate": 2.0071892082954248e-05,
"loss": 0.5127,
"step": 25240
},
{
"epoch": 0.8024397217192414,
"grad_norm": 1.1338539123535156,
"learning_rate": 1.9954565018232684e-05,
"loss": 0.4838,
"step": 25260
},
{
"epoch": 0.8030750659169605,
"grad_norm": 0.7955858111381531,
"learning_rate": 1.9831393324342518e-05,
"loss": 0.4865,
"step": 25280
},
{
"epoch": 0.8037104101146796,
"grad_norm": 1.0443702936172485,
"learning_rate": 1.9708561096634902e-05,
"loss": 0.4749,
"step": 25300
},
{
"epoch": 0.8043457543123987,
"grad_norm": 1.0816038846969604,
"learning_rate": 1.958606885523103e-05,
"loss": 0.5142,
"step": 25320
},
{
"epoch": 0.8049810985101179,
"grad_norm": 1.2127019166946411,
"learning_rate": 1.946391711881239e-05,
"loss": 0.4831,
"step": 25340
},
{
"epoch": 0.8056164427078369,
"grad_norm": 0.8780348300933838,
"learning_rate": 1.9342106404618632e-05,
"loss": 0.5113,
"step": 25360
},
{
"epoch": 0.8062517869055561,
"grad_norm": 0.7795581221580505,
"learning_rate": 1.9220637228445438e-05,
"loss": 0.4721,
"step": 25380
},
{
"epoch": 0.8068871311032751,
"grad_norm": 0.9518604874610901,
"learning_rate": 1.9099510104642216e-05,
"loss": 0.4754,
"step": 25400
},
{
"epoch": 0.8075224753009943,
"grad_norm": 1.0051589012145996,
"learning_rate": 1.8978725546110022e-05,
"loss": 0.4936,
"step": 25420
},
{
"epoch": 0.8081578194987135,
"grad_norm": 0.8047780394554138,
"learning_rate": 1.8858284064299326e-05,
"loss": 0.4901,
"step": 25440
},
{
"epoch": 0.8087931636964325,
"grad_norm": 1.1246352195739746,
"learning_rate": 1.8738186169207917e-05,
"loss": 0.5117,
"step": 25460
},
{
"epoch": 0.8094285078941517,
"grad_norm": 0.8150719404220581,
"learning_rate": 1.861843236937867e-05,
"loss": 0.4685,
"step": 25480
},
{
"epoch": 0.8100638520918708,
"grad_norm": 2.195882558822632,
"learning_rate": 1.8499023171897388e-05,
"loss": 0.471,
"step": 25500
},
{
"epoch": 0.8106991962895899,
"grad_norm": 0.8962704539299011,
"learning_rate": 1.8379959082390798e-05,
"loss": 0.481,
"step": 25520
},
{
"epoch": 0.811334540487309,
"grad_norm": 0.8531712889671326,
"learning_rate": 1.8261240605024165e-05,
"loss": 0.4881,
"step": 25540
},
{
"epoch": 0.8119698846850281,
"grad_norm": 0.9354826807975769,
"learning_rate": 1.8142868242499368e-05,
"loss": 0.4761,
"step": 25560
},
{
"epoch": 0.8126052288827472,
"grad_norm": 1.0048118829727173,
"learning_rate": 1.8024842496052708e-05,
"loss": 0.4968,
"step": 25580
},
{
"epoch": 0.8132405730804664,
"grad_norm": 0.8254916071891785,
"learning_rate": 1.790716386545275e-05,
"loss": 0.5076,
"step": 25600
},
{
"epoch": 0.8138759172781854,
"grad_norm": 0.9708372950553894,
"learning_rate": 1.778983284899819e-05,
"loss": 0.5197,
"step": 25620
},
{
"epoch": 0.8145112614759046,
"grad_norm": 0.9034101366996765,
"learning_rate": 1.767284994351588e-05,
"loss": 0.4954,
"step": 25640
},
{
"epoch": 0.8151466056736237,
"grad_norm": 1.3567668199539185,
"learning_rate": 1.7556215644358564e-05,
"loss": 0.5133,
"step": 25660
},
{
"epoch": 0.8157819498713428,
"grad_norm": 0.9000421166419983,
"learning_rate": 1.743993044540282e-05,
"loss": 0.524,
"step": 25680
},
{
"epoch": 0.8164172940690619,
"grad_norm": 0.7230278849601746,
"learning_rate": 1.7323994839047086e-05,
"loss": 0.4831,
"step": 25700
},
{
"epoch": 0.817052638266781,
"grad_norm": 0.8648797273635864,
"learning_rate": 1.7208409316209407e-05,
"loss": 0.4932,
"step": 25720
},
{
"epoch": 0.8176879824645001,
"grad_norm": 0.9017996788024902,
"learning_rate": 1.709317436632547e-05,
"loss": 0.4787,
"step": 25740
},
{
"epoch": 0.8183233266622193,
"grad_norm": 0.9122520685195923,
"learning_rate": 1.697829047734646e-05,
"loss": 0.4721,
"step": 25760
},
{
"epoch": 0.8189586708599383,
"grad_norm": 0.9448441863059998,
"learning_rate": 1.6863758135737085e-05,
"loss": 0.4772,
"step": 25780
},
{
"epoch": 0.8195940150576575,
"grad_norm": 1.052437424659729,
"learning_rate": 1.6749577826473405e-05,
"loss": 0.5252,
"step": 25800
},
{
"epoch": 0.8202293592553767,
"grad_norm": 0.9826536774635315,
"learning_rate": 1.6635750033040842e-05,
"loss": 0.5187,
"step": 25820
},
{
"epoch": 0.8208647034530957,
"grad_norm": 0.8498765826225281,
"learning_rate": 1.6522275237432193e-05,
"loss": 0.4792,
"step": 25840
},
{
"epoch": 0.8215000476508149,
"grad_norm": 0.9139013886451721,
"learning_rate": 1.6409153920145416e-05,
"loss": 0.5006,
"step": 25860
},
{
"epoch": 0.8221353918485339,
"grad_norm": 0.9082590937614441,
"learning_rate": 1.6296386560181744e-05,
"loss": 0.4801,
"step": 25880
},
{
"epoch": 0.8227707360462531,
"grad_norm": 0.8360690474510193,
"learning_rate": 1.618397363504366e-05,
"loss": 0.491,
"step": 25900
},
{
"epoch": 0.8234060802439722,
"grad_norm": 0.8585413098335266,
"learning_rate": 1.6071915620732746e-05,
"loss": 0.4952,
"step": 25920
},
{
"epoch": 0.8240414244416913,
"grad_norm": 0.9051182866096497,
"learning_rate": 1.5960212991747804e-05,
"loss": 0.5021,
"step": 25940
},
{
"epoch": 0.8246767686394104,
"grad_norm": 1.1850552558898926,
"learning_rate": 1.584886622108276e-05,
"loss": 0.5194,
"step": 25960
},
{
"epoch": 0.8253121128371295,
"grad_norm": 0.8449670672416687,
"learning_rate": 1.57378757802247e-05,
"loss": 0.4988,
"step": 25980
},
{
"epoch": 0.8259474570348486,
"grad_norm": 0.9663527607917786,
"learning_rate": 1.5627242139151867e-05,
"loss": 0.4782,
"step": 26000
},
{
"epoch": 0.8259474570348486,
"eval_loss": 0.44560423493385315,
"eval_runtime": 45.0247,
"eval_samples_per_second": 60.034,
"eval_steps_per_second": 30.028,
"step": 26000
},
{
"epoch": 0.8265828012325678,
"grad_norm": 1.0954176187515259,
"learning_rate": 1.5516965766331715e-05,
"loss": 0.4992,
"step": 26020
},
{
"epoch": 0.8272181454302868,
"grad_norm": 0.9752370119094849,
"learning_rate": 1.540704712871881e-05,
"loss": 0.5109,
"step": 26040
},
{
"epoch": 0.827853489628006,
"grad_norm": 0.7089188098907471,
"learning_rate": 1.5297486691752928e-05,
"loss": 0.4669,
"step": 26060
},
{
"epoch": 0.8284888338257251,
"grad_norm": 0.8641648292541504,
"learning_rate": 1.5188284919357155e-05,
"loss": 0.4905,
"step": 26080
},
{
"epoch": 0.8291241780234442,
"grad_norm": 0.8167259097099304,
"learning_rate": 1.5079442273935773e-05,
"loss": 0.4776,
"step": 26100
},
{
"epoch": 0.8297595222211633,
"grad_norm": 0.9287614226341248,
"learning_rate": 1.4970959216372372e-05,
"loss": 0.4803,
"step": 26120
},
{
"epoch": 0.8303948664188824,
"grad_norm": 0.8652564883232117,
"learning_rate": 1.4862836206027975e-05,
"loss": 0.4623,
"step": 26140
},
{
"epoch": 0.8310302106166015,
"grad_norm": 0.9141151309013367,
"learning_rate": 1.4755073700738953e-05,
"loss": 0.507,
"step": 26160
},
{
"epoch": 0.8316655548143207,
"grad_norm": 0.9454159736633301,
"learning_rate": 1.464767215681515e-05,
"loss": 0.5218,
"step": 26180
},
{
"epoch": 0.8323008990120397,
"grad_norm": 0.7766212821006775,
"learning_rate": 1.4540632029038026e-05,
"loss": 0.5294,
"step": 26200
},
{
"epoch": 0.8329362432097589,
"grad_norm": 0.8662501573562622,
"learning_rate": 1.443395377065858e-05,
"loss": 0.4931,
"step": 26220
},
{
"epoch": 0.833571587407478,
"grad_norm": 1.0195443630218506,
"learning_rate": 1.4327637833395525e-05,
"loss": 0.5165,
"step": 26240
},
{
"epoch": 0.8342069316051971,
"grad_norm": 0.9022318124771118,
"learning_rate": 1.422168466743341e-05,
"loss": 0.4732,
"step": 26260
},
{
"epoch": 0.8348422758029163,
"grad_norm": 0.9162563681602478,
"learning_rate": 1.4116094721420625e-05,
"loss": 0.496,
"step": 26280
},
{
"epoch": 0.8354776200006353,
"grad_norm": 1.129158854484558,
"learning_rate": 1.401086844246755e-05,
"loss": 0.4764,
"step": 26300
},
{
"epoch": 0.8361129641983545,
"grad_norm": 0.8695496320724487,
"learning_rate": 1.3906006276144601e-05,
"loss": 0.4852,
"step": 26320
},
{
"epoch": 0.8367483083960736,
"grad_norm": 1.7362381219863892,
"learning_rate": 1.3801508666480512e-05,
"loss": 0.4642,
"step": 26340
},
{
"epoch": 0.8373836525937927,
"grad_norm": 0.7645226716995239,
"learning_rate": 1.369737605596022e-05,
"loss": 0.503,
"step": 26360
},
{
"epoch": 0.8380189967915118,
"grad_norm": 0.8403562903404236,
"learning_rate": 1.3593608885523158e-05,
"loss": 0.4766,
"step": 26380
},
{
"epoch": 0.8386543409892309,
"grad_norm": 0.7841979265213013,
"learning_rate": 1.3490207594561366e-05,
"loss": 0.4917,
"step": 26400
},
{
"epoch": 0.83928968518695,
"grad_norm": 0.8631531000137329,
"learning_rate": 1.3392315662821897e-05,
"loss": 0.4972,
"step": 26420
},
{
"epoch": 0.8399250293846692,
"grad_norm": 1.0436699390411377,
"learning_rate": 1.3289629094769217e-05,
"loss": 0.4847,
"step": 26440
},
{
"epoch": 0.8405603735823882,
"grad_norm": 0.9521028399467468,
"learning_rate": 1.318730969336468e-05,
"loss": 0.4972,
"step": 26460
},
{
"epoch": 0.8411957177801074,
"grad_norm": 0.9861098527908325,
"learning_rate": 1.3085357891869909e-05,
"loss": 0.5114,
"step": 26480
},
{
"epoch": 0.8418310619778265,
"grad_norm": 1.3008265495300293,
"learning_rate": 1.2983774121989888e-05,
"loss": 0.5071,
"step": 26500
},
{
"epoch": 0.8424664061755456,
"grad_norm": 0.7970487475395203,
"learning_rate": 1.2882558813871204e-05,
"loss": 0.4945,
"step": 26520
},
{
"epoch": 0.8431017503732647,
"grad_norm": 0.7304345369338989,
"learning_rate": 1.2781712396100287e-05,
"loss": 0.4902,
"step": 26540
},
{
"epoch": 0.8437370945709838,
"grad_norm": 0.9716693162918091,
"learning_rate": 1.2681235295701488e-05,
"loss": 0.4857,
"step": 26560
},
{
"epoch": 0.8443724387687029,
"grad_norm": 0.9461120963096619,
"learning_rate": 1.2581127938135328e-05,
"loss": 0.5139,
"step": 26580
},
{
"epoch": 0.8450077829664221,
"grad_norm": 0.8130011558532715,
"learning_rate": 1.2481390747296717e-05,
"loss": 0.4788,
"step": 26600
},
{
"epoch": 0.8456431271641411,
"grad_norm": 0.959818959236145,
"learning_rate": 1.2382024145513094e-05,
"loss": 0.4808,
"step": 26620
},
{
"epoch": 0.8462784713618603,
"grad_norm": 1.2069573402404785,
"learning_rate": 1.2283028553542674e-05,
"loss": 0.4692,
"step": 26640
},
{
"epoch": 0.8469138155595795,
"grad_norm": 1.0251085758209229,
"learning_rate": 1.2184404390572712e-05,
"loss": 0.5106,
"step": 26660
},
{
"epoch": 0.8475491597572985,
"grad_norm": 0.9423872828483582,
"learning_rate": 1.2086152074217638e-05,
"loss": 0.4881,
"step": 26680
},
{
"epoch": 0.8481845039550177,
"grad_norm": 0.8245638608932495,
"learning_rate": 1.1988272020517322e-05,
"loss": 0.4606,
"step": 26700
},
{
"epoch": 0.8488198481527367,
"grad_norm": 1.0099587440490723,
"learning_rate": 1.1890764643935393e-05,
"loss": 0.4976,
"step": 26720
},
{
"epoch": 0.8494551923504559,
"grad_norm": 0.8285634517669678,
"learning_rate": 1.1793630357357355e-05,
"loss": 0.5057,
"step": 26740
},
{
"epoch": 0.850090536548175,
"grad_norm": 0.9125322699546814,
"learning_rate": 1.169686957208892e-05,
"loss": 0.4856,
"step": 26760
},
{
"epoch": 0.8507258807458941,
"grad_norm": 1.1413007974624634,
"learning_rate": 1.1600482697854198e-05,
"loss": 0.4916,
"step": 26780
},
{
"epoch": 0.8513612249436132,
"grad_norm": 0.9246459603309631,
"learning_rate": 1.1504470142794121e-05,
"loss": 0.4807,
"step": 26800
},
{
"epoch": 0.8519965691413324,
"grad_norm": 0.9050401449203491,
"learning_rate": 1.140883231346449e-05,
"loss": 0.4844,
"step": 26820
},
{
"epoch": 0.8526319133390514,
"grad_norm": 0.8217797875404358,
"learning_rate": 1.1313569614834408e-05,
"loss": 0.4751,
"step": 26840
},
{
"epoch": 0.8532672575367706,
"grad_norm": 1.0189076662063599,
"learning_rate": 1.1218682450284545e-05,
"loss": 0.4949,
"step": 26860
},
{
"epoch": 0.8539026017344896,
"grad_norm": 0.7574889659881592,
"learning_rate": 1.112417122160535e-05,
"loss": 0.4738,
"step": 26880
},
{
"epoch": 0.8545379459322088,
"grad_norm": 0.6649676561355591,
"learning_rate": 1.1030036328995497e-05,
"loss": 0.4859,
"step": 26900
},
{
"epoch": 0.8551732901299279,
"grad_norm": 0.7144981622695923,
"learning_rate": 1.0936278171060032e-05,
"loss": 0.4799,
"step": 26920
},
{
"epoch": 0.855808634327647,
"grad_norm": 0.9074038863182068,
"learning_rate": 1.0842897144808762e-05,
"loss": 0.4951,
"step": 26940
},
{
"epoch": 0.8564439785253661,
"grad_norm": 0.9271389842033386,
"learning_rate": 1.0749893645654551e-05,
"loss": 0.4692,
"step": 26960
},
{
"epoch": 0.8570793227230852,
"grad_norm": 0.9277658462524414,
"learning_rate": 1.0657268067411752e-05,
"loss": 0.4711,
"step": 26980
},
{
"epoch": 0.8577146669208043,
"grad_norm": 1.5766148567199707,
"learning_rate": 1.0565020802294357e-05,
"loss": 0.5081,
"step": 27000
},
{
"epoch": 0.8577146669208043,
"eval_loss": 0.4444785416126251,
"eval_runtime": 45.2678,
"eval_samples_per_second": 59.711,
"eval_steps_per_second": 29.867,
"step": 27000
},
{
"epoch": 0.8583500111185235,
"grad_norm": 0.7567349076271057,
"learning_rate": 1.0473152240914419e-05,
"loss": 0.4671,
"step": 27020
},
{
"epoch": 0.8589853553162425,
"grad_norm": 1.0230178833007812,
"learning_rate": 1.0381662772280498e-05,
"loss": 0.4874,
"step": 27040
},
{
"epoch": 0.8596206995139617,
"grad_norm": 0.7454288005828857,
"learning_rate": 1.0290552783795849e-05,
"loss": 0.4825,
"step": 27060
},
{
"epoch": 0.8602560437116809,
"grad_norm": 0.9813241958618164,
"learning_rate": 1.0199822661256852e-05,
"loss": 0.4785,
"step": 27080
},
{
"epoch": 0.8608913879093999,
"grad_norm": 0.8269158005714417,
"learning_rate": 1.0109472788851427e-05,
"loss": 0.4797,
"step": 27100
},
{
"epoch": 0.861526732107119,
"grad_norm": 0.8101191520690918,
"learning_rate": 1.001950354915734e-05,
"loss": 0.4735,
"step": 27120
},
{
"epoch": 0.8621620763048381,
"grad_norm": 0.903421938419342,
"learning_rate": 9.929915323140571e-06,
"loss": 0.5,
"step": 27140
},
{
"epoch": 0.8627974205025573,
"grad_norm": 0.7358487248420715,
"learning_rate": 9.840708490153817e-06,
"loss": 0.4799,
"step": 27160
},
{
"epoch": 0.8634327647002764,
"grad_norm": 0.9838561415672302,
"learning_rate": 9.751883427934717e-06,
"loss": 0.506,
"step": 27180
},
{
"epoch": 0.8640681088979955,
"grad_norm": 0.9448813796043396,
"learning_rate": 9.66344051260436e-06,
"loss": 0.4966,
"step": 27200
},
{
"epoch": 0.8647034530957146,
"grad_norm": 1.111055612564087,
"learning_rate": 9.575380118665733e-06,
"loss": 0.5118,
"step": 27220
},
{
"epoch": 0.8653387972934338,
"grad_norm": 0.968305230140686,
"learning_rate": 9.487702619001992e-06,
"loss": 0.5002,
"step": 27240
},
{
"epoch": 0.8659741414911528,
"grad_norm": 0.8771995902061462,
"learning_rate": 9.400408384874992e-06,
"loss": 0.497,
"step": 27260
},
{
"epoch": 0.866609485688872,
"grad_norm": 1.0422018766403198,
"learning_rate": 9.31349778592373e-06,
"loss": 0.5081,
"step": 27280
},
{
"epoch": 0.867244829886591,
"grad_norm": 0.8950514197349548,
"learning_rate": 9.22697119016267e-06,
"loss": 0.4957,
"step": 27300
},
{
"epoch": 0.8678801740843102,
"grad_norm": 0.8093190789222717,
"learning_rate": 9.140828963980297e-06,
"loss": 0.4667,
"step": 27320
},
{
"epoch": 0.8685155182820293,
"grad_norm": 0.8465502262115479,
"learning_rate": 9.055071472137466e-06,
"loss": 0.4913,
"step": 27340
},
{
"epoch": 0.8691508624797484,
"grad_norm": 0.8349893093109131,
"learning_rate": 8.969699077766014e-06,
"loss": 0.4738,
"step": 27360
},
{
"epoch": 0.8697862066774675,
"grad_norm": 0.831910252571106,
"learning_rate": 8.884712142367024e-06,
"loss": 0.4923,
"step": 27380
},
{
"epoch": 0.8704215508751866,
"grad_norm": 0.9581566452980042,
"learning_rate": 8.80011102580941e-06,
"loss": 0.4856,
"step": 27400
},
{
"epoch": 0.8710568950729057,
"grad_norm": 0.823250412940979,
"learning_rate": 8.720097656085246e-06,
"loss": 0.4886,
"step": 27420
},
{
"epoch": 0.8716922392706249,
"grad_norm": 0.988389253616333,
"learning_rate": 8.636249915153039e-06,
"loss": 0.4946,
"step": 27440
},
{
"epoch": 0.8723275834683439,
"grad_norm": 0.85055011510849,
"learning_rate": 8.55695289500451e-06,
"loss": 0.4885,
"step": 27460
},
{
"epoch": 0.8729629276660631,
"grad_norm": 0.9092792272567749,
"learning_rate": 8.473859879755397e-06,
"loss": 0.4631,
"step": 27480
},
{
"epoch": 0.8735982718637822,
"grad_norm": 0.930949330329895,
"learning_rate": 8.39115442306171e-06,
"loss": 0.4955,
"step": 27500
},
{
"epoch": 0.8742336160615013,
"grad_norm": 0.7822802066802979,
"learning_rate": 8.308836875131665e-06,
"loss": 0.4842,
"step": 27520
},
{
"epoch": 0.8748689602592205,
"grad_norm": 0.7877179384231567,
"learning_rate": 8.22690758453094e-06,
"loss": 0.5006,
"step": 27540
},
{
"epoch": 0.8755043044569395,
"grad_norm": 0.9965065717697144,
"learning_rate": 8.145366898181139e-06,
"loss": 0.4866,
"step": 27560
},
{
"epoch": 0.8761396486546587,
"grad_norm": 1.1015229225158691,
"learning_rate": 8.064215161358402e-06,
"loss": 0.5203,
"step": 27580
},
{
"epoch": 0.8767749928523778,
"grad_norm": 0.7929244637489319,
"learning_rate": 7.983452717691852e-06,
"loss": 0.477,
"step": 27600
},
{
"epoch": 0.8774103370500969,
"grad_norm": 1.0685256719589233,
"learning_rate": 7.903079909162258e-06,
"loss": 0.5385,
"step": 27620
},
{
"epoch": 0.878045681247816,
"grad_norm": 1.0020925998687744,
"learning_rate": 7.82309707610046e-06,
"loss": 0.5061,
"step": 27640
},
{
"epoch": 0.8786810254455352,
"grad_norm": 0.8348806500434875,
"learning_rate": 7.743504557185976e-06,
"loss": 0.505,
"step": 27660
},
{
"epoch": 0.8793163696432542,
"grad_norm": 0.8327703475952148,
"learning_rate": 7.664302689445635e-06,
"loss": 0.4633,
"step": 27680
},
{
"epoch": 0.8799517138409734,
"grad_norm": 0.9524950385093689,
"learning_rate": 7.5854918082520435e-06,
"loss": 0.4859,
"step": 27700
},
{
"epoch": 0.8805870580386924,
"grad_norm": 0.8677568435668945,
"learning_rate": 7.507072247322211e-06,
"loss": 0.4832,
"step": 27720
},
{
"epoch": 0.8812224022364116,
"grad_norm": 0.9326565265655518,
"learning_rate": 7.429044338716196e-06,
"loss": 0.493,
"step": 27740
},
{
"epoch": 0.8818577464341307,
"grad_norm": 0.7510032057762146,
"learning_rate": 7.35140841283557e-06,
"loss": 0.489,
"step": 27760
},
{
"epoch": 0.8824930906318498,
"grad_norm": 0.7510486841201782,
"learning_rate": 7.274164798422134e-06,
"loss": 0.4741,
"step": 27780
},
{
"epoch": 0.8831284348295689,
"grad_norm": 0.8744218945503235,
"learning_rate": 7.197313822556462e-06,
"loss": 0.4698,
"step": 27800
},
{
"epoch": 0.8837637790272881,
"grad_norm": 0.7554096579551697,
"learning_rate": 7.12085581065658e-06,
"loss": 0.4561,
"step": 27820
},
{
"epoch": 0.8843991232250071,
"grad_norm": 1.0702250003814697,
"learning_rate": 7.044791086476499e-06,
"loss": 0.5074,
"step": 27840
},
{
"epoch": 0.8850344674227263,
"grad_norm": 1.2190712690353394,
"learning_rate": 6.969119972104898e-06,
"loss": 0.4873,
"step": 27860
},
{
"epoch": 0.8856698116204453,
"grad_norm": 0.8235007524490356,
"learning_rate": 6.893842787963789e-06,
"loss": 0.4884,
"step": 27880
},
{
"epoch": 0.8863051558181645,
"grad_norm": 0.8809916973114014,
"learning_rate": 6.818959852807083e-06,
"loss": 0.4746,
"step": 27900
},
{
"epoch": 0.8869405000158836,
"grad_norm": 0.8362717628479004,
"learning_rate": 6.744471483719306e-06,
"loss": 0.5139,
"step": 27920
},
{
"epoch": 0.8875758442136027,
"grad_norm": 0.9398446083068848,
"learning_rate": 6.67037799611423e-06,
"loss": 0.5002,
"step": 27940
},
{
"epoch": 0.8882111884113219,
"grad_norm": 0.750577449798584,
"learning_rate": 6.596679703733544e-06,
"loss": 0.4965,
"step": 27960
},
{
"epoch": 0.8888465326090409,
"grad_norm": 1.0199640989303589,
"learning_rate": 6.523376918645474e-06,
"loss": 0.5101,
"step": 27980
},
{
"epoch": 0.88948187680676,
"grad_norm": 0.8302307724952698,
"learning_rate": 6.4504699512435985e-06,
"loss": 0.4608,
"step": 28000
},
{
"epoch": 0.88948187680676,
"eval_loss": 0.4442509412765503,
"eval_runtime": 44.8835,
"eval_samples_per_second": 60.223,
"eval_steps_per_second": 30.122,
"step": 28000
},
{
"epoch": 0.8901172210044792,
"grad_norm": 0.7648799419403076,
"learning_rate": 6.377959110245357e-06,
"loss": 0.4704,
"step": 28020
},
{
"epoch": 0.8907525652021983,
"grad_norm": 0.8950293064117432,
"learning_rate": 6.305844702690878e-06,
"loss": 0.4906,
"step": 28040
},
{
"epoch": 0.8913879093999174,
"grad_norm": 0.9124616384506226,
"learning_rate": 6.234127033941628e-06,
"loss": 0.4939,
"step": 28060
},
{
"epoch": 0.8920232535976366,
"grad_norm": 0.8970253467559814,
"learning_rate": 6.1628064076791e-06,
"loss": 0.5088,
"step": 28080
},
{
"epoch": 0.8926585977953556,
"grad_norm": 0.9791019558906555,
"learning_rate": 6.091883125903575e-06,
"loss": 0.4613,
"step": 28100
},
{
"epoch": 0.8932939419930748,
"grad_norm": 1.3384908437728882,
"learning_rate": 6.021357488932789e-06,
"loss": 0.4737,
"step": 28120
},
{
"epoch": 0.8939292861907938,
"grad_norm": 1.076692819595337,
"learning_rate": 5.951229795400726e-06,
"loss": 0.5094,
"step": 28140
},
{
"epoch": 0.894564630388513,
"grad_norm": 0.9772495031356812,
"learning_rate": 5.881500342256285e-06,
"loss": 0.4791,
"step": 28160
},
{
"epoch": 0.8951999745862321,
"grad_norm": 0.946626603603363,
"learning_rate": 5.8121694247620485e-06,
"loss": 0.4843,
"step": 28180
},
{
"epoch": 0.8958353187839512,
"grad_norm": 0.9328265190124512,
"learning_rate": 5.74323733649309e-06,
"loss": 0.4822,
"step": 28200
},
{
"epoch": 0.8964706629816703,
"grad_norm": 0.7450932264328003,
"learning_rate": 5.674704369335637e-06,
"loss": 0.4746,
"step": 28220
},
{
"epoch": 0.8971060071793895,
"grad_norm": 1.0023432970046997,
"learning_rate": 5.606570813485856e-06,
"loss": 0.4941,
"step": 28240
},
{
"epoch": 0.8977413513771085,
"grad_norm": 0.8717949986457825,
"learning_rate": 5.538836957448712e-06,
"loss": 0.4801,
"step": 28260
},
{
"epoch": 0.8983766955748277,
"grad_norm": 0.8665459156036377,
"learning_rate": 5.474860277416504e-06,
"loss": 0.4782,
"step": 28280
},
{
"epoch": 0.8990120397725467,
"grad_norm": 0.8660995364189148,
"learning_rate": 5.407906659415618e-06,
"loss": 0.4788,
"step": 28300
},
{
"epoch": 0.8996473839702659,
"grad_norm": 0.9390355944633484,
"learning_rate": 5.341353582451425e-06,
"loss": 0.478,
"step": 28320
},
{
"epoch": 0.900282728167985,
"grad_norm": 0.8287180662155151,
"learning_rate": 5.275201328336477e-06,
"loss": 0.4846,
"step": 28340
},
{
"epoch": 0.9009180723657041,
"grad_norm": 0.8496334552764893,
"learning_rate": 5.209450177186081e-06,
"loss": 0.4838,
"step": 28360
},
{
"epoch": 0.9015534165634232,
"grad_norm": 0.9892422556877136,
"learning_rate": 5.144100407417063e-06,
"loss": 0.4854,
"step": 28380
},
{
"epoch": 0.9021887607611423,
"grad_norm": 0.9813452363014221,
"learning_rate": 5.0791522957467365e-06,
"loss": 0.4916,
"step": 28400
},
{
"epoch": 0.9028241049588615,
"grad_norm": 0.9126195907592773,
"learning_rate": 5.014606117191545e-06,
"loss": 0.4949,
"step": 28420
},
{
"epoch": 0.9034594491565806,
"grad_norm": 0.8669445514678955,
"learning_rate": 4.950462145066015e-06,
"loss": 0.482,
"step": 28440
},
{
"epoch": 0.9040947933542997,
"grad_norm": 0.9803065657615662,
"learning_rate": 4.886720650981569e-06,
"loss": 0.5025,
"step": 28460
},
{
"epoch": 0.9047301375520188,
"grad_norm": 0.9414586424827576,
"learning_rate": 4.823381904845392e-06,
"loss": 0.4856,
"step": 28480
},
{
"epoch": 0.905365481749738,
"grad_norm": 0.9295367002487183,
"learning_rate": 4.760446174859224e-06,
"loss": 0.4876,
"step": 28500
},
{
"epoch": 0.906000825947457,
"grad_norm": 0.8859279751777649,
"learning_rate": 4.697913727518332e-06,
"loss": 0.5152,
"step": 28520
},
{
"epoch": 0.9066361701451762,
"grad_norm": 0.7441398501396179,
"learning_rate": 4.63578482761029e-06,
"loss": 0.4787,
"step": 28540
},
{
"epoch": 0.9072715143428952,
"grad_norm": 1.459954023361206,
"learning_rate": 4.574059738213876e-06,
"loss": 0.4813,
"step": 28560
},
{
"epoch": 0.9079068585406144,
"grad_norm": 0.9451243281364441,
"learning_rate": 4.512738720698018e-06,
"loss": 0.4835,
"step": 28580
},
{
"epoch": 0.9085422027383335,
"grad_norm": 0.8990492820739746,
"learning_rate": 4.451822034720587e-06,
"loss": 0.4811,
"step": 28600
},
{
"epoch": 0.9091775469360526,
"grad_norm": 0.7530508637428284,
"learning_rate": 4.3913099382273835e-06,
"loss": 0.5,
"step": 28620
},
{
"epoch": 0.9098128911337717,
"grad_norm": 0.8113830089569092,
"learning_rate": 4.331202687451019e-06,
"loss": 0.5075,
"step": 28640
},
{
"epoch": 0.9104482353314909,
"grad_norm": 0.8615418672561646,
"learning_rate": 4.2715005369097895e-06,
"loss": 0.5152,
"step": 28660
},
{
"epoch": 0.9110835795292099,
"grad_norm": 0.8459773659706116,
"learning_rate": 4.212203739406673e-06,
"loss": 0.4804,
"step": 28680
},
{
"epoch": 0.9117189237269291,
"grad_norm": 0.8821284770965576,
"learning_rate": 4.153312546028199e-06,
"loss": 0.5311,
"step": 28700
},
{
"epoch": 0.9123542679246481,
"grad_norm": 1.0187216997146606,
"learning_rate": 4.0948272061434035e-06,
"loss": 0.4632,
"step": 28720
},
{
"epoch": 0.9129896121223673,
"grad_norm": 0.9274182915687561,
"learning_rate": 4.036747967402788e-06,
"loss": 0.4832,
"step": 28740
},
{
"epoch": 0.9136249563200864,
"grad_norm": 0.7573745846748352,
"learning_rate": 3.979075075737226e-06,
"loss": 0.4905,
"step": 28760
},
{
"epoch": 0.9142603005178055,
"grad_norm": 0.9005789160728455,
"learning_rate": 3.921808775357027e-06,
"loss": 0.5114,
"step": 28780
},
{
"epoch": 0.9148956447155246,
"grad_norm": 0.9073104858398438,
"learning_rate": 3.864949308750743e-06,
"loss": 0.5018,
"step": 28800
},
{
"epoch": 0.9155309889132438,
"grad_norm": 0.7230907678604126,
"learning_rate": 3.808496916684268e-06,
"loss": 0.4954,
"step": 28820
},
{
"epoch": 0.9161663331109628,
"grad_norm": 0.7139384746551514,
"learning_rate": 3.7524518381997885e-06,
"loss": 0.464,
"step": 28840
},
{
"epoch": 0.916801677308682,
"grad_norm": 0.8710399866104126,
"learning_rate": 3.696814310614749e-06,
"loss": 0.5048,
"step": 28860
},
{
"epoch": 0.917437021506401,
"grad_norm": 0.87566739320755,
"learning_rate": 3.6415845695208505e-06,
"loss": 0.484,
"step": 28880
},
{
"epoch": 0.9180723657041202,
"grad_norm": 0.9447526335716248,
"learning_rate": 3.586762848783076e-06,
"loss": 0.5032,
"step": 28900
},
{
"epoch": 0.9187077099018394,
"grad_norm": 0.7784162759780884,
"learning_rate": 3.53234938053868e-06,
"loss": 0.4451,
"step": 28920
},
{
"epoch": 0.9193430540995584,
"grad_norm": 0.9225743412971497,
"learning_rate": 3.478344395196198e-06,
"loss": 0.4745,
"step": 28940
},
{
"epoch": 0.9199783982972776,
"grad_norm": 0.9712013602256775,
"learning_rate": 3.4247481214345177e-06,
"loss": 0.4956,
"step": 28960
},
{
"epoch": 0.9206137424949966,
"grad_norm": 1.2805237770080566,
"learning_rate": 3.371560786201855e-06,
"loss": 0.4971,
"step": 28980
},
{
"epoch": 0.9212490866927158,
"grad_norm": 0.7866525053977966,
"learning_rate": 3.3187826147147994e-06,
"loss": 0.497,
"step": 29000
},
{
"epoch": 0.9212490866927158,
"eval_loss": 0.44399821758270264,
"eval_runtime": 45.0357,
"eval_samples_per_second": 60.019,
"eval_steps_per_second": 30.021,
"step": 29000
},
{
"epoch": 0.9218844308904349,
"grad_norm": 0.7901077270507812,
"learning_rate": 3.2664138304574153e-06,
"loss": 0.514,
"step": 29020
},
{
"epoch": 0.922519775088154,
"grad_norm": 1.0464386940002441,
"learning_rate": 3.2144546551802323e-06,
"loss": 0.5042,
"step": 29040
},
{
"epoch": 0.9231551192858731,
"grad_norm": 0.8520443439483643,
"learning_rate": 3.162905308899322e-06,
"loss": 0.4858,
"step": 29060
},
{
"epoch": 0.9237904634835923,
"grad_norm": 0.92030268907547,
"learning_rate": 3.1117660098953895e-06,
"loss": 0.4766,
"step": 29080
},
{
"epoch": 0.9244258076813113,
"grad_norm": 0.7019485235214233,
"learning_rate": 3.06103697471285e-06,
"loss": 0.4903,
"step": 29100
},
{
"epoch": 0.9250611518790305,
"grad_norm": 1.3560097217559814,
"learning_rate": 3.0107184181588643e-06,
"loss": 0.5125,
"step": 29120
},
{
"epoch": 0.9256964960767495,
"grad_norm": 0.9616526365280151,
"learning_rate": 2.960810553302462e-06,
"loss": 0.512,
"step": 29140
},
{
"epoch": 0.9263318402744687,
"grad_norm": 1.1742409467697144,
"learning_rate": 2.9113135914736856e-06,
"loss": 0.5007,
"step": 29160
},
{
"epoch": 0.9269671844721878,
"grad_norm": 0.8712571263313293,
"learning_rate": 2.8622277422625907e-06,
"loss": 0.4717,
"step": 29180
},
{
"epoch": 0.9276025286699069,
"grad_norm": 0.8578605055809021,
"learning_rate": 2.8135532135184384e-06,
"loss": 0.4989,
"step": 29200
},
{
"epoch": 0.928237872867626,
"grad_norm": 0.8551231026649475,
"learning_rate": 2.7652902113488143e-06,
"loss": 0.4825,
"step": 29220
},
{
"epoch": 0.9288732170653452,
"grad_norm": 0.82204669713974,
"learning_rate": 2.7174389401186996e-06,
"loss": 0.4702,
"step": 29240
},
{
"epoch": 0.9295085612630642,
"grad_norm": 0.9263904690742493,
"learning_rate": 2.6699996024496575e-06,
"loss": 0.4996,
"step": 29260
},
{
"epoch": 0.9301439054607834,
"grad_norm": 1.037817120552063,
"learning_rate": 2.6229723992189704e-06,
"loss": 0.4986,
"step": 29280
},
{
"epoch": 0.9307792496585024,
"grad_norm": 1.0528874397277832,
"learning_rate": 2.5763575295587593e-06,
"loss": 0.4794,
"step": 29300
},
{
"epoch": 0.9314145938562216,
"grad_norm": 0.8765133619308472,
"learning_rate": 2.5301551908551545e-06,
"loss": 0.4878,
"step": 29320
},
{
"epoch": 0.9320499380539408,
"grad_norm": 0.8322685956954956,
"learning_rate": 2.484365578747494e-06,
"loss": 0.4945,
"step": 29340
},
{
"epoch": 0.9326852822516598,
"grad_norm": 0.8344667553901672,
"learning_rate": 2.438988887127436e-06,
"loss": 0.4981,
"step": 29360
},
{
"epoch": 0.933320626449379,
"grad_norm": 0.8750690817832947,
"learning_rate": 2.3940253081381703e-06,
"loss": 0.4969,
"step": 29380
},
{
"epoch": 0.933955970647098,
"grad_norm": 0.808814287185669,
"learning_rate": 2.3494750321736093e-06,
"loss": 0.4623,
"step": 29400
},
{
"epoch": 0.9345913148448172,
"grad_norm": 0.9626306891441345,
"learning_rate": 2.3053382478775754e-06,
"loss": 0.5028,
"step": 29420
},
{
"epoch": 0.9352266590425363,
"grad_norm": 0.9727978706359863,
"learning_rate": 2.261615142143003e-06,
"loss": 0.5059,
"step": 29440
},
{
"epoch": 0.9358620032402554,
"grad_norm": 0.8926533460617065,
"learning_rate": 2.2183059001111174e-06,
"loss": 0.4764,
"step": 29460
},
{
"epoch": 0.9364973474379745,
"grad_norm": 1.0506230592727661,
"learning_rate": 2.1754107051707218e-06,
"loss": 0.5069,
"step": 29480
},
{
"epoch": 0.9371326916356937,
"grad_norm": 0.7190736532211304,
"learning_rate": 2.1329297389573565e-06,
"loss": 0.49,
"step": 29500
},
{
"epoch": 0.9377680358334127,
"grad_norm": 0.7786980867385864,
"learning_rate": 2.09086318135252e-06,
"loss": 0.4766,
"step": 29520
},
{
"epoch": 0.9384033800311319,
"grad_norm": 0.8696832060813904,
"learning_rate": 2.049211210483004e-06,
"loss": 0.4959,
"step": 29540
},
{
"epoch": 0.9390387242288509,
"grad_norm": 0.7167271375656128,
"learning_rate": 2.0079740027200144e-06,
"loss": 0.4927,
"step": 29560
},
{
"epoch": 0.9396740684265701,
"grad_norm": 0.868259072303772,
"learning_rate": 1.967151732678518e-06,
"loss": 0.4788,
"step": 29580
},
{
"epoch": 0.9403094126242892,
"grad_norm": 0.8658266663551331,
"learning_rate": 1.9267445732164325e-06,
"loss": 0.4919,
"step": 29600
},
{
"epoch": 0.9409447568220083,
"grad_norm": 1.010276436805725,
"learning_rate": 1.8867526954339688e-06,
"loss": 0.4811,
"step": 29620
},
{
"epoch": 0.9415801010197274,
"grad_norm": 0.9376817941665649,
"learning_rate": 1.8471762686728344e-06,
"loss": 0.4723,
"step": 29640
},
{
"epoch": 0.9422154452174466,
"grad_norm": 1.520297646522522,
"learning_rate": 1.8080154605155996e-06,
"loss": 0.5146,
"step": 29660
},
{
"epoch": 0.9428507894151656,
"grad_norm": 0.8532717227935791,
"learning_rate": 1.7692704367848756e-06,
"loss": 0.4556,
"step": 29680
},
{
"epoch": 0.9434861336128848,
"grad_norm": 1.069378137588501,
"learning_rate": 1.730941361542704e-06,
"loss": 0.4789,
"step": 29700
},
{
"epoch": 0.9441214778106038,
"grad_norm": 0.8771205544471741,
"learning_rate": 1.6930283970898574e-06,
"loss": 0.4819,
"step": 29720
},
{
"epoch": 0.944756822008323,
"grad_norm": 0.8729512095451355,
"learning_rate": 1.6555317039650852e-06,
"loss": 0.4792,
"step": 29740
},
{
"epoch": 0.9453921662060422,
"grad_norm": 0.8724381923675537,
"learning_rate": 1.6184514409444795e-06,
"loss": 0.4726,
"step": 29760
},
{
"epoch": 0.9460275104037612,
"grad_norm": 0.9022035598754883,
"learning_rate": 1.5817877650408541e-06,
"loss": 0.4891,
"step": 29780
},
{
"epoch": 0.9466628546014804,
"grad_norm": 1.003596544265747,
"learning_rate": 1.5455408315029562e-06,
"loss": 0.4974,
"step": 29800
},
{
"epoch": 0.9472981987991995,
"grad_norm": 0.8569382429122925,
"learning_rate": 1.5097107938149113e-06,
"loss": 0.4781,
"step": 29820
},
{
"epoch": 0.9479335429969186,
"grad_norm": 0.9094131588935852,
"learning_rate": 1.4742978036955457e-06,
"loss": 0.5155,
"step": 29840
},
{
"epoch": 0.9485688871946377,
"grad_norm": 1.0451712608337402,
"learning_rate": 1.4393020110977206e-06,
"loss": 0.4895,
"step": 29860
},
{
"epoch": 0.9492042313923568,
"grad_norm": 1.2386709451675415,
"learning_rate": 1.4047235642077217e-06,
"loss": 0.4702,
"step": 29880
},
{
"epoch": 0.9498395755900759,
"grad_norm": 0.966143786907196,
"learning_rate": 1.3705626094446256e-06,
"loss": 0.4962,
"step": 29900
},
{
"epoch": 0.9504749197877951,
"grad_norm": 0.9544230103492737,
"learning_rate": 1.33681929145969e-06,
"loss": 0.4788,
"step": 29920
},
{
"epoch": 0.9511102639855141,
"grad_norm": 0.8583151698112488,
"learning_rate": 1.3034937531357095e-06,
"loss": 0.477,
"step": 29940
},
{
"epoch": 0.9517456081832333,
"grad_norm": 0.8361521363258362,
"learning_rate": 1.270586135586427e-06,
"loss": 0.5162,
"step": 29960
},
{
"epoch": 0.9523809523809523,
"grad_norm": 1.0520914793014526,
"learning_rate": 1.2380965781559783e-06,
"loss": 0.4762,
"step": 29980
},
{
"epoch": 0.9530162965786715,
"grad_norm": 0.8727782964706421,
"learning_rate": 1.2060252184182386e-06,
"loss": 0.4929,
"step": 30000
},
{
"epoch": 0.9530162965786715,
"eval_loss": 0.443807452917099,
"eval_runtime": 44.5933,
"eval_samples_per_second": 60.614,
"eval_steps_per_second": 30.318,
"step": 30000
},
{
"epoch": 0.9536516407763906,
"grad_norm": 0.7989442944526672,
"learning_rate": 1.174372192176254e-06,
"loss": 0.4932,
"step": 30020
},
{
"epoch": 0.9542869849741097,
"grad_norm": 0.7544863224029541,
"learning_rate": 1.1431376334616994e-06,
"loss": 0.482,
"step": 30040
},
{
"epoch": 0.9549223291718288,
"grad_norm": 0.8897516131401062,
"learning_rate": 1.1123216745342779e-06,
"loss": 0.4898,
"step": 30060
},
{
"epoch": 0.955557673369548,
"grad_norm": 0.8291769027709961,
"learning_rate": 1.0819244458811773e-06,
"loss": 0.5021,
"step": 30080
},
{
"epoch": 0.956193017567267,
"grad_norm": 0.8413028717041016,
"learning_rate": 1.0519460762165144e-06,
"loss": 0.4762,
"step": 30100
},
{
"epoch": 0.9568283617649862,
"grad_norm": 0.9216207265853882,
"learning_rate": 1.0223866924807924e-06,
"loss": 0.4869,
"step": 30120
},
{
"epoch": 0.9574637059627052,
"grad_norm": 0.8935249447822571,
"learning_rate": 9.932464198403325e-07,
"loss": 0.4928,
"step": 30140
},
{
"epoch": 0.9580990501604244,
"grad_norm": 0.7496423721313477,
"learning_rate": 9.645253816867983e-07,
"loss": 0.5266,
"step": 30160
},
{
"epoch": 0.9587343943581436,
"grad_norm": 0.9738262295722961,
"learning_rate": 9.362236996366514e-07,
"loss": 0.4735,
"step": 30180
},
{
"epoch": 0.9593697385558626,
"grad_norm": 0.9249958395957947,
"learning_rate": 9.083414935305956e-07,
"loss": 0.4706,
"step": 30200
},
{
"epoch": 0.9600050827535818,
"grad_norm": 1.0667359828948975,
"learning_rate": 8.808788814331448e-07,
"loss": 0.4721,
"step": 30220
},
{
"epoch": 0.9606404269513009,
"grad_norm": 0.8088135123252869,
"learning_rate": 8.53835979632056e-07,
"loss": 0.4884,
"step": 30240
},
{
"epoch": 0.96127577114902,
"grad_norm": 0.9164936542510986,
"learning_rate": 8.272129026378639e-07,
"loss": 0.5022,
"step": 30260
},
{
"epoch": 0.9619111153467391,
"grad_norm": 0.7835588455200195,
"learning_rate": 8.010097631834245e-07,
"loss": 0.4707,
"step": 30280
},
{
"epoch": 0.9625464595444582,
"grad_norm": 1.2730233669281006,
"learning_rate": 7.752266722233614e-07,
"loss": 0.4795,
"step": 30300
},
{
"epoch": 0.9631818037421773,
"grad_norm": 0.9977156519889832,
"learning_rate": 7.511219051883567e-07,
"loss": 0.5209,
"step": 30320
},
{
"epoch": 0.9638171479398965,
"grad_norm": 0.941656231880188,
"learning_rate": 7.26158221189377e-07,
"loss": 0.4747,
"step": 30340
},
{
"epoch": 0.9644524921376155,
"grad_norm": 0.7258419990539551,
"learning_rate": 7.028320832731084e-07,
"loss": 0.4961,
"step": 30360
},
{
"epoch": 0.9650878363353347,
"grad_norm": 0.974557638168335,
"learning_rate": 6.786882081830093e-07,
"loss": 0.4559,
"step": 30380
},
{
"epoch": 0.9657231805330537,
"grad_norm": 0.973461925983429,
"learning_rate": 6.549648995460511e-07,
"loss": 0.4931,
"step": 30400
},
{
"epoch": 0.9663585247307729,
"grad_norm": 1.0066043138504028,
"learning_rate": 6.31662257816279e-07,
"loss": 0.4901,
"step": 30420
},
{
"epoch": 0.966993868928492,
"grad_norm": 0.9339585900306702,
"learning_rate": 6.087803816664628e-07,
"loss": 0.4697,
"step": 30440
},
{
"epoch": 0.9676292131262111,
"grad_norm": 0.8802968859672546,
"learning_rate": 5.863193679877088e-07,
"loss": 0.4943,
"step": 30460
},
{
"epoch": 0.9682645573239302,
"grad_norm": 0.7557999491691589,
"learning_rate": 5.6427931188896e-07,
"loss": 0.4761,
"step": 30480
},
{
"epoch": 0.9688999015216494,
"grad_norm": 0.9139352440834045,
"learning_rate": 5.426603066967295e-07,
"loss": 0.476,
"step": 30500
},
{
"epoch": 0.9695352457193684,
"grad_norm": 0.9125082492828369,
"learning_rate": 5.21462443954579e-07,
"loss": 0.4792,
"step": 30520
},
{
"epoch": 0.9701705899170876,
"grad_norm": 0.9351817965507507,
"learning_rate": 5.006858134228076e-07,
"loss": 0.4976,
"step": 30540
},
{
"epoch": 0.9708059341148066,
"grad_norm": 0.743870735168457,
"learning_rate": 4.803305030780302e-07,
"loss": 0.4695,
"step": 30560
},
{
"epoch": 0.9714412783125258,
"grad_norm": 0.9468183517456055,
"learning_rate": 4.603965991128445e-07,
"loss": 0.5027,
"step": 30580
},
{
"epoch": 0.972076622510245,
"grad_norm": 1.1194064617156982,
"learning_rate": 4.408841859354307e-07,
"loss": 0.5146,
"step": 30600
},
{
"epoch": 0.972711966707964,
"grad_norm": 0.7916650176048279,
"learning_rate": 4.21793346169197e-07,
"loss": 0.4689,
"step": 30620
},
{
"epoch": 0.9733473109056832,
"grad_norm": 0.9158383011817932,
"learning_rate": 4.0312416065245717e-07,
"loss": 0.5272,
"step": 30640
},
{
"epoch": 0.9739826551034023,
"grad_norm": 0.8861019015312195,
"learning_rate": 3.8487670843807555e-07,
"loss": 0.4981,
"step": 30660
},
{
"epoch": 0.9746179993011214,
"grad_norm": 1.01827871799469,
"learning_rate": 3.670510667931004e-07,
"loss": 0.5386,
"step": 30680
},
{
"epoch": 0.9752533434988405,
"grad_norm": 0.9622276425361633,
"learning_rate": 3.496473111984866e-07,
"loss": 0.5135,
"step": 30700
},
{
"epoch": 0.9758886876965596,
"grad_norm": 1.0768787860870361,
"learning_rate": 3.326655153487512e-07,
"loss": 0.4943,
"step": 30720
},
{
"epoch": 0.9765240318942787,
"grad_norm": 1.2705291509628296,
"learning_rate": 3.16105751151663e-07,
"loss": 0.4924,
"step": 30740
},
{
"epoch": 0.9771593760919979,
"grad_norm": 0.9354774951934814,
"learning_rate": 2.99968088727931e-07,
"loss": 0.4811,
"step": 30760
},
{
"epoch": 0.9777947202897169,
"grad_norm": 0.8442774415016174,
"learning_rate": 2.842525964109166e-07,
"loss": 0.4652,
"step": 30780
},
{
"epoch": 0.9784300644874361,
"grad_norm": 0.9658933281898499,
"learning_rate": 2.6895934074635533e-07,
"loss": 0.4767,
"step": 30800
},
{
"epoch": 0.9790654086851552,
"grad_norm": 0.9930063486099243,
"learning_rate": 2.5408838649204625e-07,
"loss": 0.4791,
"step": 30820
},
{
"epoch": 0.9797007528828743,
"grad_norm": 0.9439179301261902,
"learning_rate": 2.396397966176078e-07,
"loss": 0.4833,
"step": 30840
},
{
"epoch": 0.9803360970805934,
"grad_norm": 0.8499469757080078,
"learning_rate": 2.25613632304178e-07,
"loss": 0.4969,
"step": 30860
},
{
"epoch": 0.9809714412783125,
"grad_norm": 1.0228259563446045,
"learning_rate": 2.1200995294420323e-07,
"loss": 0.4709,
"step": 30880
},
{
"epoch": 0.9816067854760316,
"grad_norm": 1.1045747995376587,
"learning_rate": 1.988288161411389e-07,
"loss": 0.4964,
"step": 30900
},
{
"epoch": 0.9822421296737508,
"grad_norm": 0.8404049277305603,
"learning_rate": 1.8607027770921602e-07,
"loss": 0.5289,
"step": 30920
},
{
"epoch": 0.9828774738714698,
"grad_norm": 0.8583685755729675,
"learning_rate": 1.7373439167325257e-07,
"loss": 0.4824,
"step": 30940
},
{
"epoch": 0.983512818069189,
"grad_norm": 0.8340322375297546,
"learning_rate": 1.6240682931759622e-07,
"loss": 0.5276,
"step": 30960
},
{
"epoch": 0.984148162266908,
"grad_norm": 0.717254638671875,
"learning_rate": 1.508952640646988e-07,
"loss": 0.4837,
"step": 30980
},
{
"epoch": 0.9847835064646272,
"grad_norm": 0.7109520435333252,
"learning_rate": 1.3980650015292806e-07,
"loss": 0.4805,
"step": 31000
},
{
"epoch": 0.9847835064646272,
"eval_loss": 0.4438159465789795,
"eval_runtime": 44.826,
"eval_samples_per_second": 60.3,
"eval_steps_per_second": 30.161,
"step": 31000
},
{
"epoch": 0.9854188506623464,
"grad_norm": 0.8632842302322388,
"learning_rate": 1.2914058453658008e-07,
"loss": 0.4787,
"step": 31020
},
{
"epoch": 0.9860541948600654,
"grad_norm": 0.9302808046340942,
"learning_rate": 1.1889756237943861e-07,
"loss": 0.4733,
"step": 31040
},
{
"epoch": 0.9866895390577846,
"grad_norm": 1.0309478044509888,
"learning_rate": 1.090774770545755e-07,
"loss": 0.498,
"step": 31060
},
{
"epoch": 0.9873248832555037,
"grad_norm": 0.7432119250297546,
"learning_rate": 9.968037014420616e-08,
"loss": 0.4909,
"step": 31080
},
{
"epoch": 0.9879602274532228,
"grad_norm": 1.0406357049942017,
"learning_rate": 9.070628143946768e-08,
"loss": 0.4913,
"step": 31100
},
{
"epoch": 0.9885955716509419,
"grad_norm": 0.8807629346847534,
"learning_rate": 8.215524894024107e-08,
"loss": 0.4843,
"step": 31120
},
{
"epoch": 0.989230915848661,
"grad_norm": 0.815077006816864,
"learning_rate": 7.402730885507359e-08,
"loss": 0.4877,
"step": 31140
},
{
"epoch": 0.9898662600463801,
"grad_norm": 0.8051480054855347,
"learning_rate": 6.632249560092341e-08,
"loss": 0.489,
"step": 31160
},
{
"epoch": 0.9905016042440993,
"grad_norm": 0.8251180648803711,
"learning_rate": 5.9040841803081895e-08,
"loss": 0.4763,
"step": 31180
},
{
"epoch": 0.9911369484418183,
"grad_norm": 0.8782890439033508,
"learning_rate": 5.218237829499595e-08,
"loss": 0.5012,
"step": 31200
},
{
"epoch": 0.9917722926395375,
"grad_norm": 0.9451269507408142,
"learning_rate": 4.574713411816811e-08,
"loss": 0.4765,
"step": 31220
},
{
"epoch": 0.9924076368372566,
"grad_norm": 1.2340540885925293,
"learning_rate": 3.973513652202332e-08,
"loss": 0.4999,
"step": 31240
},
{
"epoch": 0.9930429810349757,
"grad_norm": 1.0101948976516724,
"learning_rate": 3.414641096376459e-08,
"loss": 0.5118,
"step": 31260
},
{
"epoch": 0.9936783252326948,
"grad_norm": 0.7806993722915649,
"learning_rate": 2.8980981108317485e-08,
"loss": 0.5068,
"step": 31280
},
{
"epoch": 0.9943136694304139,
"grad_norm": 1.1223636865615845,
"learning_rate": 2.4238868828196927e-08,
"loss": 0.5182,
"step": 31300
},
{
"epoch": 0.994949013628133,
"grad_norm": 0.8514977693557739,
"learning_rate": 1.9920094203418336e-08,
"loss": 0.5072,
"step": 31320
},
{
"epoch": 0.9955843578258522,
"grad_norm": 1.1318073272705078,
"learning_rate": 1.6024675521397747e-08,
"loss": 0.4819,
"step": 31340
},
{
"epoch": 0.9962197020235712,
"grad_norm": 0.9314286708831787,
"learning_rate": 1.2552629276929573e-08,
"loss": 0.4957,
"step": 31360
},
{
"epoch": 0.9968550462212904,
"grad_norm": 0.7769533395767212,
"learning_rate": 9.503970172031196e-09,
"loss": 0.5149,
"step": 31380
},
{
"epoch": 0.9974903904190096,
"grad_norm": 0.7601432800292969,
"learning_rate": 6.878711115976266e-09,
"loss": 0.4933,
"step": 31400
},
{
"epoch": 0.9981257346167286,
"grad_norm": 0.987147331237793,
"learning_rate": 4.6768632251614765e-09,
"loss": 0.4693,
"step": 31420
},
{
"epoch": 0.9987610788144478,
"grad_norm": 0.8807405829429626,
"learning_rate": 2.8984358230954577e-09,
"loss": 0.474,
"step": 31440
},
{
"epoch": 0.9993964230121668,
"grad_norm": 0.7518433332443237,
"learning_rate": 1.5434364403543733e-09,
"loss": 0.5076,
"step": 31460
},
{
"epoch": 1.0,
"step": 31479,
"total_flos": 0.0,
"train_loss": 0.3508217529017671,
"train_runtime": 14676.7422,
"train_samples_per_second": 68.633,
"train_steps_per_second": 2.145
}
],
"logging_steps": 20,
"max_steps": 31479,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}