{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.998285910181694,
"eval_steps": 1000,
"global_step": 7290,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006856359273225917,
"grad_norm": 10.705459594726562,
"learning_rate": 2.7434842249657065e-06,
"loss": 4.058,
"step": 10
},
{
"epoch": 0.013712718546451834,
"grad_norm": 5.851310729980469,
"learning_rate": 5.486968449931413e-06,
"loss": 4.049,
"step": 20
},
{
"epoch": 0.02056907781967775,
"grad_norm": 10.28573226928711,
"learning_rate": 8.23045267489712e-06,
"loss": 3.9324,
"step": 30
},
{
"epoch": 0.027425437092903668,
"grad_norm": 6.631351947784424,
"learning_rate": 1.0973936899862826e-05,
"loss": 3.3949,
"step": 40
},
{
"epoch": 0.03428179636612959,
"grad_norm": 7.74021053314209,
"learning_rate": 1.3717421124828534e-05,
"loss": 2.5865,
"step": 50
},
{
"epoch": 0.0411381556393555,
"grad_norm": 4.802703380584717,
"learning_rate": 1.646090534979424e-05,
"loss": 1.9904,
"step": 60
},
{
"epoch": 0.04799451491258142,
"grad_norm": 3.307770252227783,
"learning_rate": 1.9204389574759944e-05,
"loss": 1.8117,
"step": 70
},
{
"epoch": 0.054850874185807336,
"grad_norm": 2.1599233150482178,
"learning_rate": 2.1947873799725652e-05,
"loss": 1.5714,
"step": 80
},
{
"epoch": 0.061707233459033256,
"grad_norm": 2.8519175052642822,
"learning_rate": 2.4691358024691357e-05,
"loss": 1.6103,
"step": 90
},
{
"epoch": 0.06856359273225918,
"grad_norm": 5.519439220428467,
"learning_rate": 2.7434842249657068e-05,
"loss": 1.5985,
"step": 100
},
{
"epoch": 0.07541995200548508,
"grad_norm": 2.3000030517578125,
"learning_rate": 3.017832647462277e-05,
"loss": 1.6145,
"step": 110
},
{
"epoch": 0.082276311278711,
"grad_norm": 2.166654586791992,
"learning_rate": 3.292181069958848e-05,
"loss": 1.3969,
"step": 120
},
{
"epoch": 0.08913267055193692,
"grad_norm": 2.6651406288146973,
"learning_rate": 3.566529492455419e-05,
"loss": 1.5422,
"step": 130
},
{
"epoch": 0.09598902982516284,
"grad_norm": 3.262629747390747,
"learning_rate": 3.840877914951989e-05,
"loss": 1.5311,
"step": 140
},
{
"epoch": 0.10284538909838875,
"grad_norm": 2.1092171669006348,
"learning_rate": 4.11522633744856e-05,
"loss": 1.3981,
"step": 150
},
{
"epoch": 0.10970174837161467,
"grad_norm": 2.9152212142944336,
"learning_rate": 4.3895747599451304e-05,
"loss": 1.5888,
"step": 160
},
{
"epoch": 0.11655810764484059,
"grad_norm": 2.3841612339019775,
"learning_rate": 4.6639231824417016e-05,
"loss": 1.4813,
"step": 170
},
{
"epoch": 0.12341446691806651,
"grad_norm": 2.5338547229766846,
"learning_rate": 4.938271604938271e-05,
"loss": 1.5353,
"step": 180
},
{
"epoch": 0.13027082619129243,
"grad_norm": 3.3338496685028076,
"learning_rate": 5.2126200274348424e-05,
"loss": 1.4861,
"step": 190
},
{
"epoch": 0.13712718546451835,
"grad_norm": 2.825850248336792,
"learning_rate": 5.4869684499314136e-05,
"loss": 1.2727,
"step": 200
},
{
"epoch": 0.14398354473774425,
"grad_norm": 2.7089977264404297,
"learning_rate": 5.761316872427984e-05,
"loss": 1.4897,
"step": 210
},
{
"epoch": 0.15083990401097017,
"grad_norm": 2.1353025436401367,
"learning_rate": 6.035665294924554e-05,
"loss": 1.5895,
"step": 220
},
{
"epoch": 0.1576962632841961,
"grad_norm": 2.382019519805908,
"learning_rate": 6.310013717421126e-05,
"loss": 1.4928,
"step": 230
},
{
"epoch": 0.164552622557422,
"grad_norm": 2.485421895980835,
"learning_rate": 6.584362139917696e-05,
"loss": 1.5546,
"step": 240
},
{
"epoch": 0.17140898183064793,
"grad_norm": 2.1953341960906982,
"learning_rate": 6.858710562414266e-05,
"loss": 1.4226,
"step": 250
},
{
"epoch": 0.17826534110387385,
"grad_norm": 2.0357184410095215,
"learning_rate": 7.133058984910838e-05,
"loss": 1.3597,
"step": 260
},
{
"epoch": 0.18512170037709977,
"grad_norm": 3.600464344024658,
"learning_rate": 7.407407407407407e-05,
"loss": 1.4977,
"step": 270
},
{
"epoch": 0.1919780596503257,
"grad_norm": 2.2848992347717285,
"learning_rate": 7.681755829903978e-05,
"loss": 1.4477,
"step": 280
},
{
"epoch": 0.19883441892355158,
"grad_norm": 2.5611186027526855,
"learning_rate": 7.95610425240055e-05,
"loss": 1.4536,
"step": 290
},
{
"epoch": 0.2056907781967775,
"grad_norm": 2.1254303455352783,
"learning_rate": 8.23045267489712e-05,
"loss": 1.4916,
"step": 300
},
{
"epoch": 0.21254713747000342,
"grad_norm": 3.0583224296569824,
"learning_rate": 8.50480109739369e-05,
"loss": 1.6612,
"step": 310
},
{
"epoch": 0.21940349674322934,
"grad_norm": 1.6011234521865845,
"learning_rate": 8.779149519890261e-05,
"loss": 1.489,
"step": 320
},
{
"epoch": 0.22625985601645526,
"grad_norm": 2.642266273498535,
"learning_rate": 9.053497942386831e-05,
"loss": 1.3562,
"step": 330
},
{
"epoch": 0.23311621528968118,
"grad_norm": 3.995382785797119,
"learning_rate": 9.327846364883403e-05,
"loss": 1.4473,
"step": 340
},
{
"epoch": 0.2399725745629071,
"grad_norm": 2.1678707599639893,
"learning_rate": 9.602194787379974e-05,
"loss": 1.5002,
"step": 350
},
{
"epoch": 0.24682893383613302,
"grad_norm": 3.0694093704223633,
"learning_rate": 9.876543209876543e-05,
"loss": 1.3842,
"step": 360
},
{
"epoch": 0.25368529310935894,
"grad_norm": 2.040300130844116,
"learning_rate": 0.00010150891632373114,
"loss": 1.3354,
"step": 370
},
{
"epoch": 0.26054165238258487,
"grad_norm": 2.9175169467926025,
"learning_rate": 0.00010425240054869685,
"loss": 1.4611,
"step": 380
},
{
"epoch": 0.2673980116558108,
"grad_norm": 2.048736095428467,
"learning_rate": 0.00010699588477366255,
"loss": 1.4053,
"step": 390
},
{
"epoch": 0.2742543709290367,
"grad_norm": 2.210230827331543,
"learning_rate": 0.00010973936899862827,
"loss": 1.4418,
"step": 400
},
{
"epoch": 0.28111073020226257,
"grad_norm": 2.244452714920044,
"learning_rate": 0.00011248285322359398,
"loss": 1.4981,
"step": 410
},
{
"epoch": 0.2879670894754885,
"grad_norm": 2.487210273742676,
"learning_rate": 0.00011522633744855968,
"loss": 1.488,
"step": 420
},
{
"epoch": 0.2948234487487144,
"grad_norm": 2.0524230003356934,
"learning_rate": 0.0001179698216735254,
"loss": 1.4526,
"step": 430
},
{
"epoch": 0.30167980802194033,
"grad_norm": 2.3150599002838135,
"learning_rate": 0.00012071330589849108,
"loss": 1.2799,
"step": 440
},
{
"epoch": 0.30853616729516625,
"grad_norm": 6.619499206542969,
"learning_rate": 0.0001234567901234568,
"loss": 1.4248,
"step": 450
},
{
"epoch": 0.3153925265683922,
"grad_norm": 2.391008138656616,
"learning_rate": 0.0001262002743484225,
"loss": 1.4802,
"step": 460
},
{
"epoch": 0.3222488858416181,
"grad_norm": 1.9163336753845215,
"learning_rate": 0.0001289437585733882,
"loss": 1.343,
"step": 470
},
{
"epoch": 0.329105245114844,
"grad_norm": 2.3108503818511963,
"learning_rate": 0.00013168724279835392,
"loss": 1.4222,
"step": 480
},
{
"epoch": 0.33596160438806993,
"grad_norm": 2.137388229370117,
"learning_rate": 0.00013443072702331964,
"loss": 1.3412,
"step": 490
},
{
"epoch": 0.34281796366129585,
"grad_norm": 1.5437091588974,
"learning_rate": 0.00013717421124828533,
"loss": 1.5056,
"step": 500
},
{
"epoch": 0.3496743229345218,
"grad_norm": 2.1628546714782715,
"learning_rate": 0.00013991769547325105,
"loss": 1.3907,
"step": 510
},
{
"epoch": 0.3565306822077477,
"grad_norm": 2.018361806869507,
"learning_rate": 0.00014266117969821676,
"loss": 1.4301,
"step": 520
},
{
"epoch": 0.3633870414809736,
"grad_norm": 1.873982310295105,
"learning_rate": 0.00014540466392318243,
"loss": 1.3484,
"step": 530
},
{
"epoch": 0.37024340075419954,
"grad_norm": 2.2962214946746826,
"learning_rate": 0.00014814814814814815,
"loss": 1.2669,
"step": 540
},
{
"epoch": 0.37709976002742546,
"grad_norm": 1.6865073442459106,
"learning_rate": 0.00015089163237311386,
"loss": 1.4261,
"step": 550
},
{
"epoch": 0.3839561193006514,
"grad_norm": 2.0754506587982178,
"learning_rate": 0.00015363511659807956,
"loss": 1.2982,
"step": 560
},
{
"epoch": 0.3908124785738773,
"grad_norm": 1.6448793411254883,
"learning_rate": 0.00015637860082304527,
"loss": 1.3753,
"step": 570
},
{
"epoch": 0.39766883784710316,
"grad_norm": 1.5936967134475708,
"learning_rate": 0.000159122085048011,
"loss": 1.4155,
"step": 580
},
{
"epoch": 0.4045251971203291,
"grad_norm": 2.3004658222198486,
"learning_rate": 0.00016186556927297668,
"loss": 1.3321,
"step": 590
},
{
"epoch": 0.411381556393555,
"grad_norm": 2.23530912399292,
"learning_rate": 0.0001646090534979424,
"loss": 1.385,
"step": 600
},
{
"epoch": 0.4182379156667809,
"grad_norm": 2.0479331016540527,
"learning_rate": 0.00016735253772290812,
"loss": 1.4551,
"step": 610
},
{
"epoch": 0.42509427494000684,
"grad_norm": 2.0379133224487305,
"learning_rate": 0.0001700960219478738,
"loss": 1.338,
"step": 620
},
{
"epoch": 0.43195063421323276,
"grad_norm": 2.908133029937744,
"learning_rate": 0.0001728395061728395,
"loss": 1.419,
"step": 630
},
{
"epoch": 0.4388069934864587,
"grad_norm": 2.721883773803711,
"learning_rate": 0.00017558299039780522,
"loss": 1.3527,
"step": 640
},
{
"epoch": 0.4456633527596846,
"grad_norm": 2.2164113521575928,
"learning_rate": 0.00017832647462277094,
"loss": 1.4272,
"step": 650
},
{
"epoch": 0.4525197120329105,
"grad_norm": 1.7344247102737427,
"learning_rate": 0.00018106995884773663,
"loss": 1.3838,
"step": 660
},
{
"epoch": 0.45937607130613645,
"grad_norm": 1.705946922302246,
"learning_rate": 0.00018381344307270234,
"loss": 1.2844,
"step": 670
},
{
"epoch": 0.46623243057936237,
"grad_norm": 1.6594369411468506,
"learning_rate": 0.00018655692729766806,
"loss": 1.5451,
"step": 680
},
{
"epoch": 0.4730887898525883,
"grad_norm": 1.326300859451294,
"learning_rate": 0.00018930041152263375,
"loss": 1.3065,
"step": 690
},
{
"epoch": 0.4799451491258142,
"grad_norm": 2.301481008529663,
"learning_rate": 0.00019204389574759947,
"loss": 1.3551,
"step": 700
},
{
"epoch": 0.48680150839904013,
"grad_norm": 1.6948672533035278,
"learning_rate": 0.0001947873799725652,
"loss": 1.3867,
"step": 710
},
{
"epoch": 0.49365786767226605,
"grad_norm": 2.5889105796813965,
"learning_rate": 0.00019753086419753085,
"loss": 1.4339,
"step": 720
},
{
"epoch": 0.500514226945492,
"grad_norm": 2.4985668659210205,
"learning_rate": 0.0001999999885361719,
"loss": 1.3048,
"step": 730
},
{
"epoch": 0.5073705862187179,
"grad_norm": 1.8193042278289795,
"learning_rate": 0.00019999861287997797,
"loss": 1.2499,
"step": 740
},
{
"epoch": 0.5142269454919438,
"grad_norm": 3.7840700149536133,
"learning_rate": 0.00019999494449430045,
"loss": 1.3802,
"step": 750
},
{
"epoch": 0.5210833047651697,
"grad_norm": 1.7764334678649902,
"learning_rate": 0.00019998898346324667,
"loss": 1.482,
"step": 760
},
{
"epoch": 0.5279396640383957,
"grad_norm": 3.376749038696289,
"learning_rate": 0.00019998072992348886,
"loss": 1.4751,
"step": 770
},
{
"epoch": 0.5347960233116216,
"grad_norm": 1.9762446880340576,
"learning_rate": 0.00019997018406426093,
"loss": 1.434,
"step": 780
},
{
"epoch": 0.5416523825848475,
"grad_norm": 2.363563060760498,
"learning_rate": 0.00019995734612735427,
"loss": 1.3767,
"step": 790
},
{
"epoch": 0.5485087418580734,
"grad_norm": 2.0239980220794678,
"learning_rate": 0.00019994221640711222,
"loss": 1.3242,
"step": 800
},
{
"epoch": 0.5553651011312993,
"grad_norm": 4.290923118591309,
"learning_rate": 0.00019992479525042303,
"loss": 1.3456,
"step": 810
},
{
"epoch": 0.5622214604045251,
"grad_norm": 1.7077864408493042,
"learning_rate": 0.00019990508305671228,
"loss": 1.4179,
"step": 820
},
{
"epoch": 0.5690778196777511,
"grad_norm": 2.9807496070861816,
"learning_rate": 0.0001998830802779335,
"loss": 1.4003,
"step": 830
},
{
"epoch": 0.575934178950977,
"grad_norm": 1.4405466318130493,
"learning_rate": 0.00019985878741855793,
"loss": 1.3682,
"step": 840
},
{
"epoch": 0.5827905382242029,
"grad_norm": 2.661698341369629,
"learning_rate": 0.00019983220503556282,
"loss": 1.5405,
"step": 850
},
{
"epoch": 0.5896468974974288,
"grad_norm": 1.7290005683898926,
"learning_rate": 0.00019980333373841873,
"loss": 1.3195,
"step": 860
},
{
"epoch": 0.5965032567706547,
"grad_norm": 1.6726288795471191,
"learning_rate": 0.00019977217418907562,
"loss": 1.4031,
"step": 870
},
{
"epoch": 0.6033596160438807,
"grad_norm": 1.7914421558380127,
"learning_rate": 0.00019973872710194756,
"loss": 1.5047,
"step": 880
},
{
"epoch": 0.6102159753171066,
"grad_norm": 2.1812944412231445,
"learning_rate": 0.00019970299324389642,
"loss": 1.4172,
"step": 890
},
{
"epoch": 0.6170723345903325,
"grad_norm": 1.884153962135315,
"learning_rate": 0.0001996649734342143,
"loss": 1.3291,
"step": 900
},
{
"epoch": 0.6239286938635584,
"grad_norm": 1.7976921796798706,
"learning_rate": 0.00019962466854460458,
"loss": 1.4267,
"step": 910
},
{
"epoch": 0.6307850531367843,
"grad_norm": 1.628185510635376,
"learning_rate": 0.00019958207949916223,
"loss": 1.4677,
"step": 920
},
{
"epoch": 0.6376414124100103,
"grad_norm": 1.7098406553268433,
"learning_rate": 0.00019953720727435242,
"loss": 1.4233,
"step": 930
},
{
"epoch": 0.6444977716832362,
"grad_norm": 2.8682682514190674,
"learning_rate": 0.0001994900528989881,
"loss": 1.2757,
"step": 940
},
{
"epoch": 0.6513541309564621,
"grad_norm": 2.8800859451293945,
"learning_rate": 0.00019944061745420655,
"loss": 1.3997,
"step": 950
},
{
"epoch": 0.658210490229688,
"grad_norm": 1.1743911504745483,
"learning_rate": 0.00019938890207344453,
"loss": 1.4948,
"step": 960
},
{
"epoch": 0.665066849502914,
"grad_norm": 3.9781527519226074,
"learning_rate": 0.00019933490794241224,
"loss": 1.349,
"step": 970
},
{
"epoch": 0.6719232087761399,
"grad_norm": 1.9682557582855225,
"learning_rate": 0.00019927863629906622,
"loss": 1.4381,
"step": 980
},
{
"epoch": 0.6787795680493658,
"grad_norm": 2.0713021755218506,
"learning_rate": 0.00019922008843358094,
"loss": 1.3814,
"step": 990
},
{
"epoch": 0.6856359273225917,
"grad_norm": 2.3139188289642334,
"learning_rate": 0.0001991592656883192,
"loss": 1.3592,
"step": 1000
},
{
"epoch": 0.6856359273225917,
"eval_loss": 1.1938791275024414,
"eval_runtime": 29.6937,
"eval_samples_per_second": 82.745,
"eval_steps_per_second": 10.373,
"step": 1000
},
{
"epoch": 0.6924922865958176,
"grad_norm": 1.9212145805358887,
"learning_rate": 0.00019909616945780134,
"loss": 1.4605,
"step": 1010
},
{
"epoch": 0.6993486458690436,
"grad_norm": 1.236790657043457,
"learning_rate": 0.0001990308011886733,
"loss": 1.2371,
"step": 1020
},
{
"epoch": 0.7062050051422695,
"grad_norm": 4.3965373039245605,
"learning_rate": 0.00019896316237967343,
"loss": 1.5101,
"step": 1030
},
{
"epoch": 0.7130613644154954,
"grad_norm": 1.8042622804641724,
"learning_rate": 0.0001988932545815982,
"loss": 1.2647,
"step": 1040
},
{
"epoch": 0.7199177236887213,
"grad_norm": 2.162903070449829,
"learning_rate": 0.00019882107939726655,
"loss": 1.4241,
"step": 1050
},
{
"epoch": 0.7267740829619472,
"grad_norm": 1.7536650896072388,
"learning_rate": 0.00019874663848148312,
"loss": 1.3215,
"step": 1060
},
{
"epoch": 0.7336304422351732,
"grad_norm": 1.6323691606521606,
"learning_rate": 0.00019866993354100042,
"loss": 1.3117,
"step": 1070
},
{
"epoch": 0.7404868015083991,
"grad_norm": 2.1569364070892334,
"learning_rate": 0.00019859096633447965,
"loss": 1.4203,
"step": 1080
},
{
"epoch": 0.747343160781625,
"grad_norm": 2.549560546875,
"learning_rate": 0.00019850973867245036,
"loss": 1.3122,
"step": 1090
},
{
"epoch": 0.7541995200548509,
"grad_norm": 2.85105562210083,
"learning_rate": 0.00019842625241726892,
"loss": 1.3834,
"step": 1100
},
{
"epoch": 0.7610558793280768,
"grad_norm": 2.235344648361206,
"learning_rate": 0.00019834050948307582,
"loss": 1.3419,
"step": 1110
},
{
"epoch": 0.7679122386013028,
"grad_norm": 2.0601747035980225,
"learning_rate": 0.00019825251183575195,
"loss": 1.4033,
"step": 1120
},
{
"epoch": 0.7747685978745287,
"grad_norm": 1.8832818269729614,
"learning_rate": 0.00019816226149287324,
"loss": 1.442,
"step": 1130
},
{
"epoch": 0.7816249571477546,
"grad_norm": 1.8006055355072021,
"learning_rate": 0.00019806976052366465,
"loss": 1.3696,
"step": 1140
},
{
"epoch": 0.7884813164209804,
"grad_norm": 2.5300374031066895,
"learning_rate": 0.00019797501104895258,
"loss": 1.3844,
"step": 1150
},
{
"epoch": 0.7953376756942063,
"grad_norm": 2.987879991531372,
"learning_rate": 0.00019787801524111628,
"loss": 1.2814,
"step": 1160
},
{
"epoch": 0.8021940349674322,
"grad_norm": 1.7485980987548828,
"learning_rate": 0.00019777877532403814,
"loss": 1.3488,
"step": 1170
},
{
"epoch": 0.8090503942406582,
"grad_norm": 1.764123558998108,
"learning_rate": 0.0001976772935730525,
"loss": 1.4282,
"step": 1180
},
{
"epoch": 0.8159067535138841,
"grad_norm": 2.458010673522949,
"learning_rate": 0.00019757357231489365,
"loss": 1.4672,
"step": 1190
},
{
"epoch": 0.82276311278711,
"grad_norm": 3.0731263160705566,
"learning_rate": 0.00019746761392764253,
"loss": 1.4038,
"step": 1200
},
{
"epoch": 0.8296194720603359,
"grad_norm": 1.8123950958251953,
"learning_rate": 0.00019735942084067197,
"loss": 1.2516,
"step": 1210
},
{
"epoch": 0.8364758313335618,
"grad_norm": 1.8038643598556519,
"learning_rate": 0.00019724899553459117,
"loss": 1.2599,
"step": 1220
},
{
"epoch": 0.8433321906067878,
"grad_norm": 1.6645665168762207,
"learning_rate": 0.0001971363405411888,
"loss": 1.4674,
"step": 1230
},
{
"epoch": 0.8501885498800137,
"grad_norm": 1.7630431652069092,
"learning_rate": 0.00019702145844337497,
"loss": 1.3191,
"step": 1240
},
{
"epoch": 0.8570449091532396,
"grad_norm": 2.408414840698242,
"learning_rate": 0.00019690435187512192,
"loss": 1.4237,
"step": 1250
},
{
"epoch": 0.8639012684264655,
"grad_norm": 2.5194432735443115,
"learning_rate": 0.00019678502352140368,
"loss": 1.2969,
"step": 1260
},
{
"epoch": 0.8707576276996914,
"grad_norm": 2.1748664379119873,
"learning_rate": 0.0001966634761181346,
"loss": 1.2632,
"step": 1270
},
{
"epoch": 0.8776139869729174,
"grad_norm": 1.8696935176849365,
"learning_rate": 0.0001965397124521065,
"loss": 1.3904,
"step": 1280
},
{
"epoch": 0.8844703462461433,
"grad_norm": 1.6131293773651123,
"learning_rate": 0.00019641373536092473,
"loss": 1.3648,
"step": 1290
},
{
"epoch": 0.8913267055193692,
"grad_norm": 2.2704977989196777,
"learning_rate": 0.00019628554773294335,
"loss": 1.4555,
"step": 1300
},
{
"epoch": 0.8981830647925951,
"grad_norm": 2.2356860637664795,
"learning_rate": 0.0001961551525071986,
"loss": 1.3464,
"step": 1310
},
{
"epoch": 0.905039424065821,
"grad_norm": 2.372926712036133,
"learning_rate": 0.00019602255267334179,
"loss": 1.2966,
"step": 1320
},
{
"epoch": 0.911895783339047,
"grad_norm": 1.4219205379486084,
"learning_rate": 0.00019588775127157054,
"loss": 1.4147,
"step": 1330
},
{
"epoch": 0.9187521426122729,
"grad_norm": 1.885373830795288,
"learning_rate": 0.00019575075139255922,
"loss": 1.3167,
"step": 1340
},
{
"epoch": 0.9256085018854988,
"grad_norm": 3.269922971725464,
"learning_rate": 0.00019561155617738797,
"loss": 1.3325,
"step": 1350
},
{
"epoch": 0.9324648611587247,
"grad_norm": 1.6605535745620728,
"learning_rate": 0.00019547016881747088,
"loss": 1.2398,
"step": 1360
},
{
"epoch": 0.9393212204319507,
"grad_norm": 1.9962818622589111,
"learning_rate": 0.00019532659255448257,
"loss": 1.3011,
"step": 1370
},
{
"epoch": 0.9461775797051766,
"grad_norm": 1.9618587493896484,
"learning_rate": 0.00019518083068028398,
"loss": 1.3274,
"step": 1380
},
{
"epoch": 0.9530339389784025,
"grad_norm": 2.1130402088165283,
"learning_rate": 0.000195032886536847,
"loss": 1.3072,
"step": 1390
},
{
"epoch": 0.9598902982516284,
"grad_norm": 2.2084875106811523,
"learning_rate": 0.00019488276351617762,
"loss": 1.336,
"step": 1400
},
{
"epoch": 0.9667466575248543,
"grad_norm": 1.6304380893707275,
"learning_rate": 0.00019473046506023837,
"loss": 1.313,
"step": 1410
},
{
"epoch": 0.9736030167980803,
"grad_norm": 1.511129379272461,
"learning_rate": 0.00019457599466086927,
"loss": 1.369,
"step": 1420
},
{
"epoch": 0.9804593760713062,
"grad_norm": 1.7005033493041992,
"learning_rate": 0.00019441935585970784,
"loss": 1.4165,
"step": 1430
},
{
"epoch": 0.9873157353445321,
"grad_norm": 2.2424979209899902,
"learning_rate": 0.0001942605522481079,
"loss": 1.4495,
"step": 1440
},
{
"epoch": 0.994172094617758,
"grad_norm": 1.764769434928894,
"learning_rate": 0.0001940995874670571,
"loss": 1.3326,
"step": 1450
},
{
"epoch": 1.001028453890984,
"grad_norm": 1.098801851272583,
"learning_rate": 0.0001939364652070937,
"loss": 1.2684,
"step": 1460
},
{
"epoch": 1.0078848131642097,
"grad_norm": 1.260248064994812,
"learning_rate": 0.00019377118920822176,
"loss": 1.1724,
"step": 1470
},
{
"epoch": 1.0147411724374358,
"grad_norm": 2.705153465270996,
"learning_rate": 0.00019360376325982533,
"loss": 1.1531,
"step": 1480
},
{
"epoch": 1.0215975317106616,
"grad_norm": 2.205137252807617,
"learning_rate": 0.00019343419120058174,
"loss": 1.172,
"step": 1490
},
{
"epoch": 1.0284538909838876,
"grad_norm": 2.324617862701416,
"learning_rate": 0.00019326247691837356,
"loss": 1.1682,
"step": 1500
},
{
"epoch": 1.0353102502571134,
"grad_norm": 1.6484122276306152,
"learning_rate": 0.0001930886243501993,
"loss": 1.0909,
"step": 1510
},
{
"epoch": 1.0421666095303395,
"grad_norm": 2.0225424766540527,
"learning_rate": 0.00019291263748208345,
"loss": 1.2112,
"step": 1520
},
{
"epoch": 1.0490229688035653,
"grad_norm": 2.0471479892730713,
"learning_rate": 0.00019273452034898473,
"loss": 1.2315,
"step": 1530
},
{
"epoch": 1.0558793280767913,
"grad_norm": 1.5864293575286865,
"learning_rate": 0.00019255427703470377,
"loss": 1.1076,
"step": 1540
},
{
"epoch": 1.062735687350017,
"grad_norm": 1.960959792137146,
"learning_rate": 0.00019237191167178957,
"loss": 1.1633,
"step": 1550
},
{
"epoch": 1.0695920466232431,
"grad_norm": 1.6834721565246582,
"learning_rate": 0.00019218742844144456,
"loss": 1.2397,
"step": 1560
},
{
"epoch": 1.076448405896469,
"grad_norm": 2.398449182510376,
"learning_rate": 0.00019200083157342877,
"loss": 1.0861,
"step": 1570
},
{
"epoch": 1.083304765169695,
"grad_norm": 1.7399917840957642,
"learning_rate": 0.0001918121253459631,
"loss": 1.1532,
"step": 1580
},
{
"epoch": 1.0901611244429208,
"grad_norm": 1.8287321329116821,
"learning_rate": 0.0001916213140856307,
"loss": 1.1941,
"step": 1590
},
{
"epoch": 1.0970174837161468,
"grad_norm": 1.559448003768921,
"learning_rate": 0.00019142840216727835,
"loss": 1.1891,
"step": 1600
},
{
"epoch": 1.1038738429893726,
"grad_norm": 1.947444200515747,
"learning_rate": 0.00019123339401391589,
"loss": 1.2102,
"step": 1610
},
{
"epoch": 1.1107302022625984,
"grad_norm": 1.3778343200683594,
"learning_rate": 0.0001910362940966147,
"loss": 1.0902,
"step": 1620
},
{
"epoch": 1.1175865615358245,
"grad_norm": 1.4132441282272339,
"learning_rate": 0.00019083710693440536,
"loss": 1.1211,
"step": 1630
},
{
"epoch": 1.1244429208090505,
"grad_norm": 1.4806476831436157,
"learning_rate": 0.00019063583709417407,
"loss": 1.2051,
"step": 1640
},
{
"epoch": 1.1312992800822763,
"grad_norm": 1.5196651220321655,
"learning_rate": 0.00019043248919055778,
"loss": 1.1761,
"step": 1650
},
{
"epoch": 1.1381556393555021,
"grad_norm": 1.4891173839569092,
"learning_rate": 0.00019022706788583853,
"loss": 1.1754,
"step": 1660
},
{
"epoch": 1.1450119986287282,
"grad_norm": 3.816194772720337,
"learning_rate": 0.00019001957788983645,
"loss": 1.1501,
"step": 1670
},
{
"epoch": 1.151868357901954,
"grad_norm": 2.0599048137664795,
"learning_rate": 0.00018981002395980184,
"loss": 1.2955,
"step": 1680
},
{
"epoch": 1.15872471717518,
"grad_norm": 2.0258097648620605,
"learning_rate": 0.00018959841090030607,
"loss": 1.2163,
"step": 1690
},
{
"epoch": 1.1655810764484058,
"grad_norm": 2.2690117359161377,
"learning_rate": 0.00018938474356313146,
"loss": 1.1374,
"step": 1700
},
{
"epoch": 1.1724374357216318,
"grad_norm": 2.238801956176758,
"learning_rate": 0.00018916902684716004,
"loss": 1.0886,
"step": 1710
},
{
"epoch": 1.1792937949948576,
"grad_norm": 2.0089728832244873,
"learning_rate": 0.00018895126569826108,
"loss": 1.068,
"step": 1720
},
{
"epoch": 1.1861501542680837,
"grad_norm": 1.5372521877288818,
"learning_rate": 0.00018873146510917796,
"loss": 1.176,
"step": 1730
},
{
"epoch": 1.1930065135413095,
"grad_norm": 1.9092096090316772,
"learning_rate": 0.0001885096301194135,
"loss": 1.1167,
"step": 1740
},
{
"epoch": 1.1998628728145355,
"grad_norm": 1.670440673828125,
"learning_rate": 0.00018828576581511442,
"loss": 1.0562,
"step": 1750
},
{
"epoch": 1.2067192320877613,
"grad_norm": 3.970067024230957,
"learning_rate": 0.00018805987732895484,
"loss": 1.1368,
"step": 1760
},
{
"epoch": 1.2135755913609874,
"grad_norm": 2.8760297298431396,
"learning_rate": 0.00018783196984001855,
"loss": 1.1358,
"step": 1770
},
{
"epoch": 1.2204319506342132,
"grad_norm": 1.9676944017410278,
"learning_rate": 0.00018760204857368025,
"loss": 1.1217,
"step": 1780
},
{
"epoch": 1.2272883099074392,
"grad_norm": 1.7384953498840332,
"learning_rate": 0.00018737011880148562,
"loss": 1.1659,
"step": 1790
},
{
"epoch": 1.234144669180665,
"grad_norm": 2.452848196029663,
"learning_rate": 0.0001871361858410308,
"loss": 1.1873,
"step": 1800
},
{
"epoch": 1.241001028453891,
"grad_norm": 1.7921110391616821,
"learning_rate": 0.00018690025505584007,
"loss": 1.2562,
"step": 1810
},
{
"epoch": 1.2478573877271169,
"grad_norm": 1.934995412826538,
"learning_rate": 0.00018666233185524316,
"loss": 1.2777,
"step": 1820
},
{
"epoch": 1.2547137470003429,
"grad_norm": 2.868145227432251,
"learning_rate": 0.00018642242169425113,
"loss": 1.1826,
"step": 1830
},
{
"epoch": 1.2615701062735687,
"grad_norm": 1.9969412088394165,
"learning_rate": 0.00018618053007343126,
"loss": 1.2032,
"step": 1840
},
{
"epoch": 1.2684264655467947,
"grad_norm": 1.877845048904419,
"learning_rate": 0.00018593666253878096,
"loss": 1.1458,
"step": 1850
},
{
"epoch": 1.2752828248200205,
"grad_norm": 1.800827145576477,
"learning_rate": 0.0001856908246816007,
"loss": 1.0928,
"step": 1860
},
{
"epoch": 1.2821391840932466,
"grad_norm": 1.6568301916122437,
"learning_rate": 0.00018544302213836566,
"loss": 1.1258,
"step": 1870
},
{
"epoch": 1.2889955433664724,
"grad_norm": 3.1525049209594727,
"learning_rate": 0.00018519326059059665,
"loss": 1.2118,
"step": 1880
},
{
"epoch": 1.2958519026396984,
"grad_norm": 4.377742767333984,
"learning_rate": 0.00018494154576472976,
"loss": 1.1747,
"step": 1890
},
{
"epoch": 1.3027082619129242,
"grad_norm": 1.685784935951233,
"learning_rate": 0.000184687883431985,
"loss": 1.2139,
"step": 1900
},
{
"epoch": 1.30956462118615,
"grad_norm": 1.660079002380371,
"learning_rate": 0.00018443227940823423,
"loss": 1.2692,
"step": 1910
},
{
"epoch": 1.316420980459376,
"grad_norm": 1.9912021160125732,
"learning_rate": 0.00018417473955386745,
"loss": 1.2104,
"step": 1920
},
{
"epoch": 1.323277339732602,
"grad_norm": 2.156569242477417,
"learning_rate": 0.00018391526977365883,
"loss": 1.126,
"step": 1930
},
{
"epoch": 1.330133699005828,
"grad_norm": 1.768411636352539,
"learning_rate": 0.000183653876016631,
"loss": 1.107,
"step": 1940
},
{
"epoch": 1.3369900582790537,
"grad_norm": 2.667548179626465,
"learning_rate": 0.00018339056427591884,
"loss": 1.1562,
"step": 1950
},
{
"epoch": 1.3438464175522797,
"grad_norm": 1.7439193725585938,
"learning_rate": 0.00018312534058863194,
"loss": 1.1052,
"step": 1960
},
{
"epoch": 1.3507027768255058,
"grad_norm": 1.8108903169631958,
"learning_rate": 0.00018285821103571645,
"loss": 1.1605,
"step": 1970
},
{
"epoch": 1.3575591360987316,
"grad_norm": 2.2900757789611816,
"learning_rate": 0.00018258918174181526,
"loss": 1.214,
"step": 1980
},
{
"epoch": 1.3644154953719574,
"grad_norm": 3.963160514831543,
"learning_rate": 0.0001823182588751279,
"loss": 1.1504,
"step": 1990
},
{
"epoch": 1.3712718546451834,
"grad_norm": 1.6582880020141602,
"learning_rate": 0.00018204544864726895,
"loss": 1.0402,
"step": 2000
},
{
"epoch": 1.3712718546451834,
"eval_loss": 1.16560697555542,
"eval_runtime": 29.7405,
"eval_samples_per_second": 82.615,
"eval_steps_per_second": 10.356,
"step": 2000
},
{
"epoch": 1.3781282139184095,
"grad_norm": 2.182267904281616,
"learning_rate": 0.00018177075731312577,
"loss": 1.1344,
"step": 2010
},
{
"epoch": 1.3849845731916353,
"grad_norm": 1.5705207586288452,
"learning_rate": 0.00018149419117071482,
"loss": 1.0929,
"step": 2020
},
{
"epoch": 1.391840932464861,
"grad_norm": 2.289984941482544,
"learning_rate": 0.0001812157565610376,
"loss": 1.098,
"step": 2030
},
{
"epoch": 1.398697291738087,
"grad_norm": 2.600581645965576,
"learning_rate": 0.00018093545986793506,
"loss": 1.1924,
"step": 2040
},
{
"epoch": 1.405553651011313,
"grad_norm": 2.1132428646087646,
"learning_rate": 0.00018065330751794125,
"loss": 1.1127,
"step": 2050
},
{
"epoch": 1.412410010284539,
"grad_norm": 2.1575536727905273,
"learning_rate": 0.00018036930598013605,
"loss": 1.2272,
"step": 2060
},
{
"epoch": 1.4192663695577648,
"grad_norm": 1.6063116788864136,
"learning_rate": 0.00018008346176599674,
"loss": 1.0894,
"step": 2070
},
{
"epoch": 1.4261227288309908,
"grad_norm": 2.389429807662964,
"learning_rate": 0.00017979578142924885,
"loss": 1.1353,
"step": 2080
},
{
"epoch": 1.4329790881042166,
"grad_norm": 2.307987928390503,
"learning_rate": 0.0001795062715657157,
"loss": 1.1205,
"step": 2090
},
{
"epoch": 1.4398354473774426,
"grad_norm": 1.9370081424713135,
"learning_rate": 0.0001792149388131674,
"loss": 1.088,
"step": 2100
},
{
"epoch": 1.4466918066506684,
"grad_norm": 2.298769474029541,
"learning_rate": 0.0001789217898511685,
"loss": 1.1258,
"step": 2110
},
{
"epoch": 1.4535481659238945,
"grad_norm": 3.375627279281616,
"learning_rate": 0.00017862683140092497,
"loss": 1.0808,
"step": 2120
},
{
"epoch": 1.4604045251971203,
"grad_norm": 1.4899195432662964,
"learning_rate": 0.00017833007022512992,
"loss": 1.1043,
"step": 2130
},
{
"epoch": 1.4672608844703463,
"grad_norm": 2.4468886852264404,
"learning_rate": 0.0001780315131278087,
"loss": 1.2583,
"step": 2140
},
{
"epoch": 1.4741172437435721,
"grad_norm": 1.4145742654800415,
"learning_rate": 0.0001777311669541629,
"loss": 1.2913,
"step": 2150
},
{
"epoch": 1.4809736030167981,
"grad_norm": 1.8379381895065308,
"learning_rate": 0.00017742903859041325,
"loss": 1.1803,
"step": 2160
},
{
"epoch": 1.487829962290024,
"grad_norm": 2.4418864250183105,
"learning_rate": 0.00017712513496364197,
"loss": 1.1791,
"step": 2170
},
{
"epoch": 1.49468632156325,
"grad_norm": 2.0489251613616943,
"learning_rate": 0.00017681946304163372,
"loss": 1.193,
"step": 2180
},
{
"epoch": 1.5015426808364758,
"grad_norm": 1.6349157094955444,
"learning_rate": 0.00017651202983271603,
"loss": 1.1657,
"step": 2190
},
{
"epoch": 1.5083990401097016,
"grad_norm": 2.110250949859619,
"learning_rate": 0.00017620284238559848,
"loss": 1.2797,
"step": 2200
},
{
"epoch": 1.5152553993829276,
"grad_norm": 2.1508467197418213,
"learning_rate": 0.00017589190778921117,
"loss": 1.2165,
"step": 2210
},
{
"epoch": 1.5221117586561537,
"grad_norm": 1.8186125755310059,
"learning_rate": 0.00017557923317254213,
"loss": 1.2268,
"step": 2220
},
{
"epoch": 1.5289681179293795,
"grad_norm": 1.9265555143356323,
"learning_rate": 0.00017526482570447396,
"loss": 1.1894,
"step": 2230
},
{
"epoch": 1.5358244772026053,
"grad_norm": 2.7489798069000244,
"learning_rate": 0.00017494869259361933,
"loss": 1.217,
"step": 2240
},
{
"epoch": 1.5426808364758313,
"grad_norm": 1.585384726524353,
"learning_rate": 0.00017463084108815586,
"loss": 1.0137,
"step": 2250
},
{
"epoch": 1.5495371957490574,
"grad_norm": 3.09739089012146,
"learning_rate": 0.0001743112784756598,
"loss": 1.2187,
"step": 2260
},
{
"epoch": 1.5563935550222832,
"grad_norm": 2.081796884536743,
"learning_rate": 0.000173990012082939,
"loss": 1.0896,
"step": 2270
},
{
"epoch": 1.563249914295509,
"grad_norm": 2.9067137241363525,
"learning_rate": 0.00017366704927586498,
"loss": 1.1316,
"step": 2280
},
{
"epoch": 1.570106273568735,
"grad_norm": 2.1909544467926025,
"learning_rate": 0.00017334239745920394,
"loss": 1.1706,
"step": 2290
},
{
"epoch": 1.576962632841961,
"grad_norm": 2.680025815963745,
"learning_rate": 0.00017301606407644701,
"loss": 1.1753,
"step": 2300
},
{
"epoch": 1.5838189921151868,
"grad_norm": 3.4758894443511963,
"learning_rate": 0.0001726880566096397,
"loss": 1.1402,
"step": 2310
},
{
"epoch": 1.5906753513884127,
"grad_norm": 3.713744878768921,
"learning_rate": 0.0001723583825792102,
"loss": 1.1975,
"step": 2320
},
{
"epoch": 1.5975317106616387,
"grad_norm": 1.8732534646987915,
"learning_rate": 0.0001720270495437971,
"loss": 1.0603,
"step": 2330
},
{
"epoch": 1.6043880699348647,
"grad_norm": 1.7863500118255615,
"learning_rate": 0.0001716940651000759,
"loss": 1.1048,
"step": 2340
},
{
"epoch": 1.6112444292080905,
"grad_norm": 2.9769108295440674,
"learning_rate": 0.00017135943688258506,
"loss": 1.1211,
"step": 2350
},
{
"epoch": 1.6181007884813163,
"grad_norm": 3.001119375228882,
"learning_rate": 0.00017102317256355082,
"loss": 1.2261,
"step": 2360
},
{
"epoch": 1.6249571477545424,
"grad_norm": 2.4549243450164795,
"learning_rate": 0.00017068527985271125,
"loss": 1.3292,
"step": 2370
},
{
"epoch": 1.6318135070277684,
"grad_norm": 2.303267478942871,
"learning_rate": 0.00017034576649713965,
"loss": 1.1634,
"step": 2380
},
{
"epoch": 1.6386698663009942,
"grad_norm": 3.4060795307159424,
"learning_rate": 0.00017000464028106682,
"loss": 1.2278,
"step": 2390
},
{
"epoch": 1.64552622557422,
"grad_norm": 2.647289514541626,
"learning_rate": 0.00016966190902570257,
"loss": 1.1818,
"step": 2400
},
{
"epoch": 1.652382584847446,
"grad_norm": 2.2770307064056396,
"learning_rate": 0.00016931758058905642,
"loss": 1.106,
"step": 2410
},
{
"epoch": 1.659238944120672,
"grad_norm": 1.959615707397461,
"learning_rate": 0.00016897166286575747,
"loss": 1.0618,
"step": 2420
},
{
"epoch": 1.6660953033938979,
"grad_norm": 2.4052298069000244,
"learning_rate": 0.0001686241637868734,
"loss": 1.1638,
"step": 2430
},
{
"epoch": 1.6729516626671237,
"grad_norm": 1.4760863780975342,
"learning_rate": 0.00016827509131972848,
"loss": 1.126,
"step": 2440
},
{
"epoch": 1.6798080219403497,
"grad_norm": 2.8597095012664795,
"learning_rate": 0.0001679244534677212,
"loss": 1.1905,
"step": 2450
},
{
"epoch": 1.6866643812135755,
"grad_norm": 3.0205864906311035,
"learning_rate": 0.00016757225827014044,
"loss": 1.1128,
"step": 2460
},
{
"epoch": 1.6935207404868016,
"grad_norm": 3.0635123252868652,
"learning_rate": 0.00016721851380198136,
"loss": 1.2575,
"step": 2470
},
{
"epoch": 1.7003770997600274,
"grad_norm": 2.0744762420654297,
"learning_rate": 0.00016686322817376014,
"loss": 1.2229,
"step": 2480
},
{
"epoch": 1.7072334590332532,
"grad_norm": 2.0594987869262695,
"learning_rate": 0.0001665064095313282,
"loss": 1.1665,
"step": 2490
},
{
"epoch": 1.7140898183064792,
"grad_norm": 3.5101630687713623,
"learning_rate": 0.00016614806605568514,
"loss": 1.0664,
"step": 2500
},
{
"epoch": 1.7209461775797052,
"grad_norm": 2.1221086978912354,
"learning_rate": 0.0001657882059627915,
"loss": 1.2069,
"step": 2510
},
{
"epoch": 1.727802536852931,
"grad_norm": 2.5655086040496826,
"learning_rate": 0.0001654268375033802,
"loss": 1.2594,
"step": 2520
},
{
"epoch": 1.7346588961261569,
"grad_norm": 2.0883429050445557,
"learning_rate": 0.00016506396896276732,
"loss": 1.0933,
"step": 2530
},
{
"epoch": 1.741515255399383,
"grad_norm": 2.505929946899414,
"learning_rate": 0.00016469960866066235,
"loss": 1.1908,
"step": 2540
},
{
"epoch": 1.748371614672609,
"grad_norm": 1.4579987525939941,
"learning_rate": 0.00016433376495097717,
"loss": 1.0112,
"step": 2550
},
{
"epoch": 1.7552279739458347,
"grad_norm": 2.1962618827819824,
"learning_rate": 0.00016396644622163476,
"loss": 1.0926,
"step": 2560
},
{
"epoch": 1.7620843332190606,
"grad_norm": 1.6549545526504517,
"learning_rate": 0.00016359766089437677,
"loss": 1.1251,
"step": 2570
},
{
"epoch": 1.7689406924922866,
"grad_norm": 1.649786114692688,
"learning_rate": 0.0001632274174245704,
"loss": 1.1365,
"step": 2580
},
{
"epoch": 1.7757970517655126,
"grad_norm": 2.0461771488189697,
"learning_rate": 0.00016285572430101456,
"loss": 1.1217,
"step": 2590
},
{
"epoch": 1.7826534110387384,
"grad_norm": 2.474700450897217,
"learning_rate": 0.00016248259004574534,
"loss": 1.1719,
"step": 2600
},
{
"epoch": 1.7895097703119642,
"grad_norm": 2.6934568881988525,
"learning_rate": 0.00016210802321384046,
"loss": 1.2663,
"step": 2610
},
{
"epoch": 1.7963661295851903,
"grad_norm": 2.020193099975586,
"learning_rate": 0.00016173203239322327,
"loss": 1.3247,
"step": 2620
},
{
"epoch": 1.8032224888584163,
"grad_norm": 2.375579357147217,
"learning_rate": 0.0001613546262044657,
"loss": 1.1168,
"step": 2630
},
{
"epoch": 1.810078848131642,
"grad_norm": 2.353813648223877,
"learning_rate": 0.00016097581330059074,
"loss": 1.2081,
"step": 2640
},
{
"epoch": 1.816935207404868,
"grad_norm": 1.9840728044509888,
"learning_rate": 0.00016059560236687408,
"loss": 1.1065,
"step": 2650
},
{
"epoch": 1.823791566678094,
"grad_norm": 2.407522201538086,
"learning_rate": 0.00016021400212064472,
"loss": 1.14,
"step": 2660
},
{
"epoch": 1.83064792595132,
"grad_norm": 1.937991976737976,
"learning_rate": 0.00015983102131108545,
"loss": 1.0747,
"step": 2670
},
{
"epoch": 1.8375042852245458,
"grad_norm": 2.0098466873168945,
"learning_rate": 0.000159446668719032,
"loss": 1.3052,
"step": 2680
},
{
"epoch": 1.8443606444977716,
"grad_norm": 1.9577100276947021,
"learning_rate": 0.00015906095315677173,
"loss": 1.2056,
"step": 2690
},
{
"epoch": 1.8512170037709976,
"grad_norm": 1.6589945554733276,
"learning_rate": 0.0001586738834678418,
"loss": 1.1571,
"step": 2700
},
{
"epoch": 1.8580733630442237,
"grad_norm": 2.461854934692383,
"learning_rate": 0.00015828546852682615,
"loss": 1.1748,
"step": 2710
},
{
"epoch": 1.8649297223174495,
"grad_norm": 3.0385873317718506,
"learning_rate": 0.00015789571723915223,
"loss": 1.1237,
"step": 2720
},
{
"epoch": 1.8717860815906753,
"grad_norm": 1.846217155456543,
"learning_rate": 0.00015750463854088666,
"loss": 1.0674,
"step": 2730
},
{
"epoch": 1.8786424408639013,
"grad_norm": 2.5026347637176514,
"learning_rate": 0.00015711224139853042,
"loss": 1.1815,
"step": 2740
},
{
"epoch": 1.8854988001371273,
"grad_norm": 2.515775203704834,
"learning_rate": 0.00015671853480881328,
"loss": 1.1674,
"step": 2750
},
{
"epoch": 1.8923551594103531,
"grad_norm": 3.0344481468200684,
"learning_rate": 0.00015632352779848755,
"loss": 1.1975,
"step": 2760
},
{
"epoch": 1.899211518683579,
"grad_norm": 1.853408932685852,
"learning_rate": 0.00015592722942412102,
"loss": 1.209,
"step": 2770
},
{
"epoch": 1.906067877956805,
"grad_norm": 2.2342731952667236,
"learning_rate": 0.00015552964877188935,
"loss": 1.1296,
"step": 2780
},
{
"epoch": 1.9129242372300308,
"grad_norm": 1.8905003070831299,
"learning_rate": 0.00015513079495736788,
"loss": 1.1877,
"step": 2790
},
{
"epoch": 1.9197805965032568,
"grad_norm": 2.1061089038848877,
"learning_rate": 0.00015473067712532245,
"loss": 1.1134,
"step": 2800
},
{
"epoch": 1.9266369557764826,
"grad_norm": 2.1238293647766113,
"learning_rate": 0.00015432930444949982,
"loss": 1.1978,
"step": 2810
},
{
"epoch": 1.9334933150497084,
"grad_norm": 1.7509264945983887,
"learning_rate": 0.0001539266861324173,
"loss": 1.1469,
"step": 2820
},
{
"epoch": 1.9403496743229345,
"grad_norm": 2.364008665084839,
"learning_rate": 0.00015352283140515177,
"loss": 1.1545,
"step": 2830
},
{
"epoch": 1.9472060335961605,
"grad_norm": 1.9981671571731567,
"learning_rate": 0.00015311774952712814,
"loss": 1.2245,
"step": 2840
},
{
"epoch": 1.9540623928693863,
"grad_norm": 3.0137462615966797,
"learning_rate": 0.00015271144978590685,
"loss": 1.2361,
"step": 2850
},
{
"epoch": 1.9609187521426121,
"grad_norm": 2.985872268676758,
"learning_rate": 0.00015230394149697108,
"loss": 1.2297,
"step": 2860
},
{
"epoch": 1.9677751114158382,
"grad_norm": 1.7955970764160156,
"learning_rate": 0.00015189523400351314,
"loss": 1.2132,
"step": 2870
},
{
"epoch": 1.9746314706890642,
"grad_norm": 2.0052096843719482,
"learning_rate": 0.0001514853366762202,
"loss": 1.0715,
"step": 2880
},
{
"epoch": 1.98148782996229,
"grad_norm": 3.2730658054351807,
"learning_rate": 0.00015107425891305946,
"loss": 1.3002,
"step": 2890
},
{
"epoch": 1.9883441892355158,
"grad_norm": 2.5514261722564697,
"learning_rate": 0.00015066201013906277,
"loss": 1.216,
"step": 2900
},
{
"epoch": 1.9952005485087418,
"grad_norm": 3.393329381942749,
"learning_rate": 0.00015024859980611048,
"loss": 1.2525,
"step": 2910
},
{
"epoch": 2.002056907781968,
"grad_norm": 1.7734365463256836,
"learning_rate": 0.00014983403739271455,
"loss": 1.0946,
"step": 2920
},
{
"epoch": 2.0089132670551937,
"grad_norm": 1.6522287130355835,
"learning_rate": 0.0001494183324038016,
"loss": 0.9668,
"step": 2930
},
{
"epoch": 2.0157696263284195,
"grad_norm": 1.8900648355484009,
"learning_rate": 0.00014900149437049463,
"loss": 0.9247,
"step": 2940
},
{
"epoch": 2.0226259856016453,
"grad_norm": 1.4394400119781494,
"learning_rate": 0.00014858353284989467,
"loss": 0.8676,
"step": 2950
},
{
"epoch": 2.0294823448748716,
"grad_norm": 1.6402225494384766,
"learning_rate": 0.00014816445742486177,
"loss": 0.7732,
"step": 2960
},
{
"epoch": 2.0363387041480974,
"grad_norm": 2.301037073135376,
"learning_rate": 0.0001477442777037949,
"loss": 0.8462,
"step": 2970
},
{
"epoch": 2.043195063421323,
"grad_norm": 2.0403075218200684,
"learning_rate": 0.00014732300332041215,
"loss": 0.7681,
"step": 2980
},
{
"epoch": 2.050051422694549,
"grad_norm": 2.1729094982147217,
"learning_rate": 0.00014690064393352943,
"loss": 1.0043,
"step": 2990
},
{
"epoch": 2.0569077819677752,
"grad_norm": 1.7256251573562622,
"learning_rate": 0.0001464772092268393,
"loss": 0.8462,
"step": 3000
},
{
"epoch": 2.0569077819677752,
"eval_loss": 1.2205281257629395,
"eval_runtime": 29.6488,
"eval_samples_per_second": 82.87,
"eval_steps_per_second": 10.388,
"step": 3000
},
{
"epoch": 2.063764141241001,
"grad_norm": 2.0800058841705322,
"learning_rate": 0.00014605270890868873,
"loss": 0.7895,
"step": 3010
},
{
"epoch": 2.070620500514227,
"grad_norm": 1.9822988510131836,
"learning_rate": 0.00014562715271185673,
"loss": 0.8707,
"step": 3020
},
{
"epoch": 2.0774768597874527,
"grad_norm": 2.0999081134796143,
"learning_rate": 0.00014520055039333101,
"loss": 0.8167,
"step": 3030
},
{
"epoch": 2.084333219060679,
"grad_norm": 3.332972288131714,
"learning_rate": 0.0001447729117340844,
"loss": 0.8117,
"step": 3040
},
{
"epoch": 2.0911895783339047,
"grad_norm": 2.6177289485931396,
"learning_rate": 0.0001443442465388505,
"loss": 0.8042,
"step": 3050
},
{
"epoch": 2.0980459376071305,
"grad_norm": 2.5687131881713867,
"learning_rate": 0.000143914564635899,
"loss": 0.8638,
"step": 3060
},
{
"epoch": 2.1049022968803563,
"grad_norm": 2.9838380813598633,
"learning_rate": 0.00014348387587681018,
"loss": 0.9421,
"step": 3070
},
{
"epoch": 2.1117586561535826,
"grad_norm": 3.1572110652923584,
"learning_rate": 0.00014305219013624918,
"loss": 0.8763,
"step": 3080
},
{
"epoch": 2.1186150154268084,
"grad_norm": 2.1814610958099365,
"learning_rate": 0.00014261951731173956,
"loss": 0.9218,
"step": 3090
},
{
"epoch": 2.125471374700034,
"grad_norm": 2.19307279586792,
"learning_rate": 0.00014218586732343635,
"loss": 0.8691,
"step": 3100
},
{
"epoch": 2.13232773397326,
"grad_norm": 2.0345587730407715,
"learning_rate": 0.00014175125011389858,
"loss": 0.9038,
"step": 3110
},
{
"epoch": 2.1391840932464863,
"grad_norm": 2.5638718605041504,
"learning_rate": 0.0001413156756478614,
"loss": 0.8381,
"step": 3120
},
{
"epoch": 2.146040452519712,
"grad_norm": 2.1037793159484863,
"learning_rate": 0.00014087915391200747,
"loss": 0.9794,
"step": 3130
},
{
"epoch": 2.152896811792938,
"grad_norm": 2.0018515586853027,
"learning_rate": 0.0001404416949147383,
"loss": 0.8893,
"step": 3140
},
{
"epoch": 2.1597531710661637,
"grad_norm": 4.70350456237793,
"learning_rate": 0.00014000330868594427,
"loss": 0.8194,
"step": 3150
},
{
"epoch": 2.16660953033939,
"grad_norm": 2.2104270458221436,
"learning_rate": 0.00013956400527677523,
"loss": 0.9157,
"step": 3160
},
{
"epoch": 2.1734658896126158,
"grad_norm": 2.0395448207855225,
"learning_rate": 0.00013912379475940963,
"loss": 0.9017,
"step": 3170
},
{
"epoch": 2.1803222488858416,
"grad_norm": 2.4973316192626953,
"learning_rate": 0.0001386826872268238,
"loss": 0.9304,
"step": 3180
},
{
"epoch": 2.1871786081590674,
"grad_norm": 2.2849769592285156,
"learning_rate": 0.00013824069279256052,
"loss": 0.828,
"step": 3190
},
{
"epoch": 2.1940349674322936,
"grad_norm": 2.458329916000366,
"learning_rate": 0.000137797821590497,
"loss": 0.8158,
"step": 3200
},
{
"epoch": 2.2008913267055195,
"grad_norm": 1.5932427644729614,
"learning_rate": 0.00013735408377461275,
"loss": 0.8592,
"step": 3210
},
{
"epoch": 2.2077476859787453,
"grad_norm": 2.6613569259643555,
"learning_rate": 0.00013690948951875658,
"loss": 0.8317,
"step": 3220
},
{
"epoch": 2.214604045251971,
"grad_norm": 1.6658825874328613,
"learning_rate": 0.00013646404901641358,
"loss": 0.8648,
"step": 3230
},
{
"epoch": 2.221460404525197,
"grad_norm": 2.0496561527252197,
"learning_rate": 0.00013601777248047105,
"loss": 0.8589,
"step": 3240
},
{
"epoch": 2.228316763798423,
"grad_norm": 3.063122510910034,
"learning_rate": 0.0001355706701429847,
"loss": 0.9327,
"step": 3250
},
{
"epoch": 2.235173123071649,
"grad_norm": 2.60986590385437,
"learning_rate": 0.00013512275225494377,
"loss": 0.9661,
"step": 3260
},
{
"epoch": 2.2420294823448748,
"grad_norm": 2.036770820617676,
"learning_rate": 0.00013467402908603622,
"loss": 0.7925,
"step": 3270
},
{
"epoch": 2.248885841618101,
"grad_norm": 2.920668125152588,
"learning_rate": 0.0001342245109244132,
"loss": 0.8235,
"step": 3280
},
{
"epoch": 2.255742200891327,
"grad_norm": 2.1719000339508057,
"learning_rate": 0.000133774208076453,
"loss": 0.8847,
"step": 3290
},
{
"epoch": 2.2625985601645526,
"grad_norm": 2.4449331760406494,
"learning_rate": 0.00013332313086652516,
"loss": 0.8658,
"step": 3300
},
{
"epoch": 2.2694549194377784,
"grad_norm": 2.6060760021209717,
"learning_rate": 0.00013287128963675312,
"loss": 0.7972,
"step": 3310
},
{
"epoch": 2.2763112787110042,
"grad_norm": 2.1217901706695557,
"learning_rate": 0.00013241869474677783,
"loss": 0.8716,
"step": 3320
},
{
"epoch": 2.2831676379842305,
"grad_norm": 2.239027738571167,
"learning_rate": 0.00013196535657351957,
"loss": 0.8919,
"step": 3330
},
{
"epoch": 2.2900239972574563,
"grad_norm": 2.4673471450805664,
"learning_rate": 0.00013151128551094064,
"loss": 0.8553,
"step": 3340
},
{
"epoch": 2.296880356530682,
"grad_norm": 2.783276319503784,
"learning_rate": 0.00013105649196980647,
"loss": 0.8081,
"step": 3350
},
{
"epoch": 2.303736715803908,
"grad_norm": 2.2374160289764404,
"learning_rate": 0.00013060098637744733,
"loss": 0.8908,
"step": 3360
},
{
"epoch": 2.310593075077134,
"grad_norm": 2.431093215942383,
"learning_rate": 0.00013014477917751912,
"loss": 1.0646,
"step": 3370
},
{
"epoch": 2.31744943435036,
"grad_norm": 3.1827499866485596,
"learning_rate": 0.00012968788082976386,
"loss": 0.8314,
"step": 3380
},
{
"epoch": 2.324305793623586,
"grad_norm": 2.4208121299743652,
"learning_rate": 0.00012923030180977005,
"loss": 0.8218,
"step": 3390
},
{
"epoch": 2.3311621528968116,
"grad_norm": 1.9898459911346436,
"learning_rate": 0.0001287720526087323,
"loss": 0.8163,
"step": 3400
},
{
"epoch": 2.338018512170038,
"grad_norm": 2.327742099761963,
"learning_rate": 0.00012831314373321084,
"loss": 0.8621,
"step": 3410
},
{
"epoch": 2.3448748714432637,
"grad_norm": 3.3728277683258057,
"learning_rate": 0.00012785358570489077,
"loss": 0.8402,
"step": 3420
},
{
"epoch": 2.3517312307164895,
"grad_norm": 2.3549559116363525,
"learning_rate": 0.00012739338906034062,
"loss": 0.9521,
"step": 3430
},
{
"epoch": 2.3585875899897153,
"grad_norm": 2.086442232131958,
"learning_rate": 0.00012693256435077093,
"loss": 0.9513,
"step": 3440
},
{
"epoch": 2.3654439492629415,
"grad_norm": 2.5228612422943115,
"learning_rate": 0.00012647112214179222,
"loss": 0.9159,
"step": 3450
},
{
"epoch": 2.3723003085361674,
"grad_norm": 4.470077991485596,
"learning_rate": 0.00012600907301317285,
"loss": 0.8976,
"step": 3460
},
{
"epoch": 2.379156667809393,
"grad_norm": 2.9393067359924316,
"learning_rate": 0.00012554642755859628,
"loss": 0.9191,
"step": 3470
},
{
"epoch": 2.386013027082619,
"grad_norm": 2.242415428161621,
"learning_rate": 0.0001250831963854185,
"loss": 0.7794,
"step": 3480
},
{
"epoch": 2.3928693863558452,
"grad_norm": 2.3584256172180176,
"learning_rate": 0.00012461939011442446,
"loss": 0.9089,
"step": 3490
},
{
"epoch": 2.399725745629071,
"grad_norm": 1.7331100702285767,
"learning_rate": 0.00012415501937958478,
"loss": 0.9748,
"step": 3500
},
{
"epoch": 2.406582104902297,
"grad_norm": 3.369680404663086,
"learning_rate": 0.00012369009482781192,
"loss": 0.8841,
"step": 3510
},
{
"epoch": 2.4134384641755227,
"grad_norm": 2.561638593673706,
"learning_rate": 0.000123224627118716,
"loss": 0.8507,
"step": 3520
},
{
"epoch": 2.4202948234487485,
"grad_norm": 1.9922767877578735,
"learning_rate": 0.00012275862692436048,
"loss": 0.9133,
"step": 3530
},
{
"epoch": 2.4271511827219747,
"grad_norm": 3.621152400970459,
"learning_rate": 0.00012229210492901738,
"loss": 0.7956,
"step": 3540
},
{
"epoch": 2.4340075419952005,
"grad_norm": 2.8023221492767334,
"learning_rate": 0.00012182507182892244,
"loss": 0.8476,
"step": 3550
},
{
"epoch": 2.4408639012684263,
"grad_norm": 2.273455858230591,
"learning_rate": 0.00012135753833202973,
"loss": 0.9277,
"step": 3560
},
{
"epoch": 2.4477202605416526,
"grad_norm": 1.6278008222579956,
"learning_rate": 0.00012088951515776634,
"loss": 0.9194,
"step": 3570
},
{
"epoch": 2.4545766198148784,
"grad_norm": 2.1401588916778564,
"learning_rate": 0.00012042101303678636,
"loss": 0.8345,
"step": 3580
},
{
"epoch": 2.461432979088104,
"grad_norm": 1.6796125173568726,
"learning_rate": 0.00011995204271072509,
"loss": 0.9335,
"step": 3590
},
{
"epoch": 2.46828933836133,
"grad_norm": 2.8594868183135986,
"learning_rate": 0.00011948261493195256,
"loss": 0.913,
"step": 3600
},
{
"epoch": 2.475145697634556,
"grad_norm": 2.4989187717437744,
"learning_rate": 0.0001190127404633272,
"loss": 0.9408,
"step": 3610
},
{
"epoch": 2.482002056907782,
"grad_norm": 4.41836404800415,
"learning_rate": 0.00011854243007794891,
"loss": 0.9526,
"step": 3620
},
{
"epoch": 2.488858416181008,
"grad_norm": 2.835994005203247,
"learning_rate": 0.00011807169455891216,
"loss": 0.8953,
"step": 3630
},
{
"epoch": 2.4957147754542337,
"grad_norm": 1.9034545421600342,
"learning_rate": 0.00011760054469905868,
"loss": 0.8837,
"step": 3640
},
{
"epoch": 2.50257113472746,
"grad_norm": 1.9930311441421509,
"learning_rate": 0.00011712899130072999,
"loss": 0.8693,
"step": 3650
},
{
"epoch": 2.5094274940006858,
"grad_norm": 1.874711513519287,
"learning_rate": 0.00011665704517551995,
"loss": 0.8614,
"step": 3660
},
{
"epoch": 2.5162838532739116,
"grad_norm": 3.2593679428100586,
"learning_rate": 0.00011618471714402656,
"loss": 0.8577,
"step": 3670
},
{
"epoch": 2.5231402125471374,
"grad_norm": 1.9729576110839844,
"learning_rate": 0.0001157120180356041,
"loss": 0.7806,
"step": 3680
},
{
"epoch": 2.529996571820363,
"grad_norm": 2.616779327392578,
"learning_rate": 0.00011523895868811472,
"loss": 0.9526,
"step": 3690
},
{
"epoch": 2.5368529310935894,
"grad_norm": 3.430968761444092,
"learning_rate": 0.00011476554994768001,
"loss": 0.8698,
"step": 3700
},
{
"epoch": 2.5437092903668153,
"grad_norm": 1.9257500171661377,
"learning_rate": 0.0001142918026684323,
"loss": 0.8879,
"step": 3710
},
{
"epoch": 2.550565649640041,
"grad_norm": 3.2250068187713623,
"learning_rate": 0.00011381772771226577,
"loss": 0.9508,
"step": 3720
},
{
"epoch": 2.5574220089132673,
"grad_norm": 2.567389726638794,
"learning_rate": 0.00011334333594858755,
"loss": 0.8863,
"step": 3730
},
{
"epoch": 2.564278368186493,
"grad_norm": 2.4031014442443848,
"learning_rate": 0.00011286863825406831,
"loss": 0.8951,
"step": 3740
},
{
"epoch": 2.571134727459719,
"grad_norm": 3.276573896408081,
"learning_rate": 0.000112393645512393,
"loss": 0.8829,
"step": 3750
},
{
"epoch": 2.5779910867329447,
"grad_norm": 3.0637712478637695,
"learning_rate": 0.00011191836861401137,
"loss": 0.985,
"step": 3760
},
{
"epoch": 2.5848474460061706,
"grad_norm": 2.7773642539978027,
"learning_rate": 0.00011144281845588811,
"loss": 0.9017,
"step": 3770
},
{
"epoch": 2.591703805279397,
"grad_norm": 2.187830686569214,
"learning_rate": 0.00011096700594125318,
"loss": 0.7401,
"step": 3780
},
{
"epoch": 2.5985601645526226,
"grad_norm": 3.42225980758667,
"learning_rate": 0.00011049094197935165,
"loss": 0.9513,
"step": 3790
},
{
"epoch": 2.6054165238258484,
"grad_norm": 2.064603567123413,
"learning_rate": 0.00011001463748519383,
"loss": 0.8678,
"step": 3800
},
{
"epoch": 2.6122728830990742,
"grad_norm": 2.707390308380127,
"learning_rate": 0.00010953810337930468,
"loss": 0.8812,
"step": 3810
},
{
"epoch": 2.6191292423723,
"grad_norm": 3.8909173011779785,
"learning_rate": 0.00010906135058747376,
"loss": 0.855,
"step": 3820
},
{
"epoch": 2.6259856016455263,
"grad_norm": 3.4854836463928223,
"learning_rate": 0.0001085843900405045,
"loss": 0.7692,
"step": 3830
},
{
"epoch": 2.632841960918752,
"grad_norm": 2.343348503112793,
"learning_rate": 0.00010810723267396366,
"loss": 0.8362,
"step": 3840
},
{
"epoch": 2.639698320191978,
"grad_norm": 2.7717478275299072,
"learning_rate": 0.00010762988942793065,
"loss": 1.0403,
"step": 3850
},
{
"epoch": 2.646554679465204,
"grad_norm": 3.4452064037323,
"learning_rate": 0.00010715237124674658,
"loss": 0.8948,
"step": 3860
},
{
"epoch": 2.65341103873843,
"grad_norm": 2.542163133621216,
"learning_rate": 0.00010667468907876348,
"loss": 0.8332,
"step": 3870
},
{
"epoch": 2.660267398011656,
"grad_norm": 3.4537532329559326,
"learning_rate": 0.00010619685387609313,
"loss": 0.9012,
"step": 3880
},
{
"epoch": 2.6671237572848816,
"grad_norm": 2.38268780708313,
"learning_rate": 0.00010571887659435614,
"loss": 0.8836,
"step": 3890
},
{
"epoch": 2.6739801165581074,
"grad_norm": 2.2005438804626465,
"learning_rate": 0.00010524076819243051,
"loss": 0.928,
"step": 3900
},
{
"epoch": 2.6808364758313337,
"grad_norm": 2.7791340351104736,
"learning_rate": 0.00010476253963220062,
"loss": 0.8545,
"step": 3910
},
{
"epoch": 2.6876928351045595,
"grad_norm": 1.3345112800598145,
"learning_rate": 0.00010428420187830581,
"loss": 0.757,
"step": 3920
},
{
"epoch": 2.6945491943777853,
"grad_norm": 2.2857022285461426,
"learning_rate": 0.00010380576589788884,
"loss": 0.7812,
"step": 3930
},
{
"epoch": 2.7014055536510115,
"grad_norm": 2.7016501426696777,
"learning_rate": 0.00010332724266034472,
"loss": 0.7387,
"step": 3940
},
{
"epoch": 2.7082619129242373,
"grad_norm": 2.9767794609069824,
"learning_rate": 0.00010284864313706894,
"loss": 0.9737,
"step": 3950
},
{
"epoch": 2.715118272197463,
"grad_norm": 2.667555570602417,
"learning_rate": 0.00010236997830120614,
"loss": 0.8329,
"step": 3960
},
{
"epoch": 2.721974631470689,
"grad_norm": 1.9413018226623535,
"learning_rate": 0.00010189125912739832,
"loss": 0.8278,
"step": 3970
},
{
"epoch": 2.7288309907439148,
"grad_norm": 2.892408609390259,
"learning_rate": 0.0001014124965915334,
"loss": 0.8695,
"step": 3980
},
{
"epoch": 2.735687350017141,
"grad_norm": 2.4925520420074463,
"learning_rate": 0.00010093370167049343,
"loss": 0.8573,
"step": 3990
},
{
"epoch": 2.742543709290367,
"grad_norm": 2.353090524673462,
"learning_rate": 0.00010045488534190303,
"loss": 0.8322,
"step": 4000
},
{
"epoch": 2.742543709290367,
"eval_loss": 1.2436245679855347,
"eval_runtime": 29.4642,
"eval_samples_per_second": 83.389,
"eval_steps_per_second": 10.453,
"step": 4000
},
{
"epoch": 2.7494000685635926,
"grad_norm": 2.932257890701294,
"learning_rate": 9.997605858387764e-05,
"loss": 0.9329,
"step": 4010
},
{
"epoch": 2.756256427836819,
"grad_norm": 1.6092489957809448,
"learning_rate": 9.949723237477173e-05,
"loss": 0.8408,
"step": 4020
},
{
"epoch": 2.7631127871100447,
"grad_norm": 2.919813871383667,
"learning_rate": 9.901841769292733e-05,
"loss": 0.8821,
"step": 4030
},
{
"epoch": 2.7699691463832705,
"grad_norm": 2.4160959720611572,
"learning_rate": 9.853962551642204e-05,
"loss": 0.8638,
"step": 4040
},
{
"epoch": 2.7768255056564963,
"grad_norm": 2.036405563354492,
"learning_rate": 9.806086682281758e-05,
"loss": 0.8286,
"step": 4050
},
{
"epoch": 2.783681864929722,
"grad_norm": 2.90258526802063,
"learning_rate": 9.758215258890787e-05,
"loss": 0.8771,
"step": 4060
},
{
"epoch": 2.7905382242029484,
"grad_norm": 2.936398506164551,
"learning_rate": 9.710349379046762e-05,
"loss": 0.8782,
"step": 4070
},
{
"epoch": 2.797394583476174,
"grad_norm": 2.3748672008514404,
"learning_rate": 9.662490140200038e-05,
"loss": 0.9046,
"step": 4080
},
{
"epoch": 2.8042509427494,
"grad_norm": 2.2428438663482666,
"learning_rate": 9.614638639648719e-05,
"loss": 0.8763,
"step": 4090
},
{
"epoch": 2.811107302022626,
"grad_norm": 2.7683987617492676,
"learning_rate": 9.566795974513489e-05,
"loss": 0.8338,
"step": 4100
},
{
"epoch": 2.8179636612958516,
"grad_norm": 3.3809077739715576,
"learning_rate": 9.518963241712445e-05,
"loss": 0.9071,
"step": 4110
},
{
"epoch": 2.824820020569078,
"grad_norm": 3.275303602218628,
"learning_rate": 9.471141537935974e-05,
"loss": 0.969,
"step": 4120
},
{
"epoch": 2.8316763798423037,
"grad_norm": 2.0883727073669434,
"learning_rate": 9.423331959621582e-05,
"loss": 0.8391,
"step": 4130
},
{
"epoch": 2.8385327391155295,
"grad_norm": 2.7466342449188232,
"learning_rate": 9.375535602928776e-05,
"loss": 0.9003,
"step": 4140
},
{
"epoch": 2.8453890983887558,
"grad_norm": 4.680114269256592,
"learning_rate": 9.327753563713913e-05,
"loss": 0.8568,
"step": 4150
},
{
"epoch": 2.8522454576619816,
"grad_norm": 1.9148244857788086,
"learning_rate": 9.279986937505096e-05,
"loss": 0.8435,
"step": 4160
},
{
"epoch": 2.8591018169352074,
"grad_norm": 2.7942795753479004,
"learning_rate": 9.232236819477038e-05,
"loss": 0.8624,
"step": 4170
},
{
"epoch": 2.865958176208433,
"grad_norm": 2.342534065246582,
"learning_rate": 9.184504304425958e-05,
"loss": 0.9329,
"step": 4180
},
{
"epoch": 2.872814535481659,
"grad_norm": 2.486449718475342,
"learning_rate": 9.136790486744482e-05,
"loss": 0.9163,
"step": 4190
},
{
"epoch": 2.8796708947548852,
"grad_norm": 2.3326776027679443,
"learning_rate": 9.089096460396552e-05,
"loss": 0.7974,
"step": 4200
},
{
"epoch": 2.886527254028111,
"grad_norm": 2.1666274070739746,
"learning_rate": 9.041423318892339e-05,
"loss": 0.9513,
"step": 4210
},
{
"epoch": 2.893383613301337,
"grad_norm": 2.0120041370391846,
"learning_rate": 8.993772155263175e-05,
"loss": 0.8523,
"step": 4220
},
{
"epoch": 2.900239972574563,
"grad_norm": 3.034512996673584,
"learning_rate": 8.946144062036496e-05,
"loss": 0.904,
"step": 4230
},
{
"epoch": 2.907096331847789,
"grad_norm": 2.084458589553833,
"learning_rate": 8.89854013121078e-05,
"loss": 0.8582,
"step": 4240
},
{
"epoch": 2.9139526911210147,
"grad_norm": 2.5334362983703613,
"learning_rate": 8.850961454230526e-05,
"loss": 0.9028,
"step": 4250
},
{
"epoch": 2.9208090503942405,
"grad_norm": 2.835369348526001,
"learning_rate": 8.803409121961226e-05,
"loss": 0.8264,
"step": 4260
},
{
"epoch": 2.9276654096674664,
"grad_norm": 2.492717742919922,
"learning_rate": 8.755884224664342e-05,
"loss": 0.8943,
"step": 4270
},
{
"epoch": 2.9345217689406926,
"grad_norm": 1.9058958292007446,
"learning_rate": 8.708387851972313e-05,
"loss": 0.7474,
"step": 4280
},
{
"epoch": 2.9413781282139184,
"grad_norm": 3.058417558670044,
"learning_rate": 8.660921092863596e-05,
"loss": 0.8995,
"step": 4290
},
{
"epoch": 2.9482344874871442,
"grad_norm": 2.5806403160095215,
"learning_rate": 8.613485035637662e-05,
"loss": 0.7901,
"step": 4300
},
{
"epoch": 2.9550908467603705,
"grad_norm": 3.59805965423584,
"learning_rate": 8.566080767890069e-05,
"loss": 0.7556,
"step": 4310
},
{
"epoch": 2.9619472060335963,
"grad_norm": 1.853975772857666,
"learning_rate": 8.518709376487515e-05,
"loss": 0.9284,
"step": 4320
},
{
"epoch": 2.968803565306822,
"grad_norm": 3.123701810836792,
"learning_rate": 8.471371947542924e-05,
"loss": 0.8234,
"step": 4330
},
{
"epoch": 2.975659924580048,
"grad_norm": 2.0518481731414795,
"learning_rate": 8.424069566390541e-05,
"loss": 0.9438,
"step": 4340
},
{
"epoch": 2.9825162838532737,
"grad_norm": 1.894871473312378,
"learning_rate": 8.376803317561048e-05,
"loss": 0.9597,
"step": 4350
},
{
"epoch": 2.9893726431265,
"grad_norm": 2.3247461318969727,
"learning_rate": 8.329574284756704e-05,
"loss": 0.7713,
"step": 4360
},
{
"epoch": 2.996229002399726,
"grad_norm": 1.6081085205078125,
"learning_rate": 8.282383550826483e-05,
"loss": 0.9166,
"step": 4370
},
{
"epoch": 3.0030853616729516,
"grad_norm": 1.5519185066223145,
"learning_rate": 8.23523219774127e-05,
"loss": 0.8393,
"step": 4380
},
{
"epoch": 3.0099417209461774,
"grad_norm": 2.066254138946533,
"learning_rate": 8.188121306569028e-05,
"loss": 0.6205,
"step": 4390
},
{
"epoch": 3.0167980802194037,
"grad_norm": 2.129615306854248,
"learning_rate": 8.141051957450039e-05,
"loss": 0.5888,
"step": 4400
},
{
"epoch": 3.0236544394926295,
"grad_norm": 2.3542020320892334,
"learning_rate": 8.09402522957211e-05,
"loss": 0.5694,
"step": 4410
},
{
"epoch": 3.0305107987658553,
"grad_norm": 2.1178841590881348,
"learning_rate": 8.04704220114586e-05,
"loss": 0.6532,
"step": 4420
},
{
"epoch": 3.037367158039081,
"grad_norm": 2.2024478912353516,
"learning_rate": 8.00010394937997e-05,
"loss": 0.5173,
"step": 4430
},
{
"epoch": 3.0442235173123073,
"grad_norm": 3.5857062339782715,
"learning_rate": 7.953211550456507e-05,
"loss": 0.6258,
"step": 4440
},
{
"epoch": 3.051079876585533,
"grad_norm": 2.69157338142395,
"learning_rate": 7.906366079506244e-05,
"loss": 0.5143,
"step": 4450
},
{
"epoch": 3.057936235858759,
"grad_norm": 2.82023286819458,
"learning_rate": 7.859568610583998e-05,
"loss": 0.5554,
"step": 4460
},
{
"epoch": 3.0647925951319848,
"grad_norm": 2.121466636657715,
"learning_rate": 7.812820216644024e-05,
"loss": 0.621,
"step": 4470
},
{
"epoch": 3.071648954405211,
"grad_norm": 2.0588443279266357,
"learning_rate": 7.766121969515397e-05,
"loss": 0.5973,
"step": 4480
},
{
"epoch": 3.078505313678437,
"grad_norm": 2.1448562145233154,
"learning_rate": 7.719474939877451e-05,
"loss": 0.5669,
"step": 4490
},
{
"epoch": 3.0853616729516626,
"grad_norm": 2.030393123626709,
"learning_rate": 7.672880197235222e-05,
"loss": 0.805,
"step": 4500
},
{
"epoch": 3.0922180322248884,
"grad_norm": 3.153280019760132,
"learning_rate": 7.626338809894932e-05,
"loss": 0.7532,
"step": 4510
},
{
"epoch": 3.0990743914981147,
"grad_norm": 2.932528257369995,
"learning_rate": 7.579851844939491e-05,
"loss": 0.5241,
"step": 4520
},
{
"epoch": 3.1059307507713405,
"grad_norm": 2.2017295360565186,
"learning_rate": 7.533420368204036e-05,
"loss": 0.5952,
"step": 4530
},
{
"epoch": 3.1127871100445663,
"grad_norm": 4.451969146728516,
"learning_rate": 7.487045444251493e-05,
"loss": 0.6306,
"step": 4540
},
{
"epoch": 3.119643469317792,
"grad_norm": 2.1478543281555176,
"learning_rate": 7.440728136348158e-05,
"loss": 0.7007,
"step": 4550
},
{
"epoch": 3.126499828591018,
"grad_norm": 2.6645448207855225,
"learning_rate": 7.394469506439346e-05,
"loss": 0.6055,
"step": 4560
},
{
"epoch": 3.133356187864244,
"grad_norm": 2.741107702255249,
"learning_rate": 7.348270615125006e-05,
"loss": 0.5782,
"step": 4570
},
{
"epoch": 3.14021254713747,
"grad_norm": 2.3291351795196533,
"learning_rate": 7.302132521635438e-05,
"loss": 0.6248,
"step": 4580
},
{
"epoch": 3.147068906410696,
"grad_norm": 2.323695421218872,
"learning_rate": 7.256056283806986e-05,
"loss": 0.6929,
"step": 4590
},
{
"epoch": 3.153925265683922,
"grad_norm": 3.022197723388672,
"learning_rate": 7.210042958057794e-05,
"loss": 0.6514,
"step": 4600
},
{
"epoch": 3.160781624957148,
"grad_norm": 2.103457450866699,
"learning_rate": 7.164093599363585e-05,
"loss": 0.6308,
"step": 4610
},
{
"epoch": 3.1676379842303737,
"grad_norm": 3.9690616130828857,
"learning_rate": 7.118209261233461e-05,
"loss": 0.6485,
"step": 4620
},
{
"epoch": 3.1744943435035995,
"grad_norm": 3.117914915084839,
"learning_rate": 7.072390995685769e-05,
"loss": 0.6061,
"step": 4630
},
{
"epoch": 3.1813507027768253,
"grad_norm": 2.1254525184631348,
"learning_rate": 7.026639853223958e-05,
"loss": 0.6515,
"step": 4640
},
{
"epoch": 3.1882070620500516,
"grad_norm": 3.2843899726867676,
"learning_rate": 6.980956882812515e-05,
"loss": 0.6282,
"step": 4650
},
{
"epoch": 3.1950634213232774,
"grad_norm": 2.0985405445098877,
"learning_rate": 6.935343131852899e-05,
"loss": 0.6912,
"step": 4660
},
{
"epoch": 3.201919780596503,
"grad_norm": 2.0823545455932617,
"learning_rate": 6.889799646159534e-05,
"loss": 0.6898,
"step": 4670
},
{
"epoch": 3.208776139869729,
"grad_norm": 2.333568811416626,
"learning_rate": 6.844327469935827e-05,
"loss": 0.5982,
"step": 4680
},
{
"epoch": 3.2156324991429552,
"grad_norm": 3.6117570400238037,
"learning_rate": 6.79892764575023e-05,
"loss": 0.6153,
"step": 4690
},
{
"epoch": 3.222488858416181,
"grad_norm": 2.7832589149475098,
"learning_rate": 6.753601214512343e-05,
"loss": 0.6015,
"step": 4700
},
{
"epoch": 3.229345217689407,
"grad_norm": 2.1461801528930664,
"learning_rate": 6.708349215449025e-05,
"loss": 0.6104,
"step": 4710
},
{
"epoch": 3.2362015769626327,
"grad_norm": 2.373319625854492,
"learning_rate": 6.6631726860806e-05,
"loss": 0.5886,
"step": 4720
},
{
"epoch": 3.243057936235859,
"grad_norm": 3.069390058517456,
"learning_rate": 6.618072662197039e-05,
"loss": 0.5351,
"step": 4730
},
{
"epoch": 3.2499142955090847,
"grad_norm": 2.87705135345459,
"learning_rate": 6.573050177834233e-05,
"loss": 0.6533,
"step": 4740
},
{
"epoch": 3.2567706547823105,
"grad_norm": 2.443903923034668,
"learning_rate": 6.528106265250271e-05,
"loss": 0.6517,
"step": 4750
},
{
"epoch": 3.2636270140555363,
"grad_norm": 1.977824091911316,
"learning_rate": 6.483241954901785e-05,
"loss": 0.5984,
"step": 4760
},
{
"epoch": 3.2704833733287626,
"grad_norm": 3.083531379699707,
"learning_rate": 6.438458275420309e-05,
"loss": 0.6077,
"step": 4770
},
{
"epoch": 3.2773397326019884,
"grad_norm": 3.2755494117736816,
"learning_rate": 6.393756253588714e-05,
"loss": 0.5889,
"step": 4780
},
{
"epoch": 3.284196091875214,
"grad_norm": 2.7298457622528076,
"learning_rate": 6.349136914317652e-05,
"loss": 0.5221,
"step": 4790
},
{
"epoch": 3.29105245114844,
"grad_norm": 2.45281720161438,
"learning_rate": 6.304601280622055e-05,
"loss": 0.6113,
"step": 4800
},
{
"epoch": 3.2979088104216663,
"grad_norm": 3.0578701496124268,
"learning_rate": 6.260150373597697e-05,
"loss": 0.6029,
"step": 4810
},
{
"epoch": 3.304765169694892,
"grad_norm": 3.420334577560425,
"learning_rate": 6.21578521239776e-05,
"loss": 0.4964,
"step": 4820
},
{
"epoch": 3.311621528968118,
"grad_norm": 2.0759449005126953,
"learning_rate": 6.171506814209489e-05,
"loss": 0.6309,
"step": 4830
},
{
"epoch": 3.3184778882413437,
"grad_norm": 3.347933530807495,
"learning_rate": 6.127316194230854e-05,
"loss": 0.5931,
"step": 4840
},
{
"epoch": 3.3253342475145695,
"grad_norm": 2.531506061553955,
"learning_rate": 6.083214365647285e-05,
"loss": 0.616,
"step": 4850
},
{
"epoch": 3.3321906067877958,
"grad_norm": 2.149945020675659,
"learning_rate": 6.039202339608432e-05,
"loss": 0.5663,
"step": 4860
},
{
"epoch": 3.3390469660610216,
"grad_norm": 2.5064194202423096,
"learning_rate": 5.99528112520499e-05,
"loss": 0.7054,
"step": 4870
},
{
"epoch": 3.3459033253342474,
"grad_norm": 2.7665531635284424,
"learning_rate": 5.951451729445563e-05,
"loss": 0.5341,
"step": 4880
},
{
"epoch": 3.3527596846074736,
"grad_norm": 2.899704694747925,
"learning_rate": 5.907715157233563e-05,
"loss": 0.6004,
"step": 4890
},
{
"epoch": 3.3596160438806995,
"grad_norm": 2.8619532585144043,
"learning_rate": 5.8640724113441925e-05,
"loss": 0.657,
"step": 4900
},
{
"epoch": 3.3664724031539253,
"grad_norm": 2.754093647003174,
"learning_rate": 5.820524492401428e-05,
"loss": 0.6386,
"step": 4910
},
{
"epoch": 3.373328762427151,
"grad_norm": 2.501052141189575,
"learning_rate": 5.777072398855101e-05,
"loss": 0.6552,
"step": 4920
},
{
"epoch": 3.380185121700377,
"grad_norm": 4.3751983642578125,
"learning_rate": 5.7337171269579895e-05,
"loss": 0.6642,
"step": 4930
},
{
"epoch": 3.387041480973603,
"grad_norm": 3.558741331100464,
"learning_rate": 5.690459670742977e-05,
"loss": 0.5393,
"step": 4940
},
{
"epoch": 3.393897840246829,
"grad_norm": 2.205996036529541,
"learning_rate": 5.647301022000284e-05,
"loss": 0.5735,
"step": 4950
},
{
"epoch": 3.4007541995200548,
"grad_norm": 3.4519693851470947,
"learning_rate": 5.6042421702546956e-05,
"loss": 0.6545,
"step": 4960
},
{
"epoch": 3.407610558793281,
"grad_norm": 2.279060125350952,
"learning_rate": 5.561284102742892e-05,
"loss": 0.5327,
"step": 4970
},
{
"epoch": 3.414466918066507,
"grad_norm": 2.359433174133301,
"learning_rate": 5.51842780439082e-05,
"loss": 0.587,
"step": 4980
},
{
"epoch": 3.4213232773397326,
"grad_norm": 2.9181036949157715,
"learning_rate": 5.475674257791097e-05,
"loss": 0.6117,
"step": 4990
},
{
"epoch": 3.4281796366129584,
"grad_norm": 3.4551374912261963,
"learning_rate": 5.433024443180486e-05,
"loss": 0.5623,
"step": 5000
},
{
"epoch": 3.4281796366129584,
"eval_loss": 1.4529434442520142,
"eval_runtime": 29.3836,
"eval_samples_per_second": 83.618,
"eval_steps_per_second": 10.482,
"step": 5000
},
{
"epoch": 3.4350359958861842,
"grad_norm": 2.4626495838165283,
"learning_rate": 5.3904793384174226e-05,
"loss": 0.6439,
"step": 5010
},
{
"epoch": 3.4418923551594105,
"grad_norm": 4.081802845001221,
"learning_rate": 5.348039918959604e-05,
"loss": 0.633,
"step": 5020
},
{
"epoch": 3.4487487144326363,
"grad_norm": 2.2953267097473145,
"learning_rate": 5.30570715784161e-05,
"loss": 0.6224,
"step": 5030
},
{
"epoch": 3.455605073705862,
"grad_norm": 2.835130214691162,
"learning_rate": 5.263482025652591e-05,
"loss": 0.6399,
"step": 5040
},
{
"epoch": 3.462461432979088,
"grad_norm": 3.4341225624084473,
"learning_rate": 5.221365490514041e-05,
"loss": 0.5624,
"step": 5050
},
{
"epoch": 3.469317792252314,
"grad_norm": 3.069215774536133,
"learning_rate": 5.1793585180575685e-05,
"loss": 0.7112,
"step": 5060
},
{
"epoch": 3.47617415152554,
"grad_norm": 2.27093768119812,
"learning_rate": 5.137462071402778e-05,
"loss": 0.6722,
"step": 5070
},
{
"epoch": 3.483030510798766,
"grad_norm": 2.0508460998535156,
"learning_rate": 5.095677111135172e-05,
"loss": 0.6451,
"step": 5080
},
{
"epoch": 3.4898868700719916,
"grad_norm": 2.6624417304992676,
"learning_rate": 5.054004595284153e-05,
"loss": 0.5707,
"step": 5090
},
{
"epoch": 3.496743229345218,
"grad_norm": 2.8987631797790527,
"learning_rate": 5.012445479301027e-05,
"loss": 0.6265,
"step": 5100
},
{
"epoch": 3.5035995886184437,
"grad_norm": 2.9133450984954834,
"learning_rate": 4.971000716037116e-05,
"loss": 0.5917,
"step": 5110
},
{
"epoch": 3.5104559478916695,
"grad_norm": 2.526829481124878,
"learning_rate": 4.929671255721906e-05,
"loss": 0.5988,
"step": 5120
},
{
"epoch": 3.5173123071648953,
"grad_norm": 3.506739854812622,
"learning_rate": 4.888458045941269e-05,
"loss": 0.5997,
"step": 5130
},
{
"epoch": 3.524168666438121,
"grad_norm": 2.5815718173980713,
"learning_rate": 4.84736203161572e-05,
"loss": 0.6613,
"step": 5140
},
{
"epoch": 3.5310250257113474,
"grad_norm": 1.7911795377731323,
"learning_rate": 4.806384154978766e-05,
"loss": 0.6121,
"step": 5150
},
{
"epoch": 3.537881384984573,
"grad_norm": 3.8519790172576904,
"learning_rate": 4.7655253555553e-05,
"loss": 0.5567,
"step": 5160
},
{
"epoch": 3.544737744257799,
"grad_norm": 3.618427038192749,
"learning_rate": 4.724786570140056e-05,
"loss": 0.6715,
"step": 5170
},
{
"epoch": 3.5515941035310252,
"grad_norm": 2.0794076919555664,
"learning_rate": 4.684168732776132e-05,
"loss": 0.5609,
"step": 5180
},
{
"epoch": 3.558450462804251,
"grad_norm": 2.582932472229004,
"learning_rate": 4.6436727747335864e-05,
"loss": 0.6279,
"step": 5190
},
{
"epoch": 3.565306822077477,
"grad_norm": 2.6992735862731934,
"learning_rate": 4.6032996244880634e-05,
"loss": 0.639,
"step": 5200
},
{
"epoch": 3.5721631813507027,
"grad_norm": 3.3372395038604736,
"learning_rate": 4.563050207699519e-05,
"loss": 0.7179,
"step": 5210
},
{
"epoch": 3.5790195406239285,
"grad_norm": 1.9892611503601074,
"learning_rate": 4.522925447191005e-05,
"loss": 0.583,
"step": 5220
},
{
"epoch": 3.5858758998971547,
"grad_norm": 1.765522837638855,
"learning_rate": 4.4829262629274956e-05,
"loss": 0.5503,
"step": 5230
},
{
"epoch": 3.5927322591703805,
"grad_norm": 2.380490303039551,
"learning_rate": 4.443053571994803e-05,
"loss": 0.555,
"step": 5240
},
{
"epoch": 3.5995886184436063,
"grad_norm": 2.7402896881103516,
"learning_rate": 4.403308288578544e-05,
"loss": 0.5897,
"step": 5250
},
{
"epoch": 3.6064449777168326,
"grad_norm": 2.248931407928467,
"learning_rate": 4.3636913239431966e-05,
"loss": 0.5743,
"step": 5260
},
{
"epoch": 3.6133013369900584,
"grad_norm": 2.618868112564087,
"learning_rate": 4.324203586411186e-05,
"loss": 0.6093,
"step": 5270
},
{
"epoch": 3.620157696263284,
"grad_norm": 2.2212753295898438,
"learning_rate": 4.2848459813420724e-05,
"loss": 0.6123,
"step": 5280
},
{
"epoch": 3.62701405553651,
"grad_norm": 2.1903445720672607,
"learning_rate": 4.245619411111785e-05,
"loss": 0.6425,
"step": 5290
},
{
"epoch": 3.633870414809736,
"grad_norm": 2.3697597980499268,
"learning_rate": 4.2065247750919455e-05,
"loss": 0.7007,
"step": 5300
},
{
"epoch": 3.640726774082962,
"grad_norm": 2.1647536754608154,
"learning_rate": 4.167562969629233e-05,
"loss": 0.5604,
"step": 5310
},
{
"epoch": 3.647583133356188,
"grad_norm": 3.097696542739868,
"learning_rate": 4.128734888024833e-05,
"loss": 0.6153,
"step": 5320
},
{
"epoch": 3.6544394926294137,
"grad_norm": 3.1014564037323,
"learning_rate": 4.090041420513978e-05,
"loss": 0.5866,
"step": 5330
},
{
"epoch": 3.66129585190264,
"grad_norm": 3.541137456893921,
"learning_rate": 4.0514834542455085e-05,
"loss": 0.7697,
"step": 5340
},
{
"epoch": 3.6681522111758658,
"grad_norm": 2.912750244140625,
"learning_rate": 4.0130618732615467e-05,
"loss": 0.6285,
"step": 5350
},
{
"epoch": 3.6750085704490916,
"grad_norm": 3.2789480686187744,
"learning_rate": 3.974777558477224e-05,
"loss": 0.5937,
"step": 5360
},
{
"epoch": 3.6818649297223174,
"grad_norm": 2.6167523860931396,
"learning_rate": 3.9366313876604966e-05,
"loss": 0.6394,
"step": 5370
},
{
"epoch": 3.688721288995543,
"grad_norm": 2.452730178833008,
"learning_rate": 3.898624235411997e-05,
"loss": 0.634,
"step": 5380
},
{
"epoch": 3.6955776482687694,
"grad_norm": 2.2829294204711914,
"learning_rate": 3.860756973144996e-05,
"loss": 0.6577,
"step": 5390
},
{
"epoch": 3.7024340075419953,
"grad_norm": 2.3391597270965576,
"learning_rate": 3.8230304690654304e-05,
"loss": 0.6229,
"step": 5400
},
{
"epoch": 3.709290366815221,
"grad_norm": 2.8178772926330566,
"learning_rate": 3.7854455881519757e-05,
"loss": 0.6546,
"step": 5410
},
{
"epoch": 3.716146726088447,
"grad_norm": 2.5491912364959717,
"learning_rate": 3.7480031921362316e-05,
"loss": 0.5136,
"step": 5420
},
{
"epoch": 3.7230030853616727,
"grad_norm": 3.2480504512786865,
"learning_rate": 3.7107041394829556e-05,
"loss": 0.6712,
"step": 5430
},
{
"epoch": 3.729859444634899,
"grad_norm": 2.2545714378356934,
"learning_rate": 3.673549285370395e-05,
"loss": 0.6131,
"step": 5440
},
{
"epoch": 3.7367158039081247,
"grad_norm": 2.1797900199890137,
"learning_rate": 3.636539481670656e-05,
"loss": 0.5145,
"step": 5450
},
{
"epoch": 3.7435721631813506,
"grad_norm": 2.1412277221679688,
"learning_rate": 3.5996755769301904e-05,
"loss": 0.6453,
"step": 5460
},
{
"epoch": 3.750428522454577,
"grad_norm": 2.3049139976501465,
"learning_rate": 3.562958416350334e-05,
"loss": 0.5419,
"step": 5470
},
{
"epoch": 3.7572848817278026,
"grad_norm": 2.415832281112671,
"learning_rate": 3.526388841767934e-05,
"loss": 0.6422,
"step": 5480
},
{
"epoch": 3.7641412410010284,
"grad_norm": 1.7117339372634888,
"learning_rate": 3.489967691636038e-05,
"loss": 0.6086,
"step": 5490
},
{
"epoch": 3.7709976002742542,
"grad_norm": 3.3377764225006104,
"learning_rate": 3.4536958010046715e-05,
"loss": 0.6449,
"step": 5500
},
{
"epoch": 3.77785395954748,
"grad_norm": 2.1959517002105713,
"learning_rate": 3.417574001501709e-05,
"loss": 0.6297,
"step": 5510
},
{
"epoch": 3.7847103188207063,
"grad_norm": 2.526855945587158,
"learning_rate": 3.381603121313781e-05,
"loss": 0.6232,
"step": 5520
},
{
"epoch": 3.791566678093932,
"grad_norm": 2.3273513317108154,
"learning_rate": 3.3457839851673045e-05,
"loss": 0.5473,
"step": 5530
},
{
"epoch": 3.798423037367158,
"grad_norm": 3.1892895698547363,
"learning_rate": 3.310117414309563e-05,
"loss": 0.6174,
"step": 5540
},
{
"epoch": 3.805279396640384,
"grad_norm": 3.0987548828125,
"learning_rate": 3.2746042264898905e-05,
"loss": 0.677,
"step": 5550
},
{
"epoch": 3.81213575591361,
"grad_norm": 2.421818256378174,
"learning_rate": 3.2392452359409064e-05,
"loss": 0.5264,
"step": 5560
},
{
"epoch": 3.818992115186836,
"grad_norm": 2.0628252029418945,
"learning_rate": 3.2040412533598554e-05,
"loss": 0.6114,
"step": 5570
},
{
"epoch": 3.8258484744600616,
"grad_norm": 3.190624475479126,
"learning_rate": 3.1689930858900263e-05,
"loss": 0.6122,
"step": 5580
},
{
"epoch": 3.8327048337332874,
"grad_norm": 2.274360418319702,
"learning_rate": 3.134101537102232e-05,
"loss": 0.7138,
"step": 5590
},
{
"epoch": 3.8395611930065137,
"grad_norm": 2.9021377563476562,
"learning_rate": 3.099367406976397e-05,
"loss": 0.4782,
"step": 5600
},
{
"epoch": 3.8464175522797395,
"grad_norm": 3.5629074573516846,
"learning_rate": 3.0647914918832054e-05,
"loss": 0.605,
"step": 5610
},
{
"epoch": 3.8532739115529653,
"grad_norm": 3.650761604309082,
"learning_rate": 3.0303745845658595e-05,
"loss": 0.6043,
"step": 5620
},
{
"epoch": 3.8601302708261915,
"grad_norm": 2.938598394393921,
"learning_rate": 2.9961174741218833e-05,
"loss": 0.5623,
"step": 5630
},
{
"epoch": 3.8669866300994173,
"grad_norm": 2.986330032348633,
"learning_rate": 2.9620209459850412e-05,
"loss": 0.6794,
"step": 5640
},
{
"epoch": 3.873842989372643,
"grad_norm": 2.9300005435943604,
"learning_rate": 2.9280857819073347e-05,
"loss": 0.6184,
"step": 5650
},
{
"epoch": 3.880699348645869,
"grad_norm": 3.191838264465332,
"learning_rate": 2.894312759941068e-05,
"loss": 0.5835,
"step": 5660
},
{
"epoch": 3.8875557079190948,
"grad_norm": 3.251615285873413,
"learning_rate": 2.8607026544210114e-05,
"loss": 0.482,
"step": 5670
},
{
"epoch": 3.894412067192321,
"grad_norm": 2.271155595779419,
"learning_rate": 2.8272562359466502e-05,
"loss": 0.5164,
"step": 5680
},
{
"epoch": 3.901268426465547,
"grad_norm": 2.547264575958252,
"learning_rate": 2.793974271364528e-05,
"loss": 0.6159,
"step": 5690
},
{
"epoch": 3.9081247857387726,
"grad_norm": 2.5156540870666504,
"learning_rate": 2.760857523750637e-05,
"loss": 0.6175,
"step": 5700
},
{
"epoch": 3.914981145011999,
"grad_norm": 2.334170341491699,
"learning_rate": 2.7279067523929493e-05,
"loss": 0.6351,
"step": 5710
},
{
"epoch": 3.9218375042852247,
"grad_norm": 2.4579522609710693,
"learning_rate": 2.6951227127739898e-05,
"loss": 0.606,
"step": 5720
},
{
"epoch": 3.9286938635584505,
"grad_norm": 2.738943576812744,
"learning_rate": 2.6625061565535337e-05,
"loss": 0.5695,
"step": 5730
},
{
"epoch": 3.9355502228316763,
"grad_norm": 4.373673439025879,
"learning_rate": 2.630057831551351e-05,
"loss": 0.563,
"step": 5740
},
{
"epoch": 3.942406582104902,
"grad_norm": 2.3918118476867676,
"learning_rate": 2.5977784817300742e-05,
"loss": 0.612,
"step": 5750
},
{
"epoch": 3.9492629413781284,
"grad_norm": 3.0279042720794678,
"learning_rate": 2.5656688471781453e-05,
"loss": 0.6402,
"step": 5760
},
{
"epoch": 3.956119300651354,
"grad_norm": 2.2601287364959717,
"learning_rate": 2.533729664092831e-05,
"loss": 0.5693,
"step": 5770
},
{
"epoch": 3.96297565992458,
"grad_norm": 4.802177906036377,
"learning_rate": 2.501961664763357e-05,
"loss": 0.5653,
"step": 5780
},
{
"epoch": 3.969832019197806,
"grad_norm": 1.8669755458831787,
"learning_rate": 2.4703655775541102e-05,
"loss": 0.5914,
"step": 5790
},
{
"epoch": 3.9766883784710316,
"grad_norm": 2.633484125137329,
"learning_rate": 2.438942126887953e-05,
"loss": 0.4817,
"step": 5800
},
{
"epoch": 3.983544737744258,
"grad_norm": 2.3888065814971924,
"learning_rate": 2.407692033229594e-05,
"loss": 0.5817,
"step": 5810
},
{
"epoch": 3.9904010970174837,
"grad_norm": 2.9784138202667236,
"learning_rate": 2.3766160130690784e-05,
"loss": 0.602,
"step": 5820
},
{
"epoch": 3.9972574562907095,
"grad_norm": 2.4055368900299072,
"learning_rate": 2.3457147789053747e-05,
"loss": 0.6197,
"step": 5830
},
{
"epoch": 4.004113815563936,
"grad_norm": 2.228529214859009,
"learning_rate": 2.314989039230011e-05,
"loss": 0.4612,
"step": 5840
},
{
"epoch": 4.010970174837161,
"grad_norm": 2.3445403575897217,
"learning_rate": 2.284439498510854e-05,
"loss": 0.4838,
"step": 5850
},
{
"epoch": 4.017826534110387,
"grad_norm": 2.2850747108459473,
"learning_rate": 2.2540668571759428e-05,
"loss": 0.4838,
"step": 5860
},
{
"epoch": 4.024682893383614,
"grad_norm": 2.4510273933410645,
"learning_rate": 2.2238718115974454e-05,
"loss": 0.385,
"step": 5870
},
{
"epoch": 4.031539252656839,
"grad_norm": 2.711827278137207,
"learning_rate": 2.193855054075674e-05,
"loss": 0.3388,
"step": 5880
},
{
"epoch": 4.038395611930065,
"grad_norm": 4.566797733306885,
"learning_rate": 2.1640172728232267e-05,
"loss": 0.4607,
"step": 5890
},
{
"epoch": 4.045251971203291,
"grad_norm": 3.350759506225586,
"learning_rate": 2.1343591519491966e-05,
"loss": 0.4704,
"step": 5900
},
{
"epoch": 4.052108330476517,
"grad_norm": 2.5083415508270264,
"learning_rate": 2.104881371443502e-05,
"loss": 0.4961,
"step": 5910
},
{
"epoch": 4.058964689749743,
"grad_norm": 2.662445068359375,
"learning_rate": 2.075584607161283e-05,
"loss": 0.4776,
"step": 5920
},
{
"epoch": 4.0658210490229685,
"grad_norm": 1.9160922765731812,
"learning_rate": 2.0464695308074032e-05,
"loss": 0.4301,
"step": 5930
},
{
"epoch": 4.072677408296195,
"grad_norm": 1.7301816940307617,
"learning_rate": 2.01753680992107e-05,
"loss": 0.4956,
"step": 5940
},
{
"epoch": 4.079533767569421,
"grad_norm": 2.6820240020751953,
"learning_rate": 1.9887871078605037e-05,
"loss": 0.4724,
"step": 5950
},
{
"epoch": 4.086390126842646,
"grad_norm": 2.909573793411255,
"learning_rate": 1.9602210837877423e-05,
"loss": 0.4075,
"step": 5960
},
{
"epoch": 4.093246486115873,
"grad_norm": 2.7761423587799072,
"learning_rate": 1.931839392653525e-05,
"loss": 0.4034,
"step": 5970
},
{
"epoch": 4.100102845389098,
"grad_norm": 2.786170244216919,
"learning_rate": 1.903642685182283e-05,
"loss": 0.4334,
"step": 5980
},
{
"epoch": 4.106959204662324,
"grad_norm": 2.5752928256988525,
"learning_rate": 1.875631607857209e-05,
"loss": 0.5167,
"step": 5990
},
{
"epoch": 4.1138155639355505,
"grad_norm": 5.471198081970215,
"learning_rate": 1.8478068029054386e-05,
"loss": 0.4546,
"step": 6000
},
{
"epoch": 4.1138155639355505,
"eval_loss": 1.6329894065856934,
"eval_runtime": 29.3853,
"eval_samples_per_second": 83.613,
"eval_steps_per_second": 10.481,
"step": 6000
},
{
"epoch": 4.120671923208776,
"grad_norm": 3.2511181831359863,
"learning_rate": 1.8201689082833272e-05,
"loss": 0.4688,
"step": 6010
},
{
"epoch": 4.127528282482002,
"grad_norm": 2.027602195739746,
"learning_rate": 1.7927185576618244e-05,
"loss": 0.457,
"step": 6020
},
{
"epoch": 4.134384641755228,
"grad_norm": 3.83060622215271,
"learning_rate": 1.7654563804119396e-05,
"loss": 0.4621,
"step": 6030
},
{
"epoch": 4.141241001028454,
"grad_norm": 1.915252447128296,
"learning_rate": 1.7383830015903223e-05,
"loss": 0.5149,
"step": 6040
},
{
"epoch": 4.14809736030168,
"grad_norm": 3.230914831161499,
"learning_rate": 1.711499041924921e-05,
"loss": 0.4142,
"step": 6050
},
{
"epoch": 4.154953719574905,
"grad_norm": 2.2559401988983154,
"learning_rate": 1.684805117800755e-05,
"loss": 0.4578,
"step": 6060
},
{
"epoch": 4.161810078848132,
"grad_norm": 3.0432651042938232,
"learning_rate": 1.6583018412457784e-05,
"loss": 0.4685,
"step": 6070
},
{
"epoch": 4.168666438121358,
"grad_norm": 2.882570743560791,
"learning_rate": 1.6319898199168627e-05,
"loss": 0.4859,
"step": 6080
},
{
"epoch": 4.175522797394583,
"grad_norm": 2.090360164642334,
"learning_rate": 1.6058696570858422e-05,
"loss": 0.5028,
"step": 6090
},
{
"epoch": 4.1823791566678095,
"grad_norm": 2.042056083679199,
"learning_rate": 1.5799419516256985e-05,
"loss": 0.4483,
"step": 6100
},
{
"epoch": 4.189235515941036,
"grad_norm": 1.4510776996612549,
"learning_rate": 1.5542072979968268e-05,
"loss": 0.4544,
"step": 6110
},
{
"epoch": 4.196091875214261,
"grad_norm": 2.745123863220215,
"learning_rate": 1.5286662862334035e-05,
"loss": 0.5091,
"step": 6120
},
{
"epoch": 4.202948234487487,
"grad_norm": 2.3903136253356934,
"learning_rate": 1.5033195019298563e-05,
"loss": 0.4537,
"step": 6130
},
{
"epoch": 4.209804593760713,
"grad_norm": 2.7773876190185547,
"learning_rate": 1.4781675262274419e-05,
"loss": 0.4708,
"step": 6140
},
{
"epoch": 4.216660953033939,
"grad_norm": 2.1511118412017822,
"learning_rate": 1.4532109358009272e-05,
"loss": 0.4282,
"step": 6150
},
{
"epoch": 4.223517312307165,
"grad_norm": 2.3747787475585938,
"learning_rate": 1.4284503028453522e-05,
"loss": 0.3366,
"step": 6160
},
{
"epoch": 4.230373671580391,
"grad_norm": 2.423283576965332,
"learning_rate": 1.4038861950629234e-05,
"loss": 0.4201,
"step": 6170
},
{
"epoch": 4.237230030853617,
"grad_norm": 3.218165159225464,
"learning_rate": 1.379519175649997e-05,
"loss": 0.386,
"step": 6180
},
{
"epoch": 4.244086390126842,
"grad_norm": 2.6405014991760254,
"learning_rate": 1.3553498032841605e-05,
"loss": 0.4341,
"step": 6190
},
{
"epoch": 4.250942749400068,
"grad_norm": 3.0675742626190186,
"learning_rate": 1.3313786321114252e-05,
"loss": 0.476,
"step": 6200
},
{
"epoch": 4.257799108673295,
"grad_norm": 2.122243642807007,
"learning_rate": 1.307606211733522e-05,
"loss": 0.456,
"step": 6210
},
{
"epoch": 4.26465546794652,
"grad_norm": 2.534996509552002,
"learning_rate": 1.2840330871953077e-05,
"loss": 0.3959,
"step": 6220
},
{
"epoch": 4.271511827219746,
"grad_norm": 2.619480848312378,
"learning_rate": 1.2606597989722524e-05,
"loss": 0.4781,
"step": 6230
},
{
"epoch": 4.278368186492973,
"grad_norm": 2.5350911617279053,
"learning_rate": 1.237486882958061e-05,
"loss": 0.4387,
"step": 6240
},
{
"epoch": 4.285224545766198,
"grad_norm": 3.2731032371520996,
"learning_rate": 1.2145148704523779e-05,
"loss": 0.4438,
"step": 6250
},
{
"epoch": 4.292080905039424,
"grad_norm": 2.3770089149475098,
"learning_rate": 1.1917442881486174e-05,
"loss": 0.5292,
"step": 6260
},
{
"epoch": 4.29893726431265,
"grad_norm": 3.8733415603637695,
"learning_rate": 1.1691756581218726e-05,
"loss": 0.4823,
"step": 6270
},
{
"epoch": 4.305793623585876,
"grad_norm": 4.391140460968018,
"learning_rate": 1.1468094978169553e-05,
"loss": 0.4197,
"step": 6280
},
{
"epoch": 4.312649982859102,
"grad_norm": 2.645474910736084,
"learning_rate": 1.124646320036532e-05,
"loss": 0.5782,
"step": 6290
},
{
"epoch": 4.319506342132327,
"grad_norm": 1.8884179592132568,
"learning_rate": 1.1026866329293628e-05,
"loss": 0.4514,
"step": 6300
},
{
"epoch": 4.326362701405554,
"grad_norm": 2.595188617706299,
"learning_rate": 1.0809309399786527e-05,
"loss": 0.47,
"step": 6310
},
{
"epoch": 4.33321906067878,
"grad_norm": 3.458259344100952,
"learning_rate": 1.0593797399905037e-05,
"loss": 0.4688,
"step": 6320
},
{
"epoch": 4.340075419952005,
"grad_norm": 1.9553050994873047,
"learning_rate": 1.0380335270824904e-05,
"loss": 0.4013,
"step": 6330
},
{
"epoch": 4.3469317792252316,
"grad_norm": 1.8735579252243042,
"learning_rate": 1.0168927906723168e-05,
"loss": 0.4051,
"step": 6340
},
{
"epoch": 4.353788138498457,
"grad_norm": 3.0255327224731445,
"learning_rate": 9.959580154666015e-06,
"loss": 0.4805,
"step": 6350
},
{
"epoch": 4.360644497771683,
"grad_norm": 2.6736433506011963,
"learning_rate": 9.752296814497697e-06,
"loss": 0.4719,
"step": 6360
},
{
"epoch": 4.367500857044909,
"grad_norm": 2.2465081214904785,
"learning_rate": 9.547082638730376e-06,
"loss": 0.4025,
"step": 6370
},
{
"epoch": 4.374357216318135,
"grad_norm": 2.689293622970581,
"learning_rate": 9.343942332435218e-06,
"loss": 0.4303,
"step": 6380
},
{
"epoch": 4.381213575591361,
"grad_norm": 2.96524977684021,
"learning_rate": 9.142880553134514e-06,
"loss": 0.4317,
"step": 6390
},
{
"epoch": 4.388069934864587,
"grad_norm": 2.312793254852295,
"learning_rate": 8.943901910694941e-06,
"loss": 0.4613,
"step": 6400
},
{
"epoch": 4.394926294137813,
"grad_norm": 1.9500372409820557,
"learning_rate": 8.747010967221747e-06,
"loss": 0.535,
"step": 6410
},
{
"epoch": 4.401782653411039,
"grad_norm": 2.1753222942352295,
"learning_rate": 8.552212236954293e-06,
"loss": 0.4357,
"step": 6420
},
{
"epoch": 4.408639012684264,
"grad_norm": 2.4783313274383545,
"learning_rate": 8.3595101861624e-06,
"loss": 0.4186,
"step": 6430
},
{
"epoch": 4.4154953719574905,
"grad_norm": 1.5100256204605103,
"learning_rate": 8.168909233044153e-06,
"loss": 0.4332,
"step": 6440
},
{
"epoch": 4.422351731230717,
"grad_norm": 3.4225423336029053,
"learning_rate": 7.980413747624383e-06,
"loss": 0.4127,
"step": 6450
},
{
"epoch": 4.429208090503942,
"grad_norm": 2.8790810108184814,
"learning_rate": 7.7940280516546e-06,
"loss": 0.4899,
"step": 6460
},
{
"epoch": 4.436064449777168,
"grad_norm": 2.543872833251953,
"learning_rate": 7.609756418513914e-06,
"loss": 0.5136,
"step": 6470
},
{
"epoch": 4.442920809050394,
"grad_norm": 1.3648442029953003,
"learning_rate": 7.427603073110967e-06,
"loss": 0.5212,
"step": 6480
},
{
"epoch": 4.44977716832362,
"grad_norm": 1.9477663040161133,
"learning_rate": 7.247572191787167e-06,
"loss": 0.4124,
"step": 6490
},
{
"epoch": 4.456633527596846,
"grad_norm": 3.6821651458740234,
"learning_rate": 7.069667902220822e-06,
"loss": 0.4763,
"step": 6500
},
{
"epoch": 4.463489886870072,
"grad_norm": 2.303973913192749,
"learning_rate": 6.8938942833326695e-06,
"loss": 0.4455,
"step": 6510
},
{
"epoch": 4.470346246143298,
"grad_norm": 2.560413360595703,
"learning_rate": 6.720255365192163e-06,
"loss": 0.4055,
"step": 6520
},
{
"epoch": 4.477202605416524,
"grad_norm": 3.7052018642425537,
"learning_rate": 6.548755128925188e-06,
"loss": 0.4169,
"step": 6530
},
{
"epoch": 4.4840589646897495,
"grad_norm": 1.845646619796753,
"learning_rate": 6.379397506622808e-06,
"loss": 0.3946,
"step": 6540
},
{
"epoch": 4.490915323962976,
"grad_norm": 2.254162549972534,
"learning_rate": 6.212186381250984e-06,
"loss": 0.441,
"step": 6550
},
{
"epoch": 4.497771683236202,
"grad_norm": 2.616757392883301,
"learning_rate": 6.047125586561686e-06,
"loss": 0.5491,
"step": 6560
},
{
"epoch": 4.504628042509427,
"grad_norm": 2.068566083908081,
"learning_rate": 5.884218907004901e-06,
"loss": 0.4361,
"step": 6570
},
{
"epoch": 4.511484401782654,
"grad_norm": 3.0345873832702637,
"learning_rate": 5.723470077641924e-06,
"loss": 0.4673,
"step": 6580
},
{
"epoch": 4.518340761055879,
"grad_norm": 2.5010392665863037,
"learning_rate": 5.564882784059689e-06,
"loss": 0.4171,
"step": 6590
},
{
"epoch": 4.525197120329105,
"grad_norm": 1.6839686632156372,
"learning_rate": 5.408460662286241e-06,
"loss": 0.4807,
"step": 6600
},
{
"epoch": 4.5320534796023315,
"grad_norm": 1.956069827079773,
"learning_rate": 5.2542072987074695e-06,
"loss": 0.3572,
"step": 6610
},
{
"epoch": 4.538909838875557,
"grad_norm": 2.861846446990967,
"learning_rate": 5.1021262299847495e-06,
"loss": 0.4342,
"step": 6620
},
{
"epoch": 4.545766198148783,
"grad_norm": 4.325196743011475,
"learning_rate": 4.952220942973973e-06,
"loss": 0.4651,
"step": 6630
},
{
"epoch": 4.5526225574220085,
"grad_norm": 2.3949451446533203,
"learning_rate": 4.8044948746454935e-06,
"loss": 0.4429,
"step": 6640
},
{
"epoch": 4.559478916695235,
"grad_norm": 2.8822643756866455,
"learning_rate": 4.6589514120054525e-06,
"loss": 0.3782,
"step": 6650
},
{
"epoch": 4.566335275968461,
"grad_norm": 3.471057653427124,
"learning_rate": 4.515593892017999e-06,
"loss": 0.415,
"step": 6660
},
{
"epoch": 4.573191635241686,
"grad_norm": 3.1017813682556152,
"learning_rate": 4.3744256015288645e-06,
"loss": 0.5172,
"step": 6670
},
{
"epoch": 4.580047994514913,
"grad_norm": 2.956613779067993,
"learning_rate": 4.235449777189937e-06,
"loss": 0.491,
"step": 6680
},
{
"epoch": 4.586904353788139,
"grad_norm": 2.5809433460235596,
"learning_rate": 4.098669605385142e-06,
"loss": 0.4805,
"step": 6690
},
{
"epoch": 4.593760713061364,
"grad_norm": 2.229520082473755,
"learning_rate": 3.964088222157303e-06,
"loss": 0.4809,
"step": 6700
},
{
"epoch": 4.6006170723345905,
"grad_norm": 2.391052722930908,
"learning_rate": 3.83170871313625e-06,
"loss": 0.4123,
"step": 6710
},
{
"epoch": 4.607473431607816,
"grad_norm": 2.391326904296875,
"learning_rate": 3.7015341134681526e-06,
"loss": 0.3621,
"step": 6720
},
{
"epoch": 4.614329790881042,
"grad_norm": 3.428799867630005,
"learning_rate": 3.573567407745826e-06,
"loss": 0.4297,
"step": 6730
},
{
"epoch": 4.621186150154268,
"grad_norm": 2.7340753078460693,
"learning_rate": 3.447811529940348e-06,
"loss": 0.4269,
"step": 6740
},
{
"epoch": 4.628042509427494,
"grad_norm": 2.597050666809082,
"learning_rate": 3.3242693633337983e-06,
"loss": 0.3771,
"step": 6750
},
{
"epoch": 4.63489886870072,
"grad_norm": 2.1940290927886963,
"learning_rate": 3.2029437404531683e-06,
"loss": 0.3715,
"step": 6760
},
{
"epoch": 4.641755227973945,
"grad_norm": 3.0640647411346436,
"learning_rate": 3.083837443005355e-06,
"loss": 0.4535,
"step": 6770
},
{
"epoch": 4.648611587247172,
"grad_norm": 2.0457632541656494,
"learning_rate": 2.966953201813427e-06,
"loss": 0.4653,
"step": 6780
},
{
"epoch": 4.655467946520398,
"grad_norm": 4.1095733642578125,
"learning_rate": 2.8522936967540383e-06,
"loss": 0.3734,
"step": 6790
},
{
"epoch": 4.662324305793623,
"grad_norm": 1.960434079170227,
"learning_rate": 2.739861556695933e-06,
"loss": 0.4308,
"step": 6800
},
{
"epoch": 4.6691806650668495,
"grad_norm": 2.4154977798461914,
"learning_rate": 2.6296593594396733e-06,
"loss": 0.4377,
"step": 6810
},
{
"epoch": 4.676037024340076,
"grad_norm": 4.1492486000061035,
"learning_rate": 2.5216896316585746e-06,
"loss": 0.4988,
"step": 6820
},
{
"epoch": 4.682893383613301,
"grad_norm": 1.40762460231781,
"learning_rate": 2.4159548488407733e-06,
"loss": 0.5145,
"step": 6830
},
{
"epoch": 4.689749742886527,
"grad_norm": 2.824812889099121,
"learning_rate": 2.31245743523244e-06,
"loss": 0.4712,
"step": 6840
},
{
"epoch": 4.696606102159754,
"grad_norm": 6.674090385437012,
"learning_rate": 2.2111997637821792e-06,
"loss": 0.4395,
"step": 6850
},
{
"epoch": 4.703462461432979,
"grad_norm": 2.4539730548858643,
"learning_rate": 2.1121841560867273e-06,
"loss": 0.3955,
"step": 6860
},
{
"epoch": 4.710318820706205,
"grad_norm": 1.873875379562378,
"learning_rate": 2.015412882337564e-06,
"loss": 0.4142,
"step": 6870
},
{
"epoch": 4.717175179979431,
"grad_norm": 11.428731918334961,
"learning_rate": 1.9208881612689967e-06,
"loss": 0.5176,
"step": 6880
},
{
"epoch": 4.724031539252657,
"grad_norm": 2.4507086277008057,
"learning_rate": 1.828612160107257e-06,
"loss": 0.4799,
"step": 6890
},
{
"epoch": 4.730887898525883,
"grad_norm": 2.5675251483917236,
"learning_rate": 1.7385869945207523e-06,
"loss": 0.4605,
"step": 6900
},
{
"epoch": 4.7377442577991085,
"grad_norm": 2.8171119689941406,
"learning_rate": 1.650814728571648e-06,
"loss": 0.4743,
"step": 6910
},
{
"epoch": 4.744600617072335,
"grad_norm": 2.8307461738586426,
"learning_rate": 1.565297374668473e-06,
"loss": 0.4494,
"step": 6920
},
{
"epoch": 4.75145697634556,
"grad_norm": 2.4572560787200928,
"learning_rate": 1.4820368935200002e-06,
"loss": 0.4678,
"step": 6930
},
{
"epoch": 4.758313335618786,
"grad_norm": 2.9318461418151855,
"learning_rate": 1.4010351940903276e-06,
"loss": 0.3936,
"step": 6940
},
{
"epoch": 4.765169694892013,
"grad_norm": 2.561730146408081,
"learning_rate": 1.3222941335550353e-06,
"loss": 0.4745,
"step": 6950
},
{
"epoch": 4.772026054165238,
"grad_norm": 2.51572847366333,
"learning_rate": 1.2458155172587083e-06,
"loss": 0.4616,
"step": 6960
},
{
"epoch": 4.778882413438464,
"grad_norm": 2.1606862545013428,
"learning_rate": 1.171601098673436e-06,
"loss": 0.4764,
"step": 6970
},
{
"epoch": 4.7857387727116905,
"grad_norm": 2.5997395515441895,
"learning_rate": 1.0996525793586677e-06,
"loss": 0.497,
"step": 6980
},
{
"epoch": 4.792595131984916,
"grad_norm": 2.7361416816711426,
"learning_rate": 1.02997160892222e-06,
"loss": 0.3996,
"step": 6990
},
{
"epoch": 4.799451491258142,
"grad_norm": 2.5793089866638184,
"learning_rate": 9.625597849823976e-07,
"loss": 0.448,
"step": 7000
},
{
"epoch": 4.799451491258142,
"eval_loss": 1.6523703336715698,
"eval_runtime": 29.3476,
"eval_samples_per_second": 83.721,
"eval_steps_per_second": 10.495,
"step": 7000
},
{
"epoch": 4.806307850531368,
"grad_norm": 2.204183340072632,
"learning_rate": 8.974186531313988e-07,
"loss": 0.4577,
"step": 7010
},
{
"epoch": 4.813164209804594,
"grad_norm": 2.363396406173706,
"learning_rate": 8.345497068998897e-07,
"loss": 0.4684,
"step": 7020
},
{
"epoch": 4.82002056907782,
"grad_norm": 2.908766269683838,
"learning_rate": 7.739543877227196e-07,
"loss": 0.4612,
"step": 7030
},
{
"epoch": 4.826876928351045,
"grad_norm": 2.48903489112854,
"learning_rate": 7.15634084905914e-07,
"loss": 0.4656,
"step": 7040
},
{
"epoch": 4.833733287624272,
"grad_norm": 2.835102081298828,
"learning_rate": 6.595901355947898e-07,
"loss": 0.3777,
"step": 7050
},
{
"epoch": 4.840589646897497,
"grad_norm": 2.326117992401123,
"learning_rate": 6.058238247433234e-07,
"loss": 0.4727,
"step": 7060
},
{
"epoch": 4.847446006170723,
"grad_norm": 2.406785488128662,
"learning_rate": 5.543363850846972e-07,
"loss": 0.3892,
"step": 7070
},
{
"epoch": 4.854302365443949,
"grad_norm": 2.84722900390625,
"learning_rate": 5.05128997102966e-07,
"loss": 0.4482,
"step": 7080
},
{
"epoch": 4.861158724717175,
"grad_norm": 2.8080146312713623,
"learning_rate": 4.582027890060792e-07,
"loss": 0.3906,
"step": 7090
},
{
"epoch": 4.868015083990401,
"grad_norm": 2.620560884475708,
"learning_rate": 4.1355883669997873e-07,
"loss": 0.5222,
"step": 7100
},
{
"epoch": 4.874871443263627,
"grad_norm": 2.826660394668579,
"learning_rate": 3.7119816376390836e-07,
"loss": 0.4543,
"step": 7110
},
{
"epoch": 4.881727802536853,
"grad_norm": 2.5080008506774902,
"learning_rate": 3.311217414269874e-07,
"loss": 0.4995,
"step": 7120
},
{
"epoch": 4.888584161810079,
"grad_norm": 2.6868581771850586,
"learning_rate": 2.933304885459065e-07,
"loss": 0.4136,
"step": 7130
},
{
"epoch": 4.895440521083305,
"grad_norm": 2.1843154430389404,
"learning_rate": 2.5782527158388916e-07,
"loss": 0.4089,
"step": 7140
},
{
"epoch": 4.9022968803565306,
"grad_norm": 2.803866386413574,
"learning_rate": 2.2460690459079615e-07,
"loss": 0.4824,
"step": 7150
},
{
"epoch": 4.909153239629757,
"grad_norm": 4.336270332336426,
"learning_rate": 1.9367614918449627e-07,
"loss": 0.5094,
"step": 7160
},
{
"epoch": 4.916009598902982,
"grad_norm": 5.751471996307373,
"learning_rate": 1.6503371453335803e-07,
"loss": 0.3751,
"step": 7170
},
{
"epoch": 4.922865958176208,
"grad_norm": 1.8664264678955078,
"learning_rate": 1.386802573400514e-07,
"loss": 0.4913,
"step": 7180
},
{
"epoch": 4.929722317449435,
"grad_norm": 2.173042058944702,
"learning_rate": 1.1461638182643786e-07,
"loss": 0.429,
"step": 7190
},
{
"epoch": 4.93657867672266,
"grad_norm": 3.0668938159942627,
"learning_rate": 9.284263971972573e-08,
"loss": 0.4568,
"step": 7200
},
{
"epoch": 4.943435035995886,
"grad_norm": 7.5888352394104,
"learning_rate": 7.33595302398582e-08,
"loss": 0.427,
"step": 7210
},
{
"epoch": 4.950291395269112,
"grad_norm": 2.1892218589782715,
"learning_rate": 5.616750008803351e-08,
"loss": 0.4677,
"step": 7220
},
{
"epoch": 4.957147754542338,
"grad_norm": 2.2843613624572754,
"learning_rate": 4.126694343644655e-08,
"loss": 0.4085,
"step": 7230
},
{
"epoch": 4.964004113815564,
"grad_norm": 2.025791883468628,
"learning_rate": 2.8658201919296023e-08,
"loss": 0.4369,
"step": 7240
},
{
"epoch": 4.9708604730887895,
"grad_norm": 1.981285810470581,
"learning_rate": 1.8341564624935194e-08,
"loss": 0.4147,
"step": 7250
},
{
"epoch": 4.977716832362016,
"grad_norm": 3.0213518142700195,
"learning_rate": 1.031726808921052e-08,
"loss": 0.434,
"step": 7260
},
{
"epoch": 4.984573191635242,
"grad_norm": 2.9700491428375244,
"learning_rate": 4.585496290110403e-09,
"loss": 0.4067,
"step": 7270
},
{
"epoch": 4.991429550908467,
"grad_norm": 1.8337976932525635,
"learning_rate": 1.1463806434686143e-09,
"loss": 0.5116,
"step": 7280
},
{
"epoch": 4.998285910181694,
"grad_norm": 4.141937732696533,
"learning_rate": 0.0,
"loss": 0.5268,
"step": 7290
},
{
"epoch": 4.998285910181694,
"step": 7290,
"total_flos": 2.0655422352392192e+18,
"train_loss": 0.9170206386202484,
"train_runtime": 7769.9064,
"train_samples_per_second": 30.03,
"train_steps_per_second": 0.938
}
],
"logging_steps": 10,
"max_steps": 7290,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.0655422352392192e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}