{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 6568,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015225624726414556,
"grad_norm": 18.75,
"learning_rate": 0.0009090909090909091,
"loss": 2.9749,
"step": 10
},
{
"epoch": 0.003045124945282911,
"grad_norm": 8.0625,
"learning_rate": 0.0019191919191919192,
"loss": 1.9177,
"step": 20
},
{
"epoch": 0.004567687417924367,
"grad_norm": 5.5,
"learning_rate": 0.0029292929292929295,
"loss": 1.7954,
"step": 30
},
{
"epoch": 0.006090249890565822,
"grad_norm": 5.0,
"learning_rate": 0.00393939393939394,
"loss": 1.8602,
"step": 40
},
{
"epoch": 0.007612812363207278,
"grad_norm": 3.9375,
"learning_rate": 0.00494949494949495,
"loss": 1.9204,
"step": 50
},
{
"epoch": 0.009135374835848734,
"grad_norm": 4.15625,
"learning_rate": 0.005959595959595959,
"loss": 2.0757,
"step": 60
},
{
"epoch": 0.01065793730849019,
"grad_norm": 3.78125,
"learning_rate": 0.00696969696969697,
"loss": 2.1545,
"step": 70
},
{
"epoch": 0.012180499781131645,
"grad_norm": 3.0625,
"learning_rate": 0.007979797979797981,
"loss": 2.3187,
"step": 80
},
{
"epoch": 0.0137030622537731,
"grad_norm": 3.40625,
"learning_rate": 0.00898989898989899,
"loss": 2.3589,
"step": 90
},
{
"epoch": 0.015225624726414555,
"grad_norm": 2.828125,
"learning_rate": 0.01,
"loss": 2.4423,
"step": 100
},
{
"epoch": 0.016748187199056012,
"grad_norm": 3.125,
"learning_rate": 0.01101010101010101,
"loss": 2.6021,
"step": 110
},
{
"epoch": 0.018270749671697468,
"grad_norm": 3.265625,
"learning_rate": 0.01202020202020202,
"loss": 2.6122,
"step": 120
},
{
"epoch": 0.019793312144338923,
"grad_norm": 2.4375,
"learning_rate": 0.013030303030303031,
"loss": 2.5902,
"step": 130
},
{
"epoch": 0.02131587461698038,
"grad_norm": 2.328125,
"learning_rate": 0.01404040404040404,
"loss": 2.6305,
"step": 140
},
{
"epoch": 0.022838437089621834,
"grad_norm": 2.234375,
"learning_rate": 0.01505050505050505,
"loss": 2.6464,
"step": 150
},
{
"epoch": 0.02436099956226329,
"grad_norm": 2.015625,
"learning_rate": 0.01606060606060606,
"loss": 2.6488,
"step": 160
},
{
"epoch": 0.025883562034904745,
"grad_norm": 2.109375,
"learning_rate": 0.01707070707070707,
"loss": 2.6837,
"step": 170
},
{
"epoch": 0.0274061245075462,
"grad_norm": 2.109375,
"learning_rate": 0.018080808080808083,
"loss": 2.7353,
"step": 180
},
{
"epoch": 0.028928686980187655,
"grad_norm": 3.0625,
"learning_rate": 0.019090909090909092,
"loss": 2.7126,
"step": 190
},
{
"epoch": 0.03045124945282911,
"grad_norm": 1.765625,
"learning_rate": 0.019999998783839547,
"loss": 2.7545,
"step": 200
},
{
"epoch": 0.031973811925470566,
"grad_norm": 1.9609375,
"learning_rate": 0.019999852844943353,
"loss": 2.7003,
"step": 210
},
{
"epoch": 0.033496374398112025,
"grad_norm": 1.6640625,
"learning_rate": 0.019999463678024316,
"loss": 2.7056,
"step": 220
},
{
"epoch": 0.03501893687075348,
"grad_norm": 1.625,
"learning_rate": 0.019998831292548203,
"loss": 2.6891,
"step": 230
},
{
"epoch": 0.036541499343394936,
"grad_norm": 1.1875,
"learning_rate": 0.01999795570389663,
"loss": 2.6395,
"step": 240
},
{
"epoch": 0.03806406181603639,
"grad_norm": 1.28125,
"learning_rate": 0.019996836933366676,
"loss": 2.6503,
"step": 250
},
{
"epoch": 0.039586624288677846,
"grad_norm": 1.15625,
"learning_rate": 0.01999547500817038,
"loss": 2.6254,
"step": 260
},
{
"epoch": 0.0411091867613193,
"grad_norm": 1.34375,
"learning_rate": 0.019993869961434065,
"loss": 2.6104,
"step": 270
},
{
"epoch": 0.04263174923396076,
"grad_norm": 1.0546875,
"learning_rate": 0.01999202183219754,
"loss": 2.6056,
"step": 280
},
{
"epoch": 0.04415431170660221,
"grad_norm": 1.0390625,
"learning_rate": 0.019989930665413148,
"loss": 2.5625,
"step": 290
},
{
"epoch": 0.04567687417924367,
"grad_norm": 1.015625,
"learning_rate": 0.019987596511944674,
"loss": 2.5482,
"step": 300
},
{
"epoch": 0.04719943665188512,
"grad_norm": 0.94921875,
"learning_rate": 0.019985019428566106,
"loss": 2.5616,
"step": 310
},
{
"epoch": 0.04872199912452658,
"grad_norm": 0.9921875,
"learning_rate": 0.019982199477960257,
"loss": 2.5245,
"step": 320
},
{
"epoch": 0.05024456159716803,
"grad_norm": 0.8984375,
"learning_rate": 0.01997913672871724,
"loss": 2.5713,
"step": 330
},
{
"epoch": 0.05176712406980949,
"grad_norm": 0.91015625,
"learning_rate": 0.019975831255332793,
"loss": 2.497,
"step": 340
},
{
"epoch": 0.05328968654245095,
"grad_norm": 0.92578125,
"learning_rate": 0.01997228313820647,
"loss": 2.4797,
"step": 350
},
{
"epoch": 0.0548122490150924,
"grad_norm": 0.93359375,
"learning_rate": 0.019968492463639704,
"loss": 2.5012,
"step": 360
},
{
"epoch": 0.05633481148773386,
"grad_norm": 0.953125,
"learning_rate": 0.019964459323833665,
"loss": 2.4267,
"step": 370
},
{
"epoch": 0.05785737396037531,
"grad_norm": 0.89453125,
"learning_rate": 0.01996018381688707,
"loss": 2.4353,
"step": 380
},
{
"epoch": 0.05937993643301677,
"grad_norm": 0.90625,
"learning_rate": 0.019955666046793757,
"loss": 2.4498,
"step": 390
},
{
"epoch": 0.06090249890565822,
"grad_norm": 0.921875,
"learning_rate": 0.01995090612344017,
"loss": 2.4758,
"step": 400
},
{
"epoch": 0.06242506137829968,
"grad_norm": 0.87890625,
"learning_rate": 0.019945904162602685,
"loss": 2.4757,
"step": 410
},
{
"epoch": 0.06394762385094113,
"grad_norm": 1.046875,
"learning_rate": 0.0199406602859448,
"loss": 2.45,
"step": 420
},
{
"epoch": 0.06547018632358259,
"grad_norm": 0.890625,
"learning_rate": 0.01993517462101417,
"loss": 2.4271,
"step": 430
},
{
"epoch": 0.06699274879622405,
"grad_norm": 0.8984375,
"learning_rate": 0.019929447301239498,
"loss": 2.4273,
"step": 440
},
{
"epoch": 0.0685153112688655,
"grad_norm": 0.828125,
"learning_rate": 0.0199234784659273,
"loss": 2.4073,
"step": 450
},
{
"epoch": 0.07003787374150695,
"grad_norm": 0.80078125,
"learning_rate": 0.019917268260258518,
"loss": 2.3603,
"step": 460
},
{
"epoch": 0.07156043621414841,
"grad_norm": 0.875,
"learning_rate": 0.019910816835284974,
"loss": 2.3982,
"step": 470
},
{
"epoch": 0.07308299868678987,
"grad_norm": 0.8515625,
"learning_rate": 0.01990412434792571,
"loss": 2.3634,
"step": 480
},
{
"epoch": 0.07460556115943132,
"grad_norm": 0.88671875,
"learning_rate": 0.019897190960963176,
"loss": 2.3787,
"step": 490
},
{
"epoch": 0.07612812363207277,
"grad_norm": 0.984375,
"learning_rate": 0.01989001684303925,
"loss": 2.3464,
"step": 500
},
{
"epoch": 0.07612812363207277,
"eval_loss": 2.4252116680145264,
"eval_runtime": 342.391,
"eval_samples_per_second": 49.604,
"eval_steps_per_second": 24.802,
"step": 500
},
{
"epoch": 0.07765068610471423,
"grad_norm": 0.859375,
"learning_rate": 0.019882602168651148,
"loss": 2.3568,
"step": 510
},
{
"epoch": 0.07917324857735569,
"grad_norm": 0.8359375,
"learning_rate": 0.019874947118147187,
"loss": 2.331,
"step": 520
},
{
"epoch": 0.08069581104999715,
"grad_norm": 0.8828125,
"learning_rate": 0.019867051877722388,
"loss": 2.3502,
"step": 530
},
{
"epoch": 0.0822183735226386,
"grad_norm": 0.84375,
"learning_rate": 0.01985891663941395,
"loss": 2.3372,
"step": 540
},
{
"epoch": 0.08374093599528006,
"grad_norm": 0.80078125,
"learning_rate": 0.019850541601096568,
"loss": 2.2897,
"step": 550
},
{
"epoch": 0.08526349846792151,
"grad_norm": 0.828125,
"learning_rate": 0.01984192696647765,
"loss": 2.3333,
"step": 560
},
{
"epoch": 0.08678606094056297,
"grad_norm": 0.8203125,
"learning_rate": 0.01983307294509233,
"loss": 2.3197,
"step": 570
},
{
"epoch": 0.08830862341320442,
"grad_norm": 0.94140625,
"learning_rate": 0.019823979752298392,
"loss": 2.3009,
"step": 580
},
{
"epoch": 0.08983118588584588,
"grad_norm": 0.8125,
"learning_rate": 0.01981464760927102,
"loss": 2.332,
"step": 590
},
{
"epoch": 0.09135374835848734,
"grad_norm": 0.83203125,
"learning_rate": 0.019805076742997422,
"loss": 2.3117,
"step": 600
},
{
"epoch": 0.0928763108311288,
"grad_norm": 0.8515625,
"learning_rate": 0.019795267386271315,
"loss": 2.2978,
"step": 610
},
{
"epoch": 0.09439887330377024,
"grad_norm": 0.84765625,
"learning_rate": 0.019785219777687248,
"loss": 2.2451,
"step": 620
},
{
"epoch": 0.0959214357764117,
"grad_norm": 0.796875,
"learning_rate": 0.019774934161634825,
"loss": 2.3117,
"step": 630
},
{
"epoch": 0.09744399824905316,
"grad_norm": 0.890625,
"learning_rate": 0.019764410788292722,
"loss": 2.2956,
"step": 640
},
{
"epoch": 0.09896656072169462,
"grad_norm": 0.82421875,
"learning_rate": 0.01975364991362264,
"loss": 2.276,
"step": 650
},
{
"epoch": 0.10048912319433606,
"grad_norm": 0.85546875,
"learning_rate": 0.01974265179936306,
"loss": 2.2066,
"step": 660
},
{
"epoch": 0.10201168566697752,
"grad_norm": 0.76171875,
"learning_rate": 0.019731416713022868,
"loss": 2.2396,
"step": 670
},
{
"epoch": 0.10353424813961898,
"grad_norm": 0.8203125,
"learning_rate": 0.01971994492787488,
"loss": 2.2219,
"step": 680
},
{
"epoch": 0.10505681061226044,
"grad_norm": 0.8359375,
"learning_rate": 0.01970823672294916,
"loss": 2.2221,
"step": 690
},
{
"epoch": 0.1065793730849019,
"grad_norm": 0.80859375,
"learning_rate": 0.019696292383026247,
"loss": 2.2533,
"step": 700
},
{
"epoch": 0.10810193555754334,
"grad_norm": 0.8203125,
"learning_rate": 0.019684112198630244,
"loss": 2.2463,
"step": 710
},
{
"epoch": 0.1096244980301848,
"grad_norm": 0.76171875,
"learning_rate": 0.01967169646602172,
"loss": 2.2102,
"step": 720
},
{
"epoch": 0.11114706050282626,
"grad_norm": 0.82421875,
"learning_rate": 0.01965904548719053,
"loss": 2.2254,
"step": 730
},
{
"epoch": 0.11266962297546772,
"grad_norm": 0.8125,
"learning_rate": 0.019646159569848463,
"loss": 2.2245,
"step": 740
},
{
"epoch": 0.11419218544810916,
"grad_norm": 0.83203125,
"learning_rate": 0.019633039027421747,
"loss": 2.2012,
"step": 750
},
{
"epoch": 0.11571474792075062,
"grad_norm": 0.84375,
"learning_rate": 0.019619684179043438,
"loss": 2.2266,
"step": 760
},
{
"epoch": 0.11723731039339208,
"grad_norm": 0.78125,
"learning_rate": 0.019606095349545653,
"loss": 2.2187,
"step": 770
},
{
"epoch": 0.11875987286603354,
"grad_norm": 0.7890625,
"learning_rate": 0.01959227286945167,
"loss": 2.1785,
"step": 780
},
{
"epoch": 0.12028243533867498,
"grad_norm": 0.8359375,
"learning_rate": 0.019578217074967885,
"loss": 2.2224,
"step": 790
},
{
"epoch": 0.12180499781131644,
"grad_norm": 0.86328125,
"learning_rate": 0.01956392830797564,
"loss": 2.1828,
"step": 800
},
{
"epoch": 0.1233275602839579,
"grad_norm": 0.78515625,
"learning_rate": 0.019549406916022905,
"loss": 2.2074,
"step": 810
},
{
"epoch": 0.12485012275659936,
"grad_norm": 0.828125,
"learning_rate": 0.01953465325231582,
"loss": 2.1648,
"step": 820
},
{
"epoch": 0.1263726852292408,
"grad_norm": 0.79296875,
"learning_rate": 0.019519667675710114,
"loss": 2.1821,
"step": 830
},
{
"epoch": 0.12789524770188226,
"grad_norm": 0.8203125,
"learning_rate": 0.01950445055070237,
"loss": 2.1723,
"step": 840
},
{
"epoch": 0.12941781017452372,
"grad_norm": 0.7890625,
"learning_rate": 0.019489002247421148,
"loss": 2.1421,
"step": 850
},
{
"epoch": 0.13094037264716518,
"grad_norm": 0.79296875,
"learning_rate": 0.019473323141618013,
"loss": 2.1654,
"step": 860
},
{
"epoch": 0.13246293511980664,
"grad_norm": 0.8125,
"learning_rate": 0.019457413614658366,
"loss": 2.1454,
"step": 870
},
{
"epoch": 0.1339854975924481,
"grad_norm": 0.84765625,
"learning_rate": 0.019441274053512175,
"loss": 2.1438,
"step": 880
},
{
"epoch": 0.13550806006508956,
"grad_norm": 0.796875,
"learning_rate": 0.01942490485074458,
"loss": 2.1856,
"step": 890
},
{
"epoch": 0.137030622537731,
"grad_norm": 0.83984375,
"learning_rate": 0.019408306404506314,
"loss": 2.1291,
"step": 900
},
{
"epoch": 0.13855318501037245,
"grad_norm": 0.8359375,
"learning_rate": 0.019391479118524044,
"loss": 2.1488,
"step": 910
},
{
"epoch": 0.1400757474830139,
"grad_norm": 0.796875,
"learning_rate": 0.01937442340209055,
"loss": 2.1124,
"step": 920
},
{
"epoch": 0.14159830995565537,
"grad_norm": 0.78125,
"learning_rate": 0.01935713967005475,
"loss": 2.1569,
"step": 930
},
{
"epoch": 0.14312087242829682,
"grad_norm": 0.83203125,
"learning_rate": 0.019339628342811634,
"loss": 2.1194,
"step": 940
},
{
"epoch": 0.14464343490093828,
"grad_norm": 0.77734375,
"learning_rate": 0.01932188984629201,
"loss": 2.1578,
"step": 950
},
{
"epoch": 0.14616599737357974,
"grad_norm": 0.79296875,
"learning_rate": 0.019303924611952177,
"loss": 2.1231,
"step": 960
},
{
"epoch": 0.1476885598462212,
"grad_norm": 0.80859375,
"learning_rate": 0.01928573307676341,
"loss": 2.1173,
"step": 970
},
{
"epoch": 0.14921112231886263,
"grad_norm": 0.7890625,
"learning_rate": 0.019267315683201326,
"loss": 2.0751,
"step": 980
},
{
"epoch": 0.1507336847915041,
"grad_norm": 0.76953125,
"learning_rate": 0.019248672879235148,
"loss": 2.1118,
"step": 990
},
{
"epoch": 0.15225624726414555,
"grad_norm": 0.8125,
"learning_rate": 0.01922980511831678,
"loss": 2.1046,
"step": 1000
},
{
"epoch": 0.15225624726414555,
"eval_loss": 2.17815899848938,
"eval_runtime": 332.4001,
"eval_samples_per_second": 51.095,
"eval_steps_per_second": 25.548,
"step": 1000
},
{
"epoch": 0.153778809736787,
"grad_norm": 0.7734375,
"learning_rate": 0.01921071285936979,
"loss": 2.0853,
"step": 1010
},
{
"epoch": 0.15530137220942847,
"grad_norm": 0.7421875,
"learning_rate": 0.01919139656677826,
"loss": 2.1187,
"step": 1020
},
{
"epoch": 0.15682393468206993,
"grad_norm": 0.80859375,
"learning_rate": 0.01917185671037546,
"loss": 2.0578,
"step": 1030
},
{
"epoch": 0.15834649715471139,
"grad_norm": 0.76953125,
"learning_rate": 0.01915209376543245,
"loss": 2.0986,
"step": 1040
},
{
"epoch": 0.15986905962735284,
"grad_norm": 0.8046875,
"learning_rate": 0.019132108212646513,
"loss": 2.0731,
"step": 1050
},
{
"epoch": 0.1613916220999943,
"grad_norm": 0.7265625,
"learning_rate": 0.01911190053812944,
"loss": 2.0556,
"step": 1060
},
{
"epoch": 0.16291418457263573,
"grad_norm": 0.9296875,
"learning_rate": 0.01909147123339575,
"loss": 2.0717,
"step": 1070
},
{
"epoch": 0.1644367470452772,
"grad_norm": 0.85546875,
"learning_rate": 0.019070820795350683,
"loss": 2.0919,
"step": 1080
},
{
"epoch": 0.16595930951791865,
"grad_norm": 0.796875,
"learning_rate": 0.019049949726278156,
"loss": 2.037,
"step": 1090
},
{
"epoch": 0.1674818719905601,
"grad_norm": 0.8359375,
"learning_rate": 0.01902885853382853,
"loss": 2.0452,
"step": 1100
},
{
"epoch": 0.16900443446320157,
"grad_norm": 0.7890625,
"learning_rate": 0.019007547731006248,
"loss": 2.0627,
"step": 1110
},
{
"epoch": 0.17052699693584303,
"grad_norm": 0.84765625,
"learning_rate": 0.01898601783615739,
"loss": 2.0771,
"step": 1120
},
{
"epoch": 0.1720495594084845,
"grad_norm": 0.84765625,
"learning_rate": 0.018964269372957036,
"loss": 2.0802,
"step": 1130
},
{
"epoch": 0.17357212188112595,
"grad_norm": 0.82421875,
"learning_rate": 0.01894230287039654,
"loss": 2.061,
"step": 1140
},
{
"epoch": 0.17509468435376738,
"grad_norm": 0.81640625,
"learning_rate": 0.018920118862770667,
"loss": 2.0693,
"step": 1150
},
{
"epoch": 0.17661724682640884,
"grad_norm": 0.87890625,
"learning_rate": 0.018897717889664576,
"loss": 2.0623,
"step": 1160
},
{
"epoch": 0.1781398092990503,
"grad_norm": 0.828125,
"learning_rate": 0.01887510049594074,
"loss": 2.0643,
"step": 1170
},
{
"epoch": 0.17966237177169175,
"grad_norm": 0.84765625,
"learning_rate": 0.01885226723172564,
"loss": 2.0485,
"step": 1180
},
{
"epoch": 0.1811849342443332,
"grad_norm": 0.75,
"learning_rate": 0.018829218652396423,
"loss": 2.0529,
"step": 1190
},
{
"epoch": 0.18270749671697467,
"grad_norm": 0.8046875,
"learning_rate": 0.01880595531856738,
"loss": 2.0425,
"step": 1200
},
{
"epoch": 0.18423005918961613,
"grad_norm": 0.80078125,
"learning_rate": 0.018782477796076304,
"loss": 2.0552,
"step": 1210
},
{
"epoch": 0.1857526216622576,
"grad_norm": 0.859375,
"learning_rate": 0.018758786655970732,
"loss": 2.0348,
"step": 1220
},
{
"epoch": 0.18727518413489905,
"grad_norm": 0.8203125,
"learning_rate": 0.018734882474494067,
"loss": 2.0424,
"step": 1230
},
{
"epoch": 0.18879774660754048,
"grad_norm": 0.828125,
"learning_rate": 0.01871076583307154,
"loss": 2.03,
"step": 1240
},
{
"epoch": 0.19032030908018194,
"grad_norm": 0.8125,
"learning_rate": 0.018686437318296084,
"loss": 2.0417,
"step": 1250
},
{
"epoch": 0.1918428715528234,
"grad_norm": 0.765625,
"learning_rate": 0.018661897521914068,
"loss": 2.0172,
"step": 1260
},
{
"epoch": 0.19336543402546486,
"grad_norm": 0.8046875,
"learning_rate": 0.018637147040810886,
"loss": 2.0367,
"step": 1270
},
{
"epoch": 0.19488799649810631,
"grad_norm": 0.80859375,
"learning_rate": 0.018612186476996452,
"loss": 2.015,
"step": 1280
},
{
"epoch": 0.19641055897074777,
"grad_norm": 0.89453125,
"learning_rate": 0.018587016437590562,
"loss": 2.0083,
"step": 1290
},
{
"epoch": 0.19793312144338923,
"grad_norm": 0.8359375,
"learning_rate": 0.01856163753480812,
"loss": 2.0318,
"step": 1300
},
{
"epoch": 0.1994556839160307,
"grad_norm": 0.796875,
"learning_rate": 0.01853605038594424,
"loss": 2.0027,
"step": 1310
},
{
"epoch": 0.20097824638867212,
"grad_norm": 0.8828125,
"learning_rate": 0.01851025561335925,
"loss": 1.9983,
"step": 1320
},
{
"epoch": 0.20250080886131358,
"grad_norm": 0.8515625,
"learning_rate": 0.018484253844463526,
"loss": 1.9897,
"step": 1330
},
{
"epoch": 0.20402337133395504,
"grad_norm": 0.796875,
"learning_rate": 0.018458045711702266,
"loss": 1.9809,
"step": 1340
},
{
"epoch": 0.2055459338065965,
"grad_norm": 0.84765625,
"learning_rate": 0.018431631852540077,
"loss": 2.0047,
"step": 1350
},
{
"epoch": 0.20706849627923796,
"grad_norm": 0.8359375,
"learning_rate": 0.01840501290944549,
"loss": 1.9952,
"step": 1360
},
{
"epoch": 0.20859105875187942,
"grad_norm": 0.796875,
"learning_rate": 0.018378189529875324,
"loss": 2.0212,
"step": 1370
},
{
"epoch": 0.21011362122452087,
"grad_norm": 0.78515625,
"learning_rate": 0.018351162366258937,
"loss": 1.9944,
"step": 1380
},
{
"epoch": 0.21163618369716233,
"grad_norm": 0.75390625,
"learning_rate": 0.01832393207598236,
"loss": 1.9731,
"step": 1390
},
{
"epoch": 0.2131587461698038,
"grad_norm": 0.80078125,
"learning_rate": 0.018296499321372305,
"loss": 1.9937,
"step": 1400
},
{
"epoch": 0.21468130864244522,
"grad_norm": 0.859375,
"learning_rate": 0.018268864769680055,
"loss": 1.9897,
"step": 1410
},
{
"epoch": 0.21620387111508668,
"grad_norm": 0.84375,
"learning_rate": 0.01824102909306524,
"loss": 1.991,
"step": 1420
},
{
"epoch": 0.21772643358772814,
"grad_norm": 0.84765625,
"learning_rate": 0.01821299296857948,
"loss": 1.9662,
"step": 1430
},
{
"epoch": 0.2192489960603696,
"grad_norm": 0.8359375,
"learning_rate": 0.01818475707814993,
"loss": 1.9562,
"step": 1440
},
{
"epoch": 0.22077155853301106,
"grad_norm": 0.81640625,
"learning_rate": 0.018156322108562675,
"loss": 1.9599,
"step": 1450
},
{
"epoch": 0.22229412100565252,
"grad_norm": 0.8125,
"learning_rate": 0.018127688751446026,
"loss": 1.9912,
"step": 1460
},
{
"epoch": 0.22381668347829398,
"grad_norm": 0.8046875,
"learning_rate": 0.018098857703253726,
"loss": 1.9604,
"step": 1470
},
{
"epoch": 0.22533924595093544,
"grad_norm": 0.84765625,
"learning_rate": 0.018069829665247974,
"loss": 1.982,
"step": 1480
},
{
"epoch": 0.22686180842357687,
"grad_norm": 0.80859375,
"learning_rate": 0.01804060534348239,
"loss": 1.9447,
"step": 1490
},
{
"epoch": 0.22838437089621832,
"grad_norm": 0.78515625,
"learning_rate": 0.018011185448784835,
"loss": 1.9809,
"step": 1500
},
{
"epoch": 0.22838437089621832,
"eval_loss": 2.018177032470703,
"eval_runtime": 331.4544,
"eval_samples_per_second": 51.241,
"eval_steps_per_second": 25.62,
"step": 1500
},
{
"epoch": 0.22990693336885978,
"grad_norm": 0.83203125,
"learning_rate": 0.017981570696740123,
"loss": 1.9661,
"step": 1510
},
{
"epoch": 0.23142949584150124,
"grad_norm": 0.7890625,
"learning_rate": 0.01795176180767261,
"loss": 1.9801,
"step": 1520
},
{
"epoch": 0.2329520583141427,
"grad_norm": 0.80078125,
"learning_rate": 0.01792175950662868,
"loss": 1.9376,
"step": 1530
},
{
"epoch": 0.23447462078678416,
"grad_norm": 0.91796875,
"learning_rate": 0.017891564523359108,
"loss": 1.9701,
"step": 1540
},
{
"epoch": 0.23599718325942562,
"grad_norm": 0.859375,
"learning_rate": 0.017861177592301318,
"loss": 1.9454,
"step": 1550
},
{
"epoch": 0.23751974573206708,
"grad_norm": 0.83984375,
"learning_rate": 0.017830599452561487,
"loss": 1.9301,
"step": 1560
},
{
"epoch": 0.23904230820470854,
"grad_norm": 0.828125,
"learning_rate": 0.01779983084789662,
"loss": 1.9313,
"step": 1570
},
{
"epoch": 0.24056487067734997,
"grad_norm": 0.87109375,
"learning_rate": 0.01776887252669641,
"loss": 1.9767,
"step": 1580
},
{
"epoch": 0.24208743314999143,
"grad_norm": 0.7734375,
"learning_rate": 0.01773772524196507,
"loss": 1.8965,
"step": 1590
},
{
"epoch": 0.24360999562263289,
"grad_norm": 0.859375,
"learning_rate": 0.017706389751302988,
"loss": 1.9254,
"step": 1600
},
{
"epoch": 0.24513255809527434,
"grad_norm": 0.8671875,
"learning_rate": 0.017674866816888332,
"loss": 1.9328,
"step": 1610
},
{
"epoch": 0.2466551205679158,
"grad_norm": 0.8828125,
"learning_rate": 0.017643157205458483,
"loss": 1.9314,
"step": 1620
},
{
"epoch": 0.24817768304055726,
"grad_norm": 0.8203125,
"learning_rate": 0.0176112616882914,
"loss": 1.8842,
"step": 1630
},
{
"epoch": 0.24970024551319872,
"grad_norm": 0.8359375,
"learning_rate": 0.01757918104118686,
"loss": 1.8764,
"step": 1640
},
{
"epoch": 0.2512228079858402,
"grad_norm": 0.88671875,
"learning_rate": 0.017546916044447573,
"loss": 1.884,
"step": 1650
},
{
"epoch": 0.2527453704584816,
"grad_norm": 0.84765625,
"learning_rate": 0.017514467482860233,
"loss": 1.9117,
"step": 1660
},
{
"epoch": 0.2542679329311231,
"grad_norm": 0.8125,
"learning_rate": 0.017481836145676402,
"loss": 1.9062,
"step": 1670
},
{
"epoch": 0.25579049540376453,
"grad_norm": 0.8359375,
"learning_rate": 0.017449022826593316,
"loss": 1.9137,
"step": 1680
},
{
"epoch": 0.257313057876406,
"grad_norm": 0.81640625,
"learning_rate": 0.017416028323734598,
"loss": 1.9257,
"step": 1690
},
{
"epoch": 0.25883562034904745,
"grad_norm": 0.87109375,
"learning_rate": 0.01738285343963083,
"loss": 1.902,
"step": 1700
},
{
"epoch": 0.2603581828216889,
"grad_norm": 0.8046875,
"learning_rate": 0.01734949898120002,
"loss": 1.8808,
"step": 1710
},
{
"epoch": 0.26188074529433036,
"grad_norm": 0.84765625,
"learning_rate": 0.017315965759728016,
"loss": 1.9147,
"step": 1720
},
{
"epoch": 0.2634033077669718,
"grad_norm": 0.81640625,
"learning_rate": 0.017282254590848728,
"loss": 1.9054,
"step": 1730
},
{
"epoch": 0.2649258702396133,
"grad_norm": 0.859375,
"learning_rate": 0.017248366294524326,
"loss": 1.8789,
"step": 1740
},
{
"epoch": 0.2664484327122547,
"grad_norm": 0.8671875,
"learning_rate": 0.017214301695025268,
"loss": 1.8616,
"step": 1750
},
{
"epoch": 0.2679709951848962,
"grad_norm": 0.83984375,
"learning_rate": 0.017180061620910264,
"loss": 1.9031,
"step": 1760
},
{
"epoch": 0.26949355765753763,
"grad_norm": 0.828125,
"learning_rate": 0.01714564690500612,
"loss": 1.8796,
"step": 1770
},
{
"epoch": 0.2710161201301791,
"grad_norm": 0.828125,
"learning_rate": 0.01711105838438749,
"loss": 1.9017,
"step": 1780
},
{
"epoch": 0.27253868260282055,
"grad_norm": 0.8984375,
"learning_rate": 0.017076296900356495,
"loss": 1.8648,
"step": 1790
},
{
"epoch": 0.274061245075462,
"grad_norm": 0.87109375,
"learning_rate": 0.017041363298422287,
"loss": 1.8762,
"step": 1800
},
{
"epoch": 0.27558380754810347,
"grad_norm": 0.890625,
"learning_rate": 0.017006258428280463,
"loss": 1.9024,
"step": 1810
},
{
"epoch": 0.2771063700207449,
"grad_norm": 0.93359375,
"learning_rate": 0.0169709831437924,
"loss": 1.9075,
"step": 1820
},
{
"epoch": 0.2786289324933864,
"grad_norm": 0.859375,
"learning_rate": 0.016935538302964494,
"loss": 1.8724,
"step": 1830
},
{
"epoch": 0.2801514949660278,
"grad_norm": 0.83984375,
"learning_rate": 0.01689992476792729,
"loss": 1.8504,
"step": 1840
},
{
"epoch": 0.2816740574386693,
"grad_norm": 0.8828125,
"learning_rate": 0.016864143404914506,
"loss": 1.8597,
"step": 1850
},
{
"epoch": 0.28319661991131073,
"grad_norm": 0.87109375,
"learning_rate": 0.01682819508424196,
"loss": 1.8458,
"step": 1860
},
{
"epoch": 0.2847191823839522,
"grad_norm": 0.8046875,
"learning_rate": 0.01679208068028643,
"loss": 1.8839,
"step": 1870
},
{
"epoch": 0.28624174485659365,
"grad_norm": 0.84375,
"learning_rate": 0.016755801071464335,
"loss": 1.8525,
"step": 1880
},
{
"epoch": 0.2877643073292351,
"grad_norm": 0.83203125,
"learning_rate": 0.016719357140210417,
"loss": 1.912,
"step": 1890
},
{
"epoch": 0.28928686980187657,
"grad_norm": 0.8828125,
"learning_rate": 0.016682749772956258,
"loss": 1.8613,
"step": 1900
},
{
"epoch": 0.290809432274518,
"grad_norm": 0.875,
"learning_rate": 0.016645979860108715,
"loss": 1.8607,
"step": 1910
},
{
"epoch": 0.2923319947471595,
"grad_norm": 0.875,
"learning_rate": 0.01660904829602827,
"loss": 1.8919,
"step": 1920
},
{
"epoch": 0.2938545572198009,
"grad_norm": 0.81640625,
"learning_rate": 0.01657195597900727,
"loss": 1.9042,
"step": 1930
},
{
"epoch": 0.2953771196924424,
"grad_norm": 0.86328125,
"learning_rate": 0.016534703811248087,
"loss": 1.8477,
"step": 1940
},
{
"epoch": 0.29689968216508383,
"grad_norm": 0.88671875,
"learning_rate": 0.016497292698841162,
"loss": 1.8578,
"step": 1950
},
{
"epoch": 0.29842224463772526,
"grad_norm": 0.80078125,
"learning_rate": 0.01645972355174298,
"loss": 1.8304,
"step": 1960
},
{
"epoch": 0.29994480711036675,
"grad_norm": 0.87890625,
"learning_rate": 0.016421997283753927,
"loss": 1.8321,
"step": 1970
},
{
"epoch": 0.3014673695830082,
"grad_norm": 0.89453125,
"learning_rate": 0.016384114812496055,
"loss": 1.8363,
"step": 1980
},
{
"epoch": 0.30298993205564967,
"grad_norm": 0.84765625,
"learning_rate": 0.01634607705939079,
"loss": 1.833,
"step": 1990
},
{
"epoch": 0.3045124945282911,
"grad_norm": 0.953125,
"learning_rate": 0.016307884949636493,
"loss": 1.86,
"step": 2000
},
{
"epoch": 0.3045124945282911,
"eval_loss": 1.8923254013061523,
"eval_runtime": 332.0094,
"eval_samples_per_second": 51.155,
"eval_steps_per_second": 25.578,
"step": 2000
},
{
"epoch": 0.3060350570009326,
"grad_norm": 0.890625,
"learning_rate": 0.01626953941218597,
"loss": 1.8004,
"step": 2010
},
{
"epoch": 0.307557619473574,
"grad_norm": 0.90234375,
"learning_rate": 0.01623104137972386,
"loss": 1.8512,
"step": 2020
},
{
"epoch": 0.3090801819462155,
"grad_norm": 0.8671875,
"learning_rate": 0.016192391788643987,
"loss": 1.8061,
"step": 2030
},
{
"epoch": 0.31060274441885694,
"grad_norm": 0.90625,
"learning_rate": 0.016153591579026544,
"loss": 1.8187,
"step": 2040
},
{
"epoch": 0.31212530689149837,
"grad_norm": 0.8046875,
"learning_rate": 0.016114641694615246,
"loss": 1.8084,
"step": 2050
},
{
"epoch": 0.31364786936413985,
"grad_norm": 0.92578125,
"learning_rate": 0.01607554308279437,
"loss": 1.8253,
"step": 2060
},
{
"epoch": 0.3151704318367813,
"grad_norm": 0.8828125,
"learning_rate": 0.016036296694565716,
"loss": 1.833,
"step": 2070
},
{
"epoch": 0.31669299430942277,
"grad_norm": 0.87890625,
"learning_rate": 0.015996903484525475,
"loss": 1.8359,
"step": 2080
},
{
"epoch": 0.3182155567820642,
"grad_norm": 0.90234375,
"learning_rate": 0.015957364410841,
"loss": 1.8245,
"step": 2090
},
{
"epoch": 0.3197381192547057,
"grad_norm": 0.875,
"learning_rate": 0.01591768043522752,
"loss": 1.8034,
"step": 2100
},
{
"epoch": 0.3212606817273471,
"grad_norm": 0.87109375,
"learning_rate": 0.015877852522924733,
"loss": 1.8042,
"step": 2110
},
{
"epoch": 0.3227832441999886,
"grad_norm": 0.89453125,
"learning_rate": 0.015837881642673322,
"loss": 1.8242,
"step": 2120
},
{
"epoch": 0.32430580667263004,
"grad_norm": 0.87109375,
"learning_rate": 0.015797768766691426,
"loss": 1.7773,
"step": 2130
},
{
"epoch": 0.32582836914527147,
"grad_norm": 0.890625,
"learning_rate": 0.01575751487065094,
"loss": 1.8043,
"step": 2140
},
{
"epoch": 0.32735093161791295,
"grad_norm": 0.91015625,
"learning_rate": 0.015717120933653836,
"loss": 1.796,
"step": 2150
},
{
"epoch": 0.3288734940905544,
"grad_norm": 0.79296875,
"learning_rate": 0.015676587938208305,
"loss": 1.7998,
"step": 2160
},
{
"epoch": 0.3303960565631959,
"grad_norm": 0.8671875,
"learning_rate": 0.0156359168702049,
"loss": 1.8345,
"step": 2170
},
{
"epoch": 0.3319186190358373,
"grad_norm": 0.8984375,
"learning_rate": 0.015595108718892518,
"loss": 1.8046,
"step": 2180
},
{
"epoch": 0.3334411815084788,
"grad_norm": 0.87109375,
"learning_rate": 0.015554164476854364,
"loss": 1.825,
"step": 2190
},
{
"epoch": 0.3349637439811202,
"grad_norm": 0.90625,
"learning_rate": 0.015513085139983796,
"loss": 1.8082,
"step": 2200
},
{
"epoch": 0.3364863064537617,
"grad_norm": 0.87109375,
"learning_rate": 0.015471871707460108,
"loss": 1.7726,
"step": 2210
},
{
"epoch": 0.33800886892640314,
"grad_norm": 0.82421875,
"learning_rate": 0.015430525181724213,
"loss": 1.7326,
"step": 2220
},
{
"epoch": 0.33953143139904457,
"grad_norm": 0.90625,
"learning_rate": 0.015389046568454292,
"loss": 1.8309,
"step": 2230
},
{
"epoch": 0.34105399387168606,
"grad_norm": 0.9453125,
"learning_rate": 0.015347436876541297,
"loss": 1.7874,
"step": 2240
},
{
"epoch": 0.3425765563443275,
"grad_norm": 0.88671875,
"learning_rate": 0.015305697118064428,
"loss": 1.7611,
"step": 2250
},
{
"epoch": 0.344099118816969,
"grad_norm": 0.87890625,
"learning_rate": 0.015263828308266524,
"loss": 1.7601,
"step": 2260
},
{
"epoch": 0.3456216812896104,
"grad_norm": 0.90234375,
"learning_rate": 0.015221831465529344,
"loss": 1.7254,
"step": 2270
},
{
"epoch": 0.3471442437622519,
"grad_norm": 0.89453125,
"learning_rate": 0.015179707611348832,
"loss": 1.7807,
"step": 2280
},
{
"epoch": 0.3486668062348933,
"grad_norm": 0.90234375,
"learning_rate": 0.015137457770310232,
"loss": 1.7915,
"step": 2290
},
{
"epoch": 0.35018936870753475,
"grad_norm": 0.94921875,
"learning_rate": 0.015095082970063208,
"loss": 1.8016,
"step": 2300
},
{
"epoch": 0.35171193118017624,
"grad_norm": 0.90625,
"learning_rate": 0.015052584241296808,
"loss": 1.7567,
"step": 2310
},
{
"epoch": 0.35323449365281767,
"grad_norm": 0.95703125,
"learning_rate": 0.015009962617714423,
"loss": 1.736,
"step": 2320
},
{
"epoch": 0.35475705612545916,
"grad_norm": 0.96484375,
"learning_rate": 0.01496721913600863,
"loss": 1.7302,
"step": 2330
},
{
"epoch": 0.3562796185981006,
"grad_norm": 0.91015625,
"learning_rate": 0.014924354835835983,
"loss": 1.7667,
"step": 2340
},
{
"epoch": 0.3578021810707421,
"grad_norm": 0.91015625,
"learning_rate": 0.014881370759791726,
"loss": 1.7681,
"step": 2350
},
{
"epoch": 0.3593247435433835,
"grad_norm": 0.92578125,
"learning_rate": 0.01483826795338442,
"loss": 1.7485,
"step": 2360
},
{
"epoch": 0.360847306016025,
"grad_norm": 0.96875,
"learning_rate": 0.014795047465010541,
"loss": 1.7623,
"step": 2370
},
{
"epoch": 0.3623698684886664,
"grad_norm": 0.93359375,
"learning_rate": 0.014751710345928941,
"loss": 1.7391,
"step": 2380
},
{
"epoch": 0.36389243096130786,
"grad_norm": 0.8828125,
"learning_rate": 0.01470825765023532,
"loss": 1.8042,
"step": 2390
},
{
"epoch": 0.36541499343394934,
"grad_norm": 0.98828125,
"learning_rate": 0.01466469043483655,
"loss": 1.7204,
"step": 2400
},
{
"epoch": 0.3669375559065908,
"grad_norm": 0.953125,
"learning_rate": 0.014621009759424992,
"loss": 1.7449,
"step": 2410
},
{
"epoch": 0.36846011837923226,
"grad_norm": 0.921875,
"learning_rate": 0.014577216686452718,
"loss": 1.7779,
"step": 2420
},
{
"epoch": 0.3699826808518737,
"grad_norm": 0.90234375,
"learning_rate": 0.014533312281105657,
"loss": 1.7248,
"step": 2430
},
{
"epoch": 0.3715052433245152,
"grad_norm": 0.94921875,
"learning_rate": 0.014489297611277688,
"loss": 1.7367,
"step": 2440
},
{
"epoch": 0.3730278057971566,
"grad_norm": 0.8984375,
"learning_rate": 0.014445173747544678,
"loss": 1.7237,
"step": 2450
},
{
"epoch": 0.3745503682697981,
"grad_norm": 0.90234375,
"learning_rate": 0.01440094176313844,
"loss": 1.7188,
"step": 2460
},
{
"epoch": 0.3760729307424395,
"grad_norm": 0.9296875,
"learning_rate": 0.014356602733920611,
"loss": 1.7373,
"step": 2470
},
{
"epoch": 0.37759549321508096,
"grad_norm": 0.921875,
"learning_rate": 0.014312157738356509,
"loss": 1.7323,
"step": 2480
},
{
"epoch": 0.37911805568772244,
"grad_norm": 0.87890625,
"learning_rate": 0.014267607857488873,
"loss": 1.7167,
"step": 2490
},
{
"epoch": 0.3806406181603639,
"grad_norm": 0.96875,
"learning_rate": 0.0142229541749116,
"loss": 1.7172,
"step": 2500
},
{
"epoch": 0.3806406181603639,
"eval_loss": 1.7768399715423584,
"eval_runtime": 332.6705,
"eval_samples_per_second": 51.054,
"eval_steps_per_second": 25.527,
"step": 2500
},
{
"epoch": 0.38216318063300536,
"grad_norm": 0.828125,
"learning_rate": 0.01417819777674336,
"loss": 1.7101,
"step": 2510
},
{
"epoch": 0.3836857431056468,
"grad_norm": 0.90625,
"learning_rate": 0.0141333397516012,
"loss": 1.7252,
"step": 2520
},
{
"epoch": 0.3852083055782883,
"grad_norm": 0.92578125,
"learning_rate": 0.014088381190574052,
"loss": 1.7073,
"step": 2530
},
{
"epoch": 0.3867308680509297,
"grad_norm": 0.86328125,
"learning_rate": 0.014043323187196198,
"loss": 1.725,
"step": 2540
},
{
"epoch": 0.3882534305235712,
"grad_norm": 0.91796875,
"learning_rate": 0.013998166837420672,
"loss": 1.6971,
"step": 2550
},
{
"epoch": 0.38977599299621263,
"grad_norm": 0.91015625,
"learning_rate": 0.013952913239592604,
"loss": 1.7083,
"step": 2560
},
{
"epoch": 0.39129855546885406,
"grad_norm": 0.91015625,
"learning_rate": 0.013907563494422506,
"loss": 1.7085,
"step": 2570
},
{
"epoch": 0.39282111794149555,
"grad_norm": 0.88671875,
"learning_rate": 0.013862118704959498,
"loss": 1.7366,
"step": 2580
},
{
"epoch": 0.394343680414137,
"grad_norm": 0.90625,
"learning_rate": 0.013816579976564467,
"loss": 1.6875,
"step": 2590
},
{
"epoch": 0.39586624288677846,
"grad_norm": 0.94140625,
"learning_rate": 0.013770948416883205,
"loss": 1.7264,
"step": 2600
},
{
"epoch": 0.3973888053594199,
"grad_norm": 0.8984375,
"learning_rate": 0.013725225135819448,
"loss": 1.717,
"step": 2610
},
{
"epoch": 0.3989113678320614,
"grad_norm": 0.9296875,
"learning_rate": 0.013679411245507889,
"loss": 1.6989,
"step": 2620
},
{
"epoch": 0.4004339303047028,
"grad_norm": 0.91015625,
"learning_rate": 0.013633507860287115,
"loss": 1.7102,
"step": 2630
},
{
"epoch": 0.40195649277734424,
"grad_norm": 0.9375,
"learning_rate": 0.013587516096672527,
"loss": 1.719,
"step": 2640
},
{
"epoch": 0.40347905524998573,
"grad_norm": 0.8984375,
"learning_rate": 0.013541437073329155,
"loss": 1.676,
"step": 2650
},
{
"epoch": 0.40500161772262716,
"grad_norm": 0.94921875,
"learning_rate": 0.01349527191104447,
"loss": 1.6962,
"step": 2660
},
{
"epoch": 0.40652418019526865,
"grad_norm": 0.95703125,
"learning_rate": 0.013449021732701105,
"loss": 1.723,
"step": 2670
},
{
"epoch": 0.4080467426679101,
"grad_norm": 0.9609375,
"learning_rate": 0.013402687663249565,
"loss": 1.6729,
"step": 2680
},
{
"epoch": 0.40956930514055157,
"grad_norm": 1.0390625,
"learning_rate": 0.013356270829680836,
"loss": 1.6872,
"step": 2690
},
{
"epoch": 0.411091867613193,
"grad_norm": 0.94140625,
"learning_rate": 0.013309772360999006,
"loss": 1.6907,
"step": 2700
},
{
"epoch": 0.4126144300858345,
"grad_norm": 0.89453125,
"learning_rate": 0.01326319338819377,
"loss": 1.6979,
"step": 2710
},
{
"epoch": 0.4141369925584759,
"grad_norm": 1.0,
"learning_rate": 0.013216535044212952,
"loss": 1.7048,
"step": 2720
},
{
"epoch": 0.41565955503111734,
"grad_norm": 0.9609375,
"learning_rate": 0.013169798463934925,
"loss": 1.687,
"step": 2730
},
{
"epoch": 0.41718211750375883,
"grad_norm": 0.9296875,
"learning_rate": 0.01312298478414102,
"loss": 1.6912,
"step": 2740
},
{
"epoch": 0.41870467997640026,
"grad_norm": 0.875,
"learning_rate": 0.013076095143487874,
"loss": 1.6743,
"step": 2750
},
{
"epoch": 0.42022724244904175,
"grad_norm": 0.87109375,
"learning_rate": 0.013029130682479722,
"loss": 1.6587,
"step": 2760
},
{
"epoch": 0.4217498049216832,
"grad_norm": 0.8828125,
"learning_rate": 0.01298209254344068,
"loss": 1.6961,
"step": 2770
},
{
"epoch": 0.42327236739432467,
"grad_norm": 0.99609375,
"learning_rate": 0.012934981870486932,
"loss": 1.6898,
"step": 2780
},
{
"epoch": 0.4247949298669661,
"grad_norm": 0.93359375,
"learning_rate": 0.012887799809498932,
"loss": 1.6648,
"step": 2790
},
{
"epoch": 0.4263174923396076,
"grad_norm": 0.875,
"learning_rate": 0.012840547508093506,
"loss": 1.6429,
"step": 2800
},
{
"epoch": 0.427840054812249,
"grad_norm": 0.9921875,
"learning_rate": 0.01279322611559595,
"loss": 1.6833,
"step": 2810
},
{
"epoch": 0.42936261728489045,
"grad_norm": 0.91796875,
"learning_rate": 0.012745836783012075,
"loss": 1.6715,
"step": 2820
},
{
"epoch": 0.43088517975753193,
"grad_norm": 0.90234375,
"learning_rate": 0.01269838066300021,
"loss": 1.6232,
"step": 2830
},
{
"epoch": 0.43240774223017336,
"grad_norm": 0.98046875,
"learning_rate": 0.01265085890984317,
"loss": 1.6643,
"step": 2840
},
{
"epoch": 0.43393030470281485,
"grad_norm": 0.859375,
"learning_rate": 0.012603272679420166,
"loss": 1.675,
"step": 2850
},
{
"epoch": 0.4354528671754563,
"grad_norm": 0.9140625,
"learning_rate": 0.0125556231291787,
"loss": 1.6551,
"step": 2860
},
{
"epoch": 0.43697542964809777,
"grad_norm": 0.89453125,
"learning_rate": 0.012507911418106423,
"loss": 1.6683,
"step": 2870
},
{
"epoch": 0.4384979921207392,
"grad_norm": 0.90625,
"learning_rate": 0.012460138706702928,
"loss": 1.6398,
"step": 2880
},
{
"epoch": 0.4400205545933807,
"grad_norm": 0.94140625,
"learning_rate": 0.012412306156951524,
"loss": 1.6415,
"step": 2890
},
{
"epoch": 0.4415431170660221,
"grad_norm": 0.99609375,
"learning_rate": 0.012364414932290986,
"loss": 1.5897,
"step": 2900
},
{
"epoch": 0.44306567953866355,
"grad_norm": 0.99609375,
"learning_rate": 0.012316466197587242,
"loss": 1.69,
"step": 2910
},
{
"epoch": 0.44458824201130503,
"grad_norm": 0.93359375,
"learning_rate": 0.01226846111910505,
"loss": 1.6568,
"step": 2920
},
{
"epoch": 0.44611080448394647,
"grad_norm": 0.94921875,
"learning_rate": 0.01222040086447962,
"loss": 1.6666,
"step": 2930
},
{
"epoch": 0.44763336695658795,
"grad_norm": 0.91796875,
"learning_rate": 0.012172286602688227,
"loss": 1.6326,
"step": 2940
},
{
"epoch": 0.4491559294292294,
"grad_norm": 0.9765625,
"learning_rate": 0.012124119504021775,
"loss": 1.6519,
"step": 2950
},
{
"epoch": 0.45067849190187087,
"grad_norm": 0.9140625,
"learning_rate": 0.012075900740056315,
"loss": 1.6014,
"step": 2960
},
{
"epoch": 0.4522010543745123,
"grad_norm": 0.94921875,
"learning_rate": 0.01202763148362457,
"loss": 1.6208,
"step": 2970
},
{
"epoch": 0.45372361684715373,
"grad_norm": 0.9140625,
"learning_rate": 0.011979312908787398,
"loss": 1.6346,
"step": 2980
},
{
"epoch": 0.4552461793197952,
"grad_norm": 0.921875,
"learning_rate": 0.01193094619080524,
"loss": 1.6157,
"step": 2990
},
{
"epoch": 0.45676874179243665,
"grad_norm": 0.91015625,
"learning_rate": 0.011882532506109517,
"loss": 1.6455,
"step": 3000
},
{
"epoch": 0.45676874179243665,
"eval_loss": 1.669243335723877,
"eval_runtime": 332.838,
"eval_samples_per_second": 51.028,
"eval_steps_per_second": 25.514,
"step": 3000
},
{
"epoch": 0.45829130426507814,
"grad_norm": 0.8828125,
"learning_rate": 0.011834073032274042,
"loss": 1.6234,
"step": 3010
},
{
"epoch": 0.45981386673771957,
"grad_norm": 0.95703125,
"learning_rate": 0.011785568947986366,
"loss": 1.6443,
"step": 3020
},
{
"epoch": 0.46133642921036105,
"grad_norm": 0.9921875,
"learning_rate": 0.0117370214330191,
"loss": 1.6333,
"step": 3030
},
{
"epoch": 0.4628589916830025,
"grad_norm": 1.0078125,
"learning_rate": 0.011688431668201224,
"loss": 1.639,
"step": 3040
},
{
"epoch": 0.46438155415564397,
"grad_norm": 0.94140625,
"learning_rate": 0.011639800835389376,
"loss": 1.614,
"step": 3050
},
{
"epoch": 0.4659041166282854,
"grad_norm": 0.93359375,
"learning_rate": 0.011591130117439093,
"loss": 1.6256,
"step": 3060
},
{
"epoch": 0.46742667910092683,
"grad_norm": 1.03125,
"learning_rate": 0.011542420698176048,
"loss": 1.6143,
"step": 3070
},
{
"epoch": 0.4689492415735683,
"grad_norm": 0.95703125,
"learning_rate": 0.011493673762367245,
"loss": 1.6167,
"step": 3080
},
{
"epoch": 0.47047180404620975,
"grad_norm": 0.8984375,
"learning_rate": 0.011444890495692212,
"loss": 1.604,
"step": 3090
},
{
"epoch": 0.47199436651885124,
"grad_norm": 0.88671875,
"learning_rate": 0.011396072084714166,
"loss": 1.5807,
"step": 3100
},
{
"epoch": 0.47351692899149267,
"grad_norm": 0.953125,
"learning_rate": 0.011347219716851138,
"loss": 1.5897,
"step": 3110
},
{
"epoch": 0.47503949146413416,
"grad_norm": 1.03125,
"learning_rate": 0.011298334580347099,
"loss": 1.6162,
"step": 3120
},
{
"epoch": 0.4765620539367756,
"grad_norm": 0.98046875,
"learning_rate": 0.011249417864243046,
"loss": 1.6291,
"step": 3130
},
{
"epoch": 0.4780846164094171,
"grad_norm": 0.9375,
"learning_rate": 0.011200470758348114,
"loss": 1.6292,
"step": 3140
},
{
"epoch": 0.4796071788820585,
"grad_norm": 0.98828125,
"learning_rate": 0.011151494453210595,
"loss": 1.5768,
"step": 3150
},
{
"epoch": 0.48112974135469994,
"grad_norm": 0.91796875,
"learning_rate": 0.011102490140089008,
"loss": 1.559,
"step": 3160
},
{
"epoch": 0.4826523038273414,
"grad_norm": 0.91796875,
"learning_rate": 0.011053459010923108,
"loss": 1.5899,
"step": 3170
},
{
"epoch": 0.48417486629998285,
"grad_norm": 0.94921875,
"learning_rate": 0.011004402258304916,
"loss": 1.5752,
"step": 3180
},
{
"epoch": 0.48569742877262434,
"grad_norm": 0.984375,
"learning_rate": 0.010955321075449673,
"loss": 1.5958,
"step": 3190
},
{
"epoch": 0.48721999124526577,
"grad_norm": 0.99609375,
"learning_rate": 0.010906216656166857,
"loss": 1.5972,
"step": 3200
},
{
"epoch": 0.48874255371790726,
"grad_norm": 0.94921875,
"learning_rate": 0.010857090194831127,
"loss": 1.581,
"step": 3210
},
{
"epoch": 0.4902651161905487,
"grad_norm": 1.015625,
"learning_rate": 0.010807942886353275,
"loss": 1.603,
"step": 3220
},
{
"epoch": 0.4917876786631902,
"grad_norm": 0.96875,
"learning_rate": 0.010758775926151154,
"loss": 1.5316,
"step": 3230
},
{
"epoch": 0.4933102411358316,
"grad_norm": 1.046875,
"learning_rate": 0.010709590510120616,
"loss": 1.6024,
"step": 3240
},
{
"epoch": 0.49483280360847304,
"grad_norm": 0.953125,
"learning_rate": 0.010660387834606414,
"loss": 1.5786,
"step": 3250
},
{
"epoch": 0.4963553660811145,
"grad_norm": 1.0546875,
"learning_rate": 0.010611169096373113,
"loss": 1.596,
"step": 3260
},
{
"epoch": 0.49787792855375596,
"grad_norm": 0.96875,
"learning_rate": 0.01056193549257596,
"loss": 1.5916,
"step": 3270
},
{
"epoch": 0.49940049102639744,
"grad_norm": 0.9453125,
"learning_rate": 0.010512688220731791,
"loss": 1.5515,
"step": 3280
},
{
"epoch": 0.5009230534990389,
"grad_norm": 0.98046875,
"learning_rate": 0.010463428478689895,
"loss": 1.5383,
"step": 3290
},
{
"epoch": 0.5024456159716804,
"grad_norm": 0.96484375,
"learning_rate": 0.010414157464602865,
"loss": 1.5439,
"step": 3300
},
{
"epoch": 0.5039681784443218,
"grad_norm": 0.9453125,
"learning_rate": 0.010364876376897467,
"loss": 1.5701,
"step": 3310
},
{
"epoch": 0.5054907409169632,
"grad_norm": 0.98046875,
"learning_rate": 0.010315586414245497,
"loss": 1.5591,
"step": 3320
},
{
"epoch": 0.5070133033896047,
"grad_norm": 0.97265625,
"learning_rate": 0.010266288775534616,
"loss": 1.5644,
"step": 3330
},
{
"epoch": 0.5085358658622462,
"grad_norm": 1.0,
"learning_rate": 0.010216984659839183,
"loss": 1.5664,
"step": 3340
},
{
"epoch": 0.5100584283348876,
"grad_norm": 0.9765625,
"learning_rate": 0.010167675266391103,
"loss": 1.5762,
"step": 3350
},
{
"epoch": 0.5115809908075291,
"grad_norm": 0.96875,
"learning_rate": 0.010118361794550657,
"loss": 1.5568,
"step": 3360
},
{
"epoch": 0.5131035532801705,
"grad_norm": 1.0,
"learning_rate": 0.010069045443777317,
"loss": 1.5914,
"step": 3370
},
{
"epoch": 0.514626115752812,
"grad_norm": 1.015625,
"learning_rate": 0.01001972741360059,
"loss": 1.577,
"step": 3380
},
{
"epoch": 0.5161486782254534,
"grad_norm": 0.9375,
"learning_rate": 0.009970408903590817,
"loss": 1.5464,
"step": 3390
},
{
"epoch": 0.5176712406980949,
"grad_norm": 0.9765625,
"learning_rate": 0.009921091113330026,
"loss": 1.5648,
"step": 3400
},
{
"epoch": 0.5191938031707364,
"grad_norm": 0.99609375,
"learning_rate": 0.009871775242382726,
"loss": 1.5445,
"step": 3410
},
{
"epoch": 0.5207163656433778,
"grad_norm": 1.046875,
"learning_rate": 0.009822462490266753,
"loss": 1.5704,
"step": 3420
},
{
"epoch": 0.5222389281160192,
"grad_norm": 0.98828125,
"learning_rate": 0.009773154056424068,
"loss": 1.54,
"step": 3430
},
{
"epoch": 0.5237614905886607,
"grad_norm": 1.0546875,
"learning_rate": 0.009723851140191612,
"loss": 1.5441,
"step": 3440
},
{
"epoch": 0.5252840530613022,
"grad_norm": 1.015625,
"learning_rate": 0.009674554940772118,
"loss": 1.5875,
"step": 3450
},
{
"epoch": 0.5268066155339436,
"grad_norm": 0.94921875,
"learning_rate": 0.009625266657204938,
"loss": 1.5179,
"step": 3460
},
{
"epoch": 0.5283291780065851,
"grad_norm": 1.0625,
"learning_rate": 0.009575987488336891,
"loss": 1.5209,
"step": 3470
},
{
"epoch": 0.5298517404792266,
"grad_norm": 0.97265625,
"learning_rate": 0.0095267186327931,
"loss": 1.5544,
"step": 3480
},
{
"epoch": 0.5313743029518679,
"grad_norm": 0.953125,
"learning_rate": 0.009477461288947827,
"loss": 1.52,
"step": 3490
},
{
"epoch": 0.5328968654245094,
"grad_norm": 0.953125,
"learning_rate": 0.009428216654895339,
"loss": 1.5613,
"step": 3500
},
{
"epoch": 0.5328968654245094,
"eval_loss": 1.5677825212478638,
"eval_runtime": 332.5932,
"eval_samples_per_second": 51.065,
"eval_steps_per_second": 25.533,
"step": 3500
},
{
"epoch": 0.5344194278971509,
"grad_norm": 1.0625,
"learning_rate": 0.009378985928420762,
"loss": 1.5504,
"step": 3510
},
{
"epoch": 0.5359419903697924,
"grad_norm": 0.9765625,
"learning_rate": 0.009329770306970941,
"loss": 1.5457,
"step": 3520
},
{
"epoch": 0.5374645528424338,
"grad_norm": 0.984375,
"learning_rate": 0.009280570987625327,
"loss": 1.5166,
"step": 3530
},
{
"epoch": 0.5389871153150753,
"grad_norm": 0.99609375,
"learning_rate": 0.009231389167066836,
"loss": 1.4936,
"step": 3540
},
{
"epoch": 0.5405096777877167,
"grad_norm": 1.03125,
"learning_rate": 0.009182226041552777,
"loss": 1.5515,
"step": 3550
},
{
"epoch": 0.5420322402603582,
"grad_norm": 0.94921875,
"learning_rate": 0.009133082806885727,
"loss": 1.4913,
"step": 3560
},
{
"epoch": 0.5435548027329996,
"grad_norm": 1.0625,
"learning_rate": 0.009083960658384455,
"loss": 1.5148,
"step": 3570
},
{
"epoch": 0.5450773652056411,
"grad_norm": 1.046875,
"learning_rate": 0.009034860790854849,
"loss": 1.5269,
"step": 3580
},
{
"epoch": 0.5465999276782826,
"grad_norm": 0.94921875,
"learning_rate": 0.008985784398560856,
"loss": 1.5088,
"step": 3590
},
{
"epoch": 0.548122490150924,
"grad_norm": 1.0,
"learning_rate": 0.008936732675195425,
"loss": 1.5022,
"step": 3600
},
{
"epoch": 0.5496450526235654,
"grad_norm": 0.95703125,
"learning_rate": 0.008887706813851483,
"loss": 1.5011,
"step": 3610
},
{
"epoch": 0.5511676150962069,
"grad_norm": 0.99609375,
"learning_rate": 0.008838708006992909,
"loss": 1.4785,
"step": 3620
},
{
"epoch": 0.5526901775688484,
"grad_norm": 1.0,
"learning_rate": 0.008789737446425538,
"loss": 1.5353,
"step": 3630
},
{
"epoch": 0.5542127400414898,
"grad_norm": 0.96484375,
"learning_rate": 0.008740796323268157,
"loss": 1.5299,
"step": 3640
},
{
"epoch": 0.5557353025141313,
"grad_norm": 1.015625,
"learning_rate": 0.008691885827923542,
"loss": 1.4997,
"step": 3650
},
{
"epoch": 0.5572578649867728,
"grad_norm": 0.98828125,
"learning_rate": 0.008643007150049509,
"loss": 1.5141,
"step": 3660
},
{
"epoch": 0.5587804274594141,
"grad_norm": 0.984375,
"learning_rate": 0.008594161478529974,
"loss": 1.5105,
"step": 3670
},
{
"epoch": 0.5603029899320556,
"grad_norm": 0.984375,
"learning_rate": 0.008545350001446026,
"loss": 1.4957,
"step": 3680
},
{
"epoch": 0.5618255524046971,
"grad_norm": 1.015625,
"learning_rate": 0.008496573906047047,
"loss": 1.4841,
"step": 3690
},
{
"epoch": 0.5633481148773386,
"grad_norm": 1.0625,
"learning_rate": 0.008447834378721816,
"loss": 1.4886,
"step": 3700
},
{
"epoch": 0.56487067734998,
"grad_norm": 1.015625,
"learning_rate": 0.00839913260496967,
"loss": 1.4766,
"step": 3710
},
{
"epoch": 0.5663932398226215,
"grad_norm": 1.015625,
"learning_rate": 0.008350469769371649,
"loss": 1.4882,
"step": 3720
},
{
"epoch": 0.567915802295263,
"grad_norm": 0.93359375,
"learning_rate": 0.008301847055561704,
"loss": 1.4452,
"step": 3730
},
{
"epoch": 0.5694383647679044,
"grad_norm": 0.97265625,
"learning_rate": 0.008253265646197891,
"loss": 1.4803,
"step": 3740
},
{
"epoch": 0.5709609272405458,
"grad_norm": 1.046875,
"learning_rate": 0.008204726722933618,
"loss": 1.4771,
"step": 3750
},
{
"epoch": 0.5724834897131873,
"grad_norm": 0.93359375,
"learning_rate": 0.00815623146638888,
"loss": 1.4307,
"step": 3760
},
{
"epoch": 0.5740060521858288,
"grad_norm": 0.98046875,
"learning_rate": 0.008107781056121581,
"loss": 1.4843,
"step": 3770
},
{
"epoch": 0.5755286146584702,
"grad_norm": 1.015625,
"learning_rate": 0.00805937667059881,
"loss": 1.4712,
"step": 3780
},
{
"epoch": 0.5770511771311116,
"grad_norm": 1.0,
"learning_rate": 0.008011019487168192,
"loss": 1.473,
"step": 3790
},
{
"epoch": 0.5785737396037531,
"grad_norm": 1.0078125,
"learning_rate": 0.007962710682029245,
"loss": 1.4614,
"step": 3800
},
{
"epoch": 0.5800963020763946,
"grad_norm": 1.0859375,
"learning_rate": 0.007914451430204777,
"loss": 1.4484,
"step": 3810
},
{
"epoch": 0.581618864549036,
"grad_norm": 1.109375,
"learning_rate": 0.007866242905512305,
"loss": 1.4851,
"step": 3820
},
{
"epoch": 0.5831414270216775,
"grad_norm": 0.98046875,
"learning_rate": 0.007818086280535493,
"loss": 1.4475,
"step": 3830
},
{
"epoch": 0.584663989494319,
"grad_norm": 1.0703125,
"learning_rate": 0.007769982726595648,
"loss": 1.4766,
"step": 3840
},
{
"epoch": 0.5861865519669603,
"grad_norm": 1.015625,
"learning_rate": 0.007721933413723224,
"loss": 1.4819,
"step": 3850
},
{
"epoch": 0.5877091144396018,
"grad_norm": 1.109375,
"learning_rate": 0.007673939510629349,
"loss": 1.4827,
"step": 3860
},
{
"epoch": 0.5892316769122433,
"grad_norm": 1.1015625,
"learning_rate": 0.00762600218467742,
"loss": 1.4513,
"step": 3870
},
{
"epoch": 0.5907542393848848,
"grad_norm": 1.0078125,
"learning_rate": 0.007578122601854693,
"loss": 1.4446,
"step": 3880
},
{
"epoch": 0.5922768018575262,
"grad_norm": 1.015625,
"learning_rate": 0.0075303019267439365,
"loss": 1.459,
"step": 3890
},
{
"epoch": 0.5937993643301677,
"grad_norm": 1.03125,
"learning_rate": 0.007482541322495094,
"loss": 1.4533,
"step": 3900
},
{
"epoch": 0.5953219268028092,
"grad_norm": 1.0234375,
"learning_rate": 0.007434841950796987,
"loss": 1.4674,
"step": 3910
},
{
"epoch": 0.5968444892754505,
"grad_norm": 0.92578125,
"learning_rate": 0.007387204971849082,
"loss": 1.4355,
"step": 3920
},
{
"epoch": 0.598367051748092,
"grad_norm": 0.99609375,
"learning_rate": 0.00733963154433325,
"loss": 1.4452,
"step": 3930
},
{
"epoch": 0.5998896142207335,
"grad_norm": 1.0625,
"learning_rate": 0.007292122825385585,
"loss": 1.4334,
"step": 3940
},
{
"epoch": 0.601412176693375,
"grad_norm": 0.98828125,
"learning_rate": 0.007244679970568273,
"loss": 1.4053,
"step": 3950
},
{
"epoch": 0.6029347391660164,
"grad_norm": 1.1171875,
"learning_rate": 0.007197304133841477,
"loss": 1.457,
"step": 3960
},
{
"epoch": 0.6044573016386579,
"grad_norm": 1.015625,
"learning_rate": 0.007149996467535253,
"loss": 1.4421,
"step": 3970
},
{
"epoch": 0.6059798641112993,
"grad_norm": 1.0078125,
"learning_rate": 0.007102758122321557,
"loss": 1.4525,
"step": 3980
},
{
"epoch": 0.6075024265839408,
"grad_norm": 0.98828125,
"learning_rate": 0.007055590247186224,
"loss": 1.4069,
"step": 3990
},
{
"epoch": 0.6090249890565822,
"grad_norm": 1.0859375,
"learning_rate": 0.007008493989401038,
"loss": 1.4206,
"step": 4000
},
{
"epoch": 0.6090249890565822,
"eval_loss": 1.4739612340927124,
"eval_runtime": 330.8337,
"eval_samples_per_second": 51.337,
"eval_steps_per_second": 25.668,
"step": 4000
},
{
"epoch": 0.6105475515292237,
"grad_norm": 1.0234375,
"learning_rate": 0.006961470494495825,
"loss": 1.48,
"step": 4010
},
{
"epoch": 0.6120701140018652,
"grad_norm": 1.0625,
"learning_rate": 0.0069145209062305805,
"loss": 1.4043,
"step": 4020
},
{
"epoch": 0.6135926764745065,
"grad_norm": 0.96484375,
"learning_rate": 0.006867646366567665,
"loss": 1.4538,
"step": 4030
},
{
"epoch": 0.615115238947148,
"grad_norm": 1.0703125,
"learning_rate": 0.006820848015644018,
"loss": 1.4233,
"step": 4040
},
{
"epoch": 0.6166378014197895,
"grad_norm": 0.96875,
"learning_rate": 0.006774126991743424,
"loss": 1.4104,
"step": 4050
},
{
"epoch": 0.618160363892431,
"grad_norm": 1.0546875,
"learning_rate": 0.006727484431268831,
"loss": 1.4257,
"step": 4060
},
{
"epoch": 0.6196829263650724,
"grad_norm": 1.0703125,
"learning_rate": 0.006680921468714718,
"loss": 1.4148,
"step": 4070
},
{
"epoch": 0.6212054888377139,
"grad_norm": 1.0390625,
"learning_rate": 0.006634439236639473,
"loss": 1.4257,
"step": 4080
},
{
"epoch": 0.6227280513103554,
"grad_norm": 1.0078125,
"learning_rate": 0.006588038865637882,
"loss": 1.3982,
"step": 4090
},
{
"epoch": 0.6242506137829967,
"grad_norm": 1.09375,
"learning_rate": 0.0065417214843135965,
"loss": 1.4606,
"step": 4100
},
{
"epoch": 0.6257731762556382,
"grad_norm": 1.046875,
"learning_rate": 0.006495488219251705,
"loss": 1.41,
"step": 4110
},
{
"epoch": 0.6272957387282797,
"grad_norm": 1.046875,
"learning_rate": 0.006449340194991325,
"loss": 1.4392,
"step": 4120
},
{
"epoch": 0.6288183012009212,
"grad_norm": 0.98828125,
"learning_rate": 0.006403278533998237,
"loss": 1.4079,
"step": 4130
},
{
"epoch": 0.6303408636735626,
"grad_norm": 1.046875,
"learning_rate": 0.006357304356637605,
"loss": 1.4112,
"step": 4140
},
{
"epoch": 0.631863426146204,
"grad_norm": 1.0390625,
"learning_rate": 0.006311418781146709,
"loss": 1.4297,
"step": 4150
},
{
"epoch": 0.6333859886188455,
"grad_norm": 1.015625,
"learning_rate": 0.006265622923607759,
"loss": 1.4111,
"step": 4160
},
{
"epoch": 0.634908551091487,
"grad_norm": 1.0390625,
"learning_rate": 0.006219917897920726,
"loss": 1.4201,
"step": 4170
},
{
"epoch": 0.6364311135641284,
"grad_norm": 1.046875,
"learning_rate": 0.006174304815776282,
"loss": 1.4279,
"step": 4180
},
{
"epoch": 0.6379536760367699,
"grad_norm": 1.0703125,
"learning_rate": 0.0061287847866287205,
"loss": 1.4095,
"step": 4190
},
{
"epoch": 0.6394762385094114,
"grad_norm": 0.99609375,
"learning_rate": 0.006083358917669012,
"loss": 1.3751,
"step": 4200
},
{
"epoch": 0.6409988009820528,
"grad_norm": 0.96875,
"learning_rate": 0.00603802831379784,
"loss": 1.3945,
"step": 4210
},
{
"epoch": 0.6425213634546942,
"grad_norm": 1.03125,
"learning_rate": 0.005992794077598747,
"loss": 1.4182,
"step": 4220
},
{
"epoch": 0.6440439259273357,
"grad_norm": 1.0234375,
"learning_rate": 0.005947657309311306,
"loss": 1.4575,
"step": 4230
},
{
"epoch": 0.6455664883999772,
"grad_norm": 0.99609375,
"learning_rate": 0.005902619106804368,
"loss": 1.3849,
"step": 4240
},
{
"epoch": 0.6470890508726186,
"grad_norm": 1.09375,
"learning_rate": 0.005857680565549341,
"loss": 1.4159,
"step": 4250
},
{
"epoch": 0.6486116133452601,
"grad_norm": 1.0703125,
"learning_rate": 0.005812842778593572,
"loss": 1.4028,
"step": 4260
},
{
"epoch": 0.6501341758179016,
"grad_norm": 1.046875,
"learning_rate": 0.005768106836533726,
"loss": 1.4047,
"step": 4270
},
{
"epoch": 0.6516567382905429,
"grad_norm": 1.0625,
"learning_rate": 0.005723473827489301,
"loss": 1.4104,
"step": 4280
},
{
"epoch": 0.6531793007631844,
"grad_norm": 1.0625,
"learning_rate": 0.0056789448370761185,
"loss": 1.4118,
"step": 4290
},
{
"epoch": 0.6547018632358259,
"grad_norm": 0.99609375,
"learning_rate": 0.005634520948379951,
"loss": 1.3793,
"step": 4300
},
{
"epoch": 0.6562244257084674,
"grad_norm": 1.0,
"learning_rate": 0.005590203241930157,
"loss": 1.4074,
"step": 4310
},
{
"epoch": 0.6577469881811088,
"grad_norm": 1.03125,
"learning_rate": 0.005545992795673408,
"loss": 1.4017,
"step": 4320
},
{
"epoch": 0.6592695506537503,
"grad_norm": 1.0234375,
"learning_rate": 0.0055018906849474795,
"loss": 1.415,
"step": 4330
},
{
"epoch": 0.6607921131263917,
"grad_norm": 0.9765625,
"learning_rate": 0.005457897982455072,
"loss": 1.4034,
"step": 4340
},
{
"epoch": 0.6623146755990331,
"grad_norm": 1.0234375,
"learning_rate": 0.005414015758237733,
"loss": 1.4103,
"step": 4350
},
{
"epoch": 0.6638372380716746,
"grad_norm": 1.015625,
"learning_rate": 0.005370245079649841,
"loss": 1.372,
"step": 4360
},
{
"epoch": 0.6653598005443161,
"grad_norm": 1.046875,
"learning_rate": 0.005326587011332616,
"loss": 1.3938,
"step": 4370
},
{
"epoch": 0.6668823630169576,
"grad_norm": 1.0859375,
"learning_rate": 0.005283042615188249,
"loss": 1.3771,
"step": 4380
},
{
"epoch": 0.668404925489599,
"grad_norm": 1.1015625,
"learning_rate": 0.005239612950354074,
"loss": 1.3552,
"step": 4390
},
{
"epoch": 0.6699274879622404,
"grad_norm": 1.0703125,
"learning_rate": 0.005196299073176771,
"loss": 1.3651,
"step": 4400
},
{
"epoch": 0.6714500504348819,
"grad_norm": 1.0078125,
"learning_rate": 0.0051531020371867265,
"loss": 1.401,
"step": 4410
},
{
"epoch": 0.6729726129075234,
"grad_norm": 1.09375,
"learning_rate": 0.005110022893072361,
"loss": 1.3721,
"step": 4420
},
{
"epoch": 0.6744951753801648,
"grad_norm": 1.0078125,
"learning_rate": 0.0050670626886545975,
"loss": 1.3655,
"step": 4430
},
{
"epoch": 0.6760177378528063,
"grad_norm": 1.03125,
"learning_rate": 0.005024222468861377,
"loss": 1.3572,
"step": 4440
},
{
"epoch": 0.6775403003254478,
"grad_norm": 1.1015625,
"learning_rate": 0.0049815032757022275,
"loss": 1.3766,
"step": 4450
},
{
"epoch": 0.6790628627980891,
"grad_norm": 1.0625,
"learning_rate": 0.004938906148242921,
"loss": 1.3632,
"step": 4460
},
{
"epoch": 0.6805854252707306,
"grad_norm": 1.0546875,
"learning_rate": 0.004896432122580222,
"loss": 1.3825,
"step": 4470
},
{
"epoch": 0.6821079877433721,
"grad_norm": 1.0625,
"learning_rate": 0.00485408223181666,
"loss": 1.3533,
"step": 4480
},
{
"epoch": 0.6836305502160136,
"grad_norm": 1.046875,
"learning_rate": 0.004811857506035406,
"loss": 1.3628,
"step": 4490
},
{
"epoch": 0.685153112688655,
"grad_norm": 1.0,
"learning_rate": 0.0047697589722752445,
"loss": 1.3787,
"step": 4500
},
{
"epoch": 0.685153112688655,
"eval_loss": 1.4034804105758667,
"eval_runtime": 332.3993,
"eval_samples_per_second": 51.095,
"eval_steps_per_second": 25.548,
"step": 4500
},
{
"epoch": 0.6866756751612965,
"grad_norm": 1.03125,
"learning_rate": 0.004727787654505539,
"loss": 1.3787,
"step": 4510
},
{
"epoch": 0.688198237633938,
"grad_norm": 1.03125,
"learning_rate": 0.0046859445736013895,
"loss": 1.3493,
"step": 4520
},
{
"epoch": 0.6897208001065793,
"grad_norm": 1.0390625,
"learning_rate": 0.00464423074731875,
"loss": 1.3848,
"step": 4530
},
{
"epoch": 0.6912433625792208,
"grad_norm": 0.99609375,
"learning_rate": 0.004602647190269701,
"loss": 1.3502,
"step": 4540
},
{
"epoch": 0.6927659250518623,
"grad_norm": 1.0859375,
"learning_rate": 0.004561194913897766,
"loss": 1.4067,
"step": 4550
},
{
"epoch": 0.6942884875245038,
"grad_norm": 1.0078125,
"learning_rate": 0.004519874926453302,
"loss": 1.3637,
"step": 4560
},
{
"epoch": 0.6958110499971452,
"grad_norm": 1.109375,
"learning_rate": 0.004478688232968981,
"loss": 1.3609,
"step": 4570
},
{
"epoch": 0.6973336124697866,
"grad_norm": 1.0625,
"learning_rate": 0.004437635835235353,
"loss": 1.3606,
"step": 4580
},
{
"epoch": 0.6988561749424281,
"grad_norm": 1.0078125,
"learning_rate": 0.0043967187317764615,
"loss": 1.3905,
"step": 4590
},
{
"epoch": 0.7003787374150695,
"grad_norm": 1.0390625,
"learning_rate": 0.004355937917825566,
"loss": 1.3614,
"step": 4600
},
{
"epoch": 0.701901299887711,
"grad_norm": 1.03125,
"learning_rate": 0.004315294385300951,
"loss": 1.364,
"step": 4610
},
{
"epoch": 0.7034238623603525,
"grad_norm": 1.0078125,
"learning_rate": 0.004274789122781753,
"loss": 1.3518,
"step": 4620
},
{
"epoch": 0.704946424832994,
"grad_norm": 1.0546875,
"learning_rate": 0.004234423115483971,
"loss": 1.3544,
"step": 4630
},
{
"epoch": 0.7064689873056353,
"grad_norm": 1.046875,
"learning_rate": 0.004194197345236467,
"loss": 1.3599,
"step": 4640
},
{
"epoch": 0.7079915497782768,
"grad_norm": 1.0546875,
"learning_rate": 0.004154112790457089,
"loss": 1.3559,
"step": 4650
},
{
"epoch": 0.7095141122509183,
"grad_norm": 1.0703125,
"learning_rate": 0.0041141704261288955,
"loss": 1.362,
"step": 4660
},
{
"epoch": 0.7110366747235598,
"grad_norm": 1.1015625,
"learning_rate": 0.004074371223776407,
"loss": 1.3175,
"step": 4670
},
{
"epoch": 0.7125592371962012,
"grad_norm": 1.0390625,
"learning_rate": 0.004034716151441996,
"loss": 1.3564,
"step": 4680
},
{
"epoch": 0.7140817996688427,
"grad_norm": 1.0390625,
"learning_rate": 0.003995206173662348,
"loss": 1.3867,
"step": 4690
},
{
"epoch": 0.7156043621414842,
"grad_norm": 1.078125,
"learning_rate": 0.003955842251444978,
"loss": 1.3807,
"step": 4700
},
{
"epoch": 0.7171269246141255,
"grad_norm": 0.98828125,
"learning_rate": 0.003916625342244869,
"loss": 1.3359,
"step": 4710
},
{
"epoch": 0.718649487086767,
"grad_norm": 1.0546875,
"learning_rate": 0.0038775563999411955,
"loss": 1.3355,
"step": 4720
},
{
"epoch": 0.7201720495594085,
"grad_norm": 0.98828125,
"learning_rate": 0.0038386363748140894,
"loss": 1.3593,
"step": 4730
},
{
"epoch": 0.72169461203205,
"grad_norm": 1.0390625,
"learning_rate": 0.003799866213521568,
"loss": 1.3215,
"step": 4740
},
{
"epoch": 0.7232171745046914,
"grad_norm": 1.078125,
"learning_rate": 0.0037612468590764695,
"loss": 1.3447,
"step": 4750
},
{
"epoch": 0.7247397369773328,
"grad_norm": 1.0703125,
"learning_rate": 0.003722779250823538,
"loss": 1.344,
"step": 4760
},
{
"epoch": 0.7262622994499743,
"grad_norm": 1.0078125,
"learning_rate": 0.0036844643244165775,
"loss": 1.3388,
"step": 4770
},
{
"epoch": 0.7277848619226157,
"grad_norm": 1.078125,
"learning_rate": 0.0036463030117956795,
"loss": 1.3718,
"step": 4780
},
{
"epoch": 0.7293074243952572,
"grad_norm": 1.140625,
"learning_rate": 0.0036082962411645614,
"loss": 1.3073,
"step": 4790
},
{
"epoch": 0.7308299868678987,
"grad_norm": 1.0625,
"learning_rate": 0.0035704449369680005,
"loss": 1.3638,
"step": 4800
},
{
"epoch": 0.7323525493405402,
"grad_norm": 1.0625,
"learning_rate": 0.0035327500198693287,
"loss": 1.3734,
"step": 4810
},
{
"epoch": 0.7338751118131815,
"grad_norm": 1.0078125,
"learning_rate": 0.0034952124067280555,
"loss": 1.3498,
"step": 4820
},
{
"epoch": 0.735397674285823,
"grad_norm": 1.0078125,
"learning_rate": 0.003457833010577558,
"loss": 1.3421,
"step": 4830
},
{
"epoch": 0.7369202367584645,
"grad_norm": 1.0234375,
"learning_rate": 0.003420612740602874,
"loss": 1.3433,
"step": 4840
},
{
"epoch": 0.738442799231106,
"grad_norm": 1.015625,
"learning_rate": 0.003383552502118602,
"loss": 1.3657,
"step": 4850
},
{
"epoch": 0.7399653617037474,
"grad_norm": 1.0546875,
"learning_rate": 0.003346653196546855,
"loss": 1.3565,
"step": 4860
},
{
"epoch": 0.7414879241763889,
"grad_norm": 0.99609375,
"learning_rate": 0.0033099157213953502,
"loss": 1.33,
"step": 4870
},
{
"epoch": 0.7430104866490304,
"grad_norm": 1.0859375,
"learning_rate": 0.00327334097023559,
"loss": 1.3577,
"step": 4880
},
{
"epoch": 0.7445330491216717,
"grad_norm": 1.09375,
"learning_rate": 0.0032369298326811024,
"loss": 1.3191,
"step": 4890
},
{
"epoch": 0.7460556115943132,
"grad_norm": 1.0234375,
"learning_rate": 0.0032006831943658153,
"loss": 1.3329,
"step": 4900
},
{
"epoch": 0.7475781740669547,
"grad_norm": 1.1171875,
"learning_rate": 0.003164601936922528,
"loss": 1.3318,
"step": 4910
},
{
"epoch": 0.7491007365395962,
"grad_norm": 1.046875,
"learning_rate": 0.003128686937961438,
"loss": 1.3116,
"step": 4920
},
{
"epoch": 0.7506232990122376,
"grad_norm": 1.0390625,
"learning_rate": 0.0030929390710488303,
"loss": 1.3419,
"step": 4930
},
{
"epoch": 0.752145861484879,
"grad_norm": 1.0390625,
"learning_rate": 0.003057359205685788,
"loss": 1.3215,
"step": 4940
},
{
"epoch": 0.7536684239575205,
"grad_norm": 1.0625,
"learning_rate": 0.0030219482072870764,
"loss": 1.3114,
"step": 4950
},
{
"epoch": 0.7551909864301619,
"grad_norm": 1.0859375,
"learning_rate": 0.0029867069371600895,
"loss": 1.3132,
"step": 4960
},
{
"epoch": 0.7567135489028034,
"grad_norm": 1.0,
"learning_rate": 0.0029516362524838846,
"loss": 1.3154,
"step": 4970
},
{
"epoch": 0.7582361113754449,
"grad_norm": 1.078125,
"learning_rate": 0.0029167370062883405,
"loss": 1.3152,
"step": 4980
},
{
"epoch": 0.7597586738480864,
"grad_norm": 1.0625,
"learning_rate": 0.0028820100474334187,
"loss": 1.32,
"step": 4990
},
{
"epoch": 0.7612812363207277,
"grad_norm": 1.109375,
"learning_rate": 0.002847456220588498,
"loss": 1.3324,
"step": 5000
},
{
"epoch": 0.7612812363207277,
"eval_loss": 1.3596354722976685,
"eval_runtime": 330.8649,
"eval_samples_per_second": 51.332,
"eval_steps_per_second": 25.666,
"step": 5000
},
{
"epoch": 0.7628037987933692,
"grad_norm": 0.99609375,
"learning_rate": 0.0028130763662118498,
"loss": 1.3181,
"step": 5010
},
{
"epoch": 0.7643263612660107,
"grad_norm": 1.03125,
"learning_rate": 0.0027788713205301775,
"loss": 1.317,
"step": 5020
},
{
"epoch": 0.7658489237386521,
"grad_norm": 1.0234375,
"learning_rate": 0.0027448419155182858,
"loss": 1.3055,
"step": 5030
},
{
"epoch": 0.7673714862112936,
"grad_norm": 1.09375,
"learning_rate": 0.002710988978878851,
"loss": 1.3281,
"step": 5040
},
{
"epoch": 0.7688940486839351,
"grad_norm": 1.0234375,
"learning_rate": 0.002677313334022268,
"loss": 1.3368,
"step": 5050
},
{
"epoch": 0.7704166111565766,
"grad_norm": 1.0,
"learning_rate": 0.0026438158000466404,
"loss": 1.3504,
"step": 5060
},
{
"epoch": 0.7719391736292179,
"grad_norm": 1.0703125,
"learning_rate": 0.002610497191717861,
"loss": 1.3228,
"step": 5070
},
{
"epoch": 0.7734617361018594,
"grad_norm": 1.1484375,
"learning_rate": 0.0025773583194497705,
"loss": 1.3179,
"step": 5080
},
{
"epoch": 0.7749842985745009,
"grad_norm": 1.0078125,
"learning_rate": 0.002544399989284476,
"loss": 1.3058,
"step": 5090
},
{
"epoch": 0.7765068610471424,
"grad_norm": 1.0703125,
"learning_rate": 0.0025116230028727183,
"loss": 1.34,
"step": 5100
},
{
"epoch": 0.7780294235197838,
"grad_norm": 1.09375,
"learning_rate": 0.002479028157454387,
"loss": 1.3391,
"step": 5110
},
{
"epoch": 0.7795519859924253,
"grad_norm": 1.0390625,
"learning_rate": 0.002446616245839136,
"loss": 1.3133,
"step": 5120
},
{
"epoch": 0.7810745484650667,
"grad_norm": 1.0625,
"learning_rate": 0.002414388056387079,
"loss": 1.3257,
"step": 5130
},
{
"epoch": 0.7825971109377081,
"grad_norm": 1.0234375,
"learning_rate": 0.00238234437298963,
"loss": 1.3274,
"step": 5140
},
{
"epoch": 0.7841196734103496,
"grad_norm": 1.0390625,
"learning_rate": 0.0023504859750504425,
"loss": 1.33,
"step": 5150
},
{
"epoch": 0.7856422358829911,
"grad_norm": 1.03125,
"learning_rate": 0.0023188136374664224,
"loss": 1.3209,
"step": 5160
},
{
"epoch": 0.7871647983556326,
"grad_norm": 1.125,
"learning_rate": 0.002287328130608919,
"loss": 1.3023,
"step": 5170
},
{
"epoch": 0.788687360828274,
"grad_norm": 1.109375,
"learning_rate": 0.0022560302203049575,
"loss": 1.317,
"step": 5180
},
{
"epoch": 0.7902099233009154,
"grad_norm": 1.0625,
"learning_rate": 0.0022249206678186216,
"loss": 1.316,
"step": 5190
},
{
"epoch": 0.7917324857735569,
"grad_norm": 1.046875,
"learning_rate": 0.002194000229832547,
"loss": 1.3339,
"step": 5200
},
{
"epoch": 0.7932550482461983,
"grad_norm": 1.046875,
"learning_rate": 0.0021632696584294965,
"loss": 1.2943,
"step": 5210
},
{
"epoch": 0.7947776107188398,
"grad_norm": 1.0703125,
"learning_rate": 0.0021327297010740797,
"loss": 1.3415,
"step": 5220
},
{
"epoch": 0.7963001731914813,
"grad_norm": 1.0546875,
"learning_rate": 0.002102381100594577,
"loss": 1.3094,
"step": 5230
},
{
"epoch": 0.7978227356641228,
"grad_norm": 1.0234375,
"learning_rate": 0.002072224595164859,
"loss": 1.3142,
"step": 5240
},
{
"epoch": 0.7993452981367641,
"grad_norm": 1.0390625,
"learning_rate": 0.0020422609182864336,
"loss": 1.3341,
"step": 5250
},
{
"epoch": 0.8008678606094056,
"grad_norm": 1.1171875,
"learning_rate": 0.0020124907987706243,
"loss": 1.3107,
"step": 5260
},
{
"epoch": 0.8023904230820471,
"grad_norm": 1.015625,
"learning_rate": 0.0019829149607208064,
"loss": 1.3188,
"step": 5270
},
{
"epoch": 0.8039129855546885,
"grad_norm": 1.1171875,
"learning_rate": 0.0019535341235148353,
"loss": 1.3194,
"step": 5280
},
{
"epoch": 0.80543554802733,
"grad_norm": 1.078125,
"learning_rate": 0.0019243490017875164,
"loss": 1.3211,
"step": 5290
},
{
"epoch": 0.8069581104999715,
"grad_norm": 1.078125,
"learning_rate": 0.0018953603054132429,
"loss": 1.3,
"step": 5300
},
{
"epoch": 0.808480672972613,
"grad_norm": 1.0546875,
"learning_rate": 0.0018665687394887232,
"loss": 1.2797,
"step": 5310
},
{
"epoch": 0.8100032354452543,
"grad_norm": 1.109375,
"learning_rate": 0.001837975004315826,
"loss": 1.3289,
"step": 5320
},
{
"epoch": 0.8115257979178958,
"grad_norm": 1.0625,
"learning_rate": 0.0018095797953845505,
"loss": 1.2976,
"step": 5330
},
{
"epoch": 0.8130483603905373,
"grad_norm": 1.0390625,
"learning_rate": 0.0017813838033561191,
"loss": 1.2846,
"step": 5340
},
{
"epoch": 0.8145709228631788,
"grad_norm": 1.0859375,
"learning_rate": 0.0017533877140461585,
"loss": 1.3132,
"step": 5350
},
{
"epoch": 0.8160934853358202,
"grad_norm": 1.0703125,
"learning_rate": 0.0017255922084080367,
"loss": 1.3321,
"step": 5360
},
{
"epoch": 0.8176160478084616,
"grad_norm": 1.0859375,
"learning_rate": 0.0016979979625162888,
"loss": 1.3128,
"step": 5370
},
{
"epoch": 0.8191386102811031,
"grad_norm": 1.0390625,
"learning_rate": 0.0016706056475501764,
"loss": 1.3186,
"step": 5380
},
{
"epoch": 0.8206611727537445,
"grad_norm": 1.1796875,
"learning_rate": 0.00164341592977737,
"loss": 1.2902,
"step": 5390
},
{
"epoch": 0.822183735226386,
"grad_norm": 1.03125,
"learning_rate": 0.0016164294705377292,
"loss": 1.3005,
"step": 5400
},
{
"epoch": 0.8237062976990275,
"grad_norm": 1.1015625,
"learning_rate": 0.0015896469262272218,
"loss": 1.2982,
"step": 5410
},
{
"epoch": 0.825228860171669,
"grad_norm": 1.078125,
"learning_rate": 0.0015630689482819715,
"loss": 1.3086,
"step": 5420
},
{
"epoch": 0.8267514226443103,
"grad_norm": 1.03125,
"learning_rate": 0.0015366961831623882,
"loss": 1.2775,
"step": 5430
},
{
"epoch": 0.8282739851169518,
"grad_norm": 1.03125,
"learning_rate": 0.0015105292723374632,
"loss": 1.2755,
"step": 5440
},
{
"epoch": 0.8297965475895933,
"grad_norm": 1.109375,
"learning_rate": 0.0014845688522691647,
"loss": 1.3371,
"step": 5450
},
{
"epoch": 0.8313191100622347,
"grad_norm": 1.0703125,
"learning_rate": 0.0014588155543969461,
"loss": 1.3178,
"step": 5460
},
{
"epoch": 0.8328416725348762,
"grad_norm": 1.125,
"learning_rate": 0.001433270005122399,
"loss": 1.3009,
"step": 5470
},
{
"epoch": 0.8343642350075177,
"grad_norm": 1.1015625,
"learning_rate": 0.0014079328257940104,
"loss": 1.2848,
"step": 5480
},
{
"epoch": 0.8358867974801591,
"grad_norm": 1.0546875,
"learning_rate": 0.0013828046326920496,
"loss": 1.3258,
"step": 5490
},
{
"epoch": 0.8374093599528005,
"grad_norm": 1.0546875,
"learning_rate": 0.0013578860370135881,
"loss": 1.3133,
"step": 5500
},
{
"epoch": 0.8374093599528005,
"eval_loss": 1.3418630361557007,
"eval_runtime": 330.9458,
"eval_samples_per_second": 51.32,
"eval_steps_per_second": 25.66,
"step": 5500
},
{
"epoch": 0.838931922425442,
"grad_norm": 1.0078125,
"learning_rate": 0.0013331776448576194,
"loss": 1.303,
"step": 5510
},
{
"epoch": 0.8404544848980835,
"grad_norm": 1.09375,
"learning_rate": 0.001308680057210322,
"loss": 1.3289,
"step": 5520
},
{
"epoch": 0.841977047370725,
"grad_norm": 1.09375,
"learning_rate": 0.001284393869930448,
"loss": 1.3162,
"step": 5530
},
{
"epoch": 0.8434996098433664,
"grad_norm": 1.0078125,
"learning_rate": 0.001260319673734821,
"loss": 1.2969,
"step": 5540
},
{
"epoch": 0.8450221723160078,
"grad_norm": 1.0390625,
"learning_rate": 0.0012364580541839698,
"loss": 1.339,
"step": 5550
},
{
"epoch": 0.8465447347886493,
"grad_norm": 1.1015625,
"learning_rate": 0.0012128095916678927,
"loss": 1.3151,
"step": 5560
},
{
"epoch": 0.8480672972612907,
"grad_norm": 1.0703125,
"learning_rate": 0.001189374861391932,
"loss": 1.293,
"step": 5570
},
{
"epoch": 0.8495898597339322,
"grad_norm": 1.0390625,
"learning_rate": 0.0011661544333627849,
"loss": 1.3107,
"step": 5580
},
{
"epoch": 0.8511124222065737,
"grad_norm": 1.125,
"learning_rate": 0.001143148872374643,
"loss": 1.3048,
"step": 5590
},
{
"epoch": 0.8526349846792152,
"grad_norm": 1.078125,
"learning_rate": 0.0011203587379954505,
"loss": 1.3071,
"step": 5600
},
{
"epoch": 0.8541575471518565,
"grad_norm": 1.0625,
"learning_rate": 0.0010977845845533008,
"loss": 1.3039,
"step": 5610
},
{
"epoch": 0.855680109624498,
"grad_norm": 1.046875,
"learning_rate": 0.0010754269611229427,
"loss": 1.2884,
"step": 5620
},
{
"epoch": 0.8572026720971395,
"grad_norm": 1.09375,
"learning_rate": 0.0010532864115124318,
"loss": 1.3189,
"step": 5630
},
{
"epoch": 0.8587252345697809,
"grad_norm": 1.1328125,
"learning_rate": 0.0010313634742499067,
"loss": 1.3361,
"step": 5640
},
{
"epoch": 0.8602477970424224,
"grad_norm": 1.03125,
"learning_rate": 0.00100965868257048,
"loss": 1.3082,
"step": 5650
},
{
"epoch": 0.8617703595150639,
"grad_norm": 1.0390625,
"learning_rate": 0.0009881725644032757,
"loss": 1.2981,
"step": 5660
},
{
"epoch": 0.8632929219877054,
"grad_norm": 1.078125,
"learning_rate": 0.0009669056423585932,
"loss": 1.3029,
"step": 5670
},
{
"epoch": 0.8648154844603467,
"grad_norm": 1.03125,
"learning_rate": 0.000945858433715181,
"loss": 1.2981,
"step": 5680
},
{
"epoch": 0.8663380469329882,
"grad_norm": 1.1015625,
"learning_rate": 0.0009250314504076684,
"loss": 1.3538,
"step": 5690
},
{
"epoch": 0.8678606094056297,
"grad_norm": 1.109375,
"learning_rate": 0.0009044251990141061,
"loss": 1.3062,
"step": 5700
},
{
"epoch": 0.8693831718782711,
"grad_norm": 1.0390625,
"learning_rate": 0.000884040180743646,
"loss": 1.2683,
"step": 5710
},
{
"epoch": 0.8709057343509126,
"grad_norm": 1.0625,
"learning_rate": 0.0008638768914243589,
"loss": 1.2986,
"step": 5720
},
{
"epoch": 0.872428296823554,
"grad_norm": 1.0546875,
"learning_rate": 0.0008439358214911586,
"loss": 1.3151,
"step": 5730
},
{
"epoch": 0.8739508592961955,
"grad_norm": 0.99609375,
"learning_rate": 0.0008242174559738802,
"loss": 1.3356,
"step": 5740
},
{
"epoch": 0.8754734217688369,
"grad_norm": 1.1796875,
"learning_rate": 0.0008047222744854942,
"loss": 1.2749,
"step": 5750
},
{
"epoch": 0.8769959842414784,
"grad_norm": 1.0703125,
"learning_rate": 0.0007854507512104192,
"loss": 1.3067,
"step": 5760
},
{
"epoch": 0.8785185467141199,
"grad_norm": 1.03125,
"learning_rate": 0.0007664033548930016,
"loss": 1.3232,
"step": 5770
},
{
"epoch": 0.8800411091867614,
"grad_norm": 1.1484375,
"learning_rate": 0.000747580548826119,
"loss": 1.3296,
"step": 5780
},
{
"epoch": 0.8815636716594027,
"grad_norm": 1.0703125,
"learning_rate": 0.000728982790839895,
"loss": 1.2851,
"step": 5790
},
{
"epoch": 0.8830862341320442,
"grad_norm": 1.0703125,
"learning_rate": 0.0007106105332905777,
"loss": 1.2947,
"step": 5800
},
{
"epoch": 0.8846087966046857,
"grad_norm": 1.09375,
"learning_rate": 0.0006924642230495315,
"loss": 1.3003,
"step": 5810
},
{
"epoch": 0.8861313590773271,
"grad_norm": 1.078125,
"learning_rate": 0.0006745443014923658,
"loss": 1.295,
"step": 5820
},
{
"epoch": 0.8876539215499686,
"grad_norm": 1.1015625,
"learning_rate": 0.0006568512044882057,
"loss": 1.3155,
"step": 5830
},
{
"epoch": 0.8891764840226101,
"grad_norm": 1.03125,
"learning_rate": 0.0006393853623890833,
"loss": 1.3215,
"step": 5840
},
{
"epoch": 0.8906990464952516,
"grad_norm": 1.046875,
"learning_rate": 0.0006221472000194739,
"loss": 1.2914,
"step": 5850
},
{
"epoch": 0.8922216089678929,
"grad_norm": 1.1015625,
"learning_rate": 0.0006051371366659642,
"loss": 1.3099,
"step": 5860
},
{
"epoch": 0.8937441714405344,
"grad_norm": 1.1171875,
"learning_rate": 0.0005883555860670487,
"loss": 1.3034,
"step": 5870
},
{
"epoch": 0.8952667339131759,
"grad_norm": 1.0234375,
"learning_rate": 0.0005718029564030702,
"loss": 1.3146,
"step": 5880
},
{
"epoch": 0.8967892963858173,
"grad_norm": 1.1171875,
"learning_rate": 0.0005554796502862957,
"loss": 1.3074,
"step": 5890
},
{
"epoch": 0.8983118588584588,
"grad_norm": 1.1484375,
"learning_rate": 0.0005393860647511129,
"loss": 1.3068,
"step": 5900
},
{
"epoch": 0.8998344213311003,
"grad_norm": 1.0546875,
"learning_rate": 0.0005235225912443808,
"loss": 1.3098,
"step": 5910
},
{
"epoch": 0.9013569838037417,
"grad_norm": 1.0234375,
"learning_rate": 0.0005078896156159074,
"loss": 1.3097,
"step": 5920
},
{
"epoch": 0.9028795462763831,
"grad_norm": 1.1171875,
"learning_rate": 0.0004924875181090627,
"loss": 1.3344,
"step": 5930
},
{
"epoch": 0.9044021087490246,
"grad_norm": 1.046875,
"learning_rate": 0.00047731667335153326,
"loss": 1.2884,
"step": 5940
},
{
"epoch": 0.9059246712216661,
"grad_norm": 1.0078125,
"learning_rate": 0.0004623774503462064,
"loss": 1.2927,
"step": 5950
},
{
"epoch": 0.9074472336943075,
"grad_norm": 1.03125,
"learning_rate": 0.0004476702124621956,
"loss": 1.3137,
"step": 5960
},
{
"epoch": 0.908969796166949,
"grad_norm": 1.1015625,
"learning_rate": 0.00043319531742600507,
"loss": 1.3188,
"step": 5970
},
{
"epoch": 0.9104923586395904,
"grad_norm": 1.0,
"learning_rate": 0.0004189531173128258,
"loss": 1.3023,
"step": 5980
},
{
"epoch": 0.9120149211122319,
"grad_norm": 1.15625,
"learning_rate": 0.0004049439585379733,
"loss": 1.3314,
"step": 5990
},
{
"epoch": 0.9135374835848733,
"grad_norm": 1.046875,
"learning_rate": 0.00039116818184846137,
"loss": 1.2929,
"step": 6000
},
{
"epoch": 0.9135374835848733,
"eval_loss": 1.338244915008545,
"eval_runtime": 331.5876,
"eval_samples_per_second": 51.22,
"eval_steps_per_second": 25.61,
"step": 6000
},
{
"epoch": 0.9150600460575148,
"grad_norm": 1.0234375,
"learning_rate": 0.0003776261223147126,
"loss": 1.2987,
"step": 6010
},
{
"epoch": 0.9165826085301563,
"grad_norm": 1.046875,
"learning_rate": 0.00036431810932241015,
"loss": 1.306,
"step": 6020
},
{
"epoch": 0.9181051710027978,
"grad_norm": 1.1171875,
"learning_rate": 0.0003512444665644865,
"loss": 1.3156,
"step": 6030
},
{
"epoch": 0.9196277334754391,
"grad_norm": 1.109375,
"learning_rate": 0.00033840551203324855,
"loss": 1.338,
"step": 6040
},
{
"epoch": 0.9211502959480806,
"grad_norm": 1.109375,
"learning_rate": 0.000325801558012645,
"loss": 1.3182,
"step": 6050
},
{
"epoch": 0.9226728584207221,
"grad_norm": 1.0546875,
"learning_rate": 0.0003134329110706691,
"loss": 1.301,
"step": 6060
},
{
"epoch": 0.9241954208933635,
"grad_norm": 1.109375,
"learning_rate": 0.0003012998720519011,
"loss": 1.3151,
"step": 6070
},
{
"epoch": 0.925717983366005,
"grad_norm": 1.140625,
"learning_rate": 0.0002894027360701945,
"loss": 1.3417,
"step": 6080
},
{
"epoch": 0.9272405458386465,
"grad_norm": 1.1015625,
"learning_rate": 0.0002777417925014913,
"loss": 1.321,
"step": 6090
},
{
"epoch": 0.9287631083112879,
"grad_norm": 1.046875,
"learning_rate": 0.0002663173249767936,
"loss": 1.3347,
"step": 6100
},
{
"epoch": 0.9302856707839293,
"grad_norm": 1.171875,
"learning_rate": 0.00025512961137525217,
"loss": 1.2924,
"step": 6110
},
{
"epoch": 0.9318082332565708,
"grad_norm": 1.125,
"learning_rate": 0.00024417892381741857,
"loss": 1.3502,
"step": 6120
},
{
"epoch": 0.9333307957292123,
"grad_norm": 1.0546875,
"learning_rate": 0.00023346552865862182,
"loss": 1.2897,
"step": 6130
},
{
"epoch": 0.9348533582018537,
"grad_norm": 1.0703125,
"learning_rate": 0.0002229896864824865,
"loss": 1.3307,
"step": 6140
},
{
"epoch": 0.9363759206744952,
"grad_norm": 1.046875,
"learning_rate": 0.00021275165209460047,
"loss": 1.2939,
"step": 6150
},
{
"epoch": 0.9378984831471366,
"grad_norm": 1.03125,
"learning_rate": 0.00020275167451631716,
"loss": 1.2911,
"step": 6160
},
{
"epoch": 0.9394210456197781,
"grad_norm": 1.078125,
"learning_rate": 0.00019298999697868967,
"loss": 1.3012,
"step": 6170
},
{
"epoch": 0.9409436080924195,
"grad_norm": 1.046875,
"learning_rate": 0.00018346685691656762,
"loss": 1.2888,
"step": 6180
},
{
"epoch": 0.942466170565061,
"grad_norm": 1.1171875,
"learning_rate": 0.0001741824859628116,
"loss": 1.2922,
"step": 6190
},
{
"epoch": 0.9439887330377025,
"grad_norm": 1.109375,
"learning_rate": 0.0001651371099426624,
"loss": 1.335,
"step": 6200
},
{
"epoch": 0.945511295510344,
"grad_norm": 1.0625,
"learning_rate": 0.00015633094886825184,
"loss": 1.3128,
"step": 6210
},
{
"epoch": 0.9470338579829853,
"grad_norm": 1.0625,
"learning_rate": 0.00014776421693324604,
"loss": 1.2918,
"step": 6220
},
{
"epoch": 0.9485564204556268,
"grad_norm": 1.0703125,
"learning_rate": 0.00013943712250763851,
"loss": 1.2868,
"step": 6230
},
{
"epoch": 0.9500789829282683,
"grad_norm": 1.0234375,
"learning_rate": 0.00013134986813267968,
"loss": 1.3157,
"step": 6240
},
{
"epoch": 0.9516015454009097,
"grad_norm": 1.078125,
"learning_rate": 0.00012350265051595534,
"loss": 1.2916,
"step": 6250
},
{
"epoch": 0.9531241078735512,
"grad_norm": 1.1015625,
"learning_rate": 0.00011589566052659594,
"loss": 1.3327,
"step": 6260
},
{
"epoch": 0.9546466703461927,
"grad_norm": 1.140625,
"learning_rate": 0.00010852908319063826,
"loss": 1.3101,
"step": 6270
},
{
"epoch": 0.9561692328188341,
"grad_norm": 1.0703125,
"learning_rate": 0.00010140309768652211,
"loss": 1.2918,
"step": 6280
},
{
"epoch": 0.9576917952914755,
"grad_norm": 1.03125,
"learning_rate": 9.451787734073514e-05,
"loss": 1.2713,
"step": 6290
},
{
"epoch": 0.959214357764117,
"grad_norm": 1.046875,
"learning_rate": 8.787358962359493e-05,
"loss": 1.3014,
"step": 6300
},
{
"epoch": 0.9607369202367585,
"grad_norm": 1.09375,
"learning_rate": 8.147039614517571e-05,
"loss": 1.3265,
"step": 6310
},
{
"epoch": 0.9622594827093999,
"grad_norm": 1.09375,
"learning_rate": 7.53084526513781e-05,
"loss": 1.2993,
"step": 6320
},
{
"epoch": 0.9637820451820414,
"grad_norm": 1.0546875,
"learning_rate": 6.938790902014325e-05,
"loss": 1.3177,
"step": 6330
},
{
"epoch": 0.9653046076546828,
"grad_norm": 1.0625,
"learning_rate": 6.370890925779915e-05,
"loss": 1.2914,
"step": 6340
},
{
"epoch": 0.9668271701273243,
"grad_norm": 1.0859375,
"learning_rate": 5.827159149556893e-05,
"loss": 1.3135,
"step": 6350
},
{
"epoch": 0.9683497325999657,
"grad_norm": 1.078125,
"learning_rate": 5.307608798620245e-05,
"loss": 1.3195,
"step": 6360
},
{
"epoch": 0.9698722950726072,
"grad_norm": 1.0625,
"learning_rate": 4.8122525100765534e-05,
"loss": 1.3112,
"step": 6370
},
{
"epoch": 0.9713948575452487,
"grad_norm": 1.03125,
"learning_rate": 4.341102332556024e-05,
"loss": 1.3055,
"step": 6380
},
{
"epoch": 0.9729174200178901,
"grad_norm": 1.0546875,
"learning_rate": 3.8941697259199385e-05,
"loss": 1.2961,
"step": 6390
},
{
"epoch": 0.9744399824905315,
"grad_norm": 1.015625,
"learning_rate": 3.471465560981768e-05,
"loss": 1.3145,
"step": 6400
},
{
"epoch": 0.975962544963173,
"grad_norm": 1.0,
"learning_rate": 3.073000119242608e-05,
"loss": 1.3125,
"step": 6410
},
{
"epoch": 0.9774851074358145,
"grad_norm": 1.046875,
"learning_rate": 2.6987830926412658e-05,
"loss": 1.3009,
"step": 6420
},
{
"epoch": 0.9790076699084559,
"grad_norm": 1.1015625,
"learning_rate": 2.348823583318338e-05,
"loss": 1.3013,
"step": 6430
},
{
"epoch": 0.9805302323810974,
"grad_norm": 1.1015625,
"learning_rate": 2.0231301033951655e-05,
"loss": 1.3234,
"step": 6440
},
{
"epoch": 0.9820527948537389,
"grad_norm": 1.078125,
"learning_rate": 1.721710574766333e-05,
"loss": 1.2958,
"step": 6450
},
{
"epoch": 0.9835753573263804,
"grad_norm": 1.078125,
"learning_rate": 1.4445723289072676e-05,
"loss": 1.2891,
"step": 6460
},
{
"epoch": 0.9850979197990217,
"grad_norm": 1.1640625,
"learning_rate": 1.191722106696158e-05,
"loss": 1.3166,
"step": 6470
},
{
"epoch": 0.9866204822716632,
"grad_norm": 1.0703125,
"learning_rate": 9.631660582491986e-06,
"loss": 1.322,
"step": 6480
},
{
"epoch": 0.9881430447443047,
"grad_norm": 1.0703125,
"learning_rate": 7.589097427720404e-06,
"loss": 1.3266,
"step": 6490
},
{
"epoch": 0.9896656072169461,
"grad_norm": 1.1015625,
"learning_rate": 5.789581284235679e-06,
"loss": 1.3226,
"step": 6500
},
{
"epoch": 0.9896656072169461,
"eval_loss": 1.3380595445632935,
"eval_runtime": 333.6502,
"eval_samples_per_second": 50.904,
"eval_steps_per_second": 25.452,
"step": 6500
},
{
"epoch": 0.9911881696895876,
"grad_norm": 1.0703125,
"learning_rate": 4.233155921957721e-06,
"loss": 1.2931,
"step": 6510
},
{
"epoch": 0.992710732162229,
"grad_norm": 1.0078125,
"learning_rate": 2.9198591980705847e-06,
"loss": 1.3047,
"step": 6520
},
{
"epoch": 0.9942332946348705,
"grad_norm": 1.015625,
"learning_rate": 1.8497230560998724e-06,
"loss": 1.3187,
"step": 6530
},
{
"epoch": 0.9957558571075119,
"grad_norm": 1.0703125,
"learning_rate": 1.0227735251400194e-06,
"loss": 1.2968,
"step": 6540
},
{
"epoch": 0.9972784195801534,
"grad_norm": 1.1015625,
"learning_rate": 4.3903071921480575e-07,
"loss": 1.3062,
"step": 6550
},
{
"epoch": 0.9988009820527949,
"grad_norm": 1.0390625,
"learning_rate": 9.850883679662914e-08,
"loss": 1.3219,
"step": 6560
}
],
"logging_steps": 10,
"max_steps": 6568,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.872004116416299e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}