modrill's picture
Add files using upload-large-folder tool
d263c9f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 200,
"global_step": 3094,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032323232323232323,
"grad_norm": 0.032284051179885864,
"learning_rate": 5.806451612903226e-06,
"loss": 0.6013355255126953,
"step": 10
},
{
"epoch": 0.006464646464646465,
"grad_norm": 0.01937970519065857,
"learning_rate": 1.2258064516129032e-05,
"loss": 0.5758121490478516,
"step": 20
},
{
"epoch": 0.009696969696969697,
"grad_norm": 0.014327733777463436,
"learning_rate": 1.870967741935484e-05,
"loss": 0.5787662506103516,
"step": 30
},
{
"epoch": 0.01292929292929293,
"grad_norm": 0.012254982255399227,
"learning_rate": 2.5161290322580645e-05,
"loss": 0.5327143669128418,
"step": 40
},
{
"epoch": 0.01616161616161616,
"grad_norm": 0.014536243863403797,
"learning_rate": 3.161290322580645e-05,
"loss": 0.5393171310424805,
"step": 50
},
{
"epoch": 0.019393939393939394,
"grad_norm": 0.011900736019015312,
"learning_rate": 3.8064516129032254e-05,
"loss": 0.5058945178985595,
"step": 60
},
{
"epoch": 0.022626262626262626,
"grad_norm": 0.01210293360054493,
"learning_rate": 4.451612903225807e-05,
"loss": 0.517607593536377,
"step": 70
},
{
"epoch": 0.02585858585858586,
"grad_norm": 0.017207792028784752,
"learning_rate": 5.096774193548387e-05,
"loss": 0.5269095420837402,
"step": 80
},
{
"epoch": 0.02909090909090909,
"grad_norm": 0.013736969791352749,
"learning_rate": 5.7419354838709685e-05,
"loss": 0.5040688514709473,
"step": 90
},
{
"epoch": 0.03232323232323232,
"grad_norm": 0.011664404533803463,
"learning_rate": 6.387096774193548e-05,
"loss": 0.5147775173187256,
"step": 100
},
{
"epoch": 0.035555555555555556,
"grad_norm": 0.014052552171051502,
"learning_rate": 7.03225806451613e-05,
"loss": 0.5230005741119385,
"step": 110
},
{
"epoch": 0.03878787878787879,
"grad_norm": 0.011644740588963032,
"learning_rate": 7.67741935483871e-05,
"loss": 0.5105222702026367,
"step": 120
},
{
"epoch": 0.04202020202020202,
"grad_norm": 0.011730308644473553,
"learning_rate": 8.32258064516129e-05,
"loss": 0.4989170074462891,
"step": 130
},
{
"epoch": 0.04525252525252525,
"grad_norm": 0.012009115889668465,
"learning_rate": 8.967741935483871e-05,
"loss": 0.5003124713897705,
"step": 140
},
{
"epoch": 0.048484848484848485,
"grad_norm": 0.012424134649336338,
"learning_rate": 9.612903225806452e-05,
"loss": 0.5077538013458252,
"step": 150
},
{
"epoch": 0.05171717171717172,
"grad_norm": 0.011461800895631313,
"learning_rate": 9.999954295400999e-05,
"loss": 0.5061477661132813,
"step": 160
},
{
"epoch": 0.05494949494949495,
"grad_norm": 0.011583201587200165,
"learning_rate": 9.999440128258112e-05,
"loss": 0.5013481140136719,
"step": 170
},
{
"epoch": 0.05818181818181818,
"grad_norm": 0.011583367362618446,
"learning_rate": 9.998354722168459e-05,
"loss": 0.48343815803527834,
"step": 180
},
{
"epoch": 0.061414141414141414,
"grad_norm": 0.011094105429947376,
"learning_rate": 9.996698201151175e-05,
"loss": 0.507840633392334,
"step": 190
},
{
"epoch": 0.06464646464646465,
"grad_norm": 0.011715452186763287,
"learning_rate": 9.994470754481315e-05,
"loss": 0.49884777069091796,
"step": 200
},
{
"epoch": 0.06787878787878789,
"grad_norm": 0.010746965184807777,
"learning_rate": 9.991672636668239e-05,
"loss": 0.5068713188171386,
"step": 210
},
{
"epoch": 0.07111111111111111,
"grad_norm": 0.012883410789072514,
"learning_rate": 9.988304167426519e-05,
"loss": 0.4920186519622803,
"step": 220
},
{
"epoch": 0.07434343434343435,
"grad_norm": 0.011187366209924221,
"learning_rate": 9.984365731639419e-05,
"loss": 0.5021648406982422,
"step": 230
},
{
"epoch": 0.07757575757575758,
"grad_norm": 0.010671757161617279,
"learning_rate": 9.979857779314906e-05,
"loss": 0.5038897514343261,
"step": 240
},
{
"epoch": 0.08080808080808081,
"grad_norm": 0.01044373121112585,
"learning_rate": 9.974780825534246e-05,
"loss": 0.4965065956115723,
"step": 250
},
{
"epoch": 0.08404040404040404,
"grad_norm": 0.010565251111984253,
"learning_rate": 9.969135450393141e-05,
"loss": 0.5145994186401367,
"step": 260
},
{
"epoch": 0.08727272727272728,
"grad_norm": 0.010024026967585087,
"learning_rate": 9.96292229893545e-05,
"loss": 0.505019235610962,
"step": 270
},
{
"epoch": 0.0905050505050505,
"grad_norm": 0.010430407710373402,
"learning_rate": 9.956142081079484e-05,
"loss": 0.5115808010101318,
"step": 280
},
{
"epoch": 0.09373737373737374,
"grad_norm": 0.010048807598650455,
"learning_rate": 9.948795571536891e-05,
"loss": 0.48525152206420896,
"step": 290
},
{
"epoch": 0.09696969696969697,
"grad_norm": 0.010108387097716331,
"learning_rate": 9.94088360972414e-05,
"loss": 0.5002010822296142,
"step": 300
},
{
"epoch": 0.10020202020202021,
"grad_norm": 0.010273640975356102,
"learning_rate": 9.932407099666608e-05,
"loss": 0.49662046432495116,
"step": 310
},
{
"epoch": 0.10343434343434343,
"grad_norm": 0.00995971355587244,
"learning_rate": 9.923367009895274e-05,
"loss": 0.48578290939331054,
"step": 320
},
{
"epoch": 0.10666666666666667,
"grad_norm": 0.010459775105118752,
"learning_rate": 9.913764373336079e-05,
"loss": 0.48862767219543457,
"step": 330
},
{
"epoch": 0.1098989898989899,
"grad_norm": 0.009838269092142582,
"learning_rate": 9.903600287191875e-05,
"loss": 0.5034208774566651,
"step": 340
},
{
"epoch": 0.11313131313131314,
"grad_norm": 0.00978950597345829,
"learning_rate": 9.892875912817079e-05,
"loss": 0.4870161533355713,
"step": 350
},
{
"epoch": 0.11636363636363636,
"grad_norm": 0.01030043140053749,
"learning_rate": 9.881592475584964e-05,
"loss": 0.5026071548461915,
"step": 360
},
{
"epoch": 0.1195959595959596,
"grad_norm": 0.010531975887715816,
"learning_rate": 9.869751264747656e-05,
"loss": 0.4883917808532715,
"step": 370
},
{
"epoch": 0.12282828282828283,
"grad_norm": 0.00986825954169035,
"learning_rate": 9.857353633288814e-05,
"loss": 0.4927018165588379,
"step": 380
},
{
"epoch": 0.12606060606060607,
"grad_norm": 0.009432349354028702,
"learning_rate": 9.844400997769043e-05,
"loss": 0.4908174514770508,
"step": 390
},
{
"epoch": 0.1292929292929293,
"grad_norm": 0.01057665515691042,
"learning_rate": 9.83089483816404e-05,
"loss": 0.5069625854492188,
"step": 400
},
{
"epoch": 0.13252525252525252,
"grad_norm": 0.010007908567786217,
"learning_rate": 9.816836697695482e-05,
"loss": 0.4988056182861328,
"step": 410
},
{
"epoch": 0.13575757575757577,
"grad_norm": 0.010027808137238026,
"learning_rate": 9.802228182654702e-05,
"loss": 0.5007385730743408,
"step": 420
},
{
"epoch": 0.138989898989899,
"grad_norm": 0.009953738190233707,
"learning_rate": 9.787070962219156e-05,
"loss": 0.5130783081054687,
"step": 430
},
{
"epoch": 0.14222222222222222,
"grad_norm": 0.010204462334513664,
"learning_rate": 9.771366768261696e-05,
"loss": 0.4947535514831543,
"step": 440
},
{
"epoch": 0.14545454545454545,
"grad_norm": 0.009612451307475567,
"learning_rate": 9.755117395152689e-05,
"loss": 0.4917112350463867,
"step": 450
},
{
"epoch": 0.1486868686868687,
"grad_norm": 0.00995573028922081,
"learning_rate": 9.73832469955499e-05,
"loss": 0.4882859230041504,
"step": 460
},
{
"epoch": 0.15191919191919193,
"grad_norm": 0.009584570303559303,
"learning_rate": 9.720990600211797e-05,
"loss": 0.500526762008667,
"step": 470
},
{
"epoch": 0.15515151515151515,
"grad_norm": 0.010585076175630093,
"learning_rate": 9.703117077727419e-05,
"loss": 0.496882963180542,
"step": 480
},
{
"epoch": 0.15838383838383838,
"grad_norm": 0.009859723038971424,
"learning_rate": 9.684706174340965e-05,
"loss": 0.48728485107421876,
"step": 490
},
{
"epoch": 0.16161616161616163,
"grad_norm": 0.010212041437625885,
"learning_rate": 9.665759993693e-05,
"loss": 0.49378366470336915,
"step": 500
},
{
"epoch": 0.16484848484848486,
"grad_norm": 0.009744231589138508,
"learning_rate": 9.646280700585185e-05,
"loss": 0.4663555145263672,
"step": 510
},
{
"epoch": 0.16808080808080808,
"grad_norm": 0.009789686650037766,
"learning_rate": 9.626270520732916e-05,
"loss": 0.4879913330078125,
"step": 520
},
{
"epoch": 0.1713131313131313,
"grad_norm": 0.009891769848763943,
"learning_rate": 9.605731740511022e-05,
"loss": 0.48809428215026857,
"step": 530
},
{
"epoch": 0.17454545454545456,
"grad_norm": 0.01002143882215023,
"learning_rate": 9.584666706692517e-05,
"loss": 0.4959606170654297,
"step": 540
},
{
"epoch": 0.17777777777777778,
"grad_norm": 0.009208496659994125,
"learning_rate": 9.56307782618046e-05,
"loss": 0.4878594398498535,
"step": 550
},
{
"epoch": 0.181010101010101,
"grad_norm": 0.009645081125199795,
"learning_rate": 9.540967565732937e-05,
"loss": 0.4769869804382324,
"step": 560
},
{
"epoch": 0.18424242424242424,
"grad_norm": 0.010497638955712318,
"learning_rate": 9.51833845168121e-05,
"loss": 0.4970097064971924,
"step": 570
},
{
"epoch": 0.1874747474747475,
"grad_norm": 0.010612033307552338,
"learning_rate": 9.495193069641057e-05,
"loss": 0.4945381164550781,
"step": 580
},
{
"epoch": 0.1907070707070707,
"grad_norm": 0.009433986619114876,
"learning_rate": 9.47153406421734e-05,
"loss": 0.4860210418701172,
"step": 590
},
{
"epoch": 0.19393939393939394,
"grad_norm": 0.009995914995670319,
"learning_rate": 9.447364138701823e-05,
"loss": 0.4904911994934082,
"step": 600
},
{
"epoch": 0.19717171717171716,
"grad_norm": 0.009814019314944744,
"learning_rate": 9.422686054764302e-05,
"loss": 0.4915336608886719,
"step": 610
},
{
"epoch": 0.20040404040404042,
"grad_norm": 0.010199982672929764,
"learning_rate": 9.397502632137055e-05,
"loss": 0.4805011749267578,
"step": 620
},
{
"epoch": 0.20363636363636364,
"grad_norm": 0.010121874511241913,
"learning_rate": 9.371816748292641e-05,
"loss": 0.47633209228515627,
"step": 630
},
{
"epoch": 0.20686868686868687,
"grad_norm": 0.009638865478336811,
"learning_rate": 9.345631338115141e-05,
"loss": 0.48993511199951173,
"step": 640
},
{
"epoch": 0.2101010101010101,
"grad_norm": 0.010041174478828907,
"learning_rate": 9.318949393564807e-05,
"loss": 0.4798978328704834,
"step": 650
},
{
"epoch": 0.21333333333333335,
"grad_norm": 0.010145517997443676,
"learning_rate": 9.291773963336193e-05,
"loss": 0.4696659564971924,
"step": 660
},
{
"epoch": 0.21656565656565657,
"grad_norm": 0.01043335534632206,
"learning_rate": 9.264108152509816e-05,
"loss": 0.4955574035644531,
"step": 670
},
{
"epoch": 0.2197979797979798,
"grad_norm": 0.010406569577753544,
"learning_rate": 9.235955122197368e-05,
"loss": 0.48659563064575195,
"step": 680
},
{
"epoch": 0.22303030303030302,
"grad_norm": 0.009870355017483234,
"learning_rate": 9.207318089180524e-05,
"loss": 0.4766091823577881,
"step": 690
},
{
"epoch": 0.22626262626262628,
"grad_norm": 0.009980005212128162,
"learning_rate": 9.178200325543384e-05,
"loss": 0.5022068023681641,
"step": 700
},
{
"epoch": 0.2294949494949495,
"grad_norm": 0.009765878319740295,
"learning_rate": 9.148605158298621e-05,
"loss": 0.48309478759765623,
"step": 710
},
{
"epoch": 0.23272727272727273,
"grad_norm": 0.009879437275230885,
"learning_rate": 9.118535969007314e-05,
"loss": 0.4793684959411621,
"step": 720
},
{
"epoch": 0.23595959595959595,
"grad_norm": 0.009907045401632786,
"learning_rate": 9.087996193392578e-05,
"loss": 0.49844112396240237,
"step": 730
},
{
"epoch": 0.2391919191919192,
"grad_norm": 0.009958872571587563,
"learning_rate": 9.056989320947e-05,
"loss": 0.48113102912902833,
"step": 740
},
{
"epoch": 0.24242424242424243,
"grad_norm": 0.010206855833530426,
"learning_rate": 9.025518894533921e-05,
"loss": 0.5048784255981446,
"step": 750
},
{
"epoch": 0.24565656565656566,
"grad_norm": 0.009286429733037949,
"learning_rate": 8.99358850998263e-05,
"loss": 0.4862624168395996,
"step": 760
},
{
"epoch": 0.24888888888888888,
"grad_norm": 0.009919015690684319,
"learning_rate": 8.9612018156775e-05,
"loss": 0.48739056587219237,
"step": 770
},
{
"epoch": 0.25212121212121213,
"grad_norm": 0.010435211472213268,
"learning_rate": 8.928362512141124e-05,
"loss": 0.48689990043640136,
"step": 780
},
{
"epoch": 0.25535353535353533,
"grad_norm": 0.009992080740630627,
"learning_rate": 8.895074351611488e-05,
"loss": 0.4848791599273682,
"step": 790
},
{
"epoch": 0.2585858585858586,
"grad_norm": 0.010449117980897427,
"learning_rate": 8.861341137613242e-05,
"loss": 0.49130568504333494,
"step": 800
},
{
"epoch": 0.26181818181818184,
"grad_norm": 0.010061345063149929,
"learning_rate": 8.827166724523105e-05,
"loss": 0.49056081771850585,
"step": 810
},
{
"epoch": 0.26505050505050504,
"grad_norm": 0.009908635169267654,
"learning_rate": 8.792555017129461e-05,
"loss": 0.4706866264343262,
"step": 820
},
{
"epoch": 0.2682828282828283,
"grad_norm": 0.010003669187426567,
"learning_rate": 8.757509970186196e-05,
"loss": 0.48719348907470705,
"step": 830
},
{
"epoch": 0.27151515151515154,
"grad_norm": 0.009513127617537975,
"learning_rate": 8.722035587960826e-05,
"loss": 0.48200483322143556,
"step": 840
},
{
"epoch": 0.27474747474747474,
"grad_norm": 0.009142451919615269,
"learning_rate": 8.686135923776969e-05,
"loss": 0.48613176345825193,
"step": 850
},
{
"epoch": 0.277979797979798,
"grad_norm": 0.00980432890355587,
"learning_rate": 8.649815079551205e-05,
"loss": 0.4976047515869141,
"step": 860
},
{
"epoch": 0.2812121212121212,
"grad_norm": 0.010442109778523445,
"learning_rate": 8.613077205324389e-05,
"loss": 0.48451838493347166,
"step": 870
},
{
"epoch": 0.28444444444444444,
"grad_norm": 0.009309548884630203,
"learning_rate": 8.575926498787476e-05,
"loss": 0.5115366458892823,
"step": 880
},
{
"epoch": 0.2876767676767677,
"grad_norm": 0.009093924425542355,
"learning_rate": 8.538367204801872e-05,
"loss": 0.48187432289123533,
"step": 890
},
{
"epoch": 0.2909090909090909,
"grad_norm": 0.009757987223565578,
"learning_rate": 8.500403614914432e-05,
"loss": 0.4882505416870117,
"step": 900
},
{
"epoch": 0.29414141414141415,
"grad_norm": 0.010261048562824726,
"learning_rate": 8.462040066867089e-05,
"loss": 0.49068965911865237,
"step": 910
},
{
"epoch": 0.2973737373737374,
"grad_norm": 0.009317740797996521,
"learning_rate": 8.423280944101233e-05,
"loss": 0.4966132164001465,
"step": 920
},
{
"epoch": 0.3006060606060606,
"grad_norm": 0.009787693619728088,
"learning_rate": 8.384130675256852e-05,
"loss": 0.4754981994628906,
"step": 930
},
{
"epoch": 0.30383838383838385,
"grad_norm": 0.010005015879869461,
"learning_rate": 8.34459373366651e-05,
"loss": 0.4735103130340576,
"step": 940
},
{
"epoch": 0.30707070707070705,
"grad_norm": 0.009837017394602299,
"learning_rate": 8.304674636844231e-05,
"loss": 0.4809592247009277,
"step": 950
},
{
"epoch": 0.3103030303030303,
"grad_norm": 0.009590484201908112,
"learning_rate": 8.264377945969312e-05,
"loss": 0.48206100463867185,
"step": 960
},
{
"epoch": 0.31353535353535356,
"grad_norm": 0.009699508547782898,
"learning_rate": 8.223708265365174e-05,
"loss": 0.48718748092651365,
"step": 970
},
{
"epoch": 0.31676767676767675,
"grad_norm": 0.010278215631842613,
"learning_rate": 8.182670241973253e-05,
"loss": 0.4952278137207031,
"step": 980
},
{
"epoch": 0.32,
"grad_norm": 0.010104077868163586,
"learning_rate": 8.141268564822053e-05,
"loss": 0.47769975662231445,
"step": 990
},
{
"epoch": 0.32323232323232326,
"grad_norm": 0.009301737882196903,
"learning_rate": 8.099507964491369e-05,
"loss": 0.4862718105316162,
"step": 1000
},
{
"epoch": 0.32646464646464646,
"grad_norm": 0.009307135827839375,
"learning_rate": 8.057393212571767e-05,
"loss": 0.49833106994628906,
"step": 1010
},
{
"epoch": 0.3296969696969697,
"grad_norm": 0.009420682676136494,
"learning_rate": 8.014929121119378e-05,
"loss": 0.48829941749572753,
"step": 1020
},
{
"epoch": 0.3329292929292929,
"grad_norm": 0.010058379732072353,
"learning_rate": 7.972120542106077e-05,
"loss": 0.46506147384643554,
"step": 1030
},
{
"epoch": 0.33616161616161616,
"grad_norm": 0.009461579844355583,
"learning_rate": 7.92897236686508e-05,
"loss": 0.47866268157958985,
"step": 1040
},
{
"epoch": 0.3393939393939394,
"grad_norm": 0.010341835208237171,
"learning_rate": 7.885489525532075e-05,
"loss": 0.4966601371765137,
"step": 1050
},
{
"epoch": 0.3426262626262626,
"grad_norm": 0.01042103860527277,
"learning_rate": 7.84167698648189e-05,
"loss": 0.49251084327697753,
"step": 1060
},
{
"epoch": 0.34585858585858587,
"grad_norm": 0.010442749597132206,
"learning_rate": 7.797539755760805e-05,
"loss": 0.4816086769104004,
"step": 1070
},
{
"epoch": 0.3490909090909091,
"grad_norm": 0.010256553068757057,
"learning_rate": 7.753082876514562e-05,
"loss": 0.47102947235107423,
"step": 1080
},
{
"epoch": 0.3523232323232323,
"grad_norm": 0.009399017319083214,
"learning_rate": 7.708311428412129e-05,
"loss": 0.4810478210449219,
"step": 1090
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.009499317966401577,
"learning_rate": 7.663230527065293e-05,
"loss": 0.47051143646240234,
"step": 1100
},
{
"epoch": 0.35878787878787877,
"grad_norm": 0.009759469889104366,
"learning_rate": 7.617845323444156e-05,
"loss": 0.4780744552612305,
"step": 1110
},
{
"epoch": 0.362020202020202,
"grad_norm": 0.00987281370908022,
"learning_rate": 7.572161003288565e-05,
"loss": 0.4804032325744629,
"step": 1120
},
{
"epoch": 0.3652525252525253,
"grad_norm": 0.00963522493839264,
"learning_rate": 7.526182786515609e-05,
"loss": 0.48206367492675783,
"step": 1130
},
{
"epoch": 0.36848484848484847,
"grad_norm": 0.009587026201188564,
"learning_rate": 7.479915926623165e-05,
"loss": 0.49240264892578123,
"step": 1140
},
{
"epoch": 0.3717171717171717,
"grad_norm": 0.009810036048293114,
"learning_rate": 7.433365710089646e-05,
"loss": 0.48599681854248045,
"step": 1150
},
{
"epoch": 0.374949494949495,
"grad_norm": 0.00978278461843729,
"learning_rate": 7.386537455769963e-05,
"loss": 0.4688436031341553,
"step": 1160
},
{
"epoch": 0.3781818181818182,
"grad_norm": 0.00925996620208025,
"learning_rate": 7.339436514287783e-05,
"loss": 0.47986507415771484,
"step": 1170
},
{
"epoch": 0.3814141414141414,
"grad_norm": 0.010006657801568508,
"learning_rate": 7.292068267424165e-05,
"loss": 0.4771553993225098,
"step": 1180
},
{
"epoch": 0.3846464646464646,
"grad_norm": 0.010299976915121078,
"learning_rate": 7.244438127502647e-05,
"loss": 0.4854436874389648,
"step": 1190
},
{
"epoch": 0.3878787878787879,
"grad_norm": 0.009807374328374863,
"learning_rate": 7.196551536770807e-05,
"loss": 0.4763533592224121,
"step": 1200
},
{
"epoch": 0.39111111111111113,
"grad_norm": 0.010062369517982006,
"learning_rate": 7.148413966778451e-05,
"loss": 0.47457141876220704,
"step": 1210
},
{
"epoch": 0.39434343434343433,
"grad_norm": 0.009904388338327408,
"learning_rate": 7.100030917752423e-05,
"loss": 0.4732469081878662,
"step": 1220
},
{
"epoch": 0.3975757575757576,
"grad_norm": 0.009578046388924122,
"learning_rate": 7.051407917968138e-05,
"loss": 0.47551660537719725,
"step": 1230
},
{
"epoch": 0.40080808080808084,
"grad_norm": 0.010024272836744785,
"learning_rate": 7.002550523117926e-05,
"loss": 0.4835049629211426,
"step": 1240
},
{
"epoch": 0.40404040404040403,
"grad_norm": 0.010056886821985245,
"learning_rate": 6.953464315676241e-05,
"loss": 0.4909071922302246,
"step": 1250
},
{
"epoch": 0.4072727272727273,
"grad_norm": 0.009707778692245483,
"learning_rate": 6.904154904261792e-05,
"loss": 0.4842695236206055,
"step": 1260
},
{
"epoch": 0.4105050505050505,
"grad_norm": 0.00976107269525528,
"learning_rate": 6.8546279229967e-05,
"loss": 0.48967390060424804,
"step": 1270
},
{
"epoch": 0.41373737373737374,
"grad_norm": 0.010392800904810429,
"learning_rate": 6.804889030862753e-05,
"loss": 0.48658218383789065,
"step": 1280
},
{
"epoch": 0.416969696969697,
"grad_norm": 0.009267269633710384,
"learning_rate": 6.754943911054793e-05,
"loss": 0.48893008232116697,
"step": 1290
},
{
"epoch": 0.4202020202020202,
"grad_norm": 0.009753459133207798,
"learning_rate": 6.704798270331358e-05,
"loss": 0.49384117126464844,
"step": 1300
},
{
"epoch": 0.42343434343434344,
"grad_norm": 0.011121846735477448,
"learning_rate": 6.654457838362621e-05,
"loss": 0.47458324432373045,
"step": 1310
},
{
"epoch": 0.4266666666666667,
"grad_norm": 0.009406324476003647,
"learning_rate": 6.603928367075718e-05,
"loss": 0.4703402519226074,
"step": 1320
},
{
"epoch": 0.4298989898989899,
"grad_norm": 0.010009719990193844,
"learning_rate": 6.553215629997529e-05,
"loss": 0.4851715087890625,
"step": 1330
},
{
"epoch": 0.43313131313131314,
"grad_norm": 0.009798573330044746,
"learning_rate": 6.502325421594988e-05,
"loss": 0.4779622554779053,
"step": 1340
},
{
"epoch": 0.43636363636363634,
"grad_norm": 0.0098969591781497,
"learning_rate": 6.451263556613007e-05,
"loss": 0.4708548545837402,
"step": 1350
},
{
"epoch": 0.4395959595959596,
"grad_norm": 0.009853474795818329,
"learning_rate": 6.40003586941008e-05,
"loss": 0.4721970558166504,
"step": 1360
},
{
"epoch": 0.44282828282828285,
"grad_norm": 0.009663710370659828,
"learning_rate": 6.348648213291642e-05,
"loss": 0.4798272132873535,
"step": 1370
},
{
"epoch": 0.44606060606060605,
"grad_norm": 0.009419051930308342,
"learning_rate": 6.297106459841272e-05,
"loss": 0.471483039855957,
"step": 1380
},
{
"epoch": 0.4492929292929293,
"grad_norm": 0.009978282265365124,
"learning_rate": 6.245416498249801e-05,
"loss": 0.4772329330444336,
"step": 1390
},
{
"epoch": 0.45252525252525255,
"grad_norm": 0.009491856209933758,
"learning_rate": 6.193584234642403e-05,
"loss": 0.48812179565429686,
"step": 1400
},
{
"epoch": 0.45575757575757575,
"grad_norm": 0.010496517643332481,
"learning_rate": 6.141615591403771e-05,
"loss": 0.48897976875305177,
"step": 1410
},
{
"epoch": 0.458989898989899,
"grad_norm": 0.01015474647283554,
"learning_rate": 6.0895165065014106e-05,
"loss": 0.4850447654724121,
"step": 1420
},
{
"epoch": 0.4622222222222222,
"grad_norm": 0.009763741865754128,
"learning_rate": 6.037292932807167e-05,
"loss": 0.4857158660888672,
"step": 1430
},
{
"epoch": 0.46545454545454545,
"grad_norm": 0.009874231182038784,
"learning_rate": 5.984950837417048e-05,
"loss": 0.4844364166259766,
"step": 1440
},
{
"epoch": 0.4686868686868687,
"grad_norm": 0.010104037821292877,
"learning_rate": 5.932496200969422e-05,
"loss": 0.47542514801025393,
"step": 1450
},
{
"epoch": 0.4719191919191919,
"grad_norm": 0.009445435367524624,
"learning_rate": 5.879935016961661e-05,
"loss": 0.48579182624816897,
"step": 1460
},
{
"epoch": 0.47515151515151516,
"grad_norm": 0.010507128201425076,
"learning_rate": 5.827273291065326e-05,
"loss": 0.48273677825927735,
"step": 1470
},
{
"epoch": 0.4783838383838384,
"grad_norm": 0.009411566890776157,
"learning_rate": 5.7745170404399484e-05,
"loss": 0.46429009437561036,
"step": 1480
},
{
"epoch": 0.4816161616161616,
"grad_norm": 0.010377817787230015,
"learning_rate": 5.721672293045518e-05,
"loss": 0.49172677993774416,
"step": 1490
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.009920118376612663,
"learning_rate": 5.668745086953712e-05,
"loss": 0.4934688568115234,
"step": 1500
},
{
"epoch": 0.48808080808080806,
"grad_norm": 0.012272641994059086,
"learning_rate": 5.615741469657985e-05,
"loss": 0.4826413631439209,
"step": 1510
},
{
"epoch": 0.4913131313131313,
"grad_norm": 0.00981989037245512,
"learning_rate": 5.562667497382582e-05,
"loss": 0.48468503952026365,
"step": 1520
},
{
"epoch": 0.49454545454545457,
"grad_norm": 0.010148138739168644,
"learning_rate": 5.509529234390553e-05,
"loss": 0.47773942947387693,
"step": 1530
},
{
"epoch": 0.49777777777777776,
"grad_norm": 0.010528397746384144,
"learning_rate": 5.456332752290837e-05,
"loss": 0.47507562637329104,
"step": 1540
},
{
"epoch": 0.501010101010101,
"grad_norm": 0.01076479908078909,
"learning_rate": 5.4030841293445244e-05,
"loss": 0.486495304107666,
"step": 1550
},
{
"epoch": 0.5042424242424243,
"grad_norm": 0.009976644068956375,
"learning_rate": 5.349789449770351e-05,
"loss": 0.48320484161376953,
"step": 1560
},
{
"epoch": 0.5074747474747475,
"grad_norm": 0.010504718869924545,
"learning_rate": 5.2964548030495065e-05,
"loss": 0.48847188949584963,
"step": 1570
},
{
"epoch": 0.5107070707070707,
"grad_norm": 0.010325642302632332,
"learning_rate": 5.243086283229852e-05,
"loss": 0.47591514587402345,
"step": 1580
},
{
"epoch": 0.5139393939393939,
"grad_norm": 0.010260741226375103,
"learning_rate": 5.18968998822961e-05,
"loss": 0.48136124610900877,
"step": 1590
},
{
"epoch": 0.5171717171717172,
"grad_norm": 0.010797196067869663,
"learning_rate": 5.1362720191406065e-05,
"loss": 0.48076438903808594,
"step": 1600
},
{
"epoch": 0.5204040404040404,
"grad_norm": 0.010010522790253162,
"learning_rate": 5.082838479531169e-05,
"loss": 0.5004307270050049,
"step": 1610
},
{
"epoch": 0.5236363636363637,
"grad_norm": 0.010120037943124771,
"learning_rate": 5.029395474748714e-05,
"loss": 0.4812767028808594,
"step": 1620
},
{
"epoch": 0.5268686868686868,
"grad_norm": 0.009431752376258373,
"learning_rate": 4.975949111222158e-05,
"loss": 0.4887521743774414,
"step": 1630
},
{
"epoch": 0.5301010101010101,
"grad_norm": 0.010080527514219284,
"learning_rate": 4.9225054957641916e-05,
"loss": 0.48248910903930664,
"step": 1640
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.009522376582026482,
"learning_rate": 4.8690707348735035e-05,
"loss": 0.4855649948120117,
"step": 1650
},
{
"epoch": 0.5365656565656566,
"grad_norm": 0.009686745703220367,
"learning_rate": 4.8156509340370605e-05,
"loss": 0.47187280654907227,
"step": 1660
},
{
"epoch": 0.5397979797979798,
"grad_norm": 0.011875285767018795,
"learning_rate": 4.762252197032482e-05,
"loss": 0.46579513549804685,
"step": 1670
},
{
"epoch": 0.5430303030303031,
"grad_norm": 0.009932303801178932,
"learning_rate": 4.7088806252306224e-05,
"loss": 0.47946829795837403,
"step": 1680
},
{
"epoch": 0.5462626262626262,
"grad_norm": 0.017099183052778244,
"learning_rate": 4.655542316898423e-05,
"loss": 0.4593667030334473,
"step": 1690
},
{
"epoch": 0.5494949494949495,
"grad_norm": 0.010512005537748337,
"learning_rate": 4.6022433665021246e-05,
"loss": 0.48582897186279295,
"step": 1700
},
{
"epoch": 0.5527272727272727,
"grad_norm": 0.00989206787198782,
"learning_rate": 4.548989864010902e-05,
"loss": 0.4849580764770508,
"step": 1710
},
{
"epoch": 0.555959595959596,
"grad_norm": 0.010284029878675938,
"learning_rate": 4.495787894201031e-05,
"loss": 0.49089298248291013,
"step": 1720
},
{
"epoch": 0.5591919191919192,
"grad_norm": 0.00992760993540287,
"learning_rate": 4.442643535960631e-05,
"loss": 0.48010549545288084,
"step": 1730
},
{
"epoch": 0.5624242424242424,
"grad_norm": 0.00978371873497963,
"learning_rate": 4.3895628615950864e-05,
"loss": 0.47594566345214845,
"step": 1740
},
{
"epoch": 0.5656565656565656,
"grad_norm": 0.00981004349887371,
"learning_rate": 4.3365519361332345e-05,
"loss": 0.48056421279907224,
"step": 1750
},
{
"epoch": 0.5688888888888889,
"grad_norm": 0.010216868482530117,
"learning_rate": 4.283616816634353e-05,
"loss": 0.47501134872436523,
"step": 1760
},
{
"epoch": 0.5721212121212121,
"grad_norm": 0.010267022997140884,
"learning_rate": 4.230763551496089e-05,
"loss": 0.478366756439209,
"step": 1770
},
{
"epoch": 0.5753535353535354,
"grad_norm": 0.009910739958286285,
"learning_rate": 4.1779981797633645e-05,
"loss": 0.478058385848999,
"step": 1780
},
{
"epoch": 0.5785858585858585,
"grad_norm": 0.011339185759425163,
"learning_rate": 4.1253267304383455e-05,
"loss": 0.48466057777404786,
"step": 1790
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.009522072039544582,
"learning_rate": 4.072755221791572e-05,
"loss": 0.4769908905029297,
"step": 1800
},
{
"epoch": 0.585050505050505,
"grad_norm": 0.009808518923819065,
"learning_rate": 4.020289660674306e-05,
"loss": 0.47162666320800783,
"step": 1810
},
{
"epoch": 0.5882828282828283,
"grad_norm": 0.009962448850274086,
"learning_rate": 3.967936041832173e-05,
"loss": 0.46282405853271485,
"step": 1820
},
{
"epoch": 0.5915151515151515,
"grad_norm": 0.009498678147792816,
"learning_rate": 3.9157003472202246e-05,
"loss": 0.4794480323791504,
"step": 1830
},
{
"epoch": 0.5947474747474748,
"grad_norm": 0.009407177567481995,
"learning_rate": 3.863588545319407e-05,
"loss": 0.4729412078857422,
"step": 1840
},
{
"epoch": 0.597979797979798,
"grad_norm": 0.009899895638227463,
"learning_rate": 3.8116065904546196e-05,
"loss": 0.47380704879760743,
"step": 1850
},
{
"epoch": 0.6012121212121212,
"grad_norm": 0.009560837410390377,
"learning_rate": 3.759760422114362e-05,
"loss": 0.47751388549804685,
"step": 1860
},
{
"epoch": 0.6044444444444445,
"grad_norm": 0.010970192961394787,
"learning_rate": 3.708055964272088e-05,
"loss": 0.47801823616027833,
"step": 1870
},
{
"epoch": 0.6076767676767677,
"grad_norm": 0.00964390765875578,
"learning_rate": 3.6564991247093234e-05,
"loss": 0.4800426483154297,
"step": 1880
},
{
"epoch": 0.610909090909091,
"grad_norm": 0.009764602407813072,
"learning_rate": 3.6050957943406465e-05,
"loss": 0.4812753677368164,
"step": 1890
},
{
"epoch": 0.6141414141414141,
"grad_norm": 0.009959314949810505,
"learning_rate": 3.553851846540584e-05,
"loss": 0.47804956436157225,
"step": 1900
},
{
"epoch": 0.6173737373737374,
"grad_norm": 0.010666392743587494,
"learning_rate": 3.50277313647252e-05,
"loss": 0.4772273063659668,
"step": 1910
},
{
"epoch": 0.6206060606060606,
"grad_norm": 0.009173358790576458,
"learning_rate": 3.451865500419676e-05,
"loss": 0.4892144203186035,
"step": 1920
},
{
"epoch": 0.6238383838383839,
"grad_norm": 0.010000358335673809,
"learning_rate": 3.401134755118256e-05,
"loss": 0.48959059715270997,
"step": 1930
},
{
"epoch": 0.6270707070707071,
"grad_norm": 0.010190536268055439,
"learning_rate": 3.350586697092826e-05,
"loss": 0.46478729248046874,
"step": 1940
},
{
"epoch": 0.6303030303030303,
"grad_norm": 0.010812178254127502,
"learning_rate": 3.300227101993998e-05,
"loss": 0.5046152114868164,
"step": 1950
},
{
"epoch": 0.6335353535353535,
"grad_norm": 0.01017849799245596,
"learning_rate": 3.2500617239384947e-05,
"loss": 0.4698008060455322,
"step": 1960
},
{
"epoch": 0.6367676767676768,
"grad_norm": 0.011103908531367779,
"learning_rate": 3.200096294851691e-05,
"loss": 0.4716217041015625,
"step": 1970
},
{
"epoch": 0.64,
"grad_norm": 0.009982883930206299,
"learning_rate": 3.150336523812674e-05,
"loss": 0.47135162353515625,
"step": 1980
},
{
"epoch": 0.6432323232323233,
"grad_norm": 0.01043197326362133,
"learning_rate": 3.100788096401925e-05,
"loss": 0.4799614906311035,
"step": 1990
},
{
"epoch": 0.6464646464646465,
"grad_norm": 0.010667623952031136,
"learning_rate": 3.051456674051677e-05,
"loss": 0.4682164192199707,
"step": 2000
},
{
"epoch": 0.6496969696969697,
"grad_norm": 0.010620036162436008,
"learning_rate": 3.0023478933990347e-05,
"loss": 0.4750208854675293,
"step": 2010
},
{
"epoch": 0.6529292929292929,
"grad_norm": 0.010374991223216057,
"learning_rate": 2.9534673656419377e-05,
"loss": 0.48034076690673827,
"step": 2020
},
{
"epoch": 0.6561616161616162,
"grad_norm": 0.011932817287743092,
"learning_rate": 2.9048206758980136e-05,
"loss": 0.46314101219177245,
"step": 2030
},
{
"epoch": 0.6593939393939394,
"grad_norm": 0.01039852574467659,
"learning_rate": 2.856413382566425e-05,
"loss": 0.4783937454223633,
"step": 2040
},
{
"epoch": 0.6626262626262627,
"grad_norm": 0.010200604796409607,
"learning_rate": 2.8082510166927583e-05,
"loss": 0.4792025089263916,
"step": 2050
},
{
"epoch": 0.6658585858585858,
"grad_norm": 0.009561127051711082,
"learning_rate": 2.760339081337041e-05,
"loss": 0.4722686767578125,
"step": 2060
},
{
"epoch": 0.6690909090909091,
"grad_norm": 0.00924921128898859,
"learning_rate": 2.7126830509449773e-05,
"loss": 0.4884464263916016,
"step": 2070
},
{
"epoch": 0.6723232323232323,
"grad_norm": 0.00991444569081068,
"learning_rate": 2.6652883707224075e-05,
"loss": 0.4804549217224121,
"step": 2080
},
{
"epoch": 0.6755555555555556,
"grad_norm": 0.009902069345116615,
"learning_rate": 2.618160456013153e-05,
"loss": 0.4781071186065674,
"step": 2090
},
{
"epoch": 0.6787878787878788,
"grad_norm": 0.010504493489861488,
"learning_rate": 2.571304691680255e-05,
"loss": 0.48059587478637694,
"step": 2100
},
{
"epoch": 0.682020202020202,
"grad_norm": 0.009641965851187706,
"learning_rate": 2.5247264314906917e-05,
"loss": 0.4706240177154541,
"step": 2110
},
{
"epoch": 0.6852525252525252,
"grad_norm": 0.010276644490659237,
"learning_rate": 2.4784309975036513e-05,
"loss": 0.4885101318359375,
"step": 2120
},
{
"epoch": 0.6884848484848485,
"grad_norm": 0.010960405692458153,
"learning_rate": 2.4324236794624456e-05,
"loss": 0.4882383346557617,
"step": 2130
},
{
"epoch": 0.6917171717171717,
"grad_norm": 0.010282796807587147,
"learning_rate": 2.386709734190079e-05,
"loss": 0.4768857479095459,
"step": 2140
},
{
"epoch": 0.694949494949495,
"grad_norm": 0.010104385204613209,
"learning_rate": 2.34129438498862e-05,
"loss": 0.4802837371826172,
"step": 2150
},
{
"epoch": 0.6981818181818182,
"grad_norm": 0.010027035139501095,
"learning_rate": 2.296182821042374e-05,
"loss": 0.4874839782714844,
"step": 2160
},
{
"epoch": 0.7014141414141414,
"grad_norm": 0.010018851608037949,
"learning_rate": 2.2513801968249644e-05,
"loss": 0.4758561134338379,
"step": 2170
},
{
"epoch": 0.7046464646464646,
"grad_norm": 0.009862969629466534,
"learning_rate": 2.2068916315103783e-05,
"loss": 0.4686488151550293,
"step": 2180
},
{
"epoch": 0.7078787878787879,
"grad_norm": 0.00952910166233778,
"learning_rate": 2.162722208388057e-05,
"loss": 0.4782154083251953,
"step": 2190
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.010213647037744522,
"learning_rate": 2.1188769742820625e-05,
"loss": 0.4773773193359375,
"step": 2200
},
{
"epoch": 0.7143434343434344,
"grad_norm": 0.010498392395675182,
"learning_rate": 2.075360938974429e-05,
"loss": 0.4733907222747803,
"step": 2210
},
{
"epoch": 0.7175757575757575,
"grad_norm": 0.009394815191626549,
"learning_rate": 2.03217907463275e-05,
"loss": 0.47564210891723635,
"step": 2220
},
{
"epoch": 0.7208080808080808,
"grad_norm": 0.010383618995547295,
"learning_rate": 1.989336315242048e-05,
"loss": 0.4784512519836426,
"step": 2230
},
{
"epoch": 0.724040404040404,
"grad_norm": 0.01043599285185337,
"learning_rate": 1.9468375560410117e-05,
"loss": 0.4808964729309082,
"step": 2240
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.009953988716006279,
"learning_rate": 1.90468765296267e-05,
"loss": 0.49031834602355956,
"step": 2250
},
{
"epoch": 0.7305050505050505,
"grad_norm": 0.010356656275689602,
"learning_rate": 1.8628914220795494e-05,
"loss": 0.48055601119995117,
"step": 2260
},
{
"epoch": 0.7337373737373737,
"grad_norm": 0.009841440245509148,
"learning_rate": 1.8214536390533822e-05,
"loss": 0.47170586585998536,
"step": 2270
},
{
"epoch": 0.7369696969696969,
"grad_norm": 0.010102550499141216,
"learning_rate": 1.7803790385894387e-05,
"loss": 0.4660043716430664,
"step": 2280
},
{
"epoch": 0.7402020202020202,
"grad_norm": 0.010421738028526306,
"learning_rate": 1.7396723138955428e-05,
"loss": 0.4599196434020996,
"step": 2290
},
{
"epoch": 0.7434343434343434,
"grad_norm": 0.009896540082991123,
"learning_rate": 1.699338116145811e-05,
"loss": 0.46491589546203616,
"step": 2300
},
{
"epoch": 0.7466666666666667,
"grad_norm": 0.010468855500221252,
"learning_rate": 1.6593810539492195e-05,
"loss": 0.47761125564575196,
"step": 2310
},
{
"epoch": 0.74989898989899,
"grad_norm": 0.010416937991976738,
"learning_rate": 1.619805692823016e-05,
"loss": 0.49956817626953126,
"step": 2320
},
{
"epoch": 0.7531313131313131,
"grad_norm": 0.010007611475884914,
"learning_rate": 1.580616554671057e-05,
"loss": 0.4716916084289551,
"step": 2330
},
{
"epoch": 0.7563636363636363,
"grad_norm": 0.010231648571789265,
"learning_rate": 1.5418181172671382e-05,
"loss": 0.4909040451049805,
"step": 2340
},
{
"epoch": 0.7595959595959596,
"grad_norm": 0.010324012488126755,
"learning_rate": 1.5034148137433623e-05,
"loss": 0.47939190864562986,
"step": 2350
},
{
"epoch": 0.7628282828282829,
"grad_norm": 0.011009547859430313,
"learning_rate": 1.4654110320836017e-05,
"loss": 0.48291826248168945,
"step": 2360
},
{
"epoch": 0.7660606060606061,
"grad_norm": 0.010392943397164345,
"learning_rate": 1.4278111146221263e-05,
"loss": 0.49179973602294924,
"step": 2370
},
{
"epoch": 0.7692929292929293,
"grad_norm": 0.010355197824537754,
"learning_rate": 1.3906193575474508e-05,
"loss": 0.4716278076171875,
"step": 2380
},
{
"epoch": 0.7725252525252525,
"grad_norm": 0.009932084940373898,
"learning_rate": 1.3538400104114446e-05,
"loss": 0.47190055847167967,
"step": 2390
},
{
"epoch": 0.7757575757575758,
"grad_norm": 0.010710643604397774,
"learning_rate": 1.3174772756437742e-05,
"loss": 0.4703176498413086,
"step": 2400
},
{
"epoch": 0.778989898989899,
"grad_norm": 0.010614068247377872,
"learning_rate": 1.2815353080717379e-05,
"loss": 0.47541141510009766,
"step": 2410
},
{
"epoch": 0.7822222222222223,
"grad_norm": 0.011346950195729733,
"learning_rate": 1.246018214445525e-05,
"loss": 0.4860015869140625,
"step": 2420
},
{
"epoch": 0.7854545454545454,
"grad_norm": 0.009824907407164574,
"learning_rate": 1.210930052968981e-05,
"loss": 0.49062480926513674,
"step": 2430
},
{
"epoch": 0.7886868686868687,
"grad_norm": 0.010624224320054054,
"learning_rate": 1.1762748328359152e-05,
"loss": 0.4694485664367676,
"step": 2440
},
{
"epoch": 0.7919191919191919,
"grad_norm": 0.01044746395200491,
"learning_rate": 1.1420565137720045e-05,
"loss": 0.47275629043579104,
"step": 2450
},
{
"epoch": 0.7951515151515152,
"grad_norm": 0.010226710699498653,
"learning_rate": 1.1082790055823533e-05,
"loss": 0.4670067310333252,
"step": 2460
},
{
"epoch": 0.7983838383838384,
"grad_norm": 0.010427460074424744,
"learning_rate": 1.0749461677047624e-05,
"loss": 0.4700475692749023,
"step": 2470
},
{
"epoch": 0.8016161616161617,
"grad_norm": 0.010494428686797619,
"learning_rate": 1.0420618087687418e-05,
"loss": 0.46589016914367676,
"step": 2480
},
{
"epoch": 0.8048484848484848,
"grad_norm": 0.010100893676280975,
"learning_rate": 1.0096296861603321e-05,
"loss": 0.4704419136047363,
"step": 2490
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.010682797059416771,
"learning_rate": 9.776535055927931e-06,
"loss": 0.47326183319091797,
"step": 2500
},
{
"epoch": 0.8113131313131313,
"grad_norm": 0.00972407590597868,
"learning_rate": 9.461369206831772e-06,
"loss": 0.48441619873046876,
"step": 2510
},
{
"epoch": 0.8145454545454546,
"grad_norm": 0.010327558033168316,
"learning_rate": 9.150835325348678e-06,
"loss": 0.45856351852416993,
"step": 2520
},
{
"epoch": 0.8177777777777778,
"grad_norm": 0.010118944570422173,
"learning_rate": 8.844968893261197e-06,
"loss": 0.48114948272705077,
"step": 2530
},
{
"epoch": 0.821010101010101,
"grad_norm": 0.010876161977648735,
"learning_rate": 8.543804859046345e-06,
"loss": 0.4930680274963379,
"step": 2540
},
{
"epoch": 0.8242424242424242,
"grad_norm": 0.010722608305513859,
"learning_rate": 8.247377633882463e-06,
"loss": 0.46875743865966796,
"step": 2550
},
{
"epoch": 0.8274747474747475,
"grad_norm": 0.01088438369333744,
"learning_rate": 7.95572108771726e-06,
"loss": 0.4768000602722168,
"step": 2560
},
{
"epoch": 0.8307070707070707,
"grad_norm": 0.01026720181107521,
"learning_rate": 7.66886854539795e-06,
"loss": 0.479974365234375,
"step": 2570
},
{
"epoch": 0.833939393939394,
"grad_norm": 0.009710075333714485,
"learning_rate": 7.386852782863407e-06,
"loss": 0.4684715270996094,
"step": 2580
},
{
"epoch": 0.8371717171717171,
"grad_norm": 0.009966167621314526,
"learning_rate": 7.109706023399232e-06,
"loss": 0.47191972732543946,
"step": 2590
},
{
"epoch": 0.8404040404040404,
"grad_norm": 0.010125650092959404,
"learning_rate": 6.837459933955936e-06,
"loss": 0.4766803741455078,
"step": 2600
},
{
"epoch": 0.8436363636363636,
"grad_norm": 0.009984518401324749,
"learning_rate": 6.5701456215305656e-06,
"loss": 0.4592133045196533,
"step": 2610
},
{
"epoch": 0.8468686868686869,
"grad_norm": 0.010836348868906498,
"learning_rate": 6.307793629612452e-06,
"loss": 0.4658504009246826,
"step": 2620
},
{
"epoch": 0.8501010101010101,
"grad_norm": 0.010431919246912003,
"learning_rate": 6.050433934693339e-06,
"loss": 0.4936856269836426,
"step": 2630
},
{
"epoch": 0.8533333333333334,
"grad_norm": 0.01049034669995308,
"learning_rate": 5.798095942842141e-06,
"loss": 0.4765117645263672,
"step": 2640
},
{
"epoch": 0.8565656565656565,
"grad_norm": 0.009596607647836208,
"learning_rate": 5.550808486345072e-06,
"loss": 0.47063379287719725,
"step": 2650
},
{
"epoch": 0.8597979797979798,
"grad_norm": 0.01032214518636465,
"learning_rate": 5.308599820411247e-06,
"loss": 0.4614537715911865,
"step": 2660
},
{
"epoch": 0.863030303030303,
"grad_norm": 0.009960100054740906,
"learning_rate": 5.071497619944171e-06,
"loss": 0.48136143684387206,
"step": 2670
},
{
"epoch": 0.8662626262626263,
"grad_norm": 0.010326167568564415,
"learning_rate": 4.839528976379648e-06,
"loss": 0.47299985885620116,
"step": 2680
},
{
"epoch": 0.8694949494949495,
"grad_norm": 0.010867294855415821,
"learning_rate": 4.612720394590286e-06,
"loss": 0.4829209327697754,
"step": 2690
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.01050047017633915,
"learning_rate": 4.391097789856985e-06,
"loss": 0.4762550354003906,
"step": 2700
},
{
"epoch": 0.8759595959595959,
"grad_norm": 0.010378929786384106,
"learning_rate": 4.174686484907908e-06,
"loss": 0.4898251533508301,
"step": 2710
},
{
"epoch": 0.8791919191919192,
"grad_norm": 0.01060938648879528,
"learning_rate": 3.963511207025078e-06,
"loss": 0.47261548042297363,
"step": 2720
},
{
"epoch": 0.8824242424242424,
"grad_norm": 0.01075353566557169,
"learning_rate": 3.7575960852189728e-06,
"loss": 0.4839645862579346,
"step": 2730
},
{
"epoch": 0.8856565656565657,
"grad_norm": 0.010416378267109394,
"learning_rate": 3.5569646474715722e-06,
"loss": 0.46989760398864744,
"step": 2740
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.010806340724229813,
"learning_rate": 3.361639818048068e-06,
"loss": 0.47272281646728515,
"step": 2750
},
{
"epoch": 0.8921212121212121,
"grad_norm": 0.009753530845046043,
"learning_rate": 3.1716439148774534e-06,
"loss": 0.47122917175292967,
"step": 2760
},
{
"epoch": 0.8953535353535353,
"grad_norm": 0.01022200658917427,
"learning_rate": 2.986998647002498e-06,
"loss": 0.4727674961090088,
"step": 2770
},
{
"epoch": 0.8985858585858586,
"grad_norm": 0.010225590318441391,
"learning_rate": 2.8077251120992742e-06,
"loss": 0.46461896896362304,
"step": 2780
},
{
"epoch": 0.9018181818181819,
"grad_norm": 0.010752941481769085,
"learning_rate": 2.633843794066515e-06,
"loss": 0.46602635383605956,
"step": 2790
},
{
"epoch": 0.9050505050505051,
"grad_norm": 0.011064955033361912,
"learning_rate": 2.465374560685091e-06,
"loss": 0.4705932140350342,
"step": 2800
},
{
"epoch": 0.9082828282828282,
"grad_norm": 0.00982996542006731,
"learning_rate": 2.302336661347926e-06,
"loss": 0.4687389373779297,
"step": 2810
},
{
"epoch": 0.9115151515151515,
"grad_norm": 0.010190518572926521,
"learning_rate": 2.1447487248605513e-06,
"loss": 0.4906333923339844,
"step": 2820
},
{
"epoch": 0.9147474747474748,
"grad_norm": 0.01034579798579216,
"learning_rate": 1.9926287573125537e-06,
"loss": 0.47768077850341795,
"step": 2830
},
{
"epoch": 0.917979797979798,
"grad_norm": 0.010615529492497444,
"learning_rate": 1.845994140020213e-06,
"loss": 0.47643141746520995,
"step": 2840
},
{
"epoch": 0.9212121212121213,
"grad_norm": 0.010776858776807785,
"learning_rate": 1.7048616275404771e-06,
"loss": 0.4636590003967285,
"step": 2850
},
{
"epoch": 0.9244444444444444,
"grad_norm": 0.010271280072629452,
"learning_rate": 1.5692473457565748e-06,
"loss": 0.4707911014556885,
"step": 2860
},
{
"epoch": 0.9276767676767677,
"grad_norm": 0.009596684016287327,
"learning_rate": 1.439166790035501e-06,
"loss": 0.45751228332519533,
"step": 2870
},
{
"epoch": 0.9309090909090909,
"grad_norm": 0.009914812631905079,
"learning_rate": 1.3146348234574724e-06,
"loss": 0.4842953681945801,
"step": 2880
},
{
"epoch": 0.9341414141414142,
"grad_norm": 0.01014726422727108,
"learning_rate": 1.1956656751176577e-06,
"loss": 0.482682991027832,
"step": 2890
},
{
"epoch": 0.9373737373737374,
"grad_norm": 0.010660757310688496,
"learning_rate": 1.0822729385003727e-06,
"loss": 0.48476057052612304,
"step": 2900
},
{
"epoch": 0.9406060606060606,
"grad_norm": 0.010105855762958527,
"learning_rate": 9.744695699258955e-07,
"loss": 0.4767448425292969,
"step": 2910
},
{
"epoch": 0.9438383838383838,
"grad_norm": 0.010221844539046288,
"learning_rate": 8.722678870700274e-07,
"loss": 0.4663360118865967,
"step": 2920
},
{
"epoch": 0.9470707070707071,
"grad_norm": 0.011003104969859123,
"learning_rate": 7.756795675566919e-07,
"loss": 0.48151307106018065,
"step": 2930
},
{
"epoch": 0.9503030303030303,
"grad_norm": 0.01029619574546814,
"learning_rate": 6.847156476236516e-07,
"loss": 0.48086977005004883,
"step": 2940
},
{
"epoch": 0.9535353535353536,
"grad_norm": 0.010033480823040009,
"learning_rate": 5.993865208614835e-07,
"loss": 0.4648440361022949,
"step": 2950
},
{
"epoch": 0.9567676767676768,
"grad_norm": 0.010539564304053783,
"learning_rate": 5.197019370260125e-07,
"loss": 0.46828551292419435,
"step": 2960
},
{
"epoch": 0.96,
"grad_norm": 0.009619071148335934,
"learning_rate": 4.4567100092429704e-07,
"loss": 0.48481130599975586,
"step": 2970
},
{
"epoch": 0.9632323232323232,
"grad_norm": 0.010197056457400322,
"learning_rate": 3.7730217137428857e-07,
"loss": 0.47917518615722654,
"step": 2980
},
{
"epoch": 0.9664646464646465,
"grad_norm": 0.01057458110153675,
"learning_rate": 3.1460326023836083e-07,
"loss": 0.48064794540405276,
"step": 2990
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.01009963545948267,
"learning_rate": 2.575814315306846e-07,
"loss": 0.4831876754760742,
"step": 3000
},
{
"epoch": 0.972929292929293,
"grad_norm": 0.010343736037611961,
"learning_rate": 2.0624320059869918e-07,
"loss": 0.4821047306060791,
"step": 3010
},
{
"epoch": 0.9761616161616161,
"grad_norm": 0.010384611785411835,
"learning_rate": 1.6059443337861912e-07,
"loss": 0.46325201988220216,
"step": 3020
},
{
"epoch": 0.9793939393939394,
"grad_norm": 0.010477159172296524,
"learning_rate": 1.2064034572523142e-07,
"loss": 0.4779689788818359,
"step": 3030
},
{
"epoch": 0.9826262626262626,
"grad_norm": 0.010612626560032368,
"learning_rate": 8.638550281591107e-08,
"loss": 0.4866930484771729,
"step": 3040
},
{
"epoch": 0.9858585858585859,
"grad_norm": 0.01027057133615017,
"learning_rate": 5.7833818629005054e-08,
"loss": 0.48350844383239744,
"step": 3050
},
{
"epoch": 0.9890909090909091,
"grad_norm": 0.010666116140782833,
"learning_rate": 3.498855549660118e-08,
"loss": 0.4722591400146484,
"step": 3060
},
{
"epoch": 0.9923232323232323,
"grad_norm": 0.009852438233792782,
"learning_rate": 1.785232373180401e-08,
"loss": 0.48914670944213867,
"step": 3070
},
{
"epoch": 0.9955555555555555,
"grad_norm": 0.010089240968227386,
"learning_rate": 6.427081330456774e-09,
"loss": 0.4730827331542969,
"step": 3080
},
{
"epoch": 0.9987878787878788,
"grad_norm": 0.010913597419857979,
"learning_rate": 7.141337474148025e-10,
"loss": 0.4666846752166748,
"step": 3090
},
{
"epoch": 1.0,
"step": 3094,
"total_flos": 4.3887930550675046e+18,
"train_loss": 0.4842414558126453,
"train_runtime": 8903.7116,
"train_samples_per_second": 22.238,
"train_steps_per_second": 0.347
}
],
"logging_steps": 10,
"max_steps": 3094,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.3887930550675046e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}