{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04,
"grad_norm": 0.7923470139503479,
"learning_rate": 0.00019967935871743488,
"loss": 0.4612,
"step": 10
},
{
"epoch": 0.08,
"grad_norm": 0.5407691597938538,
"learning_rate": 0.00019887775551102204,
"loss": 0.2353,
"step": 20
},
{
"epoch": 0.12,
"grad_norm": 0.5324429869651794,
"learning_rate": 0.00019807615230460924,
"loss": 0.2157,
"step": 30
},
{
"epoch": 0.16,
"grad_norm": 0.5783653259277344,
"learning_rate": 0.0001972745490981964,
"loss": 0.2017,
"step": 40
},
{
"epoch": 0.2,
"grad_norm": 0.47596609592437744,
"learning_rate": 0.00019647294589178357,
"loss": 0.1951,
"step": 50
},
{
"epoch": 0.24,
"grad_norm": 0.4201284945011139,
"learning_rate": 0.00019567134268537074,
"loss": 0.2002,
"step": 60
},
{
"epoch": 0.28,
"grad_norm": 0.4552278518676758,
"learning_rate": 0.00019486973947895793,
"loss": 0.1904,
"step": 70
},
{
"epoch": 0.32,
"grad_norm": 0.7762654423713684,
"learning_rate": 0.0001940681362725451,
"loss": 0.1789,
"step": 80
},
{
"epoch": 0.36,
"grad_norm": 0.48717886209487915,
"learning_rate": 0.00019326653306613227,
"loss": 0.1754,
"step": 90
},
{
"epoch": 0.4,
"grad_norm": 0.5642560720443726,
"learning_rate": 0.00019246492985971943,
"loss": 0.1807,
"step": 100
},
{
"epoch": 0.44,
"grad_norm": 0.5994827151298523,
"learning_rate": 0.00019166332665330663,
"loss": 0.1865,
"step": 110
},
{
"epoch": 0.48,
"grad_norm": 0.5326362252235413,
"learning_rate": 0.0001908617234468938,
"loss": 0.1845,
"step": 120
},
{
"epoch": 0.52,
"grad_norm": 0.8340044021606445,
"learning_rate": 0.00019006012024048096,
"loss": 0.1696,
"step": 130
},
{
"epoch": 0.56,
"grad_norm": 0.7146855592727661,
"learning_rate": 0.00018925851703406813,
"loss": 0.1844,
"step": 140
},
{
"epoch": 0.6,
"grad_norm": 0.5249161720275879,
"learning_rate": 0.00018845691382765532,
"loss": 0.1757,
"step": 150
},
{
"epoch": 0.64,
"grad_norm": 0.7083638310432434,
"learning_rate": 0.0001876553106212425,
"loss": 0.1918,
"step": 160
},
{
"epoch": 0.68,
"grad_norm": 0.40141209959983826,
"learning_rate": 0.00018685370741482966,
"loss": 0.1868,
"step": 170
},
{
"epoch": 0.72,
"grad_norm": 0.4578303098678589,
"learning_rate": 0.00018605210420841683,
"loss": 0.1809,
"step": 180
},
{
"epoch": 0.76,
"grad_norm": 0.49453985691070557,
"learning_rate": 0.00018525050100200402,
"loss": 0.1747,
"step": 190
},
{
"epoch": 0.8,
"grad_norm": 0.5069108009338379,
"learning_rate": 0.0001844488977955912,
"loss": 0.1781,
"step": 200
},
{
"epoch": 0.84,
"grad_norm": 0.6718438267707825,
"learning_rate": 0.00018364729458917838,
"loss": 0.1772,
"step": 210
},
{
"epoch": 0.88,
"grad_norm": 0.4564284682273865,
"learning_rate": 0.00018284569138276555,
"loss": 0.177,
"step": 220
},
{
"epoch": 0.92,
"grad_norm": 0.47330281138420105,
"learning_rate": 0.00018204408817635271,
"loss": 0.1715,
"step": 230
},
{
"epoch": 0.96,
"grad_norm": 0.4193236827850342,
"learning_rate": 0.0001812424849699399,
"loss": 0.1749,
"step": 240
},
{
"epoch": 1.0,
"grad_norm": 0.5325985550880432,
"learning_rate": 0.00018044088176352708,
"loss": 0.18,
"step": 250
},
{
"epoch": 1.04,
"grad_norm": 0.5181522965431213,
"learning_rate": 0.00017963927855711424,
"loss": 0.1537,
"step": 260
},
{
"epoch": 1.08,
"grad_norm": 0.4463769197463989,
"learning_rate": 0.0001788376753507014,
"loss": 0.1432,
"step": 270
},
{
"epoch": 1.12,
"grad_norm": 0.5249760746955872,
"learning_rate": 0.00017803607214428858,
"loss": 0.1607,
"step": 280
},
{
"epoch": 1.16,
"grad_norm": 0.42788001894950867,
"learning_rate": 0.00017723446893787577,
"loss": 0.1473,
"step": 290
},
{
"epoch": 1.2,
"grad_norm": 0.6883480548858643,
"learning_rate": 0.00017643286573146294,
"loss": 0.1481,
"step": 300
},
{
"epoch": 1.24,
"grad_norm": 0.5434200167655945,
"learning_rate": 0.0001756312625250501,
"loss": 0.1557,
"step": 310
},
{
"epoch": 1.28,
"grad_norm": 0.770155668258667,
"learning_rate": 0.00017482965931863727,
"loss": 0.1397,
"step": 320
},
{
"epoch": 1.32,
"grad_norm": 0.6204828023910522,
"learning_rate": 0.00017402805611222447,
"loss": 0.1569,
"step": 330
},
{
"epoch": 1.3599999999999999,
"grad_norm": 0.7150607705116272,
"learning_rate": 0.00017322645290581163,
"loss": 0.1523,
"step": 340
},
{
"epoch": 1.4,
"grad_norm": 0.5075599551200867,
"learning_rate": 0.0001724248496993988,
"loss": 0.1631,
"step": 350
},
{
"epoch": 1.44,
"grad_norm": 0.6147515177726746,
"learning_rate": 0.00017162324649298597,
"loss": 0.1476,
"step": 360
},
{
"epoch": 1.48,
"grad_norm": 0.6109094023704529,
"learning_rate": 0.00017082164328657316,
"loss": 0.1522,
"step": 370
},
{
"epoch": 1.52,
"grad_norm": 0.5690982341766357,
"learning_rate": 0.00017002004008016033,
"loss": 0.1555,
"step": 380
},
{
"epoch": 1.56,
"grad_norm": 0.44981974363327026,
"learning_rate": 0.0001692184368737475,
"loss": 0.1592,
"step": 390
},
{
"epoch": 1.6,
"grad_norm": 0.4784778952598572,
"learning_rate": 0.00016841683366733466,
"loss": 0.1636,
"step": 400
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.5931491851806641,
"learning_rate": 0.00016761523046092186,
"loss": 0.152,
"step": 410
},
{
"epoch": 1.6800000000000002,
"grad_norm": 0.663811206817627,
"learning_rate": 0.00016681362725450903,
"loss": 0.1704,
"step": 420
},
{
"epoch": 1.72,
"grad_norm": 0.4538479447364807,
"learning_rate": 0.0001660120240480962,
"loss": 0.1482,
"step": 430
},
{
"epoch": 1.76,
"grad_norm": 0.6272454261779785,
"learning_rate": 0.00016521042084168336,
"loss": 0.1545,
"step": 440
},
{
"epoch": 1.8,
"grad_norm": 0.5804396271705627,
"learning_rate": 0.00016440881763527055,
"loss": 0.1622,
"step": 450
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.5440978407859802,
"learning_rate": 0.00016360721442885772,
"loss": 0.1607,
"step": 460
},
{
"epoch": 1.88,
"grad_norm": 0.6250975131988525,
"learning_rate": 0.0001628056112224449,
"loss": 0.1547,
"step": 470
},
{
"epoch": 1.92,
"grad_norm": 0.41538259387016296,
"learning_rate": 0.00016200400801603208,
"loss": 0.146,
"step": 480
},
{
"epoch": 1.96,
"grad_norm": 0.5187687277793884,
"learning_rate": 0.00016120240480961925,
"loss": 0.1663,
"step": 490
},
{
"epoch": 2.0,
"grad_norm": 0.392794132232666,
"learning_rate": 0.00016040080160320644,
"loss": 0.1492,
"step": 500
},
{
"epoch": 2.04,
"grad_norm": 0.5112284421920776,
"learning_rate": 0.0001595991983967936,
"loss": 0.1276,
"step": 510
},
{
"epoch": 2.08,
"grad_norm": 0.577057421207428,
"learning_rate": 0.00015879759519038078,
"loss": 0.118,
"step": 520
},
{
"epoch": 2.12,
"grad_norm": 0.4589303433895111,
"learning_rate": 0.00015799599198396794,
"loss": 0.1251,
"step": 530
},
{
"epoch": 2.16,
"grad_norm": 0.4399261176586151,
"learning_rate": 0.0001571943887775551,
"loss": 0.1174,
"step": 540
},
{
"epoch": 2.2,
"grad_norm": 0.47207799553871155,
"learning_rate": 0.0001563927855711423,
"loss": 0.1301,
"step": 550
},
{
"epoch": 2.24,
"grad_norm": 0.5883737802505493,
"learning_rate": 0.00015559118236472947,
"loss": 0.1285,
"step": 560
},
{
"epoch": 2.2800000000000002,
"grad_norm": 0.5191317796707153,
"learning_rate": 0.00015478957915831664,
"loss": 0.1232,
"step": 570
},
{
"epoch": 2.32,
"grad_norm": 0.546741247177124,
"learning_rate": 0.0001539879759519038,
"loss": 0.1177,
"step": 580
},
{
"epoch": 2.36,
"grad_norm": 0.5307457447052002,
"learning_rate": 0.000153186372745491,
"loss": 0.1226,
"step": 590
},
{
"epoch": 2.4,
"grad_norm": 0.49284040927886963,
"learning_rate": 0.00015238476953907817,
"loss": 0.1398,
"step": 600
},
{
"epoch": 2.44,
"grad_norm": 0.46590954065322876,
"learning_rate": 0.00015158316633266534,
"loss": 0.1287,
"step": 610
},
{
"epoch": 2.48,
"grad_norm": 0.5511562824249268,
"learning_rate": 0.0001507815631262525,
"loss": 0.118,
"step": 620
},
{
"epoch": 2.52,
"grad_norm": 0.5350615978240967,
"learning_rate": 0.0001499799599198397,
"loss": 0.1295,
"step": 630
},
{
"epoch": 2.56,
"grad_norm": 0.5491256713867188,
"learning_rate": 0.00014917835671342686,
"loss": 0.129,
"step": 640
},
{
"epoch": 2.6,
"grad_norm": 0.5817452669143677,
"learning_rate": 0.00014837675350701403,
"loss": 0.1373,
"step": 650
},
{
"epoch": 2.64,
"grad_norm": 0.4481925964355469,
"learning_rate": 0.0001475751503006012,
"loss": 0.1389,
"step": 660
},
{
"epoch": 2.68,
"grad_norm": 0.5628023743629456,
"learning_rate": 0.0001467735470941884,
"loss": 0.1238,
"step": 670
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.678969144821167,
"learning_rate": 0.00014597194388777556,
"loss": 0.1318,
"step": 680
},
{
"epoch": 2.76,
"grad_norm": 0.5662192106246948,
"learning_rate": 0.00014517034068136273,
"loss": 0.1242,
"step": 690
},
{
"epoch": 2.8,
"grad_norm": 0.532526433467865,
"learning_rate": 0.0001443687374749499,
"loss": 0.1279,
"step": 700
},
{
"epoch": 2.84,
"grad_norm": 0.4268568158149719,
"learning_rate": 0.00014356713426853706,
"loss": 0.1342,
"step": 710
},
{
"epoch": 2.88,
"grad_norm": 0.6674071550369263,
"learning_rate": 0.00014276553106212425,
"loss": 0.1374,
"step": 720
},
{
"epoch": 2.92,
"grad_norm": 0.5499129891395569,
"learning_rate": 0.00014196392785571142,
"loss": 0.1226,
"step": 730
},
{
"epoch": 2.96,
"grad_norm": 0.5989218354225159,
"learning_rate": 0.0001411623246492986,
"loss": 0.1282,
"step": 740
},
{
"epoch": 3.0,
"grad_norm": 0.5111316442489624,
"learning_rate": 0.00014036072144288576,
"loss": 0.1241,
"step": 750
},
{
"epoch": 3.04,
"grad_norm": 0.4813462197780609,
"learning_rate": 0.00013955911823647295,
"loss": 0.0941,
"step": 760
},
{
"epoch": 3.08,
"grad_norm": 0.5328338742256165,
"learning_rate": 0.00013875751503006014,
"loss": 0.0914,
"step": 770
},
{
"epoch": 3.12,
"grad_norm": 0.5423403382301331,
"learning_rate": 0.0001379559118236473,
"loss": 0.0988,
"step": 780
},
{
"epoch": 3.16,
"grad_norm": 0.6739408373832703,
"learning_rate": 0.00013715430861723448,
"loss": 0.0995,
"step": 790
},
{
"epoch": 3.2,
"grad_norm": 0.5204668045043945,
"learning_rate": 0.00013635270541082165,
"loss": 0.0973,
"step": 800
},
{
"epoch": 3.24,
"grad_norm": 0.6864869594573975,
"learning_rate": 0.00013555110220440884,
"loss": 0.0911,
"step": 810
},
{
"epoch": 3.2800000000000002,
"grad_norm": 0.587958037853241,
"learning_rate": 0.000134749498997996,
"loss": 0.103,
"step": 820
},
{
"epoch": 3.32,
"grad_norm": 0.6141840219497681,
"learning_rate": 0.00013394789579158317,
"loss": 0.0925,
"step": 830
},
{
"epoch": 3.36,
"grad_norm": 0.5647754073143005,
"learning_rate": 0.00013314629258517034,
"loss": 0.1056,
"step": 840
},
{
"epoch": 3.4,
"grad_norm": 0.5085099339485168,
"learning_rate": 0.00013234468937875754,
"loss": 0.1071,
"step": 850
},
{
"epoch": 3.44,
"grad_norm": 0.6452666521072388,
"learning_rate": 0.0001315430861723447,
"loss": 0.1074,
"step": 860
},
{
"epoch": 3.48,
"grad_norm": 0.5613518357276917,
"learning_rate": 0.00013074148296593187,
"loss": 0.1066,
"step": 870
},
{
"epoch": 3.52,
"grad_norm": 0.5741850137710571,
"learning_rate": 0.00012993987975951904,
"loss": 0.1085,
"step": 880
},
{
"epoch": 3.56,
"grad_norm": 0.63754802942276,
"learning_rate": 0.00012913827655310623,
"loss": 0.1053,
"step": 890
},
{
"epoch": 3.6,
"grad_norm": 0.5717872381210327,
"learning_rate": 0.0001283366733466934,
"loss": 0.1058,
"step": 900
},
{
"epoch": 3.64,
"grad_norm": 0.5148450136184692,
"learning_rate": 0.00012753507014028056,
"loss": 0.1038,
"step": 910
},
{
"epoch": 3.68,
"grad_norm": 0.5819758772850037,
"learning_rate": 0.00012673346693386773,
"loss": 0.1119,
"step": 920
},
{
"epoch": 3.7199999999999998,
"grad_norm": 0.6014116406440735,
"learning_rate": 0.00012593186372745493,
"loss": 0.1081,
"step": 930
},
{
"epoch": 3.76,
"grad_norm": 0.574530303478241,
"learning_rate": 0.0001251302605210421,
"loss": 0.1023,
"step": 940
},
{
"epoch": 3.8,
"grad_norm": 0.6252711415290833,
"learning_rate": 0.00012432865731462926,
"loss": 0.0977,
"step": 950
},
{
"epoch": 3.84,
"grad_norm": 0.587404727935791,
"learning_rate": 0.00012352705410821643,
"loss": 0.0996,
"step": 960
},
{
"epoch": 3.88,
"grad_norm": 0.47545889019966125,
"learning_rate": 0.0001227254509018036,
"loss": 0.105,
"step": 970
},
{
"epoch": 3.92,
"grad_norm": 0.6459314823150635,
"learning_rate": 0.00012192384769539077,
"loss": 0.1003,
"step": 980
},
{
"epoch": 3.96,
"grad_norm": 0.5450368523597717,
"learning_rate": 0.00012112224448897796,
"loss": 0.1124,
"step": 990
},
{
"epoch": 4.0,
"grad_norm": 0.5874629616737366,
"learning_rate": 0.00012032064128256512,
"loss": 0.1089,
"step": 1000
},
{
"epoch": 4.04,
"grad_norm": 0.6516720056533813,
"learning_rate": 0.0001195190380761523,
"loss": 0.0842,
"step": 1010
},
{
"epoch": 4.08,
"grad_norm": 0.70643550157547,
"learning_rate": 0.00011871743486973947,
"loss": 0.075,
"step": 1020
},
{
"epoch": 4.12,
"grad_norm": 0.7798948287963867,
"learning_rate": 0.00011791583166332665,
"loss": 0.079,
"step": 1030
},
{
"epoch": 4.16,
"grad_norm": 0.7087392807006836,
"learning_rate": 0.00011711422845691385,
"loss": 0.0759,
"step": 1040
},
{
"epoch": 4.2,
"grad_norm": 0.5795313715934753,
"learning_rate": 0.00011631262525050101,
"loss": 0.0781,
"step": 1050
},
{
"epoch": 4.24,
"grad_norm": 0.6577343940734863,
"learning_rate": 0.00011551102204408819,
"loss": 0.0802,
"step": 1060
},
{
"epoch": 4.28,
"grad_norm": 0.6984887719154358,
"learning_rate": 0.00011470941883767536,
"loss": 0.0745,
"step": 1070
},
{
"epoch": 4.32,
"grad_norm": 0.7415528893470764,
"learning_rate": 0.00011390781563126254,
"loss": 0.0776,
"step": 1080
},
{
"epoch": 4.36,
"grad_norm": 0.46346089243888855,
"learning_rate": 0.00011310621242484971,
"loss": 0.0793,
"step": 1090
},
{
"epoch": 4.4,
"grad_norm": 0.5514031052589417,
"learning_rate": 0.00011230460921843689,
"loss": 0.0766,
"step": 1100
},
{
"epoch": 4.44,
"grad_norm": 0.5561518669128418,
"learning_rate": 0.00011150300601202406,
"loss": 0.069,
"step": 1110
},
{
"epoch": 4.48,
"grad_norm": 0.6889087557792664,
"learning_rate": 0.00011070140280561124,
"loss": 0.0809,
"step": 1120
},
{
"epoch": 4.52,
"grad_norm": 0.6279156804084778,
"learning_rate": 0.0001098997995991984,
"loss": 0.0684,
"step": 1130
},
{
"epoch": 4.5600000000000005,
"grad_norm": 0.4698103964328766,
"learning_rate": 0.00010909819639278558,
"loss": 0.0862,
"step": 1140
},
{
"epoch": 4.6,
"grad_norm": 0.5812392234802246,
"learning_rate": 0.00010829659318637275,
"loss": 0.0827,
"step": 1150
},
{
"epoch": 4.64,
"grad_norm": 0.6872307658195496,
"learning_rate": 0.00010749498997995993,
"loss": 0.0768,
"step": 1160
},
{
"epoch": 4.68,
"grad_norm": 0.6259503364562988,
"learning_rate": 0.0001066933867735471,
"loss": 0.0824,
"step": 1170
},
{
"epoch": 4.72,
"grad_norm": 0.5298041701316833,
"learning_rate": 0.00010589178356713428,
"loss": 0.0679,
"step": 1180
},
{
"epoch": 4.76,
"grad_norm": 0.8088281750679016,
"learning_rate": 0.00010509018036072145,
"loss": 0.0805,
"step": 1190
},
{
"epoch": 4.8,
"grad_norm": 0.49759751558303833,
"learning_rate": 0.00010428857715430861,
"loss": 0.0761,
"step": 1200
},
{
"epoch": 4.84,
"grad_norm": 0.4452104866504669,
"learning_rate": 0.0001034869739478958,
"loss": 0.0836,
"step": 1210
},
{
"epoch": 4.88,
"grad_norm": 0.4896758794784546,
"learning_rate": 0.00010268537074148296,
"loss": 0.0709,
"step": 1220
},
{
"epoch": 4.92,
"grad_norm": 0.6835769414901733,
"learning_rate": 0.00010188376753507014,
"loss": 0.0799,
"step": 1230
},
{
"epoch": 4.96,
"grad_norm": 0.7380080819129944,
"learning_rate": 0.00010108216432865731,
"loss": 0.0764,
"step": 1240
},
{
"epoch": 5.0,
"grad_norm": 0.7432966828346252,
"learning_rate": 0.00010028056112224449,
"loss": 0.079,
"step": 1250
},
{
"epoch": 5.04,
"grad_norm": 0.6639225482940674,
"learning_rate": 9.947895791583167e-05,
"loss": 0.0554,
"step": 1260
},
{
"epoch": 5.08,
"grad_norm": 0.5844371914863586,
"learning_rate": 9.867735470941885e-05,
"loss": 0.0502,
"step": 1270
},
{
"epoch": 5.12,
"grad_norm": 0.4597128629684448,
"learning_rate": 9.787575150300602e-05,
"loss": 0.0596,
"step": 1280
},
{
"epoch": 5.16,
"grad_norm": 0.7378055453300476,
"learning_rate": 9.70741482965932e-05,
"loss": 0.0556,
"step": 1290
},
{
"epoch": 5.2,
"grad_norm": 0.5333693027496338,
"learning_rate": 9.627254509018037e-05,
"loss": 0.0485,
"step": 1300
},
{
"epoch": 5.24,
"grad_norm": 0.51535564661026,
"learning_rate": 9.547094188376755e-05,
"loss": 0.0506,
"step": 1310
},
{
"epoch": 5.28,
"grad_norm": 0.5654531121253967,
"learning_rate": 9.466933867735471e-05,
"loss": 0.0556,
"step": 1320
},
{
"epoch": 5.32,
"grad_norm": 0.6834219694137573,
"learning_rate": 9.386773547094188e-05,
"loss": 0.0587,
"step": 1330
},
{
"epoch": 5.36,
"grad_norm": 0.5739651322364807,
"learning_rate": 9.306613226452906e-05,
"loss": 0.0533,
"step": 1340
},
{
"epoch": 5.4,
"grad_norm": 0.5924126505851746,
"learning_rate": 9.226452905811623e-05,
"loss": 0.0566,
"step": 1350
},
{
"epoch": 5.44,
"grad_norm": 0.6210141181945801,
"learning_rate": 9.146292585170341e-05,
"loss": 0.058,
"step": 1360
},
{
"epoch": 5.48,
"grad_norm": 0.445516437292099,
"learning_rate": 9.066132264529058e-05,
"loss": 0.0547,
"step": 1370
},
{
"epoch": 5.52,
"grad_norm": 0.5603981614112854,
"learning_rate": 8.985971943887777e-05,
"loss": 0.0565,
"step": 1380
},
{
"epoch": 5.5600000000000005,
"grad_norm": 0.6049801707267761,
"learning_rate": 8.905811623246494e-05,
"loss": 0.0579,
"step": 1390
},
{
"epoch": 5.6,
"grad_norm": 0.506960391998291,
"learning_rate": 8.825651302605212e-05,
"loss": 0.0493,
"step": 1400
},
{
"epoch": 5.64,
"grad_norm": 0.6527524590492249,
"learning_rate": 8.745490981963928e-05,
"loss": 0.0596,
"step": 1410
},
{
"epoch": 5.68,
"grad_norm": 0.6459997296333313,
"learning_rate": 8.665330661322647e-05,
"loss": 0.0512,
"step": 1420
},
{
"epoch": 5.72,
"grad_norm": 0.4932115375995636,
"learning_rate": 8.585170340681363e-05,
"loss": 0.0573,
"step": 1430
},
{
"epoch": 5.76,
"grad_norm": 0.7221292853355408,
"learning_rate": 8.50501002004008e-05,
"loss": 0.0605,
"step": 1440
},
{
"epoch": 5.8,
"grad_norm": 0.43518489599227905,
"learning_rate": 8.424849699398798e-05,
"loss": 0.0519,
"step": 1450
},
{
"epoch": 5.84,
"grad_norm": 0.7757974863052368,
"learning_rate": 8.344689378757515e-05,
"loss": 0.0567,
"step": 1460
},
{
"epoch": 5.88,
"grad_norm": 0.6453976631164551,
"learning_rate": 8.264529058116233e-05,
"loss": 0.0596,
"step": 1470
},
{
"epoch": 5.92,
"grad_norm": 0.6767393946647644,
"learning_rate": 8.18436873747495e-05,
"loss": 0.0597,
"step": 1480
},
{
"epoch": 5.96,
"grad_norm": 0.6432074904441833,
"learning_rate": 8.104208416833668e-05,
"loss": 0.0608,
"step": 1490
},
{
"epoch": 6.0,
"grad_norm": 0.4562494456768036,
"learning_rate": 8.024048096192384e-05,
"loss": 0.0519,
"step": 1500
},
{
"epoch": 6.04,
"grad_norm": 0.5894383788108826,
"learning_rate": 7.943887775551102e-05,
"loss": 0.0376,
"step": 1510
},
{
"epoch": 6.08,
"grad_norm": 0.5043510794639587,
"learning_rate": 7.86372745490982e-05,
"loss": 0.0364,
"step": 1520
},
{
"epoch": 6.12,
"grad_norm": 0.753901481628418,
"learning_rate": 7.783567134268538e-05,
"loss": 0.0387,
"step": 1530
},
{
"epoch": 6.16,
"grad_norm": 0.5726089477539062,
"learning_rate": 7.703406813627255e-05,
"loss": 0.0362,
"step": 1540
},
{
"epoch": 6.2,
"grad_norm": 0.7004730701446533,
"learning_rate": 7.623246492985973e-05,
"loss": 0.0327,
"step": 1550
},
{
"epoch": 6.24,
"grad_norm": 0.522422194480896,
"learning_rate": 7.54308617234469e-05,
"loss": 0.0326,
"step": 1560
},
{
"epoch": 6.28,
"grad_norm": 0.7951876521110535,
"learning_rate": 7.462925851703407e-05,
"loss": 0.0396,
"step": 1570
},
{
"epoch": 6.32,
"grad_norm": 0.5453311800956726,
"learning_rate": 7.382765531062125e-05,
"loss": 0.0366,
"step": 1580
},
{
"epoch": 6.36,
"grad_norm": 0.7363042235374451,
"learning_rate": 7.302605210420841e-05,
"loss": 0.0403,
"step": 1590
},
{
"epoch": 6.4,
"grad_norm": 0.7100352048873901,
"learning_rate": 7.22244488977956e-05,
"loss": 0.0392,
"step": 1600
},
{
"epoch": 6.44,
"grad_norm": 0.9634444713592529,
"learning_rate": 7.142284569138276e-05,
"loss": 0.0398,
"step": 1610
},
{
"epoch": 6.48,
"grad_norm": 0.6527013182640076,
"learning_rate": 7.062124248496994e-05,
"loss": 0.0369,
"step": 1620
},
{
"epoch": 6.52,
"grad_norm": 0.5168381333351135,
"learning_rate": 6.981963927855711e-05,
"loss": 0.0399,
"step": 1630
},
{
"epoch": 6.5600000000000005,
"grad_norm": 0.5673239827156067,
"learning_rate": 6.901803607214429e-05,
"loss": 0.0364,
"step": 1640
},
{
"epoch": 6.6,
"grad_norm": 0.4419175684452057,
"learning_rate": 6.821643286573146e-05,
"loss": 0.034,
"step": 1650
},
{
"epoch": 6.64,
"grad_norm": 0.5721241235733032,
"learning_rate": 6.741482965931865e-05,
"loss": 0.0407,
"step": 1660
},
{
"epoch": 6.68,
"grad_norm": 0.8612061738967896,
"learning_rate": 6.661322645290582e-05,
"loss": 0.0369,
"step": 1670
},
{
"epoch": 6.72,
"grad_norm": 0.6673168540000916,
"learning_rate": 6.581162324649299e-05,
"loss": 0.034,
"step": 1680
},
{
"epoch": 6.76,
"grad_norm": 0.816121518611908,
"learning_rate": 6.501002004008017e-05,
"loss": 0.0372,
"step": 1690
},
{
"epoch": 6.8,
"grad_norm": 0.5812684893608093,
"learning_rate": 6.420841683366733e-05,
"loss": 0.0379,
"step": 1700
},
{
"epoch": 6.84,
"grad_norm": 0.6159153580665588,
"learning_rate": 6.340681362725451e-05,
"loss": 0.0346,
"step": 1710
},
{
"epoch": 6.88,
"grad_norm": 0.809228241443634,
"learning_rate": 6.260521042084168e-05,
"loss": 0.0486,
"step": 1720
},
{
"epoch": 6.92,
"grad_norm": 0.6421244740486145,
"learning_rate": 6.180360721442886e-05,
"loss": 0.0387,
"step": 1730
},
{
"epoch": 6.96,
"grad_norm": 0.7321845889091492,
"learning_rate": 6.1002004008016036e-05,
"loss": 0.0395,
"step": 1740
},
{
"epoch": 7.0,
"grad_norm": 0.7744494676589966,
"learning_rate": 6.020040080160321e-05,
"loss": 0.0434,
"step": 1750
},
{
"epoch": 7.04,
"grad_norm": 0.6629056334495544,
"learning_rate": 5.9398797595190384e-05,
"loss": 0.0229,
"step": 1760
},
{
"epoch": 7.08,
"grad_norm": 0.5707855224609375,
"learning_rate": 5.859719438877756e-05,
"loss": 0.0261,
"step": 1770
},
{
"epoch": 7.12,
"grad_norm": 0.5904133319854736,
"learning_rate": 5.7795591182364725e-05,
"loss": 0.0239,
"step": 1780
},
{
"epoch": 7.16,
"grad_norm": 0.4738862216472626,
"learning_rate": 5.69939879759519e-05,
"loss": 0.0205,
"step": 1790
},
{
"epoch": 7.2,
"grad_norm": 0.5075474381446838,
"learning_rate": 5.6192384769539086e-05,
"loss": 0.025,
"step": 1800
},
{
"epoch": 7.24,
"grad_norm": 0.5726251602172852,
"learning_rate": 5.539078156312626e-05,
"loss": 0.0247,
"step": 1810
},
{
"epoch": 7.28,
"grad_norm": 0.5094057321548462,
"learning_rate": 5.4589178356713434e-05,
"loss": 0.0189,
"step": 1820
},
{
"epoch": 7.32,
"grad_norm": 0.47997888922691345,
"learning_rate": 5.378757515030061e-05,
"loss": 0.0221,
"step": 1830
},
{
"epoch": 7.36,
"grad_norm": 0.5335679650306702,
"learning_rate": 5.298597194388778e-05,
"loss": 0.0239,
"step": 1840
},
{
"epoch": 7.4,
"grad_norm": 0.5913345217704773,
"learning_rate": 5.2184368737474955e-05,
"loss": 0.0237,
"step": 1850
},
{
"epoch": 7.44,
"grad_norm": 0.41921791434288025,
"learning_rate": 5.138276553106213e-05,
"loss": 0.0294,
"step": 1860
},
{
"epoch": 7.48,
"grad_norm": 0.610205352306366,
"learning_rate": 5.05811623246493e-05,
"loss": 0.0237,
"step": 1870
},
{
"epoch": 7.52,
"grad_norm": 0.5666776299476624,
"learning_rate": 4.977955911823648e-05,
"loss": 0.0236,
"step": 1880
},
{
"epoch": 7.5600000000000005,
"grad_norm": 0.42802920937538147,
"learning_rate": 4.897795591182365e-05,
"loss": 0.0239,
"step": 1890
},
{
"epoch": 7.6,
"grad_norm": 0.748887836933136,
"learning_rate": 4.8176352705410824e-05,
"loss": 0.0269,
"step": 1900
},
{
"epoch": 7.64,
"grad_norm": 0.43109023571014404,
"learning_rate": 4.7374749498998e-05,
"loss": 0.0246,
"step": 1910
},
{
"epoch": 7.68,
"grad_norm": 0.43403562903404236,
"learning_rate": 4.657314629258517e-05,
"loss": 0.0219,
"step": 1920
},
{
"epoch": 7.72,
"grad_norm": 0.5174989104270935,
"learning_rate": 4.5771543086172346e-05,
"loss": 0.0229,
"step": 1930
},
{
"epoch": 7.76,
"grad_norm": 0.5523115396499634,
"learning_rate": 4.496993987975952e-05,
"loss": 0.0242,
"step": 1940
},
{
"epoch": 7.8,
"grad_norm": 0.4592013359069824,
"learning_rate": 4.4168336673346694e-05,
"loss": 0.0237,
"step": 1950
},
{
"epoch": 7.84,
"grad_norm": 0.574874997138977,
"learning_rate": 4.336673346693387e-05,
"loss": 0.0207,
"step": 1960
},
{
"epoch": 7.88,
"grad_norm": 0.6085746884346008,
"learning_rate": 4.256513026052105e-05,
"loss": 0.0213,
"step": 1970
},
{
"epoch": 7.92,
"grad_norm": 0.5086420178413391,
"learning_rate": 4.176352705410822e-05,
"loss": 0.0234,
"step": 1980
},
{
"epoch": 7.96,
"grad_norm": 0.4850587844848633,
"learning_rate": 4.0961923847695396e-05,
"loss": 0.0227,
"step": 1990
},
{
"epoch": 8.0,
"grad_norm": 0.469855934381485,
"learning_rate": 4.016032064128257e-05,
"loss": 0.026,
"step": 2000
},
{
"epoch": 8.04,
"grad_norm": 0.3705706000328064,
"learning_rate": 3.9358717434869744e-05,
"loss": 0.0156,
"step": 2010
},
{
"epoch": 8.08,
"grad_norm": 0.5124081969261169,
"learning_rate": 3.855711422845692e-05,
"loss": 0.0146,
"step": 2020
},
{
"epoch": 8.12,
"grad_norm": 0.3145318627357483,
"learning_rate": 3.7755511022044085e-05,
"loss": 0.0156,
"step": 2030
},
{
"epoch": 8.16,
"grad_norm": 0.47414782643318176,
"learning_rate": 3.6953907815631265e-05,
"loss": 0.015,
"step": 2040
},
{
"epoch": 8.2,
"grad_norm": 0.3775177597999573,
"learning_rate": 3.615230460921844e-05,
"loss": 0.0167,
"step": 2050
},
{
"epoch": 8.24,
"grad_norm": 0.43829330801963806,
"learning_rate": 3.535070140280561e-05,
"loss": 0.0135,
"step": 2060
},
{
"epoch": 8.28,
"grad_norm": 0.5022293329238892,
"learning_rate": 3.454909819639279e-05,
"loss": 0.013,
"step": 2070
},
{
"epoch": 8.32,
"grad_norm": 0.27599331736564636,
"learning_rate": 3.374749498997996e-05,
"loss": 0.0153,
"step": 2080
},
{
"epoch": 8.36,
"grad_norm": 0.82197105884552,
"learning_rate": 3.2945891783567135e-05,
"loss": 0.0164,
"step": 2090
},
{
"epoch": 8.4,
"grad_norm": 0.20758652687072754,
"learning_rate": 3.214428857715431e-05,
"loss": 0.0135,
"step": 2100
},
{
"epoch": 8.44,
"grad_norm": 0.29413196444511414,
"learning_rate": 3.134268537074149e-05,
"loss": 0.0122,
"step": 2110
},
{
"epoch": 8.48,
"grad_norm": 0.38946202397346497,
"learning_rate": 3.054108216432866e-05,
"loss": 0.0124,
"step": 2120
},
{
"epoch": 8.52,
"grad_norm": 0.3506149649620056,
"learning_rate": 2.9739478957915833e-05,
"loss": 0.0146,
"step": 2130
},
{
"epoch": 8.56,
"grad_norm": 0.31000277400016785,
"learning_rate": 2.8937875751503007e-05,
"loss": 0.0143,
"step": 2140
},
{
"epoch": 8.6,
"grad_norm": 0.3397505581378937,
"learning_rate": 2.813627254509018e-05,
"loss": 0.0146,
"step": 2150
},
{
"epoch": 8.64,
"grad_norm": 0.46700355410575867,
"learning_rate": 2.7334669338677355e-05,
"loss": 0.0148,
"step": 2160
},
{
"epoch": 8.68,
"grad_norm": 0.43519356846809387,
"learning_rate": 2.653306613226453e-05,
"loss": 0.0116,
"step": 2170
},
{
"epoch": 8.72,
"grad_norm": 0.3716731369495392,
"learning_rate": 2.5731462925851706e-05,
"loss": 0.0138,
"step": 2180
},
{
"epoch": 8.76,
"grad_norm": 0.43048301339149475,
"learning_rate": 2.4929859719438877e-05,
"loss": 0.0116,
"step": 2190
},
{
"epoch": 8.8,
"grad_norm": 0.586083173751831,
"learning_rate": 2.4128256513026054e-05,
"loss": 0.0118,
"step": 2200
},
{
"epoch": 8.84,
"grad_norm": 0.5221779346466064,
"learning_rate": 2.3326653306613228e-05,
"loss": 0.013,
"step": 2210
},
{
"epoch": 8.88,
"grad_norm": 0.4493936598300934,
"learning_rate": 2.25250501002004e-05,
"loss": 0.0161,
"step": 2220
},
{
"epoch": 8.92,
"grad_norm": 0.629578173160553,
"learning_rate": 2.172344689378758e-05,
"loss": 0.0136,
"step": 2230
},
{
"epoch": 8.96,
"grad_norm": 0.445533812046051,
"learning_rate": 2.092184368737475e-05,
"loss": 0.0116,
"step": 2240
},
{
"epoch": 9.0,
"grad_norm": 0.4184821546077728,
"learning_rate": 2.0120240480961923e-05,
"loss": 0.0108,
"step": 2250
},
{
"epoch": 9.04,
"grad_norm": 0.18857356905937195,
"learning_rate": 1.9318637274549097e-05,
"loss": 0.0088,
"step": 2260
},
{
"epoch": 9.08,
"grad_norm": 0.3190409541130066,
"learning_rate": 1.8517034068136274e-05,
"loss": 0.01,
"step": 2270
},
{
"epoch": 9.12,
"grad_norm": 0.23249320685863495,
"learning_rate": 1.7715430861723448e-05,
"loss": 0.0079,
"step": 2280
},
{
"epoch": 9.16,
"grad_norm": 0.29095596075057983,
"learning_rate": 1.6913827655310622e-05,
"loss": 0.0071,
"step": 2290
},
{
"epoch": 9.2,
"grad_norm": 0.2573925852775574,
"learning_rate": 1.6112224448897796e-05,
"loss": 0.0069,
"step": 2300
},
{
"epoch": 9.24,
"grad_norm": 0.14938637614250183,
"learning_rate": 1.531062124248497e-05,
"loss": 0.0098,
"step": 2310
},
{
"epoch": 9.28,
"grad_norm": 0.2534259855747223,
"learning_rate": 1.4509018036072145e-05,
"loss": 0.0082,
"step": 2320
},
{
"epoch": 9.32,
"grad_norm": 0.22516509890556335,
"learning_rate": 1.3707414829659317e-05,
"loss": 0.0085,
"step": 2330
},
{
"epoch": 9.36,
"grad_norm": 0.1505204439163208,
"learning_rate": 1.2905811623246495e-05,
"loss": 0.0071,
"step": 2340
},
{
"epoch": 9.4,
"grad_norm": 0.2739148736000061,
"learning_rate": 1.2104208416833669e-05,
"loss": 0.0096,
"step": 2350
},
{
"epoch": 9.44,
"grad_norm": 0.6390639543533325,
"learning_rate": 1.1302605210420842e-05,
"loss": 0.0072,
"step": 2360
},
{
"epoch": 9.48,
"grad_norm": 0.3169389069080353,
"learning_rate": 1.0501002004008016e-05,
"loss": 0.0069,
"step": 2370
},
{
"epoch": 9.52,
"grad_norm": 0.20216785371303558,
"learning_rate": 9.699398797595192e-06,
"loss": 0.0069,
"step": 2380
},
{
"epoch": 9.56,
"grad_norm": 0.21013762056827545,
"learning_rate": 8.897795591182364e-06,
"loss": 0.0084,
"step": 2390
},
{
"epoch": 9.6,
"grad_norm": 0.21212315559387207,
"learning_rate": 8.09619238476954e-06,
"loss": 0.0062,
"step": 2400
},
{
"epoch": 9.64,
"grad_norm": 0.23686246573925018,
"learning_rate": 7.294589178356714e-06,
"loss": 0.0102,
"step": 2410
},
{
"epoch": 9.68,
"grad_norm": 0.2729778587818146,
"learning_rate": 6.492985971943888e-06,
"loss": 0.007,
"step": 2420
},
{
"epoch": 9.72,
"grad_norm": 0.19149982929229736,
"learning_rate": 5.691382765531062e-06,
"loss": 0.0087,
"step": 2430
},
{
"epoch": 9.76,
"grad_norm": 0.20065714418888092,
"learning_rate": 4.889779559118237e-06,
"loss": 0.0067,
"step": 2440
},
{
"epoch": 9.8,
"grad_norm": 0.29255425930023193,
"learning_rate": 4.0881763527054114e-06,
"loss": 0.008,
"step": 2450
},
{
"epoch": 9.84,
"grad_norm": 0.254088819026947,
"learning_rate": 3.2865731462925853e-06,
"loss": 0.0065,
"step": 2460
},
{
"epoch": 9.88,
"grad_norm": 0.2548205554485321,
"learning_rate": 2.4849699398797596e-06,
"loss": 0.0082,
"step": 2470
},
{
"epoch": 9.92,
"grad_norm": 0.2607119679450989,
"learning_rate": 1.6833667334669339e-06,
"loss": 0.0091,
"step": 2480
},
{
"epoch": 9.96,
"grad_norm": 0.2837667465209961,
"learning_rate": 8.817635270541082e-07,
"loss": 0.0083,
"step": 2490
},
{
"epoch": 10.0,
"grad_norm": 0.24088458716869354,
"learning_rate": 8.016032064128256e-08,
"loss": 0.0089,
"step": 2500
}
],
"logging_steps": 10,
"max_steps": 2500,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.17341887028272e+17,
"train_batch_size": 20,
"trial_name": null,
"trial_params": null
}