{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 24651,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012169891687963978,
"grad_norm": 937035.375,
"learning_rate": 1.9919678714859437e-05,
"loss": 3.204,
"step": 100
},
{
"epoch": 0.024339783375927956,
"grad_norm": 1048863.375,
"learning_rate": 1.9838546103606345e-05,
"loss": 1.7829,
"step": 200
},
{
"epoch": 0.03650967506389193,
"grad_norm": 1297520.75,
"learning_rate": 1.9757413492353253e-05,
"loss": 1.5258,
"step": 300
},
{
"epoch": 0.04867956675185591,
"grad_norm": 876169.625,
"learning_rate": 1.967628088110016e-05,
"loss": 1.3856,
"step": 400
},
{
"epoch": 0.060849458439819884,
"grad_norm": 1111828.5,
"learning_rate": 1.9595148269847068e-05,
"loss": 1.2483,
"step": 500
},
{
"epoch": 0.07301935012778386,
"grad_norm": 876964.1875,
"learning_rate": 1.9514015658593976e-05,
"loss": 1.2228,
"step": 600
},
{
"epoch": 0.08518924181574784,
"grad_norm": 667343.125,
"learning_rate": 1.943288304734088e-05,
"loss": 1.2008,
"step": 700
},
{
"epoch": 0.09735913350371182,
"grad_norm": 1427273.75,
"learning_rate": 1.9351750436087787e-05,
"loss": 1.1185,
"step": 800
},
{
"epoch": 0.10952902519167579,
"grad_norm": 1462503.125,
"learning_rate": 1.9270617824834695e-05,
"loss": 1.0793,
"step": 900
},
{
"epoch": 0.12169891687963977,
"grad_norm": 704485.8125,
"learning_rate": 1.91894852135816e-05,
"loss": 1.0943,
"step": 1000
},
{
"epoch": 0.13386880856760375,
"grad_norm": 1196744.0,
"learning_rate": 1.9108352602328507e-05,
"loss": 1.0702,
"step": 1100
},
{
"epoch": 0.14603870025556773,
"grad_norm": 883526.5,
"learning_rate": 1.9027219991075415e-05,
"loss": 1.0606,
"step": 1200
},
{
"epoch": 0.1582085919435317,
"grad_norm": 857247.3125,
"learning_rate": 1.8946087379822322e-05,
"loss": 1.0503,
"step": 1300
},
{
"epoch": 0.1703784836314957,
"grad_norm": 877640.125,
"learning_rate": 1.8864954768569227e-05,
"loss": 0.9371,
"step": 1400
},
{
"epoch": 0.18254837531945967,
"grad_norm": 1136452.625,
"learning_rate": 1.8783822157316134e-05,
"loss": 0.9607,
"step": 1500
},
{
"epoch": 0.19471826700742365,
"grad_norm": 776925.5625,
"learning_rate": 1.8702689546063042e-05,
"loss": 1.0074,
"step": 1600
},
{
"epoch": 0.2068881586953876,
"grad_norm": 700356.4375,
"learning_rate": 1.862155693480995e-05,
"loss": 0.9649,
"step": 1700
},
{
"epoch": 0.21905805038335158,
"grad_norm": 889704.0,
"learning_rate": 1.8540424323556857e-05,
"loss": 0.9464,
"step": 1800
},
{
"epoch": 0.23122794207131556,
"grad_norm": 480890.5,
"learning_rate": 1.845929171230376e-05,
"loss": 0.9489,
"step": 1900
},
{
"epoch": 0.24339783375927954,
"grad_norm": 622537.625,
"learning_rate": 1.837815910105067e-05,
"loss": 0.9185,
"step": 2000
},
{
"epoch": 0.25556772544724354,
"grad_norm": 1142430.625,
"learning_rate": 1.8297026489797577e-05,
"loss": 0.9657,
"step": 2100
},
{
"epoch": 0.2677376171352075,
"grad_norm": 418717.90625,
"learning_rate": 1.821589387854448e-05,
"loss": 0.9109,
"step": 2200
},
{
"epoch": 0.27990750882317145,
"grad_norm": 1248165.75,
"learning_rate": 1.813476126729139e-05,
"loss": 0.9193,
"step": 2300
},
{
"epoch": 0.29207740051113545,
"grad_norm": 794509.5,
"learning_rate": 1.8053628656038296e-05,
"loss": 0.8797,
"step": 2400
},
{
"epoch": 0.3042472921990994,
"grad_norm": 610876.1875,
"learning_rate": 1.7972496044785204e-05,
"loss": 0.8851,
"step": 2500
},
{
"epoch": 0.3164171838870634,
"grad_norm": 1630106.5,
"learning_rate": 1.7891363433532108e-05,
"loss": 0.8487,
"step": 2600
},
{
"epoch": 0.32858707557502737,
"grad_norm": 1035289.8125,
"learning_rate": 1.7810230822279016e-05,
"loss": 0.8658,
"step": 2700
},
{
"epoch": 0.3407569672629914,
"grad_norm": 1528019.375,
"learning_rate": 1.7729098211025923e-05,
"loss": 0.8819,
"step": 2800
},
{
"epoch": 0.3529268589509553,
"grad_norm": 1408853.375,
"learning_rate": 1.764796559977283e-05,
"loss": 0.952,
"step": 2900
},
{
"epoch": 0.36509675063891933,
"grad_norm": 754760.5,
"learning_rate": 1.756683298851974e-05,
"loss": 0.928,
"step": 3000
},
{
"epoch": 0.3772666423268833,
"grad_norm": 890047.3125,
"learning_rate": 1.7485700377266643e-05,
"loss": 0.8566,
"step": 3100
},
{
"epoch": 0.3894365340148473,
"grad_norm": 564804.375,
"learning_rate": 1.740456776601355e-05,
"loss": 0.8467,
"step": 3200
},
{
"epoch": 0.40160642570281124,
"grad_norm": 540386.1875,
"learning_rate": 1.7323435154760458e-05,
"loss": 0.8995,
"step": 3300
},
{
"epoch": 0.4137763173907752,
"grad_norm": 733578.0,
"learning_rate": 1.7242302543507362e-05,
"loss": 0.8114,
"step": 3400
},
{
"epoch": 0.4259462090787392,
"grad_norm": 815934.875,
"learning_rate": 1.716116993225427e-05,
"loss": 0.856,
"step": 3500
},
{
"epoch": 0.43811610076670315,
"grad_norm": 1008148.0,
"learning_rate": 1.7080037321001177e-05,
"loss": 0.7677,
"step": 3600
},
{
"epoch": 0.45028599245466716,
"grad_norm": 708381.8125,
"learning_rate": 1.6998904709748085e-05,
"loss": 0.796,
"step": 3700
},
{
"epoch": 0.4624558841426311,
"grad_norm": 949816.8125,
"learning_rate": 1.6917772098494993e-05,
"loss": 0.8212,
"step": 3800
},
{
"epoch": 0.4746257758305951,
"grad_norm": 720175.4375,
"learning_rate": 1.68366394872419e-05,
"loss": 0.8439,
"step": 3900
},
{
"epoch": 0.48679566751855907,
"grad_norm": 582352.25,
"learning_rate": 1.6755506875988805e-05,
"loss": 0.8144,
"step": 4000
},
{
"epoch": 0.4989655592065231,
"grad_norm": 823707.625,
"learning_rate": 1.6674374264735712e-05,
"loss": 0.7497,
"step": 4100
},
{
"epoch": 0.5111354508944871,
"grad_norm": 732386.0625,
"learning_rate": 1.659324165348262e-05,
"loss": 0.8184,
"step": 4200
},
{
"epoch": 0.523305342582451,
"grad_norm": 770159.0,
"learning_rate": 1.6512109042229524e-05,
"loss": 0.7994,
"step": 4300
},
{
"epoch": 0.535475234270415,
"grad_norm": 752353.4375,
"learning_rate": 1.6430976430976432e-05,
"loss": 0.8176,
"step": 4400
},
{
"epoch": 0.547645125958379,
"grad_norm": 1363265.25,
"learning_rate": 1.634984381972334e-05,
"loss": 0.7828,
"step": 4500
},
{
"epoch": 0.5598150176463429,
"grad_norm": 1718103.0,
"learning_rate": 1.6268711208470247e-05,
"loss": 0.8077,
"step": 4600
},
{
"epoch": 0.5719849093343069,
"grad_norm": 930578.8125,
"learning_rate": 1.618757859721715e-05,
"loss": 0.7249,
"step": 4700
},
{
"epoch": 0.5841548010222709,
"grad_norm": 677129.1875,
"learning_rate": 1.610644598596406e-05,
"loss": 0.7853,
"step": 4800
},
{
"epoch": 0.5963246927102349,
"grad_norm": 619335.1875,
"learning_rate": 1.6025313374710967e-05,
"loss": 0.7623,
"step": 4900
},
{
"epoch": 0.6084945843981988,
"grad_norm": 793190.875,
"learning_rate": 1.5944180763457874e-05,
"loss": 0.7554,
"step": 5000
},
{
"epoch": 0.6206644760861628,
"grad_norm": 676459.5,
"learning_rate": 1.5863048152204782e-05,
"loss": 0.7342,
"step": 5100
},
{
"epoch": 0.6328343677741268,
"grad_norm": 1202478.25,
"learning_rate": 1.5781915540951686e-05,
"loss": 0.7866,
"step": 5200
},
{
"epoch": 0.6450042594620908,
"grad_norm": 958839.5,
"learning_rate": 1.5700782929698594e-05,
"loss": 0.8038,
"step": 5300
},
{
"epoch": 0.6571741511500547,
"grad_norm": 781309.25,
"learning_rate": 1.56196503184455e-05,
"loss": 0.7764,
"step": 5400
},
{
"epoch": 0.6693440428380187,
"grad_norm": 1317343.625,
"learning_rate": 1.5538517707192406e-05,
"loss": 0.7965,
"step": 5500
},
{
"epoch": 0.6815139345259827,
"grad_norm": 928345.4375,
"learning_rate": 1.5457385095939313e-05,
"loss": 0.7097,
"step": 5600
},
{
"epoch": 0.6936838262139466,
"grad_norm": 1027113.25,
"learning_rate": 1.537625248468622e-05,
"loss": 0.8198,
"step": 5700
},
{
"epoch": 0.7058537179019106,
"grad_norm": 698121.9375,
"learning_rate": 1.529511987343313e-05,
"loss": 0.783,
"step": 5800
},
{
"epoch": 0.7180236095898747,
"grad_norm": 835533.125,
"learning_rate": 1.5213987262180034e-05,
"loss": 0.7083,
"step": 5900
},
{
"epoch": 0.7301935012778387,
"grad_norm": 444278.4375,
"learning_rate": 1.513285465092694e-05,
"loss": 0.7802,
"step": 6000
},
{
"epoch": 0.7423633929658026,
"grad_norm": 673798.875,
"learning_rate": 1.5051722039673848e-05,
"loss": 0.797,
"step": 6100
},
{
"epoch": 0.7545332846537666,
"grad_norm": 886280.0,
"learning_rate": 1.4970589428420754e-05,
"loss": 0.7827,
"step": 6200
},
{
"epoch": 0.7667031763417306,
"grad_norm": 492442.40625,
"learning_rate": 1.4889456817167662e-05,
"loss": 0.7156,
"step": 6300
},
{
"epoch": 0.7788730680296946,
"grad_norm": 1017422.6875,
"learning_rate": 1.4808324205914568e-05,
"loss": 0.6945,
"step": 6400
},
{
"epoch": 0.7910429597176585,
"grad_norm": 737922.875,
"learning_rate": 1.4727191594661477e-05,
"loss": 0.7559,
"step": 6500
},
{
"epoch": 0.8032128514056225,
"grad_norm": 696342.0,
"learning_rate": 1.4646058983408383e-05,
"loss": 0.7982,
"step": 6600
},
{
"epoch": 0.8153827430935865,
"grad_norm": 676595.75,
"learning_rate": 1.4564926372155289e-05,
"loss": 0.7437,
"step": 6700
},
{
"epoch": 0.8275526347815504,
"grad_norm": 931158.625,
"learning_rate": 1.4483793760902196e-05,
"loss": 0.7817,
"step": 6800
},
{
"epoch": 0.8397225264695144,
"grad_norm": 510165.03125,
"learning_rate": 1.4402661149649102e-05,
"loss": 0.7618,
"step": 6900
},
{
"epoch": 0.8518924181574784,
"grad_norm": 660383.375,
"learning_rate": 1.432152853839601e-05,
"loss": 0.7658,
"step": 7000
},
{
"epoch": 0.8640623098454424,
"grad_norm": 528319.4375,
"learning_rate": 1.4240395927142916e-05,
"loss": 0.7113,
"step": 7100
},
{
"epoch": 0.8762322015334063,
"grad_norm": 1037643.125,
"learning_rate": 1.4159263315889823e-05,
"loss": 0.7364,
"step": 7200
},
{
"epoch": 0.8884020932213703,
"grad_norm": 249296.375,
"learning_rate": 1.407813070463673e-05,
"loss": 0.7371,
"step": 7300
},
{
"epoch": 0.9005719849093343,
"grad_norm": 259082.21875,
"learning_rate": 1.3996998093383635e-05,
"loss": 0.7758,
"step": 7400
},
{
"epoch": 0.9127418765972983,
"grad_norm": 603928.8125,
"learning_rate": 1.3915865482130545e-05,
"loss": 0.7161,
"step": 7500
},
{
"epoch": 0.9249117682852622,
"grad_norm": 722792.625,
"learning_rate": 1.3834732870877449e-05,
"loss": 0.7751,
"step": 7600
},
{
"epoch": 0.9370816599732262,
"grad_norm": 660232.3125,
"learning_rate": 1.3753600259624358e-05,
"loss": 0.7112,
"step": 7700
},
{
"epoch": 0.9492515516611902,
"grad_norm": 500458.6875,
"learning_rate": 1.3672467648371264e-05,
"loss": 0.768,
"step": 7800
},
{
"epoch": 0.9614214433491542,
"grad_norm": 604712.9375,
"learning_rate": 1.3591335037118172e-05,
"loss": 0.7293,
"step": 7900
},
{
"epoch": 0.9735913350371181,
"grad_norm": 373637.78125,
"learning_rate": 1.3510202425865078e-05,
"loss": 0.7558,
"step": 8000
},
{
"epoch": 0.9857612267250822,
"grad_norm": 681396.625,
"learning_rate": 1.3429069814611984e-05,
"loss": 0.746,
"step": 8100
},
{
"epoch": 0.9979311184130462,
"grad_norm": 448930.375,
"learning_rate": 1.3347937203358891e-05,
"loss": 0.6882,
"step": 8200
},
{
"epoch": 1.0,
"eval_loss": 0.6668452620506287,
"eval_runtime": 263.7204,
"eval_samples_per_second": 45.886,
"eval_steps_per_second": 2.87,
"step": 8217
},
{
"epoch": 1.0101010101010102,
"grad_norm": 526495.75,
"learning_rate": 1.3266804592105797e-05,
"loss": 0.6339,
"step": 8300
},
{
"epoch": 1.0222709017889742,
"grad_norm": 562899.25,
"learning_rate": 1.3185671980852705e-05,
"loss": 0.5614,
"step": 8400
},
{
"epoch": 1.034440793476938,
"grad_norm": 724823.875,
"learning_rate": 1.3104539369599611e-05,
"loss": 0.6053,
"step": 8500
},
{
"epoch": 1.046610685164902,
"grad_norm": 585095.625,
"learning_rate": 1.3023406758346517e-05,
"loss": 0.6184,
"step": 8600
},
{
"epoch": 1.058780576852866,
"grad_norm": 1030241.4375,
"learning_rate": 1.2942274147093426e-05,
"loss": 0.5891,
"step": 8700
},
{
"epoch": 1.07095046854083,
"grad_norm": 380056.78125,
"learning_rate": 1.2861141535840332e-05,
"loss": 0.5728,
"step": 8800
},
{
"epoch": 1.083120360228794,
"grad_norm": 639405.0,
"learning_rate": 1.278000892458724e-05,
"loss": 0.5113,
"step": 8900
},
{
"epoch": 1.095290251916758,
"grad_norm": 532266.0625,
"learning_rate": 1.2698876313334146e-05,
"loss": 0.5679,
"step": 9000
},
{
"epoch": 1.107460143604722,
"grad_norm": 739227.4375,
"learning_rate": 1.2617743702081053e-05,
"loss": 0.5704,
"step": 9100
},
{
"epoch": 1.119630035292686,
"grad_norm": 610794.125,
"learning_rate": 1.253661109082796e-05,
"loss": 0.5849,
"step": 9200
},
{
"epoch": 1.1317999269806498,
"grad_norm": 1094627.0,
"learning_rate": 1.2455478479574865e-05,
"loss": 0.5622,
"step": 9300
},
{
"epoch": 1.1439698186686138,
"grad_norm": 484573.78125,
"learning_rate": 1.2374345868321773e-05,
"loss": 0.5225,
"step": 9400
},
{
"epoch": 1.1561397103565778,
"grad_norm": 616542.0,
"learning_rate": 1.2293213257068679e-05,
"loss": 0.6138,
"step": 9500
},
{
"epoch": 1.1683096020445418,
"grad_norm": 774001.75,
"learning_rate": 1.2212080645815586e-05,
"loss": 0.5751,
"step": 9600
},
{
"epoch": 1.1804794937325058,
"grad_norm": 426144.40625,
"learning_rate": 1.2130948034562492e-05,
"loss": 0.5364,
"step": 9700
},
{
"epoch": 1.1926493854204698,
"grad_norm": 837338.25,
"learning_rate": 1.2049815423309402e-05,
"loss": 0.5749,
"step": 9800
},
{
"epoch": 1.2048192771084336,
"grad_norm": 1166716.5,
"learning_rate": 1.1968682812056308e-05,
"loss": 0.5591,
"step": 9900
},
{
"epoch": 1.2169891687963976,
"grad_norm": 1038052.8125,
"learning_rate": 1.1887550200803213e-05,
"loss": 0.5873,
"step": 10000
},
{
"epoch": 1.2291590604843616,
"grad_norm": 720955.0625,
"learning_rate": 1.1806417589550121e-05,
"loss": 0.5493,
"step": 10100
},
{
"epoch": 1.2413289521723256,
"grad_norm": 1045855.625,
"learning_rate": 1.1725284978297027e-05,
"loss": 0.5848,
"step": 10200
},
{
"epoch": 1.2534988438602896,
"grad_norm": 464351.90625,
"learning_rate": 1.1644152367043935e-05,
"loss": 0.5139,
"step": 10300
},
{
"epoch": 1.2656687355482537,
"grad_norm": 761802.375,
"learning_rate": 1.156301975579084e-05,
"loss": 0.5724,
"step": 10400
},
{
"epoch": 1.2778386272362177,
"grad_norm": 348438.75,
"learning_rate": 1.1481887144537748e-05,
"loss": 0.5744,
"step": 10500
},
{
"epoch": 1.2900085189241817,
"grad_norm": 918051.5,
"learning_rate": 1.1400754533284654e-05,
"loss": 0.5343,
"step": 10600
},
{
"epoch": 1.3021784106121457,
"grad_norm": 787824.0625,
"learning_rate": 1.131962192203156e-05,
"loss": 0.5321,
"step": 10700
},
{
"epoch": 1.3143483023001095,
"grad_norm": 448052.0,
"learning_rate": 1.123848931077847e-05,
"loss": 0.5562,
"step": 10800
},
{
"epoch": 1.3265181939880735,
"grad_norm": 1413978.625,
"learning_rate": 1.1157356699525375e-05,
"loss": 0.5763,
"step": 10900
},
{
"epoch": 1.3386880856760375,
"grad_norm": 366529.03125,
"learning_rate": 1.1076224088272283e-05,
"loss": 0.6262,
"step": 11000
},
{
"epoch": 1.3508579773640015,
"grad_norm": 875949.5,
"learning_rate": 1.0995091477019189e-05,
"loss": 0.5719,
"step": 11100
},
{
"epoch": 1.3630278690519655,
"grad_norm": 646206.9375,
"learning_rate": 1.0913958865766097e-05,
"loss": 0.5938,
"step": 11200
},
{
"epoch": 1.3751977607399295,
"grad_norm": 1301331.5,
"learning_rate": 1.0832826254513003e-05,
"loss": 0.5525,
"step": 11300
},
{
"epoch": 1.3873676524278933,
"grad_norm": 575320.25,
"learning_rate": 1.0751693643259908e-05,
"loss": 0.5473,
"step": 11400
},
{
"epoch": 1.3995375441158573,
"grad_norm": 1271057.875,
"learning_rate": 1.0670561032006816e-05,
"loss": 0.5698,
"step": 11500
},
{
"epoch": 1.4117074358038213,
"grad_norm": 296549.71875,
"learning_rate": 1.0589428420753722e-05,
"loss": 0.5891,
"step": 11600
},
{
"epoch": 1.4238773274917853,
"grad_norm": 843789.0,
"learning_rate": 1.050829580950063e-05,
"loss": 0.5645,
"step": 11700
},
{
"epoch": 1.4360472191797493,
"grad_norm": 1364688.875,
"learning_rate": 1.0427163198247536e-05,
"loss": 0.5928,
"step": 11800
},
{
"epoch": 1.4482171108677133,
"grad_norm": 949730.625,
"learning_rate": 1.0346030586994442e-05,
"loss": 0.5808,
"step": 11900
},
{
"epoch": 1.4603870025556773,
"grad_norm": 1451533.5,
"learning_rate": 1.0264897975741351e-05,
"loss": 0.5724,
"step": 12000
},
{
"epoch": 1.4725568942436413,
"grad_norm": 765235.5,
"learning_rate": 1.0183765364488257e-05,
"loss": 0.5717,
"step": 12100
},
{
"epoch": 1.4847267859316053,
"grad_norm": 678600.6875,
"learning_rate": 1.0102632753235164e-05,
"loss": 0.5869,
"step": 12200
},
{
"epoch": 1.4968966776195691,
"grad_norm": 968140.4375,
"learning_rate": 1.002150014198207e-05,
"loss": 0.5524,
"step": 12300
},
{
"epoch": 1.5090665693075331,
"grad_norm": 997379.0625,
"learning_rate": 9.940367530728976e-06,
"loss": 0.5984,
"step": 12400
},
{
"epoch": 1.5212364609954971,
"grad_norm": 553474.9375,
"learning_rate": 9.859234919475884e-06,
"loss": 0.5907,
"step": 12500
},
{
"epoch": 1.5334063526834611,
"grad_norm": 764892.25,
"learning_rate": 9.778102308222792e-06,
"loss": 0.5059,
"step": 12600
},
{
"epoch": 1.545576244371425,
"grad_norm": 807238.5,
"learning_rate": 9.696969696969698e-06,
"loss": 0.5545,
"step": 12700
},
{
"epoch": 1.557746136059389,
"grad_norm": 732502.3125,
"learning_rate": 9.615837085716605e-06,
"loss": 0.5859,
"step": 12800
},
{
"epoch": 1.569916027747353,
"grad_norm": 346234.3125,
"learning_rate": 9.534704474463511e-06,
"loss": 0.5527,
"step": 12900
},
{
"epoch": 1.582085919435317,
"grad_norm": 959444.1875,
"learning_rate": 9.453571863210417e-06,
"loss": 0.5873,
"step": 13000
},
{
"epoch": 1.594255811123281,
"grad_norm": 506975.46875,
"learning_rate": 9.372439251957325e-06,
"loss": 0.5604,
"step": 13100
},
{
"epoch": 1.606425702811245,
"grad_norm": 452580.34375,
"learning_rate": 9.291306640704232e-06,
"loss": 0.5076,
"step": 13200
},
{
"epoch": 1.618595594499209,
"grad_norm": 889885.8125,
"learning_rate": 9.210174029451138e-06,
"loss": 0.5833,
"step": 13300
},
{
"epoch": 1.630765486187173,
"grad_norm": 423025.90625,
"learning_rate": 9.129041418198046e-06,
"loss": 0.6075,
"step": 13400
},
{
"epoch": 1.642935377875137,
"grad_norm": 948548.1875,
"learning_rate": 9.047908806944954e-06,
"loss": 0.5736,
"step": 13500
},
{
"epoch": 1.655105269563101,
"grad_norm": 917991.125,
"learning_rate": 8.96677619569186e-06,
"loss": 0.5497,
"step": 13600
},
{
"epoch": 1.667275161251065,
"grad_norm": 352658.0625,
"learning_rate": 8.885643584438765e-06,
"loss": 0.5475,
"step": 13700
},
{
"epoch": 1.6794450529390288,
"grad_norm": 807333.25,
"learning_rate": 8.804510973185673e-06,
"loss": 0.56,
"step": 13800
},
{
"epoch": 1.6916149446269928,
"grad_norm": 530145.875,
"learning_rate": 8.723378361932579e-06,
"loss": 0.5408,
"step": 13900
},
{
"epoch": 1.7037848363149568,
"grad_norm": 630776.3125,
"learning_rate": 8.642245750679487e-06,
"loss": 0.5707,
"step": 14000
},
{
"epoch": 1.7159547280029208,
"grad_norm": 859784.0625,
"learning_rate": 8.561113139426394e-06,
"loss": 0.5526,
"step": 14100
},
{
"epoch": 1.7281246196908846,
"grad_norm": 383244.90625,
"learning_rate": 8.4799805281733e-06,
"loss": 0.5415,
"step": 14200
},
{
"epoch": 1.7402945113788486,
"grad_norm": 183796.140625,
"learning_rate": 8.398847916920206e-06,
"loss": 0.5929,
"step": 14300
},
{
"epoch": 1.7524644030668126,
"grad_norm": 834435.0625,
"learning_rate": 8.317715305667114e-06,
"loss": 0.5811,
"step": 14400
},
{
"epoch": 1.7646342947547766,
"grad_norm": 1005049.8125,
"learning_rate": 8.23658269441402e-06,
"loss": 0.4738,
"step": 14500
},
{
"epoch": 1.7768041864427406,
"grad_norm": 1233122.5,
"learning_rate": 8.155450083160927e-06,
"loss": 0.5355,
"step": 14600
},
{
"epoch": 1.7889740781307046,
"grad_norm": 1253067.5,
"learning_rate": 8.074317471907835e-06,
"loss": 0.5409,
"step": 14700
},
{
"epoch": 1.8011439698186686,
"grad_norm": 286456.0,
"learning_rate": 7.993184860654741e-06,
"loss": 0.5088,
"step": 14800
},
{
"epoch": 1.8133138615066327,
"grad_norm": 776422.375,
"learning_rate": 7.912052249401647e-06,
"loss": 0.5306,
"step": 14900
},
{
"epoch": 1.8254837531945967,
"grad_norm": 764024.75,
"learning_rate": 7.830919638148554e-06,
"loss": 0.5295,
"step": 15000
},
{
"epoch": 1.8376536448825607,
"grad_norm": 1123857.125,
"learning_rate": 7.74978702689546e-06,
"loss": 0.5476,
"step": 15100
},
{
"epoch": 1.8498235365705247,
"grad_norm": 606708.5,
"learning_rate": 7.668654415642368e-06,
"loss": 0.5496,
"step": 15200
},
{
"epoch": 1.8619934282584885,
"grad_norm": 632422.75,
"learning_rate": 7.587521804389275e-06,
"loss": 0.5469,
"step": 15300
},
{
"epoch": 1.8741633199464525,
"grad_norm": 354093.15625,
"learning_rate": 7.5063891931361825e-06,
"loss": 0.5983,
"step": 15400
},
{
"epoch": 1.8863332116344165,
"grad_norm": 285148.65625,
"learning_rate": 7.425256581883089e-06,
"loss": 0.5253,
"step": 15500
},
{
"epoch": 1.8985031033223805,
"grad_norm": 1560757.125,
"learning_rate": 7.344123970629995e-06,
"loss": 0.5476,
"step": 15600
},
{
"epoch": 1.9106729950103443,
"grad_norm": 608123.4375,
"learning_rate": 7.262991359376902e-06,
"loss": 0.5116,
"step": 15700
},
{
"epoch": 1.9228428866983083,
"grad_norm": 1332108.375,
"learning_rate": 7.181858748123809e-06,
"loss": 0.516,
"step": 15800
},
{
"epoch": 1.9350127783862723,
"grad_norm": 376606.625,
"learning_rate": 7.1007261368707156e-06,
"loss": 0.5317,
"step": 15900
},
{
"epoch": 1.9471826700742363,
"grad_norm": 913279.125,
"learning_rate": 7.019593525617623e-06,
"loss": 0.5573,
"step": 16000
},
{
"epoch": 1.9593525617622003,
"grad_norm": 290156.71875,
"learning_rate": 6.93846091436453e-06,
"loss": 0.5275,
"step": 16100
},
{
"epoch": 1.9715224534501643,
"grad_norm": 760140.5625,
"learning_rate": 6.857328303111436e-06,
"loss": 0.5982,
"step": 16200
},
{
"epoch": 1.9836923451381283,
"grad_norm": 312559.34375,
"learning_rate": 6.776195691858343e-06,
"loss": 0.5991,
"step": 16300
},
{
"epoch": 1.9958622368260923,
"grad_norm": 860316.0,
"learning_rate": 6.6950630806052495e-06,
"loss": 0.5509,
"step": 16400
},
{
"epoch": 2.0,
"eval_loss": 0.7069754004478455,
"eval_runtime": 263.7582,
"eval_samples_per_second": 45.879,
"eval_steps_per_second": 2.87,
"step": 16434
},
{
"epoch": 2.0080321285140563,
"grad_norm": 1111842.375,
"learning_rate": 6.613930469352156e-06,
"loss": 0.4853,
"step": 16500
},
{
"epoch": 2.0202020202020203,
"grad_norm": 734144.25,
"learning_rate": 6.532797858099064e-06,
"loss": 0.3957,
"step": 16600
},
{
"epoch": 2.0323719118899843,
"grad_norm": 915932.9375,
"learning_rate": 6.451665246845971e-06,
"loss": 0.4254,
"step": 16700
},
{
"epoch": 2.0445418035779483,
"grad_norm": 282760.53125,
"learning_rate": 6.3705326355928775e-06,
"loss": 0.4188,
"step": 16800
},
{
"epoch": 2.0567116952659124,
"grad_norm": 248830.09375,
"learning_rate": 6.289400024339783e-06,
"loss": 0.444,
"step": 16900
},
{
"epoch": 2.068881586953876,
"grad_norm": 1877179.75,
"learning_rate": 6.20826741308669e-06,
"loss": 0.4139,
"step": 17000
},
{
"epoch": 2.08105147864184,
"grad_norm": 1332360.75,
"learning_rate": 6.127134801833598e-06,
"loss": 0.4682,
"step": 17100
},
{
"epoch": 2.093221370329804,
"grad_norm": 1522515.625,
"learning_rate": 6.046002190580505e-06,
"loss": 0.4316,
"step": 17200
},
{
"epoch": 2.105391262017768,
"grad_norm": 904717.9375,
"learning_rate": 5.964869579327411e-06,
"loss": 0.424,
"step": 17300
},
{
"epoch": 2.117561153705732,
"grad_norm": 585235.875,
"learning_rate": 5.883736968074318e-06,
"loss": 0.4258,
"step": 17400
},
{
"epoch": 2.129731045393696,
"grad_norm": 938551.75,
"learning_rate": 5.802604356821224e-06,
"loss": 0.388,
"step": 17500
},
{
"epoch": 2.14190093708166,
"grad_norm": 274852.875,
"learning_rate": 5.721471745568131e-06,
"loss": 0.4287,
"step": 17600
},
{
"epoch": 2.154070828769624,
"grad_norm": 832885.125,
"learning_rate": 5.6403391343150385e-06,
"loss": 0.4446,
"step": 17700
},
{
"epoch": 2.166240720457588,
"grad_norm": 729798.5625,
"learning_rate": 5.559206523061945e-06,
"loss": 0.3869,
"step": 17800
},
{
"epoch": 2.178410612145552,
"grad_norm": 690389.625,
"learning_rate": 5.478073911808852e-06,
"loss": 0.3944,
"step": 17900
},
{
"epoch": 2.190580503833516,
"grad_norm": 337837.53125,
"learning_rate": 5.396941300555759e-06,
"loss": 0.398,
"step": 18000
},
{
"epoch": 2.20275039552148,
"grad_norm": 1882286.5,
"learning_rate": 5.3158086893026665e-06,
"loss": 0.3846,
"step": 18100
},
{
"epoch": 2.214920287209444,
"grad_norm": 662988.8125,
"learning_rate": 5.234676078049572e-06,
"loss": 0.4342,
"step": 18200
},
{
"epoch": 2.227090178897408,
"grad_norm": 207260.828125,
"learning_rate": 5.153543466796479e-06,
"loss": 0.4045,
"step": 18300
},
{
"epoch": 2.239260070585372,
"grad_norm": 1592776.0,
"learning_rate": 5.072410855543386e-06,
"loss": 0.4304,
"step": 18400
},
{
"epoch": 2.2514299622733356,
"grad_norm": 273817.09375,
"learning_rate": 4.991278244290293e-06,
"loss": 0.4213,
"step": 18500
},
{
"epoch": 2.2635998539612996,
"grad_norm": 735677.5,
"learning_rate": 4.9101456330372e-06,
"loss": 0.4479,
"step": 18600
},
{
"epoch": 2.2757697456492636,
"grad_norm": 848157.9375,
"learning_rate": 4.829013021784106e-06,
"loss": 0.4594,
"step": 18700
},
{
"epoch": 2.2879396373372276,
"grad_norm": 587701.5625,
"learning_rate": 4.747880410531013e-06,
"loss": 0.4424,
"step": 18800
},
{
"epoch": 2.3001095290251916,
"grad_norm": 1382945.0,
"learning_rate": 4.66674779927792e-06,
"loss": 0.4185,
"step": 18900
},
{
"epoch": 2.3122794207131556,
"grad_norm": 712288.0625,
"learning_rate": 4.585615188024827e-06,
"loss": 0.4261,
"step": 19000
},
{
"epoch": 2.3244493124011196,
"grad_norm": 1519380.375,
"learning_rate": 4.5044825767717336e-06,
"loss": 0.4388,
"step": 19100
},
{
"epoch": 2.3366192040890836,
"grad_norm": 1865403.5,
"learning_rate": 4.42334996551864e-06,
"loss": 0.4399,
"step": 19200
},
{
"epoch": 2.3487890957770476,
"grad_norm": 1357723.375,
"learning_rate": 4.342217354265547e-06,
"loss": 0.4483,
"step": 19300
},
{
"epoch": 2.3609589874650116,
"grad_norm": 249520.984375,
"learning_rate": 4.261084743012454e-06,
"loss": 0.4371,
"step": 19400
},
{
"epoch": 2.3731288791529757,
"grad_norm": 1260475.25,
"learning_rate": 4.179952131759361e-06,
"loss": 0.4268,
"step": 19500
},
{
"epoch": 2.3852987708409397,
"grad_norm": 1328097.5,
"learning_rate": 4.098819520506268e-06,
"loss": 0.4109,
"step": 19600
},
{
"epoch": 2.3974686625289037,
"grad_norm": 538742.125,
"learning_rate": 4.017686909253174e-06,
"loss": 0.4352,
"step": 19700
},
{
"epoch": 2.4096385542168672,
"grad_norm": 950668.5,
"learning_rate": 3.936554298000082e-06,
"loss": 0.4105,
"step": 19800
},
{
"epoch": 2.4218084459048317,
"grad_norm": 881245.6875,
"learning_rate": 3.855421686746989e-06,
"loss": 0.4359,
"step": 19900
},
{
"epoch": 2.4339783375927952,
"grad_norm": 354214.0,
"learning_rate": 3.774289075493895e-06,
"loss": 0.4111,
"step": 20000
},
{
"epoch": 2.4461482292807593,
"grad_norm": 1019421.125,
"learning_rate": 3.693156464240802e-06,
"loss": 0.4191,
"step": 20100
},
{
"epoch": 2.4583181209687233,
"grad_norm": 600526.75,
"learning_rate": 3.612023852987709e-06,
"loss": 0.4142,
"step": 20200
},
{
"epoch": 2.4704880126566873,
"grad_norm": 633427.0625,
"learning_rate": 3.5308912417346154e-06,
"loss": 0.4216,
"step": 20300
},
{
"epoch": 2.4826579043446513,
"grad_norm": 451850.40625,
"learning_rate": 3.449758630481522e-06,
"loss": 0.4172,
"step": 20400
},
{
"epoch": 2.4948277960326153,
"grad_norm": 1016081.8125,
"learning_rate": 3.3686260192284294e-06,
"loss": 0.4243,
"step": 20500
},
{
"epoch": 2.5069976877205793,
"grad_norm": 1442025.375,
"learning_rate": 3.287493407975336e-06,
"loss": 0.3936,
"step": 20600
},
{
"epoch": 2.5191675794085433,
"grad_norm": 1038331.9375,
"learning_rate": 3.2063607967222425e-06,
"loss": 0.453,
"step": 20700
},
{
"epoch": 2.5313374710965073,
"grad_norm": 1017244.0,
"learning_rate": 3.1252281854691498e-06,
"loss": 0.4306,
"step": 20800
},
{
"epoch": 2.5435073627844713,
"grad_norm": 3449184.25,
"learning_rate": 3.0440955742160565e-06,
"loss": 0.4218,
"step": 20900
},
{
"epoch": 2.5556772544724353,
"grad_norm": 1605587.875,
"learning_rate": 2.962962962962963e-06,
"loss": 0.4492,
"step": 21000
},
{
"epoch": 2.5678471461603993,
"grad_norm": 386683.15625,
"learning_rate": 2.88183035170987e-06,
"loss": 0.3956,
"step": 21100
},
{
"epoch": 2.5800170378483633,
"grad_norm": 997831.125,
"learning_rate": 2.800697740456777e-06,
"loss": 0.4212,
"step": 21200
},
{
"epoch": 2.592186929536327,
"grad_norm": 746647.625,
"learning_rate": 2.7195651292036833e-06,
"loss": 0.4191,
"step": 21300
},
{
"epoch": 2.6043568212242914,
"grad_norm": 1180154.625,
"learning_rate": 2.6384325179505905e-06,
"loss": 0.4352,
"step": 21400
},
{
"epoch": 2.616526712912255,
"grad_norm": 262904.28125,
"learning_rate": 2.5572999066974973e-06,
"loss": 0.4068,
"step": 21500
},
{
"epoch": 2.628696604600219,
"grad_norm": 846819.8125,
"learning_rate": 2.476167295444404e-06,
"loss": 0.4141,
"step": 21600
},
{
"epoch": 2.640866496288183,
"grad_norm": 891045.8125,
"learning_rate": 2.395034684191311e-06,
"loss": 0.4611,
"step": 21700
},
{
"epoch": 2.653036387976147,
"grad_norm": 345922.375,
"learning_rate": 2.3139020729382176e-06,
"loss": 0.4244,
"step": 21800
},
{
"epoch": 2.665206279664111,
"grad_norm": 1789013.0,
"learning_rate": 2.2327694616851244e-06,
"loss": 0.4097,
"step": 21900
},
{
"epoch": 2.677376171352075,
"grad_norm": 295374.0625,
"learning_rate": 2.1516368504320316e-06,
"loss": 0.386,
"step": 22000
},
{
"epoch": 2.689546063040039,
"grad_norm": 625661.4375,
"learning_rate": 2.070504239178938e-06,
"loss": 0.4457,
"step": 22100
},
{
"epoch": 2.701715954728003,
"grad_norm": 1791765.5,
"learning_rate": 1.9893716279258448e-06,
"loss": 0.4024,
"step": 22200
},
{
"epoch": 2.713885846415967,
"grad_norm": 307518.59375,
"learning_rate": 1.908239016672752e-06,
"loss": 0.3637,
"step": 22300
},
{
"epoch": 2.726055738103931,
"grad_norm": 1208860.25,
"learning_rate": 1.8271064054196585e-06,
"loss": 0.3669,
"step": 22400
},
{
"epoch": 2.738225629791895,
"grad_norm": 728501.125,
"learning_rate": 1.7459737941665653e-06,
"loss": 0.4139,
"step": 22500
},
{
"epoch": 2.750395521479859,
"grad_norm": 1411384.125,
"learning_rate": 1.6648411829134723e-06,
"loss": 0.4193,
"step": 22600
},
{
"epoch": 2.762565413167823,
"grad_norm": 672326.3125,
"learning_rate": 1.583708571660379e-06,
"loss": 0.4063,
"step": 22700
},
{
"epoch": 2.7747353048557866,
"grad_norm": 634662.1875,
"learning_rate": 1.502575960407286e-06,
"loss": 0.4427,
"step": 22800
},
{
"epoch": 2.786905196543751,
"grad_norm": 1894593.625,
"learning_rate": 1.4214433491541927e-06,
"loss": 0.427,
"step": 22900
},
{
"epoch": 2.7990750882317146,
"grad_norm": 1145494.5,
"learning_rate": 1.3403107379010993e-06,
"loss": 0.4209,
"step": 23000
},
{
"epoch": 2.8112449799196786,
"grad_norm": 661343.0625,
"learning_rate": 1.2591781266480063e-06,
"loss": 0.4445,
"step": 23100
},
{
"epoch": 2.8234148716076426,
"grad_norm": 380578.34375,
"learning_rate": 1.178045515394913e-06,
"loss": 0.4197,
"step": 23200
},
{
"epoch": 2.8355847632956066,
"grad_norm": 1426323.625,
"learning_rate": 1.0969129041418198e-06,
"loss": 0.4222,
"step": 23300
},
{
"epoch": 2.8477546549835706,
"grad_norm": 931245.875,
"learning_rate": 1.0157802928887266e-06,
"loss": 0.3771,
"step": 23400
},
{
"epoch": 2.8599245466715346,
"grad_norm": 725616.1875,
"learning_rate": 9.346476816356335e-07,
"loss": 0.4225,
"step": 23500
},
{
"epoch": 2.8720944383594986,
"grad_norm": 2420769.25,
"learning_rate": 8.535150703825404e-07,
"loss": 0.4444,
"step": 23600
},
{
"epoch": 2.8842643300474626,
"grad_norm": 716980.3125,
"learning_rate": 7.723824591294471e-07,
"loss": 0.3999,
"step": 23700
},
{
"epoch": 2.8964342217354266,
"grad_norm": 770046.9375,
"learning_rate": 6.91249847876354e-07,
"loss": 0.4085,
"step": 23800
},
{
"epoch": 2.9086041134233906,
"grad_norm": 723776.625,
"learning_rate": 6.101172366232608e-07,
"loss": 0.4564,
"step": 23900
},
{
"epoch": 2.9207740051113547,
"grad_norm": 1597413.125,
"learning_rate": 5.289846253701675e-07,
"loss": 0.3898,
"step": 24000
},
{
"epoch": 2.932943896799318,
"grad_norm": 1187440.875,
"learning_rate": 4.478520141170744e-07,
"loss": 0.4256,
"step": 24100
},
{
"epoch": 2.9451137884872827,
"grad_norm": 780778.4375,
"learning_rate": 3.667194028639812e-07,
"loss": 0.392,
"step": 24200
},
{
"epoch": 2.9572836801752462,
"grad_norm": 1588163.625,
"learning_rate": 2.85586791610888e-07,
"loss": 0.4521,
"step": 24300
},
{
"epoch": 2.9694535718632107,
"grad_norm": 766974.1875,
"learning_rate": 2.0445418035779482e-07,
"loss": 0.4102,
"step": 24400
},
{
"epoch": 2.9816234635511742,
"grad_norm": 393891.59375,
"learning_rate": 1.2332156910470163e-07,
"loss": 0.4191,
"step": 24500
},
{
"epoch": 2.9937933552391383,
"grad_norm": 637999.0,
"learning_rate": 4.218895785160846e-08,
"loss": 0.389,
"step": 24600
},
{
"epoch": 3.0,
"eval_loss": 0.7935315370559692,
"eval_runtime": 263.7684,
"eval_samples_per_second": 45.877,
"eval_steps_per_second": 2.87,
"step": 24651
},
{
"epoch": 3.0,
"step": 24651,
"total_flos": 7.72931723781289e+16,
"train_loss": 0.6282787045998813,
"train_runtime": 25250.2136,
"train_samples_per_second": 15.62,
"train_steps_per_second": 0.976
}
],
"logging_steps": 100,
"max_steps": 24651,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.72931723781289e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}