{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.999882294088221,
"eval_steps": 500,
"global_step": 67964,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005885295588970956,
"grad_norm": 9.0,
"learning_rate": 2.9411764705882354e-05,
"loss": 0.6747,
"step": 100
},
{
"epoch": 0.011770591177941912,
"grad_norm": 15.9375,
"learning_rate": 5.882352941176471e-05,
"loss": 0.6583,
"step": 200
},
{
"epoch": 0.01765588676691287,
"grad_norm": 11.375,
"learning_rate": 8.823529411764706e-05,
"loss": 0.6365,
"step": 300
},
{
"epoch": 0.023541182355883823,
"grad_norm": 16.5,
"learning_rate": 0.00011764705882352942,
"loss": 0.6143,
"step": 400
},
{
"epoch": 0.02942647794485478,
"grad_norm": 10.4375,
"learning_rate": 0.00014705882352941178,
"loss": 0.617,
"step": 500
},
{
"epoch": 0.03531177353382574,
"grad_norm": 34.25,
"learning_rate": 0.00017647058823529413,
"loss": 0.6062,
"step": 600
},
{
"epoch": 0.04119706912279669,
"grad_norm": 7.09375,
"learning_rate": 0.00019999995639803067,
"loss": 0.594,
"step": 700
},
{
"epoch": 0.047082364711767646,
"grad_norm": 26.125,
"learning_rate": 0.00019999843033309612,
"loss": 0.6075,
"step": 800
},
{
"epoch": 0.05296766030073861,
"grad_norm": 10.1875,
"learning_rate": 0.00019999472420771699,
"loss": 0.5744,
"step": 900
},
{
"epoch": 0.05885295588970956,
"grad_norm": 9.9375,
"learning_rate": 0.00019998883810269034,
"loss": 0.553,
"step": 1000
},
{
"epoch": 0.06473825147868052,
"grad_norm": 24.0,
"learning_rate": 0.00019998077214633883,
"loss": 0.5583,
"step": 1100
},
{
"epoch": 0.07062354706765148,
"grad_norm": 7.15625,
"learning_rate": 0.00019997052651450793,
"loss": 0.5831,
"step": 1200
},
{
"epoch": 0.07650884265662243,
"grad_norm": 10.1875,
"learning_rate": 0.00019995810143056216,
"loss": 0.5741,
"step": 1300
},
{
"epoch": 0.08239413824559338,
"grad_norm": 11.375,
"learning_rate": 0.00019994349716538005,
"loss": 0.5671,
"step": 1400
},
{
"epoch": 0.08827943383456434,
"grad_norm": 8.0625,
"learning_rate": 0.00019992671403734846,
"loss": 0.5546,
"step": 1500
},
{
"epoch": 0.09416472942353529,
"grad_norm": 25.0,
"learning_rate": 0.00019990775241235544,
"loss": 0.572,
"step": 1600
},
{
"epoch": 0.10005002501250625,
"grad_norm": 22.5,
"learning_rate": 0.00019988661270378238,
"loss": 0.5811,
"step": 1700
},
{
"epoch": 0.10593532060147721,
"grad_norm": 24.5,
"learning_rate": 0.0001998632953724949,
"loss": 0.5737,
"step": 1800
},
{
"epoch": 0.11182061619044817,
"grad_norm": 6.125,
"learning_rate": 0.00019983780092683296,
"loss": 0.5448,
"step": 1900
},
{
"epoch": 0.11770591177941912,
"grad_norm": 7.46875,
"learning_rate": 0.00019981012992259953,
"loss": 0.5646,
"step": 2000
},
{
"epoch": 0.12359120736839008,
"grad_norm": 18.5,
"learning_rate": 0.00019978028296304876,
"loss": 0.5626,
"step": 2100
},
{
"epoch": 0.12947650295736104,
"grad_norm": 8.5625,
"learning_rate": 0.0001997482606988726,
"loss": 0.5608,
"step": 2200
},
{
"epoch": 0.13536179854633199,
"grad_norm": 22.0,
"learning_rate": 0.00019971406382818672,
"loss": 0.5623,
"step": 2300
},
{
"epoch": 0.14124709413530295,
"grad_norm": 31.875,
"learning_rate": 0.00019967769309651529,
"loss": 0.5606,
"step": 2400
},
{
"epoch": 0.1471323897242739,
"grad_norm": 9.5625,
"learning_rate": 0.00019963914929677467,
"loss": 0.5523,
"step": 2500
},
{
"epoch": 0.15301768531324486,
"grad_norm": 10.125,
"learning_rate": 0.0001995984332692562,
"loss": 0.5691,
"step": 2600
},
{
"epoch": 0.1589029809022158,
"grad_norm": 17.25,
"learning_rate": 0.00019955554590160782,
"loss": 0.5424,
"step": 2700
},
{
"epoch": 0.16478827649118677,
"grad_norm": 9.9375,
"learning_rate": 0.0001995104881288147,
"loss": 0.548,
"step": 2800
},
{
"epoch": 0.17067357208015774,
"grad_norm": 6.84375,
"learning_rate": 0.00019946326093317902,
"loss": 0.5425,
"step": 2900
},
{
"epoch": 0.17655886766912868,
"grad_norm": 10.9375,
"learning_rate": 0.0001994138653442983,
"loss": 0.5699,
"step": 3000
},
{
"epoch": 0.18244416325809965,
"grad_norm": 17.125,
"learning_rate": 0.00019936230243904315,
"loss": 0.5564,
"step": 3100
},
{
"epoch": 0.18832945884707059,
"grad_norm": 12.75,
"learning_rate": 0.0001993085733415337,
"loss": 0.5614,
"step": 3200
},
{
"epoch": 0.19421475443604155,
"grad_norm": 20.0,
"learning_rate": 0.0001992526792231152,
"loss": 0.5386,
"step": 3300
},
{
"epoch": 0.2001000500250125,
"grad_norm": 10.9375,
"learning_rate": 0.00019919462130233226,
"loss": 0.5378,
"step": 3400
},
{
"epoch": 0.20598534561398346,
"grad_norm": 8.875,
"learning_rate": 0.00019913440084490255,
"loss": 0.5493,
"step": 3500
},
{
"epoch": 0.21187064120295443,
"grad_norm": 27.25,
"learning_rate": 0.00019907201916368906,
"loss": 0.543,
"step": 3600
},
{
"epoch": 0.21775593679192537,
"grad_norm": 6.46875,
"learning_rate": 0.0001990074776186715,
"loss": 0.5458,
"step": 3700
},
{
"epoch": 0.22364123238089634,
"grad_norm": 8.75,
"learning_rate": 0.00019894077761691662,
"loss": 0.55,
"step": 3800
},
{
"epoch": 0.22952652796986728,
"grad_norm": 8.3125,
"learning_rate": 0.0001988719206125476,
"loss": 0.536,
"step": 3900
},
{
"epoch": 0.23541182355883825,
"grad_norm": 5.4375,
"learning_rate": 0.00019880090810671237,
"loss": 0.5348,
"step": 4000
},
{
"epoch": 0.24129711914780919,
"grad_norm": 31.5,
"learning_rate": 0.00019872774164755072,
"loss": 0.5406,
"step": 4100
},
{
"epoch": 0.24718241473678015,
"grad_norm": 11.5625,
"learning_rate": 0.00019865242283016076,
"loss": 0.5374,
"step": 4200
},
{
"epoch": 0.2530677103257511,
"grad_norm": 24.375,
"learning_rate": 0.00019857495329656398,
"loss": 0.5356,
"step": 4300
},
{
"epoch": 0.2589530059147221,
"grad_norm": 9.5,
"learning_rate": 0.00019849533473566955,
"loss": 0.5555,
"step": 4400
},
{
"epoch": 0.26483830150369303,
"grad_norm": 14.5625,
"learning_rate": 0.00019841356888323749,
"loss": 0.5165,
"step": 4500
},
{
"epoch": 0.27072359709266397,
"grad_norm": 20.5,
"learning_rate": 0.00019832965752184084,
"loss": 0.5487,
"step": 4600
},
{
"epoch": 0.2766088926816349,
"grad_norm": 5.3125,
"learning_rate": 0.0001982436024808266,
"loss": 0.5347,
"step": 4700
},
{
"epoch": 0.2824941882706059,
"grad_norm": 5.96875,
"learning_rate": 0.00019815540563627616,
"loss": 0.5398,
"step": 4800
},
{
"epoch": 0.28837948385957685,
"grad_norm": 7.0625,
"learning_rate": 0.0001980650689109643,
"loss": 0.5359,
"step": 4900
},
{
"epoch": 0.2942647794485478,
"grad_norm": 26.125,
"learning_rate": 0.00019797259427431705,
"loss": 0.5547,
"step": 5000
},
{
"epoch": 0.3001500750375188,
"grad_norm": 18.75,
"learning_rate": 0.0001978779837423691,
"loss": 0.5648,
"step": 5100
},
{
"epoch": 0.3060353706264897,
"grad_norm": 13.0,
"learning_rate": 0.00019778123937771953,
"loss": 0.5182,
"step": 5200
},
{
"epoch": 0.31192066621546066,
"grad_norm": 22.375,
"learning_rate": 0.00019768236328948717,
"loss": 0.5277,
"step": 5300
},
{
"epoch": 0.3178059618044316,
"grad_norm": 44.5,
"learning_rate": 0.00019758135763326426,
"loss": 0.5348,
"step": 5400
},
{
"epoch": 0.3236912573934026,
"grad_norm": 12.8125,
"learning_rate": 0.0001974782246110698,
"loss": 0.5295,
"step": 5500
},
{
"epoch": 0.32957655298237354,
"grad_norm": 6.21875,
"learning_rate": 0.00019737296647130123,
"loss": 0.5472,
"step": 5600
},
{
"epoch": 0.3354618485713445,
"grad_norm": 5.25,
"learning_rate": 0.00019726558550868571,
"loss": 0.5379,
"step": 5700
},
{
"epoch": 0.3413471441603155,
"grad_norm": 8.4375,
"learning_rate": 0.00019715608406422984,
"loss": 0.5282,
"step": 5800
},
{
"epoch": 0.3472324397492864,
"grad_norm": 7.25,
"learning_rate": 0.00019704446452516874,
"loss": 0.5334,
"step": 5900
},
{
"epoch": 0.35311773533825735,
"grad_norm": 12.4375,
"learning_rate": 0.00019693072932491405,
"loss": 0.5487,
"step": 6000
},
{
"epoch": 0.3590030309272283,
"grad_norm": 29.25,
"learning_rate": 0.00019681488094300083,
"loss": 0.5688,
"step": 6100
},
{
"epoch": 0.3648883265161993,
"grad_norm": 11.625,
"learning_rate": 0.00019669692190503343,
"loss": 0.5565,
"step": 6200
},
{
"epoch": 0.37077362210517023,
"grad_norm": 17.125,
"learning_rate": 0.0001965768547826306,
"loss": 0.5318,
"step": 6300
},
{
"epoch": 0.37665891769414117,
"grad_norm": 9.125,
"learning_rate": 0.00019645468219336922,
"loss": 0.5443,
"step": 6400
},
{
"epoch": 0.38254421328311217,
"grad_norm": 9.5625,
"learning_rate": 0.0001963304068007274,
"loss": 0.5574,
"step": 6500
},
{
"epoch": 0.3884295088720831,
"grad_norm": 13.5,
"learning_rate": 0.00019620403131402633,
"loss": 0.5513,
"step": 6600
},
{
"epoch": 0.39431480446105405,
"grad_norm": 27.125,
"learning_rate": 0.00019607555848837128,
"loss": 0.5087,
"step": 6700
},
{
"epoch": 0.400200100050025,
"grad_norm": 11.4375,
"learning_rate": 0.00019594499112459148,
"loss": 0.5271,
"step": 6800
},
{
"epoch": 0.406085395638996,
"grad_norm": 7.65625,
"learning_rate": 0.00019581233206917903,
"loss": 0.5398,
"step": 6900
},
{
"epoch": 0.4119706912279669,
"grad_norm": 11.1875,
"learning_rate": 0.00019567758421422694,
"loss": 0.5233,
"step": 7000
},
{
"epoch": 0.41785598681693786,
"grad_norm": 13.0625,
"learning_rate": 0.000195540750497366,
"loss": 0.5258,
"step": 7100
},
{
"epoch": 0.42374128240590886,
"grad_norm": 10.625,
"learning_rate": 0.00019540183390170075,
"loss": 0.5381,
"step": 7200
},
{
"epoch": 0.4296265779948798,
"grad_norm": 9.3125,
"learning_rate": 0.00019526083745574453,
"loss": 0.5478,
"step": 7300
},
{
"epoch": 0.43551187358385074,
"grad_norm": 44.0,
"learning_rate": 0.00019511776423335327,
"loss": 0.5132,
"step": 7400
},
{
"epoch": 0.4413971691728217,
"grad_norm": 32.75,
"learning_rate": 0.00019497261735365872,
"loss": 0.5271,
"step": 7500
},
{
"epoch": 0.4472824647617927,
"grad_norm": 12.125,
"learning_rate": 0.00019482539998100023,
"loss": 0.5463,
"step": 7600
},
{
"epoch": 0.4531677603507636,
"grad_norm": 18.125,
"learning_rate": 0.00019467611532485588,
"loss": 0.5315,
"step": 7700
},
{
"epoch": 0.45905305593973456,
"grad_norm": 11.3125,
"learning_rate": 0.00019452476663977248,
"loss": 0.5388,
"step": 7800
},
{
"epoch": 0.46493835152870555,
"grad_norm": 21.75,
"learning_rate": 0.00019437135722529471,
"loss": 0.5212,
"step": 7900
},
{
"epoch": 0.4708236471176765,
"grad_norm": 19.625,
"learning_rate": 0.00019421589042589295,
"loss": 0.5573,
"step": 8000
},
{
"epoch": 0.47670894270664743,
"grad_norm": 19.0,
"learning_rate": 0.00019405836963089066,
"loss": 0.5358,
"step": 8100
},
{
"epoch": 0.48259423829561837,
"grad_norm": 31.375,
"learning_rate": 0.00019389879827439024,
"loss": 0.5375,
"step": 8200
},
{
"epoch": 0.48847953388458937,
"grad_norm": 17.375,
"learning_rate": 0.00019373717983519833,
"loss": 0.5458,
"step": 8300
},
{
"epoch": 0.4943648294735603,
"grad_norm": 19.75,
"learning_rate": 0.00019357351783674996,
"loss": 0.5391,
"step": 8400
},
{
"epoch": 0.5002501250625313,
"grad_norm": 13.625,
"learning_rate": 0.00019340781584703155,
"loss": 0.5328,
"step": 8500
},
{
"epoch": 0.5061354206515022,
"grad_norm": 28.5,
"learning_rate": 0.00019324007747850334,
"loss": 0.5214,
"step": 8600
},
{
"epoch": 0.5120207162404732,
"grad_norm": 12.875,
"learning_rate": 0.0001930703063880206,
"loss": 0.5446,
"step": 8700
},
{
"epoch": 0.5179060118294442,
"grad_norm": 18.5,
"learning_rate": 0.00019289850627675378,
"loss": 0.5198,
"step": 8800
},
{
"epoch": 0.5237913074184151,
"grad_norm": 15.375,
"learning_rate": 0.000192724680890108,
"loss": 0.5411,
"step": 8900
},
{
"epoch": 0.5296766030073861,
"grad_norm": 15.8125,
"learning_rate": 0.00019254883401764115,
"loss": 0.529,
"step": 9000
},
{
"epoch": 0.5355618985963571,
"grad_norm": 14.8125,
"learning_rate": 0.00019237096949298156,
"loss": 0.5224,
"step": 9100
},
{
"epoch": 0.5414471941853279,
"grad_norm": 19.875,
"learning_rate": 0.00019219109119374426,
"loss": 0.5383,
"step": 9200
},
{
"epoch": 0.5473324897742989,
"grad_norm": 5.59375,
"learning_rate": 0.0001920092030414464,
"loss": 0.5381,
"step": 9300
},
{
"epoch": 0.5532177853632698,
"grad_norm": 5.5,
"learning_rate": 0.00019182530900142198,
"loss": 0.5447,
"step": 9400
},
{
"epoch": 0.5591030809522408,
"grad_norm": 4.5,
"learning_rate": 0.00019163941308273502,
"loss": 0.5341,
"step": 9500
},
{
"epoch": 0.5649883765412118,
"grad_norm": 5.59375,
"learning_rate": 0.00019145151933809264,
"loss": 0.5411,
"step": 9600
},
{
"epoch": 0.5708736721301827,
"grad_norm": 33.25,
"learning_rate": 0.00019126163186375633,
"loss": 0.5389,
"step": 9700
},
{
"epoch": 0.5767589677191537,
"grad_norm": 16.875,
"learning_rate": 0.0001910697547994527,
"loss": 0.5181,
"step": 9800
},
{
"epoch": 0.5826442633081247,
"grad_norm": 9.625,
"learning_rate": 0.0001908758923282835,
"loss": 0.5404,
"step": 9900
},
{
"epoch": 0.5885295588970956,
"grad_norm": 6.125,
"learning_rate": 0.00019068004867663408,
"loss": 0.543,
"step": 10000
},
{
"epoch": 0.5944148544860666,
"grad_norm": 21.0,
"learning_rate": 0.00019048222811408137,
"loss": 0.541,
"step": 10100
},
{
"epoch": 0.6003001500750376,
"grad_norm": 15.875,
"learning_rate": 0.00019028243495330103,
"loss": 0.5135,
"step": 10200
},
{
"epoch": 0.6061854456640084,
"grad_norm": 17.5,
"learning_rate": 0.00019008067354997298,
"loss": 0.5297,
"step": 10300
},
{
"epoch": 0.6120707412529794,
"grad_norm": 6.125,
"learning_rate": 0.0001898769483026869,
"loss": 0.5354,
"step": 10400
},
{
"epoch": 0.6179560368419504,
"grad_norm": 22.375,
"learning_rate": 0.000189671263652846,
"loss": 0.5245,
"step": 10500
},
{
"epoch": 0.6238413324309213,
"grad_norm": 5.25,
"learning_rate": 0.00018946362408457036,
"loss": 0.5313,
"step": 10600
},
{
"epoch": 0.6297266280198923,
"grad_norm": 7.40625,
"learning_rate": 0.0001892540341245991,
"loss": 0.527,
"step": 10700
},
{
"epoch": 0.6356119236088632,
"grad_norm": 5.5,
"learning_rate": 0.0001890424983421918,
"loss": 0.53,
"step": 10800
},
{
"epoch": 0.6414972191978342,
"grad_norm": 6.59375,
"learning_rate": 0.00018882902134902872,
"loss": 0.5174,
"step": 10900
},
{
"epoch": 0.6473825147868052,
"grad_norm": 25.125,
"learning_rate": 0.00018861360779911048,
"loss": 0.5373,
"step": 11000
},
{
"epoch": 0.6532678103757761,
"grad_norm": 9.25,
"learning_rate": 0.00018839626238865628,
"loss": 0.5373,
"step": 11100
},
{
"epoch": 0.6591531059647471,
"grad_norm": 9.3125,
"learning_rate": 0.00018817698985600193,
"loss": 0.5436,
"step": 11200
},
{
"epoch": 0.6650384015537181,
"grad_norm": 14.875,
"learning_rate": 0.00018795579498149612,
"loss": 0.5331,
"step": 11300
},
{
"epoch": 0.670923697142689,
"grad_norm": 4.875,
"learning_rate": 0.00018773268258739654,
"loss": 0.5337,
"step": 11400
},
{
"epoch": 0.67680899273166,
"grad_norm": 27.625,
"learning_rate": 0.0001875076575377646,
"loss": 0.5097,
"step": 11500
},
{
"epoch": 0.682694288320631,
"grad_norm": 10.0625,
"learning_rate": 0.00018728072473835942,
"loss": 0.5335,
"step": 11600
},
{
"epoch": 0.6885795839096018,
"grad_norm": 12.625,
"learning_rate": 0.00018705188913653082,
"loss": 0.5152,
"step": 11700
},
{
"epoch": 0.6944648794985728,
"grad_norm": 22.25,
"learning_rate": 0.00018682115572111156,
"loss": 0.525,
"step": 11800
},
{
"epoch": 0.7003501750875438,
"grad_norm": 11.75,
"learning_rate": 0.00018658852952230853,
"loss": 0.5222,
"step": 11900
},
{
"epoch": 0.7062354706765147,
"grad_norm": 5.125,
"learning_rate": 0.00018635401561159306,
"loss": 0.5197,
"step": 12000
},
{
"epoch": 0.7121207662654857,
"grad_norm": 15.375,
"learning_rate": 0.0001861176191015904,
"loss": 0.5207,
"step": 12100
},
{
"epoch": 0.7180060618544566,
"grad_norm": 5.96875,
"learning_rate": 0.00018587934514596824,
"loss": 0.5436,
"step": 12200
},
{
"epoch": 0.7238913574434276,
"grad_norm": 31.875,
"learning_rate": 0.00018563919893932443,
"loss": 0.5142,
"step": 12300
},
{
"epoch": 0.7297766530323986,
"grad_norm": 10.625,
"learning_rate": 0.0001853971857170736,
"loss": 0.5215,
"step": 12400
},
{
"epoch": 0.7356619486213695,
"grad_norm": 15.6875,
"learning_rate": 0.00018515331075533303,
"loss": 0.5603,
"step": 12500
},
{
"epoch": 0.7415472442103405,
"grad_norm": 12.375,
"learning_rate": 0.0001849075793708078,
"loss": 0.5134,
"step": 12600
},
{
"epoch": 0.7474325397993115,
"grad_norm": 7.3125,
"learning_rate": 0.00018465999692067472,
"loss": 0.5178,
"step": 12700
},
{
"epoch": 0.7533178353882823,
"grad_norm": 6.0625,
"learning_rate": 0.00018441056880246555,
"loss": 0.5182,
"step": 12800
},
{
"epoch": 0.7592031309772533,
"grad_norm": 15.625,
"learning_rate": 0.00018415930045394944,
"loss": 0.5231,
"step": 12900
},
{
"epoch": 0.7650884265662243,
"grad_norm": 7.65625,
"learning_rate": 0.00018390619735301418,
"loss": 0.5019,
"step": 13000
},
{
"epoch": 0.7709737221551952,
"grad_norm": 7.0625,
"learning_rate": 0.000183651265017547,
"loss": 0.5298,
"step": 13100
},
{
"epoch": 0.7768590177441662,
"grad_norm": 8.0625,
"learning_rate": 0.00018339450900531413,
"loss": 0.5156,
"step": 13200
},
{
"epoch": 0.7827443133331372,
"grad_norm": 23.875,
"learning_rate": 0.00018313593491383975,
"loss": 0.5479,
"step": 13300
},
{
"epoch": 0.7886296089221081,
"grad_norm": 12.3125,
"learning_rate": 0.00018287554838028377,
"loss": 0.5341,
"step": 13400
},
{
"epoch": 0.7945149045110791,
"grad_norm": 12.4375,
"learning_rate": 0.00018261335508131912,
"loss": 0.5373,
"step": 13500
},
{
"epoch": 0.80040020010005,
"grad_norm": 24.0,
"learning_rate": 0.00018234936073300797,
"loss": 0.5329,
"step": 13600
},
{
"epoch": 0.806285495689021,
"grad_norm": 11.8125,
"learning_rate": 0.00018208357109067698,
"loss": 0.5316,
"step": 13700
},
{
"epoch": 0.812170791277992,
"grad_norm": 16.875,
"learning_rate": 0.00018181599194879198,
"loss": 0.5425,
"step": 13800
},
{
"epoch": 0.8180560868669629,
"grad_norm": 17.375,
"learning_rate": 0.00018154662914083157,
"loss": 0.5318,
"step": 13900
},
{
"epoch": 0.8239413824559338,
"grad_norm": 8.5625,
"learning_rate": 0.0001812754885391599,
"loss": 0.5286,
"step": 14000
},
{
"epoch": 0.8298266780449048,
"grad_norm": 21.625,
"learning_rate": 0.00018100257605489884,
"loss": 0.5256,
"step": 14100
},
{
"epoch": 0.8357119736338757,
"grad_norm": 15.0,
"learning_rate": 0.00018072789763779888,
"loss": 0.5261,
"step": 14200
},
{
"epoch": 0.8415972692228467,
"grad_norm": 9.5,
"learning_rate": 0.0001804514592761095,
"loss": 0.5353,
"step": 14300
},
{
"epoch": 0.8474825648118177,
"grad_norm": 16.125,
"learning_rate": 0.0001801732669964487,
"loss": 0.5156,
"step": 14400
},
{
"epoch": 0.8533678604007886,
"grad_norm": 8.3125,
"learning_rate": 0.00017989332686367155,
"loss": 0.5343,
"step": 14500
},
{
"epoch": 0.8592531559897596,
"grad_norm": 27.125,
"learning_rate": 0.0001796116449807379,
"loss": 0.5218,
"step": 14600
},
{
"epoch": 0.8651384515787306,
"grad_norm": 20.125,
"learning_rate": 0.00017932822748857946,
"loss": 0.5111,
"step": 14700
},
{
"epoch": 0.8710237471677015,
"grad_norm": 18.625,
"learning_rate": 0.0001790430805659659,
"loss": 0.5327,
"step": 14800
},
{
"epoch": 0.8769090427566725,
"grad_norm": 7.0625,
"learning_rate": 0.00017875621042937002,
"loss": 0.5096,
"step": 14900
},
{
"epoch": 0.8827943383456434,
"grad_norm": 10.25,
"learning_rate": 0.0001784676233328324,
"loss": 0.5091,
"step": 15000
},
{
"epoch": 0.8886796339346144,
"grad_norm": 18.5,
"learning_rate": 0.0001781773255678249,
"loss": 0.5177,
"step": 15100
},
{
"epoch": 0.8945649295235854,
"grad_norm": 8.5,
"learning_rate": 0.00017788532346311366,
"loss": 0.5353,
"step": 15200
},
{
"epoch": 0.9004502251125562,
"grad_norm": 31.0,
"learning_rate": 0.00017759162338462092,
"loss": 0.5387,
"step": 15300
},
{
"epoch": 0.9063355207015272,
"grad_norm": 16.5,
"learning_rate": 0.00017729623173528641,
"loss": 0.5059,
"step": 15400
},
{
"epoch": 0.9122208162904982,
"grad_norm": 8.0625,
"learning_rate": 0.00017699915495492783,
"loss": 0.5403,
"step": 15500
},
{
"epoch": 0.9181061118794691,
"grad_norm": 16.625,
"learning_rate": 0.0001767003995201001,
"loss": 0.5228,
"step": 15600
},
{
"epoch": 0.9239914074684401,
"grad_norm": 5.71875,
"learning_rate": 0.00017639997194395456,
"loss": 0.5305,
"step": 15700
},
{
"epoch": 0.9298767030574111,
"grad_norm": 15.5625,
"learning_rate": 0.0001760978787760968,
"loss": 0.5179,
"step": 15800
},
{
"epoch": 0.935761998646382,
"grad_norm": 6.96875,
"learning_rate": 0.00017579412660244378,
"loss": 0.5253,
"step": 15900
},
{
"epoch": 0.941647294235353,
"grad_norm": 7.46875,
"learning_rate": 0.0001754887220450805,
"loss": 0.5034,
"step": 16000
},
{
"epoch": 0.947532589824324,
"grad_norm": 13.125,
"learning_rate": 0.00017518167176211542,
"loss": 0.4989,
"step": 16100
},
{
"epoch": 0.9534178854132949,
"grad_norm": 17.0,
"learning_rate": 0.00017487298244753534,
"loss": 0.5341,
"step": 16200
},
{
"epoch": 0.9593031810022659,
"grad_norm": 14.6875,
"learning_rate": 0.00017456266083105956,
"loss": 0.4969,
"step": 16300
},
{
"epoch": 0.9651884765912367,
"grad_norm": 7.46875,
"learning_rate": 0.00017425071367799307,
"loss": 0.5237,
"step": 16400
},
{
"epoch": 0.9710737721802077,
"grad_norm": 16.375,
"learning_rate": 0.00017393714778907914,
"loss": 0.5359,
"step": 16500
},
{
"epoch": 0.9769590677691787,
"grad_norm": 8.8125,
"learning_rate": 0.00017362197000035093,
"loss": 0.5218,
"step": 16600
},
{
"epoch": 0.9828443633581496,
"grad_norm": 20.25,
"learning_rate": 0.00017330518718298264,
"loss": 0.5275,
"step": 16700
},
{
"epoch": 0.9887296589471206,
"grad_norm": 9.5625,
"learning_rate": 0.00017298680624313958,
"loss": 0.5268,
"step": 16800
},
{
"epoch": 0.9946149545360916,
"grad_norm": 16.375,
"learning_rate": 0.0001726668341218276,
"loss": 0.5311,
"step": 16900
},
{
"epoch": 1.0005002501250626,
"grad_norm": 9.75,
"learning_rate": 0.00017234527779474184,
"loss": 0.5364,
"step": 17000
},
{
"epoch": 1.0063855457140334,
"grad_norm": 6.53125,
"learning_rate": 0.00017202214427211468,
"loss": 0.5141,
"step": 17100
},
{
"epoch": 1.0122708413030044,
"grad_norm": 10.625,
"learning_rate": 0.0001716974405985628,
"loss": 0.5321,
"step": 17200
},
{
"epoch": 1.0181561368919754,
"grad_norm": 8.75,
"learning_rate": 0.0001713711738529336,
"loss": 0.5292,
"step": 17300
},
{
"epoch": 1.0240414324809464,
"grad_norm": 9.0,
"learning_rate": 0.00017104335114815104,
"loss": 0.5249,
"step": 17400
},
{
"epoch": 1.0299267280699174,
"grad_norm": 20.0,
"learning_rate": 0.00017071397963106045,
"loss": 0.5342,
"step": 17500
},
{
"epoch": 1.0358120236588884,
"grad_norm": 36.0,
"learning_rate": 0.00017038306648227262,
"loss": 0.481,
"step": 17600
},
{
"epoch": 1.0416973192478591,
"grad_norm": 23.0,
"learning_rate": 0.00017005061891600751,
"loss": 0.5246,
"step": 17700
},
{
"epoch": 1.0475826148368301,
"grad_norm": 8.4375,
"learning_rate": 0.00016971664417993676,
"loss": 0.5121,
"step": 17800
},
{
"epoch": 1.0534679104258011,
"grad_norm": 18.375,
"learning_rate": 0.00016938114955502578,
"loss": 0.518,
"step": 17900
},
{
"epoch": 1.0593532060147721,
"grad_norm": 20.875,
"learning_rate": 0.00016904414235537497,
"loss": 0.5402,
"step": 18000
},
{
"epoch": 1.0652385016037431,
"grad_norm": 10.9375,
"learning_rate": 0.00016870562992806035,
"loss": 0.5306,
"step": 18100
},
{
"epoch": 1.0711237971927141,
"grad_norm": 10.625,
"learning_rate": 0.00016836561965297324,
"loss": 0.5452,
"step": 18200
},
{
"epoch": 1.0770090927816849,
"grad_norm": 24.0,
"learning_rate": 0.00016802411894265953,
"loss": 0.5258,
"step": 18300
},
{
"epoch": 1.0828943883706559,
"grad_norm": 24.0,
"learning_rate": 0.00016768113524215798,
"loss": 0.4995,
"step": 18400
},
{
"epoch": 1.0887796839596269,
"grad_norm": 22.75,
"learning_rate": 0.00016733667602883797,
"loss": 0.4998,
"step": 18500
},
{
"epoch": 1.0946649795485979,
"grad_norm": 12.3125,
"learning_rate": 0.00016699074881223636,
"loss": 0.5308,
"step": 18600
},
{
"epoch": 1.1005502751375689,
"grad_norm": 23.625,
"learning_rate": 0.000166643361133894,
"loss": 0.516,
"step": 18700
},
{
"epoch": 1.1064355707265396,
"grad_norm": 9.8125,
"learning_rate": 0.00016629452056719118,
"loss": 0.5127,
"step": 18800
},
{
"epoch": 1.1123208663155106,
"grad_norm": 21.5,
"learning_rate": 0.00016594423471718236,
"loss": 0.5072,
"step": 18900
},
{
"epoch": 1.1182061619044816,
"grad_norm": 10.0,
"learning_rate": 0.0001655925112204308,
"loss": 0.536,
"step": 19000
},
{
"epoch": 1.1240914574934526,
"grad_norm": 7.03125,
"learning_rate": 0.00016523935774484158,
"loss": 0.5184,
"step": 19100
},
{
"epoch": 1.1299767530824236,
"grad_norm": 16.5,
"learning_rate": 0.00016488478198949485,
"loss": 0.5186,
"step": 19200
},
{
"epoch": 1.1358620486713944,
"grad_norm": 20.625,
"learning_rate": 0.0001645287916844777,
"loss": 0.5418,
"step": 19300
},
{
"epoch": 1.1417473442603654,
"grad_norm": 5.46875,
"learning_rate": 0.00016417139459071577,
"loss": 0.5054,
"step": 19400
},
{
"epoch": 1.1476326398493364,
"grad_norm": 9.125,
"learning_rate": 0.00016381259849980405,
"loss": 0.4923,
"step": 19500
},
{
"epoch": 1.1535179354383074,
"grad_norm": 6.8125,
"learning_rate": 0.000163452411233837,
"loss": 0.5182,
"step": 19600
},
{
"epoch": 1.1594032310272784,
"grad_norm": 6.1875,
"learning_rate": 0.00016309084064523792,
"loss": 0.5142,
"step": 19700
},
{
"epoch": 1.1652885266162494,
"grad_norm": 10.5625,
"learning_rate": 0.000162727894616588,
"loss": 0.5055,
"step": 19800
},
{
"epoch": 1.1711738222052204,
"grad_norm": 5.4375,
"learning_rate": 0.0001623635810604542,
"loss": 0.5187,
"step": 19900
},
{
"epoch": 1.1770591177941911,
"grad_norm": 27.875,
"learning_rate": 0.00016199790791921693,
"loss": 0.4999,
"step": 20000
},
{
"epoch": 1.1829444133831621,
"grad_norm": 23.0,
"learning_rate": 0.00016163088316489683,
"loss": 0.5208,
"step": 20100
},
{
"epoch": 1.1888297089721331,
"grad_norm": 18.25,
"learning_rate": 0.00016126251479898097,
"loss": 0.5397,
"step": 20200
},
{
"epoch": 1.1947150045611041,
"grad_norm": 8.25,
"learning_rate": 0.0001608928108522485,
"loss": 0.5105,
"step": 20300
},
{
"epoch": 1.2006003001500751,
"grad_norm": 11.75,
"learning_rate": 0.00016052177938459539,
"loss": 0.5218,
"step": 20400
},
{
"epoch": 1.206485595739046,
"grad_norm": 7.71875,
"learning_rate": 0.00016014942848485887,
"loss": 0.5323,
"step": 20500
},
{
"epoch": 1.212370891328017,
"grad_norm": 8.375,
"learning_rate": 0.0001597757662706411,
"loss": 0.5348,
"step": 20600
},
{
"epoch": 1.218256186916988,
"grad_norm": 6.65625,
"learning_rate": 0.00015940080088813193,
"loss": 0.5107,
"step": 20700
},
{
"epoch": 1.2241414825059589,
"grad_norm": 8.25,
"learning_rate": 0.00015902454051193183,
"loss": 0.5125,
"step": 20800
},
{
"epoch": 1.2300267780949299,
"grad_norm": 12.875,
"learning_rate": 0.0001586469933448731,
"loss": 0.5284,
"step": 20900
},
{
"epoch": 1.2359120736839007,
"grad_norm": 14.8125,
"learning_rate": 0.00015826816761784138,
"loss": 0.5262,
"step": 21000
},
{
"epoch": 1.2417973692728717,
"grad_norm": 13.625,
"learning_rate": 0.0001578880715895962,
"loss": 0.5188,
"step": 21100
},
{
"epoch": 1.2476826648618426,
"grad_norm": 19.375,
"learning_rate": 0.00015750671354659073,
"loss": 0.5328,
"step": 21200
},
{
"epoch": 1.2535679604508136,
"grad_norm": 14.0625,
"learning_rate": 0.00015712410180279132,
"loss": 0.5384,
"step": 21300
},
{
"epoch": 1.2594532560397846,
"grad_norm": 20.75,
"learning_rate": 0.0001567402446994962,
"loss": 0.5175,
"step": 21400
},
{
"epoch": 1.2653385516287556,
"grad_norm": 13.0,
"learning_rate": 0.0001563551506051536,
"loss": 0.5308,
"step": 21500
},
{
"epoch": 1.2712238472177266,
"grad_norm": 39.0,
"learning_rate": 0.00015596882791517932,
"loss": 0.5445,
"step": 21600
},
{
"epoch": 1.2771091428066974,
"grad_norm": 17.625,
"learning_rate": 0.00015558128505177373,
"loss": 0.5321,
"step": 21700
},
{
"epoch": 1.2829944383956684,
"grad_norm": 6.53125,
"learning_rate": 0.0001551925304637381,
"loss": 0.5123,
"step": 21800
},
{
"epoch": 1.2888797339846394,
"grad_norm": 11.625,
"learning_rate": 0.00015480257262629046,
"loss": 0.5374,
"step": 21900
},
{
"epoch": 1.2947650295736104,
"grad_norm": 7.34375,
"learning_rate": 0.00015441142004088082,
"loss": 0.5317,
"step": 22000
},
{
"epoch": 1.3006503251625814,
"grad_norm": 19.25,
"learning_rate": 0.00015401908123500587,
"loss": 0.5192,
"step": 22100
},
{
"epoch": 1.3065356207515522,
"grad_norm": 5.25,
"learning_rate": 0.00015362556476202294,
"loss": 0.5218,
"step": 22200
},
{
"epoch": 1.3124209163405232,
"grad_norm": 5.53125,
"learning_rate": 0.00015323087920096363,
"loss": 0.5554,
"step": 22300
},
{
"epoch": 1.3183062119294942,
"grad_norm": 12.9375,
"learning_rate": 0.00015283503315634687,
"loss": 0.5106,
"step": 22400
},
{
"epoch": 1.3241915075184651,
"grad_norm": 20.125,
"learning_rate": 0.00015243803525799115,
"loss": 0.5166,
"step": 22500
},
{
"epoch": 1.3300768031074361,
"grad_norm": 15.4375,
"learning_rate": 0.00015203989416082643,
"loss": 0.5285,
"step": 22600
},
{
"epoch": 1.335962098696407,
"grad_norm": 29.25,
"learning_rate": 0.00015164061854470556,
"loss": 0.5226,
"step": 22700
},
{
"epoch": 1.341847394285378,
"grad_norm": 8.375,
"learning_rate": 0.0001512402171142149,
"loss": 0.5403,
"step": 22800
},
{
"epoch": 1.347732689874349,
"grad_norm": 8.875,
"learning_rate": 0.00015083869859848473,
"loss": 0.5459,
"step": 22900
},
{
"epoch": 1.35361798546332,
"grad_norm": 19.5,
"learning_rate": 0.00015043607175099877,
"loss": 0.5232,
"step": 23000
},
{
"epoch": 1.359503281052291,
"grad_norm": 6.84375,
"learning_rate": 0.00015003234534940343,
"loss": 0.5384,
"step": 23100
},
{
"epoch": 1.3653885766412617,
"grad_norm": 11.875,
"learning_rate": 0.00014962752819531647,
"loss": 0.5146,
"step": 23200
},
{
"epoch": 1.371273872230233,
"grad_norm": 10.4375,
"learning_rate": 0.00014922162911413505,
"loss": 0.5263,
"step": 23300
},
{
"epoch": 1.3771591678192037,
"grad_norm": 6.75,
"learning_rate": 0.00014881465695484338,
"loss": 0.5244,
"step": 23400
},
{
"epoch": 1.3830444634081747,
"grad_norm": 12.8125,
"learning_rate": 0.0001484066205898198,
"loss": 0.5228,
"step": 23500
},
{
"epoch": 1.3889297589971457,
"grad_norm": 5.78125,
"learning_rate": 0.0001479975289146434,
"loss": 0.5346,
"step": 23600
},
{
"epoch": 1.3948150545861167,
"grad_norm": 21.5,
"learning_rate": 0.00014758739084789983,
"loss": 0.5081,
"step": 23700
},
{
"epoch": 1.4007003501750876,
"grad_norm": 19.875,
"learning_rate": 0.0001471762153309873,
"loss": 0.5265,
"step": 23800
},
{
"epoch": 1.4065856457640584,
"grad_norm": 6.65625,
"learning_rate": 0.00014676401132792131,
"loss": 0.5238,
"step": 23900
},
{
"epoch": 1.4124709413530294,
"grad_norm": 7.96875,
"learning_rate": 0.00014635078782513928,
"loss": 0.5243,
"step": 24000
},
{
"epoch": 1.4183562369420004,
"grad_norm": 18.375,
"learning_rate": 0.0001459365538313048,
"loss": 0.519,
"step": 24100
},
{
"epoch": 1.4242415325309714,
"grad_norm": 7.53125,
"learning_rate": 0.00014552131837711107,
"loss": 0.5035,
"step": 24200
},
{
"epoch": 1.4301268281199424,
"grad_norm": 12.625,
"learning_rate": 0.00014510509051508406,
"loss": 0.5155,
"step": 24300
},
{
"epoch": 1.4360121237089132,
"grad_norm": 23.125,
"learning_rate": 0.00014468787931938516,
"loss": 0.5307,
"step": 24400
},
{
"epoch": 1.4418974192978842,
"grad_norm": 6.625,
"learning_rate": 0.00014426969388561345,
"loss": 0.5463,
"step": 24500
},
{
"epoch": 1.4477827148868552,
"grad_norm": 11.75,
"learning_rate": 0.0001438505433306072,
"loss": 0.5078,
"step": 24600
},
{
"epoch": 1.4536680104758262,
"grad_norm": 17.125,
"learning_rate": 0.00014343043679224533,
"loss": 0.5224,
"step": 24700
},
{
"epoch": 1.4595533060647972,
"grad_norm": 11.0625,
"learning_rate": 0.00014300938342924803,
"loss": 0.515,
"step": 24800
},
{
"epoch": 1.465438601653768,
"grad_norm": 5.375,
"learning_rate": 0.00014258739242097726,
"loss": 0.5313,
"step": 24900
},
{
"epoch": 1.4713238972427392,
"grad_norm": 14.1875,
"learning_rate": 0.0001421644729672364,
"loss": 0.5191,
"step": 25000
},
{
"epoch": 1.47720919283171,
"grad_norm": 9.125,
"learning_rate": 0.00014174063428807,
"loss": 0.5358,
"step": 25100
},
{
"epoch": 1.483094488420681,
"grad_norm": 18.625,
"learning_rate": 0.00014131588562356243,
"loss": 0.5256,
"step": 25200
},
{
"epoch": 1.488979784009652,
"grad_norm": 7.71875,
"learning_rate": 0.00014089023623363667,
"loss": 0.5414,
"step": 25300
},
{
"epoch": 1.494865079598623,
"grad_norm": 36.5,
"learning_rate": 0.00014046369539785233,
"loss": 0.526,
"step": 25400
},
{
"epoch": 1.500750375187594,
"grad_norm": 14.5,
"learning_rate": 0.00014003627241520347,
"loss": 0.5072,
"step": 25500
},
{
"epoch": 1.5066356707765647,
"grad_norm": 7.96875,
"learning_rate": 0.0001396079766039157,
"loss": 0.5244,
"step": 25600
},
{
"epoch": 1.5125209663655357,
"grad_norm": 14.0625,
"learning_rate": 0.00013917881730124315,
"loss": 0.5159,
"step": 25700
},
{
"epoch": 1.5184062619545067,
"grad_norm": 10.25,
"learning_rate": 0.0001387488038632649,
"loss": 0.5111,
"step": 25800
},
{
"epoch": 1.5242915575434777,
"grad_norm": 11.9375,
"learning_rate": 0.00013831794566468097,
"loss": 0.5254,
"step": 25900
},
{
"epoch": 1.5301768531324487,
"grad_norm": 23.625,
"learning_rate": 0.00013788625209860793,
"loss": 0.5248,
"step": 26000
},
{
"epoch": 1.5360621487214194,
"grad_norm": 12.3125,
"learning_rate": 0.00013745373257637418,
"loss": 0.5324,
"step": 26100
},
{
"epoch": 1.5419474443103904,
"grad_norm": 14.875,
"learning_rate": 0.00013702039652731482,
"loss": 0.5062,
"step": 26200
},
{
"epoch": 1.5478327398993614,
"grad_norm": 9.0625,
"learning_rate": 0.00013658625339856587,
"loss": 0.5304,
"step": 26300
},
{
"epoch": 1.5537180354883324,
"grad_norm": 10.5625,
"learning_rate": 0.0001361513126548585,
"loss": 0.5169,
"step": 26400
},
{
"epoch": 1.5596033310773034,
"grad_norm": 17.0,
"learning_rate": 0.0001357155837783127,
"loss": 0.5242,
"step": 26500
},
{
"epoch": 1.5654886266662742,
"grad_norm": 10.625,
"learning_rate": 0.00013527907626823048,
"loss": 0.5312,
"step": 26600
},
{
"epoch": 1.5713739222552454,
"grad_norm": 9.0625,
"learning_rate": 0.00013484179964088873,
"loss": 0.5313,
"step": 26700
},
{
"epoch": 1.5772592178442162,
"grad_norm": 6.71875,
"learning_rate": 0.00013440376342933188,
"loss": 0.5317,
"step": 26800
},
{
"epoch": 1.5831445134331872,
"grad_norm": 7.34375,
"learning_rate": 0.00013396497718316406,
"loss": 0.5358,
"step": 26900
},
{
"epoch": 1.5890298090221582,
"grad_norm": 16.25,
"learning_rate": 0.00013352545046834075,
"loss": 0.4916,
"step": 27000
},
{
"epoch": 1.594915104611129,
"grad_norm": 11.8125,
"learning_rate": 0.00013308519286696043,
"loss": 0.4964,
"step": 27100
},
{
"epoch": 1.6008004002001002,
"grad_norm": 5.6875,
"learning_rate": 0.00013264421397705557,
"loss": 0.5129,
"step": 27200
},
{
"epoch": 1.606685695789071,
"grad_norm": 18.125,
"learning_rate": 0.0001322025234123835,
"loss": 0.5137,
"step": 27300
},
{
"epoch": 1.612570991378042,
"grad_norm": 7.46875,
"learning_rate": 0.0001317601308022165,
"loss": 0.5186,
"step": 27400
},
{
"epoch": 1.618456286967013,
"grad_norm": 22.625,
"learning_rate": 0.0001313170457911324,
"loss": 0.5108,
"step": 27500
},
{
"epoch": 1.6243415825559837,
"grad_norm": 7.125,
"learning_rate": 0.00013087327803880383,
"loss": 0.522,
"step": 27600
},
{
"epoch": 1.630226878144955,
"grad_norm": 25.125,
"learning_rate": 0.0001304288372197879,
"loss": 0.5084,
"step": 27700
},
{
"epoch": 1.6361121737339257,
"grad_norm": 27.5,
"learning_rate": 0.00012998373302331516,
"loss": 0.5356,
"step": 27800
},
{
"epoch": 1.6419974693228967,
"grad_norm": 6.8125,
"learning_rate": 0.0001295379751530785,
"loss": 0.522,
"step": 27900
},
{
"epoch": 1.6478827649118677,
"grad_norm": 10.4375,
"learning_rate": 0.00012909157332702145,
"loss": 0.5182,
"step": 28000
},
{
"epoch": 1.6537680605008387,
"grad_norm": 8.6875,
"learning_rate": 0.00012864453727712638,
"loss": 0.5054,
"step": 28100
},
{
"epoch": 1.6596533560898097,
"grad_norm": 5.65625,
"learning_rate": 0.00012819687674920234,
"loss": 0.5319,
"step": 28200
},
{
"epoch": 1.6655386516787805,
"grad_norm": 19.125,
"learning_rate": 0.0001277486015026727,
"loss": 0.5084,
"step": 28300
},
{
"epoch": 1.6714239472677517,
"grad_norm": 7.71875,
"learning_rate": 0.00012729972131036212,
"loss": 0.5115,
"step": 28400
},
{
"epoch": 1.6773092428567224,
"grad_norm": 6.59375,
"learning_rate": 0.0001268502459582838,
"loss": 0.5298,
"step": 28500
},
{
"epoch": 1.6831945384456934,
"grad_norm": 16.0,
"learning_rate": 0.00012640018524542583,
"loss": 0.5167,
"step": 28600
},
{
"epoch": 1.6890798340346644,
"grad_norm": 32.5,
"learning_rate": 0.0001259495489835378,
"loss": 0.4973,
"step": 28700
},
{
"epoch": 1.6949651296236352,
"grad_norm": 20.875,
"learning_rate": 0.00012549834699691686,
"loss": 0.5206,
"step": 28800
},
{
"epoch": 1.7008504252126064,
"grad_norm": 17.125,
"learning_rate": 0.00012504658912219346,
"loss": 0.5083,
"step": 28900
},
{
"epoch": 1.7067357208015772,
"grad_norm": 22.875,
"learning_rate": 0.00012459428520811687,
"loss": 0.501,
"step": 29000
},
{
"epoch": 1.7126210163905482,
"grad_norm": 8.9375,
"learning_rate": 0.00012414144511534064,
"loss": 0.5043,
"step": 29100
},
{
"epoch": 1.7185063119795192,
"grad_norm": 17.625,
"learning_rate": 0.00012368807871620743,
"loss": 0.5342,
"step": 29200
},
{
"epoch": 1.72439160756849,
"grad_norm": 11.75,
"learning_rate": 0.00012323419589453394,
"loss": 0.5153,
"step": 29300
},
{
"epoch": 1.7302769031574612,
"grad_norm": 18.625,
"learning_rate": 0.00012277980654539533,
"loss": 0.5525,
"step": 29400
},
{
"epoch": 1.736162198746432,
"grad_norm": 5.125,
"learning_rate": 0.0001223249205749096,
"loss": 0.5195,
"step": 29500
},
{
"epoch": 1.742047494335403,
"grad_norm": 19.0,
"learning_rate": 0.0001218695479000215,
"loss": 0.5024,
"step": 29600
},
{
"epoch": 1.747932789924374,
"grad_norm": 6.0,
"learning_rate": 0.0001214136984482864,
"loss": 0.5058,
"step": 29700
},
{
"epoch": 1.753818085513345,
"grad_norm": 23.875,
"learning_rate": 0.00012095738215765391,
"loss": 0.5097,
"step": 29800
},
{
"epoch": 1.759703381102316,
"grad_norm": 8.625,
"learning_rate": 0.0001205006089762511,
"loss": 0.5282,
"step": 29900
},
{
"epoch": 1.7655886766912867,
"grad_norm": 25.875,
"learning_rate": 0.00012004338886216578,
"loss": 0.508,
"step": 30000
},
{
"epoch": 1.771473972280258,
"grad_norm": 8.75,
"learning_rate": 0.0001195857317832292,
"loss": 0.5232,
"step": 30100
},
{
"epoch": 1.7773592678692287,
"grad_norm": 18.625,
"learning_rate": 0.00011912764771679898,
"loss": 0.5227,
"step": 30200
},
{
"epoch": 1.7832445634581997,
"grad_norm": 14.875,
"learning_rate": 0.00011866914664954139,
"loss": 0.5093,
"step": 30300
},
{
"epoch": 1.7891298590471707,
"grad_norm": 7.96875,
"learning_rate": 0.00011821023857721371,
"loss": 0.5307,
"step": 30400
},
{
"epoch": 1.7950151546361415,
"grad_norm": 7.375,
"learning_rate": 0.00011775093350444637,
"loss": 0.5205,
"step": 30500
},
{
"epoch": 1.8009004502251127,
"grad_norm": 12.25,
"learning_rate": 0.00011729124144452477,
"loss": 0.5136,
"step": 30600
},
{
"epoch": 1.8067857458140835,
"grad_norm": 26.125,
"learning_rate": 0.00011683117241917095,
"loss": 0.4868,
"step": 30700
},
{
"epoch": 1.8126710414030545,
"grad_norm": 7.5625,
"learning_rate": 0.00011637073645832516,
"loss": 0.5018,
"step": 30800
},
{
"epoch": 1.8185563369920255,
"grad_norm": 20.5,
"learning_rate": 0.00011590994359992731,
"loss": 0.5079,
"step": 30900
},
{
"epoch": 1.8244416325809962,
"grad_norm": 9.25,
"learning_rate": 0.00011544880388969783,
"loss": 0.546,
"step": 31000
},
{
"epoch": 1.8303269281699674,
"grad_norm": 6.3125,
"learning_rate": 0.000114987327380919,
"loss": 0.5261,
"step": 31100
},
{
"epoch": 1.8362122237589382,
"grad_norm": 16.5,
"learning_rate": 0.00011452552413421558,
"loss": 0.5218,
"step": 31200
},
{
"epoch": 1.8420975193479092,
"grad_norm": 14.3125,
"learning_rate": 0.0001140634042173354,
"loss": 0.534,
"step": 31300
},
{
"epoch": 1.8479828149368802,
"grad_norm": 14.625,
"learning_rate": 0.00011360097770493024,
"loss": 0.5182,
"step": 31400
},
{
"epoch": 1.8538681105258512,
"grad_norm": 7.34375,
"learning_rate": 0.00011313825467833574,
"loss": 0.5025,
"step": 31500
},
{
"epoch": 1.8597534061148222,
"grad_norm": 19.625,
"learning_rate": 0.00011267524522535198,
"loss": 0.507,
"step": 31600
},
{
"epoch": 1.865638701703793,
"grad_norm": 19.0,
"learning_rate": 0.00011221195944002332,
"loss": 0.5229,
"step": 31700
},
{
"epoch": 1.871523997292764,
"grad_norm": 23.625,
"learning_rate": 0.00011174840742241844,
"loss": 0.5209,
"step": 31800
},
{
"epoch": 1.877409292881735,
"grad_norm": 12.5,
"learning_rate": 0.00011128459927841013,
"loss": 0.5025,
"step": 31900
},
{
"epoch": 1.883294588470706,
"grad_norm": 11.3125,
"learning_rate": 0.00011082054511945501,
"loss": 0.5267,
"step": 32000
},
{
"epoch": 1.889179884059677,
"grad_norm": 14.0,
"learning_rate": 0.00011035625506237304,
"loss": 0.5225,
"step": 32100
},
{
"epoch": 1.8950651796486477,
"grad_norm": 5.4375,
"learning_rate": 0.00010989173922912696,
"loss": 0.514,
"step": 32200
},
{
"epoch": 1.900950475237619,
"grad_norm": 13.9375,
"learning_rate": 0.00010942700774660173,
"loss": 0.5344,
"step": 32300
},
{
"epoch": 1.9068357708265897,
"grad_norm": 21.0,
"learning_rate": 0.00010896207074638356,
"loss": 0.5109,
"step": 32400
},
{
"epoch": 1.9127210664155607,
"grad_norm": 15.125,
"learning_rate": 0.0001084969383645392,
"loss": 0.5147,
"step": 32500
},
{
"epoch": 1.9186063620045317,
"grad_norm": 8.3125,
"learning_rate": 0.00010803162074139487,
"loss": 0.5041,
"step": 32600
},
{
"epoch": 1.9244916575935025,
"grad_norm": 14.875,
"learning_rate": 0.00010756612802131528,
"loss": 0.5334,
"step": 32700
},
{
"epoch": 1.9303769531824737,
"grad_norm": 9.625,
"learning_rate": 0.00010710047035248235,
"loss": 0.4981,
"step": 32800
},
{
"epoch": 1.9362622487714445,
"grad_norm": 15.4375,
"learning_rate": 0.00010663465788667406,
"loss": 0.5252,
"step": 32900
},
{
"epoch": 1.9421475443604155,
"grad_norm": 11.6875,
"learning_rate": 0.0001061687007790432,
"loss": 0.5196,
"step": 33000
},
{
"epoch": 1.9480328399493865,
"grad_norm": 21.125,
"learning_rate": 0.00010570260918789578,
"loss": 0.5056,
"step": 33100
},
{
"epoch": 1.9539181355383572,
"grad_norm": 6.0625,
"learning_rate": 0.00010523639327446968,
"loss": 0.5173,
"step": 33200
},
{
"epoch": 1.9598034311273285,
"grad_norm": 3.5625,
"learning_rate": 0.00010477006320271317,
"loss": 0.4972,
"step": 33300
},
{
"epoch": 1.9656887267162992,
"grad_norm": 15.0625,
"learning_rate": 0.00010430362913906327,
"loss": 0.5204,
"step": 33400
},
{
"epoch": 1.9715740223052702,
"grad_norm": 42.5,
"learning_rate": 0.00010383710125222412,
"loss": 0.522,
"step": 33500
},
{
"epoch": 1.9774593178942412,
"grad_norm": 16.375,
"learning_rate": 0.00010337048971294529,
"loss": 0.538,
"step": 33600
},
{
"epoch": 1.9833446134832122,
"grad_norm": 15.0,
"learning_rate": 0.00010290380469380005,
"loss": 0.5178,
"step": 33700
},
{
"epoch": 1.9892299090721832,
"grad_norm": 15.25,
"learning_rate": 0.00010243705636896361,
"loss": 0.544,
"step": 33800
},
{
"epoch": 1.995115204661154,
"grad_norm": 14.4375,
"learning_rate": 0.00010197025491399128,
"loss": 0.4892,
"step": 33900
},
{
"epoch": 2.001000500250125,
"grad_norm": 6.78125,
"learning_rate": 0.00010150341050559669,
"loss": 0.5086,
"step": 34000
},
{
"epoch": 2.006885795839096,
"grad_norm": 15.1875,
"learning_rate": 0.00010103653332142988,
"loss": 0.4967,
"step": 34100
},
{
"epoch": 2.0127710914280668,
"grad_norm": 10.1875,
"learning_rate": 0.00010056963353985544,
"loss": 0.5222,
"step": 34200
},
{
"epoch": 2.018656387017038,
"grad_norm": 9.0,
"learning_rate": 0.00010010272133973058,
"loss": 0.5374,
"step": 34300
},
{
"epoch": 2.0245416826060088,
"grad_norm": 14.6875,
"learning_rate": 9.963580690018327e-05,
"loss": 0.5077,
"step": 34400
},
{
"epoch": 2.03042697819498,
"grad_norm": 9.6875,
"learning_rate": 9.916890040039031e-05,
"loss": 0.5286,
"step": 34500
},
{
"epoch": 2.0363122737839507,
"grad_norm": 18.125,
"learning_rate": 9.870201201935538e-05,
"loss": 0.5236,
"step": 34600
},
{
"epoch": 2.042197569372922,
"grad_norm": 7.65625,
"learning_rate": 9.823515193568715e-05,
"loss": 0.5196,
"step": 34700
},
{
"epoch": 2.0480828649618927,
"grad_norm": 6.34375,
"learning_rate": 9.776833032737742e-05,
"loss": 0.5108,
"step": 34800
},
{
"epoch": 2.0539681605508635,
"grad_norm": 16.625,
"learning_rate": 9.730155737157916e-05,
"loss": 0.5166,
"step": 34900
},
{
"epoch": 2.0598534561398347,
"grad_norm": 4.625,
"learning_rate": 9.683484324438467e-05,
"loss": 0.512,
"step": 35000
},
{
"epoch": 2.0657387517288055,
"grad_norm": 8.8125,
"learning_rate": 9.636819812060377e-05,
"loss": 0.5163,
"step": 35100
},
{
"epoch": 2.0716240473177767,
"grad_norm": 9.875,
"learning_rate": 9.590163217354184e-05,
"loss": 0.5038,
"step": 35200
},
{
"epoch": 2.0775093429067475,
"grad_norm": 8.6875,
"learning_rate": 9.543515557477826e-05,
"loss": 0.511,
"step": 35300
},
{
"epoch": 2.0833946384957183,
"grad_norm": 8.5625,
"learning_rate": 9.496877849394444e-05,
"loss": 0.498,
"step": 35400
},
{
"epoch": 2.0892799340846895,
"grad_norm": 7.0625,
"learning_rate": 9.450251109850225e-05,
"loss": 0.5318,
"step": 35500
},
{
"epoch": 2.0951652296736603,
"grad_norm": 21.625,
"learning_rate": 9.40363635535223e-05,
"loss": 0.5205,
"step": 35600
},
{
"epoch": 2.1010505252626315,
"grad_norm": 7.84375,
"learning_rate": 9.357034602146232e-05,
"loss": 0.5164,
"step": 35700
},
{
"epoch": 2.1069358208516022,
"grad_norm": 24.875,
"learning_rate": 9.310446866194571e-05,
"loss": 0.5349,
"step": 35800
},
{
"epoch": 2.112821116440573,
"grad_norm": 17.125,
"learning_rate": 9.263874163153992e-05,
"loss": 0.5042,
"step": 35900
},
{
"epoch": 2.1187064120295442,
"grad_norm": 9.5625,
"learning_rate": 9.217317508353507e-05,
"loss": 0.4948,
"step": 36000
},
{
"epoch": 2.124591707618515,
"grad_norm": 7.46875,
"learning_rate": 9.170777916772265e-05,
"loss": 0.5195,
"step": 36100
},
{
"epoch": 2.1304770032074862,
"grad_norm": 7.75,
"learning_rate": 9.124256403017419e-05,
"loss": 0.5179,
"step": 36200
},
{
"epoch": 2.136362298796457,
"grad_norm": 9.3125,
"learning_rate": 9.077753981302009e-05,
"loss": 0.4938,
"step": 36300
},
{
"epoch": 2.1422475943854282,
"grad_norm": 13.4375,
"learning_rate": 9.031271665422849e-05,
"loss": 0.5449,
"step": 36400
},
{
"epoch": 2.148132889974399,
"grad_norm": 11.1875,
"learning_rate": 8.984810468738427e-05,
"loss": 0.5127,
"step": 36500
},
{
"epoch": 2.1540181855633698,
"grad_norm": 9.5,
"learning_rate": 8.938371404146812e-05,
"loss": 0.5085,
"step": 36600
},
{
"epoch": 2.159903481152341,
"grad_norm": 12.4375,
"learning_rate": 8.891955484063576e-05,
"loss": 0.5424,
"step": 36700
},
{
"epoch": 2.1657887767413118,
"grad_norm": 6.28125,
"learning_rate": 8.845563720399716e-05,
"loss": 0.513,
"step": 36800
},
{
"epoch": 2.171674072330283,
"grad_norm": 22.625,
"learning_rate": 8.799197124539595e-05,
"loss": 0.5128,
"step": 36900
},
{
"epoch": 2.1775593679192538,
"grad_norm": 17.0,
"learning_rate": 8.752856707318896e-05,
"loss": 0.5216,
"step": 37000
},
{
"epoch": 2.1834446635082245,
"grad_norm": 7.40625,
"learning_rate": 8.706543479002584e-05,
"loss": 0.5186,
"step": 37100
},
{
"epoch": 2.1893299590971957,
"grad_norm": 19.5,
"learning_rate": 8.660258449262878e-05,
"loss": 0.5274,
"step": 37200
},
{
"epoch": 2.1952152546861665,
"grad_norm": 15.1875,
"learning_rate": 8.614002627157239e-05,
"loss": 0.5017,
"step": 37300
},
{
"epoch": 2.2011005502751377,
"grad_norm": 13.0,
"learning_rate": 8.56777702110638e-05,
"loss": 0.5044,
"step": 37400
},
{
"epoch": 2.2069858458641085,
"grad_norm": 13.3125,
"learning_rate": 8.521582638872273e-05,
"loss": 0.5191,
"step": 37500
},
{
"epoch": 2.2128711414530793,
"grad_norm": 7.625,
"learning_rate": 8.475420487536179e-05,
"loss": 0.5101,
"step": 37600
},
{
"epoch": 2.2187564370420505,
"grad_norm": 13.75,
"learning_rate": 8.429291573476699e-05,
"loss": 0.5029,
"step": 37700
},
{
"epoch": 2.2246417326310213,
"grad_norm": 14.625,
"learning_rate": 8.383196902347823e-05,
"loss": 0.5132,
"step": 37800
},
{
"epoch": 2.2305270282199925,
"grad_norm": 8.9375,
"learning_rate": 8.337137479057019e-05,
"loss": 0.516,
"step": 37900
},
{
"epoch": 2.2364123238089633,
"grad_norm": 5.71875,
"learning_rate": 8.291114307743317e-05,
"loss": 0.5114,
"step": 38000
},
{
"epoch": 2.2422976193979345,
"grad_norm": 12.875,
"learning_rate": 8.24512839175542e-05,
"loss": 0.5025,
"step": 38100
},
{
"epoch": 2.2481829149869053,
"grad_norm": 12.0,
"learning_rate": 8.199180733629826e-05,
"loss": 0.5121,
"step": 38200
},
{
"epoch": 2.254068210575876,
"grad_norm": 18.125,
"learning_rate": 8.153272335068982e-05,
"loss": 0.5347,
"step": 38300
},
{
"epoch": 2.2599535061648472,
"grad_norm": 5.9375,
"learning_rate": 8.107404196919436e-05,
"loss": 0.5165,
"step": 38400
},
{
"epoch": 2.265838801753818,
"grad_norm": 12.375,
"learning_rate": 8.061577319150016e-05,
"loss": 0.5022,
"step": 38500
},
{
"epoch": 2.271724097342789,
"grad_norm": 9.375,
"learning_rate": 8.015792700830044e-05,
"loss": 0.5203,
"step": 38600
},
{
"epoch": 2.27760939293176,
"grad_norm": 10.5,
"learning_rate": 7.97005134010754e-05,
"loss": 0.5199,
"step": 38700
},
{
"epoch": 2.283494688520731,
"grad_norm": 12.625,
"learning_rate": 7.924354234187466e-05,
"loss": 0.5376,
"step": 38800
},
{
"epoch": 2.289379984109702,
"grad_norm": 13.6875,
"learning_rate": 7.878702379309991e-05,
"loss": 0.5228,
"step": 38900
},
{
"epoch": 2.2952652796986728,
"grad_norm": 7.3125,
"learning_rate": 7.833096770728772e-05,
"loss": 0.5474,
"step": 39000
},
{
"epoch": 2.301150575287644,
"grad_norm": 17.0,
"learning_rate": 7.787538402689245e-05,
"loss": 0.511,
"step": 39100
},
{
"epoch": 2.3070358708766148,
"grad_norm": 18.5,
"learning_rate": 7.742028268406961e-05,
"loss": 0.5169,
"step": 39200
},
{
"epoch": 2.3129211664655855,
"grad_norm": 16.5,
"learning_rate": 7.69656736004593e-05,
"loss": 0.5148,
"step": 39300
},
{
"epoch": 2.3188064620545568,
"grad_norm": 19.125,
"learning_rate": 7.651156668696989e-05,
"loss": 0.5257,
"step": 39400
},
{
"epoch": 2.3246917576435275,
"grad_norm": 5.875,
"learning_rate": 7.6057971843562e-05,
"loss": 0.515,
"step": 39500
},
{
"epoch": 2.3305770532324988,
"grad_norm": 5.21875,
"learning_rate": 7.560489895903258e-05,
"loss": 0.4958,
"step": 39600
},
{
"epoch": 2.3364623488214695,
"grad_norm": 5.21875,
"learning_rate": 7.515235791079943e-05,
"loss": 0.5117,
"step": 39700
},
{
"epoch": 2.3423476444104407,
"grad_norm": 12.5625,
"learning_rate": 7.470035856468578e-05,
"loss": 0.53,
"step": 39800
},
{
"epoch": 2.3482329399994115,
"grad_norm": 14.6875,
"learning_rate": 7.424891077470529e-05,
"loss": 0.5052,
"step": 39900
},
{
"epoch": 2.3541182355883823,
"grad_norm": 11.625,
"learning_rate": 7.379802438284711e-05,
"loss": 0.5239,
"step": 40000
},
{
"epoch": 2.3600035311773535,
"grad_norm": 17.625,
"learning_rate": 7.334770921886143e-05,
"loss": 0.5232,
"step": 40100
},
{
"epoch": 2.3658888267663243,
"grad_norm": 15.4375,
"learning_rate": 7.28979751000451e-05,
"loss": 0.5145,
"step": 40200
},
{
"epoch": 2.371774122355295,
"grad_norm": 20.875,
"learning_rate": 7.244883183102769e-05,
"loss": 0.4999,
"step": 40300
},
{
"epoch": 2.3776594179442663,
"grad_norm": 13.3125,
"learning_rate": 7.200028920355759e-05,
"loss": 0.5153,
"step": 40400
},
{
"epoch": 2.383544713533237,
"grad_norm": 11.5625,
"learning_rate": 7.155235699628871e-05,
"loss": 0.4802,
"step": 40500
},
{
"epoch": 2.3894300091222083,
"grad_norm": 7.1875,
"learning_rate": 7.110504497456725e-05,
"loss": 0.4936,
"step": 40600
},
{
"epoch": 2.395315304711179,
"grad_norm": 13.125,
"learning_rate": 7.065836289021866e-05,
"loss": 0.5239,
"step": 40700
},
{
"epoch": 2.4012006003001503,
"grad_norm": 13.1875,
"learning_rate": 7.021232048133527e-05,
"loss": 0.5074,
"step": 40800
},
{
"epoch": 2.407085895889121,
"grad_norm": 8.5625,
"learning_rate": 6.976692747206385e-05,
"loss": 0.5173,
"step": 40900
},
{
"epoch": 2.412971191478092,
"grad_norm": 18.875,
"learning_rate": 6.932219357239363e-05,
"loss": 0.5261,
"step": 41000
},
{
"epoch": 2.418856487067063,
"grad_norm": 10.3125,
"learning_rate": 6.887812847794458e-05,
"loss": 0.5115,
"step": 41100
},
{
"epoch": 2.424741782656034,
"grad_norm": 14.625,
"learning_rate": 6.843474186975617e-05,
"loss": 0.5039,
"step": 41200
},
{
"epoch": 2.430627078245005,
"grad_norm": 21.375,
"learning_rate": 6.799204341407619e-05,
"loss": 0.525,
"step": 41300
},
{
"epoch": 2.436512373833976,
"grad_norm": 12.3125,
"learning_rate": 6.755004276215004e-05,
"loss": 0.4939,
"step": 41400
},
{
"epoch": 2.442397669422947,
"grad_norm": 11.125,
"learning_rate": 6.710874955001035e-05,
"loss": 0.5271,
"step": 41500
},
{
"epoch": 2.4482829650119178,
"grad_norm": 9.4375,
"learning_rate": 6.666817339826692e-05,
"loss": 0.4943,
"step": 41600
},
{
"epoch": 2.4541682606008886,
"grad_norm": 10.0625,
"learning_rate": 6.622832391189689e-05,
"loss": 0.5258,
"step": 41700
},
{
"epoch": 2.4600535561898598,
"grad_norm": 11.6875,
"learning_rate": 6.57892106800355e-05,
"loss": 0.5169,
"step": 41800
},
{
"epoch": 2.4659388517788305,
"grad_norm": 9.375,
"learning_rate": 6.535084327576683e-05,
"loss": 0.4939,
"step": 41900
},
{
"epoch": 2.4718241473678013,
"grad_norm": 13.1875,
"learning_rate": 6.49132312559153e-05,
"loss": 0.5034,
"step": 42000
},
{
"epoch": 2.4777094429567725,
"grad_norm": 4.78125,
"learning_rate": 6.447638416083717e-05,
"loss": 0.5401,
"step": 42100
},
{
"epoch": 2.4835947385457433,
"grad_norm": 9.25,
"learning_rate": 6.404031151421274e-05,
"loss": 0.5167,
"step": 42200
},
{
"epoch": 2.4894800341347145,
"grad_norm": 6.5,
"learning_rate": 6.360502282283845e-05,
"loss": 0.5173,
"step": 42300
},
{
"epoch": 2.4953653297236853,
"grad_norm": 11.375,
"learning_rate": 6.317052757641985e-05,
"loss": 0.499,
"step": 42400
},
{
"epoch": 2.5012506253126565,
"grad_norm": 14.8125,
"learning_rate": 6.273683524736463e-05,
"loss": 0.5147,
"step": 42500
},
{
"epoch": 2.5071359209016273,
"grad_norm": 31.75,
"learning_rate": 6.230395529057611e-05,
"loss": 0.5131,
"step": 42600
},
{
"epoch": 2.513021216490598,
"grad_norm": 13.375,
"learning_rate": 6.187189714324713e-05,
"loss": 0.5048,
"step": 42700
},
{
"epoch": 2.5189065120795693,
"grad_norm": 10.25,
"learning_rate": 6.144067022465433e-05,
"loss": 0.5142,
"step": 42800
},
{
"epoch": 2.52479180766854,
"grad_norm": 21.0,
"learning_rate": 6.1010283935952726e-05,
"loss": 0.5437,
"step": 42900
},
{
"epoch": 2.5306771032575113,
"grad_norm": 27.25,
"learning_rate": 6.058074765997088e-05,
"loss": 0.5261,
"step": 43000
},
{
"epoch": 2.536562398846482,
"grad_norm": 14.0,
"learning_rate": 6.0152070761006175e-05,
"loss": 0.5375,
"step": 43100
},
{
"epoch": 2.5424476944354533,
"grad_norm": 12.875,
"learning_rate": 5.972426258462083e-05,
"loss": 0.5182,
"step": 43200
},
{
"epoch": 2.548332990024424,
"grad_norm": 6.59375,
"learning_rate": 5.929733245743809e-05,
"loss": 0.5061,
"step": 43300
},
{
"epoch": 2.554218285613395,
"grad_norm": 14.6875,
"learning_rate": 5.887128968693887e-05,
"loss": 0.4996,
"step": 43400
},
{
"epoch": 2.560103581202366,
"grad_norm": 24.0,
"learning_rate": 5.8446143561258885e-05,
"loss": 0.5035,
"step": 43500
},
{
"epoch": 2.565988876791337,
"grad_norm": 13.875,
"learning_rate": 5.8021903348986115e-05,
"loss": 0.5101,
"step": 43600
},
{
"epoch": 2.5718741723803076,
"grad_norm": 17.875,
"learning_rate": 5.75985782989588e-05,
"loss": 0.5041,
"step": 43700
},
{
"epoch": 2.577759467969279,
"grad_norm": 10.5625,
"learning_rate": 5.71761776400638e-05,
"loss": 0.5179,
"step": 43800
},
{
"epoch": 2.5836447635582496,
"grad_norm": 8.75,
"learning_rate": 5.6754710581035364e-05,
"loss": 0.5118,
"step": 43900
},
{
"epoch": 2.589530059147221,
"grad_norm": 7.3125,
"learning_rate": 5.633418631025431e-05,
"loss": 0.5191,
"step": 44000
},
{
"epoch": 2.5954153547361916,
"grad_norm": 19.375,
"learning_rate": 5.5914613995547805e-05,
"loss": 0.511,
"step": 44100
},
{
"epoch": 2.6013006503251628,
"grad_norm": 6.75,
"learning_rate": 5.549600278398959e-05,
"loss": 0.4941,
"step": 44200
},
{
"epoch": 2.6071859459141336,
"grad_norm": 24.375,
"learning_rate": 5.507836180170023e-05,
"loss": 0.5151,
"step": 44300
},
{
"epoch": 2.6130712415031043,
"grad_norm": 5.5,
"learning_rate": 5.466170015364863e-05,
"loss": 0.5241,
"step": 44400
},
{
"epoch": 2.6189565370920755,
"grad_norm": 9.4375,
"learning_rate": 5.424602692345304e-05,
"loss": 0.5163,
"step": 44500
},
{
"epoch": 2.6248418326810463,
"grad_norm": 4.90625,
"learning_rate": 5.3831351173183455e-05,
"loss": 0.5091,
"step": 44600
},
{
"epoch": 2.630727128270017,
"grad_norm": 9.125,
"learning_rate": 5.341768194316374e-05,
"loss": 0.5196,
"step": 44700
},
{
"epoch": 2.6366124238589883,
"grad_norm": 21.75,
"learning_rate": 5.300502825177469e-05,
"loss": 0.5248,
"step": 44800
},
{
"epoch": 2.6424977194479595,
"grad_norm": 14.0,
"learning_rate": 5.259339909525749e-05,
"loss": 0.524,
"step": 44900
},
{
"epoch": 2.6483830150369303,
"grad_norm": 28.0,
"learning_rate": 5.2182803447517314e-05,
"loss": 0.4982,
"step": 45000
},
{
"epoch": 2.654268310625901,
"grad_norm": 6.15625,
"learning_rate": 5.1773250259928077e-05,
"loss": 0.5137,
"step": 45100
},
{
"epoch": 2.6601536062148723,
"grad_norm": 4.9375,
"learning_rate": 5.136474846113688e-05,
"loss": 0.5293,
"step": 45200
},
{
"epoch": 2.666038901803843,
"grad_norm": 7.375,
"learning_rate": 5.09573069568697e-05,
"loss": 0.5154,
"step": 45300
},
{
"epoch": 2.671924197392814,
"grad_norm": 5.78125,
"learning_rate": 5.055093462973706e-05,
"loss": 0.5202,
"step": 45400
},
{
"epoch": 2.677809492981785,
"grad_norm": 18.25,
"learning_rate": 5.014564033904029e-05,
"loss": 0.5225,
"step": 45500
},
{
"epoch": 2.683694788570756,
"grad_norm": 8.5,
"learning_rate": 4.97414329205787e-05,
"loss": 0.5142,
"step": 45600
},
{
"epoch": 2.689580084159727,
"grad_norm": 6.21875,
"learning_rate": 4.933832118645656e-05,
"loss": 0.5356,
"step": 45700
},
{
"epoch": 2.695465379748698,
"grad_norm": 11.5625,
"learning_rate": 4.893631392489137e-05,
"loss": 0.5121,
"step": 45800
},
{
"epoch": 2.701350675337669,
"grad_norm": 7.59375,
"learning_rate": 4.853541990002195e-05,
"loss": 0.5437,
"step": 45900
},
{
"epoch": 2.70723597092664,
"grad_norm": 16.625,
"learning_rate": 4.8135647851717516e-05,
"loss": 0.5347,
"step": 46000
},
{
"epoch": 2.7131212665156106,
"grad_norm": 6.34375,
"learning_rate": 4.7737006495387216e-05,
"loss": 0.5152,
"step": 46100
},
{
"epoch": 2.719006562104582,
"grad_norm": 32.5,
"learning_rate": 4.7339504521789935e-05,
"loss": 0.4914,
"step": 46200
},
{
"epoch": 2.7248918576935526,
"grad_norm": 30.625,
"learning_rate": 4.694315059684507e-05,
"loss": 0.5021,
"step": 46300
},
{
"epoch": 2.7307771532825234,
"grad_norm": 14.5625,
"learning_rate": 4.65479533614433e-05,
"loss": 0.5113,
"step": 46400
},
{
"epoch": 2.7366624488714946,
"grad_norm": 13.5,
"learning_rate": 4.6153921431258554e-05,
"loss": 0.5169,
"step": 46500
},
{
"epoch": 2.742547744460466,
"grad_norm": 18.25,
"learning_rate": 4.576106339655984e-05,
"loss": 0.5086,
"step": 46600
},
{
"epoch": 2.7484330400494366,
"grad_norm": 8.3125,
"learning_rate": 4.536938782202431e-05,
"loss": 0.5176,
"step": 46700
},
{
"epoch": 2.7543183356384073,
"grad_norm": 12.4375,
"learning_rate": 4.4978903246550195e-05,
"loss": 0.5146,
"step": 46800
},
{
"epoch": 2.7602036312273786,
"grad_norm": 14.25,
"learning_rate": 4.4589618183070844e-05,
"loss": 0.5207,
"step": 46900
},
{
"epoch": 2.7660889268163493,
"grad_norm": 11.6875,
"learning_rate": 4.42015411183693e-05,
"loss": 0.5122,
"step": 47000
},
{
"epoch": 2.77197422240532,
"grad_norm": 18.25,
"learning_rate": 4.381468051289283e-05,
"loss": 0.5176,
"step": 47100
},
{
"epoch": 2.7778595179942913,
"grad_norm": 8.875,
"learning_rate": 4.342904480056893e-05,
"loss": 0.4933,
"step": 47200
},
{
"epoch": 2.783744813583262,
"grad_norm": 10.0625,
"learning_rate": 4.304464238862115e-05,
"loss": 0.5001,
"step": 47300
},
{
"epoch": 2.7896301091722333,
"grad_norm": 10.4375,
"learning_rate": 4.266148165738593e-05,
"loss": 0.5163,
"step": 47400
},
{
"epoch": 2.795515404761204,
"grad_norm": 5.25,
"learning_rate": 4.227957096013e-05,
"loss": 0.5061,
"step": 47500
},
{
"epoch": 2.8014007003501753,
"grad_norm": 7.8125,
"learning_rate": 4.1898918622868025e-05,
"loss": 0.5097,
"step": 47600
},
{
"epoch": 2.807285995939146,
"grad_norm": 16.125,
"learning_rate": 4.1519532944181374e-05,
"loss": 0.5171,
"step": 47700
},
{
"epoch": 2.813171291528117,
"grad_norm": 13.375,
"learning_rate": 4.1141422195036904e-05,
"loss": 0.5217,
"step": 47800
},
{
"epoch": 2.819056587117088,
"grad_norm": 8.375,
"learning_rate": 4.0764594618606975e-05,
"loss": 0.5038,
"step": 47900
},
{
"epoch": 2.824941882706059,
"grad_norm": 11.75,
"learning_rate": 4.038905843008943e-05,
"loss": 0.4968,
"step": 48000
},
{
"epoch": 2.8308271782950296,
"grad_norm": 15.5625,
"learning_rate": 4.001482181652865e-05,
"loss": 0.5336,
"step": 48100
},
{
"epoch": 2.836712473884001,
"grad_norm": 6.28125,
"learning_rate": 3.964189293663715e-05,
"loss": 0.5185,
"step": 48200
},
{
"epoch": 2.842597769472972,
"grad_norm": 5.84375,
"learning_rate": 3.9270279920617456e-05,
"loss": 0.501,
"step": 48300
},
{
"epoch": 2.848483065061943,
"grad_norm": 10.3125,
"learning_rate": 3.889999086998519e-05,
"loss": 0.5302,
"step": 48400
},
{
"epoch": 2.8543683606509136,
"grad_norm": 16.375,
"learning_rate": 3.853103385739213e-05,
"loss": 0.5224,
"step": 48500
},
{
"epoch": 2.860253656239885,
"grad_norm": 7.46875,
"learning_rate": 3.8163416926450436e-05,
"loss": 0.5142,
"step": 48600
},
{
"epoch": 2.8661389518288556,
"grad_norm": 6.71875,
"learning_rate": 3.7797148091557244e-05,
"loss": 0.5233,
"step": 48700
},
{
"epoch": 2.8720242474178264,
"grad_norm": 15.25,
"learning_rate": 3.743223533771982e-05,
"loss": 0.5433,
"step": 48800
},
{
"epoch": 2.8779095430067976,
"grad_norm": 5.28125,
"learning_rate": 3.706868662038172e-05,
"loss": 0.5114,
"step": 48900
},
{
"epoch": 2.8837948385957684,
"grad_norm": 5.65625,
"learning_rate": 3.670650986524905e-05,
"loss": 0.515,
"step": 49000
},
{
"epoch": 2.8896801341847396,
"grad_norm": 7.09375,
"learning_rate": 3.634571296811801e-05,
"loss": 0.5299,
"step": 49100
},
{
"epoch": 2.8955654297737103,
"grad_norm": 10.9375,
"learning_rate": 3.5986303794702445e-05,
"loss": 0.5259,
"step": 49200
},
{
"epoch": 2.9014507253626816,
"grad_norm": 12.375,
"learning_rate": 3.5628290180462556e-05,
"loss": 0.5327,
"step": 49300
},
{
"epoch": 2.9073360209516523,
"grad_norm": 13.9375,
"learning_rate": 3.527167993043411e-05,
"loss": 0.5047,
"step": 49400
},
{
"epoch": 2.913221316540623,
"grad_norm": 18.875,
"learning_rate": 3.4916480819058074e-05,
"loss": 0.5137,
"step": 49500
},
{
"epoch": 2.9191066121295943,
"grad_norm": 17.0,
"learning_rate": 3.4562700590011384e-05,
"loss": 0.5224,
"step": 49600
},
{
"epoch": 2.924991907718565,
"grad_norm": 7.21875,
"learning_rate": 3.4210346956037894e-05,
"loss": 0.5242,
"step": 49700
},
{
"epoch": 2.930877203307536,
"grad_norm": 6.375,
"learning_rate": 3.385942759878042e-05,
"loss": 0.5102,
"step": 49800
},
{
"epoch": 2.936762498896507,
"grad_norm": 12.125,
"learning_rate": 3.35099501686131e-05,
"loss": 0.49,
"step": 49900
},
{
"epoch": 2.9426477944854783,
"grad_norm": 8.6875,
"learning_rate": 3.316192228447479e-05,
"loss": 0.5086,
"step": 50000
},
{
"epoch": 2.948533090074449,
"grad_norm": 8.25,
"learning_rate": 3.281535153370278e-05,
"loss": 0.5013,
"step": 50100
},
{
"epoch": 2.95441838566342,
"grad_norm": 6.65625,
"learning_rate": 3.2470245471867536e-05,
"loss": 0.5204,
"step": 50200
},
{
"epoch": 2.960303681252391,
"grad_norm": 8.125,
"learning_rate": 3.212661162260794e-05,
"loss": 0.4943,
"step": 50300
},
{
"epoch": 2.966188976841362,
"grad_norm": 7.8125,
"learning_rate": 3.1784457477467135e-05,
"loss": 0.5172,
"step": 50400
},
{
"epoch": 2.9720742724303326,
"grad_norm": 7.09375,
"learning_rate": 3.144379049572945e-05,
"loss": 0.5017,
"step": 50500
},
{
"epoch": 2.977959568019304,
"grad_norm": 14.9375,
"learning_rate": 3.110461810425754e-05,
"loss": 0.4932,
"step": 50600
},
{
"epoch": 2.9838448636082746,
"grad_norm": 18.625,
"learning_rate": 3.076694769733061e-05,
"loss": 0.5163,
"step": 50700
},
{
"epoch": 2.989730159197246,
"grad_norm": 29.875,
"learning_rate": 3.043078663648322e-05,
"loss": 0.523,
"step": 50800
},
{
"epoch": 2.9956154547862166,
"grad_norm": 7.5625,
"learning_rate": 3.0096142250344683e-05,
"loss": 0.4909,
"step": 50900
},
{
"epoch": 3.0015007503751874,
"grad_norm": 15.6875,
"learning_rate": 2.976302183447944e-05,
"loss": 0.5244,
"step": 51000
},
{
"epoch": 3.0073860459641586,
"grad_norm": 23.75,
"learning_rate": 2.9431432651227876e-05,
"loss": 0.5018,
"step": 51100
},
{
"epoch": 3.0132713415531294,
"grad_norm": 17.375,
"learning_rate": 2.9101381929548122e-05,
"loss": 0.5074,
"step": 51200
},
{
"epoch": 3.0191566371421006,
"grad_norm": 14.0,
"learning_rate": 2.8772876864858333e-05,
"loss": 0.5075,
"step": 51300
},
{
"epoch": 3.0250419327310714,
"grad_norm": 11.0625,
"learning_rate": 2.844592461887987e-05,
"loss": 0.5093,
"step": 51400
},
{
"epoch": 3.0309272283200426,
"grad_norm": 5.96875,
"learning_rate": 2.812053231948125e-05,
"loss": 0.5173,
"step": 51500
},
{
"epoch": 3.0368125239090134,
"grad_norm": 24.375,
"learning_rate": 2.7796707060522588e-05,
"loss": 0.5349,
"step": 51600
},
{
"epoch": 3.042697819497984,
"grad_norm": 16.5,
"learning_rate": 2.747445590170109e-05,
"loss": 0.5164,
"step": 51700
},
{
"epoch": 3.0485831150869553,
"grad_norm": 16.5,
"learning_rate": 2.715378586839713e-05,
"loss": 0.5046,
"step": 51800
},
{
"epoch": 3.054468410675926,
"grad_norm": 9.25,
"learning_rate": 2.6834703951520913e-05,
"loss": 0.5054,
"step": 51900
},
{
"epoch": 3.0603537062648973,
"grad_norm": 18.0,
"learning_rate": 2.651721710736036e-05,
"loss": 0.5007,
"step": 52000
},
{
"epoch": 3.066239001853868,
"grad_norm": 10.25,
"learning_rate": 2.6201332257429156e-05,
"loss": 0.5306,
"step": 52100
},
{
"epoch": 3.072124297442839,
"grad_norm": 8.3125,
"learning_rate": 2.5887056288316125e-05,
"loss": 0.5168,
"step": 52200
},
{
"epoch": 3.07800959303181,
"grad_norm": 7.40625,
"learning_rate": 2.5574396051534832e-05,
"loss": 0.5217,
"step": 52300
},
{
"epoch": 3.083894888620781,
"grad_norm": 17.625,
"learning_rate": 2.526335836337449e-05,
"loss": 0.4916,
"step": 52400
},
{
"epoch": 3.089780184209752,
"grad_norm": 10.75,
"learning_rate": 2.4953950004751105e-05,
"loss": 0.5206,
"step": 52500
},
{
"epoch": 3.095665479798723,
"grad_norm": 19.0,
"learning_rate": 2.464617772105977e-05,
"loss": 0.5269,
"step": 52600
},
{
"epoch": 3.1015507753876936,
"grad_norm": 12.625,
"learning_rate": 2.434004822202769e-05,
"loss": 0.5039,
"step": 52700
},
{
"epoch": 3.107436070976665,
"grad_norm": 30.25,
"learning_rate": 2.403556818156767e-05,
"loss": 0.5176,
"step": 52800
},
{
"epoch": 3.1133213665656356,
"grad_norm": 7.875,
"learning_rate": 2.3732744237632885e-05,
"loss": 0.4943,
"step": 52900
},
{
"epoch": 3.119206662154607,
"grad_norm": 7.90625,
"learning_rate": 2.3431582992071932e-05,
"loss": 0.4948,
"step": 53000
},
{
"epoch": 3.1250919577435776,
"grad_norm": 9.8125,
"learning_rate": 2.3132091010485103e-05,
"loss": 0.5129,
"step": 53100
},
{
"epoch": 3.1309772533325484,
"grad_norm": 12.9375,
"learning_rate": 2.283427482208107e-05,
"loss": 0.5268,
"step": 53200
},
{
"epoch": 3.1368625489215196,
"grad_norm": 5.6875,
"learning_rate": 2.2538140919534678e-05,
"loss": 0.5075,
"step": 53300
},
{
"epoch": 3.1427478445104904,
"grad_norm": 6.59375,
"learning_rate": 2.2243695758845374e-05,
"loss": 0.5011,
"step": 53400
},
{
"epoch": 3.1486331400994616,
"grad_norm": 6.21875,
"learning_rate": 2.195094575919634e-05,
"loss": 0.5118,
"step": 53500
},
{
"epoch": 3.1545184356884324,
"grad_norm": 10.0625,
"learning_rate": 2.1659897302814747e-05,
"loss": 0.5277,
"step": 53600
},
{
"epoch": 3.1604037312774036,
"grad_norm": 6.8125,
"learning_rate": 2.1370556734832427e-05,
"loss": 0.5392,
"step": 53700
},
{
"epoch": 3.1662890268663744,
"grad_norm": 23.25,
"learning_rate": 2.1082930363147714e-05,
"loss": 0.5214,
"step": 53800
},
{
"epoch": 3.172174322455345,
"grad_norm": 14.25,
"learning_rate": 2.0797024458287752e-05,
"loss": 0.5209,
"step": 53900
},
{
"epoch": 3.1780596180443164,
"grad_norm": 14.0,
"learning_rate": 2.0512845253271895e-05,
"loss": 0.5026,
"step": 54000
},
{
"epoch": 3.183944913633287,
"grad_norm": 8.9375,
"learning_rate": 2.0230398943475905e-05,
"loss": 0.5209,
"step": 54100
},
{
"epoch": 3.1898302092222584,
"grad_norm": 9.625,
"learning_rate": 1.994969168649663e-05,
"loss": 0.5195,
"step": 54200
},
{
"epoch": 3.195715504811229,
"grad_norm": 6.59375,
"learning_rate": 1.967072960201808e-05,
"loss": 0.5069,
"step": 54300
},
{
"epoch": 3.2016008004002,
"grad_norm": 13.4375,
"learning_rate": 1.939351877167771e-05,
"loss": 0.5104,
"step": 54400
},
{
"epoch": 3.207486095989171,
"grad_norm": 11.5625,
"learning_rate": 1.9118065238934103e-05,
"loss": 0.4954,
"step": 54500
},
{
"epoch": 3.213371391578142,
"grad_norm": 10.875,
"learning_rate": 1.884437500893499e-05,
"loss": 0.5009,
"step": 54600
},
{
"epoch": 3.219256687167113,
"grad_norm": 6.25,
"learning_rate": 1.8572454048386455e-05,
"loss": 0.5053,
"step": 54700
},
{
"epoch": 3.225141982756084,
"grad_norm": 6.125,
"learning_rate": 1.8302308285422908e-05,
"loss": 0.5228,
"step": 54800
},
{
"epoch": 3.2310272783450547,
"grad_norm": 7.46875,
"learning_rate": 1.8033943609477632e-05,
"loss": 0.5134,
"step": 54900
},
{
"epoch": 3.236912573934026,
"grad_norm": 8.3125,
"learning_rate": 1.7767365871154717e-05,
"loss": 0.5123,
"step": 55000
},
{
"epoch": 3.2427978695229966,
"grad_norm": 20.375,
"learning_rate": 1.750258088210116e-05,
"loss": 0.5023,
"step": 55100
},
{
"epoch": 3.248683165111968,
"grad_norm": 5.5,
"learning_rate": 1.7239594414880356e-05,
"loss": 0.5162,
"step": 55200
},
{
"epoch": 3.2545684607009386,
"grad_norm": 6.0625,
"learning_rate": 1.6978412202846294e-05,
"loss": 0.5163,
"step": 55300
},
{
"epoch": 3.26045375628991,
"grad_norm": 8.4375,
"learning_rate": 1.6719039940018388e-05,
"loss": 0.5008,
"step": 55400
},
{
"epoch": 3.2663390518788806,
"grad_norm": 8.0625,
"learning_rate": 1.6461483280957568e-05,
"loss": 0.5165,
"step": 55500
},
{
"epoch": 3.2722243474678514,
"grad_norm": 18.375,
"learning_rate": 1.620574784064275e-05,
"loss": 0.5062,
"step": 55600
},
{
"epoch": 3.2781096430568226,
"grad_norm": 9.5625,
"learning_rate": 1.5951839194348683e-05,
"loss": 0.5227,
"step": 55700
},
{
"epoch": 3.2839949386457934,
"grad_norm": 18.625,
"learning_rate": 1.5699762877524193e-05,
"loss": 0.5,
"step": 55800
},
{
"epoch": 3.2898802342347646,
"grad_norm": 20.0,
"learning_rate": 1.5449524385671588e-05,
"loss": 0.5159,
"step": 55900
},
{
"epoch": 3.2957655298237354,
"grad_norm": 5.9375,
"learning_rate": 1.5201129174226936e-05,
"loss": 0.513,
"step": 56000
},
{
"epoch": 3.3016508254127066,
"grad_norm": 7.40625,
"learning_rate": 1.4954582658440919e-05,
"loss": 0.5171,
"step": 56100
},
{
"epoch": 3.3075361210016774,
"grad_norm": 18.375,
"learning_rate": 1.4709890213261047e-05,
"loss": 0.5302,
"step": 56200
},
{
"epoch": 3.313421416590648,
"grad_norm": 8.6875,
"learning_rate": 1.4467057173214194e-05,
"loss": 0.4993,
"step": 56300
},
{
"epoch": 3.3193067121796194,
"grad_norm": 13.3125,
"learning_rate": 1.4226088832290574e-05,
"loss": 0.5359,
"step": 56400
},
{
"epoch": 3.32519200776859,
"grad_norm": 18.5,
"learning_rate": 1.3986990443828074e-05,
"loss": 0.5267,
"step": 56500
},
{
"epoch": 3.331077303357561,
"grad_norm": 7.78125,
"learning_rate": 1.3749767220397935e-05,
"loss": 0.5227,
"step": 56600
},
{
"epoch": 3.336962598946532,
"grad_norm": 11.8125,
"learning_rate": 1.3514424333691011e-05,
"loss": 0.5096,
"step": 56700
},
{
"epoch": 3.342847894535503,
"grad_norm": 10.375,
"learning_rate": 1.328096691440498e-05,
"loss": 0.4976,
"step": 56800
},
{
"epoch": 3.348733190124474,
"grad_norm": 12.875,
"learning_rate": 1.304940005213262e-05,
"loss": 0.5155,
"step": 56900
},
{
"epoch": 3.354618485713445,
"grad_norm": 7.375,
"learning_rate": 1.2819728795250708e-05,
"loss": 0.5168,
"step": 57000
},
{
"epoch": 3.360503781302416,
"grad_norm": 10.125,
"learning_rate": 1.2591958150810102e-05,
"loss": 0.5212,
"step": 57100
},
{
"epoch": 3.366389076891387,
"grad_norm": 13.75,
"learning_rate": 1.2366093084426433e-05,
"loss": 0.5127,
"step": 57200
},
{
"epoch": 3.3722743724803577,
"grad_norm": 10.25,
"learning_rate": 1.2142138520171965e-05,
"loss": 0.5413,
"step": 57300
},
{
"epoch": 3.378159668069329,
"grad_norm": 16.75,
"learning_rate": 1.1920099340468227e-05,
"loss": 0.5217,
"step": 57400
},
{
"epoch": 3.3840449636582997,
"grad_norm": 16.75,
"learning_rate": 1.1699980385979504e-05,
"loss": 0.4949,
"step": 57500
},
{
"epoch": 3.389930259247271,
"grad_norm": 5.46875,
"learning_rate": 1.1481786455507415e-05,
"loss": 0.4959,
"step": 57600
},
{
"epoch": 3.3958155548362416,
"grad_norm": 7.5,
"learning_rate": 1.1265522305886156e-05,
"loss": 0.5145,
"step": 57700
},
{
"epoch": 3.4017008504252124,
"grad_norm": 23.875,
"learning_rate": 1.1051192651878938e-05,
"loss": 0.5159,
"step": 57800
},
{
"epoch": 3.4075861460141836,
"grad_norm": 9.125,
"learning_rate": 1.0838802166075123e-05,
"loss": 0.5329,
"step": 57900
},
{
"epoch": 3.4134714416031544,
"grad_norm": 8.25,
"learning_rate": 1.0628355478788321e-05,
"loss": 0.4948,
"step": 58000
},
{
"epoch": 3.4193567371921256,
"grad_norm": 6.78125,
"learning_rate": 1.0419857177955562e-05,
"loss": 0.508,
"step": 58100
},
{
"epoch": 3.4252420327810964,
"grad_norm": 10.5,
"learning_rate": 1.0213311809037173e-05,
"loss": 0.5162,
"step": 58200
},
{
"epoch": 3.431127328370067,
"grad_norm": 16.0,
"learning_rate": 1.0008723874917747e-05,
"loss": 0.5129,
"step": 58300
},
{
"epoch": 3.4370126239590384,
"grad_norm": 10.625,
"learning_rate": 9.806097835807903e-06,
"loss": 0.5129,
"step": 58400
},
{
"epoch": 3.442897919548009,
"grad_norm": 26.375,
"learning_rate": 9.605438109147068e-06,
"loss": 0.5151,
"step": 58500
},
{
"epoch": 3.4487832151369804,
"grad_norm": 11.25,
"learning_rate": 9.406749069507303e-06,
"loss": 0.515,
"step": 58600
},
{
"epoch": 3.454668510725951,
"grad_norm": 13.25,
"learning_rate": 9.210035048497722e-06,
"loss": 0.5047,
"step": 58700
},
{
"epoch": 3.4605538063149224,
"grad_norm": 5.375,
"learning_rate": 9.015300334670219e-06,
"loss": 0.5125,
"step": 58800
},
{
"epoch": 3.466439101903893,
"grad_norm": 11.75,
"learning_rate": 8.822549173425876e-06,
"loss": 0.5258,
"step": 58900
},
{
"epoch": 3.472324397492864,
"grad_norm": 6.9375,
"learning_rate": 8.631785766922507e-06,
"loss": 0.5084,
"step": 59000
},
{
"epoch": 3.478209693081835,
"grad_norm": 13.8125,
"learning_rate": 8.443014273982953e-06,
"loss": 0.5027,
"step": 59100
},
{
"epoch": 3.484094988670806,
"grad_norm": 17.75,
"learning_rate": 8.256238810004424e-06,
"loss": 0.5255,
"step": 59200
},
{
"epoch": 3.489980284259777,
"grad_norm": 10.8125,
"learning_rate": 8.071463446868899e-06,
"loss": 0.5119,
"step": 59300
},
{
"epoch": 3.495865579848748,
"grad_norm": 28.375,
"learning_rate": 7.888692212854165e-06,
"loss": 0.507,
"step": 59400
},
{
"epoch": 3.501750875437719,
"grad_norm": 19.875,
"learning_rate": 7.707929092546185e-06,
"loss": 0.5097,
"step": 59500
},
{
"epoch": 3.50763617102669,
"grad_norm": 8.0,
"learning_rate": 7.52917802675206e-06,
"loss": 0.5138,
"step": 59600
},
{
"epoch": 3.5135214666156607,
"grad_norm": 11.5,
"learning_rate": 7.352442912414259e-06,
"loss": 0.5213,
"step": 59700
},
{
"epoch": 3.519406762204632,
"grad_norm": 11.3125,
"learning_rate": 7.1777276025256075e-06,
"loss": 0.4977,
"step": 59800
},
{
"epoch": 3.5252920577936027,
"grad_norm": 10.625,
"learning_rate": 7.005035906045199e-06,
"loss": 0.5094,
"step": 59900
},
{
"epoch": 3.5311773533825734,
"grad_norm": 13.4375,
"learning_rate": 6.834371587815547e-06,
"loss": 0.5202,
"step": 60000
},
{
"epoch": 3.5370626489715447,
"grad_norm": 21.125,
"learning_rate": 6.665738368480301e-06,
"loss": 0.5069,
"step": 60100
},
{
"epoch": 3.5429479445605154,
"grad_norm": 12.0625,
"learning_rate": 6.4991399244033306e-06,
"loss": 0.5218,
"step": 60200
},
{
"epoch": 3.5488332401494866,
"grad_norm": 6.875,
"learning_rate": 6.334579887588377e-06,
"loss": 0.5049,
"step": 60300
},
{
"epoch": 3.5547185357384574,
"grad_norm": 12.0,
"learning_rate": 6.172061845600053e-06,
"loss": 0.5291,
"step": 60400
},
{
"epoch": 3.5606038313274286,
"grad_norm": 36.5,
"learning_rate": 6.011589341485524e-06,
"loss": 0.5136,
"step": 60500
},
{
"epoch": 3.5664891269163994,
"grad_norm": 27.375,
"learning_rate": 5.8531658736972524e-06,
"loss": 0.5103,
"step": 60600
},
{
"epoch": 3.57237442250537,
"grad_norm": 10.6875,
"learning_rate": 5.696794896016866e-06,
"loss": 0.5087,
"step": 60700
},
{
"epoch": 3.5782597180943414,
"grad_norm": 24.75,
"learning_rate": 5.542479817479651e-06,
"loss": 0.5077,
"step": 60800
},
{
"epoch": 3.584145013683312,
"grad_norm": 22.875,
"learning_rate": 5.390224002300437e-06,
"loss": 0.5295,
"step": 60900
},
{
"epoch": 3.590030309272283,
"grad_norm": 11.1875,
"learning_rate": 5.240030769800108e-06,
"loss": 0.52,
"step": 61000
},
{
"epoch": 3.595915604861254,
"grad_norm": 27.75,
"learning_rate": 5.091903394333331e-06,
"loss": 0.5079,
"step": 61100
},
{
"epoch": 3.6018009004502254,
"grad_norm": 14.25,
"learning_rate": 4.945845105217117e-06,
"loss": 0.5164,
"step": 61200
},
{
"epoch": 3.607686196039196,
"grad_norm": 6.4375,
"learning_rate": 4.801859086660387e-06,
"loss": 0.5226,
"step": 61300
},
{
"epoch": 3.613571491628167,
"grad_norm": 38.25,
"learning_rate": 4.659948477694709e-06,
"loss": 0.5266,
"step": 61400
},
{
"epoch": 3.619456787217138,
"grad_norm": 17.0,
"learning_rate": 4.520116372105665e-06,
"loss": 0.5286,
"step": 61500
},
{
"epoch": 3.625342082806109,
"grad_norm": 11.3125,
"learning_rate": 4.382365818365552e-06,
"loss": 0.4915,
"step": 61600
},
{
"epoch": 3.6312273783950797,
"grad_norm": 26.375,
"learning_rate": 4.246699819566824e-06,
"loss": 0.5006,
"step": 61700
},
{
"epoch": 3.637112673984051,
"grad_norm": 6.0,
"learning_rate": 4.1131213333566846e-06,
"loss": 0.5007,
"step": 61800
},
{
"epoch": 3.6429979695730217,
"grad_norm": 5.34375,
"learning_rate": 3.981633271872598e-06,
"loss": 0.5202,
"step": 61900
},
{
"epoch": 3.648883265161993,
"grad_norm": 11.4375,
"learning_rate": 3.852238501678751e-06,
"loss": 0.5159,
"step": 62000
},
{
"epoch": 3.6547685607509637,
"grad_norm": 5.71875,
"learning_rate": 3.7249398437036454e-06,
"loss": 0.511,
"step": 62100
},
{
"epoch": 3.660653856339935,
"grad_norm": 27.125,
"learning_rate": 3.5997400731785258e-06,
"loss": 0.5217,
"step": 62200
},
{
"epoch": 3.6665391519289057,
"grad_norm": 5.53125,
"learning_rate": 3.4766419195769285e-06,
"loss": 0.5074,
"step": 62300
},
{
"epoch": 3.6724244475178764,
"grad_norm": 14.875,
"learning_rate": 3.355648066555117e-06,
"loss": 0.5022,
"step": 62400
},
{
"epoch": 3.6783097431068477,
"grad_norm": 11.8125,
"learning_rate": 3.236761151893608e-06,
"loss": 0.501,
"step": 62500
},
{
"epoch": 3.6841950386958184,
"grad_norm": 36.5,
"learning_rate": 3.119983767439705e-06,
"loss": 0.5139,
"step": 62600
},
{
"epoch": 3.690080334284789,
"grad_norm": 25.0,
"learning_rate": 3.005318459050932e-06,
"loss": 0.5286,
"step": 62700
},
{
"epoch": 3.6959656298737604,
"grad_norm": 9.6875,
"learning_rate": 2.892767726539569e-06,
"loss": 0.524,
"step": 62800
},
{
"epoch": 3.701850925462731,
"grad_norm": 8.1875,
"learning_rate": 2.7823340236181162e-06,
"loss": 0.5196,
"step": 62900
},
{
"epoch": 3.7077362210517024,
"grad_norm": 19.625,
"learning_rate": 2.674019757845847e-06,
"loss": 0.5073,
"step": 63000
},
{
"epoch": 3.713621516640673,
"grad_norm": 16.375,
"learning_rate": 2.567827290576297e-06,
"loss": 0.5043,
"step": 63100
},
{
"epoch": 3.7195068122296444,
"grad_norm": 7.46875,
"learning_rate": 2.463758936905758e-06,
"loss": 0.5134,
"step": 63200
},
{
"epoch": 3.725392107818615,
"grad_norm": 16.125,
"learning_rate": 2.3618169656228873e-06,
"loss": 0.5175,
"step": 63300
},
{
"epoch": 3.731277403407586,
"grad_norm": 7.21875,
"learning_rate": 2.2620035991591238e-06,
"loss": 0.5269,
"step": 63400
},
{
"epoch": 3.737162698996557,
"grad_norm": 19.75,
"learning_rate": 2.1643210135403825e-06,
"loss": 0.5021,
"step": 63500
},
{
"epoch": 3.743047994585528,
"grad_norm": 5.96875,
"learning_rate": 2.06877133833947e-06,
"loss": 0.5249,
"step": 63600
},
{
"epoch": 3.748933290174499,
"grad_norm": 17.625,
"learning_rate": 1.97535665662979e-06,
"loss": 0.5282,
"step": 63700
},
{
"epoch": 3.75481858576347,
"grad_norm": 13.875,
"learning_rate": 1.8840790049398095e-06,
"loss": 0.5088,
"step": 63800
},
{
"epoch": 3.760703881352441,
"grad_norm": 14.3125,
"learning_rate": 1.7949403732087311e-06,
"loss": 0.5365,
"step": 63900
},
{
"epoch": 3.766589176941412,
"grad_norm": 19.875,
"learning_rate": 1.7079427047431485e-06,
"loss": 0.5084,
"step": 64000
},
{
"epoch": 3.7724744725303827,
"grad_norm": 6.34375,
"learning_rate": 1.6230878961745577e-06,
"loss": 0.5067,
"step": 64100
},
{
"epoch": 3.778359768119354,
"grad_norm": 14.125,
"learning_rate": 1.5403777974181354e-06,
"loss": 0.5016,
"step": 64200
},
{
"epoch": 3.7842450637083247,
"grad_norm": 9.0,
"learning_rate": 1.4598142116323156e-06,
"loss": 0.5285,
"step": 64300
},
{
"epoch": 3.7901303592972955,
"grad_norm": 22.25,
"learning_rate": 1.3813988951795421e-06,
"loss": 0.5291,
"step": 64400
},
{
"epoch": 3.7960156548862667,
"grad_norm": 7.15625,
"learning_rate": 1.3051335575879341e-06,
"loss": 0.4998,
"step": 64500
},
{
"epoch": 3.8019009504752375,
"grad_norm": 21.0,
"learning_rate": 1.23101986151406e-06,
"loss": 0.5114,
"step": 64600
},
{
"epoch": 3.8077862460642087,
"grad_norm": 18.25,
"learning_rate": 1.1590594227066542e-06,
"loss": 0.5212,
"step": 64700
},
{
"epoch": 3.8136715416531795,
"grad_norm": 21.25,
"learning_rate": 1.0892538099714023e-06,
"loss": 0.5245,
"step": 64800
},
{
"epoch": 3.8195568372421507,
"grad_norm": 7.03125,
"learning_rate": 1.0216045451367452e-06,
"loss": 0.5021,
"step": 64900
},
{
"epoch": 3.8254421328311214,
"grad_norm": 5.875,
"learning_rate": 9.561131030206837e-07,
"loss": 0.5257,
"step": 65000
},
{
"epoch": 3.831327428420092,
"grad_norm": 12.5625,
"learning_rate": 8.927809113986607e-07,
"loss": 0.5224,
"step": 65100
},
{
"epoch": 3.8372127240090634,
"grad_norm": 6.40625,
"learning_rate": 8.316093509724066e-07,
"loss": 0.5038,
"step": 65200
},
{
"epoch": 3.843098019598034,
"grad_norm": 5.625,
"learning_rate": 7.725997553398534e-07,
"loss": 0.5153,
"step": 65300
},
{
"epoch": 3.8489833151870054,
"grad_norm": 11.9375,
"learning_rate": 7.157534109660358e-07,
"loss": 0.4947,
"step": 65400
},
{
"epoch": 3.854868610775976,
"grad_norm": 18.25,
"learning_rate": 6.610715571550796e-07,
"loss": 0.4974,
"step": 65500
},
{
"epoch": 3.8607539063649474,
"grad_norm": 12.125,
"learning_rate": 6.085553860231685e-07,
"loss": 0.498,
"step": 65600
},
{
"epoch": 3.866639201953918,
"grad_norm": 15.125,
"learning_rate": 5.582060424725421e-07,
"loss": 0.5182,
"step": 65700
},
{
"epoch": 3.872524497542889,
"grad_norm": 17.375,
"learning_rate": 5.100246241665496e-07,
"loss": 0.5096,
"step": 65800
},
{
"epoch": 3.87840979313186,
"grad_norm": 10.875,
"learning_rate": 4.640121815057241e-07,
"loss": 0.537,
"step": 65900
},
{
"epoch": 3.884295088720831,
"grad_norm": 13.75,
"learning_rate": 4.201697176048791e-07,
"loss": 0.5069,
"step": 66000
},
{
"epoch": 3.8901803843098017,
"grad_norm": 5.28125,
"learning_rate": 3.7849818827121465e-07,
"loss": 0.5089,
"step": 66100
},
{
"epoch": 3.896065679898773,
"grad_norm": 17.125,
"learning_rate": 3.38998501983534e-07,
"loss": 0.5166,
"step": 66200
},
{
"epoch": 3.9019509754877437,
"grad_norm": 14.8125,
"learning_rate": 3.0167151987238187e-07,
"loss": 0.5002,
"step": 66300
},
{
"epoch": 3.907836271076715,
"grad_norm": 9.1875,
"learning_rate": 2.665180557013147e-07,
"loss": 0.5074,
"step": 66400
},
{
"epoch": 3.9137215666656857,
"grad_norm": 11.125,
"learning_rate": 2.3353887584911528e-07,
"loss": 0.5059,
"step": 66500
},
{
"epoch": 3.919606862254657,
"grad_norm": 5.4375,
"learning_rate": 2.0273469929313893e-07,
"loss": 0.5305,
"step": 66600
},
{
"epoch": 3.9254921578436277,
"grad_norm": 20.5,
"learning_rate": 1.7410619759358204e-07,
"loss": 0.5114,
"step": 66700
},
{
"epoch": 3.9313774534325985,
"grad_norm": 11.3125,
"learning_rate": 1.4765399487889352e-07,
"loss": 0.5084,
"step": 66800
},
{
"epoch": 3.9372627490215697,
"grad_norm": 13.375,
"learning_rate": 1.2337866783211915e-07,
"loss": 0.5048,
"step": 66900
},
{
"epoch": 3.9431480446105405,
"grad_norm": 16.125,
"learning_rate": 1.012807456783782e-07,
"loss": 0.5414,
"step": 67000
},
{
"epoch": 3.9490333401995112,
"grad_norm": 16.25,
"learning_rate": 8.136071017330604e-08,
"loss": 0.5128,
"step": 67100
},
{
"epoch": 3.9549186357884825,
"grad_norm": 15.0625,
"learning_rate": 6.361899559250705e-08,
"loss": 0.5239,
"step": 67200
},
{
"epoch": 3.9608039313774537,
"grad_norm": 20.625,
"learning_rate": 4.8055988722162106e-08,
"loss": 0.508,
"step": 67300
},
{
"epoch": 3.9666892269664245,
"grad_norm": 9.25,
"learning_rate": 3.467202885056864e-08,
"loss": 0.5171,
"step": 67400
},
{
"epoch": 3.9725745225553952,
"grad_norm": 8.25,
"learning_rate": 2.346740776070222e-08,
"loss": 0.5199,
"step": 67500
},
{
"epoch": 3.9784598181443664,
"grad_norm": 11.5,
"learning_rate": 1.4442369723932648e-08,
"loss": 0.4939,
"step": 67600
},
{
"epoch": 3.984345113733337,
"grad_norm": 21.0,
"learning_rate": 7.597111494606069e-09,
"loss": 0.5275,
"step": 67700
},
{
"epoch": 3.990230409322308,
"grad_norm": 5.4375,
"learning_rate": 2.9317823058483405e-09,
"loss": 0.5191,
"step": 67800
},
{
"epoch": 3.996115704911279,
"grad_norm": 6.96875,
"learning_rate": 4.464838662454618e-10,
"loss": 0.5112,
"step": 67900
}
],
"logging_steps": 100,
"max_steps": 67964,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 4000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.755619563970458e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}