WYBar's picture
update llm ckpt
6d40b8e
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.819548872180452,
"eval_steps": 1500,
"global_step": 26000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"ce_loss": 4.768589496612549,
"epoch": 0,
"inp_emb_norm": 0.265625,
"loss": 4.768589496612549,
"masked_top1": 0.0,
"masked_top5": 0.4098360538482666,
"step": 0,
"top1": 64.96458435058594,
"top5": 80.73052215576172
},
{
"epoch": 0.02,
"grad_norm": 6.2690516537633005,
"learning_rate": 0.0001,
"loss": 4.6296,
"step": 50
},
{
"ce_loss": 4.672832765579224,
"epoch": 0.02,
"inp_emb_norm": 0.26263671875,
"loss": 4.672832765579224,
"masked_top1": 14.987861804962158,
"masked_top5": 24.83079730987549,
"step": 50,
"top1": 66.08985641002656,
"top5": 81.12573463439941
},
{
"epoch": 0.03,
"grad_norm": 9.593317242373375,
"learning_rate": 0.0001,
"loss": 2.2246,
"step": 100
},
{
"ce_loss": 2.207055723667145,
"epoch": 0.03,
"inp_emb_norm": 0.2644140625,
"loss": 2.207055723667145,
"masked_top1": 23.22451873779297,
"masked_top5": 38.51483547210693,
"step": 100,
"top1": 74.46695602416992,
"top5": 87.36076766967773
},
{
"epoch": 0.05,
"grad_norm": 3.1307366740911395,
"learning_rate": 0.0001,
"loss": 2.0857,
"step": 150
},
{
"ce_loss": 2.077478907108307,
"epoch": 0.05,
"inp_emb_norm": 0.26427734375,
"loss": 2.077478907108307,
"masked_top1": 27.615498123168944,
"masked_top5": 46.10835479736328,
"step": 150,
"top1": 75.29274810791016,
"top5": 88.44302017211913
},
{
"epoch": 0.06,
"grad_norm": 2.714681744263656,
"learning_rate": 0.0001,
"loss": 2.0248,
"step": 200
},
{
"ce_loss": 2.018527669906616,
"epoch": 0.06,
"inp_emb_norm": 0.2655859375,
"loss": 2.018527669906616,
"masked_top1": 29.403907699584963,
"masked_top5": 48.57966896057129,
"step": 200,
"top1": 75.77816040039062,
"top5": 88.9672624206543
},
{
"epoch": 0.08,
"grad_norm": 2.4811381161353423,
"learning_rate": 0.0001,
"loss": 1.9651,
"step": 250
},
{
"ce_loss": 1.9264942455291747,
"epoch": 0.08,
"inp_emb_norm": 0.2648828125,
"loss": 1.9264942455291747,
"masked_top1": 32.32564323425293,
"masked_top5": 51.88160499572754,
"step": 250,
"top1": 76.65794830322265,
"top5": 89.52642883300781
},
{
"epoch": 0.09,
"grad_norm": 2.160461687507101,
"learning_rate": 0.0001,
"loss": 1.9061,
"step": 300
},
{
"ce_loss": 1.9086420011520386,
"epoch": 0.09,
"inp_emb_norm": 0.26564453125,
"loss": 1.9086420011520386,
"masked_top1": 32.26125560760498,
"masked_top5": 52.74158630371094,
"step": 300,
"top1": 76.66858810424804,
"top5": 89.8210075378418
},
{
"epoch": 0.11,
"grad_norm": 2.173027962906511,
"learning_rate": 0.0001,
"loss": 1.8717,
"step": 350
},
{
"ce_loss": 1.856195902824402,
"epoch": 0.11,
"inp_emb_norm": 0.27287109375,
"loss": 1.856195902824402,
"masked_top1": 33.80865501403809,
"masked_top5": 54.50813941955566,
"step": 350,
"top1": 77.32864608764649,
"top5": 90.11031524658203
},
{
"epoch": 0.12,
"grad_norm": 2.09826482925296,
"learning_rate": 0.0001,
"loss": 1.8414,
"step": 400
},
{
"ce_loss": 1.8570138716697693,
"epoch": 0.12,
"inp_emb_norm": 0.26978515625,
"loss": 1.8570138716697693,
"masked_top1": 34.77134052276611,
"masked_top5": 55.27866950988769,
"step": 400,
"top1": 77.23032119750977,
"top5": 90.24968856811523
},
{
"epoch": 0.14,
"grad_norm": 2.6584647355277067,
"learning_rate": 0.0001,
"loss": 1.8405,
"step": 450
},
{
"ce_loss": 1.8444490694999696,
"epoch": 0.14,
"inp_emb_norm": 0.27064453125,
"loss": 1.8444490694999696,
"masked_top1": 34.76882873535156,
"masked_top5": 54.58673110961914,
"step": 450,
"top1": 77.44166137695312,
"top5": 90.14545852661132
},
{
"epoch": 0.15,
"grad_norm": 1.9115658205853148,
"learning_rate": 0.0001,
"loss": 1.7976,
"step": 500
},
{
"ce_loss": 1.804072494506836,
"epoch": 0.15,
"inp_emb_norm": 0.2703515625,
"loss": 1.804072494506836,
"masked_top1": 34.5311047744751,
"masked_top5": 55.86459785461426,
"step": 500,
"top1": 77.51275192260742,
"top5": 90.69894668579101
},
{
"epoch": 0.17,
"grad_norm": 1.907634541206245,
"learning_rate": 0.0001,
"loss": 1.804,
"step": 550
},
{
"ce_loss": 1.7778214049339294,
"epoch": 0.17,
"inp_emb_norm": 0.27109375,
"loss": 1.7778214049339294,
"masked_top1": 35.83196727752686,
"masked_top5": 56.62046134948731,
"step": 550,
"top1": 78.00231246948242,
"top5": 90.66332412719727
},
{
"epoch": 0.18,
"grad_norm": 1.7366360206125175,
"learning_rate": 0.0001,
"loss": 1.7783,
"step": 600
},
{
"ce_loss": 1.7576907467842102,
"epoch": 0.18,
"inp_emb_norm": 0.2725,
"loss": 1.7576907467842102,
"masked_top1": 36.732100868225096,
"masked_top5": 57.86395164489746,
"step": 600,
"top1": 78.34247482299804,
"top5": 90.82281936645508
},
{
"epoch": 0.2,
"grad_norm": 1.9186053120285074,
"learning_rate": 0.0001,
"loss": 1.7819,
"step": 650
},
{
"ce_loss": 1.8032070183753968,
"epoch": 0.2,
"inp_emb_norm": 0.27015625,
"loss": 1.8032070183753968,
"masked_top1": 35.763907241821286,
"masked_top5": 55.90799743652344,
"step": 650,
"top1": 77.69748565673828,
"top5": 90.52577987670898
},
{
"epoch": 0.21,
"grad_norm": 1.957034200299797,
"learning_rate": 0.0001,
"loss": 1.735,
"step": 700
},
{
"ce_loss": 1.7470700669288635,
"epoch": 0.21,
"inp_emb_norm": 0.27248046875,
"loss": 1.7470700669288635,
"masked_top1": 37.93720417022705,
"masked_top5": 58.685314331054684,
"step": 700,
"top1": 78.27062789916992,
"top5": 90.87748138427735
},
{
"epoch": 0.23,
"grad_norm": 2.045975232733077,
"learning_rate": 0.0001,
"loss": 1.7517,
"step": 750
},
{
"ce_loss": 1.754687945842743,
"epoch": 0.23,
"inp_emb_norm": 0.27109375,
"loss": 1.754687945842743,
"masked_top1": 36.862719764709475,
"masked_top5": 57.64240478515625,
"step": 750,
"top1": 77.98610443115234,
"top5": 90.91637084960938
},
{
"epoch": 0.24,
"grad_norm": 2.1866870928510544,
"learning_rate": 0.0001,
"loss": 1.7225,
"step": 800
},
{
"ce_loss": 1.7393132853507995,
"epoch": 0.24,
"inp_emb_norm": 0.27255859375,
"loss": 1.7393132853507995,
"masked_top1": 36.725569190979,
"masked_top5": 58.16348899841309,
"step": 800,
"top1": 78.34141525268555,
"top5": 91.05248138427734
},
{
"epoch": 0.26,
"grad_norm": 1.9045759228508574,
"learning_rate": 0.0001,
"loss": 1.7212,
"step": 850
},
{
"ce_loss": 1.7258725309371947,
"epoch": 0.26,
"inp_emb_norm": 0.27572265625,
"loss": 1.7258725309371947,
"masked_top1": 37.848323364257816,
"masked_top5": 58.37759643554688,
"step": 850,
"top1": 78.46119873046875,
"top5": 91.0196517944336
},
{
"epoch": 0.27,
"grad_norm": 2.0335701239987896,
"learning_rate": 0.0001,
"loss": 1.7209,
"step": 900
},
{
"ce_loss": 1.7411377978324891,
"epoch": 0.27,
"inp_emb_norm": 0.27140625,
"loss": 1.7411377978324891,
"masked_top1": 37.40238594055176,
"masked_top5": 58.47159057617188,
"step": 900,
"top1": 78.22477142333985,
"top5": 91.09677703857422
},
{
"epoch": 0.29,
"grad_norm": 1.7613813093122805,
"learning_rate": 0.0001,
"loss": 1.6991,
"step": 950
},
{
"ce_loss": 1.7078413605690002,
"epoch": 0.29,
"inp_emb_norm": 0.27345703125,
"loss": 1.7078413605690002,
"masked_top1": 37.8476549911499,
"masked_top5": 58.585325393676754,
"step": 950,
"top1": 78.66676864624023,
"top5": 91.18907363891601
},
{
"epoch": 0.3,
"grad_norm": 1.746493072421527,
"learning_rate": 0.0001,
"loss": 1.7029,
"step": 1000
},
{
"ce_loss": 1.6898603582382201,
"epoch": 0.3,
"inp_emb_norm": 0.275859375,
"loss": 1.6898603582382201,
"masked_top1": 38.61885482788086,
"masked_top5": 60.25848365783691,
"step": 1000,
"top1": 78.78071334838867,
"top5": 91.36259521484375
},
{
"epoch": 0.32,
"grad_norm": 1.9274436772647008,
"learning_rate": 0.0001,
"loss": 1.6826,
"step": 1050
},
{
"ce_loss": 1.68393492937088,
"epoch": 0.32,
"inp_emb_norm": 0.2751171875,
"loss": 1.68393492937088,
"masked_top1": 38.2902375793457,
"masked_top5": 59.33997863769531,
"step": 1050,
"top1": 78.7706575012207,
"top5": 91.35841613769531
},
{
"epoch": 0.33,
"grad_norm": 1.7855509766510127,
"learning_rate": 0.0001,
"loss": 1.6718,
"step": 1100
},
{
"ce_loss": 1.6736824607849121,
"epoch": 0.33,
"inp_emb_norm": 0.276640625,
"loss": 1.6736824607849121,
"masked_top1": 37.7931421661377,
"masked_top5": 59.2380428314209,
"step": 1100,
"top1": 78.76599548339844,
"top5": 91.49175521850586
},
{
"epoch": 0.35,
"grad_norm": 1.9167748999099827,
"learning_rate": 0.0001,
"loss": 1.6706,
"step": 1150
},
{
"ce_loss": 1.6615458488464356,
"epoch": 0.35,
"inp_emb_norm": 0.27826171875,
"loss": 1.6615458488464356,
"masked_top1": 37.01531482696533,
"masked_top5": 58.88155044555664,
"step": 1150,
"top1": 79.07698608398438,
"top5": 91.33328231811524
},
{
"epoch": 0.36,
"grad_norm": 1.7556147486850162,
"learning_rate": 0.0001,
"loss": 1.6474,
"step": 1200
},
{
"ce_loss": 1.6442325162887572,
"epoch": 0.36,
"inp_emb_norm": 0.28015625,
"loss": 1.6442325162887572,
"masked_top1": 38.984904556274415,
"masked_top5": 60.31360260009765,
"step": 1200,
"top1": 79.19950073242188,
"top5": 91.51022766113282
},
{
"epoch": 0.38,
"grad_norm": 2.0171772312366834,
"learning_rate": 0.0001,
"loss": 1.6388,
"step": 1250
},
{
"ce_loss": 1.5983400893211366,
"epoch": 0.38,
"inp_emb_norm": 0.2769921875,
"loss": 1.5983400893211366,
"masked_top1": 39.851169929504394,
"masked_top5": 60.379223403930666,
"step": 1250,
"top1": 79.51817199707031,
"top5": 91.8211392211914
},
{
"epoch": 0.39,
"grad_norm": 1.6538008290605235,
"learning_rate": 0.0001,
"loss": 1.6453,
"step": 1300
},
{
"ce_loss": 1.6439322781562806,
"epoch": 0.39,
"inp_emb_norm": 0.27740234375,
"loss": 1.6439322781562806,
"masked_top1": 38.07537261962891,
"masked_top5": 59.28539070129395,
"step": 1300,
"top1": 79.08166000366211,
"top5": 91.56427230834962
},
{
"epoch": 0.41,
"grad_norm": 1.6552303659425454,
"learning_rate": 0.0001,
"loss": 1.6314,
"step": 1350
},
{
"ce_loss": 1.6322034692764282,
"epoch": 0.41,
"inp_emb_norm": 0.2801953125,
"loss": 1.6322034692764282,
"masked_top1": 38.92374729156494,
"masked_top5": 60.256018524169924,
"step": 1350,
"top1": 79.35121063232423,
"top5": 91.62414169311523
},
{
"epoch": 0.42,
"grad_norm": 1.4892198882479661,
"learning_rate": 0.0001,
"loss": 1.6421,
"step": 1400
},
{
"ce_loss": 1.6463946628570556,
"epoch": 0.42,
"inp_emb_norm": 0.27763671875,
"loss": 1.6463946628570556,
"masked_top1": 37.44570774078369,
"masked_top5": 59.385975952148435,
"step": 1400,
"top1": 79.01243072509766,
"top5": 91.58177993774414
},
{
"epoch": 0.44,
"grad_norm": 1.5892663240327325,
"learning_rate": 0.0001,
"loss": 1.6319,
"step": 1450
},
{
"ce_loss": 1.622364718914032,
"epoch": 0.44,
"inp_emb_norm": 0.2815625,
"loss": 1.622364718914032,
"masked_top1": 37.010132484436035,
"masked_top5": 58.88730949401855,
"step": 1450,
"top1": 79.29409576416016,
"top5": 91.61080490112305
},
{
"epoch": 0.45,
"grad_norm": 1.5298220602979866,
"learning_rate": 0.0001,
"loss": 1.6288,
"step": 1500
},
{
"ce_loss": 1.672783522605896,
"epoch": 0.45,
"inp_emb_norm": 0.2782421875,
"loss": 1.672783522605896,
"masked_top1": 37.74873733520508,
"masked_top5": 59.34834083557129,
"step": 1500,
"top1": 78.78699188232422,
"top5": 91.44515747070312
},
{
"epoch": 0.47,
"grad_norm": 1.6154188043026179,
"learning_rate": 0.0001,
"loss": 1.6151,
"step": 1550
},
{
"ce_loss": 1.5997060227394104,
"epoch": 0.47,
"inp_emb_norm": 0.280390625,
"loss": 1.5997060227394104,
"masked_top1": 39.32766487121582,
"masked_top5": 60.36418632507324,
"step": 1550,
"top1": 79.40666061401367,
"top5": 91.93371612548827
},
{
"epoch": 0.48,
"grad_norm": 1.3771816353205768,
"learning_rate": 0.0001,
"loss": 1.6084,
"step": 1600
},
{
"ce_loss": 1.5898131847381591,
"epoch": 0.48,
"inp_emb_norm": 0.278984375,
"loss": 1.5898131847381591,
"masked_top1": 38.975818939208985,
"masked_top5": 60.515014724731444,
"step": 1600,
"top1": 79.69720977783203,
"top5": 91.98071060180663
},
{
"epoch": 0.5,
"grad_norm": 1.452663822317599,
"learning_rate": 0.0001,
"loss": 1.6124,
"step": 1650
},
{
"ce_loss": 1.6319751167297363,
"epoch": 0.5,
"inp_emb_norm": 0.28296875,
"loss": 1.6319751167297363,
"masked_top1": 39.17121643066406,
"masked_top5": 60.53929168701172,
"step": 1650,
"top1": 79.40355041503906,
"top5": 91.61957992553711
},
{
"epoch": 0.51,
"grad_norm": 1.6649862877803743,
"learning_rate": 0.0001,
"loss": 1.6168,
"step": 1700
},
{
"ce_loss": 1.6018091797828675,
"epoch": 0.51,
"inp_emb_norm": 0.2794921875,
"loss": 1.6018091797828675,
"masked_top1": 38.38370750427246,
"masked_top5": 60.53903656005859,
"step": 1700,
"top1": 79.40913070678711,
"top5": 91.92860076904297
},
{
"epoch": 0.53,
"grad_norm": 1.900108786790241,
"learning_rate": 0.0001,
"loss": 1.591,
"step": 1750
},
{
"ce_loss": 1.6045577454566955,
"epoch": 0.53,
"inp_emb_norm": 0.2825,
"loss": 1.6045577454566955,
"masked_top1": 38.33882637023926,
"masked_top5": 60.6797013092041,
"step": 1750,
"top1": 79.38134246826172,
"top5": 91.90715942382812
},
{
"epoch": 0.54,
"grad_norm": 1.669989934133695,
"learning_rate": 0.0001,
"loss": 1.5798,
"step": 1800
},
{
"ce_loss": 1.575341019630432,
"epoch": 0.54,
"inp_emb_norm": 0.2808984375,
"loss": 1.575341019630432,
"masked_top1": 39.37454727172852,
"masked_top5": 60.28605743408203,
"step": 1800,
"top1": 79.7763461303711,
"top5": 91.9139274597168
},
{
"epoch": 0.56,
"grad_norm": 1.4991127060714822,
"learning_rate": 0.0001,
"loss": 1.5848,
"step": 1850
},
{
"ce_loss": 1.5776564478874207,
"epoch": 0.56,
"inp_emb_norm": 0.2787890625,
"loss": 1.5776564478874207,
"masked_top1": 38.988163757324216,
"masked_top5": 59.69200141906738,
"step": 1850,
"top1": 79.65292083740235,
"top5": 91.98164306640625
},
{
"epoch": 0.57,
"grad_norm": 1.7116902678486716,
"learning_rate": 0.0001,
"loss": 1.5799,
"step": 1900
},
{
"ce_loss": 1.574492063522339,
"epoch": 0.57,
"inp_emb_norm": 0.282421875,
"loss": 1.574492063522339,
"masked_top1": 40.138608627319336,
"masked_top5": 61.31516136169434,
"step": 1900,
"top1": 79.69049621582032,
"top5": 92.10322280883788
},
{
"epoch": 0.59,
"grad_norm": 1.3150771431721024,
"learning_rate": 0.0001,
"loss": 1.5593,
"step": 1950
},
{
"ce_loss": 1.5305517101287842,
"epoch": 0.59,
"inp_emb_norm": 0.2823046875,
"loss": 1.5305517101287842,
"masked_top1": 40.4295878982544,
"masked_top5": 61.966970901489255,
"step": 1950,
"top1": 79.98332046508789,
"top5": 92.32930023193359
},
{
"epoch": 0.6,
"grad_norm": 6.707754405699673,
"learning_rate": 0.0001,
"loss": 1.5863,
"step": 2000
},
{
"ce_loss": 1.5903752088546752,
"epoch": 0.6,
"inp_emb_norm": 0.2828515625,
"loss": 1.5903752088546752,
"masked_top1": 39.76962215423584,
"masked_top5": 61.7191081237793,
"step": 2000,
"top1": 79.53595993041992,
"top5": 92.0239190673828
},
{
"epoch": 0.62,
"grad_norm": 1.452524089856983,
"learning_rate": 0.0001,
"loss": 1.5773,
"step": 2050
},
{
"ce_loss": 1.5838685631752014,
"epoch": 0.62,
"inp_emb_norm": 0.2877734375,
"loss": 1.5838685631752014,
"masked_top1": 38.40034454345703,
"masked_top5": 60.153098831176756,
"step": 2050,
"top1": 79.58445373535156,
"top5": 91.9414045715332
},
{
"epoch": 0.63,
"grad_norm": 1.5517366506652683,
"learning_rate": 0.0001,
"loss": 1.5928,
"step": 2100
},
{
"ce_loss": 1.6009895205497742,
"epoch": 0.63,
"inp_emb_norm": 0.282578125,
"loss": 1.6009895205497742,
"masked_top1": 38.17198055267334,
"masked_top5": 59.90890487670899,
"step": 2100,
"top1": 79.57786758422851,
"top5": 91.74088973999024
},
{
"epoch": 0.65,
"grad_norm": 1.4696024906496812,
"learning_rate": 0.0001,
"loss": 1.5785,
"step": 2150
},
{
"ce_loss": 1.585225760936737,
"epoch": 0.65,
"inp_emb_norm": 0.28974609375,
"loss": 1.585225760936737,
"masked_top1": 39.15816062927246,
"masked_top5": 61.218558883666994,
"step": 2150,
"top1": 79.77144348144532,
"top5": 91.92766723632812
},
{
"epoch": 0.66,
"grad_norm": 1.4286648572662997,
"learning_rate": 0.0001,
"loss": 1.5676,
"step": 2200
},
{
"ce_loss": 1.5558442735671998,
"epoch": 0.66,
"inp_emb_norm": 0.2856640625,
"loss": 1.5558442735671998,
"masked_top1": 39.56588050842285,
"masked_top5": 61.39894416809082,
"step": 2200,
"top1": 79.9505500793457,
"top5": 92.23053100585938
},
{
"epoch": 0.68,
"grad_norm": 1.3408463458478854,
"learning_rate": 0.0001,
"loss": 1.5536,
"step": 2250
},
{
"ce_loss": 1.5132439875602721,
"epoch": 0.68,
"inp_emb_norm": 0.289609375,
"loss": 1.5132439875602721,
"masked_top1": 39.436230506896976,
"masked_top5": 61.361060333251956,
"step": 2250,
"top1": 80.31327835083007,
"top5": 92.35198455810547
},
{
"epoch": 0.69,
"grad_norm": 1.4789295388492802,
"learning_rate": 0.0001,
"loss": 1.5519,
"step": 2300
},
{
"ce_loss": 1.52454154253006,
"epoch": 0.69,
"inp_emb_norm": 0.2912109375,
"loss": 1.52454154253006,
"masked_top1": 39.47452976226807,
"masked_top5": 61.6866958618164,
"step": 2300,
"top1": 80.14813919067383,
"top5": 92.37121780395508
},
{
"epoch": 0.71,
"grad_norm": 1.3877519817560113,
"learning_rate": 0.0001,
"loss": 1.5542,
"step": 2350
},
{
"ce_loss": 1.5439094185829163,
"epoch": 0.71,
"inp_emb_norm": 0.29296875,
"loss": 1.5439094185829163,
"masked_top1": 38.476121711730954,
"masked_top5": 61.2806697845459,
"step": 2350,
"top1": 80.00807098388673,
"top5": 92.20030227661132
},
{
"epoch": 0.72,
"grad_norm": 1.534164522196606,
"learning_rate": 0.0001,
"loss": 1.5517,
"step": 2400
},
{
"ce_loss": 1.545496084690094,
"epoch": 0.72,
"inp_emb_norm": 0.291328125,
"loss": 1.545496084690094,
"masked_top1": 38.46687156677246,
"masked_top5": 61.44874328613281,
"step": 2400,
"top1": 79.93062316894532,
"top5": 92.19858032226563
},
{
"epoch": 0.74,
"grad_norm": 1.4019229482522402,
"learning_rate": 0.0001,
"loss": 1.5453,
"step": 2450
},
{
"ce_loss": 1.5521028304100037,
"epoch": 0.74,
"inp_emb_norm": 0.288125,
"loss": 1.5521028304100037,
"masked_top1": 40.88698040008545,
"masked_top5": 63.060478515625,
"step": 2450,
"top1": 79.97427581787109,
"top5": 92.30514190673829
},
{
"epoch": 0.75,
"grad_norm": 1.3288444515563334,
"learning_rate": 0.0001,
"loss": 1.5446,
"step": 2500
},
{
"ce_loss": 1.5541789364814758,
"epoch": 0.75,
"inp_emb_norm": 0.2872265625,
"loss": 1.5541789364814758,
"masked_top1": 40.732691040039064,
"masked_top5": 62.54548332214355,
"step": 2500,
"top1": 79.8016665649414,
"top5": 92.2079689025879
},
{
"epoch": 0.77,
"grad_norm": 1.2949056410682664,
"learning_rate": 0.0001,
"loss": 1.523,
"step": 2550
},
{
"ce_loss": 1.524161262512207,
"epoch": 0.77,
"inp_emb_norm": 0.2938671875,
"loss": 1.524161262512207,
"masked_top1": 39.2896460723877,
"masked_top5": 61.28740455627442,
"step": 2550,
"top1": 80.2130502319336,
"top5": 92.26818771362305
},
{
"epoch": 0.78,
"grad_norm": 1.376548356881607,
"learning_rate": 0.0001,
"loss": 1.5247,
"step": 2600
},
{
"ce_loss": 1.5120180606842042,
"epoch": 0.78,
"inp_emb_norm": 0.2912109375,
"loss": 1.5120180606842042,
"masked_top1": 40.32475761413574,
"masked_top5": 62.50459335327148,
"step": 2600,
"top1": 80.25805755615234,
"top5": 92.3787159729004
},
{
"epoch": 0.8,
"grad_norm": 1.446462800296954,
"learning_rate": 0.0001,
"loss": 1.5244,
"step": 2650
},
{
"ce_loss": 1.4864131617546081,
"epoch": 0.8,
"inp_emb_norm": 0.2887890625,
"loss": 1.4864131617546081,
"masked_top1": 41.43366180419922,
"masked_top5": 63.38628890991211,
"step": 2650,
"top1": 80.42174575805664,
"top5": 92.73545196533203
},
{
"epoch": 0.81,
"grad_norm": 1.5930817573922351,
"learning_rate": 0.0001,
"loss": 1.5321,
"step": 2700
},
{
"ce_loss": 1.5146442604064942,
"epoch": 0.81,
"inp_emb_norm": 0.29212890625,
"loss": 1.5146442604064942,
"masked_top1": 40.67323059082031,
"masked_top5": 63.30255233764648,
"step": 2700,
"top1": 80.21553787231446,
"top5": 92.49449493408203
},
{
"epoch": 0.83,
"grad_norm": 1.3154781061914393,
"learning_rate": 0.0001,
"loss": 1.5258,
"step": 2750
},
{
"ce_loss": 1.5432595777511597,
"epoch": 0.83,
"inp_emb_norm": 0.2902734375,
"loss": 1.5432595777511597,
"masked_top1": 40.71655921936035,
"masked_top5": 63.31653228759765,
"step": 2750,
"top1": 79.94630706787109,
"top5": 92.41331008911133
},
{
"epoch": 0.84,
"grad_norm": 1.4988480825688437,
"learning_rate": 0.0001,
"loss": 1.529,
"step": 2800
},
{
"ce_loss": 1.5197454857826234,
"epoch": 0.84,
"inp_emb_norm": 0.295234375,
"loss": 1.5197454857826234,
"masked_top1": 40.75451274871826,
"masked_top5": 63.20060585021972,
"step": 2800,
"top1": 80.25330627441406,
"top5": 92.41782470703124
},
{
"epoch": 0.86,
"grad_norm": 1.3953163621955647,
"learning_rate": 0.0001,
"loss": 1.5201,
"step": 2850
},
{
"ce_loss": 1.5061662626266479,
"epoch": 0.86,
"inp_emb_norm": 0.294375,
"loss": 1.5061662626266479,
"masked_top1": 41.16034164428711,
"masked_top5": 63.11306884765625,
"step": 2850,
"top1": 80.41659545898438,
"top5": 92.50812393188477
},
{
"epoch": 0.87,
"grad_norm": 1.3361207443145617,
"learning_rate": 0.0001,
"loss": 1.5185,
"step": 2900
},
{
"ce_loss": 1.48851407289505,
"epoch": 0.87,
"inp_emb_norm": 0.29296875,
"loss": 1.48851407289505,
"masked_top1": 41.136603927612306,
"masked_top5": 63.02009063720703,
"step": 2900,
"top1": 80.65627792358399,
"top5": 92.53072723388672
},
{
"epoch": 0.89,
"grad_norm": 1.2528196866924233,
"learning_rate": 0.0001,
"loss": 1.5154,
"step": 2950
},
{
"ce_loss": 1.511202063560486,
"epoch": 0.89,
"inp_emb_norm": 0.298671875,
"loss": 1.511202063560486,
"masked_top1": 39.16858932495117,
"masked_top5": 61.53038063049316,
"step": 2950,
"top1": 80.2845379638672,
"top5": 92.39456756591797
},
{
"epoch": 0.9,
"grad_norm": 1.4560813011403748,
"learning_rate": 0.0001,
"loss": 1.5365,
"step": 3000
},
{
"ce_loss": 1.5491148686408998,
"epoch": 0.9,
"inp_emb_norm": 0.297578125,
"loss": 1.5491148686408998,
"masked_top1": 40.18224685668945,
"masked_top5": 62.69388824462891,
"step": 3000,
"top1": 79.84650588989258,
"top5": 92.17584533691407
},
{
"epoch": 0.92,
"grad_norm": 1.2474903057292217,
"learning_rate": 0.0001,
"loss": 1.5079,
"step": 3050
},
{
"ce_loss": 1.510890085697174,
"epoch": 0.92,
"inp_emb_norm": 0.289453125,
"loss": 1.510890085697174,
"masked_top1": 41.284685897827146,
"masked_top5": 63.54399559020996,
"step": 3050,
"top1": 80.19790802001953,
"top5": 92.62230575561523
},
{
"epoch": 0.93,
"grad_norm": 1.4413477522514346,
"learning_rate": 0.0001,
"loss": 1.5008,
"step": 3100
},
{
"ce_loss": 1.505354859828949,
"epoch": 0.93,
"inp_emb_norm": 0.294140625,
"loss": 1.505354859828949,
"masked_top1": 40.861399726867674,
"masked_top5": 62.37717430114746,
"step": 3100,
"top1": 80.46924255371094,
"top5": 92.5052066040039
},
{
"epoch": 0.95,
"grad_norm": 1.359782937496998,
"learning_rate": 0.0001,
"loss": 1.4983,
"step": 3150
},
{
"ce_loss": 1.5110413646697998,
"epoch": 0.95,
"inp_emb_norm": 0.2980859375,
"loss": 1.5110413646697998,
"masked_top1": 42.160480155944825,
"masked_top5": 64.15993942260742,
"step": 3150,
"top1": 80.45739974975587,
"top5": 92.550869140625
},
{
"epoch": 0.96,
"grad_norm": 1.3582356861036782,
"learning_rate": 0.0001,
"loss": 1.5034,
"step": 3200
},
{
"ce_loss": 1.530618577003479,
"epoch": 0.96,
"inp_emb_norm": 0.29375,
"loss": 1.530618577003479,
"masked_top1": 38.9424352645874,
"masked_top5": 62.109956970214846,
"step": 3200,
"top1": 80.0709977722168,
"top5": 92.2861149597168
},
{
"epoch": 0.98,
"grad_norm": 2.5802552823497455,
"learning_rate": 0.0001,
"loss": 1.4946,
"step": 3250
},
{
"ce_loss": 1.477068486213684,
"epoch": 0.98,
"inp_emb_norm": 0.2976953125,
"loss": 1.477068486213684,
"masked_top1": 40.51057823181152,
"masked_top5": 63.397703323364254,
"step": 3250,
"top1": 80.50700180053711,
"top5": 92.68867340087891
},
{
"epoch": 0.99,
"grad_norm": 1.4547325164409564,
"learning_rate": 0.0001,
"loss": 1.5084,
"step": 3300
},
{
"ce_loss": 1.4906554532051086,
"epoch": 0.99,
"inp_emb_norm": 0.2973828125,
"loss": 1.4906554532051086,
"masked_top1": 40.54425048828125,
"masked_top5": 63.201170196533205,
"step": 3300,
"top1": 80.40962783813477,
"top5": 92.66117584228516
},
{
"epoch": 1.01,
"grad_norm": 1.211911337284348,
"learning_rate": 0.0001,
"loss": 1.3955,
"step": 3350
},
{
"ce_loss": 1.400897753238678,
"epoch": 1.01,
"inp_emb_norm": 0.3021875,
"loss": 1.400897753238678,
"masked_top1": 41.17978542327881,
"masked_top5": 63.33616775512695,
"step": 3350,
"top1": 81.19734100341798,
"top5": 93.10294403076172
},
{
"epoch": 1.02,
"grad_norm": 1.0903282402796672,
"learning_rate": 0.0001,
"loss": 1.2893,
"step": 3400
},
{
"ce_loss": 1.2788537430763245,
"epoch": 1.02,
"inp_emb_norm": 0.2989453125,
"loss": 1.2788537430763245,
"masked_top1": 40.09818046569824,
"masked_top5": 62.5667342376709,
"step": 3400,
"top1": 82.30339096069336,
"top5": 93.88133010864257
},
{
"epoch": 1.04,
"grad_norm": 1.5240484766936855,
"learning_rate": 0.0001,
"loss": 1.2964,
"step": 3450
},
{
"ce_loss": 1.303318452835083,
"epoch": 1.04,
"inp_emb_norm": 0.29703125,
"loss": 1.303318452835083,
"masked_top1": 42.162941246032716,
"masked_top5": 65.02184768676757,
"step": 3450,
"top1": 82.02912399291992,
"top5": 93.88583358764649
},
{
"epoch": 1.05,
"grad_norm": 1.3787785745538395,
"learning_rate": 0.0001,
"loss": 1.2852,
"step": 3500
},
{
"ce_loss": 1.276361472606659,
"epoch": 1.05,
"inp_emb_norm": 0.3021875,
"loss": 1.276361472606659,
"masked_top1": 41.940252990722655,
"masked_top5": 65.38093276977538,
"step": 3500,
"top1": 82.44789840698242,
"top5": 93.89565292358398
},
{
"epoch": 1.07,
"grad_norm": 1.3842728564490252,
"learning_rate": 0.0001,
"loss": 1.2978,
"step": 3550
},
{
"ce_loss": 1.3330717968940735,
"epoch": 1.07,
"inp_emb_norm": 0.29859375,
"loss": 1.3330717968940735,
"masked_top1": 41.01281318664551,
"masked_top5": 63.77486457824707,
"step": 3550,
"top1": 81.81988937377929,
"top5": 93.57796142578125
},
{
"epoch": 1.08,
"grad_norm": 1.2631108386132899,
"learning_rate": 0.0001,
"loss": 1.2931,
"step": 3600
},
{
"ce_loss": 1.2983617568016053,
"epoch": 1.08,
"inp_emb_norm": 0.2962109375,
"loss": 1.2983617568016053,
"masked_top1": 41.40194427490234,
"masked_top5": 64.76402061462403,
"step": 3600,
"top1": 82.0794775390625,
"top5": 93.74165817260742
},
{
"epoch": 1.1,
"grad_norm": 1.4399254065091216,
"learning_rate": 0.0001,
"loss": 1.3091,
"step": 3650
},
{
"ce_loss": 1.3062031197547912,
"epoch": 1.1,
"inp_emb_norm": 0.2998046875,
"loss": 1.3062031197547912,
"masked_top1": 40.85140724182129,
"masked_top5": 63.72927604675293,
"step": 3650,
"top1": 82.13722595214844,
"top5": 93.64862121582031
},
{
"epoch": 1.11,
"grad_norm": 1.2311319185042409,
"learning_rate": 0.0001,
"loss": 1.3088,
"step": 3700
},
{
"ce_loss": 1.2959584522247314,
"epoch": 1.11,
"inp_emb_norm": 0.304296875,
"loss": 1.2959584522247314,
"masked_top1": 41.39893436431885,
"masked_top5": 63.62430404663086,
"step": 3700,
"top1": 82.33803344726563,
"top5": 93.6450895690918
},
{
"epoch": 1.13,
"grad_norm": 1.1358156745617467,
"learning_rate": 0.0001,
"loss": 1.3131,
"step": 3750
},
{
"ce_loss": 1.29275639295578,
"epoch": 1.13,
"inp_emb_norm": 0.3049609375,
"loss": 1.29275639295578,
"masked_top1": 42.64406570434571,
"masked_top5": 65.42192581176758,
"step": 3750,
"top1": 82.30059600830079,
"top5": 93.80231704711915
},
{
"epoch": 1.14,
"grad_norm": 1.1800593283969816,
"learning_rate": 0.0001,
"loss": 1.2969,
"step": 3800
},
{
"ce_loss": 1.301748011112213,
"epoch": 1.14,
"inp_emb_norm": 0.2975390625,
"loss": 1.301748011112213,
"masked_top1": 41.66663146972656,
"masked_top5": 64.09541954040527,
"step": 3800,
"top1": 82.05701431274414,
"top5": 93.72319717407227
},
{
"epoch": 1.16,
"grad_norm": 1.850155610052983,
"learning_rate": 0.0001,
"loss": 1.3153,
"step": 3850
},
{
"ce_loss": 1.2972828006744386,
"epoch": 1.16,
"inp_emb_norm": 0.3021875,
"loss": 1.2972828006744386,
"masked_top1": 40.78549217224121,
"masked_top5": 64.16069633483886,
"step": 3850,
"top1": 82.19980209350587,
"top5": 93.7464599609375
},
{
"epoch": 1.17,
"grad_norm": 1.2715739238746637,
"learning_rate": 0.0001,
"loss": 1.2995,
"step": 3900
},
{
"ce_loss": 1.3045063495635987,
"epoch": 1.17,
"inp_emb_norm": 0.3047265625,
"loss": 1.3045063495635987,
"masked_top1": 40.932076416015626,
"masked_top5": 64.15639678955078,
"step": 3900,
"top1": 82.09319473266602,
"top5": 93.77517272949218
},
{
"epoch": 1.19,
"grad_norm": 1.3796703227080724,
"learning_rate": 0.0001,
"loss": 1.3071,
"step": 3950
},
{
"ce_loss": 1.3098111128807068,
"epoch": 1.19,
"inp_emb_norm": 0.297109375,
"loss": 1.3098111128807068,
"masked_top1": 41.91473709106445,
"masked_top5": 64.43979515075684,
"step": 3950,
"top1": 81.8616471862793,
"top5": 93.78232650756836
},
{
"epoch": 1.2,
"grad_norm": 1.1546364550490216,
"learning_rate": 0.0001,
"loss": 1.3098,
"step": 4000
},
{
"ce_loss": 1.3013187646865845,
"epoch": 1.2,
"inp_emb_norm": 0.30296875,
"loss": 1.3013187646865845,
"masked_top1": 41.120027198791504,
"masked_top5": 63.108745346069334,
"step": 4000,
"top1": 82.13422164916992,
"top5": 93.6265623474121
},
{
"epoch": 1.22,
"grad_norm": 1.2515420301071278,
"learning_rate": 0.0001,
"loss": 1.3027,
"step": 4050
},
{
"ce_loss": 1.2935965037345887,
"epoch": 1.22,
"inp_emb_norm": 0.3031640625,
"loss": 1.2935965037345887,
"masked_top1": 40.63400650024414,
"masked_top5": 63.75268486022949,
"step": 4050,
"top1": 82.2015396118164,
"top5": 93.79516159057617
},
{
"epoch": 1.23,
"grad_norm": 1.1524816264565314,
"learning_rate": 0.0001,
"loss": 1.3,
"step": 4100
},
{
"ce_loss": 1.3123349785804748,
"epoch": 1.23,
"inp_emb_norm": 0.3012109375,
"loss": 1.3123349785804748,
"masked_top1": 42.19605297088623,
"masked_top5": 65.04543876647949,
"step": 4100,
"top1": 81.9914030456543,
"top5": 93.81402328491211
},
{
"epoch": 1.25,
"grad_norm": 1.378666189570675,
"learning_rate": 0.0001,
"loss": 1.3045,
"step": 4150
},
{
"ce_loss": 1.2990161776542664,
"epoch": 1.25,
"inp_emb_norm": 0.305859375,
"loss": 1.2990161776542664,
"masked_top1": 41.78075637817383,
"masked_top5": 64.68202613830566,
"step": 4150,
"top1": 82.17454803466796,
"top5": 93.76477783203126
},
{
"epoch": 1.26,
"grad_norm": 1.3993699556629227,
"learning_rate": 0.0001,
"loss": 1.3014,
"step": 4200
},
{
"ce_loss": 1.2723517334461212,
"epoch": 1.26,
"inp_emb_norm": 0.301796875,
"loss": 1.2723517334461212,
"masked_top1": 43.205936431884766,
"masked_top5": 65.13977348327637,
"step": 4200,
"top1": 82.52856887817383,
"top5": 93.89131042480469
},
{
"epoch": 1.28,
"grad_norm": 1.3745855928185973,
"learning_rate": 0.0001,
"loss": 1.3151,
"step": 4250
},
{
"ce_loss": 1.3126947474479675,
"epoch": 1.28,
"inp_emb_norm": 0.3046484375,
"loss": 1.3126947474479675,
"masked_top1": 40.87171413421631,
"masked_top5": 63.66306259155274,
"step": 4250,
"top1": 82.01153930664063,
"top5": 93.64672576904297
},
{
"epoch": 1.29,
"grad_norm": 1.479893407422574,
"learning_rate": 0.0001,
"loss": 1.3213,
"step": 4300
},
{
"ce_loss": 1.3277541399002075,
"epoch": 1.29,
"inp_emb_norm": 0.302421875,
"loss": 1.3277541399002075,
"masked_top1": 41.169107818603514,
"masked_top5": 64.09851593017578,
"step": 4300,
"top1": 81.798203125,
"top5": 93.67467819213867
},
{
"epoch": 1.31,
"grad_norm": 1.4172322167184916,
"learning_rate": 0.0001,
"loss": 1.3112,
"step": 4350
},
{
"ce_loss": 1.3031212973594666,
"epoch": 1.31,
"inp_emb_norm": 0.306484375,
"loss": 1.3031212973594666,
"masked_top1": 41.52652729034424,
"masked_top5": 64.15341407775878,
"step": 4350,
"top1": 82.03983184814453,
"top5": 93.67420471191406
},
{
"epoch": 1.32,
"grad_norm": 1.2760472814321302,
"learning_rate": 0.0001,
"loss": 1.3131,
"step": 4400
},
{
"ce_loss": 1.308469491004944,
"epoch": 1.32,
"inp_emb_norm": 0.3057421875,
"loss": 1.308469491004944,
"masked_top1": 41.37114768981934,
"masked_top5": 63.93113624572754,
"step": 4400,
"top1": 82.07672714233398,
"top5": 93.6951513671875
},
{
"epoch": 1.34,
"grad_norm": 1.2334433655289787,
"learning_rate": 0.0001,
"loss": 1.3036,
"step": 4450
},
{
"ce_loss": 1.3185024070739746,
"epoch": 1.34,
"inp_emb_norm": 0.30171875,
"loss": 1.3185024070739746,
"masked_top1": 42.539853439331054,
"masked_top5": 65.19357299804688,
"step": 4450,
"top1": 82.02122940063477,
"top5": 93.74457550048828
},
{
"epoch": 1.35,
"grad_norm": 1.2156796466751323,
"learning_rate": 0.0001,
"loss": 1.3109,
"step": 4500
},
{
"ce_loss": 1.2963746500015259,
"epoch": 1.35,
"inp_emb_norm": 0.30984375,
"loss": 1.2963746500015259,
"masked_top1": 42.78518562316894,
"masked_top5": 65.02521774291992,
"step": 4500,
"top1": 82.2968830871582,
"top5": 93.7428224182129
},
{
"epoch": 1.37,
"grad_norm": 1.311745495017629,
"learning_rate": 0.0001,
"loss": 1.3115,
"step": 4550
},
{
"ce_loss": 1.2924715709686279,
"epoch": 1.37,
"inp_emb_norm": 0.303828125,
"loss": 1.2924715709686279,
"masked_top1": 42.53660099029541,
"masked_top5": 65.59047889709473,
"step": 4550,
"top1": 82.1043424987793,
"top5": 93.91824569702149
},
{
"epoch": 1.38,
"grad_norm": 1.2174333958038526,
"learning_rate": 0.0001,
"loss": 1.3201,
"step": 4600
},
{
"ce_loss": 1.3174702334403992,
"epoch": 1.38,
"inp_emb_norm": 0.30796875,
"loss": 1.3174702334403992,
"masked_top1": 42.07423233032227,
"masked_top5": 64.41791015625,
"step": 4600,
"top1": 81.94120040893554,
"top5": 93.70007461547851
},
{
"epoch": 1.4,
"grad_norm": 1.1701397235812094,
"learning_rate": 0.0001,
"loss": 1.3085,
"step": 4650
},
{
"ce_loss": 1.3130900907516478,
"epoch": 1.4,
"inp_emb_norm": 0.303203125,
"loss": 1.3130900907516478,
"masked_top1": 41.166784133911136,
"masked_top5": 64.41526512145997,
"step": 4650,
"top1": 81.96283096313476,
"top5": 93.76418426513672
},
{
"epoch": 1.41,
"grad_norm": 1.1778250928748137,
"learning_rate": 0.0001,
"loss": 1.3236,
"step": 4700
},
{
"ce_loss": 1.3132509183883667,
"epoch": 1.41,
"inp_emb_norm": 0.3076171875,
"loss": 1.3132509183883667,
"masked_top1": 41.547097778320314,
"masked_top5": 63.82028350830078,
"step": 4700,
"top1": 82.07340423583985,
"top5": 93.64304718017578
},
{
"epoch": 1.43,
"grad_norm": 1.3130038776611517,
"learning_rate": 0.0001,
"loss": 1.2884,
"step": 4750
},
{
"ce_loss": 1.2915071487426757,
"epoch": 1.43,
"inp_emb_norm": 0.30640625,
"loss": 1.2915071487426757,
"masked_top1": 41.55844184875488,
"masked_top5": 64.31948181152343,
"step": 4750,
"top1": 82.19200744628907,
"top5": 93.80693008422851
},
{
"epoch": 1.44,
"grad_norm": 1.3523241369731542,
"learning_rate": 0.0001,
"loss": 1.3229,
"step": 4800
},
{
"ce_loss": 1.3281687498092651,
"epoch": 1.44,
"inp_emb_norm": 0.3083203125,
"loss": 1.3281687498092651,
"masked_top1": 41.508040008544924,
"masked_top5": 63.831429901123045,
"step": 4800,
"top1": 81.83387313842773,
"top5": 93.61951248168945
},
{
"epoch": 1.46,
"grad_norm": 1.2229550505786297,
"learning_rate": 0.0001,
"loss": 1.313,
"step": 4850
},
{
"ce_loss": 1.321110601425171,
"epoch": 1.46,
"inp_emb_norm": 0.30484375,
"loss": 1.321110601425171,
"masked_top1": 42.07623374938965,
"masked_top5": 64.01099380493164,
"step": 4850,
"top1": 82.04196990966797,
"top5": 93.63945693969727
},
{
"epoch": 1.47,
"grad_norm": 1.291149342311876,
"learning_rate": 0.0001,
"loss": 1.3056,
"step": 4900
},
{
"ce_loss": 1.306683280467987,
"epoch": 1.47,
"inp_emb_norm": 0.3062890625,
"loss": 1.306683280467987,
"masked_top1": 42.63887153625488,
"masked_top5": 65.17818214416504,
"step": 4900,
"top1": 82.0475244140625,
"top5": 93.72082290649413
},
{
"epoch": 1.49,
"grad_norm": 1.114074399282948,
"learning_rate": 0.0001,
"loss": 1.2953,
"step": 4950
},
{
"ce_loss": 1.2916775250434875,
"epoch": 1.49,
"inp_emb_norm": 0.3067578125,
"loss": 1.2916775250434875,
"masked_top1": 43.01071895599365,
"masked_top5": 64.40318168640137,
"step": 4950,
"top1": 82.25905914306641,
"top5": 93.81284698486328
},
{
"epoch": 1.5,
"grad_norm": 1.1428006346267754,
"learning_rate": 0.0001,
"loss": 1.3132,
"step": 5000
},
{
"ce_loss": 1.3006132817268372,
"epoch": 1.5,
"inp_emb_norm": 0.305546875,
"loss": 1.3006132817268372,
"masked_top1": 43.18012409210205,
"masked_top5": 65.12184883117676,
"step": 5000,
"top1": 82.29021392822266,
"top5": 93.76517501831054
},
{
"epoch": 1.52,
"grad_norm": 1.1188850908108916,
"learning_rate": 0.0001,
"loss": 1.3097,
"step": 5050
},
{
"ce_loss": 1.3087879872322083,
"epoch": 1.52,
"inp_emb_norm": 0.3089453125,
"loss": 1.3087879872322083,
"masked_top1": 43.01914245605469,
"masked_top5": 65.29854652404785,
"step": 5050,
"top1": 82.07856109619141,
"top5": 93.74372894287109
},
{
"epoch": 1.53,
"grad_norm": 1.2100791577553864,
"learning_rate": 0.0001,
"loss": 1.313,
"step": 5100
},
{
"ce_loss": 1.3196745228767395,
"epoch": 1.53,
"inp_emb_norm": 0.30875,
"loss": 1.3196745228767395,
"masked_top1": 42.13117530822754,
"masked_top5": 65.02122192382812,
"step": 5100,
"top1": 81.9853271484375,
"top5": 93.69662155151367
},
{
"epoch": 1.55,
"grad_norm": 1.2111638230324686,
"learning_rate": 0.0001,
"loss": 1.3151,
"step": 5150
},
{
"ce_loss": 1.3195432901382447,
"epoch": 1.55,
"inp_emb_norm": 0.306171875,
"loss": 1.3195432901382447,
"masked_top1": 42.28558715820313,
"masked_top5": 65.25620643615723,
"step": 5150,
"top1": 81.9179295349121,
"top5": 93.7562042236328
},
{
"epoch": 1.56,
"grad_norm": 1.2354610192482536,
"learning_rate": 0.0001,
"loss": 1.3048,
"step": 5200
},
{
"ce_loss": 1.3142172384262085,
"epoch": 1.56,
"inp_emb_norm": 0.3107421875,
"loss": 1.3142172384262085,
"masked_top1": 41.06179321289063,
"masked_top5": 64.15951232910156,
"step": 5200,
"top1": 81.83832885742187,
"top5": 93.69693267822265
},
{
"epoch": 1.58,
"grad_norm": 1.2603393329014463,
"learning_rate": 0.0001,
"loss": 1.3077,
"step": 5250
},
{
"ce_loss": 1.2882971096038818,
"epoch": 1.58,
"inp_emb_norm": 0.3098046875,
"loss": 1.2882971096038818,
"masked_top1": 42.656227684021,
"masked_top5": 65.82235458374024,
"step": 5250,
"top1": 82.23510848999024,
"top5": 93.8615916442871
},
{
"epoch": 1.59,
"grad_norm": 1.1492020065892152,
"learning_rate": 0.0001,
"loss": 1.3037,
"step": 5300
},
{
"ce_loss": 1.312100157737732,
"epoch": 1.59,
"inp_emb_norm": 0.3079296875,
"loss": 1.312100157737732,
"masked_top1": 42.46005714416504,
"masked_top5": 65.23145439147949,
"step": 5300,
"top1": 81.96127456665039,
"top5": 93.77029266357422
},
{
"epoch": 1.61,
"grad_norm": 1.1456518826496218,
"learning_rate": 0.0001,
"loss": 1.3066,
"step": 5350
},
{
"ce_loss": 1.3056522703170776,
"epoch": 1.61,
"inp_emb_norm": 0.3083203125,
"loss": 1.3056522703170776,
"masked_top1": 43.074807510375976,
"masked_top5": 65.66509185791016,
"step": 5350,
"top1": 82.01446243286132,
"top5": 93.83800003051758
},
{
"epoch": 1.62,
"grad_norm": 1.382906237118969,
"learning_rate": 0.0001,
"loss": 1.2847,
"step": 5400
},
{
"ce_loss": 1.2701800346374512,
"epoch": 1.62,
"inp_emb_norm": 0.3112109375,
"loss": 1.2701800346374512,
"masked_top1": 43.46398132324219,
"masked_top5": 65.93353149414062,
"step": 5400,
"top1": 82.44269760131836,
"top5": 94.06136520385742
},
{
"epoch": 1.64,
"grad_norm": 1.250578306210753,
"learning_rate": 0.0001,
"loss": 1.3039,
"step": 5450
},
{
"ce_loss": 1.3234626388549804,
"epoch": 1.64,
"inp_emb_norm": 0.3078515625,
"loss": 1.3234626388549804,
"masked_top1": 42.220664520263675,
"masked_top5": 64.67328666687011,
"step": 5450,
"top1": 81.84058303833008,
"top5": 93.60615112304687
},
{
"epoch": 1.65,
"grad_norm": 1.1603834612617525,
"learning_rate": 0.0001,
"loss": 1.3015,
"step": 5500
},
{
"ce_loss": 1.2987056183815002,
"epoch": 1.65,
"inp_emb_norm": 0.3158203125,
"loss": 1.2987056183815002,
"masked_top1": 42.56543678283691,
"masked_top5": 65.94853828430176,
"step": 5500,
"top1": 82.10978652954101,
"top5": 93.85393478393554
},
{
"epoch": 1.67,
"grad_norm": 1.1963477888896916,
"learning_rate": 0.0001,
"loss": 1.3118,
"step": 5550
},
{
"ce_loss": 1.306938099861145,
"epoch": 1.67,
"inp_emb_norm": 0.3089453125,
"loss": 1.306938099861145,
"masked_top1": 42.22477603912353,
"masked_top5": 64.90145042419434,
"step": 5550,
"top1": 81.91244842529296,
"top5": 93.82112915039062
},
{
"epoch": 1.68,
"grad_norm": 1.2265886595466722,
"learning_rate": 0.0001,
"loss": 1.2965,
"step": 5600
},
{
"ce_loss": 1.289056396484375,
"epoch": 1.68,
"inp_emb_norm": 0.3178515625,
"loss": 1.289056396484375,
"masked_top1": 41.55989253997803,
"masked_top5": 64.25195945739746,
"step": 5600,
"top1": 82.382109375,
"top5": 93.70501174926758
},
{
"epoch": 1.7,
"grad_norm": 1.131601169964127,
"learning_rate": 0.0001,
"loss": 1.2911,
"step": 5650
},
{
"ce_loss": 1.2798267722129821,
"epoch": 1.7,
"inp_emb_norm": 0.3168359375,
"loss": 1.2798267722129821,
"masked_top1": 42.326548461914065,
"masked_top5": 65.07690460205077,
"step": 5650,
"top1": 82.33992919921874,
"top5": 93.87046478271485
},
{
"epoch": 1.71,
"grad_norm": 1.1181947342973821,
"learning_rate": 0.0001,
"loss": 1.3145,
"step": 5700
},
{
"ce_loss": 1.31481609582901,
"epoch": 1.71,
"inp_emb_norm": 0.3159375,
"loss": 1.31481609582901,
"masked_top1": 42.50676147460938,
"masked_top5": 65.8423821258545,
"step": 5700,
"top1": 82.06524475097656,
"top5": 93.59629486083985
},
{
"epoch": 1.73,
"grad_norm": 1.1098764034001067,
"learning_rate": 0.0001,
"loss": 1.311,
"step": 5750
},
{
"ce_loss": 1.3209831523895263,
"epoch": 1.73,
"inp_emb_norm": 0.310390625,
"loss": 1.3209831523895263,
"masked_top1": 42.35572410583496,
"masked_top5": 65.26163505554199,
"step": 5750,
"top1": 81.91262344360352,
"top5": 93.68317199707032
},
{
"epoch": 1.74,
"grad_norm": 1.2409622007508851,
"learning_rate": 0.0001,
"loss": 1.3131,
"step": 5800
},
{
"ce_loss": 1.303259253501892,
"epoch": 1.74,
"inp_emb_norm": 0.311015625,
"loss": 1.303259253501892,
"masked_top1": 42.99045387268066,
"masked_top5": 65.50553886413574,
"step": 5800,
"top1": 82.13337188720703,
"top5": 93.76351516723633
},
{
"epoch": 1.76,
"grad_norm": 1.3030028695341598,
"learning_rate": 0.0001,
"loss": 1.3,
"step": 5850
},
{
"ce_loss": 1.290860595703125,
"epoch": 1.76,
"inp_emb_norm": 0.32109375,
"loss": 1.290860595703125,
"masked_top1": 42.378893280029295,
"masked_top5": 65.07022903442383,
"step": 5850,
"top1": 82.35479309082031,
"top5": 93.71083343505859
},
{
"epoch": 1.77,
"grad_norm": 1.2030263310548106,
"learning_rate": 0.0001,
"loss": 1.3148,
"step": 5900
},
{
"ce_loss": 1.3233295631408692,
"epoch": 1.77,
"inp_emb_norm": 0.3166015625,
"loss": 1.3233295631408692,
"masked_top1": 42.24487545013428,
"masked_top5": 65.1656477355957,
"step": 5900,
"top1": 82.03902984619141,
"top5": 93.61334854125977
},
{
"epoch": 1.79,
"grad_norm": 1.5803953753939939,
"learning_rate": 0.0001,
"loss": 1.3041,
"step": 5950
},
{
"ce_loss": 1.2845279669761658,
"epoch": 1.79,
"inp_emb_norm": 0.3218359375,
"loss": 1.2845279669761658,
"masked_top1": 42.71968803405762,
"masked_top5": 65.51119926452637,
"step": 5950,
"top1": 82.2425944519043,
"top5": 93.84527893066407
},
{
"epoch": 1.8,
"grad_norm": 1.1986406141845714,
"learning_rate": 0.0001,
"loss": 1.303,
"step": 6000
},
{
"ce_loss": 1.2704100012779236,
"epoch": 1.8,
"inp_emb_norm": 0.317734375,
"loss": 1.2704100012779236,
"masked_top1": 43.65462882995605,
"masked_top5": 65.84256935119629,
"step": 6000,
"top1": 82.45276733398437,
"top5": 93.91838714599609
},
{
"epoch": 1.82,
"grad_norm": 1.1276854151713849,
"learning_rate": 0.0001,
"loss": 1.3039,
"step": 6050
},
{
"ce_loss": 1.289050838947296,
"epoch": 1.82,
"inp_emb_norm": 0.3146875,
"loss": 1.289050838947296,
"masked_top1": 41.38638572692871,
"masked_top5": 64.90666145324707,
"step": 6050,
"top1": 82.1436050415039,
"top5": 93.8117251586914
},
{
"epoch": 1.83,
"grad_norm": 1.1636926009728485,
"learning_rate": 0.0001,
"loss": 1.3029,
"step": 6100
},
{
"ce_loss": 1.3035223054885865,
"epoch": 1.83,
"inp_emb_norm": 0.3187109375,
"loss": 1.3035223054885865,
"masked_top1": 42.37709861755371,
"masked_top5": 65.33909133911133,
"step": 6100,
"top1": 82.11933166503906,
"top5": 93.78050659179688
},
{
"epoch": 1.85,
"grad_norm": 1.0626794881896964,
"learning_rate": 0.0001,
"loss": 1.3138,
"step": 6150
},
{
"ce_loss": 1.317355580329895,
"epoch": 1.85,
"inp_emb_norm": 0.31890625,
"loss": 1.317355580329895,
"masked_top1": 41.86831642150879,
"masked_top5": 64.59691291809082,
"step": 6150,
"top1": 81.94907608032227,
"top5": 93.67014266967773
},
{
"epoch": 1.86,
"grad_norm": 1.1005541859171093,
"learning_rate": 0.0001,
"loss": 1.3041,
"step": 6200
},
{
"ce_loss": 1.3219546675682068,
"epoch": 1.86,
"inp_emb_norm": 0.3172265625,
"loss": 1.3219546675682068,
"masked_top1": 42.3734455871582,
"masked_top5": 65.90324760437012,
"step": 6200,
"top1": 81.97321441650391,
"top5": 93.62858337402344
},
{
"epoch": 1.88,
"grad_norm": 1.1522186435584791,
"learning_rate": 0.0001,
"loss": 1.2963,
"step": 6250
},
{
"ce_loss": 1.2990228199958802,
"epoch": 1.88,
"inp_emb_norm": 0.3144921875,
"loss": 1.2990228199958802,
"masked_top1": 42.138801422119144,
"masked_top5": 65.49508239746093,
"step": 6250,
"top1": 82.0102586364746,
"top5": 93.90199996948242
},
{
"epoch": 1.89,
"grad_norm": 1.2927059160480363,
"learning_rate": 0.0001,
"loss": 1.3017,
"step": 6300
},
{
"ce_loss": 1.300498881340027,
"epoch": 1.89,
"inp_emb_norm": 0.316640625,
"loss": 1.300498881340027,
"masked_top1": 42.56178199768066,
"masked_top5": 65.6432763671875,
"step": 6300,
"top1": 82.12412170410157,
"top5": 93.7817707824707
},
{
"epoch": 1.91,
"grad_norm": 1.168341408260434,
"learning_rate": 0.0001,
"loss": 1.3029,
"step": 6350
},
{
"ce_loss": 1.3083948111534118,
"epoch": 1.91,
"inp_emb_norm": 0.311328125,
"loss": 1.3083948111534118,
"masked_top1": 42.55179992675781,
"masked_top5": 65.3431551361084,
"step": 6350,
"top1": 81.94397857666016,
"top5": 93.8497378540039
},
{
"epoch": 1.92,
"grad_norm": 1.2323945648312147,
"learning_rate": 0.0001,
"loss": 1.3104,
"step": 6400
},
{
"ce_loss": 1.3191473126411437,
"epoch": 1.92,
"inp_emb_norm": 0.3194921875,
"loss": 1.3191473126411437,
"masked_top1": 41.97941291809082,
"masked_top5": 65.11552169799805,
"step": 6400,
"top1": 81.95802749633789,
"top5": 93.71688690185547
},
{
"epoch": 1.94,
"grad_norm": 1.0969474547631315,
"learning_rate": 0.0001,
"loss": 1.3033,
"step": 6450
},
{
"ce_loss": 1.2988893008232116,
"epoch": 1.94,
"inp_emb_norm": 0.3175,
"loss": 1.2988893008232116,
"masked_top1": 42.91837085723877,
"masked_top5": 65.42203193664551,
"step": 6450,
"top1": 82.20126846313477,
"top5": 93.793828125
},
{
"epoch": 1.95,
"grad_norm": 1.0855603866035775,
"learning_rate": 0.0001,
"loss": 1.3051,
"step": 6500
},
{
"ce_loss": 1.3110007953643799,
"epoch": 1.95,
"inp_emb_norm": 0.32046875,
"loss": 1.3110007953643799,
"masked_top1": 42.34694427490234,
"masked_top5": 65.08651733398438,
"step": 6500,
"top1": 81.95626449584961,
"top5": 93.71604049682617
},
{
"epoch": 1.97,
"grad_norm": 1.201042651000794,
"learning_rate": 0.0001,
"loss": 1.3041,
"step": 6550
},
{
"ce_loss": 1.3072884845733643,
"epoch": 1.97,
"inp_emb_norm": 0.316484375,
"loss": 1.3072884845733643,
"masked_top1": 42.01487628936768,
"masked_top5": 65.12509132385254,
"step": 6550,
"top1": 81.91355926513671,
"top5": 93.84131042480469
},
{
"epoch": 1.98,
"grad_norm": 1.2375042360574775,
"learning_rate": 0.0001,
"loss": 1.3,
"step": 6600
},
{
"ce_loss": 1.3039724278450011,
"epoch": 1.98,
"inp_emb_norm": 0.318515625,
"loss": 1.3039724278450011,
"masked_top1": 42.9154284286499,
"masked_top5": 66.03591972351074,
"step": 6600,
"top1": 82.26277542114258,
"top5": 93.73072570800781
},
{
"epoch": 2.0,
"grad_norm": 1.1513818080655263,
"learning_rate": 0.0001,
"loss": 1.3037,
"step": 6650
},
{
"ce_loss": 1.2931387114524842,
"epoch": 2.0,
"inp_emb_norm": 0.318046875,
"loss": 1.2931387114524842,
"masked_top1": 41.40272514343262,
"masked_top5": 64.34992462158203,
"step": 6650,
"top1": 82.18499603271485,
"top5": 93.81835754394531
},
{
"epoch": 2.02,
"grad_norm": 1.0510501665579786,
"learning_rate": 0.0001,
"loss": 1.0038,
"step": 6700
},
{
"ce_loss": 0.9978207111358642,
"epoch": 2.02,
"inp_emb_norm": 0.3219140625,
"loss": 0.9978207111358642,
"masked_top1": 45.336004638671874,
"masked_top5": 69.27368041992187,
"step": 6700,
"top1": 85.59730255126954,
"top5": 95.6523501586914
},
{
"epoch": 2.03,
"grad_norm": 1.119371103964124,
"learning_rate": 0.0001,
"loss": 1.0094,
"step": 6750
},
{
"ce_loss": 1.0062808072566987,
"epoch": 2.03,
"inp_emb_norm": 0.324375,
"loss": 1.0062808072566987,
"masked_top1": 44.80854190826416,
"masked_top5": 67.70930862426758,
"step": 6750,
"top1": 85.45523956298828,
"top5": 95.46897216796874
},
{
"epoch": 2.05,
"grad_norm": 1.0111394168332695,
"learning_rate": 0.0001,
"loss": 0.9973,
"step": 6800
},
{
"ce_loss": 0.9945477271080017,
"epoch": 2.05,
"inp_emb_norm": 0.32734375,
"loss": 0.9945477271080017,
"masked_top1": 46.81753234863281,
"masked_top5": 70.2586336517334,
"step": 6800,
"top1": 85.69498947143555,
"top5": 95.61528350830078
},
{
"epoch": 2.06,
"grad_norm": 1.146035850759827,
"learning_rate": 0.0001,
"loss": 1.0172,
"step": 6850
},
{
"ce_loss": 1.0215596628189088,
"epoch": 2.06,
"inp_emb_norm": 0.3225390625,
"loss": 1.0215596628189088,
"masked_top1": 43.64241020202637,
"masked_top5": 67.47051399230958,
"step": 6850,
"top1": 85.29426712036133,
"top5": 95.40342254638672
},
{
"epoch": 2.08,
"grad_norm": 1.1308686916013488,
"learning_rate": 0.0001,
"loss": 1.0143,
"step": 6900
},
{
"ce_loss": 1.008236768245697,
"epoch": 2.08,
"inp_emb_norm": 0.322109375,
"loss": 1.008236768245697,
"masked_top1": 44.6266674041748,
"masked_top5": 68.65714111328126,
"step": 6900,
"top1": 85.34050277709962,
"top5": 95.54212951660156
},
{
"epoch": 2.09,
"grad_norm": 1.1042254336729025,
"learning_rate": 0.0001,
"loss": 1.0111,
"step": 6950
},
{
"ce_loss": 1.0106354761123657,
"epoch": 2.09,
"inp_emb_norm": 0.322734375,
"loss": 1.0106354761123657,
"masked_top1": 44.90856178283691,
"masked_top5": 69.18769927978515,
"step": 6950,
"top1": 85.31204833984376,
"top5": 95.51878021240235
},
{
"epoch": 2.11,
"grad_norm": 1.1432085643116041,
"learning_rate": 0.0001,
"loss": 1.0137,
"step": 7000
},
{
"ce_loss": 1.0164246666431427,
"epoch": 2.11,
"inp_emb_norm": 0.3241015625,
"loss": 1.0164246666431427,
"masked_top1": 45.27058769226074,
"masked_top5": 69.0199755859375,
"step": 7000,
"top1": 85.29512222290039,
"top5": 95.44889572143555
},
{
"epoch": 2.12,
"grad_norm": 1.0502533687083666,
"learning_rate": 0.0001,
"loss": 1.0498,
"step": 7050
},
{
"ce_loss": 1.0437222492694855,
"epoch": 2.12,
"inp_emb_norm": 0.3234765625,
"loss": 1.0437222492694855,
"masked_top1": 45.688305435180666,
"masked_top5": 68.60405799865723,
"step": 7050,
"top1": 85.19064407348633,
"top5": 95.48901702880859
},
{
"epoch": 2.14,
"grad_norm": 1.065003821728329,
"learning_rate": 0.0001,
"loss": 1.0199,
"step": 7100
},
{
"ce_loss": 1.01140921831131,
"epoch": 2.14,
"inp_emb_norm": 0.3250390625,
"loss": 1.01140921831131,
"masked_top1": 45.559912796020505,
"masked_top5": 69.64316802978516,
"step": 7100,
"top1": 85.34976013183594,
"top5": 95.51665420532227
},
{
"epoch": 2.15,
"grad_norm": 1.1735568530678606,
"learning_rate": 0.0001,
"loss": 1.0166,
"step": 7150
},
{
"ce_loss": 1.0293829572200774,
"epoch": 2.15,
"inp_emb_norm": 0.323671875,
"loss": 1.0293829572200774,
"masked_top1": 44.797666244506836,
"masked_top5": 69.08728958129883,
"step": 7150,
"top1": 85.07948059082031,
"top5": 95.50323318481445
},
{
"epoch": 2.17,
"grad_norm": 1.22336128717078,
"learning_rate": 0.0001,
"loss": 1.0263,
"step": 7200
},
{
"ce_loss": 1.0348090195655824,
"epoch": 2.17,
"inp_emb_norm": 0.3307421875,
"loss": 1.0348090195655824,
"masked_top1": 43.274717559814455,
"masked_top5": 68.35926612854004,
"step": 7200,
"top1": 85.03092895507812,
"top5": 95.42227752685547
},
{
"epoch": 2.18,
"grad_norm": 1.1254790539219672,
"learning_rate": 0.0001,
"loss": 1.0341,
"step": 7250
},
{
"ce_loss": 1.028915911912918,
"epoch": 2.18,
"inp_emb_norm": 0.3247265625,
"loss": 1.028915911912918,
"masked_top1": 45.49935554504395,
"masked_top5": 69.03593551635743,
"step": 7250,
"top1": 85.14698638916016,
"top5": 95.36694305419923
},
{
"epoch": 2.2,
"grad_norm": 1.1149477539183639,
"learning_rate": 0.0001,
"loss": 1.0412,
"step": 7300
},
{
"ce_loss": 1.0536164796352387,
"epoch": 2.2,
"inp_emb_norm": 0.3236328125,
"loss": 1.0536164796352387,
"masked_top1": 43.858817138671874,
"masked_top5": 67.21836029052734,
"step": 7300,
"top1": 84.86703170776367,
"top5": 95.25842987060547
},
{
"epoch": 2.21,
"grad_norm": 1.1968292628857302,
"learning_rate": 0.0001,
"loss": 1.0432,
"step": 7350
},
{
"ce_loss": 1.0520641374588013,
"epoch": 2.21,
"inp_emb_norm": 0.3211328125,
"loss": 1.0520641374588013,
"masked_top1": 44.41142387390137,
"masked_top5": 67.9876936340332,
"step": 7350,
"top1": 84.89357849121093,
"top5": 95.34237762451171
},
{
"epoch": 2.23,
"grad_norm": 1.1298025119403752,
"learning_rate": 0.0001,
"loss": 1.0285,
"step": 7400
},
{
"ce_loss": 1.0238622641563415,
"epoch": 2.23,
"inp_emb_norm": 0.33328125,
"loss": 1.0238622641563415,
"masked_top1": 46.15579338073731,
"masked_top5": 70.0232991027832,
"step": 7400,
"top1": 85.30123580932617,
"top5": 95.43901489257813
},
{
"epoch": 2.24,
"grad_norm": 1.0250128601399684,
"learning_rate": 0.0001,
"loss": 1.0317,
"step": 7450
},
{
"ce_loss": 1.024322179555893,
"epoch": 2.24,
"inp_emb_norm": 0.3326953125,
"loss": 1.024322179555893,
"masked_top1": 45.176987152099606,
"masked_top5": 69.09153121948242,
"step": 7450,
"top1": 85.22158447265625,
"top5": 95.37797592163086
},
{
"epoch": 2.26,
"grad_norm": 1.1668834452965842,
"learning_rate": 0.0001,
"loss": 1.0344,
"step": 7500
},
{
"ce_loss": 1.0320839881896973,
"epoch": 2.26,
"inp_emb_norm": 0.3268359375,
"loss": 1.0320839881896973,
"masked_top1": 45.62026954650879,
"masked_top5": 69.47045196533203,
"step": 7500,
"top1": 85.13791748046874,
"top5": 95.46679916381837
},
{
"epoch": 2.27,
"grad_norm": 1.1647845511661612,
"learning_rate": 0.0001,
"loss": 1.0364,
"step": 7550
},
{
"ce_loss": 1.040896817445755,
"epoch": 2.27,
"inp_emb_norm": 0.3287890625,
"loss": 1.040896817445755,
"masked_top1": 45.61882129669189,
"masked_top5": 68.993677444458,
"step": 7550,
"top1": 84.89491455078125,
"top5": 95.34945602416992
},
{
"epoch": 2.29,
"grad_norm": 1.070994325965457,
"learning_rate": 0.0001,
"loss": 1.0481,
"step": 7600
},
{
"ce_loss": 1.045055913925171,
"epoch": 2.29,
"inp_emb_norm": 0.3358203125,
"loss": 1.045055913925171,
"masked_top1": 43.71815933227539,
"masked_top5": 68.03006439208984,
"step": 7600,
"top1": 84.92118545532226,
"top5": 95.33861541748047
},
{
"epoch": 2.3,
"grad_norm": 1.0302755529017995,
"learning_rate": 0.0001,
"loss": 1.0468,
"step": 7650
},
{
"ce_loss": 1.029969446659088,
"epoch": 2.3,
"inp_emb_norm": 0.3273046875,
"loss": 1.029969446659088,
"masked_top1": 44.84774971008301,
"masked_top5": 68.51262855529785,
"step": 7650,
"top1": 85.12003936767579,
"top5": 95.39368621826172
},
{
"epoch": 2.32,
"grad_norm": 1.2124331568257196,
"learning_rate": 0.0001,
"loss": 1.0462,
"step": 7700
},
{
"ce_loss": 1.0300623905658721,
"epoch": 2.32,
"inp_emb_norm": 0.32671875,
"loss": 1.0300623905658721,
"masked_top1": 46.18773262023926,
"masked_top5": 69.55521690368653,
"step": 7700,
"top1": 85.05798110961913,
"top5": 95.45936019897461
},
{
"epoch": 2.33,
"grad_norm": 1.25935837166321,
"learning_rate": 0.0001,
"loss": 1.051,
"step": 7750
},
{
"ce_loss": 1.0560246324539184,
"epoch": 2.33,
"inp_emb_norm": 0.32921875,
"loss": 1.0560246324539184,
"masked_top1": 44.81505111694336,
"masked_top5": 68.60199340820313,
"step": 7750,
"top1": 84.81520401000977,
"top5": 95.27306564331055
},
{
"epoch": 2.35,
"grad_norm": 1.1490451145708855,
"learning_rate": 0.0001,
"loss": 1.0459,
"step": 7800
},
{
"ce_loss": 1.0482890462875367,
"epoch": 2.35,
"inp_emb_norm": 0.3268359375,
"loss": 1.0482890462875367,
"masked_top1": 44.35726287841797,
"masked_top5": 68.64975059509277,
"step": 7800,
"top1": 84.82445663452148,
"top5": 95.42289993286133
},
{
"epoch": 2.36,
"grad_norm": 1.1334989027761242,
"learning_rate": 0.0001,
"loss": 1.0441,
"step": 7850
},
{
"ce_loss": 1.0527094066143037,
"epoch": 2.36,
"inp_emb_norm": 0.3269140625,
"loss": 1.0527094066143037,
"masked_top1": 45.072478713989256,
"masked_top5": 68.39160514831543,
"step": 7850,
"top1": 84.785078125,
"top5": 95.35190139770508
},
{
"epoch": 2.38,
"grad_norm": 1.1375627332151101,
"learning_rate": 0.0001,
"loss": 1.0565,
"step": 7900
},
{
"ce_loss": 1.0542543601989747,
"epoch": 2.38,
"inp_emb_norm": 0.331328125,
"loss": 1.0542543601989747,
"masked_top1": 43.75662239074707,
"masked_top5": 67.66206184387207,
"step": 7900,
"top1": 84.7607080078125,
"top5": 95.31730087280273
},
{
"epoch": 2.39,
"grad_norm": 1.1403758501967678,
"learning_rate": 0.0001,
"loss": 1.0445,
"step": 7950
},
{
"ce_loss": 1.0483984065055847,
"epoch": 2.39,
"inp_emb_norm": 0.32671875,
"loss": 1.0483984065055847,
"masked_top1": 45.24047119140625,
"masked_top5": 68.86077049255371,
"step": 7950,
"top1": 84.88309677124023,
"top5": 95.3291700744629
},
{
"epoch": 2.41,
"grad_norm": 1.039302857763206,
"learning_rate": 0.0001,
"loss": 1.0581,
"step": 8000
},
{
"ce_loss": 1.0635139977931976,
"epoch": 2.41,
"inp_emb_norm": 0.3339453125,
"loss": 1.0635139977931976,
"masked_top1": 43.78837112426758,
"masked_top5": 67.08759162902832,
"step": 8000,
"top1": 84.79178085327149,
"top5": 95.13488723754882
},
{
"epoch": 2.42,
"grad_norm": 1.1910955097950056,
"learning_rate": 0.0001,
"loss": 1.0522,
"step": 8050
},
{
"ce_loss": 1.0587849020957947,
"epoch": 2.42,
"inp_emb_norm": 0.3301171875,
"loss": 1.0587849020957947,
"masked_top1": 44.429280014038085,
"masked_top5": 67.39429237365722,
"step": 8050,
"top1": 84.87328887939454,
"top5": 95.1887158203125
},
{
"epoch": 2.44,
"grad_norm": 1.1479506270149336,
"learning_rate": 0.0001,
"loss": 1.0536,
"step": 8100
},
{
"ce_loss": 1.0473863470554352,
"epoch": 2.44,
"inp_emb_norm": 0.3341015625,
"loss": 1.0473863470554352,
"masked_top1": 44.48332893371582,
"masked_top5": 68.14752388000488,
"step": 8100,
"top1": 84.92030502319336,
"top5": 95.26301513671875
},
{
"epoch": 2.45,
"grad_norm": 1.1375038249327807,
"learning_rate": 0.0001,
"loss": 1.0419,
"step": 8150
},
{
"ce_loss": 1.0687260043621063,
"epoch": 2.45,
"inp_emb_norm": 0.3335546875,
"loss": 1.0687260043621063,
"masked_top1": 43.23715843200684,
"masked_top5": 67.02041564941406,
"step": 8150,
"top1": 84.57525085449218,
"top5": 95.16051864624023
},
{
"epoch": 2.47,
"grad_norm": 1.1378089695981268,
"learning_rate": 0.0001,
"loss": 1.0485,
"step": 8200
},
{
"ce_loss": 1.0508419513702392,
"epoch": 2.47,
"inp_emb_norm": 0.3319921875,
"loss": 1.0508419513702392,
"masked_top1": 45.14276206970215,
"masked_top5": 69.22940170288086,
"step": 8200,
"top1": 84.89724472045899,
"top5": 95.38407836914062
},
{
"epoch": 2.48,
"grad_norm": 1.134415072768399,
"learning_rate": 0.0001,
"loss": 1.0587,
"step": 8250
},
{
"ce_loss": 1.0560647177696227,
"epoch": 2.48,
"inp_emb_norm": 0.33328125,
"loss": 1.0560647177696227,
"masked_top1": 44.99800594329834,
"masked_top5": 67.96685562133788,
"step": 8250,
"top1": 84.84181884765626,
"top5": 95.19639511108399
},
{
"epoch": 2.5,
"grad_norm": 1.0130905993003134,
"learning_rate": 0.0001,
"loss": 1.0622,
"step": 8300
},
{
"ce_loss": 1.0628444683551788,
"epoch": 2.5,
"inp_emb_norm": 0.335390625,
"loss": 1.0628444683551788,
"masked_top1": 43.72650085449219,
"masked_top5": 67.85165229797363,
"step": 8300,
"top1": 84.71982971191406,
"top5": 95.2232145690918
},
{
"epoch": 2.51,
"grad_norm": 1.1112686440386919,
"learning_rate": 0.0001,
"loss": 1.0624,
"step": 8350
},
{
"ce_loss": 1.0606860029697418,
"epoch": 2.51,
"inp_emb_norm": 0.3302734375,
"loss": 1.0606860029697418,
"masked_top1": 45.08112854003906,
"masked_top5": 68.54836326599121,
"step": 8350,
"top1": 84.70668167114258,
"top5": 95.34720886230468
},
{
"epoch": 2.53,
"grad_norm": 1.0929741266435504,
"learning_rate": 0.0001,
"loss": 1.0653,
"step": 8400
},
{
"ce_loss": 1.0685135400295258,
"epoch": 2.53,
"inp_emb_norm": 0.329296875,
"loss": 1.0685135400295258,
"masked_top1": 44.93409217834473,
"masked_top5": 68.15225112915039,
"step": 8400,
"top1": 84.59507186889648,
"top5": 95.2771678161621
},
{
"epoch": 2.54,
"grad_norm": 1.0671443258127058,
"learning_rate": 0.0001,
"loss": 1.0661,
"step": 8450
},
{
"ce_loss": 1.074057730436325,
"epoch": 2.54,
"inp_emb_norm": 0.334140625,
"loss": 1.074057730436325,
"masked_top1": 43.86976951599121,
"masked_top5": 67.98081314086915,
"step": 8450,
"top1": 84.60137985229493,
"top5": 95.1078970336914
},
{
"epoch": 2.56,
"grad_norm": 1.2357962610955344,
"learning_rate": 0.0001,
"loss": 1.0625,
"step": 8500
},
{
"ce_loss": 1.0734054052829742,
"epoch": 2.56,
"inp_emb_norm": 0.3264453125,
"loss": 1.0734054052829742,
"masked_top1": 43.74296424865722,
"masked_top5": 67.63478965759278,
"step": 8500,
"top1": 84.40524276733399,
"top5": 95.29552383422852
},
{
"epoch": 2.57,
"grad_norm": 1.185708692105678,
"learning_rate": 0.0001,
"loss": 1.0543,
"step": 8550
},
{
"ce_loss": 1.0622280275821685,
"epoch": 2.57,
"inp_emb_norm": 0.3312890625,
"loss": 1.0622280275821685,
"masked_top1": 45.68028434753418,
"masked_top5": 68.61516632080078,
"step": 8550,
"top1": 84.72150787353516,
"top5": 95.2423046875
},
{
"epoch": 2.59,
"grad_norm": 1.124889127351682,
"learning_rate": 0.0001,
"loss": 1.0598,
"step": 8600
},
{
"ce_loss": 1.0730221366882324,
"epoch": 2.59,
"inp_emb_norm": 0.3349609375,
"loss": 1.0730221366882324,
"masked_top1": 44.15586044311524,
"masked_top5": 67.7581484222412,
"step": 8600,
"top1": 84.60388778686523,
"top5": 95.18492584228515
},
{
"epoch": 2.6,
"grad_norm": 1.1470911071720453,
"learning_rate": 0.0001,
"loss": 1.0718,
"step": 8650
},
{
"ce_loss": 1.0610903584957123,
"epoch": 2.6,
"inp_emb_norm": 0.3320703125,
"loss": 1.0610903584957123,
"masked_top1": 44.21772804260254,
"masked_top5": 67.78637229919434,
"step": 8650,
"top1": 84.74846099853515,
"top5": 95.21728775024414
},
{
"epoch": 2.62,
"grad_norm": 1.120071012609436,
"learning_rate": 0.0001,
"loss": 1.0598,
"step": 8700
},
{
"ce_loss": 1.057754340171814,
"epoch": 2.62,
"inp_emb_norm": 0.334140625,
"loss": 1.057754340171814,
"masked_top1": 44.56280632019043,
"masked_top5": 68.16851516723632,
"step": 8700,
"top1": 84.76306396484375,
"top5": 95.24030334472656
},
{
"epoch": 2.63,
"grad_norm": 1.1990678496151557,
"learning_rate": 0.0001,
"loss": 1.0698,
"step": 8750
},
{
"ce_loss": 1.0554494428634644,
"epoch": 2.63,
"inp_emb_norm": 0.3276953125,
"loss": 1.0554494428634644,
"masked_top1": 45.60402565002441,
"masked_top5": 69.8213597869873,
"step": 8750,
"top1": 84.74255416870118,
"top5": 95.37602157592774
},
{
"epoch": 2.65,
"grad_norm": 1.1691422333586243,
"learning_rate": 0.0001,
"loss": 1.0711,
"step": 8800
},
{
"ce_loss": 1.0723666751384735,
"epoch": 2.65,
"inp_emb_norm": 0.3325390625,
"loss": 1.0723666751384735,
"masked_top1": 44.78216484069824,
"masked_top5": 68.43072723388671,
"step": 8800,
"top1": 84.51954879760743,
"top5": 95.22681289672852
},
{
"epoch": 2.66,
"grad_norm": 1.058087631068196,
"learning_rate": 0.0001,
"loss": 1.0736,
"step": 8850
},
{
"ce_loss": 1.0746560537815093,
"epoch": 2.66,
"inp_emb_norm": 0.329140625,
"loss": 1.0746560537815093,
"masked_top1": 44.328994064331056,
"masked_top5": 67.38568382263183,
"step": 8850,
"top1": 84.52036026000977,
"top5": 95.2164730834961
},
{
"epoch": 2.68,
"grad_norm": 1.1288621853196346,
"learning_rate": 0.0001,
"loss": 1.069,
"step": 8900
},
{
"ce_loss": 1.075651180744171,
"epoch": 2.68,
"inp_emb_norm": 0.334765625,
"loss": 1.075651180744171,
"masked_top1": 45.41173538208008,
"masked_top5": 68.17863861083984,
"step": 8900,
"top1": 84.6201480102539,
"top5": 95.15124877929688
},
{
"epoch": 2.69,
"grad_norm": 1.14695178467833,
"learning_rate": 0.0001,
"loss": 1.0671,
"step": 8950
},
{
"ce_loss": 1.0754395532608032,
"epoch": 2.69,
"inp_emb_norm": 0.334453125,
"loss": 1.0754395532608032,
"masked_top1": 45.43799507141113,
"masked_top5": 69.10365425109863,
"step": 8950,
"top1": 84.61126815795899,
"top5": 95.2543717956543
},
{
"epoch": 2.71,
"grad_norm": 1.131019351092537,
"learning_rate": 0.0001,
"loss": 1.0679,
"step": 9000
},
{
"ce_loss": 1.0691665148735046,
"epoch": 2.71,
"inp_emb_norm": 0.33484375,
"loss": 1.0691665148735046,
"masked_top1": 44.43798561096192,
"masked_top5": 67.93657341003419,
"step": 9000,
"top1": 84.68964065551758,
"top5": 95.20655715942382
},
{
"epoch": 2.72,
"grad_norm": 1.120170919000501,
"learning_rate": 0.0001,
"loss": 1.0722,
"step": 9050
},
{
"ce_loss": 1.0739211070537567,
"epoch": 2.72,
"inp_emb_norm": 0.3396484375,
"loss": 1.0739211070537567,
"masked_top1": 44.42457763671875,
"masked_top5": 67.35458564758301,
"step": 9050,
"top1": 84.60804443359375,
"top5": 95.0638233947754
},
{
"epoch": 2.74,
"grad_norm": 1.0687255304897059,
"learning_rate": 0.0001,
"loss": 1.0792,
"step": 9100
},
{
"ce_loss": 1.0771595978736876,
"epoch": 2.74,
"inp_emb_norm": 0.3378125,
"loss": 1.0771595978736876,
"masked_top1": 43.902629165649415,
"masked_top5": 67.31045616149902,
"step": 9100,
"top1": 84.62485046386719,
"top5": 95.07604125976563
},
{
"epoch": 2.75,
"grad_norm": 1.0993761131067057,
"learning_rate": 0.0001,
"loss": 1.0813,
"step": 9150
},
{
"ce_loss": 1.082298024892807,
"epoch": 2.75,
"inp_emb_norm": 0.334609375,
"loss": 1.082298024892807,
"masked_top1": 45.80146224975586,
"masked_top5": 68.17116020202637,
"step": 9150,
"top1": 84.43293472290038,
"top5": 95.1411996459961
},
{
"epoch": 2.77,
"grad_norm": 1.109498165414777,
"learning_rate": 0.0001,
"loss": 1.0752,
"step": 9200
},
{
"ce_loss": 1.0997070169448853,
"epoch": 2.77,
"inp_emb_norm": 0.33890625,
"loss": 1.0997070169448853,
"masked_top1": 44.238784561157225,
"masked_top5": 67.47504318237304,
"step": 9200,
"top1": 84.3786474609375,
"top5": 94.96468109130859
},
{
"epoch": 2.78,
"grad_norm": 1.0947772030405678,
"learning_rate": 0.0001,
"loss": 1.081,
"step": 9250
},
{
"ce_loss": 1.066500049829483,
"epoch": 2.78,
"inp_emb_norm": 0.3352734375,
"loss": 1.066500049829483,
"masked_top1": 45.10762840270996,
"masked_top5": 68.68643852233886,
"step": 9250,
"top1": 84.62079177856445,
"top5": 95.23117553710938
},
{
"epoch": 2.8,
"grad_norm": 0.9904651287047559,
"learning_rate": 0.0001,
"loss": 1.0769,
"step": 9300
},
{
"ce_loss": 1.0865390312671661,
"epoch": 2.8,
"inp_emb_norm": 0.3416015625,
"loss": 1.0865390312671661,
"masked_top1": 43.63430931091309,
"masked_top5": 67.33321548461915,
"step": 9300,
"top1": 84.4200732421875,
"top5": 95.05102798461914
},
{
"epoch": 2.81,
"grad_norm": 1.115318864267355,
"learning_rate": 0.0001,
"loss": 1.0716,
"step": 9350
},
{
"ce_loss": 1.05960639834404,
"epoch": 2.81,
"inp_emb_norm": 0.3376171875,
"loss": 1.05960639834404,
"masked_top1": 45.203971252441406,
"masked_top5": 67.83310653686523,
"step": 9350,
"top1": 84.70579071044922,
"top5": 95.13749114990235
},
{
"epoch": 2.83,
"grad_norm": 1.0681394549186096,
"learning_rate": 0.0001,
"loss": 1.0755,
"step": 9400
},
{
"ce_loss": 1.0759823191165925,
"epoch": 2.83,
"inp_emb_norm": 0.335234375,
"loss": 1.0759823191165925,
"masked_top1": 45.2706275177002,
"masked_top5": 68.85594253540039,
"step": 9400,
"top1": 84.58239471435547,
"top5": 95.29377487182617
},
{
"epoch": 2.84,
"grad_norm": 1.1612375451754464,
"learning_rate": 0.0001,
"loss": 1.0704,
"step": 9450
},
{
"ce_loss": 1.0630818378925324,
"epoch": 2.84,
"inp_emb_norm": 0.33828125,
"loss": 1.0630818378925324,
"masked_top1": 45.41359436035156,
"masked_top5": 67.90339347839355,
"step": 9450,
"top1": 84.69526199340821,
"top5": 95.18430023193359
},
{
"epoch": 2.86,
"grad_norm": 1.0053255791083329,
"learning_rate": 0.0001,
"loss": 1.075,
"step": 9500
},
{
"ce_loss": 1.0855333960056306,
"epoch": 2.86,
"inp_emb_norm": 0.332578125,
"loss": 1.0855333960056306,
"masked_top1": 43.72627799987793,
"masked_top5": 67.27571357727051,
"step": 9500,
"top1": 84.34697372436523,
"top5": 95.10883621215821
},
{
"epoch": 2.87,
"grad_norm": 1.1782915911734662,
"learning_rate": 0.0001,
"loss": 1.0865,
"step": 9550
},
{
"ce_loss": 1.083924981355667,
"epoch": 2.87,
"inp_emb_norm": 0.3425390625,
"loss": 1.083924981355667,
"masked_top1": 44.98833938598633,
"masked_top5": 68.43323631286621,
"step": 9550,
"top1": 84.45411026000977,
"top5": 95.07992599487305
},
{
"epoch": 2.89,
"grad_norm": 0.9304520640714639,
"learning_rate": 0.0001,
"loss": 1.083,
"step": 9600
},
{
"ce_loss": 1.0927222657203675,
"epoch": 2.89,
"inp_emb_norm": 0.3383984375,
"loss": 1.0927222657203675,
"masked_top1": 44.10447273254395,
"masked_top5": 67.51412483215331,
"step": 9600,
"top1": 84.38162017822266,
"top5": 94.919228515625
},
{
"epoch": 2.9,
"grad_norm": 1.1207152238553202,
"learning_rate": 0.0001,
"loss": 1.0748,
"step": 9650
},
{
"ce_loss": 1.0789397644996643,
"epoch": 2.9,
"inp_emb_norm": 0.338671875,
"loss": 1.0789397644996643,
"masked_top1": 44.84839416503906,
"masked_top5": 67.80728317260743,
"step": 9650,
"top1": 84.54779495239258,
"top5": 95.0833723449707
},
{
"epoch": 2.92,
"grad_norm": 1.1324876880673622,
"learning_rate": 0.0001,
"loss": 1.0768,
"step": 9700
},
{
"ce_loss": 1.0801820170879364,
"epoch": 2.92,
"inp_emb_norm": 0.3430078125,
"loss": 1.0801820170879364,
"masked_top1": 43.99905281066894,
"masked_top5": 67.07788047790527,
"step": 9700,
"top1": 84.5837564086914,
"top5": 95.05357192993164
},
{
"epoch": 2.93,
"grad_norm": 1.0773481244877405,
"learning_rate": 0.0001,
"loss": 1.0728,
"step": 9750
},
{
"ce_loss": 1.080371401309967,
"epoch": 2.93,
"inp_emb_norm": 0.3410546875,
"loss": 1.080371401309967,
"masked_top1": 43.507309417724606,
"masked_top5": 67.3342724609375,
"step": 9750,
"top1": 84.5036083984375,
"top5": 95.07123840332031
},
{
"epoch": 2.95,
"grad_norm": 1.0159168175987678,
"learning_rate": 0.0001,
"loss": 1.0688,
"step": 9800
},
{
"ce_loss": 1.083665030002594,
"epoch": 2.95,
"inp_emb_norm": 0.34671875,
"loss": 1.083665030002594,
"masked_top1": 44.19781021118164,
"masked_top5": 67.93185821533203,
"step": 9800,
"top1": 84.49478713989258,
"top5": 95.09757629394531
},
{
"epoch": 2.96,
"grad_norm": 1.1136205616789427,
"learning_rate": 0.0001,
"loss": 1.0666,
"step": 9850
},
{
"ce_loss": 1.0619865989685058,
"epoch": 2.96,
"inp_emb_norm": 0.33875,
"loss": 1.0619865989685058,
"masked_top1": 45.1244051361084,
"masked_top5": 69.11591262817383,
"step": 9850,
"top1": 84.57212326049805,
"top5": 95.30008316040039
},
{
"epoch": 2.98,
"grad_norm": 1.0447170988807504,
"learning_rate": 0.0001,
"loss": 1.0799,
"step": 9900
},
{
"ce_loss": 1.079269015789032,
"epoch": 2.98,
"inp_emb_norm": 0.3366796875,
"loss": 1.079269015789032,
"masked_top1": 45.20799728393555,
"masked_top5": 69.20284423828124,
"step": 9900,
"top1": 84.41862701416015,
"top5": 95.25058349609375
},
{
"epoch": 2.99,
"grad_norm": 1.02006539824341,
"learning_rate": 0.0001,
"loss": 1.0727,
"step": 9950
},
{
"ce_loss": 1.078022118806839,
"epoch": 2.99,
"inp_emb_norm": 0.343828125,
"loss": 1.078022118806839,
"masked_top1": 45.404396057128906,
"masked_top5": 68.74404998779296,
"step": 9950,
"top1": 84.56711853027343,
"top5": 95.0911540222168
},
{
"epoch": 3.01,
"grad_norm": 1.113028194298386,
"learning_rate": 0.0001,
"loss": 0.923,
"step": 10000
},
{
"ce_loss": 0.9195016610622406,
"epoch": 3.01,
"inp_emb_norm": 0.3430859375,
"loss": 0.9195016610622406,
"masked_top1": 48.332907638549806,
"masked_top5": 72.30007148742676,
"step": 10000,
"top1": 86.70970993041992,
"top5": 95.9695133972168
},
{
"epoch": 3.02,
"grad_norm": 0.9894998771881995,
"learning_rate": 0.0001,
"loss": 0.7385,
"step": 10050
},
{
"ce_loss": 0.7338530778884887,
"epoch": 3.02,
"inp_emb_norm": 0.337578125,
"loss": 0.7338530778884887,
"masked_top1": 52.05472785949707,
"masked_top5": 77.44325592041015,
"step": 10050,
"top1": 89.11887008666992,
"top5": 97.09128341674804
},
{
"epoch": 3.04,
"grad_norm": 1.0927669172905705,
"learning_rate": 0.0001,
"loss": 0.7338,
"step": 10100
},
{
"ce_loss": 0.7197039890289306,
"epoch": 3.04,
"inp_emb_norm": 0.3384765625,
"loss": 0.7197039890289306,
"masked_top1": 53.32936988830566,
"masked_top5": 77.26823791503907,
"step": 10100,
"top1": 89.31084884643555,
"top5": 97.0846061706543
},
{
"epoch": 3.05,
"grad_norm": 1.0197432952418408,
"learning_rate": 0.0001,
"loss": 0.7411,
"step": 10150
},
{
"ce_loss": 0.7408602213859559,
"epoch": 3.05,
"inp_emb_norm": 0.3473046875,
"loss": 0.7408602213859559,
"masked_top1": 51.371361846923826,
"masked_top5": 76.60449035644531,
"step": 10150,
"top1": 89.11841430664063,
"top5": 96.8936897277832
},
{
"epoch": 3.07,
"grad_norm": 1.0379685041697753,
"learning_rate": 0.0001,
"loss": 0.7462,
"step": 10200
},
{
"ce_loss": 0.7370594382286072,
"epoch": 3.07,
"inp_emb_norm": 0.3462109375,
"loss": 0.7370594382286072,
"masked_top1": 51.55666793823242,
"masked_top5": 76.35641525268555,
"step": 10200,
"top1": 89.04085632324218,
"top5": 96.98393936157227
},
{
"epoch": 3.08,
"grad_norm": 0.9812098301602463,
"learning_rate": 0.0001,
"loss": 0.7456,
"step": 10250
},
{
"ce_loss": 0.7352669024467469,
"epoch": 3.08,
"inp_emb_norm": 0.3394921875,
"loss": 0.7352669024467469,
"masked_top1": 52.50257652282715,
"masked_top5": 76.83967468261719,
"step": 10250,
"top1": 89.10743438720704,
"top5": 96.9831704711914
},
{
"epoch": 3.1,
"grad_norm": 1.1025924611121007,
"learning_rate": 0.0001,
"loss": 0.7497,
"step": 10300
},
{
"ce_loss": 0.749667866230011,
"epoch": 3.1,
"inp_emb_norm": 0.34015625,
"loss": 0.749667866230011,
"masked_top1": 51.46554763793945,
"masked_top5": 76.06611167907715,
"step": 10300,
"top1": 88.8905972290039,
"top5": 96.94614303588867
},
{
"epoch": 3.11,
"grad_norm": 1.0513267054177442,
"learning_rate": 0.0001,
"loss": 0.76,
"step": 10350
},
{
"ce_loss": 0.7712516760826111,
"epoch": 3.11,
"inp_emb_norm": 0.342890625,
"loss": 0.7712516760826111,
"masked_top1": 50.167163619995115,
"masked_top5": 74.79809906005859,
"step": 10350,
"top1": 88.5837466430664,
"top5": 96.77574478149414
},
{
"epoch": 3.13,
"grad_norm": 1.1017983640036164,
"learning_rate": 0.0001,
"loss": 0.75,
"step": 10400
},
{
"ce_loss": 0.7555344760417938,
"epoch": 3.13,
"inp_emb_norm": 0.3440625,
"loss": 0.7555344760417938,
"masked_top1": 51.58950523376465,
"masked_top5": 75.45907012939453,
"step": 10400,
"top1": 88.80611511230468,
"top5": 96.81310470581055
},
{
"epoch": 3.14,
"grad_norm": 1.059466085863966,
"learning_rate": 0.0001,
"loss": 0.7674,
"step": 10450
},
{
"ce_loss": 0.7663704335689545,
"epoch": 3.14,
"inp_emb_norm": 0.3493359375,
"loss": 0.7663704335689545,
"masked_top1": 49.915076370239255,
"masked_top5": 75.00028388977051,
"step": 10450,
"top1": 88.7645571899414,
"top5": 96.73904815673828
},
{
"epoch": 3.16,
"grad_norm": 1.1423077844194809,
"learning_rate": 0.0001,
"loss": 0.7609,
"step": 10500
},
{
"ce_loss": 0.767000640630722,
"epoch": 3.16,
"inp_emb_norm": 0.3465234375,
"loss": 0.767000640630722,
"masked_top1": 50.03567909240723,
"masked_top5": 75.01374862670899,
"step": 10500,
"top1": 88.57664657592774,
"top5": 96.84391784667969
},
{
"epoch": 3.17,
"grad_norm": 1.032448425259117,
"learning_rate": 0.0001,
"loss": 0.772,
"step": 10550
},
{
"ce_loss": 0.7650803327560425,
"epoch": 3.17,
"inp_emb_norm": 0.3430078125,
"loss": 0.7650803327560425,
"masked_top1": 51.708590774536134,
"masked_top5": 75.70334999084473,
"step": 10550,
"top1": 88.69338806152344,
"top5": 96.79394775390625
},
{
"epoch": 3.19,
"grad_norm": 0.9962408469712855,
"learning_rate": 0.0001,
"loss": 0.7756,
"step": 10600
},
{
"ce_loss": 0.7650206458568573,
"epoch": 3.19,
"inp_emb_norm": 0.3441796875,
"loss": 0.7650206458568573,
"masked_top1": 51.08683837890625,
"masked_top5": 76.22035110473632,
"step": 10600,
"top1": 88.6184033203125,
"top5": 96.88288452148437
},
{
"epoch": 3.2,
"grad_norm": 1.0602266154627207,
"learning_rate": 0.0001,
"loss": 0.7701,
"step": 10650
},
{
"ce_loss": 0.790007756948471,
"epoch": 3.2,
"inp_emb_norm": 0.3371875,
"loss": 0.790007756948471,
"masked_top1": 49.467856369018556,
"masked_top5": 74.11548706054687,
"step": 10650,
"top1": 88.32970947265625,
"top5": 96.74940063476562
},
{
"epoch": 3.22,
"grad_norm": 0.977038059072146,
"learning_rate": 0.0001,
"loss": 0.7746,
"step": 10700
},
{
"ce_loss": 0.7758392190933228,
"epoch": 3.22,
"inp_emb_norm": 0.346875,
"loss": 0.7758392190933228,
"masked_top1": 50.20531356811524,
"masked_top5": 75.03212989807129,
"step": 10700,
"top1": 88.4866551208496,
"top5": 96.73311477661133
},
{
"epoch": 3.23,
"grad_norm": 1.0744003921088385,
"learning_rate": 0.0001,
"loss": 0.7866,
"step": 10750
},
{
"ce_loss": 0.788164142370224,
"epoch": 3.23,
"inp_emb_norm": 0.351015625,
"loss": 0.788164142370224,
"masked_top1": 49.6349144744873,
"masked_top5": 74.33588874816894,
"step": 10750,
"top1": 88.30440032958984,
"top5": 96.71714782714844
},
{
"epoch": 3.25,
"grad_norm": 1.0175043021315349,
"learning_rate": 0.0001,
"loss": 0.7874,
"step": 10800
},
{
"ce_loss": 0.7919885838031768,
"epoch": 3.25,
"inp_emb_norm": 0.3444140625,
"loss": 0.7919885838031768,
"masked_top1": 48.99909255981445,
"masked_top5": 74.25505355834962,
"step": 10800,
"top1": 88.26540832519531,
"top5": 96.72898651123047
},
{
"epoch": 3.26,
"grad_norm": 1.1095697587812745,
"learning_rate": 0.0001,
"loss": 0.7888,
"step": 10850
},
{
"ce_loss": 0.797193922996521,
"epoch": 3.26,
"inp_emb_norm": 0.345234375,
"loss": 0.797193922996521,
"masked_top1": 49.19968879699707,
"masked_top5": 74.1003759765625,
"step": 10850,
"top1": 88.23719314575196,
"top5": 96.6193911743164
},
{
"epoch": 3.28,
"grad_norm": 1.096465203223292,
"learning_rate": 0.0001,
"loss": 0.7871,
"step": 10900
},
{
"ce_loss": 0.7947347521781921,
"epoch": 3.28,
"inp_emb_norm": 0.3490234375,
"loss": 0.7947347521781921,
"masked_top1": 49.850736923217774,
"masked_top5": 74.32025009155274,
"step": 10900,
"top1": 88.2540364074707,
"top5": 96.63969177246094
},
{
"epoch": 3.29,
"grad_norm": 1.0949347711209227,
"learning_rate": 0.0001,
"loss": 0.7828,
"step": 10950
},
{
"ce_loss": 0.7824289429187775,
"epoch": 3.29,
"inp_emb_norm": 0.340234375,
"loss": 0.7824289429187775,
"masked_top1": 51.0236792755127,
"masked_top5": 75.60059753417968,
"step": 10950,
"top1": 88.44829376220703,
"top5": 96.78743530273438
},
{
"epoch": 3.31,
"grad_norm": 1.0829787001902516,
"learning_rate": 0.0001,
"loss": 0.7878,
"step": 11000
},
{
"ce_loss": 0.7807865822315216,
"epoch": 3.31,
"inp_emb_norm": 0.35390625,
"loss": 0.7807865822315216,
"masked_top1": 50.650703506469725,
"masked_top5": 74.27250564575195,
"step": 11000,
"top1": 88.50477279663086,
"top5": 96.67802978515626
},
{
"epoch": 3.32,
"grad_norm": 1.01346692888271,
"learning_rate": 0.0001,
"loss": 0.7935,
"step": 11050
},
{
"ce_loss": 0.7934070038795471,
"epoch": 3.32,
"inp_emb_norm": 0.35015625,
"loss": 0.7934070038795471,
"masked_top1": 50.09875770568848,
"masked_top5": 74.54286209106445,
"step": 11050,
"top1": 88.3311814880371,
"top5": 96.66521301269532
},
{
"epoch": 3.34,
"grad_norm": 1.1970274017001157,
"learning_rate": 0.0001,
"loss": 0.8031,
"step": 11100
},
{
"ce_loss": 0.8043110525608063,
"epoch": 3.34,
"inp_emb_norm": 0.340703125,
"loss": 0.8043110525608063,
"masked_top1": 49.070663375854494,
"masked_top5": 73.73879615783692,
"step": 11100,
"top1": 88.18025527954102,
"top5": 96.71074813842773
},
{
"epoch": 3.35,
"grad_norm": 0.9807421337694971,
"learning_rate": 0.0001,
"loss": 0.7967,
"step": 11150
},
{
"ce_loss": 0.7986381149291992,
"epoch": 3.35,
"inp_emb_norm": 0.3460546875,
"loss": 0.7986381149291992,
"masked_top1": 49.351584854125974,
"masked_top5": 74.40426681518555,
"step": 11150,
"top1": 88.17583755493165,
"top5": 96.63799163818359
},
{
"epoch": 3.37,
"grad_norm": 1.0963946786909775,
"learning_rate": 0.0001,
"loss": 0.7973,
"step": 11200
},
{
"ce_loss": 0.7999817717075348,
"epoch": 3.37,
"inp_emb_norm": 0.34953125,
"loss": 0.7999817717075348,
"masked_top1": 48.15997085571289,
"masked_top5": 73.92494735717773,
"step": 11200,
"top1": 88.08112899780274,
"top5": 96.61600830078125
},
{
"epoch": 3.38,
"grad_norm": 0.9959517938940972,
"learning_rate": 0.0001,
"loss": 0.7972,
"step": 11250
},
{
"ce_loss": 0.7938779592514038,
"epoch": 3.38,
"inp_emb_norm": 0.3476171875,
"loss": 0.7938779592514038,
"masked_top1": 49.82730033874512,
"masked_top5": 74.95826553344726,
"step": 11250,
"top1": 88.16325576782226,
"top5": 96.75233062744141
},
{
"epoch": 3.4,
"grad_norm": 0.9936212958366237,
"learning_rate": 0.0001,
"loss": 0.8,
"step": 11300
},
{
"ce_loss": 0.8001345789432526,
"epoch": 3.4,
"inp_emb_norm": 0.3510546875,
"loss": 0.8001345789432526,
"masked_top1": 50.4670531463623,
"masked_top5": 74.18307846069337,
"step": 11300,
"top1": 88.19016235351563,
"top5": 96.6422052001953
},
{
"epoch": 3.41,
"grad_norm": 1.130929594499011,
"learning_rate": 0.0001,
"loss": 0.8071,
"step": 11350
},
{
"ce_loss": 0.8002821004390717,
"epoch": 3.41,
"inp_emb_norm": 0.3469921875,
"loss": 0.8002821004390717,
"masked_top1": 49.907284088134766,
"masked_top5": 74.60272583007813,
"step": 11350,
"top1": 88.13528030395508,
"top5": 96.65942398071289
},
{
"epoch": 3.43,
"grad_norm": 1.027541301721308,
"learning_rate": 0.0001,
"loss": 0.817,
"step": 11400
},
{
"ce_loss": 0.8025326645374298,
"epoch": 3.43,
"inp_emb_norm": 0.3438671875,
"loss": 0.8025326645374298,
"masked_top1": 50.332852630615236,
"masked_top5": 74.55212646484375,
"step": 11400,
"top1": 88.08174774169922,
"top5": 96.71300491333008
},
{
"epoch": 3.44,
"grad_norm": 1.0509284890501878,
"learning_rate": 0.0001,
"loss": 0.8071,
"step": 11450
},
{
"ce_loss": 0.8151209402084351,
"epoch": 3.44,
"inp_emb_norm": 0.3536328125,
"loss": 0.8151209402084351,
"masked_top1": 49.18514472961426,
"masked_top5": 73.88459991455078,
"step": 11450,
"top1": 88.03491561889649,
"top5": 96.5151156616211
},
{
"epoch": 3.46,
"grad_norm": 1.0563826334349269,
"learning_rate": 0.0001,
"loss": 0.8146,
"step": 11500
},
{
"ce_loss": 0.8121966254711152,
"epoch": 3.46,
"inp_emb_norm": 0.3479296875,
"loss": 0.8121966254711152,
"masked_top1": 48.76513572692871,
"masked_top5": 73.98189331054688,
"step": 11500,
"top1": 87.9227052307129,
"top5": 96.60188415527344
},
{
"epoch": 3.47,
"grad_norm": 1.0581754020714675,
"learning_rate": 0.0001,
"loss": 0.8113,
"step": 11550
},
{
"ce_loss": 0.8193511891365052,
"epoch": 3.47,
"inp_emb_norm": 0.3533984375,
"loss": 0.8193511891365052,
"masked_top1": 48.62931312561035,
"masked_top5": 73.32291015625,
"step": 11550,
"top1": 87.89377365112304,
"top5": 96.50224563598633
},
{
"epoch": 3.49,
"grad_norm": 1.012876759570208,
"learning_rate": 0.0001,
"loss": 0.8131,
"step": 11600
},
{
"ce_loss": 0.8099701881408692,
"epoch": 3.49,
"inp_emb_norm": 0.35375,
"loss": 0.8099701881408692,
"masked_top1": 49.099407272338865,
"masked_top5": 73.79963348388672,
"step": 11600,
"top1": 88.09338317871094,
"top5": 96.58081787109376
},
{
"epoch": 3.5,
"grad_norm": 1.0124821527896055,
"learning_rate": 0.0001,
"loss": 0.8121,
"step": 11650
},
{
"ce_loss": 0.8340137410163879,
"epoch": 3.5,
"inp_emb_norm": 0.34828125,
"loss": 0.8340137410163879,
"masked_top1": 49.10786933898926,
"masked_top5": 73.36568084716797,
"step": 11650,
"top1": 87.82726516723633,
"top5": 96.4594790649414
},
{
"epoch": 3.52,
"grad_norm": 1.006239120505223,
"learning_rate": 0.0001,
"loss": 0.8201,
"step": 11700
},
{
"ce_loss": 0.8206925344467163,
"epoch": 3.52,
"inp_emb_norm": 0.351875,
"loss": 0.8206925344467163,
"masked_top1": 48.73426811218262,
"masked_top5": 73.27444328308106,
"step": 11700,
"top1": 87.83039459228516,
"top5": 96.4988427734375
},
{
"epoch": 3.53,
"grad_norm": 1.059895397750386,
"learning_rate": 0.0001,
"loss": 0.817,
"step": 11750
},
{
"ce_loss": 0.8189614808559418,
"epoch": 3.53,
"inp_emb_norm": 0.3555859375,
"loss": 0.8189614808559418,
"masked_top1": 49.02143653869629,
"masked_top5": 74.14399620056152,
"step": 11750,
"top1": 87.84059661865234,
"top5": 96.55142425537109
},
{
"epoch": 3.55,
"grad_norm": 1.065918703633994,
"learning_rate": 0.0001,
"loss": 0.8103,
"step": 11800
},
{
"ce_loss": 0.8101903474330903,
"epoch": 3.55,
"inp_emb_norm": 0.3505078125,
"loss": 0.8101903474330903,
"masked_top1": 49.68892807006836,
"masked_top5": 74.20518287658692,
"step": 11800,
"top1": 88.00088562011719,
"top5": 96.6223454284668
},
{
"epoch": 3.56,
"grad_norm": 0.999942936662604,
"learning_rate": 0.0001,
"loss": 0.8173,
"step": 11850
},
{
"ce_loss": 0.8223925268650055,
"epoch": 3.56,
"inp_emb_norm": 0.3467578125,
"loss": 0.8223925268650055,
"masked_top1": 48.98508232116699,
"masked_top5": 73.06097061157226,
"step": 11850,
"top1": 87.84088226318359,
"top5": 96.52394149780274
},
{
"epoch": 3.58,
"grad_norm": 1.0109380245958302,
"learning_rate": 0.0001,
"loss": 0.8217,
"step": 11900
},
{
"ce_loss": 0.8102488934993743,
"epoch": 3.58,
"inp_emb_norm": 0.3561328125,
"loss": 0.8102488934993743,
"masked_top1": 49.78484992980957,
"masked_top5": 73.83953674316406,
"step": 11900,
"top1": 88.08386886596679,
"top5": 96.51279846191406
},
{
"epoch": 3.59,
"grad_norm": 1.0501857173368994,
"learning_rate": 0.0001,
"loss": 0.8202,
"step": 11950
},
{
"ce_loss": 0.8153278791904449,
"epoch": 3.59,
"inp_emb_norm": 0.3567578125,
"loss": 0.8153278791904449,
"masked_top1": 50.214491806030274,
"masked_top5": 75.15233001708984,
"step": 11950,
"top1": 87.94796691894531,
"top5": 96.6393051147461
},
{
"epoch": 3.61,
"grad_norm": 1.0094284740380721,
"learning_rate": 0.0001,
"loss": 0.8228,
"step": 12000
},
{
"ce_loss": 0.8150149726867676,
"epoch": 3.61,
"inp_emb_norm": 0.34546875,
"loss": 0.8150149726867676,
"masked_top1": 50.50455932617187,
"masked_top5": 75.45603103637696,
"step": 12000,
"top1": 87.88845626831055,
"top5": 96.71615814208984
},
{
"epoch": 3.62,
"grad_norm": 0.9828696027131701,
"learning_rate": 0.0001,
"loss": 0.8268,
"step": 12050
},
{
"ce_loss": 0.8084503662586212,
"epoch": 3.62,
"inp_emb_norm": 0.35515625,
"loss": 0.8084503662586212,
"masked_top1": 49.089233779907225,
"masked_top5": 74.21111480712891,
"step": 12050,
"top1": 87.9230386352539,
"top5": 96.66818649291992
},
{
"epoch": 3.64,
"grad_norm": 1.0364095825846835,
"learning_rate": 0.0001,
"loss": 0.8313,
"step": 12100
},
{
"ce_loss": 0.8248488974571228,
"epoch": 3.64,
"inp_emb_norm": 0.3575,
"loss": 0.8248488974571228,
"masked_top1": 49.23360801696777,
"masked_top5": 73.44429908752441,
"step": 12100,
"top1": 87.82616806030273,
"top5": 96.4687548828125
},
{
"epoch": 3.65,
"grad_norm": 1.032109747501083,
"learning_rate": 0.0001,
"loss": 0.8166,
"step": 12150
},
{
"ce_loss": 0.819879275560379,
"epoch": 3.65,
"inp_emb_norm": 0.3619921875,
"loss": 0.819879275560379,
"masked_top1": 49.493774490356444,
"masked_top5": 74.06904296875,
"step": 12150,
"top1": 87.88977813720703,
"top5": 96.5196435546875
},
{
"epoch": 3.67,
"grad_norm": 1.1132546555196505,
"learning_rate": 0.0001,
"loss": 0.8407,
"step": 12200
},
{
"ce_loss": 0.8416215097904205,
"epoch": 3.67,
"inp_emb_norm": 0.35109375,
"loss": 0.8416215097904205,
"masked_top1": 48.93654960632324,
"masked_top5": 73.06722267150879,
"step": 12200,
"top1": 87.65886123657226,
"top5": 96.42184036254883
},
{
"epoch": 3.68,
"grad_norm": 1.1126594978470823,
"learning_rate": 0.0001,
"loss": 0.8309,
"step": 12250
},
{
"ce_loss": 0.8369515192508697,
"epoch": 3.68,
"inp_emb_norm": 0.3548046875,
"loss": 0.8369515192508697,
"masked_top1": 49.08364143371582,
"masked_top5": 73.77061073303223,
"step": 12250,
"top1": 87.5754817199707,
"top5": 96.51109664916993
},
{
"epoch": 3.7,
"grad_norm": 1.039335313619578,
"learning_rate": 0.0001,
"loss": 0.8335,
"step": 12300
},
{
"ce_loss": 0.8377754426002503,
"epoch": 3.7,
"inp_emb_norm": 0.3561328125,
"loss": 0.8377754426002503,
"masked_top1": 48.12888420104981,
"masked_top5": 72.75807151794433,
"step": 12300,
"top1": 87.64847686767578,
"top5": 96.35865264892578
},
{
"epoch": 3.71,
"grad_norm": 1.0741125902905957,
"learning_rate": 0.0001,
"loss": 0.8311,
"step": 12350
},
{
"ce_loss": 0.8398508429527283,
"epoch": 3.71,
"inp_emb_norm": 0.3508203125,
"loss": 0.8398508429527283,
"masked_top1": 49.14791725158691,
"masked_top5": 73.34809280395508,
"step": 12350,
"top1": 87.55382202148438,
"top5": 96.47370834350586
},
{
"epoch": 3.73,
"grad_norm": 1.0735818209876995,
"learning_rate": 0.0001,
"loss": 0.8271,
"step": 12400
},
{
"ce_loss": 0.8356200730800629,
"epoch": 3.73,
"inp_emb_norm": 0.3551953125,
"loss": 0.8356200730800629,
"masked_top1": 48.54533554077148,
"masked_top5": 73.34476501464843,
"step": 12400,
"top1": 87.71935012817383,
"top5": 96.43182586669921
},
{
"epoch": 3.74,
"grad_norm": 1.1727450080624469,
"learning_rate": 0.0001,
"loss": 0.8438,
"step": 12450
},
{
"ce_loss": 0.834755152463913,
"epoch": 3.74,
"inp_emb_norm": 0.3557421875,
"loss": 0.834755152463913,
"masked_top1": 48.73476402282715,
"masked_top5": 73.74958610534668,
"step": 12450,
"top1": 87.6754948425293,
"top5": 96.48529556274414
},
{
"epoch": 3.76,
"grad_norm": 1.0593532648539608,
"learning_rate": 0.0001,
"loss": 0.8302,
"step": 12500
},
{
"ce_loss": 0.8266095387935638,
"epoch": 3.76,
"inp_emb_norm": 0.35234375,
"loss": 0.8266095387935638,
"masked_top1": 49.589872207641605,
"masked_top5": 73.7888671875,
"step": 12500,
"top1": 87.81543869018554,
"top5": 96.54735931396485
},
{
"epoch": 3.77,
"grad_norm": 1.0465892187844261,
"learning_rate": 0.0001,
"loss": 0.8365,
"step": 12550
},
{
"ce_loss": 0.8360547876358032,
"epoch": 3.77,
"inp_emb_norm": 0.352265625,
"loss": 0.8360547876358032,
"masked_top1": 48.81367431640625,
"masked_top5": 73.35512954711913,
"step": 12550,
"top1": 87.50978561401367,
"top5": 96.48541564941407
},
{
"epoch": 3.79,
"grad_norm": 0.9861748502692437,
"learning_rate": 0.0001,
"loss": 0.8346,
"step": 12600
},
{
"ce_loss": 0.8302571523189545,
"epoch": 3.79,
"inp_emb_norm": 0.353515625,
"loss": 0.8302571523189545,
"masked_top1": 50.41721633911133,
"masked_top5": 74.37043991088868,
"step": 12600,
"top1": 87.77974655151367,
"top5": 96.57415542602538
},
{
"epoch": 3.8,
"grad_norm": 1.0316691583599322,
"learning_rate": 0.0001,
"loss": 0.8311,
"step": 12650
},
{
"ce_loss": 0.8366478300094604,
"epoch": 3.8,
"inp_emb_norm": 0.358125,
"loss": 0.8366478300094604,
"masked_top1": 49.738552017211916,
"masked_top5": 72.98569618225098,
"step": 12650,
"top1": 87.68307205200195,
"top5": 96.43355926513672
},
{
"epoch": 3.82,
"grad_norm": 1.079061755706453,
"learning_rate": 0.0001,
"loss": 0.8409,
"step": 12700
},
{
"ce_loss": 0.8572568881511688,
"epoch": 3.82,
"inp_emb_norm": 0.358984375,
"loss": 0.8572568881511688,
"masked_top1": 47.16713302612305,
"masked_top5": 71.66687362670899,
"step": 12700,
"top1": 87.3316879272461,
"top5": 96.28264602661133
},
{
"epoch": 3.83,
"grad_norm": 1.1113987431562926,
"learning_rate": 0.0001,
"loss": 0.8278,
"step": 12750
},
{
"ce_loss": 0.8314487624168396,
"epoch": 3.83,
"inp_emb_norm": 0.35171875,
"loss": 0.8314487624168396,
"masked_top1": 49.02839782714844,
"masked_top5": 73.43093780517579,
"step": 12750,
"top1": 87.7519515991211,
"top5": 96.49931549072265
},
{
"epoch": 3.85,
"grad_norm": 1.0978179635851295,
"learning_rate": 0.0001,
"loss": 0.8332,
"step": 12800
},
{
"ce_loss": 0.8298409843444824,
"epoch": 3.85,
"inp_emb_norm": 0.3492578125,
"loss": 0.8298409843444824,
"masked_top1": 50.06224174499512,
"masked_top5": 73.62540901184082,
"step": 12800,
"top1": 87.73297302246094,
"top5": 96.57409362792968
},
{
"epoch": 3.86,
"grad_norm": 0.9650541842630127,
"learning_rate": 0.0001,
"loss": 0.8372,
"step": 12850
},
{
"ce_loss": 0.8380164694786072,
"epoch": 3.86,
"inp_emb_norm": 0.35796875,
"loss": 0.8380164694786072,
"masked_top1": 48.93704933166504,
"masked_top5": 72.84653518676758,
"step": 12850,
"top1": 87.66362426757813,
"top5": 96.39145156860351
},
{
"epoch": 3.88,
"grad_norm": 0.9849217897777546,
"learning_rate": 0.0001,
"loss": 0.8489,
"step": 12900
},
{
"ce_loss": 0.8715607190132141,
"epoch": 3.88,
"inp_emb_norm": 0.35453125,
"loss": 0.8715607190132141,
"masked_top1": 46.82566688537597,
"masked_top5": 71.51848426818847,
"step": 12900,
"top1": 87.30749450683594,
"top5": 96.20992965698242
},
{
"epoch": 3.89,
"grad_norm": 1.0587318645397439,
"learning_rate": 0.0001,
"loss": 0.8513,
"step": 12950
},
{
"ce_loss": 0.8459929120540619,
"epoch": 3.89,
"inp_emb_norm": 0.3545703125,
"loss": 0.8459929120540619,
"masked_top1": 47.957108154296876,
"masked_top5": 72.73884170532227,
"step": 12950,
"top1": 87.4450048828125,
"top5": 96.42510208129883
},
{
"epoch": 3.91,
"grad_norm": 1.008453804185378,
"learning_rate": 0.0001,
"loss": 0.8499,
"step": 13000
},
{
"ce_loss": 0.8543168365955353,
"epoch": 3.91,
"inp_emb_norm": 0.35265625,
"loss": 0.8543168365955353,
"masked_top1": 48.10952255249023,
"masked_top5": 72.79139190673828,
"step": 13000,
"top1": 87.43768081665038,
"top5": 96.36415054321289
},
{
"epoch": 3.92,
"grad_norm": 1.1029061736641703,
"learning_rate": 0.0001,
"loss": 0.8477,
"step": 13050
},
{
"ce_loss": 0.8511655080318451,
"epoch": 3.92,
"inp_emb_norm": 0.3517578125,
"loss": 0.8511655080318451,
"masked_top1": 47.82553955078125,
"masked_top5": 72.87044403076172,
"step": 13050,
"top1": 87.3859359741211,
"top5": 96.47525772094727
},
{
"epoch": 3.94,
"grad_norm": 1.075253739387409,
"learning_rate": 0.0001,
"loss": 0.8483,
"step": 13100
},
{
"ce_loss": 0.8585194671154022,
"epoch": 3.94,
"inp_emb_norm": 0.3530859375,
"loss": 0.8585194671154022,
"masked_top1": 48.05498847961426,
"masked_top5": 72.10926498413086,
"step": 13100,
"top1": 87.33003082275391,
"top5": 96.38634353637696
},
{
"epoch": 3.95,
"grad_norm": 1.06337585097062,
"learning_rate": 0.0001,
"loss": 0.8512,
"step": 13150
},
{
"ce_loss": 0.8486355948448181,
"epoch": 3.95,
"inp_emb_norm": 0.3512109375,
"loss": 0.8486355948448181,
"masked_top1": 47.7544295501709,
"masked_top5": 72.88016632080078,
"step": 13150,
"top1": 87.42533172607422,
"top5": 96.4359049987793
},
{
"epoch": 3.97,
"grad_norm": 1.0262462724728467,
"learning_rate": 0.0001,
"loss": 0.8484,
"step": 13200
},
{
"ce_loss": 0.850223616361618,
"epoch": 3.97,
"inp_emb_norm": 0.3545703125,
"loss": 0.850223616361618,
"masked_top1": 49.058433532714844,
"masked_top5": 73.50660820007325,
"step": 13200,
"top1": 87.42515289306641,
"top5": 96.44447158813476
},
{
"epoch": 3.98,
"grad_norm": 1.1183732342762185,
"learning_rate": 0.0001,
"loss": 0.8496,
"step": 13250
},
{
"ce_loss": 0.8533316111564636,
"epoch": 3.98,
"inp_emb_norm": 0.3624609375,
"loss": 0.8533316111564636,
"masked_top1": 47.88730712890625,
"masked_top5": 72.5444091796875,
"step": 13250,
"top1": 87.41307250976563,
"top5": 96.35140655517579
},
{
"epoch": 4.0,
"grad_norm": 1.0167762679603471,
"learning_rate": 0.0001,
"loss": 0.8556,
"step": 13300
},
{
"ce_loss": 0.8482355761528015,
"epoch": 4.0,
"inp_emb_norm": 0.36,
"loss": 0.8482355761528015,
"masked_top1": 48.77355934143066,
"masked_top5": 72.95730499267579,
"step": 13300,
"top1": 87.49448638916016,
"top5": 96.36580871582031
},
{
"epoch": 4.02,
"grad_norm": 0.9619800299228121,
"learning_rate": 0.0001,
"loss": 0.4647,
"step": 13350
},
{
"ce_loss": 0.4654207336902618,
"epoch": 4.02,
"inp_emb_norm": 0.3638671875,
"loss": 0.4654207336902618,
"masked_top1": 69.25712104797363,
"masked_top5": 90.11044219970704,
"step": 13350,
"top1": 92.98291519165039,
"top5": 98.58249725341797
},
{
"epoch": 4.03,
"grad_norm": 1.0911634549199811,
"learning_rate": 0.0001,
"loss": 0.4589,
"step": 13400
},
{
"ce_loss": 0.45429064869880675,
"epoch": 4.03,
"inp_emb_norm": 0.3570703125,
"loss": 0.45429064869880675,
"masked_top1": 69.99664253234863,
"masked_top5": 90.96843032836914,
"step": 13400,
"top1": 93.03666076660156,
"top5": 98.7020344543457
},
{
"epoch": 4.05,
"grad_norm": 1.055104414353352,
"learning_rate": 0.0001,
"loss": 0.463,
"step": 13450
},
{
"ce_loss": 0.4745191448926926,
"epoch": 4.05,
"inp_emb_norm": 0.3669921875,
"loss": 0.4745191448926926,
"masked_top1": 67.61885986328124,
"masked_top5": 89.49128158569336,
"step": 13450,
"top1": 92.77124450683594,
"top5": 98.49986145019531
},
{
"epoch": 4.06,
"grad_norm": 1.134443079230167,
"learning_rate": 0.0001,
"loss": 0.4584,
"step": 13500
},
{
"ce_loss": 0.45621369063854217,
"epoch": 4.06,
"inp_emb_norm": 0.3608984375,
"loss": 0.45621369063854217,
"masked_top1": 68.7521329498291,
"masked_top5": 90.43077987670898,
"step": 13500,
"top1": 92.95487716674805,
"top5": 98.66310028076173
},
{
"epoch": 4.08,
"grad_norm": 1.0109539437689439,
"learning_rate": 0.0001,
"loss": 0.4666,
"step": 13550
},
{
"ce_loss": 0.47258255898952484,
"epoch": 4.08,
"inp_emb_norm": 0.3544921875,
"loss": 0.47258255898952484,
"masked_top1": 68.48685668945312,
"masked_top5": 90.1821842956543,
"step": 13550,
"top1": 92.76567596435547,
"top5": 98.59864028930664
},
{
"epoch": 4.09,
"grad_norm": 1.105379838900382,
"learning_rate": 0.0001,
"loss": 0.4745,
"step": 13600
},
{
"ce_loss": 0.4720061844587326,
"epoch": 4.09,
"inp_emb_norm": 0.3646484375,
"loss": 0.4720061844587326,
"masked_top1": 67.28578758239746,
"masked_top5": 88.81673706054687,
"step": 13600,
"top1": 92.7738314819336,
"top5": 98.49870178222656
},
{
"epoch": 4.11,
"grad_norm": 1.0445145007542662,
"learning_rate": 0.0001,
"loss": 0.4766,
"step": 13650
},
{
"ce_loss": 0.47552256405353543,
"epoch": 4.11,
"inp_emb_norm": 0.363515625,
"loss": 0.47552256405353543,
"masked_top1": 67.69092002868652,
"masked_top5": 89.2786703491211,
"step": 13650,
"top1": 92.75520309448243,
"top5": 98.50775588989258
},
{
"epoch": 4.12,
"grad_norm": 1.0910427976634176,
"learning_rate": 0.0001,
"loss": 0.4854,
"step": 13700
},
{
"ce_loss": 0.4856406021118164,
"epoch": 4.12,
"inp_emb_norm": 0.356484375,
"loss": 0.4856406021118164,
"masked_top1": 67.06616600036621,
"masked_top5": 89.24904571533203,
"step": 13700,
"top1": 92.60382766723633,
"top5": 98.53153411865235
},
{
"epoch": 4.14,
"grad_norm": 1.06712952932014,
"learning_rate": 0.0001,
"loss": 0.487,
"step": 13750
},
{
"ce_loss": 0.47847113251686096,
"epoch": 4.14,
"inp_emb_norm": 0.3590234375,
"loss": 0.47847113251686096,
"masked_top1": 67.56794845581055,
"masked_top5": 89.76878845214844,
"step": 13750,
"top1": 92.65390167236328,
"top5": 98.56003234863282
},
{
"epoch": 4.15,
"grad_norm": 1.1426918899106289,
"learning_rate": 0.0001,
"loss": 0.4945,
"step": 13800
},
{
"ce_loss": 0.502599538564682,
"epoch": 4.15,
"inp_emb_norm": 0.3619140625,
"loss": 0.502599538564682,
"masked_top1": 64.62048377990723,
"masked_top5": 87.96943252563477,
"step": 13800,
"top1": 92.32529083251953,
"top5": 98.33552108764648
},
{
"epoch": 4.17,
"grad_norm": 1.0640372514854988,
"learning_rate": 0.0001,
"loss": 0.4917,
"step": 13850
},
{
"ce_loss": 0.48276497185230255,
"epoch": 4.17,
"inp_emb_norm": 0.3612890625,
"loss": 0.48276497185230255,
"masked_top1": 66.87754806518555,
"masked_top5": 89.40460830688477,
"step": 13850,
"top1": 92.6048323059082,
"top5": 98.5196612548828
},
{
"epoch": 4.18,
"grad_norm": 1.0976346428642292,
"learning_rate": 0.0001,
"loss": 0.5333,
"step": 13900
},
{
"ce_loss": 0.5117572790384293,
"epoch": 4.18,
"inp_emb_norm": 0.3601171875,
"loss": 0.5117572790384293,
"masked_top1": 64.48737777709961,
"masked_top5": 87.95527038574218,
"step": 13900,
"top1": 92.178935546875,
"top5": 98.36900207519531
},
{
"epoch": 4.2,
"grad_norm": 1.0719371859267879,
"learning_rate": 0.0001,
"loss": 0.5027,
"step": 13950
},
{
"ce_loss": 0.506566162109375,
"epoch": 4.2,
"inp_emb_norm": 0.36140625,
"loss": 0.506566162109375,
"masked_top1": 64.94912460327149,
"masked_top5": 88.18265213012695,
"step": 13950,
"top1": 92.29801971435546,
"top5": 98.34425674438477
},
{
"epoch": 4.21,
"grad_norm": 1.1253588379337218,
"learning_rate": 0.0001,
"loss": 0.5035,
"step": 14000
},
{
"ce_loss": 0.5121178191900253,
"epoch": 4.21,
"inp_emb_norm": 0.3689453125,
"loss": 0.5121178191900253,
"masked_top1": 64.75304649353028,
"masked_top5": 88.02303543090821,
"step": 14000,
"top1": 92.1805793762207,
"top5": 98.36868255615235
},
{
"epoch": 4.23,
"grad_norm": 1.0564639466498615,
"learning_rate": 0.0001,
"loss": 0.5095,
"step": 14050
},
{
"ce_loss": 0.5051852202415467,
"epoch": 4.23,
"inp_emb_norm": 0.3665234375,
"loss": 0.5051852202415467,
"masked_top1": 64.94737861633301,
"masked_top5": 87.73623580932617,
"step": 14050,
"top1": 92.28643096923828,
"top5": 98.35600860595703
},
{
"epoch": 4.24,
"grad_norm": 1.054309146882263,
"learning_rate": 0.0001,
"loss": 0.5128,
"step": 14100
},
{
"ce_loss": 0.5164009261131287,
"epoch": 4.24,
"inp_emb_norm": 0.3551171875,
"loss": 0.5164009261131287,
"masked_top1": 64.0973461151123,
"masked_top5": 87.07987808227539,
"step": 14100,
"top1": 92.10574264526367,
"top5": 98.30786682128907
},
{
"epoch": 4.26,
"grad_norm": 1.074877585526465,
"learning_rate": 0.0001,
"loss": 0.518,
"step": 14150
},
{
"ce_loss": 0.51701247215271,
"epoch": 4.26,
"inp_emb_norm": 0.36,
"loss": 0.51701247215271,
"masked_top1": 63.153812866210934,
"masked_top5": 87.39238418579102,
"step": 14150,
"top1": 92.00238998413086,
"top5": 98.3110905456543
},
{
"epoch": 4.27,
"grad_norm": 1.0964266251443575,
"learning_rate": 0.0001,
"loss": 0.5182,
"step": 14200
},
{
"ce_loss": 0.5284781348705292,
"epoch": 4.27,
"inp_emb_norm": 0.362265625,
"loss": 0.5284781348705292,
"masked_top1": 63.75819221496582,
"masked_top5": 87.80308670043945,
"step": 14200,
"top1": 91.9805062866211,
"top5": 98.30166961669921
},
{
"epoch": 4.29,
"grad_norm": 0.9313191633498797,
"learning_rate": 0.0001,
"loss": 0.5323,
"step": 14250
},
{
"ce_loss": 0.5334083133935928,
"epoch": 4.29,
"inp_emb_norm": 0.366328125,
"loss": 0.5334083133935928,
"masked_top1": 63.2767244720459,
"masked_top5": 86.49884506225585,
"step": 14250,
"top1": 91.98057006835937,
"top5": 98.21974060058594
},
{
"epoch": 4.3,
"grad_norm": 1.049137459405467,
"learning_rate": 0.0001,
"loss": 0.5128,
"step": 14300
},
{
"ce_loss": 0.5050585663318634,
"epoch": 4.3,
"inp_emb_norm": 0.3669140625,
"loss": 0.5050585663318634,
"masked_top1": 65.1010538482666,
"masked_top5": 88.11703536987305,
"step": 14300,
"top1": 92.21394805908203,
"top5": 98.39463333129883
},
{
"epoch": 4.32,
"grad_norm": 1.0912111332884709,
"learning_rate": 0.0001,
"loss": 0.5219,
"step": 14350
},
{
"ce_loss": 0.5061863285303115,
"epoch": 4.32,
"inp_emb_norm": 0.3662890625,
"loss": 0.5061863285303115,
"masked_top1": 64.16183708190918,
"masked_top5": 87.73145401000977,
"step": 14350,
"top1": 92.20845031738281,
"top5": 98.3388949584961
},
{
"epoch": 4.33,
"grad_norm": 1.0296955736225974,
"learning_rate": 0.0001,
"loss": 0.531,
"step": 14400
},
{
"ce_loss": 0.5429442119598389,
"epoch": 4.33,
"inp_emb_norm": 0.3691796875,
"loss": 0.5429442119598389,
"masked_top1": 61.77721855163574,
"masked_top5": 85.85552307128906,
"step": 14400,
"top1": 91.75323486328125,
"top5": 98.1361979675293
},
{
"epoch": 4.35,
"grad_norm": 1.1454314011018045,
"learning_rate": 0.0001,
"loss": 0.531,
"step": 14450
},
{
"ce_loss": 0.5351821321249008,
"epoch": 4.35,
"inp_emb_norm": 0.369140625,
"loss": 0.5351821321249008,
"masked_top1": 62.75261306762695,
"masked_top5": 86.70513565063476,
"step": 14450,
"top1": 91.81992034912109,
"top5": 98.24479461669922
},
{
"epoch": 4.36,
"grad_norm": 1.0627879361115289,
"learning_rate": 0.0001,
"loss": 0.5354,
"step": 14500
},
{
"ce_loss": 0.5213436669111252,
"epoch": 4.36,
"inp_emb_norm": 0.37203125,
"loss": 0.5213436669111252,
"masked_top1": 62.81369560241699,
"masked_top5": 86.35222900390625,
"step": 14500,
"top1": 92.11218536376953,
"top5": 98.20223754882812
},
{
"epoch": 4.38,
"grad_norm": 1.0187687495343714,
"learning_rate": 0.0001,
"loss": 0.5316,
"step": 14550
},
{
"ce_loss": 0.5346812665462494,
"epoch": 4.38,
"inp_emb_norm": 0.364140625,
"loss": 0.5346812665462494,
"masked_top1": 63.61943054199219,
"masked_top5": 87.0892349243164,
"step": 14550,
"top1": 91.8150325012207,
"top5": 98.27966354370118
},
{
"epoch": 4.39,
"grad_norm": 0.9591126150175121,
"learning_rate": 0.0001,
"loss": 0.5376,
"step": 14600
},
{
"ce_loss": 0.5374365419149398,
"epoch": 4.39,
"inp_emb_norm": 0.361484375,
"loss": 0.5374365419149398,
"masked_top1": 61.88925651550293,
"masked_top5": 86.4749966430664,
"step": 14600,
"top1": 91.70243728637695,
"top5": 98.21218734741211
},
{
"epoch": 4.41,
"grad_norm": 1.1233501033048685,
"learning_rate": 0.0001,
"loss": 0.5384,
"step": 14650
},
{
"ce_loss": 0.5430423647165299,
"epoch": 4.41,
"inp_emb_norm": 0.3658984375,
"loss": 0.5430423647165299,
"masked_top1": 62.05436401367187,
"masked_top5": 86.22527664184571,
"step": 14650,
"top1": 91.75454711914062,
"top5": 98.14460327148437
},
{
"epoch": 4.42,
"grad_norm": 1.0492889712367255,
"learning_rate": 0.0001,
"loss": 0.5393,
"step": 14700
},
{
"ce_loss": 0.5455965805053711,
"epoch": 4.42,
"inp_emb_norm": 0.369140625,
"loss": 0.5455965805053711,
"masked_top1": 61.81470611572266,
"masked_top5": 85.87024856567383,
"step": 14700,
"top1": 91.6540138244629,
"top5": 98.14372894287109
},
{
"epoch": 4.44,
"grad_norm": 1.0520563782836545,
"learning_rate": 0.0001,
"loss": 0.5473,
"step": 14750
},
{
"ce_loss": 0.5409998238086701,
"epoch": 4.44,
"inp_emb_norm": 0.3708984375,
"loss": 0.5409998238086701,
"masked_top1": 62.29138427734375,
"masked_top5": 86.59371627807617,
"step": 14750,
"top1": 91.61244201660156,
"top5": 98.20165710449218
},
{
"epoch": 4.45,
"grad_norm": 1.0819587007503622,
"learning_rate": 0.0001,
"loss": 0.5475,
"step": 14800
},
{
"ce_loss": 0.5518389946222305,
"epoch": 4.45,
"inp_emb_norm": 0.3651953125,
"loss": 0.5518389946222305,
"masked_top1": 60.76948570251465,
"masked_top5": 85.64509750366211,
"step": 14800,
"top1": 91.58786865234374,
"top5": 98.13917205810547
},
{
"epoch": 4.47,
"grad_norm": 1.2070825391159268,
"learning_rate": 0.0001,
"loss": 0.5455,
"step": 14850
},
{
"ce_loss": 0.5443538892269134,
"epoch": 4.47,
"inp_emb_norm": 0.3718359375,
"loss": 0.5443538892269134,
"masked_top1": 62.73788932800293,
"masked_top5": 86.95017120361328,
"step": 14850,
"top1": 91.70748413085937,
"top5": 98.2123405456543
},
{
"epoch": 4.48,
"grad_norm": 1.0786306746042886,
"learning_rate": 0.0001,
"loss": 0.5447,
"step": 14900
},
{
"ce_loss": 0.5514276492595672,
"epoch": 4.48,
"inp_emb_norm": 0.36875,
"loss": 0.5514276492595672,
"masked_top1": 62.11178161621094,
"masked_top5": 85.72534118652344,
"step": 14900,
"top1": 91.74148101806641,
"top5": 98.11538375854492
},
{
"epoch": 4.5,
"grad_norm": 0.9911108641898027,
"learning_rate": 0.0001,
"loss": 0.5571,
"step": 14950
},
{
"ce_loss": 0.5549844616651535,
"epoch": 4.5,
"inp_emb_norm": 0.366484375,
"loss": 0.5549844616651535,
"masked_top1": 61.723852920532224,
"masked_top5": 86.07279510498047,
"step": 14950,
"top1": 91.52112564086914,
"top5": 98.13692977905274
},
{
"epoch": 4.51,
"grad_norm": 1.0479818912700751,
"learning_rate": 0.0001,
"loss": 0.5508,
"step": 15000
},
{
"ce_loss": 0.5503324097394944,
"epoch": 4.51,
"inp_emb_norm": 0.36609375,
"loss": 0.5503324097394944,
"masked_top1": 62.31337753295898,
"masked_top5": 85.98730865478515,
"step": 15000,
"top1": 91.61835815429687,
"top5": 98.14640243530273
},
{
"epoch": 4.53,
"grad_norm": 1.0126345802253716,
"learning_rate": 0.0001,
"loss": 0.5604,
"step": 15050
},
{
"ce_loss": 0.5613218837976456,
"epoch": 4.53,
"inp_emb_norm": 0.368203125,
"loss": 0.5613218837976456,
"masked_top1": 60.66889373779297,
"masked_top5": 85.60351181030273,
"step": 15050,
"top1": 91.41504837036133,
"top5": 98.12169372558594
},
{
"epoch": 4.54,
"grad_norm": 1.1194678195459478,
"learning_rate": 0.0001,
"loss": 0.558,
"step": 15100
},
{
"ce_loss": 0.5630833846330643,
"epoch": 4.54,
"inp_emb_norm": 0.368828125,
"loss": 0.5630833846330643,
"masked_top1": 60.917673873901364,
"masked_top5": 85.1576773071289,
"step": 15100,
"top1": 91.41613555908204,
"top5": 98.04640029907226
},
{
"epoch": 4.56,
"grad_norm": 1.0532041635279352,
"learning_rate": 0.0001,
"loss": 0.5558,
"step": 15150
},
{
"ce_loss": 0.5506490439176559,
"epoch": 4.56,
"inp_emb_norm": 0.366640625,
"loss": 0.5506490439176559,
"masked_top1": 62.93031044006348,
"masked_top5": 86.40555801391602,
"step": 15150,
"top1": 91.61476593017578,
"top5": 98.21818695068359
},
{
"epoch": 4.57,
"grad_norm": 1.1553777996600965,
"learning_rate": 0.0001,
"loss": 0.5592,
"step": 15200
},
{
"ce_loss": 0.5720086497068405,
"epoch": 4.57,
"inp_emb_norm": 0.367890625,
"loss": 0.5720086497068405,
"masked_top1": 60.15533386230469,
"masked_top5": 84.74554382324219,
"step": 15200,
"top1": 91.33564956665039,
"top5": 98.00524505615235
},
{
"epoch": 4.59,
"grad_norm": 1.0565117376954045,
"learning_rate": 0.0001,
"loss": 0.5639,
"step": 15250
},
{
"ce_loss": 0.5550442606210708,
"epoch": 4.59,
"inp_emb_norm": 0.3610546875,
"loss": 0.5550442606210708,
"masked_top1": 61.29122528076172,
"masked_top5": 85.56437545776367,
"step": 15250,
"top1": 91.44481323242188,
"top5": 98.13257965087891
},
{
"epoch": 4.6,
"grad_norm": 0.971805815056899,
"learning_rate": 0.0001,
"loss": 0.5632,
"step": 15300
},
{
"ce_loss": 0.565330091714859,
"epoch": 4.6,
"inp_emb_norm": 0.3659375,
"loss": 0.565330091714859,
"masked_top1": 60.39768196105957,
"masked_top5": 85.04548843383789,
"step": 15300,
"top1": 91.39098709106446,
"top5": 98.0403810119629
},
{
"epoch": 4.62,
"grad_norm": 1.0545177592362711,
"learning_rate": 0.0001,
"loss": 0.5666,
"step": 15350
},
{
"ce_loss": 0.5629042333364487,
"epoch": 4.62,
"inp_emb_norm": 0.363046875,
"loss": 0.5629042333364487,
"masked_top1": 61.29995155334473,
"masked_top5": 85.63533966064453,
"step": 15350,
"top1": 91.3496403503418,
"top5": 98.15489151000976
},
{
"epoch": 4.63,
"grad_norm": 1.0664326850250532,
"learning_rate": 0.0001,
"loss": 0.5688,
"step": 15400
},
{
"ce_loss": 0.5751272231340409,
"epoch": 4.63,
"inp_emb_norm": 0.3648046875,
"loss": 0.5751272231340409,
"masked_top1": 59.40922256469727,
"masked_top5": 84.36576995849609,
"step": 15400,
"top1": 91.27351089477538,
"top5": 97.98653579711915
},
{
"epoch": 4.65,
"grad_norm": 1.1863731412253264,
"learning_rate": 0.0001,
"loss": 0.5736,
"step": 15450
},
{
"ce_loss": 0.5713528543710709,
"epoch": 4.65,
"inp_emb_norm": 0.36609375,
"loss": 0.5713528543710709,
"masked_top1": 60.48031639099121,
"masked_top5": 84.88833801269531,
"step": 15450,
"top1": 91.37643188476562,
"top5": 98.01537475585937
},
{
"epoch": 4.66,
"grad_norm": 1.0582685713420434,
"learning_rate": 0.0001,
"loss": 0.5666,
"step": 15500
},
{
"ce_loss": 0.5606839144229889,
"epoch": 4.66,
"inp_emb_norm": 0.3706640625,
"loss": 0.5606839144229889,
"masked_top1": 61.53562179565429,
"masked_top5": 85.30425521850586,
"step": 15500,
"top1": 91.48294464111328,
"top5": 98.07853866577149
},
{
"epoch": 4.68,
"grad_norm": 1.1085095153878863,
"learning_rate": 0.0001,
"loss": 0.5639,
"step": 15550
},
{
"ce_loss": 0.5658587354421616,
"epoch": 4.68,
"inp_emb_norm": 0.3694921875,
"loss": 0.5658587354421616,
"masked_top1": 60.92467597961426,
"masked_top5": 85.25927993774414,
"step": 15550,
"top1": 91.32387847900391,
"top5": 98.04909301757813
},
{
"epoch": 4.69,
"grad_norm": 1.0956454635831958,
"learning_rate": 0.0001,
"loss": 0.5628,
"step": 15600
},
{
"ce_loss": 0.5535443860292435,
"epoch": 4.69,
"inp_emb_norm": 0.3674609375,
"loss": 0.5535443860292435,
"masked_top1": 62.16223007202149,
"masked_top5": 85.78218490600585,
"step": 15600,
"top1": 91.56584075927735,
"top5": 98.14968353271485
},
{
"epoch": 4.71,
"grad_norm": 1.0299728685580238,
"learning_rate": 0.0001,
"loss": 0.5815,
"step": 15650
},
{
"ce_loss": 0.5903949171304703,
"epoch": 4.71,
"inp_emb_norm": 0.366796875,
"loss": 0.5903949171304703,
"masked_top1": 59.38263565063477,
"masked_top5": 84.16266036987305,
"step": 15650,
"top1": 91.00887252807617,
"top5": 97.9221858215332
},
{
"epoch": 4.72,
"grad_norm": 1.0470383149335234,
"learning_rate": 0.0001,
"loss": 0.5781,
"step": 15700
},
{
"ce_loss": 0.5768050736188889,
"epoch": 4.72,
"inp_emb_norm": 0.3702734375,
"loss": 0.5768050736188889,
"masked_top1": 60.77295166015625,
"masked_top5": 84.56514495849609,
"step": 15700,
"top1": 91.20448944091797,
"top5": 97.95070877075196
},
{
"epoch": 4.74,
"grad_norm": 1.0460925861543287,
"learning_rate": 0.0001,
"loss": 0.5703,
"step": 15750
},
{
"ce_loss": 0.5766702961921691,
"epoch": 4.74,
"inp_emb_norm": 0.367265625,
"loss": 0.5766702961921691,
"masked_top1": 60.788666229248044,
"masked_top5": 84.2236555480957,
"step": 15750,
"top1": 91.25414123535157,
"top5": 97.99418731689452
},
{
"epoch": 4.75,
"grad_norm": 1.0639938057731058,
"learning_rate": 0.0001,
"loss": 0.576,
"step": 15800
},
{
"ce_loss": 0.5707284951210022,
"epoch": 4.75,
"inp_emb_norm": 0.37234375,
"loss": 0.5707284951210022,
"masked_top1": 60.47222633361817,
"masked_top5": 85.08194046020508,
"step": 15800,
"top1": 91.27689331054688,
"top5": 98.0308624267578
},
{
"epoch": 4.77,
"grad_norm": 1.0685967682651536,
"learning_rate": 0.0001,
"loss": 0.5746,
"step": 15850
},
{
"ce_loss": 0.57530591070652,
"epoch": 4.77,
"inp_emb_norm": 0.369296875,
"loss": 0.57530591070652,
"masked_top1": 60.0161856842041,
"masked_top5": 84.5344613647461,
"step": 15850,
"top1": 91.30489456176758,
"top5": 98.01641876220702
},
{
"epoch": 4.78,
"grad_norm": 1.0744915511743809,
"learning_rate": 0.0001,
"loss": 0.5806,
"step": 15900
},
{
"ce_loss": 0.5880693066120147,
"epoch": 4.78,
"inp_emb_norm": 0.37296875,
"loss": 0.5880693066120147,
"masked_top1": 59.781610260009764,
"masked_top5": 84.03608184814453,
"step": 15900,
"top1": 91.16118774414062,
"top5": 97.90285369873047
},
{
"epoch": 4.8,
"grad_norm": 1.1777116989988736,
"learning_rate": 0.0001,
"loss": 0.5841,
"step": 15950
},
{
"ce_loss": 0.5911537754535675,
"epoch": 4.8,
"inp_emb_norm": 0.3667578125,
"loss": 0.5911537754535675,
"masked_top1": 59.32529609680176,
"masked_top5": 84.31457275390625,
"step": 15950,
"top1": 90.98917724609375,
"top5": 97.9403483581543
},
{
"epoch": 4.81,
"grad_norm": 1.1564579774891486,
"learning_rate": 0.0001,
"loss": 0.5796,
"step": 16000
},
{
"ce_loss": 0.5745090502500534,
"epoch": 4.81,
"inp_emb_norm": 0.3758203125,
"loss": 0.5745090502500534,
"masked_top1": 60.10156044006348,
"masked_top5": 85.1797428894043,
"step": 16000,
"top1": 91.18394180297851,
"top5": 98.0737141418457
},
{
"epoch": 4.83,
"grad_norm": 1.1492330940204298,
"learning_rate": 0.0001,
"loss": 0.5883,
"step": 16050
},
{
"ce_loss": 0.5925427573919296,
"epoch": 4.83,
"inp_emb_norm": 0.3705078125,
"loss": 0.5925427573919296,
"masked_top1": 59.59210731506348,
"masked_top5": 84.50634429931641,
"step": 16050,
"top1": 91.06646270751953,
"top5": 97.93232162475586
},
{
"epoch": 4.84,
"grad_norm": 1.0605567187549139,
"learning_rate": 0.0001,
"loss": 0.5859,
"step": 16100
},
{
"ce_loss": 0.584769184589386,
"epoch": 4.84,
"inp_emb_norm": 0.376796875,
"loss": 0.584769184589386,
"masked_top1": 60.4832283782959,
"masked_top5": 84.92732849121094,
"step": 16100,
"top1": 91.11549453735351,
"top5": 97.98623245239258
},
{
"epoch": 4.86,
"grad_norm": 1.0454099861852648,
"learning_rate": 0.0001,
"loss": 0.5788,
"step": 16150
},
{
"ce_loss": 0.5863047724962235,
"epoch": 4.86,
"inp_emb_norm": 0.3832421875,
"loss": 0.5863047724962235,
"masked_top1": 58.98088394165039,
"masked_top5": 84.14134887695313,
"step": 16150,
"top1": 91.02991897583007,
"top5": 97.89835571289062
},
{
"epoch": 4.87,
"grad_norm": 1.1813560843799207,
"learning_rate": 0.0001,
"loss": 0.5937,
"step": 16200
},
{
"ce_loss": 0.5899865692853927,
"epoch": 4.87,
"inp_emb_norm": 0.3682421875,
"loss": 0.5899865692853927,
"masked_top1": 59.65545417785645,
"masked_top5": 83.82899841308594,
"step": 16200,
"top1": 90.97882614135742,
"top5": 97.96341979980468
},
{
"epoch": 4.89,
"grad_norm": 1.1257949963772835,
"learning_rate": 0.0001,
"loss": 0.6029,
"step": 16250
},
{
"ce_loss": 0.6203358447551728,
"epoch": 4.89,
"inp_emb_norm": 0.3719921875,
"loss": 0.6203358447551728,
"masked_top1": 58.96188240051269,
"masked_top5": 83.78718704223633,
"step": 16250,
"top1": 90.81191055297852,
"top5": 97.87782455444336
},
{
"epoch": 4.9,
"grad_norm": 1.0932714591709916,
"learning_rate": 0.0001,
"loss": 0.5876,
"step": 16300
},
{
"ce_loss": 0.5763556951284409,
"epoch": 4.9,
"inp_emb_norm": 0.37578125,
"loss": 0.5763556951284409,
"masked_top1": 60.64178207397461,
"masked_top5": 85.0282049560547,
"step": 16300,
"top1": 91.18880279541015,
"top5": 98.01950302124024
},
{
"epoch": 4.92,
"grad_norm": 0.9495406306411024,
"learning_rate": 0.0001,
"loss": 0.5891,
"step": 16350
},
{
"ce_loss": 0.5920463001728058,
"epoch": 4.92,
"inp_emb_norm": 0.3779296875,
"loss": 0.5920463001728058,
"masked_top1": 60.18883232116699,
"masked_top5": 83.78975204467774,
"step": 16350,
"top1": 91.02826797485352,
"top5": 97.88983917236328
},
{
"epoch": 4.93,
"grad_norm": 1.117017854175032,
"learning_rate": 0.0001,
"loss": 0.5989,
"step": 16400
},
{
"ce_loss": 0.6076039922237396,
"epoch": 4.93,
"inp_emb_norm": 0.3727734375,
"loss": 0.6076039922237396,
"masked_top1": 58.69016136169434,
"masked_top5": 83.0786215209961,
"step": 16400,
"top1": 90.75551071166993,
"top5": 97.82947280883789
},
{
"epoch": 4.95,
"grad_norm": 1.071523606880974,
"learning_rate": 0.0001,
"loss": 0.6012,
"step": 16450
},
{
"ce_loss": 0.6065960395336151,
"epoch": 4.95,
"inp_emb_norm": 0.380703125,
"loss": 0.6065960395336151,
"masked_top1": 58.35442459106445,
"masked_top5": 83.6309603881836,
"step": 16450,
"top1": 90.80092453002929,
"top5": 97.82826370239258
},
{
"epoch": 4.96,
"grad_norm": 1.100617719882716,
"learning_rate": 0.0001,
"loss": 0.5942,
"step": 16500
},
{
"ce_loss": 0.5968893599510193,
"epoch": 4.96,
"inp_emb_norm": 0.376484375,
"loss": 0.5968893599510193,
"masked_top1": 58.843356704711915,
"masked_top5": 83.53122055053711,
"step": 16500,
"top1": 90.88707092285156,
"top5": 97.87701583862305
},
{
"epoch": 4.98,
"grad_norm": 1.0119812517995022,
"learning_rate": 0.0001,
"loss": 0.6025,
"step": 16550
},
{
"ce_loss": 0.5882117158174515,
"epoch": 4.98,
"inp_emb_norm": 0.374296875,
"loss": 0.5882117158174515,
"masked_top1": 60.96791168212891,
"masked_top5": 84.55708404541015,
"step": 16550,
"top1": 91.02984649658202,
"top5": 97.95657623291015
},
{
"epoch": 4.99,
"grad_norm": 1.0572442138344962,
"learning_rate": 0.0001,
"loss": 0.5944,
"step": 16600
},
{
"ce_loss": 0.5818337166309356,
"epoch": 4.99,
"inp_emb_norm": 0.3735546875,
"loss": 0.5818337166309356,
"masked_top1": 60.71773277282715,
"masked_top5": 85.01761001586914,
"step": 16600,
"top1": 91.06825744628907,
"top5": 98.01745468139649
},
{
"epoch": 5.01,
"grad_norm": 0.8868244741347852,
"learning_rate": 0.0001,
"loss": 0.4283,
"step": 16650
},
{
"ce_loss": 0.4169870808720589,
"epoch": 5.01,
"inp_emb_norm": 0.3746484375,
"loss": 0.4169870808720589,
"masked_top1": 74.96077354431152,
"masked_top5": 91.62375808715821,
"step": 16650,
"top1": 93.5944010925293,
"top5": 98.75289016723633
},
{
"epoch": 5.02,
"grad_norm": 0.8006136974249275,
"learning_rate": 0.0001,
"loss": 0.24,
"step": 16700
},
{
"ce_loss": 0.23759305894374846,
"epoch": 5.02,
"inp_emb_norm": 0.377421875,
"loss": 0.23759305894374846,
"masked_top1": 89.66921447753906,
"masked_top5": 98.84856094360352,
"step": 16700,
"top1": 96.22275146484375,
"top5": 99.58460357666016
},
{
"epoch": 5.04,
"grad_norm": 0.7814579996198231,
"learning_rate": 0.0001,
"loss": 0.2428,
"step": 16750
},
{
"ce_loss": 0.24006595432758332,
"epoch": 5.04,
"inp_emb_norm": 0.3719921875,
"loss": 0.24006595432758332,
"masked_top1": 89.90816116333008,
"masked_top5": 98.82037811279297,
"step": 16750,
"top1": 96.24885559082031,
"top5": 99.55003356933594
},
{
"epoch": 5.05,
"grad_norm": 0.7858773051800831,
"learning_rate": 0.0001,
"loss": 0.2395,
"step": 16800
},
{
"ce_loss": 0.23285773277282715,
"epoch": 5.05,
"inp_emb_norm": 0.3825390625,
"loss": 0.23285773277282715,
"masked_top1": 89.65037063598633,
"masked_top5": 98.94374557495117,
"step": 16800,
"top1": 96.30195434570312,
"top5": 99.57885467529297
},
{
"epoch": 5.07,
"grad_norm": 0.8834168839025487,
"learning_rate": 0.0001,
"loss": 0.2461,
"step": 16850
},
{
"ce_loss": 0.24451201170682907,
"epoch": 5.07,
"inp_emb_norm": 0.38234375,
"loss": 0.24451201170682907,
"masked_top1": 88.87031646728515,
"masked_top5": 98.54978637695312,
"step": 16850,
"top1": 96.15791427612305,
"top5": 99.54232421875
},
{
"epoch": 5.08,
"grad_norm": 0.9643007917325629,
"learning_rate": 0.0001,
"loss": 0.2558,
"step": 16900
},
{
"ce_loss": 0.25902660697698593,
"epoch": 5.08,
"inp_emb_norm": 0.365234375,
"loss": 0.25902660697698593,
"masked_top1": 88.15865753173829,
"masked_top5": 98.6231999206543,
"step": 16900,
"top1": 95.8116291809082,
"top5": 99.52821868896484
},
{
"epoch": 5.1,
"grad_norm": 0.9187240394700701,
"learning_rate": 0.0001,
"loss": 0.2653,
"step": 16950
},
{
"ce_loss": 0.25573085725307465,
"epoch": 5.1,
"inp_emb_norm": 0.3799609375,
"loss": 0.25573085725307465,
"masked_top1": 88.29147186279297,
"masked_top5": 98.50399887084961,
"step": 16950,
"top1": 95.97455947875977,
"top5": 99.51153411865235
},
{
"epoch": 5.11,
"grad_norm": 0.8509566798013529,
"learning_rate": 0.0001,
"loss": 0.2532,
"step": 17000
},
{
"ce_loss": 0.2532435983419418,
"epoch": 5.11,
"inp_emb_norm": 0.3784765625,
"loss": 0.2532435983419418,
"masked_top1": 88.47122055053711,
"masked_top5": 98.4746549987793,
"step": 17000,
"top1": 95.92875396728516,
"top5": 99.54647705078125
},
{
"epoch": 5.13,
"grad_norm": 0.966807468835321,
"learning_rate": 0.0001,
"loss": 0.2595,
"step": 17050
},
{
"ce_loss": 0.2636346372961998,
"epoch": 5.13,
"inp_emb_norm": 0.37484375,
"loss": 0.2636346372961998,
"masked_top1": 87.61207931518555,
"masked_top5": 98.49675857543946,
"step": 17050,
"top1": 95.76734481811523,
"top5": 99.5260922241211
},
{
"epoch": 5.14,
"grad_norm": 0.8630676671965275,
"learning_rate": 0.0001,
"loss": 0.2605,
"step": 17100
},
{
"ce_loss": 0.2629204204678535,
"epoch": 5.14,
"inp_emb_norm": 0.381796875,
"loss": 0.2629204204678535,
"masked_top1": 87.41785934448242,
"masked_top5": 98.47596908569336,
"step": 17100,
"top1": 95.73066696166993,
"top5": 99.52325912475585
},
{
"epoch": 5.16,
"grad_norm": 1.0053540322976233,
"learning_rate": 0.0001,
"loss": 0.2648,
"step": 17150
},
{
"ce_loss": 0.27128377854824065,
"epoch": 5.16,
"inp_emb_norm": 0.3784765625,
"loss": 0.27128377854824065,
"masked_top1": 87.0358544921875,
"masked_top5": 98.38056884765625,
"step": 17150,
"top1": 95.64348907470703,
"top5": 99.50871109008789
},
{
"epoch": 5.17,
"grad_norm": 0.9508275120295281,
"learning_rate": 0.0001,
"loss": 0.2745,
"step": 17200
},
{
"ce_loss": 0.27368868499994276,
"epoch": 5.17,
"inp_emb_norm": 0.3740234375,
"loss": 0.27368868499994276,
"masked_top1": 87.3692935180664,
"masked_top5": 98.23250915527343,
"step": 17200,
"top1": 95.61584823608399,
"top5": 99.49671478271485
},
{
"epoch": 5.19,
"grad_norm": 0.9646300915973821,
"learning_rate": 0.0001,
"loss": 0.2814,
"step": 17250
},
{
"ce_loss": 0.27761764973402026,
"epoch": 5.19,
"inp_emb_norm": 0.3832421875,
"loss": 0.27761764973402026,
"masked_top1": 86.05260269165039,
"masked_top5": 98.24610961914063,
"step": 17250,
"top1": 95.51309692382813,
"top5": 99.5203547668457
},
{
"epoch": 5.2,
"grad_norm": 0.9852223949042181,
"learning_rate": 0.0001,
"loss": 0.2756,
"step": 17300
},
{
"ce_loss": 0.2694446948170662,
"epoch": 5.2,
"inp_emb_norm": 0.3767578125,
"loss": 0.2694446948170662,
"masked_top1": 87.5820735168457,
"masked_top5": 98.16199798583985,
"step": 17300,
"top1": 95.65110595703125,
"top5": 99.50478088378907
},
{
"epoch": 5.22,
"grad_norm": 1.0161825412276837,
"learning_rate": 0.0001,
"loss": 0.2836,
"step": 17350
},
{
"ce_loss": 0.2845872187614441,
"epoch": 5.22,
"inp_emb_norm": 0.38078125,
"loss": 0.2845872187614441,
"masked_top1": 85.83310623168946,
"masked_top5": 97.87667556762695,
"step": 17350,
"top1": 95.49496994018554,
"top5": 99.44075881958008
},
{
"epoch": 5.23,
"grad_norm": 0.9591201013156013,
"learning_rate": 0.0001,
"loss": 0.2831,
"step": 17400
},
{
"ce_loss": 0.28442945539951325,
"epoch": 5.23,
"inp_emb_norm": 0.3789453125,
"loss": 0.28442945539951325,
"masked_top1": 86.10271911621093,
"masked_top5": 97.88985565185547,
"step": 17400,
"top1": 95.40376953125,
"top5": 99.44743041992187
},
{
"epoch": 5.25,
"grad_norm": 1.0432247629117117,
"learning_rate": 0.0001,
"loss": 0.2854,
"step": 17450
},
{
"ce_loss": 0.27993369311094285,
"epoch": 5.25,
"inp_emb_norm": 0.3808203125,
"loss": 0.27993369311094285,
"masked_top1": 86.45830123901368,
"masked_top5": 98.15035873413086,
"step": 17450,
"top1": 95.57389678955079,
"top5": 99.48250961303711
},
{
"epoch": 5.26,
"grad_norm": 0.9701861929167235,
"learning_rate": 0.0001,
"loss": 0.2907,
"step": 17500
},
{
"ce_loss": 0.28741121381521223,
"epoch": 5.26,
"inp_emb_norm": 0.3751171875,
"loss": 0.28741121381521223,
"masked_top1": 85.9087370300293,
"masked_top5": 98.2075733947754,
"step": 17500,
"top1": 95.36790939331054,
"top5": 99.45517150878906
},
{
"epoch": 5.28,
"grad_norm": 0.9018657474300132,
"learning_rate": 0.0001,
"loss": 0.2939,
"step": 17550
},
{
"ce_loss": 0.2932562205195427,
"epoch": 5.28,
"inp_emb_norm": 0.376015625,
"loss": 0.2932562205195427,
"masked_top1": 85.43090469360351,
"masked_top5": 97.69857574462891,
"step": 17550,
"top1": 95.35228378295898,
"top5": 99.44769515991212
},
{
"epoch": 5.29,
"grad_norm": 1.0525621107942214,
"learning_rate": 0.0001,
"loss": 0.2878,
"step": 17600
},
{
"ce_loss": 0.2939734762907028,
"epoch": 5.29,
"inp_emb_norm": 0.3778515625,
"loss": 0.2939734762907028,
"masked_top1": 85.80782012939453,
"masked_top5": 98.14864028930664,
"step": 17600,
"top1": 95.27761352539062,
"top5": 99.49363082885742
},
{
"epoch": 5.31,
"grad_norm": 1.0032239373031064,
"learning_rate": 0.0001,
"loss": 0.3036,
"step": 17650
},
{
"ce_loss": 0.3070009741187096,
"epoch": 5.31,
"inp_emb_norm": 0.37828125,
"loss": 0.3070009741187096,
"masked_top1": 84.42236633300782,
"masked_top5": 97.74387069702148,
"step": 17650,
"top1": 95.10071838378906,
"top5": 99.41670013427735
},
{
"epoch": 5.32,
"grad_norm": 1.0274167035814015,
"learning_rate": 0.0001,
"loss": 0.3011,
"step": 17700
},
{
"ce_loss": 0.307579453587532,
"epoch": 5.32,
"inp_emb_norm": 0.3801953125,
"loss": 0.307579453587532,
"masked_top1": 84.19732971191407,
"masked_top5": 97.56189498901367,
"step": 17700,
"top1": 95.09036392211914,
"top5": 99.39419525146485
},
{
"epoch": 5.34,
"grad_norm": 1.019591016333828,
"learning_rate": 0.0001,
"loss": 0.3006,
"step": 17750
},
{
"ce_loss": 0.29659480959177015,
"epoch": 5.34,
"inp_emb_norm": 0.379140625,
"loss": 0.29659480959177015,
"masked_top1": 84.7645182800293,
"masked_top5": 98.02963897705078,
"step": 17750,
"top1": 95.21526107788085,
"top5": 99.45744186401367
},
{
"epoch": 5.35,
"grad_norm": 1.045966882681033,
"learning_rate": 0.0001,
"loss": 0.3046,
"step": 17800
},
{
"ce_loss": 0.3082049387693405,
"epoch": 5.35,
"inp_emb_norm": 0.3823828125,
"loss": 0.3082049387693405,
"masked_top1": 84.11483215332031,
"masked_top5": 97.67893463134766,
"step": 17800,
"top1": 95.09591766357421,
"top5": 99.39608154296874
},
{
"epoch": 5.37,
"grad_norm": 0.9692493761597851,
"learning_rate": 0.0001,
"loss": 0.3087,
"step": 17850
},
{
"ce_loss": 0.3111723321676254,
"epoch": 5.37,
"inp_emb_norm": 0.3821875,
"loss": 0.3111723321676254,
"masked_top1": 83.85779495239258,
"masked_top5": 97.46583740234375,
"step": 17850,
"top1": 94.99774032592774,
"top5": 99.40854736328124
},
{
"epoch": 5.38,
"grad_norm": 0.9440787583366769,
"learning_rate": 0.0001,
"loss": 0.306,
"step": 17900
},
{
"ce_loss": 0.31266897201538085,
"epoch": 5.38,
"inp_emb_norm": 0.380859375,
"loss": 0.31266897201538085,
"masked_top1": 83.7997444152832,
"masked_top5": 97.51995346069336,
"step": 17900,
"top1": 94.99930786132812,
"top5": 99.39216735839844
},
{
"epoch": 5.4,
"grad_norm": 0.9729412684124459,
"learning_rate": 0.0001,
"loss": 0.3072,
"step": 17950
},
{
"ce_loss": 0.30246726602315904,
"epoch": 5.4,
"inp_emb_norm": 0.385234375,
"loss": 0.30246726602315904,
"masked_top1": 84.3121369934082,
"masked_top5": 97.73384506225585,
"step": 17950,
"top1": 95.10583526611327,
"top5": 99.43766952514649
},
{
"epoch": 5.41,
"grad_norm": 0.9636765858785842,
"learning_rate": 0.0001,
"loss": 0.307,
"step": 18000
},
{
"ce_loss": 0.30428083807229994,
"epoch": 5.41,
"inp_emb_norm": 0.381484375,
"loss": 0.30428083807229994,
"masked_top1": 84.8429086303711,
"masked_top5": 97.52333801269532,
"step": 18000,
"top1": 95.1666488647461,
"top5": 99.39987106323242
},
{
"epoch": 5.43,
"grad_norm": 1.0186629227679855,
"learning_rate": 0.0001,
"loss": 0.3133,
"step": 18050
},
{
"ce_loss": 0.3080432793498039,
"epoch": 5.43,
"inp_emb_norm": 0.381015625,
"loss": 0.3080432793498039,
"masked_top1": 83.97354797363282,
"masked_top5": 97.46352157592773,
"step": 18050,
"top1": 95.05124603271484,
"top5": 99.38775970458984
},
{
"epoch": 5.44,
"grad_norm": 0.9556152250711039,
"learning_rate": 0.0001,
"loss": 0.3298,
"step": 18100
},
{
"ce_loss": 0.35822920709848405,
"epoch": 5.44,
"inp_emb_norm": 0.38015625,
"loss": 0.35822920709848405,
"masked_top1": 83.25973114013672,
"masked_top5": 97.05907196044922,
"step": 18100,
"top1": 94.64421905517578,
"top5": 99.1507731628418
},
{
"epoch": 5.46,
"grad_norm": 1.0598324926140277,
"learning_rate": 0.0001,
"loss": 0.3205,
"step": 18150
},
{
"ce_loss": 0.32162826359272,
"epoch": 5.46,
"inp_emb_norm": 0.3842578125,
"loss": 0.32162826359272,
"masked_top1": 83.18663421630859,
"masked_top5": 97.19483627319336,
"step": 18150,
"top1": 94.94844818115234,
"top5": 99.35345962524414
},
{
"epoch": 5.47,
"grad_norm": 0.9530378094960094,
"learning_rate": 0.0001,
"loss": 0.32,
"step": 18200
},
{
"ce_loss": 0.3288844656944275,
"epoch": 5.47,
"inp_emb_norm": 0.3790625,
"loss": 0.3288844656944275,
"masked_top1": 82.93034057617187,
"masked_top5": 97.11549606323243,
"step": 18200,
"top1": 94.78947708129883,
"top5": 99.34183990478516
},
{
"epoch": 5.49,
"grad_norm": 0.998874046778327,
"learning_rate": 0.0001,
"loss": 0.3207,
"step": 18250
},
{
"ce_loss": 0.32029964685440065,
"epoch": 5.49,
"inp_emb_norm": 0.38546875,
"loss": 0.32029964685440065,
"masked_top1": 82.87285369873047,
"masked_top5": 97.35884552001953,
"step": 18250,
"top1": 94.86287094116211,
"top5": 99.37160079956055
},
{
"epoch": 5.5,
"grad_norm": 1.0507249584695442,
"learning_rate": 0.0001,
"loss": 0.3264,
"step": 18300
},
{
"ce_loss": 0.3225671499967575,
"epoch": 5.5,
"inp_emb_norm": 0.381953125,
"loss": 0.3225671499967575,
"masked_top1": 82.94257446289062,
"masked_top5": 97.59221542358398,
"step": 18300,
"top1": 94.790439453125,
"top5": 99.39126449584961
},
{
"epoch": 5.52,
"grad_norm": 1.0538788268467503,
"learning_rate": 0.0001,
"loss": 0.3272,
"step": 18350
},
{
"ce_loss": 0.3279902094602585,
"epoch": 5.52,
"inp_emb_norm": 0.378984375,
"loss": 0.3279902094602585,
"masked_top1": 82.10417236328125,
"masked_top5": 97.07626525878906,
"step": 18350,
"top1": 94.77208694458008,
"top5": 99.34082946777343
},
{
"epoch": 5.53,
"grad_norm": 1.0082114223647751,
"learning_rate": 0.0001,
"loss": 0.3246,
"step": 18400
},
{
"ce_loss": 0.3286957702040672,
"epoch": 5.53,
"inp_emb_norm": 0.3857421875,
"loss": 0.3286957702040672,
"masked_top1": 82.18844451904297,
"masked_top5": 97.20039276123048,
"step": 18400,
"top1": 94.72737030029298,
"top5": 99.36099243164062
},
{
"epoch": 5.55,
"grad_norm": 1.1198021466793433,
"learning_rate": 0.0001,
"loss": 0.3312,
"step": 18450
},
{
"ce_loss": 0.33603747010231017,
"epoch": 5.55,
"inp_emb_norm": 0.3957421875,
"loss": 0.33603747010231017,
"masked_top1": 81.67584823608398,
"masked_top5": 96.54129943847656,
"step": 18450,
"top1": 94.65423965454102,
"top5": 99.32092361450195
},
{
"epoch": 5.56,
"grad_norm": 1.0668250210650634,
"learning_rate": 0.0001,
"loss": 0.3274,
"step": 18500
},
{
"ce_loss": 0.3218830382823944,
"epoch": 5.56,
"inp_emb_norm": 0.3800390625,
"loss": 0.3218830382823944,
"masked_top1": 82.66215911865234,
"masked_top5": 97.63473297119141,
"step": 18500,
"top1": 94.83183807373047,
"top5": 99.41087814331054
},
{
"epoch": 5.58,
"grad_norm": 1.0489105931316678,
"learning_rate": 0.0001,
"loss": 0.3323,
"step": 18550
},
{
"ce_loss": 0.33785298705101013,
"epoch": 5.58,
"inp_emb_norm": 0.3783984375,
"loss": 0.33785298705101013,
"masked_top1": 82.34555877685547,
"masked_top5": 97.0906037902832,
"step": 18550,
"top1": 94.63372589111329,
"top5": 99.32784606933593
},
{
"epoch": 5.59,
"grad_norm": 1.0468329188388257,
"learning_rate": 0.0001,
"loss": 0.3334,
"step": 18600
},
{
"ce_loss": 0.3364351660013199,
"epoch": 5.59,
"inp_emb_norm": 0.388515625,
"loss": 0.3364351660013199,
"masked_top1": 81.59525100708008,
"masked_top5": 96.72632537841797,
"step": 18600,
"top1": 94.66922760009766,
"top5": 99.31353454589843
},
{
"epoch": 5.61,
"grad_norm": 1.0268275103914735,
"learning_rate": 0.0001,
"loss": 0.3306,
"step": 18650
},
{
"ce_loss": 0.327090705037117,
"epoch": 5.61,
"inp_emb_norm": 0.384921875,
"loss": 0.327090705037117,
"masked_top1": 82.4957682800293,
"masked_top5": 96.94613693237305,
"step": 18650,
"top1": 94.83338302612304,
"top5": 99.36148330688476
},
{
"epoch": 5.62,
"grad_norm": 1.048035539110107,
"learning_rate": 0.0001,
"loss": 0.3317,
"step": 18700
},
{
"ce_loss": 0.3305124366283417,
"epoch": 5.62,
"inp_emb_norm": 0.3794921875,
"loss": 0.3305124366283417,
"masked_top1": 82.27645935058594,
"masked_top5": 97.16238998413085,
"step": 18700,
"top1": 94.69090454101563,
"top5": 99.3725503540039
},
{
"epoch": 5.64,
"grad_norm": 1.0935078838634331,
"learning_rate": 0.0001,
"loss": 0.3428,
"step": 18750
},
{
"ce_loss": 0.34309773981571196,
"epoch": 5.64,
"inp_emb_norm": 0.379453125,
"loss": 0.34309773981571196,
"masked_top1": 81.5336685180664,
"masked_top5": 96.76238327026367,
"step": 18750,
"top1": 94.5162728881836,
"top5": 99.30676239013673
},
{
"epoch": 5.65,
"grad_norm": 1.0784022808624179,
"learning_rate": 0.0001,
"loss": 0.3433,
"step": 18800
},
{
"ce_loss": 0.35102885723114013,
"epoch": 5.65,
"inp_emb_norm": 0.3797265625,
"loss": 0.35102885723114013,
"masked_top1": 80.56207229614257,
"masked_top5": 96.39179016113282,
"step": 18800,
"top1": 94.42943588256836,
"top5": 99.2728482055664
},
{
"epoch": 5.67,
"grad_norm": 1.0393038843820417,
"learning_rate": 0.0001,
"loss": 0.3427,
"step": 18850
},
{
"ce_loss": 0.3519377601146698,
"epoch": 5.67,
"inp_emb_norm": 0.386484375,
"loss": 0.3519377601146698,
"masked_top1": 80.00395278930664,
"masked_top5": 96.58664108276368,
"step": 18850,
"top1": 94.40878112792969,
"top5": 99.31891250610352
},
{
"epoch": 5.68,
"grad_norm": 1.1009482557837906,
"learning_rate": 0.0001,
"loss": 0.3438,
"step": 18900
},
{
"ce_loss": 0.33931906819343566,
"epoch": 5.68,
"inp_emb_norm": 0.383515625,
"loss": 0.33931906819343566,
"masked_top1": 81.50663543701172,
"masked_top5": 96.86479187011719,
"step": 18900,
"top1": 94.59292114257812,
"top5": 99.32182586669921
},
{
"epoch": 5.7,
"grad_norm": 0.9751674873578042,
"learning_rate": 0.0001,
"loss": 0.3507,
"step": 18950
},
{
"ce_loss": 0.343431094288826,
"epoch": 5.7,
"inp_emb_norm": 0.3859375,
"loss": 0.343431094288826,
"masked_top1": 80.64304809570312,
"masked_top5": 96.53154220581055,
"step": 18950,
"top1": 94.48710632324219,
"top5": 99.31856002807618
},
{
"epoch": 5.71,
"grad_norm": 0.9792712686829252,
"learning_rate": 0.0001,
"loss": 0.3409,
"step": 19000
},
{
"ce_loss": 0.34443502128124237,
"epoch": 5.71,
"inp_emb_norm": 0.38875,
"loss": 0.34443502128124237,
"masked_top1": 80.8620687866211,
"masked_top5": 96.8039859008789,
"step": 19000,
"top1": 94.45713119506836,
"top5": 99.3434228515625
},
{
"epoch": 5.73,
"grad_norm": 1.0250483722882462,
"learning_rate": 0.0001,
"loss": 0.3462,
"step": 19050
},
{
"ce_loss": 0.3492258018255234,
"epoch": 5.73,
"inp_emb_norm": 0.3844921875,
"loss": 0.3492258018255234,
"masked_top1": 80.19672760009766,
"masked_top5": 96.82942138671875,
"step": 19050,
"top1": 94.36864486694336,
"top5": 99.30538925170899
},
{
"epoch": 5.74,
"grad_norm": 1.033814306715085,
"learning_rate": 0.0001,
"loss": 0.3465,
"step": 19100
},
{
"ce_loss": 0.3455532872676849,
"epoch": 5.74,
"inp_emb_norm": 0.3897265625,
"loss": 0.3455532872676849,
"masked_top1": 81.14021148681641,
"masked_top5": 96.46601867675781,
"step": 19100,
"top1": 94.59025283813476,
"top5": 99.29209457397461
},
{
"epoch": 5.76,
"grad_norm": 1.1266003627715306,
"learning_rate": 0.0001,
"loss": 0.3496,
"step": 19150
},
{
"ce_loss": 0.3510564410686493,
"epoch": 5.76,
"inp_emb_norm": 0.3796484375,
"loss": 0.3510564410686493,
"masked_top1": 80.42627136230469,
"masked_top5": 96.30276489257812,
"step": 19150,
"top1": 94.38863174438477,
"top5": 99.28523544311524
},
{
"epoch": 5.77,
"grad_norm": 1.0786804231908673,
"learning_rate": 0.0001,
"loss": 0.3519,
"step": 19200
},
{
"ce_loss": 0.35517399430274965,
"epoch": 5.77,
"inp_emb_norm": 0.379765625,
"loss": 0.35517399430274965,
"masked_top1": 80.2329933166504,
"masked_top5": 96.45190505981445,
"step": 19200,
"top1": 94.32036392211914,
"top5": 99.28158004760742
},
{
"epoch": 5.79,
"grad_norm": 0.9810743864140166,
"learning_rate": 0.0001,
"loss": 0.3532,
"step": 19250
},
{
"ce_loss": 0.35108084678649903,
"epoch": 5.79,
"inp_emb_norm": 0.38859375,
"loss": 0.35108084678649903,
"masked_top1": 80.51531219482422,
"masked_top5": 96.45788009643555,
"step": 19250,
"top1": 94.3976658630371,
"top5": 99.2694255065918
},
{
"epoch": 5.8,
"grad_norm": 0.9998840954054705,
"learning_rate": 0.0001,
"loss": 0.358,
"step": 19300
},
{
"ce_loss": 0.36306409776210785,
"epoch": 5.8,
"inp_emb_norm": 0.3805859375,
"loss": 0.36306409776210785,
"masked_top1": 79.08367248535156,
"masked_top5": 96.06164031982422,
"step": 19300,
"top1": 94.15442794799804,
"top5": 99.21199111938476
},
{
"epoch": 5.82,
"grad_norm": 0.9561538506979583,
"learning_rate": 0.0001,
"loss": 0.3517,
"step": 19350
},
{
"ce_loss": 0.3433873727917671,
"epoch": 5.82,
"inp_emb_norm": 0.3860546875,
"loss": 0.3433873727917671,
"masked_top1": 81.34098831176757,
"masked_top5": 96.61977462768554,
"step": 19350,
"top1": 94.59346282958984,
"top5": 99.30811492919922
},
{
"epoch": 5.83,
"grad_norm": 0.9641219081481858,
"learning_rate": 0.0001,
"loss": 0.3571,
"step": 19400
},
{
"ce_loss": 0.3487838166952133,
"epoch": 5.83,
"inp_emb_norm": 0.3849609375,
"loss": 0.3487838166952133,
"masked_top1": 81.22178527832031,
"masked_top5": 96.62050567626953,
"step": 19400,
"top1": 94.4317202758789,
"top5": 99.28485198974609
},
{
"epoch": 5.85,
"grad_norm": 1.1056878672573682,
"learning_rate": 0.0001,
"loss": 0.3589,
"step": 19450
},
{
"ce_loss": 0.3580132460594177,
"epoch": 5.85,
"inp_emb_norm": 0.395234375,
"loss": 0.3580132460594177,
"masked_top1": 79.92015823364258,
"masked_top5": 96.51984786987305,
"step": 19450,
"top1": 94.39044494628907,
"top5": 99.25140426635743
},
{
"epoch": 5.86,
"grad_norm": 1.0460192475813763,
"learning_rate": 0.0001,
"loss": 0.357,
"step": 19500
},
{
"ce_loss": 0.3561101830005646,
"epoch": 5.86,
"inp_emb_norm": 0.3905859375,
"loss": 0.3561101830005646,
"masked_top1": 80.33946014404297,
"masked_top5": 96.31464675903321,
"step": 19500,
"top1": 94.33697235107422,
"top5": 99.24774505615234
},
{
"epoch": 5.88,
"grad_norm": 0.9997822590004654,
"learning_rate": 0.0001,
"loss": 0.3667,
"step": 19550
},
{
"ce_loss": 0.3721057403087616,
"epoch": 5.88,
"inp_emb_norm": 0.3875390625,
"loss": 0.3721057403087616,
"masked_top1": 78.66541412353516,
"masked_top5": 95.76679565429687,
"step": 19550,
"top1": 94.079482421875,
"top5": 99.20158752441407
},
{
"epoch": 5.89,
"grad_norm": 1.0227356742773894,
"learning_rate": 0.0001,
"loss": 0.362,
"step": 19600
},
{
"ce_loss": 0.36288787305355075,
"epoch": 5.89,
"inp_emb_norm": 0.393046875,
"loss": 0.36288787305355075,
"masked_top1": 78.83365341186523,
"masked_top5": 95.98956665039063,
"step": 19600,
"top1": 94.17432601928711,
"top5": 99.2359994506836
},
{
"epoch": 5.91,
"grad_norm": 1.0348435013200028,
"learning_rate": 0.0001,
"loss": 0.3637,
"step": 19650
},
{
"ce_loss": 0.3597774177789688,
"epoch": 5.91,
"inp_emb_norm": 0.38703125,
"loss": 0.3597774177789688,
"masked_top1": 79.5971403503418,
"masked_top5": 96.29687118530273,
"step": 19650,
"top1": 94.29608825683594,
"top5": 99.25850982666016
},
{
"epoch": 5.92,
"grad_norm": 1.0680810347893466,
"learning_rate": 0.0001,
"loss": 0.366,
"step": 19700
},
{
"ce_loss": 0.3640019080042839,
"epoch": 5.92,
"inp_emb_norm": 0.3908203125,
"loss": 0.3640019080042839,
"masked_top1": 80.11074829101562,
"masked_top5": 96.37212921142579,
"step": 19700,
"top1": 94.18343032836914,
"top5": 99.28712341308594
},
{
"epoch": 5.94,
"grad_norm": 0.9094859392388285,
"learning_rate": 0.0001,
"loss": 0.3678,
"step": 19750
},
{
"ce_loss": 0.37644350349903105,
"epoch": 5.94,
"inp_emb_norm": 0.3891015625,
"loss": 0.37644350349903105,
"masked_top1": 78.18459594726562,
"masked_top5": 95.86135772705079,
"step": 19750,
"top1": 93.957353515625,
"top5": 99.2127586364746
},
{
"epoch": 5.95,
"grad_norm": 1.0310255933372185,
"learning_rate": 0.0001,
"loss": 0.3668,
"step": 19800
},
{
"ce_loss": 0.3627872896194458,
"epoch": 5.95,
"inp_emb_norm": 0.3862109375,
"loss": 0.3627872896194458,
"masked_top1": 79.60676147460937,
"masked_top5": 96.10193313598633,
"step": 19800,
"top1": 94.17147338867187,
"top5": 99.25555862426758
},
{
"epoch": 5.97,
"grad_norm": 1.0599907424871888,
"learning_rate": 0.0001,
"loss": 0.3695,
"step": 19850
},
{
"ce_loss": 0.37383450448513034,
"epoch": 5.97,
"inp_emb_norm": 0.4004296875,
"loss": 0.37383450448513034,
"masked_top1": 77.94340957641602,
"masked_top5": 96.1452799987793,
"step": 19850,
"top1": 94.01543914794922,
"top5": 99.25049942016602
},
{
"epoch": 5.98,
"grad_norm": 1.123287217336448,
"learning_rate": 0.0001,
"loss": 0.3667,
"step": 19900
},
{
"ce_loss": 0.3690763407945633,
"epoch": 5.98,
"inp_emb_norm": 0.3968359375,
"loss": 0.3690763407945633,
"masked_top1": 78.41387680053711,
"masked_top5": 95.78071762084961,
"step": 19900,
"top1": 94.15260635375977,
"top5": 99.18431182861327
},
{
"epoch": 6.0,
"grad_norm": 1.05174121076266,
"learning_rate": 0.0001,
"loss": 0.37,
"step": 19950
},
{
"ce_loss": 0.36719063580036165,
"epoch": 6.0,
"inp_emb_norm": 0.3880859375,
"loss": 0.36719063580036165,
"masked_top1": 79.3749658203125,
"masked_top5": 96.28604400634765,
"step": 19950,
"top1": 94.1464176940918,
"top5": 99.22554748535157
},
{
"epoch": 6.02,
"grad_norm": 0.7372869779982937,
"learning_rate": 0.0001,
"loss": 0.1841,
"step": 20000
},
{
"ce_loss": 0.1820479117333889,
"epoch": 6.02,
"inp_emb_norm": 0.3961328125,
"loss": 0.1820479117333889,
"masked_top1": 93.90685821533204,
"masked_top5": 99.41201736450195,
"step": 20000,
"top1": 97.13008163452149,
"top5": 99.6633514404297
},
{
"epoch": 6.03,
"grad_norm": 0.7458143529658549,
"learning_rate": 0.0001,
"loss": 0.1884,
"step": 20050
},
{
"ce_loss": 0.19271491587162018,
"epoch": 6.03,
"inp_emb_norm": 0.39109375,
"loss": 0.19271491587162018,
"masked_top1": 93.04420074462891,
"masked_top5": 99.42253952026367,
"step": 20050,
"top1": 96.94086242675782,
"top5": 99.66081787109376
},
{
"epoch": 6.05,
"grad_norm": 0.8859283290144573,
"learning_rate": 0.0001,
"loss": 0.1866,
"step": 20100
},
{
"ce_loss": 0.191538667678833,
"epoch": 6.05,
"inp_emb_norm": 0.3925390625,
"loss": 0.191538667678833,
"masked_top1": 93.09733154296875,
"masked_top5": 99.30335250854492,
"step": 20100,
"top1": 96.98335327148438,
"top5": 99.65503143310546
},
{
"epoch": 6.06,
"grad_norm": 0.8809125747332417,
"learning_rate": 0.0001,
"loss": 0.1852,
"step": 20150
},
{
"ce_loss": 0.1867568638920784,
"epoch": 6.06,
"inp_emb_norm": 0.391640625,
"loss": 0.1867568638920784,
"masked_top1": 93.50607498168945,
"masked_top5": 99.47832672119141,
"step": 20150,
"top1": 97.09137954711915,
"top5": 99.65869873046876
},
{
"epoch": 6.08,
"grad_norm": 0.826592224481137,
"learning_rate": 0.0001,
"loss": 0.1849,
"step": 20200
},
{
"ce_loss": 0.1898653081059456,
"epoch": 6.08,
"inp_emb_norm": 0.3903515625,
"loss": 0.1898653081059456,
"masked_top1": 93.60175338745117,
"masked_top5": 99.40977355957031,
"step": 20200,
"top1": 96.99095306396484,
"top5": 99.64656112670899
},
{
"epoch": 6.09,
"grad_norm": 0.8167934529115294,
"learning_rate": 0.0001,
"loss": 0.1935,
"step": 20250
},
{
"ce_loss": 0.1948414433002472,
"epoch": 6.09,
"inp_emb_norm": 0.3841015625,
"loss": 0.1948414433002472,
"masked_top1": 92.95522598266602,
"masked_top5": 99.3899104309082,
"step": 20250,
"top1": 96.98196350097656,
"top5": 99.63696716308594
},
{
"epoch": 6.11,
"grad_norm": 0.7345815776721539,
"learning_rate": 0.0001,
"loss": 0.1926,
"step": 20300
},
{
"ce_loss": 0.19291402637958527,
"epoch": 6.11,
"inp_emb_norm": 0.3873828125,
"loss": 0.19291402637958527,
"masked_top1": 93.4042626953125,
"masked_top5": 99.38736923217773,
"step": 20300,
"top1": 96.96928878784179,
"top5": 99.63698593139648
},
{
"epoch": 6.12,
"grad_norm": 0.8149440856604434,
"learning_rate": 0.0001,
"loss": 0.1952,
"step": 20350
},
{
"ce_loss": 0.1960041469335556,
"epoch": 6.12,
"inp_emb_norm": 0.39453125,
"loss": 0.1960041469335556,
"masked_top1": 92.89414184570313,
"masked_top5": 99.46369537353516,
"step": 20350,
"top1": 96.92287322998047,
"top5": 99.66681442260742
},
{
"epoch": 6.14,
"grad_norm": 0.8776230940079653,
"learning_rate": 0.0001,
"loss": 0.1979,
"step": 20400
},
{
"ce_loss": 0.20121153205633163,
"epoch": 6.14,
"inp_emb_norm": 0.3866015625,
"loss": 0.20121153205633163,
"masked_top1": 92.70189239501953,
"masked_top5": 99.36892135620117,
"step": 20400,
"top1": 96.79621154785156,
"top5": 99.64973388671875
},
{
"epoch": 6.15,
"grad_norm": 0.8585106345021164,
"learning_rate": 0.0001,
"loss": 0.2003,
"step": 20450
},
{
"ce_loss": 0.19490561604499818,
"epoch": 6.15,
"inp_emb_norm": 0.39078125,
"loss": 0.19490561604499818,
"masked_top1": 92.95185577392579,
"masked_top5": 99.3991976928711,
"step": 20450,
"top1": 96.93541824340821,
"top5": 99.64690612792968
},
{
"epoch": 6.17,
"grad_norm": 0.7976355967797862,
"learning_rate": 0.0001,
"loss": 0.2052,
"step": 20500
},
{
"ce_loss": 0.20391724795103072,
"epoch": 6.17,
"inp_emb_norm": 0.39390625,
"loss": 0.20391724795103072,
"masked_top1": 92.00440231323242,
"masked_top5": 99.22619979858399,
"step": 20500,
"top1": 96.7327310180664,
"top5": 99.62498168945312
},
{
"epoch": 6.18,
"grad_norm": 0.8148083321822271,
"learning_rate": 0.0001,
"loss": 0.2054,
"step": 20550
},
{
"ce_loss": 0.20799223512411116,
"epoch": 6.18,
"inp_emb_norm": 0.391953125,
"loss": 0.20799223512411116,
"masked_top1": 92.3514372253418,
"masked_top5": 99.27444519042969,
"step": 20550,
"top1": 96.76949005126953,
"top5": 99.62392379760742
},
{
"epoch": 6.2,
"grad_norm": 0.778356178235768,
"learning_rate": 0.0001,
"loss": 0.206,
"step": 20600
},
{
"ce_loss": 0.2053508883714676,
"epoch": 6.2,
"inp_emb_norm": 0.397890625,
"loss": 0.2053508883714676,
"masked_top1": 92.0650765991211,
"masked_top5": 99.24300872802735,
"step": 20600,
"top1": 96.75581268310548,
"top5": 99.60792434692382
},
{
"epoch": 6.21,
"grad_norm": 0.8329059510004336,
"learning_rate": 0.0001,
"loss": 0.211,
"step": 20650
},
{
"ce_loss": 0.20962412267923355,
"epoch": 6.21,
"inp_emb_norm": 0.3975390625,
"loss": 0.20962412267923355,
"masked_top1": 92.15507781982421,
"masked_top5": 99.21770889282226,
"step": 20650,
"top1": 96.68198043823242,
"top5": 99.61430511474609
},
{
"epoch": 6.23,
"grad_norm": 0.9696545325150863,
"learning_rate": 0.0001,
"loss": 0.214,
"step": 20700
},
{
"ce_loss": 0.21425070196390153,
"epoch": 6.23,
"inp_emb_norm": 0.3883984375,
"loss": 0.21425070196390153,
"masked_top1": 91.60136703491212,
"masked_top5": 99.14032455444335,
"step": 20700,
"top1": 96.57572845458985,
"top5": 99.60167327880859
},
{
"epoch": 6.24,
"grad_norm": 0.9465855604091451,
"learning_rate": 0.0001,
"loss": 0.2179,
"step": 20750
},
{
"ce_loss": 0.21650043070316316,
"epoch": 6.24,
"inp_emb_norm": 0.3938671875,
"loss": 0.21650043070316316,
"masked_top1": 91.89517654418945,
"masked_top5": 99.22545211791993,
"step": 20750,
"top1": 96.63436431884766,
"top5": 99.61578002929687
},
{
"epoch": 6.26,
"grad_norm": 0.8848039020955999,
"learning_rate": 0.0001,
"loss": 0.2167,
"step": 20800
},
{
"ce_loss": 0.21624796688556672,
"epoch": 6.26,
"inp_emb_norm": 0.3955078125,
"loss": 0.21624796688556672,
"masked_top1": 91.79812927246094,
"masked_top5": 99.13185974121093,
"step": 20800,
"top1": 96.59934356689453,
"top5": 99.60213928222656
},
{
"epoch": 6.27,
"grad_norm": 0.8555074354464904,
"learning_rate": 0.0001,
"loss": 0.22,
"step": 20850
},
{
"ce_loss": 0.2177719497680664,
"epoch": 6.27,
"inp_emb_norm": 0.386171875,
"loss": 0.2177719497680664,
"masked_top1": 91.69604415893555,
"masked_top5": 99.20869293212891,
"step": 20850,
"top1": 96.5518881225586,
"top5": 99.60115341186524
},
{
"epoch": 6.29,
"grad_norm": 0.8890783469847633,
"learning_rate": 0.0001,
"loss": 0.2219,
"step": 20900
},
{
"ce_loss": 0.22593430548906326,
"epoch": 6.29,
"inp_emb_norm": 0.3901953125,
"loss": 0.22593430548906326,
"masked_top1": 91.24859771728515,
"masked_top5": 98.95779678344726,
"step": 20900,
"top1": 96.41894073486328,
"top5": 99.57270751953125
},
{
"epoch": 6.3,
"grad_norm": 0.8389189758028528,
"learning_rate": 0.0001,
"loss": 0.2263,
"step": 20950
},
{
"ce_loss": 0.22432934373617172,
"epoch": 6.3,
"inp_emb_norm": 0.39359375,
"loss": 0.22432934373617172,
"masked_top1": 91.0940608215332,
"masked_top5": 99.17452285766602,
"step": 20950,
"top1": 96.43647857666015,
"top5": 99.61320083618165
},
{
"epoch": 6.32,
"grad_norm": 0.9535481479010524,
"learning_rate": 0.0001,
"loss": 0.2232,
"step": 21000
},
{
"ce_loss": 0.22769318729639054,
"epoch": 6.32,
"inp_emb_norm": 0.392578125,
"loss": 0.22769318729639054,
"masked_top1": 91.18105072021484,
"masked_top5": 99.13134719848632,
"step": 21000,
"top1": 96.39334274291993,
"top5": 99.59284255981446
},
{
"epoch": 6.33,
"grad_norm": 0.9100595834466957,
"learning_rate": 0.0001,
"loss": 0.2237,
"step": 21050
},
{
"ce_loss": 0.2257786351442337,
"epoch": 6.33,
"inp_emb_norm": 0.3939453125,
"loss": 0.2257786351442337,
"masked_top1": 90.99365447998046,
"masked_top5": 99.22629165649414,
"step": 21050,
"top1": 96.43473999023438,
"top5": 99.62341964721679
},
{
"epoch": 6.35,
"grad_norm": 0.9348221993172825,
"learning_rate": 0.0001,
"loss": 0.2301,
"step": 21100
},
{
"ce_loss": 0.22199460864067078,
"epoch": 6.35,
"inp_emb_norm": 0.3928515625,
"loss": 0.22199460864067078,
"masked_top1": 91.3432958984375,
"masked_top5": 99.21780807495117,
"step": 21100,
"top1": 96.48183792114258,
"top5": 99.59834899902344
},
{
"epoch": 6.36,
"grad_norm": 0.8157766502142015,
"learning_rate": 0.0001,
"loss": 0.2348,
"step": 21150
},
{
"ce_loss": 0.23358583688735962,
"epoch": 6.36,
"inp_emb_norm": 0.39484375,
"loss": 0.23358583688735962,
"masked_top1": 90.63573287963867,
"masked_top5": 99.2670066833496,
"step": 21150,
"top1": 96.2616633605957,
"top5": 99.61970520019531
},
{
"epoch": 6.38,
"grad_norm": 0.8204200350876739,
"learning_rate": 0.0001,
"loss": 0.2331,
"step": 21200
},
{
"ce_loss": 0.2356252110004425,
"epoch": 6.38,
"inp_emb_norm": 0.38765625,
"loss": 0.2356252110004425,
"masked_top1": 91.13608352661133,
"masked_top5": 99.10308471679687,
"step": 21200,
"top1": 96.23461364746093,
"top5": 99.58956817626954
},
{
"epoch": 6.39,
"grad_norm": 0.8456950881445512,
"learning_rate": 0.0001,
"loss": 0.2376,
"step": 21250
},
{
"ce_loss": 0.2390061578154564,
"epoch": 6.39,
"inp_emb_norm": 0.394609375,
"loss": 0.2390061578154564,
"masked_top1": 90.48685897827148,
"masked_top5": 98.98336410522461,
"step": 21250,
"top1": 96.21184814453125,
"top5": 99.59722549438476
},
{
"epoch": 6.41,
"grad_norm": 0.8287547291237191,
"learning_rate": 0.0001,
"loss": 0.242,
"step": 21300
},
{
"ce_loss": 0.2427684971690178,
"epoch": 6.41,
"inp_emb_norm": 0.4001953125,
"loss": 0.2427684971690178,
"masked_top1": 90.06592498779297,
"masked_top5": 98.84460174560547,
"step": 21300,
"top1": 96.14539581298828,
"top5": 99.56721633911133
},
{
"epoch": 6.42,
"grad_norm": 0.9051278595660746,
"learning_rate": 0.0001,
"loss": 0.2406,
"step": 21350
},
{
"ce_loss": 0.24132068693637848,
"epoch": 6.42,
"inp_emb_norm": 0.3931640625,
"loss": 0.24132068693637848,
"masked_top1": 90.12520782470703,
"masked_top5": 99.08217208862305,
"step": 21350,
"top1": 96.13710800170898,
"top5": 99.58537933349609
},
{
"epoch": 6.44,
"grad_norm": 0.9309548266435707,
"learning_rate": 0.0001,
"loss": 0.2448,
"step": 21400
},
{
"ce_loss": 0.2484480223059654,
"epoch": 6.44,
"inp_emb_norm": 0.4006640625,
"loss": 0.2484480223059654,
"masked_top1": 89.13191101074219,
"masked_top5": 98.85858505249024,
"step": 21400,
"top1": 96.04462646484374,
"top5": 99.58927947998046
},
{
"epoch": 6.45,
"grad_norm": 0.9843205463835414,
"learning_rate": 0.0001,
"loss": 0.2433,
"step": 21450
},
{
"ce_loss": 0.2498919489979744,
"epoch": 6.45,
"inp_emb_norm": 0.3948046875,
"loss": 0.2498919489979744,
"masked_top1": 89.5735856628418,
"masked_top5": 98.80221618652344,
"step": 21450,
"top1": 96.03055465698242,
"top5": 99.56374588012696
},
{
"epoch": 6.47,
"grad_norm": 0.9647612984805798,
"learning_rate": 0.0001,
"loss": 0.2479,
"step": 21500
},
{
"ce_loss": 0.25150889009237287,
"epoch": 6.47,
"inp_emb_norm": 0.3948828125,
"loss": 0.25150889009237287,
"masked_top1": 89.32119857788086,
"masked_top5": 98.80714309692382,
"step": 21500,
"top1": 95.92482788085937,
"top5": 99.56254333496094
},
{
"epoch": 6.48,
"grad_norm": 1.006084785735041,
"learning_rate": 0.0001,
"loss": 0.2484,
"step": 21550
},
{
"ce_loss": 0.24291085809469223,
"epoch": 6.48,
"inp_emb_norm": 0.3999609375,
"loss": 0.24291085809469223,
"masked_top1": 89.84078826904297,
"masked_top5": 98.79897399902343,
"step": 21550,
"top1": 96.13382278442383,
"top5": 99.56241195678712
},
{
"epoch": 6.5,
"grad_norm": 0.9435564610662254,
"learning_rate": 0.0001,
"loss": 0.2484,
"step": 21600
},
{
"ce_loss": 0.25397088915109634,
"epoch": 6.5,
"inp_emb_norm": 0.390078125,
"loss": 0.25397088915109634,
"masked_top1": 89.26468048095703,
"masked_top5": 98.83024612426757,
"step": 21600,
"top1": 95.98090301513672,
"top5": 99.53740264892578
},
{
"epoch": 6.51,
"grad_norm": 0.9039119083122704,
"learning_rate": 0.0001,
"loss": 0.2536,
"step": 21650
},
{
"ce_loss": 0.2490449759364128,
"epoch": 6.51,
"inp_emb_norm": 0.398671875,
"loss": 0.2490449759364128,
"masked_top1": 90.15557754516601,
"masked_top5": 98.96521423339844,
"step": 21650,
"top1": 96.07868286132812,
"top5": 99.56334945678711
},
{
"epoch": 6.53,
"grad_norm": 0.9083177376434824,
"learning_rate": 0.0001,
"loss": 0.2511,
"step": 21700
},
{
"ce_loss": 0.25019362777471543,
"epoch": 6.53,
"inp_emb_norm": 0.4041796875,
"loss": 0.25019362777471543,
"masked_top1": 89.28435134887695,
"masked_top5": 98.76094360351563,
"step": 21700,
"top1": 95.98147827148438,
"top5": 99.55952423095704
},
{
"epoch": 6.54,
"grad_norm": 0.9468133124761712,
"learning_rate": 0.0001,
"loss": 0.2559,
"step": 21750
},
{
"ce_loss": 0.24820626825094222,
"epoch": 6.54,
"inp_emb_norm": 0.407109375,
"loss": 0.24820626825094222,
"masked_top1": 89.13103088378907,
"masked_top5": 98.86271697998046,
"step": 21750,
"top1": 96.04320175170898,
"top5": 99.57524795532227
},
{
"epoch": 6.56,
"grad_norm": 0.9411575962043335,
"learning_rate": 0.0001,
"loss": 0.2515,
"step": 21800
},
{
"ce_loss": 0.25579195737838745,
"epoch": 6.56,
"inp_emb_norm": 0.3958203125,
"loss": 0.25579195737838745,
"masked_top1": 89.12390563964844,
"masked_top5": 98.75174041748046,
"step": 21800,
"top1": 95.92440002441407,
"top5": 99.55484436035157
},
{
"epoch": 6.57,
"grad_norm": 0.9083686787254603,
"learning_rate": 0.0001,
"loss": 0.2572,
"step": 21850
},
{
"ce_loss": 0.2511313533782959,
"epoch": 6.57,
"inp_emb_norm": 0.398984375,
"loss": 0.2511313533782959,
"masked_top1": 90.02878692626953,
"masked_top5": 98.91113662719727,
"step": 21850,
"top1": 96.03028366088867,
"top5": 99.57074462890625
},
{
"epoch": 6.59,
"grad_norm": 0.9524791704046712,
"learning_rate": 0.0001,
"loss": 0.2561,
"step": 21900
},
{
"ce_loss": 0.25475972771644595,
"epoch": 6.59,
"inp_emb_norm": 0.39703125,
"loss": 0.25475972771644595,
"masked_top1": 89.4745411682129,
"masked_top5": 98.61525360107422,
"step": 21900,
"top1": 95.92261184692383,
"top5": 99.54248062133789
},
{
"epoch": 6.6,
"grad_norm": 0.9780779866428146,
"learning_rate": 0.0001,
"loss": 0.2543,
"step": 21950
},
{
"ce_loss": 0.25268141776323316,
"epoch": 6.6,
"inp_emb_norm": 0.400859375,
"loss": 0.25268141776323316,
"masked_top1": 89.10960723876953,
"masked_top5": 98.78202194213867,
"step": 21950,
"top1": 96.01406188964843,
"top5": 99.5376611328125
},
{
"epoch": 6.62,
"grad_norm": 0.9274176844179849,
"learning_rate": 0.0001,
"loss": 0.2628,
"step": 22000
},
{
"ce_loss": 0.2659920188784599,
"epoch": 6.62,
"inp_emb_norm": 0.3957421875,
"loss": 0.2659920188784599,
"masked_top1": 88.22967803955078,
"masked_top5": 98.6833935546875,
"step": 22000,
"top1": 95.75867980957031,
"top5": 99.55640853881836
},
{
"epoch": 6.63,
"grad_norm": 1.033134365793791,
"learning_rate": 0.0001,
"loss": 0.2614,
"step": 22050
},
{
"ce_loss": 0.26293145805597307,
"epoch": 6.63,
"inp_emb_norm": 0.3959375,
"loss": 0.26293145805597307,
"masked_top1": 88.64593627929688,
"masked_top5": 98.49295043945312,
"step": 22050,
"top1": 95.80199890136718,
"top5": 99.52016479492187
},
{
"epoch": 6.65,
"grad_norm": 0.9919274843394079,
"learning_rate": 0.0001,
"loss": 0.2614,
"step": 22100
},
{
"ce_loss": 0.2592626142501831,
"epoch": 6.65,
"inp_emb_norm": 0.39484375,
"loss": 0.2592626142501831,
"masked_top1": 89.1805844116211,
"masked_top5": 98.66838363647462,
"step": 22100,
"top1": 95.86548355102539,
"top5": 99.5434815979004
},
{
"epoch": 6.66,
"grad_norm": 0.923669979047392,
"learning_rate": 0.0001,
"loss": 0.2684,
"step": 22150
},
{
"ce_loss": 0.25953867882490156,
"epoch": 6.66,
"inp_emb_norm": 0.4025,
"loss": 0.25953867882490156,
"masked_top1": 88.57401412963867,
"masked_top5": 98.65271347045899,
"step": 22150,
"top1": 95.81649612426757,
"top5": 99.55012420654298
},
{
"epoch": 6.68,
"grad_norm": 0.9781925396499678,
"learning_rate": 0.0001,
"loss": 0.2676,
"step": 22200
},
{
"ce_loss": 0.2756389129161835,
"epoch": 6.68,
"inp_emb_norm": 0.3946875,
"loss": 0.2756389129161835,
"masked_top1": 87.85789031982422,
"masked_top5": 98.56079971313477,
"step": 22200,
"top1": 95.60260299682618,
"top5": 99.51409820556641
},
{
"epoch": 6.69,
"grad_norm": 0.881278477232779,
"learning_rate": 0.0001,
"loss": 0.2694,
"step": 22250
},
{
"ce_loss": 0.26733168482780456,
"epoch": 6.69,
"inp_emb_norm": 0.3998828125,
"loss": 0.26733168482780456,
"masked_top1": 88.51319839477539,
"masked_top5": 98.58459121704101,
"step": 22250,
"top1": 95.67998138427734,
"top5": 99.5245753479004
},
{
"epoch": 6.71,
"grad_norm": 0.9944879689962476,
"learning_rate": 0.0001,
"loss": 0.2695,
"step": 22300
},
{
"ce_loss": 0.2680419811606407,
"epoch": 6.71,
"inp_emb_norm": 0.3937109375,
"loss": 0.2680419811606407,
"masked_top1": 89.19367111206054,
"masked_top5": 98.77984527587891,
"step": 22300,
"top1": 95.70912155151368,
"top5": 99.52434310913085
},
{
"epoch": 6.72,
"grad_norm": 0.9344294418856229,
"learning_rate": 0.0001,
"loss": 0.2706,
"step": 22350
},
{
"ce_loss": 0.2640018093585968,
"epoch": 6.72,
"inp_emb_norm": 0.3963671875,
"loss": 0.2640018093585968,
"masked_top1": 88.87513946533203,
"masked_top5": 98.68793273925782,
"step": 22350,
"top1": 95.7931625366211,
"top5": 99.53593536376952
},
{
"epoch": 6.74,
"grad_norm": 0.9234955706616644,
"learning_rate": 0.0001,
"loss": 0.2694,
"step": 22400
},
{
"ce_loss": 0.27443262994289397,
"epoch": 6.74,
"inp_emb_norm": 0.401484375,
"loss": 0.27443262994289397,
"masked_top1": 87.91703521728516,
"masked_top5": 98.35836242675781,
"step": 22400,
"top1": 95.63983840942383,
"top5": 99.48991470336914
},
{
"epoch": 6.75,
"grad_norm": 0.9774569390983554,
"learning_rate": 0.0001,
"loss": 0.273,
"step": 22450
},
{
"ce_loss": 0.2766887894272804,
"epoch": 6.75,
"inp_emb_norm": 0.398359375,
"loss": 0.2766887894272804,
"masked_top1": 88.1091081237793,
"masked_top5": 98.57858413696289,
"step": 22450,
"top1": 95.64536865234375,
"top5": 99.5364192199707
},
{
"epoch": 6.77,
"grad_norm": 0.8755362789382393,
"learning_rate": 0.0001,
"loss": 0.274,
"step": 22500
},
{
"ce_loss": 0.27666087716817855,
"epoch": 6.77,
"inp_emb_norm": 0.4048046875,
"loss": 0.27666087716817855,
"masked_top1": 87.76848861694336,
"masked_top5": 98.57107147216797,
"step": 22500,
"top1": 95.56947143554687,
"top5": 99.51631744384765
},
{
"epoch": 6.78,
"grad_norm": 0.8974975580290838,
"learning_rate": 0.0001,
"loss": 0.2727,
"step": 22550
},
{
"ce_loss": 0.26903481036424637,
"epoch": 6.78,
"inp_emb_norm": 0.3957421875,
"loss": 0.26903481036424637,
"masked_top1": 88.31371643066406,
"masked_top5": 98.50493865966797,
"step": 22550,
"top1": 95.6751333618164,
"top5": 99.53861953735351
},
{
"epoch": 6.8,
"grad_norm": 0.9330247587083373,
"learning_rate": 0.0001,
"loss": 0.2773,
"step": 22600
},
{
"ce_loss": 0.27828563034534454,
"epoch": 6.8,
"inp_emb_norm": 0.40484375,
"loss": 0.27828563034534454,
"masked_top1": 87.45176528930664,
"masked_top5": 98.36986709594727,
"step": 22600,
"top1": 95.5737760925293,
"top5": 99.49567245483398
},
{
"epoch": 6.81,
"grad_norm": 0.9335442063104634,
"learning_rate": 0.0001,
"loss": 0.2753,
"step": 22650
},
{
"ce_loss": 0.27865981191396716,
"epoch": 6.81,
"inp_emb_norm": 0.40390625,
"loss": 0.27865981191396716,
"masked_top1": 87.10857299804688,
"masked_top5": 98.53876022338868,
"step": 22650,
"top1": 95.47858245849609,
"top5": 99.53715377807617
},
{
"epoch": 6.83,
"grad_norm": 0.9872165512055184,
"learning_rate": 0.0001,
"loss": 0.274,
"step": 22700
},
{
"ce_loss": 0.2781349629163742,
"epoch": 6.83,
"inp_emb_norm": 0.409296875,
"loss": 0.2781349629163742,
"masked_top1": 86.65185195922851,
"masked_top5": 98.23174392700196,
"step": 22700,
"top1": 95.48913436889649,
"top5": 99.51249389648437
},
{
"epoch": 6.84,
"grad_norm": 0.8421329759123019,
"learning_rate": 0.0001,
"loss": 0.2809,
"step": 22750
},
{
"ce_loss": 0.2819749695062637,
"epoch": 6.84,
"inp_emb_norm": 0.4040625,
"loss": 0.2819749695062637,
"masked_top1": 87.23991241455079,
"masked_top5": 98.37083847045898,
"step": 22750,
"top1": 95.48827087402344,
"top5": 99.51471160888671
},
{
"epoch": 6.86,
"grad_norm": 0.9374127847054397,
"learning_rate": 0.0001,
"loss": 0.283,
"step": 22800
},
{
"ce_loss": 0.2752624320983887,
"epoch": 6.86,
"inp_emb_norm": 0.4168359375,
"loss": 0.2752624320983887,
"masked_top1": 87.0565657043457,
"masked_top5": 98.52674087524414,
"step": 22800,
"top1": 95.5639727783203,
"top5": 99.51801193237304
},
{
"epoch": 6.87,
"grad_norm": 0.894365544580622,
"learning_rate": 0.0001,
"loss": 0.2806,
"step": 22850
},
{
"ce_loss": 0.28234552562236787,
"epoch": 6.87,
"inp_emb_norm": 0.3933984375,
"loss": 0.28234552562236787,
"masked_top1": 87.87512283325195,
"masked_top5": 98.52397399902344,
"step": 22850,
"top1": 95.52448684692382,
"top5": 99.49992218017579
},
{
"epoch": 6.89,
"grad_norm": 0.8865838096449303,
"learning_rate": 0.0001,
"loss": 0.2803,
"step": 22900
},
{
"ce_loss": 0.2735359054803848,
"epoch": 6.89,
"inp_emb_norm": 0.4132421875,
"loss": 0.2735359054803848,
"masked_top1": 87.56801651000977,
"masked_top5": 98.44611022949219,
"step": 22900,
"top1": 95.64279312133789,
"top5": 99.52417373657227
},
{
"epoch": 6.9,
"grad_norm": 1.0014374606270757,
"learning_rate": 0.0001,
"loss": 0.2846,
"step": 22950
},
{
"ce_loss": 0.2815407305955887,
"epoch": 6.9,
"inp_emb_norm": 0.4108203125,
"loss": 0.2815407305955887,
"masked_top1": 86.75137435913086,
"masked_top5": 98.28164108276367,
"step": 22950,
"top1": 95.45286041259766,
"top5": 99.5033415222168
},
{
"epoch": 6.92,
"grad_norm": 0.9418515928051996,
"learning_rate": 0.0001,
"loss": 0.2881,
"step": 23000
},
{
"ce_loss": 0.2875212562084198,
"epoch": 6.92,
"inp_emb_norm": 0.393046875,
"loss": 0.2875212562084198,
"masked_top1": 86.9408479309082,
"masked_top5": 98.42632766723632,
"step": 23000,
"top1": 95.38972747802734,
"top5": 99.51202941894532
},
{
"epoch": 6.93,
"grad_norm": 0.9640044804691634,
"learning_rate": 0.0001,
"loss": 0.2849,
"step": 23050
},
{
"ce_loss": 0.28959711760282514,
"epoch": 6.93,
"inp_emb_norm": 0.39890625,
"loss": 0.28959711760282514,
"masked_top1": 87.06779602050781,
"masked_top5": 98.1885205078125,
"step": 23050,
"top1": 95.33015533447265,
"top5": 99.4693441772461
},
{
"epoch": 6.95,
"grad_norm": 0.884763494943833,
"learning_rate": 0.0001,
"loss": 0.2864,
"step": 23100
},
{
"ce_loss": 0.2791384127736092,
"epoch": 6.95,
"inp_emb_norm": 0.3983203125,
"loss": 0.2791384127736092,
"masked_top1": 87.34188446044922,
"masked_top5": 98.60519989013672,
"step": 23100,
"top1": 95.44756973266601,
"top5": 99.51571304321288
},
{
"epoch": 6.96,
"grad_norm": 0.9969087357121538,
"learning_rate": 0.0001,
"loss": 0.2819,
"step": 23150
},
{
"ce_loss": 0.2769463035464287,
"epoch": 6.96,
"inp_emb_norm": 0.40640625,
"loss": 0.2769463035464287,
"masked_top1": 87.70826522827149,
"masked_top5": 98.4365104675293,
"step": 23150,
"top1": 95.51589462280273,
"top5": 99.52118911743165
},
{
"epoch": 6.98,
"grad_norm": 1.0169282128070403,
"learning_rate": 0.0001,
"loss": 0.2876,
"step": 23200
},
{
"ce_loss": 0.28613451212644575,
"epoch": 6.98,
"inp_emb_norm": 0.409296875,
"loss": 0.28613451212644575,
"masked_top1": 86.17974639892579,
"masked_top5": 98.22409042358399,
"step": 23200,
"top1": 95.39607528686524,
"top5": 99.49454742431641
},
{
"epoch": 6.99,
"grad_norm": 1.0450277523154698,
"learning_rate": 0.0001,
"loss": 0.2889,
"step": 23250
},
{
"ce_loss": 0.2973794335126877,
"epoch": 6.99,
"inp_emb_norm": 0.40296875,
"loss": 0.2973794335126877,
"masked_top1": 86.13511917114258,
"masked_top5": 98.26584106445313,
"step": 23250,
"top1": 95.27028549194335,
"top5": 99.47429718017578
},
{
"epoch": 7.01,
"grad_norm": 0.6565758700785924,
"learning_rate": 0.0001,
"loss": 0.2269,
"step": 23300
},
{
"ce_loss": 0.22528975576162338,
"epoch": 7.01,
"inp_emb_norm": 0.4019140625,
"loss": 0.22528975576162338,
"masked_top1": 91.23332473754883,
"masked_top5": 99.0590966796875,
"step": 23300,
"top1": 96.40823837280273,
"top5": 99.59931091308594
},
{
"epoch": 7.02,
"grad_norm": 0.7347638074045553,
"learning_rate": 0.0001,
"loss": 0.1611,
"step": 23350
},
{
"ce_loss": 0.16495208382606508,
"epoch": 7.02,
"inp_emb_norm": 0.4017578125,
"loss": 0.16495208382606508,
"masked_top1": 95.35215362548828,
"masked_top5": 99.73044998168945,
"step": 23350,
"top1": 97.44107040405274,
"top5": 99.70461227416992
},
{
"epoch": 7.04,
"grad_norm": 0.6645294223117588,
"learning_rate": 0.0001,
"loss": 0.1606,
"step": 23400
},
{
"ce_loss": 0.1565721134841442,
"epoch": 7.04,
"inp_emb_norm": 0.4126953125,
"loss": 0.1565721134841442,
"masked_top1": 95.39784866333008,
"masked_top5": 99.75249832153321,
"step": 23400,
"top1": 97.55490280151368,
"top5": 99.71364120483399
},
{
"epoch": 7.05,
"grad_norm": 0.7050716054650297,
"learning_rate": 0.0001,
"loss": 0.1606,
"step": 23450
},
{
"ce_loss": 0.15757375702261925,
"epoch": 7.05,
"inp_emb_norm": 0.4034765625,
"loss": 0.15757375702261925,
"masked_top1": 95.50349487304688,
"masked_top5": 99.6744792175293,
"step": 23450,
"top1": 97.49706893920899,
"top5": 99.70018051147461
},
{
"epoch": 7.07,
"grad_norm": 0.70439339407283,
"learning_rate": 0.0001,
"loss": 0.1595,
"step": 23500
},
{
"ce_loss": 0.15974825277924537,
"epoch": 7.07,
"inp_emb_norm": 0.410625,
"loss": 0.15974825277924537,
"masked_top1": 95.76246231079102,
"masked_top5": 99.68396453857422,
"step": 23500,
"top1": 97.5335317993164,
"top5": 99.69112976074219
},
{
"epoch": 7.08,
"grad_norm": 0.8001940943257561,
"learning_rate": 0.0001,
"loss": 0.1643,
"step": 23550
},
{
"ce_loss": 0.16370448261499404,
"epoch": 7.08,
"inp_emb_norm": 0.4030859375,
"loss": 0.16370448261499404,
"masked_top1": 95.43571884155273,
"masked_top5": 99.68741439819335,
"step": 23550,
"top1": 97.46105117797852,
"top5": 99.6865510559082
},
{
"epoch": 7.1,
"grad_norm": 0.7666147004483381,
"learning_rate": 0.0001,
"loss": 0.1653,
"step": 23600
},
{
"ce_loss": 0.16188147634267808,
"epoch": 7.1,
"inp_emb_norm": 0.408828125,
"loss": 0.16188147634267808,
"masked_top1": 95.91392837524414,
"masked_top5": 99.76342895507813,
"step": 23600,
"top1": 97.41842742919921,
"top5": 99.69919830322266
},
{
"epoch": 7.11,
"grad_norm": 0.753790024789609,
"learning_rate": 0.0001,
"loss": 0.1674,
"step": 23650
},
{
"ce_loss": 0.16793357729911804,
"epoch": 7.11,
"inp_emb_norm": 0.4030078125,
"loss": 0.16793357729911804,
"masked_top1": 95.3747721862793,
"masked_top5": 99.69120513916016,
"step": 23650,
"top1": 97.39034225463867,
"top5": 99.67092514038086
},
{
"epoch": 7.13,
"grad_norm": 0.7560663951042244,
"learning_rate": 0.0001,
"loss": 0.1714,
"step": 23700
},
{
"ce_loss": 0.16748950853943825,
"epoch": 7.13,
"inp_emb_norm": 0.4015234375,
"loss": 0.16748950853943825,
"masked_top1": 95.61355926513671,
"masked_top5": 99.70470977783204,
"step": 23700,
"top1": 97.36271957397462,
"top5": 99.68995620727539
},
{
"epoch": 7.14,
"grad_norm": 0.7885900131749313,
"learning_rate": 0.0001,
"loss": 0.1718,
"step": 23750
},
{
"ce_loss": 0.16834189236164093,
"epoch": 7.14,
"inp_emb_norm": 0.4105859375,
"loss": 0.16834189236164093,
"masked_top1": 95.20903045654296,
"masked_top5": 99.66537704467774,
"step": 23750,
"top1": 97.41024429321288,
"top5": 99.6710530090332
},
{
"epoch": 7.16,
"grad_norm": 0.708267720328551,
"learning_rate": 0.0001,
"loss": 0.1718,
"step": 23800
},
{
"ce_loss": 0.16808076053857804,
"epoch": 7.16,
"inp_emb_norm": 0.4126953125,
"loss": 0.16808076053857804,
"masked_top1": 94.77564025878907,
"masked_top5": 99.67258102416992,
"step": 23800,
"top1": 97.39059310913086,
"top5": 99.68683700561523
},
{
"epoch": 7.17,
"grad_norm": 0.7351045964671515,
"learning_rate": 0.0001,
"loss": 0.174,
"step": 23850
},
{
"ce_loss": 0.1726333273947239,
"epoch": 7.17,
"inp_emb_norm": 0.4017578125,
"loss": 0.1726333273947239,
"masked_top1": 94.88044723510743,
"masked_top5": 99.62449737548827,
"step": 23850,
"top1": 97.2899102783203,
"top5": 99.65145614624024
},
{
"epoch": 7.19,
"grad_norm": 0.8049347796917691,
"learning_rate": 0.0001,
"loss": 0.1787,
"step": 23900
},
{
"ce_loss": 0.17679258838295936,
"epoch": 7.19,
"inp_emb_norm": 0.4061328125,
"loss": 0.17679258838295936,
"masked_top1": 94.69095947265625,
"masked_top5": 99.67362945556641,
"step": 23900,
"top1": 97.20210205078125,
"top5": 99.68624572753906
},
{
"epoch": 7.2,
"grad_norm": 0.8012010836512982,
"learning_rate": 0.0001,
"loss": 0.1788,
"step": 23950
},
{
"ce_loss": 0.1779346537590027,
"epoch": 7.2,
"inp_emb_norm": 0.40765625,
"loss": 0.1779346537590027,
"masked_top1": 94.71529022216797,
"masked_top5": 99.61817611694336,
"step": 23950,
"top1": 97.14944107055663,
"top5": 99.65997314453125
},
{
"epoch": 7.22,
"grad_norm": 0.8096182631639389,
"learning_rate": 0.0001,
"loss": 0.1782,
"step": 24000
},
{
"ce_loss": 0.17779998898506164,
"epoch": 7.22,
"inp_emb_norm": 0.40890625,
"loss": 0.17779998898506164,
"masked_top1": 94.6820344543457,
"masked_top5": 99.6923991394043,
"step": 24000,
"top1": 97.1572166442871,
"top5": 99.69579544067383
},
{
"epoch": 7.23,
"grad_norm": 0.7208752096873576,
"learning_rate": 0.0001,
"loss": 0.1795,
"step": 24050
},
{
"ce_loss": 0.18153419494628906,
"epoch": 7.23,
"inp_emb_norm": 0.4061328125,
"loss": 0.18153419494628906,
"masked_top1": 94.62552368164063,
"masked_top5": 99.55692886352539,
"step": 24050,
"top1": 97.11937118530274,
"top5": 99.64983123779297
},
{
"epoch": 7.25,
"grad_norm": 0.8302822944148331,
"learning_rate": 0.0001,
"loss": 0.1832,
"step": 24100
},
{
"ce_loss": 0.18768798112869262,
"epoch": 7.25,
"inp_emb_norm": 0.41546875,
"loss": 0.18768798112869262,
"masked_top1": 93.77630233764648,
"masked_top5": 99.51055358886718,
"step": 24100,
"top1": 97.08079849243164,
"top5": 99.66075820922852
},
{
"epoch": 7.26,
"grad_norm": 0.8034956740784042,
"learning_rate": 0.0001,
"loss": 0.1895,
"step": 24150
},
{
"ce_loss": 0.19201560974121093,
"epoch": 7.26,
"inp_emb_norm": 0.40984375,
"loss": 0.19201560974121093,
"masked_top1": 93.87309066772461,
"masked_top5": 99.5169157409668,
"step": 24150,
"top1": 97.02114761352539,
"top5": 99.6527261352539
},
{
"epoch": 7.28,
"grad_norm": 0.8314048125761649,
"learning_rate": 0.0001,
"loss": 0.1873,
"step": 24200
},
{
"ce_loss": 0.1878107675909996,
"epoch": 7.28,
"inp_emb_norm": 0.407265625,
"loss": 0.1878107675909996,
"masked_top1": 94.19107177734375,
"masked_top5": 99.55671203613281,
"step": 24200,
"top1": 97.03997573852538,
"top5": 99.68890594482421
},
{
"epoch": 7.29,
"grad_norm": 0.704357096474096,
"learning_rate": 0.0001,
"loss": 0.1893,
"step": 24250
},
{
"ce_loss": 0.19042812794446945,
"epoch": 7.29,
"inp_emb_norm": 0.416328125,
"loss": 0.19042812794446945,
"masked_top1": 93.61178329467774,
"masked_top5": 99.54245040893555,
"step": 24250,
"top1": 96.90492645263672,
"top5": 99.6619076538086
},
{
"epoch": 7.31,
"grad_norm": 0.856253880169746,
"learning_rate": 0.0001,
"loss": 0.1893,
"step": 24300
},
{
"ce_loss": 0.1884206250309944,
"epoch": 7.31,
"inp_emb_norm": 0.4078515625,
"loss": 0.1884206250309944,
"masked_top1": 93.98220611572266,
"masked_top5": 99.54290969848633,
"step": 24300,
"top1": 96.96899108886718,
"top5": 99.6441293334961
},
{
"epoch": 7.32,
"grad_norm": 0.8464641167931024,
"learning_rate": 0.0001,
"loss": 0.1918,
"step": 24350
},
{
"ce_loss": 0.19378722280263902,
"epoch": 7.32,
"inp_emb_norm": 0.4122265625,
"loss": 0.19378722280263902,
"masked_top1": 93.98709579467773,
"masked_top5": 99.48938507080078,
"step": 24350,
"top1": 96.92246383666992,
"top5": 99.6635366821289
},
{
"epoch": 7.34,
"grad_norm": 0.9274851248218718,
"learning_rate": 0.0001,
"loss": 0.1979,
"step": 24400
},
{
"ce_loss": 0.20088252812623977,
"epoch": 7.34,
"inp_emb_norm": 0.405546875,
"loss": 0.20088252812623977,
"masked_top1": 92.93017440795899,
"masked_top5": 99.49172775268555,
"step": 24400,
"top1": 96.83464401245118,
"top5": 99.64353897094726
},
{
"epoch": 7.35,
"grad_norm": 0.7721388576512122,
"learning_rate": 0.0001,
"loss": 0.1934,
"step": 24450
},
{
"ce_loss": 0.2001592782139778,
"epoch": 7.35,
"inp_emb_norm": 0.413828125,
"loss": 0.2001592782139778,
"masked_top1": 93.32104949951172,
"masked_top5": 99.480458984375,
"step": 24450,
"top1": 96.84799270629883,
"top5": 99.66846115112304
},
{
"epoch": 7.37,
"grad_norm": 0.8460246810468973,
"learning_rate": 0.0001,
"loss": 0.1937,
"step": 24500
},
{
"ce_loss": 0.19288330137729645,
"epoch": 7.37,
"inp_emb_norm": 0.40984375,
"loss": 0.19288330137729645,
"masked_top1": 93.38121078491211,
"masked_top5": 99.56306228637695,
"step": 24500,
"top1": 96.92339248657227,
"top5": 99.67182235717773
},
{
"epoch": 7.38,
"grad_norm": 0.8742179314746799,
"learning_rate": 0.0001,
"loss": 0.1982,
"step": 24550
},
{
"ce_loss": 0.19720925360918046,
"epoch": 7.38,
"inp_emb_norm": 0.412890625,
"loss": 0.19720925360918046,
"masked_top1": 93.280068359375,
"masked_top5": 99.51691192626953,
"step": 24550,
"top1": 96.83218688964844,
"top5": 99.66712615966797
},
{
"epoch": 7.4,
"grad_norm": 0.8553325710712862,
"learning_rate": 0.0001,
"loss": 0.2011,
"step": 24600
},
{
"ce_loss": 0.19960668057203293,
"epoch": 7.4,
"inp_emb_norm": 0.409765625,
"loss": 0.19960668057203293,
"masked_top1": 93.45094650268555,
"masked_top5": 99.46210220336914,
"step": 24600,
"top1": 96.85580673217774,
"top5": 99.665537109375
},
{
"epoch": 7.41,
"grad_norm": 0.8454568497353904,
"learning_rate": 0.0001,
"loss": 0.2022,
"step": 24650
},
{
"ce_loss": 0.20072863966226578,
"epoch": 7.41,
"inp_emb_norm": 0.4177734375,
"loss": 0.20072863966226578,
"masked_top1": 92.75718322753906,
"masked_top5": 99.35001556396485,
"step": 24650,
"top1": 96.78503204345704,
"top5": 99.66353454589844
},
{
"epoch": 7.43,
"grad_norm": 0.8262303326226429,
"learning_rate": 0.0001,
"loss": 0.205,
"step": 24700
},
{
"ce_loss": 0.20386093407869338,
"epoch": 7.43,
"inp_emb_norm": 0.4098046875,
"loss": 0.20386093407869338,
"masked_top1": 93.37680938720703,
"masked_top5": 99.56277999877929,
"step": 24700,
"top1": 96.77312591552734,
"top5": 99.6433283996582
},
{
"epoch": 7.44,
"grad_norm": 0.8542900336102568,
"learning_rate": 0.0001,
"loss": 0.2073,
"step": 24750
},
{
"ce_loss": 0.20805983483791352,
"epoch": 7.44,
"inp_emb_norm": 0.4114453125,
"loss": 0.20805983483791352,
"masked_top1": 92.38563873291015,
"masked_top5": 99.27365493774414,
"step": 24750,
"top1": 96.68102691650391,
"top5": 99.62649429321289
},
{
"epoch": 7.46,
"grad_norm": 0.775495795649607,
"learning_rate": 0.0001,
"loss": 0.2084,
"step": 24800
},
{
"ce_loss": 0.20858111292123793,
"epoch": 7.46,
"inp_emb_norm": 0.40828125,
"loss": 0.20858111292123793,
"masked_top1": 93.18427093505859,
"masked_top5": 99.40792358398437,
"step": 24800,
"top1": 96.69915771484375,
"top5": 99.64056365966798
},
{
"epoch": 7.47,
"grad_norm": 0.795773364310639,
"learning_rate": 0.0001,
"loss": 0.2128,
"step": 24850
},
{
"ce_loss": 0.21129278868436813,
"epoch": 7.47,
"inp_emb_norm": 0.4151953125,
"loss": 0.21129278868436813,
"masked_top1": 92.36515197753906,
"masked_top5": 99.46405319213868,
"step": 24850,
"top1": 96.61996765136719,
"top5": 99.64519241333008
},
{
"epoch": 7.49,
"grad_norm": 0.8606373006872489,
"learning_rate": 0.0001,
"loss": 0.2107,
"step": 24900
},
{
"ce_loss": 0.2146734645962715,
"epoch": 7.49,
"inp_emb_norm": 0.416328125,
"loss": 0.2146734645962715,
"masked_top1": 92.35210235595703,
"masked_top5": 99.37399642944337,
"step": 24900,
"top1": 96.58806320190429,
"top5": 99.66881698608398
},
{
"epoch": 7.5,
"grad_norm": 0.8148609241548314,
"learning_rate": 0.0001,
"loss": 0.2145,
"step": 24950
},
{
"ce_loss": 0.21042368352413177,
"epoch": 7.5,
"inp_emb_norm": 0.41640625,
"loss": 0.21042368352413177,
"masked_top1": 92.42163009643555,
"masked_top5": 99.2802491760254,
"step": 24950,
"top1": 96.64725265502929,
"top5": 99.64126876831055
},
{
"epoch": 7.52,
"grad_norm": 0.8552723662919675,
"learning_rate": 0.0001,
"loss": 0.2152,
"step": 25000
},
{
"ce_loss": 0.21622439593076706,
"epoch": 7.52,
"inp_emb_norm": 0.411796875,
"loss": 0.21622439593076706,
"masked_top1": 92.36061889648437,
"masked_top5": 99.40323318481445,
"step": 25000,
"top1": 96.49748916625977,
"top5": 99.6481248474121
},
{
"epoch": 7.53,
"grad_norm": 0.8756422021562937,
"learning_rate": 0.0001,
"loss": 0.2175,
"step": 25050
},
{
"ce_loss": 0.21715206265449524,
"epoch": 7.53,
"inp_emb_norm": 0.4205859375,
"loss": 0.21715206265449524,
"masked_top1": 92.16612228393555,
"masked_top5": 99.21943115234374,
"step": 25050,
"top1": 96.50445205688476,
"top5": 99.63119323730469
},
{
"epoch": 7.55,
"grad_norm": 0.9236893593627393,
"learning_rate": 0.0001,
"loss": 0.2156,
"step": 25100
},
{
"ce_loss": 0.21899319916963578,
"epoch": 7.55,
"inp_emb_norm": 0.413203125,
"loss": 0.21899319916963578,
"masked_top1": 91.96203842163087,
"masked_top5": 99.34012496948242,
"step": 25100,
"top1": 96.50180130004883,
"top5": 99.64115966796875
},
{
"epoch": 7.56,
"grad_norm": 0.931694461742947,
"learning_rate": 0.0001,
"loss": 0.2197,
"step": 25150
},
{
"ce_loss": 0.21712171256542206,
"epoch": 7.56,
"inp_emb_norm": 0.4085546875,
"loss": 0.21712171256542206,
"masked_top1": 92.56915695190429,
"masked_top5": 99.49829315185546,
"step": 25150,
"top1": 96.46173583984375,
"top5": 99.64132019042968
},
{
"epoch": 7.58,
"grad_norm": 0.92715510790691,
"learning_rate": 0.0001,
"loss": 0.2259,
"step": 25200
},
{
"ce_loss": 0.2213623434305191,
"epoch": 7.58,
"inp_emb_norm": 0.4107421875,
"loss": 0.2213623434305191,
"masked_top1": 91.66716598510742,
"masked_top5": 99.32309783935547,
"step": 25200,
"top1": 96.42446884155274,
"top5": 99.6251106262207
},
{
"epoch": 7.59,
"grad_norm": 0.9935343271656442,
"learning_rate": 0.0001,
"loss": 0.225,
"step": 25250
},
{
"ce_loss": 0.22398129254579544,
"epoch": 7.59,
"inp_emb_norm": 0.409140625,
"loss": 0.22398129254579544,
"masked_top1": 92.20207107543945,
"masked_top5": 99.32244827270507,
"step": 25250,
"top1": 96.40167465209962,
"top5": 99.6429054260254
},
{
"epoch": 7.61,
"grad_norm": 0.9151015122387817,
"learning_rate": 0.0001,
"loss": 0.2227,
"step": 25300
},
{
"ce_loss": 0.22134240210056305,
"epoch": 7.61,
"inp_emb_norm": 0.413984375,
"loss": 0.22134240210056305,
"masked_top1": 92.04093017578126,
"masked_top5": 99.4468423461914,
"step": 25300,
"top1": 96.43467788696289,
"top5": 99.66016799926757
},
{
"epoch": 7.62,
"grad_norm": 0.8495278966433639,
"learning_rate": 0.0001,
"loss": 0.2212,
"step": 25350
},
{
"ce_loss": 0.22596073687076568,
"epoch": 7.62,
"inp_emb_norm": 0.4122265625,
"loss": 0.22596073687076568,
"masked_top1": 91.3648748779297,
"masked_top5": 99.2870686340332,
"step": 25350,
"top1": 96.3931086730957,
"top5": 99.61766250610351
},
{
"epoch": 7.64,
"grad_norm": 0.8771189448377541,
"learning_rate": 0.0001,
"loss": 0.224,
"step": 25400
},
{
"ce_loss": 0.22193652182817458,
"epoch": 7.64,
"inp_emb_norm": 0.4132421875,
"loss": 0.22193652182817458,
"masked_top1": 91.40539047241211,
"masked_top5": 99.10585540771484,
"step": 25400,
"top1": 96.49262084960938,
"top5": 99.6045753479004
},
{
"epoch": 7.65,
"grad_norm": 0.8719944439794763,
"learning_rate": 0.0001,
"loss": 0.2299,
"step": 25450
},
{
"ce_loss": 0.23045677453279495,
"epoch": 7.65,
"inp_emb_norm": 0.4094921875,
"loss": 0.23045677453279495,
"masked_top1": 91.44049133300781,
"masked_top5": 99.38550582885742,
"step": 25450,
"top1": 96.28902786254883,
"top5": 99.62447952270507
},
{
"epoch": 7.67,
"grad_norm": 0.948342031847273,
"learning_rate": 0.0001,
"loss": 0.2278,
"step": 25500
},
{
"ce_loss": 0.2266176301240921,
"epoch": 7.67,
"inp_emb_norm": 0.4139453125,
"loss": 0.2266176301240921,
"masked_top1": 91.54910736083984,
"masked_top5": 99.31753051757812,
"step": 25500,
"top1": 96.38055999755859,
"top5": 99.61369903564453
},
{
"epoch": 7.68,
"grad_norm": 0.9160429959948714,
"learning_rate": 0.0001,
"loss": 0.2301,
"step": 25550
},
{
"ce_loss": 0.22997273564338683,
"epoch": 7.68,
"inp_emb_norm": 0.4121484375,
"loss": 0.22997273564338683,
"masked_top1": 91.72368927001953,
"masked_top5": 99.37556945800782,
"step": 25550,
"top1": 96.34187088012695,
"top5": 99.62580551147461
},
{
"epoch": 7.7,
"grad_norm": 0.9064800185064251,
"learning_rate": 0.0001,
"loss": 0.2345,
"step": 25600
},
{
"ce_loss": 0.2321370568871498,
"epoch": 7.7,
"inp_emb_norm": 0.41484375,
"loss": 0.2321370568871498,
"masked_top1": 91.17147521972656,
"masked_top5": 99.15941177368164,
"step": 25600,
"top1": 96.28547775268555,
"top5": 99.6115461730957
},
{
"epoch": 7.71,
"grad_norm": 0.8896861108646344,
"learning_rate": 0.0001,
"loss": 0.2316,
"step": 25650
},
{
"ce_loss": 0.22988739639520644,
"epoch": 7.71,
"inp_emb_norm": 0.41703125,
"loss": 0.22988739639520644,
"masked_top1": 90.95779861450195,
"masked_top5": 99.13907150268555,
"step": 25650,
"top1": 96.32924926757812,
"top5": 99.61460418701172
},
{
"epoch": 7.73,
"grad_norm": 0.8364454804655236,
"learning_rate": 0.0001,
"loss": 0.2308,
"step": 25700
},
{
"ce_loss": 0.23426982671022414,
"epoch": 7.73,
"inp_emb_norm": 0.4129296875,
"loss": 0.23426982671022414,
"masked_top1": 90.87540649414062,
"masked_top5": 99.15606750488281,
"step": 25700,
"top1": 96.24389221191406,
"top5": 99.61002014160157
},
{
"epoch": 7.74,
"grad_norm": 0.8892969722239723,
"learning_rate": 0.0001,
"loss": 0.2326,
"step": 25750
},
{
"ce_loss": 0.23778538197278976,
"epoch": 7.74,
"inp_emb_norm": 0.4173828125,
"loss": 0.23778538197278976,
"masked_top1": 91.04456115722657,
"masked_top5": 99.08432937622071,
"step": 25750,
"top1": 96.19372604370118,
"top5": 99.59805389404296
},
{
"epoch": 7.76,
"grad_norm": 0.9333358166124965,
"learning_rate": 0.0001,
"loss": 0.2324,
"step": 25800
},
{
"ce_loss": 0.2342055532336235,
"epoch": 7.76,
"inp_emb_norm": 0.413203125,
"loss": 0.2342055532336235,
"masked_top1": 90.89793167114257,
"masked_top5": 99.32825225830078,
"step": 25800,
"top1": 96.24822631835937,
"top5": 99.63633575439454
},
{
"epoch": 7.77,
"grad_norm": 0.8972293165472223,
"learning_rate": 0.0001,
"loss": 0.2378,
"step": 25850
},
{
"ce_loss": 0.2358376482129097,
"epoch": 7.77,
"inp_emb_norm": 0.411875,
"loss": 0.2358376482129097,
"masked_top1": 90.5443440246582,
"masked_top5": 99.08838134765625,
"step": 25850,
"top1": 96.21415130615235,
"top5": 99.59769958496094
},
{
"epoch": 7.79,
"grad_norm": 0.8953278969414413,
"learning_rate": 0.0001,
"loss": 0.2394,
"step": 25900
},
{
"ce_loss": 0.241860691010952,
"epoch": 7.79,
"inp_emb_norm": 0.40828125,
"loss": 0.241860691010952,
"masked_top1": 90.6273094177246,
"masked_top5": 99.1496745300293,
"step": 25900,
"top1": 96.1283937072754,
"top5": 99.59819442749023
},
{
"epoch": 7.8,
"grad_norm": 0.8520671639423023,
"learning_rate": 0.0001,
"loss": 0.2401,
"step": 25950
},
{
"ce_loss": 0.2415415197610855,
"epoch": 7.8,
"inp_emb_norm": 0.4100390625,
"loss": 0.2415415197610855,
"masked_top1": 90.58090194702149,
"masked_top5": 99.10164520263672,
"step": 25950,
"top1": 96.16258728027344,
"top5": 99.59717208862304
},
{
"epoch": 7.82,
"grad_norm": 0.8645330931635717,
"learning_rate": 0.0001,
"loss": 0.2373,
"step": 26000
}
],
"logging_steps": 50,
"max_steps": 26600,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 2000,
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}