{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.819548872180452, "eval_steps": 1500, "global_step": 26000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ce_loss": 4.768589496612549, "epoch": 0, "inp_emb_norm": 0.265625, "loss": 4.768589496612549, "masked_top1": 0.0, "masked_top5": 0.4098360538482666, "step": 0, "top1": 64.96458435058594, "top5": 80.73052215576172 }, { "epoch": 0.02, "grad_norm": 6.2690516537633005, "learning_rate": 0.0001, "loss": 4.6296, "step": 50 }, { "ce_loss": 4.672832765579224, "epoch": 0.02, "inp_emb_norm": 0.26263671875, "loss": 4.672832765579224, "masked_top1": 14.987861804962158, "masked_top5": 24.83079730987549, "step": 50, "top1": 66.08985641002656, "top5": 81.12573463439941 }, { "epoch": 0.03, "grad_norm": 9.593317242373375, "learning_rate": 0.0001, "loss": 2.2246, "step": 100 }, { "ce_loss": 2.207055723667145, "epoch": 0.03, "inp_emb_norm": 0.2644140625, "loss": 2.207055723667145, "masked_top1": 23.22451873779297, "masked_top5": 38.51483547210693, "step": 100, "top1": 74.46695602416992, "top5": 87.36076766967773 }, { "epoch": 0.05, "grad_norm": 3.1307366740911395, "learning_rate": 0.0001, "loss": 2.0857, "step": 150 }, { "ce_loss": 2.077478907108307, "epoch": 0.05, "inp_emb_norm": 0.26427734375, "loss": 2.077478907108307, "masked_top1": 27.615498123168944, "masked_top5": 46.10835479736328, "step": 150, "top1": 75.29274810791016, "top5": 88.44302017211913 }, { "epoch": 0.06, "grad_norm": 2.714681744263656, "learning_rate": 0.0001, "loss": 2.0248, "step": 200 }, { "ce_loss": 2.018527669906616, "epoch": 0.06, "inp_emb_norm": 0.2655859375, "loss": 2.018527669906616, "masked_top1": 29.403907699584963, "masked_top5": 48.57966896057129, "step": 200, "top1": 75.77816040039062, "top5": 88.9672624206543 }, { "epoch": 0.08, "grad_norm": 2.4811381161353423, "learning_rate": 0.0001, "loss": 1.9651, "step": 250 }, { "ce_loss": 1.9264942455291747, "epoch": 0.08, "inp_emb_norm": 0.2648828125, "loss": 1.9264942455291747, "masked_top1": 32.32564323425293, "masked_top5": 51.88160499572754, "step": 250, "top1": 76.65794830322265, "top5": 89.52642883300781 }, { "epoch": 0.09, "grad_norm": 2.160461687507101, "learning_rate": 0.0001, "loss": 1.9061, "step": 300 }, { "ce_loss": 1.9086420011520386, "epoch": 0.09, "inp_emb_norm": 0.26564453125, "loss": 1.9086420011520386, "masked_top1": 32.26125560760498, "masked_top5": 52.74158630371094, "step": 300, "top1": 76.66858810424804, "top5": 89.8210075378418 }, { "epoch": 0.11, "grad_norm": 2.173027962906511, "learning_rate": 0.0001, "loss": 1.8717, "step": 350 }, { "ce_loss": 1.856195902824402, "epoch": 0.11, "inp_emb_norm": 0.27287109375, "loss": 1.856195902824402, "masked_top1": 33.80865501403809, "masked_top5": 54.50813941955566, "step": 350, "top1": 77.32864608764649, "top5": 90.11031524658203 }, { "epoch": 0.12, "grad_norm": 2.09826482925296, "learning_rate": 0.0001, "loss": 1.8414, "step": 400 }, { "ce_loss": 1.8570138716697693, "epoch": 0.12, "inp_emb_norm": 0.26978515625, "loss": 1.8570138716697693, "masked_top1": 34.77134052276611, "masked_top5": 55.27866950988769, "step": 400, "top1": 77.23032119750977, "top5": 90.24968856811523 }, { "epoch": 0.14, "grad_norm": 2.6584647355277067, "learning_rate": 0.0001, "loss": 1.8405, "step": 450 }, { "ce_loss": 1.8444490694999696, "epoch": 0.14, "inp_emb_norm": 0.27064453125, "loss": 1.8444490694999696, "masked_top1": 34.76882873535156, "masked_top5": 54.58673110961914, "step": 450, "top1": 77.44166137695312, "top5": 90.14545852661132 }, { "epoch": 0.15, "grad_norm": 1.9115658205853148, "learning_rate": 0.0001, "loss": 1.7976, "step": 500 }, { "ce_loss": 1.804072494506836, "epoch": 0.15, "inp_emb_norm": 0.2703515625, "loss": 1.804072494506836, "masked_top1": 34.5311047744751, "masked_top5": 55.86459785461426, "step": 500, "top1": 77.51275192260742, "top5": 90.69894668579101 }, { "epoch": 0.17, "grad_norm": 1.907634541206245, "learning_rate": 0.0001, "loss": 1.804, "step": 550 }, { "ce_loss": 1.7778214049339294, "epoch": 0.17, "inp_emb_norm": 0.27109375, "loss": 1.7778214049339294, "masked_top1": 35.83196727752686, "masked_top5": 56.62046134948731, "step": 550, "top1": 78.00231246948242, "top5": 90.66332412719727 }, { "epoch": 0.18, "grad_norm": 1.7366360206125175, "learning_rate": 0.0001, "loss": 1.7783, "step": 600 }, { "ce_loss": 1.7576907467842102, "epoch": 0.18, "inp_emb_norm": 0.2725, "loss": 1.7576907467842102, "masked_top1": 36.732100868225096, "masked_top5": 57.86395164489746, "step": 600, "top1": 78.34247482299804, "top5": 90.82281936645508 }, { "epoch": 0.2, "grad_norm": 1.9186053120285074, "learning_rate": 0.0001, "loss": 1.7819, "step": 650 }, { "ce_loss": 1.8032070183753968, "epoch": 0.2, "inp_emb_norm": 0.27015625, "loss": 1.8032070183753968, "masked_top1": 35.763907241821286, "masked_top5": 55.90799743652344, "step": 650, "top1": 77.69748565673828, "top5": 90.52577987670898 }, { "epoch": 0.21, "grad_norm": 1.957034200299797, "learning_rate": 0.0001, "loss": 1.735, "step": 700 }, { "ce_loss": 1.7470700669288635, "epoch": 0.21, "inp_emb_norm": 0.27248046875, "loss": 1.7470700669288635, "masked_top1": 37.93720417022705, "masked_top5": 58.685314331054684, "step": 700, "top1": 78.27062789916992, "top5": 90.87748138427735 }, { "epoch": 0.23, "grad_norm": 2.045975232733077, "learning_rate": 0.0001, "loss": 1.7517, "step": 750 }, { "ce_loss": 1.754687945842743, "epoch": 0.23, "inp_emb_norm": 0.27109375, "loss": 1.754687945842743, "masked_top1": 36.862719764709475, "masked_top5": 57.64240478515625, "step": 750, "top1": 77.98610443115234, "top5": 90.91637084960938 }, { "epoch": 0.24, "grad_norm": 2.1866870928510544, "learning_rate": 0.0001, "loss": 1.7225, "step": 800 }, { "ce_loss": 1.7393132853507995, "epoch": 0.24, "inp_emb_norm": 0.27255859375, "loss": 1.7393132853507995, "masked_top1": 36.725569190979, "masked_top5": 58.16348899841309, "step": 800, "top1": 78.34141525268555, "top5": 91.05248138427734 }, { "epoch": 0.26, "grad_norm": 1.9045759228508574, "learning_rate": 0.0001, "loss": 1.7212, "step": 850 }, { "ce_loss": 1.7258725309371947, "epoch": 0.26, "inp_emb_norm": 0.27572265625, "loss": 1.7258725309371947, "masked_top1": 37.848323364257816, "masked_top5": 58.37759643554688, "step": 850, "top1": 78.46119873046875, "top5": 91.0196517944336 }, { "epoch": 0.27, "grad_norm": 2.0335701239987896, "learning_rate": 0.0001, "loss": 1.7209, "step": 900 }, { "ce_loss": 1.7411377978324891, "epoch": 0.27, "inp_emb_norm": 0.27140625, "loss": 1.7411377978324891, "masked_top1": 37.40238594055176, "masked_top5": 58.47159057617188, "step": 900, "top1": 78.22477142333985, "top5": 91.09677703857422 }, { "epoch": 0.29, "grad_norm": 1.7613813093122805, "learning_rate": 0.0001, "loss": 1.6991, "step": 950 }, { "ce_loss": 1.7078413605690002, "epoch": 0.29, "inp_emb_norm": 0.27345703125, "loss": 1.7078413605690002, "masked_top1": 37.8476549911499, "masked_top5": 58.585325393676754, "step": 950, "top1": 78.66676864624023, "top5": 91.18907363891601 }, { "epoch": 0.3, "grad_norm": 1.746493072421527, "learning_rate": 0.0001, "loss": 1.7029, "step": 1000 }, { "ce_loss": 1.6898603582382201, "epoch": 0.3, "inp_emb_norm": 0.275859375, "loss": 1.6898603582382201, "masked_top1": 38.61885482788086, "masked_top5": 60.25848365783691, "step": 1000, "top1": 78.78071334838867, "top5": 91.36259521484375 }, { "epoch": 0.32, "grad_norm": 1.9274436772647008, "learning_rate": 0.0001, "loss": 1.6826, "step": 1050 }, { "ce_loss": 1.68393492937088, "epoch": 0.32, "inp_emb_norm": 0.2751171875, "loss": 1.68393492937088, "masked_top1": 38.2902375793457, "masked_top5": 59.33997863769531, "step": 1050, "top1": 78.7706575012207, "top5": 91.35841613769531 }, { "epoch": 0.33, "grad_norm": 1.7855509766510127, "learning_rate": 0.0001, "loss": 1.6718, "step": 1100 }, { "ce_loss": 1.6736824607849121, "epoch": 0.33, "inp_emb_norm": 0.276640625, "loss": 1.6736824607849121, "masked_top1": 37.7931421661377, "masked_top5": 59.2380428314209, "step": 1100, "top1": 78.76599548339844, "top5": 91.49175521850586 }, { "epoch": 0.35, "grad_norm": 1.9167748999099827, "learning_rate": 0.0001, "loss": 1.6706, "step": 1150 }, { "ce_loss": 1.6615458488464356, "epoch": 0.35, "inp_emb_norm": 0.27826171875, "loss": 1.6615458488464356, "masked_top1": 37.01531482696533, "masked_top5": 58.88155044555664, "step": 1150, "top1": 79.07698608398438, "top5": 91.33328231811524 }, { "epoch": 0.36, "grad_norm": 1.7556147486850162, "learning_rate": 0.0001, "loss": 1.6474, "step": 1200 }, { "ce_loss": 1.6442325162887572, "epoch": 0.36, "inp_emb_norm": 0.28015625, "loss": 1.6442325162887572, "masked_top1": 38.984904556274415, "masked_top5": 60.31360260009765, "step": 1200, "top1": 79.19950073242188, "top5": 91.51022766113282 }, { "epoch": 0.38, "grad_norm": 2.0171772312366834, "learning_rate": 0.0001, "loss": 1.6388, "step": 1250 }, { "ce_loss": 1.5983400893211366, "epoch": 0.38, "inp_emb_norm": 0.2769921875, "loss": 1.5983400893211366, "masked_top1": 39.851169929504394, "masked_top5": 60.379223403930666, "step": 1250, "top1": 79.51817199707031, "top5": 91.8211392211914 }, { "epoch": 0.39, "grad_norm": 1.6538008290605235, "learning_rate": 0.0001, "loss": 1.6453, "step": 1300 }, { "ce_loss": 1.6439322781562806, "epoch": 0.39, "inp_emb_norm": 0.27740234375, "loss": 1.6439322781562806, "masked_top1": 38.07537261962891, "masked_top5": 59.28539070129395, "step": 1300, "top1": 79.08166000366211, "top5": 91.56427230834962 }, { "epoch": 0.41, "grad_norm": 1.6552303659425454, "learning_rate": 0.0001, "loss": 1.6314, "step": 1350 }, { "ce_loss": 1.6322034692764282, "epoch": 0.41, "inp_emb_norm": 0.2801953125, "loss": 1.6322034692764282, "masked_top1": 38.92374729156494, "masked_top5": 60.256018524169924, "step": 1350, "top1": 79.35121063232423, "top5": 91.62414169311523 }, { "epoch": 0.42, "grad_norm": 1.4892198882479661, "learning_rate": 0.0001, "loss": 1.6421, "step": 1400 }, { "ce_loss": 1.6463946628570556, "epoch": 0.42, "inp_emb_norm": 0.27763671875, "loss": 1.6463946628570556, "masked_top1": 37.44570774078369, "masked_top5": 59.385975952148435, "step": 1400, "top1": 79.01243072509766, "top5": 91.58177993774414 }, { "epoch": 0.44, "grad_norm": 1.5892663240327325, "learning_rate": 0.0001, "loss": 1.6319, "step": 1450 }, { "ce_loss": 1.622364718914032, "epoch": 0.44, "inp_emb_norm": 0.2815625, "loss": 1.622364718914032, "masked_top1": 37.010132484436035, "masked_top5": 58.88730949401855, "step": 1450, "top1": 79.29409576416016, "top5": 91.61080490112305 }, { "epoch": 0.45, "grad_norm": 1.5298220602979866, "learning_rate": 0.0001, "loss": 1.6288, "step": 1500 }, { "ce_loss": 1.672783522605896, "epoch": 0.45, "inp_emb_norm": 0.2782421875, "loss": 1.672783522605896, "masked_top1": 37.74873733520508, "masked_top5": 59.34834083557129, "step": 1500, "top1": 78.78699188232422, "top5": 91.44515747070312 }, { "epoch": 0.47, "grad_norm": 1.6154188043026179, "learning_rate": 0.0001, "loss": 1.6151, "step": 1550 }, { "ce_loss": 1.5997060227394104, "epoch": 0.47, "inp_emb_norm": 0.280390625, "loss": 1.5997060227394104, "masked_top1": 39.32766487121582, "masked_top5": 60.36418632507324, "step": 1550, "top1": 79.40666061401367, "top5": 91.93371612548827 }, { "epoch": 0.48, "grad_norm": 1.3771816353205768, "learning_rate": 0.0001, "loss": 1.6084, "step": 1600 }, { "ce_loss": 1.5898131847381591, "epoch": 0.48, "inp_emb_norm": 0.278984375, "loss": 1.5898131847381591, "masked_top1": 38.975818939208985, "masked_top5": 60.515014724731444, "step": 1600, "top1": 79.69720977783203, "top5": 91.98071060180663 }, { "epoch": 0.5, "grad_norm": 1.452663822317599, "learning_rate": 0.0001, "loss": 1.6124, "step": 1650 }, { "ce_loss": 1.6319751167297363, "epoch": 0.5, "inp_emb_norm": 0.28296875, "loss": 1.6319751167297363, "masked_top1": 39.17121643066406, "masked_top5": 60.53929168701172, "step": 1650, "top1": 79.40355041503906, "top5": 91.61957992553711 }, { "epoch": 0.51, "grad_norm": 1.6649862877803743, "learning_rate": 0.0001, "loss": 1.6168, "step": 1700 }, { "ce_loss": 1.6018091797828675, "epoch": 0.51, "inp_emb_norm": 0.2794921875, "loss": 1.6018091797828675, "masked_top1": 38.38370750427246, "masked_top5": 60.53903656005859, "step": 1700, "top1": 79.40913070678711, "top5": 91.92860076904297 }, { "epoch": 0.53, "grad_norm": 1.900108786790241, "learning_rate": 0.0001, "loss": 1.591, "step": 1750 }, { "ce_loss": 1.6045577454566955, "epoch": 0.53, "inp_emb_norm": 0.2825, "loss": 1.6045577454566955, "masked_top1": 38.33882637023926, "masked_top5": 60.6797013092041, "step": 1750, "top1": 79.38134246826172, "top5": 91.90715942382812 }, { "epoch": 0.54, "grad_norm": 1.669989934133695, "learning_rate": 0.0001, "loss": 1.5798, "step": 1800 }, { "ce_loss": 1.575341019630432, "epoch": 0.54, "inp_emb_norm": 0.2808984375, "loss": 1.575341019630432, "masked_top1": 39.37454727172852, "masked_top5": 60.28605743408203, "step": 1800, "top1": 79.7763461303711, "top5": 91.9139274597168 }, { "epoch": 0.56, "grad_norm": 1.4991127060714822, "learning_rate": 0.0001, "loss": 1.5848, "step": 1850 }, { "ce_loss": 1.5776564478874207, "epoch": 0.56, "inp_emb_norm": 0.2787890625, "loss": 1.5776564478874207, "masked_top1": 38.988163757324216, "masked_top5": 59.69200141906738, "step": 1850, "top1": 79.65292083740235, "top5": 91.98164306640625 }, { "epoch": 0.57, "grad_norm": 1.7116902678486716, "learning_rate": 0.0001, "loss": 1.5799, "step": 1900 }, { "ce_loss": 1.574492063522339, "epoch": 0.57, "inp_emb_norm": 0.282421875, "loss": 1.574492063522339, "masked_top1": 40.138608627319336, "masked_top5": 61.31516136169434, "step": 1900, "top1": 79.69049621582032, "top5": 92.10322280883788 }, { "epoch": 0.59, "grad_norm": 1.3150771431721024, "learning_rate": 0.0001, "loss": 1.5593, "step": 1950 }, { "ce_loss": 1.5305517101287842, "epoch": 0.59, "inp_emb_norm": 0.2823046875, "loss": 1.5305517101287842, "masked_top1": 40.4295878982544, "masked_top5": 61.966970901489255, "step": 1950, "top1": 79.98332046508789, "top5": 92.32930023193359 }, { "epoch": 0.6, "grad_norm": 6.707754405699673, "learning_rate": 0.0001, "loss": 1.5863, "step": 2000 }, { "ce_loss": 1.5903752088546752, "epoch": 0.6, "inp_emb_norm": 0.2828515625, "loss": 1.5903752088546752, "masked_top1": 39.76962215423584, "masked_top5": 61.7191081237793, "step": 2000, "top1": 79.53595993041992, "top5": 92.0239190673828 }, { "epoch": 0.62, "grad_norm": 1.452524089856983, "learning_rate": 0.0001, "loss": 1.5773, "step": 2050 }, { "ce_loss": 1.5838685631752014, "epoch": 0.62, "inp_emb_norm": 0.2877734375, "loss": 1.5838685631752014, "masked_top1": 38.40034454345703, "masked_top5": 60.153098831176756, "step": 2050, "top1": 79.58445373535156, "top5": 91.9414045715332 }, { "epoch": 0.63, "grad_norm": 1.5517366506652683, "learning_rate": 0.0001, "loss": 1.5928, "step": 2100 }, { "ce_loss": 1.6009895205497742, "epoch": 0.63, "inp_emb_norm": 0.282578125, "loss": 1.6009895205497742, "masked_top1": 38.17198055267334, "masked_top5": 59.90890487670899, "step": 2100, "top1": 79.57786758422851, "top5": 91.74088973999024 }, { "epoch": 0.65, "grad_norm": 1.4696024906496812, "learning_rate": 0.0001, "loss": 1.5785, "step": 2150 }, { "ce_loss": 1.585225760936737, "epoch": 0.65, "inp_emb_norm": 0.28974609375, "loss": 1.585225760936737, "masked_top1": 39.15816062927246, "masked_top5": 61.218558883666994, "step": 2150, "top1": 79.77144348144532, "top5": 91.92766723632812 }, { "epoch": 0.66, "grad_norm": 1.4286648572662997, "learning_rate": 0.0001, "loss": 1.5676, "step": 2200 }, { "ce_loss": 1.5558442735671998, "epoch": 0.66, "inp_emb_norm": 0.2856640625, "loss": 1.5558442735671998, "masked_top1": 39.56588050842285, "masked_top5": 61.39894416809082, "step": 2200, "top1": 79.9505500793457, "top5": 92.23053100585938 }, { "epoch": 0.68, "grad_norm": 1.3408463458478854, "learning_rate": 0.0001, "loss": 1.5536, "step": 2250 }, { "ce_loss": 1.5132439875602721, "epoch": 0.68, "inp_emb_norm": 0.289609375, "loss": 1.5132439875602721, "masked_top1": 39.436230506896976, "masked_top5": 61.361060333251956, "step": 2250, "top1": 80.31327835083007, "top5": 92.35198455810547 }, { "epoch": 0.69, "grad_norm": 1.4789295388492802, "learning_rate": 0.0001, "loss": 1.5519, "step": 2300 }, { "ce_loss": 1.52454154253006, "epoch": 0.69, "inp_emb_norm": 0.2912109375, "loss": 1.52454154253006, "masked_top1": 39.47452976226807, "masked_top5": 61.6866958618164, "step": 2300, "top1": 80.14813919067383, "top5": 92.37121780395508 }, { "epoch": 0.71, "grad_norm": 1.3877519817560113, "learning_rate": 0.0001, "loss": 1.5542, "step": 2350 }, { "ce_loss": 1.5439094185829163, "epoch": 0.71, "inp_emb_norm": 0.29296875, "loss": 1.5439094185829163, "masked_top1": 38.476121711730954, "masked_top5": 61.2806697845459, "step": 2350, "top1": 80.00807098388673, "top5": 92.20030227661132 }, { "epoch": 0.72, "grad_norm": 1.534164522196606, "learning_rate": 0.0001, "loss": 1.5517, "step": 2400 }, { "ce_loss": 1.545496084690094, "epoch": 0.72, "inp_emb_norm": 0.291328125, "loss": 1.545496084690094, "masked_top1": 38.46687156677246, "masked_top5": 61.44874328613281, "step": 2400, "top1": 79.93062316894532, "top5": 92.19858032226563 }, { "epoch": 0.74, "grad_norm": 1.4019229482522402, "learning_rate": 0.0001, "loss": 1.5453, "step": 2450 }, { "ce_loss": 1.5521028304100037, "epoch": 0.74, "inp_emb_norm": 0.288125, "loss": 1.5521028304100037, "masked_top1": 40.88698040008545, "masked_top5": 63.060478515625, "step": 2450, "top1": 79.97427581787109, "top5": 92.30514190673829 }, { "epoch": 0.75, "grad_norm": 1.3288444515563334, "learning_rate": 0.0001, "loss": 1.5446, "step": 2500 }, { "ce_loss": 1.5541789364814758, "epoch": 0.75, "inp_emb_norm": 0.2872265625, "loss": 1.5541789364814758, "masked_top1": 40.732691040039064, "masked_top5": 62.54548332214355, "step": 2500, "top1": 79.8016665649414, "top5": 92.2079689025879 }, { "epoch": 0.77, "grad_norm": 1.2949056410682664, "learning_rate": 0.0001, "loss": 1.523, "step": 2550 }, { "ce_loss": 1.524161262512207, "epoch": 0.77, "inp_emb_norm": 0.2938671875, "loss": 1.524161262512207, "masked_top1": 39.2896460723877, "masked_top5": 61.28740455627442, "step": 2550, "top1": 80.2130502319336, "top5": 92.26818771362305 }, { "epoch": 0.78, "grad_norm": 1.376548356881607, "learning_rate": 0.0001, "loss": 1.5247, "step": 2600 }, { "ce_loss": 1.5120180606842042, "epoch": 0.78, "inp_emb_norm": 0.2912109375, "loss": 1.5120180606842042, "masked_top1": 40.32475761413574, "masked_top5": 62.50459335327148, "step": 2600, "top1": 80.25805755615234, "top5": 92.3787159729004 }, { "epoch": 0.8, "grad_norm": 1.446462800296954, "learning_rate": 0.0001, "loss": 1.5244, "step": 2650 }, { "ce_loss": 1.4864131617546081, "epoch": 0.8, "inp_emb_norm": 0.2887890625, "loss": 1.4864131617546081, "masked_top1": 41.43366180419922, "masked_top5": 63.38628890991211, "step": 2650, "top1": 80.42174575805664, "top5": 92.73545196533203 }, { "epoch": 0.81, "grad_norm": 1.5930817573922351, "learning_rate": 0.0001, "loss": 1.5321, "step": 2700 }, { "ce_loss": 1.5146442604064942, "epoch": 0.81, "inp_emb_norm": 0.29212890625, "loss": 1.5146442604064942, "masked_top1": 40.67323059082031, "masked_top5": 63.30255233764648, "step": 2700, "top1": 80.21553787231446, "top5": 92.49449493408203 }, { "epoch": 0.83, "grad_norm": 1.3154781061914393, "learning_rate": 0.0001, "loss": 1.5258, "step": 2750 }, { "ce_loss": 1.5432595777511597, "epoch": 0.83, "inp_emb_norm": 0.2902734375, "loss": 1.5432595777511597, "masked_top1": 40.71655921936035, "masked_top5": 63.31653228759765, "step": 2750, "top1": 79.94630706787109, "top5": 92.41331008911133 }, { "epoch": 0.84, "grad_norm": 1.4988480825688437, "learning_rate": 0.0001, "loss": 1.529, "step": 2800 }, { "ce_loss": 1.5197454857826234, "epoch": 0.84, "inp_emb_norm": 0.295234375, "loss": 1.5197454857826234, "masked_top1": 40.75451274871826, "masked_top5": 63.20060585021972, "step": 2800, "top1": 80.25330627441406, "top5": 92.41782470703124 }, { "epoch": 0.86, "grad_norm": 1.3953163621955647, "learning_rate": 0.0001, "loss": 1.5201, "step": 2850 }, { "ce_loss": 1.5061662626266479, "epoch": 0.86, "inp_emb_norm": 0.294375, "loss": 1.5061662626266479, "masked_top1": 41.16034164428711, "masked_top5": 63.11306884765625, "step": 2850, "top1": 80.41659545898438, "top5": 92.50812393188477 }, { "epoch": 0.87, "grad_norm": 1.3361207443145617, "learning_rate": 0.0001, "loss": 1.5185, "step": 2900 }, { "ce_loss": 1.48851407289505, "epoch": 0.87, "inp_emb_norm": 0.29296875, "loss": 1.48851407289505, "masked_top1": 41.136603927612306, "masked_top5": 63.02009063720703, "step": 2900, "top1": 80.65627792358399, "top5": 92.53072723388672 }, { "epoch": 0.89, "grad_norm": 1.2528196866924233, "learning_rate": 0.0001, "loss": 1.5154, "step": 2950 }, { "ce_loss": 1.511202063560486, "epoch": 0.89, "inp_emb_norm": 0.298671875, "loss": 1.511202063560486, "masked_top1": 39.16858932495117, "masked_top5": 61.53038063049316, "step": 2950, "top1": 80.2845379638672, "top5": 92.39456756591797 }, { "epoch": 0.9, "grad_norm": 1.4560813011403748, "learning_rate": 0.0001, "loss": 1.5365, "step": 3000 }, { "ce_loss": 1.5491148686408998, "epoch": 0.9, "inp_emb_norm": 0.297578125, "loss": 1.5491148686408998, "masked_top1": 40.18224685668945, "masked_top5": 62.69388824462891, "step": 3000, "top1": 79.84650588989258, "top5": 92.17584533691407 }, { "epoch": 0.92, "grad_norm": 1.2474903057292217, "learning_rate": 0.0001, "loss": 1.5079, "step": 3050 }, { "ce_loss": 1.510890085697174, "epoch": 0.92, "inp_emb_norm": 0.289453125, "loss": 1.510890085697174, "masked_top1": 41.284685897827146, "masked_top5": 63.54399559020996, "step": 3050, "top1": 80.19790802001953, "top5": 92.62230575561523 }, { "epoch": 0.93, "grad_norm": 1.4413477522514346, "learning_rate": 0.0001, "loss": 1.5008, "step": 3100 }, { "ce_loss": 1.505354859828949, "epoch": 0.93, "inp_emb_norm": 0.294140625, "loss": 1.505354859828949, "masked_top1": 40.861399726867674, "masked_top5": 62.37717430114746, "step": 3100, "top1": 80.46924255371094, "top5": 92.5052066040039 }, { "epoch": 0.95, "grad_norm": 1.359782937496998, "learning_rate": 0.0001, "loss": 1.4983, "step": 3150 }, { "ce_loss": 1.5110413646697998, "epoch": 0.95, "inp_emb_norm": 0.2980859375, "loss": 1.5110413646697998, "masked_top1": 42.160480155944825, "masked_top5": 64.15993942260742, "step": 3150, "top1": 80.45739974975587, "top5": 92.550869140625 }, { "epoch": 0.96, "grad_norm": 1.3582356861036782, "learning_rate": 0.0001, "loss": 1.5034, "step": 3200 }, { "ce_loss": 1.530618577003479, "epoch": 0.96, "inp_emb_norm": 0.29375, "loss": 1.530618577003479, "masked_top1": 38.9424352645874, "masked_top5": 62.109956970214846, "step": 3200, "top1": 80.0709977722168, "top5": 92.2861149597168 }, { "epoch": 0.98, "grad_norm": 2.5802552823497455, "learning_rate": 0.0001, "loss": 1.4946, "step": 3250 }, { "ce_loss": 1.477068486213684, "epoch": 0.98, "inp_emb_norm": 0.2976953125, "loss": 1.477068486213684, "masked_top1": 40.51057823181152, "masked_top5": 63.397703323364254, "step": 3250, "top1": 80.50700180053711, "top5": 92.68867340087891 }, { "epoch": 0.99, "grad_norm": 1.4547325164409564, "learning_rate": 0.0001, "loss": 1.5084, "step": 3300 }, { "ce_loss": 1.4906554532051086, "epoch": 0.99, "inp_emb_norm": 0.2973828125, "loss": 1.4906554532051086, "masked_top1": 40.54425048828125, "masked_top5": 63.201170196533205, "step": 3300, "top1": 80.40962783813477, "top5": 92.66117584228516 }, { "epoch": 1.01, "grad_norm": 1.211911337284348, "learning_rate": 0.0001, "loss": 1.3955, "step": 3350 }, { "ce_loss": 1.400897753238678, "epoch": 1.01, "inp_emb_norm": 0.3021875, "loss": 1.400897753238678, "masked_top1": 41.17978542327881, "masked_top5": 63.33616775512695, "step": 3350, "top1": 81.19734100341798, "top5": 93.10294403076172 }, { "epoch": 1.02, "grad_norm": 1.0903282402796672, "learning_rate": 0.0001, "loss": 1.2893, "step": 3400 }, { "ce_loss": 1.2788537430763245, "epoch": 1.02, "inp_emb_norm": 0.2989453125, "loss": 1.2788537430763245, "masked_top1": 40.09818046569824, "masked_top5": 62.5667342376709, "step": 3400, "top1": 82.30339096069336, "top5": 93.88133010864257 }, { "epoch": 1.04, "grad_norm": 1.5240484766936855, "learning_rate": 0.0001, "loss": 1.2964, "step": 3450 }, { "ce_loss": 1.303318452835083, "epoch": 1.04, "inp_emb_norm": 0.29703125, "loss": 1.303318452835083, "masked_top1": 42.162941246032716, "masked_top5": 65.02184768676757, "step": 3450, "top1": 82.02912399291992, "top5": 93.88583358764649 }, { "epoch": 1.05, "grad_norm": 1.3787785745538395, "learning_rate": 0.0001, "loss": 1.2852, "step": 3500 }, { "ce_loss": 1.276361472606659, "epoch": 1.05, "inp_emb_norm": 0.3021875, "loss": 1.276361472606659, "masked_top1": 41.940252990722655, "masked_top5": 65.38093276977538, "step": 3500, "top1": 82.44789840698242, "top5": 93.89565292358398 }, { "epoch": 1.07, "grad_norm": 1.3842728564490252, "learning_rate": 0.0001, "loss": 1.2978, "step": 3550 }, { "ce_loss": 1.3330717968940735, "epoch": 1.07, "inp_emb_norm": 0.29859375, "loss": 1.3330717968940735, "masked_top1": 41.01281318664551, "masked_top5": 63.77486457824707, "step": 3550, "top1": 81.81988937377929, "top5": 93.57796142578125 }, { "epoch": 1.08, "grad_norm": 1.2631108386132899, "learning_rate": 0.0001, "loss": 1.2931, "step": 3600 }, { "ce_loss": 1.2983617568016053, "epoch": 1.08, "inp_emb_norm": 0.2962109375, "loss": 1.2983617568016053, "masked_top1": 41.40194427490234, "masked_top5": 64.76402061462403, "step": 3600, "top1": 82.0794775390625, "top5": 93.74165817260742 }, { "epoch": 1.1, "grad_norm": 1.4399254065091216, "learning_rate": 0.0001, "loss": 1.3091, "step": 3650 }, { "ce_loss": 1.3062031197547912, "epoch": 1.1, "inp_emb_norm": 0.2998046875, "loss": 1.3062031197547912, "masked_top1": 40.85140724182129, "masked_top5": 63.72927604675293, "step": 3650, "top1": 82.13722595214844, "top5": 93.64862121582031 }, { "epoch": 1.11, "grad_norm": 1.2311319185042409, "learning_rate": 0.0001, "loss": 1.3088, "step": 3700 }, { "ce_loss": 1.2959584522247314, "epoch": 1.11, "inp_emb_norm": 0.304296875, "loss": 1.2959584522247314, "masked_top1": 41.39893436431885, "masked_top5": 63.62430404663086, "step": 3700, "top1": 82.33803344726563, "top5": 93.6450895690918 }, { "epoch": 1.13, "grad_norm": 1.1358156745617467, "learning_rate": 0.0001, "loss": 1.3131, "step": 3750 }, { "ce_loss": 1.29275639295578, "epoch": 1.13, "inp_emb_norm": 0.3049609375, "loss": 1.29275639295578, "masked_top1": 42.64406570434571, "masked_top5": 65.42192581176758, "step": 3750, "top1": 82.30059600830079, "top5": 93.80231704711915 }, { "epoch": 1.14, "grad_norm": 1.1800593283969816, "learning_rate": 0.0001, "loss": 1.2969, "step": 3800 }, { "ce_loss": 1.301748011112213, "epoch": 1.14, "inp_emb_norm": 0.2975390625, "loss": 1.301748011112213, "masked_top1": 41.66663146972656, "masked_top5": 64.09541954040527, "step": 3800, "top1": 82.05701431274414, "top5": 93.72319717407227 }, { "epoch": 1.16, "grad_norm": 1.850155610052983, "learning_rate": 0.0001, "loss": 1.3153, "step": 3850 }, { "ce_loss": 1.2972828006744386, "epoch": 1.16, "inp_emb_norm": 0.3021875, "loss": 1.2972828006744386, "masked_top1": 40.78549217224121, "masked_top5": 64.16069633483886, "step": 3850, "top1": 82.19980209350587, "top5": 93.7464599609375 }, { "epoch": 1.17, "grad_norm": 1.2715739238746637, "learning_rate": 0.0001, "loss": 1.2995, "step": 3900 }, { "ce_loss": 1.3045063495635987, "epoch": 1.17, "inp_emb_norm": 0.3047265625, "loss": 1.3045063495635987, "masked_top1": 40.932076416015626, "masked_top5": 64.15639678955078, "step": 3900, "top1": 82.09319473266602, "top5": 93.77517272949218 }, { "epoch": 1.19, "grad_norm": 1.3796703227080724, "learning_rate": 0.0001, "loss": 1.3071, "step": 3950 }, { "ce_loss": 1.3098111128807068, "epoch": 1.19, "inp_emb_norm": 0.297109375, "loss": 1.3098111128807068, "masked_top1": 41.91473709106445, "masked_top5": 64.43979515075684, "step": 3950, "top1": 81.8616471862793, "top5": 93.78232650756836 }, { "epoch": 1.2, "grad_norm": 1.1546364550490216, "learning_rate": 0.0001, "loss": 1.3098, "step": 4000 }, { "ce_loss": 1.3013187646865845, "epoch": 1.2, "inp_emb_norm": 0.30296875, "loss": 1.3013187646865845, "masked_top1": 41.120027198791504, "masked_top5": 63.108745346069334, "step": 4000, "top1": 82.13422164916992, "top5": 93.6265623474121 }, { "epoch": 1.22, "grad_norm": 1.2515420301071278, "learning_rate": 0.0001, "loss": 1.3027, "step": 4050 }, { "ce_loss": 1.2935965037345887, "epoch": 1.22, "inp_emb_norm": 0.3031640625, "loss": 1.2935965037345887, "masked_top1": 40.63400650024414, "masked_top5": 63.75268486022949, "step": 4050, "top1": 82.2015396118164, "top5": 93.79516159057617 }, { "epoch": 1.23, "grad_norm": 1.1524816264565314, "learning_rate": 0.0001, "loss": 1.3, "step": 4100 }, { "ce_loss": 1.3123349785804748, "epoch": 1.23, "inp_emb_norm": 0.3012109375, "loss": 1.3123349785804748, "masked_top1": 42.19605297088623, "masked_top5": 65.04543876647949, "step": 4100, "top1": 81.9914030456543, "top5": 93.81402328491211 }, { "epoch": 1.25, "grad_norm": 1.378666189570675, "learning_rate": 0.0001, "loss": 1.3045, "step": 4150 }, { "ce_loss": 1.2990161776542664, "epoch": 1.25, "inp_emb_norm": 0.305859375, "loss": 1.2990161776542664, "masked_top1": 41.78075637817383, "masked_top5": 64.68202613830566, "step": 4150, "top1": 82.17454803466796, "top5": 93.76477783203126 }, { "epoch": 1.26, "grad_norm": 1.3993699556629227, "learning_rate": 0.0001, "loss": 1.3014, "step": 4200 }, { "ce_loss": 1.2723517334461212, "epoch": 1.26, "inp_emb_norm": 0.301796875, "loss": 1.2723517334461212, "masked_top1": 43.205936431884766, "masked_top5": 65.13977348327637, "step": 4200, "top1": 82.52856887817383, "top5": 93.89131042480469 }, { "epoch": 1.28, "grad_norm": 1.3745855928185973, "learning_rate": 0.0001, "loss": 1.3151, "step": 4250 }, { "ce_loss": 1.3126947474479675, "epoch": 1.28, "inp_emb_norm": 0.3046484375, "loss": 1.3126947474479675, "masked_top1": 40.87171413421631, "masked_top5": 63.66306259155274, "step": 4250, "top1": 82.01153930664063, "top5": 93.64672576904297 }, { "epoch": 1.29, "grad_norm": 1.479893407422574, "learning_rate": 0.0001, "loss": 1.3213, "step": 4300 }, { "ce_loss": 1.3277541399002075, "epoch": 1.29, "inp_emb_norm": 0.302421875, "loss": 1.3277541399002075, "masked_top1": 41.169107818603514, "masked_top5": 64.09851593017578, "step": 4300, "top1": 81.798203125, "top5": 93.67467819213867 }, { "epoch": 1.31, "grad_norm": 1.4172322167184916, "learning_rate": 0.0001, "loss": 1.3112, "step": 4350 }, { "ce_loss": 1.3031212973594666, "epoch": 1.31, "inp_emb_norm": 0.306484375, "loss": 1.3031212973594666, "masked_top1": 41.52652729034424, "masked_top5": 64.15341407775878, "step": 4350, "top1": 82.03983184814453, "top5": 93.67420471191406 }, { "epoch": 1.32, "grad_norm": 1.2760472814321302, "learning_rate": 0.0001, "loss": 1.3131, "step": 4400 }, { "ce_loss": 1.308469491004944, "epoch": 1.32, "inp_emb_norm": 0.3057421875, "loss": 1.308469491004944, "masked_top1": 41.37114768981934, "masked_top5": 63.93113624572754, "step": 4400, "top1": 82.07672714233398, "top5": 93.6951513671875 }, { "epoch": 1.34, "grad_norm": 1.2334433655289787, "learning_rate": 0.0001, "loss": 1.3036, "step": 4450 }, { "ce_loss": 1.3185024070739746, "epoch": 1.34, "inp_emb_norm": 0.30171875, "loss": 1.3185024070739746, "masked_top1": 42.539853439331054, "masked_top5": 65.19357299804688, "step": 4450, "top1": 82.02122940063477, "top5": 93.74457550048828 }, { "epoch": 1.35, "grad_norm": 1.2156796466751323, "learning_rate": 0.0001, "loss": 1.3109, "step": 4500 }, { "ce_loss": 1.2963746500015259, "epoch": 1.35, "inp_emb_norm": 0.30984375, "loss": 1.2963746500015259, "masked_top1": 42.78518562316894, "masked_top5": 65.02521774291992, "step": 4500, "top1": 82.2968830871582, "top5": 93.7428224182129 }, { "epoch": 1.37, "grad_norm": 1.311745495017629, "learning_rate": 0.0001, "loss": 1.3115, "step": 4550 }, { "ce_loss": 1.2924715709686279, "epoch": 1.37, "inp_emb_norm": 0.303828125, "loss": 1.2924715709686279, "masked_top1": 42.53660099029541, "masked_top5": 65.59047889709473, "step": 4550, "top1": 82.1043424987793, "top5": 93.91824569702149 }, { "epoch": 1.38, "grad_norm": 1.2174333958038526, "learning_rate": 0.0001, "loss": 1.3201, "step": 4600 }, { "ce_loss": 1.3174702334403992, "epoch": 1.38, "inp_emb_norm": 0.30796875, "loss": 1.3174702334403992, "masked_top1": 42.07423233032227, "masked_top5": 64.41791015625, "step": 4600, "top1": 81.94120040893554, "top5": 93.70007461547851 }, { "epoch": 1.4, "grad_norm": 1.1701397235812094, "learning_rate": 0.0001, "loss": 1.3085, "step": 4650 }, { "ce_loss": 1.3130900907516478, "epoch": 1.4, "inp_emb_norm": 0.303203125, "loss": 1.3130900907516478, "masked_top1": 41.166784133911136, "masked_top5": 64.41526512145997, "step": 4650, "top1": 81.96283096313476, "top5": 93.76418426513672 }, { "epoch": 1.41, "grad_norm": 1.1778250928748137, "learning_rate": 0.0001, "loss": 1.3236, "step": 4700 }, { "ce_loss": 1.3132509183883667, "epoch": 1.41, "inp_emb_norm": 0.3076171875, "loss": 1.3132509183883667, "masked_top1": 41.547097778320314, "masked_top5": 63.82028350830078, "step": 4700, "top1": 82.07340423583985, "top5": 93.64304718017578 }, { "epoch": 1.43, "grad_norm": 1.3130038776611517, "learning_rate": 0.0001, "loss": 1.2884, "step": 4750 }, { "ce_loss": 1.2915071487426757, "epoch": 1.43, "inp_emb_norm": 0.30640625, "loss": 1.2915071487426757, "masked_top1": 41.55844184875488, "masked_top5": 64.31948181152343, "step": 4750, "top1": 82.19200744628907, "top5": 93.80693008422851 }, { "epoch": 1.44, "grad_norm": 1.3523241369731542, "learning_rate": 0.0001, "loss": 1.3229, "step": 4800 }, { "ce_loss": 1.3281687498092651, "epoch": 1.44, "inp_emb_norm": 0.3083203125, "loss": 1.3281687498092651, "masked_top1": 41.508040008544924, "masked_top5": 63.831429901123045, "step": 4800, "top1": 81.83387313842773, "top5": 93.61951248168945 }, { "epoch": 1.46, "grad_norm": 1.2229550505786297, "learning_rate": 0.0001, "loss": 1.313, "step": 4850 }, { "ce_loss": 1.321110601425171, "epoch": 1.46, "inp_emb_norm": 0.30484375, "loss": 1.321110601425171, "masked_top1": 42.07623374938965, "masked_top5": 64.01099380493164, "step": 4850, "top1": 82.04196990966797, "top5": 93.63945693969727 }, { "epoch": 1.47, "grad_norm": 1.291149342311876, "learning_rate": 0.0001, "loss": 1.3056, "step": 4900 }, { "ce_loss": 1.306683280467987, "epoch": 1.47, "inp_emb_norm": 0.3062890625, "loss": 1.306683280467987, "masked_top1": 42.63887153625488, "masked_top5": 65.17818214416504, "step": 4900, "top1": 82.0475244140625, "top5": 93.72082290649413 }, { "epoch": 1.49, "grad_norm": 1.114074399282948, "learning_rate": 0.0001, "loss": 1.2953, "step": 4950 }, { "ce_loss": 1.2916775250434875, "epoch": 1.49, "inp_emb_norm": 0.3067578125, "loss": 1.2916775250434875, "masked_top1": 43.01071895599365, "masked_top5": 64.40318168640137, "step": 4950, "top1": 82.25905914306641, "top5": 93.81284698486328 }, { "epoch": 1.5, "grad_norm": 1.1428006346267754, "learning_rate": 0.0001, "loss": 1.3132, "step": 5000 }, { "ce_loss": 1.3006132817268372, "epoch": 1.5, "inp_emb_norm": 0.305546875, "loss": 1.3006132817268372, "masked_top1": 43.18012409210205, "masked_top5": 65.12184883117676, "step": 5000, "top1": 82.29021392822266, "top5": 93.76517501831054 }, { "epoch": 1.52, "grad_norm": 1.1188850908108916, "learning_rate": 0.0001, "loss": 1.3097, "step": 5050 }, { "ce_loss": 1.3087879872322083, "epoch": 1.52, "inp_emb_norm": 0.3089453125, "loss": 1.3087879872322083, "masked_top1": 43.01914245605469, "masked_top5": 65.29854652404785, "step": 5050, "top1": 82.07856109619141, "top5": 93.74372894287109 }, { "epoch": 1.53, "grad_norm": 1.2100791577553864, "learning_rate": 0.0001, "loss": 1.313, "step": 5100 }, { "ce_loss": 1.3196745228767395, "epoch": 1.53, "inp_emb_norm": 0.30875, "loss": 1.3196745228767395, "masked_top1": 42.13117530822754, "masked_top5": 65.02122192382812, "step": 5100, "top1": 81.9853271484375, "top5": 93.69662155151367 }, { "epoch": 1.55, "grad_norm": 1.2111638230324686, "learning_rate": 0.0001, "loss": 1.3151, "step": 5150 }, { "ce_loss": 1.3195432901382447, "epoch": 1.55, "inp_emb_norm": 0.306171875, "loss": 1.3195432901382447, "masked_top1": 42.28558715820313, "masked_top5": 65.25620643615723, "step": 5150, "top1": 81.9179295349121, "top5": 93.7562042236328 }, { "epoch": 1.56, "grad_norm": 1.2354610192482536, "learning_rate": 0.0001, "loss": 1.3048, "step": 5200 }, { "ce_loss": 1.3142172384262085, "epoch": 1.56, "inp_emb_norm": 0.3107421875, "loss": 1.3142172384262085, "masked_top1": 41.06179321289063, "masked_top5": 64.15951232910156, "step": 5200, "top1": 81.83832885742187, "top5": 93.69693267822265 }, { "epoch": 1.58, "grad_norm": 1.2603393329014463, "learning_rate": 0.0001, "loss": 1.3077, "step": 5250 }, { "ce_loss": 1.2882971096038818, "epoch": 1.58, "inp_emb_norm": 0.3098046875, "loss": 1.2882971096038818, "masked_top1": 42.656227684021, "masked_top5": 65.82235458374024, "step": 5250, "top1": 82.23510848999024, "top5": 93.8615916442871 }, { "epoch": 1.59, "grad_norm": 1.1492020065892152, "learning_rate": 0.0001, "loss": 1.3037, "step": 5300 }, { "ce_loss": 1.312100157737732, "epoch": 1.59, "inp_emb_norm": 0.3079296875, "loss": 1.312100157737732, "masked_top1": 42.46005714416504, "masked_top5": 65.23145439147949, "step": 5300, "top1": 81.96127456665039, "top5": 93.77029266357422 }, { "epoch": 1.61, "grad_norm": 1.1456518826496218, "learning_rate": 0.0001, "loss": 1.3066, "step": 5350 }, { "ce_loss": 1.3056522703170776, "epoch": 1.61, "inp_emb_norm": 0.3083203125, "loss": 1.3056522703170776, "masked_top1": 43.074807510375976, "masked_top5": 65.66509185791016, "step": 5350, "top1": 82.01446243286132, "top5": 93.83800003051758 }, { "epoch": 1.62, "grad_norm": 1.382906237118969, "learning_rate": 0.0001, "loss": 1.2847, "step": 5400 }, { "ce_loss": 1.2701800346374512, "epoch": 1.62, "inp_emb_norm": 0.3112109375, "loss": 1.2701800346374512, "masked_top1": 43.46398132324219, "masked_top5": 65.93353149414062, "step": 5400, "top1": 82.44269760131836, "top5": 94.06136520385742 }, { "epoch": 1.64, "grad_norm": 1.250578306210753, "learning_rate": 0.0001, "loss": 1.3039, "step": 5450 }, { "ce_loss": 1.3234626388549804, "epoch": 1.64, "inp_emb_norm": 0.3078515625, "loss": 1.3234626388549804, "masked_top1": 42.220664520263675, "masked_top5": 64.67328666687011, "step": 5450, "top1": 81.84058303833008, "top5": 93.60615112304687 }, { "epoch": 1.65, "grad_norm": 1.1603834612617525, "learning_rate": 0.0001, "loss": 1.3015, "step": 5500 }, { "ce_loss": 1.2987056183815002, "epoch": 1.65, "inp_emb_norm": 0.3158203125, "loss": 1.2987056183815002, "masked_top1": 42.56543678283691, "masked_top5": 65.94853828430176, "step": 5500, "top1": 82.10978652954101, "top5": 93.85393478393554 }, { "epoch": 1.67, "grad_norm": 1.1963477888896916, "learning_rate": 0.0001, "loss": 1.3118, "step": 5550 }, { "ce_loss": 1.306938099861145, "epoch": 1.67, "inp_emb_norm": 0.3089453125, "loss": 1.306938099861145, "masked_top1": 42.22477603912353, "masked_top5": 64.90145042419434, "step": 5550, "top1": 81.91244842529296, "top5": 93.82112915039062 }, { "epoch": 1.68, "grad_norm": 1.2265886595466722, "learning_rate": 0.0001, "loss": 1.2965, "step": 5600 }, { "ce_loss": 1.289056396484375, "epoch": 1.68, "inp_emb_norm": 0.3178515625, "loss": 1.289056396484375, "masked_top1": 41.55989253997803, "masked_top5": 64.25195945739746, "step": 5600, "top1": 82.382109375, "top5": 93.70501174926758 }, { "epoch": 1.7, "grad_norm": 1.131601169964127, "learning_rate": 0.0001, "loss": 1.2911, "step": 5650 }, { "ce_loss": 1.2798267722129821, "epoch": 1.7, "inp_emb_norm": 0.3168359375, "loss": 1.2798267722129821, "masked_top1": 42.326548461914065, "masked_top5": 65.07690460205077, "step": 5650, "top1": 82.33992919921874, "top5": 93.87046478271485 }, { "epoch": 1.71, "grad_norm": 1.1181947342973821, "learning_rate": 0.0001, "loss": 1.3145, "step": 5700 }, { "ce_loss": 1.31481609582901, "epoch": 1.71, "inp_emb_norm": 0.3159375, "loss": 1.31481609582901, "masked_top1": 42.50676147460938, "masked_top5": 65.8423821258545, "step": 5700, "top1": 82.06524475097656, "top5": 93.59629486083985 }, { "epoch": 1.73, "grad_norm": 1.1098764034001067, "learning_rate": 0.0001, "loss": 1.311, "step": 5750 }, { "ce_loss": 1.3209831523895263, "epoch": 1.73, "inp_emb_norm": 0.310390625, "loss": 1.3209831523895263, "masked_top1": 42.35572410583496, "masked_top5": 65.26163505554199, "step": 5750, "top1": 81.91262344360352, "top5": 93.68317199707032 }, { "epoch": 1.74, "grad_norm": 1.2409622007508851, "learning_rate": 0.0001, "loss": 1.3131, "step": 5800 }, { "ce_loss": 1.303259253501892, "epoch": 1.74, "inp_emb_norm": 0.311015625, "loss": 1.303259253501892, "masked_top1": 42.99045387268066, "masked_top5": 65.50553886413574, "step": 5800, "top1": 82.13337188720703, "top5": 93.76351516723633 }, { "epoch": 1.76, "grad_norm": 1.3030028695341598, "learning_rate": 0.0001, "loss": 1.3, "step": 5850 }, { "ce_loss": 1.290860595703125, "epoch": 1.76, "inp_emb_norm": 0.32109375, "loss": 1.290860595703125, "masked_top1": 42.378893280029295, "masked_top5": 65.07022903442383, "step": 5850, "top1": 82.35479309082031, "top5": 93.71083343505859 }, { "epoch": 1.77, "grad_norm": 1.2030263310548106, "learning_rate": 0.0001, "loss": 1.3148, "step": 5900 }, { "ce_loss": 1.3233295631408692, "epoch": 1.77, "inp_emb_norm": 0.3166015625, "loss": 1.3233295631408692, "masked_top1": 42.24487545013428, "masked_top5": 65.1656477355957, "step": 5900, "top1": 82.03902984619141, "top5": 93.61334854125977 }, { "epoch": 1.79, "grad_norm": 1.5803953753939939, "learning_rate": 0.0001, "loss": 1.3041, "step": 5950 }, { "ce_loss": 1.2845279669761658, "epoch": 1.79, "inp_emb_norm": 0.3218359375, "loss": 1.2845279669761658, "masked_top1": 42.71968803405762, "masked_top5": 65.51119926452637, "step": 5950, "top1": 82.2425944519043, "top5": 93.84527893066407 }, { "epoch": 1.8, "grad_norm": 1.1986406141845714, "learning_rate": 0.0001, "loss": 1.303, "step": 6000 }, { "ce_loss": 1.2704100012779236, "epoch": 1.8, "inp_emb_norm": 0.317734375, "loss": 1.2704100012779236, "masked_top1": 43.65462882995605, "masked_top5": 65.84256935119629, "step": 6000, "top1": 82.45276733398437, "top5": 93.91838714599609 }, { "epoch": 1.82, "grad_norm": 1.1276854151713849, "learning_rate": 0.0001, "loss": 1.3039, "step": 6050 }, { "ce_loss": 1.289050838947296, "epoch": 1.82, "inp_emb_norm": 0.3146875, "loss": 1.289050838947296, "masked_top1": 41.38638572692871, "masked_top5": 64.90666145324707, "step": 6050, "top1": 82.1436050415039, "top5": 93.8117251586914 }, { "epoch": 1.83, "grad_norm": 1.1636926009728485, "learning_rate": 0.0001, "loss": 1.3029, "step": 6100 }, { "ce_loss": 1.3035223054885865, "epoch": 1.83, "inp_emb_norm": 0.3187109375, "loss": 1.3035223054885865, "masked_top1": 42.37709861755371, "masked_top5": 65.33909133911133, "step": 6100, "top1": 82.11933166503906, "top5": 93.78050659179688 }, { "epoch": 1.85, "grad_norm": 1.0626794881896964, "learning_rate": 0.0001, "loss": 1.3138, "step": 6150 }, { "ce_loss": 1.317355580329895, "epoch": 1.85, "inp_emb_norm": 0.31890625, "loss": 1.317355580329895, "masked_top1": 41.86831642150879, "masked_top5": 64.59691291809082, "step": 6150, "top1": 81.94907608032227, "top5": 93.67014266967773 }, { "epoch": 1.86, "grad_norm": 1.1005541859171093, "learning_rate": 0.0001, "loss": 1.3041, "step": 6200 }, { "ce_loss": 1.3219546675682068, "epoch": 1.86, "inp_emb_norm": 0.3172265625, "loss": 1.3219546675682068, "masked_top1": 42.3734455871582, "masked_top5": 65.90324760437012, "step": 6200, "top1": 81.97321441650391, "top5": 93.62858337402344 }, { "epoch": 1.88, "grad_norm": 1.1522186435584791, "learning_rate": 0.0001, "loss": 1.2963, "step": 6250 }, { "ce_loss": 1.2990228199958802, "epoch": 1.88, "inp_emb_norm": 0.3144921875, "loss": 1.2990228199958802, "masked_top1": 42.138801422119144, "masked_top5": 65.49508239746093, "step": 6250, "top1": 82.0102586364746, "top5": 93.90199996948242 }, { "epoch": 1.89, "grad_norm": 1.2927059160480363, "learning_rate": 0.0001, "loss": 1.3017, "step": 6300 }, { "ce_loss": 1.300498881340027, "epoch": 1.89, "inp_emb_norm": 0.316640625, "loss": 1.300498881340027, "masked_top1": 42.56178199768066, "masked_top5": 65.6432763671875, "step": 6300, "top1": 82.12412170410157, "top5": 93.7817707824707 }, { "epoch": 1.91, "grad_norm": 1.168341408260434, "learning_rate": 0.0001, "loss": 1.3029, "step": 6350 }, { "ce_loss": 1.3083948111534118, "epoch": 1.91, "inp_emb_norm": 0.311328125, "loss": 1.3083948111534118, "masked_top1": 42.55179992675781, "masked_top5": 65.3431551361084, "step": 6350, "top1": 81.94397857666016, "top5": 93.8497378540039 }, { "epoch": 1.92, "grad_norm": 1.2323945648312147, "learning_rate": 0.0001, "loss": 1.3104, "step": 6400 }, { "ce_loss": 1.3191473126411437, "epoch": 1.92, "inp_emb_norm": 0.3194921875, "loss": 1.3191473126411437, "masked_top1": 41.97941291809082, "masked_top5": 65.11552169799805, "step": 6400, "top1": 81.95802749633789, "top5": 93.71688690185547 }, { "epoch": 1.94, "grad_norm": 1.0969474547631315, "learning_rate": 0.0001, "loss": 1.3033, "step": 6450 }, { "ce_loss": 1.2988893008232116, "epoch": 1.94, "inp_emb_norm": 0.3175, "loss": 1.2988893008232116, "masked_top1": 42.91837085723877, "masked_top5": 65.42203193664551, "step": 6450, "top1": 82.20126846313477, "top5": 93.793828125 }, { "epoch": 1.95, "grad_norm": 1.0855603866035775, "learning_rate": 0.0001, "loss": 1.3051, "step": 6500 }, { "ce_loss": 1.3110007953643799, "epoch": 1.95, "inp_emb_norm": 0.32046875, "loss": 1.3110007953643799, "masked_top1": 42.34694427490234, "masked_top5": 65.08651733398438, "step": 6500, "top1": 81.95626449584961, "top5": 93.71604049682617 }, { "epoch": 1.97, "grad_norm": 1.201042651000794, "learning_rate": 0.0001, "loss": 1.3041, "step": 6550 }, { "ce_loss": 1.3072884845733643, "epoch": 1.97, "inp_emb_norm": 0.316484375, "loss": 1.3072884845733643, "masked_top1": 42.01487628936768, "masked_top5": 65.12509132385254, "step": 6550, "top1": 81.91355926513671, "top5": 93.84131042480469 }, { "epoch": 1.98, "grad_norm": 1.2375042360574775, "learning_rate": 0.0001, "loss": 1.3, "step": 6600 }, { "ce_loss": 1.3039724278450011, "epoch": 1.98, "inp_emb_norm": 0.318515625, "loss": 1.3039724278450011, "masked_top1": 42.9154284286499, "masked_top5": 66.03591972351074, "step": 6600, "top1": 82.26277542114258, "top5": 93.73072570800781 }, { "epoch": 2.0, "grad_norm": 1.1513818080655263, "learning_rate": 0.0001, "loss": 1.3037, "step": 6650 }, { "ce_loss": 1.2931387114524842, "epoch": 2.0, "inp_emb_norm": 0.318046875, "loss": 1.2931387114524842, "masked_top1": 41.40272514343262, "masked_top5": 64.34992462158203, "step": 6650, "top1": 82.18499603271485, "top5": 93.81835754394531 }, { "epoch": 2.02, "grad_norm": 1.0510501665579786, "learning_rate": 0.0001, "loss": 1.0038, "step": 6700 }, { "ce_loss": 0.9978207111358642, "epoch": 2.02, "inp_emb_norm": 0.3219140625, "loss": 0.9978207111358642, "masked_top1": 45.336004638671874, "masked_top5": 69.27368041992187, "step": 6700, "top1": 85.59730255126954, "top5": 95.6523501586914 }, { "epoch": 2.03, "grad_norm": 1.119371103964124, "learning_rate": 0.0001, "loss": 1.0094, "step": 6750 }, { "ce_loss": 1.0062808072566987, "epoch": 2.03, "inp_emb_norm": 0.324375, "loss": 1.0062808072566987, "masked_top1": 44.80854190826416, "masked_top5": 67.70930862426758, "step": 6750, "top1": 85.45523956298828, "top5": 95.46897216796874 }, { "epoch": 2.05, "grad_norm": 1.0111394168332695, "learning_rate": 0.0001, "loss": 0.9973, "step": 6800 }, { "ce_loss": 0.9945477271080017, "epoch": 2.05, "inp_emb_norm": 0.32734375, "loss": 0.9945477271080017, "masked_top1": 46.81753234863281, "masked_top5": 70.2586336517334, "step": 6800, "top1": 85.69498947143555, "top5": 95.61528350830078 }, { "epoch": 2.06, "grad_norm": 1.146035850759827, "learning_rate": 0.0001, "loss": 1.0172, "step": 6850 }, { "ce_loss": 1.0215596628189088, "epoch": 2.06, "inp_emb_norm": 0.3225390625, "loss": 1.0215596628189088, "masked_top1": 43.64241020202637, "masked_top5": 67.47051399230958, "step": 6850, "top1": 85.29426712036133, "top5": 95.40342254638672 }, { "epoch": 2.08, "grad_norm": 1.1308686916013488, "learning_rate": 0.0001, "loss": 1.0143, "step": 6900 }, { "ce_loss": 1.008236768245697, "epoch": 2.08, "inp_emb_norm": 0.322109375, "loss": 1.008236768245697, "masked_top1": 44.6266674041748, "masked_top5": 68.65714111328126, "step": 6900, "top1": 85.34050277709962, "top5": 95.54212951660156 }, { "epoch": 2.09, "grad_norm": 1.1042254336729025, "learning_rate": 0.0001, "loss": 1.0111, "step": 6950 }, { "ce_loss": 1.0106354761123657, "epoch": 2.09, "inp_emb_norm": 0.322734375, "loss": 1.0106354761123657, "masked_top1": 44.90856178283691, "masked_top5": 69.18769927978515, "step": 6950, "top1": 85.31204833984376, "top5": 95.51878021240235 }, { "epoch": 2.11, "grad_norm": 1.1432085643116041, "learning_rate": 0.0001, "loss": 1.0137, "step": 7000 }, { "ce_loss": 1.0164246666431427, "epoch": 2.11, "inp_emb_norm": 0.3241015625, "loss": 1.0164246666431427, "masked_top1": 45.27058769226074, "masked_top5": 69.0199755859375, "step": 7000, "top1": 85.29512222290039, "top5": 95.44889572143555 }, { "epoch": 2.12, "grad_norm": 1.0502533687083666, "learning_rate": 0.0001, "loss": 1.0498, "step": 7050 }, { "ce_loss": 1.0437222492694855, "epoch": 2.12, "inp_emb_norm": 0.3234765625, "loss": 1.0437222492694855, "masked_top1": 45.688305435180666, "masked_top5": 68.60405799865723, "step": 7050, "top1": 85.19064407348633, "top5": 95.48901702880859 }, { "epoch": 2.14, "grad_norm": 1.065003821728329, "learning_rate": 0.0001, "loss": 1.0199, "step": 7100 }, { "ce_loss": 1.01140921831131, "epoch": 2.14, "inp_emb_norm": 0.3250390625, "loss": 1.01140921831131, "masked_top1": 45.559912796020505, "masked_top5": 69.64316802978516, "step": 7100, "top1": 85.34976013183594, "top5": 95.51665420532227 }, { "epoch": 2.15, "grad_norm": 1.1735568530678606, "learning_rate": 0.0001, "loss": 1.0166, "step": 7150 }, { "ce_loss": 1.0293829572200774, "epoch": 2.15, "inp_emb_norm": 0.323671875, "loss": 1.0293829572200774, "masked_top1": 44.797666244506836, "masked_top5": 69.08728958129883, "step": 7150, "top1": 85.07948059082031, "top5": 95.50323318481445 }, { "epoch": 2.17, "grad_norm": 1.22336128717078, "learning_rate": 0.0001, "loss": 1.0263, "step": 7200 }, { "ce_loss": 1.0348090195655824, "epoch": 2.17, "inp_emb_norm": 0.3307421875, "loss": 1.0348090195655824, "masked_top1": 43.274717559814455, "masked_top5": 68.35926612854004, "step": 7200, "top1": 85.03092895507812, "top5": 95.42227752685547 }, { "epoch": 2.18, "grad_norm": 1.1254790539219672, "learning_rate": 0.0001, "loss": 1.0341, "step": 7250 }, { "ce_loss": 1.028915911912918, "epoch": 2.18, "inp_emb_norm": 0.3247265625, "loss": 1.028915911912918, "masked_top1": 45.49935554504395, "masked_top5": 69.03593551635743, "step": 7250, "top1": 85.14698638916016, "top5": 95.36694305419923 }, { "epoch": 2.2, "grad_norm": 1.1149477539183639, "learning_rate": 0.0001, "loss": 1.0412, "step": 7300 }, { "ce_loss": 1.0536164796352387, "epoch": 2.2, "inp_emb_norm": 0.3236328125, "loss": 1.0536164796352387, "masked_top1": 43.858817138671874, "masked_top5": 67.21836029052734, "step": 7300, "top1": 84.86703170776367, "top5": 95.25842987060547 }, { "epoch": 2.21, "grad_norm": 1.1968292628857302, "learning_rate": 0.0001, "loss": 1.0432, "step": 7350 }, { "ce_loss": 1.0520641374588013, "epoch": 2.21, "inp_emb_norm": 0.3211328125, "loss": 1.0520641374588013, "masked_top1": 44.41142387390137, "masked_top5": 67.9876936340332, "step": 7350, "top1": 84.89357849121093, "top5": 95.34237762451171 }, { "epoch": 2.23, "grad_norm": 1.1298025119403752, "learning_rate": 0.0001, "loss": 1.0285, "step": 7400 }, { "ce_loss": 1.0238622641563415, "epoch": 2.23, "inp_emb_norm": 0.33328125, "loss": 1.0238622641563415, "masked_top1": 46.15579338073731, "masked_top5": 70.0232991027832, "step": 7400, "top1": 85.30123580932617, "top5": 95.43901489257813 }, { "epoch": 2.24, "grad_norm": 1.0250128601399684, "learning_rate": 0.0001, "loss": 1.0317, "step": 7450 }, { "ce_loss": 1.024322179555893, "epoch": 2.24, "inp_emb_norm": 0.3326953125, "loss": 1.024322179555893, "masked_top1": 45.176987152099606, "masked_top5": 69.09153121948242, "step": 7450, "top1": 85.22158447265625, "top5": 95.37797592163086 }, { "epoch": 2.26, "grad_norm": 1.1668834452965842, "learning_rate": 0.0001, "loss": 1.0344, "step": 7500 }, { "ce_loss": 1.0320839881896973, "epoch": 2.26, "inp_emb_norm": 0.3268359375, "loss": 1.0320839881896973, "masked_top1": 45.62026954650879, "masked_top5": 69.47045196533203, "step": 7500, "top1": 85.13791748046874, "top5": 95.46679916381837 }, { "epoch": 2.27, "grad_norm": 1.1647845511661612, "learning_rate": 0.0001, "loss": 1.0364, "step": 7550 }, { "ce_loss": 1.040896817445755, "epoch": 2.27, "inp_emb_norm": 0.3287890625, "loss": 1.040896817445755, "masked_top1": 45.61882129669189, "masked_top5": 68.993677444458, "step": 7550, "top1": 84.89491455078125, "top5": 95.34945602416992 }, { "epoch": 2.29, "grad_norm": 1.070994325965457, "learning_rate": 0.0001, "loss": 1.0481, "step": 7600 }, { "ce_loss": 1.045055913925171, "epoch": 2.29, "inp_emb_norm": 0.3358203125, "loss": 1.045055913925171, "masked_top1": 43.71815933227539, "masked_top5": 68.03006439208984, "step": 7600, "top1": 84.92118545532226, "top5": 95.33861541748047 }, { "epoch": 2.3, "grad_norm": 1.0302755529017995, "learning_rate": 0.0001, "loss": 1.0468, "step": 7650 }, { "ce_loss": 1.029969446659088, "epoch": 2.3, "inp_emb_norm": 0.3273046875, "loss": 1.029969446659088, "masked_top1": 44.84774971008301, "masked_top5": 68.51262855529785, "step": 7650, "top1": 85.12003936767579, "top5": 95.39368621826172 }, { "epoch": 2.32, "grad_norm": 1.2124331568257196, "learning_rate": 0.0001, "loss": 1.0462, "step": 7700 }, { "ce_loss": 1.0300623905658721, "epoch": 2.32, "inp_emb_norm": 0.32671875, "loss": 1.0300623905658721, "masked_top1": 46.18773262023926, "masked_top5": 69.55521690368653, "step": 7700, "top1": 85.05798110961913, "top5": 95.45936019897461 }, { "epoch": 2.33, "grad_norm": 1.25935837166321, "learning_rate": 0.0001, "loss": 1.051, "step": 7750 }, { "ce_loss": 1.0560246324539184, "epoch": 2.33, "inp_emb_norm": 0.32921875, "loss": 1.0560246324539184, "masked_top1": 44.81505111694336, "masked_top5": 68.60199340820313, "step": 7750, "top1": 84.81520401000977, "top5": 95.27306564331055 }, { "epoch": 2.35, "grad_norm": 1.1490451145708855, "learning_rate": 0.0001, "loss": 1.0459, "step": 7800 }, { "ce_loss": 1.0482890462875367, "epoch": 2.35, "inp_emb_norm": 0.3268359375, "loss": 1.0482890462875367, "masked_top1": 44.35726287841797, "masked_top5": 68.64975059509277, "step": 7800, "top1": 84.82445663452148, "top5": 95.42289993286133 }, { "epoch": 2.36, "grad_norm": 1.1334989027761242, "learning_rate": 0.0001, "loss": 1.0441, "step": 7850 }, { "ce_loss": 1.0527094066143037, "epoch": 2.36, "inp_emb_norm": 0.3269140625, "loss": 1.0527094066143037, "masked_top1": 45.072478713989256, "masked_top5": 68.39160514831543, "step": 7850, "top1": 84.785078125, "top5": 95.35190139770508 }, { "epoch": 2.38, "grad_norm": 1.1375627332151101, "learning_rate": 0.0001, "loss": 1.0565, "step": 7900 }, { "ce_loss": 1.0542543601989747, "epoch": 2.38, "inp_emb_norm": 0.331328125, "loss": 1.0542543601989747, "masked_top1": 43.75662239074707, "masked_top5": 67.66206184387207, "step": 7900, "top1": 84.7607080078125, "top5": 95.31730087280273 }, { "epoch": 2.39, "grad_norm": 1.1403758501967678, "learning_rate": 0.0001, "loss": 1.0445, "step": 7950 }, { "ce_loss": 1.0483984065055847, "epoch": 2.39, "inp_emb_norm": 0.32671875, "loss": 1.0483984065055847, "masked_top1": 45.24047119140625, "masked_top5": 68.86077049255371, "step": 7950, "top1": 84.88309677124023, "top5": 95.3291700744629 }, { "epoch": 2.41, "grad_norm": 1.039302857763206, "learning_rate": 0.0001, "loss": 1.0581, "step": 8000 }, { "ce_loss": 1.0635139977931976, "epoch": 2.41, "inp_emb_norm": 0.3339453125, "loss": 1.0635139977931976, "masked_top1": 43.78837112426758, "masked_top5": 67.08759162902832, "step": 8000, "top1": 84.79178085327149, "top5": 95.13488723754882 }, { "epoch": 2.42, "grad_norm": 1.1910955097950056, "learning_rate": 0.0001, "loss": 1.0522, "step": 8050 }, { "ce_loss": 1.0587849020957947, "epoch": 2.42, "inp_emb_norm": 0.3301171875, "loss": 1.0587849020957947, "masked_top1": 44.429280014038085, "masked_top5": 67.39429237365722, "step": 8050, "top1": 84.87328887939454, "top5": 95.1887158203125 }, { "epoch": 2.44, "grad_norm": 1.1479506270149336, "learning_rate": 0.0001, "loss": 1.0536, "step": 8100 }, { "ce_loss": 1.0473863470554352, "epoch": 2.44, "inp_emb_norm": 0.3341015625, "loss": 1.0473863470554352, "masked_top1": 44.48332893371582, "masked_top5": 68.14752388000488, "step": 8100, "top1": 84.92030502319336, "top5": 95.26301513671875 }, { "epoch": 2.45, "grad_norm": 1.1375038249327807, "learning_rate": 0.0001, "loss": 1.0419, "step": 8150 }, { "ce_loss": 1.0687260043621063, "epoch": 2.45, "inp_emb_norm": 0.3335546875, "loss": 1.0687260043621063, "masked_top1": 43.23715843200684, "masked_top5": 67.02041564941406, "step": 8150, "top1": 84.57525085449218, "top5": 95.16051864624023 }, { "epoch": 2.47, "grad_norm": 1.1378089695981268, "learning_rate": 0.0001, "loss": 1.0485, "step": 8200 }, { "ce_loss": 1.0508419513702392, "epoch": 2.47, "inp_emb_norm": 0.3319921875, "loss": 1.0508419513702392, "masked_top1": 45.14276206970215, "masked_top5": 69.22940170288086, "step": 8200, "top1": 84.89724472045899, "top5": 95.38407836914062 }, { "epoch": 2.48, "grad_norm": 1.134415072768399, "learning_rate": 0.0001, "loss": 1.0587, "step": 8250 }, { "ce_loss": 1.0560647177696227, "epoch": 2.48, "inp_emb_norm": 0.33328125, "loss": 1.0560647177696227, "masked_top1": 44.99800594329834, "masked_top5": 67.96685562133788, "step": 8250, "top1": 84.84181884765626, "top5": 95.19639511108399 }, { "epoch": 2.5, "grad_norm": 1.0130905993003134, "learning_rate": 0.0001, "loss": 1.0622, "step": 8300 }, { "ce_loss": 1.0628444683551788, "epoch": 2.5, "inp_emb_norm": 0.335390625, "loss": 1.0628444683551788, "masked_top1": 43.72650085449219, "masked_top5": 67.85165229797363, "step": 8300, "top1": 84.71982971191406, "top5": 95.2232145690918 }, { "epoch": 2.51, "grad_norm": 1.1112686440386919, "learning_rate": 0.0001, "loss": 1.0624, "step": 8350 }, { "ce_loss": 1.0606860029697418, "epoch": 2.51, "inp_emb_norm": 0.3302734375, "loss": 1.0606860029697418, "masked_top1": 45.08112854003906, "masked_top5": 68.54836326599121, "step": 8350, "top1": 84.70668167114258, "top5": 95.34720886230468 }, { "epoch": 2.53, "grad_norm": 1.0929741266435504, "learning_rate": 0.0001, "loss": 1.0653, "step": 8400 }, { "ce_loss": 1.0685135400295258, "epoch": 2.53, "inp_emb_norm": 0.329296875, "loss": 1.0685135400295258, "masked_top1": 44.93409217834473, "masked_top5": 68.15225112915039, "step": 8400, "top1": 84.59507186889648, "top5": 95.2771678161621 }, { "epoch": 2.54, "grad_norm": 1.0671443258127058, "learning_rate": 0.0001, "loss": 1.0661, "step": 8450 }, { "ce_loss": 1.074057730436325, "epoch": 2.54, "inp_emb_norm": 0.334140625, "loss": 1.074057730436325, "masked_top1": 43.86976951599121, "masked_top5": 67.98081314086915, "step": 8450, "top1": 84.60137985229493, "top5": 95.1078970336914 }, { "epoch": 2.56, "grad_norm": 1.2357962610955344, "learning_rate": 0.0001, "loss": 1.0625, "step": 8500 }, { "ce_loss": 1.0734054052829742, "epoch": 2.56, "inp_emb_norm": 0.3264453125, "loss": 1.0734054052829742, "masked_top1": 43.74296424865722, "masked_top5": 67.63478965759278, "step": 8500, "top1": 84.40524276733399, "top5": 95.29552383422852 }, { "epoch": 2.57, "grad_norm": 1.185708692105678, "learning_rate": 0.0001, "loss": 1.0543, "step": 8550 }, { "ce_loss": 1.0622280275821685, "epoch": 2.57, "inp_emb_norm": 0.3312890625, "loss": 1.0622280275821685, "masked_top1": 45.68028434753418, "masked_top5": 68.61516632080078, "step": 8550, "top1": 84.72150787353516, "top5": 95.2423046875 }, { "epoch": 2.59, "grad_norm": 1.124889127351682, "learning_rate": 0.0001, "loss": 1.0598, "step": 8600 }, { "ce_loss": 1.0730221366882324, "epoch": 2.59, "inp_emb_norm": 0.3349609375, "loss": 1.0730221366882324, "masked_top1": 44.15586044311524, "masked_top5": 67.7581484222412, "step": 8600, "top1": 84.60388778686523, "top5": 95.18492584228515 }, { "epoch": 2.6, "grad_norm": 1.1470911071720453, "learning_rate": 0.0001, "loss": 1.0718, "step": 8650 }, { "ce_loss": 1.0610903584957123, "epoch": 2.6, "inp_emb_norm": 0.3320703125, "loss": 1.0610903584957123, "masked_top1": 44.21772804260254, "masked_top5": 67.78637229919434, "step": 8650, "top1": 84.74846099853515, "top5": 95.21728775024414 }, { "epoch": 2.62, "grad_norm": 1.120071012609436, "learning_rate": 0.0001, "loss": 1.0598, "step": 8700 }, { "ce_loss": 1.057754340171814, "epoch": 2.62, "inp_emb_norm": 0.334140625, "loss": 1.057754340171814, "masked_top1": 44.56280632019043, "masked_top5": 68.16851516723632, "step": 8700, "top1": 84.76306396484375, "top5": 95.24030334472656 }, { "epoch": 2.63, "grad_norm": 1.1990678496151557, "learning_rate": 0.0001, "loss": 1.0698, "step": 8750 }, { "ce_loss": 1.0554494428634644, "epoch": 2.63, "inp_emb_norm": 0.3276953125, "loss": 1.0554494428634644, "masked_top1": 45.60402565002441, "masked_top5": 69.8213597869873, "step": 8750, "top1": 84.74255416870118, "top5": 95.37602157592774 }, { "epoch": 2.65, "grad_norm": 1.1691422333586243, "learning_rate": 0.0001, "loss": 1.0711, "step": 8800 }, { "ce_loss": 1.0723666751384735, "epoch": 2.65, "inp_emb_norm": 0.3325390625, "loss": 1.0723666751384735, "masked_top1": 44.78216484069824, "masked_top5": 68.43072723388671, "step": 8800, "top1": 84.51954879760743, "top5": 95.22681289672852 }, { "epoch": 2.66, "grad_norm": 1.058087631068196, "learning_rate": 0.0001, "loss": 1.0736, "step": 8850 }, { "ce_loss": 1.0746560537815093, "epoch": 2.66, "inp_emb_norm": 0.329140625, "loss": 1.0746560537815093, "masked_top1": 44.328994064331056, "masked_top5": 67.38568382263183, "step": 8850, "top1": 84.52036026000977, "top5": 95.2164730834961 }, { "epoch": 2.68, "grad_norm": 1.1288621853196346, "learning_rate": 0.0001, "loss": 1.069, "step": 8900 }, { "ce_loss": 1.075651180744171, "epoch": 2.68, "inp_emb_norm": 0.334765625, "loss": 1.075651180744171, "masked_top1": 45.41173538208008, "masked_top5": 68.17863861083984, "step": 8900, "top1": 84.6201480102539, "top5": 95.15124877929688 }, { "epoch": 2.69, "grad_norm": 1.14695178467833, "learning_rate": 0.0001, "loss": 1.0671, "step": 8950 }, { "ce_loss": 1.0754395532608032, "epoch": 2.69, "inp_emb_norm": 0.334453125, "loss": 1.0754395532608032, "masked_top1": 45.43799507141113, "masked_top5": 69.10365425109863, "step": 8950, "top1": 84.61126815795899, "top5": 95.2543717956543 }, { "epoch": 2.71, "grad_norm": 1.131019351092537, "learning_rate": 0.0001, "loss": 1.0679, "step": 9000 }, { "ce_loss": 1.0691665148735046, "epoch": 2.71, "inp_emb_norm": 0.33484375, "loss": 1.0691665148735046, "masked_top1": 44.43798561096192, "masked_top5": 67.93657341003419, "step": 9000, "top1": 84.68964065551758, "top5": 95.20655715942382 }, { "epoch": 2.72, "grad_norm": 1.120170919000501, "learning_rate": 0.0001, "loss": 1.0722, "step": 9050 }, { "ce_loss": 1.0739211070537567, "epoch": 2.72, "inp_emb_norm": 0.3396484375, "loss": 1.0739211070537567, "masked_top1": 44.42457763671875, "masked_top5": 67.35458564758301, "step": 9050, "top1": 84.60804443359375, "top5": 95.0638233947754 }, { "epoch": 2.74, "grad_norm": 1.0687255304897059, "learning_rate": 0.0001, "loss": 1.0792, "step": 9100 }, { "ce_loss": 1.0771595978736876, "epoch": 2.74, "inp_emb_norm": 0.3378125, "loss": 1.0771595978736876, "masked_top1": 43.902629165649415, "masked_top5": 67.31045616149902, "step": 9100, "top1": 84.62485046386719, "top5": 95.07604125976563 }, { "epoch": 2.75, "grad_norm": 1.0993761131067057, "learning_rate": 0.0001, "loss": 1.0813, "step": 9150 }, { "ce_loss": 1.082298024892807, "epoch": 2.75, "inp_emb_norm": 0.334609375, "loss": 1.082298024892807, "masked_top1": 45.80146224975586, "masked_top5": 68.17116020202637, "step": 9150, "top1": 84.43293472290038, "top5": 95.1411996459961 }, { "epoch": 2.77, "grad_norm": 1.109498165414777, "learning_rate": 0.0001, "loss": 1.0752, "step": 9200 }, { "ce_loss": 1.0997070169448853, "epoch": 2.77, "inp_emb_norm": 0.33890625, "loss": 1.0997070169448853, "masked_top1": 44.238784561157225, "masked_top5": 67.47504318237304, "step": 9200, "top1": 84.3786474609375, "top5": 94.96468109130859 }, { "epoch": 2.78, "grad_norm": 1.0947772030405678, "learning_rate": 0.0001, "loss": 1.081, "step": 9250 }, { "ce_loss": 1.066500049829483, "epoch": 2.78, "inp_emb_norm": 0.3352734375, "loss": 1.066500049829483, "masked_top1": 45.10762840270996, "masked_top5": 68.68643852233886, "step": 9250, "top1": 84.62079177856445, "top5": 95.23117553710938 }, { "epoch": 2.8, "grad_norm": 0.9904651287047559, "learning_rate": 0.0001, "loss": 1.0769, "step": 9300 }, { "ce_loss": 1.0865390312671661, "epoch": 2.8, "inp_emb_norm": 0.3416015625, "loss": 1.0865390312671661, "masked_top1": 43.63430931091309, "masked_top5": 67.33321548461915, "step": 9300, "top1": 84.4200732421875, "top5": 95.05102798461914 }, { "epoch": 2.81, "grad_norm": 1.115318864267355, "learning_rate": 0.0001, "loss": 1.0716, "step": 9350 }, { "ce_loss": 1.05960639834404, "epoch": 2.81, "inp_emb_norm": 0.3376171875, "loss": 1.05960639834404, "masked_top1": 45.203971252441406, "masked_top5": 67.83310653686523, "step": 9350, "top1": 84.70579071044922, "top5": 95.13749114990235 }, { "epoch": 2.83, "grad_norm": 1.0681394549186096, "learning_rate": 0.0001, "loss": 1.0755, "step": 9400 }, { "ce_loss": 1.0759823191165925, "epoch": 2.83, "inp_emb_norm": 0.335234375, "loss": 1.0759823191165925, "masked_top1": 45.2706275177002, "masked_top5": 68.85594253540039, "step": 9400, "top1": 84.58239471435547, "top5": 95.29377487182617 }, { "epoch": 2.84, "grad_norm": 1.1612375451754464, "learning_rate": 0.0001, "loss": 1.0704, "step": 9450 }, { "ce_loss": 1.0630818378925324, "epoch": 2.84, "inp_emb_norm": 0.33828125, "loss": 1.0630818378925324, "masked_top1": 45.41359436035156, "masked_top5": 67.90339347839355, "step": 9450, "top1": 84.69526199340821, "top5": 95.18430023193359 }, { "epoch": 2.86, "grad_norm": 1.0053255791083329, "learning_rate": 0.0001, "loss": 1.075, "step": 9500 }, { "ce_loss": 1.0855333960056306, "epoch": 2.86, "inp_emb_norm": 0.332578125, "loss": 1.0855333960056306, "masked_top1": 43.72627799987793, "masked_top5": 67.27571357727051, "step": 9500, "top1": 84.34697372436523, "top5": 95.10883621215821 }, { "epoch": 2.87, "grad_norm": 1.1782915911734662, "learning_rate": 0.0001, "loss": 1.0865, "step": 9550 }, { "ce_loss": 1.083924981355667, "epoch": 2.87, "inp_emb_norm": 0.3425390625, "loss": 1.083924981355667, "masked_top1": 44.98833938598633, "masked_top5": 68.43323631286621, "step": 9550, "top1": 84.45411026000977, "top5": 95.07992599487305 }, { "epoch": 2.89, "grad_norm": 0.9304520640714639, "learning_rate": 0.0001, "loss": 1.083, "step": 9600 }, { "ce_loss": 1.0927222657203675, "epoch": 2.89, "inp_emb_norm": 0.3383984375, "loss": 1.0927222657203675, "masked_top1": 44.10447273254395, "masked_top5": 67.51412483215331, "step": 9600, "top1": 84.38162017822266, "top5": 94.919228515625 }, { "epoch": 2.9, "grad_norm": 1.1207152238553202, "learning_rate": 0.0001, "loss": 1.0748, "step": 9650 }, { "ce_loss": 1.0789397644996643, "epoch": 2.9, "inp_emb_norm": 0.338671875, "loss": 1.0789397644996643, "masked_top1": 44.84839416503906, "masked_top5": 67.80728317260743, "step": 9650, "top1": 84.54779495239258, "top5": 95.0833723449707 }, { "epoch": 2.92, "grad_norm": 1.1324876880673622, "learning_rate": 0.0001, "loss": 1.0768, "step": 9700 }, { "ce_loss": 1.0801820170879364, "epoch": 2.92, "inp_emb_norm": 0.3430078125, "loss": 1.0801820170879364, "masked_top1": 43.99905281066894, "masked_top5": 67.07788047790527, "step": 9700, "top1": 84.5837564086914, "top5": 95.05357192993164 }, { "epoch": 2.93, "grad_norm": 1.0773481244877405, "learning_rate": 0.0001, "loss": 1.0728, "step": 9750 }, { "ce_loss": 1.080371401309967, "epoch": 2.93, "inp_emb_norm": 0.3410546875, "loss": 1.080371401309967, "masked_top1": 43.507309417724606, "masked_top5": 67.3342724609375, "step": 9750, "top1": 84.5036083984375, "top5": 95.07123840332031 }, { "epoch": 2.95, "grad_norm": 1.0159168175987678, "learning_rate": 0.0001, "loss": 1.0688, "step": 9800 }, { "ce_loss": 1.083665030002594, "epoch": 2.95, "inp_emb_norm": 0.34671875, "loss": 1.083665030002594, "masked_top1": 44.19781021118164, "masked_top5": 67.93185821533203, "step": 9800, "top1": 84.49478713989258, "top5": 95.09757629394531 }, { "epoch": 2.96, "grad_norm": 1.1136205616789427, "learning_rate": 0.0001, "loss": 1.0666, "step": 9850 }, { "ce_loss": 1.0619865989685058, "epoch": 2.96, "inp_emb_norm": 0.33875, "loss": 1.0619865989685058, "masked_top1": 45.1244051361084, "masked_top5": 69.11591262817383, "step": 9850, "top1": 84.57212326049805, "top5": 95.30008316040039 }, { "epoch": 2.98, "grad_norm": 1.0447170988807504, "learning_rate": 0.0001, "loss": 1.0799, "step": 9900 }, { "ce_loss": 1.079269015789032, "epoch": 2.98, "inp_emb_norm": 0.3366796875, "loss": 1.079269015789032, "masked_top1": 45.20799728393555, "masked_top5": 69.20284423828124, "step": 9900, "top1": 84.41862701416015, "top5": 95.25058349609375 }, { "epoch": 2.99, "grad_norm": 1.02006539824341, "learning_rate": 0.0001, "loss": 1.0727, "step": 9950 }, { "ce_loss": 1.078022118806839, "epoch": 2.99, "inp_emb_norm": 0.343828125, "loss": 1.078022118806839, "masked_top1": 45.404396057128906, "masked_top5": 68.74404998779296, "step": 9950, "top1": 84.56711853027343, "top5": 95.0911540222168 }, { "epoch": 3.01, "grad_norm": 1.113028194298386, "learning_rate": 0.0001, "loss": 0.923, "step": 10000 }, { "ce_loss": 0.9195016610622406, "epoch": 3.01, "inp_emb_norm": 0.3430859375, "loss": 0.9195016610622406, "masked_top1": 48.332907638549806, "masked_top5": 72.30007148742676, "step": 10000, "top1": 86.70970993041992, "top5": 95.9695133972168 }, { "epoch": 3.02, "grad_norm": 0.9894998771881995, "learning_rate": 0.0001, "loss": 0.7385, "step": 10050 }, { "ce_loss": 0.7338530778884887, "epoch": 3.02, "inp_emb_norm": 0.337578125, "loss": 0.7338530778884887, "masked_top1": 52.05472785949707, "masked_top5": 77.44325592041015, "step": 10050, "top1": 89.11887008666992, "top5": 97.09128341674804 }, { "epoch": 3.04, "grad_norm": 1.0927669172905705, "learning_rate": 0.0001, "loss": 0.7338, "step": 10100 }, { "ce_loss": 0.7197039890289306, "epoch": 3.04, "inp_emb_norm": 0.3384765625, "loss": 0.7197039890289306, "masked_top1": 53.32936988830566, "masked_top5": 77.26823791503907, "step": 10100, "top1": 89.31084884643555, "top5": 97.0846061706543 }, { "epoch": 3.05, "grad_norm": 1.0197432952418408, "learning_rate": 0.0001, "loss": 0.7411, "step": 10150 }, { "ce_loss": 0.7408602213859559, "epoch": 3.05, "inp_emb_norm": 0.3473046875, "loss": 0.7408602213859559, "masked_top1": 51.371361846923826, "masked_top5": 76.60449035644531, "step": 10150, "top1": 89.11841430664063, "top5": 96.8936897277832 }, { "epoch": 3.07, "grad_norm": 1.0379685041697753, "learning_rate": 0.0001, "loss": 0.7462, "step": 10200 }, { "ce_loss": 0.7370594382286072, "epoch": 3.07, "inp_emb_norm": 0.3462109375, "loss": 0.7370594382286072, "masked_top1": 51.55666793823242, "masked_top5": 76.35641525268555, "step": 10200, "top1": 89.04085632324218, "top5": 96.98393936157227 }, { "epoch": 3.08, "grad_norm": 0.9812098301602463, "learning_rate": 0.0001, "loss": 0.7456, "step": 10250 }, { "ce_loss": 0.7352669024467469, "epoch": 3.08, "inp_emb_norm": 0.3394921875, "loss": 0.7352669024467469, "masked_top1": 52.50257652282715, "masked_top5": 76.83967468261719, "step": 10250, "top1": 89.10743438720704, "top5": 96.9831704711914 }, { "epoch": 3.1, "grad_norm": 1.1025924611121007, "learning_rate": 0.0001, "loss": 0.7497, "step": 10300 }, { "ce_loss": 0.749667866230011, "epoch": 3.1, "inp_emb_norm": 0.34015625, "loss": 0.749667866230011, "masked_top1": 51.46554763793945, "masked_top5": 76.06611167907715, "step": 10300, "top1": 88.8905972290039, "top5": 96.94614303588867 }, { "epoch": 3.11, "grad_norm": 1.0513267054177442, "learning_rate": 0.0001, "loss": 0.76, "step": 10350 }, { "ce_loss": 0.7712516760826111, "epoch": 3.11, "inp_emb_norm": 0.342890625, "loss": 0.7712516760826111, "masked_top1": 50.167163619995115, "masked_top5": 74.79809906005859, "step": 10350, "top1": 88.5837466430664, "top5": 96.77574478149414 }, { "epoch": 3.13, "grad_norm": 1.1017983640036164, "learning_rate": 0.0001, "loss": 0.75, "step": 10400 }, { "ce_loss": 0.7555344760417938, "epoch": 3.13, "inp_emb_norm": 0.3440625, "loss": 0.7555344760417938, "masked_top1": 51.58950523376465, "masked_top5": 75.45907012939453, "step": 10400, "top1": 88.80611511230468, "top5": 96.81310470581055 }, { "epoch": 3.14, "grad_norm": 1.059466085863966, "learning_rate": 0.0001, "loss": 0.7674, "step": 10450 }, { "ce_loss": 0.7663704335689545, "epoch": 3.14, "inp_emb_norm": 0.3493359375, "loss": 0.7663704335689545, "masked_top1": 49.915076370239255, "masked_top5": 75.00028388977051, "step": 10450, "top1": 88.7645571899414, "top5": 96.73904815673828 }, { "epoch": 3.16, "grad_norm": 1.1423077844194809, "learning_rate": 0.0001, "loss": 0.7609, "step": 10500 }, { "ce_loss": 0.767000640630722, "epoch": 3.16, "inp_emb_norm": 0.3465234375, "loss": 0.767000640630722, "masked_top1": 50.03567909240723, "masked_top5": 75.01374862670899, "step": 10500, "top1": 88.57664657592774, "top5": 96.84391784667969 }, { "epoch": 3.17, "grad_norm": 1.032448425259117, "learning_rate": 0.0001, "loss": 0.772, "step": 10550 }, { "ce_loss": 0.7650803327560425, "epoch": 3.17, "inp_emb_norm": 0.3430078125, "loss": 0.7650803327560425, "masked_top1": 51.708590774536134, "masked_top5": 75.70334999084473, "step": 10550, "top1": 88.69338806152344, "top5": 96.79394775390625 }, { "epoch": 3.19, "grad_norm": 0.9962408469712855, "learning_rate": 0.0001, "loss": 0.7756, "step": 10600 }, { "ce_loss": 0.7650206458568573, "epoch": 3.19, "inp_emb_norm": 0.3441796875, "loss": 0.7650206458568573, "masked_top1": 51.08683837890625, "masked_top5": 76.22035110473632, "step": 10600, "top1": 88.6184033203125, "top5": 96.88288452148437 }, { "epoch": 3.2, "grad_norm": 1.0602266154627207, "learning_rate": 0.0001, "loss": 0.7701, "step": 10650 }, { "ce_loss": 0.790007756948471, "epoch": 3.2, "inp_emb_norm": 0.3371875, "loss": 0.790007756948471, "masked_top1": 49.467856369018556, "masked_top5": 74.11548706054687, "step": 10650, "top1": 88.32970947265625, "top5": 96.74940063476562 }, { "epoch": 3.22, "grad_norm": 0.977038059072146, "learning_rate": 0.0001, "loss": 0.7746, "step": 10700 }, { "ce_loss": 0.7758392190933228, "epoch": 3.22, "inp_emb_norm": 0.346875, "loss": 0.7758392190933228, "masked_top1": 50.20531356811524, "masked_top5": 75.03212989807129, "step": 10700, "top1": 88.4866551208496, "top5": 96.73311477661133 }, { "epoch": 3.23, "grad_norm": 1.0744003921088385, "learning_rate": 0.0001, "loss": 0.7866, "step": 10750 }, { "ce_loss": 0.788164142370224, "epoch": 3.23, "inp_emb_norm": 0.351015625, "loss": 0.788164142370224, "masked_top1": 49.6349144744873, "masked_top5": 74.33588874816894, "step": 10750, "top1": 88.30440032958984, "top5": 96.71714782714844 }, { "epoch": 3.25, "grad_norm": 1.0175043021315349, "learning_rate": 0.0001, "loss": 0.7874, "step": 10800 }, { "ce_loss": 0.7919885838031768, "epoch": 3.25, "inp_emb_norm": 0.3444140625, "loss": 0.7919885838031768, "masked_top1": 48.99909255981445, "masked_top5": 74.25505355834962, "step": 10800, "top1": 88.26540832519531, "top5": 96.72898651123047 }, { "epoch": 3.26, "grad_norm": 1.1095697587812745, "learning_rate": 0.0001, "loss": 0.7888, "step": 10850 }, { "ce_loss": 0.797193922996521, "epoch": 3.26, "inp_emb_norm": 0.345234375, "loss": 0.797193922996521, "masked_top1": 49.19968879699707, "masked_top5": 74.1003759765625, "step": 10850, "top1": 88.23719314575196, "top5": 96.6193911743164 }, { "epoch": 3.28, "grad_norm": 1.096465203223292, "learning_rate": 0.0001, "loss": 0.7871, "step": 10900 }, { "ce_loss": 0.7947347521781921, "epoch": 3.28, "inp_emb_norm": 0.3490234375, "loss": 0.7947347521781921, "masked_top1": 49.850736923217774, "masked_top5": 74.32025009155274, "step": 10900, "top1": 88.2540364074707, "top5": 96.63969177246094 }, { "epoch": 3.29, "grad_norm": 1.0949347711209227, "learning_rate": 0.0001, "loss": 0.7828, "step": 10950 }, { "ce_loss": 0.7824289429187775, "epoch": 3.29, "inp_emb_norm": 0.340234375, "loss": 0.7824289429187775, "masked_top1": 51.0236792755127, "masked_top5": 75.60059753417968, "step": 10950, "top1": 88.44829376220703, "top5": 96.78743530273438 }, { "epoch": 3.31, "grad_norm": 1.0829787001902516, "learning_rate": 0.0001, "loss": 0.7878, "step": 11000 }, { "ce_loss": 0.7807865822315216, "epoch": 3.31, "inp_emb_norm": 0.35390625, "loss": 0.7807865822315216, "masked_top1": 50.650703506469725, "masked_top5": 74.27250564575195, "step": 11000, "top1": 88.50477279663086, "top5": 96.67802978515626 }, { "epoch": 3.32, "grad_norm": 1.01346692888271, "learning_rate": 0.0001, "loss": 0.7935, "step": 11050 }, { "ce_loss": 0.7934070038795471, "epoch": 3.32, "inp_emb_norm": 0.35015625, "loss": 0.7934070038795471, "masked_top1": 50.09875770568848, "masked_top5": 74.54286209106445, "step": 11050, "top1": 88.3311814880371, "top5": 96.66521301269532 }, { "epoch": 3.34, "grad_norm": 1.1970274017001157, "learning_rate": 0.0001, "loss": 0.8031, "step": 11100 }, { "ce_loss": 0.8043110525608063, "epoch": 3.34, "inp_emb_norm": 0.340703125, "loss": 0.8043110525608063, "masked_top1": 49.070663375854494, "masked_top5": 73.73879615783692, "step": 11100, "top1": 88.18025527954102, "top5": 96.71074813842773 }, { "epoch": 3.35, "grad_norm": 0.9807421337694971, "learning_rate": 0.0001, "loss": 0.7967, "step": 11150 }, { "ce_loss": 0.7986381149291992, "epoch": 3.35, "inp_emb_norm": 0.3460546875, "loss": 0.7986381149291992, "masked_top1": 49.351584854125974, "masked_top5": 74.40426681518555, "step": 11150, "top1": 88.17583755493165, "top5": 96.63799163818359 }, { "epoch": 3.37, "grad_norm": 1.0963946786909775, "learning_rate": 0.0001, "loss": 0.7973, "step": 11200 }, { "ce_loss": 0.7999817717075348, "epoch": 3.37, "inp_emb_norm": 0.34953125, "loss": 0.7999817717075348, "masked_top1": 48.15997085571289, "masked_top5": 73.92494735717773, "step": 11200, "top1": 88.08112899780274, "top5": 96.61600830078125 }, { "epoch": 3.38, "grad_norm": 0.9959517938940972, "learning_rate": 0.0001, "loss": 0.7972, "step": 11250 }, { "ce_loss": 0.7938779592514038, "epoch": 3.38, "inp_emb_norm": 0.3476171875, "loss": 0.7938779592514038, "masked_top1": 49.82730033874512, "masked_top5": 74.95826553344726, "step": 11250, "top1": 88.16325576782226, "top5": 96.75233062744141 }, { "epoch": 3.4, "grad_norm": 0.9936212958366237, "learning_rate": 0.0001, "loss": 0.8, "step": 11300 }, { "ce_loss": 0.8001345789432526, "epoch": 3.4, "inp_emb_norm": 0.3510546875, "loss": 0.8001345789432526, "masked_top1": 50.4670531463623, "masked_top5": 74.18307846069337, "step": 11300, "top1": 88.19016235351563, "top5": 96.6422052001953 }, { "epoch": 3.41, "grad_norm": 1.130929594499011, "learning_rate": 0.0001, "loss": 0.8071, "step": 11350 }, { "ce_loss": 0.8002821004390717, "epoch": 3.41, "inp_emb_norm": 0.3469921875, "loss": 0.8002821004390717, "masked_top1": 49.907284088134766, "masked_top5": 74.60272583007813, "step": 11350, "top1": 88.13528030395508, "top5": 96.65942398071289 }, { "epoch": 3.43, "grad_norm": 1.027541301721308, "learning_rate": 0.0001, "loss": 0.817, "step": 11400 }, { "ce_loss": 0.8025326645374298, "epoch": 3.43, "inp_emb_norm": 0.3438671875, "loss": 0.8025326645374298, "masked_top1": 50.332852630615236, "masked_top5": 74.55212646484375, "step": 11400, "top1": 88.08174774169922, "top5": 96.71300491333008 }, { "epoch": 3.44, "grad_norm": 1.0509284890501878, "learning_rate": 0.0001, "loss": 0.8071, "step": 11450 }, { "ce_loss": 0.8151209402084351, "epoch": 3.44, "inp_emb_norm": 0.3536328125, "loss": 0.8151209402084351, "masked_top1": 49.18514472961426, "masked_top5": 73.88459991455078, "step": 11450, "top1": 88.03491561889649, "top5": 96.5151156616211 }, { "epoch": 3.46, "grad_norm": 1.0563826334349269, "learning_rate": 0.0001, "loss": 0.8146, "step": 11500 }, { "ce_loss": 0.8121966254711152, "epoch": 3.46, "inp_emb_norm": 0.3479296875, "loss": 0.8121966254711152, "masked_top1": 48.76513572692871, "masked_top5": 73.98189331054688, "step": 11500, "top1": 87.9227052307129, "top5": 96.60188415527344 }, { "epoch": 3.47, "grad_norm": 1.0581754020714675, "learning_rate": 0.0001, "loss": 0.8113, "step": 11550 }, { "ce_loss": 0.8193511891365052, "epoch": 3.47, "inp_emb_norm": 0.3533984375, "loss": 0.8193511891365052, "masked_top1": 48.62931312561035, "masked_top5": 73.32291015625, "step": 11550, "top1": 87.89377365112304, "top5": 96.50224563598633 }, { "epoch": 3.49, "grad_norm": 1.012876759570208, "learning_rate": 0.0001, "loss": 0.8131, "step": 11600 }, { "ce_loss": 0.8099701881408692, "epoch": 3.49, "inp_emb_norm": 0.35375, "loss": 0.8099701881408692, "masked_top1": 49.099407272338865, "masked_top5": 73.79963348388672, "step": 11600, "top1": 88.09338317871094, "top5": 96.58081787109376 }, { "epoch": 3.5, "grad_norm": 1.0124821527896055, "learning_rate": 0.0001, "loss": 0.8121, "step": 11650 }, { "ce_loss": 0.8340137410163879, "epoch": 3.5, "inp_emb_norm": 0.34828125, "loss": 0.8340137410163879, "masked_top1": 49.10786933898926, "masked_top5": 73.36568084716797, "step": 11650, "top1": 87.82726516723633, "top5": 96.4594790649414 }, { "epoch": 3.52, "grad_norm": 1.006239120505223, "learning_rate": 0.0001, "loss": 0.8201, "step": 11700 }, { "ce_loss": 0.8206925344467163, "epoch": 3.52, "inp_emb_norm": 0.351875, "loss": 0.8206925344467163, "masked_top1": 48.73426811218262, "masked_top5": 73.27444328308106, "step": 11700, "top1": 87.83039459228516, "top5": 96.4988427734375 }, { "epoch": 3.53, "grad_norm": 1.059895397750386, "learning_rate": 0.0001, "loss": 0.817, "step": 11750 }, { "ce_loss": 0.8189614808559418, "epoch": 3.53, "inp_emb_norm": 0.3555859375, "loss": 0.8189614808559418, "masked_top1": 49.02143653869629, "masked_top5": 74.14399620056152, "step": 11750, "top1": 87.84059661865234, "top5": 96.55142425537109 }, { "epoch": 3.55, "grad_norm": 1.065918703633994, "learning_rate": 0.0001, "loss": 0.8103, "step": 11800 }, { "ce_loss": 0.8101903474330903, "epoch": 3.55, "inp_emb_norm": 0.3505078125, "loss": 0.8101903474330903, "masked_top1": 49.68892807006836, "masked_top5": 74.20518287658692, "step": 11800, "top1": 88.00088562011719, "top5": 96.6223454284668 }, { "epoch": 3.56, "grad_norm": 0.999942936662604, "learning_rate": 0.0001, "loss": 0.8173, "step": 11850 }, { "ce_loss": 0.8223925268650055, "epoch": 3.56, "inp_emb_norm": 0.3467578125, "loss": 0.8223925268650055, "masked_top1": 48.98508232116699, "masked_top5": 73.06097061157226, "step": 11850, "top1": 87.84088226318359, "top5": 96.52394149780274 }, { "epoch": 3.58, "grad_norm": 1.0109380245958302, "learning_rate": 0.0001, "loss": 0.8217, "step": 11900 }, { "ce_loss": 0.8102488934993743, "epoch": 3.58, "inp_emb_norm": 0.3561328125, "loss": 0.8102488934993743, "masked_top1": 49.78484992980957, "masked_top5": 73.83953674316406, "step": 11900, "top1": 88.08386886596679, "top5": 96.51279846191406 }, { "epoch": 3.59, "grad_norm": 1.0501857173368994, "learning_rate": 0.0001, "loss": 0.8202, "step": 11950 }, { "ce_loss": 0.8153278791904449, "epoch": 3.59, "inp_emb_norm": 0.3567578125, "loss": 0.8153278791904449, "masked_top1": 50.214491806030274, "masked_top5": 75.15233001708984, "step": 11950, "top1": 87.94796691894531, "top5": 96.6393051147461 }, { "epoch": 3.61, "grad_norm": 1.0094284740380721, "learning_rate": 0.0001, "loss": 0.8228, "step": 12000 }, { "ce_loss": 0.8150149726867676, "epoch": 3.61, "inp_emb_norm": 0.34546875, "loss": 0.8150149726867676, "masked_top1": 50.50455932617187, "masked_top5": 75.45603103637696, "step": 12000, "top1": 87.88845626831055, "top5": 96.71615814208984 }, { "epoch": 3.62, "grad_norm": 0.9828696027131701, "learning_rate": 0.0001, "loss": 0.8268, "step": 12050 }, { "ce_loss": 0.8084503662586212, "epoch": 3.62, "inp_emb_norm": 0.35515625, "loss": 0.8084503662586212, "masked_top1": 49.089233779907225, "masked_top5": 74.21111480712891, "step": 12050, "top1": 87.9230386352539, "top5": 96.66818649291992 }, { "epoch": 3.64, "grad_norm": 1.0364095825846835, "learning_rate": 0.0001, "loss": 0.8313, "step": 12100 }, { "ce_loss": 0.8248488974571228, "epoch": 3.64, "inp_emb_norm": 0.3575, "loss": 0.8248488974571228, "masked_top1": 49.23360801696777, "masked_top5": 73.44429908752441, "step": 12100, "top1": 87.82616806030273, "top5": 96.4687548828125 }, { "epoch": 3.65, "grad_norm": 1.032109747501083, "learning_rate": 0.0001, "loss": 0.8166, "step": 12150 }, { "ce_loss": 0.819879275560379, "epoch": 3.65, "inp_emb_norm": 0.3619921875, "loss": 0.819879275560379, "masked_top1": 49.493774490356444, "masked_top5": 74.06904296875, "step": 12150, "top1": 87.88977813720703, "top5": 96.5196435546875 }, { "epoch": 3.67, "grad_norm": 1.1132546555196505, "learning_rate": 0.0001, "loss": 0.8407, "step": 12200 }, { "ce_loss": 0.8416215097904205, "epoch": 3.67, "inp_emb_norm": 0.35109375, "loss": 0.8416215097904205, "masked_top1": 48.93654960632324, "masked_top5": 73.06722267150879, "step": 12200, "top1": 87.65886123657226, "top5": 96.42184036254883 }, { "epoch": 3.68, "grad_norm": 1.1126594978470823, "learning_rate": 0.0001, "loss": 0.8309, "step": 12250 }, { "ce_loss": 0.8369515192508697, "epoch": 3.68, "inp_emb_norm": 0.3548046875, "loss": 0.8369515192508697, "masked_top1": 49.08364143371582, "masked_top5": 73.77061073303223, "step": 12250, "top1": 87.5754817199707, "top5": 96.51109664916993 }, { "epoch": 3.7, "grad_norm": 1.039335313619578, "learning_rate": 0.0001, "loss": 0.8335, "step": 12300 }, { "ce_loss": 0.8377754426002503, "epoch": 3.7, "inp_emb_norm": 0.3561328125, "loss": 0.8377754426002503, "masked_top1": 48.12888420104981, "masked_top5": 72.75807151794433, "step": 12300, "top1": 87.64847686767578, "top5": 96.35865264892578 }, { "epoch": 3.71, "grad_norm": 1.0741125902905957, "learning_rate": 0.0001, "loss": 0.8311, "step": 12350 }, { "ce_loss": 0.8398508429527283, "epoch": 3.71, "inp_emb_norm": 0.3508203125, "loss": 0.8398508429527283, "masked_top1": 49.14791725158691, "masked_top5": 73.34809280395508, "step": 12350, "top1": 87.55382202148438, "top5": 96.47370834350586 }, { "epoch": 3.73, "grad_norm": 1.0735818209876995, "learning_rate": 0.0001, "loss": 0.8271, "step": 12400 }, { "ce_loss": 0.8356200730800629, "epoch": 3.73, "inp_emb_norm": 0.3551953125, "loss": 0.8356200730800629, "masked_top1": 48.54533554077148, "masked_top5": 73.34476501464843, "step": 12400, "top1": 87.71935012817383, "top5": 96.43182586669921 }, { "epoch": 3.74, "grad_norm": 1.1727450080624469, "learning_rate": 0.0001, "loss": 0.8438, "step": 12450 }, { "ce_loss": 0.834755152463913, "epoch": 3.74, "inp_emb_norm": 0.3557421875, "loss": 0.834755152463913, "masked_top1": 48.73476402282715, "masked_top5": 73.74958610534668, "step": 12450, "top1": 87.6754948425293, "top5": 96.48529556274414 }, { "epoch": 3.76, "grad_norm": 1.0593532648539608, "learning_rate": 0.0001, "loss": 0.8302, "step": 12500 }, { "ce_loss": 0.8266095387935638, "epoch": 3.76, "inp_emb_norm": 0.35234375, "loss": 0.8266095387935638, "masked_top1": 49.589872207641605, "masked_top5": 73.7888671875, "step": 12500, "top1": 87.81543869018554, "top5": 96.54735931396485 }, { "epoch": 3.77, "grad_norm": 1.0465892187844261, "learning_rate": 0.0001, "loss": 0.8365, "step": 12550 }, { "ce_loss": 0.8360547876358032, "epoch": 3.77, "inp_emb_norm": 0.352265625, "loss": 0.8360547876358032, "masked_top1": 48.81367431640625, "masked_top5": 73.35512954711913, "step": 12550, "top1": 87.50978561401367, "top5": 96.48541564941407 }, { "epoch": 3.79, "grad_norm": 0.9861748502692437, "learning_rate": 0.0001, "loss": 0.8346, "step": 12600 }, { "ce_loss": 0.8302571523189545, "epoch": 3.79, "inp_emb_norm": 0.353515625, "loss": 0.8302571523189545, "masked_top1": 50.41721633911133, "masked_top5": 74.37043991088868, "step": 12600, "top1": 87.77974655151367, "top5": 96.57415542602538 }, { "epoch": 3.8, "grad_norm": 1.0316691583599322, "learning_rate": 0.0001, "loss": 0.8311, "step": 12650 }, { "ce_loss": 0.8366478300094604, "epoch": 3.8, "inp_emb_norm": 0.358125, "loss": 0.8366478300094604, "masked_top1": 49.738552017211916, "masked_top5": 72.98569618225098, "step": 12650, "top1": 87.68307205200195, "top5": 96.43355926513672 }, { "epoch": 3.82, "grad_norm": 1.079061755706453, "learning_rate": 0.0001, "loss": 0.8409, "step": 12700 }, { "ce_loss": 0.8572568881511688, "epoch": 3.82, "inp_emb_norm": 0.358984375, "loss": 0.8572568881511688, "masked_top1": 47.16713302612305, "masked_top5": 71.66687362670899, "step": 12700, "top1": 87.3316879272461, "top5": 96.28264602661133 }, { "epoch": 3.83, "grad_norm": 1.1113987431562926, "learning_rate": 0.0001, "loss": 0.8278, "step": 12750 }, { "ce_loss": 0.8314487624168396, "epoch": 3.83, "inp_emb_norm": 0.35171875, "loss": 0.8314487624168396, "masked_top1": 49.02839782714844, "masked_top5": 73.43093780517579, "step": 12750, "top1": 87.7519515991211, "top5": 96.49931549072265 }, { "epoch": 3.85, "grad_norm": 1.0978179635851295, "learning_rate": 0.0001, "loss": 0.8332, "step": 12800 }, { "ce_loss": 0.8298409843444824, "epoch": 3.85, "inp_emb_norm": 0.3492578125, "loss": 0.8298409843444824, "masked_top1": 50.06224174499512, "masked_top5": 73.62540901184082, "step": 12800, "top1": 87.73297302246094, "top5": 96.57409362792968 }, { "epoch": 3.86, "grad_norm": 0.9650541842630127, "learning_rate": 0.0001, "loss": 0.8372, "step": 12850 }, { "ce_loss": 0.8380164694786072, "epoch": 3.86, "inp_emb_norm": 0.35796875, "loss": 0.8380164694786072, "masked_top1": 48.93704933166504, "masked_top5": 72.84653518676758, "step": 12850, "top1": 87.66362426757813, "top5": 96.39145156860351 }, { "epoch": 3.88, "grad_norm": 0.9849217897777546, "learning_rate": 0.0001, "loss": 0.8489, "step": 12900 }, { "ce_loss": 0.8715607190132141, "epoch": 3.88, "inp_emb_norm": 0.35453125, "loss": 0.8715607190132141, "masked_top1": 46.82566688537597, "masked_top5": 71.51848426818847, "step": 12900, "top1": 87.30749450683594, "top5": 96.20992965698242 }, { "epoch": 3.89, "grad_norm": 1.0587318645397439, "learning_rate": 0.0001, "loss": 0.8513, "step": 12950 }, { "ce_loss": 0.8459929120540619, "epoch": 3.89, "inp_emb_norm": 0.3545703125, "loss": 0.8459929120540619, "masked_top1": 47.957108154296876, "masked_top5": 72.73884170532227, "step": 12950, "top1": 87.4450048828125, "top5": 96.42510208129883 }, { "epoch": 3.91, "grad_norm": 1.008453804185378, "learning_rate": 0.0001, "loss": 0.8499, "step": 13000 }, { "ce_loss": 0.8543168365955353, "epoch": 3.91, "inp_emb_norm": 0.35265625, "loss": 0.8543168365955353, "masked_top1": 48.10952255249023, "masked_top5": 72.79139190673828, "step": 13000, "top1": 87.43768081665038, "top5": 96.36415054321289 }, { "epoch": 3.92, "grad_norm": 1.1029061736641703, "learning_rate": 0.0001, "loss": 0.8477, "step": 13050 }, { "ce_loss": 0.8511655080318451, "epoch": 3.92, "inp_emb_norm": 0.3517578125, "loss": 0.8511655080318451, "masked_top1": 47.82553955078125, "masked_top5": 72.87044403076172, "step": 13050, "top1": 87.3859359741211, "top5": 96.47525772094727 }, { "epoch": 3.94, "grad_norm": 1.075253739387409, "learning_rate": 0.0001, "loss": 0.8483, "step": 13100 }, { "ce_loss": 0.8585194671154022, "epoch": 3.94, "inp_emb_norm": 0.3530859375, "loss": 0.8585194671154022, "masked_top1": 48.05498847961426, "masked_top5": 72.10926498413086, "step": 13100, "top1": 87.33003082275391, "top5": 96.38634353637696 }, { "epoch": 3.95, "grad_norm": 1.06337585097062, "learning_rate": 0.0001, "loss": 0.8512, "step": 13150 }, { "ce_loss": 0.8486355948448181, "epoch": 3.95, "inp_emb_norm": 0.3512109375, "loss": 0.8486355948448181, "masked_top1": 47.7544295501709, "masked_top5": 72.88016632080078, "step": 13150, "top1": 87.42533172607422, "top5": 96.4359049987793 }, { "epoch": 3.97, "grad_norm": 1.0262462724728467, "learning_rate": 0.0001, "loss": 0.8484, "step": 13200 }, { "ce_loss": 0.850223616361618, "epoch": 3.97, "inp_emb_norm": 0.3545703125, "loss": 0.850223616361618, "masked_top1": 49.058433532714844, "masked_top5": 73.50660820007325, "step": 13200, "top1": 87.42515289306641, "top5": 96.44447158813476 }, { "epoch": 3.98, "grad_norm": 1.1183732342762185, "learning_rate": 0.0001, "loss": 0.8496, "step": 13250 }, { "ce_loss": 0.8533316111564636, "epoch": 3.98, "inp_emb_norm": 0.3624609375, "loss": 0.8533316111564636, "masked_top1": 47.88730712890625, "masked_top5": 72.5444091796875, "step": 13250, "top1": 87.41307250976563, "top5": 96.35140655517579 }, { "epoch": 4.0, "grad_norm": 1.0167762679603471, "learning_rate": 0.0001, "loss": 0.8556, "step": 13300 }, { "ce_loss": 0.8482355761528015, "epoch": 4.0, "inp_emb_norm": 0.36, "loss": 0.8482355761528015, "masked_top1": 48.77355934143066, "masked_top5": 72.95730499267579, "step": 13300, "top1": 87.49448638916016, "top5": 96.36580871582031 }, { "epoch": 4.02, "grad_norm": 0.9619800299228121, "learning_rate": 0.0001, "loss": 0.4647, "step": 13350 }, { "ce_loss": 0.4654207336902618, "epoch": 4.02, "inp_emb_norm": 0.3638671875, "loss": 0.4654207336902618, "masked_top1": 69.25712104797363, "masked_top5": 90.11044219970704, "step": 13350, "top1": 92.98291519165039, "top5": 98.58249725341797 }, { "epoch": 4.03, "grad_norm": 1.0911634549199811, "learning_rate": 0.0001, "loss": 0.4589, "step": 13400 }, { "ce_loss": 0.45429064869880675, "epoch": 4.03, "inp_emb_norm": 0.3570703125, "loss": 0.45429064869880675, "masked_top1": 69.99664253234863, "masked_top5": 90.96843032836914, "step": 13400, "top1": 93.03666076660156, "top5": 98.7020344543457 }, { "epoch": 4.05, "grad_norm": 1.055104414353352, "learning_rate": 0.0001, "loss": 0.463, "step": 13450 }, { "ce_loss": 0.4745191448926926, "epoch": 4.05, "inp_emb_norm": 0.3669921875, "loss": 0.4745191448926926, "masked_top1": 67.61885986328124, "masked_top5": 89.49128158569336, "step": 13450, "top1": 92.77124450683594, "top5": 98.49986145019531 }, { "epoch": 4.06, "grad_norm": 1.134443079230167, "learning_rate": 0.0001, "loss": 0.4584, "step": 13500 }, { "ce_loss": 0.45621369063854217, "epoch": 4.06, "inp_emb_norm": 0.3608984375, "loss": 0.45621369063854217, "masked_top1": 68.7521329498291, "masked_top5": 90.43077987670898, "step": 13500, "top1": 92.95487716674805, "top5": 98.66310028076173 }, { "epoch": 4.08, "grad_norm": 1.0109539437689439, "learning_rate": 0.0001, "loss": 0.4666, "step": 13550 }, { "ce_loss": 0.47258255898952484, "epoch": 4.08, "inp_emb_norm": 0.3544921875, "loss": 0.47258255898952484, "masked_top1": 68.48685668945312, "masked_top5": 90.1821842956543, "step": 13550, "top1": 92.76567596435547, "top5": 98.59864028930664 }, { "epoch": 4.09, "grad_norm": 1.105379838900382, "learning_rate": 0.0001, "loss": 0.4745, "step": 13600 }, { "ce_loss": 0.4720061844587326, "epoch": 4.09, "inp_emb_norm": 0.3646484375, "loss": 0.4720061844587326, "masked_top1": 67.28578758239746, "masked_top5": 88.81673706054687, "step": 13600, "top1": 92.7738314819336, "top5": 98.49870178222656 }, { "epoch": 4.11, "grad_norm": 1.0445145007542662, "learning_rate": 0.0001, "loss": 0.4766, "step": 13650 }, { "ce_loss": 0.47552256405353543, "epoch": 4.11, "inp_emb_norm": 0.363515625, "loss": 0.47552256405353543, "masked_top1": 67.69092002868652, "masked_top5": 89.2786703491211, "step": 13650, "top1": 92.75520309448243, "top5": 98.50775588989258 }, { "epoch": 4.12, "grad_norm": 1.0910427976634176, "learning_rate": 0.0001, "loss": 0.4854, "step": 13700 }, { "ce_loss": 0.4856406021118164, "epoch": 4.12, "inp_emb_norm": 0.356484375, "loss": 0.4856406021118164, "masked_top1": 67.06616600036621, "masked_top5": 89.24904571533203, "step": 13700, "top1": 92.60382766723633, "top5": 98.53153411865235 }, { "epoch": 4.14, "grad_norm": 1.06712952932014, "learning_rate": 0.0001, "loss": 0.487, "step": 13750 }, { "ce_loss": 0.47847113251686096, "epoch": 4.14, "inp_emb_norm": 0.3590234375, "loss": 0.47847113251686096, "masked_top1": 67.56794845581055, "masked_top5": 89.76878845214844, "step": 13750, "top1": 92.65390167236328, "top5": 98.56003234863282 }, { "epoch": 4.15, "grad_norm": 1.1426918899106289, "learning_rate": 0.0001, "loss": 0.4945, "step": 13800 }, { "ce_loss": 0.502599538564682, "epoch": 4.15, "inp_emb_norm": 0.3619140625, "loss": 0.502599538564682, "masked_top1": 64.62048377990723, "masked_top5": 87.96943252563477, "step": 13800, "top1": 92.32529083251953, "top5": 98.33552108764648 }, { "epoch": 4.17, "grad_norm": 1.0640372514854988, "learning_rate": 0.0001, "loss": 0.4917, "step": 13850 }, { "ce_loss": 0.48276497185230255, "epoch": 4.17, "inp_emb_norm": 0.3612890625, "loss": 0.48276497185230255, "masked_top1": 66.87754806518555, "masked_top5": 89.40460830688477, "step": 13850, "top1": 92.6048323059082, "top5": 98.5196612548828 }, { "epoch": 4.18, "grad_norm": 1.0976346428642292, "learning_rate": 0.0001, "loss": 0.5333, "step": 13900 }, { "ce_loss": 0.5117572790384293, "epoch": 4.18, "inp_emb_norm": 0.3601171875, "loss": 0.5117572790384293, "masked_top1": 64.48737777709961, "masked_top5": 87.95527038574218, "step": 13900, "top1": 92.178935546875, "top5": 98.36900207519531 }, { "epoch": 4.2, "grad_norm": 1.0719371859267879, "learning_rate": 0.0001, "loss": 0.5027, "step": 13950 }, { "ce_loss": 0.506566162109375, "epoch": 4.2, "inp_emb_norm": 0.36140625, "loss": 0.506566162109375, "masked_top1": 64.94912460327149, "masked_top5": 88.18265213012695, "step": 13950, "top1": 92.29801971435546, "top5": 98.34425674438477 }, { "epoch": 4.21, "grad_norm": 1.1253588379337218, "learning_rate": 0.0001, "loss": 0.5035, "step": 14000 }, { "ce_loss": 0.5121178191900253, "epoch": 4.21, "inp_emb_norm": 0.3689453125, "loss": 0.5121178191900253, "masked_top1": 64.75304649353028, "masked_top5": 88.02303543090821, "step": 14000, "top1": 92.1805793762207, "top5": 98.36868255615235 }, { "epoch": 4.23, "grad_norm": 1.0564639466498615, "learning_rate": 0.0001, "loss": 0.5095, "step": 14050 }, { "ce_loss": 0.5051852202415467, "epoch": 4.23, "inp_emb_norm": 0.3665234375, "loss": 0.5051852202415467, "masked_top1": 64.94737861633301, "masked_top5": 87.73623580932617, "step": 14050, "top1": 92.28643096923828, "top5": 98.35600860595703 }, { "epoch": 4.24, "grad_norm": 1.054309146882263, "learning_rate": 0.0001, "loss": 0.5128, "step": 14100 }, { "ce_loss": 0.5164009261131287, "epoch": 4.24, "inp_emb_norm": 0.3551171875, "loss": 0.5164009261131287, "masked_top1": 64.0973461151123, "masked_top5": 87.07987808227539, "step": 14100, "top1": 92.10574264526367, "top5": 98.30786682128907 }, { "epoch": 4.26, "grad_norm": 1.074877585526465, "learning_rate": 0.0001, "loss": 0.518, "step": 14150 }, { "ce_loss": 0.51701247215271, "epoch": 4.26, "inp_emb_norm": 0.36, "loss": 0.51701247215271, "masked_top1": 63.153812866210934, "masked_top5": 87.39238418579102, "step": 14150, "top1": 92.00238998413086, "top5": 98.3110905456543 }, { "epoch": 4.27, "grad_norm": 1.0964266251443575, "learning_rate": 0.0001, "loss": 0.5182, "step": 14200 }, { "ce_loss": 0.5284781348705292, "epoch": 4.27, "inp_emb_norm": 0.362265625, "loss": 0.5284781348705292, "masked_top1": 63.75819221496582, "masked_top5": 87.80308670043945, "step": 14200, "top1": 91.9805062866211, "top5": 98.30166961669921 }, { "epoch": 4.29, "grad_norm": 0.9313191633498797, "learning_rate": 0.0001, "loss": 0.5323, "step": 14250 }, { "ce_loss": 0.5334083133935928, "epoch": 4.29, "inp_emb_norm": 0.366328125, "loss": 0.5334083133935928, "masked_top1": 63.2767244720459, "masked_top5": 86.49884506225585, "step": 14250, "top1": 91.98057006835937, "top5": 98.21974060058594 }, { "epoch": 4.3, "grad_norm": 1.049137459405467, "learning_rate": 0.0001, "loss": 0.5128, "step": 14300 }, { "ce_loss": 0.5050585663318634, "epoch": 4.3, "inp_emb_norm": 0.3669140625, "loss": 0.5050585663318634, "masked_top1": 65.1010538482666, "masked_top5": 88.11703536987305, "step": 14300, "top1": 92.21394805908203, "top5": 98.39463333129883 }, { "epoch": 4.32, "grad_norm": 1.0912111332884709, "learning_rate": 0.0001, "loss": 0.5219, "step": 14350 }, { "ce_loss": 0.5061863285303115, "epoch": 4.32, "inp_emb_norm": 0.3662890625, "loss": 0.5061863285303115, "masked_top1": 64.16183708190918, "masked_top5": 87.73145401000977, "step": 14350, "top1": 92.20845031738281, "top5": 98.3388949584961 }, { "epoch": 4.33, "grad_norm": 1.0296955736225974, "learning_rate": 0.0001, "loss": 0.531, "step": 14400 }, { "ce_loss": 0.5429442119598389, "epoch": 4.33, "inp_emb_norm": 0.3691796875, "loss": 0.5429442119598389, "masked_top1": 61.77721855163574, "masked_top5": 85.85552307128906, "step": 14400, "top1": 91.75323486328125, "top5": 98.1361979675293 }, { "epoch": 4.35, "grad_norm": 1.1454314011018045, "learning_rate": 0.0001, "loss": 0.531, "step": 14450 }, { "ce_loss": 0.5351821321249008, "epoch": 4.35, "inp_emb_norm": 0.369140625, "loss": 0.5351821321249008, "masked_top1": 62.75261306762695, "masked_top5": 86.70513565063476, "step": 14450, "top1": 91.81992034912109, "top5": 98.24479461669922 }, { "epoch": 4.36, "grad_norm": 1.0627879361115289, "learning_rate": 0.0001, "loss": 0.5354, "step": 14500 }, { "ce_loss": 0.5213436669111252, "epoch": 4.36, "inp_emb_norm": 0.37203125, "loss": 0.5213436669111252, "masked_top1": 62.81369560241699, "masked_top5": 86.35222900390625, "step": 14500, "top1": 92.11218536376953, "top5": 98.20223754882812 }, { "epoch": 4.38, "grad_norm": 1.0187687495343714, "learning_rate": 0.0001, "loss": 0.5316, "step": 14550 }, { "ce_loss": 0.5346812665462494, "epoch": 4.38, "inp_emb_norm": 0.364140625, "loss": 0.5346812665462494, "masked_top1": 63.61943054199219, "masked_top5": 87.0892349243164, "step": 14550, "top1": 91.8150325012207, "top5": 98.27966354370118 }, { "epoch": 4.39, "grad_norm": 0.9591126150175121, "learning_rate": 0.0001, "loss": 0.5376, "step": 14600 }, { "ce_loss": 0.5374365419149398, "epoch": 4.39, "inp_emb_norm": 0.361484375, "loss": 0.5374365419149398, "masked_top1": 61.88925651550293, "masked_top5": 86.4749966430664, "step": 14600, "top1": 91.70243728637695, "top5": 98.21218734741211 }, { "epoch": 4.41, "grad_norm": 1.1233501033048685, "learning_rate": 0.0001, "loss": 0.5384, "step": 14650 }, { "ce_loss": 0.5430423647165299, "epoch": 4.41, "inp_emb_norm": 0.3658984375, "loss": 0.5430423647165299, "masked_top1": 62.05436401367187, "masked_top5": 86.22527664184571, "step": 14650, "top1": 91.75454711914062, "top5": 98.14460327148437 }, { "epoch": 4.42, "grad_norm": 1.0492889712367255, "learning_rate": 0.0001, "loss": 0.5393, "step": 14700 }, { "ce_loss": 0.5455965805053711, "epoch": 4.42, "inp_emb_norm": 0.369140625, "loss": 0.5455965805053711, "masked_top1": 61.81470611572266, "masked_top5": 85.87024856567383, "step": 14700, "top1": 91.6540138244629, "top5": 98.14372894287109 }, { "epoch": 4.44, "grad_norm": 1.0520563782836545, "learning_rate": 0.0001, "loss": 0.5473, "step": 14750 }, { "ce_loss": 0.5409998238086701, "epoch": 4.44, "inp_emb_norm": 0.3708984375, "loss": 0.5409998238086701, "masked_top1": 62.29138427734375, "masked_top5": 86.59371627807617, "step": 14750, "top1": 91.61244201660156, "top5": 98.20165710449218 }, { "epoch": 4.45, "grad_norm": 1.0819587007503622, "learning_rate": 0.0001, "loss": 0.5475, "step": 14800 }, { "ce_loss": 0.5518389946222305, "epoch": 4.45, "inp_emb_norm": 0.3651953125, "loss": 0.5518389946222305, "masked_top1": 60.76948570251465, "masked_top5": 85.64509750366211, "step": 14800, "top1": 91.58786865234374, "top5": 98.13917205810547 }, { "epoch": 4.47, "grad_norm": 1.2070825391159268, "learning_rate": 0.0001, "loss": 0.5455, "step": 14850 }, { "ce_loss": 0.5443538892269134, "epoch": 4.47, "inp_emb_norm": 0.3718359375, "loss": 0.5443538892269134, "masked_top1": 62.73788932800293, "masked_top5": 86.95017120361328, "step": 14850, "top1": 91.70748413085937, "top5": 98.2123405456543 }, { "epoch": 4.48, "grad_norm": 1.0786306746042886, "learning_rate": 0.0001, "loss": 0.5447, "step": 14900 }, { "ce_loss": 0.5514276492595672, "epoch": 4.48, "inp_emb_norm": 0.36875, "loss": 0.5514276492595672, "masked_top1": 62.11178161621094, "masked_top5": 85.72534118652344, "step": 14900, "top1": 91.74148101806641, "top5": 98.11538375854492 }, { "epoch": 4.5, "grad_norm": 0.9911108641898027, "learning_rate": 0.0001, "loss": 0.5571, "step": 14950 }, { "ce_loss": 0.5549844616651535, "epoch": 4.5, "inp_emb_norm": 0.366484375, "loss": 0.5549844616651535, "masked_top1": 61.723852920532224, "masked_top5": 86.07279510498047, "step": 14950, "top1": 91.52112564086914, "top5": 98.13692977905274 }, { "epoch": 4.51, "grad_norm": 1.0479818912700751, "learning_rate": 0.0001, "loss": 0.5508, "step": 15000 }, { "ce_loss": 0.5503324097394944, "epoch": 4.51, "inp_emb_norm": 0.36609375, "loss": 0.5503324097394944, "masked_top1": 62.31337753295898, "masked_top5": 85.98730865478515, "step": 15000, "top1": 91.61835815429687, "top5": 98.14640243530273 }, { "epoch": 4.53, "grad_norm": 1.0126345802253716, "learning_rate": 0.0001, "loss": 0.5604, "step": 15050 }, { "ce_loss": 0.5613218837976456, "epoch": 4.53, "inp_emb_norm": 0.368203125, "loss": 0.5613218837976456, "masked_top1": 60.66889373779297, "masked_top5": 85.60351181030273, "step": 15050, "top1": 91.41504837036133, "top5": 98.12169372558594 }, { "epoch": 4.54, "grad_norm": 1.1194678195459478, "learning_rate": 0.0001, "loss": 0.558, "step": 15100 }, { "ce_loss": 0.5630833846330643, "epoch": 4.54, "inp_emb_norm": 0.368828125, "loss": 0.5630833846330643, "masked_top1": 60.917673873901364, "masked_top5": 85.1576773071289, "step": 15100, "top1": 91.41613555908204, "top5": 98.04640029907226 }, { "epoch": 4.56, "grad_norm": 1.0532041635279352, "learning_rate": 0.0001, "loss": 0.5558, "step": 15150 }, { "ce_loss": 0.5506490439176559, "epoch": 4.56, "inp_emb_norm": 0.366640625, "loss": 0.5506490439176559, "masked_top1": 62.93031044006348, "masked_top5": 86.40555801391602, "step": 15150, "top1": 91.61476593017578, "top5": 98.21818695068359 }, { "epoch": 4.57, "grad_norm": 1.1553777996600965, "learning_rate": 0.0001, "loss": 0.5592, "step": 15200 }, { "ce_loss": 0.5720086497068405, "epoch": 4.57, "inp_emb_norm": 0.367890625, "loss": 0.5720086497068405, "masked_top1": 60.15533386230469, "masked_top5": 84.74554382324219, "step": 15200, "top1": 91.33564956665039, "top5": 98.00524505615235 }, { "epoch": 4.59, "grad_norm": 1.0565117376954045, "learning_rate": 0.0001, "loss": 0.5639, "step": 15250 }, { "ce_loss": 0.5550442606210708, "epoch": 4.59, "inp_emb_norm": 0.3610546875, "loss": 0.5550442606210708, "masked_top1": 61.29122528076172, "masked_top5": 85.56437545776367, "step": 15250, "top1": 91.44481323242188, "top5": 98.13257965087891 }, { "epoch": 4.6, "grad_norm": 0.971805815056899, "learning_rate": 0.0001, "loss": 0.5632, "step": 15300 }, { "ce_loss": 0.565330091714859, "epoch": 4.6, "inp_emb_norm": 0.3659375, "loss": 0.565330091714859, "masked_top1": 60.39768196105957, "masked_top5": 85.04548843383789, "step": 15300, "top1": 91.39098709106446, "top5": 98.0403810119629 }, { "epoch": 4.62, "grad_norm": 1.0545177592362711, "learning_rate": 0.0001, "loss": 0.5666, "step": 15350 }, { "ce_loss": 0.5629042333364487, "epoch": 4.62, "inp_emb_norm": 0.363046875, "loss": 0.5629042333364487, "masked_top1": 61.29995155334473, "masked_top5": 85.63533966064453, "step": 15350, "top1": 91.3496403503418, "top5": 98.15489151000976 }, { "epoch": 4.63, "grad_norm": 1.0664326850250532, "learning_rate": 0.0001, "loss": 0.5688, "step": 15400 }, { "ce_loss": 0.5751272231340409, "epoch": 4.63, "inp_emb_norm": 0.3648046875, "loss": 0.5751272231340409, "masked_top1": 59.40922256469727, "masked_top5": 84.36576995849609, "step": 15400, "top1": 91.27351089477538, "top5": 97.98653579711915 }, { "epoch": 4.65, "grad_norm": 1.1863731412253264, "learning_rate": 0.0001, "loss": 0.5736, "step": 15450 }, { "ce_loss": 0.5713528543710709, "epoch": 4.65, "inp_emb_norm": 0.36609375, "loss": 0.5713528543710709, "masked_top1": 60.48031639099121, "masked_top5": 84.88833801269531, "step": 15450, "top1": 91.37643188476562, "top5": 98.01537475585937 }, { "epoch": 4.66, "grad_norm": 1.0582685713420434, "learning_rate": 0.0001, "loss": 0.5666, "step": 15500 }, { "ce_loss": 0.5606839144229889, "epoch": 4.66, "inp_emb_norm": 0.3706640625, "loss": 0.5606839144229889, "masked_top1": 61.53562179565429, "masked_top5": 85.30425521850586, "step": 15500, "top1": 91.48294464111328, "top5": 98.07853866577149 }, { "epoch": 4.68, "grad_norm": 1.1085095153878863, "learning_rate": 0.0001, "loss": 0.5639, "step": 15550 }, { "ce_loss": 0.5658587354421616, "epoch": 4.68, "inp_emb_norm": 0.3694921875, "loss": 0.5658587354421616, "masked_top1": 60.92467597961426, "masked_top5": 85.25927993774414, "step": 15550, "top1": 91.32387847900391, "top5": 98.04909301757813 }, { "epoch": 4.69, "grad_norm": 1.0956454635831958, "learning_rate": 0.0001, "loss": 0.5628, "step": 15600 }, { "ce_loss": 0.5535443860292435, "epoch": 4.69, "inp_emb_norm": 0.3674609375, "loss": 0.5535443860292435, "masked_top1": 62.16223007202149, "masked_top5": 85.78218490600585, "step": 15600, "top1": 91.56584075927735, "top5": 98.14968353271485 }, { "epoch": 4.71, "grad_norm": 1.0299728685580238, "learning_rate": 0.0001, "loss": 0.5815, "step": 15650 }, { "ce_loss": 0.5903949171304703, "epoch": 4.71, "inp_emb_norm": 0.366796875, "loss": 0.5903949171304703, "masked_top1": 59.38263565063477, "masked_top5": 84.16266036987305, "step": 15650, "top1": 91.00887252807617, "top5": 97.9221858215332 }, { "epoch": 4.72, "grad_norm": 1.0470383149335234, "learning_rate": 0.0001, "loss": 0.5781, "step": 15700 }, { "ce_loss": 0.5768050736188889, "epoch": 4.72, "inp_emb_norm": 0.3702734375, "loss": 0.5768050736188889, "masked_top1": 60.77295166015625, "masked_top5": 84.56514495849609, "step": 15700, "top1": 91.20448944091797, "top5": 97.95070877075196 }, { "epoch": 4.74, "grad_norm": 1.0460925861543287, "learning_rate": 0.0001, "loss": 0.5703, "step": 15750 }, { "ce_loss": 0.5766702961921691, "epoch": 4.74, "inp_emb_norm": 0.367265625, "loss": 0.5766702961921691, "masked_top1": 60.788666229248044, "masked_top5": 84.2236555480957, "step": 15750, "top1": 91.25414123535157, "top5": 97.99418731689452 }, { "epoch": 4.75, "grad_norm": 1.0639938057731058, "learning_rate": 0.0001, "loss": 0.576, "step": 15800 }, { "ce_loss": 0.5707284951210022, "epoch": 4.75, "inp_emb_norm": 0.37234375, "loss": 0.5707284951210022, "masked_top1": 60.47222633361817, "masked_top5": 85.08194046020508, "step": 15800, "top1": 91.27689331054688, "top5": 98.0308624267578 }, { "epoch": 4.77, "grad_norm": 1.0685967682651536, "learning_rate": 0.0001, "loss": 0.5746, "step": 15850 }, { "ce_loss": 0.57530591070652, "epoch": 4.77, "inp_emb_norm": 0.369296875, "loss": 0.57530591070652, "masked_top1": 60.0161856842041, "masked_top5": 84.5344613647461, "step": 15850, "top1": 91.30489456176758, "top5": 98.01641876220702 }, { "epoch": 4.78, "grad_norm": 1.0744915511743809, "learning_rate": 0.0001, "loss": 0.5806, "step": 15900 }, { "ce_loss": 0.5880693066120147, "epoch": 4.78, "inp_emb_norm": 0.37296875, "loss": 0.5880693066120147, "masked_top1": 59.781610260009764, "masked_top5": 84.03608184814453, "step": 15900, "top1": 91.16118774414062, "top5": 97.90285369873047 }, { "epoch": 4.8, "grad_norm": 1.1777116989988736, "learning_rate": 0.0001, "loss": 0.5841, "step": 15950 }, { "ce_loss": 0.5911537754535675, "epoch": 4.8, "inp_emb_norm": 0.3667578125, "loss": 0.5911537754535675, "masked_top1": 59.32529609680176, "masked_top5": 84.31457275390625, "step": 15950, "top1": 90.98917724609375, "top5": 97.9403483581543 }, { "epoch": 4.81, "grad_norm": 1.1564579774891486, "learning_rate": 0.0001, "loss": 0.5796, "step": 16000 }, { "ce_loss": 0.5745090502500534, "epoch": 4.81, "inp_emb_norm": 0.3758203125, "loss": 0.5745090502500534, "masked_top1": 60.10156044006348, "masked_top5": 85.1797428894043, "step": 16000, "top1": 91.18394180297851, "top5": 98.0737141418457 }, { "epoch": 4.83, "grad_norm": 1.1492330940204298, "learning_rate": 0.0001, "loss": 0.5883, "step": 16050 }, { "ce_loss": 0.5925427573919296, "epoch": 4.83, "inp_emb_norm": 0.3705078125, "loss": 0.5925427573919296, "masked_top1": 59.59210731506348, "masked_top5": 84.50634429931641, "step": 16050, "top1": 91.06646270751953, "top5": 97.93232162475586 }, { "epoch": 4.84, "grad_norm": 1.0605567187549139, "learning_rate": 0.0001, "loss": 0.5859, "step": 16100 }, { "ce_loss": 0.584769184589386, "epoch": 4.84, "inp_emb_norm": 0.376796875, "loss": 0.584769184589386, "masked_top1": 60.4832283782959, "masked_top5": 84.92732849121094, "step": 16100, "top1": 91.11549453735351, "top5": 97.98623245239258 }, { "epoch": 4.86, "grad_norm": 1.0454099861852648, "learning_rate": 0.0001, "loss": 0.5788, "step": 16150 }, { "ce_loss": 0.5863047724962235, "epoch": 4.86, "inp_emb_norm": 0.3832421875, "loss": 0.5863047724962235, "masked_top1": 58.98088394165039, "masked_top5": 84.14134887695313, "step": 16150, "top1": 91.02991897583007, "top5": 97.89835571289062 }, { "epoch": 4.87, "grad_norm": 1.1813560843799207, "learning_rate": 0.0001, "loss": 0.5937, "step": 16200 }, { "ce_loss": 0.5899865692853927, "epoch": 4.87, "inp_emb_norm": 0.3682421875, "loss": 0.5899865692853927, "masked_top1": 59.65545417785645, "masked_top5": 83.82899841308594, "step": 16200, "top1": 90.97882614135742, "top5": 97.96341979980468 }, { "epoch": 4.89, "grad_norm": 1.1257949963772835, "learning_rate": 0.0001, "loss": 0.6029, "step": 16250 }, { "ce_loss": 0.6203358447551728, "epoch": 4.89, "inp_emb_norm": 0.3719921875, "loss": 0.6203358447551728, "masked_top1": 58.96188240051269, "masked_top5": 83.78718704223633, "step": 16250, "top1": 90.81191055297852, "top5": 97.87782455444336 }, { "epoch": 4.9, "grad_norm": 1.0932714591709916, "learning_rate": 0.0001, "loss": 0.5876, "step": 16300 }, { "ce_loss": 0.5763556951284409, "epoch": 4.9, "inp_emb_norm": 0.37578125, "loss": 0.5763556951284409, "masked_top1": 60.64178207397461, "masked_top5": 85.0282049560547, "step": 16300, "top1": 91.18880279541015, "top5": 98.01950302124024 }, { "epoch": 4.92, "grad_norm": 0.9495406306411024, "learning_rate": 0.0001, "loss": 0.5891, "step": 16350 }, { "ce_loss": 0.5920463001728058, "epoch": 4.92, "inp_emb_norm": 0.3779296875, "loss": 0.5920463001728058, "masked_top1": 60.18883232116699, "masked_top5": 83.78975204467774, "step": 16350, "top1": 91.02826797485352, "top5": 97.88983917236328 }, { "epoch": 4.93, "grad_norm": 1.117017854175032, "learning_rate": 0.0001, "loss": 0.5989, "step": 16400 }, { "ce_loss": 0.6076039922237396, "epoch": 4.93, "inp_emb_norm": 0.3727734375, "loss": 0.6076039922237396, "masked_top1": 58.69016136169434, "masked_top5": 83.0786215209961, "step": 16400, "top1": 90.75551071166993, "top5": 97.82947280883789 }, { "epoch": 4.95, "grad_norm": 1.071523606880974, "learning_rate": 0.0001, "loss": 0.6012, "step": 16450 }, { "ce_loss": 0.6065960395336151, "epoch": 4.95, "inp_emb_norm": 0.380703125, "loss": 0.6065960395336151, "masked_top1": 58.35442459106445, "masked_top5": 83.6309603881836, "step": 16450, "top1": 90.80092453002929, "top5": 97.82826370239258 }, { "epoch": 4.96, "grad_norm": 1.100617719882716, "learning_rate": 0.0001, "loss": 0.5942, "step": 16500 }, { "ce_loss": 0.5968893599510193, "epoch": 4.96, "inp_emb_norm": 0.376484375, "loss": 0.5968893599510193, "masked_top1": 58.843356704711915, "masked_top5": 83.53122055053711, "step": 16500, "top1": 90.88707092285156, "top5": 97.87701583862305 }, { "epoch": 4.98, "grad_norm": 1.0119812517995022, "learning_rate": 0.0001, "loss": 0.6025, "step": 16550 }, { "ce_loss": 0.5882117158174515, "epoch": 4.98, "inp_emb_norm": 0.374296875, "loss": 0.5882117158174515, "masked_top1": 60.96791168212891, "masked_top5": 84.55708404541015, "step": 16550, "top1": 91.02984649658202, "top5": 97.95657623291015 }, { "epoch": 4.99, "grad_norm": 1.0572442138344962, "learning_rate": 0.0001, "loss": 0.5944, "step": 16600 }, { "ce_loss": 0.5818337166309356, "epoch": 4.99, "inp_emb_norm": 0.3735546875, "loss": 0.5818337166309356, "masked_top1": 60.71773277282715, "masked_top5": 85.01761001586914, "step": 16600, "top1": 91.06825744628907, "top5": 98.01745468139649 }, { "epoch": 5.01, "grad_norm": 0.8868244741347852, "learning_rate": 0.0001, "loss": 0.4283, "step": 16650 }, { "ce_loss": 0.4169870808720589, "epoch": 5.01, "inp_emb_norm": 0.3746484375, "loss": 0.4169870808720589, "masked_top1": 74.96077354431152, "masked_top5": 91.62375808715821, "step": 16650, "top1": 93.5944010925293, "top5": 98.75289016723633 }, { "epoch": 5.02, "grad_norm": 0.8006136974249275, "learning_rate": 0.0001, "loss": 0.24, "step": 16700 }, { "ce_loss": 0.23759305894374846, "epoch": 5.02, "inp_emb_norm": 0.377421875, "loss": 0.23759305894374846, "masked_top1": 89.66921447753906, "masked_top5": 98.84856094360352, "step": 16700, "top1": 96.22275146484375, "top5": 99.58460357666016 }, { "epoch": 5.04, "grad_norm": 0.7814579996198231, "learning_rate": 0.0001, "loss": 0.2428, "step": 16750 }, { "ce_loss": 0.24006595432758332, "epoch": 5.04, "inp_emb_norm": 0.3719921875, "loss": 0.24006595432758332, "masked_top1": 89.90816116333008, "masked_top5": 98.82037811279297, "step": 16750, "top1": 96.24885559082031, "top5": 99.55003356933594 }, { "epoch": 5.05, "grad_norm": 0.7858773051800831, "learning_rate": 0.0001, "loss": 0.2395, "step": 16800 }, { "ce_loss": 0.23285773277282715, "epoch": 5.05, "inp_emb_norm": 0.3825390625, "loss": 0.23285773277282715, "masked_top1": 89.65037063598633, "masked_top5": 98.94374557495117, "step": 16800, "top1": 96.30195434570312, "top5": 99.57885467529297 }, { "epoch": 5.07, "grad_norm": 0.8834168839025487, "learning_rate": 0.0001, "loss": 0.2461, "step": 16850 }, { "ce_loss": 0.24451201170682907, "epoch": 5.07, "inp_emb_norm": 0.38234375, "loss": 0.24451201170682907, "masked_top1": 88.87031646728515, "masked_top5": 98.54978637695312, "step": 16850, "top1": 96.15791427612305, "top5": 99.54232421875 }, { "epoch": 5.08, "grad_norm": 0.9643007917325629, "learning_rate": 0.0001, "loss": 0.2558, "step": 16900 }, { "ce_loss": 0.25902660697698593, "epoch": 5.08, "inp_emb_norm": 0.365234375, "loss": 0.25902660697698593, "masked_top1": 88.15865753173829, "masked_top5": 98.6231999206543, "step": 16900, "top1": 95.8116291809082, "top5": 99.52821868896484 }, { "epoch": 5.1, "grad_norm": 0.9187240394700701, "learning_rate": 0.0001, "loss": 0.2653, "step": 16950 }, { "ce_loss": 0.25573085725307465, "epoch": 5.1, "inp_emb_norm": 0.3799609375, "loss": 0.25573085725307465, "masked_top1": 88.29147186279297, "masked_top5": 98.50399887084961, "step": 16950, "top1": 95.97455947875977, "top5": 99.51153411865235 }, { "epoch": 5.11, "grad_norm": 0.8509566798013529, "learning_rate": 0.0001, "loss": 0.2532, "step": 17000 }, { "ce_loss": 0.2532435983419418, "epoch": 5.11, "inp_emb_norm": 0.3784765625, "loss": 0.2532435983419418, "masked_top1": 88.47122055053711, "masked_top5": 98.4746549987793, "step": 17000, "top1": 95.92875396728516, "top5": 99.54647705078125 }, { "epoch": 5.13, "grad_norm": 0.966807468835321, "learning_rate": 0.0001, "loss": 0.2595, "step": 17050 }, { "ce_loss": 0.2636346372961998, "epoch": 5.13, "inp_emb_norm": 0.37484375, "loss": 0.2636346372961998, "masked_top1": 87.61207931518555, "masked_top5": 98.49675857543946, "step": 17050, "top1": 95.76734481811523, "top5": 99.5260922241211 }, { "epoch": 5.14, "grad_norm": 0.8630676671965275, "learning_rate": 0.0001, "loss": 0.2605, "step": 17100 }, { "ce_loss": 0.2629204204678535, "epoch": 5.14, "inp_emb_norm": 0.381796875, "loss": 0.2629204204678535, "masked_top1": 87.41785934448242, "masked_top5": 98.47596908569336, "step": 17100, "top1": 95.73066696166993, "top5": 99.52325912475585 }, { "epoch": 5.16, "grad_norm": 1.0053540322976233, "learning_rate": 0.0001, "loss": 0.2648, "step": 17150 }, { "ce_loss": 0.27128377854824065, "epoch": 5.16, "inp_emb_norm": 0.3784765625, "loss": 0.27128377854824065, "masked_top1": 87.0358544921875, "masked_top5": 98.38056884765625, "step": 17150, "top1": 95.64348907470703, "top5": 99.50871109008789 }, { "epoch": 5.17, "grad_norm": 0.9508275120295281, "learning_rate": 0.0001, "loss": 0.2745, "step": 17200 }, { "ce_loss": 0.27368868499994276, "epoch": 5.17, "inp_emb_norm": 0.3740234375, "loss": 0.27368868499994276, "masked_top1": 87.3692935180664, "masked_top5": 98.23250915527343, "step": 17200, "top1": 95.61584823608399, "top5": 99.49671478271485 }, { "epoch": 5.19, "grad_norm": 0.9646300915973821, "learning_rate": 0.0001, "loss": 0.2814, "step": 17250 }, { "ce_loss": 0.27761764973402026, "epoch": 5.19, "inp_emb_norm": 0.3832421875, "loss": 0.27761764973402026, "masked_top1": 86.05260269165039, "masked_top5": 98.24610961914063, "step": 17250, "top1": 95.51309692382813, "top5": 99.5203547668457 }, { "epoch": 5.2, "grad_norm": 0.9852223949042181, "learning_rate": 0.0001, "loss": 0.2756, "step": 17300 }, { "ce_loss": 0.2694446948170662, "epoch": 5.2, "inp_emb_norm": 0.3767578125, "loss": 0.2694446948170662, "masked_top1": 87.5820735168457, "masked_top5": 98.16199798583985, "step": 17300, "top1": 95.65110595703125, "top5": 99.50478088378907 }, { "epoch": 5.22, "grad_norm": 1.0161825412276837, "learning_rate": 0.0001, "loss": 0.2836, "step": 17350 }, { "ce_loss": 0.2845872187614441, "epoch": 5.22, "inp_emb_norm": 0.38078125, "loss": 0.2845872187614441, "masked_top1": 85.83310623168946, "masked_top5": 97.87667556762695, "step": 17350, "top1": 95.49496994018554, "top5": 99.44075881958008 }, { "epoch": 5.23, "grad_norm": 0.9591201013156013, "learning_rate": 0.0001, "loss": 0.2831, "step": 17400 }, { "ce_loss": 0.28442945539951325, "epoch": 5.23, "inp_emb_norm": 0.3789453125, "loss": 0.28442945539951325, "masked_top1": 86.10271911621093, "masked_top5": 97.88985565185547, "step": 17400, "top1": 95.40376953125, "top5": 99.44743041992187 }, { "epoch": 5.25, "grad_norm": 1.0432247629117117, "learning_rate": 0.0001, "loss": 0.2854, "step": 17450 }, { "ce_loss": 0.27993369311094285, "epoch": 5.25, "inp_emb_norm": 0.3808203125, "loss": 0.27993369311094285, "masked_top1": 86.45830123901368, "masked_top5": 98.15035873413086, "step": 17450, "top1": 95.57389678955079, "top5": 99.48250961303711 }, { "epoch": 5.26, "grad_norm": 0.9701861929167235, "learning_rate": 0.0001, "loss": 0.2907, "step": 17500 }, { "ce_loss": 0.28741121381521223, "epoch": 5.26, "inp_emb_norm": 0.3751171875, "loss": 0.28741121381521223, "masked_top1": 85.9087370300293, "masked_top5": 98.2075733947754, "step": 17500, "top1": 95.36790939331054, "top5": 99.45517150878906 }, { "epoch": 5.28, "grad_norm": 0.9018657474300132, "learning_rate": 0.0001, "loss": 0.2939, "step": 17550 }, { "ce_loss": 0.2932562205195427, "epoch": 5.28, "inp_emb_norm": 0.376015625, "loss": 0.2932562205195427, "masked_top1": 85.43090469360351, "masked_top5": 97.69857574462891, "step": 17550, "top1": 95.35228378295898, "top5": 99.44769515991212 }, { "epoch": 5.29, "grad_norm": 1.0525621107942214, "learning_rate": 0.0001, "loss": 0.2878, "step": 17600 }, { "ce_loss": 0.2939734762907028, "epoch": 5.29, "inp_emb_norm": 0.3778515625, "loss": 0.2939734762907028, "masked_top1": 85.80782012939453, "masked_top5": 98.14864028930664, "step": 17600, "top1": 95.27761352539062, "top5": 99.49363082885742 }, { "epoch": 5.31, "grad_norm": 1.0032239373031064, "learning_rate": 0.0001, "loss": 0.3036, "step": 17650 }, { "ce_loss": 0.3070009741187096, "epoch": 5.31, "inp_emb_norm": 0.37828125, "loss": 0.3070009741187096, "masked_top1": 84.42236633300782, "masked_top5": 97.74387069702148, "step": 17650, "top1": 95.10071838378906, "top5": 99.41670013427735 }, { "epoch": 5.32, "grad_norm": 1.0274167035814015, "learning_rate": 0.0001, "loss": 0.3011, "step": 17700 }, { "ce_loss": 0.307579453587532, "epoch": 5.32, "inp_emb_norm": 0.3801953125, "loss": 0.307579453587532, "masked_top1": 84.19732971191407, "masked_top5": 97.56189498901367, "step": 17700, "top1": 95.09036392211914, "top5": 99.39419525146485 }, { "epoch": 5.34, "grad_norm": 1.019591016333828, "learning_rate": 0.0001, "loss": 0.3006, "step": 17750 }, { "ce_loss": 0.29659480959177015, "epoch": 5.34, "inp_emb_norm": 0.379140625, "loss": 0.29659480959177015, "masked_top1": 84.7645182800293, "masked_top5": 98.02963897705078, "step": 17750, "top1": 95.21526107788085, "top5": 99.45744186401367 }, { "epoch": 5.35, "grad_norm": 1.045966882681033, "learning_rate": 0.0001, "loss": 0.3046, "step": 17800 }, { "ce_loss": 0.3082049387693405, "epoch": 5.35, "inp_emb_norm": 0.3823828125, "loss": 0.3082049387693405, "masked_top1": 84.11483215332031, "masked_top5": 97.67893463134766, "step": 17800, "top1": 95.09591766357421, "top5": 99.39608154296874 }, { "epoch": 5.37, "grad_norm": 0.9692493761597851, "learning_rate": 0.0001, "loss": 0.3087, "step": 17850 }, { "ce_loss": 0.3111723321676254, "epoch": 5.37, "inp_emb_norm": 0.3821875, "loss": 0.3111723321676254, "masked_top1": 83.85779495239258, "masked_top5": 97.46583740234375, "step": 17850, "top1": 94.99774032592774, "top5": 99.40854736328124 }, { "epoch": 5.38, "grad_norm": 0.9440787583366769, "learning_rate": 0.0001, "loss": 0.306, "step": 17900 }, { "ce_loss": 0.31266897201538085, "epoch": 5.38, "inp_emb_norm": 0.380859375, "loss": 0.31266897201538085, "masked_top1": 83.7997444152832, "masked_top5": 97.51995346069336, "step": 17900, "top1": 94.99930786132812, "top5": 99.39216735839844 }, { "epoch": 5.4, "grad_norm": 0.9729412684124459, "learning_rate": 0.0001, "loss": 0.3072, "step": 17950 }, { "ce_loss": 0.30246726602315904, "epoch": 5.4, "inp_emb_norm": 0.385234375, "loss": 0.30246726602315904, "masked_top1": 84.3121369934082, "masked_top5": 97.73384506225585, "step": 17950, "top1": 95.10583526611327, "top5": 99.43766952514649 }, { "epoch": 5.41, "grad_norm": 0.9636765858785842, "learning_rate": 0.0001, "loss": 0.307, "step": 18000 }, { "ce_loss": 0.30428083807229994, "epoch": 5.41, "inp_emb_norm": 0.381484375, "loss": 0.30428083807229994, "masked_top1": 84.8429086303711, "masked_top5": 97.52333801269532, "step": 18000, "top1": 95.1666488647461, "top5": 99.39987106323242 }, { "epoch": 5.43, "grad_norm": 1.0186629227679855, "learning_rate": 0.0001, "loss": 0.3133, "step": 18050 }, { "ce_loss": 0.3080432793498039, "epoch": 5.43, "inp_emb_norm": 0.381015625, "loss": 0.3080432793498039, "masked_top1": 83.97354797363282, "masked_top5": 97.46352157592773, "step": 18050, "top1": 95.05124603271484, "top5": 99.38775970458984 }, { "epoch": 5.44, "grad_norm": 0.9556152250711039, "learning_rate": 0.0001, "loss": 0.3298, "step": 18100 }, { "ce_loss": 0.35822920709848405, "epoch": 5.44, "inp_emb_norm": 0.38015625, "loss": 0.35822920709848405, "masked_top1": 83.25973114013672, "masked_top5": 97.05907196044922, "step": 18100, "top1": 94.64421905517578, "top5": 99.1507731628418 }, { "epoch": 5.46, "grad_norm": 1.0598324926140277, "learning_rate": 0.0001, "loss": 0.3205, "step": 18150 }, { "ce_loss": 0.32162826359272, "epoch": 5.46, "inp_emb_norm": 0.3842578125, "loss": 0.32162826359272, "masked_top1": 83.18663421630859, "masked_top5": 97.19483627319336, "step": 18150, "top1": 94.94844818115234, "top5": 99.35345962524414 }, { "epoch": 5.47, "grad_norm": 0.9530378094960094, "learning_rate": 0.0001, "loss": 0.32, "step": 18200 }, { "ce_loss": 0.3288844656944275, "epoch": 5.47, "inp_emb_norm": 0.3790625, "loss": 0.3288844656944275, "masked_top1": 82.93034057617187, "masked_top5": 97.11549606323243, "step": 18200, "top1": 94.78947708129883, "top5": 99.34183990478516 }, { "epoch": 5.49, "grad_norm": 0.998874046778327, "learning_rate": 0.0001, "loss": 0.3207, "step": 18250 }, { "ce_loss": 0.32029964685440065, "epoch": 5.49, "inp_emb_norm": 0.38546875, "loss": 0.32029964685440065, "masked_top1": 82.87285369873047, "masked_top5": 97.35884552001953, "step": 18250, "top1": 94.86287094116211, "top5": 99.37160079956055 }, { "epoch": 5.5, "grad_norm": 1.0507249584695442, "learning_rate": 0.0001, "loss": 0.3264, "step": 18300 }, { "ce_loss": 0.3225671499967575, "epoch": 5.5, "inp_emb_norm": 0.381953125, "loss": 0.3225671499967575, "masked_top1": 82.94257446289062, "masked_top5": 97.59221542358398, "step": 18300, "top1": 94.790439453125, "top5": 99.39126449584961 }, { "epoch": 5.52, "grad_norm": 1.0538788268467503, "learning_rate": 0.0001, "loss": 0.3272, "step": 18350 }, { "ce_loss": 0.3279902094602585, "epoch": 5.52, "inp_emb_norm": 0.378984375, "loss": 0.3279902094602585, "masked_top1": 82.10417236328125, "masked_top5": 97.07626525878906, "step": 18350, "top1": 94.77208694458008, "top5": 99.34082946777343 }, { "epoch": 5.53, "grad_norm": 1.0082114223647751, "learning_rate": 0.0001, "loss": 0.3246, "step": 18400 }, { "ce_loss": 0.3286957702040672, "epoch": 5.53, "inp_emb_norm": 0.3857421875, "loss": 0.3286957702040672, "masked_top1": 82.18844451904297, "masked_top5": 97.20039276123048, "step": 18400, "top1": 94.72737030029298, "top5": 99.36099243164062 }, { "epoch": 5.55, "grad_norm": 1.1198021466793433, "learning_rate": 0.0001, "loss": 0.3312, "step": 18450 }, { "ce_loss": 0.33603747010231017, "epoch": 5.55, "inp_emb_norm": 0.3957421875, "loss": 0.33603747010231017, "masked_top1": 81.67584823608398, "masked_top5": 96.54129943847656, "step": 18450, "top1": 94.65423965454102, "top5": 99.32092361450195 }, { "epoch": 5.56, "grad_norm": 1.0668250210650634, "learning_rate": 0.0001, "loss": 0.3274, "step": 18500 }, { "ce_loss": 0.3218830382823944, "epoch": 5.56, "inp_emb_norm": 0.3800390625, "loss": 0.3218830382823944, "masked_top1": 82.66215911865234, "masked_top5": 97.63473297119141, "step": 18500, "top1": 94.83183807373047, "top5": 99.41087814331054 }, { "epoch": 5.58, "grad_norm": 1.0489105931316678, "learning_rate": 0.0001, "loss": 0.3323, "step": 18550 }, { "ce_loss": 0.33785298705101013, "epoch": 5.58, "inp_emb_norm": 0.3783984375, "loss": 0.33785298705101013, "masked_top1": 82.34555877685547, "masked_top5": 97.0906037902832, "step": 18550, "top1": 94.63372589111329, "top5": 99.32784606933593 }, { "epoch": 5.59, "grad_norm": 1.0468329188388257, "learning_rate": 0.0001, "loss": 0.3334, "step": 18600 }, { "ce_loss": 0.3364351660013199, "epoch": 5.59, "inp_emb_norm": 0.388515625, "loss": 0.3364351660013199, "masked_top1": 81.59525100708008, "masked_top5": 96.72632537841797, "step": 18600, "top1": 94.66922760009766, "top5": 99.31353454589843 }, { "epoch": 5.61, "grad_norm": 1.0268275103914735, "learning_rate": 0.0001, "loss": 0.3306, "step": 18650 }, { "ce_loss": 0.327090705037117, "epoch": 5.61, "inp_emb_norm": 0.384921875, "loss": 0.327090705037117, "masked_top1": 82.4957682800293, "masked_top5": 96.94613693237305, "step": 18650, "top1": 94.83338302612304, "top5": 99.36148330688476 }, { "epoch": 5.62, "grad_norm": 1.048035539110107, "learning_rate": 0.0001, "loss": 0.3317, "step": 18700 }, { "ce_loss": 0.3305124366283417, "epoch": 5.62, "inp_emb_norm": 0.3794921875, "loss": 0.3305124366283417, "masked_top1": 82.27645935058594, "masked_top5": 97.16238998413085, "step": 18700, "top1": 94.69090454101563, "top5": 99.3725503540039 }, { "epoch": 5.64, "grad_norm": 1.0935078838634331, "learning_rate": 0.0001, "loss": 0.3428, "step": 18750 }, { "ce_loss": 0.34309773981571196, "epoch": 5.64, "inp_emb_norm": 0.379453125, "loss": 0.34309773981571196, "masked_top1": 81.5336685180664, "masked_top5": 96.76238327026367, "step": 18750, "top1": 94.5162728881836, "top5": 99.30676239013673 }, { "epoch": 5.65, "grad_norm": 1.0784022808624179, "learning_rate": 0.0001, "loss": 0.3433, "step": 18800 }, { "ce_loss": 0.35102885723114013, "epoch": 5.65, "inp_emb_norm": 0.3797265625, "loss": 0.35102885723114013, "masked_top1": 80.56207229614257, "masked_top5": 96.39179016113282, "step": 18800, "top1": 94.42943588256836, "top5": 99.2728482055664 }, { "epoch": 5.67, "grad_norm": 1.0393038843820417, "learning_rate": 0.0001, "loss": 0.3427, "step": 18850 }, { "ce_loss": 0.3519377601146698, "epoch": 5.67, "inp_emb_norm": 0.386484375, "loss": 0.3519377601146698, "masked_top1": 80.00395278930664, "masked_top5": 96.58664108276368, "step": 18850, "top1": 94.40878112792969, "top5": 99.31891250610352 }, { "epoch": 5.68, "grad_norm": 1.1009482557837906, "learning_rate": 0.0001, "loss": 0.3438, "step": 18900 }, { "ce_loss": 0.33931906819343566, "epoch": 5.68, "inp_emb_norm": 0.383515625, "loss": 0.33931906819343566, "masked_top1": 81.50663543701172, "masked_top5": 96.86479187011719, "step": 18900, "top1": 94.59292114257812, "top5": 99.32182586669921 }, { "epoch": 5.7, "grad_norm": 0.9751674873578042, "learning_rate": 0.0001, "loss": 0.3507, "step": 18950 }, { "ce_loss": 0.343431094288826, "epoch": 5.7, "inp_emb_norm": 0.3859375, "loss": 0.343431094288826, "masked_top1": 80.64304809570312, "masked_top5": 96.53154220581055, "step": 18950, "top1": 94.48710632324219, "top5": 99.31856002807618 }, { "epoch": 5.71, "grad_norm": 0.9792712686829252, "learning_rate": 0.0001, "loss": 0.3409, "step": 19000 }, { "ce_loss": 0.34443502128124237, "epoch": 5.71, "inp_emb_norm": 0.38875, "loss": 0.34443502128124237, "masked_top1": 80.8620687866211, "masked_top5": 96.8039859008789, "step": 19000, "top1": 94.45713119506836, "top5": 99.3434228515625 }, { "epoch": 5.73, "grad_norm": 1.0250483722882462, "learning_rate": 0.0001, "loss": 0.3462, "step": 19050 }, { "ce_loss": 0.3492258018255234, "epoch": 5.73, "inp_emb_norm": 0.3844921875, "loss": 0.3492258018255234, "masked_top1": 80.19672760009766, "masked_top5": 96.82942138671875, "step": 19050, "top1": 94.36864486694336, "top5": 99.30538925170899 }, { "epoch": 5.74, "grad_norm": 1.033814306715085, "learning_rate": 0.0001, "loss": 0.3465, "step": 19100 }, { "ce_loss": 0.3455532872676849, "epoch": 5.74, "inp_emb_norm": 0.3897265625, "loss": 0.3455532872676849, "masked_top1": 81.14021148681641, "masked_top5": 96.46601867675781, "step": 19100, "top1": 94.59025283813476, "top5": 99.29209457397461 }, { "epoch": 5.76, "grad_norm": 1.1266003627715306, "learning_rate": 0.0001, "loss": 0.3496, "step": 19150 }, { "ce_loss": 0.3510564410686493, "epoch": 5.76, "inp_emb_norm": 0.3796484375, "loss": 0.3510564410686493, "masked_top1": 80.42627136230469, "masked_top5": 96.30276489257812, "step": 19150, "top1": 94.38863174438477, "top5": 99.28523544311524 }, { "epoch": 5.77, "grad_norm": 1.0786804231908673, "learning_rate": 0.0001, "loss": 0.3519, "step": 19200 }, { "ce_loss": 0.35517399430274965, "epoch": 5.77, "inp_emb_norm": 0.379765625, "loss": 0.35517399430274965, "masked_top1": 80.2329933166504, "masked_top5": 96.45190505981445, "step": 19200, "top1": 94.32036392211914, "top5": 99.28158004760742 }, { "epoch": 5.79, "grad_norm": 0.9810743864140166, "learning_rate": 0.0001, "loss": 0.3532, "step": 19250 }, { "ce_loss": 0.35108084678649903, "epoch": 5.79, "inp_emb_norm": 0.38859375, "loss": 0.35108084678649903, "masked_top1": 80.51531219482422, "masked_top5": 96.45788009643555, "step": 19250, "top1": 94.3976658630371, "top5": 99.2694255065918 }, { "epoch": 5.8, "grad_norm": 0.9998840954054705, "learning_rate": 0.0001, "loss": 0.358, "step": 19300 }, { "ce_loss": 0.36306409776210785, "epoch": 5.8, "inp_emb_norm": 0.3805859375, "loss": 0.36306409776210785, "masked_top1": 79.08367248535156, "masked_top5": 96.06164031982422, "step": 19300, "top1": 94.15442794799804, "top5": 99.21199111938476 }, { "epoch": 5.82, "grad_norm": 0.9561538506979583, "learning_rate": 0.0001, "loss": 0.3517, "step": 19350 }, { "ce_loss": 0.3433873727917671, "epoch": 5.82, "inp_emb_norm": 0.3860546875, "loss": 0.3433873727917671, "masked_top1": 81.34098831176757, "masked_top5": 96.61977462768554, "step": 19350, "top1": 94.59346282958984, "top5": 99.30811492919922 }, { "epoch": 5.83, "grad_norm": 0.9641219081481858, "learning_rate": 0.0001, "loss": 0.3571, "step": 19400 }, { "ce_loss": 0.3487838166952133, "epoch": 5.83, "inp_emb_norm": 0.3849609375, "loss": 0.3487838166952133, "masked_top1": 81.22178527832031, "masked_top5": 96.62050567626953, "step": 19400, "top1": 94.4317202758789, "top5": 99.28485198974609 }, { "epoch": 5.85, "grad_norm": 1.1056878672573682, "learning_rate": 0.0001, "loss": 0.3589, "step": 19450 }, { "ce_loss": 0.3580132460594177, "epoch": 5.85, "inp_emb_norm": 0.395234375, "loss": 0.3580132460594177, "masked_top1": 79.92015823364258, "masked_top5": 96.51984786987305, "step": 19450, "top1": 94.39044494628907, "top5": 99.25140426635743 }, { "epoch": 5.86, "grad_norm": 1.0460192475813763, "learning_rate": 0.0001, "loss": 0.357, "step": 19500 }, { "ce_loss": 0.3561101830005646, "epoch": 5.86, "inp_emb_norm": 0.3905859375, "loss": 0.3561101830005646, "masked_top1": 80.33946014404297, "masked_top5": 96.31464675903321, "step": 19500, "top1": 94.33697235107422, "top5": 99.24774505615234 }, { "epoch": 5.88, "grad_norm": 0.9997822590004654, "learning_rate": 0.0001, "loss": 0.3667, "step": 19550 }, { "ce_loss": 0.3721057403087616, "epoch": 5.88, "inp_emb_norm": 0.3875390625, "loss": 0.3721057403087616, "masked_top1": 78.66541412353516, "masked_top5": 95.76679565429687, "step": 19550, "top1": 94.079482421875, "top5": 99.20158752441407 }, { "epoch": 5.89, "grad_norm": 1.0227356742773894, "learning_rate": 0.0001, "loss": 0.362, "step": 19600 }, { "ce_loss": 0.36288787305355075, "epoch": 5.89, "inp_emb_norm": 0.393046875, "loss": 0.36288787305355075, "masked_top1": 78.83365341186523, "masked_top5": 95.98956665039063, "step": 19600, "top1": 94.17432601928711, "top5": 99.2359994506836 }, { "epoch": 5.91, "grad_norm": 1.0348435013200028, "learning_rate": 0.0001, "loss": 0.3637, "step": 19650 }, { "ce_loss": 0.3597774177789688, "epoch": 5.91, "inp_emb_norm": 0.38703125, "loss": 0.3597774177789688, "masked_top1": 79.5971403503418, "masked_top5": 96.29687118530273, "step": 19650, "top1": 94.29608825683594, "top5": 99.25850982666016 }, { "epoch": 5.92, "grad_norm": 1.0680810347893466, "learning_rate": 0.0001, "loss": 0.366, "step": 19700 }, { "ce_loss": 0.3640019080042839, "epoch": 5.92, "inp_emb_norm": 0.3908203125, "loss": 0.3640019080042839, "masked_top1": 80.11074829101562, "masked_top5": 96.37212921142579, "step": 19700, "top1": 94.18343032836914, "top5": 99.28712341308594 }, { "epoch": 5.94, "grad_norm": 0.9094859392388285, "learning_rate": 0.0001, "loss": 0.3678, "step": 19750 }, { "ce_loss": 0.37644350349903105, "epoch": 5.94, "inp_emb_norm": 0.3891015625, "loss": 0.37644350349903105, "masked_top1": 78.18459594726562, "masked_top5": 95.86135772705079, "step": 19750, "top1": 93.957353515625, "top5": 99.2127586364746 }, { "epoch": 5.95, "grad_norm": 1.0310255933372185, "learning_rate": 0.0001, "loss": 0.3668, "step": 19800 }, { "ce_loss": 0.3627872896194458, "epoch": 5.95, "inp_emb_norm": 0.3862109375, "loss": 0.3627872896194458, "masked_top1": 79.60676147460937, "masked_top5": 96.10193313598633, "step": 19800, "top1": 94.17147338867187, "top5": 99.25555862426758 }, { "epoch": 5.97, "grad_norm": 1.0599907424871888, "learning_rate": 0.0001, "loss": 0.3695, "step": 19850 }, { "ce_loss": 0.37383450448513034, "epoch": 5.97, "inp_emb_norm": 0.4004296875, "loss": 0.37383450448513034, "masked_top1": 77.94340957641602, "masked_top5": 96.1452799987793, "step": 19850, "top1": 94.01543914794922, "top5": 99.25049942016602 }, { "epoch": 5.98, "grad_norm": 1.123287217336448, "learning_rate": 0.0001, "loss": 0.3667, "step": 19900 }, { "ce_loss": 0.3690763407945633, "epoch": 5.98, "inp_emb_norm": 0.3968359375, "loss": 0.3690763407945633, "masked_top1": 78.41387680053711, "masked_top5": 95.78071762084961, "step": 19900, "top1": 94.15260635375977, "top5": 99.18431182861327 }, { "epoch": 6.0, "grad_norm": 1.05174121076266, "learning_rate": 0.0001, "loss": 0.37, "step": 19950 }, { "ce_loss": 0.36719063580036165, "epoch": 6.0, "inp_emb_norm": 0.3880859375, "loss": 0.36719063580036165, "masked_top1": 79.3749658203125, "masked_top5": 96.28604400634765, "step": 19950, "top1": 94.1464176940918, "top5": 99.22554748535157 }, { "epoch": 6.02, "grad_norm": 0.7372869779982937, "learning_rate": 0.0001, "loss": 0.1841, "step": 20000 }, { "ce_loss": 0.1820479117333889, "epoch": 6.02, "inp_emb_norm": 0.3961328125, "loss": 0.1820479117333889, "masked_top1": 93.90685821533204, "masked_top5": 99.41201736450195, "step": 20000, "top1": 97.13008163452149, "top5": 99.6633514404297 }, { "epoch": 6.03, "grad_norm": 0.7458143529658549, "learning_rate": 0.0001, "loss": 0.1884, "step": 20050 }, { "ce_loss": 0.19271491587162018, "epoch": 6.03, "inp_emb_norm": 0.39109375, "loss": 0.19271491587162018, "masked_top1": 93.04420074462891, "masked_top5": 99.42253952026367, "step": 20050, "top1": 96.94086242675782, "top5": 99.66081787109376 }, { "epoch": 6.05, "grad_norm": 0.8859283290144573, "learning_rate": 0.0001, "loss": 0.1866, "step": 20100 }, { "ce_loss": 0.191538667678833, "epoch": 6.05, "inp_emb_norm": 0.3925390625, "loss": 0.191538667678833, "masked_top1": 93.09733154296875, "masked_top5": 99.30335250854492, "step": 20100, "top1": 96.98335327148438, "top5": 99.65503143310546 }, { "epoch": 6.06, "grad_norm": 0.8809125747332417, "learning_rate": 0.0001, "loss": 0.1852, "step": 20150 }, { "ce_loss": 0.1867568638920784, "epoch": 6.06, "inp_emb_norm": 0.391640625, "loss": 0.1867568638920784, "masked_top1": 93.50607498168945, "masked_top5": 99.47832672119141, "step": 20150, "top1": 97.09137954711915, "top5": 99.65869873046876 }, { "epoch": 6.08, "grad_norm": 0.826592224481137, "learning_rate": 0.0001, "loss": 0.1849, "step": 20200 }, { "ce_loss": 0.1898653081059456, "epoch": 6.08, "inp_emb_norm": 0.3903515625, "loss": 0.1898653081059456, "masked_top1": 93.60175338745117, "masked_top5": 99.40977355957031, "step": 20200, "top1": 96.99095306396484, "top5": 99.64656112670899 }, { "epoch": 6.09, "grad_norm": 0.8167934529115294, "learning_rate": 0.0001, "loss": 0.1935, "step": 20250 }, { "ce_loss": 0.1948414433002472, "epoch": 6.09, "inp_emb_norm": 0.3841015625, "loss": 0.1948414433002472, "masked_top1": 92.95522598266602, "masked_top5": 99.3899104309082, "step": 20250, "top1": 96.98196350097656, "top5": 99.63696716308594 }, { "epoch": 6.11, "grad_norm": 0.7345815776721539, "learning_rate": 0.0001, "loss": 0.1926, "step": 20300 }, { "ce_loss": 0.19291402637958527, "epoch": 6.11, "inp_emb_norm": 0.3873828125, "loss": 0.19291402637958527, "masked_top1": 93.4042626953125, "masked_top5": 99.38736923217773, "step": 20300, "top1": 96.96928878784179, "top5": 99.63698593139648 }, { "epoch": 6.12, "grad_norm": 0.8149440856604434, "learning_rate": 0.0001, "loss": 0.1952, "step": 20350 }, { "ce_loss": 0.1960041469335556, "epoch": 6.12, "inp_emb_norm": 0.39453125, "loss": 0.1960041469335556, "masked_top1": 92.89414184570313, "masked_top5": 99.46369537353516, "step": 20350, "top1": 96.92287322998047, "top5": 99.66681442260742 }, { "epoch": 6.14, "grad_norm": 0.8776230940079653, "learning_rate": 0.0001, "loss": 0.1979, "step": 20400 }, { "ce_loss": 0.20121153205633163, "epoch": 6.14, "inp_emb_norm": 0.3866015625, "loss": 0.20121153205633163, "masked_top1": 92.70189239501953, "masked_top5": 99.36892135620117, "step": 20400, "top1": 96.79621154785156, "top5": 99.64973388671875 }, { "epoch": 6.15, "grad_norm": 0.8585106345021164, "learning_rate": 0.0001, "loss": 0.2003, "step": 20450 }, { "ce_loss": 0.19490561604499818, "epoch": 6.15, "inp_emb_norm": 0.39078125, "loss": 0.19490561604499818, "masked_top1": 92.95185577392579, "masked_top5": 99.3991976928711, "step": 20450, "top1": 96.93541824340821, "top5": 99.64690612792968 }, { "epoch": 6.17, "grad_norm": 0.7976355967797862, "learning_rate": 0.0001, "loss": 0.2052, "step": 20500 }, { "ce_loss": 0.20391724795103072, "epoch": 6.17, "inp_emb_norm": 0.39390625, "loss": 0.20391724795103072, "masked_top1": 92.00440231323242, "masked_top5": 99.22619979858399, "step": 20500, "top1": 96.7327310180664, "top5": 99.62498168945312 }, { "epoch": 6.18, "grad_norm": 0.8148083321822271, "learning_rate": 0.0001, "loss": 0.2054, "step": 20550 }, { "ce_loss": 0.20799223512411116, "epoch": 6.18, "inp_emb_norm": 0.391953125, "loss": 0.20799223512411116, "masked_top1": 92.3514372253418, "masked_top5": 99.27444519042969, "step": 20550, "top1": 96.76949005126953, "top5": 99.62392379760742 }, { "epoch": 6.2, "grad_norm": 0.778356178235768, "learning_rate": 0.0001, "loss": 0.206, "step": 20600 }, { "ce_loss": 0.2053508883714676, "epoch": 6.2, "inp_emb_norm": 0.397890625, "loss": 0.2053508883714676, "masked_top1": 92.0650765991211, "masked_top5": 99.24300872802735, "step": 20600, "top1": 96.75581268310548, "top5": 99.60792434692382 }, { "epoch": 6.21, "grad_norm": 0.8329059510004336, "learning_rate": 0.0001, "loss": 0.211, "step": 20650 }, { "ce_loss": 0.20962412267923355, "epoch": 6.21, "inp_emb_norm": 0.3975390625, "loss": 0.20962412267923355, "masked_top1": 92.15507781982421, "masked_top5": 99.21770889282226, "step": 20650, "top1": 96.68198043823242, "top5": 99.61430511474609 }, { "epoch": 6.23, "grad_norm": 0.9696545325150863, "learning_rate": 0.0001, "loss": 0.214, "step": 20700 }, { "ce_loss": 0.21425070196390153, "epoch": 6.23, "inp_emb_norm": 0.3883984375, "loss": 0.21425070196390153, "masked_top1": 91.60136703491212, "masked_top5": 99.14032455444335, "step": 20700, "top1": 96.57572845458985, "top5": 99.60167327880859 }, { "epoch": 6.24, "grad_norm": 0.9465855604091451, "learning_rate": 0.0001, "loss": 0.2179, "step": 20750 }, { "ce_loss": 0.21650043070316316, "epoch": 6.24, "inp_emb_norm": 0.3938671875, "loss": 0.21650043070316316, "masked_top1": 91.89517654418945, "masked_top5": 99.22545211791993, "step": 20750, "top1": 96.63436431884766, "top5": 99.61578002929687 }, { "epoch": 6.26, "grad_norm": 0.8848039020955999, "learning_rate": 0.0001, "loss": 0.2167, "step": 20800 }, { "ce_loss": 0.21624796688556672, "epoch": 6.26, "inp_emb_norm": 0.3955078125, "loss": 0.21624796688556672, "masked_top1": 91.79812927246094, "masked_top5": 99.13185974121093, "step": 20800, "top1": 96.59934356689453, "top5": 99.60213928222656 }, { "epoch": 6.27, "grad_norm": 0.8555074354464904, "learning_rate": 0.0001, "loss": 0.22, "step": 20850 }, { "ce_loss": 0.2177719497680664, "epoch": 6.27, "inp_emb_norm": 0.386171875, "loss": 0.2177719497680664, "masked_top1": 91.69604415893555, "masked_top5": 99.20869293212891, "step": 20850, "top1": 96.5518881225586, "top5": 99.60115341186524 }, { "epoch": 6.29, "grad_norm": 0.8890783469847633, "learning_rate": 0.0001, "loss": 0.2219, "step": 20900 }, { "ce_loss": 0.22593430548906326, "epoch": 6.29, "inp_emb_norm": 0.3901953125, "loss": 0.22593430548906326, "masked_top1": 91.24859771728515, "masked_top5": 98.95779678344726, "step": 20900, "top1": 96.41894073486328, "top5": 99.57270751953125 }, { "epoch": 6.3, "grad_norm": 0.8389189758028528, "learning_rate": 0.0001, "loss": 0.2263, "step": 20950 }, { "ce_loss": 0.22432934373617172, "epoch": 6.3, "inp_emb_norm": 0.39359375, "loss": 0.22432934373617172, "masked_top1": 91.0940608215332, "masked_top5": 99.17452285766602, "step": 20950, "top1": 96.43647857666015, "top5": 99.61320083618165 }, { "epoch": 6.32, "grad_norm": 0.9535481479010524, "learning_rate": 0.0001, "loss": 0.2232, "step": 21000 }, { "ce_loss": 0.22769318729639054, "epoch": 6.32, "inp_emb_norm": 0.392578125, "loss": 0.22769318729639054, "masked_top1": 91.18105072021484, "masked_top5": 99.13134719848632, "step": 21000, "top1": 96.39334274291993, "top5": 99.59284255981446 }, { "epoch": 6.33, "grad_norm": 0.9100595834466957, "learning_rate": 0.0001, "loss": 0.2237, "step": 21050 }, { "ce_loss": 0.2257786351442337, "epoch": 6.33, "inp_emb_norm": 0.3939453125, "loss": 0.2257786351442337, "masked_top1": 90.99365447998046, "masked_top5": 99.22629165649414, "step": 21050, "top1": 96.43473999023438, "top5": 99.62341964721679 }, { "epoch": 6.35, "grad_norm": 0.9348221993172825, "learning_rate": 0.0001, "loss": 0.2301, "step": 21100 }, { "ce_loss": 0.22199460864067078, "epoch": 6.35, "inp_emb_norm": 0.3928515625, "loss": 0.22199460864067078, "masked_top1": 91.3432958984375, "masked_top5": 99.21780807495117, "step": 21100, "top1": 96.48183792114258, "top5": 99.59834899902344 }, { "epoch": 6.36, "grad_norm": 0.8157766502142015, "learning_rate": 0.0001, "loss": 0.2348, "step": 21150 }, { "ce_loss": 0.23358583688735962, "epoch": 6.36, "inp_emb_norm": 0.39484375, "loss": 0.23358583688735962, "masked_top1": 90.63573287963867, "masked_top5": 99.2670066833496, "step": 21150, "top1": 96.2616633605957, "top5": 99.61970520019531 }, { "epoch": 6.38, "grad_norm": 0.8204200350876739, "learning_rate": 0.0001, "loss": 0.2331, "step": 21200 }, { "ce_loss": 0.2356252110004425, "epoch": 6.38, "inp_emb_norm": 0.38765625, "loss": 0.2356252110004425, "masked_top1": 91.13608352661133, "masked_top5": 99.10308471679687, "step": 21200, "top1": 96.23461364746093, "top5": 99.58956817626954 }, { "epoch": 6.39, "grad_norm": 0.8456950881445512, "learning_rate": 0.0001, "loss": 0.2376, "step": 21250 }, { "ce_loss": 0.2390061578154564, "epoch": 6.39, "inp_emb_norm": 0.394609375, "loss": 0.2390061578154564, "masked_top1": 90.48685897827148, "masked_top5": 98.98336410522461, "step": 21250, "top1": 96.21184814453125, "top5": 99.59722549438476 }, { "epoch": 6.41, "grad_norm": 0.8287547291237191, "learning_rate": 0.0001, "loss": 0.242, "step": 21300 }, { "ce_loss": 0.2427684971690178, "epoch": 6.41, "inp_emb_norm": 0.4001953125, "loss": 0.2427684971690178, "masked_top1": 90.06592498779297, "masked_top5": 98.84460174560547, "step": 21300, "top1": 96.14539581298828, "top5": 99.56721633911133 }, { "epoch": 6.42, "grad_norm": 0.9051278595660746, "learning_rate": 0.0001, "loss": 0.2406, "step": 21350 }, { "ce_loss": 0.24132068693637848, "epoch": 6.42, "inp_emb_norm": 0.3931640625, "loss": 0.24132068693637848, "masked_top1": 90.12520782470703, "masked_top5": 99.08217208862305, "step": 21350, "top1": 96.13710800170898, "top5": 99.58537933349609 }, { "epoch": 6.44, "grad_norm": 0.9309548266435707, "learning_rate": 0.0001, "loss": 0.2448, "step": 21400 }, { "ce_loss": 0.2484480223059654, "epoch": 6.44, "inp_emb_norm": 0.4006640625, "loss": 0.2484480223059654, "masked_top1": 89.13191101074219, "masked_top5": 98.85858505249024, "step": 21400, "top1": 96.04462646484374, "top5": 99.58927947998046 }, { "epoch": 6.45, "grad_norm": 0.9843205463835414, "learning_rate": 0.0001, "loss": 0.2433, "step": 21450 }, { "ce_loss": 0.2498919489979744, "epoch": 6.45, "inp_emb_norm": 0.3948046875, "loss": 0.2498919489979744, "masked_top1": 89.5735856628418, "masked_top5": 98.80221618652344, "step": 21450, "top1": 96.03055465698242, "top5": 99.56374588012696 }, { "epoch": 6.47, "grad_norm": 0.9647612984805798, "learning_rate": 0.0001, "loss": 0.2479, "step": 21500 }, { "ce_loss": 0.25150889009237287, "epoch": 6.47, "inp_emb_norm": 0.3948828125, "loss": 0.25150889009237287, "masked_top1": 89.32119857788086, "masked_top5": 98.80714309692382, "step": 21500, "top1": 95.92482788085937, "top5": 99.56254333496094 }, { "epoch": 6.48, "grad_norm": 1.006084785735041, "learning_rate": 0.0001, "loss": 0.2484, "step": 21550 }, { "ce_loss": 0.24291085809469223, "epoch": 6.48, "inp_emb_norm": 0.3999609375, "loss": 0.24291085809469223, "masked_top1": 89.84078826904297, "masked_top5": 98.79897399902343, "step": 21550, "top1": 96.13382278442383, "top5": 99.56241195678712 }, { "epoch": 6.5, "grad_norm": 0.9435564610662254, "learning_rate": 0.0001, "loss": 0.2484, "step": 21600 }, { "ce_loss": 0.25397088915109634, "epoch": 6.5, "inp_emb_norm": 0.390078125, "loss": 0.25397088915109634, "masked_top1": 89.26468048095703, "masked_top5": 98.83024612426757, "step": 21600, "top1": 95.98090301513672, "top5": 99.53740264892578 }, { "epoch": 6.51, "grad_norm": 0.9039119083122704, "learning_rate": 0.0001, "loss": 0.2536, "step": 21650 }, { "ce_loss": 0.2490449759364128, "epoch": 6.51, "inp_emb_norm": 0.398671875, "loss": 0.2490449759364128, "masked_top1": 90.15557754516601, "masked_top5": 98.96521423339844, "step": 21650, "top1": 96.07868286132812, "top5": 99.56334945678711 }, { "epoch": 6.53, "grad_norm": 0.9083177376434824, "learning_rate": 0.0001, "loss": 0.2511, "step": 21700 }, { "ce_loss": 0.25019362777471543, "epoch": 6.53, "inp_emb_norm": 0.4041796875, "loss": 0.25019362777471543, "masked_top1": 89.28435134887695, "masked_top5": 98.76094360351563, "step": 21700, "top1": 95.98147827148438, "top5": 99.55952423095704 }, { "epoch": 6.54, "grad_norm": 0.9468133124761712, "learning_rate": 0.0001, "loss": 0.2559, "step": 21750 }, { "ce_loss": 0.24820626825094222, "epoch": 6.54, "inp_emb_norm": 0.407109375, "loss": 0.24820626825094222, "masked_top1": 89.13103088378907, "masked_top5": 98.86271697998046, "step": 21750, "top1": 96.04320175170898, "top5": 99.57524795532227 }, { "epoch": 6.56, "grad_norm": 0.9411575962043335, "learning_rate": 0.0001, "loss": 0.2515, "step": 21800 }, { "ce_loss": 0.25579195737838745, "epoch": 6.56, "inp_emb_norm": 0.3958203125, "loss": 0.25579195737838745, "masked_top1": 89.12390563964844, "masked_top5": 98.75174041748046, "step": 21800, "top1": 95.92440002441407, "top5": 99.55484436035157 }, { "epoch": 6.57, "grad_norm": 0.9083686787254603, "learning_rate": 0.0001, "loss": 0.2572, "step": 21850 }, { "ce_loss": 0.2511313533782959, "epoch": 6.57, "inp_emb_norm": 0.398984375, "loss": 0.2511313533782959, "masked_top1": 90.02878692626953, "masked_top5": 98.91113662719727, "step": 21850, "top1": 96.03028366088867, "top5": 99.57074462890625 }, { "epoch": 6.59, "grad_norm": 0.9524791704046712, "learning_rate": 0.0001, "loss": 0.2561, "step": 21900 }, { "ce_loss": 0.25475972771644595, "epoch": 6.59, "inp_emb_norm": 0.39703125, "loss": 0.25475972771644595, "masked_top1": 89.4745411682129, "masked_top5": 98.61525360107422, "step": 21900, "top1": 95.92261184692383, "top5": 99.54248062133789 }, { "epoch": 6.6, "grad_norm": 0.9780779866428146, "learning_rate": 0.0001, "loss": 0.2543, "step": 21950 }, { "ce_loss": 0.25268141776323316, "epoch": 6.6, "inp_emb_norm": 0.400859375, "loss": 0.25268141776323316, "masked_top1": 89.10960723876953, "masked_top5": 98.78202194213867, "step": 21950, "top1": 96.01406188964843, "top5": 99.5376611328125 }, { "epoch": 6.62, "grad_norm": 0.9274176844179849, "learning_rate": 0.0001, "loss": 0.2628, "step": 22000 }, { "ce_loss": 0.2659920188784599, "epoch": 6.62, "inp_emb_norm": 0.3957421875, "loss": 0.2659920188784599, "masked_top1": 88.22967803955078, "masked_top5": 98.6833935546875, "step": 22000, "top1": 95.75867980957031, "top5": 99.55640853881836 }, { "epoch": 6.63, "grad_norm": 1.033134365793791, "learning_rate": 0.0001, "loss": 0.2614, "step": 22050 }, { "ce_loss": 0.26293145805597307, "epoch": 6.63, "inp_emb_norm": 0.3959375, "loss": 0.26293145805597307, "masked_top1": 88.64593627929688, "masked_top5": 98.49295043945312, "step": 22050, "top1": 95.80199890136718, "top5": 99.52016479492187 }, { "epoch": 6.65, "grad_norm": 0.9919274843394079, "learning_rate": 0.0001, "loss": 0.2614, "step": 22100 }, { "ce_loss": 0.2592626142501831, "epoch": 6.65, "inp_emb_norm": 0.39484375, "loss": 0.2592626142501831, "masked_top1": 89.1805844116211, "masked_top5": 98.66838363647462, "step": 22100, "top1": 95.86548355102539, "top5": 99.5434815979004 }, { "epoch": 6.66, "grad_norm": 0.923669979047392, "learning_rate": 0.0001, "loss": 0.2684, "step": 22150 }, { "ce_loss": 0.25953867882490156, "epoch": 6.66, "inp_emb_norm": 0.4025, "loss": 0.25953867882490156, "masked_top1": 88.57401412963867, "masked_top5": 98.65271347045899, "step": 22150, "top1": 95.81649612426757, "top5": 99.55012420654298 }, { "epoch": 6.68, "grad_norm": 0.9781925396499678, "learning_rate": 0.0001, "loss": 0.2676, "step": 22200 }, { "ce_loss": 0.2756389129161835, "epoch": 6.68, "inp_emb_norm": 0.3946875, "loss": 0.2756389129161835, "masked_top1": 87.85789031982422, "masked_top5": 98.56079971313477, "step": 22200, "top1": 95.60260299682618, "top5": 99.51409820556641 }, { "epoch": 6.69, "grad_norm": 0.881278477232779, "learning_rate": 0.0001, "loss": 0.2694, "step": 22250 }, { "ce_loss": 0.26733168482780456, "epoch": 6.69, "inp_emb_norm": 0.3998828125, "loss": 0.26733168482780456, "masked_top1": 88.51319839477539, "masked_top5": 98.58459121704101, "step": 22250, "top1": 95.67998138427734, "top5": 99.5245753479004 }, { "epoch": 6.71, "grad_norm": 0.9944879689962476, "learning_rate": 0.0001, "loss": 0.2695, "step": 22300 }, { "ce_loss": 0.2680419811606407, "epoch": 6.71, "inp_emb_norm": 0.3937109375, "loss": 0.2680419811606407, "masked_top1": 89.19367111206054, "masked_top5": 98.77984527587891, "step": 22300, "top1": 95.70912155151368, "top5": 99.52434310913085 }, { "epoch": 6.72, "grad_norm": 0.9344294418856229, "learning_rate": 0.0001, "loss": 0.2706, "step": 22350 }, { "ce_loss": 0.2640018093585968, "epoch": 6.72, "inp_emb_norm": 0.3963671875, "loss": 0.2640018093585968, "masked_top1": 88.87513946533203, "masked_top5": 98.68793273925782, "step": 22350, "top1": 95.7931625366211, "top5": 99.53593536376952 }, { "epoch": 6.74, "grad_norm": 0.9234955706616644, "learning_rate": 0.0001, "loss": 0.2694, "step": 22400 }, { "ce_loss": 0.27443262994289397, "epoch": 6.74, "inp_emb_norm": 0.401484375, "loss": 0.27443262994289397, "masked_top1": 87.91703521728516, "masked_top5": 98.35836242675781, "step": 22400, "top1": 95.63983840942383, "top5": 99.48991470336914 }, { "epoch": 6.75, "grad_norm": 0.9774569390983554, "learning_rate": 0.0001, "loss": 0.273, "step": 22450 }, { "ce_loss": 0.2766887894272804, "epoch": 6.75, "inp_emb_norm": 0.398359375, "loss": 0.2766887894272804, "masked_top1": 88.1091081237793, "masked_top5": 98.57858413696289, "step": 22450, "top1": 95.64536865234375, "top5": 99.5364192199707 }, { "epoch": 6.77, "grad_norm": 0.8755362789382393, "learning_rate": 0.0001, "loss": 0.274, "step": 22500 }, { "ce_loss": 0.27666087716817855, "epoch": 6.77, "inp_emb_norm": 0.4048046875, "loss": 0.27666087716817855, "masked_top1": 87.76848861694336, "masked_top5": 98.57107147216797, "step": 22500, "top1": 95.56947143554687, "top5": 99.51631744384765 }, { "epoch": 6.78, "grad_norm": 0.8974975580290838, "learning_rate": 0.0001, "loss": 0.2727, "step": 22550 }, { "ce_loss": 0.26903481036424637, "epoch": 6.78, "inp_emb_norm": 0.3957421875, "loss": 0.26903481036424637, "masked_top1": 88.31371643066406, "masked_top5": 98.50493865966797, "step": 22550, "top1": 95.6751333618164, "top5": 99.53861953735351 }, { "epoch": 6.8, "grad_norm": 0.9330247587083373, "learning_rate": 0.0001, "loss": 0.2773, "step": 22600 }, { "ce_loss": 0.27828563034534454, "epoch": 6.8, "inp_emb_norm": 0.40484375, "loss": 0.27828563034534454, "masked_top1": 87.45176528930664, "masked_top5": 98.36986709594727, "step": 22600, "top1": 95.5737760925293, "top5": 99.49567245483398 }, { "epoch": 6.81, "grad_norm": 0.9335442063104634, "learning_rate": 0.0001, "loss": 0.2753, "step": 22650 }, { "ce_loss": 0.27865981191396716, "epoch": 6.81, "inp_emb_norm": 0.40390625, "loss": 0.27865981191396716, "masked_top1": 87.10857299804688, "masked_top5": 98.53876022338868, "step": 22650, "top1": 95.47858245849609, "top5": 99.53715377807617 }, { "epoch": 6.83, "grad_norm": 0.9872165512055184, "learning_rate": 0.0001, "loss": 0.274, "step": 22700 }, { "ce_loss": 0.2781349629163742, "epoch": 6.83, "inp_emb_norm": 0.409296875, "loss": 0.2781349629163742, "masked_top1": 86.65185195922851, "masked_top5": 98.23174392700196, "step": 22700, "top1": 95.48913436889649, "top5": 99.51249389648437 }, { "epoch": 6.84, "grad_norm": 0.8421329759123019, "learning_rate": 0.0001, "loss": 0.2809, "step": 22750 }, { "ce_loss": 0.2819749695062637, "epoch": 6.84, "inp_emb_norm": 0.4040625, "loss": 0.2819749695062637, "masked_top1": 87.23991241455079, "masked_top5": 98.37083847045898, "step": 22750, "top1": 95.48827087402344, "top5": 99.51471160888671 }, { "epoch": 6.86, "grad_norm": 0.9374127847054397, "learning_rate": 0.0001, "loss": 0.283, "step": 22800 }, { "ce_loss": 0.2752624320983887, "epoch": 6.86, "inp_emb_norm": 0.4168359375, "loss": 0.2752624320983887, "masked_top1": 87.0565657043457, "masked_top5": 98.52674087524414, "step": 22800, "top1": 95.5639727783203, "top5": 99.51801193237304 }, { "epoch": 6.87, "grad_norm": 0.894365544580622, "learning_rate": 0.0001, "loss": 0.2806, "step": 22850 }, { "ce_loss": 0.28234552562236787, "epoch": 6.87, "inp_emb_norm": 0.3933984375, "loss": 0.28234552562236787, "masked_top1": 87.87512283325195, "masked_top5": 98.52397399902344, "step": 22850, "top1": 95.52448684692382, "top5": 99.49992218017579 }, { "epoch": 6.89, "grad_norm": 0.8865838096449303, "learning_rate": 0.0001, "loss": 0.2803, "step": 22900 }, { "ce_loss": 0.2735359054803848, "epoch": 6.89, "inp_emb_norm": 0.4132421875, "loss": 0.2735359054803848, "masked_top1": 87.56801651000977, "masked_top5": 98.44611022949219, "step": 22900, "top1": 95.64279312133789, "top5": 99.52417373657227 }, { "epoch": 6.9, "grad_norm": 1.0014374606270757, "learning_rate": 0.0001, "loss": 0.2846, "step": 22950 }, { "ce_loss": 0.2815407305955887, "epoch": 6.9, "inp_emb_norm": 0.4108203125, "loss": 0.2815407305955887, "masked_top1": 86.75137435913086, "masked_top5": 98.28164108276367, "step": 22950, "top1": 95.45286041259766, "top5": 99.5033415222168 }, { "epoch": 6.92, "grad_norm": 0.9418515928051996, "learning_rate": 0.0001, "loss": 0.2881, "step": 23000 }, { "ce_loss": 0.2875212562084198, "epoch": 6.92, "inp_emb_norm": 0.393046875, "loss": 0.2875212562084198, "masked_top1": 86.9408479309082, "masked_top5": 98.42632766723632, "step": 23000, "top1": 95.38972747802734, "top5": 99.51202941894532 }, { "epoch": 6.93, "grad_norm": 0.9640044804691634, "learning_rate": 0.0001, "loss": 0.2849, "step": 23050 }, { "ce_loss": 0.28959711760282514, "epoch": 6.93, "inp_emb_norm": 0.39890625, "loss": 0.28959711760282514, "masked_top1": 87.06779602050781, "masked_top5": 98.1885205078125, "step": 23050, "top1": 95.33015533447265, "top5": 99.4693441772461 }, { "epoch": 6.95, "grad_norm": 0.884763494943833, "learning_rate": 0.0001, "loss": 0.2864, "step": 23100 }, { "ce_loss": 0.2791384127736092, "epoch": 6.95, "inp_emb_norm": 0.3983203125, "loss": 0.2791384127736092, "masked_top1": 87.34188446044922, "masked_top5": 98.60519989013672, "step": 23100, "top1": 95.44756973266601, "top5": 99.51571304321288 }, { "epoch": 6.96, "grad_norm": 0.9969087357121538, "learning_rate": 0.0001, "loss": 0.2819, "step": 23150 }, { "ce_loss": 0.2769463035464287, "epoch": 6.96, "inp_emb_norm": 0.40640625, "loss": 0.2769463035464287, "masked_top1": 87.70826522827149, "masked_top5": 98.4365104675293, "step": 23150, "top1": 95.51589462280273, "top5": 99.52118911743165 }, { "epoch": 6.98, "grad_norm": 1.0169282128070403, "learning_rate": 0.0001, "loss": 0.2876, "step": 23200 }, { "ce_loss": 0.28613451212644575, "epoch": 6.98, "inp_emb_norm": 0.409296875, "loss": 0.28613451212644575, "masked_top1": 86.17974639892579, "masked_top5": 98.22409042358399, "step": 23200, "top1": 95.39607528686524, "top5": 99.49454742431641 }, { "epoch": 6.99, "grad_norm": 1.0450277523154698, "learning_rate": 0.0001, "loss": 0.2889, "step": 23250 }, { "ce_loss": 0.2973794335126877, "epoch": 6.99, "inp_emb_norm": 0.40296875, "loss": 0.2973794335126877, "masked_top1": 86.13511917114258, "masked_top5": 98.26584106445313, "step": 23250, "top1": 95.27028549194335, "top5": 99.47429718017578 }, { "epoch": 7.01, "grad_norm": 0.6565758700785924, "learning_rate": 0.0001, "loss": 0.2269, "step": 23300 }, { "ce_loss": 0.22528975576162338, "epoch": 7.01, "inp_emb_norm": 0.4019140625, "loss": 0.22528975576162338, "masked_top1": 91.23332473754883, "masked_top5": 99.0590966796875, "step": 23300, "top1": 96.40823837280273, "top5": 99.59931091308594 }, { "epoch": 7.02, "grad_norm": 0.7347638074045553, "learning_rate": 0.0001, "loss": 0.1611, "step": 23350 }, { "ce_loss": 0.16495208382606508, "epoch": 7.02, "inp_emb_norm": 0.4017578125, "loss": 0.16495208382606508, "masked_top1": 95.35215362548828, "masked_top5": 99.73044998168945, "step": 23350, "top1": 97.44107040405274, "top5": 99.70461227416992 }, { "epoch": 7.04, "grad_norm": 0.6645294223117588, "learning_rate": 0.0001, "loss": 0.1606, "step": 23400 }, { "ce_loss": 0.1565721134841442, "epoch": 7.04, "inp_emb_norm": 0.4126953125, "loss": 0.1565721134841442, "masked_top1": 95.39784866333008, "masked_top5": 99.75249832153321, "step": 23400, "top1": 97.55490280151368, "top5": 99.71364120483399 }, { "epoch": 7.05, "grad_norm": 0.7050716054650297, "learning_rate": 0.0001, "loss": 0.1606, "step": 23450 }, { "ce_loss": 0.15757375702261925, "epoch": 7.05, "inp_emb_norm": 0.4034765625, "loss": 0.15757375702261925, "masked_top1": 95.50349487304688, "masked_top5": 99.6744792175293, "step": 23450, "top1": 97.49706893920899, "top5": 99.70018051147461 }, { "epoch": 7.07, "grad_norm": 0.70439339407283, "learning_rate": 0.0001, "loss": 0.1595, "step": 23500 }, { "ce_loss": 0.15974825277924537, "epoch": 7.07, "inp_emb_norm": 0.410625, "loss": 0.15974825277924537, "masked_top1": 95.76246231079102, "masked_top5": 99.68396453857422, "step": 23500, "top1": 97.5335317993164, "top5": 99.69112976074219 }, { "epoch": 7.08, "grad_norm": 0.8001940943257561, "learning_rate": 0.0001, "loss": 0.1643, "step": 23550 }, { "ce_loss": 0.16370448261499404, "epoch": 7.08, "inp_emb_norm": 0.4030859375, "loss": 0.16370448261499404, "masked_top1": 95.43571884155273, "masked_top5": 99.68741439819335, "step": 23550, "top1": 97.46105117797852, "top5": 99.6865510559082 }, { "epoch": 7.1, "grad_norm": 0.7666147004483381, "learning_rate": 0.0001, "loss": 0.1653, "step": 23600 }, { "ce_loss": 0.16188147634267808, "epoch": 7.1, "inp_emb_norm": 0.408828125, "loss": 0.16188147634267808, "masked_top1": 95.91392837524414, "masked_top5": 99.76342895507813, "step": 23600, "top1": 97.41842742919921, "top5": 99.69919830322266 }, { "epoch": 7.11, "grad_norm": 0.753790024789609, "learning_rate": 0.0001, "loss": 0.1674, "step": 23650 }, { "ce_loss": 0.16793357729911804, "epoch": 7.11, "inp_emb_norm": 0.4030078125, "loss": 0.16793357729911804, "masked_top1": 95.3747721862793, "masked_top5": 99.69120513916016, "step": 23650, "top1": 97.39034225463867, "top5": 99.67092514038086 }, { "epoch": 7.13, "grad_norm": 0.7560663951042244, "learning_rate": 0.0001, "loss": 0.1714, "step": 23700 }, { "ce_loss": 0.16748950853943825, "epoch": 7.13, "inp_emb_norm": 0.4015234375, "loss": 0.16748950853943825, "masked_top1": 95.61355926513671, "masked_top5": 99.70470977783204, "step": 23700, "top1": 97.36271957397462, "top5": 99.68995620727539 }, { "epoch": 7.14, "grad_norm": 0.7885900131749313, "learning_rate": 0.0001, "loss": 0.1718, "step": 23750 }, { "ce_loss": 0.16834189236164093, "epoch": 7.14, "inp_emb_norm": 0.4105859375, "loss": 0.16834189236164093, "masked_top1": 95.20903045654296, "masked_top5": 99.66537704467774, "step": 23750, "top1": 97.41024429321288, "top5": 99.6710530090332 }, { "epoch": 7.16, "grad_norm": 0.708267720328551, "learning_rate": 0.0001, "loss": 0.1718, "step": 23800 }, { "ce_loss": 0.16808076053857804, "epoch": 7.16, "inp_emb_norm": 0.4126953125, "loss": 0.16808076053857804, "masked_top1": 94.77564025878907, "masked_top5": 99.67258102416992, "step": 23800, "top1": 97.39059310913086, "top5": 99.68683700561523 }, { "epoch": 7.17, "grad_norm": 0.7351045964671515, "learning_rate": 0.0001, "loss": 0.174, "step": 23850 }, { "ce_loss": 0.1726333273947239, "epoch": 7.17, "inp_emb_norm": 0.4017578125, "loss": 0.1726333273947239, "masked_top1": 94.88044723510743, "masked_top5": 99.62449737548827, "step": 23850, "top1": 97.2899102783203, "top5": 99.65145614624024 }, { "epoch": 7.19, "grad_norm": 0.8049347796917691, "learning_rate": 0.0001, "loss": 0.1787, "step": 23900 }, { "ce_loss": 0.17679258838295936, "epoch": 7.19, "inp_emb_norm": 0.4061328125, "loss": 0.17679258838295936, "masked_top1": 94.69095947265625, "masked_top5": 99.67362945556641, "step": 23900, "top1": 97.20210205078125, "top5": 99.68624572753906 }, { "epoch": 7.2, "grad_norm": 0.8012010836512982, "learning_rate": 0.0001, "loss": 0.1788, "step": 23950 }, { "ce_loss": 0.1779346537590027, "epoch": 7.2, "inp_emb_norm": 0.40765625, "loss": 0.1779346537590027, "masked_top1": 94.71529022216797, "masked_top5": 99.61817611694336, "step": 23950, "top1": 97.14944107055663, "top5": 99.65997314453125 }, { "epoch": 7.22, "grad_norm": 0.8096182631639389, "learning_rate": 0.0001, "loss": 0.1782, "step": 24000 }, { "ce_loss": 0.17779998898506164, "epoch": 7.22, "inp_emb_norm": 0.40890625, "loss": 0.17779998898506164, "masked_top1": 94.6820344543457, "masked_top5": 99.6923991394043, "step": 24000, "top1": 97.1572166442871, "top5": 99.69579544067383 }, { "epoch": 7.23, "grad_norm": 0.7208752096873576, "learning_rate": 0.0001, "loss": 0.1795, "step": 24050 }, { "ce_loss": 0.18153419494628906, "epoch": 7.23, "inp_emb_norm": 0.4061328125, "loss": 0.18153419494628906, "masked_top1": 94.62552368164063, "masked_top5": 99.55692886352539, "step": 24050, "top1": 97.11937118530274, "top5": 99.64983123779297 }, { "epoch": 7.25, "grad_norm": 0.8302822944148331, "learning_rate": 0.0001, "loss": 0.1832, "step": 24100 }, { "ce_loss": 0.18768798112869262, "epoch": 7.25, "inp_emb_norm": 0.41546875, "loss": 0.18768798112869262, "masked_top1": 93.77630233764648, "masked_top5": 99.51055358886718, "step": 24100, "top1": 97.08079849243164, "top5": 99.66075820922852 }, { "epoch": 7.26, "grad_norm": 0.8034956740784042, "learning_rate": 0.0001, "loss": 0.1895, "step": 24150 }, { "ce_loss": 0.19201560974121093, "epoch": 7.26, "inp_emb_norm": 0.40984375, "loss": 0.19201560974121093, "masked_top1": 93.87309066772461, "masked_top5": 99.5169157409668, "step": 24150, "top1": 97.02114761352539, "top5": 99.6527261352539 }, { "epoch": 7.28, "grad_norm": 0.8314048125761649, "learning_rate": 0.0001, "loss": 0.1873, "step": 24200 }, { "ce_loss": 0.1878107675909996, "epoch": 7.28, "inp_emb_norm": 0.407265625, "loss": 0.1878107675909996, "masked_top1": 94.19107177734375, "masked_top5": 99.55671203613281, "step": 24200, "top1": 97.03997573852538, "top5": 99.68890594482421 }, { "epoch": 7.29, "grad_norm": 0.704357096474096, "learning_rate": 0.0001, "loss": 0.1893, "step": 24250 }, { "ce_loss": 0.19042812794446945, "epoch": 7.29, "inp_emb_norm": 0.416328125, "loss": 0.19042812794446945, "masked_top1": 93.61178329467774, "masked_top5": 99.54245040893555, "step": 24250, "top1": 96.90492645263672, "top5": 99.6619076538086 }, { "epoch": 7.31, "grad_norm": 0.856253880169746, "learning_rate": 0.0001, "loss": 0.1893, "step": 24300 }, { "ce_loss": 0.1884206250309944, "epoch": 7.31, "inp_emb_norm": 0.4078515625, "loss": 0.1884206250309944, "masked_top1": 93.98220611572266, "masked_top5": 99.54290969848633, "step": 24300, "top1": 96.96899108886718, "top5": 99.6441293334961 }, { "epoch": 7.32, "grad_norm": 0.8464641167931024, "learning_rate": 0.0001, "loss": 0.1918, "step": 24350 }, { "ce_loss": 0.19378722280263902, "epoch": 7.32, "inp_emb_norm": 0.4122265625, "loss": 0.19378722280263902, "masked_top1": 93.98709579467773, "masked_top5": 99.48938507080078, "step": 24350, "top1": 96.92246383666992, "top5": 99.6635366821289 }, { "epoch": 7.34, "grad_norm": 0.9274851248218718, "learning_rate": 0.0001, "loss": 0.1979, "step": 24400 }, { "ce_loss": 0.20088252812623977, "epoch": 7.34, "inp_emb_norm": 0.405546875, "loss": 0.20088252812623977, "masked_top1": 92.93017440795899, "masked_top5": 99.49172775268555, "step": 24400, "top1": 96.83464401245118, "top5": 99.64353897094726 }, { "epoch": 7.35, "grad_norm": 0.7721388576512122, "learning_rate": 0.0001, "loss": 0.1934, "step": 24450 }, { "ce_loss": 0.2001592782139778, "epoch": 7.35, "inp_emb_norm": 0.413828125, "loss": 0.2001592782139778, "masked_top1": 93.32104949951172, "masked_top5": 99.480458984375, "step": 24450, "top1": 96.84799270629883, "top5": 99.66846115112304 }, { "epoch": 7.37, "grad_norm": 0.8460246810468973, "learning_rate": 0.0001, "loss": 0.1937, "step": 24500 }, { "ce_loss": 0.19288330137729645, "epoch": 7.37, "inp_emb_norm": 0.40984375, "loss": 0.19288330137729645, "masked_top1": 93.38121078491211, "masked_top5": 99.56306228637695, "step": 24500, "top1": 96.92339248657227, "top5": 99.67182235717773 }, { "epoch": 7.38, "grad_norm": 0.8742179314746799, "learning_rate": 0.0001, "loss": 0.1982, "step": 24550 }, { "ce_loss": 0.19720925360918046, "epoch": 7.38, "inp_emb_norm": 0.412890625, "loss": 0.19720925360918046, "masked_top1": 93.280068359375, "masked_top5": 99.51691192626953, "step": 24550, "top1": 96.83218688964844, "top5": 99.66712615966797 }, { "epoch": 7.4, "grad_norm": 0.8553325710712862, "learning_rate": 0.0001, "loss": 0.2011, "step": 24600 }, { "ce_loss": 0.19960668057203293, "epoch": 7.4, "inp_emb_norm": 0.409765625, "loss": 0.19960668057203293, "masked_top1": 93.45094650268555, "masked_top5": 99.46210220336914, "step": 24600, "top1": 96.85580673217774, "top5": 99.665537109375 }, { "epoch": 7.41, "grad_norm": 0.8454568497353904, "learning_rate": 0.0001, "loss": 0.2022, "step": 24650 }, { "ce_loss": 0.20072863966226578, "epoch": 7.41, "inp_emb_norm": 0.4177734375, "loss": 0.20072863966226578, "masked_top1": 92.75718322753906, "masked_top5": 99.35001556396485, "step": 24650, "top1": 96.78503204345704, "top5": 99.66353454589844 }, { "epoch": 7.43, "grad_norm": 0.8262303326226429, "learning_rate": 0.0001, "loss": 0.205, "step": 24700 }, { "ce_loss": 0.20386093407869338, "epoch": 7.43, "inp_emb_norm": 0.4098046875, "loss": 0.20386093407869338, "masked_top1": 93.37680938720703, "masked_top5": 99.56277999877929, "step": 24700, "top1": 96.77312591552734, "top5": 99.6433283996582 }, { "epoch": 7.44, "grad_norm": 0.8542900336102568, "learning_rate": 0.0001, "loss": 0.2073, "step": 24750 }, { "ce_loss": 0.20805983483791352, "epoch": 7.44, "inp_emb_norm": 0.4114453125, "loss": 0.20805983483791352, "masked_top1": 92.38563873291015, "masked_top5": 99.27365493774414, "step": 24750, "top1": 96.68102691650391, "top5": 99.62649429321289 }, { "epoch": 7.46, "grad_norm": 0.775495795649607, "learning_rate": 0.0001, "loss": 0.2084, "step": 24800 }, { "ce_loss": 0.20858111292123793, "epoch": 7.46, "inp_emb_norm": 0.40828125, "loss": 0.20858111292123793, "masked_top1": 93.18427093505859, "masked_top5": 99.40792358398437, "step": 24800, "top1": 96.69915771484375, "top5": 99.64056365966798 }, { "epoch": 7.47, "grad_norm": 0.795773364310639, "learning_rate": 0.0001, "loss": 0.2128, "step": 24850 }, { "ce_loss": 0.21129278868436813, "epoch": 7.47, "inp_emb_norm": 0.4151953125, "loss": 0.21129278868436813, "masked_top1": 92.36515197753906, "masked_top5": 99.46405319213868, "step": 24850, "top1": 96.61996765136719, "top5": 99.64519241333008 }, { "epoch": 7.49, "grad_norm": 0.8606373006872489, "learning_rate": 0.0001, "loss": 0.2107, "step": 24900 }, { "ce_loss": 0.2146734645962715, "epoch": 7.49, "inp_emb_norm": 0.416328125, "loss": 0.2146734645962715, "masked_top1": 92.35210235595703, "masked_top5": 99.37399642944337, "step": 24900, "top1": 96.58806320190429, "top5": 99.66881698608398 }, { "epoch": 7.5, "grad_norm": 0.8148609241548314, "learning_rate": 0.0001, "loss": 0.2145, "step": 24950 }, { "ce_loss": 0.21042368352413177, "epoch": 7.5, "inp_emb_norm": 0.41640625, "loss": 0.21042368352413177, "masked_top1": 92.42163009643555, "masked_top5": 99.2802491760254, "step": 24950, "top1": 96.64725265502929, "top5": 99.64126876831055 }, { "epoch": 7.52, "grad_norm": 0.8552723662919675, "learning_rate": 0.0001, "loss": 0.2152, "step": 25000 }, { "ce_loss": 0.21622439593076706, "epoch": 7.52, "inp_emb_norm": 0.411796875, "loss": 0.21622439593076706, "masked_top1": 92.36061889648437, "masked_top5": 99.40323318481445, "step": 25000, "top1": 96.49748916625977, "top5": 99.6481248474121 }, { "epoch": 7.53, "grad_norm": 0.8756422021562937, "learning_rate": 0.0001, "loss": 0.2175, "step": 25050 }, { "ce_loss": 0.21715206265449524, "epoch": 7.53, "inp_emb_norm": 0.4205859375, "loss": 0.21715206265449524, "masked_top1": 92.16612228393555, "masked_top5": 99.21943115234374, "step": 25050, "top1": 96.50445205688476, "top5": 99.63119323730469 }, { "epoch": 7.55, "grad_norm": 0.9236893593627393, "learning_rate": 0.0001, "loss": 0.2156, "step": 25100 }, { "ce_loss": 0.21899319916963578, "epoch": 7.55, "inp_emb_norm": 0.413203125, "loss": 0.21899319916963578, "masked_top1": 91.96203842163087, "masked_top5": 99.34012496948242, "step": 25100, "top1": 96.50180130004883, "top5": 99.64115966796875 }, { "epoch": 7.56, "grad_norm": 0.931694461742947, "learning_rate": 0.0001, "loss": 0.2197, "step": 25150 }, { "ce_loss": 0.21712171256542206, "epoch": 7.56, "inp_emb_norm": 0.4085546875, "loss": 0.21712171256542206, "masked_top1": 92.56915695190429, "masked_top5": 99.49829315185546, "step": 25150, "top1": 96.46173583984375, "top5": 99.64132019042968 }, { "epoch": 7.58, "grad_norm": 0.92715510790691, "learning_rate": 0.0001, "loss": 0.2259, "step": 25200 }, { "ce_loss": 0.2213623434305191, "epoch": 7.58, "inp_emb_norm": 0.4107421875, "loss": 0.2213623434305191, "masked_top1": 91.66716598510742, "masked_top5": 99.32309783935547, "step": 25200, "top1": 96.42446884155274, "top5": 99.6251106262207 }, { "epoch": 7.59, "grad_norm": 0.9935343271656442, "learning_rate": 0.0001, "loss": 0.225, "step": 25250 }, { "ce_loss": 0.22398129254579544, "epoch": 7.59, "inp_emb_norm": 0.409140625, "loss": 0.22398129254579544, "masked_top1": 92.20207107543945, "masked_top5": 99.32244827270507, "step": 25250, "top1": 96.40167465209962, "top5": 99.6429054260254 }, { "epoch": 7.61, "grad_norm": 0.9151015122387817, "learning_rate": 0.0001, "loss": 0.2227, "step": 25300 }, { "ce_loss": 0.22134240210056305, "epoch": 7.61, "inp_emb_norm": 0.413984375, "loss": 0.22134240210056305, "masked_top1": 92.04093017578126, "masked_top5": 99.4468423461914, "step": 25300, "top1": 96.43467788696289, "top5": 99.66016799926757 }, { "epoch": 7.62, "grad_norm": 0.8495278966433639, "learning_rate": 0.0001, "loss": 0.2212, "step": 25350 }, { "ce_loss": 0.22596073687076568, "epoch": 7.62, "inp_emb_norm": 0.4122265625, "loss": 0.22596073687076568, "masked_top1": 91.3648748779297, "masked_top5": 99.2870686340332, "step": 25350, "top1": 96.3931086730957, "top5": 99.61766250610351 }, { "epoch": 7.64, "grad_norm": 0.8771189448377541, "learning_rate": 0.0001, "loss": 0.224, "step": 25400 }, { "ce_loss": 0.22193652182817458, "epoch": 7.64, "inp_emb_norm": 0.4132421875, "loss": 0.22193652182817458, "masked_top1": 91.40539047241211, "masked_top5": 99.10585540771484, "step": 25400, "top1": 96.49262084960938, "top5": 99.6045753479004 }, { "epoch": 7.65, "grad_norm": 0.8719944439794763, "learning_rate": 0.0001, "loss": 0.2299, "step": 25450 }, { "ce_loss": 0.23045677453279495, "epoch": 7.65, "inp_emb_norm": 0.4094921875, "loss": 0.23045677453279495, "masked_top1": 91.44049133300781, "masked_top5": 99.38550582885742, "step": 25450, "top1": 96.28902786254883, "top5": 99.62447952270507 }, { "epoch": 7.67, "grad_norm": 0.948342031847273, "learning_rate": 0.0001, "loss": 0.2278, "step": 25500 }, { "ce_loss": 0.2266176301240921, "epoch": 7.67, "inp_emb_norm": 0.4139453125, "loss": 0.2266176301240921, "masked_top1": 91.54910736083984, "masked_top5": 99.31753051757812, "step": 25500, "top1": 96.38055999755859, "top5": 99.61369903564453 }, { "epoch": 7.68, "grad_norm": 0.9160429959948714, "learning_rate": 0.0001, "loss": 0.2301, "step": 25550 }, { "ce_loss": 0.22997273564338683, "epoch": 7.68, "inp_emb_norm": 0.4121484375, "loss": 0.22997273564338683, "masked_top1": 91.72368927001953, "masked_top5": 99.37556945800782, "step": 25550, "top1": 96.34187088012695, "top5": 99.62580551147461 }, { "epoch": 7.7, "grad_norm": 0.9064800185064251, "learning_rate": 0.0001, "loss": 0.2345, "step": 25600 }, { "ce_loss": 0.2321370568871498, "epoch": 7.7, "inp_emb_norm": 0.41484375, "loss": 0.2321370568871498, "masked_top1": 91.17147521972656, "masked_top5": 99.15941177368164, "step": 25600, "top1": 96.28547775268555, "top5": 99.6115461730957 }, { "epoch": 7.71, "grad_norm": 0.8896861108646344, "learning_rate": 0.0001, "loss": 0.2316, "step": 25650 }, { "ce_loss": 0.22988739639520644, "epoch": 7.71, "inp_emb_norm": 0.41703125, "loss": 0.22988739639520644, "masked_top1": 90.95779861450195, "masked_top5": 99.13907150268555, "step": 25650, "top1": 96.32924926757812, "top5": 99.61460418701172 }, { "epoch": 7.73, "grad_norm": 0.8364454804655236, "learning_rate": 0.0001, "loss": 0.2308, "step": 25700 }, { "ce_loss": 0.23426982671022414, "epoch": 7.73, "inp_emb_norm": 0.4129296875, "loss": 0.23426982671022414, "masked_top1": 90.87540649414062, "masked_top5": 99.15606750488281, "step": 25700, "top1": 96.24389221191406, "top5": 99.61002014160157 }, { "epoch": 7.74, "grad_norm": 0.8892969722239723, "learning_rate": 0.0001, "loss": 0.2326, "step": 25750 }, { "ce_loss": 0.23778538197278976, "epoch": 7.74, "inp_emb_norm": 0.4173828125, "loss": 0.23778538197278976, "masked_top1": 91.04456115722657, "masked_top5": 99.08432937622071, "step": 25750, "top1": 96.19372604370118, "top5": 99.59805389404296 }, { "epoch": 7.76, "grad_norm": 0.9333358166124965, "learning_rate": 0.0001, "loss": 0.2324, "step": 25800 }, { "ce_loss": 0.2342055532336235, "epoch": 7.76, "inp_emb_norm": 0.413203125, "loss": 0.2342055532336235, "masked_top1": 90.89793167114257, "masked_top5": 99.32825225830078, "step": 25800, "top1": 96.24822631835937, "top5": 99.63633575439454 }, { "epoch": 7.77, "grad_norm": 0.8972293165472223, "learning_rate": 0.0001, "loss": 0.2378, "step": 25850 }, { "ce_loss": 0.2358376482129097, "epoch": 7.77, "inp_emb_norm": 0.411875, "loss": 0.2358376482129097, "masked_top1": 90.5443440246582, "masked_top5": 99.08838134765625, "step": 25850, "top1": 96.21415130615235, "top5": 99.59769958496094 }, { "epoch": 7.79, "grad_norm": 0.8953278969414413, "learning_rate": 0.0001, "loss": 0.2394, "step": 25900 }, { "ce_loss": 0.241860691010952, "epoch": 7.79, "inp_emb_norm": 0.40828125, "loss": 0.241860691010952, "masked_top1": 90.6273094177246, "masked_top5": 99.1496745300293, "step": 25900, "top1": 96.1283937072754, "top5": 99.59819442749023 }, { "epoch": 7.8, "grad_norm": 0.8520671639423023, "learning_rate": 0.0001, "loss": 0.2401, "step": 25950 }, { "ce_loss": 0.2415415197610855, "epoch": 7.8, "inp_emb_norm": 0.4100390625, "loss": 0.2415415197610855, "masked_top1": 90.58090194702149, "masked_top5": 99.10164520263672, "step": 25950, "top1": 96.16258728027344, "top5": 99.59717208862304 }, { "epoch": 7.82, "grad_norm": 0.8645330931635717, "learning_rate": 0.0001, "loss": 0.2373, "step": 26000 } ], "logging_steps": 50, "max_steps": 26600, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 2000, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }