{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.819548872180452,
"eval_steps": 1500,
"global_step": 26000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"ce_loss": 4.768589496612549,
"epoch": 0,
"inp_emb_norm": 0.265625,
"loss": 4.768589496612549,
"masked_top1": 0.0,
"masked_top5": 0.4098360538482666,
"step": 0,
"top1": 64.96458435058594,
"top5": 80.73052215576172
},
{
"epoch": 0.02,
"grad_norm": 6.2690516537633005,
"learning_rate": 0.0001,
"loss": 4.6296,
"step": 50
},
{
"ce_loss": 4.672832765579224,
"epoch": 0.02,
"inp_emb_norm": 0.26263671875,
"loss": 4.672832765579224,
"masked_top1": 14.987861804962158,
"masked_top5": 24.83079730987549,
"step": 50,
"top1": 66.08985641002656,
"top5": 81.12573463439941
},
{
"epoch": 0.03,
"grad_norm": 9.593317242373375,
"learning_rate": 0.0001,
"loss": 2.2246,
"step": 100
},
{
"ce_loss": 2.207055723667145,
"epoch": 0.03,
"inp_emb_norm": 0.2644140625,
"loss": 2.207055723667145,
"masked_top1": 23.22451873779297,
"masked_top5": 38.51483547210693,
"step": 100,
"top1": 74.46695602416992,
"top5": 87.36076766967773
},
{
"epoch": 0.05,
"grad_norm": 3.1307366740911395,
"learning_rate": 0.0001,
"loss": 2.0857,
"step": 150
},
{
"ce_loss": 2.077478907108307,
"epoch": 0.05,
"inp_emb_norm": 0.26427734375,
"loss": 2.077478907108307,
"masked_top1": 27.615498123168944,
"masked_top5": 46.10835479736328,
"step": 150,
"top1": 75.29274810791016,
"top5": 88.44302017211913
},
{
"epoch": 0.06,
"grad_norm": 2.714681744263656,
"learning_rate": 0.0001,
"loss": 2.0248,
"step": 200
},
{
"ce_loss": 2.018527669906616,
"epoch": 0.06,
"inp_emb_norm": 0.2655859375,
"loss": 2.018527669906616,
"masked_top1": 29.403907699584963,
"masked_top5": 48.57966896057129,
"step": 200,
"top1": 75.77816040039062,
"top5": 88.9672624206543
},
{
"epoch": 0.08,
"grad_norm": 2.4811381161353423,
"learning_rate": 0.0001,
"loss": 1.9651,
"step": 250
},
{
"ce_loss": 1.9264942455291747,
"epoch": 0.08,
"inp_emb_norm": 0.2648828125,
"loss": 1.9264942455291747,
"masked_top1": 32.32564323425293,
"masked_top5": 51.88160499572754,
"step": 250,
"top1": 76.65794830322265,
"top5": 89.52642883300781
},
{
"epoch": 0.09,
"grad_norm": 2.160461687507101,
"learning_rate": 0.0001,
"loss": 1.9061,
"step": 300
},
{
"ce_loss": 1.9086420011520386,
"epoch": 0.09,
"inp_emb_norm": 0.26564453125,
"loss": 1.9086420011520386,
"masked_top1": 32.26125560760498,
"masked_top5": 52.74158630371094,
"step": 300,
"top1": 76.66858810424804,
"top5": 89.8210075378418
},
{
"epoch": 0.11,
"grad_norm": 2.173027962906511,
"learning_rate": 0.0001,
"loss": 1.8717,
"step": 350
},
{
"ce_loss": 1.856195902824402,
"epoch": 0.11,
"inp_emb_norm": 0.27287109375,
"loss": 1.856195902824402,
"masked_top1": 33.80865501403809,
"masked_top5": 54.50813941955566,
"step": 350,
"top1": 77.32864608764649,
"top5": 90.11031524658203
},
{
"epoch": 0.12,
"grad_norm": 2.09826482925296,
"learning_rate": 0.0001,
"loss": 1.8414,
"step": 400
},
{
"ce_loss": 1.8570138716697693,
"epoch": 0.12,
"inp_emb_norm": 0.26978515625,
"loss": 1.8570138716697693,
"masked_top1": 34.77134052276611,
"masked_top5": 55.27866950988769,
"step": 400,
"top1": 77.23032119750977,
"top5": 90.24968856811523
},
{
"epoch": 0.14,
"grad_norm": 2.6584647355277067,
"learning_rate": 0.0001,
"loss": 1.8405,
"step": 450
},
{
"ce_loss": 1.8444490694999696,
"epoch": 0.14,
"inp_emb_norm": 0.27064453125,
"loss": 1.8444490694999696,
"masked_top1": 34.76882873535156,
"masked_top5": 54.58673110961914,
"step": 450,
"top1": 77.44166137695312,
"top5": 90.14545852661132
},
{
"epoch": 0.15,
"grad_norm": 1.9115658205853148,
"learning_rate": 0.0001,
"loss": 1.7976,
"step": 500
},
{
"ce_loss": 1.804072494506836,
"epoch": 0.15,
"inp_emb_norm": 0.2703515625,
"loss": 1.804072494506836,
"masked_top1": 34.5311047744751,
"masked_top5": 55.86459785461426,
"step": 500,
"top1": 77.51275192260742,
"top5": 90.69894668579101
},
{
"epoch": 0.17,
"grad_norm": 1.907634541206245,
"learning_rate": 0.0001,
"loss": 1.804,
"step": 550
},
{
"ce_loss": 1.7778214049339294,
"epoch": 0.17,
"inp_emb_norm": 0.27109375,
"loss": 1.7778214049339294,
"masked_top1": 35.83196727752686,
"masked_top5": 56.62046134948731,
"step": 550,
"top1": 78.00231246948242,
"top5": 90.66332412719727
},
{
"epoch": 0.18,
"grad_norm": 1.7366360206125175,
"learning_rate": 0.0001,
"loss": 1.7783,
"step": 600
},
{
"ce_loss": 1.7576907467842102,
"epoch": 0.18,
"inp_emb_norm": 0.2725,
"loss": 1.7576907467842102,
"masked_top1": 36.732100868225096,
"masked_top5": 57.86395164489746,
"step": 600,
"top1": 78.34247482299804,
"top5": 90.82281936645508
},
{
"epoch": 0.2,
"grad_norm": 1.9186053120285074,
"learning_rate": 0.0001,
"loss": 1.7819,
"step": 650
},
{
"ce_loss": 1.8032070183753968,
"epoch": 0.2,
"inp_emb_norm": 0.27015625,
"loss": 1.8032070183753968,
"masked_top1": 35.763907241821286,
"masked_top5": 55.90799743652344,
"step": 650,
"top1": 77.69748565673828,
"top5": 90.52577987670898
},
{
"epoch": 0.21,
"grad_norm": 1.957034200299797,
"learning_rate": 0.0001,
"loss": 1.735,
"step": 700
},
{
"ce_loss": 1.7470700669288635,
"epoch": 0.21,
"inp_emb_norm": 0.27248046875,
"loss": 1.7470700669288635,
"masked_top1": 37.93720417022705,
"masked_top5": 58.685314331054684,
"step": 700,
"top1": 78.27062789916992,
"top5": 90.87748138427735
},
{
"epoch": 0.23,
"grad_norm": 2.045975232733077,
"learning_rate": 0.0001,
"loss": 1.7517,
"step": 750
},
{
"ce_loss": 1.754687945842743,
"epoch": 0.23,
"inp_emb_norm": 0.27109375,
"loss": 1.754687945842743,
"masked_top1": 36.862719764709475,
"masked_top5": 57.64240478515625,
"step": 750,
"top1": 77.98610443115234,
"top5": 90.91637084960938
},
{
"epoch": 0.24,
"grad_norm": 2.1866870928510544,
"learning_rate": 0.0001,
"loss": 1.7225,
"step": 800
},
{
"ce_loss": 1.7393132853507995,
"epoch": 0.24,
"inp_emb_norm": 0.27255859375,
"loss": 1.7393132853507995,
"masked_top1": 36.725569190979,
"masked_top5": 58.16348899841309,
"step": 800,
"top1": 78.34141525268555,
"top5": 91.05248138427734
},
{
"epoch": 0.26,
"grad_norm": 1.9045759228508574,
"learning_rate": 0.0001,
"loss": 1.7212,
"step": 850
},
{
"ce_loss": 1.7258725309371947,
"epoch": 0.26,
"inp_emb_norm": 0.27572265625,
"loss": 1.7258725309371947,
"masked_top1": 37.848323364257816,
"masked_top5": 58.37759643554688,
"step": 850,
"top1": 78.46119873046875,
"top5": 91.0196517944336
},
{
"epoch": 0.27,
"grad_norm": 2.0335701239987896,
"learning_rate": 0.0001,
"loss": 1.7209,
"step": 900
},
{
"ce_loss": 1.7411377978324891,
"epoch": 0.27,
"inp_emb_norm": 0.27140625,
"loss": 1.7411377978324891,
"masked_top1": 37.40238594055176,
"masked_top5": 58.47159057617188,
"step": 900,
"top1": 78.22477142333985,
"top5": 91.09677703857422
},
{
"epoch": 0.29,
"grad_norm": 1.7613813093122805,
"learning_rate": 0.0001,
"loss": 1.6991,
"step": 950
},
{
"ce_loss": 1.7078413605690002,
"epoch": 0.29,
"inp_emb_norm": 0.27345703125,
"loss": 1.7078413605690002,
"masked_top1": 37.8476549911499,
"masked_top5": 58.585325393676754,
"step": 950,
"top1": 78.66676864624023,
"top5": 91.18907363891601
},
{
"epoch": 0.3,
"grad_norm": 1.746493072421527,
"learning_rate": 0.0001,
"loss": 1.7029,
"step": 1000
},
{
"ce_loss": 1.6898603582382201,
"epoch": 0.3,
"inp_emb_norm": 0.275859375,
"loss": 1.6898603582382201,
"masked_top1": 38.61885482788086,
"masked_top5": 60.25848365783691,
"step": 1000,
"top1": 78.78071334838867,
"top5": 91.36259521484375
},
{
"epoch": 0.32,
"grad_norm": 1.9274436772647008,
"learning_rate": 0.0001,
"loss": 1.6826,
"step": 1050
},
{
"ce_loss": 1.68393492937088,
"epoch": 0.32,
"inp_emb_norm": 0.2751171875,
"loss": 1.68393492937088,
"masked_top1": 38.2902375793457,
"masked_top5": 59.33997863769531,
"step": 1050,
"top1": 78.7706575012207,
"top5": 91.35841613769531
},
{
"epoch": 0.33,
"grad_norm": 1.7855509766510127,
"learning_rate": 0.0001,
"loss": 1.6718,
"step": 1100
},
{
"ce_loss": 1.6736824607849121,
"epoch": 0.33,
"inp_emb_norm": 0.276640625,
"loss": 1.6736824607849121,
"masked_top1": 37.7931421661377,
"masked_top5": 59.2380428314209,
"step": 1100,
"top1": 78.76599548339844,
"top5": 91.49175521850586
},
{
"epoch": 0.35,
"grad_norm": 1.9167748999099827,
"learning_rate": 0.0001,
"loss": 1.6706,
"step": 1150
},
{
"ce_loss": 1.6615458488464356,
"epoch": 0.35,
"inp_emb_norm": 0.27826171875,
"loss": 1.6615458488464356,
"masked_top1": 37.01531482696533,
"masked_top5": 58.88155044555664,
"step": 1150,
"top1": 79.07698608398438,
"top5": 91.33328231811524
},
{
"epoch": 0.36,
"grad_norm": 1.7556147486850162,
"learning_rate": 0.0001,
"loss": 1.6474,
"step": 1200
},
{
"ce_loss": 1.6442325162887572,
"epoch": 0.36,
"inp_emb_norm": 0.28015625,
"loss": 1.6442325162887572,
"masked_top1": 38.984904556274415,
"masked_top5": 60.31360260009765,
"step": 1200,
"top1": 79.19950073242188,
"top5": 91.51022766113282
},
{
"epoch": 0.38,
"grad_norm": 2.0171772312366834,
"learning_rate": 0.0001,
"loss": 1.6388,
"step": 1250
},
{
"ce_loss": 1.5983400893211366,
"epoch": 0.38,
"inp_emb_norm": 0.2769921875,
"loss": 1.5983400893211366,
"masked_top1": 39.851169929504394,
"masked_top5": 60.379223403930666,
"step": 1250,
"top1": 79.51817199707031,
"top5": 91.8211392211914
},
{
"epoch": 0.39,
"grad_norm": 1.6538008290605235,
"learning_rate": 0.0001,
"loss": 1.6453,
"step": 1300
},
{
"ce_loss": 1.6439322781562806,
"epoch": 0.39,
"inp_emb_norm": 0.27740234375,
"loss": 1.6439322781562806,
"masked_top1": 38.07537261962891,
"masked_top5": 59.28539070129395,
"step": 1300,
"top1": 79.08166000366211,
"top5": 91.56427230834962
},
{
"epoch": 0.41,
"grad_norm": 1.6552303659425454,
"learning_rate": 0.0001,
"loss": 1.6314,
"step": 1350
},
{
"ce_loss": 1.6322034692764282,
"epoch": 0.41,
"inp_emb_norm": 0.2801953125,
"loss": 1.6322034692764282,
"masked_top1": 38.92374729156494,
"masked_top5": 60.256018524169924,
"step": 1350,
"top1": 79.35121063232423,
"top5": 91.62414169311523
},
{
"epoch": 0.42,
"grad_norm": 1.4892198882479661,
"learning_rate": 0.0001,
"loss": 1.6421,
"step": 1400
},
{
"ce_loss": 1.6463946628570556,
"epoch": 0.42,
"inp_emb_norm": 0.27763671875,
"loss": 1.6463946628570556,
"masked_top1": 37.44570774078369,
"masked_top5": 59.385975952148435,
"step": 1400,
"top1": 79.01243072509766,
"top5": 91.58177993774414
},
{
"epoch": 0.44,
"grad_norm": 1.5892663240327325,
"learning_rate": 0.0001,
"loss": 1.6319,
"step": 1450
},
{
"ce_loss": 1.622364718914032,
"epoch": 0.44,
"inp_emb_norm": 0.2815625,
"loss": 1.622364718914032,
"masked_top1": 37.010132484436035,
"masked_top5": 58.88730949401855,
"step": 1450,
"top1": 79.29409576416016,
"top5": 91.61080490112305
},
{
"epoch": 0.45,
"grad_norm": 1.5298220602979866,
"learning_rate": 0.0001,
"loss": 1.6288,
"step": 1500
},
{
"ce_loss": 1.672783522605896,
"epoch": 0.45,
"inp_emb_norm": 0.2782421875,
"loss": 1.672783522605896,
"masked_top1": 37.74873733520508,
"masked_top5": 59.34834083557129,
"step": 1500,
"top1": 78.78699188232422,
"top5": 91.44515747070312
},
{
"epoch": 0.47,
"grad_norm": 1.6154188043026179,
"learning_rate": 0.0001,
"loss": 1.6151,
"step": 1550
},
{
"ce_loss": 1.5997060227394104,
"epoch": 0.47,
"inp_emb_norm": 0.280390625,
"loss": 1.5997060227394104,
"masked_top1": 39.32766487121582,
"masked_top5": 60.36418632507324,
"step": 1550,
"top1": 79.40666061401367,
"top5": 91.93371612548827
},
{
"epoch": 0.48,
"grad_norm": 1.3771816353205768,
"learning_rate": 0.0001,
"loss": 1.6084,
"step": 1600
},
{
"ce_loss": 1.5898131847381591,
"epoch": 0.48,
"inp_emb_norm": 0.278984375,
"loss": 1.5898131847381591,
"masked_top1": 38.975818939208985,
"masked_top5": 60.515014724731444,
"step": 1600,
"top1": 79.69720977783203,
"top5": 91.98071060180663
},
{
"epoch": 0.5,
"grad_norm": 1.452663822317599,
"learning_rate": 0.0001,
"loss": 1.6124,
"step": 1650
},
{
"ce_loss": 1.6319751167297363,
"epoch": 0.5,
"inp_emb_norm": 0.28296875,
"loss": 1.6319751167297363,
"masked_top1": 39.17121643066406,
"masked_top5": 60.53929168701172,
"step": 1650,
"top1": 79.40355041503906,
"top5": 91.61957992553711
},
{
"epoch": 0.51,
"grad_norm": 1.6649862877803743,
"learning_rate": 0.0001,
"loss": 1.6168,
"step": 1700
},
{
"ce_loss": 1.6018091797828675,
"epoch": 0.51,
"inp_emb_norm": 0.2794921875,
"loss": 1.6018091797828675,
"masked_top1": 38.38370750427246,
"masked_top5": 60.53903656005859,
"step": 1700,
"top1": 79.40913070678711,
"top5": 91.92860076904297
},
{
"epoch": 0.53,
"grad_norm": 1.900108786790241,
"learning_rate": 0.0001,
"loss": 1.591,
"step": 1750
},
{
"ce_loss": 1.6045577454566955,
"epoch": 0.53,
"inp_emb_norm": 0.2825,
"loss": 1.6045577454566955,
"masked_top1": 38.33882637023926,
"masked_top5": 60.6797013092041,
"step": 1750,
"top1": 79.38134246826172,
"top5": 91.90715942382812
},
{
"epoch": 0.54,
"grad_norm": 1.669989934133695,
"learning_rate": 0.0001,
"loss": 1.5798,
"step": 1800
},
{
"ce_loss": 1.575341019630432,
"epoch": 0.54,
"inp_emb_norm": 0.2808984375,
"loss": 1.575341019630432,
"masked_top1": 39.37454727172852,
"masked_top5": 60.28605743408203,
"step": 1800,
"top1": 79.7763461303711,
"top5": 91.9139274597168
},
{
"epoch": 0.56,
"grad_norm": 1.4991127060714822,
"learning_rate": 0.0001,
"loss": 1.5848,
"step": 1850
},
{
"ce_loss": 1.5776564478874207,
"epoch": 0.56,
"inp_emb_norm": 0.2787890625,
"loss": 1.5776564478874207,
"masked_top1": 38.988163757324216,
"masked_top5": 59.69200141906738,
"step": 1850,
"top1": 79.65292083740235,
"top5": 91.98164306640625
},
{
"epoch": 0.57,
"grad_norm": 1.7116902678486716,
"learning_rate": 0.0001,
"loss": 1.5799,
"step": 1900
},
{
"ce_loss": 1.574492063522339,
"epoch": 0.57,
"inp_emb_norm": 0.282421875,
"loss": 1.574492063522339,
"masked_top1": 40.138608627319336,
"masked_top5": 61.31516136169434,
"step": 1900,
"top1": 79.69049621582032,
"top5": 92.10322280883788
},
{
"epoch": 0.59,
"grad_norm": 1.3150771431721024,
"learning_rate": 0.0001,
"loss": 1.5593,
"step": 1950
},
{
"ce_loss": 1.5305517101287842,
"epoch": 0.59,
"inp_emb_norm": 0.2823046875,
"loss": 1.5305517101287842,
"masked_top1": 40.4295878982544,
"masked_top5": 61.966970901489255,
"step": 1950,
"top1": 79.98332046508789,
"top5": 92.32930023193359
},
{
"epoch": 0.6,
"grad_norm": 6.707754405699673,
"learning_rate": 0.0001,
"loss": 1.5863,
"step": 2000
},
{
"ce_loss": 1.5903752088546752,
"epoch": 0.6,
"inp_emb_norm": 0.2828515625,
"loss": 1.5903752088546752,
"masked_top1": 39.76962215423584,
"masked_top5": 61.7191081237793,
"step": 2000,
"top1": 79.53595993041992,
"top5": 92.0239190673828
},
{
"epoch": 0.62,
"grad_norm": 1.452524089856983,
"learning_rate": 0.0001,
"loss": 1.5773,
"step": 2050
},
{
"ce_loss": 1.5838685631752014,
"epoch": 0.62,
"inp_emb_norm": 0.2877734375,
"loss": 1.5838685631752014,
"masked_top1": 38.40034454345703,
"masked_top5": 60.153098831176756,
"step": 2050,
"top1": 79.58445373535156,
"top5": 91.9414045715332
},
{
"epoch": 0.63,
"grad_norm": 1.5517366506652683,
"learning_rate": 0.0001,
"loss": 1.5928,
"step": 2100
},
{
"ce_loss": 1.6009895205497742,
"epoch": 0.63,
"inp_emb_norm": 0.282578125,
"loss": 1.6009895205497742,
"masked_top1": 38.17198055267334,
"masked_top5": 59.90890487670899,
"step": 2100,
"top1": 79.57786758422851,
"top5": 91.74088973999024
},
{
"epoch": 0.65,
"grad_norm": 1.4696024906496812,
"learning_rate": 0.0001,
"loss": 1.5785,
"step": 2150
},
{
"ce_loss": 1.585225760936737,
"epoch": 0.65,
"inp_emb_norm": 0.28974609375,
"loss": 1.585225760936737,
"masked_top1": 39.15816062927246,
"masked_top5": 61.218558883666994,
"step": 2150,
"top1": 79.77144348144532,
"top5": 91.92766723632812
},
{
"epoch": 0.66,
"grad_norm": 1.4286648572662997,
"learning_rate": 0.0001,
"loss": 1.5676,
"step": 2200
},
{
"ce_loss": 1.5558442735671998,
"epoch": 0.66,
"inp_emb_norm": 0.2856640625,
"loss": 1.5558442735671998,
"masked_top1": 39.56588050842285,
"masked_top5": 61.39894416809082,
"step": 2200,
"top1": 79.9505500793457,
"top5": 92.23053100585938
},
{
"epoch": 0.68,
"grad_norm": 1.3408463458478854,
"learning_rate": 0.0001,
"loss": 1.5536,
"step": 2250
},
{
"ce_loss": 1.5132439875602721,
"epoch": 0.68,
"inp_emb_norm": 0.289609375,
"loss": 1.5132439875602721,
"masked_top1": 39.436230506896976,
"masked_top5": 61.361060333251956,
"step": 2250,
"top1": 80.31327835083007,
"top5": 92.35198455810547
},
{
"epoch": 0.69,
"grad_norm": 1.4789295388492802,
"learning_rate": 0.0001,
"loss": 1.5519,
"step": 2300
},
{
"ce_loss": 1.52454154253006,
"epoch": 0.69,
"inp_emb_norm": 0.2912109375,
"loss": 1.52454154253006,
"masked_top1": 39.47452976226807,
"masked_top5": 61.6866958618164,
"step": 2300,
"top1": 80.14813919067383,
"top5": 92.37121780395508
},
{
"epoch": 0.71,
"grad_norm": 1.3877519817560113,
"learning_rate": 0.0001,
"loss": 1.5542,
"step": 2350
},
{
"ce_loss": 1.5439094185829163,
"epoch": 0.71,
"inp_emb_norm": 0.29296875,
"loss": 1.5439094185829163,
"masked_top1": 38.476121711730954,
"masked_top5": 61.2806697845459,
"step": 2350,
"top1": 80.00807098388673,
"top5": 92.20030227661132
},
{
"epoch": 0.72,
"grad_norm": 1.534164522196606,
"learning_rate": 0.0001,
"loss": 1.5517,
"step": 2400
},
{
"ce_loss": 1.545496084690094,
"epoch": 0.72,
"inp_emb_norm": 0.291328125,
"loss": 1.545496084690094,
"masked_top1": 38.46687156677246,
"masked_top5": 61.44874328613281,
"step": 2400,
"top1": 79.93062316894532,
"top5": 92.19858032226563
},
{
"epoch": 0.74,
"grad_norm": 1.4019229482522402,
"learning_rate": 0.0001,
"loss": 1.5453,
"step": 2450
},
{
"ce_loss": 1.5521028304100037,
"epoch": 0.74,
"inp_emb_norm": 0.288125,
"loss": 1.5521028304100037,
"masked_top1": 40.88698040008545,
"masked_top5": 63.060478515625,
"step": 2450,
"top1": 79.97427581787109,
"top5": 92.30514190673829
},
{
"epoch": 0.75,
"grad_norm": 1.3288444515563334,
"learning_rate": 0.0001,
"loss": 1.5446,
"step": 2500
},
{
"ce_loss": 1.5541789364814758,
"epoch": 0.75,
"inp_emb_norm": 0.2872265625,
"loss": 1.5541789364814758,
"masked_top1": 40.732691040039064,
"masked_top5": 62.54548332214355,
"step": 2500,
"top1": 79.8016665649414,
"top5": 92.2079689025879
},
{
"epoch": 0.77,
"grad_norm": 1.2949056410682664,
"learning_rate": 0.0001,
"loss": 1.523,
"step": 2550
},
{
"ce_loss": 1.524161262512207,
"epoch": 0.77,
"inp_emb_norm": 0.2938671875,
"loss": 1.524161262512207,
"masked_top1": 39.2896460723877,
"masked_top5": 61.28740455627442,
"step": 2550,
"top1": 80.2130502319336,
"top5": 92.26818771362305
},
{
"epoch": 0.78,
"grad_norm": 1.376548356881607,
"learning_rate": 0.0001,
"loss": 1.5247,
"step": 2600
},
{
"ce_loss": 1.5120180606842042,
"epoch": 0.78,
"inp_emb_norm": 0.2912109375,
"loss": 1.5120180606842042,
"masked_top1": 40.32475761413574,
"masked_top5": 62.50459335327148,
"step": 2600,
"top1": 80.25805755615234,
"top5": 92.3787159729004
},
{
"epoch": 0.8,
"grad_norm": 1.446462800296954,
"learning_rate": 0.0001,
"loss": 1.5244,
"step": 2650
},
{
"ce_loss": 1.4864131617546081,
"epoch": 0.8,
"inp_emb_norm": 0.2887890625,
"loss": 1.4864131617546081,
"masked_top1": 41.43366180419922,
"masked_top5": 63.38628890991211,
"step": 2650,
"top1": 80.42174575805664,
"top5": 92.73545196533203
},
{
"epoch": 0.81,
"grad_norm": 1.5930817573922351,
"learning_rate": 0.0001,
"loss": 1.5321,
"step": 2700
},
{
"ce_loss": 1.5146442604064942,
"epoch": 0.81,
"inp_emb_norm": 0.29212890625,
"loss": 1.5146442604064942,
"masked_top1": 40.67323059082031,
"masked_top5": 63.30255233764648,
"step": 2700,
"top1": 80.21553787231446,
"top5": 92.49449493408203
},
{
"epoch": 0.83,
"grad_norm": 1.3154781061914393,
"learning_rate": 0.0001,
"loss": 1.5258,
"step": 2750
},
{
"ce_loss": 1.5432595777511597,
"epoch": 0.83,
"inp_emb_norm": 0.2902734375,
"loss": 1.5432595777511597,
"masked_top1": 40.71655921936035,
"masked_top5": 63.31653228759765,
"step": 2750,
"top1": 79.94630706787109,
"top5": 92.41331008911133
},
{
"epoch": 0.84,
"grad_norm": 1.4988480825688437,
"learning_rate": 0.0001,
"loss": 1.529,
"step": 2800
},
{
"ce_loss": 1.5197454857826234,
"epoch": 0.84,
"inp_emb_norm": 0.295234375,
"loss": 1.5197454857826234,
"masked_top1": 40.75451274871826,
"masked_top5": 63.20060585021972,
"step": 2800,
"top1": 80.25330627441406,
"top5": 92.41782470703124
},
{
"epoch": 0.86,
"grad_norm": 1.3953163621955647,
"learning_rate": 0.0001,
"loss": 1.5201,
"step": 2850
},
{
"ce_loss": 1.5061662626266479,
"epoch": 0.86,
"inp_emb_norm": 0.294375,
"loss": 1.5061662626266479,
"masked_top1": 41.16034164428711,
"masked_top5": 63.11306884765625,
"step": 2850,
"top1": 80.41659545898438,
"top5": 92.50812393188477
},
{
"epoch": 0.87,
"grad_norm": 1.3361207443145617,
"learning_rate": 0.0001,
"loss": 1.5185,
"step": 2900
},
{
"ce_loss": 1.48851407289505,
"epoch": 0.87,
"inp_emb_norm": 0.29296875,
"loss": 1.48851407289505,
"masked_top1": 41.136603927612306,
"masked_top5": 63.02009063720703,
"step": 2900,
"top1": 80.65627792358399,
"top5": 92.53072723388672
},
{
"epoch": 0.89,
"grad_norm": 1.2528196866924233,
"learning_rate": 0.0001,
"loss": 1.5154,
"step": 2950
},
{
"ce_loss": 1.511202063560486,
"epoch": 0.89,
"inp_emb_norm": 0.298671875,
"loss": 1.511202063560486,
"masked_top1": 39.16858932495117,
"masked_top5": 61.53038063049316,
"step": 2950,
"top1": 80.2845379638672,
"top5": 92.39456756591797
},
{
"epoch": 0.9,
"grad_norm": 1.4560813011403748,
"learning_rate": 0.0001,
"loss": 1.5365,
"step": 3000
},
{
"ce_loss": 1.5491148686408998,
"epoch": 0.9,
"inp_emb_norm": 0.297578125,
"loss": 1.5491148686408998,
"masked_top1": 40.18224685668945,
"masked_top5": 62.69388824462891,
"step": 3000,
"top1": 79.84650588989258,
"top5": 92.17584533691407
},
{
"epoch": 0.92,
"grad_norm": 1.2474903057292217,
"learning_rate": 0.0001,
"loss": 1.5079,
"step": 3050
},
{
"ce_loss": 1.510890085697174,
"epoch": 0.92,
"inp_emb_norm": 0.289453125,
"loss": 1.510890085697174,
"masked_top1": 41.284685897827146,
"masked_top5": 63.54399559020996,
"step": 3050,
"top1": 80.19790802001953,
"top5": 92.62230575561523
},
{
"epoch": 0.93,
"grad_norm": 1.4413477522514346,
"learning_rate": 0.0001,
"loss": 1.5008,
"step": 3100
},
{
"ce_loss": 1.505354859828949,
"epoch": 0.93,
"inp_emb_norm": 0.294140625,
"loss": 1.505354859828949,
"masked_top1": 40.861399726867674,
"masked_top5": 62.37717430114746,
"step": 3100,
"top1": 80.46924255371094,
"top5": 92.5052066040039
},
{
"epoch": 0.95,
"grad_norm": 1.359782937496998,
"learning_rate": 0.0001,
"loss": 1.4983,
"step": 3150
},
{
"ce_loss": 1.5110413646697998,
"epoch": 0.95,
"inp_emb_norm": 0.2980859375,
"loss": 1.5110413646697998,
"masked_top1": 42.160480155944825,
"masked_top5": 64.15993942260742,
"step": 3150,
"top1": 80.45739974975587,
"top5": 92.550869140625
},
{
"epoch": 0.96,
"grad_norm": 1.3582356861036782,
"learning_rate": 0.0001,
"loss": 1.5034,
"step": 3200
},
{
"ce_loss": 1.530618577003479,
"epoch": 0.96,
"inp_emb_norm": 0.29375,
"loss": 1.530618577003479,
"masked_top1": 38.9424352645874,
"masked_top5": 62.109956970214846,
"step": 3200,
"top1": 80.0709977722168,
"top5": 92.2861149597168
},
{
"epoch": 0.98,
"grad_norm": 2.5802552823497455,
"learning_rate": 0.0001,
"loss": 1.4946,
"step": 3250
},
{
"ce_loss": 1.477068486213684,
"epoch": 0.98,
"inp_emb_norm": 0.2976953125,
"loss": 1.477068486213684,
"masked_top1": 40.51057823181152,
"masked_top5": 63.397703323364254,
"step": 3250,
"top1": 80.50700180053711,
"top5": 92.68867340087891
},
{
"epoch": 0.99,
"grad_norm": 1.4547325164409564,
"learning_rate": 0.0001,
"loss": 1.5084,
"step": 3300
},
{
"ce_loss": 1.4906554532051086,
"epoch": 0.99,
"inp_emb_norm": 0.2973828125,
"loss": 1.4906554532051086,
"masked_top1": 40.54425048828125,
"masked_top5": 63.201170196533205,
"step": 3300,
"top1": 80.40962783813477,
"top5": 92.66117584228516
},
{
"epoch": 1.01,
"grad_norm": 1.211911337284348,
"learning_rate": 0.0001,
"loss": 1.3955,
"step": 3350
},
{
"ce_loss": 1.400897753238678,
"epoch": 1.01,
"inp_emb_norm": 0.3021875,
"loss": 1.400897753238678,
"masked_top1": 41.17978542327881,
"masked_top5": 63.33616775512695,
"step": 3350,
"top1": 81.19734100341798,
"top5": 93.10294403076172
},
{
"epoch": 1.02,
"grad_norm": 1.0903282402796672,
"learning_rate": 0.0001,
"loss": 1.2893,
"step": 3400
},
{
"ce_loss": 1.2788537430763245,
"epoch": 1.02,
"inp_emb_norm": 0.2989453125,
"loss": 1.2788537430763245,
"masked_top1": 40.09818046569824,
"masked_top5": 62.5667342376709,
"step": 3400,
"top1": 82.30339096069336,
"top5": 93.88133010864257
},
{
"epoch": 1.04,
"grad_norm": 1.5240484766936855,
"learning_rate": 0.0001,
"loss": 1.2964,
"step": 3450
},
{
"ce_loss": 1.303318452835083,
"epoch": 1.04,
"inp_emb_norm": 0.29703125,
"loss": 1.303318452835083,
"masked_top1": 42.162941246032716,
"masked_top5": 65.02184768676757,
"step": 3450,
"top1": 82.02912399291992,
"top5": 93.88583358764649
},
{
"epoch": 1.05,
"grad_norm": 1.3787785745538395,
"learning_rate": 0.0001,
"loss": 1.2852,
"step": 3500
},
{
"ce_loss": 1.276361472606659,
"epoch": 1.05,
"inp_emb_norm": 0.3021875,
"loss": 1.276361472606659,
"masked_top1": 41.940252990722655,
"masked_top5": 65.38093276977538,
"step": 3500,
"top1": 82.44789840698242,
"top5": 93.89565292358398
},
{
"epoch": 1.07,
"grad_norm": 1.3842728564490252,
"learning_rate": 0.0001,
"loss": 1.2978,
"step": 3550
},
{
"ce_loss": 1.3330717968940735,
"epoch": 1.07,
"inp_emb_norm": 0.29859375,
"loss": 1.3330717968940735,
"masked_top1": 41.01281318664551,
"masked_top5": 63.77486457824707,
"step": 3550,
"top1": 81.81988937377929,
"top5": 93.57796142578125
},
{
"epoch": 1.08,
"grad_norm": 1.2631108386132899,
"learning_rate": 0.0001,
"loss": 1.2931,
"step": 3600
},
{
"ce_loss": 1.2983617568016053,
"epoch": 1.08,
"inp_emb_norm": 0.2962109375,
"loss": 1.2983617568016053,
"masked_top1": 41.40194427490234,
"masked_top5": 64.76402061462403,
"step": 3600,
"top1": 82.0794775390625,
"top5": 93.74165817260742
},
{
"epoch": 1.1,
"grad_norm": 1.4399254065091216,
"learning_rate": 0.0001,
"loss": 1.3091,
"step": 3650
},
{
"ce_loss": 1.3062031197547912,
"epoch": 1.1,
"inp_emb_norm": 0.2998046875,
"loss": 1.3062031197547912,
"masked_top1": 40.85140724182129,
"masked_top5": 63.72927604675293,
"step": 3650,
"top1": 82.13722595214844,
"top5": 93.64862121582031
},
{
"epoch": 1.11,
"grad_norm": 1.2311319185042409,
"learning_rate": 0.0001,
"loss": 1.3088,
"step": 3700
},
{
"ce_loss": 1.2959584522247314,
"epoch": 1.11,
"inp_emb_norm": 0.304296875,
"loss": 1.2959584522247314,
"masked_top1": 41.39893436431885,
"masked_top5": 63.62430404663086,
"step": 3700,
"top1": 82.33803344726563,
"top5": 93.6450895690918
},
{
"epoch": 1.13,
"grad_norm": 1.1358156745617467,
"learning_rate": 0.0001,
"loss": 1.3131,
"step": 3750
},
{
"ce_loss": 1.29275639295578,
"epoch": 1.13,
"inp_emb_norm": 0.3049609375,
"loss": 1.29275639295578,
"masked_top1": 42.64406570434571,
"masked_top5": 65.42192581176758,
"step": 3750,
"top1": 82.30059600830079,
"top5": 93.80231704711915
},
{
"epoch": 1.14,
"grad_norm": 1.1800593283969816,
"learning_rate": 0.0001,
"loss": 1.2969,
"step": 3800
},
{
"ce_loss": 1.301748011112213,
"epoch": 1.14,
"inp_emb_norm": 0.2975390625,
"loss": 1.301748011112213,
"masked_top1": 41.66663146972656,
"masked_top5": 64.09541954040527,
"step": 3800,
"top1": 82.05701431274414,
"top5": 93.72319717407227
},
{
"epoch": 1.16,
"grad_norm": 1.850155610052983,
"learning_rate": 0.0001,
"loss": 1.3153,
"step": 3850
},
{
"ce_loss": 1.2972828006744386,
"epoch": 1.16,
"inp_emb_norm": 0.3021875,
"loss": 1.2972828006744386,
"masked_top1": 40.78549217224121,
"masked_top5": 64.16069633483886,
"step": 3850,
"top1": 82.19980209350587,
"top5": 93.7464599609375
},
{
"epoch": 1.17,
"grad_norm": 1.2715739238746637,
"learning_rate": 0.0001,
"loss": 1.2995,
"step": 3900
},
{
"ce_loss": 1.3045063495635987,
"epoch": 1.17,
"inp_emb_norm": 0.3047265625,
"loss": 1.3045063495635987,
"masked_top1": 40.932076416015626,
"masked_top5": 64.15639678955078,
"step": 3900,
"top1": 82.09319473266602,
"top5": 93.77517272949218
},
{
"epoch": 1.19,
"grad_norm": 1.3796703227080724,
"learning_rate": 0.0001,
"loss": 1.3071,
"step": 3950
},
{
"ce_loss": 1.3098111128807068,
"epoch": 1.19,
"inp_emb_norm": 0.297109375,
"loss": 1.3098111128807068,
"masked_top1": 41.91473709106445,
"masked_top5": 64.43979515075684,
"step": 3950,
"top1": 81.8616471862793,
"top5": 93.78232650756836
},
{
"epoch": 1.2,
"grad_norm": 1.1546364550490216,
"learning_rate": 0.0001,
"loss": 1.3098,
"step": 4000
},
{
"ce_loss": 1.3013187646865845,
"epoch": 1.2,
"inp_emb_norm": 0.30296875,
"loss": 1.3013187646865845,
"masked_top1": 41.120027198791504,
"masked_top5": 63.108745346069334,
"step": 4000,
"top1": 82.13422164916992,
"top5": 93.6265623474121
},
{
"epoch": 1.22,
"grad_norm": 1.2515420301071278,
"learning_rate": 0.0001,
"loss": 1.3027,
"step": 4050
},
{
"ce_loss": 1.2935965037345887,
"epoch": 1.22,
"inp_emb_norm": 0.3031640625,
"loss": 1.2935965037345887,
"masked_top1": 40.63400650024414,
"masked_top5": 63.75268486022949,
"step": 4050,
"top1": 82.2015396118164,
"top5": 93.79516159057617
},
{
"epoch": 1.23,
"grad_norm": 1.1524816264565314,
"learning_rate": 0.0001,
"loss": 1.3,
"step": 4100
},
{
"ce_loss": 1.3123349785804748,
"epoch": 1.23,
"inp_emb_norm": 0.3012109375,
"loss": 1.3123349785804748,
"masked_top1": 42.19605297088623,
"masked_top5": 65.04543876647949,
"step": 4100,
"top1": 81.9914030456543,
"top5": 93.81402328491211
},
{
"epoch": 1.25,
"grad_norm": 1.378666189570675,
"learning_rate": 0.0001,
"loss": 1.3045,
"step": 4150
},
{
"ce_loss": 1.2990161776542664,
"epoch": 1.25,
"inp_emb_norm": 0.305859375,
"loss": 1.2990161776542664,
"masked_top1": 41.78075637817383,
"masked_top5": 64.68202613830566,
"step": 4150,
"top1": 82.17454803466796,
"top5": 93.76477783203126
},
{
"epoch": 1.26,
"grad_norm": 1.3993699556629227,
"learning_rate": 0.0001,
"loss": 1.3014,
"step": 4200
},
{
"ce_loss": 1.2723517334461212,
"epoch": 1.26,
"inp_emb_norm": 0.301796875,
"loss": 1.2723517334461212,
"masked_top1": 43.205936431884766,
"masked_top5": 65.13977348327637,
"step": 4200,
"top1": 82.52856887817383,
"top5": 93.89131042480469
},
{
"epoch": 1.28,
"grad_norm": 1.3745855928185973,
"learning_rate": 0.0001,
"loss": 1.3151,
"step": 4250
},
{
"ce_loss": 1.3126947474479675,
"epoch": 1.28,
"inp_emb_norm": 0.3046484375,
"loss": 1.3126947474479675,
"masked_top1": 40.87171413421631,
"masked_top5": 63.66306259155274,
"step": 4250,
"top1": 82.01153930664063,
"top5": 93.64672576904297
},
{
"epoch": 1.29,
"grad_norm": 1.479893407422574,
"learning_rate": 0.0001,
"loss": 1.3213,
"step": 4300
},
{
"ce_loss": 1.3277541399002075,
"epoch": 1.29,
"inp_emb_norm": 0.302421875,
"loss": 1.3277541399002075,
"masked_top1": 41.169107818603514,
"masked_top5": 64.09851593017578,
"step": 4300,
"top1": 81.798203125,
"top5": 93.67467819213867
},
{
"epoch": 1.31,
"grad_norm": 1.4172322167184916,
"learning_rate": 0.0001,
"loss": 1.3112,
"step": 4350
},
{
"ce_loss": 1.3031212973594666,
"epoch": 1.31,
"inp_emb_norm": 0.306484375,
"loss": 1.3031212973594666,
"masked_top1": 41.52652729034424,
"masked_top5": 64.15341407775878,
"step": 4350,
"top1": 82.03983184814453,
"top5": 93.67420471191406
},
{
"epoch": 1.32,
"grad_norm": 1.2760472814321302,
"learning_rate": 0.0001,
"loss": 1.3131,
"step": 4400
},
{
"ce_loss": 1.308469491004944,
"epoch": 1.32,
"inp_emb_norm": 0.3057421875,
"loss": 1.308469491004944,
"masked_top1": 41.37114768981934,
"masked_top5": 63.93113624572754,
"step": 4400,
"top1": 82.07672714233398,
"top5": 93.6951513671875
},
{
"epoch": 1.34,
"grad_norm": 1.2334433655289787,
"learning_rate": 0.0001,
"loss": 1.3036,
"step": 4450
},
{
"ce_loss": 1.3185024070739746,
"epoch": 1.34,
"inp_emb_norm": 0.30171875,
"loss": 1.3185024070739746,
"masked_top1": 42.539853439331054,
"masked_top5": 65.19357299804688,
"step": 4450,
"top1": 82.02122940063477,
"top5": 93.74457550048828
},
{
"epoch": 1.35,
"grad_norm": 1.2156796466751323,
"learning_rate": 0.0001,
"loss": 1.3109,
"step": 4500
},
{
"ce_loss": 1.2963746500015259,
"epoch": 1.35,
"inp_emb_norm": 0.30984375,
"loss": 1.2963746500015259,
"masked_top1": 42.78518562316894,
"masked_top5": 65.02521774291992,
"step": 4500,
"top1": 82.2968830871582,
"top5": 93.7428224182129
},
{
"epoch": 1.37,
"grad_norm": 1.311745495017629,
"learning_rate": 0.0001,
"loss": 1.3115,
"step": 4550
},
{
"ce_loss": 1.2924715709686279,
"epoch": 1.37,
"inp_emb_norm": 0.303828125,
"loss": 1.2924715709686279,
"masked_top1": 42.53660099029541,
"masked_top5": 65.59047889709473,
"step": 4550,
"top1": 82.1043424987793,
"top5": 93.91824569702149
},
{
"epoch": 1.38,
"grad_norm": 1.2174333958038526,
"learning_rate": 0.0001,
"loss": 1.3201,
"step": 4600
},
{
"ce_loss": 1.3174702334403992,
"epoch": 1.38,
"inp_emb_norm": 0.30796875,
"loss": 1.3174702334403992,
"masked_top1": 42.07423233032227,
"masked_top5": 64.41791015625,
"step": 4600,
"top1": 81.94120040893554,
"top5": 93.70007461547851
},
{
"epoch": 1.4,
"grad_norm": 1.1701397235812094,
"learning_rate": 0.0001,
"loss": 1.3085,
"step": 4650
},
{
"ce_loss": 1.3130900907516478,
"epoch": 1.4,
"inp_emb_norm": 0.303203125,
"loss": 1.3130900907516478,
"masked_top1": 41.166784133911136,
"masked_top5": 64.41526512145997,
"step": 4650,
"top1": 81.96283096313476,
"top5": 93.76418426513672
},
{
"epoch": 1.41,
"grad_norm": 1.1778250928748137,
"learning_rate": 0.0001,
"loss": 1.3236,
"step": 4700
},
{
"ce_loss": 1.3132509183883667,
"epoch": 1.41,
"inp_emb_norm": 0.3076171875,
"loss": 1.3132509183883667,
"masked_top1": 41.547097778320314,
"masked_top5": 63.82028350830078,
"step": 4700,
"top1": 82.07340423583985,
"top5": 93.64304718017578
},
{
"epoch": 1.43,
"grad_norm": 1.3130038776611517,
"learning_rate": 0.0001,
"loss": 1.2884,
"step": 4750
},
{
"ce_loss": 1.2915071487426757,
"epoch": 1.43,
"inp_emb_norm": 0.30640625,
"loss": 1.2915071487426757,
"masked_top1": 41.55844184875488,
"masked_top5": 64.31948181152343,
"step": 4750,
"top1": 82.19200744628907,
"top5": 93.80693008422851
},
{
"epoch": 1.44,
"grad_norm": 1.3523241369731542,
"learning_rate": 0.0001,
"loss": 1.3229,
"step": 4800
},
{
"ce_loss": 1.3281687498092651,
"epoch": 1.44,
"inp_emb_norm": 0.3083203125,
"loss": 1.3281687498092651,
"masked_top1": 41.508040008544924,
"masked_top5": 63.831429901123045,
"step": 4800,
"top1": 81.83387313842773,
"top5": 93.61951248168945
},
{
"epoch": 1.46,
"grad_norm": 1.2229550505786297,
"learning_rate": 0.0001,
"loss": 1.313,
"step": 4850
},
{
"ce_loss": 1.321110601425171,
"epoch": 1.46,
"inp_emb_norm": 0.30484375,
"loss": 1.321110601425171,
"masked_top1": 42.07623374938965,
"masked_top5": 64.01099380493164,
"step": 4850,
"top1": 82.04196990966797,
"top5": 93.63945693969727
},
{
"epoch": 1.47,
"grad_norm": 1.291149342311876,
"learning_rate": 0.0001,
"loss": 1.3056,
"step": 4900
},
{
"ce_loss": 1.306683280467987,
"epoch": 1.47,
"inp_emb_norm": 0.3062890625,
"loss": 1.306683280467987,
"masked_top1": 42.63887153625488,
"masked_top5": 65.17818214416504,
"step": 4900,
"top1": 82.0475244140625,
"top5": 93.72082290649413
},
{
"epoch": 1.49,
"grad_norm": 1.114074399282948,
"learning_rate": 0.0001,
"loss": 1.2953,
"step": 4950
},
{
"ce_loss": 1.2916775250434875,
"epoch": 1.49,
"inp_emb_norm": 0.3067578125,
"loss": 1.2916775250434875,
"masked_top1": 43.01071895599365,
"masked_top5": 64.40318168640137,
"step": 4950,
"top1": 82.25905914306641,
"top5": 93.81284698486328
},
{
"epoch": 1.5,
"grad_norm": 1.1428006346267754,
"learning_rate": 0.0001,
"loss": 1.3132,
"step": 5000
},
{
"ce_loss": 1.3006132817268372,
"epoch": 1.5,
"inp_emb_norm": 0.305546875,
"loss": 1.3006132817268372,
"masked_top1": 43.18012409210205,
"masked_top5": 65.12184883117676,
"step": 5000,
"top1": 82.29021392822266,
"top5": 93.76517501831054
},
{
"epoch": 1.52,
"grad_norm": 1.1188850908108916,
"learning_rate": 0.0001,
"loss": 1.3097,
"step": 5050
},
{
"ce_loss": 1.3087879872322083,
"epoch": 1.52,
"inp_emb_norm": 0.3089453125,
"loss": 1.3087879872322083,
"masked_top1": 43.01914245605469,
"masked_top5": 65.29854652404785,
"step": 5050,
"top1": 82.07856109619141,
"top5": 93.74372894287109
},
{
"epoch": 1.53,
"grad_norm": 1.2100791577553864,
"learning_rate": 0.0001,
"loss": 1.313,
"step": 5100
},
{
"ce_loss": 1.3196745228767395,
"epoch": 1.53,
"inp_emb_norm": 0.30875,
"loss": 1.3196745228767395,
"masked_top1": 42.13117530822754,
"masked_top5": 65.02122192382812,
"step": 5100,
"top1": 81.9853271484375,
"top5": 93.69662155151367
},
{
"epoch": 1.55,
"grad_norm": 1.2111638230324686,
"learning_rate": 0.0001,
"loss": 1.3151,
"step": 5150
},
{
"ce_loss": 1.3195432901382447,
"epoch": 1.55,
"inp_emb_norm": 0.306171875,
"loss": 1.3195432901382447,
"masked_top1": 42.28558715820313,
"masked_top5": 65.25620643615723,
"step": 5150,
"top1": 81.9179295349121,
"top5": 93.7562042236328
},
{
"epoch": 1.56,
"grad_norm": 1.2354610192482536,
"learning_rate": 0.0001,
"loss": 1.3048,
"step": 5200
},
{
"ce_loss": 1.3142172384262085,
"epoch": 1.56,
"inp_emb_norm": 0.3107421875,
"loss": 1.3142172384262085,
"masked_top1": 41.06179321289063,
"masked_top5": 64.15951232910156,
"step": 5200,
"top1": 81.83832885742187,
"top5": 93.69693267822265
},
{
"epoch": 1.58,
"grad_norm": 1.2603393329014463,
"learning_rate": 0.0001,
"loss": 1.3077,
"step": 5250
},
{
"ce_loss": 1.2882971096038818,
"epoch": 1.58,
"inp_emb_norm": 0.3098046875,
"loss": 1.2882971096038818,
"masked_top1": 42.656227684021,
"masked_top5": 65.82235458374024,
"step": 5250,
"top1": 82.23510848999024,
"top5": 93.8615916442871
},
{
"epoch": 1.59,
"grad_norm": 1.1492020065892152,
"learning_rate": 0.0001,
"loss": 1.3037,
"step": 5300
},
{
"ce_loss": 1.312100157737732,
"epoch": 1.59,
"inp_emb_norm": 0.3079296875,
"loss": 1.312100157737732,
"masked_top1": 42.46005714416504,
"masked_top5": 65.23145439147949,
"step": 5300,
"top1": 81.96127456665039,
"top5": 93.77029266357422
},
{
"epoch": 1.61,
"grad_norm": 1.1456518826496218,
"learning_rate": 0.0001,
"loss": 1.3066,
"step": 5350
},
{
"ce_loss": 1.3056522703170776,
"epoch": 1.61,
"inp_emb_norm": 0.3083203125,
"loss": 1.3056522703170776,
"masked_top1": 43.074807510375976,
"masked_top5": 65.66509185791016,
"step": 5350,
"top1": 82.01446243286132,
"top5": 93.83800003051758
},
{
"epoch": 1.62,
"grad_norm": 1.382906237118969,
"learning_rate": 0.0001,
"loss": 1.2847,
"step": 5400
},
{
"ce_loss": 1.2701800346374512,
"epoch": 1.62,
"inp_emb_norm": 0.3112109375,
"loss": 1.2701800346374512,
"masked_top1": 43.46398132324219,
"masked_top5": 65.93353149414062,
"step": 5400,
"top1": 82.44269760131836,
"top5": 94.06136520385742
},
{
"epoch": 1.64,
"grad_norm": 1.250578306210753,
"learning_rate": 0.0001,
"loss": 1.3039,
"step": 5450
},
{
"ce_loss": 1.3234626388549804,
"epoch": 1.64,
"inp_emb_norm": 0.3078515625,
"loss": 1.3234626388549804,
"masked_top1": 42.220664520263675,
"masked_top5": 64.67328666687011,
"step": 5450,
"top1": 81.84058303833008,
"top5": 93.60615112304687
},
{
"epoch": 1.65,
"grad_norm": 1.1603834612617525,
"learning_rate": 0.0001,
"loss": 1.3015,
"step": 5500
},
{
"ce_loss": 1.2987056183815002,
"epoch": 1.65,
"inp_emb_norm": 0.3158203125,
"loss": 1.2987056183815002,
"masked_top1": 42.56543678283691,
"masked_top5": 65.94853828430176,
"step": 5500,
"top1": 82.10978652954101,
"top5": 93.85393478393554
},
{
"epoch": 1.67,
"grad_norm": 1.1963477888896916,
"learning_rate": 0.0001,
"loss": 1.3118,
"step": 5550
},
{
"ce_loss": 1.306938099861145,
"epoch": 1.67,
"inp_emb_norm": 0.3089453125,
"loss": 1.306938099861145,
"masked_top1": 42.22477603912353,
"masked_top5": 64.90145042419434,
"step": 5550,
"top1": 81.91244842529296,
"top5": 93.82112915039062
},
{
"epoch": 1.68,
"grad_norm": 1.2265886595466722,
"learning_rate": 0.0001,
"loss": 1.2965,
"step": 5600
},
{
"ce_loss": 1.289056396484375,
"epoch": 1.68,
"inp_emb_norm": 0.3178515625,
"loss": 1.289056396484375,
"masked_top1": 41.55989253997803,
"masked_top5": 64.25195945739746,
"step": 5600,
"top1": 82.382109375,
"top5": 93.70501174926758
},
{
"epoch": 1.7,
"grad_norm": 1.131601169964127,
"learning_rate": 0.0001,
"loss": 1.2911,
"step": 5650
},
{
"ce_loss": 1.2798267722129821,
"epoch": 1.7,
"inp_emb_norm": 0.3168359375,
"loss": 1.2798267722129821,
"masked_top1": 42.326548461914065,
"masked_top5": 65.07690460205077,
"step": 5650,
"top1": 82.33992919921874,
"top5": 93.87046478271485
},
{
"epoch": 1.71,
"grad_norm": 1.1181947342973821,
"learning_rate": 0.0001,
"loss": 1.3145,
"step": 5700
},
{
"ce_loss": 1.31481609582901,
"epoch": 1.71,
"inp_emb_norm": 0.3159375,
"loss": 1.31481609582901,
"masked_top1": 42.50676147460938,
"masked_top5": 65.8423821258545,
"step": 5700,
"top1": 82.06524475097656,
"top5": 93.59629486083985
},
{
"epoch": 1.73,
"grad_norm": 1.1098764034001067,
"learning_rate": 0.0001,
"loss": 1.311,
"step": 5750
},
{
"ce_loss": 1.3209831523895263,
"epoch": 1.73,
"inp_emb_norm": 0.310390625,
"loss": 1.3209831523895263,
"masked_top1": 42.35572410583496,
"masked_top5": 65.26163505554199,
"step": 5750,
"top1": 81.91262344360352,
"top5": 93.68317199707032
},
{
"epoch": 1.74,
"grad_norm": 1.2409622007508851,
"learning_rate": 0.0001,
"loss": 1.3131,
"step": 5800
},
{
"ce_loss": 1.303259253501892,
"epoch": 1.74,
"inp_emb_norm": 0.311015625,
"loss": 1.303259253501892,
"masked_top1": 42.99045387268066,
"masked_top5": 65.50553886413574,
"step": 5800,
"top1": 82.13337188720703,
"top5": 93.76351516723633
},
{
"epoch": 1.76,
"grad_norm": 1.3030028695341598,
"learning_rate": 0.0001,
"loss": 1.3,
"step": 5850
},
{
"ce_loss": 1.290860595703125,
"epoch": 1.76,
"inp_emb_norm": 0.32109375,
"loss": 1.290860595703125,
"masked_top1": 42.378893280029295,
"masked_top5": 65.07022903442383,
"step": 5850,
"top1": 82.35479309082031,
"top5": 93.71083343505859
},
{
"epoch": 1.77,
"grad_norm": 1.2030263310548106,
"learning_rate": 0.0001,
"loss": 1.3148,
"step": 5900
},
{
"ce_loss": 1.3233295631408692,
"epoch": 1.77,
"inp_emb_norm": 0.3166015625,
"loss": 1.3233295631408692,
"masked_top1": 42.24487545013428,
"masked_top5": 65.1656477355957,
"step": 5900,
"top1": 82.03902984619141,
"top5": 93.61334854125977
},
{
"epoch": 1.79,
"grad_norm": 1.5803953753939939,
"learning_rate": 0.0001,
"loss": 1.3041,
"step": 5950
},
{
"ce_loss": 1.2845279669761658,
"epoch": 1.79,
"inp_emb_norm": 0.3218359375,
"loss": 1.2845279669761658,
"masked_top1": 42.71968803405762,
"masked_top5": 65.51119926452637,
"step": 5950,
"top1": 82.2425944519043,
"top5": 93.84527893066407
},
{
"epoch": 1.8,
"grad_norm": 1.1986406141845714,
"learning_rate": 0.0001,
"loss": 1.303,
"step": 6000
},
{
"ce_loss": 1.2704100012779236,
"epoch": 1.8,
"inp_emb_norm": 0.317734375,
"loss": 1.2704100012779236,
"masked_top1": 43.65462882995605,
"masked_top5": 65.84256935119629,
"step": 6000,
"top1": 82.45276733398437,
"top5": 93.91838714599609
},
{
"epoch": 1.82,
"grad_norm": 1.1276854151713849,
"learning_rate": 0.0001,
"loss": 1.3039,
"step": 6050
},
{
"ce_loss": 1.289050838947296,
"epoch": 1.82,
"inp_emb_norm": 0.3146875,
"loss": 1.289050838947296,
"masked_top1": 41.38638572692871,
"masked_top5": 64.90666145324707,
"step": 6050,
"top1": 82.1436050415039,
"top5": 93.8117251586914
},
{
"epoch": 1.83,
"grad_norm": 1.1636926009728485,
"learning_rate": 0.0001,
"loss": 1.3029,
"step": 6100
},
{
"ce_loss": 1.3035223054885865,
"epoch": 1.83,
"inp_emb_norm": 0.3187109375,
"loss": 1.3035223054885865,
"masked_top1": 42.37709861755371,
"masked_top5": 65.33909133911133,
"step": 6100,
"top1": 82.11933166503906,
"top5": 93.78050659179688
},
{
"epoch": 1.85,
"grad_norm": 1.0626794881896964,
"learning_rate": 0.0001,
"loss": 1.3138,
"step": 6150
},
{
"ce_loss": 1.317355580329895,
"epoch": 1.85,
"inp_emb_norm": 0.31890625,
"loss": 1.317355580329895,
"masked_top1": 41.86831642150879,
"masked_top5": 64.59691291809082,
"step": 6150,
"top1": 81.94907608032227,
"top5": 93.67014266967773
},
{
"epoch": 1.86,
"grad_norm": 1.1005541859171093,
"learning_rate": 0.0001,
"loss": 1.3041,
"step": 6200
},
{
"ce_loss": 1.3219546675682068,
"epoch": 1.86,
"inp_emb_norm": 0.3172265625,
"loss": 1.3219546675682068,
"masked_top1": 42.3734455871582,
"masked_top5": 65.90324760437012,
"step": 6200,
"top1": 81.97321441650391,
"top5": 93.62858337402344
},
{
"epoch": 1.88,
"grad_norm": 1.1522186435584791,
"learning_rate": 0.0001,
"loss": 1.2963,
"step": 6250
},
{
"ce_loss": 1.2990228199958802,
"epoch": 1.88,
"inp_emb_norm": 0.3144921875,
"loss": 1.2990228199958802,
"masked_top1": 42.138801422119144,
"masked_top5": 65.49508239746093,
"step": 6250,
"top1": 82.0102586364746,
"top5": 93.90199996948242
},
{
"epoch": 1.89,
"grad_norm": 1.2927059160480363,
"learning_rate": 0.0001,
"loss": 1.3017,
"step": 6300
},
{
"ce_loss": 1.300498881340027,
"epoch": 1.89,
"inp_emb_norm": 0.316640625,
"loss": 1.300498881340027,
"masked_top1": 42.56178199768066,
"masked_top5": 65.6432763671875,
"step": 6300,
"top1": 82.12412170410157,
"top5": 93.7817707824707
},
{
"epoch": 1.91,
"grad_norm": 1.168341408260434,
"learning_rate": 0.0001,
"loss": 1.3029,
"step": 6350
},
{
"ce_loss": 1.3083948111534118,
"epoch": 1.91,
"inp_emb_norm": 0.311328125,
"loss": 1.3083948111534118,
"masked_top1": 42.55179992675781,
"masked_top5": 65.3431551361084,
"step": 6350,
"top1": 81.94397857666016,
"top5": 93.8497378540039
},
{
"epoch": 1.92,
"grad_norm": 1.2323945648312147,
"learning_rate": 0.0001,
"loss": 1.3104,
"step": 6400
},
{
"ce_loss": 1.3191473126411437,
"epoch": 1.92,
"inp_emb_norm": 0.3194921875,
"loss": 1.3191473126411437,
"masked_top1": 41.97941291809082,
"masked_top5": 65.11552169799805,
"step": 6400,
"top1": 81.95802749633789,
"top5": 93.71688690185547
},
{
"epoch": 1.94,
"grad_norm": 1.0969474547631315,
"learning_rate": 0.0001,
"loss": 1.3033,
"step": 6450
},
{
"ce_loss": 1.2988893008232116,
"epoch": 1.94,
"inp_emb_norm": 0.3175,
"loss": 1.2988893008232116,
"masked_top1": 42.91837085723877,
"masked_top5": 65.42203193664551,
"step": 6450,
"top1": 82.20126846313477,
"top5": 93.793828125
},
{
"epoch": 1.95,
"grad_norm": 1.0855603866035775,
"learning_rate": 0.0001,
"loss": 1.3051,
"step": 6500
},
{
"ce_loss": 1.3110007953643799,
"epoch": 1.95,
"inp_emb_norm": 0.32046875,
"loss": 1.3110007953643799,
"masked_top1": 42.34694427490234,
"masked_top5": 65.08651733398438,
"step": 6500,
"top1": 81.95626449584961,
"top5": 93.71604049682617
},
{
"epoch": 1.97,
"grad_norm": 1.201042651000794,
"learning_rate": 0.0001,
"loss": 1.3041,
"step": 6550
},
{
"ce_loss": 1.3072884845733643,
"epoch": 1.97,
"inp_emb_norm": 0.316484375,
"loss": 1.3072884845733643,
"masked_top1": 42.01487628936768,
"masked_top5": 65.12509132385254,
"step": 6550,
"top1": 81.91355926513671,
"top5": 93.84131042480469
},
{
"epoch": 1.98,
"grad_norm": 1.2375042360574775,
"learning_rate": 0.0001,
"loss": 1.3,
| "step": 6600 | |
| }, | |
| { | |
| "ce_loss": 1.3039724278450011, | |
| "epoch": 1.98, | |
| "inp_emb_norm": 0.318515625, | |
| "loss": 1.3039724278450011, | |
| "masked_top1": 42.9154284286499, | |
| "masked_top5": 66.03591972351074, | |
| "step": 6600, | |
| "top1": 82.26277542114258, | |
| "top5": 93.73072570800781 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.1513818080655263, | |
| "learning_rate": 0.0001, | |
| "loss": 1.3037, | |
| "step": 6650 | |
| }, | |
| { | |
| "ce_loss": 1.2931387114524842, | |
| "epoch": 2.0, | |
| "inp_emb_norm": 0.318046875, | |
| "loss": 1.2931387114524842, | |
| "masked_top1": 41.40272514343262, | |
| "masked_top5": 64.34992462158203, | |
| "step": 6650, | |
| "top1": 82.18499603271485, | |
| "top5": 93.81835754394531 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 1.0510501665579786, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0038, | |
| "step": 6700 | |
| }, | |
| { | |
| "ce_loss": 0.9978207111358642, | |
| "epoch": 2.02, | |
| "inp_emb_norm": 0.3219140625, | |
| "loss": 0.9978207111358642, | |
| "masked_top1": 45.336004638671874, | |
| "masked_top5": 69.27368041992187, | |
| "step": 6700, | |
| "top1": 85.59730255126954, | |
| "top5": 95.6523501586914 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 1.119371103964124, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0094, | |
| "step": 6750 | |
| }, | |
| { | |
| "ce_loss": 1.0062808072566987, | |
| "epoch": 2.03, | |
| "inp_emb_norm": 0.324375, | |
| "loss": 1.0062808072566987, | |
| "masked_top1": 44.80854190826416, | |
| "masked_top5": 67.70930862426758, | |
| "step": 6750, | |
| "top1": 85.45523956298828, | |
| "top5": 95.46897216796874 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 1.0111394168332695, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9973, | |
| "step": 6800 | |
| }, | |
| { | |
| "ce_loss": 0.9945477271080017, | |
| "epoch": 2.05, | |
| "inp_emb_norm": 0.32734375, | |
| "loss": 0.9945477271080017, | |
| "masked_top1": 46.81753234863281, | |
| "masked_top5": 70.2586336517334, | |
| "step": 6800, | |
| "top1": 85.69498947143555, | |
| "top5": 95.61528350830078 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 1.146035850759827, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0172, | |
| "step": 6850 | |
| }, | |
| { | |
| "ce_loss": 1.0215596628189088, | |
| "epoch": 2.06, | |
| "inp_emb_norm": 0.3225390625, | |
| "loss": 1.0215596628189088, | |
| "masked_top1": 43.64241020202637, | |
| "masked_top5": 67.47051399230958, | |
| "step": 6850, | |
| "top1": 85.29426712036133, | |
| "top5": 95.40342254638672 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.1308686916013488, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0143, | |
| "step": 6900 | |
| }, | |
| { | |
| "ce_loss": 1.008236768245697, | |
| "epoch": 2.08, | |
| "inp_emb_norm": 0.322109375, | |
| "loss": 1.008236768245697, | |
| "masked_top1": 44.6266674041748, | |
| "masked_top5": 68.65714111328126, | |
| "step": 6900, | |
| "top1": 85.34050277709962, | |
| "top5": 95.54212951660156 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 1.1042254336729025, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0111, | |
| "step": 6950 | |
| }, | |
| { | |
| "ce_loss": 1.0106354761123657, | |
| "epoch": 2.09, | |
| "inp_emb_norm": 0.322734375, | |
| "loss": 1.0106354761123657, | |
| "masked_top1": 44.90856178283691, | |
| "masked_top5": 69.18769927978515, | |
| "step": 6950, | |
| "top1": 85.31204833984376, | |
| "top5": 95.51878021240235 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 1.1432085643116041, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0137, | |
| "step": 7000 | |
| }, | |
| { | |
| "ce_loss": 1.0164246666431427, | |
| "epoch": 2.11, | |
| "inp_emb_norm": 0.3241015625, | |
| "loss": 1.0164246666431427, | |
| "masked_top1": 45.27058769226074, | |
| "masked_top5": 69.0199755859375, | |
| "step": 7000, | |
| "top1": 85.29512222290039, | |
| "top5": 95.44889572143555 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 1.0502533687083666, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0498, | |
| "step": 7050 | |
| }, | |
| { | |
| "ce_loss": 1.0437222492694855, | |
| "epoch": 2.12, | |
| "inp_emb_norm": 0.3234765625, | |
| "loss": 1.0437222492694855, | |
| "masked_top1": 45.688305435180666, | |
| "masked_top5": 68.60405799865723, | |
| "step": 7050, | |
| "top1": 85.19064407348633, | |
| "top5": 95.48901702880859 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 1.065003821728329, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0199, | |
| "step": 7100 | |
| }, | |
| { | |
| "ce_loss": 1.01140921831131, | |
| "epoch": 2.14, | |
| "inp_emb_norm": 0.3250390625, | |
| "loss": 1.01140921831131, | |
| "masked_top1": 45.559912796020505, | |
| "masked_top5": 69.64316802978516, | |
| "step": 7100, | |
| "top1": 85.34976013183594, | |
| "top5": 95.51665420532227 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 1.1735568530678606, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0166, | |
| "step": 7150 | |
| }, | |
| { | |
| "ce_loss": 1.0293829572200774, | |
| "epoch": 2.15, | |
| "inp_emb_norm": 0.323671875, | |
| "loss": 1.0293829572200774, | |
| "masked_top1": 44.797666244506836, | |
| "masked_top5": 69.08728958129883, | |
| "step": 7150, | |
| "top1": 85.07948059082031, | |
| "top5": 95.50323318481445 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 1.22336128717078, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0263, | |
| "step": 7200 | |
| }, | |
| { | |
| "ce_loss": 1.0348090195655824, | |
| "epoch": 2.17, | |
| "inp_emb_norm": 0.3307421875, | |
| "loss": 1.0348090195655824, | |
| "masked_top1": 43.274717559814455, | |
| "masked_top5": 68.35926612854004, | |
| "step": 7200, | |
| "top1": 85.03092895507812, | |
| "top5": 95.42227752685547 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 1.1254790539219672, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0341, | |
| "step": 7250 | |
| }, | |
| { | |
| "ce_loss": 1.028915911912918, | |
| "epoch": 2.18, | |
| "inp_emb_norm": 0.3247265625, | |
| "loss": 1.028915911912918, | |
| "masked_top1": 45.49935554504395, | |
| "masked_top5": 69.03593551635743, | |
| "step": 7250, | |
| "top1": 85.14698638916016, | |
| "top5": 95.36694305419923 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 1.1149477539183639, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0412, | |
| "step": 7300 | |
| }, | |
| { | |
| "ce_loss": 1.0536164796352387, | |
| "epoch": 2.2, | |
| "inp_emb_norm": 0.3236328125, | |
| "loss": 1.0536164796352387, | |
| "masked_top1": 43.858817138671874, | |
| "masked_top5": 67.21836029052734, | |
| "step": 7300, | |
| "top1": 84.86703170776367, | |
| "top5": 95.25842987060547 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 1.1968292628857302, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0432, | |
| "step": 7350 | |
| }, | |
| { | |
| "ce_loss": 1.0520641374588013, | |
| "epoch": 2.21, | |
| "inp_emb_norm": 0.3211328125, | |
| "loss": 1.0520641374588013, | |
| "masked_top1": 44.41142387390137, | |
| "masked_top5": 67.9876936340332, | |
| "step": 7350, | |
| "top1": 84.89357849121093, | |
| "top5": 95.34237762451171 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 1.1298025119403752, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0285, | |
| "step": 7400 | |
| }, | |
| { | |
| "ce_loss": 1.0238622641563415, | |
| "epoch": 2.23, | |
| "inp_emb_norm": 0.33328125, | |
| "loss": 1.0238622641563415, | |
| "masked_top1": 46.15579338073731, | |
| "masked_top5": 70.0232991027832, | |
| "step": 7400, | |
| "top1": 85.30123580932617, | |
| "top5": 95.43901489257813 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.0250128601399684, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0317, | |
| "step": 7450 | |
| }, | |
| { | |
| "ce_loss": 1.024322179555893, | |
| "epoch": 2.24, | |
| "inp_emb_norm": 0.3326953125, | |
| "loss": 1.024322179555893, | |
| "masked_top1": 45.176987152099606, | |
| "masked_top5": 69.09153121948242, | |
| "step": 7450, | |
| "top1": 85.22158447265625, | |
| "top5": 95.37797592163086 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 1.1668834452965842, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0344, | |
| "step": 7500 | |
| }, | |
| { | |
| "ce_loss": 1.0320839881896973, | |
| "epoch": 2.26, | |
| "inp_emb_norm": 0.3268359375, | |
| "loss": 1.0320839881896973, | |
| "masked_top1": 45.62026954650879, | |
| "masked_top5": 69.47045196533203, | |
| "step": 7500, | |
| "top1": 85.13791748046874, | |
| "top5": 95.46679916381837 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 1.1647845511661612, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0364, | |
| "step": 7550 | |
| }, | |
| { | |
| "ce_loss": 1.040896817445755, | |
| "epoch": 2.27, | |
| "inp_emb_norm": 0.3287890625, | |
| "loss": 1.040896817445755, | |
| "masked_top1": 45.61882129669189, | |
| "masked_top5": 68.993677444458, | |
| "step": 7550, | |
| "top1": 84.89491455078125, | |
| "top5": 95.34945602416992 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 1.070994325965457, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0481, | |
| "step": 7600 | |
| }, | |
| { | |
| "ce_loss": 1.045055913925171, | |
| "epoch": 2.29, | |
| "inp_emb_norm": 0.3358203125, | |
| "loss": 1.045055913925171, | |
| "masked_top1": 43.71815933227539, | |
| "masked_top5": 68.03006439208984, | |
| "step": 7600, | |
| "top1": 84.92118545532226, | |
| "top5": 95.33861541748047 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 1.0302755529017995, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0468, | |
| "step": 7650 | |
| }, | |
| { | |
| "ce_loss": 1.029969446659088, | |
| "epoch": 2.3, | |
| "inp_emb_norm": 0.3273046875, | |
| "loss": 1.029969446659088, | |
| "masked_top1": 44.84774971008301, | |
| "masked_top5": 68.51262855529785, | |
| "step": 7650, | |
| "top1": 85.12003936767579, | |
| "top5": 95.39368621826172 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.2124331568257196, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0462, | |
| "step": 7700 | |
| }, | |
| { | |
| "ce_loss": 1.0300623905658721, | |
| "epoch": 2.32, | |
| "inp_emb_norm": 0.32671875, | |
| "loss": 1.0300623905658721, | |
| "masked_top1": 46.18773262023926, | |
| "masked_top5": 69.55521690368653, | |
| "step": 7700, | |
| "top1": 85.05798110961913, | |
| "top5": 95.45936019897461 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 1.25935837166321, | |
| "learning_rate": 0.0001, | |
| "loss": 1.051, | |
| "step": 7750 | |
| }, | |
| { | |
| "ce_loss": 1.0560246324539184, | |
| "epoch": 2.33, | |
| "inp_emb_norm": 0.32921875, | |
| "loss": 1.0560246324539184, | |
| "masked_top1": 44.81505111694336, | |
| "masked_top5": 68.60199340820313, | |
| "step": 7750, | |
| "top1": 84.81520401000977, | |
| "top5": 95.27306564331055 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 1.1490451145708855, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0459, | |
| "step": 7800 | |
| }, | |
| { | |
| "ce_loss": 1.0482890462875367, | |
| "epoch": 2.35, | |
| "inp_emb_norm": 0.3268359375, | |
| "loss": 1.0482890462875367, | |
| "masked_top1": 44.35726287841797, | |
| "masked_top5": 68.64975059509277, | |
| "step": 7800, | |
| "top1": 84.82445663452148, | |
| "top5": 95.42289993286133 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 1.1334989027761242, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0441, | |
| "step": 7850 | |
| }, | |
| { | |
| "ce_loss": 1.0527094066143037, | |
| "epoch": 2.36, | |
| "inp_emb_norm": 0.3269140625, | |
| "loss": 1.0527094066143037, | |
| "masked_top1": 45.072478713989256, | |
| "masked_top5": 68.39160514831543, | |
| "step": 7850, | |
| "top1": 84.785078125, | |
| "top5": 95.35190139770508 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 1.1375627332151101, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0565, | |
| "step": 7900 | |
| }, | |
| { | |
| "ce_loss": 1.0542543601989747, | |
| "epoch": 2.38, | |
| "inp_emb_norm": 0.331328125, | |
| "loss": 1.0542543601989747, | |
| "masked_top1": 43.75662239074707, | |
| "masked_top5": 67.66206184387207, | |
| "step": 7900, | |
| "top1": 84.7607080078125, | |
| "top5": 95.31730087280273 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 1.1403758501967678, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0445, | |
| "step": 7950 | |
| }, | |
| { | |
| "ce_loss": 1.0483984065055847, | |
| "epoch": 2.39, | |
| "inp_emb_norm": 0.32671875, | |
| "loss": 1.0483984065055847, | |
| "masked_top1": 45.24047119140625, | |
| "masked_top5": 68.86077049255371, | |
| "step": 7950, | |
| "top1": 84.88309677124023, | |
| "top5": 95.3291700744629 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 1.039302857763206, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0581, | |
| "step": 8000 | |
| }, | |
| { | |
| "ce_loss": 1.0635139977931976, | |
| "epoch": 2.41, | |
| "inp_emb_norm": 0.3339453125, | |
| "loss": 1.0635139977931976, | |
| "masked_top1": 43.78837112426758, | |
| "masked_top5": 67.08759162902832, | |
| "step": 8000, | |
| "top1": 84.79178085327149, | |
| "top5": 95.13488723754882 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 1.1910955097950056, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0522, | |
| "step": 8050 | |
| }, | |
| { | |
| "ce_loss": 1.0587849020957947, | |
| "epoch": 2.42, | |
| "inp_emb_norm": 0.3301171875, | |
| "loss": 1.0587849020957947, | |
| "masked_top1": 44.429280014038085, | |
| "masked_top5": 67.39429237365722, | |
| "step": 8050, | |
| "top1": 84.87328887939454, | |
| "top5": 95.1887158203125 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 1.1479506270149336, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0536, | |
| "step": 8100 | |
| }, | |
| { | |
| "ce_loss": 1.0473863470554352, | |
| "epoch": 2.44, | |
| "inp_emb_norm": 0.3341015625, | |
| "loss": 1.0473863470554352, | |
| "masked_top1": 44.48332893371582, | |
| "masked_top5": 68.14752388000488, | |
| "step": 8100, | |
| "top1": 84.92030502319336, | |
| "top5": 95.26301513671875 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 1.1375038249327807, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0419, | |
| "step": 8150 | |
| }, | |
| { | |
| "ce_loss": 1.0687260043621063, | |
| "epoch": 2.45, | |
| "inp_emb_norm": 0.3335546875, | |
| "loss": 1.0687260043621063, | |
| "masked_top1": 43.23715843200684, | |
| "masked_top5": 67.02041564941406, | |
| "step": 8150, | |
| "top1": 84.57525085449218, | |
| "top5": 95.16051864624023 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 1.1378089695981268, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0485, | |
| "step": 8200 | |
| }, | |
| { | |
| "ce_loss": 1.0508419513702392, | |
| "epoch": 2.47, | |
| "inp_emb_norm": 0.3319921875, | |
| "loss": 1.0508419513702392, | |
| "masked_top1": 45.14276206970215, | |
| "masked_top5": 69.22940170288086, | |
| "step": 8200, | |
| "top1": 84.89724472045899, | |
| "top5": 95.38407836914062 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 1.134415072768399, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0587, | |
| "step": 8250 | |
| }, | |
| { | |
| "ce_loss": 1.0560647177696227, | |
| "epoch": 2.48, | |
| "inp_emb_norm": 0.33328125, | |
| "loss": 1.0560647177696227, | |
| "masked_top1": 44.99800594329834, | |
| "masked_top5": 67.96685562133788, | |
| "step": 8250, | |
| "top1": 84.84181884765626, | |
| "top5": 95.19639511108399 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 1.0130905993003134, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0622, | |
| "step": 8300 | |
| }, | |
| { | |
| "ce_loss": 1.0628444683551788, | |
| "epoch": 2.5, | |
| "inp_emb_norm": 0.335390625, | |
| "loss": 1.0628444683551788, | |
| "masked_top1": 43.72650085449219, | |
| "masked_top5": 67.85165229797363, | |
| "step": 8300, | |
| "top1": 84.71982971191406, | |
| "top5": 95.2232145690918 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 1.1112686440386919, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0624, | |
| "step": 8350 | |
| }, | |
| { | |
| "ce_loss": 1.0606860029697418, | |
| "epoch": 2.51, | |
| "inp_emb_norm": 0.3302734375, | |
| "loss": 1.0606860029697418, | |
| "masked_top1": 45.08112854003906, | |
| "masked_top5": 68.54836326599121, | |
| "step": 8350, | |
| "top1": 84.70668167114258, | |
| "top5": 95.34720886230468 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 1.0929741266435504, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0653, | |
| "step": 8400 | |
| }, | |
| { | |
| "ce_loss": 1.0685135400295258, | |
| "epoch": 2.53, | |
| "inp_emb_norm": 0.329296875, | |
| "loss": 1.0685135400295258, | |
| "masked_top1": 44.93409217834473, | |
| "masked_top5": 68.15225112915039, | |
| "step": 8400, | |
| "top1": 84.59507186889648, | |
| "top5": 95.2771678161621 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 1.0671443258127058, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0661, | |
| "step": 8450 | |
| }, | |
| { | |
| "ce_loss": 1.074057730436325, | |
| "epoch": 2.54, | |
| "inp_emb_norm": 0.334140625, | |
| "loss": 1.074057730436325, | |
| "masked_top1": 43.86976951599121, | |
| "masked_top5": 67.98081314086915, | |
| "step": 8450, | |
| "top1": 84.60137985229493, | |
| "top5": 95.1078970336914 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 1.2357962610955344, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0625, | |
| "step": 8500 | |
| }, | |
| { | |
| "ce_loss": 1.0734054052829742, | |
| "epoch": 2.56, | |
| "inp_emb_norm": 0.3264453125, | |
| "loss": 1.0734054052829742, | |
| "masked_top1": 43.74296424865722, | |
| "masked_top5": 67.63478965759278, | |
| "step": 8500, | |
| "top1": 84.40524276733399, | |
| "top5": 95.29552383422852 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 1.185708692105678, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0543, | |
| "step": 8550 | |
| }, | |
| { | |
| "ce_loss": 1.0622280275821685, | |
| "epoch": 2.57, | |
| "inp_emb_norm": 0.3312890625, | |
| "loss": 1.0622280275821685, | |
| "masked_top1": 45.68028434753418, | |
| "masked_top5": 68.61516632080078, | |
| "step": 8550, | |
| "top1": 84.72150787353516, | |
| "top5": 95.2423046875 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 1.124889127351682, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0598, | |
| "step": 8600 | |
| }, | |
| { | |
| "ce_loss": 1.0730221366882324, | |
| "epoch": 2.59, | |
| "inp_emb_norm": 0.3349609375, | |
| "loss": 1.0730221366882324, | |
| "masked_top1": 44.15586044311524, | |
| "masked_top5": 67.7581484222412, | |
| "step": 8600, | |
| "top1": 84.60388778686523, | |
| "top5": 95.18492584228515 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 1.1470911071720453, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0718, | |
| "step": 8650 | |
| }, | |
| { | |
| "ce_loss": 1.0610903584957123, | |
| "epoch": 2.6, | |
| "inp_emb_norm": 0.3320703125, | |
| "loss": 1.0610903584957123, | |
| "masked_top1": 44.21772804260254, | |
| "masked_top5": 67.78637229919434, | |
| "step": 8650, | |
| "top1": 84.74846099853515, | |
| "top5": 95.21728775024414 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 1.120071012609436, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0598, | |
| "step": 8700 | |
| }, | |
| { | |
| "ce_loss": 1.057754340171814, | |
| "epoch": 2.62, | |
| "inp_emb_norm": 0.334140625, | |
| "loss": 1.057754340171814, | |
| "masked_top1": 44.56280632019043, | |
| "masked_top5": 68.16851516723632, | |
| "step": 8700, | |
| "top1": 84.76306396484375, | |
| "top5": 95.24030334472656 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 1.1990678496151557, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0698, | |
| "step": 8750 | |
| }, | |
| { | |
| "ce_loss": 1.0554494428634644, | |
| "epoch": 2.63, | |
| "inp_emb_norm": 0.3276953125, | |
| "loss": 1.0554494428634644, | |
| "masked_top1": 45.60402565002441, | |
| "masked_top5": 69.8213597869873, | |
| "step": 8750, | |
| "top1": 84.74255416870118, | |
| "top5": 95.37602157592774 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 1.1691422333586243, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0711, | |
| "step": 8800 | |
| }, | |
| { | |
| "ce_loss": 1.0723666751384735, | |
| "epoch": 2.65, | |
| "inp_emb_norm": 0.3325390625, | |
| "loss": 1.0723666751384735, | |
| "masked_top1": 44.78216484069824, | |
| "masked_top5": 68.43072723388671, | |
| "step": 8800, | |
| "top1": 84.51954879760743, | |
| "top5": 95.22681289672852 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 1.058087631068196, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0736, | |
| "step": 8850 | |
| }, | |
| { | |
| "ce_loss": 1.0746560537815093, | |
| "epoch": 2.66, | |
| "inp_emb_norm": 0.329140625, | |
| "loss": 1.0746560537815093, | |
| "masked_top1": 44.328994064331056, | |
| "masked_top5": 67.38568382263183, | |
| "step": 8850, | |
| "top1": 84.52036026000977, | |
| "top5": 95.2164730834961 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 1.1288621853196346, | |
| "learning_rate": 0.0001, | |
| "loss": 1.069, | |
| "step": 8900 | |
| }, | |
| { | |
| "ce_loss": 1.075651180744171, | |
| "epoch": 2.68, | |
| "inp_emb_norm": 0.334765625, | |
| "loss": 1.075651180744171, | |
| "masked_top1": 45.41173538208008, | |
| "masked_top5": 68.17863861083984, | |
| "step": 8900, | |
| "top1": 84.6201480102539, | |
| "top5": 95.15124877929688 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 1.14695178467833, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0671, | |
| "step": 8950 | |
| }, | |
| { | |
| "ce_loss": 1.0754395532608032, | |
| "epoch": 2.69, | |
| "inp_emb_norm": 0.334453125, | |
| "loss": 1.0754395532608032, | |
| "masked_top1": 45.43799507141113, | |
| "masked_top5": 69.10365425109863, | |
| "step": 8950, | |
| "top1": 84.61126815795899, | |
| "top5": 95.2543717956543 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 1.131019351092537, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0679, | |
| "step": 9000 | |
| }, | |
| { | |
| "ce_loss": 1.0691665148735046, | |
| "epoch": 2.71, | |
| "inp_emb_norm": 0.33484375, | |
| "loss": 1.0691665148735046, | |
| "masked_top1": 44.43798561096192, | |
| "masked_top5": 67.93657341003419, | |
| "step": 9000, | |
| "top1": 84.68964065551758, | |
| "top5": 95.20655715942382 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 1.120170919000501, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0722, | |
| "step": 9050 | |
| }, | |
| { | |
| "ce_loss": 1.0739211070537567, | |
| "epoch": 2.72, | |
| "inp_emb_norm": 0.3396484375, | |
| "loss": 1.0739211070537567, | |
| "masked_top1": 44.42457763671875, | |
| "masked_top5": 67.35458564758301, | |
| "step": 9050, | |
| "top1": 84.60804443359375, | |
| "top5": 95.0638233947754 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 1.0687255304897059, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0792, | |
| "step": 9100 | |
| }, | |
| { | |
| "ce_loss": 1.0771595978736876, | |
| "epoch": 2.74, | |
| "inp_emb_norm": 0.3378125, | |
| "loss": 1.0771595978736876, | |
| "masked_top1": 43.902629165649415, | |
| "masked_top5": 67.31045616149902, | |
| "step": 9100, | |
| "top1": 84.62485046386719, | |
| "top5": 95.07604125976563 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 1.0993761131067057, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0813, | |
| "step": 9150 | |
| }, | |
| { | |
| "ce_loss": 1.082298024892807, | |
| "epoch": 2.75, | |
| "inp_emb_norm": 0.334609375, | |
| "loss": 1.082298024892807, | |
| "masked_top1": 45.80146224975586, | |
| "masked_top5": 68.17116020202637, | |
| "step": 9150, | |
| "top1": 84.43293472290038, | |
| "top5": 95.1411996459961 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 1.109498165414777, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0752, | |
| "step": 9200 | |
| }, | |
| { | |
| "ce_loss": 1.0997070169448853, | |
| "epoch": 2.77, | |
| "inp_emb_norm": 0.33890625, | |
| "loss": 1.0997070169448853, | |
| "masked_top1": 44.238784561157225, | |
| "masked_top5": 67.47504318237304, | |
| "step": 9200, | |
| "top1": 84.3786474609375, | |
| "top5": 94.96468109130859 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 1.0947772030405678, | |
| "learning_rate": 0.0001, | |
| "loss": 1.081, | |
| "step": 9250 | |
| }, | |
| { | |
| "ce_loss": 1.066500049829483, | |
| "epoch": 2.78, | |
| "inp_emb_norm": 0.3352734375, | |
| "loss": 1.066500049829483, | |
| "masked_top1": 45.10762840270996, | |
| "masked_top5": 68.68643852233886, | |
| "step": 9250, | |
| "top1": 84.62079177856445, | |
| "top5": 95.23117553710938 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.9904651287047559, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0769, | |
| "step": 9300 | |
| }, | |
| { | |
| "ce_loss": 1.0865390312671661, | |
| "epoch": 2.8, | |
| "inp_emb_norm": 0.3416015625, | |
| "loss": 1.0865390312671661, | |
| "masked_top1": 43.63430931091309, | |
| "masked_top5": 67.33321548461915, | |
| "step": 9300, | |
| "top1": 84.4200732421875, | |
| "top5": 95.05102798461914 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 1.115318864267355, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0716, | |
| "step": 9350 | |
| }, | |
| { | |
| "ce_loss": 1.05960639834404, | |
| "epoch": 2.81, | |
| "inp_emb_norm": 0.3376171875, | |
| "loss": 1.05960639834404, | |
| "masked_top1": 45.203971252441406, | |
| "masked_top5": 67.83310653686523, | |
| "step": 9350, | |
| "top1": 84.70579071044922, | |
| "top5": 95.13749114990235 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 1.0681394549186096, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0755, | |
| "step": 9400 | |
| }, | |
| { | |
| "ce_loss": 1.0759823191165925, | |
| "epoch": 2.83, | |
| "inp_emb_norm": 0.335234375, | |
| "loss": 1.0759823191165925, | |
| "masked_top1": 45.2706275177002, | |
| "masked_top5": 68.85594253540039, | |
| "step": 9400, | |
| "top1": 84.58239471435547, | |
| "top5": 95.29377487182617 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 1.1612375451754464, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0704, | |
| "step": 9450 | |
| }, | |
| { | |
| "ce_loss": 1.0630818378925324, | |
| "epoch": 2.84, | |
| "inp_emb_norm": 0.33828125, | |
| "loss": 1.0630818378925324, | |
| "masked_top1": 45.41359436035156, | |
| "masked_top5": 67.90339347839355, | |
| "step": 9450, | |
| "top1": 84.69526199340821, | |
| "top5": 95.18430023193359 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 1.0053255791083329, | |
| "learning_rate": 0.0001, | |
| "loss": 1.075, | |
| "step": 9500 | |
| }, | |
| { | |
| "ce_loss": 1.0855333960056306, | |
| "epoch": 2.86, | |
| "inp_emb_norm": 0.332578125, | |
| "loss": 1.0855333960056306, | |
| "masked_top1": 43.72627799987793, | |
| "masked_top5": 67.27571357727051, | |
| "step": 9500, | |
| "top1": 84.34697372436523, | |
| "top5": 95.10883621215821 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 1.1782915911734662, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0865, | |
| "step": 9550 | |
| }, | |
| { | |
| "ce_loss": 1.083924981355667, | |
| "epoch": 2.87, | |
| "inp_emb_norm": 0.3425390625, | |
| "loss": 1.083924981355667, | |
| "masked_top1": 44.98833938598633, | |
| "masked_top5": 68.43323631286621, | |
| "step": 9550, | |
| "top1": 84.45411026000977, | |
| "top5": 95.07992599487305 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 0.9304520640714639, | |
| "learning_rate": 0.0001, | |
| "loss": 1.083, | |
| "step": 9600 | |
| }, | |
| { | |
| "ce_loss": 1.0927222657203675, | |
| "epoch": 2.89, | |
| "inp_emb_norm": 0.3383984375, | |
| "loss": 1.0927222657203675, | |
| "masked_top1": 44.10447273254395, | |
| "masked_top5": 67.51412483215331, | |
| "step": 9600, | |
| "top1": 84.38162017822266, | |
| "top5": 94.919228515625 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.1207152238553202, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0748, | |
| "step": 9650 | |
| }, | |
| { | |
| "ce_loss": 1.0789397644996643, | |
| "epoch": 2.9, | |
| "inp_emb_norm": 0.338671875, | |
| "loss": 1.0789397644996643, | |
| "masked_top1": 44.84839416503906, | |
| "masked_top5": 67.80728317260743, | |
| "step": 9650, | |
| "top1": 84.54779495239258, | |
| "top5": 95.0833723449707 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 1.1324876880673622, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0768, | |
| "step": 9700 | |
| }, | |
| { | |
| "ce_loss": 1.0801820170879364, | |
| "epoch": 2.92, | |
| "inp_emb_norm": 0.3430078125, | |
| "loss": 1.0801820170879364, | |
| "masked_top1": 43.99905281066894, | |
| "masked_top5": 67.07788047790527, | |
| "step": 9700, | |
| "top1": 84.5837564086914, | |
| "top5": 95.05357192993164 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 1.0773481244877405, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0728, | |
| "step": 9750 | |
| }, | |
| { | |
| "ce_loss": 1.080371401309967, | |
| "epoch": 2.93, | |
| "inp_emb_norm": 0.3410546875, | |
| "loss": 1.080371401309967, | |
| "masked_top1": 43.507309417724606, | |
| "masked_top5": 67.3342724609375, | |
| "step": 9750, | |
| "top1": 84.5036083984375, | |
| "top5": 95.07123840332031 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 1.0159168175987678, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0688, | |
| "step": 9800 | |
| }, | |
| { | |
| "ce_loss": 1.083665030002594, | |
| "epoch": 2.95, | |
| "inp_emb_norm": 0.34671875, | |
| "loss": 1.083665030002594, | |
| "masked_top1": 44.19781021118164, | |
| "masked_top5": 67.93185821533203, | |
| "step": 9800, | |
| "top1": 84.49478713989258, | |
| "top5": 95.09757629394531 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.1136205616789427, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0666, | |
| "step": 9850 | |
| }, | |
| { | |
| "ce_loss": 1.0619865989685058, | |
| "epoch": 2.96, | |
| "inp_emb_norm": 0.33875, | |
| "loss": 1.0619865989685058, | |
| "masked_top1": 45.1244051361084, | |
| "masked_top5": 69.11591262817383, | |
| "step": 9850, | |
| "top1": 84.57212326049805, | |
| "top5": 95.30008316040039 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 1.0447170988807504, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0799, | |
| "step": 9900 | |
| }, | |
| { | |
| "ce_loss": 1.079269015789032, | |
| "epoch": 2.98, | |
| "inp_emb_norm": 0.3366796875, | |
| "loss": 1.079269015789032, | |
| "masked_top1": 45.20799728393555, | |
| "masked_top5": 69.20284423828124, | |
| "step": 9900, | |
| "top1": 84.41862701416015, | |
| "top5": 95.25058349609375 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 1.02006539824341, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0727, | |
| "step": 9950 | |
| }, | |
| { | |
| "ce_loss": 1.078022118806839, | |
| "epoch": 2.99, | |
| "inp_emb_norm": 0.343828125, | |
| "loss": 1.078022118806839, | |
| "masked_top1": 45.404396057128906, | |
| "masked_top5": 68.74404998779296, | |
| "step": 9950, | |
| "top1": 84.56711853027343, | |
| "top5": 95.0911540222168 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 1.113028194298386, | |
| "learning_rate": 0.0001, | |
| "loss": 0.923, | |
| "step": 10000 | |
| }, | |
| { | |
| "ce_loss": 0.9195016610622406, | |
| "epoch": 3.01, | |
| "inp_emb_norm": 0.3430859375, | |
| "loss": 0.9195016610622406, | |
| "masked_top1": 48.332907638549806, | |
| "masked_top5": 72.30007148742676, | |
| "step": 10000, | |
| "top1": 86.70970993041992, | |
| "top5": 95.9695133972168 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 0.9894998771881995, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7385, | |
| "step": 10050 | |
| }, | |
| { | |
| "ce_loss": 0.7338530778884887, | |
| "epoch": 3.02, | |
| "inp_emb_norm": 0.337578125, | |
| "loss": 0.7338530778884887, | |
| "masked_top1": 52.05472785949707, | |
| "masked_top5": 77.44325592041015, | |
| "step": 10050, | |
| "top1": 89.11887008666992, | |
| "top5": 97.09128341674804 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 1.0927669172905705, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7338, | |
| "step": 10100 | |
| }, | |
| { | |
| "ce_loss": 0.7197039890289306, | |
| "epoch": 3.04, | |
| "inp_emb_norm": 0.3384765625, | |
| "loss": 0.7197039890289306, | |
| "masked_top1": 53.32936988830566, | |
| "masked_top5": 77.26823791503907, | |
| "step": 10100, | |
| "top1": 89.31084884643555, | |
| "top5": 97.0846061706543 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 1.0197432952418408, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7411, | |
| "step": 10150 | |
| }, | |
| { | |
| "ce_loss": 0.7408602213859559, | |
| "epoch": 3.05, | |
| "inp_emb_norm": 0.3473046875, | |
| "loss": 0.7408602213859559, | |
| "masked_top1": 51.371361846923826, | |
| "masked_top5": 76.60449035644531, | |
| "step": 10150, | |
| "top1": 89.11841430664063, | |
| "top5": 96.8936897277832 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "grad_norm": 1.0379685041697753, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7462, | |
| "step": 10200 | |
| }, | |
| { | |
| "ce_loss": 0.7370594382286072, | |
| "epoch": 3.07, | |
| "inp_emb_norm": 0.3462109375, | |
| "loss": 0.7370594382286072, | |
| "masked_top1": 51.55666793823242, | |
| "masked_top5": 76.35641525268555, | |
| "step": 10200, | |
| "top1": 89.04085632324218, | |
| "top5": 96.98393936157227 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 0.9812098301602463, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7456, | |
| "step": 10250 | |
| }, | |
| { | |
| "ce_loss": 0.7352669024467469, | |
| "epoch": 3.08, | |
| "inp_emb_norm": 0.3394921875, | |
| "loss": 0.7352669024467469, | |
| "masked_top1": 52.50257652282715, | |
| "masked_top5": 76.83967468261719, | |
| "step": 10250, | |
| "top1": 89.10743438720704, | |
| "top5": 96.9831704711914 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 1.1025924611121007, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7497, | |
| "step": 10300 | |
| }, | |
| { | |
| "ce_loss": 0.749667866230011, | |
| "epoch": 3.1, | |
| "inp_emb_norm": 0.34015625, | |
| "loss": 0.749667866230011, | |
| "masked_top1": 51.46554763793945, | |
| "masked_top5": 76.06611167907715, | |
| "step": 10300, | |
| "top1": 88.8905972290039, | |
| "top5": 96.94614303588867 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 1.0513267054177442, | |
| "learning_rate": 0.0001, | |
| "loss": 0.76, | |
| "step": 10350 | |
| }, | |
| { | |
| "ce_loss": 0.7712516760826111, | |
| "epoch": 3.11, | |
| "inp_emb_norm": 0.342890625, | |
| "loss": 0.7712516760826111, | |
| "masked_top1": 50.167163619995115, | |
| "masked_top5": 74.79809906005859, | |
| "step": 10350, | |
| "top1": 88.5837466430664, | |
| "top5": 96.77574478149414 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 1.1017983640036164, | |
| "learning_rate": 0.0001, | |
| "loss": 0.75, | |
| "step": 10400 | |
| }, | |
| { | |
| "ce_loss": 0.7555344760417938, | |
| "epoch": 3.13, | |
| "inp_emb_norm": 0.3440625, | |
| "loss": 0.7555344760417938, | |
| "masked_top1": 51.58950523376465, | |
| "masked_top5": 75.45907012939453, | |
| "step": 10400, | |
| "top1": 88.80611511230468, | |
| "top5": 96.81310470581055 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 1.059466085863966, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7674, | |
| "step": 10450 | |
| }, | |
| { | |
| "ce_loss": 0.7663704335689545, | |
| "epoch": 3.14, | |
| "inp_emb_norm": 0.3493359375, | |
| "loss": 0.7663704335689545, | |
| "masked_top1": 49.915076370239255, | |
| "masked_top5": 75.00028388977051, | |
| "step": 10450, | |
| "top1": 88.7645571899414, | |
| "top5": 96.73904815673828 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 1.1423077844194809, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7609, | |
| "step": 10500 | |
| }, | |
| { | |
| "ce_loss": 0.767000640630722, | |
| "epoch": 3.16, | |
| "inp_emb_norm": 0.3465234375, | |
| "loss": 0.767000640630722, | |
| "masked_top1": 50.03567909240723, | |
| "masked_top5": 75.01374862670899, | |
| "step": 10500, | |
| "top1": 88.57664657592774, | |
| "top5": 96.84391784667969 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 1.032448425259117, | |
| "learning_rate": 0.0001, | |
| "loss": 0.772, | |
| "step": 10550 | |
| }, | |
| { | |
| "ce_loss": 0.7650803327560425, | |
| "epoch": 3.17, | |
| "inp_emb_norm": 0.3430078125, | |
| "loss": 0.7650803327560425, | |
| "masked_top1": 51.708590774536134, | |
| "masked_top5": 75.70334999084473, | |
| "step": 10550, | |
| "top1": 88.69338806152344, | |
| "top5": 96.79394775390625 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 0.9962408469712855, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7756, | |
| "step": 10600 | |
| }, | |
| { | |
| "ce_loss": 0.7650206458568573, | |
| "epoch": 3.19, | |
| "inp_emb_norm": 0.3441796875, | |
| "loss": 0.7650206458568573, | |
| "masked_top1": 51.08683837890625, | |
| "masked_top5": 76.22035110473632, | |
| "step": 10600, | |
| "top1": 88.6184033203125, | |
| "top5": 96.88288452148437 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 1.0602266154627207, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7701, | |
| "step": 10650 | |
| }, | |
| { | |
| "ce_loss": 0.790007756948471, | |
| "epoch": 3.2, | |
| "inp_emb_norm": 0.3371875, | |
| "loss": 0.790007756948471, | |
| "masked_top1": 49.467856369018556, | |
| "masked_top5": 74.11548706054687, | |
| "step": 10650, | |
| "top1": 88.32970947265625, | |
| "top5": 96.74940063476562 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 0.977038059072146, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7746, | |
| "step": 10700 | |
| }, | |
| { | |
| "ce_loss": 0.7758392190933228, | |
| "epoch": 3.22, | |
| "inp_emb_norm": 0.346875, | |
| "loss": 0.7758392190933228, | |
| "masked_top1": 50.20531356811524, | |
| "masked_top5": 75.03212989807129, | |
| "step": 10700, | |
| "top1": 88.4866551208496, | |
| "top5": 96.73311477661133 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 1.0744003921088385, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7866, | |
| "step": 10750 | |
| }, | |
| { | |
| "ce_loss": 0.788164142370224, | |
| "epoch": 3.23, | |
| "inp_emb_norm": 0.351015625, | |
| "loss": 0.788164142370224, | |
| "masked_top1": 49.6349144744873, | |
| "masked_top5": 74.33588874816894, | |
| "step": 10750, | |
| "top1": 88.30440032958984, | |
| "top5": 96.71714782714844 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 1.0175043021315349, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7874, | |
| "step": 10800 | |
| }, | |
| { | |
| "ce_loss": 0.7919885838031768, | |
| "epoch": 3.25, | |
| "inp_emb_norm": 0.3444140625, | |
| "loss": 0.7919885838031768, | |
| "masked_top1": 48.99909255981445, | |
| "masked_top5": 74.25505355834962, | |
| "step": 10800, | |
| "top1": 88.26540832519531, | |
| "top5": 96.72898651123047 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 1.1095697587812745, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7888, | |
| "step": 10850 | |
| }, | |
| { | |
| "ce_loss": 0.797193922996521, | |
| "epoch": 3.26, | |
| "inp_emb_norm": 0.345234375, | |
| "loss": 0.797193922996521, | |
| "masked_top1": 49.19968879699707, | |
| "masked_top5": 74.1003759765625, | |
| "step": 10850, | |
| "top1": 88.23719314575196, | |
| "top5": 96.6193911743164 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 1.096465203223292, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7871, | |
| "step": 10900 | |
| }, | |
| { | |
| "ce_loss": 0.7947347521781921, | |
| "epoch": 3.28, | |
| "inp_emb_norm": 0.3490234375, | |
| "loss": 0.7947347521781921, | |
| "masked_top1": 49.850736923217774, | |
| "masked_top5": 74.32025009155274, | |
| "step": 10900, | |
| "top1": 88.2540364074707, | |
| "top5": 96.63969177246094 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 1.0949347711209227, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7828, | |
| "step": 10950 | |
| }, | |
| { | |
| "ce_loss": 0.7824289429187775, | |
| "epoch": 3.29, | |
| "inp_emb_norm": 0.340234375, | |
| "loss": 0.7824289429187775, | |
| "masked_top1": 51.0236792755127, | |
| "masked_top5": 75.60059753417968, | |
| "step": 10950, | |
| "top1": 88.44829376220703, | |
| "top5": 96.78743530273438 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 1.0829787001902516, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7878, | |
| "step": 11000 | |
| }, | |
| { | |
| "ce_loss": 0.7807865822315216, | |
| "epoch": 3.31, | |
| "inp_emb_norm": 0.35390625, | |
| "loss": 0.7807865822315216, | |
| "masked_top1": 50.650703506469725, | |
| "masked_top5": 74.27250564575195, | |
| "step": 11000, | |
| "top1": 88.50477279663086, | |
| "top5": 96.67802978515626 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 1.01346692888271, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7935, | |
| "step": 11050 | |
| }, | |
| { | |
| "ce_loss": 0.7934070038795471, | |
| "epoch": 3.32, | |
| "inp_emb_norm": 0.35015625, | |
| "loss": 0.7934070038795471, | |
| "masked_top1": 50.09875770568848, | |
| "masked_top5": 74.54286209106445, | |
| "step": 11050, | |
| "top1": 88.3311814880371, | |
| "top5": 96.66521301269532 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 1.1970274017001157, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8031, | |
| "step": 11100 | |
| }, | |
| { | |
| "ce_loss": 0.8043110525608063, | |
| "epoch": 3.34, | |
| "inp_emb_norm": 0.340703125, | |
| "loss": 0.8043110525608063, | |
| "masked_top1": 49.070663375854494, | |
| "masked_top5": 73.73879615783692, | |
| "step": 11100, | |
| "top1": 88.18025527954102, | |
| "top5": 96.71074813842773 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 0.9807421337694971, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7967, | |
| "step": 11150 | |
| }, | |
| { | |
| "ce_loss": 0.7986381149291992, | |
| "epoch": 3.35, | |
| "inp_emb_norm": 0.3460546875, | |
| "loss": 0.7986381149291992, | |
| "masked_top1": 49.351584854125974, | |
| "masked_top5": 74.40426681518555, | |
| "step": 11150, | |
| "top1": 88.17583755493165, | |
| "top5": 96.63799163818359 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 1.0963946786909775, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7973, | |
| "step": 11200 | |
| }, | |
| { | |
| "ce_loss": 0.7999817717075348, | |
| "epoch": 3.37, | |
| "inp_emb_norm": 0.34953125, | |
| "loss": 0.7999817717075348, | |
| "masked_top1": 48.15997085571289, | |
| "masked_top5": 73.92494735717773, | |
| "step": 11200, | |
| "top1": 88.08112899780274, | |
| "top5": 96.61600830078125 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 0.9959517938940972, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7972, | |
| "step": 11250 | |
| }, | |
| { | |
| "ce_loss": 0.7938779592514038, | |
| "epoch": 3.38, | |
| "inp_emb_norm": 0.3476171875, | |
| "loss": 0.7938779592514038, | |
| "masked_top1": 49.82730033874512, | |
| "masked_top5": 74.95826553344726, | |
| "step": 11250, | |
| "top1": 88.16325576782226, | |
| "top5": 96.75233062744141 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 0.9936212958366237, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8, | |
| "step": 11300 | |
| }, | |
| { | |
| "ce_loss": 0.8001345789432526, | |
| "epoch": 3.4, | |
| "inp_emb_norm": 0.3510546875, | |
| "loss": 0.8001345789432526, | |
| "masked_top1": 50.4670531463623, | |
| "masked_top5": 74.18307846069337, | |
| "step": 11300, | |
| "top1": 88.19016235351563, | |
| "top5": 96.6422052001953 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 1.130929594499011, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8071, | |
| "step": 11350 | |
| }, | |
| { | |
| "ce_loss": 0.8002821004390717, | |
| "epoch": 3.41, | |
| "inp_emb_norm": 0.3469921875, | |
| "loss": 0.8002821004390717, | |
| "masked_top1": 49.907284088134766, | |
| "masked_top5": 74.60272583007813, | |
| "step": 11350, | |
| "top1": 88.13528030395508, | |
| "top5": 96.65942398071289 | |
| }, | |
| { | |
| "epoch": 3.43, | |
| "grad_norm": 1.027541301721308, | |
| "learning_rate": 0.0001, | |
| "loss": 0.817, | |
| "step": 11400 | |
| }, | |
| { | |
| "ce_loss": 0.8025326645374298, | |
| "epoch": 3.43, | |
| "inp_emb_norm": 0.3438671875, | |
| "loss": 0.8025326645374298, | |
| "masked_top1": 50.332852630615236, | |
| "masked_top5": 74.55212646484375, | |
| "step": 11400, | |
| "top1": 88.08174774169922, | |
| "top5": 96.71300491333008 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 1.0509284890501878, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8071, | |
| "step": 11450 | |
| }, | |
| { | |
| "ce_loss": 0.8151209402084351, | |
| "epoch": 3.44, | |
| "inp_emb_norm": 0.3536328125, | |
| "loss": 0.8151209402084351, | |
| "masked_top1": 49.18514472961426, | |
| "masked_top5": 73.88459991455078, | |
| "step": 11450, | |
| "top1": 88.03491561889649, | |
| "top5": 96.5151156616211 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 1.0563826334349269, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8146, | |
| "step": 11500 | |
| }, | |
| { | |
| "ce_loss": 0.8121966254711152, | |
| "epoch": 3.46, | |
| "inp_emb_norm": 0.3479296875, | |
| "loss": 0.8121966254711152, | |
| "masked_top1": 48.76513572692871, | |
| "masked_top5": 73.98189331054688, | |
| "step": 11500, | |
| "top1": 87.9227052307129, | |
| "top5": 96.60188415527344 | |
| }, | |
| { | |
| "epoch": 3.47, | |
| "grad_norm": 1.0581754020714675, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8113, | |
| "step": 11550 | |
| }, | |
| { | |
| "ce_loss": 0.8193511891365052, | |
| "epoch": 3.47, | |
| "inp_emb_norm": 0.3533984375, | |
| "loss": 0.8193511891365052, | |
| "masked_top1": 48.62931312561035, | |
| "masked_top5": 73.32291015625, | |
| "step": 11550, | |
| "top1": 87.89377365112304, | |
| "top5": 96.50224563598633 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 1.012876759570208, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8131, | |
| "step": 11600 | |
| }, | |
| { | |
| "ce_loss": 0.8099701881408692, | |
| "epoch": 3.49, | |
| "inp_emb_norm": 0.35375, | |
| "loss": 0.8099701881408692, | |
| "masked_top1": 49.099407272338865, | |
| "masked_top5": 73.79963348388672, | |
| "step": 11600, | |
| "top1": 88.09338317871094, | |
| "top5": 96.58081787109376 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 1.0124821527896055, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8121, | |
| "step": 11650 | |
| }, | |
| { | |
| "ce_loss": 0.8340137410163879, | |
| "epoch": 3.5, | |
| "inp_emb_norm": 0.34828125, | |
| "loss": 0.8340137410163879, | |
| "masked_top1": 49.10786933898926, | |
| "masked_top5": 73.36568084716797, | |
| "step": 11650, | |
| "top1": 87.82726516723633, | |
| "top5": 96.4594790649414 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 1.006239120505223, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8201, | |
| "step": 11700 | |
| }, | |
| { | |
| "ce_loss": 0.8206925344467163, | |
| "epoch": 3.52, | |
| "inp_emb_norm": 0.351875, | |
| "loss": 0.8206925344467163, | |
| "masked_top1": 48.73426811218262, | |
| "masked_top5": 73.27444328308106, | |
| "step": 11700, | |
| "top1": 87.83039459228516, | |
| "top5": 96.4988427734375 | |
| }, | |
| { | |
| "epoch": 3.53, | |
| "grad_norm": 1.059895397750386, | |
| "learning_rate": 0.0001, | |
| "loss": 0.817, | |
| "step": 11750 | |
| }, | |
| { | |
| "ce_loss": 0.8189614808559418, | |
| "epoch": 3.53, | |
| "inp_emb_norm": 0.3555859375, | |
| "loss": 0.8189614808559418, | |
| "masked_top1": 49.02143653869629, | |
| "masked_top5": 74.14399620056152, | |
| "step": 11750, | |
| "top1": 87.84059661865234, | |
| "top5": 96.55142425537109 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 1.065918703633994, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8103, | |
| "step": 11800 | |
| }, | |
| { | |
| "ce_loss": 0.8101903474330903, | |
| "epoch": 3.55, | |
| "inp_emb_norm": 0.3505078125, | |
| "loss": 0.8101903474330903, | |
| "masked_top1": 49.68892807006836, | |
| "masked_top5": 74.20518287658692, | |
| "step": 11800, | |
| "top1": 88.00088562011719, | |
| "top5": 96.6223454284668 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 0.999942936662604, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8173, | |
| "step": 11850 | |
| }, | |
| { | |
| "ce_loss": 0.8223925268650055, | |
| "epoch": 3.56, | |
| "inp_emb_norm": 0.3467578125, | |
| "loss": 0.8223925268650055, | |
| "masked_top1": 48.98508232116699, | |
| "masked_top5": 73.06097061157226, | |
| "step": 11850, | |
| "top1": 87.84088226318359, | |
| "top5": 96.52394149780274 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 1.0109380245958302, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8217, | |
| "step": 11900 | |
| }, | |
| { | |
| "ce_loss": 0.8102488934993743, | |
| "epoch": 3.58, | |
| "inp_emb_norm": 0.3561328125, | |
| "loss": 0.8102488934993743, | |
| "masked_top1": 49.78484992980957, | |
| "masked_top5": 73.83953674316406, | |
| "step": 11900, | |
| "top1": 88.08386886596679, | |
| "top5": 96.51279846191406 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 1.0501857173368994, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8202, | |
| "step": 11950 | |
| }, | |
| { | |
| "ce_loss": 0.8153278791904449, | |
| "epoch": 3.59, | |
| "inp_emb_norm": 0.3567578125, | |
| "loss": 0.8153278791904449, | |
| "masked_top1": 50.214491806030274, | |
| "masked_top5": 75.15233001708984, | |
| "step": 11950, | |
| "top1": 87.94796691894531, | |
| "top5": 96.6393051147461 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 1.0094284740380721, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8228, | |
| "step": 12000 | |
| }, | |
| { | |
| "ce_loss": 0.8150149726867676, | |
| "epoch": 3.61, | |
| "inp_emb_norm": 0.34546875, | |
| "loss": 0.8150149726867676, | |
| "masked_top1": 50.50455932617187, | |
| "masked_top5": 75.45603103637696, | |
| "step": 12000, | |
| "top1": 87.88845626831055, | |
| "top5": 96.71615814208984 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 0.9828696027131701, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8268, | |
| "step": 12050 | |
| }, | |
| { | |
| "ce_loss": 0.8084503662586212, | |
| "epoch": 3.62, | |
| "inp_emb_norm": 0.35515625, | |
| "loss": 0.8084503662586212, | |
| "masked_top1": 49.089233779907225, | |
| "masked_top5": 74.21111480712891, | |
| "step": 12050, | |
| "top1": 87.9230386352539, | |
| "top5": 96.66818649291992 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 1.0364095825846835, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8313, | |
| "step": 12100 | |
| }, | |
| { | |
| "ce_loss": 0.8248488974571228, | |
| "epoch": 3.64, | |
| "inp_emb_norm": 0.3575, | |
| "loss": 0.8248488974571228, | |
| "masked_top1": 49.23360801696777, | |
| "masked_top5": 73.44429908752441, | |
| "step": 12100, | |
| "top1": 87.82616806030273, | |
| "top5": 96.4687548828125 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 1.032109747501083, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8166, | |
| "step": 12150 | |
| }, | |
| { | |
| "ce_loss": 0.819879275560379, | |
| "epoch": 3.65, | |
| "inp_emb_norm": 0.3619921875, | |
| "loss": 0.819879275560379, | |
| "masked_top1": 49.493774490356444, | |
| "masked_top5": 74.06904296875, | |
| "step": 12150, | |
| "top1": 87.88977813720703, | |
| "top5": 96.5196435546875 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 1.1132546555196505, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8407, | |
| "step": 12200 | |
| }, | |
| { | |
| "ce_loss": 0.8416215097904205, | |
| "epoch": 3.67, | |
| "inp_emb_norm": 0.35109375, | |
| "loss": 0.8416215097904205, | |
| "masked_top1": 48.93654960632324, | |
| "masked_top5": 73.06722267150879, | |
| "step": 12200, | |
| "top1": 87.65886123657226, | |
| "top5": 96.42184036254883 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 1.1126594978470823, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8309, | |
| "step": 12250 | |
| }, | |
| { | |
| "ce_loss": 0.8369515192508697, | |
| "epoch": 3.68, | |
| "inp_emb_norm": 0.3548046875, | |
| "loss": 0.8369515192508697, | |
| "masked_top1": 49.08364143371582, | |
| "masked_top5": 73.77061073303223, | |
| "step": 12250, | |
| "top1": 87.5754817199707, | |
| "top5": 96.51109664916993 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 1.039335313619578, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8335, | |
| "step": 12300 | |
| }, | |
| { | |
| "ce_loss": 0.8377754426002503, | |
| "epoch": 3.7, | |
| "inp_emb_norm": 0.3561328125, | |
| "loss": 0.8377754426002503, | |
| "masked_top1": 48.12888420104981, | |
| "masked_top5": 72.75807151794433, | |
| "step": 12300, | |
| "top1": 87.64847686767578, | |
| "top5": 96.35865264892578 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 1.0741125902905957, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8311, | |
| "step": 12350 | |
| }, | |
| { | |
| "ce_loss": 0.8398508429527283, | |
| "epoch": 3.71, | |
| "inp_emb_norm": 0.3508203125, | |
| "loss": 0.8398508429527283, | |
| "masked_top1": 49.14791725158691, | |
| "masked_top5": 73.34809280395508, | |
| "step": 12350, | |
| "top1": 87.55382202148438, | |
| "top5": 96.47370834350586 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 1.0735818209876995, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8271, | |
| "step": 12400 | |
| }, | |
| { | |
| "ce_loss": 0.8356200730800629, | |
| "epoch": 3.73, | |
| "inp_emb_norm": 0.3551953125, | |
| "loss": 0.8356200730800629, | |
| "masked_top1": 48.54533554077148, | |
| "masked_top5": 73.34476501464843, | |
| "step": 12400, | |
| "top1": 87.71935012817383, | |
| "top5": 96.43182586669921 | |
| }, | |
| { | |
| "epoch": 3.74, | |
| "grad_norm": 1.1727450080624469, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8438, | |
| "step": 12450 | |
| }, | |
| { | |
| "ce_loss": 0.834755152463913, | |
| "epoch": 3.74, | |
| "inp_emb_norm": 0.3557421875, | |
| "loss": 0.834755152463913, | |
| "masked_top1": 48.73476402282715, | |
| "masked_top5": 73.74958610534668, | |
| "step": 12450, | |
| "top1": 87.6754948425293, | |
| "top5": 96.48529556274414 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 1.0593532648539608, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8302, | |
| "step": 12500 | |
| }, | |
| { | |
| "ce_loss": 0.8266095387935638, | |
| "epoch": 3.76, | |
| "inp_emb_norm": 0.35234375, | |
| "loss": 0.8266095387935638, | |
| "masked_top1": 49.589872207641605, | |
| "masked_top5": 73.7888671875, | |
| "step": 12500, | |
| "top1": 87.81543869018554, | |
| "top5": 96.54735931396485 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 1.0465892187844261, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8365, | |
| "step": 12550 | |
| }, | |
| { | |
| "ce_loss": 0.8360547876358032, | |
| "epoch": 3.77, | |
| "inp_emb_norm": 0.352265625, | |
| "loss": 0.8360547876358032, | |
| "masked_top1": 48.81367431640625, | |
| "masked_top5": 73.35512954711913, | |
| "step": 12550, | |
| "top1": 87.50978561401367, | |
| "top5": 96.48541564941407 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 0.9861748502692437, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8346, | |
| "step": 12600 | |
| }, | |
| { | |
| "ce_loss": 0.8302571523189545, | |
| "epoch": 3.79, | |
| "inp_emb_norm": 0.353515625, | |
| "loss": 0.8302571523189545, | |
| "masked_top1": 50.41721633911133, | |
| "masked_top5": 74.37043991088868, | |
| "step": 12600, | |
| "top1": 87.77974655151367, | |
| "top5": 96.57415542602538 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 1.0316691583599322, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8311, | |
| "step": 12650 | |
| }, | |
| { | |
| "ce_loss": 0.8366478300094604, | |
| "epoch": 3.8, | |
| "inp_emb_norm": 0.358125, | |
| "loss": 0.8366478300094604, | |
| "masked_top1": 49.738552017211916, | |
| "masked_top5": 72.98569618225098, | |
| "step": 12650, | |
| "top1": 87.68307205200195, | |
| "top5": 96.43355926513672 | |
| }, | |
| { | |
| "epoch": 3.82, | |
| "grad_norm": 1.079061755706453, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8409, | |
| "step": 12700 | |
| }, | |
| { | |
| "ce_loss": 0.8572568881511688, | |
| "epoch": 3.82, | |
| "inp_emb_norm": 0.358984375, | |
| "loss": 0.8572568881511688, | |
| "masked_top1": 47.16713302612305, | |
| "masked_top5": 71.66687362670899, | |
| "step": 12700, | |
| "top1": 87.3316879272461, | |
| "top5": 96.28264602661133 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 1.1113987431562926, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8278, | |
| "step": 12750 | |
| }, | |
| { | |
| "ce_loss": 0.8314487624168396, | |
| "epoch": 3.83, | |
| "inp_emb_norm": 0.35171875, | |
| "loss": 0.8314487624168396, | |
| "masked_top1": 49.02839782714844, | |
| "masked_top5": 73.43093780517579, | |
| "step": 12750, | |
| "top1": 87.7519515991211, | |
| "top5": 96.49931549072265 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 1.0978179635851295, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8332, | |
| "step": 12800 | |
| }, | |
| { | |
| "ce_loss": 0.8298409843444824, | |
| "epoch": 3.85, | |
| "inp_emb_norm": 0.3492578125, | |
| "loss": 0.8298409843444824, | |
| "masked_top1": 50.06224174499512, | |
| "masked_top5": 73.62540901184082, | |
| "step": 12800, | |
| "top1": 87.73297302246094, | |
| "top5": 96.57409362792968 | |
| }, | |
| { | |
| "epoch": 3.86, | |
| "grad_norm": 0.9650541842630127, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8372, | |
| "step": 12850 | |
| }, | |
| { | |
| "ce_loss": 0.8380164694786072, | |
| "epoch": 3.86, | |
| "inp_emb_norm": 0.35796875, | |
| "loss": 0.8380164694786072, | |
| "masked_top1": 48.93704933166504, | |
| "masked_top5": 72.84653518676758, | |
| "step": 12850, | |
| "top1": 87.66362426757813, | |
| "top5": 96.39145156860351 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 0.9849217897777546, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8489, | |
| "step": 12900 | |
| }, | |
| { | |
| "ce_loss": 0.8715607190132141, | |
| "epoch": 3.88, | |
| "inp_emb_norm": 0.35453125, | |
| "loss": 0.8715607190132141, | |
| "masked_top1": 46.82566688537597, | |
| "masked_top5": 71.51848426818847, | |
| "step": 12900, | |
| "top1": 87.30749450683594, | |
| "top5": 96.20992965698242 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 1.0587318645397439, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8513, | |
| "step": 12950 | |
| }, | |
| { | |
| "ce_loss": 0.8459929120540619, | |
| "epoch": 3.89, | |
| "inp_emb_norm": 0.3545703125, | |
| "loss": 0.8459929120540619, | |
| "masked_top1": 47.957108154296876, | |
| "masked_top5": 72.73884170532227, | |
| "step": 12950, | |
| "top1": 87.4450048828125, | |
| "top5": 96.42510208129883 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 1.008453804185378, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8499, | |
| "step": 13000 | |
| }, | |
| { | |
| "ce_loss": 0.8543168365955353, | |
| "epoch": 3.91, | |
| "inp_emb_norm": 0.35265625, | |
| "loss": 0.8543168365955353, | |
| "masked_top1": 48.10952255249023, | |
| "masked_top5": 72.79139190673828, | |
| "step": 13000, | |
| "top1": 87.43768081665038, | |
| "top5": 96.36415054321289 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 1.1029061736641703, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8477, | |
| "step": 13050 | |
| }, | |
| { | |
| "ce_loss": 0.8511655080318451, | |
| "epoch": 3.92, | |
| "inp_emb_norm": 0.3517578125, | |
| "loss": 0.8511655080318451, | |
| "masked_top1": 47.82553955078125, | |
| "masked_top5": 72.87044403076172, | |
| "step": 13050, | |
| "top1": 87.3859359741211, | |
| "top5": 96.47525772094727 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 1.075253739387409, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8483, | |
| "step": 13100 | |
| }, | |
| { | |
| "ce_loss": 0.8585194671154022, | |
| "epoch": 3.94, | |
| "inp_emb_norm": 0.3530859375, | |
| "loss": 0.8585194671154022, | |
| "masked_top1": 48.05498847961426, | |
| "masked_top5": 72.10926498413086, | |
| "step": 13100, | |
| "top1": 87.33003082275391, | |
| "top5": 96.38634353637696 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "grad_norm": 1.06337585097062, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8512, | |
| "step": 13150 | |
| }, | |
| { | |
| "ce_loss": 0.8486355948448181, | |
| "epoch": 3.95, | |
| "inp_emb_norm": 0.3512109375, | |
| "loss": 0.8486355948448181, | |
| "masked_top1": 47.7544295501709, | |
| "masked_top5": 72.88016632080078, | |
| "step": 13150, | |
| "top1": 87.42533172607422, | |
| "top5": 96.4359049987793 | |
| }, | |
| { | |
| "epoch": 3.97, | |
| "grad_norm": 1.0262462724728467, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8484, | |
| "step": 13200 | |
| }, | |
| { | |
| "ce_loss": 0.850223616361618, | |
| "epoch": 3.97, | |
| "inp_emb_norm": 0.3545703125, | |
| "loss": 0.850223616361618, | |
| "masked_top1": 49.058433532714844, | |
| "masked_top5": 73.50660820007325, | |
| "step": 13200, | |
| "top1": 87.42515289306641, | |
| "top5": 96.44447158813476 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 1.1183732342762185, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8496, | |
| "step": 13250 | |
| }, | |
| { | |
| "ce_loss": 0.8533316111564636, | |
| "epoch": 3.98, | |
| "inp_emb_norm": 0.3624609375, | |
| "loss": 0.8533316111564636, | |
| "masked_top1": 47.88730712890625, | |
| "masked_top5": 72.5444091796875, | |
| "step": 13250, | |
| "top1": 87.41307250976563, | |
| "top5": 96.35140655517579 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 1.0167762679603471, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8556, | |
| "step": 13300 | |
| }, | |
| { | |
| "ce_loss": 0.8482355761528015, | |
| "epoch": 4.0, | |
| "inp_emb_norm": 0.36, | |
| "loss": 0.8482355761528015, | |
| "masked_top1": 48.77355934143066, | |
| "masked_top5": 72.95730499267579, | |
| "step": 13300, | |
| "top1": 87.49448638916016, | |
| "top5": 96.36580871582031 | |
| }, | |
| { | |
| "epoch": 4.02, | |
| "grad_norm": 0.9619800299228121, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4647, | |
| "step": 13350 | |
| }, | |
| { | |
| "ce_loss": 0.4654207336902618, | |
| "epoch": 4.02, | |
| "inp_emb_norm": 0.3638671875, | |
| "loss": 0.4654207336902618, | |
| "masked_top1": 69.25712104797363, | |
| "masked_top5": 90.11044219970704, | |
| "step": 13350, | |
| "top1": 92.98291519165039, | |
| "top5": 98.58249725341797 | |
| }, | |
| { | |
| "epoch": 4.03, | |
| "grad_norm": 1.0911634549199811, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4589, | |
| "step": 13400 | |
| }, | |
| { | |
| "ce_loss": 0.45429064869880675, | |
| "epoch": 4.03, | |
| "inp_emb_norm": 0.3570703125, | |
| "loss": 0.45429064869880675, | |
| "masked_top1": 69.99664253234863, | |
| "masked_top5": 90.96843032836914, | |
| "step": 13400, | |
| "top1": 93.03666076660156, | |
| "top5": 98.7020344543457 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 1.055104414353352, | |
| "learning_rate": 0.0001, | |
| "loss": 0.463, | |
| "step": 13450 | |
| }, | |
| { | |
| "ce_loss": 0.4745191448926926, | |
| "epoch": 4.05, | |
| "inp_emb_norm": 0.3669921875, | |
| "loss": 0.4745191448926926, | |
| "masked_top1": 67.61885986328124, | |
| "masked_top5": 89.49128158569336, | |
| "step": 13450, | |
| "top1": 92.77124450683594, | |
| "top5": 98.49986145019531 | |
| }, | |
| { | |
| "epoch": 4.06, | |
| "grad_norm": 1.134443079230167, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4584, | |
| "step": 13500 | |
| }, | |
| { | |
| "ce_loss": 0.45621369063854217, | |
| "epoch": 4.06, | |
| "inp_emb_norm": 0.3608984375, | |
| "loss": 0.45621369063854217, | |
| "masked_top1": 68.7521329498291, | |
| "masked_top5": 90.43077987670898, | |
| "step": 13500, | |
| "top1": 92.95487716674805, | |
| "top5": 98.66310028076173 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 1.0109539437689439, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4666, | |
| "step": 13550 | |
| }, | |
| { | |
| "ce_loss": 0.47258255898952484, | |
| "epoch": 4.08, | |
| "inp_emb_norm": 0.3544921875, | |
| "loss": 0.47258255898952484, | |
| "masked_top1": 68.48685668945312, | |
| "masked_top5": 90.1821842956543, | |
| "step": 13550, | |
| "top1": 92.76567596435547, | |
| "top5": 98.59864028930664 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "grad_norm": 1.105379838900382, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4745, | |
| "step": 13600 | |
| }, | |
| { | |
| "ce_loss": 0.4720061844587326, | |
| "epoch": 4.09, | |
| "inp_emb_norm": 0.3646484375, | |
| "loss": 0.4720061844587326, | |
| "masked_top1": 67.28578758239746, | |
| "masked_top5": 88.81673706054687, | |
| "step": 13600, | |
| "top1": 92.7738314819336, | |
| "top5": 98.49870178222656 | |
| }, | |
| { | |
| "epoch": 4.11, | |
| "grad_norm": 1.0445145007542662, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4766, | |
| "step": 13650 | |
| }, | |
| { | |
| "ce_loss": 0.47552256405353543, | |
| "epoch": 4.11, | |
| "inp_emb_norm": 0.363515625, | |
| "loss": 0.47552256405353543, | |
| "masked_top1": 67.69092002868652, | |
| "masked_top5": 89.2786703491211, | |
| "step": 13650, | |
| "top1": 92.75520309448243, | |
| "top5": 98.50775588989258 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 1.0910427976634176, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4854, | |
| "step": 13700 | |
| }, | |
| { | |
| "ce_loss": 0.4856406021118164, | |
| "epoch": 4.12, | |
| "inp_emb_norm": 0.356484375, | |
| "loss": 0.4856406021118164, | |
| "masked_top1": 67.06616600036621, | |
| "masked_top5": 89.24904571533203, | |
| "step": 13700, | |
| "top1": 92.60382766723633, | |
| "top5": 98.53153411865235 | |
| }, | |
| { | |
| "epoch": 4.14, | |
| "grad_norm": 1.06712952932014, | |
| "learning_rate": 0.0001, | |
| "loss": 0.487, | |
| "step": 13750 | |
| }, | |
| { | |
| "ce_loss": 0.47847113251686096, | |
| "epoch": 4.14, | |
| "inp_emb_norm": 0.3590234375, | |
| "loss": 0.47847113251686096, | |
| "masked_top1": 67.56794845581055, | |
| "masked_top5": 89.76878845214844, | |
| "step": 13750, | |
| "top1": 92.65390167236328, | |
| "top5": 98.56003234863282 | |
| }, | |
| { | |
| "epoch": 4.15, | |
| "grad_norm": 1.1426918899106289, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4945, | |
| "step": 13800 | |
| }, | |
| { | |
| "ce_loss": 0.502599538564682, | |
| "epoch": 4.15, | |
| "inp_emb_norm": 0.3619140625, | |
| "loss": 0.502599538564682, | |
| "masked_top1": 64.62048377990723, | |
| "masked_top5": 87.96943252563477, | |
| "step": 13800, | |
| "top1": 92.32529083251953, | |
| "top5": 98.33552108764648 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 1.0640372514854988, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4917, | |
| "step": 13850 | |
| }, | |
| { | |
| "ce_loss": 0.48276497185230255, | |
| "epoch": 4.17, | |
| "inp_emb_norm": 0.3612890625, | |
| "loss": 0.48276497185230255, | |
| "masked_top1": 66.87754806518555, | |
| "masked_top5": 89.40460830688477, | |
| "step": 13850, | |
| "top1": 92.6048323059082, | |
| "top5": 98.5196612548828 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "grad_norm": 1.0976346428642292, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5333, | |
| "step": 13900 | |
| }, | |
| { | |
| "ce_loss": 0.5117572790384293, | |
| "epoch": 4.18, | |
| "inp_emb_norm": 0.3601171875, | |
| "loss": 0.5117572790384293, | |
| "masked_top1": 64.48737777709961, | |
| "masked_top5": 87.95527038574218, | |
| "step": 13900, | |
| "top1": 92.178935546875, | |
| "top5": 98.36900207519531 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 1.0719371859267879, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5027, | |
| "step": 13950 | |
| }, | |
| { | |
| "ce_loss": 0.506566162109375, | |
| "epoch": 4.2, | |
| "inp_emb_norm": 0.36140625, | |
| "loss": 0.506566162109375, | |
| "masked_top1": 64.94912460327149, | |
| "masked_top5": 88.18265213012695, | |
| "step": 13950, | |
| "top1": 92.29801971435546, | |
| "top5": 98.34425674438477 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 1.1253588379337218, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5035, | |
| "step": 14000 | |
| }, | |
| { | |
| "ce_loss": 0.5121178191900253, | |
| "epoch": 4.21, | |
| "inp_emb_norm": 0.3689453125, | |
| "loss": 0.5121178191900253, | |
| "masked_top1": 64.75304649353028, | |
| "masked_top5": 88.02303543090821, | |
| "step": 14000, | |
| "top1": 92.1805793762207, | |
| "top5": 98.36868255615235 | |
| }, | |
| { | |
| "epoch": 4.23, | |
| "grad_norm": 1.0564639466498615, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5095, | |
| "step": 14050 | |
| }, | |
| { | |
| "ce_loss": 0.5051852202415467, | |
| "epoch": 4.23, | |
| "inp_emb_norm": 0.3665234375, | |
| "loss": 0.5051852202415467, | |
| "masked_top1": 64.94737861633301, | |
| "masked_top5": 87.73623580932617, | |
| "step": 14050, | |
| "top1": 92.28643096923828, | |
| "top5": 98.35600860595703 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 1.054309146882263, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5128, | |
| "step": 14100 | |
| }, | |
| { | |
| "ce_loss": 0.5164009261131287, | |
| "epoch": 4.24, | |
| "inp_emb_norm": 0.3551171875, | |
| "loss": 0.5164009261131287, | |
| "masked_top1": 64.0973461151123, | |
| "masked_top5": 87.07987808227539, | |
| "step": 14100, | |
| "top1": 92.10574264526367, | |
| "top5": 98.30786682128907 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 1.074877585526465, | |
| "learning_rate": 0.0001, | |
| "loss": 0.518, | |
| "step": 14150 | |
| }, | |
| { | |
| "ce_loss": 0.51701247215271, | |
| "epoch": 4.26, | |
| "inp_emb_norm": 0.36, | |
| "loss": 0.51701247215271, | |
| "masked_top1": 63.153812866210934, | |
| "masked_top5": 87.39238418579102, | |
| "step": 14150, | |
| "top1": 92.00238998413086, | |
| "top5": 98.3110905456543 | |
| }, | |
| { | |
| "epoch": 4.27, | |
| "grad_norm": 1.0964266251443575, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5182, | |
| "step": 14200 | |
| }, | |
| { | |
| "ce_loss": 0.5284781348705292, | |
| "epoch": 4.27, | |
| "inp_emb_norm": 0.362265625, | |
| "loss": 0.5284781348705292, | |
| "masked_top1": 63.75819221496582, | |
| "masked_top5": 87.80308670043945, | |
| "step": 14200, | |
| "top1": 91.9805062866211, | |
| "top5": 98.30166961669921 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 0.9313191633498797, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5323, | |
| "step": 14250 | |
| }, | |
| { | |
| "ce_loss": 0.5334083133935928, | |
| "epoch": 4.29, | |
| "inp_emb_norm": 0.366328125, | |
| "loss": 0.5334083133935928, | |
| "masked_top1": 63.2767244720459, | |
| "masked_top5": 86.49884506225585, | |
| "step": 14250, | |
| "top1": 91.98057006835937, | |
| "top5": 98.21974060058594 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 1.049137459405467, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5128, | |
| "step": 14300 | |
| }, | |
| { | |
| "ce_loss": 0.5050585663318634, | |
| "epoch": 4.3, | |
| "inp_emb_norm": 0.3669140625, | |
| "loss": 0.5050585663318634, | |
| "masked_top1": 65.1010538482666, | |
| "masked_top5": 88.11703536987305, | |
| "step": 14300, | |
| "top1": 92.21394805908203, | |
| "top5": 98.39463333129883 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 1.0912111332884709, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5219, | |
| "step": 14350 | |
| }, | |
| { | |
| "ce_loss": 0.5061863285303115, | |
| "epoch": 4.32, | |
| "inp_emb_norm": 0.3662890625, | |
| "loss": 0.5061863285303115, | |
| "masked_top1": 64.16183708190918, | |
| "masked_top5": 87.73145401000977, | |
| "step": 14350, | |
| "top1": 92.20845031738281, | |
| "top5": 98.3388949584961 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 1.0296955736225974, | |
| "learning_rate": 0.0001, | |
| "loss": 0.531, | |
| "step": 14400 | |
| }, | |
| { | |
| "ce_loss": 0.5429442119598389, | |
| "epoch": 4.33, | |
| "inp_emb_norm": 0.3691796875, | |
| "loss": 0.5429442119598389, | |
| "masked_top1": 61.77721855163574, | |
| "masked_top5": 85.85552307128906, | |
| "step": 14400, | |
| "top1": 91.75323486328125, | |
| "top5": 98.1361979675293 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "grad_norm": 1.1454314011018045, | |
| "learning_rate": 0.0001, | |
| "loss": 0.531, | |
| "step": 14450 | |
| }, | |
| { | |
| "ce_loss": 0.5351821321249008, | |
| "epoch": 4.35, | |
| "inp_emb_norm": 0.369140625, | |
| "loss": 0.5351821321249008, | |
| "masked_top1": 62.75261306762695, | |
| "masked_top5": 86.70513565063476, | |
| "step": 14450, | |
| "top1": 91.81992034912109, | |
| "top5": 98.24479461669922 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 1.0627879361115289, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5354, | |
| "step": 14500 | |
| }, | |
| { | |
| "ce_loss": 0.5213436669111252, | |
| "epoch": 4.36, | |
| "inp_emb_norm": 0.37203125, | |
| "loss": 0.5213436669111252, | |
| "masked_top1": 62.81369560241699, | |
| "masked_top5": 86.35222900390625, | |
| "step": 14500, | |
| "top1": 92.11218536376953, | |
| "top5": 98.20223754882812 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 1.0187687495343714, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5316, | |
| "step": 14550 | |
| }, | |
| { | |
| "ce_loss": 0.5346812665462494, | |
| "epoch": 4.38, | |
| "inp_emb_norm": 0.364140625, | |
| "loss": 0.5346812665462494, | |
| "masked_top1": 63.61943054199219, | |
| "masked_top5": 87.0892349243164, | |
| "step": 14550, | |
| "top1": 91.8150325012207, | |
| "top5": 98.27966354370118 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 0.9591126150175121, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5376, | |
| "step": 14600 | |
| }, | |
| { | |
| "ce_loss": 0.5374365419149398, | |
| "epoch": 4.39, | |
| "inp_emb_norm": 0.361484375, | |
| "loss": 0.5374365419149398, | |
| "masked_top1": 61.88925651550293, | |
| "masked_top5": 86.4749966430664, | |
| "step": 14600, | |
| "top1": 91.70243728637695, | |
| "top5": 98.21218734741211 | |
| }, | |
| { | |
| "epoch": 4.41, | |
| "grad_norm": 1.1233501033048685, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5384, | |
| "step": 14650 | |
| }, | |
| { | |
| "ce_loss": 0.5430423647165299, | |
| "epoch": 4.41, | |
| "inp_emb_norm": 0.3658984375, | |
| "loss": 0.5430423647165299, | |
| "masked_top1": 62.05436401367187, | |
| "masked_top5": 86.22527664184571, | |
| "step": 14650, | |
| "top1": 91.75454711914062, | |
| "top5": 98.14460327148437 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 1.0492889712367255, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5393, | |
| "step": 14700 | |
| }, | |
| { | |
| "ce_loss": 0.5455965805053711, | |
| "epoch": 4.42, | |
| "inp_emb_norm": 0.369140625, | |
| "loss": 0.5455965805053711, | |
| "masked_top1": 61.81470611572266, | |
| "masked_top5": 85.87024856567383, | |
| "step": 14700, | |
| "top1": 91.6540138244629, | |
| "top5": 98.14372894287109 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "grad_norm": 1.0520563782836545, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5473, | |
| "step": 14750 | |
| }, | |
| { | |
| "ce_loss": 0.5409998238086701, | |
| "epoch": 4.44, | |
| "inp_emb_norm": 0.3708984375, | |
| "loss": 0.5409998238086701, | |
| "masked_top1": 62.29138427734375, | |
| "masked_top5": 86.59371627807617, | |
| "step": 14750, | |
| "top1": 91.61244201660156, | |
| "top5": 98.20165710449218 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "grad_norm": 1.0819587007503622, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5475, | |
| "step": 14800 | |
| }, | |
| { | |
| "ce_loss": 0.5518389946222305, | |
| "epoch": 4.45, | |
| "inp_emb_norm": 0.3651953125, | |
| "loss": 0.5518389946222305, | |
| "masked_top1": 60.76948570251465, | |
| "masked_top5": 85.64509750366211, | |
| "step": 14800, | |
| "top1": 91.58786865234374, | |
| "top5": 98.13917205810547 | |
| }, | |
| { | |
| "epoch": 4.47, | |
| "grad_norm": 1.2070825391159268, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5455, | |
| "step": 14850 | |
| }, | |
| { | |
| "ce_loss": 0.5443538892269134, | |
| "epoch": 4.47, | |
| "inp_emb_norm": 0.3718359375, | |
| "loss": 0.5443538892269134, | |
| "masked_top1": 62.73788932800293, | |
| "masked_top5": 86.95017120361328, | |
| "step": 14850, | |
| "top1": 91.70748413085937, | |
| "top5": 98.2123405456543 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 1.0786306746042886, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5447, | |
| "step": 14900 | |
| }, | |
| { | |
| "ce_loss": 0.5514276492595672, | |
| "epoch": 4.48, | |
| "inp_emb_norm": 0.36875, | |
| "loss": 0.5514276492595672, | |
| "masked_top1": 62.11178161621094, | |
| "masked_top5": 85.72534118652344, | |
| "step": 14900, | |
| "top1": 91.74148101806641, | |
| "top5": 98.11538375854492 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 0.9911108641898027, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5571, | |
| "step": 14950 | |
| }, | |
| { | |
| "ce_loss": 0.5549844616651535, | |
| "epoch": 4.5, | |
| "inp_emb_norm": 0.366484375, | |
| "loss": 0.5549844616651535, | |
| "masked_top1": 61.723852920532224, | |
| "masked_top5": 86.07279510498047, | |
| "step": 14950, | |
| "top1": 91.52112564086914, | |
| "top5": 98.13692977905274 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "grad_norm": 1.0479818912700751, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5508, | |
| "step": 15000 | |
| }, | |
| { | |
| "ce_loss": 0.5503324097394944, | |
| "epoch": 4.51, | |
| "inp_emb_norm": 0.36609375, | |
| "loss": 0.5503324097394944, | |
| "masked_top1": 62.31337753295898, | |
| "masked_top5": 85.98730865478515, | |
| "step": 15000, | |
| "top1": 91.61835815429687, | |
| "top5": 98.14640243530273 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "grad_norm": 1.0126345802253716, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5604, | |
| "step": 15050 | |
| }, | |
| { | |
| "ce_loss": 0.5613218837976456, | |
| "epoch": 4.53, | |
| "inp_emb_norm": 0.368203125, | |
| "loss": 0.5613218837976456, | |
| "masked_top1": 60.66889373779297, | |
| "masked_top5": 85.60351181030273, | |
| "step": 15050, | |
| "top1": 91.41504837036133, | |
| "top5": 98.12169372558594 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 1.1194678195459478, | |
| "learning_rate": 0.0001, | |
| "loss": 0.558, | |
| "step": 15100 | |
| }, | |
| { | |
| "ce_loss": 0.5630833846330643, | |
| "epoch": 4.54, | |
| "inp_emb_norm": 0.368828125, | |
| "loss": 0.5630833846330643, | |
| "masked_top1": 60.917673873901364, | |
| "masked_top5": 85.1576773071289, | |
| "step": 15100, | |
| "top1": 91.41613555908204, | |
| "top5": 98.04640029907226 | |
| }, | |
| { | |
| "epoch": 4.56, | |
| "grad_norm": 1.0532041635279352, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5558, | |
| "step": 15150 | |
| }, | |
| { | |
| "ce_loss": 0.5506490439176559, | |
| "epoch": 4.56, | |
| "inp_emb_norm": 0.366640625, | |
| "loss": 0.5506490439176559, | |
| "masked_top1": 62.93031044006348, | |
| "masked_top5": 86.40555801391602, | |
| "step": 15150, | |
| "top1": 91.61476593017578, | |
| "top5": 98.21818695068359 | |
| }, | |
| { | |
| "epoch": 4.57, | |
| "grad_norm": 1.1553777996600965, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5592, | |
| "step": 15200 | |
| }, | |
| { | |
| "ce_loss": 0.5720086497068405, | |
| "epoch": 4.57, | |
| "inp_emb_norm": 0.367890625, | |
| "loss": 0.5720086497068405, | |
| "masked_top1": 60.15533386230469, | |
| "masked_top5": 84.74554382324219, | |
| "step": 15200, | |
| "top1": 91.33564956665039, | |
| "top5": 98.00524505615235 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "grad_norm": 1.0565117376954045, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5639, | |
| "step": 15250 | |
| }, | |
| { | |
| "ce_loss": 0.5550442606210708, | |
| "epoch": 4.59, | |
| "inp_emb_norm": 0.3610546875, | |
| "loss": 0.5550442606210708, | |
| "masked_top1": 61.29122528076172, | |
| "masked_top5": 85.56437545776367, | |
| "step": 15250, | |
| "top1": 91.44481323242188, | |
| "top5": 98.13257965087891 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 0.971805815056899, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5632, | |
| "step": 15300 | |
| }, | |
| { | |
| "ce_loss": 0.565330091714859, | |
| "epoch": 4.6, | |
| "inp_emb_norm": 0.3659375, | |
| "loss": 0.565330091714859, | |
| "masked_top1": 60.39768196105957, | |
| "masked_top5": 85.04548843383789, | |
| "step": 15300, | |
| "top1": 91.39098709106446, | |
| "top5": 98.0403810119629 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 1.0545177592362711, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5666, | |
| "step": 15350 | |
| }, | |
| { | |
| "ce_loss": 0.5629042333364487, | |
| "epoch": 4.62, | |
| "inp_emb_norm": 0.363046875, | |
| "loss": 0.5629042333364487, | |
| "masked_top1": 61.29995155334473, | |
| "masked_top5": 85.63533966064453, | |
| "step": 15350, | |
| "top1": 91.3496403503418, | |
| "top5": 98.15489151000976 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 1.0664326850250532, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5688, | |
| "step": 15400 | |
| }, | |
| { | |
| "ce_loss": 0.5751272231340409, | |
| "epoch": 4.63, | |
| "inp_emb_norm": 0.3648046875, | |
| "loss": 0.5751272231340409, | |
| "masked_top1": 59.40922256469727, | |
| "masked_top5": 84.36576995849609, | |
| "step": 15400, | |
| "top1": 91.27351089477538, | |
| "top5": 97.98653579711915 | |
| }, | |
| { | |
| "epoch": 4.65, | |
| "grad_norm": 1.1863731412253264, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5736, | |
| "step": 15450 | |
| }, | |
| { | |
| "ce_loss": 0.5713528543710709, | |
| "epoch": 4.65, | |
| "inp_emb_norm": 0.36609375, | |
| "loss": 0.5713528543710709, | |
| "masked_top1": 60.48031639099121, | |
| "masked_top5": 84.88833801269531, | |
| "step": 15450, | |
| "top1": 91.37643188476562, | |
| "top5": 98.01537475585937 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "grad_norm": 1.0582685713420434, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5666, | |
| "step": 15500 | |
| }, | |
| { | |
| "ce_loss": 0.5606839144229889, | |
| "epoch": 4.66, | |
| "inp_emb_norm": 0.3706640625, | |
| "loss": 0.5606839144229889, | |
| "masked_top1": 61.53562179565429, | |
| "masked_top5": 85.30425521850586, | |
| "step": 15500, | |
| "top1": 91.48294464111328, | |
| "top5": 98.07853866577149 | |
| }, | |
| { | |
| "epoch": 4.68, | |
| "grad_norm": 1.1085095153878863, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5639, | |
| "step": 15550 | |
| }, | |
| { | |
| "ce_loss": 0.5658587354421616, | |
| "epoch": 4.68, | |
| "inp_emb_norm": 0.3694921875, | |
| "loss": 0.5658587354421616, | |
| "masked_top1": 60.92467597961426, | |
| "masked_top5": 85.25927993774414, | |
| "step": 15550, | |
| "top1": 91.32387847900391, | |
| "top5": 98.04909301757813 | |
| }, | |
| { | |
| "epoch": 4.69, | |
| "grad_norm": 1.0956454635831958, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5628, | |
| "step": 15600 | |
| }, | |
| { | |
| "ce_loss": 0.5535443860292435, | |
| "epoch": 4.69, | |
| "inp_emb_norm": 0.3674609375, | |
| "loss": 0.5535443860292435, | |
| "masked_top1": 62.16223007202149, | |
| "masked_top5": 85.78218490600585, | |
| "step": 15600, | |
| "top1": 91.56584075927735, | |
| "top5": 98.14968353271485 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 1.0299728685580238, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5815, | |
| "step": 15650 | |
| }, | |
| { | |
| "ce_loss": 0.5903949171304703, | |
| "epoch": 4.71, | |
| "inp_emb_norm": 0.366796875, | |
| "loss": 0.5903949171304703, | |
| "masked_top1": 59.38263565063477, | |
| "masked_top5": 84.16266036987305, | |
| "step": 15650, | |
| "top1": 91.00887252807617, | |
| "top5": 97.9221858215332 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 1.0470383149335234, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5781, | |
| "step": 15700 | |
| }, | |
| { | |
| "ce_loss": 0.5768050736188889, | |
| "epoch": 4.72, | |
| "inp_emb_norm": 0.3702734375, | |
| "loss": 0.5768050736188889, | |
| "masked_top1": 60.77295166015625, | |
| "masked_top5": 84.56514495849609, | |
| "step": 15700, | |
| "top1": 91.20448944091797, | |
| "top5": 97.95070877075196 | |
| }, | |
| { | |
| "epoch": 4.74, | |
| "grad_norm": 1.0460925861543287, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5703, | |
| "step": 15750 | |
| }, | |
| { | |
| "ce_loss": 0.5766702961921691, | |
| "epoch": 4.74, | |
| "inp_emb_norm": 0.367265625, | |
| "loss": 0.5766702961921691, | |
| "masked_top1": 60.788666229248044, | |
| "masked_top5": 84.2236555480957, | |
| "step": 15750, | |
| "top1": 91.25414123535157, | |
| "top5": 97.99418731689452 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 1.0639938057731058, | |
| "learning_rate": 0.0001, | |
| "loss": 0.576, | |
| "step": 15800 | |
| }, | |
| { | |
| "ce_loss": 0.5707284951210022, | |
| "epoch": 4.75, | |
| "inp_emb_norm": 0.37234375, | |
| "loss": 0.5707284951210022, | |
| "masked_top1": 60.47222633361817, | |
| "masked_top5": 85.08194046020508, | |
| "step": 15800, | |
| "top1": 91.27689331054688, | |
| "top5": 98.0308624267578 | |
| }, | |
| { | |
| "epoch": 4.77, | |
| "grad_norm": 1.0685967682651536, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5746, | |
| "step": 15850 | |
| }, | |
| { | |
| "ce_loss": 0.57530591070652, | |
| "epoch": 4.77, | |
| "inp_emb_norm": 0.369296875, | |
| "loss": 0.57530591070652, | |
| "masked_top1": 60.0161856842041, | |
| "masked_top5": 84.5344613647461, | |
| "step": 15850, | |
| "top1": 91.30489456176758, | |
| "top5": 98.01641876220702 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "grad_norm": 1.0744915511743809, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5806, | |
| "step": 15900 | |
| }, | |
| { | |
| "ce_loss": 0.5880693066120147, | |
| "epoch": 4.78, | |
| "inp_emb_norm": 0.37296875, | |
| "loss": 0.5880693066120147, | |
| "masked_top1": 59.781610260009764, | |
| "masked_top5": 84.03608184814453, | |
| "step": 15900, | |
| "top1": 91.16118774414062, | |
| "top5": 97.90285369873047 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 1.1777116989988736, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5841, | |
| "step": 15950 | |
| }, | |
| { | |
| "ce_loss": 0.5911537754535675, | |
| "epoch": 4.8, | |
| "inp_emb_norm": 0.3667578125, | |
| "loss": 0.5911537754535675, | |
| "masked_top1": 59.32529609680176, | |
| "masked_top5": 84.31457275390625, | |
| "step": 15950, | |
| "top1": 90.98917724609375, | |
| "top5": 97.9403483581543 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "grad_norm": 1.1564579774891486, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5796, | |
| "step": 16000 | |
| }, | |
| { | |
| "ce_loss": 0.5745090502500534, | |
| "epoch": 4.81, | |
| "inp_emb_norm": 0.3758203125, | |
| "loss": 0.5745090502500534, | |
| "masked_top1": 60.10156044006348, | |
| "masked_top5": 85.1797428894043, | |
| "step": 16000, | |
| "top1": 91.18394180297851, | |
| "top5": 98.0737141418457 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 1.1492330940204298, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5883, | |
| "step": 16050 | |
| }, | |
| { | |
| "ce_loss": 0.5925427573919296, | |
| "epoch": 4.83, | |
| "inp_emb_norm": 0.3705078125, | |
| "loss": 0.5925427573919296, | |
| "masked_top1": 59.59210731506348, | |
| "masked_top5": 84.50634429931641, | |
| "step": 16050, | |
| "top1": 91.06646270751953, | |
| "top5": 97.93232162475586 | |
| }, | |
| { | |
| "epoch": 4.84, | |
| "grad_norm": 1.0605567187549139, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5859, | |
| "step": 16100 | |
| }, | |
| { | |
| "ce_loss": 0.584769184589386, | |
| "epoch": 4.84, | |
| "inp_emb_norm": 0.376796875, | |
| "loss": 0.584769184589386, | |
| "masked_top1": 60.4832283782959, | |
| "masked_top5": 84.92732849121094, | |
| "step": 16100, | |
| "top1": 91.11549453735351, | |
| "top5": 97.98623245239258 | |
| }, | |
| { | |
| "epoch": 4.86, | |
| "grad_norm": 1.0454099861852648, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5788, | |
| "step": 16150 | |
| }, | |
| { | |
| "ce_loss": 0.5863047724962235, | |
| "epoch": 4.86, | |
| "inp_emb_norm": 0.3832421875, | |
| "loss": 0.5863047724962235, | |
| "masked_top1": 58.98088394165039, | |
| "masked_top5": 84.14134887695313, | |
| "step": 16150, | |
| "top1": 91.02991897583007, | |
| "top5": 97.89835571289062 | |
| }, | |
| { | |
| "epoch": 4.87, | |
| "grad_norm": 1.1813560843799207, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5937, | |
| "step": 16200 | |
| }, | |
| { | |
| "ce_loss": 0.5899865692853927, | |
| "epoch": 4.87, | |
| "inp_emb_norm": 0.3682421875, | |
| "loss": 0.5899865692853927, | |
| "masked_top1": 59.65545417785645, | |
| "masked_top5": 83.82899841308594, | |
| "step": 16200, | |
| "top1": 90.97882614135742, | |
| "top5": 97.96341979980468 | |
| }, | |
| { | |
| "epoch": 4.89, | |
| "grad_norm": 1.1257949963772835, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6029, | |
| "step": 16250 | |
| }, | |
| { | |
| "ce_loss": 0.6203358447551728, | |
| "epoch": 4.89, | |
| "inp_emb_norm": 0.3719921875, | |
| "loss": 0.6203358447551728, | |
| "masked_top1": 58.96188240051269, | |
| "masked_top5": 83.78718704223633, | |
| "step": 16250, | |
| "top1": 90.81191055297852, | |
| "top5": 97.87782455444336 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "grad_norm": 1.0932714591709916, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5876, | |
| "step": 16300 | |
| }, | |
| { | |
| "ce_loss": 0.5763556951284409, | |
| "epoch": 4.9, | |
| "inp_emb_norm": 0.37578125, | |
| "loss": 0.5763556951284409, | |
| "masked_top1": 60.64178207397461, | |
| "masked_top5": 85.0282049560547, | |
| "step": 16300, | |
| "top1": 91.18880279541015, | |
| "top5": 98.01950302124024 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 0.9495406306411024, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5891, | |
| "step": 16350 | |
| }, | |
| { | |
| "ce_loss": 0.5920463001728058, | |
| "epoch": 4.92, | |
| "inp_emb_norm": 0.3779296875, | |
| "loss": 0.5920463001728058, | |
| "masked_top1": 60.18883232116699, | |
| "masked_top5": 83.78975204467774, | |
| "step": 16350, | |
| "top1": 91.02826797485352, | |
| "top5": 97.88983917236328 | |
| }, | |
| { | |
| "epoch": 4.93, | |
| "grad_norm": 1.117017854175032, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5989, | |
| "step": 16400 | |
| }, | |
| { | |
| "ce_loss": 0.6076039922237396, | |
| "epoch": 4.93, | |
| "inp_emb_norm": 0.3727734375, | |
| "loss": 0.6076039922237396, | |
| "masked_top1": 58.69016136169434, | |
| "masked_top5": 83.0786215209961, | |
| "step": 16400, | |
| "top1": 90.75551071166993, | |
| "top5": 97.82947280883789 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 1.071523606880974, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6012, | |
| "step": 16450 | |
| }, | |
| { | |
| "ce_loss": 0.6065960395336151, | |
| "epoch": 4.95, | |
| "inp_emb_norm": 0.380703125, | |
| "loss": 0.6065960395336151, | |
| "masked_top1": 58.35442459106445, | |
| "masked_top5": 83.6309603881836, | |
| "step": 16450, | |
| "top1": 90.80092453002929, | |
| "top5": 97.82826370239258 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 1.100617719882716, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5942, | |
| "step": 16500 | |
| }, | |
| { | |
| "ce_loss": 0.5968893599510193, | |
| "epoch": 4.96, | |
| "inp_emb_norm": 0.376484375, | |
| "loss": 0.5968893599510193, | |
| "masked_top1": 58.843356704711915, | |
| "masked_top5": 83.53122055053711, | |
| "step": 16500, | |
| "top1": 90.88707092285156, | |
| "top5": 97.87701583862305 | |
| }, | |
| { | |
| "epoch": 4.98, | |
| "grad_norm": 1.0119812517995022, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6025, | |
| "step": 16550 | |
| }, | |
| { | |
| "ce_loss": 0.5882117158174515, | |
| "epoch": 4.98, | |
| "inp_emb_norm": 0.374296875, | |
| "loss": 0.5882117158174515, | |
| "masked_top1": 60.96791168212891, | |
| "masked_top5": 84.55708404541015, | |
| "step": 16550, | |
| "top1": 91.02984649658202, | |
| "top5": 97.95657623291015 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "grad_norm": 1.0572442138344962, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5944, | |
| "step": 16600 | |
| }, | |
| { | |
| "ce_loss": 0.5818337166309356, | |
| "epoch": 4.99, | |
| "inp_emb_norm": 0.3735546875, | |
| "loss": 0.5818337166309356, | |
| "masked_top1": 60.71773277282715, | |
| "masked_top5": 85.01761001586914, | |
| "step": 16600, | |
| "top1": 91.06825744628907, | |
| "top5": 98.01745468139649 | |
| }, | |
| { | |
| "epoch": 5.01, | |
| "grad_norm": 0.8868244741347852, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4283, | |
| "step": 16650 | |
| }, | |
| { | |
| "ce_loss": 0.4169870808720589, | |
| "epoch": 5.01, | |
| "inp_emb_norm": 0.3746484375, | |
| "loss": 0.4169870808720589, | |
| "masked_top1": 74.96077354431152, | |
| "masked_top5": 91.62375808715821, | |
| "step": 16650, | |
| "top1": 93.5944010925293, | |
| "top5": 98.75289016723633 | |
| }, | |
| { | |
| "epoch": 5.02, | |
| "grad_norm": 0.8006136974249275, | |
| "learning_rate": 0.0001, | |
| "loss": 0.24, | |
| "step": 16700 | |
| }, | |
| { | |
| "ce_loss": 0.23759305894374846, | |
| "epoch": 5.02, | |
| "inp_emb_norm": 0.377421875, | |
| "loss": 0.23759305894374846, | |
| "masked_top1": 89.66921447753906, | |
| "masked_top5": 98.84856094360352, | |
| "step": 16700, | |
| "top1": 96.22275146484375, | |
| "top5": 99.58460357666016 | |
| }, | |
| { | |
| "epoch": 5.04, | |
| "grad_norm": 0.7814579996198231, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2428, | |
| "step": 16750 | |
| }, | |
| { | |
| "ce_loss": 0.24006595432758332, | |
| "epoch": 5.04, | |
| "inp_emb_norm": 0.3719921875, | |
| "loss": 0.24006595432758332, | |
| "masked_top1": 89.90816116333008, | |
| "masked_top5": 98.82037811279297, | |
| "step": 16750, | |
| "top1": 96.24885559082031, | |
| "top5": 99.55003356933594 | |
| }, | |
| { | |
| "epoch": 5.05, | |
| "grad_norm": 0.7858773051800831, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2395, | |
| "step": 16800 | |
| }, | |
| { | |
| "ce_loss": 0.23285773277282715, | |
| "epoch": 5.05, | |
| "inp_emb_norm": 0.3825390625, | |
| "loss": 0.23285773277282715, | |
| "masked_top1": 89.65037063598633, | |
| "masked_top5": 98.94374557495117, | |
| "step": 16800, | |
| "top1": 96.30195434570312, | |
| "top5": 99.57885467529297 | |
| }, | |
| { | |
| "epoch": 5.07, | |
| "grad_norm": 0.8834168839025487, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2461, | |
| "step": 16850 | |
| }, | |
| { | |
| "ce_loss": 0.24451201170682907, | |
| "epoch": 5.07, | |
| "inp_emb_norm": 0.38234375, | |
| "loss": 0.24451201170682907, | |
| "masked_top1": 88.87031646728515, | |
| "masked_top5": 98.54978637695312, | |
| "step": 16850, | |
| "top1": 96.15791427612305, | |
| "top5": 99.54232421875 | |
| }, | |
| { | |
| "epoch": 5.08, | |
| "grad_norm": 0.9643007917325629, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2558, | |
| "step": 16900 | |
| }, | |
| { | |
| "ce_loss": 0.25902660697698593, | |
| "epoch": 5.08, | |
| "inp_emb_norm": 0.365234375, | |
| "loss": 0.25902660697698593, | |
| "masked_top1": 88.15865753173829, | |
| "masked_top5": 98.6231999206543, | |
| "step": 16900, | |
| "top1": 95.8116291809082, | |
| "top5": 99.52821868896484 | |
| }, | |
| { | |
| "epoch": 5.1, | |
| "grad_norm": 0.9187240394700701, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2653, | |
| "step": 16950 | |
| }, | |
| { | |
| "ce_loss": 0.25573085725307465, | |
| "epoch": 5.1, | |
| "inp_emb_norm": 0.3799609375, | |
| "loss": 0.25573085725307465, | |
| "masked_top1": 88.29147186279297, | |
| "masked_top5": 98.50399887084961, | |
| "step": 16950, | |
| "top1": 95.97455947875977, | |
| "top5": 99.51153411865235 | |
| }, | |
| { | |
| "epoch": 5.11, | |
| "grad_norm": 0.8509566798013529, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2532, | |
| "step": 17000 | |
| }, | |
| { | |
| "ce_loss": 0.2532435983419418, | |
| "epoch": 5.11, | |
| "inp_emb_norm": 0.3784765625, | |
| "loss": 0.2532435983419418, | |
| "masked_top1": 88.47122055053711, | |
| "masked_top5": 98.4746549987793, | |
| "step": 17000, | |
| "top1": 95.92875396728516, | |
| "top5": 99.54647705078125 | |
| }, | |
| { | |
| "epoch": 5.13, | |
| "grad_norm": 0.966807468835321, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2595, | |
| "step": 17050 | |
| }, | |
| { | |
| "ce_loss": 0.2636346372961998, | |
| "epoch": 5.13, | |
| "inp_emb_norm": 0.37484375, | |
| "loss": 0.2636346372961998, | |
| "masked_top1": 87.61207931518555, | |
| "masked_top5": 98.49675857543946, | |
| "step": 17050, | |
| "top1": 95.76734481811523, | |
| "top5": 99.5260922241211 | |
| }, | |
| { | |
| "epoch": 5.14, | |
| "grad_norm": 0.8630676671965275, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2605, | |
| "step": 17100 | |
| }, | |
| { | |
| "ce_loss": 0.2629204204678535, | |
| "epoch": 5.14, | |
| "inp_emb_norm": 0.381796875, | |
| "loss": 0.2629204204678535, | |
| "masked_top1": 87.41785934448242, | |
| "masked_top5": 98.47596908569336, | |
| "step": 17100, | |
| "top1": 95.73066696166993, | |
| "top5": 99.52325912475585 | |
| }, | |
| { | |
| "epoch": 5.16, | |
| "grad_norm": 1.0053540322976233, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2648, | |
| "step": 17150 | |
| }, | |
| { | |
| "ce_loss": 0.27128377854824065, | |
| "epoch": 5.16, | |
| "inp_emb_norm": 0.3784765625, | |
| "loss": 0.27128377854824065, | |
| "masked_top1": 87.0358544921875, | |
| "masked_top5": 98.38056884765625, | |
| "step": 17150, | |
| "top1": 95.64348907470703, | |
| "top5": 99.50871109008789 | |
| }, | |
| { | |
| "epoch": 5.17, | |
| "grad_norm": 0.9508275120295281, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2745, | |
| "step": 17200 | |
| }, | |
| { | |
| "ce_loss": 0.27368868499994276, | |
| "epoch": 5.17, | |
| "inp_emb_norm": 0.3740234375, | |
| "loss": 0.27368868499994276, | |
| "masked_top1": 87.3692935180664, | |
| "masked_top5": 98.23250915527343, | |
| "step": 17200, | |
| "top1": 95.61584823608399, | |
| "top5": 99.49671478271485 | |
| }, | |
| { | |
| "epoch": 5.19, | |
| "grad_norm": 0.9646300915973821, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2814, | |
| "step": 17250 | |
| }, | |
| { | |
| "ce_loss": 0.27761764973402026, | |
| "epoch": 5.19, | |
| "inp_emb_norm": 0.3832421875, | |
| "loss": 0.27761764973402026, | |
| "masked_top1": 86.05260269165039, | |
| "masked_top5": 98.24610961914063, | |
| "step": 17250, | |
| "top1": 95.51309692382813, | |
| "top5": 99.5203547668457 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "grad_norm": 0.9852223949042181, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2756, | |
| "step": 17300 | |
| }, | |
| { | |
| "ce_loss": 0.2694446948170662, | |
| "epoch": 5.2, | |
| "inp_emb_norm": 0.3767578125, | |
| "loss": 0.2694446948170662, | |
| "masked_top1": 87.5820735168457, | |
| "masked_top5": 98.16199798583985, | |
| "step": 17300, | |
| "top1": 95.65110595703125, | |
| "top5": 99.50478088378907 | |
| }, | |
| { | |
| "epoch": 5.22, | |
| "grad_norm": 1.0161825412276837, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2836, | |
| "step": 17350 | |
| }, | |
| { | |
| "ce_loss": 0.2845872187614441, | |
| "epoch": 5.22, | |
| "inp_emb_norm": 0.38078125, | |
| "loss": 0.2845872187614441, | |
| "masked_top1": 85.83310623168946, | |
| "masked_top5": 97.87667556762695, | |
| "step": 17350, | |
| "top1": 95.49496994018554, | |
| "top5": 99.44075881958008 | |
| }, | |
| { | |
| "epoch": 5.23, | |
| "grad_norm": 0.9591201013156013, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2831, | |
| "step": 17400 | |
| }, | |
| { | |
| "ce_loss": 0.28442945539951325, | |
| "epoch": 5.23, | |
| "inp_emb_norm": 0.3789453125, | |
| "loss": 0.28442945539951325, | |
| "masked_top1": 86.10271911621093, | |
| "masked_top5": 97.88985565185547, | |
| "step": 17400, | |
| "top1": 95.40376953125, | |
| "top5": 99.44743041992187 | |
| }, | |
| { | |
| "epoch": 5.25, | |
| "grad_norm": 1.0432247629117117, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2854, | |
| "step": 17450 | |
| }, | |
| { | |
| "ce_loss": 0.27993369311094285, | |
| "epoch": 5.25, | |
| "inp_emb_norm": 0.3808203125, | |
| "loss": 0.27993369311094285, | |
| "masked_top1": 86.45830123901368, | |
| "masked_top5": 98.15035873413086, | |
| "step": 17450, | |
| "top1": 95.57389678955079, | |
| "top5": 99.48250961303711 | |
| }, | |
| { | |
| "epoch": 5.26, | |
| "grad_norm": 0.9701861929167235, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2907, | |
| "step": 17500 | |
| }, | |
| { | |
| "ce_loss": 0.28741121381521223, | |
| "epoch": 5.26, | |
| "inp_emb_norm": 0.3751171875, | |
| "loss": 0.28741121381521223, | |
| "masked_top1": 85.9087370300293, | |
| "masked_top5": 98.2075733947754, | |
| "step": 17500, | |
| "top1": 95.36790939331054, | |
| "top5": 99.45517150878906 | |
| }, | |
| { | |
| "epoch": 5.28, | |
| "grad_norm": 0.9018657474300132, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2939, | |
| "step": 17550 | |
| }, | |
| { | |
| "ce_loss": 0.2932562205195427, | |
| "epoch": 5.28, | |
| "inp_emb_norm": 0.376015625, | |
| "loss": 0.2932562205195427, | |
| "masked_top1": 85.43090469360351, | |
| "masked_top5": 97.69857574462891, | |
| "step": 17550, | |
| "top1": 95.35228378295898, | |
| "top5": 99.44769515991212 | |
| }, | |
| { | |
| "epoch": 5.29, | |
| "grad_norm": 1.0525621107942214, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2878, | |
| "step": 17600 | |
| }, | |
| { | |
| "ce_loss": 0.2939734762907028, | |
| "epoch": 5.29, | |
| "inp_emb_norm": 0.3778515625, | |
| "loss": 0.2939734762907028, | |
| "masked_top1": 85.80782012939453, | |
| "masked_top5": 98.14864028930664, | |
| "step": 17600, | |
| "top1": 95.27761352539062, | |
| "top5": 99.49363082885742 | |
| }, | |
| { | |
| "epoch": 5.31, | |
| "grad_norm": 1.0032239373031064, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3036, | |
| "step": 17650 | |
| }, | |
| { | |
| "ce_loss": 0.3070009741187096, | |
| "epoch": 5.31, | |
| "inp_emb_norm": 0.37828125, | |
| "loss": 0.3070009741187096, | |
| "masked_top1": 84.42236633300782, | |
| "masked_top5": 97.74387069702148, | |
| "step": 17650, | |
| "top1": 95.10071838378906, | |
| "top5": 99.41670013427735 | |
| }, | |
| { | |
| "epoch": 5.32, | |
| "grad_norm": 1.0274167035814015, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3011, | |
| "step": 17700 | |
| }, | |
| { | |
| "ce_loss": 0.307579453587532, | |
| "epoch": 5.32, | |
| "inp_emb_norm": 0.3801953125, | |
| "loss": 0.307579453587532, | |
| "masked_top1": 84.19732971191407, | |
| "masked_top5": 97.56189498901367, | |
| "step": 17700, | |
| "top1": 95.09036392211914, | |
| "top5": 99.39419525146485 | |
| }, | |
| { | |
| "epoch": 5.34, | |
| "grad_norm": 1.019591016333828, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3006, | |
| "step": 17750 | |
| }, | |
| { | |
| "ce_loss": 0.29659480959177015, | |
| "epoch": 5.34, | |
| "inp_emb_norm": 0.379140625, | |
| "loss": 0.29659480959177015, | |
| "masked_top1": 84.7645182800293, | |
| "masked_top5": 98.02963897705078, | |
| "step": 17750, | |
| "top1": 95.21526107788085, | |
| "top5": 99.45744186401367 | |
| }, | |
| { | |
| "epoch": 5.35, | |
| "grad_norm": 1.045966882681033, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3046, | |
| "step": 17800 | |
| }, | |
| { | |
| "ce_loss": 0.3082049387693405, | |
| "epoch": 5.35, | |
| "inp_emb_norm": 0.3823828125, | |
| "loss": 0.3082049387693405, | |
| "masked_top1": 84.11483215332031, | |
| "masked_top5": 97.67893463134766, | |
| "step": 17800, | |
| "top1": 95.09591766357421, | |
| "top5": 99.39608154296874 | |
| }, | |
| { | |
| "epoch": 5.37, | |
| "grad_norm": 0.9692493761597851, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3087, | |
| "step": 17850 | |
| }, | |
| { | |
| "ce_loss": 0.3111723321676254, | |
| "epoch": 5.37, | |
| "inp_emb_norm": 0.3821875, | |
| "loss": 0.3111723321676254, | |
| "masked_top1": 83.85779495239258, | |
| "masked_top5": 97.46583740234375, | |
| "step": 17850, | |
| "top1": 94.99774032592774, | |
| "top5": 99.40854736328124 | |
| }, | |
| { | |
| "epoch": 5.38, | |
| "grad_norm": 0.9440787583366769, | |
| "learning_rate": 0.0001, | |
| "loss": 0.306, | |
| "step": 17900 | |
| }, | |
| { | |
| "ce_loss": 0.31266897201538085, | |
| "epoch": 5.38, | |
| "inp_emb_norm": 0.380859375, | |
| "loss": 0.31266897201538085, | |
| "masked_top1": 83.7997444152832, | |
| "masked_top5": 97.51995346069336, | |
| "step": 17900, | |
| "top1": 94.99930786132812, | |
| "top5": 99.39216735839844 | |
| }, | |
| { | |
| "epoch": 5.4, | |
| "grad_norm": 0.9729412684124459, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3072, | |
| "step": 17950 | |
| }, | |
| { | |
| "ce_loss": 0.30246726602315904, | |
| "epoch": 5.4, | |
| "inp_emb_norm": 0.385234375, | |
| "loss": 0.30246726602315904, | |
| "masked_top1": 84.3121369934082, | |
| "masked_top5": 97.73384506225585, | |
| "step": 17950, | |
| "top1": 95.10583526611327, | |
| "top5": 99.43766952514649 | |
| }, | |
| { | |
| "epoch": 5.41, | |
| "grad_norm": 0.9636765858785842, | |
| "learning_rate": 0.0001, | |
| "loss": 0.307, | |
| "step": 18000 | |
| }, | |
| { | |
| "ce_loss": 0.30428083807229994, | |
| "epoch": 5.41, | |
| "inp_emb_norm": 0.381484375, | |
| "loss": 0.30428083807229994, | |
| "masked_top1": 84.8429086303711, | |
| "masked_top5": 97.52333801269532, | |
| "step": 18000, | |
| "top1": 95.1666488647461, | |
| "top5": 99.39987106323242 | |
| }, | |
| { | |
| "epoch": 5.43, | |
| "grad_norm": 1.0186629227679855, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3133, | |
| "step": 18050 | |
| }, | |
| { | |
| "ce_loss": 0.3080432793498039, | |
| "epoch": 5.43, | |
| "inp_emb_norm": 0.381015625, | |
| "loss": 0.3080432793498039, | |
| "masked_top1": 83.97354797363282, | |
| "masked_top5": 97.46352157592773, | |
| "step": 18050, | |
| "top1": 95.05124603271484, | |
| "top5": 99.38775970458984 | |
| }, | |
| { | |
| "epoch": 5.44, | |
| "grad_norm": 0.9556152250711039, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3298, | |
| "step": 18100 | |
| }, | |
| { | |
| "ce_loss": 0.35822920709848405, | |
| "epoch": 5.44, | |
| "inp_emb_norm": 0.38015625, | |
| "loss": 0.35822920709848405, | |
| "masked_top1": 83.25973114013672, | |
| "masked_top5": 97.05907196044922, | |
| "step": 18100, | |
| "top1": 94.64421905517578, | |
| "top5": 99.1507731628418 | |
| }, | |
| { | |
| "epoch": 5.46, | |
| "grad_norm": 1.0598324926140277, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3205, | |
| "step": 18150 | |
| }, | |
| { | |
| "ce_loss": 0.32162826359272, | |
| "epoch": 5.46, | |
| "inp_emb_norm": 0.3842578125, | |
| "loss": 0.32162826359272, | |
| "masked_top1": 83.18663421630859, | |
| "masked_top5": 97.19483627319336, | |
| "step": 18150, | |
| "top1": 94.94844818115234, | |
| "top5": 99.35345962524414 | |
| }, | |
| { | |
| "epoch": 5.47, | |
| "grad_norm": 0.9530378094960094, | |
| "learning_rate": 0.0001, | |
| "loss": 0.32, | |
| "step": 18200 | |
| }, | |
| { | |
| "ce_loss": 0.3288844656944275, | |
| "epoch": 5.47, | |
| "inp_emb_norm": 0.3790625, | |
| "loss": 0.3288844656944275, | |
| "masked_top1": 82.93034057617187, | |
| "masked_top5": 97.11549606323243, | |
| "step": 18200, | |
| "top1": 94.78947708129883, | |
| "top5": 99.34183990478516 | |
| }, | |
| { | |
| "epoch": 5.49, | |
| "grad_norm": 0.998874046778327, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3207, | |
| "step": 18250 | |
| }, | |
| { | |
| "ce_loss": 0.32029964685440065, | |
| "epoch": 5.49, | |
| "inp_emb_norm": 0.38546875, | |
| "loss": 0.32029964685440065, | |
| "masked_top1": 82.87285369873047, | |
| "masked_top5": 97.35884552001953, | |
| "step": 18250, | |
| "top1": 94.86287094116211, | |
| "top5": 99.37160079956055 | |
| }, | |
| { | |
| "epoch": 5.5, | |
| "grad_norm": 1.0507249584695442, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3264, | |
| "step": 18300 | |
| }, | |
| { | |
| "ce_loss": 0.3225671499967575, | |
| "epoch": 5.5, | |
| "inp_emb_norm": 0.381953125, | |
| "loss": 0.3225671499967575, | |
| "masked_top1": 82.94257446289062, | |
| "masked_top5": 97.59221542358398, | |
| "step": 18300, | |
| "top1": 94.790439453125, | |
| "top5": 99.39126449584961 | |
| }, | |
| { | |
| "epoch": 5.52, | |
| "grad_norm": 1.0538788268467503, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3272, | |
| "step": 18350 | |
| }, | |
| { | |
| "ce_loss": 0.3279902094602585, | |
| "epoch": 5.52, | |
| "inp_emb_norm": 0.378984375, | |
| "loss": 0.3279902094602585, | |
| "masked_top1": 82.10417236328125, | |
| "masked_top5": 97.07626525878906, | |
| "step": 18350, | |
| "top1": 94.77208694458008, | |
| "top5": 99.34082946777343 | |
| }, | |
| { | |
| "epoch": 5.53, | |
| "grad_norm": 1.0082114223647751, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3246, | |
| "step": 18400 | |
| }, | |
| { | |
| "ce_loss": 0.3286957702040672, | |
| "epoch": 5.53, | |
| "inp_emb_norm": 0.3857421875, | |
| "loss": 0.3286957702040672, | |
| "masked_top1": 82.18844451904297, | |
| "masked_top5": 97.20039276123048, | |
| "step": 18400, | |
| "top1": 94.72737030029298, | |
| "top5": 99.36099243164062 | |
| }, | |
| { | |
| "epoch": 5.55, | |
| "grad_norm": 1.1198021466793433, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3312, | |
| "step": 18450 | |
| }, | |
| { | |
| "ce_loss": 0.33603747010231017, | |
| "epoch": 5.55, | |
| "inp_emb_norm": 0.3957421875, | |
| "loss": 0.33603747010231017, | |
| "masked_top1": 81.67584823608398, | |
| "masked_top5": 96.54129943847656, | |
| "step": 18450, | |
| "top1": 94.65423965454102, | |
| "top5": 99.32092361450195 | |
| }, | |
| { | |
| "epoch": 5.56, | |
| "grad_norm": 1.0668250210650634, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3274, | |
| "step": 18500 | |
| }, | |
| { | |
| "ce_loss": 0.3218830382823944, | |
| "epoch": 5.56, | |
| "inp_emb_norm": 0.3800390625, | |
| "loss": 0.3218830382823944, | |
| "masked_top1": 82.66215911865234, | |
| "masked_top5": 97.63473297119141, | |
| "step": 18500, | |
| "top1": 94.83183807373047, | |
| "top5": 99.41087814331054 | |
| }, | |
| { | |
| "epoch": 5.58, | |
| "grad_norm": 1.0489105931316678, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3323, | |
| "step": 18550 | |
| }, | |
| { | |
| "ce_loss": 0.33785298705101013, | |
| "epoch": 5.58, | |
| "inp_emb_norm": 0.3783984375, | |
| "loss": 0.33785298705101013, | |
| "masked_top1": 82.34555877685547, | |
| "masked_top5": 97.0906037902832, | |
| "step": 18550, | |
| "top1": 94.63372589111329, | |
| "top5": 99.32784606933593 | |
| }, | |
| { | |
| "epoch": 5.59, | |
| "grad_norm": 1.0468329188388257, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3334, | |
| "step": 18600 | |
| }, | |
| { | |
| "ce_loss": 0.3364351660013199, | |
| "epoch": 5.59, | |
| "inp_emb_norm": 0.388515625, | |
| "loss": 0.3364351660013199, | |
| "masked_top1": 81.59525100708008, | |
| "masked_top5": 96.72632537841797, | |
| "step": 18600, | |
| "top1": 94.66922760009766, | |
| "top5": 99.31353454589843 | |
| }, | |
| { | |
| "epoch": 5.61, | |
| "grad_norm": 1.0268275103914735, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3306, | |
| "step": 18650 | |
| }, | |
| { | |
| "ce_loss": 0.327090705037117, | |
| "epoch": 5.61, | |
| "inp_emb_norm": 0.384921875, | |
| "loss": 0.327090705037117, | |
| "masked_top1": 82.4957682800293, | |
| "masked_top5": 96.94613693237305, | |
| "step": 18650, | |
| "top1": 94.83338302612304, | |
| "top5": 99.36148330688476 | |
| }, | |
| { | |
| "epoch": 5.62, | |
| "grad_norm": 1.048035539110107, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3317, | |
| "step": 18700 | |
| }, | |
| { | |
| "ce_loss": 0.3305124366283417, | |
| "epoch": 5.62, | |
| "inp_emb_norm": 0.3794921875, | |
| "loss": 0.3305124366283417, | |
| "masked_top1": 82.27645935058594, | |
| "masked_top5": 97.16238998413085, | |
| "step": 18700, | |
| "top1": 94.69090454101563, | |
| "top5": 99.3725503540039 | |
| }, | |
| { | |
| "epoch": 5.64, | |
| "grad_norm": 1.0935078838634331, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3428, | |
| "step": 18750 | |
| }, | |
| { | |
| "ce_loss": 0.34309773981571196, | |
| "epoch": 5.64, | |
| "inp_emb_norm": 0.379453125, | |
| "loss": 0.34309773981571196, | |
| "masked_top1": 81.5336685180664, | |
| "masked_top5": 96.76238327026367, | |
| "step": 18750, | |
| "top1": 94.5162728881836, | |
| "top5": 99.30676239013673 | |
| }, | |
| { | |
| "epoch": 5.65, | |
| "grad_norm": 1.0784022808624179, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3433, | |
| "step": 18800 | |
| }, | |
| { | |
| "ce_loss": 0.35102885723114013, | |
| "epoch": 5.65, | |
| "inp_emb_norm": 0.3797265625, | |
| "loss": 0.35102885723114013, | |
| "masked_top1": 80.56207229614257, | |
| "masked_top5": 96.39179016113282, | |
| "step": 18800, | |
| "top1": 94.42943588256836, | |
| "top5": 99.2728482055664 | |
| }, | |
| { | |
| "epoch": 5.67, | |
| "grad_norm": 1.0393038843820417, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3427, | |
| "step": 18850 | |
| }, | |
| { | |
| "ce_loss": 0.3519377601146698, | |
| "epoch": 5.67, | |
| "inp_emb_norm": 0.386484375, | |
| "loss": 0.3519377601146698, | |
| "masked_top1": 80.00395278930664, | |
| "masked_top5": 96.58664108276368, | |
| "step": 18850, | |
| "top1": 94.40878112792969, | |
| "top5": 99.31891250610352 | |
| }, | |
| { | |
| "epoch": 5.68, | |
| "grad_norm": 1.1009482557837906, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3438, | |
| "step": 18900 | |
| }, | |
| { | |
| "ce_loss": 0.33931906819343566, | |
| "epoch": 5.68, | |
| "inp_emb_norm": 0.383515625, | |
| "loss": 0.33931906819343566, | |
| "masked_top1": 81.50663543701172, | |
| "masked_top5": 96.86479187011719, | |
| "step": 18900, | |
| "top1": 94.59292114257812, | |
| "top5": 99.32182586669921 | |
| }, | |
| { | |
| "epoch": 5.7, | |
| "grad_norm": 0.9751674873578042, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3507, | |
| "step": 18950 | |
| }, | |
| { | |
| "ce_loss": 0.343431094288826, | |
| "epoch": 5.7, | |
| "inp_emb_norm": 0.3859375, | |
| "loss": 0.343431094288826, | |
| "masked_top1": 80.64304809570312, | |
| "masked_top5": 96.53154220581055, | |
| "step": 18950, | |
| "top1": 94.48710632324219, | |
| "top5": 99.31856002807618 | |
| }, | |
| { | |
| "epoch": 5.71, | |
| "grad_norm": 0.9792712686829252, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3409, | |
| "step": 19000 | |
| }, | |
| { | |
| "ce_loss": 0.34443502128124237, | |
| "epoch": 5.71, | |
| "inp_emb_norm": 0.38875, | |
| "loss": 0.34443502128124237, | |
| "masked_top1": 80.8620687866211, | |
| "masked_top5": 96.8039859008789, | |
| "step": 19000, | |
| "top1": 94.45713119506836, | |
| "top5": 99.3434228515625 | |
| }, | |
| { | |
| "epoch": 5.73, | |
| "grad_norm": 1.0250483722882462, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3462, | |
| "step": 19050 | |
| }, | |
| { | |
| "ce_loss": 0.3492258018255234, | |
| "epoch": 5.73, | |
| "inp_emb_norm": 0.3844921875, | |
| "loss": 0.3492258018255234, | |
| "masked_top1": 80.19672760009766, | |
| "masked_top5": 96.82942138671875, | |
| "step": 19050, | |
| "top1": 94.36864486694336, | |
| "top5": 99.30538925170899 | |
| }, | |
| { | |
| "epoch": 5.74, | |
| "grad_norm": 1.033814306715085, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3465, | |
| "step": 19100 | |
| }, | |
| { | |
| "ce_loss": 0.3455532872676849, | |
| "epoch": 5.74, | |
| "inp_emb_norm": 0.3897265625, | |
| "loss": 0.3455532872676849, | |
| "masked_top1": 81.14021148681641, | |
| "masked_top5": 96.46601867675781, | |
| "step": 19100, | |
| "top1": 94.59025283813476, | |
| "top5": 99.29209457397461 | |
| }, | |
| { | |
| "epoch": 5.76, | |
| "grad_norm": 1.1266003627715306, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3496, | |
| "step": 19150 | |
| }, | |
| { | |
| "ce_loss": 0.3510564410686493, | |
| "epoch": 5.76, | |
| "inp_emb_norm": 0.3796484375, | |
| "loss": 0.3510564410686493, | |
| "masked_top1": 80.42627136230469, | |
| "masked_top5": 96.30276489257812, | |
| "step": 19150, | |
| "top1": 94.38863174438477, | |
| "top5": 99.28523544311524 | |
| }, | |
| { | |
| "epoch": 5.77, | |
| "grad_norm": 1.0786804231908673, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3519, | |
| "step": 19200 | |
| }, | |
| { | |
| "ce_loss": 0.35517399430274965, | |
| "epoch": 5.77, | |
| "inp_emb_norm": 0.379765625, | |
| "loss": 0.35517399430274965, | |
| "masked_top1": 80.2329933166504, | |
| "masked_top5": 96.45190505981445, | |
| "step": 19200, | |
| "top1": 94.32036392211914, | |
| "top5": 99.28158004760742 | |
| }, | |
| { | |
| "epoch": 5.79, | |
| "grad_norm": 0.9810743864140166, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3532, | |
| "step": 19250 | |
| }, | |
| { | |
| "ce_loss": 0.35108084678649903, | |
| "epoch": 5.79, | |
| "inp_emb_norm": 0.38859375, | |
| "loss": 0.35108084678649903, | |
| "masked_top1": 80.51531219482422, | |
| "masked_top5": 96.45788009643555, | |
| "step": 19250, | |
| "top1": 94.3976658630371, | |
| "top5": 99.2694255065918 | |
| }, | |
| { | |
| "epoch": 5.8, | |
| "grad_norm": 0.9998840954054705, | |
| "learning_rate": 0.0001, | |
| "loss": 0.358, | |
| "step": 19300 | |
| }, | |
| { | |
| "ce_loss": 0.36306409776210785, | |
| "epoch": 5.8, | |
| "inp_emb_norm": 0.3805859375, | |
| "loss": 0.36306409776210785, | |
| "masked_top1": 79.08367248535156, | |
| "masked_top5": 96.06164031982422, | |
| "step": 19300, | |
| "top1": 94.15442794799804, | |
| "top5": 99.21199111938476 | |
| }, | |
| { | |
| "epoch": 5.82, | |
| "grad_norm": 0.9561538506979583, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3517, | |
| "step": 19350 | |
| }, | |
| { | |
| "ce_loss": 0.3433873727917671, | |
| "epoch": 5.82, | |
| "inp_emb_norm": 0.3860546875, | |
| "loss": 0.3433873727917671, | |
| "masked_top1": 81.34098831176757, | |
| "masked_top5": 96.61977462768554, | |
| "step": 19350, | |
| "top1": 94.59346282958984, | |
| "top5": 99.30811492919922 | |
| }, | |
| { | |
| "epoch": 5.83, | |
| "grad_norm": 0.9641219081481858, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3571, | |
| "step": 19400 | |
| }, | |
| { | |
| "ce_loss": 0.3487838166952133, | |
| "epoch": 5.83, | |
| "inp_emb_norm": 0.3849609375, | |
| "loss": 0.3487838166952133, | |
| "masked_top1": 81.22178527832031, | |
| "masked_top5": 96.62050567626953, | |
| "step": 19400, | |
| "top1": 94.4317202758789, | |
| "top5": 99.28485198974609 | |
| }, | |
| { | |
| "epoch": 5.85, | |
| "grad_norm": 1.1056878672573682, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3589, | |
| "step": 19450 | |
| }, | |
| { | |
| "ce_loss": 0.3580132460594177, | |
| "epoch": 5.85, | |
| "inp_emb_norm": 0.395234375, | |
| "loss": 0.3580132460594177, | |
| "masked_top1": 79.92015823364258, | |
| "masked_top5": 96.51984786987305, | |
| "step": 19450, | |
| "top1": 94.39044494628907, | |
| "top5": 99.25140426635743 | |
| }, | |
| { | |
| "epoch": 5.86, | |
| "grad_norm": 1.0460192475813763, | |
| "learning_rate": 0.0001, | |
| "loss": 0.357, | |
| "step": 19500 | |
| }, | |
| { | |
| "ce_loss": 0.3561101830005646, | |
| "epoch": 5.86, | |
| "inp_emb_norm": 0.3905859375, | |
| "loss": 0.3561101830005646, | |
| "masked_top1": 80.33946014404297, | |
| "masked_top5": 96.31464675903321, | |
| "step": 19500, | |
| "top1": 94.33697235107422, | |
| "top5": 99.24774505615234 | |
| }, | |
| { | |
| "epoch": 5.88, | |
| "grad_norm": 0.9997822590004654, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3667, | |
| "step": 19550 | |
| }, | |
| { | |
| "ce_loss": 0.3721057403087616, | |
| "epoch": 5.88, | |
| "inp_emb_norm": 0.3875390625, | |
| "loss": 0.3721057403087616, | |
| "masked_top1": 78.66541412353516, | |
| "masked_top5": 95.76679565429687, | |
| "step": 19550, | |
| "top1": 94.079482421875, | |
| "top5": 99.20158752441407 | |
| }, | |
| { | |
| "epoch": 5.89, | |
| "grad_norm": 1.0227356742773894, | |
| "learning_rate": 0.0001, | |
| "loss": 0.362, | |
| "step": 19600 | |
| }, | |
| { | |
| "ce_loss": 0.36288787305355075, | |
| "epoch": 5.89, | |
| "inp_emb_norm": 0.393046875, | |
| "loss": 0.36288787305355075, | |
| "masked_top1": 78.83365341186523, | |
| "masked_top5": 95.98956665039063, | |
| "step": 19600, | |
| "top1": 94.17432601928711, | |
| "top5": 99.2359994506836 | |
| }, | |
| { | |
| "epoch": 5.91, | |
| "grad_norm": 1.0348435013200028, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3637, | |
| "step": 19650 | |
| }, | |
| { | |
| "ce_loss": 0.3597774177789688, | |
| "epoch": 5.91, | |
| "inp_emb_norm": 0.38703125, | |
| "loss": 0.3597774177789688, | |
| "masked_top1": 79.5971403503418, | |
| "masked_top5": 96.29687118530273, | |
| "step": 19650, | |
| "top1": 94.29608825683594, | |
| "top5": 99.25850982666016 | |
| }, | |
| { | |
| "epoch": 5.92, | |
| "grad_norm": 1.0680810347893466, | |
| "learning_rate": 0.0001, | |
| "loss": 0.366, | |
| "step": 19700 | |
| }, | |
| { | |
| "ce_loss": 0.3640019080042839, | |
| "epoch": 5.92, | |
| "inp_emb_norm": 0.3908203125, | |
| "loss": 0.3640019080042839, | |
| "masked_top1": 80.11074829101562, | |
| "masked_top5": 96.37212921142579, | |
| "step": 19700, | |
| "top1": 94.18343032836914, | |
| "top5": 99.28712341308594 | |
| }, | |
| { | |
| "epoch": 5.94, | |
| "grad_norm": 0.9094859392388285, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3678, | |
| "step": 19750 | |
| }, | |
| { | |
| "ce_loss": 0.37644350349903105, | |
| "epoch": 5.94, | |
| "inp_emb_norm": 0.3891015625, | |
| "loss": 0.37644350349903105, | |
| "masked_top1": 78.18459594726562, | |
| "masked_top5": 95.86135772705079, | |
| "step": 19750, | |
| "top1": 93.957353515625, | |
| "top5": 99.2127586364746 | |
| }, | |
| { | |
| "epoch": 5.95, | |
| "grad_norm": 1.0310255933372185, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3668, | |
| "step": 19800 | |
| }, | |
| { | |
| "ce_loss": 0.3627872896194458, | |
| "epoch": 5.95, | |
| "inp_emb_norm": 0.3862109375, | |
| "loss": 0.3627872896194458, | |
| "masked_top1": 79.60676147460937, | |
| "masked_top5": 96.10193313598633, | |
| "step": 19800, | |
| "top1": 94.17147338867187, | |
| "top5": 99.25555862426758 | |
| }, | |
| { | |
| "epoch": 5.97, | |
| "grad_norm": 1.0599907424871888, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3695, | |
| "step": 19850 | |
| }, | |
| { | |
| "ce_loss": 0.37383450448513034, | |
| "epoch": 5.97, | |
| "inp_emb_norm": 0.4004296875, | |
| "loss": 0.37383450448513034, | |
| "masked_top1": 77.94340957641602, | |
| "masked_top5": 96.1452799987793, | |
| "step": 19850, | |
| "top1": 94.01543914794922, | |
| "top5": 99.25049942016602 | |
| }, | |
| { | |
| "epoch": 5.98, | |
| "grad_norm": 1.123287217336448, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3667, | |
| "step": 19900 | |
| }, | |
| { | |
| "ce_loss": 0.3690763407945633, | |
| "epoch": 5.98, | |
| "inp_emb_norm": 0.3968359375, | |
| "loss": 0.3690763407945633, | |
| "masked_top1": 78.41387680053711, | |
| "masked_top5": 95.78071762084961, | |
| "step": 19900, | |
| "top1": 94.15260635375977, | |
| "top5": 99.18431182861327 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 1.05174121076266, | |
| "learning_rate": 0.0001, | |
| "loss": 0.37, | |
| "step": 19950 | |
| }, | |
| { | |
| "ce_loss": 0.36719063580036165, | |
| "epoch": 6.0, | |
| "inp_emb_norm": 0.3880859375, | |
| "loss": 0.36719063580036165, | |
| "masked_top1": 79.3749658203125, | |
| "masked_top5": 96.28604400634765, | |
| "step": 19950, | |
| "top1": 94.1464176940918, | |
| "top5": 99.22554748535157 | |
| }, | |
| { | |
| "epoch": 6.02, | |
| "grad_norm": 0.7372869779982937, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1841, | |
| "step": 20000 | |
| }, | |
| { | |
| "ce_loss": 0.1820479117333889, | |
| "epoch": 6.02, | |
| "inp_emb_norm": 0.3961328125, | |
| "loss": 0.1820479117333889, | |
| "masked_top1": 93.90685821533204, | |
| "masked_top5": 99.41201736450195, | |
| "step": 20000, | |
| "top1": 97.13008163452149, | |
| "top5": 99.6633514404297 | |
| }, | |
| { | |
| "epoch": 6.03, | |
| "grad_norm": 0.7458143529658549, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1884, | |
| "step": 20050 | |
| }, | |
| { | |
| "ce_loss": 0.19271491587162018, | |
| "epoch": 6.03, | |
| "inp_emb_norm": 0.39109375, | |
| "loss": 0.19271491587162018, | |
| "masked_top1": 93.04420074462891, | |
| "masked_top5": 99.42253952026367, | |
| "step": 20050, | |
| "top1": 96.94086242675782, | |
| "top5": 99.66081787109376 | |
| }, | |
| { | |
| "epoch": 6.05, | |
| "grad_norm": 0.8859283290144573, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1866, | |
| "step": 20100 | |
| }, | |
| { | |
| "ce_loss": 0.191538667678833, | |
| "epoch": 6.05, | |
| "inp_emb_norm": 0.3925390625, | |
| "loss": 0.191538667678833, | |
| "masked_top1": 93.09733154296875, | |
| "masked_top5": 99.30335250854492, | |
| "step": 20100, | |
| "top1": 96.98335327148438, | |
| "top5": 99.65503143310546 | |
| }, | |
| { | |
| "epoch": 6.06, | |
| "grad_norm": 0.8809125747332417, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1852, | |
| "step": 20150 | |
| }, | |
| { | |
| "ce_loss": 0.1867568638920784, | |
| "epoch": 6.06, | |
| "inp_emb_norm": 0.391640625, | |
| "loss": 0.1867568638920784, | |
| "masked_top1": 93.50607498168945, | |
| "masked_top5": 99.47832672119141, | |
| "step": 20150, | |
| "top1": 97.09137954711915, | |
| "top5": 99.65869873046876 | |
| }, | |
| { | |
| "epoch": 6.08, | |
| "grad_norm": 0.826592224481137, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1849, | |
| "step": 20200 | |
| }, | |
| { | |
| "ce_loss": 0.1898653081059456, | |
| "epoch": 6.08, | |
| "inp_emb_norm": 0.3903515625, | |
| "loss": 0.1898653081059456, | |
| "masked_top1": 93.60175338745117, | |
| "masked_top5": 99.40977355957031, | |
| "step": 20200, | |
| "top1": 96.99095306396484, | |
| "top5": 99.64656112670899 | |
| }, | |
| { | |
| "epoch": 6.09, | |
| "grad_norm": 0.8167934529115294, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1935, | |
| "step": 20250 | |
| }, | |
| { | |
| "ce_loss": 0.1948414433002472, | |
| "epoch": 6.09, | |
| "inp_emb_norm": 0.3841015625, | |
| "loss": 0.1948414433002472, | |
| "masked_top1": 92.95522598266602, | |
| "masked_top5": 99.3899104309082, | |
| "step": 20250, | |
| "top1": 96.98196350097656, | |
| "top5": 99.63696716308594 | |
| }, | |
| { | |
| "epoch": 6.11, | |
| "grad_norm": 0.7345815776721539, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1926, | |
| "step": 20300 | |
| }, | |
| { | |
| "ce_loss": 0.19291402637958527, | |
| "epoch": 6.11, | |
| "inp_emb_norm": 0.3873828125, | |
| "loss": 0.19291402637958527, | |
| "masked_top1": 93.4042626953125, | |
| "masked_top5": 99.38736923217773, | |
| "step": 20300, | |
| "top1": 96.96928878784179, | |
| "top5": 99.63698593139648 | |
| }, | |
| { | |
| "epoch": 6.12, | |
| "grad_norm": 0.8149440856604434, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1952, | |
| "step": 20350 | |
| }, | |
| { | |
| "ce_loss": 0.1960041469335556, | |
| "epoch": 6.12, | |
| "inp_emb_norm": 0.39453125, | |
| "loss": 0.1960041469335556, | |
| "masked_top1": 92.89414184570313, | |
| "masked_top5": 99.46369537353516, | |
| "step": 20350, | |
| "top1": 96.92287322998047, | |
| "top5": 99.66681442260742 | |
| }, | |
| { | |
| "epoch": 6.14, | |
| "grad_norm": 0.8776230940079653, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1979, | |
| "step": 20400 | |
| }, | |
| { | |
| "ce_loss": 0.20121153205633163, | |
| "epoch": 6.14, | |
| "inp_emb_norm": 0.3866015625, | |
| "loss": 0.20121153205633163, | |
| "masked_top1": 92.70189239501953, | |
| "masked_top5": 99.36892135620117, | |
| "step": 20400, | |
| "top1": 96.79621154785156, | |
| "top5": 99.64973388671875 | |
| }, | |
| { | |
| "epoch": 6.15, | |
| "grad_norm": 0.8585106345021164, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2003, | |
| "step": 20450 | |
| }, | |
| { | |
| "ce_loss": 0.19490561604499818, | |
| "epoch": 6.15, | |
| "inp_emb_norm": 0.39078125, | |
| "loss": 0.19490561604499818, | |
| "masked_top1": 92.95185577392579, | |
| "masked_top5": 99.3991976928711, | |
| "step": 20450, | |
| "top1": 96.93541824340821, | |
| "top5": 99.64690612792968 | |
| }, | |
| { | |
| "epoch": 6.17, | |
| "grad_norm": 0.7976355967797862, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2052, | |
| "step": 20500 | |
| }, | |
| { | |
| "ce_loss": 0.20391724795103072, | |
| "epoch": 6.17, | |
| "inp_emb_norm": 0.39390625, | |
| "loss": 0.20391724795103072, | |
| "masked_top1": 92.00440231323242, | |
| "masked_top5": 99.22619979858399, | |
| "step": 20500, | |
| "top1": 96.7327310180664, | |
| "top5": 99.62498168945312 | |
| }, | |
| { | |
| "epoch": 6.18, | |
| "grad_norm": 0.8148083321822271, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2054, | |
| "step": 20550 | |
| }, | |
| { | |
| "ce_loss": 0.20799223512411116, | |
| "epoch": 6.18, | |
| "inp_emb_norm": 0.391953125, | |
| "loss": 0.20799223512411116, | |
| "masked_top1": 92.3514372253418, | |
| "masked_top5": 99.27444519042969, | |
| "step": 20550, | |
| "top1": 96.76949005126953, | |
| "top5": 99.62392379760742 | |
| }, | |
| { | |
| "epoch": 6.2, | |
| "grad_norm": 0.778356178235768, | |
| "learning_rate": 0.0001, | |
| "loss": 0.206, | |
| "step": 20600 | |
| }, | |
| { | |
| "ce_loss": 0.2053508883714676, | |
| "epoch": 6.2, | |
| "inp_emb_norm": 0.397890625, | |
| "loss": 0.2053508883714676, | |
| "masked_top1": 92.0650765991211, | |
| "masked_top5": 99.24300872802735, | |
| "step": 20600, | |
| "top1": 96.75581268310548, | |
| "top5": 99.60792434692382 | |
| }, | |
| { | |
| "epoch": 6.21, | |
| "grad_norm": 0.8329059510004336, | |
| "learning_rate": 0.0001, | |
| "loss": 0.211, | |
| "step": 20650 | |
| }, | |
| { | |
| "ce_loss": 0.20962412267923355, | |
| "epoch": 6.21, | |
| "inp_emb_norm": 0.3975390625, | |
| "loss": 0.20962412267923355, | |
| "masked_top1": 92.15507781982421, | |
| "masked_top5": 99.21770889282226, | |
| "step": 20650, | |
| "top1": 96.68198043823242, | |
| "top5": 99.61430511474609 | |
| }, | |
| { | |
| "epoch": 6.23, | |
| "grad_norm": 0.9696545325150863, | |
| "learning_rate": 0.0001, | |
| "loss": 0.214, | |
| "step": 20700 | |
| }, | |
| { | |
| "ce_loss": 0.21425070196390153, | |
| "epoch": 6.23, | |
| "inp_emb_norm": 0.3883984375, | |
| "loss": 0.21425070196390153, | |
| "masked_top1": 91.60136703491212, | |
| "masked_top5": 99.14032455444335, | |
| "step": 20700, | |
| "top1": 96.57572845458985, | |
| "top5": 99.60167327880859 | |
| }, | |
| { | |
| "epoch": 6.24, | |
| "grad_norm": 0.9465855604091451, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2179, | |
| "step": 20750 | |
| }, | |
| { | |
| "ce_loss": 0.21650043070316316, | |
| "epoch": 6.24, | |
| "inp_emb_norm": 0.3938671875, | |
| "loss": 0.21650043070316316, | |
| "masked_top1": 91.89517654418945, | |
| "masked_top5": 99.22545211791993, | |
| "step": 20750, | |
| "top1": 96.63436431884766, | |
| "top5": 99.61578002929687 | |
| }, | |
| { | |
| "epoch": 6.26, | |
| "grad_norm": 0.8848039020955999, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2167, | |
| "step": 20800 | |
| }, | |
| { | |
| "ce_loss": 0.21624796688556672, | |
| "epoch": 6.26, | |
| "inp_emb_norm": 0.3955078125, | |
| "loss": 0.21624796688556672, | |
| "masked_top1": 91.79812927246094, | |
| "masked_top5": 99.13185974121093, | |
| "step": 20800, | |
| "top1": 96.59934356689453, | |
| "top5": 99.60213928222656 | |
| }, | |
| { | |
| "epoch": 6.27, | |
| "grad_norm": 0.8555074354464904, | |
| "learning_rate": 0.0001, | |
| "loss": 0.22, | |
| "step": 20850 | |
| }, | |
| { | |
| "ce_loss": 0.2177719497680664, | |
| "epoch": 6.27, | |
| "inp_emb_norm": 0.386171875, | |
| "loss": 0.2177719497680664, | |
| "masked_top1": 91.69604415893555, | |
| "masked_top5": 99.20869293212891, | |
| "step": 20850, | |
| "top1": 96.5518881225586, | |
| "top5": 99.60115341186524 | |
| }, | |
| { | |
| "epoch": 6.29, | |
| "grad_norm": 0.8890783469847633, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2219, | |
| "step": 20900 | |
| }, | |
| { | |
| "ce_loss": 0.22593430548906326, | |
| "epoch": 6.29, | |
| "inp_emb_norm": 0.3901953125, | |
| "loss": 0.22593430548906326, | |
| "masked_top1": 91.24859771728515, | |
| "masked_top5": 98.95779678344726, | |
| "step": 20900, | |
| "top1": 96.41894073486328, | |
| "top5": 99.57270751953125 | |
| }, | |
| { | |
| "epoch": 6.3, | |
| "grad_norm": 0.8389189758028528, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2263, | |
| "step": 20950 | |
| }, | |
| { | |
| "ce_loss": 0.22432934373617172, | |
| "epoch": 6.3, | |
| "inp_emb_norm": 0.39359375, | |
| "loss": 0.22432934373617172, | |
| "masked_top1": 91.0940608215332, | |
| "masked_top5": 99.17452285766602, | |
| "step": 20950, | |
| "top1": 96.43647857666015, | |
| "top5": 99.61320083618165 | |
| }, | |
| { | |
| "epoch": 6.32, | |
| "grad_norm": 0.9535481479010524, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2232, | |
| "step": 21000 | |
| }, | |
| { | |
| "ce_loss": 0.22769318729639054, | |
| "epoch": 6.32, | |
| "inp_emb_norm": 0.392578125, | |
| "loss": 0.22769318729639054, | |
| "masked_top1": 91.18105072021484, | |
| "masked_top5": 99.13134719848632, | |
| "step": 21000, | |
| "top1": 96.39334274291993, | |
| "top5": 99.59284255981446 | |
| }, | |
| { | |
| "epoch": 6.33, | |
| "grad_norm": 0.9100595834466957, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2237, | |
| "step": 21050 | |
| }, | |
| { | |
| "ce_loss": 0.2257786351442337, | |
| "epoch": 6.33, | |
| "inp_emb_norm": 0.3939453125, | |
| "loss": 0.2257786351442337, | |
| "masked_top1": 90.99365447998046, | |
| "masked_top5": 99.22629165649414, | |
| "step": 21050, | |
| "top1": 96.43473999023438, | |
| "top5": 99.62341964721679 | |
| }, | |
| { | |
| "epoch": 6.35, | |
| "grad_norm": 0.9348221993172825, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2301, | |
| "step": 21100 | |
| }, | |
| { | |
| "ce_loss": 0.22199460864067078, | |
| "epoch": 6.35, | |
| "inp_emb_norm": 0.3928515625, | |
| "loss": 0.22199460864067078, | |
| "masked_top1": 91.3432958984375, | |
| "masked_top5": 99.21780807495117, | |
| "step": 21100, | |
| "top1": 96.48183792114258, | |
| "top5": 99.59834899902344 | |
| }, | |
| { | |
| "epoch": 6.36, | |
| "grad_norm": 0.8157766502142015, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2348, | |
| "step": 21150 | |
| }, | |
| { | |
| "ce_loss": 0.23358583688735962, | |
| "epoch": 6.36, | |
| "inp_emb_norm": 0.39484375, | |
| "loss": 0.23358583688735962, | |
| "masked_top1": 90.63573287963867, | |
| "masked_top5": 99.2670066833496, | |
| "step": 21150, | |
| "top1": 96.2616633605957, | |
| "top5": 99.61970520019531 | |
| }, | |
| { | |
| "epoch": 6.38, | |
| "grad_norm": 0.8204200350876739, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2331, | |
| "step": 21200 | |
| }, | |
| { | |
| "ce_loss": 0.2356252110004425, | |
| "epoch": 6.38, | |
| "inp_emb_norm": 0.38765625, | |
| "loss": 0.2356252110004425, | |
| "masked_top1": 91.13608352661133, | |
| "masked_top5": 99.10308471679687, | |
| "step": 21200, | |
| "top1": 96.23461364746093, | |
| "top5": 99.58956817626954 | |
| }, | |
| { | |
| "epoch": 6.39, | |
| "grad_norm": 0.8456950881445512, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2376, | |
| "step": 21250 | |
| }, | |
| { | |
| "ce_loss": 0.2390061578154564, | |
| "epoch": 6.39, | |
| "inp_emb_norm": 0.394609375, | |
| "loss": 0.2390061578154564, | |
| "masked_top1": 90.48685897827148, | |
| "masked_top5": 98.98336410522461, | |
| "step": 21250, | |
| "top1": 96.21184814453125, | |
| "top5": 99.59722549438476 | |
| }, | |
| { | |
| "epoch": 6.41, | |
| "grad_norm": 0.8287547291237191, | |
| "learning_rate": 0.0001, | |
| "loss": 0.242, | |
| "step": 21300 | |
| }, | |
| { | |
| "ce_loss": 0.2427684971690178, | |
| "epoch": 6.41, | |
| "inp_emb_norm": 0.4001953125, | |
| "loss": 0.2427684971690178, | |
| "masked_top1": 90.06592498779297, | |
| "masked_top5": 98.84460174560547, | |
| "step": 21300, | |
| "top1": 96.14539581298828, | |
| "top5": 99.56721633911133 | |
| }, | |
| { | |
| "epoch": 6.42, | |
| "grad_norm": 0.9051278595660746, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2406, | |
| "step": 21350 | |
| }, | |
| { | |
| "ce_loss": 0.24132068693637848, | |
| "epoch": 6.42, | |
| "inp_emb_norm": 0.3931640625, | |
| "loss": 0.24132068693637848, | |
| "masked_top1": 90.12520782470703, | |
| "masked_top5": 99.08217208862305, | |
| "step": 21350, | |
| "top1": 96.13710800170898, | |
| "top5": 99.58537933349609 | |
| }, | |
| { | |
| "epoch": 6.44, | |
| "grad_norm": 0.9309548266435707, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2448, | |
| "step": 21400 | |
| }, | |
| { | |
| "ce_loss": 0.2484480223059654, | |
| "epoch": 6.44, | |
| "inp_emb_norm": 0.4006640625, | |
| "loss": 0.2484480223059654, | |
| "masked_top1": 89.13191101074219, | |
| "masked_top5": 98.85858505249024, | |
| "step": 21400, | |
| "top1": 96.04462646484374, | |
| "top5": 99.58927947998046 | |
| }, | |
| { | |
| "epoch": 6.45, | |
| "grad_norm": 0.9843205463835414, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2433, | |
| "step": 21450 | |
| }, | |
| { | |
| "ce_loss": 0.2498919489979744, | |
| "epoch": 6.45, | |
| "inp_emb_norm": 0.3948046875, | |
| "loss": 0.2498919489979744, | |
| "masked_top1": 89.5735856628418, | |
| "masked_top5": 98.80221618652344, | |
| "step": 21450, | |
| "top1": 96.03055465698242, | |
| "top5": 99.56374588012696 | |
| }, | |
| { | |
| "epoch": 6.47, | |
| "grad_norm": 0.9647612984805798, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2479, | |
| "step": 21500 | |
| }, | |
| { | |
| "ce_loss": 0.25150889009237287, | |
| "epoch": 6.47, | |
| "inp_emb_norm": 0.3948828125, | |
| "loss": 0.25150889009237287, | |
| "masked_top1": 89.32119857788086, | |
| "masked_top5": 98.80714309692382, | |
| "step": 21500, | |
| "top1": 95.92482788085937, | |
| "top5": 99.56254333496094 | |
| }, | |
| { | |
| "epoch": 6.48, | |
| "grad_norm": 1.006084785735041, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2484, | |
| "step": 21550 | |
| }, | |
| { | |
| "ce_loss": 0.24291085809469223, | |
| "epoch": 6.48, | |
| "inp_emb_norm": 0.3999609375, | |
| "loss": 0.24291085809469223, | |
| "masked_top1": 89.84078826904297, | |
| "masked_top5": 98.79897399902343, | |
| "step": 21550, | |
| "top1": 96.13382278442383, | |
| "top5": 99.56241195678712 | |
| }, | |
| { | |
| "epoch": 6.5, | |
| "grad_norm": 0.9435564610662254, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2484, | |
| "step": 21600 | |
| }, | |
| { | |
| "ce_loss": 0.25397088915109634, | |
| "epoch": 6.5, | |
| "inp_emb_norm": 0.390078125, | |
| "loss": 0.25397088915109634, | |
| "masked_top1": 89.26468048095703, | |
| "masked_top5": 98.83024612426757, | |
| "step": 21600, | |
| "top1": 95.98090301513672, | |
| "top5": 99.53740264892578 | |
| }, | |
| { | |
| "epoch": 6.51, | |
| "grad_norm": 0.9039119083122704, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2536, | |
| "step": 21650 | |
| }, | |
| { | |
| "ce_loss": 0.2490449759364128, | |
| "epoch": 6.51, | |
| "inp_emb_norm": 0.398671875, | |
| "loss": 0.2490449759364128, | |
| "masked_top1": 90.15557754516601, | |
| "masked_top5": 98.96521423339844, | |
| "step": 21650, | |
| "top1": 96.07868286132812, | |
| "top5": 99.56334945678711 | |
| }, | |
| { | |
| "epoch": 6.53, | |
| "grad_norm": 0.9083177376434824, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2511, | |
| "step": 21700 | |
| }, | |
| { | |
| "ce_loss": 0.25019362777471543, | |
| "epoch": 6.53, | |
| "inp_emb_norm": 0.4041796875, | |
| "loss": 0.25019362777471543, | |
| "masked_top1": 89.28435134887695, | |
| "masked_top5": 98.76094360351563, | |
| "step": 21700, | |
| "top1": 95.98147827148438, | |
| "top5": 99.55952423095704 | |
| }, | |
| { | |
| "epoch": 6.54, | |
| "grad_norm": 0.9468133124761712, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2559, | |
| "step": 21750 | |
| }, | |
| { | |
| "ce_loss": 0.24820626825094222, | |
| "epoch": 6.54, | |
| "inp_emb_norm": 0.407109375, | |
| "loss": 0.24820626825094222, | |
| "masked_top1": 89.13103088378907, | |
| "masked_top5": 98.86271697998046, | |
| "step": 21750, | |
| "top1": 96.04320175170898, | |
| "top5": 99.57524795532227 | |
| }, | |
| { | |
| "epoch": 6.56, | |
| "grad_norm": 0.9411575962043335, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2515, | |
| "step": 21800 | |
| }, | |
| { | |
| "ce_loss": 0.25579195737838745, | |
| "epoch": 6.56, | |
| "inp_emb_norm": 0.3958203125, | |
| "loss": 0.25579195737838745, | |
| "masked_top1": 89.12390563964844, | |
| "masked_top5": 98.75174041748046, | |
| "step": 21800, | |
| "top1": 95.92440002441407, | |
| "top5": 99.55484436035157 | |
| }, | |
| { | |
| "epoch": 6.57, | |
| "grad_norm": 0.9083686787254603, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2572, | |
| "step": 21850 | |
| }, | |
| { | |
| "ce_loss": 0.2511313533782959, | |
| "epoch": 6.57, | |
| "inp_emb_norm": 0.398984375, | |
| "loss": 0.2511313533782959, | |
| "masked_top1": 90.02878692626953, | |
| "masked_top5": 98.91113662719727, | |
| "step": 21850, | |
| "top1": 96.03028366088867, | |
| "top5": 99.57074462890625 | |
| }, | |
| { | |
| "epoch": 6.59, | |
| "grad_norm": 0.9524791704046712, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2561, | |
| "step": 21900 | |
| }, | |
| { | |
| "ce_loss": 0.25475972771644595, | |
| "epoch": 6.59, | |
| "inp_emb_norm": 0.39703125, | |
| "loss": 0.25475972771644595, | |
| "masked_top1": 89.4745411682129, | |
| "masked_top5": 98.61525360107422, | |
| "step": 21900, | |
| "top1": 95.92261184692383, | |
| "top5": 99.54248062133789 | |
| }, | |
| { | |
| "epoch": 6.6, | |
| "grad_norm": 0.9780779866428146, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2543, | |
| "step": 21950 | |
| }, | |
| { | |
| "ce_loss": 0.25268141776323316, | |
| "epoch": 6.6, | |
| "inp_emb_norm": 0.400859375, | |
| "loss": 0.25268141776323316, | |
| "masked_top1": 89.10960723876953, | |
| "masked_top5": 98.78202194213867, | |
| "step": 21950, | |
| "top1": 96.01406188964843, | |
| "top5": 99.5376611328125 | |
| }, | |
| { | |
| "epoch": 6.62, | |
| "grad_norm": 0.9274176844179849, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2628, | |
| "step": 22000 | |
| }, | |
| { | |
| "ce_loss": 0.2659920188784599, | |
| "epoch": 6.62, | |
| "inp_emb_norm": 0.3957421875, | |
| "loss": 0.2659920188784599, | |
| "masked_top1": 88.22967803955078, | |
| "masked_top5": 98.6833935546875, | |
| "step": 22000, | |
| "top1": 95.75867980957031, | |
| "top5": 99.55640853881836 | |
| }, | |
| { | |
| "epoch": 6.63, | |
| "grad_norm": 1.033134365793791, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2614, | |
| "step": 22050 | |
| }, | |
| { | |
| "ce_loss": 0.26293145805597307, | |
| "epoch": 6.63, | |
| "inp_emb_norm": 0.3959375, | |
| "loss": 0.26293145805597307, | |
| "masked_top1": 88.64593627929688, | |
| "masked_top5": 98.49295043945312, | |
| "step": 22050, | |
| "top1": 95.80199890136718, | |
| "top5": 99.52016479492187 | |
| }, | |
| { | |
| "epoch": 6.65, | |
| "grad_norm": 0.9919274843394079, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2614, | |
| "step": 22100 | |
| }, | |
| { | |
| "ce_loss": 0.2592626142501831, | |
| "epoch": 6.65, | |
| "inp_emb_norm": 0.39484375, | |
| "loss": 0.2592626142501831, | |
| "masked_top1": 89.1805844116211, | |
| "masked_top5": 98.66838363647462, | |
| "step": 22100, | |
| "top1": 95.86548355102539, | |
| "top5": 99.5434815979004 | |
| }, | |
| { | |
| "epoch": 6.66, | |
| "grad_norm": 0.923669979047392, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2684, | |
| "step": 22150 | |
| }, | |
| { | |
| "ce_loss": 0.25953867882490156, | |
| "epoch": 6.66, | |
| "inp_emb_norm": 0.4025, | |
| "loss": 0.25953867882490156, | |
| "masked_top1": 88.57401412963867, | |
| "masked_top5": 98.65271347045899, | |
| "step": 22150, | |
| "top1": 95.81649612426757, | |
| "top5": 99.55012420654298 | |
| }, | |
| { | |
| "epoch": 6.68, | |
| "grad_norm": 0.9781925396499678, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2676, | |
| "step": 22200 | |
| }, | |
| { | |
| "ce_loss": 0.2756389129161835, | |
| "epoch": 6.68, | |
| "inp_emb_norm": 0.3946875, | |
| "loss": 0.2756389129161835, | |
| "masked_top1": 87.85789031982422, | |
| "masked_top5": 98.56079971313477, | |
| "step": 22200, | |
| "top1": 95.60260299682618, | |
| "top5": 99.51409820556641 | |
| }, | |
| { | |
| "epoch": 6.69, | |
| "grad_norm": 0.881278477232779, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2694, | |
| "step": 22250 | |
| }, | |
| { | |
| "ce_loss": 0.26733168482780456, | |
| "epoch": 6.69, | |
| "inp_emb_norm": 0.3998828125, | |
| "loss": 0.26733168482780456, | |
| "masked_top1": 88.51319839477539, | |
| "masked_top5": 98.58459121704101, | |
| "step": 22250, | |
| "top1": 95.67998138427734, | |
| "top5": 99.5245753479004 | |
| }, | |
| { | |
| "epoch": 6.71, | |
| "grad_norm": 0.9944879689962476, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2695, | |
| "step": 22300 | |
| }, | |
| { | |
| "ce_loss": 0.2680419811606407, | |
| "epoch": 6.71, | |
| "inp_emb_norm": 0.3937109375, | |
| "loss": 0.2680419811606407, | |
| "masked_top1": 89.19367111206054, | |
| "masked_top5": 98.77984527587891, | |
| "step": 22300, | |
| "top1": 95.70912155151368, | |
| "top5": 99.52434310913085 | |
| }, | |
| { | |
| "epoch": 6.72, | |
| "grad_norm": 0.9344294418856229, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2706, | |
| "step": 22350 | |
| }, | |
| { | |
| "ce_loss": 0.2640018093585968, | |
| "epoch": 6.72, | |
| "inp_emb_norm": 0.3963671875, | |
| "loss": 0.2640018093585968, | |
| "masked_top1": 88.87513946533203, | |
| "masked_top5": 98.68793273925782, | |
| "step": 22350, | |
| "top1": 95.7931625366211, | |
| "top5": 99.53593536376952 | |
| }, | |
| { | |
| "epoch": 6.74, | |
| "grad_norm": 0.9234955706616644, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2694, | |
| "step": 22400 | |
| }, | |
| { | |
| "ce_loss": 0.27443262994289397, | |
| "epoch": 6.74, | |
| "inp_emb_norm": 0.401484375, | |
| "loss": 0.27443262994289397, | |
| "masked_top1": 87.91703521728516, | |
| "masked_top5": 98.35836242675781, | |
| "step": 22400, | |
| "top1": 95.63983840942383, | |
| "top5": 99.48991470336914 | |
| }, | |
| { | |
| "epoch": 6.75, | |
| "grad_norm": 0.9774569390983554, | |
| "learning_rate": 0.0001, | |
| "loss": 0.273, | |
| "step": 22450 | |
| }, | |
| { | |
| "ce_loss": 0.2766887894272804, | |
| "epoch": 6.75, | |
| "inp_emb_norm": 0.398359375, | |
| "loss": 0.2766887894272804, | |
| "masked_top1": 88.1091081237793, | |
| "masked_top5": 98.57858413696289, | |
| "step": 22450, | |
| "top1": 95.64536865234375, | |
| "top5": 99.5364192199707 | |
| }, | |
| { | |
| "epoch": 6.77, | |
| "grad_norm": 0.8755362789382393, | |
| "learning_rate": 0.0001, | |
| "loss": 0.274, | |
| "step": 22500 | |
| }, | |
| { | |
| "ce_loss": 0.27666087716817855, | |
| "epoch": 6.77, | |
| "inp_emb_norm": 0.4048046875, | |
| "loss": 0.27666087716817855, | |
| "masked_top1": 87.76848861694336, | |
| "masked_top5": 98.57107147216797, | |
| "step": 22500, | |
| "top1": 95.56947143554687, | |
| "top5": 99.51631744384765 | |
| }, | |
| { | |
| "epoch": 6.78, | |
| "grad_norm": 0.8974975580290838, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2727, | |
| "step": 22550 | |
| }, | |
| { | |
| "ce_loss": 0.26903481036424637, | |
| "epoch": 6.78, | |
| "inp_emb_norm": 0.3957421875, | |
| "loss": 0.26903481036424637, | |
| "masked_top1": 88.31371643066406, | |
| "masked_top5": 98.50493865966797, | |
| "step": 22550, | |
| "top1": 95.6751333618164, | |
| "top5": 99.53861953735351 | |
| }, | |
| { | |
| "epoch": 6.8, | |
| "grad_norm": 0.9330247587083373, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2773, | |
| "step": 22600 | |
| }, | |
| { | |
| "ce_loss": 0.27828563034534454, | |
| "epoch": 6.8, | |
| "inp_emb_norm": 0.40484375, | |
| "loss": 0.27828563034534454, | |
| "masked_top1": 87.45176528930664, | |
| "masked_top5": 98.36986709594727, | |
| "step": 22600, | |
| "top1": 95.5737760925293, | |
| "top5": 99.49567245483398 | |
| }, | |
| { | |
| "epoch": 6.81, | |
| "grad_norm": 0.9335442063104634, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2753, | |
| "step": 22650 | |
| }, | |
| { | |
| "ce_loss": 0.27865981191396716, | |
| "epoch": 6.81, | |
| "inp_emb_norm": 0.40390625, | |
| "loss": 0.27865981191396716, | |
| "masked_top1": 87.10857299804688, | |
| "masked_top5": 98.53876022338868, | |
| "step": 22650, | |
| "top1": 95.47858245849609, | |
| "top5": 99.53715377807617 | |
| }, | |
| { | |
| "epoch": 6.83, | |
| "grad_norm": 0.9872165512055184, | |
| "learning_rate": 0.0001, | |
| "loss": 0.274, | |
| "step": 22700 | |
| }, | |
| { | |
| "ce_loss": 0.2781349629163742, | |
| "epoch": 6.83, | |
| "inp_emb_norm": 0.409296875, | |
| "loss": 0.2781349629163742, | |
| "masked_top1": 86.65185195922851, | |
| "masked_top5": 98.23174392700196, | |
| "step": 22700, | |
| "top1": 95.48913436889649, | |
| "top5": 99.51249389648437 | |
| }, | |
| { | |
| "epoch": 6.84, | |
| "grad_norm": 0.8421329759123019, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2809, | |
| "step": 22750 | |
| }, | |
| { | |
| "ce_loss": 0.2819749695062637, | |
| "epoch": 6.84, | |
| "inp_emb_norm": 0.4040625, | |
| "loss": 0.2819749695062637, | |
| "masked_top1": 87.23991241455079, | |
| "masked_top5": 98.37083847045898, | |
| "step": 22750, | |
| "top1": 95.48827087402344, | |
| "top5": 99.51471160888671 | |
| }, | |
| { | |
| "epoch": 6.86, | |
| "grad_norm": 0.9374127847054397, | |
| "learning_rate": 0.0001, | |
| "loss": 0.283, | |
| "step": 22800 | |
| }, | |
| { | |
| "ce_loss": 0.2752624320983887, | |
| "epoch": 6.86, | |
| "inp_emb_norm": 0.4168359375, | |
| "loss": 0.2752624320983887, | |
| "masked_top1": 87.0565657043457, | |
| "masked_top5": 98.52674087524414, | |
| "step": 22800, | |
| "top1": 95.5639727783203, | |
| "top5": 99.51801193237304 | |
| }, | |
| { | |
| "epoch": 6.87, | |
| "grad_norm": 0.894365544580622, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2806, | |
| "step": 22850 | |
| }, | |
| { | |
| "ce_loss": 0.28234552562236787, | |
| "epoch": 6.87, | |
| "inp_emb_norm": 0.3933984375, | |
| "loss": 0.28234552562236787, | |
| "masked_top1": 87.87512283325195, | |
| "masked_top5": 98.52397399902344, | |
| "step": 22850, | |
| "top1": 95.52448684692382, | |
| "top5": 99.49992218017579 | |
| }, | |
| { | |
| "epoch": 6.89, | |
| "grad_norm": 0.8865838096449303, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2803, | |
| "step": 22900 | |
| }, | |
| { | |
| "ce_loss": 0.2735359054803848, | |
| "epoch": 6.89, | |
| "inp_emb_norm": 0.4132421875, | |
| "loss": 0.2735359054803848, | |
| "masked_top1": 87.56801651000977, | |
| "masked_top5": 98.44611022949219, | |
| "step": 22900, | |
| "top1": 95.64279312133789, | |
| "top5": 99.52417373657227 | |
| }, | |
| { | |
| "epoch": 6.9, | |
| "grad_norm": 1.0014374606270757, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2846, | |
| "step": 22950 | |
| }, | |
| { | |
| "ce_loss": 0.2815407305955887, | |
| "epoch": 6.9, | |
| "inp_emb_norm": 0.4108203125, | |
| "loss": 0.2815407305955887, | |
| "masked_top1": 86.75137435913086, | |
| "masked_top5": 98.28164108276367, | |
| "step": 22950, | |
| "top1": 95.45286041259766, | |
| "top5": 99.5033415222168 | |
| }, | |
| { | |
| "epoch": 6.92, | |
| "grad_norm": 0.9418515928051996, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2881, | |
| "step": 23000 | |
| }, | |
| { | |
| "ce_loss": 0.2875212562084198, | |
| "epoch": 6.92, | |
| "inp_emb_norm": 0.393046875, | |
| "loss": 0.2875212562084198, | |
| "masked_top1": 86.9408479309082, | |
| "masked_top5": 98.42632766723632, | |
| "step": 23000, | |
| "top1": 95.38972747802734, | |
| "top5": 99.51202941894532 | |
| }, | |
| { | |
| "epoch": 6.93, | |
| "grad_norm": 0.9640044804691634, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2849, | |
| "step": 23050 | |
| }, | |
| { | |
| "ce_loss": 0.28959711760282514, | |
| "epoch": 6.93, | |
| "inp_emb_norm": 0.39890625, | |
| "loss": 0.28959711760282514, | |
| "masked_top1": 87.06779602050781, | |
| "masked_top5": 98.1885205078125, | |
| "step": 23050, | |
| "top1": 95.33015533447265, | |
| "top5": 99.4693441772461 | |
| }, | |
| { | |
| "epoch": 6.95, | |
| "grad_norm": 0.884763494943833, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2864, | |
| "step": 23100 | |
| }, | |
| { | |
| "ce_loss": 0.2791384127736092, | |
| "epoch": 6.95, | |
| "inp_emb_norm": 0.3983203125, | |
| "loss": 0.2791384127736092, | |
| "masked_top1": 87.34188446044922, | |
| "masked_top5": 98.60519989013672, | |
| "step": 23100, | |
| "top1": 95.44756973266601, | |
| "top5": 99.51571304321288 | |
| }, | |
| { | |
| "epoch": 6.96, | |
| "grad_norm": 0.9969087357121538, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2819, | |
| "step": 23150 | |
| }, | |
| { | |
| "ce_loss": 0.2769463035464287, | |
| "epoch": 6.96, | |
| "inp_emb_norm": 0.40640625, | |
| "loss": 0.2769463035464287, | |
| "masked_top1": 87.70826522827149, | |
| "masked_top5": 98.4365104675293, | |
| "step": 23150, | |
| "top1": 95.51589462280273, | |
| "top5": 99.52118911743165 | |
| }, | |
| { | |
| "epoch": 6.98, | |
| "grad_norm": 1.0169282128070403, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2876, | |
| "step": 23200 | |
| }, | |
| { | |
| "ce_loss": 0.28613451212644575, | |
| "epoch": 6.98, | |
| "inp_emb_norm": 0.409296875, | |
| "loss": 0.28613451212644575, | |
| "masked_top1": 86.17974639892579, | |
| "masked_top5": 98.22409042358399, | |
| "step": 23200, | |
| "top1": 95.39607528686524, | |
| "top5": 99.49454742431641 | |
| }, | |
| { | |
| "epoch": 6.99, | |
| "grad_norm": 1.0450277523154698, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2889, | |
| "step": 23250 | |
| }, | |
| { | |
| "ce_loss": 0.2973794335126877, | |
| "epoch": 6.99, | |
| "inp_emb_norm": 0.40296875, | |
| "loss": 0.2973794335126877, | |
| "masked_top1": 86.13511917114258, | |
| "masked_top5": 98.26584106445313, | |
| "step": 23250, | |
| "top1": 95.27028549194335, | |
| "top5": 99.47429718017578 | |
| }, | |
| { | |
| "epoch": 7.01, | |
| "grad_norm": 0.6565758700785924, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2269, | |
| "step": 23300 | |
| }, | |
| { | |
| "ce_loss": 0.22528975576162338, | |
| "epoch": 7.01, | |
| "inp_emb_norm": 0.4019140625, | |
| "loss": 0.22528975576162338, | |
| "masked_top1": 91.23332473754883, | |
| "masked_top5": 99.0590966796875, | |
| "step": 23300, | |
| "top1": 96.40823837280273, | |
| "top5": 99.59931091308594 | |
| }, | |
| { | |
| "epoch": 7.02, | |
| "grad_norm": 0.7347638074045553, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1611, | |
| "step": 23350 | |
| }, | |
| { | |
| "ce_loss": 0.16495208382606508, | |
| "epoch": 7.02, | |
| "inp_emb_norm": 0.4017578125, | |
| "loss": 0.16495208382606508, | |
| "masked_top1": 95.35215362548828, | |
| "masked_top5": 99.73044998168945, | |
| "step": 23350, | |
| "top1": 97.44107040405274, | |
| "top5": 99.70461227416992 | |
| }, | |
| { | |
| "epoch": 7.04, | |
| "grad_norm": 0.6645294223117588, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1606, | |
| "step": 23400 | |
| }, | |
| { | |
| "ce_loss": 0.1565721134841442, | |
| "epoch": 7.04, | |
| "inp_emb_norm": 0.4126953125, | |
| "loss": 0.1565721134841442, | |
| "masked_top1": 95.39784866333008, | |
| "masked_top5": 99.75249832153321, | |
| "step": 23400, | |
| "top1": 97.55490280151368, | |
| "top5": 99.71364120483399 | |
| }, | |
| { | |
| "epoch": 7.05, | |
| "grad_norm": 0.7050716054650297, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1606, | |
| "step": 23450 | |
| }, | |
| { | |
| "ce_loss": 0.15757375702261925, | |
| "epoch": 7.05, | |
| "inp_emb_norm": 0.4034765625, | |
| "loss": 0.15757375702261925, | |
| "masked_top1": 95.50349487304688, | |
| "masked_top5": 99.6744792175293, | |
| "step": 23450, | |
| "top1": 97.49706893920899, | |
| "top5": 99.70018051147461 | |
| }, | |
| { | |
| "epoch": 7.07, | |
| "grad_norm": 0.70439339407283, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1595, | |
| "step": 23500 | |
| }, | |
| { | |
| "ce_loss": 0.15974825277924537, | |
| "epoch": 7.07, | |
| "inp_emb_norm": 0.410625, | |
| "loss": 0.15974825277924537, | |
| "masked_top1": 95.76246231079102, | |
| "masked_top5": 99.68396453857422, | |
| "step": 23500, | |
| "top1": 97.5335317993164, | |
| "top5": 99.69112976074219 | |
| }, | |
| { | |
| "epoch": 7.08, | |
| "grad_norm": 0.8001940943257561, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1643, | |
| "step": 23550 | |
| }, | |
| { | |
| "ce_loss": 0.16370448261499404, | |
| "epoch": 7.08, | |
| "inp_emb_norm": 0.4030859375, | |
| "loss": 0.16370448261499404, | |
| "masked_top1": 95.43571884155273, | |
| "masked_top5": 99.68741439819335, | |
| "step": 23550, | |
| "top1": 97.46105117797852, | |
| "top5": 99.6865510559082 | |
| }, | |
| { | |
| "epoch": 7.1, | |
| "grad_norm": 0.7666147004483381, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1653, | |
| "step": 23600 | |
| }, | |
| { | |
| "ce_loss": 0.16188147634267808, | |
| "epoch": 7.1, | |
| "inp_emb_norm": 0.408828125, | |
| "loss": 0.16188147634267808, | |
| "masked_top1": 95.91392837524414, | |
| "masked_top5": 99.76342895507813, | |
| "step": 23600, | |
| "top1": 97.41842742919921, | |
| "top5": 99.69919830322266 | |
| }, | |
| { | |
| "epoch": 7.11, | |
| "grad_norm": 0.753790024789609, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1674, | |
| "step": 23650 | |
| }, | |
| { | |
| "ce_loss": 0.16793357729911804, | |
| "epoch": 7.11, | |
| "inp_emb_norm": 0.4030078125, | |
| "loss": 0.16793357729911804, | |
| "masked_top1": 95.3747721862793, | |
| "masked_top5": 99.69120513916016, | |
| "step": 23650, | |
| "top1": 97.39034225463867, | |
| "top5": 99.67092514038086 | |
| }, | |
| { | |
| "epoch": 7.13, | |
| "grad_norm": 0.7560663951042244, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1714, | |
| "step": 23700 | |
| }, | |
| { | |
| "ce_loss": 0.16748950853943825, | |
| "epoch": 7.13, | |
| "inp_emb_norm": 0.4015234375, | |
| "loss": 0.16748950853943825, | |
| "masked_top1": 95.61355926513671, | |
| "masked_top5": 99.70470977783204, | |
| "step": 23700, | |
| "top1": 97.36271957397462, | |
| "top5": 99.68995620727539 | |
| }, | |
| { | |
| "epoch": 7.14, | |
| "grad_norm": 0.7885900131749313, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1718, | |
| "step": 23750 | |
| }, | |
| { | |
| "ce_loss": 0.16834189236164093, | |
| "epoch": 7.14, | |
| "inp_emb_norm": 0.4105859375, | |
| "loss": 0.16834189236164093, | |
| "masked_top1": 95.20903045654296, | |
| "masked_top5": 99.66537704467774, | |
| "step": 23750, | |
| "top1": 97.41024429321288, | |
| "top5": 99.6710530090332 | |
| }, | |
| { | |
| "epoch": 7.16, | |
| "grad_norm": 0.708267720328551, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1718, | |
| "step": 23800 | |
| }, | |
| { | |
| "ce_loss": 0.16808076053857804, | |
| "epoch": 7.16, | |
| "inp_emb_norm": 0.4126953125, | |
| "loss": 0.16808076053857804, | |
| "masked_top1": 94.77564025878907, | |
| "masked_top5": 99.67258102416992, | |
| "step": 23800, | |
| "top1": 97.39059310913086, | |
| "top5": 99.68683700561523 | |
| }, | |
| { | |
| "epoch": 7.17, | |
| "grad_norm": 0.7351045964671515, | |
| "learning_rate": 0.0001, | |
| "loss": 0.174, | |
| "step": 23850 | |
| }, | |
| { | |
| "ce_loss": 0.1726333273947239, | |
| "epoch": 7.17, | |
| "inp_emb_norm": 0.4017578125, | |
| "loss": 0.1726333273947239, | |
| "masked_top1": 94.88044723510743, | |
| "masked_top5": 99.62449737548827, | |
| "step": 23850, | |
| "top1": 97.2899102783203, | |
| "top5": 99.65145614624024 | |
| }, | |
| { | |
| "epoch": 7.19, | |
| "grad_norm": 0.8049347796917691, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1787, | |
| "step": 23900 | |
| }, | |
| { | |
| "ce_loss": 0.17679258838295936, | |
| "epoch": 7.19, | |
| "inp_emb_norm": 0.4061328125, | |
| "loss": 0.17679258838295936, | |
| "masked_top1": 94.69095947265625, | |
| "masked_top5": 99.67362945556641, | |
| "step": 23900, | |
| "top1": 97.20210205078125, | |
| "top5": 99.68624572753906 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "grad_norm": 0.8012010836512982, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1788, | |
| "step": 23950 | |
| }, | |
| { | |
| "ce_loss": 0.1779346537590027, | |
| "epoch": 7.2, | |
| "inp_emb_norm": 0.40765625, | |
| "loss": 0.1779346537590027, | |
| "masked_top1": 94.71529022216797, | |
| "masked_top5": 99.61817611694336, | |
| "step": 23950, | |
| "top1": 97.14944107055663, | |
| "top5": 99.65997314453125 | |
| }, | |
| { | |
| "epoch": 7.22, | |
| "grad_norm": 0.8096182631639389, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1782, | |
| "step": 24000 | |
| }, | |
| { | |
| "ce_loss": 0.17779998898506164, | |
| "epoch": 7.22, | |
| "inp_emb_norm": 0.40890625, | |
| "loss": 0.17779998898506164, | |
| "masked_top1": 94.6820344543457, | |
| "masked_top5": 99.6923991394043, | |
| "step": 24000, | |
| "top1": 97.1572166442871, | |
| "top5": 99.69579544067383 | |
| }, | |
| { | |
| "epoch": 7.23, | |
| "grad_norm": 0.7208752096873576, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1795, | |
| "step": 24050 | |
| }, | |
| { | |
| "ce_loss": 0.18153419494628906, | |
| "epoch": 7.23, | |
| "inp_emb_norm": 0.4061328125, | |
| "loss": 0.18153419494628906, | |
| "masked_top1": 94.62552368164063, | |
| "masked_top5": 99.55692886352539, | |
| "step": 24050, | |
| "top1": 97.11937118530274, | |
| "top5": 99.64983123779297 | |
| }, | |
| { | |
| "epoch": 7.25, | |
| "grad_norm": 0.8302822944148331, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1832, | |
| "step": 24100 | |
| }, | |
| { | |
| "ce_loss": 0.18768798112869262, | |
| "epoch": 7.25, | |
| "inp_emb_norm": 0.41546875, | |
| "loss": 0.18768798112869262, | |
| "masked_top1": 93.77630233764648, | |
| "masked_top5": 99.51055358886718, | |
| "step": 24100, | |
| "top1": 97.08079849243164, | |
| "top5": 99.66075820922852 | |
| }, | |
| { | |
| "epoch": 7.26, | |
| "grad_norm": 0.8034956740784042, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1895, | |
| "step": 24150 | |
| }, | |
| { | |
| "ce_loss": 0.19201560974121093, | |
| "epoch": 7.26, | |
| "inp_emb_norm": 0.40984375, | |
| "loss": 0.19201560974121093, | |
| "masked_top1": 93.87309066772461, | |
| "masked_top5": 99.5169157409668, | |
| "step": 24150, | |
| "top1": 97.02114761352539, | |
| "top5": 99.6527261352539 | |
| }, | |
| { | |
| "epoch": 7.28, | |
| "grad_norm": 0.8314048125761649, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1873, | |
| "step": 24200 | |
| }, | |
| { | |
| "ce_loss": 0.1878107675909996, | |
| "epoch": 7.28, | |
| "inp_emb_norm": 0.407265625, | |
| "loss": 0.1878107675909996, | |
| "masked_top1": 94.19107177734375, | |
| "masked_top5": 99.55671203613281, | |
| "step": 24200, | |
| "top1": 97.03997573852538, | |
| "top5": 99.68890594482421 | |
| }, | |
| { | |
| "epoch": 7.29, | |
| "grad_norm": 0.704357096474096, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1893, | |
| "step": 24250 | |
| }, | |
| { | |
| "ce_loss": 0.19042812794446945, | |
| "epoch": 7.29, | |
| "inp_emb_norm": 0.416328125, | |
| "loss": 0.19042812794446945, | |
| "masked_top1": 93.61178329467774, | |
| "masked_top5": 99.54245040893555, | |
| "step": 24250, | |
| "top1": 96.90492645263672, | |
| "top5": 99.6619076538086 | |
| }, | |
| { | |
| "epoch": 7.31, | |
| "grad_norm": 0.856253880169746, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1893, | |
| "step": 24300 | |
| }, | |
| { | |
| "ce_loss": 0.1884206250309944, | |
| "epoch": 7.31, | |
| "inp_emb_norm": 0.4078515625, | |
| "loss": 0.1884206250309944, | |
| "masked_top1": 93.98220611572266, | |
| "masked_top5": 99.54290969848633, | |
| "step": 24300, | |
| "top1": 96.96899108886718, | |
| "top5": 99.6441293334961 | |
| }, | |
| { | |
| "epoch": 7.32, | |
| "grad_norm": 0.8464641167931024, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1918, | |
| "step": 24350 | |
| }, | |
| { | |
| "ce_loss": 0.19378722280263902, | |
| "epoch": 7.32, | |
| "inp_emb_norm": 0.4122265625, | |
| "loss": 0.19378722280263902, | |
| "masked_top1": 93.98709579467773, | |
| "masked_top5": 99.48938507080078, | |
| "step": 24350, | |
| "top1": 96.92246383666992, | |
| "top5": 99.6635366821289 | |
| }, | |
| { | |
| "epoch": 7.34, | |
| "grad_norm": 0.9274851248218718, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1979, | |
| "step": 24400 | |
| }, | |
| { | |
| "ce_loss": 0.20088252812623977, | |
| "epoch": 7.34, | |
| "inp_emb_norm": 0.405546875, | |
| "loss": 0.20088252812623977, | |
| "masked_top1": 92.93017440795899, | |
| "masked_top5": 99.49172775268555, | |
| "step": 24400, | |
| "top1": 96.83464401245118, | |
| "top5": 99.64353897094726 | |
| }, | |
| { | |
| "epoch": 7.35, | |
| "grad_norm": 0.7721388576512122, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1934, | |
| "step": 24450 | |
| }, | |
| { | |
| "ce_loss": 0.2001592782139778, | |
| "epoch": 7.35, | |
| "inp_emb_norm": 0.413828125, | |
| "loss": 0.2001592782139778, | |
| "masked_top1": 93.32104949951172, | |
| "masked_top5": 99.480458984375, | |
| "step": 24450, | |
| "top1": 96.84799270629883, | |
| "top5": 99.66846115112304 | |
| }, | |
| { | |
| "epoch": 7.37, | |
| "grad_norm": 0.8460246810468973, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1937, | |
| "step": 24500 | |
| }, | |
| { | |
| "ce_loss": 0.19288330137729645, | |
| "epoch": 7.37, | |
| "inp_emb_norm": 0.40984375, | |
| "loss": 0.19288330137729645, | |
| "masked_top1": 93.38121078491211, | |
| "masked_top5": 99.56306228637695, | |
| "step": 24500, | |
| "top1": 96.92339248657227, | |
| "top5": 99.67182235717773 | |
| }, | |
| { | |
| "epoch": 7.38, | |
| "grad_norm": 0.8742179314746799, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1982, | |
| "step": 24550 | |
| }, | |
| { | |
| "ce_loss": 0.19720925360918046, | |
| "epoch": 7.38, | |
| "inp_emb_norm": 0.412890625, | |
| "loss": 0.19720925360918046, | |
| "masked_top1": 93.280068359375, | |
| "masked_top5": 99.51691192626953, | |
| "step": 24550, | |
| "top1": 96.83218688964844, | |
| "top5": 99.66712615966797 | |
| }, | |
| { | |
| "epoch": 7.4, | |
| "grad_norm": 0.8553325710712862, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2011, | |
| "step": 24600 | |
| }, | |
| { | |
| "ce_loss": 0.19960668057203293, | |
| "epoch": 7.4, | |
| "inp_emb_norm": 0.409765625, | |
| "loss": 0.19960668057203293, | |
| "masked_top1": 93.45094650268555, | |
| "masked_top5": 99.46210220336914, | |
| "step": 24600, | |
| "top1": 96.85580673217774, | |
| "top5": 99.665537109375 | |
| }, | |
| { | |
| "epoch": 7.41, | |
| "grad_norm": 0.8454568497353904, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2022, | |
| "step": 24650 | |
| }, | |
| { | |
| "ce_loss": 0.20072863966226578, | |
| "epoch": 7.41, | |
| "inp_emb_norm": 0.4177734375, | |
| "loss": 0.20072863966226578, | |
| "masked_top1": 92.75718322753906, | |
| "masked_top5": 99.35001556396485, | |
| "step": 24650, | |
| "top1": 96.78503204345704, | |
| "top5": 99.66353454589844 | |
| }, | |
| { | |
| "epoch": 7.43, | |
| "grad_norm": 0.8262303326226429, | |
| "learning_rate": 0.0001, | |
| "loss": 0.205, | |
| "step": 24700 | |
| }, | |
| { | |
| "ce_loss": 0.20386093407869338, | |
| "epoch": 7.43, | |
| "inp_emb_norm": 0.4098046875, | |
| "loss": 0.20386093407869338, | |
| "masked_top1": 93.37680938720703, | |
| "masked_top5": 99.56277999877929, | |
| "step": 24700, | |
| "top1": 96.77312591552734, | |
| "top5": 99.6433283996582 | |
| }, | |
| { | |
| "epoch": 7.44, | |
| "grad_norm": 0.8542900336102568, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2073, | |
| "step": 24750 | |
| }, | |
| { | |
| "ce_loss": 0.20805983483791352, | |
| "epoch": 7.44, | |
| "inp_emb_norm": 0.4114453125, | |
| "loss": 0.20805983483791352, | |
| "masked_top1": 92.38563873291015, | |
| "masked_top5": 99.27365493774414, | |
| "step": 24750, | |
| "top1": 96.68102691650391, | |
| "top5": 99.62649429321289 | |
| }, | |
| { | |
| "epoch": 7.46, | |
| "grad_norm": 0.775495795649607, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2084, | |
| "step": 24800 | |
| }, | |
| { | |
| "ce_loss": 0.20858111292123793, | |
| "epoch": 7.46, | |
| "inp_emb_norm": 0.40828125, | |
| "loss": 0.20858111292123793, | |
| "masked_top1": 93.18427093505859, | |
| "masked_top5": 99.40792358398437, | |
| "step": 24800, | |
| "top1": 96.69915771484375, | |
| "top5": 99.64056365966798 | |
| }, | |
| { | |
| "epoch": 7.47, | |
| "grad_norm": 0.795773364310639, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2128, | |
| "step": 24850 | |
| }, | |
| { | |
| "ce_loss": 0.21129278868436813, | |
| "epoch": 7.47, | |
| "inp_emb_norm": 0.4151953125, | |
| "loss": 0.21129278868436813, | |
| "masked_top1": 92.36515197753906, | |
| "masked_top5": 99.46405319213868, | |
| "step": 24850, | |
| "top1": 96.61996765136719, | |
| "top5": 99.64519241333008 | |
| }, | |
| { | |
| "epoch": 7.49, | |
| "grad_norm": 0.8606373006872489, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2107, | |
| "step": 24900 | |
| }, | |
| { | |
| "ce_loss": 0.2146734645962715, | |
| "epoch": 7.49, | |
| "inp_emb_norm": 0.416328125, | |
| "loss": 0.2146734645962715, | |
| "masked_top1": 92.35210235595703, | |
| "masked_top5": 99.37399642944337, | |
| "step": 24900, | |
| "top1": 96.58806320190429, | |
| "top5": 99.66881698608398 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 0.8148609241548314, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2145, | |
| "step": 24950 | |
| }, | |
| { | |
| "ce_loss": 0.21042368352413177, | |
| "epoch": 7.5, | |
| "inp_emb_norm": 0.41640625, | |
| "loss": 0.21042368352413177, | |
| "masked_top1": 92.42163009643555, | |
| "masked_top5": 99.2802491760254, | |
| "step": 24950, | |
| "top1": 96.64725265502929, | |
| "top5": 99.64126876831055 | |
| }, | |
| { | |
| "epoch": 7.52, | |
| "grad_norm": 0.8552723662919675, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2152, | |
| "step": 25000 | |
| }, | |
| { | |
| "ce_loss": 0.21622439593076706, | |
| "epoch": 7.52, | |
| "inp_emb_norm": 0.411796875, | |
| "loss": 0.21622439593076706, | |
| "masked_top1": 92.36061889648437, | |
| "masked_top5": 99.40323318481445, | |
| "step": 25000, | |
| "top1": 96.49748916625977, | |
| "top5": 99.6481248474121 | |
| }, | |
| { | |
| "epoch": 7.53, | |
| "grad_norm": 0.8756422021562937, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2175, | |
| "step": 25050 | |
| }, | |
| { | |
| "ce_loss": 0.21715206265449524, | |
| "epoch": 7.53, | |
| "inp_emb_norm": 0.4205859375, | |
| "loss": 0.21715206265449524, | |
| "masked_top1": 92.16612228393555, | |
| "masked_top5": 99.21943115234374, | |
| "step": 25050, | |
| "top1": 96.50445205688476, | |
| "top5": 99.63119323730469 | |
| }, | |
| { | |
| "epoch": 7.55, | |
| "grad_norm": 0.9236893593627393, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2156, | |
| "step": 25100 | |
| }, | |
| { | |
| "ce_loss": 0.21899319916963578, | |
| "epoch": 7.55, | |
| "inp_emb_norm": 0.413203125, | |
| "loss": 0.21899319916963578, | |
| "masked_top1": 91.96203842163087, | |
| "masked_top5": 99.34012496948242, | |
| "step": 25100, | |
| "top1": 96.50180130004883, | |
| "top5": 99.64115966796875 | |
| }, | |
| { | |
| "epoch": 7.56, | |
| "grad_norm": 0.931694461742947, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2197, | |
| "step": 25150 | |
| }, | |
| { | |
| "ce_loss": 0.21712171256542206, | |
| "epoch": 7.56, | |
| "inp_emb_norm": 0.4085546875, | |
| "loss": 0.21712171256542206, | |
| "masked_top1": 92.56915695190429, | |
| "masked_top5": 99.49829315185546, | |
| "step": 25150, | |
| "top1": 96.46173583984375, | |
| "top5": 99.64132019042968 | |
| }, | |
| { | |
| "epoch": 7.58, | |
| "grad_norm": 0.92715510790691, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2259, | |
| "step": 25200 | |
| }, | |
| { | |
| "ce_loss": 0.2213623434305191, | |
| "epoch": 7.58, | |
| "inp_emb_norm": 0.4107421875, | |
| "loss": 0.2213623434305191, | |
| "masked_top1": 91.66716598510742, | |
| "masked_top5": 99.32309783935547, | |
| "step": 25200, | |
| "top1": 96.42446884155274, | |
| "top5": 99.6251106262207 | |
| }, | |
| { | |
| "epoch": 7.59, | |
| "grad_norm": 0.9935343271656442, | |
| "learning_rate": 0.0001, | |
| "loss": 0.225, | |
| "step": 25250 | |
| }, | |
| { | |
| "ce_loss": 0.22398129254579544, | |
| "epoch": 7.59, | |
| "inp_emb_norm": 0.409140625, | |
| "loss": 0.22398129254579544, | |
| "masked_top1": 92.20207107543945, | |
| "masked_top5": 99.32244827270507, | |
| "step": 25250, | |
| "top1": 96.40167465209962, | |
| "top5": 99.6429054260254 | |
| }, | |
| { | |
| "epoch": 7.61, | |
| "grad_norm": 0.9151015122387817, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2227, | |
| "step": 25300 | |
| }, | |
| { | |
| "ce_loss": 0.22134240210056305, | |
| "epoch": 7.61, | |
| "inp_emb_norm": 0.413984375, | |
| "loss": 0.22134240210056305, | |
| "masked_top1": 92.04093017578126, | |
| "masked_top5": 99.4468423461914, | |
| "step": 25300, | |
| "top1": 96.43467788696289, | |
| "top5": 99.66016799926757 | |
| }, | |
| { | |
| "epoch": 7.62, | |
| "grad_norm": 0.8495278966433639, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2212, | |
| "step": 25350 | |
| }, | |
| { | |
| "ce_loss": 0.22596073687076568, | |
| "epoch": 7.62, | |
| "inp_emb_norm": 0.4122265625, | |
| "loss": 0.22596073687076568, | |
| "masked_top1": 91.3648748779297, | |
| "masked_top5": 99.2870686340332, | |
| "step": 25350, | |
| "top1": 96.3931086730957, | |
| "top5": 99.61766250610351 | |
| }, | |
| { | |
| "epoch": 7.64, | |
| "grad_norm": 0.8771189448377541, | |
| "learning_rate": 0.0001, | |
| "loss": 0.224, | |
| "step": 25400 | |
| }, | |
| { | |
| "ce_loss": 0.22193652182817458, | |
| "epoch": 7.64, | |
| "inp_emb_norm": 0.4132421875, | |
| "loss": 0.22193652182817458, | |
| "masked_top1": 91.40539047241211, | |
| "masked_top5": 99.10585540771484, | |
| "step": 25400, | |
| "top1": 96.49262084960938, | |
| "top5": 99.6045753479004 | |
| }, | |
| { | |
| "epoch": 7.65, | |
| "grad_norm": 0.8719944439794763, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2299, | |
| "step": 25450 | |
| }, | |
| { | |
| "ce_loss": 0.23045677453279495, | |
| "epoch": 7.65, | |
| "inp_emb_norm": 0.4094921875, | |
| "loss": 0.23045677453279495, | |
| "masked_top1": 91.44049133300781, | |
| "masked_top5": 99.38550582885742, | |
| "step": 25450, | |
| "top1": 96.28902786254883, | |
| "top5": 99.62447952270507 | |
| }, | |
| { | |
| "epoch": 7.67, | |
| "grad_norm": 0.948342031847273, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2278, | |
| "step": 25500 | |
| }, | |
| { | |
| "ce_loss": 0.2266176301240921, | |
| "epoch": 7.67, | |
| "inp_emb_norm": 0.4139453125, | |
| "loss": 0.2266176301240921, | |
| "masked_top1": 91.54910736083984, | |
| "masked_top5": 99.31753051757812, | |
| "step": 25500, | |
| "top1": 96.38055999755859, | |
| "top5": 99.61369903564453 | |
| }, | |
| { | |
| "epoch": 7.68, | |
| "grad_norm": 0.9160429959948714, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2301, | |
| "step": 25550 | |
| }, | |
| { | |
| "ce_loss": 0.22997273564338683, | |
| "epoch": 7.68, | |
| "inp_emb_norm": 0.4121484375, | |
| "loss": 0.22997273564338683, | |
| "masked_top1": 91.72368927001953, | |
| "masked_top5": 99.37556945800782, | |
| "step": 25550, | |
| "top1": 96.34187088012695, | |
| "top5": 99.62580551147461 | |
| }, | |
| { | |
| "epoch": 7.7, | |
| "grad_norm": 0.9064800185064251, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2345, | |
| "step": 25600 | |
| }, | |
| { | |
| "ce_loss": 0.2321370568871498, | |
| "epoch": 7.7, | |
| "inp_emb_norm": 0.41484375, | |
| "loss": 0.2321370568871498, | |
| "masked_top1": 91.17147521972656, | |
| "masked_top5": 99.15941177368164, | |
| "step": 25600, | |
| "top1": 96.28547775268555, | |
| "top5": 99.6115461730957 | |
| }, | |
| { | |
| "epoch": 7.71, | |
| "grad_norm": 0.8896861108646344, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2316, | |
| "step": 25650 | |
| }, | |
| { | |
| "ce_loss": 0.22988739639520644, | |
| "epoch": 7.71, | |
| "inp_emb_norm": 0.41703125, | |
| "loss": 0.22988739639520644, | |
| "masked_top1": 90.95779861450195, | |
| "masked_top5": 99.13907150268555, | |
| "step": 25650, | |
| "top1": 96.32924926757812, | |
| "top5": 99.61460418701172 | |
| }, | |
| { | |
| "epoch": 7.73, | |
| "grad_norm": 0.8364454804655236, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2308, | |
| "step": 25700 | |
| }, | |
| { | |
| "ce_loss": 0.23426982671022414, | |
| "epoch": 7.73, | |
| "inp_emb_norm": 0.4129296875, | |
| "loss": 0.23426982671022414, | |
| "masked_top1": 90.87540649414062, | |
| "masked_top5": 99.15606750488281, | |
| "step": 25700, | |
| "top1": 96.24389221191406, | |
| "top5": 99.61002014160157 | |
| }, | |
| { | |
| "epoch": 7.74, | |
| "grad_norm": 0.8892969722239723, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2326, | |
| "step": 25750 | |
| }, | |
| { | |
| "ce_loss": 0.23778538197278976, | |
| "epoch": 7.74, | |
| "inp_emb_norm": 0.4173828125, | |
| "loss": 0.23778538197278976, | |
| "masked_top1": 91.04456115722657, | |
| "masked_top5": 99.08432937622071, | |
| "step": 25750, | |
| "top1": 96.19372604370118, | |
| "top5": 99.59805389404296 | |
| }, | |
| { | |
| "epoch": 7.76, | |
| "grad_norm": 0.9333358166124965, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2324, | |
| "step": 25800 | |
| }, | |
| { | |
| "ce_loss": 0.2342055532336235, | |
| "epoch": 7.76, | |
| "inp_emb_norm": 0.413203125, | |
| "loss": 0.2342055532336235, | |
| "masked_top1": 90.89793167114257, | |
| "masked_top5": 99.32825225830078, | |
| "step": 25800, | |
| "top1": 96.24822631835937, | |
| "top5": 99.63633575439454 | |
| }, | |
| { | |
| "epoch": 7.77, | |
| "grad_norm": 0.8972293165472223, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2378, | |
| "step": 25850 | |
| }, | |
| { | |
| "ce_loss": 0.2358376482129097, | |
| "epoch": 7.77, | |
| "inp_emb_norm": 0.411875, | |
| "loss": 0.2358376482129097, | |
| "masked_top1": 90.5443440246582, | |
| "masked_top5": 99.08838134765625, | |
| "step": 25850, | |
| "top1": 96.21415130615235, | |
| "top5": 99.59769958496094 | |
| }, | |
| { | |
| "epoch": 7.79, | |
| "grad_norm": 0.8953278969414413, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2394, | |
| "step": 25900 | |
| }, | |
| { | |
| "ce_loss": 0.241860691010952, | |
| "epoch": 7.79, | |
| "inp_emb_norm": 0.40828125, | |
| "loss": 0.241860691010952, | |
| "masked_top1": 90.6273094177246, | |
| "masked_top5": 99.1496745300293, | |
| "step": 25900, | |
| "top1": 96.1283937072754, | |
| "top5": 99.59819442749023 | |
| }, | |
| { | |
| "epoch": 7.8, | |
| "grad_norm": 0.8520671639423023, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2401, | |
| "step": 25950 | |
| }, | |
| { | |
| "ce_loss": 0.2415415197610855, | |
| "epoch": 7.8, | |
| "inp_emb_norm": 0.4100390625, | |
| "loss": 0.2415415197610855, | |
| "masked_top1": 90.58090194702149, | |
| "masked_top5": 99.10164520263672, | |
| "step": 25950, | |
| "top1": 96.16258728027344, | |
| "top5": 99.59717208862304 | |
| }, | |
| { | |
| "epoch": 7.82, | |
| "grad_norm": 0.8645330931635717, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2373, | |
| "step": 26000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 26600, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 8, | |
| "save_steps": 2000, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
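
A minimal sketch (not part of the original file) of how a trainer_state.json-style log like the one above could be summarized. It assumes the file is saved locally as trainer_state.json; the field names (log_history, step, loss, grad_norm, top1, masked_top1, max_steps) are taken from the records above, while the file path and everything else is hypothetical:

import json

# Assumed path; adjust to wherever this state file is stored.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history interleaves two kinds of records, both keyed by "step":
# optimizer logs (loss, grad_norm, learning_rate) and metric logs
# (ce_loss, top1/top5, masked_top1/masked_top5, inp_emb_norm).
train_logs  = [r for r in state["log_history"] if "grad_norm" in r]
metric_logs = [r for r in state["log_history"] if "masked_top1" in r]

print(f"logged up to step {train_logs[-1]['step']} of {state['max_steps']}")
print(f"last training loss: {train_logs[-1]['loss']:.4f}")
print(f"last top1 / masked_top1: "
      f"{metric_logs[-1]['top1']:.2f} / {metric_logs[-1]['masked_top1']:.2f}")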