{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 29980, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "loss": 22.082683563232422, "step": 0 }, { "ce_loss": 4.019954204559326, "epoch": 0, "step": 0 }, { "distill_loss": 1.1744863986968994, "epoch": 0, "step": 0 }, { "epoch": 0, "ref_ce_loss": 3.622408151626587, "step": 0 }, { "epoch": 0, "loss": 21.855388641357422, "step": 0 }, { "ce_loss": 3.952629566192627, "epoch": 0, "step": 0 }, { "distill_loss": 1.1908594369888306, "epoch": 0, "step": 0 }, { "epoch": 0, "ref_ce_loss": 3.536539077758789, "step": 0 }, { "epoch": 0.00333555703802535, "loss": 18.8512, "step": 10 }, { "epoch": 0.00333555703802535, "grad_norm": 346.46929931640625, "step": 10 }, { "epoch": 0.00333555703802535, "learning_rate": 8.88888888888889e-06, "step": 10 }, { "epoch": 0.00333555703802535, "loss": 11.710114479064941, "step": 10 }, { "ce_loss": 3.7606523036956787, "epoch": 0.00333555703802535, "step": 10 }, { "distill_loss": 1.2074427604675293, "epoch": 0.00333555703802535, "step": 10 }, { "epoch": 0.00333555703802535, "ref_ce_loss": 3.6196365356445312, "step": 10 }, { "epoch": 0.00333555703802535, "loss": 10.445865631103516, "step": 10 }, { "ce_loss": 3.6248669624328613, "epoch": 0.00333555703802535, "step": 10 }, { "distill_loss": 1.240032434463501, "epoch": 0.00333555703802535, "step": 10 }, { "epoch": 0.00333555703802535, "ref_ce_loss": 3.5363941192626953, "step": 10 }, { "epoch": 0.0066711140760507, "loss": 8.6757, "step": 20 }, { "epoch": 0.0066711140760507, "grad_norm": 35.1971435546875, "step": 20 }, { "epoch": 0.0066711140760507, "learning_rate": 1.777777777777778e-05, "step": 20 }, { "epoch": 0.0066711140760507, "loss": 5.543203830718994, "step": 20 }, { "ce_loss": 1.736590027809143, "epoch": 0.0066711140760507, "step": 20 }, { "distill_loss": 1.0234243869781494, "epoch": 0.0066711140760507, "step": 20 }, { "epoch": 0.0066711140760507, "ref_ce_loss": 2.7691500186920166, "step": 20 }, { "epoch": 0.0066711140760507, "loss": 5.789950847625732, "step": 20 }, { "ce_loss": 1.7167848348617554, "epoch": 0.0066711140760507, "step": 20 }, { "distill_loss": 1.1363052129745483, "epoch": 0.0066711140760507, "step": 20 }, { "epoch": 0.0066711140760507, "ref_ce_loss": 2.6627798080444336, "step": 20 }, { "epoch": 0.01000667111407605, "loss": 4.629, "step": 30 }, { "epoch": 0.01000667111407605, "grad_norm": 20.973167419433594, "step": 30 }, { "epoch": 0.01000667111407605, "learning_rate": 2.6666666666666667e-05, "step": 30 }, { "epoch": 0.01000667111407605, "loss": 3.992175579071045, "step": 30 }, { "ce_loss": 0.5917344689369202, "epoch": 0.01000667111407605, "step": 30 }, { "distill_loss": 0.7147521376609802, "epoch": 0.01000667111407605, "step": 30 }, { "epoch": 0.01000667111407605, "ref_ce_loss": 1.385014295578003, "step": 30 }, { "epoch": 0.01000667111407605, "loss": 3.1635243892669678, "step": 30 }, { "ce_loss": 0.6208186745643616, "epoch": 0.01000667111407605, "step": 30 }, { "distill_loss": 0.8098734021186829, "epoch": 0.01000667111407605, "step": 30 }, { "epoch": 0.01000667111407605, "ref_ce_loss": 1.298578143119812, "step": 30 }, { "epoch": 0.0133422281521014, "loss": 2.6165, "step": 40 }, { "epoch": 0.0133422281521014, "grad_norm": 3.0407230854034424, "step": 40 }, { "epoch": 0.0133422281521014, "learning_rate": 3.555555555555556e-05, "step": 40 }, { "epoch": 0.0133422281521014, "loss": 2.3902668952941895, "step": 40 }, { "ce_loss": 0.6940972208976746, "epoch": 0.0133422281521014, "step": 40 }, { "distill_loss": 0.5506912469863892, "epoch": 0.0133422281521014, "step": 40 }, { "epoch": 0.0133422281521014, "ref_ce_loss": 0.6016234159469604, "step": 40 }, { "epoch": 0.0133422281521014, "loss": 1.8760014772415161, "step": 40 }, { "ce_loss": 0.6062495112419128, "epoch": 0.0133422281521014, "step": 40 }, { "distill_loss": 0.5111391544342041, "epoch": 0.0133422281521014, "step": 40 }, { "epoch": 0.0133422281521014, "ref_ce_loss": 0.5445473194122314, "step": 40 }, { "epoch": 0.01667778519012675, "loss": 2.0494, "step": 50 }, { "epoch": 0.01667778519012675, "grad_norm": 2.9749488830566406, "step": 50 }, { "epoch": 0.01667778519012675, "learning_rate": 4.4444444444444447e-05, "step": 50 }, { "epoch": 0.01667778519012675, "loss": 1.7558726072311401, "step": 50 }, { "ce_loss": 0.5625492334365845, "epoch": 0.01667778519012675, "step": 50 }, { "distill_loss": 0.5216071605682373, "epoch": 0.01667778519012675, "step": 50 }, { "epoch": 0.01667778519012675, "ref_ce_loss": 0.4841041564941406, "step": 50 }, { "epoch": 0.01667778519012675, "loss": 2.287813663482666, "step": 50 }, { "ce_loss": 0.5892652869224548, "epoch": 0.01667778519012675, "step": 50 }, { "distill_loss": 0.6219286322593689, "epoch": 0.01667778519012675, "step": 50 }, { "epoch": 0.01667778519012675, "ref_ce_loss": 0.3849905729293823, "step": 50 }, { "epoch": 0.0200133422281521, "loss": 1.9326, "step": 60 }, { "epoch": 0.0200133422281521, "grad_norm": 2.8535168170928955, "step": 60 }, { "epoch": 0.0200133422281521, "learning_rate": 5.333333333333333e-05, "step": 60 }, { "epoch": 0.0200133422281521, "loss": 1.7353334426879883, "step": 60 }, { "ce_loss": 0.6446412801742554, "epoch": 0.0200133422281521, "step": 60 }, { "distill_loss": 0.4365006685256958, "epoch": 0.0200133422281521, "step": 60 }, { "epoch": 0.0200133422281521, "ref_ce_loss": 0.39507171511650085, "step": 60 }, { "epoch": 0.0200133422281521, "loss": 1.2849175930023193, "step": 60 }, { "ce_loss": 0.5129432082176208, "epoch": 0.0200133422281521, "step": 60 }, { "distill_loss": 0.4272335469722748, "epoch": 0.0200133422281521, "step": 60 }, { "epoch": 0.0200133422281521, "ref_ce_loss": 0.3446718752384186, "step": 60 }, { "epoch": 0.02334889926617745, "loss": 1.7874, "step": 70 }, { "epoch": 0.02334889926617745, "grad_norm": 3.4548161029815674, "step": 70 }, { "epoch": 0.02334889926617745, "learning_rate": 6.222222222222222e-05, "step": 70 }, { "epoch": 0.02334889926617745, "loss": 1.2661957740783691, "step": 70 }, { "ce_loss": 0.5155786275863647, "epoch": 0.02334889926617745, "step": 70 }, { "distill_loss": 0.44635701179504395, "epoch": 0.02334889926617745, "step": 70 }, { "epoch": 0.02334889926617745, "ref_ce_loss": 0.3041374385356903, "step": 70 }, { "epoch": 0.02334889926617745, "loss": 1.6170333623886108, "step": 70 }, { "ce_loss": 0.5844157338142395, "epoch": 0.02334889926617745, "step": 70 }, { "distill_loss": 0.48108649253845215, "epoch": 0.02334889926617745, "step": 70 }, { "epoch": 0.02334889926617745, "ref_ce_loss": 0.35459014773368835, "step": 70 }, { "epoch": 0.0266844563042028, "loss": 1.7319, "step": 80 }, { "epoch": 0.0266844563042028, "grad_norm": 3.476360321044922, "step": 80 }, { "epoch": 0.0266844563042028, "learning_rate": 7.111111111111112e-05, "step": 80 }, { "epoch": 0.0266844563042028, "loss": 1.6175156831741333, "step": 80 }, { "ce_loss": 0.6071637272834778, "epoch": 0.0266844563042028, "step": 80 }, { "distill_loss": 0.46768084168434143, "epoch": 0.0266844563042028, "step": 80 }, { "epoch": 0.0266844563042028, "ref_ce_loss": 0.36896222829818726, "step": 80 }, { "epoch": 0.0266844563042028, "loss": 1.8820009231567383, "step": 80 }, { "ce_loss": 0.6206005215644836, "epoch": 0.0266844563042028, "step": 80 }, { "distill_loss": 0.48498210310935974, "epoch": 0.0266844563042028, "step": 80 }, { "epoch": 0.0266844563042028, "ref_ce_loss": 0.35933607816696167, "step": 80 }, { "epoch": 0.030020013342228154, "loss": 1.8923, "step": 90 }, { "epoch": 0.030020013342228154, "grad_norm": 3.8720743656158447, "step": 90 }, { "epoch": 0.030020013342228154, "learning_rate": 8e-05, "step": 90 }, { "epoch": 0.030020013342228154, "loss": 1.8874759674072266, "step": 90 }, { "ce_loss": 0.5756715536117554, "epoch": 0.030020013342228154, "step": 90 }, { "distill_loss": 0.506288468837738, "epoch": 0.030020013342228154, "step": 90 }, { "epoch": 0.030020013342228154, "ref_ce_loss": 0.3379663825035095, "step": 90 }, { "epoch": 0.030020013342228154, "loss": 1.5637786388397217, "step": 90 }, { "ce_loss": 0.4854426383972168, "epoch": 0.030020013342228154, "step": 90 }, { "distill_loss": 0.5292065739631653, "epoch": 0.030020013342228154, "step": 90 }, { "epoch": 0.030020013342228154, "ref_ce_loss": 0.31842878460884094, "step": 90 }, { "epoch": 0.0333555703802535, "loss": 1.863, "step": 100 }, { "epoch": 0.0333555703802535, "grad_norm": 4.6447319984436035, "step": 100 }, { "epoch": 0.0333555703802535, "learning_rate": 8.888888888888889e-05, "step": 100 }, { "epoch": 0.0333555703802535, "loss": 1.3307164907455444, "step": 100 }, { "ce_loss": 0.5048717856407166, "epoch": 0.0333555703802535, "step": 100 }, { "distill_loss": 0.4587930738925934, "epoch": 0.0333555703802535, "step": 100 }, { "epoch": 0.0333555703802535, "ref_ce_loss": 0.3670507073402405, "step": 100 }, { "epoch": 0.0333555703802535, "loss": 1.9730502367019653, "step": 100 }, { "ce_loss": 0.6307158470153809, "epoch": 0.0333555703802535, "step": 100 }, { "distill_loss": 0.5659656524658203, "epoch": 0.0333555703802535, "step": 100 }, { "epoch": 0.0333555703802535, "ref_ce_loss": 0.3052903413772583, "step": 100 }, { "epoch": 0.03669112741827885, "loss": 1.7245, "step": 110 }, { "epoch": 0.03669112741827885, "grad_norm": 2.048370122909546, "step": 110 }, { "epoch": 0.03669112741827885, "learning_rate": 9.777777777777778e-05, "step": 110 }, { "epoch": 0.03669112741827885, "loss": 1.917222499847412, "step": 110 }, { "ce_loss": 0.5229926705360413, "epoch": 0.03669112741827885, "step": 110 }, { "distill_loss": 0.4720943868160248, "epoch": 0.03669112741827885, "step": 110 }, { "epoch": 0.03669112741827885, "ref_ce_loss": 0.28838008642196655, "step": 110 }, { "epoch": 0.03669112741827885, "loss": 1.9603099822998047, "step": 110 }, { "ce_loss": 0.5854602456092834, "epoch": 0.03669112741827885, "step": 110 }, { "distill_loss": 0.5150183439254761, "epoch": 0.03669112741827885, "step": 110 }, { "epoch": 0.03669112741827885, "ref_ce_loss": 0.34388861060142517, "step": 110 }, { "epoch": 0.0400266844563042, "loss": 1.8773, "step": 120 }, { "epoch": 0.0400266844563042, "grad_norm": 5.81011962890625, "step": 120 }, { "epoch": 0.0400266844563042, "learning_rate": 0.00010666666666666667, "step": 120 }, { "epoch": 0.0400266844563042, "loss": 1.600690245628357, "step": 120 }, { "ce_loss": 0.5576359629631042, "epoch": 0.0400266844563042, "step": 120 }, { "distill_loss": 0.49549993872642517, "epoch": 0.0400266844563042, "step": 120 }, { "epoch": 0.0400266844563042, "ref_ce_loss": 0.347184419631958, "step": 120 }, { "epoch": 0.0400266844563042, "loss": 2.348037004470825, "step": 120 }, { "ce_loss": 0.5738920569419861, "epoch": 0.0400266844563042, "step": 120 }, { "distill_loss": 0.4258148670196533, "epoch": 0.0400266844563042, "step": 120 }, { "epoch": 0.0400266844563042, "ref_ce_loss": 0.35311853885650635, "step": 120 }, { "epoch": 0.04336224149432955, "loss": 1.775, "step": 130 }, { "epoch": 0.04336224149432955, "grad_norm": 5.108306407928467, "step": 130 }, { "epoch": 0.04336224149432955, "learning_rate": 0.00011555555555555555, "step": 130 }, { "epoch": 0.04336224149432955, "loss": 1.927753210067749, "step": 130 }, { "ce_loss": 0.5603481531143188, "epoch": 0.04336224149432955, "step": 130 }, { "distill_loss": 0.5085574388504028, "epoch": 0.04336224149432955, "step": 130 }, { "epoch": 0.04336224149432955, "ref_ce_loss": 0.24505111575126648, "step": 130 }, { "epoch": 0.04336224149432955, "loss": 1.2074342966079712, "step": 130 }, { "ce_loss": 0.4739816188812256, "epoch": 0.04336224149432955, "step": 130 }, { "distill_loss": 0.40206992626190186, "epoch": 0.04336224149432955, "step": 130 }, { "epoch": 0.04336224149432955, "ref_ce_loss": 0.3313703238964081, "step": 130 }, { "epoch": 0.0466977985323549, "loss": 1.8075, "step": 140 }, { "epoch": 0.0466977985323549, "grad_norm": 5.600295066833496, "step": 140 }, { "epoch": 0.0466977985323549, "learning_rate": 0.00012444444444444444, "step": 140 }, { "epoch": 0.0466977985323549, "loss": 3.315217971801758, "step": 140 }, { "ce_loss": 0.5632176995277405, "epoch": 0.0466977985323549, "step": 140 }, { "distill_loss": 0.46269339323043823, "epoch": 0.0466977985323549, "step": 140 }, { "epoch": 0.0466977985323549, "ref_ce_loss": 0.3052278161048889, "step": 140 }, { "epoch": 0.0466977985323549, "loss": 2.9743146896362305, "step": 140 }, { "ce_loss": 0.6905938386917114, "epoch": 0.0466977985323549, "step": 140 }, { "distill_loss": 0.49012720584869385, "epoch": 0.0466977985323549, "step": 140 }, { "epoch": 0.0466977985323549, "ref_ce_loss": 0.32296621799468994, "step": 140 }, { "epoch": 0.05003335557038025, "loss": 1.9235, "step": 150 }, { "epoch": 0.05003335557038025, "grad_norm": 2.0878727436065674, "step": 150 }, { "epoch": 0.05003335557038025, "learning_rate": 0.00013333333333333334, "step": 150 }, { "epoch": 0.05003335557038025, "loss": 2.5374321937561035, "step": 150 }, { "ce_loss": 0.5370458364486694, "epoch": 0.05003335557038025, "step": 150 }, { "distill_loss": 0.47549453377723694, "epoch": 0.05003335557038025, "step": 150 }, { "epoch": 0.05003335557038025, "ref_ce_loss": 0.26440954208374023, "step": 150 }, { "epoch": 0.05003335557038025, "loss": 1.6627169847488403, "step": 150 }, { "ce_loss": 0.5243386626243591, "epoch": 0.05003335557038025, "step": 150 }, { "distill_loss": 0.47431784868240356, "epoch": 0.05003335557038025, "step": 150 }, { "epoch": 0.05003335557038025, "ref_ce_loss": 0.3353169560432434, "step": 150 }, { "epoch": 0.0533689126084056, "loss": 1.7777, "step": 160 }, { "epoch": 0.0533689126084056, "grad_norm": 1.928293228149414, "step": 160 }, { "epoch": 0.0533689126084056, "learning_rate": 0.00014222222222222224, "step": 160 }, { "epoch": 0.0533689126084056, "loss": 1.6847267150878906, "step": 160 }, { "ce_loss": 0.5684657692909241, "epoch": 0.0533689126084056, "step": 160 }, { "distill_loss": 0.5586349368095398, "epoch": 0.0533689126084056, "step": 160 }, { "epoch": 0.0533689126084056, "ref_ce_loss": 0.268443763256073, "step": 160 }, { "epoch": 0.0533689126084056, "loss": 1.8770315647125244, "step": 160 }, { "ce_loss": 0.470410019159317, "epoch": 0.0533689126084056, "step": 160 }, { "distill_loss": 0.47870075702667236, "epoch": 0.0533689126084056, "step": 160 }, { "epoch": 0.0533689126084056, "ref_ce_loss": 0.27711212635040283, "step": 160 }, { "epoch": 0.05670446964643095, "loss": 1.8477, "step": 170 }, { "epoch": 0.05670446964643095, "grad_norm": 3.7592098712921143, "step": 170 }, { "epoch": 0.05670446964643095, "learning_rate": 0.0001511111111111111, "step": 170 }, { "epoch": 0.05670446964643095, "loss": 1.6315332651138306, "step": 170 }, { "ce_loss": 0.5627481937408447, "epoch": 0.05670446964643095, "step": 170 }, { "distill_loss": 0.5033910274505615, "epoch": 0.05670446964643095, "step": 170 }, { "epoch": 0.05670446964643095, "ref_ce_loss": 0.3061639964580536, "step": 170 }, { "epoch": 0.05670446964643095, "loss": 1.5741348266601562, "step": 170 }, { "ce_loss": 0.4609927833080292, "epoch": 0.05670446964643095, "step": 170 }, { "distill_loss": 0.468230277299881, "epoch": 0.05670446964643095, "step": 170 }, { "epoch": 0.05670446964643095, "ref_ce_loss": 0.24796752631664276, "step": 170 }, { "epoch": 0.06004002668445631, "loss": 1.7242, "step": 180 }, { "epoch": 0.06004002668445631, "grad_norm": 1.7700247764587402, "step": 180 }, { "epoch": 0.06004002668445631, "learning_rate": 0.00016, "step": 180 }, { "epoch": 0.06004002668445631, "loss": 1.34260094165802, "step": 180 }, { "ce_loss": 0.5201642513275146, "epoch": 0.06004002668445631, "step": 180 }, { "distill_loss": 0.522593080997467, "epoch": 0.06004002668445631, "step": 180 }, { "epoch": 0.06004002668445631, "ref_ce_loss": 0.29970839619636536, "step": 180 }, { "epoch": 0.06004002668445631, "loss": 1.6730177402496338, "step": 180 }, { "ce_loss": 0.5037712454795837, "epoch": 0.06004002668445631, "step": 180 }, { "distill_loss": 0.5402393937110901, "epoch": 0.06004002668445631, "step": 180 }, { "epoch": 0.06004002668445631, "ref_ce_loss": 0.3126290440559387, "step": 180 }, { "epoch": 0.06337558372248166, "loss": 1.6673, "step": 190 }, { "epoch": 0.06337558372248166, "grad_norm": 1.8502429723739624, "step": 190 }, { "epoch": 0.06337558372248166, "learning_rate": 0.00016888888888888889, "step": 190 }, { "epoch": 0.06337558372248166, "loss": 1.8196762800216675, "step": 190 }, { "ce_loss": 0.5710945725440979, "epoch": 0.06337558372248166, "step": 190 }, { "distill_loss": 0.5098980069160461, "epoch": 0.06337558372248166, "step": 190 }, { "epoch": 0.06337558372248166, "ref_ce_loss": 0.2702150046825409, "step": 190 }, { "epoch": 0.06337558372248166, "loss": 1.9046032428741455, "step": 190 }, { "ce_loss": 0.6237345337867737, "epoch": 0.06337558372248166, "step": 190 }, { "distill_loss": 0.5800604224205017, "epoch": 0.06337558372248166, "step": 190 }, { "epoch": 0.06337558372248166, "ref_ce_loss": 0.2864050567150116, "step": 190 }, { "epoch": 0.066711140760507, "loss": 1.7973, "step": 200 }, { "epoch": 0.066711140760507, "grad_norm": 1.8052473068237305, "step": 200 }, { "epoch": 0.066711140760507, "learning_rate": 0.00017777777777777779, "step": 200 }, { "epoch": 0.066711140760507, "loss": 2.0465126037597656, "step": 200 }, { "ce_loss": 0.571826696395874, "epoch": 0.066711140760507, "step": 200 }, { "distill_loss": 0.5833513140678406, "epoch": 0.066711140760507, "step": 200 }, { "epoch": 0.066711140760507, "ref_ce_loss": 0.27391302585601807, "step": 200 }, { "epoch": 0.066711140760507, "loss": 2.0846211910247803, "step": 200 }, { "ce_loss": 0.6135091185569763, "epoch": 0.066711140760507, "step": 200 }, { "distill_loss": 0.5756477117538452, "epoch": 0.066711140760507, "step": 200 }, { "epoch": 0.066711140760507, "ref_ce_loss": 0.30294105410575867, "step": 200 }, { "epoch": 0.07004669779853236, "loss": 1.7797, "step": 210 }, { "epoch": 0.07004669779853236, "grad_norm": 3.2647974491119385, "step": 210 }, { "epoch": 0.07004669779853236, "learning_rate": 0.0001866666666666667, "step": 210 }, { "epoch": 0.07004669779853236, "loss": 1.6143054962158203, "step": 210 }, { "ce_loss": 0.48211437463760376, "epoch": 0.07004669779853236, "step": 210 }, { "distill_loss": 0.5302156209945679, "epoch": 0.07004669779853236, "step": 210 }, { "epoch": 0.07004669779853236, "ref_ce_loss": 0.24415576457977295, "step": 210 }, { "epoch": 0.07004669779853236, "loss": 1.7094444036483765, "step": 210 }, { "ce_loss": 0.5260207056999207, "epoch": 0.07004669779853236, "step": 210 }, { "distill_loss": 0.4881989359855652, "epoch": 0.07004669779853236, "step": 210 }, { "epoch": 0.07004669779853236, "ref_ce_loss": 0.3114811182022095, "step": 210 }, { "epoch": 0.0733822548365577, "loss": 1.7491, "step": 220 }, { "epoch": 0.0733822548365577, "grad_norm": 3.6222076416015625, "step": 220 }, { "epoch": 0.0733822548365577, "learning_rate": 0.00019555555555555556, "step": 220 }, { "epoch": 0.0733822548365577, "loss": 1.7036691904067993, "step": 220 }, { "ce_loss": 0.5687296390533447, "epoch": 0.0733822548365577, "step": 220 }, { "distill_loss": 0.6326914429664612, "epoch": 0.0733822548365577, "step": 220 }, { "epoch": 0.0733822548365577, "ref_ce_loss": 0.295692503452301, "step": 220 }, { "epoch": 0.0733822548365577, "loss": 1.9834920167922974, "step": 220 }, { "ce_loss": 0.5344058871269226, "epoch": 0.0733822548365577, "step": 220 }, { "distill_loss": 0.5521599054336548, "epoch": 0.0733822548365577, "step": 220 }, { "epoch": 0.0733822548365577, "ref_ce_loss": 0.29361093044281006, "step": 220 }, { "epoch": 0.07671781187458306, "loss": 1.7304, "step": 230 }, { "epoch": 0.07671781187458306, "grad_norm": 21.085020065307617, "step": 230 }, { "epoch": 0.07671781187458306, "learning_rate": 0.00020444444444444443, "step": 230 }, { "epoch": 0.07671781187458306, "loss": 1.725504755973816, "step": 230 }, { "ce_loss": 0.6624048352241516, "epoch": 0.07671781187458306, "step": 230 }, { "distill_loss": 0.6723373532295227, "epoch": 0.07671781187458306, "step": 230 }, { "epoch": 0.07671781187458306, "ref_ce_loss": 0.39068496227264404, "step": 230 }, { "epoch": 0.07671781187458306, "loss": 2.677776336669922, "step": 230 }, { "ce_loss": 0.7152729034423828, "epoch": 0.07671781187458306, "step": 230 }, { "distill_loss": 0.7655081748962402, "epoch": 0.07671781187458306, "step": 230 }, { "epoch": 0.07671781187458306, "ref_ce_loss": 0.3652455508708954, "step": 230 }, { "epoch": 0.0800533689126084, "loss": 2.1082, "step": 240 }, { "epoch": 0.0800533689126084, "grad_norm": 5.6974921226501465, "step": 240 }, { "epoch": 0.0800533689126084, "learning_rate": 0.00021333333333333333, "step": 240 }, { "epoch": 0.0800533689126084, "loss": 1.5531107187271118, "step": 240 }, { "ce_loss": 0.5192185044288635, "epoch": 0.0800533689126084, "step": 240 }, { "distill_loss": 0.5373853445053101, "epoch": 0.0800533689126084, "step": 240 }, { "epoch": 0.0800533689126084, "ref_ce_loss": 0.2714692950248718, "step": 240 }, { "epoch": 0.0800533689126084, "loss": 1.5598499774932861, "step": 240 }, { "ce_loss": 0.5087171196937561, "epoch": 0.0800533689126084, "step": 240 }, { "distill_loss": 0.5728287696838379, "epoch": 0.0800533689126084, "step": 240 }, { "epoch": 0.0800533689126084, "ref_ce_loss": 0.2059345841407776, "step": 240 }, { "epoch": 0.08338892595063375, "loss": 1.6852, "step": 250 }, { "epoch": 0.08338892595063375, "grad_norm": 1.2824687957763672, "step": 250 }, { "epoch": 0.08338892595063375, "learning_rate": 0.00022222222222222223, "step": 250 }, { "epoch": 0.08338892595063375, "loss": 2.172323226928711, "step": 250 }, { "ce_loss": 0.6075947880744934, "epoch": 0.08338892595063375, "step": 250 }, { "distill_loss": 0.5816372632980347, "epoch": 0.08338892595063375, "step": 250 }, { "epoch": 0.08338892595063375, "ref_ce_loss": 0.2618728280067444, "step": 250 }, { "epoch": 0.08338892595063375, "loss": 1.9809627532958984, "step": 250 }, { "ce_loss": 0.4714495539665222, "epoch": 0.08338892595063375, "step": 250 }, { "distill_loss": 0.48982515931129456, "epoch": 0.08338892595063375, "step": 250 }, { "epoch": 0.08338892595063375, "ref_ce_loss": 0.2820373475551605, "step": 250 }, { "epoch": 0.0867244829886591, "loss": 1.8546, "step": 260 }, { "epoch": 0.0867244829886591, "grad_norm": 2.8757669925689697, "step": 260 }, { "epoch": 0.0867244829886591, "learning_rate": 0.0002311111111111111, "step": 260 }, { "epoch": 0.0867244829886591, "loss": 2.353008270263672, "step": 260 }, { "ce_loss": 0.5781505107879639, "epoch": 0.0867244829886591, "step": 260 }, { "distill_loss": 0.5666728615760803, "epoch": 0.0867244829886591, "step": 260 }, { "epoch": 0.0867244829886591, "ref_ce_loss": 0.2690242826938629, "step": 260 }, { "epoch": 0.0867244829886591, "loss": 1.535614013671875, "step": 260 }, { "ce_loss": 0.47560837864875793, "epoch": 0.0867244829886591, "step": 260 }, { "distill_loss": 0.5879183411598206, "epoch": 0.0867244829886591, "step": 260 }, { "epoch": 0.0867244829886591, "ref_ce_loss": 0.28443270921707153, "step": 260 }, { "epoch": 0.09006004002668445, "loss": 1.7228, "step": 270 }, { "epoch": 0.09006004002668445, "grad_norm": 3.3693370819091797, "step": 270 }, { "epoch": 0.09006004002668445, "learning_rate": 0.00024, "step": 270 }, { "epoch": 0.09006004002668445, "loss": 1.65613853931427, "step": 270 }, { "ce_loss": 0.509759783744812, "epoch": 0.09006004002668445, "step": 270 }, { "distill_loss": 0.6900843381881714, "epoch": 0.09006004002668445, "step": 270 }, { "epoch": 0.09006004002668445, "ref_ce_loss": 0.21354441344738007, "step": 270 }, { "epoch": 0.09006004002668445, "loss": 1.7907038927078247, "step": 270 }, { "ce_loss": 0.47750452160835266, "epoch": 0.09006004002668445, "step": 270 }, { "distill_loss": 0.5887446403503418, "epoch": 0.09006004002668445, "step": 270 }, { "epoch": 0.09006004002668445, "ref_ce_loss": 0.25874146819114685, "step": 270 }, { "epoch": 0.0933955970647098, "loss": 1.5949, "step": 280 }, { "epoch": 0.0933955970647098, "grad_norm": 1.3736844062805176, "step": 280 }, { "epoch": 0.0933955970647098, "learning_rate": 0.0002488888888888889, "step": 280 }, { "epoch": 0.0933955970647098, "loss": 1.6113965511322021, "step": 280 }, { "ce_loss": 0.569277822971344, "epoch": 0.0933955970647098, "step": 280 }, { "distill_loss": 0.6352401971817017, "epoch": 0.0933955970647098, "step": 280 }, { "epoch": 0.0933955970647098, "ref_ce_loss": 0.23418693244457245, "step": 280 }, { "epoch": 0.0933955970647098, "loss": 1.9638503789901733, "step": 280 }, { "ce_loss": 0.5025474429130554, "epoch": 0.0933955970647098, "step": 280 }, { "distill_loss": 0.6397897601127625, "epoch": 0.0933955970647098, "step": 280 }, { "epoch": 0.0933955970647098, "ref_ce_loss": 0.24622640013694763, "step": 280 }, { "epoch": 0.09673115410273515, "loss": 1.6697, "step": 290 }, { "epoch": 0.09673115410273515, "grad_norm": 1.6431907415390015, "step": 290 }, { "epoch": 0.09673115410273515, "learning_rate": 0.00025777777777777783, "step": 290 }, { "epoch": 0.09673115410273515, "loss": 2.6338305473327637, "step": 290 }, { "ce_loss": 0.5702818036079407, "epoch": 0.09673115410273515, "step": 290 }, { "distill_loss": 0.5806958079338074, "epoch": 0.09673115410273515, "step": 290 }, { "epoch": 0.09673115410273515, "ref_ce_loss": 0.21621885895729065, "step": 290 }, { "epoch": 0.09673115410273515, "loss": 1.3259646892547607, "step": 290 }, { "ce_loss": 0.42298078536987305, "epoch": 0.09673115410273515, "step": 290 }, { "distill_loss": 0.5434897541999817, "epoch": 0.09673115410273515, "step": 290 }, { "epoch": 0.09673115410273515, "ref_ce_loss": 0.24230249226093292, "step": 290 }, { "epoch": 0.1000667111407605, "loss": 1.7942, "step": 300 }, { "epoch": 0.1000667111407605, "grad_norm": 1.4105414152145386, "step": 300 }, { "epoch": 0.1000667111407605, "learning_rate": 0.0002666666666666667, "step": 300 }, { "epoch": 0.1000667111407605, "loss": 1.7872378826141357, "step": 300 }, { "ce_loss": 0.49962472915649414, "epoch": 0.1000667111407605, "step": 300 }, { "distill_loss": 0.5508493781089783, "epoch": 0.1000667111407605, "step": 300 }, { "epoch": 0.1000667111407605, "ref_ce_loss": 0.2687307596206665, "step": 300 }, { "epoch": 0.1000667111407605, "loss": 1.2783927917480469, "step": 300 }, { "ce_loss": 0.4945143759250641, "epoch": 0.1000667111407605, "step": 300 }, { "distill_loss": 0.5173062086105347, "epoch": 0.1000667111407605, "step": 300 }, { "epoch": 0.1000667111407605, "ref_ce_loss": 0.2663983404636383, "step": 300 }, { "epoch": 0.10340226817878585, "loss": 1.802, "step": 310 }, { "epoch": 0.10340226817878585, "grad_norm": 2.590189218521118, "step": 310 }, { "epoch": 0.10340226817878585, "learning_rate": 0.0002755555555555556, "step": 310 }, { "epoch": 0.10340226817878585, "loss": 2.033294916152954, "step": 310 }, { "ce_loss": 0.48483648896217346, "epoch": 0.10340226817878585, "step": 310 }, { "distill_loss": 0.5946583151817322, "epoch": 0.10340226817878585, "step": 310 }, { "epoch": 0.10340226817878585, "ref_ce_loss": 0.24107787013053894, "step": 310 }, { "epoch": 0.10340226817878585, "loss": 1.7032603025436401, "step": 310 }, { "ce_loss": 0.4977302849292755, "epoch": 0.10340226817878585, "step": 310 }, { "distill_loss": 0.5834342241287231, "epoch": 0.10340226817878585, "step": 310 }, { "epoch": 0.10340226817878585, "ref_ce_loss": 0.2511594593524933, "step": 310 }, { "epoch": 0.1067378252168112, "loss": 1.8593, "step": 320 }, { "epoch": 0.1067378252168112, "grad_norm": 1.435152292251587, "step": 320 }, { "epoch": 0.1067378252168112, "learning_rate": 0.0002844444444444445, "step": 320 }, { "epoch": 0.1067378252168112, "loss": 1.522861123085022, "step": 320 }, { "ce_loss": 0.5201138854026794, "epoch": 0.1067378252168112, "step": 320 }, { "distill_loss": 0.5344372391700745, "epoch": 0.1067378252168112, "step": 320 }, { "epoch": 0.1067378252168112, "ref_ce_loss": 0.27378469705581665, "step": 320 }, { "epoch": 0.1067378252168112, "loss": 1.5926438570022583, "step": 320 }, { "ce_loss": 0.4458596408367157, "epoch": 0.1067378252168112, "step": 320 }, { "distill_loss": 0.48734861612319946, "epoch": 0.1067378252168112, "step": 320 }, { "epoch": 0.1067378252168112, "ref_ce_loss": 0.32077357172966003, "step": 320 }, { "epoch": 0.11007338225483655, "loss": 1.5826, "step": 330 }, { "epoch": 0.11007338225483655, "grad_norm": 1.7621814012527466, "step": 330 }, { "epoch": 0.11007338225483655, "learning_rate": 0.0002933333333333333, "step": 330 }, { "epoch": 0.11007338225483655, "loss": 1.753835916519165, "step": 330 }, { "ce_loss": 0.5066315531730652, "epoch": 0.11007338225483655, "step": 330 }, { "distill_loss": 0.4909520149230957, "epoch": 0.11007338225483655, "step": 330 }, { "epoch": 0.11007338225483655, "ref_ce_loss": 0.28438347578048706, "step": 330 }, { "epoch": 0.11007338225483655, "loss": 1.2814210653305054, "step": 330 }, { "ce_loss": 0.4937293231487274, "epoch": 0.11007338225483655, "step": 330 }, { "distill_loss": 0.593289852142334, "epoch": 0.11007338225483655, "step": 330 }, { "epoch": 0.11007338225483655, "ref_ce_loss": 0.19415561854839325, "step": 330 }, { "epoch": 0.1134089392928619, "loss": 1.7061, "step": 340 }, { "epoch": 0.1134089392928619, "grad_norm": 2.5554006099700928, "step": 340 }, { "epoch": 0.1134089392928619, "learning_rate": 0.0003022222222222222, "step": 340 }, { "epoch": 0.1134089392928619, "loss": 1.844890832901001, "step": 340 }, { "ce_loss": 0.6287904381752014, "epoch": 0.1134089392928619, "step": 340 }, { "distill_loss": 0.694754958152771, "epoch": 0.1134089392928619, "step": 340 }, { "epoch": 0.1134089392928619, "ref_ce_loss": 0.2634267807006836, "step": 340 }, { "epoch": 0.1134089392928619, "loss": 1.6410373449325562, "step": 340 }, { "ce_loss": 0.5369527339935303, "epoch": 0.1134089392928619, "step": 340 }, { "distill_loss": 0.6398927569389343, "epoch": 0.1134089392928619, "step": 340 }, { "epoch": 0.1134089392928619, "ref_ce_loss": 0.27201586961746216, "step": 340 }, { "epoch": 0.11674449633088725, "loss": 1.5979, "step": 350 }, { "epoch": 0.11674449633088725, "grad_norm": 1.9453589916229248, "step": 350 }, { "epoch": 0.11674449633088725, "learning_rate": 0.0003111111111111111, "step": 350 }, { "epoch": 0.11674449633088725, "loss": 1.646355390548706, "step": 350 }, { "ce_loss": 0.5751283168792725, "epoch": 0.11674449633088725, "step": 350 }, { "distill_loss": 0.6439217329025269, "epoch": 0.11674449633088725, "step": 350 }, { "epoch": 0.11674449633088725, "ref_ce_loss": 0.24644474685192108, "step": 350 }, { "epoch": 0.11674449633088725, "loss": 1.6202154159545898, "step": 350 }, { "ce_loss": 0.5422464609146118, "epoch": 0.11674449633088725, "step": 350 }, { "distill_loss": 0.6799194812774658, "epoch": 0.11674449633088725, "step": 350 }, { "epoch": 0.11674449633088725, "ref_ce_loss": 0.24791789054870605, "step": 350 }, { "epoch": 0.12008005336891261, "loss": 1.6728, "step": 360 }, { "epoch": 0.12008005336891261, "grad_norm": 1.7183290719985962, "step": 360 }, { "epoch": 0.12008005336891261, "learning_rate": 0.00032, "step": 360 }, { "epoch": 0.12008005336891261, "loss": 1.585992693901062, "step": 360 }, { "ce_loss": 0.5894421935081482, "epoch": 0.12008005336891261, "step": 360 }, { "distill_loss": 0.5706028342247009, "epoch": 0.12008005336891261, "step": 360 }, { "epoch": 0.12008005336891261, "ref_ce_loss": 0.27797845005989075, "step": 360 }, { "epoch": 0.12008005336891261, "loss": 1.7014429569244385, "step": 360 }, { "ce_loss": 0.4603886604309082, "epoch": 0.12008005336891261, "step": 360 }, { "distill_loss": 0.6570786833763123, "epoch": 0.12008005336891261, "step": 360 }, { "epoch": 0.12008005336891261, "ref_ce_loss": 0.22744178771972656, "step": 360 }, { "epoch": 0.12341561040693796, "loss": 1.5367, "step": 370 }, { "epoch": 0.12341561040693796, "grad_norm": 1.3087623119354248, "step": 370 }, { "epoch": 0.12341561040693796, "learning_rate": 0.00032888888888888887, "step": 370 }, { "epoch": 0.12341561040693796, "loss": 2.3369619846343994, "step": 370 }, { "ce_loss": 0.6497780084609985, "epoch": 0.12341561040693796, "step": 370 }, { "distill_loss": 0.646597146987915, "epoch": 0.12341561040693796, "step": 370 }, { "epoch": 0.12341561040693796, "ref_ce_loss": 0.3300057351589203, "step": 370 }, { "epoch": 0.12341561040693796, "loss": 1.4946203231811523, "step": 370 }, { "ce_loss": 0.4301724433898926, "epoch": 0.12341561040693796, "step": 370 }, { "distill_loss": 0.6341826319694519, "epoch": 0.12341561040693796, "step": 370 }, { "epoch": 0.12341561040693796, "ref_ce_loss": 0.24294087290763855, "step": 370 }, { "epoch": 0.12675116744496331, "loss": 1.6486, "step": 380 }, { "epoch": 0.12675116744496331, "grad_norm": 1.720984697341919, "step": 380 }, { "epoch": 0.12675116744496331, "learning_rate": 0.00033777777777777777, "step": 380 }, { "epoch": 0.12675116744496331, "loss": 1.5315797328948975, "step": 380 }, { "ce_loss": 0.4742905795574188, "epoch": 0.12675116744496331, "step": 380 }, { "distill_loss": 0.5379242300987244, "epoch": 0.12675116744496331, "step": 380 }, { "epoch": 0.12675116744496331, "ref_ce_loss": 0.2992285490036011, "step": 380 }, { "epoch": 0.12675116744496331, "loss": 1.4294133186340332, "step": 380 }, { "ce_loss": 0.543543815612793, "epoch": 0.12675116744496331, "step": 380 }, { "distill_loss": 0.5854701399803162, "epoch": 0.12675116744496331, "step": 380 }, { "epoch": 0.12675116744496331, "ref_ce_loss": 0.29929932951927185, "step": 380 }, { "epoch": 0.13008672448298866, "loss": 1.497, "step": 390 }, { "epoch": 0.13008672448298866, "grad_norm": 1.4799915552139282, "step": 390 }, { "epoch": 0.13008672448298866, "learning_rate": 0.00034666666666666667, "step": 390 }, { "epoch": 0.13008672448298866, "loss": 1.6086652278900146, "step": 390 }, { "ce_loss": 0.4943905174732208, "epoch": 0.13008672448298866, "step": 390 }, { "distill_loss": 0.6120883226394653, "epoch": 0.13008672448298866, "step": 390 }, { "epoch": 0.13008672448298866, "ref_ce_loss": 0.2651229500770569, "step": 390 }, { "epoch": 0.13008672448298866, "loss": 1.7222286462783813, "step": 390 }, { "ce_loss": 0.4503355622291565, "epoch": 0.13008672448298866, "step": 390 }, { "distill_loss": 0.521122932434082, "epoch": 0.13008672448298866, "step": 390 }, { "epoch": 0.13008672448298866, "ref_ce_loss": 0.2309369146823883, "step": 390 }, { "epoch": 0.133422281521014, "loss": 1.6039, "step": 400 }, { "epoch": 0.133422281521014, "grad_norm": 2.7860965728759766, "step": 400 }, { "epoch": 0.133422281521014, "learning_rate": 0.00035555555555555557, "step": 400 }, { "epoch": 0.133422281521014, "loss": 2.0828051567077637, "step": 400 }, { "ce_loss": 0.5030226111412048, "epoch": 0.133422281521014, "step": 400 }, { "distill_loss": 0.5764296650886536, "epoch": 0.133422281521014, "step": 400 }, { "epoch": 0.133422281521014, "ref_ce_loss": 0.2733519673347473, "step": 400 }, { "epoch": 0.133422281521014, "loss": 1.5071513652801514, "step": 400 }, { "ce_loss": 0.48920971155166626, "epoch": 0.133422281521014, "step": 400 }, { "distill_loss": 0.5897278785705566, "epoch": 0.133422281521014, "step": 400 }, { "epoch": 0.133422281521014, "ref_ce_loss": 0.2842494249343872, "step": 400 }, { "epoch": 0.13675783855903936, "loss": 1.8246, "step": 410 }, { "epoch": 0.13675783855903936, "grad_norm": 4.629027366638184, "step": 410 }, { "epoch": 0.13675783855903936, "learning_rate": 0.00036444444444444447, "step": 410 }, { "epoch": 0.13675783855903936, "loss": 1.933959722518921, "step": 410 }, { "ce_loss": 0.4567517638206482, "epoch": 0.13675783855903936, "step": 410 }, { "distill_loss": 0.6056955456733704, "epoch": 0.13675783855903936, "step": 410 }, { "epoch": 0.13675783855903936, "ref_ce_loss": 0.23806793987751007, "step": 410 }, { "epoch": 0.13675783855903936, "loss": 1.6946505308151245, "step": 410 }, { "ce_loss": 0.5993149280548096, "epoch": 0.13675783855903936, "step": 410 }, { "distill_loss": 0.7280476093292236, "epoch": 0.13675783855903936, "step": 410 }, { "epoch": 0.13675783855903936, "ref_ce_loss": 0.22402344644069672, "step": 410 }, { "epoch": 0.1400933955970647, "loss": 1.6325, "step": 420 }, { "epoch": 0.1400933955970647, "grad_norm": 3.0605082511901855, "step": 420 }, { "epoch": 0.1400933955970647, "learning_rate": 0.0003733333333333334, "step": 420 }, { "epoch": 0.1400933955970647, "loss": 1.3284132480621338, "step": 420 }, { "ce_loss": 0.495491623878479, "epoch": 0.1400933955970647, "step": 420 }, { "distill_loss": 0.5443464517593384, "epoch": 0.1400933955970647, "step": 420 }, { "epoch": 0.1400933955970647, "ref_ce_loss": 0.2884868383407593, "step": 420 }, { "epoch": 0.1400933955970647, "loss": 1.2195791006088257, "step": 420 }, { "ce_loss": 0.44059133529663086, "epoch": 0.1400933955970647, "step": 420 }, { "distill_loss": 0.4888474941253662, "epoch": 0.1400933955970647, "step": 420 }, { "epoch": 0.1400933955970647, "ref_ce_loss": 0.2899805009365082, "step": 420 }, { "epoch": 0.14342895263509006, "loss": 1.6223, "step": 430 }, { "epoch": 0.14342895263509006, "grad_norm": 3.7689526081085205, "step": 430 }, { "epoch": 0.14342895263509006, "learning_rate": 0.0003822222222222223, "step": 430 }, { "epoch": 0.14342895263509006, "loss": 1.595637559890747, "step": 430 }, { "ce_loss": 0.544774055480957, "epoch": 0.14342895263509006, "step": 430 }, { "distill_loss": 0.515714168548584, "epoch": 0.14342895263509006, "step": 430 }, { "epoch": 0.14342895263509006, "ref_ce_loss": 0.24679963290691376, "step": 430 }, { "epoch": 0.14342895263509006, "loss": 1.3475347757339478, "step": 430 }, { "ce_loss": 0.44207265973091125, "epoch": 0.14342895263509006, "step": 430 }, { "distill_loss": 0.4540649354457855, "epoch": 0.14342895263509006, "step": 430 }, { "epoch": 0.14342895263509006, "ref_ce_loss": 0.23646868765354156, "step": 430 }, { "epoch": 0.1467645096731154, "loss": 1.644, "step": 440 }, { "epoch": 0.1467645096731154, "grad_norm": 1.7760794162750244, "step": 440 }, { "epoch": 0.1467645096731154, "learning_rate": 0.0003911111111111111, "step": 440 }, { "epoch": 0.1467645096731154, "loss": 1.3347100019454956, "step": 440 }, { "ce_loss": 0.4692061245441437, "epoch": 0.1467645096731154, "step": 440 }, { "distill_loss": 0.6326097846031189, "epoch": 0.1467645096731154, "step": 440 }, { "epoch": 0.1467645096731154, "ref_ce_loss": 0.23280993103981018, "step": 440 }, { "epoch": 0.1467645096731154, "loss": 1.2680646181106567, "step": 440 }, { "ce_loss": 0.44068577885627747, "epoch": 0.1467645096731154, "step": 440 }, { "distill_loss": 0.5154201984405518, "epoch": 0.1467645096731154, "step": 440 }, { "epoch": 0.1467645096731154, "ref_ce_loss": 0.3117990493774414, "step": 440 }, { "epoch": 0.15010006671114076, "loss": 1.593, "step": 450 }, { "epoch": 0.15010006671114076, "grad_norm": 2.2271616458892822, "step": 450 }, { "epoch": 0.15010006671114076, "learning_rate": 0.0004, "step": 450 }, { "epoch": 0.15010006671114076, "loss": 1.6983128786087036, "step": 450 }, { "ce_loss": 0.4268217980861664, "epoch": 0.15010006671114076, "step": 450 }, { "distill_loss": 0.4826546013355255, "epoch": 0.15010006671114076, "step": 450 }, { "epoch": 0.15010006671114076, "ref_ce_loss": 0.26377132534980774, "step": 450 }, { "epoch": 0.15010006671114076, "loss": 2.1044223308563232, "step": 450 }, { "ce_loss": 0.4926432967185974, "epoch": 0.15010006671114076, "step": 450 }, { "distill_loss": 0.5348109006881714, "epoch": 0.15010006671114076, "step": 450 }, { "epoch": 0.15010006671114076, "ref_ce_loss": 0.2828768491744995, "step": 450 }, { "epoch": 0.1534356237491661, "loss": 1.6717, "step": 460 }, { "epoch": 0.1534356237491661, "grad_norm": 1.3050262928009033, "step": 460 }, { "epoch": 0.1534356237491661, "learning_rate": 0.00040888888888888887, "step": 460 }, { "epoch": 0.1534356237491661, "loss": 1.4850102663040161, "step": 460 }, { "ce_loss": 0.3940463066101074, "epoch": 0.1534356237491661, "step": 460 }, { "distill_loss": 0.4585028290748596, "epoch": 0.1534356237491661, "step": 460 }, { "epoch": 0.1534356237491661, "ref_ce_loss": 0.19937008619308472, "step": 460 }, { "epoch": 0.1534356237491661, "loss": 1.4205468893051147, "step": 460 }, { "ce_loss": 0.47472071647644043, "epoch": 0.1534356237491661, "step": 460 }, { "distill_loss": 0.5782334804534912, "epoch": 0.1534356237491661, "step": 460 }, { "epoch": 0.1534356237491661, "ref_ce_loss": 0.21085552871227264, "step": 460 }, { "epoch": 0.15677118078719146, "loss": 1.5714, "step": 470 }, { "epoch": 0.15677118078719146, "grad_norm": 1.5946571826934814, "step": 470 }, { "epoch": 0.15677118078719146, "learning_rate": 0.0004177777777777778, "step": 470 }, { "epoch": 0.15677118078719146, "loss": 1.7707476615905762, "step": 470 }, { "ce_loss": 0.5047350525856018, "epoch": 0.15677118078719146, "step": 470 }, { "distill_loss": 0.6464635729789734, "epoch": 0.15677118078719146, "step": 470 }, { "epoch": 0.15677118078719146, "ref_ce_loss": 0.28190878033638, "step": 470 }, { "epoch": 0.15677118078719146, "loss": 1.366986632347107, "step": 470 }, { "ce_loss": 0.45065632462501526, "epoch": 0.15677118078719146, "step": 470 }, { "distill_loss": 0.5560937523841858, "epoch": 0.15677118078719146, "step": 470 }, { "epoch": 0.15677118078719146, "ref_ce_loss": 0.23909518122673035, "step": 470 }, { "epoch": 0.1601067378252168, "loss": 1.6082, "step": 480 }, { "epoch": 0.1601067378252168, "grad_norm": 1.7290911674499512, "step": 480 }, { "epoch": 0.1601067378252168, "learning_rate": 0.00042666666666666667, "step": 480 }, { "epoch": 0.1601067378252168, "loss": 1.3324958086013794, "step": 480 }, { "ce_loss": 0.378185898065567, "epoch": 0.1601067378252168, "step": 480 }, { "distill_loss": 0.5746075510978699, "epoch": 0.1601067378252168, "step": 480 }, { "epoch": 0.1601067378252168, "ref_ce_loss": 0.24356596171855927, "step": 480 }, { "epoch": 0.1601067378252168, "loss": 1.4968345165252686, "step": 480 }, { "ce_loss": 0.4363265931606293, "epoch": 0.1601067378252168, "step": 480 }, { "distill_loss": 0.5504066348075867, "epoch": 0.1601067378252168, "step": 480 }, { "epoch": 0.1601067378252168, "ref_ce_loss": 0.19955141842365265, "step": 480 }, { "epoch": 0.16344229486324216, "loss": 1.5729, "step": 490 }, { "epoch": 0.16344229486324216, "grad_norm": 1.1001335382461548, "step": 490 }, { "epoch": 0.16344229486324216, "learning_rate": 0.00043555555555555557, "step": 490 }, { "epoch": 0.16344229486324216, "loss": 1.4524122476577759, "step": 490 }, { "ce_loss": 0.5018464922904968, "epoch": 0.16344229486324216, "step": 490 }, { "distill_loss": 0.5495221018791199, "epoch": 0.16344229486324216, "step": 490 }, { "epoch": 0.16344229486324216, "ref_ce_loss": 0.2558499574661255, "step": 490 }, { "epoch": 0.16344229486324216, "loss": 1.4316108226776123, "step": 490 }, { "ce_loss": 0.45365622639656067, "epoch": 0.16344229486324216, "step": 490 }, { "distill_loss": 0.5579233765602112, "epoch": 0.16344229486324216, "step": 490 }, { "epoch": 0.16344229486324216, "ref_ce_loss": 0.2731827199459076, "step": 490 }, { "epoch": 0.1667778519012675, "loss": 1.4884, "step": 500 }, { "epoch": 0.1667778519012675, "grad_norm": 1.5505117177963257, "step": 500 }, { "epoch": 0.1667778519012675, "learning_rate": 0.00044444444444444447, "step": 500 }, { "epoch": 0.1667778519012675, "loss": 1.3067128658294678, "step": 500 }, { "ce_loss": 0.433439165353775, "epoch": 0.1667778519012675, "step": 500 }, { "distill_loss": 0.4941195547580719, "epoch": 0.1667778519012675, "step": 500 }, { "epoch": 0.1667778519012675, "ref_ce_loss": 0.2596491873264313, "step": 500 }, { "epoch": 0.1667778519012675, "loss": 2.249636650085449, "step": 500 }, { "ce_loss": 0.49755096435546875, "epoch": 0.1667778519012675, "step": 500 }, { "distill_loss": 0.49061688780784607, "epoch": 0.1667778519012675, "step": 500 }, { "epoch": 0.1667778519012675, "ref_ce_loss": 0.27567821741104126, "step": 500 }, { "epoch": 0.17011340893929286, "loss": 1.6774, "step": 510 }, { "epoch": 0.17011340893929286, "grad_norm": 1.3292597532272339, "step": 510 }, { "epoch": 0.17011340893929286, "learning_rate": 0.00045333333333333337, "step": 510 }, { "epoch": 0.17011340893929286, "loss": 1.595599889755249, "step": 510 }, { "ce_loss": 0.5413509607315063, "epoch": 0.17011340893929286, "step": 510 }, { "distill_loss": 0.6084362268447876, "epoch": 0.17011340893929286, "step": 510 }, { "epoch": 0.17011340893929286, "ref_ce_loss": 0.295004278421402, "step": 510 }, { "epoch": 0.17011340893929286, "loss": 1.9175055027008057, "step": 510 }, { "ce_loss": 0.4983865022659302, "epoch": 0.17011340893929286, "step": 510 }, { "distill_loss": 0.5698104500770569, "epoch": 0.17011340893929286, "step": 510 }, { "epoch": 0.17011340893929286, "ref_ce_loss": 0.3239087760448456, "step": 510 }, { "epoch": 0.1734489659773182, "loss": 1.5642, "step": 520 }, { "epoch": 0.1734489659773182, "grad_norm": 1.2634004354476929, "step": 520 }, { "epoch": 0.1734489659773182, "learning_rate": 0.0004622222222222222, "step": 520 }, { "epoch": 0.1734489659773182, "loss": 1.9376397132873535, "step": 520 }, { "ce_loss": 0.5932291746139526, "epoch": 0.1734489659773182, "step": 520 }, { "distill_loss": 0.46436649560928345, "epoch": 0.1734489659773182, "step": 520 }, { "epoch": 0.1734489659773182, "ref_ce_loss": 0.2764166295528412, "step": 520 }, { "epoch": 0.1734489659773182, "loss": 2.0309195518493652, "step": 520 }, { "ce_loss": 0.5371260643005371, "epoch": 0.1734489659773182, "step": 520 }, { "distill_loss": 0.517487645149231, "epoch": 0.1734489659773182, "step": 520 }, { "epoch": 0.1734489659773182, "ref_ce_loss": 0.19353549182415009, "step": 520 }, { "epoch": 0.17678452301534356, "loss": 1.653, "step": 530 }, { "epoch": 0.17678452301534356, "grad_norm": 2.620346784591675, "step": 530 }, { "epoch": 0.17678452301534356, "learning_rate": 0.00047111111111111117, "step": 530 }, { "epoch": 0.17678452301534356, "loss": 1.550959825515747, "step": 530 }, { "ce_loss": 0.47280043363571167, "epoch": 0.17678452301534356, "step": 530 }, { "distill_loss": 0.543157696723938, "epoch": 0.17678452301534356, "step": 530 }, { "epoch": 0.17678452301534356, "ref_ce_loss": 0.21153424680233002, "step": 530 }, { "epoch": 0.17678452301534356, "loss": 1.3030387163162231, "step": 530 }, { "ce_loss": 0.5019891262054443, "epoch": 0.17678452301534356, "step": 530 }, { "distill_loss": 0.5249347686767578, "epoch": 0.17678452301534356, "step": 530 }, { "epoch": 0.17678452301534356, "ref_ce_loss": 0.2558169960975647, "step": 530 }, { "epoch": 0.1801200800533689, "loss": 1.6, "step": 540 }, { "epoch": 0.1801200800533689, "grad_norm": 1.2312290668487549, "step": 540 }, { "epoch": 0.1801200800533689, "learning_rate": 0.00048, "step": 540 }, { "epoch": 0.1801200800533689, "loss": 1.629996418952942, "step": 540 }, { "ce_loss": 0.47734594345092773, "epoch": 0.1801200800533689, "step": 540 }, { "distill_loss": 0.5398337841033936, "epoch": 0.1801200800533689, "step": 540 }, { "epoch": 0.1801200800533689, "ref_ce_loss": 0.23363655805587769, "step": 540 }, { "epoch": 0.1801200800533689, "loss": 1.3013463020324707, "step": 540 }, { "ce_loss": 0.5037491321563721, "epoch": 0.1801200800533689, "step": 540 }, { "distill_loss": 0.5337326526641846, "epoch": 0.1801200800533689, "step": 540 }, { "epoch": 0.1801200800533689, "ref_ce_loss": 0.25985610485076904, "step": 540 }, { "epoch": 0.18345563709139426, "loss": 1.6104, "step": 550 }, { "epoch": 0.18345563709139426, "grad_norm": 3.5466859340667725, "step": 550 }, { "epoch": 0.18345563709139426, "learning_rate": 0.000488888888888889, "step": 550 }, { "epoch": 0.18345563709139426, "loss": 1.25911283493042, "step": 550 }, { "ce_loss": 0.4432990550994873, "epoch": 0.18345563709139426, "step": 550 }, { "distill_loss": 0.5283195972442627, "epoch": 0.18345563709139426, "step": 550 }, { "epoch": 0.18345563709139426, "ref_ce_loss": 0.28703317046165466, "step": 550 }, { "epoch": 0.18345563709139426, "loss": 1.4867393970489502, "step": 550 }, { "ce_loss": 0.46072185039520264, "epoch": 0.18345563709139426, "step": 550 }, { "distill_loss": 0.6002609729766846, "epoch": 0.18345563709139426, "step": 550 }, { "epoch": 0.18345563709139426, "ref_ce_loss": 0.3060095012187958, "step": 550 }, { "epoch": 0.1867911941294196, "loss": 1.5295, "step": 560 }, { "epoch": 0.1867911941294196, "grad_norm": 2.128678798675537, "step": 560 }, { "epoch": 0.1867911941294196, "learning_rate": 0.0004977777777777778, "step": 560 }, { "epoch": 0.1867911941294196, "loss": 1.3652108907699585, "step": 560 }, { "ce_loss": 0.41976261138916016, "epoch": 0.1867911941294196, "step": 560 }, { "distill_loss": 0.5678168535232544, "epoch": 0.1867911941294196, "step": 560 }, { "epoch": 0.1867911941294196, "ref_ce_loss": 0.2214897722005844, "step": 560 }, { "epoch": 0.1867911941294196, "loss": 1.4672818183898926, "step": 560 }, { "ce_loss": 0.5104706287384033, "epoch": 0.1867911941294196, "step": 560 }, { "distill_loss": 0.5782572031021118, "epoch": 0.1867911941294196, "step": 560 }, { "epoch": 0.1867911941294196, "ref_ce_loss": 0.2707638740539551, "step": 560 }, { "epoch": 0.19012675116744496, "loss": 1.5453, "step": 570 }, { "epoch": 0.19012675116744496, "grad_norm": 2.144012451171875, "step": 570 }, { "epoch": 0.19012675116744496, "learning_rate": 0.0005066666666666667, "step": 570 }, { "epoch": 0.19012675116744496, "loss": 2.1711885929107666, "step": 570 }, { "ce_loss": 0.4930206835269928, "epoch": 0.19012675116744496, "step": 570 }, { "distill_loss": 0.5990675091743469, "epoch": 0.19012675116744496, "step": 570 }, { "epoch": 0.19012675116744496, "ref_ce_loss": 0.2547401189804077, "step": 570 }, { "epoch": 0.19012675116744496, "loss": 1.7919820547103882, "step": 570 }, { "ce_loss": 0.485125333070755, "epoch": 0.19012675116744496, "step": 570 }, { "distill_loss": 0.5928157567977905, "epoch": 0.19012675116744496, "step": 570 }, { "epoch": 0.19012675116744496, "ref_ce_loss": 0.25338858366012573, "step": 570 }, { "epoch": 0.1934623082054703, "loss": 1.5982, "step": 580 }, { "epoch": 0.1934623082054703, "grad_norm": 1.5169459581375122, "step": 580 }, { "epoch": 0.1934623082054703, "learning_rate": 0.0005155555555555557, "step": 580 }, { "epoch": 0.1934623082054703, "loss": 1.3269596099853516, "step": 580 }, { "ce_loss": 0.5053694248199463, "epoch": 0.1934623082054703, "step": 580 }, { "distill_loss": 0.5650742650032043, "epoch": 0.1934623082054703, "step": 580 }, { "epoch": 0.1934623082054703, "ref_ce_loss": 0.2564832866191864, "step": 580 }, { "epoch": 0.1934623082054703, "loss": 1.6118474006652832, "step": 580 }, { "ce_loss": 0.4108356535434723, "epoch": 0.1934623082054703, "step": 580 }, { "distill_loss": 0.6295657157897949, "epoch": 0.1934623082054703, "step": 580 }, { "epoch": 0.1934623082054703, "ref_ce_loss": 0.2287975251674652, "step": 580 }, { "epoch": 0.19679786524349566, "loss": 1.6172, "step": 590 }, { "epoch": 0.19679786524349566, "grad_norm": 1.4867298603057861, "step": 590 }, { "epoch": 0.19679786524349566, "learning_rate": 0.0005244444444444445, "step": 590 }, { "epoch": 0.19679786524349566, "loss": 1.4855684041976929, "step": 590 }, { "ce_loss": 0.4735114574432373, "epoch": 0.19679786524349566, "step": 590 }, { "distill_loss": 0.6113525032997131, "epoch": 0.19679786524349566, "step": 590 }, { "epoch": 0.19679786524349566, "ref_ce_loss": 0.2961452901363373, "step": 590 }, { "epoch": 0.19679786524349566, "loss": 1.7030887603759766, "step": 590 }, { "ce_loss": 0.4876049757003784, "epoch": 0.19679786524349566, "step": 590 }, { "distill_loss": 0.5494129061698914, "epoch": 0.19679786524349566, "step": 590 }, { "epoch": 0.19679786524349566, "ref_ce_loss": 0.29730701446533203, "step": 590 }, { "epoch": 0.200133422281521, "loss": 1.5749, "step": 600 }, { "epoch": 0.200133422281521, "grad_norm": 2.0874152183532715, "step": 600 }, { "epoch": 0.200133422281521, "learning_rate": 0.0005333333333333334, "step": 600 }, { "epoch": 0.200133422281521, "loss": 1.338006615638733, "step": 600 }, { "ce_loss": 0.4650641083717346, "epoch": 0.200133422281521, "step": 600 }, { "distill_loss": 0.4777800440788269, "epoch": 0.200133422281521, "step": 600 }, { "epoch": 0.200133422281521, "ref_ce_loss": 0.2336799055337906, "step": 600 }, { "epoch": 0.200133422281521, "loss": 1.4985271692276, "step": 600 }, { "ce_loss": 0.5751738548278809, "epoch": 0.200133422281521, "step": 600 }, { "distill_loss": 0.46163544058799744, "epoch": 0.200133422281521, "step": 600 }, { "epoch": 0.200133422281521, "ref_ce_loss": 0.3208712041378021, "step": 600 }, { "epoch": 0.20346897931954636, "loss": 1.5238, "step": 610 }, { "epoch": 0.20346897931954636, "grad_norm": 1.4101930856704712, "step": 610 }, { "epoch": 0.20346897931954636, "learning_rate": 0.0005422222222222223, "step": 610 }, { "epoch": 0.20346897931954636, "loss": 1.7546379566192627, "step": 610 }, { "ce_loss": 0.460560142993927, "epoch": 0.20346897931954636, "step": 610 }, { "distill_loss": 0.5301985144615173, "epoch": 0.20346897931954636, "step": 610 }, { "epoch": 0.20346897931954636, "ref_ce_loss": 0.31578245759010315, "step": 610 }, { "epoch": 0.20346897931954636, "loss": 1.3197903633117676, "step": 610 }, { "ce_loss": 0.43814700841903687, "epoch": 0.20346897931954636, "step": 610 }, { "distill_loss": 0.6068532466888428, "epoch": 0.20346897931954636, "step": 610 }, { "epoch": 0.20346897931954636, "ref_ce_loss": 0.25644248723983765, "step": 610 }, { "epoch": 0.2068045363575717, "loss": 1.7156, "step": 620 }, { "epoch": 0.2068045363575717, "grad_norm": 2.0110700130462646, "step": 620 }, { "epoch": 0.2068045363575717, "learning_rate": 0.0005511111111111112, "step": 620 }, { "epoch": 0.2068045363575717, "loss": 1.3195078372955322, "step": 620 }, { "ce_loss": 0.4470673203468323, "epoch": 0.2068045363575717, "step": 620 }, { "distill_loss": 0.5740320086479187, "epoch": 0.2068045363575717, "step": 620 }, { "epoch": 0.2068045363575717, "ref_ce_loss": 0.2269422858953476, "step": 620 }, { "epoch": 0.2068045363575717, "loss": 1.3146237134933472, "step": 620 }, { "ce_loss": 0.4497494101524353, "epoch": 0.2068045363575717, "step": 620 }, { "distill_loss": 0.6298726201057434, "epoch": 0.2068045363575717, "step": 620 }, { "epoch": 0.2068045363575717, "ref_ce_loss": 0.2145850658416748, "step": 620 }, { "epoch": 0.21014009339559706, "loss": 1.5681, "step": 630 }, { "epoch": 0.21014009339559706, "grad_norm": 1.196976900100708, "step": 630 }, { "epoch": 0.21014009339559706, "learning_rate": 0.00056, "step": 630 }, { "epoch": 0.21014009339559706, "loss": 1.5452654361724854, "step": 630 }, { "ce_loss": 0.42693886160850525, "epoch": 0.21014009339559706, "step": 630 }, { "distill_loss": 0.6515905857086182, "epoch": 0.21014009339559706, "step": 630 }, { "epoch": 0.21014009339559706, "ref_ce_loss": 0.3077460825443268, "step": 630 }, { "epoch": 0.21014009339559706, "loss": 1.7794028520584106, "step": 630 }, { "ce_loss": 0.4400431513786316, "epoch": 0.21014009339559706, "step": 630 }, { "distill_loss": 0.6112064123153687, "epoch": 0.21014009339559706, "step": 630 }, { "epoch": 0.21014009339559706, "ref_ce_loss": 0.19944915175437927, "step": 630 }, { "epoch": 0.2134756504336224, "loss": 1.6714, "step": 640 }, { "epoch": 0.2134756504336224, "grad_norm": 2.2573142051696777, "step": 640 }, { "epoch": 0.2134756504336224, "learning_rate": 0.000568888888888889, "step": 640 }, { "epoch": 0.2134756504336224, "loss": 1.4484288692474365, "step": 640 }, { "ce_loss": 0.4527072310447693, "epoch": 0.2134756504336224, "step": 640 }, { "distill_loss": 0.4269205927848816, "epoch": 0.2134756504336224, "step": 640 }, { "epoch": 0.2134756504336224, "ref_ce_loss": 0.3586342930793762, "step": 640 }, { "epoch": 0.2134756504336224, "loss": 1.2968647480010986, "step": 640 }, { "ce_loss": 0.4351823627948761, "epoch": 0.2134756504336224, "step": 640 }, { "distill_loss": 0.4391202926635742, "epoch": 0.2134756504336224, "step": 640 }, { "epoch": 0.2134756504336224, "ref_ce_loss": 0.32410529255867004, "step": 640 }, { "epoch": 0.21681120747164775, "loss": 1.6453, "step": 650 }, { "epoch": 0.21681120747164775, "grad_norm": 1.68989098072052, "step": 650 }, { "epoch": 0.21681120747164775, "learning_rate": 0.0005777777777777778, "step": 650 }, { "epoch": 0.21681120747164775, "loss": 1.443930983543396, "step": 650 }, { "ce_loss": 0.4861678183078766, "epoch": 0.21681120747164775, "step": 650 }, { "distill_loss": 0.6249301433563232, "epoch": 0.21681120747164775, "step": 650 }, { "epoch": 0.21681120747164775, "ref_ce_loss": 0.21519602835178375, "step": 650 }, { "epoch": 0.21681120747164775, "loss": 1.4586472511291504, "step": 650 }, { "ce_loss": 0.42792659997940063, "epoch": 0.21681120747164775, "step": 650 }, { "distill_loss": 0.6605918407440186, "epoch": 0.21681120747164775, "step": 650 }, { "epoch": 0.21681120747164775, "ref_ce_loss": 0.26281821727752686, "step": 650 }, { "epoch": 0.2201467645096731, "loss": 1.6017, "step": 660 }, { "epoch": 0.2201467645096731, "grad_norm": 2.7463796138763428, "step": 660 }, { "epoch": 0.2201467645096731, "learning_rate": 0.0005866666666666667, "step": 660 }, { "epoch": 0.2201467645096731, "loss": 1.276236653327942, "step": 660 }, { "ce_loss": 0.45802658796310425, "epoch": 0.2201467645096731, "step": 660 }, { "distill_loss": 0.5395572185516357, "epoch": 0.2201467645096731, "step": 660 }, { "epoch": 0.2201467645096731, "ref_ce_loss": 0.27748921513557434, "step": 660 }, { "epoch": 0.2201467645096731, "loss": 2.1881327629089355, "step": 660 }, { "ce_loss": 0.47671204805374146, "epoch": 0.2201467645096731, "step": 660 }, { "distill_loss": 0.5150418877601624, "epoch": 0.2201467645096731, "step": 660 }, { "epoch": 0.2201467645096731, "ref_ce_loss": 0.27915847301483154, "step": 660 }, { "epoch": 0.22348232154769845, "loss": 1.5746, "step": 670 }, { "epoch": 0.22348232154769845, "grad_norm": 1.6654798984527588, "step": 670 }, { "epoch": 0.22348232154769845, "learning_rate": 0.0005955555555555556, "step": 670 }, { "epoch": 0.22348232154769845, "loss": 1.593944787979126, "step": 670 }, { "ce_loss": 0.4207690954208374, "epoch": 0.22348232154769845, "step": 670 }, { "distill_loss": 0.6195393800735474, "epoch": 0.22348232154769845, "step": 670 }, { "epoch": 0.22348232154769845, "ref_ce_loss": 0.26322174072265625, "step": 670 }, { "epoch": 0.22348232154769845, "loss": 1.866980791091919, "step": 670 }, { "ce_loss": 0.5242395401000977, "epoch": 0.22348232154769845, "step": 670 }, { "distill_loss": 0.692948043346405, "epoch": 0.22348232154769845, "step": 670 }, { "epoch": 0.22348232154769845, "ref_ce_loss": 0.2934406101703644, "step": 670 }, { "epoch": 0.2268178785857238, "loss": 1.5249, "step": 680 }, { "epoch": 0.2268178785857238, "grad_norm": 1.435062289237976, "step": 680 }, { "epoch": 0.2268178785857238, "learning_rate": 0.0006044444444444445, "step": 680 }, { "epoch": 0.2268178785857238, "loss": 1.4246501922607422, "step": 680 }, { "ce_loss": 0.4549441635608673, "epoch": 0.2268178785857238, "step": 680 }, { "distill_loss": 0.5518971681594849, "epoch": 0.2268178785857238, "step": 680 }, { "epoch": 0.2268178785857238, "ref_ce_loss": 0.2823745608329773, "step": 680 }, { "epoch": 0.2268178785857238, "loss": 2.719393253326416, "step": 680 }, { "ce_loss": 0.5663154721260071, "epoch": 0.2268178785857238, "step": 680 }, { "distill_loss": 0.6299650073051453, "epoch": 0.2268178785857238, "step": 680 }, { "epoch": 0.2268178785857238, "ref_ce_loss": 0.2891238033771515, "step": 680 }, { "epoch": 0.23015343562374915, "loss": 1.7289, "step": 690 }, { "epoch": 0.23015343562374915, "grad_norm": 2.50449538230896, "step": 690 }, { "epoch": 0.23015343562374915, "learning_rate": 0.0006133333333333334, "step": 690 }, { "epoch": 0.23015343562374915, "loss": 1.3545693159103394, "step": 690 }, { "ce_loss": 0.5105953812599182, "epoch": 0.23015343562374915, "step": 690 }, { "distill_loss": 0.5700008869171143, "epoch": 0.23015343562374915, "step": 690 }, { "epoch": 0.23015343562374915, "ref_ce_loss": 0.2721017599105835, "step": 690 }, { "epoch": 0.23015343562374915, "loss": 1.2746974229812622, "step": 690 }, { "ce_loss": 0.48519670963287354, "epoch": 0.23015343562374915, "step": 690 }, { "distill_loss": 0.5226454138755798, "epoch": 0.23015343562374915, "step": 690 }, { "epoch": 0.23015343562374915, "ref_ce_loss": 0.26627910137176514, "step": 690 }, { "epoch": 0.2334889926617745, "loss": 1.6944, "step": 700 }, { "epoch": 0.2334889926617745, "grad_norm": 1.9347381591796875, "step": 700 }, { "epoch": 0.2334889926617745, "learning_rate": 0.0006222222222222223, "step": 700 }, { "epoch": 0.2334889926617745, "loss": 1.4545104503631592, "step": 700 }, { "ce_loss": 0.4468264579772949, "epoch": 0.2334889926617745, "step": 700 }, { "distill_loss": 0.4760701656341553, "epoch": 0.2334889926617745, "step": 700 }, { "epoch": 0.2334889926617745, "ref_ce_loss": 0.2496543526649475, "step": 700 }, { "epoch": 0.2334889926617745, "loss": 1.7434810400009155, "step": 700 }, { "ce_loss": 0.5025457739830017, "epoch": 0.2334889926617745, "step": 700 }, { "distill_loss": 0.5106308460235596, "epoch": 0.2334889926617745, "step": 700 }, { "epoch": 0.2334889926617745, "ref_ce_loss": 0.32452499866485596, "step": 700 }, { "epoch": 0.23682454969979988, "loss": 1.626, "step": 710 }, { "epoch": 0.23682454969979988, "grad_norm": 1.789971113204956, "step": 710 }, { "epoch": 0.23682454969979988, "learning_rate": 0.0006311111111111112, "step": 710 }, { "epoch": 0.23682454969979988, "loss": 1.2687652111053467, "step": 710 }, { "ce_loss": 0.49126219749450684, "epoch": 0.23682454969979988, "step": 710 }, { "distill_loss": 0.4775649607181549, "epoch": 0.23682454969979988, "step": 710 }, { "epoch": 0.23682454969979988, "ref_ce_loss": 0.2650153636932373, "step": 710 }, { "epoch": 0.23682454969979988, "loss": 1.09128737449646, "step": 710 }, { "ce_loss": 0.4264126718044281, "epoch": 0.23682454969979988, "step": 710 }, { "distill_loss": 0.40314796566963196, "epoch": 0.23682454969979988, "step": 710 }, { "epoch": 0.23682454969979988, "ref_ce_loss": 0.2587401270866394, "step": 710 }, { "epoch": 0.24016010673782523, "loss": 1.6084, "step": 720 }, { "epoch": 0.24016010673782523, "grad_norm": 2.9466187953948975, "step": 720 }, { "epoch": 0.24016010673782523, "learning_rate": 0.00064, "step": 720 }, { "epoch": 0.24016010673782523, "loss": 1.9708633422851562, "step": 720 }, { "ce_loss": 0.45224499702453613, "epoch": 0.24016010673782523, "step": 720 }, { "distill_loss": 0.4592093229293823, "epoch": 0.24016010673782523, "step": 720 }, { "epoch": 0.24016010673782523, "ref_ce_loss": 0.2654026448726654, "step": 720 }, { "epoch": 0.24016010673782523, "loss": 1.2625981569290161, "step": 720 }, { "ce_loss": 0.41097569465637207, "epoch": 0.24016010673782523, "step": 720 }, { "distill_loss": 0.4138302206993103, "epoch": 0.24016010673782523, "step": 720 }, { "epoch": 0.24016010673782523, "ref_ce_loss": 0.2421134114265442, "step": 720 }, { "epoch": 0.24349566377585058, "loss": 1.528, "step": 730 }, { "epoch": 0.24349566377585058, "grad_norm": 1.8387153148651123, "step": 730 }, { "epoch": 0.24349566377585058, "learning_rate": 0.000648888888888889, "step": 730 }, { "epoch": 0.24349566377585058, "loss": 1.3539307117462158, "step": 730 }, { "ce_loss": 0.4820309579372406, "epoch": 0.24349566377585058, "step": 730 }, { "distill_loss": 0.6259849667549133, "epoch": 0.24349566377585058, "step": 730 }, { "epoch": 0.24349566377585058, "ref_ce_loss": 0.22844180464744568, "step": 730 }, { "epoch": 0.24349566377585058, "loss": 1.6995915174484253, "step": 730 }, { "ce_loss": 0.41940057277679443, "epoch": 0.24349566377585058, "step": 730 }, { "distill_loss": 0.6038891673088074, "epoch": 0.24349566377585058, "step": 730 }, { "epoch": 0.24349566377585058, "ref_ce_loss": 0.26597100496292114, "step": 730 }, { "epoch": 0.24683122081387593, "loss": 1.5918, "step": 740 }, { "epoch": 0.24683122081387593, "grad_norm": 1.9388843774795532, "step": 740 }, { "epoch": 0.24683122081387593, "learning_rate": 0.0006577777777777777, "step": 740 }, { "epoch": 0.24683122081387593, "loss": 1.5610748529434204, "step": 740 }, { "ce_loss": 0.45109638571739197, "epoch": 0.24683122081387593, "step": 740 }, { "distill_loss": 0.5143248438835144, "epoch": 0.24683122081387593, "step": 740 }, { "epoch": 0.24683122081387593, "ref_ce_loss": 0.2895212769508362, "step": 740 }, { "epoch": 0.24683122081387593, "loss": 1.315789818763733, "step": 740 }, { "ce_loss": 0.40198326110839844, "epoch": 0.24683122081387593, "step": 740 }, { "distill_loss": 0.5153144598007202, "epoch": 0.24683122081387593, "step": 740 }, { "epoch": 0.24683122081387593, "ref_ce_loss": 0.2582947015762329, "step": 740 }, { "epoch": 0.2501667778519013, "loss": 1.5719, "step": 750 }, { "epoch": 0.2501667778519013, "grad_norm": 1.69242525100708, "step": 750 }, { "epoch": 0.2501667778519013, "learning_rate": 0.0006666666666666668, "step": 750 }, { "epoch": 0.2501667778519013, "loss": 2.689157485961914, "step": 750 }, { "ce_loss": 0.4594300091266632, "epoch": 0.2501667778519013, "step": 750 }, { "distill_loss": 0.6778824925422668, "epoch": 0.2501667778519013, "step": 750 }, { "epoch": 0.2501667778519013, "ref_ce_loss": 0.30148231983184814, "step": 750 }, { "epoch": 0.2501667778519013, "loss": 1.4996731281280518, "step": 750 }, { "ce_loss": 0.45501625537872314, "epoch": 0.2501667778519013, "step": 750 }, { "distill_loss": 0.6697419285774231, "epoch": 0.2501667778519013, "step": 750 }, { "epoch": 0.2501667778519013, "ref_ce_loss": 0.2740204632282257, "step": 750 }, { "epoch": 0.25350233488992663, "loss": 1.6409, "step": 760 }, { "epoch": 0.25350233488992663, "grad_norm": 1.9353746175765991, "step": 760 }, { "epoch": 0.25350233488992663, "learning_rate": 0.0006755555555555555, "step": 760 }, { "epoch": 0.25350233488992663, "loss": 1.4622337818145752, "step": 760 }, { "ce_loss": 0.49489298462867737, "epoch": 0.25350233488992663, "step": 760 }, { "distill_loss": 0.5559422969818115, "epoch": 0.25350233488992663, "step": 760 }, { "epoch": 0.25350233488992663, "ref_ce_loss": 0.3610493540763855, "step": 760 }, { "epoch": 0.25350233488992663, "loss": 1.6780763864517212, "step": 760 }, { "ce_loss": 0.4462796151638031, "epoch": 0.25350233488992663, "step": 760 }, { "distill_loss": 0.5648516416549683, "epoch": 0.25350233488992663, "step": 760 }, { "epoch": 0.25350233488992663, "ref_ce_loss": 0.3000343143939972, "step": 760 }, { "epoch": 0.256837891927952, "loss": 1.7037, "step": 770 }, { "epoch": 0.256837891927952, "grad_norm": 2.1047372817993164, "step": 770 }, { "epoch": 0.256837891927952, "learning_rate": 0.0006844444444444444, "step": 770 }, { "epoch": 0.256837891927952, "loss": 1.5280171632766724, "step": 770 }, { "ce_loss": 0.42800432443618774, "epoch": 0.256837891927952, "step": 770 }, { "distill_loss": 0.5842844843864441, "epoch": 0.256837891927952, "step": 770 }, { "epoch": 0.256837891927952, "ref_ce_loss": 0.2647983133792877, "step": 770 }, { "epoch": 0.256837891927952, "loss": 1.9386160373687744, "step": 770 }, { "ce_loss": 0.4129220247268677, "epoch": 0.256837891927952, "step": 770 }, { "distill_loss": 0.5319868326187134, "epoch": 0.256837891927952, "step": 770 }, { "epoch": 0.256837891927952, "ref_ce_loss": 0.3261532485485077, "step": 770 }, { "epoch": 0.2601734489659773, "loss": 1.5582, "step": 780 }, { "epoch": 0.2601734489659773, "grad_norm": 1.3958163261413574, "step": 780 }, { "epoch": 0.2601734489659773, "learning_rate": 0.0006933333333333333, "step": 780 }, { "epoch": 0.2601734489659773, "loss": 1.4289088249206543, "step": 780 }, { "ce_loss": 0.5142719149589539, "epoch": 0.2601734489659773, "step": 780 }, { "distill_loss": 0.5124003291130066, "epoch": 0.2601734489659773, "step": 780 }, { "epoch": 0.2601734489659773, "ref_ce_loss": 0.27058809995651245, "step": 780 }, { "epoch": 0.2601734489659773, "loss": 1.4231754541397095, "step": 780 }, { "ce_loss": 0.49991029500961304, "epoch": 0.2601734489659773, "step": 780 }, { "distill_loss": 0.5184733867645264, "epoch": 0.2601734489659773, "step": 780 }, { "epoch": 0.2601734489659773, "ref_ce_loss": 0.268743634223938, "step": 780 }, { "epoch": 0.2635090060040027, "loss": 1.6755, "step": 790 }, { "epoch": 0.2635090060040027, "grad_norm": 1.575848937034607, "step": 790 }, { "epoch": 0.2635090060040027, "learning_rate": 0.0007022222222222222, "step": 790 }, { "epoch": 0.2635090060040027, "loss": 1.4275168180465698, "step": 790 }, { "ce_loss": 0.5029415488243103, "epoch": 0.2635090060040027, "step": 790 }, { "distill_loss": 0.628352165222168, "epoch": 0.2635090060040027, "step": 790 }, { "epoch": 0.2635090060040027, "ref_ce_loss": 0.2929801046848297, "step": 790 }, { "epoch": 0.2635090060040027, "loss": 1.449366807937622, "step": 790 }, { "ce_loss": 0.4599992334842682, "epoch": 0.2635090060040027, "step": 790 }, { "distill_loss": 0.5531008839607239, "epoch": 0.2635090060040027, "step": 790 }, { "epoch": 0.2635090060040027, "ref_ce_loss": 0.2969628870487213, "step": 790 }, { "epoch": 0.266844563042028, "loss": 1.6387, "step": 800 }, { "epoch": 0.266844563042028, "grad_norm": 1.6788091659545898, "step": 800 }, { "epoch": 0.266844563042028, "learning_rate": 0.0007111111111111111, "step": 800 }, { "epoch": 0.266844563042028, "loss": 1.4919285774230957, "step": 800 }, { "ce_loss": 0.45864376425743103, "epoch": 0.266844563042028, "step": 800 }, { "distill_loss": 0.6171651482582092, "epoch": 0.266844563042028, "step": 800 }, { "epoch": 0.266844563042028, "ref_ce_loss": 0.2416432499885559, "step": 800 }, { "epoch": 0.266844563042028, "loss": 1.9904141426086426, "step": 800 }, { "ce_loss": 0.46867993474006653, "epoch": 0.266844563042028, "step": 800 }, { "distill_loss": 0.6592917442321777, "epoch": 0.266844563042028, "step": 800 }, { "epoch": 0.266844563042028, "ref_ce_loss": 0.28348398208618164, "step": 800 }, { "epoch": 0.2701801200800534, "loss": 1.5013, "step": 810 }, { "epoch": 0.2701801200800534, "grad_norm": 1.3329455852508545, "step": 810 }, { "epoch": 0.2701801200800534, "learning_rate": 0.00072, "step": 810 }, { "epoch": 0.2701801200800534, "loss": 1.204390525817871, "step": 810 }, { "ce_loss": 0.3894767463207245, "epoch": 0.2701801200800534, "step": 810 }, { "distill_loss": 0.52994304895401, "epoch": 0.2701801200800534, "step": 810 }, { "epoch": 0.2701801200800534, "ref_ce_loss": 0.2811121344566345, "step": 810 }, { "epoch": 0.2701801200800534, "loss": 1.4491932392120361, "step": 810 }, { "ce_loss": 0.45897600054740906, "epoch": 0.2701801200800534, "step": 810 }, { "distill_loss": 0.5323480367660522, "epoch": 0.2701801200800534, "step": 810 }, { "epoch": 0.2701801200800534, "ref_ce_loss": 0.26767823100090027, "step": 810 }, { "epoch": 0.2735156771180787, "loss": 1.8356, "step": 820 }, { "epoch": 0.2735156771180787, "grad_norm": 3.123432159423828, "step": 820 }, { "epoch": 0.2735156771180787, "learning_rate": 0.0007288888888888889, "step": 820 }, { "epoch": 0.2735156771180787, "loss": 1.3312506675720215, "step": 820 }, { "ce_loss": 0.43902865052223206, "epoch": 0.2735156771180787, "step": 820 }, { "distill_loss": 0.44022202491760254, "epoch": 0.2735156771180787, "step": 820 }, { "epoch": 0.2735156771180787, "ref_ce_loss": 0.3157406449317932, "step": 820 }, { "epoch": 0.2735156771180787, "loss": 1.2697056531906128, "step": 820 }, { "ce_loss": 0.4552677869796753, "epoch": 0.2735156771180787, "step": 820 }, { "distill_loss": 0.4594164788722992, "epoch": 0.2735156771180787, "step": 820 }, { "epoch": 0.2735156771180787, "ref_ce_loss": 0.28923386335372925, "step": 820 }, { "epoch": 0.2768512341561041, "loss": 1.7456, "step": 830 }, { "epoch": 0.2768512341561041, "grad_norm": 2.2865235805511475, "step": 830 }, { "epoch": 0.2768512341561041, "learning_rate": 0.0007377777777777778, "step": 830 }, { "epoch": 0.2768512341561041, "loss": 1.4067904949188232, "step": 830 }, { "ce_loss": 0.3348756432533264, "epoch": 0.2768512341561041, "step": 830 }, { "distill_loss": 0.6199288368225098, "epoch": 0.2768512341561041, "step": 830 }, { "epoch": 0.2768512341561041, "ref_ce_loss": 0.23431706428527832, "step": 830 }, { "epoch": 0.2768512341561041, "loss": 1.6532933712005615, "step": 830 }, { "ce_loss": 0.5002840757369995, "epoch": 0.2768512341561041, "step": 830 }, { "distill_loss": 0.6605917811393738, "epoch": 0.2768512341561041, "step": 830 }, { "epoch": 0.2768512341561041, "ref_ce_loss": 0.319241464138031, "step": 830 }, { "epoch": 0.2801867911941294, "loss": 1.7239, "step": 840 }, { "epoch": 0.2801867911941294, "grad_norm": 1.7840464115142822, "step": 840 }, { "epoch": 0.2801867911941294, "learning_rate": 0.0007466666666666667, "step": 840 }, { "epoch": 0.2801867911941294, "loss": 1.3670352697372437, "step": 840 }, { "ce_loss": 0.4084605574607849, "epoch": 0.2801867911941294, "step": 840 }, { "distill_loss": 0.5723965764045715, "epoch": 0.2801867911941294, "step": 840 }, { "epoch": 0.2801867911941294, "ref_ce_loss": 0.2615945339202881, "step": 840 }, { "epoch": 0.2801867911941294, "loss": 1.7874099016189575, "step": 840 }, { "ce_loss": 0.42878419160842896, "epoch": 0.2801867911941294, "step": 840 }, { "distill_loss": 0.5446598529815674, "epoch": 0.2801867911941294, "step": 840 }, { "epoch": 0.2801867911941294, "ref_ce_loss": 0.3162139356136322, "step": 840 }, { "epoch": 0.2835223482321548, "loss": 1.6546, "step": 850 }, { "epoch": 0.2835223482321548, "grad_norm": 1.89409601688385, "step": 850 }, { "epoch": 0.2835223482321548, "learning_rate": 0.0007555555555555555, "step": 850 }, { "epoch": 0.2835223482321548, "loss": 1.8647241592407227, "step": 850 }, { "ce_loss": 0.43658173084259033, "epoch": 0.2835223482321548, "step": 850 }, { "distill_loss": 0.694576621055603, "epoch": 0.2835223482321548, "step": 850 }, { "epoch": 0.2835223482321548, "ref_ce_loss": 0.2831249535083771, "step": 850 }, { "epoch": 0.2835223482321548, "loss": 1.6206308603286743, "step": 850 }, { "ce_loss": 0.4423183798789978, "epoch": 0.2835223482321548, "step": 850 }, { "distill_loss": 0.6064574122428894, "epoch": 0.2835223482321548, "step": 850 }, { "epoch": 0.2835223482321548, "ref_ce_loss": 0.2974826395511627, "step": 850 }, { "epoch": 0.2868579052701801, "loss": 1.566, "step": 860 }, { "epoch": 0.2868579052701801, "grad_norm": 2.212101936340332, "step": 860 }, { "epoch": 0.2868579052701801, "learning_rate": 0.0007644444444444445, "step": 860 }, { "epoch": 0.2868579052701801, "loss": 1.5993949174880981, "step": 860 }, { "ce_loss": 0.42447516322135925, "epoch": 0.2868579052701801, "step": 860 }, { "distill_loss": 0.5617659687995911, "epoch": 0.2868579052701801, "step": 860 }, { "epoch": 0.2868579052701801, "ref_ce_loss": 0.2637116611003876, "step": 860 }, { "epoch": 0.2868579052701801, "loss": 1.4178887605667114, "step": 860 }, { "ce_loss": 0.4780910909175873, "epoch": 0.2868579052701801, "step": 860 }, { "distill_loss": 0.5067756772041321, "epoch": 0.2868579052701801, "step": 860 }, { "epoch": 0.2868579052701801, "ref_ce_loss": 0.3166176974773407, "step": 860 }, { "epoch": 0.2901934623082055, "loss": 1.565, "step": 870 }, { "epoch": 0.2901934623082055, "grad_norm": 1.569791316986084, "step": 870 }, { "epoch": 0.2901934623082055, "learning_rate": 0.0007733333333333333, "step": 870 }, { "epoch": 0.2901934623082055, "loss": 2.097165107727051, "step": 870 }, { "ce_loss": 0.507722020149231, "epoch": 0.2901934623082055, "step": 870 }, { "distill_loss": 0.5919543504714966, "epoch": 0.2901934623082055, "step": 870 }, { "epoch": 0.2901934623082055, "ref_ce_loss": 0.3449063301086426, "step": 870 }, { "epoch": 0.2901934623082055, "loss": 1.5407272577285767, "step": 870 }, { "ce_loss": 0.4565356969833374, "epoch": 0.2901934623082055, "step": 870 }, { "distill_loss": 0.5664113163948059, "epoch": 0.2901934623082055, "step": 870 }, { "epoch": 0.2901934623082055, "ref_ce_loss": 0.3351632058620453, "step": 870 }, { "epoch": 0.2935290193462308, "loss": 1.5408, "step": 880 }, { "epoch": 0.2935290193462308, "grad_norm": 1.7846498489379883, "step": 880 }, { "epoch": 0.2935290193462308, "learning_rate": 0.0007822222222222222, "step": 880 }, { "epoch": 0.2935290193462308, "loss": 1.3901203870773315, "step": 880 }, { "ce_loss": 0.4673023819923401, "epoch": 0.2935290193462308, "step": 880 }, { "distill_loss": 0.5273351669311523, "epoch": 0.2935290193462308, "step": 880 }, { "epoch": 0.2935290193462308, "ref_ce_loss": 0.27710044384002686, "step": 880 }, { "epoch": 0.2935290193462308, "loss": 1.1522825956344604, "step": 880 }, { "ce_loss": 0.34825077652931213, "epoch": 0.2935290193462308, "step": 880 }, { "distill_loss": 0.45659923553466797, "epoch": 0.2935290193462308, "step": 880 }, { "epoch": 0.2935290193462308, "ref_ce_loss": 0.24090026319026947, "step": 880 }, { "epoch": 0.2968645763842562, "loss": 1.5535, "step": 890 }, { "epoch": 0.2968645763842562, "grad_norm": 1.74665105342865, "step": 890 }, { "epoch": 0.2968645763842562, "learning_rate": 0.0007911111111111111, "step": 890 }, { "epoch": 0.2968645763842562, "loss": 1.7669625282287598, "step": 890 }, { "ce_loss": 0.5134462714195251, "epoch": 0.2968645763842562, "step": 890 }, { "distill_loss": 0.5571491718292236, "epoch": 0.2968645763842562, "step": 890 }, { "epoch": 0.2968645763842562, "ref_ce_loss": 0.35539567470550537, "step": 890 }, { "epoch": 0.2968645763842562, "loss": 1.4223843812942505, "step": 890 }, { "ce_loss": 0.5171000361442566, "epoch": 0.2968645763842562, "step": 890 }, { "distill_loss": 0.5794808864593506, "epoch": 0.2968645763842562, "step": 890 }, { "epoch": 0.2968645763842562, "ref_ce_loss": 0.32578277587890625, "step": 890 }, { "epoch": 0.3002001334222815, "loss": 1.5978, "step": 900 }, { "epoch": 0.3002001334222815, "grad_norm": 2.970160484313965, "step": 900 }, { "epoch": 0.3002001334222815, "learning_rate": 0.0008, "step": 900 }, { "epoch": 0.3002001334222815, "loss": 1.1846829652786255, "step": 900 }, { "ce_loss": 0.4285282790660858, "epoch": 0.3002001334222815, "step": 900 }, { "distill_loss": 0.4960606098175049, "epoch": 0.3002001334222815, "step": 900 }, { "epoch": 0.3002001334222815, "ref_ce_loss": 0.25980421900749207, "step": 900 }, { "epoch": 0.3002001334222815, "loss": 1.5200297832489014, "step": 900 }, { "ce_loss": 0.5215058922767639, "epoch": 0.3002001334222815, "step": 900 }, { "distill_loss": 0.5218350291252136, "epoch": 0.3002001334222815, "step": 900 }, { "epoch": 0.3002001334222815, "ref_ce_loss": 0.3791854977607727, "step": 900 }, { "epoch": 0.3035356904603069, "loss": 1.4915, "step": 910 }, { "epoch": 0.3035356904603069, "grad_norm": 1.6637479066848755, "step": 910 }, { "epoch": 0.3035356904603069, "learning_rate": 0.0007999997665784792, "step": 910 }, { "epoch": 0.3035356904603069, "loss": 1.5741530656814575, "step": 910 }, { "ce_loss": 0.42140352725982666, "epoch": 0.3035356904603069, "step": 910 }, { "distill_loss": 0.5193475484848022, "epoch": 0.3035356904603069, "step": 910 }, { "epoch": 0.3035356904603069, "ref_ce_loss": 0.26985877752304077, "step": 910 }, { "epoch": 0.3035356904603069, "loss": 1.433876395225525, "step": 910 }, { "ce_loss": 0.40475451946258545, "epoch": 0.3035356904603069, "step": 910 }, { "distill_loss": 0.5959672927856445, "epoch": 0.3035356904603069, "step": 910 }, { "epoch": 0.3035356904603069, "ref_ce_loss": 0.224575012922287, "step": 910 }, { "epoch": 0.3068712474983322, "loss": 1.6494, "step": 920 }, { "epoch": 0.3068712474983322, "grad_norm": 2.6223065853118896, "step": 920 }, { "epoch": 0.3068712474983322, "learning_rate": 0.0007999990663141889, "step": 920 }, { "epoch": 0.3068712474983322, "loss": 1.777024745941162, "step": 920 }, { "ce_loss": 0.5548272728919983, "epoch": 0.3068712474983322, "step": 920 }, { "distill_loss": 0.6639838218688965, "epoch": 0.3068712474983322, "step": 920 }, { "epoch": 0.3068712474983322, "ref_ce_loss": 0.28127968311309814, "step": 920 }, { "epoch": 0.3068712474983322, "loss": 1.6267937421798706, "step": 920 }, { "ce_loss": 0.5212595462799072, "epoch": 0.3068712474983322, "step": 920 }, { "distill_loss": 0.6798348426818848, "epoch": 0.3068712474983322, "step": 920 }, { "epoch": 0.3068712474983322, "ref_ce_loss": 0.29340386390686035, "step": 920 }, { "epoch": 0.31020680453635757, "loss": 1.7391, "step": 930 }, { "epoch": 0.31020680453635757, "grad_norm": 2.6510393619537354, "step": 930 }, { "epoch": 0.31020680453635757, "learning_rate": 0.0007999978992079467, "step": 930 }, { "epoch": 0.31020680453635757, "loss": 1.4878954887390137, "step": 930 }, { "ce_loss": 0.5326961874961853, "epoch": 0.31020680453635757, "step": 930 }, { "distill_loss": 0.5999663472175598, "epoch": 0.31020680453635757, "step": 930 }, { "epoch": 0.31020680453635757, "ref_ce_loss": 0.3550073802471161, "step": 930 }, { "epoch": 0.31020680453635757, "loss": 1.464622974395752, "step": 930 }, { "ce_loss": 0.4628712832927704, "epoch": 0.31020680453635757, "step": 930 }, { "distill_loss": 0.6238007545471191, "epoch": 0.31020680453635757, "step": 930 }, { "epoch": 0.31020680453635757, "ref_ce_loss": 0.27657976746559143, "step": 930 }, { "epoch": 0.3135423615743829, "loss": 1.6287, "step": 940 }, { "epoch": 0.3135423615743829, "grad_norm": 2.517056465148926, "step": 940 }, { "epoch": 0.3135423615743829, "learning_rate": 0.0007999962652611144, "step": 940 }, { "epoch": 0.3135423615743829, "loss": 1.8964029550552368, "step": 940 }, { "ce_loss": 0.4931149482727051, "epoch": 0.3135423615743829, "step": 940 }, { "distill_loss": 0.6298879384994507, "epoch": 0.3135423615743829, "step": 940 }, { "epoch": 0.3135423615743829, "ref_ce_loss": 0.33834245800971985, "step": 940 }, { "epoch": 0.3135423615743829, "loss": 1.7234830856323242, "step": 940 }, { "ce_loss": 0.42898115515708923, "epoch": 0.3135423615743829, "step": 940 }, { "distill_loss": 0.6838112473487854, "epoch": 0.3135423615743829, "step": 940 }, { "epoch": 0.3135423615743829, "ref_ce_loss": 0.24234764277935028, "step": 940 }, { "epoch": 0.31687791861240827, "loss": 1.6292, "step": 950 }, { "epoch": 0.31687791861240827, "grad_norm": 2.1587040424346924, "step": 950 }, { "epoch": 0.31687791861240827, "learning_rate": 0.0007999941644755992, "step": 950 }, { "epoch": 0.31687791861240827, "loss": 1.4840506315231323, "step": 950 }, { "ce_loss": 0.46642470359802246, "epoch": 0.31687791861240827, "step": 950 }, { "distill_loss": 0.5663684606552124, "epoch": 0.31687791861240827, "step": 950 }, { "epoch": 0.31687791861240827, "ref_ce_loss": 0.3459374010562897, "step": 950 }, { "epoch": 0.31687791861240827, "loss": 2.314152240753174, "step": 950 }, { "ce_loss": 0.4551185369491577, "epoch": 0.31687791861240827, "step": 950 }, { "distill_loss": 0.5574811100959778, "epoch": 0.31687791861240827, "step": 950 }, { "epoch": 0.31687791861240827, "ref_ce_loss": 0.2758069634437561, "step": 950 }, { "epoch": 0.3202134756504336, "loss": 1.7307, "step": 960 }, { "epoch": 0.3202134756504336, "grad_norm": 1.7937731742858887, "step": 960 }, { "epoch": 0.3202134756504336, "learning_rate": 0.0007999915968538529, "step": 960 }, { "epoch": 0.3202134756504336, "loss": 1.4508087635040283, "step": 960 }, { "ce_loss": 0.501089870929718, "epoch": 0.3202134756504336, "step": 960 }, { "distill_loss": 0.5429267883300781, "epoch": 0.3202134756504336, "step": 960 }, { "epoch": 0.3202134756504336, "ref_ce_loss": 0.4066421389579773, "step": 960 }, { "epoch": 0.3202134756504336, "loss": 1.428647756576538, "step": 960 }, { "ce_loss": 0.4056423008441925, "epoch": 0.3202134756504336, "step": 960 }, { "distill_loss": 0.6060864925384521, "epoch": 0.3202134756504336, "step": 960 }, { "epoch": 0.3202134756504336, "ref_ce_loss": 0.28116607666015625, "step": 960 }, { "epoch": 0.32354903268845897, "loss": 1.5632, "step": 970 }, { "epoch": 0.32354903268845897, "grad_norm": 1.5581204891204834, "step": 970 }, { "epoch": 0.32354903268845897, "learning_rate": 0.0007999885623988721, "step": 970 }, { "epoch": 0.32354903268845897, "loss": 1.5437523126602173, "step": 970 }, { "ce_loss": 0.5129994750022888, "epoch": 0.32354903268845897, "step": 970 }, { "distill_loss": 0.6591445207595825, "epoch": 0.32354903268845897, "step": 970 }, { "epoch": 0.32354903268845897, "ref_ce_loss": 0.26789551973342896, "step": 970 }, { "epoch": 0.32354903268845897, "loss": 1.9831459522247314, "step": 970 }, { "ce_loss": 0.46284744143486023, "epoch": 0.32354903268845897, "step": 970 }, { "distill_loss": 0.5761644840240479, "epoch": 0.32354903268845897, "step": 970 }, { "epoch": 0.32354903268845897, "ref_ce_loss": 0.34350600838661194, "step": 970 }, { "epoch": 0.3268845897264843, "loss": 1.6733, "step": 980 }, { "epoch": 0.3268845897264843, "grad_norm": 2.113006830215454, "step": 980 }, { "epoch": 0.3268845897264843, "learning_rate": 0.0007999850611141987, "step": 980 }, { "epoch": 0.3268845897264843, "loss": 1.4288175106048584, "step": 980 }, { "ce_loss": 0.4255240559577942, "epoch": 0.3268845897264843, "step": 980 }, { "distill_loss": 0.5245460271835327, "epoch": 0.3268845897264843, "step": 980 }, { "epoch": 0.3268845897264843, "ref_ce_loss": 0.32837823033332825, "step": 980 }, { "epoch": 0.3268845897264843, "loss": 1.5835850238800049, "step": 980 }, { "ce_loss": 0.509605348110199, "epoch": 0.3268845897264843, "step": 980 }, { "distill_loss": 0.521268367767334, "epoch": 0.3268845897264843, "step": 980 }, { "epoch": 0.3268845897264843, "ref_ce_loss": 0.34805089235305786, "step": 980 }, { "epoch": 0.33022014676450967, "loss": 1.4216, "step": 990 }, { "epoch": 0.33022014676450967, "grad_norm": 2.097587823867798, "step": 990 }, { "epoch": 0.33022014676450967, "learning_rate": 0.0007999810930039185, "step": 990 }, { "epoch": 0.33022014676450967, "loss": 1.6663522720336914, "step": 990 }, { "ce_loss": 0.4322049021720886, "epoch": 0.33022014676450967, "step": 990 }, { "distill_loss": 0.5957827568054199, "epoch": 0.33022014676450967, "step": 990 }, { "epoch": 0.33022014676450967, "ref_ce_loss": 0.2946665287017822, "step": 990 }, { "epoch": 0.33022014676450967, "loss": 1.6669708490371704, "step": 990 }, { "ce_loss": 0.528470516204834, "epoch": 0.33022014676450967, "step": 990 }, { "distill_loss": 0.6283411979675293, "epoch": 0.33022014676450967, "step": 990 }, { "epoch": 0.33022014676450967, "ref_ce_loss": 0.3029085397720337, "step": 990 }, { "epoch": 0.333555703802535, "loss": 1.669, "step": 1000 }, { "epoch": 0.333555703802535, "grad_norm": 4.047603607177734, "step": 1000 }, { "epoch": 0.333555703802535, "learning_rate": 0.0007999766580726633, "step": 1000 }, { "epoch": 0.333555703802535, "loss": 1.288991928100586, "step": 1000 }, { "ce_loss": 0.3834209442138672, "epoch": 0.333555703802535, "step": 1000 }, { "distill_loss": 0.4667183458805084, "epoch": 0.333555703802535, "step": 1000 }, { "epoch": 0.333555703802535, "ref_ce_loss": 0.3061845302581787, "step": 1000 }, { "epoch": 0.333555703802535, "loss": 1.3131588697433472, "step": 1000 }, { "ce_loss": 0.42746010422706604, "epoch": 0.333555703802535, "step": 1000 }, { "distill_loss": 0.5145797729492188, "epoch": 0.333555703802535, "step": 1000 }, { "epoch": 0.333555703802535, "ref_ce_loss": 0.26582035422325134, "step": 1000 }, { "epoch": 0.33689126084056037, "loss": 1.5305, "step": 1010 }, { "epoch": 0.33689126084056037, "grad_norm": 1.7009576559066772, "step": 1010 }, { "epoch": 0.33689126084056037, "learning_rate": 0.0007999717563256087, "step": 1010 }, { "epoch": 0.33689126084056037, "loss": 1.335223913192749, "step": 1010 }, { "ce_loss": 0.42706549167633057, "epoch": 0.33689126084056037, "step": 1010 }, { "distill_loss": 0.5178505778312683, "epoch": 0.33689126084056037, "step": 1010 }, { "epoch": 0.33689126084056037, "ref_ce_loss": 0.27624693512916565, "step": 1010 }, { "epoch": 0.33689126084056037, "loss": 1.4561131000518799, "step": 1010 }, { "ce_loss": 0.46970024704933167, "epoch": 0.33689126084056037, "step": 1010 }, { "distill_loss": 0.5584217309951782, "epoch": 0.33689126084056037, "step": 1010 }, { "epoch": 0.33689126084056037, "ref_ce_loss": 0.3316168785095215, "step": 1010 }, { "epoch": 0.3402268178785857, "loss": 1.5415, "step": 1020 }, { "epoch": 0.3402268178785857, "grad_norm": 1.8947089910507202, "step": 1020 }, { "epoch": 0.3402268178785857, "learning_rate": 0.0007999663877684757, "step": 1020 }, { "epoch": 0.3402268178785857, "loss": 1.2597054243087769, "step": 1020 }, { "ce_loss": 0.39339420199394226, "epoch": 0.3402268178785857, "step": 1020 }, { "distill_loss": 0.5205109715461731, "epoch": 0.3402268178785857, "step": 1020 }, { "epoch": 0.3402268178785857, "ref_ce_loss": 0.2417498379945755, "step": 1020 }, { "epoch": 0.3402268178785857, "loss": 1.2326630353927612, "step": 1020 }, { "ce_loss": 0.41173839569091797, "epoch": 0.3402268178785857, "step": 1020 }, { "distill_loss": 0.5313530564308167, "epoch": 0.3402268178785857, "step": 1020 }, { "epoch": 0.3402268178785857, "ref_ce_loss": 0.289476215839386, "step": 1020 }, { "epoch": 0.34356237491661107, "loss": 1.495, "step": 1030 }, { "epoch": 0.34356237491661107, "grad_norm": 1.6055141687393188, "step": 1030 }, { "epoch": 0.34356237491661107, "learning_rate": 0.0007999605524075302, "step": 1030 }, { "epoch": 0.34356237491661107, "loss": 1.3744783401489258, "step": 1030 }, { "ce_loss": 0.4202525019645691, "epoch": 0.34356237491661107, "step": 1030 }, { "distill_loss": 0.5212565064430237, "epoch": 0.34356237491661107, "step": 1030 }, { "epoch": 0.34356237491661107, "ref_ce_loss": 0.3207845687866211, "step": 1030 }, { "epoch": 0.34356237491661107, "loss": 1.725656270980835, "step": 1030 }, { "ce_loss": 0.41157498955726624, "epoch": 0.34356237491661107, "step": 1030 }, { "distill_loss": 0.6025432348251343, "epoch": 0.34356237491661107, "step": 1030 }, { "epoch": 0.34356237491661107, "ref_ce_loss": 0.2683902084827423, "step": 1030 }, { "epoch": 0.3468979319546364, "loss": 1.6478, "step": 1040 }, { "epoch": 0.3468979319546364, "grad_norm": 1.7481364011764526, "step": 1040 }, { "epoch": 0.3468979319546364, "learning_rate": 0.0007999542502495823, "step": 1040 }, { "epoch": 0.3468979319546364, "loss": 1.301788330078125, "step": 1040 }, { "ce_loss": 0.5349807739257812, "epoch": 0.3468979319546364, "step": 1040 }, { "distill_loss": 0.44870883226394653, "epoch": 0.3468979319546364, "step": 1040 }, { "epoch": 0.3468979319546364, "ref_ce_loss": 0.3173947036266327, "step": 1040 }, { "epoch": 0.3468979319546364, "loss": 1.5995664596557617, "step": 1040 }, { "ce_loss": 0.4531072974205017, "epoch": 0.3468979319546364, "step": 1040 }, { "distill_loss": 0.44370460510253906, "epoch": 0.3468979319546364, "step": 1040 }, { "epoch": 0.3468979319546364, "ref_ce_loss": 0.2895096242427826, "step": 1040 }, { "epoch": 0.35023348899266177, "loss": 1.6017, "step": 1050 }, { "epoch": 0.35023348899266177, "grad_norm": 1.4346905946731567, "step": 1050 }, { "epoch": 0.35023348899266177, "learning_rate": 0.0007999474813019875, "step": 1050 }, { "epoch": 0.35023348899266177, "loss": 1.5935039520263672, "step": 1050 }, { "ce_loss": 0.5919435620307922, "epoch": 0.35023348899266177, "step": 1050 }, { "distill_loss": 0.5254257321357727, "epoch": 0.35023348899266177, "step": 1050 }, { "epoch": 0.35023348899266177, "ref_ce_loss": 0.3546173572540283, "step": 1050 }, { "epoch": 0.35023348899266177, "loss": 1.3503769636154175, "step": 1050 }, { "ce_loss": 0.5056501030921936, "epoch": 0.35023348899266177, "step": 1050 }, { "distill_loss": 0.45085012912750244, "epoch": 0.35023348899266177, "step": 1050 }, { "epoch": 0.35023348899266177, "ref_ce_loss": 0.3113629221916199, "step": 1050 }, { "epoch": 0.3535690460306871, "loss": 1.5644, "step": 1060 }, { "epoch": 0.3535690460306871, "grad_norm": 2.372422456741333, "step": 1060 }, { "epoch": 0.3535690460306871, "learning_rate": 0.000799940245572646, "step": 1060 }, { "epoch": 0.3535690460306871, "loss": 1.4446749687194824, "step": 1060 }, { "ce_loss": 0.4489143192768097, "epoch": 0.3535690460306871, "step": 1060 }, { "distill_loss": 0.6644142270088196, "epoch": 0.3535690460306871, "step": 1060 }, { "epoch": 0.3535690460306871, "ref_ce_loss": 0.22634552419185638, "step": 1060 }, { "epoch": 0.3535690460306871, "loss": 1.4582650661468506, "step": 1060 }, { "ce_loss": 0.40130969882011414, "epoch": 0.3535690460306871, "step": 1060 }, { "distill_loss": 0.5357669591903687, "epoch": 0.3535690460306871, "step": 1060 }, { "epoch": 0.3535690460306871, "ref_ce_loss": 0.3189239799976349, "step": 1060 }, { "epoch": 0.35690460306871247, "loss": 1.5684, "step": 1070 }, { "epoch": 0.35690460306871247, "grad_norm": 1.7825205326080322, "step": 1070 }, { "epoch": 0.35690460306871247, "learning_rate": 0.0007999325430700026, "step": 1070 }, { "epoch": 0.35690460306871247, "loss": 1.6820989847183228, "step": 1070 }, { "ce_loss": 0.5786712765693665, "epoch": 0.35690460306871247, "step": 1070 }, { "distill_loss": 0.608523428440094, "epoch": 0.35690460306871247, "step": 1070 }, { "epoch": 0.35690460306871247, "ref_ce_loss": 0.40291449427604675, "step": 1070 }, { "epoch": 0.35690460306871247, "loss": 1.5814471244812012, "step": 1070 }, { "ce_loss": 0.5588512420654297, "epoch": 0.35690460306871247, "step": 1070 }, { "distill_loss": 0.6993938684463501, "epoch": 0.35690460306871247, "step": 1070 }, { "epoch": 0.35690460306871247, "ref_ce_loss": 0.31999915838241577, "step": 1070 }, { "epoch": 0.3602401601067378, "loss": 1.6031, "step": 1080 }, { "epoch": 0.3602401601067378, "grad_norm": 2.025482177734375, "step": 1080 }, { "epoch": 0.3602401601067378, "learning_rate": 0.0007999243738030467, "step": 1080 }, { "epoch": 0.3602401601067378, "loss": 1.284332036972046, "step": 1080 }, { "ce_loss": 0.39245760440826416, "epoch": 0.3602401601067378, "step": 1080 }, { "distill_loss": 0.4988771677017212, "epoch": 0.3602401601067378, "step": 1080 }, { "epoch": 0.3602401601067378, "ref_ce_loss": 0.28330400586128235, "step": 1080 }, { "epoch": 0.3602401601067378, "loss": 1.3760321140289307, "step": 1080 }, { "ce_loss": 0.46022656559944153, "epoch": 0.3602401601067378, "step": 1080 }, { "distill_loss": 0.5289139747619629, "epoch": 0.3602401601067378, "step": 1080 }, { "epoch": 0.3602401601067378, "ref_ce_loss": 0.28009310364723206, "step": 1080 }, { "epoch": 0.36357571714476317, "loss": 1.5647, "step": 1090 }, { "epoch": 0.36357571714476317, "grad_norm": 1.46823251247406, "step": 1090 }, { "epoch": 0.36357571714476317, "learning_rate": 0.0007999157377813131, "step": 1090 }, { "epoch": 0.36357571714476317, "loss": 1.3993898630142212, "step": 1090 }, { "ce_loss": 0.45543530583381653, "epoch": 0.36357571714476317, "step": 1090 }, { "distill_loss": 0.5258496999740601, "epoch": 0.36357571714476317, "step": 1090 }, { "epoch": 0.36357571714476317, "ref_ce_loss": 0.31005752086639404, "step": 1090 }, { "epoch": 0.36357571714476317, "loss": 1.3657909631729126, "step": 1090 }, { "ce_loss": 0.41977930068969727, "epoch": 0.36357571714476317, "step": 1090 }, { "distill_loss": 0.5256907939910889, "epoch": 0.36357571714476317, "step": 1090 }, { "epoch": 0.36357571714476317, "ref_ce_loss": 0.31721067428588867, "step": 1090 }, { "epoch": 0.3669112741827885, "loss": 1.6111, "step": 1100 }, { "epoch": 0.3669112741827885, "grad_norm": 8.720490455627441, "step": 1100 }, { "epoch": 0.3669112741827885, "learning_rate": 0.0007999066350148808, "step": 1100 }, { "epoch": 0.3669112741827885, "loss": 1.4530463218688965, "step": 1100 }, { "ce_loss": 0.4756782054901123, "epoch": 0.3669112741827885, "step": 1100 }, { "distill_loss": 0.6375422477722168, "epoch": 0.3669112741827885, "step": 1100 }, { "epoch": 0.3669112741827885, "ref_ce_loss": 0.3396244943141937, "step": 1100 }, { "epoch": 0.3669112741827885, "loss": 1.5000605583190918, "step": 1100 }, { "ce_loss": 0.42159199714660645, "epoch": 0.3669112741827885, "step": 1100 }, { "distill_loss": 0.6526952981948853, "epoch": 0.3669112741827885, "step": 1100 }, { "epoch": 0.3669112741827885, "ref_ce_loss": 0.25416651368141174, "step": 1100 }, { "epoch": 0.37024683122081387, "loss": 1.5306, "step": 1110 }, { "epoch": 0.37024683122081387, "grad_norm": 3.5046091079711914, "step": 1110 }, { "epoch": 0.37024683122081387, "learning_rate": 0.0007998970655143737, "step": 1110 }, { "epoch": 0.37024683122081387, "loss": 1.6367322206497192, "step": 1110 }, { "ce_loss": 0.5216576457023621, "epoch": 0.37024683122081387, "step": 1110 }, { "distill_loss": 0.7075483202934265, "epoch": 0.37024683122081387, "step": 1110 }, { "epoch": 0.37024683122081387, "ref_ce_loss": 0.2824236750602722, "step": 1110 }, { "epoch": 0.37024683122081387, "loss": 2.0098891258239746, "step": 1110 }, { "ce_loss": 0.48691219091415405, "epoch": 0.37024683122081387, "step": 1110 }, { "distill_loss": 0.7101386189460754, "epoch": 0.37024683122081387, "step": 1110 }, { "epoch": 0.37024683122081387, "ref_ce_loss": 0.26211676001548767, "step": 1110 }, { "epoch": 0.3735823882588392, "loss": 1.6648, "step": 1120 }, { "epoch": 0.3735823882588392, "grad_norm": 1.5141630172729492, "step": 1120 }, { "epoch": 0.3735823882588392, "learning_rate": 0.0007998870292909604, "step": 1120 }, { "epoch": 0.3735823882588392, "loss": 1.2798373699188232, "step": 1120 }, { "ce_loss": 0.3349030017852783, "epoch": 0.3735823882588392, "step": 1120 }, { "distill_loss": 0.49389535188674927, "epoch": 0.3735823882588392, "step": 1120 }, { "epoch": 0.3735823882588392, "ref_ce_loss": 0.25652116537094116, "step": 1120 }, { "epoch": 0.3735823882588392, "loss": 1.777625322341919, "step": 1120 }, { "ce_loss": 0.48351800441741943, "epoch": 0.3735823882588392, "step": 1120 }, { "distill_loss": 0.576139509677887, "epoch": 0.3735823882588392, "step": 1120 }, { "epoch": 0.3735823882588392, "ref_ce_loss": 0.32994407415390015, "step": 1120 }, { "epoch": 0.37691794529686456, "loss": 1.5918, "step": 1130 }, { "epoch": 0.37691794529686456, "grad_norm": 1.6688812971115112, "step": 1130 }, { "epoch": 0.37691794529686456, "learning_rate": 0.0007998765263563544, "step": 1130 }, { "epoch": 0.37691794529686456, "loss": 2.2887845039367676, "step": 1130 }, { "ce_loss": 0.48430436849594116, "epoch": 0.37691794529686456, "step": 1130 }, { "distill_loss": 0.5467234253883362, "epoch": 0.37691794529686456, "step": 1130 }, { "epoch": 0.37691794529686456, "ref_ce_loss": 0.3000369966030121, "step": 1130 }, { "epoch": 0.37691794529686456, "loss": 1.4230353832244873, "step": 1130 }, { "ce_loss": 0.38922080397605896, "epoch": 0.37691794529686456, "step": 1130 }, { "distill_loss": 0.5107974410057068, "epoch": 0.37691794529686456, "step": 1130 }, { "epoch": 0.37691794529686456, "ref_ce_loss": 0.2696387469768524, "step": 1130 }, { "epoch": 0.3802535023348899, "loss": 1.615, "step": 1140 }, { "epoch": 0.3802535023348899, "grad_norm": 2.4678921699523926, "step": 1140 }, { "epoch": 0.3802535023348899, "learning_rate": 0.0007998655567228134, "step": 1140 }, { "epoch": 0.3802535023348899, "loss": 1.6331299543380737, "step": 1140 }, { "ce_loss": 0.4924895763397217, "epoch": 0.3802535023348899, "step": 1140 }, { "distill_loss": 0.5652615427970886, "epoch": 0.3802535023348899, "step": 1140 }, { "epoch": 0.3802535023348899, "ref_ce_loss": 0.3399295210838318, "step": 1140 }, { "epoch": 0.3802535023348899, "loss": 1.5238920450210571, "step": 1140 }, { "ce_loss": 0.4449857771396637, "epoch": 0.3802535023348899, "step": 1140 }, { "distill_loss": 0.5468964576721191, "epoch": 0.3802535023348899, "step": 1140 }, { "epoch": 0.3802535023348899, "ref_ce_loss": 0.31205645203590393, "step": 1140 }, { "epoch": 0.38358905937291526, "loss": 1.4915, "step": 1150 }, { "epoch": 0.38358905937291526, "grad_norm": 1.4147001504898071, "step": 1150 }, { "epoch": 0.38358905937291526, "learning_rate": 0.0007998541204031406, "step": 1150 }, { "epoch": 0.38358905937291526, "loss": 1.9572725296020508, "step": 1150 }, { "ce_loss": 0.4233904480934143, "epoch": 0.38358905937291526, "step": 1150 }, { "distill_loss": 0.5421593189239502, "epoch": 0.38358905937291526, "step": 1150 }, { "epoch": 0.38358905937291526, "ref_ce_loss": 0.2687678337097168, "step": 1150 }, { "epoch": 0.38358905937291526, "loss": 1.4173822402954102, "step": 1150 }, { "ce_loss": 0.37679892778396606, "epoch": 0.38358905937291526, "step": 1150 }, { "distill_loss": 0.531260073184967, "epoch": 0.38358905937291526, "step": 1150 }, { "epoch": 0.38358905937291526, "ref_ce_loss": 0.25782090425491333, "step": 1150 }, { "epoch": 0.3869246164109406, "loss": 1.6581, "step": 1160 }, { "epoch": 0.3869246164109406, "grad_norm": 1.9394618272781372, "step": 1160 }, { "epoch": 0.3869246164109406, "learning_rate": 0.0007998422174106831, "step": 1160 }, { "epoch": 0.3869246164109406, "loss": 1.4838085174560547, "step": 1160 }, { "ce_loss": 0.46089208126068115, "epoch": 0.3869246164109406, "step": 1160 }, { "distill_loss": 0.4678708612918854, "epoch": 0.3869246164109406, "step": 1160 }, { "epoch": 0.3869246164109406, "ref_ce_loss": 0.2929135859012604, "step": 1160 }, { "epoch": 0.3869246164109406, "loss": 1.532386302947998, "step": 1160 }, { "ce_loss": 0.3994285762310028, "epoch": 0.3869246164109406, "step": 1160 }, { "distill_loss": 0.5381571054458618, "epoch": 0.3869246164109406, "step": 1160 }, { "epoch": 0.3869246164109406, "ref_ce_loss": 0.26682382822036743, "step": 1160 }, { "epoch": 0.39026017344896596, "loss": 1.6421, "step": 1170 }, { "epoch": 0.39026017344896596, "grad_norm": 1.5442798137664795, "step": 1170 }, { "epoch": 0.39026017344896596, "learning_rate": 0.0007998298477593331, "step": 1170 }, { "epoch": 0.39026017344896596, "loss": 1.495753288269043, "step": 1170 }, { "ce_loss": 0.3334011733531952, "epoch": 0.39026017344896596, "step": 1170 }, { "distill_loss": 0.4416235089302063, "epoch": 0.39026017344896596, "step": 1170 }, { "epoch": 0.39026017344896596, "ref_ce_loss": 0.2266778200864792, "step": 1170 }, { "epoch": 0.39026017344896596, "loss": 1.2930865287780762, "step": 1170 }, { "ce_loss": 0.3840572237968445, "epoch": 0.39026017344896596, "step": 1170 }, { "distill_loss": 0.5391579866409302, "epoch": 0.39026017344896596, "step": 1170 }, { "epoch": 0.39026017344896596, "ref_ce_loss": 0.2568325996398926, "step": 1170 }, { "epoch": 0.3935957304869913, "loss": 1.6158, "step": 1180 }, { "epoch": 0.3935957304869913, "grad_norm": 2.4269447326660156, "step": 1180 }, { "epoch": 0.3935957304869913, "learning_rate": 0.0007998170114635274, "step": 1180 }, { "epoch": 0.3935957304869913, "loss": 1.4814577102661133, "step": 1180 }, { "ce_loss": 0.4510352611541748, "epoch": 0.3935957304869913, "step": 1180 }, { "distill_loss": 0.5262930393218994, "epoch": 0.3935957304869913, "step": 1180 }, { "epoch": 0.3935957304869913, "ref_ce_loss": 0.3344971835613251, "step": 1180 }, { "epoch": 0.3935957304869913, "loss": 1.261969804763794, "step": 1180 }, { "ce_loss": 0.4206009805202484, "epoch": 0.3935957304869913, "step": 1180 }, { "distill_loss": 0.5127648115158081, "epoch": 0.3935957304869913, "step": 1180 }, { "epoch": 0.3935957304869913, "ref_ce_loss": 0.3275764584541321, "step": 1180 }, { "epoch": 0.39693128752501666, "loss": 1.5864, "step": 1190 }, { "epoch": 0.39693128752501666, "grad_norm": 1.4139463901519775, "step": 1190 }, { "epoch": 0.39693128752501666, "learning_rate": 0.0007998037085382471, "step": 1190 }, { "epoch": 0.39693128752501666, "loss": 2.3221168518066406, "step": 1190 }, { "ce_loss": 0.43068450689315796, "epoch": 0.39693128752501666, "step": 1190 }, { "distill_loss": 0.6078692078590393, "epoch": 0.39693128752501666, "step": 1190 }, { "epoch": 0.39693128752501666, "ref_ce_loss": 0.27681607007980347, "step": 1190 }, { "epoch": 0.39693128752501666, "loss": 1.539637804031372, "step": 1190 }, { "ce_loss": 0.45647120475769043, "epoch": 0.39693128752501666, "step": 1190 }, { "distill_loss": 0.6491154432296753, "epoch": 0.39693128752501666, "step": 1190 }, { "epoch": 0.39693128752501666, "ref_ce_loss": 0.2978929281234741, "step": 1190 }, { "epoch": 0.400266844563042, "loss": 1.531, "step": 1200 }, { "epoch": 0.400266844563042, "grad_norm": 1.8300831317901611, "step": 1200 }, { "epoch": 0.400266844563042, "learning_rate": 0.0007997899389990183, "step": 1200 }, { "epoch": 0.400266844563042, "loss": 1.3315988779067993, "step": 1200 }, { "ce_loss": 0.4322234094142914, "epoch": 0.400266844563042, "step": 1200 }, { "distill_loss": 0.4977712333202362, "epoch": 0.400266844563042, "step": 1200 }, { "epoch": 0.400266844563042, "ref_ce_loss": 0.2532905042171478, "step": 1200 }, { "epoch": 0.400266844563042, "loss": 1.5709469318389893, "step": 1200 }, { "ce_loss": 0.4227243661880493, "epoch": 0.400266844563042, "step": 1200 }, { "distill_loss": 0.4741690158843994, "epoch": 0.400266844563042, "step": 1200 }, { "epoch": 0.400266844563042, "ref_ce_loss": 0.33167847990989685, "step": 1200 }, { "epoch": 0.40360240160106736, "loss": 1.5606, "step": 1210 }, { "epoch": 0.40360240160106736, "grad_norm": 2.384575366973877, "step": 1210 }, { "epoch": 0.40360240160106736, "learning_rate": 0.0007997757028619115, "step": 1210 }, { "epoch": 0.40360240160106736, "loss": 1.5894966125488281, "step": 1210 }, { "ce_loss": 0.45032036304473877, "epoch": 0.40360240160106736, "step": 1210 }, { "distill_loss": 0.6991457343101501, "epoch": 0.40360240160106736, "step": 1210 }, { "epoch": 0.40360240160106736, "ref_ce_loss": 0.2911129295825958, "step": 1210 }, { "epoch": 0.40360240160106736, "loss": 1.957366704940796, "step": 1210 }, { "ce_loss": 0.4575033485889435, "epoch": 0.40360240160106736, "step": 1210 }, { "distill_loss": 0.5383639335632324, "epoch": 0.40360240160106736, "step": 1210 }, { "epoch": 0.40360240160106736, "ref_ce_loss": 0.28521963953971863, "step": 1210 }, { "epoch": 0.4069379586390927, "loss": 1.7073, "step": 1220 }, { "epoch": 0.4069379586390927, "grad_norm": 1.6603896617889404, "step": 1220 }, { "epoch": 0.4069379586390927, "learning_rate": 0.0007997610001435419, "step": 1220 }, { "epoch": 0.4069379586390927, "loss": 1.9320708513259888, "step": 1220 }, { "ce_loss": 0.4879373610019684, "epoch": 0.4069379586390927, "step": 1220 }, { "distill_loss": 0.4297964572906494, "epoch": 0.4069379586390927, "step": 1220 }, { "epoch": 0.4069379586390927, "ref_ce_loss": 0.36906328797340393, "step": 1220 }, { "epoch": 0.4069379586390927, "loss": 1.1487407684326172, "step": 1220 }, { "ce_loss": 0.41289323568344116, "epoch": 0.4069379586390927, "step": 1220 }, { "distill_loss": 0.41345977783203125, "epoch": 0.4069379586390927, "step": 1220 }, { "epoch": 0.4069379586390927, "ref_ce_loss": 0.3178797960281372, "step": 1220 }, { "epoch": 0.41027351567711806, "loss": 1.5026, "step": 1230 }, { "epoch": 0.41027351567711806, "grad_norm": 1.7940484285354614, "step": 1230 }, { "epoch": 0.41027351567711806, "learning_rate": 0.000799745830861069, "step": 1230 }, { "epoch": 0.41027351567711806, "loss": 1.5663365125656128, "step": 1230 }, { "ce_loss": 0.4955677092075348, "epoch": 0.41027351567711806, "step": 1230 }, { "distill_loss": 0.6290001273155212, "epoch": 0.41027351567711806, "step": 1230 }, { "epoch": 0.41027351567711806, "ref_ce_loss": 0.33181193470954895, "step": 1230 }, { "epoch": 0.41027351567711806, "loss": 1.218145489692688, "step": 1230 }, { "ce_loss": 0.37379351258277893, "epoch": 0.41027351567711806, "step": 1230 }, { "distill_loss": 0.48004651069641113, "epoch": 0.41027351567711806, "step": 1230 }, { "epoch": 0.41027351567711806, "ref_ce_loss": 0.27316486835479736, "step": 1230 }, { "epoch": 0.4136090727151434, "loss": 1.5787, "step": 1240 }, { "epoch": 0.4136090727151434, "grad_norm": 2.2292141914367676, "step": 1240 }, { "epoch": 0.4136090727151434, "learning_rate": 0.0007997301950321971, "step": 1240 }, { "epoch": 0.4136090727151434, "loss": 1.3395863771438599, "step": 1240 }, { "ce_loss": 0.3790244460105896, "epoch": 0.4136090727151434, "step": 1240 }, { "distill_loss": 0.47209668159484863, "epoch": 0.4136090727151434, "step": 1240 }, { "epoch": 0.4136090727151434, "ref_ce_loss": 0.27871784567832947, "step": 1240 }, { "epoch": 0.4136090727151434, "loss": 2.0479073524475098, "step": 1240 }, { "ce_loss": 0.39347025752067566, "epoch": 0.4136090727151434, "step": 1240 }, { "distill_loss": 0.5617333054542542, "epoch": 0.4136090727151434, "step": 1240 }, { "epoch": 0.4136090727151434, "ref_ce_loss": 0.26081743836402893, "step": 1240 }, { "epoch": 0.41694462975316876, "loss": 1.5392, "step": 1250 }, { "epoch": 0.41694462975316876, "grad_norm": 1.818170428276062, "step": 1250 }, { "epoch": 0.41694462975316876, "learning_rate": 0.0007997140926751748, "step": 1250 }, { "epoch": 0.41694462975316876, "loss": 1.4717570543289185, "step": 1250 }, { "ce_loss": 0.5022345781326294, "epoch": 0.41694462975316876, "step": 1250 }, { "distill_loss": 0.5199416279792786, "epoch": 0.41694462975316876, "step": 1250 }, { "epoch": 0.41694462975316876, "ref_ce_loss": 0.3118363916873932, "step": 1250 }, { "epoch": 0.41694462975316876, "loss": 1.484877347946167, "step": 1250 }, { "ce_loss": 0.4070572257041931, "epoch": 0.41694462975316876, "step": 1250 }, { "distill_loss": 0.48654651641845703, "epoch": 0.41694462975316876, "step": 1250 }, { "epoch": 0.41694462975316876, "ref_ce_loss": 0.34332475066185, "step": 1250 }, { "epoch": 0.4202801867911941, "loss": 1.655, "step": 1260 }, { "epoch": 0.4202801867911941, "grad_norm": 3.15238881111145, "step": 1260 }, { "epoch": 0.4202801867911941, "learning_rate": 0.0007996975238087954, "step": 1260 }, { "epoch": 0.4202801867911941, "loss": 1.3359380960464478, "step": 1260 }, { "ce_loss": 0.38020819425582886, "epoch": 0.4202801867911941, "step": 1260 }, { "distill_loss": 0.5758382081985474, "epoch": 0.4202801867911941, "step": 1260 }, { "epoch": 0.4202801867911941, "ref_ce_loss": 0.27918919920921326, "step": 1260 }, { "epoch": 0.4202801867911941, "loss": 1.6192512512207031, "step": 1260 }, { "ce_loss": 0.5164095759391785, "epoch": 0.4202801867911941, "step": 1260 }, { "distill_loss": 0.6622616052627563, "epoch": 0.4202801867911941, "step": 1260 }, { "epoch": 0.4202801867911941, "ref_ce_loss": 0.44050195813179016, "step": 1260 }, { "epoch": 0.42361574382921946, "loss": 1.6418, "step": 1270 }, { "epoch": 0.42361574382921946, "grad_norm": 4.3235039710998535, "step": 1270 }, { "epoch": 0.42361574382921946, "learning_rate": 0.0007996804884523964, "step": 1270 }, { "epoch": 0.42361574382921946, "loss": 1.345386266708374, "step": 1270 }, { "ce_loss": 0.4195159375667572, "epoch": 0.42361574382921946, "step": 1270 }, { "distill_loss": 0.6241512298583984, "epoch": 0.42361574382921946, "step": 1270 }, { "epoch": 0.42361574382921946, "ref_ce_loss": 0.3016016185283661, "step": 1270 }, { "epoch": 0.42361574382921946, "loss": 1.4211335182189941, "step": 1270 }, { "ce_loss": 0.4531877338886261, "epoch": 0.42361574382921946, "step": 1270 }, { "distill_loss": 0.6978771090507507, "epoch": 0.42361574382921946, "step": 1270 }, { "epoch": 0.42361574382921946, "ref_ce_loss": 0.2688674032688141, "step": 1270 }, { "epoch": 0.4269513008672448, "loss": 1.5536, "step": 1280 }, { "epoch": 0.4269513008672448, "grad_norm": 1.699332594871521, "step": 1280 }, { "epoch": 0.4269513008672448, "learning_rate": 0.00079966298662586, "step": 1280 }, { "epoch": 0.4269513008672448, "loss": 2.720428228378296, "step": 1280 }, { "ce_loss": 0.4882067143917084, "epoch": 0.4269513008672448, "step": 1280 }, { "distill_loss": 0.5594663619995117, "epoch": 0.4269513008672448, "step": 1280 }, { "epoch": 0.4269513008672448, "ref_ce_loss": 0.3512772023677826, "step": 1280 }, { "epoch": 0.4269513008672448, "loss": 1.3232649564743042, "step": 1280 }, { "ce_loss": 0.39429083466529846, "epoch": 0.4269513008672448, "step": 1280 }, { "distill_loss": 0.5630033612251282, "epoch": 0.4269513008672448, "step": 1280 }, { "epoch": 0.4269513008672448, "ref_ce_loss": 0.2832672894001007, "step": 1280 }, { "epoch": 0.43028685790527016, "loss": 1.5439, "step": 1290 }, { "epoch": 0.43028685790527016, "grad_norm": 1.461922287940979, "step": 1290 }, { "epoch": 0.43028685790527016, "learning_rate": 0.0007996450183496126, "step": 1290 }, { "epoch": 0.43028685790527016, "loss": 1.575237512588501, "step": 1290 }, { "ce_loss": 0.505817174911499, "epoch": 0.43028685790527016, "step": 1290 }, { "distill_loss": 0.6163088083267212, "epoch": 0.43028685790527016, "step": 1290 }, { "epoch": 0.43028685790527016, "ref_ce_loss": 0.2900249660015106, "step": 1290 }, { "epoch": 0.43028685790527016, "loss": 1.3824915885925293, "step": 1290 }, { "ce_loss": 0.4555290639400482, "epoch": 0.43028685790527016, "step": 1290 }, { "distill_loss": 0.6401718258857727, "epoch": 0.43028685790527016, "step": 1290 }, { "epoch": 0.43028685790527016, "ref_ce_loss": 0.2867133617401123, "step": 1290 }, { "epoch": 0.4336224149432955, "loss": 1.5352, "step": 1300 }, { "epoch": 0.4336224149432955, "grad_norm": 4.11077356338501, "step": 1300 }, { "epoch": 0.4336224149432955, "learning_rate": 0.0007996265836446254, "step": 1300 }, { "epoch": 0.4336224149432955, "loss": 1.5897566080093384, "step": 1300 }, { "ce_loss": 0.3810420334339142, "epoch": 0.4336224149432955, "step": 1300 }, { "distill_loss": 0.5214735269546509, "epoch": 0.4336224149432955, "step": 1300 }, { "epoch": 0.4336224149432955, "ref_ce_loss": 0.2401161789894104, "step": 1300 }, { "epoch": 0.4336224149432955, "loss": 1.3515366315841675, "step": 1300 }, { "ce_loss": 0.3965347409248352, "epoch": 0.4336224149432955, "step": 1300 }, { "distill_loss": 0.5576469898223877, "epoch": 0.4336224149432955, "step": 1300 }, { "epoch": 0.4336224149432955, "ref_ce_loss": 0.28013500571250916, "step": 1300 }, { "epoch": 0.43695797198132086, "loss": 1.5448, "step": 1310 }, { "epoch": 0.43695797198132086, "grad_norm": 1.6120764017105103, "step": 1310 }, { "epoch": 0.43695797198132086, "learning_rate": 0.0007996076825324133, "step": 1310 }, { "epoch": 0.43695797198132086, "loss": 1.7771337032318115, "step": 1310 }, { "ce_loss": 0.43278172612190247, "epoch": 0.43695797198132086, "step": 1310 }, { "distill_loss": 0.4835440218448639, "epoch": 0.43695797198132086, "step": 1310 }, { "epoch": 0.43695797198132086, "ref_ce_loss": 0.28642958402633667, "step": 1310 }, { "epoch": 0.43695797198132086, "loss": 1.9378416538238525, "step": 1310 }, { "ce_loss": 0.49280300736427307, "epoch": 0.43695797198132086, "step": 1310 }, { "distill_loss": 0.5592789649963379, "epoch": 0.43695797198132086, "step": 1310 }, { "epoch": 0.43695797198132086, "ref_ce_loss": 0.2844145596027374, "step": 1310 }, { "epoch": 0.4402935290193462, "loss": 1.6367, "step": 1320 }, { "epoch": 0.4402935290193462, "grad_norm": 3.2773890495300293, "step": 1320 }, { "epoch": 0.4402935290193462, "learning_rate": 0.0007995883150350363, "step": 1320 }, { "epoch": 0.4402935290193462, "loss": 1.526888132095337, "step": 1320 }, { "ce_loss": 0.43740856647491455, "epoch": 0.4402935290193462, "step": 1320 }, { "distill_loss": 0.579285204410553, "epoch": 0.4402935290193462, "step": 1320 }, { "epoch": 0.4402935290193462, "ref_ce_loss": 0.27087196707725525, "step": 1320 }, { "epoch": 0.4402935290193462, "loss": 1.2348921298980713, "step": 1320 }, { "ce_loss": 0.43266892433166504, "epoch": 0.4402935290193462, "step": 1320 }, { "distill_loss": 0.5434672832489014, "epoch": 0.4402935290193462, "step": 1320 }, { "epoch": 0.4402935290193462, "ref_ce_loss": 0.258668452501297, "step": 1320 }, { "epoch": 0.44362908605737156, "loss": 1.5593, "step": 1330 }, { "epoch": 0.44362908605737156, "grad_norm": 1.6326991319656372, "step": 1330 }, { "epoch": 0.44362908605737156, "learning_rate": 0.000799568481175098, "step": 1330 }, { "epoch": 0.44362908605737156, "loss": 1.2025867700576782, "step": 1330 }, { "ce_loss": 0.3716997802257538, "epoch": 0.44362908605737156, "step": 1330 }, { "distill_loss": 0.4663502871990204, "epoch": 0.44362908605737156, "step": 1330 }, { "epoch": 0.44362908605737156, "ref_ce_loss": 0.32629045844078064, "step": 1330 }, { "epoch": 0.44362908605737156, "loss": 1.7404615879058838, "step": 1330 }, { "ce_loss": 0.39141565561294556, "epoch": 0.44362908605737156, "step": 1330 }, { "distill_loss": 0.570256769657135, "epoch": 0.44362908605737156, "step": 1330 }, { "epoch": 0.44362908605737156, "ref_ce_loss": 0.2961578965187073, "step": 1330 }, { "epoch": 0.4469646430953969, "loss": 1.5984, "step": 1340 }, { "epoch": 0.4469646430953969, "grad_norm": 1.4796324968338013, "step": 1340 }, { "epoch": 0.4469646430953969, "learning_rate": 0.000799548180975747, "step": 1340 }, { "epoch": 0.4469646430953969, "loss": 2.8081512451171875, "step": 1340 }, { "ce_loss": 0.47508761286735535, "epoch": 0.4469646430953969, "step": 1340 }, { "distill_loss": 0.5701019167900085, "epoch": 0.4469646430953969, "step": 1340 }, { "epoch": 0.4469646430953969, "ref_ce_loss": 0.2469950169324875, "step": 1340 }, { "epoch": 0.4469646430953969, "loss": 1.1866693496704102, "step": 1340 }, { "ce_loss": 0.37990590929985046, "epoch": 0.4469646430953969, "step": 1340 }, { "distill_loss": 0.4495466947555542, "epoch": 0.4469646430953969, "step": 1340 }, { "epoch": 0.4469646430953969, "ref_ce_loss": 0.2721485197544098, "step": 1340 }, { "epoch": 0.45030020013342226, "loss": 1.6328, "step": 1350 }, { "epoch": 0.45030020013342226, "grad_norm": 1.5720679759979248, "step": 1350 }, { "epoch": 0.45030020013342226, "learning_rate": 0.0007995274144606755, "step": 1350 }, { "epoch": 0.45030020013342226, "loss": 1.2863354682922363, "step": 1350 }, { "ce_loss": 0.3292873799800873, "epoch": 0.45030020013342226, "step": 1350 }, { "distill_loss": 0.5815794467926025, "epoch": 0.45030020013342226, "step": 1350 }, { "epoch": 0.45030020013342226, "ref_ce_loss": 0.2648515999317169, "step": 1350 }, { "epoch": 0.45030020013342226, "loss": 1.2642462253570557, "step": 1350 }, { "ce_loss": 0.3768036365509033, "epoch": 0.45030020013342226, "step": 1350 }, { "distill_loss": 0.5950371026992798, "epoch": 0.45030020013342226, "step": 1350 }, { "epoch": 0.45030020013342226, "ref_ce_loss": 0.29059046506881714, "step": 1350 }, { "epoch": 0.4536357571714476, "loss": 1.4408, "step": 1360 }, { "epoch": 0.4536357571714476, "grad_norm": 2.709803819656372, "step": 1360 }, { "epoch": 0.4536357571714476, "learning_rate": 0.0007995061816541204, "step": 1360 }, { "epoch": 0.4536357571714476, "loss": 1.3702350854873657, "step": 1360 }, { "ce_loss": 0.44703322649002075, "epoch": 0.4536357571714476, "step": 1360 }, { "distill_loss": 0.5962932705879211, "epoch": 0.4536357571714476, "step": 1360 }, { "epoch": 0.4536357571714476, "ref_ce_loss": 0.32672789692878723, "step": 1360 }, { "epoch": 0.4536357571714476, "loss": 1.504996418952942, "step": 1360 }, { "ce_loss": 0.43612051010131836, "epoch": 0.4536357571714476, "step": 1360 }, { "distill_loss": 0.6245546936988831, "epoch": 0.4536357571714476, "step": 1360 }, { "epoch": 0.4536357571714476, "ref_ce_loss": 0.24796253442764282, "step": 1360 }, { "epoch": 0.45697131420947296, "loss": 1.4248, "step": 1370 }, { "epoch": 0.45697131420947296, "grad_norm": 1.779395580291748, "step": 1370 }, { "epoch": 0.45697131420947296, "learning_rate": 0.0007994844825808628, "step": 1370 }, { "epoch": 0.45697131420947296, "loss": 1.311972975730896, "step": 1370 }, { "ce_loss": 0.4517594575881958, "epoch": 0.45697131420947296, "step": 1370 }, { "distill_loss": 0.4873427748680115, "epoch": 0.45697131420947296, "step": 1370 }, { "epoch": 0.45697131420947296, "ref_ce_loss": 0.29018694162368774, "step": 1370 }, { "epoch": 0.45697131420947296, "loss": 1.8570822477340698, "step": 1370 }, { "ce_loss": 0.48192328214645386, "epoch": 0.45697131420947296, "step": 1370 }, { "distill_loss": 0.5890644788742065, "epoch": 0.45697131420947296, "step": 1370 }, { "epoch": 0.45697131420947296, "ref_ce_loss": 0.3222361207008362, "step": 1370 }, { "epoch": 0.4603068712474983, "loss": 1.4956, "step": 1380 }, { "epoch": 0.4603068712474983, "grad_norm": 1.5447510480880737, "step": 1380 }, { "epoch": 0.4603068712474983, "learning_rate": 0.0007994623172662275, "step": 1380 }, { "epoch": 0.4603068712474983, "loss": 1.429722785949707, "step": 1380 }, { "ce_loss": 0.4304126501083374, "epoch": 0.4603068712474983, "step": 1380 }, { "distill_loss": 0.5462210178375244, "epoch": 0.4603068712474983, "step": 1380 }, { "epoch": 0.4603068712474983, "ref_ce_loss": 0.28710076212882996, "step": 1380 }, { "epoch": 0.4603068712474983, "loss": 1.4016376733779907, "step": 1380 }, { "ce_loss": 0.3747934103012085, "epoch": 0.4603068712474983, "step": 1380 }, { "distill_loss": 0.5976477265357971, "epoch": 0.4603068712474983, "step": 1380 }, { "epoch": 0.4603068712474983, "ref_ce_loss": 0.30236348509788513, "step": 1380 }, { "epoch": 0.46364242828552366, "loss": 1.5523, "step": 1390 }, { "epoch": 0.46364242828552366, "grad_norm": 1.9847055673599243, "step": 1390 }, { "epoch": 0.46364242828552366, "learning_rate": 0.0007994396857360842, "step": 1390 }, { "epoch": 0.46364242828552366, "loss": 1.348907709121704, "step": 1390 }, { "ce_loss": 0.4562534987926483, "epoch": 0.46364242828552366, "step": 1390 }, { "distill_loss": 0.6037919521331787, "epoch": 0.46364242828552366, "step": 1390 }, { "epoch": 0.46364242828552366, "ref_ce_loss": 0.28767332434654236, "step": 1390 }, { "epoch": 0.46364242828552366, "loss": 2.5337958335876465, "step": 1390 }, { "ce_loss": 0.42729249596595764, "epoch": 0.46364242828552366, "step": 1390 }, { "distill_loss": 0.6115925312042236, "epoch": 0.46364242828552366, "step": 1390 }, { "epoch": 0.46364242828552366, "ref_ce_loss": 0.2527952492237091, "step": 1390 }, { "epoch": 0.466977985323549, "loss": 1.6165, "step": 1400 }, { "epoch": 0.466977985323549, "grad_norm": 2.61089825630188, "step": 1400 }, { "epoch": 0.466977985323549, "learning_rate": 0.0007994165880168461, "step": 1400 }, { "epoch": 0.466977985323549, "loss": 1.4341270923614502, "step": 1400 }, { "ce_loss": 0.4411405622959137, "epoch": 0.466977985323549, "step": 1400 }, { "distill_loss": 0.5347835421562195, "epoch": 0.466977985323549, "step": 1400 }, { "epoch": 0.466977985323549, "ref_ce_loss": 0.3186250627040863, "step": 1400 }, { "epoch": 0.466977985323549, "loss": 1.515146255493164, "step": 1400 }, { "ce_loss": 0.42187491059303284, "epoch": 0.466977985323549, "step": 1400 }, { "distill_loss": 0.5460115075111389, "epoch": 0.466977985323549, "step": 1400 }, { "epoch": 0.466977985323549, "ref_ce_loss": 0.24013651907444, "step": 1400 }, { "epoch": 0.4703135423615744, "loss": 1.6537, "step": 1410 }, { "epoch": 0.4703135423615744, "grad_norm": 2.4364840984344482, "step": 1410 }, { "epoch": 0.4703135423615744, "learning_rate": 0.0007993930241354708, "step": 1410 }, { "epoch": 0.4703135423615744, "loss": 1.3404841423034668, "step": 1410 }, { "ce_loss": 0.4299369752407074, "epoch": 0.4703135423615744, "step": 1410 }, { "distill_loss": 0.5847266316413879, "epoch": 0.4703135423615744, "step": 1410 }, { "epoch": 0.4703135423615744, "ref_ce_loss": 0.3256117105484009, "step": 1410 }, { "epoch": 0.4703135423615744, "loss": 1.4295798540115356, "step": 1410 }, { "ce_loss": 0.46195846796035767, "epoch": 0.4703135423615744, "step": 1410 }, { "distill_loss": 0.5672675967216492, "epoch": 0.4703135423615744, "step": 1410 }, { "epoch": 0.4703135423615744, "ref_ce_loss": 0.2844446003437042, "step": 1410 }, { "epoch": 0.47364909939959976, "loss": 1.538, "step": 1420 }, { "epoch": 0.47364909939959976, "grad_norm": 2.079164981842041, "step": 1420 }, { "epoch": 0.47364909939959976, "learning_rate": 0.0007993689941194598, "step": 1420 }, { "epoch": 0.47364909939959976, "loss": 1.8757567405700684, "step": 1420 }, { "ce_loss": 0.49489399790763855, "epoch": 0.47364909939959976, "step": 1420 }, { "distill_loss": 0.5338597297668457, "epoch": 0.47364909939959976, "step": 1420 }, { "epoch": 0.47364909939959976, "ref_ce_loss": 0.3119715750217438, "step": 1420 }, { "epoch": 0.47364909939959976, "loss": 1.5323330163955688, "step": 1420 }, { "ce_loss": 0.4838011562824249, "epoch": 0.47364909939959976, "step": 1420 }, { "distill_loss": 0.5049943923950195, "epoch": 0.47364909939959976, "step": 1420 }, { "epoch": 0.47364909939959976, "ref_ce_loss": 0.377859503030777, "step": 1420 }, { "epoch": 0.4769846564376251, "loss": 1.5542, "step": 1430 }, { "epoch": 0.4769846564376251, "grad_norm": 1.6398972272872925, "step": 1430 }, { "epoch": 0.4769846564376251, "learning_rate": 0.0007993444979968588, "step": 1430 }, { "epoch": 0.4769846564376251, "loss": 1.2284057140350342, "step": 1430 }, { "ce_loss": 0.4466082751750946, "epoch": 0.4769846564376251, "step": 1430 }, { "distill_loss": 0.5416300296783447, "epoch": 0.4769846564376251, "step": 1430 }, { "epoch": 0.4769846564376251, "ref_ce_loss": 0.24010510742664337, "step": 1430 }, { "epoch": 0.4769846564376251, "loss": 1.525360345840454, "step": 1430 }, { "ce_loss": 0.48976385593414307, "epoch": 0.4769846564376251, "step": 1430 }, { "distill_loss": 0.5415977835655212, "epoch": 0.4769846564376251, "step": 1430 }, { "epoch": 0.4769846564376251, "ref_ce_loss": 0.35288766026496887, "step": 1430 }, { "epoch": 0.48032021347565046, "loss": 1.5164, "step": 1440 }, { "epoch": 0.48032021347565046, "grad_norm": 1.6208374500274658, "step": 1440 }, { "epoch": 0.48032021347565046, "learning_rate": 0.0007993195357962575, "step": 1440 }, { "epoch": 0.48032021347565046, "loss": 1.698353886604309, "step": 1440 }, { "ce_loss": 0.4378069341182709, "epoch": 0.48032021347565046, "step": 1440 }, { "distill_loss": 0.6145537495613098, "epoch": 0.48032021347565046, "step": 1440 }, { "epoch": 0.48032021347565046, "ref_ce_loss": 0.2727295756340027, "step": 1440 }, { "epoch": 0.48032021347565046, "loss": 1.3916555643081665, "step": 1440 }, { "ce_loss": 0.3999626040458679, "epoch": 0.48032021347565046, "step": 1440 }, { "distill_loss": 0.5818359851837158, "epoch": 0.48032021347565046, "step": 1440 }, { "epoch": 0.48032021347565046, "ref_ce_loss": 0.29204344749450684, "step": 1440 }, { "epoch": 0.4836557705136758, "loss": 1.5253, "step": 1450 }, { "epoch": 0.4836557705136758, "grad_norm": 3.18560528755188, "step": 1450 }, { "epoch": 0.4836557705136758, "learning_rate": 0.0007992941075467892, "step": 1450 }, { "epoch": 0.4836557705136758, "loss": 1.3466286659240723, "step": 1450 }, { "ce_loss": 0.35075056552886963, "epoch": 0.4836557705136758, "step": 1450 }, { "distill_loss": 0.42663854360580444, "epoch": 0.4836557705136758, "step": 1450 }, { "epoch": 0.4836557705136758, "ref_ce_loss": 0.2799513041973114, "step": 1450 }, { "epoch": 0.4836557705136758, "loss": 1.619462490081787, "step": 1450 }, { "ce_loss": 0.5010478496551514, "epoch": 0.4836557705136758, "step": 1450 }, { "distill_loss": 0.5300058126449585, "epoch": 0.4836557705136758, "step": 1450 }, { "epoch": 0.4836557705136758, "ref_ce_loss": 0.3170461356639862, "step": 1450 }, { "epoch": 0.48699132755170116, "loss": 1.6154, "step": 1460 }, { "epoch": 0.48699132755170116, "grad_norm": 1.907476782798767, "step": 1460 }, { "epoch": 0.48699132755170116, "learning_rate": 0.0007992682132781317, "step": 1460 }, { "epoch": 0.48699132755170116, "loss": 1.7880961894989014, "step": 1460 }, { "ce_loss": 0.44775262475013733, "epoch": 0.48699132755170116, "step": 1460 }, { "distill_loss": 0.5088525414466858, "epoch": 0.48699132755170116, "step": 1460 }, { "epoch": 0.48699132755170116, "ref_ce_loss": 0.2713979184627533, "step": 1460 }, { "epoch": 0.48699132755170116, "loss": 1.7701867818832397, "step": 1460 }, { "ce_loss": 0.49875637888908386, "epoch": 0.48699132755170116, "step": 1460 }, { "distill_loss": 0.6390624642372131, "epoch": 0.48699132755170116, "step": 1460 }, { "epoch": 0.48699132755170116, "ref_ce_loss": 0.2926272451877594, "step": 1460 }, { "epoch": 0.4903268845897265, "loss": 1.5046, "step": 1470 }, { "epoch": 0.4903268845897265, "grad_norm": 1.6721819639205933, "step": 1470 }, { "epoch": 0.4903268845897265, "learning_rate": 0.0007992418530205062, "step": 1470 }, { "epoch": 0.4903268845897265, "loss": 1.2131767272949219, "step": 1470 }, { "ce_loss": 0.38331499695777893, "epoch": 0.4903268845897265, "step": 1470 }, { "distill_loss": 0.47394928336143494, "epoch": 0.4903268845897265, "step": 1470 }, { "epoch": 0.4903268845897265, "ref_ce_loss": 0.26265010237693787, "step": 1470 }, { "epoch": 0.4903268845897265, "loss": 1.0815149545669556, "step": 1470 }, { "ce_loss": 0.3649250566959381, "epoch": 0.4903268845897265, "step": 1470 }, { "distill_loss": 0.48027974367141724, "epoch": 0.4903268845897265, "step": 1470 }, { "epoch": 0.4903268845897265, "ref_ce_loss": 0.23579445481300354, "step": 1470 }, { "epoch": 0.49366244162775186, "loss": 1.4974, "step": 1480 }, { "epoch": 0.49366244162775186, "grad_norm": 2.451059579849243, "step": 1480 }, { "epoch": 0.49366244162775186, "learning_rate": 0.000799215026804678, "step": 1480 }, { "epoch": 0.49366244162775186, "loss": 2.313891887664795, "step": 1480 }, { "ce_loss": 0.46999824047088623, "epoch": 0.49366244162775186, "step": 1480 }, { "distill_loss": 0.5875487327575684, "epoch": 0.49366244162775186, "step": 1480 }, { "epoch": 0.49366244162775186, "ref_ce_loss": 0.3389456868171692, "step": 1480 }, { "epoch": 0.49366244162775186, "loss": 1.3807448148727417, "step": 1480 }, { "ce_loss": 0.4419082999229431, "epoch": 0.49366244162775186, "step": 1480 }, { "distill_loss": 0.5672412514686584, "epoch": 0.49366244162775186, "step": 1480 }, { "epoch": 0.49366244162775186, "ref_ce_loss": 0.2629389762878418, "step": 1480 }, { "epoch": 0.4969979986657772, "loss": 1.4781, "step": 1490 }, { "epoch": 0.4969979986657772, "grad_norm": 1.9855118989944458, "step": 1490 }, { "epoch": 0.4969979986657772, "learning_rate": 0.0007991877346619562, "step": 1490 }, { "epoch": 0.4969979986657772, "loss": 1.5322380065917969, "step": 1490 }, { "ce_loss": 0.4553185701370239, "epoch": 0.4969979986657772, "step": 1490 }, { "distill_loss": 0.49299630522727966, "epoch": 0.4969979986657772, "step": 1490 }, { "epoch": 0.4969979986657772, "ref_ce_loss": 0.2738795876502991, "step": 1490 }, { "epoch": 0.4969979986657772, "loss": 1.3188645839691162, "step": 1490 }, { "ce_loss": 0.43871697783470154, "epoch": 0.4969979986657772, "step": 1490 }, { "distill_loss": 0.5200066566467285, "epoch": 0.4969979986657772, "step": 1490 }, { "epoch": 0.4969979986657772, "ref_ce_loss": 0.3599816858768463, "step": 1490 }, { "epoch": 0.5003335557038026, "loss": 1.4846, "step": 1500 }, { "epoch": 0.5003335557038026, "grad_norm": 2.409991502761841, "step": 1500 }, { "epoch": 0.5003335557038026, "learning_rate": 0.0007991599766241939, "step": 1500 }, { "epoch": 0.5003335557038026, "loss": 1.4143189191818237, "step": 1500 }, { "ce_loss": 0.3943651020526886, "epoch": 0.5003335557038026, "step": 1500 }, { "distill_loss": 0.4304821491241455, "epoch": 0.5003335557038026, "step": 1500 }, { "epoch": 0.5003335557038026, "ref_ce_loss": 0.2800663113594055, "step": 1500 }, { "epoch": 0.5003335557038026, "loss": 1.6676888465881348, "step": 1500 }, { "ce_loss": 0.4340650141239166, "epoch": 0.5003335557038026, "step": 1500 }, { "distill_loss": 0.5943726897239685, "epoch": 0.5003335557038026, "step": 1500 }, { "epoch": 0.5003335557038026, "ref_ce_loss": 0.2812447249889374, "step": 1500 }, { "epoch": 0.5036691127418279, "loss": 1.5091, "step": 1510 }, { "epoch": 0.5036691127418279, "grad_norm": 1.7553213834762573, "step": 1510 }, { "epoch": 0.5036691127418279, "learning_rate": 0.0007991317527237872, "step": 1510 }, { "epoch": 0.5036691127418279, "loss": 1.3898941278457642, "step": 1510 }, { "ce_loss": 0.4156953990459442, "epoch": 0.5036691127418279, "step": 1510 }, { "distill_loss": 0.5571177005767822, "epoch": 0.5036691127418279, "step": 1510 }, { "epoch": 0.5036691127418279, "ref_ce_loss": 0.2963123619556427, "step": 1510 }, { "epoch": 0.5036691127418279, "loss": 1.3989025354385376, "step": 1510 }, { "ce_loss": 0.44018229842185974, "epoch": 0.5036691127418279, "step": 1510 }, { "distill_loss": 0.5224853157997131, "epoch": 0.5036691127418279, "step": 1510 }, { "epoch": 0.5036691127418279, "ref_ce_loss": 0.3302108943462372, "step": 1510 }, { "epoch": 0.5070046697798533, "loss": 1.3539, "step": 1520 }, { "epoch": 0.5070046697798533, "grad_norm": 1.661407232284546, "step": 1520 }, { "epoch": 0.5070046697798533, "learning_rate": 0.0007991030629936768, "step": 1520 }, { "epoch": 0.5070046697798533, "loss": 1.38296377658844, "step": 1520 }, { "ce_loss": 0.35517755150794983, "epoch": 0.5070046697798533, "step": 1520 }, { "distill_loss": 0.4815816283226013, "epoch": 0.5070046697798533, "step": 1520 }, { "epoch": 0.5070046697798533, "ref_ce_loss": 0.22595949470996857, "step": 1520 }, { "epoch": 0.5070046697798533, "loss": 1.6152260303497314, "step": 1520 }, { "ce_loss": 0.5425209403038025, "epoch": 0.5070046697798533, "step": 1520 }, { "distill_loss": 0.6551669836044312, "epoch": 0.5070046697798533, "step": 1520 }, { "epoch": 0.5070046697798533, "ref_ce_loss": 0.3061394989490509, "step": 1520 }, { "epoch": 0.5103402268178786, "loss": 1.423, "step": 1530 }, { "epoch": 0.5103402268178786, "grad_norm": 1.6371610164642334, "step": 1530 }, { "epoch": 0.5103402268178786, "learning_rate": 0.0007990739074673468, "step": 1530 }, { "epoch": 0.5103402268178786, "loss": 1.2464088201522827, "step": 1530 }, { "ce_loss": 0.39948147535324097, "epoch": 0.5103402268178786, "step": 1530 }, { "distill_loss": 0.45707568526268005, "epoch": 0.5103402268178786, "step": 1530 }, { "epoch": 0.5103402268178786, "ref_ce_loss": 0.2859954833984375, "step": 1530 }, { "epoch": 0.5103402268178786, "loss": 1.2391449213027954, "step": 1530 }, { "ce_loss": 0.4184815585613251, "epoch": 0.5103402268178786, "step": 1530 }, { "distill_loss": 0.5067058801651001, "epoch": 0.5103402268178786, "step": 1530 }, { "epoch": 0.5103402268178786, "ref_ce_loss": 0.31185799837112427, "step": 1530 }, { "epoch": 0.513675783855904, "loss": 1.5512, "step": 1540 }, { "epoch": 0.513675783855904, "grad_norm": 2.0008840560913086, "step": 1540 }, { "epoch": 0.513675783855904, "learning_rate": 0.0007990442861788244, "step": 1540 }, { "epoch": 0.513675783855904, "loss": 1.412217378616333, "step": 1540 }, { "ce_loss": 0.4328736960887909, "epoch": 0.513675783855904, "step": 1540 }, { "distill_loss": 0.5061045289039612, "epoch": 0.513675783855904, "step": 1540 }, { "epoch": 0.513675783855904, "ref_ce_loss": 0.35407111048698425, "step": 1540 }, { "epoch": 0.513675783855904, "loss": 1.3271783590316772, "step": 1540 }, { "ce_loss": 0.3384850323200226, "epoch": 0.513675783855904, "step": 1540 }, { "distill_loss": 0.4997933506965637, "epoch": 0.513675783855904, "step": 1540 }, { "epoch": 0.513675783855904, "ref_ce_loss": 0.31482452154159546, "step": 1540 }, { "epoch": 0.5170113408939293, "loss": 1.5203, "step": 1550 }, { "epoch": 0.5170113408939293, "grad_norm": 2.0364885330200195, "step": 1550 }, { "epoch": 0.5170113408939293, "learning_rate": 0.0007990141991626813, "step": 1550 }, { "epoch": 0.5170113408939293, "loss": 1.3766908645629883, "step": 1550 }, { "ce_loss": 0.4917585253715515, "epoch": 0.5170113408939293, "step": 1550 }, { "distill_loss": 0.5370101928710938, "epoch": 0.5170113408939293, "step": 1550 }, { "epoch": 0.5170113408939293, "ref_ce_loss": 0.2637538015842438, "step": 1550 }, { "epoch": 0.5170113408939293, "loss": 1.4567843675613403, "step": 1550 }, { "ce_loss": 0.5104137063026428, "epoch": 0.5170113408939293, "step": 1550 }, { "distill_loss": 0.49222514033317566, "epoch": 0.5170113408939293, "step": 1550 }, { "epoch": 0.5170113408939293, "ref_ce_loss": 0.3065930902957916, "step": 1550 }, { "epoch": 0.5203468979319547, "loss": 1.5151, "step": 1560 }, { "epoch": 0.5203468979319547, "grad_norm": 2.2353527545928955, "step": 1560 }, { "epoch": 0.5203468979319547, "learning_rate": 0.0007989836464540318, "step": 1560 }, { "epoch": 0.5203468979319547, "loss": 1.3307873010635376, "step": 1560 }, { "ce_loss": 0.3729870617389679, "epoch": 0.5203468979319547, "step": 1560 }, { "distill_loss": 0.6389895677566528, "epoch": 0.5203468979319547, "step": 1560 }, { "epoch": 0.5203468979319547, "ref_ce_loss": 0.27055856585502625, "step": 1560 }, { "epoch": 0.5203468979319547, "loss": 1.5160497426986694, "step": 1560 }, { "ce_loss": 0.47393983602523804, "epoch": 0.5203468979319547, "step": 1560 }, { "distill_loss": 0.6437463164329529, "epoch": 0.5203468979319547, "step": 1560 }, { "epoch": 0.5203468979319547, "ref_ce_loss": 0.31374311447143555, "step": 1560 }, { "epoch": 0.52368245496998, "loss": 1.4511, "step": 1570 }, { "epoch": 0.52368245496998, "grad_norm": 2.509186267852783, "step": 1570 }, { "epoch": 0.52368245496998, "learning_rate": 0.0007989526280885348, "step": 1570 }, { "epoch": 0.52368245496998, "loss": 1.4282000064849854, "step": 1570 }, { "ce_loss": 0.3524983823299408, "epoch": 0.52368245496998, "step": 1570 }, { "distill_loss": 0.5033807158470154, "epoch": 0.52368245496998, "step": 1570 }, { "epoch": 0.52368245496998, "ref_ce_loss": 0.21685640513896942, "step": 1570 }, { "epoch": 0.52368245496998, "loss": 2.016537666320801, "step": 1570 }, { "ce_loss": 0.46554210782051086, "epoch": 0.52368245496998, "step": 1570 }, { "distill_loss": 0.5887246131896973, "epoch": 0.52368245496998, "step": 1570 }, { "epoch": 0.52368245496998, "ref_ce_loss": 0.3299577832221985, "step": 1570 }, { "epoch": 0.5270180120080054, "loss": 1.4662, "step": 1580 }, { "epoch": 0.5270180120080054, "grad_norm": 1.5988291501998901, "step": 1580 }, { "epoch": 0.5270180120080054, "learning_rate": 0.0007989211441023914, "step": 1580 }, { "epoch": 0.5270180120080054, "loss": 1.3795303106307983, "step": 1580 }, { "ce_loss": 0.31225863099098206, "epoch": 0.5270180120080054, "step": 1580 }, { "distill_loss": 0.6102480888366699, "epoch": 0.5270180120080054, "step": 1580 }, { "epoch": 0.5270180120080054, "ref_ce_loss": 0.24404440820217133, "step": 1580 }, { "epoch": 0.5270180120080054, "loss": 1.2649134397506714, "step": 1580 }, { "ce_loss": 0.3859570026397705, "epoch": 0.5270180120080054, "step": 1580 }, { "distill_loss": 0.5082134008407593, "epoch": 0.5270180120080054, "step": 1580 }, { "epoch": 0.5270180120080054, "ref_ce_loss": 0.2923862934112549, "step": 1580 }, { "epoch": 0.5303535690460307, "loss": 1.3763, "step": 1590 }, { "epoch": 0.5303535690460307, "grad_norm": 1.6285555362701416, "step": 1590 }, { "epoch": 0.5303535690460307, "learning_rate": 0.0007988891945323474, "step": 1590 }, { "epoch": 0.5303535690460307, "loss": 1.705237865447998, "step": 1590 }, { "ce_loss": 0.41622549295425415, "epoch": 0.5303535690460307, "step": 1590 }, { "distill_loss": 0.5183006525039673, "epoch": 0.5303535690460307, "step": 1590 }, { "epoch": 0.5303535690460307, "ref_ce_loss": 0.29350215196609497, "step": 1590 }, { "epoch": 0.5303535690460307, "loss": 1.2734066247940063, "step": 1590 }, { "ce_loss": 0.44327646493911743, "epoch": 0.5303535690460307, "step": 1590 }, { "distill_loss": 0.5529025793075562, "epoch": 0.5303535690460307, "step": 1590 }, { "epoch": 0.5303535690460307, "ref_ce_loss": 0.27696895599365234, "step": 1590 }, { "epoch": 0.533689126084056, "loss": 1.4172, "step": 1600 }, { "epoch": 0.533689126084056, "grad_norm": 1.6855443716049194, "step": 1600 }, { "epoch": 0.533689126084056, "learning_rate": 0.000798856779415691, "step": 1600 }, { "epoch": 0.533689126084056, "loss": 1.2596999406814575, "step": 1600 }, { "ce_loss": 0.40576714277267456, "epoch": 0.533689126084056, "step": 1600 }, { "distill_loss": 0.5716956853866577, "epoch": 0.533689126084056, "step": 1600 }, { "epoch": 0.533689126084056, "ref_ce_loss": 0.2820309102535248, "step": 1600 }, { "epoch": 0.533689126084056, "loss": 1.3510174751281738, "step": 1600 }, { "ce_loss": 0.4361741542816162, "epoch": 0.533689126084056, "step": 1600 }, { "distill_loss": 0.5999810099601746, "epoch": 0.533689126084056, "step": 1600 }, { "epoch": 0.533689126084056, "ref_ce_loss": 0.24159260094165802, "step": 1600 }, { "epoch": 0.5370246831220814, "loss": 1.5089, "step": 1610 }, { "epoch": 0.5370246831220814, "grad_norm": 1.5545297861099243, "step": 1610 }, { "epoch": 0.5370246831220814, "learning_rate": 0.0007988238987902543, "step": 1610 }, { "epoch": 0.5370246831220814, "loss": 1.9645248651504517, "step": 1610 }, { "ce_loss": 0.4018411934375763, "epoch": 0.5370246831220814, "step": 1610 }, { "distill_loss": 0.44524702429771423, "epoch": 0.5370246831220814, "step": 1610 }, { "epoch": 0.5370246831220814, "ref_ce_loss": 0.33014345169067383, "step": 1610 }, { "epoch": 0.5370246831220814, "loss": 1.085498332977295, "step": 1610 }, { "ce_loss": 0.3779771327972412, "epoch": 0.5370246831220814, "step": 1610 }, { "distill_loss": 0.39916783571243286, "epoch": 0.5370246831220814, "step": 1610 }, { "epoch": 0.5370246831220814, "ref_ce_loss": 0.3057388663291931, "step": 1610 }, { "epoch": 0.5403602401601068, "loss": 1.5661, "step": 1620 }, { "epoch": 0.5403602401601068, "grad_norm": 2.2747015953063965, "step": 1620 }, { "epoch": 0.5403602401601068, "learning_rate": 0.0007987905526944125, "step": 1620 }, { "epoch": 0.5403602401601068, "loss": 1.3721652030944824, "step": 1620 }, { "ce_loss": 0.4520910382270813, "epoch": 0.5403602401601068, "step": 1620 }, { "distill_loss": 0.607236385345459, "epoch": 0.5403602401601068, "step": 1620 }, { "epoch": 0.5403602401601068, "ref_ce_loss": 0.3124680817127228, "step": 1620 }, { "epoch": 0.5403602401601068, "loss": 1.5054634809494019, "step": 1620 }, { "ce_loss": 0.4722904562950134, "epoch": 0.5403602401601068, "step": 1620 }, { "distill_loss": 0.5852570533752441, "epoch": 0.5403602401601068, "step": 1620 }, { "epoch": 0.5403602401601068, "ref_ce_loss": 0.2933380603790283, "step": 1620 }, { "epoch": 0.5436957971981321, "loss": 1.3952, "step": 1630 }, { "epoch": 0.5436957971981321, "grad_norm": 1.8939306735992432, "step": 1630 }, { "epoch": 0.5436957971981321, "learning_rate": 0.000798756741167084, "step": 1630 }, { "epoch": 0.5436957971981321, "loss": 1.1996138095855713, "step": 1630 }, { "ce_loss": 0.3728450536727905, "epoch": 0.5436957971981321, "step": 1630 }, { "distill_loss": 0.4565788507461548, "epoch": 0.5436957971981321, "step": 1630 }, { "epoch": 0.5436957971981321, "ref_ce_loss": 0.24662546813488007, "step": 1630 }, { "epoch": 0.5436957971981321, "loss": 1.341873288154602, "step": 1630 }, { "ce_loss": 0.4312497079372406, "epoch": 0.5436957971981321, "step": 1630 }, { "distill_loss": 0.52656090259552, "epoch": 0.5436957971981321, "step": 1630 }, { "epoch": 0.5436957971981321, "ref_ce_loss": 0.2624252736568451, "step": 1630 }, { "epoch": 0.5470313542361575, "loss": 1.407, "step": 1640 }, { "epoch": 0.5470313542361575, "grad_norm": 1.7016750574111938, "step": 1640 }, { "epoch": 0.5470313542361575, "learning_rate": 0.0007987224642477307, "step": 1640 }, { "epoch": 0.5470313542361575, "loss": 1.5102639198303223, "step": 1640 }, { "ce_loss": 0.4606504440307617, "epoch": 0.5470313542361575, "step": 1640 }, { "distill_loss": 0.6021109819412231, "epoch": 0.5470313542361575, "step": 1640 }, { "epoch": 0.5470313542361575, "ref_ce_loss": 0.2885037958621979, "step": 1640 }, { "epoch": 0.5470313542361575, "loss": 2.82285213470459, "step": 1640 }, { "ce_loss": 0.41885632276535034, "epoch": 0.5470313542361575, "step": 1640 }, { "distill_loss": 0.5302011370658875, "epoch": 0.5470313542361575, "step": 1640 }, { "epoch": 0.5470313542361575, "ref_ce_loss": 0.3432007431983948, "step": 1640 }, { "epoch": 0.5503669112741828, "loss": 1.5286, "step": 1650 }, { "epoch": 0.5503669112741828, "grad_norm": 1.6959972381591797, "step": 1650 }, { "epoch": 0.5503669112741828, "learning_rate": 0.0007986877219763572, "step": 1650 }, { "epoch": 0.5503669112741828, "loss": 1.4738940000534058, "step": 1650 }, { "ce_loss": 0.40843141078948975, "epoch": 0.5503669112741828, "step": 1650 }, { "distill_loss": 0.49939486384391785, "epoch": 0.5503669112741828, "step": 1650 }, { "epoch": 0.5503669112741828, "ref_ce_loss": 0.24556097388267517, "step": 1650 }, { "epoch": 0.5503669112741828, "loss": 1.0911322832107544, "step": 1650 }, { "ce_loss": 0.3265611231327057, "epoch": 0.5503669112741828, "step": 1650 }, { "distill_loss": 0.4817613363265991, "epoch": 0.5503669112741828, "step": 1650 }, { "epoch": 0.5503669112741828, "ref_ce_loss": 0.18119016289710999, "step": 1650 }, { "epoch": 0.5537024683122082, "loss": 1.4085, "step": 1660 }, { "epoch": 0.5537024683122082, "grad_norm": 1.3222358226776123, "step": 1660 }, { "epoch": 0.5537024683122082, "learning_rate": 0.0007986525143935115, "step": 1660 }, { "epoch": 0.5537024683122082, "loss": 1.2532429695129395, "step": 1660 }, { "ce_loss": 0.45698484778404236, "epoch": 0.5537024683122082, "step": 1660 }, { "distill_loss": 0.5718897581100464, "epoch": 0.5537024683122082, "step": 1660 }, { "epoch": 0.5537024683122082, "ref_ce_loss": 0.2241760790348053, "step": 1660 }, { "epoch": 0.5537024683122082, "loss": 1.4703891277313232, "step": 1660 }, { "ce_loss": 0.44330742955207825, "epoch": 0.5537024683122082, "step": 1660 }, { "distill_loss": 0.5229434967041016, "epoch": 0.5537024683122082, "step": 1660 }, { "epoch": 0.5537024683122082, "ref_ce_loss": 0.2593224048614502, "step": 1660 }, { "epoch": 0.5570380253502335, "loss": 1.4088, "step": 1670 }, { "epoch": 0.5570380253502335, "grad_norm": 1.3066946268081665, "step": 1670 }, { "epoch": 0.5570380253502335, "learning_rate": 0.000798616841540285, "step": 1670 }, { "epoch": 0.5570380253502335, "loss": 1.1286921501159668, "step": 1670 }, { "ce_loss": 0.3801676034927368, "epoch": 0.5570380253502335, "step": 1670 }, { "distill_loss": 0.40562373399734497, "epoch": 0.5570380253502335, "step": 1670 }, { "epoch": 0.5570380253502335, "ref_ce_loss": 0.2621326148509979, "step": 1670 }, { "epoch": 0.5570380253502335, "loss": 1.7087914943695068, "step": 1670 }, { "ce_loss": 0.4122104346752167, "epoch": 0.5570380253502335, "step": 1670 }, { "distill_loss": 0.5846372842788696, "epoch": 0.5570380253502335, "step": 1670 }, { "epoch": 0.5570380253502335, "ref_ce_loss": 0.2881487011909485, "step": 1670 }, { "epoch": 0.5603735823882589, "loss": 1.5027, "step": 1680 }, { "epoch": 0.5603735823882589, "grad_norm": 2.6256797313690186, "step": 1680 }, { "epoch": 0.5603735823882589, "learning_rate": 0.0007985807034583111, "step": 1680 }, { "epoch": 0.5603735823882589, "loss": 1.4386684894561768, "step": 1680 }, { "ce_loss": 0.4299916923046112, "epoch": 0.5603735823882589, "step": 1680 }, { "distill_loss": 0.6190658807754517, "epoch": 0.5603735823882589, "step": 1680 }, { "epoch": 0.5603735823882589, "ref_ce_loss": 0.30958718061447144, "step": 1680 }, { "epoch": 0.5603735823882589, "loss": 1.4426429271697998, "step": 1680 }, { "ce_loss": 0.4818243086338043, "epoch": 0.5603735823882589, "step": 1680 }, { "distill_loss": 0.5826115608215332, "epoch": 0.5603735823882589, "step": 1680 }, { "epoch": 0.5603735823882589, "ref_ce_loss": 0.29600685834884644, "step": 1680 }, { "epoch": 0.5637091394262842, "loss": 1.4569, "step": 1690 }, { "epoch": 0.5637091394262842, "grad_norm": 2.3843090534210205, "step": 1690 }, { "epoch": 0.5637091394262842, "learning_rate": 0.0007985441001897675, "step": 1690 }, { "epoch": 0.5637091394262842, "loss": 2.2765228748321533, "step": 1690 }, { "ce_loss": 0.423153817653656, "epoch": 0.5637091394262842, "step": 1690 }, { "distill_loss": 0.4943941533565521, "epoch": 0.5637091394262842, "step": 1690 }, { "epoch": 0.5637091394262842, "ref_ce_loss": 0.23956818878650665, "step": 1690 }, { "epoch": 0.5637091394262842, "loss": 1.485368251800537, "step": 1690 }, { "ce_loss": 0.3642539083957672, "epoch": 0.5637091394262842, "step": 1690 }, { "distill_loss": 0.39737266302108765, "epoch": 0.5637091394262842, "step": 1690 }, { "epoch": 0.5637091394262842, "ref_ce_loss": 0.2661179006099701, "step": 1690 }, { "epoch": 0.5670446964643095, "loss": 1.507, "step": 1700 }, { "epoch": 0.5670446964643095, "grad_norm": 1.6635305881500244, "step": 1700 }, { "epoch": 0.5670446964643095, "learning_rate": 0.0007985070317773737, "step": 1700 }, { "epoch": 0.5670446964643095, "loss": 1.5071004629135132, "step": 1700 }, { "ce_loss": 0.3156302571296692, "epoch": 0.5670446964643095, "step": 1700 }, { "distill_loss": 0.4929995536804199, "epoch": 0.5670446964643095, "step": 1700 }, { "epoch": 0.5670446964643095, "ref_ce_loss": 0.24376991391181946, "step": 1700 }, { "epoch": 0.5670446964643095, "loss": 1.4015324115753174, "step": 1700 }, { "ce_loss": 0.4085094928741455, "epoch": 0.5670446964643095, "step": 1700 }, { "distill_loss": 0.559135913848877, "epoch": 0.5670446964643095, "step": 1700 }, { "epoch": 0.5670446964643095, "ref_ce_loss": 0.28080520033836365, "step": 1700 }, { "epoch": 0.5703802535023349, "loss": 1.384, "step": 1710 }, { "epoch": 0.5703802535023349, "grad_norm": 1.3340486288070679, "step": 1710 }, { "epoch": 0.5703802535023349, "learning_rate": 0.0007984694982643927, "step": 1710 }, { "epoch": 0.5703802535023349, "loss": 1.5864113569259644, "step": 1710 }, { "ce_loss": 0.40390101075172424, "epoch": 0.5703802535023349, "step": 1710 }, { "distill_loss": 0.5704938173294067, "epoch": 0.5703802535023349, "step": 1710 }, { "epoch": 0.5703802535023349, "ref_ce_loss": 0.2445508986711502, "step": 1710 }, { "epoch": 0.5703802535023349, "loss": 1.1265581846237183, "step": 1710 }, { "ce_loss": 0.3849276304244995, "epoch": 0.5703802535023349, "step": 1710 }, { "distill_loss": 0.5128688812255859, "epoch": 0.5703802535023349, "step": 1710 }, { "epoch": 0.5703802535023349, "ref_ce_loss": 0.22824981808662415, "step": 1710 }, { "epoch": 0.5737158105403602, "loss": 1.4421, "step": 1720 }, { "epoch": 0.5737158105403602, "grad_norm": 1.9246883392333984, "step": 1720 }, { "epoch": 0.5737158105403602, "learning_rate": 0.0007984314996946303, "step": 1720 }, { "epoch": 0.5737158105403602, "loss": 1.3969639539718628, "step": 1720 }, { "ce_loss": 0.4116149842739105, "epoch": 0.5737158105403602, "step": 1720 }, { "distill_loss": 0.5273759961128235, "epoch": 0.5737158105403602, "step": 1720 }, { "epoch": 0.5737158105403602, "ref_ce_loss": 0.2239617109298706, "step": 1720 }, { "epoch": 0.5737158105403602, "loss": 1.1578283309936523, "step": 1720 }, { "ce_loss": 0.4013277292251587, "epoch": 0.5737158105403602, "step": 1720 }, { "distill_loss": 0.4725862145423889, "epoch": 0.5737158105403602, "step": 1720 }, { "epoch": 0.5737158105403602, "ref_ce_loss": 0.2722586989402771, "step": 1720 }, { "epoch": 0.5770513675783856, "loss": 1.5764, "step": 1730 }, { "epoch": 0.5770513675783856, "grad_norm": 2.31512188911438, "step": 1730 }, { "epoch": 0.5770513675783856, "learning_rate": 0.0007983930361124345, "step": 1730 }, { "epoch": 0.5770513675783856, "loss": 1.5932635068893433, "step": 1730 }, { "ce_loss": 0.41283050179481506, "epoch": 0.5770513675783856, "step": 1730 }, { "distill_loss": 0.5362582802772522, "epoch": 0.5770513675783856, "step": 1730 }, { "epoch": 0.5770513675783856, "ref_ce_loss": 0.29913270473480225, "step": 1730 }, { "epoch": 0.5770513675783856, "loss": 1.472170114517212, "step": 1730 }, { "ce_loss": 0.41566747426986694, "epoch": 0.5770513675783856, "step": 1730 }, { "distill_loss": 0.5674774050712585, "epoch": 0.5770513675783856, "step": 1730 }, { "epoch": 0.5770513675783856, "ref_ce_loss": 0.2518981099128723, "step": 1730 }, { "epoch": 0.580386924616411, "loss": 1.4327, "step": 1740 }, { "epoch": 0.580386924616411, "grad_norm": 1.5882205963134766, "step": 1740 }, { "epoch": 0.580386924616411, "learning_rate": 0.0007983541075626968, "step": 1740 }, { "epoch": 0.580386924616411, "loss": 1.265290379524231, "step": 1740 }, { "ce_loss": 0.3291006088256836, "epoch": 0.580386924616411, "step": 1740 }, { "distill_loss": 0.5921589136123657, "epoch": 0.580386924616411, "step": 1740 }, { "epoch": 0.580386924616411, "ref_ce_loss": 0.2453921139240265, "step": 1740 }, { "epoch": 0.580386924616411, "loss": 1.5062141418457031, "step": 1740 }, { "ce_loss": 0.4185219407081604, "epoch": 0.580386924616411, "step": 1740 }, { "distill_loss": 0.5556906461715698, "epoch": 0.580386924616411, "step": 1740 }, { "epoch": 0.580386924616411, "ref_ce_loss": 0.30186644196510315, "step": 1740 }, { "epoch": 0.5837224816544363, "loss": 1.4242, "step": 1750 }, { "epoch": 0.5837224816544363, "grad_norm": 1.4376288652420044, "step": 1750 }, { "epoch": 0.5837224816544363, "learning_rate": 0.000798314714090851, "step": 1750 }, { "epoch": 0.5837224816544363, "loss": 1.2823268175125122, "step": 1750 }, { "ce_loss": 0.37645918130874634, "epoch": 0.5837224816544363, "step": 1750 }, { "distill_loss": 0.4971107542514801, "epoch": 0.5837224816544363, "step": 1750 }, { "epoch": 0.5837224816544363, "ref_ce_loss": 0.28914013504981995, "step": 1750 }, { "epoch": 0.5837224816544363, "loss": 1.4173918962478638, "step": 1750 }, { "ce_loss": 0.4171852767467499, "epoch": 0.5837224816544363, "step": 1750 }, { "distill_loss": 0.5723685026168823, "epoch": 0.5837224816544363, "step": 1750 }, { "epoch": 0.5837224816544363, "ref_ce_loss": 0.3079443871974945, "step": 1750 }, { "epoch": 0.5870580386924616, "loss": 1.4064, "step": 1760 }, { "epoch": 0.5870580386924616, "grad_norm": 1.7479711771011353, "step": 1760 }, { "epoch": 0.5870580386924616, "learning_rate": 0.0007982748557428733, "step": 1760 }, { "epoch": 0.5870580386924616, "loss": 1.8940140008926392, "step": 1760 }, { "ce_loss": 0.4125586152076721, "epoch": 0.5870580386924616, "step": 1760 }, { "distill_loss": 0.5978971123695374, "epoch": 0.5870580386924616, "step": 1760 }, { "epoch": 0.5870580386924616, "ref_ce_loss": 0.21934141218662262, "step": 1760 }, { "epoch": 0.5870580386924616, "loss": 1.5041753053665161, "step": 1760 }, { "ce_loss": 0.45310303568840027, "epoch": 0.5870580386924616, "step": 1760 }, { "distill_loss": 0.599409818649292, "epoch": 0.5870580386924616, "step": 1760 }, { "epoch": 0.5870580386924616, "ref_ce_loss": 0.24095265567302704, "step": 1760 }, { "epoch": 0.590393595730487, "loss": 1.5753, "step": 1770 }, { "epoch": 0.590393595730487, "grad_norm": 3.1749627590179443, "step": 1770 }, { "epoch": 0.590393595730487, "learning_rate": 0.0007982345325652828, "step": 1770 }, { "epoch": 0.590393595730487, "loss": 1.2013965845108032, "step": 1770 }, { "ce_loss": 0.3765731453895569, "epoch": 0.590393595730487, "step": 1770 }, { "distill_loss": 0.5124651789665222, "epoch": 0.590393595730487, "step": 1770 }, { "epoch": 0.590393595730487, "ref_ce_loss": 0.24479584395885468, "step": 1770 }, { "epoch": 0.590393595730487, "loss": 1.5885907411575317, "step": 1770 }, { "ce_loss": 0.46473753452301025, "epoch": 0.590393595730487, "step": 1770 }, { "distill_loss": 0.5773957967758179, "epoch": 0.590393595730487, "step": 1770 }, { "epoch": 0.590393595730487, "ref_ce_loss": 0.24250906705856323, "step": 1770 }, { "epoch": 0.5937291527685123, "loss": 1.4574, "step": 1780 }, { "epoch": 0.5937291527685123, "grad_norm": 1.832212209701538, "step": 1780 }, { "epoch": 0.5937291527685123, "learning_rate": 0.0007981937446051412, "step": 1780 }, { "epoch": 0.5937291527685123, "loss": 1.673588752746582, "step": 1780 }, { "ce_loss": 0.3560979962348938, "epoch": 0.5937291527685123, "step": 1780 }, { "distill_loss": 0.47634315490722656, "epoch": 0.5937291527685123, "step": 1780 }, { "epoch": 0.5937291527685123, "ref_ce_loss": 0.23794607818126678, "step": 1780 }, { "epoch": 0.5937291527685123, "loss": 2.1981008052825928, "step": 1780 }, { "ce_loss": 0.40152403712272644, "epoch": 0.5937291527685123, "step": 1780 }, { "distill_loss": 0.5010473728179932, "epoch": 0.5937291527685123, "step": 1780 }, { "epoch": 0.5937291527685123, "ref_ce_loss": 0.3065120577812195, "step": 1780 }, { "epoch": 0.5970647098065377, "loss": 1.4009, "step": 1790 }, { "epoch": 0.5970647098065377, "grad_norm": 1.4549444913864136, "step": 1790 }, { "epoch": 0.5970647098065377, "learning_rate": 0.0007981524919100519, "step": 1790 }, { "epoch": 0.5970647098065377, "loss": 1.4824278354644775, "step": 1790 }, { "ce_loss": 0.4323318302631378, "epoch": 0.5970647098065377, "step": 1790 }, { "distill_loss": 0.6305601596832275, "epoch": 0.5970647098065377, "step": 1790 }, { "epoch": 0.5970647098065377, "ref_ce_loss": 0.26681867241859436, "step": 1790 }, { "epoch": 0.5970647098065377, "loss": 1.265123963356018, "step": 1790 }, { "ce_loss": 0.44419583678245544, "epoch": 0.5970647098065377, "step": 1790 }, { "distill_loss": 0.5172229409217834, "epoch": 0.5970647098065377, "step": 1790 }, { "epoch": 0.5970647098065377, "ref_ce_loss": 0.3028353452682495, "step": 1790 }, { "epoch": 0.600400266844563, "loss": 1.4739, "step": 1800 }, { "epoch": 0.600400266844563, "grad_norm": 1.548317551612854, "step": 1800 }, { "epoch": 0.600400266844563, "learning_rate": 0.0007981107745281618, "step": 1800 }, { "epoch": 0.600400266844563, "loss": 1.3120996952056885, "step": 1800 }, { "ce_loss": 0.3586893379688263, "epoch": 0.600400266844563, "step": 1800 }, { "distill_loss": 0.6148962378501892, "epoch": 0.600400266844563, "step": 1800 }, { "epoch": 0.600400266844563, "ref_ce_loss": 0.2554745078086853, "step": 1800 }, { "epoch": 0.600400266844563, "loss": 1.297042727470398, "step": 1800 }, { "ce_loss": 0.41965043544769287, "epoch": 0.600400266844563, "step": 1800 }, { "distill_loss": 0.5431655049324036, "epoch": 0.600400266844563, "step": 1800 }, { "epoch": 0.600400266844563, "ref_ce_loss": 0.23221886157989502, "step": 1800 }, { "epoch": 0.6037358238825884, "loss": 1.4416, "step": 1810 }, { "epoch": 0.6037358238825884, "grad_norm": 2.4861154556274414, "step": 1810 }, { "epoch": 0.6037358238825884, "learning_rate": 0.0007980685925081592, "step": 1810 }, { "epoch": 0.6037358238825884, "loss": 1.3630973100662231, "step": 1810 }, { "ce_loss": 0.3965637981891632, "epoch": 0.6037358238825884, "step": 1810 }, { "distill_loss": 0.5065696835517883, "epoch": 0.6037358238825884, "step": 1810 }, { "epoch": 0.6037358238825884, "ref_ce_loss": 0.2848835587501526, "step": 1810 }, { "epoch": 0.6037358238825884, "loss": 1.27780282497406, "step": 1810 }, { "ce_loss": 0.4385015368461609, "epoch": 0.6037358238825884, "step": 1810 }, { "distill_loss": 0.5412667393684387, "epoch": 0.6037358238825884, "step": 1810 }, { "epoch": 0.6037358238825884, "ref_ce_loss": 0.2979799807071686, "step": 1810 }, { "epoch": 0.6070713809206137, "loss": 1.3629, "step": 1820 }, { "epoch": 0.6070713809206137, "grad_norm": 1.543222427368164, "step": 1820 }, { "epoch": 0.6070713809206137, "learning_rate": 0.0007980259458992752, "step": 1820 }, { "epoch": 0.6070713809206137, "loss": 1.8371226787567139, "step": 1820 }, { "ce_loss": 0.384300172328949, "epoch": 0.6070713809206137, "step": 1820 }, { "distill_loss": 0.5954495668411255, "epoch": 0.6070713809206137, "step": 1820 }, { "epoch": 0.6070713809206137, "ref_ce_loss": 0.26500630378723145, "step": 1820 }, { "epoch": 0.6070713809206137, "loss": 1.5840888023376465, "step": 1820 }, { "ce_loss": 0.48104703426361084, "epoch": 0.6070713809206137, "step": 1820 }, { "distill_loss": 0.47974202036857605, "epoch": 0.6070713809206137, "step": 1820 }, { "epoch": 0.6070713809206137, "ref_ce_loss": 0.31398510932922363, "step": 1820 }, { "epoch": 0.6104069379586391, "loss": 1.4797, "step": 1830 }, { "epoch": 0.6104069379586391, "grad_norm": 2.317809581756592, "step": 1830 }, { "epoch": 0.6104069379586391, "learning_rate": 0.0007979828347512831, "step": 1830 }, { "epoch": 0.6104069379586391, "loss": 1.9281671047210693, "step": 1830 }, { "ce_loss": 0.4327715039253235, "epoch": 0.6104069379586391, "step": 1830 }, { "distill_loss": 0.5266700983047485, "epoch": 0.6104069379586391, "step": 1830 }, { "epoch": 0.6104069379586391, "ref_ce_loss": 0.3022516965866089, "step": 1830 }, { "epoch": 0.6104069379586391, "loss": 1.3074755668640137, "step": 1830 }, { "ce_loss": 0.41616860032081604, "epoch": 0.6104069379586391, "step": 1830 }, { "distill_loss": 0.5696160197257996, "epoch": 0.6104069379586391, "step": 1830 }, { "epoch": 0.6104069379586391, "ref_ce_loss": 0.32133573293685913, "step": 1830 }, { "epoch": 0.6137424949966644, "loss": 1.4541, "step": 1840 }, { "epoch": 0.6137424949966644, "grad_norm": 1.9975054264068604, "step": 1840 }, { "epoch": 0.6137424949966644, "learning_rate": 0.000797939259114498, "step": 1840 }, { "epoch": 0.6137424949966644, "loss": 1.5424506664276123, "step": 1840 }, { "ce_loss": 0.4475153088569641, "epoch": 0.6137424949966644, "step": 1840 }, { "distill_loss": 0.5498025417327881, "epoch": 0.6137424949966644, "step": 1840 }, { "epoch": 0.6137424949966644, "ref_ce_loss": 0.28382542729377747, "step": 1840 }, { "epoch": 0.6137424949966644, "loss": 1.2897108793258667, "step": 1840 }, { "ce_loss": 0.42207860946655273, "epoch": 0.6137424949966644, "step": 1840 }, { "distill_loss": 0.5946893692016602, "epoch": 0.6137424949966644, "step": 1840 }, { "epoch": 0.6137424949966644, "ref_ce_loss": 0.27271324396133423, "step": 1840 }, { "epoch": 0.6170780520346898, "loss": 1.5766, "step": 1850 }, { "epoch": 0.6170780520346898, "grad_norm": 1.6964223384857178, "step": 1850 }, { "epoch": 0.6170780520346898, "learning_rate": 0.0007978952190397774, "step": 1850 }, { "epoch": 0.6170780520346898, "loss": 1.7818580865859985, "step": 1850 }, { "ce_loss": 0.4346107542514801, "epoch": 0.6170780520346898, "step": 1850 }, { "distill_loss": 0.5910945534706116, "epoch": 0.6170780520346898, "step": 1850 }, { "epoch": 0.6170780520346898, "ref_ce_loss": 0.30669838190078735, "step": 1850 }, { "epoch": 0.6170780520346898, "loss": 1.18301260471344, "step": 1850 }, { "ce_loss": 0.4067380130290985, "epoch": 0.6170780520346898, "step": 1850 }, { "distill_loss": 0.5019524097442627, "epoch": 0.6170780520346898, "step": 1850 }, { "epoch": 0.6170780520346898, "ref_ce_loss": 0.2735179662704468, "step": 1850 }, { "epoch": 0.6204136090727151, "loss": 1.4075, "step": 1860 }, { "epoch": 0.6204136090727151, "grad_norm": 1.5153988599777222, "step": 1860 }, { "epoch": 0.6204136090727151, "learning_rate": 0.000797850714578521, "step": 1860 }, { "epoch": 0.6204136090727151, "loss": 1.3955501317977905, "step": 1860 }, { "ce_loss": 0.39396870136260986, "epoch": 0.6204136090727151, "step": 1860 }, { "distill_loss": 0.5631558895111084, "epoch": 0.6204136090727151, "step": 1860 }, { "epoch": 0.6204136090727151, "ref_ce_loss": 0.3130049705505371, "step": 1860 }, { "epoch": 0.6204136090727151, "loss": 2.127243995666504, "step": 1860 }, { "ce_loss": 0.41338804364204407, "epoch": 0.6204136090727151, "step": 1860 }, { "distill_loss": 0.5591651201248169, "epoch": 0.6204136090727151, "step": 1860 }, { "epoch": 0.6204136090727151, "ref_ce_loss": 0.29337289929389954, "step": 1860 }, { "epoch": 0.6237491661107405, "loss": 1.5559, "step": 1870 }, { "epoch": 0.6237491661107405, "grad_norm": 1.728464961051941, "step": 1870 }, { "epoch": 0.6237491661107405, "learning_rate": 0.0007978057457826702, "step": 1870 }, { "epoch": 0.6237491661107405, "loss": 1.0223857164382935, "step": 1870 }, { "ce_loss": 0.31188276410102844, "epoch": 0.6237491661107405, "step": 1870 }, { "distill_loss": 0.4333324432373047, "epoch": 0.6237491661107405, "step": 1870 }, { "epoch": 0.6237491661107405, "ref_ce_loss": 0.18637055158615112, "step": 1870 }, { "epoch": 0.6237491661107405, "loss": 1.4168239831924438, "step": 1870 }, { "ce_loss": 0.45807886123657227, "epoch": 0.6237491661107405, "step": 1870 }, { "distill_loss": 0.5271170139312744, "epoch": 0.6237491661107405, "step": 1870 }, { "epoch": 0.6237491661107405, "ref_ce_loss": 0.3137466013431549, "step": 1870 }, { "epoch": 0.6270847231487658, "loss": 1.4226, "step": 1880 }, { "epoch": 0.6270847231487658, "grad_norm": 2.060706853866577, "step": 1880 }, { "epoch": 0.6270847231487658, "learning_rate": 0.0007977603127047084, "step": 1880 }, { "epoch": 0.6270847231487658, "loss": 1.2071475982666016, "step": 1880 }, { "ce_loss": 0.3951813280582428, "epoch": 0.6270847231487658, "step": 1880 }, { "distill_loss": 0.504381000995636, "epoch": 0.6270847231487658, "step": 1880 }, { "epoch": 0.6270847231487658, "ref_ce_loss": 0.3060469329357147, "step": 1880 }, { "epoch": 0.6270847231487658, "loss": 1.1786348819732666, "step": 1880 }, { "ce_loss": 0.3081275522708893, "epoch": 0.6270847231487658, "step": 1880 }, { "distill_loss": 0.5031088590621948, "epoch": 0.6270847231487658, "step": 1880 }, { "epoch": 0.6270847231487658, "ref_ce_loss": 0.2622256577014923, "step": 1880 }, { "epoch": 0.6304202801867912, "loss": 1.4745, "step": 1890 }, { "epoch": 0.6304202801867912, "grad_norm": 3.1443939208984375, "step": 1890 }, { "epoch": 0.6304202801867912, "learning_rate": 0.0007977144153976608, "step": 1890 }, { "epoch": 0.6304202801867912, "loss": 1.4169859886169434, "step": 1890 }, { "ce_loss": 0.4933745265007019, "epoch": 0.6304202801867912, "step": 1890 }, { "distill_loss": 0.5657783150672913, "epoch": 0.6304202801867912, "step": 1890 }, { "epoch": 0.6304202801867912, "ref_ce_loss": 0.25525254011154175, "step": 1890 }, { "epoch": 0.6304202801867912, "loss": 1.5357770919799805, "step": 1890 }, { "ce_loss": 0.31528544425964355, "epoch": 0.6304202801867912, "step": 1890 }, { "distill_loss": 0.5158506631851196, "epoch": 0.6304202801867912, "step": 1890 }, { "epoch": 0.6304202801867912, "ref_ce_loss": 0.19878283143043518, "step": 1890 }, { "epoch": 0.6337558372248165, "loss": 1.3136, "step": 1900 }, { "epoch": 0.6337558372248165, "grad_norm": 1.7846509218215942, "step": 1900 }, { "epoch": 0.6337558372248165, "learning_rate": 0.0007976680539150947, "step": 1900 }, { "epoch": 0.6337558372248165, "loss": 1.2587743997573853, "step": 1900 }, { "ce_loss": 0.40194782614707947, "epoch": 0.6337558372248165, "step": 1900 }, { "distill_loss": 0.5084041357040405, "epoch": 0.6337558372248165, "step": 1900 }, { "epoch": 0.6337558372248165, "ref_ce_loss": 0.25532692670822144, "step": 1900 }, { "epoch": 0.6337558372248165, "loss": 1.2736737728118896, "step": 1900 }, { "ce_loss": 0.45652300119400024, "epoch": 0.6337558372248165, "step": 1900 }, { "distill_loss": 0.5367364883422852, "epoch": 0.6337558372248165, "step": 1900 }, { "epoch": 0.6337558372248165, "ref_ce_loss": 0.28010380268096924, "step": 1900 }, { "epoch": 0.6370913942628419, "loss": 1.5951, "step": 1910 }, { "epoch": 0.6370913942628419, "grad_norm": 2.80888032913208, "step": 1910 }, { "epoch": 0.6370913942628419, "learning_rate": 0.0007976212283111187, "step": 1910 }, { "epoch": 0.6370913942628419, "loss": 1.9592828750610352, "step": 1910 }, { "ce_loss": 0.416324645280838, "epoch": 0.6370913942628419, "step": 1910 }, { "distill_loss": 0.6871696710586548, "epoch": 0.6370913942628419, "step": 1910 }, { "epoch": 0.6370913942628419, "ref_ce_loss": 0.3213195502758026, "step": 1910 }, { "epoch": 0.6370913942628419, "loss": 1.4825496673583984, "step": 1910 }, { "ce_loss": 0.4488929510116577, "epoch": 0.6370913942628419, "step": 1910 }, { "distill_loss": 0.6601699590682983, "epoch": 0.6370913942628419, "step": 1910 }, { "epoch": 0.6370913942628419, "ref_ce_loss": 0.294272780418396, "step": 1910 }, { "epoch": 0.6404269513008672, "loss": 1.5307, "step": 1920 }, { "epoch": 0.6404269513008672, "grad_norm": 1.9157191514968872, "step": 1920 }, { "epoch": 0.6404269513008672, "learning_rate": 0.0007975739386403835, "step": 1920 }, { "epoch": 0.6404269513008672, "loss": 1.4704571962356567, "step": 1920 }, { "ce_loss": 0.4426376521587372, "epoch": 0.6404269513008672, "step": 1920 }, { "distill_loss": 0.6234824061393738, "epoch": 0.6404269513008672, "step": 1920 }, { "epoch": 0.6404269513008672, "ref_ce_loss": 0.32547807693481445, "step": 1920 }, { "epoch": 0.6404269513008672, "loss": 2.2691290378570557, "step": 1920 }, { "ce_loss": 0.46476539969444275, "epoch": 0.6404269513008672, "step": 1920 }, { "distill_loss": 0.6577616333961487, "epoch": 0.6404269513008672, "step": 1920 }, { "epoch": 0.6404269513008672, "ref_ce_loss": 0.30368247628211975, "step": 1920 }, { "epoch": 0.6437625083388926, "loss": 1.5061, "step": 1930 }, { "epoch": 0.6437625083388926, "grad_norm": 2.470832109451294, "step": 1930 }, { "epoch": 0.6437625083388926, "learning_rate": 0.0007975261849580813, "step": 1930 }, { "epoch": 0.6437625083388926, "loss": 1.0962388515472412, "step": 1930 }, { "ce_loss": 0.33751609921455383, "epoch": 0.6437625083388926, "step": 1930 }, { "distill_loss": 0.5582593679428101, "epoch": 0.6437625083388926, "step": 1930 }, { "epoch": 0.6437625083388926, "ref_ce_loss": 0.20032666623592377, "step": 1930 }, { "epoch": 0.6437625083388926, "loss": 1.4692728519439697, "step": 1930 }, { "ce_loss": 0.39003822207450867, "epoch": 0.6437625083388926, "step": 1930 }, { "distill_loss": 0.5492802858352661, "epoch": 0.6437625083388926, "step": 1930 }, { "epoch": 0.6437625083388926, "ref_ce_loss": 0.26024729013442993, "step": 1930 }, { "epoch": 0.6470980653769179, "loss": 1.4432, "step": 1940 }, { "epoch": 0.6470980653769179, "grad_norm": 1.8819775581359863, "step": 1940 }, { "epoch": 0.6470980653769179, "learning_rate": 0.0007974779673199456, "step": 1940 }, { "epoch": 0.6470980653769179, "loss": 1.2645516395568848, "step": 1940 }, { "ce_loss": 0.40579256415367126, "epoch": 0.6470980653769179, "step": 1940 }, { "distill_loss": 0.4985727369785309, "epoch": 0.6470980653769179, "step": 1940 }, { "epoch": 0.6470980653769179, "ref_ce_loss": 0.25948283076286316, "step": 1940 }, { "epoch": 0.6470980653769179, "loss": 1.2759686708450317, "step": 1940 }, { "ce_loss": 0.3738131821155548, "epoch": 0.6470980653769179, "step": 1940 }, { "distill_loss": 0.46578264236450195, "epoch": 0.6470980653769179, "step": 1940 }, { "epoch": 0.6470980653769179, "ref_ce_loss": 0.2944307029247284, "step": 1940 }, { "epoch": 0.6504336224149433, "loss": 1.3298, "step": 1950 }, { "epoch": 0.6504336224149433, "grad_norm": 1.5547473430633545, "step": 1950 }, { "epoch": 0.6504336224149433, "learning_rate": 0.0007974292857822515, "step": 1950 }, { "epoch": 0.6504336224149433, "loss": 1.2107421159744263, "step": 1950 }, { "ce_loss": 0.4242013096809387, "epoch": 0.6504336224149433, "step": 1950 }, { "distill_loss": 0.497577965259552, "epoch": 0.6504336224149433, "step": 1950 }, { "epoch": 0.6504336224149433, "ref_ce_loss": 0.2884838879108429, "step": 1950 }, { "epoch": 0.6504336224149433, "loss": 1.2990264892578125, "step": 1950 }, { "ce_loss": 0.38576236367225647, "epoch": 0.6504336224149433, "step": 1950 }, { "distill_loss": 0.5003119707107544, "epoch": 0.6504336224149433, "step": 1950 }, { "epoch": 0.6504336224149433, "ref_ce_loss": 0.27664613723754883, "step": 1950 }, { "epoch": 0.6537691794529686, "loss": 1.3588, "step": 1960 }, { "epoch": 0.6537691794529686, "grad_norm": 1.5756919384002686, "step": 1960 }, { "epoch": 0.6537691794529686, "learning_rate": 0.0007973801404018158, "step": 1960 }, { "epoch": 0.6537691794529686, "loss": 1.676820158958435, "step": 1960 }, { "ce_loss": 0.45319217443466187, "epoch": 0.6537691794529686, "step": 1960 }, { "distill_loss": 0.6395261883735657, "epoch": 0.6537691794529686, "step": 1960 }, { "epoch": 0.6537691794529686, "ref_ce_loss": 0.28354665637016296, "step": 1960 }, { "epoch": 0.6537691794529686, "loss": 1.5755594968795776, "step": 1960 }, { "ce_loss": 0.35511845350265503, "epoch": 0.6537691794529686, "step": 1960 }, { "distill_loss": 0.5487231016159058, "epoch": 0.6537691794529686, "step": 1960 }, { "epoch": 0.6537691794529686, "ref_ce_loss": 0.25075337290763855, "step": 1960 }, { "epoch": 0.657104736490994, "loss": 1.4525, "step": 1970 }, { "epoch": 0.657104736490994, "grad_norm": 1.6834571361541748, "step": 1970 }, { "epoch": 0.657104736490994, "learning_rate": 0.0007973305312359964, "step": 1970 }, { "epoch": 0.657104736490994, "loss": 1.1696513891220093, "step": 1970 }, { "ce_loss": 0.4126241207122803, "epoch": 0.657104736490994, "step": 1970 }, { "distill_loss": 0.538031816482544, "epoch": 0.657104736490994, "step": 1970 }, { "epoch": 0.657104736490994, "ref_ce_loss": 0.21698075532913208, "step": 1970 }, { "epoch": 0.657104736490994, "loss": 2.0540080070495605, "step": 1970 }, { "ce_loss": 0.5341470837593079, "epoch": 0.657104736490994, "step": 1970 }, { "distill_loss": 0.6686353087425232, "epoch": 0.657104736490994, "step": 1970 }, { "epoch": 0.657104736490994, "ref_ce_loss": 0.27718207240104675, "step": 1970 }, { "epoch": 0.6604402935290193, "loss": 1.4011, "step": 1980 }, { "epoch": 0.6604402935290193, "grad_norm": 1.7924296855926514, "step": 1980 }, { "epoch": 0.6604402935290193, "learning_rate": 0.0007972804583426926, "step": 1980 }, { "epoch": 0.6604402935290193, "loss": 1.3130066394805908, "step": 1980 }, { "ce_loss": 0.38297760486602783, "epoch": 0.6604402935290193, "step": 1980 }, { "distill_loss": 0.536170244216919, "epoch": 0.6604402935290193, "step": 1980 }, { "epoch": 0.6604402935290193, "ref_ce_loss": 0.28933531045913696, "step": 1980 }, { "epoch": 0.6604402935290193, "loss": 1.7251466512680054, "step": 1980 }, { "ce_loss": 0.3758784532546997, "epoch": 0.6604402935290193, "step": 1980 }, { "distill_loss": 0.5668622255325317, "epoch": 0.6604402935290193, "step": 1980 }, { "epoch": 0.6604402935290193, "ref_ce_loss": 0.23983514308929443, "step": 1980 }, { "epoch": 0.6637758505670447, "loss": 1.3723, "step": 1990 }, { "epoch": 0.6637758505670447, "grad_norm": 2.775648832321167, "step": 1990 }, { "epoch": 0.6637758505670447, "learning_rate": 0.0007972299217803446, "step": 1990 }, { "epoch": 0.6637758505670447, "loss": 1.8384811878204346, "step": 1990 }, { "ce_loss": 0.4294074773788452, "epoch": 0.6637758505670447, "step": 1990 }, { "distill_loss": 0.5050294399261475, "epoch": 0.6637758505670447, "step": 1990 }, { "epoch": 0.6637758505670447, "ref_ce_loss": 0.25213590264320374, "step": 1990 }, { "epoch": 0.6637758505670447, "loss": 1.1137229204177856, "step": 1990 }, { "ce_loss": 0.3956092596054077, "epoch": 0.6637758505670447, "step": 1990 }, { "distill_loss": 0.4502190947532654, "epoch": 0.6637758505670447, "step": 1990 }, { "epoch": 0.6637758505670447, "ref_ce_loss": 0.26725438237190247, "step": 1990 }, { "epoch": 0.66711140760507, "loss": 1.3661, "step": 2000 }, { "epoch": 0.66711140760507, "grad_norm": 1.7431342601776123, "step": 2000 }, { "epoch": 0.66711140760507, "learning_rate": 0.0007971789216079343, "step": 2000 }, { "epoch": 0.66711140760507, "loss": 1.351794719696045, "step": 2000 }, { "ce_loss": 0.38261255621910095, "epoch": 0.66711140760507, "step": 2000 }, { "distill_loss": 0.5665090084075928, "epoch": 0.66711140760507, "step": 2000 }, { "epoch": 0.66711140760507, "ref_ce_loss": 0.2890423834323883, "step": 2000 }, { "epoch": 0.66711140760507, "loss": 1.8742589950561523, "step": 2000 }, { "ce_loss": 0.4540679454803467, "epoch": 0.66711140760507, "step": 2000 }, { "distill_loss": 0.4872954487800598, "epoch": 0.66711140760507, "step": 2000 }, { "epoch": 0.66711140760507, "ref_ce_loss": 0.27717703580856323, "step": 2000 }, { "epoch": 0.6704469646430954, "loss": 1.3747, "step": 2010 }, { "epoch": 0.6704469646430954, "grad_norm": 1.655342698097229, "step": 2010 }, { "epoch": 0.6704469646430954, "learning_rate": 0.0007971274578849843, "step": 2010 }, { "epoch": 0.6704469646430954, "loss": 1.5908936262130737, "step": 2010 }, { "ce_loss": 0.29928529262542725, "epoch": 0.6704469646430954, "step": 2010 }, { "distill_loss": 0.5743173956871033, "epoch": 0.6704469646430954, "step": 2010 }, { "epoch": 0.6704469646430954, "ref_ce_loss": 0.2413429170846939, "step": 2010 }, { "epoch": 0.6704469646430954, "loss": 1.7414389848709106, "step": 2010 }, { "ce_loss": 0.3730257451534271, "epoch": 0.6704469646430954, "step": 2010 }, { "distill_loss": 0.5815277099609375, "epoch": 0.6704469646430954, "step": 2010 }, { "epoch": 0.6704469646430954, "ref_ce_loss": 0.2591405510902405, "step": 2010 }, { "epoch": 0.6737825216811207, "loss": 1.5379, "step": 2020 }, { "epoch": 0.6737825216811207, "grad_norm": 2.2649941444396973, "step": 2020 }, { "epoch": 0.6737825216811207, "learning_rate": 0.0007970755306715582, "step": 2020 }, { "epoch": 0.6737825216811207, "loss": 1.2894701957702637, "step": 2020 }, { "ce_loss": 0.4230029582977295, "epoch": 0.6737825216811207, "step": 2020 }, { "distill_loss": 0.5699641108512878, "epoch": 0.6737825216811207, "step": 2020 }, { "epoch": 0.6737825216811207, "ref_ce_loss": 0.29636257886886597, "step": 2020 }, { "epoch": 0.6737825216811207, "loss": 1.218937635421753, "step": 2020 }, { "ce_loss": 0.3319122791290283, "epoch": 0.6737825216811207, "step": 2020 }, { "distill_loss": 0.4558425545692444, "epoch": 0.6737825216811207, "step": 2020 }, { "epoch": 0.6737825216811207, "ref_ce_loss": 0.2575785219669342, "step": 2020 }, { "epoch": 0.6771180787191461, "loss": 1.3827, "step": 2030 }, { "epoch": 0.6771180787191461, "grad_norm": 3.790936231613159, "step": 2030 }, { "epoch": 0.6771180787191461, "learning_rate": 0.0007970231400282608, "step": 2030 }, { "epoch": 0.6771180787191461, "loss": 1.5204871892929077, "step": 2030 }, { "ce_loss": 0.41601571440696716, "epoch": 0.6771180787191461, "step": 2030 }, { "distill_loss": 0.5780538320541382, "epoch": 0.6771180787191461, "step": 2030 }, { "epoch": 0.6771180787191461, "ref_ce_loss": 0.295123815536499, "step": 2030 }, { "epoch": 0.6771180787191461, "loss": 1.165385365486145, "step": 2030 }, { "ce_loss": 0.3634999394416809, "epoch": 0.6771180787191461, "step": 2030 }, { "distill_loss": 0.5556265115737915, "epoch": 0.6771180787191461, "step": 2030 }, { "epoch": 0.6771180787191461, "ref_ce_loss": 0.2461409717798233, "step": 2030 }, { "epoch": 0.6804536357571714, "loss": 1.357, "step": 2040 }, { "epoch": 0.6804536357571714, "grad_norm": 1.3283863067626953, "step": 2040 }, { "epoch": 0.6804536357571714, "learning_rate": 0.0007969702860162373, "step": 2040 }, { "epoch": 0.6804536357571714, "loss": 1.5245931148529053, "step": 2040 }, { "ce_loss": 0.44471654295921326, "epoch": 0.6804536357571714, "step": 2040 }, { "distill_loss": 0.5222198963165283, "epoch": 0.6804536357571714, "step": 2040 }, { "epoch": 0.6804536357571714, "ref_ce_loss": 0.34963709115982056, "step": 2040 }, { "epoch": 0.6804536357571714, "loss": 1.142379641532898, "step": 2040 }, { "ce_loss": 0.33912596106529236, "epoch": 0.6804536357571714, "step": 2040 }, { "distill_loss": 0.5050538778305054, "epoch": 0.6804536357571714, "step": 2040 }, { "epoch": 0.6804536357571714, "ref_ce_loss": 0.22374626994132996, "step": 2040 }, { "epoch": 0.6837891927951968, "loss": 1.4121, "step": 2050 }, { "epoch": 0.6837891927951968, "grad_norm": 1.3487452268600464, "step": 2050 }, { "epoch": 0.6837891927951968, "learning_rate": 0.0007969169686971745, "step": 2050 }, { "epoch": 0.6837891927951968, "loss": 2.050325870513916, "step": 2050 }, { "ce_loss": 0.38634929060935974, "epoch": 0.6837891927951968, "step": 2050 }, { "distill_loss": 0.4903668463230133, "epoch": 0.6837891927951968, "step": 2050 }, { "epoch": 0.6837891927951968, "ref_ce_loss": 0.23304055631160736, "step": 2050 }, { "epoch": 0.6837891927951968, "loss": 0.9491183757781982, "step": 2050 }, { "ce_loss": 0.31107857823371887, "epoch": 0.6837891927951968, "step": 2050 }, { "distill_loss": 0.4289850890636444, "epoch": 0.6837891927951968, "step": 2050 }, { "epoch": 0.6837891927951968, "ref_ce_loss": 0.2086729258298874, "step": 2050 }, { "epoch": 0.6871247498332221, "loss": 1.5204, "step": 2060 }, { "epoch": 0.6871247498332221, "grad_norm": 2.084027051925659, "step": 2060 }, { "epoch": 0.6871247498332221, "learning_rate": 0.000796863188133299, "step": 2060 }, { "epoch": 0.6871247498332221, "loss": 1.198270320892334, "step": 2060 }, { "ce_loss": 0.3653385043144226, "epoch": 0.6871247498332221, "step": 2060 }, { "distill_loss": 0.5144132375717163, "epoch": 0.6871247498332221, "step": 2060 }, { "epoch": 0.6871247498332221, "ref_ce_loss": 0.3181459307670593, "step": 2060 }, { "epoch": 0.6871247498332221, "loss": 1.4320412874221802, "step": 2060 }, { "ce_loss": 0.40848636627197266, "epoch": 0.6871247498332221, "step": 2060 }, { "distill_loss": 0.5593544840812683, "epoch": 0.6871247498332221, "step": 2060 }, { "epoch": 0.6871247498332221, "ref_ce_loss": 0.2513773441314697, "step": 2060 }, { "epoch": 0.6904603068712475, "loss": 1.3781, "step": 2070 }, { "epoch": 0.6904603068712475, "grad_norm": 1.3339126110076904, "step": 2070 }, { "epoch": 0.6904603068712475, "learning_rate": 0.0007968089443873788, "step": 2070 }, { "epoch": 0.6904603068712475, "loss": 1.4610631465911865, "step": 2070 }, { "ce_loss": 0.44683295488357544, "epoch": 0.6904603068712475, "step": 2070 }, { "distill_loss": 0.5647901296615601, "epoch": 0.6904603068712475, "step": 2070 }, { "epoch": 0.6904603068712475, "ref_ce_loss": 0.3352224826812744, "step": 2070 }, { "epoch": 0.6904603068712475, "loss": 1.8598355054855347, "step": 2070 }, { "ce_loss": 0.4054355323314667, "epoch": 0.6904603068712475, "step": 2070 }, { "distill_loss": 0.5576649904251099, "epoch": 0.6904603068712475, "step": 2070 }, { "epoch": 0.6904603068712475, "ref_ce_loss": 0.2876647412776947, "step": 2070 }, { "epoch": 0.6937958639092728, "loss": 1.5379, "step": 2080 }, { "epoch": 0.6937958639092728, "grad_norm": 1.976668357849121, "step": 2080 }, { "epoch": 0.6937958639092728, "learning_rate": 0.000796754237522722, "step": 2080 }, { "epoch": 0.6937958639092728, "loss": 1.265521764755249, "step": 2080 }, { "ce_loss": 0.3824961483478546, "epoch": 0.6937958639092728, "step": 2080 }, { "distill_loss": 0.543056070804596, "epoch": 0.6937958639092728, "step": 2080 }, { "epoch": 0.6937958639092728, "ref_ce_loss": 0.2611978352069855, "step": 2080 }, { "epoch": 0.6937958639092728, "loss": 1.7373197078704834, "step": 2080 }, { "ce_loss": 0.4199475944042206, "epoch": 0.6937958639092728, "step": 2080 }, { "distill_loss": 0.6220147609710693, "epoch": 0.6937958639092728, "step": 2080 }, { "epoch": 0.6937958639092728, "ref_ce_loss": 0.2801419198513031, "step": 2080 }, { "epoch": 0.6971314209472982, "loss": 1.4568, "step": 2090 }, { "epoch": 0.6971314209472982, "grad_norm": 2.351896047592163, "step": 2090 }, { "epoch": 0.6971314209472982, "learning_rate": 0.0007966990676031776, "step": 2090 }, { "epoch": 0.6971314209472982, "loss": 1.1849398612976074, "step": 2090 }, { "ce_loss": 0.34545114636421204, "epoch": 0.6971314209472982, "step": 2090 }, { "distill_loss": 0.5206227898597717, "epoch": 0.6971314209472982, "step": 2090 }, { "epoch": 0.6971314209472982, "ref_ce_loss": 0.23222720623016357, "step": 2090 }, { "epoch": 0.6971314209472982, "loss": 1.408402919769287, "step": 2090 }, { "ce_loss": 0.38856178522109985, "epoch": 0.6971314209472982, "step": 2090 }, { "distill_loss": 0.6217367053031921, "epoch": 0.6971314209472982, "step": 2090 }, { "epoch": 0.6971314209472982, "ref_ce_loss": 0.29724544286727905, "step": 2090 }, { "epoch": 0.7004669779853235, "loss": 1.409, "step": 2100 }, { "epoch": 0.7004669779853235, "grad_norm": 1.8904672861099243, "step": 2100 }, { "epoch": 0.7004669779853235, "learning_rate": 0.0007966434346931348, "step": 2100 }, { "epoch": 0.7004669779853235, "loss": 1.664953351020813, "step": 2100 }, { "ce_loss": 0.33795663714408875, "epoch": 0.7004669779853235, "step": 2100 }, { "distill_loss": 0.5694255828857422, "epoch": 0.7004669779853235, "step": 2100 }, { "epoch": 0.7004669779853235, "ref_ce_loss": 0.24972786009311676, "step": 2100 }, { "epoch": 0.7004669779853235, "loss": 1.4305306673049927, "step": 2100 }, { "ce_loss": 0.37841373682022095, "epoch": 0.7004669779853235, "step": 2100 }, { "distill_loss": 0.4339869022369385, "epoch": 0.7004669779853235, "step": 2100 }, { "epoch": 0.7004669779853235, "ref_ce_loss": 0.32812896370887756, "step": 2100 }, { "epoch": 0.7038025350233489, "loss": 1.4373, "step": 2110 }, { "epoch": 0.7038025350233489, "grad_norm": 2.0401644706726074, "step": 2110 }, { "epoch": 0.7038025350233489, "learning_rate": 0.000796587338857523, "step": 2110 }, { "epoch": 0.7038025350233489, "loss": 1.8138028383255005, "step": 2110 }, { "ce_loss": 0.45140936970710754, "epoch": 0.7038025350233489, "step": 2110 }, { "distill_loss": 0.6471231579780579, "epoch": 0.7038025350233489, "step": 2110 }, { "epoch": 0.7038025350233489, "ref_ce_loss": 0.29433050751686096, "step": 2110 }, { "epoch": 0.7038025350233489, "loss": 1.4534084796905518, "step": 2110 }, { "ce_loss": 0.4121268689632416, "epoch": 0.7038025350233489, "step": 2110 }, { "distill_loss": 0.5959084630012512, "epoch": 0.7038025350233489, "step": 2110 }, { "epoch": 0.7038025350233489, "ref_ce_loss": 0.24375709891319275, "step": 2110 }, { "epoch": 0.7071380920613742, "loss": 1.4983, "step": 2120 }, { "epoch": 0.7071380920613742, "grad_norm": 2.243586301803589, "step": 2120 }, { "epoch": 0.7071380920613742, "learning_rate": 0.000796530780161812, "step": 2120 }, { "epoch": 0.7071380920613742, "loss": 1.4822847843170166, "step": 2120 }, { "ce_loss": 0.4244216978549957, "epoch": 0.7071380920613742, "step": 2120 }, { "distill_loss": 0.5842169523239136, "epoch": 0.7071380920613742, "step": 2120 }, { "epoch": 0.7071380920613742, "ref_ce_loss": 0.27704426646232605, "step": 2120 }, { "epoch": 0.7071380920613742, "loss": 1.61923086643219, "step": 2120 }, { "ce_loss": 0.4382035434246063, "epoch": 0.7071380920613742, "step": 2120 }, { "distill_loss": 0.45522329211235046, "epoch": 0.7071380920613742, "step": 2120 }, { "epoch": 0.7071380920613742, "ref_ce_loss": 0.2743866741657257, "step": 2120 }, { "epoch": 0.7104736490993996, "loss": 1.3543, "step": 2130 }, { "epoch": 0.7104736490993996, "grad_norm": 2.200979709625244, "step": 2130 }, { "epoch": 0.7104736490993996, "learning_rate": 0.0007964737586720123, "step": 2130 }, { "epoch": 0.7104736490993996, "loss": 1.6453789472579956, "step": 2130 }, { "ce_loss": 0.3791709244251251, "epoch": 0.7104736490993996, "step": 2130 }, { "distill_loss": 0.5444095134735107, "epoch": 0.7104736490993996, "step": 2130 }, { "epoch": 0.7104736490993996, "ref_ce_loss": 0.2976440191268921, "step": 2130 }, { "epoch": 0.7104736490993996, "loss": 2.0682713985443115, "step": 2130 }, { "ce_loss": 0.3775831162929535, "epoch": 0.7104736490993996, "step": 2130 }, { "distill_loss": 0.44049978256225586, "epoch": 0.7104736490993996, "step": 2130 }, { "epoch": 0.7104736490993996, "ref_ce_loss": 0.29356908798217773, "step": 2130 }, { "epoch": 0.7138092061374249, "loss": 1.3432, "step": 2140 }, { "epoch": 0.7138092061374249, "grad_norm": 2.2284011840820312, "step": 2140 }, { "epoch": 0.7138092061374249, "learning_rate": 0.0007964162744546739, "step": 2140 }, { "epoch": 0.7138092061374249, "loss": 1.4169831275939941, "step": 2140 }, { "ce_loss": 0.35462838411331177, "epoch": 0.7138092061374249, "step": 2140 }, { "distill_loss": 0.4870684742927551, "epoch": 0.7138092061374249, "step": 2140 }, { "epoch": 0.7138092061374249, "ref_ce_loss": 0.2719684839248657, "step": 2140 }, { "epoch": 0.7138092061374249, "loss": 1.5132737159729004, "step": 2140 }, { "ce_loss": 0.38168835639953613, "epoch": 0.7138092061374249, "step": 2140 }, { "distill_loss": 0.5392582416534424, "epoch": 0.7138092061374249, "step": 2140 }, { "epoch": 0.7138092061374249, "ref_ce_loss": 0.25710928440093994, "step": 2140 }, { "epoch": 0.7171447631754503, "loss": 1.486, "step": 2150 }, { "epoch": 0.7171447631754503, "grad_norm": 1.8552848100662231, "step": 2150 }, { "epoch": 0.7171447631754503, "learning_rate": 0.000796358327576887, "step": 2150 }, { "epoch": 0.7171447631754503, "loss": 1.2554669380187988, "step": 2150 }, { "ce_loss": 0.3804562985897064, "epoch": 0.7171447631754503, "step": 2150 }, { "distill_loss": 0.6099358201026917, "epoch": 0.7171447631754503, "step": 2150 }, { "epoch": 0.7171447631754503, "ref_ce_loss": 0.26481449604034424, "step": 2150 }, { "epoch": 0.7171447631754503, "loss": 1.3438143730163574, "step": 2150 }, { "ce_loss": 0.378108948469162, "epoch": 0.7171447631754503, "step": 2150 }, { "distill_loss": 0.5566970705986023, "epoch": 0.7171447631754503, "step": 2150 }, { "epoch": 0.7171447631754503, "ref_ce_loss": 0.28770875930786133, "step": 2150 }, { "epoch": 0.7204803202134756, "loss": 1.4144, "step": 2160 }, { "epoch": 0.7204803202134756, "grad_norm": 1.7240135669708252, "step": 2160 }, { "epoch": 0.7204803202134756, "learning_rate": 0.0007962999181062819, "step": 2160 }, { "epoch": 0.7204803202134756, "loss": 1.444587230682373, "step": 2160 }, { "ce_loss": 0.3592352867126465, "epoch": 0.7204803202134756, "step": 2160 }, { "distill_loss": 0.5743659734725952, "epoch": 0.7204803202134756, "step": 2160 }, { "epoch": 0.7204803202134756, "ref_ce_loss": 0.25933054089546204, "step": 2160 }, { "epoch": 0.7204803202134756, "loss": 1.0968458652496338, "step": 2160 }, { "ce_loss": 0.367275595664978, "epoch": 0.7204803202134756, "step": 2160 }, { "distill_loss": 0.4670858383178711, "epoch": 0.7204803202134756, "step": 2160 }, { "epoch": 0.7204803202134756, "ref_ce_loss": 0.26240402460098267, "step": 2160 }, { "epoch": 0.723815877251501, "loss": 1.4534, "step": 2170 }, { "epoch": 0.723815877251501, "grad_norm": 1.6614775657653809, "step": 2170 }, { "epoch": 0.723815877251501, "learning_rate": 0.0007962410461110288, "step": 2170 }, { "epoch": 0.723815877251501, "loss": 1.3979713916778564, "step": 2170 }, { "ce_loss": 0.4083508551120758, "epoch": 0.723815877251501, "step": 2170 }, { "distill_loss": 0.528069257736206, "epoch": 0.723815877251501, "step": 2170 }, { "epoch": 0.723815877251501, "ref_ce_loss": 0.3267524540424347, "step": 2170 }, { "epoch": 0.723815877251501, "loss": 1.4628068208694458, "step": 2170 }, { "ce_loss": 0.47597917914390564, "epoch": 0.723815877251501, "step": 2170 }, { "distill_loss": 0.597661018371582, "epoch": 0.723815877251501, "step": 2170 }, { "epoch": 0.723815877251501, "ref_ce_loss": 0.30378589034080505, "step": 2170 }, { "epoch": 0.7271514342895263, "loss": 1.4281, "step": 2180 }, { "epoch": 0.7271514342895263, "grad_norm": 1.3479666709899902, "step": 2180 }, { "epoch": 0.7271514342895263, "learning_rate": 0.0007961817116598375, "step": 2180 }, { "epoch": 0.7271514342895263, "loss": 1.3016619682312012, "step": 2180 }, { "ce_loss": 0.43028223514556885, "epoch": 0.7271514342895263, "step": 2180 }, { "distill_loss": 0.5974459052085876, "epoch": 0.7271514342895263, "step": 2180 }, { "epoch": 0.7271514342895263, "ref_ce_loss": 0.2737049162387848, "step": 2180 }, { "epoch": 0.7271514342895263, "loss": 1.5070172548294067, "step": 2180 }, { "ce_loss": 0.3665555715560913, "epoch": 0.7271514342895263, "step": 2180 }, { "distill_loss": 0.5547187328338623, "epoch": 0.7271514342895263, "step": 2180 }, { "epoch": 0.7271514342895263, "ref_ce_loss": 0.28890177607536316, "step": 2180 }, { "epoch": 0.7304869913275517, "loss": 1.3728, "step": 2190 }, { "epoch": 0.7304869913275517, "grad_norm": 1.471031904220581, "step": 2190 }, { "epoch": 0.7304869913275517, "learning_rate": 0.0007961219148219578, "step": 2190 }, { "epoch": 0.7304869913275517, "loss": 1.1200194358825684, "step": 2190 }, { "ce_loss": 0.3308461904525757, "epoch": 0.7304869913275517, "step": 2190 }, { "distill_loss": 0.47103047370910645, "epoch": 0.7304869913275517, "step": 2190 }, { "epoch": 0.7304869913275517, "ref_ce_loss": 0.21239201724529266, "step": 2190 }, { "epoch": 0.7304869913275517, "loss": 1.172867774963379, "step": 2190 }, { "ce_loss": 0.31868767738342285, "epoch": 0.7304869913275517, "step": 2190 }, { "distill_loss": 0.5440253615379333, "epoch": 0.7304869913275517, "step": 2190 }, { "epoch": 0.7304869913275517, "ref_ce_loss": 0.2521239221096039, "step": 2190 }, { "epoch": 0.733822548365577, "loss": 1.4048, "step": 2200 }, { "epoch": 0.733822548365577, "grad_norm": 1.6411197185516357, "step": 2200 }, { "epoch": 0.733822548365577, "learning_rate": 0.000796061655667179, "step": 2200 }, { "epoch": 0.733822548365577, "loss": 1.8511903285980225, "step": 2200 }, { "ce_loss": 0.45695799589157104, "epoch": 0.733822548365577, "step": 2200 }, { "distill_loss": 0.5404932498931885, "epoch": 0.733822548365577, "step": 2200 }, { "epoch": 0.733822548365577, "ref_ce_loss": 0.3444337844848633, "step": 2200 }, { "epoch": 0.733822548365577, "loss": 1.359553575515747, "step": 2200 }, { "ce_loss": 0.4430965185165405, "epoch": 0.733822548365577, "step": 2200 }, { "distill_loss": 0.5114693641662598, "epoch": 0.733822548365577, "step": 2200 }, { "epoch": 0.733822548365577, "ref_ce_loss": 0.3141864240169525, "step": 2200 }, { "epoch": 0.7371581054036024, "loss": 1.3709, "step": 2210 }, { "epoch": 0.7371581054036024, "grad_norm": 3.038661479949951, "step": 2210 }, { "epoch": 0.7371581054036024, "learning_rate": 0.00079600093426583, "step": 2210 }, { "epoch": 0.7371581054036024, "loss": 1.3682180643081665, "step": 2210 }, { "ce_loss": 0.3720366060733795, "epoch": 0.7371581054036024, "step": 2210 }, { "distill_loss": 0.5223158001899719, "epoch": 0.7371581054036024, "step": 2210 }, { "epoch": 0.7371581054036024, "ref_ce_loss": 0.24465817213058472, "step": 2210 }, { "epoch": 0.7371581054036024, "loss": 1.923211932182312, "step": 2210 }, { "ce_loss": 0.4425932765007019, "epoch": 0.7371581054036024, "step": 2210 }, { "distill_loss": 0.5633354783058167, "epoch": 0.7371581054036024, "step": 2210 }, { "epoch": 0.7371581054036024, "ref_ce_loss": 0.24280905723571777, "step": 2210 }, { "epoch": 0.7404936624416277, "loss": 1.3085, "step": 2220 }, { "epoch": 0.7404936624416277, "grad_norm": 2.342991590499878, "step": 2220 }, { "epoch": 0.7404936624416277, "learning_rate": 0.0007959397506887793, "step": 2220 }, { "epoch": 0.7404936624416277, "loss": 1.3994795083999634, "step": 2220 }, { "ce_loss": 0.3839331567287445, "epoch": 0.7404936624416277, "step": 2220 }, { "distill_loss": 0.4684339463710785, "epoch": 0.7404936624416277, "step": 2220 }, { "epoch": 0.7404936624416277, "ref_ce_loss": 0.21987880766391754, "step": 2220 }, { "epoch": 0.7404936624416277, "loss": 1.223686695098877, "step": 2220 }, { "ce_loss": 0.37719520926475525, "epoch": 0.7404936624416277, "step": 2220 }, { "distill_loss": 0.528374195098877, "epoch": 0.7404936624416277, "step": 2220 }, { "epoch": 0.7404936624416277, "ref_ce_loss": 0.22812281548976898, "step": 2220 }, { "epoch": 0.7438292194796531, "loss": 1.3774, "step": 2230 }, { "epoch": 0.7438292194796531, "grad_norm": 1.3989163637161255, "step": 2230 }, { "epoch": 0.7438292194796531, "learning_rate": 0.0007958781050074347, "step": 2230 }, { "epoch": 0.7438292194796531, "loss": 1.081360101699829, "step": 2230 }, { "ce_loss": 0.26817405223846436, "epoch": 0.7438292194796531, "step": 2230 }, { "distill_loss": 0.5221849083900452, "epoch": 0.7438292194796531, "step": 2230 }, { "epoch": 0.7438292194796531, "ref_ce_loss": 0.18462881445884705, "step": 2230 }, { "epoch": 0.7438292194796531, "loss": 1.1048595905303955, "step": 2230 }, { "ce_loss": 0.33070671558380127, "epoch": 0.7438292194796531, "step": 2230 }, { "distill_loss": 0.5014172196388245, "epoch": 0.7438292194796531, "step": 2230 }, { "epoch": 0.7438292194796531, "ref_ce_loss": 0.19516457617282867, "step": 2230 }, { "epoch": 0.7471647765176784, "loss": 1.4234, "step": 2240 }, { "epoch": 0.7471647765176784, "grad_norm": 1.628597378730774, "step": 2240 }, { "epoch": 0.7471647765176784, "learning_rate": 0.0007958159972937432, "step": 2240 }, { "epoch": 0.7471647765176784, "loss": 1.2197805643081665, "step": 2240 }, { "ce_loss": 0.3060518801212311, "epoch": 0.7471647765176784, "step": 2240 }, { "distill_loss": 0.5412315130233765, "epoch": 0.7471647765176784, "step": 2240 }, { "epoch": 0.7471647765176784, "ref_ce_loss": 0.25071093440055847, "step": 2240 }, { "epoch": 0.7471647765176784, "loss": 1.3633686304092407, "step": 2240 }, { "ce_loss": 0.41398224234580994, "epoch": 0.7471647765176784, "step": 2240 }, { "distill_loss": 0.5853712558746338, "epoch": 0.7471647765176784, "step": 2240 }, { "epoch": 0.7471647765176784, "ref_ce_loss": 0.28145208954811096, "step": 2240 }, { "epoch": 0.7505003335557038, "loss": 1.4132, "step": 2250 }, { "epoch": 0.7505003335557038, "grad_norm": 1.8897957801818848, "step": 2250 }, { "epoch": 0.7505003335557038, "learning_rate": 0.0007957534276201915, "step": 2250 }, { "epoch": 0.7505003335557038, "loss": 1.2674027681350708, "step": 2250 }, { "ce_loss": 0.3586235046386719, "epoch": 0.7505003335557038, "step": 2250 }, { "distill_loss": 0.5342280268669128, "epoch": 0.7505003335557038, "step": 2250 }, { "epoch": 0.7505003335557038, "ref_ce_loss": 0.2958523631095886, "step": 2250 }, { "epoch": 0.7505003335557038, "loss": 1.8458093404769897, "step": 2250 }, { "ce_loss": 0.4460594058036804, "epoch": 0.7505003335557038, "step": 2250 }, { "distill_loss": 0.552367627620697, "epoch": 0.7505003335557038, "step": 2250 }, { "epoch": 0.7505003335557038, "ref_ce_loss": 0.29993292689323425, "step": 2250 }, { "epoch": 0.7538358905937291, "loss": 1.4241, "step": 2260 }, { "epoch": 0.7538358905937291, "grad_norm": 1.2573357820510864, "step": 2260 }, { "epoch": 0.7538358905937291, "learning_rate": 0.0007956903960598048, "step": 2260 }, { "epoch": 0.7538358905937291, "loss": 1.100134015083313, "step": 2260 }, { "ce_loss": 0.40362513065338135, "epoch": 0.7538358905937291, "step": 2260 }, { "distill_loss": 0.42274925112724304, "epoch": 0.7538358905937291, "step": 2260 }, { "epoch": 0.7538358905937291, "ref_ce_loss": 0.27371761202812195, "step": 2260 }, { "epoch": 0.7538358905937291, "loss": 1.63364577293396, "step": 2260 }, { "ce_loss": 0.3939662575721741, "epoch": 0.7538358905937291, "step": 2260 }, { "distill_loss": 0.46868401765823364, "epoch": 0.7538358905937291, "step": 2260 }, { "epoch": 0.7538358905937291, "ref_ce_loss": 0.2829951047897339, "step": 2260 }, { "epoch": 0.7571714476317545, "loss": 1.3595, "step": 2270 }, { "epoch": 0.7571714476317545, "grad_norm": 2.485349178314209, "step": 2270 }, { "epoch": 0.7571714476317545, "learning_rate": 0.0007956269026861479, "step": 2270 }, { "epoch": 0.7571714476317545, "loss": 1.4890384674072266, "step": 2270 }, { "ce_loss": 0.4694492518901825, "epoch": 0.7571714476317545, "step": 2270 }, { "distill_loss": 0.5618162155151367, "epoch": 0.7571714476317545, "step": 2270 }, { "epoch": 0.7571714476317545, "ref_ce_loss": 0.3466222286224365, "step": 2270 }, { "epoch": 0.7571714476317545, "loss": 1.5350507497787476, "step": 2270 }, { "ce_loss": 0.36870887875556946, "epoch": 0.7571714476317545, "step": 2270 }, { "distill_loss": 0.48704323172569275, "epoch": 0.7571714476317545, "step": 2270 }, { "epoch": 0.7571714476317545, "ref_ce_loss": 0.2637166678905487, "step": 2270 }, { "epoch": 0.7605070046697798, "loss": 1.3168, "step": 2280 }, { "epoch": 0.7605070046697798, "grad_norm": 1.4474360942840576, "step": 2280 }, { "epoch": 0.7605070046697798, "learning_rate": 0.0007955629475733243, "step": 2280 }, { "epoch": 0.7605070046697798, "loss": 1.5719066858291626, "step": 2280 }, { "ce_loss": 0.4709181487560272, "epoch": 0.7605070046697798, "step": 2280 }, { "distill_loss": 0.6220474243164062, "epoch": 0.7605070046697798, "step": 2280 }, { "epoch": 0.7605070046697798, "ref_ce_loss": 0.28237760066986084, "step": 2280 }, { "epoch": 0.7605070046697798, "loss": 1.221504807472229, "step": 2280 }, { "ce_loss": 0.38822704553604126, "epoch": 0.7605070046697798, "step": 2280 }, { "distill_loss": 0.5881603956222534, "epoch": 0.7605070046697798, "step": 2280 }, { "epoch": 0.7605070046697798, "ref_ce_loss": 0.24501189589500427, "step": 2280 }, { "epoch": 0.7638425617078052, "loss": 1.415, "step": 2290 }, { "epoch": 0.7638425617078052, "grad_norm": 2.043067693710327, "step": 2290 }, { "epoch": 0.7638425617078052, "learning_rate": 0.0007954985307959766, "step": 2290 }, { "epoch": 0.7638425617078052, "loss": 1.6120023727416992, "step": 2290 }, { "ce_loss": 0.3817175030708313, "epoch": 0.7638425617078052, "step": 2290 }, { "distill_loss": 0.5304407477378845, "epoch": 0.7638425617078052, "step": 2290 }, { "epoch": 0.7638425617078052, "ref_ce_loss": 0.28059616684913635, "step": 2290 }, { "epoch": 0.7638425617078052, "loss": 1.2804994583129883, "step": 2290 }, { "ce_loss": 0.3931063711643219, "epoch": 0.7638425617078052, "step": 2290 }, { "distill_loss": 0.5490153431892395, "epoch": 0.7638425617078052, "step": 2290 }, { "epoch": 0.7638425617078052, "ref_ce_loss": 0.2660219967365265, "step": 2290 }, { "epoch": 0.7671781187458305, "loss": 1.3366, "step": 2300 }, { "epoch": 0.7671781187458305, "grad_norm": 1.6212903261184692, "step": 2300 }, { "epoch": 0.7671781187458305, "learning_rate": 0.000795433652429286, "step": 2300 }, { "epoch": 0.7671781187458305, "loss": 1.1175814867019653, "step": 2300 }, { "ce_loss": 0.2823010981082916, "epoch": 0.7671781187458305, "step": 2300 }, { "distill_loss": 0.5140184164047241, "epoch": 0.7671781187458305, "step": 2300 }, { "epoch": 0.7671781187458305, "ref_ce_loss": 0.24143068492412567, "step": 2300 }, { "epoch": 0.7671781187458305, "loss": 1.392407774925232, "step": 2300 }, { "ce_loss": 0.42093566060066223, "epoch": 0.7671781187458305, "step": 2300 }, { "distill_loss": 0.4724191427230835, "epoch": 0.7671781187458305, "step": 2300 }, { "epoch": 0.7671781187458305, "ref_ce_loss": 0.2559778690338135, "step": 2300 }, { "epoch": 0.7705136757838559, "loss": 1.4132, "step": 2310 }, { "epoch": 0.7705136757838559, "grad_norm": 1.636440634727478, "step": 2310 }, { "epoch": 0.7705136757838559, "learning_rate": 0.0007953683125489726, "step": 2310 }, { "epoch": 0.7705136757838559, "loss": 1.2124123573303223, "step": 2310 }, { "ce_loss": 0.36387935280799866, "epoch": 0.7705136757838559, "step": 2310 }, { "distill_loss": 0.4210546016693115, "epoch": 0.7705136757838559, "step": 2310 }, { "epoch": 0.7705136757838559, "ref_ce_loss": 0.2943533658981323, "step": 2310 }, { "epoch": 0.7705136757838559, "loss": 1.2485119104385376, "step": 2310 }, { "ce_loss": 0.36129215359687805, "epoch": 0.7705136757838559, "step": 2310 }, { "distill_loss": 0.48826050758361816, "epoch": 0.7705136757838559, "step": 2310 }, { "epoch": 0.7705136757838559, "ref_ce_loss": 0.23949351906776428, "step": 2310 }, { "epoch": 0.7738492328218812, "loss": 1.3281, "step": 2320 }, { "epoch": 0.7738492328218812, "grad_norm": 1.528756856918335, "step": 2320 }, { "epoch": 0.7738492328218812, "learning_rate": 0.000795302511231295, "step": 2320 }, { "epoch": 0.7738492328218812, "loss": 1.0231574773788452, "step": 2320 }, { "ce_loss": 0.3148442208766937, "epoch": 0.7738492328218812, "step": 2320 }, { "distill_loss": 0.5011311769485474, "epoch": 0.7738492328218812, "step": 2320 }, { "epoch": 0.7738492328218812, "ref_ce_loss": 0.2043459415435791, "step": 2320 }, { "epoch": 0.7738492328218812, "loss": 1.7720094919204712, "step": 2320 }, { "ce_loss": 0.49245187640190125, "epoch": 0.7738492328218812, "step": 2320 }, { "distill_loss": 0.6105131506919861, "epoch": 0.7738492328218812, "step": 2320 }, { "epoch": 0.7738492328218812, "ref_ce_loss": 0.29217514395713806, "step": 2320 }, { "epoch": 0.7771847898599066, "loss": 1.4187, "step": 2330 }, { "epoch": 0.7771847898599066, "grad_norm": 1.999721884727478, "step": 2330 }, { "epoch": 0.7771847898599066, "learning_rate": 0.0007952362485530506, "step": 2330 }, { "epoch": 0.7771847898599066, "loss": 1.6869237422943115, "step": 2330 }, { "ce_loss": 0.442462295293808, "epoch": 0.7771847898599066, "step": 2330 }, { "distill_loss": 0.674324631690979, "epoch": 0.7771847898599066, "step": 2330 }, { "epoch": 0.7771847898599066, "ref_ce_loss": 0.24086466431617737, "step": 2330 }, { "epoch": 0.7771847898599066, "loss": 1.5707772970199585, "step": 2330 }, { "ce_loss": 0.3937818109989166, "epoch": 0.7771847898599066, "step": 2330 }, { "distill_loss": 0.6436639428138733, "epoch": 0.7771847898599066, "step": 2330 }, { "epoch": 0.7771847898599066, "ref_ce_loss": 0.26808956265449524, "step": 2330 }, { "epoch": 0.7805203468979319, "loss": 1.4501, "step": 2340 }, { "epoch": 0.7805203468979319, "grad_norm": 1.5357840061187744, "step": 2340 }, { "epoch": 0.7805203468979319, "learning_rate": 0.0007951695245915749, "step": 2340 }, { "epoch": 0.7805203468979319, "loss": 1.0470746755599976, "step": 2340 }, { "ce_loss": 0.33574843406677246, "epoch": 0.7805203468979319, "step": 2340 }, { "distill_loss": 0.5088207721710205, "epoch": 0.7805203468979319, "step": 2340 }, { "epoch": 0.7805203468979319, "ref_ce_loss": 0.2020200937986374, "step": 2340 }, { "epoch": 0.7805203468979319, "loss": 1.0406668186187744, "step": 2340 }, { "ce_loss": 0.29405948519706726, "epoch": 0.7805203468979319, "step": 2340 }, { "distill_loss": 0.4793488681316376, "epoch": 0.7805203468979319, "step": 2340 }, { "epoch": 0.7805203468979319, "ref_ce_loss": 0.2668147683143616, "step": 2340 }, { "epoch": 0.7838559039359573, "loss": 1.3071, "step": 2350 }, { "epoch": 0.7838559039359573, "grad_norm": 2.38606595993042, "step": 2350 }, { "epoch": 0.7838559039359573, "learning_rate": 0.000795102339424742, "step": 2350 }, { "epoch": 0.7838559039359573, "loss": 1.5629955530166626, "step": 2350 }, { "ce_loss": 0.40422725677490234, "epoch": 0.7838559039359573, "step": 2350 }, { "distill_loss": 0.5844305753707886, "epoch": 0.7838559039359573, "step": 2350 }, { "epoch": 0.7838559039359573, "ref_ce_loss": 0.20816953480243683, "step": 2350 }, { "epoch": 0.7838559039359573, "loss": 1.3333702087402344, "step": 2350 }, { "ce_loss": 0.3669227957725525, "epoch": 0.7838559039359573, "step": 2350 }, { "distill_loss": 0.5442429184913635, "epoch": 0.7838559039359573, "step": 2350 }, { "epoch": 0.7838559039359573, "ref_ce_loss": 0.30379021167755127, "step": 2350 }, { "epoch": 0.7871914609739826, "loss": 1.325, "step": 2360 }, { "epoch": 0.7871914609739826, "grad_norm": 2.1126739978790283, "step": 2360 }, { "epoch": 0.7871914609739826, "learning_rate": 0.0007950346931309643, "step": 2360 }, { "epoch": 0.7871914609739826, "loss": 1.3229087591171265, "step": 2360 }, { "ce_loss": 0.39628133177757263, "epoch": 0.7871914609739826, "step": 2360 }, { "distill_loss": 0.48712071776390076, "epoch": 0.7871914609739826, "step": 2360 }, { "epoch": 0.7871914609739826, "ref_ce_loss": 0.23661665618419647, "step": 2360 }, { "epoch": 0.7871914609739826, "loss": 1.3167979717254639, "step": 2360 }, { "ce_loss": 0.3563719093799591, "epoch": 0.7871914609739826, "step": 2360 }, { "distill_loss": 0.4854099154472351, "epoch": 0.7871914609739826, "step": 2360 }, { "epoch": 0.7871914609739826, "ref_ce_loss": 0.24429531395435333, "step": 2360 }, { "epoch": 0.790527018012008, "loss": 1.2361, "step": 2370 }, { "epoch": 0.790527018012008, "grad_norm": 1.4811742305755615, "step": 2370 }, { "epoch": 0.790527018012008, "learning_rate": 0.0007949665857891921, "step": 2370 }, { "epoch": 0.790527018012008, "loss": 1.2205973863601685, "step": 2370 }, { "ce_loss": 0.30885571241378784, "epoch": 0.790527018012008, "step": 2370 }, { "distill_loss": 0.501349925994873, "epoch": 0.790527018012008, "step": 2370 }, { "epoch": 0.790527018012008, "ref_ce_loss": 0.21289817988872528, "step": 2370 }, { "epoch": 0.790527018012008, "loss": 1.369713306427002, "step": 2370 }, { "ce_loss": 0.4380408227443695, "epoch": 0.790527018012008, "step": 2370 }, { "distill_loss": 0.6267238259315491, "epoch": 0.790527018012008, "step": 2370 }, { "epoch": 0.790527018012008, "ref_ce_loss": 0.29107314348220825, "step": 2370 }, { "epoch": 0.7938625750500333, "loss": 1.3653, "step": 2380 }, { "epoch": 0.7938625750500333, "grad_norm": 1.7153217792510986, "step": 2380 }, { "epoch": 0.7938625750500333, "learning_rate": 0.0007948980174789142, "step": 2380 }, { "epoch": 0.7938625750500333, "loss": 1.2276893854141235, "step": 2380 }, { "ce_loss": 0.37393128871917725, "epoch": 0.7938625750500333, "step": 2380 }, { "distill_loss": 0.4779031574726105, "epoch": 0.7938625750500333, "step": 2380 }, { "epoch": 0.7938625750500333, "ref_ce_loss": 0.2727866768836975, "step": 2380 }, { "epoch": 0.7938625750500333, "loss": 1.245436191558838, "step": 2380 }, { "ce_loss": 0.39289167523384094, "epoch": 0.7938625750500333, "step": 2380 }, { "distill_loss": 0.4908082187175751, "epoch": 0.7938625750500333, "step": 2380 }, { "epoch": 0.7938625750500333, "ref_ce_loss": 0.2664511501789093, "step": 2380 }, { "epoch": 0.7971981320880587, "loss": 1.3731, "step": 2390 }, { "epoch": 0.7971981320880587, "grad_norm": 1.9598758220672607, "step": 2390 }, { "epoch": 0.7971981320880587, "learning_rate": 0.0007948289882801571, "step": 2390 }, { "epoch": 0.7971981320880587, "loss": 1.3128260374069214, "step": 2390 }, { "ce_loss": 0.3758243918418884, "epoch": 0.7971981320880587, "step": 2390 }, { "distill_loss": 0.5747771263122559, "epoch": 0.7971981320880587, "step": 2390 }, { "epoch": 0.7971981320880587, "ref_ce_loss": 0.26265060901641846, "step": 2390 }, { "epoch": 0.7971981320880587, "loss": 1.3741884231567383, "step": 2390 }, { "ce_loss": 0.37096890807151794, "epoch": 0.7971981320880587, "step": 2390 }, { "distill_loss": 0.5457272529602051, "epoch": 0.7971981320880587, "step": 2390 }, { "epoch": 0.7971981320880587, "ref_ce_loss": 0.2744729220867157, "step": 2390 }, { "epoch": 0.800533689126084, "loss": 1.4166, "step": 2400 }, { "epoch": 0.800533689126084, "grad_norm": 1.950492024421692, "step": 2400 }, { "epoch": 0.800533689126084, "learning_rate": 0.0007947594982734852, "step": 2400 }, { "epoch": 0.800533689126084, "loss": 1.686815857887268, "step": 2400 }, { "ce_loss": 0.3467777371406555, "epoch": 0.800533689126084, "step": 2400 }, { "distill_loss": 0.4724452495574951, "epoch": 0.800533689126084, "step": 2400 }, { "epoch": 0.800533689126084, "ref_ce_loss": 0.20504708588123322, "step": 2400 }, { "epoch": 0.800533689126084, "loss": 1.8396413326263428, "step": 2400 }, { "ce_loss": 0.3998579680919647, "epoch": 0.800533689126084, "step": 2400 }, { "distill_loss": 0.487520694732666, "epoch": 0.800533689126084, "step": 2400 }, { "epoch": 0.800533689126084, "ref_ce_loss": 0.34791359305381775, "step": 2400 }, { "epoch": 0.8038692461641094, "loss": 1.4165, "step": 2410 }, { "epoch": 0.8038692461641094, "grad_norm": 2.1652491092681885, "step": 2410 }, { "epoch": 0.8038692461641094, "learning_rate": 0.0007946895475400012, "step": 2410 }, { "epoch": 0.8038692461641094, "loss": 1.5087261199951172, "step": 2410 }, { "ce_loss": 0.4045005142688751, "epoch": 0.8038692461641094, "step": 2410 }, { "distill_loss": 0.5614149570465088, "epoch": 0.8038692461641094, "step": 2410 }, { "epoch": 0.8038692461641094, "ref_ce_loss": 0.30835992097854614, "step": 2410 }, { "epoch": 0.8038692461641094, "loss": 1.3226900100708008, "step": 2410 }, { "ce_loss": 0.4158706068992615, "epoch": 0.8038692461641094, "step": 2410 }, { "distill_loss": 0.5947645902633667, "epoch": 0.8038692461641094, "step": 2410 }, { "epoch": 0.8038692461641094, "ref_ce_loss": 0.2430378794670105, "step": 2410 }, { "epoch": 0.8072048032021347, "loss": 1.3726, "step": 2420 }, { "epoch": 0.8072048032021347, "grad_norm": 1.8606551885604858, "step": 2420 }, { "epoch": 0.8072048032021347, "learning_rate": 0.0007946191361613447, "step": 2420 }, { "epoch": 0.8072048032021347, "loss": 1.2119914293289185, "step": 2420 }, { "ce_loss": 0.35136666893959045, "epoch": 0.8072048032021347, "step": 2420 }, { "distill_loss": 0.48627468943595886, "epoch": 0.8072048032021347, "step": 2420 }, { "epoch": 0.8072048032021347, "ref_ce_loss": 0.22440162301063538, "step": 2420 }, { "epoch": 0.8072048032021347, "loss": 1.457103967666626, "step": 2420 }, { "ce_loss": 0.3341229259967804, "epoch": 0.8072048032021347, "step": 2420 }, { "distill_loss": 0.5586241483688354, "epoch": 0.8072048032021347, "step": 2420 }, { "epoch": 0.8072048032021347, "ref_ce_loss": 0.24975526332855225, "step": 2420 }, { "epoch": 0.8105403602401601, "loss": 1.354, "step": 2430 }, { "epoch": 0.8105403602401601, "grad_norm": 1.524516224861145, "step": 2430 }, { "epoch": 0.8105403602401601, "learning_rate": 0.0007945482642196935, "step": 2430 }, { "epoch": 0.8105403602401601, "loss": 1.5340917110443115, "step": 2430 }, { "ce_loss": 0.5005981922149658, "epoch": 0.8105403602401601, "step": 2430 }, { "distill_loss": 0.6238539814949036, "epoch": 0.8105403602401601, "step": 2430 }, { "epoch": 0.8105403602401601, "ref_ce_loss": 0.23065099120140076, "step": 2430 }, { "epoch": 0.8105403602401601, "loss": 1.250186562538147, "step": 2430 }, { "ce_loss": 0.38786232471466064, "epoch": 0.8105403602401601, "step": 2430 }, { "distill_loss": 0.49738597869873047, "epoch": 0.8105403602401601, "step": 2430 }, { "epoch": 0.8105403602401601, "ref_ce_loss": 0.28271862864494324, "step": 2430 }, { "epoch": 0.8138759172781854, "loss": 1.4376, "step": 2440 }, { "epoch": 0.8138759172781854, "grad_norm": 1.48578941822052, "step": 2440 }, { "epoch": 0.8138759172781854, "learning_rate": 0.000794476931797763, "step": 2440 }, { "epoch": 0.8138759172781854, "loss": 1.2885890007019043, "step": 2440 }, { "ce_loss": 0.4292730987071991, "epoch": 0.8138759172781854, "step": 2440 }, { "distill_loss": 0.4880996346473694, "epoch": 0.8138759172781854, "step": 2440 }, { "epoch": 0.8138759172781854, "ref_ce_loss": 0.2967533469200134, "step": 2440 }, { "epoch": 0.8138759172781854, "loss": 1.2394975423812866, "step": 2440 }, { "ce_loss": 0.3743356764316559, "epoch": 0.8138759172781854, "step": 2440 }, { "distill_loss": 0.5021654963493347, "epoch": 0.8138759172781854, "step": 2440 }, { "epoch": 0.8138759172781854, "ref_ce_loss": 0.29697659611701965, "step": 2440 }, { "epoch": 0.8172114743162108, "loss": 1.2983, "step": 2450 }, { "epoch": 0.8172114743162108, "grad_norm": 1.5816471576690674, "step": 2450 }, { "epoch": 0.8172114743162108, "learning_rate": 0.0007944051389788053, "step": 2450 }, { "epoch": 0.8172114743162108, "loss": 1.1448237895965576, "step": 2450 }, { "ce_loss": 0.36074283719062805, "epoch": 0.8172114743162108, "step": 2450 }, { "distill_loss": 0.4741750657558441, "epoch": 0.8172114743162108, "step": 2450 }, { "epoch": 0.8172114743162108, "ref_ce_loss": 0.2516009211540222, "step": 2450 }, { "epoch": 0.8172114743162108, "loss": 1.317401647567749, "step": 2450 }, { "ce_loss": 0.3394004702568054, "epoch": 0.8172114743162108, "step": 2450 }, { "distill_loss": 0.466372013092041, "epoch": 0.8172114743162108, "step": 2450 }, { "epoch": 0.8172114743162108, "ref_ce_loss": 0.28243008255958557, "step": 2450 }, { "epoch": 0.8205470313542361, "loss": 1.3469, "step": 2460 }, { "epoch": 0.8205470313542361, "grad_norm": 2.4694275856018066, "step": 2460 }, { "epoch": 0.8205470313542361, "learning_rate": 0.0007943328858466108, "step": 2460 }, { "epoch": 0.8205470313542361, "loss": 1.3277108669281006, "step": 2460 }, { "ce_loss": 0.41051825881004333, "epoch": 0.8205470313542361, "step": 2460 }, { "distill_loss": 0.6400450468063354, "epoch": 0.8205470313542361, "step": 2460 }, { "epoch": 0.8205470313542361, "ref_ce_loss": 0.27634891867637634, "step": 2460 }, { "epoch": 0.8205470313542361, "loss": 1.2832062244415283, "step": 2460 }, { "ce_loss": 0.3461655378341675, "epoch": 0.8205470313542361, "step": 2460 }, { "distill_loss": 0.5117206573486328, "epoch": 0.8205470313542361, "step": 2460 }, { "epoch": 0.8205470313542361, "ref_ce_loss": 0.2564898729324341, "step": 2460 }, { "epoch": 0.8238825883922615, "loss": 1.3354, "step": 2470 }, { "epoch": 0.8238825883922615, "grad_norm": 1.6225868463516235, "step": 2470 }, { "epoch": 0.8238825883922615, "learning_rate": 0.0007942601724855066, "step": 2470 }, { "epoch": 0.8238825883922615, "loss": 1.160365343093872, "step": 2470 }, { "ce_loss": 0.30365264415740967, "epoch": 0.8238825883922615, "step": 2470 }, { "distill_loss": 0.5082361698150635, "epoch": 0.8238825883922615, "step": 2470 }, { "epoch": 0.8238825883922615, "ref_ce_loss": 0.25682398676872253, "step": 2470 }, { "epoch": 0.8238825883922615, "loss": 1.7921531200408936, "step": 2470 }, { "ce_loss": 0.38309571146965027, "epoch": 0.8238825883922615, "step": 2470 }, { "distill_loss": 0.4826603829860687, "epoch": 0.8238825883922615, "step": 2470 }, { "epoch": 0.8238825883922615, "ref_ce_loss": 0.27224835753440857, "step": 2470 }, { "epoch": 0.8272181454302868, "loss": 1.3585, "step": 2480 }, { "epoch": 0.8272181454302868, "grad_norm": 1.32754647731781, "step": 2480 }, { "epoch": 0.8272181454302868, "learning_rate": 0.000794186998980357, "step": 2480 }, { "epoch": 0.8272181454302868, "loss": 1.8056671619415283, "step": 2480 }, { "ce_loss": 0.3759201467037201, "epoch": 0.8272181454302868, "step": 2480 }, { "distill_loss": 0.5555229187011719, "epoch": 0.8272181454302868, "step": 2480 }, { "epoch": 0.8272181454302868, "ref_ce_loss": 0.2487923502922058, "step": 2480 }, { "epoch": 0.8272181454302868, "loss": 1.400477409362793, "step": 2480 }, { "ce_loss": 0.44156867265701294, "epoch": 0.8272181454302868, "step": 2480 }, { "distill_loss": 0.5774922370910645, "epoch": 0.8272181454302868, "step": 2480 }, { "epoch": 0.8272181454302868, "ref_ce_loss": 0.25374627113342285, "step": 2480 }, { "epoch": 0.8305537024683122, "loss": 1.3699, "step": 2490 }, { "epoch": 0.8305537024683122, "grad_norm": 1.5362639427185059, "step": 2490 }, { "epoch": 0.8305537024683122, "learning_rate": 0.0007941133654165633, "step": 2490 }, { "epoch": 0.8305537024683122, "loss": 1.038097858428955, "step": 2490 }, { "ce_loss": 0.3375265300273895, "epoch": 0.8305537024683122, "step": 2490 }, { "distill_loss": 0.48798954486846924, "epoch": 0.8305537024683122, "step": 2490 }, { "epoch": 0.8305537024683122, "ref_ce_loss": 0.21024957299232483, "step": 2490 }, { "epoch": 0.8305537024683122, "loss": 1.239802598953247, "step": 2490 }, { "ce_loss": 0.3786744475364685, "epoch": 0.8305537024683122, "step": 2490 }, { "distill_loss": 0.5701245069503784, "epoch": 0.8305537024683122, "step": 2490 }, { "epoch": 0.8305537024683122, "ref_ce_loss": 0.20690640807151794, "step": 2490 }, { "epoch": 0.8338892595063375, "loss": 1.364, "step": 2500 }, { "epoch": 0.8338892595063375, "grad_norm": 2.555614709854126, "step": 2500 }, { "epoch": 0.8338892595063375, "learning_rate": 0.0007940392718800637, "step": 2500 }, { "epoch": 0.8338892595063375, "loss": 1.0673748254776, "step": 2500 }, { "ce_loss": 0.29459384083747864, "epoch": 0.8338892595063375, "step": 2500 }, { "distill_loss": 0.4663306474685669, "epoch": 0.8338892595063375, "step": 2500 }, { "epoch": 0.8338892595063375, "ref_ce_loss": 0.2350129783153534, "step": 2500 }, { "epoch": 0.8338892595063375, "loss": 1.6143672466278076, "step": 2500 }, { "ce_loss": 0.5081417560577393, "epoch": 0.8338892595063375, "step": 2500 }, { "distill_loss": 0.7202895879745483, "epoch": 0.8338892595063375, "step": 2500 }, { "epoch": 0.8338892595063375, "ref_ce_loss": 0.31191402673721313, "step": 2500 }, { "epoch": 0.8372248165443629, "loss": 1.3549, "step": 2510 }, { "epoch": 0.8372248165443629, "grad_norm": 1.6417618989944458, "step": 2510 }, { "epoch": 0.8372248165443629, "learning_rate": 0.0007939647184573334, "step": 2510 }, { "epoch": 0.8372248165443629, "loss": 1.2564659118652344, "step": 2510 }, { "ce_loss": 0.39230161905288696, "epoch": 0.8372248165443629, "step": 2510 }, { "distill_loss": 0.5175186395645142, "epoch": 0.8372248165443629, "step": 2510 }, { "epoch": 0.8372248165443629, "ref_ce_loss": 0.2814292013645172, "step": 2510 }, { "epoch": 0.8372248165443629, "loss": 1.0255411863327026, "step": 2510 }, { "ce_loss": 0.3256433308124542, "epoch": 0.8372248165443629, "step": 2510 }, { "distill_loss": 0.40438857674598694, "epoch": 0.8372248165443629, "step": 2510 }, { "epoch": 0.8372248165443629, "ref_ce_loss": 0.1968470960855484, "step": 2510 }, { "epoch": 0.8405603735823882, "loss": 1.3068, "step": 2520 }, { "epoch": 0.8405603735823882, "grad_norm": 1.896988034248352, "step": 2520 }, { "epoch": 0.8405603735823882, "learning_rate": 0.0007938897052353845, "step": 2520 }, { "epoch": 0.8405603735823882, "loss": 1.3056790828704834, "step": 2520 }, { "ce_loss": 0.34220781922340393, "epoch": 0.8405603735823882, "step": 2520 }, { "distill_loss": 0.5258098840713501, "epoch": 0.8405603735823882, "step": 2520 }, { "epoch": 0.8405603735823882, "ref_ce_loss": 0.2810552418231964, "step": 2520 }, { "epoch": 0.8405603735823882, "loss": 1.2250864505767822, "step": 2520 }, { "ce_loss": 0.3773867189884186, "epoch": 0.8405603735823882, "step": 2520 }, { "distill_loss": 0.4996189475059509, "epoch": 0.8405603735823882, "step": 2520 }, { "epoch": 0.8405603735823882, "ref_ce_loss": 0.2604522705078125, "step": 2520 }, { "epoch": 0.8438959306204136, "loss": 1.4173, "step": 2530 }, { "epoch": 0.8438959306204136, "grad_norm": 1.9606977701187134, "step": 2530 }, { "epoch": 0.8438959306204136, "learning_rate": 0.0007938142323017652, "step": 2530 }, { "epoch": 0.8438959306204136, "loss": 1.504172444343567, "step": 2530 }, { "ce_loss": 0.44866085052490234, "epoch": 0.8438959306204136, "step": 2530 }, { "distill_loss": 0.5621837973594666, "epoch": 0.8438959306204136, "step": 2530 }, { "epoch": 0.8438959306204136, "ref_ce_loss": 0.2699384391307831, "step": 2530 }, { "epoch": 0.8438959306204136, "loss": 1.3252410888671875, "step": 2530 }, { "ce_loss": 0.3632029592990875, "epoch": 0.8438959306204136, "step": 2530 }, { "distill_loss": 0.5788899660110474, "epoch": 0.8438959306204136, "step": 2530 }, { "epoch": 0.8438959306204136, "ref_ce_loss": 0.30363452434539795, "step": 2530 }, { "epoch": 0.8472314876584389, "loss": 1.3297, "step": 2540 }, { "epoch": 0.8472314876584389, "grad_norm": 1.4642318487167358, "step": 2540 }, { "epoch": 0.8472314876584389, "learning_rate": 0.0007937382997445605, "step": 2540 }, { "epoch": 0.8472314876584389, "loss": 1.0956921577453613, "step": 2540 }, { "ce_loss": 0.3411797285079956, "epoch": 0.8472314876584389, "step": 2540 }, { "distill_loss": 0.46473002433776855, "epoch": 0.8472314876584389, "step": 2540 }, { "epoch": 0.8472314876584389, "ref_ce_loss": 0.19630055129528046, "step": 2540 }, { "epoch": 0.8472314876584389, "loss": 1.4409770965576172, "step": 2540 }, { "ce_loss": 0.43977662920951843, "epoch": 0.8472314876584389, "step": 2540 }, { "distill_loss": 0.5356355905532837, "epoch": 0.8472314876584389, "step": 2540 }, { "epoch": 0.8472314876584389, "ref_ce_loss": 0.2819962799549103, "step": 2540 }, { "epoch": 0.8505670446964643, "loss": 1.3112, "step": 2550 }, { "epoch": 0.8505670446964643, "grad_norm": 1.6138858795166016, "step": 2550 }, { "epoch": 0.8505670446964643, "learning_rate": 0.0007936619076523922, "step": 2550 }, { "epoch": 0.8505670446964643, "loss": 1.133142113685608, "step": 2550 }, { "ce_loss": 0.3301082253456116, "epoch": 0.8505670446964643, "step": 2550 }, { "distill_loss": 0.5005925297737122, "epoch": 0.8505670446964643, "step": 2550 }, { "epoch": 0.8505670446964643, "ref_ce_loss": 0.23297664523124695, "step": 2550 }, { "epoch": 0.8505670446964643, "loss": 1.3417131900787354, "step": 2550 }, { "ce_loss": 0.4214775860309601, "epoch": 0.8505670446964643, "step": 2550 }, { "distill_loss": 0.5064700841903687, "epoch": 0.8505670446964643, "step": 2550 }, { "epoch": 0.8505670446964643, "ref_ce_loss": 0.2919166386127472, "step": 2550 }, { "epoch": 0.8539026017344896, "loss": 1.4061, "step": 2560 }, { "epoch": 0.8539026017344896, "grad_norm": 3.5541536808013916, "step": 2560 }, { "epoch": 0.8539026017344896, "learning_rate": 0.0007935850561144179, "step": 2560 }, { "epoch": 0.8539026017344896, "loss": 1.4824650287628174, "step": 2560 }, { "ce_loss": 0.40878599882125854, "epoch": 0.8539026017344896, "step": 2560 }, { "distill_loss": 0.5415136218070984, "epoch": 0.8539026017344896, "step": 2560 }, { "epoch": 0.8539026017344896, "ref_ce_loss": 0.30184876918792725, "step": 2560 }, { "epoch": 0.8539026017344896, "loss": 1.5436408519744873, "step": 2560 }, { "ce_loss": 0.4445594847202301, "epoch": 0.8539026017344896, "step": 2560 }, { "distill_loss": 0.5943202376365662, "epoch": 0.8539026017344896, "step": 2560 }, { "epoch": 0.8539026017344896, "ref_ce_loss": 0.24817104637622833, "step": 2560 }, { "epoch": 0.857238158772515, "loss": 1.3978, "step": 2570 }, { "epoch": 0.857238158772515, "grad_norm": 2.0126984119415283, "step": 2570 }, { "epoch": 0.857238158772515, "learning_rate": 0.0007935077452203315, "step": 2570 }, { "epoch": 0.857238158772515, "loss": 1.287863850593567, "step": 2570 }, { "ce_loss": 0.3930123746395111, "epoch": 0.857238158772515, "step": 2570 }, { "distill_loss": 0.3892088532447815, "epoch": 0.857238158772515, "step": 2570 }, { "epoch": 0.857238158772515, "ref_ce_loss": 0.2868179976940155, "step": 2570 }, { "epoch": 0.857238158772515, "loss": 1.4019250869750977, "step": 2570 }, { "ce_loss": 0.45069876313209534, "epoch": 0.857238158772515, "step": 2570 }, { "distill_loss": 0.4385972023010254, "epoch": 0.857238158772515, "step": 2570 }, { "epoch": 0.857238158772515, "ref_ce_loss": 0.30399036407470703, "step": 2570 }, { "epoch": 0.8605737158105403, "loss": 1.3875, "step": 2580 }, { "epoch": 0.8605737158105403, "grad_norm": 1.6553137302398682, "step": 2580 }, { "epoch": 0.8605737158105403, "learning_rate": 0.0007934299750603633, "step": 2580 }, { "epoch": 0.8605737158105403, "loss": 1.6291496753692627, "step": 2580 }, { "ce_loss": 0.3738187551498413, "epoch": 0.8605737158105403, "step": 2580 }, { "distill_loss": 0.5318026542663574, "epoch": 0.8605737158105403, "step": 2580 }, { "epoch": 0.8605737158105403, "ref_ce_loss": 0.33766835927963257, "step": 2580 }, { "epoch": 0.8605737158105403, "loss": 1.650617003440857, "step": 2580 }, { "ce_loss": 0.42238470911979675, "epoch": 0.8605737158105403, "step": 2580 }, { "distill_loss": 0.5802122354507446, "epoch": 0.8605737158105403, "step": 2580 }, { "epoch": 0.8605737158105403, "ref_ce_loss": 0.22837448120117188, "step": 2580 }, { "epoch": 0.8639092728485657, "loss": 1.3322, "step": 2590 }, { "epoch": 0.8639092728485657, "grad_norm": 1.695788860321045, "step": 2590 }, { "epoch": 0.8639092728485657, "learning_rate": 0.0007933517457252794, "step": 2590 }, { "epoch": 0.8639092728485657, "loss": 1.30512273311615, "step": 2590 }, { "ce_loss": 0.3906150758266449, "epoch": 0.8639092728485657, "step": 2590 }, { "distill_loss": 0.5428471565246582, "epoch": 0.8639092728485657, "step": 2590 }, { "epoch": 0.8639092728485657, "ref_ce_loss": 0.3009546995162964, "step": 2590 }, { "epoch": 0.8639092728485657, "loss": 1.2960855960845947, "step": 2590 }, { "ce_loss": 0.379740834236145, "epoch": 0.8639092728485657, "step": 2590 }, { "distill_loss": 0.6086607575416565, "epoch": 0.8639092728485657, "step": 2590 }, { "epoch": 0.8639092728485657, "ref_ce_loss": 0.22640559077262878, "step": 2590 }, { "epoch": 0.867244829886591, "loss": 1.4382, "step": 2600 }, { "epoch": 0.867244829886591, "grad_norm": 1.937280297279358, "step": 2600 }, { "epoch": 0.867244829886591, "learning_rate": 0.0007932730573063818, "step": 2600 }, { "epoch": 0.867244829886591, "loss": 1.3920255899429321, "step": 2600 }, { "ce_loss": 0.47237688302993774, "epoch": 0.867244829886591, "step": 2600 }, { "distill_loss": 0.523831307888031, "epoch": 0.867244829886591, "step": 2600 }, { "epoch": 0.867244829886591, "ref_ce_loss": 0.3163221776485443, "step": 2600 }, { "epoch": 0.867244829886591, "loss": 2.258751392364502, "step": 2600 }, { "ce_loss": 0.3232567608356476, "epoch": 0.867244829886591, "step": 2600 }, { "distill_loss": 0.4784823954105377, "epoch": 0.867244829886591, "step": 2600 }, { "epoch": 0.867244829886591, "ref_ce_loss": 0.26763230562210083, "step": 2600 }, { "epoch": 0.8705803869246164, "loss": 1.3785, "step": 2610 }, { "epoch": 0.8705803869246164, "grad_norm": 2.2580654621124268, "step": 2610 }, { "epoch": 0.8705803869246164, "learning_rate": 0.0007931939098955084, "step": 2610 }, { "epoch": 0.8705803869246164, "loss": 1.2365779876708984, "step": 2610 }, { "ce_loss": 0.29063519835472107, "epoch": 0.8705803869246164, "step": 2610 }, { "distill_loss": 0.6072068214416504, "epoch": 0.8705803869246164, "step": 2610 }, { "epoch": 0.8705803869246164, "ref_ce_loss": 0.23516863584518433, "step": 2610 }, { "epoch": 0.8705803869246164, "loss": 0.9008735418319702, "step": 2610 }, { "ce_loss": 0.2522759437561035, "epoch": 0.8705803869246164, "step": 2610 }, { "distill_loss": 0.4741603434085846, "epoch": 0.8705803869246164, "step": 2610 }, { "epoch": 0.8705803869246164, "ref_ce_loss": 0.17388072609901428, "step": 2610 }, { "epoch": 0.8739159439626417, "loss": 1.2949, "step": 2620 }, { "epoch": 0.8739159439626417, "grad_norm": 2.209432363510132, "step": 2620 }, { "epoch": 0.8739159439626417, "learning_rate": 0.0007931143035850327, "step": 2620 }, { "epoch": 0.8739159439626417, "loss": 1.1630470752716064, "step": 2620 }, { "ce_loss": 0.35149869322776794, "epoch": 0.8739159439626417, "step": 2620 }, { "distill_loss": 0.5646779537200928, "epoch": 0.8739159439626417, "step": 2620 }, { "epoch": 0.8739159439626417, "ref_ce_loss": 0.2467038631439209, "step": 2620 }, { "epoch": 0.8739159439626417, "loss": 1.5878159999847412, "step": 2620 }, { "ce_loss": 0.3178049623966217, "epoch": 0.8739159439626417, "step": 2620 }, { "distill_loss": 0.5251916646957397, "epoch": 0.8739159439626417, "step": 2620 }, { "epoch": 0.8739159439626417, "ref_ce_loss": 0.23808318376541138, "step": 2620 }, { "epoch": 0.8772515010006671, "loss": 1.2839, "step": 2630 }, { "epoch": 0.8772515010006671, "grad_norm": 1.3081079721450806, "step": 2630 }, { "epoch": 0.8772515010006671, "learning_rate": 0.0007930342384678639, "step": 2630 }, { "epoch": 0.8772515010006671, "loss": 1.6465835571289062, "step": 2630 }, { "ce_loss": 0.40648844838142395, "epoch": 0.8772515010006671, "step": 2630 }, { "distill_loss": 0.5527106523513794, "epoch": 0.8772515010006671, "step": 2630 }, { "epoch": 0.8772515010006671, "ref_ce_loss": 0.2469692975282669, "step": 2630 }, { "epoch": 0.8772515010006671, "loss": 1.1751917600631714, "step": 2630 }, { "ce_loss": 0.37903863191604614, "epoch": 0.8772515010006671, "step": 2630 }, { "distill_loss": 0.44293490052223206, "epoch": 0.8772515010006671, "step": 2630 }, { "epoch": 0.8772515010006671, "ref_ce_loss": 0.25282037258148193, "step": 2630 }, { "epoch": 0.8805870580386924, "loss": 1.3603, "step": 2640 }, { "epoch": 0.8805870580386924, "grad_norm": 2.2184786796569824, "step": 2640 }, { "epoch": 0.8805870580386924, "learning_rate": 0.0007929537146374467, "step": 2640 }, { "epoch": 0.8805870580386924, "loss": 1.4768481254577637, "step": 2640 }, { "ce_loss": 0.33723726868629456, "epoch": 0.8805870580386924, "step": 2640 }, { "distill_loss": 0.6114906668663025, "epoch": 0.8805870580386924, "step": 2640 }, { "epoch": 0.8805870580386924, "ref_ce_loss": 0.2681977450847626, "step": 2640 }, { "epoch": 0.8805870580386924, "loss": 1.2050652503967285, "step": 2640 }, { "ce_loss": 0.31978118419647217, "epoch": 0.8805870580386924, "step": 2640 }, { "distill_loss": 0.5009520649909973, "epoch": 0.8805870580386924, "step": 2640 }, { "epoch": 0.8805870580386924, "ref_ce_loss": 0.1821107119321823, "step": 2640 }, { "epoch": 0.8839226150767178, "loss": 1.2414, "step": 2650 }, { "epoch": 0.8839226150767178, "grad_norm": 1.6107494831085205, "step": 2650 }, { "epoch": 0.8839226150767178, "learning_rate": 0.0007928727321877607, "step": 2650 }, { "epoch": 0.8839226150767178, "loss": 1.1943227052688599, "step": 2650 }, { "ce_loss": 0.332078218460083, "epoch": 0.8839226150767178, "step": 2650 }, { "distill_loss": 0.42531952261924744, "epoch": 0.8839226150767178, "step": 2650 }, { "epoch": 0.8839226150767178, "ref_ce_loss": 0.2643662691116333, "step": 2650 }, { "epoch": 0.8839226150767178, "loss": 0.9776313900947571, "step": 2650 }, { "ce_loss": 0.31022366881370544, "epoch": 0.8839226150767178, "step": 2650 }, { "distill_loss": 0.38916870951652527, "epoch": 0.8839226150767178, "step": 2650 }, { "epoch": 0.8839226150767178, "ref_ce_loss": 0.20252655446529388, "step": 2650 }, { "epoch": 0.8872581721147431, "loss": 1.2936, "step": 2660 }, { "epoch": 0.8872581721147431, "grad_norm": 1.5588855743408203, "step": 2660 }, { "epoch": 0.8872581721147431, "learning_rate": 0.0007927912912133215, "step": 2660 }, { "epoch": 0.8872581721147431, "loss": 1.7801876068115234, "step": 2660 }, { "ce_loss": 0.4108361303806305, "epoch": 0.8872581721147431, "step": 2660 }, { "distill_loss": 0.5852051377296448, "epoch": 0.8872581721147431, "step": 2660 }, { "epoch": 0.8872581721147431, "ref_ce_loss": 0.3112657368183136, "step": 2660 }, { "epoch": 0.8872581721147431, "loss": 1.6241830587387085, "step": 2660 }, { "ce_loss": 0.42643311619758606, "epoch": 0.8872581721147431, "step": 2660 }, { "distill_loss": 0.6193053722381592, "epoch": 0.8872581721147431, "step": 2660 }, { "epoch": 0.8872581721147431, "ref_ce_loss": 0.28631946444511414, "step": 2660 }, { "epoch": 0.8905937291527685, "loss": 1.3849, "step": 2670 }, { "epoch": 0.8905937291527685, "grad_norm": 2.8556129932403564, "step": 2670 }, { "epoch": 0.8905937291527685, "learning_rate": 0.0007927093918091795, "step": 2670 }, { "epoch": 0.8905937291527685, "loss": 1.0800200700759888, "step": 2670 }, { "ce_loss": 0.3518918752670288, "epoch": 0.8905937291527685, "step": 2670 }, { "distill_loss": 0.42704787850379944, "epoch": 0.8905937291527685, "step": 2670 }, { "epoch": 0.8905937291527685, "ref_ce_loss": 0.3009520471096039, "step": 2670 }, { "epoch": 0.8905937291527685, "loss": 1.2904795408248901, "step": 2670 }, { "ce_loss": 0.37862733006477356, "epoch": 0.8905937291527685, "step": 2670 }, { "distill_loss": 0.5287166237831116, "epoch": 0.8905937291527685, "step": 2670 }, { "epoch": 0.8905937291527685, "ref_ce_loss": 0.27498745918273926, "step": 2670 }, { "epoch": 0.8939292861907938, "loss": 1.2638, "step": 2680 }, { "epoch": 0.8939292861907938, "grad_norm": 1.6784123182296753, "step": 2680 }, { "epoch": 0.8939292861907938, "learning_rate": 0.0007926270340709198, "step": 2680 }, { "epoch": 0.8939292861907938, "loss": 1.2999699115753174, "step": 2680 }, { "ce_loss": 0.3899799585342407, "epoch": 0.8939292861907938, "step": 2680 }, { "distill_loss": 0.5541209578514099, "epoch": 0.8939292861907938, "step": 2680 }, { "epoch": 0.8939292861907938, "ref_ce_loss": 0.25357407331466675, "step": 2680 }, { "epoch": 0.8939292861907938, "loss": 1.3329912424087524, "step": 2680 }, { "ce_loss": 0.3716970980167389, "epoch": 0.8939292861907938, "step": 2680 }, { "distill_loss": 0.5012491941452026, "epoch": 0.8939292861907938, "step": 2680 }, { "epoch": 0.8939292861907938, "ref_ce_loss": 0.26823121309280396, "step": 2680 }, { "epoch": 0.8972648432288192, "loss": 1.2492, "step": 2690 }, { "epoch": 0.8972648432288192, "grad_norm": 1.9010505676269531, "step": 2690 }, { "epoch": 0.8972648432288192, "learning_rate": 0.0007925442180946629, "step": 2690 }, { "epoch": 0.8972648432288192, "loss": 1.6424200534820557, "step": 2690 }, { "ce_loss": 0.3123776614665985, "epoch": 0.8972648432288192, "step": 2690 }, { "distill_loss": 0.3880579471588135, "epoch": 0.8972648432288192, "step": 2690 }, { "epoch": 0.8972648432288192, "ref_ce_loss": 0.26424267888069153, "step": 2690 }, { "epoch": 0.8972648432288192, "loss": 1.2713721990585327, "step": 2690 }, { "ce_loss": 0.38829106092453003, "epoch": 0.8972648432288192, "step": 2690 }, { "distill_loss": 0.47953101992607117, "epoch": 0.8972648432288192, "step": 2690 }, { "epoch": 0.8972648432288192, "ref_ce_loss": 0.30490636825561523, "step": 2690 }, { "epoch": 0.9006004002668445, "loss": 1.2907, "step": 2700 }, { "epoch": 0.9006004002668445, "grad_norm": 1.915202021598816, "step": 2700 }, { "epoch": 0.9006004002668445, "learning_rate": 0.0007924609439770641, "step": 2700 }, { "epoch": 0.9006004002668445, "loss": 1.3119174242019653, "step": 2700 }, { "ce_loss": 0.37728697061538696, "epoch": 0.9006004002668445, "step": 2700 }, { "distill_loss": 0.526457667350769, "epoch": 0.9006004002668445, "step": 2700 }, { "epoch": 0.9006004002668445, "ref_ce_loss": 0.31909599900245667, "step": 2700 }, { "epoch": 0.9006004002668445, "loss": 1.0590205192565918, "step": 2700 }, { "ce_loss": 0.29085272550582886, "epoch": 0.9006004002668445, "step": 2700 }, { "distill_loss": 0.47949159145355225, "epoch": 0.9006004002668445, "step": 2700 }, { "epoch": 0.9006004002668445, "ref_ce_loss": 0.17374452948570251, "step": 2700 }, { "epoch": 0.9039359573048699, "loss": 1.3608, "step": 2710 }, { "epoch": 0.9039359573048699, "grad_norm": 1.5788755416870117, "step": 2710 }, { "epoch": 0.9039359573048699, "learning_rate": 0.000792377211815313, "step": 2710 }, { "epoch": 0.9039359573048699, "loss": 1.1047440767288208, "step": 2710 }, { "ce_loss": 0.28150567412376404, "epoch": 0.9039359573048699, "step": 2710 }, { "distill_loss": 0.4963770806789398, "epoch": 0.9039359573048699, "step": 2710 }, { "epoch": 0.9039359573048699, "ref_ce_loss": 0.24585790932178497, "step": 2710 }, { "epoch": 0.9039359573048699, "loss": 1.1365015506744385, "step": 2710 }, { "ce_loss": 0.36603617668151855, "epoch": 0.9039359573048699, "step": 2710 }, { "distill_loss": 0.4976668953895569, "epoch": 0.9039359573048699, "step": 2710 }, { "epoch": 0.9039359573048699, "ref_ce_loss": 0.2073359489440918, "step": 2710 }, { "epoch": 0.9072715143428952, "loss": 1.4386, "step": 2720 }, { "epoch": 0.9072715143428952, "grad_norm": 1.7400299310684204, "step": 2720 }, { "epoch": 0.9072715143428952, "learning_rate": 0.0007922930217071344, "step": 2720 }, { "epoch": 0.9072715143428952, "loss": 1.2005311250686646, "step": 2720 }, { "ce_loss": 0.3774307668209076, "epoch": 0.9072715143428952, "step": 2720 }, { "distill_loss": 0.4605158567428589, "epoch": 0.9072715143428952, "step": 2720 }, { "epoch": 0.9072715143428952, "ref_ce_loss": 0.29511210322380066, "step": 2720 }, { "epoch": 0.9072715143428952, "loss": 1.293221116065979, "step": 2720 }, { "ce_loss": 0.31144988536834717, "epoch": 0.9072715143428952, "step": 2720 }, { "distill_loss": 0.47778937220573425, "epoch": 0.9072715143428952, "step": 2720 }, { "epoch": 0.9072715143428952, "ref_ce_loss": 0.27237197756767273, "step": 2720 }, { "epoch": 0.9106070713809206, "loss": 1.3218, "step": 2730 }, { "epoch": 0.9106070713809206, "grad_norm": 2.3409981727600098, "step": 2730 }, { "epoch": 0.9106070713809206, "learning_rate": 0.0007922083737507867, "step": 2730 }, { "epoch": 0.9106070713809206, "loss": 1.3133599758148193, "step": 2730 }, { "ce_loss": 0.34370356798171997, "epoch": 0.9106070713809206, "step": 2730 }, { "distill_loss": 0.40750980377197266, "epoch": 0.9106070713809206, "step": 2730 }, { "epoch": 0.9106070713809206, "ref_ce_loss": 0.2877673804759979, "step": 2730 }, { "epoch": 0.9106070713809206, "loss": 1.3804914951324463, "step": 2730 }, { "ce_loss": 0.4484287202358246, "epoch": 0.9106070713809206, "step": 2730 }, { "distill_loss": 0.46959388256073, "epoch": 0.9106070713809206, "step": 2730 }, { "epoch": 0.9106070713809206, "ref_ce_loss": 0.3645693361759186, "step": 2730 }, { "epoch": 0.9139426284189459, "loss": 1.2124, "step": 2740 }, { "epoch": 0.9139426284189459, "grad_norm": 2.2181167602539062, "step": 2740 }, { "epoch": 0.9139426284189459, "learning_rate": 0.0007921232680450636, "step": 2740 }, { "epoch": 0.9139426284189459, "loss": 1.5099198818206787, "step": 2740 }, { "ce_loss": 0.4248862862586975, "epoch": 0.9139426284189459, "step": 2740 }, { "distill_loss": 0.5102673172950745, "epoch": 0.9139426284189459, "step": 2740 }, { "epoch": 0.9139426284189459, "ref_ce_loss": 0.28975045680999756, "step": 2740 }, { "epoch": 0.9139426284189459, "loss": 1.1082688570022583, "step": 2740 }, { "ce_loss": 0.3047284185886383, "epoch": 0.9139426284189459, "step": 2740 }, { "distill_loss": 0.5240141153335571, "epoch": 0.9139426284189459, "step": 2740 }, { "epoch": 0.9139426284189459, "ref_ce_loss": 0.2123073786497116, "step": 2740 }, { "epoch": 0.9172781854569713, "loss": 1.395, "step": 2750 }, { "epoch": 0.9172781854569713, "grad_norm": 1.5949338674545288, "step": 2750 }, { "epoch": 0.9172781854569713, "learning_rate": 0.0007920377046892926, "step": 2750 }, { "epoch": 0.9172781854569713, "loss": 1.2958911657333374, "step": 2750 }, { "ce_loss": 0.3287883996963501, "epoch": 0.9172781854569713, "step": 2750 }, { "distill_loss": 0.5168973207473755, "epoch": 0.9172781854569713, "step": 2750 }, { "epoch": 0.9172781854569713, "ref_ce_loss": 0.25692057609558105, "step": 2750 }, { "epoch": 0.9172781854569713, "loss": 1.4671964645385742, "step": 2750 }, { "ce_loss": 0.3675439655780792, "epoch": 0.9172781854569713, "step": 2750 }, { "distill_loss": 0.536207914352417, "epoch": 0.9172781854569713, "step": 2750 }, { "epoch": 0.9172781854569713, "ref_ce_loss": 0.2701234221458435, "step": 2750 }, { "epoch": 0.9206137424949966, "loss": 1.3855, "step": 2760 }, { "epoch": 0.9206137424949966, "grad_norm": 1.644079566001892, "step": 2760 }, { "epoch": 0.9206137424949966, "learning_rate": 0.0007919516837833351, "step": 2760 }, { "epoch": 0.9206137424949966, "loss": 1.8431727886199951, "step": 2760 }, { "ce_loss": 0.43441736698150635, "epoch": 0.9206137424949966, "step": 2760 }, { "distill_loss": 0.5581046342849731, "epoch": 0.9206137424949966, "step": 2760 }, { "epoch": 0.9206137424949966, "ref_ce_loss": 0.29165101051330566, "step": 2760 }, { "epoch": 0.9206137424949966, "loss": 1.910833716392517, "step": 2760 }, { "ce_loss": 0.44438034296035767, "epoch": 0.9206137424949966, "step": 2760 }, { "distill_loss": 0.575432538986206, "epoch": 0.9206137424949966, "step": 2760 }, { "epoch": 0.9206137424949966, "ref_ce_loss": 0.271775484085083, "step": 2760 }, { "epoch": 0.923949299533022, "loss": 1.4388, "step": 2770 }, { "epoch": 0.923949299533022, "grad_norm": 2.7886414527893066, "step": 2770 }, { "epoch": 0.923949299533022, "learning_rate": 0.0007918652054275869, "step": 2770 }, { "epoch": 0.923949299533022, "loss": 1.4392640590667725, "step": 2770 }, { "ce_loss": 0.31815871596336365, "epoch": 0.923949299533022, "step": 2770 }, { "distill_loss": 0.4943084418773651, "epoch": 0.923949299533022, "step": 2770 }, { "epoch": 0.923949299533022, "ref_ce_loss": 0.24593886733055115, "step": 2770 }, { "epoch": 0.923949299533022, "loss": 1.0102663040161133, "step": 2770 }, { "ce_loss": 0.2735036015510559, "epoch": 0.923949299533022, "step": 2770 }, { "distill_loss": 0.4617146849632263, "epoch": 0.923949299533022, "step": 2770 }, { "epoch": 0.923949299533022, "ref_ce_loss": 0.19889551401138306, "step": 2770 }, { "epoch": 0.9272848565710473, "loss": 1.4485, "step": 2780 }, { "epoch": 0.9272848565710473, "grad_norm": 1.6201666593551636, "step": 2780 }, { "epoch": 0.9272848565710473, "learning_rate": 0.0007917782697229776, "step": 2780 }, { "epoch": 0.9272848565710473, "loss": 1.2848777770996094, "step": 2780 }, { "ce_loss": 0.39188921451568604, "epoch": 0.9272848565710473, "step": 2780 }, { "distill_loss": 0.5093104243278503, "epoch": 0.9272848565710473, "step": 2780 }, { "epoch": 0.9272848565710473, "ref_ce_loss": 0.30376869440078735, "step": 2780 }, { "epoch": 0.9272848565710473, "loss": 1.471110463142395, "step": 2780 }, { "ce_loss": 0.4805634617805481, "epoch": 0.9272848565710473, "step": 2780 }, { "distill_loss": 0.549042284488678, "epoch": 0.9272848565710473, "step": 2780 }, { "epoch": 0.9272848565710473, "ref_ce_loss": 0.3469245135784149, "step": 2780 }, { "epoch": 0.9306204136090727, "loss": 1.3917, "step": 2790 }, { "epoch": 0.9306204136090727, "grad_norm": 1.7547879219055176, "step": 2790 }, { "epoch": 0.9306204136090727, "learning_rate": 0.0007916908767709703, "step": 2790 }, { "epoch": 0.9306204136090727, "loss": 1.1947146654129028, "step": 2790 }, { "ce_loss": 0.35069945454597473, "epoch": 0.9306204136090727, "step": 2790 }, { "distill_loss": 0.453375905752182, "epoch": 0.9306204136090727, "step": 2790 }, { "epoch": 0.9306204136090727, "ref_ce_loss": 0.2738749086856842, "step": 2790 }, { "epoch": 0.9306204136090727, "loss": 1.2321999073028564, "step": 2790 }, { "ce_loss": 0.2855234742164612, "epoch": 0.9306204136090727, "step": 2790 }, { "distill_loss": 0.40277066826820374, "epoch": 0.9306204136090727, "step": 2790 }, { "epoch": 0.9306204136090727, "ref_ce_loss": 0.18820147216320038, "step": 2790 }, { "epoch": 0.933955970647098, "loss": 1.2753, "step": 2800 }, { "epoch": 0.933955970647098, "grad_norm": 1.6333000659942627, "step": 2800 }, { "epoch": 0.933955970647098, "learning_rate": 0.0007916030266735622, "step": 2800 }, { "epoch": 0.933955970647098, "loss": 1.1305317878723145, "step": 2800 }, { "ce_loss": 0.39312195777893066, "epoch": 0.933955970647098, "step": 2800 }, { "distill_loss": 0.4638335406780243, "epoch": 0.933955970647098, "step": 2800 }, { "epoch": 0.933955970647098, "ref_ce_loss": 0.27346083521842957, "step": 2800 }, { "epoch": 0.933955970647098, "loss": 1.0534822940826416, "step": 2800 }, { "ce_loss": 0.34346362948417664, "epoch": 0.933955970647098, "step": 2800 }, { "distill_loss": 0.4592100977897644, "epoch": 0.933955970647098, "step": 2800 }, { "epoch": 0.933955970647098, "ref_ce_loss": 0.24397408962249756, "step": 2800 }, { "epoch": 0.9372915276851234, "loss": 1.2893, "step": 2810 }, { "epoch": 0.9372915276851234, "grad_norm": 1.5109015703201294, "step": 2810 }, { "epoch": 0.9372915276851234, "learning_rate": 0.0007915147195332838, "step": 2810 }, { "epoch": 0.9372915276851234, "loss": 1.3544869422912598, "step": 2810 }, { "ce_loss": 0.4233008027076721, "epoch": 0.9372915276851234, "step": 2810 }, { "distill_loss": 0.515604555606842, "epoch": 0.9372915276851234, "step": 2810 }, { "epoch": 0.9372915276851234, "ref_ce_loss": 0.22143079340457916, "step": 2810 }, { "epoch": 0.9372915276851234, "loss": 1.2287521362304688, "step": 2810 }, { "ce_loss": 0.31915584206581116, "epoch": 0.9372915276851234, "step": 2810 }, { "distill_loss": 0.4785001277923584, "epoch": 0.9372915276851234, "step": 2810 }, { "epoch": 0.9372915276851234, "ref_ce_loss": 0.23532734811306, "step": 2810 }, { "epoch": 0.9406270847231488, "loss": 1.5232, "step": 2820 }, { "epoch": 0.9406270847231488, "grad_norm": 2.7087512016296387, "step": 2820 }, { "epoch": 0.9406270847231488, "learning_rate": 0.0007914259554531989, "step": 2820 }, { "epoch": 0.9406270847231488, "loss": 1.053950548171997, "step": 2820 }, { "ce_loss": 0.31700703501701355, "epoch": 0.9406270847231488, "step": 2820 }, { "distill_loss": 0.4933873116970062, "epoch": 0.9406270847231488, "step": 2820 }, { "epoch": 0.9406270847231488, "ref_ce_loss": 0.22106169164180756, "step": 2820 }, { "epoch": 0.9406270847231488, "loss": 1.3688534498214722, "step": 2820 }, { "ce_loss": 0.39123156666755676, "epoch": 0.9406270847231488, "step": 2820 }, { "distill_loss": 0.5485091209411621, "epoch": 0.9406270847231488, "step": 2820 }, { "epoch": 0.9406270847231488, "ref_ce_loss": 0.27018389105796814, "step": 2820 }, { "epoch": 0.9439626417611742, "loss": 1.4779, "step": 2830 }, { "epoch": 0.9439626417611742, "grad_norm": 2.3458189964294434, "step": 2830 }, { "epoch": 0.9439626417611742, "learning_rate": 0.0007913367345369048, "step": 2830 }, { "epoch": 0.9439626417611742, "loss": 2.1609432697296143, "step": 2830 }, { "ce_loss": 0.4953167736530304, "epoch": 0.9439626417611742, "step": 2830 }, { "distill_loss": 0.6101577877998352, "epoch": 0.9439626417611742, "step": 2830 }, { "epoch": 0.9439626417611742, "ref_ce_loss": 0.28571856021881104, "step": 2830 }, { "epoch": 0.9439626417611742, "loss": 1.1443687677383423, "step": 2830 }, { "ce_loss": 0.2994648516178131, "epoch": 0.9439626417611742, "step": 2830 }, { "distill_loss": 0.515932023525238, "epoch": 0.9439626417611742, "step": 2830 }, { "epoch": 0.9439626417611742, "ref_ce_loss": 0.2316143661737442, "step": 2830 }, { "epoch": 0.9472981987991995, "loss": 1.3814, "step": 2840 }, { "epoch": 0.9472981987991995, "grad_norm": 2.0391764640808105, "step": 2840 }, { "epoch": 0.9472981987991995, "learning_rate": 0.000791247056888532, "step": 2840 }, { "epoch": 0.9472981987991995, "loss": 1.5001659393310547, "step": 2840 }, { "ce_loss": 0.3742694854736328, "epoch": 0.9472981987991995, "step": 2840 }, { "distill_loss": 0.38781672716140747, "epoch": 0.9472981987991995, "step": 2840 }, { "epoch": 0.9472981987991995, "ref_ce_loss": 0.21688145399093628, "step": 2840 }, { "epoch": 0.9472981987991995, "loss": 1.1347072124481201, "step": 2840 }, { "ce_loss": 0.3934358060359955, "epoch": 0.9472981987991995, "step": 2840 }, { "distill_loss": 0.49731510877609253, "epoch": 0.9472981987991995, "step": 2840 }, { "epoch": 0.9472981987991995, "ref_ce_loss": 0.24390080571174622, "step": 2840 }, { "epoch": 0.9506337558372249, "loss": 1.3343, "step": 2850 }, { "epoch": 0.9506337558372249, "grad_norm": 1.5864659547805786, "step": 2850 }, { "epoch": 0.9506337558372249, "learning_rate": 0.0007911569226127438, "step": 2850 }, { "epoch": 0.9506337558372249, "loss": 1.3999756574630737, "step": 2850 }, { "ce_loss": 0.29279372096061707, "epoch": 0.9506337558372249, "step": 2850 }, { "distill_loss": 0.49844199419021606, "epoch": 0.9506337558372249, "step": 2850 }, { "epoch": 0.9506337558372249, "ref_ce_loss": 0.22119459509849548, "step": 2850 }, { "epoch": 0.9506337558372249, "loss": 1.1619170904159546, "step": 2850 }, { "ce_loss": 0.334495484828949, "epoch": 0.9506337558372249, "step": 2850 }, { "distill_loss": 0.4583854675292969, "epoch": 0.9506337558372249, "step": 2850 }, { "epoch": 0.9506337558372249, "ref_ce_loss": 0.28474098443984985, "step": 2850 }, { "epoch": 0.9539693128752502, "loss": 1.4491, "step": 2860 }, { "epoch": 0.9539693128752502, "grad_norm": 1.9358841180801392, "step": 2860 }, { "epoch": 0.9539693128752502, "learning_rate": 0.0007910663318147368, "step": 2860 }, { "epoch": 0.9539693128752502, "loss": 1.1117554903030396, "step": 2860 }, { "ce_loss": 0.29617729783058167, "epoch": 0.9539693128752502, "step": 2860 }, { "distill_loss": 0.4828736484050751, "epoch": 0.9539693128752502, "step": 2860 }, { "epoch": 0.9539693128752502, "ref_ce_loss": 0.22240567207336426, "step": 2860 }, { "epoch": 0.9539693128752502, "loss": 1.3098286390304565, "step": 2860 }, { "ce_loss": 0.3587772250175476, "epoch": 0.9539693128752502, "step": 2860 }, { "distill_loss": 0.5689359903335571, "epoch": 0.9539693128752502, "step": 2860 }, { "epoch": 0.9539693128752502, "ref_ce_loss": 0.21025978028774261, "step": 2860 }, { "epoch": 0.9573048699132756, "loss": 1.4297, "step": 2870 }, { "epoch": 0.9573048699132756, "grad_norm": 1.8924882411956787, "step": 2870 }, { "epoch": 0.9573048699132756, "learning_rate": 0.00079097528460024, "step": 2870 }, { "epoch": 0.9573048699132756, "loss": 1.4298171997070312, "step": 2870 }, { "ce_loss": 0.3467281460762024, "epoch": 0.9573048699132756, "step": 2870 }, { "distill_loss": 0.6316140294075012, "epoch": 0.9573048699132756, "step": 2870 }, { "epoch": 0.9573048699132756, "ref_ce_loss": 0.1883177012205124, "step": 2870 }, { "epoch": 0.9573048699132756, "loss": 1.6898854970932007, "step": 2870 }, { "ce_loss": 0.31687483191490173, "epoch": 0.9573048699132756, "step": 2870 }, { "distill_loss": 0.5835000276565552, "epoch": 0.9573048699132756, "step": 2870 }, { "epoch": 0.9573048699132756, "ref_ce_loss": 0.276297926902771, "step": 2870 }, { "epoch": 0.9606404269513009, "loss": 1.3525, "step": 2880 }, { "epoch": 0.9606404269513009, "grad_norm": 1.3143359422683716, "step": 2880 }, { "epoch": 0.9606404269513009, "learning_rate": 0.0007908837810755154, "step": 2880 }, { "epoch": 0.9606404269513009, "loss": 1.9876177310943604, "step": 2880 }, { "ce_loss": 0.3657590448856354, "epoch": 0.9606404269513009, "step": 2880 }, { "distill_loss": 0.48195603489875793, "epoch": 0.9606404269513009, "step": 2880 }, { "epoch": 0.9606404269513009, "ref_ce_loss": 0.23338349163532257, "step": 2880 }, { "epoch": 0.9606404269513009, "loss": 1.2629928588867188, "step": 2880 }, { "ce_loss": 0.3671756088733673, "epoch": 0.9606404269513009, "step": 2880 }, { "distill_loss": 0.5391901135444641, "epoch": 0.9606404269513009, "step": 2880 }, { "epoch": 0.9606404269513009, "ref_ce_loss": 0.2850986123085022, "step": 2880 }, { "epoch": 0.9639759839893263, "loss": 1.4018, "step": 2890 }, { "epoch": 0.9639759839893263, "grad_norm": 1.9271106719970703, "step": 2890 }, { "epoch": 0.9639759839893263, "learning_rate": 0.0007907918213473574, "step": 2890 }, { "epoch": 0.9639759839893263, "loss": 1.1845937967300415, "step": 2890 }, { "ce_loss": 0.4012899398803711, "epoch": 0.9639759839893263, "step": 2890 }, { "distill_loss": 0.5150628089904785, "epoch": 0.9639759839893263, "step": 2890 }, { "epoch": 0.9639759839893263, "ref_ce_loss": 0.26796332001686096, "step": 2890 }, { "epoch": 0.9639759839893263, "loss": 1.050054669380188, "step": 2890 }, { "ce_loss": 0.2966417372226715, "epoch": 0.9639759839893263, "step": 2890 }, { "distill_loss": 0.43445032835006714, "epoch": 0.9639759839893263, "step": 2890 }, { "epoch": 0.9639759839893263, "ref_ce_loss": 0.2567691504955292, "step": 2890 }, { "epoch": 0.9673115410273516, "loss": 1.2895, "step": 2900 }, { "epoch": 0.9673115410273516, "grad_norm": 1.831406593322754, "step": 2900 }, { "epoch": 0.9673115410273516, "learning_rate": 0.000790699405523093, "step": 2900 }, { "epoch": 0.9673115410273516, "loss": 1.1192868947982788, "step": 2900 }, { "ce_loss": 0.3219367563724518, "epoch": 0.9673115410273516, "step": 2900 }, { "distill_loss": 0.5257210731506348, "epoch": 0.9673115410273516, "step": 2900 }, { "epoch": 0.9673115410273516, "ref_ce_loss": 0.25633111596107483, "step": 2900 }, { "epoch": 0.9673115410273516, "loss": 1.2179416418075562, "step": 2900 }, { "ce_loss": 0.35467156767845154, "epoch": 0.9673115410273516, "step": 2900 }, { "distill_loss": 0.5465186238288879, "epoch": 0.9673115410273516, "step": 2900 }, { "epoch": 0.9673115410273516, "ref_ce_loss": 0.31613627076148987, "step": 2900 }, { "epoch": 0.970647098065377, "loss": 1.2964, "step": 2910 }, { "epoch": 0.970647098065377, "grad_norm": 1.3753669261932373, "step": 2910 }, { "epoch": 0.970647098065377, "learning_rate": 0.0007906065337105814, "step": 2910 }, { "epoch": 0.970647098065377, "loss": 1.250396728515625, "step": 2910 }, { "ce_loss": 0.4102431833744049, "epoch": 0.970647098065377, "step": 2910 }, { "distill_loss": 0.5822953581809998, "epoch": 0.970647098065377, "step": 2910 }, { "epoch": 0.970647098065377, "ref_ce_loss": 0.2576345205307007, "step": 2910 }, { "epoch": 0.970647098065377, "loss": 1.4072184562683105, "step": 2910 }, { "ce_loss": 0.3921785056591034, "epoch": 0.970647098065377, "step": 2910 }, { "distill_loss": 0.513314962387085, "epoch": 0.970647098065377, "step": 2910 }, { "epoch": 0.970647098065377, "ref_ce_loss": 0.2481769621372223, "step": 2910 }, { "epoch": 0.9739826551034023, "loss": 1.3336, "step": 2920 }, { "epoch": 0.9739826551034023, "grad_norm": 1.6441563367843628, "step": 2920 }, { "epoch": 0.9739826551034023, "learning_rate": 0.0007905132060182138, "step": 2920 }, { "epoch": 0.9739826551034023, "loss": 1.6755125522613525, "step": 2920 }, { "ce_loss": 0.41334229707717896, "epoch": 0.9739826551034023, "step": 2920 }, { "distill_loss": 0.6247754693031311, "epoch": 0.9739826551034023, "step": 2920 }, { "epoch": 0.9739826551034023, "ref_ce_loss": 0.2773612141609192, "step": 2920 }, { "epoch": 0.9739826551034023, "loss": 1.2987523078918457, "step": 2920 }, { "ce_loss": 0.3157593309879303, "epoch": 0.9739826551034023, "step": 2920 }, { "distill_loss": 0.48028379678726196, "epoch": 0.9739826551034023, "step": 2920 }, { "epoch": 0.9739826551034023, "ref_ce_loss": 0.23197156190872192, "step": 2920 }, { "epoch": 0.9773182121414277, "loss": 1.295, "step": 2930 }, { "epoch": 0.9773182121414277, "grad_norm": 2.0256261825561523, "step": 2930 }, { "epoch": 0.9773182121414277, "learning_rate": 0.000790419422554914, "step": 2930 }, { "epoch": 0.9773182121414277, "loss": 2.13342022895813, "step": 2930 }, { "ce_loss": 0.40803754329681396, "epoch": 0.9773182121414277, "step": 2930 }, { "distill_loss": 0.6608292460441589, "epoch": 0.9773182121414277, "step": 2930 }, { "epoch": 0.9773182121414277, "ref_ce_loss": 0.30405136942863464, "step": 2930 }, { "epoch": 0.9773182121414277, "loss": 1.7026296854019165, "step": 2930 }, { "ce_loss": 0.47600406408309937, "epoch": 0.9773182121414277, "step": 2930 }, { "distill_loss": 0.6173453330993652, "epoch": 0.9773182121414277, "step": 2930 }, { "epoch": 0.9773182121414277, "ref_ce_loss": 0.2594636380672455, "step": 2930 }, { "epoch": 0.980653769179453, "loss": 1.3692, "step": 2940 }, { "epoch": 0.980653769179453, "grad_norm": 2.0748965740203857, "step": 2940 }, { "epoch": 0.980653769179453, "learning_rate": 0.0007903251834301372, "step": 2940 }, { "epoch": 0.980653769179453, "loss": 0.9908037185668945, "step": 2940 }, { "ce_loss": 0.2691829204559326, "epoch": 0.980653769179453, "step": 2940 }, { "distill_loss": 0.4488098621368408, "epoch": 0.980653769179453, "step": 2940 }, { "epoch": 0.980653769179453, "ref_ce_loss": 0.1903645396232605, "step": 2940 }, { "epoch": 0.980653769179453, "loss": 1.34403657913208, "step": 2940 }, { "ce_loss": 0.3330442011356354, "epoch": 0.980653769179453, "step": 2940 }, { "distill_loss": 0.5426937937736511, "epoch": 0.980653769179453, "step": 2940 }, { "epoch": 0.980653769179453, "ref_ce_loss": 0.24648922681808472, "step": 2940 }, { "epoch": 0.9839893262174784, "loss": 1.2965, "step": 2950 }, { "epoch": 0.9839893262174784, "grad_norm": 1.500327229499817, "step": 2950 }, { "epoch": 0.9839893262174784, "learning_rate": 0.0007902304887538705, "step": 2950 }, { "epoch": 0.9839893262174784, "loss": 1.0015138387680054, "step": 2950 }, { "ce_loss": 0.27747029066085815, "epoch": 0.9839893262174784, "step": 2950 }, { "distill_loss": 0.3931051194667816, "epoch": 0.9839893262174784, "step": 2950 }, { "epoch": 0.9839893262174784, "ref_ce_loss": 0.18407243490219116, "step": 2950 }, { "epoch": 0.9839893262174784, "loss": 1.3468683958053589, "step": 2950 }, { "ce_loss": 0.3932223916053772, "epoch": 0.9839893262174784, "step": 2950 }, { "distill_loss": 0.48848363757133484, "epoch": 0.9839893262174784, "step": 2950 }, { "epoch": 0.9839893262174784, "ref_ce_loss": 0.3180612623691559, "step": 2950 }, { "epoch": 0.9873248832555037, "loss": 1.3295, "step": 2960 }, { "epoch": 0.9873248832555037, "grad_norm": 2.592111110687256, "step": 2960 }, { "epoch": 0.9873248832555037, "learning_rate": 0.000790135338636633, "step": 2960 }, { "epoch": 0.9873248832555037, "loss": 1.3312675952911377, "step": 2960 }, { "ce_loss": 0.40466398000717163, "epoch": 0.9873248832555037, "step": 2960 }, { "distill_loss": 0.5289773941040039, "epoch": 0.9873248832555037, "step": 2960 }, { "epoch": 0.9873248832555037, "ref_ce_loss": 0.3111151158809662, "step": 2960 }, { "epoch": 0.9873248832555037, "loss": 1.2488740682601929, "step": 2960 }, { "ce_loss": 0.4030848741531372, "epoch": 0.9873248832555037, "step": 2960 }, { "distill_loss": 0.5210260152816772, "epoch": 0.9873248832555037, "step": 2960 }, { "epoch": 0.9873248832555037, "ref_ce_loss": 0.24048668146133423, "step": 2960 }, { "epoch": 0.9906604402935291, "loss": 1.3397, "step": 2970 }, { "epoch": 0.9906604402935291, "grad_norm": 4.065495491027832, "step": 2970 }, { "epoch": 0.9906604402935291, "learning_rate": 0.0007900397331894749, "step": 2970 }, { "epoch": 0.9906604402935291, "loss": 1.8174149990081787, "step": 2970 }, { "ce_loss": 0.40033072233200073, "epoch": 0.9906604402935291, "step": 2970 }, { "distill_loss": 0.48885297775268555, "epoch": 0.9906604402935291, "step": 2970 }, { "epoch": 0.9906604402935291, "ref_ce_loss": 0.27875208854675293, "step": 2970 }, { "epoch": 0.9906604402935291, "loss": 1.0776383876800537, "step": 2970 }, { "ce_loss": 0.30982306599617004, "epoch": 0.9906604402935291, "step": 2970 }, { "distill_loss": 0.4886612296104431, "epoch": 0.9906604402935291, "step": 2970 }, { "epoch": 0.9906604402935291, "ref_ce_loss": 0.18483254313468933, "step": 2970 }, { "epoch": 0.9939959973315544, "loss": 1.4159, "step": 2980 }, { "epoch": 0.9939959973315544, "grad_norm": 1.8326114416122437, "step": 2980 }, { "epoch": 0.9939959973315544, "learning_rate": 0.0007899436725239782, "step": 2980 }, { "epoch": 0.9939959973315544, "loss": 1.6776779890060425, "step": 2980 }, { "ce_loss": 0.4594779312610626, "epoch": 0.9939959973315544, "step": 2980 }, { "distill_loss": 0.6358326077461243, "epoch": 0.9939959973315544, "step": 2980 }, { "epoch": 0.9939959973315544, "ref_ce_loss": 0.3393205404281616, "step": 2980 }, { "epoch": 0.9939959973315544, "loss": 1.2762123346328735, "step": 2980 }, { "ce_loss": 0.3621397018432617, "epoch": 0.9939959973315544, "step": 2980 }, { "distill_loss": 0.5921629071235657, "epoch": 0.9939959973315544, "step": 2980 }, { "epoch": 0.9939959973315544, "ref_ce_loss": 0.2265629917383194, "step": 2980 }, { "epoch": 0.9973315543695798, "loss": 1.3182, "step": 2990 }, { "epoch": 0.9973315543695798, "grad_norm": 2.937052011489868, "step": 2990 }, { "epoch": 0.9973315543695798, "learning_rate": 0.000789847156752256, "step": 2990 }, { "epoch": 0.9973315543695798, "loss": 1.1503270864486694, "step": 2990 }, { "ce_loss": 0.3201557993888855, "epoch": 0.9973315543695798, "step": 2990 }, { "distill_loss": 0.4587818682193756, "epoch": 0.9973315543695798, "step": 2990 }, { "epoch": 0.9973315543695798, "ref_ce_loss": 0.30107244849205017, "step": 2990 }, { "epoch": 0.9973315543695798, "loss": 1.6935726404190063, "step": 2990 }, { "ce_loss": 0.39489102363586426, "epoch": 0.9973315543695798, "step": 2990 }, { "distill_loss": 0.5594819784164429, "epoch": 0.9973315543695798, "step": 2990 }, { "epoch": 0.9973315543695798, "ref_ce_loss": 0.2576351463794708, "step": 2990 }, { "epoch": 1.0006671114076051, "loss": 1.242, "step": 3000 }, { "epoch": 1.0006671114076051, "grad_norm": 1.2817131280899048, "step": 3000 }, { "epoch": 1.0006671114076051, "learning_rate": 0.0007897501859869525, "step": 3000 }, { "epoch": 1.0006671114076051, "loss": 1.1143462657928467, "step": 3000 }, { "ce_loss": 0.31217432022094727, "epoch": 1.0006671114076051, "step": 3000 }, { "distill_loss": 0.5033187866210938, "epoch": 1.0006671114076051, "step": 3000 }, { "epoch": 1.0006671114076051, "ref_ce_loss": 0.2228211611509323, "step": 3000 }, { "epoch": 1.0006671114076051, "loss": 1.2181870937347412, "step": 3000 }, { "ce_loss": 0.34828251600265503, "epoch": 1.0006671114076051, "step": 3000 }, { "distill_loss": 0.5122925639152527, "epoch": 1.0006671114076051, "step": 3000 }, { "epoch": 1.0006671114076051, "ref_ce_loss": 0.2240781933069229, "step": 3000 }, { "epoch": 1.0040026684456305, "loss": 1.1703, "step": 3010 }, { "epoch": 1.0040026684456305, "grad_norm": 1.5048034191131592, "step": 3010 }, { "epoch": 1.0040026684456305, "learning_rate": 0.0007896527603412433, "step": 3010 }, { "epoch": 1.0040026684456305, "loss": 1.2737293243408203, "step": 3010 }, { "ce_loss": 0.332550048828125, "epoch": 1.0040026684456305, "step": 3010 }, { "distill_loss": 0.49337321519851685, "epoch": 1.0040026684456305, "step": 3010 }, { "epoch": 1.0040026684456305, "ref_ce_loss": 0.2538461983203888, "step": 3010 }, { "epoch": 1.0040026684456305, "loss": 1.1636898517608643, "step": 3010 }, { "ce_loss": 0.2893487513065338, "epoch": 1.0040026684456305, "step": 3010 }, { "distill_loss": 0.46247366070747375, "epoch": 1.0040026684456305, "step": 3010 }, { "epoch": 1.0040026684456305, "ref_ce_loss": 0.2124393880367279, "step": 3010 }, { "epoch": 1.0073382254836558, "loss": 1.224, "step": 3020 }, { "epoch": 1.0073382254836558, "grad_norm": 1.7727106809616089, "step": 3020 }, { "epoch": 1.0073382254836558, "learning_rate": 0.0007895548799288343, "step": 3020 }, { "epoch": 1.0073382254836558, "loss": 1.5254015922546387, "step": 3020 }, { "ce_loss": 0.560138463973999, "epoch": 1.0073382254836558, "step": 3020 }, { "distill_loss": 0.5498660802841187, "epoch": 1.0073382254836558, "step": 3020 }, { "epoch": 1.0073382254836558, "ref_ce_loss": 0.4145788848400116, "step": 3020 }, { "epoch": 1.0073382254836558, "loss": 1.0851985216140747, "step": 3020 }, { "ce_loss": 0.3535903990268707, "epoch": 1.0073382254836558, "step": 3020 }, { "distill_loss": 0.5172708034515381, "epoch": 1.0073382254836558, "step": 3020 }, { "epoch": 1.0073382254836558, "ref_ce_loss": 0.21308091282844543, "step": 3020 }, { "epoch": 1.0106737825216812, "loss": 1.3062, "step": 3030 }, { "epoch": 1.0106737825216812, "grad_norm": 2.0452983379364014, "step": 3030 }, { "epoch": 1.0106737825216812, "learning_rate": 0.0007894565448639626, "step": 3030 }, { "epoch": 1.0106737825216812, "loss": 1.3256267309188843, "step": 3030 }, { "ce_loss": 0.35627481341362, "epoch": 1.0106737825216812, "step": 3030 }, { "distill_loss": 0.6277180314064026, "epoch": 1.0106737825216812, "step": 3030 }, { "epoch": 1.0106737825216812, "ref_ce_loss": 0.2603911757469177, "step": 3030 }, { "epoch": 1.0106737825216812, "loss": 1.2563767433166504, "step": 3030 }, { "ce_loss": 0.31319910287857056, "epoch": 1.0106737825216812, "step": 3030 }, { "distill_loss": 0.6499719619750977, "epoch": 1.0106737825216812, "step": 3030 }, { "epoch": 1.0106737825216812, "ref_ce_loss": 0.22245129942893982, "step": 3030 }, { "epoch": 1.0140093395597065, "loss": 1.3513, "step": 3040 }, { "epoch": 1.0140093395597065, "grad_norm": 3.9679436683654785, "step": 3040 }, { "epoch": 1.0140093395597065, "learning_rate": 0.0007893577552613957, "step": 3040 }, { "epoch": 1.0140093395597065, "loss": 1.243039846420288, "step": 3040 }, { "ce_loss": 0.37055256962776184, "epoch": 1.0140093395597065, "step": 3040 }, { "distill_loss": 0.5363262295722961, "epoch": 1.0140093395597065, "step": 3040 }, { "epoch": 1.0140093395597065, "ref_ce_loss": 0.27428340911865234, "step": 3040 }, { "epoch": 1.0140093395597065, "loss": 1.2832188606262207, "step": 3040 }, { "ce_loss": 0.3100008964538574, "epoch": 1.0140093395597065, "step": 3040 }, { "distill_loss": 0.5250524878501892, "epoch": 1.0140093395597065, "step": 3040 }, { "epoch": 1.0140093395597065, "ref_ce_loss": 0.19188229739665985, "step": 3040 }, { "epoch": 1.0173448965977319, "loss": 1.3488, "step": 3050 }, { "epoch": 1.0173448965977319, "grad_norm": 1.9552648067474365, "step": 3050 }, { "epoch": 1.0173448965977319, "learning_rate": 0.0007892585112364318, "step": 3050 }, { "epoch": 1.0173448965977319, "loss": 1.4004244804382324, "step": 3050 }, { "ce_loss": 0.45617711544036865, "epoch": 1.0173448965977319, "step": 3050 }, { "distill_loss": 0.6841182708740234, "epoch": 1.0173448965977319, "step": 3050 }, { "epoch": 1.0173448965977319, "ref_ce_loss": 0.2595180571079254, "step": 3050 }, { "epoch": 1.0173448965977319, "loss": 0.9640132188796997, "step": 3050 }, { "ce_loss": 0.2764337360858917, "epoch": 1.0173448965977319, "step": 3050 }, { "distill_loss": 0.42042332887649536, "epoch": 1.0173448965977319, "step": 3050 }, { "epoch": 1.0173448965977319, "ref_ce_loss": 0.17508722841739655, "step": 3050 }, { "epoch": 1.0206804536357572, "loss": 1.2818, "step": 3060 }, { "epoch": 1.0206804536357572, "grad_norm": 1.4938023090362549, "step": 3060 }, { "epoch": 1.0206804536357572, "learning_rate": 0.0007891588129048994, "step": 3060 }, { "epoch": 1.0206804536357572, "loss": 1.051990270614624, "step": 3060 }, { "ce_loss": 0.30879828333854675, "epoch": 1.0206804536357572, "step": 3060 }, { "distill_loss": 0.4311293959617615, "epoch": 1.0206804536357572, "step": 3060 }, { "epoch": 1.0206804536357572, "ref_ce_loss": 0.2406865358352661, "step": 3060 }, { "epoch": 1.0206804536357572, "loss": 1.816645622253418, "step": 3060 }, { "ce_loss": 0.3380657136440277, "epoch": 1.0206804536357572, "step": 3060 }, { "distill_loss": 0.4694381058216095, "epoch": 1.0206804536357572, "step": 3060 }, { "epoch": 1.0206804536357572, "ref_ce_loss": 0.24928055703639984, "step": 3060 }, { "epoch": 1.0240160106737826, "loss": 1.2024, "step": 3070 }, { "epoch": 1.0240160106737826, "grad_norm": 1.9541714191436768, "step": 3070 }, { "epoch": 1.0240160106737826, "learning_rate": 0.000789058660383157, "step": 3070 }, { "epoch": 1.0240160106737826, "loss": 1.106635570526123, "step": 3070 }, { "ce_loss": 0.2738843262195587, "epoch": 1.0240160106737826, "step": 3070 }, { "distill_loss": 0.43765518069267273, "epoch": 1.0240160106737826, "step": 3070 }, { "epoch": 1.0240160106737826, "ref_ce_loss": 0.21291719377040863, "step": 3070 }, { "epoch": 1.0240160106737826, "loss": 1.5708824396133423, "step": 3070 }, { "ce_loss": 0.34916871786117554, "epoch": 1.0240160106737826, "step": 3070 }, { "distill_loss": 0.4993576109409332, "epoch": 1.0240160106737826, "step": 3070 }, { "epoch": 1.0240160106737826, "ref_ce_loss": 0.2292649745941162, "step": 3070 }, { "epoch": 1.027351567711808, "loss": 1.279, "step": 3080 }, { "epoch": 1.027351567711808, "grad_norm": 2.1405253410339355, "step": 3080 }, { "epoch": 1.027351567711808, "learning_rate": 0.0007889580537880937, "step": 3080 }, { "epoch": 1.027351567711808, "loss": 1.29014253616333, "step": 3080 }, { "ce_loss": 0.35143256187438965, "epoch": 1.027351567711808, "step": 3080 }, { "distill_loss": 0.5421102643013, "epoch": 1.027351567711808, "step": 3080 }, { "epoch": 1.027351567711808, "ref_ce_loss": 0.23601114749908447, "step": 3080 }, { "epoch": 1.027351567711808, "loss": 1.138009786605835, "step": 3080 }, { "ce_loss": 0.31548169255256653, "epoch": 1.027351567711808, "step": 3080 }, { "distill_loss": 0.5134023427963257, "epoch": 1.027351567711808, "step": 3080 }, { "epoch": 1.027351567711808, "ref_ce_loss": 0.1867564469575882, "step": 3080 }, { "epoch": 1.0306871247498333, "loss": 1.2572, "step": 3090 }, { "epoch": 1.0306871247498333, "grad_norm": 1.6170761585235596, "step": 3090 }, { "epoch": 1.0306871247498333, "learning_rate": 0.0007888569932371277, "step": 3090 }, { "epoch": 1.0306871247498333, "loss": 1.2278735637664795, "step": 3090 }, { "ce_loss": 0.3722643256187439, "epoch": 1.0306871247498333, "step": 3090 }, { "distill_loss": 0.531057596206665, "epoch": 1.0306871247498333, "step": 3090 }, { "epoch": 1.0306871247498333, "ref_ce_loss": 0.2565631568431854, "step": 3090 }, { "epoch": 1.0306871247498333, "loss": 1.568192958831787, "step": 3090 }, { "ce_loss": 0.3835557997226715, "epoch": 1.0306871247498333, "step": 3090 }, { "distill_loss": 0.550069272518158, "epoch": 1.0306871247498333, "step": 3090 }, { "epoch": 1.0306871247498333, "ref_ce_loss": 0.3273247182369232, "step": 3090 }, { "epoch": 1.0340226817878586, "loss": 1.2316, "step": 3100 }, { "epoch": 1.0340226817878586, "grad_norm": 1.3664284944534302, "step": 3100 }, { "epoch": 1.0340226817878586, "learning_rate": 0.0007887554788482082, "step": 3100 }, { "epoch": 1.0340226817878586, "loss": 1.4323724508285522, "step": 3100 }, { "ce_loss": 0.38257670402526855, "epoch": 1.0340226817878586, "step": 3100 }, { "distill_loss": 0.4578417241573334, "epoch": 1.0340226817878586, "step": 3100 }, { "epoch": 1.0340226817878586, "ref_ce_loss": 0.2541353702545166, "step": 3100 }, { "epoch": 1.0340226817878586, "loss": 1.0234249830245972, "step": 3100 }, { "ce_loss": 0.303827166557312, "epoch": 1.0340226817878586, "step": 3100 }, { "distill_loss": 0.4745440185070038, "epoch": 1.0340226817878586, "step": 3100 }, { "epoch": 1.0340226817878586, "ref_ce_loss": 0.18835636973381042, "step": 3100 }, { "epoch": 1.037358238825884, "loss": 1.2003, "step": 3110 }, { "epoch": 1.037358238825884, "grad_norm": 1.3299576044082642, "step": 3110 }, { "epoch": 1.037358238825884, "learning_rate": 0.0007886535107398128, "step": 3110 }, { "epoch": 1.037358238825884, "loss": 1.4359586238861084, "step": 3110 }, { "ce_loss": 0.3999103903770447, "epoch": 1.037358238825884, "step": 3110 }, { "distill_loss": 0.5249671339988708, "epoch": 1.037358238825884, "step": 3110 }, { "epoch": 1.037358238825884, "ref_ce_loss": 0.25556695461273193, "step": 3110 }, { "epoch": 1.037358238825884, "loss": 1.2247552871704102, "step": 3110 }, { "ce_loss": 0.34925416111946106, "epoch": 1.037358238825884, "step": 3110 }, { "distill_loss": 0.47998103499412537, "epoch": 1.037358238825884, "step": 3110 }, { "epoch": 1.037358238825884, "ref_ce_loss": 0.22177754342556, "step": 3110 }, { "epoch": 1.0406937958639093, "loss": 1.3417, "step": 3120 }, { "epoch": 1.0406937958639093, "grad_norm": 1.6552432775497437, "step": 3120 }, { "epoch": 1.0406937958639093, "learning_rate": 0.0007885510890309498, "step": 3120 }, { "epoch": 1.0406937958639093, "loss": 1.7250030040740967, "step": 3120 }, { "ce_loss": 0.34814757108688354, "epoch": 1.0406937958639093, "step": 3120 }, { "distill_loss": 0.6008387207984924, "epoch": 1.0406937958639093, "step": 3120 }, { "epoch": 1.0406937958639093, "ref_ce_loss": 0.2179640829563141, "step": 3120 }, { "epoch": 1.0406937958639093, "loss": 1.2385932207107544, "step": 3120 }, { "ce_loss": 0.35398566722869873, "epoch": 1.0406937958639093, "step": 3120 }, { "distill_loss": 0.4646395146846771, "epoch": 1.0406937958639093, "step": 3120 }, { "epoch": 1.0406937958639093, "ref_ce_loss": 0.33056381344795227, "step": 3120 }, { "epoch": 1.0440293529019347, "loss": 1.3849, "step": 3130 }, { "epoch": 1.0440293529019347, "grad_norm": 3.487548589706421, "step": 3130 }, { "epoch": 1.0440293529019347, "learning_rate": 0.0007884482138411558, "step": 3130 }, { "epoch": 1.0440293529019347, "loss": 1.1850218772888184, "step": 3130 }, { "ce_loss": 0.3026364743709564, "epoch": 1.0440293529019347, "step": 3130 }, { "distill_loss": 0.4985314905643463, "epoch": 1.0440293529019347, "step": 3130 }, { "epoch": 1.0440293529019347, "ref_ce_loss": 0.1886860877275467, "step": 3130 }, { "epoch": 1.0440293529019347, "loss": 1.2199435234069824, "step": 3130 }, { "ce_loss": 0.34935158491134644, "epoch": 1.0440293529019347, "step": 3130 }, { "distill_loss": 0.5073997974395752, "epoch": 1.0440293529019347, "step": 3130 }, { "epoch": 1.0440293529019347, "ref_ce_loss": 0.28104153275489807, "step": 3130 }, { "epoch": 1.04736490993996, "loss": 1.2998, "step": 3140 }, { "epoch": 1.04736490993996, "grad_norm": 1.450993299484253, "step": 3140 }, { "epoch": 1.04736490993996, "learning_rate": 0.0007883448852904976, "step": 3140 }, { "epoch": 1.04736490993996, "loss": 1.0731884241104126, "step": 3140 }, { "ce_loss": 0.2625713348388672, "epoch": 1.04736490993996, "step": 3140 }, { "distill_loss": 0.40948688983917236, "epoch": 1.04736490993996, "step": 3140 }, { "epoch": 1.04736490993996, "ref_ce_loss": 0.20579855144023895, "step": 3140 }, { "epoch": 1.04736490993996, "loss": 1.4011085033416748, "step": 3140 }, { "ce_loss": 0.3683658540248871, "epoch": 1.04736490993996, "step": 3140 }, { "distill_loss": 0.3941497206687927, "epoch": 1.04736490993996, "step": 3140 }, { "epoch": 1.04736490993996, "ref_ce_loss": 0.2790229320526123, "step": 3140 }, { "epoch": 1.0507004669779854, "loss": 1.2559, "step": 3150 }, { "epoch": 1.0507004669779854, "grad_norm": 1.5150136947631836, "step": 3150 }, { "epoch": 1.0507004669779854, "learning_rate": 0.0007882411034995705, "step": 3150 }, { "epoch": 1.0507004669779854, "loss": 1.803234338760376, "step": 3150 }, { "ce_loss": 0.3541768193244934, "epoch": 1.0507004669779854, "step": 3150 }, { "distill_loss": 0.5089897513389587, "epoch": 1.0507004669779854, "step": 3150 }, { "epoch": 1.0507004669779854, "ref_ce_loss": 0.2962924540042877, "step": 3150 }, { "epoch": 1.0507004669779854, "loss": 1.4103766679763794, "step": 3150 }, { "ce_loss": 0.4498169720172882, "epoch": 1.0507004669779854, "step": 3150 }, { "distill_loss": 0.5849369168281555, "epoch": 1.0507004669779854, "step": 3150 }, { "epoch": 1.0507004669779854, "ref_ce_loss": 0.2743041217327118, "step": 3150 }, { "epoch": 1.0540360240160107, "loss": 1.3988, "step": 3160 }, { "epoch": 1.0540360240160107, "grad_norm": 1.3503497838974, "step": 3160 }, { "epoch": 1.0540360240160107, "learning_rate": 0.0007881368685894993, "step": 3160 }, { "epoch": 1.0540360240160107, "loss": 1.058760166168213, "step": 3160 }, { "ce_loss": 0.27203452587127686, "epoch": 1.0540360240160107, "step": 3160 }, { "distill_loss": 0.49636560678482056, "epoch": 1.0540360240160107, "step": 3160 }, { "epoch": 1.0540360240160107, "ref_ce_loss": 0.2184726893901825, "step": 3160 }, { "epoch": 1.0540360240160107, "loss": 1.3550480604171753, "step": 3160 }, { "ce_loss": 0.33902209997177124, "epoch": 1.0540360240160107, "step": 3160 }, { "distill_loss": 0.540988028049469, "epoch": 1.0540360240160107, "step": 3160 }, { "epoch": 1.0540360240160107, "ref_ce_loss": 0.2227366715669632, "step": 3160 }, { "epoch": 1.057371581054036, "loss": 1.3205, "step": 3170 }, { "epoch": 1.057371581054036, "grad_norm": 3.3052151203155518, "step": 3170 }, { "epoch": 1.057371581054036, "learning_rate": 0.0007880321806819372, "step": 3170 }, { "epoch": 1.057371581054036, "loss": 1.5313291549682617, "step": 3170 }, { "ce_loss": 0.2738790512084961, "epoch": 1.057371581054036, "step": 3170 }, { "distill_loss": 0.38085490465164185, "epoch": 1.057371581054036, "step": 3170 }, { "epoch": 1.057371581054036, "ref_ce_loss": 0.24050869047641754, "step": 3170 }, { "epoch": 1.057371581054036, "loss": 1.3720393180847168, "step": 3170 }, { "ce_loss": 0.3660949766635895, "epoch": 1.057371581054036, "step": 3170 }, { "distill_loss": 0.4232823848724365, "epoch": 1.057371581054036, "step": 3170 }, { "epoch": 1.057371581054036, "ref_ce_loss": 0.22027719020843506, "step": 3170 }, { "epoch": 1.0607071380920614, "loss": 1.2552, "step": 3180 }, { "epoch": 1.0607071380920614, "grad_norm": 1.9380136728286743, "step": 3180 }, { "epoch": 1.0607071380920614, "learning_rate": 0.0007879270398990663, "step": 3180 }, { "epoch": 1.0607071380920614, "loss": 1.4154558181762695, "step": 3180 }, { "ce_loss": 0.3620558977127075, "epoch": 1.0607071380920614, "step": 3180 }, { "distill_loss": 0.5165147185325623, "epoch": 1.0607071380920614, "step": 3180 }, { "epoch": 1.0607071380920614, "ref_ce_loss": 0.2523062825202942, "step": 3180 }, { "epoch": 1.0607071380920614, "loss": 1.2843544483184814, "step": 3180 }, { "ce_loss": 0.34891369938850403, "epoch": 1.0607071380920614, "step": 3180 }, { "distill_loss": 0.46376562118530273, "epoch": 1.0607071380920614, "step": 3180 }, { "epoch": 1.0607071380920614, "ref_ce_loss": 0.2205934226512909, "step": 3180 }, { "epoch": 1.0640426951300868, "loss": 1.2705, "step": 3190 }, { "epoch": 1.0640426951300868, "grad_norm": 2.306701421737671, "step": 3190 }, { "epoch": 1.0640426951300868, "learning_rate": 0.000787821446363597, "step": 3190 }, { "epoch": 1.0640426951300868, "loss": 1.8948969841003418, "step": 3190 }, { "ce_loss": 0.3148987293243408, "epoch": 1.0640426951300868, "step": 3190 }, { "distill_loss": 0.4278886318206787, "epoch": 1.0640426951300868, "step": 3190 }, { "epoch": 1.0640426951300868, "ref_ce_loss": 0.21655955910682678, "step": 3190 }, { "epoch": 1.0640426951300868, "loss": 1.5901086330413818, "step": 3190 }, { "ce_loss": 0.44442218542099, "epoch": 1.0640426951300868, "step": 3190 }, { "distill_loss": 0.5294621586799622, "epoch": 1.0640426951300868, "step": 3190 }, { "epoch": 1.0640426951300868, "ref_ce_loss": 0.23455531895160675, "step": 3190 }, { "epoch": 1.067378252168112, "loss": 1.2659, "step": 3200 }, { "epoch": 1.067378252168112, "grad_norm": 1.9085355997085571, "step": 3200 }, { "epoch": 1.067378252168112, "learning_rate": 0.0007877154001987686, "step": 3200 }, { "epoch": 1.067378252168112, "loss": 1.9646542072296143, "step": 3200 }, { "ce_loss": 0.3970075845718384, "epoch": 1.067378252168112, "step": 3200 }, { "distill_loss": 0.5007684826850891, "epoch": 1.067378252168112, "step": 3200 }, { "epoch": 1.067378252168112, "ref_ce_loss": 0.3607442378997803, "step": 3200 }, { "epoch": 1.067378252168112, "loss": 1.1343801021575928, "step": 3200 }, { "ce_loss": 0.30973246693611145, "epoch": 1.067378252168112, "step": 3200 }, { "distill_loss": 0.5078864693641663, "epoch": 1.067378252168112, "step": 3200 }, { "epoch": 1.067378252168112, "ref_ce_loss": 0.22439567744731903, "step": 3200 }, { "epoch": 1.0707138092061375, "loss": 1.2294, "step": 3210 }, { "epoch": 1.0707138092061375, "grad_norm": 2.184018850326538, "step": 3210 }, { "epoch": 1.0707138092061375, "learning_rate": 0.0007876089015283481, "step": 3210 }, { "epoch": 1.0707138092061375, "loss": 1.1813832521438599, "step": 3210 }, { "ce_loss": 0.30030760169029236, "epoch": 1.0707138092061375, "step": 3210 }, { "distill_loss": 0.4533817172050476, "epoch": 1.0707138092061375, "step": 3210 }, { "epoch": 1.0707138092061375, "ref_ce_loss": 0.21787992119789124, "step": 3210 }, { "epoch": 1.0707138092061375, "loss": 1.1257771253585815, "step": 3210 }, { "ce_loss": 0.3650830388069153, "epoch": 1.0707138092061375, "step": 3210 }, { "distill_loss": 0.43343934416770935, "epoch": 1.0707138092061375, "step": 3210 }, { "epoch": 1.0707138092061375, "ref_ce_loss": 0.24671237170696259, "step": 3210 }, { "epoch": 1.0740493662441628, "loss": 1.6693, "step": 3220 }, { "epoch": 1.0740493662441628, "grad_norm": 2.01936411857605, "step": 3220 }, { "epoch": 1.0740493662441628, "learning_rate": 0.0007875019504766312, "step": 3220 }, { "epoch": 1.0740493662441628, "loss": 1.2749767303466797, "step": 3220 }, { "ce_loss": 0.3731911778450012, "epoch": 1.0740493662441628, "step": 3220 }, { "distill_loss": 0.4127843677997589, "epoch": 1.0740493662441628, "step": 3220 }, { "epoch": 1.0740493662441628, "ref_ce_loss": 0.2641950845718384, "step": 3220 }, { "epoch": 1.0740493662441628, "loss": 1.7697055339813232, "step": 3220 }, { "ce_loss": 0.35934922099113464, "epoch": 1.0740493662441628, "step": 3220 }, { "distill_loss": 0.48314180970191956, "epoch": 1.0740493662441628, "step": 3220 }, { "epoch": 1.0740493662441628, "ref_ce_loss": 0.2707027196884155, "step": 3220 }, { "epoch": 1.0773849232821882, "loss": 1.3645, "step": 3230 }, { "epoch": 1.0773849232821882, "grad_norm": 1.6033449172973633, "step": 3230 }, { "epoch": 1.0773849232821882, "learning_rate": 0.0007873945471684412, "step": 3230 }, { "epoch": 1.0773849232821882, "loss": 1.1107524633407593, "step": 3230 }, { "ce_loss": 0.3217277228832245, "epoch": 1.0773849232821882, "step": 3230 }, { "distill_loss": 0.42127346992492676, "epoch": 1.0773849232821882, "step": 3230 }, { "epoch": 1.0773849232821882, "ref_ce_loss": 0.26517701148986816, "step": 3230 }, { "epoch": 1.0773849232821882, "loss": 1.2702422142028809, "step": 3230 }, { "ce_loss": 0.3804514408111572, "epoch": 1.0773849232821882, "step": 3230 }, { "distill_loss": 0.47932273149490356, "epoch": 1.0773849232821882, "step": 3230 }, { "epoch": 1.0773849232821882, "ref_ce_loss": 0.21051423251628876, "step": 3230 }, { "epoch": 1.0807204803202135, "loss": 1.1673, "step": 3240 }, { "epoch": 1.0807204803202135, "grad_norm": 1.7848643064498901, "step": 3240 }, { "epoch": 1.0807204803202135, "learning_rate": 0.0007872866917291293, "step": 3240 }, { "epoch": 1.0807204803202135, "loss": 1.1999231576919556, "step": 3240 }, { "ce_loss": 0.4094650447368622, "epoch": 1.0807204803202135, "step": 3240 }, { "distill_loss": 0.47257867455482483, "epoch": 1.0807204803202135, "step": 3240 }, { "epoch": 1.0807204803202135, "ref_ce_loss": 0.2415315955877304, "step": 3240 }, { "epoch": 1.0807204803202135, "loss": 1.1600497961044312, "step": 3240 }, { "ce_loss": 0.3341686427593231, "epoch": 1.0807204803202135, "step": 3240 }, { "distill_loss": 0.3184273838996887, "epoch": 1.0807204803202135, "step": 3240 }, { "epoch": 1.0807204803202135, "ref_ce_loss": 0.2142072468996048, "step": 3240 }, { "epoch": 1.0840560373582389, "loss": 1.1987, "step": 3250 }, { "epoch": 1.0840560373582389, "grad_norm": 1.5663069486618042, "step": 3250 }, { "epoch": 1.0840560373582389, "learning_rate": 0.0007871783842845741, "step": 3250 }, { "epoch": 1.0840560373582389, "loss": 1.2865146398544312, "step": 3250 }, { "ce_loss": 0.3988375663757324, "epoch": 1.0840560373582389, "step": 3250 }, { "distill_loss": 0.5756269693374634, "epoch": 1.0840560373582389, "step": 3250 }, { "epoch": 1.0840560373582389, "ref_ce_loss": 0.3118850588798523, "step": 3250 }, { "epoch": 1.0840560373582389, "loss": 2.4615437984466553, "step": 3250 }, { "ce_loss": 0.48707595467567444, "epoch": 1.0840560373582389, "step": 3250 }, { "distill_loss": 0.5528659820556641, "epoch": 1.0840560373582389, "step": 3250 }, { "epoch": 1.0840560373582389, "ref_ce_loss": 0.35930198431015015, "step": 3250 }, { "epoch": 1.0873915943962642, "loss": 1.2931, "step": 3260 }, { "epoch": 1.0873915943962642, "grad_norm": 1.8359395265579224, "step": 3260 }, { "epoch": 1.0873915943962642, "learning_rate": 0.0007870696249611827, "step": 3260 }, { "epoch": 1.0873915943962642, "loss": 1.341639518737793, "step": 3260 }, { "ce_loss": 0.3619093894958496, "epoch": 1.0873915943962642, "step": 3260 }, { "distill_loss": 0.5633329153060913, "epoch": 1.0873915943962642, "step": 3260 }, { "epoch": 1.0873915943962642, "ref_ce_loss": 0.22083061933517456, "step": 3260 }, { "epoch": 1.0873915943962642, "loss": 1.116979718208313, "step": 3260 }, { "ce_loss": 0.31899040937423706, "epoch": 1.0873915943962642, "step": 3260 }, { "distill_loss": 0.544400155544281, "epoch": 1.0873915943962642, "step": 3260 }, { "epoch": 1.0873915943962642, "ref_ce_loss": 0.25317904353141785, "step": 3260 }, { "epoch": 1.0907271514342896, "loss": 1.2166, "step": 3270 }, { "epoch": 1.0907271514342896, "grad_norm": 1.9730409383773804, "step": 3270 }, { "epoch": 1.0907271514342896, "learning_rate": 0.0007869604138858883, "step": 3270 }, { "epoch": 1.0907271514342896, "loss": 1.2893974781036377, "step": 3270 }, { "ce_loss": 0.39021432399749756, "epoch": 1.0907271514342896, "step": 3270 }, { "distill_loss": 0.6379407048225403, "epoch": 1.0907271514342896, "step": 3270 }, { "epoch": 1.0907271514342896, "ref_ce_loss": 0.26053154468536377, "step": 3270 }, { "epoch": 1.0907271514342896, "loss": 1.4087176322937012, "step": 3270 }, { "ce_loss": 0.38117286562919617, "epoch": 1.0907271514342896, "step": 3270 }, { "distill_loss": 0.5666294097900391, "epoch": 1.0907271514342896, "step": 3270 }, { "epoch": 1.0907271514342896, "ref_ce_loss": 0.24387907981872559, "step": 3270 }, { "epoch": 1.094062708472315, "loss": 1.2378, "step": 3280 }, { "epoch": 1.094062708472315, "grad_norm": 1.3416624069213867, "step": 3280 }, { "epoch": 1.094062708472315, "learning_rate": 0.0007868507511861523, "step": 3280 }, { "epoch": 1.094062708472315, "loss": 1.1847916841506958, "step": 3280 }, { "ce_loss": 0.34078463912010193, "epoch": 1.094062708472315, "step": 3280 }, { "distill_loss": 0.5022830963134766, "epoch": 1.094062708472315, "step": 3280 }, { "epoch": 1.094062708472315, "ref_ce_loss": 0.25572776794433594, "step": 3280 }, { "epoch": 1.094062708472315, "loss": 1.2643036842346191, "step": 3280 }, { "ce_loss": 0.30773019790649414, "epoch": 1.094062708472315, "step": 3280 }, { "distill_loss": 0.49978795647621155, "epoch": 1.094062708472315, "step": 3280 }, { "epoch": 1.094062708472315, "ref_ce_loss": 0.23188060522079468, "step": 3280 }, { "epoch": 1.0973982655103403, "loss": 1.3057, "step": 3290 }, { "epoch": 1.0973982655103403, "grad_norm": 1.837242841720581, "step": 3290 }, { "epoch": 1.0973982655103403, "learning_rate": 0.0007867406369899628, "step": 3290 }, { "epoch": 1.0973982655103403, "loss": 1.1789145469665527, "step": 3290 }, { "ce_loss": 0.3441447615623474, "epoch": 1.0973982655103403, "step": 3290 }, { "distill_loss": 0.5966684818267822, "epoch": 1.0973982655103403, "step": 3290 }, { "epoch": 1.0973982655103403, "ref_ce_loss": 0.23769469559192657, "step": 3290 }, { "epoch": 1.0973982655103403, "loss": 1.3219236135482788, "step": 3290 }, { "ce_loss": 0.4166383743286133, "epoch": 1.0973982655103403, "step": 3290 }, { "distill_loss": 0.5886305570602417, "epoch": 1.0973982655103403, "step": 3290 }, { "epoch": 1.0973982655103403, "ref_ce_loss": 0.31640011072158813, "step": 3290 }, { "epoch": 1.1007338225483656, "loss": 1.2599, "step": 3300 }, { "epoch": 1.1007338225483656, "grad_norm": 1.8349952697753906, "step": 3300 }, { "epoch": 1.1007338225483656, "learning_rate": 0.0007866300714258349, "step": 3300 }, { "epoch": 1.1007338225483656, "loss": 1.3195645809173584, "step": 3300 }, { "ce_loss": 0.3108004033565521, "epoch": 1.1007338225483656, "step": 3300 }, { "distill_loss": 0.4452487528324127, "epoch": 1.1007338225483656, "step": 3300 }, { "epoch": 1.1007338225483656, "ref_ce_loss": 0.24651941657066345, "step": 3300 }, { "epoch": 1.1007338225483656, "loss": 1.1526429653167725, "step": 3300 }, { "ce_loss": 0.3125385642051697, "epoch": 1.1007338225483656, "step": 3300 }, { "distill_loss": 0.45304566621780396, "epoch": 1.1007338225483656, "step": 3300 }, { "epoch": 1.1007338225483656, "ref_ce_loss": 0.20057500898838043, "step": 3300 }, { "epoch": 1.104069379586391, "loss": 1.3455, "step": 3310 }, { "epoch": 1.104069379586391, "grad_norm": 2.6072306632995605, "step": 3310 }, { "epoch": 1.104069379586391, "learning_rate": 0.0007865190546228107, "step": 3310 }, { "epoch": 1.104069379586391, "loss": 1.2560369968414307, "step": 3310 }, { "ce_loss": 0.37156903743743896, "epoch": 1.104069379586391, "step": 3310 }, { "distill_loss": 0.5403450131416321, "epoch": 1.104069379586391, "step": 3310 }, { "epoch": 1.104069379586391, "ref_ce_loss": 0.26321572065353394, "step": 3310 }, { "epoch": 1.104069379586391, "loss": 1.3976691961288452, "step": 3310 }, { "ce_loss": 0.40832340717315674, "epoch": 1.104069379586391, "step": 3310 }, { "distill_loss": 0.5309417247772217, "epoch": 1.104069379586391, "step": 3310 }, { "epoch": 1.104069379586391, "ref_ce_loss": 0.28758132457733154, "step": 3310 }, { "epoch": 1.1074049366244163, "loss": 1.4654, "step": 3320 }, { "epoch": 1.1074049366244163, "grad_norm": 1.5962499380111694, "step": 3320 }, { "epoch": 1.1074049366244163, "learning_rate": 0.0007864075867104584, "step": 3320 }, { "epoch": 1.1074049366244163, "loss": 1.1697372198104858, "step": 3320 }, { "ce_loss": 0.3503996431827545, "epoch": 1.1074049366244163, "step": 3320 }, { "distill_loss": 0.5424225926399231, "epoch": 1.1074049366244163, "step": 3320 }, { "epoch": 1.1074049366244163, "ref_ce_loss": 0.27611243724823, "step": 3320 }, { "epoch": 1.1074049366244163, "loss": 1.1543195247650146, "step": 3320 }, { "ce_loss": 0.3605566620826721, "epoch": 1.1074049366244163, "step": 3320 }, { "distill_loss": 0.5575127601623535, "epoch": 1.1074049366244163, "step": 3320 }, { "epoch": 1.1074049366244163, "ref_ce_loss": 0.23536986112594604, "step": 3320 }, { "epoch": 1.1107404936624417, "loss": 1.1974, "step": 3330 }, { "epoch": 1.1107404936624417, "grad_norm": 1.4378269910812378, "step": 3330 }, { "epoch": 1.1107404936624417, "learning_rate": 0.0007862956678188732, "step": 3330 }, { "epoch": 1.1107404936624417, "loss": 2.0673465728759766, "step": 3330 }, { "ce_loss": 0.29359567165374756, "epoch": 1.1107404936624417, "step": 3330 }, { "distill_loss": 0.44125255942344666, "epoch": 1.1107404936624417, "step": 3330 }, { "epoch": 1.1107404936624417, "ref_ce_loss": 0.2021653950214386, "step": 3330 }, { "epoch": 1.1107404936624417, "loss": 1.4126696586608887, "step": 3330 }, { "ce_loss": 0.4507080018520355, "epoch": 1.1107404936624417, "step": 3330 }, { "distill_loss": 0.5015628337860107, "epoch": 1.1107404936624417, "step": 3330 }, { "epoch": 1.1107404936624417, "ref_ce_loss": 0.2881172001361847, "step": 3330 }, { "epoch": 1.114076050700467, "loss": 1.3471, "step": 3340 }, { "epoch": 1.114076050700467, "grad_norm": 1.4997214078903198, "step": 3340 }, { "epoch": 1.114076050700467, "learning_rate": 0.0007861832980786765, "step": 3340 }, { "epoch": 1.114076050700467, "loss": 1.090973973274231, "step": 3340 }, { "ce_loss": 0.3212396204471588, "epoch": 1.114076050700467, "step": 3340 }, { "distill_loss": 0.5364630222320557, "epoch": 1.114076050700467, "step": 3340 }, { "epoch": 1.114076050700467, "ref_ce_loss": 0.23215319216251373, "step": 3340 }, { "epoch": 1.114076050700467, "loss": 1.3080556392669678, "step": 3340 }, { "ce_loss": 0.37442025542259216, "epoch": 1.114076050700467, "step": 3340 }, { "distill_loss": 0.6098906397819519, "epoch": 1.114076050700467, "step": 3340 }, { "epoch": 1.114076050700467, "ref_ce_loss": 0.2560647130012512, "step": 3340 }, { "epoch": 1.1174116077384924, "loss": 1.2975, "step": 3350 }, { "epoch": 1.1174116077384924, "grad_norm": 1.920807957649231, "step": 3350 }, { "epoch": 1.1174116077384924, "learning_rate": 0.0007860704776210161, "step": 3350 }, { "epoch": 1.1174116077384924, "loss": 1.2911982536315918, "step": 3350 }, { "ce_loss": 0.3540288805961609, "epoch": 1.1174116077384924, "step": 3350 }, { "distill_loss": 0.5088047981262207, "epoch": 1.1174116077384924, "step": 3350 }, { "epoch": 1.1174116077384924, "ref_ce_loss": 0.2307043820619583, "step": 3350 }, { "epoch": 1.1174116077384924, "loss": 1.1055140495300293, "step": 3350 }, { "ce_loss": 0.3683701455593109, "epoch": 1.1174116077384924, "step": 3350 }, { "distill_loss": 0.45363694429397583, "epoch": 1.1174116077384924, "step": 3350 }, { "epoch": 1.1174116077384924, "ref_ce_loss": 0.20422856509685516, "step": 3350 }, { "epoch": 1.1207471647765177, "loss": 1.3233, "step": 3360 }, { "epoch": 1.1207471647765177, "grad_norm": 1.5294970273971558, "step": 3360 }, { "epoch": 1.1207471647765177, "learning_rate": 0.0007859572065775654, "step": 3360 }, { "epoch": 1.1207471647765177, "loss": 0.9781809449195862, "step": 3360 }, { "ce_loss": 0.26756465435028076, "epoch": 1.1207471647765177, "step": 3360 }, { "distill_loss": 0.468820184469223, "epoch": 1.1207471647765177, "step": 3360 }, { "epoch": 1.1207471647765177, "ref_ce_loss": 0.2416294515132904, "step": 3360 }, { "epoch": 1.1207471647765177, "loss": 1.2498588562011719, "step": 3360 }, { "ce_loss": 0.3570075035095215, "epoch": 1.1207471647765177, "step": 3360 }, { "distill_loss": 0.5418039560317993, "epoch": 1.1207471647765177, "step": 3360 }, { "epoch": 1.1207471647765177, "ref_ce_loss": 0.19507354497909546, "step": 3360 }, { "epoch": 1.124082721814543, "loss": 1.3759, "step": 3370 }, { "epoch": 1.124082721814543, "grad_norm": 3.4274094104766846, "step": 3370 }, { "epoch": 1.124082721814543, "learning_rate": 0.0007858434850805238, "step": 3370 }, { "epoch": 1.124082721814543, "loss": 1.7688602209091187, "step": 3370 }, { "ce_loss": 0.37512704730033875, "epoch": 1.124082721814543, "step": 3370 }, { "distill_loss": 0.7253754734992981, "epoch": 1.124082721814543, "step": 3370 }, { "epoch": 1.124082721814543, "ref_ce_loss": 0.2692413926124573, "step": 3370 }, { "epoch": 1.124082721814543, "loss": 1.830051302909851, "step": 3370 }, { "ce_loss": 0.36339071393013, "epoch": 1.124082721814543, "step": 3370 }, { "distill_loss": 0.6195383667945862, "epoch": 1.124082721814543, "step": 3370 }, { "epoch": 1.124082721814543, "ref_ce_loss": 0.23754560947418213, "step": 3370 }, { "epoch": 1.1274182788525684, "loss": 1.4666, "step": 3380 }, { "epoch": 1.1274182788525684, "grad_norm": 2.594028949737549, "step": 3380 }, { "epoch": 1.1274182788525684, "learning_rate": 0.0007857293132626166, "step": 3380 }, { "epoch": 1.1274182788525684, "loss": 1.2381805181503296, "step": 3380 }, { "ce_loss": 0.3283141255378723, "epoch": 1.1274182788525684, "step": 3380 }, { "distill_loss": 0.4507133662700653, "epoch": 1.1274182788525684, "step": 3380 }, { "epoch": 1.1274182788525684, "ref_ce_loss": 0.2803668677806854, "step": 3380 }, { "epoch": 1.1274182788525684, "loss": 0.9831138253211975, "step": 3380 }, { "ce_loss": 0.35459038615226746, "epoch": 1.1274182788525684, "step": 3380 }, { "distill_loss": 0.37437888979911804, "epoch": 1.1274182788525684, "step": 3380 }, { "epoch": 1.1274182788525684, "ref_ce_loss": 0.25287047028541565, "step": 3380 }, { "epoch": 1.1307538358905938, "loss": 1.1624, "step": 3390 }, { "epoch": 1.1307538358905938, "grad_norm": 1.9503041505813599, "step": 3390 }, { "epoch": 1.1307538358905938, "learning_rate": 0.0007856146912570947, "step": 3390 }, { "epoch": 1.1307538358905938, "loss": 1.2150638103485107, "step": 3390 }, { "ce_loss": 0.4028579592704773, "epoch": 1.1307538358905938, "step": 3390 }, { "distill_loss": 0.5170663595199585, "epoch": 1.1307538358905938, "step": 3390 }, { "epoch": 1.1307538358905938, "ref_ce_loss": 0.2950234115123749, "step": 3390 }, { "epoch": 1.1307538358905938, "loss": 1.2264457941055298, "step": 3390 }, { "ce_loss": 0.36099863052368164, "epoch": 1.1307538358905938, "step": 3390 }, { "distill_loss": 0.5292174816131592, "epoch": 1.1307538358905938, "step": 3390 }, { "epoch": 1.1307538358905938, "ref_ce_loss": 0.24654163420200348, "step": 3390 }, { "epoch": 1.134089392928619, "loss": 1.2931, "step": 3400 }, { "epoch": 1.134089392928619, "grad_norm": 1.8613334894180298, "step": 3400 }, { "epoch": 1.134089392928619, "learning_rate": 0.0007854996191977343, "step": 3400 }, { "epoch": 1.134089392928619, "loss": 1.1026091575622559, "step": 3400 }, { "ce_loss": 0.33049890398979187, "epoch": 1.134089392928619, "step": 3400 }, { "distill_loss": 0.44255223870277405, "epoch": 1.134089392928619, "step": 3400 }, { "epoch": 1.134089392928619, "ref_ce_loss": 0.16969634592533112, "step": 3400 }, { "epoch": 1.134089392928619, "loss": 1.1961816549301147, "step": 3400 }, { "ce_loss": 0.38204365968704224, "epoch": 1.134089392928619, "step": 3400 }, { "distill_loss": 0.5366348624229431, "epoch": 1.134089392928619, "step": 3400 }, { "epoch": 1.134089392928619, "ref_ce_loss": 0.211135596036911, "step": 3400 }, { "epoch": 1.1374249499666444, "loss": 1.2634, "step": 3410 }, { "epoch": 1.1374249499666444, "grad_norm": 1.833163857460022, "step": 3410 }, { "epoch": 1.1374249499666444, "learning_rate": 0.0007853840972188367, "step": 3410 }, { "epoch": 1.1374249499666444, "loss": 1.0167384147644043, "step": 3410 }, { "ce_loss": 0.2936549186706543, "epoch": 1.1374249499666444, "step": 3410 }, { "distill_loss": 0.48618417978286743, "epoch": 1.1374249499666444, "step": 3410 }, { "epoch": 1.1374249499666444, "ref_ce_loss": 0.23391129076480865, "step": 3410 }, { "epoch": 1.1374249499666444, "loss": 1.0690950155258179, "step": 3410 }, { "ce_loss": 0.28655749559402466, "epoch": 1.1374249499666444, "step": 3410 }, { "distill_loss": 0.4551718831062317, "epoch": 1.1374249499666444, "step": 3410 }, { "epoch": 1.1374249499666444, "ref_ce_loss": 0.22481927275657654, "step": 3410 }, { "epoch": 1.1407605070046698, "loss": 1.2891, "step": 3420 }, { "epoch": 1.1407605070046698, "grad_norm": 1.7783701419830322, "step": 3420 }, { "epoch": 1.1407605070046698, "learning_rate": 0.0007852681254552286, "step": 3420 }, { "epoch": 1.1407605070046698, "loss": 1.2163112163543701, "step": 3420 }, { "ce_loss": 0.36884644627571106, "epoch": 1.1407605070046698, "step": 3420 }, { "distill_loss": 0.4819245934486389, "epoch": 1.1407605070046698, "step": 3420 }, { "epoch": 1.1407605070046698, "ref_ce_loss": 0.2516787648200989, "step": 3420 }, { "epoch": 1.1407605070046698, "loss": 1.3230206966400146, "step": 3420 }, { "ce_loss": 0.36667966842651367, "epoch": 1.1407605070046698, "step": 3420 }, { "distill_loss": 0.47532573342323303, "epoch": 1.1407605070046698, "step": 3420 }, { "epoch": 1.1407605070046698, "ref_ce_loss": 0.23686495423316956, "step": 3420 }, { "epoch": 1.1440960640426951, "loss": 1.3354, "step": 3430 }, { "epoch": 1.1440960640426951, "grad_norm": 1.9137823581695557, "step": 3430 }, { "epoch": 1.1440960640426951, "learning_rate": 0.0007851517040422617, "step": 3430 }, { "epoch": 1.1440960640426951, "loss": 1.2071892023086548, "step": 3430 }, { "ce_loss": 0.35587406158447266, "epoch": 1.1440960640426951, "step": 3430 }, { "distill_loss": 0.42205366492271423, "epoch": 1.1440960640426951, "step": 3430 }, { "epoch": 1.1440960640426951, "ref_ce_loss": 0.2519875466823578, "step": 3430 }, { "epoch": 1.1440960640426951, "loss": 1.076033592224121, "step": 3430 }, { "ce_loss": 0.27833545207977295, "epoch": 1.1440960640426951, "step": 3430 }, { "distill_loss": 0.49027466773986816, "epoch": 1.1440960640426951, "step": 3430 }, { "epoch": 1.1440960640426951, "ref_ce_loss": 0.22420674562454224, "step": 3430 }, { "epoch": 1.1474316210807205, "loss": 1.2992, "step": 3440 }, { "epoch": 1.1474316210807205, "grad_norm": 1.7274442911148071, "step": 3440 }, { "epoch": 1.1474316210807205, "learning_rate": 0.0007850348331158119, "step": 3440 }, { "epoch": 1.1474316210807205, "loss": 1.2800071239471436, "step": 3440 }, { "ce_loss": 0.4170500338077545, "epoch": 1.1474316210807205, "step": 3440 }, { "distill_loss": 0.47338342666625977, "epoch": 1.1474316210807205, "step": 3440 }, { "epoch": 1.1474316210807205, "ref_ce_loss": 0.30712056159973145, "step": 3440 }, { "epoch": 1.1474316210807205, "loss": 1.2456332445144653, "step": 3440 }, { "ce_loss": 0.3563118875026703, "epoch": 1.1474316210807205, "step": 3440 }, { "distill_loss": 0.36735472083091736, "epoch": 1.1474316210807205, "step": 3440 }, { "epoch": 1.1474316210807205, "ref_ce_loss": 0.25647401809692383, "step": 3440 }, { "epoch": 1.1507671781187458, "loss": 1.2522, "step": 3450 }, { "epoch": 1.1507671781187458, "grad_norm": 1.9632582664489746, "step": 3450 }, { "epoch": 1.1507671781187458, "learning_rate": 0.0007849175128122806, "step": 3450 }, { "epoch": 1.1507671781187458, "loss": 1.0870206356048584, "step": 3450 }, { "ce_loss": 0.2861756384372711, "epoch": 1.1507671781187458, "step": 3450 }, { "distill_loss": 0.5420786738395691, "epoch": 1.1507671781187458, "step": 3450 }, { "epoch": 1.1507671781187458, "ref_ce_loss": 0.2082960158586502, "step": 3450 }, { "epoch": 1.1507671781187458, "loss": 1.270479679107666, "step": 3450 }, { "ce_loss": 0.4658251404762268, "epoch": 1.1507671781187458, "step": 3450 }, { "distill_loss": 0.5474481582641602, "epoch": 1.1507671781187458, "step": 3450 }, { "epoch": 1.1507671781187458, "ref_ce_loss": 0.25603190064430237, "step": 3450 }, { "epoch": 1.1541027351567712, "loss": 1.3792, "step": 3460 }, { "epoch": 1.1541027351567712, "grad_norm": 1.5725535154342651, "step": 3460 }, { "epoch": 1.1541027351567712, "learning_rate": 0.0007847997432685929, "step": 3460 }, { "epoch": 1.1541027351567712, "loss": 1.2822141647338867, "step": 3460 }, { "ce_loss": 0.3111097812652588, "epoch": 1.1541027351567712, "step": 3460 }, { "distill_loss": 0.4790066182613373, "epoch": 1.1541027351567712, "step": 3460 }, { "epoch": 1.1541027351567712, "ref_ce_loss": 0.27783533930778503, "step": 3460 }, { "epoch": 1.1541027351567712, "loss": 2.144399404525757, "step": 3460 }, { "ce_loss": 0.35182690620422363, "epoch": 1.1541027351567712, "step": 3460 }, { "distill_loss": 0.5352278351783752, "epoch": 1.1541027351567712, "step": 3460 }, { "epoch": 1.1541027351567712, "ref_ce_loss": 0.24378477036952972, "step": 3460 }, { "epoch": 1.1574382921947965, "loss": 1.2176, "step": 3470 }, { "epoch": 1.1574382921947965, "grad_norm": 2.390889883041382, "step": 3470 }, { "epoch": 1.1574382921947965, "learning_rate": 0.0007846815246221986, "step": 3470 }, { "epoch": 1.1574382921947965, "loss": 1.5885863304138184, "step": 3470 }, { "ce_loss": 0.47593382000923157, "epoch": 1.1574382921947965, "step": 3470 }, { "distill_loss": 0.702208936214447, "epoch": 1.1574382921947965, "step": 3470 }, { "epoch": 1.1574382921947965, "ref_ce_loss": 0.31476113200187683, "step": 3470 }, { "epoch": 1.1574382921947965, "loss": 1.1008955240249634, "step": 3470 }, { "ce_loss": 0.32944154739379883, "epoch": 1.1574382921947965, "step": 3470 }, { "distill_loss": 0.4527980387210846, "epoch": 1.1574382921947965, "step": 3470 }, { "epoch": 1.1574382921947965, "ref_ce_loss": 0.23255252838134766, "step": 3470 }, { "epoch": 1.160773849232822, "loss": 1.2574, "step": 3480 }, { "epoch": 1.160773849232822, "grad_norm": 2.0811338424682617, "step": 3480 }, { "epoch": 1.160773849232822, "learning_rate": 0.0007845628570110716, "step": 3480 }, { "epoch": 1.160773849232822, "loss": 1.130537748336792, "step": 3480 }, { "ce_loss": 0.3342140316963196, "epoch": 1.160773849232822, "step": 3480 }, { "distill_loss": 0.508356511592865, "epoch": 1.160773849232822, "step": 3480 }, { "epoch": 1.160773849232822, "ref_ce_loss": 0.20144306123256683, "step": 3480 }, { "epoch": 1.160773849232822, "loss": 1.4564712047576904, "step": 3480 }, { "ce_loss": 0.4794396460056305, "epoch": 1.160773849232822, "step": 3480 }, { "distill_loss": 0.5904377698898315, "epoch": 1.160773849232822, "step": 3480 }, { "epoch": 1.160773849232822, "ref_ce_loss": 0.3139699697494507, "step": 3480 }, { "epoch": 1.1641094062708472, "loss": 1.2398, "step": 3490 }, { "epoch": 1.1641094062708472, "grad_norm": 2.8266541957855225, "step": 3490 }, { "epoch": 1.1641094062708472, "learning_rate": 0.00078444374057371, "step": 3490 }, { "epoch": 1.1641094062708472, "loss": 2.1026384830474854, "step": 3490 }, { "ce_loss": 0.2914462983608246, "epoch": 1.1641094062708472, "step": 3490 }, { "distill_loss": 0.45004281401634216, "epoch": 1.1641094062708472, "step": 3490 }, { "epoch": 1.1641094062708472, "ref_ce_loss": 0.17569176852703094, "step": 3490 }, { "epoch": 1.1641094062708472, "loss": 1.6855539083480835, "step": 3490 }, { "ce_loss": 0.35392481088638306, "epoch": 1.1641094062708472, "step": 3490 }, { "distill_loss": 0.5410393476486206, "epoch": 1.1641094062708472, "step": 3490 }, { "epoch": 1.1641094062708472, "ref_ce_loss": 0.20788775384426117, "step": 3490 }, { "epoch": 1.1674449633088726, "loss": 1.3848, "step": 3500 }, { "epoch": 1.1674449633088726, "grad_norm": 2.1698193550109863, "step": 3500 }, { "epoch": 1.1674449633088726, "learning_rate": 0.0007843241754491351, "step": 3500 }, { "epoch": 1.1674449633088726, "loss": 1.4109277725219727, "step": 3500 }, { "ce_loss": 0.4122583270072937, "epoch": 1.1674449633088726, "step": 3500 }, { "distill_loss": 0.6634799242019653, "epoch": 1.1674449633088726, "step": 3500 }, { "epoch": 1.1674449633088726, "ref_ce_loss": 0.26556360721588135, "step": 3500 }, { "epoch": 1.1674449633088726, "loss": 2.186739683151245, "step": 3500 }, { "ce_loss": 0.35684934258461, "epoch": 1.1674449633088726, "step": 3500 }, { "distill_loss": 0.6196172833442688, "epoch": 1.1674449633088726, "step": 3500 }, { "epoch": 1.1674449633088726, "ref_ce_loss": 0.27541086077690125, "step": 3500 }, { "epoch": 1.170780520346898, "loss": 1.4175, "step": 3510 }, { "epoch": 1.170780520346898, "grad_norm": 2.025646209716797, "step": 3510 }, { "epoch": 1.170780520346898, "learning_rate": 0.0007842041617768926, "step": 3510 }, { "epoch": 1.170780520346898, "loss": 1.4531854391098022, "step": 3510 }, { "ce_loss": 0.36734527349472046, "epoch": 1.170780520346898, "step": 3510 }, { "distill_loss": 0.6254667639732361, "epoch": 1.170780520346898, "step": 3510 }, { "epoch": 1.170780520346898, "ref_ce_loss": 0.23278909921646118, "step": 3510 }, { "epoch": 1.170780520346898, "loss": 1.8889970779418945, "step": 3510 }, { "ce_loss": 0.3474090099334717, "epoch": 1.170780520346898, "step": 3510 }, { "distill_loss": 0.5200473666191101, "epoch": 1.170780520346898, "step": 3510 }, { "epoch": 1.170780520346898, "ref_ce_loss": 0.2619593143463135, "step": 3510 }, { "epoch": 1.1741160773849233, "loss": 1.2872, "step": 3520 }, { "epoch": 1.1741160773849233, "grad_norm": 1.905614972114563, "step": 3520 }, { "epoch": 1.1741160773849233, "learning_rate": 0.0007840836996970511, "step": 3520 }, { "epoch": 1.1741160773849233, "loss": 1.0105056762695312, "step": 3520 }, { "ce_loss": 0.29185229539871216, "epoch": 1.1741160773849233, "step": 3520 }, { "distill_loss": 0.4211788773536682, "epoch": 1.1741160773849233, "step": 3520 }, { "epoch": 1.1741160773849233, "ref_ce_loss": 0.21574154496192932, "step": 3520 }, { "epoch": 1.1741160773849233, "loss": 1.386340856552124, "step": 3520 }, { "ce_loss": 0.3471980392932892, "epoch": 1.1741160773849233, "step": 3520 }, { "distill_loss": 0.4527163803577423, "epoch": 1.1741160773849233, "step": 3520 }, { "epoch": 1.1741160773849233, "ref_ce_loss": 0.23550935089588165, "step": 3520 }, { "epoch": 1.1774516344229486, "loss": 1.2977, "step": 3530 }, { "epoch": 1.1774516344229486, "grad_norm": 2.2812416553497314, "step": 3530 }, { "epoch": 1.1774516344229486, "learning_rate": 0.0007839627893502031, "step": 3530 }, { "epoch": 1.1774516344229486, "loss": 1.3279774188995361, "step": 3530 }, { "ce_loss": 0.37937167286872864, "epoch": 1.1774516344229486, "step": 3530 }, { "distill_loss": 0.5612952709197998, "epoch": 1.1774516344229486, "step": 3530 }, { "epoch": 1.1774516344229486, "ref_ce_loss": 0.24463312327861786, "step": 3530 }, { "epoch": 1.1774516344229486, "loss": 1.4569265842437744, "step": 3530 }, { "ce_loss": 0.3426618278026581, "epoch": 1.1774516344229486, "step": 3530 }, { "distill_loss": 0.5634005665779114, "epoch": 1.1774516344229486, "step": 3530 }, { "epoch": 1.1774516344229486, "ref_ce_loss": 0.26913848519325256, "step": 3530 }, { "epoch": 1.180787191460974, "loss": 1.2552, "step": 3540 }, { "epoch": 1.180787191460974, "grad_norm": 1.6602412462234497, "step": 3540 }, { "epoch": 1.180787191460974, "learning_rate": 0.0007838414308774637, "step": 3540 }, { "epoch": 1.180787191460974, "loss": 1.6292023658752441, "step": 3540 }, { "ce_loss": 0.4299602806568146, "epoch": 1.180787191460974, "step": 3540 }, { "distill_loss": 0.5841928124427795, "epoch": 1.180787191460974, "step": 3540 }, { "epoch": 1.180787191460974, "ref_ce_loss": 0.2711060345172882, "step": 3540 }, { "epoch": 1.180787191460974, "loss": 1.1693357229232788, "step": 3540 }, { "ce_loss": 0.3660375475883484, "epoch": 1.180787191460974, "step": 3540 }, { "distill_loss": 0.5180224180221558, "epoch": 1.180787191460974, "step": 3540 }, { "epoch": 1.180787191460974, "ref_ce_loss": 0.2825009226799011, "step": 3540 }, { "epoch": 1.1841227484989993, "loss": 1.2843, "step": 3550 }, { "epoch": 1.1841227484989993, "grad_norm": 2.6830334663391113, "step": 3550 }, { "epoch": 1.1841227484989993, "learning_rate": 0.0007837196244204714, "step": 3550 }, { "epoch": 1.1841227484989993, "loss": 1.256629467010498, "step": 3550 }, { "ce_loss": 0.37410256266593933, "epoch": 1.1841227484989993, "step": 3550 }, { "distill_loss": 0.47845524549484253, "epoch": 1.1841227484989993, "step": 3550 }, { "epoch": 1.1841227484989993, "ref_ce_loss": 0.29637405276298523, "step": 3550 }, { "epoch": 1.1841227484989993, "loss": 1.1663440465927124, "step": 3550 }, { "ce_loss": 0.33838000893592834, "epoch": 1.1841227484989993, "step": 3550 }, { "distill_loss": 0.45741572976112366, "epoch": 1.1841227484989993, "step": 3550 }, { "epoch": 1.1841227484989993, "ref_ce_loss": 0.23389117419719696, "step": 3550 }, { "epoch": 1.1874583055370247, "loss": 1.3448, "step": 3560 }, { "epoch": 1.1874583055370247, "grad_norm": 1.710472583770752, "step": 3560 }, { "epoch": 1.1874583055370247, "learning_rate": 0.0007835973701213874, "step": 3560 }, { "epoch": 1.1874583055370247, "loss": 1.1803218126296997, "step": 3560 }, { "ce_loss": 0.3162272274494171, "epoch": 1.1874583055370247, "step": 3560 }, { "distill_loss": 0.4722076952457428, "epoch": 1.1874583055370247, "step": 3560 }, { "epoch": 1.1874583055370247, "ref_ce_loss": 0.21254397928714752, "step": 3560 }, { "epoch": 1.1874583055370247, "loss": 1.2828024625778198, "step": 3560 }, { "ce_loss": 0.37789955735206604, "epoch": 1.1874583055370247, "step": 3560 }, { "distill_loss": 0.5199463963508606, "epoch": 1.1874583055370247, "step": 3560 }, { "epoch": 1.1874583055370247, "ref_ce_loss": 0.31490829586982727, "step": 3560 }, { "epoch": 1.19079386257505, "loss": 1.3718, "step": 3570 }, { "epoch": 1.19079386257505, "grad_norm": 1.7363239526748657, "step": 3570 }, { "epoch": 1.19079386257505, "learning_rate": 0.0007834746681228959, "step": 3570 }, { "epoch": 1.19079386257505, "loss": 1.1682206392288208, "step": 3570 }, { "ce_loss": 0.39901384711265564, "epoch": 1.19079386257505, "step": 3570 }, { "distill_loss": 0.410632848739624, "epoch": 1.19079386257505, "step": 3570 }, { "epoch": 1.19079386257505, "ref_ce_loss": 0.2826874852180481, "step": 3570 }, { "epoch": 1.19079386257505, "loss": 1.0605826377868652, "step": 3570 }, { "ce_loss": 0.3148486912250519, "epoch": 1.19079386257505, "step": 3570 }, { "distill_loss": 0.368727445602417, "epoch": 1.19079386257505, "step": 3570 }, { "epoch": 1.19079386257505, "ref_ce_loss": 0.19040748476982117, "step": 3570 }, { "epoch": 1.1941294196130754, "loss": 1.1801, "step": 3580 }, { "epoch": 1.1941294196130754, "grad_norm": 1.7157187461853027, "step": 3580 }, { "epoch": 1.1941294196130754, "learning_rate": 0.000783351518568203, "step": 3580 }, { "epoch": 1.1941294196130754, "loss": 1.5664011240005493, "step": 3580 }, { "ce_loss": 0.3691982328891754, "epoch": 1.1941294196130754, "step": 3580 }, { "distill_loss": 0.5537074208259583, "epoch": 1.1941294196130754, "step": 3580 }, { "epoch": 1.1941294196130754, "ref_ce_loss": 0.2542349398136139, "step": 3580 }, { "epoch": 1.1941294196130754, "loss": 1.4214699268341064, "step": 3580 }, { "ce_loss": 0.3670927882194519, "epoch": 1.1941294196130754, "step": 3580 }, { "distill_loss": 0.536470890045166, "epoch": 1.1941294196130754, "step": 3580 }, { "epoch": 1.1941294196130754, "ref_ce_loss": 0.259833961725235, "step": 3580 }, { "epoch": 1.1974649766511007, "loss": 1.3455, "step": 3590 }, { "epoch": 1.1974649766511007, "grad_norm": 2.5788187980651855, "step": 3590 }, { "epoch": 1.1974649766511007, "learning_rate": 0.0007832279216010375, "step": 3590 }, { "epoch": 1.1974649766511007, "loss": 1.2196593284606934, "step": 3590 }, { "ce_loss": 0.4030509889125824, "epoch": 1.1974649766511007, "step": 3590 }, { "distill_loss": 0.5046036243438721, "epoch": 1.1974649766511007, "step": 3590 }, { "epoch": 1.1974649766511007, "ref_ce_loss": 0.3118016719818115, "step": 3590 }, { "epoch": 1.1974649766511007, "loss": 1.4102113246917725, "step": 3590 }, { "ce_loss": 0.39648792147636414, "epoch": 1.1974649766511007, "step": 3590 }, { "distill_loss": 0.589056134223938, "epoch": 1.1974649766511007, "step": 3590 }, { "epoch": 1.1974649766511007, "ref_ce_loss": 0.22151829302310944, "step": 3590 }, { "epoch": 1.200800533689126, "loss": 1.2565, "step": 3600 }, { "epoch": 1.200800533689126, "grad_norm": 2.462405204772949, "step": 3600 }, { "epoch": 1.200800533689126, "learning_rate": 0.0007831038773656506, "step": 3600 }, { "epoch": 1.200800533689126, "loss": 1.3264976739883423, "step": 3600 }, { "ce_loss": 0.36774370074272156, "epoch": 1.200800533689126, "step": 3600 }, { "distill_loss": 0.5057428479194641, "epoch": 1.200800533689126, "step": 3600 }, { "epoch": 1.200800533689126, "ref_ce_loss": 0.22975876927375793, "step": 3600 }, { "epoch": 1.200800533689126, "loss": 1.2413110733032227, "step": 3600 }, { "ce_loss": 0.3056010901927948, "epoch": 1.200800533689126, "step": 3600 }, { "distill_loss": 0.48162832856178284, "epoch": 1.200800533689126, "step": 3600 }, { "epoch": 1.200800533689126, "ref_ce_loss": 0.23536323010921478, "step": 3600 }, { "epoch": 1.2041360907271514, "loss": 1.3204, "step": 3610 }, { "epoch": 1.2041360907271514, "grad_norm": 1.8102093935012817, "step": 3610 }, { "epoch": 1.2041360907271514, "learning_rate": 0.0007829793860068151, "step": 3610 }, { "epoch": 1.2041360907271514, "loss": 1.1256744861602783, "step": 3610 }, { "ce_loss": 0.3574826419353485, "epoch": 1.2041360907271514, "step": 3610 }, { "distill_loss": 0.514342725276947, "epoch": 1.2041360907271514, "step": 3610 }, { "epoch": 1.2041360907271514, "ref_ce_loss": 0.2532525658607483, "step": 3610 }, { "epoch": 1.2041360907271514, "loss": 1.1662226915359497, "step": 3610 }, { "ce_loss": 0.3443499207496643, "epoch": 1.2041360907271514, "step": 3610 }, { "distill_loss": 0.5596121549606323, "epoch": 1.2041360907271514, "step": 3610 }, { "epoch": 1.2041360907271514, "ref_ce_loss": 0.2618454396724701, "step": 3610 }, { "epoch": 1.2074716477651768, "loss": 1.1875, "step": 3620 }, { "epoch": 1.2074716477651768, "grad_norm": 1.8104478120803833, "step": 3620 }, { "epoch": 1.2074716477651768, "learning_rate": 0.0007828544476698258, "step": 3620 }, { "epoch": 1.2074716477651768, "loss": 1.4878114461898804, "step": 3620 }, { "ce_loss": 0.2988416254520416, "epoch": 1.2074716477651768, "step": 3620 }, { "distill_loss": 0.4803342819213867, "epoch": 1.2074716477651768, "step": 3620 }, { "epoch": 1.2074716477651768, "ref_ce_loss": 0.2804744243621826, "step": 3620 }, { "epoch": 1.2074716477651768, "loss": 1.1582154035568237, "step": 3620 }, { "ce_loss": 0.396658718585968, "epoch": 1.2074716477651768, "step": 3620 }, { "distill_loss": 0.47372835874557495, "epoch": 1.2074716477651768, "step": 3620 }, { "epoch": 1.2074716477651768, "ref_ce_loss": 0.2876175343990326, "step": 3620 }, { "epoch": 1.2108072048032021, "loss": 1.1445, "step": 3630 }, { "epoch": 1.2108072048032021, "grad_norm": 1.6298176050186157, "step": 3630 }, { "epoch": 1.2108072048032021, "learning_rate": 0.0007827290625004993, "step": 3630 }, { "epoch": 1.2108072048032021, "loss": 1.3248764276504517, "step": 3630 }, { "ce_loss": 0.27193018794059753, "epoch": 1.2108072048032021, "step": 3630 }, { "distill_loss": 0.43337440490722656, "epoch": 1.2108072048032021, "step": 3630 }, { "epoch": 1.2108072048032021, "ref_ce_loss": 0.26712289452552795, "step": 3630 }, { "epoch": 1.2108072048032021, "loss": 1.4135502576828003, "step": 3630 }, { "ce_loss": 0.30455535650253296, "epoch": 1.2108072048032021, "step": 3630 }, { "distill_loss": 0.47315406799316406, "epoch": 1.2108072048032021, "step": 3630 }, { "epoch": 1.2108072048032021, "ref_ce_loss": 0.2176881581544876, "step": 3630 }, { "epoch": 1.2141427618412275, "loss": 1.2222, "step": 3640 }, { "epoch": 1.2141427618412275, "grad_norm": 1.8237580060958862, "step": 3640 }, { "epoch": 1.2141427618412275, "learning_rate": 0.0007826032306451734, "step": 3640 }, { "epoch": 1.2141427618412275, "loss": 1.219295859336853, "step": 3640 }, { "ce_loss": 0.4064521789550781, "epoch": 1.2141427618412275, "step": 3640 }, { "distill_loss": 0.5045239329338074, "epoch": 1.2141427618412275, "step": 3640 }, { "epoch": 1.2141427618412275, "ref_ce_loss": 0.23610226809978485, "step": 3640 }, { "epoch": 1.2141427618412275, "loss": 1.324281096458435, "step": 3640 }, { "ce_loss": 0.39209476113319397, "epoch": 1.2141427618412275, "step": 3640 }, { "distill_loss": 0.574234664440155, "epoch": 1.2141427618412275, "step": 3640 }, { "epoch": 1.2141427618412275, "ref_ce_loss": 0.2725990414619446, "step": 3640 }, { "epoch": 1.2174783188792528, "loss": 1.3203, "step": 3650 }, { "epoch": 1.2174783188792528, "grad_norm": 2.246127128601074, "step": 3650 }, { "epoch": 1.2174783188792528, "learning_rate": 0.0007824769522507076, "step": 3650 }, { "epoch": 1.2174783188792528, "loss": 1.3088812828063965, "step": 3650 }, { "ce_loss": 0.36943188309669495, "epoch": 1.2174783188792528, "step": 3650 }, { "distill_loss": 0.5165199041366577, "epoch": 1.2174783188792528, "step": 3650 }, { "epoch": 1.2174783188792528, "ref_ce_loss": 0.3023454248905182, "step": 3650 }, { "epoch": 1.2174783188792528, "loss": 1.0939652919769287, "step": 3650 }, { "ce_loss": 0.2995774447917938, "epoch": 1.2174783188792528, "step": 3650 }, { "distill_loss": 0.5135769844055176, "epoch": 1.2174783188792528, "step": 3650 }, { "epoch": 1.2174783188792528, "ref_ce_loss": 0.2256573736667633, "step": 3650 }, { "epoch": 1.2208138759172782, "loss": 1.2701, "step": 3660 }, { "epoch": 1.2208138759172782, "grad_norm": 2.1431121826171875, "step": 3660 }, { "epoch": 1.2208138759172782, "learning_rate": 0.0007823502274644823, "step": 3660 }, { "epoch": 1.2208138759172782, "loss": 1.0833626985549927, "step": 3660 }, { "ce_loss": 0.3039146661758423, "epoch": 1.2208138759172782, "step": 3660 }, { "distill_loss": 0.4804261028766632, "epoch": 1.2208138759172782, "step": 3660 }, { "epoch": 1.2208138759172782, "ref_ce_loss": 0.2301558554172516, "step": 3660 }, { "epoch": 1.2208138759172782, "loss": 1.9844090938568115, "step": 3660 }, { "ce_loss": 0.31650522351264954, "epoch": 1.2208138759172782, "step": 3660 }, { "distill_loss": 0.4741988182067871, "epoch": 1.2208138759172782, "step": 3660 }, { "epoch": 1.2208138759172782, "ref_ce_loss": 0.19971761107444763, "step": 3660 }, { "epoch": 1.2241494329553035, "loss": 1.3489, "step": 3670 }, { "epoch": 1.2241494329553035, "grad_norm": 1.5589141845703125, "step": 3670 }, { "epoch": 1.2241494329553035, "learning_rate": 0.000782223056434399, "step": 3670 }, { "epoch": 1.2241494329553035, "loss": 1.1556637287139893, "step": 3670 }, { "ce_loss": 0.3101668953895569, "epoch": 1.2241494329553035, "step": 3670 }, { "distill_loss": 0.46484261751174927, "epoch": 1.2241494329553035, "step": 3670 }, { "epoch": 1.2241494329553035, "ref_ce_loss": 0.1859927624464035, "step": 3670 }, { "epoch": 1.2241494329553035, "loss": 1.2446397542953491, "step": 3670 }, { "ce_loss": 0.368746280670166, "epoch": 1.2241494329553035, "step": 3670 }, { "distill_loss": 0.5332959294319153, "epoch": 1.2241494329553035, "step": 3670 }, { "epoch": 1.2241494329553035, "ref_ce_loss": 0.2705717980861664, "step": 3670 }, { "epoch": 1.227484989993329, "loss": 1.2077, "step": 3680 }, { "epoch": 1.227484989993329, "grad_norm": 1.59275221824646, "step": 3680 }, { "epoch": 1.227484989993329, "learning_rate": 0.0007820954393088799, "step": 3680 }, { "epoch": 1.227484989993329, "loss": 1.3181159496307373, "step": 3680 }, { "ce_loss": 0.34320712089538574, "epoch": 1.227484989993329, "step": 3680 }, { "distill_loss": 0.405369371175766, "epoch": 1.227484989993329, "step": 3680 }, { "epoch": 1.227484989993329, "ref_ce_loss": 0.2773212790489197, "step": 3680 }, { "epoch": 1.227484989993329, "loss": 1.2116681337356567, "step": 3680 }, { "ce_loss": 0.39864087104797363, "epoch": 1.227484989993329, "step": 3680 }, { "distill_loss": 0.4519028663635254, "epoch": 1.227484989993329, "step": 3680 }, { "epoch": 1.227484989993329, "ref_ce_loss": 0.28923535346984863, "step": 3680 }, { "epoch": 1.2308205470313542, "loss": 1.2474, "step": 3690 }, { "epoch": 1.2308205470313542, "grad_norm": 2.0720341205596924, "step": 3690 }, { "epoch": 1.2308205470313542, "learning_rate": 0.0007819673762368679, "step": 3690 }, { "epoch": 1.2308205470313542, "loss": 1.9078865051269531, "step": 3690 }, { "ce_loss": 0.38197579979896545, "epoch": 1.2308205470313542, "step": 3690 }, { "distill_loss": 0.47684934735298157, "epoch": 1.2308205470313542, "step": 3690 }, { "epoch": 1.2308205470313542, "ref_ce_loss": 0.26847782731056213, "step": 3690 }, { "epoch": 1.2308205470313542, "loss": 0.9596579670906067, "step": 3690 }, { "ce_loss": 0.26398709416389465, "epoch": 1.2308205470313542, "step": 3690 }, { "distill_loss": 0.417118638753891, "epoch": 1.2308205470313542, "step": 3690 }, { "epoch": 1.2308205470313542, "ref_ce_loss": 0.1989685297012329, "step": 3690 }, { "epoch": 1.2341561040693796, "loss": 1.269, "step": 3700 }, { "epoch": 1.2341561040693796, "grad_norm": 1.8928744792938232, "step": 3700 }, { "epoch": 1.2341561040693796, "learning_rate": 0.0007818388673678265, "step": 3700 }, { "epoch": 1.2341561040693796, "loss": 1.292384386062622, "step": 3700 }, { "ce_loss": 0.3532315492630005, "epoch": 1.2341561040693796, "step": 3700 }, { "distill_loss": 0.4385555386543274, "epoch": 1.2341561040693796, "step": 3700 }, { "epoch": 1.2341561040693796, "ref_ce_loss": 0.23133112490177155, "step": 3700 }, { "epoch": 1.2341561040693796, "loss": 1.2692075967788696, "step": 3700 }, { "ce_loss": 0.373373806476593, "epoch": 1.2341561040693796, "step": 3700 }, { "distill_loss": 0.561535120010376, "epoch": 1.2341561040693796, "step": 3700 }, { "epoch": 1.2341561040693796, "ref_ce_loss": 0.25908219814300537, "step": 3700 }, { "epoch": 1.237491661107405, "loss": 1.2582, "step": 3710 }, { "epoch": 1.237491661107405, "grad_norm": 1.5493221282958984, "step": 3710 }, { "epoch": 1.237491661107405, "learning_rate": 0.0007817099128517393, "step": 3710 }, { "epoch": 1.237491661107405, "loss": 1.013564944267273, "step": 3710 }, { "ce_loss": 0.3018394112586975, "epoch": 1.237491661107405, "step": 3710 }, { "distill_loss": 0.478537380695343, "epoch": 1.237491661107405, "step": 3710 }, { "epoch": 1.237491661107405, "ref_ce_loss": 0.23245984315872192, "step": 3710 }, { "epoch": 1.237491661107405, "loss": 1.2901955842971802, "step": 3710 }, { "ce_loss": 0.2493903636932373, "epoch": 1.237491661107405, "step": 3710 }, { "distill_loss": 0.44054585695266724, "epoch": 1.237491661107405, "step": 3710 }, { "epoch": 1.237491661107405, "ref_ce_loss": 0.22715501487255096, "step": 3710 }, { "epoch": 1.2408272181454303, "loss": 1.2502, "step": 3720 }, { "epoch": 1.2408272181454303, "grad_norm": 1.918712854385376, "step": 3720 }, { "epoch": 1.2408272181454303, "learning_rate": 0.0007815805128391102, "step": 3720 }, { "epoch": 1.2408272181454303, "loss": 0.9254280924797058, "step": 3720 }, { "ce_loss": 0.25510698556900024, "epoch": 1.2408272181454303, "step": 3720 }, { "distill_loss": 0.4281381070613861, "epoch": 1.2408272181454303, "step": 3720 }, { "epoch": 1.2408272181454303, "ref_ce_loss": 0.24195969104766846, "step": 3720 }, { "epoch": 1.2408272181454303, "loss": 1.0331034660339355, "step": 3720 }, { "ce_loss": 0.3291766941547394, "epoch": 1.2408272181454303, "step": 3720 }, { "distill_loss": 0.4442521631717682, "epoch": 1.2408272181454303, "step": 3720 }, { "epoch": 1.2408272181454303, "ref_ce_loss": 0.25957420468330383, "step": 3720 }, { "epoch": 1.2441627751834556, "loss": 1.2901, "step": 3730 }, { "epoch": 1.2441627751834556, "grad_norm": 1.9499536752700806, "step": 3730 }, { "epoch": 1.2441627751834556, "learning_rate": 0.0007814506674809627, "step": 3730 }, { "epoch": 1.2441627751834556, "loss": 1.287750482559204, "step": 3730 }, { "ce_loss": 0.3604081869125366, "epoch": 1.2441627751834556, "step": 3730 }, { "distill_loss": 0.4917229413986206, "epoch": 1.2441627751834556, "step": 3730 }, { "epoch": 1.2441627751834556, "ref_ce_loss": 0.24488778412342072, "step": 3730 }, { "epoch": 1.2441627751834556, "loss": 1.1799731254577637, "step": 3730 }, { "ce_loss": 0.38218820095062256, "epoch": 1.2441627751834556, "step": 3730 }, { "distill_loss": 0.44150495529174805, "epoch": 1.2441627751834556, "step": 3730 }, { "epoch": 1.2441627751834556, "ref_ce_loss": 0.26581379771232605, "step": 3730 }, { "epoch": 1.247498332221481, "loss": 1.2774, "step": 3740 }, { "epoch": 1.247498332221481, "grad_norm": 1.8356317281723022, "step": 3740 }, { "epoch": 1.247498332221481, "learning_rate": 0.0007813203769288405, "step": 3740 }, { "epoch": 1.247498332221481, "loss": 1.0768628120422363, "step": 3740 }, { "ce_loss": 0.3683629035949707, "epoch": 1.247498332221481, "step": 3740 }, { "distill_loss": 0.448332279920578, "epoch": 1.247498332221481, "step": 3740 }, { "epoch": 1.247498332221481, "ref_ce_loss": 0.25920337438583374, "step": 3740 }, { "epoch": 1.247498332221481, "loss": 1.0234906673431396, "step": 3740 }, { "ce_loss": 0.29681557416915894, "epoch": 1.247498332221481, "step": 3740 }, { "distill_loss": 0.4987185597419739, "epoch": 1.247498332221481, "step": 3740 }, { "epoch": 1.247498332221481, "ref_ce_loss": 0.22705894708633423, "step": 3740 }, { "epoch": 1.2508338892595063, "loss": 1.2152, "step": 3750 }, { "epoch": 1.2508338892595063, "grad_norm": 3.1180875301361084, "step": 3750 }, { "epoch": 1.2508338892595063, "learning_rate": 0.0007811896413348068, "step": 3750 }, { "epoch": 1.2508338892595063, "loss": 1.5606341361999512, "step": 3750 }, { "ce_loss": 0.3866245448589325, "epoch": 1.2508338892595063, "step": 3750 }, { "distill_loss": 0.48672977089881897, "epoch": 1.2508338892595063, "step": 3750 }, { "epoch": 1.2508338892595063, "ref_ce_loss": 0.26517099142074585, "step": 3750 }, { "epoch": 1.2508338892595063, "loss": 1.093451738357544, "step": 3750 }, { "ce_loss": 0.38561955094337463, "epoch": 1.2508338892595063, "step": 3750 }, { "distill_loss": 0.4325929284095764, "epoch": 1.2508338892595063, "step": 3750 }, { "epoch": 1.2508338892595063, "ref_ce_loss": 0.2735663652420044, "step": 3750 }, { "epoch": 1.2541694462975317, "loss": 1.2702, "step": 3760 }, { "epoch": 1.2541694462975317, "grad_norm": 1.780735731124878, "step": 3760 }, { "epoch": 1.2541694462975317, "learning_rate": 0.0007810584608514438, "step": 3760 }, { "epoch": 1.2541694462975317, "loss": 1.186219334602356, "step": 3760 }, { "ce_loss": 0.3113108277320862, "epoch": 1.2541694462975317, "step": 3760 }, { "distill_loss": 0.4651794135570526, "epoch": 1.2541694462975317, "step": 3760 }, { "epoch": 1.2541694462975317, "ref_ce_loss": 0.21116314828395844, "step": 3760 }, { "epoch": 1.2541694462975317, "loss": 1.305797815322876, "step": 3760 }, { "ce_loss": 0.40197962522506714, "epoch": 1.2541694462975317, "step": 3760 }, { "distill_loss": 0.5337285995483398, "epoch": 1.2541694462975317, "step": 3760 }, { "epoch": 1.2541694462975317, "ref_ce_loss": 0.2919960916042328, "step": 3760 }, { "epoch": 1.257505003335557, "loss": 1.287, "step": 3770 }, { "epoch": 1.257505003335557, "grad_norm": 1.7766234874725342, "step": 3770 }, { "epoch": 1.257505003335557, "learning_rate": 0.0007809268356318535, "step": 3770 }, { "epoch": 1.257505003335557, "loss": 1.434028148651123, "step": 3770 }, { "ce_loss": 0.3346835970878601, "epoch": 1.257505003335557, "step": 3770 }, { "distill_loss": 0.43117108941078186, "epoch": 1.257505003335557, "step": 3770 }, { "epoch": 1.257505003335557, "ref_ce_loss": 0.2201913595199585, "step": 3770 }, { "epoch": 1.257505003335557, "loss": 0.8691766858100891, "step": 3770 }, { "ce_loss": 0.2524417042732239, "epoch": 1.257505003335557, "step": 3770 }, { "distill_loss": 0.3669566512107849, "epoch": 1.257505003335557, "step": 3770 }, { "epoch": 1.257505003335557, "ref_ce_loss": 0.24963033199310303, "step": 3770 }, { "epoch": 1.2608405603735824, "loss": 1.1919, "step": 3780 }, { "epoch": 1.2608405603735824, "grad_norm": 2.417431116104126, "step": 3780 }, { "epoch": 1.2608405603735824, "learning_rate": 0.0007807947658296564, "step": 3780 }, { "epoch": 1.2608405603735824, "loss": 1.0297123193740845, "step": 3780 }, { "ce_loss": 0.31283092498779297, "epoch": 1.2608405603735824, "step": 3780 }, { "distill_loss": 0.47859102487564087, "epoch": 1.2608405603735824, "step": 3780 }, { "epoch": 1.2608405603735824, "ref_ce_loss": 0.2380753755569458, "step": 3780 }, { "epoch": 1.2608405603735824, "loss": 1.102510690689087, "step": 3780 }, { "ce_loss": 0.3200535774230957, "epoch": 1.2608405603735824, "step": 3780 }, { "distill_loss": 0.5070088505744934, "epoch": 1.2608405603735824, "step": 3780 }, { "epoch": 1.2608405603735824, "ref_ce_loss": 0.2310090810060501, "step": 3780 }, { "epoch": 1.2641761174116077, "loss": 1.2351, "step": 3790 }, { "epoch": 1.2641761174116077, "grad_norm": 2.5228090286254883, "step": 3790 }, { "epoch": 1.2641761174116077, "learning_rate": 0.0007806622515989926, "step": 3790 }, { "epoch": 1.2641761174116077, "loss": 1.2249534130096436, "step": 3790 }, { "ce_loss": 0.3235895335674286, "epoch": 1.2641761174116077, "step": 3790 }, { "distill_loss": 0.5146533846855164, "epoch": 1.2641761174116077, "step": 3790 }, { "epoch": 1.2641761174116077, "ref_ce_loss": 0.30214163661003113, "step": 3790 }, { "epoch": 1.2641761174116077, "loss": 1.6786766052246094, "step": 3790 }, { "ce_loss": 0.3835620582103729, "epoch": 1.2641761174116077, "step": 3790 }, { "distill_loss": 0.7029944658279419, "epoch": 1.2641761174116077, "step": 3790 }, { "epoch": 1.2641761174116077, "ref_ce_loss": 0.2777293920516968, "step": 3790 }, { "epoch": 1.267511674449633, "loss": 1.2841, "step": 3800 }, { "epoch": 1.267511674449633, "grad_norm": 1.3752113580703735, "step": 3800 }, { "epoch": 1.267511674449633, "learning_rate": 0.0007805292930945202, "step": 3800 }, { "epoch": 1.267511674449633, "loss": 1.2774758338928223, "step": 3800 }, { "ce_loss": 0.34269601106643677, "epoch": 1.267511674449633, "step": 3800 }, { "distill_loss": 0.5056908130645752, "epoch": 1.267511674449633, "step": 3800 }, { "epoch": 1.267511674449633, "ref_ce_loss": 0.22918358445167542, "step": 3800 }, { "epoch": 1.267511674449633, "loss": 0.9431825280189514, "step": 3800 }, { "ce_loss": 0.29880571365356445, "epoch": 1.267511674449633, "step": 3800 }, { "distill_loss": 0.4647649824619293, "epoch": 1.267511674449633, "step": 3800 }, { "epoch": 1.267511674449633, "ref_ce_loss": 0.17928951978683472, "step": 3800 }, { "epoch": 1.2708472314876584, "loss": 1.332, "step": 3810 }, { "epoch": 1.2708472314876584, "grad_norm": 2.091035842895508, "step": 3810 }, { "epoch": 1.2708472314876584, "learning_rate": 0.0007803958904714159, "step": 3810 }, { "epoch": 1.2708472314876584, "loss": 1.3321853876113892, "step": 3810 }, { "ce_loss": 0.35707196593284607, "epoch": 1.2708472314876584, "step": 3810 }, { "distill_loss": 0.5309280157089233, "epoch": 1.2708472314876584, "step": 3810 }, { "epoch": 1.2708472314876584, "ref_ce_loss": 0.2696284353733063, "step": 3810 }, { "epoch": 1.2708472314876584, "loss": 1.3040084838867188, "step": 3810 }, { "ce_loss": 0.3999079465866089, "epoch": 1.2708472314876584, "step": 3810 }, { "distill_loss": 0.46109065413475037, "epoch": 1.2708472314876584, "step": 3810 }, { "epoch": 1.2708472314876584, "ref_ce_loss": 0.29647043347358704, "step": 3810 }, { "epoch": 1.2741827885256838, "loss": 1.2456, "step": 3820 }, { "epoch": 1.2741827885256838, "grad_norm": 1.902409315109253, "step": 3820 }, { "epoch": 1.2741827885256838, "learning_rate": 0.0007802620438853754, "step": 3820 }, { "epoch": 1.2741827885256838, "loss": 1.62610924243927, "step": 3820 }, { "ce_loss": 0.40455368161201477, "epoch": 1.2741827885256838, "step": 3820 }, { "distill_loss": 0.5282254815101624, "epoch": 1.2741827885256838, "step": 3820 }, { "epoch": 1.2741827885256838, "ref_ce_loss": 0.2308456301689148, "step": 3820 }, { "epoch": 1.2741827885256838, "loss": 1.2230732440948486, "step": 3820 }, { "ce_loss": 0.34106773138046265, "epoch": 1.2741827885256838, "step": 3820 }, { "distill_loss": 0.5421730279922485, "epoch": 1.2741827885256838, "step": 3820 }, { "epoch": 1.2741827885256838, "ref_ce_loss": 0.23581770062446594, "step": 3820 }, { "epoch": 1.2775183455637091, "loss": 1.3283, "step": 3830 }, { "epoch": 1.2775183455637091, "grad_norm": 1.6869583129882812, "step": 3830 }, { "epoch": 1.2775183455637091, "learning_rate": 0.0007801277534926117, "step": 3830 }, { "epoch": 1.2775183455637091, "loss": 1.173782467842102, "step": 3830 }, { "ce_loss": 0.3969452381134033, "epoch": 1.2775183455637091, "step": 3830 }, { "distill_loss": 0.46130332350730896, "epoch": 1.2775183455637091, "step": 3830 }, { "epoch": 1.2775183455637091, "ref_ce_loss": 0.2590175271034241, "step": 3830 }, { "epoch": 1.2775183455637091, "loss": 1.344570517539978, "step": 3830 }, { "ce_loss": 0.38278263807296753, "epoch": 1.2775183455637091, "step": 3830 }, { "distill_loss": 0.5976495742797852, "epoch": 1.2775183455637091, "step": 3830 }, { "epoch": 1.2775183455637091, "ref_ce_loss": 0.3041939437389374, "step": 3830 }, { "epoch": 1.2808539026017345, "loss": 1.1972, "step": 3840 }, { "epoch": 1.2808539026017345, "grad_norm": 2.1180238723754883, "step": 3840 }, { "epoch": 1.2808539026017345, "learning_rate": 0.0007799930194498561, "step": 3840 }, { "epoch": 1.2808539026017345, "loss": 1.1536294221878052, "step": 3840 }, { "ce_loss": 0.3288697600364685, "epoch": 1.2808539026017345, "step": 3840 }, { "distill_loss": 0.39680665731430054, "epoch": 1.2808539026017345, "step": 3840 }, { "epoch": 1.2808539026017345, "ref_ce_loss": 0.32831668853759766, "step": 3840 }, { "epoch": 1.2808539026017345, "loss": 1.2123440504074097, "step": 3840 }, { "ce_loss": 0.3592081367969513, "epoch": 1.2808539026017345, "step": 3840 }, { "distill_loss": 0.5434111952781677, "epoch": 1.2808539026017345, "step": 3840 }, { "epoch": 1.2808539026017345, "ref_ce_loss": 0.21991020441055298, "step": 3840 }, { "epoch": 1.2841894596397598, "loss": 1.2067, "step": 3850 }, { "epoch": 1.2841894596397598, "grad_norm": 1.927894949913025, "step": 3850 }, { "epoch": 1.2841894596397598, "learning_rate": 0.0007798578419143581, "step": 3850 }, { "epoch": 1.2841894596397598, "loss": 1.3616065979003906, "step": 3850 }, { "ce_loss": 0.4312002658843994, "epoch": 1.2841894596397598, "step": 3850 }, { "distill_loss": 0.5158619284629822, "epoch": 1.2841894596397598, "step": 3850 }, { "epoch": 1.2841894596397598, "ref_ce_loss": 0.33172714710235596, "step": 3850 }, { "epoch": 1.2841894596397598, "loss": 1.196885347366333, "step": 3850 }, { "ce_loss": 0.25604671239852905, "epoch": 1.2841894596397598, "step": 3850 }, { "distill_loss": 0.47364771366119385, "epoch": 1.2841894596397598, "step": 3850 }, { "epoch": 1.2841894596397598, "ref_ce_loss": 0.2535742521286011, "step": 3850 }, { "epoch": 1.2875250166777852, "loss": 1.3678, "step": 3860 }, { "epoch": 1.2875250166777852, "grad_norm": 2.626359701156616, "step": 3860 }, { "epoch": 1.2875250166777852, "learning_rate": 0.000779722221043884, "step": 3860 }, { "epoch": 1.2875250166777852, "loss": 1.3988009691238403, "step": 3860 }, { "ce_loss": 0.32167813181877136, "epoch": 1.2875250166777852, "step": 3860 }, { "distill_loss": 0.539366602897644, "epoch": 1.2875250166777852, "step": 3860 }, { "epoch": 1.2875250166777852, "ref_ce_loss": 0.23677317798137665, "step": 3860 }, { "epoch": 1.2875250166777852, "loss": 1.2158448696136475, "step": 3860 }, { "ce_loss": 0.37974071502685547, "epoch": 1.2875250166777852, "step": 3860 }, { "distill_loss": 0.5115886926651001, "epoch": 1.2875250166777852, "step": 3860 }, { "epoch": 1.2875250166777852, "ref_ce_loss": 0.3241943418979645, "step": 3860 }, { "epoch": 1.2908605737158105, "loss": 1.3151, "step": 3870 }, { "epoch": 1.2908605737158105, "grad_norm": 1.8815953731536865, "step": 3870 }, { "epoch": 1.2908605737158105, "learning_rate": 0.0007795861569967182, "step": 3870 }, { "epoch": 1.2908605737158105, "loss": 1.1867594718933105, "step": 3870 }, { "ce_loss": 0.3901292383670807, "epoch": 1.2908605737158105, "step": 3870 }, { "distill_loss": 0.4555829167366028, "epoch": 1.2908605737158105, "step": 3870 }, { "epoch": 1.2908605737158105, "ref_ce_loss": 0.29295268654823303, "step": 3870 }, { "epoch": 1.2908605737158105, "loss": 1.3478776216506958, "step": 3870 }, { "ce_loss": 0.35403814911842346, "epoch": 1.2908605737158105, "step": 3870 }, { "distill_loss": 0.4828459322452545, "epoch": 1.2908605737158105, "step": 3870 }, { "epoch": 1.2908605737158105, "ref_ce_loss": 0.29592955112457275, "step": 3870 }, { "epoch": 1.2941961307538359, "loss": 1.2429, "step": 3880 }, { "epoch": 1.2941961307538359, "grad_norm": 1.8783077001571655, "step": 3880 }, { "epoch": 1.2941961307538359, "learning_rate": 0.0007794496499316621, "step": 3880 }, { "epoch": 1.2941961307538359, "loss": 0.9118102192878723, "step": 3880 }, { "ce_loss": 0.22861425578594208, "epoch": 1.2941961307538359, "step": 3880 }, { "distill_loss": 0.39684969186782837, "epoch": 1.2941961307538359, "step": 3880 }, { "epoch": 1.2941961307538359, "ref_ce_loss": 0.19896776974201202, "step": 3880 }, { "epoch": 1.2941961307538359, "loss": 1.749941110610962, "step": 3880 }, { "ce_loss": 0.4143109619617462, "epoch": 1.2941961307538359, "step": 3880 }, { "distill_loss": 0.625407338142395, "epoch": 1.2941961307538359, "step": 3880 }, { "epoch": 1.2941961307538359, "ref_ce_loss": 0.2498316466808319, "step": 3880 }, { "epoch": 1.2975316877918612, "loss": 1.2324, "step": 3890 }, { "epoch": 1.2975316877918612, "grad_norm": 1.451900601387024, "step": 3890 }, { "epoch": 1.2975316877918612, "learning_rate": 0.000779312700008034, "step": 3890 }, { "epoch": 1.2975316877918612, "loss": 1.0057792663574219, "step": 3890 }, { "ce_loss": 0.33447906374931335, "epoch": 1.2975316877918612, "step": 3890 }, { "distill_loss": 0.43524765968322754, "epoch": 1.2975316877918612, "step": 3890 }, { "epoch": 1.2975316877918612, "ref_ce_loss": 0.23554539680480957, "step": 3890 }, { "epoch": 1.2975316877918612, "loss": 1.3061044216156006, "step": 3890 }, { "ce_loss": 0.35017919540405273, "epoch": 1.2975316877918612, "step": 3890 }, { "distill_loss": 0.4204780161380768, "epoch": 1.2975316877918612, "step": 3890 }, { "epoch": 1.2975316877918612, "ref_ce_loss": 0.2741265892982483, "step": 3890 }, { "epoch": 1.3008672448298866, "loss": 1.2003, "step": 3900 }, { "epoch": 1.3008672448298866, "grad_norm": 2.9234039783477783, "step": 3900 }, { "epoch": 1.3008672448298866, "learning_rate": 0.0007791753073856692, "step": 3900 }, { "epoch": 1.3008672448298866, "loss": 1.3480119705200195, "step": 3900 }, { "ce_loss": 0.3394612669944763, "epoch": 1.3008672448298866, "step": 3900 }, { "distill_loss": 0.6371109485626221, "epoch": 1.3008672448298866, "step": 3900 }, { "epoch": 1.3008672448298866, "ref_ce_loss": 0.29357367753982544, "step": 3900 }, { "epoch": 1.3008672448298866, "loss": 0.9949564337730408, "step": 3900 }, { "ce_loss": 0.2775047719478607, "epoch": 1.3008672448298866, "step": 3900 }, { "distill_loss": 0.5102842450141907, "epoch": 1.3008672448298866, "step": 3900 }, { "epoch": 1.3008672448298866, "ref_ce_loss": 0.20661421120166779, "step": 3900 }, { "epoch": 1.304202801867912, "loss": 1.2634, "step": 3910 }, { "epoch": 1.304202801867912, "grad_norm": 1.8522040843963623, "step": 3910 }, { "epoch": 1.304202801867912, "learning_rate": 0.0007790374722249198, "step": 3910 }, { "epoch": 1.304202801867912, "loss": 1.3082270622253418, "step": 3910 }, { "ce_loss": 0.3670852780342102, "epoch": 1.304202801867912, "step": 3910 }, { "distill_loss": 0.5673364996910095, "epoch": 1.304202801867912, "step": 3910 }, { "epoch": 1.304202801867912, "ref_ce_loss": 0.29259952902793884, "step": 3910 }, { "epoch": 1.304202801867912, "loss": 0.9135763645172119, "step": 3910 }, { "ce_loss": 0.26060348749160767, "epoch": 1.304202801867912, "step": 3910 }, { "distill_loss": 0.45228105783462524, "epoch": 1.304202801867912, "step": 3910 }, { "epoch": 1.304202801867912, "ref_ce_loss": 0.19999496638774872, "step": 3910 }, { "epoch": 1.3075383589059373, "loss": 1.2096, "step": 3920 }, { "epoch": 1.3075383589059373, "grad_norm": 2.51267409324646, "step": 3920 }, { "epoch": 1.3075383589059373, "learning_rate": 0.0007788991946866542, "step": 3920 }, { "epoch": 1.3075383589059373, "loss": 1.2026423215866089, "step": 3920 }, { "ce_loss": 0.32737037539482117, "epoch": 1.3075383589059373, "step": 3920 }, { "distill_loss": 0.5024385452270508, "epoch": 1.3075383589059373, "step": 3920 }, { "epoch": 1.3075383589059373, "ref_ce_loss": 0.22580769658088684, "step": 3920 }, { "epoch": 1.3075383589059373, "loss": 1.123530387878418, "step": 3920 }, { "ce_loss": 0.37271732091903687, "epoch": 1.3075383589059373, "step": 3920 }, { "distill_loss": 0.4684728682041168, "epoch": 1.3075383589059373, "step": 3920 }, { "epoch": 1.3075383589059373, "ref_ce_loss": 0.2198222279548645, "step": 3920 }, { "epoch": 1.3108739159439626, "loss": 1.1637, "step": 3930 }, { "epoch": 1.3108739159439626, "grad_norm": 1.9118099212646484, "step": 3930 }, { "epoch": 1.3108739159439626, "learning_rate": 0.000778760474932257, "step": 3930 }, { "epoch": 1.3108739159439626, "loss": 1.0190352201461792, "step": 3930 }, { "ce_loss": 0.31040164828300476, "epoch": 1.3108739159439626, "step": 3930 }, { "distill_loss": 0.3947071433067322, "epoch": 1.3108739159439626, "step": 3930 }, { "epoch": 1.3108739159439626, "ref_ce_loss": 0.247438445687294, "step": 3930 }, { "epoch": 1.3108739159439626, "loss": 1.3314695358276367, "step": 3930 }, { "ce_loss": 0.27209722995758057, "epoch": 1.3108739159439626, "step": 3930 }, { "distill_loss": 0.4830207824707031, "epoch": 1.3108739159439626, "step": 3930 }, { "epoch": 1.3108739159439626, "ref_ce_loss": 0.2210894525051117, "step": 3930 }, { "epoch": 1.314209472981988, "loss": 1.2585, "step": 3940 }, { "epoch": 1.314209472981988, "grad_norm": 1.4486713409423828, "step": 3940 }, { "epoch": 1.314209472981988, "learning_rate": 0.0007786213131236294, "step": 3940 }, { "epoch": 1.314209472981988, "loss": 0.9980438947677612, "step": 3940 }, { "ce_loss": 0.3219510614871979, "epoch": 1.314209472981988, "step": 3940 }, { "distill_loss": 0.47861647605895996, "epoch": 1.314209472981988, "step": 3940 }, { "epoch": 1.314209472981988, "ref_ce_loss": 0.19660653173923492, "step": 3940 }, { "epoch": 1.314209472981988, "loss": 1.198689341545105, "step": 3940 }, { "ce_loss": 0.26567214727401733, "epoch": 1.314209472981988, "step": 3940 }, { "distill_loss": 0.43696823716163635, "epoch": 1.314209472981988, "step": 3940 }, { "epoch": 1.314209472981988, "ref_ce_loss": 0.20686624944210052, "step": 3940 }, { "epoch": 1.3175450300200133, "loss": 1.2143, "step": 3950 }, { "epoch": 1.3175450300200133, "grad_norm": 1.840827226638794, "step": 3950 }, { "epoch": 1.3175450300200133, "learning_rate": 0.000778481709423188, "step": 3950 }, { "epoch": 1.3175450300200133, "loss": 1.210796594619751, "step": 3950 }, { "ce_loss": 0.35644543170928955, "epoch": 1.3175450300200133, "step": 3950 }, { "distill_loss": 0.5447015762329102, "epoch": 1.3175450300200133, "step": 3950 }, { "epoch": 1.3175450300200133, "ref_ce_loss": 0.2510436177253723, "step": 3950 }, { "epoch": 1.3175450300200133, "loss": 1.4380931854248047, "step": 3950 }, { "ce_loss": 0.2858380675315857, "epoch": 1.3175450300200133, "step": 3950 }, { "distill_loss": 0.5480408072471619, "epoch": 1.3175450300200133, "step": 3950 }, { "epoch": 1.3175450300200133, "ref_ce_loss": 0.24802200496196747, "step": 3950 }, { "epoch": 1.3208805870580387, "loss": 1.3879, "step": 3960 }, { "epoch": 1.3208805870580387, "grad_norm": 1.9232362508773804, "step": 3960 }, { "epoch": 1.3208805870580387, "learning_rate": 0.0007783416639938654, "step": 3960 }, { "epoch": 1.3208805870580387, "loss": 0.9122708439826965, "step": 3960 }, { "ce_loss": 0.23479680716991425, "epoch": 1.3208805870580387, "step": 3960 }, { "distill_loss": 0.4136393070220947, "epoch": 1.3208805870580387, "step": 3960 }, { "epoch": 1.3208805870580387, "ref_ce_loss": 0.17437537014484406, "step": 3960 }, { "epoch": 1.3208805870580387, "loss": 1.3577370643615723, "step": 3960 }, { "ce_loss": 0.32100340723991394, "epoch": 1.3208805870580387, "step": 3960 }, { "distill_loss": 0.4907395839691162, "epoch": 1.3208805870580387, "step": 3960 }, { "epoch": 1.3208805870580387, "ref_ce_loss": 0.21169336140155792, "step": 3960 }, { "epoch": 1.324216144096064, "loss": 1.2768, "step": 3970 }, { "epoch": 1.324216144096064, "grad_norm": 1.9105011224746704, "step": 3970 }, { "epoch": 1.324216144096064, "learning_rate": 0.0007782011769991097, "step": 3970 }, { "epoch": 1.324216144096064, "loss": 0.9515153765678406, "step": 3970 }, { "ce_loss": 0.2723129391670227, "epoch": 1.324216144096064, "step": 3970 }, { "distill_loss": 0.4680030345916748, "epoch": 1.324216144096064, "step": 3970 }, { "epoch": 1.324216144096064, "ref_ce_loss": 0.2104032188653946, "step": 3970 }, { "epoch": 1.324216144096064, "loss": 1.3177235126495361, "step": 3970 }, { "ce_loss": 0.28969529271125793, "epoch": 1.324216144096064, "step": 3970 }, { "distill_loss": 0.4438014030456543, "epoch": 1.324216144096064, "step": 3970 }, { "epoch": 1.324216144096064, "ref_ce_loss": 0.2746254503726959, "step": 3970 }, { "epoch": 1.3275517011340894, "loss": 1.1935, "step": 3980 }, { "epoch": 1.3275517011340894, "grad_norm": 1.9408024549484253, "step": 3980 }, { "epoch": 1.3275517011340894, "learning_rate": 0.0007780602486028843, "step": 3980 }, { "epoch": 1.3275517011340894, "loss": 1.2587913274765015, "step": 3980 }, { "ce_loss": 0.3765130639076233, "epoch": 1.3275517011340894, "step": 3980 }, { "distill_loss": 0.5410686135292053, "epoch": 1.3275517011340894, "step": 3980 }, { "epoch": 1.3275517011340894, "ref_ce_loss": 0.24707219004631042, "step": 3980 }, { "epoch": 1.3275517011340894, "loss": 1.5994065999984741, "step": 3980 }, { "ce_loss": 0.4570700228214264, "epoch": 1.3275517011340894, "step": 3980 }, { "distill_loss": 0.6066403388977051, "epoch": 1.3275517011340894, "step": 3980 }, { "epoch": 1.3275517011340894, "ref_ce_loss": 0.2937600612640381, "step": 3980 }, { "epoch": 1.3308872581721147, "loss": 1.2621, "step": 3990 }, { "epoch": 1.3308872581721147, "grad_norm": 1.874920129776001, "step": 3990 }, { "epoch": 1.3308872581721147, "learning_rate": 0.0007779188789696677, "step": 3990 }, { "epoch": 1.3308872581721147, "loss": 1.5024466514587402, "step": 3990 }, { "ce_loss": 0.4218442142009735, "epoch": 1.3308872581721147, "step": 3990 }, { "distill_loss": 0.49272358417510986, "epoch": 1.3308872581721147, "step": 3990 }, { "epoch": 1.3308872581721147, "ref_ce_loss": 0.30377674102783203, "step": 3990 }, { "epoch": 1.3308872581721147, "loss": 1.033069133758545, "step": 3990 }, { "ce_loss": 0.29926007986068726, "epoch": 1.3308872581721147, "step": 3990 }, { "distill_loss": 0.4381902813911438, "epoch": 1.3308872581721147, "step": 3990 }, { "epoch": 1.3308872581721147, "ref_ce_loss": 0.295410692691803, "step": 3990 }, { "epoch": 1.33422281521014, "loss": 1.2749, "step": 4000 }, { "epoch": 1.33422281521014, "grad_norm": 2.6915059089660645, "step": 4000 }, { "epoch": 1.33422281521014, "learning_rate": 0.0007777770682644537, "step": 4000 }, { "epoch": 1.33422281521014, "loss": 1.3868048191070557, "step": 4000 }, { "ce_loss": 0.3571189343929291, "epoch": 1.33422281521014, "step": 4000 }, { "distill_loss": 0.634859561920166, "epoch": 1.33422281521014, "step": 4000 }, { "epoch": 1.33422281521014, "ref_ce_loss": 0.2819118797779083, "step": 4000 }, { "epoch": 1.33422281521014, "loss": 1.1737803220748901, "step": 4000 }, { "ce_loss": 0.3412657082080841, "epoch": 1.33422281521014, "step": 4000 }, { "distill_loss": 0.5893285274505615, "epoch": 1.33422281521014, "step": 4000 }, { "epoch": 1.33422281521014, "ref_ce_loss": 0.24275919795036316, "step": 4000 }, { "epoch": 1.3375583722481654, "loss": 1.3882, "step": 4010 }, { "epoch": 1.3375583722481654, "grad_norm": 1.72599196434021, "step": 4010 }, { "epoch": 1.3375583722481654, "learning_rate": 0.0007776348166527506, "step": 4010 }, { "epoch": 1.3375583722481654, "loss": 1.196636438369751, "step": 4010 }, { "ce_loss": 0.31943756341934204, "epoch": 1.3375583722481654, "step": 4010 }, { "distill_loss": 0.5612338781356812, "epoch": 1.3375583722481654, "step": 4010 }, { "epoch": 1.3375583722481654, "ref_ce_loss": 0.19274453818798065, "step": 4010 }, { "epoch": 1.3375583722481654, "loss": 1.206289291381836, "step": 4010 }, { "ce_loss": 0.3338809907436371, "epoch": 1.3375583722481654, "step": 4010 }, { "distill_loss": 0.5372907519340515, "epoch": 1.3375583722481654, "step": 4010 }, { "epoch": 1.3375583722481654, "ref_ce_loss": 0.24270124733448029, "step": 4010 }, { "epoch": 1.3408939292861908, "loss": 1.2553, "step": 4020 }, { "epoch": 1.3408939292861908, "grad_norm": 1.5856388807296753, "step": 4020 }, { "epoch": 1.3408939292861908, "learning_rate": 0.0007774921243005812, "step": 4020 }, { "epoch": 1.3408939292861908, "loss": 1.261049509048462, "step": 4020 }, { "ce_loss": 0.32735010981559753, "epoch": 1.3408939292861908, "step": 4020 }, { "distill_loss": 0.406891405582428, "epoch": 1.3408939292861908, "step": 4020 }, { "epoch": 1.3408939292861908, "ref_ce_loss": 0.26967743039131165, "step": 4020 }, { "epoch": 1.3408939292861908, "loss": 1.6279792785644531, "step": 4020 }, { "ce_loss": 0.33251306414604187, "epoch": 1.3408939292861908, "step": 4020 }, { "distill_loss": 0.45015233755111694, "epoch": 1.3408939292861908, "step": 4020 }, { "epoch": 1.3408939292861908, "ref_ce_loss": 0.2929288446903229, "step": 4020 }, { "epoch": 1.3442294863242161, "loss": 1.3001, "step": 4030 }, { "epoch": 1.3442294863242161, "grad_norm": 1.8929225206375122, "step": 4030 }, { "epoch": 1.3442294863242161, "learning_rate": 0.0007773489913744829, "step": 4030 }, { "epoch": 1.3442294863242161, "loss": 1.1194690465927124, "step": 4030 }, { "ce_loss": 0.2566942572593689, "epoch": 1.3442294863242161, "step": 4030 }, { "distill_loss": 0.45145970582962036, "epoch": 1.3442294863242161, "step": 4030 }, { "epoch": 1.3442294863242161, "ref_ce_loss": 0.22091078758239746, "step": 4030 }, { "epoch": 1.3442294863242161, "loss": 1.2250628471374512, "step": 4030 }, { "ce_loss": 0.3045845329761505, "epoch": 1.3442294863242161, "step": 4030 }, { "distill_loss": 0.5682451128959656, "epoch": 1.3442294863242161, "step": 4030 }, { "epoch": 1.3442294863242161, "ref_ce_loss": 0.19009850919246674, "step": 4030 }, { "epoch": 1.3475650433622415, "loss": 1.3624, "step": 4040 }, { "epoch": 1.3475650433622415, "grad_norm": 2.1252081394195557, "step": 4040 }, { "epoch": 1.3475650433622415, "learning_rate": 0.0007772054180415072, "step": 4040 }, { "epoch": 1.3475650433622415, "loss": 0.9570633769035339, "step": 4040 }, { "ce_loss": 0.27775803208351135, "epoch": 1.3475650433622415, "step": 4040 }, { "distill_loss": 0.4150533676147461, "epoch": 1.3475650433622415, "step": 4040 }, { "epoch": 1.3475650433622415, "ref_ce_loss": 0.18391333520412445, "step": 4040 }, { "epoch": 1.3475650433622415, "loss": 1.1152474880218506, "step": 4040 }, { "ce_loss": 0.3719904124736786, "epoch": 1.3475650433622415, "step": 4040 }, { "distill_loss": 0.479840487241745, "epoch": 1.3475650433622415, "step": 4040 }, { "epoch": 1.3475650433622415, "ref_ce_loss": 0.2096468210220337, "step": 4040 }, { "epoch": 1.3509006004002668, "loss": 1.2063, "step": 4050 }, { "epoch": 1.3509006004002668, "grad_norm": 1.6501506567001343, "step": 4050 }, { "epoch": 1.3509006004002668, "learning_rate": 0.0007770614044692197, "step": 4050 }, { "epoch": 1.3509006004002668, "loss": 0.9688602089881897, "step": 4050 }, { "ce_loss": 0.2844759523868561, "epoch": 1.3509006004002668, "step": 4050 }, { "distill_loss": 0.48957210779190063, "epoch": 1.3509006004002668, "step": 4050 }, { "epoch": 1.3509006004002668, "ref_ce_loss": 0.1945677399635315, "step": 4050 }, { "epoch": 1.3509006004002668, "loss": 0.977230429649353, "step": 4050 }, { "ce_loss": 0.2928234934806824, "epoch": 1.3509006004002668, "step": 4050 }, { "distill_loss": 0.47115814685821533, "epoch": 1.3509006004002668, "step": 4050 }, { "epoch": 1.3509006004002668, "ref_ce_loss": 0.21297568082809448, "step": 4050 }, { "epoch": 1.3542361574382922, "loss": 1.2671, "step": 4060 }, { "epoch": 1.3542361574382922, "grad_norm": 1.6580679416656494, "step": 4060 }, { "epoch": 1.3542361574382922, "learning_rate": 0.0007769169508256998, "step": 4060 }, { "epoch": 1.3542361574382922, "loss": 1.6507563591003418, "step": 4060 }, { "ce_loss": 0.3336380124092102, "epoch": 1.3542361574382922, "step": 4060 }, { "distill_loss": 0.44040024280548096, "epoch": 1.3542361574382922, "step": 4060 }, { "epoch": 1.3542361574382922, "ref_ce_loss": 0.2393060177564621, "step": 4060 }, { "epoch": 1.3542361574382922, "loss": 1.0483828783035278, "step": 4060 }, { "ce_loss": 0.3106835186481476, "epoch": 1.3542361574382922, "step": 4060 }, { "distill_loss": 0.4723849296569824, "epoch": 1.3542361574382922, "step": 4060 }, { "epoch": 1.3542361574382922, "ref_ce_loss": 0.19641432166099548, "step": 4060 }, { "epoch": 1.3575717144763175, "loss": 1.2578, "step": 4070 }, { "epoch": 1.3575717144763175, "grad_norm": 1.6928682327270508, "step": 4070 }, { "epoch": 1.3575717144763175, "learning_rate": 0.0007767720572795402, "step": 4070 }, { "epoch": 1.3575717144763175, "loss": 1.1844429969787598, "step": 4070 }, { "ce_loss": 0.32700392603874207, "epoch": 1.3575717144763175, "step": 4070 }, { "distill_loss": 0.5141096711158752, "epoch": 1.3575717144763175, "step": 4070 }, { "epoch": 1.3575717144763175, "ref_ce_loss": 0.22966568171977997, "step": 4070 }, { "epoch": 1.3575717144763175, "loss": 1.1436493396759033, "step": 4070 }, { "ce_loss": 0.28880831599235535, "epoch": 1.3575717144763175, "step": 4070 }, { "distill_loss": 0.4775233566761017, "epoch": 1.3575717144763175, "step": 4070 }, { "epoch": 1.3575717144763175, "ref_ce_loss": 0.22410236299037933, "step": 4070 }, { "epoch": 1.3609072715143429, "loss": 1.3958, "step": 4080 }, { "epoch": 1.3609072715143429, "grad_norm": 2.11891770362854, "step": 4080 }, { "epoch": 1.3609072715143429, "learning_rate": 0.0007766267239998474, "step": 4080 }, { "epoch": 1.3609072715143429, "loss": 1.4525783061981201, "step": 4080 }, { "ce_loss": 0.3727923631668091, "epoch": 1.3609072715143429, "step": 4080 }, { "distill_loss": 0.5707982778549194, "epoch": 1.3609072715143429, "step": 4080 }, { "epoch": 1.3609072715143429, "ref_ce_loss": 0.2720719873905182, "step": 4080 }, { "epoch": 1.3609072715143429, "loss": 1.2829095125198364, "step": 4080 }, { "ce_loss": 0.2625315487384796, "epoch": 1.3609072715143429, "step": 4080 }, { "distill_loss": 0.5479893684387207, "epoch": 1.3609072715143429, "step": 4080 }, { "epoch": 1.3609072715143429, "ref_ce_loss": 0.2146390825510025, "step": 4080 }, { "epoch": 1.3642428285523682, "loss": 1.2783, "step": 4090 }, { "epoch": 1.3642428285523682, "grad_norm": 1.5971099138259888, "step": 4090 }, { "epoch": 1.3642428285523682, "learning_rate": 0.000776480951156241, "step": 4090 }, { "epoch": 1.3642428285523682, "loss": 1.1137681007385254, "step": 4090 }, { "ce_loss": 0.31989777088165283, "epoch": 1.3642428285523682, "step": 4090 }, { "distill_loss": 0.46096158027648926, "epoch": 1.3642428285523682, "step": 4090 }, { "epoch": 1.3642428285523682, "ref_ce_loss": 0.2357354760169983, "step": 4090 }, { "epoch": 1.3642428285523682, "loss": 1.3032519817352295, "step": 4090 }, { "ce_loss": 0.41542813181877136, "epoch": 1.3642428285523682, "step": 4090 }, { "distill_loss": 0.5394437313079834, "epoch": 1.3642428285523682, "step": 4090 }, { "epoch": 1.3642428285523682, "ref_ce_loss": 0.25867047905921936, "step": 4090 }, { "epoch": 1.3675783855903936, "loss": 1.2893, "step": 4100 }, { "epoch": 1.3675783855903936, "grad_norm": 2.2129158973693848, "step": 4100 }, { "epoch": 1.3675783855903936, "learning_rate": 0.0007763347389188538, "step": 4100 }, { "epoch": 1.3675783855903936, "loss": 1.1820653676986694, "step": 4100 }, { "ce_loss": 0.3743279278278351, "epoch": 1.3675783855903936, "step": 4100 }, { "distill_loss": 0.46733152866363525, "epoch": 1.3675783855903936, "step": 4100 }, { "epoch": 1.3675783855903936, "ref_ce_loss": 0.27772408723831177, "step": 4100 }, { "epoch": 1.3675783855903936, "loss": 1.2491978406906128, "step": 4100 }, { "ce_loss": 0.3634282946586609, "epoch": 1.3675783855903936, "step": 4100 }, { "distill_loss": 0.5043824911117554, "epoch": 1.3675783855903936, "step": 4100 }, { "epoch": 1.3675783855903936, "ref_ce_loss": 0.222568079829216, "step": 4100 }, { "epoch": 1.370913942628419, "loss": 1.2623, "step": 4110 }, { "epoch": 1.370913942628419, "grad_norm": 2.6276814937591553, "step": 4110 }, { "epoch": 1.370913942628419, "learning_rate": 0.0007761880874583308, "step": 4110 }, { "epoch": 1.370913942628419, "loss": 0.9971789121627808, "step": 4110 }, { "ce_loss": 0.2905887961387634, "epoch": 1.370913942628419, "step": 4110 }, { "distill_loss": 0.4974335730075836, "epoch": 1.370913942628419, "step": 4110 }, { "epoch": 1.370913942628419, "ref_ce_loss": 0.20612338185310364, "step": 4110 }, { "epoch": 1.370913942628419, "loss": 1.457338809967041, "step": 4110 }, { "ce_loss": 0.3569025993347168, "epoch": 1.370913942628419, "step": 4110 }, { "distill_loss": 0.5805869102478027, "epoch": 1.370913942628419, "step": 4110 }, { "epoch": 1.370913942628419, "ref_ce_loss": 0.25414037704467773, "step": 4110 }, { "epoch": 1.3742494996664443, "loss": 1.1611, "step": 4120 }, { "epoch": 1.3742494996664443, "grad_norm": 1.3242756128311157, "step": 4120 }, { "epoch": 1.3742494996664443, "learning_rate": 0.0007760409969458301, "step": 4120 }, { "epoch": 1.3742494996664443, "loss": 1.0695313215255737, "step": 4120 }, { "ce_loss": 0.3189639747142792, "epoch": 1.3742494996664443, "step": 4120 }, { "distill_loss": 0.40805482864379883, "epoch": 1.3742494996664443, "step": 4120 }, { "epoch": 1.3742494996664443, "ref_ce_loss": 0.19657932221889496, "step": 4120 }, { "epoch": 1.3742494996664443, "loss": 0.9379848837852478, "step": 4120 }, { "ce_loss": 0.2650635242462158, "epoch": 1.3742494996664443, "step": 4120 }, { "distill_loss": 0.4356876313686371, "epoch": 1.3742494996664443, "step": 4120 }, { "epoch": 1.3742494996664443, "ref_ce_loss": 0.2354866862297058, "step": 4120 }, { "epoch": 1.3775850567044696, "loss": 1.2268, "step": 4130 }, { "epoch": 1.3775850567044696, "grad_norm": 1.7438019514083862, "step": 4130 }, { "epoch": 1.3775850567044696, "learning_rate": 0.0007758934675530224, "step": 4130 }, { "epoch": 1.3775850567044696, "loss": 1.2131264209747314, "step": 4130 }, { "ce_loss": 0.36638134717941284, "epoch": 1.3775850567044696, "step": 4130 }, { "distill_loss": 0.4722328186035156, "epoch": 1.3775850567044696, "step": 4130 }, { "epoch": 1.3775850567044696, "ref_ce_loss": 0.3309879004955292, "step": 4130 }, { "epoch": 1.3775850567044696, "loss": 1.2295116186141968, "step": 4130 }, { "ce_loss": 0.34632059931755066, "epoch": 1.3775850567044696, "step": 4130 }, { "distill_loss": 0.4670564532279968, "epoch": 1.3775850567044696, "step": 4130 }, { "epoch": 1.3775850567044696, "ref_ce_loss": 0.26047638058662415, "step": 4130 }, { "epoch": 1.380920613742495, "loss": 1.1731, "step": 4140 }, { "epoch": 1.380920613742495, "grad_norm": 1.7917325496673584, "step": 4140 }, { "epoch": 1.380920613742495, "learning_rate": 0.0007757454994520902, "step": 4140 }, { "epoch": 1.380920613742495, "loss": 1.2207962274551392, "step": 4140 }, { "ce_loss": 0.3304611146450043, "epoch": 1.380920613742495, "step": 4140 }, { "distill_loss": 0.4229097366333008, "epoch": 1.380920613742495, "step": 4140 }, { "epoch": 1.380920613742495, "ref_ce_loss": 0.2661569118499756, "step": 4140 }, { "epoch": 1.380920613742495, "loss": 1.1358577013015747, "step": 4140 }, { "ce_loss": 0.32615402340888977, "epoch": 1.380920613742495, "step": 4140 }, { "distill_loss": 0.44644737243652344, "epoch": 1.380920613742495, "step": 4140 }, { "epoch": 1.380920613742495, "ref_ce_loss": 0.2777961790561676, "step": 4140 }, { "epoch": 1.3842561707805203, "loss": 1.187, "step": 4150 }, { "epoch": 1.3842561707805203, "grad_norm": 1.9539581537246704, "step": 4150 }, { "epoch": 1.3842561707805203, "learning_rate": 0.0007755970928157282, "step": 4150 }, { "epoch": 1.3842561707805203, "loss": 1.2930634021759033, "step": 4150 }, { "ce_loss": 0.3514963984489441, "epoch": 1.3842561707805203, "step": 4150 }, { "distill_loss": 0.5322245955467224, "epoch": 1.3842561707805203, "step": 4150 }, { "epoch": 1.3842561707805203, "ref_ce_loss": 0.1809874325990677, "step": 4150 }, { "epoch": 1.3842561707805203, "loss": 0.8671672940254211, "step": 4150 }, { "ce_loss": 0.27406421303749084, "epoch": 1.3842561707805203, "step": 4150 }, { "distill_loss": 0.3994680643081665, "epoch": 1.3842561707805203, "step": 4150 }, { "epoch": 1.3842561707805203, "ref_ce_loss": 0.19340065121650696, "step": 4150 }, { "epoch": 1.3875917278185457, "loss": 1.1675, "step": 4160 }, { "epoch": 1.3875917278185457, "grad_norm": 1.6229897737503052, "step": 4160 }, { "epoch": 1.3875917278185457, "learning_rate": 0.0007754482478171432, "step": 4160 }, { "epoch": 1.3875917278185457, "loss": 1.1500556468963623, "step": 4160 }, { "ce_loss": 0.32584086060523987, "epoch": 1.3875917278185457, "step": 4160 }, { "distill_loss": 0.36228689551353455, "epoch": 1.3875917278185457, "step": 4160 }, { "epoch": 1.3875917278185457, "ref_ce_loss": 0.22661544382572174, "step": 4160 }, { "epoch": 1.3875917278185457, "loss": 1.1648491621017456, "step": 4160 }, { "ce_loss": 0.370568186044693, "epoch": 1.3875917278185457, "step": 4160 }, { "distill_loss": 0.4062316417694092, "epoch": 1.3875917278185457, "step": 4160 }, { "epoch": 1.3875917278185457, "ref_ce_loss": 0.284487783908844, "step": 4160 }, { "epoch": 1.390927284856571, "loss": 1.1423, "step": 4170 }, { "epoch": 1.390927284856571, "grad_norm": 2.137101888656616, "step": 4170 }, { "epoch": 1.390927284856571, "learning_rate": 0.0007752989646300529, "step": 4170 }, { "epoch": 1.390927284856571, "loss": 1.123513102531433, "step": 4170 }, { "ce_loss": 0.3663288950920105, "epoch": 1.390927284856571, "step": 4170 }, { "distill_loss": 0.46327751874923706, "epoch": 1.390927284856571, "step": 4170 }, { "epoch": 1.390927284856571, "ref_ce_loss": 0.27190497517585754, "step": 4170 }, { "epoch": 1.390927284856571, "loss": 1.4627983570098877, "step": 4170 }, { "ce_loss": 0.3249722421169281, "epoch": 1.390927284856571, "step": 4170 }, { "distill_loss": 0.4730352461338043, "epoch": 1.390927284856571, "step": 4170 }, { "epoch": 1.390927284856571, "ref_ce_loss": 0.20529697835445404, "step": 4170 }, { "epoch": 1.3942628418945964, "loss": 1.1578, "step": 4180 }, { "epoch": 1.3942628418945964, "grad_norm": 1.8561530113220215, "step": 4180 }, { "epoch": 1.3942628418945964, "learning_rate": 0.0007751492434286872, "step": 4180 }, { "epoch": 1.3942628418945964, "loss": 1.6327110528945923, "step": 4180 }, { "ce_loss": 0.23420818150043488, "epoch": 1.3942628418945964, "step": 4180 }, { "distill_loss": 0.3671976625919342, "epoch": 1.3942628418945964, "step": 4180 }, { "epoch": 1.3942628418945964, "ref_ce_loss": 0.20780381560325623, "step": 4180 }, { "epoch": 1.3942628418945964, "loss": 1.0552396774291992, "step": 4180 }, { "ce_loss": 0.2987080216407776, "epoch": 1.3942628418945964, "step": 4180 }, { "distill_loss": 0.41605520248413086, "epoch": 1.3942628418945964, "step": 4180 }, { "epoch": 1.3942628418945964, "ref_ce_loss": 0.1913209706544876, "step": 4180 }, { "epoch": 1.3975983989326217, "loss": 1.1349, "step": 4190 }, { "epoch": 1.3975983989326217, "grad_norm": 1.696298599243164, "step": 4190 }, { "epoch": 1.3975983989326217, "learning_rate": 0.0007749990843877865, "step": 4190 }, { "epoch": 1.3975983989326217, "loss": 1.2016475200653076, "step": 4190 }, { "ce_loss": 0.3398931622505188, "epoch": 1.3975983989326217, "step": 4190 }, { "distill_loss": 0.43274614214897156, "epoch": 1.3975983989326217, "step": 4190 }, { "epoch": 1.3975983989326217, "ref_ce_loss": 0.2609109878540039, "step": 4190 }, { "epoch": 1.3975983989326217, "loss": 1.0881268978118896, "step": 4190 }, { "ce_loss": 0.3539738357067108, "epoch": 1.3975983989326217, "step": 4190 }, { "distill_loss": 0.43918776512145996, "epoch": 1.3975983989326217, "step": 4190 }, { "epoch": 1.3975983989326217, "ref_ce_loss": 0.1944909542798996, "step": 4190 }, { "epoch": 1.400933955970647, "loss": 1.2285, "step": 4200 }, { "epoch": 1.400933955970647, "grad_norm": 1.5924049615859985, "step": 4200 }, { "epoch": 1.400933955970647, "learning_rate": 0.0007748484876826028, "step": 4200 }, { "epoch": 1.400933955970647, "loss": 1.4822578430175781, "step": 4200 }, { "ce_loss": 0.28238165378570557, "epoch": 1.400933955970647, "step": 4200 }, { "distill_loss": 0.37509191036224365, "epoch": 1.400933955970647, "step": 4200 }, { "epoch": 1.400933955970647, "ref_ce_loss": 0.21833594143390656, "step": 4200 }, { "epoch": 1.400933955970647, "loss": 1.3037891387939453, "step": 4200 }, { "ce_loss": 0.3134443759918213, "epoch": 1.400933955970647, "step": 4200 }, { "distill_loss": 0.5113526582717896, "epoch": 1.400933955970647, "step": 4200 }, { "epoch": 1.400933955970647, "ref_ce_loss": 0.2542562186717987, "step": 4200 }, { "epoch": 1.4042695130086724, "loss": 1.2271, "step": 4210 }, { "epoch": 1.4042695130086724, "grad_norm": 1.8334202766418457, "step": 4210 }, { "epoch": 1.4042695130086724, "learning_rate": 0.0007746974534888986, "step": 4210 }, { "epoch": 1.4042695130086724, "loss": 1.1463003158569336, "step": 4210 }, { "ce_loss": 0.3059764504432678, "epoch": 1.4042695130086724, "step": 4210 }, { "distill_loss": 0.47663551568984985, "epoch": 1.4042695130086724, "step": 4210 }, { "epoch": 1.4042695130086724, "ref_ce_loss": 0.27451011538505554, "step": 4210 }, { "epoch": 1.4042695130086724, "loss": 1.2052125930786133, "step": 4210 }, { "ce_loss": 0.3422890305519104, "epoch": 1.4042695130086724, "step": 4210 }, { "distill_loss": 0.5845454931259155, "epoch": 1.4042695130086724, "step": 4210 }, { "epoch": 1.4042695130086724, "ref_ce_loss": 0.19704946875572205, "step": 4210 }, { "epoch": 1.4076050700466978, "loss": 1.1701, "step": 4220 }, { "epoch": 1.4076050700466978, "grad_norm": 1.990908145904541, "step": 4220 }, { "epoch": 1.4076050700466978, "learning_rate": 0.0007745459819829473, "step": 4220 }, { "epoch": 1.4076050700466978, "loss": 1.103863000869751, "step": 4220 }, { "ce_loss": 0.27677813172340393, "epoch": 1.4076050700466978, "step": 4220 }, { "distill_loss": 0.4710143208503723, "epoch": 1.4076050700466978, "step": 4220 }, { "epoch": 1.4076050700466978, "ref_ce_loss": 0.17940568923950195, "step": 4220 }, { "epoch": 1.4076050700466978, "loss": 1.1673400402069092, "step": 4220 }, { "ce_loss": 0.3511430025100708, "epoch": 1.4076050700466978, "step": 4220 }, { "distill_loss": 0.5159322023391724, "epoch": 1.4076050700466978, "step": 4220 }, { "epoch": 1.4076050700466978, "ref_ce_loss": 0.24053901433944702, "step": 4220 }, { "epoch": 1.4109406270847231, "loss": 1.2512, "step": 4230 }, { "epoch": 1.4109406270847231, "grad_norm": 1.604972004890442, "step": 4230 }, { "epoch": 1.4109406270847231, "learning_rate": 0.0007743940733415319, "step": 4230 }, { "epoch": 1.4109406270847231, "loss": 1.0855860710144043, "step": 4230 }, { "ce_loss": 0.3288971781730652, "epoch": 1.4109406270847231, "step": 4230 }, { "distill_loss": 0.4643997848033905, "epoch": 1.4109406270847231, "step": 4230 }, { "epoch": 1.4109406270847231, "ref_ce_loss": 0.20832104980945587, "step": 4230 }, { "epoch": 1.4109406270847231, "loss": 1.439693808555603, "step": 4230 }, { "ce_loss": 0.34634625911712646, "epoch": 1.4109406270847231, "step": 4230 }, { "distill_loss": 0.5309983491897583, "epoch": 1.4109406270847231, "step": 4230 }, { "epoch": 1.4109406270847231, "ref_ce_loss": 0.23617428541183472, "step": 4230 }, { "epoch": 1.4142761841227485, "loss": 1.1937, "step": 4240 }, { "epoch": 1.4142761841227485, "grad_norm": 1.9160957336425781, "step": 4240 }, { "epoch": 1.4142761841227485, "learning_rate": 0.0007742417277419465, "step": 4240 }, { "epoch": 1.4142761841227485, "loss": 1.0655322074890137, "step": 4240 }, { "ce_loss": 0.2940945625305176, "epoch": 1.4142761841227485, "step": 4240 }, { "distill_loss": 0.4044458568096161, "epoch": 1.4142761841227485, "step": 4240 }, { "epoch": 1.4142761841227485, "ref_ce_loss": 0.20413602888584137, "step": 4240 }, { "epoch": 1.4142761841227485, "loss": 1.7881062030792236, "step": 4240 }, { "ce_loss": 0.46321895718574524, "epoch": 1.4142761841227485, "step": 4240 }, { "distill_loss": 0.5222978591918945, "epoch": 1.4142761841227485, "step": 4240 }, { "epoch": 1.4142761841227485, "ref_ce_loss": 0.2899708151817322, "step": 4240 }, { "epoch": 1.4176117411607738, "loss": 1.2468, "step": 4250 }, { "epoch": 1.4176117411607738, "grad_norm": 2.470942735671997, "step": 4250 }, { "epoch": 1.4176117411607738, "learning_rate": 0.0007740889453619949, "step": 4250 }, { "epoch": 1.4176117411607738, "loss": 1.334357500076294, "step": 4250 }, { "ce_loss": 0.4395330250263214, "epoch": 1.4176117411607738, "step": 4250 }, { "distill_loss": 0.5255869030952454, "epoch": 1.4176117411607738, "step": 4250 }, { "epoch": 1.4176117411607738, "ref_ce_loss": 0.2873314321041107, "step": 4250 }, { "epoch": 1.4176117411607738, "loss": 1.2778472900390625, "step": 4250 }, { "ce_loss": 0.33936822414398193, "epoch": 1.4176117411607738, "step": 4250 }, { "distill_loss": 0.44710713624954224, "epoch": 1.4176117411607738, "step": 4250 }, { "epoch": 1.4176117411607738, "ref_ce_loss": 0.2442135512828827, "step": 4250 }, { "epoch": 1.4209472981987992, "loss": 1.1803, "step": 4260 }, { "epoch": 1.4209472981987992, "grad_norm": 2.082793951034546, "step": 4260 }, { "epoch": 1.4209472981987992, "learning_rate": 0.0007739357263799903, "step": 4260 }, { "epoch": 1.4209472981987992, "loss": 1.5148963928222656, "step": 4260 }, { "ce_loss": 0.42286428809165955, "epoch": 1.4209472981987992, "step": 4260 }, { "distill_loss": 0.4865938127040863, "epoch": 1.4209472981987992, "step": 4260 }, { "epoch": 1.4209472981987992, "ref_ce_loss": 0.2644711434841156, "step": 4260 }, { "epoch": 1.4209472981987992, "loss": 1.1568526029586792, "step": 4260 }, { "ce_loss": 0.363336980342865, "epoch": 1.4209472981987992, "step": 4260 }, { "distill_loss": 0.4715014100074768, "epoch": 1.4209472981987992, "step": 4260 }, { "epoch": 1.4209472981987992, "ref_ce_loss": 0.25636565685272217, "step": 4260 }, { "epoch": 1.4242828552368245, "loss": 1.2529, "step": 4270 }, { "epoch": 1.4242828552368245, "grad_norm": 1.6873589754104614, "step": 4270 }, { "epoch": 1.4242828552368245, "learning_rate": 0.0007737820709747559, "step": 4270 }, { "epoch": 1.4242828552368245, "loss": 1.147404432296753, "step": 4270 }, { "ce_loss": 0.3248310685157776, "epoch": 1.4242828552368245, "step": 4270 }, { "distill_loss": 0.4509952664375305, "epoch": 1.4242828552368245, "step": 4270 }, { "epoch": 1.4242828552368245, "ref_ce_loss": 0.24929717183113098, "step": 4270 }, { "epoch": 1.4242828552368245, "loss": 1.0453346967697144, "step": 4270 }, { "ce_loss": 0.3251522183418274, "epoch": 1.4242828552368245, "step": 4270 }, { "distill_loss": 0.4964909553527832, "epoch": 1.4242828552368245, "step": 4270 }, { "epoch": 1.4242828552368245, "ref_ce_loss": 0.2225407361984253, "step": 4270 }, { "epoch": 1.4276184122748499, "loss": 1.2289, "step": 4280 }, { "epoch": 1.4276184122748499, "grad_norm": 2.059302806854248, "step": 4280 }, { "epoch": 1.4276184122748499, "learning_rate": 0.0007736279793256241, "step": 4280 }, { "epoch": 1.4276184122748499, "loss": 1.9120283126831055, "step": 4280 }, { "ce_loss": 0.37467390298843384, "epoch": 1.4276184122748499, "step": 4280 }, { "distill_loss": 0.47755178809165955, "epoch": 1.4276184122748499, "step": 4280 }, { "epoch": 1.4276184122748499, "ref_ce_loss": 0.29072806239128113, "step": 4280 }, { "epoch": 1.4276184122748499, "loss": 2.3525896072387695, "step": 4280 }, { "ce_loss": 0.3748549818992615, "epoch": 1.4276184122748499, "step": 4280 }, { "distill_loss": 0.42405492067337036, "epoch": 1.4276184122748499, "step": 4280 }, { "epoch": 1.4276184122748499, "ref_ce_loss": 0.2544288635253906, "step": 4280 }, { "epoch": 1.4309539693128752, "loss": 1.3413, "step": 4290 }, { "epoch": 1.4309539693128752, "grad_norm": 1.8155747652053833, "step": 4290 }, { "epoch": 1.4309539693128752, "learning_rate": 0.0007734734516124362, "step": 4290 }, { "epoch": 1.4309539693128752, "loss": 1.2981414794921875, "step": 4290 }, { "ce_loss": 0.40608739852905273, "epoch": 1.4309539693128752, "step": 4290 }, { "distill_loss": 0.5454487204551697, "epoch": 1.4309539693128752, "step": 4290 }, { "epoch": 1.4309539693128752, "ref_ce_loss": 0.27907249331474304, "step": 4290 }, { "epoch": 1.4309539693128752, "loss": 0.9883788228034973, "step": 4290 }, { "ce_loss": 0.3114151656627655, "epoch": 1.4309539693128752, "step": 4290 }, { "distill_loss": 0.42436087131500244, "epoch": 1.4309539693128752, "step": 4290 }, { "epoch": 1.4309539693128752, "ref_ce_loss": 0.2510216236114502, "step": 4290 }, { "epoch": 1.4342895263509006, "loss": 1.3547, "step": 4300 }, { "epoch": 1.4342895263509006, "grad_norm": 2.8617186546325684, "step": 4300 }, { "epoch": 1.4342895263509006, "learning_rate": 0.0007733184880155431, "step": 4300 }, { "epoch": 1.4342895263509006, "loss": 1.3844374418258667, "step": 4300 }, { "ce_loss": 0.41324886679649353, "epoch": 1.4342895263509006, "step": 4300 }, { "distill_loss": 0.5392626523971558, "epoch": 1.4342895263509006, "step": 4300 }, { "epoch": 1.4342895263509006, "ref_ce_loss": 0.25431501865386963, "step": 4300 }, { "epoch": 1.4342895263509006, "loss": 1.012142300605774, "step": 4300 }, { "ce_loss": 0.2379559874534607, "epoch": 1.4342895263509006, "step": 4300 }, { "distill_loss": 0.425807386636734, "epoch": 1.4342895263509006, "step": 4300 }, { "epoch": 1.4342895263509006, "ref_ce_loss": 0.2007993459701538, "step": 4300 }, { "epoch": 1.437625083388926, "loss": 1.1946, "step": 4310 }, { "epoch": 1.437625083388926, "grad_norm": 1.6949338912963867, "step": 4310 }, { "epoch": 1.437625083388926, "learning_rate": 0.0007731630887158037, "step": 4310 }, { "epoch": 1.437625083388926, "loss": 1.02177095413208, "step": 4310 }, { "ce_loss": 0.24608014523983002, "epoch": 1.437625083388926, "step": 4310 }, { "distill_loss": 0.45627400279045105, "epoch": 1.437625083388926, "step": 4310 }, { "epoch": 1.437625083388926, "ref_ce_loss": 0.20076674222946167, "step": 4310 }, { "epoch": 1.437625083388926, "loss": 1.084315299987793, "step": 4310 }, { "ce_loss": 0.3126720190048218, "epoch": 1.437625083388926, "step": 4310 }, { "distill_loss": 0.4719497561454773, "epoch": 1.437625083388926, "step": 4310 }, { "epoch": 1.437625083388926, "ref_ce_loss": 0.22649230062961578, "step": 4310 }, { "epoch": 1.4409606404269513, "loss": 1.2592, "step": 4320 }, { "epoch": 1.4409606404269513, "grad_norm": 2.882329225540161, "step": 4320 }, { "epoch": 1.4409606404269513, "learning_rate": 0.0007730072538945857, "step": 4320 }, { "epoch": 1.4409606404269513, "loss": 1.1591767072677612, "step": 4320 }, { "ce_loss": 0.36486318707466125, "epoch": 1.4409606404269513, "step": 4320 }, { "distill_loss": 0.5568551421165466, "epoch": 1.4409606404269513, "step": 4320 }, { "epoch": 1.4409606404269513, "ref_ce_loss": 0.23736506700515747, "step": 4320 }, { "epoch": 1.4409606404269513, "loss": 1.2775168418884277, "step": 4320 }, { "ce_loss": 0.42621615529060364, "epoch": 1.4409606404269513, "step": 4320 }, { "distill_loss": 0.5392773151397705, "epoch": 1.4409606404269513, "step": 4320 }, { "epoch": 1.4409606404269513, "ref_ce_loss": 0.311714231967926, "step": 4320 }, { "epoch": 1.4442961974649766, "loss": 1.201, "step": 4330 }, { "epoch": 1.4442961974649766, "grad_norm": 3.2554428577423096, "step": 4330 }, { "epoch": 1.4442961974649766, "learning_rate": 0.0007728509837337652, "step": 4330 }, { "epoch": 1.4442961974649766, "loss": 2.2309839725494385, "step": 4330 }, { "ce_loss": 0.3380087614059448, "epoch": 1.4442961974649766, "step": 4330 }, { "distill_loss": 0.5450072884559631, "epoch": 1.4442961974649766, "step": 4330 }, { "epoch": 1.4442961974649766, "ref_ce_loss": 0.23968921601772308, "step": 4330 }, { "epoch": 1.4442961974649766, "loss": 1.2512460947036743, "step": 4330 }, { "ce_loss": 0.3639287054538727, "epoch": 1.4442961974649766, "step": 4330 }, { "distill_loss": 0.5350576043128967, "epoch": 1.4442961974649766, "step": 4330 }, { "epoch": 1.4442961974649766, "ref_ce_loss": 0.2919241189956665, "step": 4330 }, { "epoch": 1.447631754503002, "loss": 1.2924, "step": 4340 }, { "epoch": 1.447631754503002, "grad_norm": 2.581834077835083, "step": 4340 }, { "epoch": 1.447631754503002, "learning_rate": 0.0007726942784157262, "step": 4340 }, { "epoch": 1.447631754503002, "loss": 1.748133659362793, "step": 4340 }, { "ce_loss": 0.476225346326828, "epoch": 1.447631754503002, "step": 4340 }, { "distill_loss": 0.6821185350418091, "epoch": 1.447631754503002, "step": 4340 }, { "epoch": 1.447631754503002, "ref_ce_loss": 0.2749059796333313, "step": 4340 }, { "epoch": 1.447631754503002, "loss": 1.231296181678772, "step": 4340 }, { "ce_loss": 0.362838476896286, "epoch": 1.447631754503002, "step": 4340 }, { "distill_loss": 0.5025181770324707, "epoch": 1.447631754503002, "step": 4340 }, { "epoch": 1.447631754503002, "ref_ce_loss": 0.28818202018737793, "step": 4340 }, { "epoch": 1.4509673115410273, "loss": 1.2941, "step": 4350 }, { "epoch": 1.4509673115410273, "grad_norm": 1.7488747835159302, "step": 4350 }, { "epoch": 1.4509673115410273, "learning_rate": 0.0007725371381233607, "step": 4350 }, { "epoch": 1.4509673115410273, "loss": 1.2100708484649658, "step": 4350 }, { "ce_loss": 0.33391791582107544, "epoch": 1.4509673115410273, "step": 4350 }, { "distill_loss": 0.46951207518577576, "epoch": 1.4509673115410273, "step": 4350 }, { "epoch": 1.4509673115410273, "ref_ce_loss": 0.23552538454532623, "step": 4350 }, { "epoch": 1.4509673115410273, "loss": 1.1646292209625244, "step": 4350 }, { "ce_loss": 0.3197769820690155, "epoch": 1.4509673115410273, "step": 4350 }, { "distill_loss": 0.4967893660068512, "epoch": 1.4509673115410273, "step": 4350 }, { "epoch": 1.4509673115410273, "ref_ce_loss": 0.20482869446277618, "step": 4350 }, { "epoch": 1.4543028685790527, "loss": 1.2345, "step": 4360 }, { "epoch": 1.4543028685790527, "grad_norm": 1.8481858968734741, "step": 4360 }, { "epoch": 1.4543028685790527, "learning_rate": 0.0007723795630400686, "step": 4360 }, { "epoch": 1.4543028685790527, "loss": 1.1097173690795898, "step": 4360 }, { "ce_loss": 0.27734866738319397, "epoch": 1.4543028685790527, "step": 4360 }, { "distill_loss": 0.43501776456832886, "epoch": 1.4543028685790527, "step": 4360 }, { "epoch": 1.4543028685790527, "ref_ce_loss": 0.20296035706996918, "step": 4360 }, { "epoch": 1.4543028685790527, "loss": 1.2077794075012207, "step": 4360 }, { "ce_loss": 0.3112337291240692, "epoch": 1.4543028685790527, "step": 4360 }, { "distill_loss": 0.5319333076477051, "epoch": 1.4543028685790527, "step": 4360 }, { "epoch": 1.4543028685790527, "ref_ce_loss": 0.21200990676879883, "step": 4360 }, { "epoch": 1.457638425617078, "loss": 1.2801, "step": 4370 }, { "epoch": 1.457638425617078, "grad_norm": 1.7149287462234497, "step": 4370 }, { "epoch": 1.457638425617078, "learning_rate": 0.0007722215533497566, "step": 4370 }, { "epoch": 1.457638425617078, "loss": 1.0904030799865723, "step": 4370 }, { "ce_loss": 0.2962927222251892, "epoch": 1.457638425617078, "step": 4370 }, { "distill_loss": 0.5173056125640869, "epoch": 1.457638425617078, "step": 4370 }, { "epoch": 1.457638425617078, "ref_ce_loss": 0.21020416915416718, "step": 4370 }, { "epoch": 1.457638425617078, "loss": 1.0985056161880493, "step": 4370 }, { "ce_loss": 0.27120473980903625, "epoch": 1.457638425617078, "step": 4370 }, { "distill_loss": 0.4675452709197998, "epoch": 1.457638425617078, "step": 4370 }, { "epoch": 1.457638425617078, "ref_ce_loss": 0.22813116014003754, "step": 4370 }, { "epoch": 1.4609739826551034, "loss": 1.256, "step": 4380 }, { "epoch": 1.4609739826551034, "grad_norm": 2.0012896060943604, "step": 4380 }, { "epoch": 1.4609739826551034, "learning_rate": 0.000772063109236839, "step": 4380 }, { "epoch": 1.4609739826551034, "loss": 1.459157943725586, "step": 4380 }, { "ce_loss": 0.23730525374412537, "epoch": 1.4609739826551034, "step": 4380 }, { "distill_loss": 0.5278077125549316, "epoch": 1.4609739826551034, "step": 4380 }, { "epoch": 1.4609739826551034, "ref_ce_loss": 0.1980505734682083, "step": 4380 }, { "epoch": 1.4609739826551034, "loss": 1.243227243423462, "step": 4380 }, { "ce_loss": 0.3660188913345337, "epoch": 1.4609739826551034, "step": 4380 }, { "distill_loss": 0.6111223697662354, "epoch": 1.4609739826551034, "step": 4380 }, { "epoch": 1.4609739826551034, "ref_ce_loss": 0.26495978236198425, "step": 4380 }, { "epoch": 1.4643095396931287, "loss": 1.196, "step": 4390 }, { "epoch": 1.4643095396931287, "grad_norm": 1.796673059463501, "step": 4390 }, { "epoch": 1.4643095396931287, "learning_rate": 0.0007719042308862374, "step": 4390 }, { "epoch": 1.4643095396931287, "loss": 1.1504383087158203, "step": 4390 }, { "ce_loss": 0.28149980306625366, "epoch": 1.4643095396931287, "step": 4390 }, { "distill_loss": 0.5336781144142151, "epoch": 1.4643095396931287, "step": 4390 }, { "epoch": 1.4643095396931287, "ref_ce_loss": 0.22772520780563354, "step": 4390 }, { "epoch": 1.4643095396931287, "loss": 1.1730796098709106, "step": 4390 }, { "ce_loss": 0.2699909806251526, "epoch": 1.4643095396931287, "step": 4390 }, { "distill_loss": 0.5290112495422363, "epoch": 1.4643095396931287, "step": 4390 }, { "epoch": 1.4643095396931287, "ref_ce_loss": 0.25511685013771057, "step": 4390 }, { "epoch": 1.467645096731154, "loss": 1.2889, "step": 4400 }, { "epoch": 1.467645096731154, "grad_norm": 2.082970380783081, "step": 4400 }, { "epoch": 1.467645096731154, "learning_rate": 0.0007717449184833797, "step": 4400 }, { "epoch": 1.467645096731154, "loss": 1.255260705947876, "step": 4400 }, { "ce_loss": 0.36189717054367065, "epoch": 1.467645096731154, "step": 4400 }, { "distill_loss": 0.4005478322505951, "epoch": 1.467645096731154, "step": 4400 }, { "epoch": 1.467645096731154, "ref_ce_loss": 0.25275516510009766, "step": 4400 }, { "epoch": 1.467645096731154, "loss": 1.2640717029571533, "step": 4400 }, { "ce_loss": 0.36174458265304565, "epoch": 1.467645096731154, "step": 4400 }, { "distill_loss": 0.4172531068325043, "epoch": 1.467645096731154, "step": 4400 }, { "epoch": 1.467645096731154, "ref_ce_loss": 0.2263191193342209, "step": 4400 }, { "epoch": 1.4709806537691794, "loss": 1.2228, "step": 4410 }, { "epoch": 1.4709806537691794, "grad_norm": 1.754824161529541, "step": 4410 }, { "epoch": 1.4709806537691794, "learning_rate": 0.0007715851722142008, "step": 4410 }, { "epoch": 1.4709806537691794, "loss": 1.3902934789657593, "step": 4410 }, { "ce_loss": 0.37669166922569275, "epoch": 1.4709806537691794, "step": 4410 }, { "distill_loss": 0.5949977040290833, "epoch": 1.4709806537691794, "step": 4410 }, { "epoch": 1.4709806537691794, "ref_ce_loss": 0.3123665153980255, "step": 4410 }, { "epoch": 1.4709806537691794, "loss": 1.2151596546173096, "step": 4410 }, { "ce_loss": 0.32400956749916077, "epoch": 1.4709806537691794, "step": 4410 }, { "distill_loss": 0.4897102415561676, "epoch": 1.4709806537691794, "step": 4410 }, { "epoch": 1.4709806537691794, "ref_ce_loss": 0.20092438161373138, "step": 4410 }, { "epoch": 1.4743162108072048, "loss": 1.2723, "step": 4420 }, { "epoch": 1.4743162108072048, "grad_norm": 2.524966239929199, "step": 4420 }, { "epoch": 1.4743162108072048, "learning_rate": 0.0007714249922651417, "step": 4420 }, { "epoch": 1.4743162108072048, "loss": 1.3874589204788208, "step": 4420 }, { "ce_loss": 0.30010318756103516, "epoch": 1.4743162108072048, "step": 4420 }, { "distill_loss": 0.5788282752037048, "epoch": 1.4743162108072048, "step": 4420 }, { "epoch": 1.4743162108072048, "ref_ce_loss": 0.230285182595253, "step": 4420 }, { "epoch": 1.4743162108072048, "loss": 1.1097252368927002, "step": 4420 }, { "ce_loss": 0.2605561316013336, "epoch": 1.4743162108072048, "step": 4420 }, { "distill_loss": 0.46707484126091003, "epoch": 1.4743162108072048, "step": 4420 }, { "epoch": 1.4743162108072048, "ref_ce_loss": 0.19243906438350677, "step": 4420 }, { "epoch": 1.4776517678452301, "loss": 1.2361, "step": 4430 }, { "epoch": 1.4776517678452301, "grad_norm": 2.02272629737854, "step": 4430 }, { "epoch": 1.4776517678452301, "learning_rate": 0.0007712643788231496, "step": 4430 }, { "epoch": 1.4776517678452301, "loss": 1.1122024059295654, "step": 4430 }, { "ce_loss": 0.3622915744781494, "epoch": 1.4776517678452301, "step": 4430 }, { "distill_loss": 0.46392694115638733, "epoch": 1.4776517678452301, "step": 4430 }, { "epoch": 1.4776517678452301, "ref_ce_loss": 0.20297808945178986, "step": 4430 }, { "epoch": 1.4776517678452301, "loss": 1.1137298345565796, "step": 4430 }, { "ce_loss": 0.3511255085468292, "epoch": 1.4776517678452301, "step": 4430 }, { "distill_loss": 0.4177210330963135, "epoch": 1.4776517678452301, "step": 4430 }, { "epoch": 1.4776517678452301, "ref_ce_loss": 0.29275891184806824, "step": 4430 }, { "epoch": 1.4809873248832555, "loss": 1.3339, "step": 4440 }, { "epoch": 1.4809873248832555, "grad_norm": 2.513075828552246, "step": 4440 }, { "epoch": 1.4809873248832555, "learning_rate": 0.0007711033320756778, "step": 4440 }, { "epoch": 1.4809873248832555, "loss": 1.3569178581237793, "step": 4440 }, { "ce_loss": 0.27129077911376953, "epoch": 1.4809873248832555, "step": 4440 }, { "distill_loss": 0.44302868843078613, "epoch": 1.4809873248832555, "step": 4440 }, { "epoch": 1.4809873248832555, "ref_ce_loss": 0.20074187219142914, "step": 4440 }, { "epoch": 1.4809873248832555, "loss": 1.2615227699279785, "step": 4440 }, { "ce_loss": 0.3332144021987915, "epoch": 1.4809873248832555, "step": 4440 }, { "distill_loss": 0.49916476011276245, "epoch": 1.4809873248832555, "step": 4440 }, { "epoch": 1.4809873248832555, "ref_ce_loss": 0.1940786987543106, "step": 4440 }, { "epoch": 1.4843228819212808, "loss": 1.2713, "step": 4450 }, { "epoch": 1.4843228819212808, "grad_norm": 1.484968662261963, "step": 4450 }, { "epoch": 1.4843228819212808, "learning_rate": 0.0007709418522106851, "step": 4450 }, { "epoch": 1.4843228819212808, "loss": 1.2872264385223389, "step": 4450 }, { "ce_loss": 0.4188542664051056, "epoch": 1.4843228819212808, "step": 4450 }, { "distill_loss": 0.5292216539382935, "epoch": 1.4843228819212808, "step": 4450 }, { "epoch": 1.4843228819212808, "ref_ce_loss": 0.25378894805908203, "step": 4450 }, { "epoch": 1.4843228819212808, "loss": 1.5678834915161133, "step": 4450 }, { "ce_loss": 0.3771999478340149, "epoch": 1.4843228819212808, "step": 4450 }, { "distill_loss": 0.6054632663726807, "epoch": 1.4843228819212808, "step": 4450 }, { "epoch": 1.4843228819212808, "ref_ce_loss": 0.24410036206245422, "step": 4450 }, { "epoch": 1.4876584389593062, "loss": 1.3639, "step": 4460 }, { "epoch": 1.4876584389593062, "grad_norm": 1.744352102279663, "step": 4460 }, { "epoch": 1.4876584389593062, "learning_rate": 0.0007707799394166358, "step": 4460 }, { "epoch": 1.4876584389593062, "loss": 1.6583737134933472, "step": 4460 }, { "ce_loss": 0.35331714153289795, "epoch": 1.4876584389593062, "step": 4460 }, { "distill_loss": 0.4476446211338043, "epoch": 1.4876584389593062, "step": 4460 }, { "epoch": 1.4876584389593062, "ref_ce_loss": 0.267397940158844, "step": 4460 }, { "epoch": 1.4876584389593062, "loss": 1.1916965246200562, "step": 4460 }, { "ce_loss": 0.3574371337890625, "epoch": 1.4876584389593062, "step": 4460 }, { "distill_loss": 0.546086311340332, "epoch": 1.4876584389593062, "step": 4460 }, { "epoch": 1.4876584389593062, "ref_ce_loss": 0.2879451811313629, "step": 4460 }, { "epoch": 1.4909939959973315, "loss": 1.2757, "step": 4470 }, { "epoch": 1.4909939959973315, "grad_norm": 3.38885498046875, "step": 4470 }, { "epoch": 1.4909939959973315, "learning_rate": 0.0007706175938824996, "step": 4470 }, { "epoch": 1.4909939959973315, "loss": 1.2496098279953003, "step": 4470 }, { "ce_loss": 0.3497420847415924, "epoch": 1.4909939959973315, "step": 4470 }, { "distill_loss": 0.5570492744445801, "epoch": 1.4909939959973315, "step": 4470 }, { "epoch": 1.4909939959973315, "ref_ce_loss": 0.2517758309841156, "step": 4470 }, { "epoch": 1.4909939959973315, "loss": 0.9816231727600098, "step": 4470 }, { "ce_loss": 0.3141824007034302, "epoch": 1.4909939959973315, "step": 4470 }, { "distill_loss": 0.46462568640708923, "epoch": 1.4909939959973315, "step": 4470 }, { "epoch": 1.4909939959973315, "ref_ce_loss": 0.20259299874305725, "step": 4470 }, { "epoch": 1.4943295530353569, "loss": 1.1332, "step": 4480 }, { "epoch": 1.4943295530353569, "grad_norm": 2.002901554107666, "step": 4480 }, { "epoch": 1.4943295530353569, "learning_rate": 0.0007704548157977514, "step": 4480 }, { "epoch": 1.4943295530353569, "loss": 1.007076382637024, "step": 4480 }, { "ce_loss": 0.29891398549079895, "epoch": 1.4943295530353569, "step": 4480 }, { "distill_loss": 0.49870485067367554, "epoch": 1.4943295530353569, "step": 4480 }, { "epoch": 1.4943295530353569, "ref_ce_loss": 0.20919978618621826, "step": 4480 }, { "epoch": 1.4943295530353569, "loss": 1.2025349140167236, "step": 4480 }, { "ce_loss": 0.3246881663799286, "epoch": 1.4943295530353569, "step": 4480 }, { "distill_loss": 0.4924752712249756, "epoch": 1.4943295530353569, "step": 4480 }, { "epoch": 1.4943295530353569, "ref_ce_loss": 0.2761264741420746, "step": 4480 }, { "epoch": 1.4976651100733822, "loss": 1.2163, "step": 4490 }, { "epoch": 1.4976651100733822, "grad_norm": 1.9824186563491821, "step": 4490 }, { "epoch": 1.4976651100733822, "learning_rate": 0.0007702916053523705, "step": 4490 }, { "epoch": 1.4976651100733822, "loss": 1.797337532043457, "step": 4490 }, { "ce_loss": 0.40063029527664185, "epoch": 1.4976651100733822, "step": 4490 }, { "distill_loss": 0.4892476201057434, "epoch": 1.4976651100733822, "step": 4490 }, { "epoch": 1.4976651100733822, "ref_ce_loss": 0.2173793464899063, "step": 4490 }, { "epoch": 1.4976651100733822, "loss": 1.2050611972808838, "step": 4490 }, { "ce_loss": 0.3333210349082947, "epoch": 1.4976651100733822, "step": 4490 }, { "distill_loss": 0.44667676091194153, "epoch": 1.4976651100733822, "step": 4490 }, { "epoch": 1.4976651100733822, "ref_ce_loss": 0.22997619211673737, "step": 4490 }, { "epoch": 1.5010006671114076, "loss": 1.295, "step": 4500 }, { "epoch": 1.5010006671114076, "grad_norm": 2.122441053390503, "step": 4500 }, { "epoch": 1.5010006671114076, "learning_rate": 0.0007701279627368411, "step": 4500 }, { "epoch": 1.5010006671114076, "loss": 1.2669577598571777, "step": 4500 }, { "ce_loss": 0.35459598898887634, "epoch": 1.5010006671114076, "step": 4500 }, { "distill_loss": 0.5124510526657104, "epoch": 1.5010006671114076, "step": 4500 }, { "epoch": 1.5010006671114076, "ref_ce_loss": 0.24606196582317352, "step": 4500 }, { "epoch": 1.5010006671114076, "loss": 1.0956400632858276, "step": 4500 }, { "ce_loss": 0.30307912826538086, "epoch": 1.5010006671114076, "step": 4500 }, { "distill_loss": 0.5664011240005493, "epoch": 1.5010006671114076, "step": 4500 }, { "epoch": 1.5010006671114076, "ref_ce_loss": 0.22324898838996887, "step": 4500 }, { "epoch": 1.504336224149433, "loss": 1.2063, "step": 4510 }, { "epoch": 1.504336224149433, "grad_norm": 2.289616346359253, "step": 4510 }, { "epoch": 1.504336224149433, "learning_rate": 0.0007699638881421518, "step": 4510 }, { "epoch": 1.504336224149433, "loss": 1.3379403352737427, "step": 4510 }, { "ce_loss": 0.3895055651664734, "epoch": 1.504336224149433, "step": 4510 }, { "distill_loss": 0.549110472202301, "epoch": 1.504336224149433, "step": 4510 }, { "epoch": 1.504336224149433, "ref_ce_loss": 0.33225470781326294, "step": 4510 }, { "epoch": 1.504336224149433, "loss": 0.9041282534599304, "step": 4510 }, { "ce_loss": 0.2577150762081146, "epoch": 1.504336224149433, "step": 4510 }, { "distill_loss": 0.4566647410392761, "epoch": 1.504336224149433, "step": 4510 }, { "epoch": 1.504336224149433, "ref_ce_loss": 0.18841959536075592, "step": 4510 }, { "epoch": 1.5076717811874583, "loss": 1.2233, "step": 4520 }, { "epoch": 1.5076717811874583, "grad_norm": 9.080262184143066, "step": 4520 }, { "epoch": 1.5076717811874583, "learning_rate": 0.0007697993817597952, "step": 4520 }, { "epoch": 1.5076717811874583, "loss": 1.5081042051315308, "step": 4520 }, { "ce_loss": 0.5217739939689636, "epoch": 1.5076717811874583, "step": 4520 }, { "distill_loss": 0.6341196298599243, "epoch": 1.5076717811874583, "step": 4520 }, { "epoch": 1.5076717811874583, "ref_ce_loss": 0.35214540362358093, "step": 4520 }, { "epoch": 1.5076717811874583, "loss": 2.01505446434021, "step": 4520 }, { "ce_loss": 0.4766079783439636, "epoch": 1.5076717811874583, "step": 4520 }, { "distill_loss": 0.5420026779174805, "epoch": 1.5076717811874583, "step": 4520 }, { "epoch": 1.5076717811874583, "ref_ce_loss": 0.27477625012397766, "step": 4520 }, { "epoch": 1.5110073382254836, "loss": 1.3597, "step": 4530 }, { "epoch": 1.5110073382254836, "grad_norm": 3.787122964859009, "step": 4530 }, { "epoch": 1.5110073382254836, "learning_rate": 0.0007696344437817681, "step": 4530 }, { "epoch": 1.5110073382254836, "loss": 1.5112301111221313, "step": 4530 }, { "ce_loss": 0.3507489264011383, "epoch": 1.5110073382254836, "step": 4530 }, { "distill_loss": 0.5297791361808777, "epoch": 1.5110073382254836, "step": 4530 }, { "epoch": 1.5110073382254836, "ref_ce_loss": 0.23945032060146332, "step": 4530 }, { "epoch": 1.5110073382254836, "loss": 1.102178692817688, "step": 4530 }, { "ce_loss": 0.3480914235115051, "epoch": 1.5110073382254836, "step": 4530 }, { "distill_loss": 0.5078696012496948, "epoch": 1.5110073382254836, "step": 4530 }, { "epoch": 1.5110073382254836, "ref_ce_loss": 0.24594002962112427, "step": 4530 }, { "epoch": 1.514342895263509, "loss": 1.2106, "step": 4540 }, { "epoch": 1.514342895263509, "grad_norm": 1.8164446353912354, "step": 4540 }, { "epoch": 1.514342895263509, "learning_rate": 0.0007694690744005707, "step": 4540 }, { "epoch": 1.514342895263509, "loss": 1.1036242246627808, "step": 4540 }, { "ce_loss": 0.3030671179294586, "epoch": 1.514342895263509, "step": 4540 }, { "distill_loss": 0.5098708868026733, "epoch": 1.514342895263509, "step": 4540 }, { "epoch": 1.514342895263509, "ref_ce_loss": 0.2081867754459381, "step": 4540 }, { "epoch": 1.514342895263509, "loss": 1.0559051036834717, "step": 4540 }, { "ce_loss": 0.2896508276462555, "epoch": 1.514342895263509, "step": 4540 }, { "distill_loss": 0.4736049771308899, "epoch": 1.514342895263509, "step": 4540 }, { "epoch": 1.514342895263509, "ref_ce_loss": 0.1540573388338089, "step": 4540 }, { "epoch": 1.5176784523015343, "loss": 1.2321, "step": 4550 }, { "epoch": 1.5176784523015343, "grad_norm": 1.8194334506988525, "step": 4550 }, { "epoch": 1.5176784523015343, "learning_rate": 0.000769303273809207, "step": 4550 }, { "epoch": 1.5176784523015343, "loss": 1.0852607488632202, "step": 4550 }, { "ce_loss": 0.30452650785446167, "epoch": 1.5176784523015343, "step": 4550 }, { "distill_loss": 0.44049713015556335, "epoch": 1.5176784523015343, "step": 4550 }, { "epoch": 1.5176784523015343, "ref_ce_loss": 0.20696301758289337, "step": 4550 }, { "epoch": 1.5176784523015343, "loss": 1.345840334892273, "step": 4550 }, { "ce_loss": 0.22905874252319336, "epoch": 1.5176784523015343, "step": 4550 }, { "distill_loss": 0.5314184427261353, "epoch": 1.5176784523015343, "step": 4550 }, { "epoch": 1.5176784523015343, "ref_ce_loss": 0.19444139301776886, "step": 4550 }, { "epoch": 1.5210140093395597, "loss": 1.2086, "step": 4560 }, { "epoch": 1.5210140093395597, "grad_norm": 2.125244140625, "step": 4560 }, { "epoch": 1.5210140093395597, "learning_rate": 0.0007691370422011842, "step": 4560 }, { "epoch": 1.5210140093395597, "loss": 1.3244200944900513, "step": 4560 }, { "ce_loss": 0.2872719168663025, "epoch": 1.5210140093395597, "step": 4560 }, { "distill_loss": 0.42128679156303406, "epoch": 1.5210140093395597, "step": 4560 }, { "epoch": 1.5210140093395597, "ref_ce_loss": 0.25759992003440857, "step": 4560 }, { "epoch": 1.5210140093395597, "loss": 1.0789515972137451, "step": 4560 }, { "ce_loss": 0.3220730125904083, "epoch": 1.5210140093395597, "step": 4560 }, { "distill_loss": 0.4512641429901123, "epoch": 1.5210140093395597, "step": 4560 }, { "epoch": 1.5210140093395597, "ref_ce_loss": 0.23262079060077667, "step": 4560 }, { "epoch": 1.524349566377585, "loss": 1.1744, "step": 4570 }, { "epoch": 1.524349566377585, "grad_norm": 1.373600721359253, "step": 4570 }, { "epoch": 1.524349566377585, "learning_rate": 0.0007689703797705122, "step": 4570 }, { "epoch": 1.524349566377585, "loss": 0.9655213952064514, "step": 4570 }, { "ce_loss": 0.2914479076862335, "epoch": 1.524349566377585, "step": 4570 }, { "distill_loss": 0.4219920039176941, "epoch": 1.524349566377585, "step": 4570 }, { "epoch": 1.524349566377585, "ref_ce_loss": 0.186455637216568, "step": 4570 }, { "epoch": 1.524349566377585, "loss": 1.125288963317871, "step": 4570 }, { "ce_loss": 0.3586792051792145, "epoch": 1.524349566377585, "step": 4570 }, { "distill_loss": 0.48464423418045044, "epoch": 1.524349566377585, "step": 4570 }, { "epoch": 1.524349566377585, "ref_ce_loss": 0.20336459577083588, "step": 4570 }, { "epoch": 1.5276851234156104, "loss": 1.2156, "step": 4580 }, { "epoch": 1.5276851234156104, "grad_norm": 1.8237141370773315, "step": 4580 }, { "epoch": 1.5276851234156104, "learning_rate": 0.0007688032867117043, "step": 4580 }, { "epoch": 1.5276851234156104, "loss": 1.5945961475372314, "step": 4580 }, { "ce_loss": 0.35956457257270813, "epoch": 1.5276851234156104, "step": 4580 }, { "distill_loss": 0.49926865100860596, "epoch": 1.5276851234156104, "step": 4580 }, { "epoch": 1.5276851234156104, "ref_ce_loss": 0.19571208953857422, "step": 4580 }, { "epoch": 1.5276851234156104, "loss": 1.1089470386505127, "step": 4580 }, { "ce_loss": 0.38314369320869446, "epoch": 1.5276851234156104, "step": 4580 }, { "distill_loss": 0.4588963985443115, "epoch": 1.5276851234156104, "step": 4580 }, { "epoch": 1.5276851234156104, "ref_ce_loss": 0.26634490489959717, "step": 4580 }, { "epoch": 1.5310206804536357, "loss": 1.235, "step": 4590 }, { "epoch": 1.5310206804536357, "grad_norm": 1.6946427822113037, "step": 4590 }, { "epoch": 1.5310206804536357, "learning_rate": 0.0007686357632197758, "step": 4590 }, { "epoch": 1.5310206804536357, "loss": 1.077806830406189, "step": 4590 }, { "ce_loss": 0.31069520115852356, "epoch": 1.5310206804536357, "step": 4590 }, { "distill_loss": 0.42448943853378296, "epoch": 1.5310206804536357, "step": 4590 }, { "epoch": 1.5310206804536357, "ref_ce_loss": 0.28154149651527405, "step": 4590 }, { "epoch": 1.5310206804536357, "loss": 0.9597331285476685, "step": 4590 }, { "ce_loss": 0.3159964382648468, "epoch": 1.5310206804536357, "step": 4590 }, { "distill_loss": 0.45832157135009766, "epoch": 1.5310206804536357, "step": 4590 }, { "epoch": 1.5310206804536357, "ref_ce_loss": 0.18516969680786133, "step": 4590 }, { "epoch": 1.534356237491661, "loss": 1.2692, "step": 4600 }, { "epoch": 1.534356237491661, "grad_norm": 3.984046220779419, "step": 4600 }, { "epoch": 1.534356237491661, "learning_rate": 0.0007684678094902449, "step": 4600 }, { "epoch": 1.534356237491661, "loss": 1.0595605373382568, "step": 4600 }, { "ce_loss": 0.3087334632873535, "epoch": 1.534356237491661, "step": 4600 }, { "distill_loss": 0.46225982904434204, "epoch": 1.534356237491661, "step": 4600 }, { "epoch": 1.534356237491661, "ref_ce_loss": 0.22030363976955414, "step": 4600 }, { "epoch": 1.534356237491661, "loss": 0.9732711911201477, "step": 4600 }, { "ce_loss": 0.3353942334651947, "epoch": 1.534356237491661, "step": 4600 }, { "distill_loss": 0.3921261727809906, "epoch": 1.534356237491661, "step": 4600 }, { "epoch": 1.534356237491661, "ref_ce_loss": 0.2052212506532669, "step": 4600 }, { "epoch": 1.5376917945296864, "loss": 1.1144, "step": 4610 }, { "epoch": 1.5376917945296864, "grad_norm": 1.938663125038147, "step": 4610 }, { "epoch": 1.5376917945296864, "learning_rate": 0.0007682994257191315, "step": 4610 }, { "epoch": 1.5376917945296864, "loss": 0.9791405200958252, "step": 4610 }, { "ce_loss": 0.31135374307632446, "epoch": 1.5376917945296864, "step": 4610 }, { "distill_loss": 0.39572346210479736, "epoch": 1.5376917945296864, "step": 4610 }, { "epoch": 1.5376917945296864, "ref_ce_loss": 0.2084600180387497, "step": 4610 }, { "epoch": 1.5376917945296864, "loss": 1.0885542631149292, "step": 4610 }, { "ce_loss": 0.287092924118042, "epoch": 1.5376917945296864, "step": 4610 }, { "distill_loss": 0.4916094243526459, "epoch": 1.5376917945296864, "step": 4610 }, { "epoch": 1.5376917945296864, "ref_ce_loss": 0.2237331122159958, "step": 4610 }, { "epoch": 1.5410273515677118, "loss": 1.113, "step": 4620 }, { "epoch": 1.5410273515677118, "grad_norm": 1.4871138334274292, "step": 4620 }, { "epoch": 1.5410273515677118, "learning_rate": 0.0007681306121029575, "step": 4620 }, { "epoch": 1.5410273515677118, "loss": 1.2581678628921509, "step": 4620 }, { "ce_loss": 0.4004390239715576, "epoch": 1.5410273515677118, "step": 4620 }, { "distill_loss": 0.5559272766113281, "epoch": 1.5410273515677118, "step": 4620 }, { "epoch": 1.5410273515677118, "ref_ce_loss": 0.25150373578071594, "step": 4620 }, { "epoch": 1.5410273515677118, "loss": 1.5844061374664307, "step": 4620 }, { "ce_loss": 0.34186816215515137, "epoch": 1.5410273515677118, "step": 4620 }, { "distill_loss": 0.5579522848129272, "epoch": 1.5410273515677118, "step": 4620 }, { "epoch": 1.5410273515677118, "ref_ce_loss": 0.20324687659740448, "step": 4620 }, { "epoch": 1.544362908605737, "loss": 1.2223, "step": 4630 }, { "epoch": 1.544362908605737, "grad_norm": 2.3152780532836914, "step": 4630 }, { "epoch": 1.544362908605737, "learning_rate": 0.0007679613688387468, "step": 4630 }, { "epoch": 1.544362908605737, "loss": 1.0994701385498047, "step": 4630 }, { "ce_loss": 0.289509117603302, "epoch": 1.544362908605737, "step": 4630 }, { "distill_loss": 0.45211362838745117, "epoch": 1.544362908605737, "step": 4630 }, { "epoch": 1.544362908605737, "ref_ce_loss": 0.1917085349559784, "step": 4630 }, { "epoch": 1.544362908605737, "loss": 0.9867453575134277, "step": 4630 }, { "ce_loss": 0.3251807689666748, "epoch": 1.544362908605737, "step": 4630 }, { "distill_loss": 0.46963727474212646, "epoch": 1.544362908605737, "step": 4630 }, { "epoch": 1.544362908605737, "ref_ce_loss": 0.18415512144565582, "step": 4630 }, { "epoch": 1.5476984656437625, "loss": 1.1549, "step": 4640 }, { "epoch": 1.5476984656437625, "grad_norm": 1.5950928926467896, "step": 4640 }, { "epoch": 1.5476984656437625, "learning_rate": 0.0007677916961240245, "step": 4640 }, { "epoch": 1.5476984656437625, "loss": 0.9006496667861938, "step": 4640 }, { "ce_loss": 0.29129329323768616, "epoch": 1.5476984656437625, "step": 4640 }, { "distill_loss": 0.4002491235733032, "epoch": 1.5476984656437625, "step": 4640 }, { "epoch": 1.5476984656437625, "ref_ce_loss": 0.20863546431064606, "step": 4640 }, { "epoch": 1.5476984656437625, "loss": 1.0415515899658203, "step": 4640 }, { "ce_loss": 0.2967177927494049, "epoch": 1.5476984656437625, "step": 4640 }, { "distill_loss": 0.46538567543029785, "epoch": 1.5476984656437625, "step": 4640 }, { "epoch": 1.5476984656437625, "ref_ce_loss": 0.20359046757221222, "step": 4640 }, { "epoch": 1.5510340226817878, "loss": 1.1216, "step": 4650 }, { "epoch": 1.5510340226817878, "grad_norm": 1.950021505355835, "step": 4650 }, { "epoch": 1.5510340226817878, "learning_rate": 0.0007676215941568166, "step": 4650 }, { "epoch": 1.5510340226817878, "loss": 0.9957625865936279, "step": 4650 }, { "ce_loss": 0.2855859696865082, "epoch": 1.5510340226817878, "step": 4650 }, { "distill_loss": 0.44514599442481995, "epoch": 1.5510340226817878, "step": 4650 }, { "epoch": 1.5510340226817878, "ref_ce_loss": 0.18354029953479767, "step": 4650 }, { "epoch": 1.5510340226817878, "loss": 1.527466893196106, "step": 4650 }, { "ce_loss": 0.34161245822906494, "epoch": 1.5510340226817878, "step": 4650 }, { "distill_loss": 0.4497413635253906, "epoch": 1.5510340226817878, "step": 4650 }, { "epoch": 1.5510340226817878, "ref_ce_loss": 0.27534714341163635, "step": 4650 }, { "epoch": 1.5543695797198132, "loss": 1.1575, "step": 4660 }, { "epoch": 1.5543695797198132, "grad_norm": 1.8270336389541626, "step": 4660 }, { "epoch": 1.5543695797198132, "learning_rate": 0.0007674510631356506, "step": 4660 }, { "epoch": 1.5543695797198132, "loss": 0.9707221388816833, "step": 4660 }, { "ce_loss": 0.2743615508079529, "epoch": 1.5543695797198132, "step": 4660 }, { "distill_loss": 0.48221203684806824, "epoch": 1.5543695797198132, "step": 4660 }, { "epoch": 1.5543695797198132, "ref_ce_loss": 0.15772660076618195, "step": 4660 }, { "epoch": 1.5543695797198132, "loss": 1.6755247116088867, "step": 4660 }, { "ce_loss": 0.4013010561466217, "epoch": 1.5543695797198132, "step": 4660 }, { "distill_loss": 0.5076295137405396, "epoch": 1.5543695797198132, "step": 4660 }, { "epoch": 1.5543695797198132, "ref_ce_loss": 0.2690885663032532, "step": 4660 }, { "epoch": 1.5577051367578385, "loss": 1.3004, "step": 4670 }, { "epoch": 1.5577051367578385, "grad_norm": 1.6369752883911133, "step": 4670 }, { "epoch": 1.5577051367578385, "learning_rate": 0.0007672801032595547, "step": 4670 }, { "epoch": 1.5577051367578385, "loss": 1.2731767892837524, "step": 4670 }, { "ce_loss": 0.33239999413490295, "epoch": 1.5577051367578385, "step": 4670 }, { "distill_loss": 0.514566957950592, "epoch": 1.5577051367578385, "step": 4670 }, { "epoch": 1.5577051367578385, "ref_ce_loss": 0.22566305100917816, "step": 4670 }, { "epoch": 1.5577051367578385, "loss": 1.146661400794983, "step": 4670 }, { "ce_loss": 0.26267436146736145, "epoch": 1.5577051367578385, "step": 4670 }, { "distill_loss": 0.4635869264602661, "epoch": 1.5577051367578385, "step": 4670 }, { "epoch": 1.5577051367578385, "ref_ce_loss": 0.21978121995925903, "step": 4670 }, { "epoch": 1.5610406937958639, "loss": 1.2349, "step": 4680 }, { "epoch": 1.5610406937958639, "grad_norm": 1.743242859840393, "step": 4680 }, { "epoch": 1.5610406937958639, "learning_rate": 0.0007671087147280572, "step": 4680 }, { "epoch": 1.5610406937958639, "loss": 1.3634655475616455, "step": 4680 }, { "ce_loss": 0.36991485953330994, "epoch": 1.5610406937958639, "step": 4680 }, { "distill_loss": 0.5084767937660217, "epoch": 1.5610406937958639, "step": 4680 }, { "epoch": 1.5610406937958639, "ref_ce_loss": 0.26672112941741943, "step": 4680 }, { "epoch": 1.5610406937958639, "loss": 1.1525386571884155, "step": 4680 }, { "ce_loss": 0.35368019342422485, "epoch": 1.5610406937958639, "step": 4680 }, { "distill_loss": 0.4598918855190277, "epoch": 1.5610406937958639, "step": 4680 }, { "epoch": 1.5610406937958639, "ref_ce_loss": 0.23088416457176208, "step": 4680 }, { "epoch": 1.5643762508338894, "loss": 1.223, "step": 4690 }, { "epoch": 1.5643762508338894, "grad_norm": 1.8734859228134155, "step": 4690 }, { "epoch": 1.5643762508338894, "learning_rate": 0.0007669368977411871, "step": 4690 }, { "epoch": 1.5643762508338894, "loss": 1.6650550365447998, "step": 4690 }, { "ce_loss": 0.39589375257492065, "epoch": 1.5643762508338894, "step": 4690 }, { "distill_loss": 0.5713174343109131, "epoch": 1.5643762508338894, "step": 4690 }, { "epoch": 1.5643762508338894, "ref_ce_loss": 0.311811238527298, "step": 4690 }, { "epoch": 1.5643762508338894, "loss": 1.2223482131958008, "step": 4690 }, { "ce_loss": 0.389156311750412, "epoch": 1.5643762508338894, "step": 4690 }, { "distill_loss": 0.4965554177761078, "epoch": 1.5643762508338894, "step": 4690 }, { "epoch": 1.5643762508338894, "ref_ce_loss": 0.27140820026397705, "step": 4690 }, { "epoch": 1.5677118078719148, "loss": 1.2665, "step": 4700 }, { "epoch": 1.5677118078719148, "grad_norm": 2.6322362422943115, "step": 4700 }, { "epoch": 1.5677118078719148, "learning_rate": 0.0007667646524994734, "step": 4700 }, { "epoch": 1.5677118078719148, "loss": 1.1102492809295654, "step": 4700 }, { "ce_loss": 0.31784892082214355, "epoch": 1.5677118078719148, "step": 4700 }, { "distill_loss": 0.4729698598384857, "epoch": 1.5677118078719148, "step": 4700 }, { "epoch": 1.5677118078719148, "ref_ce_loss": 0.25654831528663635, "step": 4700 }, { "epoch": 1.5677118078719148, "loss": 0.9936020374298096, "step": 4700 }, { "ce_loss": 0.30014559626579285, "epoch": 1.5677118078719148, "step": 4700 }, { "distill_loss": 0.41301512718200684, "epoch": 1.5677118078719148, "step": 4700 }, { "epoch": 1.5677118078719148, "ref_ce_loss": 0.21143920719623566, "step": 4700 }, { "epoch": 1.5710473649099401, "loss": 1.2849, "step": 4710 }, { "epoch": 1.5710473649099401, "grad_norm": 2.960231304168701, "step": 4710 }, { "epoch": 1.5710473649099401, "learning_rate": 0.0007665919792039447, "step": 4710 }, { "epoch": 1.5710473649099401, "loss": 1.1995261907577515, "step": 4710 }, { "ce_loss": 0.29931750893592834, "epoch": 1.5710473649099401, "step": 4710 }, { "distill_loss": 0.4858446717262268, "epoch": 1.5710473649099401, "step": 4710 }, { "epoch": 1.5710473649099401, "ref_ce_loss": 0.20732131600379944, "step": 4710 }, { "epoch": 1.5710473649099401, "loss": 1.3857035636901855, "step": 4710 }, { "ce_loss": 0.35007715225219727, "epoch": 1.5710473649099401, "step": 4710 }, { "distill_loss": 0.49830907583236694, "epoch": 1.5710473649099401, "step": 4710 }, { "epoch": 1.5710473649099401, "ref_ce_loss": 0.2315230369567871, "step": 4710 }, { "epoch": 1.5743829219479655, "loss": 1.1131, "step": 4720 }, { "epoch": 1.5743829219479655, "grad_norm": 2.1521153450012207, "step": 4720 }, { "epoch": 1.5743829219479655, "learning_rate": 0.0007664188780561292, "step": 4720 }, { "epoch": 1.5743829219479655, "loss": 1.097166657447815, "step": 4720 }, { "ce_loss": 0.3242321312427521, "epoch": 1.5743829219479655, "step": 4720 }, { "distill_loss": 0.47403573989868164, "epoch": 1.5743829219479655, "step": 4720 }, { "epoch": 1.5743829219479655, "ref_ce_loss": 0.23136167228221893, "step": 4720 }, { "epoch": 1.5743829219479655, "loss": 1.1047563552856445, "step": 4720 }, { "ce_loss": 0.3445529639720917, "epoch": 1.5743829219479655, "step": 4720 }, { "distill_loss": 0.4420076012611389, "epoch": 1.5743829219479655, "step": 4720 }, { "epoch": 1.5743829219479655, "ref_ce_loss": 0.2560592293739319, "step": 4720 }, { "epoch": 1.5777184789859908, "loss": 1.2472, "step": 4730 }, { "epoch": 1.5777184789859908, "grad_norm": 1.5923607349395752, "step": 4730 }, { "epoch": 1.5777184789859908, "learning_rate": 0.0007662453492580548, "step": 4730 }, { "epoch": 1.5777184789859908, "loss": 1.2345513105392456, "step": 4730 }, { "ce_loss": 0.3786817789077759, "epoch": 1.5777184789859908, "step": 4730 }, { "distill_loss": 0.42504045367240906, "epoch": 1.5777184789859908, "step": 4730 }, { "epoch": 1.5777184789859908, "ref_ce_loss": 0.23074667155742645, "step": 4730 }, { "epoch": 1.5777184789859908, "loss": 1.123155951499939, "step": 4730 }, { "ce_loss": 0.37243664264678955, "epoch": 1.5777184789859908, "step": 4730 }, { "distill_loss": 0.432323157787323, "epoch": 1.5777184789859908, "step": 4730 }, { "epoch": 1.5777184789859908, "ref_ce_loss": 0.24366344511508942, "step": 4730 }, { "epoch": 1.5810540360240162, "loss": 1.198, "step": 4740 }, { "epoch": 1.5810540360240162, "grad_norm": 2.146322727203369, "step": 4740 }, { "epoch": 1.5810540360240162, "learning_rate": 0.0007660713930122482, "step": 4740 }, { "epoch": 1.5810540360240162, "loss": 1.0306843519210815, "step": 4740 }, { "ce_loss": 0.2571704089641571, "epoch": 1.5810540360240162, "step": 4740 }, { "distill_loss": 0.4897782802581787, "epoch": 1.5810540360240162, "step": 4740 }, { "epoch": 1.5810540360240162, "ref_ce_loss": 0.18662314116954803, "step": 4740 }, { "epoch": 1.5810540360240162, "loss": 1.0882079601287842, "step": 4740 }, { "ce_loss": 0.29918238520622253, "epoch": 1.5810540360240162, "step": 4740 }, { "distill_loss": 0.5254595875740051, "epoch": 1.5810540360240162, "step": 4740 }, { "epoch": 1.5810540360240162, "ref_ce_loss": 0.17846040427684784, "step": 4740 }, { "epoch": 1.5843895930620415, "loss": 1.1687, "step": 4750 }, { "epoch": 1.5843895930620415, "grad_norm": 1.7574557065963745, "step": 4750 }, { "epoch": 1.5843895930620415, "learning_rate": 0.0007658970095217349, "step": 4750 }, { "epoch": 1.5843895930620415, "loss": 1.7387831211090088, "step": 4750 }, { "ce_loss": 0.3886146545410156, "epoch": 1.5843895930620415, "step": 4750 }, { "distill_loss": 0.5086261630058289, "epoch": 1.5843895930620415, "step": 4750 }, { "epoch": 1.5843895930620415, "ref_ce_loss": 0.24735936522483826, "step": 4750 }, { "epoch": 1.5843895930620415, "loss": 1.527781367301941, "step": 4750 }, { "ce_loss": 0.35894984006881714, "epoch": 1.5843895930620415, "step": 4750 }, { "distill_loss": 0.5893746614456177, "epoch": 1.5843895930620415, "step": 4750 }, { "epoch": 1.5843895930620415, "ref_ce_loss": 0.2735004127025604, "step": 4750 }, { "epoch": 1.5877251501000669, "loss": 1.3334, "step": 4760 }, { "epoch": 1.5877251501000669, "grad_norm": 1.800559401512146, "step": 4760 }, { "epoch": 1.5877251501000669, "learning_rate": 0.0007657221989900394, "step": 4760 }, { "epoch": 1.5877251501000669, "loss": 1.137058138847351, "step": 4760 }, { "ce_loss": 0.33247408270835876, "epoch": 1.5877251501000669, "step": 4760 }, { "distill_loss": 0.4471021592617035, "epoch": 1.5877251501000669, "step": 4760 }, { "epoch": 1.5877251501000669, "ref_ce_loss": 0.2743755877017975, "step": 4760 }, { "epoch": 1.5877251501000669, "loss": 1.2338804006576538, "step": 4760 }, { "ce_loss": 0.30527469515800476, "epoch": 1.5877251501000669, "step": 4760 }, { "distill_loss": 0.43496543169021606, "epoch": 1.5877251501000669, "step": 4760 }, { "epoch": 1.5877251501000669, "ref_ce_loss": 0.23000425100326538, "step": 4760 }, { "epoch": 1.5910607071380922, "loss": 1.158, "step": 4770 }, { "epoch": 1.5910607071380922, "grad_norm": 1.7711436748504639, "step": 4770 }, { "epoch": 1.5910607071380922, "learning_rate": 0.0007655469616211845, "step": 4770 }, { "epoch": 1.5910607071380922, "loss": 1.3238584995269775, "step": 4770 }, { "ce_loss": 0.4081491529941559, "epoch": 1.5910607071380922, "step": 4770 }, { "distill_loss": 0.5282933115959167, "epoch": 1.5910607071380922, "step": 4770 }, { "epoch": 1.5910607071380922, "ref_ce_loss": 0.28045764565467834, "step": 4770 }, { "epoch": 1.5910607071380922, "loss": 1.1987744569778442, "step": 4770 }, { "ce_loss": 0.25979167222976685, "epoch": 1.5910607071380922, "step": 4770 }, { "distill_loss": 0.4336746335029602, "epoch": 1.5910607071380922, "step": 4770 }, { "epoch": 1.5910607071380922, "ref_ce_loss": 0.2212361842393875, "step": 4770 }, { "epoch": 1.5943962641761176, "loss": 1.1415, "step": 4780 }, { "epoch": 1.5943962641761176, "grad_norm": 1.8768396377563477, "step": 4780 }, { "epoch": 1.5943962641761176, "learning_rate": 0.0007653712976196909, "step": 4780 }, { "epoch": 1.5943962641761176, "loss": 1.8086743354797363, "step": 4780 }, { "ce_loss": 0.27868327498435974, "epoch": 1.5943962641761176, "step": 4780 }, { "distill_loss": 0.4092644155025482, "epoch": 1.5943962641761176, "step": 4780 }, { "epoch": 1.5943962641761176, "ref_ce_loss": 0.17147424817085266, "step": 4780 }, { "epoch": 1.5943962641761176, "loss": 0.9440048336982727, "step": 4780 }, { "ce_loss": 0.29341205954551697, "epoch": 1.5943962641761176, "step": 4780 }, { "distill_loss": 0.4141649305820465, "epoch": 1.5943962641761176, "step": 4780 }, { "epoch": 1.5943962641761176, "ref_ce_loss": 0.23555238544940948, "step": 4780 }, { "epoch": 1.597731821214143, "loss": 1.2361, "step": 4790 }, { "epoch": 1.597731821214143, "grad_norm": 1.5691404342651367, "step": 4790 }, { "epoch": 1.597731821214143, "learning_rate": 0.0007651952071905772, "step": 4790 }, { "epoch": 1.597731821214143, "loss": 1.4893643856048584, "step": 4790 }, { "ce_loss": 0.2698959708213806, "epoch": 1.597731821214143, "step": 4790 }, { "distill_loss": 0.5141409039497375, "epoch": 1.597731821214143, "step": 4790 }, { "epoch": 1.597731821214143, "ref_ce_loss": 0.190774604678154, "step": 4790 }, { "epoch": 1.597731821214143, "loss": 1.882727026939392, "step": 4790 }, { "ce_loss": 0.283547043800354, "epoch": 1.597731821214143, "step": 4790 }, { "distill_loss": 0.4522022306919098, "epoch": 1.597731821214143, "step": 4790 }, { "epoch": 1.597731821214143, "ref_ce_loss": 0.259072482585907, "step": 4790 }, { "epoch": 1.6010673782521683, "loss": 1.2438, "step": 4800 }, { "epoch": 1.6010673782521683, "grad_norm": 1.6348069906234741, "step": 4800 }, { "epoch": 1.6010673782521683, "learning_rate": 0.0007650186905393602, "step": 4800 }, { "epoch": 1.6010673782521683, "loss": 1.0375322103500366, "step": 4800 }, { "ce_loss": 0.3192451298236847, "epoch": 1.6010673782521683, "step": 4800 }, { "distill_loss": 0.4521045684814453, "epoch": 1.6010673782521683, "step": 4800 }, { "epoch": 1.6010673782521683, "ref_ce_loss": 0.2652686834335327, "step": 4800 }, { "epoch": 1.6010673782521683, "loss": 0.933064877986908, "step": 4800 }, { "ce_loss": 0.29700523614883423, "epoch": 1.6010673782521683, "step": 4800 }, { "distill_loss": 0.4245816171169281, "epoch": 1.6010673782521683, "step": 4800 }, { "epoch": 1.6010673782521683, "ref_ce_loss": 0.21075178682804108, "step": 4800 }, { "epoch": 1.6044029352901936, "loss": 1.1014, "step": 4810 }, { "epoch": 1.6044029352901936, "grad_norm": 2.032665252685547, "step": 4810 }, { "epoch": 1.6044029352901936, "learning_rate": 0.0007648417478720537, "step": 4810 }, { "epoch": 1.6044029352901936, "loss": 1.0235782861709595, "step": 4810 }, { "ce_loss": 0.284810334444046, "epoch": 1.6044029352901936, "step": 4810 }, { "distill_loss": 0.3628327250480652, "epoch": 1.6044029352901936, "step": 4810 }, { "epoch": 1.6044029352901936, "ref_ce_loss": 0.19773633778095245, "step": 4810 }, { "epoch": 1.6044029352901936, "loss": 1.0101426839828491, "step": 4810 }, { "ce_loss": 0.328506737947464, "epoch": 1.6044029352901936, "step": 4810 }, { "distill_loss": 0.41719162464141846, "epoch": 1.6044029352901936, "step": 4810 }, { "epoch": 1.6044029352901936, "ref_ce_loss": 0.23638367652893066, "step": 4810 }, { "epoch": 1.607738492328219, "loss": 1.2109, "step": 4820 }, { "epoch": 1.607738492328219, "grad_norm": 2.2429139614105225, "step": 4820 }, { "epoch": 1.607738492328219, "learning_rate": 0.0007646643793951688, "step": 4820 }, { "epoch": 1.607738492328219, "loss": 1.043318510055542, "step": 4820 }, { "ce_loss": 0.30978885293006897, "epoch": 1.607738492328219, "step": 4820 }, { "distill_loss": 0.47267794609069824, "epoch": 1.607738492328219, "step": 4820 }, { "epoch": 1.607738492328219, "ref_ce_loss": 0.20902498066425323, "step": 4820 }, { "epoch": 1.607738492328219, "loss": 1.0189939737319946, "step": 4820 }, { "ce_loss": 0.33055123686790466, "epoch": 1.607738492328219, "step": 4820 }, { "distill_loss": 0.4063712954521179, "epoch": 1.607738492328219, "step": 4820 }, { "epoch": 1.607738492328219, "ref_ce_loss": 0.20739997923374176, "step": 4820 }, { "epoch": 1.6110740493662443, "loss": 1.2053, "step": 4830 }, { "epoch": 1.6110740493662443, "grad_norm": 1.7878544330596924, "step": 4830 }, { "epoch": 1.6110740493662443, "learning_rate": 0.0007644865853157135, "step": 4830 }, { "epoch": 1.6110740493662443, "loss": 1.2280665636062622, "step": 4830 }, { "ce_loss": 0.29879072308540344, "epoch": 1.6110740493662443, "step": 4830 }, { "distill_loss": 0.6149335503578186, "epoch": 1.6110740493662443, "step": 4830 }, { "epoch": 1.6110740493662443, "ref_ce_loss": 0.22780798375606537, "step": 4830 }, { "epoch": 1.6110740493662443, "loss": 1.250707983970642, "step": 4830 }, { "ce_loss": 0.3582846522331238, "epoch": 1.6110740493662443, "step": 4830 }, { "distill_loss": 0.5356194972991943, "epoch": 1.6110740493662443, "step": 4830 }, { "epoch": 1.6110740493662443, "ref_ce_loss": 0.29270094633102417, "step": 4830 }, { "epoch": 1.6144096064042697, "loss": 1.1639, "step": 4840 }, { "epoch": 1.6144096064042697, "grad_norm": 2.3316078186035156, "step": 4840 }, { "epoch": 1.6144096064042697, "learning_rate": 0.0007643083658411931, "step": 4840 }, { "epoch": 1.6144096064042697, "loss": 1.0784870386123657, "step": 4840 }, { "ce_loss": 0.24972647428512573, "epoch": 1.6144096064042697, "step": 4840 }, { "distill_loss": 0.4576243460178375, "epoch": 1.6144096064042697, "step": 4840 }, { "epoch": 1.6144096064042697, "ref_ce_loss": 0.18615812063217163, "step": 4840 }, { "epoch": 1.6144096064042697, "loss": 1.7174890041351318, "step": 4840 }, { "ce_loss": 0.3938414454460144, "epoch": 1.6144096064042697, "step": 4840 }, { "distill_loss": 0.3543025553226471, "epoch": 1.6144096064042697, "step": 4840 }, { "epoch": 1.6144096064042697, "ref_ce_loss": 0.2732624113559723, "step": 4840 }, { "epoch": 1.617745163442295, "loss": 1.1165, "step": 4850 }, { "epoch": 1.617745163442295, "grad_norm": 2.677441358566284, "step": 4850 }, { "epoch": 1.617745163442295, "learning_rate": 0.0007641297211796083, "step": 4850 }, { "epoch": 1.617745163442295, "loss": 1.179903507232666, "step": 4850 }, { "ce_loss": 0.26890891790390015, "epoch": 1.617745163442295, "step": 4850 }, { "distill_loss": 0.37936243414878845, "epoch": 1.617745163442295, "step": 4850 }, { "epoch": 1.617745163442295, "ref_ce_loss": 0.18884652853012085, "step": 4850 }, { "epoch": 1.617745163442295, "loss": 1.218037486076355, "step": 4850 }, { "ce_loss": 0.376087486743927, "epoch": 1.617745163442295, "step": 4850 }, { "distill_loss": 0.4474377930164337, "epoch": 1.617745163442295, "step": 4850 }, { "epoch": 1.617745163442295, "ref_ce_loss": 0.23854652047157288, "step": 4850 }, { "epoch": 1.6210807204803204, "loss": 1.1608, "step": 4860 }, { "epoch": 1.6210807204803204, "grad_norm": 1.8439033031463623, "step": 4860 }, { "epoch": 1.6210807204803204, "learning_rate": 0.000763950651539457, "step": 4860 }, { "epoch": 1.6210807204803204, "loss": 1.029344081878662, "step": 4860 }, { "ce_loss": 0.32084864377975464, "epoch": 1.6210807204803204, "step": 4860 }, { "distill_loss": 0.4226093292236328, "epoch": 1.6210807204803204, "step": 4860 }, { "epoch": 1.6210807204803204, "ref_ce_loss": 0.21213941276073456, "step": 4860 }, { "epoch": 1.6210807204803204, "loss": 1.2948123216629028, "step": 4860 }, { "ce_loss": 0.35812410712242126, "epoch": 1.6210807204803204, "step": 4860 }, { "distill_loss": 0.443713903427124, "epoch": 1.6210807204803204, "step": 4860 }, { "epoch": 1.6210807204803204, "ref_ce_loss": 0.2382977157831192, "step": 4860 }, { "epoch": 1.6244162775183457, "loss": 1.1872, "step": 4870 }, { "epoch": 1.6244162775183457, "grad_norm": 1.9559777975082397, "step": 4870 }, { "epoch": 1.6244162775183457, "learning_rate": 0.0007637711571297326, "step": 4870 }, { "epoch": 1.6244162775183457, "loss": 1.845041036605835, "step": 4870 }, { "ce_loss": 0.3581797182559967, "epoch": 1.6244162775183457, "step": 4870 }, { "distill_loss": 0.5245206356048584, "epoch": 1.6244162775183457, "step": 4870 }, { "epoch": 1.6244162775183457, "ref_ce_loss": 0.2347164750099182, "step": 4870 }, { "epoch": 1.6244162775183457, "loss": 1.1421406269073486, "step": 4870 }, { "ce_loss": 0.3097943067550659, "epoch": 1.6244162775183457, "step": 4870 }, { "distill_loss": 0.4998328685760498, "epoch": 1.6244162775183457, "step": 4870 }, { "epoch": 1.6244162775183457, "ref_ce_loss": 0.24155227839946747, "step": 4870 }, { "epoch": 1.627751834556371, "loss": 1.2329, "step": 4880 }, { "epoch": 1.627751834556371, "grad_norm": 2.061876058578491, "step": 4880 }, { "epoch": 1.627751834556371, "learning_rate": 0.0007635912381599244, "step": 4880 }, { "epoch": 1.627751834556371, "loss": 1.2822391986846924, "step": 4880 }, { "ce_loss": 0.33181074261665344, "epoch": 1.627751834556371, "step": 4880 }, { "distill_loss": 0.557192325592041, "epoch": 1.627751834556371, "step": 4880 }, { "epoch": 1.627751834556371, "ref_ce_loss": 0.2948383092880249, "step": 4880 }, { "epoch": 1.627751834556371, "loss": 1.1364002227783203, "step": 4880 }, { "ce_loss": 0.3164156377315521, "epoch": 1.627751834556371, "step": 4880 }, { "distill_loss": 0.5235283374786377, "epoch": 1.627751834556371, "step": 4880 }, { "epoch": 1.627751834556371, "ref_ce_loss": 0.2043505609035492, "step": 4880 }, { "epoch": 1.6310873915943964, "loss": 1.2234, "step": 4890 }, { "epoch": 1.6310873915943964, "grad_norm": 1.7999014854431152, "step": 4890 }, { "epoch": 1.6310873915943964, "learning_rate": 0.0007634108948400174, "step": 4890 }, { "epoch": 1.6310873915943964, "loss": 1.3725650310516357, "step": 4890 }, { "ce_loss": 0.34826570749282837, "epoch": 1.6310873915943964, "step": 4890 }, { "distill_loss": 0.44967207312583923, "epoch": 1.6310873915943964, "step": 4890 }, { "epoch": 1.6310873915943964, "ref_ce_loss": 0.2176557332277298, "step": 4890 }, { "epoch": 1.6310873915943964, "loss": 1.3029837608337402, "step": 4890 }, { "ce_loss": 0.3555228114128113, "epoch": 1.6310873915943964, "step": 4890 }, { "distill_loss": 0.5428327322006226, "epoch": 1.6310873915943964, "step": 4890 }, { "epoch": 1.6310873915943964, "ref_ce_loss": 0.22554075717926025, "step": 4890 }, { "epoch": 1.6344229486324218, "loss": 1.2092, "step": 4900 }, { "epoch": 1.6344229486324218, "grad_norm": 1.869430422782898, "step": 4900 }, { "epoch": 1.6344229486324218, "learning_rate": 0.0007632301273804913, "step": 4900 }, { "epoch": 1.6344229486324218, "loss": 1.1500366926193237, "step": 4900 }, { "ce_loss": 0.36326518654823303, "epoch": 1.6344229486324218, "step": 4900 }, { "distill_loss": 0.5155556797981262, "epoch": 1.6344229486324218, "step": 4900 }, { "epoch": 1.6344229486324218, "ref_ce_loss": 0.20150108635425568, "step": 4900 }, { "epoch": 1.6344229486324218, "loss": 1.1529258489608765, "step": 4900 }, { "ce_loss": 0.29777246713638306, "epoch": 1.6344229486324218, "step": 4900 }, { "distill_loss": 0.4624185264110565, "epoch": 1.6344229486324218, "step": 4900 }, { "epoch": 1.6344229486324218, "ref_ce_loss": 0.21998101472854614, "step": 4900 }, { "epoch": 1.6377585056704471, "loss": 1.169, "step": 4910 }, { "epoch": 1.6377585056704471, "grad_norm": 1.7126103639602661, "step": 4910 }, { "epoch": 1.6377585056704471, "learning_rate": 0.0007630489359923214, "step": 4910 }, { "epoch": 1.6377585056704471, "loss": 1.02724289894104, "step": 4910 }, { "ce_loss": 0.31032949686050415, "epoch": 1.6377585056704471, "step": 4910 }, { "distill_loss": 0.4123621881008148, "epoch": 1.6377585056704471, "step": 4910 }, { "epoch": 1.6377585056704471, "ref_ce_loss": 0.2039308249950409, "step": 4910 }, { "epoch": 1.6377585056704471, "loss": 1.234750747680664, "step": 4910 }, { "ce_loss": 0.3182434141635895, "epoch": 1.6377585056704471, "step": 4910 }, { "distill_loss": 0.4671993851661682, "epoch": 1.6377585056704471, "step": 4910 }, { "epoch": 1.6377585056704471, "ref_ce_loss": 0.2012316882610321, "step": 4910 }, { "epoch": 1.6410940627084725, "loss": 1.2431, "step": 4920 }, { "epoch": 1.6410940627084725, "grad_norm": 2.990509271621704, "step": 4920 }, { "epoch": 1.6410940627084725, "learning_rate": 0.0007628673208869777, "step": 4920 }, { "epoch": 1.6410940627084725, "loss": 1.2088675498962402, "step": 4920 }, { "ce_loss": 0.34323716163635254, "epoch": 1.6410940627084725, "step": 4920 }, { "distill_loss": 0.514014720916748, "epoch": 1.6410940627084725, "step": 4920 }, { "epoch": 1.6410940627084725, "ref_ce_loss": 0.2605898678302765, "step": 4920 }, { "epoch": 1.6410940627084725, "loss": 1.0380889177322388, "step": 4920 }, { "ce_loss": 0.3201424181461334, "epoch": 1.6410940627084725, "step": 4920 }, { "distill_loss": 0.5008715987205505, "epoch": 1.6410940627084725, "step": 4920 }, { "epoch": 1.6410940627084725, "ref_ce_loss": 0.21638834476470947, "step": 4920 }, { "epoch": 1.6444296197464978, "loss": 1.231, "step": 4930 }, { "epoch": 1.6444296197464978, "grad_norm": 1.714496374130249, "step": 4930 }, { "epoch": 1.6444296197464978, "learning_rate": 0.0007626852822764242, "step": 4930 }, { "epoch": 1.6444296197464978, "loss": 1.2255754470825195, "step": 4930 }, { "ce_loss": 0.2979726195335388, "epoch": 1.6444296197464978, "step": 4930 }, { "distill_loss": 0.463115394115448, "epoch": 1.6444296197464978, "step": 4930 }, { "epoch": 1.6444296197464978, "ref_ce_loss": 0.24635154008865356, "step": 4930 }, { "epoch": 1.6444296197464978, "loss": 1.2807600498199463, "step": 4930 }, { "ce_loss": 0.35597312450408936, "epoch": 1.6444296197464978, "step": 4930 }, { "distill_loss": 0.5844821929931641, "epoch": 1.6444296197464978, "step": 4930 }, { "epoch": 1.6444296197464978, "ref_ce_loss": 0.23208926618099213, "step": 4930 }, { "epoch": 1.6477651767845232, "loss": 1.2145, "step": 4940 }, { "epoch": 1.6477651767845232, "grad_norm": 3.78121018409729, "step": 4940 }, { "epoch": 1.6477651767845232, "learning_rate": 0.0007625028203731197, "step": 4940 }, { "epoch": 1.6477651767845232, "loss": 1.1447784900665283, "step": 4940 }, { "ce_loss": 0.3200160264968872, "epoch": 1.6477651767845232, "step": 4940 }, { "distill_loss": 0.50001060962677, "epoch": 1.6477651767845232, "step": 4940 }, { "epoch": 1.6477651767845232, "ref_ce_loss": 0.238821342587471, "step": 4940 }, { "epoch": 1.6477651767845232, "loss": 0.9587218165397644, "step": 4940 }, { "ce_loss": 0.2661581039428711, "epoch": 1.6477651767845232, "step": 4940 }, { "distill_loss": 0.4619063436985016, "epoch": 1.6477651767845232, "step": 4940 }, { "epoch": 1.6477651767845232, "ref_ce_loss": 0.2296360731124878, "step": 4940 }, { "epoch": 1.6511007338225485, "loss": 1.1663, "step": 4950 }, { "epoch": 1.6511007338225485, "grad_norm": 1.8753292560577393, "step": 4950 }, { "epoch": 1.6511007338225485, "learning_rate": 0.000762319935390017, "step": 4950 }, { "epoch": 1.6511007338225485, "loss": 1.0324651002883911, "step": 4950 }, { "ce_loss": 0.2743676006793976, "epoch": 1.6511007338225485, "step": 4950 }, { "distill_loss": 0.45579105615615845, "epoch": 1.6511007338225485, "step": 4950 }, { "epoch": 1.6511007338225485, "ref_ce_loss": 0.24305647611618042, "step": 4950 }, { "epoch": 1.6511007338225485, "loss": 1.320138692855835, "step": 4950 }, { "ce_loss": 0.30213528871536255, "epoch": 1.6511007338225485, "step": 4950 }, { "distill_loss": 0.5041840076446533, "epoch": 1.6511007338225485, "step": 4950 }, { "epoch": 1.6511007338225485, "ref_ce_loss": 0.2646632790565491, "step": 4950 }, { "epoch": 1.6544362908605739, "loss": 1.1035, "step": 4960 }, { "epoch": 1.6544362908605739, "grad_norm": 1.4800277948379517, "step": 4960 }, { "epoch": 1.6544362908605739, "learning_rate": 0.0007621366275405624, "step": 4960 }, { "epoch": 1.6544362908605739, "loss": 1.3182570934295654, "step": 4960 }, { "ce_loss": 0.32608532905578613, "epoch": 1.6544362908605739, "step": 4960 }, { "distill_loss": 0.45269620418548584, "epoch": 1.6544362908605739, "step": 4960 }, { "epoch": 1.6544362908605739, "ref_ce_loss": 0.33297863602638245, "step": 4960 }, { "epoch": 1.6544362908605739, "loss": 1.1951959133148193, "step": 4960 }, { "ce_loss": 0.3463919460773468, "epoch": 1.6544362908605739, "step": 4960 }, { "distill_loss": 0.4839607775211334, "epoch": 1.6544362908605739, "step": 4960 }, { "epoch": 1.6544362908605739, "ref_ce_loss": 0.2843335270881653, "step": 4960 }, { "epoch": 1.6577718478985992, "loss": 1.1683, "step": 4970 }, { "epoch": 1.6577718478985992, "grad_norm": 1.8846877813339233, "step": 4970 }, { "epoch": 1.6577718478985992, "learning_rate": 0.000761952897038696, "step": 4970 }, { "epoch": 1.6577718478985992, "loss": 0.9753700494766235, "step": 4970 }, { "ce_loss": 0.3265887498855591, "epoch": 1.6577718478985992, "step": 4970 }, { "distill_loss": 0.40698859095573425, "epoch": 1.6577718478985992, "step": 4970 }, { "epoch": 1.6577718478985992, "ref_ce_loss": 0.24157138168811798, "step": 4970 }, { "epoch": 1.6577718478985992, "loss": 1.1718640327453613, "step": 4970 }, { "ce_loss": 0.30149784684181213, "epoch": 1.6577718478985992, "step": 4970 }, { "distill_loss": 0.4266360402107239, "epoch": 1.6577718478985992, "step": 4970 }, { "epoch": 1.6577718478985992, "ref_ce_loss": 0.1909862905740738, "step": 4970 }, { "epoch": 1.6611074049366246, "loss": 1.0753, "step": 4980 }, { "epoch": 1.6611074049366246, "grad_norm": 1.8321624994277954, "step": 4980 }, { "epoch": 1.6611074049366246, "learning_rate": 0.000761768744098851, "step": 4980 }, { "epoch": 1.6611074049366246, "loss": 1.1958895921707153, "step": 4980 }, { "ce_loss": 0.3138892352581024, "epoch": 1.6611074049366246, "step": 4980 }, { "distill_loss": 0.4766066074371338, "epoch": 1.6611074049366246, "step": 4980 }, { "epoch": 1.6611074049366246, "ref_ce_loss": 0.19226688146591187, "step": 4980 }, { "epoch": 1.6611074049366246, "loss": 1.039772868156433, "step": 4980 }, { "ce_loss": 0.34119418263435364, "epoch": 1.6611074049366246, "step": 4980 }, { "distill_loss": 0.46169552206993103, "epoch": 1.6611074049366246, "step": 4980 }, { "epoch": 1.6611074049366246, "ref_ce_loss": 0.2307923138141632, "step": 4980 }, { "epoch": 1.66444296197465, "loss": 1.1579, "step": 4990 }, { "epoch": 1.66444296197465, "grad_norm": 1.930245041847229, "step": 4990 }, { "epoch": 1.66444296197465, "learning_rate": 0.0007615841689359537, "step": 4990 }, { "epoch": 1.66444296197465, "loss": 1.1153697967529297, "step": 4990 }, { "ce_loss": 0.33053624629974365, "epoch": 1.66444296197465, "step": 4990 }, { "distill_loss": 0.47470927238464355, "epoch": 1.66444296197465, "step": 4990 }, { "epoch": 1.66444296197465, "ref_ce_loss": 0.23434945940971375, "step": 4990 }, { "epoch": 1.66444296197465, "loss": 0.9403101801872253, "step": 4990 }, { "ce_loss": 0.22530633211135864, "epoch": 1.66444296197465, "step": 4990 }, { "distill_loss": 0.45416656136512756, "epoch": 1.66444296197465, "step": 4990 }, { "epoch": 1.66444296197465, "ref_ce_loss": 0.16067540645599365, "step": 4990 }, { "epoch": 1.6677785190126753, "loss": 1.1993, "step": 5000 }, { "epoch": 1.6677785190126753, "grad_norm": 1.5704511404037476, "step": 5000 }, { "epoch": 1.6677785190126753, "learning_rate": 0.0007613991717654232, "step": 5000 }, { "epoch": 1.6677785190126753, "loss": 1.2045124769210815, "step": 5000 }, { "ce_loss": 0.2704031467437744, "epoch": 1.6677785190126753, "step": 5000 }, { "distill_loss": 0.4578080177307129, "epoch": 1.6677785190126753, "step": 5000 }, { "epoch": 1.6677785190126753, "ref_ce_loss": 0.2710844874382019, "step": 5000 }, { "epoch": 1.6677785190126753, "loss": 1.0595126152038574, "step": 5000 }, { "ce_loss": 0.30270177125930786, "epoch": 1.6677785190126753, "step": 5000 }, { "distill_loss": 0.4562907814979553, "epoch": 1.6677785190126753, "step": 5000 }, { "epoch": 1.6677785190126753, "ref_ce_loss": 0.24846094846725464, "step": 5000 }, { "epoch": 1.6711140760507006, "loss": 1.2138, "step": 5010 }, { "epoch": 1.6711140760507006, "grad_norm": 1.6759746074676514, "step": 5010 }, { "epoch": 1.6711140760507006, "learning_rate": 0.0007612137528031712, "step": 5010 }, { "epoch": 1.6711140760507006, "loss": 1.137813925743103, "step": 5010 }, { "ce_loss": 0.34316080808639526, "epoch": 1.6711140760507006, "step": 5010 }, { "distill_loss": 0.5127457976341248, "epoch": 1.6711140760507006, "step": 5010 }, { "epoch": 1.6711140760507006, "ref_ce_loss": 0.21678480505943298, "step": 5010 }, { "epoch": 1.6711140760507006, "loss": 1.0579105615615845, "step": 5010 }, { "ce_loss": 0.23692962527275085, "epoch": 1.6711140760507006, "step": 5010 }, { "distill_loss": 0.4204230010509491, "epoch": 1.6711140760507006, "step": 5010 }, { "epoch": 1.6711140760507006, "ref_ce_loss": 0.21300899982452393, "step": 5010 }, { "epoch": 1.674449633088726, "loss": 1.16, "step": 5020 }, { "epoch": 1.674449633088726, "grad_norm": 1.626281976699829, "step": 5020 }, { "epoch": 1.674449633088726, "learning_rate": 0.0007610279122656013, "step": 5020 }, { "epoch": 1.674449633088726, "loss": 1.0852789878845215, "step": 5020 }, { "ce_loss": 0.3337577283382416, "epoch": 1.674449633088726, "step": 5020 }, { "distill_loss": 0.408643901348114, "epoch": 1.674449633088726, "step": 5020 }, { "epoch": 1.674449633088726, "ref_ce_loss": 0.2511367201805115, "step": 5020 }, { "epoch": 1.674449633088726, "loss": 1.2534074783325195, "step": 5020 }, { "ce_loss": 0.30116984248161316, "epoch": 1.674449633088726, "step": 5020 }, { "distill_loss": 0.5019400119781494, "epoch": 1.674449633088726, "step": 5020 }, { "epoch": 1.674449633088726, "ref_ce_loss": 0.1927444487810135, "step": 5020 }, { "epoch": 1.6777851901267513, "loss": 1.223, "step": 5030 }, { "epoch": 1.6777851901267513, "grad_norm": 1.8005539178848267, "step": 5030 }, { "epoch": 1.6777851901267513, "learning_rate": 0.0007608416503696096, "step": 5030 }, { "epoch": 1.6777851901267513, "loss": 1.5476301908493042, "step": 5030 }, { "ce_loss": 0.4201894700527191, "epoch": 1.6777851901267513, "step": 5030 }, { "distill_loss": 0.5770529508590698, "epoch": 1.6777851901267513, "step": 5030 }, { "epoch": 1.6777851901267513, "ref_ce_loss": 0.26863935589790344, "step": 5030 }, { "epoch": 1.6777851901267513, "loss": 1.5807050466537476, "step": 5030 }, { "ce_loss": 0.3773467242717743, "epoch": 1.6777851901267513, "step": 5030 }, { "distill_loss": 0.44635558128356934, "epoch": 1.6777851901267513, "step": 5030 }, { "epoch": 1.6777851901267513, "ref_ce_loss": 0.31287744641304016, "step": 5030 }, { "epoch": 1.6811207471647767, "loss": 1.2613, "step": 5040 }, { "epoch": 1.6811207471647767, "grad_norm": 1.4061245918273926, "step": 5040 }, { "epoch": 1.6811207471647767, "learning_rate": 0.0007606549673325838, "step": 5040 }, { "epoch": 1.6811207471647767, "loss": 1.2903411388397217, "step": 5040 }, { "ce_loss": 0.2738383710384369, "epoch": 1.6811207471647767, "step": 5040 }, { "distill_loss": 0.4529511630535126, "epoch": 1.6811207471647767, "step": 5040 }, { "epoch": 1.6811207471647767, "ref_ce_loss": 0.23904402554035187, "step": 5040 }, { "epoch": 1.6811207471647767, "loss": 0.8651126027107239, "step": 5040 }, { "ce_loss": 0.22598311305046082, "epoch": 1.6811207471647767, "step": 5040 }, { "distill_loss": 0.38961902260780334, "epoch": 1.6811207471647767, "step": 5040 }, { "epoch": 1.6811207471647767, "ref_ce_loss": 0.21298851072788239, "step": 5040 }, { "epoch": 1.684456304202802, "loss": 1.1926, "step": 5050 }, { "epoch": 1.684456304202802, "grad_norm": 2.3637235164642334, "step": 5050 }, { "epoch": 1.684456304202802, "learning_rate": 0.000760467863372403, "step": 5050 }, { "epoch": 1.684456304202802, "loss": 0.7955895662307739, "step": 5050 }, { "ce_loss": 0.2268964648246765, "epoch": 1.684456304202802, "step": 5050 }, { "distill_loss": 0.34705016016960144, "epoch": 1.684456304202802, "step": 5050 }, { "epoch": 1.684456304202802, "ref_ce_loss": 0.2212955206632614, "step": 5050 }, { "epoch": 1.684456304202802, "loss": 1.1423490047454834, "step": 5050 }, { "ce_loss": 0.30988848209381104, "epoch": 1.684456304202802, "step": 5050 }, { "distill_loss": 0.44475507736206055, "epoch": 1.684456304202802, "step": 5050 }, { "epoch": 1.684456304202802, "ref_ce_loss": 0.30063945055007935, "step": 5050 }, { "epoch": 1.6877918612408274, "loss": 1.1629, "step": 5060 }, { "epoch": 1.6877918612408274, "grad_norm": 1.8391002416610718, "step": 5060 }, { "epoch": 1.6877918612408274, "learning_rate": 0.0007602803387074378, "step": 5060 }, { "epoch": 1.6877918612408274, "loss": 1.0280489921569824, "step": 5060 }, { "ce_loss": 0.31085681915283203, "epoch": 1.6877918612408274, "step": 5060 }, { "distill_loss": 0.48093947768211365, "epoch": 1.6877918612408274, "step": 5060 }, { "epoch": 1.6877918612408274, "ref_ce_loss": 0.2359740436077118, "step": 5060 }, { "epoch": 1.6877918612408274, "loss": 1.0623470544815063, "step": 5060 }, { "ce_loss": 0.3287794589996338, "epoch": 1.6877918612408274, "step": 5060 }, { "distill_loss": 0.4346166253089905, "epoch": 1.6877918612408274, "step": 5060 }, { "epoch": 1.6877918612408274, "ref_ce_loss": 0.2352973371744156, "step": 5060 }, { "epoch": 1.6911274182788527, "loss": 1.1754, "step": 5070 }, { "epoch": 1.6911274182788527, "grad_norm": 1.6396242380142212, "step": 5070 }, { "epoch": 1.6911274182788527, "learning_rate": 0.0007600923935565494, "step": 5070 }, { "epoch": 1.6911274182788527, "loss": 1.1601217985153198, "step": 5070 }, { "ce_loss": 0.308447927236557, "epoch": 1.6911274182788527, "step": 5070 }, { "distill_loss": 0.5223665237426758, "epoch": 1.6911274182788527, "step": 5070 }, { "epoch": 1.6911274182788527, "ref_ce_loss": 0.21539926528930664, "step": 5070 }, { "epoch": 1.6911274182788527, "loss": 1.4861093759536743, "step": 5070 }, { "ce_loss": 0.3695334196090698, "epoch": 1.6911274182788527, "step": 5070 }, { "distill_loss": 0.5409172773361206, "epoch": 1.6911274182788527, "step": 5070 }, { "epoch": 1.6911274182788527, "ref_ce_loss": 0.21675078570842743, "step": 5070 }, { "epoch": 1.694462975316878, "loss": 1.2464, "step": 5080 }, { "epoch": 1.694462975316878, "grad_norm": 1.831031084060669, "step": 5080 }, { "epoch": 1.694462975316878, "learning_rate": 0.0007599040281390903, "step": 5080 }, { "epoch": 1.694462975316878, "loss": 1.2833508253097534, "step": 5080 }, { "ce_loss": 0.25858020782470703, "epoch": 1.694462975316878, "step": 5080 }, { "distill_loss": 0.5383179187774658, "epoch": 1.694462975316878, "step": 5080 }, { "epoch": 1.694462975316878, "ref_ce_loss": 0.25638070702552795, "step": 5080 }, { "epoch": 1.694462975316878, "loss": 1.1427491903305054, "step": 5080 }, { "ce_loss": 0.35265713930130005, "epoch": 1.694462975316878, "step": 5080 }, { "distill_loss": 0.47813865542411804, "epoch": 1.694462975316878, "step": 5080 }, { "epoch": 1.694462975316878, "ref_ce_loss": 0.24728532135486603, "step": 5080 }, { "epoch": 1.6977985323549034, "loss": 1.2015, "step": 5090 }, { "epoch": 1.6977985323549034, "grad_norm": 2.529123067855835, "step": 5090 }, { "epoch": 1.6977985323549034, "learning_rate": 0.0007597152426749031, "step": 5090 }, { "epoch": 1.6977985323549034, "loss": 1.0560879707336426, "step": 5090 }, { "ce_loss": 0.2905693054199219, "epoch": 1.6977985323549034, "step": 5090 }, { "distill_loss": 0.43988847732543945, "epoch": 1.6977985323549034, "step": 5090 }, { "epoch": 1.6977985323549034, "ref_ce_loss": 0.2370666116476059, "step": 5090 }, { "epoch": 1.6977985323549034, "loss": 1.214661955833435, "step": 5090 }, { "ce_loss": 0.36676928400993347, "epoch": 1.6977985323549034, "step": 5090 }, { "distill_loss": 0.5318382382392883, "epoch": 1.6977985323549034, "step": 5090 }, { "epoch": 1.6977985323549034, "ref_ce_loss": 0.2559376657009125, "step": 5090 }, { "epoch": 1.7011340893929288, "loss": 1.213, "step": 5100 }, { "epoch": 1.7011340893929288, "grad_norm": 1.7899972200393677, "step": 5100 }, { "epoch": 1.7011340893929288, "learning_rate": 0.0007595260373843205, "step": 5100 }, { "epoch": 1.7011340893929288, "loss": 1.218098759651184, "step": 5100 }, { "ce_loss": 0.32043206691741943, "epoch": 1.7011340893929288, "step": 5100 }, { "distill_loss": 0.5054154992103577, "epoch": 1.7011340893929288, "step": 5100 }, { "epoch": 1.7011340893929288, "ref_ce_loss": 0.22206301987171173, "step": 5100 }, { "epoch": 1.7011340893929288, "loss": 1.1606087684631348, "step": 5100 }, { "ce_loss": 0.31909579038619995, "epoch": 1.7011340893929288, "step": 5100 }, { "distill_loss": 0.49823999404907227, "epoch": 1.7011340893929288, "step": 5100 }, { "epoch": 1.7011340893929288, "ref_ce_loss": 0.2567177712917328, "step": 5100 }, { "epoch": 1.704469646430954, "loss": 1.1741, "step": 5110 }, { "epoch": 1.704469646430954, "grad_norm": 1.9370585680007935, "step": 5110 }, { "epoch": 1.704469646430954, "learning_rate": 0.0007593364124881659, "step": 5110 }, { "epoch": 1.704469646430954, "loss": 1.0405614376068115, "step": 5110 }, { "ce_loss": 0.3165607154369354, "epoch": 1.704469646430954, "step": 5110 }, { "distill_loss": 0.47016096115112305, "epoch": 1.704469646430954, "step": 5110 }, { "epoch": 1.704469646430954, "ref_ce_loss": 0.2531571686267853, "step": 5110 }, { "epoch": 1.704469646430954, "loss": 1.1778936386108398, "step": 5110 }, { "ce_loss": 0.32279154658317566, "epoch": 1.704469646430954, "step": 5110 }, { "distill_loss": 0.5456894040107727, "epoch": 1.704469646430954, "step": 5110 }, { "epoch": 1.704469646430954, "ref_ce_loss": 0.20104186236858368, "step": 5110 }, { "epoch": 1.7078052034689795, "loss": 1.2278, "step": 5120 }, { "epoch": 1.7078052034689795, "grad_norm": 2.5454940795898438, "step": 5120 }, { "epoch": 1.7078052034689795, "learning_rate": 0.0007591463682077518, "step": 5120 }, { "epoch": 1.7078052034689795, "loss": 0.8619414567947388, "step": 5120 }, { "ce_loss": 0.21449235081672668, "epoch": 1.7078052034689795, "step": 5120 }, { "distill_loss": 0.39205607771873474, "epoch": 1.7078052034689795, "step": 5120 }, { "epoch": 1.7078052034689795, "ref_ce_loss": 0.1672857403755188, "step": 5120 }, { "epoch": 1.7078052034689795, "loss": 1.0508708953857422, "step": 5120 }, { "ce_loss": 0.2673969864845276, "epoch": 1.7078052034689795, "step": 5120 }, { "distill_loss": 0.49973276257514954, "epoch": 1.7078052034689795, "step": 5120 }, { "epoch": 1.7078052034689795, "ref_ce_loss": 0.19507500529289246, "step": 5120 }, { "epoch": 1.7111407605070048, "loss": 1.2024, "step": 5130 }, { "epoch": 1.7111407605070048, "grad_norm": 1.88835608959198, "step": 5130 }, { "epoch": 1.7111407605070048, "learning_rate": 0.0007589559047648801, "step": 5130 }, { "epoch": 1.7111407605070048, "loss": 1.2697545289993286, "step": 5130 }, { "ce_loss": 0.3176894783973694, "epoch": 1.7111407605070048, "step": 5130 }, { "distill_loss": 0.5580767393112183, "epoch": 1.7111407605070048, "step": 5130 }, { "epoch": 1.7111407605070048, "ref_ce_loss": 0.18888889253139496, "step": 5130 }, { "epoch": 1.7111407605070048, "loss": 1.4602257013320923, "step": 5130 }, { "ce_loss": 0.43063437938690186, "epoch": 1.7111407605070048, "step": 5130 }, { "distill_loss": 0.4556039273738861, "epoch": 1.7111407605070048, "step": 5130 }, { "epoch": 1.7111407605070048, "ref_ce_loss": 0.3002569377422333, "step": 5130 }, { "epoch": 1.7144763175450302, "loss": 1.1297, "step": 5140 }, { "epoch": 1.7144763175450302, "grad_norm": 1.6927906274795532, "step": 5140 }, { "epoch": 1.7144763175450302, "learning_rate": 0.0007587650223818422, "step": 5140 }, { "epoch": 1.7144763175450302, "loss": 1.0018633604049683, "step": 5140 }, { "ce_loss": 0.2813844382762909, "epoch": 1.7144763175450302, "step": 5140 }, { "distill_loss": 0.38585925102233887, "epoch": 1.7144763175450302, "step": 5140 }, { "epoch": 1.7144763175450302, "ref_ce_loss": 0.17963579297065735, "step": 5140 }, { "epoch": 1.7144763175450302, "loss": 0.9852960705757141, "step": 5140 }, { "ce_loss": 0.251751184463501, "epoch": 1.7144763175450302, "step": 5140 }, { "distill_loss": 0.37681418657302856, "epoch": 1.7144763175450302, "step": 5140 }, { "epoch": 1.7144763175450302, "ref_ce_loss": 0.20930209755897522, "step": 5140 }, { "epoch": 1.7178118745830555, "loss": 1.1583, "step": 5150 }, { "epoch": 1.7178118745830555, "grad_norm": 1.387619972229004, "step": 5150 }, { "epoch": 1.7178118745830555, "learning_rate": 0.0007585737212814186, "step": 5150 }, { "epoch": 1.7178118745830555, "loss": 1.509414553642273, "step": 5150 }, { "ce_loss": 0.37400636076927185, "epoch": 1.7178118745830555, "step": 5150 }, { "distill_loss": 0.4108448028564453, "epoch": 1.7178118745830555, "step": 5150 }, { "epoch": 1.7178118745830555, "ref_ce_loss": 0.24562181532382965, "step": 5150 }, { "epoch": 1.7178118745830555, "loss": 1.2409241199493408, "step": 5150 }, { "ce_loss": 0.322540283203125, "epoch": 1.7178118745830555, "step": 5150 }, { "distill_loss": 0.46941179037094116, "epoch": 1.7178118745830555, "step": 5150 }, { "epoch": 1.7178118745830555, "ref_ce_loss": 0.23541799187660217, "step": 5150 }, { "epoch": 1.7211474316210809, "loss": 1.2913, "step": 5160 }, { "epoch": 1.7211474316210809, "grad_norm": 2.25449275970459, "step": 5160 }, { "epoch": 1.7211474316210809, "learning_rate": 0.0007583820016868781, "step": 5160 }, { "epoch": 1.7211474316210809, "loss": 0.9478521943092346, "step": 5160 }, { "ce_loss": 0.30188632011413574, "epoch": 1.7211474316210809, "step": 5160 }, { "distill_loss": 0.4439525604248047, "epoch": 1.7211474316210809, "step": 5160 }, { "epoch": 1.7211474316210809, "ref_ce_loss": 0.20185263454914093, "step": 5160 }, { "epoch": 1.7211474316210809, "loss": 1.008941650390625, "step": 5160 }, { "ce_loss": 0.2736072838306427, "epoch": 1.7211474316210809, "step": 5160 }, { "distill_loss": 0.42032089829444885, "epoch": 1.7211474316210809, "step": 5160 }, { "epoch": 1.7211474316210809, "ref_ce_loss": 0.14703622460365295, "step": 5160 }, { "epoch": 1.7244829886591062, "loss": 1.2241, "step": 5170 }, { "epoch": 1.7244829886591062, "grad_norm": 1.9468799829483032, "step": 5170 }, { "epoch": 1.7244829886591062, "learning_rate": 0.0007581898638219782, "step": 5170 }, { "epoch": 1.7244829886591062, "loss": 1.4961808919906616, "step": 5170 }, { "ce_loss": 0.28084325790405273, "epoch": 1.7244829886591062, "step": 5170 }, { "distill_loss": 0.4227789640426636, "epoch": 1.7244829886591062, "step": 5170 }, { "epoch": 1.7244829886591062, "ref_ce_loss": 0.24790780246257782, "step": 5170 }, { "epoch": 1.7244829886591062, "loss": 1.213461995124817, "step": 5170 }, { "ce_loss": 0.30838051438331604, "epoch": 1.7244829886591062, "step": 5170 }, { "distill_loss": 0.4545673727989197, "epoch": 1.7244829886591062, "step": 5170 }, { "epoch": 1.7244829886591062, "ref_ce_loss": 0.20522762835025787, "step": 5170 }, { "epoch": 1.7278185456971316, "loss": 1.1363, "step": 5180 }, { "epoch": 1.7278185456971316, "grad_norm": 1.6751428842544556, "step": 5180 }, { "epoch": 1.7278185456971316, "learning_rate": 0.0007579973079109644, "step": 5180 }, { "epoch": 1.7278185456971316, "loss": 1.1813774108886719, "step": 5180 }, { "ce_loss": 0.34286630153656006, "epoch": 1.7278185456971316, "step": 5180 }, { "distill_loss": 0.5524317026138306, "epoch": 1.7278185456971316, "step": 5180 }, { "epoch": 1.7278185456971316, "ref_ce_loss": 0.28594598174095154, "step": 5180 }, { "epoch": 1.7278185456971316, "loss": 0.9992787837982178, "step": 5180 }, { "ce_loss": 0.24412690103054047, "epoch": 1.7278185456971316, "step": 5180 }, { "distill_loss": 0.4426809847354889, "epoch": 1.7278185456971316, "step": 5180 }, { "epoch": 1.7278185456971316, "ref_ce_loss": 0.24817222356796265, "step": 5180 }, { "epoch": 1.731154102735157, "loss": 1.1503, "step": 5190 }, { "epoch": 1.731154102735157, "grad_norm": 1.4612666368484497, "step": 5190 }, { "epoch": 1.731154102735157, "learning_rate": 0.0007578043341785701, "step": 5190 }, { "epoch": 1.731154102735157, "loss": 1.0483276844024658, "step": 5190 }, { "ce_loss": 0.3545515835285187, "epoch": 1.731154102735157, "step": 5190 }, { "distill_loss": 0.42343053221702576, "epoch": 1.731154102735157, "step": 5190 }, { "epoch": 1.731154102735157, "ref_ce_loss": 0.22489754855632782, "step": 5190 }, { "epoch": 1.731154102735157, "loss": 0.7794271111488342, "step": 5190 }, { "ce_loss": 0.23218852281570435, "epoch": 1.731154102735157, "step": 5190 }, { "distill_loss": 0.34502241015434265, "epoch": 1.731154102735157, "step": 5190 }, { "epoch": 1.731154102735157, "ref_ce_loss": 0.2018437385559082, "step": 5190 }, { "epoch": 1.7344896597731823, "loss": 1.2091, "step": 5200 }, { "epoch": 1.7344896597731823, "grad_norm": 2.8298192024230957, "step": 5200 }, { "epoch": 1.7344896597731823, "learning_rate": 0.0007576109428500164, "step": 5200 }, { "epoch": 1.7344896597731823, "loss": 1.2747900485992432, "step": 5200 }, { "ce_loss": 0.2895369529724121, "epoch": 1.7344896597731823, "step": 5200 }, { "distill_loss": 0.47683602571487427, "epoch": 1.7344896597731823, "step": 5200 }, { "epoch": 1.7344896597731823, "ref_ce_loss": 0.27042490243911743, "step": 5200 }, { "epoch": 1.7344896597731823, "loss": 1.313528060913086, "step": 5200 }, { "ce_loss": 0.3060389757156372, "epoch": 1.7344896597731823, "step": 5200 }, { "distill_loss": 0.47130000591278076, "epoch": 1.7344896597731823, "step": 5200 }, { "epoch": 1.7344896597731823, "ref_ce_loss": 0.24032670259475708, "step": 5200 }, { "epoch": 1.7378252168112076, "loss": 1.1724, "step": 5210 }, { "epoch": 1.7378252168112076, "grad_norm": 2.6083362102508545, "step": 5210 }, { "epoch": 1.7378252168112076, "learning_rate": 0.0007574171341510119, "step": 5210 }, { "epoch": 1.7378252168112076, "loss": 1.0935707092285156, "step": 5210 }, { "ce_loss": 0.22755658626556396, "epoch": 1.7378252168112076, "step": 5210 }, { "distill_loss": 0.46089592576026917, "epoch": 1.7378252168112076, "step": 5210 }, { "epoch": 1.7378252168112076, "ref_ce_loss": 0.1798480749130249, "step": 5210 }, { "epoch": 1.7378252168112076, "loss": 1.0753840208053589, "step": 5210 }, { "ce_loss": 0.31017637252807617, "epoch": 1.7378252168112076, "step": 5210 }, { "distill_loss": 0.44865307211875916, "epoch": 1.7378252168112076, "step": 5210 }, { "epoch": 1.7378252168112076, "ref_ce_loss": 0.20125728845596313, "step": 5210 }, { "epoch": 1.741160773849233, "loss": 1.1641, "step": 5220 }, { "epoch": 1.741160773849233, "grad_norm": 2.240241050720215, "step": 5220 }, { "epoch": 1.741160773849233, "learning_rate": 0.0007572229083077524, "step": 5220 }, { "epoch": 1.741160773849233, "loss": 1.3574274778366089, "step": 5220 }, { "ce_loss": 0.3337510824203491, "epoch": 1.741160773849233, "step": 5220 }, { "distill_loss": 0.5477548837661743, "epoch": 1.741160773849233, "step": 5220 }, { "epoch": 1.741160773849233, "ref_ce_loss": 0.25798723101615906, "step": 5220 }, { "epoch": 1.741160773849233, "loss": 1.0125255584716797, "step": 5220 }, { "ce_loss": 0.32500120997428894, "epoch": 1.741160773849233, "step": 5220 }, { "distill_loss": 0.43379876017570496, "epoch": 1.741160773849233, "step": 5220 }, { "epoch": 1.741160773849233, "ref_ce_loss": 0.25346094369888306, "step": 5220 }, { "epoch": 1.7444963308872583, "loss": 1.2409, "step": 5230 }, { "epoch": 1.7444963308872583, "grad_norm": 2.328303575515747, "step": 5230 }, { "epoch": 1.7444963308872583, "learning_rate": 0.0007570282655469198, "step": 5230 }, { "epoch": 1.7444963308872583, "loss": 1.5247856378555298, "step": 5230 }, { "ce_loss": 0.32193514704704285, "epoch": 1.7444963308872583, "step": 5230 }, { "distill_loss": 0.4718524217605591, "epoch": 1.7444963308872583, "step": 5230 }, { "epoch": 1.7444963308872583, "ref_ce_loss": 0.25647538900375366, "step": 5230 }, { "epoch": 1.7444963308872583, "loss": 1.5344735383987427, "step": 5230 }, { "ce_loss": 0.37541303038597107, "epoch": 1.7444963308872583, "step": 5230 }, { "distill_loss": 0.47761592268943787, "epoch": 1.7444963308872583, "step": 5230 }, { "epoch": 1.7444963308872583, "ref_ce_loss": 0.19708700478076935, "step": 5230 }, { "epoch": 1.7478318879252837, "loss": 1.2756, "step": 5240 }, { "epoch": 1.7478318879252837, "grad_norm": 1.7134599685668945, "step": 5240 }, { "epoch": 1.7478318879252837, "learning_rate": 0.0007568332060956836, "step": 5240 }, { "epoch": 1.7478318879252837, "loss": 1.1749379634857178, "step": 5240 }, { "ce_loss": 0.31640517711639404, "epoch": 1.7478318879252837, "step": 5240 }, { "distill_loss": 0.45369449257850647, "epoch": 1.7478318879252837, "step": 5240 }, { "epoch": 1.7478318879252837, "ref_ce_loss": 0.22938039898872375, "step": 5240 }, { "epoch": 1.7478318879252837, "loss": 1.536935806274414, "step": 5240 }, { "ce_loss": 0.3823091983795166, "epoch": 1.7478318879252837, "step": 5240 }, { "distill_loss": 0.533094048500061, "epoch": 1.7478318879252837, "step": 5240 }, { "epoch": 1.7478318879252837, "ref_ce_loss": 0.25532063841819763, "step": 5240 }, { "epoch": 1.751167444963309, "loss": 1.2075, "step": 5250 }, { "epoch": 1.751167444963309, "grad_norm": 1.5515145063400269, "step": 5250 }, { "epoch": 1.751167444963309, "learning_rate": 0.0007566377301816992, "step": 5250 }, { "epoch": 1.751167444963309, "loss": 1.1134408712387085, "step": 5250 }, { "ce_loss": 0.28996923565864563, "epoch": 1.751167444963309, "step": 5250 }, { "distill_loss": 0.5056423544883728, "epoch": 1.751167444963309, "step": 5250 }, { "epoch": 1.751167444963309, "ref_ce_loss": 0.22649165987968445, "step": 5250 }, { "epoch": 1.751167444963309, "loss": 1.2687480449676514, "step": 5250 }, { "ce_loss": 0.38387155532836914, "epoch": 1.751167444963309, "step": 5250 }, { "distill_loss": 0.5104761719703674, "epoch": 1.751167444963309, "step": 5250 }, { "epoch": 1.751167444963309, "ref_ce_loss": 0.2832685112953186, "step": 5250 }, { "epoch": 1.7545030020013344, "loss": 1.1648, "step": 5260 }, { "epoch": 1.7545030020013344, "grad_norm": 2.0206077098846436, "step": 5260 }, { "epoch": 1.7545030020013344, "learning_rate": 0.0007564418380331077, "step": 5260 }, { "epoch": 1.7545030020013344, "loss": 0.9798883199691772, "step": 5260 }, { "ce_loss": 0.25137314200401306, "epoch": 1.7545030020013344, "step": 5260 }, { "distill_loss": 0.44658365845680237, "epoch": 1.7545030020013344, "step": 5260 }, { "epoch": 1.7545030020013344, "ref_ce_loss": 0.2106369584798813, "step": 5260 }, { "epoch": 1.7545030020013344, "loss": 1.2164933681488037, "step": 5260 }, { "ce_loss": 0.3413506746292114, "epoch": 1.7545030020013344, "step": 5260 }, { "distill_loss": 0.4681362211704254, "epoch": 1.7545030020013344, "step": 5260 }, { "epoch": 1.7545030020013344, "ref_ce_loss": 0.2433163821697235, "step": 5260 }, { "epoch": 1.7578385590393597, "loss": 1.1118, "step": 5270 }, { "epoch": 1.7578385590393597, "grad_norm": 1.2726492881774902, "step": 5270 }, { "epoch": 1.7578385590393597, "learning_rate": 0.0007562455298785365, "step": 5270 }, { "epoch": 1.7578385590393597, "loss": 0.9748777747154236, "step": 5270 }, { "ce_loss": 0.2741670608520508, "epoch": 1.7578385590393597, "step": 5270 }, { "distill_loss": 0.4492979049682617, "epoch": 1.7578385590393597, "step": 5270 }, { "epoch": 1.7578385590393597, "ref_ce_loss": 0.20096157491207123, "step": 5270 }, { "epoch": 1.7578385590393597, "loss": 0.9771814346313477, "step": 5270 }, { "ce_loss": 0.3123237192630768, "epoch": 1.7578385590393597, "step": 5270 }, { "distill_loss": 0.3900871276855469, "epoch": 1.7578385590393597, "step": 5270 }, { "epoch": 1.7578385590393597, "ref_ce_loss": 0.1981293261051178, "step": 5270 }, { "epoch": 1.761174116077385, "loss": 1.1798, "step": 5280 }, { "epoch": 1.761174116077385, "grad_norm": 1.7693536281585693, "step": 5280 }, { "epoch": 1.761174116077385, "learning_rate": 0.0007560488059470984, "step": 5280 }, { "epoch": 1.761174116077385, "loss": 1.1466625928878784, "step": 5280 }, { "ce_loss": 0.28282761573791504, "epoch": 1.761174116077385, "step": 5280 }, { "distill_loss": 0.506600022315979, "epoch": 1.761174116077385, "step": 5280 }, { "epoch": 1.761174116077385, "ref_ce_loss": 0.19416525959968567, "step": 5280 }, { "epoch": 1.761174116077385, "loss": 1.3348321914672852, "step": 5280 }, { "ce_loss": 0.40944698452949524, "epoch": 1.761174116077385, "step": 5280 }, { "distill_loss": 0.5854804515838623, "epoch": 1.761174116077385, "step": 5280 }, { "epoch": 1.761174116077385, "ref_ce_loss": 0.27034467458724976, "step": 5280 }, { "epoch": 1.7645096731154104, "loss": 1.2163, "step": 5290 }, { "epoch": 1.7645096731154104, "grad_norm": 3.366854429244995, "step": 5290 }, { "epoch": 1.7645096731154104, "learning_rate": 0.0007558516664683913, "step": 5290 }, { "epoch": 1.7645096731154104, "loss": 1.089564323425293, "step": 5290 }, { "ce_loss": 0.3452341854572296, "epoch": 1.7645096731154104, "step": 5290 }, { "distill_loss": 0.512012779712677, "epoch": 1.7645096731154104, "step": 5290 }, { "epoch": 1.7645096731154104, "ref_ce_loss": 0.23154222965240479, "step": 5290 }, { "epoch": 1.7645096731154104, "loss": 1.4174798727035522, "step": 5290 }, { "ce_loss": 0.39411184191703796, "epoch": 1.7645096731154104, "step": 5290 }, { "distill_loss": 0.5689647793769836, "epoch": 1.7645096731154104, "step": 5290 }, { "epoch": 1.7645096731154104, "ref_ce_loss": 0.25486916303634644, "step": 5290 }, { "epoch": 1.7678452301534358, "loss": 1.085, "step": 5300 }, { "epoch": 1.7678452301534358, "grad_norm": 2.4952659606933594, "step": 5300 }, { "epoch": 1.7678452301534358, "learning_rate": 0.0007556541116724981, "step": 5300 }, { "epoch": 1.7678452301534358, "loss": 1.0137168169021606, "step": 5300 }, { "ce_loss": 0.26594722270965576, "epoch": 1.7678452301534358, "step": 5300 }, { "distill_loss": 0.41260483860969543, "epoch": 1.7678452301534358, "step": 5300 }, { "epoch": 1.7678452301534358, "ref_ce_loss": 0.1979988068342209, "step": 5300 }, { "epoch": 1.7678452301534358, "loss": 1.5348089933395386, "step": 5300 }, { "ce_loss": 0.4159756898880005, "epoch": 1.7678452301534358, "step": 5300 }, { "distill_loss": 0.5758260488510132, "epoch": 1.7678452301534358, "step": 5300 }, { "epoch": 1.7678452301534358, "ref_ce_loss": 0.29201629757881165, "step": 5300 }, { "epoch": 1.771180787191461, "loss": 1.1911, "step": 5310 }, { "epoch": 1.771180787191461, "grad_norm": 2.0186476707458496, "step": 5310 }, { "epoch": 1.771180787191461, "learning_rate": 0.0007554561417899867, "step": 5310 }, { "epoch": 1.771180787191461, "loss": 1.5441590547561646, "step": 5310 }, { "ce_loss": 0.29326045513153076, "epoch": 1.771180787191461, "step": 5310 }, { "distill_loss": 0.47031792998313904, "epoch": 1.771180787191461, "step": 5310 }, { "epoch": 1.771180787191461, "ref_ce_loss": 0.2508998215198517, "step": 5310 }, { "epoch": 1.771180787191461, "loss": 1.181255578994751, "step": 5310 }, { "ce_loss": 0.3429895043373108, "epoch": 1.771180787191461, "step": 5310 }, { "distill_loss": 0.6115217804908752, "epoch": 1.771180787191461, "step": 5310 }, { "epoch": 1.771180787191461, "ref_ce_loss": 0.22667163610458374, "step": 5310 }, { "epoch": 1.7745163442294865, "loss": 1.2429, "step": 5320 }, { "epoch": 1.7745163442294865, "grad_norm": 1.53067147731781, "step": 5320 }, { "epoch": 1.7745163442294865, "learning_rate": 0.0007552577570519092, "step": 5320 }, { "epoch": 1.7745163442294865, "loss": 1.9128655195236206, "step": 5320 }, { "ce_loss": 0.47747403383255005, "epoch": 1.7745163442294865, "step": 5320 }, { "distill_loss": 0.5113193988800049, "epoch": 1.7745163442294865, "step": 5320 }, { "epoch": 1.7745163442294865, "ref_ce_loss": 0.3745881915092468, "step": 5320 }, { "epoch": 1.7745163442294865, "loss": 1.1742745637893677, "step": 5320 }, { "ce_loss": 0.3044947683811188, "epoch": 1.7745163442294865, "step": 5320 }, { "distill_loss": 0.4169827997684479, "epoch": 1.7745163442294865, "step": 5320 }, { "epoch": 1.7745163442294865, "ref_ce_loss": 0.21569328010082245, "step": 5320 }, { "epoch": 1.7778519012675118, "loss": 1.2755, "step": 5330 }, { "epoch": 1.7778519012675118, "grad_norm": 2.012723922729492, "step": 5330 }, { "epoch": 1.7778519012675118, "learning_rate": 0.0007550589576898018, "step": 5330 }, { "epoch": 1.7778519012675118, "loss": 1.4698947668075562, "step": 5330 }, { "ce_loss": 0.35279497504234314, "epoch": 1.7778519012675118, "step": 5330 }, { "distill_loss": 0.5262315273284912, "epoch": 1.7778519012675118, "step": 5330 }, { "epoch": 1.7778519012675118, "ref_ce_loss": 0.2742978036403656, "step": 5330 }, { "epoch": 1.7778519012675118, "loss": 0.9402695298194885, "step": 5330 }, { "ce_loss": 0.22868719696998596, "epoch": 1.7778519012675118, "step": 5330 }, { "distill_loss": 0.3957265317440033, "epoch": 1.7778519012675118, "step": 5330 }, { "epoch": 1.7778519012675118, "ref_ce_loss": 0.16549795866012573, "step": 5330 }, { "epoch": 1.7811874583055372, "loss": 1.1196, "step": 5340 }, { "epoch": 1.7811874583055372, "grad_norm": 1.567074179649353, "step": 5340 }, { "epoch": 1.7811874583055372, "learning_rate": 0.000754859743935685, "step": 5340 }, { "epoch": 1.7811874583055372, "loss": 1.668684720993042, "step": 5340 }, { "ce_loss": 0.38102585077285767, "epoch": 1.7811874583055372, "step": 5340 }, { "distill_loss": 0.4782260060310364, "epoch": 1.7811874583055372, "step": 5340 }, { "epoch": 1.7811874583055372, "ref_ce_loss": 0.3241592049598694, "step": 5340 }, { "epoch": 1.7811874583055372, "loss": 0.749116063117981, "step": 5340 }, { "ce_loss": 0.20039691030979156, "epoch": 1.7811874583055372, "step": 5340 }, { "distill_loss": 0.33525410294532776, "epoch": 1.7811874583055372, "step": 5340 }, { "epoch": 1.7811874583055372, "ref_ce_loss": 0.16110847890377045, "step": 5340 }, { "epoch": 1.7845230153435625, "loss": 1.0913, "step": 5350 }, { "epoch": 1.7845230153435625, "grad_norm": 1.7558014392852783, "step": 5350 }, { "epoch": 1.7845230153435625, "learning_rate": 0.0007546601160220624, "step": 5350 }, { "epoch": 1.7845230153435625, "loss": 1.0668269395828247, "step": 5350 }, { "ce_loss": 0.26166832447052, "epoch": 1.7845230153435625, "step": 5350 }, { "distill_loss": 0.4416963756084442, "epoch": 1.7845230153435625, "step": 5350 }, { "epoch": 1.7845230153435625, "ref_ce_loss": 0.18618950247764587, "step": 5350 }, { "epoch": 1.7845230153435625, "loss": 1.3767130374908447, "step": 5350 }, { "ce_loss": 0.4135306477546692, "epoch": 1.7845230153435625, "step": 5350 }, { "distill_loss": 0.4508998990058899, "epoch": 1.7845230153435625, "step": 5350 }, { "epoch": 1.7845230153435625, "ref_ce_loss": 0.1901094913482666, "step": 5350 }, { "epoch": 1.7878585723815879, "loss": 1.1026, "step": 5360 }, { "epoch": 1.7878585723815879, "grad_norm": 1.6343917846679688, "step": 5360 }, { "epoch": 1.7878585723815879, "learning_rate": 0.0007544600741819213, "step": 5360 }, { "epoch": 1.7878585723815879, "loss": 0.9110533595085144, "step": 5360 }, { "ce_loss": 0.30580198764801025, "epoch": 1.7878585723815879, "step": 5360 }, { "distill_loss": 0.3742898106575012, "epoch": 1.7878585723815879, "step": 5360 }, { "epoch": 1.7878585723815879, "ref_ce_loss": 0.23055045306682587, "step": 5360 }, { "epoch": 1.7878585723815879, "loss": 1.753638505935669, "step": 5360 }, { "ce_loss": 0.3236137628555298, "epoch": 1.7878585723815879, "step": 5360 }, { "distill_loss": 0.38308921456336975, "epoch": 1.7878585723815879, "step": 5360 }, { "epoch": 1.7878585723815879, "ref_ce_loss": 0.2120286375284195, "step": 5360 }, { "epoch": 1.7911941294196132, "loss": 1.1418, "step": 5370 }, { "epoch": 1.7911941294196132, "grad_norm": 1.9476059675216675, "step": 5370 }, { "epoch": 1.7911941294196132, "learning_rate": 0.0007542596186487324, "step": 5370 }, { "epoch": 1.7911941294196132, "loss": 1.5973538160324097, "step": 5370 }, { "ce_loss": 0.3987217843532562, "epoch": 1.7911941294196132, "step": 5370 }, { "distill_loss": 0.5269778966903687, "epoch": 1.7911941294196132, "step": 5370 }, { "epoch": 1.7911941294196132, "ref_ce_loss": 0.32627880573272705, "step": 5370 }, { "epoch": 1.7911941294196132, "loss": 1.2063021659851074, "step": 5370 }, { "ce_loss": 0.31812429428100586, "epoch": 1.7911941294196132, "step": 5370 }, { "distill_loss": 0.4934598207473755, "epoch": 1.7911941294196132, "step": 5370 }, { "epoch": 1.7911941294196132, "ref_ce_loss": 0.2113611102104187, "step": 5370 }, { "epoch": 1.7945296864576386, "loss": 1.1803, "step": 5380 }, { "epoch": 1.7945296864576386, "grad_norm": 2.2353322505950928, "step": 5380 }, { "epoch": 1.7945296864576386, "learning_rate": 0.0007540587496564484, "step": 5380 }, { "epoch": 1.7945296864576386, "loss": 0.9624265432357788, "step": 5380 }, { "ce_loss": 0.2751317620277405, "epoch": 1.7945296864576386, "step": 5380 }, { "distill_loss": 0.4234001338481903, "epoch": 1.7945296864576386, "step": 5380 }, { "epoch": 1.7945296864576386, "ref_ce_loss": 0.19405362010002136, "step": 5380 }, { "epoch": 1.7945296864576386, "loss": 1.2300310134887695, "step": 5380 }, { "ce_loss": 0.2887333631515503, "epoch": 1.7945296864576386, "step": 5380 }, { "distill_loss": 0.4319232702255249, "epoch": 1.7945296864576386, "step": 5380 }, { "epoch": 1.7945296864576386, "ref_ce_loss": 0.21429966390132904, "step": 5380 }, { "epoch": 1.797865243495664, "loss": 1.2103, "step": 5390 }, { "epoch": 1.797865243495664, "grad_norm": 2.053229808807373, "step": 5390 }, { "epoch": 1.797865243495664, "learning_rate": 0.0007538574674395054, "step": 5390 }, { "epoch": 1.797865243495664, "loss": 0.9388774037361145, "step": 5390 }, { "ce_loss": 0.22611574828624725, "epoch": 1.797865243495664, "step": 5390 }, { "distill_loss": 0.4997987151145935, "epoch": 1.797865243495664, "step": 5390 }, { "epoch": 1.797865243495664, "ref_ce_loss": 0.2127380520105362, "step": 5390 }, { "epoch": 1.797865243495664, "loss": 1.4807536602020264, "step": 5390 }, { "ce_loss": 0.41299211978912354, "epoch": 1.797865243495664, "step": 5390 }, { "distill_loss": 0.5142659544944763, "epoch": 1.797865243495664, "step": 5390 }, { "epoch": 1.797865243495664, "ref_ce_loss": 0.24807098507881165, "step": 5390 }, { "epoch": 1.8012008005336893, "loss": 1.1674, "step": 5400 }, { "epoch": 1.8012008005336893, "grad_norm": 2.382713556289673, "step": 5400 }, { "epoch": 1.8012008005336893, "learning_rate": 0.0007536557722328214, "step": 5400 }, { "epoch": 1.8012008005336893, "loss": 1.3658170700073242, "step": 5400 }, { "ce_loss": 0.2759830355644226, "epoch": 1.8012008005336893, "step": 5400 }, { "distill_loss": 0.515870988368988, "epoch": 1.8012008005336893, "step": 5400 }, { "epoch": 1.8012008005336893, "ref_ce_loss": 0.1930103451013565, "step": 5400 }, { "epoch": 1.8012008005336893, "loss": 1.4405559301376343, "step": 5400 }, { "ce_loss": 0.34385547041893005, "epoch": 1.8012008005336893, "step": 5400 }, { "distill_loss": 0.519115686416626, "epoch": 1.8012008005336893, "step": 5400 }, { "epoch": 1.8012008005336893, "ref_ce_loss": 0.1835671365261078, "step": 5400 }, { "epoch": 1.8045363575717146, "loss": 1.2482, "step": 5410 }, { "epoch": 1.8045363575717146, "grad_norm": 1.628182053565979, "step": 5410 }, { "epoch": 1.8045363575717146, "learning_rate": 0.0007534536642717961, "step": 5410 }, { "epoch": 1.8045363575717146, "loss": 1.438568115234375, "step": 5410 }, { "ce_loss": 0.35257962346076965, "epoch": 1.8045363575717146, "step": 5410 }, { "distill_loss": 0.5803521871566772, "epoch": 1.8045363575717146, "step": 5410 }, { "epoch": 1.8045363575717146, "ref_ce_loss": 0.2886123061180115, "step": 5410 }, { "epoch": 1.8045363575717146, "loss": 0.9762012362480164, "step": 5410 }, { "ce_loss": 0.27220970392227173, "epoch": 1.8045363575717146, "step": 5410 }, { "distill_loss": 0.4245986044406891, "epoch": 1.8045363575717146, "step": 5410 }, { "epoch": 1.8045363575717146, "ref_ce_loss": 0.2115209996700287, "step": 5410 }, { "epoch": 1.80787191460974, "loss": 1.3043, "step": 5420 }, { "epoch": 1.80787191460974, "grad_norm": 1.702199935913086, "step": 5420 }, { "epoch": 1.80787191460974, "learning_rate": 0.0007532511437923113, "step": 5420 }, { "epoch": 1.80787191460974, "loss": 1.0344901084899902, "step": 5420 }, { "ce_loss": 0.28677964210510254, "epoch": 1.80787191460974, "step": 5420 }, { "distill_loss": 0.476391077041626, "epoch": 1.80787191460974, "step": 5420 }, { "epoch": 1.80787191460974, "ref_ce_loss": 0.18809427320957184, "step": 5420 }, { "epoch": 1.80787191460974, "loss": 1.2833102941513062, "step": 5420 }, { "ce_loss": 0.3542989492416382, "epoch": 1.80787191460974, "step": 5420 }, { "distill_loss": 0.4811602532863617, "epoch": 1.80787191460974, "step": 5420 }, { "epoch": 1.80787191460974, "ref_ce_loss": 0.22961384057998657, "step": 5420 }, { "epoch": 1.8112074716477653, "loss": 1.143, "step": 5430 }, { "epoch": 1.8112074716477653, "grad_norm": 1.7151790857315063, "step": 5430 }, { "epoch": 1.8112074716477653, "learning_rate": 0.0007530482110307304, "step": 5430 }, { "epoch": 1.8112074716477653, "loss": 0.9452934861183167, "step": 5430 }, { "ce_loss": 0.2456946074962616, "epoch": 1.8112074716477653, "step": 5430 }, { "distill_loss": 0.3025226593017578, "epoch": 1.8112074716477653, "step": 5430 }, { "epoch": 1.8112074716477653, "ref_ce_loss": 0.22681143879890442, "step": 5430 }, { "epoch": 1.8112074716477653, "loss": 1.0299745798110962, "step": 5430 }, { "ce_loss": 0.3505493998527527, "epoch": 1.8112074716477653, "step": 5430 }, { "distill_loss": 0.36607131361961365, "epoch": 1.8112074716477653, "step": 5430 }, { "epoch": 1.8112074716477653, "ref_ce_loss": 0.2534058392047882, "step": 5430 }, { "epoch": 1.8145430286857906, "loss": 1.0969, "step": 5440 }, { "epoch": 1.8145430286857906, "grad_norm": 2.0306644439697266, "step": 5440 }, { "epoch": 1.8145430286857906, "learning_rate": 0.0007528448662238976, "step": 5440 }, { "epoch": 1.8145430286857906, "loss": 1.2721872329711914, "step": 5440 }, { "ce_loss": 0.35148635506629944, "epoch": 1.8145430286857906, "step": 5440 }, { "distill_loss": 0.5788367390632629, "epoch": 1.8145430286857906, "step": 5440 }, { "epoch": 1.8145430286857906, "ref_ce_loss": 0.28733375668525696, "step": 5440 }, { "epoch": 1.8145430286857906, "loss": 1.2890324592590332, "step": 5440 }, { "ce_loss": 0.3618167042732239, "epoch": 1.8145430286857906, "step": 5440 }, { "distill_loss": 0.5949937105178833, "epoch": 1.8145430286857906, "step": 5440 }, { "epoch": 1.8145430286857906, "ref_ce_loss": 0.25563210248947144, "step": 5440 }, { "epoch": 1.817878585723816, "loss": 1.1423, "step": 5450 }, { "epoch": 1.817878585723816, "grad_norm": 1.2551982402801514, "step": 5450 }, { "epoch": 1.817878585723816, "learning_rate": 0.0007526411096091384, "step": 5450 }, { "epoch": 1.817878585723816, "loss": 0.9944010972976685, "step": 5450 }, { "ce_loss": 0.3130263090133667, "epoch": 1.817878585723816, "step": 5450 }, { "distill_loss": 0.3931999206542969, "epoch": 1.817878585723816, "step": 5450 }, { "epoch": 1.817878585723816, "ref_ce_loss": 0.22332419455051422, "step": 5450 }, { "epoch": 1.817878585723816, "loss": 0.8705968856811523, "step": 5450 }, { "ce_loss": 0.2662094831466675, "epoch": 1.817878585723816, "step": 5450 }, { "distill_loss": 0.36075133085250854, "epoch": 1.817878585723816, "step": 5450 }, { "epoch": 1.817878585723816, "ref_ce_loss": 0.24345366656780243, "step": 5450 }, { "epoch": 1.8212141427618413, "loss": 1.2, "step": 5460 }, { "epoch": 1.8212141427618413, "grad_norm": 2.462411642074585, "step": 5460 }, { "epoch": 1.8212141427618413, "learning_rate": 0.0007524369414242584, "step": 5460 }, { "epoch": 1.8212141427618413, "loss": 0.942243218421936, "step": 5460 }, { "ce_loss": 0.28354936838150024, "epoch": 1.8212141427618413, "step": 5460 }, { "distill_loss": 0.41044342517852783, "epoch": 1.8212141427618413, "step": 5460 }, { "epoch": 1.8212141427618413, "ref_ce_loss": 0.2307603508234024, "step": 5460 }, { "epoch": 1.8212141427618413, "loss": 1.3136810064315796, "step": 5460 }, { "ce_loss": 0.33640730381011963, "epoch": 1.8212141427618413, "step": 5460 }, { "distill_loss": 0.5245234966278076, "epoch": 1.8212141427618413, "step": 5460 }, { "epoch": 1.8212141427618413, "ref_ce_loss": 0.20452705025672913, "step": 5460 }, { "epoch": 1.8245496997998667, "loss": 1.2422, "step": 5470 }, { "epoch": 1.8245496997998667, "grad_norm": 2.514112949371338, "step": 5470 }, { "epoch": 1.8245496997998667, "learning_rate": 0.000752232361907544, "step": 5470 }, { "epoch": 1.8245496997998667, "loss": 1.0976307392120361, "step": 5470 }, { "ce_loss": 0.3172653615474701, "epoch": 1.8245496997998667, "step": 5470 }, { "distill_loss": 0.48672354221343994, "epoch": 1.8245496997998667, "step": 5470 }, { "epoch": 1.8245496997998667, "ref_ce_loss": 0.2345392107963562, "step": 5470 }, { "epoch": 1.8245496997998667, "loss": 1.0066343545913696, "step": 5470 }, { "ce_loss": 0.3098627030849457, "epoch": 1.8245496997998667, "step": 5470 }, { "distill_loss": 0.39110395312309265, "epoch": 1.8245496997998667, "step": 5470 }, { "epoch": 1.8245496997998667, "ref_ce_loss": 0.22836095094680786, "step": 5470 }, { "epoch": 1.827885256837892, "loss": 1.0856, "step": 5480 }, { "epoch": 1.827885256837892, "grad_norm": 1.774595022201538, "step": 5480 }, { "epoch": 1.827885256837892, "learning_rate": 0.0007520273712977616, "step": 5480 }, { "epoch": 1.827885256837892, "loss": 1.7556549310684204, "step": 5480 }, { "ce_loss": 0.2319716066122055, "epoch": 1.827885256837892, "step": 5480 }, { "distill_loss": 0.3767503499984741, "epoch": 1.827885256837892, "step": 5480 }, { "epoch": 1.827885256837892, "ref_ce_loss": 0.20208023488521576, "step": 5480 }, { "epoch": 1.827885256837892, "loss": 1.2231870889663696, "step": 5480 }, { "ce_loss": 0.34553295373916626, "epoch": 1.827885256837892, "step": 5480 }, { "distill_loss": 0.5309174060821533, "epoch": 1.827885256837892, "step": 5480 }, { "epoch": 1.827885256837892, "ref_ce_loss": 0.2443692535161972, "step": 5480 }, { "epoch": 1.8312208138759174, "loss": 1.2356, "step": 5490 }, { "epoch": 1.8312208138759174, "grad_norm": 1.8566852807998657, "step": 5490 }, { "epoch": 1.8312208138759174, "learning_rate": 0.000751821969834157, "step": 5490 }, { "epoch": 1.8312208138759174, "loss": 1.3017749786376953, "step": 5490 }, { "ce_loss": 0.30346235632896423, "epoch": 1.8312208138759174, "step": 5490 }, { "distill_loss": 0.4574112296104431, "epoch": 1.8312208138759174, "step": 5490 }, { "epoch": 1.8312208138759174, "ref_ce_loss": 0.23760145902633667, "step": 5490 }, { "epoch": 1.8312208138759174, "loss": 1.5731134414672852, "step": 5490 }, { "ce_loss": 0.29255327582359314, "epoch": 1.8312208138759174, "step": 5490 }, { "distill_loss": 0.4530612826347351, "epoch": 1.8312208138759174, "step": 5490 }, { "epoch": 1.8312208138759174, "ref_ce_loss": 0.18628793954849243, "step": 5490 }, { "epoch": 1.8345563709139427, "loss": 1.1548, "step": 5500 }, { "epoch": 1.8345563709139427, "grad_norm": 2.09499454498291, "step": 5500 }, { "epoch": 1.8345563709139427, "learning_rate": 0.000751616157756456, "step": 5500 }, { "epoch": 1.8345563709139427, "loss": 1.183165431022644, "step": 5500 }, { "ce_loss": 0.24258391559123993, "epoch": 1.8345563709139427, "step": 5500 }, { "distill_loss": 0.44836339354515076, "epoch": 1.8345563709139427, "step": 5500 }, { "epoch": 1.8345563709139427, "ref_ce_loss": 0.2561323642730713, "step": 5500 }, { "epoch": 1.8345563709139427, "loss": 1.9524562358856201, "step": 5500 }, { "ce_loss": 0.34489694237709045, "epoch": 1.8345563709139427, "step": 5500 }, { "distill_loss": 0.49662792682647705, "epoch": 1.8345563709139427, "step": 5500 }, { "epoch": 1.8345563709139427, "ref_ce_loss": 0.19847920536994934, "step": 5500 }, { "epoch": 1.837891927951968, "loss": 1.1857, "step": 5510 }, { "epoch": 1.837891927951968, "grad_norm": 2.0431957244873047, "step": 5510 }, { "epoch": 1.837891927951968, "learning_rate": 0.0007514099353048636, "step": 5510 }, { "epoch": 1.837891927951968, "loss": 1.0093868970870972, "step": 5510 }, { "ce_loss": 0.32575932145118713, "epoch": 1.837891927951968, "step": 5510 }, { "distill_loss": 0.4605439603328705, "epoch": 1.837891927951968, "step": 5510 }, { "epoch": 1.837891927951968, "ref_ce_loss": 0.22248651087284088, "step": 5510 }, { "epoch": 1.837891927951968, "loss": 1.0461400747299194, "step": 5510 }, { "ce_loss": 0.2589099109172821, "epoch": 1.837891927951968, "step": 5510 }, { "distill_loss": 0.5331931114196777, "epoch": 1.837891927951968, "step": 5510 }, { "epoch": 1.837891927951968, "ref_ce_loss": 0.1913858950138092, "step": 5510 }, { "epoch": 1.8412274849899934, "loss": 1.1587, "step": 5520 }, { "epoch": 1.8412274849899934, "grad_norm": 1.8402179479599, "step": 5520 }, { "epoch": 1.8412274849899934, "learning_rate": 0.0007512033027200634, "step": 5520 }, { "epoch": 1.8412274849899934, "loss": 1.1105293035507202, "step": 5520 }, { "ce_loss": 0.29747650027275085, "epoch": 1.8412274849899934, "step": 5520 }, { "distill_loss": 0.45094624161720276, "epoch": 1.8412274849899934, "step": 5520 }, { "epoch": 1.8412274849899934, "ref_ce_loss": 0.20536360144615173, "step": 5520 }, { "epoch": 1.8412274849899934, "loss": 1.1815446615219116, "step": 5520 }, { "ce_loss": 0.24095499515533447, "epoch": 1.8412274849899934, "step": 5520 }, { "distill_loss": 0.50213223695755, "epoch": 1.8412274849899934, "step": 5520 }, { "epoch": 1.8412274849899934, "ref_ce_loss": 0.20024245977401733, "step": 5520 }, { "epoch": 1.8445630420280188, "loss": 1.1764, "step": 5530 }, { "epoch": 1.8445630420280188, "grad_norm": 1.6938358545303345, "step": 5530 }, { "epoch": 1.8445630420280188, "learning_rate": 0.0007509962602432177, "step": 5530 }, { "epoch": 1.8445630420280188, "loss": 0.9573712348937988, "step": 5530 }, { "ce_loss": 0.30945950746536255, "epoch": 1.8445630420280188, "step": 5530 }, { "distill_loss": 0.4477289319038391, "epoch": 1.8445630420280188, "step": 5530 }, { "epoch": 1.8445630420280188, "ref_ce_loss": 0.19660018384456635, "step": 5530 }, { "epoch": 1.8445630420280188, "loss": 1.0629284381866455, "step": 5530 }, { "ce_loss": 0.31846141815185547, "epoch": 1.8445630420280188, "step": 5530 }, { "distill_loss": 0.43803030252456665, "epoch": 1.8445630420280188, "step": 5530 }, { "epoch": 1.8445630420280188, "ref_ce_loss": 0.2342435121536255, "step": 5530 }, { "epoch": 1.8478985990660441, "loss": 1.1684, "step": 5540 }, { "epoch": 1.8478985990660441, "grad_norm": 1.881791114807129, "step": 5540 }, { "epoch": 1.8478985990660441, "learning_rate": 0.0007507888081159678, "step": 5540 }, { "epoch": 1.8478985990660441, "loss": 1.2271531820297241, "step": 5540 }, { "ce_loss": 0.37106800079345703, "epoch": 1.8478985990660441, "step": 5540 }, { "distill_loss": 0.4225543141365051, "epoch": 1.8478985990660441, "step": 5540 }, { "epoch": 1.8478985990660441, "ref_ce_loss": 0.27660301327705383, "step": 5540 }, { "epoch": 1.8478985990660441, "loss": 1.0412670373916626, "step": 5540 }, { "ce_loss": 0.37082210183143616, "epoch": 1.8478985990660441, "step": 5540 }, { "distill_loss": 0.446853369474411, "epoch": 1.8478985990660441, "step": 5540 }, { "epoch": 1.8478985990660441, "ref_ce_loss": 0.22295041382312775, "step": 5540 }, { "epoch": 1.8512341561040695, "loss": 1.1782, "step": 5550 }, { "epoch": 1.8512341561040695, "grad_norm": 1.8545033931732178, "step": 5550 }, { "epoch": 1.8512341561040695, "learning_rate": 0.0007505809465804321, "step": 5550 }, { "epoch": 1.8512341561040695, "loss": 1.3148798942565918, "step": 5550 }, { "ce_loss": 0.3096807599067688, "epoch": 1.8512341561040695, "step": 5550 }, { "distill_loss": 0.44882887601852417, "epoch": 1.8512341561040695, "step": 5550 }, { "epoch": 1.8512341561040695, "ref_ce_loss": 0.29201146960258484, "step": 5550 }, { "epoch": 1.8512341561040695, "loss": 1.1060969829559326, "step": 5550 }, { "ce_loss": 0.26883015036582947, "epoch": 1.8512341561040695, "step": 5550 }, { "distill_loss": 0.3587776720523834, "epoch": 1.8512341561040695, "step": 5550 }, { "epoch": 1.8512341561040695, "ref_ce_loss": 0.21522045135498047, "step": 5550 }, { "epoch": 1.8545697131420948, "loss": 1.1401, "step": 5560 }, { "epoch": 1.8545697131420948, "grad_norm": 1.8426175117492676, "step": 5560 }, { "epoch": 1.8545697131420948, "learning_rate": 0.0007503726758792079, "step": 5560 }, { "epoch": 1.8545697131420948, "loss": 1.0726665258407593, "step": 5560 }, { "ce_loss": 0.3178630769252777, "epoch": 1.8545697131420948, "step": 5560 }, { "distill_loss": 0.41456592082977295, "epoch": 1.8545697131420948, "step": 5560 }, { "epoch": 1.8545697131420948, "ref_ce_loss": 0.270908385515213, "step": 5560 }, { "epoch": 1.8545697131420948, "loss": 1.5326629877090454, "step": 5560 }, { "ce_loss": 0.38044241070747375, "epoch": 1.8545697131420948, "step": 5560 }, { "distill_loss": 0.49526867270469666, "epoch": 1.8545697131420948, "step": 5560 }, { "epoch": 1.8545697131420948, "ref_ce_loss": 0.3099924921989441, "step": 5560 }, { "epoch": 1.8579052701801202, "loss": 1.1707, "step": 5570 }, { "epoch": 1.8579052701801202, "grad_norm": 2.486192226409912, "step": 5570 }, { "epoch": 1.8579052701801202, "learning_rate": 0.0007501639962553691, "step": 5570 }, { "epoch": 1.8579052701801202, "loss": 1.1377767324447632, "step": 5570 }, { "ce_loss": 0.36928075551986694, "epoch": 1.8579052701801202, "step": 5570 }, { "distill_loss": 0.44840309023857117, "epoch": 1.8579052701801202, "step": 5570 }, { "epoch": 1.8579052701801202, "ref_ce_loss": 0.2513749301433563, "step": 5570 }, { "epoch": 1.8579052701801202, "loss": 1.3746496438980103, "step": 5570 }, { "ce_loss": 0.33873075246810913, "epoch": 1.8579052701801202, "step": 5570 }, { "distill_loss": 0.46756061911582947, "epoch": 1.8579052701801202, "step": 5570 }, { "epoch": 1.8579052701801202, "ref_ce_loss": 0.17396891117095947, "step": 5570 }, { "epoch": 1.8612408272181455, "loss": 1.2129, "step": 5580 }, { "epoch": 1.8612408272181455, "grad_norm": 2.368302822113037, "step": 5580 }, { "epoch": 1.8612408272181455, "learning_rate": 0.0007499549079524677, "step": 5580 }, { "epoch": 1.8612408272181455, "loss": 1.0033069849014282, "step": 5580 }, { "ce_loss": 0.3440701961517334, "epoch": 1.8612408272181455, "step": 5580 }, { "distill_loss": 0.38273337483406067, "epoch": 1.8612408272181455, "step": 5580 }, { "epoch": 1.8612408272181455, "ref_ce_loss": 0.21641433238983154, "step": 5580 }, { "epoch": 1.8612408272181455, "loss": 1.2916123867034912, "step": 5580 }, { "ce_loss": 0.2969575524330139, "epoch": 1.8612408272181455, "step": 5580 }, { "distill_loss": 0.3884190320968628, "epoch": 1.8612408272181455, "step": 5580 }, { "epoch": 1.8612408272181455, "ref_ce_loss": 0.2365722358226776, "step": 5580 }, { "epoch": 1.864576384256171, "loss": 1.2139, "step": 5590 }, { "epoch": 1.864576384256171, "grad_norm": 2.768338918685913, "step": 5590 }, { "epoch": 1.864576384256171, "learning_rate": 0.0007497454112145318, "step": 5590 }, { "epoch": 1.864576384256171, "loss": 1.0387018918991089, "step": 5590 }, { "ce_loss": 0.28695935010910034, "epoch": 1.864576384256171, "step": 5590 }, { "distill_loss": 0.3994467258453369, "epoch": 1.864576384256171, "step": 5590 }, { "epoch": 1.864576384256171, "ref_ce_loss": 0.2750386595726013, "step": 5590 }, { "epoch": 1.864576384256171, "loss": 1.2993309497833252, "step": 5590 }, { "ce_loss": 0.22111760079860687, "epoch": 1.864576384256171, "step": 5590 }, { "distill_loss": 0.4015125334262848, "epoch": 1.864576384256171, "step": 5590 }, { "epoch": 1.864576384256171, "ref_ce_loss": 0.17563439905643463, "step": 5590 }, { "epoch": 1.8679119412941962, "loss": 1.1576, "step": 5600 }, { "epoch": 1.8679119412941962, "grad_norm": 1.3973478078842163, "step": 5600 }, { "epoch": 1.8679119412941962, "learning_rate": 0.000749535506286067, "step": 5600 }, { "epoch": 1.8679119412941962, "loss": 0.9982587099075317, "step": 5600 }, { "ce_loss": 0.2859624922275543, "epoch": 1.8679119412941962, "step": 5600 }, { "distill_loss": 0.48541250824928284, "epoch": 1.8679119412941962, "step": 5600 }, { "epoch": 1.8679119412941962, "ref_ce_loss": 0.22652120888233185, "step": 5600 }, { "epoch": 1.8679119412941962, "loss": 1.2003138065338135, "step": 5600 }, { "ce_loss": 0.3607683777809143, "epoch": 1.8679119412941962, "step": 5600 }, { "distill_loss": 0.5441848039627075, "epoch": 1.8679119412941962, "step": 5600 }, { "epoch": 1.8679119412941962, "ref_ce_loss": 0.23783503472805023, "step": 5600 }, { "epoch": 1.8712474983322216, "loss": 1.1887, "step": 5610 }, { "epoch": 1.8712474983322216, "grad_norm": 2.882995128631592, "step": 5610 }, { "epoch": 1.8712474983322216, "learning_rate": 0.0007493251934120547, "step": 5610 }, { "epoch": 1.8712474983322216, "loss": 1.0685358047485352, "step": 5610 }, { "ce_loss": 0.29604318737983704, "epoch": 1.8712474983322216, "step": 5610 }, { "distill_loss": 0.512268602848053, "epoch": 1.8712474983322216, "step": 5610 }, { "epoch": 1.8712474983322216, "ref_ce_loss": 0.25988975167274475, "step": 5610 }, { "epoch": 1.8712474983322216, "loss": 1.2088600397109985, "step": 5610 }, { "ce_loss": 0.35079681873321533, "epoch": 1.8712474983322216, "step": 5610 }, { "distill_loss": 0.5436410903930664, "epoch": 1.8712474983322216, "step": 5610 }, { "epoch": 1.8712474983322216, "ref_ce_loss": 0.21157583594322205, "step": 5610 }, { "epoch": 1.874583055370247, "loss": 1.1698, "step": 5620 }, { "epoch": 1.874583055370247, "grad_norm": 3.201819658279419, "step": 5620 }, { "epoch": 1.874583055370247, "learning_rate": 0.0007491144728379528, "step": 5620 }, { "epoch": 1.874583055370247, "loss": 0.9609626531600952, "step": 5620 }, { "ce_loss": 0.23364980518817902, "epoch": 1.874583055370247, "step": 5620 }, { "distill_loss": 0.46882686018943787, "epoch": 1.874583055370247, "step": 5620 }, { "epoch": 1.874583055370247, "ref_ce_loss": 0.176205113530159, "step": 5620 }, { "epoch": 1.874583055370247, "loss": 1.9625874757766724, "step": 5620 }, { "ce_loss": 0.32893672585487366, "epoch": 1.874583055370247, "step": 5620 }, { "distill_loss": 0.5009834170341492, "epoch": 1.874583055370247, "step": 5620 }, { "epoch": 1.874583055370247, "ref_ce_loss": 0.2691947817802429, "step": 5620 }, { "epoch": 1.8779186124082723, "loss": 1.2908, "step": 5630 }, { "epoch": 1.8779186124082723, "grad_norm": 1.6491600275039673, "step": 5630 }, { "epoch": 1.8779186124082723, "learning_rate": 0.0007489033448096948, "step": 5630 }, { "epoch": 1.8779186124082723, "loss": 1.1745247840881348, "step": 5630 }, { "ce_loss": 0.2985750436782837, "epoch": 1.8779186124082723, "step": 5630 }, { "distill_loss": 0.3555160462856293, "epoch": 1.8779186124082723, "step": 5630 }, { "epoch": 1.8779186124082723, "ref_ce_loss": 0.2050323784351349, "step": 5630 }, { "epoch": 1.8779186124082723, "loss": 1.0036077499389648, "step": 5630 }, { "ce_loss": 0.3427830934524536, "epoch": 1.8779186124082723, "step": 5630 }, { "distill_loss": 0.41693538427352905, "epoch": 1.8779186124082723, "step": 5630 }, { "epoch": 1.8779186124082723, "ref_ce_loss": 0.24346625804901123, "step": 5630 }, { "epoch": 1.8812541694462976, "loss": 1.2053, "step": 5640 }, { "epoch": 1.8812541694462976, "grad_norm": 1.826837420463562, "step": 5640 }, { "epoch": 1.8812541694462976, "learning_rate": 0.0007486918095736897, "step": 5640 }, { "epoch": 1.8812541694462976, "loss": 1.2536578178405762, "step": 5640 }, { "ce_loss": 0.2937576174736023, "epoch": 1.8812541694462976, "step": 5640 }, { "distill_loss": 0.636205792427063, "epoch": 1.8812541694462976, "step": 5640 }, { "epoch": 1.8812541694462976, "ref_ce_loss": 0.23159760236740112, "step": 5640 }, { "epoch": 1.8812541694462976, "loss": 1.3230061531066895, "step": 5640 }, { "ce_loss": 0.4143422245979309, "epoch": 1.8812541694462976, "step": 5640 }, { "distill_loss": 0.5751572847366333, "epoch": 1.8812541694462976, "step": 5640 }, { "epoch": 1.8812541694462976, "ref_ce_loss": 0.2638043761253357, "step": 5640 }, { "epoch": 1.884589726484323, "loss": 1.1228, "step": 5650 }, { "epoch": 1.884589726484323, "grad_norm": 1.902815580368042, "step": 5650 }, { "epoch": 1.884589726484323, "learning_rate": 0.0007484798673768223, "step": 5650 }, { "epoch": 1.884589726484323, "loss": 1.1458295583724976, "step": 5650 }, { "ce_loss": 0.37263768911361694, "epoch": 1.884589726484323, "step": 5650 }, { "distill_loss": 0.45414674282073975, "epoch": 1.884589726484323, "step": 5650 }, { "epoch": 1.884589726484323, "ref_ce_loss": 0.271620512008667, "step": 5650 }, { "epoch": 1.884589726484323, "loss": 0.9640642404556274, "step": 5650 }, { "ce_loss": 0.296345591545105, "epoch": 1.884589726484323, "step": 5650 }, { "distill_loss": 0.4380930960178375, "epoch": 1.884589726484323, "step": 5650 }, { "epoch": 1.884589726484323, "ref_ce_loss": 0.21325789391994476, "step": 5650 }, { "epoch": 1.8879252835223483, "loss": 1.1008, "step": 5660 }, { "epoch": 1.8879252835223483, "grad_norm": 1.9657776355743408, "step": 5660 }, { "epoch": 1.8879252835223483, "learning_rate": 0.0007482675184664516, "step": 5660 }, { "epoch": 1.8879252835223483, "loss": 1.1546496152877808, "step": 5660 }, { "ce_loss": 0.2919650077819824, "epoch": 1.8879252835223483, "step": 5660 }, { "distill_loss": 0.376723051071167, "epoch": 1.8879252835223483, "step": 5660 }, { "epoch": 1.8879252835223483, "ref_ce_loss": 0.19715668261051178, "step": 5660 }, { "epoch": 1.8879252835223483, "loss": 0.9053738117218018, "step": 5660 }, { "ce_loss": 0.22867360711097717, "epoch": 1.8879252835223483, "step": 5660 }, { "distill_loss": 0.36551451683044434, "epoch": 1.8879252835223483, "step": 5660 }, { "epoch": 1.8879252835223483, "ref_ce_loss": 0.1943945437669754, "step": 5660 }, { "epoch": 1.8912608405603737, "loss": 1.1558, "step": 5670 }, { "epoch": 1.8912608405603737, "grad_norm": 1.620741605758667, "step": 5670 }, { "epoch": 1.8912608405603737, "learning_rate": 0.0007480547630904117, "step": 5670 }, { "epoch": 1.8912608405603737, "loss": 1.0649346113204956, "step": 5670 }, { "ce_loss": 0.2661072611808777, "epoch": 1.8912608405603737, "step": 5670 }, { "distill_loss": 0.4255768954753876, "epoch": 1.8912608405603737, "step": 5670 }, { "epoch": 1.8912608405603737, "ref_ce_loss": 0.2549579441547394, "step": 5670 }, { "epoch": 1.8912608405603737, "loss": 0.9806616902351379, "step": 5670 }, { "ce_loss": 0.27610334753990173, "epoch": 1.8912608405603737, "step": 5670 }, { "distill_loss": 0.4894576668739319, "epoch": 1.8912608405603737, "step": 5670 }, { "epoch": 1.8912608405603737, "ref_ce_loss": 0.21505948901176453, "step": 5670 }, { "epoch": 1.894596397598399, "loss": 1.1236, "step": 5680 }, { "epoch": 1.894596397598399, "grad_norm": 2.7246854305267334, "step": 5680 }, { "epoch": 1.894596397598399, "learning_rate": 0.0007478416014970108, "step": 5680 }, { "epoch": 1.894596397598399, "loss": 0.9774023294448853, "step": 5680 }, { "ce_loss": 0.2604271471500397, "epoch": 1.894596397598399, "step": 5680 }, { "distill_loss": 0.4833385646343231, "epoch": 1.894596397598399, "step": 5680 }, { "epoch": 1.894596397598399, "ref_ce_loss": 0.1873892843723297, "step": 5680 }, { "epoch": 1.894596397598399, "loss": 1.1073026657104492, "step": 5680 }, { "ce_loss": 0.3042682707309723, "epoch": 1.894596397598399, "step": 5680 }, { "distill_loss": 0.5258811116218567, "epoch": 1.894596397598399, "step": 5680 }, { "epoch": 1.894596397598399, "ref_ce_loss": 0.2086445689201355, "step": 5680 }, { "epoch": 1.8979319546364244, "loss": 1.1633, "step": 5690 }, { "epoch": 1.8979319546364244, "grad_norm": 2.3458380699157715, "step": 5690 }, { "epoch": 1.8979319546364244, "learning_rate": 0.0007476280339350319, "step": 5690 }, { "epoch": 1.8979319546364244, "loss": 0.8902542591094971, "step": 5690 }, { "ce_loss": 0.2166803926229477, "epoch": 1.8979319546364244, "step": 5690 }, { "distill_loss": 0.42578381299972534, "epoch": 1.8979319546364244, "step": 5690 }, { "epoch": 1.8979319546364244, "ref_ce_loss": 0.1565944254398346, "step": 5690 }, { "epoch": 1.8979319546364244, "loss": 1.0363528728485107, "step": 5690 }, { "ce_loss": 0.3460255265235901, "epoch": 1.8979319546364244, "step": 5690 }, { "distill_loss": 0.46616458892822266, "epoch": 1.8979319546364244, "step": 5690 }, { "epoch": 1.8979319546364244, "ref_ce_loss": 0.22409404814243317, "step": 5690 }, { "epoch": 1.9012675116744497, "loss": 1.1329, "step": 5700 }, { "epoch": 1.9012675116744497, "grad_norm": 2.120044231414795, "step": 5700 }, { "epoch": 1.9012675116744497, "learning_rate": 0.0007474140606537311, "step": 5700 }, { "epoch": 1.9012675116744497, "loss": 1.5393993854522705, "step": 5700 }, { "ce_loss": 0.35393574833869934, "epoch": 1.9012675116744497, "step": 5700 }, { "distill_loss": 0.5372600555419922, "epoch": 1.9012675116744497, "step": 5700 }, { "epoch": 1.9012675116744497, "ref_ce_loss": 0.24297258257865906, "step": 5700 }, { "epoch": 1.9012675116744497, "loss": 0.8871737718582153, "step": 5700 }, { "ce_loss": 0.2896822690963745, "epoch": 1.9012675116744497, "step": 5700 }, { "distill_loss": 0.3992535173892975, "epoch": 1.9012675116744497, "step": 5700 }, { "epoch": 1.9012675116744497, "ref_ce_loss": 0.1980990320444107, "step": 5700 }, { "epoch": 1.904603068712475, "loss": 1.2162, "step": 5710 }, { "epoch": 1.904603068712475, "grad_norm": 1.8103832006454468, "step": 5710 }, { "epoch": 1.904603068712475, "learning_rate": 0.0007471996819028382, "step": 5710 }, { "epoch": 1.904603068712475, "loss": 1.0989534854888916, "step": 5710 }, { "ce_loss": 0.2598811388015747, "epoch": 1.904603068712475, "step": 5710 }, { "distill_loss": 0.4422469139099121, "epoch": 1.904603068712475, "step": 5710 }, { "epoch": 1.904603068712475, "ref_ce_loss": 0.23823165893554688, "step": 5710 }, { "epoch": 1.904603068712475, "loss": 1.0796765089035034, "step": 5710 }, { "ce_loss": 0.27391597628593445, "epoch": 1.904603068712475, "step": 5710 }, { "distill_loss": 0.45274975895881653, "epoch": 1.904603068712475, "step": 5710 }, { "epoch": 1.904603068712475, "ref_ce_loss": 0.20589439570903778, "step": 5710 }, { "epoch": 1.9079386257505004, "loss": 1.088, "step": 5720 }, { "epoch": 1.9079386257505004, "grad_norm": 1.6986693143844604, "step": 5720 }, { "epoch": 1.9079386257505004, "learning_rate": 0.0007469848979325562, "step": 5720 }, { "epoch": 1.9079386257505004, "loss": 0.8809690475463867, "step": 5720 }, { "ce_loss": 0.2537936568260193, "epoch": 1.9079386257505004, "step": 5720 }, { "distill_loss": 0.377767950296402, "epoch": 1.9079386257505004, "step": 5720 }, { "epoch": 1.9079386257505004, "ref_ce_loss": 0.21097536385059357, "step": 5720 }, { "epoch": 1.9079386257505004, "loss": 1.995163917541504, "step": 5720 }, { "ce_loss": 0.4546584486961365, "epoch": 1.9079386257505004, "step": 5720 }, { "distill_loss": 0.6019603610038757, "epoch": 1.9079386257505004, "step": 5720 }, { "epoch": 1.9079386257505004, "ref_ce_loss": 0.38922926783561707, "step": 5720 }, { "epoch": 1.9112741827885258, "loss": 1.2926, "step": 5730 }, { "epoch": 1.9112741827885258, "grad_norm": 1.4185676574707031, "step": 5730 }, { "epoch": 1.9112741827885258, "learning_rate": 0.0007467697089935612, "step": 5730 }, { "epoch": 1.9112741827885258, "loss": 1.679802656173706, "step": 5730 }, { "ce_loss": 0.3139112889766693, "epoch": 1.9112741827885258, "step": 5730 }, { "distill_loss": 0.4400520324707031, "epoch": 1.9112741827885258, "step": 5730 }, { "epoch": 1.9112741827885258, "ref_ce_loss": 0.2145587056875229, "step": 5730 }, { "epoch": 1.9112741827885258, "loss": 1.0517017841339111, "step": 5730 }, { "ce_loss": 0.34958764910697937, "epoch": 1.9112741827885258, "step": 5730 }, { "distill_loss": 0.39745035767555237, "epoch": 1.9112741827885258, "step": 5730 }, { "epoch": 1.9112741827885258, "ref_ce_loss": 0.24486936628818512, "step": 5730 }, { "epoch": 1.9146097398265511, "loss": 1.2064, "step": 5740 }, { "epoch": 1.9146097398265511, "grad_norm": 1.8157681226730347, "step": 5740 }, { "epoch": 1.9146097398265511, "learning_rate": 0.0007465541153370019, "step": 5740 }, { "epoch": 1.9146097398265511, "loss": 1.1898601055145264, "step": 5740 }, { "ce_loss": 0.28614503145217896, "epoch": 1.9146097398265511, "step": 5740 }, { "distill_loss": 0.40134701132774353, "epoch": 1.9146097398265511, "step": 5740 }, { "epoch": 1.9146097398265511, "ref_ce_loss": 0.23302209377288818, "step": 5740 }, { "epoch": 1.9146097398265511, "loss": 1.3736635446548462, "step": 5740 }, { "ce_loss": 0.33695244789123535, "epoch": 1.9146097398265511, "step": 5740 }, { "distill_loss": 0.4986035227775574, "epoch": 1.9146097398265511, "step": 5740 }, { "epoch": 1.9146097398265511, "ref_ce_loss": 0.29099375009536743, "step": 5740 }, { "epoch": 1.9179452968645765, "loss": 1.1414, "step": 5750 }, { "epoch": 1.9179452968645765, "grad_norm": 1.8164293766021729, "step": 5750 }, { "epoch": 1.9179452968645765, "learning_rate": 0.0007463381172144992, "step": 5750 }, { "epoch": 1.9179452968645765, "loss": 1.0783435106277466, "step": 5750 }, { "ce_loss": 0.31459519267082214, "epoch": 1.9179452968645765, "step": 5750 }, { "distill_loss": 0.46592262387275696, "epoch": 1.9179452968645765, "step": 5750 }, { "epoch": 1.9179452968645765, "ref_ce_loss": 0.2333477884531021, "step": 5750 }, { "epoch": 1.9179452968645765, "loss": 0.9109805226325989, "step": 5750 }, { "ce_loss": 0.24675825238227844, "epoch": 1.9179452968645765, "step": 5750 }, { "distill_loss": 0.3833222985267639, "epoch": 1.9179452968645765, "step": 5750 }, { "epoch": 1.9179452968645765, "ref_ce_loss": 0.19883716106414795, "step": 5750 }, { "epoch": 1.9212808539026018, "loss": 1.1066, "step": 5760 }, { "epoch": 1.9212808539026018, "grad_norm": 1.8867511749267578, "step": 5760 }, { "epoch": 1.9212808539026018, "learning_rate": 0.0007461217148781461, "step": 5760 }, { "epoch": 1.9212808539026018, "loss": 1.0111404657363892, "step": 5760 }, { "ce_loss": 0.2570357322692871, "epoch": 1.9212808539026018, "step": 5760 }, { "distill_loss": 0.4955453872680664, "epoch": 1.9212808539026018, "step": 5760 }, { "epoch": 1.9212808539026018, "ref_ce_loss": 0.19137851893901825, "step": 5760 }, { "epoch": 1.9212808539026018, "loss": 1.4673306941986084, "step": 5760 }, { "ce_loss": 0.40399816632270813, "epoch": 1.9212808539026018, "step": 5760 }, { "distill_loss": 0.5058277249336243, "epoch": 1.9212808539026018, "step": 5760 }, { "epoch": 1.9212808539026018, "ref_ce_loss": 0.319006085395813, "step": 5760 }, { "epoch": 1.9246164109406272, "loss": 1.2681, "step": 5770 }, { "epoch": 1.9246164109406272, "grad_norm": 1.7314869165420532, "step": 5770 }, { "epoch": 1.9246164109406272, "learning_rate": 0.0007459049085805075, "step": 5770 }, { "epoch": 1.9246164109406272, "loss": 1.031352162361145, "step": 5770 }, { "ce_loss": 0.3140748143196106, "epoch": 1.9246164109406272, "step": 5770 }, { "distill_loss": 0.47759658098220825, "epoch": 1.9246164109406272, "step": 5770 }, { "epoch": 1.9246164109406272, "ref_ce_loss": 0.19037601351737976, "step": 5770 }, { "epoch": 1.9246164109406272, "loss": 1.2562201023101807, "step": 5770 }, { "ce_loss": 0.3546963930130005, "epoch": 1.9246164109406272, "step": 5770 }, { "distill_loss": 0.569050669670105, "epoch": 1.9246164109406272, "step": 5770 }, { "epoch": 1.9246164109406272, "ref_ce_loss": 0.2440304160118103, "step": 5770 }, { "epoch": 1.9279519679786525, "loss": 1.1791, "step": 5780 }, { "epoch": 1.9279519679786525, "grad_norm": 1.8017394542694092, "step": 5780 }, { "epoch": 1.9279519679786525, "learning_rate": 0.0007456876985746199, "step": 5780 }, { "epoch": 1.9279519679786525, "loss": 1.1710878610610962, "step": 5780 }, { "ce_loss": 0.27451661229133606, "epoch": 1.9279519679786525, "step": 5780 }, { "distill_loss": 0.5686822533607483, "epoch": 1.9279519679786525, "step": 5780 }, { "epoch": 1.9279519679786525, "ref_ce_loss": 0.21199308335781097, "step": 5780 }, { "epoch": 1.9279519679786525, "loss": 0.9214657545089722, "step": 5780 }, { "ce_loss": 0.23972466588020325, "epoch": 1.9279519679786525, "step": 5780 }, { "distill_loss": 0.43682003021240234, "epoch": 1.9279519679786525, "step": 5780 }, { "epoch": 1.9279519679786525, "ref_ce_loss": 0.19800573587417603, "step": 5780 }, { "epoch": 1.9312875250166779, "loss": 1.1405, "step": 5790 }, { "epoch": 1.9312875250166779, "grad_norm": 1.826278805732727, "step": 5790 }, { "epoch": 1.9312875250166779, "learning_rate": 0.0007454700851139903, "step": 5790 }, { "epoch": 1.9312875250166779, "loss": 0.8356040120124817, "step": 5790 }, { "ce_loss": 0.23331232368946075, "epoch": 1.9312875250166779, "step": 5790 }, { "distill_loss": 0.38790708780288696, "epoch": 1.9312875250166779, "step": 5790 }, { "epoch": 1.9312875250166779, "ref_ce_loss": 0.15372376143932343, "step": 5790 }, { "epoch": 1.9312875250166779, "loss": 1.0069454908370972, "step": 5790 }, { "ce_loss": 0.28266188502311707, "epoch": 1.9312875250166779, "step": 5790 }, { "distill_loss": 0.360159695148468, "epoch": 1.9312875250166779, "step": 5790 }, { "epoch": 1.9312875250166779, "ref_ce_loss": 0.22955982387065887, "step": 5790 }, { "epoch": 1.9346230820547032, "loss": 1.1853, "step": 5800 }, { "epoch": 1.9346230820547032, "grad_norm": 2.1900691986083984, "step": 5800 }, { "epoch": 1.9346230820547032, "learning_rate": 0.0007452520684525974, "step": 5800 }, { "epoch": 1.9346230820547032, "loss": 1.210611343383789, "step": 5800 }, { "ce_loss": 0.26552486419677734, "epoch": 1.9346230820547032, "step": 5800 }, { "distill_loss": 0.3750151991844177, "epoch": 1.9346230820547032, "step": 5800 }, { "epoch": 1.9346230820547032, "ref_ce_loss": 0.21927280724048615, "step": 5800 }, { "epoch": 1.9346230820547032, "loss": 1.9376190900802612, "step": 5800 }, { "ce_loss": 0.4320092797279358, "epoch": 1.9346230820547032, "step": 5800 }, { "distill_loss": 0.4518512487411499, "epoch": 1.9346230820547032, "step": 5800 }, { "epoch": 1.9346230820547032, "ref_ce_loss": 0.2782735526561737, "step": 5800 }, { "epoch": 1.9379586390927286, "loss": 1.2453, "step": 5810 }, { "epoch": 1.9379586390927286, "grad_norm": 2.809798002243042, "step": 5810 }, { "epoch": 1.9379586390927286, "learning_rate": 0.0007450336488448899, "step": 5810 }, { "epoch": 1.9379586390927286, "loss": 1.1590726375579834, "step": 5810 }, { "ce_loss": 0.2636038064956665, "epoch": 1.9379586390927286, "step": 5810 }, { "distill_loss": 0.6099088191986084, "epoch": 1.9379586390927286, "step": 5810 }, { "epoch": 1.9379586390927286, "ref_ce_loss": 0.18551239371299744, "step": 5810 }, { "epoch": 1.9379586390927286, "loss": 1.3410998582839966, "step": 5810 }, { "ce_loss": 0.37131693959236145, "epoch": 1.9379586390927286, "step": 5810 }, { "distill_loss": 0.5650545358657837, "epoch": 1.9379586390927286, "step": 5810 }, { "epoch": 1.9379586390927286, "ref_ce_loss": 0.1918272227048874, "step": 5810 }, { "epoch": 1.941294196130754, "loss": 1.1365, "step": 5820 }, { "epoch": 1.941294196130754, "grad_norm": 2.5075430870056152, "step": 5820 }, { "epoch": 1.941294196130754, "learning_rate": 0.0007448148265457871, "step": 5820 }, { "epoch": 1.941294196130754, "loss": 0.9481791257858276, "step": 5820 }, { "ce_loss": 0.22015568614006042, "epoch": 1.941294196130754, "step": 5820 }, { "distill_loss": 0.37985068559646606, "epoch": 1.941294196130754, "step": 5820 }, { "epoch": 1.941294196130754, "ref_ce_loss": 0.19499360024929047, "step": 5820 }, { "epoch": 1.941294196130754, "loss": 1.1051946878433228, "step": 5820 }, { "ce_loss": 0.24825023114681244, "epoch": 1.941294196130754, "step": 5820 }, { "distill_loss": 0.37336069345474243, "epoch": 1.941294196130754, "step": 5820 }, { "epoch": 1.941294196130754, "ref_ce_loss": 0.23749548196792603, "step": 5820 }, { "epoch": 1.9446297531687793, "loss": 1.1193, "step": 5830 }, { "epoch": 1.9446297531687793, "grad_norm": 1.9296715259552002, "step": 5830 }, { "epoch": 1.9446297531687793, "learning_rate": 0.000744595601810678, "step": 5830 }, { "epoch": 1.9446297531687793, "loss": 1.1018894910812378, "step": 5830 }, { "ce_loss": 0.333945631980896, "epoch": 1.9446297531687793, "step": 5830 }, { "distill_loss": 0.43748483061790466, "epoch": 1.9446297531687793, "step": 5830 }, { "epoch": 1.9446297531687793, "ref_ce_loss": 0.20613163709640503, "step": 5830 }, { "epoch": 1.9446297531687793, "loss": 0.9875980615615845, "step": 5830 }, { "ce_loss": 0.2940070629119873, "epoch": 1.9446297531687793, "step": 5830 }, { "distill_loss": 0.3901951313018799, "epoch": 1.9446297531687793, "step": 5830 }, { "epoch": 1.9446297531687793, "ref_ce_loss": 0.22852958738803864, "step": 5830 }, { "epoch": 1.9479653102068046, "loss": 1.0817, "step": 5840 }, { "epoch": 1.9479653102068046, "grad_norm": 2.464088201522827, "step": 5840 }, { "epoch": 1.9479653102068046, "learning_rate": 0.0007443759748954217, "step": 5840 }, { "epoch": 1.9479653102068046, "loss": 1.3885273933410645, "step": 5840 }, { "ce_loss": 0.3946130573749542, "epoch": 1.9479653102068046, "step": 5840 }, { "distill_loss": 0.4795334041118622, "epoch": 1.9479653102068046, "step": 5840 }, { "epoch": 1.9479653102068046, "ref_ce_loss": 0.2565721869468689, "step": 5840 }, { "epoch": 1.9479653102068046, "loss": 1.389933466911316, "step": 5840 }, { "ce_loss": 0.2812231481075287, "epoch": 1.9479653102068046, "step": 5840 }, { "distill_loss": 0.4569879174232483, "epoch": 1.9479653102068046, "step": 5840 }, { "epoch": 1.9479653102068046, "ref_ce_loss": 0.22854776680469513, "step": 5840 }, { "epoch": 1.95130086724483, "loss": 1.2102, "step": 5850 }, { "epoch": 1.95130086724483, "grad_norm": 1.5864168405532837, "step": 5850 }, { "epoch": 1.95130086724483, "learning_rate": 0.0007441559460563461, "step": 5850 }, { "epoch": 1.95130086724483, "loss": 1.2205491065979004, "step": 5850 }, { "ce_loss": 0.3103644549846649, "epoch": 1.95130086724483, "step": 5850 }, { "distill_loss": 0.41929182410240173, "epoch": 1.95130086724483, "step": 5850 }, { "epoch": 1.95130086724483, "ref_ce_loss": 0.2448168694972992, "step": 5850 }, { "epoch": 1.95130086724483, "loss": 1.2417341470718384, "step": 5850 }, { "ce_loss": 0.29796847701072693, "epoch": 1.95130086724483, "step": 5850 }, { "distill_loss": 0.4613056480884552, "epoch": 1.95130086724483, "step": 5850 }, { "epoch": 1.95130086724483, "ref_ce_loss": 0.19173882901668549, "step": 5850 }, { "epoch": 1.9546364242828553, "loss": 1.3159, "step": 5860 }, { "epoch": 1.9546364242828553, "grad_norm": 2.4112977981567383, "step": 5860 }, { "epoch": 1.9546364242828553, "learning_rate": 0.0007439355155502489, "step": 5860 }, { "epoch": 1.9546364242828553, "loss": 1.291137933731079, "step": 5860 }, { "ce_loss": 0.3205377459526062, "epoch": 1.9546364242828553, "step": 5860 }, { "distill_loss": 0.4666355550289154, "epoch": 1.9546364242828553, "step": 5860 }, { "epoch": 1.9546364242828553, "ref_ce_loss": 0.23071230947971344, "step": 5860 }, { "epoch": 1.9546364242828553, "loss": 1.234226942062378, "step": 5860 }, { "ce_loss": 0.3565260171890259, "epoch": 1.9546364242828553, "step": 5860 }, { "distill_loss": 0.5676701068878174, "epoch": 1.9546364242828553, "step": 5860 }, { "epoch": 1.9546364242828553, "ref_ce_loss": 0.2438981682062149, "step": 5860 }, { "epoch": 1.9579719813208807, "loss": 1.1518, "step": 5870 }, { "epoch": 1.9579719813208807, "grad_norm": 2.057464122772217, "step": 5870 }, { "epoch": 1.9579719813208807, "learning_rate": 0.0007437146836343961, "step": 5870 }, { "epoch": 1.9579719813208807, "loss": 1.0851755142211914, "step": 5870 }, { "ce_loss": 0.3205083906650543, "epoch": 1.9579719813208807, "step": 5870 }, { "distill_loss": 0.47727417945861816, "epoch": 1.9579719813208807, "step": 5870 }, { "epoch": 1.9579719813208807, "ref_ce_loss": 0.2165471762418747, "step": 5870 }, { "epoch": 1.9579719813208807, "loss": 1.249542474746704, "step": 5870 }, { "ce_loss": 0.2969149351119995, "epoch": 1.9579719813208807, "step": 5870 }, { "distill_loss": 0.4526827335357666, "epoch": 1.9579719813208807, "step": 5870 }, { "epoch": 1.9579719813208807, "ref_ce_loss": 0.2169814556837082, "step": 5870 }, { "epoch": 1.961307538358906, "loss": 1.1561, "step": 5880 }, { "epoch": 1.961307538358906, "grad_norm": 2.307563543319702, "step": 5880 }, { "epoch": 1.961307538358906, "learning_rate": 0.0007434934505665223, "step": 5880 }, { "epoch": 1.961307538358906, "loss": 1.1310524940490723, "step": 5880 }, { "ce_loss": 0.3645102083683014, "epoch": 1.961307538358906, "step": 5880 }, { "distill_loss": 0.4820353090763092, "epoch": 1.961307538358906, "step": 5880 }, { "epoch": 1.961307538358906, "ref_ce_loss": 0.2054779976606369, "step": 5880 }, { "epoch": 1.961307538358906, "loss": 1.1591945886611938, "step": 5880 }, { "ce_loss": 0.2980738878250122, "epoch": 1.961307538358906, "step": 5880 }, { "distill_loss": 0.39868849515914917, "epoch": 1.961307538358906, "step": 5880 }, { "epoch": 1.961307538358906, "ref_ce_loss": 0.2012583613395691, "step": 5880 }, { "epoch": 1.9646430953969314, "loss": 1.2105, "step": 5890 }, { "epoch": 1.9646430953969314, "grad_norm": 2.318319082260132, "step": 5890 }, { "epoch": 1.9646430953969314, "learning_rate": 0.0007432718166048301, "step": 5890 }, { "epoch": 1.9646430953969314, "loss": 1.6052378416061401, "step": 5890 }, { "ce_loss": 0.36234989762306213, "epoch": 1.9646430953969314, "step": 5890 }, { "distill_loss": 0.526151180267334, "epoch": 1.9646430953969314, "step": 5890 }, { "epoch": 1.9646430953969314, "ref_ce_loss": 0.29334115982055664, "step": 5890 }, { "epoch": 1.9646430953969314, "loss": 1.3489465713500977, "step": 5890 }, { "ce_loss": 0.304647833108902, "epoch": 1.9646430953969314, "step": 5890 }, { "distill_loss": 0.5583611130714417, "epoch": 1.9646430953969314, "step": 5890 }, { "epoch": 1.9646430953969314, "ref_ce_loss": 0.22426193952560425, "step": 5890 }, { "epoch": 1.9679786524349567, "loss": 1.2969, "step": 5900 }, { "epoch": 1.9679786524349567, "grad_norm": 2.613701820373535, "step": 5900 }, { "epoch": 1.9679786524349567, "learning_rate": 0.0007430497820079903, "step": 5900 }, { "epoch": 1.9679786524349567, "loss": 1.1732020378112793, "step": 5900 }, { "ce_loss": 0.3025830388069153, "epoch": 1.9679786524349567, "step": 5900 }, { "distill_loss": 0.4889051616191864, "epoch": 1.9679786524349567, "step": 5900 }, { "epoch": 1.9679786524349567, "ref_ce_loss": 0.17684857547283173, "step": 5900 }, { "epoch": 1.9679786524349567, "loss": 1.0720723867416382, "step": 5900 }, { "ce_loss": 0.3472079932689667, "epoch": 1.9679786524349567, "step": 5900 }, { "distill_loss": 0.4841811954975128, "epoch": 1.9679786524349567, "step": 5900 }, { "epoch": 1.9679786524349567, "ref_ce_loss": 0.24045956134796143, "step": 5900 }, { "epoch": 1.971314209472982, "loss": 1.1779, "step": 5910 }, { "epoch": 1.971314209472982, "grad_norm": 2.5917856693267822, "step": 5910 }, { "epoch": 1.971314209472982, "learning_rate": 0.0007428273470351414, "step": 5910 }, { "epoch": 1.971314209472982, "loss": 1.2020492553710938, "step": 5910 }, { "ce_loss": 0.34799009561538696, "epoch": 1.971314209472982, "step": 5910 }, { "distill_loss": 0.49872174859046936, "epoch": 1.971314209472982, "step": 5910 }, { "epoch": 1.971314209472982, "ref_ce_loss": 0.2092563956975937, "step": 5910 }, { "epoch": 1.971314209472982, "loss": 1.0802489519119263, "step": 5910 }, { "ce_loss": 0.2857528030872345, "epoch": 1.971314209472982, "step": 5910 }, { "distill_loss": 0.4485069215297699, "epoch": 1.971314209472982, "step": 5910 }, { "epoch": 1.971314209472982, "ref_ce_loss": 0.24490563571453094, "step": 5910 }, { "epoch": 1.9746497665110074, "loss": 1.141, "step": 5920 }, { "epoch": 1.9746497665110074, "grad_norm": 1.7976915836334229, "step": 5920 }, { "epoch": 1.9746497665110074, "learning_rate": 0.0007426045119458886, "step": 5920 }, { "epoch": 1.9746497665110074, "loss": 0.9701898097991943, "step": 5920 }, { "ce_loss": 0.20789991319179535, "epoch": 1.9746497665110074, "step": 5920 }, { "distill_loss": 0.3563303053379059, "epoch": 1.9746497665110074, "step": 5920 }, { "epoch": 1.9746497665110074, "ref_ce_loss": 0.2152089774608612, "step": 5920 }, { "epoch": 1.9746497665110074, "loss": 1.2094759941101074, "step": 5920 }, { "ce_loss": 0.335591584444046, "epoch": 1.9746497665110074, "step": 5920 }, { "distill_loss": 0.4801439642906189, "epoch": 1.9746497665110074, "step": 5920 }, { "epoch": 1.9746497665110074, "ref_ce_loss": 0.23890548944473267, "step": 5920 }, { "epoch": 1.9779853235490328, "loss": 1.1592, "step": 5930 }, { "epoch": 1.9779853235490328, "grad_norm": 1.8345674276351929, "step": 5930 }, { "epoch": 1.9779853235490328, "learning_rate": 0.0007423812770003046, "step": 5930 }, { "epoch": 1.9779853235490328, "loss": 1.3806750774383545, "step": 5930 }, { "ce_loss": 0.2798145115375519, "epoch": 1.9779853235490328, "step": 5930 }, { "distill_loss": 0.43999797105789185, "epoch": 1.9779853235490328, "step": 5930 }, { "epoch": 1.9779853235490328, "ref_ce_loss": 0.22765986621379852, "step": 5930 }, { "epoch": 1.9779853235490328, "loss": 1.184518575668335, "step": 5930 }, { "ce_loss": 0.34069424867630005, "epoch": 1.9779853235490328, "step": 5930 }, { "distill_loss": 0.4194449782371521, "epoch": 1.9779853235490328, "step": 5930 }, { "epoch": 1.9779853235490328, "ref_ce_loss": 0.25200286507606506, "step": 5930 }, { "epoch": 1.9813208805870581, "loss": 1.1941, "step": 5940 }, { "epoch": 1.9813208805870581, "grad_norm": 2.0744800567626953, "step": 5940 }, { "epoch": 1.9813208805870581, "learning_rate": 0.0007421576424589287, "step": 5940 }, { "epoch": 1.9813208805870581, "loss": 1.2190189361572266, "step": 5940 }, { "ce_loss": 0.31040140986442566, "epoch": 1.9813208805870581, "step": 5940 }, { "distill_loss": 0.5107767581939697, "epoch": 1.9813208805870581, "step": 5940 }, { "epoch": 1.9813208805870581, "ref_ce_loss": 0.19094134867191315, "step": 5940 }, { "epoch": 1.9813208805870581, "loss": 1.0603785514831543, "step": 5940 }, { "ce_loss": 0.3433583676815033, "epoch": 1.9813208805870581, "step": 5940 }, { "distill_loss": 0.46275004744529724, "epoch": 1.9813208805870581, "step": 5940 }, { "epoch": 1.9813208805870581, "ref_ce_loss": 0.25405070185661316, "step": 5940 }, { "epoch": 1.9846564376250835, "loss": 1.1856, "step": 5950 }, { "epoch": 1.9846564376250835, "grad_norm": 1.5472403764724731, "step": 5950 }, { "epoch": 1.9846564376250835, "learning_rate": 0.0007419336085827664, "step": 5950 }, { "epoch": 1.9846564376250835, "loss": 1.1374337673187256, "step": 5950 }, { "ce_loss": 0.27851608395576477, "epoch": 1.9846564376250835, "step": 5950 }, { "distill_loss": 0.44976139068603516, "epoch": 1.9846564376250835, "step": 5950 }, { "epoch": 1.9846564376250835, "ref_ce_loss": 0.2229611575603485, "step": 5950 }, { "epoch": 1.9846564376250835, "loss": 1.1625683307647705, "step": 5950 }, { "ce_loss": 0.36853551864624023, "epoch": 1.9846564376250835, "step": 5950 }, { "distill_loss": 0.4269828796386719, "epoch": 1.9846564376250835, "step": 5950 }, { "epoch": 1.9846564376250835, "ref_ce_loss": 0.2808544933795929, "step": 5950 }, { "epoch": 1.9879919946631088, "loss": 1.2058, "step": 5960 }, { "epoch": 1.9879919946631088, "grad_norm": 1.7128957509994507, "step": 5960 }, { "epoch": 1.9879919946631088, "learning_rate": 0.0007417091756332892, "step": 5960 }, { "epoch": 1.9879919946631088, "loss": 1.47935950756073, "step": 5960 }, { "ce_loss": 0.33494773507118225, "epoch": 1.9879919946631088, "step": 5960 }, { "distill_loss": 0.5610784292221069, "epoch": 1.9879919946631088, "step": 5960 }, { "epoch": 1.9879919946631088, "ref_ce_loss": 0.2407287359237671, "step": 5960 }, { "epoch": 1.9879919946631088, "loss": 1.112879991531372, "step": 5960 }, { "ce_loss": 0.3090471923351288, "epoch": 1.9879919946631088, "step": 5960 }, { "distill_loss": 0.5229696035385132, "epoch": 1.9879919946631088, "step": 5960 }, { "epoch": 1.9879919946631088, "ref_ce_loss": 0.2024012953042984, "step": 5960 }, { "epoch": 1.9913275517011342, "loss": 1.1274, "step": 5970 }, { "epoch": 1.9913275517011342, "grad_norm": 7.496943473815918, "step": 5970 }, { "epoch": 1.9913275517011342, "learning_rate": 0.0007414843438724346, "step": 5970 }, { "epoch": 1.9913275517011342, "loss": 1.0705974102020264, "step": 5970 }, { "ce_loss": 0.2735450267791748, "epoch": 1.9913275517011342, "step": 5970 }, { "distill_loss": 0.4013446271419525, "epoch": 1.9913275517011342, "step": 5970 }, { "epoch": 1.9913275517011342, "ref_ce_loss": 0.22743751108646393, "step": 5970 }, { "epoch": 1.9913275517011342, "loss": 1.3012447357177734, "step": 5970 }, { "ce_loss": 0.3357171416282654, "epoch": 1.9913275517011342, "step": 5970 }, { "distill_loss": 0.48471227288246155, "epoch": 1.9913275517011342, "step": 5970 }, { "epoch": 1.9913275517011342, "ref_ce_loss": 0.2686719298362732, "step": 5970 }, { "epoch": 1.9946631087391595, "loss": 1.2532, "step": 5980 }, { "epoch": 1.9946631087391595, "grad_norm": 2.4035911560058594, "step": 5980 }, { "epoch": 1.9946631087391595, "learning_rate": 0.0007412591135626056, "step": 5980 }, { "epoch": 1.9946631087391595, "loss": 1.2545922994613647, "step": 5980 }, { "ce_loss": 0.3356133699417114, "epoch": 1.9946631087391595, "step": 5980 }, { "distill_loss": 0.40619805455207825, "epoch": 1.9946631087391595, "step": 5980 }, { "epoch": 1.9946631087391595, "ref_ce_loss": 0.2650473713874817, "step": 5980 }, { "epoch": 1.9946631087391595, "loss": 1.198852300643921, "step": 5980 }, { "ce_loss": 0.31099480390548706, "epoch": 1.9946631087391595, "step": 5980 }, { "distill_loss": 0.37024760246276855, "epoch": 1.9946631087391595, "step": 5980 }, { "epoch": 1.9946631087391595, "ref_ce_loss": 0.22776618599891663, "step": 5980 }, { "epoch": 1.9979986657771849, "loss": 1.145, "step": 5990 }, { "epoch": 1.9979986657771849, "grad_norm": 1.7393022775650024, "step": 5990 }, { "epoch": 1.9979986657771849, "learning_rate": 0.0007410334849666699, "step": 5990 }, { "epoch": 1.9979986657771849, "loss": 1.2356904745101929, "step": 5990 }, { "ce_loss": 0.34358200430870056, "epoch": 1.9979986657771849, "step": 5990 }, { "distill_loss": 0.4329187870025635, "epoch": 1.9979986657771849, "step": 5990 }, { "epoch": 1.9979986657771849, "ref_ce_loss": 0.22351956367492676, "step": 5990 }, { "epoch": 1.9979986657771849, "loss": 0.9917151927947998, "step": 5990 }, { "ce_loss": 0.3190557658672333, "epoch": 1.9979986657771849, "step": 5990 }, { "distill_loss": 0.4740646779537201, "epoch": 1.9979986657771849, "step": 5990 }, { "epoch": 1.9979986657771849, "ref_ce_loss": 0.19807963073253632, "step": 5990 }, { "epoch": 2.0013342228152102, "loss": 1.1926, "step": 6000 }, { "epoch": 2.0013342228152102, "grad_norm": 2.1108322143554688, "step": 6000 }, { "epoch": 2.0013342228152102, "learning_rate": 0.0007408074583479605, "step": 6000 }, { "epoch": 2.0013342228152102, "loss": 1.3016316890716553, "step": 6000 }, { "ce_loss": 0.3355659544467926, "epoch": 2.0013342228152102, "step": 6000 }, { "distill_loss": 0.5252490043640137, "epoch": 2.0013342228152102, "step": 6000 }, { "epoch": 2.0013342228152102, "ref_ce_loss": 0.21098066866397858, "step": 6000 }, { "epoch": 2.0013342228152102, "loss": 1.3800817728042603, "step": 6000 }, { "ce_loss": 0.3436371088027954, "epoch": 2.0013342228152102, "step": 6000 }, { "distill_loss": 0.4823884963989258, "epoch": 2.0013342228152102, "step": 6000 }, { "epoch": 2.0013342228152102, "ref_ce_loss": 0.2499440461397171, "step": 6000 }, { "epoch": 2.0046697798532356, "loss": 1.1643, "step": 6010 }, { "epoch": 2.0046697798532356, "grad_norm": 2.1682538986206055, "step": 6010 }, { "epoch": 2.0046697798532356, "learning_rate": 0.0007405810339702751, "step": 6010 }, { "epoch": 2.0046697798532356, "loss": 1.1670124530792236, "step": 6010 }, { "ce_loss": 0.2908676862716675, "epoch": 2.0046697798532356, "step": 6010 }, { "distill_loss": 0.44854360818862915, "epoch": 2.0046697798532356, "step": 6010 }, { "epoch": 2.0046697798532356, "ref_ce_loss": 0.2258313000202179, "step": 6010 }, { "epoch": 2.0046697798532356, "loss": 1.1812759637832642, "step": 6010 }, { "ce_loss": 0.27601760625839233, "epoch": 2.0046697798532356, "step": 6010 }, { "distill_loss": 0.5683032274246216, "epoch": 2.0046697798532356, "step": 6010 }, { "epoch": 2.0046697798532356, "ref_ce_loss": 0.24193869531154633, "step": 6010 }, { "epoch": 2.008005336891261, "loss": 1.1144, "step": 6020 }, { "epoch": 2.008005336891261, "grad_norm": 2.0215084552764893, "step": 6020 }, { "epoch": 2.008005336891261, "learning_rate": 0.0007403542120978747, "step": 6020 }, { "epoch": 2.008005336891261, "loss": 1.12555992603302, "step": 6020 }, { "ce_loss": 0.22603082656860352, "epoch": 2.008005336891261, "step": 6020 }, { "distill_loss": 0.45316022634506226, "epoch": 2.008005336891261, "step": 6020 }, { "epoch": 2.008005336891261, "ref_ce_loss": 0.18625034391880035, "step": 6020 }, { "epoch": 2.008005336891261, "loss": 1.530777931213379, "step": 6020 }, { "ce_loss": 0.37890389561653137, "epoch": 2.008005336891261, "step": 6020 }, { "distill_loss": 0.6050182580947876, "epoch": 2.008005336891261, "step": 6020 }, { "epoch": 2.008005336891261, "ref_ce_loss": 0.25829702615737915, "step": 6020 }, { "epoch": 2.0113408939292863, "loss": 1.2366, "step": 6030 }, { "epoch": 2.0113408939292863, "grad_norm": 2.093440055847168, "step": 6030 }, { "epoch": 2.0113408939292863, "learning_rate": 0.0007401269929954853, "step": 6030 }, { "epoch": 2.0113408939292863, "loss": 1.0343910455703735, "step": 6030 }, { "ce_loss": 0.24603171646595, "epoch": 2.0113408939292863, "step": 6030 }, { "distill_loss": 0.5099126100540161, "epoch": 2.0113408939292863, "step": 6030 }, { "epoch": 2.0113408939292863, "ref_ce_loss": 0.21466688811779022, "step": 6030 }, { "epoch": 2.0113408939292863, "loss": 0.9694656729698181, "step": 6030 }, { "ce_loss": 0.27798622846603394, "epoch": 2.0113408939292863, "step": 6030 }, { "distill_loss": 0.3973238468170166, "epoch": 2.0113408939292863, "step": 6030 }, { "epoch": 2.0113408939292863, "ref_ce_loss": 0.2442408949136734, "step": 6030 }, { "epoch": 2.0146764509673116, "loss": 1.0951, "step": 6040 }, { "epoch": 2.0146764509673116, "grad_norm": 1.586016297340393, "step": 6040 }, { "epoch": 2.0146764509673116, "learning_rate": 0.0007398993769282959, "step": 6040 }, { "epoch": 2.0146764509673116, "loss": 1.0159122943878174, "step": 6040 }, { "ce_loss": 0.2615586221218109, "epoch": 2.0146764509673116, "step": 6040 }, { "distill_loss": 0.44198697805404663, "epoch": 2.0146764509673116, "step": 6040 }, { "epoch": 2.0146764509673116, "ref_ce_loss": 0.24152494966983795, "step": 6040 }, { "epoch": 2.0146764509673116, "loss": 0.9031857252120972, "step": 6040 }, { "ce_loss": 0.26589229702949524, "epoch": 2.0146764509673116, "step": 6040 }, { "distill_loss": 0.4166644513607025, "epoch": 2.0146764509673116, "step": 6040 }, { "epoch": 2.0146764509673116, "ref_ce_loss": 0.22043243050575256, "step": 6040 }, { "epoch": 2.018012008005337, "loss": 1.1342, "step": 6050 }, { "epoch": 2.018012008005337, "grad_norm": 3.1845755577087402, "step": 6050 }, { "epoch": 2.018012008005337, "learning_rate": 0.0007396713641619588, "step": 6050 }, { "epoch": 2.018012008005337, "loss": 1.1987489461898804, "step": 6050 }, { "ce_loss": 0.3881857693195343, "epoch": 2.018012008005337, "step": 6050 }, { "distill_loss": 0.4965546131134033, "epoch": 2.018012008005337, "step": 6050 }, { "epoch": 2.018012008005337, "ref_ce_loss": 0.2514057755470276, "step": 6050 }, { "epoch": 2.018012008005337, "loss": 0.9910258054733276, "step": 6050 }, { "ce_loss": 0.2973671853542328, "epoch": 2.018012008005337, "step": 6050 }, { "distill_loss": 0.4325452744960785, "epoch": 2.018012008005337, "step": 6050 }, { "epoch": 2.018012008005337, "ref_ce_loss": 0.18317049741744995, "step": 6050 }, { "epoch": 2.0213475650433623, "loss": 1.0778, "step": 6060 }, { "epoch": 2.0213475650433623, "grad_norm": 1.746089220046997, "step": 6060 }, { "epoch": 2.0213475650433623, "learning_rate": 0.0007394429549625898, "step": 6060 }, { "epoch": 2.0213475650433623, "loss": 0.892999529838562, "step": 6060 }, { "ce_loss": 0.2477852702140808, "epoch": 2.0213475650433623, "step": 6060 }, { "distill_loss": 0.45223918557167053, "epoch": 2.0213475650433623, "step": 6060 }, { "epoch": 2.0213475650433623, "ref_ce_loss": 0.19280417263507843, "step": 6060 }, { "epoch": 2.0213475650433623, "loss": 1.0602220296859741, "step": 6060 }, { "ce_loss": 0.32873767614364624, "epoch": 2.0213475650433623, "step": 6060 }, { "distill_loss": 0.4844486117362976, "epoch": 2.0213475650433623, "step": 6060 }, { "epoch": 2.0213475650433623, "ref_ce_loss": 0.18138428032398224, "step": 6060 }, { "epoch": 2.0246831220813877, "loss": 1.143, "step": 6070 }, { "epoch": 2.0246831220813877, "grad_norm": 3.508035898208618, "step": 6070 }, { "epoch": 2.0246831220813877, "learning_rate": 0.0007392141495967666, "step": 6070 }, { "epoch": 2.0246831220813877, "loss": 1.2129606008529663, "step": 6070 }, { "ce_loss": 0.3241889774799347, "epoch": 2.0246831220813877, "step": 6070 }, { "distill_loss": 0.505097508430481, "epoch": 2.0246831220813877, "step": 6070 }, { "epoch": 2.0246831220813877, "ref_ce_loss": 0.19585412740707397, "step": 6070 }, { "epoch": 2.0246831220813877, "loss": 1.2293137311935425, "step": 6070 }, { "ce_loss": 0.36307594180107117, "epoch": 2.0246831220813877, "step": 6070 }, { "distill_loss": 0.4744553565979004, "epoch": 2.0246831220813877, "step": 6070 }, { "epoch": 2.0246831220813877, "ref_ce_loss": 0.29934239387512207, "step": 6070 }, { "epoch": 2.028018679119413, "loss": 1.1642, "step": 6080 }, { "epoch": 2.028018679119413, "grad_norm": 2.075115919113159, "step": 6080 }, { "epoch": 2.028018679119413, "learning_rate": 0.00073898494833153, "step": 6080 }, { "epoch": 2.028018679119413, "loss": 0.8387904167175293, "step": 6080 }, { "ce_loss": 0.2569155991077423, "epoch": 2.028018679119413, "step": 6080 }, { "distill_loss": 0.4047635495662689, "epoch": 2.028018679119413, "step": 6080 }, { "epoch": 2.028018679119413, "ref_ce_loss": 0.17692382633686066, "step": 6080 }, { "epoch": 2.028018679119413, "loss": 1.2788372039794922, "step": 6080 }, { "ce_loss": 0.41565200686454773, "epoch": 2.028018679119413, "step": 6080 }, { "distill_loss": 0.4378134310245514, "epoch": 2.028018679119413, "step": 6080 }, { "epoch": 2.028018679119413, "ref_ce_loss": 0.2810961604118347, "step": 6080 }, { "epoch": 2.0313542361574384, "loss": 1.1467, "step": 6090 }, { "epoch": 2.0313542361574384, "grad_norm": 1.7534302473068237, "step": 6090 }, { "epoch": 2.0313542361574384, "learning_rate": 0.0007387553514343824, "step": 6090 }, { "epoch": 2.0313542361574384, "loss": 0.9590702652931213, "step": 6090 }, { "ce_loss": 0.23299598693847656, "epoch": 2.0313542361574384, "step": 6090 }, { "distill_loss": 0.42754676938056946, "epoch": 2.0313542361574384, "step": 6090 }, { "epoch": 2.0313542361574384, "ref_ce_loss": 0.20577183365821838, "step": 6090 }, { "epoch": 2.0313542361574384, "loss": 0.794579803943634, "step": 6090 }, { "ce_loss": 0.19693799316883087, "epoch": 2.0313542361574384, "step": 6090 }, { "distill_loss": 0.4145779609680176, "epoch": 2.0313542361574384, "step": 6090 }, { "epoch": 2.0313542361574384, "ref_ce_loss": 0.18288493156433105, "step": 6090 }, { "epoch": 2.0346897931954637, "loss": 1.0794, "step": 6100 }, { "epoch": 2.0346897931954637, "grad_norm": 2.150377035140991, "step": 6100 }, { "epoch": 2.0346897931954637, "learning_rate": 0.000738525359173288, "step": 6100 }, { "epoch": 2.0346897931954637, "loss": 1.006495475769043, "step": 6100 }, { "ce_loss": 0.2969522476196289, "epoch": 2.0346897931954637, "step": 6100 }, { "distill_loss": 0.39455148577690125, "epoch": 2.0346897931954637, "step": 6100 }, { "epoch": 2.0346897931954637, "ref_ce_loss": 0.24069784581661224, "step": 6100 }, { "epoch": 2.0346897931954637, "loss": 1.1346238851547241, "step": 6100 }, { "ce_loss": 0.28026166558265686, "epoch": 2.0346897931954637, "step": 6100 }, { "distill_loss": 0.4355485439300537, "epoch": 2.0346897931954637, "step": 6100 }, { "epoch": 2.0346897931954637, "ref_ce_loss": 0.1831492930650711, "step": 6100 }, { "epoch": 2.038025350233489, "loss": 1.0818, "step": 6110 }, { "epoch": 2.038025350233489, "grad_norm": 2.7147910594940186, "step": 6110 }, { "epoch": 2.038025350233489, "learning_rate": 0.0007382949718166726, "step": 6110 }, { "epoch": 2.038025350233489, "loss": 1.0828402042388916, "step": 6110 }, { "ce_loss": 0.2484767884016037, "epoch": 2.038025350233489, "step": 6110 }, { "distill_loss": 0.46675872802734375, "epoch": 2.038025350233489, "step": 6110 }, { "epoch": 2.038025350233489, "ref_ce_loss": 0.18916720151901245, "step": 6110 }, { "epoch": 2.038025350233489, "loss": 1.155005931854248, "step": 6110 }, { "ce_loss": 0.2683473229408264, "epoch": 2.038025350233489, "step": 6110 }, { "distill_loss": 0.44412925839424133, "epoch": 2.038025350233489, "step": 6110 }, { "epoch": 2.038025350233489, "ref_ce_loss": 0.19835783541202545, "step": 6110 }, { "epoch": 2.0413609072715144, "loss": 1.1626, "step": 6120 }, { "epoch": 2.0413609072715144, "grad_norm": 2.2923474311828613, "step": 6120 }, { "epoch": 2.0413609072715144, "learning_rate": 0.0007380641896334231, "step": 6120 }, { "epoch": 2.0413609072715144, "loss": 1.27774977684021, "step": 6120 }, { "ce_loss": 0.40639743208885193, "epoch": 2.0413609072715144, "step": 6120 }, { "distill_loss": 0.4476247727870941, "epoch": 2.0413609072715144, "step": 6120 }, { "epoch": 2.0413609072715144, "ref_ce_loss": 0.30624353885650635, "step": 6120 }, { "epoch": 2.0413609072715144, "loss": 1.1386736631393433, "step": 6120 }, { "ce_loss": 0.2917673885822296, "epoch": 2.0413609072715144, "step": 6120 }, { "distill_loss": 0.4815749526023865, "epoch": 2.0413609072715144, "step": 6120 }, { "epoch": 2.0413609072715144, "ref_ce_loss": 0.2240479290485382, "step": 6120 }, { "epoch": 2.0446964643095398, "loss": 1.0721, "step": 6130 }, { "epoch": 2.0446964643095398, "grad_norm": 1.971584439277649, "step": 6130 }, { "epoch": 2.0446964643095398, "learning_rate": 0.0007378330128928871, "step": 6130 }, { "epoch": 2.0446964643095398, "loss": 1.1431065797805786, "step": 6130 }, { "ce_loss": 0.3328869938850403, "epoch": 2.0446964643095398, "step": 6130 }, { "distill_loss": 0.5115488767623901, "epoch": 2.0446964643095398, "step": 6130 }, { "epoch": 2.0446964643095398, "ref_ce_loss": 0.23866187036037445, "step": 6130 }, { "epoch": 2.0446964643095398, "loss": 1.0687028169631958, "step": 6130 }, { "ce_loss": 0.33602792024612427, "epoch": 2.0446964643095398, "step": 6130 }, { "distill_loss": 0.4408641457557678, "epoch": 2.0446964643095398, "step": 6130 }, { "epoch": 2.0446964643095398, "ref_ce_loss": 0.21772147715091705, "step": 6130 }, { "epoch": 2.048032021347565, "loss": 1.0638, "step": 6140 }, { "epoch": 2.048032021347565, "grad_norm": 1.7937724590301514, "step": 6140 }, { "epoch": 2.048032021347565, "learning_rate": 0.0007376014418648727, "step": 6140 }, { "epoch": 2.048032021347565, "loss": 1.265535593032837, "step": 6140 }, { "ce_loss": 0.2526908814907074, "epoch": 2.048032021347565, "step": 6140 }, { "distill_loss": 0.37703099846839905, "epoch": 2.048032021347565, "step": 6140 }, { "epoch": 2.048032021347565, "ref_ce_loss": 0.2018091231584549, "step": 6140 }, { "epoch": 2.048032021347565, "loss": 1.0709772109985352, "step": 6140 }, { "ce_loss": 0.27110448479652405, "epoch": 2.048032021347565, "step": 6140 }, { "distill_loss": 0.39831891655921936, "epoch": 2.048032021347565, "step": 6140 }, { "epoch": 2.048032021347565, "ref_ce_loss": 0.2335970103740692, "step": 6140 }, { "epoch": 2.0513675783855905, "loss": 1.1445, "step": 6150 }, { "epoch": 2.0513675783855905, "grad_norm": 1.4948005676269531, "step": 6150 }, { "epoch": 2.0513675783855905, "learning_rate": 0.0007373694768196481, "step": 6150 }, { "epoch": 2.0513675783855905, "loss": 1.0971550941467285, "step": 6150 }, { "ce_loss": 0.29519185423851013, "epoch": 2.0513675783855905, "step": 6150 }, { "distill_loss": 0.5128030180931091, "epoch": 2.0513675783855905, "step": 6150 }, { "epoch": 2.0513675783855905, "ref_ce_loss": 0.21166488528251648, "step": 6150 }, { "epoch": 2.0513675783855905, "loss": 0.9408478736877441, "step": 6150 }, { "ce_loss": 0.29984796047210693, "epoch": 2.0513675783855905, "step": 6150 }, { "distill_loss": 0.4378625452518463, "epoch": 2.0513675783855905, "step": 6150 }, { "epoch": 2.0513675783855905, "ref_ce_loss": 0.20278240740299225, "step": 6150 }, { "epoch": 2.054703135423616, "loss": 1.0888, "step": 6160 }, { "epoch": 2.054703135423616, "grad_norm": 2.1178483963012695, "step": 6160 }, { "epoch": 2.054703135423616, "learning_rate": 0.0007371371180279417, "step": 6160 }, { "epoch": 2.054703135423616, "loss": 1.3761086463928223, "step": 6160 }, { "ce_loss": 0.44446414709091187, "epoch": 2.054703135423616, "step": 6160 }, { "distill_loss": 0.5409948825836182, "epoch": 2.054703135423616, "step": 6160 }, { "epoch": 2.054703135423616, "ref_ce_loss": 0.2181801199913025, "step": 6160 }, { "epoch": 2.054703135423616, "loss": 1.0366450548171997, "step": 6160 }, { "ce_loss": 0.25601834058761597, "epoch": 2.054703135423616, "step": 6160 }, { "distill_loss": 0.39888685941696167, "epoch": 2.054703135423616, "step": 6160 }, { "epoch": 2.054703135423616, "ref_ce_loss": 0.22890298068523407, "step": 6160 }, { "epoch": 2.058038692461641, "loss": 1.0642, "step": 6170 }, { "epoch": 2.058038692461641, "grad_norm": 2.5415828227996826, "step": 6170 }, { "epoch": 2.058038692461641, "learning_rate": 0.0007369043657609412, "step": 6170 }, { "epoch": 2.058038692461641, "loss": 0.9021316170692444, "step": 6170 }, { "ce_loss": 0.22570855915546417, "epoch": 2.058038692461641, "step": 6170 }, { "distill_loss": 0.399993896484375, "epoch": 2.058038692461641, "step": 6170 }, { "epoch": 2.058038692461641, "ref_ce_loss": 0.2156895399093628, "step": 6170 }, { "epoch": 2.058038692461641, "loss": 1.0900871753692627, "step": 6170 }, { "ce_loss": 0.31268736720085144, "epoch": 2.058038692461641, "step": 6170 }, { "distill_loss": 0.5308760404586792, "epoch": 2.058038692461641, "step": 6170 }, { "epoch": 2.058038692461641, "ref_ce_loss": 0.18112565577030182, "step": 6170 }, { "epoch": 2.0613742494996665, "loss": 1.0631, "step": 6180 }, { "epoch": 2.0613742494996665, "grad_norm": 1.4687033891677856, "step": 6180 }, { "epoch": 2.0613742494996665, "learning_rate": 0.0007366712202902933, "step": 6180 }, { "epoch": 2.0613742494996665, "loss": 0.9915813207626343, "step": 6180 }, { "ce_loss": 0.24520814418792725, "epoch": 2.0613742494996665, "step": 6180 }, { "distill_loss": 0.40753301978111267, "epoch": 2.0613742494996665, "step": 6180 }, { "epoch": 2.0613742494996665, "ref_ce_loss": 0.18381255865097046, "step": 6180 }, { "epoch": 2.0613742494996665, "loss": 1.2639784812927246, "step": 6180 }, { "ce_loss": 0.37111666798591614, "epoch": 2.0613742494996665, "step": 6180 }, { "distill_loss": 0.4548932611942291, "epoch": 2.0613742494996665, "step": 6180 }, { "epoch": 2.0613742494996665, "ref_ce_loss": 0.24587491154670715, "step": 6180 }, { "epoch": 2.064709806537692, "loss": 1.2493, "step": 6190 }, { "epoch": 2.064709806537692, "grad_norm": 1.4651398658752441, "step": 6190 }, { "epoch": 2.064709806537692, "learning_rate": 0.0007364376818881042, "step": 6190 }, { "epoch": 2.064709806537692, "loss": 1.2870615720748901, "step": 6190 }, { "ce_loss": 0.31373345851898193, "epoch": 2.064709806537692, "step": 6190 }, { "distill_loss": 0.5066043138504028, "epoch": 2.064709806537692, "step": 6190 }, { "epoch": 2.064709806537692, "ref_ce_loss": 0.25695517659187317, "step": 6190 }, { "epoch": 2.064709806537692, "loss": 1.2238523960113525, "step": 6190 }, { "ce_loss": 0.34804919362068176, "epoch": 2.064709806537692, "step": 6190 }, { "distill_loss": 0.5069231986999512, "epoch": 2.064709806537692, "step": 6190 }, { "epoch": 2.064709806537692, "ref_ce_loss": 0.25139155983924866, "step": 6190 }, { "epoch": 2.068045363575717, "loss": 1.1388, "step": 6200 }, { "epoch": 2.068045363575717, "grad_norm": 1.8307620286941528, "step": 6200 }, { "epoch": 2.068045363575717, "learning_rate": 0.000736203750826938, "step": 6200 }, { "epoch": 2.068045363575717, "loss": 1.0272313356399536, "step": 6200 }, { "ce_loss": 0.26185503602027893, "epoch": 2.068045363575717, "step": 6200 }, { "distill_loss": 0.4729871153831482, "epoch": 2.068045363575717, "step": 6200 }, { "epoch": 2.068045363575717, "ref_ce_loss": 0.22979292273521423, "step": 6200 }, { "epoch": 2.068045363575717, "loss": 1.1139501333236694, "step": 6200 }, { "ce_loss": 0.35409292578697205, "epoch": 2.068045363575717, "step": 6200 }, { "distill_loss": 0.460551381111145, "epoch": 2.068045363575717, "step": 6200 }, { "epoch": 2.068045363575717, "ref_ce_loss": 0.23284848034381866, "step": 6200 }, { "epoch": 2.0713809206137426, "loss": 1.0942, "step": 6210 }, { "epoch": 2.0713809206137426, "grad_norm": 1.6087286472320557, "step": 6210 }, { "epoch": 2.0713809206137426, "learning_rate": 0.0007359694273798175, "step": 6210 }, { "epoch": 2.0713809206137426, "loss": 1.0432158708572388, "step": 6210 }, { "ce_loss": 0.28722330927848816, "epoch": 2.0713809206137426, "step": 6210 }, { "distill_loss": 0.44095054268836975, "epoch": 2.0713809206137426, "step": 6210 }, { "epoch": 2.0713809206137426, "ref_ce_loss": 0.21269214153289795, "step": 6210 }, { "epoch": 2.0713809206137426, "loss": 0.9859718680381775, "step": 6210 }, { "ce_loss": 0.2458011358976364, "epoch": 2.0713809206137426, "step": 6210 }, { "distill_loss": 0.3622279167175293, "epoch": 2.0713809206137426, "step": 6210 }, { "epoch": 2.0713809206137426, "ref_ce_loss": 0.18419009447097778, "step": 6210 }, { "epoch": 2.074716477651768, "loss": 1.0005, "step": 6220 }, { "epoch": 2.074716477651768, "grad_norm": 2.0101234912872314, "step": 6220 }, { "epoch": 2.074716477651768, "learning_rate": 0.0007357347118202235, "step": 6220 }, { "epoch": 2.074716477651768, "loss": 0.9655289649963379, "step": 6220 }, { "ce_loss": 0.3007512092590332, "epoch": 2.074716477651768, "step": 6220 }, { "distill_loss": 0.41682693362236023, "epoch": 2.074716477651768, "step": 6220 }, { "epoch": 2.074716477651768, "ref_ce_loss": 0.2477555274963379, "step": 6220 }, { "epoch": 2.074716477651768, "loss": 1.1301460266113281, "step": 6220 }, { "ce_loss": 0.2814660966396332, "epoch": 2.074716477651768, "step": 6220 }, { "distill_loss": 0.38158291578292847, "epoch": 2.074716477651768, "step": 6220 }, { "epoch": 2.074716477651768, "ref_ce_loss": 0.18453769385814667, "step": 6220 }, { "epoch": 2.0780520346897933, "loss": 1.0704, "step": 6230 }, { "epoch": 2.0780520346897933, "grad_norm": 2.3904693126678467, "step": 6230 }, { "epoch": 2.0780520346897933, "learning_rate": 0.0007354996044220942, "step": 6230 }, { "epoch": 2.0780520346897933, "loss": 1.1304408311843872, "step": 6230 }, { "ce_loss": 0.29919639229774475, "epoch": 2.0780520346897933, "step": 6230 }, { "distill_loss": 0.43664121627807617, "epoch": 2.0780520346897933, "step": 6230 }, { "epoch": 2.0780520346897933, "ref_ce_loss": 0.2104227989912033, "step": 6230 }, { "epoch": 2.0780520346897933, "loss": 0.8765370845794678, "step": 6230 }, { "ce_loss": 0.26280224323272705, "epoch": 2.0780520346897933, "step": 6230 }, { "distill_loss": 0.4388915002346039, "epoch": 2.0780520346897933, "step": 6230 }, { "epoch": 2.0780520346897933, "ref_ce_loss": 0.17453253269195557, "step": 6230 }, { "epoch": 2.0813875917278186, "loss": 1.1378, "step": 6240 }, { "epoch": 2.0813875917278186, "grad_norm": 2.003026247024536, "step": 6240 }, { "epoch": 2.0813875917278186, "learning_rate": 0.0007352641054598253, "step": 6240 }, { "epoch": 2.0813875917278186, "loss": 1.1472810506820679, "step": 6240 }, { "ce_loss": 0.31809720396995544, "epoch": 2.0813875917278186, "step": 6240 }, { "distill_loss": 0.5361602306365967, "epoch": 2.0813875917278186, "step": 6240 }, { "epoch": 2.0813875917278186, "ref_ce_loss": 0.19889040291309357, "step": 6240 }, { "epoch": 2.0813875917278186, "loss": 1.067832112312317, "step": 6240 }, { "ce_loss": 0.3019455373287201, "epoch": 2.0813875917278186, "step": 6240 }, { "distill_loss": 0.46684446930885315, "epoch": 2.0813875917278186, "step": 6240 }, { "epoch": 2.0813875917278186, "ref_ce_loss": 0.23338524997234344, "step": 6240 }, { "epoch": 2.084723148765844, "loss": 1.0864, "step": 6250 }, { "epoch": 2.084723148765844, "grad_norm": 1.7693291902542114, "step": 6250 }, { "epoch": 2.084723148765844, "learning_rate": 0.0007350282152082695, "step": 6250 }, { "epoch": 2.084723148765844, "loss": 1.0742148160934448, "step": 6250 }, { "ce_loss": 0.27459150552749634, "epoch": 2.084723148765844, "step": 6250 }, { "distill_loss": 0.525519073009491, "epoch": 2.084723148765844, "step": 6250 }, { "epoch": 2.084723148765844, "ref_ce_loss": 0.21103417873382568, "step": 6250 }, { "epoch": 2.084723148765844, "loss": 1.170702338218689, "step": 6250 }, { "ce_loss": 0.388058066368103, "epoch": 2.084723148765844, "step": 6250 }, { "distill_loss": 0.43670716881752014, "epoch": 2.084723148765844, "step": 6250 }, { "epoch": 2.084723148765844, "ref_ce_loss": 0.2780489921569824, "step": 6250 }, { "epoch": 2.0880587058038693, "loss": 1.0828, "step": 6260 }, { "epoch": 2.0880587058038693, "grad_norm": 1.7592650651931763, "step": 6260 }, { "epoch": 2.0880587058038693, "learning_rate": 0.000734791933942736, "step": 6260 }, { "epoch": 2.0880587058038693, "loss": 0.9743472933769226, "step": 6260 }, { "ce_loss": 0.28358349204063416, "epoch": 2.0880587058038693, "step": 6260 }, { "distill_loss": 0.48276543617248535, "epoch": 2.0880587058038693, "step": 6260 }, { "epoch": 2.0880587058038693, "ref_ce_loss": 0.20776033401489258, "step": 6260 }, { "epoch": 2.0880587058038693, "loss": 1.0317065715789795, "step": 6260 }, { "ce_loss": 0.3378918170928955, "epoch": 2.0880587058038693, "step": 6260 }, { "distill_loss": 0.40934500098228455, "epoch": 2.0880587058038693, "step": 6260 }, { "epoch": 2.0880587058038693, "ref_ce_loss": 0.19641610980033875, "step": 6260 }, { "epoch": 2.0913942628418947, "loss": 1.0748, "step": 6270 }, { "epoch": 2.0913942628418947, "grad_norm": 1.7081412076950073, "step": 6270 }, { "epoch": 2.0913942628418947, "learning_rate": 0.0007345552619389906, "step": 6270 }, { "epoch": 2.0913942628418947, "loss": 1.1185240745544434, "step": 6270 }, { "ce_loss": 0.3204095661640167, "epoch": 2.0913942628418947, "step": 6270 }, { "distill_loss": 0.5713546872138977, "epoch": 2.0913942628418947, "step": 6270 }, { "epoch": 2.0913942628418947, "ref_ce_loss": 0.2265942245721817, "step": 6270 }, { "epoch": 2.0913942628418947, "loss": 1.4075984954833984, "step": 6270 }, { "ce_loss": 0.3697609007358551, "epoch": 2.0913942628418947, "step": 6270 }, { "distill_loss": 0.5672245025634766, "epoch": 2.0913942628418947, "step": 6270 }, { "epoch": 2.0913942628418947, "ref_ce_loss": 0.2941744029521942, "step": 6270 }, { "epoch": 2.09472981987992, "loss": 1.1678, "step": 6280 }, { "epoch": 2.09472981987992, "grad_norm": 1.6775950193405151, "step": 6280 }, { "epoch": 2.09472981987992, "learning_rate": 0.0007343181994732547, "step": 6280 }, { "epoch": 2.09472981987992, "loss": 1.2271791696548462, "step": 6280 }, { "ce_loss": 0.2992215156555176, "epoch": 2.09472981987992, "step": 6280 }, { "distill_loss": 0.5207251906394958, "epoch": 2.09472981987992, "step": 6280 }, { "epoch": 2.09472981987992, "ref_ce_loss": 0.3127952516078949, "step": 6280 }, { "epoch": 2.09472981987992, "loss": 1.2772680521011353, "step": 6280 }, { "ce_loss": 0.34195807576179504, "epoch": 2.09472981987992, "step": 6280 }, { "distill_loss": 0.5496068596839905, "epoch": 2.09472981987992, "step": 6280 }, { "epoch": 2.09472981987992, "ref_ce_loss": 0.2772253453731537, "step": 6280 }, { "epoch": 2.0980653769179454, "loss": 1.1138, "step": 6290 }, { "epoch": 2.0980653769179454, "grad_norm": 1.9288606643676758, "step": 6290 }, { "epoch": 2.0980653769179454, "learning_rate": 0.000734080746822206, "step": 6290 }, { "epoch": 2.0980653769179454, "loss": 0.9897034764289856, "step": 6290 }, { "ce_loss": 0.32950952649116516, "epoch": 2.0980653769179454, "step": 6290 }, { "distill_loss": 0.3844912648200989, "epoch": 2.0980653769179454, "step": 6290 }, { "epoch": 2.0980653769179454, "ref_ce_loss": 0.2242661714553833, "step": 6290 }, { "epoch": 2.0980653769179454, "loss": 1.3939342498779297, "step": 6290 }, { "ce_loss": 0.35396263003349304, "epoch": 2.0980653769179454, "step": 6290 }, { "distill_loss": 0.5109117031097412, "epoch": 2.0980653769179454, "step": 6290 }, { "epoch": 2.0980653769179454, "ref_ce_loss": 0.22464288771152496, "step": 6290 }, { "epoch": 2.1014009339559707, "loss": 1.1161, "step": 6300 }, { "epoch": 2.1014009339559707, "grad_norm": 2.2423195838928223, "step": 6300 }, { "epoch": 2.1014009339559707, "learning_rate": 0.0007338429042629772, "step": 6300 }, { "epoch": 2.1014009339559707, "loss": 1.138380527496338, "step": 6300 }, { "ce_loss": 0.2688751220703125, "epoch": 2.1014009339559707, "step": 6300 }, { "distill_loss": 0.428027868270874, "epoch": 2.1014009339559707, "step": 6300 }, { "epoch": 2.1014009339559707, "ref_ce_loss": 0.21883095800876617, "step": 6300 }, { "epoch": 2.1014009339559707, "loss": 1.9510817527770996, "step": 6300 }, { "ce_loss": 0.3263424038887024, "epoch": 2.1014009339559707, "step": 6300 }, { "distill_loss": 0.41397032141685486, "epoch": 2.1014009339559707, "step": 6300 }, { "epoch": 2.1014009339559707, "ref_ce_loss": 0.24076981842517853, "step": 6300 }, { "epoch": 2.104736490993996, "loss": 1.1093, "step": 6310 }, { "epoch": 2.104736490993996, "grad_norm": 2.5436060428619385, "step": 6310 }, { "epoch": 2.104736490993996, "learning_rate": 0.0007336046720731559, "step": 6310 }, { "epoch": 2.104736490993996, "loss": 1.1591858863830566, "step": 6310 }, { "ce_loss": 0.26692333817481995, "epoch": 2.104736490993996, "step": 6310 }, { "distill_loss": 0.436357319355011, "epoch": 2.104736490993996, "step": 6310 }, { "epoch": 2.104736490993996, "ref_ce_loss": 0.23056459426879883, "step": 6310 }, { "epoch": 2.104736490993996, "loss": 1.1801612377166748, "step": 6310 }, { "ce_loss": 0.23928722739219666, "epoch": 2.104736490993996, "step": 6310 }, { "distill_loss": 0.42476698756217957, "epoch": 2.104736490993996, "step": 6310 }, { "epoch": 2.104736490993996, "ref_ce_loss": 0.18285700678825378, "step": 6310 }, { "epoch": 2.1080720480320214, "loss": 1.0749, "step": 6320 }, { "epoch": 2.1080720480320214, "grad_norm": 9.392084121704102, "step": 6320 }, { "epoch": 2.1080720480320214, "learning_rate": 0.0007333660505307852, "step": 6320 }, { "epoch": 2.1080720480320214, "loss": 1.1425275802612305, "step": 6320 }, { "ce_loss": 0.3087732195854187, "epoch": 2.1080720480320214, "step": 6320 }, { "distill_loss": 0.44689956307411194, "epoch": 2.1080720480320214, "step": 6320 }, { "epoch": 2.1080720480320214, "ref_ce_loss": 0.27974626421928406, "step": 6320 }, { "epoch": 2.1080720480320214, "loss": 1.2156401872634888, "step": 6320 }, { "ce_loss": 0.3612198233604431, "epoch": 2.1080720480320214, "step": 6320 }, { "distill_loss": 0.41263580322265625, "epoch": 2.1080720480320214, "step": 6320 }, { "epoch": 2.1080720480320214, "ref_ce_loss": 0.23722223937511444, "step": 6320 }, { "epoch": 2.1114076050700468, "loss": 1.1452, "step": 6330 }, { "epoch": 2.1114076050700468, "grad_norm": 2.4938435554504395, "step": 6330 }, { "epoch": 2.1114076050700468, "learning_rate": 0.0007331270399143618, "step": 6330 }, { "epoch": 2.1114076050700468, "loss": 1.1720032691955566, "step": 6330 }, { "ce_loss": 0.35546770691871643, "epoch": 2.1114076050700468, "step": 6330 }, { "distill_loss": 0.47783389687538147, "epoch": 2.1114076050700468, "step": 6330 }, { "epoch": 2.1114076050700468, "ref_ce_loss": 0.27097728848457336, "step": 6330 }, { "epoch": 2.1114076050700468, "loss": 1.4047222137451172, "step": 6330 }, { "ce_loss": 0.2705877721309662, "epoch": 2.1114076050700468, "step": 6330 }, { "distill_loss": 0.4400711953639984, "epoch": 2.1114076050700468, "step": 6330 }, { "epoch": 2.1114076050700468, "ref_ce_loss": 0.2454250603914261, "step": 6330 }, { "epoch": 2.114743162108072, "loss": 1.1854, "step": 6340 }, { "epoch": 2.114743162108072, "grad_norm": 2.1468722820281982, "step": 6340 }, { "epoch": 2.114743162108072, "learning_rate": 0.0007328876405028367, "step": 6340 }, { "epoch": 2.114743162108072, "loss": 1.0781439542770386, "step": 6340 }, { "ce_loss": 0.28617146611213684, "epoch": 2.114743162108072, "step": 6340 }, { "distill_loss": 0.4413968026638031, "epoch": 2.114743162108072, "step": 6340 }, { "epoch": 2.114743162108072, "ref_ce_loss": 0.21744798123836517, "step": 6340 }, { "epoch": 2.114743162108072, "loss": 1.0336925983428955, "step": 6340 }, { "ce_loss": 0.2718295753002167, "epoch": 2.114743162108072, "step": 6340 }, { "distill_loss": 0.4237961173057556, "epoch": 2.114743162108072, "step": 6340 }, { "epoch": 2.114743162108072, "ref_ce_loss": 0.20000006258487701, "step": 6340 }, { "epoch": 2.1180787191460975, "loss": 1.13, "step": 6350 }, { "epoch": 2.1180787191460975, "grad_norm": 2.2106473445892334, "step": 6350 }, { "epoch": 2.1180787191460975, "learning_rate": 0.0007326478525756151, "step": 6350 }, { "epoch": 2.1180787191460975, "loss": 0.8444902896881104, "step": 6350 }, { "ce_loss": 0.21335484087467194, "epoch": 2.1180787191460975, "step": 6350 }, { "distill_loss": 0.3180597722530365, "epoch": 2.1180787191460975, "step": 6350 }, { "epoch": 2.1180787191460975, "ref_ce_loss": 0.14585541188716888, "step": 6350 }, { "epoch": 2.1180787191460975, "loss": 0.9812553524971008, "step": 6350 }, { "ce_loss": 0.2725522816181183, "epoch": 2.1180787191460975, "step": 6350 }, { "distill_loss": 0.44498321413993835, "epoch": 2.1180787191460975, "step": 6350 }, { "epoch": 2.1180787191460975, "ref_ce_loss": 0.2124388962984085, "step": 6350 }, { "epoch": 2.121414276184123, "loss": 1.1336, "step": 6360 }, { "epoch": 2.121414276184123, "grad_norm": 1.6461633443832397, "step": 6360 }, { "epoch": 2.121414276184123, "learning_rate": 0.0007324076764125552, "step": 6360 }, { "epoch": 2.121414276184123, "loss": 0.9389811158180237, "step": 6360 }, { "ce_loss": 0.26564937829971313, "epoch": 2.121414276184123, "step": 6360 }, { "distill_loss": 0.4470508098602295, "epoch": 2.121414276184123, "step": 6360 }, { "epoch": 2.121414276184123, "ref_ce_loss": 0.1644158661365509, "step": 6360 }, { "epoch": 2.121414276184123, "loss": 1.4932483434677124, "step": 6360 }, { "ce_loss": 0.40737444162368774, "epoch": 2.121414276184123, "step": 6360 }, { "distill_loss": 0.660778820514679, "epoch": 2.121414276184123, "step": 6360 }, { "epoch": 2.121414276184123, "ref_ce_loss": 0.260358065366745, "step": 6360 }, { "epoch": 2.124749833222148, "loss": 1.107, "step": 6370 }, { "epoch": 2.124749833222148, "grad_norm": 1.6320087909698486, "step": 6370 }, { "epoch": 2.124749833222148, "learning_rate": 0.0007321671122939684, "step": 6370 }, { "epoch": 2.124749833222148, "loss": 0.9644901752471924, "step": 6370 }, { "ce_loss": 0.24344997107982635, "epoch": 2.124749833222148, "step": 6370 }, { "distill_loss": 0.4070049226284027, "epoch": 2.124749833222148, "step": 6370 }, { "epoch": 2.124749833222148, "ref_ce_loss": 0.24330593645572662, "step": 6370 }, { "epoch": 2.124749833222148, "loss": 0.6866275072097778, "step": 6370 }, { "ce_loss": 0.19747519493103027, "epoch": 2.124749833222148, "step": 6370 }, { "distill_loss": 0.3194316029548645, "epoch": 2.124749833222148, "step": 6370 }, { "epoch": 2.124749833222148, "ref_ce_loss": 0.16923660039901733, "step": 6370 }, { "epoch": 2.1280853902601735, "loss": 1.0816, "step": 6380 }, { "epoch": 2.1280853902601735, "grad_norm": 1.6301352977752686, "step": 6380 }, { "epoch": 2.1280853902601735, "learning_rate": 0.0007319261605006188, "step": 6380 }, { "epoch": 2.1280853902601735, "loss": 1.198629379272461, "step": 6380 }, { "ce_loss": 0.286978155374527, "epoch": 2.1280853902601735, "step": 6380 }, { "distill_loss": 0.46526041626930237, "epoch": 2.1280853902601735, "step": 6380 }, { "epoch": 2.1280853902601735, "ref_ce_loss": 0.22198054194450378, "step": 6380 }, { "epoch": 2.1280853902601735, "loss": 1.8673536777496338, "step": 6380 }, { "ce_loss": 0.36875566840171814, "epoch": 2.1280853902601735, "step": 6380 }, { "distill_loss": 0.549709677696228, "epoch": 2.1280853902601735, "step": 6380 }, { "epoch": 2.1280853902601735, "ref_ce_loss": 0.2909049093723297, "step": 6380 }, { "epoch": 2.131420947298199, "loss": 1.2046, "step": 6390 }, { "epoch": 2.131420947298199, "grad_norm": 1.5458937883377075, "step": 6390 }, { "epoch": 2.131420947298199, "learning_rate": 0.0007316848213137231, "step": 6390 }, { "epoch": 2.131420947298199, "loss": 0.9795043468475342, "step": 6390 }, { "ce_loss": 0.25236889719963074, "epoch": 2.131420947298199, "step": 6390 }, { "distill_loss": 0.38965165615081787, "epoch": 2.131420947298199, "step": 6390 }, { "epoch": 2.131420947298199, "ref_ce_loss": 0.19905973970890045, "step": 6390 }, { "epoch": 2.131420947298199, "loss": 1.0322065353393555, "step": 6390 }, { "ce_loss": 0.29902148246765137, "epoch": 2.131420947298199, "step": 6390 }, { "distill_loss": 0.5339668989181519, "epoch": 2.131420947298199, "step": 6390 }, { "epoch": 2.131420947298199, "ref_ce_loss": 0.19888225197792053, "step": 6390 }, { "epoch": 2.134756504336224, "loss": 1.1691, "step": 6400 }, { "epoch": 2.134756504336224, "grad_norm": 1.7552231550216675, "step": 6400 }, { "epoch": 2.134756504336224, "learning_rate": 0.0007314430950149502, "step": 6400 }, { "epoch": 2.134756504336224, "loss": 1.2230530977249146, "step": 6400 }, { "ce_loss": 0.22836093604564667, "epoch": 2.134756504336224, "step": 6400 }, { "distill_loss": 0.3304770886898041, "epoch": 2.134756504336224, "step": 6400 }, { "epoch": 2.134756504336224, "ref_ce_loss": 0.2264941781759262, "step": 6400 }, { "epoch": 2.134756504336224, "loss": 0.9575164318084717, "step": 6400 }, { "ce_loss": 0.2918906509876251, "epoch": 2.134756504336224, "step": 6400 }, { "distill_loss": 0.34815043210983276, "epoch": 2.134756504336224, "step": 6400 }, { "epoch": 2.134756504336224, "ref_ce_loss": 0.25119641423225403, "step": 6400 }, { "epoch": 2.1380920613742496, "loss": 1.1057, "step": 6410 }, { "epoch": 2.1380920613742496, "grad_norm": 3.011944532394409, "step": 6410 }, { "epoch": 2.1380920613742496, "learning_rate": 0.0007312009818864209, "step": 6410 }, { "epoch": 2.1380920613742496, "loss": 1.1187481880187988, "step": 6410 }, { "ce_loss": 0.3483152687549591, "epoch": 2.1380920613742496, "step": 6410 }, { "distill_loss": 0.4989229142665863, "epoch": 2.1380920613742496, "step": 6410 }, { "epoch": 2.1380920613742496, "ref_ce_loss": 0.20267225801944733, "step": 6410 }, { "epoch": 2.1380920613742496, "loss": 0.8301767706871033, "step": 6410 }, { "ce_loss": 0.2062799632549286, "epoch": 2.1380920613742496, "step": 6410 }, { "distill_loss": 0.3910866975784302, "epoch": 2.1380920613742496, "step": 6410 }, { "epoch": 2.1380920613742496, "ref_ce_loss": 0.177321195602417, "step": 6410 }, { "epoch": 2.141427618412275, "loss": 1.0847, "step": 6420 }, { "epoch": 2.141427618412275, "grad_norm": 1.7740399837493896, "step": 6420 }, { "epoch": 2.141427618412275, "learning_rate": 0.0007309584822107068, "step": 6420 }, { "epoch": 2.141427618412275, "loss": 1.1315577030181885, "step": 6420 }, { "ce_loss": 0.32394424080848694, "epoch": 2.141427618412275, "step": 6420 }, { "distill_loss": 0.4168577790260315, "epoch": 2.141427618412275, "step": 6420 }, { "epoch": 2.141427618412275, "ref_ce_loss": 0.24851790070533752, "step": 6420 }, { "epoch": 2.141427618412275, "loss": 1.1237907409667969, "step": 6420 }, { "ce_loss": 0.3055866062641144, "epoch": 2.141427618412275, "step": 6420 }, { "distill_loss": 0.46554845571517944, "epoch": 2.141427618412275, "step": 6420 }, { "epoch": 2.141427618412275, "ref_ce_loss": 0.2067885845899582, "step": 6420 }, { "epoch": 2.1447631754503003, "loss": 1.0649, "step": 6430 }, { "epoch": 2.1447631754503003, "grad_norm": 2.0649614334106445, "step": 6430 }, { "epoch": 2.1447631754503003, "learning_rate": 0.0007307155962708314, "step": 6430 }, { "epoch": 2.1447631754503003, "loss": 0.9748170971870422, "step": 6430 }, { "ce_loss": 0.25578707456588745, "epoch": 2.1447631754503003, "step": 6430 }, { "distill_loss": 0.45131832361221313, "epoch": 2.1447631754503003, "step": 6430 }, { "epoch": 2.1447631754503003, "ref_ce_loss": 0.21652290225028992, "step": 6430 }, { "epoch": 2.1447631754503003, "loss": 1.2006250619888306, "step": 6430 }, { "ce_loss": 0.3162148594856262, "epoch": 2.1447631754503003, "step": 6430 }, { "distill_loss": 0.47678422927856445, "epoch": 2.1447631754503003, "step": 6430 }, { "epoch": 2.1447631754503003, "ref_ce_loss": 0.3085026443004608, "step": 6430 }, { "epoch": 2.1480987324883256, "loss": 1.1302, "step": 6440 }, { "epoch": 2.1480987324883256, "grad_norm": 2.0026278495788574, "step": 6440 }, { "epoch": 2.1480987324883256, "learning_rate": 0.0007304723243502686, "step": 6440 }, { "epoch": 2.1480987324883256, "loss": 0.9228730797767639, "step": 6440 }, { "ce_loss": 0.21623915433883667, "epoch": 2.1480987324883256, "step": 6440 }, { "distill_loss": 0.4748040735721588, "epoch": 2.1480987324883256, "step": 6440 }, { "epoch": 2.1480987324883256, "ref_ce_loss": 0.22128619253635406, "step": 6440 }, { "epoch": 2.1480987324883256, "loss": 0.9328144788742065, "step": 6440 }, { "ce_loss": 0.22637756168842316, "epoch": 2.1480987324883256, "step": 6440 }, { "distill_loss": 0.4844534397125244, "epoch": 2.1480987324883256, "step": 6440 }, { "epoch": 2.1480987324883256, "ref_ce_loss": 0.2073233723640442, "step": 6440 }, { "epoch": 2.151434289526351, "loss": 1.1879, "step": 6450 }, { "epoch": 2.151434289526351, "grad_norm": 1.8786402940750122, "step": 6450 }, { "epoch": 2.151434289526351, "learning_rate": 0.000730228666732943, "step": 6450 }, { "epoch": 2.151434289526351, "loss": 0.9601532220840454, "step": 6450 }, { "ce_loss": 0.24260313808918, "epoch": 2.151434289526351, "step": 6450 }, { "distill_loss": 0.3780670464038849, "epoch": 2.151434289526351, "step": 6450 }, { "epoch": 2.151434289526351, "ref_ce_loss": 0.18927910923957825, "step": 6450 }, { "epoch": 2.151434289526351, "loss": 0.9228167533874512, "step": 6450 }, { "ce_loss": 0.3097418546676636, "epoch": 2.151434289526351, "step": 6450 }, { "distill_loss": 0.4570564031600952, "epoch": 2.151434289526351, "step": 6450 }, { "epoch": 2.151434289526351, "ref_ce_loss": 0.15541477501392365, "step": 6450 }, { "epoch": 2.1547698465643763, "loss": 1.113, "step": 6460 }, { "epoch": 2.1547698465643763, "grad_norm": 2.095228672027588, "step": 6460 }, { "epoch": 2.1547698465643763, "learning_rate": 0.0007299846237032293, "step": 6460 }, { "epoch": 2.1547698465643763, "loss": 1.3037450313568115, "step": 6460 }, { "ce_loss": 0.21096457540988922, "epoch": 2.1547698465643763, "step": 6460 }, { "distill_loss": 0.4512532353401184, "epoch": 2.1547698465643763, "step": 6460 }, { "epoch": 2.1547698465643763, "ref_ce_loss": 0.19502690434455872, "step": 6460 }, { "epoch": 2.1547698465643763, "loss": 1.120269775390625, "step": 6460 }, { "ce_loss": 0.325928658246994, "epoch": 2.1547698465643763, "step": 6460 }, { "distill_loss": 0.4305892586708069, "epoch": 2.1547698465643763, "step": 6460 }, { "epoch": 2.1547698465643763, "ref_ce_loss": 0.24968509376049042, "step": 6460 }, { "epoch": 2.1581054036024017, "loss": 1.1734, "step": 6470 }, { "epoch": 2.1581054036024017, "grad_norm": 1.8886241912841797, "step": 6470 }, { "epoch": 2.1581054036024017, "learning_rate": 0.000729740195545952, "step": 6470 }, { "epoch": 2.1581054036024017, "loss": 0.9583158493041992, "step": 6470 }, { "ce_loss": 0.2795897126197815, "epoch": 2.1581054036024017, "step": 6470 }, { "distill_loss": 0.41173821687698364, "epoch": 2.1581054036024017, "step": 6470 }, { "epoch": 2.1581054036024017, "ref_ce_loss": 0.2124219685792923, "step": 6470 }, { "epoch": 2.1581054036024017, "loss": 1.1033215522766113, "step": 6470 }, { "ce_loss": 0.39897602796554565, "epoch": 2.1581054036024017, "step": 6470 }, { "distill_loss": 0.4334798753261566, "epoch": 2.1581054036024017, "step": 6470 }, { "epoch": 2.1581054036024017, "ref_ce_loss": 0.2701454758644104, "step": 6470 }, { "epoch": 2.161440960640427, "loss": 1.152, "step": 6480 }, { "epoch": 2.161440960640427, "grad_norm": 1.923465371131897, "step": 6480 }, { "epoch": 2.161440960640427, "learning_rate": 0.0007294953825463849, "step": 6480 }, { "epoch": 2.161440960640427, "loss": 1.3026888370513916, "step": 6480 }, { "ce_loss": 0.35526034235954285, "epoch": 2.161440960640427, "step": 6480 }, { "distill_loss": 0.5430156588554382, "epoch": 2.161440960640427, "step": 6480 }, { "epoch": 2.161440960640427, "ref_ce_loss": 0.282005250453949, "step": 6480 }, { "epoch": 2.161440960640427, "loss": 0.9665287733078003, "step": 6480 }, { "ce_loss": 0.27917689085006714, "epoch": 2.161440960640427, "step": 6480 }, { "distill_loss": 0.4497480094432831, "epoch": 2.161440960640427, "step": 6480 }, { "epoch": 2.161440960640427, "ref_ce_loss": 0.18187867105007172, "step": 6480 }, { "epoch": 2.1647765176784524, "loss": 1.0982, "step": 6490 }, { "epoch": 2.1647765176784524, "grad_norm": 1.5525462627410889, "step": 6490 }, { "epoch": 2.1647765176784524, "learning_rate": 0.0007292501849902513, "step": 6490 }, { "epoch": 2.1647765176784524, "loss": 0.9274162650108337, "step": 6490 }, { "ce_loss": 0.22487910091876984, "epoch": 2.1647765176784524, "step": 6490 }, { "distill_loss": 0.3908233642578125, "epoch": 2.1647765176784524, "step": 6490 }, { "epoch": 2.1647765176784524, "ref_ce_loss": 0.2260996699333191, "step": 6490 }, { "epoch": 2.1647765176784524, "loss": 1.2589856386184692, "step": 6490 }, { "ce_loss": 0.41789162158966064, "epoch": 2.1647765176784524, "step": 6490 }, { "distill_loss": 0.5558347105979919, "epoch": 2.1647765176784524, "step": 6490 }, { "epoch": 2.1647765176784524, "ref_ce_loss": 0.27965062856674194, "step": 6490 }, { "epoch": 2.1681120747164777, "loss": 1.1382, "step": 6500 }, { "epoch": 2.1681120747164777, "grad_norm": 1.6293458938598633, "step": 6500 }, { "epoch": 2.1681120747164777, "learning_rate": 0.0007290046031637229, "step": 6500 }, { "epoch": 2.1681120747164777, "loss": 1.054274320602417, "step": 6500 }, { "ce_loss": 0.2959146797657013, "epoch": 2.1681120747164777, "step": 6500 }, { "distill_loss": 0.4802056849002838, "epoch": 2.1681120747164777, "step": 6500 }, { "epoch": 2.1681120747164777, "ref_ce_loss": 0.2115231305360794, "step": 6500 }, { "epoch": 2.1681120747164777, "loss": 1.2698650360107422, "step": 6500 }, { "ce_loss": 0.31689831614494324, "epoch": 2.1681120747164777, "step": 6500 }, { "distill_loss": 0.42682600021362305, "epoch": 2.1681120747164777, "step": 6500 }, { "epoch": 2.1681120747164777, "ref_ce_loss": 0.2237817496061325, "step": 6500 }, { "epoch": 2.171447631754503, "loss": 1.1373, "step": 6510 }, { "epoch": 2.171447631754503, "grad_norm": 1.8974462747573853, "step": 6510 }, { "epoch": 2.171447631754503, "learning_rate": 0.0007287586373534202, "step": 6510 }, { "epoch": 2.171447631754503, "loss": 1.0428264141082764, "step": 6510 }, { "ce_loss": 0.26314231753349304, "epoch": 2.171447631754503, "step": 6510 }, { "distill_loss": 0.4809033274650574, "epoch": 2.171447631754503, "step": 6510 }, { "epoch": 2.171447631754503, "ref_ce_loss": 0.20492221415042877, "step": 6510 }, { "epoch": 2.171447631754503, "loss": 1.293421983718872, "step": 6510 }, { "ce_loss": 0.3857915699481964, "epoch": 2.171447631754503, "step": 6510 }, { "distill_loss": 0.6188392043113708, "epoch": 2.171447631754503, "step": 6510 }, { "epoch": 2.171447631754503, "ref_ce_loss": 0.2295541614294052, "step": 6510 }, { "epoch": 2.1747831887925284, "loss": 1.1598, "step": 6520 }, { "epoch": 2.1747831887925284, "grad_norm": 2.1783154010772705, "step": 6520 }, { "epoch": 2.1747831887925284, "learning_rate": 0.000728512287846412, "step": 6520 }, { "epoch": 2.1747831887925284, "loss": 1.2277634143829346, "step": 6520 }, { "ce_loss": 0.21726199984550476, "epoch": 2.1747831887925284, "step": 6520 }, { "distill_loss": 0.5520312786102295, "epoch": 2.1747831887925284, "step": 6520 }, { "epoch": 2.1747831887925284, "ref_ce_loss": 0.17318247258663177, "step": 6520 }, { "epoch": 2.1747831887925284, "loss": 0.9285261631011963, "step": 6520 }, { "ce_loss": 0.22456702589988708, "epoch": 2.1747831887925284, "step": 6520 }, { "distill_loss": 0.4865211546421051, "epoch": 2.1747831887925284, "step": 6520 }, { "epoch": 2.1747831887925284, "ref_ce_loss": 0.1675080806016922, "step": 6520 }, { "epoch": 2.1781187458305538, "loss": 1.1704, "step": 6530 }, { "epoch": 2.1781187458305538, "grad_norm": 1.9148839712142944, "step": 6530 }, { "epoch": 2.1781187458305538, "learning_rate": 0.0007282655549302144, "step": 6530 }, { "epoch": 2.1781187458305538, "loss": 1.0659818649291992, "step": 6530 }, { "ce_loss": 0.29880058765411377, "epoch": 2.1781187458305538, "step": 6530 }, { "distill_loss": 0.4344814717769623, "epoch": 2.1781187458305538, "step": 6530 }, { "epoch": 2.1781187458305538, "ref_ce_loss": 0.16811010241508484, "step": 6530 }, { "epoch": 2.1781187458305538, "loss": 0.9991016387939453, "step": 6530 }, { "ce_loss": 0.3414323627948761, "epoch": 2.1781187458305538, "step": 6530 }, { "distill_loss": 0.40579909086227417, "epoch": 2.1781187458305538, "step": 6530 }, { "epoch": 2.1781187458305538, "ref_ce_loss": 0.25159013271331787, "step": 6530 }, { "epoch": 2.181454302868579, "loss": 1.0964, "step": 6540 }, { "epoch": 2.181454302868579, "grad_norm": 1.865465521812439, "step": 6540 }, { "epoch": 2.181454302868579, "learning_rate": 0.0007280184388927914, "step": 6540 }, { "epoch": 2.181454302868579, "loss": 1.025991439819336, "step": 6540 }, { "ce_loss": 0.3279627561569214, "epoch": 2.181454302868579, "step": 6540 }, { "distill_loss": 0.3677094578742981, "epoch": 2.181454302868579, "step": 6540 }, { "epoch": 2.181454302868579, "ref_ce_loss": 0.25373217463493347, "step": 6540 }, { "epoch": 2.181454302868579, "loss": 1.0250794887542725, "step": 6540 }, { "ce_loss": 0.304439514875412, "epoch": 2.181454302868579, "step": 6540 }, { "distill_loss": 0.41830167174339294, "epoch": 2.181454302868579, "step": 6540 }, { "epoch": 2.181454302868579, "ref_ce_loss": 0.19563069939613342, "step": 6540 }, { "epoch": 2.1847898599066045, "loss": 1.0431, "step": 6550 }, { "epoch": 2.1847898599066045, "grad_norm": 2.291187047958374, "step": 6550 }, { "epoch": 2.1847898599066045, "learning_rate": 0.000727770940022554, "step": 6550 }, { "epoch": 2.1847898599066045, "loss": 1.1050126552581787, "step": 6550 }, { "ce_loss": 0.3487250804901123, "epoch": 2.1847898599066045, "step": 6550 }, { "distill_loss": 0.46970731019973755, "epoch": 2.1847898599066045, "step": 6550 }, { "epoch": 2.1847898599066045, "ref_ce_loss": 0.20708809792995453, "step": 6550 }, { "epoch": 2.1847898599066045, "loss": 1.2422537803649902, "step": 6550 }, { "ce_loss": 0.3275320827960968, "epoch": 2.1847898599066045, "step": 6550 }, { "distill_loss": 0.5561754703521729, "epoch": 2.1847898599066045, "step": 6550 }, { "epoch": 2.1847898599066045, "ref_ce_loss": 0.24336011707782745, "step": 6550 }, { "epoch": 2.18812541694463, "loss": 1.1976, "step": 6560 }, { "epoch": 2.18812541694463, "grad_norm": 1.8396140336990356, "step": 6560 }, { "epoch": 2.18812541694463, "learning_rate": 0.0007275230586083598, "step": 6560 }, { "epoch": 2.18812541694463, "loss": 1.2439719438552856, "step": 6560 }, { "ce_loss": 0.2645409107208252, "epoch": 2.18812541694463, "step": 6560 }, { "distill_loss": 0.5173214673995972, "epoch": 2.18812541694463, "step": 6560 }, { "epoch": 2.18812541694463, "ref_ce_loss": 0.23124998807907104, "step": 6560 }, { "epoch": 2.18812541694463, "loss": 1.1374619007110596, "step": 6560 }, { "ce_loss": 0.2659108340740204, "epoch": 2.18812541694463, "step": 6560 }, { "distill_loss": 0.5399746894836426, "epoch": 2.18812541694463, "step": 6560 }, { "epoch": 2.18812541694463, "ref_ce_loss": 0.2251754105091095, "step": 6560 }, { "epoch": 2.191460973982655, "loss": 1.1713, "step": 6570 }, { "epoch": 2.191460973982655, "grad_norm": 3.3113059997558594, "step": 6570 }, { "epoch": 2.191460973982655, "learning_rate": 0.0007272747949395134, "step": 6570 }, { "epoch": 2.191460973982655, "loss": 1.1385853290557861, "step": 6570 }, { "ce_loss": 0.23356357216835022, "epoch": 2.191460973982655, "step": 6570 }, { "distill_loss": 0.4046405553817749, "epoch": 2.191460973982655, "step": 6570 }, { "epoch": 2.191460973982655, "ref_ce_loss": 0.21334941685199738, "step": 6570 }, { "epoch": 2.191460973982655, "loss": 0.8915872573852539, "step": 6570 }, { "ce_loss": 0.2360234260559082, "epoch": 2.191460973982655, "step": 6570 }, { "distill_loss": 0.3655630350112915, "epoch": 2.191460973982655, "step": 6570 }, { "epoch": 2.191460973982655, "ref_ce_loss": 0.20562811195850372, "step": 6570 }, { "epoch": 2.1947965310206805, "loss": 1.0959, "step": 6580 }, { "epoch": 2.1947965310206805, "grad_norm": 5.928073406219482, "step": 6580 }, { "epoch": 2.1947965310206805, "learning_rate": 0.0007270261493057652, "step": 6580 }, { "epoch": 2.1947965310206805, "loss": 0.8915174007415771, "step": 6580 }, { "ce_loss": 0.20917125046253204, "epoch": 2.1947965310206805, "step": 6580 }, { "distill_loss": 0.3442421853542328, "epoch": 2.1947965310206805, "step": 6580 }, { "epoch": 2.1947965310206805, "ref_ce_loss": 0.16487248241901398, "step": 6580 }, { "epoch": 2.1947965310206805, "loss": 1.161110758781433, "step": 6580 }, { "ce_loss": 0.28940436244010925, "epoch": 2.1947965310206805, "step": 6580 }, { "distill_loss": 0.4610818028450012, "epoch": 2.1947965310206805, "step": 6580 }, { "epoch": 2.1947965310206805, "ref_ce_loss": 0.23252765834331512, "step": 6580 }, { "epoch": 2.198132088058706, "loss": 1.1057, "step": 6590 }, { "epoch": 2.198132088058706, "grad_norm": 1.954878330230713, "step": 6590 }, { "epoch": 2.198132088058706, "learning_rate": 0.000726777121997311, "step": 6590 }, { "epoch": 2.198132088058706, "loss": 0.9378560781478882, "step": 6590 }, { "ce_loss": 0.25078916549682617, "epoch": 2.198132088058706, "step": 6590 }, { "distill_loss": 0.426094114780426, "epoch": 2.198132088058706, "step": 6590 }, { "epoch": 2.198132088058706, "ref_ce_loss": 0.26082292199134827, "step": 6590 }, { "epoch": 2.198132088058706, "loss": 1.06697678565979, "step": 6590 }, { "ce_loss": 0.2149425894021988, "epoch": 2.198132088058706, "step": 6590 }, { "distill_loss": 0.44963303208351135, "epoch": 2.198132088058706, "step": 6590 }, { "epoch": 2.198132088058706, "ref_ce_loss": 0.21024475991725922, "step": 6590 }, { "epoch": 2.201467645096731, "loss": 1.0868, "step": 6600 }, { "epoch": 2.201467645096731, "grad_norm": 1.9652372598648071, "step": 6600 }, { "epoch": 2.201467645096731, "learning_rate": 0.000726527713304793, "step": 6600 }, { "epoch": 2.201467645096731, "loss": 0.7863520383834839, "step": 6600 }, { "ce_loss": 0.22365817427635193, "epoch": 2.201467645096731, "step": 6600 }, { "distill_loss": 0.3934679925441742, "epoch": 2.201467645096731, "step": 6600 }, { "epoch": 2.201467645096731, "ref_ce_loss": 0.16903427243232727, "step": 6600 }, { "epoch": 2.201467645096731, "loss": 1.895040512084961, "step": 6600 }, { "ce_loss": 0.2216983139514923, "epoch": 2.201467645096731, "step": 6600 }, { "distill_loss": 0.40213045477867126, "epoch": 2.201467645096731, "step": 6600 }, { "epoch": 2.201467645096731, "ref_ce_loss": 0.18472428619861603, "step": 6600 }, { "epoch": 2.2048032021347566, "loss": 1.101, "step": 6610 }, { "epoch": 2.2048032021347566, "grad_norm": 4.147248268127441, "step": 6610 }, { "epoch": 2.2048032021347566, "learning_rate": 0.0007262779235192977, "step": 6610 }, { "epoch": 2.2048032021347566, "loss": 1.2284412384033203, "step": 6610 }, { "ce_loss": 0.3198404908180237, "epoch": 2.2048032021347566, "step": 6610 }, { "distill_loss": 0.45762690901756287, "epoch": 2.2048032021347566, "step": 6610 }, { "epoch": 2.2048032021347566, "ref_ce_loss": 0.22917161881923676, "step": 6610 }, { "epoch": 2.2048032021347566, "loss": 0.9958036541938782, "step": 6610 }, { "ce_loss": 0.2828834652900696, "epoch": 2.2048032021347566, "step": 6610 }, { "distill_loss": 0.4549398422241211, "epoch": 2.2048032021347566, "step": 6610 }, { "epoch": 2.2048032021347566, "ref_ce_loss": 0.18840783834457397, "step": 6610 }, { "epoch": 2.208138759172782, "loss": 1.033, "step": 6620 }, { "epoch": 2.208138759172782, "grad_norm": 3.1193883419036865, "step": 6620 }, { "epoch": 2.208138759172782, "learning_rate": 0.0007260277529323565, "step": 6620 }, { "epoch": 2.208138759172782, "loss": 0.9063237905502319, "step": 6620 }, { "ce_loss": 0.26534974575042725, "epoch": 2.208138759172782, "step": 6620 }, { "distill_loss": 0.3766826391220093, "epoch": 2.208138759172782, "step": 6620 }, { "epoch": 2.208138759172782, "ref_ce_loss": 0.19527144730091095, "step": 6620 }, { "epoch": 2.208138759172782, "loss": 1.1124207973480225, "step": 6620 }, { "ce_loss": 0.33875322341918945, "epoch": 2.208138759172782, "step": 6620 }, { "distill_loss": 0.4154742956161499, "epoch": 2.208138759172782, "step": 6620 }, { "epoch": 2.208138759172782, "ref_ce_loss": 0.2512235939502716, "step": 6620 }, { "epoch": 2.2114743162108073, "loss": 1.1578, "step": 6630 }, { "epoch": 2.2114743162108073, "grad_norm": 1.9319946765899658, "step": 6630 }, { "epoch": 2.2114743162108073, "learning_rate": 0.0007257772018359458, "step": 6630 }, { "epoch": 2.2114743162108073, "loss": 1.1475681066513062, "step": 6630 }, { "ce_loss": 0.34494549036026, "epoch": 2.2114743162108073, "step": 6630 }, { "distill_loss": 0.5608721971511841, "epoch": 2.2114743162108073, "step": 6630 }, { "epoch": 2.2114743162108073, "ref_ce_loss": 0.18402761220932007, "step": 6630 }, { "epoch": 2.2114743162108073, "loss": 1.2129950523376465, "step": 6630 }, { "ce_loss": 0.3182290196418762, "epoch": 2.2114743162108073, "step": 6630 }, { "distill_loss": 0.4504798650741577, "epoch": 2.2114743162108073, "step": 6630 }, { "epoch": 2.2114743162108073, "ref_ce_loss": 0.264813631772995, "step": 6630 }, { "epoch": 2.2148098732488326, "loss": 1.068, "step": 6640 }, { "epoch": 2.2148098732488326, "grad_norm": 1.38816499710083, "step": 6640 }, { "epoch": 2.2148098732488326, "learning_rate": 0.0007255262705224854, "step": 6640 }, { "epoch": 2.2148098732488326, "loss": 1.1173572540283203, "step": 6640 }, { "ce_loss": 0.31794416904449463, "epoch": 2.2148098732488326, "step": 6640 }, { "distill_loss": 0.5177514553070068, "epoch": 2.2148098732488326, "step": 6640 }, { "epoch": 2.2148098732488326, "ref_ce_loss": 0.20269501209259033, "step": 6640 }, { "epoch": 2.2148098732488326, "loss": 1.145826816558838, "step": 6640 }, { "ce_loss": 0.3013113737106323, "epoch": 2.2148098732488326, "step": 6640 }, { "distill_loss": 0.43260765075683594, "epoch": 2.2148098732488326, "step": 6640 }, { "epoch": 2.2148098732488326, "ref_ce_loss": 0.22437791526317596, "step": 6640 }, { "epoch": 2.218145430286858, "loss": 1.0474, "step": 6650 }, { "epoch": 2.218145430286858, "grad_norm": 1.715024709701538, "step": 6650 }, { "epoch": 2.218145430286858, "learning_rate": 0.0007252749592848392, "step": 6650 }, { "epoch": 2.218145430286858, "loss": 0.9926832318305969, "step": 6650 }, { "ce_loss": 0.2870025634765625, "epoch": 2.218145430286858, "step": 6650 }, { "distill_loss": 0.4564557671546936, "epoch": 2.218145430286858, "step": 6650 }, { "epoch": 2.218145430286858, "ref_ce_loss": 0.1875499188899994, "step": 6650 }, { "epoch": 2.218145430286858, "loss": 1.396606206893921, "step": 6650 }, { "ce_loss": 0.2882598042488098, "epoch": 2.218145430286858, "step": 6650 }, { "distill_loss": 0.4250618815422058, "epoch": 2.218145430286858, "step": 6650 }, { "epoch": 2.218145430286858, "ref_ce_loss": 0.20096905529499054, "step": 6650 }, { "epoch": 2.2214809873248833, "loss": 1.1115, "step": 6660 }, { "epoch": 2.2214809873248833, "grad_norm": 1.6308581829071045, "step": 6660 }, { "epoch": 2.2214809873248833, "learning_rate": 0.0007250232684163146, "step": 6660 }, { "epoch": 2.2214809873248833, "loss": 1.1638392210006714, "step": 6660 }, { "ce_loss": 0.32330602407455444, "epoch": 2.2214809873248833, "step": 6660 }, { "distill_loss": 0.4661082923412323, "epoch": 2.2214809873248833, "step": 6660 }, { "epoch": 2.2214809873248833, "ref_ce_loss": 0.24704022705554962, "step": 6660 }, { "epoch": 2.2214809873248833, "loss": 0.839045524597168, "step": 6660 }, { "ce_loss": 0.24197931587696075, "epoch": 2.2214809873248833, "step": 6660 }, { "distill_loss": 0.3925042748451233, "epoch": 2.2214809873248833, "step": 6660 }, { "epoch": 2.2214809873248833, "ref_ce_loss": 0.2042776197195053, "step": 6660 }, { "epoch": 2.2248165443629087, "loss": 1.1374, "step": 6670 }, { "epoch": 2.2248165443629087, "grad_norm": 2.5563485622406006, "step": 6670 }, { "epoch": 2.2248165443629087, "learning_rate": 0.0007247711982106618, "step": 6670 }, { "epoch": 2.2248165443629087, "loss": 1.0619388818740845, "step": 6670 }, { "ce_loss": 0.33162665367126465, "epoch": 2.2248165443629087, "step": 6670 }, { "distill_loss": 0.4590032696723938, "epoch": 2.2248165443629087, "step": 6670 }, { "epoch": 2.2248165443629087, "ref_ce_loss": 0.2071804404258728, "step": 6670 }, { "epoch": 2.2248165443629087, "loss": 1.3315930366516113, "step": 6670 }, { "ce_loss": 0.3238199055194855, "epoch": 2.2248165443629087, "step": 6670 }, { "distill_loss": 0.5190696716308594, "epoch": 2.2248165443629087, "step": 6670 }, { "epoch": 2.2248165443629087, "ref_ce_loss": 0.25709268450737, "step": 6670 }, { "epoch": 2.228152101400934, "loss": 1.1388, "step": 6680 }, { "epoch": 2.228152101400934, "grad_norm": 2.195162296295166, "step": 6680 }, { "epoch": 2.228152101400934, "learning_rate": 0.0007245187489620736, "step": 6680 }, { "epoch": 2.228152101400934, "loss": 1.386202335357666, "step": 6680 }, { "ce_loss": 0.2633619010448456, "epoch": 2.228152101400934, "step": 6680 }, { "distill_loss": 0.3492848575115204, "epoch": 2.228152101400934, "step": 6680 }, { "epoch": 2.228152101400934, "ref_ce_loss": 0.24636389315128326, "step": 6680 }, { "epoch": 2.228152101400934, "loss": 1.4848406314849854, "step": 6680 }, { "ce_loss": 0.23591652512550354, "epoch": 2.228152101400934, "step": 6680 }, { "distill_loss": 0.4073147177696228, "epoch": 2.228152101400934, "step": 6680 }, { "epoch": 2.228152101400934, "ref_ce_loss": 0.16064199805259705, "step": 6680 }, { "epoch": 2.2314876584389594, "loss": 1.0865, "step": 6690 }, { "epoch": 2.2314876584389594, "grad_norm": 2.414093494415283, "step": 6690 }, { "epoch": 2.2314876584389594, "learning_rate": 0.000724265920965186, "step": 6690 }, { "epoch": 2.2314876584389594, "loss": 0.8304829597473145, "step": 6690 }, { "ce_loss": 0.22507025301456451, "epoch": 2.2314876584389594, "step": 6690 }, { "distill_loss": 0.3573000729084015, "epoch": 2.2314876584389594, "step": 6690 }, { "epoch": 2.2314876584389594, "ref_ce_loss": 0.1854206770658493, "step": 6690 }, { "epoch": 2.2314876584389594, "loss": 0.9495757818222046, "step": 6690 }, { "ce_loss": 0.24371907114982605, "epoch": 2.2314876584389594, "step": 6690 }, { "distill_loss": 0.39758509397506714, "epoch": 2.2314876584389594, "step": 6690 }, { "epoch": 2.2314876584389594, "ref_ce_loss": 0.22929032146930695, "step": 6690 }, { "epoch": 2.2348232154769847, "loss": 1.1242, "step": 6700 }, { "epoch": 2.2348232154769847, "grad_norm": 2.162372350692749, "step": 6700 }, { "epoch": 2.2348232154769847, "learning_rate": 0.000724012714515076, "step": 6700 }, { "epoch": 2.2348232154769847, "loss": 1.1144814491271973, "step": 6700 }, { "ce_loss": 0.35344523191452026, "epoch": 2.2348232154769847, "step": 6700 }, { "distill_loss": 0.4919400215148926, "epoch": 2.2348232154769847, "step": 6700 }, { "epoch": 2.2348232154769847, "ref_ce_loss": 0.2676081657409668, "step": 6700 }, { "epoch": 2.2348232154769847, "loss": 1.06876540184021, "step": 6700 }, { "ce_loss": 0.3048124611377716, "epoch": 2.2348232154769847, "step": 6700 }, { "distill_loss": 0.5308917760848999, "epoch": 2.2348232154769847, "step": 6700 }, { "epoch": 2.2348232154769847, "ref_ce_loss": 0.23223397135734558, "step": 6700 }, { "epoch": 2.23815877251501, "loss": 1.0419, "step": 6710 }, { "epoch": 2.23815877251501, "grad_norm": 1.5892221927642822, "step": 6710 }, { "epoch": 2.23815877251501, "learning_rate": 0.000723759129907263, "step": 6710 }, { "epoch": 2.23815877251501, "loss": 0.8763984441757202, "step": 6710 }, { "ce_loss": 0.21821273863315582, "epoch": 2.23815877251501, "step": 6710 }, { "distill_loss": 0.4226665496826172, "epoch": 2.23815877251501, "step": 6710 }, { "epoch": 2.23815877251501, "ref_ce_loss": 0.17459844052791595, "step": 6710 }, { "epoch": 2.23815877251501, "loss": 1.2446012496948242, "step": 6710 }, { "ce_loss": 0.29771167039871216, "epoch": 2.23815877251501, "step": 6710 }, { "distill_loss": 0.4279354214668274, "epoch": 2.23815877251501, "step": 6710 }, { "epoch": 2.23815877251501, "ref_ce_loss": 0.19987165927886963, "step": 6710 }, { "epoch": 2.2414943295530354, "loss": 1.1207, "step": 6720 }, { "epoch": 2.2414943295530354, "grad_norm": 1.7283515930175781, "step": 6720 }, { "epoch": 2.2414943295530354, "learning_rate": 0.0007235051674377076, "step": 6720 }, { "epoch": 2.2414943295530354, "loss": 1.7800657749176025, "step": 6720 }, { "ce_loss": 0.2828550934791565, "epoch": 2.2414943295530354, "step": 6720 }, { "distill_loss": 0.4153803288936615, "epoch": 2.2414943295530354, "step": 6720 }, { "epoch": 2.2414943295530354, "ref_ce_loss": 0.20675329864025116, "step": 6720 }, { "epoch": 2.2414943295530354, "loss": 1.069749116897583, "step": 6720 }, { "ce_loss": 0.3165798783302307, "epoch": 2.2414943295530354, "step": 6720 }, { "distill_loss": 0.39565014839172363, "epoch": 2.2414943295530354, "step": 6720 }, { "epoch": 2.2414943295530354, "ref_ce_loss": 0.21022526919841766, "step": 6720 }, { "epoch": 2.2448298865910608, "loss": 1.2058, "step": 6730 }, { "epoch": 2.2448298865910608, "grad_norm": 2.3618218898773193, "step": 6730 }, { "epoch": 2.2448298865910608, "learning_rate": 0.000723250827402811, "step": 6730 }, { "epoch": 2.2448298865910608, "loss": 0.850956380367279, "step": 6730 }, { "ce_loss": 0.21707533299922943, "epoch": 2.2448298865910608, "step": 6730 }, { "distill_loss": 0.39951059222221375, "epoch": 2.2448298865910608, "step": 6730 }, { "epoch": 2.2448298865910608, "ref_ce_loss": 0.16816748678684235, "step": 6730 }, { "epoch": 2.2448298865910608, "loss": 1.2139538526535034, "step": 6730 }, { "ce_loss": 0.27926763892173767, "epoch": 2.2448298865910608, "step": 6730 }, { "distill_loss": 0.40093597769737244, "epoch": 2.2448298865910608, "step": 6730 }, { "epoch": 2.2448298865910608, "ref_ce_loss": 0.25574299693107605, "step": 6730 }, { "epoch": 2.248165443629086, "loss": 1.1898, "step": 6740 }, { "epoch": 2.248165443629086, "grad_norm": 2.773587703704834, "step": 6740 }, { "epoch": 2.248165443629086, "learning_rate": 0.0007229961100994156, "step": 6740 }, { "epoch": 2.248165443629086, "loss": 1.0061744451522827, "step": 6740 }, { "ce_loss": 0.29060429334640503, "epoch": 2.248165443629086, "step": 6740 }, { "distill_loss": 0.4561101794242859, "epoch": 2.248165443629086, "step": 6740 }, { "epoch": 2.248165443629086, "ref_ce_loss": 0.2093733251094818, "step": 6740 }, { "epoch": 2.248165443629086, "loss": 0.8028789162635803, "step": 6740 }, { "ce_loss": 0.21336449682712555, "epoch": 2.248165443629086, "step": 6740 }, { "distill_loss": 0.3714117705821991, "epoch": 2.248165443629086, "step": 6740 }, { "epoch": 2.248165443629086, "ref_ce_loss": 0.14982610940933228, "step": 6740 }, { "epoch": 2.2515010006671115, "loss": 1.0326, "step": 6750 }, { "epoch": 2.2515010006671115, "grad_norm": 1.863098382949829, "step": 6750 }, { "epoch": 2.2515010006671115, "learning_rate": 0.0007227410158248041, "step": 6750 }, { "epoch": 2.2515010006671115, "loss": 1.0014573335647583, "step": 6750 }, { "ce_loss": 0.2929192781448364, "epoch": 2.2515010006671115, "step": 6750 }, { "distill_loss": 0.3995145857334137, "epoch": 2.2515010006671115, "step": 6750 }, { "epoch": 2.2515010006671115, "ref_ce_loss": 0.2523522675037384, "step": 6750 }, { "epoch": 2.2515010006671115, "loss": 0.8479694724082947, "step": 6750 }, { "ce_loss": 0.2661614418029785, "epoch": 2.2515010006671115, "step": 6750 }, { "distill_loss": 0.33442169427871704, "epoch": 2.2515010006671115, "step": 6750 }, { "epoch": 2.2515010006671115, "ref_ce_loss": 0.1988968700170517, "step": 6750 }, { "epoch": 2.254836557705137, "loss": 1.0235, "step": 6760 }, { "epoch": 2.254836557705137, "grad_norm": 3.644243001937866, "step": 6760 }, { "epoch": 2.254836557705137, "learning_rate": 0.0007224855448766986, "step": 6760 }, { "epoch": 2.254836557705137, "loss": 0.9441705942153931, "step": 6760 }, { "ce_loss": 0.2512178122997284, "epoch": 2.254836557705137, "step": 6760 }, { "distill_loss": 0.45023685693740845, "epoch": 2.254836557705137, "step": 6760 }, { "epoch": 2.254836557705137, "ref_ce_loss": 0.21678780019283295, "step": 6760 }, { "epoch": 2.254836557705137, "loss": 0.9748396277427673, "step": 6760 }, { "ce_loss": 0.24539008736610413, "epoch": 2.254836557705137, "step": 6760 }, { "distill_loss": 0.4820556044578552, "epoch": 2.254836557705137, "step": 6760 }, { "epoch": 2.254836557705137, "ref_ce_loss": 0.1842784881591797, "step": 6760 }, { "epoch": 2.258172114743162, "loss": 1.0713, "step": 6770 }, { "epoch": 2.258172114743162, "grad_norm": 3.3986644744873047, "step": 6770 }, { "epoch": 2.258172114743162, "learning_rate": 0.0007222296975532614, "step": 6770 }, { "epoch": 2.258172114743162, "loss": 0.990540087223053, "step": 6770 }, { "ce_loss": 0.19007956981658936, "epoch": 2.258172114743162, "step": 6770 }, { "distill_loss": 0.43442580103874207, "epoch": 2.258172114743162, "step": 6770 }, { "epoch": 2.258172114743162, "ref_ce_loss": 0.21168890595436096, "step": 6770 }, { "epoch": 2.258172114743162, "loss": 0.8435627222061157, "step": 6770 }, { "ce_loss": 0.2930505573749542, "epoch": 2.258172114743162, "step": 6770 }, { "distill_loss": 0.3538534343242645, "epoch": 2.258172114743162, "step": 6770 }, { "epoch": 2.258172114743162, "ref_ce_loss": 0.1962963491678238, "step": 6770 }, { "epoch": 2.2615076717811875, "loss": 1.1581, "step": 6780 }, { "epoch": 2.2615076717811875, "grad_norm": 3.7728989124298096, "step": 6780 }, { "epoch": 2.2615076717811875, "learning_rate": 0.0007219734741530937, "step": 6780 }, { "epoch": 2.2615076717811875, "loss": 1.049188256263733, "step": 6780 }, { "ce_loss": 0.3014962673187256, "epoch": 2.2615076717811875, "step": 6780 }, { "distill_loss": 0.5301869511604309, "epoch": 2.2615076717811875, "step": 6780 }, { "epoch": 2.2615076717811875, "ref_ce_loss": 0.21730859577655792, "step": 6780 }, { "epoch": 2.2615076717811875, "loss": 0.8995069265365601, "step": 6780 }, { "ce_loss": 0.2236035019159317, "epoch": 2.2615076717811875, "step": 6780 }, { "distill_loss": 0.38950619101524353, "epoch": 2.2615076717811875, "step": 6780 }, { "epoch": 2.2615076717811875, "ref_ce_loss": 0.22236211597919464, "step": 6780 }, { "epoch": 2.264843228819213, "loss": 1.1343, "step": 6790 }, { "epoch": 2.264843228819213, "grad_norm": 2.17807936668396, "step": 6790 }, { "epoch": 2.264843228819213, "learning_rate": 0.0007217168749752361, "step": 6790 }, { "epoch": 2.264843228819213, "loss": 0.7967932224273682, "step": 6790 }, { "ce_loss": 0.18900160491466522, "epoch": 2.264843228819213, "step": 6790 }, { "distill_loss": 0.35759422183036804, "epoch": 2.264843228819213, "step": 6790 }, { "epoch": 2.264843228819213, "ref_ce_loss": 0.13914166390895844, "step": 6790 }, { "epoch": 2.264843228819213, "loss": 0.90980064868927, "step": 6790 }, { "ce_loss": 0.22265170514583588, "epoch": 2.264843228819213, "step": 6790 }, { "distill_loss": 0.404280424118042, "epoch": 2.264843228819213, "step": 6790 }, { "epoch": 2.264843228819213, "ref_ce_loss": 0.22244516015052795, "step": 6790 }, { "epoch": 2.268178785857238, "loss": 1.1049, "step": 6800 }, { "epoch": 2.268178785857238, "grad_norm": 1.7885805368423462, "step": 6800 }, { "epoch": 2.268178785857238, "learning_rate": 0.0007214599003191671, "step": 6800 }, { "epoch": 2.268178785857238, "loss": 1.1400399208068848, "step": 6800 }, { "ce_loss": 0.24055874347686768, "epoch": 2.268178785857238, "step": 6800 }, { "distill_loss": 0.3624168336391449, "epoch": 2.268178785857238, "step": 6800 }, { "epoch": 2.268178785857238, "ref_ce_loss": 0.22153976559638977, "step": 6800 }, { "epoch": 2.268178785857238, "loss": 0.7367714643478394, "step": 6800 }, { "ce_loss": 0.1923925280570984, "epoch": 2.268178785857238, "step": 6800 }, { "distill_loss": 0.3727942109107971, "epoch": 2.268178785857238, "step": 6800 }, { "epoch": 2.268178785857238, "ref_ce_loss": 0.17138351500034332, "step": 6800 }, { "epoch": 2.2715143428952635, "loss": 1.1082, "step": 6810 }, { "epoch": 2.2715143428952635, "grad_norm": 2.1015965938568115, "step": 6810 }, { "epoch": 2.2715143428952635, "learning_rate": 0.0007212025504848039, "step": 6810 }, { "epoch": 2.2715143428952635, "loss": 1.107190489768982, "step": 6810 }, { "ce_loss": 0.23536983132362366, "epoch": 2.2715143428952635, "step": 6810 }, { "distill_loss": 0.3510274887084961, "epoch": 2.2715143428952635, "step": 6810 }, { "epoch": 2.2715143428952635, "ref_ce_loss": 0.1961394101381302, "step": 6810 }, { "epoch": 2.2715143428952635, "loss": 0.9859842658042908, "step": 6810 }, { "ce_loss": 0.31096890568733215, "epoch": 2.2715143428952635, "step": 6810 }, { "distill_loss": 0.4421943724155426, "epoch": 2.2715143428952635, "step": 6810 }, { "epoch": 2.2715143428952635, "ref_ce_loss": 0.1737508475780487, "step": 6810 }, { "epoch": 2.274849899933289, "loss": 1.1412, "step": 6820 }, { "epoch": 2.274849899933289, "grad_norm": 1.7945131063461304, "step": 6820 }, { "epoch": 2.274849899933289, "learning_rate": 0.0007209448257725015, "step": 6820 }, { "epoch": 2.274849899933289, "loss": 1.0342320203781128, "step": 6820 }, { "ce_loss": 0.2750062346458435, "epoch": 2.274849899933289, "step": 6820 }, { "distill_loss": 0.3843204379081726, "epoch": 2.274849899933289, "step": 6820 }, { "epoch": 2.274849899933289, "ref_ce_loss": 0.2735709547996521, "step": 6820 }, { "epoch": 2.274849899933289, "loss": 0.9394274950027466, "step": 6820 }, { "ce_loss": 0.3055344521999359, "epoch": 2.274849899933289, "step": 6820 }, { "distill_loss": 0.38053858280181885, "epoch": 2.274849899933289, "step": 6820 }, { "epoch": 2.274849899933289, "ref_ce_loss": 0.2532532513141632, "step": 6820 }, { "epoch": 2.2781854569713142, "loss": 1.0606, "step": 6830 }, { "epoch": 2.2781854569713142, "grad_norm": 2.098090887069702, "step": 6830 }, { "epoch": 2.2781854569713142, "learning_rate": 0.0007206867264830523, "step": 6830 }, { "epoch": 2.2781854569713142, "loss": 0.8209320306777954, "step": 6830 }, { "ce_loss": 0.2228832095861435, "epoch": 2.2781854569713142, "step": 6830 }, { "distill_loss": 0.3807736933231354, "epoch": 2.2781854569713142, "step": 6830 }, { "epoch": 2.2781854569713142, "ref_ce_loss": 0.1797008067369461, "step": 6830 }, { "epoch": 2.2781854569713142, "loss": 1.111886978149414, "step": 6830 }, { "ce_loss": 0.2975417375564575, "epoch": 2.2781854569713142, "step": 6830 }, { "distill_loss": 0.4608473479747772, "epoch": 2.2781854569713142, "step": 6830 }, { "epoch": 2.2781854569713142, "ref_ce_loss": 0.2796211838722229, "step": 6830 }, { "epoch": 2.2815210140093396, "loss": 1.0477, "step": 6840 }, { "epoch": 2.2815210140093396, "grad_norm": 3.1342854499816895, "step": 6840 }, { "epoch": 2.2815210140093396, "learning_rate": 0.000720428252917686, "step": 6840 }, { "epoch": 2.2815210140093396, "loss": 1.593949794769287, "step": 6840 }, { "ce_loss": 0.35022589564323425, "epoch": 2.2815210140093396, "step": 6840 }, { "distill_loss": 0.41967836022377014, "epoch": 2.2815210140093396, "step": 6840 }, { "epoch": 2.2815210140093396, "ref_ce_loss": 0.271299809217453, "step": 6840 }, { "epoch": 2.2815210140093396, "loss": 1.1150100231170654, "step": 6840 }, { "ce_loss": 0.288134902715683, "epoch": 2.2815210140093396, "step": 6840 }, { "distill_loss": 0.4372361898422241, "epoch": 2.2815210140093396, "step": 6840 }, { "epoch": 2.2815210140093396, "ref_ce_loss": 0.22447191178798676, "step": 6840 }, { "epoch": 2.284856571047365, "loss": 1.0996, "step": 6850 }, { "epoch": 2.284856571047365, "grad_norm": 1.9681981801986694, "step": 6850 }, { "epoch": 2.284856571047365, "learning_rate": 0.000720169405378069, "step": 6850 }, { "epoch": 2.284856571047365, "loss": 1.0952081680297852, "step": 6850 }, { "ce_loss": 0.2938965857028961, "epoch": 2.284856571047365, "step": 6850 }, { "distill_loss": 0.4962945580482483, "epoch": 2.284856571047365, "step": 6850 }, { "epoch": 2.284856571047365, "ref_ce_loss": 0.17742693424224854, "step": 6850 }, { "epoch": 2.284856571047365, "loss": 1.3066614866256714, "step": 6850 }, { "ce_loss": 0.4631924033164978, "epoch": 2.284856571047365, "step": 6850 }, { "distill_loss": 0.6236108541488647, "epoch": 2.284856571047365, "step": 6850 }, { "epoch": 2.284856571047365, "ref_ce_loss": 0.21928176283836365, "step": 6850 }, { "epoch": 2.2881921280853903, "loss": 1.1893, "step": 6860 }, { "epoch": 2.2881921280853903, "grad_norm": 1.8243825435638428, "step": 6860 }, { "epoch": 2.2881921280853903, "learning_rate": 0.0007199101841663042, "step": 6860 }, { "epoch": 2.2881921280853903, "loss": 1.8653347492218018, "step": 6860 }, { "ce_loss": 0.3484281599521637, "epoch": 2.2881921280853903, "step": 6860 }, { "distill_loss": 0.4665539860725403, "epoch": 2.2881921280853903, "step": 6860 }, { "epoch": 2.2881921280853903, "ref_ce_loss": 0.19580277800559998, "step": 6860 }, { "epoch": 2.2881921280853903, "loss": 1.0679538249969482, "step": 6860 }, { "ce_loss": 0.3303784132003784, "epoch": 2.2881921280853903, "step": 6860 }, { "distill_loss": 0.475466251373291, "epoch": 2.2881921280853903, "step": 6860 }, { "epoch": 2.2881921280853903, "ref_ce_loss": 0.26188352704048157, "step": 6860 }, { "epoch": 2.2915276851234156, "loss": 1.2505, "step": 6870 }, { "epoch": 2.2915276851234156, "grad_norm": 3.925416946411133, "step": 6870 }, { "epoch": 2.2915276851234156, "learning_rate": 0.000719650589584931, "step": 6870 }, { "epoch": 2.2915276851234156, "loss": 1.038080096244812, "step": 6870 }, { "ce_loss": 0.2771221697330475, "epoch": 2.2915276851234156, "step": 6870 }, { "distill_loss": 0.44759422540664673, "epoch": 2.2915276851234156, "step": 6870 }, { "epoch": 2.2915276851234156, "ref_ce_loss": 0.23251193761825562, "step": 6870 }, { "epoch": 2.2915276851234156, "loss": 1.0797985792160034, "step": 6870 }, { "ce_loss": 0.29750117659568787, "epoch": 2.2915276851234156, "step": 6870 }, { "distill_loss": 0.48262596130371094, "epoch": 2.2915276851234156, "step": 6870 }, { "epoch": 2.2915276851234156, "ref_ce_loss": 0.18303725123405457, "step": 6870 }, { "epoch": 2.294863242161441, "loss": 1.1422, "step": 6880 }, { "epoch": 2.294863242161441, "grad_norm": 2.3651957511901855, "step": 6880 }, { "epoch": 2.294863242161441, "learning_rate": 0.0007193906219369236, "step": 6880 }, { "epoch": 2.294863242161441, "loss": 1.2924625873565674, "step": 6880 }, { "ce_loss": 0.40562325716018677, "epoch": 2.294863242161441, "step": 6880 }, { "distill_loss": 0.5779322385787964, "epoch": 2.294863242161441, "step": 6880 }, { "epoch": 2.294863242161441, "ref_ce_loss": 0.24361947178840637, "step": 6880 }, { "epoch": 2.294863242161441, "loss": 1.1276745796203613, "step": 6880 }, { "ce_loss": 0.3326677680015564, "epoch": 2.294863242161441, "step": 6880 }, { "distill_loss": 0.5647445917129517, "epoch": 2.294863242161441, "step": 6880 }, { "epoch": 2.294863242161441, "ref_ce_loss": 0.22994059324264526, "step": 6880 }, { "epoch": 2.2981987991994663, "loss": 1.1616, "step": 6890 }, { "epoch": 2.2981987991994663, "grad_norm": 2.5007548332214355, "step": 6890 }, { "epoch": 2.2981987991994663, "learning_rate": 0.0007191302815256927, "step": 6890 }, { "epoch": 2.2981987991994663, "loss": 1.0267442464828491, "step": 6890 }, { "ce_loss": 0.3053237497806549, "epoch": 2.2981987991994663, "step": 6890 }, { "distill_loss": 0.44782763719558716, "epoch": 2.2981987991994663, "step": 6890 }, { "epoch": 2.2981987991994663, "ref_ce_loss": 0.22624436020851135, "step": 6890 }, { "epoch": 2.2981987991994663, "loss": 1.4283981323242188, "step": 6890 }, { "ce_loss": 0.3315616846084595, "epoch": 2.2981987991994663, "step": 6890 }, { "distill_loss": 0.5364858508110046, "epoch": 2.2981987991994663, "step": 6890 }, { "epoch": 2.2981987991994663, "ref_ce_loss": 0.22474513947963715, "step": 6890 }, { "epoch": 2.3015343562374917, "loss": 1.0545, "step": 6900 }, { "epoch": 2.3015343562374917, "grad_norm": 1.568762183189392, "step": 6900 }, { "epoch": 2.3015343562374917, "learning_rate": 0.0007188695686550835, "step": 6900 }, { "epoch": 2.3015343562374917, "loss": 0.9278749823570251, "step": 6900 }, { "ce_loss": 0.2324940264225006, "epoch": 2.3015343562374917, "step": 6900 }, { "distill_loss": 0.4496258795261383, "epoch": 2.3015343562374917, "step": 6900 }, { "epoch": 2.3015343562374917, "ref_ce_loss": 0.18112577497959137, "step": 6900 }, { "epoch": 2.3015343562374917, "loss": 1.423940896987915, "step": 6900 }, { "ce_loss": 0.2764613628387451, "epoch": 2.3015343562374917, "step": 6900 }, { "distill_loss": 0.40918517112731934, "epoch": 2.3015343562374917, "step": 6900 }, { "epoch": 2.3015343562374917, "ref_ce_loss": 0.26147595047950745, "step": 6900 }, { "epoch": 2.304869913275517, "loss": 1.1087, "step": 6910 }, { "epoch": 2.304869913275517, "grad_norm": 2.541975498199463, "step": 6910 }, { "epoch": 2.304869913275517, "learning_rate": 0.0007186084836293757, "step": 6910 }, { "epoch": 2.304869913275517, "loss": 0.925176203250885, "step": 6910 }, { "ce_loss": 0.2801961302757263, "epoch": 2.304869913275517, "step": 6910 }, { "distill_loss": 0.4048868715763092, "epoch": 2.304869913275517, "step": 6910 }, { "epoch": 2.304869913275517, "ref_ce_loss": 0.23977307975292206, "step": 6910 }, { "epoch": 2.304869913275517, "loss": 0.8737236261367798, "step": 6910 }, { "ce_loss": 0.21469813585281372, "epoch": 2.304869913275517, "step": 6910 }, { "distill_loss": 0.3422822058200836, "epoch": 2.304869913275517, "step": 6910 }, { "epoch": 2.304869913275517, "ref_ce_loss": 0.1523410528898239, "step": 6910 }, { "epoch": 2.3082054703135424, "loss": 1.0511, "step": 6920 }, { "epoch": 2.3082054703135424, "grad_norm": 1.754577875137329, "step": 6920 }, { "epoch": 2.3082054703135424, "learning_rate": 0.000718347026753284, "step": 6920 }, { "epoch": 2.3082054703135424, "loss": 0.9576894044876099, "step": 6920 }, { "ce_loss": 0.22706426680088043, "epoch": 2.3082054703135424, "step": 6920 }, { "distill_loss": 0.407731831073761, "epoch": 2.3082054703135424, "step": 6920 }, { "epoch": 2.3082054703135424, "ref_ce_loss": 0.24513918161392212, "step": 6920 }, { "epoch": 2.3082054703135424, "loss": 1.3680813312530518, "step": 6920 }, { "ce_loss": 0.3933814764022827, "epoch": 2.3082054703135424, "step": 6920 }, { "distill_loss": 0.5302331447601318, "epoch": 2.3082054703135424, "step": 6920 }, { "epoch": 2.3082054703135424, "ref_ce_loss": 0.2798175811767578, "step": 6920 }, { "epoch": 2.3115410273515677, "loss": 1.1067, "step": 6930 }, { "epoch": 2.3115410273515677, "grad_norm": 2.897920608520508, "step": 6930 }, { "epoch": 2.3115410273515677, "learning_rate": 0.0007180851983319564, "step": 6930 }, { "epoch": 2.3115410273515677, "loss": 1.1339976787567139, "step": 6930 }, { "ce_loss": 0.3197008967399597, "epoch": 2.3115410273515677, "step": 6930 }, { "distill_loss": 0.5434731841087341, "epoch": 2.3115410273515677, "step": 6930 }, { "epoch": 2.3115410273515677, "ref_ce_loss": 0.21307173371315002, "step": 6930 }, { "epoch": 2.3115410273515677, "loss": 1.1253206729888916, "step": 6930 }, { "ce_loss": 0.2909734547138214, "epoch": 2.3115410273515677, "step": 6930 }, { "distill_loss": 0.42577847838401794, "epoch": 2.3115410273515677, "step": 6930 }, { "epoch": 2.3115410273515677, "ref_ce_loss": 0.2254728227853775, "step": 6930 }, { "epoch": 2.314876584389593, "loss": 1.0654, "step": 6940 }, { "epoch": 2.314876584389593, "grad_norm": 2.6729137897491455, "step": 6940 }, { "epoch": 2.314876584389593, "learning_rate": 0.000717822998670975, "step": 6940 }, { "epoch": 2.314876584389593, "loss": 0.9879928231239319, "step": 6940 }, { "ce_loss": 0.2600327432155609, "epoch": 2.314876584389593, "step": 6940 }, { "distill_loss": 0.5054440498352051, "epoch": 2.314876584389593, "step": 6940 }, { "epoch": 2.314876584389593, "ref_ce_loss": 0.1790090799331665, "step": 6940 }, { "epoch": 2.314876584389593, "loss": 1.4961574077606201, "step": 6940 }, { "ce_loss": 0.29782921075820923, "epoch": 2.314876584389593, "step": 6940 }, { "distill_loss": 0.5347850322723389, "epoch": 2.314876584389593, "step": 6940 }, { "epoch": 2.314876584389593, "ref_ce_loss": 0.20284485816955566, "step": 6940 }, { "epoch": 2.3182121414276184, "loss": 1.0843, "step": 6950 }, { "epoch": 2.3182121414276184, "grad_norm": 2.8103513717651367, "step": 6950 }, { "epoch": 2.3182121414276184, "learning_rate": 0.000717560428076355, "step": 6950 }, { "epoch": 2.3182121414276184, "loss": 1.302746057510376, "step": 6950 }, { "ce_loss": 0.38361117243766785, "epoch": 2.3182121414276184, "step": 6950 }, { "distill_loss": 0.5667811036109924, "epoch": 2.3182121414276184, "step": 6950 }, { "epoch": 2.3182121414276184, "ref_ce_loss": 0.2757999002933502, "step": 6950 }, { "epoch": 2.3182121414276184, "loss": 1.3873010873794556, "step": 6950 }, { "ce_loss": 0.34579405188560486, "epoch": 2.3182121414276184, "step": 6950 }, { "distill_loss": 0.5673583745956421, "epoch": 2.3182121414276184, "step": 6950 }, { "epoch": 2.3182121414276184, "ref_ce_loss": 0.2157682180404663, "step": 6950 }, { "epoch": 2.321547698465644, "loss": 1.0658, "step": 6960 }, { "epoch": 2.321547698465644, "grad_norm": 2.6049487590789795, "step": 6960 }, { "epoch": 2.321547698465644, "learning_rate": 0.0007172974868545445, "step": 6960 }, { "epoch": 2.321547698465644, "loss": 1.1189073324203491, "step": 6960 }, { "ce_loss": 0.279064416885376, "epoch": 2.321547698465644, "step": 6960 }, { "distill_loss": 0.5091290473937988, "epoch": 2.321547698465644, "step": 6960 }, { "epoch": 2.321547698465644, "ref_ce_loss": 0.18219119310379028, "step": 6960 }, { "epoch": 2.321547698465644, "loss": 1.141963005065918, "step": 6960 }, { "ce_loss": 0.39140236377716064, "epoch": 2.321547698465644, "step": 6960 }, { "distill_loss": 0.5031481981277466, "epoch": 2.321547698465644, "step": 6960 }, { "epoch": 2.321547698465644, "ref_ce_loss": 0.2467627376317978, "step": 6960 }, { "epoch": 2.324883255503669, "loss": 1.0916, "step": 6970 }, { "epoch": 2.324883255503669, "grad_norm": 2.408190965652466, "step": 6970 }, { "epoch": 2.324883255503669, "learning_rate": 0.0007170341753124242, "step": 6970 }, { "epoch": 2.324883255503669, "loss": 1.1541481018066406, "step": 6970 }, { "ce_loss": 0.3108407258987427, "epoch": 2.324883255503669, "step": 6970 }, { "distill_loss": 0.4415343701839447, "epoch": 2.324883255503669, "step": 6970 }, { "epoch": 2.324883255503669, "ref_ce_loss": 0.20662188529968262, "step": 6970 }, { "epoch": 2.324883255503669, "loss": 1.3969581127166748, "step": 6970 }, { "ce_loss": 0.2721030116081238, "epoch": 2.324883255503669, "step": 6970 }, { "distill_loss": 0.45736241340637207, "epoch": 2.324883255503669, "step": 6970 }, { "epoch": 2.324883255503669, "ref_ce_loss": 0.21603234112262726, "step": 6970 }, { "epoch": 2.3282188125416945, "loss": 1.1454, "step": 6980 }, { "epoch": 2.3282188125416945, "grad_norm": 1.446701169013977, "step": 6980 }, { "epoch": 2.3282188125416945, "learning_rate": 0.0007167704937573071, "step": 6980 }, { "epoch": 2.3282188125416945, "loss": 1.009114384651184, "step": 6980 }, { "ce_loss": 0.3090960681438446, "epoch": 2.3282188125416945, "step": 6980 }, { "distill_loss": 0.4400170147418976, "epoch": 2.3282188125416945, "step": 6980 }, { "epoch": 2.3282188125416945, "ref_ce_loss": 0.25981760025024414, "step": 6980 }, { "epoch": 2.3282188125416945, "loss": 0.9236708283424377, "step": 6980 }, { "ce_loss": 0.2558498680591583, "epoch": 2.3282188125416945, "step": 6980 }, { "distill_loss": 0.417130708694458, "epoch": 2.3282188125416945, "step": 6980 }, { "epoch": 2.3282188125416945, "ref_ce_loss": 0.17787563800811768, "step": 6980 }, { "epoch": 2.33155436957972, "loss": 1.1306, "step": 6990 }, { "epoch": 2.33155436957972, "grad_norm": 2.063185930252075, "step": 6990 }, { "epoch": 2.33155436957972, "learning_rate": 0.0007165064424969377, "step": 6990 }, { "epoch": 2.33155436957972, "loss": 1.2225693464279175, "step": 6990 }, { "ce_loss": 0.40387097001075745, "epoch": 2.33155436957972, "step": 6990 }, { "distill_loss": 0.4863589406013489, "epoch": 2.33155436957972, "step": 6990 }, { "epoch": 2.33155436957972, "ref_ce_loss": 0.21981634199619293, "step": 6990 }, { "epoch": 2.33155436957972, "loss": 1.1351462602615356, "step": 6990 }, { "ce_loss": 0.3387243151664734, "epoch": 2.33155436957972, "step": 6990 }, { "distill_loss": 0.4375489354133606, "epoch": 2.33155436957972, "step": 6990 }, { "epoch": 2.33155436957972, "ref_ce_loss": 0.20835743844509125, "step": 6990 }, { "epoch": 2.334889926617745, "loss": 1.0529, "step": 7000 }, { "epoch": 2.334889926617745, "grad_norm": 2.4738194942474365, "step": 7000 }, { "epoch": 2.334889926617745, "learning_rate": 0.0007162420218394925, "step": 7000 }, { "epoch": 2.334889926617745, "loss": 1.1578466892242432, "step": 7000 }, { "ce_loss": 0.22912459075450897, "epoch": 2.334889926617745, "step": 7000 }, { "distill_loss": 0.41512879729270935, "epoch": 2.334889926617745, "step": 7000 }, { "epoch": 2.334889926617745, "ref_ce_loss": 0.1929943561553955, "step": 7000 }, { "epoch": 2.334889926617745, "loss": 1.3507180213928223, "step": 7000 }, { "ce_loss": 0.5369671583175659, "epoch": 2.334889926617745, "step": 7000 }, { "distill_loss": 0.45025479793548584, "epoch": 2.334889926617745, "step": 7000 }, { "epoch": 2.334889926617745, "ref_ce_loss": 0.3083897531032562, "step": 7000 }, { "epoch": 2.3382254836557705, "loss": 1.0127, "step": 7010 }, { "epoch": 2.3382254836557705, "grad_norm": 1.8202849626541138, "step": 7010 }, { "epoch": 2.3382254836557705, "learning_rate": 0.0007159772320935789, "step": 7010 }, { "epoch": 2.3382254836557705, "loss": 0.9785995483398438, "step": 7010 }, { "ce_loss": 0.2975868582725525, "epoch": 2.3382254836557705, "step": 7010 }, { "distill_loss": 0.386867880821228, "epoch": 2.3382254836557705, "step": 7010 }, { "epoch": 2.3382254836557705, "ref_ce_loss": 0.20957233011722565, "step": 7010 }, { "epoch": 2.3382254836557705, "loss": 1.2656875848770142, "step": 7010 }, { "ce_loss": 0.26873648166656494, "epoch": 2.3382254836557705, "step": 7010 }, { "distill_loss": 0.35235121846199036, "epoch": 2.3382254836557705, "step": 7010 }, { "epoch": 2.3382254836557705, "ref_ce_loss": 0.17390835285186768, "step": 7010 }, { "epoch": 2.341561040693796, "loss": 1.1443, "step": 7020 }, { "epoch": 2.341561040693796, "grad_norm": 1.9370781183242798, "step": 7020 }, { "epoch": 2.341561040693796, "learning_rate": 0.0007157120735682347, "step": 7020 }, { "epoch": 2.341561040693796, "loss": 1.189287781715393, "step": 7020 }, { "ce_loss": 0.3431004583835602, "epoch": 2.341561040693796, "step": 7020 }, { "distill_loss": 0.3850644528865814, "epoch": 2.341561040693796, "step": 7020 }, { "epoch": 2.341561040693796, "ref_ce_loss": 0.2174300104379654, "step": 7020 }, { "epoch": 2.341561040693796, "loss": 0.8547623157501221, "step": 7020 }, { "ce_loss": 0.23848767578601837, "epoch": 2.341561040693796, "step": 7020 }, { "distill_loss": 0.37629422545433044, "epoch": 2.341561040693796, "step": 7020 }, { "epoch": 2.341561040693796, "ref_ce_loss": 0.1636846661567688, "step": 7020 }, { "epoch": 2.3448965977318212, "loss": 1.0868, "step": 7030 }, { "epoch": 2.3448965977318212, "grad_norm": 2.332381010055542, "step": 7030 }, { "epoch": 2.3448965977318212, "learning_rate": 0.0007154465465729286, "step": 7030 }, { "epoch": 2.3448965977318212, "loss": 0.9023130536079407, "step": 7030 }, { "ce_loss": 0.21761271357536316, "epoch": 2.3448965977318212, "step": 7030 }, { "distill_loss": 0.4426616132259369, "epoch": 2.3448965977318212, "step": 7030 }, { "epoch": 2.3448965977318212, "ref_ce_loss": 0.18290437757968903, "step": 7030 }, { "epoch": 2.3448965977318212, "loss": 1.361938238143921, "step": 7030 }, { "ce_loss": 0.2959597706794739, "epoch": 2.3448965977318212, "step": 7030 }, { "distill_loss": 0.4138369858264923, "epoch": 2.3448965977318212, "step": 7030 }, { "epoch": 2.3448965977318212, "ref_ce_loss": 0.19288687407970428, "step": 7030 }, { "epoch": 2.3482321547698466, "loss": 1.1662, "step": 7040 }, { "epoch": 2.3482321547698466, "grad_norm": 2.0527827739715576, "step": 7040 }, { "epoch": 2.3482321547698466, "learning_rate": 0.0007151806514175594, "step": 7040 }, { "epoch": 2.3482321547698466, "loss": 1.0762706995010376, "step": 7040 }, { "ce_loss": 0.3061358332633972, "epoch": 2.3482321547698466, "step": 7040 }, { "distill_loss": 0.4743525981903076, "epoch": 2.3482321547698466, "step": 7040 }, { "epoch": 2.3482321547698466, "ref_ce_loss": 0.2185249924659729, "step": 7040 }, { "epoch": 2.3482321547698466, "loss": 1.901619791984558, "step": 7040 }, { "ce_loss": 0.3605675995349884, "epoch": 2.3482321547698466, "step": 7040 }, { "distill_loss": 0.4413268566131592, "epoch": 2.3482321547698466, "step": 7040 }, { "epoch": 2.3482321547698466, "ref_ce_loss": 0.22994102537631989, "step": 7040 }, { "epoch": 2.351567711807872, "loss": 1.0331, "step": 7050 }, { "epoch": 2.351567711807872, "grad_norm": 1.784109354019165, "step": 7050 }, { "epoch": 2.351567711807872, "learning_rate": 0.0007149143884124551, "step": 7050 }, { "epoch": 2.351567711807872, "loss": 1.3922317028045654, "step": 7050 }, { "ce_loss": 0.26569482684135437, "epoch": 2.351567711807872, "step": 7050 }, { "distill_loss": 0.39552581310272217, "epoch": 2.351567711807872, "step": 7050 }, { "epoch": 2.351567711807872, "ref_ce_loss": 0.22670617699623108, "step": 7050 }, { "epoch": 2.351567711807872, "loss": 1.1182749271392822, "step": 7050 }, { "ce_loss": 0.31295108795166016, "epoch": 2.351567711807872, "step": 7050 }, { "distill_loss": 0.4126480221748352, "epoch": 2.351567711807872, "step": 7050 }, { "epoch": 2.351567711807872, "ref_ce_loss": 0.20783144235610962, "step": 7050 }, { "epoch": 2.3549032688458973, "loss": 1.1147, "step": 7060 }, { "epoch": 2.3549032688458973, "grad_norm": 2.4703519344329834, "step": 7060 }, { "epoch": 2.3549032688458973, "learning_rate": 0.0007146477578683731, "step": 7060 }, { "epoch": 2.3549032688458973, "loss": 1.0593907833099365, "step": 7060 }, { "ce_loss": 0.2643020749092102, "epoch": 2.3549032688458973, "step": 7060 }, { "distill_loss": 0.3918103575706482, "epoch": 2.3549032688458973, "step": 7060 }, { "epoch": 2.3549032688458973, "ref_ce_loss": 0.20907360315322876, "step": 7060 }, { "epoch": 2.3549032688458973, "loss": 0.9865421056747437, "step": 7060 }, { "ce_loss": 0.2925555408000946, "epoch": 2.3549032688458973, "step": 7060 }, { "distill_loss": 0.47285857796669006, "epoch": 2.3549032688458973, "step": 7060 }, { "epoch": 2.3549032688458973, "ref_ce_loss": 0.2207275629043579, "step": 7060 }, { "epoch": 2.3582388258839226, "loss": 1.0843, "step": 7070 }, { "epoch": 2.3582388258839226, "grad_norm": 2.159562110900879, "step": 7070 }, { "epoch": 2.3582388258839226, "learning_rate": 0.0007143807600965004, "step": 7070 }, { "epoch": 2.3582388258839226, "loss": 1.2472641468048096, "step": 7070 }, { "ce_loss": 0.2805372178554535, "epoch": 2.3582388258839226, "step": 7070 }, { "distill_loss": 0.46946394443511963, "epoch": 2.3582388258839226, "step": 7070 }, { "epoch": 2.3582388258839226, "ref_ce_loss": 0.2066594958305359, "step": 7070 }, { "epoch": 2.3582388258839226, "loss": 1.0199264287948608, "step": 7070 }, { "ce_loss": 0.3058395981788635, "epoch": 2.3582388258839226, "step": 7070 }, { "distill_loss": 0.4050654470920563, "epoch": 2.3582388258839226, "step": 7070 }, { "epoch": 2.3582388258839226, "ref_ce_loss": 0.2362363338470459, "step": 7070 }, { "epoch": 2.361574382921948, "loss": 1.175, "step": 7080 }, { "epoch": 2.361574382921948, "grad_norm": 4.379484176635742, "step": 7080 }, { "epoch": 2.361574382921948, "learning_rate": 0.0007141133954084518, "step": 7080 }, { "epoch": 2.361574382921948, "loss": 1.5523792505264282, "step": 7080 }, { "ce_loss": 0.3695584237575531, "epoch": 2.361574382921948, "step": 7080 }, { "distill_loss": 0.6727487444877625, "epoch": 2.361574382921948, "step": 7080 }, { "epoch": 2.361574382921948, "ref_ce_loss": 0.25676894187927246, "step": 7080 }, { "epoch": 2.361574382921948, "loss": 1.258602499961853, "step": 7080 }, { "ce_loss": 0.3308517336845398, "epoch": 2.361574382921948, "step": 7080 }, { "distill_loss": 0.5026138424873352, "epoch": 2.361574382921948, "step": 7080 }, { "epoch": 2.361574382921948, "ref_ce_loss": 0.2165544331073761, "step": 7080 }, { "epoch": 2.3649099399599733, "loss": 1.1915, "step": 7090 }, { "epoch": 2.3649099399599733, "grad_norm": 1.4523996114730835, "step": 7090 }, { "epoch": 2.3649099399599733, "learning_rate": 0.0007138456641162708, "step": 7090 }, { "epoch": 2.3649099399599733, "loss": 0.9873166084289551, "step": 7090 }, { "ce_loss": 0.2942551076412201, "epoch": 2.3649099399599733, "step": 7090 }, { "distill_loss": 0.4172300696372986, "epoch": 2.3649099399599733, "step": 7090 }, { "epoch": 2.3649099399599733, "ref_ce_loss": 0.23000051081180573, "step": 7090 }, { "epoch": 2.3649099399599733, "loss": 1.6088917255401611, "step": 7090 }, { "ce_loss": 0.2873069941997528, "epoch": 2.3649099399599733, "step": 7090 }, { "distill_loss": 0.46405521035194397, "epoch": 2.3649099399599733, "step": 7090 }, { "epoch": 2.3649099399599733, "ref_ce_loss": 0.21548372507095337, "step": 7090 }, { "epoch": 2.3682454969979987, "loss": 1.0817, "step": 7100 }, { "epoch": 2.3682454969979987, "grad_norm": 1.5708006620407104, "step": 7100 }, { "epoch": 2.3682454969979987, "learning_rate": 0.0007135775665324286, "step": 7100 }, { "epoch": 2.3682454969979987, "loss": 1.3646875619888306, "step": 7100 }, { "ce_loss": 0.29803529381752014, "epoch": 2.3682454969979987, "step": 7100 }, { "distill_loss": 0.47713354229927063, "epoch": 2.3682454969979987, "step": 7100 }, { "epoch": 2.3682454969979987, "ref_ce_loss": 0.24027717113494873, "step": 7100 }, { "epoch": 2.3682454969979987, "loss": 1.215323567390442, "step": 7100 }, { "ce_loss": 0.31460314989089966, "epoch": 2.3682454969979987, "step": 7100 }, { "distill_loss": 0.46045100688934326, "epoch": 2.3682454969979987, "step": 7100 }, { "epoch": 2.3682454969979987, "ref_ce_loss": 0.17487739026546478, "step": 7100 }, { "epoch": 2.371581054036024, "loss": 1.0933, "step": 7110 }, { "epoch": 2.371581054036024, "grad_norm": 1.793230652809143, "step": 7110 }, { "epoch": 2.371581054036024, "learning_rate": 0.0007133091029698239, "step": 7110 }, { "epoch": 2.371581054036024, "loss": 1.1998347043991089, "step": 7110 }, { "ce_loss": 0.3668796420097351, "epoch": 2.371581054036024, "step": 7110 }, { "distill_loss": 0.5767227411270142, "epoch": 2.371581054036024, "step": 7110 }, { "epoch": 2.371581054036024, "ref_ce_loss": 0.18673135340213776, "step": 7110 }, { "epoch": 2.371581054036024, "loss": 1.4413803815841675, "step": 7110 }, { "ce_loss": 0.28860530257225037, "epoch": 2.371581054036024, "step": 7110 }, { "distill_loss": 0.5573756694793701, "epoch": 2.371581054036024, "step": 7110 }, { "epoch": 2.371581054036024, "ref_ce_loss": 0.1992310881614685, "step": 7110 }, { "epoch": 2.3749166110740494, "loss": 1.1161, "step": 7120 }, { "epoch": 2.3749166110740494, "grad_norm": 1.769562840461731, "step": 7120 }, { "epoch": 2.3749166110740494, "learning_rate": 0.0007130402737417825, "step": 7120 }, { "epoch": 2.3749166110740494, "loss": 0.9815624356269836, "step": 7120 }, { "ce_loss": 0.24128681421279907, "epoch": 2.3749166110740494, "step": 7120 }, { "distill_loss": 0.5173496007919312, "epoch": 2.3749166110740494, "step": 7120 }, { "epoch": 2.3749166110740494, "ref_ce_loss": 0.22285443544387817, "step": 7120 }, { "epoch": 2.3749166110740494, "loss": 0.7874520421028137, "step": 7120 }, { "ce_loss": 0.24674908816814423, "epoch": 2.3749166110740494, "step": 7120 }, { "distill_loss": 0.3168177902698517, "epoch": 2.3749166110740494, "step": 7120 }, { "epoch": 2.3749166110740494, "ref_ce_loss": 0.22377490997314453, "step": 7120 }, { "epoch": 2.3782521681120747, "loss": 1.0619, "step": 7130 }, { "epoch": 2.3782521681120747, "grad_norm": 1.6740084886550903, "step": 7130 }, { "epoch": 2.3782521681120747, "learning_rate": 0.0007127710791620573, "step": 7130 }, { "epoch": 2.3782521681120747, "loss": 1.0077743530273438, "step": 7130 }, { "ce_loss": 0.34261929988861084, "epoch": 2.3782521681120747, "step": 7130 }, { "distill_loss": 0.39329415559768677, "epoch": 2.3782521681120747, "step": 7130 }, { "epoch": 2.3782521681120747, "ref_ce_loss": 0.23144038021564484, "step": 7130 }, { "epoch": 2.3782521681120747, "loss": 0.9109863638877869, "step": 7130 }, { "ce_loss": 0.2927761375904083, "epoch": 2.3782521681120747, "step": 7130 }, { "distill_loss": 0.4364514648914337, "epoch": 2.3782521681120747, "step": 7130 }, { "epoch": 2.3782521681120747, "ref_ce_loss": 0.1817220151424408, "step": 7130 }, { "epoch": 2.3815877251501, "loss": 1.1178, "step": 7140 }, { "epoch": 2.3815877251501, "grad_norm": 3.0695078372955322, "step": 7140 }, { "epoch": 2.3815877251501, "learning_rate": 0.000712501519544827, "step": 7140 }, { "epoch": 2.3815877251501, "loss": 1.096538782119751, "step": 7140 }, { "ce_loss": 0.2581261098384857, "epoch": 2.3815877251501, "step": 7140 }, { "distill_loss": 0.4424154758453369, "epoch": 2.3815877251501, "step": 7140 }, { "epoch": 2.3815877251501, "ref_ce_loss": 0.2355940043926239, "step": 7140 }, { "epoch": 2.3815877251501, "loss": 0.9535109996795654, "step": 7140 }, { "ce_loss": 0.2059747874736786, "epoch": 2.3815877251501, "step": 7140 }, { "distill_loss": 0.39877620339393616, "epoch": 2.3815877251501, "step": 7140 }, { "epoch": 2.3815877251501, "ref_ce_loss": 0.2136932760477066, "step": 7140 }, { "epoch": 2.3849232821881254, "loss": 1.0672, "step": 7150 }, { "epoch": 2.3849232821881254, "grad_norm": 2.157912015914917, "step": 7150 }, { "epoch": 2.3849232821881254, "learning_rate": 0.000712231595204697, "step": 7150 }, { "epoch": 2.3849232821881254, "loss": 0.9159623384475708, "step": 7150 }, { "ce_loss": 0.2085161805152893, "epoch": 2.3849232821881254, "step": 7150 }, { "distill_loss": 0.43406280875205994, "epoch": 2.3849232821881254, "step": 7150 }, { "epoch": 2.3849232821881254, "ref_ce_loss": 0.21358418464660645, "step": 7150 }, { "epoch": 2.3849232821881254, "loss": 1.272546410560608, "step": 7150 }, { "ce_loss": 0.3274426758289337, "epoch": 2.3849232821881254, "step": 7150 }, { "distill_loss": 0.47474992275238037, "epoch": 2.3849232821881254, "step": 7150 }, { "epoch": 2.3849232821881254, "ref_ce_loss": 0.23619666695594788, "step": 7150 }, { "epoch": 2.388258839226151, "loss": 1.1162, "step": 7160 }, { "epoch": 2.388258839226151, "grad_norm": 1.9163835048675537, "step": 7160 }, { "epoch": 2.388258839226151, "learning_rate": 0.0007119613064566976, "step": 7160 }, { "epoch": 2.388258839226151, "loss": 1.100827693939209, "step": 7160 }, { "ce_loss": 0.31992077827453613, "epoch": 2.388258839226151, "step": 7160 }, { "distill_loss": 0.5282621383666992, "epoch": 2.388258839226151, "step": 7160 }, { "epoch": 2.388258839226151, "ref_ce_loss": 0.20679350197315216, "step": 7160 }, { "epoch": 2.388258839226151, "loss": 1.132440209388733, "step": 7160 }, { "ce_loss": 0.2905762195587158, "epoch": 2.388258839226151, "step": 7160 }, { "distill_loss": 0.4590890109539032, "epoch": 2.388258839226151, "step": 7160 }, { "epoch": 2.388258839226151, "ref_ce_loss": 0.24715933203697205, "step": 7160 }, { "epoch": 2.391594396264176, "loss": 1.0898, "step": 7170 }, { "epoch": 2.391594396264176, "grad_norm": 2.1200919151306152, "step": 7170 }, { "epoch": 2.391594396264176, "learning_rate": 0.0007116906536162853, "step": 7170 }, { "epoch": 2.391594396264176, "loss": 1.1722006797790527, "step": 7170 }, { "ce_loss": 0.2898659408092499, "epoch": 2.391594396264176, "step": 7170 }, { "distill_loss": 0.5165716409683228, "epoch": 2.391594396264176, "step": 7170 }, { "epoch": 2.391594396264176, "ref_ce_loss": 0.2042417675256729, "step": 7170 }, { "epoch": 2.391594396264176, "loss": 0.8611701726913452, "step": 7170 }, { "ce_loss": 0.26445263624191284, "epoch": 2.391594396264176, "step": 7170 }, { "distill_loss": 0.4190884828567505, "epoch": 2.391594396264176, "step": 7170 }, { "epoch": 2.391594396264176, "ref_ce_loss": 0.17744985222816467, "step": 7170 }, { "epoch": 2.3949299533022015, "loss": 1.1236, "step": 7180 }, { "epoch": 2.3949299533022015, "grad_norm": 2.103191375732422, "step": 7180 }, { "epoch": 2.3949299533022015, "learning_rate": 0.000711419636999341, "step": 7180 }, { "epoch": 2.3949299533022015, "loss": 1.374045491218567, "step": 7180 }, { "ce_loss": 0.28513509035110474, "epoch": 2.3949299533022015, "step": 7180 }, { "distill_loss": 0.5116044878959656, "epoch": 2.3949299533022015, "step": 7180 }, { "epoch": 2.3949299533022015, "ref_ce_loss": 0.1862870454788208, "step": 7180 }, { "epoch": 2.3949299533022015, "loss": 0.8769741058349609, "step": 7180 }, { "ce_loss": 0.21405696868896484, "epoch": 2.3949299533022015, "step": 7180 }, { "distill_loss": 0.4389774203300476, "epoch": 2.3949299533022015, "step": 7180 }, { "epoch": 2.3949299533022015, "ref_ce_loss": 0.17624899744987488, "step": 7180 }, { "epoch": 2.398265510340227, "loss": 1.1529, "step": 7190 }, { "epoch": 2.398265510340227, "grad_norm": 1.781639575958252, "step": 7190 }, { "epoch": 2.398265510340227, "learning_rate": 0.0007111482569221702, "step": 7190 }, { "epoch": 2.398265510340227, "loss": 1.2831974029541016, "step": 7190 }, { "ce_loss": 0.24365730583667755, "epoch": 2.398265510340227, "step": 7190 }, { "distill_loss": 0.5405185222625732, "epoch": 2.398265510340227, "step": 7190 }, { "epoch": 2.398265510340227, "ref_ce_loss": 0.18862995505332947, "step": 7190 }, { "epoch": 2.398265510340227, "loss": 1.2575526237487793, "step": 7190 }, { "ce_loss": 0.30534759163856506, "epoch": 2.398265510340227, "step": 7190 }, { "distill_loss": 0.4284963309764862, "epoch": 2.398265510340227, "step": 7190 }, { "epoch": 2.398265510340227, "ref_ce_loss": 0.22596989572048187, "step": 7190 }, { "epoch": 2.401601067378252, "loss": 1.1763, "step": 7200 }, { "epoch": 2.401601067378252, "grad_norm": 2.416120767593384, "step": 7200 }, { "epoch": 2.401601067378252, "learning_rate": 0.0007108765137015025, "step": 7200 }, { "epoch": 2.401601067378252, "loss": 1.0382710695266724, "step": 7200 }, { "ce_loss": 0.2267204225063324, "epoch": 2.401601067378252, "step": 7200 }, { "distill_loss": 0.37804681062698364, "epoch": 2.401601067378252, "step": 7200 }, { "epoch": 2.401601067378252, "ref_ce_loss": 0.2037581354379654, "step": 7200 }, { "epoch": 2.401601067378252, "loss": 1.2725032567977905, "step": 7200 }, { "ce_loss": 0.3279874622821808, "epoch": 2.401601067378252, "step": 7200 }, { "distill_loss": 0.44648557901382446, "epoch": 2.401601067378252, "step": 7200 }, { "epoch": 2.401601067378252, "ref_ce_loss": 0.23695345222949982, "step": 7200 }, { "epoch": 2.4049366244162775, "loss": 1.0089, "step": 7210 }, { "epoch": 2.4049366244162775, "grad_norm": 1.7188997268676758, "step": 7210 }, { "epoch": 2.4049366244162775, "learning_rate": 0.0007106044076544916, "step": 7210 }, { "epoch": 2.4049366244162775, "loss": 1.175480604171753, "step": 7210 }, { "ce_loss": 0.3662952482700348, "epoch": 2.4049366244162775, "step": 7210 }, { "distill_loss": 0.410007119178772, "epoch": 2.4049366244162775, "step": 7210 }, { "epoch": 2.4049366244162775, "ref_ce_loss": 0.32809457182884216, "step": 7210 }, { "epoch": 2.4049366244162775, "loss": 1.060041069984436, "step": 7210 }, { "ce_loss": 0.26448532938957214, "epoch": 2.4049366244162775, "step": 7210 }, { "distill_loss": 0.39402514696121216, "epoch": 2.4049366244162775, "step": 7210 }, { "epoch": 2.4049366244162775, "ref_ce_loss": 0.16908228397369385, "step": 7210 }, { "epoch": 2.408272181454303, "loss": 1.0906, "step": 7220 }, { "epoch": 2.408272181454303, "grad_norm": 2.511235237121582, "step": 7220 }, { "epoch": 2.408272181454303, "learning_rate": 0.0007103319390987146, "step": 7220 }, { "epoch": 2.408272181454303, "loss": 0.8862078785896301, "step": 7220 }, { "ce_loss": 0.2618538737297058, "epoch": 2.408272181454303, "step": 7220 }, { "distill_loss": 0.38268762826919556, "epoch": 2.408272181454303, "step": 7220 }, { "epoch": 2.408272181454303, "ref_ce_loss": 0.24153800308704376, "step": 7220 }, { "epoch": 2.408272181454303, "loss": 1.3634955883026123, "step": 7220 }, { "ce_loss": 0.27552342414855957, "epoch": 2.408272181454303, "step": 7220 }, { "distill_loss": 0.48205381631851196, "epoch": 2.408272181454303, "step": 7220 }, { "epoch": 2.408272181454303, "ref_ce_loss": 0.1887548267841339, "step": 7220 }, { "epoch": 2.4116077384923282, "loss": 1.1096, "step": 7230 }, { "epoch": 2.4116077384923282, "grad_norm": 1.6549991369247437, "step": 7230 }, { "epoch": 2.4116077384923282, "learning_rate": 0.0007100591083521716, "step": 7230 }, { "epoch": 2.4116077384923282, "loss": 1.0235944986343384, "step": 7230 }, { "ce_loss": 0.290785551071167, "epoch": 2.4116077384923282, "step": 7230 }, { "distill_loss": 0.3853553533554077, "epoch": 2.4116077384923282, "step": 7230 }, { "epoch": 2.4116077384923282, "ref_ce_loss": 0.2579878270626068, "step": 7230 }, { "epoch": 2.4116077384923282, "loss": 1.4111783504486084, "step": 7230 }, { "ce_loss": 0.30856239795684814, "epoch": 2.4116077384923282, "step": 7230 }, { "distill_loss": 0.3862026631832123, "epoch": 2.4116077384923282, "step": 7230 }, { "epoch": 2.4116077384923282, "ref_ce_loss": 0.17828388512134552, "step": 7230 }, { "epoch": 2.4149432955303536, "loss": 1.156, "step": 7240 }, { "epoch": 2.4149432955303536, "grad_norm": 1.7623802423477173, "step": 7240 }, { "epoch": 2.4149432955303536, "learning_rate": 0.0007097859157332854, "step": 7240 }, { "epoch": 2.4149432955303536, "loss": 1.2994335889816284, "step": 7240 }, { "ce_loss": 0.2809923589229584, "epoch": 2.4149432955303536, "step": 7240 }, { "distill_loss": 0.4485905170440674, "epoch": 2.4149432955303536, "step": 7240 }, { "epoch": 2.4149432955303536, "ref_ce_loss": 0.23245123028755188, "step": 7240 }, { "epoch": 2.4149432955303536, "loss": 1.281017541885376, "step": 7240 }, { "ce_loss": 0.2583031952381134, "epoch": 2.4149432955303536, "step": 7240 }, { "distill_loss": 0.48194581270217896, "epoch": 2.4149432955303536, "step": 7240 }, { "epoch": 2.4149432955303536, "ref_ce_loss": 0.19480441510677338, "step": 7240 }, { "epoch": 2.418278852568379, "loss": 1.2047, "step": 7250 }, { "epoch": 2.418278852568379, "grad_norm": 1.3845771551132202, "step": 7250 }, { "epoch": 2.418278852568379, "learning_rate": 0.0007095123615609013, "step": 7250 }, { "epoch": 2.418278852568379, "loss": 1.400426983833313, "step": 7250 }, { "ce_loss": 0.41670477390289307, "epoch": 2.418278852568379, "step": 7250 }, { "distill_loss": 0.4637930393218994, "epoch": 2.418278852568379, "step": 7250 }, { "epoch": 2.418278852568379, "ref_ce_loss": 0.282934308052063, "step": 7250 }, { "epoch": 2.418278852568379, "loss": 1.1753677129745483, "step": 7250 }, { "ce_loss": 0.2805415689945221, "epoch": 2.418278852568379, "step": 7250 }, { "distill_loss": 0.5144063234329224, "epoch": 2.418278852568379, "step": 7250 }, { "epoch": 2.418278852568379, "ref_ce_loss": 0.2224591225385666, "step": 7250 }, { "epoch": 2.4216144096064043, "loss": 1.1877, "step": 7260 }, { "epoch": 2.4216144096064043, "grad_norm": 1.6489319801330566, "step": 7260 }, { "epoch": 2.4216144096064043, "learning_rate": 0.0007092384461542862, "step": 7260 }, { "epoch": 2.4216144096064043, "loss": 0.8170281648635864, "step": 7260 }, { "ce_loss": 0.1991329789161682, "epoch": 2.4216144096064043, "step": 7260 }, { "distill_loss": 0.4296095371246338, "epoch": 2.4216144096064043, "step": 7260 }, { "epoch": 2.4216144096064043, "ref_ce_loss": 0.1308637410402298, "step": 7260 }, { "epoch": 2.4216144096064043, "loss": 1.1548151969909668, "step": 7260 }, { "ce_loss": 0.2822519540786743, "epoch": 2.4216144096064043, "step": 7260 }, { "distill_loss": 0.4908180236816406, "epoch": 2.4216144096064043, "step": 7260 }, { "epoch": 2.4216144096064043, "ref_ce_loss": 0.23409049212932587, "step": 7260 }, { "epoch": 2.4249499666444296, "loss": 1.1386, "step": 7270 }, { "epoch": 2.4249499666444296, "grad_norm": 3.451852798461914, "step": 7270 }, { "epoch": 2.4249499666444296, "learning_rate": 0.0007089641698331291, "step": 7270 }, { "epoch": 2.4249499666444296, "loss": 1.0679713487625122, "step": 7270 }, { "ce_loss": 0.31112760305404663, "epoch": 2.4249499666444296, "step": 7270 }, { "distill_loss": 0.5061574578285217, "epoch": 2.4249499666444296, "step": 7270 }, { "epoch": 2.4249499666444296, "ref_ce_loss": 0.25037115812301636, "step": 7270 }, { "epoch": 2.4249499666444296, "loss": 0.9560131430625916, "step": 7270 }, { "ce_loss": 0.23722390830516815, "epoch": 2.4249499666444296, "step": 7270 }, { "distill_loss": 0.4231836795806885, "epoch": 2.4249499666444296, "step": 7270 }, { "epoch": 2.4249499666444296, "ref_ce_loss": 0.17950914800167084, "step": 7270 }, { "epoch": 2.428285523682455, "loss": 1.0816, "step": 7280 }, { "epoch": 2.428285523682455, "grad_norm": 1.921055555343628, "step": 7280 }, { "epoch": 2.428285523682455, "learning_rate": 0.0007086895329175397, "step": 7280 }, { "epoch": 2.428285523682455, "loss": 1.4632203578948975, "step": 7280 }, { "ce_loss": 0.3397609293460846, "epoch": 2.428285523682455, "step": 7280 }, { "distill_loss": 0.4787476062774658, "epoch": 2.428285523682455, "step": 7280 }, { "epoch": 2.428285523682455, "ref_ce_loss": 0.25433018803596497, "step": 7280 }, { "epoch": 2.428285523682455, "loss": 1.5840129852294922, "step": 7280 }, { "ce_loss": 0.22069182991981506, "epoch": 2.428285523682455, "step": 7280 }, { "distill_loss": 0.4394543170928955, "epoch": 2.428285523682455, "step": 7280 }, { "epoch": 2.428285523682455, "ref_ce_loss": 0.18539944291114807, "step": 7280 }, { "epoch": 2.4316210807204803, "loss": 1.157, "step": 7290 }, { "epoch": 2.4316210807204803, "grad_norm": 2.349575996398926, "step": 7290 }, { "epoch": 2.4316210807204803, "learning_rate": 0.0007084145357280491, "step": 7290 }, { "epoch": 2.4316210807204803, "loss": 0.7957502603530884, "step": 7290 }, { "ce_loss": 0.2420455366373062, "epoch": 2.4316210807204803, "step": 7290 }, { "distill_loss": 0.2949468493461609, "epoch": 2.4316210807204803, "step": 7290 }, { "epoch": 2.4316210807204803, "ref_ce_loss": 0.2086876779794693, "step": 7290 }, { "epoch": 2.4316210807204803, "loss": 1.535692572593689, "step": 7290 }, { "ce_loss": 0.2967432141304016, "epoch": 2.4316210807204803, "step": 7290 }, { "distill_loss": 0.4691455066204071, "epoch": 2.4316210807204803, "step": 7290 }, { "epoch": 2.4316210807204803, "ref_ce_loss": 0.25700587034225464, "step": 7290 }, { "epoch": 2.4349566377585057, "loss": 1.0614, "step": 7300 }, { "epoch": 2.4349566377585057, "grad_norm": 2.760479688644409, "step": 7300 }, { "epoch": 2.4349566377585057, "learning_rate": 0.0007081391785856086, "step": 7300 }, { "epoch": 2.4349566377585057, "loss": 1.2903001308441162, "step": 7300 }, { "ce_loss": 0.338364839553833, "epoch": 2.4349566377585057, "step": 7300 }, { "distill_loss": 0.4312809109687805, "epoch": 2.4349566377585057, "step": 7300 }, { "epoch": 2.4349566377585057, "ref_ce_loss": 0.2753032445907593, "step": 7300 }, { "epoch": 2.4349566377585057, "loss": 1.3935198783874512, "step": 7300 }, { "ce_loss": 0.26645249128341675, "epoch": 2.4349566377585057, "step": 7300 }, { "distill_loss": 0.4124131202697754, "epoch": 2.4349566377585057, "step": 7300 }, { "epoch": 2.4349566377585057, "ref_ce_loss": 0.2165878266096115, "step": 7300 }, { "epoch": 2.438292194796531, "loss": 1.0521, "step": 7310 }, { "epoch": 2.438292194796531, "grad_norm": 2.61460542678833, "step": 7310 }, { "epoch": 2.438292194796531, "learning_rate": 0.0007078634618115896, "step": 7310 }, { "epoch": 2.438292194796531, "loss": 1.2837860584259033, "step": 7310 }, { "ce_loss": 0.34777307510375977, "epoch": 2.438292194796531, "step": 7310 }, { "distill_loss": 0.5416009426116943, "epoch": 2.438292194796531, "step": 7310 }, { "epoch": 2.438292194796531, "ref_ce_loss": 0.24954359233379364, "step": 7310 }, { "epoch": 2.438292194796531, "loss": 1.1179468631744385, "step": 7310 }, { "ce_loss": 0.2547168731689453, "epoch": 2.438292194796531, "step": 7310 }, { "distill_loss": 0.4858138859272003, "epoch": 2.438292194796531, "step": 7310 }, { "epoch": 2.438292194796531, "ref_ce_loss": 0.1752818375825882, "step": 7310 }, { "epoch": 2.4416277518345564, "loss": 1.0628, "step": 7320 }, { "epoch": 2.4416277518345564, "grad_norm": 2.5039851665496826, "step": 7320 }, { "epoch": 2.4416277518345564, "learning_rate": 0.0007075873857277831, "step": 7320 }, { "epoch": 2.4416277518345564, "loss": 1.1534297466278076, "step": 7320 }, { "ce_loss": 0.32291606068611145, "epoch": 2.4416277518345564, "step": 7320 }, { "distill_loss": 0.428676962852478, "epoch": 2.4416277518345564, "step": 7320 }, { "epoch": 2.4416277518345564, "ref_ce_loss": 0.21789410710334778, "step": 7320 }, { "epoch": 2.4416277518345564, "loss": 0.9658570885658264, "step": 7320 }, { "ce_loss": 0.29541534185409546, "epoch": 2.4416277518345564, "step": 7320 }, { "distill_loss": 0.37291932106018066, "epoch": 2.4416277518345564, "step": 7320 }, { "epoch": 2.4416277518345564, "ref_ce_loss": 0.17409594357013702, "step": 7320 }, { "epoch": 2.4449633088725817, "loss": 1.0498, "step": 7330 }, { "epoch": 2.4449633088725817, "grad_norm": 1.6878045797348022, "step": 7330 }, { "epoch": 2.4449633088725817, "learning_rate": 0.0007073109506563997, "step": 7330 }, { "epoch": 2.4449633088725817, "loss": 1.2049627304077148, "step": 7330 }, { "ce_loss": 0.29368162155151367, "epoch": 2.4449633088725817, "step": 7330 }, { "distill_loss": 0.43546533584594727, "epoch": 2.4449633088725817, "step": 7330 }, { "epoch": 2.4449633088725817, "ref_ce_loss": 0.2480524331331253, "step": 7330 }, { "epoch": 2.4449633088725817, "loss": 1.1017310619354248, "step": 7330 }, { "ce_loss": 0.3388078212738037, "epoch": 2.4449633088725817, "step": 7330 }, { "distill_loss": 0.465187132358551, "epoch": 2.4449633088725817, "step": 7330 }, { "epoch": 2.4449633088725817, "ref_ce_loss": 0.24967536330223083, "step": 7330 }, { "epoch": 2.448298865910607, "loss": 1.0671, "step": 7340 }, { "epoch": 2.448298865910607, "grad_norm": 2.228999614715576, "step": 7340 }, { "epoch": 2.448298865910607, "learning_rate": 0.0007070341569200688, "step": 7340 }, { "epoch": 2.448298865910607, "loss": 1.1676300764083862, "step": 7340 }, { "ce_loss": 0.27120673656463623, "epoch": 2.448298865910607, "step": 7340 }, { "distill_loss": 0.5025864243507385, "epoch": 2.448298865910607, "step": 7340 }, { "epoch": 2.448298865910607, "ref_ce_loss": 0.17142629623413086, "step": 7340 }, { "epoch": 2.448298865910607, "loss": 1.1279618740081787, "step": 7340 }, { "ce_loss": 0.25630664825439453, "epoch": 2.448298865910607, "step": 7340 }, { "distill_loss": 0.5078726410865784, "epoch": 2.448298865910607, "step": 7340 }, { "epoch": 2.448298865910607, "ref_ce_loss": 0.19859783351421356, "step": 7340 }, { "epoch": 2.4516344229486324, "loss": 1.1206, "step": 7350 }, { "epoch": 2.4516344229486324, "grad_norm": 2.9473607540130615, "step": 7350 }, { "epoch": 2.4516344229486324, "learning_rate": 0.0007067570048418387, "step": 7350 }, { "epoch": 2.4516344229486324, "loss": 1.2834032773971558, "step": 7350 }, { "ce_loss": 0.33302411437034607, "epoch": 2.4516344229486324, "step": 7350 }, { "distill_loss": 0.543437123298645, "epoch": 2.4516344229486324, "step": 7350 }, { "epoch": 2.4516344229486324, "ref_ce_loss": 0.24555639922618866, "step": 7350 }, { "epoch": 2.4516344229486324, "loss": 1.115805983543396, "step": 7350 }, { "ce_loss": 0.26689279079437256, "epoch": 2.4516344229486324, "step": 7350 }, { "distill_loss": 0.4715449810028076, "epoch": 2.4516344229486324, "step": 7350 }, { "epoch": 2.4516344229486324, "ref_ce_loss": 0.18562690913677216, "step": 7350 }, { "epoch": 2.454969979986658, "loss": 1.0991, "step": 7360 }, { "epoch": 2.454969979986658, "grad_norm": 1.9149889945983887, "step": 7360 }, { "epoch": 2.454969979986658, "learning_rate": 0.0007064794947451753, "step": 7360 }, { "epoch": 2.454969979986658, "loss": 1.2051515579223633, "step": 7360 }, { "ce_loss": 0.3245408535003662, "epoch": 2.454969979986658, "step": 7360 }, { "distill_loss": 0.5593629479408264, "epoch": 2.454969979986658, "step": 7360 }, { "epoch": 2.454969979986658, "ref_ce_loss": 0.2596641480922699, "step": 7360 }, { "epoch": 2.454969979986658, "loss": 1.3105435371398926, "step": 7360 }, { "ce_loss": 0.32686343789100647, "epoch": 2.454969979986658, "step": 7360 }, { "distill_loss": 0.5083961486816406, "epoch": 2.454969979986658, "step": 7360 }, { "epoch": 2.454969979986658, "ref_ce_loss": 0.2100948542356491, "step": 7360 }, { "epoch": 2.458305537024683, "loss": 1.0501, "step": 7370 }, { "epoch": 2.458305537024683, "grad_norm": 2.0883407592773438, "step": 7370 }, { "epoch": 2.458305537024683, "learning_rate": 0.0007062016269539631, "step": 7370 }, { "epoch": 2.458305537024683, "loss": 0.8910499811172485, "step": 7370 }, { "ce_loss": 0.27967867255210876, "epoch": 2.458305537024683, "step": 7370 }, { "distill_loss": 0.45400354266166687, "epoch": 2.458305537024683, "step": 7370 }, { "epoch": 2.458305537024683, "ref_ce_loss": 0.12001747637987137, "step": 7370 }, { "epoch": 2.458305537024683, "loss": 1.0135114192962646, "step": 7370 }, { "ce_loss": 0.1810399889945984, "epoch": 2.458305537024683, "step": 7370 }, { "distill_loss": 0.3433760702610016, "epoch": 2.458305537024683, "step": 7370 }, { "epoch": 2.458305537024683, "ref_ce_loss": 0.2140178084373474, "step": 7370 }, { "epoch": 2.4616410940627085, "loss": 1.0993, "step": 7380 }, { "epoch": 2.4616410940627085, "grad_norm": 1.7108585834503174, "step": 7380 }, { "epoch": 2.4616410940627085, "learning_rate": 0.0007059234017925036, "step": 7380 }, { "epoch": 2.4616410940627085, "loss": 0.7910228371620178, "step": 7380 }, { "ce_loss": 0.23623459041118622, "epoch": 2.4616410940627085, "step": 7380 }, { "distill_loss": 0.2763459384441376, "epoch": 2.4616410940627085, "step": 7380 }, { "epoch": 2.4616410940627085, "ref_ce_loss": 0.21518924832344055, "step": 7380 }, { "epoch": 2.4616410940627085, "loss": 1.0838594436645508, "step": 7380 }, { "ce_loss": 0.20617178082466125, "epoch": 2.4616410940627085, "step": 7380 }, { "distill_loss": 0.3263876140117645, "epoch": 2.4616410940627085, "step": 7380 }, { "epoch": 2.4616410940627085, "ref_ce_loss": 0.11844362318515778, "step": 7380 }, { "epoch": 2.464976651100734, "loss": 1.0635, "step": 7390 }, { "epoch": 2.464976651100734, "grad_norm": 2.067502975463867, "step": 7390 }, { "epoch": 2.464976651100734, "learning_rate": 0.0007056448195855154, "step": 7390 }, { "epoch": 2.464976651100734, "loss": 1.2652387619018555, "step": 7390 }, { "ce_loss": 0.311957985162735, "epoch": 2.464976651100734, "step": 7390 }, { "distill_loss": 0.4279102087020874, "epoch": 2.464976651100734, "step": 7390 }, { "epoch": 2.464976651100734, "ref_ce_loss": 0.25051218271255493, "step": 7390 }, { "epoch": 2.464976651100734, "loss": 1.2960461378097534, "step": 7390 }, { "ce_loss": 0.3852989673614502, "epoch": 2.464976651100734, "step": 7390 }, { "distill_loss": 0.46075063943862915, "epoch": 2.464976651100734, "step": 7390 }, { "epoch": 2.464976651100734, "ref_ce_loss": 0.25648966431617737, "step": 7390 }, { "epoch": 2.468312208138759, "loss": 1.0858, "step": 7400 }, { "epoch": 2.468312208138759, "grad_norm": 2.067422866821289, "step": 7400 }, { "epoch": 2.468312208138759, "learning_rate": 0.0007053658806581341, "step": 7400 }, { "epoch": 2.468312208138759, "loss": 0.9738046526908875, "step": 7400 }, { "ce_loss": 0.3073391020298004, "epoch": 2.468312208138759, "step": 7400 }, { "distill_loss": 0.46066734194755554, "epoch": 2.468312208138759, "step": 7400 }, { "epoch": 2.468312208138759, "ref_ce_loss": 0.15834133327007294, "step": 7400 }, { "epoch": 2.468312208138759, "loss": 1.1532341241836548, "step": 7400 }, { "ce_loss": 0.2673509418964386, "epoch": 2.468312208138759, "step": 7400 }, { "distill_loss": 0.5188738703727722, "epoch": 2.468312208138759, "step": 7400 }, { "epoch": 2.468312208138759, "ref_ce_loss": 0.21087826788425446, "step": 7400 }, { "epoch": 2.4716477651767845, "loss": 1.1162, "step": 7410 }, { "epoch": 2.4716477651767845, "grad_norm": 1.6830294132232666, "step": 7410 }, { "epoch": 2.4716477651767845, "learning_rate": 0.0007050865853359113, "step": 7410 }, { "epoch": 2.4716477651767845, "loss": 0.8965054154396057, "step": 7410 }, { "ce_loss": 0.27338090538978577, "epoch": 2.4716477651767845, "step": 7410 }, { "distill_loss": 0.35894811153411865, "epoch": 2.4716477651767845, "step": 7410 }, { "epoch": 2.4716477651767845, "ref_ce_loss": 0.2638855278491974, "step": 7410 }, { "epoch": 2.4716477651767845, "loss": 0.9959196448326111, "step": 7410 }, { "ce_loss": 0.29337841272354126, "epoch": 2.4716477651767845, "step": 7410 }, { "distill_loss": 0.4339549243450165, "epoch": 2.4716477651767845, "step": 7410 }, { "epoch": 2.4716477651767845, "ref_ce_loss": 0.21029168367385864, "step": 7410 }, { "epoch": 2.47498332221481, "loss": 1.0628, "step": 7420 }, { "epoch": 2.47498332221481, "grad_norm": 1.4016879796981812, "step": 7420 }, { "epoch": 2.47498332221481, "learning_rate": 0.0007048069339448147, "step": 7420 }, { "epoch": 2.47498332221481, "loss": 1.0335075855255127, "step": 7420 }, { "ce_loss": 0.31529316306114197, "epoch": 2.47498332221481, "step": 7420 }, { "distill_loss": 0.4157218933105469, "epoch": 2.47498332221481, "step": 7420 }, { "epoch": 2.47498332221481, "ref_ce_loss": 0.24515531957149506, "step": 7420 }, { "epoch": 2.47498332221481, "loss": 1.2172638177871704, "step": 7420 }, { "ce_loss": 0.308971643447876, "epoch": 2.47498332221481, "step": 7420 }, { "distill_loss": 0.4950236678123474, "epoch": 2.47498332221481, "step": 7420 }, { "epoch": 2.47498332221481, "ref_ce_loss": 0.24021197855472565, "step": 7420 }, { "epoch": 2.4783188792528352, "loss": 1.0767, "step": 7430 }, { "epoch": 2.4783188792528352, "grad_norm": 1.9759247303009033, "step": 7430 }, { "epoch": 2.4783188792528352, "learning_rate": 0.0007045269268112277, "step": 7430 }, { "epoch": 2.4783188792528352, "loss": 1.916853427886963, "step": 7430 }, { "ce_loss": 0.2808642089366913, "epoch": 2.4783188792528352, "step": 7430 }, { "distill_loss": 0.3968324661254883, "epoch": 2.4783188792528352, "step": 7430 }, { "epoch": 2.4783188792528352, "ref_ce_loss": 0.23114171624183655, "step": 7430 }, { "epoch": 2.4783188792528352, "loss": 1.0604915618896484, "step": 7430 }, { "ce_loss": 0.2519376575946808, "epoch": 2.4783188792528352, "step": 7430 }, { "distill_loss": 0.3647528886795044, "epoch": 2.4783188792528352, "step": 7430 }, { "epoch": 2.4783188792528352, "ref_ce_loss": 0.19655263423919678, "step": 7430 }, { "epoch": 2.4816544362908606, "loss": 1.0813, "step": 7440 }, { "epoch": 2.4816544362908606, "grad_norm": 1.7491250038146973, "step": 7440 }, { "epoch": 2.4816544362908606, "learning_rate": 0.0007042465642619485, "step": 7440 }, { "epoch": 2.4816544362908606, "loss": 1.091002345085144, "step": 7440 }, { "ce_loss": 0.3512522578239441, "epoch": 2.4816544362908606, "step": 7440 }, { "distill_loss": 0.52333003282547, "epoch": 2.4816544362908606, "step": 7440 }, { "epoch": 2.4816544362908606, "ref_ce_loss": 0.21626801788806915, "step": 7440 }, { "epoch": 2.4816544362908606, "loss": 0.9965525269508362, "step": 7440 }, { "ce_loss": 0.24571192264556885, "epoch": 2.4816544362908606, "step": 7440 }, { "distill_loss": 0.44148796796798706, "epoch": 2.4816544362908606, "step": 7440 }, { "epoch": 2.4816544362908606, "ref_ce_loss": 0.2422831505537033, "step": 7440 }, { "epoch": 2.484989993328886, "loss": 1.0955, "step": 7450 }, { "epoch": 2.484989993328886, "grad_norm": 1.8003133535385132, "step": 7450 }, { "epoch": 2.484989993328886, "learning_rate": 0.0007039658466241906, "step": 7450 }, { "epoch": 2.484989993328886, "loss": 1.0173012018203735, "step": 7450 }, { "ce_loss": 0.26152217388153076, "epoch": 2.484989993328886, "step": 7450 }, { "distill_loss": 0.5212584733963013, "epoch": 2.484989993328886, "step": 7450 }, { "epoch": 2.484989993328886, "ref_ce_loss": 0.2341955155134201, "step": 7450 }, { "epoch": 2.484989993328886, "loss": 1.2713087797164917, "step": 7450 }, { "ce_loss": 0.2889062762260437, "epoch": 2.484989993328886, "step": 7450 }, { "distill_loss": 0.4098033607006073, "epoch": 2.484989993328886, "step": 7450 }, { "epoch": 2.484989993328886, "ref_ce_loss": 0.26270008087158203, "step": 7450 }, { "epoch": 2.4883255503669113, "loss": 1.0681, "step": 7460 }, { "epoch": 2.4883255503669113, "grad_norm": 2.4886579513549805, "step": 7460 }, { "epoch": 2.4883255503669113, "learning_rate": 0.0007036847742255818, "step": 7460 }, { "epoch": 2.4883255503669113, "loss": 1.3388160467147827, "step": 7460 }, { "ce_loss": 0.333164781332016, "epoch": 2.4883255503669113, "step": 7460 }, { "distill_loss": 0.5271449089050293, "epoch": 2.4883255503669113, "step": 7460 }, { "epoch": 2.4883255503669113, "ref_ce_loss": 0.1901921182870865, "step": 7460 }, { "epoch": 2.4883255503669113, "loss": 0.8606884479522705, "step": 7460 }, { "ce_loss": 0.27313685417175293, "epoch": 2.4883255503669113, "step": 7460 }, { "distill_loss": 0.414323627948761, "epoch": 2.4883255503669113, "step": 7460 }, { "epoch": 2.4883255503669113, "ref_ce_loss": 0.17308597266674042, "step": 7460 }, { "epoch": 2.4916611074049366, "loss": 1.1727, "step": 7470 }, { "epoch": 2.4916611074049366, "grad_norm": 1.5319260358810425, "step": 7470 }, { "epoch": 2.4916611074049366, "learning_rate": 0.0007034033473941634, "step": 7470 }, { "epoch": 2.4916611074049366, "loss": 0.8533339500427246, "step": 7470 }, { "ce_loss": 0.1954154670238495, "epoch": 2.4916611074049366, "step": 7470 }, { "distill_loss": 0.42147818207740784, "epoch": 2.4916611074049366, "step": 7470 }, { "epoch": 2.4916611074049366, "ref_ce_loss": 0.16750460863113403, "step": 7470 }, { "epoch": 2.4916611074049366, "loss": 0.9842551946640015, "step": 7470 }, { "ce_loss": 0.2575088143348694, "epoch": 2.4916611074049366, "step": 7470 }, { "distill_loss": 0.4689650535583496, "epoch": 2.4916611074049366, "step": 7470 }, { "epoch": 2.4916611074049366, "ref_ce_loss": 0.2151673138141632, "step": 7470 }, { "epoch": 2.494996664442962, "loss": 1.1905, "step": 7480 }, { "epoch": 2.494996664442962, "grad_norm": 2.0598387718200684, "step": 7480 }, { "epoch": 2.494996664442962, "learning_rate": 0.0007031215664583912, "step": 7480 }, { "epoch": 2.494996664442962, "loss": 1.0834827423095703, "step": 7480 }, { "ce_loss": 0.2805913984775543, "epoch": 2.494996664442962, "step": 7480 }, { "distill_loss": 0.5459102392196655, "epoch": 2.494996664442962, "step": 7480 }, { "epoch": 2.494996664442962, "ref_ce_loss": 0.18166399002075195, "step": 7480 }, { "epoch": 2.494996664442962, "loss": 0.8282067775726318, "step": 7480 }, { "ce_loss": 0.18217626214027405, "epoch": 2.494996664442962, "step": 7480 }, { "distill_loss": 0.3959864377975464, "epoch": 2.494996664442962, "step": 7480 }, { "epoch": 2.494996664442962, "ref_ce_loss": 0.18068751692771912, "step": 7480 }, { "epoch": 2.4983322214809873, "loss": 1.1075, "step": 7490 }, { "epoch": 2.4983322214809873, "grad_norm": 2.6838510036468506, "step": 7490 }, { "epoch": 2.4983322214809873, "learning_rate": 0.0007028394317471335, "step": 7490 }, { "epoch": 2.4983322214809873, "loss": 0.9380246996879578, "step": 7490 }, { "ce_loss": 0.268350750207901, "epoch": 2.4983322214809873, "step": 7490 }, { "distill_loss": 0.4618475139141083, "epoch": 2.4983322214809873, "step": 7490 }, { "epoch": 2.4983322214809873, "ref_ce_loss": 0.16416999697685242, "step": 7490 }, { "epoch": 2.4983322214809873, "loss": 1.450337529182434, "step": 7490 }, { "ce_loss": 0.3254244029521942, "epoch": 2.4983322214809873, "step": 7490 }, { "distill_loss": 0.6119667887687683, "epoch": 2.4983322214809873, "step": 7490 }, { "epoch": 2.4983322214809873, "ref_ce_loss": 0.226872518658638, "step": 7490 }, { "epoch": 2.5016677785190127, "loss": 1.0566, "step": 7500 }, { "epoch": 2.5016677785190127, "grad_norm": 1.2880253791809082, "step": 7500 }, { "epoch": 2.5016677785190127, "learning_rate": 0.0007025569435896722, "step": 7500 }, { "epoch": 2.5016677785190127, "loss": 1.4975539445877075, "step": 7500 }, { "ce_loss": 0.31754326820373535, "epoch": 2.5016677785190127, "step": 7500 }, { "distill_loss": 0.48850399255752563, "epoch": 2.5016677785190127, "step": 7500 }, { "epoch": 2.5016677785190127, "ref_ce_loss": 0.26213768124580383, "step": 7500 }, { "epoch": 2.5016677785190127, "loss": 0.9692498445510864, "step": 7500 }, { "ce_loss": 0.2891314625740051, "epoch": 2.5016677785190127, "step": 7500 }, { "distill_loss": 0.46991127729415894, "epoch": 2.5016677785190127, "step": 7500 }, { "epoch": 2.5016677785190127, "ref_ce_loss": 0.2100847214460373, "step": 7500 }, { "epoch": 2.505003335557038, "loss": 1.0924, "step": 7510 }, { "epoch": 2.505003335557038, "grad_norm": 1.5961883068084717, "step": 7510 }, { "epoch": 2.505003335557038, "learning_rate": 0.0007022741023157013, "step": 7510 }, { "epoch": 2.505003335557038, "loss": 1.0401496887207031, "step": 7510 }, { "ce_loss": 0.2791939675807953, "epoch": 2.505003335557038, "step": 7510 }, { "distill_loss": 0.3842151165008545, "epoch": 2.505003335557038, "step": 7510 }, { "epoch": 2.505003335557038, "ref_ce_loss": 0.21842031180858612, "step": 7510 }, { "epoch": 2.505003335557038, "loss": 1.0297224521636963, "step": 7510 }, { "ce_loss": 0.2569929361343384, "epoch": 2.505003335557038, "step": 7510 }, { "distill_loss": 0.39038562774658203, "epoch": 2.505003335557038, "step": 7510 }, { "epoch": 2.505003335557038, "ref_ce_loss": 0.22410175204277039, "step": 7510 }, { "epoch": 2.5083388925950634, "loss": 1.0764, "step": 7520 }, { "epoch": 2.5083388925950634, "grad_norm": 2.00757098197937, "step": 7520 }, { "epoch": 2.5083388925950634, "learning_rate": 0.000701990908255327, "step": 7520 }, { "epoch": 2.5083388925950634, "loss": 1.0620434284210205, "step": 7520 }, { "ce_loss": 0.22661221027374268, "epoch": 2.5083388925950634, "step": 7520 }, { "distill_loss": 0.48355352878570557, "epoch": 2.5083388925950634, "step": 7520 }, { "epoch": 2.5083388925950634, "ref_ce_loss": 0.22372646629810333, "step": 7520 }, { "epoch": 2.5083388925950634, "loss": 1.1504652500152588, "step": 7520 }, { "ce_loss": 0.3021428883075714, "epoch": 2.5083388925950634, "step": 7520 }, { "distill_loss": 0.4501044750213623, "epoch": 2.5083388925950634, "step": 7520 }, { "epoch": 2.5083388925950634, "ref_ce_loss": 0.22373048961162567, "step": 7520 }, { "epoch": 2.5116744496330887, "loss": 1.1038, "step": 7530 }, { "epoch": 2.5116744496330887, "grad_norm": 2.8594300746917725, "step": 7530 }, { "epoch": 2.5116744496330887, "learning_rate": 0.0007017073617390671, "step": 7530 }, { "epoch": 2.5116744496330887, "loss": 1.1574616432189941, "step": 7530 }, { "ce_loss": 0.26310059428215027, "epoch": 2.5116744496330887, "step": 7530 }, { "distill_loss": 0.5132943987846375, "epoch": 2.5116744496330887, "step": 7530 }, { "epoch": 2.5116744496330887, "ref_ce_loss": 0.18846635520458221, "step": 7530 }, { "epoch": 2.5116744496330887, "loss": 1.2592483758926392, "step": 7530 }, { "ce_loss": 0.2359173595905304, "epoch": 2.5116744496330887, "step": 7530 }, { "distill_loss": 0.4269721806049347, "epoch": 2.5116744496330887, "step": 7530 }, { "epoch": 2.5116744496330887, "ref_ce_loss": 0.21566621959209442, "step": 7530 }, { "epoch": 2.515010006671114, "loss": 1.1568, "step": 7540 }, { "epoch": 2.515010006671114, "grad_norm": 3.1892199516296387, "step": 7540 }, { "epoch": 2.515010006671114, "learning_rate": 0.000701423463097851, "step": 7540 }, { "epoch": 2.515010006671114, "loss": 1.0318684577941895, "step": 7540 }, { "ce_loss": 0.24232074618339539, "epoch": 2.515010006671114, "step": 7540 }, { "distill_loss": 0.4724856913089752, "epoch": 2.515010006671114, "step": 7540 }, { "epoch": 2.515010006671114, "ref_ce_loss": 0.19329185783863068, "step": 7540 }, { "epoch": 2.515010006671114, "loss": 1.1266546249389648, "step": 7540 }, { "ce_loss": 0.3473910987377167, "epoch": 2.515010006671114, "step": 7540 }, { "distill_loss": 0.4599970877170563, "epoch": 2.515010006671114, "step": 7540 }, { "epoch": 2.515010006671114, "ref_ce_loss": 0.1923563927412033, "step": 7540 }, { "epoch": 2.5183455637091394, "loss": 1.1127, "step": 7550 }, { "epoch": 2.5183455637091394, "grad_norm": 1.9296170473098755, "step": 7550 }, { "epoch": 2.5183455637091394, "learning_rate": 0.000701139212663019, "step": 7550 }, { "epoch": 2.5183455637091394, "loss": 0.8258950114250183, "step": 7550 }, { "ce_loss": 0.23694711923599243, "epoch": 2.5183455637091394, "step": 7550 }, { "distill_loss": 0.41021570563316345, "epoch": 2.5183455637091394, "step": 7550 }, { "epoch": 2.5183455637091394, "ref_ce_loss": 0.17734235525131226, "step": 7550 }, { "epoch": 2.5183455637091394, "loss": 1.1382707357406616, "step": 7550 }, { "ce_loss": 0.32674920558929443, "epoch": 2.5183455637091394, "step": 7550 }, { "distill_loss": 0.4254034161567688, "epoch": 2.5183455637091394, "step": 7550 }, { "epoch": 2.5183455637091394, "ref_ce_loss": 0.2471553236246109, "step": 7550 }, { "epoch": 2.5216811207471648, "loss": 1.0207, "step": 7560 }, { "epoch": 2.5216811207471648, "grad_norm": 1.584105372428894, "step": 7560 }, { "epoch": 2.5216811207471648, "learning_rate": 0.0007008546107663218, "step": 7560 }, { "epoch": 2.5216811207471648, "loss": 0.9069346785545349, "step": 7560 }, { "ce_loss": 0.20985406637191772, "epoch": 2.5216811207471648, "step": 7560 }, { "distill_loss": 0.3904266953468323, "epoch": 2.5216811207471648, "step": 7560 }, { "epoch": 2.5216811207471648, "ref_ce_loss": 0.18962639570236206, "step": 7560 }, { "epoch": 2.5216811207471648, "loss": 0.9962298274040222, "step": 7560 }, { "ce_loss": 0.2645713984966278, "epoch": 2.5216811207471648, "step": 7560 }, { "distill_loss": 0.46430978178977966, "epoch": 2.5216811207471648, "step": 7560 }, { "epoch": 2.5216811207471648, "ref_ce_loss": 0.21918721497058868, "step": 7560 }, { "epoch": 2.52501667778519, "loss": 1.0951, "step": 7570 }, { "epoch": 2.52501667778519, "grad_norm": 2.894544839859009, "step": 7570 }, { "epoch": 2.52501667778519, "learning_rate": 0.0007005696577399206, "step": 7570 }, { "epoch": 2.52501667778519, "loss": 0.918897271156311, "step": 7570 }, { "ce_loss": 0.22210124135017395, "epoch": 2.52501667778519, "step": 7570 }, { "distill_loss": 0.4796445965766907, "epoch": 2.52501667778519, "step": 7570 }, { "epoch": 2.52501667778519, "ref_ce_loss": 0.21641956269741058, "step": 7570 }, { "epoch": 2.52501667778519, "loss": 1.289629578590393, "step": 7570 }, { "ce_loss": 0.32594767212867737, "epoch": 2.52501667778519, "step": 7570 }, { "distill_loss": 0.5843258500099182, "epoch": 2.52501667778519, "step": 7570 }, { "epoch": 2.52501667778519, "ref_ce_loss": 0.2191724181175232, "step": 7570 }, { "epoch": 2.5283522348232155, "loss": 1.0894, "step": 7580 }, { "epoch": 2.5283522348232155, "grad_norm": 2.0715115070343018, "step": 7580 }, { "epoch": 2.5283522348232155, "learning_rate": 0.0007002843539163862, "step": 7580 }, { "epoch": 2.5283522348232155, "loss": 0.9046424627304077, "step": 7580 }, { "ce_loss": 0.1960611343383789, "epoch": 2.5283522348232155, "step": 7580 }, { "distill_loss": 0.4171680808067322, "epoch": 2.5283522348232155, "step": 7580 }, { "epoch": 2.5283522348232155, "ref_ce_loss": 0.20283107459545135, "step": 7580 }, { "epoch": 2.5283522348232155, "loss": 1.094390630722046, "step": 7580 }, { "ce_loss": 0.3450239598751068, "epoch": 2.5283522348232155, "step": 7580 }, { "distill_loss": 0.5300965309143066, "epoch": 2.5283522348232155, "step": 7580 }, { "epoch": 2.5283522348232155, "ref_ce_loss": 0.21903318166732788, "step": 7580 }, { "epoch": 2.531687791861241, "loss": 1.0695, "step": 7590 }, { "epoch": 2.531687791861241, "grad_norm": 1.8430460691452026, "step": 7590 }, { "epoch": 2.531687791861241, "learning_rate": 0.0006999986996286989, "step": 7590 }, { "epoch": 2.531687791861241, "loss": 0.8136076927185059, "step": 7590 }, { "ce_loss": 0.1908169835805893, "epoch": 2.531687791861241, "step": 7590 }, { "distill_loss": 0.37328973412513733, "epoch": 2.531687791861241, "step": 7590 }, { "epoch": 2.531687791861241, "ref_ce_loss": 0.17851682007312775, "step": 7590 }, { "epoch": 2.531687791861241, "loss": 0.9584760069847107, "step": 7590 }, { "ce_loss": 0.2861538827419281, "epoch": 2.531687791861241, "step": 7590 }, { "distill_loss": 0.40880483388900757, "epoch": 2.531687791861241, "step": 7590 }, { "epoch": 2.531687791861241, "ref_ce_loss": 0.20923297107219696, "step": 7590 }, { "epoch": 2.535023348899266, "loss": 1.0368, "step": 7600 }, { "epoch": 2.535023348899266, "grad_norm": 1.763928771018982, "step": 7600 }, { "epoch": 2.535023348899266, "learning_rate": 0.0006997126952102479, "step": 7600 }, { "epoch": 2.535023348899266, "loss": 1.0512466430664062, "step": 7600 }, { "ce_loss": 0.280617892742157, "epoch": 2.535023348899266, "step": 7600 }, { "distill_loss": 0.45977500081062317, "epoch": 2.535023348899266, "step": 7600 }, { "epoch": 2.535023348899266, "ref_ce_loss": 0.2450403869152069, "step": 7600 }, { "epoch": 2.535023348899266, "loss": 1.2413963079452515, "step": 7600 }, { "ce_loss": 0.3729633688926697, "epoch": 2.535023348899266, "step": 7600 }, { "distill_loss": 0.5388633012771606, "epoch": 2.535023348899266, "step": 7600 }, { "epoch": 2.535023348899266, "ref_ce_loss": 0.26855915784835815, "step": 7600 }, { "epoch": 2.5383589059372915, "loss": 1.1069, "step": 7610 }, { "epoch": 2.5383589059372915, "grad_norm": 1.5387122631072998, "step": 7610 }, { "epoch": 2.5383589059372915, "learning_rate": 0.0006994263409948312, "step": 7610 }, { "epoch": 2.5383589059372915, "loss": 1.035395622253418, "step": 7610 }, { "ce_loss": 0.2836330235004425, "epoch": 2.5383589059372915, "step": 7610 }, { "distill_loss": 0.4233282208442688, "epoch": 2.5383589059372915, "step": 7610 }, { "epoch": 2.5383589059372915, "ref_ce_loss": 0.20756769180297852, "step": 7610 }, { "epoch": 2.5383589059372915, "loss": 1.4395779371261597, "step": 7610 }, { "ce_loss": 0.32102641463279724, "epoch": 2.5383589059372915, "step": 7610 }, { "distill_loss": 0.562681257724762, "epoch": 2.5383589059372915, "step": 7610 }, { "epoch": 2.5383589059372915, "ref_ce_loss": 0.24834278225898743, "step": 7610 }, { "epoch": 2.541694462975317, "loss": 1.1417, "step": 7620 }, { "epoch": 2.541694462975317, "grad_norm": 1.9746698141098022, "step": 7620 }, { "epoch": 2.541694462975317, "learning_rate": 0.0006991396373166548, "step": 7620 }, { "epoch": 2.541694462975317, "loss": 1.1625916957855225, "step": 7620 }, { "ce_loss": 0.2999421954154968, "epoch": 2.541694462975317, "step": 7620 }, { "distill_loss": 0.42592406272888184, "epoch": 2.541694462975317, "step": 7620 }, { "epoch": 2.541694462975317, "ref_ce_loss": 0.24482430517673492, "step": 7620 }, { "epoch": 2.541694462975317, "loss": 0.8874329924583435, "step": 7620 }, { "ce_loss": 0.25415265560150146, "epoch": 2.541694462975317, "step": 7620 }, { "distill_loss": 0.3725735545158386, "epoch": 2.541694462975317, "step": 7620 }, { "epoch": 2.541694462975317, "ref_ce_loss": 0.2000926285982132, "step": 7620 }, { "epoch": 2.545030020013342, "loss": 1.0415, "step": 7630 }, { "epoch": 2.545030020013342, "grad_norm": 1.7550071477890015, "step": 7630 }, { "epoch": 2.545030020013342, "learning_rate": 0.0006988525845103331, "step": 7630 }, { "epoch": 2.545030020013342, "loss": 1.1274958848953247, "step": 7630 }, { "ce_loss": 0.22237154841423035, "epoch": 2.545030020013342, "step": 7630 }, { "distill_loss": 0.47276023030281067, "epoch": 2.545030020013342, "step": 7630 }, { "epoch": 2.545030020013342, "ref_ce_loss": 0.25588810443878174, "step": 7630 }, { "epoch": 2.545030020013342, "loss": 1.2401340007781982, "step": 7630 }, { "ce_loss": 0.2866066098213196, "epoch": 2.545030020013342, "step": 7630 }, { "distill_loss": 0.4623689651489258, "epoch": 2.545030020013342, "step": 7630 }, { "epoch": 2.545030020013342, "ref_ce_loss": 0.21370618045330048, "step": 7630 }, { "epoch": 2.5483655770513676, "loss": 1.1032, "step": 7640 }, { "epoch": 2.5483655770513676, "grad_norm": 2.0811660289764404, "step": 7640 }, { "epoch": 2.5483655770513676, "learning_rate": 0.0006985651829108872, "step": 7640 }, { "epoch": 2.5483655770513676, "loss": 1.0950477123260498, "step": 7640 }, { "ce_loss": 0.27928969264030457, "epoch": 2.5483655770513676, "step": 7640 }, { "distill_loss": 0.47992321848869324, "epoch": 2.5483655770513676, "step": 7640 }, { "epoch": 2.5483655770513676, "ref_ce_loss": 0.2042643129825592, "step": 7640 }, { "epoch": 2.5483655770513676, "loss": 0.9194262027740479, "step": 7640 }, { "ce_loss": 0.2476658970117569, "epoch": 2.5483655770513676, "step": 7640 }, { "distill_loss": 0.44567611813545227, "epoch": 2.5483655770513676, "step": 7640 }, { "epoch": 2.5483655770513676, "ref_ce_loss": 0.2102728933095932, "step": 7640 }, { "epoch": 2.551701134089393, "loss": 1.1073, "step": 7650 }, { "epoch": 2.551701134089393, "grad_norm": 1.6015641689300537, "step": 7650 }, { "epoch": 2.551701134089393, "learning_rate": 0.000698277432853746, "step": 7650 }, { "epoch": 2.551701134089393, "loss": 0.8000747561454773, "step": 7650 }, { "ce_loss": 0.2432447075843811, "epoch": 2.551701134089393, "step": 7650 }, { "distill_loss": 0.35264867544174194, "epoch": 2.551701134089393, "step": 7650 }, { "epoch": 2.551701134089393, "ref_ce_loss": 0.20382060110569, "step": 7650 }, { "epoch": 2.551701134089393, "loss": 1.2146551609039307, "step": 7650 }, { "ce_loss": 0.3808715343475342, "epoch": 2.551701134089393, "step": 7650 }, { "distill_loss": 0.5072845220565796, "epoch": 2.551701134089393, "step": 7650 }, { "epoch": 2.551701134089393, "ref_ce_loss": 0.2642064094543457, "step": 7650 }, { "epoch": 2.5550366911274183, "loss": 1.0363, "step": 7660 }, { "epoch": 2.5550366911274183, "grad_norm": 1.6908466815948486, "step": 7660 }, { "epoch": 2.5550366911274183, "learning_rate": 0.0006979893346747447, "step": 7660 }, { "epoch": 2.5550366911274183, "loss": 0.9030312895774841, "step": 7660 }, { "ce_loss": 0.2747442424297333, "epoch": 2.5550366911274183, "step": 7660 }, { "distill_loss": 0.3266064524650574, "epoch": 2.5550366911274183, "step": 7660 }, { "epoch": 2.5550366911274183, "ref_ce_loss": 0.23646491765975952, "step": 7660 }, { "epoch": 2.5550366911274183, "loss": 0.9936516284942627, "step": 7660 }, { "ce_loss": 0.28957706689834595, "epoch": 2.5550366911274183, "step": 7660 }, { "distill_loss": 0.3737667500972748, "epoch": 2.5550366911274183, "step": 7660 }, { "epoch": 2.5550366911274183, "ref_ce_loss": 0.21820056438446045, "step": 7660 }, { "epoch": 2.5583722481654436, "loss": 1.0447, "step": 7670 }, { "epoch": 2.5583722481654436, "grad_norm": 2.1870462894439697, "step": 7670 }, { "epoch": 2.5583722481654436, "learning_rate": 0.0006977008887101248, "step": 7670 }, { "epoch": 2.5583722481654436, "loss": 1.0413737297058105, "step": 7670 }, { "ce_loss": 0.2267252653837204, "epoch": 2.5583722481654436, "step": 7670 }, { "distill_loss": 0.4250425100326538, "epoch": 2.5583722481654436, "step": 7670 }, { "epoch": 2.5583722481654436, "ref_ce_loss": 0.1708996295928955, "step": 7670 }, { "epoch": 2.5583722481654436, "loss": 1.0603262186050415, "step": 7670 }, { "ce_loss": 0.21380625665187836, "epoch": 2.5583722481654436, "step": 7670 }, { "distill_loss": 0.47527724504470825, "epoch": 2.5583722481654436, "step": 7670 }, { "epoch": 2.5583722481654436, "ref_ce_loss": 0.16205209493637085, "step": 7670 }, { "epoch": 2.561707805203469, "loss": 1.093, "step": 7680 }, { "epoch": 2.561707805203469, "grad_norm": 1.7474945783615112, "step": 7680 }, { "epoch": 2.561707805203469, "learning_rate": 0.000697412095296534, "step": 7680 }, { "epoch": 2.561707805203469, "loss": 0.9549005627632141, "step": 7680 }, { "ce_loss": 0.2424485683441162, "epoch": 2.561707805203469, "step": 7680 }, { "distill_loss": 0.4325232207775116, "epoch": 2.561707805203469, "step": 7680 }, { "epoch": 2.561707805203469, "ref_ce_loss": 0.2201852947473526, "step": 7680 }, { "epoch": 2.561707805203469, "loss": 1.0097670555114746, "step": 7680 }, { "ce_loss": 0.269885778427124, "epoch": 2.561707805203469, "step": 7680 }, { "distill_loss": 0.40437909960746765, "epoch": 2.561707805203469, "step": 7680 }, { "epoch": 2.561707805203469, "ref_ce_loss": 0.2088843733072281, "step": 7680 }, { "epoch": 2.5650433622414943, "loss": 1.0571, "step": 7690 }, { "epoch": 2.5650433622414943, "grad_norm": 2.7915446758270264, "step": 7690 }, { "epoch": 2.5650433622414943, "learning_rate": 0.0006971229547710249, "step": 7690 }, { "epoch": 2.5650433622414943, "loss": 0.9924904108047485, "step": 7690 }, { "ce_loss": 0.2706010043621063, "epoch": 2.5650433622414943, "step": 7690 }, { "distill_loss": 0.46180590987205505, "epoch": 2.5650433622414943, "step": 7690 }, { "epoch": 2.5650433622414943, "ref_ce_loss": 0.18765035271644592, "step": 7690 }, { "epoch": 2.5650433622414943, "loss": 1.382493257522583, "step": 7690 }, { "ce_loss": 0.38130655884742737, "epoch": 2.5650433622414943, "step": 7690 }, { "distill_loss": 0.38000595569610596, "epoch": 2.5650433622414943, "step": 7690 }, { "epoch": 2.5650433622414943, "ref_ce_loss": 0.24802355468273163, "step": 7690 }, { "epoch": 2.5683789192795197, "loss": 1.0481, "step": 7700 }, { "epoch": 2.5683789192795197, "grad_norm": 2.279045343399048, "step": 7700 }, { "epoch": 2.5683789192795197, "learning_rate": 0.000696833467471056, "step": 7700 }, { "epoch": 2.5683789192795197, "loss": 1.0659339427947998, "step": 7700 }, { "ce_loss": 0.2872184216976166, "epoch": 2.5683789192795197, "step": 7700 }, { "distill_loss": 0.43056899309158325, "epoch": 2.5683789192795197, "step": 7700 }, { "epoch": 2.5683789192795197, "ref_ce_loss": 0.2018534541130066, "step": 7700 }, { "epoch": 2.5683789192795197, "loss": 1.2552160024642944, "step": 7700 }, { "ce_loss": 0.33749714493751526, "epoch": 2.5683789192795197, "step": 7700 }, { "distill_loss": 0.5229628682136536, "epoch": 2.5683789192795197, "step": 7700 }, { "epoch": 2.5683789192795197, "ref_ce_loss": 0.2517550587654114, "step": 7700 }, { "epoch": 2.571714476317545, "loss": 1.1328, "step": 7710 }, { "epoch": 2.571714476317545, "grad_norm": 2.3299286365509033, "step": 7710 }, { "epoch": 2.571714476317545, "learning_rate": 0.0006965436337344899, "step": 7710 }, { "epoch": 2.571714476317545, "loss": 1.0527980327606201, "step": 7710 }, { "ce_loss": 0.27170753479003906, "epoch": 2.571714476317545, "step": 7710 }, { "distill_loss": 0.43882137537002563, "epoch": 2.571714476317545, "step": 7710 }, { "epoch": 2.571714476317545, "ref_ce_loss": 0.2398902326822281, "step": 7710 }, { "epoch": 2.571714476317545, "loss": 1.035448431968689, "step": 7710 }, { "ce_loss": 0.2794928252696991, "epoch": 2.571714476317545, "step": 7710 }, { "distill_loss": 0.4769744873046875, "epoch": 2.571714476317545, "step": 7710 }, { "epoch": 2.571714476317545, "ref_ce_loss": 0.23001840710639954, "step": 7710 }, { "epoch": 2.5750500333555704, "loss": 1.0902, "step": 7720 }, { "epoch": 2.5750500333555704, "grad_norm": 2.4687540531158447, "step": 7720 }, { "epoch": 2.5750500333555704, "learning_rate": 0.0006962534538995938, "step": 7720 }, { "epoch": 2.5750500333555704, "loss": 1.28634774684906, "step": 7720 }, { "ce_loss": 0.3173252046108246, "epoch": 2.5750500333555704, "step": 7720 }, { "distill_loss": 0.49686121940612793, "epoch": 2.5750500333555704, "step": 7720 }, { "epoch": 2.5750500333555704, "ref_ce_loss": 0.22386427223682404, "step": 7720 }, { "epoch": 2.5750500333555704, "loss": 0.9877050518989563, "step": 7720 }, { "ce_loss": 0.2048882693052292, "epoch": 2.5750500333555704, "step": 7720 }, { "distill_loss": 0.43360722064971924, "epoch": 2.5750500333555704, "step": 7720 }, { "epoch": 2.5750500333555704, "ref_ce_loss": 0.21809960901737213, "step": 7720 }, { "epoch": 2.5783855903935957, "loss": 1.0722, "step": 7730 }, { "epoch": 2.5783855903935957, "grad_norm": 1.9223451614379883, "step": 7730 }, { "epoch": 2.5783855903935957, "learning_rate": 0.0006959629283050388, "step": 7730 }, { "epoch": 2.5783855903935957, "loss": 0.8922337889671326, "step": 7730 }, { "ce_loss": 0.25825366377830505, "epoch": 2.5783855903935957, "step": 7730 }, { "distill_loss": 0.3473392426967621, "epoch": 2.5783855903935957, "step": 7730 }, { "epoch": 2.5783855903935957, "ref_ce_loss": 0.22594837844371796, "step": 7730 }, { "epoch": 2.5783855903935957, "loss": 0.8970277309417725, "step": 7730 }, { "ce_loss": 0.2750245928764343, "epoch": 2.5783855903935957, "step": 7730 }, { "distill_loss": 0.3688482642173767, "epoch": 2.5783855903935957, "step": 7730 }, { "epoch": 2.5783855903935957, "ref_ce_loss": 0.20284493267536163, "step": 7730 }, { "epoch": 2.581721147431621, "loss": 1.0514, "step": 7740 }, { "epoch": 2.581721147431621, "grad_norm": 2.207745313644409, "step": 7740 }, { "epoch": 2.581721147431621, "learning_rate": 0.0006956720572898995, "step": 7740 }, { "epoch": 2.581721147431621, "loss": 1.0820971727371216, "step": 7740 }, { "ce_loss": 0.30136752128601074, "epoch": 2.581721147431621, "step": 7740 }, { "distill_loss": 0.34338176250457764, "epoch": 2.581721147431621, "step": 7740 }, { "epoch": 2.581721147431621, "ref_ce_loss": 0.2206101268529892, "step": 7740 }, { "epoch": 2.581721147431621, "loss": 0.9287523627281189, "step": 7740 }, { "ce_loss": 0.27432000637054443, "epoch": 2.581721147431621, "step": 7740 }, { "distill_loss": 0.3677484691143036, "epoch": 2.581721147431621, "step": 7740 }, { "epoch": 2.581721147431621, "ref_ce_loss": 0.23579438030719757, "step": 7740 }, { "epoch": 2.5850567044696464, "loss": 1.0598, "step": 7750 }, { "epoch": 2.5850567044696464, "grad_norm": 1.980271577835083, "step": 7750 }, { "epoch": 2.5850567044696464, "learning_rate": 0.0006953808411936538, "step": 7750 }, { "epoch": 2.5850567044696464, "loss": 1.5159064531326294, "step": 7750 }, { "ce_loss": 0.31128838658332825, "epoch": 2.5850567044696464, "step": 7750 }, { "distill_loss": 0.5279540419578552, "epoch": 2.5850567044696464, "step": 7750 }, { "epoch": 2.5850567044696464, "ref_ce_loss": 0.24524328112602234, "step": 7750 }, { "epoch": 2.5850567044696464, "loss": 1.2730480432510376, "step": 7750 }, { "ce_loss": 0.2881583869457245, "epoch": 2.5850567044696464, "step": 7750 }, { "distill_loss": 0.4585064649581909, "epoch": 2.5850567044696464, "step": 7750 }, { "epoch": 2.5850567044696464, "ref_ce_loss": 0.20309162139892578, "step": 7750 }, { "epoch": 2.5883922615076718, "loss": 1.1971, "step": 7760 }, { "epoch": 2.5883922615076718, "grad_norm": 1.7246359586715698, "step": 7760 }, { "epoch": 2.5883922615076718, "learning_rate": 0.0006950892803561821, "step": 7760 }, { "epoch": 2.5883922615076718, "loss": 1.2225000858306885, "step": 7760 }, { "ce_loss": 0.2808440029621124, "epoch": 2.5883922615076718, "step": 7760 }, { "distill_loss": 0.4612267017364502, "epoch": 2.5883922615076718, "step": 7760 }, { "epoch": 2.5883922615076718, "ref_ce_loss": 0.20239432156085968, "step": 7760 }, { "epoch": 2.5883922615076718, "loss": 0.9158413410186768, "step": 7760 }, { "ce_loss": 0.26169925928115845, "epoch": 2.5883922615076718, "step": 7760 }, { "distill_loss": 0.42524948716163635, "epoch": 2.5883922615076718, "step": 7760 }, { "epoch": 2.5883922615076718, "ref_ce_loss": 0.16362762451171875, "step": 7760 }, { "epoch": 2.591727818545697, "loss": 1.1191, "step": 7770 }, { "epoch": 2.591727818545697, "grad_norm": 4.186878204345703, "step": 7770 }, { "epoch": 2.591727818545697, "learning_rate": 0.0006947973751177674, "step": 7770 }, { "epoch": 2.591727818545697, "loss": 0.8495794534683228, "step": 7770 }, { "ce_loss": 0.23491553962230682, "epoch": 2.591727818545697, "step": 7770 }, { "distill_loss": 0.39159250259399414, "epoch": 2.591727818545697, "step": 7770 }, { "epoch": 2.591727818545697, "ref_ce_loss": 0.22222238779067993, "step": 7770 }, { "epoch": 2.591727818545697, "loss": 0.8863764405250549, "step": 7770 }, { "ce_loss": 0.24199603497982025, "epoch": 2.591727818545697, "step": 7770 }, { "distill_loss": 0.461648166179657, "epoch": 2.591727818545697, "step": 7770 }, { "epoch": 2.591727818545697, "ref_ce_loss": 0.18189716339111328, "step": 7770 }, { "epoch": 2.5950633755837225, "loss": 1.053, "step": 7780 }, { "epoch": 2.5950633755837225, "grad_norm": 2.2770771980285645, "step": 7780 }, { "epoch": 2.5950633755837225, "learning_rate": 0.0006945051258190942, "step": 7780 }, { "epoch": 2.5950633755837225, "loss": 1.204896092414856, "step": 7780 }, { "ce_loss": 0.23836398124694824, "epoch": 2.5950633755837225, "step": 7780 }, { "distill_loss": 0.36417123675346375, "epoch": 2.5950633755837225, "step": 7780 }, { "epoch": 2.5950633755837225, "ref_ce_loss": 0.23337742686271667, "step": 7780 }, { "epoch": 2.5950633755837225, "loss": 0.9068416953086853, "step": 7780 }, { "ce_loss": 0.2626253664493561, "epoch": 2.5950633755837225, "step": 7780 }, { "distill_loss": 0.3526120185852051, "epoch": 2.5950633755837225, "step": 7780 }, { "epoch": 2.5950633755837225, "ref_ce_loss": 0.19078634679317474, "step": 7780 }, { "epoch": 2.598398932621748, "loss": 1.066, "step": 7790 }, { "epoch": 2.598398932621748, "grad_norm": 1.2925469875335693, "step": 7790 }, { "epoch": 2.598398932621748, "learning_rate": 0.0006942125328012493, "step": 7790 }, { "epoch": 2.598398932621748, "loss": 1.0056893825531006, "step": 7790 }, { "ce_loss": 0.2945784032344818, "epoch": 2.598398932621748, "step": 7790 }, { "distill_loss": 0.47337833046913147, "epoch": 2.598398932621748, "step": 7790 }, { "epoch": 2.598398932621748, "ref_ce_loss": 0.1684665083885193, "step": 7790 }, { "epoch": 2.598398932621748, "loss": 1.032659888267517, "step": 7790 }, { "ce_loss": 0.3183746635913849, "epoch": 2.598398932621748, "step": 7790 }, { "distill_loss": 0.4784265458583832, "epoch": 2.598398932621748, "step": 7790 }, { "epoch": 2.598398932621748, "ref_ce_loss": 0.192928284406662, "step": 7790 }, { "epoch": 2.601734489659773, "loss": 1.1247, "step": 7800 }, { "epoch": 2.601734489659773, "grad_norm": 2.7289175987243652, "step": 7800 }, { "epoch": 2.601734489659773, "learning_rate": 0.0006939195964057199, "step": 7800 }, { "epoch": 2.601734489659773, "loss": 1.3446162939071655, "step": 7800 }, { "ce_loss": 0.31287020444869995, "epoch": 2.601734489659773, "step": 7800 }, { "distill_loss": 0.43550536036491394, "epoch": 2.601734489659773, "step": 7800 }, { "epoch": 2.601734489659773, "ref_ce_loss": 0.1966378092765808, "step": 7800 }, { "epoch": 2.601734489659773, "loss": 1.102492332458496, "step": 7800 }, { "ce_loss": 0.295215368270874, "epoch": 2.601734489659773, "step": 7800 }, { "distill_loss": 0.4144419729709625, "epoch": 2.601734489659773, "step": 7800 }, { "epoch": 2.601734489659773, "ref_ce_loss": 0.22681808471679688, "step": 7800 }, { "epoch": 2.6050700466977985, "loss": 1.0734, "step": 7810 }, { "epoch": 2.6050700466977985, "grad_norm": 1.937639594078064, "step": 7810 }, { "epoch": 2.6050700466977985, "learning_rate": 0.0006936263169743946, "step": 7810 }, { "epoch": 2.6050700466977985, "loss": 1.1655282974243164, "step": 7810 }, { "ce_loss": 0.3544881343841553, "epoch": 2.6050700466977985, "step": 7810 }, { "distill_loss": 0.5030702352523804, "epoch": 2.6050700466977985, "step": 7810 }, { "epoch": 2.6050700466977985, "ref_ce_loss": 0.26013660430908203, "step": 7810 }, { "epoch": 2.6050700466977985, "loss": 0.8801066875457764, "step": 7810 }, { "ce_loss": 0.24790768325328827, "epoch": 2.6050700466977985, "step": 7810 }, { "distill_loss": 0.42721542716026306, "epoch": 2.6050700466977985, "step": 7810 }, { "epoch": 2.6050700466977985, "ref_ce_loss": 0.1820715367794037, "step": 7810 }, { "epoch": 2.608405603735824, "loss": 1.0556, "step": 7820 }, { "epoch": 2.608405603735824, "grad_norm": 2.0428402423858643, "step": 7820 }, { "epoch": 2.608405603735824, "learning_rate": 0.0006933326948495617, "step": 7820 }, { "epoch": 2.608405603735824, "loss": 1.201572060585022, "step": 7820 }, { "ce_loss": 0.2293986827135086, "epoch": 2.608405603735824, "step": 7820 }, { "distill_loss": 0.44717761874198914, "epoch": 2.608405603735824, "step": 7820 }, { "epoch": 2.608405603735824, "ref_ce_loss": 0.21371591091156006, "step": 7820 }, { "epoch": 2.608405603735824, "loss": 0.9682687520980835, "step": 7820 }, { "ce_loss": 0.3088095784187317, "epoch": 2.608405603735824, "step": 7820 }, { "distill_loss": 0.38450387120246887, "epoch": 2.608405603735824, "step": 7820 }, { "epoch": 2.608405603735824, "ref_ce_loss": 0.21013160049915314, "step": 7820 }, { "epoch": 2.611741160773849, "loss": 1.0846, "step": 7830 }, { "epoch": 2.611741160773849, "grad_norm": 2.4795031547546387, "step": 7830 }, { "epoch": 2.611741160773849, "learning_rate": 0.0006930387303739101, "step": 7830 }, { "epoch": 2.611741160773849, "loss": 0.9328702688217163, "step": 7830 }, { "ce_loss": 0.29024484753608704, "epoch": 2.611741160773849, "step": 7830 }, { "distill_loss": 0.3580683171749115, "epoch": 2.611741160773849, "step": 7830 }, { "epoch": 2.611741160773849, "ref_ce_loss": 0.2304927408695221, "step": 7830 }, { "epoch": 2.611741160773849, "loss": 0.9353972673416138, "step": 7830 }, { "ce_loss": 0.23102283477783203, "epoch": 2.611741160773849, "step": 7830 }, { "distill_loss": 0.4216915965080261, "epoch": 2.611741160773849, "step": 7830 }, { "epoch": 2.611741160773849, "ref_ce_loss": 0.21496886014938354, "step": 7830 }, { "epoch": 2.6150767178118746, "loss": 1.022, "step": 7840 }, { "epoch": 2.6150767178118746, "grad_norm": 2.2489166259765625, "step": 7840 }, { "epoch": 2.6150767178118746, "learning_rate": 0.000692744423890528, "step": 7840 }, { "epoch": 2.6150767178118746, "loss": 1.247374176979065, "step": 7840 }, { "ce_loss": 0.350018173456192, "epoch": 2.6150767178118746, "step": 7840 }, { "distill_loss": 0.5923084616661072, "epoch": 2.6150767178118746, "step": 7840 }, { "epoch": 2.6150767178118746, "ref_ce_loss": 0.24629969894886017, "step": 7840 }, { "epoch": 2.6150767178118746, "loss": 0.8726778626441956, "step": 7840 }, { "ce_loss": 0.18625923991203308, "epoch": 2.6150767178118746, "step": 7840 }, { "distill_loss": 0.46877485513687134, "epoch": 2.6150767178118746, "step": 7840 }, { "epoch": 2.6150767178118746, "ref_ce_loss": 0.21734049916267395, "step": 7840 }, { "epoch": 2.6184122748499, "loss": 1.1391, "step": 7850 }, { "epoch": 2.6184122748499, "grad_norm": 2.0969858169555664, "step": 7850 }, { "epoch": 2.6184122748499, "learning_rate": 0.0006924497757429026, "step": 7850 }, { "epoch": 2.6184122748499, "loss": 0.7978283762931824, "step": 7850 }, { "ce_loss": 0.21526743471622467, "epoch": 2.6184122748499, "step": 7850 }, { "distill_loss": 0.3887978792190552, "epoch": 2.6184122748499, "step": 7850 }, { "epoch": 2.6184122748499, "ref_ce_loss": 0.19217370450496674, "step": 7850 }, { "epoch": 2.6184122748499, "loss": 1.0249779224395752, "step": 7850 }, { "ce_loss": 0.26744702458381653, "epoch": 2.6184122748499, "step": 7850 }, { "distill_loss": 0.3994236886501312, "epoch": 2.6184122748499, "step": 7850 }, { "epoch": 2.6184122748499, "ref_ce_loss": 0.2308528572320938, "step": 7850 }, { "epoch": 2.6217478318879253, "loss": 1.0029, "step": 7860 }, { "epoch": 2.6217478318879253, "grad_norm": 1.8193413019180298, "step": 7860 }, { "epoch": 2.6217478318879253, "learning_rate": 0.0006921547862749198, "step": 7860 }, { "epoch": 2.6217478318879253, "loss": 0.8859058022499084, "step": 7860 }, { "ce_loss": 0.24655620753765106, "epoch": 2.6217478318879253, "step": 7860 }, { "distill_loss": 0.398909330368042, "epoch": 2.6217478318879253, "step": 7860 }, { "epoch": 2.6217478318879253, "ref_ce_loss": 0.18014656007289886, "step": 7860 }, { "epoch": 2.6217478318879253, "loss": 1.1356027126312256, "step": 7860 }, { "ce_loss": 0.36278200149536133, "epoch": 2.6217478318879253, "step": 7860 }, { "distill_loss": 0.48611417412757874, "epoch": 2.6217478318879253, "step": 7860 }, { "epoch": 2.6217478318879253, "ref_ce_loss": 0.2426128387451172, "step": 7860 }, { "epoch": 2.6250833889259506, "loss": 1.0975, "step": 7870 }, { "epoch": 2.6250833889259506, "grad_norm": 2.170436143875122, "step": 7870 }, { "epoch": 2.6250833889259506, "learning_rate": 0.0006918594558308643, "step": 7870 }, { "epoch": 2.6250833889259506, "loss": 0.9275552034378052, "step": 7870 }, { "ce_loss": 0.2591184377670288, "epoch": 2.6250833889259506, "step": 7870 }, { "distill_loss": 0.38452085852622986, "epoch": 2.6250833889259506, "step": 7870 }, { "epoch": 2.6250833889259506, "ref_ce_loss": 0.1751992255449295, "step": 7870 }, { "epoch": 2.6250833889259506, "loss": 0.9300224184989929, "step": 7870 }, { "ce_loss": 0.26695799827575684, "epoch": 2.6250833889259506, "step": 7870 }, { "distill_loss": 0.41706058382987976, "epoch": 2.6250833889259506, "step": 7870 }, { "epoch": 2.6250833889259506, "ref_ce_loss": 0.20200999081134796, "step": 7870 }, { "epoch": 2.628418945963976, "loss": 1.2211, "step": 7880 }, { "epoch": 2.628418945963976, "grad_norm": 2.2208914756774902, "step": 7880 }, { "epoch": 2.628418945963976, "learning_rate": 0.0006915637847554186, "step": 7880 }, { "epoch": 2.628418945963976, "loss": 1.0086395740509033, "step": 7880 }, { "ce_loss": 0.3185664713382721, "epoch": 2.628418945963976, "step": 7880 }, { "distill_loss": 0.4550648331642151, "epoch": 2.628418945963976, "step": 7880 }, { "epoch": 2.628418945963976, "ref_ce_loss": 0.18952278792858124, "step": 7880 }, { "epoch": 2.628418945963976, "loss": 0.9416465759277344, "step": 7880 }, { "ce_loss": 0.2340216040611267, "epoch": 2.628418945963976, "step": 7880 }, { "distill_loss": 0.46229973435401917, "epoch": 2.628418945963976, "step": 7880 }, { "epoch": 2.628418945963976, "ref_ce_loss": 0.16806453466415405, "step": 7880 }, { "epoch": 2.6317545030020013, "loss": 1.1962, "step": 7890 }, { "epoch": 2.6317545030020013, "grad_norm": 2.458528518676758, "step": 7890 }, { "epoch": 2.6317545030020013, "learning_rate": 0.0006912677733936626, "step": 7890 }, { "epoch": 2.6317545030020013, "loss": 1.2581466436386108, "step": 7890 }, { "ce_loss": 0.3371962904930115, "epoch": 2.6317545030020013, "step": 7890 }, { "distill_loss": 0.49287542700767517, "epoch": 2.6317545030020013, "step": 7890 }, { "epoch": 2.6317545030020013, "ref_ce_loss": 0.2400340884923935, "step": 7890 }, { "epoch": 2.6317545030020013, "loss": 0.9736883044242859, "step": 7890 }, { "ce_loss": 0.26119476556777954, "epoch": 2.6317545030020013, "step": 7890 }, { "distill_loss": 0.39335036277770996, "epoch": 2.6317545030020013, "step": 7890 }, { "epoch": 2.6317545030020013, "ref_ce_loss": 0.229218989610672, "step": 7890 }, { "epoch": 2.6350900600400267, "loss": 1.1243, "step": 7900 }, { "epoch": 2.6350900600400267, "grad_norm": 1.9649882316589355, "step": 7900 }, { "epoch": 2.6350900600400267, "learning_rate": 0.0006909714220910731, "step": 7900 }, { "epoch": 2.6350900600400267, "loss": 1.0200287103652954, "step": 7900 }, { "ce_loss": 0.3254638612270355, "epoch": 2.6350900600400267, "step": 7900 }, { "distill_loss": 0.41585683822631836, "epoch": 2.6350900600400267, "step": 7900 }, { "epoch": 2.6350900600400267, "ref_ce_loss": 0.20375323295593262, "step": 7900 }, { "epoch": 2.6350900600400267, "loss": 1.4603826999664307, "step": 7900 }, { "ce_loss": 0.33990997076034546, "epoch": 2.6350900600400267, "step": 7900 }, { "distill_loss": 0.3670250177383423, "epoch": 2.6350900600400267, "step": 7900 }, { "epoch": 2.6350900600400267, "ref_ce_loss": 0.2939133048057556, "step": 7900 }, { "epoch": 2.638425617078052, "loss": 1.062, "step": 7910 }, { "epoch": 2.638425617078052, "grad_norm": 3.770718574523926, "step": 7910 }, { "epoch": 2.638425617078052, "learning_rate": 0.0006906747311935243, "step": 7910 }, { "epoch": 2.638425617078052, "loss": 0.9048759341239929, "step": 7910 }, { "ce_loss": 0.21760985255241394, "epoch": 2.638425617078052, "step": 7910 }, { "distill_loss": 0.43006569147109985, "epoch": 2.638425617078052, "step": 7910 }, { "epoch": 2.638425617078052, "ref_ce_loss": 0.1983773112297058, "step": 7910 }, { "epoch": 2.638425617078052, "loss": 0.9832034707069397, "step": 7910 }, { "ce_loss": 0.2726140022277832, "epoch": 2.638425617078052, "step": 7910 }, { "distill_loss": 0.4162827432155609, "epoch": 2.638425617078052, "step": 7910 }, { "epoch": 2.638425617078052, "ref_ce_loss": 0.21636536717414856, "step": 7910 }, { "epoch": 2.6417611741160774, "loss": 1.095, "step": 7920 }, { "epoch": 2.6417611741160774, "grad_norm": 2.420430898666382, "step": 7920 }, { "epoch": 2.6417611741160774, "learning_rate": 0.0006903777010472864, "step": 7920 }, { "epoch": 2.6417611741160774, "loss": 1.0512824058532715, "step": 7920 }, { "ce_loss": 0.3101441264152527, "epoch": 2.6417611741160774, "step": 7920 }, { "distill_loss": 0.4893198311328888, "epoch": 2.6417611741160774, "step": 7920 }, { "epoch": 2.6417611741160774, "ref_ce_loss": 0.20389099419116974, "step": 7920 }, { "epoch": 2.6417611741160774, "loss": 1.038252592086792, "step": 7920 }, { "ce_loss": 0.20743471384048462, "epoch": 2.6417611741160774, "step": 7920 }, { "distill_loss": 0.43021488189697266, "epoch": 2.6417611741160774, "step": 7920 }, { "epoch": 2.6417611741160774, "ref_ce_loss": 0.21906831860542297, "step": 7920 }, { "epoch": 2.6450967311541027, "loss": 1.1114, "step": 7930 }, { "epoch": 2.6450967311541027, "grad_norm": 1.645699143409729, "step": 7930 }, { "epoch": 2.6450967311541027, "learning_rate": 0.0006900803319990253, "step": 7930 }, { "epoch": 2.6450967311541027, "loss": 0.9869709610939026, "step": 7930 }, { "ce_loss": 0.274577796459198, "epoch": 2.6450967311541027, "step": 7930 }, { "distill_loss": 0.408801406621933, "epoch": 2.6450967311541027, "step": 7930 }, { "epoch": 2.6450967311541027, "ref_ce_loss": 0.26362085342407227, "step": 7930 }, { "epoch": 2.6450967311541027, "loss": 1.5112707614898682, "step": 7930 }, { "ce_loss": 0.34168630838394165, "epoch": 2.6450967311541027, "step": 7930 }, { "distill_loss": 0.6311824321746826, "epoch": 2.6450967311541027, "step": 7930 }, { "epoch": 2.6450967311541027, "ref_ce_loss": 0.22010086476802826, "step": 7930 }, { "epoch": 2.648432288192128, "loss": 1.1468, "step": 7940 }, { "epoch": 2.648432288192128, "grad_norm": 2.092884063720703, "step": 7940 }, { "epoch": 2.648432288192128, "learning_rate": 0.000689782624395803, "step": 7940 }, { "epoch": 2.648432288192128, "loss": 1.519085168838501, "step": 7940 }, { "ce_loss": 0.3875352144241333, "epoch": 2.648432288192128, "step": 7940 }, { "distill_loss": 0.4946945607662201, "epoch": 2.648432288192128, "step": 7940 }, { "epoch": 2.648432288192128, "ref_ce_loss": 0.25518402457237244, "step": 7940 }, { "epoch": 2.648432288192128, "loss": 1.0586838722229004, "step": 7940 }, { "ce_loss": 0.3155548572540283, "epoch": 2.648432288192128, "step": 7940 }, { "distill_loss": 0.43945226073265076, "epoch": 2.648432288192128, "step": 7940 }, { "epoch": 2.648432288192128, "ref_ce_loss": 0.22807960212230682, "step": 7940 }, { "epoch": 2.6517678452301534, "loss": 1.1394, "step": 7950 }, { "epoch": 2.6517678452301534, "grad_norm": 1.931730031967163, "step": 7950 }, { "epoch": 2.6517678452301534, "learning_rate": 0.0006894845785850759, "step": 7950 }, { "epoch": 2.6517678452301534, "loss": 1.1163417100906372, "step": 7950 }, { "ce_loss": 0.28073281049728394, "epoch": 2.6517678452301534, "step": 7950 }, { "distill_loss": 0.525175929069519, "epoch": 2.6517678452301534, "step": 7950 }, { "epoch": 2.6517678452301534, "ref_ce_loss": 0.2490440309047699, "step": 7950 }, { "epoch": 2.6517678452301534, "loss": 1.207800030708313, "step": 7950 }, { "ce_loss": 0.3547218441963196, "epoch": 2.6517678452301534, "step": 7950 }, { "distill_loss": 0.5505259037017822, "epoch": 2.6517678452301534, "step": 7950 }, { "epoch": 2.6517678452301534, "ref_ce_loss": 0.24187520146369934, "step": 7950 }, { "epoch": 2.6551034022681788, "loss": 1.2237, "step": 7960 }, { "epoch": 2.6551034022681788, "grad_norm": 2.8231544494628906, "step": 7960 }, { "epoch": 2.6551034022681788, "learning_rate": 0.0006891861949146959, "step": 7960 }, { "epoch": 2.6551034022681788, "loss": 1.2380450963974, "step": 7960 }, { "ce_loss": 0.31260645389556885, "epoch": 2.6551034022681788, "step": 7960 }, { "distill_loss": 0.38258302211761475, "epoch": 2.6551034022681788, "step": 7960 }, { "epoch": 2.6551034022681788, "ref_ce_loss": 0.23153412342071533, "step": 7960 }, { "epoch": 2.6551034022681788, "loss": 1.231788158416748, "step": 7960 }, { "ce_loss": 0.307564377784729, "epoch": 2.6551034022681788, "step": 7960 }, { "distill_loss": 0.461772620677948, "epoch": 2.6551034022681788, "step": 7960 }, { "epoch": 2.6551034022681788, "ref_ce_loss": 0.18721505999565125, "step": 7960 }, { "epoch": 2.658438959306204, "loss": 1.0779, "step": 7970 }, { "epoch": 2.658438959306204, "grad_norm": 1.6480693817138672, "step": 7970 }, { "epoch": 2.658438959306204, "learning_rate": 0.0006888874737329087, "step": 7970 }, { "epoch": 2.658438959306204, "loss": 0.8892300128936768, "step": 7970 }, { "ce_loss": 0.25990501046180725, "epoch": 2.658438959306204, "step": 7970 }, { "distill_loss": 0.33113154768943787, "epoch": 2.658438959306204, "step": 7970 }, { "epoch": 2.658438959306204, "ref_ce_loss": 0.25036299228668213, "step": 7970 }, { "epoch": 2.658438959306204, "loss": 1.1123273372650146, "step": 7970 }, { "ce_loss": 0.3147440552711487, "epoch": 2.658438959306204, "step": 7970 }, { "distill_loss": 0.4517229199409485, "epoch": 2.658438959306204, "step": 7970 }, { "epoch": 2.658438959306204, "ref_ce_loss": 0.2724269926548004, "step": 7970 }, { "epoch": 2.6617745163442295, "loss": 1.0683, "step": 7980 }, { "epoch": 2.6617745163442295, "grad_norm": 2.092548131942749, "step": 7980 }, { "epoch": 2.6617745163442295, "learning_rate": 0.000688588415388354, "step": 7980 }, { "epoch": 2.6617745163442295, "loss": 0.9862041473388672, "step": 7980 }, { "ce_loss": 0.31494665145874023, "epoch": 2.6617745163442295, "step": 7980 }, { "distill_loss": 0.45636996626853943, "epoch": 2.6617745163442295, "step": 7980 }, { "epoch": 2.6617745163442295, "ref_ce_loss": 0.21455398201942444, "step": 7980 }, { "epoch": 2.6617745163442295, "loss": 0.9188050031661987, "step": 7980 }, { "ce_loss": 0.1882411688566208, "epoch": 2.6617745163442295, "step": 7980 }, { "distill_loss": 0.445490300655365, "epoch": 2.6617745163442295, "step": 7980 }, { "epoch": 2.6617745163442295, "ref_ce_loss": 0.1915428638458252, "step": 7980 }, { "epoch": 2.665110073382255, "loss": 1.095, "step": 7990 }, { "epoch": 2.665110073382255, "grad_norm": 2.971047878265381, "step": 7990 }, { "epoch": 2.665110073382255, "learning_rate": 0.0006882890202300653, "step": 7990 }, { "epoch": 2.665110073382255, "loss": 1.2398712635040283, "step": 7990 }, { "ce_loss": 0.1910310983657837, "epoch": 2.665110073382255, "step": 7990 }, { "distill_loss": 0.4677865207195282, "epoch": 2.665110073382255, "step": 7990 }, { "epoch": 2.665110073382255, "ref_ce_loss": 0.1882854700088501, "step": 7990 }, { "epoch": 2.665110073382255, "loss": 1.357975721359253, "step": 7990 }, { "ce_loss": 0.2677784264087677, "epoch": 2.665110073382255, "step": 7990 }, { "distill_loss": 0.45560523867607117, "epoch": 2.665110073382255, "step": 7990 }, { "epoch": 2.665110073382255, "ref_ce_loss": 0.22759392857551575, "step": 7990 }, { "epoch": 2.66844563042028, "loss": 1.0626, "step": 8000 }, { "epoch": 2.66844563042028, "grad_norm": 1.8649590015411377, "step": 8000 }, { "epoch": 2.66844563042028, "learning_rate": 0.0006879892886074686, "step": 8000 }, { "epoch": 2.66844563042028, "loss": 1.2235169410705566, "step": 8000 }, { "ce_loss": 0.2407296597957611, "epoch": 2.66844563042028, "step": 8000 }, { "distill_loss": 0.41924282908439636, "epoch": 2.66844563042028, "step": 8000 }, { "epoch": 2.66844563042028, "ref_ce_loss": 0.19686494767665863, "step": 8000 }, { "epoch": 2.66844563042028, "loss": 0.7969648838043213, "step": 8000 }, { "ce_loss": 0.20408573746681213, "epoch": 2.66844563042028, "step": 8000 }, { "distill_loss": 0.3923669159412384, "epoch": 2.66844563042028, "step": 8000 }, { "epoch": 2.66844563042028, "ref_ce_loss": 0.1550588309764862, "step": 8000 }, { "epoch": 2.6717811874583055, "loss": 1.0854, "step": 8010 }, { "epoch": 2.6717811874583055, "grad_norm": 1.7967385053634644, "step": 8010 }, { "epoch": 2.6717811874583055, "learning_rate": 0.0006876892208703833, "step": 8010 }, { "epoch": 2.6717811874583055, "loss": 0.9698286652565002, "step": 8010 }, { "ce_loss": 0.26138097047805786, "epoch": 2.6717811874583055, "step": 8010 }, { "distill_loss": 0.40205711126327515, "epoch": 2.6717811874583055, "step": 8010 }, { "epoch": 2.6717811874583055, "ref_ce_loss": 0.18110914528369904, "step": 8010 }, { "epoch": 2.6717811874583055, "loss": 0.9017637968063354, "step": 8010 }, { "ce_loss": 0.2703733742237091, "epoch": 2.6717811874583055, "step": 8010 }, { "distill_loss": 0.404426634311676, "epoch": 2.6717811874583055, "step": 8010 }, { "epoch": 2.6717811874583055, "ref_ce_loss": 0.17171475291252136, "step": 8010 }, { "epoch": 2.675116744496331, "loss": 1.0782, "step": 8020 }, { "epoch": 2.675116744496331, "grad_norm": 1.5138238668441772, "step": 8020 }, { "epoch": 2.675116744496331, "learning_rate": 0.0006873888173690207, "step": 8020 }, { "epoch": 2.675116744496331, "loss": 0.7773812413215637, "step": 8020 }, { "ce_loss": 0.20126408338546753, "epoch": 2.675116744496331, "step": 8020 }, { "distill_loss": 0.33068251609802246, "epoch": 2.675116744496331, "step": 8020 }, { "epoch": 2.675116744496331, "ref_ce_loss": 0.17736712098121643, "step": 8020 }, { "epoch": 2.675116744496331, "loss": 1.1545333862304688, "step": 8020 }, { "ce_loss": 0.3212941884994507, "epoch": 2.675116744496331, "step": 8020 }, { "distill_loss": 0.43402695655822754, "epoch": 2.675116744496331, "step": 8020 }, { "epoch": 2.675116744496331, "ref_ce_loss": 0.2017740160226822, "step": 8020 }, { "epoch": 2.678452301534356, "loss": 0.9684, "step": 8030 }, { "epoch": 2.678452301534356, "grad_norm": 1.565229892730713, "step": 8030 }, { "epoch": 2.678452301534356, "learning_rate": 0.0006870880784539837, "step": 8030 }, { "epoch": 2.678452301534356, "loss": 0.9166897535324097, "step": 8030 }, { "ce_loss": 0.24335050582885742, "epoch": 2.678452301534356, "step": 8030 }, { "distill_loss": 0.4135293960571289, "epoch": 2.678452301534356, "step": 8030 }, { "epoch": 2.678452301534356, "ref_ce_loss": 0.18786965310573578, "step": 8030 }, { "epoch": 2.678452301534356, "loss": 0.891508936882019, "step": 8030 }, { "ce_loss": 0.2426663637161255, "epoch": 2.678452301534356, "step": 8030 }, { "distill_loss": 0.37967848777770996, "epoch": 2.678452301534356, "step": 8030 }, { "epoch": 2.678452301534356, "ref_ce_loss": 0.18929001688957214, "step": 8030 }, { "epoch": 2.6817878585723816, "loss": 1.146, "step": 8040 }, { "epoch": 2.6817878585723816, "grad_norm": 2.639207363128662, "step": 8040 }, { "epoch": 2.6817878585723816, "learning_rate": 0.0006867870044762672, "step": 8040 }, { "epoch": 2.6817878585723816, "loss": 1.2057491540908813, "step": 8040 }, { "ce_loss": 0.27712738513946533, "epoch": 2.6817878585723816, "step": 8040 }, { "distill_loss": 0.40351033210754395, "epoch": 2.6817878585723816, "step": 8040 }, { "epoch": 2.6817878585723816, "ref_ce_loss": 0.2716418206691742, "step": 8040 }, { "epoch": 2.6817878585723816, "loss": 0.9097023606300354, "step": 8040 }, { "ce_loss": 0.2758481502532959, "epoch": 2.6817878585723816, "step": 8040 }, { "distill_loss": 0.41735514998435974, "epoch": 2.6817878585723816, "step": 8040 }, { "epoch": 2.6817878585723816, "ref_ce_loss": 0.21631482243537903, "step": 8040 }, { "epoch": 2.685123415610407, "loss": 1.0075, "step": 8050 }, { "epoch": 2.685123415610407, "grad_norm": 2.203000783920288, "step": 8050 }, { "epoch": 2.685123415610407, "learning_rate": 0.0006864855957872571, "step": 8050 }, { "epoch": 2.685123415610407, "loss": 1.179887294769287, "step": 8050 }, { "ce_loss": 0.3261899948120117, "epoch": 2.685123415610407, "step": 8050 }, { "distill_loss": 0.4284709393978119, "epoch": 2.685123415610407, "step": 8050 }, { "epoch": 2.685123415610407, "ref_ce_loss": 0.23692551255226135, "step": 8050 }, { "epoch": 2.685123415610407, "loss": 0.9125145077705383, "step": 8050 }, { "ce_loss": 0.26923179626464844, "epoch": 2.685123415610407, "step": 8050 }, { "distill_loss": 0.3452901244163513, "epoch": 2.685123415610407, "step": 8050 }, { "epoch": 2.685123415610407, "ref_ce_loss": 0.23391097784042358, "step": 8050 }, { "epoch": 2.6884589726484323, "loss": 1.1154, "step": 8060 }, { "epoch": 2.6884589726484323, "grad_norm": 4.837062835693359, "step": 8060 }, { "epoch": 2.6884589726484323, "learning_rate": 0.0006861838527387296, "step": 8060 }, { "epoch": 2.6884589726484323, "loss": 0.9809233546257019, "step": 8060 }, { "ce_loss": 0.28013816475868225, "epoch": 2.6884589726484323, "step": 8060 }, { "distill_loss": 0.40941011905670166, "epoch": 2.6884589726484323, "step": 8060 }, { "epoch": 2.6884589726484323, "ref_ce_loss": 0.1691157966852188, "step": 8060 }, { "epoch": 2.6884589726484323, "loss": 1.184672236442566, "step": 8060 }, { "ce_loss": 0.3251360356807709, "epoch": 2.6884589726484323, "step": 8060 }, { "distill_loss": 0.48578941822052, "epoch": 2.6884589726484323, "step": 8060 }, { "epoch": 2.6884589726484323, "ref_ce_loss": 0.18599502742290497, "step": 8060 }, { "epoch": 2.6917945296864576, "loss": 1.0589, "step": 8070 }, { "epoch": 2.6917945296864576, "grad_norm": 2.380504846572876, "step": 8070 }, { "epoch": 2.6917945296864576, "learning_rate": 0.0006858817756828511, "step": 8070 }, { "epoch": 2.6917945296864576, "loss": 0.8261231184005737, "step": 8070 }, { "ce_loss": 0.21635033190250397, "epoch": 2.6917945296864576, "step": 8070 }, { "distill_loss": 0.36327552795410156, "epoch": 2.6917945296864576, "step": 8070 }, { "epoch": 2.6917945296864576, "ref_ce_loss": 0.17453186213970184, "step": 8070 }, { "epoch": 2.6917945296864576, "loss": 1.0964490175247192, "step": 8070 }, { "ce_loss": 0.2572353780269623, "epoch": 2.6917945296864576, "step": 8070 }, { "distill_loss": 0.41422608494758606, "epoch": 2.6917945296864576, "step": 8070 }, { "epoch": 2.6917945296864576, "ref_ce_loss": 0.19209592044353485, "step": 8070 }, { "epoch": 2.695130086724483, "loss": 1.1104, "step": 8080 }, { "epoch": 2.695130086724483, "grad_norm": 1.6355799436569214, "step": 8080 }, { "epoch": 2.695130086724483, "learning_rate": 0.0006855793649721783, "step": 8080 }, { "epoch": 2.695130086724483, "loss": 1.1212942600250244, "step": 8080 }, { "ce_loss": 0.2441914677619934, "epoch": 2.695130086724483, "step": 8080 }, { "distill_loss": 0.44367820024490356, "epoch": 2.695130086724483, "step": 8080 }, { "epoch": 2.695130086724483, "ref_ce_loss": 0.18526434898376465, "step": 8080 }, { "epoch": 2.695130086724483, "loss": 1.0341012477874756, "step": 8080 }, { "ce_loss": 0.2990477681159973, "epoch": 2.695130086724483, "step": 8080 }, { "distill_loss": 0.4595893919467926, "epoch": 2.695130086724483, "step": 8080 }, { "epoch": 2.695130086724483, "ref_ce_loss": 0.21027454733848572, "step": 8080 }, { "epoch": 2.6984656437625083, "loss": 1.0644, "step": 8090 }, { "epoch": 2.6984656437625083, "grad_norm": 3.036402702331543, "step": 8090 }, { "epoch": 2.6984656437625083, "learning_rate": 0.000685276620959657, "step": 8090 }, { "epoch": 2.6984656437625083, "loss": 1.0285917520523071, "step": 8090 }, { "ce_loss": 0.23727817833423615, "epoch": 2.6984656437625083, "step": 8090 }, { "distill_loss": 0.3857990801334381, "epoch": 2.6984656437625083, "step": 8090 }, { "epoch": 2.6984656437625083, "ref_ce_loss": 0.18309199810028076, "step": 8090 }, { "epoch": 2.6984656437625083, "loss": 1.0543606281280518, "step": 8090 }, { "ce_loss": 0.2791891396045685, "epoch": 2.6984656437625083, "step": 8090 }, { "distill_loss": 0.4386514723300934, "epoch": 2.6984656437625083, "step": 8090 }, { "epoch": 2.6984656437625083, "ref_ce_loss": 0.20415925979614258, "step": 8090 }, { "epoch": 2.7018012008005337, "loss": 1.0951, "step": 8100 }, { "epoch": 2.7018012008005337, "grad_norm": 1.9897665977478027, "step": 8100 }, { "epoch": 2.7018012008005337, "learning_rate": 0.000684973543998622, "step": 8100 }, { "epoch": 2.7018012008005337, "loss": 0.915390133857727, "step": 8100 }, { "ce_loss": 0.2011374980211258, "epoch": 2.7018012008005337, "step": 8100 }, { "distill_loss": 0.37561312317848206, "epoch": 2.7018012008005337, "step": 8100 }, { "epoch": 2.7018012008005337, "ref_ce_loss": 0.16644081473350525, "step": 8100 }, { "epoch": 2.7018012008005337, "loss": 1.2802246809005737, "step": 8100 }, { "ce_loss": 0.2943989336490631, "epoch": 2.7018012008005337, "step": 8100 }, { "distill_loss": 0.44954586029052734, "epoch": 2.7018012008005337, "step": 8100 }, { "epoch": 2.7018012008005337, "ref_ce_loss": 0.2091292142868042, "step": 8100 }, { "epoch": 2.705136757838559, "loss": 1.0826, "step": 8110 }, { "epoch": 2.705136757838559, "grad_norm": 1.6685622930526733, "step": 8110 }, { "epoch": 2.705136757838559, "learning_rate": 0.0006846701344427967, "step": 8110 }, { "epoch": 2.705136757838559, "loss": 0.8103539943695068, "step": 8110 }, { "ce_loss": 0.2346825748682022, "epoch": 2.705136757838559, "step": 8110 }, { "distill_loss": 0.32806381583213806, "epoch": 2.705136757838559, "step": 8110 }, { "epoch": 2.705136757838559, "ref_ce_loss": 0.17953945696353912, "step": 8110 }, { "epoch": 2.705136757838559, "loss": 1.1475951671600342, "step": 8110 }, { "ce_loss": 0.2751637399196625, "epoch": 2.705136757838559, "step": 8110 }, { "distill_loss": 0.323055237531662, "epoch": 2.705136757838559, "step": 8110 }, { "epoch": 2.705136757838559, "ref_ce_loss": 0.21816794574260712, "step": 8110 }, { "epoch": 2.7084723148765844, "loss": 1.0316, "step": 8120 }, { "epoch": 2.7084723148765844, "grad_norm": 17.63924217224121, "step": 8120 }, { "epoch": 2.7084723148765844, "learning_rate": 0.0006843663926462927, "step": 8120 }, { "epoch": 2.7084723148765844, "loss": 1.017307996749878, "step": 8120 }, { "ce_loss": 0.27584630250930786, "epoch": 2.7084723148765844, "step": 8120 }, { "distill_loss": 0.4709221422672272, "epoch": 2.7084723148765844, "step": 8120 }, { "epoch": 2.7084723148765844, "ref_ce_loss": 0.21881969273090363, "step": 8120 }, { "epoch": 2.7084723148765844, "loss": 1.410461187362671, "step": 8120 }, { "ce_loss": 0.3697640299797058, "epoch": 2.7084723148765844, "step": 8120 }, { "distill_loss": 0.5594056248664856, "epoch": 2.7084723148765844, "step": 8120 }, { "epoch": 2.7084723148765844, "ref_ce_loss": 0.2293158620595932, "step": 8120 }, { "epoch": 2.7118078719146097, "loss": 1.1385, "step": 8130 }, { "epoch": 2.7118078719146097, "grad_norm": 1.8438341617584229, "step": 8130 }, { "epoch": 2.7118078719146097, "learning_rate": 0.0006840623189636095, "step": 8130 }, { "epoch": 2.7118078719146097, "loss": 0.8964990973472595, "step": 8130 }, { "ce_loss": 0.22235225141048431, "epoch": 2.7118078719146097, "step": 8130 }, { "distill_loss": 0.3620837926864624, "epoch": 2.7118078719146097, "step": 8130 }, { "epoch": 2.7118078719146097, "ref_ce_loss": 0.13280965387821198, "step": 8130 }, { "epoch": 2.7118078719146097, "loss": 1.0640838146209717, "step": 8130 }, { "ce_loss": 0.20717808604240417, "epoch": 2.7118078719146097, "step": 8130 }, { "distill_loss": 0.38882189989089966, "epoch": 2.7118078719146097, "step": 8130 }, { "epoch": 2.7118078719146097, "ref_ce_loss": 0.1656038612127304, "step": 8130 }, { "epoch": 2.715143428952635, "loss": 1.0427, "step": 8140 }, { "epoch": 2.715143428952635, "grad_norm": 1.8322330713272095, "step": 8140 }, { "epoch": 2.715143428952635, "learning_rate": 0.0006837579137496336, "step": 8140 }, { "epoch": 2.715143428952635, "loss": 1.317813515663147, "step": 8140 }, { "ce_loss": 0.35364487767219543, "epoch": 2.715143428952635, "step": 8140 }, { "distill_loss": 0.48588964343070984, "epoch": 2.715143428952635, "step": 8140 }, { "epoch": 2.715143428952635, "ref_ce_loss": 0.22830376029014587, "step": 8140 }, { "epoch": 2.715143428952635, "loss": 1.2036542892456055, "step": 8140 }, { "ce_loss": 0.2760239243507385, "epoch": 2.715143428952635, "step": 8140 }, { "distill_loss": 0.4591331481933594, "epoch": 2.715143428952635, "step": 8140 }, { "epoch": 2.715143428952635, "ref_ce_loss": 0.24318912625312805, "step": 8140 }, { "epoch": 2.7184789859906604, "loss": 1.1119, "step": 8150 }, { "epoch": 2.7184789859906604, "grad_norm": 1.984826922416687, "step": 8150 }, { "epoch": 2.7184789859906604, "learning_rate": 0.0006834531773596388, "step": 8150 }, { "epoch": 2.7184789859906604, "loss": 1.117841362953186, "step": 8150 }, { "ce_loss": 0.32421332597732544, "epoch": 2.7184789859906604, "step": 8150 }, { "distill_loss": 0.47627392411231995, "epoch": 2.7184789859906604, "step": 8150 }, { "epoch": 2.7184789859906604, "ref_ce_loss": 0.23721623420715332, "step": 8150 }, { "epoch": 2.7184789859906604, "loss": 0.6992880702018738, "step": 8150 }, { "ce_loss": 0.1821093112230301, "epoch": 2.7184789859906604, "step": 8150 }, { "distill_loss": 0.3342851400375366, "epoch": 2.7184789859906604, "step": 8150 }, { "epoch": 2.7184789859906604, "ref_ce_loss": 0.1452513188123703, "step": 8150 }, { "epoch": 2.7218145430286858, "loss": 1.0617, "step": 8160 }, { "epoch": 2.7218145430286858, "grad_norm": 2.205428123474121, "step": 8160 }, { "epoch": 2.7218145430286858, "learning_rate": 0.0006831481101492852, "step": 8160 }, { "epoch": 2.7218145430286858, "loss": 0.9553093314170837, "step": 8160 }, { "ce_loss": 0.28660255670547485, "epoch": 2.7218145430286858, "step": 8160 }, { "distill_loss": 0.4710615277290344, "epoch": 2.7218145430286858, "step": 8160 }, { "epoch": 2.7218145430286858, "ref_ce_loss": 0.1975298523902893, "step": 8160 }, { "epoch": 2.7218145430286858, "loss": 0.9735440611839294, "step": 8160 }, { "ce_loss": 0.20919136703014374, "epoch": 2.7218145430286858, "step": 8160 }, { "distill_loss": 0.39127933979034424, "epoch": 2.7218145430286858, "step": 8160 }, { "epoch": 2.7218145430286858, "ref_ce_loss": 0.1659659743309021, "step": 8160 }, { "epoch": 2.725150100066711, "loss": 1.1133, "step": 8170 }, { "epoch": 2.725150100066711, "grad_norm": 5.617023944854736, "step": 8170 }, { "epoch": 2.725150100066711, "learning_rate": 0.000682842712474619, "step": 8170 }, { "epoch": 2.725150100066711, "loss": 0.883911669254303, "step": 8170 }, { "ce_loss": 0.24566011130809784, "epoch": 2.725150100066711, "step": 8170 }, { "distill_loss": 0.40629303455352783, "epoch": 2.725150100066711, "step": 8170 }, { "epoch": 2.725150100066711, "ref_ce_loss": 0.186699777841568, "step": 8170 }, { "epoch": 2.725150100066711, "loss": 0.915961742401123, "step": 8170 }, { "ce_loss": 0.27848586440086365, "epoch": 2.725150100066711, "step": 8170 }, { "distill_loss": 0.4564547836780548, "epoch": 2.725150100066711, "step": 8170 }, { "epoch": 2.725150100066711, "ref_ce_loss": 0.18077746033668518, "step": 8170 }, { "epoch": 2.7284856571047365, "loss": 1.0033, "step": 8180 }, { "epoch": 2.7284856571047365, "grad_norm": 2.667583703994751, "step": 8180 }, { "epoch": 2.7284856571047365, "learning_rate": 0.0006825369846920722, "step": 8180 }, { "epoch": 2.7284856571047365, "loss": 0.8582113981246948, "step": 8180 }, { "ce_loss": 0.2530399262905121, "epoch": 2.7284856571047365, "step": 8180 }, { "distill_loss": 0.41539907455444336, "epoch": 2.7284856571047365, "step": 8180 }, { "epoch": 2.7284856571047365, "ref_ce_loss": 0.1895999312400818, "step": 8180 }, { "epoch": 2.7284856571047365, "loss": 1.4451714754104614, "step": 8180 }, { "ce_loss": 0.21054810285568237, "epoch": 2.7284856571047365, "step": 8180 }, { "distill_loss": 0.4180562496185303, "epoch": 2.7284856571047365, "step": 8180 }, { "epoch": 2.7284856571047365, "ref_ce_loss": 0.17314787209033966, "step": 8180 }, { "epoch": 2.731821214142762, "loss": 1.0923, "step": 8190 }, { "epoch": 2.731821214142762, "grad_norm": 2.334686279296875, "step": 8190 }, { "epoch": 2.731821214142762, "learning_rate": 0.0006822309271584622, "step": 8190 }, { "epoch": 2.731821214142762, "loss": 0.958365797996521, "step": 8190 }, { "ce_loss": 0.3037683665752411, "epoch": 2.731821214142762, "step": 8190 }, { "distill_loss": 0.41571107506752014, "epoch": 2.731821214142762, "step": 8190 }, { "epoch": 2.731821214142762, "ref_ce_loss": 0.186298206448555, "step": 8190 }, { "epoch": 2.731821214142762, "loss": 1.1305922269821167, "step": 8190 }, { "ce_loss": 0.2983015775680542, "epoch": 2.731821214142762, "step": 8190 }, { "distill_loss": 0.42953890562057495, "epoch": 2.731821214142762, "step": 8190 }, { "epoch": 2.731821214142762, "ref_ce_loss": 0.2160416692495346, "step": 8190 }, { "epoch": 2.735156771180787, "loss": 1.1597, "step": 8200 }, { "epoch": 2.735156771180787, "grad_norm": 2.6440696716308594, "step": 8200 }, { "epoch": 2.735156771180787, "learning_rate": 0.0006819245402309907, "step": 8200 }, { "epoch": 2.735156771180787, "loss": 1.0382781028747559, "step": 8200 }, { "ce_loss": 0.267867773771286, "epoch": 2.735156771180787, "step": 8200 }, { "distill_loss": 0.39029863476753235, "epoch": 2.735156771180787, "step": 8200 }, { "epoch": 2.735156771180787, "ref_ce_loss": 0.23540931940078735, "step": 8200 }, { "epoch": 2.735156771180787, "loss": 0.9972876906394958, "step": 8200 }, { "ce_loss": 0.3134215474128723, "epoch": 2.735156771180787, "step": 8200 }, { "distill_loss": 0.42370691895484924, "epoch": 2.735156771180787, "step": 8200 }, { "epoch": 2.735156771180787, "ref_ce_loss": 0.20167101919651031, "step": 8200 }, { "epoch": 2.7384923282188125, "loss": 1.1542, "step": 8210 }, { "epoch": 2.7384923282188125, "grad_norm": 2.106982469558716, "step": 8210 }, { "epoch": 2.7384923282188125, "learning_rate": 0.0006816178242672446, "step": 8210 }, { "epoch": 2.7384923282188125, "loss": 0.9670668244361877, "step": 8210 }, { "ce_loss": 0.3226143419742584, "epoch": 2.7384923282188125, "step": 8210 }, { "distill_loss": 0.4178784489631653, "epoch": 2.7384923282188125, "step": 8210 }, { "epoch": 2.7384923282188125, "ref_ce_loss": 0.22630652785301208, "step": 8210 }, { "epoch": 2.7384923282188125, "loss": 1.2102785110473633, "step": 8210 }, { "ce_loss": 0.31588634848594666, "epoch": 2.7384923282188125, "step": 8210 }, { "distill_loss": 0.4046383798122406, "epoch": 2.7384923282188125, "step": 8210 }, { "epoch": 2.7384923282188125, "ref_ce_loss": 0.21099373698234558, "step": 8210 }, { "epoch": 2.741827885256838, "loss": 1.1407, "step": 8220 }, { "epoch": 2.741827885256838, "grad_norm": 2.294503688812256, "step": 8220 }, { "epoch": 2.741827885256838, "learning_rate": 0.000681310779625194, "step": 8220 }, { "epoch": 2.741827885256838, "loss": 0.7896057367324829, "step": 8220 }, { "ce_loss": 0.22302307188510895, "epoch": 2.741827885256838, "step": 8220 }, { "distill_loss": 0.4146197736263275, "epoch": 2.741827885256838, "step": 8220 }, { "epoch": 2.741827885256838, "ref_ce_loss": 0.15178342163562775, "step": 8220 }, { "epoch": 2.741827885256838, "loss": 1.2498191595077515, "step": 8220 }, { "ce_loss": 0.31091174483299255, "epoch": 2.741827885256838, "step": 8220 }, { "distill_loss": 0.4844982624053955, "epoch": 2.741827885256838, "step": 8220 }, { "epoch": 2.741827885256838, "ref_ce_loss": 0.22255325317382812, "step": 8220 }, { "epoch": 2.745163442294863, "loss": 1.0505, "step": 8230 }, { "epoch": 2.745163442294863, "grad_norm": 3.4932470321655273, "step": 8230 }, { "epoch": 2.745163442294863, "learning_rate": 0.0006810034066631935, "step": 8230 }, { "epoch": 2.745163442294863, "loss": 0.8021654486656189, "step": 8230 }, { "ce_loss": 0.2264508605003357, "epoch": 2.745163442294863, "step": 8230 }, { "distill_loss": 0.35844430327415466, "epoch": 2.745163442294863, "step": 8230 }, { "epoch": 2.745163442294863, "ref_ce_loss": 0.1736733764410019, "step": 8230 }, { "epoch": 2.745163442294863, "loss": 0.8777614831924438, "step": 8230 }, { "ce_loss": 0.29190006852149963, "epoch": 2.745163442294863, "step": 8230 }, { "distill_loss": 0.39581429958343506, "epoch": 2.745163442294863, "step": 8230 }, { "epoch": 2.745163442294863, "ref_ce_loss": 0.18982553482055664, "step": 8230 }, { "epoch": 2.7484989993328885, "loss": 0.9949, "step": 8240 }, { "epoch": 2.7484989993328885, "grad_norm": 1.8267552852630615, "step": 8240 }, { "epoch": 2.7484989993328885, "learning_rate": 0.0006806957057399802, "step": 8240 }, { "epoch": 2.7484989993328885, "loss": 0.9348551630973816, "step": 8240 }, { "ce_loss": 0.2395116537809372, "epoch": 2.7484989993328885, "step": 8240 }, { "distill_loss": 0.43146926164627075, "epoch": 2.7484989993328885, "step": 8240 }, { "epoch": 2.7484989993328885, "ref_ce_loss": 0.18208366632461548, "step": 8240 }, { "epoch": 2.7484989993328885, "loss": 0.9479954838752747, "step": 8240 }, { "ce_loss": 0.2667170464992523, "epoch": 2.7484989993328885, "step": 8240 }, { "distill_loss": 0.4461016058921814, "epoch": 2.7484989993328885, "step": 8240 }, { "epoch": 2.7484989993328885, "ref_ce_loss": 0.18788105249404907, "step": 8240 }, { "epoch": 2.751834556370914, "loss": 1.1488, "step": 8250 }, { "epoch": 2.751834556370914, "grad_norm": 1.7486326694488525, "step": 8250 }, { "epoch": 2.751834556370914, "learning_rate": 0.0006803876772146741, "step": 8250 }, { "epoch": 2.751834556370914, "loss": 1.0329201221466064, "step": 8250 }, { "ce_loss": 0.3410790264606476, "epoch": 2.751834556370914, "step": 8250 }, { "distill_loss": 0.4854319989681244, "epoch": 2.751834556370914, "step": 8250 }, { "epoch": 2.751834556370914, "ref_ce_loss": 0.2022295445203781, "step": 8250 }, { "epoch": 2.751834556370914, "loss": 0.7077986598014832, "step": 8250 }, { "ce_loss": 0.20355767011642456, "epoch": 2.751834556370914, "step": 8250 }, { "distill_loss": 0.3223862648010254, "epoch": 2.751834556370914, "step": 8250 }, { "epoch": 2.751834556370914, "ref_ce_loss": 0.14093159139156342, "step": 8250 }, { "epoch": 2.7551701134089392, "loss": 1.0387, "step": 8260 }, { "epoch": 2.7551701134089392, "grad_norm": 1.5510363578796387, "step": 8260 }, { "epoch": 2.7551701134089392, "learning_rate": 0.0006800793214467776, "step": 8260 }, { "epoch": 2.7551701134089392, "loss": 0.9963008165359497, "step": 8260 }, { "ce_loss": 0.2472948133945465, "epoch": 2.7551701134089392, "step": 8260 }, { "distill_loss": 0.417851060628891, "epoch": 2.7551701134089392, "step": 8260 }, { "epoch": 2.7551701134089392, "ref_ce_loss": 0.17874693870544434, "step": 8260 }, { "epoch": 2.7551701134089392, "loss": 1.1251202821731567, "step": 8260 }, { "ce_loss": 0.31395453214645386, "epoch": 2.7551701134089392, "step": 8260 }, { "distill_loss": 0.48657265305519104, "epoch": 2.7551701134089392, "step": 8260 }, { "epoch": 2.7551701134089392, "ref_ce_loss": 0.239948108792305, "step": 8260 }, { "epoch": 2.7585056704469646, "loss": 1.0674, "step": 8270 }, { "epoch": 2.7585056704469646, "grad_norm": 2.9985060691833496, "step": 8270 }, { "epoch": 2.7585056704469646, "learning_rate": 0.0006797706387961754, "step": 8270 }, { "epoch": 2.7585056704469646, "loss": 1.797513484954834, "step": 8270 }, { "ce_loss": 0.330750048160553, "epoch": 2.7585056704469646, "step": 8270 }, { "distill_loss": 0.6316664814949036, "epoch": 2.7585056704469646, "step": 8270 }, { "epoch": 2.7585056704469646, "ref_ce_loss": 0.2871108651161194, "step": 8270 }, { "epoch": 2.7585056704469646, "loss": 1.196751356124878, "step": 8270 }, { "ce_loss": 0.28149712085723877, "epoch": 2.7585056704469646, "step": 8270 }, { "distill_loss": 0.5029041171073914, "epoch": 2.7585056704469646, "step": 8270 }, { "epoch": 2.7585056704469646, "ref_ce_loss": 0.22309550642967224, "step": 8270 }, { "epoch": 2.76184122748499, "loss": 1.1136, "step": 8280 }, { "epoch": 2.76184122748499, "grad_norm": 1.9858359098434448, "step": 8280 }, { "epoch": 2.76184122748499, "learning_rate": 0.0006794616296231331, "step": 8280 }, { "epoch": 2.76184122748499, "loss": 1.0421366691589355, "step": 8280 }, { "ce_loss": 0.25688180327415466, "epoch": 2.76184122748499, "step": 8280 }, { "distill_loss": 0.41856256127357483, "epoch": 2.76184122748499, "step": 8280 }, { "epoch": 2.76184122748499, "ref_ce_loss": 0.19284594058990479, "step": 8280 }, { "epoch": 2.76184122748499, "loss": 0.9045016765594482, "step": 8280 }, { "ce_loss": 0.24156159162521362, "epoch": 2.76184122748499, "step": 8280 }, { "distill_loss": 0.4102836847305298, "epoch": 2.76184122748499, "step": 8280 }, { "epoch": 2.76184122748499, "ref_ce_loss": 0.18480077385902405, "step": 8280 }, { "epoch": 2.7651767845230153, "loss": 1.0542, "step": 8290 }, { "epoch": 2.7651767845230153, "grad_norm": 2.0464863777160645, "step": 8290 }, { "epoch": 2.7651767845230153, "learning_rate": 0.0006791522942882976, "step": 8290 }, { "epoch": 2.7651767845230153, "loss": 1.046781063079834, "step": 8290 }, { "ce_loss": 0.2372472733259201, "epoch": 2.7651767845230153, "step": 8290 }, { "distill_loss": 0.48630839586257935, "epoch": 2.7651767845230153, "step": 8290 }, { "epoch": 2.7651767845230153, "ref_ce_loss": 0.21556465327739716, "step": 8290 }, { "epoch": 2.7651767845230153, "loss": 1.025158166885376, "step": 8290 }, { "ce_loss": 0.25177982449531555, "epoch": 2.7651767845230153, "step": 8290 }, { "distill_loss": 0.44891324639320374, "epoch": 2.7651767845230153, "step": 8290 }, { "epoch": 2.7651767845230153, "ref_ce_loss": 0.19279952347278595, "step": 8290 }, { "epoch": 2.7685123415610406, "loss": 1.0936, "step": 8300 }, { "epoch": 2.7685123415610406, "grad_norm": 3.3846592903137207, "step": 8300 }, { "epoch": 2.7685123415610406, "learning_rate": 0.000678842633152697, "step": 8300 }, { "epoch": 2.7685123415610406, "loss": 0.7709948420524597, "step": 8300 }, { "ce_loss": 0.23251450061798096, "epoch": 2.7685123415610406, "step": 8300 }, { "distill_loss": 0.3573760986328125, "epoch": 2.7685123415610406, "step": 8300 }, { "epoch": 2.7685123415610406, "ref_ce_loss": 0.18105630576610565, "step": 8300 }, { "epoch": 2.7685123415610406, "loss": 0.9063900709152222, "step": 8300 }, { "ce_loss": 0.2689492106437683, "epoch": 2.7685123415610406, "step": 8300 }, { "distill_loss": 0.40178921818733215, "epoch": 2.7685123415610406, "step": 8300 }, { "epoch": 2.7685123415610406, "ref_ce_loss": 0.196784108877182, "step": 8300 }, { "epoch": 2.771847898599066, "loss": 0.9933, "step": 8310 }, { "epoch": 2.771847898599066, "grad_norm": 1.6398341655731201, "step": 8310 }, { "epoch": 2.771847898599066, "learning_rate": 0.0006785326465777384, "step": 8310 }, { "epoch": 2.771847898599066, "loss": 1.0609391927719116, "step": 8310 }, { "ce_loss": 0.3316725194454193, "epoch": 2.771847898599066, "step": 8310 }, { "distill_loss": 0.4921702742576599, "epoch": 2.771847898599066, "step": 8310 }, { "epoch": 2.771847898599066, "ref_ce_loss": 0.17898264527320862, "step": 8310 }, { "epoch": 2.771847898599066, "loss": 1.0043429136276245, "step": 8310 }, { "ce_loss": 0.26267266273498535, "epoch": 2.771847898599066, "step": 8310 }, { "distill_loss": 0.44388797879219055, "epoch": 2.771847898599066, "step": 8310 }, { "epoch": 2.771847898599066, "ref_ce_loss": 0.1953670084476471, "step": 8310 }, { "epoch": 2.7751834556370913, "loss": 1.0848, "step": 8320 }, { "epoch": 2.7751834556370913, "grad_norm": 2.018314838409424, "step": 8320 }, { "epoch": 2.7751834556370913, "learning_rate": 0.0006782223349252101, "step": 8320 }, { "epoch": 2.7751834556370913, "loss": 0.9457603096961975, "step": 8320 }, { "ce_loss": 0.22221912443637848, "epoch": 2.7751834556370913, "step": 8320 }, { "distill_loss": 0.46346578001976013, "epoch": 2.7751834556370913, "step": 8320 }, { "epoch": 2.7751834556370913, "ref_ce_loss": 0.24278980493545532, "step": 8320 }, { "epoch": 2.7751834556370913, "loss": 0.9505172967910767, "step": 8320 }, { "ce_loss": 0.3089073598384857, "epoch": 2.7751834556370913, "step": 8320 }, { "distill_loss": 0.41338473558425903, "epoch": 2.7751834556370913, "step": 8320 }, { "epoch": 2.7751834556370913, "ref_ce_loss": 0.17162297666072845, "step": 8320 }, { "epoch": 2.7785190126751167, "loss": 1.1974, "step": 8330 }, { "epoch": 2.7785190126751167, "grad_norm": 2.569556474685669, "step": 8330 }, { "epoch": 2.7785190126751167, "learning_rate": 0.0006779116985572789, "step": 8330 }, { "epoch": 2.7785190126751167, "loss": 0.789454996585846, "step": 8330 }, { "ce_loss": 0.24249128997325897, "epoch": 2.7785190126751167, "step": 8330 }, { "distill_loss": 0.34002089500427246, "epoch": 2.7785190126751167, "step": 8330 }, { "epoch": 2.7785190126751167, "ref_ce_loss": 0.20667454600334167, "step": 8330 }, { "epoch": 2.7785190126751167, "loss": 1.1776317358016968, "step": 8330 }, { "ce_loss": 0.35128793120384216, "epoch": 2.7785190126751167, "step": 8330 }, { "distill_loss": 0.4592365324497223, "epoch": 2.7785190126751167, "step": 8330 }, { "epoch": 2.7785190126751167, "ref_ce_loss": 0.21480117738246918, "step": 8330 }, { "epoch": 2.781854569713142, "loss": 1.0435, "step": 8340 }, { "epoch": 2.781854569713142, "grad_norm": 1.6675870418548584, "step": 8340 }, { "epoch": 2.781854569713142, "learning_rate": 0.0006776007378364909, "step": 8340 }, { "epoch": 2.781854569713142, "loss": 1.3947495222091675, "step": 8340 }, { "ce_loss": 0.25217753648757935, "epoch": 2.781854569713142, "step": 8340 }, { "distill_loss": 0.3868056535720825, "epoch": 2.781854569713142, "step": 8340 }, { "epoch": 2.781854569713142, "ref_ce_loss": 0.16207663714885712, "step": 8340 }, { "epoch": 2.781854569713142, "loss": 0.9727454781532288, "step": 8340 }, { "ce_loss": 0.34851181507110596, "epoch": 2.781854569713142, "step": 8340 }, { "distill_loss": 0.3823917806148529, "epoch": 2.781854569713142, "step": 8340 }, { "epoch": 2.781854569713142, "ref_ce_loss": 0.18593260645866394, "step": 8340 }, { "epoch": 2.7851901267511674, "loss": 1.0944, "step": 8350 }, { "epoch": 2.7851901267511674, "grad_norm": 1.8214737176895142, "step": 8350 }, { "epoch": 2.7851901267511674, "learning_rate": 0.0006772894531257709, "step": 8350 }, { "epoch": 2.7851901267511674, "loss": 1.036094069480896, "step": 8350 }, { "ce_loss": 0.31758782267570496, "epoch": 2.7851901267511674, "step": 8350 }, { "distill_loss": 0.40042489767074585, "epoch": 2.7851901267511674, "step": 8350 }, { "epoch": 2.7851901267511674, "ref_ce_loss": 0.2421387881040573, "step": 8350 }, { "epoch": 2.7851901267511674, "loss": 1.1778686046600342, "step": 8350 }, { "ce_loss": 0.22891831398010254, "epoch": 2.7851901267511674, "step": 8350 }, { "distill_loss": 0.37844356894493103, "epoch": 2.7851901267511674, "step": 8350 }, { "epoch": 2.7851901267511674, "ref_ce_loss": 0.20239907503128052, "step": 8350 }, { "epoch": 2.7885256837891927, "loss": 1.1012, "step": 8360 }, { "epoch": 2.7885256837891927, "grad_norm": 2.9142262935638428, "step": 8360 }, { "epoch": 2.7885256837891927, "learning_rate": 0.0006769778447884214, "step": 8360 }, { "epoch": 2.7885256837891927, "loss": 0.988903820514679, "step": 8360 }, { "ce_loss": 0.2615964412689209, "epoch": 2.7885256837891927, "step": 8360 }, { "distill_loss": 0.437855064868927, "epoch": 2.7885256837891927, "step": 8360 }, { "epoch": 2.7885256837891927, "ref_ce_loss": 0.17277489602565765, "step": 8360 }, { "epoch": 2.7885256837891927, "loss": 0.9511721730232239, "step": 8360 }, { "ce_loss": 0.26001596450805664, "epoch": 2.7885256837891927, "step": 8360 }, { "distill_loss": 0.46625223755836487, "epoch": 2.7885256837891927, "step": 8360 }, { "epoch": 2.7885256837891927, "ref_ce_loss": 0.22466371953487396, "step": 8360 }, { "epoch": 2.791861240827218, "loss": 1.0732, "step": 8370 }, { "epoch": 2.791861240827218, "grad_norm": 2.446507453918457, "step": 8370 }, { "epoch": 2.791861240827218, "learning_rate": 0.000676665913188123, "step": 8370 }, { "epoch": 2.791861240827218, "loss": 1.571579933166504, "step": 8370 }, { "ce_loss": 0.23533394932746887, "epoch": 2.791861240827218, "step": 8370 }, { "distill_loss": 0.4526551067829132, "epoch": 2.791861240827218, "step": 8370 }, { "epoch": 2.791861240827218, "ref_ce_loss": 0.20821493864059448, "step": 8370 }, { "epoch": 2.791861240827218, "loss": 1.2163251638412476, "step": 8370 }, { "ce_loss": 0.2753696143627167, "epoch": 2.791861240827218, "step": 8370 }, { "distill_loss": 0.42554229497909546, "epoch": 2.791861240827218, "step": 8370 }, { "epoch": 2.791861240827218, "ref_ce_loss": 0.23759661614894867, "step": 8370 }, { "epoch": 2.7951967978652434, "loss": 1.1284, "step": 8380 }, { "epoch": 2.7951967978652434, "grad_norm": 3.0520758628845215, "step": 8380 }, { "epoch": 2.7951967978652434, "learning_rate": 0.0006763536586889335, "step": 8380 }, { "epoch": 2.7951967978652434, "loss": 1.0454256534576416, "step": 8380 }, { "ce_loss": 0.29537466168403625, "epoch": 2.7951967978652434, "step": 8380 }, { "distill_loss": 0.39937081933021545, "epoch": 2.7951967978652434, "step": 8380 }, { "epoch": 2.7951967978652434, "ref_ce_loss": 0.24960553646087646, "step": 8380 }, { "epoch": 2.7951967978652434, "loss": 0.8441163897514343, "step": 8380 }, { "ce_loss": 0.24988539516925812, "epoch": 2.7951967978652434, "step": 8380 }, { "distill_loss": 0.29676154255867004, "epoch": 2.7951967978652434, "step": 8380 }, { "epoch": 2.7951967978652434, "ref_ce_loss": 0.18834713101387024, "step": 8380 }, { "epoch": 2.798532354903269, "loss": 1.0066, "step": 8390 }, { "epoch": 2.798532354903269, "grad_norm": 1.6638082265853882, "step": 8390 }, { "epoch": 2.798532354903269, "learning_rate": 0.0006760410816552874, "step": 8390 }, { "epoch": 2.798532354903269, "loss": 0.981625497341156, "step": 8390 }, { "ce_loss": 0.2807079553604126, "epoch": 2.798532354903269, "step": 8390 }, { "distill_loss": 0.37848836183547974, "epoch": 2.798532354903269, "step": 8390 }, { "epoch": 2.798532354903269, "ref_ce_loss": 0.20049092173576355, "step": 8390 }, { "epoch": 2.798532354903269, "loss": 0.9530798196792603, "step": 8390 }, { "ce_loss": 0.2755109965801239, "epoch": 2.798532354903269, "step": 8390 }, { "distill_loss": 0.42244938015937805, "epoch": 2.798532354903269, "step": 8390 }, { "epoch": 2.798532354903269, "ref_ce_loss": 0.18968243896961212, "step": 8390 }, { "epoch": 2.801867911941294, "loss": 1.0347, "step": 8400 }, { "epoch": 2.801867911941294, "grad_norm": 2.0404865741729736, "step": 8400 }, { "epoch": 2.801867911941294, "learning_rate": 0.0006757281824519958, "step": 8400 }, { "epoch": 2.801867911941294, "loss": 0.8567011952400208, "step": 8400 }, { "ce_loss": 0.1716955602169037, "epoch": 2.801867911941294, "step": 8400 }, { "distill_loss": 0.38128066062927246, "epoch": 2.801867911941294, "step": 8400 }, { "epoch": 2.801867911941294, "ref_ce_loss": 0.16394934058189392, "step": 8400 }, { "epoch": 2.801867911941294, "loss": 0.983532726764679, "step": 8400 }, { "ce_loss": 0.26755788922309875, "epoch": 2.801867911941294, "step": 8400 }, { "distill_loss": 0.5026155114173889, "epoch": 2.801867911941294, "step": 8400 }, { "epoch": 2.801867911941294, "ref_ce_loss": 0.21269215643405914, "step": 8400 }, { "epoch": 2.8052034689793195, "loss": 1.0684, "step": 8410 }, { "epoch": 2.8052034689793195, "grad_norm": 5.197111129760742, "step": 8410 }, { "epoch": 2.8052034689793195, "learning_rate": 0.0006754149614442457, "step": 8410 }, { "epoch": 2.8052034689793195, "loss": 0.7837503552436829, "step": 8410 }, { "ce_loss": 0.2317720502614975, "epoch": 2.8052034689793195, "step": 8410 }, { "distill_loss": 0.3728940486907959, "epoch": 2.8052034689793195, "step": 8410 }, { "epoch": 2.8052034689793195, "ref_ce_loss": 0.17860761284828186, "step": 8410 }, { "epoch": 2.8052034689793195, "loss": 0.89178866147995, "step": 8410 }, { "ce_loss": 0.22866491973400116, "epoch": 2.8052034689793195, "step": 8410 }, { "distill_loss": 0.46962761878967285, "epoch": 2.8052034689793195, "step": 8410 }, { "epoch": 2.8052034689793195, "ref_ce_loss": 0.1930232048034668, "step": 8410 }, { "epoch": 2.808539026017345, "loss": 1.1309, "step": 8420 }, { "epoch": 2.808539026017345, "grad_norm": 3.5530993938446045, "step": 8420 }, { "epoch": 2.808539026017345, "learning_rate": 0.0006751014189975995, "step": 8420 }, { "epoch": 2.808539026017345, "loss": 1.0465023517608643, "step": 8420 }, { "ce_loss": 0.2774046063423157, "epoch": 2.808539026017345, "step": 8420 }, { "distill_loss": 0.3714715242385864, "epoch": 2.808539026017345, "step": 8420 }, { "epoch": 2.808539026017345, "ref_ce_loss": 0.22385916113853455, "step": 8420 }, { "epoch": 2.808539026017345, "loss": 1.0804543495178223, "step": 8420 }, { "ce_loss": 0.2771798372268677, "epoch": 2.808539026017345, "step": 8420 }, { "distill_loss": 0.4003911018371582, "epoch": 2.808539026017345, "step": 8420 }, { "epoch": 2.808539026017345, "ref_ce_loss": 0.18247495591640472, "step": 8420 }, { "epoch": 2.81187458305537, "loss": 1.1442, "step": 8430 }, { "epoch": 2.81187458305537, "grad_norm": 3.6089842319488525, "step": 8430 }, { "epoch": 2.81187458305537, "learning_rate": 0.0006747875554779955, "step": 8430 }, { "epoch": 2.81187458305537, "loss": 1.0301923751831055, "step": 8430 }, { "ce_loss": 0.3191302716732025, "epoch": 2.81187458305537, "step": 8430 }, { "distill_loss": 0.394956111907959, "epoch": 2.81187458305537, "step": 8430 }, { "epoch": 2.81187458305537, "ref_ce_loss": 0.18678748607635498, "step": 8430 }, { "epoch": 2.81187458305537, "loss": 0.7958675026893616, "step": 8430 }, { "ce_loss": 0.2141757309436798, "epoch": 2.81187458305537, "step": 8430 }, { "distill_loss": 0.33181634545326233, "epoch": 2.81187458305537, "step": 8430 }, { "epoch": 2.81187458305537, "ref_ce_loss": 0.13455720245838165, "step": 8430 }, { "epoch": 2.8152101400933955, "loss": 0.9705, "step": 8440 }, { "epoch": 2.8152101400933955, "grad_norm": 2.648263931274414, "step": 8440 }, { "epoch": 2.8152101400933955, "learning_rate": 0.0006744733712517457, "step": 8440 }, { "epoch": 2.8152101400933955, "loss": 0.9274229407310486, "step": 8440 }, { "ce_loss": 0.2939596474170685, "epoch": 2.8152101400933955, "step": 8440 }, { "distill_loss": 0.42375898361206055, "epoch": 2.8152101400933955, "step": 8440 }, { "epoch": 2.8152101400933955, "ref_ce_loss": 0.1693873107433319, "step": 8440 }, { "epoch": 2.8152101400933955, "loss": 0.9380104541778564, "step": 8440 }, { "ce_loss": 0.3217563033103943, "epoch": 2.8152101400933955, "step": 8440 }, { "distill_loss": 0.3950371742248535, "epoch": 2.8152101400933955, "step": 8440 }, { "epoch": 2.8152101400933955, "ref_ce_loss": 0.22082169353961945, "step": 8440 }, { "epoch": 2.818545697131421, "loss": 1.0684, "step": 8450 }, { "epoch": 2.818545697131421, "grad_norm": 4.051844120025635, "step": 8450 }, { "epoch": 2.818545697131421, "learning_rate": 0.0006741588666855371, "step": 8450 }, { "epoch": 2.818545697131421, "loss": 1.2058062553405762, "step": 8450 }, { "ce_loss": 0.24386708438396454, "epoch": 2.818545697131421, "step": 8450 }, { "distill_loss": 0.4271876811981201, "epoch": 2.818545697131421, "step": 8450 }, { "epoch": 2.818545697131421, "ref_ce_loss": 0.1798713207244873, "step": 8450 }, { "epoch": 2.818545697131421, "loss": 0.9697070121765137, "step": 8450 }, { "ce_loss": 0.2771538496017456, "epoch": 2.818545697131421, "step": 8450 }, { "distill_loss": 0.3977970480918884, "epoch": 2.818545697131421, "step": 8450 }, { "epoch": 2.818545697131421, "ref_ce_loss": 0.18675166368484497, "step": 8450 }, { "epoch": 2.8218812541694462, "loss": 1.0559, "step": 8460 }, { "epoch": 2.8218812541694462, "grad_norm": 4.089424133300781, "step": 8460 }, { "epoch": 2.8218812541694462, "learning_rate": 0.0006738440421464305, "step": 8460 }, { "epoch": 2.8218812541694462, "loss": 0.9530472159385681, "step": 8460 }, { "ce_loss": 0.2687753140926361, "epoch": 2.8218812541694462, "step": 8460 }, { "distill_loss": 0.4895554482936859, "epoch": 2.8218812541694462, "step": 8460 }, { "epoch": 2.8218812541694462, "ref_ce_loss": 0.19376049935817719, "step": 8460 }, { "epoch": 2.8218812541694462, "loss": 1.1800388097763062, "step": 8460 }, { "ce_loss": 0.32920578122138977, "epoch": 2.8218812541694462, "step": 8460 }, { "distill_loss": 0.47976043820381165, "epoch": 2.8218812541694462, "step": 8460 }, { "epoch": 2.8218812541694462, "ref_ce_loss": 0.2260797768831253, "step": 8460 }, { "epoch": 2.8252168112074716, "loss": 1.0713, "step": 8470 }, { "epoch": 2.8252168112074716, "grad_norm": 2.832854747772217, "step": 8470 }, { "epoch": 2.8252168112074716, "learning_rate": 0.0006735288980018597, "step": 8470 }, { "epoch": 2.8252168112074716, "loss": 1.5145750045776367, "step": 8470 }, { "ce_loss": 0.22480595111846924, "epoch": 2.8252168112074716, "step": 8470 }, { "distill_loss": 0.36506372690200806, "epoch": 2.8252168112074716, "step": 8470 }, { "epoch": 2.8252168112074716, "ref_ce_loss": 0.24038757383823395, "step": 8470 }, { "epoch": 2.8252168112074716, "loss": 0.9549174904823303, "step": 8470 }, { "ce_loss": 0.25437024235725403, "epoch": 2.8252168112074716, "step": 8470 }, { "distill_loss": 0.42316046357154846, "epoch": 2.8252168112074716, "step": 8470 }, { "epoch": 2.8252168112074716, "ref_ce_loss": 0.2307279109954834, "step": 8470 }, { "epoch": 2.828552368245497, "loss": 1.0884, "step": 8480 }, { "epoch": 2.828552368245497, "grad_norm": 1.985348105430603, "step": 8480 }, { "epoch": 2.828552368245497, "learning_rate": 0.000673213434619632, "step": 8480 }, { "epoch": 2.828552368245497, "loss": 1.0262728929519653, "step": 8480 }, { "ce_loss": 0.28059670329093933, "epoch": 2.828552368245497, "step": 8480 }, { "distill_loss": 0.40128839015960693, "epoch": 2.828552368245497, "step": 8480 }, { "epoch": 2.828552368245497, "ref_ce_loss": 0.21130184829235077, "step": 8480 }, { "epoch": 2.828552368245497, "loss": 0.8576571345329285, "step": 8480 }, { "ce_loss": 0.25547927618026733, "epoch": 2.828552368245497, "step": 8480 }, { "distill_loss": 0.3276759684085846, "epoch": 2.828552368245497, "step": 8480 }, { "epoch": 2.828552368245497, "ref_ce_loss": 0.24074243009090424, "step": 8480 }, { "epoch": 2.8318879252835223, "loss": 0.9914, "step": 8490 }, { "epoch": 2.8318879252835223, "grad_norm": 1.8692268133163452, "step": 8490 }, { "epoch": 2.8318879252835223, "learning_rate": 0.0006728976523679272, "step": 8490 }, { "epoch": 2.8318879252835223, "loss": 0.9765045046806335, "step": 8490 }, { "ce_loss": 0.32396554946899414, "epoch": 2.8318879252835223, "step": 8490 }, { "distill_loss": 0.4324900507926941, "epoch": 2.8318879252835223, "step": 8490 }, { "epoch": 2.8318879252835223, "ref_ce_loss": 0.21977542340755463, "step": 8490 }, { "epoch": 2.8318879252835223, "loss": 1.1541446447372437, "step": 8490 }, { "ce_loss": 0.2842647433280945, "epoch": 2.8318879252835223, "step": 8490 }, { "distill_loss": 0.47232362627983093, "epoch": 2.8318879252835223, "step": 8490 }, { "epoch": 2.8318879252835223, "ref_ce_loss": 0.24020341038703918, "step": 8490 }, { "epoch": 2.8352234823215476, "loss": 1.0561, "step": 8500 }, { "epoch": 2.8352234823215476, "grad_norm": 1.4203295707702637, "step": 8500 }, { "epoch": 2.8352234823215476, "learning_rate": 0.0006725815516152972, "step": 8500 }, { "epoch": 2.8352234823215476, "loss": 0.919762909412384, "step": 8500 }, { "ce_loss": 0.2890332043170929, "epoch": 2.8352234823215476, "step": 8500 }, { "distill_loss": 0.37516260147094727, "epoch": 2.8352234823215476, "step": 8500 }, { "epoch": 2.8352234823215476, "ref_ce_loss": 0.1926632821559906, "step": 8500 }, { "epoch": 2.8352234823215476, "loss": 1.1556134223937988, "step": 8500 }, { "ce_loss": 0.30083900690078735, "epoch": 2.8352234823215476, "step": 8500 }, { "distill_loss": 0.4435248374938965, "epoch": 2.8352234823215476, "step": 8500 }, { "epoch": 2.8352234823215476, "ref_ce_loss": 0.21375201642513275, "step": 8500 }, { "epoch": 2.838559039359573, "loss": 1.0273, "step": 8510 }, { "epoch": 2.838559039359573, "grad_norm": 1.6670829057693481, "step": 8510 }, { "epoch": 2.838559039359573, "learning_rate": 0.0006722651327306654, "step": 8510 }, { "epoch": 2.838559039359573, "loss": 0.9069607257843018, "step": 8510 }, { "ce_loss": 0.29646047949790955, "epoch": 2.838559039359573, "step": 8510 }, { "distill_loss": 0.38407665491104126, "epoch": 2.838559039359573, "step": 8510 }, { "epoch": 2.838559039359573, "ref_ce_loss": 0.22544215619564056, "step": 8510 }, { "epoch": 2.838559039359573, "loss": 1.033137321472168, "step": 8510 }, { "ce_loss": 0.28312405943870544, "epoch": 2.838559039359573, "step": 8510 }, { "distill_loss": 0.44431746006011963, "epoch": 2.838559039359573, "step": 8510 }, { "epoch": 2.838559039359573, "ref_ce_loss": 0.24237962067127228, "step": 8510 }, { "epoch": 2.8418945963975983, "loss": 1.0609, "step": 8520 }, { "epoch": 2.8418945963975983, "grad_norm": 2.3371927738189697, "step": 8520 }, { "epoch": 2.8418945963975983, "learning_rate": 0.0006719483960833267, "step": 8520 }, { "epoch": 2.8418945963975983, "loss": 1.079877257347107, "step": 8520 }, { "ce_loss": 0.2666374146938324, "epoch": 2.8418945963975983, "step": 8520 }, { "distill_loss": 0.47163528203964233, "epoch": 2.8418945963975983, "step": 8520 }, { "epoch": 2.8418945963975983, "ref_ce_loss": 0.2189643830060959, "step": 8520 }, { "epoch": 2.8418945963975983, "loss": 1.252240777015686, "step": 8520 }, { "ce_loss": 0.3274886906147003, "epoch": 2.8418945963975983, "step": 8520 }, { "distill_loss": 0.48647746443748474, "epoch": 2.8418945963975983, "step": 8520 }, { "epoch": 2.8418945963975983, "ref_ce_loss": 0.19743898510932922, "step": 8520 }, { "epoch": 2.8452301534356237, "loss": 1.1152, "step": 8530 }, { "epoch": 2.8452301534356237, "grad_norm": 3.188063383102417, "step": 8530 }, { "epoch": 2.8452301534356237, "learning_rate": 0.0006716313420429469, "step": 8530 }, { "epoch": 2.8452301534356237, "loss": 1.0578892230987549, "step": 8530 }, { "ce_loss": 0.27939069271087646, "epoch": 2.8452301534356237, "step": 8530 }, { "distill_loss": 0.41141924262046814, "epoch": 2.8452301534356237, "step": 8530 }, { "epoch": 2.8452301534356237, "ref_ce_loss": 0.23937836289405823, "step": 8530 }, { "epoch": 2.8452301534356237, "loss": 1.0810478925704956, "step": 8530 }, { "ce_loss": 0.2958105802536011, "epoch": 2.8452301534356237, "step": 8530 }, { "distill_loss": 0.4896780252456665, "epoch": 2.8452301534356237, "step": 8530 }, { "epoch": 2.8452301534356237, "ref_ce_loss": 0.22165045142173767, "step": 8530 }, { "epoch": 2.848565710473649, "loss": 1.0383, "step": 8540 }, { "epoch": 2.848565710473649, "grad_norm": 2.313164234161377, "step": 8540 }, { "epoch": 2.848565710473649, "learning_rate": 0.0006713139709795621, "step": 8540 }, { "epoch": 2.848565710473649, "loss": 1.241434931755066, "step": 8540 }, { "ce_loss": 0.3561538755893707, "epoch": 2.848565710473649, "step": 8540 }, { "distill_loss": 0.5506157279014587, "epoch": 2.848565710473649, "step": 8540 }, { "epoch": 2.848565710473649, "ref_ce_loss": 0.2594704031944275, "step": 8540 }, { "epoch": 2.848565710473649, "loss": 0.9683080911636353, "step": 8540 }, { "ce_loss": 0.25911006331443787, "epoch": 2.848565710473649, "step": 8540 }, { "distill_loss": 0.41315120458602905, "epoch": 2.848565710473649, "step": 8540 }, { "epoch": 2.848565710473649, "ref_ce_loss": 0.1843436062335968, "step": 8540 }, { "epoch": 2.8519012675116744, "loss": 1.0649, "step": 8550 }, { "epoch": 2.8519012675116744, "grad_norm": 1.7151241302490234, "step": 8550 }, { "epoch": 2.8519012675116744, "learning_rate": 0.0006709962832635789, "step": 8550 }, { "epoch": 2.8519012675116744, "loss": 1.2007484436035156, "step": 8550 }, { "ce_loss": 0.27435940504074097, "epoch": 2.8519012675116744, "step": 8550 }, { "distill_loss": 0.5308864116668701, "epoch": 2.8519012675116744, "step": 8550 }, { "epoch": 2.8519012675116744, "ref_ce_loss": 0.22154894471168518, "step": 8550 }, { "epoch": 2.8519012675116744, "loss": 1.5852737426757812, "step": 8550 }, { "ce_loss": 0.2574402391910553, "epoch": 2.8519012675116744, "step": 8550 }, { "distill_loss": 0.3926985263824463, "epoch": 2.8519012675116744, "step": 8550 }, { "epoch": 2.8519012675116744, "ref_ce_loss": 0.2215213030576706, "step": 8550 }, { "epoch": 2.8552368245496997, "loss": 1.1423, "step": 8560 }, { "epoch": 2.8552368245496997, "grad_norm": 2.2228615283966064, "step": 8560 }, { "epoch": 2.8552368245496997, "learning_rate": 0.0006706782792657725, "step": 8560 }, { "epoch": 2.8552368245496997, "loss": 1.118459939956665, "step": 8560 }, { "ce_loss": 0.3260303735733032, "epoch": 2.8552368245496997, "step": 8560 }, { "distill_loss": 0.48777714371681213, "epoch": 2.8552368245496997, "step": 8560 }, { "epoch": 2.8552368245496997, "ref_ce_loss": 0.23019662499427795, "step": 8560 }, { "epoch": 2.8552368245496997, "loss": 0.9366552233695984, "step": 8560 }, { "ce_loss": 0.2822265326976776, "epoch": 2.8552368245496997, "step": 8560 }, { "distill_loss": 0.4189026951789856, "epoch": 2.8552368245496997, "step": 8560 }, { "epoch": 2.8552368245496997, "ref_ce_loss": 0.23510710895061493, "step": 8560 }, { "epoch": 2.858572381587725, "loss": 1.0516, "step": 8570 }, { "epoch": 2.858572381587725, "grad_norm": 1.732473373413086, "step": 8570 }, { "epoch": 2.858572381587725, "learning_rate": 0.0006703599593572881, "step": 8570 }, { "epoch": 2.858572381587725, "loss": 1.032302975654602, "step": 8570 }, { "ce_loss": 0.29478833079338074, "epoch": 2.858572381587725, "step": 8570 }, { "distill_loss": 0.5042816400527954, "epoch": 2.858572381587725, "step": 8570 }, { "epoch": 2.858572381587725, "ref_ce_loss": 0.1814073920249939, "step": 8570 }, { "epoch": 2.858572381587725, "loss": 0.9531590938568115, "step": 8570 }, { "ce_loss": 0.20203471183776855, "epoch": 2.858572381587725, "step": 8570 }, { "distill_loss": 0.4454876184463501, "epoch": 2.858572381587725, "step": 8570 }, { "epoch": 2.858572381587725, "ref_ce_loss": 0.24680931866168976, "step": 8570 }, { "epoch": 2.8619079386257504, "loss": 1.0961, "step": 8580 }, { "epoch": 2.8619079386257504, "grad_norm": 1.977342963218689, "step": 8580 }, { "epoch": 2.8619079386257504, "learning_rate": 0.000670041323909639, "step": 8580 }, { "epoch": 2.8619079386257504, "loss": 1.0410369634628296, "step": 8580 }, { "ce_loss": 0.3173748850822449, "epoch": 2.8619079386257504, "step": 8580 }, { "distill_loss": 0.4767785966396332, "epoch": 2.8619079386257504, "step": 8580 }, { "epoch": 2.8619079386257504, "ref_ce_loss": 0.2090308666229248, "step": 8580 }, { "epoch": 2.8619079386257504, "loss": 0.8746610879898071, "step": 8580 }, { "ce_loss": 0.216957688331604, "epoch": 2.8619079386257504, "step": 8580 }, { "distill_loss": 0.4328778386116028, "epoch": 2.8619079386257504, "step": 8580 }, { "epoch": 2.8619079386257504, "ref_ce_loss": 0.15189319849014282, "step": 8580 }, { "epoch": 2.865243495663776, "loss": 1.107, "step": 8590 }, { "epoch": 2.865243495663776, "grad_norm": 1.9085386991500854, "step": 8590 }, { "epoch": 2.865243495663776, "learning_rate": 0.0006697223732947075, "step": 8590 }, { "epoch": 2.865243495663776, "loss": 1.0445042848587036, "step": 8590 }, { "ce_loss": 0.2784845530986786, "epoch": 2.865243495663776, "step": 8590 }, { "distill_loss": 0.4100443422794342, "epoch": 2.865243495663776, "step": 8590 }, { "epoch": 2.865243495663776, "ref_ce_loss": 0.22100576758384705, "step": 8590 }, { "epoch": 2.865243495663776, "loss": 1.0747194290161133, "step": 8590 }, { "ce_loss": 0.25558221340179443, "epoch": 2.865243495663776, "step": 8590 }, { "distill_loss": 0.36318057775497437, "epoch": 2.865243495663776, "step": 8590 }, { "epoch": 2.865243495663776, "ref_ce_loss": 0.17570355534553528, "step": 8590 }, { "epoch": 2.868579052701801, "loss": 1.063, "step": 8600 }, { "epoch": 2.868579052701801, "grad_norm": 2.178400754928589, "step": 8600 }, { "epoch": 2.868579052701801, "learning_rate": 0.000669403107884743, "step": 8600 }, { "epoch": 2.868579052701801, "loss": 0.9615577459335327, "step": 8600 }, { "ce_loss": 0.2424328625202179, "epoch": 2.868579052701801, "step": 8600 }, { "distill_loss": 0.3830501139163971, "epoch": 2.868579052701801, "step": 8600 }, { "epoch": 2.868579052701801, "ref_ce_loss": 0.28199291229248047, "step": 8600 }, { "epoch": 2.868579052701801, "loss": 1.2277705669403076, "step": 8600 }, { "ce_loss": 0.2811940610408783, "epoch": 2.868579052701801, "step": 8600 }, { "distill_loss": 0.35547733306884766, "epoch": 2.868579052701801, "step": 8600 }, { "epoch": 2.868579052701801, "ref_ce_loss": 0.23016276955604553, "step": 8600 }, { "epoch": 2.8719146097398265, "loss": 1.081, "step": 8610 }, { "epoch": 2.8719146097398265, "grad_norm": 2.179974317550659, "step": 8610 }, { "epoch": 2.8719146097398265, "learning_rate": 0.0006690835280523624, "step": 8610 }, { "epoch": 2.8719146097398265, "loss": 1.0438921451568604, "step": 8610 }, { "ce_loss": 0.27640753984451294, "epoch": 2.8719146097398265, "step": 8610 }, { "distill_loss": 0.43892744183540344, "epoch": 2.8719146097398265, "step": 8610 }, { "epoch": 2.8719146097398265, "ref_ce_loss": 0.2078024297952652, "step": 8610 }, { "epoch": 2.8719146097398265, "loss": 1.0383868217468262, "step": 8610 }, { "ce_loss": 0.3272160589694977, "epoch": 2.8719146097398265, "step": 8610 }, { "distill_loss": 0.3793815076351166, "epoch": 2.8719146097398265, "step": 8610 }, { "epoch": 2.8719146097398265, "ref_ce_loss": 0.24669574201107025, "step": 8610 }, { "epoch": 2.875250166777852, "loss": 0.9902, "step": 8620 }, { "epoch": 2.875250166777852, "grad_norm": 2.012612819671631, "step": 8620 }, { "epoch": 2.875250166777852, "learning_rate": 0.0006687636341705501, "step": 8620 }, { "epoch": 2.875250166777852, "loss": 0.9611943364143372, "step": 8620 }, { "ce_loss": 0.2872672379016876, "epoch": 2.875250166777852, "step": 8620 }, { "distill_loss": 0.36733901500701904, "epoch": 2.875250166777852, "step": 8620 }, { "epoch": 2.875250166777852, "ref_ce_loss": 0.2083406001329422, "step": 8620 }, { "epoch": 2.875250166777852, "loss": 0.9080339670181274, "step": 8620 }, { "ce_loss": 0.20383448898792267, "epoch": 2.875250166777852, "step": 8620 }, { "distill_loss": 0.32772207260131836, "epoch": 2.875250166777852, "step": 8620 }, { "epoch": 2.875250166777852, "ref_ce_loss": 0.15557782351970673, "step": 8620 }, { "epoch": 2.878585723815877, "loss": 0.9701, "step": 8630 }, { "epoch": 2.878585723815877, "grad_norm": 2.836735725402832, "step": 8630 }, { "epoch": 2.878585723815877, "learning_rate": 0.0006684434266126566, "step": 8630 }, { "epoch": 2.878585723815877, "loss": 1.122562050819397, "step": 8630 }, { "ce_loss": 0.35167136788368225, "epoch": 2.878585723815877, "step": 8630 }, { "distill_loss": 0.40869832038879395, "epoch": 2.878585723815877, "step": 8630 }, { "epoch": 2.878585723815877, "ref_ce_loss": 0.27053526043891907, "step": 8630 }, { "epoch": 2.878585723815877, "loss": 0.8298502564430237, "step": 8630 }, { "ce_loss": 0.23591288924217224, "epoch": 2.878585723815877, "step": 8630 }, { "distill_loss": 0.3813800513744354, "epoch": 2.878585723815877, "step": 8630 }, { "epoch": 2.878585723815877, "ref_ce_loss": 0.18113718926906586, "step": 8630 }, { "epoch": 2.8819212808539025, "loss": 1.0493, "step": 8640 }, { "epoch": 2.8819212808539025, "grad_norm": 1.8520641326904297, "step": 8640 }, { "epoch": 2.8819212808539025, "learning_rate": 0.0006681229057523986, "step": 8640 }, { "epoch": 2.8819212808539025, "loss": 1.0482269525527954, "step": 8640 }, { "ce_loss": 0.24984189867973328, "epoch": 2.8819212808539025, "step": 8640 }, { "distill_loss": 0.45180055499076843, "epoch": 2.8819212808539025, "step": 8640 }, { "epoch": 2.8819212808539025, "ref_ce_loss": 0.20239730179309845, "step": 8640 }, { "epoch": 2.8819212808539025, "loss": 0.9643755555152893, "step": 8640 }, { "ce_loss": 0.2628970742225647, "epoch": 2.8819212808539025, "step": 8640 }, { "distill_loss": 0.405941903591156, "epoch": 2.8819212808539025, "step": 8640 }, { "epoch": 2.8819212808539025, "ref_ce_loss": 0.2428521364927292, "step": 8640 }, { "epoch": 2.885256837891928, "loss": 1.087, "step": 8650 }, { "epoch": 2.885256837891928, "grad_norm": 3.146603584289551, "step": 8650 }, { "epoch": 2.885256837891928, "learning_rate": 0.0006678020719638582, "step": 8650 }, { "epoch": 2.885256837891928, "loss": 0.8171414136886597, "step": 8650 }, { "ce_loss": 0.24824285507202148, "epoch": 2.885256837891928, "step": 8650 }, { "distill_loss": 0.38094547390937805, "epoch": 2.885256837891928, "step": 8650 }, { "epoch": 2.885256837891928, "ref_ce_loss": 0.18764515221118927, "step": 8650 }, { "epoch": 2.885256837891928, "loss": 0.9321733117103577, "step": 8650 }, { "ce_loss": 0.2584330439567566, "epoch": 2.885256837891928, "step": 8650 }, { "distill_loss": 0.47445717453956604, "epoch": 2.885256837891928, "step": 8650 }, { "epoch": 2.885256837891928, "ref_ce_loss": 0.19905956089496613, "step": 8650 }, { "epoch": 2.8885923949299532, "loss": 1.0679, "step": 8660 }, { "epoch": 2.8885923949299532, "grad_norm": 2.0428476333618164, "step": 8660 }, { "epoch": 2.8885923949299532, "learning_rate": 0.0006674809256214832, "step": 8660 }, { "epoch": 2.8885923949299532, "loss": 1.0615755319595337, "step": 8660 }, { "ce_loss": 0.3499814569950104, "epoch": 2.8885923949299532, "step": 8660 }, { "distill_loss": 0.40423911809921265, "epoch": 2.8885923949299532, "step": 8660 }, { "epoch": 2.8885923949299532, "ref_ce_loss": 0.23165345191955566, "step": 8660 }, { "epoch": 2.8885923949299532, "loss": 0.8028374314308167, "step": 8660 }, { "ce_loss": 0.23720194399356842, "epoch": 2.8885923949299532, "step": 8660 }, { "distill_loss": 0.30180805921554565, "epoch": 2.8885923949299532, "step": 8660 }, { "epoch": 2.8885923949299532, "ref_ce_loss": 0.2078477293252945, "step": 8660 }, { "epoch": 2.8919279519679786, "loss": 1.1032, "step": 8670 }, { "epoch": 2.8919279519679786, "grad_norm": 4.174248218536377, "step": 8670 }, { "epoch": 2.8919279519679786, "learning_rate": 0.0006671594671000859, "step": 8670 }, { "epoch": 2.8919279519679786, "loss": 1.3050706386566162, "step": 8670 }, { "ce_loss": 0.29188674688339233, "epoch": 2.8919279519679786, "step": 8670 }, { "distill_loss": 0.47933465242385864, "epoch": 2.8919279519679786, "step": 8670 }, { "epoch": 2.8919279519679786, "ref_ce_loss": 0.24941763281822205, "step": 8670 }, { "epoch": 2.8919279519679786, "loss": 0.9996240139007568, "step": 8670 }, { "ce_loss": 0.23974673449993134, "epoch": 2.8919279519679786, "step": 8670 }, { "distill_loss": 0.4903826415538788, "epoch": 2.8919279519679786, "step": 8670 }, { "epoch": 2.8919279519679786, "ref_ce_loss": 0.20120561122894287, "step": 8670 }, { "epoch": 2.895263509006004, "loss": 1.0501, "step": 8680 }, { "epoch": 2.895263509006004, "grad_norm": 2.0826456546783447, "step": 8680 }, { "epoch": 2.895263509006004, "learning_rate": 0.000666837696774843, "step": 8680 }, { "epoch": 2.895263509006004, "loss": 1.132706880569458, "step": 8680 }, { "ce_loss": 0.2618308663368225, "epoch": 2.895263509006004, "step": 8680 }, { "distill_loss": 0.40235596895217896, "epoch": 2.895263509006004, "step": 8680 }, { "epoch": 2.895263509006004, "ref_ce_loss": 0.16059039533138275, "step": 8680 }, { "epoch": 2.895263509006004, "loss": 1.1171619892120361, "step": 8680 }, { "ce_loss": 0.28033655881881714, "epoch": 2.895263509006004, "step": 8680 }, { "distill_loss": 0.3423466980457306, "epoch": 2.895263509006004, "step": 8680 }, { "epoch": 2.895263509006004, "ref_ce_loss": 0.23442880809307098, "step": 8680 }, { "epoch": 2.8985990660440293, "loss": 1.0142, "step": 8690 }, { "epoch": 2.8985990660440293, "grad_norm": 1.619909644126892, "step": 8690 }, { "epoch": 2.8985990660440293, "learning_rate": 0.000666515615021295, "step": 8690 }, { "epoch": 2.8985990660440293, "loss": 0.980444610118866, "step": 8690 }, { "ce_loss": 0.3060693144798279, "epoch": 2.8985990660440293, "step": 8690 }, { "distill_loss": 0.3914102613925934, "epoch": 2.8985990660440293, "step": 8690 }, { "epoch": 2.8985990660440293, "ref_ce_loss": 0.21149593591690063, "step": 8690 }, { "epoch": 2.8985990660440293, "loss": 0.8747573494911194, "step": 8690 }, { "ce_loss": 0.2622194290161133, "epoch": 2.8985990660440293, "step": 8690 }, { "distill_loss": 0.35515880584716797, "epoch": 2.8985990660440293, "step": 8690 }, { "epoch": 2.8985990660440293, "ref_ce_loss": 0.20144023001194, "step": 8690 }, { "epoch": 2.9019346230820546, "loss": 1.0804, "step": 8700 }, { "epoch": 2.9019346230820546, "grad_norm": 1.6870344877243042, "step": 8700 }, { "epoch": 2.9019346230820546, "learning_rate": 0.0006661932222153459, "step": 8700 }, { "epoch": 2.9019346230820546, "loss": 0.8705916404724121, "step": 8700 }, { "ce_loss": 0.25805947184562683, "epoch": 2.9019346230820546, "step": 8700 }, { "distill_loss": 0.420367568731308, "epoch": 2.9019346230820546, "step": 8700 }, { "epoch": 2.9019346230820546, "ref_ce_loss": 0.1892354041337967, "step": 8700 }, { "epoch": 2.9019346230820546, "loss": 0.881604790687561, "step": 8700 }, { "ce_loss": 0.23483748733997345, "epoch": 2.9019346230820546, "step": 8700 }, { "distill_loss": 0.4109418988227844, "epoch": 2.9019346230820546, "step": 8700 }, { "epoch": 2.9019346230820546, "ref_ce_loss": 0.18561656773090363, "step": 8700 }, { "epoch": 2.90527018012008, "loss": 1.0964, "step": 8710 }, { "epoch": 2.90527018012008, "grad_norm": 2.7310404777526855, "step": 8710 }, { "epoch": 2.90527018012008, "learning_rate": 0.0006658705187332629, "step": 8710 }, { "epoch": 2.90527018012008, "loss": 0.9659359455108643, "step": 8710 }, { "ce_loss": 0.20541168749332428, "epoch": 2.90527018012008, "step": 8710 }, { "distill_loss": 0.39143872261047363, "epoch": 2.90527018012008, "step": 8710 }, { "epoch": 2.90527018012008, "ref_ce_loss": 0.23953206837177277, "step": 8710 }, { "epoch": 2.90527018012008, "loss": 1.1124589443206787, "step": 8710 }, { "ce_loss": 0.2792068123817444, "epoch": 2.90527018012008, "step": 8710 }, { "distill_loss": 0.5193973183631897, "epoch": 2.90527018012008, "step": 8710 }, { "epoch": 2.90527018012008, "ref_ce_loss": 0.23650509119033813, "step": 8710 }, { "epoch": 2.9086057371581053, "loss": 1.0421, "step": 8720 }, { "epoch": 2.9086057371581053, "grad_norm": 1.4814128875732422, "step": 8720 }, { "epoch": 2.9086057371581053, "learning_rate": 0.0006655475049516757, "step": 8720 }, { "epoch": 2.9086057371581053, "loss": 1.0588215589523315, "step": 8720 }, { "ce_loss": 0.2646660804748535, "epoch": 2.9086057371581053, "step": 8720 }, { "distill_loss": 0.49552783370018005, "epoch": 2.9086057371581053, "step": 8720 }, { "epoch": 2.9086057371581053, "ref_ce_loss": 0.21849878132343292, "step": 8720 }, { "epoch": 2.9086057371581053, "loss": 0.7920655608177185, "step": 8720 }, { "ce_loss": 0.22781747579574585, "epoch": 2.9086057371581053, "step": 8720 }, { "distill_loss": 0.3877447545528412, "epoch": 2.9086057371581053, "step": 8720 }, { "epoch": 2.9086057371581053, "ref_ce_loss": 0.17634247243404388, "step": 8720 }, { "epoch": 2.9119412941961307, "loss": 1.0592, "step": 8730 }, { "epoch": 2.9119412941961307, "grad_norm": 1.987634539604187, "step": 8730 }, { "epoch": 2.9119412941961307, "learning_rate": 0.0006652241812475762, "step": 8730 }, { "epoch": 2.9119412941961307, "loss": 1.0713099241256714, "step": 8730 }, { "ce_loss": 0.22051984071731567, "epoch": 2.9119412941961307, "step": 8730 }, { "distill_loss": 0.35058051347732544, "epoch": 2.9119412941961307, "step": 8730 }, { "epoch": 2.9119412941961307, "ref_ce_loss": 0.18141403794288635, "step": 8730 }, { "epoch": 2.9119412941961307, "loss": 1.0249351263046265, "step": 8730 }, { "ce_loss": 0.22720232605934143, "epoch": 2.9119412941961307, "step": 8730 }, { "distill_loss": 0.39575693011283875, "epoch": 2.9119412941961307, "step": 8730 }, { "epoch": 2.9119412941961307, "ref_ce_loss": 0.22523285448551178, "step": 8730 }, { "epoch": 2.915276851234156, "loss": 0.9748, "step": 8740 }, { "epoch": 2.915276851234156, "grad_norm": 1.5860884189605713, "step": 8740 }, { "epoch": 2.915276851234156, "learning_rate": 0.0006649005479983179, "step": 8740 }, { "epoch": 2.915276851234156, "loss": 0.8226218819618225, "step": 8740 }, { "ce_loss": 0.20729947090148926, "epoch": 2.915276851234156, "step": 8740 }, { "distill_loss": 0.4032948911190033, "epoch": 2.915276851234156, "step": 8740 }, { "epoch": 2.915276851234156, "ref_ce_loss": 0.16638141870498657, "step": 8740 }, { "epoch": 2.915276851234156, "loss": 0.76434725522995, "step": 8740 }, { "ce_loss": 0.21286216378211975, "epoch": 2.915276851234156, "step": 8740 }, { "distill_loss": 0.3148097097873688, "epoch": 2.915276851234156, "step": 8740 }, { "epoch": 2.915276851234156, "ref_ce_loss": 0.17197924852371216, "step": 8740 }, { "epoch": 2.9186124082721814, "loss": 0.9997, "step": 8750 }, { "epoch": 2.9186124082721814, "grad_norm": 2.5379202365875244, "step": 8750 }, { "epoch": 2.9186124082721814, "learning_rate": 0.0006645766055816155, "step": 8750 }, { "epoch": 2.9186124082721814, "loss": 1.0224852561950684, "step": 8750 }, { "ce_loss": 0.24008406698703766, "epoch": 2.9186124082721814, "step": 8750 }, { "distill_loss": 0.3388434946537018, "epoch": 2.9186124082721814, "step": 8750 }, { "epoch": 2.9186124082721814, "ref_ce_loss": 0.17256751656532288, "step": 8750 }, { "epoch": 2.9186124082721814, "loss": 1.160400390625, "step": 8750 }, { "ce_loss": 0.38929951190948486, "epoch": 2.9186124082721814, "step": 8750 }, { "distill_loss": 0.5098603963851929, "epoch": 2.9186124082721814, "step": 8750 }, { "epoch": 2.9186124082721814, "ref_ce_loss": 0.2422654628753662, "step": 8750 }, { "epoch": 2.9219479653102067, "loss": 1.0708, "step": 8760 }, { "epoch": 2.9219479653102067, "grad_norm": 2.1415038108825684, "step": 8760 }, { "epoch": 2.9219479653102067, "learning_rate": 0.0006642523543755449, "step": 8760 }, { "epoch": 2.9219479653102067, "loss": 0.8797162175178528, "step": 8760 }, { "ce_loss": 0.2512281537055969, "epoch": 2.9219479653102067, "step": 8760 }, { "distill_loss": 0.3936854600906372, "epoch": 2.9219479653102067, "step": 8760 }, { "epoch": 2.9219479653102067, "ref_ce_loss": 0.1777983009815216, "step": 8760 }, { "epoch": 2.9219479653102067, "loss": 0.9649273753166199, "step": 8760 }, { "ce_loss": 0.23668411374092102, "epoch": 2.9219479653102067, "step": 8760 }, { "distill_loss": 0.4126769006252289, "epoch": 2.9219479653102067, "step": 8760 }, { "epoch": 2.9219479653102067, "ref_ce_loss": 0.23348350822925568, "step": 8760 }, { "epoch": 2.925283522348232, "loss": 1.0869, "step": 8770 }, { "epoch": 2.925283522348232, "grad_norm": 1.9545202255249023, "step": 8770 }, { "epoch": 2.925283522348232, "learning_rate": 0.0006639277947585419, "step": 8770 }, { "epoch": 2.925283522348232, "loss": 1.432930588722229, "step": 8770 }, { "ce_loss": 0.3243611454963684, "epoch": 2.925283522348232, "step": 8770 }, { "distill_loss": 0.5498109459877014, "epoch": 2.925283522348232, "step": 8770 }, { "epoch": 2.925283522348232, "ref_ce_loss": 0.27567988634109497, "step": 8770 }, { "epoch": 2.925283522348232, "loss": 1.0950384140014648, "step": 8770 }, { "ce_loss": 0.2891460359096527, "epoch": 2.925283522348232, "step": 8770 }, { "distill_loss": 0.544788122177124, "epoch": 2.925283522348232, "step": 8770 }, { "epoch": 2.925283522348232, "ref_ce_loss": 0.20279133319854736, "step": 8770 }, { "epoch": 2.9286190793862574, "loss": 1.062, "step": 8780 }, { "epoch": 2.9286190793862574, "grad_norm": 1.8844976425170898, "step": 8780 }, { "epoch": 2.9286190793862574, "learning_rate": 0.0006636029271094026, "step": 8780 }, { "epoch": 2.9286190793862574, "loss": 0.8158213496208191, "step": 8780 }, { "ce_loss": 0.24880477786064148, "epoch": 2.9286190793862574, "step": 8780 }, { "distill_loss": 0.3861525058746338, "epoch": 2.9286190793862574, "step": 8780 }, { "epoch": 2.9286190793862574, "ref_ce_loss": 0.17992152273654938, "step": 8780 }, { "epoch": 2.9286190793862574, "loss": 0.9338173270225525, "step": 8780 }, { "ce_loss": 0.27490976452827454, "epoch": 2.9286190793862574, "step": 8780 }, { "distill_loss": 0.4086349904537201, "epoch": 2.9286190793862574, "step": 8780 }, { "epoch": 2.9286190793862574, "ref_ce_loss": 0.19639791548252106, "step": 8780 }, { "epoch": 2.931954636424283, "loss": 0.9566, "step": 8790 }, { "epoch": 2.931954636424283, "grad_norm": 1.5922753810882568, "step": 8790 }, { "epoch": 2.931954636424283, "learning_rate": 0.0006632777518072826, "step": 8790 }, { "epoch": 2.931954636424283, "loss": 1.1115561723709106, "step": 8790 }, { "ce_loss": 0.32640036940574646, "epoch": 2.931954636424283, "step": 8790 }, { "distill_loss": 0.457407683134079, "epoch": 2.931954636424283, "step": 8790 }, { "epoch": 2.931954636424283, "ref_ce_loss": 0.24649685621261597, "step": 8790 }, { "epoch": 2.931954636424283, "loss": 0.9608051776885986, "step": 8790 }, { "ce_loss": 0.20317061245441437, "epoch": 2.931954636424283, "step": 8790 }, { "distill_loss": 0.42976054549217224, "epoch": 2.931954636424283, "step": 8790 }, { "epoch": 2.931954636424283, "ref_ce_loss": 0.22405625879764557, "step": 8790 }, { "epoch": 2.935290193462308, "loss": 1.0111, "step": 8800 }, { "epoch": 2.935290193462308, "grad_norm": 1.4414767026901245, "step": 8800 }, { "epoch": 2.935290193462308, "learning_rate": 0.0006629522692316964, "step": 8800 }, { "epoch": 2.935290193462308, "loss": 0.8295692801475525, "step": 8800 }, { "ce_loss": 0.2127918004989624, "epoch": 2.935290193462308, "step": 8800 }, { "distill_loss": 0.34422290325164795, "epoch": 2.935290193462308, "step": 8800 }, { "epoch": 2.935290193462308, "ref_ce_loss": 0.22246137261390686, "step": 8800 }, { "epoch": 2.935290193462308, "loss": 0.9614005088806152, "step": 8800 }, { "ce_loss": 0.310360848903656, "epoch": 2.935290193462308, "step": 8800 }, { "distill_loss": 0.3648439347743988, "epoch": 2.935290193462308, "step": 8800 }, { "epoch": 2.935290193462308, "ref_ce_loss": 0.22835128009319305, "step": 8800 }, { "epoch": 2.9386257505003335, "loss": 0.9828, "step": 8810 }, { "epoch": 2.9386257505003335, "grad_norm": 2.4150519371032715, "step": 8810 }, { "epoch": 2.9386257505003335, "learning_rate": 0.0006626264797625171, "step": 8810 }, { "epoch": 2.9386257505003335, "loss": 0.7496489882469177, "step": 8810 }, { "ce_loss": 0.2132815569639206, "epoch": 2.9386257505003335, "step": 8810 }, { "distill_loss": 0.3698428273200989, "epoch": 2.9386257505003335, "step": 8810 }, { "epoch": 2.9386257505003335, "ref_ce_loss": 0.16614781320095062, "step": 8810 }, { "epoch": 2.9386257505003335, "loss": 1.0606884956359863, "step": 8810 }, { "ce_loss": 0.2448640763759613, "epoch": 2.9386257505003335, "step": 8810 }, { "distill_loss": 0.34551236033439636, "epoch": 2.9386257505003335, "step": 8810 }, { "epoch": 2.9386257505003335, "ref_ce_loss": 0.16551008820533752, "step": 8810 }, { "epoch": 2.941961307538359, "loss": 0.9164, "step": 8820 }, { "epoch": 2.941961307538359, "grad_norm": 2.078324317932129, "step": 8820 }, { "epoch": 2.941961307538359, "learning_rate": 0.0006623003837799761, "step": 8820 }, { "epoch": 2.941961307538359, "loss": 1.024645209312439, "step": 8820 }, { "ce_loss": 0.27640199661254883, "epoch": 2.941961307538359, "step": 8820 }, { "distill_loss": 0.4167477488517761, "epoch": 2.941961307538359, "step": 8820 }, { "epoch": 2.941961307538359, "ref_ce_loss": 0.19014853239059448, "step": 8820 }, { "epoch": 2.941961307538359, "loss": 0.6186550259590149, "step": 8820 }, { "ce_loss": 0.16880573332309723, "epoch": 2.941961307538359, "step": 8820 }, { "distill_loss": 0.2792816758155823, "epoch": 2.941961307538359, "step": 8820 }, { "epoch": 2.941961307538359, "ref_ce_loss": 0.1264185756444931, "step": 8820 }, { "epoch": 2.945296864576384, "loss": 1.0886, "step": 8830 }, { "epoch": 2.945296864576384, "grad_norm": 1.649694800376892, "step": 8830 }, { "epoch": 2.945296864576384, "learning_rate": 0.0006619739816646626, "step": 8830 }, { "epoch": 2.945296864576384, "loss": 1.1843812465667725, "step": 8830 }, { "ce_loss": 0.2780088484287262, "epoch": 2.945296864576384, "step": 8830 }, { "distill_loss": 0.46142643690109253, "epoch": 2.945296864576384, "step": 8830 }, { "epoch": 2.945296864576384, "ref_ce_loss": 0.19550113379955292, "step": 8830 }, { "epoch": 2.945296864576384, "loss": 1.022020697593689, "step": 8830 }, { "ce_loss": 0.28591063618659973, "epoch": 2.945296864576384, "step": 8830 }, { "distill_loss": 0.3770048916339874, "epoch": 2.945296864576384, "step": 8830 }, { "epoch": 2.945296864576384, "ref_ce_loss": 0.2763281464576721, "step": 8830 }, { "epoch": 2.9486324216144095, "loss": 1.0154, "step": 8840 }, { "epoch": 2.9486324216144095, "grad_norm": 2.126565456390381, "step": 8840 }, { "epoch": 2.9486324216144095, "learning_rate": 0.000661647273797523, "step": 8840 }, { "epoch": 2.9486324216144095, "loss": 1.2838503122329712, "step": 8840 }, { "ce_loss": 0.24286939203739166, "epoch": 2.9486324216144095, "step": 8840 }, { "distill_loss": 0.6261106729507446, "epoch": 2.9486324216144095, "step": 8840 }, { "epoch": 2.9486324216144095, "ref_ce_loss": 0.17261916399002075, "step": 8840 }, { "epoch": 2.9486324216144095, "loss": 1.2399978637695312, "step": 8840 }, { "ce_loss": 0.2863895297050476, "epoch": 2.9486324216144095, "step": 8840 }, { "distill_loss": 0.46719643473625183, "epoch": 2.9486324216144095, "step": 8840 }, { "epoch": 2.9486324216144095, "ref_ce_loss": 0.24916110932826996, "step": 8840 }, { "epoch": 2.951967978652435, "loss": 1.1672, "step": 8850 }, { "epoch": 2.951967978652435, "grad_norm": 2.514310359954834, "step": 8850 }, { "epoch": 2.951967978652435, "learning_rate": 0.0006613202605598604, "step": 8850 }, { "epoch": 2.951967978652435, "loss": 1.3793586492538452, "step": 8850 }, { "ce_loss": 0.25462037324905396, "epoch": 2.951967978652435, "step": 8850 }, { "distill_loss": 0.6040604710578918, "epoch": 2.951967978652435, "step": 8850 }, { "epoch": 2.951967978652435, "ref_ce_loss": 0.2562090754508972, "step": 8850 }, { "epoch": 2.951967978652435, "loss": 0.9639979600906372, "step": 8850 }, { "ce_loss": 0.21080300211906433, "epoch": 2.951967978652435, "step": 8850 }, { "distill_loss": 0.4669973850250244, "epoch": 2.951967978652435, "step": 8850 }, { "epoch": 2.951967978652435, "ref_ce_loss": 0.2147378921508789, "step": 8850 }, { "epoch": 2.9553035356904602, "loss": 1.0595, "step": 8860 }, { "epoch": 2.9553035356904602, "grad_norm": 1.8250327110290527, "step": 8860 }, { "epoch": 2.9553035356904602, "learning_rate": 0.0006609929423333345, "step": 8860 }, { "epoch": 2.9553035356904602, "loss": 0.8667522668838501, "step": 8860 }, { "ce_loss": 0.23248571157455444, "epoch": 2.9553035356904602, "step": 8860 }, { "distill_loss": 0.4130263924598694, "epoch": 2.9553035356904602, "step": 8860 }, { "epoch": 2.9553035356904602, "ref_ce_loss": 0.22097499668598175, "step": 8860 }, { "epoch": 2.9553035356904602, "loss": 0.9345153570175171, "step": 8860 }, { "ce_loss": 0.28272318840026855, "epoch": 2.9553035356904602, "step": 8860 }, { "distill_loss": 0.4565563201904297, "epoch": 2.9553035356904602, "step": 8860 }, { "epoch": 2.9553035356904602, "ref_ce_loss": 0.19382339715957642, "step": 8860 }, { "epoch": 2.9586390927284856, "loss": 1.1138, "step": 8870 }, { "epoch": 2.9586390927284856, "grad_norm": 2.332496404647827, "step": 8870 }, { "epoch": 2.9586390927284856, "learning_rate": 0.0006606653194999608, "step": 8870 }, { "epoch": 2.9586390927284856, "loss": 1.3785479068756104, "step": 8870 }, { "ce_loss": 0.2756645977497101, "epoch": 2.9586390927284856, "step": 8870 }, { "distill_loss": 0.4183840751647949, "epoch": 2.9586390927284856, "step": 8870 }, { "epoch": 2.9586390927284856, "ref_ce_loss": 0.2372225672006607, "step": 8870 }, { "epoch": 2.9586390927284856, "loss": 1.4631519317626953, "step": 8870 }, { "ce_loss": 0.2707836925983429, "epoch": 2.9586390927284856, "step": 8870 }, { "distill_loss": 0.5820114612579346, "epoch": 2.9586390927284856, "step": 8870 }, { "epoch": 2.9586390927284856, "ref_ce_loss": 0.18649905920028687, "step": 8870 }, { "epoch": 2.961974649766511, "loss": 1.0423, "step": 8880 }, { "epoch": 2.961974649766511, "grad_norm": 1.7435004711151123, "step": 8880 }, { "epoch": 2.961974649766511, "learning_rate": 0.0006603373924421106, "step": 8880 }, { "epoch": 2.961974649766511, "loss": 0.9438967704772949, "step": 8880 }, { "ce_loss": 0.23838581144809723, "epoch": 2.961974649766511, "step": 8880 }, { "distill_loss": 0.3440316319465637, "epoch": 2.961974649766511, "step": 8880 }, { "epoch": 2.961974649766511, "ref_ce_loss": 0.22295810282230377, "step": 8880 }, { "epoch": 2.961974649766511, "loss": 1.1735258102416992, "step": 8880 }, { "ce_loss": 0.30681556463241577, "epoch": 2.961974649766511, "step": 8880 }, { "distill_loss": 0.4880583882331848, "epoch": 2.961974649766511, "step": 8880 }, { "epoch": 2.961974649766511, "ref_ce_loss": 0.21688827872276306, "step": 8880 }, { "epoch": 2.9653102068045363, "loss": 1.0695, "step": 8890 }, { "epoch": 2.9653102068045363, "grad_norm": 2.8541676998138428, "step": 8890 }, { "epoch": 2.9653102068045363, "learning_rate": 0.00066000916154251, "step": 8890 }, { "epoch": 2.9653102068045363, "loss": 1.5222339630126953, "step": 8890 }, { "ce_loss": 0.3295680284500122, "epoch": 2.9653102068045363, "step": 8890 }, { "distill_loss": 0.46733707189559937, "epoch": 2.9653102068045363, "step": 8890 }, { "epoch": 2.9653102068045363, "ref_ce_loss": 0.20860373973846436, "step": 8890 }, { "epoch": 2.9653102068045363, "loss": 1.3878238201141357, "step": 8890 }, { "ce_loss": 0.277427077293396, "epoch": 2.9653102068045363, "step": 8890 }, { "distill_loss": 0.44741156697273254, "epoch": 2.9653102068045363, "step": 8890 }, { "epoch": 2.9653102068045363, "ref_ce_loss": 0.17672061920166016, "step": 8890 }, { "epoch": 2.9686457638425616, "loss": 0.9985, "step": 8900 }, { "epoch": 2.9686457638425616, "grad_norm": 1.472333312034607, "step": 8900 }, { "epoch": 2.9686457638425616, "learning_rate": 0.0006596806271842397, "step": 8900 }, { "epoch": 2.9686457638425616, "loss": 1.2768826484680176, "step": 8900 }, { "ce_loss": 0.37654709815979004, "epoch": 2.9686457638425616, "step": 8900 }, { "distill_loss": 0.4484732151031494, "epoch": 2.9686457638425616, "step": 8900 }, { "epoch": 2.9686457638425616, "ref_ce_loss": 0.2331361323595047, "step": 8900 }, { "epoch": 2.9686457638425616, "loss": 0.9021540284156799, "step": 8900 }, { "ce_loss": 0.31903746724128723, "epoch": 2.9686457638425616, "step": 8900 }, { "distill_loss": 0.31807395815849304, "epoch": 2.9686457638425616, "step": 8900 }, { "epoch": 2.9686457638425616, "ref_ce_loss": 0.26460903882980347, "step": 8900 }, { "epoch": 2.971981320880587, "loss": 1.0338, "step": 8910 }, { "epoch": 2.971981320880587, "grad_norm": 2.5841994285583496, "step": 8910 }, { "epoch": 2.971981320880587, "learning_rate": 0.0006593517897507345, "step": 8910 }, { "epoch": 2.971981320880587, "loss": 0.9933534264564514, "step": 8910 }, { "ce_loss": 0.2849483788013458, "epoch": 2.971981320880587, "step": 8910 }, { "distill_loss": 0.4897994101047516, "epoch": 2.971981320880587, "step": 8910 }, { "epoch": 2.971981320880587, "ref_ce_loss": 0.17452286183834076, "step": 8910 }, { "epoch": 2.971981320880587, "loss": 1.0774074792861938, "step": 8910 }, { "ce_loss": 0.2757338881492615, "epoch": 2.971981320880587, "step": 8910 }, { "distill_loss": 0.4135061800479889, "epoch": 2.971981320880587, "step": 8910 }, { "epoch": 2.971981320880587, "ref_ce_loss": 0.18409499526023865, "step": 8910 }, { "epoch": 2.9753168779186123, "loss": 1.0067, "step": 8920 }, { "epoch": 2.9753168779186123, "grad_norm": 1.905840277671814, "step": 8920 }, { "epoch": 2.9753168779186123, "learning_rate": 0.0006590226496257835, "step": 8920 }, { "epoch": 2.9753168779186123, "loss": 0.9164485931396484, "step": 8920 }, { "ce_loss": 0.30785995721817017, "epoch": 2.9753168779186123, "step": 8920 }, { "distill_loss": 0.44607120752334595, "epoch": 2.9753168779186123, "step": 8920 }, { "epoch": 2.9753168779186123, "ref_ce_loss": 0.16198371350765228, "step": 8920 }, { "epoch": 2.9753168779186123, "loss": 1.0075207948684692, "step": 8920 }, { "ce_loss": 0.2652585506439209, "epoch": 2.9753168779186123, "step": 8920 }, { "distill_loss": 0.4635201096534729, "epoch": 2.9753168779186123, "step": 8920 }, { "epoch": 2.9753168779186123, "ref_ce_loss": 0.2188246250152588, "step": 8920 }, { "epoch": 2.9786524349566377, "loss": 1.1102, "step": 8930 }, { "epoch": 2.9786524349566377, "grad_norm": 3.0330445766448975, "step": 8930 }, { "epoch": 2.9786524349566377, "learning_rate": 0.0006586932071935284, "step": 8930 }, { "epoch": 2.9786524349566377, "loss": 0.9932916164398193, "step": 8930 }, { "ce_loss": 0.2821085453033447, "epoch": 2.9786524349566377, "step": 8930 }, { "distill_loss": 0.47245728969573975, "epoch": 2.9786524349566377, "step": 8930 }, { "epoch": 2.9786524349566377, "ref_ce_loss": 0.1981649547815323, "step": 8930 }, { "epoch": 2.9786524349566377, "loss": 1.2256691455841064, "step": 8930 }, { "ce_loss": 0.2617366313934326, "epoch": 2.9786524349566377, "step": 8930 }, { "distill_loss": 0.4330045282840729, "epoch": 2.9786524349566377, "step": 8930 }, { "epoch": 2.9786524349566377, "ref_ce_loss": 0.21306806802749634, "step": 8930 }, { "epoch": 2.981987991994663, "loss": 1.1046, "step": 8940 }, { "epoch": 2.981987991994663, "grad_norm": 2.6775052547454834, "step": 8940 }, { "epoch": 2.981987991994663, "learning_rate": 0.0006583634628384638, "step": 8940 }, { "epoch": 2.981987991994663, "loss": 1.0053170919418335, "step": 8940 }, { "ce_loss": 0.30795395374298096, "epoch": 2.981987991994663, "step": 8940 }, { "distill_loss": 0.45006781816482544, "epoch": 2.981987991994663, "step": 8940 }, { "epoch": 2.981987991994663, "ref_ce_loss": 0.24674780666828156, "step": 8940 }, { "epoch": 2.981987991994663, "loss": 0.7464403510093689, "step": 8940 }, { "ce_loss": 0.20934158563613892, "epoch": 2.981987991994663, "step": 8940 }, { "distill_loss": 0.35018253326416016, "epoch": 2.981987991994663, "step": 8940 }, { "epoch": 2.981987991994663, "ref_ce_loss": 0.15509063005447388, "step": 8940 }, { "epoch": 2.9853235490326884, "loss": 1.0318, "step": 8950 }, { "epoch": 2.9853235490326884, "grad_norm": 2.196390390396118, "step": 8950 }, { "epoch": 2.9853235490326884, "learning_rate": 0.0006580334169454372, "step": 8950 }, { "epoch": 2.9853235490326884, "loss": 1.082023024559021, "step": 8950 }, { "ce_loss": 0.29764267802238464, "epoch": 2.9853235490326884, "step": 8950 }, { "distill_loss": 0.5086228847503662, "epoch": 2.9853235490326884, "step": 8950 }, { "epoch": 2.9853235490326884, "ref_ce_loss": 0.2039090394973755, "step": 8950 }, { "epoch": 2.9853235490326884, "loss": 0.8325173258781433, "step": 8950 }, { "ce_loss": 0.25651320815086365, "epoch": 2.9853235490326884, "step": 8950 }, { "distill_loss": 0.4119528830051422, "epoch": 2.9853235490326884, "step": 8950 }, { "epoch": 2.9853235490326884, "ref_ce_loss": 0.16320902109146118, "step": 8950 }, { "epoch": 2.9886591060707137, "loss": 1.1138, "step": 8960 }, { "epoch": 2.9886591060707137, "grad_norm": 2.025916337966919, "step": 8960 }, { "epoch": 2.9886591060707137, "learning_rate": 0.0006577030698996472, "step": 8960 }, { "epoch": 2.9886591060707137, "loss": 1.0703673362731934, "step": 8960 }, { "ce_loss": 0.28445735573768616, "epoch": 2.9886591060707137, "step": 8960 }, { "distill_loss": 0.5076282024383545, "epoch": 2.9886591060707137, "step": 8960 }, { "epoch": 2.9886591060707137, "ref_ce_loss": 0.2156871259212494, "step": 8960 }, { "epoch": 2.9886591060707137, "loss": 1.503796100616455, "step": 8960 }, { "ce_loss": 0.2984296977519989, "epoch": 2.9886591060707137, "step": 8960 }, { "distill_loss": 0.572968065738678, "epoch": 2.9886591060707137, "step": 8960 }, { "epoch": 2.9886591060707137, "ref_ce_loss": 0.27950987219810486, "step": 8960 }, { "epoch": 2.991994663108739, "loss": 1.0775, "step": 8970 }, { "epoch": 2.991994663108739, "grad_norm": 2.651618242263794, "step": 8970 }, { "epoch": 2.991994663108739, "learning_rate": 0.0006573724220866448, "step": 8970 }, { "epoch": 2.991994663108739, "loss": 1.0260261297225952, "step": 8970 }, { "ce_loss": 0.3148878514766693, "epoch": 2.991994663108739, "step": 8970 }, { "distill_loss": 0.5199999809265137, "epoch": 2.991994663108739, "step": 8970 }, { "epoch": 2.991994663108739, "ref_ce_loss": 0.19105911254882812, "step": 8970 }, { "epoch": 2.991994663108739, "loss": 0.9962505102157593, "step": 8970 }, { "ce_loss": 0.3074457347393036, "epoch": 2.991994663108739, "step": 8970 }, { "distill_loss": 0.4438781142234802, "epoch": 2.991994663108739, "step": 8970 }, { "epoch": 2.991994663108739, "ref_ce_loss": 0.20244362950325012, "step": 8970 }, { "epoch": 2.9953302201467644, "loss": 1.0804, "step": 8980 }, { "epoch": 2.9953302201467644, "grad_norm": 1.9578343629837036, "step": 8980 }, { "epoch": 2.9953302201467644, "learning_rate": 0.0006570414738923314, "step": 8980 }, { "epoch": 2.9953302201467644, "loss": 0.9544330835342407, "step": 8980 }, { "ce_loss": 0.3170566260814667, "epoch": 2.9953302201467644, "step": 8980 }, { "distill_loss": 0.43722185492515564, "epoch": 2.9953302201467644, "step": 8980 }, { "epoch": 2.9953302201467644, "ref_ce_loss": 0.2000456601381302, "step": 8980 }, { "epoch": 2.9953302201467644, "loss": 1.120278239250183, "step": 8980 }, { "ce_loss": 0.273532897233963, "epoch": 2.9953302201467644, "step": 8980 }, { "distill_loss": 0.42763447761535645, "epoch": 2.9953302201467644, "step": 8980 }, { "epoch": 2.9953302201467644, "ref_ce_loss": 0.2289534956216812, "step": 8980 }, { "epoch": 2.9986657771847898, "loss": 1.0876, "step": 8990 }, { "epoch": 2.9986657771847898, "grad_norm": 1.487807035446167, "step": 8990 }, { "epoch": 2.9986657771847898, "learning_rate": 0.0006567102257029592, "step": 8990 }, { "epoch": 2.9986657771847898, "loss": 1.4187259674072266, "step": 8990 }, { "ce_loss": 0.1924740970134735, "epoch": 2.9986657771847898, "step": 8990 }, { "distill_loss": 0.39415788650512695, "epoch": 2.9986657771847898, "step": 8990 }, { "epoch": 2.9986657771847898, "ref_ce_loss": 0.23679277300834656, "step": 8990 }, { "epoch": 2.9986657771847898, "loss": 0.8554670810699463, "step": 8990 }, { "ce_loss": 0.24595953524112701, "epoch": 2.9986657771847898, "step": 8990 }, { "distill_loss": 0.45313510298728943, "epoch": 2.9986657771847898, "step": 8990 }, { "epoch": 2.9986657771847898, "ref_ce_loss": 0.15625616908073425, "step": 8990 }, { "epoch": 3.002001334222815, "loss": 1.112, "step": 9000 }, { "epoch": 3.002001334222815, "grad_norm": 2.9651665687561035, "step": 9000 }, { "epoch": 3.002001334222815, "learning_rate": 0.0006563786779051305, "step": 9000 }, { "epoch": 3.002001334222815, "loss": 0.919062077999115, "step": 9000 }, { "ce_loss": 0.2220434844493866, "epoch": 3.002001334222815, "step": 9000 }, { "distill_loss": 0.47971224784851074, "epoch": 3.002001334222815, "step": 9000 }, { "epoch": 3.002001334222815, "ref_ce_loss": 0.1729557067155838, "step": 9000 }, { "epoch": 3.002001334222815, "loss": 1.1402512788772583, "step": 9000 }, { "ce_loss": 0.3153040111064911, "epoch": 3.002001334222815, "step": 9000 }, { "distill_loss": 0.542543888092041, "epoch": 3.002001334222815, "step": 9000 }, { "epoch": 3.002001334222815, "ref_ce_loss": 0.22358056902885437, "step": 9000 }, { "epoch": 3.0053368912608405, "loss": 1.0866, "step": 9010 }, { "epoch": 3.0053368912608405, "grad_norm": 2.7941534519195557, "step": 9010 }, { "epoch": 3.0053368912608405, "learning_rate": 0.0006560468308857971, "step": 9010 }, { "epoch": 3.0053368912608405, "loss": 1.3257273435592651, "step": 9010 }, { "ce_loss": 0.33371102809906006, "epoch": 3.0053368912608405, "step": 9010 }, { "distill_loss": 0.4266383647918701, "epoch": 3.0053368912608405, "step": 9010 }, { "epoch": 3.0053368912608405, "ref_ce_loss": 0.22311057150363922, "step": 9010 }, { "epoch": 3.0053368912608405, "loss": 1.0651869773864746, "step": 9010 }, { "ce_loss": 0.3537583649158478, "epoch": 3.0053368912608405, "step": 9010 }, { "distill_loss": 0.5079278349876404, "epoch": 3.0053368912608405, "step": 9010 }, { "epoch": 3.0053368912608405, "ref_ce_loss": 0.20298175513744354, "step": 9010 }, { "epoch": 3.008672448298866, "loss": 1.0502, "step": 9020 }, { "epoch": 3.008672448298866, "grad_norm": 1.7431156635284424, "step": 9020 }, { "epoch": 3.008672448298866, "learning_rate": 0.0006557146850322601, "step": 9020 }, { "epoch": 3.008672448298866, "loss": 1.0537315607070923, "step": 9020 }, { "ce_loss": 0.29217228293418884, "epoch": 3.008672448298866, "step": 9020 }, { "distill_loss": 0.4923178255558014, "epoch": 3.008672448298866, "step": 9020 }, { "epoch": 3.008672448298866, "ref_ce_loss": 0.22270143032073975, "step": 9020 }, { "epoch": 3.008672448298866, "loss": 0.8989039063453674, "step": 9020 }, { "ce_loss": 0.2505379617214203, "epoch": 3.008672448298866, "step": 9020 }, { "distill_loss": 0.41707292199134827, "epoch": 3.008672448298866, "step": 9020 }, { "epoch": 3.008672448298866, "ref_ce_loss": 0.18117381632328033, "step": 9020 }, { "epoch": 3.012008005336891, "loss": 1.0364, "step": 9030 }, { "epoch": 3.012008005336891, "grad_norm": 1.988556146621704, "step": 9030 }, { "epoch": 3.012008005336891, "learning_rate": 0.0006553822407321699, "step": 9030 }, { "epoch": 3.012008005336891, "loss": 0.9533810615539551, "step": 9030 }, { "ce_loss": 0.2103157341480255, "epoch": 3.012008005336891, "step": 9030 }, { "distill_loss": 0.41822630167007446, "epoch": 3.012008005336891, "step": 9030 }, { "epoch": 3.012008005336891, "ref_ce_loss": 0.19332677125930786, "step": 9030 }, { "epoch": 3.012008005336891, "loss": 1.106896996498108, "step": 9030 }, { "ce_loss": 0.2565533220767975, "epoch": 3.012008005336891, "step": 9030 }, { "distill_loss": 0.5410886406898499, "epoch": 3.012008005336891, "step": 9030 }, { "epoch": 3.012008005336891, "ref_ce_loss": 0.23619408905506134, "step": 9030 }, { "epoch": 3.0153435623749165, "loss": 1.0529, "step": 9040 }, { "epoch": 3.0153435623749165, "grad_norm": 2.4528582096099854, "step": 9040 }, { "epoch": 3.0153435623749165, "learning_rate": 0.0006550494983735243, "step": 9040 }, { "epoch": 3.0153435623749165, "loss": 1.010316252708435, "step": 9040 }, { "ce_loss": 0.3083910048007965, "epoch": 3.0153435623749165, "step": 9040 }, { "distill_loss": 0.49507591128349304, "epoch": 3.0153435623749165, "step": 9040 }, { "epoch": 3.0153435623749165, "ref_ce_loss": 0.20631785690784454, "step": 9040 }, { "epoch": 3.0153435623749165, "loss": 0.9693584442138672, "step": 9040 }, { "ce_loss": 0.2823527157306671, "epoch": 3.0153435623749165, "step": 9040 }, { "distill_loss": 0.435078889131546, "epoch": 3.0153435623749165, "step": 9040 }, { "epoch": 3.0153435623749165, "ref_ce_loss": 0.20153407752513885, "step": 9040 }, { "epoch": 3.018679119412942, "loss": 0.9921, "step": 9050 }, { "epoch": 3.018679119412942, "grad_norm": 2.484927177429199, "step": 9050 }, { "epoch": 3.018679119412942, "learning_rate": 0.0006547164583446698, "step": 9050 }, { "epoch": 3.018679119412942, "loss": 1.0611393451690674, "step": 9050 }, { "ce_loss": 0.3461135923862457, "epoch": 3.018679119412942, "step": 9050 }, { "distill_loss": 0.5212705135345459, "epoch": 3.018679119412942, "step": 9050 }, { "epoch": 3.018679119412942, "ref_ce_loss": 0.19343295693397522, "step": 9050 }, { "epoch": 3.018679119412942, "loss": 1.5962610244750977, "step": 9050 }, { "ce_loss": 0.3070935606956482, "epoch": 3.018679119412942, "step": 9050 }, { "distill_loss": 0.5359688997268677, "epoch": 3.018679119412942, "step": 9050 }, { "epoch": 3.018679119412942, "ref_ce_loss": 0.2690057158470154, "step": 9050 }, { "epoch": 3.022014676450967, "loss": 1.1188, "step": 9060 }, { "epoch": 3.022014676450967, "grad_norm": 2.0018575191497803, "step": 9060 }, { "epoch": 3.022014676450967, "learning_rate": 0.0006543831210342998, "step": 9060 }, { "epoch": 3.022014676450967, "loss": 1.0536315441131592, "step": 9060 }, { "ce_loss": 0.33707576990127563, "epoch": 3.022014676450967, "step": 9060 }, { "distill_loss": 0.4651211202144623, "epoch": 3.022014676450967, "step": 9060 }, { "epoch": 3.022014676450967, "ref_ce_loss": 0.19382964074611664, "step": 9060 }, { "epoch": 3.022014676450967, "loss": 1.1964068412780762, "step": 9060 }, { "ce_loss": 0.27716806530952454, "epoch": 3.022014676450967, "step": 9060 }, { "distill_loss": 0.4435596168041229, "epoch": 3.022014676450967, "step": 9060 }, { "epoch": 3.022014676450967, "ref_ce_loss": 0.2285054624080658, "step": 9060 }, { "epoch": 3.0253502334889926, "loss": 1.0283, "step": 9070 }, { "epoch": 3.0253502334889926, "grad_norm": 1.938596487045288, "step": 9070 }, { "epoch": 3.0253502334889926, "learning_rate": 0.0006540494868314547, "step": 9070 }, { "epoch": 3.0253502334889926, "loss": 0.9109571576118469, "step": 9070 }, { "ce_loss": 0.16253161430358887, "epoch": 3.0253502334889926, "step": 9070 }, { "distill_loss": 0.41372305154800415, "epoch": 3.0253502334889926, "step": 9070 }, { "epoch": 3.0253502334889926, "ref_ce_loss": 0.20183952152729034, "step": 9070 }, { "epoch": 3.0253502334889926, "loss": 0.8401382565498352, "step": 9070 }, { "ce_loss": 0.20185644924640656, "epoch": 3.0253502334889926, "step": 9070 }, { "distill_loss": 0.4689839482307434, "epoch": 3.0253502334889926, "step": 9070 }, { "epoch": 3.0253502334889926, "ref_ce_loss": 0.16874492168426514, "step": 9070 }, { "epoch": 3.028685790527018, "loss": 1.0059, "step": 9080 }, { "epoch": 3.028685790527018, "grad_norm": 1.7816444635391235, "step": 9080 }, { "epoch": 3.028685790527018, "learning_rate": 0.0006537155561255215, "step": 9080 }, { "epoch": 3.028685790527018, "loss": 0.954689085483551, "step": 9080 }, { "ce_loss": 0.25480887293815613, "epoch": 3.028685790527018, "step": 9080 }, { "distill_loss": 0.4411703050136566, "epoch": 3.028685790527018, "step": 9080 }, { "epoch": 3.028685790527018, "ref_ce_loss": 0.1994798481464386, "step": 9080 }, { "epoch": 3.028685790527018, "loss": 1.000085473060608, "step": 9080 }, { "ce_loss": 0.2905219495296478, "epoch": 3.028685790527018, "step": 9080 }, { "distill_loss": 0.43431705236434937, "epoch": 3.028685790527018, "step": 9080 }, { "epoch": 3.028685790527018, "ref_ce_loss": 0.2110050618648529, "step": 9080 }, { "epoch": 3.0320213475650433, "loss": 1.0526, "step": 9090 }, { "epoch": 3.0320213475650433, "grad_norm": 1.6663061380386353, "step": 9090 }, { "epoch": 3.0320213475650433, "learning_rate": 0.0006533813293062336, "step": 9090 }, { "epoch": 3.0320213475650433, "loss": 1.300847053527832, "step": 9090 }, { "ce_loss": 0.25688230991363525, "epoch": 3.0320213475650433, "step": 9090 }, { "distill_loss": 0.37179258465766907, "epoch": 3.0320213475650433, "step": 9090 }, { "epoch": 3.0320213475650433, "ref_ce_loss": 0.18998339772224426, "step": 9090 }, { "epoch": 3.0320213475650433, "loss": 1.0649999380111694, "step": 9090 }, { "ce_loss": 0.24274015426635742, "epoch": 3.0320213475650433, "step": 9090 }, { "distill_loss": 0.41353946924209595, "epoch": 3.0320213475650433, "step": 9090 }, { "epoch": 3.0320213475650433, "ref_ce_loss": 0.23726420104503632, "step": 9090 }, { "epoch": 3.0353569046030686, "loss": 1.0777, "step": 9100 }, { "epoch": 3.0353569046030686, "grad_norm": 2.572343349456787, "step": 9100 }, { "epoch": 3.0353569046030686, "learning_rate": 0.0006530468067636693, "step": 9100 }, { "epoch": 3.0353569046030686, "loss": 1.0435293912887573, "step": 9100 }, { "ce_loss": 0.32857900857925415, "epoch": 3.0353569046030686, "step": 9100 }, { "distill_loss": 0.4863881766796112, "epoch": 3.0353569046030686, "step": 9100 }, { "epoch": 3.0353569046030686, "ref_ce_loss": 0.19983501732349396, "step": 9100 }, { "epoch": 3.0353569046030686, "loss": 1.0608022212982178, "step": 9100 }, { "ce_loss": 0.26578211784362793, "epoch": 3.0353569046030686, "step": 9100 }, { "distill_loss": 0.44077160954475403, "epoch": 3.0353569046030686, "step": 9100 }, { "epoch": 3.0353569046030686, "ref_ce_loss": 0.2041448950767517, "step": 9100 }, { "epoch": 3.038692461641094, "loss": 1.0447, "step": 9110 }, { "epoch": 3.038692461641094, "grad_norm": 2.524587631225586, "step": 9110 }, { "epoch": 3.038692461641094, "learning_rate": 0.0006527119888882527, "step": 9110 }, { "epoch": 3.038692461641094, "loss": 1.3836387395858765, "step": 9110 }, { "ce_loss": 0.31120729446411133, "epoch": 3.038692461641094, "step": 9110 }, { "distill_loss": 0.48757538199424744, "epoch": 3.038692461641094, "step": 9110 }, { "epoch": 3.038692461641094, "ref_ce_loss": 0.20786985754966736, "step": 9110 }, { "epoch": 3.038692461641094, "loss": 1.017317295074463, "step": 9110 }, { "ce_loss": 0.25050947070121765, "epoch": 3.038692461641094, "step": 9110 }, { "distill_loss": 0.4317574203014374, "epoch": 3.038692461641094, "step": 9110 }, { "epoch": 3.038692461641094, "ref_ce_loss": 0.26842379570007324, "step": 9110 }, { "epoch": 3.0420280186791193, "loss": 1.0766, "step": 9120 }, { "epoch": 3.0420280186791193, "grad_norm": 2.0388600826263428, "step": 9120 }, { "epoch": 3.0420280186791193, "learning_rate": 0.0006523768760707519, "step": 9120 }, { "epoch": 3.0420280186791193, "loss": 1.0211807489395142, "step": 9120 }, { "ce_loss": 0.2478404939174652, "epoch": 3.0420280186791193, "step": 9120 }, { "distill_loss": 0.37502074241638184, "epoch": 3.0420280186791193, "step": 9120 }, { "epoch": 3.0420280186791193, "ref_ce_loss": 0.23659591376781464, "step": 9120 }, { "epoch": 3.0420280186791193, "loss": 0.7845948338508606, "step": 9120 }, { "ce_loss": 0.18987131118774414, "epoch": 3.0420280186791193, "step": 9120 }, { "distill_loss": 0.3439366817474365, "epoch": 3.0420280186791193, "step": 9120 }, { "epoch": 3.0420280186791193, "ref_ce_loss": 0.17155510187149048, "step": 9120 }, { "epoch": 3.0453635757171447, "loss": 0.9621, "step": 9130 }, { "epoch": 3.0453635757171447, "grad_norm": 2.8099639415740967, "step": 9130 }, { "epoch": 3.0453635757171447, "learning_rate": 0.00065204146870228, "step": 9130 }, { "epoch": 3.0453635757171447, "loss": 1.3765853643417358, "step": 9130 }, { "ce_loss": 0.2633730173110962, "epoch": 3.0453635757171447, "step": 9130 }, { "distill_loss": 0.40176552534103394, "epoch": 3.0453635757171447, "step": 9130 }, { "epoch": 3.0453635757171447, "ref_ce_loss": 0.18845759332180023, "step": 9130 }, { "epoch": 3.0453635757171447, "loss": 1.0317814350128174, "step": 9130 }, { "ce_loss": 0.30526190996170044, "epoch": 3.0453635757171447, "step": 9130 }, { "distill_loss": 0.4671580195426941, "epoch": 3.0453635757171447, "step": 9130 }, { "epoch": 3.0453635757171447, "ref_ce_loss": 0.19728983938694, "step": 9130 }, { "epoch": 3.04869913275517, "loss": 1.0236, "step": 9140 }, { "epoch": 3.04869913275517, "grad_norm": 1.629594326019287, "step": 9140 }, { "epoch": 3.04869913275517, "learning_rate": 0.0006517057671742934, "step": 9140 }, { "epoch": 3.04869913275517, "loss": 1.2944269180297852, "step": 9140 }, { "ce_loss": 0.3126814067363739, "epoch": 3.04869913275517, "step": 9140 }, { "distill_loss": 0.4917336702346802, "epoch": 3.04869913275517, "step": 9140 }, { "epoch": 3.04869913275517, "ref_ce_loss": 0.20706507563591003, "step": 9140 }, { "epoch": 3.04869913275517, "loss": 1.0209884643554688, "step": 9140 }, { "ce_loss": 0.23382870852947235, "epoch": 3.04869913275517, "step": 9140 }, { "distill_loss": 0.3408361077308655, "epoch": 3.04869913275517, "step": 9140 }, { "epoch": 3.04869913275517, "ref_ce_loss": 0.1909978985786438, "step": 9140 }, { "epoch": 3.0520346897931954, "loss": 1.0116, "step": 9150 }, { "epoch": 3.0520346897931954, "grad_norm": 2.274972915649414, "step": 9150 }, { "epoch": 3.0520346897931954, "learning_rate": 0.0006513697718785917, "step": 9150 }, { "epoch": 3.0520346897931954, "loss": 0.8121439218521118, "step": 9150 }, { "ce_loss": 0.20182228088378906, "epoch": 3.0520346897931954, "step": 9150 }, { "distill_loss": 0.36416077613830566, "epoch": 3.0520346897931954, "step": 9150 }, { "epoch": 3.0520346897931954, "ref_ce_loss": 0.19133667647838593, "step": 9150 }, { "epoch": 3.0520346897931954, "loss": 0.9841737747192383, "step": 9150 }, { "ce_loss": 0.20552538335323334, "epoch": 3.0520346897931954, "step": 9150 }, { "distill_loss": 0.41570305824279785, "epoch": 3.0520346897931954, "step": 9150 }, { "epoch": 3.0520346897931954, "ref_ce_loss": 0.16839125752449036, "step": 9150 }, { "epoch": 3.0553702468312207, "loss": 1.0607, "step": 9160 }, { "epoch": 3.0553702468312207, "grad_norm": 2.0157339572906494, "step": 9160 }, { "epoch": 3.0553702468312207, "learning_rate": 0.0006510334832073179, "step": 9160 }, { "epoch": 3.0553702468312207, "loss": 0.8094881176948547, "step": 9160 }, { "ce_loss": 0.200615793466568, "epoch": 3.0553702468312207, "step": 9160 }, { "distill_loss": 0.3362140357494354, "epoch": 3.0553702468312207, "step": 9160 }, { "epoch": 3.0553702468312207, "ref_ce_loss": 0.1586102694272995, "step": 9160 }, { "epoch": 3.0553702468312207, "loss": 1.5575326681137085, "step": 9160 }, { "ce_loss": 0.27797114849090576, "epoch": 3.0553702468312207, "step": 9160 }, { "distill_loss": 0.4377124011516571, "epoch": 3.0553702468312207, "step": 9160 }, { "epoch": 3.0553702468312207, "ref_ce_loss": 0.1744309812784195, "step": 9160 }, { "epoch": 3.058705803869246, "loss": 1.0709, "step": 9170 }, { "epoch": 3.058705803869246, "grad_norm": 3.011078357696533, "step": 9170 }, { "epoch": 3.058705803869246, "learning_rate": 0.0006506969015529567, "step": 9170 }, { "epoch": 3.058705803869246, "loss": 0.6737293601036072, "step": 9170 }, { "ce_loss": 0.14504103362560272, "epoch": 3.058705803869246, "step": 9170 }, { "distill_loss": 0.3356139659881592, "epoch": 3.058705803869246, "step": 9170 }, { "epoch": 3.058705803869246, "ref_ce_loss": 0.15358547866344452, "step": 9170 }, { "epoch": 3.058705803869246, "loss": 0.9121361970901489, "step": 9170 }, { "ce_loss": 0.22314849495887756, "epoch": 3.058705803869246, "step": 9170 }, { "distill_loss": 0.39828944206237793, "epoch": 3.058705803869246, "step": 9170 }, { "epoch": 3.058705803869246, "ref_ce_loss": 0.22030402719974518, "step": 9170 }, { "epoch": 3.0620413609072714, "loss": 1.0014, "step": 9180 }, { "epoch": 3.0620413609072714, "grad_norm": 2.376568078994751, "step": 9180 }, { "epoch": 3.0620413609072714, "learning_rate": 0.0006503600273083354, "step": 9180 }, { "epoch": 3.0620413609072714, "loss": 1.095672369003296, "step": 9180 }, { "ce_loss": 0.2906176447868347, "epoch": 3.0620413609072714, "step": 9180 }, { "distill_loss": 0.4271732270717621, "epoch": 3.0620413609072714, "step": 9180 }, { "epoch": 3.0620413609072714, "ref_ce_loss": 0.2026030719280243, "step": 9180 }, { "epoch": 3.0620413609072714, "loss": 1.2553006410598755, "step": 9180 }, { "ce_loss": 0.26068201661109924, "epoch": 3.0620413609072714, "step": 9180 }, { "distill_loss": 0.3658229410648346, "epoch": 3.0620413609072714, "step": 9180 }, { "epoch": 3.0620413609072714, "ref_ce_loss": 0.19992195069789886, "step": 9180 }, { "epoch": 3.0653769179452968, "loss": 1.0042, "step": 9190 }, { "epoch": 3.0653769179452968, "grad_norm": 1.8117719888687134, "step": 9190 }, { "epoch": 3.0653769179452968, "learning_rate": 0.0006500228608666222, "step": 9190 }, { "epoch": 3.0653769179452968, "loss": 1.051975965499878, "step": 9190 }, { "ce_loss": 0.31874123215675354, "epoch": 3.0653769179452968, "step": 9190 }, { "distill_loss": 0.4895990788936615, "epoch": 3.0653769179452968, "step": 9190 }, { "epoch": 3.0653769179452968, "ref_ce_loss": 0.1792144477367401, "step": 9190 }, { "epoch": 3.0653769179452968, "loss": 0.8689483404159546, "step": 9190 }, { "ce_loss": 0.2500791549682617, "epoch": 3.0653769179452968, "step": 9190 }, { "distill_loss": 0.3955501616001129, "epoch": 3.0653769179452968, "step": 9190 }, { "epoch": 3.0653769179452968, "ref_ce_loss": 0.17629677057266235, "step": 9190 }, { "epoch": 3.068712474983322, "loss": 0.9835, "step": 9200 }, { "epoch": 3.068712474983322, "grad_norm": 2.159898519515991, "step": 9200 }, { "epoch": 3.068712474983322, "learning_rate": 0.0006496854026213269, "step": 9200 }, { "epoch": 3.068712474983322, "loss": 0.9245696663856506, "step": 9200 }, { "ce_loss": 0.2620841860771179, "epoch": 3.068712474983322, "step": 9200 }, { "distill_loss": 0.39770200848579407, "epoch": 3.068712474983322, "step": 9200 }, { "epoch": 3.068712474983322, "ref_ce_loss": 0.19167114794254303, "step": 9200 }, { "epoch": 3.068712474983322, "loss": 0.8449506163597107, "step": 9200 }, { "ce_loss": 0.20318298041820526, "epoch": 3.068712474983322, "step": 9200 }, { "distill_loss": 0.3830258846282959, "epoch": 3.068712474983322, "step": 9200 }, { "epoch": 3.068712474983322, "ref_ce_loss": 0.19371457397937775, "step": 9200 }, { "epoch": 3.0720480320213475, "loss": 1.0434, "step": 9210 }, { "epoch": 3.0720480320213475, "grad_norm": 2.69889760017395, "step": 9210 }, { "epoch": 3.0720480320213475, "learning_rate": 0.0006493476529662996, "step": 9210 }, { "epoch": 3.0720480320213475, "loss": 0.9485535621643066, "step": 9210 }, { "ce_loss": 0.19093436002731323, "epoch": 3.0720480320213475, "step": 9210 }, { "distill_loss": 0.4069599509239197, "epoch": 3.0720480320213475, "step": 9210 }, { "epoch": 3.0720480320213475, "ref_ce_loss": 0.14216384291648865, "step": 9210 }, { "epoch": 3.0720480320213475, "loss": 1.319540023803711, "step": 9210 }, { "ce_loss": 0.33737826347351074, "epoch": 3.0720480320213475, "step": 9210 }, { "distill_loss": 0.4702030122280121, "epoch": 3.0720480320213475, "step": 9210 }, { "epoch": 3.0720480320213475, "ref_ce_loss": 0.23823755979537964, "step": 9210 }, { "epoch": 3.075383589059373, "loss": 1.1373, "step": 9220 }, { "epoch": 3.075383589059373, "grad_norm": 2.137755870819092, "step": 9220 }, { "epoch": 3.075383589059373, "learning_rate": 0.0006490096122957303, "step": 9220 }, { "epoch": 3.075383589059373, "loss": 1.3876944780349731, "step": 9220 }, { "ce_loss": 0.3482910692691803, "epoch": 3.075383589059373, "step": 9220 }, { "distill_loss": 0.5636916160583496, "epoch": 3.075383589059373, "step": 9220 }, { "epoch": 3.075383589059373, "ref_ce_loss": 0.2291729897260666, "step": 9220 }, { "epoch": 3.075383589059373, "loss": 1.1224955320358276, "step": 9220 }, { "ce_loss": 0.27185824513435364, "epoch": 3.075383589059373, "step": 9220 }, { "distill_loss": 0.47210192680358887, "epoch": 3.075383589059373, "step": 9220 }, { "epoch": 3.075383589059373, "ref_ce_loss": 0.1957308053970337, "step": 9220 }, { "epoch": 3.078719146097398, "loss": 1.136, "step": 9230 }, { "epoch": 3.078719146097398, "grad_norm": 1.62136709690094, "step": 9230 }, { "epoch": 3.078719146097398, "learning_rate": 0.0006486712810041488, "step": 9230 }, { "epoch": 3.078719146097398, "loss": 1.64496910572052, "step": 9230 }, { "ce_loss": 0.28236740827560425, "epoch": 3.078719146097398, "step": 9230 }, { "distill_loss": 0.4360547959804535, "epoch": 3.078719146097398, "step": 9230 }, { "epoch": 3.078719146097398, "ref_ce_loss": 0.2266317903995514, "step": 9230 }, { "epoch": 3.078719146097398, "loss": 1.0641510486602783, "step": 9230 }, { "ce_loss": 0.32486236095428467, "epoch": 3.078719146097398, "step": 9230 }, { "distill_loss": 0.48013004660606384, "epoch": 3.078719146097398, "step": 9230 }, { "epoch": 3.078719146097398, "ref_ce_loss": 0.20147085189819336, "step": 9230 }, { "epoch": 3.0820547031354235, "loss": 1.1094, "step": 9240 }, { "epoch": 3.0820547031354235, "grad_norm": 1.375726580619812, "step": 9240 }, { "epoch": 3.0820547031354235, "learning_rate": 0.0006483326594864243, "step": 9240 }, { "epoch": 3.0820547031354235, "loss": 1.3441393375396729, "step": 9240 }, { "ce_loss": 0.32190045714378357, "epoch": 3.0820547031354235, "step": 9240 }, { "distill_loss": 0.40649378299713135, "epoch": 3.0820547031354235, "step": 9240 }, { "epoch": 3.0820547031354235, "ref_ce_loss": 0.2810215353965759, "step": 9240 }, { "epoch": 3.0820547031354235, "loss": 1.1238963603973389, "step": 9240 }, { "ce_loss": 0.2841211259365082, "epoch": 3.0820547031354235, "step": 9240 }, { "distill_loss": 0.38098785281181335, "epoch": 3.0820547031354235, "step": 9240 }, { "epoch": 3.0820547031354235, "ref_ce_loss": 0.2101936638355255, "step": 9240 }, { "epoch": 3.085390260173449, "loss": 1.055, "step": 9250 }, { "epoch": 3.085390260173449, "grad_norm": 1.4783991575241089, "step": 9250 }, { "epoch": 3.085390260173449, "learning_rate": 0.0006479937481377644, "step": 9250 }, { "epoch": 3.085390260173449, "loss": 0.8815041780471802, "step": 9250 }, { "ce_loss": 0.2548271119594574, "epoch": 3.085390260173449, "step": 9250 }, { "distill_loss": 0.43178367614746094, "epoch": 3.085390260173449, "step": 9250 }, { "epoch": 3.085390260173449, "ref_ce_loss": 0.1447189897298813, "step": 9250 }, { "epoch": 3.085390260173449, "loss": 1.0925407409667969, "step": 9250 }, { "ce_loss": 0.22870661318302155, "epoch": 3.085390260173449, "step": 9250 }, { "distill_loss": 0.4754504859447479, "epoch": 3.085390260173449, "step": 9250 }, { "epoch": 3.085390260173449, "ref_ce_loss": 0.21979738771915436, "step": 9250 }, { "epoch": 3.088725817211474, "loss": 1.0436, "step": 9260 }, { "epoch": 3.088725817211474, "grad_norm": 2.365377187728882, "step": 9260 }, { "epoch": 3.088725817211474, "learning_rate": 0.0006476545473537153, "step": 9260 }, { "epoch": 3.088725817211474, "loss": 0.7632279396057129, "step": 9260 }, { "ce_loss": 0.19780153036117554, "epoch": 3.088725817211474, "step": 9260 }, { "distill_loss": 0.3653009235858917, "epoch": 3.088725817211474, "step": 9260 }, { "epoch": 3.088725817211474, "ref_ce_loss": 0.19955159723758698, "step": 9260 }, { "epoch": 3.088725817211474, "loss": 1.241063117980957, "step": 9260 }, { "ce_loss": 0.2871393859386444, "epoch": 3.088725817211474, "step": 9260 }, { "distill_loss": 0.47732460498809814, "epoch": 3.088725817211474, "step": 9260 }, { "epoch": 3.088725817211474, "ref_ce_loss": 0.21136754751205444, "step": 9260 }, { "epoch": 3.0920613742494996, "loss": 1.052, "step": 9270 }, { "epoch": 3.0920613742494996, "grad_norm": 1.8858585357666016, "step": 9270 }, { "epoch": 3.0920613742494996, "learning_rate": 0.0006473150575301607, "step": 9270 }, { "epoch": 3.0920613742494996, "loss": 0.9567256569862366, "step": 9270 }, { "ce_loss": 0.2105388045310974, "epoch": 3.0920613742494996, "step": 9270 }, { "distill_loss": 0.3784027099609375, "epoch": 3.0920613742494996, "step": 9270 }, { "epoch": 3.0920613742494996, "ref_ce_loss": 0.189395472407341, "step": 9270 }, { "epoch": 3.0920613742494996, "loss": 1.0037850141525269, "step": 9270 }, { "ce_loss": 0.2868437170982361, "epoch": 3.0920613742494996, "step": 9270 }, { "distill_loss": 0.3869924545288086, "epoch": 3.0920613742494996, "step": 9270 }, { "epoch": 3.0920613742494996, "ref_ce_loss": 0.18987053632736206, "step": 9270 }, { "epoch": 3.095396931287525, "loss": 0.9799, "step": 9280 }, { "epoch": 3.095396931287525, "grad_norm": 2.3767271041870117, "step": 9280 }, { "epoch": 3.095396931287525, "learning_rate": 0.0006469752790633218, "step": 9280 }, { "epoch": 3.095396931287525, "loss": 1.0870213508605957, "step": 9280 }, { "ce_loss": 0.29317188262939453, "epoch": 3.095396931287525, "step": 9280 }, { "distill_loss": 0.43251579999923706, "epoch": 3.095396931287525, "step": 9280 }, { "epoch": 3.095396931287525, "ref_ce_loss": 0.17761041224002838, "step": 9280 }, { "epoch": 3.095396931287525, "loss": 0.8081312775611877, "step": 9280 }, { "ce_loss": 0.21595090627670288, "epoch": 3.095396931287525, "step": 9280 }, { "distill_loss": 0.3709900677204132, "epoch": 3.095396931287525, "step": 9280 }, { "epoch": 3.095396931287525, "ref_ce_loss": 0.17402665317058563, "step": 9280 }, { "epoch": 3.0987324883255503, "loss": 1.0143, "step": 9290 }, { "epoch": 3.0987324883255503, "grad_norm": 2.8004062175750732, "step": 9290 }, { "epoch": 3.0987324883255503, "learning_rate": 0.0006466352123497565, "step": 9290 }, { "epoch": 3.0987324883255503, "loss": 1.0892130136489868, "step": 9290 }, { "ce_loss": 0.24042299389839172, "epoch": 3.0987324883255503, "step": 9290 }, { "distill_loss": 0.4947637915611267, "epoch": 3.0987324883255503, "step": 9290 }, { "epoch": 3.0987324883255503, "ref_ce_loss": 0.21145179867744446, "step": 9290 }, { "epoch": 3.0987324883255503, "loss": 0.995558500289917, "step": 9290 }, { "ce_loss": 0.2777062654495239, "epoch": 3.0987324883255503, "step": 9290 }, { "distill_loss": 0.4128227233886719, "epoch": 3.0987324883255503, "step": 9290 }, { "epoch": 3.0987324883255503, "ref_ce_loss": 0.22341780364513397, "step": 9290 }, { "epoch": 3.1020680453635756, "loss": 1.0309, "step": 9300 }, { "epoch": 3.1020680453635756, "grad_norm": 1.9543800354003906, "step": 9300 }, { "epoch": 3.1020680453635756, "learning_rate": 0.0006462948577863593, "step": 9300 }, { "epoch": 3.1020680453635756, "loss": 1.1493991613388062, "step": 9300 }, { "ce_loss": 0.2970648407936096, "epoch": 3.1020680453635756, "step": 9300 }, { "distill_loss": 0.4186588227748871, "epoch": 3.1020680453635756, "step": 9300 }, { "epoch": 3.1020680453635756, "ref_ce_loss": 0.21933016180992126, "step": 9300 }, { "epoch": 3.1020680453635756, "loss": 1.5194854736328125, "step": 9300 }, { "ce_loss": 0.2752871513366699, "epoch": 3.1020680453635756, "step": 9300 }, { "distill_loss": 0.480773001909256, "epoch": 3.1020680453635756, "step": 9300 }, { "epoch": 3.1020680453635756, "ref_ce_loss": 0.1940537542104721, "step": 9300 }, { "epoch": 3.105403602401601, "loss": 0.9941, "step": 9310 }, { "epoch": 3.105403602401601, "grad_norm": 2.3288204669952393, "step": 9310 }, { "epoch": 3.105403602401601, "learning_rate": 0.0006459542157703608, "step": 9310 }, { "epoch": 3.105403602401601, "loss": 0.8614883422851562, "step": 9310 }, { "ce_loss": 0.20192290842533112, "epoch": 3.105403602401601, "step": 9310 }, { "distill_loss": 0.4057275652885437, "epoch": 3.105403602401601, "step": 9310 }, { "epoch": 3.105403602401601, "ref_ce_loss": 0.18818989396095276, "step": 9310 }, { "epoch": 3.105403602401601, "loss": 0.9263197779655457, "step": 9310 }, { "ce_loss": 0.2823805510997772, "epoch": 3.105403602401601, "step": 9310 }, { "distill_loss": 0.3929421603679657, "epoch": 3.105403602401601, "step": 9310 }, { "epoch": 3.105403602401601, "ref_ce_loss": 0.19806356728076935, "step": 9310 }, { "epoch": 3.1087391594396263, "loss": 0.9622, "step": 9320 }, { "epoch": 3.1087391594396263, "grad_norm": 2.14695405960083, "step": 9320 }, { "epoch": 3.1087391594396263, "learning_rate": 0.0006456132866993266, "step": 9320 }, { "epoch": 3.1087391594396263, "loss": 1.4668676853179932, "step": 9320 }, { "ce_loss": 0.30181676149368286, "epoch": 3.1087391594396263, "step": 9320 }, { "distill_loss": 0.40018582344055176, "epoch": 3.1087391594396263, "step": 9320 }, { "epoch": 3.1087391594396263, "ref_ce_loss": 0.25091373920440674, "step": 9320 }, { "epoch": 3.1087391594396263, "loss": 0.9198508262634277, "step": 9320 }, { "ce_loss": 0.2544313073158264, "epoch": 3.1087391594396263, "step": 9320 }, { "distill_loss": 0.3781255781650543, "epoch": 3.1087391594396263, "step": 9320 }, { "epoch": 3.1087391594396263, "ref_ce_loss": 0.22032621502876282, "step": 9320 }, { "epoch": 3.1120747164776517, "loss": 1.082, "step": 9330 }, { "epoch": 3.1120747164776517, "grad_norm": 1.727124810218811, "step": 9330 }, { "epoch": 3.1120747164776517, "learning_rate": 0.0006452720709711578, "step": 9330 }, { "epoch": 3.1120747164776517, "loss": 1.3362449407577515, "step": 9330 }, { "ce_loss": 0.2979918420314789, "epoch": 3.1120747164776517, "step": 9330 }, { "distill_loss": 0.36880090832710266, "epoch": 3.1120747164776517, "step": 9330 }, { "epoch": 3.1120747164776517, "ref_ce_loss": 0.24092470109462738, "step": 9330 }, { "epoch": 3.1120747164776517, "loss": 0.8599693775177002, "step": 9330 }, { "ce_loss": 0.24267035722732544, "epoch": 3.1120747164776517, "step": 9330 }, { "distill_loss": 0.40098074078559875, "epoch": 3.1120747164776517, "step": 9330 }, { "epoch": 3.1120747164776517, "ref_ce_loss": 0.14831340312957764, "step": 9330 }, { "epoch": 3.115410273515677, "loss": 0.9737, "step": 9340 }, { "epoch": 3.115410273515677, "grad_norm": 2.032933473587036, "step": 9340 }, { "epoch": 3.115410273515677, "learning_rate": 0.0006449305689840898, "step": 9340 }, { "epoch": 3.115410273515677, "loss": 0.960468590259552, "step": 9340 }, { "ce_loss": 0.24077749252319336, "epoch": 3.115410273515677, "step": 9340 }, { "distill_loss": 0.4729737341403961, "epoch": 3.115410273515677, "step": 9340 }, { "epoch": 3.115410273515677, "ref_ce_loss": 0.19689474999904633, "step": 9340 }, { "epoch": 3.115410273515677, "loss": 0.8800736665725708, "step": 9340 }, { "ce_loss": 0.23090268671512604, "epoch": 3.115410273515677, "step": 9340 }, { "distill_loss": 0.4070270359516144, "epoch": 3.115410273515677, "step": 9340 }, { "epoch": 3.115410273515677, "ref_ce_loss": 0.18194246292114258, "step": 9340 }, { "epoch": 3.1187458305537024, "loss": 0.9708, "step": 9350 }, { "epoch": 3.1187458305537024, "grad_norm": 2.4061787128448486, "step": 9350 }, { "epoch": 3.1187458305537024, "learning_rate": 0.0006445887811366922, "step": 9350 }, { "epoch": 3.1187458305537024, "loss": 0.956256628036499, "step": 9350 }, { "ce_loss": 0.2822020649909973, "epoch": 3.1187458305537024, "step": 9350 }, { "distill_loss": 0.4642751216888428, "epoch": 3.1187458305537024, "step": 9350 }, { "epoch": 3.1187458305537024, "ref_ce_loss": 0.209703266620636, "step": 9350 }, { "epoch": 3.1187458305537024, "loss": 1.186940312385559, "step": 9350 }, { "ce_loss": 0.3359937369823456, "epoch": 3.1187458305537024, "step": 9350 }, { "distill_loss": 0.46250587701797485, "epoch": 3.1187458305537024, "step": 9350 }, { "epoch": 3.1187458305537024, "ref_ce_loss": 0.26884305477142334, "step": 9350 }, { "epoch": 3.1220813875917277, "loss": 0.9747, "step": 9360 }, { "epoch": 3.1220813875917277, "grad_norm": 1.8572888374328613, "step": 9360 }, { "epoch": 3.1220813875917277, "learning_rate": 0.000644246707827868, "step": 9360 }, { "epoch": 3.1220813875917277, "loss": 1.1304075717926025, "step": 9360 }, { "ce_loss": 0.32038217782974243, "epoch": 3.1220813875917277, "step": 9360 }, { "distill_loss": 0.49822553992271423, "epoch": 3.1220813875917277, "step": 9360 }, { "epoch": 3.1220813875917277, "ref_ce_loss": 0.23027847707271576, "step": 9360 }, { "epoch": 3.1220813875917277, "loss": 1.209761142730713, "step": 9360 }, { "ce_loss": 0.31924429535865784, "epoch": 3.1220813875917277, "step": 9360 }, { "distill_loss": 0.5770853757858276, "epoch": 3.1220813875917277, "step": 9360 }, { "epoch": 3.1220813875917277, "ref_ce_loss": 0.2517629861831665, "step": 9360 }, { "epoch": 3.125416944629753, "loss": 1.0919, "step": 9370 }, { "epoch": 3.125416944629753, "grad_norm": 2.1829588413238525, "step": 9370 }, { "epoch": 3.125416944629753, "learning_rate": 0.0006439043494568539, "step": 9370 }, { "epoch": 3.125416944629753, "loss": 0.984476625919342, "step": 9370 }, { "ce_loss": 0.2882232666015625, "epoch": 3.125416944629753, "step": 9370 }, { "distill_loss": 0.4718204140663147, "epoch": 3.125416944629753, "step": 9370 }, { "epoch": 3.125416944629753, "ref_ce_loss": 0.22438225150108337, "step": 9370 }, { "epoch": 3.125416944629753, "loss": 0.8405330181121826, "step": 9370 }, { "ce_loss": 0.2212357521057129, "epoch": 3.125416944629753, "step": 9370 }, { "distill_loss": 0.4073556959629059, "epoch": 3.125416944629753, "step": 9370 }, { "epoch": 3.125416944629753, "ref_ce_loss": 0.16576293110847473, "step": 9370 }, { "epoch": 3.1287525016677784, "loss": 1.049, "step": 9380 }, { "epoch": 3.1287525016677784, "grad_norm": 2.646833896636963, "step": 9380 }, { "epoch": 3.1287525016677784, "learning_rate": 0.0006435617064232187, "step": 9380 }, { "epoch": 3.1287525016677784, "loss": 1.037705898284912, "step": 9380 }, { "ce_loss": 0.242634579539299, "epoch": 3.1287525016677784, "step": 9380 }, { "distill_loss": 0.448700487613678, "epoch": 3.1287525016677784, "step": 9380 }, { "epoch": 3.1287525016677784, "ref_ce_loss": 0.2192038893699646, "step": 9380 }, { "epoch": 3.1287525016677784, "loss": 1.039435863494873, "step": 9380 }, { "ce_loss": 0.32878270745277405, "epoch": 3.1287525016677784, "step": 9380 }, { "distill_loss": 0.4557130038738251, "epoch": 3.1287525016677784, "step": 9380 }, { "epoch": 3.1287525016677784, "ref_ce_loss": 0.22904300689697266, "step": 9380 }, { "epoch": 3.1320880587058038, "loss": 1.0306, "step": 9390 }, { "epoch": 3.1320880587058038, "grad_norm": 2.9341678619384766, "step": 9390 }, { "epoch": 3.1320880587058038, "learning_rate": 0.0006432187791268639, "step": 9390 }, { "epoch": 3.1320880587058038, "loss": 1.1417816877365112, "step": 9390 }, { "ce_loss": 0.22817549109458923, "epoch": 3.1320880587058038, "step": 9390 }, { "distill_loss": 0.38718181848526, "epoch": 3.1320880587058038, "step": 9390 }, { "epoch": 3.1320880587058038, "ref_ce_loss": 0.1807437539100647, "step": 9390 }, { "epoch": 3.1320880587058038, "loss": 0.8131179213523865, "step": 9390 }, { "ce_loss": 0.24570348858833313, "epoch": 3.1320880587058038, "step": 9390 }, { "distill_loss": 0.37476658821105957, "epoch": 3.1320880587058038, "step": 9390 }, { "epoch": 3.1320880587058038, "ref_ce_loss": 0.19250284135341644, "step": 9390 }, { "epoch": 3.135423615743829, "loss": 1.0896, "step": 9400 }, { "epoch": 3.135423615743829, "grad_norm": 3.520961046218872, "step": 9400 }, { "epoch": 3.135423615743829, "learning_rate": 0.0006428755679680224, "step": 9400 }, { "epoch": 3.135423615743829, "loss": 0.8953517079353333, "step": 9400 }, { "ce_loss": 0.20580261945724487, "epoch": 3.135423615743829, "step": 9400 }, { "distill_loss": 0.3411506116390228, "epoch": 3.135423615743829, "step": 9400 }, { "epoch": 3.135423615743829, "ref_ce_loss": 0.17772138118743896, "step": 9400 }, { "epoch": 3.135423615743829, "loss": 0.9172338247299194, "step": 9400 }, { "ce_loss": 0.263776957988739, "epoch": 3.135423615743829, "step": 9400 }, { "distill_loss": 0.37165242433547974, "epoch": 3.135423615743829, "step": 9400 }, { "epoch": 3.135423615743829, "ref_ce_loss": 0.1233987957239151, "step": 9400 }, { "epoch": 3.1387591727818545, "loss": 0.9324, "step": 9410 }, { "epoch": 3.1387591727818545, "grad_norm": 1.602327585220337, "step": 9410 }, { "epoch": 3.1387591727818545, "learning_rate": 0.0006425320733472585, "step": 9410 }, { "epoch": 3.1387591727818545, "loss": 0.6967126727104187, "step": 9410 }, { "ce_loss": 0.21138957142829895, "epoch": 3.1387591727818545, "step": 9410 }, { "distill_loss": 0.29559966921806335, "epoch": 3.1387591727818545, "step": 9410 }, { "epoch": 3.1387591727818545, "ref_ce_loss": 0.14123915135860443, "step": 9410 }, { "epoch": 3.1387591727818545, "loss": 1.1047972440719604, "step": 9410 }, { "ce_loss": 0.26694992184638977, "epoch": 3.1387591727818545, "step": 9410 }, { "distill_loss": 0.35893869400024414, "epoch": 3.1387591727818545, "step": 9410 }, { "epoch": 3.1387591727818545, "ref_ce_loss": 0.22025825083255768, "step": 9410 }, { "epoch": 3.14209472981988, "loss": 0.9499, "step": 9420 }, { "epoch": 3.14209472981988, "grad_norm": 1.3611681461334229, "step": 9420 }, { "epoch": 3.14209472981988, "learning_rate": 0.0006421882956654676, "step": 9420 }, { "epoch": 3.14209472981988, "loss": 0.9599966406822205, "step": 9420 }, { "ce_loss": 0.2786065936088562, "epoch": 3.14209472981988, "step": 9420 }, { "distill_loss": 0.3393070697784424, "epoch": 3.14209472981988, "step": 9420 }, { "epoch": 3.14209472981988, "ref_ce_loss": 0.20848676562309265, "step": 9420 }, { "epoch": 3.14209472981988, "loss": 1.2619997262954712, "step": 9420 }, { "ce_loss": 0.29320430755615234, "epoch": 3.14209472981988, "step": 9420 }, { "distill_loss": 0.4119172692298889, "epoch": 3.14209472981988, "step": 9420 }, { "epoch": 3.14209472981988, "ref_ce_loss": 0.23351138830184937, "step": 9420 }, { "epoch": 3.145430286857905, "loss": 1.0658, "step": 9430 }, { "epoch": 3.145430286857905, "grad_norm": 1.6318954229354858, "step": 9430 }, { "epoch": 3.145430286857905, "learning_rate": 0.000641844235323875, "step": 9430 }, { "epoch": 3.145430286857905, "loss": 0.9952999353408813, "step": 9430 }, { "ce_loss": 0.2627035081386566, "epoch": 3.145430286857905, "step": 9430 }, { "distill_loss": 0.43587803840637207, "epoch": 3.145430286857905, "step": 9430 }, { "epoch": 3.145430286857905, "ref_ce_loss": 0.22304628789424896, "step": 9430 }, { "epoch": 3.145430286857905, "loss": 0.8246051073074341, "step": 9430 }, { "ce_loss": 0.20812024176120758, "epoch": 3.145430286857905, "step": 9430 }, { "distill_loss": 0.3872748911380768, "epoch": 3.145430286857905, "step": 9430 }, { "epoch": 3.145430286857905, "ref_ce_loss": 0.18635542690753937, "step": 9430 }, { "epoch": 3.1487658438959305, "loss": 0.9934, "step": 9440 }, { "epoch": 3.1487658438959305, "grad_norm": 1.9390169382095337, "step": 9440 }, { "epoch": 3.1487658438959305, "learning_rate": 0.0006414998927240363, "step": 9440 }, { "epoch": 3.1487658438959305, "loss": 0.9947294592857361, "step": 9440 }, { "ce_loss": 0.3259407579898834, "epoch": 3.1487658438959305, "step": 9440 }, { "distill_loss": 0.44335800409317017, "epoch": 3.1487658438959305, "step": 9440 }, { "epoch": 3.1487658438959305, "ref_ce_loss": 0.18987657129764557, "step": 9440 }, { "epoch": 3.1487658438959305, "loss": 0.6140831708908081, "step": 9440 }, { "ce_loss": 0.16870209574699402, "epoch": 3.1487658438959305, "step": 9440 }, { "distill_loss": 0.2950131297111511, "epoch": 3.1487658438959305, "step": 9440 }, { "epoch": 3.1487658438959305, "ref_ce_loss": 0.15002664923667908, "step": 9440 }, { "epoch": 3.152101400933956, "loss": 0.9927, "step": 9450 }, { "epoch": 3.152101400933956, "grad_norm": 3.063666582107544, "step": 9450 }, { "epoch": 3.152101400933956, "learning_rate": 0.0006411552682678365, "step": 9450 }, { "epoch": 3.152101400933956, "loss": 1.0572447776794434, "step": 9450 }, { "ce_loss": 0.3054739236831665, "epoch": 3.152101400933956, "step": 9450 }, { "distill_loss": 0.43969228863716125, "epoch": 3.152101400933956, "step": 9450 }, { "epoch": 3.152101400933956, "ref_ce_loss": 0.24192282557487488, "step": 9450 }, { "epoch": 3.152101400933956, "loss": 0.8156782388687134, "step": 9450 }, { "ce_loss": 0.23238930106163025, "epoch": 3.152101400933956, "step": 9450 }, { "distill_loss": 0.33370551466941833, "epoch": 3.152101400933956, "step": 9450 }, { "epoch": 3.152101400933956, "ref_ce_loss": 0.19612304866313934, "step": 9450 }, { "epoch": 3.155436957971981, "loss": 0.965, "step": 9460 }, { "epoch": 3.155436957971981, "grad_norm": 3.6377336978912354, "step": 9460 }, { "epoch": 3.155436957971981, "learning_rate": 0.0006408103623574891, "step": 9460 }, { "epoch": 3.155436957971981, "loss": 0.7613427639007568, "step": 9460 }, { "ce_loss": 0.2139458805322647, "epoch": 3.155436957971981, "step": 9460 }, { "distill_loss": 0.2921532392501831, "epoch": 3.155436957971981, "step": 9460 }, { "epoch": 3.155436957971981, "ref_ce_loss": 0.20600202679634094, "step": 9460 }, { "epoch": 3.155436957971981, "loss": 1.1687495708465576, "step": 9460 }, { "ce_loss": 0.27339303493499756, "epoch": 3.155436957971981, "step": 9460 }, { "distill_loss": 0.39343589544296265, "epoch": 3.155436957971981, "step": 9460 }, { "epoch": 3.155436957971981, "ref_ce_loss": 0.25108566880226135, "step": 9460 }, { "epoch": 3.1587725150100066, "loss": 0.97, "step": 9470 }, { "epoch": 3.1587725150100066, "grad_norm": 2.286971092224121, "step": 9470 }, { "epoch": 3.1587725150100066, "learning_rate": 0.0006404651753955363, "step": 9470 }, { "epoch": 3.1587725150100066, "loss": 0.9634687304496765, "step": 9470 }, { "ce_loss": 0.30855584144592285, "epoch": 3.1587725150100066, "step": 9470 }, { "distill_loss": 0.3636634647846222, "epoch": 3.1587725150100066, "step": 9470 }, { "epoch": 3.1587725150100066, "ref_ce_loss": 0.2440856397151947, "step": 9470 }, { "epoch": 3.1587725150100066, "loss": 1.6036324501037598, "step": 9470 }, { "ce_loss": 0.3264394998550415, "epoch": 3.1587725150100066, "step": 9470 }, { "distill_loss": 0.4360540509223938, "epoch": 3.1587725150100066, "step": 9470 }, { "epoch": 3.1587725150100066, "ref_ce_loss": 0.2085307240486145, "step": 9470 }, { "epoch": 3.162108072048032, "loss": 1.0855, "step": 9480 }, { "epoch": 3.162108072048032, "grad_norm": 2.486076593399048, "step": 9480 }, { "epoch": 3.162108072048032, "learning_rate": 0.000640119707784849, "step": 9480 }, { "epoch": 3.162108072048032, "loss": 0.7592119574546814, "step": 9480 }, { "ce_loss": 0.19020523130893707, "epoch": 3.162108072048032, "step": 9480 }, { "distill_loss": 0.3947354555130005, "epoch": 3.162108072048032, "step": 9480 }, { "epoch": 3.162108072048032, "ref_ce_loss": 0.17405526340007782, "step": 9480 }, { "epoch": 3.162108072048032, "loss": 0.9845924973487854, "step": 9480 }, { "ce_loss": 0.24522338807582855, "epoch": 3.162108072048032, "step": 9480 }, { "distill_loss": 0.38083866238594055, "epoch": 3.162108072048032, "step": 9480 }, { "epoch": 3.162108072048032, "ref_ce_loss": 0.21165917813777924, "step": 9480 }, { "epoch": 3.1654436290860573, "loss": 0.9108, "step": 9490 }, { "epoch": 3.1654436290860573, "grad_norm": 1.6602882146835327, "step": 9490 }, { "epoch": 3.1654436290860573, "learning_rate": 0.0006397739599286248, "step": 9490 }, { "epoch": 3.1654436290860573, "loss": 0.8489893674850464, "step": 9490 }, { "ce_loss": 0.22476591169834137, "epoch": 3.1654436290860573, "step": 9490 }, { "distill_loss": 0.4076571464538574, "epoch": 3.1654436290860573, "step": 9490 }, { "epoch": 3.1654436290860573, "ref_ce_loss": 0.1667369157075882, "step": 9490 }, { "epoch": 3.1654436290860573, "loss": 1.1844907999038696, "step": 9490 }, { "ce_loss": 0.24899810552597046, "epoch": 3.1654436290860573, "step": 9490 }, { "distill_loss": 0.35540464520454407, "epoch": 3.1654436290860573, "step": 9490 }, { "epoch": 3.1654436290860573, "ref_ce_loss": 0.20953817665576935, "step": 9490 }, { "epoch": 3.1687791861240826, "loss": 0.9954, "step": 9500 }, { "epoch": 3.1687791861240826, "grad_norm": 2.153787612915039, "step": 9500 }, { "epoch": 3.1687791861240826, "learning_rate": 0.0006394279322303885, "step": 9500 }, { "epoch": 3.1687791861240826, "loss": 1.1891601085662842, "step": 9500 }, { "ce_loss": 0.40403467416763306, "epoch": 3.1687791861240826, "step": 9500 }, { "distill_loss": 0.5030407905578613, "epoch": 3.1687791861240826, "step": 9500 }, { "epoch": 3.1687791861240826, "ref_ce_loss": 0.28187599778175354, "step": 9500 }, { "epoch": 3.1687791861240826, "loss": 0.8806698322296143, "step": 9500 }, { "ce_loss": 0.21907956898212433, "epoch": 3.1687791861240826, "step": 9500 }, { "distill_loss": 0.4144791066646576, "epoch": 3.1687791861240826, "step": 9500 }, { "epoch": 3.1687791861240826, "ref_ce_loss": 0.15799804031848907, "step": 9500 }, { "epoch": 3.172114743162108, "loss": 1.0545, "step": 9510 }, { "epoch": 3.172114743162108, "grad_norm": 1.8217953443527222, "step": 9510 }, { "epoch": 3.172114743162108, "learning_rate": 0.0006390816250939918, "step": 9510 }, { "epoch": 3.172114743162108, "loss": 0.9632741212844849, "step": 9510 }, { "ce_loss": 0.27603211998939514, "epoch": 3.172114743162108, "step": 9510 }, { "distill_loss": 0.45113474130630493, "epoch": 3.172114743162108, "step": 9510 }, { "epoch": 3.172114743162108, "ref_ce_loss": 0.1889391541481018, "step": 9510 }, { "epoch": 3.172114743162108, "loss": 0.881806492805481, "step": 9510 }, { "ce_loss": 0.20647741854190826, "epoch": 3.172114743162108, "step": 9510 }, { "distill_loss": 0.4694068133831024, "epoch": 3.172114743162108, "step": 9510 }, { "epoch": 3.172114743162108, "ref_ce_loss": 0.18652670085430145, "step": 9510 }, { "epoch": 3.1754503002001333, "loss": 1.0121, "step": 9520 }, { "epoch": 3.1754503002001333, "grad_norm": 1.8334593772888184, "step": 9520 }, { "epoch": 3.1754503002001333, "learning_rate": 0.0006387350389236124, "step": 9520 }, { "epoch": 3.1754503002001333, "loss": 0.9273632168769836, "step": 9520 }, { "ce_loss": 0.2923687696456909, "epoch": 3.1754503002001333, "step": 9520 }, { "distill_loss": 0.4269871115684509, "epoch": 3.1754503002001333, "step": 9520 }, { "epoch": 3.1754503002001333, "ref_ce_loss": 0.20764534175395966, "step": 9520 }, { "epoch": 3.1754503002001333, "loss": 0.6492568850517273, "step": 9520 }, { "ce_loss": 0.1861332356929779, "epoch": 3.1754503002001333, "step": 9520 }, { "distill_loss": 0.3245824873447418, "epoch": 3.1754503002001333, "step": 9520 }, { "epoch": 3.1754503002001333, "ref_ce_loss": 0.1382860392332077, "step": 9520 }, { "epoch": 3.1787858572381587, "loss": 1.068, "step": 9530 }, { "epoch": 3.1787858572381587, "grad_norm": 1.825934886932373, "step": 9530 }, { "epoch": 3.1787858572381587, "learning_rate": 0.0006383881741237535, "step": 9530 }, { "epoch": 3.1787858572381587, "loss": 0.9091103672981262, "step": 9530 }, { "ce_loss": 0.2934170663356781, "epoch": 3.1787858572381587, "step": 9530 }, { "distill_loss": 0.3609623312950134, "epoch": 3.1787858572381587, "step": 9530 }, { "epoch": 3.1787858572381587, "ref_ce_loss": 0.19794169068336487, "step": 9530 }, { "epoch": 3.1787858572381587, "loss": 0.8751694560050964, "step": 9530 }, { "ce_loss": 0.2098688930273056, "epoch": 3.1787858572381587, "step": 9530 }, { "distill_loss": 0.37939339876174927, "epoch": 3.1787858572381587, "step": 9530 }, { "epoch": 3.1787858572381587, "ref_ce_loss": 0.198336660861969, "step": 9530 }, { "epoch": 3.182121414276184, "loss": 1.1017, "step": 9540 }, { "epoch": 3.182121414276184, "grad_norm": 3.3589539527893066, "step": 9540 }, { "epoch": 3.182121414276184, "learning_rate": 0.0006380410310992438, "step": 9540 }, { "epoch": 3.182121414276184, "loss": 1.1012458801269531, "step": 9540 }, { "ce_loss": 0.29811060428619385, "epoch": 3.182121414276184, "step": 9540 }, { "distill_loss": 0.4516919255256653, "epoch": 3.182121414276184, "step": 9540 }, { "epoch": 3.182121414276184, "ref_ce_loss": 0.26902395486831665, "step": 9540 }, { "epoch": 3.182121414276184, "loss": 0.8544487953186035, "step": 9540 }, { "ce_loss": 0.19948647916316986, "epoch": 3.182121414276184, "step": 9540 }, { "distill_loss": 0.3371371328830719, "epoch": 3.182121414276184, "step": 9540 }, { "epoch": 3.182121414276184, "ref_ce_loss": 0.1422787457704544, "step": 9540 }, { "epoch": 3.1854569713142094, "loss": 1.0451, "step": 9550 }, { "epoch": 3.1854569713142094, "grad_norm": 1.9161041975021362, "step": 9550 }, { "epoch": 3.1854569713142094, "learning_rate": 0.0006376936102552368, "step": 9550 }, { "epoch": 3.1854569713142094, "loss": 0.7991788387298584, "step": 9550 }, { "ce_loss": 0.22001229226589203, "epoch": 3.1854569713142094, "step": 9550 }, { "distill_loss": 0.32813096046447754, "epoch": 3.1854569713142094, "step": 9550 }, { "epoch": 3.1854569713142094, "ref_ce_loss": 0.16685843467712402, "step": 9550 }, { "epoch": 3.1854569713142094, "loss": 0.8779180645942688, "step": 9550 }, { "ce_loss": 0.24932724237442017, "epoch": 3.1854569713142094, "step": 9550 }, { "distill_loss": 0.37945684790611267, "epoch": 3.1854569713142094, "step": 9550 }, { "epoch": 3.1854569713142094, "ref_ce_loss": 0.20249043405056, "step": 9550 }, { "epoch": 3.1887925283522347, "loss": 0.9951, "step": 9560 }, { "epoch": 3.1887925283522347, "grad_norm": 1.8126941919326782, "step": 9560 }, { "epoch": 3.1887925283522347, "learning_rate": 0.0006373459119972095, "step": 9560 }, { "epoch": 3.1887925283522347, "loss": 0.7515552043914795, "step": 9560 }, { "ce_loss": 0.17431022226810455, "epoch": 3.1887925283522347, "step": 9560 }, { "distill_loss": 0.32552477717399597, "epoch": 3.1887925283522347, "step": 9560 }, { "epoch": 3.1887925283522347, "ref_ce_loss": 0.15542994439601898, "step": 9560 }, { "epoch": 3.1887925283522347, "loss": 0.9562395215034485, "step": 9560 }, { "ce_loss": 0.30415454506874084, "epoch": 3.1887925283522347, "step": 9560 }, { "distill_loss": 0.40889936685562134, "epoch": 3.1887925283522347, "step": 9560 }, { "epoch": 3.1887925283522347, "ref_ce_loss": 0.24302524328231812, "step": 9560 }, { "epoch": 3.19212808539026, "loss": 0.961, "step": 9570 }, { "epoch": 3.19212808539026, "grad_norm": 2.174283266067505, "step": 9570 }, { "epoch": 3.19212808539026, "learning_rate": 0.0006369979367309635, "step": 9570 }, { "epoch": 3.19212808539026, "loss": 0.9469784498214722, "step": 9570 }, { "ce_loss": 0.2929200828075409, "epoch": 3.19212808539026, "step": 9570 }, { "distill_loss": 0.4416431486606598, "epoch": 3.19212808539026, "step": 9570 }, { "epoch": 3.19212808539026, "ref_ce_loss": 0.21219401061534882, "step": 9570 }, { "epoch": 3.19212808539026, "loss": 1.0700886249542236, "step": 9570 }, { "ce_loss": 0.25346022844314575, "epoch": 3.19212808539026, "step": 9570 }, { "distill_loss": 0.36397501826286316, "epoch": 3.19212808539026, "step": 9570 }, { "epoch": 3.19212808539026, "ref_ce_loss": 0.22573909163475037, "step": 9570 }, { "epoch": 3.1954636424282854, "loss": 0.9896, "step": 9580 }, { "epoch": 3.1954636424282854, "grad_norm": 2.4014036655426025, "step": 9580 }, { "epoch": 3.1954636424282854, "learning_rate": 0.0006366496848626232, "step": 9580 }, { "epoch": 3.1954636424282854, "loss": 1.5717496871948242, "step": 9580 }, { "ce_loss": 0.257915735244751, "epoch": 3.1954636424282854, "step": 9580 }, { "distill_loss": 0.33823972940444946, "epoch": 3.1954636424282854, "step": 9580 }, { "epoch": 3.1954636424282854, "ref_ce_loss": 0.16347834467887878, "step": 9580 }, { "epoch": 3.1954636424282854, "loss": 1.0040936470031738, "step": 9580 }, { "ce_loss": 0.2746289074420929, "epoch": 3.1954636424282854, "step": 9580 }, { "distill_loss": 0.37868228554725647, "epoch": 3.1954636424282854, "step": 9580 }, { "epoch": 3.1954636424282854, "ref_ce_loss": 0.1852872520685196, "step": 9580 }, { "epoch": 3.1987991994663107, "loss": 0.936, "step": 9590 }, { "epoch": 3.1987991994663107, "grad_norm": 1.3955729007720947, "step": 9590 }, { "epoch": 3.1987991994663107, "learning_rate": 0.0006363011567986361, "step": 9590 }, { "epoch": 3.1987991994663107, "loss": 0.7811391949653625, "step": 9590 }, { "ce_loss": 0.22975607216358185, "epoch": 3.1987991994663107, "step": 9590 }, { "distill_loss": 0.3405916094779968, "epoch": 3.1987991994663107, "step": 9590 }, { "epoch": 3.1987991994663107, "ref_ce_loss": 0.16584502160549164, "step": 9590 }, { "epoch": 3.1987991994663107, "loss": 1.0799206495285034, "step": 9590 }, { "ce_loss": 0.2468085139989853, "epoch": 3.1987991994663107, "step": 9590 }, { "distill_loss": 0.32198700308799744, "epoch": 3.1987991994663107, "step": 9590 }, { "epoch": 3.1987991994663107, "ref_ce_loss": 0.17793063819408417, "step": 9590 }, { "epoch": 3.202134756504336, "loss": 1.0321, "step": 9600 }, { "epoch": 3.202134756504336, "grad_norm": 2.6115453243255615, "step": 9600 }, { "epoch": 3.202134756504336, "learning_rate": 0.000635952352945772, "step": 9600 }, { "epoch": 3.202134756504336, "loss": 0.9059250354766846, "step": 9600 }, { "ce_loss": 0.23575346171855927, "epoch": 3.202134756504336, "step": 9600 }, { "distill_loss": 0.4092578887939453, "epoch": 3.202134756504336, "step": 9600 }, { "epoch": 3.202134756504336, "ref_ce_loss": 0.21158456802368164, "step": 9600 }, { "epoch": 3.202134756504336, "loss": 0.9089540839195251, "step": 9600 }, { "ce_loss": 0.2593563497066498, "epoch": 3.202134756504336, "step": 9600 }, { "distill_loss": 0.4364529550075531, "epoch": 3.202134756504336, "step": 9600 }, { "epoch": 3.202134756504336, "ref_ce_loss": 0.1588144451379776, "step": 9600 }, { "epoch": 3.2054703135423614, "loss": 0.9798, "step": 9610 }, { "epoch": 3.2054703135423614, "grad_norm": 2.0719449520111084, "step": 9610 }, { "epoch": 3.2054703135423614, "learning_rate": 0.0006356032737111226, "step": 9610 }, { "epoch": 3.2054703135423614, "loss": 0.967341423034668, "step": 9610 }, { "ce_loss": 0.25755563378334045, "epoch": 3.2054703135423614, "step": 9610 }, { "distill_loss": 0.38610100746154785, "epoch": 3.2054703135423614, "step": 9610 }, { "epoch": 3.2054703135423614, "ref_ce_loss": 0.18525567650794983, "step": 9610 }, { "epoch": 3.2054703135423614, "loss": 0.9186158180236816, "step": 9610 }, { "ce_loss": 0.19721673429012299, "epoch": 3.2054703135423614, "step": 9610 }, { "distill_loss": 0.4046347737312317, "epoch": 3.2054703135423614, "step": 9610 }, { "epoch": 3.2054703135423614, "ref_ce_loss": 0.18537798523902893, "step": 9610 }, { "epoch": 3.208805870580387, "loss": 1.0076, "step": 9620 }, { "epoch": 3.208805870580387, "grad_norm": 4.249617576599121, "step": 9620 }, { "epoch": 3.208805870580387, "learning_rate": 0.0006352539195021007, "step": 9620 }, { "epoch": 3.208805870580387, "loss": 0.931041955947876, "step": 9620 }, { "ce_loss": 0.23878906667232513, "epoch": 3.208805870580387, "step": 9620 }, { "distill_loss": 0.4049624502658844, "epoch": 3.208805870580387, "step": 9620 }, { "epoch": 3.208805870580387, "ref_ce_loss": 0.2291223406791687, "step": 9620 }, { "epoch": 3.208805870580387, "loss": 1.1312953233718872, "step": 9620 }, { "ce_loss": 0.16920793056488037, "epoch": 3.208805870580387, "step": 9620 }, { "distill_loss": 0.4426681399345398, "epoch": 3.208805870580387, "step": 9620 }, { "epoch": 3.208805870580387, "ref_ce_loss": 0.17689041793346405, "step": 9620 }, { "epoch": 3.212141427618412, "loss": 0.959, "step": 9630 }, { "epoch": 3.212141427618412, "grad_norm": 2.485501527786255, "step": 9630 }, { "epoch": 3.212141427618412, "learning_rate": 0.0006349042907264404, "step": 9630 }, { "epoch": 3.212141427618412, "loss": 0.7932411432266235, "step": 9630 }, { "ce_loss": 0.2088746428489685, "epoch": 3.212141427618412, "step": 9630 }, { "distill_loss": 0.38120004534721375, "epoch": 3.212141427618412, "step": 9630 }, { "epoch": 3.212141427618412, "ref_ce_loss": 0.15612326562404633, "step": 9630 }, { "epoch": 3.212141427618412, "loss": 0.9963729381561279, "step": 9630 }, { "ce_loss": 0.2870686948299408, "epoch": 3.212141427618412, "step": 9630 }, { "distill_loss": 0.4551391005516052, "epoch": 3.212141427618412, "step": 9630 }, { "epoch": 3.212141427618412, "ref_ce_loss": 0.21353857219219208, "step": 9630 }, { "epoch": 3.2154769846564375, "loss": 1.0746, "step": 9640 }, { "epoch": 3.2154769846564375, "grad_norm": 2.1060545444488525, "step": 9640 }, { "epoch": 3.2154769846564375, "learning_rate": 0.0006345543877921961, "step": 9640 }, { "epoch": 3.2154769846564375, "loss": 0.9149405360221863, "step": 9640 }, { "ce_loss": 0.26410815119743347, "epoch": 3.2154769846564375, "step": 9640 }, { "distill_loss": 0.44474872946739197, "epoch": 3.2154769846564375, "step": 9640 }, { "epoch": 3.2154769846564375, "ref_ce_loss": 0.16752924025058746, "step": 9640 }, { "epoch": 3.2154769846564375, "loss": 1.069939374923706, "step": 9640 }, { "ce_loss": 0.2878960967063904, "epoch": 3.2154769846564375, "step": 9640 }, { "distill_loss": 0.4400632381439209, "epoch": 3.2154769846564375, "step": 9640 }, { "epoch": 3.2154769846564375, "ref_ce_loss": 0.18491016328334808, "step": 9640 }, { "epoch": 3.218812541694463, "loss": 1.0146, "step": 9650 }, { "epoch": 3.218812541694463, "grad_norm": 1.780337929725647, "step": 9650 }, { "epoch": 3.218812541694463, "learning_rate": 0.000634204211107742, "step": 9650 }, { "epoch": 3.218812541694463, "loss": 0.9347214698791504, "step": 9650 }, { "ce_loss": 0.29288166761398315, "epoch": 3.218812541694463, "step": 9650 }, { "distill_loss": 0.3937554955482483, "epoch": 3.218812541694463, "step": 9650 }, { "epoch": 3.218812541694463, "ref_ce_loss": 0.1980094611644745, "step": 9650 }, { "epoch": 3.218812541694463, "loss": 0.8976361751556396, "step": 9650 }, { "ce_loss": 0.25129374861717224, "epoch": 3.218812541694463, "step": 9650 }, { "distill_loss": 0.4056212306022644, "epoch": 3.218812541694463, "step": 9650 }, { "epoch": 3.218812541694463, "ref_ce_loss": 0.1826409250497818, "step": 9650 }, { "epoch": 3.222148098732488, "loss": 1.02, "step": 9660 }, { "epoch": 3.222148098732488, "grad_norm": 1.546045184135437, "step": 9660 }, { "epoch": 3.222148098732488, "learning_rate": 0.0006338537610817722, "step": 9660 }, { "epoch": 3.222148098732488, "loss": 0.8476258516311646, "step": 9660 }, { "ce_loss": 0.19795475900173187, "epoch": 3.222148098732488, "step": 9660 }, { "distill_loss": 0.43406128883361816, "epoch": 3.222148098732488, "step": 9660 }, { "epoch": 3.222148098732488, "ref_ce_loss": 0.16292031109333038, "step": 9660 }, { "epoch": 3.222148098732488, "loss": 1.0019570589065552, "step": 9660 }, { "ce_loss": 0.3020380735397339, "epoch": 3.222148098732488, "step": 9660 }, { "distill_loss": 0.44856947660446167, "epoch": 3.222148098732488, "step": 9660 }, { "epoch": 3.222148098732488, "ref_ce_loss": 0.20614181458950043, "step": 9660 }, { "epoch": 3.2254836557705135, "loss": 1.0188, "step": 9670 }, { "epoch": 3.2254836557705135, "grad_norm": 2.204603672027588, "step": 9670 }, { "epoch": 3.2254836557705135, "learning_rate": 0.0006335030381232998, "step": 9670 }, { "epoch": 3.2254836557705135, "loss": 1.1082043647766113, "step": 9670 }, { "ce_loss": 0.2776288688182831, "epoch": 3.2254836557705135, "step": 9670 }, { "distill_loss": 0.5517551898956299, "epoch": 3.2254836557705135, "step": 9670 }, { "epoch": 3.2254836557705135, "ref_ce_loss": 0.22000828385353088, "step": 9670 }, { "epoch": 3.2254836557705135, "loss": 0.9066957235336304, "step": 9670 }, { "ce_loss": 0.19364385306835175, "epoch": 3.2254836557705135, "step": 9670 }, { "distill_loss": 0.46161043643951416, "epoch": 3.2254836557705135, "step": 9670 }, { "epoch": 3.2254836557705135, "ref_ce_loss": 0.1524243801832199, "step": 9670 }, { "epoch": 3.228819212808539, "loss": 1.1544, "step": 9680 }, { "epoch": 3.228819212808539, "grad_norm": 2.8075547218322754, "step": 9680 }, { "epoch": 3.228819212808539, "learning_rate": 0.0006331520426416556, "step": 9680 }, { "epoch": 3.228819212808539, "loss": 0.9625744223594666, "step": 9680 }, { "ce_loss": 0.22598817944526672, "epoch": 3.228819212808539, "step": 9680 }, { "distill_loss": 0.4440693259239197, "epoch": 3.228819212808539, "step": 9680 }, { "epoch": 3.228819212808539, "ref_ce_loss": 0.1613425761461258, "step": 9680 }, { "epoch": 3.228819212808539, "loss": 1.153572678565979, "step": 9680 }, { "ce_loss": 0.2614941895008087, "epoch": 3.228819212808539, "step": 9680 }, { "distill_loss": 0.434669554233551, "epoch": 3.228819212808539, "step": 9680 }, { "epoch": 3.228819212808539, "ref_ce_loss": 0.22456280887126923, "step": 9680 }, { "epoch": 3.2321547698465642, "loss": 1.0116, "step": 9690 }, { "epoch": 3.2321547698465642, "grad_norm": 1.9188731908798218, "step": 9690 }, { "epoch": 3.2321547698465642, "learning_rate": 0.0006328007750464895, "step": 9690 }, { "epoch": 3.2321547698465642, "loss": 0.9416834115982056, "step": 9690 }, { "ce_loss": 0.2584744095802307, "epoch": 3.2321547698465642, "step": 9690 }, { "distill_loss": 0.4141307473182678, "epoch": 3.2321547698465642, "step": 9690 }, { "epoch": 3.2321547698465642, "ref_ce_loss": 0.19777798652648926, "step": 9690 }, { "epoch": 3.2321547698465642, "loss": 0.8339923620223999, "step": 9690 }, { "ce_loss": 0.239766925573349, "epoch": 3.2321547698465642, "step": 9690 }, { "distill_loss": 0.4033082127571106, "epoch": 3.2321547698465642, "step": 9690 }, { "epoch": 3.2321547698465642, "ref_ce_loss": 0.1904684603214264, "step": 9690 }, { "epoch": 3.2354903268845896, "loss": 0.9304, "step": 9700 }, { "epoch": 3.2354903268845896, "grad_norm": 2.2980542182922363, "step": 9700 }, { "epoch": 3.2354903268845896, "learning_rate": 0.0006324492357477685, "step": 9700 }, { "epoch": 3.2354903268845896, "loss": 0.6538512706756592, "step": 9700 }, { "ce_loss": 0.15079143643379211, "epoch": 3.2354903268845896, "step": 9700 }, { "distill_loss": 0.3154073655605316, "epoch": 3.2354903268845896, "step": 9700 }, { "epoch": 3.2354903268845896, "ref_ce_loss": 0.13001316785812378, "step": 9700 }, { "epoch": 3.2354903268845896, "loss": 1.2124816179275513, "step": 9700 }, { "ce_loss": 0.27233535051345825, "epoch": 3.2354903268845896, "step": 9700 }, { "distill_loss": 0.4675295948982239, "epoch": 3.2354903268845896, "step": 9700 }, { "epoch": 3.2354903268845896, "ref_ce_loss": 0.22750096023082733, "step": 9700 }, { "epoch": 3.238825883922615, "loss": 1.032, "step": 9710 }, { "epoch": 3.238825883922615, "grad_norm": 2.910517454147339, "step": 9710 }, { "epoch": 3.238825883922615, "learning_rate": 0.0006320974251557769, "step": 9710 }, { "epoch": 3.238825883922615, "loss": 1.0354328155517578, "step": 9710 }, { "ce_loss": 0.27194535732269287, "epoch": 3.238825883922615, "step": 9710 }, { "distill_loss": 0.3568897247314453, "epoch": 3.238825883922615, "step": 9710 }, { "epoch": 3.238825883922615, "ref_ce_loss": 0.21175265312194824, "step": 9710 }, { "epoch": 3.238825883922615, "loss": 1.016433835029602, "step": 9710 }, { "ce_loss": 0.244206964969635, "epoch": 3.238825883922615, "step": 9710 }, { "distill_loss": 0.383299320936203, "epoch": 3.238825883922615, "step": 9710 }, { "epoch": 3.238825883922615, "ref_ce_loss": 0.22494517266750336, "step": 9710 }, { "epoch": 3.2421614409606403, "loss": 1.0236, "step": 9720 }, { "epoch": 3.2421614409606403, "grad_norm": 2.3274762630462646, "step": 9720 }, { "epoch": 3.2421614409606403, "learning_rate": 0.0006317453436811154, "step": 9720 }, { "epoch": 3.2421614409606403, "loss": 1.0498437881469727, "step": 9720 }, { "ce_loss": 0.3165692687034607, "epoch": 3.2421614409606403, "step": 9720 }, { "distill_loss": 0.44313669204711914, "epoch": 3.2421614409606403, "step": 9720 }, { "epoch": 3.2421614409606403, "ref_ce_loss": 0.22029148042201996, "step": 9720 }, { "epoch": 3.2421614409606403, "loss": 0.939366340637207, "step": 9720 }, { "ce_loss": 0.23943881690502167, "epoch": 3.2421614409606403, "step": 9720 }, { "distill_loss": 0.34780487418174744, "epoch": 3.2421614409606403, "step": 9720 }, { "epoch": 3.2421614409606403, "ref_ce_loss": 0.197583869099617, "step": 9720 }, { "epoch": 3.2454969979986656, "loss": 1.0895, "step": 9730 }, { "epoch": 3.2454969979986656, "grad_norm": 3.5488083362579346, "step": 9730 }, { "epoch": 3.2454969979986656, "learning_rate": 0.0006313929917347011, "step": 9730 }, { "epoch": 3.2454969979986656, "loss": 0.9897884130477905, "step": 9730 }, { "ce_loss": 0.2667106091976166, "epoch": 3.2454969979986656, "step": 9730 }, { "distill_loss": 0.5222178101539612, "epoch": 3.2454969979986656, "step": 9730 }, { "epoch": 3.2454969979986656, "ref_ce_loss": 0.20061685144901276, "step": 9730 }, { "epoch": 3.2454969979986656, "loss": 0.8107883930206299, "step": 9730 }, { "ce_loss": 0.20364002883434296, "epoch": 3.2454969979986656, "step": 9730 }, { "distill_loss": 0.3627731204032898, "epoch": 3.2454969979986656, "step": 9730 }, { "epoch": 3.2454969979986656, "ref_ce_loss": 0.1910681426525116, "step": 9730 }, { "epoch": 3.248832555036691, "loss": 1.003, "step": 9740 }, { "epoch": 3.248832555036691, "grad_norm": 2.29548978805542, "step": 9740 }, { "epoch": 3.248832555036691, "learning_rate": 0.0006310403697277663, "step": 9740 }, { "epoch": 3.248832555036691, "loss": 1.0931097269058228, "step": 9740 }, { "ce_loss": 0.29702019691467285, "epoch": 3.248832555036691, "step": 9740 }, { "distill_loss": 0.45877814292907715, "epoch": 3.248832555036691, "step": 9740 }, { "epoch": 3.248832555036691, "ref_ce_loss": 0.2571999728679657, "step": 9740 }, { "epoch": 3.248832555036691, "loss": 1.3060147762298584, "step": 9740 }, { "ce_loss": 0.2924763262271881, "epoch": 3.248832555036691, "step": 9740 }, { "distill_loss": 0.46052947640419006, "epoch": 3.248832555036691, "step": 9740 }, { "epoch": 3.248832555036691, "ref_ce_loss": 0.2098548263311386, "step": 9740 }, { "epoch": 3.2521681120747163, "loss": 1.0576, "step": 9750 }, { "epoch": 3.2521681120747163, "grad_norm": 1.9056835174560547, "step": 9750 }, { "epoch": 3.2521681120747163, "learning_rate": 0.0006306874780718593, "step": 9750 }, { "epoch": 3.2521681120747163, "loss": 0.9245695471763611, "step": 9750 }, { "ce_loss": 0.3011467158794403, "epoch": 3.2521681120747163, "step": 9750 }, { "distill_loss": 0.3892473578453064, "epoch": 3.2521681120747163, "step": 9750 }, { "epoch": 3.2521681120747163, "ref_ce_loss": 0.1900477558374405, "step": 9750 }, { "epoch": 3.2521681120747163, "loss": 1.0070749521255493, "step": 9750 }, { "ce_loss": 0.2565157115459442, "epoch": 3.2521681120747163, "step": 9750 }, { "distill_loss": 0.44015613198280334, "epoch": 3.2521681120747163, "step": 9750 }, { "epoch": 3.2521681120747163, "ref_ce_loss": 0.2389518767595291, "step": 9750 }, { "epoch": 3.2555036691127417, "loss": 0.949, "step": 9760 }, { "epoch": 3.2555036691127417, "grad_norm": 1.44841468334198, "step": 9760 }, { "epoch": 3.2555036691127417, "learning_rate": 0.0006303343171788422, "step": 9760 }, { "epoch": 3.2555036691127417, "loss": 0.9746924042701721, "step": 9760 }, { "ce_loss": 0.3471539318561554, "epoch": 3.2555036691127417, "step": 9760 }, { "distill_loss": 0.3837020993232727, "epoch": 3.2555036691127417, "step": 9760 }, { "epoch": 3.2555036691127417, "ref_ce_loss": 0.19318555295467377, "step": 9760 }, { "epoch": 3.2555036691127417, "loss": 0.8271773457527161, "step": 9760 }, { "ce_loss": 0.24171385169029236, "epoch": 3.2555036691127417, "step": 9760 }, { "distill_loss": 0.34053492546081543, "epoch": 3.2555036691127417, "step": 9760 }, { "epoch": 3.2555036691127417, "ref_ce_loss": 0.18620966374874115, "step": 9760 }, { "epoch": 3.258839226150767, "loss": 0.9802, "step": 9770 }, { "epoch": 3.258839226150767, "grad_norm": 1.7812227010726929, "step": 9770 }, { "epoch": 3.258839226150767, "learning_rate": 0.0006299808874608919, "step": 9770 }, { "epoch": 3.258839226150767, "loss": 0.8848695755004883, "step": 9770 }, { "ce_loss": 0.2525423467159271, "epoch": 3.258839226150767, "step": 9770 }, { "distill_loss": 0.39496883749961853, "epoch": 3.258839226150767, "step": 9770 }, { "epoch": 3.258839226150767, "ref_ce_loss": 0.1729382425546646, "step": 9770 }, { "epoch": 3.258839226150767, "loss": 0.7401137948036194, "step": 9770 }, { "ce_loss": 0.21388928592205048, "epoch": 3.258839226150767, "step": 9770 }, { "distill_loss": 0.30214762687683105, "epoch": 3.258839226150767, "step": 9770 }, { "epoch": 3.258839226150767, "ref_ce_loss": 0.16249021887779236, "step": 9770 }, { "epoch": 3.2621747831887924, "loss": 0.9537, "step": 9780 }, { "epoch": 3.2621747831887924, "grad_norm": 1.8796312808990479, "step": 9780 }, { "epoch": 3.2621747831887924, "learning_rate": 0.0006296271893304992, "step": 9780 }, { "epoch": 3.2621747831887924, "loss": 1.426863193511963, "step": 9780 }, { "ce_loss": 0.306301474571228, "epoch": 3.2621747831887924, "step": 9780 }, { "distill_loss": 0.3925962448120117, "epoch": 3.2621747831887924, "step": 9780 }, { "epoch": 3.2621747831887924, "ref_ce_loss": 0.1583440750837326, "step": 9780 }, { "epoch": 3.2621747831887924, "loss": 0.8414177298545837, "step": 9780 }, { "ce_loss": 0.2551378905773163, "epoch": 3.2621747831887924, "step": 9780 }, { "distill_loss": 0.33810290694236755, "epoch": 3.2621747831887924, "step": 9780 }, { "epoch": 3.2621747831887924, "ref_ce_loss": 0.19451551139354706, "step": 9780 }, { "epoch": 3.2655103402268177, "loss": 0.9865, "step": 9790 }, { "epoch": 3.2655103402268177, "grad_norm": 1.5099884271621704, "step": 9790 }, { "epoch": 3.2655103402268177, "learning_rate": 0.0006292732232004675, "step": 9790 }, { "epoch": 3.2655103402268177, "loss": 0.8750608563423157, "step": 9790 }, { "ce_loss": 0.24614830315113068, "epoch": 3.2655103402268177, "step": 9790 }, { "distill_loss": 0.40064993500709534, "epoch": 3.2655103402268177, "step": 9790 }, { "epoch": 3.2655103402268177, "ref_ce_loss": 0.18194952607154846, "step": 9790 }, { "epoch": 3.2655103402268177, "loss": 1.3389601707458496, "step": 9790 }, { "ce_loss": 0.33404234051704407, "epoch": 3.2655103402268177, "step": 9790 }, { "distill_loss": 0.5011653304100037, "epoch": 3.2655103402268177, "step": 9790 }, { "epoch": 3.2655103402268177, "ref_ce_loss": 0.19928906857967377, "step": 9790 }, { "epoch": 3.268845897264843, "loss": 0.9772, "step": 9800 }, { "epoch": 3.268845897264843, "grad_norm": 1.6015758514404297, "step": 9800 }, { "epoch": 3.268845897264843, "learning_rate": 0.0006289189894839135, "step": 9800 }, { "epoch": 3.268845897264843, "loss": 0.7471339106559753, "step": 9800 }, { "ce_loss": 0.19947054982185364, "epoch": 3.268845897264843, "step": 9800 }, { "distill_loss": 0.33740973472595215, "epoch": 3.268845897264843, "step": 9800 }, { "epoch": 3.268845897264843, "ref_ce_loss": 0.2094244807958603, "step": 9800 }, { "epoch": 3.268845897264843, "loss": 0.9930317997932434, "step": 9800 }, { "ce_loss": 0.2759966552257538, "epoch": 3.268845897264843, "step": 9800 }, { "distill_loss": 0.4113280475139618, "epoch": 3.268845897264843, "step": 9800 }, { "epoch": 3.268845897264843, "ref_ce_loss": 0.241688534617424, "step": 9800 }, { "epoch": 3.2721814543028684, "loss": 0.888, "step": 9810 }, { "epoch": 3.2721814543028684, "grad_norm": 1.7993124723434448, "step": 9810 }, { "epoch": 3.2721814543028684, "learning_rate": 0.0006285644885942661, "step": 9810 }, { "epoch": 3.2721814543028684, "loss": 0.8379071950912476, "step": 9810 }, { "ce_loss": 0.24445652961730957, "epoch": 3.2721814543028684, "step": 9810 }, { "distill_loss": 0.32546156644821167, "epoch": 3.2721814543028684, "step": 9810 }, { "epoch": 3.2721814543028684, "ref_ce_loss": 0.1814669668674469, "step": 9810 }, { "epoch": 3.2721814543028684, "loss": 1.4976379871368408, "step": 9810 }, { "ce_loss": 0.26668861508369446, "epoch": 3.2721814543028684, "step": 9810 }, { "distill_loss": 0.37651145458221436, "epoch": 3.2721814543028684, "step": 9810 }, { "epoch": 3.2721814543028684, "ref_ce_loss": 0.20988450944423676, "step": 9810 }, { "epoch": 3.275517011340894, "loss": 0.982, "step": 9820 }, { "epoch": 3.275517011340894, "grad_norm": 2.3544046878814697, "step": 9820 }, { "epoch": 3.275517011340894, "learning_rate": 0.0006282097209452661, "step": 9820 }, { "epoch": 3.275517011340894, "loss": 0.9182118773460388, "step": 9820 }, { "ce_loss": 0.23315070569515228, "epoch": 3.275517011340894, "step": 9820 }, { "distill_loss": 0.4567936062812805, "epoch": 3.275517011340894, "step": 9820 }, { "epoch": 3.275517011340894, "ref_ce_loss": 0.18071922659873962, "step": 9820 }, { "epoch": 3.275517011340894, "loss": 1.4837809801101685, "step": 9820 }, { "ce_loss": 0.2943566143512726, "epoch": 3.275517011340894, "step": 9820 }, { "distill_loss": 0.46521204710006714, "epoch": 3.275517011340894, "step": 9820 }, { "epoch": 3.275517011340894, "ref_ce_loss": 0.24306204915046692, "step": 9820 }, { "epoch": 3.278852568378919, "loss": 0.9991, "step": 9830 }, { "epoch": 3.278852568378919, "grad_norm": 1.9920049905776978, "step": 9830 }, { "epoch": 3.278852568378919, "learning_rate": 0.0006278546869509651, "step": 9830 }, { "epoch": 3.278852568378919, "loss": 1.4693489074707031, "step": 9830 }, { "ce_loss": 0.24498344957828522, "epoch": 3.278852568378919, "step": 9830 }, { "distill_loss": 0.456991046667099, "epoch": 3.278852568378919, "step": 9830 }, { "epoch": 3.278852568378919, "ref_ce_loss": 0.1735164374113083, "step": 9830 }, { "epoch": 3.278852568378919, "loss": 0.9276065230369568, "step": 9830 }, { "ce_loss": 0.2843460738658905, "epoch": 3.278852568378919, "step": 9830 }, { "distill_loss": 0.43910086154937744, "epoch": 3.278852568378919, "step": 9830 }, { "epoch": 3.278852568378919, "ref_ce_loss": 0.20382851362228394, "step": 9830 }, { "epoch": 3.2821881254169445, "loss": 0.9999, "step": 9840 }, { "epoch": 3.2821881254169445, "grad_norm": 1.3909965753555298, "step": 9840 }, { "epoch": 3.2821881254169445, "learning_rate": 0.0006274993870257265, "step": 9840 }, { "epoch": 3.2821881254169445, "loss": 0.9617606401443481, "step": 9840 }, { "ce_loss": 0.23345062136650085, "epoch": 3.2821881254169445, "step": 9840 }, { "distill_loss": 0.37801143527030945, "epoch": 3.2821881254169445, "step": 9840 }, { "epoch": 3.2821881254169445, "ref_ce_loss": 0.20465831458568573, "step": 9840 }, { "epoch": 3.2821881254169445, "loss": 0.869107186794281, "step": 9840 }, { "ce_loss": 0.2557021379470825, "epoch": 3.2821881254169445, "step": 9840 }, { "distill_loss": 0.42464479804039, "epoch": 3.2821881254169445, "step": 9840 }, { "epoch": 3.2821881254169445, "ref_ce_loss": 0.15845350921154022, "step": 9840 }, { "epoch": 3.28552368245497, "loss": 1.0277, "step": 9850 }, { "epoch": 3.28552368245497, "grad_norm": 2.674233913421631, "step": 9850 }, { "epoch": 3.28552368245497, "learning_rate": 0.000627143821584223, "step": 9850 }, { "epoch": 3.28552368245497, "loss": 1.193628191947937, "step": 9850 }, { "ce_loss": 0.33579155802726746, "epoch": 3.28552368245497, "step": 9850 }, { "distill_loss": 0.48818734288215637, "epoch": 3.28552368245497, "step": 9850 }, { "epoch": 3.28552368245497, "ref_ce_loss": 0.24845334887504578, "step": 9850 }, { "epoch": 3.28552368245497, "loss": 0.9142537713050842, "step": 9850 }, { "ce_loss": 0.23797516524791718, "epoch": 3.28552368245497, "step": 9850 }, { "distill_loss": 0.40407562255859375, "epoch": 3.28552368245497, "step": 9850 }, { "epoch": 3.28552368245497, "ref_ce_loss": 0.18777459859848022, "step": 9850 }, { "epoch": 3.288859239492995, "loss": 1.088, "step": 9860 }, { "epoch": 3.288859239492995, "grad_norm": 1.5202012062072754, "step": 9860 }, { "epoch": 3.288859239492995, "learning_rate": 0.0006267879910414383, "step": 9860 }, { "epoch": 3.288859239492995, "loss": 1.2795214653015137, "step": 9860 }, { "ce_loss": 0.3531726896762848, "epoch": 3.288859239492995, "step": 9860 }, { "distill_loss": 0.5422238111495972, "epoch": 3.288859239492995, "step": 9860 }, { "epoch": 3.288859239492995, "ref_ce_loss": 0.21500453352928162, "step": 9860 }, { "epoch": 3.288859239492995, "loss": 0.6902878880500793, "step": 9860 }, { "ce_loss": 0.16109101474285126, "epoch": 3.288859239492995, "step": 9860 }, { "distill_loss": 0.34586697816848755, "epoch": 3.288859239492995, "step": 9860 }, { "epoch": 3.288859239492995, "ref_ce_loss": 0.18270200490951538, "step": 9860 }, { "epoch": 3.2921947965310205, "loss": 0.9348, "step": 9870 }, { "epoch": 3.2921947965310205, "grad_norm": 1.827506184577942, "step": 9870 }, { "epoch": 3.2921947965310205, "learning_rate": 0.0006264318958126645, "step": 9870 }, { "epoch": 3.2921947965310205, "loss": 0.8399723172187805, "step": 9870 }, { "ce_loss": 0.2216285616159439, "epoch": 3.2921947965310205, "step": 9870 }, { "distill_loss": 0.3668162524700165, "epoch": 3.2921947965310205, "step": 9870 }, { "epoch": 3.2921947965310205, "ref_ce_loss": 0.1240381971001625, "step": 9870 }, { "epoch": 3.2921947965310205, "loss": 1.3451710939407349, "step": 9870 }, { "ce_loss": 0.4018022119998932, "epoch": 3.2921947965310205, "step": 9870 }, { "distill_loss": 0.47588586807250977, "epoch": 3.2921947965310205, "step": 9870 }, { "epoch": 3.2921947965310205, "ref_ce_loss": 0.2462722659111023, "step": 9870 }, { "epoch": 3.295530353569046, "loss": 1.0435, "step": 9880 }, { "epoch": 3.295530353569046, "grad_norm": 1.4911365509033203, "step": 9880 }, { "epoch": 3.295530353569046, "learning_rate": 0.0006260755363135033, "step": 9880 }, { "epoch": 3.295530353569046, "loss": 0.9486567974090576, "step": 9880 }, { "ce_loss": 0.22665971517562866, "epoch": 3.295530353569046, "step": 9880 }, { "distill_loss": 0.45116129517555237, "epoch": 3.295530353569046, "step": 9880 }, { "epoch": 3.295530353569046, "ref_ce_loss": 0.1920081079006195, "step": 9880 }, { "epoch": 3.295530353569046, "loss": 0.8320479989051819, "step": 9880 }, { "ce_loss": 0.1915288269519806, "epoch": 3.295530353569046, "step": 9880 }, { "distill_loss": 0.33396807312965393, "epoch": 3.295530353569046, "step": 9880 }, { "epoch": 3.295530353569046, "ref_ce_loss": 0.18080615997314453, "step": 9880 }, { "epoch": 3.2988659106070712, "loss": 0.9529, "step": 9890 }, { "epoch": 3.2988659106070712, "grad_norm": 1.6564595699310303, "step": 9890 }, { "epoch": 3.2988659106070712, "learning_rate": 0.0006257189129598645, "step": 9890 }, { "epoch": 3.2988659106070712, "loss": 0.7733842730522156, "step": 9890 }, { "ce_loss": 0.22635827958583832, "epoch": 3.2988659106070712, "step": 9890 }, { "distill_loss": 0.3659568428993225, "epoch": 3.2988659106070712, "step": 9890 }, { "epoch": 3.2988659106070712, "ref_ce_loss": 0.1503317505121231, "step": 9890 }, { "epoch": 3.2988659106070712, "loss": 0.8225291967391968, "step": 9890 }, { "ce_loss": 0.24125638604164124, "epoch": 3.2988659106070712, "step": 9890 }, { "distill_loss": 0.3937651515007019, "epoch": 3.2988659106070712, "step": 9890 }, { "epoch": 3.2988659106070712, "ref_ce_loss": 0.18737755715847015, "step": 9890 }, { "epoch": 3.3022014676450966, "loss": 1.0092, "step": 9900 }, { "epoch": 3.3022014676450966, "grad_norm": 2.3174591064453125, "step": 9900 }, { "epoch": 3.3022014676450966, "learning_rate": 0.0006253620261679659, "step": 9900 }, { "epoch": 3.3022014676450966, "loss": 1.4922536611557007, "step": 9900 }, { "ce_loss": 0.2831789255142212, "epoch": 3.3022014676450966, "step": 9900 }, { "distill_loss": 0.4154704511165619, "epoch": 3.3022014676450966, "step": 9900 }, { "epoch": 3.3022014676450966, "ref_ce_loss": 0.23491549491882324, "step": 9900 }, { "epoch": 3.3022014676450966, "loss": 0.7731677889823914, "step": 9900 }, { "ce_loss": 0.18771180510520935, "epoch": 3.3022014676450966, "step": 9900 }, { "distill_loss": 0.3617543578147888, "epoch": 3.3022014676450966, "step": 9900 }, { "epoch": 3.3022014676450966, "ref_ce_loss": 0.1634535938501358, "step": 9900 }, { "epoch": 3.305537024683122, "loss": 1.0277, "step": 9910 }, { "epoch": 3.305537024683122, "grad_norm": 1.5574588775634766, "step": 9910 }, { "epoch": 3.305537024683122, "learning_rate": 0.0006250048763543326, "step": 9910 }, { "epoch": 3.305537024683122, "loss": 0.6726523637771606, "step": 9910 }, { "ce_loss": 0.15559059381484985, "epoch": 3.305537024683122, "step": 9910 }, { "distill_loss": 0.34071028232574463, "epoch": 3.305537024683122, "step": 9910 }, { "epoch": 3.305537024683122, "ref_ce_loss": 0.17615939676761627, "step": 9910 }, { "epoch": 3.305537024683122, "loss": 0.7978571653366089, "step": 9910 }, { "ce_loss": 0.24486663937568665, "epoch": 3.305537024683122, "step": 9910 }, { "distill_loss": 0.3446405529975891, "epoch": 3.305537024683122, "step": 9910 }, { "epoch": 3.305537024683122, "ref_ce_loss": 0.2081782966852188, "step": 9910 }, { "epoch": 3.3088725817211473, "loss": 0.9791, "step": 9920 }, { "epoch": 3.3088725817211473, "grad_norm": 3.7845053672790527, "step": 9920 }, { "epoch": 3.3088725817211473, "learning_rate": 0.0006246474639357973, "step": 9920 }, { "epoch": 3.3088725817211473, "loss": 1.7719227075576782, "step": 9920 }, { "ce_loss": 0.33106061816215515, "epoch": 3.3088725817211473, "step": 9920 }, { "distill_loss": 0.4632108211517334, "epoch": 3.3088725817211473, "step": 9920 }, { "epoch": 3.3088725817211473, "ref_ce_loss": 0.24532441794872284, "step": 9920 }, { "epoch": 3.3088725817211473, "loss": 0.9917610883712769, "step": 9920 }, { "ce_loss": 0.1957157701253891, "epoch": 3.3088725817211473, "step": 9920 }, { "distill_loss": 0.31435099244117737, "epoch": 3.3088725817211473, "step": 9920 }, { "epoch": 3.3088725817211473, "ref_ce_loss": 0.17546167969703674, "step": 9920 }, { "epoch": 3.3122081387591726, "loss": 1.0102, "step": 9930 }, { "epoch": 3.3122081387591726, "grad_norm": 2.843688726425171, "step": 9930 }, { "epoch": 3.3122081387591726, "learning_rate": 0.0006242897893294984, "step": 9930 }, { "epoch": 3.3122081387591726, "loss": 1.0776625871658325, "step": 9930 }, { "ce_loss": 0.25643882155418396, "epoch": 3.3122081387591726, "step": 9930 }, { "distill_loss": 0.3841439187526703, "epoch": 3.3122081387591726, "step": 9930 }, { "epoch": 3.3122081387591726, "ref_ce_loss": 0.23373252153396606, "step": 9930 }, { "epoch": 3.3122081387591726, "loss": 1.0383449792861938, "step": 9930 }, { "ce_loss": 0.32289958000183105, "epoch": 3.3122081387591726, "step": 9930 }, { "distill_loss": 0.43312522768974304, "epoch": 3.3122081387591726, "step": 9930 }, { "epoch": 3.3122081387591726, "ref_ce_loss": 0.21836471557617188, "step": 9930 }, { "epoch": 3.315543695797198, "loss": 1.0612, "step": 9940 }, { "epoch": 3.315543695797198, "grad_norm": 2.5459680557250977, "step": 9940 }, { "epoch": 3.315543695797198, "learning_rate": 0.000623931852952881, "step": 9940 }, { "epoch": 3.315543695797198, "loss": 0.7622495889663696, "step": 9940 }, { "ce_loss": 0.21401986479759216, "epoch": 3.315543695797198, "step": 9940 }, { "distill_loss": 0.3269309997558594, "epoch": 3.315543695797198, "step": 9940 }, { "epoch": 3.315543695797198, "ref_ce_loss": 0.18412983417510986, "step": 9940 }, { "epoch": 3.315543695797198, "loss": 0.906096339225769, "step": 9940 }, { "ce_loss": 0.2815874516963959, "epoch": 3.315543695797198, "step": 9940 }, { "distill_loss": 0.36980006098747253, "epoch": 3.315543695797198, "step": 9940 }, { "epoch": 3.315543695797198, "ref_ce_loss": 0.23133040964603424, "step": 9940 }, { "epoch": 3.3188792528352233, "loss": 0.991, "step": 9950 }, { "epoch": 3.3188792528352233, "grad_norm": 1.8961730003356934, "step": 9950 }, { "epoch": 3.3188792528352233, "learning_rate": 0.000623573655223695, "step": 9950 }, { "epoch": 3.3188792528352233, "loss": 0.9649955630302429, "step": 9950 }, { "ce_loss": 0.2432449609041214, "epoch": 3.3188792528352233, "step": 9950 }, { "distill_loss": 0.46102091670036316, "epoch": 3.3188792528352233, "step": 9950 }, { "epoch": 3.3188792528352233, "ref_ce_loss": 0.21075601875782013, "step": 9950 }, { "epoch": 3.3188792528352233, "loss": 1.0744799375534058, "step": 9950 }, { "ce_loss": 0.2550390362739563, "epoch": 3.3188792528352233, "step": 9950 }, { "distill_loss": 0.39716631174087524, "epoch": 3.3188792528352233, "step": 9950 }, { "epoch": 3.3188792528352233, "ref_ce_loss": 0.22736836969852448, "step": 9950 }, { "epoch": 3.3222148098732487, "loss": 1.0089, "step": 9960 }, { "epoch": 3.3222148098732487, "grad_norm": 1.9671109914779663, "step": 9960 }, { "epoch": 3.3222148098732487, "learning_rate": 0.0006232151965599956, "step": 9960 }, { "epoch": 3.3222148098732487, "loss": 1.4595311880111694, "step": 9960 }, { "ce_loss": 0.2657226026058197, "epoch": 3.3222148098732487, "step": 9960 }, { "distill_loss": 0.4428955614566803, "epoch": 3.3222148098732487, "step": 9960 }, { "epoch": 3.3222148098732487, "ref_ce_loss": 0.20473840832710266, "step": 9960 }, { "epoch": 3.3222148098732487, "loss": 0.846092700958252, "step": 9960 }, { "ce_loss": 0.19718728959560394, "epoch": 3.3222148098732487, "step": 9960 }, { "distill_loss": 0.3793059289455414, "epoch": 3.3222148098732487, "step": 9960 }, { "epoch": 3.3222148098732487, "ref_ce_loss": 0.2034681737422943, "step": 9960 }, { "epoch": 3.325550366911274, "loss": 1.038, "step": 9970 }, { "epoch": 3.325550366911274, "grad_norm": 1.7723923921585083, "step": 9970 }, { "epoch": 3.325550366911274, "learning_rate": 0.0006228564773801431, "step": 9970 }, { "epoch": 3.325550366911274, "loss": 0.8382503986358643, "step": 9970 }, { "ce_loss": 0.25603431463241577, "epoch": 3.325550366911274, "step": 9970 }, { "distill_loss": 0.3779396712779999, "epoch": 3.325550366911274, "step": 9970 }, { "epoch": 3.325550366911274, "ref_ce_loss": 0.20399893820285797, "step": 9970 }, { "epoch": 3.325550366911274, "loss": 0.7795833349227905, "step": 9970 }, { "ce_loss": 0.20238620042800903, "epoch": 3.325550366911274, "step": 9970 }, { "distill_loss": 0.363075852394104, "epoch": 3.325550366911274, "step": 9970 }, { "epoch": 3.325550366911274, "ref_ce_loss": 0.17283816635608673, "step": 9970 }, { "epoch": 3.3288859239492994, "loss": 1.0055, "step": 9980 }, { "epoch": 3.3288859239492994, "grad_norm": 1.9274988174438477, "step": 9980 }, { "epoch": 3.3288859239492994, "learning_rate": 0.0006224974981028012, "step": 9980 }, { "epoch": 3.3288859239492994, "loss": 0.7052872776985168, "step": 9980 }, { "ce_loss": 0.19602438807487488, "epoch": 3.3288859239492994, "step": 9980 }, { "distill_loss": 0.3226252794265747, "epoch": 3.3288859239492994, "step": 9980 }, { "epoch": 3.3288859239492994, "ref_ce_loss": 0.15171216428279877, "step": 9980 }, { "epoch": 3.3288859239492994, "loss": 0.9317882657051086, "step": 9980 }, { "ce_loss": 0.29486575722694397, "epoch": 3.3288859239492994, "step": 9980 }, { "distill_loss": 0.4522726237773895, "epoch": 3.3288859239492994, "step": 9980 }, { "epoch": 3.3288859239492994, "ref_ce_loss": 0.18425706028938293, "step": 9980 }, { "epoch": 3.3322214809873247, "loss": 0.9846, "step": 9990 }, { "epoch": 3.3322214809873247, "grad_norm": 2.507467746734619, "step": 9990 }, { "epoch": 3.3322214809873247, "learning_rate": 0.0006221382591469371, "step": 9990 }, { "epoch": 3.3322214809873247, "loss": 0.9769386053085327, "step": 9990 }, { "ce_loss": 0.30808940529823303, "epoch": 3.3322214809873247, "step": 9990 }, { "distill_loss": 0.40864020586013794, "epoch": 3.3322214809873247, "step": 9990 }, { "epoch": 3.3322214809873247, "ref_ce_loss": 0.21908801794052124, "step": 9990 }, { "epoch": 3.3322214809873247, "loss": 1.0252798795700073, "step": 9990 }, { "ce_loss": 0.17562223970890045, "epoch": 3.3322214809873247, "step": 9990 }, { "distill_loss": 0.3085121214389801, "epoch": 3.3322214809873247, "step": 9990 }, { "epoch": 3.3322214809873247, "ref_ce_loss": 0.19351007044315338, "step": 9990 }, { "epoch": 3.33555703802535, "loss": 0.9897, "step": 10000 }, { "epoch": 3.33555703802535, "grad_norm": 2.2838590145111084, "step": 10000 }, { "epoch": 3.33555703802535, "learning_rate": 0.0006217787609318217, "step": 10000 }, { "epoch": 3.33555703802535, "loss": 0.8830081820487976, "step": 10000 }, { "ce_loss": 0.21135838329792023, "epoch": 3.33555703802535, "step": 10000 }, { "distill_loss": 0.42949768900871277, "epoch": 3.33555703802535, "step": 10000 }, { "epoch": 3.33555703802535, "ref_ce_loss": 0.18174947798252106, "step": 10000 }, { "epoch": 3.33555703802535, "loss": 0.8349977731704712, "step": 10000 }, { "ce_loss": 0.21382983028888702, "epoch": 3.33555703802535, "step": 10000 }, { "distill_loss": 0.3579995632171631, "epoch": 3.33555703802535, "step": 10000 }, { "epoch": 3.33555703802535, "ref_ce_loss": 0.20931722223758698, "step": 10000 }, { "epoch": 3.3388925950633754, "loss": 0.9685, "step": 10010 }, { "epoch": 3.3388925950633754, "grad_norm": 1.8487814664840698, "step": 10010 }, { "epoch": 3.3388925950633754, "learning_rate": 0.0006214190038770278, "step": 10010 }, { "epoch": 3.3388925950633754, "loss": 1.5042794942855835, "step": 10010 }, { "ce_loss": 0.2764376997947693, "epoch": 3.3388925950633754, "step": 10010 }, { "distill_loss": 0.43299469351768494, "epoch": 3.3388925950633754, "step": 10010 }, { "epoch": 3.3388925950633754, "ref_ce_loss": 0.21280337870121002, "step": 10010 }, { "epoch": 3.3388925950633754, "loss": 0.9258934259414673, "step": 10010 }, { "ce_loss": 0.25367021560668945, "epoch": 3.3388925950633754, "step": 10010 }, { "distill_loss": 0.34598594903945923, "epoch": 3.3388925950633754, "step": 10010 }, { "epoch": 3.3388925950633754, "ref_ce_loss": 0.20186175405979156, "step": 10010 }, { "epoch": 3.342228152101401, "loss": 1.023, "step": 10020 }, { "epoch": 3.342228152101401, "grad_norm": 2.127298355102539, "step": 10020 }, { "epoch": 3.342228152101401, "learning_rate": 0.0006210589884024307, "step": 10020 }, { "epoch": 3.342228152101401, "loss": 1.0435254573822021, "step": 10020 }, { "ce_loss": 0.2094258964061737, "epoch": 3.342228152101401, "step": 10020 }, { "distill_loss": 0.3154853582382202, "epoch": 3.342228152101401, "step": 10020 }, { "epoch": 3.342228152101401, "ref_ce_loss": 0.19164526462554932, "step": 10020 }, { "epoch": 3.342228152101401, "loss": 0.9915556907653809, "step": 10020 }, { "ce_loss": 0.2046308070421219, "epoch": 3.342228152101401, "step": 10020 }, { "distill_loss": 0.41096386313438416, "epoch": 3.342228152101401, "step": 10020 }, { "epoch": 3.342228152101401, "ref_ce_loss": 0.14242751896381378, "step": 10020 }, { "epoch": 3.345563709139426, "loss": 1.0016, "step": 10030 }, { "epoch": 3.345563709139426, "grad_norm": 2.440781831741333, "step": 10030 }, { "epoch": 3.345563709139426, "learning_rate": 0.0006206987149282073, "step": 10030 }, { "epoch": 3.345563709139426, "loss": 1.0040383338928223, "step": 10030 }, { "ce_loss": 0.2689066231250763, "epoch": 3.345563709139426, "step": 10030 }, { "distill_loss": 0.48362284898757935, "epoch": 3.345563709139426, "step": 10030 }, { "epoch": 3.345563709139426, "ref_ce_loss": 0.20056311786174774, "step": 10030 }, { "epoch": 3.345563709139426, "loss": 0.9044297337532043, "step": 10030 }, { "ce_loss": 0.21311141550540924, "epoch": 3.345563709139426, "step": 10030 }, { "distill_loss": 0.48758459091186523, "epoch": 3.345563709139426, "step": 10030 }, { "epoch": 3.345563709139426, "ref_ce_loss": 0.2009950578212738, "step": 10030 }, { "epoch": 3.3488992661774515, "loss": 1.0939, "step": 10040 }, { "epoch": 3.3488992661774515, "grad_norm": 2.704503297805786, "step": 10040 }, { "epoch": 3.3488992661774515, "learning_rate": 0.0006203381838748353, "step": 10040 }, { "epoch": 3.3488992661774515, "loss": 0.9915063977241516, "step": 10040 }, { "ce_loss": 0.24685484170913696, "epoch": 3.3488992661774515, "step": 10040 }, { "distill_loss": 0.4709896743297577, "epoch": 3.3488992661774515, "step": 10040 }, { "epoch": 3.3488992661774515, "ref_ce_loss": 0.2363121062517166, "step": 10040 }, { "epoch": 3.3488992661774515, "loss": 1.0160868167877197, "step": 10040 }, { "ce_loss": 0.2190462350845337, "epoch": 3.3488992661774515, "step": 10040 }, { "distill_loss": 0.49053841829299927, "epoch": 3.3488992661774515, "step": 10040 }, { "epoch": 3.3488992661774515, "ref_ce_loss": 0.16573746502399445, "step": 10040 }, { "epoch": 3.352234823215477, "loss": 1.0007, "step": 10050 }, { "epoch": 3.352234823215477, "grad_norm": 2.394056797027588, "step": 10050 }, { "epoch": 3.352234823215477, "learning_rate": 0.0006199773956630934, "step": 10050 }, { "epoch": 3.352234823215477, "loss": 0.9801971912384033, "step": 10050 }, { "ce_loss": 0.2730773389339447, "epoch": 3.352234823215477, "step": 10050 }, { "distill_loss": 0.3858831822872162, "epoch": 3.352234823215477, "step": 10050 }, { "epoch": 3.352234823215477, "ref_ce_loss": 0.19404278695583344, "step": 10050 }, { "epoch": 3.352234823215477, "loss": 0.8788453936576843, "step": 10050 }, { "ce_loss": 0.2635515332221985, "epoch": 3.352234823215477, "step": 10050 }, { "distill_loss": 0.38254135847091675, "epoch": 3.352234823215477, "step": 10050 }, { "epoch": 3.352234823215477, "ref_ce_loss": 0.17534977197647095, "step": 10050 }, { "epoch": 3.355570380253502, "loss": 1.001, "step": 10060 }, { "epoch": 3.355570380253502, "grad_norm": 3.505295753479004, "step": 10060 }, { "epoch": 3.355570380253502, "learning_rate": 0.0006196163507140602, "step": 10060 }, { "epoch": 3.355570380253502, "loss": 0.7710726261138916, "step": 10060 }, { "ce_loss": 0.19496233761310577, "epoch": 3.355570380253502, "step": 10060 }, { "distill_loss": 0.38631999492645264, "epoch": 3.355570380253502, "step": 10060 }, { "epoch": 3.355570380253502, "ref_ce_loss": 0.18970170617103577, "step": 10060 }, { "epoch": 3.355570380253502, "loss": 0.8161435127258301, "step": 10060 }, { "ce_loss": 0.21463973820209503, "epoch": 3.355570380253502, "step": 10060 }, { "distill_loss": 0.42355772852897644, "epoch": 3.355570380253502, "step": 10060 }, { "epoch": 3.355570380253502, "ref_ce_loss": 0.17776453495025635, "step": 10060 }, { "epoch": 3.3589059372915275, "loss": 1.0532, "step": 10070 }, { "epoch": 3.3589059372915275, "grad_norm": 1.9118040800094604, "step": 10070 }, { "epoch": 3.3589059372915275, "learning_rate": 0.000619255049449114, "step": 10070 }, { "epoch": 3.3589059372915275, "loss": 0.8997465372085571, "step": 10070 }, { "ce_loss": 0.23062358796596527, "epoch": 3.3589059372915275, "step": 10070 }, { "distill_loss": 0.39321815967559814, "epoch": 3.3589059372915275, "step": 10070 }, { "epoch": 3.3589059372915275, "ref_ce_loss": 0.2236202508211136, "step": 10070 }, { "epoch": 3.3589059372915275, "loss": 0.9231383800506592, "step": 10070 }, { "ce_loss": 0.2514480948448181, "epoch": 3.3589059372915275, "step": 10070 }, { "distill_loss": 0.3488033711910248, "epoch": 3.3589059372915275, "step": 10070 }, { "epoch": 3.3589059372915275, "ref_ce_loss": 0.19309905171394348, "step": 10070 }, { "epoch": 3.362241494329553, "loss": 0.9295, "step": 10080 }, { "epoch": 3.362241494329553, "grad_norm": 2.3443384170532227, "step": 10080 }, { "epoch": 3.362241494329553, "learning_rate": 0.0006188934922899324, "step": 10080 }, { "epoch": 3.362241494329553, "loss": 0.8065698146820068, "step": 10080 }, { "ce_loss": 0.24393002688884735, "epoch": 3.362241494329553, "step": 10080 }, { "distill_loss": 0.33297401666641235, "epoch": 3.362241494329553, "step": 10080 }, { "epoch": 3.362241494329553, "ref_ce_loss": 0.18890532851219177, "step": 10080 }, { "epoch": 3.362241494329553, "loss": 0.9825599789619446, "step": 10080 }, { "ce_loss": 0.2975650727748871, "epoch": 3.362241494329553, "step": 10080 }, { "distill_loss": 0.4654487371444702, "epoch": 3.362241494329553, "step": 10080 }, { "epoch": 3.362241494329553, "ref_ce_loss": 0.1721816211938858, "step": 10080 }, { "epoch": 3.3655770513675782, "loss": 1.0455, "step": 10090 }, { "epoch": 3.3655770513675782, "grad_norm": 3.9115467071533203, "step": 10090 }, { "epoch": 3.3655770513675782, "learning_rate": 0.0006185316796584912, "step": 10090 }, { "epoch": 3.3655770513675782, "loss": 1.1405165195465088, "step": 10090 }, { "ce_loss": 0.2993922531604767, "epoch": 3.3655770513675782, "step": 10090 }, { "distill_loss": 0.46894127130508423, "epoch": 3.3655770513675782, "step": 10090 }, { "epoch": 3.3655770513675782, "ref_ce_loss": 0.18077045679092407, "step": 10090 }, { "epoch": 3.3655770513675782, "loss": 0.930509090423584, "step": 10090 }, { "ce_loss": 0.21698082983493805, "epoch": 3.3655770513675782, "step": 10090 }, { "distill_loss": 0.44211745262145996, "epoch": 3.3655770513675782, "step": 10090 }, { "epoch": 3.3655770513675782, "ref_ce_loss": 0.17781756818294525, "step": 10090 }, { "epoch": 3.3689126084056036, "loss": 1.0448, "step": 10100 }, { "epoch": 3.3689126084056036, "grad_norm": 1.9488577842712402, "step": 10100 }, { "epoch": 3.3689126084056036, "learning_rate": 0.0006181696119770651, "step": 10100 }, { "epoch": 3.3689126084056036, "loss": 1.8516427278518677, "step": 10100 }, { "ce_loss": 0.2993837296962738, "epoch": 3.3689126084056036, "step": 10100 }, { "distill_loss": 0.45562267303466797, "epoch": 3.3689126084056036, "step": 10100 }, { "epoch": 3.3689126084056036, "ref_ce_loss": 0.18279919028282166, "step": 10100 }, { "epoch": 3.3689126084056036, "loss": 0.956829309463501, "step": 10100 }, { "ce_loss": 0.22248753905296326, "epoch": 3.3689126084056036, "step": 10100 }, { "distill_loss": 0.4272536635398865, "epoch": 3.3689126084056036, "step": 10100 }, { "epoch": 3.3689126084056036, "ref_ce_loss": 0.15380191802978516, "step": 10100 }, { "epoch": 3.372248165443629, "loss": 1.0941, "step": 10110 }, { "epoch": 3.372248165443629, "grad_norm": 2.792809009552002, "step": 10110 }, { "epoch": 3.372248165443629, "learning_rate": 0.0006178072896682257, "step": 10110 }, { "epoch": 3.372248165443629, "loss": 1.0727574825286865, "step": 10110 }, { "ce_loss": 0.2581041753292084, "epoch": 3.372248165443629, "step": 10110 }, { "distill_loss": 0.47537761926651, "epoch": 3.372248165443629, "step": 10110 }, { "epoch": 3.372248165443629, "ref_ce_loss": 0.17899997532367706, "step": 10110 }, { "epoch": 3.372248165443629, "loss": 0.779070258140564, "step": 10110 }, { "ce_loss": 0.24334947764873505, "epoch": 3.372248165443629, "step": 10110 }, { "distill_loss": 0.3363867402076721, "epoch": 3.372248165443629, "step": 10110 }, { "epoch": 3.372248165443629, "ref_ce_loss": 0.15977618098258972, "step": 10110 }, { "epoch": 3.3755837224816543, "loss": 1.0354, "step": 10120 }, { "epoch": 3.3755837224816543, "grad_norm": 1.8961207866668701, "step": 10120 }, { "epoch": 3.3755837224816543, "learning_rate": 0.0006174447131548421, "step": 10120 }, { "epoch": 3.3755837224816543, "loss": 0.9844977259635925, "step": 10120 }, { "ce_loss": 0.3068873882293701, "epoch": 3.3755837224816543, "step": 10120 }, { "distill_loss": 0.4935329854488373, "epoch": 3.3755837224816543, "step": 10120 }, { "epoch": 3.3755837224816543, "ref_ce_loss": 0.1830204725265503, "step": 10120 }, { "epoch": 3.3755837224816543, "loss": 0.8539527654647827, "step": 10120 }, { "ce_loss": 0.27278390526771545, "epoch": 3.3755837224816543, "step": 10120 }, { "distill_loss": 0.3903404176235199, "epoch": 3.3755837224816543, "step": 10120 }, { "epoch": 3.3755837224816543, "ref_ce_loss": 0.19060420989990234, "step": 10120 }, { "epoch": 3.3789192795196796, "loss": 1.0105, "step": 10130 }, { "epoch": 3.3789192795196796, "grad_norm": 2.0325279235839844, "step": 10130 }, { "epoch": 3.3789192795196796, "learning_rate": 0.0006170818828600802, "step": 10130 }, { "epoch": 3.3789192795196796, "loss": 0.8580547571182251, "step": 10130 }, { "ce_loss": 0.24313877522945404, "epoch": 3.3789192795196796, "step": 10130 }, { "distill_loss": 0.306068480014801, "epoch": 3.3789192795196796, "step": 10130 }, { "epoch": 3.3789192795196796, "ref_ce_loss": 0.23506256937980652, "step": 10130 }, { "epoch": 3.3789192795196796, "loss": 0.7390968203544617, "step": 10130 }, { "ce_loss": 0.17966081202030182, "epoch": 3.3789192795196796, "step": 10130 }, { "distill_loss": 0.309879332780838, "epoch": 3.3789192795196796, "step": 10130 }, { "epoch": 3.3789192795196796, "ref_ce_loss": 0.18670016527175903, "step": 10130 }, { "epoch": 3.382254836557705, "loss": 1.0019, "step": 10140 }, { "epoch": 3.382254836557705, "grad_norm": 2.281717538833618, "step": 10140 }, { "epoch": 3.382254836557705, "learning_rate": 0.0006167187992074021, "step": 10140 }, { "epoch": 3.382254836557705, "loss": 0.6992474794387817, "step": 10140 }, { "ce_loss": 0.15938930213451385, "epoch": 3.382254836557705, "step": 10140 }, { "distill_loss": 0.36157307028770447, "epoch": 3.382254836557705, "step": 10140 }, { "epoch": 3.382254836557705, "ref_ce_loss": 0.17812681198120117, "step": 10140 }, { "epoch": 3.382254836557705, "loss": 1.0849311351776123, "step": 10140 }, { "ce_loss": 0.23215362429618835, "epoch": 3.382254836557705, "step": 10140 }, { "distill_loss": 0.4448317885398865, "epoch": 3.382254836557705, "step": 10140 }, { "epoch": 3.382254836557705, "ref_ce_loss": 0.20249246060848236, "step": 10140 }, { "epoch": 3.3855903935957303, "loss": 0.9154, "step": 10150 }, { "epoch": 3.3855903935957303, "grad_norm": 1.4965434074401855, "step": 10150 }, { "epoch": 3.3855903935957303, "learning_rate": 0.0006163554626205655, "step": 10150 }, { "epoch": 3.3855903935957303, "loss": 0.7812495827674866, "step": 10150 }, { "ce_loss": 0.255270391702652, "epoch": 3.3855903935957303, "step": 10150 }, { "distill_loss": 0.3302749693393707, "epoch": 3.3855903935957303, "step": 10150 }, { "epoch": 3.3855903935957303, "ref_ce_loss": 0.19548340141773224, "step": 10150 }, { "epoch": 3.3855903935957303, "loss": 0.9711526036262512, "step": 10150 }, { "ce_loss": 0.31296950578689575, "epoch": 3.3855903935957303, "step": 10150 }, { "distill_loss": 0.38431110978126526, "epoch": 3.3855903935957303, "step": 10150 }, { "epoch": 3.3855903935957303, "ref_ce_loss": 0.19149935245513916, "step": 10150 }, { "epoch": 3.3889259506337557, "loss": 0.8598, "step": 10160 }, { "epoch": 3.3889259506337557, "grad_norm": 2.4595344066619873, "step": 10160 }, { "epoch": 3.3889259506337557, "learning_rate": 0.0006159918735236232, "step": 10160 }, { "epoch": 3.3889259506337557, "loss": 1.1318943500518799, "step": 10160 }, { "ce_loss": 0.19478745758533478, "epoch": 3.3889259506337557, "step": 10160 }, { "distill_loss": 0.42160463333129883, "epoch": 3.3889259506337557, "step": 10160 }, { "epoch": 3.3889259506337557, "ref_ce_loss": 0.17801080644130707, "step": 10160 }, { "epoch": 3.3889259506337557, "loss": 1.0755106210708618, "step": 10160 }, { "ce_loss": 0.33530953526496887, "epoch": 3.3889259506337557, "step": 10160 }, { "distill_loss": 0.4514789581298828, "epoch": 3.3889259506337557, "step": 10160 }, { "epoch": 3.3889259506337557, "ref_ce_loss": 0.22947563230991364, "step": 10160 }, { "epoch": 3.392261507671781, "loss": 1.0222, "step": 10170 }, { "epoch": 3.392261507671781, "grad_norm": 2.17761492729187, "step": 10170 }, { "epoch": 3.392261507671781, "learning_rate": 0.0006156280323409227, "step": 10170 }, { "epoch": 3.392261507671781, "loss": 1.1920362710952759, "step": 10170 }, { "ce_loss": 0.2914145588874817, "epoch": 3.392261507671781, "step": 10170 }, { "distill_loss": 0.4543791711330414, "epoch": 3.392261507671781, "step": 10170 }, { "epoch": 3.392261507671781, "ref_ce_loss": 0.27146637439727783, "step": 10170 }, { "epoch": 3.392261507671781, "loss": 0.8673732280731201, "step": 10170 }, { "ce_loss": 0.280834436416626, "epoch": 3.392261507671781, "step": 10170 }, { "distill_loss": 0.42488738894462585, "epoch": 3.392261507671781, "step": 10170 }, { "epoch": 3.392261507671781, "ref_ce_loss": 0.16155914962291718, "step": 10170 }, { "epoch": 3.3955970647098064, "loss": 1.0552, "step": 10180 }, { "epoch": 3.3955970647098064, "grad_norm": 1.9043627977371216, "step": 10180 }, { "epoch": 3.3955970647098064, "learning_rate": 0.000615263939497106, "step": 10180 }, { "epoch": 3.3955970647098064, "loss": 0.870008111000061, "step": 10180 }, { "ce_loss": 0.2507705092430115, "epoch": 3.3955970647098064, "step": 10180 }, { "distill_loss": 0.3454810678958893, "epoch": 3.3955970647098064, "step": 10180 }, { "epoch": 3.3955970647098064, "ref_ce_loss": 0.18389473855495453, "step": 10180 }, { "epoch": 3.3955970647098064, "loss": 1.1123414039611816, "step": 10180 }, { "ce_loss": 0.2997893691062927, "epoch": 3.3955970647098064, "step": 10180 }, { "distill_loss": 0.4631223976612091, "epoch": 3.3955970647098064, "step": 10180 }, { "epoch": 3.3955970647098064, "ref_ce_loss": 0.2144378125667572, "step": 10180 }, { "epoch": 3.3989326217478317, "loss": 1.0243, "step": 10190 }, { "epoch": 3.3989326217478317, "grad_norm": 2.3104279041290283, "step": 10190 }, { "epoch": 3.3989326217478317, "learning_rate": 0.0006148995954171084, "step": 10190 }, { "epoch": 3.3989326217478317, "loss": 0.8569172620773315, "step": 10190 }, { "ce_loss": 0.24644263088703156, "epoch": 3.3989326217478317, "step": 10190 }, { "distill_loss": 0.39549142122268677, "epoch": 3.3989326217478317, "step": 10190 }, { "epoch": 3.3989326217478317, "ref_ce_loss": 0.18206985294818878, "step": 10190 }, { "epoch": 3.3989326217478317, "loss": 1.314200758934021, "step": 10190 }, { "ce_loss": 0.30596616864204407, "epoch": 3.3989326217478317, "step": 10190 }, { "distill_loss": 0.37953469157218933, "epoch": 3.3989326217478317, "step": 10190 }, { "epoch": 3.3989326217478317, "ref_ce_loss": 0.24160854518413544, "step": 10190 }, { "epoch": 3.402268178785857, "loss": 0.9644, "step": 10200 }, { "epoch": 3.402268178785857, "grad_norm": 1.7902569770812988, "step": 10200 }, { "epoch": 3.402268178785857, "learning_rate": 0.000614535000526159, "step": 10200 }, { "epoch": 3.402268178785857, "loss": 0.8502125144004822, "step": 10200 }, { "ce_loss": 0.20533131062984467, "epoch": 3.402268178785857, "step": 10200 }, { "distill_loss": 0.4242219924926758, "epoch": 3.402268178785857, "step": 10200 }, { "epoch": 3.402268178785857, "ref_ce_loss": 0.1732807606458664, "step": 10200 }, { "epoch": 3.402268178785857, "loss": 1.1380047798156738, "step": 10200 }, { "ce_loss": 0.2311955988407135, "epoch": 3.402268178785857, "step": 10200 }, { "distill_loss": 0.3482438623905182, "epoch": 3.402268178785857, "step": 10200 }, { "epoch": 3.402268178785857, "ref_ce_loss": 0.17975008487701416, "step": 10200 }, { "epoch": 3.4056037358238824, "loss": 0.9528, "step": 10210 }, { "epoch": 3.4056037358238824, "grad_norm": 2.735464096069336, "step": 10210 }, { "epoch": 3.4056037358238824, "learning_rate": 0.000614170155249779, "step": 10210 }, { "epoch": 3.4056037358238824, "loss": 0.9691909551620483, "step": 10210 }, { "ce_loss": 0.2766101360321045, "epoch": 3.4056037358238824, "step": 10210 }, { "distill_loss": 0.46005550026893616, "epoch": 3.4056037358238824, "step": 10210 }, { "epoch": 3.4056037358238824, "ref_ce_loss": 0.17615088820457458, "step": 10210 }, { "epoch": 3.4056037358238824, "loss": 1.0173518657684326, "step": 10210 }, { "ce_loss": 0.2465779185295105, "epoch": 3.4056037358238824, "step": 10210 }, { "distill_loss": 0.400485098361969, "epoch": 3.4056037358238824, "step": 10210 }, { "epoch": 3.4056037358238824, "ref_ce_loss": 0.15484775602817535, "step": 10210 }, { "epoch": 3.4089392928619078, "loss": 0.958, "step": 10220 }, { "epoch": 3.4089392928619078, "grad_norm": 2.010167121887207, "step": 10220 }, { "epoch": 3.4089392928619078, "learning_rate": 0.0006138050600137822, "step": 10220 }, { "epoch": 3.4089392928619078, "loss": 1.0048978328704834, "step": 10220 }, { "ce_loss": 0.27049142122268677, "epoch": 3.4089392928619078, "step": 10220 }, { "distill_loss": 0.46906766295433044, "epoch": 3.4089392928619078, "step": 10220 }, { "epoch": 3.4089392928619078, "ref_ce_loss": 0.20254988968372345, "step": 10220 }, { "epoch": 3.4089392928619078, "loss": 0.975196361541748, "step": 10220 }, { "ce_loss": 0.22158262133598328, "epoch": 3.4089392928619078, "step": 10220 }, { "distill_loss": 0.43164435029029846, "epoch": 3.4089392928619078, "step": 10220 }, { "epoch": 3.4089392928619078, "ref_ce_loss": 0.17377683520317078, "step": 10220 }, { "epoch": 3.412274849899933, "loss": 0.963, "step": 10230 }, { "epoch": 3.412274849899933, "grad_norm": 2.933204174041748, "step": 10230 }, { "epoch": 3.412274849899933, "learning_rate": 0.000613439715244274, "step": 10230 }, { "epoch": 3.412274849899933, "loss": 0.7380354404449463, "step": 10230 }, { "ce_loss": 0.21490693092346191, "epoch": 3.412274849899933, "step": 10230 }, { "distill_loss": 0.3523274064064026, "epoch": 3.412274849899933, "step": 10230 }, { "epoch": 3.412274849899933, "ref_ce_loss": 0.17062704265117645, "step": 10230 }, { "epoch": 3.412274849899933, "loss": 0.9718881845474243, "step": 10230 }, { "ce_loss": 0.2706681489944458, "epoch": 3.412274849899933, "step": 10230 }, { "distill_loss": 0.407747358083725, "epoch": 3.412274849899933, "step": 10230 }, { "epoch": 3.412274849899933, "ref_ce_loss": 0.16677233576774597, "step": 10230 }, { "epoch": 3.4156104069379585, "loss": 1.0634, "step": 10240 }, { "epoch": 3.4156104069379585, "grad_norm": 3.1097564697265625, "step": 10240 }, { "epoch": 3.4156104069379585, "learning_rate": 0.0006130741213676509, "step": 10240 }, { "epoch": 3.4156104069379585, "loss": 1.0423635244369507, "step": 10240 }, { "ce_loss": 0.2795296907424927, "epoch": 3.4156104069379585, "step": 10240 }, { "distill_loss": 0.3979673385620117, "epoch": 3.4156104069379585, "step": 10240 }, { "epoch": 3.4156104069379585, "ref_ce_loss": 0.19955852627754211, "step": 10240 }, { "epoch": 3.4156104069379585, "loss": 0.8001618385314941, "step": 10240 }, { "ce_loss": 0.22154352068901062, "epoch": 3.4156104069379585, "step": 10240 }, { "distill_loss": 0.33247750997543335, "epoch": 3.4156104069379585, "step": 10240 }, { "epoch": 3.4156104069379585, "ref_ce_loss": 0.1758543998003006, "step": 10240 }, { "epoch": 3.418945963975984, "loss": 0.9947, "step": 10250 }, { "epoch": 3.418945963975984, "grad_norm": 1.672417402267456, "step": 10250 }, { "epoch": 3.418945963975984, "learning_rate": 0.0006127082788106006, "step": 10250 }, { "epoch": 3.418945963975984, "loss": 0.9995819330215454, "step": 10250 }, { "ce_loss": 0.22473253309726715, "epoch": 3.418945963975984, "step": 10250 }, { "distill_loss": 0.4424959719181061, "epoch": 3.418945963975984, "step": 10250 }, { "epoch": 3.418945963975984, "ref_ce_loss": 0.18075767159461975, "step": 10250 }, { "epoch": 3.418945963975984, "loss": 0.96095871925354, "step": 10250 }, { "ce_loss": 0.2543027997016907, "epoch": 3.418945963975984, "step": 10250 }, { "distill_loss": 0.4390068054199219, "epoch": 3.418945963975984, "step": 10250 }, { "epoch": 3.418945963975984, "ref_ce_loss": 0.20820271968841553, "step": 10250 }, { "epoch": 3.422281521014009, "loss": 1.1135, "step": 10260 }, { "epoch": 3.422281521014009, "grad_norm": 1.6077980995178223, "step": 10260 }, { "epoch": 3.422281521014009, "learning_rate": 0.0006123421880001004, "step": 10260 }, { "epoch": 3.422281521014009, "loss": 0.7692959308624268, "step": 10260 }, { "ce_loss": 0.2198968529701233, "epoch": 3.422281521014009, "step": 10260 }, { "distill_loss": 0.37453675270080566, "epoch": 3.422281521014009, "step": 10260 }, { "epoch": 3.422281521014009, "ref_ce_loss": 0.17452290654182434, "step": 10260 }, { "epoch": 3.422281521014009, "loss": 0.8642454743385315, "step": 10260 }, { "ce_loss": 0.2118843048810959, "epoch": 3.422281521014009, "step": 10260 }, { "distill_loss": 0.3453439772129059, "epoch": 3.422281521014009, "step": 10260 }, { "epoch": 3.422281521014009, "ref_ce_loss": 0.1575404703617096, "step": 10260 }, { "epoch": 3.4256170780520345, "loss": 0.9918, "step": 10270 }, { "epoch": 3.4256170780520345, "grad_norm": 2.540724992752075, "step": 10270 }, { "epoch": 3.4256170780520345, "learning_rate": 0.000611975849363418, "step": 10270 }, { "epoch": 3.4256170780520345, "loss": 1.1537764072418213, "step": 10270 }, { "ce_loss": 0.31698283553123474, "epoch": 3.4256170780520345, "step": 10270 }, { "distill_loss": 0.4698025584220886, "epoch": 3.4256170780520345, "step": 10270 }, { "epoch": 3.4256170780520345, "ref_ce_loss": 0.26983755826950073, "step": 10270 }, { "epoch": 3.4256170780520345, "loss": 1.0079697370529175, "step": 10270 }, { "ce_loss": 0.24113315343856812, "epoch": 3.4256170780520345, "step": 10270 }, { "distill_loss": 0.3796297609806061, "epoch": 3.4256170780520345, "step": 10270 }, { "epoch": 3.4256170780520345, "ref_ce_loss": 0.23882921040058136, "step": 10270 }, { "epoch": 3.42895263509006, "loss": 0.955, "step": 10280 }, { "epoch": 3.42895263509006, "grad_norm": 1.3848929405212402, "step": 10280 }, { "epoch": 3.42895263509006, "learning_rate": 0.0006116092633281097, "step": 10280 }, { "epoch": 3.42895263509006, "loss": 0.7369468212127686, "step": 10280 }, { "ce_loss": 0.17177751660346985, "epoch": 3.42895263509006, "step": 10280 }, { "distill_loss": 0.27149325609207153, "epoch": 3.42895263509006, "step": 10280 }, { "epoch": 3.42895263509006, "ref_ce_loss": 0.15882954001426697, "step": 10280 }, { "epoch": 3.42895263509006, "loss": 1.093735933303833, "step": 10280 }, { "ce_loss": 0.31845054030418396, "epoch": 3.42895263509006, "step": 10280 }, { "distill_loss": 0.3451046049594879, "epoch": 3.42895263509006, "step": 10280 }, { "epoch": 3.42895263509006, "ref_ce_loss": 0.21175621449947357, "step": 10280 }, { "epoch": 3.4322881921280852, "loss": 0.9283, "step": 10290 }, { "epoch": 3.4322881921280852, "grad_norm": 3.0060324668884277, "step": 10290 }, { "epoch": 3.4322881921280852, "learning_rate": 0.0006112424303220212, "step": 10290 }, { "epoch": 3.4322881921280852, "loss": 0.8963453769683838, "step": 10290 }, { "ce_loss": 0.2973220646381378, "epoch": 3.4322881921280852, "step": 10290 }, { "distill_loss": 0.40442851185798645, "epoch": 3.4322881921280852, "step": 10290 }, { "epoch": 3.4322881921280852, "ref_ce_loss": 0.18964999914169312, "step": 10290 }, { "epoch": 3.4322881921280852, "loss": 1.4584048986434937, "step": 10290 }, { "ce_loss": 0.3163163363933563, "epoch": 3.4322881921280852, "step": 10290 }, { "distill_loss": 0.4408752918243408, "epoch": 3.4322881921280852, "step": 10290 }, { "epoch": 3.4322881921280852, "ref_ce_loss": 0.20503944158554077, "step": 10290 }, { "epoch": 3.4356237491661106, "loss": 1.0607, "step": 10300 }, { "epoch": 3.4356237491661106, "grad_norm": 2.1007776260375977, "step": 10300 }, { "epoch": 3.4356237491661106, "learning_rate": 0.0006108753507732857, "step": 10300 }, { "epoch": 3.4356237491661106, "loss": 0.8503654599189758, "step": 10300 }, { "ce_loss": 0.264160692691803, "epoch": 3.4356237491661106, "step": 10300 }, { "distill_loss": 0.38213497400283813, "epoch": 3.4356237491661106, "step": 10300 }, { "epoch": 3.4356237491661106, "ref_ce_loss": 0.16330452263355255, "step": 10300 }, { "epoch": 3.4356237491661106, "loss": 0.7780002355575562, "step": 10300 }, { "ce_loss": 0.1894020140171051, "epoch": 3.4356237491661106, "step": 10300 }, { "distill_loss": 0.3782142996788025, "epoch": 3.4356237491661106, "step": 10300 }, { "epoch": 3.4356237491661106, "ref_ce_loss": 0.17502427101135254, "step": 10300 }, { "epoch": 3.438959306204136, "loss": 0.9531, "step": 10310 }, { "epoch": 3.438959306204136, "grad_norm": 7.0569610595703125, "step": 10310 }, { "epoch": 3.438959306204136, "learning_rate": 0.0006105080251103248, "step": 10310 }, { "epoch": 3.438959306204136, "loss": 1.0448821783065796, "step": 10310 }, { "ce_loss": 0.31122633814811707, "epoch": 3.438959306204136, "step": 10310 }, { "distill_loss": 0.38214778900146484, "epoch": 3.438959306204136, "step": 10310 }, { "epoch": 3.438959306204136, "ref_ce_loss": 0.2752073407173157, "step": 10310 }, { "epoch": 3.438959306204136, "loss": 0.9181085824966431, "step": 10310 }, { "ce_loss": 0.2127479612827301, "epoch": 3.438959306204136, "step": 10310 }, { "distill_loss": 0.4602065980434418, "epoch": 3.438959306204136, "step": 10310 }, { "epoch": 3.438959306204136, "ref_ce_loss": 0.17844612896442413, "step": 10310 }, { "epoch": 3.4422948632421613, "loss": 1.1066, "step": 10320 }, { "epoch": 3.4422948632421613, "grad_norm": 2.075953960418701, "step": 10320 }, { "epoch": 3.4422948632421613, "learning_rate": 0.000610140453761847, "step": 10320 }, { "epoch": 3.4422948632421613, "loss": 0.8715020418167114, "step": 10320 }, { "ce_loss": 0.19599875807762146, "epoch": 3.4422948632421613, "step": 10320 }, { "distill_loss": 0.4138525128364563, "epoch": 3.4422948632421613, "step": 10320 }, { "epoch": 3.4422948632421613, "ref_ce_loss": 0.18956471979618073, "step": 10320 }, { "epoch": 3.4422948632421613, "loss": 0.7932437658309937, "step": 10320 }, { "ce_loss": 0.22833150625228882, "epoch": 3.4422948632421613, "step": 10320 }, { "distill_loss": 0.3652595281600952, "epoch": 3.4422948632421613, "step": 10320 }, { "epoch": 3.4422948632421613, "ref_ce_loss": 0.16161790490150452, "step": 10320 }, { "epoch": 3.4456304202801866, "loss": 0.9451, "step": 10330 }, { "epoch": 3.4456304202801866, "grad_norm": 1.7833718061447144, "step": 10330 }, { "epoch": 3.4456304202801866, "learning_rate": 0.0006097726371568475, "step": 10330 }, { "epoch": 3.4456304202801866, "loss": 1.0605350732803345, "step": 10330 }, { "ce_loss": 0.2801906168460846, "epoch": 3.4456304202801866, "step": 10330 }, { "distill_loss": 0.5158529281616211, "epoch": 3.4456304202801866, "step": 10330 }, { "epoch": 3.4456304202801866, "ref_ce_loss": 0.2109398990869522, "step": 10330 }, { "epoch": 3.4456304202801866, "loss": 0.8374050855636597, "step": 10330 }, { "ce_loss": 0.23242981731891632, "epoch": 3.4456304202801866, "step": 10330 }, { "distill_loss": 0.3921404778957367, "epoch": 3.4456304202801866, "step": 10330 }, { "epoch": 3.4456304202801866, "ref_ce_loss": 0.21246527135372162, "step": 10330 }, { "epoch": 3.448965977318212, "loss": 0.975, "step": 10340 }, { "epoch": 3.448965977318212, "grad_norm": 2.5993306636810303, "step": 10340 }, { "epoch": 3.448965977318212, "learning_rate": 0.0006094045757246081, "step": 10340 }, { "epoch": 3.448965977318212, "loss": 0.868039071559906, "step": 10340 }, { "ce_loss": 0.20514139533042908, "epoch": 3.448965977318212, "step": 10340 }, { "distill_loss": 0.4160865843296051, "epoch": 3.448965977318212, "step": 10340 }, { "epoch": 3.448965977318212, "ref_ce_loss": 0.1861317902803421, "step": 10340 }, { "epoch": 3.448965977318212, "loss": 0.841340959072113, "step": 10340 }, { "ce_loss": 0.22988004982471466, "epoch": 3.448965977318212, "step": 10340 }, { "distill_loss": 0.3877439498901367, "epoch": 3.448965977318212, "step": 10340 }, { "epoch": 3.448965977318212, "ref_ce_loss": 0.17941595613956451, "step": 10340 }, { "epoch": 3.4523015343562373, "loss": 1.0176, "step": 10350 }, { "epoch": 3.4523015343562373, "grad_norm": 2.8510682582855225, "step": 10350 }, { "epoch": 3.4523015343562373, "learning_rate": 0.000609036269894696, "step": 10350 }, { "epoch": 3.4523015343562373, "loss": 0.9242404699325562, "step": 10350 }, { "ce_loss": 0.2583213150501251, "epoch": 3.4523015343562373, "step": 10350 }, { "distill_loss": 0.35674169659614563, "epoch": 3.4523015343562373, "step": 10350 }, { "epoch": 3.4523015343562373, "ref_ce_loss": 0.1694251000881195, "step": 10350 }, { "epoch": 3.4523015343562373, "loss": 1.3669097423553467, "step": 10350 }, { "ce_loss": 0.28417572379112244, "epoch": 3.4523015343562373, "step": 10350 }, { "distill_loss": 0.510482668876648, "epoch": 3.4523015343562373, "step": 10350 }, { "epoch": 3.4523015343562373, "ref_ce_loss": 0.22851471602916718, "step": 10350 }, { "epoch": 3.4556370913942627, "loss": 0.9795, "step": 10360 }, { "epoch": 3.4556370913942627, "grad_norm": 1.5467017889022827, "step": 10360 }, { "epoch": 3.4556370913942627, "learning_rate": 0.0006086677200969636, "step": 10360 }, { "epoch": 3.4556370913942627, "loss": 1.1130493879318237, "step": 10360 }, { "ce_loss": 0.31027817726135254, "epoch": 3.4556370913942627, "step": 10360 }, { "distill_loss": 0.4479835033416748, "epoch": 3.4556370913942627, "step": 10360 }, { "epoch": 3.4556370913942627, "ref_ce_loss": 0.21718619763851166, "step": 10360 }, { "epoch": 3.4556370913942627, "loss": 0.9362666010856628, "step": 10360 }, { "ce_loss": 0.2514147162437439, "epoch": 3.4556370913942627, "step": 10360 }, { "distill_loss": 0.3675157427787781, "epoch": 3.4556370913942627, "step": 10360 }, { "epoch": 3.4556370913942627, "ref_ce_loss": 0.20163263380527496, "step": 10360 }, { "epoch": 3.458972648432288, "loss": 0.9831, "step": 10370 }, { "epoch": 3.458972648432288, "grad_norm": 2.5050151348114014, "step": 10370 }, { "epoch": 3.458972648432288, "learning_rate": 0.0006082989267615483, "step": 10370 }, { "epoch": 3.458972648432288, "loss": 1.8880741596221924, "step": 10370 }, { "ce_loss": 0.2876355051994324, "epoch": 3.458972648432288, "step": 10370 }, { "distill_loss": 0.4782940447330475, "epoch": 3.458972648432288, "step": 10370 }, { "epoch": 3.458972648432288, "ref_ce_loss": 0.16375888884067535, "step": 10370 }, { "epoch": 3.458972648432288, "loss": 1.1421231031417847, "step": 10370 }, { "ce_loss": 0.29885339736938477, "epoch": 3.458972648432288, "step": 10370 }, { "distill_loss": 0.5493630170822144, "epoch": 3.458972648432288, "step": 10370 }, { "epoch": 3.458972648432288, "ref_ce_loss": 0.24733687937259674, "step": 10370 }, { "epoch": 3.4623082054703134, "loss": 1.0893, "step": 10380 }, { "epoch": 3.4623082054703134, "grad_norm": 1.438820242881775, "step": 10380 }, { "epoch": 3.4623082054703134, "learning_rate": 0.0006079298903188715, "step": 10380 }, { "epoch": 3.4623082054703134, "loss": 1.4203903675079346, "step": 10380 }, { "ce_loss": 0.3444005250930786, "epoch": 3.4623082054703134, "step": 10380 }, { "distill_loss": 0.48849883675575256, "epoch": 3.4623082054703134, "step": 10380 }, { "epoch": 3.4623082054703134, "ref_ce_loss": 0.2962939143180847, "step": 10380 }, { "epoch": 3.4623082054703134, "loss": 0.9848767518997192, "step": 10380 }, { "ce_loss": 0.2705131471157074, "epoch": 3.4623082054703134, "step": 10380 }, { "distill_loss": 0.45133188366889954, "epoch": 3.4623082054703134, "step": 10380 }, { "epoch": 3.4623082054703134, "ref_ce_loss": 0.21035648882389069, "step": 10380 }, { "epoch": 3.4656437625083387, "loss": 0.9829, "step": 10390 }, { "epoch": 3.4656437625083387, "grad_norm": 2.2390544414520264, "step": 10390 }, { "epoch": 3.4656437625083387, "learning_rate": 0.0006075606111996386, "step": 10390 }, { "epoch": 3.4656437625083387, "loss": 1.0953900814056396, "step": 10390 }, { "ce_loss": 0.31749287247657776, "epoch": 3.4656437625083387, "step": 10390 }, { "distill_loss": 0.4555628001689911, "epoch": 3.4656437625083387, "step": 10390 }, { "epoch": 3.4656437625083387, "ref_ce_loss": 0.23297156393527985, "step": 10390 }, { "epoch": 3.4656437625083387, "loss": 0.8897223472595215, "step": 10390 }, { "ce_loss": 0.20808438956737518, "epoch": 3.4656437625083387, "step": 10390 }, { "distill_loss": 0.3228724002838135, "epoch": 3.4656437625083387, "step": 10390 }, { "epoch": 3.4656437625083387, "ref_ce_loss": 0.16926991939544678, "step": 10390 }, { "epoch": 3.468979319546364, "loss": 0.9657, "step": 10400 }, { "epoch": 3.468979319546364, "grad_norm": 2.1007516384124756, "step": 10400 }, { "epoch": 3.468979319546364, "learning_rate": 0.000607191089834838, "step": 10400 }, { "epoch": 3.468979319546364, "loss": 0.9438608288764954, "step": 10400 }, { "ce_loss": 0.3016405701637268, "epoch": 3.468979319546364, "step": 10400 }, { "distill_loss": 0.41761133074760437, "epoch": 3.468979319546364, "step": 10400 }, { "epoch": 3.468979319546364, "ref_ce_loss": 0.22418124973773956, "step": 10400 }, { "epoch": 3.468979319546364, "loss": 1.3007851839065552, "step": 10400 }, { "ce_loss": 0.3035561144351959, "epoch": 3.468979319546364, "step": 10400 }, { "distill_loss": 0.48503366112709045, "epoch": 3.468979319546364, "step": 10400 }, { "epoch": 3.468979319546364, "ref_ce_loss": 0.15072081983089447, "step": 10400 }, { "epoch": 3.4723148765843894, "loss": 1.0097, "step": 10410 }, { "epoch": 3.4723148765843894, "grad_norm": 1.3137832880020142, "step": 10410 }, { "epoch": 3.4723148765843894, "learning_rate": 0.0006068213266557409, "step": 10410 }, { "epoch": 3.4723148765843894, "loss": 1.0803608894348145, "step": 10410 }, { "ce_loss": 0.26398447155952454, "epoch": 3.4723148765843894, "step": 10410 }, { "distill_loss": 0.4109332263469696, "epoch": 3.4723148765843894, "step": 10410 }, { "epoch": 3.4723148765843894, "ref_ce_loss": 0.187424898147583, "step": 10410 }, { "epoch": 3.4723148765843894, "loss": 1.0211793184280396, "step": 10410 }, { "ce_loss": 0.3377877473831177, "epoch": 3.4723148765843894, "step": 10410 }, { "distill_loss": 0.4217336177825928, "epoch": 3.4723148765843894, "step": 10410 }, { "epoch": 3.4723148765843894, "ref_ce_loss": 0.21612432599067688, "step": 10410 }, { "epoch": 3.4756504336224148, "loss": 0.9997, "step": 10420 }, { "epoch": 3.4756504336224148, "grad_norm": 1.4655508995056152, "step": 10420 }, { "epoch": 3.4756504336224148, "learning_rate": 0.0006064513220939006, "step": 10420 }, { "epoch": 3.4756504336224148, "loss": 1.1324905157089233, "step": 10420 }, { "ce_loss": 0.262099027633667, "epoch": 3.4756504336224148, "step": 10420 }, { "distill_loss": 0.3017611503601074, "epoch": 3.4756504336224148, "step": 10420 }, { "epoch": 3.4756504336224148, "ref_ce_loss": 0.20544975996017456, "step": 10420 }, { "epoch": 3.4756504336224148, "loss": 0.9686199426651001, "step": 10420 }, { "ce_loss": 0.23206327855587006, "epoch": 3.4756504336224148, "step": 10420 }, { "distill_loss": 0.3936845362186432, "epoch": 3.4756504336224148, "step": 10420 }, { "epoch": 3.4756504336224148, "ref_ce_loss": 0.16485126316547394, "step": 10420 }, { "epoch": 3.47898599066044, "loss": 0.9794, "step": 10430 }, { "epoch": 3.47898599066044, "grad_norm": 1.958141803741455, "step": 10430 }, { "epoch": 3.47898599066044, "learning_rate": 0.0006060810765811525, "step": 10430 }, { "epoch": 3.47898599066044, "loss": 0.9960145950317383, "step": 10430 }, { "ce_loss": 0.2033989578485489, "epoch": 3.47898599066044, "step": 10430 }, { "distill_loss": 0.3653845489025116, "epoch": 3.47898599066044, "step": 10430 }, { "epoch": 3.47898599066044, "ref_ce_loss": 0.16449347138404846, "step": 10430 }, { "epoch": 3.47898599066044, "loss": 0.7821276187896729, "step": 10430 }, { "ce_loss": 0.23763516545295715, "epoch": 3.47898599066044, "step": 10430 }, { "distill_loss": 0.38961201906204224, "epoch": 3.47898599066044, "step": 10430 }, { "epoch": 3.47898599066044, "ref_ce_loss": 0.15473511815071106, "step": 10430 }, { "epoch": 3.4823215476984655, "loss": 0.9866, "step": 10440 }, { "epoch": 3.4823215476984655, "grad_norm": 2.7399749755859375, "step": 10440 }, { "epoch": 3.4823215476984655, "learning_rate": 0.0006057105905496125, "step": 10440 }, { "epoch": 3.4823215476984655, "loss": 1.2616089582443237, "step": 10440 }, { "ce_loss": 0.257465124130249, "epoch": 3.4823215476984655, "step": 10440 }, { "distill_loss": 0.5176580548286438, "epoch": 3.4823215476984655, "step": 10440 }, { "epoch": 3.4823215476984655, "ref_ce_loss": 0.18606305122375488, "step": 10440 }, { "epoch": 3.4823215476984655, "loss": 0.9673529863357544, "step": 10440 }, { "ce_loss": 0.2233096808195114, "epoch": 3.4823215476984655, "step": 10440 }, { "distill_loss": 0.42469459772109985, "epoch": 3.4823215476984655, "step": 10440 }, { "epoch": 3.4823215476984655, "ref_ce_loss": 0.16961297392845154, "step": 10440 }, { "epoch": 3.485657104736491, "loss": 1.0652, "step": 10450 }, { "epoch": 3.485657104736491, "grad_norm": 1.5313483476638794, "step": 10450 }, { "epoch": 3.485657104736491, "learning_rate": 0.0006053398644316782, "step": 10450 }, { "epoch": 3.485657104736491, "loss": 1.1006319522857666, "step": 10450 }, { "ce_loss": 0.29097020626068115, "epoch": 3.485657104736491, "step": 10450 }, { "distill_loss": 0.37788423895835876, "epoch": 3.485657104736491, "step": 10450 }, { "epoch": 3.485657104736491, "ref_ce_loss": 0.2049952894449234, "step": 10450 }, { "epoch": 3.485657104736491, "loss": 1.045691728591919, "step": 10450 }, { "ce_loss": 0.193770632147789, "epoch": 3.485657104736491, "step": 10450 }, { "distill_loss": 0.37574324011802673, "epoch": 3.485657104736491, "step": 10450 }, { "epoch": 3.485657104736491, "ref_ce_loss": 0.17712724208831787, "step": 10450 }, { "epoch": 3.488992661774516, "loss": 0.9613, "step": 10460 }, { "epoch": 3.488992661774516, "grad_norm": 2.053473711013794, "step": 10460 }, { "epoch": 3.488992661774516, "learning_rate": 0.0006049688986600266, "step": 10460 }, { "epoch": 3.488992661774516, "loss": 0.8386645317077637, "step": 10460 }, { "ce_loss": 0.2251548320055008, "epoch": 3.488992661774516, "step": 10460 }, { "distill_loss": 0.2944331169128418, "epoch": 3.488992661774516, "step": 10460 }, { "epoch": 3.488992661774516, "ref_ce_loss": 0.20060141384601593, "step": 10460 }, { "epoch": 3.488992661774516, "loss": 1.4073541164398193, "step": 10460 }, { "ce_loss": 0.261447548866272, "epoch": 3.488992661774516, "step": 10460 }, { "distill_loss": 0.4781648814678192, "epoch": 3.488992661774516, "step": 10460 }, { "epoch": 3.488992661774516, "ref_ce_loss": 0.19439777731895447, "step": 10460 }, { "epoch": 3.4923282188125415, "loss": 0.9895, "step": 10470 }, { "epoch": 3.4923282188125415, "grad_norm": 1.6879146099090576, "step": 10470 }, { "epoch": 3.4923282188125415, "learning_rate": 0.0006045976936676147, "step": 10470 }, { "epoch": 3.4923282188125415, "loss": 1.052851676940918, "step": 10470 }, { "ce_loss": 0.2677856683731079, "epoch": 3.4923282188125415, "step": 10470 }, { "distill_loss": 0.4257782995700836, "epoch": 3.4923282188125415, "step": 10470 }, { "epoch": 3.4923282188125415, "ref_ce_loss": 0.22846080362796783, "step": 10470 }, { "epoch": 3.4923282188125415, "loss": 1.032060980796814, "step": 10470 }, { "ce_loss": 0.26760971546173096, "epoch": 3.4923282188125415, "step": 10470 }, { "distill_loss": 0.46396347880363464, "epoch": 3.4923282188125415, "step": 10470 }, { "epoch": 3.4923282188125415, "ref_ce_loss": 0.22398428618907928, "step": 10470 }, { "epoch": 3.495663775850567, "loss": 1.0215, "step": 10480 }, { "epoch": 3.495663775850567, "grad_norm": 1.6211903095245361, "step": 10480 }, { "epoch": 3.495663775850567, "learning_rate": 0.0006042262498876785, "step": 10480 }, { "epoch": 3.495663775850567, "loss": 0.766634464263916, "step": 10480 }, { "ce_loss": 0.21379749476909637, "epoch": 3.495663775850567, "step": 10480 }, { "distill_loss": 0.324904203414917, "epoch": 3.495663775850567, "step": 10480 }, { "epoch": 3.495663775850567, "ref_ce_loss": 0.17826105654239655, "step": 10480 }, { "epoch": 3.495663775850567, "loss": 0.9297152757644653, "step": 10480 }, { "ce_loss": 0.20599283277988434, "epoch": 3.495663775850567, "step": 10480 }, { "distill_loss": 0.37327367067337036, "epoch": 3.495663775850567, "step": 10480 }, { "epoch": 3.495663775850567, "ref_ce_loss": 0.15590085089206696, "step": 10480 }, { "epoch": 3.498999332888592, "loss": 1.0309, "step": 10490 }, { "epoch": 3.498999332888592, "grad_norm": 1.9489325284957886, "step": 10490 }, { "epoch": 3.498999332888592, "learning_rate": 0.0006038545677537333, "step": 10490 }, { "epoch": 3.498999332888592, "loss": 0.8805686235427856, "step": 10490 }, { "ce_loss": 0.15386821329593658, "epoch": 3.498999332888592, "step": 10490 }, { "distill_loss": 0.38175851106643677, "epoch": 3.498999332888592, "step": 10490 }, { "epoch": 3.498999332888592, "ref_ce_loss": 0.13805674016475677, "step": 10490 }, { "epoch": 3.498999332888592, "loss": 1.1114535331726074, "step": 10490 }, { "ce_loss": 0.32401880621910095, "epoch": 3.498999332888592, "step": 10490 }, { "distill_loss": 0.46975576877593994, "epoch": 3.498999332888592, "step": 10490 }, { "epoch": 3.498999332888592, "ref_ce_loss": 0.24954313039779663, "step": 10490 }, { "epoch": 3.502334889926618, "loss": 0.9896, "step": 10500 }, { "epoch": 3.502334889926618, "grad_norm": 1.5480339527130127, "step": 10500 }, { "epoch": 3.502334889926618, "learning_rate": 0.0006034826476995715, "step": 10500 }, { "epoch": 3.502334889926618, "loss": 1.0643081665039062, "step": 10500 }, { "ce_loss": 0.24576300382614136, "epoch": 3.502334889926618, "step": 10500 }, { "distill_loss": 0.3770397901535034, "epoch": 3.502334889926618, "step": 10500 }, { "epoch": 3.502334889926618, "ref_ce_loss": 0.20040734112262726, "step": 10500 }, { "epoch": 3.502334889926618, "loss": 0.807522177696228, "step": 10500 }, { "ce_loss": 0.21417485177516937, "epoch": 3.502334889926618, "step": 10500 }, { "distill_loss": 0.4244995713233948, "epoch": 3.502334889926618, "step": 10500 }, { "epoch": 3.502334889926618, "ref_ce_loss": 0.16867026686668396, "step": 10500 }, { "epoch": 3.5056704469646434, "loss": 0.9092, "step": 10510 }, { "epoch": 3.5056704469646434, "grad_norm": 1.5492361783981323, "step": 10510 }, { "epoch": 3.5056704469646434, "learning_rate": 0.0006031104901592645, "step": 10510 }, { "epoch": 3.5056704469646434, "loss": 0.974293053150177, "step": 10510 }, { "ce_loss": 0.25700998306274414, "epoch": 3.5056704469646434, "step": 10510 }, { "distill_loss": 0.3952081799507141, "epoch": 3.5056704469646434, "step": 10510 }, { "epoch": 3.5056704469646434, "ref_ce_loss": 0.19461868703365326, "step": 10510 }, { "epoch": 3.5056704469646434, "loss": 1.076209306716919, "step": 10510 }, { "ce_loss": 0.26486635208129883, "epoch": 3.5056704469646434, "step": 10510 }, { "distill_loss": 0.460548460483551, "epoch": 3.5056704469646434, "step": 10510 }, { "epoch": 3.5056704469646434, "ref_ce_loss": 0.19604913890361786, "step": 10510 }, { "epoch": 3.5090060040026687, "loss": 0.9397, "step": 10520 }, { "epoch": 3.5090060040026687, "grad_norm": 2.0866904258728027, "step": 10520 }, { "epoch": 3.5090060040026687, "learning_rate": 0.0006027380955671598, "step": 10520 }, { "epoch": 3.5090060040026687, "loss": 0.7660278677940369, "step": 10520 }, { "ce_loss": 0.21146520972251892, "epoch": 3.5090060040026687, "step": 10520 }, { "distill_loss": 0.3692725598812103, "epoch": 3.5090060040026687, "step": 10520 }, { "epoch": 3.5090060040026687, "ref_ce_loss": 0.1851666420698166, "step": 10520 }, { "epoch": 3.5090060040026687, "loss": 0.8923248052597046, "step": 10520 }, { "ce_loss": 0.19822822511196136, "epoch": 3.5090060040026687, "step": 10520 }, { "distill_loss": 0.441368967294693, "epoch": 3.5090060040026687, "step": 10520 }, { "epoch": 3.5090060040026687, "ref_ce_loss": 0.1543390154838562, "step": 10520 }, { "epoch": 3.512341561040694, "loss": 0.9011, "step": 10530 }, { "epoch": 3.512341561040694, "grad_norm": 1.7885452508926392, "step": 10530 }, { "epoch": 3.512341561040694, "learning_rate": 0.000602365464357882, "step": 10530 }, { "epoch": 3.512341561040694, "loss": 1.063278079032898, "step": 10530 }, { "ce_loss": 0.31426993012428284, "epoch": 3.512341561040694, "step": 10530 }, { "distill_loss": 0.34604477882385254, "epoch": 3.512341561040694, "step": 10530 }, { "epoch": 3.512341561040694, "ref_ce_loss": 0.26331233978271484, "step": 10530 }, { "epoch": 3.512341561040694, "loss": 0.5608228445053101, "step": 10530 }, { "ce_loss": 0.16896271705627441, "epoch": 3.512341561040694, "step": 10530 }, { "distill_loss": 0.24089543521404266, "epoch": 3.512341561040694, "step": 10530 }, { "epoch": 3.512341561040694, "ref_ce_loss": 0.11015720665454865, "step": 10530 }, { "epoch": 3.5156771180787194, "loss": 0.9445, "step": 10540 }, { "epoch": 3.5156771180787194, "grad_norm": 1.596814751625061, "step": 10540 }, { "epoch": 3.5156771180787194, "learning_rate": 0.0006019925969663319, "step": 10540 }, { "epoch": 3.5156771180787194, "loss": 1.0660908222198486, "step": 10540 }, { "ce_loss": 0.24070100486278534, "epoch": 3.5156771180787194, "step": 10540 }, { "distill_loss": 0.3578681945800781, "epoch": 3.5156771180787194, "step": 10540 }, { "epoch": 3.5156771180787194, "ref_ce_loss": 0.2396494448184967, "step": 10540 }, { "epoch": 3.5156771180787194, "loss": 1.1939899921417236, "step": 10540 }, { "ce_loss": 0.25851884484291077, "epoch": 3.5156771180787194, "step": 10540 }, { "distill_loss": 0.4415932297706604, "epoch": 3.5156771180787194, "step": 10540 }, { "epoch": 3.5156771180787194, "ref_ce_loss": 0.23945797979831696, "step": 10540 }, { "epoch": 3.5190126751167448, "loss": 0.969, "step": 10550 }, { "epoch": 3.5190126751167448, "grad_norm": 2.290989875793457, "step": 10550 }, { "epoch": 3.5190126751167448, "learning_rate": 0.000601619493827686, "step": 10550 }, { "epoch": 3.5190126751167448, "loss": 0.8484582901000977, "step": 10550 }, { "ce_loss": 0.2210330218076706, "epoch": 3.5190126751167448, "step": 10550 }, { "distill_loss": 0.4313220977783203, "epoch": 3.5190126751167448, "step": 10550 }, { "epoch": 3.5190126751167448, "ref_ce_loss": 0.19588512182235718, "step": 10550 }, { "epoch": 3.5190126751167448, "loss": 1.1229126453399658, "step": 10550 }, { "ce_loss": 0.25314053893089294, "epoch": 3.5190126751167448, "step": 10550 }, { "distill_loss": 0.41929417848587036, "epoch": 3.5190126751167448, "step": 10550 }, { "epoch": 3.5190126751167448, "ref_ce_loss": 0.14629621803760529, "step": 10550 }, { "epoch": 3.52234823215477, "loss": 1.0361, "step": 10560 }, { "epoch": 3.52234823215477, "grad_norm": 1.6395354270935059, "step": 10560 }, { "epoch": 3.52234823215477, "learning_rate": 0.0006012461553773955, "step": 10560 }, { "epoch": 3.52234823215477, "loss": 0.9260293841362, "step": 10560 }, { "ce_loss": 0.29744088649749756, "epoch": 3.52234823215477, "step": 10560 }, { "distill_loss": 0.3920683264732361, "epoch": 3.52234823215477, "step": 10560 }, { "epoch": 3.52234823215477, "ref_ce_loss": 0.23435044288635254, "step": 10560 }, { "epoch": 3.52234823215477, "loss": 0.9376093149185181, "step": 10560 }, { "ce_loss": 0.22937791049480438, "epoch": 3.52234823215477, "step": 10560 }, { "distill_loss": 0.3874242305755615, "epoch": 3.52234823215477, "step": 10560 }, { "epoch": 3.52234823215477, "ref_ce_loss": 0.18697050213813782, "step": 10560 }, { "epoch": 3.5256837891927955, "loss": 0.9756, "step": 10570 }, { "epoch": 3.5256837891927955, "grad_norm": 2.079847574234009, "step": 10570 }, { "epoch": 3.5256837891927955, "learning_rate": 0.0006008725820511866, "step": 10570 }, { "epoch": 3.5256837891927955, "loss": 0.8667013049125671, "step": 10570 }, { "ce_loss": 0.23142340779304504, "epoch": 3.5256837891927955, "step": 10570 }, { "distill_loss": 0.4217209815979004, "epoch": 3.5256837891927955, "step": 10570 }, { "epoch": 3.5256837891927955, "ref_ce_loss": 0.1697525829076767, "step": 10570 }, { "epoch": 3.5256837891927955, "loss": 0.9974404573440552, "step": 10570 }, { "ce_loss": 0.2349308580160141, "epoch": 3.5256837891927955, "step": 10570 }, { "distill_loss": 0.5080359578132629, "epoch": 3.5256837891927955, "step": 10570 }, { "epoch": 3.5256837891927955, "ref_ce_loss": 0.1930299997329712, "step": 10570 }, { "epoch": 3.529019346230821, "loss": 0.9553, "step": 10580 }, { "epoch": 3.529019346230821, "grad_norm": 2.677853584289551, "step": 10580 }, { "epoch": 3.529019346230821, "learning_rate": 0.0006004987742850598, "step": 10580 }, { "epoch": 3.529019346230821, "loss": 0.8715407252311707, "step": 10580 }, { "ce_loss": 0.24808406829833984, "epoch": 3.529019346230821, "step": 10580 }, { "distill_loss": 0.41015133261680603, "epoch": 3.529019346230821, "step": 10580 }, { "epoch": 3.529019346230821, "ref_ce_loss": 0.2120196670293808, "step": 10580 }, { "epoch": 3.529019346230821, "loss": 0.949081540107727, "step": 10580 }, { "ce_loss": 0.25341686606407166, "epoch": 3.529019346230821, "step": 10580 }, { "distill_loss": 0.3998900055885315, "epoch": 3.529019346230821, "step": 10580 }, { "epoch": 3.529019346230821, "ref_ce_loss": 0.22252270579338074, "step": 10580 }, { "epoch": 3.532354903268846, "loss": 1.0076, "step": 10590 }, { "epoch": 3.532354903268846, "grad_norm": 2.008328676223755, "step": 10590 }, { "epoch": 3.532354903268846, "learning_rate": 0.0006001247325152887, "step": 10590 }, { "epoch": 3.532354903268846, "loss": 1.12415611743927, "step": 10590 }, { "ce_loss": 0.2488483190536499, "epoch": 3.532354903268846, "step": 10590 }, { "distill_loss": 0.4674426019191742, "epoch": 3.532354903268846, "step": 10590 }, { "epoch": 3.532354903268846, "ref_ce_loss": 0.19241824746131897, "step": 10590 }, { "epoch": 3.532354903268846, "loss": 0.8905547857284546, "step": 10590 }, { "ce_loss": 0.2781248986721039, "epoch": 3.532354903268846, "step": 10590 }, { "distill_loss": 0.4134596586227417, "epoch": 3.532354903268846, "step": 10590 }, { "epoch": 3.532354903268846, "ref_ce_loss": 0.19820138812065125, "step": 10590 }, { "epoch": 3.5356904603068715, "loss": 0.9653, "step": 10600 }, { "epoch": 3.5356904603068715, "grad_norm": 3.509645462036133, "step": 10600 }, { "epoch": 3.5356904603068715, "learning_rate": 0.0005997504571784207, "step": 10600 }, { "epoch": 3.5356904603068715, "loss": 0.8519524931907654, "step": 10600 }, { "ce_loss": 0.2490682452917099, "epoch": 3.5356904603068715, "step": 10600 }, { "distill_loss": 0.3506883680820465, "epoch": 3.5356904603068715, "step": 10600 }, { "epoch": 3.5356904603068715, "ref_ce_loss": 0.20163153111934662, "step": 10600 }, { "epoch": 3.5356904603068715, "loss": 0.9452041983604431, "step": 10600 }, { "ce_loss": 0.24201901257038116, "epoch": 3.5356904603068715, "step": 10600 }, { "distill_loss": 0.41681554913520813, "epoch": 3.5356904603068715, "step": 10600 }, { "epoch": 3.5356904603068715, "ref_ce_loss": 0.14725689589977264, "step": 10600 }, { "epoch": 3.539026017344897, "loss": 0.9389, "step": 10610 }, { "epoch": 3.539026017344897, "grad_norm": 2.2754478454589844, "step": 10610 }, { "epoch": 3.539026017344897, "learning_rate": 0.000599375948711275, "step": 10610 }, { "epoch": 3.539026017344897, "loss": 1.0604487657546997, "step": 10610 }, { "ce_loss": 0.2615657150745392, "epoch": 3.539026017344897, "step": 10610 }, { "distill_loss": 0.4435419738292694, "epoch": 3.539026017344897, "step": 10610 }, { "epoch": 3.539026017344897, "ref_ce_loss": 0.19277401268482208, "step": 10610 }, { "epoch": 3.539026017344897, "loss": 1.0847976207733154, "step": 10610 }, { "ce_loss": 0.24040797352790833, "epoch": 3.539026017344897, "step": 10610 }, { "distill_loss": 0.3454318940639496, "epoch": 3.539026017344897, "step": 10610 }, { "epoch": 3.539026017344897, "ref_ce_loss": 0.18763557076454163, "step": 10610 }, { "epoch": 3.542361574382922, "loss": 1.0434, "step": 10620 }, { "epoch": 3.542361574382922, "grad_norm": 1.3464022874832153, "step": 10620 }, { "epoch": 3.542361574382922, "learning_rate": 0.0005990012075509434, "step": 10620 }, { "epoch": 3.542361574382922, "loss": 0.877839207649231, "step": 10620 }, { "ce_loss": 0.20063342154026031, "epoch": 3.542361574382922, "step": 10620 }, { "distill_loss": 0.35610562562942505, "epoch": 3.542361574382922, "step": 10620 }, { "epoch": 3.542361574382922, "ref_ce_loss": 0.1908491998910904, "step": 10620 }, { "epoch": 3.542361574382922, "loss": 0.750281572341919, "step": 10620 }, { "ce_loss": 0.2147495448589325, "epoch": 3.542361574382922, "step": 10620 }, { "distill_loss": 0.37057164311408997, "epoch": 3.542361574382922, "step": 10620 }, { "epoch": 3.542361574382922, "ref_ce_loss": 0.16475652158260345, "step": 10620 }, { "epoch": 3.5456971314209476, "loss": 0.9319, "step": 10630 }, { "epoch": 3.5456971314209476, "grad_norm": 1.7371325492858887, "step": 10630 }, { "epoch": 3.5456971314209476, "learning_rate": 0.000598626234134789, "step": 10630 }, { "epoch": 3.5456971314209476, "loss": 0.891808807849884, "step": 10630 }, { "ce_loss": 0.24432647228240967, "epoch": 3.5456971314209476, "step": 10630 }, { "distill_loss": 0.45418083667755127, "epoch": 3.5456971314209476, "step": 10630 }, { "epoch": 3.5456971314209476, "ref_ce_loss": 0.1522151380777359, "step": 10630 }, { "epoch": 3.5456971314209476, "loss": 0.8573769330978394, "step": 10630 }, { "ce_loss": 0.25155410170555115, "epoch": 3.5456971314209476, "step": 10630 }, { "distill_loss": 0.3957569897174835, "epoch": 3.5456971314209476, "step": 10630 }, { "epoch": 3.5456971314209476, "ref_ce_loss": 0.15537843108177185, "step": 10630 }, { "epoch": 3.549032688458973, "loss": 0.9607, "step": 10640 }, { "epoch": 3.549032688458973, "grad_norm": 1.7045210599899292, "step": 10640 }, { "epoch": 3.549032688458973, "learning_rate": 0.0005982510289004467, "step": 10640 }, { "epoch": 3.549032688458973, "loss": 0.812683641910553, "step": 10640 }, { "ce_loss": 0.2365237921476364, "epoch": 3.549032688458973, "step": 10640 }, { "distill_loss": 0.39517346024513245, "epoch": 3.549032688458973, "step": 10640 }, { "epoch": 3.549032688458973, "ref_ce_loss": 0.1795228272676468, "step": 10640 }, { "epoch": 3.549032688458973, "loss": 1.6365201473236084, "step": 10640 }, { "ce_loss": 0.3631623089313507, "epoch": 3.549032688458973, "step": 10640 }, { "distill_loss": 0.43889060616493225, "epoch": 3.549032688458973, "step": 10640 }, { "epoch": 3.549032688458973, "ref_ce_loss": 0.2709457576274872, "step": 10640 }, { "epoch": 3.5523682454969983, "loss": 0.9761, "step": 10650 }, { "epoch": 3.5523682454969983, "grad_norm": 2.293968915939331, "step": 10650 }, { "epoch": 3.5523682454969983, "learning_rate": 0.0005978755922858205, "step": 10650 }, { "epoch": 3.5523682454969983, "loss": 0.8294329047203064, "step": 10650 }, { "ce_loss": 0.23119644820690155, "epoch": 3.5523682454969983, "step": 10650 }, { "distill_loss": 0.3696436285972595, "epoch": 3.5523682454969983, "step": 10650 }, { "epoch": 3.5523682454969983, "ref_ce_loss": 0.17142550647258759, "step": 10650 }, { "epoch": 3.5523682454969983, "loss": 0.8790150880813599, "step": 10650 }, { "ce_loss": 0.23846159875392914, "epoch": 3.5523682454969983, "step": 10650 }, { "distill_loss": 0.4104042649269104, "epoch": 3.5523682454969983, "step": 10650 }, { "epoch": 3.5523682454969983, "ref_ce_loss": 0.1691407859325409, "step": 10650 }, { "epoch": 3.5557038025350236, "loss": 1.0186, "step": 10660 }, { "epoch": 3.5557038025350236, "grad_norm": 2.333033800125122, "step": 10660 }, { "epoch": 3.5557038025350236, "learning_rate": 0.0005974999247290862, "step": 10660 }, { "epoch": 3.5557038025350236, "loss": 1.0636610984802246, "step": 10660 }, { "ce_loss": 0.27940917015075684, "epoch": 3.5557038025350236, "step": 10660 }, { "distill_loss": 0.35590964555740356, "epoch": 3.5557038025350236, "step": 10660 }, { "epoch": 3.5557038025350236, "ref_ce_loss": 0.1658158302307129, "step": 10660 }, { "epoch": 3.5557038025350236, "loss": 0.8477701544761658, "step": 10660 }, { "ce_loss": 0.18838250637054443, "epoch": 3.5557038025350236, "step": 10660 }, { "distill_loss": 0.31615662574768066, "epoch": 3.5557038025350236, "step": 10660 }, { "epoch": 3.5557038025350236, "ref_ce_loss": 0.1694744974374771, "step": 10660 }, { "epoch": 3.559039359573049, "loss": 0.9494, "step": 10670 }, { "epoch": 3.559039359573049, "grad_norm": 2.738006114959717, "step": 10670 }, { "epoch": 3.559039359573049, "learning_rate": 0.0005971240266686877, "step": 10670 }, { "epoch": 3.559039359573049, "loss": 0.9802939891815186, "step": 10670 }, { "ce_loss": 0.3043555021286011, "epoch": 3.559039359573049, "step": 10670 }, { "distill_loss": 0.4548279047012329, "epoch": 3.559039359573049, "step": 10670 }, { "epoch": 3.559039359573049, "ref_ce_loss": 0.17479896545410156, "step": 10670 }, { "epoch": 3.559039359573049, "loss": 1.0556284189224243, "step": 10670 }, { "ce_loss": 0.29174360632896423, "epoch": 3.559039359573049, "step": 10670 }, { "distill_loss": 0.4765718877315521, "epoch": 3.559039359573049, "step": 10670 }, { "epoch": 3.559039359573049, "ref_ce_loss": 0.1979597806930542, "step": 10670 }, { "epoch": 3.5623749166110743, "loss": 1.0208, "step": 10680 }, { "epoch": 3.5623749166110743, "grad_norm": 2.6067469120025635, "step": 10680 }, { "epoch": 3.5623749166110743, "learning_rate": 0.0005967478985433387, "step": 10680 }, { "epoch": 3.5623749166110743, "loss": 1.05019211769104, "step": 10680 }, { "ce_loss": 0.27029672265052795, "epoch": 3.5623749166110743, "step": 10680 }, { "distill_loss": 0.5363603234291077, "epoch": 3.5623749166110743, "step": 10680 }, { "epoch": 3.5623749166110743, "ref_ce_loss": 0.19258911907672882, "step": 10680 }, { "epoch": 3.5623749166110743, "loss": 1.0431991815567017, "step": 10680 }, { "ce_loss": 0.3473112881183624, "epoch": 3.5623749166110743, "step": 10680 }, { "distill_loss": 0.4352015554904938, "epoch": 3.5623749166110743, "step": 10680 }, { "epoch": 3.5623749166110743, "ref_ce_loss": 0.20662081241607666, "step": 10680 }, { "epoch": 3.5657104736490997, "loss": 0.9695, "step": 10690 }, { "epoch": 3.5657104736490997, "grad_norm": 1.6148370504379272, "step": 10690 }, { "epoch": 3.5657104736490997, "learning_rate": 0.000596371540792021, "step": 10690 }, { "epoch": 3.5657104736490997, "loss": 1.0217854976654053, "step": 10690 }, { "ce_loss": 0.290711373090744, "epoch": 3.5657104736490997, "step": 10690 }, { "distill_loss": 0.40897056460380554, "epoch": 3.5657104736490997, "step": 10690 }, { "epoch": 3.5657104736490997, "ref_ce_loss": 0.1742202490568161, "step": 10690 }, { "epoch": 3.5657104736490997, "loss": 0.9119325280189514, "step": 10690 }, { "ce_loss": 0.19622530043125153, "epoch": 3.5657104736490997, "step": 10690 }, { "distill_loss": 0.344217449426651, "epoch": 3.5657104736490997, "step": 10690 }, { "epoch": 3.5657104736490997, "ref_ce_loss": 0.17378860712051392, "step": 10690 }, { "epoch": 3.569046030687125, "loss": 1.0141, "step": 10700 }, { "epoch": 3.569046030687125, "grad_norm": 2.3237545490264893, "step": 10700 }, { "epoch": 3.569046030687125, "learning_rate": 0.000595994953853985, "step": 10700 }, { "epoch": 3.569046030687125, "loss": 1.8157587051391602, "step": 10700 }, { "ce_loss": 0.27670037746429443, "epoch": 3.569046030687125, "step": 10700 }, { "distill_loss": 0.43991291522979736, "epoch": 3.569046030687125, "step": 10700 }, { "epoch": 3.569046030687125, "ref_ce_loss": 0.19433966279029846, "step": 10700 }, { "epoch": 3.569046030687125, "loss": 1.0240585803985596, "step": 10700 }, { "ce_loss": 0.3436286449432373, "epoch": 3.569046030687125, "step": 10700 }, { "distill_loss": 0.40613216161727905, "epoch": 3.569046030687125, "step": 10700 }, { "epoch": 3.569046030687125, "ref_ce_loss": 0.2552061676979065, "step": 10700 }, { "epoch": 3.5723815877251504, "loss": 1.0385, "step": 10710 }, { "epoch": 3.5723815877251504, "grad_norm": 1.6229546070098877, "step": 10710 }, { "epoch": 3.5723815877251504, "learning_rate": 0.0005956181381687477, "step": 10710 }, { "epoch": 3.5723815877251504, "loss": 0.8478896617889404, "step": 10710 }, { "ce_loss": 0.252267062664032, "epoch": 3.5723815877251504, "step": 10710 }, { "distill_loss": 0.3502632975578308, "epoch": 3.5723815877251504, "step": 10710 }, { "epoch": 3.5723815877251504, "ref_ce_loss": 0.19959960877895355, "step": 10710 }, { "epoch": 3.5723815877251504, "loss": 0.965064287185669, "step": 10710 }, { "ce_loss": 0.3000277876853943, "epoch": 3.5723815877251504, "step": 10710 }, { "distill_loss": 0.4041358530521393, "epoch": 3.5723815877251504, "step": 10710 }, { "epoch": 3.5723815877251504, "ref_ce_loss": 0.20794537663459778, "step": 10710 }, { "epoch": 3.5757171447631757, "loss": 0.9149, "step": 10720 }, { "epoch": 3.5757171447631757, "grad_norm": 1.5637110471725464, "step": 10720 }, { "epoch": 3.5757171447631757, "learning_rate": 0.000595241094176094, "step": 10720 }, { "epoch": 3.5757171447631757, "loss": 0.9542688131332397, "step": 10720 }, { "ce_loss": 0.26275670528411865, "epoch": 3.5757171447631757, "step": 10720 }, { "distill_loss": 0.4302704334259033, "epoch": 3.5757171447631757, "step": 10720 }, { "epoch": 3.5757171447631757, "ref_ce_loss": 0.23514986038208008, "step": 10720 }, { "epoch": 3.5757171447631757, "loss": 0.9637356996536255, "step": 10720 }, { "ce_loss": 0.2058098167181015, "epoch": 3.5757171447631757, "step": 10720 }, { "distill_loss": 0.4770658016204834, "epoch": 3.5757171447631757, "step": 10720 }, { "epoch": 3.5757171447631757, "ref_ce_loss": 0.1876608431339264, "step": 10720 }, { "epoch": 3.579052701801201, "loss": 0.9496, "step": 10730 }, { "epoch": 3.579052701801201, "grad_norm": 2.5232837200164795, "step": 10730 }, { "epoch": 3.579052701801201, "learning_rate": 0.0005948638223160744, "step": 10730 }, { "epoch": 3.579052701801201, "loss": 1.3978841304779053, "step": 10730 }, { "ce_loss": 0.23244185745716095, "epoch": 3.579052701801201, "step": 10730 }, { "distill_loss": 0.3715016841888428, "epoch": 3.579052701801201, "step": 10730 }, { "epoch": 3.579052701801201, "ref_ce_loss": 0.1973060518503189, "step": 10730 }, { "epoch": 3.579052701801201, "loss": 1.1500132083892822, "step": 10730 }, { "ce_loss": 0.2621765732765198, "epoch": 3.579052701801201, "step": 10730 }, { "distill_loss": 0.33173489570617676, "epoch": 3.579052701801201, "step": 10730 }, { "epoch": 3.579052701801201, "ref_ce_loss": 0.20012745261192322, "step": 10730 }, { "epoch": 3.5823882588392264, "loss": 0.9951, "step": 10740 }, { "epoch": 3.5823882588392264, "grad_norm": 2.035033702850342, "step": 10740 }, { "epoch": 3.5823882588392264, "learning_rate": 0.000594486323029006, "step": 10740 }, { "epoch": 3.5823882588392264, "loss": 1.2532624006271362, "step": 10740 }, { "ce_loss": 0.24884285032749176, "epoch": 3.5823882588392264, "step": 10740 }, { "distill_loss": 0.38106995820999146, "epoch": 3.5823882588392264, "step": 10740 }, { "epoch": 3.5823882588392264, "ref_ce_loss": 0.23058843612670898, "step": 10740 }, { "epoch": 3.5823882588392264, "loss": 0.8896252512931824, "step": 10740 }, { "ce_loss": 0.21895568072795868, "epoch": 3.5823882588392264, "step": 10740 }, { "distill_loss": 0.45231279730796814, "epoch": 3.5823882588392264, "step": 10740 }, { "epoch": 3.5823882588392264, "ref_ce_loss": 0.21800543367862701, "step": 10740 }, { "epoch": 3.5857238158772518, "loss": 1.1031, "step": 10750 }, { "epoch": 3.5857238158772518, "grad_norm": 1.63994300365448, "step": 10750 }, { "epoch": 3.5857238158772518, "learning_rate": 0.0005941085967554711, "step": 10750 }, { "epoch": 3.5857238158772518, "loss": 1.191534161567688, "step": 10750 }, { "ce_loss": 0.1863107532262802, "epoch": 3.5857238158772518, "step": 10750 }, { "distill_loss": 0.34788045287132263, "epoch": 3.5857238158772518, "step": 10750 }, { "epoch": 3.5857238158772518, "ref_ce_loss": 0.12304037064313889, "step": 10750 }, { "epoch": 3.5857238158772518, "loss": 0.9901374578475952, "step": 10750 }, { "ce_loss": 0.3216560482978821, "epoch": 3.5857238158772518, "step": 10750 }, { "distill_loss": 0.4349941313266754, "epoch": 3.5857238158772518, "step": 10750 }, { "epoch": 3.5857238158772518, "ref_ce_loss": 0.1874067634344101, "step": 10750 }, { "epoch": 3.589059372915277, "loss": 0.9665, "step": 10760 }, { "epoch": 3.589059372915277, "grad_norm": 1.9990334510803223, "step": 10760 }, { "epoch": 3.589059372915277, "learning_rate": 0.0005937306439363168, "step": 10760 }, { "epoch": 3.589059372915277, "loss": 1.1465106010437012, "step": 10760 }, { "ce_loss": 0.2501791715621948, "epoch": 3.589059372915277, "step": 10760 }, { "distill_loss": 0.4361908435821533, "epoch": 3.589059372915277, "step": 10760 }, { "epoch": 3.589059372915277, "ref_ce_loss": 0.13454364240169525, "step": 10760 }, { "epoch": 3.589059372915277, "loss": 1.2411580085754395, "step": 10760 }, { "ce_loss": 0.30965420603752136, "epoch": 3.589059372915277, "step": 10760 }, { "distill_loss": 0.38276880979537964, "epoch": 3.589059372915277, "step": 10760 }, { "epoch": 3.589059372915277, "ref_ce_loss": 0.18235190212726593, "step": 10760 }, { "epoch": 3.5923949299533025, "loss": 0.9779, "step": 10770 }, { "epoch": 3.5923949299533025, "grad_norm": 1.4198142290115356, "step": 10770 }, { "epoch": 3.5923949299533025, "learning_rate": 0.0005933524650126546, "step": 10770 }, { "epoch": 3.5923949299533025, "loss": 1.104905128479004, "step": 10770 }, { "ce_loss": 0.2916855216026306, "epoch": 3.5923949299533025, "step": 10770 }, { "distill_loss": 0.4638969302177429, "epoch": 3.5923949299533025, "step": 10770 }, { "epoch": 3.5923949299533025, "ref_ce_loss": 0.25216418504714966, "step": 10770 }, { "epoch": 3.5923949299533025, "loss": 0.8804208040237427, "step": 10770 }, { "ce_loss": 0.22146829962730408, "epoch": 3.5923949299533025, "step": 10770 }, { "distill_loss": 0.37109169363975525, "epoch": 3.5923949299533025, "step": 10770 }, { "epoch": 3.5923949299533025, "ref_ce_loss": 0.16120927035808563, "step": 10770 }, { "epoch": 3.595730486991328, "loss": 0.9963, "step": 10780 }, { "epoch": 3.595730486991328, "grad_norm": 8.41401195526123, "step": 10780 }, { "epoch": 3.595730486991328, "learning_rate": 0.0005929740604258603, "step": 10780 }, { "epoch": 3.595730486991328, "loss": 0.9323166012763977, "step": 10780 }, { "ce_loss": 0.2917608916759491, "epoch": 3.595730486991328, "step": 10780 }, { "distill_loss": 0.42487627267837524, "epoch": 3.595730486991328, "step": 10780 }, { "epoch": 3.595730486991328, "ref_ce_loss": 0.1706487238407135, "step": 10780 }, { "epoch": 3.595730486991328, "loss": 1.0221028327941895, "step": 10780 }, { "ce_loss": 0.25380828976631165, "epoch": 3.595730486991328, "step": 10780 }, { "distill_loss": 0.38979148864746094, "epoch": 3.595730486991328, "step": 10780 }, { "epoch": 3.595730486991328, "ref_ce_loss": 0.20026403665542603, "step": 10780 }, { "epoch": 3.599066044029353, "loss": 0.9955, "step": 10790 }, { "epoch": 3.599066044029353, "grad_norm": 3.6208415031433105, "step": 10790 }, { "epoch": 3.599066044029353, "learning_rate": 0.0005925954306175725, "step": 10790 }, { "epoch": 3.599066044029353, "loss": 0.9474771618843079, "step": 10790 }, { "ce_loss": 0.32380348443984985, "epoch": 3.599066044029353, "step": 10790 }, { "distill_loss": 0.39571648836135864, "epoch": 3.599066044029353, "step": 10790 }, { "epoch": 3.599066044029353, "ref_ce_loss": 0.17513123154640198, "step": 10790 }, { "epoch": 3.599066044029353, "loss": 0.7710244059562683, "step": 10790 }, { "ce_loss": 0.2452814131975174, "epoch": 3.599066044029353, "step": 10790 }, { "distill_loss": 0.3207455277442932, "epoch": 3.599066044029353, "step": 10790 }, { "epoch": 3.599066044029353, "ref_ce_loss": 0.1587689220905304, "step": 10790 }, { "epoch": 3.6024016010673785, "loss": 0.981, "step": 10800 }, { "epoch": 3.6024016010673785, "grad_norm": 1.6106643676757812, "step": 10800 }, { "epoch": 3.6024016010673785, "learning_rate": 0.0005922165760296932, "step": 10800 }, { "epoch": 3.6024016010673785, "loss": 0.6965634226799011, "step": 10800 }, { "ce_loss": 0.23448844254016876, "epoch": 3.6024016010673785, "step": 10800 }, { "distill_loss": 0.317073792219162, "epoch": 3.6024016010673785, "step": 10800 }, { "epoch": 3.6024016010673785, "ref_ce_loss": 0.14483723044395447, "step": 10800 }, { "epoch": 3.6024016010673785, "loss": 1.04850435256958, "step": 10800 }, { "ce_loss": 0.2612713575363159, "epoch": 3.6024016010673785, "step": 10800 }, { "distill_loss": 0.4225085377693176, "epoch": 3.6024016010673785, "step": 10800 }, { "epoch": 3.6024016010673785, "ref_ce_loss": 0.2115650475025177, "step": 10800 }, { "epoch": 3.605737158105404, "loss": 1.0208, "step": 10810 }, { "epoch": 3.605737158105404, "grad_norm": 1.9757091999053955, "step": 10810 }, { "epoch": 3.605737158105404, "learning_rate": 0.0005918374971043862, "step": 10810 }, { "epoch": 3.605737158105404, "loss": 1.018848180770874, "step": 10810 }, { "ce_loss": 0.2821508049964905, "epoch": 3.605737158105404, "step": 10810 }, { "distill_loss": 0.4402734339237213, "epoch": 3.605737158105404, "step": 10810 }, { "epoch": 3.605737158105404, "ref_ce_loss": 0.18855346739292145, "step": 10810 }, { "epoch": 3.605737158105404, "loss": 1.0459314584732056, "step": 10810 }, { "ce_loss": 0.26500678062438965, "epoch": 3.605737158105404, "step": 10810 }, { "distill_loss": 0.43279725313186646, "epoch": 3.605737158105404, "step": 10810 }, { "epoch": 3.605737158105404, "ref_ce_loss": 0.23459219932556152, "step": 10810 }, { "epoch": 3.609072715143429, "loss": 0.9469, "step": 10820 }, { "epoch": 3.609072715143429, "grad_norm": 1.9686979055404663, "step": 10820 }, { "epoch": 3.609072715143429, "learning_rate": 0.0005914581942840775, "step": 10820 }, { "epoch": 3.609072715143429, "loss": 0.6443910002708435, "step": 10820 }, { "ce_loss": 0.15013271570205688, "epoch": 3.609072715143429, "step": 10820 }, { "distill_loss": 0.33419257402420044, "epoch": 3.609072715143429, "step": 10820 }, { "epoch": 3.609072715143429, "ref_ce_loss": 0.15990029275417328, "step": 10820 }, { "epoch": 3.609072715143429, "loss": 1.009469747543335, "step": 10820 }, { "ce_loss": 0.24576064944267273, "epoch": 3.609072715143429, "step": 10820 }, { "distill_loss": 0.4632691740989685, "epoch": 3.609072715143429, "step": 10820 }, { "epoch": 3.609072715143429, "ref_ce_loss": 0.18871477246284485, "step": 10820 }, { "epoch": 3.6124082721814545, "loss": 1.0257, "step": 10830 }, { "epoch": 3.6124082721814545, "grad_norm": 2.7029173374176025, "step": 10830 }, { "epoch": 3.6124082721814545, "learning_rate": 0.0005910786680114544, "step": 10830 }, { "epoch": 3.6124082721814545, "loss": 0.9071427583694458, "step": 10830 }, { "ce_loss": 0.2030387669801712, "epoch": 3.6124082721814545, "step": 10830 }, { "distill_loss": 0.3390215337276459, "epoch": 3.6124082721814545, "step": 10830 }, { "epoch": 3.6124082721814545, "ref_ce_loss": 0.1907871961593628, "step": 10830 }, { "epoch": 3.6124082721814545, "loss": 0.8759766817092896, "step": 10830 }, { "ce_loss": 0.2447219043970108, "epoch": 3.6124082721814545, "step": 10830 }, { "distill_loss": 0.4003208875656128, "epoch": 3.6124082721814545, "step": 10830 }, { "epoch": 3.6124082721814545, "ref_ce_loss": 0.17160019278526306, "step": 10830 }, { "epoch": 3.61574382921948, "loss": 0.997, "step": 10840 }, { "epoch": 3.61574382921948, "grad_norm": 2.0608105659484863, "step": 10840 }, { "epoch": 3.61574382921948, "learning_rate": 0.0005906989187294649, "step": 10840 }, { "epoch": 3.61574382921948, "loss": 0.8163762092590332, "step": 10840 }, { "ce_loss": 0.20944543182849884, "epoch": 3.61574382921948, "step": 10840 }, { "distill_loss": 0.3983843922615051, "epoch": 3.61574382921948, "step": 10840 }, { "epoch": 3.61574382921948, "ref_ce_loss": 0.1529301255941391, "step": 10840 }, { "epoch": 3.61574382921948, "loss": 0.7047134041786194, "step": 10840 }, { "ce_loss": 0.17927296459674835, "epoch": 3.61574382921948, "step": 10840 }, { "distill_loss": 0.37407517433166504, "epoch": 3.61574382921948, "step": 10840 }, { "epoch": 3.61574382921948, "ref_ce_loss": 0.15111172199249268, "step": 10840 }, { "epoch": 3.6190793862575052, "loss": 0.9361, "step": 10850 }, { "epoch": 3.6190793862575052, "grad_norm": 6.514796257019043, "step": 10850 }, { "epoch": 3.6190793862575052, "learning_rate": 0.0005903189468813169, "step": 10850 }, { "epoch": 3.6190793862575052, "loss": 1.120654582977295, "step": 10850 }, { "ce_loss": 0.30272042751312256, "epoch": 3.6190793862575052, "step": 10850 }, { "distill_loss": 0.4027269780635834, "epoch": 3.6190793862575052, "step": 10850 }, { "epoch": 3.6190793862575052, "ref_ce_loss": 0.23278942704200745, "step": 10850 }, { "epoch": 3.6190793862575052, "loss": 1.0802335739135742, "step": 10850 }, { "ce_loss": 0.29891490936279297, "epoch": 3.6190793862575052, "step": 10850 }, { "distill_loss": 0.437322199344635, "epoch": 3.6190793862575052, "step": 10850 }, { "epoch": 3.6190793862575052, "ref_ce_loss": 0.2128477692604065, "step": 10850 }, { "epoch": 3.6224149432955306, "loss": 0.9784, "step": 10860 }, { "epoch": 3.6224149432955306, "grad_norm": 2.744908332824707, "step": 10860 }, { "epoch": 3.6224149432955306, "learning_rate": 0.000589938752910479, "step": 10860 }, { "epoch": 3.6224149432955306, "loss": 1.092473030090332, "step": 10860 }, { "ce_loss": 0.1885896474123001, "epoch": 3.6224149432955306, "step": 10860 }, { "distill_loss": 0.4350079298019409, "epoch": 3.6224149432955306, "step": 10860 }, { "epoch": 3.6224149432955306, "ref_ce_loss": 0.1690068542957306, "step": 10860 }, { "epoch": 3.6224149432955306, "loss": 0.9842666387557983, "step": 10860 }, { "ce_loss": 0.26728078722953796, "epoch": 3.6224149432955306, "step": 10860 }, { "distill_loss": 0.42367982864379883, "epoch": 3.6224149432955306, "step": 10860 }, { "epoch": 3.6224149432955306, "ref_ce_loss": 0.24502113461494446, "step": 10860 }, { "epoch": 3.625750500333556, "loss": 0.9453, "step": 10870 }, { "epoch": 3.625750500333556, "grad_norm": 2.9420664310455322, "step": 10870 }, { "epoch": 3.625750500333556, "learning_rate": 0.000589558337260678, "step": 10870 }, { "epoch": 3.625750500333556, "loss": 0.8280543088912964, "step": 10870 }, { "ce_loss": 0.21766088902950287, "epoch": 3.625750500333556, "step": 10870 }, { "distill_loss": 0.3599540591239929, "epoch": 3.625750500333556, "step": 10870 }, { "epoch": 3.625750500333556, "ref_ce_loss": 0.17570890486240387, "step": 10870 }, { "epoch": 3.625750500333556, "loss": 1.2867872714996338, "step": 10870 }, { "ce_loss": 0.32790401577949524, "epoch": 3.625750500333556, "step": 10870 }, { "distill_loss": 0.4899286925792694, "epoch": 3.625750500333556, "step": 10870 }, { "epoch": 3.625750500333556, "ref_ce_loss": 0.2533458173274994, "step": 10870 }, { "epoch": 3.6290860573715813, "loss": 0.9934, "step": 10880 }, { "epoch": 3.6290860573715813, "grad_norm": 1.9164032936096191, "step": 10880 }, { "epoch": 3.6290860573715813, "learning_rate": 0.0005891777003759002, "step": 10880 }, { "epoch": 3.6290860573715813, "loss": 1.0848420858383179, "step": 10880 }, { "ce_loss": 0.34023869037628174, "epoch": 3.6290860573715813, "step": 10880 }, { "distill_loss": 0.47535526752471924, "epoch": 3.6290860573715813, "step": 10880 }, { "epoch": 3.6290860573715813, "ref_ce_loss": 0.2687413692474365, "step": 10880 }, { "epoch": 3.6290860573715813, "loss": 0.7790800333023071, "step": 10880 }, { "ce_loss": 0.20252028107643127, "epoch": 3.6290860573715813, "step": 10880 }, { "distill_loss": 0.35906457901000977, "epoch": 3.6290860573715813, "step": 10880 }, { "epoch": 3.6290860573715813, "ref_ce_loss": 0.16440549492835999, "step": 10880 }, { "epoch": 3.6324216144096066, "loss": 0.918, "step": 10890 }, { "epoch": 3.6324216144096066, "grad_norm": 1.9566514492034912, "step": 10890 }, { "epoch": 3.6324216144096066, "learning_rate": 0.0005887968427003898, "step": 10890 }, { "epoch": 3.6324216144096066, "loss": 0.8718944191932678, "step": 10890 }, { "ce_loss": 0.2936494052410126, "epoch": 3.6324216144096066, "step": 10890 }, { "distill_loss": 0.35506755113601685, "epoch": 3.6324216144096066, "step": 10890 }, { "epoch": 3.6324216144096066, "ref_ce_loss": 0.22286319732666016, "step": 10890 }, { "epoch": 3.6324216144096066, "loss": 0.9113027453422546, "step": 10890 }, { "ce_loss": 0.26407554745674133, "epoch": 3.6324216144096066, "step": 10890 }, { "distill_loss": 0.38341107964515686, "epoch": 3.6324216144096066, "step": 10890 }, { "epoch": 3.6324216144096066, "ref_ce_loss": 0.2064923644065857, "step": 10890 }, { "epoch": 3.635757171447632, "loss": 0.9026, "step": 10900 }, { "epoch": 3.635757171447632, "grad_norm": 2.067293405532837, "step": 10900 }, { "epoch": 3.635757171447632, "learning_rate": 0.0005884157646786482, "step": 10900 }, { "epoch": 3.635757171447632, "loss": 1.0706310272216797, "step": 10900 }, { "ce_loss": 0.2915562689304352, "epoch": 3.635757171447632, "step": 10900 }, { "distill_loss": 0.37129735946655273, "epoch": 3.635757171447632, "step": 10900 }, { "epoch": 3.635757171447632, "ref_ce_loss": 0.26088154315948486, "step": 10900 }, { "epoch": 3.635757171447632, "loss": 0.9378951787948608, "step": 10900 }, { "ce_loss": 0.22501979768276215, "epoch": 3.635757171447632, "step": 10900 }, { "distill_loss": 0.4099835455417633, "epoch": 3.635757171447632, "step": 10900 }, { "epoch": 3.635757171447632, "ref_ce_loss": 0.20775926113128662, "step": 10900 }, { "epoch": 3.6390927284856573, "loss": 1.0149, "step": 10910 }, { "epoch": 3.6390927284856573, "grad_norm": 1.3769757747650146, "step": 10910 }, { "epoch": 3.6390927284856573, "learning_rate": 0.0005880344667554353, "step": 10910 }, { "epoch": 3.6390927284856573, "loss": 0.9676283597946167, "step": 10910 }, { "ce_loss": 0.27713117003440857, "epoch": 3.6390927284856573, "step": 10910 }, { "distill_loss": 0.4113433063030243, "epoch": 3.6390927284856573, "step": 10910 }, { "epoch": 3.6390927284856573, "ref_ce_loss": 0.18202325701713562, "step": 10910 }, { "epoch": 3.6390927284856573, "loss": 0.9402757883071899, "step": 10910 }, { "ce_loss": 0.276583731174469, "epoch": 3.6390927284856573, "step": 10910 }, { "distill_loss": 0.4893118143081665, "epoch": 3.6390927284856573, "step": 10910 }, { "epoch": 3.6390927284856573, "ref_ce_loss": 0.17388537526130676, "step": 10910 }, { "epoch": 3.6424282855236827, "loss": 0.984, "step": 10920 }, { "epoch": 3.6424282855236827, "grad_norm": 1.5481610298156738, "step": 10920 }, { "epoch": 3.6424282855236827, "learning_rate": 0.0005876529493757661, "step": 10920 }, { "epoch": 3.6424282855236827, "loss": 1.0869348049163818, "step": 10920 }, { "ce_loss": 0.19671200215816498, "epoch": 3.6424282855236827, "step": 10920 }, { "distill_loss": 0.36423802375793457, "epoch": 3.6424282855236827, "step": 10920 }, { "epoch": 3.6424282855236827, "ref_ce_loss": 0.14919424057006836, "step": 10920 }, { "epoch": 3.6424282855236827, "loss": 0.9016743898391724, "step": 10920 }, { "ce_loss": 0.18895894289016724, "epoch": 3.6424282855236827, "step": 10920 }, { "distill_loss": 0.39317190647125244, "epoch": 3.6424282855236827, "step": 10920 }, { "epoch": 3.6424282855236827, "ref_ce_loss": 0.15948988497257233, "step": 10920 }, { "epoch": 3.645763842561708, "loss": 0.9071, "step": 10930 }, { "epoch": 3.645763842561708, "grad_norm": 1.7900937795639038, "step": 10930 }, { "epoch": 3.645763842561708, "learning_rate": 0.0005872712129849128, "step": 10930 }, { "epoch": 3.645763842561708, "loss": 1.1257048845291138, "step": 10930 }, { "ce_loss": 0.2748940885066986, "epoch": 3.645763842561708, "step": 10930 }, { "distill_loss": 0.4411807060241699, "epoch": 3.645763842561708, "step": 10930 }, { "epoch": 3.645763842561708, "ref_ce_loss": 0.15890592336654663, "step": 10930 }, { "epoch": 3.645763842561708, "loss": 0.7515501379966736, "step": 10930 }, { "ce_loss": 0.20219367742538452, "epoch": 3.645763842561708, "step": 10930 }, { "distill_loss": 0.336357057094574, "epoch": 3.645763842561708, "step": 10930 }, { "epoch": 3.645763842561708, "ref_ce_loss": 0.16738320887088776, "step": 10930 }, { "epoch": 3.6490993995997334, "loss": 0.9312, "step": 10940 }, { "epoch": 3.6490993995997334, "grad_norm": 1.6814641952514648, "step": 10940 }, { "epoch": 3.6490993995997334, "learning_rate": 0.0005868892580284026, "step": 10940 }, { "epoch": 3.6490993995997334, "loss": 0.9259946346282959, "step": 10940 }, { "ce_loss": 0.1978679597377777, "epoch": 3.6490993995997334, "step": 10940 }, { "distill_loss": 0.39133840799331665, "epoch": 3.6490993995997334, "step": 10940 }, { "epoch": 3.6490993995997334, "ref_ce_loss": 0.2005710005760193, "step": 10940 }, { "epoch": 3.6490993995997334, "loss": 0.7356423735618591, "step": 10940 }, { "ce_loss": 0.20834468305110931, "epoch": 3.6490993995997334, "step": 10940 }, { "distill_loss": 0.3353995084762573, "epoch": 3.6490993995997334, "step": 10940 }, { "epoch": 3.6490993995997334, "ref_ce_loss": 0.11429071426391602, "step": 10940 }, { "epoch": 3.6524349566377587, "loss": 0.9866, "step": 10950 }, { "epoch": 3.6524349566377587, "grad_norm": 1.6673353910446167, "step": 10950 }, { "epoch": 3.6524349566377587, "learning_rate": 0.0005865070849520184, "step": 10950 }, { "epoch": 3.6524349566377587, "loss": 1.0651073455810547, "step": 10950 }, { "ce_loss": 0.26207685470581055, "epoch": 3.6524349566377587, "step": 10950 }, { "distill_loss": 0.39236509799957275, "epoch": 3.6524349566377587, "step": 10950 }, { "epoch": 3.6524349566377587, "ref_ce_loss": 0.17330333590507507, "step": 10950 }, { "epoch": 3.6524349566377587, "loss": 0.9178107380867004, "step": 10950 }, { "ce_loss": 0.2152654230594635, "epoch": 3.6524349566377587, "step": 10950 }, { "distill_loss": 0.3327520191669464, "epoch": 3.6524349566377587, "step": 10950 }, { "epoch": 3.6524349566377587, "ref_ce_loss": 0.1945221871137619, "step": 10950 }, { "epoch": 3.655770513675784, "loss": 0.9223, "step": 10960 }, { "epoch": 3.655770513675784, "grad_norm": 2.0015439987182617, "step": 10960 }, { "epoch": 3.655770513675784, "learning_rate": 0.0005861246942017968, "step": 10960 }, { "epoch": 3.655770513675784, "loss": 0.7507296800613403, "step": 10960 }, { "ce_loss": 0.19525940716266632, "epoch": 3.655770513675784, "step": 10960 }, { "distill_loss": 0.3391239643096924, "epoch": 3.655770513675784, "step": 10960 }, { "epoch": 3.655770513675784, "ref_ce_loss": 0.15782354772090912, "step": 10960 }, { "epoch": 3.655770513675784, "loss": 0.6286138892173767, "step": 10960 }, { "ce_loss": 0.1707758605480194, "epoch": 3.655770513675784, "step": 10960 }, { "distill_loss": 0.325600802898407, "epoch": 3.655770513675784, "step": 10960 }, { "epoch": 3.655770513675784, "ref_ce_loss": 0.1315843015909195, "step": 10960 }, { "epoch": 3.6591060707138094, "loss": 0.94, "step": 10970 }, { "epoch": 3.6591060707138094, "grad_norm": 1.5580071210861206, "step": 10970 }, { "epoch": 3.6591060707138094, "learning_rate": 0.0005857420862240293, "step": 10970 }, { "epoch": 3.6591060707138094, "loss": 1.1839573383331299, "step": 10970 }, { "ce_loss": 0.22133544087409973, "epoch": 3.6591060707138094, "step": 10970 }, { "distill_loss": 0.34468215703964233, "epoch": 3.6591060707138094, "step": 10970 }, { "epoch": 3.6591060707138094, "ref_ce_loss": 0.19706854224205017, "step": 10970 }, { "epoch": 3.6591060707138094, "loss": 0.7060577273368835, "step": 10970 }, { "ce_loss": 0.17872220277786255, "epoch": 3.6591060707138094, "step": 10970 }, { "distill_loss": 0.35810860991477966, "epoch": 3.6591060707138094, "step": 10970 }, { "epoch": 3.6591060707138094, "ref_ce_loss": 0.16883009672164917, "step": 10970 }, { "epoch": 3.662441627751835, "loss": 1.0542, "step": 10980 }, { "epoch": 3.662441627751835, "grad_norm": 2.0040171146392822, "step": 10980 }, { "epoch": 3.662441627751835, "learning_rate": 0.0005853592614652605, "step": 10980 }, { "epoch": 3.662441627751835, "loss": 0.7581341862678528, "step": 10980 }, { "ce_loss": 0.21842674911022186, "epoch": 3.662441627751835, "step": 10980 }, { "distill_loss": 0.3178858160972595, "epoch": 3.662441627751835, "step": 10980 }, { "epoch": 3.662441627751835, "ref_ce_loss": 0.18037572503089905, "step": 10980 }, { "epoch": 3.662441627751835, "loss": 0.9531198143959045, "step": 10980 }, { "ce_loss": 0.27651041746139526, "epoch": 3.662441627751835, "step": 10980 }, { "distill_loss": 0.44763290882110596, "epoch": 3.662441627751835, "step": 10980 }, { "epoch": 3.662441627751835, "ref_ce_loss": 0.15781889855861664, "step": 10980 }, { "epoch": 3.66577718478986, "loss": 0.9635, "step": 10990 }, { "epoch": 3.66577718478986, "grad_norm": 2.4893198013305664, "step": 10990 }, { "epoch": 3.66577718478986, "learning_rate": 0.0005849762203722882, "step": 10990 }, { "epoch": 3.66577718478986, "loss": 0.7130229473114014, "step": 10990 }, { "ce_loss": 0.2273818701505661, "epoch": 3.66577718478986, "step": 10990 }, { "distill_loss": 0.2687847912311554, "epoch": 3.66577718478986, "step": 10990 }, { "epoch": 3.66577718478986, "ref_ce_loss": 0.21665076911449432, "step": 10990 }, { "epoch": 3.66577718478986, "loss": 1.124692678451538, "step": 10990 }, { "ce_loss": 0.2515711486339569, "epoch": 3.66577718478986, "step": 10990 }, { "distill_loss": 0.3417477011680603, "epoch": 3.66577718478986, "step": 10990 }, { "epoch": 3.66577718478986, "ref_ce_loss": 0.21203409135341644, "step": 10990 }, { "epoch": 3.6691127418278855, "loss": 0.8726, "step": 11000 }, { "epoch": 3.6691127418278855, "grad_norm": 1.6779652833938599, "step": 11000 }, { "epoch": 3.6691127418278855, "learning_rate": 0.0005845929633921623, "step": 11000 }, { "epoch": 3.6691127418278855, "loss": 0.9800863265991211, "step": 11000 }, { "ce_loss": 0.28845295310020447, "epoch": 3.6691127418278855, "step": 11000 }, { "distill_loss": 0.3359338939189911, "epoch": 3.6691127418278855, "step": 11000 }, { "epoch": 3.6691127418278855, "ref_ce_loss": 0.2133590430021286, "step": 11000 }, { "epoch": 3.6691127418278855, "loss": 0.9813275933265686, "step": 11000 }, { "ce_loss": 0.23539577424526215, "epoch": 3.6691127418278855, "step": 11000 }, { "distill_loss": 0.39678841829299927, "epoch": 3.6691127418278855, "step": 11000 }, { "epoch": 3.6691127418278855, "ref_ce_loss": 0.1863996386528015, "step": 11000 }, { "epoch": 3.672448298865911, "loss": 0.9235, "step": 11010 }, { "epoch": 3.672448298865911, "grad_norm": 2.2331013679504395, "step": 11010 }, { "epoch": 3.672448298865911, "learning_rate": 0.0005842094909721852, "step": 11010 }, { "epoch": 3.672448298865911, "loss": 0.6382297277450562, "step": 11010 }, { "ce_loss": 0.14464932680130005, "epoch": 3.672448298865911, "step": 11010 }, { "distill_loss": 0.32155758142471313, "epoch": 3.672448298865911, "step": 11010 }, { "epoch": 3.672448298865911, "ref_ce_loss": 0.171687051653862, "step": 11010 }, { "epoch": 3.672448298865911, "loss": 1.0422011613845825, "step": 11010 }, { "ce_loss": 0.24493230879306793, "epoch": 3.672448298865911, "step": 11010 }, { "distill_loss": 0.4178438186645508, "epoch": 3.672448298865911, "step": 11010 }, { "epoch": 3.672448298865911, "ref_ce_loss": 0.18016548454761505, "step": 11010 }, { "epoch": 3.675783855903936, "loss": 0.9028, "step": 11020 }, { "epoch": 3.675783855903936, "grad_norm": 3.1568448543548584, "step": 11020 }, { "epoch": 3.675783855903936, "learning_rate": 0.0005838258035599103, "step": 11020 }, { "epoch": 3.675783855903936, "loss": 1.1515343189239502, "step": 11020 }, { "ce_loss": 0.20892977714538574, "epoch": 3.675783855903936, "step": 11020 }, { "distill_loss": 0.4254738688468933, "epoch": 3.675783855903936, "step": 11020 }, { "epoch": 3.675783855903936, "ref_ce_loss": 0.19011640548706055, "step": 11020 }, { "epoch": 3.675783855903936, "loss": 1.3563225269317627, "step": 11020 }, { "ce_loss": 0.288394957780838, "epoch": 3.675783855903936, "step": 11020 }, { "distill_loss": 0.4447449743747711, "epoch": 3.675783855903936, "step": 11020 }, { "epoch": 3.675783855903936, "ref_ce_loss": 0.1841244399547577, "step": 11020 }, { "epoch": 3.6791194129419615, "loss": 1.0352, "step": 11030 }, { "epoch": 3.6791194129419615, "grad_norm": 1.798572301864624, "step": 11030 }, { "epoch": 3.6791194129419615, "learning_rate": 0.0005834419016031423, "step": 11030 }, { "epoch": 3.6791194129419615, "loss": 1.4670379161834717, "step": 11030 }, { "ce_loss": 0.27505266666412354, "epoch": 3.6791194129419615, "step": 11030 }, { "distill_loss": 0.4791155755519867, "epoch": 3.6791194129419615, "step": 11030 }, { "epoch": 3.6791194129419615, "ref_ce_loss": 0.23881269991397858, "step": 11030 }, { "epoch": 3.6791194129419615, "loss": 0.8022658824920654, "step": 11030 }, { "ce_loss": 0.19052447378635406, "epoch": 3.6791194129419615, "step": 11030 }, { "distill_loss": 0.4294961392879486, "epoch": 3.6791194129419615, "step": 11030 }, { "epoch": 3.6791194129419615, "ref_ce_loss": 0.1384320855140686, "step": 11030 }, { "epoch": 3.682454969979987, "loss": 0.9148, "step": 11040 }, { "epoch": 3.682454969979987, "grad_norm": 2.133450746536255, "step": 11040 }, { "epoch": 3.682454969979987, "learning_rate": 0.0005830577855499359, "step": 11040 }, { "epoch": 3.682454969979987, "loss": 1.0161513090133667, "step": 11040 }, { "ce_loss": 0.2720661759376526, "epoch": 3.682454969979987, "step": 11040 }, { "distill_loss": 0.41905316710472107, "epoch": 3.682454969979987, "step": 11040 }, { "epoch": 3.682454969979987, "ref_ce_loss": 0.23214557766914368, "step": 11040 }, { "epoch": 3.682454969979987, "loss": 0.8970419764518738, "step": 11040 }, { "ce_loss": 0.22910916805267334, "epoch": 3.682454969979987, "step": 11040 }, { "distill_loss": 0.37953558564186096, "epoch": 3.682454969979987, "step": 11040 }, { "epoch": 3.682454969979987, "ref_ce_loss": 0.17511329054832458, "step": 11040 }, { "epoch": 3.6857905270180122, "loss": 0.9784, "step": 11050 }, { "epoch": 3.6857905270180122, "grad_norm": 2.5500400066375732, "step": 11050 }, { "epoch": 3.6857905270180122, "learning_rate": 0.0005826734558485959, "step": 11050 }, { "epoch": 3.6857905270180122, "loss": 1.0715348720550537, "step": 11050 }, { "ce_loss": 0.27266642451286316, "epoch": 3.6857905270180122, "step": 11050 }, { "distill_loss": 0.46318167448043823, "epoch": 3.6857905270180122, "step": 11050 }, { "epoch": 3.6857905270180122, "ref_ce_loss": 0.202993243932724, "step": 11050 }, { "epoch": 3.6857905270180122, "loss": 0.8450025320053101, "step": 11050 }, { "ce_loss": 0.21356584131717682, "epoch": 3.6857905270180122, "step": 11050 }, { "distill_loss": 0.35461321473121643, "epoch": 3.6857905270180122, "step": 11050 }, { "epoch": 3.6857905270180122, "ref_ce_loss": 0.1333731859922409, "step": 11050 }, { "epoch": 3.6891260840560376, "loss": 0.9594, "step": 11060 }, { "epoch": 3.6891260840560376, "grad_norm": 1.717258334159851, "step": 11060 }, { "epoch": 3.6891260840560376, "learning_rate": 0.0005822889129476765, "step": 11060 }, { "epoch": 3.6891260840560376, "loss": 0.6976757049560547, "step": 11060 }, { "ce_loss": 0.23821891844272614, "epoch": 3.6891260840560376, "step": 11060 }, { "distill_loss": 0.3095102310180664, "epoch": 3.6891260840560376, "step": 11060 }, { "epoch": 3.6891260840560376, "ref_ce_loss": 0.14964300394058228, "step": 11060 }, { "epoch": 3.6891260840560376, "loss": 0.663037121295929, "step": 11060 }, { "ce_loss": 0.1527738869190216, "epoch": 3.6891260840560376, "step": 11060 }, { "distill_loss": 0.25134557485580444, "epoch": 3.6891260840560376, "step": 11060 }, { "epoch": 3.6891260840560376, "ref_ce_loss": 0.12417822331190109, "step": 11060 }, { "epoch": 3.692461641094063, "loss": 0.9893, "step": 11070 }, { "epoch": 3.692461641094063, "grad_norm": 2.8804426193237305, "step": 11070 }, { "epoch": 3.692461641094063, "learning_rate": 0.0005819041572959804, "step": 11070 }, { "epoch": 3.692461641094063, "loss": 0.8692727088928223, "step": 11070 }, { "ce_loss": 0.2544071674346924, "epoch": 3.692461641094063, "step": 11070 }, { "distill_loss": 0.40960726141929626, "epoch": 3.692461641094063, "step": 11070 }, { "epoch": 3.692461641094063, "ref_ce_loss": 0.15840329229831696, "step": 11070 }, { "epoch": 3.692461641094063, "loss": 1.0427039861679077, "step": 11070 }, { "ce_loss": 0.2617495357990265, "epoch": 3.692461641094063, "step": 11070 }, { "distill_loss": 0.36591702699661255, "epoch": 3.692461641094063, "step": 11070 }, { "epoch": 3.692461641094063, "ref_ce_loss": 0.21506667137145996, "step": 11070 }, { "epoch": 3.6957971981320883, "loss": 0.9297, "step": 11080 }, { "epoch": 3.6957971981320883, "grad_norm": 4.881679534912109, "step": 11080 }, { "epoch": 3.6957971981320883, "learning_rate": 0.0005815191893425593, "step": 11080 }, { "epoch": 3.6957971981320883, "loss": 0.8299713730812073, "step": 11080 }, { "ce_loss": 0.23638109862804413, "epoch": 3.6957971981320883, "step": 11080 }, { "distill_loss": 0.4183204174041748, "epoch": 3.6957971981320883, "step": 11080 }, { "epoch": 3.6957971981320883, "ref_ce_loss": 0.17506512999534607, "step": 11080 }, { "epoch": 3.6957971981320883, "loss": 1.1771599054336548, "step": 11080 }, { "ce_loss": 0.34898120164871216, "epoch": 3.6957971981320883, "step": 11080 }, { "distill_loss": 0.5415797829627991, "epoch": 3.6957971981320883, "step": 11080 }, { "epoch": 3.6957971981320883, "ref_ce_loss": 0.23597785830497742, "step": 11080 }, { "epoch": 3.6991327551701136, "loss": 1.0256, "step": 11090 }, { "epoch": 3.6991327551701136, "grad_norm": 2.5172195434570312, "step": 11090 }, { "epoch": 3.6991327551701136, "learning_rate": 0.0005811340095367119, "step": 11090 }, { "epoch": 3.6991327551701136, "loss": 1.090559482574463, "step": 11090 }, { "ce_loss": 0.25162357091903687, "epoch": 3.6991327551701136, "step": 11090 }, { "distill_loss": 0.3787229061126709, "epoch": 3.6991327551701136, "step": 11090 }, { "epoch": 3.6991327551701136, "ref_ce_loss": 0.21387702226638794, "step": 11090 }, { "epoch": 3.6991327551701136, "loss": 1.1414086818695068, "step": 11090 }, { "ce_loss": 0.22263747453689575, "epoch": 3.6991327551701136, "step": 11090 }, { "distill_loss": 0.44168949127197266, "epoch": 3.6991327551701136, "step": 11090 }, { "epoch": 3.6991327551701136, "ref_ce_loss": 0.18623687326908112, "step": 11090 }, { "epoch": 3.702468312208139, "loss": 0.9992, "step": 11100 }, { "epoch": 3.702468312208139, "grad_norm": 2.178071975708008, "step": 11100 }, { "epoch": 3.702468312208139, "learning_rate": 0.0005807486183279844, "step": 11100 }, { "epoch": 3.702468312208139, "loss": 1.136382818222046, "step": 11100 }, { "ce_loss": 0.3223716616630554, "epoch": 3.702468312208139, "step": 11100 }, { "distill_loss": 0.4490458369255066, "epoch": 3.702468312208139, "step": 11100 }, { "epoch": 3.702468312208139, "ref_ce_loss": 0.17996446788311005, "step": 11100 }, { "epoch": 3.702468312208139, "loss": 0.9748097658157349, "step": 11100 }, { "ce_loss": 0.20400959253311157, "epoch": 3.702468312208139, "step": 11100 }, { "distill_loss": 0.47114190459251404, "epoch": 3.702468312208139, "step": 11100 }, { "epoch": 3.702468312208139, "ref_ce_loss": 0.16003276407718658, "step": 11100 }, { "epoch": 3.7058038692461643, "loss": 0.9737, "step": 11110 }, { "epoch": 3.7058038692461643, "grad_norm": 1.476382851600647, "step": 11110 }, { "epoch": 3.7058038692461643, "learning_rate": 0.0005803630161661702, "step": 11110 }, { "epoch": 3.7058038692461643, "loss": 0.6786836981773376, "step": 11110 }, { "ce_loss": 0.16755782067775726, "epoch": 3.7058038692461643, "step": 11110 }, { "distill_loss": 0.32450413703918457, "epoch": 3.7058038692461643, "step": 11110 }, { "epoch": 3.7058038692461643, "ref_ce_loss": 0.14207275211811066, "step": 11110 }, { "epoch": 3.7058038692461643, "loss": 0.9087648987770081, "step": 11110 }, { "ce_loss": 0.18221266567707062, "epoch": 3.7058038692461643, "step": 11110 }, { "distill_loss": 0.449026882648468, "epoch": 3.7058038692461643, "step": 11110 }, { "epoch": 3.7058038692461643, "ref_ce_loss": 0.16479560732841492, "step": 11110 }, { "epoch": 3.7091394262841897, "loss": 0.9624, "step": 11120 }, { "epoch": 3.7091394262841897, "grad_norm": 2.298190116882324, "step": 11120 }, { "epoch": 3.7091394262841897, "learning_rate": 0.000579977203501308, "step": 11120 }, { "epoch": 3.7091394262841897, "loss": 0.9394449591636658, "step": 11120 }, { "ce_loss": 0.25217294692993164, "epoch": 3.7091394262841897, "step": 11120 }, { "distill_loss": 0.48344358801841736, "epoch": 3.7091394262841897, "step": 11120 }, { "epoch": 3.7091394262841897, "ref_ce_loss": 0.17006704211235046, "step": 11120 }, { "epoch": 3.7091394262841897, "loss": 0.8247861862182617, "step": 11120 }, { "ce_loss": 0.18868543207645416, "epoch": 3.7091394262841897, "step": 11120 }, { "distill_loss": 0.41085153818130493, "epoch": 3.7091394262841897, "step": 11120 }, { "epoch": 3.7091394262841897, "ref_ce_loss": 0.17868436872959137, "step": 11120 }, { "epoch": 3.712474983322215, "loss": 0.9201, "step": 11130 }, { "epoch": 3.712474983322215, "grad_norm": 1.8907734155654907, "step": 11130 }, { "epoch": 3.712474983322215, "learning_rate": 0.0005795911807836831, "step": 11130 }, { "epoch": 3.712474983322215, "loss": 1.0551815032958984, "step": 11130 }, { "ce_loss": 0.2505593001842499, "epoch": 3.712474983322215, "step": 11130 }, { "distill_loss": 0.36986488103866577, "epoch": 3.712474983322215, "step": 11130 }, { "epoch": 3.712474983322215, "ref_ce_loss": 0.20263658463954926, "step": 11130 }, { "epoch": 3.712474983322215, "loss": 0.8181856870651245, "step": 11130 }, { "ce_loss": 0.2138008326292038, "epoch": 3.712474983322215, "step": 11130 }, { "distill_loss": 0.3812239170074463, "epoch": 3.712474983322215, "step": 11130 }, { "epoch": 3.712474983322215, "ref_ce_loss": 0.17711184918880463, "step": 11130 }, { "epoch": 3.7158105403602404, "loss": 1.0005, "step": 11140 }, { "epoch": 3.7158105403602404, "grad_norm": 3.0578932762145996, "step": 11140 }, { "epoch": 3.7158105403602404, "learning_rate": 0.0005792049484638254, "step": 11140 }, { "epoch": 3.7158105403602404, "loss": 0.9433335661888123, "step": 11140 }, { "ce_loss": 0.24768595397472382, "epoch": 3.7158105403602404, "step": 11140 }, { "distill_loss": 0.45756620168685913, "epoch": 3.7158105403602404, "step": 11140 }, { "epoch": 3.7158105403602404, "ref_ce_loss": 0.16918687522411346, "step": 11140 }, { "epoch": 3.7158105403602404, "loss": 1.0306591987609863, "step": 11140 }, { "ce_loss": 0.3187006711959839, "epoch": 3.7158105403602404, "step": 11140 }, { "distill_loss": 0.3860379755496979, "epoch": 3.7158105403602404, "step": 11140 }, { "epoch": 3.7158105403602404, "ref_ce_loss": 0.26220548152923584, "step": 11140 }, { "epoch": 3.7191460973982657, "loss": 0.9562, "step": 11150 }, { "epoch": 3.7191460973982657, "grad_norm": 2.8249945640563965, "step": 11150 }, { "epoch": 3.7191460973982657, "learning_rate": 0.0005788185069925095, "step": 11150 }, { "epoch": 3.7191460973982657, "loss": 0.838791012763977, "step": 11150 }, { "ce_loss": 0.2277652472257614, "epoch": 3.7191460973982657, "step": 11150 }, { "distill_loss": 0.3474879264831543, "epoch": 3.7191460973982657, "step": 11150 }, { "epoch": 3.7191460973982657, "ref_ce_loss": 0.21634413301944733, "step": 11150 }, { "epoch": 3.7191460973982657, "loss": 0.5670017004013062, "step": 11150 }, { "ce_loss": 0.15619748830795288, "epoch": 3.7191460973982657, "step": 11150 }, { "distill_loss": 0.2370021492242813, "epoch": 3.7191460973982657, "step": 11150 }, { "epoch": 3.7191460973982657, "ref_ce_loss": 0.1731836497783661, "step": 11150 }, { "epoch": 3.722481654436291, "loss": 0.8697, "step": 11160 }, { "epoch": 3.722481654436291, "grad_norm": 1.6773978471755981, "step": 11160 }, { "epoch": 3.722481654436291, "learning_rate": 0.0005784318568207546, "step": 11160 }, { "epoch": 3.722481654436291, "loss": 1.0860368013381958, "step": 11160 }, { "ce_loss": 0.31327375769615173, "epoch": 3.722481654436291, "step": 11160 }, { "distill_loss": 0.4752776026725769, "epoch": 3.722481654436291, "step": 11160 }, { "epoch": 3.722481654436291, "ref_ce_loss": 0.2365313172340393, "step": 11160 }, { "epoch": 3.722481654436291, "loss": 0.9724563956260681, "step": 11160 }, { "ce_loss": 0.22032691538333893, "epoch": 3.722481654436291, "step": 11160 }, { "distill_loss": 0.45673760771751404, "epoch": 3.722481654436291, "step": 11160 }, { "epoch": 3.722481654436291, "ref_ce_loss": 0.17892897129058838, "step": 11160 }, { "epoch": 3.7258172114743164, "loss": 0.9788, "step": 11170 }, { "epoch": 3.7258172114743164, "grad_norm": 1.7261686325073242, "step": 11170 }, { "epoch": 3.7258172114743164, "learning_rate": 0.0005780449983998224, "step": 11170 }, { "epoch": 3.7258172114743164, "loss": 0.8740634322166443, "step": 11170 }, { "ce_loss": 0.25200915336608887, "epoch": 3.7258172114743164, "step": 11170 }, { "distill_loss": 0.36934924125671387, "epoch": 3.7258172114743164, "step": 11170 }, { "epoch": 3.7258172114743164, "ref_ce_loss": 0.19166813790798187, "step": 11170 }, { "epoch": 3.7258172114743164, "loss": 0.8459035158157349, "step": 11170 }, { "ce_loss": 0.27757108211517334, "epoch": 3.7258172114743164, "step": 11170 }, { "distill_loss": 0.3620123267173767, "epoch": 3.7258172114743164, "step": 11170 }, { "epoch": 3.7258172114743164, "ref_ce_loss": 0.2055073231458664, "step": 11170 }, { "epoch": 3.729152768512342, "loss": 0.9375, "step": 11180 }, { "epoch": 3.729152768512342, "grad_norm": 2.763355016708374, "step": 11180 }, { "epoch": 3.729152768512342, "learning_rate": 0.0005776579321812187, "step": 11180 }, { "epoch": 3.729152768512342, "loss": 0.8284812569618225, "step": 11180 }, { "ce_loss": 0.2111487090587616, "epoch": 3.729152768512342, "step": 11180 }, { "distill_loss": 0.4274722635746002, "epoch": 3.729152768512342, "step": 11180 }, { "epoch": 3.729152768512342, "ref_ce_loss": 0.189053013920784, "step": 11180 }, { "epoch": 3.729152768512342, "loss": 0.868732213973999, "step": 11180 }, { "ce_loss": 0.24561914801597595, "epoch": 3.729152768512342, "step": 11180 }, { "distill_loss": 0.4243730902671814, "epoch": 3.729152768512342, "step": 11180 }, { "epoch": 3.729152768512342, "ref_ce_loss": 0.1535656601190567, "step": 11180 }, { "epoch": 3.732488325550367, "loss": 0.9525, "step": 11190 }, { "epoch": 3.732488325550367, "grad_norm": 1.8700323104858398, "step": 11190 }, { "epoch": 3.732488325550367, "learning_rate": 0.0005772706586166914, "step": 11190 }, { "epoch": 3.732488325550367, "loss": 0.8410372734069824, "step": 11190 }, { "ce_loss": 0.20929786562919617, "epoch": 3.732488325550367, "step": 11190 }, { "distill_loss": 0.3191835880279541, "epoch": 3.732488325550367, "step": 11190 }, { "epoch": 3.732488325550367, "ref_ce_loss": 0.17638051509857178, "step": 11190 }, { "epoch": 3.732488325550367, "loss": 0.9550619721412659, "step": 11190 }, { "ce_loss": 0.2897661030292511, "epoch": 3.732488325550367, "step": 11190 }, { "distill_loss": 0.4045095443725586, "epoch": 3.732488325550367, "step": 11190 }, { "epoch": 3.732488325550367, "ref_ce_loss": 0.2052481323480606, "step": 11190 }, { "epoch": 3.7358238825883925, "loss": 0.8988, "step": 11200 }, { "epoch": 3.7358238825883925, "grad_norm": 1.7248882055282593, "step": 11200 }, { "epoch": 3.7358238825883925, "learning_rate": 0.0005768831781582304, "step": 11200 }, { "epoch": 3.7358238825883925, "loss": 0.7426757216453552, "step": 11200 }, { "ce_loss": 0.19427357614040375, "epoch": 3.7358238825883925, "step": 11200 }, { "distill_loss": 0.2836810350418091, "epoch": 3.7358238825883925, "step": 11200 }, { "epoch": 3.7358238825883925, "ref_ce_loss": 0.20022033154964447, "step": 11200 }, { "epoch": 3.7358238825883925, "loss": 0.8273655772209167, "step": 11200 }, { "ce_loss": 0.21153953671455383, "epoch": 3.7358238825883925, "step": 11200 }, { "distill_loss": 0.3719659447669983, "epoch": 3.7358238825883925, "step": 11200 }, { "epoch": 3.7358238825883925, "ref_ce_loss": 0.18563783168792725, "step": 11200 }, { "epoch": 3.739159439626418, "loss": 0.9277, "step": 11210 }, { "epoch": 3.739159439626418, "grad_norm": 2.4176762104034424, "step": 11210 }, { "epoch": 3.739159439626418, "learning_rate": 0.000576495491258067, "step": 11210 }, { "epoch": 3.739159439626418, "loss": 1.3678982257843018, "step": 11210 }, { "ce_loss": 0.2704596221446991, "epoch": 3.739159439626418, "step": 11210 }, { "distill_loss": 0.3776765465736389, "epoch": 3.739159439626418, "step": 11210 }, { "epoch": 3.739159439626418, "ref_ce_loss": 0.22148384153842926, "step": 11210 }, { "epoch": 3.739159439626418, "loss": 1.6562130451202393, "step": 11210 }, { "ce_loss": 0.15484488010406494, "epoch": 3.739159439626418, "step": 11210 }, { "distill_loss": 0.3262500464916229, "epoch": 3.739159439626418, "step": 11210 }, { "epoch": 3.739159439626418, "ref_ce_loss": 0.15828606486320496, "step": 11210 }, { "epoch": 3.742494996664443, "loss": 0.9934, "step": 11220 }, { "epoch": 3.742494996664443, "grad_norm": 1.7548892498016357, "step": 11220 }, { "epoch": 3.742494996664443, "learning_rate": 0.0005761075983686738, "step": 11220 }, { "epoch": 3.742494996664443, "loss": 0.9841724634170532, "step": 11220 }, { "ce_loss": 0.27450743317604065, "epoch": 3.742494996664443, "step": 11220 }, { "distill_loss": 0.37013399600982666, "epoch": 3.742494996664443, "step": 11220 }, { "epoch": 3.742494996664443, "ref_ce_loss": 0.20493614673614502, "step": 11220 }, { "epoch": 3.742494996664443, "loss": 0.8894741535186768, "step": 11220 }, { "ce_loss": 0.31326839327812195, "epoch": 3.742494996664443, "step": 11220 }, { "distill_loss": 0.3378714621067047, "epoch": 3.742494996664443, "step": 11220 }, { "epoch": 3.742494996664443, "ref_ce_loss": 0.19991189241409302, "step": 11220 }, { "epoch": 3.7458305537024685, "loss": 0.9505, "step": 11230 }, { "epoch": 3.7458305537024685, "grad_norm": 2.3924026489257812, "step": 11230 }, { "epoch": 3.7458305537024685, "learning_rate": 0.000575719499942763, "step": 11230 }, { "epoch": 3.7458305537024685, "loss": 0.9752727150917053, "step": 11230 }, { "ce_loss": 0.25048398971557617, "epoch": 3.7458305537024685, "step": 11230 }, { "distill_loss": 0.3495645225048065, "epoch": 3.7458305537024685, "step": 11230 }, { "epoch": 3.7458305537024685, "ref_ce_loss": 0.22064881026744843, "step": 11230 }, { "epoch": 3.7458305537024685, "loss": 0.8900600671768188, "step": 11230 }, { "ce_loss": 0.2559802830219269, "epoch": 3.7458305537024685, "step": 11230 }, { "distill_loss": 0.42682915925979614, "epoch": 3.7458305537024685, "step": 11230 }, { "epoch": 3.7458305537024685, "ref_ce_loss": 0.1657843142747879, "step": 11230 }, { "epoch": 3.749166110740494, "loss": 0.9819, "step": 11240 }, { "epoch": 3.749166110740494, "grad_norm": 4.172050952911377, "step": 11240 }, { "epoch": 3.749166110740494, "learning_rate": 0.0005753311964332878, "step": 11240 }, { "epoch": 3.749166110740494, "loss": 0.7777035236358643, "step": 11240 }, { "ce_loss": 0.21494880318641663, "epoch": 3.749166110740494, "step": 11240 }, { "distill_loss": 0.3411872386932373, "epoch": 3.749166110740494, "step": 11240 }, { "epoch": 3.749166110740494, "ref_ce_loss": 0.18135559558868408, "step": 11240 }, { "epoch": 3.749166110740494, "loss": 0.9176045656204224, "step": 11240 }, { "ce_loss": 0.24806998670101166, "epoch": 3.749166110740494, "step": 11240 }, { "distill_loss": 0.4173784852027893, "epoch": 3.749166110740494, "step": 11240 }, { "epoch": 3.749166110740494, "ref_ce_loss": 0.19084446132183075, "step": 11240 }, { "epoch": 3.7525016677785192, "loss": 0.9266, "step": 11250 }, { "epoch": 3.7525016677785192, "grad_norm": 2.111720323562622, "step": 11250 }, { "epoch": 3.7525016677785192, "learning_rate": 0.0005749426882934399, "step": 11250 }, { "epoch": 3.7525016677785192, "loss": 0.9959545731544495, "step": 11250 }, { "ce_loss": 0.31441760063171387, "epoch": 3.7525016677785192, "step": 11250 }, { "distill_loss": 0.4108336865901947, "epoch": 3.7525016677785192, "step": 11250 }, { "epoch": 3.7525016677785192, "ref_ce_loss": 0.21832722425460815, "step": 11250 }, { "epoch": 3.7525016677785192, "loss": 1.1574323177337646, "step": 11250 }, { "ce_loss": 0.3039410710334778, "epoch": 3.7525016677785192, "step": 11250 }, { "distill_loss": 0.46679091453552246, "epoch": 3.7525016677785192, "step": 11250 }, { "epoch": 3.7525016677785192, "ref_ce_loss": 0.21937808394432068, "step": 11250 }, { "epoch": 3.7558372248165446, "loss": 0.8932, "step": 11260 }, { "epoch": 3.7558372248165446, "grad_norm": 1.6136808395385742, "step": 11260 }, { "epoch": 3.7558372248165446, "learning_rate": 0.0005745539759766502, "step": 11260 }, { "epoch": 3.7558372248165446, "loss": 1.2521703243255615, "step": 11260 }, { "ce_loss": 0.2340213805437088, "epoch": 3.7558372248165446, "step": 11260 }, { "distill_loss": 0.44366782903671265, "epoch": 3.7558372248165446, "step": 11260 }, { "epoch": 3.7558372248165446, "ref_ce_loss": 0.20129236578941345, "step": 11260 }, { "epoch": 3.7558372248165446, "loss": 0.8436146974563599, "step": 11260 }, { "ce_loss": 0.24747362732887268, "epoch": 3.7558372248165446, "step": 11260 }, { "distill_loss": 0.38767534494400024, "epoch": 3.7558372248165446, "step": 11260 }, { "epoch": 3.7558372248165446, "ref_ce_loss": 0.20765142142772675, "step": 11260 }, { "epoch": 3.75917278185457, "loss": 0.925, "step": 11270 }, { "epoch": 3.75917278185457, "grad_norm": 5.605003833770752, "step": 11270 }, { "epoch": 3.75917278185457, "learning_rate": 0.0005741650599365877, "step": 11270 }, { "epoch": 3.75917278185457, "loss": 1.3394966125488281, "step": 11270 }, { "ce_loss": 0.3728139400482178, "epoch": 3.75917278185457, "step": 11270 }, { "distill_loss": 0.4576507806777954, "epoch": 3.75917278185457, "step": 11270 }, { "epoch": 3.75917278185457, "ref_ce_loss": 0.21766957640647888, "step": 11270 }, { "epoch": 3.75917278185457, "loss": 0.9282214045524597, "step": 11270 }, { "ce_loss": 0.22735916078090668, "epoch": 3.75917278185457, "step": 11270 }, { "distill_loss": 0.38220497965812683, "epoch": 3.75917278185457, "step": 11270 }, { "epoch": 3.75917278185457, "ref_ce_loss": 0.14693810045719147, "step": 11270 }, { "epoch": 3.7625083388925953, "loss": 0.9751, "step": 11280 }, { "epoch": 3.7625083388925953, "grad_norm": 2.9692952632904053, "step": 11280 }, { "epoch": 3.7625083388925953, "learning_rate": 0.0005737759406271593, "step": 11280 }, { "epoch": 3.7625083388925953, "loss": 0.9229706525802612, "step": 11280 }, { "ce_loss": 0.20989659428596497, "epoch": 3.7625083388925953, "step": 11280 }, { "distill_loss": 0.40266168117523193, "epoch": 3.7625083388925953, "step": 11280 }, { "epoch": 3.7625083388925953, "ref_ce_loss": 0.18926185369491577, "step": 11280 }, { "epoch": 3.7625083388925953, "loss": 0.7887105345726013, "step": 11280 }, { "ce_loss": 0.2364315539598465, "epoch": 3.7625083388925953, "step": 11280 }, { "distill_loss": 0.3450629413127899, "epoch": 3.7625083388925953, "step": 11280 }, { "epoch": 3.7625083388925953, "ref_ce_loss": 0.14800573885440826, "step": 11280 }, { "epoch": 3.7658438959306206, "loss": 1.0287, "step": 11290 }, { "epoch": 3.7658438959306206, "grad_norm": 3.338712692260742, "step": 11290 }, { "epoch": 3.7658438959306206, "learning_rate": 0.000573386618502509, "step": 11290 }, { "epoch": 3.7658438959306206, "loss": 1.0351455211639404, "step": 11290 }, { "ce_loss": 0.21638186275959015, "epoch": 3.7658438959306206, "step": 11290 }, { "distill_loss": 0.4342281222343445, "epoch": 3.7658438959306206, "step": 11290 }, { "epoch": 3.7658438959306206, "ref_ce_loss": 0.1624898761510849, "step": 11290 }, { "epoch": 3.7658438959306206, "loss": 0.7106236815452576, "step": 11290 }, { "ce_loss": 0.19261404871940613, "epoch": 3.7658438959306206, "step": 11290 }, { "distill_loss": 0.3417753577232361, "epoch": 3.7658438959306206, "step": 11290 }, { "epoch": 3.7658438959306206, "ref_ce_loss": 0.17591843008995056, "step": 11290 }, { "epoch": 3.769179452968646, "loss": 0.9465, "step": 11300 }, { "epoch": 3.769179452968646, "grad_norm": 2.0295517444610596, "step": 11300 }, { "epoch": 3.769179452968646, "learning_rate": 0.000572997094017018, "step": 11300 }, { "epoch": 3.769179452968646, "loss": 0.8671320676803589, "step": 11300 }, { "ce_loss": 0.2617754638195038, "epoch": 3.769179452968646, "step": 11300 }, { "distill_loss": 0.3775753378868103, "epoch": 3.769179452968646, "step": 11300 }, { "epoch": 3.769179452968646, "ref_ce_loss": 0.19144763052463531, "step": 11300 }, { "epoch": 3.769179452968646, "loss": 1.2064995765686035, "step": 11300 }, { "ce_loss": 0.24847352504730225, "epoch": 3.769179452968646, "step": 11300 }, { "distill_loss": 0.40565523505210876, "epoch": 3.769179452968646, "step": 11300 }, { "epoch": 3.769179452968646, "ref_ce_loss": 0.2336285561323166, "step": 11300 }, { "epoch": 3.7725150100066713, "loss": 0.9088, "step": 11310 }, { "epoch": 3.7725150100066713, "grad_norm": 1.7244271039962769, "step": 11310 }, { "epoch": 3.7725150100066713, "learning_rate": 0.0005726073676253029, "step": 11310 }, { "epoch": 3.7725150100066713, "loss": 0.7884076833724976, "step": 11310 }, { "ce_loss": 0.21592627465724945, "epoch": 3.7725150100066713, "step": 11310 }, { "distill_loss": 0.3208105266094208, "epoch": 3.7725150100066713, "step": 11310 }, { "epoch": 3.7725150100066713, "ref_ce_loss": 0.1611410230398178, "step": 11310 }, { "epoch": 3.7725150100066713, "loss": 0.9448220729827881, "step": 11310 }, { "ce_loss": 0.2776465117931366, "epoch": 3.7725150100066713, "step": 11310 }, { "distill_loss": 0.3224950432777405, "epoch": 3.7725150100066713, "step": 11310 }, { "epoch": 3.7725150100066713, "ref_ce_loss": 0.16729578375816345, "step": 11310 }, { "epoch": 3.7758505670446967, "loss": 0.9252, "step": 11320 }, { "epoch": 3.7758505670446967, "grad_norm": 1.8959004878997803, "step": 11320 }, { "epoch": 3.7758505670446967, "learning_rate": 0.0005722174397822165, "step": 11320 }, { "epoch": 3.7758505670446967, "loss": 1.5210129022598267, "step": 11320 }, { "ce_loss": 0.2503696084022522, "epoch": 3.7758505670446967, "step": 11320 }, { "distill_loss": 0.3779032826423645, "epoch": 3.7758505670446967, "step": 11320 }, { "epoch": 3.7758505670446967, "ref_ce_loss": 0.2040143758058548, "step": 11320 }, { "epoch": 3.7758505670446967, "loss": 1.2529124021530151, "step": 11320 }, { "ce_loss": 0.2998267412185669, "epoch": 3.7758505670446967, "step": 11320 }, { "distill_loss": 0.4451632499694824, "epoch": 3.7758505670446967, "step": 11320 }, { "epoch": 3.7758505670446967, "ref_ce_loss": 0.1881563514471054, "step": 11320 }, { "epoch": 3.779186124082722, "loss": 1.0143, "step": 11330 }, { "epoch": 3.779186124082722, "grad_norm": 1.705941081047058, "step": 11330 }, { "epoch": 3.779186124082722, "learning_rate": 0.0005718273109428464, "step": 11330 }, { "epoch": 3.779186124082722, "loss": 1.025900959968567, "step": 11330 }, { "ce_loss": 0.2714976370334625, "epoch": 3.779186124082722, "step": 11330 }, { "distill_loss": 0.47367486357688904, "epoch": 3.779186124082722, "step": 11330 }, { "epoch": 3.779186124082722, "ref_ce_loss": 0.22894425690174103, "step": 11330 }, { "epoch": 3.779186124082722, "loss": 0.8805956840515137, "step": 11330 }, { "ce_loss": 0.24296237528324127, "epoch": 3.779186124082722, "step": 11330 }, { "distill_loss": 0.47993454337120056, "epoch": 3.779186124082722, "step": 11330 }, { "epoch": 3.779186124082722, "ref_ce_loss": 0.15654857456684113, "step": 11330 }, { "epoch": 3.7825216811207474, "loss": 0.9005, "step": 11340 }, { "epoch": 3.7825216811207474, "grad_norm": 5.436649799346924, "step": 11340 }, { "epoch": 3.7825216811207474, "learning_rate": 0.0005714369815625151, "step": 11340 }, { "epoch": 3.7825216811207474, "loss": 0.9933842420578003, "step": 11340 }, { "ce_loss": 0.24282170832157135, "epoch": 3.7825216811207474, "step": 11340 }, { "distill_loss": 0.43368253111839294, "epoch": 3.7825216811207474, "step": 11340 }, { "epoch": 3.7825216811207474, "ref_ce_loss": 0.18155056238174438, "step": 11340 }, { "epoch": 3.7825216811207474, "loss": 1.4033174514770508, "step": 11340 }, { "ce_loss": 0.29729801416397095, "epoch": 3.7825216811207474, "step": 11340 }, { "distill_loss": 0.45099925994873047, "epoch": 3.7825216811207474, "step": 11340 }, { "epoch": 3.7825216811207474, "ref_ce_loss": 0.1876297891139984, "step": 11340 }, { "epoch": 3.7858572381587727, "loss": 1.102, "step": 11350 }, { "epoch": 3.7858572381587727, "grad_norm": 1.883608102798462, "step": 11350 }, { "epoch": 3.7858572381587727, "learning_rate": 0.000571046452096779, "step": 11350 }, { "epoch": 3.7858572381587727, "loss": 1.0281188488006592, "step": 11350 }, { "ce_loss": 0.2619418203830719, "epoch": 3.7858572381587727, "step": 11350 }, { "distill_loss": 0.39741992950439453, "epoch": 3.7858572381587727, "step": 11350 }, { "epoch": 3.7858572381587727, "ref_ce_loss": 0.20731395483016968, "step": 11350 }, { "epoch": 3.7858572381587727, "loss": 0.7758177518844604, "step": 11350 }, { "ce_loss": 0.19287358224391937, "epoch": 3.7858572381587727, "step": 11350 }, { "distill_loss": 0.3553203046321869, "epoch": 3.7858572381587727, "step": 11350 }, { "epoch": 3.7858572381587727, "ref_ce_loss": 0.14900435507297516, "step": 11350 }, { "epoch": 3.789192795196798, "loss": 0.9732, "step": 11360 }, { "epoch": 3.789192795196798, "grad_norm": 2.2708394527435303, "step": 11360 }, { "epoch": 3.789192795196798, "learning_rate": 0.0005706557230014278, "step": 11360 }, { "epoch": 3.789192795196798, "loss": 0.8519456386566162, "step": 11360 }, { "ce_loss": 0.24678350985050201, "epoch": 3.789192795196798, "step": 11360 }, { "distill_loss": 0.3918919265270233, "epoch": 3.789192795196798, "step": 11360 }, { "epoch": 3.789192795196798, "ref_ce_loss": 0.15620338916778564, "step": 11360 }, { "epoch": 3.789192795196798, "loss": 0.8818426132202148, "step": 11360 }, { "ce_loss": 0.23588576912879944, "epoch": 3.789192795196798, "step": 11360 }, { "distill_loss": 0.36162734031677246, "epoch": 3.789192795196798, "step": 11360 }, { "epoch": 3.789192795196798, "ref_ce_loss": 0.22672615945339203, "step": 11360 }, { "epoch": 3.7925283522348234, "loss": 1.0084, "step": 11370 }, { "epoch": 3.7925283522348234, "grad_norm": 2.6330015659332275, "step": 11370 }, { "epoch": 3.7925283522348234, "learning_rate": 0.0005702647947324847, "step": 11370 }, { "epoch": 3.7925283522348234, "loss": 1.1395282745361328, "step": 11370 }, { "ce_loss": 0.2408571094274521, "epoch": 3.7925283522348234, "step": 11370 }, { "distill_loss": 0.3482729494571686, "epoch": 3.7925283522348234, "step": 11370 }, { "epoch": 3.7925283522348234, "ref_ce_loss": 0.18775120377540588, "step": 11370 }, { "epoch": 3.7925283522348234, "loss": 0.9223992824554443, "step": 11370 }, { "ce_loss": 0.26535001397132874, "epoch": 3.7925283522348234, "step": 11370 }, { "distill_loss": 0.3763105869293213, "epoch": 3.7925283522348234, "step": 11370 }, { "epoch": 3.7925283522348234, "ref_ce_loss": 0.24394690990447998, "step": 11370 }, { "epoch": 3.795863909272849, "loss": 0.8764, "step": 11380 }, { "epoch": 3.795863909272849, "grad_norm": 3.5896894931793213, "step": 11380 }, { "epoch": 3.795863909272849, "learning_rate": 0.0005698736677462048, "step": 11380 }, { "epoch": 3.795863909272849, "loss": 0.9310532808303833, "step": 11380 }, { "ce_loss": 0.2635497450828552, "epoch": 3.795863909272849, "step": 11380 }, { "distill_loss": 0.4105827808380127, "epoch": 3.795863909272849, "step": 11380 }, { "epoch": 3.795863909272849, "ref_ce_loss": 0.16474372148513794, "step": 11380 }, { "epoch": 3.795863909272849, "loss": 0.9801605343818665, "step": 11380 }, { "ce_loss": 0.2689959406852722, "epoch": 3.795863909272849, "step": 11380 }, { "distill_loss": 0.40919217467308044, "epoch": 3.795863909272849, "step": 11380 }, { "epoch": 3.795863909272849, "ref_ce_loss": 0.17264766991138458, "step": 11380 }, { "epoch": 3.799199466310874, "loss": 1.0239, "step": 11390 }, { "epoch": 3.799199466310874, "grad_norm": 1.7258336544036865, "step": 11390 }, { "epoch": 3.799199466310874, "learning_rate": 0.0005694823424990755, "step": 11390 }, { "epoch": 3.799199466310874, "loss": 1.012412190437317, "step": 11390 }, { "ce_loss": 0.30163678526878357, "epoch": 3.799199466310874, "step": 11390 }, { "distill_loss": 0.40283679962158203, "epoch": 3.799199466310874, "step": 11390 }, { "epoch": 3.799199466310874, "ref_ce_loss": 0.23806728422641754, "step": 11390 }, { "epoch": 3.799199466310874, "loss": 0.9156680703163147, "step": 11390 }, { "ce_loss": 0.23893040418624878, "epoch": 3.799199466310874, "step": 11390 }, { "distill_loss": 0.40328019857406616, "epoch": 3.799199466310874, "step": 11390 }, { "epoch": 3.799199466310874, "ref_ce_loss": 0.15019917488098145, "step": 11390 }, { "epoch": 3.8025350233488995, "loss": 0.9566, "step": 11400 }, { "epoch": 3.8025350233488995, "grad_norm": 1.9312031269073486, "step": 11400 }, { "epoch": 3.8025350233488995, "learning_rate": 0.0005690908194478156, "step": 11400 }, { "epoch": 3.8025350233488995, "loss": 0.9236981868743896, "step": 11400 }, { "ce_loss": 0.27061566710472107, "epoch": 3.8025350233488995, "step": 11400 }, { "distill_loss": 0.40460094809532166, "epoch": 3.8025350233488995, "step": 11400 }, { "epoch": 3.8025350233488995, "ref_ce_loss": 0.20047782361507416, "step": 11400 }, { "epoch": 3.8025350233488995, "loss": 0.7674431204795837, "step": 11400 }, { "ce_loss": 0.19978666305541992, "epoch": 3.8025350233488995, "step": 11400 }, { "distill_loss": 0.3495452105998993, "epoch": 3.8025350233488995, "step": 11400 }, { "epoch": 3.8025350233488995, "ref_ce_loss": 0.17217499017715454, "step": 11400 }, { "epoch": 3.805870580386925, "loss": 0.968, "step": 11410 }, { "epoch": 3.805870580386925, "grad_norm": 1.6505191326141357, "step": 11410 }, { "epoch": 3.805870580386925, "learning_rate": 0.0005686990990493743, "step": 11410 }, { "epoch": 3.805870580386925, "loss": 1.1001862287521362, "step": 11410 }, { "ce_loss": 0.22656288743019104, "epoch": 3.805870580386925, "step": 11410 }, { "distill_loss": 0.37116605043411255, "epoch": 3.805870580386925, "step": 11410 }, { "epoch": 3.805870580386925, "ref_ce_loss": 0.15069936215877533, "step": 11410 }, { "epoch": 3.805870580386925, "loss": 1.0263774394989014, "step": 11410 }, { "ce_loss": 0.253476619720459, "epoch": 3.805870580386925, "step": 11410 }, { "distill_loss": 0.41293397545814514, "epoch": 3.805870580386925, "step": 11410 }, { "epoch": 3.805870580386925, "ref_ce_loss": 0.23333688080310822, "step": 11410 }, { "epoch": 3.80920613742495, "loss": 1.0876, "step": 11420 }, { "epoch": 3.80920613742495, "grad_norm": 3.7331254482269287, "step": 11420 }, { "epoch": 3.80920613742495, "learning_rate": 0.0005683071817609316, "step": 11420 }, { "epoch": 3.80920613742495, "loss": 1.2252082824707031, "step": 11420 }, { "ce_loss": 0.20216085016727448, "epoch": 3.80920613742495, "step": 11420 }, { "distill_loss": 0.4016602337360382, "epoch": 3.80920613742495, "step": 11420 }, { "epoch": 3.80920613742495, "ref_ce_loss": 0.1538444310426712, "step": 11420 }, { "epoch": 3.80920613742495, "loss": 0.8858697414398193, "step": 11420 }, { "ce_loss": 0.2571501135826111, "epoch": 3.80920613742495, "step": 11420 }, { "distill_loss": 0.35483574867248535, "epoch": 3.80920613742495, "step": 11420 }, { "epoch": 3.80920613742495, "ref_ce_loss": 0.20577125251293182, "step": 11420 }, { "epoch": 3.8125416944629755, "loss": 0.9702, "step": 11430 }, { "epoch": 3.8125416944629755, "grad_norm": 1.4017664194107056, "step": 11430 }, { "epoch": 3.8125416944629755, "learning_rate": 0.0005679150680398973, "step": 11430 }, { "epoch": 3.8125416944629755, "loss": 1.4945435523986816, "step": 11430 }, { "ce_loss": 0.35453975200653076, "epoch": 3.8125416944629755, "step": 11430 }, { "distill_loss": 0.43121251463890076, "epoch": 3.8125416944629755, "step": 11430 }, { "epoch": 3.8125416944629755, "ref_ce_loss": 0.21792420744895935, "step": 11430 }, { "epoch": 3.8125416944629755, "loss": 0.8512135148048401, "step": 11430 }, { "ce_loss": 0.2419276386499405, "epoch": 3.8125416944629755, "step": 11430 }, { "distill_loss": 0.39038610458374023, "epoch": 3.8125416944629755, "step": 11430 }, { "epoch": 3.8125416944629755, "ref_ce_loss": 0.17313846945762634, "step": 11430 }, { "epoch": 3.815877251501001, "loss": 0.9297, "step": 11440 }, { "epoch": 3.815877251501001, "grad_norm": 1.5681135654449463, "step": 11440 }, { "epoch": 3.815877251501001, "learning_rate": 0.0005675227583439101, "step": 11440 }, { "epoch": 3.815877251501001, "loss": 0.9723762273788452, "step": 11440 }, { "ce_loss": 0.318612277507782, "epoch": 3.815877251501001, "step": 11440 }, { "distill_loss": 0.39654994010925293, "epoch": 3.815877251501001, "step": 11440 }, { "epoch": 3.815877251501001, "ref_ce_loss": 0.1935034692287445, "step": 11440 }, { "epoch": 3.815877251501001, "loss": 0.9978158473968506, "step": 11440 }, { "ce_loss": 0.2571842670440674, "epoch": 3.815877251501001, "step": 11440 }, { "distill_loss": 0.34870341420173645, "epoch": 3.815877251501001, "step": 11440 }, { "epoch": 3.815877251501001, "ref_ce_loss": 0.22138366103172302, "step": 11440 }, { "epoch": 3.8192128085390262, "loss": 0.9286, "step": 11450 }, { "epoch": 3.8192128085390262, "grad_norm": 2.1632747650146484, "step": 11450 }, { "epoch": 3.8192128085390262, "learning_rate": 0.0005671302531308378, "step": 11450 }, { "epoch": 3.8192128085390262, "loss": 1.3651049137115479, "step": 11450 }, { "ce_loss": 0.4106031358242035, "epoch": 3.8192128085390262, "step": 11450 }, { "distill_loss": 0.526943027973175, "epoch": 3.8192128085390262, "step": 11450 }, { "epoch": 3.8192128085390262, "ref_ce_loss": 0.19772358238697052, "step": 11450 }, { "epoch": 3.8192128085390262, "loss": 0.8169754147529602, "step": 11450 }, { "ce_loss": 0.22076590359210968, "epoch": 3.8192128085390262, "step": 11450 }, { "distill_loss": 0.36730560660362244, "epoch": 3.8192128085390262, "step": 11450 }, { "epoch": 3.8192128085390262, "ref_ce_loss": 0.18142905831336975, "step": 11450 }, { "epoch": 3.8225483655770516, "loss": 1.0225, "step": 11460 }, { "epoch": 3.8225483655770516, "grad_norm": 1.8946765661239624, "step": 11460 }, { "epoch": 3.8225483655770516, "learning_rate": 0.000566737552858776, "step": 11460 }, { "epoch": 3.8225483655770516, "loss": 0.8395046591758728, "step": 11460 }, { "ce_loss": 0.21020092070102692, "epoch": 3.8225483655770516, "step": 11460 }, { "distill_loss": 0.3612057566642761, "epoch": 3.8225483655770516, "step": 11460 }, { "epoch": 3.8225483655770516, "ref_ce_loss": 0.18916825950145721, "step": 11460 }, { "epoch": 3.8225483655770516, "loss": 1.445246696472168, "step": 11460 }, { "ce_loss": 0.1898326277732849, "epoch": 3.8225483655770516, "step": 11460 }, { "distill_loss": 0.30803728103637695, "epoch": 3.8225483655770516, "step": 11460 }, { "epoch": 3.8225483655770516, "ref_ce_loss": 0.13109755516052246, "step": 11460 }, { "epoch": 3.825883922615077, "loss": 1.0358, "step": 11470 }, { "epoch": 3.825883922615077, "grad_norm": 1.6582616567611694, "step": 11470 }, { "epoch": 3.825883922615077, "learning_rate": 0.0005663446579860484, "step": 11470 }, { "epoch": 3.825883922615077, "loss": 0.9347367286682129, "step": 11470 }, { "ce_loss": 0.24629852175712585, "epoch": 3.825883922615077, "step": 11470 }, { "distill_loss": 0.42903152108192444, "epoch": 3.825883922615077, "step": 11470 }, { "epoch": 3.825883922615077, "ref_ce_loss": 0.19728568196296692, "step": 11470 }, { "epoch": 3.825883922615077, "loss": 0.8807161450386047, "step": 11470 }, { "ce_loss": 0.25386542081832886, "epoch": 3.825883922615077, "step": 11470 }, { "distill_loss": 0.42778605222702026, "epoch": 3.825883922615077, "step": 11470 }, { "epoch": 3.825883922615077, "ref_ce_loss": 0.1473657488822937, "step": 11470 }, { "epoch": 3.8292194796531023, "loss": 0.9359, "step": 11480 }, { "epoch": 3.8292194796531023, "grad_norm": 1.6030758619308472, "step": 11480 }, { "epoch": 3.8292194796531023, "learning_rate": 0.0005659515689712055, "step": 11480 }, { "epoch": 3.8292194796531023, "loss": 1.6527743339538574, "step": 11480 }, { "ce_loss": 0.25234341621398926, "epoch": 3.8292194796531023, "step": 11480 }, { "distill_loss": 0.44041571021080017, "epoch": 3.8292194796531023, "step": 11480 }, { "epoch": 3.8292194796531023, "ref_ce_loss": 0.19048476219177246, "step": 11480 }, { "epoch": 3.8292194796531023, "loss": 1.0194995403289795, "step": 11480 }, { "ce_loss": 0.279092013835907, "epoch": 3.8292194796531023, "step": 11480 }, { "distill_loss": 0.4226592481136322, "epoch": 3.8292194796531023, "step": 11480 }, { "epoch": 3.8292194796531023, "ref_ce_loss": 0.1561541110277176, "step": 11480 }, { "epoch": 3.8325550366911276, "loss": 0.9889, "step": 11490 }, { "epoch": 3.8325550366911276, "grad_norm": 3.101188898086548, "step": 11490 }, { "epoch": 3.8325550366911276, "learning_rate": 0.0005655582862730246, "step": 11490 }, { "epoch": 3.8325550366911276, "loss": 0.9315988421440125, "step": 11490 }, { "ce_loss": 0.2899245619773865, "epoch": 3.8325550366911276, "step": 11490 }, { "distill_loss": 0.4321276545524597, "epoch": 3.8325550366911276, "step": 11490 }, { "epoch": 3.8325550366911276, "ref_ce_loss": 0.209343820810318, "step": 11490 }, { "epoch": 3.8325550366911276, "loss": 0.9821953177452087, "step": 11490 }, { "ce_loss": 0.32735350728034973, "epoch": 3.8325550366911276, "step": 11490 }, { "distill_loss": 0.43529486656188965, "epoch": 3.8325550366911276, "step": 11490 }, { "epoch": 3.8325550366911276, "ref_ce_loss": 0.2194371074438095, "step": 11490 }, { "epoch": 3.835890593729153, "loss": 0.9208, "step": 11500 }, { "epoch": 3.835890593729153, "grad_norm": 2.652716875076294, "step": 11500 }, { "epoch": 3.835890593729153, "learning_rate": 0.0005651648103505088, "step": 11500 }, { "epoch": 3.835890593729153, "loss": 1.133766531944275, "step": 11500 }, { "ce_loss": 0.3791617453098297, "epoch": 3.835890593729153, "step": 11500 }, { "distill_loss": 0.485236793756485, "epoch": 3.835890593729153, "step": 11500 }, { "epoch": 3.835890593729153, "ref_ce_loss": 0.22717487812042236, "step": 11500 }, { "epoch": 3.835890593729153, "loss": 1.312686562538147, "step": 11500 }, { "ce_loss": 0.292243629693985, "epoch": 3.835890593729153, "step": 11500 }, { "distill_loss": 0.4213583171367645, "epoch": 3.835890593729153, "step": 11500 }, { "epoch": 3.835890593729153, "ref_ce_loss": 0.23505744338035583, "step": 11500 }, { "epoch": 3.8392261507671783, "loss": 0.9949, "step": 11510 }, { "epoch": 3.8392261507671783, "grad_norm": 1.6241575479507446, "step": 11510 }, { "epoch": 3.8392261507671783, "learning_rate": 0.0005647711416628867, "step": 11510 }, { "epoch": 3.8392261507671783, "loss": 0.8845500349998474, "step": 11510 }, { "ce_loss": 0.21759924292564392, "epoch": 3.8392261507671783, "step": 11510 }, { "distill_loss": 0.45337334275245667, "epoch": 3.8392261507671783, "step": 11510 }, { "epoch": 3.8392261507671783, "ref_ce_loss": 0.21198280155658722, "step": 11510 }, { "epoch": 3.8392261507671783, "loss": 1.3442260026931763, "step": 11510 }, { "ce_loss": 0.2494586855173111, "epoch": 3.8392261507671783, "step": 11510 }, { "distill_loss": 0.4459645748138428, "epoch": 3.8392261507671783, "step": 11510 }, { "epoch": 3.8392261507671783, "ref_ce_loss": 0.196689635515213, "step": 11510 }, { "epoch": 3.8425617078052037, "loss": 1.0527, "step": 11520 }, { "epoch": 3.8425617078052037, "grad_norm": 3.0656344890594482, "step": 11520 }, { "epoch": 3.8425617078052037, "learning_rate": 0.0005643772806696121, "step": 11520 }, { "epoch": 3.8425617078052037, "loss": 0.6794697642326355, "step": 11520 }, { "ce_loss": 0.18232791125774384, "epoch": 3.8425617078052037, "step": 11520 }, { "distill_loss": 0.2960767447948456, "epoch": 3.8425617078052037, "step": 11520 }, { "epoch": 3.8425617078052037, "ref_ce_loss": 0.16795457899570465, "step": 11520 }, { "epoch": 3.8425617078052037, "loss": 1.225640058517456, "step": 11520 }, { "ce_loss": 0.2971508800983429, "epoch": 3.8425617078052037, "step": 11520 }, { "distill_loss": 0.3605060577392578, "epoch": 3.8425617078052037, "step": 11520 }, { "epoch": 3.8425617078052037, "ref_ce_loss": 0.20262135565280914, "step": 11520 }, { "epoch": 3.845897264843229, "loss": 0.9276, "step": 11530 }, { "epoch": 3.845897264843229, "grad_norm": 2.342433214187622, "step": 11530 }, { "epoch": 3.845897264843229, "learning_rate": 0.0005639832278303635, "step": 11530 }, { "epoch": 3.845897264843229, "loss": 1.0097906589508057, "step": 11530 }, { "ce_loss": 0.325705349445343, "epoch": 3.845897264843229, "step": 11530 }, { "distill_loss": 0.38462504744529724, "epoch": 3.845897264843229, "step": 11530 }, { "epoch": 3.845897264843229, "ref_ce_loss": 0.24787509441375732, "step": 11530 }, { "epoch": 3.845897264843229, "loss": 0.7476581335067749, "step": 11530 }, { "ce_loss": 0.21294884383678436, "epoch": 3.845897264843229, "step": 11530 }, { "distill_loss": 0.28872600197792053, "epoch": 3.845897264843229, "step": 11530 }, { "epoch": 3.845897264843229, "ref_ce_loss": 0.19121286273002625, "step": 11530 }, { "epoch": 3.8492328218812544, "loss": 0.9075, "step": 11540 }, { "epoch": 3.8492328218812544, "grad_norm": 1.577726125717163, "step": 11540 }, { "epoch": 3.8492328218812544, "learning_rate": 0.0005635889836050424, "step": 11540 }, { "epoch": 3.8492328218812544, "loss": 0.7826956510543823, "step": 11540 }, { "ce_loss": 0.19527533650398254, "epoch": 3.8492328218812544, "step": 11540 }, { "distill_loss": 0.3526442050933838, "epoch": 3.8492328218812544, "step": 11540 }, { "epoch": 3.8492328218812544, "ref_ce_loss": 0.18786796927452087, "step": 11540 }, { "epoch": 3.8492328218812544, "loss": 1.1441820859909058, "step": 11540 }, { "ce_loss": 0.2938821017742157, "epoch": 3.8492328218812544, "step": 11540 }, { "distill_loss": 0.4380337595939636, "epoch": 3.8492328218812544, "step": 11540 }, { "epoch": 3.8492328218812544, "ref_ce_loss": 0.19052253663539886, "step": 11540 }, { "epoch": 3.8525683789192797, "loss": 0.9338, "step": 11550 }, { "epoch": 3.8525683789192797, "grad_norm": 2.0250110626220703, "step": 11550 }, { "epoch": 3.8525683789192797, "learning_rate": 0.0005631945484537748, "step": 11550 }, { "epoch": 3.8525683789192797, "loss": 0.8967652320861816, "step": 11550 }, { "ce_loss": 0.25885623693466187, "epoch": 3.8525683789192797, "step": 11550 }, { "distill_loss": 0.3998911380767822, "epoch": 3.8525683789192797, "step": 11550 }, { "epoch": 3.8525683789192797, "ref_ce_loss": 0.17840033769607544, "step": 11550 }, { "epoch": 3.8525683789192797, "loss": 0.7759818434715271, "step": 11550 }, { "ce_loss": 0.24023577570915222, "epoch": 3.8525683789192797, "step": 11550 }, { "distill_loss": 0.3571813106536865, "epoch": 3.8525683789192797, "step": 11550 }, { "epoch": 3.8525683789192797, "ref_ce_loss": 0.17843982577323914, "step": 11550 }, { "epoch": 3.855903935957305, "loss": 0.9064, "step": 11560 }, { "epoch": 3.855903935957305, "grad_norm": 1.7031183242797852, "step": 11560 }, { "epoch": 3.855903935957305, "learning_rate": 0.0005627999228369085, "step": 11560 }, { "epoch": 3.855903935957305, "loss": 0.8540741801261902, "step": 11560 }, { "ce_loss": 0.26487821340560913, "epoch": 3.855903935957305, "step": 11560 }, { "distill_loss": 0.3655508756637573, "epoch": 3.855903935957305, "step": 11560 }, { "epoch": 3.855903935957305, "ref_ce_loss": 0.1813589483499527, "step": 11560 }, { "epoch": 3.855903935957305, "loss": 0.9351487159729004, "step": 11560 }, { "ce_loss": 0.22617729008197784, "epoch": 3.855903935957305, "step": 11560 }, { "distill_loss": 0.3438546061515808, "epoch": 3.855903935957305, "step": 11560 }, { "epoch": 3.855903935957305, "ref_ce_loss": 0.2046925276517868, "step": 11560 }, { "epoch": 3.8592394929953304, "loss": 0.8818, "step": 11570 }, { "epoch": 3.8592394929953304, "grad_norm": 1.5661900043487549, "step": 11570 }, { "epoch": 3.8592394929953304, "learning_rate": 0.0005624051072150144, "step": 11570 }, { "epoch": 3.8592394929953304, "loss": 0.9107818603515625, "step": 11570 }, { "ce_loss": 0.213269904255867, "epoch": 3.8592394929953304, "step": 11570 }, { "distill_loss": 0.2721361219882965, "epoch": 3.8592394929953304, "step": 11570 }, { "epoch": 3.8592394929953304, "ref_ce_loss": 0.18350692093372345, "step": 11570 }, { "epoch": 3.8592394929953304, "loss": 0.9907459020614624, "step": 11570 }, { "ce_loss": 0.3015082776546478, "epoch": 3.8592394929953304, "step": 11570 }, { "distill_loss": 0.3977634012699127, "epoch": 3.8592394929953304, "step": 11570 }, { "epoch": 3.8592394929953304, "ref_ce_loss": 0.23032258450984955, "step": 11570 }, { "epoch": 3.8625750500333558, "loss": 0.9655, "step": 11580 }, { "epoch": 3.8625750500333558, "grad_norm": 2.352884292602539, "step": 11580 }, { "epoch": 3.8625750500333558, "learning_rate": 0.0005620101020488846, "step": 11580 }, { "epoch": 3.8625750500333558, "loss": 0.939232349395752, "step": 11580 }, { "ce_loss": 0.2103375494480133, "epoch": 3.8625750500333558, "step": 11580 }, { "distill_loss": 0.39442554116249084, "epoch": 3.8625750500333558, "step": 11580 }, { "epoch": 3.8625750500333558, "ref_ce_loss": 0.1931764930486679, "step": 11580 }, { "epoch": 3.8625750500333558, "loss": 0.906415581703186, "step": 11580 }, { "ce_loss": 0.23109756410121918, "epoch": 3.8625750500333558, "step": 11580 }, { "distill_loss": 0.35593166947364807, "epoch": 3.8625750500333558, "step": 11580 }, { "epoch": 3.8625750500333558, "ref_ce_loss": 0.18460983037948608, "step": 11580 }, { "epoch": 3.865910607071381, "loss": 0.9738, "step": 11590 }, { "epoch": 3.865910607071381, "grad_norm": 2.738985776901245, "step": 11590 }, { "epoch": 3.865910607071381, "learning_rate": 0.0005616149077995327, "step": 11590 }, { "epoch": 3.865910607071381, "loss": 0.8089038133621216, "step": 11590 }, { "ce_loss": 0.19172555208206177, "epoch": 3.865910607071381, "step": 11590 }, { "distill_loss": 0.4226304590702057, "epoch": 3.865910607071381, "step": 11590 }, { "epoch": 3.865910607071381, "ref_ce_loss": 0.14394626021385193, "step": 11590 }, { "epoch": 3.865910607071381, "loss": 1.1200426816940308, "step": 11590 }, { "ce_loss": 0.29103899002075195, "epoch": 3.865910607071381, "step": 11590 }, { "distill_loss": 0.43065327405929565, "epoch": 3.865910607071381, "step": 11590 }, { "epoch": 3.865910607071381, "ref_ce_loss": 0.2459547519683838, "step": 11590 }, { "epoch": 3.8692461641094065, "loss": 0.976, "step": 11600 }, { "epoch": 3.8692461641094065, "grad_norm": 1.8246506452560425, "step": 11600 }, { "epoch": 3.8692461641094065, "learning_rate": 0.0005612195249281929, "step": 11600 }, { "epoch": 3.8692461641094065, "loss": 1.0592341423034668, "step": 11600 }, { "ce_loss": 0.2808479368686676, "epoch": 3.8692461641094065, "step": 11600 }, { "distill_loss": 0.3962073028087616, "epoch": 3.8692461641094065, "step": 11600 }, { "epoch": 3.8692461641094065, "ref_ce_loss": 0.19259952008724213, "step": 11600 }, { "epoch": 3.8692461641094065, "loss": 0.7964693307876587, "step": 11600 }, { "ce_loss": 0.21781563758850098, "epoch": 3.8692461641094065, "step": 11600 }, { "distill_loss": 0.38051968812942505, "epoch": 3.8692461641094065, "step": 11600 }, { "epoch": 3.8692461641094065, "ref_ce_loss": 0.1979907602071762, "step": 11600 }, { "epoch": 3.872581721147432, "loss": 0.9616, "step": 11610 }, { "epoch": 3.872581721147432, "grad_norm": 3.134458065032959, "step": 11610 }, { "epoch": 3.872581721147432, "learning_rate": 0.0005608239538963196, "step": 11610 }, { "epoch": 3.872581721147432, "loss": 0.7981612086296082, "step": 11610 }, { "ce_loss": 0.23766736686229706, "epoch": 3.872581721147432, "step": 11610 }, { "distill_loss": 0.37494006752967834, "epoch": 3.872581721147432, "step": 11610 }, { "epoch": 3.872581721147432, "ref_ce_loss": 0.15241451561450958, "step": 11610 }, { "epoch": 3.872581721147432, "loss": 0.7920822501182556, "step": 11610 }, { "ce_loss": 0.2178262621164322, "epoch": 3.872581721147432, "step": 11610 }, { "distill_loss": 0.3271922767162323, "epoch": 3.872581721147432, "step": 11610 }, { "epoch": 3.872581721147432, "ref_ce_loss": 0.17308199405670166, "step": 11610 }, { "epoch": 3.875917278185457, "loss": 0.9232, "step": 11620 }, { "epoch": 3.875917278185457, "grad_norm": 1.5603810548782349, "step": 11620 }, { "epoch": 3.875917278185457, "learning_rate": 0.0005604281951655868, "step": 11620 }, { "epoch": 3.875917278185457, "loss": 0.7440937161445618, "step": 11620 }, { "ce_loss": 0.1296824812889099, "epoch": 3.875917278185457, "step": 11620 }, { "distill_loss": 0.3723609149456024, "epoch": 3.875917278185457, "step": 11620 }, { "epoch": 3.875917278185457, "ref_ce_loss": 0.1284295618534088, "step": 11620 }, { "epoch": 3.875917278185457, "loss": 0.7650660872459412, "step": 11620 }, { "ce_loss": 0.19801418483257294, "epoch": 3.875917278185457, "step": 11620 }, { "distill_loss": 0.3212094008922577, "epoch": 3.875917278185457, "step": 11620 }, { "epoch": 3.875917278185457, "ref_ce_loss": 0.14547830820083618, "step": 11620 }, { "epoch": 3.8792528352234825, "loss": 0.9491, "step": 11630 }, { "epoch": 3.8792528352234825, "grad_norm": 2.8136508464813232, "step": 11630 }, { "epoch": 3.8792528352234825, "learning_rate": 0.0005600322491978873, "step": 11630 }, { "epoch": 3.8792528352234825, "loss": 1.1764713525772095, "step": 11630 }, { "ce_loss": 0.259009450674057, "epoch": 3.8792528352234825, "step": 11630 }, { "distill_loss": 0.3844780921936035, "epoch": 3.8792528352234825, "step": 11630 }, { "epoch": 3.8792528352234825, "ref_ce_loss": 0.18857301771640778, "step": 11630 }, { "epoch": 3.8792528352234825, "loss": 1.0874712467193604, "step": 11630 }, { "ce_loss": 0.3212282061576843, "epoch": 3.8792528352234825, "step": 11630 }, { "distill_loss": 0.4671300947666168, "epoch": 3.8792528352234825, "step": 11630 }, { "epoch": 3.8792528352234825, "ref_ce_loss": 0.24173882603645325, "step": 11630 }, { "epoch": 3.882588392261508, "loss": 1.0431, "step": 11640 }, { "epoch": 3.882588392261508, "grad_norm": 3.056569814682007, "step": 11640 }, { "epoch": 3.882588392261508, "learning_rate": 0.0005596361164553328, "step": 11640 }, { "epoch": 3.882588392261508, "loss": 0.9977152347564697, "step": 11640 }, { "ce_loss": 0.26923465728759766, "epoch": 3.882588392261508, "step": 11640 }, { "distill_loss": 0.45488986372947693, "epoch": 3.882588392261508, "step": 11640 }, { "epoch": 3.882588392261508, "ref_ce_loss": 0.1783684492111206, "step": 11640 }, { "epoch": 3.882588392261508, "loss": 0.966017484664917, "step": 11640 }, { "ce_loss": 0.25669997930526733, "epoch": 3.882588392261508, "step": 11640 }, { "distill_loss": 0.4465809464454651, "epoch": 3.882588392261508, "step": 11640 }, { "epoch": 3.882588392261508, "ref_ce_loss": 0.1974465399980545, "step": 11640 }, { "epoch": 3.885923949299533, "loss": 0.9427, "step": 11650 }, { "epoch": 3.885923949299533, "grad_norm": 1.5865579843521118, "step": 11650 }, { "epoch": 3.885923949299533, "learning_rate": 0.0005592397974002529, "step": 11650 }, { "epoch": 3.885923949299533, "loss": 0.9701722264289856, "step": 11650 }, { "ce_loss": 0.29138481616973877, "epoch": 3.885923949299533, "step": 11650 }, { "distill_loss": 0.42188239097595215, "epoch": 3.885923949299533, "step": 11650 }, { "epoch": 3.885923949299533, "ref_ce_loss": 0.19481723010540009, "step": 11650 }, { "epoch": 3.885923949299533, "loss": 1.0660479068756104, "step": 11650 }, { "ce_loss": 0.33463263511657715, "epoch": 3.885923949299533, "step": 11650 }, { "distill_loss": 0.49488842487335205, "epoch": 3.885923949299533, "step": 11650 }, { "epoch": 3.885923949299533, "ref_ce_loss": 0.19474494457244873, "step": 11650 }, { "epoch": 3.8892595063375586, "loss": 0.997, "step": 11660 }, { "epoch": 3.8892595063375586, "grad_norm": 11.462584495544434, "step": 11660 }, { "epoch": 3.8892595063375586, "learning_rate": 0.0005588432924951946, "step": 11660 }, { "epoch": 3.8892595063375586, "loss": 1.0220303535461426, "step": 11660 }, { "ce_loss": 0.22120627760887146, "epoch": 3.8892595063375586, "step": 11660 }, { "distill_loss": 0.4513772130012512, "epoch": 3.8892595063375586, "step": 11660 }, { "epoch": 3.8892595063375586, "ref_ce_loss": 0.15893080830574036, "step": 11660 }, { "epoch": 3.8892595063375586, "loss": 0.7385839819908142, "step": 11660 }, { "ce_loss": 0.2053300440311432, "epoch": 3.8892595063375586, "step": 11660 }, { "distill_loss": 0.329124391078949, "epoch": 3.8892595063375586, "step": 11660 }, { "epoch": 3.8892595063375586, "ref_ce_loss": 0.1607813835144043, "step": 11660 }, { "epoch": 3.892595063375584, "loss": 0.998, "step": 11670 }, { "epoch": 3.892595063375584, "grad_norm": 3.3075084686279297, "step": 11670 }, { "epoch": 3.892595063375584, "learning_rate": 0.0005584466022029216, "step": 11670 }, { "epoch": 3.892595063375584, "loss": 0.818089485168457, "step": 11670 }, { "ce_loss": 0.20594333112239838, "epoch": 3.892595063375584, "step": 11670 }, { "distill_loss": 0.3592768609523773, "epoch": 3.892595063375584, "step": 11670 }, { "epoch": 3.892595063375584, "ref_ce_loss": 0.19639188051223755, "step": 11670 }, { "epoch": 3.892595063375584, "loss": 0.8467075228691101, "step": 11670 }, { "ce_loss": 0.20294122397899628, "epoch": 3.892595063375584, "step": 11670 }, { "distill_loss": 0.40357089042663574, "epoch": 3.892595063375584, "step": 11670 }, { "epoch": 3.892595063375584, "ref_ce_loss": 0.19864407181739807, "step": 11670 }, { "epoch": 3.8959306204136093, "loss": 0.9248, "step": 11680 }, { "epoch": 3.8959306204136093, "grad_norm": 1.8866527080535889, "step": 11680 }, { "epoch": 3.8959306204136093, "learning_rate": 0.0005580497269864143, "step": 11680 }, { "epoch": 3.8959306204136093, "loss": 0.9279657602310181, "step": 11680 }, { "ce_loss": 0.20917311310768127, "epoch": 3.8959306204136093, "step": 11680 }, { "distill_loss": 0.45263731479644775, "epoch": 3.8959306204136093, "step": 11680 }, { "epoch": 3.8959306204136093, "ref_ce_loss": 0.1506700962781906, "step": 11680 }, { "epoch": 3.8959306204136093, "loss": 1.6638834476470947, "step": 11680 }, { "ce_loss": 0.35619089007377625, "epoch": 3.8959306204136093, "step": 11680 }, { "distill_loss": 0.43666911125183105, "epoch": 3.8959306204136093, "step": 11680 }, { "epoch": 3.8959306204136093, "ref_ce_loss": 0.1947658658027649, "step": 11680 }, { "epoch": 3.8992661774516346, "loss": 1.0302, "step": 11690 }, { "epoch": 3.8992661774516346, "grad_norm": 2.085873603820801, "step": 11690 }, { "epoch": 3.8992661774516346, "learning_rate": 0.0005576526673088687, "step": 11690 }, { "epoch": 3.8992661774516346, "loss": 0.9703258872032166, "step": 11690 }, { "ce_loss": 0.30788952112197876, "epoch": 3.8992661774516346, "step": 11690 }, { "distill_loss": 0.44872936606407166, "epoch": 3.8992661774516346, "step": 11690 }, { "epoch": 3.8992661774516346, "ref_ce_loss": 0.21353916823863983, "step": 11690 }, { "epoch": 3.8992661774516346, "loss": 1.0158580541610718, "step": 11690 }, { "ce_loss": 0.27169573307037354, "epoch": 3.8992661774516346, "step": 11690 }, { "distill_loss": 0.4001632630825043, "epoch": 3.8992661774516346, "step": 11690 }, { "epoch": 3.8992661774516346, "ref_ce_loss": 0.14454112946987152, "step": 11690 }, { "epoch": 3.90260173448966, "loss": 0.9128, "step": 11700 }, { "epoch": 3.90260173448966, "grad_norm": 2.307708740234375, "step": 11700 }, { "epoch": 3.90260173448966, "learning_rate": 0.0005572554236336965, "step": 11700 }, { "epoch": 3.90260173448966, "loss": 0.7074758410453796, "step": 11700 }, { "ce_loss": 0.2118670642375946, "epoch": 3.90260173448966, "step": 11700 }, { "distill_loss": 0.36380529403686523, "epoch": 3.90260173448966, "step": 11700 }, { "epoch": 3.90260173448966, "ref_ce_loss": 0.1316874474287033, "step": 11700 }, { "epoch": 3.90260173448966, "loss": 1.0797171592712402, "step": 11700 }, { "ce_loss": 0.36392879486083984, "epoch": 3.90260173448966, "step": 11700 }, { "distill_loss": 0.4837646484375, "epoch": 3.90260173448966, "step": 11700 }, { "epoch": 3.90260173448966, "ref_ce_loss": 0.18095727264881134, "step": 11700 }, { "epoch": 3.9059372915276853, "loss": 1.0169, "step": 11710 }, { "epoch": 3.9059372915276853, "grad_norm": 1.657469391822815, "step": 11710 }, { "epoch": 3.9059372915276853, "learning_rate": 0.0005568579964245232, "step": 11710 }, { "epoch": 3.9059372915276853, "loss": 0.8145405650138855, "step": 11710 }, { "ce_loss": 0.2403329610824585, "epoch": 3.9059372915276853, "step": 11710 }, { "distill_loss": 0.40306556224823, "epoch": 3.9059372915276853, "step": 11710 }, { "epoch": 3.9059372915276853, "ref_ce_loss": 0.1709594428539276, "step": 11710 }, { "epoch": 3.9059372915276853, "loss": 0.7344586849212646, "step": 11710 }, { "ce_loss": 0.16823998093605042, "epoch": 3.9059372915276853, "step": 11710 }, { "distill_loss": 0.32683515548706055, "epoch": 3.9059372915276853, "step": 11710 }, { "epoch": 3.9059372915276853, "ref_ce_loss": 0.14987175166606903, "step": 11710 }, { "epoch": 3.9092728485657107, "loss": 0.9718, "step": 11720 }, { "epoch": 3.9092728485657107, "grad_norm": 1.7074451446533203, "step": 11720 }, { "epoch": 3.9092728485657107, "learning_rate": 0.0005564603861451897, "step": 11720 }, { "epoch": 3.9092728485657107, "loss": 0.9449779391288757, "step": 11720 }, { "ce_loss": 0.18956990540027618, "epoch": 3.9092728485657107, "step": 11720 }, { "distill_loss": 0.4226230978965759, "epoch": 3.9092728485657107, "step": 11720 }, { "epoch": 3.9092728485657107, "ref_ce_loss": 0.20470745861530304, "step": 11720 }, { "epoch": 3.9092728485657107, "loss": 0.7637391090393066, "step": 11720 }, { "ce_loss": 0.19855497777462006, "epoch": 3.9092728485657107, "step": 11720 }, { "distill_loss": 0.3552302420139313, "epoch": 3.9092728485657107, "step": 11720 }, { "epoch": 3.9092728485657107, "ref_ce_loss": 0.1477682888507843, "step": 11720 }, { "epoch": 3.912608405603736, "loss": 1.0152, "step": 11730 }, { "epoch": 3.912608405603736, "grad_norm": 2.0933945178985596, "step": 11730 }, { "epoch": 3.912608405603736, "learning_rate": 0.0005560625932597494, "step": 11730 }, { "epoch": 3.912608405603736, "loss": 0.9675263166427612, "step": 11730 }, { "ce_loss": 0.24618081748485565, "epoch": 3.912608405603736, "step": 11730 }, { "distill_loss": 0.4911355972290039, "epoch": 3.912608405603736, "step": 11730 }, { "epoch": 3.912608405603736, "ref_ce_loss": 0.18077616393566132, "step": 11730 }, { "epoch": 3.912608405603736, "loss": 0.7928896546363831, "step": 11730 }, { "ce_loss": 0.2026551514863968, "epoch": 3.912608405603736, "step": 11730 }, { "distill_loss": 0.3882897198200226, "epoch": 3.912608405603736, "step": 11730 }, { "epoch": 3.912608405603736, "ref_ce_loss": 0.20181968808174133, "step": 11730 }, { "epoch": 3.9159439626417614, "loss": 0.9015, "step": 11740 }, { "epoch": 3.9159439626417614, "grad_norm": 2.6087889671325684, "step": 11740 }, { "epoch": 3.9159439626417614, "learning_rate": 0.00055566461823247, "step": 11740 }, { "epoch": 3.9159439626417614, "loss": 0.7968668937683105, "step": 11740 }, { "ce_loss": 0.17508544027805328, "epoch": 3.9159439626417614, "step": 11740 }, { "distill_loss": 0.4119051694869995, "epoch": 3.9159439626417614, "step": 11740 }, { "epoch": 3.9159439626417614, "ref_ce_loss": 0.1454639434814453, "step": 11740 }, { "epoch": 3.9159439626417614, "loss": 1.1098814010620117, "step": 11740 }, { "ce_loss": 0.2541578412055969, "epoch": 3.9159439626417614, "step": 11740 }, { "distill_loss": 0.4228516221046448, "epoch": 3.9159439626417614, "step": 11740 }, { "epoch": 3.9159439626417614, "ref_ce_loss": 0.15402543544769287, "step": 11740 }, { "epoch": 3.9192795196797867, "loss": 0.9195, "step": 11750 }, { "epoch": 3.9192795196797867, "grad_norm": 1.6107227802276611, "step": 11750 }, { "epoch": 3.9192795196797867, "learning_rate": 0.0005552664615278308, "step": 11750 }, { "epoch": 3.9192795196797867, "loss": 0.9172618389129639, "step": 11750 }, { "ce_loss": 0.20093008875846863, "epoch": 3.9192795196797867, "step": 11750 }, { "distill_loss": 0.39634275436401367, "epoch": 3.9192795196797867, "step": 11750 }, { "epoch": 3.9192795196797867, "ref_ce_loss": 0.17416979372501373, "step": 11750 }, { "epoch": 3.9192795196797867, "loss": 0.8351260423660278, "step": 11750 }, { "ce_loss": 0.19515912234783173, "epoch": 3.9192795196797867, "step": 11750 }, { "distill_loss": 0.3535310626029968, "epoch": 3.9192795196797867, "step": 11750 }, { "epoch": 3.9192795196797867, "ref_ce_loss": 0.21191735565662384, "step": 11750 }, { "epoch": 3.922615076717812, "loss": 0.8606, "step": 11760 }, { "epoch": 3.922615076717812, "grad_norm": 1.865551471710205, "step": 11760 }, { "epoch": 3.922615076717812, "learning_rate": 0.0005548681236105239, "step": 11760 }, { "epoch": 3.922615076717812, "loss": 0.8535208702087402, "step": 11760 }, { "ce_loss": 0.20602232217788696, "epoch": 3.922615076717812, "step": 11760 }, { "distill_loss": 0.4059385657310486, "epoch": 3.922615076717812, "step": 11760 }, { "epoch": 3.922615076717812, "ref_ce_loss": 0.1823427379131317, "step": 11760 }, { "epoch": 3.922615076717812, "loss": 0.8497105240821838, "step": 11760 }, { "ce_loss": 0.19806364178657532, "epoch": 3.922615076717812, "step": 11760 }, { "distill_loss": 0.3617856204509735, "epoch": 3.922615076717812, "step": 11760 }, { "epoch": 3.922615076717812, "ref_ce_loss": 0.16827456653118134, "step": 11760 }, { "epoch": 3.9259506337558374, "loss": 1.0327, "step": 11770 }, { "epoch": 3.9259506337558374, "grad_norm": 1.6781734228134155, "step": 11770 }, { "epoch": 3.9259506337558374, "learning_rate": 0.000554469604945452, "step": 11770 }, { "epoch": 3.9259506337558374, "loss": 1.0478616952896118, "step": 11770 }, { "ce_loss": 0.3283158838748932, "epoch": 3.9259506337558374, "step": 11770 }, { "distill_loss": 0.4395040273666382, "epoch": 3.9259506337558374, "step": 11770 }, { "epoch": 3.9259506337558374, "ref_ce_loss": 0.2273147851228714, "step": 11770 }, { "epoch": 3.9259506337558374, "loss": 1.000312328338623, "step": 11770 }, { "ce_loss": 0.257784366607666, "epoch": 3.9259506337558374, "step": 11770 }, { "distill_loss": 0.40631210803985596, "epoch": 3.9259506337558374, "step": 11770 }, { "epoch": 3.9259506337558374, "ref_ce_loss": 0.19344256818294525, "step": 11770 }, { "epoch": 3.9292861907938628, "loss": 0.9852, "step": 11780 }, { "epoch": 3.9292861907938628, "grad_norm": 1.7029778957366943, "step": 11780 }, { "epoch": 3.9292861907938628, "learning_rate": 0.0005540709059977295, "step": 11780 }, { "epoch": 3.9292861907938628, "loss": 0.7573671936988831, "step": 11780 }, { "ce_loss": 0.21558508276939392, "epoch": 3.9292861907938628, "step": 11780 }, { "distill_loss": 0.37834596633911133, "epoch": 3.9292861907938628, "step": 11780 }, { "epoch": 3.9292861907938628, "ref_ce_loss": 0.16318820416927338, "step": 11780 }, { "epoch": 3.9292861907938628, "loss": 0.8037177324295044, "step": 11780 }, { "ce_loss": 0.220443993806839, "epoch": 3.9292861907938628, "step": 11780 }, { "distill_loss": 0.42402610182762146, "epoch": 3.9292861907938628, "step": 11780 }, { "epoch": 3.9292861907938628, "ref_ce_loss": 0.1590704619884491, "step": 11780 }, { "epoch": 3.932621747831888, "loss": 0.9677, "step": 11790 }, { "epoch": 3.932621747831888, "grad_norm": 1.564285159111023, "step": 11790 }, { "epoch": 3.932621747831888, "learning_rate": 0.000553672027232681, "step": 11790 }, { "epoch": 3.932621747831888, "loss": 1.0326200723648071, "step": 11790 }, { "ce_loss": 0.321227490901947, "epoch": 3.932621747831888, "step": 11790 }, { "distill_loss": 0.512378454208374, "epoch": 3.932621747831888, "step": 11790 }, { "epoch": 3.932621747831888, "ref_ce_loss": 0.19875237345695496, "step": 11790 }, { "epoch": 3.932621747831888, "loss": 0.9585893154144287, "step": 11790 }, { "ce_loss": 0.29675567150115967, "epoch": 3.932621747831888, "step": 11790 }, { "distill_loss": 0.4784086346626282, "epoch": 3.932621747831888, "step": 11790 }, { "epoch": 3.932621747831888, "ref_ce_loss": 0.18320876359939575, "step": 11790 }, { "epoch": 3.9359573048699135, "loss": 1.008, "step": 11800 }, { "epoch": 3.9359573048699135, "grad_norm": 1.9860695600509644, "step": 11800 }, { "epoch": 3.9359573048699135, "learning_rate": 0.000553272969115841, "step": 11800 }, { "epoch": 3.9359573048699135, "loss": 0.743493378162384, "step": 11800 }, { "ce_loss": 0.22666524350643158, "epoch": 3.9359573048699135, "step": 11800 }, { "distill_loss": 0.33357691764831543, "epoch": 3.9359573048699135, "step": 11800 }, { "epoch": 3.9359573048699135, "ref_ce_loss": 0.18286947906017303, "step": 11800 }, { "epoch": 3.9359573048699135, "loss": 0.6927028894424438, "step": 11800 }, { "ce_loss": 0.14644154906272888, "epoch": 3.9359573048699135, "step": 11800 }, { "distill_loss": 0.32516467571258545, "epoch": 3.9359573048699135, "step": 11800 }, { "epoch": 3.9359573048699135, "ref_ce_loss": 0.14002791047096252, "step": 11800 }, { "epoch": 3.939292861907939, "loss": 0.9237, "step": 11810 }, { "epoch": 3.939292861907939, "grad_norm": 2.8374104499816895, "step": 11810 }, { "epoch": 3.939292861907939, "learning_rate": 0.0005528737321129532, "step": 11810 }, { "epoch": 3.939292861907939, "loss": 0.9271951913833618, "step": 11810 }, { "ce_loss": 0.3107430040836334, "epoch": 3.939292861907939, "step": 11810 }, { "distill_loss": 0.37300023436546326, "epoch": 3.939292861907939, "step": 11810 }, { "epoch": 3.939292861907939, "ref_ce_loss": 0.1981804221868515, "step": 11810 }, { "epoch": 3.939292861907939, "loss": 0.9263529181480408, "step": 11810 }, { "ce_loss": 0.2825232446193695, "epoch": 3.939292861907939, "step": 11810 }, { "distill_loss": 0.39083096385002136, "epoch": 3.939292861907939, "step": 11810 }, { "epoch": 3.939292861907939, "ref_ce_loss": 0.2027083933353424, "step": 11810 }, { "epoch": 3.942628418945964, "loss": 0.9221, "step": 11820 }, { "epoch": 3.942628418945964, "grad_norm": 1.7531052827835083, "step": 11820 }, { "epoch": 3.942628418945964, "learning_rate": 0.0005524743166899701, "step": 11820 }, { "epoch": 3.942628418945964, "loss": 1.1111127138137817, "step": 11820 }, { "ce_loss": 0.30061981081962585, "epoch": 3.942628418945964, "step": 11820 }, { "distill_loss": 0.43286579847335815, "epoch": 3.942628418945964, "step": 11820 }, { "epoch": 3.942628418945964, "ref_ce_loss": 0.21395601332187653, "step": 11820 }, { "epoch": 3.942628418945964, "loss": 0.940593957901001, "step": 11820 }, { "ce_loss": 0.2132023274898529, "epoch": 3.942628418945964, "step": 11820 }, { "distill_loss": 0.3509421944618225, "epoch": 3.942628418945964, "step": 11820 }, { "epoch": 3.942628418945964, "ref_ce_loss": 0.12664273381233215, "step": 11820 }, { "epoch": 3.9459639759839895, "loss": 0.9548, "step": 11830 }, { "epoch": 3.9459639759839895, "grad_norm": 2.730525493621826, "step": 11830 }, { "epoch": 3.9459639759839895, "learning_rate": 0.0005520747233130525, "step": 11830 }, { "epoch": 3.9459639759839895, "loss": 0.9735243320465088, "step": 11830 }, { "ce_loss": 0.26277390122413635, "epoch": 3.9459639759839895, "step": 11830 }, { "distill_loss": 0.391275554895401, "epoch": 3.9459639759839895, "step": 11830 }, { "epoch": 3.9459639759839895, "ref_ce_loss": 0.16927450895309448, "step": 11830 }, { "epoch": 3.9459639759839895, "loss": 1.2078938484191895, "step": 11830 }, { "ce_loss": 0.370376318693161, "epoch": 3.9459639759839895, "step": 11830 }, { "distill_loss": 0.4815659523010254, "epoch": 3.9459639759839895, "step": 11830 }, { "epoch": 3.9459639759839895, "ref_ce_loss": 0.23204077780246735, "step": 11830 }, { "epoch": 3.949299533022015, "loss": 1.0518, "step": 11840 }, { "epoch": 3.949299533022015, "grad_norm": 2.2668051719665527, "step": 11840 }, { "epoch": 3.949299533022015, "learning_rate": 0.0005516749524485688, "step": 11840 }, { "epoch": 3.949299533022015, "loss": 0.8577396273612976, "step": 11840 }, { "ce_loss": 0.2821693420410156, "epoch": 3.949299533022015, "step": 11840 }, { "distill_loss": 0.36145728826522827, "epoch": 3.949299533022015, "step": 11840 }, { "epoch": 3.949299533022015, "ref_ce_loss": 0.21394376456737518, "step": 11840 }, { "epoch": 3.949299533022015, "loss": 1.1756106615066528, "step": 11840 }, { "ce_loss": 0.2654615640640259, "epoch": 3.949299533022015, "step": 11840 }, { "distill_loss": 0.36142125725746155, "epoch": 3.949299533022015, "step": 11840 }, { "epoch": 3.949299533022015, "ref_ce_loss": 0.2146204560995102, "step": 11840 }, { "epoch": 3.95263509006004, "loss": 0.9651, "step": 11850 }, { "epoch": 3.95263509006004, "grad_norm": 1.6232107877731323, "step": 11850 }, { "epoch": 3.95263509006004, "learning_rate": 0.0005512750045630947, "step": 11850 }, { "epoch": 3.95263509006004, "loss": 0.8906152248382568, "step": 11850 }, { "ce_loss": 0.2890780568122864, "epoch": 3.95263509006004, "step": 11850 }, { "distill_loss": 0.35990801453590393, "epoch": 3.95263509006004, "step": 11850 }, { "epoch": 3.95263509006004, "ref_ce_loss": 0.1792559176683426, "step": 11850 }, { "epoch": 3.95263509006004, "loss": 0.9378436803817749, "step": 11850 }, { "ce_loss": 0.28084325790405273, "epoch": 3.95263509006004, "step": 11850 }, { "distill_loss": 0.4056680500507355, "epoch": 3.95263509006004, "step": 11850 }, { "epoch": 3.95263509006004, "ref_ce_loss": 0.2064526230096817, "step": 11850 }, { "epoch": 3.9559706470980656, "loss": 0.9023, "step": 11860 }, { "epoch": 3.9559706470980656, "grad_norm": 2.2000057697296143, "step": 11860 }, { "epoch": 3.9559706470980656, "learning_rate": 0.0005508748801234127, "step": 11860 }, { "epoch": 3.9559706470980656, "loss": 0.924540102481842, "step": 11860 }, { "ce_loss": 0.26077377796173096, "epoch": 3.9559706470980656, "step": 11860 }, { "distill_loss": 0.442877858877182, "epoch": 3.9559706470980656, "step": 11860 }, { "epoch": 3.9559706470980656, "ref_ce_loss": 0.16746099293231964, "step": 11860 }, { "epoch": 3.9559706470980656, "loss": 1.2667691707611084, "step": 11860 }, { "ce_loss": 0.26282796263694763, "epoch": 3.9559706470980656, "step": 11860 }, { "distill_loss": 0.46519577503204346, "epoch": 3.9559706470980656, "step": 11860 }, { "epoch": 3.9559706470980656, "ref_ce_loss": 0.20361657440662384, "step": 11860 }, { "epoch": 3.959306204136091, "loss": 1.0097, "step": 11870 }, { "epoch": 3.959306204136091, "grad_norm": 1.7751314640045166, "step": 11870 }, { "epoch": 3.959306204136091, "learning_rate": 0.0005504745795965104, "step": 11870 }, { "epoch": 3.959306204136091, "loss": 0.9637972116470337, "step": 11870 }, { "ce_loss": 0.20609210431575775, "epoch": 3.959306204136091, "step": 11870 }, { "distill_loss": 0.413042277097702, "epoch": 3.959306204136091, "step": 11870 }, { "epoch": 3.959306204136091, "ref_ce_loss": 0.17276988923549652, "step": 11870 }, { "epoch": 3.959306204136091, "loss": 1.049441933631897, "step": 11870 }, { "ce_loss": 0.22047023475170135, "epoch": 3.959306204136091, "step": 11870 }, { "distill_loss": 0.4326194226741791, "epoch": 3.959306204136091, "step": 11870 }, { "epoch": 3.959306204136091, "ref_ce_loss": 0.15497992932796478, "step": 11870 }, { "epoch": 3.9626417611741163, "loss": 0.9621, "step": 11880 }, { "epoch": 3.9626417611741163, "grad_norm": 1.6827456951141357, "step": 11880 }, { "epoch": 3.9626417611741163, "learning_rate": 0.0005500741034495822, "step": 11880 }, { "epoch": 3.9626417611741163, "loss": 0.7790173888206482, "step": 11880 }, { "ce_loss": 0.20525261759757996, "epoch": 3.9626417611741163, "step": 11880 }, { "distill_loss": 0.3322910666465759, "epoch": 3.9626417611741163, "step": 11880 }, { "epoch": 3.9626417611741163, "ref_ce_loss": 0.19715571403503418, "step": 11880 }, { "epoch": 3.9626417611741163, "loss": 1.076646089553833, "step": 11880 }, { "ce_loss": 0.2301023304462433, "epoch": 3.9626417611741163, "step": 11880 }, { "distill_loss": 0.3682442903518677, "epoch": 3.9626417611741163, "step": 11880 }, { "epoch": 3.9626417611741163, "ref_ce_loss": 0.18718735873699188, "step": 11880 }, { "epoch": 3.9659773182121416, "loss": 0.8858, "step": 11890 }, { "epoch": 3.9659773182121416, "grad_norm": 1.9867877960205078, "step": 11890 }, { "epoch": 3.9659773182121416, "learning_rate": 0.0005496734521500265, "step": 11890 }, { "epoch": 3.9659773182121416, "loss": 1.0159093141555786, "step": 11890 }, { "ce_loss": 0.253842830657959, "epoch": 3.9659773182121416, "step": 11890 }, { "distill_loss": 0.3500358462333679, "epoch": 3.9659773182121416, "step": 11890 }, { "epoch": 3.9659773182121416, "ref_ce_loss": 0.19169864058494568, "step": 11890 }, { "epoch": 3.9659773182121416, "loss": 0.8605555295944214, "step": 11890 }, { "ce_loss": 0.2701185941696167, "epoch": 3.9659773182121416, "step": 11890 }, { "distill_loss": 0.4148210883140564, "epoch": 3.9659773182121416, "step": 11890 }, { "epoch": 3.9659773182121416, "ref_ce_loss": 0.1754133701324463, "step": 11890 }, { "epoch": 3.969312875250167, "loss": 0.8858, "step": 11900 }, { "epoch": 3.969312875250167, "grad_norm": 2.906344413757324, "step": 11900 }, { "epoch": 3.969312875250167, "learning_rate": 0.0005492726261654467, "step": 11900 }, { "epoch": 3.969312875250167, "loss": 1.1624562740325928, "step": 11900 }, { "ce_loss": 0.2951255738735199, "epoch": 3.969312875250167, "step": 11900 }, { "distill_loss": 0.47946152091026306, "epoch": 3.969312875250167, "step": 11900 }, { "epoch": 3.969312875250167, "ref_ce_loss": 0.23917268216609955, "step": 11900 }, { "epoch": 3.969312875250167, "loss": 1.189023494720459, "step": 11900 }, { "ce_loss": 0.29075753688812256, "epoch": 3.969312875250167, "step": 11900 }, { "distill_loss": 0.3917032778263092, "epoch": 3.969312875250167, "step": 11900 }, { "epoch": 3.969312875250167, "ref_ce_loss": 0.22798548638820648, "step": 11900 }, { "epoch": 3.9726484322881923, "loss": 1.0152, "step": 11910 }, { "epoch": 3.9726484322881923, "grad_norm": 2.9853665828704834, "step": 11910 }, { "epoch": 3.9726484322881923, "learning_rate": 0.0005488716259636498, "step": 11910 }, { "epoch": 3.9726484322881923, "loss": 0.8837149739265442, "step": 11910 }, { "ce_loss": 0.2542972266674042, "epoch": 3.9726484322881923, "step": 11910 }, { "distill_loss": 0.4023490846157074, "epoch": 3.9726484322881923, "step": 11910 }, { "epoch": 3.9726484322881923, "ref_ce_loss": 0.20137716829776764, "step": 11910 }, { "epoch": 3.9726484322881923, "loss": 0.9087706804275513, "step": 11910 }, { "ce_loss": 0.29229024052619934, "epoch": 3.9726484322881923, "step": 11910 }, { "distill_loss": 0.36741021275520325, "epoch": 3.9726484322881923, "step": 11910 }, { "epoch": 3.9726484322881923, "ref_ce_loss": 0.2003345787525177, "step": 11910 }, { "epoch": 3.9759839893262177, "loss": 0.9513, "step": 11920 }, { "epoch": 3.9759839893262177, "grad_norm": 1.9088329076766968, "step": 11920 }, { "epoch": 3.9759839893262177, "learning_rate": 0.0005484704520126461, "step": 11920 }, { "epoch": 3.9759839893262177, "loss": 0.8984208703041077, "step": 11920 }, { "ce_loss": 0.24777761101722717, "epoch": 3.9759839893262177, "step": 11920 }, { "distill_loss": 0.38926613330841064, "epoch": 3.9759839893262177, "step": 11920 }, { "epoch": 3.9759839893262177, "ref_ce_loss": 0.2100924551486969, "step": 11920 }, { "epoch": 3.9759839893262177, "loss": 1.131884217262268, "step": 11920 }, { "ce_loss": 0.32054466009140015, "epoch": 3.9759839893262177, "step": 11920 }, { "distill_loss": 0.5047661662101746, "epoch": 3.9759839893262177, "step": 11920 }, { "epoch": 3.9759839893262177, "ref_ce_loss": 0.17037981748580933, "step": 11920 }, { "epoch": 3.979319546364243, "loss": 0.9376, "step": 11930 }, { "epoch": 3.979319546364243, "grad_norm": 1.9283043146133423, "step": 11930 }, { "epoch": 3.979319546364243, "learning_rate": 0.0005480691047806488, "step": 11930 }, { "epoch": 3.979319546364243, "loss": 0.8340834975242615, "step": 11930 }, { "ce_loss": 0.2035832554101944, "epoch": 3.979319546364243, "step": 11930 }, { "distill_loss": 0.40630069375038147, "epoch": 3.979319546364243, "step": 11930 }, { "epoch": 3.979319546364243, "ref_ce_loss": 0.1837412267923355, "step": 11930 }, { "epoch": 3.979319546364243, "loss": 0.8505741357803345, "step": 11930 }, { "ce_loss": 0.2108788788318634, "epoch": 3.979319546364243, "step": 11930 }, { "distill_loss": 0.38408035039901733, "epoch": 3.979319546364243, "step": 11930 }, { "epoch": 3.979319546364243, "ref_ce_loss": 0.18300923705101013, "step": 11930 }, { "epoch": 3.9826551034022684, "loss": 0.9159, "step": 11940 }, { "epoch": 3.9826551034022684, "grad_norm": 12.302298545837402, "step": 11940 }, { "epoch": 3.9826551034022684, "learning_rate": 0.0005476675847360734, "step": 11940 }, { "epoch": 3.9826551034022684, "loss": 1.0876966714859009, "step": 11940 }, { "ce_loss": 0.25745177268981934, "epoch": 3.9826551034022684, "step": 11940 }, { "distill_loss": 0.45677369832992554, "epoch": 3.9826551034022684, "step": 11940 }, { "epoch": 3.9826551034022684, "ref_ce_loss": 0.19750654697418213, "step": 11940 }, { "epoch": 3.9826551034022684, "loss": 0.9414987564086914, "step": 11940 }, { "ce_loss": 0.2690792381763458, "epoch": 3.9826551034022684, "step": 11940 }, { "distill_loss": 0.3595500588417053, "epoch": 3.9826551034022684, "step": 11940 }, { "epoch": 3.9826551034022684, "ref_ce_loss": 0.18679878115653992, "step": 11940 }, { "epoch": 3.9859906604402937, "loss": 0.9927, "step": 11950 }, { "epoch": 3.9859906604402937, "grad_norm": 1.953128695487976, "step": 11950 }, { "epoch": 3.9859906604402937, "learning_rate": 0.0005472658923475368, "step": 11950 }, { "epoch": 3.9859906604402937, "loss": 0.6685543060302734, "step": 11950 }, { "ce_loss": 0.11985540390014648, "epoch": 3.9859906604402937, "step": 11950 }, { "distill_loss": 0.36389437317848206, "epoch": 3.9859906604402937, "step": 11950 }, { "epoch": 3.9859906604402937, "ref_ce_loss": 0.1333955079317093, "step": 11950 }, { "epoch": 3.9859906604402937, "loss": 1.2167911529541016, "step": 11950 }, { "ce_loss": 0.3260270655155182, "epoch": 3.9859906604402937, "step": 11950 }, { "distill_loss": 0.4484832286834717, "epoch": 3.9859906604402937, "step": 11950 }, { "epoch": 3.9859906604402937, "ref_ce_loss": 0.16716627776622772, "step": 11950 }, { "epoch": 3.989326217478319, "loss": 1.0491, "step": 11960 }, { "epoch": 3.989326217478319, "grad_norm": 1.5239057540893555, "step": 11960 }, { "epoch": 3.989326217478319, "learning_rate": 0.0005468640280838575, "step": 11960 }, { "epoch": 3.989326217478319, "loss": 0.9055935740470886, "step": 11960 }, { "ce_loss": 0.25763407349586487, "epoch": 3.989326217478319, "step": 11960 }, { "distill_loss": 0.3997820317745209, "epoch": 3.989326217478319, "step": 11960 }, { "epoch": 3.989326217478319, "ref_ce_loss": 0.17403864860534668, "step": 11960 }, { "epoch": 3.989326217478319, "loss": 0.8026612997055054, "step": 11960 }, { "ce_loss": 0.22291156649589539, "epoch": 3.989326217478319, "step": 11960 }, { "distill_loss": 0.40883633494377136, "epoch": 3.989326217478319, "step": 11960 }, { "epoch": 3.989326217478319, "ref_ce_loss": 0.17074339091777802, "step": 11960 }, { "epoch": 3.9926617745163444, "loss": 0.921, "step": 11970 }, { "epoch": 3.9926617745163444, "grad_norm": 2.39037823677063, "step": 11970 }, { "epoch": 3.9926617745163444, "learning_rate": 0.0005464619924140541, "step": 11970 }, { "epoch": 3.9926617745163444, "loss": 0.5891600251197815, "step": 11970 }, { "ce_loss": 0.13960763812065125, "epoch": 3.9926617745163444, "step": 11970 }, { "distill_loss": 0.333700954914093, "epoch": 3.9926617745163444, "step": 11970 }, { "epoch": 3.9926617745163444, "ref_ce_loss": 0.11562500894069672, "step": 11970 }, { "epoch": 3.9926617745163444, "loss": 0.9206572771072388, "step": 11970 }, { "ce_loss": 0.24288669228553772, "epoch": 3.9926617745163444, "step": 11970 }, { "distill_loss": 0.4398137629032135, "epoch": 3.9926617745163444, "step": 11970 }, { "epoch": 3.9926617745163444, "ref_ce_loss": 0.17862555384635925, "step": 11970 }, { "epoch": 3.9959973315543698, "loss": 0.9467, "step": 11980 }, { "epoch": 3.9959973315543698, "grad_norm": 2.8624608516693115, "step": 11980 }, { "epoch": 3.9959973315543698, "learning_rate": 0.0005460597858073456, "step": 11980 }, { "epoch": 3.9959973315543698, "loss": 1.3518834114074707, "step": 11980 }, { "ce_loss": 0.2634531259536743, "epoch": 3.9959973315543698, "step": 11980 }, { "distill_loss": 0.48528337478637695, "epoch": 3.9959973315543698, "step": 11980 }, { "epoch": 3.9959973315543698, "ref_ce_loss": 0.2191203236579895, "step": 11980 }, { "epoch": 3.9959973315543698, "loss": 0.9772735834121704, "step": 11980 }, { "ce_loss": 0.24835364520549774, "epoch": 3.9959973315543698, "step": 11980 }, { "distill_loss": 0.4489499032497406, "epoch": 3.9959973315543698, "step": 11980 }, { "epoch": 3.9959973315543698, "ref_ce_loss": 0.2041686326265335, "step": 11980 }, { "epoch": 3.999332888592395, "loss": 0.9538, "step": 11990 }, { "epoch": 3.999332888592395, "grad_norm": 1.8687514066696167, "step": 11990 }, { "epoch": 3.999332888592395, "learning_rate": 0.0005456574087331504, "step": 11990 }, { "epoch": 3.999332888592395, "loss": 0.6367960572242737, "step": 11990 }, { "ce_loss": 0.16839922964572906, "epoch": 3.999332888592395, "step": 11990 }, { "distill_loss": 0.3280557692050934, "epoch": 3.999332888592395, "step": 11990 }, { "epoch": 3.999332888592395, "ref_ce_loss": 0.14013376832008362, "step": 11990 }, { "epoch": 3.999332888592395, "loss": 0.8866681456565857, "step": 11990 }, { "ce_loss": 0.27331751585006714, "epoch": 3.999332888592395, "step": 11990 }, { "distill_loss": 0.42752671241760254, "epoch": 3.999332888592395, "step": 11990 }, { "epoch": 3.999332888592395, "ref_ce_loss": 0.1855868250131607, "step": 11990 }, { "epoch": 4.0026684456304205, "loss": 0.8725, "step": 12000 }, { "epoch": 4.0026684456304205, "grad_norm": 1.8681031465530396, "step": 12000 }, { "epoch": 4.0026684456304205, "learning_rate": 0.0005452548616610858, "step": 12000 }, { "epoch": 4.0026684456304205, "loss": 0.7981876134872437, "step": 12000 }, { "ce_loss": 0.17341767251491547, "epoch": 4.0026684456304205, "step": 12000 }, { "distill_loss": 0.28598684072494507, "epoch": 4.0026684456304205, "step": 12000 }, { "epoch": 4.0026684456304205, "ref_ce_loss": 0.17627839744091034, "step": 12000 }, { "epoch": 4.0026684456304205, "loss": 1.0411994457244873, "step": 12000 }, { "ce_loss": 0.2961692214012146, "epoch": 4.0026684456304205, "step": 12000 }, { "distill_loss": 0.43006405234336853, "epoch": 4.0026684456304205, "step": 12000 }, { "epoch": 4.0026684456304205, "ref_ce_loss": 0.22421853244304657, "step": 12000 }, { "epoch": 4.006004002668446, "loss": 0.897, "step": 12010 }, { "epoch": 4.006004002668446, "grad_norm": 1.94181489944458, "step": 12010 }, { "epoch": 4.006004002668446, "learning_rate": 0.0005448521450609677, "step": 12010 }, { "epoch": 4.006004002668446, "loss": 1.054775595664978, "step": 12010 }, { "ce_loss": 0.2572634816169739, "epoch": 4.006004002668446, "step": 12010 }, { "distill_loss": 0.5030579566955566, "epoch": 4.006004002668446, "step": 12010 }, { "epoch": 4.006004002668446, "ref_ce_loss": 0.21427693963050842, "step": 12010 }, { "epoch": 4.006004002668446, "loss": 0.7503544092178345, "step": 12010 }, { "ce_loss": 0.2099463790655136, "epoch": 4.006004002668446, "step": 12010 }, { "distill_loss": 0.33009880781173706, "epoch": 4.006004002668446, "step": 12010 }, { "epoch": 4.006004002668446, "ref_ce_loss": 0.2099847048521042, "step": 12010 }, { "epoch": 4.009339559706471, "loss": 0.8854, "step": 12020 }, { "epoch": 4.009339559706471, "grad_norm": 2.251950979232788, "step": 12020 }, { "epoch": 4.009339559706471, "learning_rate": 0.0005444492594028093, "step": 12020 }, { "epoch": 4.009339559706471, "loss": 0.91937255859375, "step": 12020 }, { "ce_loss": 0.21336789429187775, "epoch": 4.009339559706471, "step": 12020 }, { "distill_loss": 0.41891738772392273, "epoch": 4.009339559706471, "step": 12020 }, { "epoch": 4.009339559706471, "ref_ce_loss": 0.1578998565673828, "step": 12020 }, { "epoch": 4.009339559706471, "loss": 0.9270004034042358, "step": 12020 }, { "ce_loss": 0.19843682646751404, "epoch": 4.009339559706471, "step": 12020 }, { "distill_loss": 0.44454413652420044, "epoch": 4.009339559706471, "step": 12020 }, { "epoch": 4.009339559706471, "ref_ce_loss": 0.18976789712905884, "step": 12020 }, { "epoch": 4.0126751167444965, "loss": 0.9284, "step": 12030 }, { "epoch": 4.0126751167444965, "grad_norm": 1.778459072113037, "step": 12030 }, { "epoch": 4.0126751167444965, "learning_rate": 0.000544046205156822, "step": 12030 }, { "epoch": 4.0126751167444965, "loss": 0.6953597068786621, "step": 12030 }, { "ce_loss": 0.16227765381336212, "epoch": 4.0126751167444965, "step": 12030 }, { "distill_loss": 0.30457478761672974, "epoch": 4.0126751167444965, "step": 12030 }, { "epoch": 4.0126751167444965, "ref_ce_loss": 0.1753004640340805, "step": 12030 }, { "epoch": 4.0126751167444965, "loss": 0.9509637951850891, "step": 12030 }, { "ce_loss": 0.23665675520896912, "epoch": 4.0126751167444965, "step": 12030 }, { "distill_loss": 0.43513789772987366, "epoch": 4.0126751167444965, "step": 12030 }, { "epoch": 4.0126751167444965, "ref_ce_loss": 0.1287996470928192, "step": 12030 }, { "epoch": 4.016010673782522, "loss": 0.8804, "step": 12040 }, { "epoch": 4.016010673782522, "grad_norm": 1.5397865772247314, "step": 12040 }, { "epoch": 4.016010673782522, "learning_rate": 0.0005436429827934133, "step": 12040 }, { "epoch": 4.016010673782522, "loss": 0.9101223349571228, "step": 12040 }, { "ce_loss": 0.24761877954006195, "epoch": 4.016010673782522, "step": 12040 }, { "distill_loss": 0.33264732360839844, "epoch": 4.016010673782522, "step": 12040 }, { "epoch": 4.016010673782522, "ref_ce_loss": 0.19348663091659546, "step": 12040 }, { "epoch": 4.016010673782522, "loss": 0.8087942600250244, "step": 12040 }, { "ce_loss": 0.18762274086475372, "epoch": 4.016010673782522, "step": 12040 }, { "distill_loss": 0.39426758885383606, "epoch": 4.016010673782522, "step": 12040 }, { "epoch": 4.016010673782522, "ref_ce_loss": 0.16436782479286194, "step": 12040 }, { "epoch": 4.019346230820547, "loss": 0.8119, "step": 12050 }, { "epoch": 4.019346230820547, "grad_norm": 2.189448118209839, "step": 12050 }, { "epoch": 4.019346230820547, "learning_rate": 0.000543239592783187, "step": 12050 }, { "epoch": 4.019346230820547, "loss": 0.6664413809776306, "step": 12050 }, { "ce_loss": 0.17020738124847412, "epoch": 4.019346230820547, "step": 12050 }, { "distill_loss": 0.3622167706489563, "epoch": 4.019346230820547, "step": 12050 }, { "epoch": 4.019346230820547, "ref_ce_loss": 0.13371214270591736, "step": 12050 }, { "epoch": 4.019346230820547, "loss": 0.9993815422058105, "step": 12050 }, { "ce_loss": 0.2715194821357727, "epoch": 4.019346230820547, "step": 12050 }, { "distill_loss": 0.4615587294101715, "epoch": 4.019346230820547, "step": 12050 }, { "epoch": 4.019346230820547, "ref_ce_loss": 0.21049873530864716, "step": 12050 }, { "epoch": 4.0226817878585726, "loss": 0.8495, "step": 12060 }, { "epoch": 4.0226817878585726, "grad_norm": 1.3513885736465454, "step": 12060 }, { "epoch": 4.0226817878585726, "learning_rate": 0.0005428360355969426, "step": 12060 }, { "epoch": 4.0226817878585726, "loss": 0.9639096856117249, "step": 12060 }, { "ce_loss": 0.19991366565227509, "epoch": 4.0226817878585726, "step": 12060 }, { "distill_loss": 0.45069777965545654, "epoch": 4.0226817878585726, "step": 12060 }, { "epoch": 4.0226817878585726, "ref_ce_loss": 0.1532137244939804, "step": 12060 }, { "epoch": 4.0226817878585726, "loss": 0.7424845695495605, "step": 12060 }, { "ce_loss": 0.1836809664964676, "epoch": 4.0226817878585726, "step": 12060 }, { "distill_loss": 0.30216172337532043, "epoch": 4.0226817878585726, "step": 12060 }, { "epoch": 4.0226817878585726, "ref_ce_loss": 0.18060757219791412, "step": 12060 }, { "epoch": 4.026017344896598, "loss": 0.8583, "step": 12070 }, { "epoch": 4.026017344896598, "grad_norm": 1.7382936477661133, "step": 12070 }, { "epoch": 4.026017344896598, "learning_rate": 0.0005424323117056751, "step": 12070 }, { "epoch": 4.026017344896598, "loss": 0.8760979771614075, "step": 12070 }, { "ce_loss": 0.2157914787530899, "epoch": 4.026017344896598, "step": 12070 }, { "distill_loss": 0.35931938886642456, "epoch": 4.026017344896598, "step": 12070 }, { "epoch": 4.026017344896598, "ref_ce_loss": 0.14864644408226013, "step": 12070 }, { "epoch": 4.026017344896598, "loss": 0.6753813028335571, "step": 12070 }, { "ce_loss": 0.14919540286064148, "epoch": 4.026017344896598, "step": 12070 }, { "distill_loss": 0.35045838356018066, "epoch": 4.026017344896598, "step": 12070 }, { "epoch": 4.026017344896598, "ref_ce_loss": 0.17523548007011414, "step": 12070 }, { "epoch": 4.029352901934623, "loss": 0.8266, "step": 12080 }, { "epoch": 4.029352901934623, "grad_norm": 1.6985952854156494, "step": 12080 }, { "epoch": 4.029352901934623, "learning_rate": 0.0005420284215805732, "step": 12080 }, { "epoch": 4.029352901934623, "loss": 1.2062604427337646, "step": 12080 }, { "ce_loss": 0.23293708264827728, "epoch": 4.029352901934623, "step": 12080 }, { "distill_loss": 0.46571627259254456, "epoch": 4.029352901934623, "step": 12080 }, { "epoch": 4.029352901934623, "ref_ce_loss": 0.2237669676542282, "step": 12080 }, { "epoch": 4.029352901934623, "loss": 1.3210084438323975, "step": 12080 }, { "ce_loss": 0.19478954374790192, "epoch": 4.029352901934623, "step": 12080 }, { "distill_loss": 0.348651260137558, "epoch": 4.029352901934623, "step": 12080 }, { "epoch": 4.029352901934623, "ref_ce_loss": 0.16467390954494476, "step": 12080 }, { "epoch": 4.032688458972649, "loss": 0.8626, "step": 12090 }, { "epoch": 4.032688458972649, "grad_norm": 2.3197574615478516, "step": 12090 }, { "epoch": 4.032688458972649, "learning_rate": 0.0005416243656930207, "step": 12090 }, { "epoch": 4.032688458972649, "loss": 0.8391457200050354, "step": 12090 }, { "ce_loss": 0.22044244408607483, "epoch": 4.032688458972649, "step": 12090 }, { "distill_loss": 0.3444957733154297, "epoch": 4.032688458972649, "step": 12090 }, { "epoch": 4.032688458972649, "ref_ce_loss": 0.15553082525730133, "step": 12090 }, { "epoch": 4.032688458972649, "loss": 0.779373824596405, "step": 12090 }, { "ce_loss": 0.20906268060207367, "epoch": 4.032688458972649, "step": 12090 }, { "distill_loss": 0.38116130232810974, "epoch": 4.032688458972649, "step": 12090 }, { "epoch": 4.032688458972649, "ref_ce_loss": 0.1889278143644333, "step": 12090 }, { "epoch": 4.036024016010674, "loss": 0.8207, "step": 12100 }, { "epoch": 4.036024016010674, "grad_norm": 1.705793857574463, "step": 12100 }, { "epoch": 4.036024016010674, "learning_rate": 0.0005412201445145939, "step": 12100 }, { "epoch": 4.036024016010674, "loss": 0.8325830101966858, "step": 12100 }, { "ce_loss": 0.26227590441703796, "epoch": 4.036024016010674, "step": 12100 }, { "distill_loss": 0.3504793047904968, "epoch": 4.036024016010674, "step": 12100 }, { "epoch": 4.036024016010674, "ref_ce_loss": 0.21957749128341675, "step": 12100 }, { "epoch": 4.036024016010674, "loss": 0.6191930770874023, "step": 12100 }, { "ce_loss": 0.1428786665201187, "epoch": 4.036024016010674, "step": 12100 }, { "distill_loss": 0.2747659683227539, "epoch": 4.036024016010674, "step": 12100 }, { "epoch": 4.036024016010674, "ref_ce_loss": 0.15719301998615265, "step": 12100 }, { "epoch": 4.039359573048699, "loss": 0.8851, "step": 12110 }, { "epoch": 4.039359573048699, "grad_norm": 1.9953546524047852, "step": 12110 }, { "epoch": 4.039359573048699, "learning_rate": 0.0005408157585170625, "step": 12110 }, { "epoch": 4.039359573048699, "loss": 0.7786689400672913, "step": 12110 }, { "ce_loss": 0.18109384179115295, "epoch": 4.039359573048699, "step": 12110 }, { "distill_loss": 0.3678598999977112, "epoch": 4.039359573048699, "step": 12110 }, { "epoch": 4.039359573048699, "ref_ce_loss": 0.1337146908044815, "step": 12110 }, { "epoch": 4.039359573048699, "loss": 0.8011458516120911, "step": 12110 }, { "ce_loss": 0.18645349144935608, "epoch": 4.039359573048699, "step": 12110 }, { "distill_loss": 0.39759284257888794, "epoch": 4.039359573048699, "step": 12110 }, { "epoch": 4.039359573048699, "ref_ce_loss": 0.16702933609485626, "step": 12110 }, { "epoch": 4.042695130086725, "loss": 0.922, "step": 12120 }, { "epoch": 4.042695130086725, "grad_norm": 2.8055052757263184, "step": 12120 }, { "epoch": 4.042695130086725, "learning_rate": 0.0005404112081723885, "step": 12120 }, { "epoch": 4.042695130086725, "loss": 0.7414431571960449, "step": 12120 }, { "ce_loss": 0.17341484129428864, "epoch": 4.042695130086725, "step": 12120 }, { "distill_loss": 0.3635369539260864, "epoch": 4.042695130086725, "step": 12120 }, { "epoch": 4.042695130086725, "ref_ce_loss": 0.15303896367549896, "step": 12120 }, { "epoch": 4.042695130086725, "loss": 0.7546785473823547, "step": 12120 }, { "ce_loss": 0.17873521149158478, "epoch": 4.042695130086725, "step": 12120 }, { "distill_loss": 0.34869545698165894, "epoch": 4.042695130086725, "step": 12120 }, { "epoch": 4.042695130086725, "ref_ce_loss": 0.15745292603969574, "step": 12120 }, { "epoch": 4.04603068712475, "loss": 0.8738, "step": 12130 }, { "epoch": 4.04603068712475, "grad_norm": 1.4640624523162842, "step": 12130 }, { "epoch": 4.04603068712475, "learning_rate": 0.0005400064939527257, "step": 12130 }, { "epoch": 4.04603068712475, "loss": 0.9559147953987122, "step": 12130 }, { "ce_loss": 0.20529085397720337, "epoch": 4.04603068712475, "step": 12130 }, { "distill_loss": 0.38767898082733154, "epoch": 4.04603068712475, "step": 12130 }, { "epoch": 4.04603068712475, "ref_ce_loss": 0.1801074594259262, "step": 12130 }, { "epoch": 4.04603068712475, "loss": 0.7419565916061401, "step": 12130 }, { "ce_loss": 0.2174055576324463, "epoch": 4.04603068712475, "step": 12130 }, { "distill_loss": 0.35398024320602417, "epoch": 4.04603068712475, "step": 12130 }, { "epoch": 4.04603068712475, "ref_ce_loss": 0.1339172124862671, "step": 12130 }, { "epoch": 4.049366244162775, "loss": 0.9346, "step": 12140 }, { "epoch": 4.049366244162775, "grad_norm": 1.9079539775848389, "step": 12140 }, { "epoch": 4.049366244162775, "learning_rate": 0.0005396016163304192, "step": 12140 }, { "epoch": 4.049366244162775, "loss": 0.9857970476150513, "step": 12140 }, { "ce_loss": 0.16553930938243866, "epoch": 4.049366244162775, "step": 12140 }, { "distill_loss": 0.3548365533351898, "epoch": 4.049366244162775, "step": 12140 }, { "epoch": 4.049366244162775, "ref_ce_loss": 0.1572820097208023, "step": 12140 }, { "epoch": 4.049366244162775, "loss": 0.8096906542778015, "step": 12140 }, { "ce_loss": 0.17616169154644012, "epoch": 4.049366244162775, "step": 12140 }, { "distill_loss": 0.3309439420700073, "epoch": 4.049366244162775, "step": 12140 }, { "epoch": 4.049366244162775, "ref_ce_loss": 0.17873306572437286, "step": 12140 }, { "epoch": 4.052701801200801, "loss": 0.8059, "step": 12150 }, { "epoch": 4.052701801200801, "grad_norm": 1.834365725517273, "step": 12150 }, { "epoch": 4.052701801200801, "learning_rate": 0.0005391965757780047, "step": 12150 }, { "epoch": 4.052701801200801, "loss": 0.8741999268531799, "step": 12150 }, { "ce_loss": 0.2503318786621094, "epoch": 4.052701801200801, "step": 12150 }, { "distill_loss": 0.3878973722457886, "epoch": 4.052701801200801, "step": 12150 }, { "epoch": 4.052701801200801, "ref_ce_loss": 0.19882416725158691, "step": 12150 }, { "epoch": 4.052701801200801, "loss": 0.8098387122154236, "step": 12150 }, { "ce_loss": 0.186272531747818, "epoch": 4.052701801200801, "step": 12150 }, { "distill_loss": 0.3593500256538391, "epoch": 4.052701801200801, "step": 12150 }, { "epoch": 4.052701801200801, "ref_ce_loss": 0.20150873064994812, "step": 12150 }, { "epoch": 4.056037358238826, "loss": 0.9423, "step": 12160 }, { "epoch": 4.056037358238826, "grad_norm": 1.893646001815796, "step": 12160 }, { "epoch": 4.056037358238826, "learning_rate": 0.0005387913727682081, "step": 12160 }, { "epoch": 4.056037358238826, "loss": 0.737841784954071, "step": 12160 }, { "ce_loss": 0.18643051385879517, "epoch": 4.056037358238826, "step": 12160 }, { "distill_loss": 0.3098827004432678, "epoch": 4.056037358238826, "step": 12160 }, { "epoch": 4.056037358238826, "ref_ce_loss": 0.14226196706295013, "step": 12160 }, { "epoch": 4.056037358238826, "loss": 0.9402210712432861, "step": 12160 }, { "ce_loss": 0.21556143462657928, "epoch": 4.056037358238826, "step": 12160 }, { "distill_loss": 0.3903096318244934, "epoch": 4.056037358238826, "step": 12160 }, { "epoch": 4.056037358238826, "ref_ce_loss": 0.17290903627872467, "step": 12160 }, { "epoch": 4.059372915276851, "loss": 0.8178, "step": 12170 }, { "epoch": 4.059372915276851, "grad_norm": 3.435178279876709, "step": 12170 }, { "epoch": 4.059372915276851, "learning_rate": 0.0005383860077739448, "step": 12170 }, { "epoch": 4.059372915276851, "loss": 0.7845951914787292, "step": 12170 }, { "ce_loss": 0.22703777253627777, "epoch": 4.059372915276851, "step": 12170 }, { "distill_loss": 0.40096449851989746, "epoch": 4.059372915276851, "step": 12170 }, { "epoch": 4.059372915276851, "ref_ce_loss": 0.15633171796798706, "step": 12170 }, { "epoch": 4.059372915276851, "loss": 0.9148541688919067, "step": 12170 }, { "ce_loss": 0.30216914415359497, "epoch": 4.059372915276851, "step": 12170 }, { "distill_loss": 0.3802482485771179, "epoch": 4.059372915276851, "step": 12170 }, { "epoch": 4.059372915276851, "ref_ce_loss": 0.17897044122219086, "step": 12170 }, { "epoch": 4.062708472314877, "loss": 0.892, "step": 12180 }, { "epoch": 4.062708472314877, "grad_norm": 1.3513412475585938, "step": 12180 }, { "epoch": 4.062708472314877, "learning_rate": 0.0005379804812683194, "step": 12180 }, { "epoch": 4.062708472314877, "loss": 1.0374103784561157, "step": 12180 }, { "ce_loss": 0.2488425225019455, "epoch": 4.062708472314877, "step": 12180 }, { "distill_loss": 0.3943198621273041, "epoch": 4.062708472314877, "step": 12180 }, { "epoch": 4.062708472314877, "ref_ce_loss": 0.14718931913375854, "step": 12180 }, { "epoch": 4.062708472314877, "loss": 0.6714553833007812, "step": 12180 }, { "ce_loss": 0.19426223635673523, "epoch": 4.062708472314877, "step": 12180 }, { "distill_loss": 0.3287561237812042, "epoch": 4.062708472314877, "step": 12180 }, { "epoch": 4.062708472314877, "ref_ce_loss": 0.14829982817173004, "step": 12180 }, { "epoch": 4.066044029352902, "loss": 0.8747, "step": 12190 }, { "epoch": 4.066044029352902, "grad_norm": 2.586073637008667, "step": 12190 }, { "epoch": 4.066044029352902, "learning_rate": 0.0005375747937246253, "step": 12190 }, { "epoch": 4.066044029352902, "loss": 0.800316333770752, "step": 12190 }, { "ce_loss": 0.2151237428188324, "epoch": 4.066044029352902, "step": 12190 }, { "distill_loss": 0.34133580327033997, "epoch": 4.066044029352902, "step": 12190 }, { "epoch": 4.066044029352902, "ref_ce_loss": 0.1352153867483139, "step": 12190 }, { "epoch": 4.066044029352902, "loss": 0.6638113856315613, "step": 12190 }, { "ce_loss": 0.1923898607492447, "epoch": 4.066044029352902, "step": 12190 }, { "distill_loss": 0.2985258102416992, "epoch": 4.066044029352902, "step": 12190 }, { "epoch": 4.066044029352902, "ref_ce_loss": 0.12334320694208145, "step": 12190 }, { "epoch": 4.0693795863909275, "loss": 0.8444, "step": 12200 }, { "epoch": 4.0693795863909275, "grad_norm": 2.256270408630371, "step": 12200 }, { "epoch": 4.0693795863909275, "learning_rate": 0.0005371689456163431, "step": 12200 }, { "epoch": 4.0693795863909275, "loss": 0.7758722305297852, "step": 12200 }, { "ce_loss": 0.20380429923534393, "epoch": 4.0693795863909275, "step": 12200 }, { "distill_loss": 0.38605189323425293, "epoch": 4.0693795863909275, "step": 12200 }, { "epoch": 4.0693795863909275, "ref_ce_loss": 0.15181048214435577, "step": 12200 }, { "epoch": 4.0693795863909275, "loss": 1.1506251096725464, "step": 12200 }, { "ce_loss": 0.27955031394958496, "epoch": 4.0693795863909275, "step": 12200 }, { "distill_loss": 0.46950602531433105, "epoch": 4.0693795863909275, "step": 12200 }, { "epoch": 4.0693795863909275, "ref_ce_loss": 0.20281149446964264, "step": 12200 }, { "epoch": 4.072715143428953, "loss": 0.8862, "step": 12210 }, { "epoch": 4.072715143428953, "grad_norm": 2.828490734100342, "step": 12210 }, { "epoch": 4.072715143428953, "learning_rate": 0.0005367629374171415, "step": 12210 }, { "epoch": 4.072715143428953, "loss": 0.8178892135620117, "step": 12210 }, { "ce_loss": 0.18405741453170776, "epoch": 4.072715143428953, "step": 12210 }, { "distill_loss": 0.37577715516090393, "epoch": 4.072715143428953, "step": 12210 }, { "epoch": 4.072715143428953, "ref_ce_loss": 0.13698050379753113, "step": 12210 }, { "epoch": 4.072715143428953, "loss": 0.7907006740570068, "step": 12210 }, { "ce_loss": 0.2112656682729721, "epoch": 4.072715143428953, "step": 12210 }, { "distill_loss": 0.3416105806827545, "epoch": 4.072715143428953, "step": 12210 }, { "epoch": 4.072715143428953, "ref_ce_loss": 0.18553809821605682, "step": 12210 }, { "epoch": 4.076050700466978, "loss": 0.8332, "step": 12220 }, { "epoch": 4.076050700466978, "grad_norm": 1.8846862316131592, "step": 12220 }, { "epoch": 4.076050700466978, "learning_rate": 0.0005363567696008755, "step": 12220 }, { "epoch": 4.076050700466978, "loss": 0.8879470825195312, "step": 12220 }, { "ce_loss": 0.20231954753398895, "epoch": 4.076050700466978, "step": 12220 }, { "distill_loss": 0.34891757369041443, "epoch": 4.076050700466978, "step": 12220 }, { "epoch": 4.076050700466978, "ref_ce_loss": 0.1723923534154892, "step": 12220 }, { "epoch": 4.076050700466978, "loss": 0.8345476984977722, "step": 12220 }, { "ce_loss": 0.22824759781360626, "epoch": 4.076050700466978, "step": 12220 }, { "distill_loss": 0.39881062507629395, "epoch": 4.076050700466978, "step": 12220 }, { "epoch": 4.076050700466978, "ref_ce_loss": 0.2072669118642807, "step": 12220 }, { "epoch": 4.0793862575050035, "loss": 0.9121, "step": 12230 }, { "epoch": 4.0793862575050035, "grad_norm": 1.5614279508590698, "step": 12230 }, { "epoch": 4.0793862575050035, "learning_rate": 0.0005359504426415869, "step": 12230 }, { "epoch": 4.0793862575050035, "loss": 0.9222154021263123, "step": 12230 }, { "ce_loss": 0.21302418410778046, "epoch": 4.0793862575050035, "step": 12230 }, { "distill_loss": 0.3472989499568939, "epoch": 4.0793862575050035, "step": 12230 }, { "epoch": 4.0793862575050035, "ref_ce_loss": 0.19184233248233795, "step": 12230 }, { "epoch": 4.0793862575050035, "loss": 0.854812741279602, "step": 12230 }, { "ce_loss": 0.20118644833564758, "epoch": 4.0793862575050035, "step": 12230 }, { "distill_loss": 0.375763475894928, "epoch": 4.0793862575050035, "step": 12230 }, { "epoch": 4.0793862575050035, "ref_ce_loss": 0.17603860795497894, "step": 12230 }, { "epoch": 4.082721814543029, "loss": 0.8894, "step": 12240 }, { "epoch": 4.082721814543029, "grad_norm": 2.552529811859131, "step": 12240 }, { "epoch": 4.082721814543029, "learning_rate": 0.0005355439570135028, "step": 12240 }, { "epoch": 4.082721814543029, "loss": 0.8024995923042297, "step": 12240 }, { "ce_loss": 0.1890585571527481, "epoch": 4.082721814543029, "step": 12240 }, { "distill_loss": 0.43341967463493347, "epoch": 4.082721814543029, "step": 12240 }, { "epoch": 4.082721814543029, "ref_ce_loss": 0.1784624308347702, "step": 12240 }, { "epoch": 4.082721814543029, "loss": 1.153965950012207, "step": 12240 }, { "ce_loss": 0.15036635100841522, "epoch": 4.082721814543029, "step": 12240 }, { "distill_loss": 0.29220816493034363, "epoch": 4.082721814543029, "step": 12240 }, { "epoch": 4.082721814543029, "ref_ce_loss": 0.12937654554843903, "step": 12240 }, { "epoch": 4.086057371581054, "loss": 0.9087, "step": 12250 }, { "epoch": 4.086057371581054, "grad_norm": 1.2219160795211792, "step": 12250 }, { "epoch": 4.086057371581054, "learning_rate": 0.0005351373131910357, "step": 12250 }, { "epoch": 4.086057371581054, "loss": 0.8469090461730957, "step": 12250 }, { "ce_loss": 0.21229684352874756, "epoch": 4.086057371581054, "step": 12250 }, { "distill_loss": 0.4633750915527344, "epoch": 4.086057371581054, "step": 12250 }, { "epoch": 4.086057371581054, "ref_ce_loss": 0.17109663784503937, "step": 12250 }, { "epoch": 4.086057371581054, "loss": 0.8332298398017883, "step": 12250 }, { "ce_loss": 0.231580451130867, "epoch": 4.086057371581054, "step": 12250 }, { "distill_loss": 0.4204753637313843, "epoch": 4.086057371581054, "step": 12250 }, { "epoch": 4.086057371581054, "ref_ce_loss": 0.18100927770137787, "step": 12250 }, { "epoch": 4.0893929286190795, "loss": 0.8893, "step": 12260 }, { "epoch": 4.0893929286190795, "grad_norm": 3.462394952774048, "step": 12260 }, { "epoch": 4.0893929286190795, "learning_rate": 0.0005347305116487827, "step": 12260 }, { "epoch": 4.0893929286190795, "loss": 1.1916685104370117, "step": 12260 }, { "ce_loss": 0.2497703731060028, "epoch": 4.0893929286190795, "step": 12260 }, { "distill_loss": 0.43108367919921875, "epoch": 4.0893929286190795, "step": 12260 }, { "epoch": 4.0893929286190795, "ref_ce_loss": 0.20356877148151398, "step": 12260 }, { "epoch": 4.0893929286190795, "loss": 0.8169243931770325, "step": 12260 }, { "ce_loss": 0.24040541052818298, "epoch": 4.0893929286190795, "step": 12260 }, { "distill_loss": 0.36889344453811646, "epoch": 4.0893929286190795, "step": 12260 }, { "epoch": 4.0893929286190795, "ref_ce_loss": 0.15058249235153198, "step": 12260 }, { "epoch": 4.092728485657105, "loss": 0.8878, "step": 12270 }, { "epoch": 4.092728485657105, "grad_norm": 1.7535141706466675, "step": 12270 }, { "epoch": 4.092728485657105, "learning_rate": 0.0005343235528615252, "step": 12270 }, { "epoch": 4.092728485657105, "loss": 0.782029926776886, "step": 12270 }, { "ce_loss": 0.12700021266937256, "epoch": 4.092728485657105, "step": 12270 }, { "distill_loss": 0.2926868200302124, "epoch": 4.092728485657105, "step": 12270 }, { "epoch": 4.092728485657105, "ref_ce_loss": 0.13709864020347595, "step": 12270 }, { "epoch": 4.092728485657105, "loss": 0.62381911277771, "step": 12270 }, { "ce_loss": 0.15468086302280426, "epoch": 4.092728485657105, "step": 12270 }, { "distill_loss": 0.32623642683029175, "epoch": 4.092728485657105, "step": 12270 }, { "epoch": 4.092728485657105, "ref_ce_loss": 0.14236095547676086, "step": 12270 }, { "epoch": 4.09606404269513, "loss": 0.9293, "step": 12280 }, { "epoch": 4.09606404269513, "grad_norm": 3.5007686614990234, "step": 12280 }, { "epoch": 4.09606404269513, "learning_rate": 0.0005339164373042275, "step": 12280 }, { "epoch": 4.09606404269513, "loss": 0.8049991130828857, "step": 12280 }, { "ce_loss": 0.19103293120861053, "epoch": 4.09606404269513, "step": 12280 }, { "distill_loss": 0.4100547432899475, "epoch": 4.09606404269513, "step": 12280 }, { "epoch": 4.09606404269513, "ref_ce_loss": 0.15682145953178406, "step": 12280 }, { "epoch": 4.09606404269513, "loss": 0.7409756183624268, "step": 12280 }, { "ce_loss": 0.188636913895607, "epoch": 4.09606404269513, "step": 12280 }, { "distill_loss": 0.40608739852905273, "epoch": 4.09606404269513, "step": 12280 }, { "epoch": 4.09606404269513, "ref_ce_loss": 0.1458330750465393, "step": 12280 }, { "epoch": 4.099399599733156, "loss": 1.0107, "step": 12290 }, { "epoch": 4.099399599733156, "grad_norm": 1.8453807830810547, "step": 12290 }, { "epoch": 4.099399599733156, "learning_rate": 0.0005335091654520374, "step": 12290 }, { "epoch": 4.099399599733156, "loss": 0.9331576824188232, "step": 12290 }, { "ce_loss": 0.2545464038848877, "epoch": 4.099399599733156, "step": 12290 }, { "distill_loss": 0.45423731207847595, "epoch": 4.099399599733156, "step": 12290 }, { "epoch": 4.099399599733156, "ref_ce_loss": 0.16400742530822754, "step": 12290 }, { "epoch": 4.099399599733156, "loss": 0.8729333877563477, "step": 12290 }, { "ce_loss": 0.21526475250720978, "epoch": 4.099399599733156, "step": 12290 }, { "distill_loss": 0.4151415526866913, "epoch": 4.099399599733156, "step": 12290 }, { "epoch": 4.099399599733156, "ref_ce_loss": 0.16812297701835632, "step": 12290 }, { "epoch": 4.102735156771181, "loss": 0.9338, "step": 12300 }, { "epoch": 4.102735156771181, "grad_norm": 1.6489121913909912, "step": 12300 }, { "epoch": 4.102735156771181, "learning_rate": 0.0005331017377802853, "step": 12300 }, { "epoch": 4.102735156771181, "loss": 0.7752780318260193, "step": 12300 }, { "ce_loss": 0.1960146278142929, "epoch": 4.102735156771181, "step": 12300 }, { "distill_loss": 0.3631875514984131, "epoch": 4.102735156771181, "step": 12300 }, { "epoch": 4.102735156771181, "ref_ce_loss": 0.1767382174730301, "step": 12300 }, { "epoch": 4.102735156771181, "loss": 0.5530449151992798, "step": 12300 }, { "ce_loss": 0.17343278229236603, "epoch": 4.102735156771181, "step": 12300 }, { "distill_loss": 0.28233903646469116, "epoch": 4.102735156771181, "step": 12300 }, { "epoch": 4.102735156771181, "ref_ce_loss": 0.0964079201221466, "step": 12300 }, { "epoch": 4.106070713809206, "loss": 0.8486, "step": 12310 }, { "epoch": 4.106070713809206, "grad_norm": 2.788712978363037, "step": 12310 }, { "epoch": 4.106070713809206, "learning_rate": 0.0005326941547644827, "step": 12310 }, { "epoch": 4.106070713809206, "loss": 0.6198724508285522, "step": 12310 }, { "ce_loss": 0.15598838031291962, "epoch": 4.106070713809206, "step": 12310 }, { "distill_loss": 0.3028586208820343, "epoch": 4.106070713809206, "step": 12310 }, { "epoch": 4.106070713809206, "ref_ce_loss": 0.12344849854707718, "step": 12310 }, { "epoch": 4.106070713809206, "loss": 1.2339098453521729, "step": 12310 }, { "ce_loss": 0.2487998604774475, "epoch": 4.106070713809206, "step": 12310 }, { "distill_loss": 0.42270636558532715, "epoch": 4.106070713809206, "step": 12310 }, { "epoch": 4.106070713809206, "ref_ce_loss": 0.17186760902404785, "step": 12310 }, { "epoch": 4.109406270847232, "loss": 0.8935, "step": 12320 }, { "epoch": 4.109406270847232, "grad_norm": 2.3489551544189453, "step": 12320 }, { "epoch": 4.109406270847232, "learning_rate": 0.0005322864168803231, "step": 12320 }, { "epoch": 4.109406270847232, "loss": 1.010440468788147, "step": 12320 }, { "ce_loss": 0.24523364007472992, "epoch": 4.109406270847232, "step": 12320 }, { "distill_loss": 0.4519507586956024, "epoch": 4.109406270847232, "step": 12320 }, { "epoch": 4.109406270847232, "ref_ce_loss": 0.17541134357452393, "step": 12320 }, { "epoch": 4.109406270847232, "loss": 0.7359771728515625, "step": 12320 }, { "ce_loss": 0.17784751951694489, "epoch": 4.109406270847232, "step": 12320 }, { "distill_loss": 0.3079465627670288, "epoch": 4.109406270847232, "step": 12320 }, { "epoch": 4.109406270847232, "ref_ce_loss": 0.18632066249847412, "step": 12320 }, { "epoch": 4.112741827885257, "loss": 0.8383, "step": 12330 }, { "epoch": 4.112741827885257, "grad_norm": 2.5580828189849854, "step": 12330 }, { "epoch": 4.112741827885257, "learning_rate": 0.0005318785246036802, "step": 12330 }, { "epoch": 4.112741827885257, "loss": 0.8238269090652466, "step": 12330 }, { "ce_loss": 0.2343672662973404, "epoch": 4.112741827885257, "step": 12330 }, { "distill_loss": 0.3459825813770294, "epoch": 4.112741827885257, "step": 12330 }, { "epoch": 4.112741827885257, "ref_ce_loss": 0.19455063343048096, "step": 12330 }, { "epoch": 4.112741827885257, "loss": 0.738259494304657, "step": 12330 }, { "ce_loss": 0.20209765434265137, "epoch": 4.112741827885257, "step": 12330 }, { "distill_loss": 0.40758582949638367, "epoch": 4.112741827885257, "step": 12330 }, { "epoch": 4.112741827885257, "ref_ce_loss": 0.1284421682357788, "step": 12330 }, { "epoch": 4.116077384923282, "loss": 0.9199, "step": 12340 }, { "epoch": 4.116077384923282, "grad_norm": 2.166008710861206, "step": 12340 }, { "epoch": 4.116077384923282, "learning_rate": 0.0005314704784106086, "step": 12340 }, { "epoch": 4.116077384923282, "loss": 1.1044200658798218, "step": 12340 }, { "ce_loss": 0.3675804138183594, "epoch": 4.116077384923282, "step": 12340 }, { "distill_loss": 0.4157879948616028, "epoch": 4.116077384923282, "step": 12340 }, { "epoch": 4.116077384923282, "ref_ce_loss": 0.20491206645965576, "step": 12340 }, { "epoch": 4.116077384923282, "loss": 0.7513501644134521, "step": 12340 }, { "ce_loss": 0.17775169014930725, "epoch": 4.116077384923282, "step": 12340 }, { "distill_loss": 0.3683262765407562, "epoch": 4.116077384923282, "step": 12340 }, { "epoch": 4.116077384923282, "ref_ce_loss": 0.14589950442314148, "step": 12340 }, { "epoch": 4.119412941961308, "loss": 0.9352, "step": 12350 }, { "epoch": 4.119412941961308, "grad_norm": 2.094301223754883, "step": 12350 }, { "epoch": 4.119412941961308, "learning_rate": 0.0005310622787773417, "step": 12350 }, { "epoch": 4.119412941961308, "loss": 0.8135566115379333, "step": 12350 }, { "ce_loss": 0.23291337490081787, "epoch": 4.119412941961308, "step": 12350 }, { "distill_loss": 0.3937142789363861, "epoch": 4.119412941961308, "step": 12350 }, { "epoch": 4.119412941961308, "ref_ce_loss": 0.13678307831287384, "step": 12350 }, { "epoch": 4.119412941961308, "loss": 0.6764379143714905, "step": 12350 }, { "ce_loss": 0.20164714753627777, "epoch": 4.119412941961308, "step": 12350 }, { "distill_loss": 0.3326803743839264, "epoch": 4.119412941961308, "step": 12350 }, { "epoch": 4.119412941961308, "ref_ce_loss": 0.14201340079307556, "step": 12350 }, { "epoch": 4.122748498999333, "loss": 0.8407, "step": 12360 }, { "epoch": 4.122748498999333, "grad_norm": 1.809917688369751, "step": 12360 }, { "epoch": 4.122748498999333, "learning_rate": 0.0005306539261802928, "step": 12360 }, { "epoch": 4.122748498999333, "loss": 0.784867525100708, "step": 12360 }, { "ce_loss": 0.17784570157527924, "epoch": 4.122748498999333, "step": 12360 }, { "distill_loss": 0.37807798385620117, "epoch": 4.122748498999333, "step": 12360 }, { "epoch": 4.122748498999333, "ref_ce_loss": 0.17634142935276031, "step": 12360 }, { "epoch": 4.122748498999333, "loss": 0.7875006794929504, "step": 12360 }, { "ce_loss": 0.20042015612125397, "epoch": 4.122748498999333, "step": 12360 }, { "distill_loss": 0.3755456507205963, "epoch": 4.122748498999333, "step": 12360 }, { "epoch": 4.122748498999333, "ref_ce_loss": 0.1658916175365448, "step": 12360 }, { "epoch": 4.126084056037358, "loss": 0.8896, "step": 12370 }, { "epoch": 4.126084056037358, "grad_norm": 2.7777342796325684, "step": 12370 }, { "epoch": 4.126084056037358, "learning_rate": 0.0005302454210960529, "step": 12370 }, { "epoch": 4.126084056037358, "loss": 0.6223516464233398, "step": 12370 }, { "ce_loss": 0.10979112982749939, "epoch": 4.126084056037358, "step": 12370 }, { "distill_loss": 0.34536290168762207, "epoch": 4.126084056037358, "step": 12370 }, { "epoch": 4.126084056037358, "ref_ce_loss": 0.1221790686249733, "step": 12370 }, { "epoch": 4.126084056037358, "loss": 0.908021867275238, "step": 12370 }, { "ce_loss": 0.20778703689575195, "epoch": 4.126084056037358, "step": 12370 }, { "distill_loss": 0.44324469566345215, "epoch": 4.126084056037358, "step": 12370 }, { "epoch": 4.126084056037358, "ref_ce_loss": 0.19053210318088531, "step": 12370 }, { "epoch": 4.129419613075384, "loss": 0.8902, "step": 12380 }, { "epoch": 4.129419613075384, "grad_norm": 1.8988518714904785, "step": 12380 }, { "epoch": 4.129419613075384, "learning_rate": 0.0005298367640013918, "step": 12380 }, { "epoch": 4.129419613075384, "loss": 0.7390773892402649, "step": 12380 }, { "ce_loss": 0.20272931456565857, "epoch": 4.129419613075384, "step": 12380 }, { "distill_loss": 0.34763336181640625, "epoch": 4.129419613075384, "step": 12380 }, { "epoch": 4.129419613075384, "ref_ce_loss": 0.15068559348583221, "step": 12380 }, { "epoch": 4.129419613075384, "loss": 0.8524680733680725, "step": 12380 }, { "ce_loss": 0.27532801032066345, "epoch": 4.129419613075384, "step": 12380 }, { "distill_loss": 0.3684280514717102, "epoch": 4.129419613075384, "step": 12380 }, { "epoch": 4.129419613075384, "ref_ce_loss": 0.19492267072200775, "step": 12380 }, { "epoch": 4.132755170113409, "loss": 0.8947, "step": 12390 }, { "epoch": 4.132755170113409, "grad_norm": 2.2255940437316895, "step": 12390 }, { "epoch": 4.132755170113409, "learning_rate": 0.0005294279553732558, "step": 12390 }, { "epoch": 4.132755170113409, "loss": 0.8695392608642578, "step": 12390 }, { "ce_loss": 0.23825789988040924, "epoch": 4.132755170113409, "step": 12390 }, { "distill_loss": 0.4132382869720459, "epoch": 4.132755170113409, "step": 12390 }, { "epoch": 4.132755170113409, "ref_ce_loss": 0.16632671654224396, "step": 12390 }, { "epoch": 4.132755170113409, "loss": 0.8631477355957031, "step": 12390 }, { "ce_loss": 0.20770201086997986, "epoch": 4.132755170113409, "step": 12390 }, { "distill_loss": 0.37137627601623535, "epoch": 4.132755170113409, "step": 12390 }, { "epoch": 4.132755170113409, "ref_ce_loss": 0.2013377696275711, "step": 12390 }, { "epoch": 4.136090727151434, "loss": 0.8487, "step": 12400 }, { "epoch": 4.136090727151434, "grad_norm": 1.5887508392333984, "step": 12400 }, { "epoch": 4.136090727151434, "learning_rate": 0.0005290189956887691, "step": 12400 }, { "epoch": 4.136090727151434, "loss": 0.8215475082397461, "step": 12400 }, { "ce_loss": 0.20373262465000153, "epoch": 4.136090727151434, "step": 12400 }, { "distill_loss": 0.38555386662483215, "epoch": 4.136090727151434, "step": 12400 }, { "epoch": 4.136090727151434, "ref_ce_loss": 0.17290176451206207, "step": 12400 }, { "epoch": 4.136090727151434, "loss": 0.6696367263793945, "step": 12400 }, { "ce_loss": 0.17253293097019196, "epoch": 4.136090727151434, "step": 12400 }, { "distill_loss": 0.3173353672027588, "epoch": 4.136090727151434, "step": 12400 }, { "epoch": 4.136090727151434, "ref_ce_loss": 0.14702731370925903, "step": 12400 }, { "epoch": 4.13942628418946, "loss": 0.8461, "step": 12410 }, { "epoch": 4.13942628418946, "grad_norm": 1.7948837280273438, "step": 12410 }, { "epoch": 4.13942628418946, "learning_rate": 0.0005286098854252313, "step": 12410 }, { "epoch": 4.13942628418946, "loss": 0.807908833026886, "step": 12410 }, { "ce_loss": 0.16164608299732208, "epoch": 4.13942628418946, "step": 12410 }, { "distill_loss": 0.3655582368373871, "epoch": 4.13942628418946, "step": 12410 }, { "epoch": 4.13942628418946, "ref_ce_loss": 0.13646908104419708, "step": 12410 }, { "epoch": 4.13942628418946, "loss": 0.911085307598114, "step": 12410 }, { "ce_loss": 0.25217583775520325, "epoch": 4.13942628418946, "step": 12410 }, { "distill_loss": 0.3669683635234833, "epoch": 4.13942628418946, "step": 12410 }, { "epoch": 4.13942628418946, "ref_ce_loss": 0.1766831874847412, "step": 12410 }, { "epoch": 4.142761841227485, "loss": 0.9583, "step": 12420 }, { "epoch": 4.142761841227485, "grad_norm": 1.9843021631240845, "step": 12420 }, { "epoch": 4.142761841227485, "learning_rate": 0.0005282006250601183, "step": 12420 }, { "epoch": 4.142761841227485, "loss": 0.7271966934204102, "step": 12420 }, { "ce_loss": 0.2506415843963623, "epoch": 4.142761841227485, "step": 12420 }, { "distill_loss": 0.3042491674423218, "epoch": 4.142761841227485, "step": 12420 }, { "epoch": 4.142761841227485, "ref_ce_loss": 0.1721765249967575, "step": 12420 }, { "epoch": 4.142761841227485, "loss": 0.9262433052062988, "step": 12420 }, { "ce_loss": 0.19725489616394043, "epoch": 4.142761841227485, "step": 12420 }, { "distill_loss": 0.3291471004486084, "epoch": 4.142761841227485, "step": 12420 }, { "epoch": 4.142761841227485, "ref_ce_loss": 0.15821436047554016, "step": 12420 }, { "epoch": 4.1460973982655105, "loss": 0.8732, "step": 12430 }, { "epoch": 4.1460973982655105, "grad_norm": 1.9727813005447388, "step": 12430 }, { "epoch": 4.1460973982655105, "learning_rate": 0.0005277912150710808, "step": 12430 }, { "epoch": 4.1460973982655105, "loss": 0.7784503698348999, "step": 12430 }, { "ce_loss": 0.1683514267206192, "epoch": 4.1460973982655105, "step": 12430 }, { "distill_loss": 0.33196163177490234, "epoch": 4.1460973982655105, "step": 12430 }, { "epoch": 4.1460973982655105, "ref_ce_loss": 0.16521677374839783, "step": 12430 }, { "epoch": 4.1460973982655105, "loss": 0.6911976933479309, "step": 12430 }, { "ce_loss": 0.15645936131477356, "epoch": 4.1460973982655105, "step": 12430 }, { "distill_loss": 0.3178960382938385, "epoch": 4.1460973982655105, "step": 12430 }, { "epoch": 4.1460973982655105, "ref_ce_loss": 0.1477438360452652, "step": 12430 }, { "epoch": 4.149432955303536, "loss": 0.8732, "step": 12440 }, { "epoch": 4.149432955303536, "grad_norm": 5.099236488342285, "step": 12440 }, { "epoch": 4.149432955303536, "learning_rate": 0.0005273816559359444, "step": 12440 }, { "epoch": 4.149432955303536, "loss": 0.764206051826477, "step": 12440 }, { "ce_loss": 0.14106637239456177, "epoch": 4.149432955303536, "step": 12440 }, { "distill_loss": 0.35556140542030334, "epoch": 4.149432955303536, "step": 12440 }, { "epoch": 4.149432955303536, "ref_ce_loss": 0.13623017072677612, "step": 12440 }, { "epoch": 4.149432955303536, "loss": 0.6508702635765076, "step": 12440 }, { "ce_loss": 0.13022911548614502, "epoch": 4.149432955303536, "step": 12440 }, { "distill_loss": 0.3620184659957886, "epoch": 4.149432955303536, "step": 12440 }, { "epoch": 4.149432955303536, "ref_ce_loss": 0.11632784456014633, "step": 12440 }, { "epoch": 4.152768512341561, "loss": 0.8843, "step": 12450 }, { "epoch": 4.152768512341561, "grad_norm": 2.6267311573028564, "step": 12450 }, { "epoch": 4.152768512341561, "learning_rate": 0.0005269719481327087, "step": 12450 }, { "epoch": 4.152768512341561, "loss": 0.6806822419166565, "step": 12450 }, { "ce_loss": 0.15208680927753448, "epoch": 4.152768512341561, "step": 12450 }, { "distill_loss": 0.3680131137371063, "epoch": 4.152768512341561, "step": 12450 }, { "epoch": 4.152768512341561, "ref_ce_loss": 0.16049380600452423, "step": 12450 }, { "epoch": 4.152768512341561, "loss": 1.2068417072296143, "step": 12450 }, { "ce_loss": 0.1920844465494156, "epoch": 4.152768512341561, "step": 12450 }, { "distill_loss": 0.3374309539794922, "epoch": 4.152768512341561, "step": 12450 }, { "epoch": 4.152768512341561, "ref_ce_loss": 0.18087457120418549, "step": 12450 }, { "epoch": 4.1561040693795865, "loss": 0.8568, "step": 12460 }, { "epoch": 4.1561040693795865, "grad_norm": 1.804216742515564, "step": 12460 }, { "epoch": 4.1561040693795865, "learning_rate": 0.0005265620921395469, "step": 12460 }, { "epoch": 4.1561040693795865, "loss": 0.8538085222244263, "step": 12460 }, { "ce_loss": 0.25904232263565063, "epoch": 4.1561040693795865, "step": 12460 }, { "distill_loss": 0.35094770789146423, "epoch": 4.1561040693795865, "step": 12460 }, { "epoch": 4.1561040693795865, "ref_ce_loss": 0.1721329391002655, "step": 12460 }, { "epoch": 4.1561040693795865, "loss": 1.1678739786148071, "step": 12460 }, { "ce_loss": 0.2640455365180969, "epoch": 4.1561040693795865, "step": 12460 }, { "distill_loss": 0.39140695333480835, "epoch": 4.1561040693795865, "step": 12460 }, { "epoch": 4.1561040693795865, "ref_ce_loss": 0.1936824917793274, "step": 12460 }, { "epoch": 4.159439626417612, "loss": 0.9146, "step": 12470 }, { "epoch": 4.159439626417612, "grad_norm": 1.7934942245483398, "step": 12470 }, { "epoch": 4.159439626417612, "learning_rate": 0.0005261520884348048, "step": 12470 }, { "epoch": 4.159439626417612, "loss": 1.1549052000045776, "step": 12470 }, { "ce_loss": 0.3144696354866028, "epoch": 4.159439626417612, "step": 12470 }, { "distill_loss": 0.4195968508720398, "epoch": 4.159439626417612, "step": 12470 }, { "epoch": 4.159439626417612, "ref_ce_loss": 0.18539300560951233, "step": 12470 }, { "epoch": 4.159439626417612, "loss": 1.1561297178268433, "step": 12470 }, { "ce_loss": 0.30278608202934265, "epoch": 4.159439626417612, "step": 12470 }, { "distill_loss": 0.47233226895332336, "epoch": 4.159439626417612, "step": 12470 }, { "epoch": 4.159439626417612, "ref_ce_loss": 0.21444253623485565, "step": 12470 }, { "epoch": 4.162775183455637, "loss": 1.0016, "step": 12480 }, { "epoch": 4.162775183455637, "grad_norm": 2.2133514881134033, "step": 12480 }, { "epoch": 4.162775183455637, "learning_rate": 0.0005257419374970012, "step": 12480 }, { "epoch": 4.162775183455637, "loss": 0.9485584497451782, "step": 12480 }, { "ce_loss": 0.14209671318531036, "epoch": 4.162775183455637, "step": 12480 }, { "distill_loss": 0.43666237592697144, "epoch": 4.162775183455637, "step": 12480 }, { "epoch": 4.162775183455637, "ref_ce_loss": 0.16533339023590088, "step": 12480 }, { "epoch": 4.162775183455637, "loss": 0.7782878875732422, "step": 12480 }, { "ce_loss": 0.18979869782924652, "epoch": 4.162775183455637, "step": 12480 }, { "distill_loss": 0.40814101696014404, "epoch": 4.162775183455637, "step": 12480 }, { "epoch": 4.162775183455637, "ref_ce_loss": 0.14509843289852142, "step": 12480 }, { "epoch": 4.166110740493663, "loss": 0.8745, "step": 12490 }, { "epoch": 4.166110740493663, "grad_norm": 1.8272050619125366, "step": 12490 }, { "epoch": 4.166110740493663, "learning_rate": 0.0005253316398048258, "step": 12490 }, { "epoch": 4.166110740493663, "loss": 0.8993416428565979, "step": 12490 }, { "ce_loss": 0.2716492712497711, "epoch": 4.166110740493663, "step": 12490 }, { "distill_loss": 0.38575854897499084, "epoch": 4.166110740493663, "step": 12490 }, { "epoch": 4.166110740493663, "ref_ce_loss": 0.19661776721477509, "step": 12490 }, { "epoch": 4.166110740493663, "loss": 0.8288870453834534, "step": 12490 }, { "ce_loss": 0.2048046737909317, "epoch": 4.166110740493663, "step": 12490 }, { "distill_loss": 0.3267305791378021, "epoch": 4.166110740493663, "step": 12490 }, { "epoch": 4.166110740493663, "ref_ce_loss": 0.15473157167434692, "step": 12490 }, { "epoch": 4.169446297531688, "loss": 0.8546, "step": 12500 }, { "epoch": 4.169446297531688, "grad_norm": 1.6469388008117676, "step": 12500 }, { "epoch": 4.169446297531688, "learning_rate": 0.0005249211958371406, "step": 12500 }, { "epoch": 4.169446297531688, "loss": 0.9824590086936951, "step": 12500 }, { "ce_loss": 0.2935064435005188, "epoch": 4.169446297531688, "step": 12500 }, { "distill_loss": 0.39997178316116333, "epoch": 4.169446297531688, "step": 12500 }, { "epoch": 4.169446297531688, "ref_ce_loss": 0.21537864208221436, "step": 12500 }, { "epoch": 4.169446297531688, "loss": 0.7035849690437317, "step": 12500 }, { "ce_loss": 0.20198889076709747, "epoch": 4.169446297531688, "step": 12500 }, { "distill_loss": 0.3350256383419037, "epoch": 4.169446297531688, "step": 12500 }, { "epoch": 4.169446297531688, "ref_ce_loss": 0.16631345450878143, "step": 12500 }, { "epoch": 4.172781854569713, "loss": 0.9153, "step": 12510 }, { "epoch": 4.172781854569713, "grad_norm": 3.0709216594696045, "step": 12510 }, { "epoch": 4.172781854569713, "learning_rate": 0.000524510606072978, "step": 12510 }, { "epoch": 4.172781854569713, "loss": 0.7076886296272278, "step": 12510 }, { "ce_loss": 0.17329534888267517, "epoch": 4.172781854569713, "step": 12510 }, { "distill_loss": 0.34127768874168396, "epoch": 4.172781854569713, "step": 12510 }, { "epoch": 4.172781854569713, "ref_ce_loss": 0.1456620842218399, "step": 12510 }, { "epoch": 4.172781854569713, "loss": 0.6534823775291443, "step": 12510 }, { "ce_loss": 0.18447883427143097, "epoch": 4.172781854569713, "step": 12510 }, { "distill_loss": 0.29903972148895264, "epoch": 4.172781854569713, "step": 12510 }, { "epoch": 4.172781854569713, "ref_ce_loss": 0.16942507028579712, "step": 12510 }, { "epoch": 4.176117411607739, "loss": 0.8183, "step": 12520 }, { "epoch": 4.176117411607739, "grad_norm": 1.55356764793396, "step": 12520 }, { "epoch": 4.176117411607739, "learning_rate": 0.00052409987099154, "step": 12520 }, { "epoch": 4.176117411607739, "loss": 0.7567281723022461, "step": 12520 }, { "ce_loss": 0.17792902886867523, "epoch": 4.176117411607739, "step": 12520 }, { "distill_loss": 0.3390805721282959, "epoch": 4.176117411607739, "step": 12520 }, { "epoch": 4.176117411607739, "ref_ce_loss": 0.16375339031219482, "step": 12520 }, { "epoch": 4.176117411607739, "loss": 0.6664754748344421, "step": 12520 }, { "ce_loss": 0.20032376050949097, "epoch": 4.176117411607739, "step": 12520 }, { "distill_loss": 0.32088297605514526, "epoch": 4.176117411607739, "step": 12520 }, { "epoch": 4.176117411607739, "ref_ce_loss": 0.120712511241436, "step": 12520 }, { "epoch": 4.179452968645764, "loss": 0.7824, "step": 12530 }, { "epoch": 4.179452968645764, "grad_norm": 1.796610713005066, "step": 12530 }, { "epoch": 4.179452968645764, "learning_rate": 0.0005236889910721989, "step": 12530 }, { "epoch": 4.179452968645764, "loss": 0.7799438238143921, "step": 12530 }, { "ce_loss": 0.23633547127246857, "epoch": 4.179452968645764, "step": 12530 }, { "distill_loss": 0.35754677653312683, "epoch": 4.179452968645764, "step": 12530 }, { "epoch": 4.179452968645764, "ref_ce_loss": 0.1504986733198166, "step": 12530 }, { "epoch": 4.179452968645764, "loss": 0.5622330904006958, "step": 12530 }, { "ce_loss": 0.132956400513649, "epoch": 4.179452968645764, "step": 12530 }, { "distill_loss": 0.27733558416366577, "epoch": 4.179452968645764, "step": 12530 }, { "epoch": 4.179452968645764, "ref_ce_loss": 0.15136227011680603, "step": 12530 }, { "epoch": 4.182788525683789, "loss": 0.7844, "step": 12540 }, { "epoch": 4.182788525683789, "grad_norm": 1.6813994646072388, "step": 12540 }, { "epoch": 4.182788525683789, "learning_rate": 0.0005232779667944959, "step": 12540 }, { "epoch": 4.182788525683789, "loss": 1.1557304859161377, "step": 12540 }, { "ce_loss": 0.19993092119693756, "epoch": 4.182788525683789, "step": 12540 }, { "distill_loss": 0.4479179084300995, "epoch": 4.182788525683789, "step": 12540 }, { "epoch": 4.182788525683789, "ref_ce_loss": 0.15367430448532104, "step": 12540 }, { "epoch": 4.182788525683789, "loss": 0.7927672266960144, "step": 12540 }, { "ce_loss": 0.26575636863708496, "epoch": 4.182788525683789, "step": 12540 }, { "distill_loss": 0.3921426832675934, "epoch": 4.182788525683789, "step": 12540 }, { "epoch": 4.182788525683789, "ref_ce_loss": 0.1343362182378769, "step": 12540 }, { "epoch": 4.186124082721815, "loss": 0.868, "step": 12550 }, { "epoch": 4.186124082721815, "grad_norm": 1.5843020677566528, "step": 12550 }, { "epoch": 4.186124082721815, "learning_rate": 0.0005228667986381402, "step": 12550 }, { "epoch": 4.186124082721815, "loss": 0.9517365097999573, "step": 12550 }, { "ce_loss": 0.22790701687335968, "epoch": 4.186124082721815, "step": 12550 }, { "distill_loss": 0.42120158672332764, "epoch": 4.186124082721815, "step": 12550 }, { "epoch": 4.186124082721815, "ref_ce_loss": 0.1550002545118332, "step": 12550 }, { "epoch": 4.186124082721815, "loss": 0.7985783219337463, "step": 12550 }, { "ce_loss": 0.23687949776649475, "epoch": 4.186124082721815, "step": 12550 }, { "distill_loss": 0.30934327840805054, "epoch": 4.186124082721815, "step": 12550 }, { "epoch": 4.186124082721815, "ref_ce_loss": 0.19739030301570892, "step": 12550 }, { "epoch": 4.18945963975984, "loss": 0.9034, "step": 12560 }, { "epoch": 4.18945963975984, "grad_norm": 1.9663341045379639, "step": 12560 }, { "epoch": 4.18945963975984, "learning_rate": 0.0005224554870830095, "step": 12560 }, { "epoch": 4.18945963975984, "loss": 0.7035840153694153, "step": 12560 }, { "ce_loss": 0.21058326959609985, "epoch": 4.18945963975984, "step": 12560 }, { "distill_loss": 0.2658486068248749, "epoch": 4.18945963975984, "step": 12560 }, { "epoch": 4.18945963975984, "ref_ce_loss": 0.1693160980939865, "step": 12560 }, { "epoch": 4.18945963975984, "loss": 0.8260355591773987, "step": 12560 }, { "ce_loss": 0.22241933643817902, "epoch": 4.18945963975984, "step": 12560 }, { "distill_loss": 0.32658758759498596, "epoch": 4.18945963975984, "step": 12560 }, { "epoch": 4.18945963975984, "ref_ce_loss": 0.1754622608423233, "step": 12560 }, { "epoch": 4.192795196797865, "loss": 0.8248, "step": 12570 }, { "epoch": 4.192795196797865, "grad_norm": 1.6988965272903442, "step": 12570 }, { "epoch": 4.192795196797865, "learning_rate": 0.0005220440326091486, "step": 12570 }, { "epoch": 4.192795196797865, "loss": 0.8381247520446777, "step": 12570 }, { "ce_loss": 0.24428690969944, "epoch": 4.192795196797865, "step": 12570 }, { "distill_loss": 0.3550545573234558, "epoch": 4.192795196797865, "step": 12570 }, { "epoch": 4.192795196797865, "ref_ce_loss": 0.17451095581054688, "step": 12570 }, { "epoch": 4.192795196797865, "loss": 0.5025757551193237, "step": 12570 }, { "ce_loss": 0.12798961997032166, "epoch": 4.192795196797865, "step": 12570 }, { "distill_loss": 0.20385515689849854, "epoch": 4.192795196797865, "step": 12570 }, { "epoch": 4.192795196797865, "ref_ce_loss": 0.12277933955192566, "step": 12570 }, { "epoch": 4.196130753835891, "loss": 0.8421, "step": 12580 }, { "epoch": 4.196130753835891, "grad_norm": 2.9220128059387207, "step": 12580 }, { "epoch": 4.196130753835891, "learning_rate": 0.0005216324356967692, "step": 12580 }, { "epoch": 4.196130753835891, "loss": 0.8571416139602661, "step": 12580 }, { "ce_loss": 0.27399173378944397, "epoch": 4.196130753835891, "step": 12580 }, { "distill_loss": 0.3700946271419525, "epoch": 4.196130753835891, "step": 12580 }, { "epoch": 4.196130753835891, "ref_ce_loss": 0.16993477940559387, "step": 12580 }, { "epoch": 4.196130753835891, "loss": 0.9421014785766602, "step": 12580 }, { "ce_loss": 0.2747233510017395, "epoch": 4.196130753835891, "step": 12580 }, { "distill_loss": 0.3414028286933899, "epoch": 4.196130753835891, "step": 12580 }, { "epoch": 4.196130753835891, "ref_ce_loss": 0.1843489110469818, "step": 12580 }, { "epoch": 4.199466310873916, "loss": 0.9192, "step": 12590 }, { "epoch": 4.199466310873916, "grad_norm": 1.5139302015304565, "step": 12590 }, { "epoch": 4.199466310873916, "learning_rate": 0.0005212206968262492, "step": 12590 }, { "epoch": 4.199466310873916, "loss": 0.7501705884933472, "step": 12590 }, { "ce_loss": 0.21901053190231323, "epoch": 4.199466310873916, "step": 12590 }, { "distill_loss": 0.3573254942893982, "epoch": 4.199466310873916, "step": 12590 }, { "epoch": 4.199466310873916, "ref_ce_loss": 0.17358480393886566, "step": 12590 }, { "epoch": 4.199466310873916, "loss": 1.0535688400268555, "step": 12590 }, { "ce_loss": 0.23637911677360535, "epoch": 4.199466310873916, "step": 12590 }, { "distill_loss": 0.3890411853790283, "epoch": 4.199466310873916, "step": 12590 }, { "epoch": 4.199466310873916, "ref_ce_loss": 0.1661222130060196, "step": 12590 }, { "epoch": 4.202801867911941, "loss": 0.9307, "step": 12600 }, { "epoch": 4.202801867911941, "grad_norm": 4.023229122161865, "step": 12600 }, { "epoch": 4.202801867911941, "learning_rate": 0.0005208088164781322, "step": 12600 }, { "epoch": 4.202801867911941, "loss": 0.8946199417114258, "step": 12600 }, { "ce_loss": 0.20067694783210754, "epoch": 4.202801867911941, "step": 12600 }, { "distill_loss": 0.369188517332077, "epoch": 4.202801867911941, "step": 12600 }, { "epoch": 4.202801867911941, "ref_ce_loss": 0.14849264919757843, "step": 12600 }, { "epoch": 4.202801867911941, "loss": 0.7389215230941772, "step": 12600 }, { "ce_loss": 0.1676078885793686, "epoch": 4.202801867911941, "step": 12600 }, { "distill_loss": 0.3055753707885742, "epoch": 4.202801867911941, "step": 12600 }, { "epoch": 4.202801867911941, "ref_ce_loss": 0.17706617712974548, "step": 12600 }, { "epoch": 4.206137424949967, "loss": 0.8706, "step": 12610 }, { "epoch": 4.206137424949967, "grad_norm": 2.451463222503662, "step": 12610 }, { "epoch": 4.206137424949967, "learning_rate": 0.0005203967951331266, "step": 12610 }, { "epoch": 4.206137424949967, "loss": 0.9454283714294434, "step": 12610 }, { "ce_loss": 0.28624972701072693, "epoch": 4.206137424949967, "step": 12610 }, { "distill_loss": 0.4206025004386902, "epoch": 4.206137424949967, "step": 12610 }, { "epoch": 4.206137424949967, "ref_ce_loss": 0.1610059291124344, "step": 12610 }, { "epoch": 4.206137424949967, "loss": 0.8937854766845703, "step": 12610 }, { "ce_loss": 0.19485153257846832, "epoch": 4.206137424949967, "step": 12610 }, { "distill_loss": 0.4219454228878021, "epoch": 4.206137424949967, "step": 12610 }, { "epoch": 4.206137424949967, "ref_ce_loss": 0.1623564213514328, "step": 12610 }, { "epoch": 4.209472981987992, "loss": 0.9549, "step": 12620 }, { "epoch": 4.209472981987992, "grad_norm": 1.8533005714416504, "step": 12620 }, { "epoch": 4.209472981987992, "learning_rate": 0.0005199846332721059, "step": 12620 }, { "epoch": 4.209472981987992, "loss": 1.1628186702728271, "step": 12620 }, { "ce_loss": 0.2804311513900757, "epoch": 4.209472981987992, "step": 12620 }, { "distill_loss": 0.44220656156539917, "epoch": 4.209472981987992, "step": 12620 }, { "epoch": 4.209472981987992, "ref_ce_loss": 0.21410658955574036, "step": 12620 }, { "epoch": 4.209472981987992, "loss": 0.6214933395385742, "step": 12620 }, { "ce_loss": 0.13951043784618378, "epoch": 4.209472981987992, "step": 12620 }, { "distill_loss": 0.3288695514202118, "epoch": 4.209472981987992, "step": 12620 }, { "epoch": 4.209472981987992, "ref_ce_loss": 0.11865205317735672, "step": 12620 }, { "epoch": 4.2128085390260175, "loss": 0.9514, "step": 12630 }, { "epoch": 4.2128085390260175, "grad_norm": 3.439215660095215, "step": 12630 }, { "epoch": 4.2128085390260175, "learning_rate": 0.0005195723313761074, "step": 12630 }, { "epoch": 4.2128085390260175, "loss": 0.7460795640945435, "step": 12630 }, { "ce_loss": 0.193592369556427, "epoch": 4.2128085390260175, "step": 12630 }, { "distill_loss": 0.3062877655029297, "epoch": 4.2128085390260175, "step": 12630 }, { "epoch": 4.2128085390260175, "ref_ce_loss": 0.14694787561893463, "step": 12630 }, { "epoch": 4.2128085390260175, "loss": 0.9722935557365417, "step": 12630 }, { "ce_loss": 0.21647977828979492, "epoch": 4.2128085390260175, "step": 12630 }, { "distill_loss": 0.4791179597377777, "epoch": 4.2128085390260175, "step": 12630 }, { "epoch": 4.2128085390260175, "ref_ce_loss": 0.17317292094230652, "step": 12630 }, { "epoch": 4.216144096064043, "loss": 0.8806, "step": 12640 }, { "epoch": 4.216144096064043, "grad_norm": 1.7811447381973267, "step": 12640 }, { "epoch": 4.216144096064043, "learning_rate": 0.0005191598899263315, "step": 12640 }, { "epoch": 4.216144096064043, "loss": 0.9708473086357117, "step": 12640 }, { "ce_loss": 0.20301295816898346, "epoch": 4.216144096064043, "step": 12640 }, { "distill_loss": 0.38349807262420654, "epoch": 4.216144096064043, "step": 12640 }, { "epoch": 4.216144096064043, "ref_ce_loss": 0.23510898649692535, "step": 12640 }, { "epoch": 4.216144096064043, "loss": 0.7471814751625061, "step": 12640 }, { "ce_loss": 0.1882941722869873, "epoch": 4.216144096064043, "step": 12640 }, { "distill_loss": 0.3489193022251129, "epoch": 4.216144096064043, "step": 12640 }, { "epoch": 4.216144096064043, "ref_ce_loss": 0.18988509476184845, "step": 12640 }, { "epoch": 4.219479653102068, "loss": 0.814, "step": 12650 }, { "epoch": 4.219479653102068, "grad_norm": 2.196380138397217, "step": 12650 }, { "epoch": 4.219479653102068, "learning_rate": 0.0005187473094041421, "step": 12650 }, { "epoch": 4.219479653102068, "loss": 0.7829256653785706, "step": 12650 }, { "ce_loss": 0.19593583047389984, "epoch": 4.219479653102068, "step": 12650 }, { "distill_loss": 0.35681459307670593, "epoch": 4.219479653102068, "step": 12650 }, { "epoch": 4.219479653102068, "ref_ce_loss": 0.1733783781528473, "step": 12650 }, { "epoch": 4.219479653102068, "loss": 0.8933637738227844, "step": 12650 }, { "ce_loss": 0.2182813584804535, "epoch": 4.219479653102068, "step": 12650 }, { "distill_loss": 0.3945019841194153, "epoch": 4.219479653102068, "step": 12650 }, { "epoch": 4.219479653102068, "ref_ce_loss": 0.14737632870674133, "step": 12650 }, { "epoch": 4.2228152101400935, "loss": 0.7941, "step": 12660 }, { "epoch": 4.2228152101400935, "grad_norm": 2.7803497314453125, "step": 12660 }, { "epoch": 4.2228152101400935, "learning_rate": 0.0005183345902910646, "step": 12660 }, { "epoch": 4.2228152101400935, "loss": 0.663300096988678, "step": 12660 }, { "ce_loss": 0.15582312643527985, "epoch": 4.2228152101400935, "step": 12660 }, { "distill_loss": 0.29122138023376465, "epoch": 4.2228152101400935, "step": 12660 }, { "epoch": 4.2228152101400935, "ref_ce_loss": 0.16431498527526855, "step": 12660 }, { "epoch": 4.2228152101400935, "loss": 0.893598198890686, "step": 12660 }, { "ce_loss": 0.24435116350650787, "epoch": 4.2228152101400935, "step": 12660 }, { "distill_loss": 0.31676265597343445, "epoch": 4.2228152101400935, "step": 12660 }, { "epoch": 4.2228152101400935, "ref_ce_loss": 0.18816092610359192, "step": 12660 }, { "epoch": 4.226150767178119, "loss": 0.8823, "step": 12670 }, { "epoch": 4.226150767178119, "grad_norm": 1.656362533569336, "step": 12670 }, { "epoch": 4.226150767178119, "learning_rate": 0.0005179217330687872, "step": 12670 }, { "epoch": 4.226150767178119, "loss": 0.9187933206558228, "step": 12670 }, { "ce_loss": 0.23749017715454102, "epoch": 4.226150767178119, "step": 12670 }, { "distill_loss": 0.3897465765476227, "epoch": 4.226150767178119, "step": 12670 }, { "epoch": 4.226150767178119, "ref_ce_loss": 0.15931007266044617, "step": 12670 }, { "epoch": 4.226150767178119, "loss": 0.8433822393417358, "step": 12670 }, { "ce_loss": 0.25055891275405884, "epoch": 4.226150767178119, "step": 12670 }, { "distill_loss": 0.3643941879272461, "epoch": 4.226150767178119, "step": 12670 }, { "epoch": 4.226150767178119, "ref_ce_loss": 0.1786240190267563, "step": 12670 }, { "epoch": 4.229486324216144, "loss": 0.8254, "step": 12680 }, { "epoch": 4.229486324216144, "grad_norm": 1.8392776250839233, "step": 12680 }, { "epoch": 4.229486324216144, "learning_rate": 0.0005175087382191583, "step": 12680 }, { "epoch": 4.229486324216144, "loss": 0.9356280565261841, "step": 12680 }, { "ce_loss": 0.24201759696006775, "epoch": 4.229486324216144, "step": 12680 }, { "distill_loss": 0.3536796569824219, "epoch": 4.229486324216144, "step": 12680 }, { "epoch": 4.229486324216144, "ref_ce_loss": 0.17648980021476746, "step": 12680 }, { "epoch": 4.229486324216144, "loss": 0.8160145878791809, "step": 12680 }, { "ce_loss": 0.23435355722904205, "epoch": 4.229486324216144, "step": 12680 }, { "distill_loss": 0.28745996952056885, "epoch": 4.229486324216144, "step": 12680 }, { "epoch": 4.229486324216144, "ref_ce_loss": 0.18511168658733368, "step": 12680 }, { "epoch": 4.23282188125417, "loss": 0.8672, "step": 12690 }, { "epoch": 4.23282188125417, "grad_norm": 1.6637465953826904, "step": 12690 }, { "epoch": 4.23282188125417, "learning_rate": 0.0005170956062241875, "step": 12690 }, { "epoch": 4.23282188125417, "loss": 0.8128212690353394, "step": 12690 }, { "ce_loss": 0.2414637804031372, "epoch": 4.23282188125417, "step": 12690 }, { "distill_loss": 0.362344354391098, "epoch": 4.23282188125417, "step": 12690 }, { "epoch": 4.23282188125417, "ref_ce_loss": 0.17531849443912506, "step": 12690 }, { "epoch": 4.23282188125417, "loss": 0.8842551112174988, "step": 12690 }, { "ce_loss": 0.269711434841156, "epoch": 4.23282188125417, "step": 12690 }, { "distill_loss": 0.369120717048645, "epoch": 4.23282188125417, "step": 12690 }, { "epoch": 4.23282188125417, "ref_ce_loss": 0.1902657300233841, "step": 12690 }, { "epoch": 4.236157438292195, "loss": 0.8807, "step": 12700 }, { "epoch": 4.236157438292195, "grad_norm": 1.8229620456695557, "step": 12700 }, { "epoch": 4.236157438292195, "learning_rate": 0.0005166823375660441, "step": 12700 }, { "epoch": 4.236157438292195, "loss": 1.0383368730545044, "step": 12700 }, { "ce_loss": 0.22843684256076813, "epoch": 4.236157438292195, "step": 12700 }, { "distill_loss": 0.37846410274505615, "epoch": 4.236157438292195, "step": 12700 }, { "epoch": 4.236157438292195, "ref_ce_loss": 0.21161183714866638, "step": 12700 }, { "epoch": 4.236157438292195, "loss": 0.7493757605552673, "step": 12700 }, { "ce_loss": 0.17815834283828735, "epoch": 4.236157438292195, "step": 12700 }, { "distill_loss": 0.3474876582622528, "epoch": 4.236157438292195, "step": 12700 }, { "epoch": 4.236157438292195, "ref_ce_loss": 0.18813316524028778, "step": 12700 }, { "epoch": 4.23949299533022, "loss": 0.9174, "step": 12710 }, { "epoch": 4.23949299533022, "grad_norm": 1.520404577255249, "step": 12710 }, { "epoch": 4.23949299533022, "learning_rate": 0.0005162689327270573, "step": 12710 }, { "epoch": 4.23949299533022, "loss": 1.1266505718231201, "step": 12710 }, { "ce_loss": 0.24157746136188507, "epoch": 4.23949299533022, "step": 12710 }, { "distill_loss": 0.4278128147125244, "epoch": 4.23949299533022, "step": 12710 }, { "epoch": 4.23949299533022, "ref_ce_loss": 0.18293094635009766, "step": 12710 }, { "epoch": 4.23949299533022, "loss": 0.9948121905326843, "step": 12710 }, { "ce_loss": 0.2826690971851349, "epoch": 4.23949299533022, "step": 12710 }, { "distill_loss": 0.4682421088218689, "epoch": 4.23949299533022, "step": 12710 }, { "epoch": 4.23949299533022, "ref_ce_loss": 0.19122135639190674, "step": 12710 }, { "epoch": 4.242828552368246, "loss": 0.8706, "step": 12720 }, { "epoch": 4.242828552368246, "grad_norm": 2.3407678604125977, "step": 12720 }, { "epoch": 4.242828552368246, "learning_rate": 0.0005158553921897149, "step": 12720 }, { "epoch": 4.242828552368246, "loss": 1.2130777835845947, "step": 12720 }, { "ce_loss": 0.16182638704776764, "epoch": 4.242828552368246, "step": 12720 }, { "distill_loss": 0.3381054103374481, "epoch": 4.242828552368246, "step": 12720 }, { "epoch": 4.242828552368246, "ref_ce_loss": 0.154271200299263, "step": 12720 }, { "epoch": 4.242828552368246, "loss": 0.8921987414360046, "step": 12720 }, { "ce_loss": 0.21352069079875946, "epoch": 4.242828552368246, "step": 12720 }, { "distill_loss": 0.42966151237487793, "epoch": 4.242828552368246, "step": 12720 }, { "epoch": 4.242828552368246, "ref_ce_loss": 0.20217864215373993, "step": 12720 }, { "epoch": 4.246164109406271, "loss": 0.8674, "step": 12730 }, { "epoch": 4.246164109406271, "grad_norm": 3.0453741550445557, "step": 12730 }, { "epoch": 4.246164109406271, "learning_rate": 0.0005154417164366633, "step": 12730 }, { "epoch": 4.246164109406271, "loss": 1.0986595153808594, "step": 12730 }, { "ce_loss": 0.22912782430648804, "epoch": 4.246164109406271, "step": 12730 }, { "distill_loss": 0.3212457597255707, "epoch": 4.246164109406271, "step": 12730 }, { "epoch": 4.246164109406271, "ref_ce_loss": 0.15528671443462372, "step": 12730 }, { "epoch": 4.246164109406271, "loss": 0.7199212908744812, "step": 12730 }, { "ce_loss": 0.2046150267124176, "epoch": 4.246164109406271, "step": 12730 }, { "distill_loss": 0.33154305815696716, "epoch": 4.246164109406271, "step": 12730 }, { "epoch": 4.246164109406271, "ref_ce_loss": 0.12476657330989838, "step": 12730 }, { "epoch": 4.249499666444296, "loss": 0.8704, "step": 12740 }, { "epoch": 4.249499666444296, "grad_norm": 1.2896811962127686, "step": 12740 }, { "epoch": 4.249499666444296, "learning_rate": 0.0005150279059507065, "step": 12740 }, { "epoch": 4.249499666444296, "loss": 0.7782720327377319, "step": 12740 }, { "ce_loss": 0.1771492063999176, "epoch": 4.249499666444296, "step": 12740 }, { "distill_loss": 0.30043888092041016, "epoch": 4.249499666444296, "step": 12740 }, { "epoch": 4.249499666444296, "ref_ce_loss": 0.174044668674469, "step": 12740 }, { "epoch": 4.249499666444296, "loss": 0.6854279041290283, "step": 12740 }, { "ce_loss": 0.1830480545759201, "epoch": 4.249499666444296, "step": 12740 }, { "distill_loss": 0.31467360258102417, "epoch": 4.249499666444296, "step": 12740 }, { "epoch": 4.249499666444296, "ref_ce_loss": 0.1465923935174942, "step": 12740 }, { "epoch": 4.252835223482322, "loss": 0.8925, "step": 12750 }, { "epoch": 4.252835223482322, "grad_norm": 1.632073163986206, "step": 12750 }, { "epoch": 4.252835223482322, "learning_rate": 0.0005146139612148061, "step": 12750 }, { "epoch": 4.252835223482322, "loss": 0.8414084315299988, "step": 12750 }, { "ce_loss": 0.2268008589744568, "epoch": 4.252835223482322, "step": 12750 }, { "distill_loss": 0.34458792209625244, "epoch": 4.252835223482322, "step": 12750 }, { "epoch": 4.252835223482322, "ref_ce_loss": 0.15668857097625732, "step": 12750 }, { "epoch": 4.252835223482322, "loss": 0.6938906908035278, "step": 12750 }, { "ce_loss": 0.17403708398342133, "epoch": 4.252835223482322, "step": 12750 }, { "distill_loss": 0.3637546896934509, "epoch": 4.252835223482322, "step": 12750 }, { "epoch": 4.252835223482322, "ref_ce_loss": 0.15595611929893494, "step": 12750 }, { "epoch": 4.256170780520347, "loss": 0.8466, "step": 12760 }, { "epoch": 4.256170780520347, "grad_norm": 1.6854667663574219, "step": 12760 }, { "epoch": 4.256170780520347, "learning_rate": 0.0005141998827120799, "step": 12760 }, { "epoch": 4.256170780520347, "loss": 0.804252564907074, "step": 12760 }, { "ce_loss": 0.25367993116378784, "epoch": 4.256170780520347, "step": 12760 }, { "distill_loss": 0.3248867690563202, "epoch": 4.256170780520347, "step": 12760 }, { "epoch": 4.256170780520347, "ref_ce_loss": 0.18379093706607819, "step": 12760 }, { "epoch": 4.256170780520347, "loss": 0.9785430431365967, "step": 12760 }, { "ce_loss": 0.29850438237190247, "epoch": 4.256170780520347, "step": 12760 }, { "distill_loss": 0.3835088610649109, "epoch": 4.256170780520347, "step": 12760 }, { "epoch": 4.256170780520347, "ref_ce_loss": 0.1554730236530304, "step": 12760 }, { "epoch": 4.259506337558372, "loss": 0.9539, "step": 12770 }, { "epoch": 4.259506337558372, "grad_norm": 2.00483775138855, "step": 12770 }, { "epoch": 4.259506337558372, "learning_rate": 0.0005137856709258021, "step": 12770 }, { "epoch": 4.259506337558372, "loss": 0.759904682636261, "step": 12770 }, { "ce_loss": 0.22166599333286285, "epoch": 4.259506337558372, "step": 12770 }, { "distill_loss": 0.3422871232032776, "epoch": 4.259506337558372, "step": 12770 }, { "epoch": 4.259506337558372, "ref_ce_loss": 0.194756418466568, "step": 12770 }, { "epoch": 4.259506337558372, "loss": 0.7452161908149719, "step": 12770 }, { "ce_loss": 0.1742466241121292, "epoch": 4.259506337558372, "step": 12770 }, { "distill_loss": 0.2873826026916504, "epoch": 4.259506337558372, "step": 12770 }, { "epoch": 4.259506337558372, "ref_ce_loss": 0.1804172545671463, "step": 12770 }, { "epoch": 4.262841894596398, "loss": 0.9132, "step": 12780 }, { "epoch": 4.262841894596398, "grad_norm": 1.7677546739578247, "step": 12780 }, { "epoch": 4.262841894596398, "learning_rate": 0.0005133713263394025, "step": 12780 }, { "epoch": 4.262841894596398, "loss": 1.2018852233886719, "step": 12780 }, { "ce_loss": 0.2658690810203552, "epoch": 4.262841894596398, "step": 12780 }, { "distill_loss": 0.4588566720485687, "epoch": 4.262841894596398, "step": 12780 }, { "epoch": 4.262841894596398, "ref_ce_loss": 0.20907336473464966, "step": 12780 }, { "epoch": 4.262841894596398, "loss": 0.7002390027046204, "step": 12780 }, { "ce_loss": 0.19882053136825562, "epoch": 4.262841894596398, "step": 12780 }, { "distill_loss": 0.3090733587741852, "epoch": 4.262841894596398, "step": 12780 }, { "epoch": 4.262841894596398, "ref_ce_loss": 0.1564292162656784, "step": 12780 }, { "epoch": 4.266177451634423, "loss": 0.9217, "step": 12790 }, { "epoch": 4.266177451634423, "grad_norm": 2.054368019104004, "step": 12790 }, { "epoch": 4.266177451634423, "learning_rate": 0.000512956849436466, "step": 12790 }, { "epoch": 4.266177451634423, "loss": 0.8584941625595093, "step": 12790 }, { "ce_loss": 0.22313432395458221, "epoch": 4.266177451634423, "step": 12790 }, { "distill_loss": 0.40078526735305786, "epoch": 4.266177451634423, "step": 12790 }, { "epoch": 4.266177451634423, "ref_ce_loss": 0.19047851860523224, "step": 12790 }, { "epoch": 4.266177451634423, "loss": 0.96540367603302, "step": 12790 }, { "ce_loss": 0.22203348577022552, "epoch": 4.266177451634423, "step": 12790 }, { "distill_loss": 0.4080057144165039, "epoch": 4.266177451634423, "step": 12790 }, { "epoch": 4.266177451634423, "ref_ce_loss": 0.22590765357017517, "step": 12790 }, { "epoch": 4.269513008672448, "loss": 0.8961, "step": 12800 }, { "epoch": 4.269513008672448, "grad_norm": 1.797431468963623, "step": 12800 }, { "epoch": 4.269513008672448, "learning_rate": 0.0005125422407007313, "step": 12800 }, { "epoch": 4.269513008672448, "loss": 0.6185550689697266, "step": 12800 }, { "ce_loss": 0.17735739052295685, "epoch": 4.269513008672448, "step": 12800 }, { "distill_loss": 0.24161382019519806, "epoch": 4.269513008672448, "step": 12800 }, { "epoch": 4.269513008672448, "ref_ce_loss": 0.14409980177879333, "step": 12800 }, { "epoch": 4.269513008672448, "loss": 0.7940037250518799, "step": 12800 }, { "ce_loss": 0.2755930721759796, "epoch": 4.269513008672448, "step": 12800 }, { "distill_loss": 0.35304659605026245, "epoch": 4.269513008672448, "step": 12800 }, { "epoch": 4.269513008672448, "ref_ce_loss": 0.1652856320142746, "step": 12800 }, { "epoch": 4.272848565710474, "loss": 0.8813, "step": 12810 }, { "epoch": 4.272848565710474, "grad_norm": 2.873664140701294, "step": 12810 }, { "epoch": 4.272848565710474, "learning_rate": 0.0005121275006160918, "step": 12810 }, { "epoch": 4.272848565710474, "loss": 1.2027432918548584, "step": 12810 }, { "ce_loss": 0.24637238681316376, "epoch": 4.272848565710474, "step": 12810 }, { "distill_loss": 0.5167804956436157, "epoch": 4.272848565710474, "step": 12810 }, { "epoch": 4.272848565710474, "ref_ce_loss": 0.19930368661880493, "step": 12810 }, { "epoch": 4.272848565710474, "loss": 0.8857278227806091, "step": 12810 }, { "ce_loss": 0.19141751527786255, "epoch": 4.272848565710474, "step": 12810 }, { "distill_loss": 0.424816370010376, "epoch": 4.272848565710474, "step": 12810 }, { "epoch": 4.272848565710474, "ref_ce_loss": 0.16012920439243317, "step": 12810 }, { "epoch": 4.276184122748499, "loss": 0.8761, "step": 12820 }, { "epoch": 4.276184122748499, "grad_norm": 2.463259220123291, "step": 12820 }, { "epoch": 4.276184122748499, "learning_rate": 0.0005117126296665935, "step": 12820 }, { "epoch": 4.276184122748499, "loss": 0.8840368986129761, "step": 12820 }, { "ce_loss": 0.23989543318748474, "epoch": 4.276184122748499, "step": 12820 }, { "distill_loss": 0.35038143396377563, "epoch": 4.276184122748499, "step": 12820 }, { "epoch": 4.276184122748499, "ref_ce_loss": 0.1598270982503891, "step": 12820 }, { "epoch": 4.276184122748499, "loss": 0.6414638161659241, "step": 12820 }, { "ce_loss": 0.1843615025281906, "epoch": 4.276184122748499, "step": 12820 }, { "distill_loss": 0.2808043956756592, "epoch": 4.276184122748499, "step": 12820 }, { "epoch": 4.276184122748499, "ref_ce_loss": 0.13928772509098053, "step": 12820 }, { "epoch": 4.2795196797865245, "loss": 0.8466, "step": 12830 }, { "epoch": 4.2795196797865245, "grad_norm": 1.7067866325378418, "step": 12830 }, { "epoch": 4.2795196797865245, "learning_rate": 0.0005112976283364358, "step": 12830 }, { "epoch": 4.2795196797865245, "loss": 0.7968407869338989, "step": 12830 }, { "ce_loss": 0.20776928961277008, "epoch": 4.2795196797865245, "step": 12830 }, { "distill_loss": 0.2952131927013397, "epoch": 4.2795196797865245, "step": 12830 }, { "epoch": 4.2795196797865245, "ref_ce_loss": 0.21513982117176056, "step": 12830 }, { "epoch": 4.2795196797865245, "loss": 1.7987573146820068, "step": 12830 }, { "ce_loss": 0.2212960422039032, "epoch": 4.2795196797865245, "step": 12830 }, { "distill_loss": 0.34988850355148315, "epoch": 4.2795196797865245, "step": 12830 }, { "epoch": 4.2795196797865245, "ref_ce_loss": 0.17741668224334717, "step": 12830 }, { "epoch": 4.28285523682455, "loss": 0.8365, "step": 12840 }, { "epoch": 4.28285523682455, "grad_norm": 2.650939464569092, "step": 12840 }, { "epoch": 4.28285523682455, "learning_rate": 0.0005108824971099697, "step": 12840 }, { "epoch": 4.28285523682455, "loss": 0.819025456905365, "step": 12840 }, { "ce_loss": 0.17640937864780426, "epoch": 4.28285523682455, "step": 12840 }, { "distill_loss": 0.2923857569694519, "epoch": 4.28285523682455, "step": 12840 }, { "epoch": 4.28285523682455, "ref_ce_loss": 0.17533421516418457, "step": 12840 }, { "epoch": 4.28285523682455, "loss": 0.8999745845794678, "step": 12840 }, { "ce_loss": 0.21573689579963684, "epoch": 4.28285523682455, "step": 12840 }, { "distill_loss": 0.43884068727493286, "epoch": 4.28285523682455, "step": 12840 }, { "epoch": 4.28285523682455, "ref_ce_loss": 0.20454314351081848, "step": 12840 }, { "epoch": 4.286190793862575, "loss": 0.8693, "step": 12850 }, { "epoch": 4.286190793862575, "grad_norm": 2.5925934314727783, "step": 12850 }, { "epoch": 4.286190793862575, "learning_rate": 0.0005104672364716979, "step": 12850 }, { "epoch": 4.286190793862575, "loss": 1.0080406665802002, "step": 12850 }, { "ce_loss": 0.2583248019218445, "epoch": 4.286190793862575, "step": 12850 }, { "distill_loss": 0.39177629351615906, "epoch": 4.286190793862575, "step": 12850 }, { "epoch": 4.286190793862575, "ref_ce_loss": 0.2107367366552353, "step": 12850 }, { "epoch": 4.286190793862575, "loss": 1.320812463760376, "step": 12850 }, { "ce_loss": 0.23779240250587463, "epoch": 4.286190793862575, "step": 12850 }, { "distill_loss": 0.35576707124710083, "epoch": 4.286190793862575, "step": 12850 }, { "epoch": 4.286190793862575, "ref_ce_loss": 0.16666685044765472, "step": 12850 }, { "epoch": 4.2895263509006005, "loss": 0.9322, "step": 12860 }, { "epoch": 4.2895263509006005, "grad_norm": 1.8717230558395386, "step": 12860 }, { "epoch": 4.2895263509006005, "learning_rate": 0.0005100518469062745, "step": 12860 }, { "epoch": 4.2895263509006005, "loss": 0.907106339931488, "step": 12860 }, { "ce_loss": 0.26588577032089233, "epoch": 4.2895263509006005, "step": 12860 }, { "distill_loss": 0.4072806239128113, "epoch": 4.2895263509006005, "step": 12860 }, { "epoch": 4.2895263509006005, "ref_ce_loss": 0.19155624508857727, "step": 12860 }, { "epoch": 4.2895263509006005, "loss": 1.1353204250335693, "step": 12860 }, { "ce_loss": 0.308605432510376, "epoch": 4.2895263509006005, "step": 12860 }, { "distill_loss": 0.45205217599868774, "epoch": 4.2895263509006005, "step": 12860 }, { "epoch": 4.2895263509006005, "ref_ce_loss": 0.22317735850811005, "step": 12860 }, { "epoch": 4.292861907938626, "loss": 0.895, "step": 12870 }, { "epoch": 4.292861907938626, "grad_norm": 1.8743352890014648, "step": 12870 }, { "epoch": 4.292861907938626, "learning_rate": 0.0005096363288985035, "step": 12870 }, { "epoch": 4.292861907938626, "loss": 0.9951395988464355, "step": 12870 }, { "ce_loss": 0.26687008142471313, "epoch": 4.292861907938626, "step": 12870 }, { "distill_loss": 0.44366684556007385, "epoch": 4.292861907938626, "step": 12870 }, { "epoch": 4.292861907938626, "ref_ce_loss": 0.16680413484573364, "step": 12870 }, { "epoch": 4.292861907938626, "loss": 0.9308007955551147, "step": 12870 }, { "ce_loss": 0.21052157878875732, "epoch": 4.292861907938626, "step": 12870 }, { "distill_loss": 0.3582399785518646, "epoch": 4.292861907938626, "step": 12870 }, { "epoch": 4.292861907938626, "ref_ce_loss": 0.18249179422855377, "step": 12870 }, { "epoch": 4.296197464976651, "loss": 0.8788, "step": 12880 }, { "epoch": 4.296197464976651, "grad_norm": 1.6709355115890503, "step": 12880 }, { "epoch": 4.296197464976651, "learning_rate": 0.0005092206829333394, "step": 12880 }, { "epoch": 4.296197464976651, "loss": 0.7861965298652649, "step": 12880 }, { "ce_loss": 0.17194722592830658, "epoch": 4.296197464976651, "step": 12880 }, { "distill_loss": 0.45903676748275757, "epoch": 4.296197464976651, "step": 12880 }, { "epoch": 4.296197464976651, "ref_ce_loss": 0.15497665107250214, "step": 12880 }, { "epoch": 4.296197464976651, "loss": 1.0698442459106445, "step": 12880 }, { "ce_loss": 0.1924717128276825, "epoch": 4.296197464976651, "step": 12880 }, { "distill_loss": 0.3156096339225769, "epoch": 4.296197464976651, "step": 12880 }, { "epoch": 4.296197464976651, "ref_ce_loss": 0.13832615315914154, "step": 12880 }, { "epoch": 4.299533022014677, "loss": 0.9493, "step": 12890 }, { "epoch": 4.299533022014677, "grad_norm": 1.8523396253585815, "step": 12890 }, { "epoch": 4.299533022014677, "learning_rate": 0.0005088049094958858, "step": 12890 }, { "epoch": 4.299533022014677, "loss": 1.0296885967254639, "step": 12890 }, { "ce_loss": 0.22927792370319366, "epoch": 4.299533022014677, "step": 12890 }, { "distill_loss": 0.3084159195423126, "epoch": 4.299533022014677, "step": 12890 }, { "epoch": 4.299533022014677, "ref_ce_loss": 0.1630437821149826, "step": 12890 }, { "epoch": 4.299533022014677, "loss": 0.8998898863792419, "step": 12890 }, { "ce_loss": 0.23149771988391876, "epoch": 4.299533022014677, "step": 12890 }, { "distill_loss": 0.3336685597896576, "epoch": 4.299533022014677, "step": 12890 }, { "epoch": 4.299533022014677, "ref_ce_loss": 0.21322858333587646, "step": 12890 }, { "epoch": 4.302868579052702, "loss": 0.882, "step": 12900 }, { "epoch": 4.302868579052702, "grad_norm": 1.5939126014709473, "step": 12900 }, { "epoch": 4.302868579052702, "learning_rate": 0.0005083890090713949, "step": 12900 }, { "epoch": 4.302868579052702, "loss": 1.1143803596496582, "step": 12900 }, { "ce_loss": 0.2878939211368561, "epoch": 4.302868579052702, "step": 12900 }, { "distill_loss": 0.4585745632648468, "epoch": 4.302868579052702, "step": 12900 }, { "epoch": 4.302868579052702, "ref_ce_loss": 0.2042665034532547, "step": 12900 }, { "epoch": 4.302868579052702, "loss": 0.6540172696113586, "step": 12900 }, { "ce_loss": 0.19547341763973236, "epoch": 4.302868579052702, "step": 12900 }, { "distill_loss": 0.30709272623062134, "epoch": 4.302868579052702, "step": 12900 }, { "epoch": 4.302868579052702, "ref_ce_loss": 0.1513615995645523, "step": 12900 }, { "epoch": 4.306204136090727, "loss": 0.863, "step": 12910 }, { "epoch": 4.306204136090727, "grad_norm": 3.067009687423706, "step": 12910 }, { "epoch": 4.306204136090727, "learning_rate": 0.0005079729821452671, "step": 12910 }, { "epoch": 4.306204136090727, "loss": 0.7821261286735535, "step": 12910 }, { "ce_loss": 0.22335682809352875, "epoch": 4.306204136090727, "step": 12910 }, { "distill_loss": 0.3630492091178894, "epoch": 4.306204136090727, "step": 12910 }, { "epoch": 4.306204136090727, "ref_ce_loss": 0.19557523727416992, "step": 12910 }, { "epoch": 4.306204136090727, "loss": 0.8023974299430847, "step": 12910 }, { "ce_loss": 0.19819621741771698, "epoch": 4.306204136090727, "step": 12910 }, { "distill_loss": 0.36896613240242004, "epoch": 4.306204136090727, "step": 12910 }, { "epoch": 4.306204136090727, "ref_ce_loss": 0.17763593792915344, "step": 12910 }, { "epoch": 4.309539693128753, "loss": 0.8432, "step": 12920 }, { "epoch": 4.309539693128753, "grad_norm": 1.4734580516815186, "step": 12920 }, { "epoch": 4.309539693128753, "learning_rate": 0.000507556829203051, "step": 12920 }, { "epoch": 4.309539693128753, "loss": 0.9618943929672241, "step": 12920 }, { "ce_loss": 0.2590271532535553, "epoch": 4.309539693128753, "step": 12920 }, { "distill_loss": 0.3850080966949463, "epoch": 4.309539693128753, "step": 12920 }, { "epoch": 4.309539693128753, "ref_ce_loss": 0.16762995719909668, "step": 12920 }, { "epoch": 4.309539693128753, "loss": 0.6719835996627808, "step": 12920 }, { "ce_loss": 0.1529945433139801, "epoch": 4.309539693128753, "step": 12920 }, { "distill_loss": 0.3388926088809967, "epoch": 4.309539693128753, "step": 12920 }, { "epoch": 4.309539693128753, "ref_ce_loss": 0.1480969339609146, "step": 12920 }, { "epoch": 4.312875250166778, "loss": 0.9353, "step": 12930 }, { "epoch": 4.312875250166778, "grad_norm": 1.7050225734710693, "step": 12930 }, { "epoch": 4.312875250166778, "learning_rate": 0.0005071405507304414, "step": 12930 }, { "epoch": 4.312875250166778, "loss": 1.1230779886245728, "step": 12930 }, { "ce_loss": 0.19990414381027222, "epoch": 4.312875250166778, "step": 12930 }, { "distill_loss": 0.3488672971725464, "epoch": 4.312875250166778, "step": 12930 }, { "epoch": 4.312875250166778, "ref_ce_loss": 0.1832718402147293, "step": 12930 }, { "epoch": 4.312875250166778, "loss": 0.8064915537834167, "step": 12930 }, { "ce_loss": 0.23185153305530548, "epoch": 4.312875250166778, "step": 12930 }, { "distill_loss": 0.330584317445755, "epoch": 4.312875250166778, "step": 12930 }, { "epoch": 4.312875250166778, "ref_ce_loss": 0.22238650918006897, "step": 12930 }, { "epoch": 4.316210807204803, "loss": 0.8709, "step": 12940 }, { "epoch": 4.316210807204803, "grad_norm": 3.6646835803985596, "step": 12940 }, { "epoch": 4.316210807204803, "learning_rate": 0.0005067241472132805, "step": 12940 }, { "epoch": 4.316210807204803, "loss": 0.8757513761520386, "step": 12940 }, { "ce_loss": 0.2661345601081848, "epoch": 4.316210807204803, "step": 12940 }, { "distill_loss": 0.4074326157569885, "epoch": 4.316210807204803, "step": 12940 }, { "epoch": 4.316210807204803, "ref_ce_loss": 0.2019917368888855, "step": 12940 }, { "epoch": 4.316210807204803, "loss": 0.8526966571807861, "step": 12940 }, { "ce_loss": 0.22637148201465607, "epoch": 4.316210807204803, "step": 12940 }, { "distill_loss": 0.37172484397888184, "epoch": 4.316210807204803, "step": 12940 }, { "epoch": 4.316210807204803, "ref_ce_loss": 0.18659645318984985, "step": 12940 }, { "epoch": 4.319546364242829, "loss": 0.9024, "step": 12950 }, { "epoch": 4.319546364242829, "grad_norm": 2.06301212310791, "step": 12950 }, { "epoch": 4.319546364242829, "learning_rate": 0.0005063076191375556, "step": 12950 }, { "epoch": 4.319546364242829, "loss": 0.8412178754806519, "step": 12950 }, { "ce_loss": 0.20359109342098236, "epoch": 4.319546364242829, "step": 12950 }, { "distill_loss": 0.30335208773612976, "epoch": 4.319546364242829, "step": 12950 }, { "epoch": 4.319546364242829, "ref_ce_loss": 0.16786359250545502, "step": 12950 }, { "epoch": 4.319546364242829, "loss": 0.6845080852508545, "step": 12950 }, { "ce_loss": 0.1543601006269455, "epoch": 4.319546364242829, "step": 12950 }, { "distill_loss": 0.3259199559688568, "epoch": 4.319546364242829, "step": 12950 }, { "epoch": 4.319546364242829, "ref_ce_loss": 0.15403856337070465, "step": 12950 }, { "epoch": 4.322881921280854, "loss": 0.831, "step": 12960 }, { "epoch": 4.322881921280854, "grad_norm": 1.7822954654693604, "step": 12960 }, { "epoch": 4.322881921280854, "learning_rate": 0.0005058909669894002, "step": 12960 }, { "epoch": 4.322881921280854, "loss": 1.1513652801513672, "step": 12960 }, { "ce_loss": 0.2275664210319519, "epoch": 4.322881921280854, "step": 12960 }, { "distill_loss": 0.3893887400627136, "epoch": 4.322881921280854, "step": 12960 }, { "epoch": 4.322881921280854, "ref_ce_loss": 0.16971373558044434, "step": 12960 }, { "epoch": 4.322881921280854, "loss": 0.8988643288612366, "step": 12960 }, { "ce_loss": 0.26751142740249634, "epoch": 4.322881921280854, "step": 12960 }, { "distill_loss": 0.4451296329498291, "epoch": 4.322881921280854, "step": 12960 }, { "epoch": 4.322881921280854, "ref_ce_loss": 0.13988979160785675, "step": 12960 }, { "epoch": 4.326217478318879, "loss": 0.9181, "step": 12970 }, { "epoch": 4.326217478318879, "grad_norm": 1.441675066947937, "step": 12970 }, { "epoch": 4.326217478318879, "learning_rate": 0.0005054741912550918, "step": 12970 }, { "epoch": 4.326217478318879, "loss": 0.9492163062095642, "step": 12970 }, { "ce_loss": 0.18496477603912354, "epoch": 4.326217478318879, "step": 12970 }, { "distill_loss": 0.5061658620834351, "epoch": 4.326217478318879, "step": 12970 }, { "epoch": 4.326217478318879, "ref_ce_loss": 0.181679368019104, "step": 12970 }, { "epoch": 4.326217478318879, "loss": 0.9847216606140137, "step": 12970 }, { "ce_loss": 0.15506592392921448, "epoch": 4.326217478318879, "step": 12970 }, { "distill_loss": 0.3762945830821991, "epoch": 4.326217478318879, "step": 12970 }, { "epoch": 4.326217478318879, "ref_ce_loss": 0.19500184059143066, "step": 12970 }, { "epoch": 4.329553035356905, "loss": 0.8201, "step": 12980 }, { "epoch": 4.329553035356905, "grad_norm": 2.2929203510284424, "step": 12980 }, { "epoch": 4.329553035356905, "learning_rate": 0.0005050572924210528, "step": 12980 }, { "epoch": 4.329553035356905, "loss": 1.3470618724822998, "step": 12980 }, { "ce_loss": 0.2621327340602875, "epoch": 4.329553035356905, "step": 12980 }, { "distill_loss": 0.4442044794559479, "epoch": 4.329553035356905, "step": 12980 }, { "epoch": 4.329553035356905, "ref_ce_loss": 0.19769565761089325, "step": 12980 }, { "epoch": 4.329553035356905, "loss": 0.931287407875061, "step": 12980 }, { "ce_loss": 0.23502033948898315, "epoch": 4.329553035356905, "step": 12980 }, { "distill_loss": 0.39018332958221436, "epoch": 4.329553035356905, "step": 12980 }, { "epoch": 4.329553035356905, "ref_ce_loss": 0.17888310551643372, "step": 12980 }, { "epoch": 4.33288859239493, "loss": 0.9278, "step": 12990 }, { "epoch": 4.33288859239493, "grad_norm": 2.8423891067504883, "step": 12990 }, { "epoch": 4.33288859239493, "learning_rate": 0.0005046402709738489, "step": 12990 }, { "epoch": 4.33288859239493, "loss": 1.4319112300872803, "step": 12990 }, { "ce_loss": 0.20456096529960632, "epoch": 4.33288859239493, "step": 12990 }, { "distill_loss": 0.3574753999710083, "epoch": 4.33288859239493, "step": 12990 }, { "epoch": 4.33288859239493, "ref_ce_loss": 0.16366833448410034, "step": 12990 }, { "epoch": 4.33288859239493, "loss": 0.6514447331428528, "step": 12990 }, { "ce_loss": 0.16469235718250275, "epoch": 4.33288859239493, "step": 12990 }, { "distill_loss": 0.2948162257671356, "epoch": 4.33288859239493, "step": 12990 }, { "epoch": 4.33288859239493, "ref_ce_loss": 0.14291289448738098, "step": 12990 }, { "epoch": 4.336224149432955, "loss": 0.9135, "step": 13000 }, { "epoch": 4.336224149432955, "grad_norm": 1.7230727672576904, "step": 13000 }, { "epoch": 4.336224149432955, "learning_rate": 0.0005042231274001891, "step": 13000 }, { "epoch": 4.336224149432955, "loss": 0.6460437774658203, "step": 13000 }, { "ce_loss": 0.15081174671649933, "epoch": 4.336224149432955, "step": 13000 }, { "distill_loss": 0.3070511519908905, "epoch": 4.336224149432955, "step": 13000 }, { "epoch": 4.336224149432955, "ref_ce_loss": 0.1417672336101532, "step": 13000 }, { "epoch": 4.336224149432955, "loss": 0.9081906080245972, "step": 13000 }, { "ce_loss": 0.23736201226711273, "epoch": 4.336224149432955, "step": 13000 }, { "distill_loss": 0.3211541175842285, "epoch": 4.336224149432955, "step": 13000 }, { "epoch": 4.336224149432955, "ref_ce_loss": 0.18557564914226532, "step": 13000 }, { "epoch": 4.339559706470981, "loss": 0.7863, "step": 13010 }, { "epoch": 4.339559706470981, "grad_norm": 2.5761780738830566, "step": 13010 }, { "epoch": 4.339559706470981, "learning_rate": 0.0005038058621869246, "step": 13010 }, { "epoch": 4.339559706470981, "loss": 0.7194321155548096, "step": 13010 }, { "ce_loss": 0.1710882931947708, "epoch": 4.339559706470981, "step": 13010 }, { "distill_loss": 0.35647034645080566, "epoch": 4.339559706470981, "step": 13010 }, { "epoch": 4.339559706470981, "ref_ce_loss": 0.19168949127197266, "step": 13010 }, { "epoch": 4.339559706470981, "loss": 0.8294156193733215, "step": 13010 }, { "ce_loss": 0.23769475519657135, "epoch": 4.339559706470981, "step": 13010 }, { "distill_loss": 0.3103196620941162, "epoch": 4.339559706470981, "step": 13010 }, { "epoch": 4.339559706470981, "ref_ce_loss": 0.1724769026041031, "step": 13010 }, { "epoch": 4.342895263509006, "loss": 0.9211, "step": 13020 }, { "epoch": 4.342895263509006, "grad_norm": 1.8360869884490967, "step": 13020 }, { "epoch": 4.342895263509006, "learning_rate": 0.000503388475821049, "step": 13020 }, { "epoch": 4.342895263509006, "loss": 0.8466196060180664, "step": 13020 }, { "ce_loss": 0.2331269234418869, "epoch": 4.342895263509006, "step": 13020 }, { "distill_loss": 0.367826372385025, "epoch": 4.342895263509006, "step": 13020 }, { "epoch": 4.342895263509006, "ref_ce_loss": 0.19748416543006897, "step": 13020 }, { "epoch": 4.342895263509006, "loss": 0.7141017317771912, "step": 13020 }, { "ce_loss": 0.1875707358121872, "epoch": 4.342895263509006, "step": 13020 }, { "distill_loss": 0.33911678194999695, "epoch": 4.342895263509006, "step": 13020 }, { "epoch": 4.342895263509006, "ref_ce_loss": 0.15469998121261597, "step": 13020 }, { "epoch": 4.3462308205470315, "loss": 0.8646, "step": 13030 }, { "epoch": 4.3462308205470315, "grad_norm": 2.126154661178589, "step": 13030 }, { "epoch": 4.3462308205470315, "learning_rate": 0.0005029709687896972, "step": 13030 }, { "epoch": 4.3462308205470315, "loss": 0.8564475178718567, "step": 13030 }, { "ce_loss": 0.2630447447299957, "epoch": 4.3462308205470315, "step": 13030 }, { "distill_loss": 0.3674970269203186, "epoch": 4.3462308205470315, "step": 13030 }, { "epoch": 4.3462308205470315, "ref_ce_loss": 0.18027503788471222, "step": 13030 }, { "epoch": 4.3462308205470315, "loss": 0.8201711177825928, "step": 13030 }, { "ce_loss": 0.2565067410469055, "epoch": 4.3462308205470315, "step": 13030 }, { "distill_loss": 0.41150057315826416, "epoch": 4.3462308205470315, "step": 13030 }, { "epoch": 4.3462308205470315, "ref_ce_loss": 0.15197144448757172, "step": 13030 }, { "epoch": 4.349566377585057, "loss": 0.92, "step": 13040 }, { "epoch": 4.349566377585057, "grad_norm": 1.716796636581421, "step": 13040 }, { "epoch": 4.349566377585057, "learning_rate": 0.0005025533415801446, "step": 13040 }, { "epoch": 4.349566377585057, "loss": 0.7857934832572937, "step": 13040 }, { "ce_loss": 0.20469725131988525, "epoch": 4.349566377585057, "step": 13040 }, { "distill_loss": 0.41409963369369507, "epoch": 4.349566377585057, "step": 13040 }, { "epoch": 4.349566377585057, "ref_ce_loss": 0.1668863147497177, "step": 13040 }, { "epoch": 4.349566377585057, "loss": 1.0408332347869873, "step": 13040 }, { "ce_loss": 0.245316743850708, "epoch": 4.349566377585057, "step": 13040 }, { "distill_loss": 0.34827446937561035, "epoch": 4.349566377585057, "step": 13040 }, { "epoch": 4.349566377585057, "ref_ce_loss": 0.1869659125804901, "step": 13040 }, { "epoch": 4.352901934623082, "loss": 0.8889, "step": 13050 }, { "epoch": 4.352901934623082, "grad_norm": 2.0432233810424805, "step": 13050 }, { "epoch": 4.352901934623082, "learning_rate": 0.000502135594679807, "step": 13050 }, { "epoch": 4.352901934623082, "loss": 1.0115782022476196, "step": 13050 }, { "ce_loss": 0.2615415155887604, "epoch": 4.352901934623082, "step": 13050 }, { "distill_loss": 0.42382967472076416, "epoch": 4.352901934623082, "step": 13050 }, { "epoch": 4.352901934623082, "ref_ce_loss": 0.19795654714107513, "step": 13050 }, { "epoch": 4.352901934623082, "loss": 0.7003949284553528, "step": 13050 }, { "ce_loss": 0.14454393088817596, "epoch": 4.352901934623082, "step": 13050 }, { "distill_loss": 0.34949007630348206, "epoch": 4.352901934623082, "step": 13050 }, { "epoch": 4.352901934623082, "ref_ce_loss": 0.17572638392448425, "step": 13050 }, { "epoch": 4.3562374916611075, "loss": 0.8138, "step": 13060 }, { "epoch": 4.3562374916611075, "grad_norm": 1.5323294401168823, "step": 13060 }, { "epoch": 4.3562374916611075, "learning_rate": 0.0005017177285762404, "step": 13060 }, { "epoch": 4.3562374916611075, "loss": 0.7875097990036011, "step": 13060 }, { "ce_loss": 0.19186387956142426, "epoch": 4.3562374916611075, "step": 13060 }, { "distill_loss": 0.33928823471069336, "epoch": 4.3562374916611075, "step": 13060 }, { "epoch": 4.3562374916611075, "ref_ce_loss": 0.14275150001049042, "step": 13060 }, { "epoch": 4.3562374916611075, "loss": 0.9309220910072327, "step": 13060 }, { "ce_loss": 0.28330251574516296, "epoch": 4.3562374916611075, "step": 13060 }, { "distill_loss": 0.39531224966049194, "epoch": 4.3562374916611075, "step": 13060 }, { "epoch": 4.3562374916611075, "ref_ce_loss": 0.1772787719964981, "step": 13060 }, { "epoch": 4.359573048699133, "loss": 0.8571, "step": 13070 }, { "epoch": 4.359573048699133, "grad_norm": 2.4798803329467773, "step": 13070 }, { "epoch": 4.359573048699133, "learning_rate": 0.0005012997437571392, "step": 13070 }, { "epoch": 4.359573048699133, "loss": 0.854665994644165, "step": 13070 }, { "ce_loss": 0.27411356568336487, "epoch": 4.359573048699133, "step": 13070 }, { "distill_loss": 0.3180094361305237, "epoch": 4.359573048699133, "step": 13070 }, { "epoch": 4.359573048699133, "ref_ce_loss": 0.19514678418636322, "step": 13070 }, { "epoch": 4.359573048699133, "loss": 0.8343544602394104, "step": 13070 }, { "ce_loss": 0.23687444627285004, "epoch": 4.359573048699133, "step": 13070 }, { "distill_loss": 0.36419615149497986, "epoch": 4.359573048699133, "step": 13070 }, { "epoch": 4.359573048699133, "ref_ce_loss": 0.16759879887104034, "step": 13070 }, { "epoch": 4.362908605737158, "loss": 0.9007, "step": 13080 }, { "epoch": 4.362908605737158, "grad_norm": 2.274061679840088, "step": 13080 }, { "epoch": 4.362908605737158, "learning_rate": 0.0005008816407103368, "step": 13080 }, { "epoch": 4.362908605737158, "loss": 0.7848496437072754, "step": 13080 }, { "ce_loss": 0.23218511044979095, "epoch": 4.362908605737158, "step": 13080 }, { "distill_loss": 0.3505823016166687, "epoch": 4.362908605737158, "step": 13080 }, { "epoch": 4.362908605737158, "ref_ce_loss": 0.20187018811702728, "step": 13080 }, { "epoch": 4.362908605737158, "loss": 0.7971723079681396, "step": 13080 }, { "ce_loss": 0.1814272552728653, "epoch": 4.362908605737158, "step": 13080 }, { "distill_loss": 0.38159671425819397, "epoch": 4.362908605737158, "step": 13080 }, { "epoch": 4.362908605737158, "ref_ce_loss": 0.11631757766008377, "step": 13080 }, { "epoch": 4.366244162775184, "loss": 0.8833, "step": 13090 }, { "epoch": 4.366244162775184, "grad_norm": 3.208780288696289, "step": 13090 }, { "epoch": 4.366244162775184, "learning_rate": 0.0005004634199238042, "step": 13090 }, { "epoch": 4.366244162775184, "loss": 0.9118923544883728, "step": 13090 }, { "ce_loss": 0.24683193862438202, "epoch": 4.366244162775184, "step": 13090 }, { "distill_loss": 0.39711278676986694, "epoch": 4.366244162775184, "step": 13090 }, { "epoch": 4.366244162775184, "ref_ce_loss": 0.1471678465604782, "step": 13090 }, { "epoch": 4.366244162775184, "loss": 0.9972525835037231, "step": 13090 }, { "ce_loss": 0.2438964694738388, "epoch": 4.366244162775184, "step": 13090 }, { "distill_loss": 0.5098133683204651, "epoch": 4.366244162775184, "step": 13090 }, { "epoch": 4.366244162775184, "ref_ce_loss": 0.19950233399868011, "step": 13090 }, { "epoch": 4.369579719813209, "loss": 0.8771, "step": 13100 }, { "epoch": 4.369579719813209, "grad_norm": 1.8489611148834229, "step": 13100 }, { "epoch": 4.369579719813209, "learning_rate": 0.0005000450818856503, "step": 13100 }, { "epoch": 4.369579719813209, "loss": 0.9278559684753418, "step": 13100 }, { "ce_loss": 0.2684767544269562, "epoch": 4.369579719813209, "step": 13100 }, { "distill_loss": 0.41395726799964905, "epoch": 4.369579719813209, "step": 13100 }, { "epoch": 4.369579719813209, "ref_ce_loss": 0.24524687230587006, "step": 13100 }, { "epoch": 4.369579719813209, "loss": 0.7405310869216919, "step": 13100 }, { "ce_loss": 0.18698789179325104, "epoch": 4.369579719813209, "step": 13100 }, { "distill_loss": 0.33907848596572876, "epoch": 4.369579719813209, "step": 13100 }, { "epoch": 4.369579719813209, "ref_ce_loss": 0.164516419172287, "step": 13100 }, { "epoch": 4.372915276851234, "loss": 0.9152, "step": 13110 }, { "epoch": 4.372915276851234, "grad_norm": 2.1648755073547363, "step": 13110 }, { "epoch": 4.372915276851234, "learning_rate": 0.0004996266270841207, "step": 13110 }, { "epoch": 4.372915276851234, "loss": 0.8187690377235413, "step": 13110 }, { "ce_loss": 0.22833628952503204, "epoch": 4.372915276851234, "step": 13110 }, { "distill_loss": 0.4345163404941559, "epoch": 4.372915276851234, "step": 13110 }, { "epoch": 4.372915276851234, "ref_ce_loss": 0.1556825041770935, "step": 13110 }, { "epoch": 4.372915276851234, "loss": 1.1789733171463013, "step": 13110 }, { "ce_loss": 0.217407688498497, "epoch": 4.372915276851234, "step": 13110 }, { "distill_loss": 0.4212278127670288, "epoch": 4.372915276851234, "step": 13110 }, { "epoch": 4.372915276851234, "ref_ce_loss": 0.1647656112909317, "step": 13110 }, { "epoch": 4.37625083388926, "loss": 0.9027, "step": 13120 }, { "epoch": 4.37625083388926, "grad_norm": 3.0077097415924072, "step": 13120 }, { "epoch": 4.37625083388926, "learning_rate": 0.0004992080560075969, "step": 13120 }, { "epoch": 4.37625083388926, "loss": 0.6421822309494019, "step": 13120 }, { "ce_loss": 0.16113825142383575, "epoch": 4.37625083388926, "step": 13120 }, { "distill_loss": 0.29631081223487854, "epoch": 4.37625083388926, "step": 13120 }, { "epoch": 4.37625083388926, "ref_ce_loss": 0.1408051699399948, "step": 13120 }, { "epoch": 4.37625083388926, "loss": 1.1299493312835693, "step": 13120 }, { "ce_loss": 0.1643940508365631, "epoch": 4.37625083388926, "step": 13120 }, { "distill_loss": 0.34408673644065857, "epoch": 4.37625083388926, "step": 13120 }, { "epoch": 4.37625083388926, "ref_ce_loss": 0.17624755203723907, "step": 13120 }, { "epoch": 4.379586390927285, "loss": 0.8513, "step": 13130 }, { "epoch": 4.379586390927285, "grad_norm": 1.3924531936645508, "step": 13130 }, { "epoch": 4.379586390927285, "learning_rate": 0.0004987893691445965, "step": 13130 }, { "epoch": 4.379586390927285, "loss": 0.9080301523208618, "step": 13130 }, { "ce_loss": 0.2076728641986847, "epoch": 4.379586390927285, "step": 13130 }, { "distill_loss": 0.4430268704891205, "epoch": 4.379586390927285, "step": 13130 }, { "epoch": 4.379586390927285, "ref_ce_loss": 0.147035151720047, "step": 13130 }, { "epoch": 4.379586390927285, "loss": 0.7841320633888245, "step": 13130 }, { "ce_loss": 0.2148856371641159, "epoch": 4.379586390927285, "step": 13130 }, { "distill_loss": 0.3555300831794739, "epoch": 4.379586390927285, "step": 13130 }, { "epoch": 4.379586390927285, "ref_ce_loss": 0.21340398490428925, "step": 13130 }, { "epoch": 4.38292194796531, "loss": 0.8792, "step": 13140 }, { "epoch": 4.38292194796531, "grad_norm": 1.5912320613861084, "step": 13140 }, { "epoch": 4.38292194796531, "learning_rate": 0.0004983705669837721, "step": 13140 }, { "epoch": 4.38292194796531, "loss": 0.8423344492912292, "step": 13140 }, { "ce_loss": 0.20540697872638702, "epoch": 4.38292194796531, "step": 13140 }, { "distill_loss": 0.33919230103492737, "epoch": 4.38292194796531, "step": 13140 }, { "epoch": 4.38292194796531, "ref_ce_loss": 0.22945822775363922, "step": 13140 }, { "epoch": 4.38292194796531, "loss": 0.7879174947738647, "step": 13140 }, { "ce_loss": 0.26239603757858276, "epoch": 4.38292194796531, "step": 13140 }, { "distill_loss": 0.3321240544319153, "epoch": 4.38292194796531, "step": 13140 }, { "epoch": 4.38292194796531, "ref_ce_loss": 0.15325109660625458, "step": 13140 }, { "epoch": 4.386257505003336, "loss": 0.8441, "step": 13150 }, { "epoch": 4.386257505003336, "grad_norm": 2.230607509613037, "step": 13150 }, { "epoch": 4.386257505003336, "learning_rate": 0.0004979516500139109, "step": 13150 }, { "epoch": 4.386257505003336, "loss": 0.8524758815765381, "step": 13150 }, { "ce_loss": 0.26263123750686646, "epoch": 4.386257505003336, "step": 13150 }, { "distill_loss": 0.3410416543483734, "epoch": 4.386257505003336, "step": 13150 }, { "epoch": 4.386257505003336, "ref_ce_loss": 0.2094476819038391, "step": 13150 }, { "epoch": 4.386257505003336, "loss": 0.9680402278900146, "step": 13150 }, { "ce_loss": 0.251897931098938, "epoch": 4.386257505003336, "step": 13150 }, { "distill_loss": 0.42440447211265564, "epoch": 4.386257505003336, "step": 13150 }, { "epoch": 4.386257505003336, "ref_ce_loss": 0.18267853558063507, "step": 13150 }, { "epoch": 4.389593062041361, "loss": 0.921, "step": 13160 }, { "epoch": 4.389593062041361, "grad_norm": 3.3034005165100098, "step": 13160 }, { "epoch": 4.389593062041361, "learning_rate": 0.0004975326187239342, "step": 13160 }, { "epoch": 4.389593062041361, "loss": 1.1359477043151855, "step": 13160 }, { "ce_loss": 0.15224169194698334, "epoch": 4.389593062041361, "step": 13160 }, { "distill_loss": 0.28388071060180664, "epoch": 4.389593062041361, "step": 13160 }, { "epoch": 4.389593062041361, "ref_ce_loss": 0.17234744131565094, "step": 13160 }, { "epoch": 4.389593062041361, "loss": 0.8779703378677368, "step": 13160 }, { "ce_loss": 0.2350139617919922, "epoch": 4.389593062041361, "step": 13160 }, { "distill_loss": 0.37165409326553345, "epoch": 4.389593062041361, "step": 13160 }, { "epoch": 4.389593062041361, "ref_ce_loss": 0.21345072984695435, "step": 13160 }, { "epoch": 4.392928619079386, "loss": 0.9219, "step": 13170 }, { "epoch": 4.392928619079386, "grad_norm": 1.6124509572982788, "step": 13170 }, { "epoch": 4.392928619079386, "learning_rate": 0.0004971134736028966, "step": 13170 }, { "epoch": 4.392928619079386, "loss": 0.7697380185127258, "step": 13170 }, { "ce_loss": 0.2196720391511917, "epoch": 4.392928619079386, "step": 13170 }, { "distill_loss": 0.3732682764530182, "epoch": 4.392928619079386, "step": 13170 }, { "epoch": 4.392928619079386, "ref_ce_loss": 0.17663975059986115, "step": 13170 }, { "epoch": 4.392928619079386, "loss": 0.8623855113983154, "step": 13170 }, { "ce_loss": 0.23376327753067017, "epoch": 4.392928619079386, "step": 13170 }, { "distill_loss": 0.38788342475891113, "epoch": 4.392928619079386, "step": 13170 }, { "epoch": 4.392928619079386, "ref_ce_loss": 0.18725626170635223, "step": 13170 }, { "epoch": 4.396264176117412, "loss": 0.8611, "step": 13180 }, { "epoch": 4.396264176117412, "grad_norm": 2.8501627445220947, "step": 13180 }, { "epoch": 4.396264176117412, "learning_rate": 0.0004966942151399853, "step": 13180 }, { "epoch": 4.396264176117412, "loss": 0.5740664005279541, "step": 13180 }, { "ce_loss": 0.15579481422901154, "epoch": 4.396264176117412, "step": 13180 }, { "distill_loss": 0.28095322847366333, "epoch": 4.396264176117412, "step": 13180 }, { "epoch": 4.396264176117412, "ref_ce_loss": 0.1369730681180954, "step": 13180 }, { "epoch": 4.396264176117412, "loss": 0.8180100321769714, "step": 13180 }, { "ce_loss": 0.23740574717521667, "epoch": 4.396264176117412, "step": 13180 }, { "distill_loss": 0.32309386134147644, "epoch": 4.396264176117412, "step": 13180 }, { "epoch": 4.396264176117412, "ref_ce_loss": 0.18247680366039276, "step": 13180 }, { "epoch": 4.399599733155437, "loss": 0.8637, "step": 13190 }, { "epoch": 4.399599733155437, "grad_norm": 1.901659607887268, "step": 13190 }, { "epoch": 4.399599733155437, "learning_rate": 0.0004962748438245202, "step": 13190 }, { "epoch": 4.399599733155437, "loss": 0.8824871778488159, "step": 13190 }, { "ce_loss": 0.24955017864704132, "epoch": 4.399599733155437, "step": 13190 }, { "distill_loss": 0.3519171178340912, "epoch": 4.399599733155437, "step": 13190 }, { "epoch": 4.399599733155437, "ref_ce_loss": 0.18414409458637238, "step": 13190 }, { "epoch": 4.399599733155437, "loss": 0.7215918302536011, "step": 13190 }, { "ce_loss": 0.20866245031356812, "epoch": 4.399599733155437, "step": 13190 }, { "distill_loss": 0.3040982782840729, "epoch": 4.399599733155437, "step": 13190 }, { "epoch": 4.399599733155437, "ref_ce_loss": 0.15952186286449432, "step": 13190 }, { "epoch": 4.402935290193462, "loss": 0.8484, "step": 13200 }, { "epoch": 4.402935290193462, "grad_norm": 1.2749011516571045, "step": 13200 }, { "epoch": 4.402935290193462, "learning_rate": 0.0004958553601459528, "step": 13200 }, { "epoch": 4.402935290193462, "loss": 0.7972582578659058, "step": 13200 }, { "ce_loss": 0.24628478288650513, "epoch": 4.402935290193462, "step": 13200 }, { "distill_loss": 0.2889348864555359, "epoch": 4.402935290193462, "step": 13200 }, { "epoch": 4.402935290193462, "ref_ce_loss": 0.19017939269542694, "step": 13200 }, { "epoch": 4.402935290193462, "loss": 0.8323944211006165, "step": 13200 }, { "ce_loss": 0.23087507486343384, "epoch": 4.402935290193462, "step": 13200 }, { "distill_loss": 0.39095279574394226, "epoch": 4.402935290193462, "step": 13200 }, { "epoch": 4.402935290193462, "ref_ce_loss": 0.1603068858385086, "step": 13200 }, { "epoch": 4.406270847231488, "loss": 0.8376, "step": 13210 }, { "epoch": 4.406270847231488, "grad_norm": 1.4190902709960938, "step": 13210 }, { "epoch": 4.406270847231488, "learning_rate": 0.0004954357645938657, "step": 13210 }, { "epoch": 4.406270847231488, "loss": 0.9994043707847595, "step": 13210 }, { "ce_loss": 0.30216896533966064, "epoch": 4.406270847231488, "step": 13210 }, { "distill_loss": 0.33214259147644043, "epoch": 4.406270847231488, "step": 13210 }, { "epoch": 4.406270847231488, "ref_ce_loss": 0.1510033756494522, "step": 13210 }, { "epoch": 4.406270847231488, "loss": 0.9063019752502441, "step": 13210 }, { "ce_loss": 0.24156875908374786, "epoch": 4.406270847231488, "step": 13210 }, { "distill_loss": 0.2896386981010437, "epoch": 4.406270847231488, "step": 13210 }, { "epoch": 4.406270847231488, "ref_ce_loss": 0.15842963755130768, "step": 13210 }, { "epoch": 4.409606404269513, "loss": 0.8183, "step": 13220 }, { "epoch": 4.409606404269513, "grad_norm": 2.57871675491333, "step": 13220 }, { "epoch": 4.409606404269513, "learning_rate": 0.0004950160576579717, "step": 13220 }, { "epoch": 4.409606404269513, "loss": 0.8624581098556519, "step": 13220 }, { "ce_loss": 0.2337450236082077, "epoch": 4.409606404269513, "step": 13220 }, { "distill_loss": 0.3577960729598999, "epoch": 4.409606404269513, "step": 13220 }, { "epoch": 4.409606404269513, "ref_ce_loss": 0.16363263130187988, "step": 13220 }, { "epoch": 4.409606404269513, "loss": 0.7249259352684021, "step": 13220 }, { "ce_loss": 0.21705181896686554, "epoch": 4.409606404269513, "step": 13220 }, { "distill_loss": 0.351639986038208, "epoch": 4.409606404269513, "step": 13220 }, { "epoch": 4.409606404269513, "ref_ce_loss": 0.1266530156135559, "step": 13220 }, { "epoch": 4.4129419613075385, "loss": 0.8898, "step": 13230 }, { "epoch": 4.4129419613075385, "grad_norm": 2.350499391555786, "step": 13230 }, { "epoch": 4.4129419613075385, "learning_rate": 0.0004945962398281146, "step": 13230 }, { "epoch": 4.4129419613075385, "loss": 0.9655719995498657, "step": 13230 }, { "ce_loss": 0.3011987805366516, "epoch": 4.4129419613075385, "step": 13230 }, { "distill_loss": 0.34798118472099304, "epoch": 4.4129419613075385, "step": 13230 }, { "epoch": 4.4129419613075385, "ref_ce_loss": 0.18107043206691742, "step": 13230 }, { "epoch": 4.4129419613075385, "loss": 1.0913926362991333, "step": 13230 }, { "ce_loss": 0.33407920598983765, "epoch": 4.4129419613075385, "step": 13230 }, { "distill_loss": 0.39807650446891785, "epoch": 4.4129419613075385, "step": 13230 }, { "epoch": 4.4129419613075385, "ref_ce_loss": 0.22840847074985504, "step": 13230 }, { "epoch": 4.416277518345564, "loss": 0.9181, "step": 13240 }, { "epoch": 4.416277518345564, "grad_norm": 1.8426028490066528, "step": 13240 }, { "epoch": 4.416277518345564, "learning_rate": 0.0004941763115942666, "step": 13240 }, { "epoch": 4.416277518345564, "loss": 0.8157694339752197, "step": 13240 }, { "ce_loss": 0.23468460142612457, "epoch": 4.416277518345564, "step": 13240 }, { "distill_loss": 0.40097329020500183, "epoch": 4.416277518345564, "step": 13240 }, { "epoch": 4.416277518345564, "ref_ce_loss": 0.13971978425979614, "step": 13240 }, { "epoch": 4.416277518345564, "loss": 1.0739431381225586, "step": 13240 }, { "ce_loss": 0.19124160706996918, "epoch": 4.416277518345564, "step": 13240 }, { "distill_loss": 0.33609095215797424, "epoch": 4.416277518345564, "step": 13240 }, { "epoch": 4.416277518345564, "ref_ce_loss": 0.16506049036979675, "step": 13240 }, { "epoch": 4.419613075383589, "loss": 0.9488, "step": 13250 }, { "epoch": 4.419613075383589, "grad_norm": 2.1189777851104736, "step": 13250 }, { "epoch": 4.419613075383589, "learning_rate": 0.0004937562734465292, "step": 13250 }, { "epoch": 4.419613075383589, "loss": 0.8339735269546509, "step": 13250 }, { "ce_loss": 0.23639149963855743, "epoch": 4.419613075383589, "step": 13250 }, { "distill_loss": 0.45191851258277893, "epoch": 4.419613075383589, "step": 13250 }, { "epoch": 4.419613075383589, "ref_ce_loss": 0.14533786475658417, "step": 13250 }, { "epoch": 4.419613075383589, "loss": 0.8060436248779297, "step": 13250 }, { "ce_loss": 0.24148309230804443, "epoch": 4.419613075383589, "step": 13250 }, { "distill_loss": 0.4032564163208008, "epoch": 4.419613075383589, "step": 13250 }, { "epoch": 4.419613075383589, "ref_ce_loss": 0.12805211544036865, "step": 13250 }, { "epoch": 4.4229486324216145, "loss": 0.8565, "step": 13260 }, { "epoch": 4.4229486324216145, "grad_norm": 1.9739803075790405, "step": 13260 }, { "epoch": 4.4229486324216145, "learning_rate": 0.000493336125875132, "step": 13260 }, { "epoch": 4.4229486324216145, "loss": 0.9563655853271484, "step": 13260 }, { "ce_loss": 0.33908629417419434, "epoch": 4.4229486324216145, "step": 13260 }, { "distill_loss": 0.41125985980033875, "epoch": 4.4229486324216145, "step": 13260 }, { "epoch": 4.4229486324216145, "ref_ce_loss": 0.20553144812583923, "step": 13260 }, { "epoch": 4.4229486324216145, "loss": 0.8680077791213989, "step": 13260 }, { "ce_loss": 0.2155313342809677, "epoch": 4.4229486324216145, "step": 13260 }, { "distill_loss": 0.3745126724243164, "epoch": 4.4229486324216145, "step": 13260 }, { "epoch": 4.4229486324216145, "ref_ce_loss": 0.1538851410150528, "step": 13260 }, { "epoch": 4.42628418945964, "loss": 0.899, "step": 13270 }, { "epoch": 4.42628418945964, "grad_norm": 3.459533214569092, "step": 13270 }, { "epoch": 4.42628418945964, "learning_rate": 0.0004929158693704325, "step": 13270 }, { "epoch": 4.42628418945964, "loss": 0.8786895871162415, "step": 13270 }, { "ce_loss": 0.20969292521476746, "epoch": 4.42628418945964, "step": 13270 }, { "distill_loss": 0.3508089482784271, "epoch": 4.42628418945964, "step": 13270 }, { "epoch": 4.42628418945964, "ref_ce_loss": 0.19444763660430908, "step": 13270 }, { "epoch": 4.42628418945964, "loss": 0.8677912354469299, "step": 13270 }, { "ce_loss": 0.21154287457466125, "epoch": 4.42628418945964, "step": 13270 }, { "distill_loss": 0.29949501156806946, "epoch": 4.42628418945964, "step": 13270 }, { "epoch": 4.42628418945964, "ref_ce_loss": 0.1698286235332489, "step": 13270 }, { "epoch": 4.429619746497665, "loss": 0.9023, "step": 13280 }, { "epoch": 4.429619746497665, "grad_norm": 2.0081629753112793, "step": 13280 }, { "epoch": 4.429619746497665, "learning_rate": 0.0004924955044229154, "step": 13280 }, { "epoch": 4.429619746497665, "loss": 0.9574787616729736, "step": 13280 }, { "ce_loss": 0.21764861047267914, "epoch": 4.429619746497665, "step": 13280 }, { "distill_loss": 0.4211297929286957, "epoch": 4.429619746497665, "step": 13280 }, { "epoch": 4.429619746497665, "ref_ce_loss": 0.15459951758384705, "step": 13280 }, { "epoch": 4.429619746497665, "loss": 0.9137391448020935, "step": 13280 }, { "ce_loss": 0.2617412805557251, "epoch": 4.429619746497665, "step": 13280 }, { "distill_loss": 0.4320088326931, "epoch": 4.429619746497665, "step": 13280 }, { "epoch": 4.429619746497665, "ref_ce_loss": 0.17992384731769562, "step": 13280 }, { "epoch": 4.432955303535691, "loss": 0.8723, "step": 13290 }, { "epoch": 4.432955303535691, "grad_norm": 2.0066637992858887, "step": 13290 }, { "epoch": 4.432955303535691, "learning_rate": 0.0004920750315231916, "step": 13290 }, { "epoch": 4.432955303535691, "loss": 0.8396000862121582, "step": 13290 }, { "ce_loss": 0.1335640698671341, "epoch": 4.432955303535691, "step": 13290 }, { "distill_loss": 0.3238210380077362, "epoch": 4.432955303535691, "step": 13290 }, { "epoch": 4.432955303535691, "ref_ce_loss": 0.165268212556839, "step": 13290 }, { "epoch": 4.432955303535691, "loss": 0.8897786140441895, "step": 13290 }, { "ce_loss": 0.19786790013313293, "epoch": 4.432955303535691, "step": 13290 }, { "distill_loss": 0.37308627367019653, "epoch": 4.432955303535691, "step": 13290 }, { "epoch": 4.432955303535691, "ref_ce_loss": 0.13037879765033722, "step": 13290 }, { "epoch": 4.436290860573716, "loss": 0.8879, "step": 13300 }, { "epoch": 4.436290860573716, "grad_norm": 1.530429482460022, "step": 13300 }, { "epoch": 4.436290860573716, "learning_rate": 0.0004916544511619984, "step": 13300 }, { "epoch": 4.436290860573716, "loss": 0.7941538095474243, "step": 13300 }, { "ce_loss": 0.20175831019878387, "epoch": 4.436290860573716, "step": 13300 }, { "distill_loss": 0.34787747263908386, "epoch": 4.436290860573716, "step": 13300 }, { "epoch": 4.436290860573716, "ref_ce_loss": 0.17431721091270447, "step": 13300 }, { "epoch": 4.436290860573716, "loss": 0.7491660714149475, "step": 13300 }, { "ce_loss": 0.22332707047462463, "epoch": 4.436290860573716, "step": 13300 }, { "distill_loss": 0.30431991815567017, "epoch": 4.436290860573716, "step": 13300 }, { "epoch": 4.436290860573716, "ref_ce_loss": 0.1279830038547516, "step": 13300 }, { "epoch": 4.439626417611741, "loss": 0.7982, "step": 13310 }, { "epoch": 4.439626417611741, "grad_norm": 1.6158078908920288, "step": 13310 }, { "epoch": 4.439626417611741, "learning_rate": 0.0004912337638301983, "step": 13310 }, { "epoch": 4.439626417611741, "loss": 0.9183559417724609, "step": 13310 }, { "ce_loss": 0.2765417993068695, "epoch": 4.439626417611741, "step": 13310 }, { "distill_loss": 0.4219082295894623, "epoch": 4.439626417611741, "step": 13310 }, { "epoch": 4.439626417611741, "ref_ce_loss": 0.18277554214000702, "step": 13310 }, { "epoch": 4.439626417611741, "loss": 0.8864961266517639, "step": 13310 }, { "ce_loss": 0.2849910855293274, "epoch": 4.439626417611741, "step": 13310 }, { "distill_loss": 0.37573570013046265, "epoch": 4.439626417611741, "step": 13310 }, { "epoch": 4.439626417611741, "ref_ce_loss": 0.16806437075138092, "step": 13310 }, { "epoch": 4.442961974649767, "loss": 0.935, "step": 13320 }, { "epoch": 4.442961974649767, "grad_norm": 1.5370193719863892, "step": 13320 }, { "epoch": 4.442961974649767, "learning_rate": 0.0004908129700187784, "step": 13320 }, { "epoch": 4.442961974649767, "loss": 0.9197893738746643, "step": 13320 }, { "ce_loss": 0.1799246072769165, "epoch": 4.442961974649767, "step": 13320 }, { "distill_loss": 0.3772495687007904, "epoch": 4.442961974649767, "step": 13320 }, { "epoch": 4.442961974649767, "ref_ce_loss": 0.13473819196224213, "step": 13320 }, { "epoch": 4.442961974649767, "loss": 0.7015503644943237, "step": 13320 }, { "ce_loss": 0.1640758365392685, "epoch": 4.442961974649767, "step": 13320 }, { "distill_loss": 0.34314149618148804, "epoch": 4.442961974649767, "step": 13320 }, { "epoch": 4.442961974649767, "ref_ce_loss": 0.15765583515167236, "step": 13320 }, { "epoch": 4.446297531687792, "loss": 0.7786, "step": 13330 }, { "epoch": 4.446297531687792, "grad_norm": 1.600947618484497, "step": 13330 }, { "epoch": 4.446297531687792, "learning_rate": 0.0004903920702188509, "step": 13330 }, { "epoch": 4.446297531687792, "loss": 0.981595516204834, "step": 13330 }, { "ce_loss": 0.19610193371772766, "epoch": 4.446297531687792, "step": 13330 }, { "distill_loss": 0.3589596152305603, "epoch": 4.446297531687792, "step": 13330 }, { "epoch": 4.446297531687792, "ref_ce_loss": 0.17293380200862885, "step": 13330 }, { "epoch": 4.446297531687792, "loss": 1.329329013824463, "step": 13330 }, { "ce_loss": 0.24279475212097168, "epoch": 4.446297531687792, "step": 13330 }, { "distill_loss": 0.4466169774532318, "epoch": 4.446297531687792, "step": 13330 }, { "epoch": 4.446297531687792, "ref_ce_loss": 0.23536382615566254, "step": 13330 }, { "epoch": 4.449633088725817, "loss": 0.9705, "step": 13340 }, { "epoch": 4.449633088725817, "grad_norm": 5.19002628326416, "step": 13340 }, { "epoch": 4.449633088725817, "learning_rate": 0.0004899710649216507, "step": 13340 }, { "epoch": 4.449633088725817, "loss": 0.6011701822280884, "step": 13340 }, { "ce_loss": 0.1558227241039276, "epoch": 4.449633088725817, "step": 13340 }, { "distill_loss": 0.2664649486541748, "epoch": 4.449633088725817, "step": 13340 }, { "epoch": 4.449633088725817, "ref_ce_loss": 0.1448230892419815, "step": 13340 }, { "epoch": 4.449633088725817, "loss": 0.7817053198814392, "step": 13340 }, { "ce_loss": 0.19946074485778809, "epoch": 4.449633088725817, "step": 13340 }, { "distill_loss": 0.39532071352005005, "epoch": 4.449633088725817, "step": 13340 }, { "epoch": 4.449633088725817, "ref_ce_loss": 0.1866506189107895, "step": 13340 }, { "epoch": 4.452968645763843, "loss": 0.841, "step": 13350 }, { "epoch": 4.452968645763843, "grad_norm": 1.8909868001937866, "step": 13350 }, { "epoch": 4.452968645763843, "learning_rate": 0.0004895499546185366, "step": 13350 }, { "epoch": 4.452968645763843, "loss": 0.7894856929779053, "step": 13350 }, { "ce_loss": 0.21032026410102844, "epoch": 4.452968645763843, "step": 13350 }, { "distill_loss": 0.3686240315437317, "epoch": 4.452968645763843, "step": 13350 }, { "epoch": 4.452968645763843, "ref_ce_loss": 0.16302137076854706, "step": 13350 }, { "epoch": 4.452968645763843, "loss": 0.7908044457435608, "step": 13350 }, { "ce_loss": 0.24341550469398499, "epoch": 4.452968645763843, "step": 13350 }, { "distill_loss": 0.35720616579055786, "epoch": 4.452968645763843, "step": 13350 }, { "epoch": 4.452968645763843, "ref_ce_loss": 0.13885928690433502, "step": 13350 }, { "epoch": 4.456304202801868, "loss": 0.8926, "step": 13360 }, { "epoch": 4.456304202801868, "grad_norm": 4.662938594818115, "step": 13360 }, { "epoch": 4.456304202801868, "learning_rate": 0.0004891287398009894, "step": 13360 }, { "epoch": 4.456304202801868, "loss": 1.2475157976150513, "step": 13360 }, { "ce_loss": 0.25721490383148193, "epoch": 4.456304202801868, "step": 13360 }, { "distill_loss": 0.40166175365448, "epoch": 4.456304202801868, "step": 13360 }, { "epoch": 4.456304202801868, "ref_ce_loss": 0.17274345457553864, "step": 13360 }, { "epoch": 4.456304202801868, "loss": 0.7158029675483704, "step": 13360 }, { "ce_loss": 0.15737080574035645, "epoch": 4.456304202801868, "step": 13360 }, { "distill_loss": 0.33680933713912964, "epoch": 4.456304202801868, "step": 13360 }, { "epoch": 4.456304202801868, "ref_ce_loss": 0.1072361096739769, "step": 13360 }, { "epoch": 4.459639759839893, "loss": 0.903, "step": 13370 }, { "epoch": 4.459639759839893, "grad_norm": 1.4409809112548828, "step": 13370 }, { "epoch": 4.459639759839893, "learning_rate": 0.0004887074209606122, "step": 13370 }, { "epoch": 4.459639759839893, "loss": 0.8202242851257324, "step": 13370 }, { "ce_loss": 0.13981536030769348, "epoch": 4.459639759839893, "step": 13370 }, { "distill_loss": 0.3915843665599823, "epoch": 4.459639759839893, "step": 13370 }, { "epoch": 4.459639759839893, "ref_ce_loss": 0.1265016347169876, "step": 13370 }, { "epoch": 4.459639759839893, "loss": 0.6500847339630127, "step": 13370 }, { "ce_loss": 0.1637197732925415, "epoch": 4.459639759839893, "step": 13370 }, { "distill_loss": 0.3304721713066101, "epoch": 4.459639759839893, "step": 13370 }, { "epoch": 4.459639759839893, "ref_ce_loss": 0.15574127435684204, "step": 13370 }, { "epoch": 4.462975316877919, "loss": 0.8532, "step": 13380 }, { "epoch": 4.462975316877919, "grad_norm": 1.293358564376831, "step": 13380 }, { "epoch": 4.462975316877919, "learning_rate": 0.0004882859985891294, "step": 13380 }, { "epoch": 4.462975316877919, "loss": 0.6703695058822632, "step": 13380 }, { "ce_loss": 0.21158580482006073, "epoch": 4.462975316877919, "step": 13380 }, { "distill_loss": 0.2534683346748352, "epoch": 4.462975316877919, "step": 13380 }, { "epoch": 4.462975316877919, "ref_ce_loss": 0.14631977677345276, "step": 13380 }, { "epoch": 4.462975316877919, "loss": 1.1632661819458008, "step": 13380 }, { "ce_loss": 0.17849712073802948, "epoch": 4.462975316877919, "step": 13380 }, { "distill_loss": 0.33899661898612976, "epoch": 4.462975316877919, "step": 13380 }, { "epoch": 4.462975316877919, "ref_ce_loss": 0.16240814328193665, "step": 13380 }, { "epoch": 4.466310873915944, "loss": 0.9454, "step": 13390 }, { "epoch": 4.466310873915944, "grad_norm": 1.8045860528945923, "step": 13390 }, { "epoch": 4.466310873915944, "learning_rate": 0.00048786447317838625, "step": 13390 }, { "epoch": 4.466310873915944, "loss": 0.6276642084121704, "step": 13390 }, { "ce_loss": 0.1849830448627472, "epoch": 4.466310873915944, "step": 13390 }, { "distill_loss": 0.2701457142829895, "epoch": 4.466310873915944, "step": 13390 }, { "epoch": 4.466310873915944, "ref_ce_loss": 0.17188803851604462, "step": 13390 }, { "epoch": 4.466310873915944, "loss": 1.1554486751556396, "step": 13390 }, { "ce_loss": 0.26016736030578613, "epoch": 4.466310873915944, "step": 13390 }, { "distill_loss": 0.35298141837120056, "epoch": 4.466310873915944, "step": 13390 }, { "epoch": 4.466310873915944, "ref_ce_loss": 0.20714622735977173, "step": 13390 }, { "epoch": 4.469646430953969, "loss": 0.9343, "step": 13400 }, { "epoch": 4.469646430953969, "grad_norm": 2.388153553009033, "step": 13400 }, { "epoch": 4.469646430953969, "learning_rate": 0.00048744284522034845, "step": 13400 }, { "epoch": 4.469646430953969, "loss": 0.5145278573036194, "step": 13400 }, { "ce_loss": 0.1189117282629013, "epoch": 4.469646430953969, "step": 13400 }, { "distill_loss": 0.23001186549663544, "epoch": 4.469646430953969, "step": 13400 }, { "epoch": 4.469646430953969, "ref_ce_loss": 0.11830934882164001, "step": 13400 }, { "epoch": 4.469646430953969, "loss": 0.8453248143196106, "step": 13400 }, { "ce_loss": 0.21656201779842377, "epoch": 4.469646430953969, "step": 13400 }, { "distill_loss": 0.36059290170669556, "epoch": 4.469646430953969, "step": 13400 }, { "epoch": 4.469646430953969, "ref_ce_loss": 0.17768530547618866, "step": 13400 }, { "epoch": 4.472981987991995, "loss": 0.8786, "step": 13410 }, { "epoch": 4.472981987991995, "grad_norm": 2.739274501800537, "step": 13410 }, { "epoch": 4.472981987991995, "learning_rate": 0.0004870211152071009, "step": 13410 }, { "epoch": 4.472981987991995, "loss": 0.847709059715271, "step": 13410 }, { "ce_loss": 0.29653552174568176, "epoch": 4.472981987991995, "step": 13410 }, { "distill_loss": 0.394599586725235, "epoch": 4.472981987991995, "step": 13410 }, { "epoch": 4.472981987991995, "ref_ce_loss": 0.15640832483768463, "step": 13410 }, { "epoch": 4.472981987991995, "loss": 0.8556306958198547, "step": 13410 }, { "ce_loss": 0.21378852427005768, "epoch": 4.472981987991995, "step": 13410 }, { "distill_loss": 0.3763246536254883, "epoch": 4.472981987991995, "step": 13410 }, { "epoch": 4.472981987991995, "ref_ce_loss": 0.1716834157705307, "step": 13410 }, { "epoch": 4.47631754503002, "loss": 0.8237, "step": 13420 }, { "epoch": 4.47631754503002, "grad_norm": 2.9815971851348877, "step": 13420 }, { "epoch": 4.47631754503002, "learning_rate": 0.0004865992836308481, "step": 13420 }, { "epoch": 4.47631754503002, "loss": 0.7696135640144348, "step": 13420 }, { "ce_loss": 0.26664626598358154, "epoch": 4.47631754503002, "step": 13420 }, { "distill_loss": 0.33652380108833313, "epoch": 4.47631754503002, "step": 13420 }, { "epoch": 4.47631754503002, "ref_ce_loss": 0.1662791669368744, "step": 13420 }, { "epoch": 4.47631754503002, "loss": 0.6433257460594177, "step": 13420 }, { "ce_loss": 0.18302805721759796, "epoch": 4.47631754503002, "step": 13420 }, { "distill_loss": 0.31425192952156067, "epoch": 4.47631754503002, "step": 13420 }, { "epoch": 4.47631754503002, "ref_ce_loss": 0.14584557712078094, "step": 13420 }, { "epoch": 4.4796531020680455, "loss": 0.7838, "step": 13430 }, { "epoch": 4.4796531020680455, "grad_norm": 1.3793237209320068, "step": 13430 }, { "epoch": 4.4796531020680455, "learning_rate": 0.0004861773509839127, "step": 13430 }, { "epoch": 4.4796531020680455, "loss": 0.9961182475090027, "step": 13430 }, { "ce_loss": 0.20061686635017395, "epoch": 4.4796531020680455, "step": 13430 }, { "distill_loss": 0.37714099884033203, "epoch": 4.4796531020680455, "step": 13430 }, { "epoch": 4.4796531020680455, "ref_ce_loss": 0.1583109200000763, "step": 13430 }, { "epoch": 4.4796531020680455, "loss": 0.7867060899734497, "step": 13430 }, { "ce_loss": 0.20754165947437286, "epoch": 4.4796531020680455, "step": 13430 }, { "distill_loss": 0.3411576747894287, "epoch": 4.4796531020680455, "step": 13430 }, { "epoch": 4.4796531020680455, "ref_ce_loss": 0.15391767024993896, "step": 13430 }, { "epoch": 4.482988659106071, "loss": 0.8616, "step": 13440 }, { "epoch": 4.482988659106071, "grad_norm": 2.627659559249878, "step": 13440 }, { "epoch": 4.482988659106071, "learning_rate": 0.00048575531775873587, "step": 13440 }, { "epoch": 4.482988659106071, "loss": 0.7955703139305115, "step": 13440 }, { "ce_loss": 0.18961289525032043, "epoch": 4.482988659106071, "step": 13440 }, { "distill_loss": 0.3066923916339874, "epoch": 4.482988659106071, "step": 13440 }, { "epoch": 4.482988659106071, "ref_ce_loss": 0.1656840592622757, "step": 13440 }, { "epoch": 4.482988659106071, "loss": 0.684762716293335, "step": 13440 }, { "ce_loss": 0.15056568384170532, "epoch": 4.482988659106071, "step": 13440 }, { "distill_loss": 0.305119127035141, "epoch": 4.482988659106071, "step": 13440 }, { "epoch": 4.482988659106071, "ref_ce_loss": 0.12949812412261963, "step": 13440 }, { "epoch": 4.486324216144096, "loss": 0.873, "step": 13450 }, { "epoch": 4.486324216144096, "grad_norm": 1.5340536832809448, "step": 13450 }, { "epoch": 4.486324216144096, "learning_rate": 0.0004853331844478754, "step": 13450 }, { "epoch": 4.486324216144096, "loss": 0.805151104927063, "step": 13450 }, { "ce_loss": 0.11066021025180817, "epoch": 4.486324216144096, "step": 13450 }, { "distill_loss": 0.3033329248428345, "epoch": 4.486324216144096, "step": 13450 }, { "epoch": 4.486324216144096, "ref_ce_loss": 0.14843977987766266, "step": 13450 }, { "epoch": 4.486324216144096, "loss": 0.811651885509491, "step": 13450 }, { "ce_loss": 0.19692301750183105, "epoch": 4.486324216144096, "step": 13450 }, { "distill_loss": 0.34952807426452637, "epoch": 4.486324216144096, "step": 13450 }, { "epoch": 4.486324216144096, "ref_ce_loss": 0.17662404477596283, "step": 13450 }, { "epoch": 4.4896597731821215, "loss": 0.8853, "step": 13460 }, { "epoch": 4.4896597731821215, "grad_norm": 2.993130683898926, "step": 13460 }, { "epoch": 4.4896597731821215, "learning_rate": 0.00048491095154400653, "step": 13460 }, { "epoch": 4.4896597731821215, "loss": 1.1345261335372925, "step": 13460 }, { "ce_loss": 0.2394268661737442, "epoch": 4.4896597731821215, "step": 13460 }, { "distill_loss": 0.386577844619751, "epoch": 4.4896597731821215, "step": 13460 }, { "epoch": 4.4896597731821215, "ref_ce_loss": 0.190634623169899, "step": 13460 }, { "epoch": 4.4896597731821215, "loss": 0.914226233959198, "step": 13460 }, { "ce_loss": 0.19585789740085602, "epoch": 4.4896597731821215, "step": 13460 }, { "distill_loss": 0.36293065547943115, "epoch": 4.4896597731821215, "step": 13460 }, { "epoch": 4.4896597731821215, "ref_ce_loss": 0.13437806069850922, "step": 13460 }, { "epoch": 4.492995330220147, "loss": 0.8696, "step": 13470 }, { "epoch": 4.492995330220147, "grad_norm": 1.8465981483459473, "step": 13470 }, { "epoch": 4.492995330220147, "learning_rate": 0.00048448861953992033, "step": 13470 }, { "epoch": 4.492995330220147, "loss": 1.0066031217575073, "step": 13470 }, { "ce_loss": 0.23128260672092438, "epoch": 4.492995330220147, "step": 13470 }, { "distill_loss": 0.37448421120643616, "epoch": 4.492995330220147, "step": 13470 }, { "epoch": 4.492995330220147, "ref_ce_loss": 0.1880294680595398, "step": 13470 }, { "epoch": 4.492995330220147, "loss": 0.9604164958000183, "step": 13470 }, { "ce_loss": 0.21962708234786987, "epoch": 4.492995330220147, "step": 13470 }, { "distill_loss": 0.3921681344509125, "epoch": 4.492995330220147, "step": 13470 }, { "epoch": 4.492995330220147, "ref_ce_loss": 0.15610474348068237, "step": 13470 }, { "epoch": 4.496330887258172, "loss": 0.8809, "step": 13480 }, { "epoch": 4.496330887258172, "grad_norm": 2.4795334339141846, "step": 13480 }, { "epoch": 4.496330887258172, "learning_rate": 0.0004840661889285238, "step": 13480 }, { "epoch": 4.496330887258172, "loss": 0.8624405264854431, "step": 13480 }, { "ce_loss": 0.2435741424560547, "epoch": 4.496330887258172, "step": 13480 }, { "distill_loss": 0.3684362471103668, "epoch": 4.496330887258172, "step": 13480 }, { "epoch": 4.496330887258172, "ref_ce_loss": 0.19913016259670258, "step": 13480 }, { "epoch": 4.496330887258172, "loss": 0.9379405975341797, "step": 13480 }, { "ce_loss": 0.17425698041915894, "epoch": 4.496330887258172, "step": 13480 }, { "distill_loss": 0.31786245107650757, "epoch": 4.496330887258172, "step": 13480 }, { "epoch": 4.496330887258172, "ref_ce_loss": 0.14343689382076263, "step": 13480 }, { "epoch": 4.4996664442961976, "loss": 0.9711, "step": 13490 }, { "epoch": 4.4996664442961976, "grad_norm": 2.149953603744507, "step": 13490 }, { "epoch": 4.4996664442961976, "learning_rate": 0.0004836436602028389, "step": 13490 }, { "epoch": 4.4996664442961976, "loss": 0.8364251255989075, "step": 13490 }, { "ce_loss": 0.2063017636537552, "epoch": 4.4996664442961976, "step": 13490 }, { "distill_loss": 0.4093954563140869, "epoch": 4.4996664442961976, "step": 13490 }, { "epoch": 4.4996664442961976, "ref_ce_loss": 0.15085582435131073, "step": 13490 }, { "epoch": 4.4996664442961976, "loss": 1.26799738407135, "step": 13490 }, { "ce_loss": 0.20769761502742767, "epoch": 4.4996664442961976, "step": 13490 }, { "distill_loss": 0.387866348028183, "epoch": 4.4996664442961976, "step": 13490 }, { "epoch": 4.4996664442961976, "ref_ce_loss": 0.18510855734348297, "step": 13490 }, { "epoch": 4.503002001334223, "loss": 0.9009, "step": 13500 }, { "epoch": 4.503002001334223, "grad_norm": 2.5156893730163574, "step": 13500 }, { "epoch": 4.503002001334223, "learning_rate": 0.0004832210338560022, "step": 13500 }, { "epoch": 4.503002001334223, "loss": 0.9566195011138916, "step": 13500 }, { "ce_loss": 0.20787490904331207, "epoch": 4.503002001334223, "step": 13500 }, { "distill_loss": 0.4229164719581604, "epoch": 4.503002001334223, "step": 13500 }, { "epoch": 4.503002001334223, "ref_ce_loss": 0.1770862191915512, "step": 13500 }, { "epoch": 4.503002001334223, "loss": 0.9364886283874512, "step": 13500 }, { "ce_loss": 0.276179701089859, "epoch": 4.503002001334223, "step": 13500 }, { "distill_loss": 0.457122266292572, "epoch": 4.503002001334223, "step": 13500 }, { "epoch": 4.503002001334223, "ref_ce_loss": 0.16380095481872559, "step": 13500 }, { "epoch": 4.506337558372248, "loss": 0.896, "step": 13510 }, { "epoch": 4.506337558372248, "grad_norm": 1.6427794694900513, "step": 13510 }, { "epoch": 4.506337558372248, "learning_rate": 0.0004827983103812638, "step": 13510 }, { "epoch": 4.506337558372248, "loss": 0.8760179281234741, "step": 13510 }, { "ce_loss": 0.24923034012317657, "epoch": 4.506337558372248, "step": 13510 }, { "distill_loss": 0.3755168616771698, "epoch": 4.506337558372248, "step": 13510 }, { "epoch": 4.506337558372248, "ref_ce_loss": 0.15355350077152252, "step": 13510 }, { "epoch": 4.506337558372248, "loss": 0.8987492918968201, "step": 13510 }, { "ce_loss": 0.29019269347190857, "epoch": 4.506337558372248, "step": 13510 }, { "distill_loss": 0.3846389651298523, "epoch": 4.506337558372248, "step": 13510 }, { "epoch": 4.506337558372248, "ref_ce_loss": 0.18040981888771057, "step": 13510 }, { "epoch": 4.509673115410274, "loss": 0.8248, "step": 13520 }, { "epoch": 4.509673115410274, "grad_norm": 2.027052879333496, "step": 13520 }, { "epoch": 4.509673115410274, "learning_rate": 0.00048237549027198805, "step": 13520 }, { "epoch": 4.509673115410274, "loss": 1.0097908973693848, "step": 13520 }, { "ce_loss": 0.2561718821525574, "epoch": 4.509673115410274, "step": 13520 }, { "distill_loss": 0.3744969069957733, "epoch": 4.509673115410274, "step": 13520 }, { "epoch": 4.509673115410274, "ref_ce_loss": 0.23328308761119843, "step": 13520 }, { "epoch": 4.509673115410274, "loss": 0.87522292137146, "step": 13520 }, { "ce_loss": 0.2116980105638504, "epoch": 4.509673115410274, "step": 13520 }, { "distill_loss": 0.2912207841873169, "epoch": 4.509673115410274, "step": 13520 }, { "epoch": 4.509673115410274, "ref_ce_loss": 0.15882287919521332, "step": 13520 }, { "epoch": 4.513008672448299, "loss": 0.8352, "step": 13530 }, { "epoch": 4.513008672448299, "grad_norm": 1.5604941844940186, "step": 13530 }, { "epoch": 4.513008672448299, "learning_rate": 0.0004819525740216509, "step": 13530 }, { "epoch": 4.513008672448299, "loss": 0.7068662643432617, "step": 13530 }, { "ce_loss": 0.1933564394712448, "epoch": 4.513008672448299, "step": 13530 }, { "distill_loss": 0.35956916213035583, "epoch": 4.513008672448299, "step": 13530 }, { "epoch": 4.513008672448299, "ref_ce_loss": 0.15367421507835388, "step": 13530 }, { "epoch": 4.513008672448299, "loss": 1.0363969802856445, "step": 13530 }, { "ce_loss": 0.16680419445037842, "epoch": 4.513008672448299, "step": 13530 }, { "distill_loss": 0.35213443636894226, "epoch": 4.513008672448299, "step": 13530 }, { "epoch": 4.513008672448299, "ref_ce_loss": 0.16435660421848297, "step": 13530 }, { "epoch": 4.516344229486324, "loss": 0.8397, "step": 13540 }, { "epoch": 4.516344229486324, "grad_norm": 1.7330466508865356, "step": 13540 }, { "epoch": 4.516344229486324, "learning_rate": 0.0004815295621238415, "step": 13540 }, { "epoch": 4.516344229486324, "loss": 0.9030655026435852, "step": 13540 }, { "ce_loss": 0.25999701023101807, "epoch": 4.516344229486324, "step": 13540 }, { "distill_loss": 0.36837413907051086, "epoch": 4.516344229486324, "step": 13540 }, { "epoch": 4.516344229486324, "ref_ce_loss": 0.19690430164337158, "step": 13540 }, { "epoch": 4.516344229486324, "loss": 0.9909688830375671, "step": 13540 }, { "ce_loss": 0.24681763350963593, "epoch": 4.516344229486324, "step": 13540 }, { "distill_loss": 0.38454490900039673, "epoch": 4.516344229486324, "step": 13540 }, { "epoch": 4.516344229486324, "ref_ce_loss": 0.1610337644815445, "step": 13540 }, { "epoch": 4.51967978652435, "loss": 0.8423, "step": 13550 }, { "epoch": 4.51967978652435, "grad_norm": 1.7804971933364868, "step": 13550 }, { "epoch": 4.51967978652435, "learning_rate": 0.0004811064550722602, "step": 13550 }, { "epoch": 4.51967978652435, "loss": 0.8210447430610657, "step": 13550 }, { "ce_loss": 0.19143234193325043, "epoch": 4.51967978652435, "step": 13550 }, { "distill_loss": 0.31518861651420593, "epoch": 4.51967978652435, "step": 13550 }, { "epoch": 4.51967978652435, "ref_ce_loss": 0.16232028603553772, "step": 13550 }, { "epoch": 4.51967978652435, "loss": 0.8883418440818787, "step": 13550 }, { "ce_loss": 0.2613326609134674, "epoch": 4.51967978652435, "step": 13550 }, { "distill_loss": 0.3267996907234192, "epoch": 4.51967978652435, "step": 13550 }, { "epoch": 4.51967978652435, "ref_ce_loss": 0.1897214651107788, "step": 13550 }, { "epoch": 4.523015343562375, "loss": 0.8078, "step": 13560 }, { "epoch": 4.523015343562375, "grad_norm": 1.4993454217910767, "step": 13560 }, { "epoch": 4.523015343562375, "learning_rate": 0.00048068325336071845, "step": 13560 }, { "epoch": 4.523015343562375, "loss": 0.9455620050430298, "step": 13560 }, { "ce_loss": 0.22340868413448334, "epoch": 4.523015343562375, "step": 13560 }, { "distill_loss": 0.2784824073314667, "epoch": 4.523015343562375, "step": 13560 }, { "epoch": 4.523015343562375, "ref_ce_loss": 0.17123597860336304, "step": 13560 }, { "epoch": 4.523015343562375, "loss": 0.7702107429504395, "step": 13560 }, { "ce_loss": 0.22906190156936646, "epoch": 4.523015343562375, "step": 13560 }, { "distill_loss": 0.32264962792396545, "epoch": 4.523015343562375, "step": 13560 }, { "epoch": 4.523015343562375, "ref_ce_loss": 0.17025819420814514, "step": 13560 }, { "epoch": 4.5263509006004, "loss": 0.7676, "step": 13570 }, { "epoch": 4.5263509006004, "grad_norm": 2.0894381999969482, "step": 13570 }, { "epoch": 4.5263509006004, "learning_rate": 0.0004802599574831381, "step": 13570 }, { "epoch": 4.5263509006004, "loss": 0.8185257911682129, "step": 13570 }, { "ce_loss": 0.20196986198425293, "epoch": 4.5263509006004, "step": 13570 }, { "distill_loss": 0.29363006353378296, "epoch": 4.5263509006004, "step": 13570 }, { "epoch": 4.5263509006004, "ref_ce_loss": 0.16779692471027374, "step": 13570 }, { "epoch": 4.5263509006004, "loss": 0.6812776327133179, "step": 13570 }, { "ce_loss": 0.11444456875324249, "epoch": 4.5263509006004, "step": 13570 }, { "distill_loss": 0.3058815598487854, "epoch": 4.5263509006004, "step": 13570 }, { "epoch": 4.5263509006004, "ref_ce_loss": 0.13752155005931854, "step": 13570 }, { "epoch": 4.529686457638426, "loss": 0.8433, "step": 13580 }, { "epoch": 4.529686457638426, "grad_norm": 2.1083929538726807, "step": 13580 }, { "epoch": 4.529686457638426, "learning_rate": 0.000479836567933551, "step": 13580 }, { "epoch": 4.529686457638426, "loss": 0.8970056772232056, "step": 13580 }, { "ce_loss": 0.2174151986837387, "epoch": 4.529686457638426, "step": 13580 }, { "distill_loss": 0.3981429636478424, "epoch": 4.529686457638426, "step": 13580 }, { "epoch": 4.529686457638426, "ref_ce_loss": 0.17626845836639404, "step": 13580 }, { "epoch": 4.529686457638426, "loss": 0.7080292701721191, "step": 13580 }, { "ce_loss": 0.1591300666332245, "epoch": 4.529686457638426, "step": 13580 }, { "distill_loss": 0.33692148327827454, "epoch": 4.529686457638426, "step": 13580 }, { "epoch": 4.529686457638426, "ref_ce_loss": 0.11072871834039688, "step": 13580 }, { "epoch": 4.533022014676451, "loss": 0.808, "step": 13590 }, { "epoch": 4.533022014676451, "grad_norm": 1.7732107639312744, "step": 13590 }, { "epoch": 4.533022014676451, "learning_rate": 0.0004794130852060984, "step": 13590 }, { "epoch": 4.533022014676451, "loss": 0.7357549071311951, "step": 13590 }, { "ce_loss": 0.22856561839580536, "epoch": 4.533022014676451, "step": 13590 }, { "distill_loss": 0.3111702501773834, "epoch": 4.533022014676451, "step": 13590 }, { "epoch": 4.533022014676451, "ref_ce_loss": 0.15477800369262695, "step": 13590 }, { "epoch": 4.533022014676451, "loss": 0.842008113861084, "step": 13590 }, { "ce_loss": 0.22534911334514618, "epoch": 4.533022014676451, "step": 13590 }, { "distill_loss": 0.40802642703056335, "epoch": 4.533022014676451, "step": 13590 }, { "epoch": 4.533022014676451, "ref_ce_loss": 0.20828992128372192, "step": 13590 }, { "epoch": 4.536357571714476, "loss": 0.78, "step": 13600 }, { "epoch": 4.536357571714476, "grad_norm": 1.7454679012298584, "step": 13600 }, { "epoch": 4.536357571714476, "learning_rate": 0.0004789895097950301, "step": 13600 }, { "epoch": 4.536357571714476, "loss": 0.9143649339675903, "step": 13600 }, { "ce_loss": 0.2446441650390625, "epoch": 4.536357571714476, "step": 13600 }, { "distill_loss": 0.35699325799942017, "epoch": 4.536357571714476, "step": 13600 }, { "epoch": 4.536357571714476, "ref_ce_loss": 0.16967988014221191, "step": 13600 }, { "epoch": 4.536357571714476, "loss": 1.133493185043335, "step": 13600 }, { "ce_loss": 0.27196764945983887, "epoch": 4.536357571714476, "step": 13600 }, { "distill_loss": 0.392011821269989, "epoch": 4.536357571714476, "step": 13600 }, { "epoch": 4.536357571714476, "ref_ce_loss": 0.17302842438220978, "step": 13600 }, { "epoch": 4.539693128752502, "loss": 0.8807, "step": 13610 }, { "epoch": 4.539693128752502, "grad_norm": 1.6861317157745361, "step": 13610 }, { "epoch": 4.539693128752502, "learning_rate": 0.00047856584219470424, "step": 13610 }, { "epoch": 4.539693128752502, "loss": 0.6395084857940674, "step": 13610 }, { "ce_loss": 0.17107786238193512, "epoch": 4.539693128752502, "step": 13610 }, { "distill_loss": 0.2538531720638275, "epoch": 4.539693128752502, "step": 13610 }, { "epoch": 4.539693128752502, "ref_ce_loss": 0.16082721948623657, "step": 13610 }, { "epoch": 4.539693128752502, "loss": 1.0053611993789673, "step": 13610 }, { "ce_loss": 0.19790354371070862, "epoch": 4.539693128752502, "step": 13610 }, { "distill_loss": 0.33340319991111755, "epoch": 4.539693128752502, "step": 13610 }, { "epoch": 4.539693128752502, "ref_ce_loss": 0.18687112629413605, "step": 13610 }, { "epoch": 4.543028685790527, "loss": 0.8885, "step": 13620 }, { "epoch": 4.543028685790527, "grad_norm": 2.6688756942749023, "step": 13620 }, { "epoch": 4.543028685790527, "learning_rate": 0.00047814208289958664, "step": 13620 }, { "epoch": 4.543028685790527, "loss": 0.7943642139434814, "step": 13620 }, { "ce_loss": 0.17814122140407562, "epoch": 4.543028685790527, "step": 13620 }, { "distill_loss": 0.3267526626586914, "epoch": 4.543028685790527, "step": 13620 }, { "epoch": 4.543028685790527, "ref_ce_loss": 0.15404582023620605, "step": 13620 }, { "epoch": 4.543028685790527, "loss": 0.6207186579704285, "step": 13620 }, { "ce_loss": 0.1744394153356552, "epoch": 4.543028685790527, "step": 13620 }, { "distill_loss": 0.266560435295105, "epoch": 4.543028685790527, "step": 13620 }, { "epoch": 4.543028685790527, "ref_ce_loss": 0.12551294267177582, "step": 13620 }, { "epoch": 4.5463642428285524, "loss": 0.7633, "step": 13630 }, { "epoch": 4.5463642428285524, "grad_norm": 1.8141019344329834, "step": 13630 }, { "epoch": 4.5463642428285524, "learning_rate": 0.0004777182324042497, "step": 13630 }, { "epoch": 4.5463642428285524, "loss": 0.6916319131851196, "step": 13630 }, { "ce_loss": 0.16996467113494873, "epoch": 4.5463642428285524, "step": 13630 }, { "distill_loss": 0.3146441578865051, "epoch": 4.5463642428285524, "step": 13630 }, { "epoch": 4.5463642428285524, "ref_ce_loss": 0.17255429923534393, "step": 13630 }, { "epoch": 4.5463642428285524, "loss": 0.7187036275863647, "step": 13630 }, { "ce_loss": 0.2080836296081543, "epoch": 4.5463642428285524, "step": 13630 }, { "distill_loss": 0.2987633943557739, "epoch": 4.5463642428285524, "step": 13630 }, { "epoch": 4.5463642428285524, "ref_ce_loss": 0.16911211609840393, "step": 13630 }, { "epoch": 4.549699799866578, "loss": 0.8673, "step": 13640 }, { "epoch": 4.549699799866578, "grad_norm": 1.6610087156295776, "step": 13640 }, { "epoch": 4.549699799866578, "learning_rate": 0.00047729429120337284, "step": 13640 }, { "epoch": 4.549699799866578, "loss": 0.7459085583686829, "step": 13640 }, { "ce_loss": 0.19113396108150482, "epoch": 4.549699799866578, "step": 13640 }, { "distill_loss": 0.2890676259994507, "epoch": 4.549699799866578, "step": 13640 }, { "epoch": 4.549699799866578, "ref_ce_loss": 0.16471309959888458, "step": 13640 }, { "epoch": 4.549699799866578, "loss": 0.7211428284645081, "step": 13640 }, { "ce_loss": 0.22979314625263214, "epoch": 4.549699799866578, "step": 13640 }, { "distill_loss": 0.3184564709663391, "epoch": 4.549699799866578, "step": 13640 }, { "epoch": 4.549699799866578, "ref_ce_loss": 0.17257662117481232, "step": 13640 }, { "epoch": 4.553035356904603, "loss": 0.8389, "step": 13650 }, { "epoch": 4.553035356904603, "grad_norm": 1.836702585220337, "step": 13650 }, { "epoch": 4.553035356904603, "learning_rate": 0.00047687025979174086, "step": 13650 }, { "epoch": 4.553035356904603, "loss": 0.8416532278060913, "step": 13650 }, { "ce_loss": 0.19439411163330078, "epoch": 4.553035356904603, "step": 13650 }, { "distill_loss": 0.31022632122039795, "epoch": 4.553035356904603, "step": 13650 }, { "epoch": 4.553035356904603, "ref_ce_loss": 0.19999051094055176, "step": 13650 }, { "epoch": 4.553035356904603, "loss": 0.9537258148193359, "step": 13650 }, { "ce_loss": 0.29107460379600525, "epoch": 4.553035356904603, "step": 13650 }, { "distill_loss": 0.4233335256576538, "epoch": 4.553035356904603, "step": 13650 }, { "epoch": 4.553035356904603, "ref_ce_loss": 0.18829721212387085, "step": 13650 }, { "epoch": 4.5563709139426285, "loss": 0.8613, "step": 13660 }, { "epoch": 4.5563709139426285, "grad_norm": 1.539566993713379, "step": 13660 }, { "epoch": 4.5563709139426285, "learning_rate": 0.00047644613866424415, "step": 13660 }, { "epoch": 4.5563709139426285, "loss": 1.090315580368042, "step": 13660 }, { "ce_loss": 0.2049546092748642, "epoch": 4.5563709139426285, "step": 13660 }, { "distill_loss": 0.35115835070610046, "epoch": 4.5563709139426285, "step": 13660 }, { "epoch": 4.5563709139426285, "ref_ce_loss": 0.21695077419281006, "step": 13660 }, { "epoch": 4.5563709139426285, "loss": 1.1450974941253662, "step": 13660 }, { "ce_loss": 0.18329203128814697, "epoch": 4.5563709139426285, "step": 13660 }, { "distill_loss": 0.3362187445163727, "epoch": 4.5563709139426285, "step": 13660 }, { "epoch": 4.5563709139426285, "ref_ce_loss": 0.13686105608940125, "step": 13660 }, { "epoch": 4.559706470980654, "loss": 0.9193, "step": 13670 }, { "epoch": 4.559706470980654, "grad_norm": 2.8135159015655518, "step": 13670 }, { "epoch": 4.559706470980654, "learning_rate": 0.0004760219283158776, "step": 13670 }, { "epoch": 4.559706470980654, "loss": 0.6657029390335083, "step": 13670 }, { "ce_loss": 0.1496344953775406, "epoch": 4.559706470980654, "step": 13670 }, { "distill_loss": 0.351999968290329, "epoch": 4.559706470980654, "step": 13670 }, { "epoch": 4.559706470980654, "ref_ce_loss": 0.12632089853286743, "step": 13670 }, { "epoch": 4.559706470980654, "loss": 0.9592874050140381, "step": 13670 }, { "ce_loss": 0.17044517397880554, "epoch": 4.559706470980654, "step": 13670 }, { "distill_loss": 0.3963066637516022, "epoch": 4.559706470980654, "step": 13670 }, { "epoch": 4.559706470980654, "ref_ce_loss": 0.16819757223129272, "step": 13670 }, { "epoch": 4.563042028018679, "loss": 0.8466, "step": 13680 }, { "epoch": 4.563042028018679, "grad_norm": 3.001901388168335, "step": 13680 }, { "epoch": 4.563042028018679, "learning_rate": 0.00047559762924174055, "step": 13680 }, { "epoch": 4.563042028018679, "loss": 0.9727820158004761, "step": 13680 }, { "ce_loss": 0.22695204615592957, "epoch": 4.563042028018679, "step": 13680 }, { "distill_loss": 0.3284171521663666, "epoch": 4.563042028018679, "step": 13680 }, { "epoch": 4.563042028018679, "ref_ce_loss": 0.18167749047279358, "step": 13680 }, { "epoch": 4.563042028018679, "loss": 0.9198602437973022, "step": 13680 }, { "ce_loss": 0.21199175715446472, "epoch": 4.563042028018679, "step": 13680 }, { "distill_loss": 0.4086395800113678, "epoch": 4.563042028018679, "step": 13680 }, { "epoch": 4.563042028018679, "ref_ce_loss": 0.16664999723434448, "step": 13680 }, { "epoch": 4.5663775850567045, "loss": 0.9568, "step": 13690 }, { "epoch": 4.5663775850567045, "grad_norm": 2.173248052597046, "step": 13690 }, { "epoch": 4.5663775850567045, "learning_rate": 0.0004751732419370354, "step": 13690 }, { "epoch": 4.5663775850567045, "loss": 0.9737968444824219, "step": 13690 }, { "ce_loss": 0.28544288873672485, "epoch": 4.5663775850567045, "step": 13690 }, { "distill_loss": 0.44596606492996216, "epoch": 4.5663775850567045, "step": 13690 }, { "epoch": 4.5663775850567045, "ref_ce_loss": 0.1873977780342102, "step": 13690 }, { "epoch": 4.5663775850567045, "loss": 0.8217427134513855, "step": 13690 }, { "ce_loss": 0.21305979788303375, "epoch": 4.5663775850567045, "step": 13690 }, { "distill_loss": 0.3435061573982239, "epoch": 4.5663775850567045, "step": 13690 }, { "epoch": 4.5663775850567045, "ref_ce_loss": 0.14034755527973175, "step": 13690 }, { "epoch": 4.56971314209473, "loss": 0.8467, "step": 13700 }, { "epoch": 4.56971314209473, "grad_norm": 1.6340337991714478, "step": 13700 }, { "epoch": 4.56971314209473, "learning_rate": 0.00047474876689706814, "step": 13700 }, { "epoch": 4.56971314209473, "loss": 0.8798074722290039, "step": 13700 }, { "ce_loss": 0.19881856441497803, "epoch": 4.56971314209473, "step": 13700 }, { "distill_loss": 0.3740568161010742, "epoch": 4.56971314209473, "step": 13700 }, { "epoch": 4.56971314209473, "ref_ce_loss": 0.1589992195367813, "step": 13700 }, { "epoch": 4.56971314209473, "loss": 0.80402672290802, "step": 13700 }, { "ce_loss": 0.17962594330310822, "epoch": 4.56971314209473, "step": 13700 }, { "distill_loss": 0.30576291680336, "epoch": 4.56971314209473, "step": 13700 }, { "epoch": 4.56971314209473, "ref_ce_loss": 0.15402908623218536, "step": 13700 }, { "epoch": 4.573048699132755, "loss": 0.8932, "step": 13710 }, { "epoch": 4.573048699132755, "grad_norm": 1.817192554473877, "step": 13710 }, { "epoch": 4.573048699132755, "learning_rate": 0.00047432420461724636, "step": 13710 }, { "epoch": 4.573048699132755, "loss": 0.8386541604995728, "step": 13710 }, { "ce_loss": 0.29571789503097534, "epoch": 4.573048699132755, "step": 13710 }, { "distill_loss": 0.31294816732406616, "epoch": 4.573048699132755, "step": 13710 }, { "epoch": 4.573048699132755, "ref_ce_loss": 0.21605390310287476, "step": 13710 }, { "epoch": 4.573048699132755, "loss": 0.8117653727531433, "step": 13710 }, { "ce_loss": 0.2589631974697113, "epoch": 4.573048699132755, "step": 13710 }, { "distill_loss": 0.3590703010559082, "epoch": 4.573048699132755, "step": 13710 }, { "epoch": 4.573048699132755, "ref_ce_loss": 0.16542576253414154, "step": 13710 }, { "epoch": 4.576384256170781, "loss": 0.8229, "step": 13720 }, { "epoch": 4.576384256170781, "grad_norm": 1.6762044429779053, "step": 13720 }, { "epoch": 4.576384256170781, "learning_rate": 0.0004738995555930803, "step": 13720 }, { "epoch": 4.576384256170781, "loss": 0.7958192229270935, "step": 13720 }, { "ce_loss": 0.23379138112068176, "epoch": 4.576384256170781, "step": 13720 }, { "distill_loss": 0.3523423969745636, "epoch": 4.576384256170781, "step": 13720 }, { "epoch": 4.576384256170781, "ref_ce_loss": 0.14432045817375183, "step": 13720 }, { "epoch": 4.576384256170781, "loss": 1.1285977363586426, "step": 13720 }, { "ce_loss": 0.20815251767635345, "epoch": 4.576384256170781, "step": 13720 }, { "distill_loss": 0.33814939856529236, "epoch": 4.576384256170781, "step": 13720 }, { "epoch": 4.576384256170781, "ref_ce_loss": 0.18970946967601776, "step": 13720 }, { "epoch": 4.579719813208806, "loss": 0.8069, "step": 13730 }, { "epoch": 4.579719813208806, "grad_norm": 2.443239212036133, "step": 13730 }, { "epoch": 4.579719813208806, "learning_rate": 0.0004734748203201809, "step": 13730 }, { "epoch": 4.579719813208806, "loss": 0.7413336634635925, "step": 13730 }, { "ce_loss": 0.1772276610136032, "epoch": 4.579719813208806, "step": 13730 }, { "distill_loss": 0.3523496091365814, "epoch": 4.579719813208806, "step": 13730 }, { "epoch": 4.579719813208806, "ref_ce_loss": 0.1505611091852188, "step": 13730 }, { "epoch": 4.579719813208806, "loss": 0.8777781128883362, "step": 13730 }, { "ce_loss": 0.2584392726421356, "epoch": 4.579719813208806, "step": 13730 }, { "distill_loss": 0.371855229139328, "epoch": 4.579719813208806, "step": 13730 }, { "epoch": 4.579719813208806, "ref_ce_loss": 0.19492730498313904, "step": 13730 }, { "epoch": 4.583055370246831, "loss": 0.9522, "step": 13740 }, { "epoch": 4.583055370246831, "grad_norm": 1.348532795906067, "step": 13740 }, { "epoch": 4.583055370246831, "learning_rate": 0.00047304999929426004, "step": 13740 }, { "epoch": 4.583055370246831, "loss": 0.9521989226341248, "step": 13740 }, { "ce_loss": 0.2450057864189148, "epoch": 4.583055370246831, "step": 13740 }, { "distill_loss": 0.4274292588233948, "epoch": 4.583055370246831, "step": 13740 }, { "epoch": 4.583055370246831, "ref_ce_loss": 0.20220626890659332, "step": 13740 }, { "epoch": 4.583055370246831, "loss": 0.9642909169197083, "step": 13740 }, { "ce_loss": 0.18685844540596008, "epoch": 4.583055370246831, "step": 13740 }, { "distill_loss": 0.3277007043361664, "epoch": 4.583055370246831, "step": 13740 }, { "epoch": 4.583055370246831, "ref_ce_loss": 0.13163049519062042, "step": 13740 }, { "epoch": 4.586390927284857, "loss": 0.983, "step": 13750 }, { "epoch": 4.586390927284857, "grad_norm": 2.962904453277588, "step": 13750 }, { "epoch": 4.586390927284857, "learning_rate": 0.0004726250930111295, "step": 13750 }, { "epoch": 4.586390927284857, "loss": 0.7624634504318237, "step": 13750 }, { "ce_loss": 0.18860645592212677, "epoch": 4.586390927284857, "step": 13750 }, { "distill_loss": 0.33328020572662354, "epoch": 4.586390927284857, "step": 13750 }, { "epoch": 4.586390927284857, "ref_ce_loss": 0.131907120347023, "step": 13750 }, { "epoch": 4.586390927284857, "loss": 0.7788711786270142, "step": 13750 }, { "ce_loss": 0.19581912457942963, "epoch": 4.586390927284857, "step": 13750 }, { "distill_loss": 0.36439990997314453, "epoch": 4.586390927284857, "step": 13750 }, { "epoch": 4.586390927284857, "ref_ce_loss": 0.11892513185739517, "step": 13750 }, { "epoch": 4.589726484322882, "loss": 0.8685, "step": 13760 }, { "epoch": 4.589726484322882, "grad_norm": 1.640178918838501, "step": 13760 }, { "epoch": 4.589726484322882, "learning_rate": 0.0004722001019667006, "step": 13760 }, { "epoch": 4.589726484322882, "loss": 0.6929627060890198, "step": 13760 }, { "ce_loss": 0.17001688480377197, "epoch": 4.589726484322882, "step": 13760 }, { "distill_loss": 0.3402334451675415, "epoch": 4.589726484322882, "step": 13760 }, { "epoch": 4.589726484322882, "ref_ce_loss": 0.1825055181980133, "step": 13760 }, { "epoch": 4.589726484322882, "loss": 0.9117600321769714, "step": 13760 }, { "ce_loss": 0.22071193158626556, "epoch": 4.589726484322882, "step": 13760 }, { "distill_loss": 0.46392345428466797, "epoch": 4.589726484322882, "step": 13760 }, { "epoch": 4.589726484322882, "ref_ce_loss": 0.2267303317785263, "step": 13760 }, { "epoch": 4.593062041360907, "loss": 0.8807, "step": 13770 }, { "epoch": 4.593062041360907, "grad_norm": 3.823497772216797, "step": 13770 }, { "epoch": 4.593062041360907, "learning_rate": 0.00047177502665698355, "step": 13770 }, { "epoch": 4.593062041360907, "loss": 0.8650383353233337, "step": 13770 }, { "ce_loss": 0.17190320789813995, "epoch": 4.593062041360907, "step": 13770 }, { "distill_loss": 0.26395130157470703, "epoch": 4.593062041360907, "step": 13770 }, { "epoch": 4.593062041360907, "ref_ce_loss": 0.18300709128379822, "step": 13770 }, { "epoch": 4.593062041360907, "loss": 0.7681661248207092, "step": 13770 }, { "ce_loss": 0.23117798566818237, "epoch": 4.593062041360907, "step": 13770 }, { "distill_loss": 0.36607617139816284, "epoch": 4.593062041360907, "step": 13770 }, { "epoch": 4.593062041360907, "ref_ce_loss": 0.17063121497631073, "step": 13770 }, { "epoch": 4.596397598398933, "loss": 0.8961, "step": 13780 }, { "epoch": 4.596397598398933, "grad_norm": 2.5398452281951904, "step": 13780 }, { "epoch": 4.596397598398933, "learning_rate": 0.0004713498675780871, "step": 13780 }, { "epoch": 4.596397598398933, "loss": 1.2317657470703125, "step": 13780 }, { "ce_loss": 0.21036627888679504, "epoch": 4.596397598398933, "step": 13780 }, { "distill_loss": 0.37952694296836853, "epoch": 4.596397598398933, "step": 13780 }, { "epoch": 4.596397598398933, "ref_ce_loss": 0.17117281258106232, "step": 13780 }, { "epoch": 4.596397598398933, "loss": 0.9808338284492493, "step": 13780 }, { "ce_loss": 0.16275614500045776, "epoch": 4.596397598398933, "step": 13780 }, { "distill_loss": 0.3171769082546234, "epoch": 4.596397598398933, "step": 13780 }, { "epoch": 4.596397598398933, "ref_ce_loss": 0.155286967754364, "step": 13780 }, { "epoch": 4.599733155436958, "loss": 0.8273, "step": 13790 }, { "epoch": 4.599733155436958, "grad_norm": 2.3886170387268066, "step": 13790 }, { "epoch": 4.599733155436958, "learning_rate": 0.0004709246252262178, "step": 13790 }, { "epoch": 4.599733155436958, "loss": 0.9716641902923584, "step": 13790 }, { "ce_loss": 0.2618649899959564, "epoch": 4.599733155436958, "step": 13790 }, { "distill_loss": 0.4601925313472748, "epoch": 4.599733155436958, "step": 13790 }, { "epoch": 4.599733155436958, "ref_ce_loss": 0.205724835395813, "step": 13790 }, { "epoch": 4.599733155436958, "loss": 0.7593345642089844, "step": 13790 }, { "ce_loss": 0.1982172429561615, "epoch": 4.599733155436958, "step": 13790 }, { "distill_loss": 0.35267505049705505, "epoch": 4.599733155436958, "step": 13790 }, { "epoch": 4.599733155436958, "ref_ce_loss": 0.15899710357189178, "step": 13790 }, { "epoch": 4.603068712474983, "loss": 0.9017, "step": 13800 }, { "epoch": 4.603068712474983, "grad_norm": 1.610485315322876, "step": 13800 }, { "epoch": 4.603068712474983, "learning_rate": 0.00047049930009767884, "step": 13800 }, { "epoch": 4.603068712474983, "loss": 0.6228737235069275, "step": 13800 }, { "ce_loss": 0.1588118076324463, "epoch": 4.603068712474983, "step": 13800 }, { "distill_loss": 0.22418899834156036, "epoch": 4.603068712474983, "step": 13800 }, { "epoch": 4.603068712474983, "ref_ce_loss": 0.13548356294631958, "step": 13800 }, { "epoch": 4.603068712474983, "loss": 1.1625922918319702, "step": 13800 }, { "ce_loss": 0.25314319133758545, "epoch": 4.603068712474983, "step": 13800 }, { "distill_loss": 0.3250918984413147, "epoch": 4.603068712474983, "step": 13800 }, { "epoch": 4.603068712474983, "ref_ce_loss": 0.22533030807971954, "step": 13800 }, { "epoch": 4.606404269513009, "loss": 0.8051, "step": 13810 }, { "epoch": 4.606404269513009, "grad_norm": 1.8878289461135864, "step": 13810 }, { "epoch": 4.606404269513009, "learning_rate": 0.00047007389268887085, "step": 13810 }, { "epoch": 4.606404269513009, "loss": 1.0144697427749634, "step": 13810 }, { "ce_loss": 0.258879691362381, "epoch": 4.606404269513009, "step": 13810 }, { "distill_loss": 0.35777097940444946, "epoch": 4.606404269513009, "step": 13810 }, { "epoch": 4.606404269513009, "ref_ce_loss": 0.14117321372032166, "step": 13810 }, { "epoch": 4.606404269513009, "loss": 0.8026508688926697, "step": 13810 }, { "ce_loss": 0.252027302980423, "epoch": 4.606404269513009, "step": 13810 }, { "distill_loss": 0.354375422000885, "epoch": 4.606404269513009, "step": 13810 }, { "epoch": 4.606404269513009, "ref_ce_loss": 0.14750243723392487, "step": 13810 }, { "epoch": 4.609739826551034, "loss": 0.8519, "step": 13820 }, { "epoch": 4.609739826551034, "grad_norm": 2.351003646850586, "step": 13820 }, { "epoch": 4.609739826551034, "learning_rate": 0.0004696484034962896, "step": 13820 }, { "epoch": 4.609739826551034, "loss": 0.7515314221382141, "step": 13820 }, { "ce_loss": 0.20121333003044128, "epoch": 4.609739826551034, "step": 13820 }, { "distill_loss": 0.30150169134140015, "epoch": 4.609739826551034, "step": 13820 }, { "epoch": 4.609739826551034, "ref_ce_loss": 0.19240356981754303, "step": 13820 }, { "epoch": 4.609739826551034, "loss": 0.5818171501159668, "step": 13820 }, { "ce_loss": 0.12394590675830841, "epoch": 4.609739826551034, "step": 13820 }, { "distill_loss": 0.3400002419948578, "epoch": 4.609739826551034, "step": 13820 }, { "epoch": 4.609739826551034, "ref_ce_loss": 0.11776699125766754, "step": 13820 }, { "epoch": 4.613075383589059, "loss": 0.7783, "step": 13830 }, { "epoch": 4.613075383589059, "grad_norm": 1.6915327310562134, "step": 13830 }, { "epoch": 4.613075383589059, "learning_rate": 0.00046922283301652716, "step": 13830 }, { "epoch": 4.613075383589059, "loss": 0.8585334420204163, "step": 13830 }, { "ce_loss": 0.2418600618839264, "epoch": 4.613075383589059, "step": 13830 }, { "distill_loss": 0.40377673506736755, "epoch": 4.613075383589059, "step": 13830 }, { "epoch": 4.613075383589059, "ref_ce_loss": 0.17398446798324585, "step": 13830 }, { "epoch": 4.613075383589059, "loss": 0.8019068241119385, "step": 13830 }, { "ce_loss": 0.18170292675495148, "epoch": 4.613075383589059, "step": 13830 }, { "distill_loss": 0.3564358353614807, "epoch": 4.613075383589059, "step": 13830 }, { "epoch": 4.613075383589059, "ref_ce_loss": 0.1549520045518875, "step": 13830 }, { "epoch": 4.616410940627085, "loss": 0.9287, "step": 13840 }, { "epoch": 4.616410940627085, "grad_norm": 1.9859334230422974, "step": 13840 }, { "epoch": 4.616410940627085, "learning_rate": 0.0004687971817462698, "step": 13840 }, { "epoch": 4.616410940627085, "loss": 0.5590543150901794, "step": 13840 }, { "ce_loss": 0.14512279629707336, "epoch": 4.616410940627085, "step": 13840 }, { "distill_loss": 0.2831781208515167, "epoch": 4.616410940627085, "step": 13840 }, { "epoch": 4.616410940627085, "ref_ce_loss": 0.13064338266849518, "step": 13840 }, { "epoch": 4.616410940627085, "loss": 0.8300645351409912, "step": 13840 }, { "ce_loss": 0.18325687944889069, "epoch": 4.616410940627085, "step": 13840 }, { "distill_loss": 0.331007182598114, "epoch": 4.616410940627085, "step": 13840 }, { "epoch": 4.616410940627085, "ref_ce_loss": 0.17050574719905853, "step": 13840 }, { "epoch": 4.61974649766511, "loss": 0.8127, "step": 13850 }, { "epoch": 4.61974649766511, "grad_norm": 1.4375276565551758, "step": 13850 }, { "epoch": 4.61974649766511, "learning_rate": 0.00046837145018229854, "step": 13850 }, { "epoch": 4.61974649766511, "loss": 0.8129776120185852, "step": 13850 }, { "ce_loss": 0.16128899157047272, "epoch": 4.61974649766511, "step": 13850 }, { "distill_loss": 0.4503794014453888, "epoch": 4.61974649766511, "step": 13850 }, { "epoch": 4.61974649766511, "ref_ce_loss": 0.18895433843135834, "step": 13850 }, { "epoch": 4.61974649766511, "loss": 0.7150787115097046, "step": 13850 }, { "ce_loss": 0.1600542962551117, "epoch": 4.61974649766511, "step": 13850 }, { "distill_loss": 0.38444283604621887, "epoch": 4.61974649766511, "step": 13850 }, { "epoch": 4.61974649766511, "ref_ce_loss": 0.1291278898715973, "step": 13850 }, { "epoch": 4.6230820547031355, "loss": 0.9294, "step": 13860 }, { "epoch": 4.6230820547031355, "grad_norm": 1.7123420238494873, "step": 13860 }, { "epoch": 4.6230820547031355, "learning_rate": 0.0004679456388214877, "step": 13860 }, { "epoch": 4.6230820547031355, "loss": 1.0135622024536133, "step": 13860 }, { "ce_loss": 0.1946658492088318, "epoch": 4.6230820547031355, "step": 13860 }, { "distill_loss": 0.4028632938861847, "epoch": 4.6230820547031355, "step": 13860 }, { "epoch": 4.6230820547031355, "ref_ce_loss": 0.16513720154762268, "step": 13860 }, { "epoch": 4.6230820547031355, "loss": 0.6947537660598755, "step": 13860 }, { "ce_loss": 0.17143045365810394, "epoch": 4.6230820547031355, "step": 13860 }, { "distill_loss": 0.3919539451599121, "epoch": 4.6230820547031355, "step": 13860 }, { "epoch": 4.6230820547031355, "ref_ce_loss": 0.13105444610118866, "step": 13860 }, { "epoch": 4.626417611741161, "loss": 0.8752, "step": 13870 }, { "epoch": 4.626417611741161, "grad_norm": 2.3874919414520264, "step": 13870 }, { "epoch": 4.626417611741161, "learning_rate": 0.0004675197481608054, "step": 13870 }, { "epoch": 4.626417611741161, "loss": 0.9981850385665894, "step": 13870 }, { "ce_loss": 0.2395416498184204, "epoch": 4.626417611741161, "step": 13870 }, { "distill_loss": 0.35958918929100037, "epoch": 4.626417611741161, "step": 13870 }, { "epoch": 4.626417611741161, "ref_ce_loss": 0.14545199275016785, "step": 13870 }, { "epoch": 4.626417611741161, "loss": 0.546391487121582, "step": 13870 }, { "ce_loss": 0.14458729326725006, "epoch": 4.626417611741161, "step": 13870 }, { "distill_loss": 0.2774547338485718, "epoch": 4.626417611741161, "step": 13870 }, { "epoch": 4.626417611741161, "ref_ce_loss": 0.1227763444185257, "step": 13870 }, { "epoch": 4.629753168779186, "loss": 0.8994, "step": 13880 }, { "epoch": 4.629753168779186, "grad_norm": 3.77695894241333, "step": 13880 }, { "epoch": 4.629753168779186, "learning_rate": 0.0004670937786973112, "step": 13880 }, { "epoch": 4.629753168779186, "loss": 0.9475411176681519, "step": 13880 }, { "ce_loss": 0.2959573268890381, "epoch": 4.629753168779186, "step": 13880 }, { "distill_loss": 0.3495613932609558, "epoch": 4.629753168779186, "step": 13880 }, { "epoch": 4.629753168779186, "ref_ce_loss": 0.1935998499393463, "step": 13880 }, { "epoch": 4.629753168779186, "loss": 0.7587097883224487, "step": 13880 }, { "ce_loss": 0.23118411004543304, "epoch": 4.629753168779186, "step": 13880 }, { "distill_loss": 0.34271347522735596, "epoch": 4.629753168779186, "step": 13880 }, { "epoch": 4.629753168779186, "ref_ce_loss": 0.1349596381187439, "step": 13880 }, { "epoch": 4.6330887258172115, "loss": 0.8409, "step": 13890 }, { "epoch": 4.6330887258172115, "grad_norm": 1.8643723726272583, "step": 13890 }, { "epoch": 4.6330887258172115, "learning_rate": 0.00046666773092815793, "step": 13890 }, { "epoch": 4.6330887258172115, "loss": 0.858877420425415, "step": 13890 }, { "ce_loss": 0.19074980914592743, "epoch": 4.6330887258172115, "step": 13890 }, { "distill_loss": 0.40514233708381653, "epoch": 4.6330887258172115, "step": 13890 }, { "epoch": 4.6330887258172115, "ref_ce_loss": 0.14177602529525757, "step": 13890 }, { "epoch": 4.6330887258172115, "loss": 0.6139600872993469, "step": 13890 }, { "ce_loss": 0.15172399580478668, "epoch": 4.6330887258172115, "step": 13890 }, { "distill_loss": 0.27114883065223694, "epoch": 4.6330887258172115, "step": 13890 }, { "epoch": 4.6330887258172115, "ref_ce_loss": 0.15620894730091095, "step": 13890 }, { "epoch": 4.636424282855237, "loss": 0.8575, "step": 13900 }, { "epoch": 4.636424282855237, "grad_norm": 1.9550460577011108, "step": 13900 }, { "epoch": 4.636424282855237, "learning_rate": 0.0004662416053505888, "step": 13900 }, { "epoch": 4.636424282855237, "loss": 1.3733259439468384, "step": 13900 }, { "ce_loss": 0.2609739303588867, "epoch": 4.636424282855237, "step": 13900 }, { "distill_loss": 0.39709389209747314, "epoch": 4.636424282855237, "step": 13900 }, { "epoch": 4.636424282855237, "ref_ce_loss": 0.19748690724372864, "step": 13900 }, { "epoch": 4.636424282855237, "loss": 0.9573059678077698, "step": 13900 }, { "ce_loss": 0.2724902927875519, "epoch": 4.636424282855237, "step": 13900 }, { "distill_loss": 0.3900492191314697, "epoch": 4.636424282855237, "step": 13900 }, { "epoch": 4.636424282855237, "ref_ce_loss": 0.1998416632413864, "step": 13900 }, { "epoch": 4.639759839893262, "loss": 0.8736, "step": 13910 }, { "epoch": 4.639759839893262, "grad_norm": 3.21980357170105, "step": 13910 }, { "epoch": 4.639759839893262, "learning_rate": 0.00046581540246193846, "step": 13910 }, { "epoch": 4.639759839893262, "loss": 0.7142580151557922, "step": 13910 }, { "ce_loss": 0.20784704387187958, "epoch": 4.639759839893262, "step": 13910 }, { "distill_loss": 0.35620594024658203, "epoch": 4.639759839893262, "step": 13910 }, { "epoch": 4.639759839893262, "ref_ce_loss": 0.14930930733680725, "step": 13910 }, { "epoch": 4.639759839893262, "loss": 0.96323561668396, "step": 13910 }, { "ce_loss": 0.25436338782310486, "epoch": 4.639759839893262, "step": 13910 }, { "distill_loss": 0.3177351951599121, "epoch": 4.639759839893262, "step": 13910 }, { "epoch": 4.639759839893262, "ref_ce_loss": 0.1559751331806183, "step": 13910 }, { "epoch": 4.643095396931288, "loss": 0.8428, "step": 13920 }, { "epoch": 4.643095396931288, "grad_norm": 2.507951259613037, "step": 13920 }, { "epoch": 4.643095396931288, "learning_rate": 0.0004653891227596313, "step": 13920 }, { "epoch": 4.643095396931288, "loss": 0.7738920450210571, "step": 13920 }, { "ce_loss": 0.1673024594783783, "epoch": 4.643095396931288, "step": 13920 }, { "distill_loss": 0.32107099890708923, "epoch": 4.643095396931288, "step": 13920 }, { "epoch": 4.643095396931288, "ref_ce_loss": 0.15293675661087036, "step": 13920 }, { "epoch": 4.643095396931288, "loss": 0.831015944480896, "step": 13920 }, { "ce_loss": 0.2569162845611572, "epoch": 4.643095396931288, "step": 13920 }, { "distill_loss": 0.31806325912475586, "epoch": 4.643095396931288, "step": 13920 }, { "epoch": 4.643095396931288, "ref_ce_loss": 0.20909073948860168, "step": 13920 }, { "epoch": 4.646430953969313, "loss": 0.845, "step": 13930 }, { "epoch": 4.646430953969313, "grad_norm": 1.8366520404815674, "step": 13930 }, { "epoch": 4.646430953969313, "learning_rate": 0.00046496276674118175, "step": 13930 }, { "epoch": 4.646430953969313, "loss": 0.7713529467582703, "step": 13930 }, { "ce_loss": 0.2573528587818146, "epoch": 4.646430953969313, "step": 13930 }, { "distill_loss": 0.31989189982414246, "epoch": 4.646430953969313, "step": 13930 }, { "epoch": 4.646430953969313, "ref_ce_loss": 0.15530826151371002, "step": 13930 }, { "epoch": 4.646430953969313, "loss": 0.7456807494163513, "step": 13930 }, { "ce_loss": 0.2011760026216507, "epoch": 4.646430953969313, "step": 13930 }, { "distill_loss": 0.2997419536113739, "epoch": 4.646430953969313, "step": 13930 }, { "epoch": 4.646430953969313, "ref_ce_loss": 0.14415857195854187, "step": 13930 }, { "epoch": 4.649766511007338, "loss": 0.792, "step": 13940 }, { "epoch": 4.649766511007338, "grad_norm": 4.883190631866455, "step": 13940 }, { "epoch": 4.649766511007338, "learning_rate": 0.000464536334904193, "step": 13940 }, { "epoch": 4.649766511007338, "loss": 0.7827562093734741, "step": 13940 }, { "ce_loss": 0.20156757533550262, "epoch": 4.649766511007338, "step": 13940 }, { "distill_loss": 0.36345911026000977, "epoch": 4.649766511007338, "step": 13940 }, { "epoch": 4.649766511007338, "ref_ce_loss": 0.09804141521453857, "step": 13940 }, { "epoch": 4.649766511007338, "loss": 0.8018506169319153, "step": 13940 }, { "ce_loss": 0.2580086886882782, "epoch": 4.649766511007338, "step": 13940 }, { "distill_loss": 0.291150838136673, "epoch": 4.649766511007338, "step": 13940 }, { "epoch": 4.649766511007338, "ref_ce_loss": 0.20864073932170868, "step": 13940 }, { "epoch": 4.653102068045364, "loss": 0.8359, "step": 13950 }, { "epoch": 4.653102068045364, "grad_norm": 1.4981822967529297, "step": 13950 }, { "epoch": 4.653102068045364, "learning_rate": 0.0004641098277463573, "step": 13950 }, { "epoch": 4.653102068045364, "loss": 0.8859256505966187, "step": 13950 }, { "ce_loss": 0.14007827639579773, "epoch": 4.653102068045364, "step": 13950 }, { "distill_loss": 0.2375439703464508, "epoch": 4.653102068045364, "step": 13950 }, { "epoch": 4.653102068045364, "ref_ce_loss": 0.14340248703956604, "step": 13950 }, { "epoch": 4.653102068045364, "loss": 0.9954640865325928, "step": 13950 }, { "ce_loss": 0.22668717801570892, "epoch": 4.653102068045364, "step": 13950 }, { "distill_loss": 0.3906228542327881, "epoch": 4.653102068045364, "step": 13950 }, { "epoch": 4.653102068045364, "ref_ce_loss": 0.14904038608074188, "step": 13950 }, { "epoch": 4.656437625083389, "loss": 0.8711, "step": 13960 }, { "epoch": 4.656437625083389, "grad_norm": 2.03786039352417, "step": 13960 }, { "epoch": 4.656437625083389, "learning_rate": 0.00046368324576545394, "step": 13960 }, { "epoch": 4.656437625083389, "loss": 0.891573429107666, "step": 13960 }, { "ce_loss": 0.20240835845470428, "epoch": 4.656437625083389, "step": 13960 }, { "distill_loss": 0.2832994759082794, "epoch": 4.656437625083389, "step": 13960 }, { "epoch": 4.656437625083389, "ref_ce_loss": 0.18917511403560638, "step": 13960 }, { "epoch": 4.656437625083389, "loss": 1.0462068319320679, "step": 13960 }, { "ce_loss": 0.29488521814346313, "epoch": 4.656437625083389, "step": 13960 }, { "distill_loss": 0.4123867154121399, "epoch": 4.656437625083389, "step": 13960 }, { "epoch": 4.656437625083389, "ref_ce_loss": 0.1961653083562851, "step": 13960 }, { "epoch": 4.659773182121414, "loss": 0.8192, "step": 13970 }, { "epoch": 4.659773182121414, "grad_norm": 1.4951426982879639, "step": 13970 }, { "epoch": 4.659773182121414, "learning_rate": 0.0004632565894593502, "step": 13970 }, { "epoch": 4.659773182121414, "loss": 0.8212860822677612, "step": 13970 }, { "ce_loss": 0.21576517820358276, "epoch": 4.659773182121414, "step": 13970 }, { "distill_loss": 0.31486424803733826, "epoch": 4.659773182121414, "step": 13970 }, { "epoch": 4.659773182121414, "ref_ce_loss": 0.16315826773643494, "step": 13970 }, { "epoch": 4.659773182121414, "loss": 0.5121589303016663, "step": 13970 }, { "ce_loss": 0.13231956958770752, "epoch": 4.659773182121414, "step": 13970 }, { "distill_loss": 0.21873025596141815, "epoch": 4.659773182121414, "step": 13970 }, { "epoch": 4.659773182121414, "ref_ce_loss": 0.15629905462265015, "step": 13970 }, { "epoch": 4.66310873915944, "loss": 0.8827, "step": 13980 }, { "epoch": 4.66310873915944, "grad_norm": 1.7232520580291748, "step": 13980 }, { "epoch": 4.66310873915944, "learning_rate": 0.0004628298593259999, "step": 13980 }, { "epoch": 4.66310873915944, "loss": 0.7036920785903931, "step": 13980 }, { "ce_loss": 0.15738870203495026, "epoch": 4.66310873915944, "step": 13980 }, { "distill_loss": 0.36597204208374023, "epoch": 4.66310873915944, "step": 13980 }, { "epoch": 4.66310873915944, "ref_ce_loss": 0.14112482964992523, "step": 13980 }, { "epoch": 4.66310873915944, "loss": 0.6786432266235352, "step": 13980 }, { "ce_loss": 0.17833980917930603, "epoch": 4.66310873915944, "step": 13980 }, { "distill_loss": 0.3547556698322296, "epoch": 4.66310873915944, "step": 13980 }, { "epoch": 4.66310873915944, "ref_ce_loss": 0.14487062394618988, "step": 13980 }, { "epoch": 4.666444296197465, "loss": 0.8998, "step": 13990 }, { "epoch": 4.666444296197465, "grad_norm": 2.2571568489074707, "step": 13990 }, { "epoch": 4.666444296197465, "learning_rate": 0.0004624030558634429, "step": 13990 }, { "epoch": 4.666444296197465, "loss": 0.7320971488952637, "step": 13990 }, { "ce_loss": 0.20538297295570374, "epoch": 4.666444296197465, "step": 13990 }, { "distill_loss": 0.29730069637298584, "epoch": 4.666444296197465, "step": 13990 }, { "epoch": 4.666444296197465, "ref_ce_loss": 0.18843059241771698, "step": 13990 }, { "epoch": 4.666444296197465, "loss": 0.8136546015739441, "step": 13990 }, { "ce_loss": 0.2071070671081543, "epoch": 4.666444296197465, "step": 13990 }, { "distill_loss": 0.3963843286037445, "epoch": 4.666444296197465, "step": 13990 }, { "epoch": 4.666444296197465, "ref_ce_loss": 0.20945298671722412, "step": 13990 }, { "epoch": 4.66977985323549, "loss": 0.8314, "step": 14000 }, { "epoch": 4.66977985323549, "grad_norm": 1.6027140617370605, "step": 14000 }, { "epoch": 4.66977985323549, "learning_rate": 0.00046197617956980505, "step": 14000 }, { "epoch": 4.66977985323549, "loss": 0.904109001159668, "step": 14000 }, { "ce_loss": 0.2536281943321228, "epoch": 4.66977985323549, "step": 14000 }, { "distill_loss": 0.39303573966026306, "epoch": 4.66977985323549, "step": 14000 }, { "epoch": 4.66977985323549, "ref_ce_loss": 0.15961523354053497, "step": 14000 }, { "epoch": 4.66977985323549, "loss": 0.8013550043106079, "step": 14000 }, { "ce_loss": 0.21332262456417084, "epoch": 4.66977985323549, "step": 14000 }, { "distill_loss": 0.33780455589294434, "epoch": 4.66977985323549, "step": 14000 }, { "epoch": 4.66977985323549, "ref_ce_loss": 0.14180517196655273, "step": 14000 }, { "epoch": 4.673115410273516, "loss": 0.8655, "step": 14010 }, { "epoch": 4.673115410273516, "grad_norm": 1.8766305446624756, "step": 14010 }, { "epoch": 4.673115410273516, "learning_rate": 0.00046154923094329656, "step": 14010 }, { "epoch": 4.673115410273516, "loss": 1.1529908180236816, "step": 14010 }, { "ce_loss": 0.18356235325336456, "epoch": 4.673115410273516, "step": 14010 }, { "distill_loss": 0.3325393795967102, "epoch": 4.673115410273516, "step": 14010 }, { "epoch": 4.673115410273516, "ref_ce_loss": 0.13770374655723572, "step": 14010 }, { "epoch": 4.673115410273516, "loss": 0.7322327494621277, "step": 14010 }, { "ce_loss": 0.18772105872631073, "epoch": 4.673115410273516, "step": 14010 }, { "distill_loss": 0.365714967250824, "epoch": 4.673115410273516, "step": 14010 }, { "epoch": 4.673115410273516, "ref_ce_loss": 0.17847004532814026, "step": 14010 }, { "epoch": 4.676450967311541, "loss": 0.8874, "step": 14020 }, { "epoch": 4.676450967311541, "grad_norm": 2.3067870140075684, "step": 14020 }, { "epoch": 4.676450967311541, "learning_rate": 0.00046112221048221267, "step": 14020 }, { "epoch": 4.676450967311541, "loss": 0.5756610035896301, "step": 14020 }, { "ce_loss": 0.1752990335226059, "epoch": 4.676450967311541, "step": 14020 }, { "distill_loss": 0.26576200127601624, "epoch": 4.676450967311541, "step": 14020 }, { "epoch": 4.676450967311541, "ref_ce_loss": 0.1171606183052063, "step": 14020 }, { "epoch": 4.676450967311541, "loss": 0.9301656484603882, "step": 14020 }, { "ce_loss": 0.2776738107204437, "epoch": 4.676450967311541, "step": 14020 }, { "distill_loss": 0.2842113971710205, "epoch": 4.676450967311541, "step": 14020 }, { "epoch": 4.676450967311541, "ref_ce_loss": 0.20117861032485962, "step": 14020 }, { "epoch": 4.679786524349566, "loss": 0.8123, "step": 14030 }, { "epoch": 4.679786524349566, "grad_norm": 2.325228214263916, "step": 14030 }, { "epoch": 4.679786524349566, "learning_rate": 0.00046069511868493206, "step": 14030 }, { "epoch": 4.679786524349566, "loss": 0.8015709519386292, "step": 14030 }, { "ce_loss": 0.20850978791713715, "epoch": 4.679786524349566, "step": 14030 }, { "distill_loss": 0.34072285890579224, "epoch": 4.679786524349566, "step": 14030 }, { "epoch": 4.679786524349566, "ref_ce_loss": 0.17537088692188263, "step": 14030 }, { "epoch": 4.679786524349566, "loss": 0.682460367679596, "step": 14030 }, { "ce_loss": 0.14392144978046417, "epoch": 4.679786524349566, "step": 14030 }, { "distill_loss": 0.3384716808795929, "epoch": 4.679786524349566, "step": 14030 }, { "epoch": 4.679786524349566, "ref_ce_loss": 0.13397617638111115, "step": 14030 }, { "epoch": 4.683122081387592, "loss": 0.8589, "step": 14040 }, { "epoch": 4.683122081387592, "grad_norm": 2.0175516605377197, "step": 14040 }, { "epoch": 4.683122081387592, "learning_rate": 0.00046026795604991685, "step": 14040 }, { "epoch": 4.683122081387592, "loss": 0.7632008790969849, "step": 14040 }, { "ce_loss": 0.22315922379493713, "epoch": 4.683122081387592, "step": 14040 }, { "distill_loss": 0.3759915232658386, "epoch": 4.683122081387592, "step": 14040 }, { "epoch": 4.683122081387592, "ref_ce_loss": 0.16336357593536377, "step": 14040 }, { "epoch": 4.683122081387592, "loss": 0.9702370166778564, "step": 14040 }, { "ce_loss": 0.30179935693740845, "epoch": 4.683122081387592, "step": 14040 }, { "distill_loss": 0.41837430000305176, "epoch": 4.683122081387592, "step": 14040 }, { "epoch": 4.683122081387592, "ref_ce_loss": 0.20655812323093414, "step": 14040 }, { "epoch": 4.686457638425617, "loss": 0.8679, "step": 14050 }, { "epoch": 4.686457638425617, "grad_norm": 1.6562148332595825, "step": 14050 }, { "epoch": 4.686457638425617, "learning_rate": 0.00045984072307571187, "step": 14050 }, { "epoch": 4.686457638425617, "loss": 0.7191571593284607, "step": 14050 }, { "ce_loss": 0.16404734551906586, "epoch": 4.686457638425617, "step": 14050 }, { "distill_loss": 0.3208382725715637, "epoch": 4.686457638425617, "step": 14050 }, { "epoch": 4.686457638425617, "ref_ce_loss": 0.11867458373308182, "step": 14050 }, { "epoch": 4.686457638425617, "loss": 1.045770525932312, "step": 14050 }, { "ce_loss": 0.23788468539714813, "epoch": 4.686457638425617, "step": 14050 }, { "distill_loss": 0.41924071311950684, "epoch": 4.686457638425617, "step": 14050 }, { "epoch": 4.686457638425617, "ref_ce_loss": 0.18163149058818817, "step": 14050 }, { "epoch": 4.6897931954636425, "loss": 0.8991, "step": 14060 }, { "epoch": 4.6897931954636425, "grad_norm": 2.0935001373291016, "step": 14060 }, { "epoch": 4.6897931954636425, "learning_rate": 0.000459413420260944, "step": 14060 }, { "epoch": 4.6897931954636425, "loss": 1.0229648351669312, "step": 14060 }, { "ce_loss": 0.22401516139507294, "epoch": 4.6897931954636425, "step": 14060 }, { "distill_loss": 0.38364315032958984, "epoch": 4.6897931954636425, "step": 14060 }, { "epoch": 4.6897931954636425, "ref_ce_loss": 0.2235155701637268, "step": 14060 }, { "epoch": 4.6897931954636425, "loss": 1.130496621131897, "step": 14060 }, { "ce_loss": 0.25606226921081543, "epoch": 4.6897931954636425, "step": 14060 }, { "distill_loss": 0.359218567609787, "epoch": 4.6897931954636425, "step": 14060 }, { "epoch": 4.6897931954636425, "ref_ce_loss": 0.167112797498703, "step": 14060 }, { "epoch": 4.693128752501668, "loss": 0.8388, "step": 14070 }, { "epoch": 4.693128752501668, "grad_norm": 1.860084056854248, "step": 14070 }, { "epoch": 4.693128752501668, "learning_rate": 0.0004589860481043215, "step": 14070 }, { "epoch": 4.693128752501668, "loss": 0.8229884505271912, "step": 14070 }, { "ce_loss": 0.25853830575942993, "epoch": 4.693128752501668, "step": 14070 }, { "distill_loss": 0.35701021552085876, "epoch": 4.693128752501668, "step": 14070 }, { "epoch": 4.693128752501668, "ref_ce_loss": 0.20571230351924896, "step": 14070 }, { "epoch": 4.693128752501668, "loss": 1.2947938442230225, "step": 14070 }, { "ce_loss": 0.2600174844264984, "epoch": 4.693128752501668, "step": 14070 }, { "distill_loss": 0.4015241265296936, "epoch": 4.693128752501668, "step": 14070 }, { "epoch": 4.693128752501668, "ref_ce_loss": 0.22245274484157562, "step": 14070 }, { "epoch": 4.696464309539693, "loss": 0.8792, "step": 14080 }, { "epoch": 4.696464309539693, "grad_norm": 2.23905611038208, "step": 14080 }, { "epoch": 4.696464309539693, "learning_rate": 0.00045855860710463373, "step": 14080 }, { "epoch": 4.696464309539693, "loss": 0.863398015499115, "step": 14080 }, { "ce_loss": 0.19016389548778534, "epoch": 4.696464309539693, "step": 14080 }, { "distill_loss": 0.36668360233306885, "epoch": 4.696464309539693, "step": 14080 }, { "epoch": 4.696464309539693, "ref_ce_loss": 0.17019394040107727, "step": 14080 }, { "epoch": 4.696464309539693, "loss": 0.660220742225647, "step": 14080 }, { "ce_loss": 0.14625494182109833, "epoch": 4.696464309539693, "step": 14080 }, { "distill_loss": 0.37596479058265686, "epoch": 4.696464309539693, "step": 14080 }, { "epoch": 4.696464309539693, "ref_ce_loss": 0.11250553280115128, "step": 14080 }, { "epoch": 4.6997998665777185, "loss": 0.81, "step": 14090 }, { "epoch": 4.6997998665777185, "grad_norm": 1.3756694793701172, "step": 14090 }, { "epoch": 4.6997998665777185, "learning_rate": 0.0004581310977607502, "step": 14090 }, { "epoch": 4.6997998665777185, "loss": 0.7182285785675049, "step": 14090 }, { "ce_loss": 0.16865786910057068, "epoch": 4.6997998665777185, "step": 14090 }, { "distill_loss": 0.28787916898727417, "epoch": 4.6997998665777185, "step": 14090 }, { "epoch": 4.6997998665777185, "ref_ce_loss": 0.1471586972475052, "step": 14090 }, { "epoch": 4.6997998665777185, "loss": 0.7447019219398499, "step": 14090 }, { "ce_loss": 0.20779182016849518, "epoch": 4.6997998665777185, "step": 14090 }, { "distill_loss": 0.3117561638355255, "epoch": 4.6997998665777185, "step": 14090 }, { "epoch": 4.6997998665777185, "ref_ce_loss": 0.1663566380739212, "step": 14090 }, { "epoch": 4.703135423615744, "loss": 0.8146, "step": 14100 }, { "epoch": 4.703135423615744, "grad_norm": 2.052716016769409, "step": 14100 }, { "epoch": 4.703135423615744, "learning_rate": 0.0004577035205716205, "step": 14100 }, { "epoch": 4.703135423615744, "loss": 0.610322117805481, "step": 14100 }, { "ce_loss": 0.18214602768421173, "epoch": 4.703135423615744, "step": 14100 }, { "distill_loss": 0.2763131856918335, "epoch": 4.703135423615744, "step": 14100 }, { "epoch": 4.703135423615744, "ref_ce_loss": 0.15158452093601227, "step": 14100 }, { "epoch": 4.703135423615744, "loss": 1.021213173866272, "step": 14100 }, { "ce_loss": 0.18335743248462677, "epoch": 4.703135423615744, "step": 14100 }, { "distill_loss": 0.3717056214809418, "epoch": 4.703135423615744, "step": 14100 }, { "epoch": 4.703135423615744, "ref_ce_loss": 0.13745620846748352, "step": 14100 }, { "epoch": 4.706470980653769, "loss": 0.8365, "step": 14110 }, { "epoch": 4.706470980653769, "grad_norm": 2.3778693675994873, "step": 14110 }, { "epoch": 4.706470980653769, "learning_rate": 0.0004572758760362731, "step": 14110 }, { "epoch": 4.706470980653769, "loss": 0.8436442017555237, "step": 14110 }, { "ce_loss": 0.23669615387916565, "epoch": 4.706470980653769, "step": 14110 }, { "distill_loss": 0.3541877269744873, "epoch": 4.706470980653769, "step": 14110 }, { "epoch": 4.706470980653769, "ref_ce_loss": 0.13591575622558594, "step": 14110 }, { "epoch": 4.706470980653769, "loss": 0.8931856155395508, "step": 14110 }, { "ce_loss": 0.1591729372739792, "epoch": 4.706470980653769, "step": 14110 }, { "distill_loss": 0.3652481436729431, "epoch": 4.706470980653769, "step": 14110 }, { "epoch": 4.706470980653769, "ref_ce_loss": 0.11494230479001999, "step": 14110 }, { "epoch": 4.709806537691795, "loss": 0.9286, "step": 14120 }, { "epoch": 4.709806537691795, "grad_norm": 1.803930401802063, "step": 14120 }, { "epoch": 4.709806537691795, "learning_rate": 0.00045684816465381525, "step": 14120 }, { "epoch": 4.709806537691795, "loss": 0.5938913226127625, "step": 14120 }, { "ce_loss": 0.19339902698993683, "epoch": 4.709806537691795, "step": 14120 }, { "distill_loss": 0.26114407181739807, "epoch": 4.709806537691795, "step": 14120 }, { "epoch": 4.709806537691795, "ref_ce_loss": 0.13884493708610535, "step": 14120 }, { "epoch": 4.709806537691795, "loss": 0.7062050104141235, "step": 14120 }, { "ce_loss": 0.15822601318359375, "epoch": 4.709806537691795, "step": 14120 }, { "distill_loss": 0.32899022102355957, "epoch": 4.709806537691795, "step": 14120 }, { "epoch": 4.709806537691795, "ref_ce_loss": 0.11663496494293213, "step": 14120 }, { "epoch": 4.71314209472982, "loss": 0.8309, "step": 14130 }, { "epoch": 4.71314209472982, "grad_norm": 2.1349706649780273, "step": 14130 }, { "epoch": 4.71314209472982, "learning_rate": 0.0004564203869234321, "step": 14130 }, { "epoch": 4.71314209472982, "loss": 0.9979802370071411, "step": 14130 }, { "ce_loss": 0.26847484707832336, "epoch": 4.71314209472982, "step": 14130 }, { "distill_loss": 0.39733004570007324, "epoch": 4.71314209472982, "step": 14130 }, { "epoch": 4.71314209472982, "ref_ce_loss": 0.17914006114006042, "step": 14130 }, { "epoch": 4.71314209472982, "loss": 0.7394533157348633, "step": 14130 }, { "ce_loss": 0.1996060609817505, "epoch": 4.71314209472982, "step": 14130 }, { "distill_loss": 0.34057703614234924, "epoch": 4.71314209472982, "step": 14130 }, { "epoch": 4.71314209472982, "ref_ce_loss": 0.15275272727012634, "step": 14130 }, { "epoch": 4.716477651767845, "loss": 0.8551, "step": 14140 }, { "epoch": 4.716477651767845, "grad_norm": 1.858211874961853, "step": 14140 }, { "epoch": 4.716477651767845, "learning_rate": 0.0004559925433443864, "step": 14140 }, { "epoch": 4.716477651767845, "loss": 0.9450689554214478, "step": 14140 }, { "ce_loss": 0.22283834218978882, "epoch": 4.716477651767845, "step": 14140 }, { "distill_loss": 0.32096511125564575, "epoch": 4.716477651767845, "step": 14140 }, { "epoch": 4.716477651767845, "ref_ce_loss": 0.13867157697677612, "step": 14140 }, { "epoch": 4.716477651767845, "loss": 0.8570036888122559, "step": 14140 }, { "ce_loss": 0.2400035560131073, "epoch": 4.716477651767845, "step": 14140 }, { "distill_loss": 0.3585464060306549, "epoch": 4.716477651767845, "step": 14140 }, { "epoch": 4.716477651767845, "ref_ce_loss": 0.1861157864332199, "step": 14140 }, { "epoch": 4.719813208805871, "loss": 0.7722, "step": 14150 }, { "epoch": 4.719813208805871, "grad_norm": 3.83005952835083, "step": 14150 }, { "epoch": 4.719813208805871, "learning_rate": 0.0004555646344160174, "step": 14150 }, { "epoch": 4.719813208805871, "loss": 0.7848604321479797, "step": 14150 }, { "ce_loss": 0.22449667751789093, "epoch": 4.719813208805871, "step": 14150 }, { "distill_loss": 0.35835111141204834, "epoch": 4.719813208805871, "step": 14150 }, { "epoch": 4.719813208805871, "ref_ce_loss": 0.15118904411792755, "step": 14150 }, { "epoch": 4.719813208805871, "loss": 0.7087922096252441, "step": 14150 }, { "ce_loss": 0.1851268857717514, "epoch": 4.719813208805871, "step": 14150 }, { "distill_loss": 0.3466935455799103, "epoch": 4.719813208805871, "step": 14150 }, { "epoch": 4.719813208805871, "ref_ce_loss": 0.14285339415073395, "step": 14150 }, { "epoch": 4.723148765843896, "loss": 0.8637, "step": 14160 }, { "epoch": 4.723148765843896, "grad_norm": 2.271299123764038, "step": 14160 }, { "epoch": 4.723148765843896, "learning_rate": 0.0004551366606377412, "step": 14160 }, { "epoch": 4.723148765843896, "loss": 1.5590428113937378, "step": 14160 }, { "ce_loss": 0.26412802934646606, "epoch": 4.723148765843896, "step": 14160 }, { "distill_loss": 0.35552310943603516, "epoch": 4.723148765843896, "step": 14160 }, { "epoch": 4.723148765843896, "ref_ce_loss": 0.19402185082435608, "step": 14160 }, { "epoch": 4.723148765843896, "loss": 0.8642193078994751, "step": 14160 }, { "ce_loss": 0.22637015581130981, "epoch": 4.723148765843896, "step": 14160 }, { "distill_loss": 0.2978854179382324, "epoch": 4.723148765843896, "step": 14160 }, { "epoch": 4.723148765843896, "ref_ce_loss": 0.20083940029144287, "step": 14160 }, { "epoch": 4.726484322881921, "loss": 0.892, "step": 14170 }, { "epoch": 4.726484322881921, "grad_norm": 1.4375994205474854, "step": 14170 }, { "epoch": 4.726484322881921, "learning_rate": 0.00045470862250904904, "step": 14170 }, { "epoch": 4.726484322881921, "loss": 0.8493502140045166, "step": 14170 }, { "ce_loss": 0.1954825073480606, "epoch": 4.726484322881921, "step": 14170 }, { "distill_loss": 0.31222012639045715, "epoch": 4.726484322881921, "step": 14170 }, { "epoch": 4.726484322881921, "ref_ce_loss": 0.2064986526966095, "step": 14170 }, { "epoch": 4.726484322881921, "loss": 0.8059635758399963, "step": 14170 }, { "ce_loss": 0.20247094333171844, "epoch": 4.726484322881921, "step": 14170 }, { "distill_loss": 0.30323803424835205, "epoch": 4.726484322881921, "step": 14170 }, { "epoch": 4.726484322881921, "ref_ce_loss": 0.1978103071451187, "step": 14170 }, { "epoch": 4.729819879919947, "loss": 0.8039, "step": 14180 }, { "epoch": 4.729819879919947, "grad_norm": 1.7913191318511963, "step": 14180 }, { "epoch": 4.729819879919947, "learning_rate": 0.00045428052052950757, "step": 14180 }, { "epoch": 4.729819879919947, "loss": 1.217488408088684, "step": 14180 }, { "ce_loss": 0.28452906012535095, "epoch": 4.729819879919947, "step": 14180 }, { "distill_loss": 0.4151560366153717, "epoch": 4.729819879919947, "step": 14180 }, { "epoch": 4.729819879919947, "ref_ce_loss": 0.20734523236751556, "step": 14180 }, { "epoch": 4.729819879919947, "loss": 0.6979823112487793, "step": 14180 }, { "ce_loss": 0.1885191649198532, "epoch": 4.729819879919947, "step": 14180 }, { "distill_loss": 0.32918426394462585, "epoch": 4.729819879919947, "step": 14180 }, { "epoch": 4.729819879919947, "ref_ce_loss": 0.1795302927494049, "step": 14180 }, { "epoch": 4.733155436957972, "loss": 0.8244, "step": 14190 }, { "epoch": 4.733155436957972, "grad_norm": 1.3256746530532837, "step": 14190 }, { "epoch": 4.733155436957972, "learning_rate": 0.00045385235519875775, "step": 14190 }, { "epoch": 4.733155436957972, "loss": 0.887203574180603, "step": 14190 }, { "ce_loss": 0.21348625421524048, "epoch": 4.733155436957972, "step": 14190 }, { "distill_loss": 0.3612407445907593, "epoch": 4.733155436957972, "step": 14190 }, { "epoch": 4.733155436957972, "ref_ce_loss": 0.20668892562389374, "step": 14190 }, { "epoch": 4.733155436957972, "loss": 0.7140733599662781, "step": 14190 }, { "ce_loss": 0.2096211165189743, "epoch": 4.733155436957972, "step": 14190 }, { "distill_loss": 0.3249552845954895, "epoch": 4.733155436957972, "step": 14190 }, { "epoch": 4.733155436957972, "ref_ce_loss": 0.1359350085258484, "step": 14190 }, { "epoch": 4.736490993995997, "loss": 0.838, "step": 14200 }, { "epoch": 4.736490993995997, "grad_norm": 2.408013105392456, "step": 14200 }, { "epoch": 4.736490993995997, "learning_rate": 0.0004534241270165147, "step": 14200 }, { "epoch": 4.736490993995997, "loss": 0.648318350315094, "step": 14200 }, { "ce_loss": 0.1675240397453308, "epoch": 4.736490993995997, "step": 14200 }, { "distill_loss": 0.3086930513381958, "epoch": 4.736490993995997, "step": 14200 }, { "epoch": 4.736490993995997, "ref_ce_loss": 0.13107258081436157, "step": 14200 }, { "epoch": 4.736490993995997, "loss": 0.633400559425354, "step": 14200 }, { "ce_loss": 0.11860974133014679, "epoch": 4.736490993995997, "step": 14200 }, { "distill_loss": 0.2925090491771698, "epoch": 4.736490993995997, "step": 14200 }, { "epoch": 4.736490993995997, "ref_ce_loss": 0.10717403143644333, "step": 14200 }, { "epoch": 4.739826551034023, "loss": 0.8424, "step": 14210 }, { "epoch": 4.739826551034023, "grad_norm": 1.4014778137207031, "step": 14210 }, { "epoch": 4.739826551034023, "learning_rate": 0.0004529958364825666, "step": 14210 }, { "epoch": 4.739826551034023, "loss": 0.7830230593681335, "step": 14210 }, { "ce_loss": 0.1478165090084076, "epoch": 4.739826551034023, "step": 14210 }, { "distill_loss": 0.25908103585243225, "epoch": 4.739826551034023, "step": 14210 }, { "epoch": 4.739826551034023, "ref_ce_loss": 0.14894166588783264, "step": 14210 }, { "epoch": 4.739826551034023, "loss": 0.7763957977294922, "step": 14210 }, { "ce_loss": 0.15796737372875214, "epoch": 4.739826551034023, "step": 14210 }, { "distill_loss": 0.3048602342605591, "epoch": 4.739826551034023, "step": 14210 }, { "epoch": 4.739826551034023, "ref_ce_loss": 0.1621614545583725, "step": 14210 }, { "epoch": 4.743162108072048, "loss": 0.9225, "step": 14220 }, { "epoch": 4.743162108072048, "grad_norm": 2.1950395107269287, "step": 14220 }, { "epoch": 4.743162108072048, "learning_rate": 0.00045256748409677495, "step": 14220 }, { "epoch": 4.743162108072048, "loss": 1.2173900604248047, "step": 14220 }, { "ce_loss": 0.23952677845954895, "epoch": 4.743162108072048, "step": 14220 }, { "distill_loss": 0.3450341820716858, "epoch": 4.743162108072048, "step": 14220 }, { "epoch": 4.743162108072048, "ref_ce_loss": 0.2102057933807373, "step": 14220 }, { "epoch": 4.743162108072048, "loss": 1.120995044708252, "step": 14220 }, { "ce_loss": 0.15178608894348145, "epoch": 4.743162108072048, "step": 14220 }, { "distill_loss": 0.3196716606616974, "epoch": 4.743162108072048, "step": 14220 }, { "epoch": 4.743162108072048, "ref_ce_loss": 0.13504192233085632, "step": 14220 }, { "epoch": 4.746497665110073, "loss": 0.8293, "step": 14230 }, { "epoch": 4.746497665110073, "grad_norm": 7.004584789276123, "step": 14230 }, { "epoch": 4.746497665110073, "learning_rate": 0.00045213907035907274, "step": 14230 }, { "epoch": 4.746497665110073, "loss": 0.7378753423690796, "step": 14230 }, { "ce_loss": 0.20550483465194702, "epoch": 4.746497665110073, "step": 14230 }, { "distill_loss": 0.34348252415657043, "epoch": 4.746497665110073, "step": 14230 }, { "epoch": 4.746497665110073, "ref_ce_loss": 0.1843615621328354, "step": 14230 }, { "epoch": 4.746497665110073, "loss": 0.7565779089927673, "step": 14230 }, { "ce_loss": 0.19548478722572327, "epoch": 4.746497665110073, "step": 14230 }, { "distill_loss": 0.38570237159729004, "epoch": 4.746497665110073, "step": 14230 }, { "epoch": 4.746497665110073, "ref_ce_loss": 0.17464320361614227, "step": 14230 }, { "epoch": 4.749833222148099, "loss": 0.8394, "step": 14240 }, { "epoch": 4.749833222148099, "grad_norm": 1.8284156322479248, "step": 14240 }, { "epoch": 4.749833222148099, "learning_rate": 0.0004517105957694652, "step": 14240 }, { "epoch": 4.749833222148099, "loss": 0.7750556468963623, "step": 14240 }, { "ce_loss": 0.19875389337539673, "epoch": 4.749833222148099, "step": 14240 }, { "distill_loss": 0.34265342354774475, "epoch": 4.749833222148099, "step": 14240 }, { "epoch": 4.749833222148099, "ref_ce_loss": 0.16989080607891083, "step": 14240 }, { "epoch": 4.749833222148099, "loss": 1.022950530052185, "step": 14240 }, { "ce_loss": 0.22671598196029663, "epoch": 4.749833222148099, "step": 14240 }, { "distill_loss": 0.3508990406990051, "epoch": 4.749833222148099, "step": 14240 }, { "epoch": 4.749833222148099, "ref_ce_loss": 0.16097328066825867, "step": 14240 }, { "epoch": 4.753168779186124, "loss": 0.8546, "step": 14250 }, { "epoch": 4.753168779186124, "grad_norm": 1.2136383056640625, "step": 14250 }, { "epoch": 4.753168779186124, "learning_rate": 0.00045128206082802784, "step": 14250 }, { "epoch": 4.753168779186124, "loss": 0.8702363967895508, "step": 14250 }, { "ce_loss": 0.1765323132276535, "epoch": 4.753168779186124, "step": 14250 }, { "distill_loss": 0.36349090933799744, "epoch": 4.753168779186124, "step": 14250 }, { "epoch": 4.753168779186124, "ref_ce_loss": 0.1928870975971222, "step": 14250 }, { "epoch": 4.753168779186124, "loss": 1.055302619934082, "step": 14250 }, { "ce_loss": 0.2146630883216858, "epoch": 4.753168779186124, "step": 14250 }, { "distill_loss": 0.38636380434036255, "epoch": 4.753168779186124, "step": 14250 }, { "epoch": 4.753168779186124, "ref_ce_loss": 0.142844557762146, "step": 14250 }, { "epoch": 4.7565043362241495, "loss": 0.8101, "step": 14260 }, { "epoch": 4.7565043362241495, "grad_norm": 1.6025526523590088, "step": 14260 }, { "epoch": 4.7565043362241495, "learning_rate": 0.0004508534660349074, "step": 14260 }, { "epoch": 4.7565043362241495, "loss": 0.8958355188369751, "step": 14260 }, { "ce_loss": 0.243308424949646, "epoch": 4.7565043362241495, "step": 14260 }, { "distill_loss": 0.3873001039028168, "epoch": 4.7565043362241495, "step": 14260 }, { "epoch": 4.7565043362241495, "ref_ce_loss": 0.16264839470386505, "step": 14260 }, { "epoch": 4.7565043362241495, "loss": 0.8794652223587036, "step": 14260 }, { "ce_loss": 0.1975322812795639, "epoch": 4.7565043362241495, "step": 14260 }, { "distill_loss": 0.38459375500679016, "epoch": 4.7565043362241495, "step": 14260 }, { "epoch": 4.7565043362241495, "ref_ce_loss": 0.17643694579601288, "step": 14260 }, { "epoch": 4.759839893262175, "loss": 0.8606, "step": 14270 }, { "epoch": 4.759839893262175, "grad_norm": 2.014094114303589, "step": 14270 }, { "epoch": 4.759839893262175, "learning_rate": 0.00045042481189032016, "step": 14270 }, { "epoch": 4.759839893262175, "loss": 0.7878975868225098, "step": 14270 }, { "ce_loss": 0.2223137468099594, "epoch": 4.759839893262175, "step": 14270 }, { "distill_loss": 0.34276866912841797, "epoch": 4.759839893262175, "step": 14270 }, { "epoch": 4.759839893262175, "ref_ce_loss": 0.1719190776348114, "step": 14270 }, { "epoch": 4.759839893262175, "loss": 0.7920337319374084, "step": 14270 }, { "ce_loss": 0.19568029046058655, "epoch": 4.759839893262175, "step": 14270 }, { "distill_loss": 0.3749501705169678, "epoch": 4.759839893262175, "step": 14270 }, { "epoch": 4.759839893262175, "ref_ce_loss": 0.158006951212883, "step": 14270 }, { "epoch": 4.7631754503002, "loss": 0.8574, "step": 14280 }, { "epoch": 4.7631754503002, "grad_norm": 2.1007559299468994, "step": 14280 }, { "epoch": 4.7631754503002, "learning_rate": 0.0004499960988945514, "step": 14280 }, { "epoch": 4.7631754503002, "loss": 0.5976972579956055, "step": 14280 }, { "ce_loss": 0.17164160311222076, "epoch": 4.7631754503002, "step": 14280 }, { "distill_loss": 0.29972830414772034, "epoch": 4.7631754503002, "step": 14280 }, { "epoch": 4.7631754503002, "ref_ce_loss": 0.12624342739582062, "step": 14280 }, { "epoch": 4.7631754503002, "loss": 0.6230519413948059, "step": 14280 }, { "ce_loss": 0.17910799384117126, "epoch": 4.7631754503002, "step": 14280 }, { "distill_loss": 0.27381521463394165, "epoch": 4.7631754503002, "step": 14280 }, { "epoch": 4.7631754503002, "ref_ce_loss": 0.1337912231683731, "step": 14280 }, { "epoch": 4.7665110073382255, "loss": 0.7983, "step": 14290 }, { "epoch": 4.7665110073382255, "grad_norm": 1.765936255455017, "step": 14290 }, { "epoch": 4.7665110073382255, "learning_rate": 0.0004495673275479554, "step": 14290 }, { "epoch": 4.7665110073382255, "loss": 0.7464971542358398, "step": 14290 }, { "ce_loss": 0.2413792610168457, "epoch": 4.7665110073382255, "step": 14290 }, { "distill_loss": 0.320978581905365, "epoch": 4.7665110073382255, "step": 14290 }, { "epoch": 4.7665110073382255, "ref_ce_loss": 0.18388238549232483, "step": 14290 }, { "epoch": 4.7665110073382255, "loss": 0.7573192119598389, "step": 14290 }, { "ce_loss": 0.24678291380405426, "epoch": 4.7665110073382255, "step": 14290 }, { "distill_loss": 0.29336345195770264, "epoch": 4.7665110073382255, "step": 14290 }, { "epoch": 4.7665110073382255, "ref_ce_loss": 0.1423608660697937, "step": 14290 }, { "epoch": 4.769846564376251, "loss": 0.8084, "step": 14300 }, { "epoch": 4.769846564376251, "grad_norm": 1.6711090803146362, "step": 14300 }, { "epoch": 4.769846564376251, "learning_rate": 0.0004491384983509546, "step": 14300 }, { "epoch": 4.769846564376251, "loss": 0.7083107829093933, "step": 14300 }, { "ce_loss": 0.21236170828342438, "epoch": 4.769846564376251, "step": 14300 }, { "distill_loss": 0.2792416214942932, "epoch": 4.769846564376251, "step": 14300 }, { "epoch": 4.769846564376251, "ref_ce_loss": 0.15519611537456512, "step": 14300 }, { "epoch": 4.769846564376251, "loss": 0.7527049779891968, "step": 14300 }, { "ce_loss": 0.22505830228328705, "epoch": 4.769846564376251, "step": 14300 }, { "distill_loss": 0.312044620513916, "epoch": 4.769846564376251, "step": 14300 }, { "epoch": 4.769846564376251, "ref_ce_loss": 0.1796731948852539, "step": 14300 }, { "epoch": 4.773182121414276, "loss": 0.8198, "step": 14310 }, { "epoch": 4.773182121414276, "grad_norm": 1.9802237749099731, "step": 14310 }, { "epoch": 4.773182121414276, "learning_rate": 0.0004487096118040387, "step": 14310 }, { "epoch": 4.773182121414276, "loss": 1.025206208229065, "step": 14310 }, { "ce_loss": 0.27857324481010437, "epoch": 4.773182121414276, "step": 14310 }, { "distill_loss": 0.4218524396419525, "epoch": 4.773182121414276, "step": 14310 }, { "epoch": 4.773182121414276, "ref_ce_loss": 0.1912098228931427, "step": 14310 }, { "epoch": 4.773182121414276, "loss": 0.8435007929801941, "step": 14310 }, { "ce_loss": 0.2689407467842102, "epoch": 4.773182121414276, "step": 14310 }, { "distill_loss": 0.3354867100715637, "epoch": 4.773182121414276, "step": 14310 }, { "epoch": 4.773182121414276, "ref_ce_loss": 0.19938433170318604, "step": 14310 }, { "epoch": 4.776517678452302, "loss": 0.8582, "step": 14320 }, { "epoch": 4.776517678452302, "grad_norm": 1.2547904253005981, "step": 14320 }, { "epoch": 4.776517678452302, "learning_rate": 0.00044828066840776426, "step": 14320 }, { "epoch": 4.776517678452302, "loss": 0.6429987549781799, "step": 14320 }, { "ce_loss": 0.1616217941045761, "epoch": 4.776517678452302, "step": 14320 }, { "distill_loss": 0.30774685740470886, "epoch": 4.776517678452302, "step": 14320 }, { "epoch": 4.776517678452302, "ref_ce_loss": 0.17335504293441772, "step": 14320 }, { "epoch": 4.776517678452302, "loss": 0.8333144783973694, "step": 14320 }, { "ce_loss": 0.1563624143600464, "epoch": 4.776517678452302, "step": 14320 }, { "distill_loss": 0.3343176245689392, "epoch": 4.776517678452302, "step": 14320 }, { "epoch": 4.776517678452302, "ref_ce_loss": 0.15886586904525757, "step": 14320 }, { "epoch": 4.779853235490327, "loss": 0.8333, "step": 14330 }, { "epoch": 4.779853235490327, "grad_norm": 2.450331926345825, "step": 14330 }, { "epoch": 4.779853235490327, "learning_rate": 0.00044785166866275486, "step": 14330 }, { "epoch": 4.779853235490327, "loss": 0.7152910828590393, "step": 14330 }, { "ce_loss": 0.1755499690771103, "epoch": 4.779853235490327, "step": 14330 }, { "distill_loss": 0.3227419853210449, "epoch": 4.779853235490327, "step": 14330 }, { "epoch": 4.779853235490327, "ref_ce_loss": 0.13227547705173492, "step": 14330 }, { "epoch": 4.779853235490327, "loss": 0.7708675265312195, "step": 14330 }, { "ce_loss": 0.254288911819458, "epoch": 4.779853235490327, "step": 14330 }, { "distill_loss": 0.32725656032562256, "epoch": 4.779853235490327, "step": 14330 }, { "epoch": 4.779853235490327, "ref_ce_loss": 0.1537141501903534, "step": 14330 }, { "epoch": 4.783188792528352, "loss": 0.7857, "step": 14340 }, { "epoch": 4.783188792528352, "grad_norm": 2.2568359375, "step": 14340 }, { "epoch": 4.783188792528352, "learning_rate": 0.0004474226130696989, "step": 14340 }, { "epoch": 4.783188792528352, "loss": 0.6318008303642273, "step": 14340 }, { "ce_loss": 0.1493198722600937, "epoch": 4.783188792528352, "step": 14340 }, { "distill_loss": 0.3203362822532654, "epoch": 4.783188792528352, "step": 14340 }, { "epoch": 4.783188792528352, "ref_ce_loss": 0.16190843284130096, "step": 14340 }, { "epoch": 4.783188792528352, "loss": 1.5504295825958252, "step": 14340 }, { "ce_loss": 0.25872090458869934, "epoch": 4.783188792528352, "step": 14340 }, { "distill_loss": 0.3019382655620575, "epoch": 4.783188792528352, "step": 14340 }, { "epoch": 4.783188792528352, "ref_ce_loss": 0.16322508454322815, "step": 14340 }, { "epoch": 4.786524349566378, "loss": 0.8551, "step": 14350 }, { "epoch": 4.786524349566378, "grad_norm": 3.3923728466033936, "step": 14350 }, { "epoch": 4.786524349566378, "learning_rate": 0.0004469935021293507, "step": 14350 }, { "epoch": 4.786524349566378, "loss": 0.6944651007652283, "step": 14350 }, { "ce_loss": 0.15247030556201935, "epoch": 4.786524349566378, "step": 14350 }, { "distill_loss": 0.2727074921131134, "epoch": 4.786524349566378, "step": 14350 }, { "epoch": 4.786524349566378, "ref_ce_loss": 0.1375962793827057, "step": 14350 }, { "epoch": 4.786524349566378, "loss": 0.815066933631897, "step": 14350 }, { "ce_loss": 0.21010121703147888, "epoch": 4.786524349566378, "step": 14350 }, { "distill_loss": 0.3797869384288788, "epoch": 4.786524349566378, "step": 14350 }, { "epoch": 4.786524349566378, "ref_ce_loss": 0.17980359494686127, "step": 14350 }, { "epoch": 4.789859906604403, "loss": 0.8532, "step": 14360 }, { "epoch": 4.789859906604403, "grad_norm": 1.3975764513015747, "step": 14360 }, { "epoch": 4.789859906604403, "learning_rate": 0.00044656433634252863, "step": 14360 }, { "epoch": 4.789859906604403, "loss": 0.7048535943031311, "step": 14360 }, { "ce_loss": 0.18724028766155243, "epoch": 4.789859906604403, "step": 14360 }, { "distill_loss": 0.30819523334503174, "epoch": 4.789859906604403, "step": 14360 }, { "epoch": 4.789859906604403, "ref_ce_loss": 0.1732262521982193, "step": 14360 }, { "epoch": 4.789859906604403, "loss": 0.9230031967163086, "step": 14360 }, { "ce_loss": 0.2237575799226761, "epoch": 4.789859906604403, "step": 14360 }, { "distill_loss": 0.3416767120361328, "epoch": 4.789859906604403, "step": 14360 }, { "epoch": 4.789859906604403, "ref_ce_loss": 0.21102052927017212, "step": 14360 }, { "epoch": 4.793195463642428, "loss": 0.8148, "step": 14370 }, { "epoch": 4.793195463642428, "grad_norm": 1.9771432876586914, "step": 14370 }, { "epoch": 4.793195463642428, "learning_rate": 0.00044613511621011565, "step": 14370 }, { "epoch": 4.793195463642428, "loss": 0.7248369455337524, "step": 14370 }, { "ce_loss": 0.20936031639575958, "epoch": 4.793195463642428, "step": 14370 }, { "distill_loss": 0.3045058846473694, "epoch": 4.793195463642428, "step": 14370 }, { "epoch": 4.793195463642428, "ref_ce_loss": 0.17129839956760406, "step": 14370 }, { "epoch": 4.793195463642428, "loss": 0.9662963151931763, "step": 14370 }, { "ce_loss": 0.2426602840423584, "epoch": 4.793195463642428, "step": 14370 }, { "distill_loss": 0.3995358943939209, "epoch": 4.793195463642428, "step": 14370 }, { "epoch": 4.793195463642428, "ref_ce_loss": 0.15862718224525452, "step": 14370 }, { "epoch": 4.796531020680454, "loss": 0.8521, "step": 14380 }, { "epoch": 4.796531020680454, "grad_norm": 2.1875810623168945, "step": 14380 }, { "epoch": 4.796531020680454, "learning_rate": 0.00044570584223305767, "step": 14380 }, { "epoch": 4.796531020680454, "loss": 0.7186112999916077, "step": 14380 }, { "ce_loss": 0.21039500832557678, "epoch": 4.796531020680454, "step": 14380 }, { "distill_loss": 0.2966489791870117, "epoch": 4.796531020680454, "step": 14380 }, { "epoch": 4.796531020680454, "ref_ce_loss": 0.1435556709766388, "step": 14380 }, { "epoch": 4.796531020680454, "loss": 0.9201071262359619, "step": 14380 }, { "ce_loss": 0.19870568811893463, "epoch": 4.796531020680454, "step": 14380 }, { "distill_loss": 0.2828711271286011, "epoch": 4.796531020680454, "step": 14380 }, { "epoch": 4.796531020680454, "ref_ce_loss": 0.1364685297012329, "step": 14380 }, { "epoch": 4.799866577718479, "loss": 1.2347, "step": 14390 }, { "epoch": 4.799866577718479, "grad_norm": 2.3982861042022705, "step": 14390 }, { "epoch": 4.799866577718479, "learning_rate": 0.00044527651491236376, "step": 14390 }, { "epoch": 4.799866577718479, "loss": 0.7846877574920654, "step": 14390 }, { "ce_loss": 0.2630446255207062, "epoch": 4.799866577718479, "step": 14390 }, { "distill_loss": 0.3383769989013672, "epoch": 4.799866577718479, "step": 14390 }, { "epoch": 4.799866577718479, "ref_ce_loss": 0.18286670744419098, "step": 14390 }, { "epoch": 4.799866577718479, "loss": 0.7356339693069458, "step": 14390 }, { "ce_loss": 0.20563283562660217, "epoch": 4.799866577718479, "step": 14390 }, { "distill_loss": 0.3828813433647156, "epoch": 4.799866577718479, "step": 14390 }, { "epoch": 4.799866577718479, "ref_ce_loss": 0.1468716263771057, "step": 14390 }, { "epoch": 4.803202134756504, "loss": 0.8352, "step": 14400 }, { "epoch": 4.803202134756504, "grad_norm": 1.5926655530929565, "step": 14400 }, { "epoch": 4.803202134756504, "learning_rate": 0.00044484713474910484, "step": 14400 }, { "epoch": 4.803202134756504, "loss": 0.987956166267395, "step": 14400 }, { "ce_loss": 0.20529904961585999, "epoch": 4.803202134756504, "step": 14400 }, { "distill_loss": 0.36365723609924316, "epoch": 4.803202134756504, "step": 14400 }, { "epoch": 4.803202134756504, "ref_ce_loss": 0.1763196885585785, "step": 14400 }, { "epoch": 4.803202134756504, "loss": 0.8181911110877991, "step": 14400 }, { "ce_loss": 0.18043328821659088, "epoch": 4.803202134756504, "step": 14400 }, { "distill_loss": 0.39718514680862427, "epoch": 4.803202134756504, "step": 14400 }, { "epoch": 4.803202134756504, "ref_ce_loss": 0.17551366984844208, "step": 14400 }, { "epoch": 4.80653769179453, "loss": 0.815, "step": 14410 }, { "epoch": 4.80653769179453, "grad_norm": 2.538222074508667, "step": 14410 }, { "epoch": 4.80653769179453, "learning_rate": 0.000444417702244414, "step": 14410 }, { "epoch": 4.80653769179453, "loss": 1.3166474103927612, "step": 14410 }, { "ce_loss": 0.18348243832588196, "epoch": 4.80653769179453, "step": 14410 }, { "distill_loss": 0.29405656456947327, "epoch": 4.80653769179453, "step": 14410 }, { "epoch": 4.80653769179453, "ref_ce_loss": 0.16508980095386505, "step": 14410 }, { "epoch": 4.80653769179453, "loss": 0.853499174118042, "step": 14410 }, { "ce_loss": 0.22681495547294617, "epoch": 4.80653769179453, "step": 14410 }, { "distill_loss": 0.3703365921974182, "epoch": 4.80653769179453, "step": 14410 }, { "epoch": 4.80653769179453, "ref_ce_loss": 0.15008428692817688, "step": 14410 }, { "epoch": 4.809873248832555, "loss": 1.1308, "step": 14420 }, { "epoch": 4.809873248832555, "grad_norm": 76.6787338256836, "step": 14420 }, { "epoch": 4.809873248832555, "learning_rate": 0.0004439882178994851, "step": 14420 }, { "epoch": 4.809873248832555, "loss": 1.8646230697631836, "step": 14420 }, { "ce_loss": 1.0506770610809326, "epoch": 4.809873248832555, "step": 14420 }, { "distill_loss": 0.09022989124059677, "epoch": 4.809873248832555, "step": 14420 }, { "epoch": 4.809873248832555, "ref_ce_loss": 0.6693422198295593, "step": 14420 }, { "epoch": 4.809873248832555, "loss": 1.9133636951446533, "step": 14420 }, { "ce_loss": 1.0093390941619873, "epoch": 4.809873248832555, "step": 14420 }, { "distill_loss": 0.11060499399900436, "epoch": 4.809873248832555, "step": 14420 }, { "epoch": 4.809873248832555, "ref_ce_loss": 0.642074465751648, "step": 14420 }, { "epoch": 4.81320880587058, "loss": 1.5584, "step": 14430 }, { "epoch": 4.81320880587058, "grad_norm": 2.6255245208740234, "step": 14430 }, { "epoch": 4.81320880587058, "learning_rate": 0.0004435586822155725, "step": 14430 }, { "epoch": 4.81320880587058, "loss": 1.3784888982772827, "step": 14430 }, { "ce_loss": 0.8630189895629883, "epoch": 4.81320880587058, "step": 14430 }, { "distill_loss": 0.06988421082496643, "epoch": 4.81320880587058, "step": 14430 }, { "epoch": 4.81320880587058, "ref_ce_loss": 0.44548124074935913, "step": 14430 }, { "epoch": 4.81320880587058, "loss": 1.231710433959961, "step": 14430 }, { "ce_loss": 0.7029614448547363, "epoch": 4.81320880587058, "step": 14430 }, { "distill_loss": 0.06970061361789703, "epoch": 4.81320880587058, "step": 14430 }, { "epoch": 4.81320880587058, "ref_ce_loss": 0.4138675034046173, "step": 14430 }, { "epoch": 4.816544362908606, "loss": 1.3034, "step": 14440 }, { "epoch": 4.816544362908606, "grad_norm": 1.8560012578964233, "step": 14440 }, { "epoch": 4.816544362908606, "learning_rate": 0.00044312909569399066, "step": 14440 }, { "epoch": 4.816544362908606, "loss": 1.188285231590271, "step": 14440 }, { "ce_loss": 0.7114423513412476, "epoch": 4.816544362908606, "step": 14440 }, { "distill_loss": 0.10720157623291016, "epoch": 4.816544362908606, "step": 14440 }, { "epoch": 4.816544362908606, "ref_ce_loss": 0.3693326413631439, "step": 14440 }, { "epoch": 4.816544362908606, "loss": 1.2714197635650635, "step": 14440 }, { "ce_loss": 0.717993438243866, "epoch": 4.816544362908606, "step": 14440 }, { "distill_loss": 0.06978762149810791, "epoch": 4.816544362908606, "step": 14440 }, { "epoch": 4.816544362908606, "ref_ce_loss": 0.3870980143547058, "step": 14440 }, { "epoch": 4.819879919946631, "loss": 1.0291, "step": 14450 }, { "epoch": 4.819879919946631, "grad_norm": 2.1931910514831543, "step": 14450 }, { "epoch": 4.819879919946631, "learning_rate": 0.0004426994588361134, "step": 14450 }, { "epoch": 4.819879919946631, "loss": 1.0039728879928589, "step": 14450 }, { "ce_loss": 0.2299180030822754, "epoch": 4.819879919946631, "step": 14450 }, { "distill_loss": 0.329211950302124, "epoch": 4.819879919946631, "step": 14450 }, { "epoch": 4.819879919946631, "ref_ce_loss": 0.17068281769752502, "step": 14450 }, { "epoch": 4.819879919946631, "loss": 1.1071248054504395, "step": 14450 }, { "ce_loss": 0.27436718344688416, "epoch": 4.819879919946631, "step": 14450 }, { "distill_loss": 0.504375696182251, "epoch": 4.819879919946631, "step": 14450 }, { "epoch": 4.819879919946631, "ref_ce_loss": 0.18603618443012238, "step": 14450 }, { "epoch": 4.8232154769846565, "loss": 1.1783, "step": 14460 }, { "epoch": 4.8232154769846565, "grad_norm": 9.356254577636719, "step": 14460 }, { "epoch": 4.8232154769846565, "learning_rate": 0.00044226977214337286, "step": 14460 }, { "epoch": 4.8232154769846565, "loss": 1.689245581626892, "step": 14460 }, { "ce_loss": 0.4103279411792755, "epoch": 4.8232154769846565, "step": 14460 }, { "distill_loss": 1.0289390087127686, "epoch": 4.8232154769846565, "step": 14460 }, { "epoch": 4.8232154769846565, "ref_ce_loss": 0.1930885761976242, "step": 14460 }, { "epoch": 4.8232154769846565, "loss": 1.4320827722549438, "step": 14460 }, { "ce_loss": 0.34104785323143005, "epoch": 4.8232154769846565, "step": 14460 }, { "distill_loss": 0.7984983921051025, "epoch": 4.8232154769846565, "step": 14460 }, { "epoch": 4.8232154769846565, "ref_ce_loss": 0.2489304393529892, "step": 14460 }, { "epoch": 4.826551034022682, "loss": 0.9409, "step": 14470 }, { "epoch": 4.826551034022682, "grad_norm": 1.7776070833206177, "step": 14470 }, { "epoch": 4.826551034022682, "learning_rate": 0.00044184003611726, "step": 14470 }, { "epoch": 4.826551034022682, "loss": 0.9860867261886597, "step": 14470 }, { "ce_loss": 0.25365740060806274, "epoch": 4.826551034022682, "step": 14470 }, { "distill_loss": 0.337404727935791, "epoch": 4.826551034022682, "step": 14470 }, { "epoch": 4.826551034022682, "ref_ce_loss": 0.19550800323486328, "step": 14470 }, { "epoch": 4.826551034022682, "loss": 0.8277320265769958, "step": 14470 }, { "ce_loss": 0.2675633132457733, "epoch": 4.826551034022682, "step": 14470 }, { "distill_loss": 0.3357546329498291, "epoch": 4.826551034022682, "step": 14470 }, { "epoch": 4.826551034022682, "ref_ce_loss": 0.18932946026325226, "step": 14470 }, { "epoch": 4.829886591060707, "loss": 0.7971, "step": 14480 }, { "epoch": 4.829886591060707, "grad_norm": 2.111743450164795, "step": 14480 }, { "epoch": 4.829886591060707, "learning_rate": 0.0004414102512593226, "step": 14480 }, { "epoch": 4.829886591060707, "loss": 0.761222779750824, "step": 14480 }, { "ce_loss": 0.1951623260974884, "epoch": 4.829886591060707, "step": 14480 }, { "distill_loss": 0.3789231479167938, "epoch": 4.829886591060707, "step": 14480 }, { "epoch": 4.829886591060707, "ref_ce_loss": 0.14075884222984314, "step": 14480 }, { "epoch": 4.829886591060707, "loss": 1.1427268981933594, "step": 14480 }, { "ce_loss": 0.2484903633594513, "epoch": 4.829886591060707, "step": 14480 }, { "distill_loss": 0.42528820037841797, "epoch": 4.829886591060707, "step": 14480 }, { "epoch": 4.829886591060707, "ref_ce_loss": 0.16807828843593597, "step": 14480 }, { "epoch": 4.8332221480987325, "loss": 0.7806, "step": 14490 }, { "epoch": 4.8332221480987325, "grad_norm": 1.6515129804611206, "step": 14490 }, { "epoch": 4.8332221480987325, "learning_rate": 0.0004409804180711662, "step": 14490 }, { "epoch": 4.8332221480987325, "loss": 0.9863893985748291, "step": 14490 }, { "ce_loss": 0.21996329724788666, "epoch": 4.8332221480987325, "step": 14490 }, { "distill_loss": 0.3274819850921631, "epoch": 4.8332221480987325, "step": 14490 }, { "epoch": 4.8332221480987325, "ref_ce_loss": 0.19819903373718262, "step": 14490 }, { "epoch": 4.8332221480987325, "loss": 0.6463009715080261, "step": 14490 }, { "ce_loss": 0.15356984734535217, "epoch": 4.8332221480987325, "step": 14490 }, { "distill_loss": 0.31557101011276245, "epoch": 4.8332221480987325, "step": 14490 }, { "epoch": 4.8332221480987325, "ref_ce_loss": 0.13610194623470306, "step": 14490 }, { "epoch": 4.836557705136758, "loss": 0.7817, "step": 14500 }, { "epoch": 4.836557705136758, "grad_norm": 2.982213020324707, "step": 14500 }, { "epoch": 4.836557705136758, "learning_rate": 0.00044055053705445213, "step": 14500 }, { "epoch": 4.836557705136758, "loss": 0.7263142466545105, "step": 14500 }, { "ce_loss": 0.1957697570323944, "epoch": 4.836557705136758, "step": 14500 }, { "distill_loss": 0.3093082010746002, "epoch": 4.836557705136758, "step": 14500 }, { "epoch": 4.836557705136758, "ref_ce_loss": 0.16477113962173462, "step": 14500 }, { "epoch": 4.836557705136758, "loss": 0.7457721829414368, "step": 14500 }, { "ce_loss": 0.21496635675430298, "epoch": 4.836557705136758, "step": 14500 }, { "distill_loss": 0.31695231795310974, "epoch": 4.836557705136758, "step": 14500 }, { "epoch": 4.836557705136758, "ref_ce_loss": 0.17643709480762482, "step": 14500 }, { "epoch": 4.839893262174783, "loss": 0.795, "step": 14510 }, { "epoch": 4.839893262174783, "grad_norm": 1.7769856452941895, "step": 14510 }, { "epoch": 4.839893262174783, "learning_rate": 0.000440120608710898, "step": 14510 }, { "epoch": 4.839893262174783, "loss": 1.05339515209198, "step": 14510 }, { "ce_loss": 0.21219713985919952, "epoch": 4.839893262174783, "step": 14510 }, { "distill_loss": 0.3491799533367157, "epoch": 4.839893262174783, "step": 14510 }, { "epoch": 4.839893262174783, "ref_ce_loss": 0.14955969154834747, "step": 14510 }, { "epoch": 4.839893262174783, "loss": 0.9170594811439514, "step": 14510 }, { "ce_loss": 0.3033076822757721, "epoch": 4.839893262174783, "step": 14510 }, { "distill_loss": 0.36640915274620056, "epoch": 4.839893262174783, "step": 14510 }, { "epoch": 4.839893262174783, "ref_ce_loss": 0.20215469598770142, "step": 14510 }, { "epoch": 4.843228819212809, "loss": 0.8816, "step": 14520 }, { "epoch": 4.843228819212809, "grad_norm": 2.1591436862945557, "step": 14520 }, { "epoch": 4.843228819212809, "learning_rate": 0.0004396906335422763, "step": 14520 }, { "epoch": 4.843228819212809, "loss": 0.9165817499160767, "step": 14520 }, { "ce_loss": 0.2852003276348114, "epoch": 4.843228819212809, "step": 14520 }, { "distill_loss": 0.4034644663333893, "epoch": 4.843228819212809, "step": 14520 }, { "epoch": 4.843228819212809, "ref_ce_loss": 0.18268036842346191, "step": 14520 }, { "epoch": 4.843228819212809, "loss": 0.8185547590255737, "step": 14520 }, { "ce_loss": 0.20989814400672913, "epoch": 4.843228819212809, "step": 14520 }, { "distill_loss": 0.3294355273246765, "epoch": 4.843228819212809, "step": 14520 }, { "epoch": 4.843228819212809, "ref_ce_loss": 0.17076851427555084, "step": 14520 }, { "epoch": 4.846564376250834, "loss": 0.8401, "step": 14530 }, { "epoch": 4.846564376250834, "grad_norm": 2.5680665969848633, "step": 14530 }, { "epoch": 4.846564376250834, "learning_rate": 0.00043926061205041444, "step": 14530 }, { "epoch": 4.846564376250834, "loss": 0.6906549334526062, "step": 14530 }, { "ce_loss": 0.15018831193447113, "epoch": 4.846564376250834, "step": 14530 }, { "distill_loss": 0.22638709843158722, "epoch": 4.846564376250834, "step": 14530 }, { "epoch": 4.846564376250834, "ref_ce_loss": 0.15170030295848846, "step": 14530 }, { "epoch": 4.846564376250834, "loss": 1.0279051065444946, "step": 14530 }, { "ce_loss": 0.1601780354976654, "epoch": 4.846564376250834, "step": 14530 }, { "distill_loss": 0.3407314121723175, "epoch": 4.846564376250834, "step": 14530 }, { "epoch": 4.846564376250834, "ref_ce_loss": 0.12231296300888062, "step": 14530 }, { "epoch": 4.849899933288859, "loss": 0.8612, "step": 14540 }, { "epoch": 4.849899933288859, "grad_norm": 1.8162392377853394, "step": 14540 }, { "epoch": 4.849899933288859, "learning_rate": 0.0004388305447371936, "step": 14540 }, { "epoch": 4.849899933288859, "loss": 0.7569298148155212, "step": 14540 }, { "ce_loss": 0.1900038719177246, "epoch": 4.849899933288859, "step": 14540 }, { "distill_loss": 0.3315070569515228, "epoch": 4.849899933288859, "step": 14540 }, { "epoch": 4.849899933288859, "ref_ce_loss": 0.18197223544120789, "step": 14540 }, { "epoch": 4.849899933288859, "loss": 1.090651512145996, "step": 14540 }, { "ce_loss": 0.25102272629737854, "epoch": 4.849899933288859, "step": 14540 }, { "distill_loss": 0.326902836561203, "epoch": 4.849899933288859, "step": 14540 }, { "epoch": 4.849899933288859, "ref_ce_loss": 0.17948073148727417, "step": 14540 }, { "epoch": 4.853235490326885, "loss": 0.9135, "step": 14550 }, { "epoch": 4.853235490326885, "grad_norm": 1.8046009540557861, "step": 14550 }, { "epoch": 4.853235490326885, "learning_rate": 0.00043840043210454873, "step": 14550 }, { "epoch": 4.853235490326885, "loss": 0.7219487428665161, "step": 14550 }, { "ce_loss": 0.24534782767295837, "epoch": 4.853235490326885, "step": 14550 }, { "distill_loss": 0.29573965072631836, "epoch": 4.853235490326885, "step": 14550 }, { "epoch": 4.853235490326885, "ref_ce_loss": 0.16810734570026398, "step": 14550 }, { "epoch": 4.853235490326885, "loss": 0.7448170781135559, "step": 14550 }, { "ce_loss": 0.20320436358451843, "epoch": 4.853235490326885, "step": 14550 }, { "distill_loss": 0.34010350704193115, "epoch": 4.853235490326885, "step": 14550 }, { "epoch": 4.853235490326885, "ref_ce_loss": 0.15430478751659393, "step": 14550 }, { "epoch": 4.85657104736491, "loss": 0.7886, "step": 14560 }, { "epoch": 4.85657104736491, "grad_norm": 2.4193296432495117, "step": 14560 }, { "epoch": 4.85657104736491, "learning_rate": 0.0004379702746544675, "step": 14560 }, { "epoch": 4.85657104736491, "loss": 0.8893520832061768, "step": 14560 }, { "ce_loss": 0.25183916091918945, "epoch": 4.85657104736491, "step": 14560 }, { "distill_loss": 0.3263954520225525, "epoch": 4.85657104736491, "step": 14560 }, { "epoch": 4.85657104736491, "ref_ce_loss": 0.19890064001083374, "step": 14560 }, { "epoch": 4.85657104736491, "loss": 0.7266795039176941, "step": 14560 }, { "ce_loss": 0.17995253205299377, "epoch": 4.85657104736491, "step": 14560 }, { "distill_loss": 0.34926894307136536, "epoch": 4.85657104736491, "step": 14560 }, { "epoch": 4.85657104736491, "ref_ce_loss": 0.14957621693611145, "step": 14560 }, { "epoch": 4.859906604402935, "loss": 0.8756, "step": 14570 }, { "epoch": 4.859906604402935, "grad_norm": 2.1137895584106445, "step": 14570 }, { "epoch": 4.859906604402935, "learning_rate": 0.00043754007288899013, "step": 14570 }, { "epoch": 4.859906604402935, "loss": 0.8555729389190674, "step": 14570 }, { "ce_loss": 0.1920050084590912, "epoch": 4.859906604402935, "step": 14570 }, { "distill_loss": 0.4450578987598419, "epoch": 4.859906604402935, "step": 14570 }, { "epoch": 4.859906604402935, "ref_ce_loss": 0.16308441758155823, "step": 14570 }, { "epoch": 4.859906604402935, "loss": 0.7460584044456482, "step": 14570 }, { "ce_loss": 0.20282535254955292, "epoch": 4.859906604402935, "step": 14570 }, { "distill_loss": 0.38569849729537964, "epoch": 4.859906604402935, "step": 14570 }, { "epoch": 4.859906604402935, "ref_ce_loss": 0.1571381390094757, "step": 14570 }, { "epoch": 4.863242161440961, "loss": 0.8363, "step": 14580 }, { "epoch": 4.863242161440961, "grad_norm": 1.1724597215652466, "step": 14580 }, { "epoch": 4.863242161440961, "learning_rate": 0.00043710982731020806, "step": 14580 }, { "epoch": 4.863242161440961, "loss": 0.9502582550048828, "step": 14580 }, { "ce_loss": 0.27301228046417236, "epoch": 4.863242161440961, "step": 14580 }, { "distill_loss": 0.3538023829460144, "epoch": 4.863242161440961, "step": 14580 }, { "epoch": 4.863242161440961, "ref_ce_loss": 0.21068429946899414, "step": 14580 }, { "epoch": 4.863242161440961, "loss": 0.5929126143455505, "step": 14580 }, { "ce_loss": 0.14882758259773254, "epoch": 4.863242161440961, "step": 14580 }, { "distill_loss": 0.27876344323158264, "epoch": 4.863242161440961, "step": 14580 }, { "epoch": 4.863242161440961, "ref_ce_loss": 0.13127300143241882, "step": 14580 }, { "epoch": 4.866577718478986, "loss": 0.7924, "step": 14590 }, { "epoch": 4.866577718478986, "grad_norm": 1.5373058319091797, "step": 14590 }, { "epoch": 4.866577718478986, "learning_rate": 0.0004366795384202644, "step": 14590 }, { "epoch": 4.866577718478986, "loss": 0.8771259784698486, "step": 14590 }, { "ce_loss": 0.2793692946434021, "epoch": 4.866577718478986, "step": 14590 }, { "distill_loss": 0.32669538259506226, "epoch": 4.866577718478986, "step": 14590 }, { "epoch": 4.866577718478986, "ref_ce_loss": 0.2203112691640854, "step": 14590 }, { "epoch": 4.866577718478986, "loss": 0.8765442371368408, "step": 14590 }, { "ce_loss": 0.2435297966003418, "epoch": 4.866577718478986, "step": 14590 }, { "distill_loss": 0.37883514165878296, "epoch": 4.866577718478986, "step": 14590 }, { "epoch": 4.866577718478986, "ref_ce_loss": 0.16562002897262573, "step": 14590 }, { "epoch": 4.869913275517011, "loss": 0.8328, "step": 14600 }, { "epoch": 4.869913275517011, "grad_norm": 1.6567118167877197, "step": 14600 }, { "epoch": 4.869913275517011, "learning_rate": 0.0004362492067213526, "step": 14600 }, { "epoch": 4.869913275517011, "loss": 0.8297321796417236, "step": 14600 }, { "ce_loss": 0.25611981749534607, "epoch": 4.869913275517011, "step": 14600 }, { "distill_loss": 0.3246353268623352, "epoch": 4.869913275517011, "step": 14600 }, { "epoch": 4.869913275517011, "ref_ce_loss": 0.17316275835037231, "step": 14600 }, { "epoch": 4.869913275517011, "loss": 1.1873735189437866, "step": 14600 }, { "ce_loss": 0.25334230065345764, "epoch": 4.869913275517011, "step": 14600 }, { "distill_loss": 0.36492976546287537, "epoch": 4.869913275517011, "step": 14600 }, { "epoch": 4.869913275517011, "ref_ce_loss": 0.14517346024513245, "step": 14600 }, { "epoch": 4.873248832555037, "loss": 0.8133, "step": 14610 }, { "epoch": 4.873248832555037, "grad_norm": 1.6364003419876099, "step": 14610 }, { "epoch": 4.873248832555037, "learning_rate": 0.00043581883271571586, "step": 14610 }, { "epoch": 4.873248832555037, "loss": 0.8985888957977295, "step": 14610 }, { "ce_loss": 0.22380895912647247, "epoch": 4.873248832555037, "step": 14610 }, { "distill_loss": 0.3480800986289978, "epoch": 4.873248832555037, "step": 14610 }, { "epoch": 4.873248832555037, "ref_ce_loss": 0.18579760193824768, "step": 14610 }, { "epoch": 4.873248832555037, "loss": 0.880142331123352, "step": 14610 }, { "ce_loss": 0.22697052359580994, "epoch": 4.873248832555037, "step": 14610 }, { "distill_loss": 0.36627066135406494, "epoch": 4.873248832555037, "step": 14610 }, { "epoch": 4.873248832555037, "ref_ce_loss": 0.14945651590824127, "step": 14610 }, { "epoch": 4.876584389593062, "loss": 0.7902, "step": 14620 }, { "epoch": 4.876584389593062, "grad_norm": 1.6763052940368652, "step": 14620 }, { "epoch": 4.876584389593062, "learning_rate": 0.0004353884169056472, "step": 14620 }, { "epoch": 4.876584389593062, "loss": 0.8141733407974243, "step": 14620 }, { "ce_loss": 0.1499975621700287, "epoch": 4.876584389593062, "step": 14620 }, { "distill_loss": 0.3481467068195343, "epoch": 4.876584389593062, "step": 14620 }, { "epoch": 4.876584389593062, "ref_ce_loss": 0.1621495932340622, "step": 14620 }, { "epoch": 4.876584389593062, "loss": 1.2638483047485352, "step": 14620 }, { "ce_loss": 0.22092504799365997, "epoch": 4.876584389593062, "step": 14620 }, { "distill_loss": 0.34048253297805786, "epoch": 4.876584389593062, "step": 14620 }, { "epoch": 4.876584389593062, "ref_ce_loss": 0.1847701519727707, "step": 14620 }, { "epoch": 4.879919946631087, "loss": 0.8409, "step": 14630 }, { "epoch": 4.879919946631087, "grad_norm": 1.3977830410003662, "step": 14630 }, { "epoch": 4.879919946631087, "learning_rate": 0.0004349579597934879, "step": 14630 }, { "epoch": 4.879919946631087, "loss": 0.6937637329101562, "step": 14630 }, { "ce_loss": 0.23181191086769104, "epoch": 4.879919946631087, "step": 14630 }, { "distill_loss": 0.2919282615184784, "epoch": 4.879919946631087, "step": 14630 }, { "epoch": 4.879919946631087, "ref_ce_loss": 0.16963261365890503, "step": 14630 }, { "epoch": 4.879919946631087, "loss": 0.7458980083465576, "step": 14630 }, { "ce_loss": 0.13273221254348755, "epoch": 4.879919946631087, "step": 14630 }, { "distill_loss": 0.3241432309150696, "epoch": 4.879919946631087, "step": 14630 }, { "epoch": 4.879919946631087, "ref_ce_loss": 0.17733998596668243, "step": 14630 }, { "epoch": 4.883255503669113, "loss": 0.7683, "step": 14640 }, { "epoch": 4.883255503669113, "grad_norm": 1.6077567338943481, "step": 14640 }, { "epoch": 4.883255503669113, "learning_rate": 0.00043452746188162803, "step": 14640 }, { "epoch": 4.883255503669113, "loss": 0.7748646140098572, "step": 14640 }, { "ce_loss": 0.1977359652519226, "epoch": 4.883255503669113, "step": 14640 }, { "distill_loss": 0.3178797662258148, "epoch": 4.883255503669113, "step": 14640 }, { "epoch": 4.883255503669113, "ref_ce_loss": 0.20306116342544556, "step": 14640 }, { "epoch": 4.883255503669113, "loss": 0.8212810158729553, "step": 14640 }, { "ce_loss": 0.19623126089572906, "epoch": 4.883255503669113, "step": 14640 }, { "distill_loss": 0.3520052134990692, "epoch": 4.883255503669113, "step": 14640 }, { "epoch": 4.883255503669113, "ref_ce_loss": 0.12612716853618622, "step": 14640 }, { "epoch": 4.886591060707138, "loss": 0.857, "step": 14650 }, { "epoch": 4.886591060707138, "grad_norm": 1.7271556854248047, "step": 14650 }, { "epoch": 4.886591060707138, "learning_rate": 0.0004340969236725046, "step": 14650 }, { "epoch": 4.886591060707138, "loss": 0.7299498915672302, "step": 14650 }, { "ce_loss": 0.1944878250360489, "epoch": 4.886591060707138, "step": 14650 }, { "distill_loss": 0.36202430725097656, "epoch": 4.886591060707138, "step": 14650 }, { "epoch": 4.886591060707138, "ref_ce_loss": 0.17322170734405518, "step": 14650 }, { "epoch": 4.886591060707138, "loss": 0.878600537776947, "step": 14650 }, { "ce_loss": 0.25492069125175476, "epoch": 4.886591060707138, "step": 14650 }, { "distill_loss": 0.40417933464050293, "epoch": 4.886591060707138, "step": 14650 }, { "epoch": 4.886591060707138, "ref_ce_loss": 0.16458988189697266, "step": 14650 }, { "epoch": 4.8899266177451635, "loss": 0.9618, "step": 14660 }, { "epoch": 4.8899266177451635, "grad_norm": 2.2485077381134033, "step": 14660 }, { "epoch": 4.8899266177451635, "learning_rate": 0.0004336663456686026, "step": 14660 }, { "epoch": 4.8899266177451635, "loss": 0.9936636090278625, "step": 14660 }, { "ce_loss": 0.2919512689113617, "epoch": 4.8899266177451635, "step": 14660 }, { "distill_loss": 0.3717796802520752, "epoch": 4.8899266177451635, "step": 14660 }, { "epoch": 4.8899266177451635, "ref_ce_loss": 0.2658803164958954, "step": 14660 }, { "epoch": 4.8899266177451635, "loss": 0.8192309737205505, "step": 14660 }, { "ce_loss": 0.16883529722690582, "epoch": 4.8899266177451635, "step": 14660 }, { "distill_loss": 0.3290097415447235, "epoch": 4.8899266177451635, "step": 14660 }, { "epoch": 4.8899266177451635, "ref_ce_loss": 0.14802150428295135, "step": 14660 }, { "epoch": 4.893262174783189, "loss": 0.8059, "step": 14670 }, { "epoch": 4.893262174783189, "grad_norm": 2.89357852935791, "step": 14670 }, { "epoch": 4.893262174783189, "learning_rate": 0.0004332357283724523, "step": 14670 }, { "epoch": 4.893262174783189, "loss": 1.29874587059021, "step": 14670 }, { "ce_loss": 0.1667264997959137, "epoch": 4.893262174783189, "step": 14670 }, { "distill_loss": 0.22527697682380676, "epoch": 4.893262174783189, "step": 14670 }, { "epoch": 4.893262174783189, "ref_ce_loss": 0.16906295716762543, "step": 14670 }, { "epoch": 4.893262174783189, "loss": 0.5599315166473389, "step": 14670 }, { "ce_loss": 0.15608477592468262, "epoch": 4.893262174783189, "step": 14670 }, { "distill_loss": 0.24146291613578796, "epoch": 4.893262174783189, "step": 14670 }, { "epoch": 4.893262174783189, "ref_ce_loss": 0.11661271750926971, "step": 14670 }, { "epoch": 4.896597731821214, "loss": 0.7917, "step": 14680 }, { "epoch": 4.896597731821214, "grad_norm": 1.482553482055664, "step": 14680 }, { "epoch": 4.896597731821214, "learning_rate": 0.00043280507228663086, "step": 14680 }, { "epoch": 4.896597731821214, "loss": 0.8754822015762329, "step": 14680 }, { "ce_loss": 0.10988114774227142, "epoch": 4.896597731821214, "step": 14680 }, { "distill_loss": 0.22368855774402618, "epoch": 4.896597731821214, "step": 14680 }, { "epoch": 4.896597731821214, "ref_ce_loss": 0.1033942922949791, "step": 14680 }, { "epoch": 4.896597731821214, "loss": 0.8509879112243652, "step": 14680 }, { "ce_loss": 0.167730450630188, "epoch": 4.896597731821214, "step": 14680 }, { "distill_loss": 0.25954991579055786, "epoch": 4.896597731821214, "step": 14680 }, { "epoch": 4.896597731821214, "ref_ce_loss": 0.18977469205856323, "step": 14680 }, { "epoch": 4.8999332888592395, "loss": 0.8584, "step": 14690 }, { "epoch": 4.8999332888592395, "grad_norm": 1.982042908668518, "step": 14690 }, { "epoch": 4.8999332888592395, "learning_rate": 0.00043237437791375993, "step": 14690 }, { "epoch": 4.8999332888592395, "loss": 0.6484662890434265, "step": 14690 }, { "ce_loss": 0.16622701287269592, "epoch": 4.8999332888592395, "step": 14690 }, { "distill_loss": 0.29128366708755493, "epoch": 4.8999332888592395, "step": 14690 }, { "epoch": 4.8999332888592395, "ref_ce_loss": 0.18995237350463867, "step": 14690 }, { "epoch": 4.8999332888592395, "loss": 0.8472151160240173, "step": 14690 }, { "ce_loss": 0.20356720685958862, "epoch": 4.8999332888592395, "step": 14690 }, { "distill_loss": 0.2969290316104889, "epoch": 4.8999332888592395, "step": 14690 }, { "epoch": 4.8999332888592395, "ref_ce_loss": 0.15106698870658875, "step": 14690 }, { "epoch": 4.903268845897265, "loss": 0.8098, "step": 14700 }, { "epoch": 4.903268845897265, "grad_norm": 4.553890705108643, "step": 14700 }, { "epoch": 4.903268845897265, "learning_rate": 0.0004319436457565064, "step": 14700 }, { "epoch": 4.903268845897265, "loss": 0.8041062951087952, "step": 14700 }, { "ce_loss": 0.20178014039993286, "epoch": 4.903268845897265, "step": 14700 }, { "distill_loss": 0.3266127109527588, "epoch": 4.903268845897265, "step": 14700 }, { "epoch": 4.903268845897265, "ref_ce_loss": 0.1578376144170761, "step": 14700 }, { "epoch": 4.903268845897265, "loss": 1.2332589626312256, "step": 14700 }, { "ce_loss": 0.22578376531600952, "epoch": 4.903268845897265, "step": 14700 }, { "distill_loss": 0.31459343433380127, "epoch": 4.903268845897265, "step": 14700 }, { "epoch": 4.903268845897265, "ref_ce_loss": 0.18273325264453888, "step": 14700 }, { "epoch": 4.90660440293529, "loss": 0.8274, "step": 14710 }, { "epoch": 4.90660440293529, "grad_norm": 1.8715195655822754, "step": 14710 }, { "epoch": 4.90660440293529, "learning_rate": 0.00043151287631758094, "step": 14710 }, { "epoch": 4.90660440293529, "loss": 0.5332189798355103, "step": 14710 }, { "ce_loss": 0.13371208310127258, "epoch": 4.90660440293529, "step": 14710 }, { "distill_loss": 0.23901522159576416, "epoch": 4.90660440293529, "step": 14710 }, { "epoch": 4.90660440293529, "ref_ce_loss": 0.1602819859981537, "step": 14710 }, { "epoch": 4.90660440293529, "loss": 0.9503391981124878, "step": 14710 }, { "ce_loss": 0.21474634110927582, "epoch": 4.90660440293529, "step": 14710 }, { "distill_loss": 0.34557732939720154, "epoch": 4.90660440293529, "step": 14710 }, { "epoch": 4.90660440293529, "ref_ce_loss": 0.1305851936340332, "step": 14710 }, { "epoch": 4.909939959973316, "loss": 0.729, "step": 14720 }, { "epoch": 4.909939959973316, "grad_norm": 1.2429009675979614, "step": 14720 }, { "epoch": 4.909939959973316, "learning_rate": 0.0004310820700997381, "step": 14720 }, { "epoch": 4.909939959973316, "loss": 0.7952256202697754, "step": 14720 }, { "ce_loss": 0.1933000683784485, "epoch": 4.909939959973316, "step": 14720 }, { "distill_loss": 0.2214220017194748, "epoch": 4.909939959973316, "step": 14720 }, { "epoch": 4.909939959973316, "ref_ce_loss": 0.155678853392601, "step": 14720 }, { "epoch": 4.909939959973316, "loss": 0.7052319049835205, "step": 14720 }, { "ce_loss": 0.21559998393058777, "epoch": 4.909939959973316, "step": 14720 }, { "distill_loss": 0.2121732532978058, "epoch": 4.909939959973316, "step": 14720 }, { "epoch": 4.909939959973316, "ref_ce_loss": 0.17546199262142181, "step": 14720 }, { "epoch": 4.913275517011341, "loss": 0.7112, "step": 14730 }, { "epoch": 4.913275517011341, "grad_norm": 2.589341878890991, "step": 14730 }, { "epoch": 4.913275517011341, "learning_rate": 0.0004306512276057746, "step": 14730 }, { "epoch": 4.913275517011341, "loss": 0.6831985712051392, "step": 14730 }, { "ce_loss": 0.21680234372615814, "epoch": 4.913275517011341, "step": 14730 }, { "distill_loss": 0.17091289162635803, "epoch": 4.913275517011341, "step": 14730 }, { "epoch": 4.913275517011341, "ref_ce_loss": 0.24290505051612854, "step": 14730 }, { "epoch": 4.913275517011341, "loss": 0.820207953453064, "step": 14730 }, { "ce_loss": 0.25783631205558777, "epoch": 4.913275517011341, "step": 14730 }, { "distill_loss": 0.21073512732982635, "epoch": 4.913275517011341, "step": 14730 }, { "epoch": 4.913275517011341, "ref_ce_loss": 0.19787658751010895, "step": 14730 }, { "epoch": 4.916611074049366, "loss": 0.6716, "step": 14740 }, { "epoch": 4.916611074049366, "grad_norm": 1.1120072603225708, "step": 14740 }, { "epoch": 4.916611074049366, "learning_rate": 0.0004302203493385306, "step": 14740 }, { "epoch": 4.916611074049366, "loss": 0.569556474685669, "step": 14740 }, { "ce_loss": 0.16752736270427704, "epoch": 4.916611074049366, "step": 14740 }, { "distill_loss": 0.18688969314098358, "epoch": 4.916611074049366, "step": 14740 }, { "epoch": 4.916611074049366, "ref_ce_loss": 0.21498115360736847, "step": 14740 }, { "epoch": 4.916611074049366, "loss": 0.5875118970870972, "step": 14740 }, { "ce_loss": 0.24991795420646667, "epoch": 4.916611074049366, "step": 14740 }, { "distill_loss": 0.15244053304195404, "epoch": 4.916611074049366, "step": 14740 }, { "epoch": 4.916611074049366, "ref_ce_loss": 0.18492494523525238, "step": 14740 }, { "epoch": 4.919946631087392, "loss": 0.6869, "step": 14750 }, { "epoch": 4.919946631087392, "grad_norm": 2.2955188751220703, "step": 14750 }, { "epoch": 4.919946631087392, "learning_rate": 0.00042978943580088683, "step": 14750 }, { "epoch": 4.919946631087392, "loss": 0.6526211500167847, "step": 14750 }, { "ce_loss": 0.25695958733558655, "epoch": 4.919946631087392, "step": 14750 }, { "distill_loss": 0.2014576494693756, "epoch": 4.919946631087392, "step": 14750 }, { "epoch": 4.919946631087392, "ref_ce_loss": 0.1937873214483261, "step": 14750 }, { "epoch": 4.919946631087392, "loss": 0.6520219445228577, "step": 14750 }, { "ce_loss": 0.18020305037498474, "epoch": 4.919946631087392, "step": 14750 }, { "distill_loss": 0.19618704915046692, "epoch": 4.919946631087392, "step": 14750 }, { "epoch": 4.919946631087392, "ref_ce_loss": 0.16471585631370544, "step": 14750 }, { "epoch": 4.923282188125417, "loss": 0.667, "step": 14760 }, { "epoch": 4.923282188125417, "grad_norm": 1.4162378311157227, "step": 14760 }, { "epoch": 4.923282188125417, "learning_rate": 0.00042935848749576605, "step": 14760 }, { "epoch": 4.923282188125417, "loss": 0.8888968825340271, "step": 14760 }, { "ce_loss": 0.2973913252353668, "epoch": 4.923282188125417, "step": 14760 }, { "distill_loss": 0.20532050728797913, "epoch": 4.923282188125417, "step": 14760 }, { "epoch": 4.923282188125417, "ref_ce_loss": 0.19992171227931976, "step": 14760 }, { "epoch": 4.923282188125417, "loss": 0.5608199834823608, "step": 14760 }, { "ce_loss": 0.21077339351177216, "epoch": 4.923282188125417, "step": 14760 }, { "distill_loss": 0.16744637489318848, "epoch": 4.923282188125417, "step": 14760 }, { "epoch": 4.923282188125417, "ref_ce_loss": 0.14417974650859833, "step": 14760 }, { "epoch": 4.926617745163442, "loss": 0.7523, "step": 14770 }, { "epoch": 4.926617745163442, "grad_norm": 1.8813337087631226, "step": 14770 }, { "epoch": 4.926617745163442, "learning_rate": 0.00042892750492613124, "step": 14770 }, { "epoch": 4.926617745163442, "loss": 0.8061093091964722, "step": 14770 }, { "ce_loss": 0.21497511863708496, "epoch": 4.926617745163442, "step": 14770 }, { "distill_loss": 0.3684411942958832, "epoch": 4.926617745163442, "step": 14770 }, { "epoch": 4.926617745163442, "ref_ce_loss": 0.17852698266506195, "step": 14770 }, { "epoch": 4.926617745163442, "loss": 0.8953151702880859, "step": 14770 }, { "ce_loss": 0.27103355526924133, "epoch": 4.926617745163442, "step": 14770 }, { "distill_loss": 0.44409942626953125, "epoch": 4.926617745163442, "step": 14770 }, { "epoch": 4.926617745163442, "ref_ce_loss": 0.17995832860469818, "step": 14770 }, { "epoch": 4.929953302201468, "loss": 0.8933, "step": 14780 }, { "epoch": 4.929953302201468, "grad_norm": 1.758418083190918, "step": 14780 }, { "epoch": 4.929953302201468, "learning_rate": 0.00042849648859498554, "step": 14780 }, { "epoch": 4.929953302201468, "loss": 0.9538238048553467, "step": 14780 }, { "ce_loss": 0.2814600169658661, "epoch": 4.929953302201468, "step": 14780 }, { "distill_loss": 0.46908703446388245, "epoch": 4.929953302201468, "step": 14780 }, { "epoch": 4.929953302201468, "ref_ce_loss": 0.15025387704372406, "step": 14780 }, { "epoch": 4.929953302201468, "loss": 0.6634703278541565, "step": 14780 }, { "ce_loss": 0.17408856749534607, "epoch": 4.929953302201468, "step": 14780 }, { "distill_loss": 0.3205197751522064, "epoch": 4.929953302201468, "step": 14780 }, { "epoch": 4.929953302201468, "ref_ce_loss": 0.13817840814590454, "step": 14780 }, { "epoch": 4.933288859239493, "loss": 0.8628, "step": 14790 }, { "epoch": 4.933288859239493, "grad_norm": 2.2300400733947754, "step": 14790 }, { "epoch": 4.933288859239493, "learning_rate": 0.0004280654390053712, "step": 14790 }, { "epoch": 4.933288859239493, "loss": 1.016411304473877, "step": 14790 }, { "ce_loss": 0.2313816100358963, "epoch": 4.933288859239493, "step": 14790 }, { "distill_loss": 0.4427144229412079, "epoch": 4.933288859239493, "step": 14790 }, { "epoch": 4.933288859239493, "ref_ce_loss": 0.20980128645896912, "step": 14790 }, { "epoch": 4.933288859239493, "loss": 0.8823920488357544, "step": 14790 }, { "ce_loss": 0.24262329936027527, "epoch": 4.933288859239493, "step": 14790 }, { "distill_loss": 0.4533618986606598, "epoch": 4.933288859239493, "step": 14790 }, { "epoch": 4.933288859239493, "ref_ce_loss": 0.18566647171974182, "step": 14790 }, { "epoch": 4.936624416277518, "loss": 1.0459, "step": 14800 }, { "epoch": 4.936624416277518, "grad_norm": 6.204045295715332, "step": 14800 }, { "epoch": 4.936624416277518, "learning_rate": 0.00042763435666036973, "step": 14800 }, { "epoch": 4.936624416277518, "loss": 1.3538483381271362, "step": 14800 }, { "ce_loss": 0.3704322576522827, "epoch": 4.936624416277518, "step": 14800 }, { "distill_loss": 0.7152073979377747, "epoch": 4.936624416277518, "step": 14800 }, { "epoch": 4.936624416277518, "ref_ce_loss": 0.2093076854944229, "step": 14800 }, { "epoch": 4.936624416277518, "loss": 0.9754691123962402, "step": 14800 }, { "ce_loss": 0.24051721394062042, "epoch": 4.936624416277518, "step": 14800 }, { "distill_loss": 0.5091836452484131, "epoch": 4.936624416277518, "step": 14800 }, { "epoch": 4.936624416277518, "ref_ce_loss": 0.17049235105514526, "step": 14800 }, { "epoch": 4.939959973315544, "loss": 1.0958, "step": 14810 }, { "epoch": 4.939959973315544, "grad_norm": 3.4569523334503174, "step": 14810 }, { "epoch": 4.939959973315544, "learning_rate": 0.0004272032420631003, "step": 14810 }, { "epoch": 4.939959973315544, "loss": 0.8569350242614746, "step": 14810 }, { "ce_loss": 0.25664758682250977, "epoch": 4.939959973315544, "step": 14810 }, { "distill_loss": 0.39706772565841675, "epoch": 4.939959973315544, "step": 14810 }, { "epoch": 4.939959973315544, "ref_ce_loss": 0.16805876791477203, "step": 14810 }, { "epoch": 4.939959973315544, "loss": 1.6937330961227417, "step": 14810 }, { "ce_loss": 0.2796967625617981, "epoch": 4.939959973315544, "step": 14810 }, { "distill_loss": 0.9483956694602966, "epoch": 4.939959973315544, "step": 14810 }, { "epoch": 4.939959973315544, "ref_ce_loss": 0.2175309658050537, "step": 14810 }, { "epoch": 4.943295530353569, "loss": 1.0918, "step": 14820 }, { "epoch": 4.943295530353569, "grad_norm": 2.689622163772583, "step": 14820 }, { "epoch": 4.943295530353569, "learning_rate": 0.0004267720957167202, "step": 14820 }, { "epoch": 4.943295530353569, "loss": 0.7044408917427063, "step": 14820 }, { "ce_loss": 0.17607758939266205, "epoch": 4.943295530353569, "step": 14820 }, { "distill_loss": 0.3482334017753601, "epoch": 4.943295530353569, "step": 14820 }, { "epoch": 4.943295530353569, "ref_ce_loss": 0.17931784689426422, "step": 14820 }, { "epoch": 4.943295530353569, "loss": 0.9181632399559021, "step": 14820 }, { "ce_loss": 0.260895699262619, "epoch": 4.943295530353569, "step": 14820 }, { "distill_loss": 0.31426650285720825, "epoch": 4.943295530353569, "step": 14820 }, { "epoch": 4.943295530353569, "ref_ce_loss": 0.19637629389762878, "step": 14820 }, { "epoch": 4.946631087391594, "loss": 0.8052, "step": 14830 }, { "epoch": 4.946631087391594, "grad_norm": 1.5513012409210205, "step": 14830 }, { "epoch": 4.946631087391594, "learning_rate": 0.0004263409181244236, "step": 14830 }, { "epoch": 4.946631087391594, "loss": 0.7607131004333496, "step": 14830 }, { "ce_loss": 0.25896647572517395, "epoch": 4.946631087391594, "step": 14830 }, { "distill_loss": 0.34560784697532654, "epoch": 4.946631087391594, "step": 14830 }, { "epoch": 4.946631087391594, "ref_ce_loss": 0.15555359423160553, "step": 14830 }, { "epoch": 4.946631087391594, "loss": 1.0125609636306763, "step": 14830 }, { "ce_loss": 0.225737527012825, "epoch": 4.946631087391594, "step": 14830 }, { "distill_loss": 0.38140517473220825, "epoch": 4.946631087391594, "step": 14830 }, { "epoch": 4.946631087391594, "ref_ce_loss": 0.19740155339241028, "step": 14830 }, { "epoch": 4.94996664442962, "loss": 0.7785, "step": 14840 }, { "epoch": 4.94996664442962, "grad_norm": 1.6204591989517212, "step": 14840 }, { "epoch": 4.94996664442962, "learning_rate": 0.00042590970978944134, "step": 14840 }, { "epoch": 4.94996664442962, "loss": 0.651091992855072, "step": 14840 }, { "ce_loss": 0.17360687255859375, "epoch": 4.94996664442962, "step": 14840 }, { "distill_loss": 0.30723288655281067, "epoch": 4.94996664442962, "step": 14840 }, { "epoch": 4.94996664442962, "ref_ce_loss": 0.1698845624923706, "step": 14840 }, { "epoch": 4.94996664442962, "loss": 1.0454457998275757, "step": 14840 }, { "ce_loss": 0.2976030111312866, "epoch": 4.94996664442962, "step": 14840 }, { "distill_loss": 0.36757364869117737, "epoch": 4.94996664442962, "step": 14840 }, { "epoch": 4.94996664442962, "ref_ce_loss": 0.25793442130088806, "step": 14840 }, { "epoch": 4.953302201467645, "loss": 0.8274, "step": 14850 }, { "epoch": 4.953302201467645, "grad_norm": 1.1396735906600952, "step": 14850 }, { "epoch": 4.953302201467645, "learning_rate": 0.00042547847121503956, "step": 14850 }, { "epoch": 4.953302201467645, "loss": 0.7016441226005554, "step": 14850 }, { "ce_loss": 0.197813481092453, "epoch": 4.953302201467645, "step": 14850 }, { "distill_loss": 0.3064996600151062, "epoch": 4.953302201467645, "step": 14850 }, { "epoch": 4.953302201467645, "ref_ce_loss": 0.14802461862564087, "step": 14850 }, { "epoch": 4.953302201467645, "loss": 0.8446516990661621, "step": 14850 }, { "ce_loss": 0.19737334549427032, "epoch": 4.953302201467645, "step": 14850 }, { "distill_loss": 0.23980382084846497, "epoch": 4.953302201467645, "step": 14850 }, { "epoch": 4.953302201467645, "ref_ce_loss": 0.1891128122806549, "step": 14850 }, { "epoch": 4.9566377585056705, "loss": 0.8137, "step": 14860 }, { "epoch": 4.9566377585056705, "grad_norm": 1.8514357805252075, "step": 14860 }, { "epoch": 4.9566377585056705, "learning_rate": 0.0004250472029045204, "step": 14860 }, { "epoch": 4.9566377585056705, "loss": 0.7438077330589294, "step": 14860 }, { "ce_loss": 0.2085508406162262, "epoch": 4.9566377585056705, "step": 14860 }, { "distill_loss": 0.3794730305671692, "epoch": 4.9566377585056705, "step": 14860 }, { "epoch": 4.9566377585056705, "ref_ce_loss": 0.15562745928764343, "step": 14860 }, { "epoch": 4.9566377585056705, "loss": 0.595439076423645, "step": 14860 }, { "ce_loss": 0.16566596925258636, "epoch": 4.9566377585056705, "step": 14860 }, { "distill_loss": 0.3100895583629608, "epoch": 4.9566377585056705, "step": 14860 }, { "epoch": 4.9566377585056705, "ref_ce_loss": 0.11954033374786377, "step": 14860 }, { "epoch": 4.959973315543696, "loss": 0.7921, "step": 14870 }, { "epoch": 4.959973315543696, "grad_norm": 2.2713711261749268, "step": 14870 }, { "epoch": 4.959973315543696, "learning_rate": 0.00042461590536122017, "step": 14870 }, { "epoch": 4.959973315543696, "loss": 0.7876851558685303, "step": 14870 }, { "ce_loss": 0.21698933839797974, "epoch": 4.959973315543696, "step": 14870 }, { "distill_loss": 0.28425318002700806, "epoch": 4.959973315543696, "step": 14870 }, { "epoch": 4.959973315543696, "ref_ce_loss": 0.15588748455047607, "step": 14870 }, { "epoch": 4.959973315543696, "loss": 0.872055172920227, "step": 14870 }, { "ce_loss": 0.23964551091194153, "epoch": 4.959973315543696, "step": 14870 }, { "distill_loss": 0.37298545241355896, "epoch": 4.959973315543696, "step": 14870 }, { "epoch": 4.959973315543696, "ref_ce_loss": 0.18387804925441742, "step": 14870 }, { "epoch": 4.963308872581721, "loss": 0.807, "step": 14880 }, { "epoch": 4.963308872581721, "grad_norm": 1.6793760061264038, "step": 14880 }, { "epoch": 4.963308872581721, "learning_rate": 0.0004241845790885096, "step": 14880 }, { "epoch": 4.963308872581721, "loss": 1.0691640377044678, "step": 14880 }, { "ce_loss": 0.25611549615859985, "epoch": 4.963308872581721, "step": 14880 }, { "distill_loss": 0.361844539642334, "epoch": 4.963308872581721, "step": 14880 }, { "epoch": 4.963308872581721, "ref_ce_loss": 0.17869262397289276, "step": 14880 }, { "epoch": 4.963308872581721, "loss": 1.216537356376648, "step": 14880 }, { "ce_loss": 0.247919499874115, "epoch": 4.963308872581721, "step": 14880 }, { "distill_loss": 0.3699841797351837, "epoch": 4.963308872581721, "step": 14880 }, { "epoch": 4.963308872581721, "ref_ce_loss": 0.1814611852169037, "step": 14880 }, { "epoch": 4.9666444296197465, "loss": 0.8803, "step": 14890 }, { "epoch": 4.9666444296197465, "grad_norm": 1.9350767135620117, "step": 14890 }, { "epoch": 4.9666444296197465, "learning_rate": 0.00042375322458979286, "step": 14890 }, { "epoch": 4.9666444296197465, "loss": 1.2860150337219238, "step": 14890 }, { "ce_loss": 0.18326349556446075, "epoch": 4.9666444296197465, "step": 14890 }, { "distill_loss": 0.3313220739364624, "epoch": 4.9666444296197465, "step": 14890 }, { "epoch": 4.9666444296197465, "ref_ce_loss": 0.1453424096107483, "step": 14890 }, { "epoch": 4.9666444296197465, "loss": 1.0579900741577148, "step": 14890 }, { "ce_loss": 0.23096811771392822, "epoch": 4.9666444296197465, "step": 14890 }, { "distill_loss": 0.4033205211162567, "epoch": 4.9666444296197465, "step": 14890 }, { "epoch": 4.9666444296197465, "ref_ce_loss": 0.1717856526374817, "step": 14890 }, { "epoch": 4.969979986657772, "loss": 0.8604, "step": 14900 }, { "epoch": 4.969979986657772, "grad_norm": 1.4258919954299927, "step": 14900 }, { "epoch": 4.969979986657772, "learning_rate": 0.00042332184236850714, "step": 14900 }, { "epoch": 4.969979986657772, "loss": 0.6514805555343628, "step": 14900 }, { "ce_loss": 0.17925575375556946, "epoch": 4.969979986657772, "step": 14900 }, { "distill_loss": 0.2957485020160675, "epoch": 4.969979986657772, "step": 14900 }, { "epoch": 4.969979986657772, "ref_ce_loss": 0.1762385070323944, "step": 14900 }, { "epoch": 4.969979986657772, "loss": 0.9138930439949036, "step": 14900 }, { "ce_loss": 0.2070235162973404, "epoch": 4.969979986657772, "step": 14900 }, { "distill_loss": 0.3828832507133484, "epoch": 4.969979986657772, "step": 14900 }, { "epoch": 4.969979986657772, "ref_ce_loss": 0.16424107551574707, "step": 14900 }, { "epoch": 4.973315543695797, "loss": 0.7769, "step": 14910 }, { "epoch": 4.973315543695797, "grad_norm": 2.2698750495910645, "step": 14910 }, { "epoch": 4.973315543695797, "learning_rate": 0.00042289043292812183, "step": 14910 }, { "epoch": 4.973315543695797, "loss": 0.8078776001930237, "step": 14910 }, { "ce_loss": 0.19389958679676056, "epoch": 4.973315543695797, "step": 14910 }, { "distill_loss": 0.4422124922275543, "epoch": 4.973315543695797, "step": 14910 }, { "epoch": 4.973315543695797, "ref_ce_loss": 0.17119190096855164, "step": 14910 }, { "epoch": 4.973315543695797, "loss": 0.7781140208244324, "step": 14910 }, { "ce_loss": 0.207720085978508, "epoch": 4.973315543695797, "step": 14910 }, { "distill_loss": 0.3681999444961548, "epoch": 4.973315543695797, "step": 14910 }, { "epoch": 4.973315543695797, "ref_ce_loss": 0.2020249366760254, "step": 14910 }, { "epoch": 4.9766511007338226, "loss": 0.7858, "step": 14920 }, { "epoch": 4.9766511007338226, "grad_norm": 1.6022636890411377, "step": 14920 }, { "epoch": 4.9766511007338226, "learning_rate": 0.00042245899677213804, "step": 14920 }, { "epoch": 4.9766511007338226, "loss": 0.7355327010154724, "step": 14920 }, { "ce_loss": 0.24947616457939148, "epoch": 4.9766511007338226, "step": 14920 }, { "distill_loss": 0.28465527296066284, "epoch": 4.9766511007338226, "step": 14920 }, { "epoch": 4.9766511007338226, "ref_ce_loss": 0.15694041550159454, "step": 14920 }, { "epoch": 4.9766511007338226, "loss": 0.961696445941925, "step": 14920 }, { "ce_loss": 0.22658829391002655, "epoch": 4.9766511007338226, "step": 14920 }, { "distill_loss": 0.23476535081863403, "epoch": 4.9766511007338226, "step": 14920 }, { "epoch": 4.9766511007338226, "ref_ce_loss": 0.22592884302139282, "step": 14920 }, { "epoch": 4.979986657771848, "loss": 0.7838, "step": 14930 }, { "epoch": 4.979986657771848, "grad_norm": 1.8493866920471191, "step": 14930 }, { "epoch": 4.979986657771848, "learning_rate": 0.0004220275344040885, "step": 14930 }, { "epoch": 4.979986657771848, "loss": 0.6180217862129211, "step": 14930 }, { "ce_loss": 0.23227348923683167, "epoch": 4.979986657771848, "step": 14930 }, { "distill_loss": 0.2516789138317108, "epoch": 4.979986657771848, "step": 14930 }, { "epoch": 4.979986657771848, "ref_ce_loss": 0.13387711346149445, "step": 14930 }, { "epoch": 4.979986657771848, "loss": 0.7691706418991089, "step": 14930 }, { "ce_loss": 0.08289957791566849, "epoch": 4.979986657771848, "step": 14930 }, { "distill_loss": 0.18975229561328888, "epoch": 4.979986657771848, "step": 14930 }, { "epoch": 4.979986657771848, "ref_ce_loss": 0.10202156007289886, "step": 14930 }, { "epoch": 4.983322214809873, "loss": 0.8085, "step": 14940 }, { "epoch": 4.983322214809873, "grad_norm": 2.2217841148376465, "step": 14940 }, { "epoch": 4.983322214809873, "learning_rate": 0.00042159604632753593, "step": 14940 }, { "epoch": 4.983322214809873, "loss": 0.8267608284950256, "step": 14940 }, { "ce_loss": 0.2368335723876953, "epoch": 4.983322214809873, "step": 14940 }, { "distill_loss": 0.28960832953453064, "epoch": 4.983322214809873, "step": 14940 }, { "epoch": 4.983322214809873, "ref_ce_loss": 0.18242600560188293, "step": 14940 }, { "epoch": 4.983322214809873, "loss": 0.7490988969802856, "step": 14940 }, { "ce_loss": 0.1887538582086563, "epoch": 4.983322214809873, "step": 14940 }, { "distill_loss": 0.2502892315387726, "epoch": 4.983322214809873, "step": 14940 }, { "epoch": 4.983322214809873, "ref_ce_loss": 0.14834783971309662, "step": 14940 }, { "epoch": 4.986657771847899, "loss": 0.8345, "step": 14950 }, { "epoch": 4.986657771847899, "grad_norm": 3.3127646446228027, "step": 14950 }, { "epoch": 4.986657771847899, "learning_rate": 0.0004211645330460736, "step": 14950 }, { "epoch": 4.986657771847899, "loss": 0.7695230841636658, "step": 14950 }, { "ce_loss": 0.1897069364786148, "epoch": 4.986657771847899, "step": 14950 }, { "distill_loss": 0.32701951265335083, "epoch": 4.986657771847899, "step": 14950 }, { "epoch": 4.986657771847899, "ref_ce_loss": 0.17356127500534058, "step": 14950 }, { "epoch": 4.986657771847899, "loss": 0.8446176052093506, "step": 14950 }, { "ce_loss": 0.18127408623695374, "epoch": 4.986657771847899, "step": 14950 }, { "distill_loss": 0.3553479313850403, "epoch": 4.986657771847899, "step": 14950 }, { "epoch": 4.986657771847899, "ref_ce_loss": 0.17855243384838104, "step": 14950 }, { "epoch": 4.989993328885924, "loss": 0.8408, "step": 14960 }, { "epoch": 4.989993328885924, "grad_norm": 1.8860888481140137, "step": 14960 }, { "epoch": 4.989993328885924, "learning_rate": 0.0004207329950633237, "step": 14960 }, { "epoch": 4.989993328885924, "loss": 0.6589542627334595, "step": 14960 }, { "ce_loss": 0.18732410669326782, "epoch": 4.989993328885924, "step": 14960 }, { "distill_loss": 0.27363622188568115, "epoch": 4.989993328885924, "step": 14960 }, { "epoch": 4.989993328885924, "ref_ce_loss": 0.1468590348958969, "step": 14960 }, { "epoch": 4.989993328885924, "loss": 0.603113055229187, "step": 14960 }, { "ce_loss": 0.1368173062801361, "epoch": 4.989993328885924, "step": 14960 }, { "distill_loss": 0.31260940432548523, "epoch": 4.989993328885924, "step": 14960 }, { "epoch": 4.989993328885924, "ref_ce_loss": 0.11893227696418762, "step": 14960 }, { "epoch": 4.993328885923949, "loss": 0.8121, "step": 14970 }, { "epoch": 4.993328885923949, "grad_norm": 1.615979790687561, "step": 14970 }, { "epoch": 4.993328885923949, "learning_rate": 0.0004203014328829377, "step": 14970 }, { "epoch": 4.993328885923949, "loss": 0.8362368941307068, "step": 14970 }, { "ce_loss": 0.28994032740592957, "epoch": 4.993328885923949, "step": 14970 }, { "distill_loss": 0.3475920557975769, "epoch": 4.993328885923949, "step": 14970 }, { "epoch": 4.993328885923949, "ref_ce_loss": 0.19849959015846252, "step": 14970 }, { "epoch": 4.993328885923949, "loss": 0.8362758159637451, "step": 14970 }, { "ce_loss": 0.23408402502536774, "epoch": 4.993328885923949, "step": 14970 }, { "distill_loss": 0.3129236102104187, "epoch": 4.993328885923949, "step": 14970 }, { "epoch": 4.993328885923949, "ref_ce_loss": 0.17037709057331085, "step": 14970 }, { "epoch": 4.996664442961975, "loss": 0.7861, "step": 14980 }, { "epoch": 4.996664442961975, "grad_norm": 9.854972839355469, "step": 14980 }, { "epoch": 4.996664442961975, "learning_rate": 0.00041986984700859473, "step": 14980 }, { "epoch": 4.996664442961975, "loss": 0.7983810901641846, "step": 14980 }, { "ce_loss": 0.22925935685634613, "epoch": 4.996664442961975, "step": 14980 }, { "distill_loss": 0.2890111804008484, "epoch": 4.996664442961975, "step": 14980 }, { "epoch": 4.996664442961975, "ref_ce_loss": 0.17651940882205963, "step": 14980 }, { "epoch": 4.996664442961975, "loss": 0.7387625575065613, "step": 14980 }, { "ce_loss": 0.20382659137248993, "epoch": 4.996664442961975, "step": 14980 }, { "distill_loss": 0.3147651255130768, "epoch": 4.996664442961975, "step": 14980 }, { "epoch": 4.996664442961975, "ref_ce_loss": 0.15080402791500092, "step": 14980 }, { "epoch": 5.0, "loss": 0.8124, "step": 14990 }, { "epoch": 5.0, "grad_norm": 1.7010997533798218, "step": 14990 }, { "epoch": 5.0, "learning_rate": 0.00041943823794400256, "step": 14990 }, { "epoch": 5.0, "loss": 0.5493976473808289, "step": 14990 }, { "ce_loss": 0.12174477428197861, "epoch": 5.0, "step": 14990 }, { "distill_loss": 0.29600775241851807, "epoch": 5.0, "step": 14990 }, { "epoch": 5.0, "ref_ce_loss": 0.13114362955093384, "step": 14990 }, { "epoch": 5.0, "loss": 0.6701236963272095, "step": 14990 }, { "ce_loss": 0.1493186503648758, "epoch": 5.0, "step": 14990 }, { "distill_loss": 0.29544782638549805, "epoch": 5.0, "step": 14990 }, { "epoch": 5.0, "ref_ce_loss": 0.125166118144989, "step": 14990 }, { "epoch": 5.003335557038025, "loss": 0.6878, "step": 15000 }, { "epoch": 5.003335557038025, "grad_norm": 1.6340457201004028, "step": 15000 }, { "epoch": 5.003335557038025, "learning_rate": 0.0004190066061928949, "step": 15000 }, { "epoch": 5.003335557038025, "loss": 0.7245055437088013, "step": 15000 }, { "ce_loss": 0.32626980543136597, "epoch": 5.003335557038025, "step": 15000 }, { "distill_loss": 0.15379448235034943, "epoch": 5.003335557038025, "step": 15000 }, { "epoch": 5.003335557038025, "ref_ce_loss": 0.19659432768821716, "step": 15000 }, { "epoch": 5.003335557038025, "loss": 0.5948415994644165, "step": 15000 }, { "ce_loss": 0.279764324426651, "epoch": 5.003335557038025, "step": 15000 }, { "distill_loss": 0.1267262101173401, "epoch": 5.003335557038025, "step": 15000 }, { "epoch": 5.003335557038025, "ref_ce_loss": 0.187013179063797, "step": 15000 }, { "epoch": 5.006671114076051, "loss": 0.7193, "step": 15010 }, { "epoch": 5.006671114076051, "grad_norm": 2.0343847274780273, "step": 15010 }, { "epoch": 5.006671114076051, "learning_rate": 0.0004185749522590327, "step": 15010 }, { "epoch": 5.006671114076051, "loss": 0.5516359210014343, "step": 15010 }, { "ce_loss": 0.1559305191040039, "epoch": 5.006671114076051, "step": 15010 }, { "distill_loss": 0.2148263156414032, "epoch": 5.006671114076051, "step": 15010 }, { "epoch": 5.006671114076051, "ref_ce_loss": 0.18032346665859222, "step": 15010 }, { "epoch": 5.006671114076051, "loss": 0.6730528473854065, "step": 15010 }, { "ce_loss": 0.256583571434021, "epoch": 5.006671114076051, "step": 15010 }, { "distill_loss": 0.23742398619651794, "epoch": 5.006671114076051, "step": 15010 }, { "epoch": 5.006671114076051, "ref_ce_loss": 0.15251505374908447, "step": 15010 }, { "epoch": 5.010006671114076, "loss": 0.7347, "step": 15020 }, { "epoch": 5.010006671114076, "grad_norm": 2.410008430480957, "step": 15020 }, { "epoch": 5.010006671114076, "learning_rate": 0.00041814327664620236, "step": 15020 }, { "epoch": 5.010006671114076, "loss": 0.9244962930679321, "step": 15020 }, { "ce_loss": 0.12703068554401398, "epoch": 5.010006671114076, "step": 15020 }, { "distill_loss": 0.3417317271232605, "epoch": 5.010006671114076, "step": 15020 }, { "epoch": 5.010006671114076, "ref_ce_loss": 0.16752563416957855, "step": 15020 }, { "epoch": 5.010006671114076, "loss": 0.6968506574630737, "step": 15020 }, { "ce_loss": 0.17538586258888245, "epoch": 5.010006671114076, "step": 15020 }, { "distill_loss": 0.3248499035835266, "epoch": 5.010006671114076, "step": 15020 }, { "epoch": 5.010006671114076, "ref_ce_loss": 0.15323364734649658, "step": 15020 }, { "epoch": 5.013342228152101, "loss": 0.8062, "step": 15030 }, { "epoch": 5.013342228152101, "grad_norm": 1.9668642282485962, "step": 15030 }, { "epoch": 5.013342228152101, "learning_rate": 0.00041771157985821583, "step": 15030 }, { "epoch": 5.013342228152101, "loss": 0.7335589528083801, "step": 15030 }, { "ce_loss": 0.185219407081604, "epoch": 5.013342228152101, "step": 15030 }, { "distill_loss": 0.35626813769340515, "epoch": 5.013342228152101, "step": 15030 }, { "epoch": 5.013342228152101, "ref_ce_loss": 0.19170069694519043, "step": 15030 }, { "epoch": 5.013342228152101, "loss": 0.8919950127601624, "step": 15030 }, { "ce_loss": 0.17872750759124756, "epoch": 5.013342228152101, "step": 15030 }, { "distill_loss": 0.3545999825000763, "epoch": 5.013342228152101, "step": 15030 }, { "epoch": 5.013342228152101, "ref_ce_loss": 0.1883302479982376, "step": 15030 }, { "epoch": 5.016677785190127, "loss": 0.7265, "step": 15040 }, { "epoch": 5.016677785190127, "grad_norm": 1.4379287958145142, "step": 15040 }, { "epoch": 5.016677785190127, "learning_rate": 0.0004172798623989099, "step": 15040 }, { "epoch": 5.016677785190127, "loss": 0.7448151707649231, "step": 15040 }, { "ce_loss": 0.20049512386322021, "epoch": 5.016677785190127, "step": 15040 }, { "distill_loss": 0.34122711420059204, "epoch": 5.016677785190127, "step": 15040 }, { "epoch": 5.016677785190127, "ref_ce_loss": 0.12881450355052948, "step": 15040 }, { "epoch": 5.016677785190127, "loss": 0.7172622680664062, "step": 15040 }, { "ce_loss": 0.12092036753892899, "epoch": 5.016677785190127, "step": 15040 }, { "distill_loss": 0.21639123558998108, "epoch": 5.016677785190127, "step": 15040 }, { "epoch": 5.016677785190127, "ref_ce_loss": 0.10656973719596863, "step": 15040 }, { "epoch": 5.020013342228152, "loss": 0.7088, "step": 15050 }, { "epoch": 5.020013342228152, "grad_norm": 1.9609133005142212, "step": 15050 }, { "epoch": 5.020013342228152, "learning_rate": 0.00041684812477214513, "step": 15050 }, { "epoch": 5.020013342228152, "loss": 0.6304663419723511, "step": 15050 }, { "ce_loss": 0.1168755292892456, "epoch": 5.020013342228152, "step": 15050 }, { "distill_loss": 0.2414208948612213, "epoch": 5.020013342228152, "step": 15050 }, { "epoch": 5.020013342228152, "ref_ce_loss": 0.11683718115091324, "step": 15050 }, { "epoch": 5.020013342228152, "loss": 0.8791097402572632, "step": 15050 }, { "ce_loss": 0.14968617260456085, "epoch": 5.020013342228152, "step": 15050 }, { "distill_loss": 0.299162894487381, "epoch": 5.020013342228152, "step": 15050 }, { "epoch": 5.020013342228152, "ref_ce_loss": 0.12304960936307907, "step": 15050 }, { "epoch": 5.0233488992661774, "loss": 0.7655, "step": 15060 }, { "epoch": 5.0233488992661774, "grad_norm": 2.369798183441162, "step": 15060 }, { "epoch": 5.0233488992661774, "learning_rate": 0.0004164163674818058, "step": 15060 }, { "epoch": 5.0233488992661774, "loss": 0.7249473929405212, "step": 15060 }, { "ce_loss": 0.19053047895431519, "epoch": 5.0233488992661774, "step": 15060 }, { "distill_loss": 0.3092261254787445, "epoch": 5.0233488992661774, "step": 15060 }, { "epoch": 5.0233488992661774, "ref_ce_loss": 0.16751517355442047, "step": 15060 }, { "epoch": 5.0233488992661774, "loss": 0.7884148359298706, "step": 15060 }, { "ce_loss": 0.11927471309900284, "epoch": 5.0233488992661774, "step": 15060 }, { "distill_loss": 0.30460503697395325, "epoch": 5.0233488992661774, "step": 15060 }, { "epoch": 5.0233488992661774, "ref_ce_loss": 0.1316409558057785, "step": 15060 }, { "epoch": 5.026684456304203, "loss": 0.757, "step": 15070 }, { "epoch": 5.026684456304203, "grad_norm": 1.569125771522522, "step": 15070 }, { "epoch": 5.026684456304203, "learning_rate": 0.00041598459103179923, "step": 15070 }, { "epoch": 5.026684456304203, "loss": 0.7768588066101074, "step": 15070 }, { "ce_loss": 0.17738273739814758, "epoch": 5.026684456304203, "step": 15070 }, { "distill_loss": 0.33382323384284973, "epoch": 5.026684456304203, "step": 15070 }, { "epoch": 5.026684456304203, "ref_ce_loss": 0.1485685408115387, "step": 15070 }, { "epoch": 5.026684456304203, "loss": 0.7797510623931885, "step": 15070 }, { "ce_loss": 0.20250549912452698, "epoch": 5.026684456304203, "step": 15070 }, { "distill_loss": 0.2760103940963745, "epoch": 5.026684456304203, "step": 15070 }, { "epoch": 5.026684456304203, "ref_ce_loss": 0.17828702926635742, "step": 15070 }, { "epoch": 5.030020013342228, "loss": 0.8597, "step": 15080 }, { "epoch": 5.030020013342228, "grad_norm": 1.8850408792495728, "step": 15080 }, { "epoch": 5.030020013342228, "learning_rate": 0.0004155527959260548, "step": 15080 }, { "epoch": 5.030020013342228, "loss": 0.9354071021080017, "step": 15080 }, { "ce_loss": 0.2691536247730255, "epoch": 5.030020013342228, "step": 15080 }, { "distill_loss": 0.4211371839046478, "epoch": 5.030020013342228, "step": 15080 }, { "epoch": 5.030020013342228, "ref_ce_loss": 0.1899864375591278, "step": 15080 }, { "epoch": 5.030020013342228, "loss": 0.5601158142089844, "step": 15080 }, { "ce_loss": 0.14419464766979218, "epoch": 5.030020013342228, "step": 15080 }, { "distill_loss": 0.2756442129611969, "epoch": 5.030020013342228, "step": 15080 }, { "epoch": 5.030020013342228, "ref_ce_loss": 0.1400666981935501, "step": 15080 }, { "epoch": 5.0333555703802535, "loss": 0.7297, "step": 15090 }, { "epoch": 5.0333555703802535, "grad_norm": 1.7295362949371338, "step": 15090 }, { "epoch": 5.0333555703802535, "learning_rate": 0.0004151209826685239, "step": 15090 }, { "epoch": 5.0333555703802535, "loss": 0.7522996068000793, "step": 15090 }, { "ce_loss": 0.21195000410079956, "epoch": 5.0333555703802535, "step": 15090 }, { "distill_loss": 0.333752304315567, "epoch": 5.0333555703802535, "step": 15090 }, { "epoch": 5.0333555703802535, "ref_ce_loss": 0.16549304127693176, "step": 15090 }, { "epoch": 5.0333555703802535, "loss": 0.6685563921928406, "step": 15090 }, { "ce_loss": 0.14069870114326477, "epoch": 5.0333555703802535, "step": 15090 }, { "distill_loss": 0.3408507704734802, "epoch": 5.0333555703802535, "step": 15090 }, { "epoch": 5.0333555703802535, "ref_ce_loss": 0.14039243757724762, "step": 15090 }, { "epoch": 5.036691127418279, "loss": 0.7199, "step": 15100 }, { "epoch": 5.036691127418279, "grad_norm": 2.117892026901245, "step": 15100 }, { "epoch": 5.036691127418279, "learning_rate": 0.00041468915176317927, "step": 15100 }, { "epoch": 5.036691127418279, "loss": 0.5341494083404541, "step": 15100 }, { "ce_loss": 0.1536131352186203, "epoch": 5.036691127418279, "step": 15100 }, { "distill_loss": 0.2793445289134979, "epoch": 5.036691127418279, "step": 15100 }, { "epoch": 5.036691127418279, "ref_ce_loss": 0.1009863018989563, "step": 15100 }, { "epoch": 5.036691127418279, "loss": 0.7302848100662231, "step": 15100 }, { "ce_loss": 0.1963079273700714, "epoch": 5.036691127418279, "step": 15100 }, { "distill_loss": 0.33204758167266846, "epoch": 5.036691127418279, "step": 15100 }, { "epoch": 5.036691127418279, "ref_ce_loss": 0.15897220373153687, "step": 15100 }, { "epoch": 5.040026684456304, "loss": 0.7521, "step": 15110 }, { "epoch": 5.040026684456304, "grad_norm": 2.443079710006714, "step": 15110 }, { "epoch": 5.040026684456304, "learning_rate": 0.00041425730371401397, "step": 15110 }, { "epoch": 5.040026684456304, "loss": 0.718482494354248, "step": 15110 }, { "ce_loss": 0.20535221695899963, "epoch": 5.040026684456304, "step": 15110 }, { "distill_loss": 0.3341054320335388, "epoch": 5.040026684456304, "step": 15110 }, { "epoch": 5.040026684456304, "ref_ce_loss": 0.1788957566022873, "step": 15110 }, { "epoch": 5.040026684456304, "loss": 1.0382378101348877, "step": 15110 }, { "ce_loss": 0.2585584819316864, "epoch": 5.040026684456304, "step": 15110 }, { "distill_loss": 0.39312127232551575, "epoch": 5.040026684456304, "step": 15110 }, { "epoch": 5.040026684456304, "ref_ce_loss": 0.17663244903087616, "step": 15110 }, { "epoch": 5.0433622414943295, "loss": 0.8617, "step": 15120 }, { "epoch": 5.0433622414943295, "grad_norm": 2.586002826690674, "step": 15120 }, { "epoch": 5.0433622414943295, "learning_rate": 0.000413825439025041, "step": 15120 }, { "epoch": 5.0433622414943295, "loss": 0.9322474002838135, "step": 15120 }, { "ce_loss": 0.17946970462799072, "epoch": 5.0433622414943295, "step": 15120 }, { "distill_loss": 0.38679298758506775, "epoch": 5.0433622414943295, "step": 15120 }, { "epoch": 5.0433622414943295, "ref_ce_loss": 0.12891161441802979, "step": 15120 }, { "epoch": 5.0433622414943295, "loss": 0.7245580554008484, "step": 15120 }, { "ce_loss": 0.18467949330806732, "epoch": 5.0433622414943295, "step": 15120 }, { "distill_loss": 0.3750002384185791, "epoch": 5.0433622414943295, "step": 15120 }, { "epoch": 5.0433622414943295, "ref_ce_loss": 0.14669382572174072, "step": 15120 }, { "epoch": 5.046697798532355, "loss": 0.7683, "step": 15130 }, { "epoch": 5.046697798532355, "grad_norm": 2.1405930519104004, "step": 15130 }, { "epoch": 5.046697798532355, "learning_rate": 0.0004133935582002931, "step": 15130 }, { "epoch": 5.046697798532355, "loss": 0.7090080380439758, "step": 15130 }, { "ce_loss": 0.18841010332107544, "epoch": 5.046697798532355, "step": 15130 }, { "distill_loss": 0.3492705523967743, "epoch": 5.046697798532355, "step": 15130 }, { "epoch": 5.046697798532355, "ref_ce_loss": 0.1372767835855484, "step": 15130 }, { "epoch": 5.046697798532355, "loss": 0.7359943985939026, "step": 15130 }, { "ce_loss": 0.19403263926506042, "epoch": 5.046697798532355, "step": 15130 }, { "distill_loss": 0.3826109766960144, "epoch": 5.046697798532355, "step": 15130 }, { "epoch": 5.046697798532355, "ref_ce_loss": 0.1589689403772354, "step": 15130 }, { "epoch": 5.05003335557038, "loss": 0.8044, "step": 15140 }, { "epoch": 5.05003335557038, "grad_norm": 1.4716843366622925, "step": 15140 }, { "epoch": 5.05003335557038, "learning_rate": 0.0004129616617438214, "step": 15140 }, { "epoch": 5.05003335557038, "loss": 1.0415242910385132, "step": 15140 }, { "ce_loss": 0.19426783919334412, "epoch": 5.05003335557038, "step": 15140 }, { "distill_loss": 0.3080959618091583, "epoch": 5.05003335557038, "step": 15140 }, { "epoch": 5.05003335557038, "ref_ce_loss": 0.16131074726581573, "step": 15140 }, { "epoch": 5.05003335557038, "loss": 0.5969170928001404, "step": 15140 }, { "ce_loss": 0.1542249321937561, "epoch": 5.05003335557038, "step": 15140 }, { "distill_loss": 0.23913876712322235, "epoch": 5.05003335557038, "step": 15140 }, { "epoch": 5.05003335557038, "ref_ce_loss": 0.1418016105890274, "step": 15140 }, { "epoch": 5.053368912608406, "loss": 0.7259, "step": 15150 }, { "epoch": 5.053368912608406, "grad_norm": 1.4115509986877441, "step": 15150 }, { "epoch": 5.053368912608406, "learning_rate": 0.0004125297501596958, "step": 15150 }, { "epoch": 5.053368912608406, "loss": 0.8852753043174744, "step": 15150 }, { "ce_loss": 0.2032100260257721, "epoch": 5.053368912608406, "step": 15150 }, { "distill_loss": 0.3799605071544647, "epoch": 5.053368912608406, "step": 15150 }, { "epoch": 5.053368912608406, "ref_ce_loss": 0.15284423530101776, "step": 15150 }, { "epoch": 5.053368912608406, "loss": 0.7631134390830994, "step": 15150 }, { "ce_loss": 0.23339112102985382, "epoch": 5.053368912608406, "step": 15150 }, { "distill_loss": 0.3507590889930725, "epoch": 5.053368912608406, "step": 15150 }, { "epoch": 5.053368912608406, "ref_ce_loss": 0.17875313758850098, "step": 15150 }, { "epoch": 5.056704469646431, "loss": 0.7739, "step": 15160 }, { "epoch": 5.056704469646431, "grad_norm": 1.3898341655731201, "step": 15160 }, { "epoch": 5.056704469646431, "learning_rate": 0.0004120978239520035, "step": 15160 }, { "epoch": 5.056704469646431, "loss": 0.7077916264533997, "step": 15160 }, { "ce_loss": 0.1837417483329773, "epoch": 5.056704469646431, "step": 15160 }, { "distill_loss": 0.30820026993751526, "epoch": 5.056704469646431, "step": 15160 }, { "epoch": 5.056704469646431, "ref_ce_loss": 0.1538972705602646, "step": 15160 }, { "epoch": 5.056704469646431, "loss": 0.70729660987854, "step": 15160 }, { "ce_loss": 0.14605118334293365, "epoch": 5.056704469646431, "step": 15160 }, { "distill_loss": 0.28691962361335754, "epoch": 5.056704469646431, "step": 15160 }, { "epoch": 5.056704469646431, "ref_ce_loss": 0.13982750475406647, "step": 15160 }, { "epoch": 5.060040026684456, "loss": 0.752, "step": 15170 }, { "epoch": 5.060040026684456, "grad_norm": 1.3309475183486938, "step": 15170 }, { "epoch": 5.060040026684456, "learning_rate": 0.0004116658836248489, "step": 15170 }, { "epoch": 5.060040026684456, "loss": 0.6324098706245422, "step": 15170 }, { "ce_loss": 0.16235965490341187, "epoch": 5.060040026684456, "step": 15170 }, { "distill_loss": 0.3043210506439209, "epoch": 5.060040026684456, "step": 15170 }, { "epoch": 5.060040026684456, "ref_ce_loss": 0.13393297791481018, "step": 15170 }, { "epoch": 5.060040026684456, "loss": 0.8985719680786133, "step": 15170 }, { "ce_loss": 0.22842848300933838, "epoch": 5.060040026684456, "step": 15170 }, { "distill_loss": 0.37544792890548706, "epoch": 5.060040026684456, "step": 15170 }, { "epoch": 5.060040026684456, "ref_ce_loss": 0.20678819715976715, "step": 15170 }, { "epoch": 5.063375583722482, "loss": 0.7813, "step": 15180 }, { "epoch": 5.063375583722482, "grad_norm": 1.6714725494384766, "step": 15180 }, { "epoch": 5.063375583722482, "learning_rate": 0.00041123392968235275, "step": 15180 }, { "epoch": 5.063375583722482, "loss": 0.8342439532279968, "step": 15180 }, { "ce_loss": 0.12360727041959763, "epoch": 5.063375583722482, "step": 15180 }, { "distill_loss": 0.2871699333190918, "epoch": 5.063375583722482, "step": 15180 }, { "epoch": 5.063375583722482, "ref_ce_loss": 0.13866832852363586, "step": 15180 }, { "epoch": 5.063375583722482, "loss": 0.8737316727638245, "step": 15180 }, { "ce_loss": 0.2810107171535492, "epoch": 5.063375583722482, "step": 15180 }, { "distill_loss": 0.3678017854690552, "epoch": 5.063375583722482, "step": 15180 }, { "epoch": 5.063375583722482, "ref_ce_loss": 0.1715250015258789, "step": 15180 }, { "epoch": 5.066711140760507, "loss": 0.7547, "step": 15190 }, { "epoch": 5.066711140760507, "grad_norm": 1.899660348892212, "step": 15190 }, { "epoch": 5.066711140760507, "learning_rate": 0.00041080196262865195, "step": 15190 }, { "epoch": 5.066711140760507, "loss": 0.8878491520881653, "step": 15190 }, { "ce_loss": 0.20929262042045593, "epoch": 5.066711140760507, "step": 15190 }, { "distill_loss": 0.3679594099521637, "epoch": 5.066711140760507, "step": 15190 }, { "epoch": 5.066711140760507, "ref_ce_loss": 0.2262931913137436, "step": 15190 }, { "epoch": 5.066711140760507, "loss": 0.5590569972991943, "step": 15190 }, { "ce_loss": 0.14374060928821564, "epoch": 5.066711140760507, "step": 15190 }, { "distill_loss": 0.28797242045402527, "epoch": 5.066711140760507, "step": 15190 }, { "epoch": 5.066711140760507, "ref_ce_loss": 0.1269454061985016, "step": 15190 }, { "epoch": 5.070046697798532, "loss": 0.7276, "step": 15200 }, { "epoch": 5.070046697798532, "grad_norm": 1.7899247407913208, "step": 15200 }, { "epoch": 5.070046697798532, "learning_rate": 0.0004103699829678983, "step": 15200 }, { "epoch": 5.070046697798532, "loss": 0.8208807706832886, "step": 15200 }, { "ce_loss": 0.14580045640468597, "epoch": 5.070046697798532, "step": 15200 }, { "distill_loss": 0.3477942943572998, "epoch": 5.070046697798532, "step": 15200 }, { "epoch": 5.070046697798532, "ref_ce_loss": 0.14927612245082855, "step": 15200 }, { "epoch": 5.070046697798532, "loss": 0.6615450978279114, "step": 15200 }, { "ce_loss": 0.18760168552398682, "epoch": 5.070046697798532, "step": 15200 }, { "distill_loss": 0.3171888589859009, "epoch": 5.070046697798532, "step": 15200 }, { "epoch": 5.070046697798532, "ref_ce_loss": 0.11915821582078934, "step": 15200 }, { "epoch": 5.073382254836558, "loss": 0.8015, "step": 15210 }, { "epoch": 5.073382254836558, "grad_norm": 1.3192826509475708, "step": 15210 }, { "epoch": 5.073382254836558, "learning_rate": 0.00040993799120425873, "step": 15210 }, { "epoch": 5.073382254836558, "loss": 0.7259508967399597, "step": 15210 }, { "ce_loss": 0.17098185420036316, "epoch": 5.073382254836558, "step": 15210 }, { "distill_loss": 0.3219786286354065, "epoch": 5.073382254836558, "step": 15210 }, { "epoch": 5.073382254836558, "ref_ce_loss": 0.14518868923187256, "step": 15210 }, { "epoch": 5.073382254836558, "loss": 0.9862769842147827, "step": 15210 }, { "ce_loss": 0.14905671775341034, "epoch": 5.073382254836558, "step": 15210 }, { "distill_loss": 0.39070963859558105, "epoch": 5.073382254836558, "step": 15210 }, { "epoch": 5.073382254836558, "ref_ce_loss": 0.13869890570640564, "step": 15210 }, { "epoch": 5.076717811874583, "loss": 0.7702, "step": 15220 }, { "epoch": 5.076717811874583, "grad_norm": 2.40421724319458, "step": 15220 }, { "epoch": 5.076717811874583, "learning_rate": 0.000409505987841914, "step": 15220 }, { "epoch": 5.076717811874583, "loss": 0.796295166015625, "step": 15220 }, { "ce_loss": 0.1610361784696579, "epoch": 5.076717811874583, "step": 15220 }, { "distill_loss": 0.3039185106754303, "epoch": 5.076717811874583, "step": 15220 }, { "epoch": 5.076717811874583, "ref_ce_loss": 0.20260609686374664, "step": 15220 }, { "epoch": 5.076717811874583, "loss": 0.9617740511894226, "step": 15220 }, { "ce_loss": 0.22738519310951233, "epoch": 5.076717811874583, "step": 15220 }, { "distill_loss": 0.3633210361003876, "epoch": 5.076717811874583, "step": 15220 }, { "epoch": 5.076717811874583, "ref_ce_loss": 0.16509735584259033, "step": 15220 }, { "epoch": 5.080053368912608, "loss": 0.7737, "step": 15230 }, { "epoch": 5.080053368912608, "grad_norm": 2.2806813716888428, "step": 15230 }, { "epoch": 5.080053368912608, "learning_rate": 0.0004090739733850587, "step": 15230 }, { "epoch": 5.080053368912608, "loss": 0.6277940273284912, "step": 15230 }, { "ce_loss": 0.1376854032278061, "epoch": 5.080053368912608, "step": 15230 }, { "distill_loss": 0.34044697880744934, "epoch": 5.080053368912608, "step": 15230 }, { "epoch": 5.080053368912608, "ref_ce_loss": 0.11363232880830765, "step": 15230 }, { "epoch": 5.080053368912608, "loss": 0.6917346119880676, "step": 15230 }, { "ce_loss": 0.20212016999721527, "epoch": 5.080053368912608, "step": 15230 }, { "distill_loss": 0.31108105182647705, "epoch": 5.080053368912608, "step": 15230 }, { "epoch": 5.080053368912608, "ref_ce_loss": 0.16628040373325348, "step": 15230 }, { "epoch": 5.083388925950634, "loss": 0.7025, "step": 15240 }, { "epoch": 5.083388925950634, "grad_norm": 2.196148157119751, "step": 15240 }, { "epoch": 5.083388925950634, "learning_rate": 0.00040864194833789997, "step": 15240 }, { "epoch": 5.083388925950634, "loss": 1.0141888856887817, "step": 15240 }, { "ce_loss": 0.16802021861076355, "epoch": 5.083388925950634, "step": 15240 }, { "distill_loss": 0.32726022601127625, "epoch": 5.083388925950634, "step": 15240 }, { "epoch": 5.083388925950634, "ref_ce_loss": 0.16862088441848755, "step": 15240 }, { "epoch": 5.083388925950634, "loss": 0.8400261998176575, "step": 15240 }, { "ce_loss": 0.15872463583946228, "epoch": 5.083388925950634, "step": 15240 }, { "distill_loss": 0.31594353914260864, "epoch": 5.083388925950634, "step": 15240 }, { "epoch": 5.083388925950634, "ref_ce_loss": 0.15056195855140686, "step": 15240 }, { "epoch": 5.086724482988659, "loss": 0.7916, "step": 15250 }, { "epoch": 5.086724482988659, "grad_norm": 1.3587056398391724, "step": 15250 }, { "epoch": 5.086724482988659, "learning_rate": 0.0004082099132046575, "step": 15250 }, { "epoch": 5.086724482988659, "loss": 0.6186241507530212, "step": 15250 }, { "ce_loss": 0.1958022266626358, "epoch": 5.086724482988659, "step": 15250 }, { "distill_loss": 0.2964455485343933, "epoch": 5.086724482988659, "step": 15250 }, { "epoch": 5.086724482988659, "ref_ce_loss": 0.12577207386493683, "step": 15250 }, { "epoch": 5.086724482988659, "loss": 0.7843382954597473, "step": 15250 }, { "ce_loss": 0.1981726884841919, "epoch": 5.086724482988659, "step": 15250 }, { "distill_loss": 0.3239561915397644, "epoch": 5.086724482988659, "step": 15250 }, { "epoch": 5.086724482988659, "ref_ce_loss": 0.14233283698558807, "step": 15250 }, { "epoch": 5.090060040026684, "loss": 0.6869, "step": 15260 }, { "epoch": 5.090060040026684, "grad_norm": 1.2597806453704834, "step": 15260 }, { "epoch": 5.090060040026684, "learning_rate": 0.00040777786848956304, "step": 15260 }, { "epoch": 5.090060040026684, "loss": 0.6707214117050171, "step": 15260 }, { "ce_loss": 0.16234301030635834, "epoch": 5.090060040026684, "step": 15260 }, { "distill_loss": 0.27243322134017944, "epoch": 5.090060040026684, "step": 15260 }, { "epoch": 5.090060040026684, "ref_ce_loss": 0.18407489359378815, "step": 15260 }, { "epoch": 5.090060040026684, "loss": 0.7344317436218262, "step": 15260 }, { "ce_loss": 0.16686370968818665, "epoch": 5.090060040026684, "step": 15260 }, { "distill_loss": 0.31479838490486145, "epoch": 5.090060040026684, "step": 15260 }, { "epoch": 5.090060040026684, "ref_ce_loss": 0.14425840973854065, "step": 15260 }, { "epoch": 5.09339559706471, "loss": 0.7113, "step": 15270 }, { "epoch": 5.09339559706471, "grad_norm": 2.707850694656372, "step": 15270 }, { "epoch": 5.09339559706471, "learning_rate": 0.00040734581469685906, "step": 15270 }, { "epoch": 5.09339559706471, "loss": 0.787645697593689, "step": 15270 }, { "ce_loss": 0.18194088339805603, "epoch": 5.09339559706471, "step": 15270 }, { "distill_loss": 0.3009050786495209, "epoch": 5.09339559706471, "step": 15270 }, { "epoch": 5.09339559706471, "ref_ce_loss": 0.14698223769664764, "step": 15270 }, { "epoch": 5.09339559706471, "loss": 0.8195387721061707, "step": 15270 }, { "ce_loss": 0.17880409955978394, "epoch": 5.09339559706471, "step": 15270 }, { "distill_loss": 0.2749001979827881, "epoch": 5.09339559706471, "step": 15270 }, { "epoch": 5.09339559706471, "ref_ce_loss": 0.14499787986278534, "step": 15270 }, { "epoch": 5.096731154102735, "loss": 0.7105, "step": 15280 }, { "epoch": 5.096731154102735, "grad_norm": 1.1744245290756226, "step": 15280 }, { "epoch": 5.096731154102735, "learning_rate": 0.00040691375233079907, "step": 15280 }, { "epoch": 5.096731154102735, "loss": 0.8375328183174133, "step": 15280 }, { "ce_loss": 0.21677394211292267, "epoch": 5.096731154102735, "step": 15280 }, { "distill_loss": 0.2636817991733551, "epoch": 5.096731154102735, "step": 15280 }, { "epoch": 5.096731154102735, "ref_ce_loss": 0.17519797384738922, "step": 15280 }, { "epoch": 5.096731154102735, "loss": 0.4660475254058838, "step": 15280 }, { "ce_loss": 0.14812767505645752, "epoch": 5.096731154102735, "step": 15280 }, { "distill_loss": 0.19933080673217773, "epoch": 5.096731154102735, "step": 15280 }, { "epoch": 5.096731154102735, "ref_ce_loss": 0.11839006096124649, "step": 15280 }, { "epoch": 5.1000667111407605, "loss": 0.6638, "step": 15290 }, { "epoch": 5.1000667111407605, "grad_norm": 2.339646339416504, "step": 15290 }, { "epoch": 5.1000667111407605, "learning_rate": 0.00040648168189564595, "step": 15290 }, { "epoch": 5.1000667111407605, "loss": 0.7594683170318604, "step": 15290 }, { "ce_loss": 0.22392548620700836, "epoch": 5.1000667111407605, "step": 15290 }, { "distill_loss": 0.3010225296020508, "epoch": 5.1000667111407605, "step": 15290 }, { "epoch": 5.1000667111407605, "ref_ce_loss": 0.15817376971244812, "step": 15290 }, { "epoch": 5.1000667111407605, "loss": 0.6978979706764221, "step": 15290 }, { "ce_loss": 0.24014516174793243, "epoch": 5.1000667111407605, "step": 15290 }, { "distill_loss": 0.2784227132797241, "epoch": 5.1000667111407605, "step": 15290 }, { "epoch": 5.1000667111407605, "ref_ce_loss": 0.17909127473831177, "step": 15290 }, { "epoch": 5.103402268178786, "loss": 0.7545, "step": 15300 }, { "epoch": 5.103402268178786, "grad_norm": 1.8751845359802246, "step": 15300 }, { "epoch": 5.103402268178786, "learning_rate": 0.00040604960389567274, "step": 15300 }, { "epoch": 5.103402268178786, "loss": 0.528994619846344, "step": 15300 }, { "ce_loss": 0.1152702048420906, "epoch": 5.103402268178786, "step": 15300 }, { "distill_loss": 0.2503540515899658, "epoch": 5.103402268178786, "step": 15300 }, { "epoch": 5.103402268178786, "ref_ce_loss": 0.12252363562583923, "step": 15300 }, { "epoch": 5.103402268178786, "loss": 0.8748533129692078, "step": 15300 }, { "ce_loss": 0.20872852206230164, "epoch": 5.103402268178786, "step": 15300 }, { "distill_loss": 0.43009328842163086, "epoch": 5.103402268178786, "step": 15300 }, { "epoch": 5.103402268178786, "ref_ce_loss": 0.16917204856872559, "step": 15300 }, { "epoch": 5.106737825216811, "loss": 0.679, "step": 15310 }, { "epoch": 5.106737825216811, "grad_norm": 1.5614938735961914, "step": 15310 }, { "epoch": 5.106737825216811, "learning_rate": 0.00040561751883516064, "step": 15310 }, { "epoch": 5.106737825216811, "loss": 0.7292320728302002, "step": 15310 }, { "ce_loss": 0.1953887790441513, "epoch": 5.106737825216811, "step": 15310 }, { "distill_loss": 0.35018715262413025, "epoch": 5.106737825216811, "step": 15310 }, { "epoch": 5.106737825216811, "ref_ce_loss": 0.13770712912082672, "step": 15310 }, { "epoch": 5.106737825216811, "loss": 0.7030383348464966, "step": 15310 }, { "ce_loss": 0.15858271718025208, "epoch": 5.106737825216811, "step": 15310 }, { "distill_loss": 0.2966959476470947, "epoch": 5.106737825216811, "step": 15310 }, { "epoch": 5.106737825216811, "ref_ce_loss": 0.1796363741159439, "step": 15310 }, { "epoch": 5.1100733822548365, "loss": 0.7627, "step": 15320 }, { "epoch": 5.1100733822548365, "grad_norm": 1.8307886123657227, "step": 15320 }, { "epoch": 5.1100733822548365, "learning_rate": 0.00040518542721839967, "step": 15320 }, { "epoch": 5.1100733822548365, "loss": 1.134617567062378, "step": 15320 }, { "ce_loss": 0.27231743931770325, "epoch": 5.1100733822548365, "step": 15320 }, { "distill_loss": 0.44714435935020447, "epoch": 5.1100733822548365, "step": 15320 }, { "epoch": 5.1100733822548365, "ref_ce_loss": 0.19462838768959045, "step": 15320 }, { "epoch": 5.1100733822548365, "loss": 0.6627607345581055, "step": 15320 }, { "ce_loss": 0.15691936016082764, "epoch": 5.1100733822548365, "step": 15320 }, { "distill_loss": 0.2951563000679016, "epoch": 5.1100733822548365, "step": 15320 }, { "epoch": 5.1100733822548365, "ref_ce_loss": 0.12084396928548813, "step": 15320 }, { "epoch": 5.113408939292862, "loss": 0.8722, "step": 15330 }, { "epoch": 5.113408939292862, "grad_norm": 2.4782094955444336, "step": 15330 }, { "epoch": 5.113408939292862, "learning_rate": 0.00040475332954968723, "step": 15330 }, { "epoch": 5.113408939292862, "loss": 0.8449116945266724, "step": 15330 }, { "ce_loss": 0.2228572964668274, "epoch": 5.113408939292862, "step": 15330 }, { "distill_loss": 0.3963652551174164, "epoch": 5.113408939292862, "step": 15330 }, { "epoch": 5.113408939292862, "ref_ce_loss": 0.16229957342147827, "step": 15330 }, { "epoch": 5.113408939292862, "loss": 0.5537840723991394, "step": 15330 }, { "ce_loss": 0.10099782794713974, "epoch": 5.113408939292862, "step": 15330 }, { "distill_loss": 0.278985857963562, "epoch": 5.113408939292862, "step": 15330 }, { "epoch": 5.113408939292862, "ref_ce_loss": 0.12842731177806854, "step": 15330 }, { "epoch": 5.116744496330887, "loss": 0.8228, "step": 15340 }, { "epoch": 5.116744496330887, "grad_norm": 2.2996766567230225, "step": 15340 }, { "epoch": 5.116744496330887, "learning_rate": 0.0004043212263333277, "step": 15340 }, { "epoch": 5.116744496330887, "loss": 0.8240509033203125, "step": 15340 }, { "ce_loss": 0.24200014770030975, "epoch": 5.116744496330887, "step": 15340 }, { "distill_loss": 0.36865487694740295, "epoch": 5.116744496330887, "step": 15340 }, { "epoch": 5.116744496330887, "ref_ce_loss": 0.1704481840133667, "step": 15340 }, { "epoch": 5.116744496330887, "loss": 0.5815379023551941, "step": 15340 }, { "ce_loss": 0.13422711193561554, "epoch": 5.116744496330887, "step": 15340 }, { "distill_loss": 0.3356119394302368, "epoch": 5.116744496330887, "step": 15340 }, { "epoch": 5.116744496330887, "ref_ce_loss": 0.11077725142240524, "step": 15340 }, { "epoch": 5.120080053368913, "loss": 0.7924, "step": 15350 }, { "epoch": 5.120080053368913, "grad_norm": 1.4737706184387207, "step": 15350 }, { "epoch": 5.120080053368913, "learning_rate": 0.000403889118073632, "step": 15350 }, { "epoch": 5.120080053368913, "loss": 0.7064817547798157, "step": 15350 }, { "ce_loss": 0.17357517778873444, "epoch": 5.120080053368913, "step": 15350 }, { "distill_loss": 0.26995182037353516, "epoch": 5.120080053368913, "step": 15350 }, { "epoch": 5.120080053368913, "ref_ce_loss": 0.19605916738510132, "step": 15350 }, { "epoch": 5.120080053368913, "loss": 0.4547923505306244, "step": 15350 }, { "ce_loss": 0.13223117589950562, "epoch": 5.120080053368913, "step": 15350 }, { "distill_loss": 0.19939839839935303, "epoch": 5.120080053368913, "step": 15350 }, { "epoch": 5.120080053368913, "ref_ce_loss": 0.12301374971866608, "step": 15350 }, { "epoch": 5.123415610406938, "loss": 0.6784, "step": 15360 }, { "epoch": 5.123415610406938, "grad_norm": 1.4126379489898682, "step": 15360 }, { "epoch": 5.123415610406938, "learning_rate": 0.00040345700527491703, "step": 15360 }, { "epoch": 5.123415610406938, "loss": 0.7251626253128052, "step": 15360 }, { "ce_loss": 0.20589450001716614, "epoch": 5.123415610406938, "step": 15360 }, { "distill_loss": 0.3297916650772095, "epoch": 5.123415610406938, "step": 15360 }, { "epoch": 5.123415610406938, "ref_ce_loss": 0.13303814828395844, "step": 15360 }, { "epoch": 5.123415610406938, "loss": 0.5926677584648132, "step": 15360 }, { "ce_loss": 0.14590634405612946, "epoch": 5.123415610406938, "step": 15360 }, { "distill_loss": 0.2872351408004761, "epoch": 5.123415610406938, "step": 15360 }, { "epoch": 5.123415610406938, "ref_ce_loss": 0.12017708271741867, "step": 15360 }, { "epoch": 5.126751167444963, "loss": 0.6853, "step": 15370 }, { "epoch": 5.126751167444963, "grad_norm": 1.6726282835006714, "step": 15370 }, { "epoch": 5.126751167444963, "learning_rate": 0.0004030248884415049, "step": 15370 }, { "epoch": 5.126751167444963, "loss": 0.6812902688980103, "step": 15370 }, { "ce_loss": 0.14721952378749847, "epoch": 5.126751167444963, "step": 15370 }, { "distill_loss": 0.27806055545806885, "epoch": 5.126751167444963, "step": 15370 }, { "epoch": 5.126751167444963, "ref_ce_loss": 0.11692876368761063, "step": 15370 }, { "epoch": 5.126751167444963, "loss": 0.7496767640113831, "step": 15370 }, { "ce_loss": 0.18391594290733337, "epoch": 5.126751167444963, "step": 15370 }, { "distill_loss": 0.3393586277961731, "epoch": 5.126751167444963, "step": 15370 }, { "epoch": 5.126751167444963, "ref_ce_loss": 0.14704890549182892, "step": 15370 }, { "epoch": 5.130086724482989, "loss": 0.7202, "step": 15380 }, { "epoch": 5.130086724482989, "grad_norm": 1.6118098497390747, "step": 15380 }, { "epoch": 5.130086724482989, "learning_rate": 0.00040259276807772264, "step": 15380 }, { "epoch": 5.130086724482989, "loss": 0.6544176936149597, "step": 15380 }, { "ce_loss": 0.1918047070503235, "epoch": 5.130086724482989, "step": 15380 }, { "distill_loss": 0.27302777767181396, "epoch": 5.130086724482989, "step": 15380 }, { "epoch": 5.130086724482989, "ref_ce_loss": 0.15166480839252472, "step": 15380 }, { "epoch": 5.130086724482989, "loss": 0.6611236929893494, "step": 15380 }, { "ce_loss": 0.2007049024105072, "epoch": 5.130086724482989, "step": 15380 }, { "distill_loss": 0.2729007303714752, "epoch": 5.130086724482989, "step": 15380 }, { "epoch": 5.130086724482989, "ref_ce_loss": 0.16753646731376648, "step": 15380 }, { "epoch": 5.133422281521014, "loss": 0.7482, "step": 15390 }, { "epoch": 5.133422281521014, "grad_norm": 2.3332631587982178, "step": 15390 }, { "epoch": 5.133422281521014, "learning_rate": 0.0004021606446879008, "step": 15390 }, { "epoch": 5.133422281521014, "loss": 0.7054694890975952, "step": 15390 }, { "ce_loss": 0.1783827692270279, "epoch": 5.133422281521014, "step": 15390 }, { "distill_loss": 0.3041553497314453, "epoch": 5.133422281521014, "step": 15390 }, { "epoch": 5.133422281521014, "ref_ce_loss": 0.1363864243030548, "step": 15390 }, { "epoch": 5.133422281521014, "loss": 0.6848443150520325, "step": 15390 }, { "ce_loss": 0.15475906431674957, "epoch": 5.133422281521014, "step": 15390 }, { "distill_loss": 0.3089504837989807, "epoch": 5.133422281521014, "step": 15390 }, { "epoch": 5.133422281521014, "ref_ce_loss": 0.12166281044483185, "step": 15390 }, { "epoch": 5.136757838559039, "loss": 0.7116, "step": 15400 }, { "epoch": 5.136757838559039, "grad_norm": 1.8791468143463135, "step": 15400 }, { "epoch": 5.136757838559039, "learning_rate": 0.00040172851877637425, "step": 15400 }, { "epoch": 5.136757838559039, "loss": 0.8204621076583862, "step": 15400 }, { "ce_loss": 0.18681691586971283, "epoch": 5.136757838559039, "step": 15400 }, { "distill_loss": 0.31471189856529236, "epoch": 5.136757838559039, "step": 15400 }, { "epoch": 5.136757838559039, "ref_ce_loss": 0.14055274426937103, "step": 15400 }, { "epoch": 5.136757838559039, "loss": 1.2355568408966064, "step": 15400 }, { "ce_loss": 0.22752253711223602, "epoch": 5.136757838559039, "step": 15400 }, { "distill_loss": 0.30248481035232544, "epoch": 5.136757838559039, "step": 15400 }, { "epoch": 5.136757838559039, "ref_ce_loss": 0.1581341177225113, "step": 15400 }, { "epoch": 5.140093395597065, "loss": 0.763, "step": 15410 }, { "epoch": 5.140093395597065, "grad_norm": 3.6763594150543213, "step": 15410 }, { "epoch": 5.140093395597065, "learning_rate": 0.00040129639084748034, "step": 15410 }, { "epoch": 5.140093395597065, "loss": 0.7683576345443726, "step": 15410 }, { "ce_loss": 0.19131425023078918, "epoch": 5.140093395597065, "step": 15410 }, { "distill_loss": 0.30612730979919434, "epoch": 5.140093395597065, "step": 15410 }, { "epoch": 5.140093395597065, "ref_ce_loss": 0.21812304854393005, "step": 15410 }, { "epoch": 5.140093395597065, "loss": 0.546682596206665, "step": 15410 }, { "ce_loss": 0.15684723854064941, "epoch": 5.140093395597065, "step": 15410 }, { "distill_loss": 0.22708380222320557, "epoch": 5.140093395597065, "step": 15410 }, { "epoch": 5.140093395597065, "ref_ce_loss": 0.12680214643478394, "step": 15410 }, { "epoch": 5.14342895263509, "loss": 0.6825, "step": 15420 }, { "epoch": 5.14342895263509, "grad_norm": 1.8578838109970093, "step": 15420 }, { "epoch": 5.14342895263509, "learning_rate": 0.0004008642614055586, "step": 15420 }, { "epoch": 5.14342895263509, "loss": 0.5246071815490723, "step": 15420 }, { "ce_loss": 0.13065984845161438, "epoch": 5.14342895263509, "step": 15420 }, { "distill_loss": 0.2596120834350586, "epoch": 5.14342895263509, "step": 15420 }, { "epoch": 5.14342895263509, "ref_ce_loss": 0.09721403568983078, "step": 15420 }, { "epoch": 5.14342895263509, "loss": 0.9444109797477722, "step": 15420 }, { "ce_loss": 0.20307934284210205, "epoch": 5.14342895263509, "step": 15420 }, { "distill_loss": 0.3659123480319977, "epoch": 5.14342895263509, "step": 15420 }, { "epoch": 5.14342895263509, "ref_ce_loss": 0.17617152631282806, "step": 15420 }, { "epoch": 5.146764509673115, "loss": 0.7336, "step": 15430 }, { "epoch": 5.146764509673115, "grad_norm": 1.9291861057281494, "step": 15430 }, { "epoch": 5.146764509673115, "learning_rate": 0.0004004321309549511, "step": 15430 }, { "epoch": 5.146764509673115, "loss": 0.7223341464996338, "step": 15430 }, { "ce_loss": 0.15882186591625214, "epoch": 5.146764509673115, "step": 15430 }, { "distill_loss": 0.3048314154148102, "epoch": 5.146764509673115, "step": 15430 }, { "epoch": 5.146764509673115, "ref_ce_loss": 0.12695194780826569, "step": 15430 }, { "epoch": 5.146764509673115, "loss": 0.6615356206893921, "step": 15430 }, { "ce_loss": 0.15503180027008057, "epoch": 5.146764509673115, "step": 15430 }, { "distill_loss": 0.3699668049812317, "epoch": 5.146764509673115, "step": 15430 }, { "epoch": 5.146764509673115, "ref_ce_loss": 0.1364234983921051, "step": 15430 }, { "epoch": 5.150100066711141, "loss": 0.7307, "step": 15440 }, { "epoch": 5.150100066711141, "grad_norm": 1.8029311895370483, "step": 15440 }, { "epoch": 5.150100066711141, "learning_rate": 0.0004, "step": 15440 }, { "epoch": 5.150100066711141, "loss": 1.0619946718215942, "step": 15440 }, { "ce_loss": 0.27869516611099243, "epoch": 5.150100066711141, "step": 15440 }, { "distill_loss": 0.39487215876579285, "epoch": 5.150100066711141, "step": 15440 }, { "epoch": 5.150100066711141, "ref_ce_loss": 0.20835605263710022, "step": 15440 }, { "epoch": 5.150100066711141, "loss": 0.7138598561286926, "step": 15440 }, { "ce_loss": 0.21639475226402283, "epoch": 5.150100066711141, "step": 15440 }, { "distill_loss": 0.32156020402908325, "epoch": 5.150100066711141, "step": 15440 }, { "epoch": 5.150100066711141, "ref_ce_loss": 0.1755463182926178, "step": 15440 }, { "epoch": 5.153435623749166, "loss": 0.7177, "step": 15450 }, { "epoch": 5.153435623749166, "grad_norm": 1.7701245546340942, "step": 15450 }, { "epoch": 5.153435623749166, "learning_rate": 0.000399567869045049, "step": 15450 }, { "epoch": 5.153435623749166, "loss": 0.6437214016914368, "step": 15450 }, { "ce_loss": 0.16194361448287964, "epoch": 5.153435623749166, "step": 15450 }, { "distill_loss": 0.283201664686203, "epoch": 5.153435623749166, "step": 15450 }, { "epoch": 5.153435623749166, "ref_ce_loss": 0.15246915817260742, "step": 15450 }, { "epoch": 5.153435623749166, "loss": 0.7803009748458862, "step": 15450 }, { "ce_loss": 0.14901769161224365, "epoch": 5.153435623749166, "step": 15450 }, { "distill_loss": 0.2672480344772339, "epoch": 5.153435623749166, "step": 15450 }, { "epoch": 5.153435623749166, "ref_ce_loss": 0.16708533465862274, "step": 15450 }, { "epoch": 5.156771180787191, "loss": 0.725, "step": 15460 }, { "epoch": 5.156771180787191, "grad_norm": 1.2688924074172974, "step": 15460 }, { "epoch": 5.156771180787191, "learning_rate": 0.0003991357385944414, "step": 15460 }, { "epoch": 5.156771180787191, "loss": 0.581249475479126, "step": 15460 }, { "ce_loss": 0.18468716740608215, "epoch": 5.156771180787191, "step": 15460 }, { "distill_loss": 0.25455421209335327, "epoch": 5.156771180787191, "step": 15460 }, { "epoch": 5.156771180787191, "ref_ce_loss": 0.14176595211029053, "step": 15460 }, { "epoch": 5.156771180787191, "loss": 0.5864779949188232, "step": 15460 }, { "ce_loss": 0.16304655373096466, "epoch": 5.156771180787191, "step": 15460 }, { "distill_loss": 0.2759646773338318, "epoch": 5.156771180787191, "step": 15460 }, { "epoch": 5.156771180787191, "ref_ce_loss": 0.12296140938997269, "step": 15460 }, { "epoch": 5.160106737825217, "loss": 0.7828, "step": 15470 }, { "epoch": 5.160106737825217, "grad_norm": 1.317644715309143, "step": 15470 }, { "epoch": 5.160106737825217, "learning_rate": 0.0003987036091525198, "step": 15470 }, { "epoch": 5.160106737825217, "loss": 0.6303431987762451, "step": 15470 }, { "ce_loss": 0.16451367735862732, "epoch": 5.160106737825217, "step": 15470 }, { "distill_loss": 0.2889218032360077, "epoch": 5.160106737825217, "step": 15470 }, { "epoch": 5.160106737825217, "ref_ce_loss": 0.13916608691215515, "step": 15470 }, { "epoch": 5.160106737825217, "loss": 0.7348785400390625, "step": 15470 }, { "ce_loss": 0.18091663718223572, "epoch": 5.160106737825217, "step": 15470 }, { "distill_loss": 0.3877863883972168, "epoch": 5.160106737825217, "step": 15470 }, { "epoch": 5.160106737825217, "ref_ce_loss": 0.12812453508377075, "step": 15470 }, { "epoch": 5.163442294863242, "loss": 0.7009, "step": 15480 }, { "epoch": 5.163442294863242, "grad_norm": 1.2215092182159424, "step": 15480 }, { "epoch": 5.163442294863242, "learning_rate": 0.00039827148122362584, "step": 15480 }, { "epoch": 5.163442294863242, "loss": 1.113275170326233, "step": 15480 }, { "ce_loss": 0.21751204133033752, "epoch": 5.163442294863242, "step": 15480 }, { "distill_loss": 0.2767632007598877, "epoch": 5.163442294863242, "step": 15480 }, { "epoch": 5.163442294863242, "ref_ce_loss": 0.15298031270503998, "step": 15480 }, { "epoch": 5.163442294863242, "loss": 0.6497124433517456, "step": 15480 }, { "ce_loss": 0.19662845134735107, "epoch": 5.163442294863242, "step": 15480 }, { "distill_loss": 0.25942057371139526, "epoch": 5.163442294863242, "step": 15480 }, { "epoch": 5.163442294863242, "ref_ce_loss": 0.1508607417345047, "step": 15480 }, { "epoch": 5.1667778519012675, "loss": 0.6631, "step": 15490 }, { "epoch": 5.1667778519012675, "grad_norm": 1.4898862838745117, "step": 15490 }, { "epoch": 5.1667778519012675, "learning_rate": 0.0003978393553120993, "step": 15490 }, { "epoch": 5.1667778519012675, "loss": 0.6173375248908997, "step": 15490 }, { "ce_loss": 0.11945261061191559, "epoch": 5.1667778519012675, "step": 15490 }, { "distill_loss": 0.26320600509643555, "epoch": 5.1667778519012675, "step": 15490 }, { "epoch": 5.1667778519012675, "ref_ce_loss": 0.13641858100891113, "step": 15490 }, { "epoch": 5.1667778519012675, "loss": 0.6017664074897766, "step": 15490 }, { "ce_loss": 0.17880946397781372, "epoch": 5.1667778519012675, "step": 15490 }, { "distill_loss": 0.25818952918052673, "epoch": 5.1667778519012675, "step": 15490 }, { "epoch": 5.1667778519012675, "ref_ce_loss": 0.12212560325860977, "step": 15490 }, { "epoch": 5.170113408939293, "loss": 0.7009, "step": 15500 }, { "epoch": 5.170113408939293, "grad_norm": 1.657698631286621, "step": 15500 }, { "epoch": 5.170113408939293, "learning_rate": 0.0003974072319222774, "step": 15500 }, { "epoch": 5.170113408939293, "loss": 0.7776429653167725, "step": 15500 }, { "ce_loss": 0.21835565567016602, "epoch": 5.170113408939293, "step": 15500 }, { "distill_loss": 0.3262067437171936, "epoch": 5.170113408939293, "step": 15500 }, { "epoch": 5.170113408939293, "ref_ce_loss": 0.1799788922071457, "step": 15500 }, { "epoch": 5.170113408939293, "loss": 0.6015152335166931, "step": 15500 }, { "ce_loss": 0.1296067237854004, "epoch": 5.170113408939293, "step": 15500 }, { "distill_loss": 0.3558272421360016, "epoch": 5.170113408939293, "step": 15500 }, { "epoch": 5.170113408939293, "ref_ce_loss": 0.11590032279491425, "step": 15500 }, { "epoch": 5.173448965977318, "loss": 0.6985, "step": 15510 }, { "epoch": 5.173448965977318, "grad_norm": 1.3140312433242798, "step": 15510 }, { "epoch": 5.173448965977318, "learning_rate": 0.00039697511155849507, "step": 15510 }, { "epoch": 5.173448965977318, "loss": 0.6865147948265076, "step": 15510 }, { "ce_loss": 0.2025403380393982, "epoch": 5.173448965977318, "step": 15510 }, { "distill_loss": 0.2703585624694824, "epoch": 5.173448965977318, "step": 15510 }, { "epoch": 5.173448965977318, "ref_ce_loss": 0.1497085988521576, "step": 15510 }, { "epoch": 5.173448965977318, "loss": 0.5102109909057617, "step": 15510 }, { "ce_loss": 0.13412393629550934, "epoch": 5.173448965977318, "step": 15510 }, { "distill_loss": 0.21918663382530212, "epoch": 5.173448965977318, "step": 15510 }, { "epoch": 5.173448965977318, "ref_ce_loss": 0.11139141768217087, "step": 15510 }, { "epoch": 5.1767845230153435, "loss": 0.6942, "step": 15520 }, { "epoch": 5.1767845230153435, "grad_norm": 2.086806297302246, "step": 15520 }, { "epoch": 5.1767845230153435, "learning_rate": 0.00039654299472508296, "step": 15520 }, { "epoch": 5.1767845230153435, "loss": 0.7411342859268188, "step": 15520 }, { "ce_loss": 0.18614676594734192, "epoch": 5.1767845230153435, "step": 15520 }, { "distill_loss": 0.3071932792663574, "epoch": 5.1767845230153435, "step": 15520 }, { "epoch": 5.1767845230153435, "ref_ce_loss": 0.13869841396808624, "step": 15520 }, { "epoch": 5.1767845230153435, "loss": 0.6899371147155762, "step": 15520 }, { "ce_loss": 0.2027830183506012, "epoch": 5.1767845230153435, "step": 15520 }, { "distill_loss": 0.257847398519516, "epoch": 5.1767845230153435, "step": 15520 }, { "epoch": 5.1767845230153435, "ref_ce_loss": 0.18891122937202454, "step": 15520 }, { "epoch": 5.180120080053369, "loss": 0.7278, "step": 15530 }, { "epoch": 5.180120080053369, "grad_norm": 1.283077359199524, "step": 15530 }, { "epoch": 5.180120080053369, "learning_rate": 0.0003961108819263681, "step": 15530 }, { "epoch": 5.180120080053369, "loss": 0.6969683170318604, "step": 15530 }, { "ce_loss": 0.13482505083084106, "epoch": 5.180120080053369, "step": 15530 }, { "distill_loss": 0.21009461581707, "epoch": 5.180120080053369, "step": 15530 }, { "epoch": 5.180120080053369, "ref_ce_loss": 0.13851168751716614, "step": 15530 }, { "epoch": 5.180120080053369, "loss": 0.47756150364875793, "step": 15530 }, { "ce_loss": 0.11805874854326248, "epoch": 5.180120080053369, "step": 15530 }, { "distill_loss": 0.20090797543525696, "epoch": 5.180120080053369, "step": 15530 }, { "epoch": 5.180120080053369, "ref_ce_loss": 0.12400776147842407, "step": 15530 }, { "epoch": 5.183455637091394, "loss": 0.6822, "step": 15540 }, { "epoch": 5.183455637091394, "grad_norm": 1.3494940996170044, "step": 15540 }, { "epoch": 5.183455637091394, "learning_rate": 0.00039567877366667234, "step": 15540 }, { "epoch": 5.183455637091394, "loss": 0.6591233015060425, "step": 15540 }, { "ce_loss": 0.25253763794898987, "epoch": 5.183455637091394, "step": 15540 }, { "distill_loss": 0.26830992102622986, "epoch": 5.183455637091394, "step": 15540 }, { "epoch": 5.183455637091394, "ref_ce_loss": 0.1380598396062851, "step": 15540 }, { "epoch": 5.183455637091394, "loss": 0.6860598921775818, "step": 15540 }, { "ce_loss": 0.17180071771144867, "epoch": 5.183455637091394, "step": 15540 }, { "distill_loss": 0.2451307773590088, "epoch": 5.183455637091394, "step": 15540 }, { "epoch": 5.183455637091394, "ref_ce_loss": 0.1696072220802307, "step": 15540 }, { "epoch": 5.18679119412942, "loss": 0.6521, "step": 15550 }, { "epoch": 5.18679119412942, "grad_norm": 2.229048252105713, "step": 15550 }, { "epoch": 5.18679119412942, "learning_rate": 0.00039524667045031287, "step": 15550 }, { "epoch": 5.18679119412942, "loss": 0.9138002395629883, "step": 15550 }, { "ce_loss": 0.18019822239875793, "epoch": 5.18679119412942, "step": 15550 }, { "distill_loss": 0.24058149755001068, "epoch": 5.18679119412942, "step": 15550 }, { "epoch": 5.18679119412942, "ref_ce_loss": 0.15312932431697845, "step": 15550 }, { "epoch": 5.18679119412942, "loss": 0.7437194585800171, "step": 15550 }, { "ce_loss": 0.17918673157691956, "epoch": 5.18679119412942, "step": 15550 }, { "distill_loss": 0.2970272898674011, "epoch": 5.18679119412942, "step": 15550 }, { "epoch": 5.18679119412942, "ref_ce_loss": 0.12216797471046448, "step": 15550 }, { "epoch": 5.190126751167445, "loss": 0.7007, "step": 15560 }, { "epoch": 5.190126751167445, "grad_norm": 1.5291552543640137, "step": 15560 }, { "epoch": 5.190126751167445, "learning_rate": 0.00039481457278160037, "step": 15560 }, { "epoch": 5.190126751167445, "loss": 1.0346124172210693, "step": 15560 }, { "ce_loss": 0.1917654573917389, "epoch": 5.190126751167445, "step": 15560 }, { "distill_loss": 0.24482488632202148, "epoch": 5.190126751167445, "step": 15560 }, { "epoch": 5.190126751167445, "ref_ce_loss": 0.13623914122581482, "step": 15560 }, { "epoch": 5.190126751167445, "loss": 0.7478914260864258, "step": 15560 }, { "ce_loss": 0.19277100265026093, "epoch": 5.190126751167445, "step": 15560 }, { "distill_loss": 0.2582072913646698, "epoch": 5.190126751167445, "step": 15560 }, { "epoch": 5.190126751167445, "ref_ce_loss": 0.1460961103439331, "step": 15560 }, { "epoch": 5.19346230820547, "loss": 0.7929, "step": 15570 }, { "epoch": 5.19346230820547, "grad_norm": 2.785123109817505, "step": 15570 }, { "epoch": 5.19346230820547, "learning_rate": 0.00039438248116483945, "step": 15570 }, { "epoch": 5.19346230820547, "loss": 0.7991613149642944, "step": 15570 }, { "ce_loss": 0.1651901751756668, "epoch": 5.19346230820547, "step": 15570 }, { "distill_loss": 0.3231179714202881, "epoch": 5.19346230820547, "step": 15570 }, { "epoch": 5.19346230820547, "ref_ce_loss": 0.14283576607704163, "step": 15570 }, { "epoch": 5.19346230820547, "loss": 0.8519611358642578, "step": 15570 }, { "ce_loss": 0.19507086277008057, "epoch": 5.19346230820547, "step": 15570 }, { "distill_loss": 0.34214770793914795, "epoch": 5.19346230820547, "step": 15570 }, { "epoch": 5.19346230820547, "ref_ce_loss": 0.1515471190214157, "step": 15570 }, { "epoch": 5.196797865243496, "loss": 0.7231, "step": 15580 }, { "epoch": 5.196797865243496, "grad_norm": 1.721127986907959, "step": 15580 }, { "epoch": 5.196797865243496, "learning_rate": 0.00039395039610432746, "step": 15580 }, { "epoch": 5.196797865243496, "loss": 0.7653343081474304, "step": 15580 }, { "ce_loss": 0.18521682918071747, "epoch": 5.196797865243496, "step": 15580 }, { "distill_loss": 0.33098432421684265, "epoch": 5.196797865243496, "step": 15580 }, { "epoch": 5.196797865243496, "ref_ce_loss": 0.13533954322338104, "step": 15580 }, { "epoch": 5.196797865243496, "loss": 0.6579205393791199, "step": 15580 }, { "ce_loss": 0.1290563941001892, "epoch": 5.196797865243496, "step": 15580 }, { "distill_loss": 0.2866734266281128, "epoch": 5.196797865243496, "step": 15580 }, { "epoch": 5.196797865243496, "ref_ce_loss": 0.11047664284706116, "step": 15580 }, { "epoch": 5.200133422281521, "loss": 0.7133, "step": 15590 }, { "epoch": 5.200133422281521, "grad_norm": 1.6893715858459473, "step": 15590 }, { "epoch": 5.200133422281521, "learning_rate": 0.00039351831810435425, "step": 15590 }, { "epoch": 5.200133422281521, "loss": 0.8286232948303223, "step": 15590 }, { "ce_loss": 0.14069385826587677, "epoch": 5.200133422281521, "step": 15590 }, { "distill_loss": 0.27295631170272827, "epoch": 5.200133422281521, "step": 15590 }, { "epoch": 5.200133422281521, "ref_ce_loss": 0.16400879621505737, "step": 15590 }, { "epoch": 5.200133422281521, "loss": 0.5254113674163818, "step": 15590 }, { "ce_loss": 0.13113446533679962, "epoch": 5.200133422281521, "step": 15590 }, { "distill_loss": 0.2617083787918091, "epoch": 5.200133422281521, "step": 15590 }, { "epoch": 5.200133422281521, "ref_ce_loss": 0.13194455206394196, "step": 15590 }, { "epoch": 5.203468979319546, "loss": 0.887, "step": 15600 }, { "epoch": 5.203468979319546, "grad_norm": 1.6650769710540771, "step": 15600 }, { "epoch": 5.203468979319546, "learning_rate": 0.00039308624766920113, "step": 15600 }, { "epoch": 5.203468979319546, "loss": 1.0041122436523438, "step": 15600 }, { "ce_loss": 0.18588700890541077, "epoch": 5.203468979319546, "step": 15600 }, { "distill_loss": 0.34661564230918884, "epoch": 5.203468979319546, "step": 15600 }, { "epoch": 5.203468979319546, "ref_ce_loss": 0.2050558626651764, "step": 15600 }, { "epoch": 5.203468979319546, "loss": 1.0341342687606812, "step": 15600 }, { "ce_loss": 0.16048888862133026, "epoch": 5.203468979319546, "step": 15600 }, { "distill_loss": 0.25734442472457886, "epoch": 5.203468979319546, "step": 15600 }, { "epoch": 5.203468979319546, "ref_ce_loss": 0.13042587041854858, "step": 15600 }, { "epoch": 5.206804536357572, "loss": 0.783, "step": 15610 }, { "epoch": 5.206804536357572, "grad_norm": 2.852943181991577, "step": 15610 }, { "epoch": 5.206804536357572, "learning_rate": 0.00039265418530314087, "step": 15610 }, { "epoch": 5.206804536357572, "loss": 1.0115138292312622, "step": 15610 }, { "ce_loss": 0.17285360395908356, "epoch": 5.206804536357572, "step": 15610 }, { "distill_loss": 0.4148566424846649, "epoch": 5.206804536357572, "step": 15610 }, { "epoch": 5.206804536357572, "ref_ce_loss": 0.11883706599473953, "step": 15610 }, { "epoch": 5.206804536357572, "loss": 0.6742502450942993, "step": 15610 }, { "ce_loss": 0.13640636205673218, "epoch": 5.206804536357572, "step": 15610 }, { "distill_loss": 0.35797151923179626, "epoch": 5.206804536357572, "step": 15610 }, { "epoch": 5.206804536357572, "ref_ce_loss": 0.1325061023235321, "step": 15610 }, { "epoch": 5.210140093395597, "loss": 0.8527, "step": 15620 }, { "epoch": 5.210140093395597, "grad_norm": 1.5104297399520874, "step": 15620 }, { "epoch": 5.210140093395597, "learning_rate": 0.0003922221315104369, "step": 15620 }, { "epoch": 5.210140093395597, "loss": 0.6583712100982666, "step": 15620 }, { "ce_loss": 0.1715448945760727, "epoch": 5.210140093395597, "step": 15620 }, { "distill_loss": 0.34650322794914246, "epoch": 5.210140093395597, "step": 15620 }, { "epoch": 5.210140093395597, "ref_ce_loss": 0.140151247382164, "step": 15620 }, { "epoch": 5.210140093395597, "loss": 0.7607604265213013, "step": 15620 }, { "ce_loss": 0.18406368792057037, "epoch": 5.210140093395597, "step": 15620 }, { "distill_loss": 0.3910118043422699, "epoch": 5.210140093395597, "step": 15620 }, { "epoch": 5.210140093395597, "ref_ce_loss": 0.1506662219762802, "step": 15620 }, { "epoch": 5.213475650433622, "loss": 0.8219, "step": 15630 }, { "epoch": 5.213475650433622, "grad_norm": 1.69031822681427, "step": 15630 }, { "epoch": 5.213475650433622, "learning_rate": 0.0003917900867953425, "step": 15630 }, { "epoch": 5.213475650433622, "loss": 0.8839518427848816, "step": 15630 }, { "ce_loss": 0.16526243090629578, "epoch": 5.213475650433622, "step": 15630 }, { "distill_loss": 0.43955349922180176, "epoch": 5.213475650433622, "step": 15630 }, { "epoch": 5.213475650433622, "ref_ce_loss": 0.1280617117881775, "step": 15630 }, { "epoch": 5.213475650433622, "loss": 0.6657829284667969, "step": 15630 }, { "ce_loss": 0.17464981973171234, "epoch": 5.213475650433622, "step": 15630 }, { "distill_loss": 0.314512699842453, "epoch": 5.213475650433622, "step": 15630 }, { "epoch": 5.213475650433622, "ref_ce_loss": 0.14456845819950104, "step": 15630 }, { "epoch": 5.216811207471648, "loss": 0.833, "step": 15640 }, { "epoch": 5.216811207471648, "grad_norm": 3.2884371280670166, "step": 15640 }, { "epoch": 5.216811207471648, "learning_rate": 0.00039135805166210007, "step": 15640 }, { "epoch": 5.216811207471648, "loss": 0.9567729234695435, "step": 15640 }, { "ce_loss": 0.25114181637763977, "epoch": 5.216811207471648, "step": 15640 }, { "distill_loss": 0.3986857831478119, "epoch": 5.216811207471648, "step": 15640 }, { "epoch": 5.216811207471648, "ref_ce_loss": 0.17955972254276276, "step": 15640 }, { "epoch": 5.216811207471648, "loss": 0.7126868367195129, "step": 15640 }, { "ce_loss": 0.15249194204807281, "epoch": 5.216811207471648, "step": 15640 }, { "distill_loss": 0.31977659463882446, "epoch": 5.216811207471648, "step": 15640 }, { "epoch": 5.216811207471648, "ref_ce_loss": 0.13722363114356995, "step": 15640 }, { "epoch": 5.220146764509673, "loss": 0.8554, "step": 15650 }, { "epoch": 5.220146764509673, "grad_norm": 5.034350872039795, "step": 15650 }, { "epoch": 5.220146764509673, "learning_rate": 0.00039092602661494147, "step": 15650 }, { "epoch": 5.220146764509673, "loss": 0.6116445064544678, "step": 15650 }, { "ce_loss": 0.11406902223825455, "epoch": 5.220146764509673, "step": 15650 }, { "distill_loss": 0.24040089547634125, "epoch": 5.220146764509673, "step": 15650 }, { "epoch": 5.220146764509673, "ref_ce_loss": 0.12359070777893066, "step": 15650 }, { "epoch": 5.220146764509673, "loss": 0.6653336882591248, "step": 15650 }, { "ce_loss": 0.15816573798656464, "epoch": 5.220146764509673, "step": 15650 }, { "distill_loss": 0.3218991756439209, "epoch": 5.220146764509673, "step": 15650 }, { "epoch": 5.220146764509673, "ref_ce_loss": 0.14883488416671753, "step": 15650 }, { "epoch": 5.223482321547698, "loss": 0.7961, "step": 15660 }, { "epoch": 5.223482321547698, "grad_norm": 1.702118992805481, "step": 15660 }, { "epoch": 5.223482321547698, "learning_rate": 0.000390494012158086, "step": 15660 }, { "epoch": 5.223482321547698, "loss": 0.6992454528808594, "step": 15660 }, { "ce_loss": 0.20349298417568207, "epoch": 5.223482321547698, "step": 15660 }, { "distill_loss": 0.33723607659339905, "epoch": 5.223482321547698, "step": 15660 }, { "epoch": 5.223482321547698, "ref_ce_loss": 0.15842409431934357, "step": 15660 }, { "epoch": 5.223482321547698, "loss": 0.9260455369949341, "step": 15660 }, { "ce_loss": 0.21057721972465515, "epoch": 5.223482321547698, "step": 15660 }, { "distill_loss": 0.3818778693675995, "epoch": 5.223482321547698, "step": 15660 }, { "epoch": 5.223482321547698, "ref_ce_loss": 0.15959475934505463, "step": 15660 }, { "epoch": 5.226817878585724, "loss": 0.7601, "step": 15670 }, { "epoch": 5.226817878585724, "grad_norm": 2.036332845687866, "step": 15670 }, { "epoch": 5.226817878585724, "learning_rate": 0.0003900620087957414, "step": 15670 }, { "epoch": 5.226817878585724, "loss": 0.7959449887275696, "step": 15670 }, { "ce_loss": 0.16416768729686737, "epoch": 5.226817878585724, "step": 15670 }, { "distill_loss": 0.3899722099304199, "epoch": 5.226817878585724, "step": 15670 }, { "epoch": 5.226817878585724, "ref_ce_loss": 0.17624466121196747, "step": 15670 }, { "epoch": 5.226817878585724, "loss": 1.0749465227127075, "step": 15670 }, { "ce_loss": 0.2088886797428131, "epoch": 5.226817878585724, "step": 15670 }, { "distill_loss": 0.36911848187446594, "epoch": 5.226817878585724, "step": 15670 }, { "epoch": 5.226817878585724, "ref_ce_loss": 0.16031013429164886, "step": 15670 }, { "epoch": 5.230153435623749, "loss": 0.8381, "step": 15680 }, { "epoch": 5.230153435623749, "grad_norm": 1.862441897392273, "step": 15680 }, { "epoch": 5.230153435623749, "learning_rate": 0.0003896300170321018, "step": 15680 }, { "epoch": 5.230153435623749, "loss": 1.2608473300933838, "step": 15680 }, { "ce_loss": 0.18141326308250427, "epoch": 5.230153435623749, "step": 15680 }, { "distill_loss": 0.3208111524581909, "epoch": 5.230153435623749, "step": 15680 }, { "epoch": 5.230153435623749, "ref_ce_loss": 0.16460202634334564, "step": 15680 }, { "epoch": 5.230153435623749, "loss": 0.6557343006134033, "step": 15680 }, { "ce_loss": 0.1695721298456192, "epoch": 5.230153435623749, "step": 15680 }, { "distill_loss": 0.2742932438850403, "epoch": 5.230153435623749, "step": 15680 }, { "epoch": 5.230153435623749, "ref_ce_loss": 0.14762458205223083, "step": 15680 }, { "epoch": 5.2334889926617745, "loss": 0.7043, "step": 15690 }, { "epoch": 5.2334889926617745, "grad_norm": 2.2062363624572754, "step": 15690 }, { "epoch": 5.2334889926617745, "learning_rate": 0.00038919803737134825, "step": 15690 }, { "epoch": 5.2334889926617745, "loss": 0.7139572501182556, "step": 15690 }, { "ce_loss": 0.17304302752017975, "epoch": 5.2334889926617745, "step": 15690 }, { "distill_loss": 0.35036876797676086, "epoch": 5.2334889926617745, "step": 15690 }, { "epoch": 5.2334889926617745, "ref_ce_loss": 0.1352560669183731, "step": 15690 }, { "epoch": 5.2334889926617745, "loss": 0.6848664879798889, "step": 15690 }, { "ce_loss": 0.15748827159404755, "epoch": 5.2334889926617745, "step": 15690 }, { "distill_loss": 0.2823999226093292, "epoch": 5.2334889926617745, "step": 15690 }, { "epoch": 5.2334889926617745, "ref_ce_loss": 0.13981734216213226, "step": 15690 }, { "epoch": 5.2368245496998, "loss": 0.8185, "step": 15700 }, { "epoch": 5.2368245496998, "grad_norm": 2.144094705581665, "step": 15700 }, { "epoch": 5.2368245496998, "learning_rate": 0.00038876607031764735, "step": 15700 }, { "epoch": 5.2368245496998, "loss": 0.7011567950248718, "step": 15700 }, { "ce_loss": 0.17664001882076263, "epoch": 5.2368245496998, "step": 15700 }, { "distill_loss": 0.2781679630279541, "epoch": 5.2368245496998, "step": 15700 }, { "epoch": 5.2368245496998, "ref_ce_loss": 0.15840114653110504, "step": 15700 }, { "epoch": 5.2368245496998, "loss": 0.565709114074707, "step": 15700 }, { "ce_loss": 0.0962267592549324, "epoch": 5.2368245496998, "step": 15700 }, { "distill_loss": 0.27149227261543274, "epoch": 5.2368245496998, "step": 15700 }, { "epoch": 5.2368245496998, "ref_ce_loss": 0.12075101584196091, "step": 15700 }, { "epoch": 5.240160106737825, "loss": 0.7272, "step": 15710 }, { "epoch": 5.240160106737825, "grad_norm": 1.2410707473754883, "step": 15710 }, { "epoch": 5.240160106737825, "learning_rate": 0.00038833411637515127, "step": 15710 }, { "epoch": 5.240160106737825, "loss": 0.5946895480155945, "step": 15710 }, { "ce_loss": 0.13202613592147827, "epoch": 5.240160106737825, "step": 15710 }, { "distill_loss": 0.2717001736164093, "epoch": 5.240160106737825, "step": 15710 }, { "epoch": 5.240160106737825, "ref_ce_loss": 0.12810038030147552, "step": 15710 }, { "epoch": 5.240160106737825, "loss": 0.7258737683296204, "step": 15710 }, { "ce_loss": 0.2018779069185257, "epoch": 5.240160106737825, "step": 15710 }, { "distill_loss": 0.36134767532348633, "epoch": 5.240160106737825, "step": 15710 }, { "epoch": 5.240160106737825, "ref_ce_loss": 0.16241465508937836, "step": 15710 }, { "epoch": 5.2434956637758505, "loss": 0.7523, "step": 15720 }, { "epoch": 5.2434956637758505, "grad_norm": 2.900282382965088, "step": 15720 }, { "epoch": 5.2434956637758505, "learning_rate": 0.0003879021760479965, "step": 15720 }, { "epoch": 5.2434956637758505, "loss": 0.7594578266143799, "step": 15720 }, { "ce_loss": 0.20554454624652863, "epoch": 5.2434956637758505, "step": 15720 }, { "distill_loss": 0.2997867465019226, "epoch": 5.2434956637758505, "step": 15720 }, { "epoch": 5.2434956637758505, "ref_ce_loss": 0.1371021717786789, "step": 15720 }, { "epoch": 5.2434956637758505, "loss": 0.4947853088378906, "step": 15720 }, { "ce_loss": 0.11712466925382614, "epoch": 5.2434956637758505, "step": 15720 }, { "distill_loss": 0.19641387462615967, "epoch": 5.2434956637758505, "step": 15720 }, { "epoch": 5.2434956637758505, "ref_ce_loss": 0.15747201442718506, "step": 15720 }, { "epoch": 5.246831220813876, "loss": 0.7997, "step": 15730 }, { "epoch": 5.246831220813876, "grad_norm": 4.173399448394775, "step": 15730 }, { "epoch": 5.246831220813876, "learning_rate": 0.0003874702498403042, "step": 15730 }, { "epoch": 5.246831220813876, "loss": 0.7463613748550415, "step": 15730 }, { "ce_loss": 0.18147322535514832, "epoch": 5.246831220813876, "step": 15730 }, { "distill_loss": 0.3750154674053192, "epoch": 5.246831220813876, "step": 15730 }, { "epoch": 5.246831220813876, "ref_ce_loss": 0.14311262965202332, "step": 15730 }, { "epoch": 5.246831220813876, "loss": 0.7426289916038513, "step": 15730 }, { "ce_loss": 0.15202565491199493, "epoch": 5.246831220813876, "step": 15730 }, { "distill_loss": 0.2733365297317505, "epoch": 5.246831220813876, "step": 15730 }, { "epoch": 5.246831220813876, "ref_ce_loss": 0.11649046093225479, "step": 15730 }, { "epoch": 5.250166777851901, "loss": 0.7452, "step": 15740 }, { "epoch": 5.250166777851901, "grad_norm": 2.6787667274475098, "step": 15740 }, { "epoch": 5.250166777851901, "learning_rate": 0.0003870383382561787, "step": 15740 }, { "epoch": 5.250166777851901, "loss": 0.6864969730377197, "step": 15740 }, { "ce_loss": 0.15401573479175568, "epoch": 5.250166777851901, "step": 15740 }, { "distill_loss": 0.32028788328170776, "epoch": 5.250166777851901, "step": 15740 }, { "epoch": 5.250166777851901, "ref_ce_loss": 0.16295704245567322, "step": 15740 }, { "epoch": 5.250166777851901, "loss": 0.5780356526374817, "step": 15740 }, { "ce_loss": 0.13841503858566284, "epoch": 5.250166777851901, "step": 15740 }, { "distill_loss": 0.28010672330856323, "epoch": 5.250166777851901, "step": 15740 }, { "epoch": 5.250166777851901, "ref_ce_loss": 0.12720590829849243, "step": 15740 }, { "epoch": 5.253502334889927, "loss": 0.7336, "step": 15750 }, { "epoch": 5.253502334889927, "grad_norm": 1.7823841571807861, "step": 15750 }, { "epoch": 5.253502334889927, "learning_rate": 0.00038660644179970707, "step": 15750 }, { "epoch": 5.253502334889927, "loss": 0.7867616415023804, "step": 15750 }, { "ce_loss": 0.1609250158071518, "epoch": 5.253502334889927, "step": 15750 }, { "distill_loss": 0.2993946969509125, "epoch": 5.253502334889927, "step": 15750 }, { "epoch": 5.253502334889927, "ref_ce_loss": 0.11262772977352142, "step": 15750 }, { "epoch": 5.253502334889927, "loss": 0.8699862957000732, "step": 15750 }, { "ce_loss": 0.10256128758192062, "epoch": 5.253502334889927, "step": 15750 }, { "distill_loss": 0.2720155417919159, "epoch": 5.253502334889927, "step": 15750 }, { "epoch": 5.253502334889927, "ref_ce_loss": 0.15460257232189178, "step": 15750 }, { "epoch": 5.256837891927952, "loss": 0.7103, "step": 15760 }, { "epoch": 5.256837891927952, "grad_norm": 1.9407238960266113, "step": 15760 }, { "epoch": 5.256837891927952, "learning_rate": 0.0003861745609749591, "step": 15760 }, { "epoch": 5.256837891927952, "loss": 0.5640208125114441, "step": 15760 }, { "ce_loss": 0.1139085665345192, "epoch": 5.256837891927952, "step": 15760 }, { "distill_loss": 0.26098352670669556, "epoch": 5.256837891927952, "step": 15760 }, { "epoch": 5.256837891927952, "ref_ce_loss": 0.13402090966701508, "step": 15760 }, { "epoch": 5.256837891927952, "loss": 0.6253343224525452, "step": 15760 }, { "ce_loss": 0.16839808225631714, "epoch": 5.256837891927952, "step": 15760 }, { "distill_loss": 0.2530108690261841, "epoch": 5.256837891927952, "step": 15760 }, { "epoch": 5.256837891927952, "ref_ce_loss": 0.11771053820848465, "step": 15760 }, { "epoch": 5.260173448965977, "loss": 0.7787, "step": 15770 }, { "epoch": 5.260173448965977, "grad_norm": 2.1769955158233643, "step": 15770 }, { "epoch": 5.260173448965977, "learning_rate": 0.0003857426962859861, "step": 15770 }, { "epoch": 5.260173448965977, "loss": 0.6088996529579163, "step": 15770 }, { "ce_loss": 0.12888173758983612, "epoch": 5.260173448965977, "step": 15770 }, { "distill_loss": 0.27795273065567017, "epoch": 5.260173448965977, "step": 15770 }, { "epoch": 5.260173448965977, "ref_ce_loss": 0.13567443192005157, "step": 15770 }, { "epoch": 5.260173448965977, "loss": 0.6457476615905762, "step": 15770 }, { "ce_loss": 0.1572996973991394, "epoch": 5.260173448965977, "step": 15770 }, { "distill_loss": 0.23233917355537415, "epoch": 5.260173448965977, "step": 15770 }, { "epoch": 5.260173448965977, "ref_ce_loss": 0.13116711378097534, "step": 15770 }, { "epoch": 5.263509006004003, "loss": 0.6841, "step": 15780 }, { "epoch": 5.263509006004003, "grad_norm": 2.4963042736053467, "step": 15780 }, { "epoch": 5.263509006004003, "learning_rate": 0.00038531084823682077, "step": 15780 }, { "epoch": 5.263509006004003, "loss": 0.7489142417907715, "step": 15780 }, { "ce_loss": 0.21276547014713287, "epoch": 5.263509006004003, "step": 15780 }, { "distill_loss": 0.32808375358581543, "epoch": 5.263509006004003, "step": 15780 }, { "epoch": 5.263509006004003, "ref_ce_loss": 0.14841942489147186, "step": 15780 }, { "epoch": 5.263509006004003, "loss": 0.6570113301277161, "step": 15780 }, { "ce_loss": 0.16270892322063446, "epoch": 5.263509006004003, "step": 15780 }, { "distill_loss": 0.3157443106174469, "epoch": 5.263509006004003, "step": 15780 }, { "epoch": 5.263509006004003, "ref_ce_loss": 0.12651695311069489, "step": 15780 }, { "epoch": 5.266844563042028, "loss": 0.763, "step": 15790 }, { "epoch": 5.266844563042028, "grad_norm": 1.597322702407837, "step": 15790 }, { "epoch": 5.266844563042028, "learning_rate": 0.0003848790173314761, "step": 15790 }, { "epoch": 5.266844563042028, "loss": 0.6330976486206055, "step": 15790 }, { "ce_loss": 0.13653190433979034, "epoch": 5.266844563042028, "step": 15790 }, { "distill_loss": 0.3290137052536011, "epoch": 5.266844563042028, "step": 15790 }, { "epoch": 5.266844563042028, "ref_ce_loss": 0.12903012335300446, "step": 15790 }, { "epoch": 5.266844563042028, "loss": 0.5458581447601318, "step": 15790 }, { "ce_loss": 0.12407498061656952, "epoch": 5.266844563042028, "step": 15790 }, { "distill_loss": 0.24929481744766235, "epoch": 5.266844563042028, "step": 15790 }, { "epoch": 5.266844563042028, "ref_ce_loss": 0.1366756558418274, "step": 15790 }, { "epoch": 5.270180120080053, "loss": 0.7372, "step": 15800 }, { "epoch": 5.270180120080053, "grad_norm": 1.6201483011245728, "step": 15800 }, { "epoch": 5.270180120080053, "learning_rate": 0.0003844472040739454, "step": 15800 }, { "epoch": 5.270180120080053, "loss": 0.7160874009132385, "step": 15800 }, { "ce_loss": 0.1689663529396057, "epoch": 5.270180120080053, "step": 15800 }, { "distill_loss": 0.35960906744003296, "epoch": 5.270180120080053, "step": 15800 }, { "epoch": 5.270180120080053, "ref_ce_loss": 0.1494126170873642, "step": 15800 }, { "epoch": 5.270180120080053, "loss": 0.7563287019729614, "step": 15800 }, { "ce_loss": 0.16878221929073334, "epoch": 5.270180120080053, "step": 15800 }, { "distill_loss": 0.3290070593357086, "epoch": 5.270180120080053, "step": 15800 }, { "epoch": 5.270180120080053, "ref_ce_loss": 0.1567942053079605, "step": 15800 }, { "epoch": 5.273515677118079, "loss": 0.7504, "step": 15810 }, { "epoch": 5.273515677118079, "grad_norm": 1.6044793128967285, "step": 15810 }, { "epoch": 5.273515677118079, "learning_rate": 0.00038401540896820097, "step": 15810 }, { "epoch": 5.273515677118079, "loss": 0.7505194544792175, "step": 15810 }, { "ce_loss": 0.21263115108013153, "epoch": 5.273515677118079, "step": 15810 }, { "distill_loss": 0.3389429450035095, "epoch": 5.273515677118079, "step": 15810 }, { "epoch": 5.273515677118079, "ref_ce_loss": 0.14325197041034698, "step": 15810 }, { "epoch": 5.273515677118079, "loss": 0.7398129105567932, "step": 15810 }, { "ce_loss": 0.1300848126411438, "epoch": 5.273515677118079, "step": 15810 }, { "distill_loss": 0.27438250184059143, "epoch": 5.273515677118079, "step": 15810 }, { "epoch": 5.273515677118079, "ref_ce_loss": 0.1393304020166397, "step": 15810 }, { "epoch": 5.276851234156104, "loss": 0.7488, "step": 15820 }, { "epoch": 5.276851234156104, "grad_norm": 2.243036985397339, "step": 15820 }, { "epoch": 5.276851234156104, "learning_rate": 0.0003835836325181943, "step": 15820 }, { "epoch": 5.276851234156104, "loss": 0.6281885504722595, "step": 15820 }, { "ce_loss": 0.18166813254356384, "epoch": 5.276851234156104, "step": 15820 }, { "distill_loss": 0.24070219695568085, "epoch": 5.276851234156104, "step": 15820 }, { "epoch": 5.276851234156104, "ref_ce_loss": 0.14517952501773834, "step": 15820 }, { "epoch": 5.276851234156104, "loss": 0.5907910466194153, "step": 15820 }, { "ce_loss": 0.18832477927207947, "epoch": 5.276851234156104, "step": 15820 }, { "distill_loss": 0.24199432134628296, "epoch": 5.276851234156104, "step": 15820 }, { "epoch": 5.276851234156104, "ref_ce_loss": 0.1601739078760147, "step": 15820 }, { "epoch": 5.280186791194129, "loss": 0.7628, "step": 15830 }, { "epoch": 5.280186791194129, "grad_norm": 1.7993576526641846, "step": 15830 }, { "epoch": 5.280186791194129, "learning_rate": 0.00038315187522785485, "step": 15830 }, { "epoch": 5.280186791194129, "loss": 0.6452166438102722, "step": 15830 }, { "ce_loss": 0.18044812977313995, "epoch": 5.280186791194129, "step": 15830 }, { "distill_loss": 0.3071821331977844, "epoch": 5.280186791194129, "step": 15830 }, { "epoch": 5.280186791194129, "ref_ce_loss": 0.11665491759777069, "step": 15830 }, { "epoch": 5.280186791194129, "loss": 0.6112740635871887, "step": 15830 }, { "ce_loss": 0.19033896923065186, "epoch": 5.280186791194129, "step": 15830 }, { "distill_loss": 0.25625908374786377, "epoch": 5.280186791194129, "step": 15830 }, { "epoch": 5.280186791194129, "ref_ce_loss": 0.1352468729019165, "step": 15830 }, { "epoch": 5.283522348232155, "loss": 0.6916, "step": 15840 }, { "epoch": 5.283522348232155, "grad_norm": 1.3148763179779053, "step": 15840 }, { "epoch": 5.283522348232155, "learning_rate": 0.0003827201376010901, "step": 15840 }, { "epoch": 5.283522348232155, "loss": 0.7609184980392456, "step": 15840 }, { "ce_loss": 0.15719489753246307, "epoch": 5.283522348232155, "step": 15840 }, { "distill_loss": 0.24489633738994598, "epoch": 5.283522348232155, "step": 15840 }, { "epoch": 5.283522348232155, "ref_ce_loss": 0.1509247124195099, "step": 15840 }, { "epoch": 5.283522348232155, "loss": 0.644400417804718, "step": 15840 }, { "ce_loss": 0.18509264290332794, "epoch": 5.283522348232155, "step": 15840 }, { "distill_loss": 0.2854074537754059, "epoch": 5.283522348232155, "step": 15840 }, { "epoch": 5.283522348232155, "ref_ce_loss": 0.13598176836967468, "step": 15840 }, { "epoch": 5.28685790527018, "loss": 0.6827, "step": 15850 }, { "epoch": 5.28685790527018, "grad_norm": 1.4376471042633057, "step": 15850 }, { "epoch": 5.28685790527018, "learning_rate": 0.0003822884201417841, "step": 15850 }, { "epoch": 5.28685790527018, "loss": 0.7024563550949097, "step": 15850 }, { "ce_loss": 0.15480999648571014, "epoch": 5.28685790527018, "step": 15850 }, { "distill_loss": 0.2750566303730011, "epoch": 5.28685790527018, "step": 15850 }, { "epoch": 5.28685790527018, "ref_ce_loss": 0.14786669611930847, "step": 15850 }, { "epoch": 5.28685790527018, "loss": 0.597403883934021, "step": 15850 }, { "ce_loss": 0.1416284441947937, "epoch": 5.28685790527018, "step": 15850 }, { "distill_loss": 0.2873379588127136, "epoch": 5.28685790527018, "step": 15850 }, { "epoch": 5.28685790527018, "ref_ce_loss": 0.13200509548187256, "step": 15850 }, { "epoch": 5.290193462308205, "loss": 0.7826, "step": 15860 }, { "epoch": 5.290193462308205, "grad_norm": 3.04026198387146, "step": 15860 }, { "epoch": 5.290193462308205, "learning_rate": 0.00038185672335379773, "step": 15860 }, { "epoch": 5.290193462308205, "loss": 0.6992098093032837, "step": 15860 }, { "ce_loss": 0.1623932421207428, "epoch": 5.290193462308205, "step": 15860 }, { "distill_loss": 0.318131685256958, "epoch": 5.290193462308205, "step": 15860 }, { "epoch": 5.290193462308205, "ref_ce_loss": 0.12590597569942474, "step": 15860 }, { "epoch": 5.290193462308205, "loss": 1.0851943492889404, "step": 15860 }, { "ce_loss": 0.1912342607975006, "epoch": 5.290193462308205, "step": 15860 }, { "distill_loss": 0.31606537103652954, "epoch": 5.290193462308205, "step": 15860 }, { "epoch": 5.290193462308205, "ref_ce_loss": 0.17208561301231384, "step": 15860 }, { "epoch": 5.293529019346231, "loss": 0.745, "step": 15870 }, { "epoch": 5.293529019346231, "grad_norm": 1.4783555269241333, "step": 15870 }, { "epoch": 5.293529019346231, "learning_rate": 0.0003814250477409674, "step": 15870 }, { "epoch": 5.293529019346231, "loss": 0.6411396265029907, "step": 15870 }, { "ce_loss": 0.1674439013004303, "epoch": 5.293529019346231, "step": 15870 }, { "distill_loss": 0.32471901178359985, "epoch": 5.293529019346231, "step": 15870 }, { "epoch": 5.293529019346231, "ref_ce_loss": 0.14873762428760529, "step": 15870 }, { "epoch": 5.293529019346231, "loss": 0.6937785148620605, "step": 15870 }, { "ce_loss": 0.15537618100643158, "epoch": 5.293529019346231, "step": 15870 }, { "distill_loss": 0.29007747769355774, "epoch": 5.293529019346231, "step": 15870 }, { "epoch": 5.293529019346231, "ref_ce_loss": 0.1800795942544937, "step": 15870 }, { "epoch": 5.296864576384256, "loss": 0.6858, "step": 15880 }, { "epoch": 5.296864576384256, "grad_norm": 1.375457763671875, "step": 15880 }, { "epoch": 5.296864576384256, "learning_rate": 0.0003809933938071052, "step": 15880 }, { "epoch": 5.296864576384256, "loss": 0.9765406847000122, "step": 15880 }, { "ce_loss": 0.14966456592082977, "epoch": 5.296864576384256, "step": 15880 }, { "distill_loss": 0.27575427293777466, "epoch": 5.296864576384256, "step": 15880 }, { "epoch": 5.296864576384256, "ref_ce_loss": 0.18869327008724213, "step": 15880 }, { "epoch": 5.296864576384256, "loss": 0.7333070635795593, "step": 15880 }, { "ce_loss": 0.1790359616279602, "epoch": 5.296864576384256, "step": 15880 }, { "distill_loss": 0.2985996603965759, "epoch": 5.296864576384256, "step": 15880 }, { "epoch": 5.296864576384256, "ref_ce_loss": 0.13419148325920105, "step": 15880 }, { "epoch": 5.3002001334222815, "loss": 0.7329, "step": 15890 }, { "epoch": 5.3002001334222815, "grad_norm": 4.050812244415283, "step": 15890 }, { "epoch": 5.3002001334222815, "learning_rate": 0.00038056176205599753, "step": 15890 }, { "epoch": 5.3002001334222815, "loss": 0.7350209951400757, "step": 15890 }, { "ce_loss": 0.19823463261127472, "epoch": 5.3002001334222815, "step": 15890 }, { "distill_loss": 0.3150331974029541, "epoch": 5.3002001334222815, "step": 15890 }, { "epoch": 5.3002001334222815, "ref_ce_loss": 0.16679008305072784, "step": 15890 }, { "epoch": 5.3002001334222815, "loss": 0.8023890256881714, "step": 15890 }, { "ce_loss": 0.2002089023590088, "epoch": 5.3002001334222815, "step": 15890 }, { "distill_loss": 0.2447521984577179, "epoch": 5.3002001334222815, "step": 15890 }, { "epoch": 5.3002001334222815, "ref_ce_loss": 0.15051716566085815, "step": 15890 }, { "epoch": 5.303535690460307, "loss": 0.7442, "step": 15900 }, { "epoch": 5.303535690460307, "grad_norm": 2.720048189163208, "step": 15900 }, { "epoch": 5.303535690460307, "learning_rate": 0.0003801301529914053, "step": 15900 }, { "epoch": 5.303535690460307, "loss": 0.6218310594558716, "step": 15900 }, { "ce_loss": 0.13920356333255768, "epoch": 5.303535690460307, "step": 15900 }, { "distill_loss": 0.3183743357658386, "epoch": 5.303535690460307, "step": 15900 }, { "epoch": 5.303535690460307, "ref_ce_loss": 0.12536291778087616, "step": 15900 }, { "epoch": 5.303535690460307, "loss": 1.0331048965454102, "step": 15900 }, { "ce_loss": 0.27767038345336914, "epoch": 5.303535690460307, "step": 15900 }, { "distill_loss": 0.39653193950653076, "epoch": 5.303535690460307, "step": 15900 }, { "epoch": 5.303535690460307, "ref_ce_loss": 0.15157125890254974, "step": 15900 }, { "epoch": 5.306871247498332, "loss": 0.7917, "step": 15910 }, { "epoch": 5.306871247498332, "grad_norm": 1.7117061614990234, "step": 15910 }, { "epoch": 5.306871247498332, "learning_rate": 0.0003796985671170625, "step": 15910 }, { "epoch": 5.306871247498332, "loss": 0.6310486793518066, "step": 15910 }, { "ce_loss": 0.15972702205181122, "epoch": 5.306871247498332, "step": 15910 }, { "distill_loss": 0.293802946805954, "epoch": 5.306871247498332, "step": 15910 }, { "epoch": 5.306871247498332, "ref_ce_loss": 0.1284494400024414, "step": 15910 }, { "epoch": 5.306871247498332, "loss": 0.6181274652481079, "step": 15910 }, { "ce_loss": 0.15288305282592773, "epoch": 5.306871247498332, "step": 15910 }, { "distill_loss": 0.2966892719268799, "epoch": 5.306871247498332, "step": 15910 }, { "epoch": 5.306871247498332, "ref_ce_loss": 0.12668883800506592, "step": 15910 }, { "epoch": 5.3102068045363575, "loss": 0.7613, "step": 15920 }, { "epoch": 5.3102068045363575, "grad_norm": 2.331742286682129, "step": 15920 }, { "epoch": 5.3102068045363575, "learning_rate": 0.0003792670049366765, "step": 15920 }, { "epoch": 5.3102068045363575, "loss": 0.7452472448348999, "step": 15920 }, { "ce_loss": 0.21786858141422272, "epoch": 5.3102068045363575, "step": 15920 }, { "distill_loss": 0.28816884756088257, "epoch": 5.3102068045363575, "step": 15920 }, { "epoch": 5.3102068045363575, "ref_ce_loss": 0.16267895698547363, "step": 15920 }, { "epoch": 5.3102068045363575, "loss": 0.684940755367279, "step": 15920 }, { "ce_loss": 0.14751452207565308, "epoch": 5.3102068045363575, "step": 15920 }, { "distill_loss": 0.34391942620277405, "epoch": 5.3102068045363575, "step": 15920 }, { "epoch": 5.3102068045363575, "ref_ce_loss": 0.1931276023387909, "step": 15920 }, { "epoch": 5.313542361574383, "loss": 0.7358, "step": 15930 }, { "epoch": 5.313542361574383, "grad_norm": 4.6383748054504395, "step": 15930 }, { "epoch": 5.313542361574383, "learning_rate": 0.0003788354669539266, "step": 15930 }, { "epoch": 5.313542361574383, "loss": 0.7389318943023682, "step": 15930 }, { "ce_loss": 0.13353165984153748, "epoch": 5.313542361574383, "step": 15930 }, { "distill_loss": 0.30215421319007874, "epoch": 5.313542361574383, "step": 15930 }, { "epoch": 5.313542361574383, "ref_ce_loss": 0.1452709436416626, "step": 15930 }, { "epoch": 5.313542361574383, "loss": 0.7610844373703003, "step": 15930 }, { "ce_loss": 0.18171174824237823, "epoch": 5.313542361574383, "step": 15930 }, { "distill_loss": 0.2951901853084564, "epoch": 5.313542361574383, "step": 15930 }, { "epoch": 5.313542361574383, "ref_ce_loss": 0.1556285321712494, "step": 15930 }, { "epoch": 5.316877918612408, "loss": 0.7467, "step": 15940 }, { "epoch": 5.316877918612408, "grad_norm": 2.451345682144165, "step": 15940 }, { "epoch": 5.316877918612408, "learning_rate": 0.00037840395367246405, "step": 15940 }, { "epoch": 5.316877918612408, "loss": 0.7295464873313904, "step": 15940 }, { "ce_loss": 0.18687045574188232, "epoch": 5.316877918612408, "step": 15940 }, { "distill_loss": 0.3247690200805664, "epoch": 5.316877918612408, "step": 15940 }, { "epoch": 5.316877918612408, "ref_ce_loss": 0.17957422137260437, "step": 15940 }, { "epoch": 5.316877918612408, "loss": 0.7804718017578125, "step": 15940 }, { "ce_loss": 0.17364463210105896, "epoch": 5.316877918612408, "step": 15940 }, { "distill_loss": 0.31363144516944885, "epoch": 5.316877918612408, "step": 15940 }, { "epoch": 5.316877918612408, "ref_ce_loss": 0.1749681830406189, "step": 15940 }, { "epoch": 5.320213475650434, "loss": 0.6915, "step": 15950 }, { "epoch": 5.320213475650434, "grad_norm": 2.567410707473755, "step": 15950 }, { "epoch": 5.320213475650434, "learning_rate": 0.0003779724655959116, "step": 15950 }, { "epoch": 5.320213475650434, "loss": 0.6483530402183533, "step": 15950 }, { "ce_loss": 0.20535847544670105, "epoch": 5.320213475650434, "step": 15950 }, { "distill_loss": 0.3006976544857025, "epoch": 5.320213475650434, "step": 15950 }, { "epoch": 5.320213475650434, "ref_ce_loss": 0.14195044338703156, "step": 15950 }, { "epoch": 5.320213475650434, "loss": 0.6408244371414185, "step": 15950 }, { "ce_loss": 0.14869768917560577, "epoch": 5.320213475650434, "step": 15950 }, { "distill_loss": 0.32060331106185913, "epoch": 5.320213475650434, "step": 15950 }, { "epoch": 5.320213475650434, "ref_ce_loss": 0.13498196005821228, "step": 15950 }, { "epoch": 5.323549032688459, "loss": 0.7747, "step": 15960 }, { "epoch": 5.323549032688459, "grad_norm": 1.5408507585525513, "step": 15960 }, { "epoch": 5.323549032688459, "learning_rate": 0.000377541003227862, "step": 15960 }, { "epoch": 5.323549032688459, "loss": 0.6456042528152466, "step": 15960 }, { "ce_loss": 0.1565854847431183, "epoch": 5.323549032688459, "step": 15960 }, { "distill_loss": 0.32306361198425293, "epoch": 5.323549032688459, "step": 15960 }, { "epoch": 5.323549032688459, "ref_ce_loss": 0.12703146040439606, "step": 15960 }, { "epoch": 5.323549032688459, "loss": 0.6512212157249451, "step": 15960 }, { "ce_loss": 0.16830721497535706, "epoch": 5.323549032688459, "step": 15960 }, { "distill_loss": 0.356458455324173, "epoch": 5.323549032688459, "step": 15960 }, { "epoch": 5.323549032688459, "ref_ce_loss": 0.1261129081249237, "step": 15960 }, { "epoch": 5.326884589726484, "loss": 0.7416, "step": 15970 }, { "epoch": 5.326884589726484, "grad_norm": 2.083885669708252, "step": 15970 }, { "epoch": 5.326884589726484, "learning_rate": 0.00037710956707187826, "step": 15970 }, { "epoch": 5.326884589726484, "loss": 0.6469038724899292, "step": 15970 }, { "ce_loss": 0.14937376976013184, "epoch": 5.326884589726484, "step": 15970 }, { "distill_loss": 0.3656269609928131, "epoch": 5.326884589726484, "step": 15970 }, { "epoch": 5.326884589726484, "ref_ce_loss": 0.09814153611660004, "step": 15970 }, { "epoch": 5.326884589726484, "loss": 0.734693169593811, "step": 15970 }, { "ce_loss": 0.20170821249485016, "epoch": 5.326884589726484, "step": 15970 }, { "distill_loss": 0.37536001205444336, "epoch": 5.326884589726484, "step": 15970 }, { "epoch": 5.326884589726484, "ref_ce_loss": 0.1566096842288971, "step": 15970 }, { "epoch": 5.33022014676451, "loss": 0.6901, "step": 15980 }, { "epoch": 5.33022014676451, "grad_norm": 1.5245089530944824, "step": 15980 }, { "epoch": 5.33022014676451, "learning_rate": 0.00037667815763149296, "step": 15980 }, { "epoch": 5.33022014676451, "loss": 0.9062093496322632, "step": 15980 }, { "ce_loss": 0.16774944961071014, "epoch": 5.33022014676451, "step": 15980 }, { "distill_loss": 0.3081806004047394, "epoch": 5.33022014676451, "step": 15980 }, { "epoch": 5.33022014676451, "ref_ce_loss": 0.16400887072086334, "step": 15980 }, { "epoch": 5.33022014676451, "loss": 0.7526729106903076, "step": 15980 }, { "ce_loss": 0.157914400100708, "epoch": 5.33022014676451, "step": 15980 }, { "distill_loss": 0.29823794960975647, "epoch": 5.33022014676451, "step": 15980 }, { "epoch": 5.33022014676451, "ref_ce_loss": 0.12280679494142532, "step": 15980 }, { "epoch": 5.333555703802535, "loss": 0.7697, "step": 15990 }, { "epoch": 5.333555703802535, "grad_norm": 2.3326122760772705, "step": 15990 }, { "epoch": 5.333555703802535, "learning_rate": 0.0003762467754102072, "step": 15990 }, { "epoch": 5.333555703802535, "loss": 1.2512686252593994, "step": 15990 }, { "ce_loss": 0.19256213307380676, "epoch": 5.333555703802535, "step": 15990 }, { "distill_loss": 0.33866339921951294, "epoch": 5.333555703802535, "step": 15990 }, { "epoch": 5.333555703802535, "ref_ce_loss": 0.1713816374540329, "step": 15990 }, { "epoch": 5.333555703802535, "loss": 0.6252297163009644, "step": 15990 }, { "ce_loss": 0.14829422533512115, "epoch": 5.333555703802535, "step": 15990 }, { "distill_loss": 0.33098459243774414, "epoch": 5.333555703802535, "step": 15990 }, { "epoch": 5.333555703802535, "ref_ce_loss": 0.11103123426437378, "step": 15990 }, { "epoch": 5.33689126084056, "loss": 0.8656, "step": 16000 }, { "epoch": 5.33689126084056, "grad_norm": 1.9075807332992554, "step": 16000 }, { "epoch": 5.33689126084056, "learning_rate": 0.00037581542091149055, "step": 16000 }, { "epoch": 5.33689126084056, "loss": 0.6631274819374084, "step": 16000 }, { "ce_loss": 0.19678767025470734, "epoch": 5.33689126084056, "step": 16000 }, { "distill_loss": 0.3303927481174469, "epoch": 5.33689126084056, "step": 16000 }, { "epoch": 5.33689126084056, "ref_ce_loss": 0.13508301973342896, "step": 16000 }, { "epoch": 5.33689126084056, "loss": 0.7302857041358948, "step": 16000 }, { "ce_loss": 0.2101486772298813, "epoch": 5.33689126084056, "step": 16000 }, { "distill_loss": 0.3147587478160858, "epoch": 5.33689126084056, "step": 16000 }, { "epoch": 5.33689126084056, "ref_ce_loss": 0.1732034981250763, "step": 16000 }, { "epoch": 5.340226817878586, "loss": 0.7652, "step": 16010 }, { "epoch": 5.340226817878586, "grad_norm": 1.8792786598205566, "step": 16010 }, { "epoch": 5.340226817878586, "learning_rate": 0.00037538409463878, "step": 16010 }, { "epoch": 5.340226817878586, "loss": 0.9005892872810364, "step": 16010 }, { "ce_loss": 0.20390790700912476, "epoch": 5.340226817878586, "step": 16010 }, { "distill_loss": 0.36432161927223206, "epoch": 5.340226817878586, "step": 16010 }, { "epoch": 5.340226817878586, "ref_ce_loss": 0.1445426493883133, "step": 16010 }, { "epoch": 5.340226817878586, "loss": 0.5989366173744202, "step": 16010 }, { "ce_loss": 0.13817287981510162, "epoch": 5.340226817878586, "step": 16010 }, { "distill_loss": 0.26578107476234436, "epoch": 5.340226817878586, "step": 16010 }, { "epoch": 5.340226817878586, "ref_ce_loss": 0.14596757292747498, "step": 16010 }, { "epoch": 5.343562374916611, "loss": 0.7972, "step": 16020 }, { "epoch": 5.343562374916611, "grad_norm": 1.37563955783844, "step": 16020 }, { "epoch": 5.343562374916611, "learning_rate": 0.0003749527970954798, "step": 16020 }, { "epoch": 5.343562374916611, "loss": 0.6624598503112793, "step": 16020 }, { "ce_loss": 0.14707306027412415, "epoch": 5.343562374916611, "step": 16020 }, { "distill_loss": 0.36443811655044556, "epoch": 5.343562374916611, "step": 16020 }, { "epoch": 5.343562374916611, "ref_ce_loss": 0.1507168412208557, "step": 16020 }, { "epoch": 5.343562374916611, "loss": 0.7892172932624817, "step": 16020 }, { "ce_loss": 0.2143014669418335, "epoch": 5.343562374916611, "step": 16020 }, { "distill_loss": 0.37718236446380615, "epoch": 5.343562374916611, "step": 16020 }, { "epoch": 5.343562374916611, "ref_ce_loss": 0.19742228090763092, "step": 16020 }, { "epoch": 5.346897931954636, "loss": 0.8115, "step": 16030 }, { "epoch": 5.346897931954636, "grad_norm": 1.8450546264648438, "step": 16030 }, { "epoch": 5.346897931954636, "learning_rate": 0.0003745215287849606, "step": 16030 }, { "epoch": 5.346897931954636, "loss": 0.7200436592102051, "step": 16030 }, { "ce_loss": 0.15626439452171326, "epoch": 5.346897931954636, "step": 16030 }, { "distill_loss": 0.2960348129272461, "epoch": 5.346897931954636, "step": 16030 }, { "epoch": 5.346897931954636, "ref_ce_loss": 0.11136317998170853, "step": 16030 }, { "epoch": 5.346897931954636, "loss": 0.6884328126907349, "step": 16030 }, { "ce_loss": 0.1953802853822708, "epoch": 5.346897931954636, "step": 16030 }, { "distill_loss": 0.347346693277359, "epoch": 5.346897931954636, "step": 16030 }, { "epoch": 5.346897931954636, "ref_ce_loss": 0.14511752128601074, "step": 16030 }, { "epoch": 5.350233488992662, "loss": 0.7676, "step": 16040 }, { "epoch": 5.350233488992662, "grad_norm": 2.2440707683563232, "step": 16040 }, { "epoch": 5.350233488992662, "learning_rate": 0.00037409029021055886, "step": 16040 }, { "epoch": 5.350233488992662, "loss": 0.6472846269607544, "step": 16040 }, { "ce_loss": 0.18246594071388245, "epoch": 5.350233488992662, "step": 16040 }, { "distill_loss": 0.3376293182373047, "epoch": 5.350233488992662, "step": 16040 }, { "epoch": 5.350233488992662, "ref_ce_loss": 0.12702175974845886, "step": 16040 }, { "epoch": 5.350233488992662, "loss": 0.6976692080497742, "step": 16040 }, { "ce_loss": 0.21113593876361847, "epoch": 5.350233488992662, "step": 16040 }, { "distill_loss": 0.31901413202285767, "epoch": 5.350233488992662, "step": 16040 }, { "epoch": 5.350233488992662, "ref_ce_loss": 0.16698399186134338, "step": 16040 }, { "epoch": 5.353569046030687, "loss": 0.7845, "step": 16050 }, { "epoch": 5.353569046030687, "grad_norm": 1.2216275930404663, "step": 16050 }, { "epoch": 5.353569046030687, "learning_rate": 0.00037365908187557634, "step": 16050 }, { "epoch": 5.353569046030687, "loss": 0.8387020230293274, "step": 16050 }, { "ce_loss": 0.1646944135427475, "epoch": 5.353569046030687, "step": 16050 }, { "distill_loss": 0.3545537292957306, "epoch": 5.353569046030687, "step": 16050 }, { "epoch": 5.353569046030687, "ref_ce_loss": 0.17081841826438904, "step": 16050 }, { "epoch": 5.353569046030687, "loss": 0.7031180262565613, "step": 16050 }, { "ce_loss": 0.17327630519866943, "epoch": 5.353569046030687, "step": 16050 }, { "distill_loss": 0.2943154275417328, "epoch": 5.353569046030687, "step": 16050 }, { "epoch": 5.353569046030687, "ref_ce_loss": 0.19791601598262787, "step": 16050 }, { "epoch": 5.356904603068712, "loss": 0.7042, "step": 16060 }, { "epoch": 5.356904603068712, "grad_norm": 2.8248355388641357, "step": 16060 }, { "epoch": 5.356904603068712, "learning_rate": 0.0003732279042832798, "step": 16060 }, { "epoch": 5.356904603068712, "loss": 0.7485116124153137, "step": 16060 }, { "ce_loss": 0.2005242556333542, "epoch": 5.356904603068712, "step": 16060 }, { "distill_loss": 0.35190409421920776, "epoch": 5.356904603068712, "step": 16060 }, { "epoch": 5.356904603068712, "ref_ce_loss": 0.14127810299396515, "step": 16060 }, { "epoch": 5.356904603068712, "loss": 0.5735911726951599, "step": 16060 }, { "ce_loss": 0.1520068496465683, "epoch": 5.356904603068712, "step": 16060 }, { "distill_loss": 0.2645909786224365, "epoch": 5.356904603068712, "step": 16060 }, { "epoch": 5.356904603068712, "ref_ce_loss": 0.13216225802898407, "step": 16060 }, { "epoch": 5.360240160106738, "loss": 0.7703, "step": 16070 }, { "epoch": 5.360240160106738, "grad_norm": 2.925449848175049, "step": 16070 }, { "epoch": 5.360240160106738, "learning_rate": 0.00037279675793689977, "step": 16070 }, { "epoch": 5.360240160106738, "loss": 0.8611706495285034, "step": 16070 }, { "ce_loss": 0.1922069638967514, "epoch": 5.360240160106738, "step": 16070 }, { "distill_loss": 0.33778977394104004, "epoch": 5.360240160106738, "step": 16070 }, { "epoch": 5.360240160106738, "ref_ce_loss": 0.12563969194889069, "step": 16070 }, { "epoch": 5.360240160106738, "loss": 0.7186930775642395, "step": 16070 }, { "ce_loss": 0.15724870562553406, "epoch": 5.360240160106738, "step": 16070 }, { "distill_loss": 0.35159173607826233, "epoch": 5.360240160106738, "step": 16070 }, { "epoch": 5.360240160106738, "ref_ce_loss": 0.14582069218158722, "step": 16070 }, { "epoch": 5.363575717144763, "loss": 0.7821, "step": 16080 }, { "epoch": 5.363575717144763, "grad_norm": 3.396252155303955, "step": 16080 }, { "epoch": 5.363575717144763, "learning_rate": 0.0003723656433396304, "step": 16080 }, { "epoch": 5.363575717144763, "loss": 1.0402714014053345, "step": 16080 }, { "ce_loss": 0.16723550856113434, "epoch": 5.363575717144763, "step": 16080 }, { "distill_loss": 0.3183455765247345, "epoch": 5.363575717144763, "step": 16080 }, { "epoch": 5.363575717144763, "ref_ce_loss": 0.14463065564632416, "step": 16080 }, { "epoch": 5.363575717144763, "loss": 0.8913321495056152, "step": 16080 }, { "ce_loss": 0.13748161494731903, "epoch": 5.363575717144763, "step": 16080 }, { "distill_loss": 0.4310514032840729, "epoch": 5.363575717144763, "step": 16080 }, { "epoch": 5.363575717144763, "ref_ce_loss": 0.1252446174621582, "step": 16080 }, { "epoch": 5.3669112741827885, "loss": 0.8108, "step": 16090 }, { "epoch": 5.3669112741827885, "grad_norm": 1.7849303483963013, "step": 16090 }, { "epoch": 5.3669112741827885, "learning_rate": 0.0003719345609946289, "step": 16090 }, { "epoch": 5.3669112741827885, "loss": 0.7346190810203552, "step": 16090 }, { "ce_loss": 0.1527981460094452, "epoch": 5.3669112741827885, "step": 16090 }, { "distill_loss": 0.376956582069397, "epoch": 5.3669112741827885, "step": 16090 }, { "epoch": 5.3669112741827885, "ref_ce_loss": 0.14148907363414764, "step": 16090 }, { "epoch": 5.3669112741827885, "loss": 0.8172603249549866, "step": 16090 }, { "ce_loss": 0.24916993081569672, "epoch": 5.3669112741827885, "step": 16090 }, { "distill_loss": 0.35236555337905884, "epoch": 5.3669112741827885, "step": 16090 }, { "epoch": 5.3669112741827885, "ref_ce_loss": 0.1716679334640503, "step": 16090 }, { "epoch": 5.370246831220814, "loss": 0.7323, "step": 16100 }, { "epoch": 5.370246831220814, "grad_norm": 2.8443517684936523, "step": 16100 }, { "epoch": 5.370246831220814, "learning_rate": 0.00037150351140501455, "step": 16100 }, { "epoch": 5.370246831220814, "loss": 0.581108570098877, "step": 16100 }, { "ce_loss": 0.14519847929477692, "epoch": 5.370246831220814, "step": 16100 }, { "distill_loss": 0.2797437012195587, "epoch": 5.370246831220814, "step": 16100 }, { "epoch": 5.370246831220814, "ref_ce_loss": 0.12386301159858704, "step": 16100 }, { "epoch": 5.370246831220814, "loss": 0.700649082660675, "step": 16100 }, { "ce_loss": 0.17296354472637177, "epoch": 5.370246831220814, "step": 16100 }, { "distill_loss": 0.3582516312599182, "epoch": 5.370246831220814, "step": 16100 }, { "epoch": 5.370246831220814, "ref_ce_loss": 0.1374809294939041, "step": 16100 }, { "epoch": 5.373582388258839, "loss": 0.8128, "step": 16110 }, { "epoch": 5.373582388258839, "grad_norm": 3.3821401596069336, "step": 16110 }, { "epoch": 5.373582388258839, "learning_rate": 0.00037107249507386885, "step": 16110 }, { "epoch": 5.373582388258839, "loss": 0.8478387594223022, "step": 16110 }, { "ce_loss": 0.20072631537914276, "epoch": 5.373582388258839, "step": 16110 }, { "distill_loss": 0.3885403573513031, "epoch": 5.373582388258839, "step": 16110 }, { "epoch": 5.373582388258839, "ref_ce_loss": 0.15446209907531738, "step": 16110 }, { "epoch": 5.373582388258839, "loss": 1.0306737422943115, "step": 16110 }, { "ce_loss": 0.2711739242076874, "epoch": 5.373582388258839, "step": 16110 }, { "distill_loss": 0.35776910185813904, "epoch": 5.373582388258839, "step": 16110 }, { "epoch": 5.373582388258839, "ref_ce_loss": 0.16715602576732635, "step": 16110 }, { "epoch": 5.3769179452968645, "loss": 0.7892, "step": 16120 }, { "epoch": 5.3769179452968645, "grad_norm": 1.596034049987793, "step": 16120 }, { "epoch": 5.3769179452968645, "learning_rate": 0.00037064151250423404, "step": 16120 }, { "epoch": 5.3769179452968645, "loss": 0.7503730058670044, "step": 16120 }, { "ce_loss": 0.18941175937652588, "epoch": 5.3769179452968645, "step": 16120 }, { "distill_loss": 0.33014625310897827, "epoch": 5.3769179452968645, "step": 16120 }, { "epoch": 5.3769179452968645, "ref_ce_loss": 0.14595848321914673, "step": 16120 }, { "epoch": 5.3769179452968645, "loss": 0.5979549288749695, "step": 16120 }, { "ce_loss": 0.16548322141170502, "epoch": 5.3769179452968645, "step": 16120 }, { "distill_loss": 0.29914453625679016, "epoch": 5.3769179452968645, "step": 16120 }, { "epoch": 5.3769179452968645, "ref_ce_loss": 0.13314181566238403, "step": 16120 }, { "epoch": 5.38025350233489, "loss": 0.6698, "step": 16130 }, { "epoch": 5.38025350233489, "grad_norm": 2.2804458141326904, "step": 16130 }, { "epoch": 5.38025350233489, "learning_rate": 0.00037021056419911337, "step": 16130 }, { "epoch": 5.38025350233489, "loss": 0.7559417486190796, "step": 16130 }, { "ce_loss": 0.17035898566246033, "epoch": 5.38025350233489, "step": 16130 }, { "distill_loss": 0.32906657457351685, "epoch": 5.38025350233489, "step": 16130 }, { "epoch": 5.38025350233489, "ref_ce_loss": 0.14381395280361176, "step": 16130 }, { "epoch": 5.38025350233489, "loss": 0.8120471835136414, "step": 16130 }, { "ce_loss": 0.14441224932670593, "epoch": 5.38025350233489, "step": 16130 }, { "distill_loss": 0.3231331706047058, "epoch": 5.38025350233489, "step": 16130 }, { "epoch": 5.38025350233489, "ref_ce_loss": 0.13039423525333405, "step": 16130 }, { "epoch": 5.383589059372915, "loss": 0.8278, "step": 16140 }, { "epoch": 5.383589059372915, "grad_norm": 9.378495216369629, "step": 16140 }, { "epoch": 5.383589059372915, "learning_rate": 0.0003697796506614696, "step": 16140 }, { "epoch": 5.383589059372915, "loss": 0.8636850714683533, "step": 16140 }, { "ce_loss": 0.2732938528060913, "epoch": 5.383589059372915, "step": 16140 }, { "distill_loss": 0.30688318610191345, "epoch": 5.383589059372915, "step": 16140 }, { "epoch": 5.383589059372915, "ref_ce_loss": 0.23132197558879852, "step": 16140 }, { "epoch": 5.383589059372915, "loss": 0.7541180849075317, "step": 16140 }, { "ce_loss": 0.1787034571170807, "epoch": 5.383589059372915, "step": 16140 }, { "distill_loss": 0.2874031364917755, "epoch": 5.383589059372915, "step": 16140 }, { "epoch": 5.383589059372915, "ref_ce_loss": 0.16801492869853973, "step": 16140 }, { "epoch": 5.386924616410941, "loss": 0.7411, "step": 16150 }, { "epoch": 5.386924616410941, "grad_norm": 1.6044723987579346, "step": 16150 }, { "epoch": 5.386924616410941, "learning_rate": 0.0003693487723942255, "step": 16150 }, { "epoch": 5.386924616410941, "loss": 0.7227868437767029, "step": 16150 }, { "ce_loss": 0.1402229517698288, "epoch": 5.386924616410941, "step": 16150 }, { "distill_loss": 0.3689223825931549, "epoch": 5.386924616410941, "step": 16150 }, { "epoch": 5.386924616410941, "ref_ce_loss": 0.15327772498130798, "step": 16150 }, { "epoch": 5.386924616410941, "loss": 0.6728941798210144, "step": 16150 }, { "ce_loss": 0.14626118540763855, "epoch": 5.386924616410941, "step": 16150 }, { "distill_loss": 0.2888425886631012, "epoch": 5.386924616410941, "step": 16150 }, { "epoch": 5.386924616410941, "ref_ce_loss": 0.14756475389003754, "step": 16150 }, { "epoch": 5.390260173448966, "loss": 0.767, "step": 16160 }, { "epoch": 5.390260173448966, "grad_norm": 1.5285331010818481, "step": 16160 }, { "epoch": 5.390260173448966, "learning_rate": 0.00036891792990026195, "step": 16160 }, { "epoch": 5.390260173448966, "loss": 0.636301577091217, "step": 16160 }, { "ce_loss": 0.11532767117023468, "epoch": 5.390260173448966, "step": 16160 }, { "distill_loss": 0.30249351263046265, "epoch": 5.390260173448966, "step": 16160 }, { "epoch": 5.390260173448966, "ref_ce_loss": 0.1128992810845375, "step": 16160 }, { "epoch": 5.390260173448966, "loss": 0.8006002306938171, "step": 16160 }, { "ce_loss": 0.16467390954494476, "epoch": 5.390260173448966, "step": 16160 }, { "distill_loss": 0.3822260797023773, "epoch": 5.390260173448966, "step": 16160 }, { "epoch": 5.390260173448966, "ref_ce_loss": 0.13275885581970215, "step": 16160 }, { "epoch": 5.393595730486991, "loss": 0.7877, "step": 16170 }, { "epoch": 5.393595730486991, "grad_norm": 1.431617021560669, "step": 16170 }, { "epoch": 5.393595730486991, "learning_rate": 0.00036848712368241904, "step": 16170 }, { "epoch": 5.393595730486991, "loss": 0.8732019662857056, "step": 16170 }, { "ce_loss": 0.17849218845367432, "epoch": 5.393595730486991, "step": 16170 }, { "distill_loss": 0.38769400119781494, "epoch": 5.393595730486991, "step": 16170 }, { "epoch": 5.393595730486991, "ref_ce_loss": 0.1828514039516449, "step": 16170 }, { "epoch": 5.393595730486991, "loss": 0.7162034511566162, "step": 16170 }, { "ce_loss": 0.19900190830230713, "epoch": 5.393595730486991, "step": 16170 }, { "distill_loss": 0.3482223153114319, "epoch": 5.393595730486991, "step": 16170 }, { "epoch": 5.393595730486991, "ref_ce_loss": 0.16874390840530396, "step": 16170 }, { "epoch": 5.396931287525017, "loss": 0.7354, "step": 16180 }, { "epoch": 5.396931287525017, "grad_norm": 1.560686469078064, "step": 16180 }, { "epoch": 5.396931287525017, "learning_rate": 0.0003680563542434936, "step": 16180 }, { "epoch": 5.396931287525017, "loss": 0.8284152150154114, "step": 16180 }, { "ce_loss": 0.24313443899154663, "epoch": 5.396931287525017, "step": 16180 }, { "distill_loss": 0.3359900116920471, "epoch": 5.396931287525017, "step": 16180 }, { "epoch": 5.396931287525017, "ref_ce_loss": 0.18405799567699432, "step": 16180 }, { "epoch": 5.396931287525017, "loss": 0.811203122138977, "step": 16180 }, { "ce_loss": 0.22010092437267303, "epoch": 5.396931287525017, "step": 16180 }, { "distill_loss": 0.35429173707962036, "epoch": 5.396931287525017, "step": 16180 }, { "epoch": 5.396931287525017, "ref_ce_loss": 0.1608809381723404, "step": 16180 }, { "epoch": 5.400266844563042, "loss": 0.8111, "step": 16190 }, { "epoch": 5.400266844563042, "grad_norm": 2.740382432937622, "step": 16190 }, { "epoch": 5.400266844563042, "learning_rate": 0.00036762562208624016, "step": 16190 }, { "epoch": 5.400266844563042, "loss": 0.6927562355995178, "step": 16190 }, { "ce_loss": 0.1379556804895401, "epoch": 5.400266844563042, "step": 16190 }, { "distill_loss": 0.3650135099887848, "epoch": 5.400266844563042, "step": 16190 }, { "epoch": 5.400266844563042, "ref_ce_loss": 0.1422877162694931, "step": 16190 }, { "epoch": 5.400266844563042, "loss": 0.5087493062019348, "step": 16190 }, { "ce_loss": 0.14438669383525848, "epoch": 5.400266844563042, "step": 16190 }, { "distill_loss": 0.2298826277256012, "epoch": 5.400266844563042, "step": 16190 }, { "epoch": 5.400266844563042, "ref_ce_loss": 0.11260359734296799, "step": 16190 }, { "epoch": 5.403602401601067, "loss": 0.723, "step": 16200 }, { "epoch": 5.403602401601067, "grad_norm": 1.3632830381393433, "step": 16200 }, { "epoch": 5.403602401601067, "learning_rate": 0.0003671949277133693, "step": 16200 }, { "epoch": 5.403602401601067, "loss": 0.5387332439422607, "step": 16200 }, { "ce_loss": 0.09517402946949005, "epoch": 5.403602401601067, "step": 16200 }, { "distill_loss": 0.2157907485961914, "epoch": 5.403602401601067, "step": 16200 }, { "epoch": 5.403602401601067, "ref_ce_loss": 0.16007165610790253, "step": 16200 }, { "epoch": 5.403602401601067, "loss": 0.7213792204856873, "step": 16200 }, { "ce_loss": 0.192056804895401, "epoch": 5.403602401601067, "step": 16200 }, { "distill_loss": 0.33840423822402954, "epoch": 5.403602401601067, "step": 16200 }, { "epoch": 5.403602401601067, "ref_ce_loss": 0.15678493678569794, "step": 16200 }, { "epoch": 5.406937958639093, "loss": 0.763, "step": 16210 }, { "epoch": 5.406937958639093, "grad_norm": 1.7042118310928345, "step": 16210 }, { "epoch": 5.406937958639093, "learning_rate": 0.00036676427162754777, "step": 16210 }, { "epoch": 5.406937958639093, "loss": 0.8875052332878113, "step": 16210 }, { "ce_loss": 0.1821518987417221, "epoch": 5.406937958639093, "step": 16210 }, { "distill_loss": 0.33723828196525574, "epoch": 5.406937958639093, "step": 16210 }, { "epoch": 5.406937958639093, "ref_ce_loss": 0.16609439253807068, "step": 16210 }, { "epoch": 5.406937958639093, "loss": 0.7967724204063416, "step": 16210 }, { "ce_loss": 0.18657946586608887, "epoch": 5.406937958639093, "step": 16210 }, { "distill_loss": 0.32966044545173645, "epoch": 5.406937958639093, "step": 16210 }, { "epoch": 5.406937958639093, "ref_ce_loss": 0.17224165797233582, "step": 16210 }, { "epoch": 5.410273515677118, "loss": 0.7536, "step": 16220 }, { "epoch": 5.410273515677118, "grad_norm": 1.781567096710205, "step": 16220 }, { "epoch": 5.410273515677118, "learning_rate": 0.00036633365433139754, "step": 16220 }, { "epoch": 5.410273515677118, "loss": 0.8034771680831909, "step": 16220 }, { "ce_loss": 0.23686951398849487, "epoch": 5.410273515677118, "step": 16220 }, { "distill_loss": 0.3565540909767151, "epoch": 5.410273515677118, "step": 16220 }, { "epoch": 5.410273515677118, "ref_ce_loss": 0.18170595169067383, "step": 16220 }, { "epoch": 5.410273515677118, "loss": 0.6347296833992004, "step": 16220 }, { "ce_loss": 0.19076235592365265, "epoch": 5.410273515677118, "step": 16220 }, { "distill_loss": 0.3064838945865631, "epoch": 5.410273515677118, "step": 16220 }, { "epoch": 5.410273515677118, "ref_ce_loss": 0.13722343742847443, "step": 16220 }, { "epoch": 5.413609072715143, "loss": 0.7438, "step": 16230 }, { "epoch": 5.413609072715143, "grad_norm": 2.2475061416625977, "step": 16230 }, { "epoch": 5.413609072715143, "learning_rate": 0.00036590307632749543, "step": 16230 }, { "epoch": 5.413609072715143, "loss": 0.7417387962341309, "step": 16230 }, { "ce_loss": 0.1585659682750702, "epoch": 5.413609072715143, "step": 16230 }, { "distill_loss": 0.2824546694755554, "epoch": 5.413609072715143, "step": 16230 }, { "epoch": 5.413609072715143, "ref_ce_loss": 0.14834964275360107, "step": 16230 }, { "epoch": 5.413609072715143, "loss": 0.6264053583145142, "step": 16230 }, { "ce_loss": 0.15628458559513092, "epoch": 5.413609072715143, "step": 16230 }, { "distill_loss": 0.3434602618217468, "epoch": 5.413609072715143, "step": 16230 }, { "epoch": 5.413609072715143, "ref_ce_loss": 0.12648874521255493, "step": 16230 }, { "epoch": 5.416944629753169, "loss": 0.7557, "step": 16240 }, { "epoch": 5.416944629753169, "grad_norm": 1.4844996929168701, "step": 16240 }, { "epoch": 5.416944629753169, "learning_rate": 0.0003654725381183721, "step": 16240 }, { "epoch": 5.416944629753169, "loss": 0.7612562775611877, "step": 16240 }, { "ce_loss": 0.2294774055480957, "epoch": 5.416944629753169, "step": 16240 }, { "distill_loss": 0.3751007914543152, "epoch": 5.416944629753169, "step": 16240 }, { "epoch": 5.416944629753169, "ref_ce_loss": 0.15650267899036407, "step": 16240 }, { "epoch": 5.416944629753169, "loss": 0.7427468299865723, "step": 16240 }, { "ce_loss": 0.19064797461032867, "epoch": 5.416944629753169, "step": 16240 }, { "distill_loss": 0.333304762840271, "epoch": 5.416944629753169, "step": 16240 }, { "epoch": 5.416944629753169, "ref_ce_loss": 0.13059431314468384, "step": 16240 }, { "epoch": 5.420280186791194, "loss": 0.8037, "step": 16250 }, { "epoch": 5.420280186791194, "grad_norm": 1.3405860662460327, "step": 16250 }, { "epoch": 5.420280186791194, "learning_rate": 0.00036504204020651227, "step": 16250 }, { "epoch": 5.420280186791194, "loss": 0.5487708449363708, "step": 16250 }, { "ce_loss": 0.12229014933109283, "epoch": 5.420280186791194, "step": 16250 }, { "distill_loss": 0.28631076216697693, "epoch": 5.420280186791194, "step": 16250 }, { "epoch": 5.420280186791194, "ref_ce_loss": 0.11592966318130493, "step": 16250 }, { "epoch": 5.420280186791194, "loss": 0.8611596822738647, "step": 16250 }, { "ce_loss": 0.18582291901111603, "epoch": 5.420280186791194, "step": 16250 }, { "distill_loss": 0.34006738662719727, "epoch": 5.420280186791194, "step": 16250 }, { "epoch": 5.420280186791194, "ref_ce_loss": 0.20299170911312103, "step": 16250 }, { "epoch": 5.423615743829219, "loss": 0.8197, "step": 16260 }, { "epoch": 5.423615743829219, "grad_norm": 1.984776258468628, "step": 16260 }, { "epoch": 5.423615743829219, "learning_rate": 0.000364611583094353, "step": 16260 }, { "epoch": 5.423615743829219, "loss": 0.5243992805480957, "step": 16260 }, { "ce_loss": 0.1208997592329979, "epoch": 5.423615743829219, "step": 16260 }, { "distill_loss": 0.275307834148407, "epoch": 5.423615743829219, "step": 16260 }, { "epoch": 5.423615743829219, "ref_ce_loss": 0.1278659701347351, "step": 16260 }, { "epoch": 5.423615743829219, "loss": 0.8565524220466614, "step": 16260 }, { "ce_loss": 0.23394560813903809, "epoch": 5.423615743829219, "step": 16260 }, { "distill_loss": 0.36015230417251587, "epoch": 5.423615743829219, "step": 16260 }, { "epoch": 5.423615743829219, "ref_ce_loss": 0.17624898254871368, "step": 16260 }, { "epoch": 5.426951300867245, "loss": 0.7453, "step": 16270 }, { "epoch": 5.426951300867245, "grad_norm": 1.8883329629898071, "step": 16270 }, { "epoch": 5.426951300867245, "learning_rate": 0.0003641811672842842, "step": 16270 }, { "epoch": 5.426951300867245, "loss": 0.8288264870643616, "step": 16270 }, { "ce_loss": 0.13347579538822174, "epoch": 5.426951300867245, "step": 16270 }, { "distill_loss": 0.2436203956604004, "epoch": 5.426951300867245, "step": 16270 }, { "epoch": 5.426951300867245, "ref_ce_loss": 0.14556606113910675, "step": 16270 }, { "epoch": 5.426951300867245, "loss": 0.5858508944511414, "step": 16270 }, { "ce_loss": 0.16244244575500488, "epoch": 5.426951300867245, "step": 16270 }, { "distill_loss": 0.23977768421173096, "epoch": 5.426951300867245, "step": 16270 }, { "epoch": 5.426951300867245, "ref_ce_loss": 0.14092527329921722, "step": 16270 }, { "epoch": 5.43028685790527, "loss": 0.7074, "step": 16280 }, { "epoch": 5.43028685790527, "grad_norm": 2.5687782764434814, "step": 16280 }, { "epoch": 5.43028685790527, "learning_rate": 0.0003637507932786475, "step": 16280 }, { "epoch": 5.43028685790527, "loss": 0.6588826179504395, "step": 16280 }, { "ce_loss": 0.1949000507593155, "epoch": 5.43028685790527, "step": 16280 }, { "distill_loss": 0.2663923501968384, "epoch": 5.43028685790527, "step": 16280 }, { "epoch": 5.43028685790527, "ref_ce_loss": 0.14771591126918793, "step": 16280 }, { "epoch": 5.43028685790527, "loss": 0.6191297173500061, "step": 16280 }, { "ce_loss": 0.13703526556491852, "epoch": 5.43028685790527, "step": 16280 }, { "distill_loss": 0.295706570148468, "epoch": 5.43028685790527, "step": 16280 }, { "epoch": 5.43028685790527, "ref_ce_loss": 0.13173656165599823, "step": 16280 }, { "epoch": 5.4336224149432955, "loss": 0.6516, "step": 16290 }, { "epoch": 5.4336224149432955, "grad_norm": 1.6438771486282349, "step": 16290 }, { "epoch": 5.4336224149432955, "learning_rate": 0.0003633204615797356, "step": 16290 }, { "epoch": 5.4336224149432955, "loss": 0.5487070083618164, "step": 16290 }, { "ce_loss": 0.14527775347232819, "epoch": 5.4336224149432955, "step": 16290 }, { "distill_loss": 0.2327229082584381, "epoch": 5.4336224149432955, "step": 16290 }, { "epoch": 5.4336224149432955, "ref_ce_loss": 0.17044702172279358, "step": 16290 }, { "epoch": 5.4336224149432955, "loss": 0.7046306133270264, "step": 16290 }, { "ce_loss": 0.21647413074970245, "epoch": 5.4336224149432955, "step": 16290 }, { "distill_loss": 0.24801312386989594, "epoch": 5.4336224149432955, "step": 16290 }, { "epoch": 5.4336224149432955, "ref_ce_loss": 0.19363921880722046, "step": 16290 }, { "epoch": 5.436957971981321, "loss": 0.6573, "step": 16300 }, { "epoch": 5.436957971981321, "grad_norm": 8.01134204864502, "step": 16300 }, { "epoch": 5.436957971981321, "learning_rate": 0.00036289017268979204, "step": 16300 }, { "epoch": 5.436957971981321, "loss": 0.5597203969955444, "step": 16300 }, { "ce_loss": 0.10873627662658691, "epoch": 5.436957971981321, "step": 16300 }, { "distill_loss": 0.21907083690166473, "epoch": 5.436957971981321, "step": 16300 }, { "epoch": 5.436957971981321, "ref_ce_loss": 0.1122971624135971, "step": 16300 }, { "epoch": 5.436957971981321, "loss": 0.5726298093795776, "step": 16300 }, { "ce_loss": 0.18682533502578735, "epoch": 5.436957971981321, "step": 16300 }, { "distill_loss": 0.22446873784065247, "epoch": 5.436957971981321, "step": 16300 }, { "epoch": 5.436957971981321, "ref_ce_loss": 0.16113083064556122, "step": 16300 }, { "epoch": 5.440293529019346, "loss": 0.62, "step": 16310 }, { "epoch": 5.440293529019346, "grad_norm": 1.2961703538894653, "step": 16310 }, { "epoch": 5.440293529019346, "learning_rate": 0.00036245992711100996, "step": 16310 }, { "epoch": 5.440293529019346, "loss": 0.6034284234046936, "step": 16310 }, { "ce_loss": 0.22241556644439697, "epoch": 5.440293529019346, "step": 16310 }, { "distill_loss": 0.24138249456882477, "epoch": 5.440293529019346, "step": 16310 }, { "epoch": 5.440293529019346, "ref_ce_loss": 0.13920840620994568, "step": 16310 }, { "epoch": 5.440293529019346, "loss": 0.5251420736312866, "step": 16310 }, { "ce_loss": 0.15426619350910187, "epoch": 5.440293529019346, "step": 16310 }, { "distill_loss": 0.2309967428445816, "epoch": 5.440293529019346, "step": 16310 }, { "epoch": 5.440293529019346, "ref_ce_loss": 0.13962532579898834, "step": 16310 }, { "epoch": 5.4436290860573715, "loss": 0.7097, "step": 16320 }, { "epoch": 5.4436290860573715, "grad_norm": 1.5120129585266113, "step": 16320 }, { "epoch": 5.4436290860573715, "learning_rate": 0.0003620297253455326, "step": 16320 }, { "epoch": 5.4436290860573715, "loss": 0.6220839619636536, "step": 16320 }, { "ce_loss": 0.1753881275653839, "epoch": 5.4436290860573715, "step": 16320 }, { "distill_loss": 0.2504945397377014, "epoch": 5.4436290860573715, "step": 16320 }, { "epoch": 5.4436290860573715, "ref_ce_loss": 0.16376005113124847, "step": 16320 }, { "epoch": 5.4436290860573715, "loss": 0.6079819202423096, "step": 16320 }, { "ce_loss": 0.13508924841880798, "epoch": 5.4436290860573715, "step": 16320 }, { "distill_loss": 0.2549654543399811, "epoch": 5.4436290860573715, "step": 16320 }, { "epoch": 5.4436290860573715, "ref_ce_loss": 0.11095515638589859, "step": 16320 }, { "epoch": 5.446964643095397, "loss": 0.6923, "step": 16330 }, { "epoch": 5.446964643095397, "grad_norm": 1.3203496932983398, "step": 16330 }, { "epoch": 5.446964643095397, "learning_rate": 0.00036159956789545136, "step": 16330 }, { "epoch": 5.446964643095397, "loss": 0.6790783405303955, "step": 16330 }, { "ce_loss": 0.19250929355621338, "epoch": 5.446964643095397, "step": 16330 }, { "distill_loss": 0.21748296916484833, "epoch": 5.446964643095397, "step": 16330 }, { "epoch": 5.446964643095397, "ref_ce_loss": 0.15178987383842468, "step": 16330 }, { "epoch": 5.446964643095397, "loss": 0.5082254409790039, "step": 16330 }, { "ce_loss": 0.15140938758850098, "epoch": 5.446964643095397, "step": 16330 }, { "distill_loss": 0.21036866307258606, "epoch": 5.446964643095397, "step": 16330 }, { "epoch": 5.446964643095397, "ref_ce_loss": 0.1196763813495636, "step": 16330 }, { "epoch": 5.450300200133422, "loss": 0.654, "step": 16340 }, { "epoch": 5.450300200133422, "grad_norm": 1.700995922088623, "step": 16340 }, { "epoch": 5.450300200133422, "learning_rate": 0.00036116945526280645, "step": 16340 }, { "epoch": 5.450300200133422, "loss": 0.6487125754356384, "step": 16340 }, { "ce_loss": 0.19432617723941803, "epoch": 5.450300200133422, "step": 16340 }, { "distill_loss": 0.21522441506385803, "epoch": 5.450300200133422, "step": 16340 }, { "epoch": 5.450300200133422, "ref_ce_loss": 0.14066074788570404, "step": 16340 }, { "epoch": 5.450300200133422, "loss": 0.708146870136261, "step": 16340 }, { "ce_loss": 0.22304917871952057, "epoch": 5.450300200133422, "step": 16340 }, { "distill_loss": 0.2944159209728241, "epoch": 5.450300200133422, "step": 16340 }, { "epoch": 5.450300200133422, "ref_ce_loss": 0.1507091075181961, "step": 16340 }, { "epoch": 5.4536357571714476, "loss": 0.6855, "step": 16350 }, { "epoch": 5.4536357571714476, "grad_norm": 1.3701978921890259, "step": 16350 }, { "epoch": 5.4536357571714476, "learning_rate": 0.0003607393879495857, "step": 16350 }, { "epoch": 5.4536357571714476, "loss": 0.5639296174049377, "step": 16350 }, { "ce_loss": 0.14695106446743011, "epoch": 5.4536357571714476, "step": 16350 }, { "distill_loss": 0.22136704623699188, "epoch": 5.4536357571714476, "step": 16350 }, { "epoch": 5.4536357571714476, "ref_ce_loss": 0.15459786355495453, "step": 16350 }, { "epoch": 5.4536357571714476, "loss": 0.5510401129722595, "step": 16350 }, { "ce_loss": 0.11991811543703079, "epoch": 5.4536357571714476, "step": 16350 }, { "distill_loss": 0.22079601883888245, "epoch": 5.4536357571714476, "step": 16350 }, { "epoch": 5.4536357571714476, "ref_ce_loss": 0.13425542414188385, "step": 16350 }, { "epoch": 5.456971314209473, "loss": 0.6971, "step": 16360 }, { "epoch": 5.456971314209473, "grad_norm": 1.924656629562378, "step": 16360 }, { "epoch": 5.456971314209473, "learning_rate": 0.00036030936645772377, "step": 16360 }, { "epoch": 5.456971314209473, "loss": 0.5232378840446472, "step": 16360 }, { "ce_loss": 0.11680949479341507, "epoch": 5.456971314209473, "step": 16360 }, { "distill_loss": 0.27530935406684875, "epoch": 5.456971314209473, "step": 16360 }, { "epoch": 5.456971314209473, "ref_ce_loss": 0.0966496467590332, "step": 16360 }, { "epoch": 5.456971314209473, "loss": 0.757350504398346, "step": 16360 }, { "ce_loss": 0.23436380922794342, "epoch": 5.456971314209473, "step": 16360 }, { "distill_loss": 0.32329943776130676, "epoch": 5.456971314209473, "step": 16360 }, { "epoch": 5.456971314209473, "ref_ce_loss": 0.1995662897825241, "step": 16360 }, { "epoch": 5.460306871247498, "loss": 0.7054, "step": 16370 }, { "epoch": 5.460306871247498, "grad_norm": 1.747829556465149, "step": 16370 }, { "epoch": 5.460306871247498, "learning_rate": 0.00035987939128910215, "step": 16370 }, { "epoch": 5.460306871247498, "loss": 0.7242252826690674, "step": 16370 }, { "ce_loss": 0.24792882800102234, "epoch": 5.460306871247498, "step": 16370 }, { "distill_loss": 0.2356468290090561, "epoch": 5.460306871247498, "step": 16370 }, { "epoch": 5.460306871247498, "ref_ce_loss": 0.19074757397174835, "step": 16370 }, { "epoch": 5.460306871247498, "loss": 0.7105324864387512, "step": 16370 }, { "ce_loss": 0.13640131056308746, "epoch": 5.460306871247498, "step": 16370 }, { "distill_loss": 0.26673826575279236, "epoch": 5.460306871247498, "step": 16370 }, { "epoch": 5.460306871247498, "ref_ce_loss": 0.14105232059955597, "step": 16370 }, { "epoch": 5.463642428285524, "loss": 0.7779, "step": 16380 }, { "epoch": 5.463642428285524, "grad_norm": 2.432455539703369, "step": 16380 }, { "epoch": 5.463642428285524, "learning_rate": 0.00035944946294554786, "step": 16380 }, { "epoch": 5.463642428285524, "loss": 0.591047465801239, "step": 16380 }, { "ce_loss": 0.1356351524591446, "epoch": 5.463642428285524, "step": 16380 }, { "distill_loss": 0.2682700455188751, "epoch": 5.463642428285524, "step": 16380 }, { "epoch": 5.463642428285524, "ref_ce_loss": 0.12913872301578522, "step": 16380 }, { "epoch": 5.463642428285524, "loss": 0.6132073402404785, "step": 16380 }, { "ce_loss": 0.16914598643779755, "epoch": 5.463642428285524, "step": 16380 }, { "distill_loss": 0.2821706533432007, "epoch": 5.463642428285524, "step": 16380 }, { "epoch": 5.463642428285524, "ref_ce_loss": 0.12958523631095886, "step": 16380 }, { "epoch": 5.466977985323549, "loss": 0.7305, "step": 16390 }, { "epoch": 5.466977985323549, "grad_norm": 2.488482713699341, "step": 16390 }, { "epoch": 5.466977985323549, "learning_rate": 0.0003590195819288338, "step": 16390 }, { "epoch": 5.466977985323549, "loss": 0.732840895652771, "step": 16390 }, { "ce_loss": 0.1996365338563919, "epoch": 5.466977985323549, "step": 16390 }, { "distill_loss": 0.32097291946411133, "epoch": 5.466977985323549, "step": 16390 }, { "epoch": 5.466977985323549, "ref_ce_loss": 0.13700750470161438, "step": 16390 }, { "epoch": 5.466977985323549, "loss": 0.6898370981216431, "step": 16390 }, { "ce_loss": 0.15528330206871033, "epoch": 5.466977985323549, "step": 16390 }, { "distill_loss": 0.30699825286865234, "epoch": 5.466977985323549, "step": 16390 }, { "epoch": 5.466977985323549, "ref_ce_loss": 0.14022010564804077, "step": 16390 }, { "epoch": 5.470313542361574, "loss": 0.7357, "step": 16400 }, { "epoch": 5.470313542361574, "grad_norm": 2.764321804046631, "step": 16400 }, { "epoch": 5.470313542361574, "learning_rate": 0.00035858974874067746, "step": 16400 }, { "epoch": 5.470313542361574, "loss": 0.7311261296272278, "step": 16400 }, { "ce_loss": 0.2125311642885208, "epoch": 5.470313542361574, "step": 16400 }, { "distill_loss": 0.33469152450561523, "epoch": 5.470313542361574, "step": 16400 }, { "epoch": 5.470313542361574, "ref_ce_loss": 0.14825886487960815, "step": 16400 }, { "epoch": 5.470313542361574, "loss": 0.7489011883735657, "step": 16400 }, { "ce_loss": 0.2545397877693176, "epoch": 5.470313542361574, "step": 16400 }, { "distill_loss": 0.32159775495529175, "epoch": 5.470313542361574, "step": 16400 }, { "epoch": 5.470313542361574, "ref_ce_loss": 0.13286516070365906, "step": 16400 }, { "epoch": 5.4736490993996, "loss": 0.7135, "step": 16410 }, { "epoch": 5.4736490993996, "grad_norm": 1.813016414642334, "step": 16410 }, { "epoch": 5.4736490993996, "learning_rate": 0.0003581599638827401, "step": 16410 }, { "epoch": 5.4736490993996, "loss": 0.6902078986167908, "step": 16410 }, { "ce_loss": 0.17582891881465912, "epoch": 5.4736490993996, "step": 16410 }, { "distill_loss": 0.3405342698097229, "epoch": 5.4736490993996, "step": 16410 }, { "epoch": 5.4736490993996, "ref_ce_loss": 0.13279816508293152, "step": 16410 }, { "epoch": 5.4736490993996, "loss": 0.6575373411178589, "step": 16410 }, { "ce_loss": 0.18639127910137177, "epoch": 5.4736490993996, "step": 16410 }, { "distill_loss": 0.3385080099105835, "epoch": 5.4736490993996, "step": 16410 }, { "epoch": 5.4736490993996, "ref_ce_loss": 0.13242416083812714, "step": 16410 }, { "epoch": 5.476984656437625, "loss": 0.7883, "step": 16420 }, { "epoch": 5.476984656437625, "grad_norm": 1.8976466655731201, "step": 16420 }, { "epoch": 5.476984656437625, "learning_rate": 0.0003577302278566272, "step": 16420 }, { "epoch": 5.476984656437625, "loss": 0.6805810928344727, "step": 16420 }, { "ce_loss": 0.15746085345745087, "epoch": 5.476984656437625, "step": 16420 }, { "distill_loss": 0.30172422528266907, "epoch": 5.476984656437625, "step": 16420 }, { "epoch": 5.476984656437625, "ref_ce_loss": 0.16179125010967255, "step": 16420 }, { "epoch": 5.476984656437625, "loss": 0.9149475693702698, "step": 16420 }, { "ce_loss": 0.21631371974945068, "epoch": 5.476984656437625, "step": 16420 }, { "distill_loss": 0.3094479739665985, "epoch": 5.476984656437625, "step": 16420 }, { "epoch": 5.476984656437625, "ref_ce_loss": 0.15932220220565796, "step": 16420 }, { "epoch": 5.48032021347565, "loss": 0.8182, "step": 16430 }, { "epoch": 5.48032021347565, "grad_norm": 1.3693673610687256, "step": 16430 }, { "epoch": 5.48032021347565, "learning_rate": 0.0003573005411638867, "step": 16430 }, { "epoch": 5.48032021347565, "loss": 0.645654559135437, "step": 16430 }, { "ce_loss": 0.154694065451622, "epoch": 5.48032021347565, "step": 16430 }, { "distill_loss": 0.29492273926734924, "epoch": 5.48032021347565, "step": 16430 }, { "epoch": 5.48032021347565, "ref_ce_loss": 0.1481200009584427, "step": 16430 }, { "epoch": 5.48032021347565, "loss": 0.6389703750610352, "step": 16430 }, { "ce_loss": 0.1725892722606659, "epoch": 5.48032021347565, "step": 16430 }, { "distill_loss": 0.3293762803077698, "epoch": 5.48032021347565, "step": 16430 }, { "epoch": 5.48032021347565, "ref_ce_loss": 0.10531818866729736, "step": 16430 }, { "epoch": 5.483655770513676, "loss": 0.7309, "step": 16440 }, { "epoch": 5.483655770513676, "grad_norm": 2.5965490341186523, "step": 16440 }, { "epoch": 5.483655770513676, "learning_rate": 0.0003568709043060094, "step": 16440 }, { "epoch": 5.483655770513676, "loss": 0.9237037301063538, "step": 16440 }, { "ce_loss": 0.16885775327682495, "epoch": 5.483655770513676, "step": 16440 }, { "distill_loss": 0.3852352797985077, "epoch": 5.483655770513676, "step": 16440 }, { "epoch": 5.483655770513676, "ref_ce_loss": 0.1278907209634781, "step": 16440 }, { "epoch": 5.483655770513676, "loss": 0.7946254014968872, "step": 16440 }, { "ce_loss": 0.19258242845535278, "epoch": 5.483655770513676, "step": 16440 }, { "distill_loss": 0.33972975611686707, "epoch": 5.483655770513676, "step": 16440 }, { "epoch": 5.483655770513676, "ref_ce_loss": 0.14921855926513672, "step": 16440 }, { "epoch": 5.486991327551701, "loss": 0.765, "step": 16450 }, { "epoch": 5.486991327551701, "grad_norm": 1.9643694162368774, "step": 16450 }, { "epoch": 5.486991327551701, "learning_rate": 0.0003564413177844276, "step": 16450 }, { "epoch": 5.486991327551701, "loss": 0.5418407320976257, "step": 16450 }, { "ce_loss": 0.12205801159143448, "epoch": 5.486991327551701, "step": 16450 }, { "distill_loss": 0.2988021969795227, "epoch": 5.486991327551701, "step": 16450 }, { "epoch": 5.486991327551701, "ref_ce_loss": 0.12057796865701675, "step": 16450 }, { "epoch": 5.486991327551701, "loss": 0.6309268474578857, "step": 16450 }, { "ce_loss": 0.16797548532485962, "epoch": 5.486991327551701, "step": 16450 }, { "distill_loss": 0.2645770311355591, "epoch": 5.486991327551701, "step": 16450 }, { "epoch": 5.486991327551701, "ref_ce_loss": 0.15919606387615204, "step": 16450 }, { "epoch": 5.490326884589726, "loss": 0.7653, "step": 16460 }, { "epoch": 5.490326884589726, "grad_norm": 2.2363979816436768, "step": 16460 }, { "epoch": 5.490326884589726, "learning_rate": 0.0003560117821005151, "step": 16460 }, { "epoch": 5.490326884589726, "loss": 0.6473536491394043, "step": 16460 }, { "ce_loss": 0.12286846339702606, "epoch": 5.490326884589726, "step": 16460 }, { "distill_loss": 0.23971088230609894, "epoch": 5.490326884589726, "step": 16460 }, { "epoch": 5.490326884589726, "ref_ce_loss": 0.1602736860513687, "step": 16460 }, { "epoch": 5.490326884589726, "loss": 0.5908822417259216, "step": 16460 }, { "ce_loss": 0.18052250146865845, "epoch": 5.490326884589726, "step": 16460 }, { "distill_loss": 0.29990532994270325, "epoch": 5.490326884589726, "step": 16460 }, { "epoch": 5.490326884589726, "ref_ce_loss": 0.11026539653539658, "step": 16460 }, { "epoch": 5.493662441627752, "loss": 0.7457, "step": 16470 }, { "epoch": 5.493662441627752, "grad_norm": 1.346691370010376, "step": 16470 }, { "epoch": 5.493662441627752, "learning_rate": 0.00035558229775558615, "step": 16470 }, { "epoch": 5.493662441627752, "loss": 0.615505039691925, "step": 16470 }, { "ce_loss": 0.12159193307161331, "epoch": 5.493662441627752, "step": 16470 }, { "distill_loss": 0.2667520344257355, "epoch": 5.493662441627752, "step": 16470 }, { "epoch": 5.493662441627752, "ref_ce_loss": 0.15134981274604797, "step": 16470 }, { "epoch": 5.493662441627752, "loss": 0.6371005773544312, "step": 16470 }, { "ce_loss": 0.17626246809959412, "epoch": 5.493662441627752, "step": 16470 }, { "distill_loss": 0.2783363461494446, "epoch": 5.493662441627752, "step": 16470 }, { "epoch": 5.493662441627752, "ref_ce_loss": 0.1304175853729248, "step": 16470 }, { "epoch": 5.496997998665777, "loss": 0.7309, "step": 16480 }, { "epoch": 5.496997998665777, "grad_norm": 1.4785386323928833, "step": 16480 }, { "epoch": 5.496997998665777, "learning_rate": 0.00035515286525089536, "step": 16480 }, { "epoch": 5.496997998665777, "loss": 0.7368677854537964, "step": 16480 }, { "ce_loss": 0.22332510352134705, "epoch": 5.496997998665777, "step": 16480 }, { "distill_loss": 0.3088325560092926, "epoch": 5.496997998665777, "step": 16480 }, { "epoch": 5.496997998665777, "ref_ce_loss": 0.1685892939567566, "step": 16480 }, { "epoch": 5.496997998665777, "loss": 0.6153769493103027, "step": 16480 }, { "ce_loss": 0.13858671486377716, "epoch": 5.496997998665777, "step": 16480 }, { "distill_loss": 0.2944321632385254, "epoch": 5.496997998665777, "step": 16480 }, { "epoch": 5.496997998665777, "ref_ce_loss": 0.12610027194023132, "step": 16480 }, { "epoch": 5.5003335557038024, "loss": 0.723, "step": 16490 }, { "epoch": 5.5003335557038024, "grad_norm": 5.9627532958984375, "step": 16490 }, { "epoch": 5.5003335557038024, "learning_rate": 0.0003547234850876364, "step": 16490 }, { "epoch": 5.5003335557038024, "loss": 0.7170730233192444, "step": 16490 }, { "ce_loss": 0.1981697976589203, "epoch": 5.5003335557038024, "step": 16490 }, { "distill_loss": 0.3203241229057312, "epoch": 5.5003335557038024, "step": 16490 }, { "epoch": 5.5003335557038024, "ref_ce_loss": 0.13835738599300385, "step": 16490 }, { "epoch": 5.5003335557038024, "loss": 0.9237845540046692, "step": 16490 }, { "ce_loss": 0.30550992488861084, "epoch": 5.5003335557038024, "step": 16490 }, { "distill_loss": 0.3872705101966858, "epoch": 5.5003335557038024, "step": 16490 }, { "epoch": 5.5003335557038024, "ref_ce_loss": 0.17537952959537506, "step": 16490 }, { "epoch": 5.503669112741828, "loss": 0.7416, "step": 16500 }, { "epoch": 5.503669112741828, "grad_norm": 1.865121603012085, "step": 16500 }, { "epoch": 5.503669112741828, "learning_rate": 0.00035429415776694237, "step": 16500 }, { "epoch": 5.503669112741828, "loss": 0.7768089771270752, "step": 16500 }, { "ce_loss": 0.1884937435388565, "epoch": 5.503669112741828, "step": 16500 }, { "distill_loss": 0.3611510396003723, "epoch": 5.503669112741828, "step": 16500 }, { "epoch": 5.503669112741828, "ref_ce_loss": 0.14749257266521454, "step": 16500 }, { "epoch": 5.503669112741828, "loss": 0.7384665608406067, "step": 16500 }, { "ce_loss": 0.16080470383167267, "epoch": 5.503669112741828, "step": 16500 }, { "distill_loss": 0.3617389500141144, "epoch": 5.503669112741828, "step": 16500 }, { "epoch": 5.503669112741828, "ref_ce_loss": 0.11626183241605759, "step": 16500 }, { "epoch": 5.507004669779853, "loss": 0.6953, "step": 16510 }, { "epoch": 5.507004669779853, "grad_norm": 6.708162784576416, "step": 16510 }, { "epoch": 5.507004669779853, "learning_rate": 0.0003538648837898844, "step": 16510 }, { "epoch": 5.507004669779853, "loss": 0.7219257354736328, "step": 16510 }, { "ce_loss": 0.17056331038475037, "epoch": 5.507004669779853, "step": 16510 }, { "distill_loss": 0.297270804643631, "epoch": 5.507004669779853, "step": 16510 }, { "epoch": 5.507004669779853, "ref_ce_loss": 0.15417173504829407, "step": 16510 }, { "epoch": 5.507004669779853, "loss": 0.6959221959114075, "step": 16510 }, { "ce_loss": 0.16128702461719513, "epoch": 5.507004669779853, "step": 16510 }, { "distill_loss": 0.3540896773338318, "epoch": 5.507004669779853, "step": 16510 }, { "epoch": 5.507004669779853, "ref_ce_loss": 0.13030454516410828, "step": 16510 }, { "epoch": 5.5103402268178785, "loss": 0.7567, "step": 16520 }, { "epoch": 5.5103402268178785, "grad_norm": 2.4799792766571045, "step": 16520 }, { "epoch": 5.5103402268178785, "learning_rate": 0.0003534356636574714, "step": 16520 }, { "epoch": 5.5103402268178785, "loss": 0.9273051619529724, "step": 16520 }, { "ce_loss": 0.1862836480140686, "epoch": 5.5103402268178785, "step": 16520 }, { "distill_loss": 0.4356238842010498, "epoch": 5.5103402268178785, "step": 16520 }, { "epoch": 5.5103402268178785, "ref_ce_loss": 0.16171026229858398, "step": 16520 }, { "epoch": 5.5103402268178785, "loss": 0.9347162246704102, "step": 16520 }, { "ce_loss": 0.22167471051216125, "epoch": 5.5103402268178785, "step": 16520 }, { "distill_loss": 0.35594063997268677, "epoch": 5.5103402268178785, "step": 16520 }, { "epoch": 5.5103402268178785, "ref_ce_loss": 0.13794006407260895, "step": 16520 }, { "epoch": 5.513675783855904, "loss": 0.8511, "step": 16530 }, { "epoch": 5.513675783855904, "grad_norm": 2.2826759815216064, "step": 16530 }, { "epoch": 5.513675783855904, "learning_rate": 0.0003530064978706494, "step": 16530 }, { "epoch": 5.513675783855904, "loss": 0.5434016585350037, "step": 16530 }, { "ce_loss": 0.10969436913728714, "epoch": 5.513675783855904, "step": 16530 }, { "distill_loss": 0.2416934370994568, "epoch": 5.513675783855904, "step": 16530 }, { "epoch": 5.513675783855904, "ref_ce_loss": 0.1079748123884201, "step": 16530 }, { "epoch": 5.513675783855904, "loss": 0.8625865578651428, "step": 16530 }, { "ce_loss": 0.17143461108207703, "epoch": 5.513675783855904, "step": 16530 }, { "distill_loss": 0.29201212525367737, "epoch": 5.513675783855904, "step": 16530 }, { "epoch": 5.513675783855904, "ref_ce_loss": 0.1500742882490158, "step": 16530 }, { "epoch": 5.517011340893929, "loss": 0.7527, "step": 16540 }, { "epoch": 5.517011340893929, "grad_norm": 2.8519067764282227, "step": 16540 }, { "epoch": 5.517011340893929, "learning_rate": 0.0003525773869303012, "step": 16540 }, { "epoch": 5.517011340893929, "loss": 0.8090465068817139, "step": 16540 }, { "ce_loss": 0.1982586681842804, "epoch": 5.517011340893929, "step": 16540 }, { "distill_loss": 0.3481493592262268, "epoch": 5.517011340893929, "step": 16540 }, { "epoch": 5.517011340893929, "ref_ce_loss": 0.14651048183441162, "step": 16540 }, { "epoch": 5.517011340893929, "loss": 0.875690758228302, "step": 16540 }, { "ce_loss": 0.188467875123024, "epoch": 5.517011340893929, "step": 16540 }, { "distill_loss": 0.3411063849925995, "epoch": 5.517011340893929, "step": 16540 }, { "epoch": 5.517011340893929, "ref_ce_loss": 0.13025033473968506, "step": 16540 }, { "epoch": 5.5203468979319545, "loss": 0.811, "step": 16550 }, { "epoch": 5.5203468979319545, "grad_norm": 2.2053282260894775, "step": 16550 }, { "epoch": 5.5203468979319545, "learning_rate": 0.00035214833133724523, "step": 16550 }, { "epoch": 5.5203468979319545, "loss": 0.7356788516044617, "step": 16550 }, { "ce_loss": 0.19450567662715912, "epoch": 5.5203468979319545, "step": 16550 }, { "distill_loss": 0.31631433963775635, "epoch": 5.5203468979319545, "step": 16550 }, { "epoch": 5.5203468979319545, "ref_ce_loss": 0.1460232138633728, "step": 16550 }, { "epoch": 5.5203468979319545, "loss": 0.6888248324394226, "step": 16550 }, { "ce_loss": 0.1981515884399414, "epoch": 5.5203468979319545, "step": 16550 }, { "distill_loss": 0.33612701296806335, "epoch": 5.5203468979319545, "step": 16550 }, { "epoch": 5.5203468979319545, "ref_ce_loss": 0.1542927771806717, "step": 16550 }, { "epoch": 5.52368245496998, "loss": 0.7646, "step": 16560 }, { "epoch": 5.52368245496998, "grad_norm": 1.941384196281433, "step": 16560 }, { "epoch": 5.52368245496998, "learning_rate": 0.0003517193315922358, "step": 16560 }, { "epoch": 5.52368245496998, "loss": 0.7945826053619385, "step": 16560 }, { "ce_loss": 0.1891850233078003, "epoch": 5.52368245496998, "step": 16560 }, { "distill_loss": 0.3589797616004944, "epoch": 5.52368245496998, "step": 16560 }, { "epoch": 5.52368245496998, "ref_ce_loss": 0.14466382563114166, "step": 16560 }, { "epoch": 5.52368245496998, "loss": 1.0708749294281006, "step": 16560 }, { "ce_loss": 0.15622949600219727, "epoch": 5.52368245496998, "step": 16560 }, { "distill_loss": 0.36662739515304565, "epoch": 5.52368245496998, "step": 16560 }, { "epoch": 5.52368245496998, "ref_ce_loss": 0.1381683200597763, "step": 16560 }, { "epoch": 5.527018012008005, "loss": 0.7938, "step": 16570 }, { "epoch": 5.527018012008005, "grad_norm": 2.5366055965423584, "step": 16570 }, { "epoch": 5.527018012008005, "learning_rate": 0.00035129038819596147, "step": 16570 }, { "epoch": 5.527018012008005, "loss": 0.5957663059234619, "step": 16570 }, { "ce_loss": 0.17193228006362915, "epoch": 5.527018012008005, "step": 16570 }, { "distill_loss": 0.29176750779151917, "epoch": 5.527018012008005, "step": 16570 }, { "epoch": 5.527018012008005, "ref_ce_loss": 0.13183808326721191, "step": 16570 }, { "epoch": 5.527018012008005, "loss": 0.7876423001289368, "step": 16570 }, { "ce_loss": 0.21744805574417114, "epoch": 5.527018012008005, "step": 16570 }, { "distill_loss": 0.33549293875694275, "epoch": 5.527018012008005, "step": 16570 }, { "epoch": 5.527018012008005, "ref_ce_loss": 0.1758352667093277, "step": 16570 }, { "epoch": 5.530353569046031, "loss": 0.819, "step": 16580 }, { "epoch": 5.530353569046031, "grad_norm": 1.6034607887268066, "step": 16580 }, { "epoch": 5.530353569046031, "learning_rate": 0.00035086150164904555, "step": 16580 }, { "epoch": 5.530353569046031, "loss": 0.5763473510742188, "step": 16580 }, { "ce_loss": 0.12128767371177673, "epoch": 5.530353569046031, "step": 16580 }, { "distill_loss": 0.2356862723827362, "epoch": 5.530353569046031, "step": 16580 }, { "epoch": 5.530353569046031, "ref_ce_loss": 0.1255425214767456, "step": 16580 }, { "epoch": 5.530353569046031, "loss": 0.868805468082428, "step": 16580 }, { "ce_loss": 0.1575346738100052, "epoch": 5.530353569046031, "step": 16580 }, { "distill_loss": 0.3505779504776001, "epoch": 5.530353569046031, "step": 16580 }, { "epoch": 5.530353569046031, "ref_ce_loss": 0.1724642813205719, "step": 16580 }, { "epoch": 5.533689126084056, "loss": 0.8047, "step": 16590 }, { "epoch": 5.533689126084056, "grad_norm": 2.1715855598449707, "step": 16590 }, { "epoch": 5.533689126084056, "learning_rate": 0.00035043267245204464, "step": 16590 }, { "epoch": 5.533689126084056, "loss": 0.5283437371253967, "step": 16590 }, { "ce_loss": 0.1427266001701355, "epoch": 5.533689126084056, "step": 16590 }, { "distill_loss": 0.26146018505096436, "epoch": 5.533689126084056, "step": 16590 }, { "epoch": 5.533689126084056, "ref_ce_loss": 0.12393170595169067, "step": 16590 }, { "epoch": 5.533689126084056, "loss": 0.7784868478775024, "step": 16590 }, { "ce_loss": 0.17701445519924164, "epoch": 5.533689126084056, "step": 16590 }, { "distill_loss": 0.32144787907600403, "epoch": 5.533689126084056, "step": 16590 }, { "epoch": 5.533689126084056, "ref_ce_loss": 0.16988804936408997, "step": 16590 }, { "epoch": 5.537024683122081, "loss": 0.6876, "step": 16600 }, { "epoch": 5.537024683122081, "grad_norm": 1.437143325805664, "step": 16600 }, { "epoch": 5.537024683122081, "learning_rate": 0.0003500039011054486, "step": 16600 }, { "epoch": 5.537024683122081, "loss": 0.6373987197875977, "step": 16600 }, { "ce_loss": 0.14163723587989807, "epoch": 5.537024683122081, "step": 16600 }, { "distill_loss": 0.3294292390346527, "epoch": 5.537024683122081, "step": 16600 }, { "epoch": 5.537024683122081, "ref_ce_loss": 0.11789793521165848, "step": 16600 }, { "epoch": 5.537024683122081, "loss": 0.957266628742218, "step": 16600 }, { "ce_loss": 0.11309836059808731, "epoch": 5.537024683122081, "step": 16600 }, { "distill_loss": 0.26229843497276306, "epoch": 5.537024683122081, "step": 16600 }, { "epoch": 5.537024683122081, "ref_ce_loss": 0.13751594722270966, "step": 16600 }, { "epoch": 5.540360240160107, "loss": 0.7447, "step": 16610 }, { "epoch": 5.540360240160107, "grad_norm": 1.6323940753936768, "step": 16610 }, { "epoch": 5.540360240160107, "learning_rate": 0.00034957518810967993, "step": 16610 }, { "epoch": 5.540360240160107, "loss": 0.9404606819152832, "step": 16610 }, { "ce_loss": 0.1382031887769699, "epoch": 5.540360240160107, "step": 16610 }, { "distill_loss": 0.3189763128757477, "epoch": 5.540360240160107, "step": 16610 }, { "epoch": 5.540360240160107, "ref_ce_loss": 0.14445778727531433, "step": 16610 }, { "epoch": 5.540360240160107, "loss": 0.6262696385383606, "step": 16610 }, { "ce_loss": 0.13352514803409576, "epoch": 5.540360240160107, "step": 16610 }, { "distill_loss": 0.33467742800712585, "epoch": 5.540360240160107, "step": 16610 }, { "epoch": 5.540360240160107, "ref_ce_loss": 0.11829882860183716, "step": 16610 }, { "epoch": 5.543695797198132, "loss": 0.8215, "step": 16620 }, { "epoch": 5.543695797198132, "grad_norm": 1.5988037586212158, "step": 16620 }, { "epoch": 5.543695797198132, "learning_rate": 0.00034914653396509257, "step": 16620 }, { "epoch": 5.543695797198132, "loss": 0.9318430423736572, "step": 16620 }, { "ce_loss": 0.15590324997901917, "epoch": 5.543695797198132, "step": 16620 }, { "distill_loss": 0.26491907238960266, "epoch": 5.543695797198132, "step": 16620 }, { "epoch": 5.543695797198132, "ref_ce_loss": 0.15649548172950745, "step": 16620 }, { "epoch": 5.543695797198132, "loss": 0.8442918062210083, "step": 16620 }, { "ce_loss": 0.22664012014865875, "epoch": 5.543695797198132, "step": 16620 }, { "distill_loss": 0.38372889161109924, "epoch": 5.543695797198132, "step": 16620 }, { "epoch": 5.543695797198132, "ref_ce_loss": 0.178589329123497, "step": 16620 }, { "epoch": 5.547031354236157, "loss": 0.7808, "step": 16630 }, { "epoch": 5.547031354236157, "grad_norm": 2.557018756866455, "step": 16630 }, { "epoch": 5.547031354236157, "learning_rate": 0.00034871793917197225, "step": 16630 }, { "epoch": 5.547031354236157, "loss": 0.7209650278091431, "step": 16630 }, { "ce_loss": 0.18692630529403687, "epoch": 5.547031354236157, "step": 16630 }, { "distill_loss": 0.3469357192516327, "epoch": 5.547031354236157, "step": 16630 }, { "epoch": 5.547031354236157, "ref_ce_loss": 0.15406200289726257, "step": 16630 }, { "epoch": 5.547031354236157, "loss": 0.7244488000869751, "step": 16630 }, { "ce_loss": 0.2061254233121872, "epoch": 5.547031354236157, "step": 16630 }, { "distill_loss": 0.3313891291618347, "epoch": 5.547031354236157, "step": 16630 }, { "epoch": 5.547031354236157, "ref_ce_loss": 0.16208119690418243, "step": 16630 }, { "epoch": 5.550366911274183, "loss": 0.734, "step": 16640 }, { "epoch": 5.550366911274183, "grad_norm": 2.5533528327941895, "step": 16640 }, { "epoch": 5.550366911274183, "learning_rate": 0.00034828940423053495, "step": 16640 }, { "epoch": 5.550366911274183, "loss": 1.1351650953292847, "step": 16640 }, { "ce_loss": 0.20006652176380157, "epoch": 5.550366911274183, "step": 16640 }, { "distill_loss": 0.35626906156539917, "epoch": 5.550366911274183, "step": 16640 }, { "epoch": 5.550366911274183, "ref_ce_loss": 0.1594037264585495, "step": 16640 }, { "epoch": 5.550366911274183, "loss": 0.7358875274658203, "step": 16640 }, { "ce_loss": 0.14336541295051575, "epoch": 5.550366911274183, "step": 16640 }, { "distill_loss": 0.34208860993385315, "epoch": 5.550366911274183, "step": 16640 }, { "epoch": 5.550366911274183, "ref_ce_loss": 0.1457943618297577, "step": 16640 }, { "epoch": 5.553702468312208, "loss": 0.7556, "step": 16650 }, { "epoch": 5.553702468312208, "grad_norm": 1.638131856918335, "step": 16650 }, { "epoch": 5.553702468312208, "learning_rate": 0.00034786092964092736, "step": 16650 }, { "epoch": 5.553702468312208, "loss": 0.6036756634712219, "step": 16650 }, { "ce_loss": 0.14441810548305511, "epoch": 5.553702468312208, "step": 16650 }, { "distill_loss": 0.32643380761146545, "epoch": 5.553702468312208, "step": 16650 }, { "epoch": 5.553702468312208, "ref_ce_loss": 0.13198567926883698, "step": 16650 }, { "epoch": 5.553702468312208, "loss": 0.5919756293296814, "step": 16650 }, { "ce_loss": 0.14673617482185364, "epoch": 5.553702468312208, "step": 16650 }, { "distill_loss": 0.3344498574733734, "epoch": 5.553702468312208, "step": 16650 }, { "epoch": 5.553702468312208, "ref_ce_loss": 0.10504456609487534, "step": 16650 }, { "epoch": 5.557038025350233, "loss": 0.6602, "step": 16660 }, { "epoch": 5.557038025350233, "grad_norm": 1.5747668743133545, "step": 16660 }, { "epoch": 5.557038025350233, "learning_rate": 0.00034743251590322515, "step": 16660 }, { "epoch": 5.557038025350233, "loss": 0.7086818814277649, "step": 16660 }, { "ce_loss": 0.20601743459701538, "epoch": 5.557038025350233, "step": 16660 }, { "distill_loss": 0.3181675374507904, "epoch": 5.557038025350233, "step": 16660 }, { "epoch": 5.557038025350233, "ref_ce_loss": 0.1258608102798462, "step": 16660 }, { "epoch": 5.557038025350233, "loss": 0.7951910495758057, "step": 16660 }, { "ce_loss": 0.1366240233182907, "epoch": 5.557038025350233, "step": 16660 }, { "distill_loss": 0.26778659224510193, "epoch": 5.557038025350233, "step": 16660 }, { "epoch": 5.557038025350233, "ref_ce_loss": 0.12254621833562851, "step": 16660 }, { "epoch": 5.560373582388259, "loss": 0.8001, "step": 16670 }, { "epoch": 5.560373582388259, "grad_norm": 1.6090749502182007, "step": 16670 }, { "epoch": 5.560373582388259, "learning_rate": 0.00034700416351743347, "step": 16670 }, { "epoch": 5.560373582388259, "loss": 0.8416956067085266, "step": 16670 }, { "ce_loss": 0.20460933446884155, "epoch": 5.560373582388259, "step": 16670 }, { "distill_loss": 0.3494933843612671, "epoch": 5.560373582388259, "step": 16670 }, { "epoch": 5.560373582388259, "ref_ce_loss": 0.17335765063762665, "step": 16670 }, { "epoch": 5.560373582388259, "loss": 0.6869195699691772, "step": 16670 }, { "ce_loss": 0.16732385754585266, "epoch": 5.560373582388259, "step": 16670 }, { "distill_loss": 0.30530908703804016, "epoch": 5.560373582388259, "step": 16670 }, { "epoch": 5.560373582388259, "ref_ce_loss": 0.16549931466579437, "step": 16670 }, { "epoch": 5.563709139426284, "loss": 0.7658, "step": 16680 }, { "epoch": 5.563709139426284, "grad_norm": 3.558506727218628, "step": 16680 }, { "epoch": 5.563709139426284, "learning_rate": 0.0003465758729834855, "step": 16680 }, { "epoch": 5.563709139426284, "loss": 2.007232189178467, "step": 16680 }, { "ce_loss": 0.20884384214878082, "epoch": 5.563709139426284, "step": 16680 }, { "distill_loss": 0.28895843029022217, "epoch": 5.563709139426284, "step": 16680 }, { "epoch": 5.563709139426284, "ref_ce_loss": 0.1980840563774109, "step": 16680 }, { "epoch": 5.563709139426284, "loss": 0.6627374291419983, "step": 16680 }, { "ce_loss": 0.11687424778938293, "epoch": 5.563709139426284, "step": 16680 }, { "distill_loss": 0.349597305059433, "epoch": 5.563709139426284, "step": 16680 }, { "epoch": 5.563709139426284, "ref_ce_loss": 0.1425715684890747, "step": 16680 }, { "epoch": 5.567044696464309, "loss": 0.8259, "step": 16690 }, { "epoch": 5.567044696464309, "grad_norm": 2.0322229862213135, "step": 16690 }, { "epoch": 5.567044696464309, "learning_rate": 0.00034614764480124234, "step": 16690 }, { "epoch": 5.567044696464309, "loss": 0.8495004773139954, "step": 16690 }, { "ce_loss": 0.21280600130558014, "epoch": 5.567044696464309, "step": 16690 }, { "distill_loss": 0.40029972791671753, "epoch": 5.567044696464309, "step": 16690 }, { "epoch": 5.567044696464309, "ref_ce_loss": 0.14662204682826996, "step": 16690 }, { "epoch": 5.567044696464309, "loss": 0.7268968820571899, "step": 16690 }, { "ce_loss": 0.1668766587972641, "epoch": 5.567044696464309, "step": 16690 }, { "distill_loss": 0.3052133619785309, "epoch": 5.567044696464309, "step": 16690 }, { "epoch": 5.567044696464309, "ref_ce_loss": 0.17897547781467438, "step": 16690 }, { "epoch": 5.570380253502335, "loss": 0.7709, "step": 16700 }, { "epoch": 5.570380253502335, "grad_norm": 2.0620107650756836, "step": 16700 }, { "epoch": 5.570380253502335, "learning_rate": 0.0003457194794704926, "step": 16700 }, { "epoch": 5.570380253502335, "loss": 0.7075026035308838, "step": 16700 }, { "ce_loss": 0.17498093843460083, "epoch": 5.570380253502335, "step": 16700 }, { "distill_loss": 0.26175427436828613, "epoch": 5.570380253502335, "step": 16700 }, { "epoch": 5.570380253502335, "ref_ce_loss": 0.14034396409988403, "step": 16700 }, { "epoch": 5.570380253502335, "loss": 0.8278827667236328, "step": 16700 }, { "ce_loss": 0.18370160460472107, "epoch": 5.570380253502335, "step": 16700 }, { "distill_loss": 0.3056749999523163, "epoch": 5.570380253502335, "step": 16700 }, { "epoch": 5.570380253502335, "ref_ce_loss": 0.16722241044044495, "step": 16700 }, { "epoch": 5.57371581054036, "loss": 0.7071, "step": 16710 }, { "epoch": 5.57371581054036, "grad_norm": 1.755882740020752, "step": 16710 }, { "epoch": 5.57371581054036, "learning_rate": 0.000345291377490951, "step": 16710 }, { "epoch": 5.57371581054036, "loss": 0.7470847368240356, "step": 16710 }, { "ce_loss": 0.187892347574234, "epoch": 5.57371581054036, "step": 16710 }, { "distill_loss": 0.350243479013443, "epoch": 5.57371581054036, "step": 16710 }, { "epoch": 5.57371581054036, "ref_ce_loss": 0.16287052631378174, "step": 16710 }, { "epoch": 5.57371581054036, "loss": 0.8079235553741455, "step": 16710 }, { "ce_loss": 0.18731939792633057, "epoch": 5.57371581054036, "step": 16710 }, { "distill_loss": 0.34790876507759094, "epoch": 5.57371581054036, "step": 16710 }, { "epoch": 5.57371581054036, "ref_ce_loss": 0.14191386103630066, "step": 16710 }, { "epoch": 5.5770513675783855, "loss": 0.7708, "step": 16720 }, { "epoch": 5.5770513675783855, "grad_norm": 1.743152141571045, "step": 16720 }, { "epoch": 5.5770513675783855, "learning_rate": 0.0003448633393622588, "step": 16720 }, { "epoch": 5.5770513675783855, "loss": 0.6767538189888, "step": 16720 }, { "ce_loss": 0.19777166843414307, "epoch": 5.5770513675783855, "step": 16720 }, { "distill_loss": 0.30505162477493286, "epoch": 5.5770513675783855, "step": 16720 }, { "epoch": 5.5770513675783855, "ref_ce_loss": 0.1371420919895172, "step": 16720 }, { "epoch": 5.5770513675783855, "loss": 0.8049007654190063, "step": 16720 }, { "ce_loss": 0.1970512717962265, "epoch": 5.5770513675783855, "step": 16720 }, { "distill_loss": 0.3510020971298218, "epoch": 5.5770513675783855, "step": 16720 }, { "epoch": 5.5770513675783855, "ref_ce_loss": 0.16732670366764069, "step": 16720 }, { "epoch": 5.580386924616411, "loss": 0.7108, "step": 16730 }, { "epoch": 5.580386924616411, "grad_norm": 1.6241730451583862, "step": 16730 }, { "epoch": 5.580386924616411, "learning_rate": 0.00034443536558398255, "step": 16730 }, { "epoch": 5.580386924616411, "loss": 0.7816483974456787, "step": 16730 }, { "ce_loss": 0.1863582879304886, "epoch": 5.580386924616411, "step": 16730 }, { "distill_loss": 0.3359223008155823, "epoch": 5.580386924616411, "step": 16730 }, { "epoch": 5.580386924616411, "ref_ce_loss": 0.1906632035970688, "step": 16730 }, { "epoch": 5.580386924616411, "loss": 0.6090654730796814, "step": 16730 }, { "ce_loss": 0.14092423021793365, "epoch": 5.580386924616411, "step": 16730 }, { "distill_loss": 0.27845025062561035, "epoch": 5.580386924616411, "step": 16730 }, { "epoch": 5.580386924616411, "ref_ce_loss": 0.12028780579566956, "step": 16730 }, { "epoch": 5.583722481654436, "loss": 0.754, "step": 16740 }, { "epoch": 5.583722481654436, "grad_norm": 2.710909128189087, "step": 16740 }, { "epoch": 5.583722481654436, "learning_rate": 0.0003440074566556137, "step": 16740 }, { "epoch": 5.583722481654436, "loss": 0.852007269859314, "step": 16740 }, { "ce_loss": 0.2281089574098587, "epoch": 5.583722481654436, "step": 16740 }, { "distill_loss": 0.3126354515552521, "epoch": 5.583722481654436, "step": 16740 }, { "epoch": 5.583722481654436, "ref_ce_loss": 0.14135898649692535, "step": 16740 }, { "epoch": 5.583722481654436, "loss": 0.4547881484031677, "step": 16740 }, { "ce_loss": 0.09218557178974152, "epoch": 5.583722481654436, "step": 16740 }, { "distill_loss": 0.22557759284973145, "epoch": 5.583722481654436, "step": 16740 }, { "epoch": 5.583722481654436, "ref_ce_loss": 0.09807252138853073, "step": 16740 }, { "epoch": 5.5870580386924615, "loss": 0.6579, "step": 16750 }, { "epoch": 5.5870580386924615, "grad_norm": 1.5583220720291138, "step": 16750 }, { "epoch": 5.5870580386924615, "learning_rate": 0.000343579613076568, "step": 16750 }, { "epoch": 5.5870580386924615, "loss": 0.9663202166557312, "step": 16750 }, { "ce_loss": 0.2363245040178299, "epoch": 5.5870580386924615, "step": 16750 }, { "distill_loss": 0.34112247824668884, "epoch": 5.5870580386924615, "step": 16750 }, { "epoch": 5.5870580386924615, "ref_ce_loss": 0.23673084378242493, "step": 16750 }, { "epoch": 5.5870580386924615, "loss": 0.5706340670585632, "step": 16750 }, { "ce_loss": 0.1683376133441925, "epoch": 5.5870580386924615, "step": 16750 }, { "distill_loss": 0.27087903022766113, "epoch": 5.5870580386924615, "step": 16750 }, { "epoch": 5.5870580386924615, "ref_ce_loss": 0.10743244737386703, "step": 16750 }, { "epoch": 5.590393595730487, "loss": 0.7946, "step": 16760 }, { "epoch": 5.590393595730487, "grad_norm": 1.9442334175109863, "step": 16760 }, { "epoch": 5.590393595730487, "learning_rate": 0.00034315183534618484, "step": 16760 }, { "epoch": 5.590393595730487, "loss": 0.5476641654968262, "step": 16760 }, { "ce_loss": 0.1564968228340149, "epoch": 5.590393595730487, "step": 16760 }, { "distill_loss": 0.2599179446697235, "epoch": 5.590393595730487, "step": 16760 }, { "epoch": 5.590393595730487, "ref_ce_loss": 0.10186018794775009, "step": 16760 }, { "epoch": 5.590393595730487, "loss": 0.5254256129264832, "step": 16760 }, { "ce_loss": 0.13415220379829407, "epoch": 5.590393595730487, "step": 16760 }, { "distill_loss": 0.2226860672235489, "epoch": 5.590393595730487, "step": 16760 }, { "epoch": 5.590393595730487, "ref_ce_loss": 0.116874560713768, "step": 16760 }, { "epoch": 5.593729152768512, "loss": 0.7478, "step": 16770 }, { "epoch": 5.593729152768512, "grad_norm": 2.253359317779541, "step": 16770 }, { "epoch": 5.593729152768512, "learning_rate": 0.00034272412396372707, "step": 16770 }, { "epoch": 5.593729152768512, "loss": 0.89459627866745, "step": 16770 }, { "ce_loss": 0.25067758560180664, "epoch": 5.593729152768512, "step": 16770 }, { "distill_loss": 0.347787469625473, "epoch": 5.593729152768512, "step": 16770 }, { "epoch": 5.593729152768512, "ref_ce_loss": 0.17483831942081451, "step": 16770 }, { "epoch": 5.593729152768512, "loss": 0.5755136609077454, "step": 16770 }, { "ce_loss": 0.1253049522638321, "epoch": 5.593729152768512, "step": 16770 }, { "distill_loss": 0.2704051733016968, "epoch": 5.593729152768512, "step": 16770 }, { "epoch": 5.593729152768512, "ref_ce_loss": 0.1341908723115921, "step": 16770 }, { "epoch": 5.597064709806538, "loss": 0.7443, "step": 16780 }, { "epoch": 5.597064709806538, "grad_norm": 1.7001421451568604, "step": 16780 }, { "epoch": 5.597064709806538, "learning_rate": 0.0003422964794283796, "step": 16780 }, { "epoch": 5.597064709806538, "loss": 0.6673210263252258, "step": 16780 }, { "ce_loss": 0.18451565504074097, "epoch": 5.597064709806538, "step": 16780 }, { "distill_loss": 0.3047248125076294, "epoch": 5.597064709806538, "step": 16780 }, { "epoch": 5.597064709806538, "ref_ce_loss": 0.1554635465145111, "step": 16780 }, { "epoch": 5.597064709806538, "loss": 1.0121006965637207, "step": 16780 }, { "ce_loss": 0.2369699776172638, "epoch": 5.597064709806538, "step": 16780 }, { "distill_loss": 0.33116698265075684, "epoch": 5.597064709806538, "step": 16780 }, { "epoch": 5.597064709806538, "ref_ce_loss": 0.13422130048274994, "step": 16780 }, { "epoch": 5.600400266844563, "loss": 0.7259, "step": 16790 }, { "epoch": 5.600400266844563, "grad_norm": 1.4096695184707642, "step": 16790 }, { "epoch": 5.600400266844563, "learning_rate": 0.00034186890223924995, "step": 16790 }, { "epoch": 5.600400266844563, "loss": 0.9694709181785583, "step": 16790 }, { "ce_loss": 0.22094103693962097, "epoch": 5.600400266844563, "step": 16790 }, { "distill_loss": 0.35448187589645386, "epoch": 5.600400266844563, "step": 16790 }, { "epoch": 5.600400266844563, "ref_ce_loss": 0.16207851469516754, "step": 16790 }, { "epoch": 5.600400266844563, "loss": 1.0303623676300049, "step": 16790 }, { "ce_loss": 0.18831492960453033, "epoch": 5.600400266844563, "step": 16790 }, { "distill_loss": 0.2995215058326721, "epoch": 5.600400266844563, "step": 16790 }, { "epoch": 5.600400266844563, "ref_ce_loss": 0.18782763183116913, "step": 16790 }, { "epoch": 5.603735823882588, "loss": 0.6969, "step": 16800 }, { "epoch": 5.603735823882588, "grad_norm": 1.4540307521820068, "step": 16800 }, { "epoch": 5.603735823882588, "learning_rate": 0.00034144139289536647, "step": 16800 }, { "epoch": 5.603735823882588, "loss": 0.49818724393844604, "step": 16800 }, { "ce_loss": 0.11839782446622849, "epoch": 5.603735823882588, "step": 16800 }, { "distill_loss": 0.2139374017715454, "epoch": 5.603735823882588, "step": 16800 }, { "epoch": 5.603735823882588, "ref_ce_loss": 0.12370803952217102, "step": 16800 }, { "epoch": 5.603735823882588, "loss": 0.44672223925590515, "step": 16800 }, { "ce_loss": 0.13540023565292358, "epoch": 5.603735823882588, "step": 16800 }, { "distill_loss": 0.2240711748600006, "epoch": 5.603735823882588, "step": 16800 }, { "epoch": 5.603735823882588, "ref_ce_loss": 0.08712411671876907, "step": 16800 }, { "epoch": 5.607071380920614, "loss": 0.7494, "step": 16810 }, { "epoch": 5.607071380920614, "grad_norm": 1.7922204732894897, "step": 16810 }, { "epoch": 5.607071380920614, "learning_rate": 0.0003410139518956787, "step": 16810 }, { "epoch": 5.607071380920614, "loss": 0.7348718643188477, "step": 16810 }, { "ce_loss": 0.20842675864696503, "epoch": 5.607071380920614, "step": 16810 }, { "distill_loss": 0.27669811248779297, "epoch": 5.607071380920614, "step": 16810 }, { "epoch": 5.607071380920614, "ref_ce_loss": 0.13902993500232697, "step": 16810 }, { "epoch": 5.607071380920614, "loss": 0.6885568499565125, "step": 16810 }, { "ce_loss": 0.2071564942598343, "epoch": 5.607071380920614, "step": 16810 }, { "distill_loss": 0.2704514265060425, "epoch": 5.607071380920614, "step": 16810 }, { "epoch": 5.607071380920614, "ref_ce_loss": 0.15458983182907104, "step": 16810 }, { "epoch": 5.610406937958639, "loss": 0.712, "step": 16820 }, { "epoch": 5.610406937958639, "grad_norm": 1.3382842540740967, "step": 16820 }, { "epoch": 5.610406937958639, "learning_rate": 0.00034058657973905606, "step": 16820 }, { "epoch": 5.610406937958639, "loss": 0.7045660614967346, "step": 16820 }, { "ce_loss": 0.2180887907743454, "epoch": 5.610406937958639, "step": 16820 }, { "distill_loss": 0.2846103310585022, "epoch": 5.610406937958639, "step": 16820 }, { "epoch": 5.610406937958639, "ref_ce_loss": 0.1844034641981125, "step": 16820 }, { "epoch": 5.610406937958639, "loss": 0.7456596493721008, "step": 16820 }, { "ce_loss": 0.16406604647636414, "epoch": 5.610406937958639, "step": 16820 }, { "distill_loss": 0.2776634097099304, "epoch": 5.610406937958639, "step": 16820 }, { "epoch": 5.610406937958639, "ref_ce_loss": 0.17183589935302734, "step": 16820 }, { "epoch": 5.613742494996664, "loss": 0.7483, "step": 16830 }, { "epoch": 5.613742494996664, "grad_norm": 1.4651700258255005, "step": 16830 }, { "epoch": 5.613742494996664, "learning_rate": 0.0003401592769242881, "step": 16830 }, { "epoch": 5.613742494996664, "loss": 0.6531354784965515, "step": 16830 }, { "ce_loss": 0.18232855200767517, "epoch": 5.613742494996664, "step": 16830 }, { "distill_loss": 0.34719792008399963, "epoch": 5.613742494996664, "step": 16830 }, { "epoch": 5.613742494996664, "ref_ce_loss": 0.1235121637582779, "step": 16830 }, { "epoch": 5.613742494996664, "loss": 0.622502326965332, "step": 16830 }, { "ce_loss": 0.17055979371070862, "epoch": 5.613742494996664, "step": 16830 }, { "distill_loss": 0.2820592522621155, "epoch": 5.613742494996664, "step": 16830 }, { "epoch": 5.613742494996664, "ref_ce_loss": 0.13489072024822235, "step": 16830 }, { "epoch": 5.61707805203469, "loss": 0.6715, "step": 16840 }, { "epoch": 5.61707805203469, "grad_norm": 1.7071387767791748, "step": 16840 }, { "epoch": 5.61707805203469, "learning_rate": 0.0003397320439500832, "step": 16840 }, { "epoch": 5.61707805203469, "loss": 0.9534569978713989, "step": 16840 }, { "ce_loss": 0.18748196959495544, "epoch": 5.61707805203469, "step": 16840 }, { "distill_loss": 0.3256116509437561, "epoch": 5.61707805203469, "step": 16840 }, { "epoch": 5.61707805203469, "ref_ce_loss": 0.13495130836963654, "step": 16840 }, { "epoch": 5.61707805203469, "loss": 0.5402416586875916, "step": 16840 }, { "ce_loss": 0.15914657711982727, "epoch": 5.61707805203469, "step": 16840 }, { "distill_loss": 0.2467021942138672, "epoch": 5.61707805203469, "step": 16840 }, { "epoch": 5.61707805203469, "ref_ce_loss": 0.13433469831943512, "step": 16840 }, { "epoch": 5.620413609072715, "loss": 0.7054, "step": 16850 }, { "epoch": 5.620413609072715, "grad_norm": 2.036802053451538, "step": 16850 }, { "epoch": 5.620413609072715, "learning_rate": 0.00033930488131506803, "step": 16850 }, { "epoch": 5.620413609072715, "loss": 0.7515554428100586, "step": 16850 }, { "ce_loss": 0.21604153513908386, "epoch": 5.620413609072715, "step": 16850 }, { "distill_loss": 0.28629425168037415, "epoch": 5.620413609072715, "step": 16850 }, { "epoch": 5.620413609072715, "ref_ce_loss": 0.14884991943836212, "step": 16850 }, { "epoch": 5.620413609072715, "loss": 1.2459590435028076, "step": 16850 }, { "ce_loss": 0.20419827103614807, "epoch": 5.620413609072715, "step": 16850 }, { "distill_loss": 0.3320619761943817, "epoch": 5.620413609072715, "step": 16850 }, { "epoch": 5.620413609072715, "ref_ce_loss": 0.19198916852474213, "step": 16850 }, { "epoch": 5.62374916611074, "loss": 0.7191, "step": 16860 }, { "epoch": 5.62374916611074, "grad_norm": 1.3319770097732544, "step": 16860 }, { "epoch": 5.62374916611074, "learning_rate": 0.0003388777895177874, "step": 16860 }, { "epoch": 5.62374916611074, "loss": 0.905265748500824, "step": 16860 }, { "ce_loss": 0.2206185758113861, "epoch": 5.62374916611074, "step": 16860 }, { "distill_loss": 0.3126305043697357, "epoch": 5.62374916611074, "step": 16860 }, { "epoch": 5.62374916611074, "ref_ce_loss": 0.16376717388629913, "step": 16860 }, { "epoch": 5.62374916611074, "loss": 0.5250917673110962, "step": 16860 }, { "ce_loss": 0.12526977062225342, "epoch": 5.62374916611074, "step": 16860 }, { "distill_loss": 0.23528367280960083, "epoch": 5.62374916611074, "step": 16860 }, { "epoch": 5.62374916611074, "ref_ce_loss": 0.11867980659008026, "step": 16860 }, { "epoch": 5.627084723148766, "loss": 0.7325, "step": 16870 }, { "epoch": 5.627084723148766, "grad_norm": 1.498423457145691, "step": 16870 }, { "epoch": 5.627084723148766, "learning_rate": 0.00033845076905670353, "step": 16870 }, { "epoch": 5.627084723148766, "loss": 0.6130893230438232, "step": 16870 }, { "ce_loss": 0.2040909081697464, "epoch": 5.627084723148766, "step": 16870 }, { "distill_loss": 0.3104015290737152, "epoch": 5.627084723148766, "step": 16870 }, { "epoch": 5.627084723148766, "ref_ce_loss": 0.09847946465015411, "step": 16870 }, { "epoch": 5.627084723148766, "loss": 0.7601680159568787, "step": 16870 }, { "ce_loss": 0.26276570558547974, "epoch": 5.627084723148766, "step": 16870 }, { "distill_loss": 0.3020651936531067, "epoch": 5.627084723148766, "step": 16870 }, { "epoch": 5.627084723148766, "ref_ce_loss": 0.19520622491836548, "step": 16870 }, { "epoch": 5.630420280186791, "loss": 0.739, "step": 16880 }, { "epoch": 5.630420280186791, "grad_norm": 5.525702476501465, "step": 16880 }, { "epoch": 5.630420280186791, "learning_rate": 0.0003380238204301951, "step": 16880 }, { "epoch": 5.630420280186791, "loss": 0.7757127285003662, "step": 16880 }, { "ce_loss": 0.19095243513584137, "epoch": 5.630420280186791, "step": 16880 }, { "distill_loss": 0.30943331122398376, "epoch": 5.630420280186791, "step": 16880 }, { "epoch": 5.630420280186791, "ref_ce_loss": 0.15145494043827057, "step": 16880 }, { "epoch": 5.630420280186791, "loss": 0.5341856479644775, "step": 16880 }, { "ce_loss": 0.13416753709316254, "epoch": 5.630420280186791, "step": 16880 }, { "distill_loss": 0.2838825583457947, "epoch": 5.630420280186791, "step": 16880 }, { "epoch": 5.630420280186791, "ref_ce_loss": 0.11606307327747345, "step": 16880 }, { "epoch": 5.633755837224816, "loss": 0.6988, "step": 16890 }, { "epoch": 5.633755837224816, "grad_norm": 3.330446720123291, "step": 16890 }, { "epoch": 5.633755837224816, "learning_rate": 0.0003375969441365572, "step": 16890 }, { "epoch": 5.633755837224816, "loss": 0.6190627217292786, "step": 16890 }, { "ce_loss": 0.1746228188276291, "epoch": 5.633755837224816, "step": 16890 }, { "distill_loss": 0.2694947123527527, "epoch": 5.633755837224816, "step": 16890 }, { "epoch": 5.633755837224816, "ref_ce_loss": 0.13239260017871857, "step": 16890 }, { "epoch": 5.633755837224816, "loss": 0.5963748097419739, "step": 16890 }, { "ce_loss": 0.16726279258728027, "epoch": 5.633755837224816, "step": 16890 }, { "distill_loss": 0.26655513048171997, "epoch": 5.633755837224816, "step": 16890 }, { "epoch": 5.633755837224816, "ref_ce_loss": 0.1624668687582016, "step": 16890 }, { "epoch": 5.637091394262842, "loss": 0.7009, "step": 16900 }, { "epoch": 5.637091394262842, "grad_norm": 1.365567922592163, "step": 16900 }, { "epoch": 5.637091394262842, "learning_rate": 0.0003371701406740002, "step": 16900 }, { "epoch": 5.637091394262842, "loss": 0.5843186378479004, "step": 16900 }, { "ce_loss": 0.133130744099617, "epoch": 5.637091394262842, "step": 16900 }, { "distill_loss": 0.2553125023841858, "epoch": 5.637091394262842, "step": 16900 }, { "epoch": 5.637091394262842, "ref_ce_loss": 0.1419401913881302, "step": 16900 }, { "epoch": 5.637091394262842, "loss": 0.5544773936271667, "step": 16900 }, { "ce_loss": 0.16995187103748322, "epoch": 5.637091394262842, "step": 16900 }, { "distill_loss": 0.2469933032989502, "epoch": 5.637091394262842, "step": 16900 }, { "epoch": 5.637091394262842, "ref_ce_loss": 0.13727059960365295, "step": 16900 }, { "epoch": 5.640426951300867, "loss": 0.6852, "step": 16910 }, { "epoch": 5.640426951300867, "grad_norm": 1.9859882593154907, "step": 16910 }, { "epoch": 5.640426951300867, "learning_rate": 0.0003367434105406499, "step": 16910 }, { "epoch": 5.640426951300867, "loss": 0.7333486080169678, "step": 16910 }, { "ce_loss": 0.11421021074056625, "epoch": 5.640426951300867, "step": 16910 }, { "distill_loss": 0.3491458296775818, "epoch": 5.640426951300867, "step": 16910 }, { "epoch": 5.640426951300867, "ref_ce_loss": 0.12321870774030685, "step": 16910 }, { "epoch": 5.640426951300867, "loss": 0.9772500395774841, "step": 16910 }, { "ce_loss": 0.2581791579723358, "epoch": 5.640426951300867, "step": 16910 }, { "distill_loss": 0.38738787174224854, "epoch": 5.640426951300867, "step": 16910 }, { "epoch": 5.640426951300867, "ref_ce_loss": 0.18698999285697937, "step": 16910 }, { "epoch": 5.6437625083388925, "loss": 0.7663, "step": 16920 }, { "epoch": 5.6437625083388925, "grad_norm": 1.4611917734146118, "step": 16920 }, { "epoch": 5.6437625083388925, "learning_rate": 0.0003363167542345462, "step": 16920 }, { "epoch": 5.6437625083388925, "loss": 0.8015724420547485, "step": 16920 }, { "ce_loss": 0.24213604629039764, "epoch": 5.6437625083388925, "step": 16920 }, { "distill_loss": 0.3938767910003662, "epoch": 5.6437625083388925, "step": 16920 }, { "epoch": 5.6437625083388925, "ref_ce_loss": 0.13820527493953705, "step": 16920 }, { "epoch": 5.6437625083388925, "loss": 0.9192935228347778, "step": 16920 }, { "ce_loss": 0.18241621553897858, "epoch": 5.6437625083388925, "step": 16920 }, { "distill_loss": 0.309222012758255, "epoch": 5.6437625083388925, "step": 16920 }, { "epoch": 5.6437625083388925, "ref_ce_loss": 0.16970683634281158, "step": 16920 }, { "epoch": 5.647098065376918, "loss": 0.8112, "step": 16930 }, { "epoch": 5.647098065376918, "grad_norm": 1.9022551774978638, "step": 16930 }, { "epoch": 5.647098065376918, "learning_rate": 0.0003358901722536427, "step": 16930 }, { "epoch": 5.647098065376918, "loss": 0.6248596906661987, "step": 16930 }, { "ce_loss": 0.1475604623556137, "epoch": 5.647098065376918, "step": 16930 }, { "distill_loss": 0.30214765667915344, "epoch": 5.647098065376918, "step": 16930 }, { "epoch": 5.647098065376918, "ref_ce_loss": 0.09551089257001877, "step": 16930 }, { "epoch": 5.647098065376918, "loss": 0.9147480130195618, "step": 16930 }, { "ce_loss": 0.23225298523902893, "epoch": 5.647098065376918, "step": 16930 }, { "distill_loss": 0.4083430767059326, "epoch": 5.647098065376918, "step": 16930 }, { "epoch": 5.647098065376918, "ref_ce_loss": 0.20748469233512878, "step": 16930 }, { "epoch": 5.650433622414943, "loss": 0.7233, "step": 16940 }, { "epoch": 5.650433622414943, "grad_norm": 2.352044105529785, "step": 16940 }, { "epoch": 5.650433622414943, "learning_rate": 0.0003354636650958069, "step": 16940 }, { "epoch": 5.650433622414943, "loss": 0.8966991901397705, "step": 16940 }, { "ce_loss": 0.19321158528327942, "epoch": 5.650433622414943, "step": 16940 }, { "distill_loss": 0.32078060507774353, "epoch": 5.650433622414943, "step": 16940 }, { "epoch": 5.650433622414943, "ref_ce_loss": 0.1405438929796219, "step": 16940 }, { "epoch": 5.650433622414943, "loss": 0.7034854292869568, "step": 16940 }, { "ce_loss": 0.1795550435781479, "epoch": 5.650433622414943, "step": 16940 }, { "distill_loss": 0.3291091024875641, "epoch": 5.650433622414943, "step": 16940 }, { "epoch": 5.650433622414943, "ref_ce_loss": 0.16134437918663025, "step": 16940 }, { "epoch": 5.6537691794529685, "loss": 0.7532, "step": 16950 }, { "epoch": 5.6537691794529685, "grad_norm": 1.9853451251983643, "step": 16950 }, { "epoch": 5.6537691794529685, "learning_rate": 0.0003350372332588183, "step": 16950 }, { "epoch": 5.6537691794529685, "loss": 0.6733770966529846, "step": 16950 }, { "ce_loss": 0.20213384926319122, "epoch": 5.6537691794529685, "step": 16950 }, { "distill_loss": 0.2750031352043152, "epoch": 5.6537691794529685, "step": 16950 }, { "epoch": 5.6537691794529685, "ref_ce_loss": 0.14148816466331482, "step": 16950 }, { "epoch": 5.6537691794529685, "loss": 0.6604835391044617, "step": 16950 }, { "ce_loss": 0.18545019626617432, "epoch": 5.6537691794529685, "step": 16950 }, { "distill_loss": 0.3066851794719696, "epoch": 5.6537691794529685, "step": 16950 }, { "epoch": 5.6537691794529685, "ref_ce_loss": 0.1253993958234787, "step": 16950 }, { "epoch": 5.657104736490994, "loss": 0.7522, "step": 16960 }, { "epoch": 5.657104736490994, "grad_norm": 1.2449283599853516, "step": 16960 }, { "epoch": 5.657104736490994, "learning_rate": 0.0003346108772403688, "step": 16960 }, { "epoch": 5.657104736490994, "loss": 0.7136119604110718, "step": 16960 }, { "ce_loss": 0.1602572649717331, "epoch": 5.657104736490994, "step": 16960 }, { "distill_loss": 0.35394537448883057, "epoch": 5.657104736490994, "step": 16960 }, { "epoch": 5.657104736490994, "ref_ce_loss": 0.15927578508853912, "step": 16960 }, { "epoch": 5.657104736490994, "loss": 0.7708046436309814, "step": 16960 }, { "ce_loss": 0.198671355843544, "epoch": 5.657104736490994, "step": 16960 }, { "distill_loss": 0.32375049591064453, "epoch": 5.657104736490994, "step": 16960 }, { "epoch": 5.657104736490994, "ref_ce_loss": 0.14237548410892487, "step": 16960 }, { "epoch": 5.660440293529019, "loss": 0.8561, "step": 16970 }, { "epoch": 5.660440293529019, "grad_norm": 2.542083263397217, "step": 16970 }, { "epoch": 5.660440293529019, "learning_rate": 0.0003341845975380617, "step": 16970 }, { "epoch": 5.660440293529019, "loss": 0.5686244368553162, "step": 16970 }, { "ce_loss": 0.15705722570419312, "epoch": 5.660440293529019, "step": 16970 }, { "distill_loss": 0.24046377837657928, "epoch": 5.660440293529019, "step": 16970 }, { "epoch": 5.660440293529019, "ref_ce_loss": 0.13262216746807098, "step": 16970 }, { "epoch": 5.660440293529019, "loss": 0.6415659189224243, "step": 16970 }, { "ce_loss": 0.15539200603961945, "epoch": 5.660440293529019, "step": 16970 }, { "distill_loss": 0.3097907602787018, "epoch": 5.660440293529019, "step": 16970 }, { "epoch": 5.660440293529019, "ref_ce_loss": 0.12549258768558502, "step": 16970 }, { "epoch": 5.663775850567045, "loss": 0.781, "step": 16980 }, { "epoch": 5.663775850567045, "grad_norm": 16.903135299682617, "step": 16980 }, { "epoch": 5.663775850567045, "learning_rate": 0.0003337583946494113, "step": 16980 }, { "epoch": 5.663775850567045, "loss": 1.1284102201461792, "step": 16980 }, { "ce_loss": 0.2386995404958725, "epoch": 5.663775850567045, "step": 16980 }, { "distill_loss": 0.3742934763431549, "epoch": 5.663775850567045, "step": 16980 }, { "epoch": 5.663775850567045, "ref_ce_loss": 0.17547878623008728, "step": 16980 }, { "epoch": 5.663775850567045, "loss": 0.7611078023910522, "step": 16980 }, { "ce_loss": 0.21285733580589294, "epoch": 5.663775850567045, "step": 16980 }, { "distill_loss": 0.400984525680542, "epoch": 5.663775850567045, "step": 16980 }, { "epoch": 5.663775850567045, "ref_ce_loss": 0.14690493047237396, "step": 16980 }, { "epoch": 5.66711140760507, "loss": 0.7839, "step": 16990 }, { "epoch": 5.66711140760507, "grad_norm": 2.551992654800415, "step": 16990 }, { "epoch": 5.66711140760507, "learning_rate": 0.00033333226907184216, "step": 16990 }, { "epoch": 5.66711140760507, "loss": 0.5933461785316467, "step": 16990 }, { "ce_loss": 0.17673857510089874, "epoch": 5.66711140760507, "step": 16990 }, { "distill_loss": 0.2522673010826111, "epoch": 5.66711140760507, "step": 16990 }, { "epoch": 5.66711140760507, "ref_ce_loss": 0.136405810713768, "step": 16990 }, { "epoch": 5.66711140760507, "loss": 0.6428248286247253, "step": 16990 }, { "ce_loss": 0.18502990901470184, "epoch": 5.66711140760507, "step": 16990 }, { "distill_loss": 0.2771799564361572, "epoch": 5.66711140760507, "step": 16990 }, { "epoch": 5.66711140760507, "ref_ce_loss": 0.1473894864320755, "step": 16990 }, { "epoch": 5.670446964643095, "loss": 0.7321, "step": 17000 }, { "epoch": 5.670446964643095, "grad_norm": 2.8791425228118896, "step": 17000 }, { "epoch": 5.670446964643095, "learning_rate": 0.00033290622130268885, "step": 17000 }, { "epoch": 5.670446964643095, "loss": 0.7614845037460327, "step": 17000 }, { "ce_loss": 0.14597536623477936, "epoch": 5.670446964643095, "step": 17000 }, { "distill_loss": 0.35557541251182556, "epoch": 5.670446964643095, "step": 17000 }, { "epoch": 5.670446964643095, "ref_ce_loss": 0.1483568251132965, "step": 17000 }, { "epoch": 5.670446964643095, "loss": 0.6029954552650452, "step": 17000 }, { "ce_loss": 0.12021680921316147, "epoch": 5.670446964643095, "step": 17000 }, { "distill_loss": 0.3635213077068329, "epoch": 5.670446964643095, "step": 17000 }, { "epoch": 5.670446964643095, "ref_ce_loss": 0.11895711719989777, "step": 17000 }, { "epoch": 5.673782521681121, "loss": 0.7013, "step": 17010 }, { "epoch": 5.673782521681121, "grad_norm": 1.236177921295166, "step": 17010 }, { "epoch": 5.673782521681121, "learning_rate": 0.0003324802518391948, "step": 17010 }, { "epoch": 5.673782521681121, "loss": 0.6029284000396729, "step": 17010 }, { "ce_loss": 0.12316080182790756, "epoch": 5.673782521681121, "step": 17010 }, { "distill_loss": 0.30217641592025757, "epoch": 5.673782521681121, "step": 17010 }, { "epoch": 5.673782521681121, "ref_ce_loss": 0.1424601823091507, "step": 17010 }, { "epoch": 5.673782521681121, "loss": 0.7892953753471375, "step": 17010 }, { "ce_loss": 0.20078976452350616, "epoch": 5.673782521681121, "step": 17010 }, { "distill_loss": 0.33275288343429565, "epoch": 5.673782521681121, "step": 17010 }, { "epoch": 5.673782521681121, "ref_ce_loss": 0.14471758902072906, "step": 17010 }, { "epoch": 5.677118078719146, "loss": 0.7193, "step": 17020 }, { "epoch": 5.677118078719146, "grad_norm": 1.97862708568573, "step": 17020 }, { "epoch": 5.677118078719146, "learning_rate": 0.00033205436117851237, "step": 17020 }, { "epoch": 5.677118078719146, "loss": 0.6347103118896484, "step": 17020 }, { "ce_loss": 0.11872317641973495, "epoch": 5.677118078719146, "step": 17020 }, { "distill_loss": 0.2712605595588684, "epoch": 5.677118078719146, "step": 17020 }, { "epoch": 5.677118078719146, "ref_ce_loss": 0.15120559930801392, "step": 17020 }, { "epoch": 5.677118078719146, "loss": 0.6937782764434814, "step": 17020 }, { "ce_loss": 0.14366665482521057, "epoch": 5.677118078719146, "step": 17020 }, { "distill_loss": 0.3271183967590332, "epoch": 5.677118078719146, "step": 17020 }, { "epoch": 5.677118078719146, "ref_ce_loss": 0.16461552679538727, "step": 17020 }, { "epoch": 5.680453635757171, "loss": 0.7533, "step": 17030 }, { "epoch": 5.680453635757171, "grad_norm": 1.3897510766983032, "step": 17030 }, { "epoch": 5.680453635757171, "learning_rate": 0.00033162854981770167, "step": 17030 }, { "epoch": 5.680453635757171, "loss": 1.5620931386947632, "step": 17030 }, { "ce_loss": 0.17498187720775604, "epoch": 5.680453635757171, "step": 17030 }, { "distill_loss": 0.35835498571395874, "epoch": 5.680453635757171, "step": 17030 }, { "epoch": 5.680453635757171, "ref_ce_loss": 0.13596129417419434, "step": 17030 }, { "epoch": 5.680453635757171, "loss": 0.7162748575210571, "step": 17030 }, { "ce_loss": 0.20580850541591644, "epoch": 5.680453635757171, "step": 17030 }, { "distill_loss": 0.29947516322135925, "epoch": 5.680453635757171, "step": 17030 }, { "epoch": 5.680453635757171, "ref_ce_loss": 0.1639908254146576, "step": 17030 }, { "epoch": 5.683789192795197, "loss": 0.7724, "step": 17040 }, { "epoch": 5.683789192795197, "grad_norm": 1.2456682920455933, "step": 17040 }, { "epoch": 5.683789192795197, "learning_rate": 0.0003312028182537302, "step": 17040 }, { "epoch": 5.683789192795197, "loss": 0.620795726776123, "step": 17040 }, { "ce_loss": 0.20517976582050323, "epoch": 5.683789192795197, "step": 17040 }, { "distill_loss": 0.29193615913391113, "epoch": 5.683789192795197, "step": 17040 }, { "epoch": 5.683789192795197, "ref_ce_loss": 0.12346983700990677, "step": 17040 }, { "epoch": 5.683789192795197, "loss": 0.9642418622970581, "step": 17040 }, { "ce_loss": 0.2738221287727356, "epoch": 5.683789192795197, "step": 17040 }, { "distill_loss": 0.30171912908554077, "epoch": 5.683789192795197, "step": 17040 }, { "epoch": 5.683789192795197, "ref_ce_loss": 0.1887759566307068, "step": 17040 }, { "epoch": 5.687124749833222, "loss": 0.6817, "step": 17050 }, { "epoch": 5.687124749833222, "grad_norm": 2.327080488204956, "step": 17050 }, { "epoch": 5.687124749833222, "learning_rate": 0.0003307771669834729, "step": 17050 }, { "epoch": 5.687124749833222, "loss": 0.6097831130027771, "step": 17050 }, { "ce_loss": 0.15700411796569824, "epoch": 5.687124749833222, "step": 17050 }, { "distill_loss": 0.2313498556613922, "epoch": 5.687124749833222, "step": 17050 }, { "epoch": 5.687124749833222, "ref_ce_loss": 0.1254100203514099, "step": 17050 }, { "epoch": 5.687124749833222, "loss": 0.6765387058258057, "step": 17050 }, { "ce_loss": 0.21561045944690704, "epoch": 5.687124749833222, "step": 17050 }, { "distill_loss": 0.2853943109512329, "epoch": 5.687124749833222, "step": 17050 }, { "epoch": 5.687124749833222, "ref_ce_loss": 0.14921283721923828, "step": 17050 }, { "epoch": 5.690460306871247, "loss": 0.655, "step": 17060 }, { "epoch": 5.690460306871247, "grad_norm": 1.403415322303772, "step": 17060 }, { "epoch": 5.690460306871247, "learning_rate": 0.0003303515965037104, "step": 17060 }, { "epoch": 5.690460306871247, "loss": 0.7461519837379456, "step": 17060 }, { "ce_loss": 0.16204185783863068, "epoch": 5.690460306871247, "step": 17060 }, { "distill_loss": 0.2274303436279297, "epoch": 5.690460306871247, "step": 17060 }, { "epoch": 5.690460306871247, "ref_ce_loss": 0.11113007366657257, "step": 17060 }, { "epoch": 5.690460306871247, "loss": 0.6564823389053345, "step": 17060 }, { "ce_loss": 0.14736226201057434, "epoch": 5.690460306871247, "step": 17060 }, { "distill_loss": 0.26632723212242126, "epoch": 5.690460306871247, "step": 17060 }, { "epoch": 5.690460306871247, "ref_ce_loss": 0.14623504877090454, "step": 17060 }, { "epoch": 5.693795863909273, "loss": 0.6932, "step": 17070 }, { "epoch": 5.693795863909273, "grad_norm": 2.4529449939727783, "step": 17070 }, { "epoch": 5.693795863909273, "learning_rate": 0.00032992610731112925, "step": 17070 }, { "epoch": 5.693795863909273, "loss": 0.6349560022354126, "step": 17070 }, { "ce_loss": 0.1573634296655655, "epoch": 5.693795863909273, "step": 17070 }, { "distill_loss": 0.31927490234375, "epoch": 5.693795863909273, "step": 17070 }, { "epoch": 5.693795863909273, "ref_ce_loss": 0.12076559662818909, "step": 17070 }, { "epoch": 5.693795863909273, "loss": 0.7947009205818176, "step": 17070 }, { "ce_loss": 0.20859742164611816, "epoch": 5.693795863909273, "step": 17070 }, { "distill_loss": 0.30404800176620483, "epoch": 5.693795863909273, "step": 17070 }, { "epoch": 5.693795863909273, "ref_ce_loss": 0.15086445212364197, "step": 17070 }, { "epoch": 5.697131420947298, "loss": 0.6802, "step": 17080 }, { "epoch": 5.697131420947298, "grad_norm": 1.5436433553695679, "step": 17080 }, { "epoch": 5.697131420947298, "learning_rate": 0.0003295006999023212, "step": 17080 }, { "epoch": 5.697131420947298, "loss": 0.4146399199962616, "step": 17080 }, { "ce_loss": 0.08536611497402191, "epoch": 5.697131420947298, "step": 17080 }, { "distill_loss": 0.21758729219436646, "epoch": 5.697131420947298, "step": 17080 }, { "epoch": 5.697131420947298, "ref_ce_loss": 0.0844498947262764, "step": 17080 }, { "epoch": 5.697131420947298, "loss": 0.8940048813819885, "step": 17080 }, { "ce_loss": 0.2023334503173828, "epoch": 5.697131420947298, "step": 17080 }, { "distill_loss": 0.2644149661064148, "epoch": 5.697131420947298, "step": 17080 }, { "epoch": 5.697131420947298, "ref_ce_loss": 0.17946581542491913, "step": 17080 }, { "epoch": 5.700466977985323, "loss": 0.782, "step": 17090 }, { "epoch": 5.700466977985323, "grad_norm": 4.547679424285889, "step": 17090 }, { "epoch": 5.700466977985323, "learning_rate": 0.00032907537477378234, "step": 17090 }, { "epoch": 5.700466977985323, "loss": 0.9368367195129395, "step": 17090 }, { "ce_loss": 0.22108429670333862, "epoch": 5.700466977985323, "step": 17090 }, { "distill_loss": 0.4351109266281128, "epoch": 5.700466977985323, "step": 17090 }, { "epoch": 5.700466977985323, "ref_ce_loss": 0.1405351758003235, "step": 17090 }, { "epoch": 5.700466977985323, "loss": 0.6494972705841064, "step": 17090 }, { "ce_loss": 0.16825975477695465, "epoch": 5.700466977985323, "step": 17090 }, { "distill_loss": 0.2918960154056549, "epoch": 5.700466977985323, "step": 17090 }, { "epoch": 5.700466977985323, "ref_ce_loss": 0.14743435382843018, "step": 17090 }, { "epoch": 5.703802535023349, "loss": 0.7289, "step": 17100 }, { "epoch": 5.703802535023349, "grad_norm": 3.1013355255126953, "step": 17100 }, { "epoch": 5.703802535023349, "learning_rate": 0.00032865013242191295, "step": 17100 }, { "epoch": 5.703802535023349, "loss": 0.8688377737998962, "step": 17100 }, { "ce_loss": 0.17676447331905365, "epoch": 5.703802535023349, "step": 17100 }, { "distill_loss": 0.3083076477050781, "epoch": 5.703802535023349, "step": 17100 }, { "epoch": 5.703802535023349, "ref_ce_loss": 0.12058064341545105, "step": 17100 }, { "epoch": 5.703802535023349, "loss": 0.9613972902297974, "step": 17100 }, { "ce_loss": 0.24627932906150818, "epoch": 5.703802535023349, "step": 17100 }, { "distill_loss": 0.30985212326049805, "epoch": 5.703802535023349, "step": 17100 }, { "epoch": 5.703802535023349, "ref_ce_loss": 0.2107589840888977, "step": 17100 }, { "epoch": 5.707138092061374, "loss": 0.7592, "step": 17110 }, { "epoch": 5.707138092061374, "grad_norm": 1.4553122520446777, "step": 17110 }, { "epoch": 5.707138092061374, "learning_rate": 0.00032822497334301654, "step": 17110 }, { "epoch": 5.707138092061374, "loss": 0.6339501142501831, "step": 17110 }, { "ce_loss": 0.15719901025295258, "epoch": 5.707138092061374, "step": 17110 }, { "distill_loss": 0.254035621881485, "epoch": 5.707138092061374, "step": 17110 }, { "epoch": 5.707138092061374, "ref_ce_loss": 0.14738909900188446, "step": 17110 }, { "epoch": 5.707138092061374, "loss": 0.8309582471847534, "step": 17110 }, { "ce_loss": 0.2074785679578781, "epoch": 5.707138092061374, "step": 17110 }, { "distill_loss": 0.32274875044822693, "epoch": 5.707138092061374, "step": 17110 }, { "epoch": 5.707138092061374, "ref_ce_loss": 0.17456257343292236, "step": 17110 }, { "epoch": 5.7104736490993995, "loss": 0.7441, "step": 17120 }, { "epoch": 5.7104736490993995, "grad_norm": 1.737001895904541, "step": 17120 }, { "epoch": 5.7104736490993995, "learning_rate": 0.00032779989803329967, "step": 17120 }, { "epoch": 5.7104736490993995, "loss": 0.7288815975189209, "step": 17120 }, { "ce_loss": 0.2120910882949829, "epoch": 5.7104736490993995, "step": 17120 }, { "distill_loss": 0.33151134848594666, "epoch": 5.7104736490993995, "step": 17120 }, { "epoch": 5.7104736490993995, "ref_ce_loss": 0.15215155482292175, "step": 17120 }, { "epoch": 5.7104736490993995, "loss": 0.7121763229370117, "step": 17120 }, { "ce_loss": 0.1438087373971939, "epoch": 5.7104736490993995, "step": 17120 }, { "distill_loss": 0.2862492799758911, "epoch": 5.7104736490993995, "step": 17120 }, { "epoch": 5.7104736490993995, "ref_ce_loss": 0.14188453555107117, "step": 17120 }, { "epoch": 5.713809206137425, "loss": 0.6852, "step": 17130 }, { "epoch": 5.713809206137425, "grad_norm": 1.4312630891799927, "step": 17130 }, { "epoch": 5.713809206137425, "learning_rate": 0.0003273749069888707, "step": 17130 }, { "epoch": 5.713809206137425, "loss": 0.4745047688484192, "step": 17130 }, { "ce_loss": 0.12595778703689575, "epoch": 5.713809206137425, "step": 17130 }, { "distill_loss": 0.23172588646411896, "epoch": 5.713809206137425, "step": 17130 }, { "epoch": 5.713809206137425, "ref_ce_loss": 0.1163245290517807, "step": 17130 }, { "epoch": 5.713809206137425, "loss": 0.5332661867141724, "step": 17130 }, { "ce_loss": 0.1429232954978943, "epoch": 5.713809206137425, "step": 17130 }, { "distill_loss": 0.2580482065677643, "epoch": 5.713809206137425, "step": 17130 }, { "epoch": 5.713809206137425, "ref_ce_loss": 0.1318104863166809, "step": 17130 }, { "epoch": 5.71714476317545, "loss": 0.6839, "step": 17140 }, { "epoch": 5.71714476317545, "grad_norm": 3.277230739593506, "step": 17140 }, { "epoch": 5.71714476317545, "learning_rate": 0.00032695000070574016, "step": 17140 }, { "epoch": 5.71714476317545, "loss": 0.7500348091125488, "step": 17140 }, { "ce_loss": 0.23020581901073456, "epoch": 5.71714476317545, "step": 17140 }, { "distill_loss": 0.36458221077919006, "epoch": 5.71714476317545, "step": 17140 }, { "epoch": 5.71714476317545, "ref_ce_loss": 0.15495723485946655, "step": 17140 }, { "epoch": 5.71714476317545, "loss": 0.7711657881736755, "step": 17140 }, { "ce_loss": 0.2473836988210678, "epoch": 5.71714476317545, "step": 17140 }, { "distill_loss": 0.33617162704467773, "epoch": 5.71714476317545, "step": 17140 }, { "epoch": 5.71714476317545, "ref_ce_loss": 0.12509773671627045, "step": 17140 }, { "epoch": 5.7204803202134755, "loss": 0.712, "step": 17150 }, { "epoch": 5.7204803202134755, "grad_norm": 2.578519582748413, "step": 17150 }, { "epoch": 5.7204803202134755, "learning_rate": 0.00032652517967981913, "step": 17150 }, { "epoch": 5.7204803202134755, "loss": 0.6146144270896912, "step": 17150 }, { "ce_loss": 0.14902712404727936, "epoch": 5.7204803202134755, "step": 17150 }, { "distill_loss": 0.2777806520462036, "epoch": 5.7204803202134755, "step": 17150 }, { "epoch": 5.7204803202134755, "ref_ce_loss": 0.14889934659004211, "step": 17150 }, { "epoch": 5.7204803202134755, "loss": 0.795230507850647, "step": 17150 }, { "ce_loss": 0.2181692123413086, "epoch": 5.7204803202134755, "step": 17150 }, { "distill_loss": 0.2780587077140808, "epoch": 5.7204803202134755, "step": 17150 }, { "epoch": 5.7204803202134755, "ref_ce_loss": 0.20927740633487701, "step": 17150 }, { "epoch": 5.723815877251501, "loss": 0.6756, "step": 17160 }, { "epoch": 5.723815877251501, "grad_norm": 1.8128324747085571, "step": 17160 }, { "epoch": 5.723815877251501, "learning_rate": 0.00032610044440691975, "step": 17160 }, { "epoch": 5.723815877251501, "loss": 0.5578626990318298, "step": 17160 }, { "ce_loss": 0.1543758362531662, "epoch": 5.723815877251501, "step": 17160 }, { "distill_loss": 0.257249116897583, "epoch": 5.723815877251501, "step": 17160 }, { "epoch": 5.723815877251501, "ref_ce_loss": 0.11274212598800659, "step": 17160 }, { "epoch": 5.723815877251501, "loss": 0.594210147857666, "step": 17160 }, { "ce_loss": 0.14818356931209564, "epoch": 5.723815877251501, "step": 17160 }, { "distill_loss": 0.25856828689575195, "epoch": 5.723815877251501, "step": 17160 }, { "epoch": 5.723815877251501, "ref_ce_loss": 0.147581085562706, "step": 17160 }, { "epoch": 5.727151434289526, "loss": 0.6834, "step": 17170 }, { "epoch": 5.727151434289526, "grad_norm": 1.5219569206237793, "step": 17170 }, { "epoch": 5.727151434289526, "learning_rate": 0.0003256757953827537, "step": 17170 }, { "epoch": 5.727151434289526, "loss": 0.7890151739120483, "step": 17170 }, { "ce_loss": 0.17950721085071564, "epoch": 5.727151434289526, "step": 17170 }, { "distill_loss": 0.2605040967464447, "epoch": 5.727151434289526, "step": 17170 }, { "epoch": 5.727151434289526, "ref_ce_loss": 0.19137854874134064, "step": 17170 }, { "epoch": 5.727151434289526, "loss": 0.703957736492157, "step": 17170 }, { "ce_loss": 0.15624277293682098, "epoch": 5.727151434289526, "step": 17170 }, { "distill_loss": 0.23440703749656677, "epoch": 5.727151434289526, "step": 17170 }, { "epoch": 5.727151434289526, "ref_ce_loss": 0.1445663720369339, "step": 17170 }, { "epoch": 5.730486991327552, "loss": 0.7209, "step": 17180 }, { "epoch": 5.730486991327552, "grad_norm": 1.7926363945007324, "step": 17180 }, { "epoch": 5.730486991327552, "learning_rate": 0.000325251233102932, "step": 17180 }, { "epoch": 5.730486991327552, "loss": 0.47458457946777344, "step": 17180 }, { "ce_loss": 0.10066297650337219, "epoch": 5.730486991327552, "step": 17180 }, { "distill_loss": 0.23481415212154388, "epoch": 5.730486991327552, "step": 17180 }, { "epoch": 5.730486991327552, "ref_ce_loss": 0.08710433542728424, "step": 17180 }, { "epoch": 5.730486991327552, "loss": 0.8608106970787048, "step": 17180 }, { "ce_loss": 0.24073518812656403, "epoch": 5.730486991327552, "step": 17180 }, { "distill_loss": 0.4313311278820038, "epoch": 5.730486991327552, "step": 17180 }, { "epoch": 5.730486991327552, "ref_ce_loss": 0.14764869213104248, "step": 17180 }, { "epoch": 5.733822548365577, "loss": 0.7745, "step": 17190 }, { "epoch": 5.733822548365577, "grad_norm": 2.1348068714141846, "step": 17190 }, { "epoch": 5.733822548365577, "learning_rate": 0.0003248267580629647, "step": 17190 }, { "epoch": 5.733822548365577, "loss": 0.9487937688827515, "step": 17190 }, { "ce_loss": 0.24654243886470795, "epoch": 5.733822548365577, "step": 17190 }, { "distill_loss": 0.3833885192871094, "epoch": 5.733822548365577, "step": 17190 }, { "epoch": 5.733822548365577, "ref_ce_loss": 0.19179615378379822, "step": 17190 }, { "epoch": 5.733822548365577, "loss": 0.5966364741325378, "step": 17190 }, { "ce_loss": 0.16048943996429443, "epoch": 5.733822548365577, "step": 17190 }, { "distill_loss": 0.29141372442245483, "epoch": 5.733822548365577, "step": 17190 }, { "epoch": 5.733822548365577, "ref_ce_loss": 0.10366712510585785, "step": 17190 }, { "epoch": 5.737158105403602, "loss": 0.6736, "step": 17200 }, { "epoch": 5.737158105403602, "grad_norm": 1.6799206733703613, "step": 17200 }, { "epoch": 5.737158105403602, "learning_rate": 0.00032440237075825954, "step": 17200 }, { "epoch": 5.737158105403602, "loss": 0.7083799839019775, "step": 17200 }, { "ce_loss": 0.13097502291202545, "epoch": 5.737158105403602, "step": 17200 }, { "distill_loss": 0.27484971284866333, "epoch": 5.737158105403602, "step": 17200 }, { "epoch": 5.737158105403602, "ref_ce_loss": 0.08867395669221878, "step": 17200 }, { "epoch": 5.737158105403602, "loss": 0.6861226558685303, "step": 17200 }, { "ce_loss": 0.12615366280078888, "epoch": 5.737158105403602, "step": 17200 }, { "distill_loss": 0.2812725305557251, "epoch": 5.737158105403602, "step": 17200 }, { "epoch": 5.737158105403602, "ref_ce_loss": 0.1325719952583313, "step": 17200 }, { "epoch": 5.740493662441628, "loss": 0.7134, "step": 17210 }, { "epoch": 5.740493662441628, "grad_norm": 1.6573398113250732, "step": 17210 }, { "epoch": 5.740493662441628, "learning_rate": 0.00032397807168412244, "step": 17210 }, { "epoch": 5.740493662441628, "loss": 0.7408238649368286, "step": 17210 }, { "ce_loss": 0.14878395199775696, "epoch": 5.740493662441628, "step": 17210 }, { "distill_loss": 0.2950376272201538, "epoch": 5.740493662441628, "step": 17210 }, { "epoch": 5.740493662441628, "ref_ce_loss": 0.11640201508998871, "step": 17210 }, { "epoch": 5.740493662441628, "loss": 0.7037614583969116, "step": 17210 }, { "ce_loss": 0.19889037311077118, "epoch": 5.740493662441628, "step": 17210 }, { "distill_loss": 0.29107096791267395, "epoch": 5.740493662441628, "step": 17210 }, { "epoch": 5.740493662441628, "ref_ce_loss": 0.1580825001001358, "step": 17210 }, { "epoch": 5.743829219479653, "loss": 0.7708, "step": 17220 }, { "epoch": 5.743829219479653, "grad_norm": 1.7908594608306885, "step": 17220 }, { "epoch": 5.743829219479653, "learning_rate": 0.00032355386133575594, "step": 17220 }, { "epoch": 5.743829219479653, "loss": 0.6606780886650085, "step": 17220 }, { "ce_loss": 0.2049364149570465, "epoch": 5.743829219479653, "step": 17220 }, { "distill_loss": 0.23314324021339417, "epoch": 5.743829219479653, "step": 17220 }, { "epoch": 5.743829219479653, "ref_ce_loss": 0.16539844870567322, "step": 17220 }, { "epoch": 5.743829219479653, "loss": 0.7575362920761108, "step": 17220 }, { "ce_loss": 0.18157583475112915, "epoch": 5.743829219479653, "step": 17220 }, { "distill_loss": 0.32159459590911865, "epoch": 5.743829219479653, "step": 17220 }, { "epoch": 5.743829219479653, "ref_ce_loss": 0.14340363442897797, "step": 17220 }, { "epoch": 5.747164776517678, "loss": 0.7087, "step": 17230 }, { "epoch": 5.747164776517678, "grad_norm": 1.848981499671936, "step": 17230 }, { "epoch": 5.747164776517678, "learning_rate": 0.0003231297402082592, "step": 17230 }, { "epoch": 5.747164776517678, "loss": 0.6597747206687927, "step": 17230 }, { "ce_loss": 0.17482557892799377, "epoch": 5.747164776517678, "step": 17230 }, { "distill_loss": 0.3133894205093384, "epoch": 5.747164776517678, "step": 17230 }, { "epoch": 5.747164776517678, "ref_ce_loss": 0.13527342677116394, "step": 17230 }, { "epoch": 5.747164776517678, "loss": 0.7282883524894714, "step": 17230 }, { "ce_loss": 0.15620630979537964, "epoch": 5.747164776517678, "step": 17230 }, { "distill_loss": 0.37796542048454285, "epoch": 5.747164776517678, "step": 17230 }, { "epoch": 5.747164776517678, "ref_ce_loss": 0.1426892727613449, "step": 17230 }, { "epoch": 5.750500333555704, "loss": 0.696, "step": 17240 }, { "epoch": 5.750500333555704, "grad_norm": 1.7952791452407837, "step": 17240 }, { "epoch": 5.750500333555704, "learning_rate": 0.0003227057087966273, "step": 17240 }, { "epoch": 5.750500333555704, "loss": 0.6785531640052795, "step": 17240 }, { "ce_loss": 0.1512996107339859, "epoch": 5.750500333555704, "step": 17240 }, { "distill_loss": 0.3508850634098053, "epoch": 5.750500333555704, "step": 17240 }, { "epoch": 5.750500333555704, "ref_ce_loss": 0.1758999079465866, "step": 17240 }, { "epoch": 5.750500333555704, "loss": 0.7377484440803528, "step": 17240 }, { "ce_loss": 0.173048198223114, "epoch": 5.750500333555704, "step": 17240 }, { "distill_loss": 0.33531084656715393, "epoch": 5.750500333555704, "step": 17240 }, { "epoch": 5.750500333555704, "ref_ce_loss": 0.11774720996618271, "step": 17240 }, { "epoch": 5.753835890593729, "loss": 0.6903, "step": 17250 }, { "epoch": 5.753835890593729, "grad_norm": 1.3963509798049927, "step": 17250 }, { "epoch": 5.753835890593729, "learning_rate": 0.00032228176759575036, "step": 17250 }, { "epoch": 5.753835890593729, "loss": 0.599527895450592, "step": 17250 }, { "ce_loss": 0.15814733505249023, "epoch": 5.753835890593729, "step": 17250 }, { "distill_loss": 0.27153316140174866, "epoch": 5.753835890593729, "step": 17250 }, { "epoch": 5.753835890593729, "ref_ce_loss": 0.16702882945537567, "step": 17250 }, { "epoch": 5.753835890593729, "loss": 0.7268506288528442, "step": 17250 }, { "ce_loss": 0.1674957573413849, "epoch": 5.753835890593729, "step": 17250 }, { "distill_loss": 0.33976811170578003, "epoch": 5.753835890593729, "step": 17250 }, { "epoch": 5.753835890593729, "ref_ce_loss": 0.11406680941581726, "step": 17250 }, { "epoch": 5.757171447631754, "loss": 0.694, "step": 17260 }, { "epoch": 5.757171447631754, "grad_norm": 1.7198892831802368, "step": 17260 }, { "epoch": 5.757171447631754, "learning_rate": 0.0003218579171004134, "step": 17260 }, { "epoch": 5.757171447631754, "loss": 0.6412903666496277, "step": 17260 }, { "ce_loss": 0.17422105371952057, "epoch": 5.757171447631754, "step": 17260 }, { "distill_loss": 0.30146506428718567, "epoch": 5.757171447631754, "step": 17260 }, { "epoch": 5.757171447631754, "ref_ce_loss": 0.1300029456615448, "step": 17260 }, { "epoch": 5.757171447631754, "loss": 0.7008032202720642, "step": 17260 }, { "ce_loss": 0.17944203317165375, "epoch": 5.757171447631754, "step": 17260 }, { "distill_loss": 0.3309924602508545, "epoch": 5.757171447631754, "step": 17260 }, { "epoch": 5.757171447631754, "ref_ce_loss": 0.14378021657466888, "step": 17260 }, { "epoch": 5.76050700466978, "loss": 0.687, "step": 17270 }, { "epoch": 5.76050700466978, "grad_norm": 1.684322714805603, "step": 17270 }, { "epoch": 5.76050700466978, "learning_rate": 0.0003214341578052958, "step": 17270 }, { "epoch": 5.76050700466978, "loss": 0.8886874318122864, "step": 17270 }, { "ce_loss": 0.17103131115436554, "epoch": 5.76050700466978, "step": 17270 }, { "distill_loss": 0.24941280484199524, "epoch": 5.76050700466978, "step": 17270 }, { "epoch": 5.76050700466978, "ref_ce_loss": 0.1687363088130951, "step": 17270 }, { "epoch": 5.76050700466978, "loss": 0.7393255829811096, "step": 17270 }, { "ce_loss": 0.1802368462085724, "epoch": 5.76050700466978, "step": 17270 }, { "distill_loss": 0.28230059146881104, "epoch": 5.76050700466978, "step": 17270 }, { "epoch": 5.76050700466978, "ref_ce_loss": 0.14546415209770203, "step": 17270 }, { "epoch": 5.763842561707805, "loss": 0.7536, "step": 17280 }, { "epoch": 5.763842561707805, "grad_norm": 2.5762276649475098, "step": 17280 }, { "epoch": 5.763842561707805, "learning_rate": 0.0003210104902049699, "step": 17280 }, { "epoch": 5.763842561707805, "loss": 0.6105315685272217, "step": 17280 }, { "ce_loss": 0.14770327508449554, "epoch": 5.763842561707805, "step": 17280 }, { "distill_loss": 0.21739080548286438, "epoch": 5.763842561707805, "step": 17280 }, { "epoch": 5.763842561707805, "ref_ce_loss": 0.15710628032684326, "step": 17280 }, { "epoch": 5.763842561707805, "loss": 0.5762127041816711, "step": 17280 }, { "ce_loss": 0.14088711142539978, "epoch": 5.763842561707805, "step": 17280 }, { "distill_loss": 0.25248652696609497, "epoch": 5.763842561707805, "step": 17280 }, { "epoch": 5.763842561707805, "ref_ce_loss": 0.1329662948846817, "step": 17280 }, { "epoch": 5.76717811874583, "loss": 0.6322, "step": 17290 }, { "epoch": 5.76717811874583, "grad_norm": 1.545915126800537, "step": 17290 }, { "epoch": 5.76717811874583, "learning_rate": 0.0003205869147939017, "step": 17290 }, { "epoch": 5.76717811874583, "loss": 0.5105368494987488, "step": 17290 }, { "ce_loss": 0.1355862319469452, "epoch": 5.76717811874583, "step": 17290 }, { "distill_loss": 0.22150439023971558, "epoch": 5.76717811874583, "step": 17290 }, { "epoch": 5.76717811874583, "ref_ce_loss": 0.12638546526432037, "step": 17290 }, { "epoch": 5.76717811874583, "loss": 0.5935248136520386, "step": 17290 }, { "ce_loss": 0.13656610250473022, "epoch": 5.76717811874583, "step": 17290 }, { "distill_loss": 0.21289098262786865, "epoch": 5.76717811874583, "step": 17290 }, { "epoch": 5.76717811874583, "ref_ce_loss": 0.1504911184310913, "step": 17290 }, { "epoch": 5.770513675783856, "loss": 0.6362, "step": 17300 }, { "epoch": 5.770513675783856, "grad_norm": 2.608363151550293, "step": 17300 }, { "epoch": 5.770513675783856, "learning_rate": 0.00032016343206644907, "step": 17300 }, { "epoch": 5.770513675783856, "loss": 0.6189454197883606, "step": 17300 }, { "ce_loss": 0.15753212571144104, "epoch": 5.770513675783856, "step": 17300 }, { "distill_loss": 0.2742728292942047, "epoch": 5.770513675783856, "step": 17300 }, { "epoch": 5.770513675783856, "ref_ce_loss": 0.1473052054643631, "step": 17300 }, { "epoch": 5.770513675783856, "loss": 0.6348935961723328, "step": 17300 }, { "ce_loss": 0.17919529974460602, "epoch": 5.770513675783856, "step": 17300 }, { "distill_loss": 0.2727394104003906, "epoch": 5.770513675783856, "step": 17300 }, { "epoch": 5.770513675783856, "ref_ce_loss": 0.1823030263185501, "step": 17300 }, { "epoch": 5.773849232821881, "loss": 0.6421, "step": 17310 }, { "epoch": 5.773849232821881, "grad_norm": 1.9108775854110718, "step": 17310 }, { "epoch": 5.773849232821881, "learning_rate": 0.00031974004251686205, "step": 17310 }, { "epoch": 5.773849232821881, "loss": 0.5485625267028809, "step": 17310 }, { "ce_loss": 0.1477338820695877, "epoch": 5.773849232821881, "step": 17310 }, { "distill_loss": 0.24226287007331848, "epoch": 5.773849232821881, "step": 17310 }, { "epoch": 5.773849232821881, "ref_ce_loss": 0.1192854717373848, "step": 17310 }, { "epoch": 5.773849232821881, "loss": 0.480712890625, "step": 17310 }, { "ce_loss": 0.12280880659818649, "epoch": 5.773849232821881, "step": 17310 }, { "distill_loss": 0.2515189051628113, "epoch": 5.773849232821881, "step": 17310 }, { "epoch": 5.773849232821881, "ref_ce_loss": 0.10570239275693893, "step": 17310 }, { "epoch": 5.7771847898599065, "loss": 0.6183, "step": 17320 }, { "epoch": 5.7771847898599065, "grad_norm": 1.7842615842819214, "step": 17320 }, { "epoch": 5.7771847898599065, "learning_rate": 0.00031931674663928164, "step": 17320 }, { "epoch": 5.7771847898599065, "loss": 0.5594913959503174, "step": 17320 }, { "ce_loss": 0.11906707286834717, "epoch": 5.7771847898599065, "step": 17320 }, { "distill_loss": 0.2082429677248001, "epoch": 5.7771847898599065, "step": 17320 }, { "epoch": 5.7771847898599065, "ref_ce_loss": 0.1027420163154602, "step": 17320 }, { "epoch": 5.7771847898599065, "loss": 0.554329514503479, "step": 17320 }, { "ce_loss": 0.14408791065216064, "epoch": 5.7771847898599065, "step": 17320 }, { "distill_loss": 0.22791585326194763, "epoch": 5.7771847898599065, "step": 17320 }, { "epoch": 5.7771847898599065, "ref_ce_loss": 0.1325961947441101, "step": 17320 }, { "epoch": 5.780520346897932, "loss": 0.6579, "step": 17330 }, { "epoch": 5.780520346897932, "grad_norm": 3.7666094303131104, "step": 17330 }, { "epoch": 5.780520346897932, "learning_rate": 0.00031889354492773987, "step": 17330 }, { "epoch": 5.780520346897932, "loss": 0.8298982381820679, "step": 17330 }, { "ce_loss": 0.19762609899044037, "epoch": 5.780520346897932, "step": 17330 }, { "distill_loss": 0.28875911235809326, "epoch": 5.780520346897932, "step": 17330 }, { "epoch": 5.780520346897932, "ref_ce_loss": 0.1450854390859604, "step": 17330 }, { "epoch": 5.780520346897932, "loss": 0.626959502696991, "step": 17330 }, { "ce_loss": 0.15268436074256897, "epoch": 5.780520346897932, "step": 17330 }, { "distill_loss": 0.29340776801109314, "epoch": 5.780520346897932, "step": 17330 }, { "epoch": 5.780520346897932, "ref_ce_loss": 0.13883355259895325, "step": 17330 }, { "epoch": 5.783855903935957, "loss": 0.731, "step": 17340 }, { "epoch": 5.783855903935957, "grad_norm": 1.4649722576141357, "step": 17340 }, { "epoch": 5.783855903935957, "learning_rate": 0.0003184704378761585, "step": 17340 }, { "epoch": 5.783855903935957, "loss": 0.8531560301780701, "step": 17340 }, { "ce_loss": 0.1586577296257019, "epoch": 5.783855903935957, "step": 17340 }, { "distill_loss": 0.31400057673454285, "epoch": 5.783855903935957, "step": 17340 }, { "epoch": 5.783855903935957, "ref_ce_loss": 0.12301208823919296, "step": 17340 }, { "epoch": 5.783855903935957, "loss": 0.8586217761039734, "step": 17340 }, { "ce_loss": 0.2559804320335388, "epoch": 5.783855903935957, "step": 17340 }, { "distill_loss": 0.3865933418273926, "epoch": 5.783855903935957, "step": 17340 }, { "epoch": 5.783855903935957, "ref_ce_loss": 0.17483940720558167, "step": 17340 }, { "epoch": 5.7871914609739825, "loss": 0.7778, "step": 17350 }, { "epoch": 5.7871914609739825, "grad_norm": 1.4831793308258057, "step": 17350 }, { "epoch": 5.7871914609739825, "learning_rate": 0.0003180474259783492, "step": 17350 }, { "epoch": 5.7871914609739825, "loss": 0.7598646879196167, "step": 17350 }, { "ce_loss": 0.19547435641288757, "epoch": 5.7871914609739825, "step": 17350 }, { "distill_loss": 0.3320785164833069, "epoch": 5.7871914609739825, "step": 17350 }, { "epoch": 5.7871914609739825, "ref_ce_loss": 0.18037301301956177, "step": 17350 }, { "epoch": 5.7871914609739825, "loss": 0.6318597197532654, "step": 17350 }, { "ce_loss": 0.17297153174877167, "epoch": 5.7871914609739825, "step": 17350 }, { "distill_loss": 0.26058369874954224, "epoch": 5.7871914609739825, "step": 17350 }, { "epoch": 5.7871914609739825, "ref_ce_loss": 0.15463228523731232, "step": 17350 }, { "epoch": 5.790527018012008, "loss": 0.6925, "step": 17360 }, { "epoch": 5.790527018012008, "grad_norm": 2.5347464084625244, "step": 17360 }, { "epoch": 5.790527018012008, "learning_rate": 0.00031762450972801215, "step": 17360 }, { "epoch": 5.790527018012008, "loss": 0.7156696319580078, "step": 17360 }, { "ce_loss": 0.16093580424785614, "epoch": 5.790527018012008, "step": 17360 }, { "distill_loss": 0.3110997676849365, "epoch": 5.790527018012008, "step": 17360 }, { "epoch": 5.790527018012008, "ref_ce_loss": 0.1486850380897522, "step": 17360 }, { "epoch": 5.790527018012008, "loss": 0.6132699847221375, "step": 17360 }, { "ce_loss": 0.12380540370941162, "epoch": 5.790527018012008, "step": 17360 }, { "distill_loss": 0.262518048286438, "epoch": 5.790527018012008, "step": 17360 }, { "epoch": 5.790527018012008, "ref_ce_loss": 0.12827558815479279, "step": 17360 }, { "epoch": 5.793862575050033, "loss": 0.6978, "step": 17370 }, { "epoch": 5.793862575050033, "grad_norm": 1.4433140754699707, "step": 17370 }, { "epoch": 5.793862575050033, "learning_rate": 0.0003172016896187361, "step": 17370 }, { "epoch": 5.793862575050033, "loss": 0.7008747458457947, "step": 17370 }, { "ce_loss": 0.13500070571899414, "epoch": 5.793862575050033, "step": 17370 }, { "distill_loss": 0.27394983172416687, "epoch": 5.793862575050033, "step": 17370 }, { "epoch": 5.793862575050033, "ref_ce_loss": 0.12862640619277954, "step": 17370 }, { "epoch": 5.793862575050033, "loss": 0.5868788361549377, "step": 17370 }, { "ce_loss": 0.15697240829467773, "epoch": 5.793862575050033, "step": 17370 }, { "distill_loss": 0.26224878430366516, "epoch": 5.793862575050033, "step": 17370 }, { "epoch": 5.793862575050033, "ref_ce_loss": 0.14263705909252167, "step": 17370 }, { "epoch": 5.797198132088059, "loss": 0.6766, "step": 17380 }, { "epoch": 5.797198132088059, "grad_norm": 1.6639288663864136, "step": 17380 }, { "epoch": 5.797198132088059, "learning_rate": 0.00031677896614399796, "step": 17380 }, { "epoch": 5.797198132088059, "loss": 1.1503751277923584, "step": 17380 }, { "ce_loss": 0.24827058613300323, "epoch": 5.797198132088059, "step": 17380 }, { "distill_loss": 0.39232802391052246, "epoch": 5.797198132088059, "step": 17380 }, { "epoch": 5.797198132088059, "ref_ce_loss": 0.2320650964975357, "step": 17380 }, { "epoch": 5.797198132088059, "loss": 0.6632513999938965, "step": 17380 }, { "ce_loss": 0.1472640186548233, "epoch": 5.797198132088059, "step": 17380 }, { "distill_loss": 0.32942354679107666, "epoch": 5.797198132088059, "step": 17380 }, { "epoch": 5.797198132088059, "ref_ce_loss": 0.14712239801883698, "step": 17380 }, { "epoch": 5.800533689126084, "loss": 0.7225, "step": 17390 }, { "epoch": 5.800533689126084, "grad_norm": 1.553830623626709, "step": 17390 }, { "epoch": 5.800533689126084, "learning_rate": 0.0003163563397971611, "step": 17390 }, { "epoch": 5.800533689126084, "loss": 0.5071688294410706, "step": 17390 }, { "ce_loss": 0.1248534694314003, "epoch": 5.800533689126084, "step": 17390 }, { "distill_loss": 0.2655135989189148, "epoch": 5.800533689126084, "step": 17390 }, { "epoch": 5.800533689126084, "ref_ce_loss": 0.1165119931101799, "step": 17390 }, { "epoch": 5.800533689126084, "loss": 0.7314570546150208, "step": 17390 }, { "ce_loss": 0.20705215632915497, "epoch": 5.800533689126084, "step": 17390 }, { "distill_loss": 0.272818922996521, "epoch": 5.800533689126084, "step": 17390 }, { "epoch": 5.800533689126084, "ref_ce_loss": 0.1565602719783783, "step": 17390 }, { "epoch": 5.803869246164109, "loss": 0.7273, "step": 17400 }, { "epoch": 5.803869246164109, "grad_norm": 2.529906988143921, "step": 17400 }, { "epoch": 5.803869246164109, "learning_rate": 0.0003159338110714762, "step": 17400 }, { "epoch": 5.803869246164109, "loss": 0.638818085193634, "step": 17400 }, { "ce_loss": 0.1544320285320282, "epoch": 5.803869246164109, "step": 17400 }, { "distill_loss": 0.33618006110191345, "epoch": 5.803869246164109, "step": 17400 }, { "epoch": 5.803869246164109, "ref_ce_loss": 0.14788387715816498, "step": 17400 }, { "epoch": 5.803869246164109, "loss": 0.7675939202308655, "step": 17400 }, { "ce_loss": 0.1987285017967224, "epoch": 5.803869246164109, "step": 17400 }, { "distill_loss": 0.3170737326145172, "epoch": 5.803869246164109, "step": 17400 }, { "epoch": 5.803869246164109, "ref_ce_loss": 0.15776485204696655, "step": 17400 }, { "epoch": 5.807204803202135, "loss": 0.703, "step": 17410 }, { "epoch": 5.807204803202135, "grad_norm": 1.9354774951934814, "step": 17410 }, { "epoch": 5.807204803202135, "learning_rate": 0.0003155113804600797, "step": 17410 }, { "epoch": 5.807204803202135, "loss": 0.8200332522392273, "step": 17410 }, { "ce_loss": 0.21878603100776672, "epoch": 5.807204803202135, "step": 17410 }, { "distill_loss": 0.3642941117286682, "epoch": 5.807204803202135, "step": 17410 }, { "epoch": 5.807204803202135, "ref_ce_loss": 0.15945042669773102, "step": 17410 }, { "epoch": 5.807204803202135, "loss": 0.9217702150344849, "step": 17410 }, { "ce_loss": 0.2335110753774643, "epoch": 5.807204803202135, "step": 17410 }, { "distill_loss": 0.36487433314323425, "epoch": 5.807204803202135, "step": 17410 }, { "epoch": 5.807204803202135, "ref_ce_loss": 0.20070920884609222, "step": 17410 }, { "epoch": 5.81054036024016, "loss": 0.7387, "step": 17420 }, { "epoch": 5.81054036024016, "grad_norm": 1.5643854141235352, "step": 17420 }, { "epoch": 5.81054036024016, "learning_rate": 0.00031508904845599356, "step": 17420 }, { "epoch": 5.81054036024016, "loss": 0.7375587224960327, "step": 17420 }, { "ce_loss": 0.20015650987625122, "epoch": 5.81054036024016, "step": 17420 }, { "distill_loss": 0.32561731338500977, "epoch": 5.81054036024016, "step": 17420 }, { "epoch": 5.81054036024016, "ref_ce_loss": 0.15708674490451813, "step": 17420 }, { "epoch": 5.81054036024016, "loss": 0.8183858394622803, "step": 17420 }, { "ce_loss": 0.2078881412744522, "epoch": 5.81054036024016, "step": 17420 }, { "distill_loss": 0.29002147912979126, "epoch": 5.81054036024016, "step": 17420 }, { "epoch": 5.81054036024016, "ref_ce_loss": 0.12911447882652283, "step": 17420 }, { "epoch": 5.813875917278185, "loss": 0.7037, "step": 17430 }, { "epoch": 5.813875917278185, "grad_norm": 1.4947174787521362, "step": 17430 }, { "epoch": 5.813875917278185, "learning_rate": 0.0003146668155521247, "step": 17430 }, { "epoch": 5.813875917278185, "loss": 0.4583217203617096, "step": 17430 }, { "ce_loss": 0.0980353131890297, "epoch": 5.813875917278185, "step": 17430 }, { "distill_loss": 0.20246006548404694, "epoch": 5.813875917278185, "step": 17430 }, { "epoch": 5.813875917278185, "ref_ce_loss": 0.10573271661996841, "step": 17430 }, { "epoch": 5.813875917278185, "loss": 0.6630755066871643, "step": 17430 }, { "ce_loss": 0.1648399978876114, "epoch": 5.813875917278185, "step": 17430 }, { "distill_loss": 0.2990220785140991, "epoch": 5.813875917278185, "step": 17430 }, { "epoch": 5.813875917278185, "ref_ce_loss": 0.1747708022594452, "step": 17430 }, { "epoch": 5.817211474316211, "loss": 0.6581, "step": 17440 }, { "epoch": 5.817211474316211, "grad_norm": 2.1349339485168457, "step": 17440 }, { "epoch": 5.817211474316211, "learning_rate": 0.0003142446822412643, "step": 17440 }, { "epoch": 5.817211474316211, "loss": 0.8816218376159668, "step": 17440 }, { "ce_loss": 0.24884732067584991, "epoch": 5.817211474316211, "step": 17440 }, { "distill_loss": 0.3589048385620117, "epoch": 5.817211474316211, "step": 17440 }, { "epoch": 5.817211474316211, "ref_ce_loss": 0.17393577098846436, "step": 17440 }, { "epoch": 5.817211474316211, "loss": 0.7268455028533936, "step": 17440 }, { "ce_loss": 0.19337479770183563, "epoch": 5.817211474316211, "step": 17440 }, { "distill_loss": 0.335417240858078, "epoch": 5.817211474316211, "step": 17440 }, { "epoch": 5.817211474316211, "ref_ce_loss": 0.1636698693037033, "step": 17440 }, { "epoch": 5.820547031354236, "loss": 0.7032, "step": 17450 }, { "epoch": 5.820547031354236, "grad_norm": 3.8691024780273438, "step": 17450 }, { "epoch": 5.820547031354236, "learning_rate": 0.00031382264901608735, "step": 17450 }, { "epoch": 5.820547031354236, "loss": 0.8245714902877808, "step": 17450 }, { "ce_loss": 0.20893444120883942, "epoch": 5.820547031354236, "step": 17450 }, { "distill_loss": 0.3680214285850525, "epoch": 5.820547031354236, "step": 17450 }, { "epoch": 5.820547031354236, "ref_ce_loss": 0.1830359250307083, "step": 17450 }, { "epoch": 5.820547031354236, "loss": 0.5935423374176025, "step": 17450 }, { "ce_loss": 0.15983273088932037, "epoch": 5.820547031354236, "step": 17450 }, { "distill_loss": 0.18705280125141144, "epoch": 5.820547031354236, "step": 17450 }, { "epoch": 5.820547031354236, "ref_ce_loss": 0.14525040984153748, "step": 17450 }, { "epoch": 5.823882588392261, "loss": 0.6841, "step": 17460 }, { "epoch": 5.823882588392261, "grad_norm": 2.8735129833221436, "step": 17460 }, { "epoch": 5.823882588392261, "learning_rate": 0.00031340071636915207, "step": 17460 }, { "epoch": 5.823882588392261, "loss": 0.5384118556976318, "step": 17460 }, { "ce_loss": 0.18613983690738678, "epoch": 5.823882588392261, "step": 17460 }, { "distill_loss": 0.2151462584733963, "epoch": 5.823882588392261, "step": 17460 }, { "epoch": 5.823882588392261, "ref_ce_loss": 0.1369083672761917, "step": 17460 }, { "epoch": 5.823882588392261, "loss": 0.6384410262107849, "step": 17460 }, { "ce_loss": 0.138434499502182, "epoch": 5.823882588392261, "step": 17460 }, { "distill_loss": 0.2718351185321808, "epoch": 5.823882588392261, "step": 17460 }, { "epoch": 5.823882588392261, "ref_ce_loss": 0.13998080790042877, "step": 17460 }, { "epoch": 5.827218145430287, "loss": 0.6842, "step": 17470 }, { "epoch": 5.827218145430287, "grad_norm": 2.4372873306274414, "step": 17470 }, { "epoch": 5.827218145430287, "learning_rate": 0.00031297888479289926, "step": 17470 }, { "epoch": 5.827218145430287, "loss": 0.7715685367584229, "step": 17470 }, { "ce_loss": 0.21597355604171753, "epoch": 5.827218145430287, "step": 17470 }, { "distill_loss": 0.30613934993743896, "epoch": 5.827218145430287, "step": 17470 }, { "epoch": 5.827218145430287, "ref_ce_loss": 0.1433599293231964, "step": 17470 }, { "epoch": 5.827218145430287, "loss": 0.6983997821807861, "step": 17470 }, { "ce_loss": 0.21408066153526306, "epoch": 5.827218145430287, "step": 17470 }, { "distill_loss": 0.28993719816207886, "epoch": 5.827218145430287, "step": 17470 }, { "epoch": 5.827218145430287, "ref_ce_loss": 0.14629687368869781, "step": 17470 }, { "epoch": 5.830553702468312, "loss": 0.7288, "step": 17480 }, { "epoch": 5.830553702468312, "grad_norm": 2.8923749923706055, "step": 17480 }, { "epoch": 5.830553702468312, "learning_rate": 0.00031255715477965164, "step": 17480 }, { "epoch": 5.830553702468312, "loss": 0.8305673599243164, "step": 17480 }, { "ce_loss": 0.18007422983646393, "epoch": 5.830553702468312, "step": 17480 }, { "distill_loss": 0.28398704528808594, "epoch": 5.830553702468312, "step": 17480 }, { "epoch": 5.830553702468312, "ref_ce_loss": 0.18858270347118378, "step": 17480 }, { "epoch": 5.830553702468312, "loss": 0.7498915791511536, "step": 17480 }, { "ce_loss": 0.15216076374053955, "epoch": 5.830553702468312, "step": 17480 }, { "distill_loss": 0.38887834548950195, "epoch": 5.830553702468312, "step": 17480 }, { "epoch": 5.830553702468312, "ref_ce_loss": 0.14917370676994324, "step": 17480 }, { "epoch": 5.833889259506337, "loss": 0.7539, "step": 17490 }, { "epoch": 5.833889259506337, "grad_norm": 3.5626888275146484, "step": 17490 }, { "epoch": 5.833889259506337, "learning_rate": 0.0003121355268216137, "step": 17490 }, { "epoch": 5.833889259506337, "loss": 0.7058088779449463, "step": 17490 }, { "ce_loss": 0.14639702439308167, "epoch": 5.833889259506337, "step": 17490 }, { "distill_loss": 0.41465479135513306, "epoch": 5.833889259506337, "step": 17490 }, { "epoch": 5.833889259506337, "ref_ce_loss": 0.14412228763103485, "step": 17490 }, { "epoch": 5.833889259506337, "loss": 0.7046496868133545, "step": 17490 }, { "ce_loss": 0.15008413791656494, "epoch": 5.833889259506337, "step": 17490 }, { "distill_loss": 0.30911529064178467, "epoch": 5.833889259506337, "step": 17490 }, { "epoch": 5.833889259506337, "ref_ce_loss": 0.1350952535867691, "step": 17490 }, { "epoch": 5.837224816544363, "loss": 0.7263, "step": 17500 }, { "epoch": 5.837224816544363, "grad_norm": 2.2108681201934814, "step": 17500 }, { "epoch": 5.837224816544363, "learning_rate": 0.0003117140014108707, "step": 17500 }, { "epoch": 5.837224816544363, "loss": 0.7081411480903625, "step": 17500 }, { "ce_loss": 0.16158966720104218, "epoch": 5.837224816544363, "step": 17500 }, { "distill_loss": 0.31263870000839233, "epoch": 5.837224816544363, "step": 17500 }, { "epoch": 5.837224816544363, "ref_ce_loss": 0.12841305136680603, "step": 17500 }, { "epoch": 5.837224816544363, "loss": 1.1486976146697998, "step": 17500 }, { "ce_loss": 0.20025956630706787, "epoch": 5.837224816544363, "step": 17500 }, { "distill_loss": 0.34252017736434937, "epoch": 5.837224816544363, "step": 17500 }, { "epoch": 5.837224816544363, "ref_ce_loss": 0.21556495130062103, "step": 17500 }, { "epoch": 5.840560373582388, "loss": 0.7424, "step": 17510 }, { "epoch": 5.840560373582388, "grad_norm": 2.667424201965332, "step": 17510 }, { "epoch": 5.840560373582388, "learning_rate": 0.00031129257903938785, "step": 17510 }, { "epoch": 5.840560373582388, "loss": 0.779916524887085, "step": 17510 }, { "ce_loss": 0.2496323585510254, "epoch": 5.840560373582388, "step": 17510 }, { "distill_loss": 0.3321917653083801, "epoch": 5.840560373582388, "step": 17510 }, { "epoch": 5.840560373582388, "ref_ce_loss": 0.12754826247692108, "step": 17510 }, { "epoch": 5.840560373582388, "loss": 0.7531536221504211, "step": 17510 }, { "ce_loss": 0.1766018122434616, "epoch": 5.840560373582388, "step": 17510 }, { "distill_loss": 0.3808102309703827, "epoch": 5.840560373582388, "step": 17510 }, { "epoch": 5.840560373582388, "ref_ce_loss": 0.1379728615283966, "step": 17510 }, { "epoch": 5.8438959306204135, "loss": 0.7667, "step": 17520 }, { "epoch": 5.8438959306204135, "grad_norm": 1.9223077297210693, "step": 17520 }, { "epoch": 5.8438959306204135, "learning_rate": 0.0003108712601990107, "step": 17520 }, { "epoch": 5.8438959306204135, "loss": 0.797112226486206, "step": 17520 }, { "ce_loss": 0.21410100162029266, "epoch": 5.8438959306204135, "step": 17520 }, { "distill_loss": 0.29613161087036133, "epoch": 5.8438959306204135, "step": 17520 }, { "epoch": 5.8438959306204135, "ref_ce_loss": 0.1464604139328003, "step": 17520 }, { "epoch": 5.8438959306204135, "loss": 0.7728590965270996, "step": 17520 }, { "ce_loss": 0.13434790074825287, "epoch": 5.8438959306204135, "step": 17520 }, { "distill_loss": 0.2363002598285675, "epoch": 5.8438959306204135, "step": 17520 }, { "epoch": 5.8438959306204135, "ref_ce_loss": 0.1437375843524933, "step": 17520 }, { "epoch": 5.847231487658439, "loss": 0.7348, "step": 17530 }, { "epoch": 5.847231487658439, "grad_norm": 2.909083604812622, "step": 17530 }, { "epoch": 5.847231487658439, "learning_rate": 0.0003104500453814635, "step": 17530 }, { "epoch": 5.847231487658439, "loss": 0.7808569669723511, "step": 17530 }, { "ce_loss": 0.15024033188819885, "epoch": 5.847231487658439, "step": 17530 }, { "distill_loss": 0.32764944434165955, "epoch": 5.847231487658439, "step": 17530 }, { "epoch": 5.847231487658439, "ref_ce_loss": 0.14453238248825073, "step": 17530 }, { "epoch": 5.847231487658439, "loss": 0.6624705195426941, "step": 17530 }, { "ce_loss": 0.1971026360988617, "epoch": 5.847231487658439, "step": 17530 }, { "distill_loss": 0.276828408241272, "epoch": 5.847231487658439, "step": 17530 }, { "epoch": 5.847231487658439, "ref_ce_loss": 0.14170217514038086, "step": 17530 }, { "epoch": 5.850567044696464, "loss": 0.7462, "step": 17540 }, { "epoch": 5.850567044696464, "grad_norm": 1.7439287900924683, "step": 17540 }, { "epoch": 5.850567044696464, "learning_rate": 0.00031002893507834934, "step": 17540 }, { "epoch": 5.850567044696464, "loss": 0.7160279750823975, "step": 17540 }, { "ce_loss": 0.15651878714561462, "epoch": 5.850567044696464, "step": 17540 }, { "distill_loss": 0.289535254240036, "epoch": 5.850567044696464, "step": 17540 }, { "epoch": 5.850567044696464, "ref_ce_loss": 0.14303657412528992, "step": 17540 }, { "epoch": 5.850567044696464, "loss": 0.739136278629303, "step": 17540 }, { "ce_loss": 0.18893566727638245, "epoch": 5.850567044696464, "step": 17540 }, { "distill_loss": 0.34339919686317444, "epoch": 5.850567044696464, "step": 17540 }, { "epoch": 5.850567044696464, "ref_ce_loss": 0.16350984573364258, "step": 17540 }, { "epoch": 5.8539026017344895, "loss": 0.8228, "step": 17550 }, { "epoch": 5.8539026017344895, "grad_norm": 2.1306843757629395, "step": 17550 }, { "epoch": 5.8539026017344895, "learning_rate": 0.0003096079297811492, "step": 17550 }, { "epoch": 5.8539026017344895, "loss": 0.7175245881080627, "step": 17550 }, { "ce_loss": 0.17793938517570496, "epoch": 5.8539026017344895, "step": 17550 }, { "distill_loss": 0.33758652210235596, "epoch": 5.8539026017344895, "step": 17550 }, { "epoch": 5.8539026017344895, "ref_ce_loss": 0.14965596795082092, "step": 17550 }, { "epoch": 5.8539026017344895, "loss": 0.904224157333374, "step": 17550 }, { "ce_loss": 0.22401635348796844, "epoch": 5.8539026017344895, "step": 17550 }, { "distill_loss": 0.34708863496780396, "epoch": 5.8539026017344895, "step": 17550 }, { "epoch": 5.8539026017344895, "ref_ce_loss": 0.18091994524002075, "step": 17550 }, { "epoch": 5.857238158772515, "loss": 0.7586, "step": 17560 }, { "epoch": 5.857238158772515, "grad_norm": 1.8932132720947266, "step": 17560 }, { "epoch": 5.857238158772515, "learning_rate": 0.00030918702998122165, "step": 17560 }, { "epoch": 5.857238158772515, "loss": 0.6691727638244629, "step": 17560 }, { "ce_loss": 0.179330974817276, "epoch": 5.857238158772515, "step": 17560 }, { "distill_loss": 0.25679606199264526, "epoch": 5.857238158772515, "step": 17560 }, { "epoch": 5.857238158772515, "ref_ce_loss": 0.13041779398918152, "step": 17560 }, { "epoch": 5.857238158772515, "loss": 0.7127830386161804, "step": 17560 }, { "ce_loss": 0.1948421448469162, "epoch": 5.857238158772515, "step": 17560 }, { "distill_loss": 0.29180899262428284, "epoch": 5.857238158772515, "step": 17560 }, { "epoch": 5.857238158772515, "ref_ce_loss": 0.16397015750408173, "step": 17560 }, { "epoch": 5.86057371581054, "loss": 0.7173, "step": 17570 }, { "epoch": 5.86057371581054, "grad_norm": 7.5867180824279785, "step": 17570 }, { "epoch": 5.86057371581054, "learning_rate": 0.0003087662361698019, "step": 17570 }, { "epoch": 5.86057371581054, "loss": 0.6970691084861755, "step": 17570 }, { "ce_loss": 0.1313159167766571, "epoch": 5.86057371581054, "step": 17570 }, { "distill_loss": 0.24335519969463348, "epoch": 5.86057371581054, "step": 17570 }, { "epoch": 5.86057371581054, "ref_ce_loss": 0.15353406965732574, "step": 17570 }, { "epoch": 5.86057371581054, "loss": 0.7506760358810425, "step": 17570 }, { "ce_loss": 0.18178677558898926, "epoch": 5.86057371581054, "step": 17570 }, { "distill_loss": 0.23799774050712585, "epoch": 5.86057371581054, "step": 17570 }, { "epoch": 5.86057371581054, "ref_ce_loss": 0.1689431518316269, "step": 17570 }, { "epoch": 5.863909272848566, "loss": 0.7146, "step": 17580 }, { "epoch": 5.863909272848566, "grad_norm": 4.501378536224365, "step": 17580 }, { "epoch": 5.863909272848566, "learning_rate": 0.00030834554883800176, "step": 17580 }, { "epoch": 5.863909272848566, "loss": 0.7862856984138489, "step": 17580 }, { "ce_loss": 0.2624962329864502, "epoch": 5.863909272848566, "step": 17580 }, { "distill_loss": 0.283257395029068, "epoch": 5.863909272848566, "step": 17580 }, { "epoch": 5.863909272848566, "ref_ce_loss": 0.19673359394073486, "step": 17580 }, { "epoch": 5.863909272848566, "loss": 0.6927284598350525, "step": 17580 }, { "ce_loss": 0.15992389619350433, "epoch": 5.863909272848566, "step": 17580 }, { "distill_loss": 0.32113510370254517, "epoch": 5.863909272848566, "step": 17580 }, { "epoch": 5.863909272848566, "ref_ce_loss": 0.15669889748096466, "step": 17580 }, { "epoch": 5.867244829886591, "loss": 0.7183, "step": 17590 }, { "epoch": 5.867244829886591, "grad_norm": 2.619765520095825, "step": 17590 }, { "epoch": 5.867244829886591, "learning_rate": 0.00030792496847680835, "step": 17590 }, { "epoch": 5.867244829886591, "loss": 0.6546903848648071, "step": 17590 }, { "ce_loss": 0.16821420192718506, "epoch": 5.867244829886591, "step": 17590 }, { "distill_loss": 0.2883762717247009, "epoch": 5.867244829886591, "step": 17590 }, { "epoch": 5.867244829886591, "ref_ce_loss": 0.11920984834432602, "step": 17590 }, { "epoch": 5.867244829886591, "loss": 0.5550947785377502, "step": 17590 }, { "ce_loss": 0.19670464098453522, "epoch": 5.867244829886591, "step": 17590 }, { "distill_loss": 0.23802609741687775, "epoch": 5.867244829886591, "step": 17590 }, { "epoch": 5.867244829886591, "ref_ce_loss": 0.11984530091285706, "step": 17590 }, { "epoch": 5.870580386924616, "loss": 0.7165, "step": 17600 }, { "epoch": 5.870580386924616, "grad_norm": 2.4450623989105225, "step": 17600 }, { "epoch": 5.870580386924616, "learning_rate": 0.0003075044955770847, "step": 17600 }, { "epoch": 5.870580386924616, "loss": 1.1171493530273438, "step": 17600 }, { "ce_loss": 0.21308808028697968, "epoch": 5.870580386924616, "step": 17600 }, { "distill_loss": 0.3068980872631073, "epoch": 5.870580386924616, "step": 17600 }, { "epoch": 5.870580386924616, "ref_ce_loss": 0.16722099483013153, "step": 17600 }, { "epoch": 5.870580386924616, "loss": 0.7828159332275391, "step": 17600 }, { "ce_loss": 0.1695491075515747, "epoch": 5.870580386924616, "step": 17600 }, { "distill_loss": 0.29257312417030334, "epoch": 5.870580386924616, "step": 17600 }, { "epoch": 5.870580386924616, "ref_ce_loss": 0.1478341817855835, "step": 17600 }, { "epoch": 5.873915943962642, "loss": 0.6831, "step": 17610 }, { "epoch": 5.873915943962642, "grad_norm": 1.61392080783844, "step": 17610 }, { "epoch": 5.873915943962642, "learning_rate": 0.0003070841306295675, "step": 17610 }, { "epoch": 5.873915943962642, "loss": 0.671332061290741, "step": 17610 }, { "ce_loss": 0.15654361248016357, "epoch": 5.873915943962642, "step": 17610 }, { "distill_loss": 0.24826830625534058, "epoch": 5.873915943962642, "step": 17610 }, { "epoch": 5.873915943962642, "ref_ce_loss": 0.12748834490776062, "step": 17610 }, { "epoch": 5.873915943962642, "loss": 0.7003882527351379, "step": 17610 }, { "ce_loss": 0.23483048379421234, "epoch": 5.873915943962642, "step": 17610 }, { "distill_loss": 0.2979724407196045, "epoch": 5.873915943962642, "step": 17610 }, { "epoch": 5.873915943962642, "ref_ce_loss": 0.1305704265832901, "step": 17610 }, { "epoch": 5.877251501000667, "loss": 0.6819, "step": 17620 }, { "epoch": 5.877251501000667, "grad_norm": 1.6848235130310059, "step": 17620 }, { "epoch": 5.877251501000667, "learning_rate": 0.00030666387412486807, "step": 17620 }, { "epoch": 5.877251501000667, "loss": 0.9243027567863464, "step": 17620 }, { "ce_loss": 0.2511441111564636, "epoch": 5.877251501000667, "step": 17620 }, { "distill_loss": 0.32854366302490234, "epoch": 5.877251501000667, "step": 17620 }, { "epoch": 5.877251501000667, "ref_ce_loss": 0.22302629053592682, "step": 17620 }, { "epoch": 5.877251501000667, "loss": 0.8413571119308472, "step": 17620 }, { "ce_loss": 0.1491873562335968, "epoch": 5.877251501000667, "step": 17620 }, { "distill_loss": 0.2715878486633301, "epoch": 5.877251501000667, "step": 17620 }, { "epoch": 5.877251501000667, "ref_ce_loss": 0.11943260580301285, "step": 17620 }, { "epoch": 5.880587058038692, "loss": 0.737, "step": 17630 }, { "epoch": 5.880587058038692, "grad_norm": 1.6482051610946655, "step": 17630 }, { "epoch": 5.880587058038692, "learning_rate": 0.00030624372655347086, "step": 17630 }, { "epoch": 5.880587058038692, "loss": 0.6240556240081787, "step": 17630 }, { "ce_loss": 0.173484206199646, "epoch": 5.880587058038692, "step": 17630 }, { "distill_loss": 0.26367682218551636, "epoch": 5.880587058038692, "step": 17630 }, { "epoch": 5.880587058038692, "ref_ce_loss": 0.13847512006759644, "step": 17630 }, { "epoch": 5.880587058038692, "loss": 0.5382674336433411, "step": 17630 }, { "ce_loss": 0.11168865114450455, "epoch": 5.880587058038692, "step": 17630 }, { "distill_loss": 0.2589222192764282, "epoch": 5.880587058038692, "step": 17630 }, { "epoch": 5.880587058038692, "ref_ce_loss": 0.12466815859079361, "step": 17630 }, { "epoch": 5.883922615076718, "loss": 0.7198, "step": 17640 }, { "epoch": 5.883922615076718, "grad_norm": 1.565881609916687, "step": 17640 }, { "epoch": 5.883922615076718, "learning_rate": 0.00030582368840573345, "step": 17640 }, { "epoch": 5.883922615076718, "loss": 0.8074979782104492, "step": 17640 }, { "ce_loss": 0.19353412091732025, "epoch": 5.883922615076718, "step": 17640 }, { "distill_loss": 0.2784769535064697, "epoch": 5.883922615076718, "step": 17640 }, { "epoch": 5.883922615076718, "ref_ce_loss": 0.1902574896812439, "step": 17640 }, { "epoch": 5.883922615076718, "loss": 0.6128374338150024, "step": 17640 }, { "ce_loss": 0.15676084160804749, "epoch": 5.883922615076718, "step": 17640 }, { "distill_loss": 0.3117706775665283, "epoch": 5.883922615076718, "step": 17640 }, { "epoch": 5.883922615076718, "ref_ce_loss": 0.14417685568332672, "step": 17640 }, { "epoch": 5.887258172114743, "loss": 0.7086, "step": 17650 }, { "epoch": 5.887258172114743, "grad_norm": 3.2013261318206787, "step": 17650 }, { "epoch": 5.887258172114743, "learning_rate": 0.0003054037601718854, "step": 17650 }, { "epoch": 5.887258172114743, "loss": 0.8468448519706726, "step": 17650 }, { "ce_loss": 0.175635427236557, "epoch": 5.887258172114743, "step": 17650 }, { "distill_loss": 0.2721385359764099, "epoch": 5.887258172114743, "step": 17650 }, { "epoch": 5.887258172114743, "ref_ce_loss": 0.15196819603443146, "step": 17650 }, { "epoch": 5.887258172114743, "loss": 0.9644908905029297, "step": 17650 }, { "ce_loss": 0.20255514979362488, "epoch": 5.887258172114743, "step": 17650 }, { "distill_loss": 0.33900919556617737, "epoch": 5.887258172114743, "step": 17650 }, { "epoch": 5.887258172114743, "ref_ce_loss": 0.15699605643749237, "step": 17650 }, { "epoch": 5.890593729152768, "loss": 0.7547, "step": 17660 }, { "epoch": 5.890593729152768, "grad_norm": 1.6605432033538818, "step": 17660 }, { "epoch": 5.890593729152768, "learning_rate": 0.00030498394234202824, "step": 17660 }, { "epoch": 5.890593729152768, "loss": 0.8745073080062866, "step": 17660 }, { "ce_loss": 0.1588677167892456, "epoch": 5.890593729152768, "step": 17660 }, { "distill_loss": 0.3496779501438141, "epoch": 5.890593729152768, "step": 17660 }, { "epoch": 5.890593729152768, "ref_ce_loss": 0.19170548021793365, "step": 17660 }, { "epoch": 5.890593729152768, "loss": 0.8664857745170593, "step": 17660 }, { "ce_loss": 0.17014363408088684, "epoch": 5.890593729152768, "step": 17660 }, { "distill_loss": 0.3439854383468628, "epoch": 5.890593729152768, "step": 17660 }, { "epoch": 5.890593729152768, "ref_ce_loss": 0.1484474539756775, "step": 17660 }, { "epoch": 5.893929286190794, "loss": 0.7478, "step": 17670 }, { "epoch": 5.893929286190794, "grad_norm": 1.7497375011444092, "step": 17670 }, { "epoch": 5.893929286190794, "learning_rate": 0.0003045642354061345, "step": 17670 }, { "epoch": 5.893929286190794, "loss": 0.7250229120254517, "step": 17670 }, { "ce_loss": 0.18709981441497803, "epoch": 5.893929286190794, "step": 17670 }, { "distill_loss": 0.39688920974731445, "epoch": 5.893929286190794, "step": 17670 }, { "epoch": 5.893929286190794, "ref_ce_loss": 0.11318523436784744, "step": 17670 }, { "epoch": 5.893929286190794, "loss": 0.6799331903457642, "step": 17670 }, { "ce_loss": 0.1577003002166748, "epoch": 5.893929286190794, "step": 17670 }, { "distill_loss": 0.31221145391464233, "epoch": 5.893929286190794, "step": 17670 }, { "epoch": 5.893929286190794, "ref_ce_loss": 0.13887915015220642, "step": 17670 }, { "epoch": 5.897264843228819, "loss": 0.7058, "step": 17680 }, { "epoch": 5.897264843228819, "grad_norm": 1.4480282068252563, "step": 17680 }, { "epoch": 5.897264843228819, "learning_rate": 0.00030414463985404736, "step": 17680 }, { "epoch": 5.897264843228819, "loss": 0.9352344870567322, "step": 17680 }, { "ce_loss": 0.2799140512943268, "epoch": 5.897264843228819, "step": 17680 }, { "distill_loss": 0.4020175039768219, "epoch": 5.897264843228819, "step": 17680 }, { "epoch": 5.897264843228819, "ref_ce_loss": 0.1616601049900055, "step": 17680 }, { "epoch": 5.897264843228819, "loss": 0.8874188661575317, "step": 17680 }, { "ce_loss": 0.19736677408218384, "epoch": 5.897264843228819, "step": 17680 }, { "distill_loss": 0.3230544924736023, "epoch": 5.897264843228819, "step": 17680 }, { "epoch": 5.897264843228819, "ref_ce_loss": 0.10736200213432312, "step": 17680 }, { "epoch": 5.900600400266844, "loss": 0.7224, "step": 17690 }, { "epoch": 5.900600400266844, "grad_norm": 1.8025034666061401, "step": 17690 }, { "epoch": 5.900600400266844, "learning_rate": 0.0003037251561754799, "step": 17690 }, { "epoch": 5.900600400266844, "loss": 0.6984555721282959, "step": 17690 }, { "ce_loss": 0.1678963005542755, "epoch": 5.900600400266844, "step": 17690 }, { "distill_loss": 0.28823724389076233, "epoch": 5.900600400266844, "step": 17690 }, { "epoch": 5.900600400266844, "ref_ce_loss": 0.1307564228773117, "step": 17690 }, { "epoch": 5.900600400266844, "loss": 0.6367653608322144, "step": 17690 }, { "ce_loss": 0.15495973825454712, "epoch": 5.900600400266844, "step": 17690 }, { "distill_loss": 0.2730174660682678, "epoch": 5.900600400266844, "step": 17690 }, { "epoch": 5.900600400266844, "ref_ce_loss": 0.1531836986541748, "step": 17690 }, { "epoch": 5.90393595730487, "loss": 0.7038, "step": 17700 }, { "epoch": 5.90393595730487, "grad_norm": 1.8348549604415894, "step": 17700 }, { "epoch": 5.90393595730487, "learning_rate": 0.00030330578486001473, "step": 17700 }, { "epoch": 5.90393595730487, "loss": 0.6442775130271912, "step": 17700 }, { "ce_loss": 0.19366510212421417, "epoch": 5.90393595730487, "step": 17700 }, { "distill_loss": 0.26765328645706177, "epoch": 5.90393595730487, "step": 17700 }, { "epoch": 5.90393595730487, "ref_ce_loss": 0.11750882118940353, "step": 17700 }, { "epoch": 5.90393595730487, "loss": 0.7802560329437256, "step": 17700 }, { "ce_loss": 0.2070993334054947, "epoch": 5.90393595730487, "step": 17700 }, { "distill_loss": 0.35392361879348755, "epoch": 5.90393595730487, "step": 17700 }, { "epoch": 5.90393595730487, "ref_ce_loss": 0.17282815277576447, "step": 17700 }, { "epoch": 5.907271514342895, "loss": 0.6966, "step": 17710 }, { "epoch": 5.907271514342895, "grad_norm": 1.3651440143585205, "step": 17710 }, { "epoch": 5.907271514342895, "learning_rate": 0.00030288652639710357, "step": 17710 }, { "epoch": 5.907271514342895, "loss": 0.6090908646583557, "step": 17710 }, { "ce_loss": 0.1301243156194687, "epoch": 5.907271514342895, "step": 17710 }, { "distill_loss": 0.2743867039680481, "epoch": 5.907271514342895, "step": 17710 }, { "epoch": 5.907271514342895, "ref_ce_loss": 0.09706619381904602, "step": 17710 }, { "epoch": 5.907271514342895, "loss": 0.6999118328094482, "step": 17710 }, { "ce_loss": 0.20652586221694946, "epoch": 5.907271514342895, "step": 17710 }, { "distill_loss": 0.3514205813407898, "epoch": 5.907271514342895, "step": 17710 }, { "epoch": 5.907271514342895, "ref_ce_loss": 0.14169923961162567, "step": 17710 }, { "epoch": 5.9106070713809205, "loss": 0.6978, "step": 17720 }, { "epoch": 5.9106070713809205, "grad_norm": 1.9332634210586548, "step": 17720 }, { "epoch": 5.9106070713809205, "learning_rate": 0.0003024673812760658, "step": 17720 }, { "epoch": 5.9106070713809205, "loss": 0.6061062216758728, "step": 17720 }, { "ce_loss": 0.18262982368469238, "epoch": 5.9106070713809205, "step": 17720 }, { "distill_loss": 0.2813739776611328, "epoch": 5.9106070713809205, "step": 17720 }, { "epoch": 5.9106070713809205, "ref_ce_loss": 0.14188861846923828, "step": 17720 }, { "epoch": 5.9106070713809205, "loss": 0.7180115580558777, "step": 17720 }, { "ce_loss": 0.20352500677108765, "epoch": 5.9106070713809205, "step": 17720 }, { "distill_loss": 0.2630084156990051, "epoch": 5.9106070713809205, "step": 17720 }, { "epoch": 5.9106070713809205, "ref_ce_loss": 0.13348890841007233, "step": 17720 }, { "epoch": 5.913942628418946, "loss": 0.7463, "step": 17730 }, { "epoch": 5.913942628418946, "grad_norm": 1.853853464126587, "step": 17730 }, { "epoch": 5.913942628418946, "learning_rate": 0.0003020483499860891, "step": 17730 }, { "epoch": 5.913942628418946, "loss": 0.980445384979248, "step": 17730 }, { "ce_loss": 0.13897709548473358, "epoch": 5.913942628418946, "step": 17730 }, { "distill_loss": 0.2369052916765213, "epoch": 5.913942628418946, "step": 17730 }, { "epoch": 5.913942628418946, "ref_ce_loss": 0.10596528649330139, "step": 17730 }, { "epoch": 5.913942628418946, "loss": 0.5980679392814636, "step": 17730 }, { "ce_loss": 0.16462469100952148, "epoch": 5.913942628418946, "step": 17730 }, { "distill_loss": 0.22286882996559143, "epoch": 5.913942628418946, "step": 17730 }, { "epoch": 5.913942628418946, "ref_ce_loss": 0.11621631681919098, "step": 17730 }, { "epoch": 5.917278185456971, "loss": 0.6909, "step": 17740 }, { "epoch": 5.917278185456971, "grad_norm": 1.5366535186767578, "step": 17740 }, { "epoch": 5.917278185456971, "learning_rate": 0.00030162943301622794, "step": 17740 }, { "epoch": 5.917278185456971, "loss": 0.7122682332992554, "step": 17740 }, { "ce_loss": 0.14937813580036163, "epoch": 5.917278185456971, "step": 17740 }, { "distill_loss": 0.2587045133113861, "epoch": 5.917278185456971, "step": 17740 }, { "epoch": 5.917278185456971, "ref_ce_loss": 0.13506940007209778, "step": 17740 }, { "epoch": 5.917278185456971, "loss": 0.7699155807495117, "step": 17740 }, { "ce_loss": 0.19589021801948547, "epoch": 5.917278185456971, "step": 17740 }, { "distill_loss": 0.3217250108718872, "epoch": 5.917278185456971, "step": 17740 }, { "epoch": 5.917278185456971, "ref_ce_loss": 0.15825651586055756, "step": 17740 }, { "epoch": 5.9206137424949965, "loss": 0.7326, "step": 17750 }, { "epoch": 5.9206137424949965, "grad_norm": 2.868001699447632, "step": 17750 }, { "epoch": 5.9206137424949965, "learning_rate": 0.0003012106308554036, "step": 17750 }, { "epoch": 5.9206137424949965, "loss": 0.6667677164077759, "step": 17750 }, { "ce_loss": 0.16533072292804718, "epoch": 5.9206137424949965, "step": 17750 }, { "distill_loss": 0.3047916889190674, "epoch": 5.9206137424949965, "step": 17750 }, { "epoch": 5.9206137424949965, "ref_ce_loss": 0.11249995231628418, "step": 17750 }, { "epoch": 5.9206137424949965, "loss": 0.8356540203094482, "step": 17750 }, { "ce_loss": 0.1387956738471985, "epoch": 5.9206137424949965, "step": 17750 }, { "distill_loss": 0.28860345482826233, "epoch": 5.9206137424949965, "step": 17750 }, { "epoch": 5.9206137424949965, "ref_ce_loss": 0.11070244014263153, "step": 17750 }, { "epoch": 5.923949299533022, "loss": 0.7566, "step": 17760 }, { "epoch": 5.923949299533022, "grad_norm": 1.771466612815857, "step": 17760 }, { "epoch": 5.923949299533022, "learning_rate": 0.00030079194399240325, "step": 17760 }, { "epoch": 5.923949299533022, "loss": 0.8782122135162354, "step": 17760 }, { "ce_loss": 0.2122136503458023, "epoch": 5.923949299533022, "step": 17760 }, { "distill_loss": 0.33710652589797974, "epoch": 5.923949299533022, "step": 17760 }, { "epoch": 5.923949299533022, "ref_ce_loss": 0.18948283791542053, "step": 17760 }, { "epoch": 5.923949299533022, "loss": 0.9718055725097656, "step": 17760 }, { "ce_loss": 0.17462380230426788, "epoch": 5.923949299533022, "step": 17760 }, { "distill_loss": 0.35814768075942993, "epoch": 5.923949299533022, "step": 17760 }, { "epoch": 5.923949299533022, "ref_ce_loss": 0.16498197615146637, "step": 17760 }, { "epoch": 5.927284856571047, "loss": 0.7445, "step": 17770 }, { "epoch": 5.927284856571047, "grad_norm": 2.248972177505493, "step": 17770 }, { "epoch": 5.927284856571047, "learning_rate": 0.00030037337291587943, "step": 17770 }, { "epoch": 5.927284856571047, "loss": 0.6510271430015564, "step": 17770 }, { "ce_loss": 0.17040914297103882, "epoch": 5.927284856571047, "step": 17770 }, { "distill_loss": 0.31474000215530396, "epoch": 5.927284856571047, "step": 17770 }, { "epoch": 5.927284856571047, "ref_ce_loss": 0.13253526389598846, "step": 17770 }, { "epoch": 5.927284856571047, "loss": 0.9578293561935425, "step": 17770 }, { "ce_loss": 0.20671266317367554, "epoch": 5.927284856571047, "step": 17770 }, { "distill_loss": 0.3458459973335266, "epoch": 5.927284856571047, "step": 17770 }, { "epoch": 5.927284856571047, "ref_ce_loss": 0.15940354764461517, "step": 17770 }, { "epoch": 5.9306204136090725, "loss": 0.7793, "step": 17780 }, { "epoch": 5.9306204136090725, "grad_norm": 2.1776537895202637, "step": 17780 }, { "epoch": 5.9306204136090725, "learning_rate": 0.00029995491811434975, "step": 17780 }, { "epoch": 5.9306204136090725, "loss": 0.7826830744743347, "step": 17780 }, { "ce_loss": 0.17325259745121002, "epoch": 5.9306204136090725, "step": 17780 }, { "distill_loss": 0.3214309811592102, "epoch": 5.9306204136090725, "step": 17780 }, { "epoch": 5.9306204136090725, "ref_ce_loss": 0.16599765419960022, "step": 17780 }, { "epoch": 5.9306204136090725, "loss": 0.8357514142990112, "step": 17780 }, { "ce_loss": 0.2053692638874054, "epoch": 5.9306204136090725, "step": 17780 }, { "distill_loss": 0.3940480947494507, "epoch": 5.9306204136090725, "step": 17780 }, { "epoch": 5.9306204136090725, "ref_ce_loss": 0.1913134604692459, "step": 17780 }, { "epoch": 5.933955970647098, "loss": 0.8344, "step": 17790 }, { "epoch": 5.933955970647098, "grad_norm": 4.898279190063477, "step": 17790 }, { "epoch": 5.933955970647098, "learning_rate": 0.0002995365800761959, "step": 17790 }, { "epoch": 5.933955970647098, "loss": 0.6484854817390442, "step": 17790 }, { "ce_loss": 0.14094069600105286, "epoch": 5.933955970647098, "step": 17790 }, { "distill_loss": 0.3030979335308075, "epoch": 5.933955970647098, "step": 17790 }, { "epoch": 5.933955970647098, "ref_ce_loss": 0.1547994613647461, "step": 17790 }, { "epoch": 5.933955970647098, "loss": 0.7239052653312683, "step": 17790 }, { "ce_loss": 0.20402802526950836, "epoch": 5.933955970647098, "step": 17790 }, { "distill_loss": 0.3449753224849701, "epoch": 5.933955970647098, "step": 17790 }, { "epoch": 5.933955970647098, "ref_ce_loss": 0.14525295794010162, "step": 17790 }, { "epoch": 5.937291527685123, "loss": 0.6571, "step": 17800 }, { "epoch": 5.937291527685123, "grad_norm": 1.4168012142181396, "step": 17800 }, { "epoch": 5.937291527685123, "learning_rate": 0.00029911835928966347, "step": 17800 }, { "epoch": 5.937291527685123, "loss": 0.5196230411529541, "step": 17800 }, { "ce_loss": 0.10742348432540894, "epoch": 5.937291527685123, "step": 17800 }, { "distill_loss": 0.22581657767295837, "epoch": 5.937291527685123, "step": 17800 }, { "epoch": 5.937291527685123, "ref_ce_loss": 0.1268647164106369, "step": 17800 }, { "epoch": 5.937291527685123, "loss": 1.5188542604446411, "step": 17800 }, { "ce_loss": 0.25732168555259705, "epoch": 5.937291527685123, "step": 17800 }, { "distill_loss": 0.3840293288230896, "epoch": 5.937291527685123, "step": 17800 }, { "epoch": 5.937291527685123, "ref_ce_loss": 0.15666557848453522, "step": 17800 }, { "epoch": 5.940627084723149, "loss": 0.7761, "step": 17810 }, { "epoch": 5.940627084723149, "grad_norm": 2.4067792892456055, "step": 17810 }, { "epoch": 5.940627084723149, "learning_rate": 0.0002987002562428608, "step": 17810 }, { "epoch": 5.940627084723149, "loss": 0.6301429867744446, "step": 17810 }, { "ce_loss": 0.17194992303848267, "epoch": 5.940627084723149, "step": 17810 }, { "distill_loss": 0.28008875250816345, "epoch": 5.940627084723149, "step": 17810 }, { "epoch": 5.940627084723149, "ref_ce_loss": 0.1431414633989334, "step": 17810 }, { "epoch": 5.940627084723149, "loss": 1.343503713607788, "step": 17810 }, { "ce_loss": 0.24207206070423126, "epoch": 5.940627084723149, "step": 17810 }, { "distill_loss": 0.3009220063686371, "epoch": 5.940627084723149, "step": 17810 }, { "epoch": 5.940627084723149, "ref_ce_loss": 0.16927200555801392, "step": 17810 }, { "epoch": 5.943962641761174, "loss": 0.8039, "step": 17820 }, { "epoch": 5.943962641761174, "grad_norm": 1.7773637771606445, "step": 17820 }, { "epoch": 5.943962641761174, "learning_rate": 0.0002982822714237596, "step": 17820 }, { "epoch": 5.943962641761174, "loss": 0.7795100808143616, "step": 17820 }, { "ce_loss": 0.1566115766763687, "epoch": 5.943962641761174, "step": 17820 }, { "distill_loss": 0.3505699634552002, "epoch": 5.943962641761174, "step": 17820 }, { "epoch": 5.943962641761174, "ref_ce_loss": 0.13496427237987518, "step": 17820 }, { "epoch": 5.943962641761174, "loss": 0.7355562448501587, "step": 17820 }, { "ce_loss": 0.1443626433610916, "epoch": 5.943962641761174, "step": 17820 }, { "distill_loss": 0.2736237943172455, "epoch": 5.943962641761174, "step": 17820 }, { "epoch": 5.943962641761174, "ref_ce_loss": 0.12582212686538696, "step": 17820 }, { "epoch": 5.947298198799199, "loss": 0.7121, "step": 17830 }, { "epoch": 5.947298198799199, "grad_norm": 1.619713306427002, "step": 17830 }, { "epoch": 5.947298198799199, "learning_rate": 0.000297864405320193, "step": 17830 }, { "epoch": 5.947298198799199, "loss": 0.6389651298522949, "step": 17830 }, { "ce_loss": 0.1558423936367035, "epoch": 5.947298198799199, "step": 17830 }, { "distill_loss": 0.2980443239212036, "epoch": 5.947298198799199, "step": 17830 }, { "epoch": 5.947298198799199, "ref_ce_loss": 0.1452484279870987, "step": 17830 }, { "epoch": 5.947298198799199, "loss": 0.6044831275939941, "step": 17830 }, { "ce_loss": 0.19478633999824524, "epoch": 5.947298198799199, "step": 17830 }, { "distill_loss": 0.277583509683609, "epoch": 5.947298198799199, "step": 17830 }, { "epoch": 5.947298198799199, "ref_ce_loss": 0.13188131153583527, "step": 17830 }, { "epoch": 5.950633755837225, "loss": 0.7095, "step": 17840 }, { "epoch": 5.950633755837225, "grad_norm": 2.5467517375946045, "step": 17840 }, { "epoch": 5.950633755837225, "learning_rate": 0.0002974466584198555, "step": 17840 }, { "epoch": 5.950633755837225, "loss": 0.4841284453868866, "step": 17840 }, { "ce_loss": 0.13817235827445984, "epoch": 5.950633755837225, "step": 17840 }, { "distill_loss": 0.19160957634449005, "epoch": 5.950633755837225, "step": 17840 }, { "epoch": 5.950633755837225, "ref_ce_loss": 0.12183627486228943, "step": 17840 }, { "epoch": 5.950633755837225, "loss": 0.6094743013381958, "step": 17840 }, { "ce_loss": 0.126152902841568, "epoch": 5.950633755837225, "step": 17840 }, { "distill_loss": 0.18613648414611816, "epoch": 5.950633755837225, "step": 17840 }, { "epoch": 5.950633755837225, "ref_ce_loss": 0.13332106173038483, "step": 17840 }, { "epoch": 5.95396931287525, "loss": 0.7089, "step": 17850 }, { "epoch": 5.95396931287525, "grad_norm": 1.8888506889343262, "step": 17850 }, { "epoch": 5.95396931287525, "learning_rate": 0.00029702903121030293, "step": 17850 }, { "epoch": 5.95396931287525, "loss": 0.8191925287246704, "step": 17850 }, { "ce_loss": 0.20863190293312073, "epoch": 5.95396931287525, "step": 17850 }, { "distill_loss": 0.2998040020465851, "epoch": 5.95396931287525, "step": 17850 }, { "epoch": 5.95396931287525, "ref_ce_loss": 0.14137405157089233, "step": 17850 }, { "epoch": 5.95396931287525, "loss": 0.740609884262085, "step": 17850 }, { "ce_loss": 0.1715041548013687, "epoch": 5.95396931287525, "step": 17850 }, { "distill_loss": 0.26939496397972107, "epoch": 5.95396931287525, "step": 17850 }, { "epoch": 5.95396931287525, "ref_ce_loss": 0.17530490458011627, "step": 17850 }, { "epoch": 5.957304869913275, "loss": 0.7317, "step": 17860 }, { "epoch": 5.957304869913275, "grad_norm": 1.600746750831604, "step": 17860 }, { "epoch": 5.957304869913275, "learning_rate": 0.00029661152417895096, "step": 17860 }, { "epoch": 5.957304869913275, "loss": 0.5815998315811157, "step": 17860 }, { "ce_loss": 0.1620999574661255, "epoch": 5.957304869913275, "step": 17860 }, { "distill_loss": 0.25247102975845337, "epoch": 5.957304869913275, "step": 17860 }, { "epoch": 5.957304869913275, "ref_ce_loss": 0.14225803315639496, "step": 17860 }, { "epoch": 5.957304869913275, "loss": 0.635635256767273, "step": 17860 }, { "ce_loss": 0.200776144862175, "epoch": 5.957304869913275, "step": 17860 }, { "distill_loss": 0.27792149782180786, "epoch": 5.957304869913275, "step": 17860 }, { "epoch": 5.957304869913275, "ref_ce_loss": 0.15670597553253174, "step": 17860 }, { "epoch": 5.960640426951301, "loss": 0.6967, "step": 17870 }, { "epoch": 5.960640426951301, "grad_norm": 2.040088653564453, "step": 17870 }, { "epoch": 5.960640426951301, "learning_rate": 0.00029619413781307546, "step": 17870 }, { "epoch": 5.960640426951301, "loss": 0.8797625303268433, "step": 17870 }, { "ce_loss": 0.24724845588207245, "epoch": 5.960640426951301, "step": 17870 }, { "distill_loss": 0.33950746059417725, "epoch": 5.960640426951301, "step": 17870 }, { "epoch": 5.960640426951301, "ref_ce_loss": 0.16486626863479614, "step": 17870 }, { "epoch": 5.960640426951301, "loss": 0.7061132192611694, "step": 17870 }, { "ce_loss": 0.16689732670783997, "epoch": 5.960640426951301, "step": 17870 }, { "distill_loss": 0.2903771996498108, "epoch": 5.960640426951301, "step": 17870 }, { "epoch": 5.960640426951301, "ref_ce_loss": 0.13571348786354065, "step": 17870 }, { "epoch": 5.963975983989326, "loss": 0.7297, "step": 17880 }, { "epoch": 5.963975983989326, "grad_norm": 1.6686172485351562, "step": 17880 }, { "epoch": 5.963975983989326, "learning_rate": 0.000295776872599811, "step": 17880 }, { "epoch": 5.963975983989326, "loss": 0.7089078426361084, "step": 17880 }, { "ce_loss": 0.12177518755197525, "epoch": 5.963975983989326, "step": 17880 }, { "distill_loss": 0.3120511472225189, "epoch": 5.963975983989326, "step": 17880 }, { "epoch": 5.963975983989326, "ref_ce_loss": 0.11147940903902054, "step": 17880 }, { "epoch": 5.963975983989326, "loss": 0.8145482540130615, "step": 17880 }, { "ce_loss": 0.15186285972595215, "epoch": 5.963975983989326, "step": 17880 }, { "distill_loss": 0.372964471578598, "epoch": 5.963975983989326, "step": 17880 }, { "epoch": 5.963975983989326, "ref_ce_loss": 0.12150125205516815, "step": 17880 }, { "epoch": 5.967311541027351, "loss": 0.7587, "step": 17890 }, { "epoch": 5.967311541027351, "grad_norm": 1.8515465259552002, "step": 17890 }, { "epoch": 5.967311541027351, "learning_rate": 0.0002953597290261512, "step": 17890 }, { "epoch": 5.967311541027351, "loss": 0.6790962219238281, "step": 17890 }, { "ce_loss": 0.16520093381404877, "epoch": 5.967311541027351, "step": 17890 }, { "distill_loss": 0.33001717925071716, "epoch": 5.967311541027351, "step": 17890 }, { "epoch": 5.967311541027351, "ref_ce_loss": 0.13531477749347687, "step": 17890 }, { "epoch": 5.967311541027351, "loss": 0.5637715458869934, "step": 17890 }, { "ce_loss": 0.1474059373140335, "epoch": 5.967311541027351, "step": 17890 }, { "distill_loss": 0.24696558713912964, "epoch": 5.967311541027351, "step": 17890 }, { "epoch": 5.967311541027351, "ref_ce_loss": 0.13812749087810516, "step": 17890 }, { "epoch": 5.970647098065377, "loss": 0.7349, "step": 17900 }, { "epoch": 5.970647098065377, "grad_norm": 1.8551923036575317, "step": 17900 }, { "epoch": 5.970647098065377, "learning_rate": 0.00029494270757894733, "step": 17900 }, { "epoch": 5.970647098065377, "loss": 0.7883302569389343, "step": 17900 }, { "ce_loss": 0.15675842761993408, "epoch": 5.970647098065377, "step": 17900 }, { "distill_loss": 0.32231029868125916, "epoch": 5.970647098065377, "step": 17900 }, { "epoch": 5.970647098065377, "ref_ce_loss": 0.18687209486961365, "step": 17900 }, { "epoch": 5.970647098065377, "loss": 0.6241994500160217, "step": 17900 }, { "ce_loss": 0.1367034614086151, "epoch": 5.970647098065377, "step": 17900 }, { "distill_loss": 0.32802969217300415, "epoch": 5.970647098065377, "step": 17900 }, { "epoch": 5.970647098065377, "ref_ce_loss": 0.11875439435243607, "step": 17900 }, { "epoch": 5.973982655103402, "loss": 0.7017, "step": 17910 }, { "epoch": 5.973982655103402, "grad_norm": 1.8577299118041992, "step": 17910 }, { "epoch": 5.973982655103402, "learning_rate": 0.00029452580874490835, "step": 17910 }, { "epoch": 5.973982655103402, "loss": 0.5608646869659424, "step": 17910 }, { "ce_loss": 0.11458832770586014, "epoch": 5.973982655103402, "step": 17910 }, { "distill_loss": 0.24180865287780762, "epoch": 5.973982655103402, "step": 17910 }, { "epoch": 5.973982655103402, "ref_ce_loss": 0.12261128425598145, "step": 17910 }, { "epoch": 5.973982655103402, "loss": 0.7354592084884644, "step": 17910 }, { "ce_loss": 0.18657203018665314, "epoch": 5.973982655103402, "step": 17910 }, { "distill_loss": 0.3183566927909851, "epoch": 5.973982655103402, "step": 17910 }, { "epoch": 5.973982655103402, "ref_ce_loss": 0.14715497195720673, "step": 17910 }, { "epoch": 5.9773182121414274, "loss": 0.7461, "step": 17920 }, { "epoch": 5.9773182121414274, "grad_norm": 1.6920195817947388, "step": 17920 }, { "epoch": 5.9773182121414274, "learning_rate": 0.00029410903301059987, "step": 17920 }, { "epoch": 5.9773182121414274, "loss": 1.1519207954406738, "step": 17920 }, { "ce_loss": 0.22299034893512726, "epoch": 5.9773182121414274, "step": 17920 }, { "distill_loss": 0.4153600335121155, "epoch": 5.9773182121414274, "step": 17920 }, { "epoch": 5.9773182121414274, "ref_ce_loss": 0.22701136767864227, "step": 17920 }, { "epoch": 5.9773182121414274, "loss": 0.7285803556442261, "step": 17920 }, { "ce_loss": 0.17011786997318268, "epoch": 5.9773182121414274, "step": 17920 }, { "distill_loss": 0.2496124505996704, "epoch": 5.9773182121414274, "step": 17920 }, { "epoch": 5.9773182121414274, "ref_ce_loss": 0.11554224044084549, "step": 17920 }, { "epoch": 5.980653769179453, "loss": 0.7015, "step": 17930 }, { "epoch": 5.980653769179453, "grad_norm": 1.4887522459030151, "step": 17930 }, { "epoch": 5.980653769179453, "learning_rate": 0.0002936923808624444, "step": 17930 }, { "epoch": 5.980653769179453, "loss": 0.5744050145149231, "step": 17930 }, { "ce_loss": 0.1501711755990982, "epoch": 5.980653769179453, "step": 17930 }, { "distill_loss": 0.22328665852546692, "epoch": 5.980653769179453, "step": 17930 }, { "epoch": 5.980653769179453, "ref_ce_loss": 0.16832423210144043, "step": 17930 }, { "epoch": 5.980653769179453, "loss": 0.6508606672286987, "step": 17930 }, { "ce_loss": 0.1293591558933258, "epoch": 5.980653769179453, "step": 17930 }, { "distill_loss": 0.3075183629989624, "epoch": 5.980653769179453, "step": 17930 }, { "epoch": 5.980653769179453, "ref_ce_loss": 0.11165163666009903, "step": 17930 }, { "epoch": 5.983989326217478, "loss": 0.6488, "step": 17940 }, { "epoch": 5.983989326217478, "grad_norm": 1.510502576828003, "step": 17940 }, { "epoch": 5.983989326217478, "learning_rate": 0.0002932758527867196, "step": 17940 }, { "epoch": 5.983989326217478, "loss": 0.5948408842086792, "step": 17940 }, { "ce_loss": 0.13838577270507812, "epoch": 5.983989326217478, "step": 17940 }, { "distill_loss": 0.26575592160224915, "epoch": 5.983989326217478, "step": 17940 }, { "epoch": 5.983989326217478, "ref_ce_loss": 0.1424756646156311, "step": 17940 }, { "epoch": 5.983989326217478, "loss": 0.6306717991828918, "step": 17940 }, { "ce_loss": 0.14688876271247864, "epoch": 5.983989326217478, "step": 17940 }, { "distill_loss": 0.2843662202358246, "epoch": 5.983989326217478, "step": 17940 }, { "epoch": 5.983989326217478, "ref_ce_loss": 0.1535085290670395, "step": 17940 }, { "epoch": 5.9873248832555035, "loss": 0.6957, "step": 17950 }, { "epoch": 5.9873248832555035, "grad_norm": 5.702191352844238, "step": 17950 }, { "epoch": 5.9873248832555035, "learning_rate": 0.0002928594492695586, "step": 17950 }, { "epoch": 5.9873248832555035, "loss": 0.5361848473548889, "step": 17950 }, { "ce_loss": 0.10649754852056503, "epoch": 5.9873248832555035, "step": 17950 }, { "distill_loss": 0.15731768310070038, "epoch": 5.9873248832555035, "step": 17950 }, { "epoch": 5.9873248832555035, "ref_ce_loss": 0.11794476956129074, "step": 17950 }, { "epoch": 5.9873248832555035, "loss": 0.7358103394508362, "step": 17950 }, { "ce_loss": 0.11887186020612717, "epoch": 5.9873248832555035, "step": 17950 }, { "distill_loss": 0.19445015490055084, "epoch": 5.9873248832555035, "step": 17950 }, { "epoch": 5.9873248832555035, "ref_ce_loss": 0.10782976448535919, "step": 17950 }, { "epoch": 5.990660440293529, "loss": 0.6542, "step": 17960 }, { "epoch": 5.990660440293529, "grad_norm": 1.7833524942398071, "step": 17960 }, { "epoch": 5.990660440293529, "learning_rate": 0.00029244317079694915, "step": 17960 }, { "epoch": 5.990660440293529, "loss": 0.6083264946937561, "step": 17960 }, { "ce_loss": 0.12414541840553284, "epoch": 5.990660440293529, "step": 17960 }, { "distill_loss": 0.2250789850950241, "epoch": 5.990660440293529, "step": 17960 }, { "epoch": 5.990660440293529, "ref_ce_loss": 0.13253392279148102, "step": 17960 }, { "epoch": 5.990660440293529, "loss": 0.5254873037338257, "step": 17960 }, { "ce_loss": 0.1291838437318802, "epoch": 5.990660440293529, "step": 17960 }, { "distill_loss": 0.25294917821884155, "epoch": 5.990660440293529, "step": 17960 }, { "epoch": 5.990660440293529, "ref_ce_loss": 0.11796507984399796, "step": 17960 }, { "epoch": 5.993995997331554, "loss": 0.6692, "step": 17970 }, { "epoch": 5.993995997331554, "grad_norm": 1.4126019477844238, "step": 17970 }, { "epoch": 5.993995997331554, "learning_rate": 0.0002920270178547329, "step": 17970 }, { "epoch": 5.993995997331554, "loss": 0.5689122676849365, "step": 17970 }, { "ce_loss": 0.18033574521541595, "epoch": 5.993995997331554, "step": 17970 }, { "distill_loss": 0.21025502681732178, "epoch": 5.993995997331554, "step": 17970 }, { "epoch": 5.993995997331554, "ref_ce_loss": 0.13225938379764557, "step": 17970 }, { "epoch": 5.993995997331554, "loss": 0.5316141247749329, "step": 17970 }, { "ce_loss": 0.1508607119321823, "epoch": 5.993995997331554, "step": 17970 }, { "distill_loss": 0.22014158964157104, "epoch": 5.993995997331554, "step": 17970 }, { "epoch": 5.993995997331554, "ref_ce_loss": 0.12547865509986877, "step": 17970 }, { "epoch": 5.9973315543695795, "loss": 0.6219, "step": 17980 }, { "epoch": 5.9973315543695795, "grad_norm": 2.276890277862549, "step": 17980 }, { "epoch": 5.9973315543695795, "learning_rate": 0.00029161099092860527, "step": 17980 }, { "epoch": 5.9973315543695795, "loss": 0.6732531785964966, "step": 17980 }, { "ce_loss": 0.16249193251132965, "epoch": 5.9973315543695795, "step": 17980 }, { "distill_loss": 0.2430381178855896, "epoch": 5.9973315543695795, "step": 17980 }, { "epoch": 5.9973315543695795, "ref_ce_loss": 0.16172070801258087, "step": 17980 }, { "epoch": 5.9973315543695795, "loss": 0.478391170501709, "step": 17980 }, { "ce_loss": 0.13364195823669434, "epoch": 5.9973315543695795, "step": 17980 }, { "distill_loss": 0.21999022364616394, "epoch": 5.9973315543695795, "step": 17980 }, { "epoch": 5.9973315543695795, "ref_ce_loss": 0.12411677092313766, "step": 17980 }, { "epoch": 6.000667111407605, "loss": 0.7267, "step": 17990 }, { "epoch": 6.000667111407605, "grad_norm": 3.1330134868621826, "step": 17990 }, { "epoch": 6.000667111407605, "learning_rate": 0.00029119509050411435, "step": 17990 }, { "epoch": 6.000667111407605, "loss": 0.6111536026000977, "step": 17990 }, { "ce_loss": 0.12887738645076752, "epoch": 6.000667111407605, "step": 17990 }, { "distill_loss": 0.25863373279571533, "epoch": 6.000667111407605, "step": 17990 }, { "epoch": 6.000667111407605, "ref_ce_loss": 0.133753702044487, "step": 17990 }, { "epoch": 6.000667111407605, "loss": 0.43454331159591675, "step": 17990 }, { "ce_loss": 0.08036650717258453, "epoch": 6.000667111407605, "step": 17990 }, { "distill_loss": 0.16697892546653748, "epoch": 6.000667111407605, "step": 17990 }, { "epoch": 6.000667111407605, "ref_ce_loss": 0.0892324298620224, "step": 17990 }, { "epoch": 6.00400266844563, "loss": 0.6487, "step": 18000 }, { "epoch": 6.00400266844563, "grad_norm": 1.4188312292099, "step": 18000 }, { "epoch": 6.00400266844563, "learning_rate": 0.0002907793170666606, "step": 18000 }, { "epoch": 6.00400266844563, "loss": 0.6451430320739746, "step": 18000 }, { "ce_loss": 0.1146027222275734, "epoch": 6.00400266844563, "step": 18000 }, { "distill_loss": 0.277157187461853, "epoch": 6.00400266844563, "step": 18000 }, { "epoch": 6.00400266844563, "ref_ce_loss": 0.10770218074321747, "step": 18000 }, { "epoch": 6.00400266844563, "loss": 0.84647536277771, "step": 18000 }, { "ce_loss": 0.17758968472480774, "epoch": 6.00400266844563, "step": 18000 }, { "distill_loss": 0.31190869212150574, "epoch": 6.00400266844563, "step": 18000 }, { "epoch": 6.00400266844563, "ref_ce_loss": 0.1489148736000061, "step": 18000 }, { "epoch": 6.007338225483656, "loss": 0.7068, "step": 18010 }, { "epoch": 6.007338225483656, "grad_norm": 2.7114202976226807, "step": 18010 }, { "epoch": 6.007338225483656, "learning_rate": 0.0002903636711014966, "step": 18010 }, { "epoch": 6.007338225483656, "loss": 0.5021632313728333, "step": 18010 }, { "ce_loss": 0.09225542098283768, "epoch": 6.007338225483656, "step": 18010 }, { "distill_loss": 0.27368324995040894, "epoch": 6.007338225483656, "step": 18010 }, { "epoch": 6.007338225483656, "ref_ce_loss": 0.0975363552570343, "step": 18010 }, { "epoch": 6.007338225483656, "loss": 0.6700934171676636, "step": 18010 }, { "ce_loss": 0.19836029410362244, "epoch": 6.007338225483656, "step": 18010 }, { "distill_loss": 0.33002209663391113, "epoch": 6.007338225483656, "step": 18010 }, { "epoch": 6.007338225483656, "ref_ce_loss": 0.11219165474176407, "step": 18010 }, { "epoch": 6.010673782521681, "loss": 0.7095, "step": 18020 }, { "epoch": 6.010673782521681, "grad_norm": 3.8896095752716064, "step": 18020 }, { "epoch": 6.010673782521681, "learning_rate": 0.0002899481530937257, "step": 18020 }, { "epoch": 6.010673782521681, "loss": 0.654662549495697, "step": 18020 }, { "ce_loss": 0.10406067222356796, "epoch": 6.010673782521681, "step": 18020 }, { "distill_loss": 0.30926966667175293, "epoch": 6.010673782521681, "step": 18020 }, { "epoch": 6.010673782521681, "ref_ce_loss": 0.1180790588259697, "step": 18020 }, { "epoch": 6.010673782521681, "loss": 0.46157923340797424, "step": 18020 }, { "ce_loss": 0.0768638551235199, "epoch": 6.010673782521681, "step": 18020 }, { "distill_loss": 0.25196242332458496, "epoch": 6.010673782521681, "step": 18020 }, { "epoch": 6.010673782521681, "ref_ce_loss": 0.08556817471981049, "step": 18020 }, { "epoch": 6.014009339559706, "loss": 0.6762, "step": 18030 }, { "epoch": 6.014009339559706, "grad_norm": 1.584222674369812, "step": 18030 }, { "epoch": 6.014009339559706, "learning_rate": 0.0002895327635283021, "step": 18030 }, { "epoch": 6.014009339559706, "loss": 0.6818699836730957, "step": 18030 }, { "ce_loss": 0.1505424976348877, "epoch": 6.014009339559706, "step": 18030 }, { "distill_loss": 0.3525632917881012, "epoch": 6.014009339559706, "step": 18030 }, { "epoch": 6.014009339559706, "ref_ce_loss": 0.12546685338020325, "step": 18030 }, { "epoch": 6.014009339559706, "loss": 0.48431217670440674, "step": 18030 }, { "ce_loss": 0.12620176374912262, "epoch": 6.014009339559706, "step": 18030 }, { "distill_loss": 0.21929430961608887, "epoch": 6.014009339559706, "step": 18030 }, { "epoch": 6.014009339559706, "ref_ce_loss": 0.11054594814777374, "step": 18030 }, { "epoch": 6.017344896597732, "loss": 0.7209, "step": 18040 }, { "epoch": 6.017344896597732, "grad_norm": 3.463015079498291, "step": 18040 }, { "epoch": 6.017344896597732, "learning_rate": 0.00028911750289003043, "step": 18040 }, { "epoch": 6.017344896597732, "loss": 0.5016592144966125, "step": 18040 }, { "ce_loss": 0.12095203250646591, "epoch": 6.017344896597732, "step": 18040 }, { "distill_loss": 0.27965325117111206, "epoch": 6.017344896597732, "step": 18040 }, { "epoch": 6.017344896597732, "ref_ce_loss": 0.10088794678449631, "step": 18040 }, { "epoch": 6.017344896597732, "loss": 0.8476541042327881, "step": 18040 }, { "ce_loss": 0.1933862417936325, "epoch": 6.017344896597732, "step": 18040 }, { "distill_loss": 0.4288746118545532, "epoch": 6.017344896597732, "step": 18040 }, { "epoch": 6.017344896597732, "ref_ce_loss": 0.12653212249279022, "step": 18040 }, { "epoch": 6.020680453635757, "loss": 0.7125, "step": 18050 }, { "epoch": 6.020680453635757, "grad_norm": 1.573563814163208, "step": 18050 }, { "epoch": 6.020680453635757, "learning_rate": 0.00028870237166356424, "step": 18050 }, { "epoch": 6.020680453635757, "loss": 0.5512456297874451, "step": 18050 }, { "ce_loss": 0.07267727702856064, "epoch": 6.020680453635757, "step": 18050 }, { "distill_loss": 0.2865201532840729, "epoch": 6.020680453635757, "step": 18050 }, { "epoch": 6.020680453635757, "ref_ce_loss": 0.08283307403326035, "step": 18050 }, { "epoch": 6.020680453635757, "loss": 0.6286212205886841, "step": 18050 }, { "ce_loss": 0.1411493569612503, "epoch": 6.020680453635757, "step": 18050 }, { "distill_loss": 0.30221548676490784, "epoch": 6.020680453635757, "step": 18050 }, { "epoch": 6.020680453635757, "ref_ce_loss": 0.1384388655424118, "step": 18050 }, { "epoch": 6.024016010673782, "loss": 0.6186, "step": 18060 }, { "epoch": 6.024016010673782, "grad_norm": 1.77467942237854, "step": 18060 }, { "epoch": 6.024016010673782, "learning_rate": 0.0002882873703334065, "step": 18060 }, { "epoch": 6.024016010673782, "loss": 0.525521993637085, "step": 18060 }, { "ce_loss": 0.1251913160085678, "epoch": 6.024016010673782, "step": 18060 }, { "distill_loss": 0.305527925491333, "epoch": 6.024016010673782, "step": 18060 }, { "epoch": 6.024016010673782, "ref_ce_loss": 0.09452289342880249, "step": 18060 }, { "epoch": 6.024016010673782, "loss": 0.7104737162590027, "step": 18060 }, { "ce_loss": 0.09152280539274216, "epoch": 6.024016010673782, "step": 18060 }, { "distill_loss": 0.22016364336013794, "epoch": 6.024016010673782, "step": 18060 }, { "epoch": 6.024016010673782, "ref_ce_loss": 0.1253073811531067, "step": 18060 }, { "epoch": 6.027351567711808, "loss": 0.6115, "step": 18070 }, { "epoch": 6.027351567711808, "grad_norm": 1.388637900352478, "step": 18070 }, { "epoch": 6.027351567711808, "learning_rate": 0.0002878724993839083, "step": 18070 }, { "epoch": 6.027351567711808, "loss": 0.7839659452438354, "step": 18070 }, { "ce_loss": 0.15709657967090607, "epoch": 6.027351567711808, "step": 18070 }, { "distill_loss": 0.23128685355186462, "epoch": 6.027351567711808, "step": 18070 }, { "epoch": 6.027351567711808, "ref_ce_loss": 0.1473115086555481, "step": 18070 }, { "epoch": 6.027351567711808, "loss": 0.4913570284843445, "step": 18070 }, { "ce_loss": 0.10992088168859482, "epoch": 6.027351567711808, "step": 18070 }, { "distill_loss": 0.23345129191875458, "epoch": 6.027351567711808, "step": 18070 }, { "epoch": 6.027351567711808, "ref_ce_loss": 0.12018200010061264, "step": 18070 }, { "epoch": 6.030687124749833, "loss": 0.605, "step": 18080 }, { "epoch": 6.030687124749833, "grad_norm": 1.437814712524414, "step": 18080 }, { "epoch": 6.030687124749833, "learning_rate": 0.0002874577592992688, "step": 18080 }, { "epoch": 6.030687124749833, "loss": 0.8127552270889282, "step": 18080 }, { "ce_loss": 0.19625712931156158, "epoch": 6.030687124749833, "step": 18080 }, { "distill_loss": 0.33098307251930237, "epoch": 6.030687124749833, "step": 18080 }, { "epoch": 6.030687124749833, "ref_ce_loss": 0.12643705308437347, "step": 18080 }, { "epoch": 6.030687124749833, "loss": 0.6383655667304993, "step": 18080 }, { "ce_loss": 0.14720506966114044, "epoch": 6.030687124749833, "step": 18080 }, { "distill_loss": 0.2253483533859253, "epoch": 6.030687124749833, "step": 18080 }, { "epoch": 6.030687124749833, "ref_ce_loss": 0.11266171187162399, "step": 18080 }, { "epoch": 6.034022681787858, "loss": 0.6341, "step": 18090 }, { "epoch": 6.034022681787858, "grad_norm": 1.5874073505401611, "step": 18090 }, { "epoch": 6.034022681787858, "learning_rate": 0.00028704315056353414, "step": 18090 }, { "epoch": 6.034022681787858, "loss": 0.6964699625968933, "step": 18090 }, { "ce_loss": 0.1716633439064026, "epoch": 6.034022681787858, "step": 18090 }, { "distill_loss": 0.2955109179019928, "epoch": 6.034022681787858, "step": 18090 }, { "epoch": 6.034022681787858, "ref_ce_loss": 0.11812959611415863, "step": 18090 }, { "epoch": 6.034022681787858, "loss": 0.5283066630363464, "step": 18090 }, { "ce_loss": 0.12395858019590378, "epoch": 6.034022681787858, "step": 18090 }, { "distill_loss": 0.23972785472869873, "epoch": 6.034022681787858, "step": 18090 }, { "epoch": 6.034022681787858, "ref_ce_loss": 0.10385114699602127, "step": 18090 }, { "epoch": 6.037358238825884, "loss": 0.6332, "step": 18100 }, { "epoch": 6.037358238825884, "grad_norm": 1.2264511585235596, "step": 18100 }, { "epoch": 6.037358238825884, "learning_rate": 0.00028662867366059756, "step": 18100 }, { "epoch": 6.037358238825884, "loss": 0.8792027235031128, "step": 18100 }, { "ce_loss": 0.17050066590309143, "epoch": 6.037358238825884, "step": 18100 }, { "distill_loss": 0.2821817696094513, "epoch": 6.037358238825884, "step": 18100 }, { "epoch": 6.037358238825884, "ref_ce_loss": 0.1407461166381836, "step": 18100 }, { "epoch": 6.037358238825884, "loss": 0.5104156732559204, "step": 18100 }, { "ce_loss": 0.14286614954471588, "epoch": 6.037358238825884, "step": 18100 }, { "distill_loss": 0.2257169485092163, "epoch": 6.037358238825884, "step": 18100 }, { "epoch": 6.037358238825884, "ref_ce_loss": 0.10134001076221466, "step": 18100 }, { "epoch": 6.040693795863909, "loss": 0.6198, "step": 18110 }, { "epoch": 6.040693795863909, "grad_norm": 2.148345708847046, "step": 18110 }, { "epoch": 6.040693795863909, "learning_rate": 0.000286214329074198, "step": 18110 }, { "epoch": 6.040693795863909, "loss": 0.4621307849884033, "step": 18110 }, { "ce_loss": 0.0889039933681488, "epoch": 6.040693795863909, "step": 18110 }, { "distill_loss": 0.23973098397254944, "epoch": 6.040693795863909, "step": 18110 }, { "epoch": 6.040693795863909, "ref_ce_loss": 0.08387582004070282, "step": 18110 }, { "epoch": 6.040693795863909, "loss": 0.597586989402771, "step": 18110 }, { "ce_loss": 0.18086780607700348, "epoch": 6.040693795863909, "step": 18110 }, { "distill_loss": 0.2776992619037628, "epoch": 6.040693795863909, "step": 18110 }, { "epoch": 6.040693795863909, "ref_ce_loss": 0.13857479393482208, "step": 18110 }, { "epoch": 6.044029352901934, "loss": 0.6028, "step": 18120 }, { "epoch": 6.044029352901934, "grad_norm": 1.5330827236175537, "step": 18120 }, { "epoch": 6.044029352901934, "learning_rate": 0.0002858001172879202, "step": 18120 }, { "epoch": 6.044029352901934, "loss": 0.5574997663497925, "step": 18120 }, { "ce_loss": 0.13679958879947662, "epoch": 6.044029352901934, "step": 18120 }, { "distill_loss": 0.2656584680080414, "epoch": 6.044029352901934, "step": 18120 }, { "epoch": 6.044029352901934, "ref_ce_loss": 0.15457755327224731, "step": 18120 }, { "epoch": 6.044029352901934, "loss": 0.49670228362083435, "step": 18120 }, { "ce_loss": 0.1344117373228073, "epoch": 6.044029352901934, "step": 18120 }, { "distill_loss": 0.23335647583007812, "epoch": 6.044029352901934, "step": 18120 }, { "epoch": 6.044029352901934, "ref_ce_loss": 0.09552182257175446, "step": 18120 }, { "epoch": 6.04736490993996, "loss": 0.6797, "step": 18130 }, { "epoch": 6.04736490993996, "grad_norm": 1.3088942766189575, "step": 18130 }, { "epoch": 6.04736490993996, "learning_rate": 0.00028538603878519407, "step": 18130 }, { "epoch": 6.04736490993996, "loss": 0.743168294429779, "step": 18130 }, { "ce_loss": 0.1520257294178009, "epoch": 6.04736490993996, "step": 18130 }, { "distill_loss": 0.3432425558567047, "epoch": 6.04736490993996, "step": 18130 }, { "epoch": 6.04736490993996, "ref_ce_loss": 0.11429043114185333, "step": 18130 }, { "epoch": 6.04736490993996, "loss": 0.8178507089614868, "step": 18130 }, { "ce_loss": 0.14292679727077484, "epoch": 6.04736490993996, "step": 18130 }, { "distill_loss": 0.30670297145843506, "epoch": 6.04736490993996, "step": 18130 }, { "epoch": 6.04736490993996, "ref_ce_loss": 0.15452007949352264, "step": 18130 }, { "epoch": 6.050700466977985, "loss": 0.7141, "step": 18140 }, { "epoch": 6.050700466977985, "grad_norm": 1.584452748298645, "step": 18140 }, { "epoch": 6.050700466977985, "learning_rate": 0.00028497209404929345, "step": 18140 }, { "epoch": 6.050700466977985, "loss": 0.5253815650939941, "step": 18140 }, { "ce_loss": 0.13081501424312592, "epoch": 6.050700466977985, "step": 18140 }, { "distill_loss": 0.27755340933799744, "epoch": 6.050700466977985, "step": 18140 }, { "epoch": 6.050700466977985, "ref_ce_loss": 0.11679794639348984, "step": 18140 }, { "epoch": 6.050700466977985, "loss": 0.7471408247947693, "step": 18140 }, { "ce_loss": 0.15584027767181396, "epoch": 6.050700466977985, "step": 18140 }, { "distill_loss": 0.3901218771934509, "epoch": 6.050700466977985, "step": 18140 }, { "epoch": 6.050700466977985, "ref_ce_loss": 0.15860076248645782, "step": 18140 }, { "epoch": 6.0540360240160105, "loss": 0.6564, "step": 18150 }, { "epoch": 6.0540360240160105, "grad_norm": 1.770574927330017, "step": 18150 }, { "epoch": 6.0540360240160105, "learning_rate": 0.0002845582835633367, "step": 18150 }, { "epoch": 6.0540360240160105, "loss": 0.6072221994400024, "step": 18150 }, { "ce_loss": 0.13476058840751648, "epoch": 6.0540360240160105, "step": 18150 }, { "distill_loss": 0.29309630393981934, "epoch": 6.0540360240160105, "step": 18150 }, { "epoch": 6.0540360240160105, "ref_ce_loss": 0.10323754698038101, "step": 18150 }, { "epoch": 6.0540360240160105, "loss": 0.4680664837360382, "step": 18150 }, { "ce_loss": 0.07075583934783936, "epoch": 6.0540360240160105, "step": 18150 }, { "distill_loss": 0.24169869720935822, "epoch": 6.0540360240160105, "step": 18150 }, { "epoch": 6.0540360240160105, "ref_ce_loss": 0.11755290627479553, "step": 18150 }, { "epoch": 6.057371581054036, "loss": 0.6628, "step": 18160 }, { "epoch": 6.057371581054036, "grad_norm": 1.2780784368515015, "step": 18160 }, { "epoch": 6.057371581054036, "learning_rate": 0.0002841446078102852, "step": 18160 }, { "epoch": 6.057371581054036, "loss": 0.6397781372070312, "step": 18160 }, { "ce_loss": 0.12417633086442947, "epoch": 6.057371581054036, "step": 18160 }, { "distill_loss": 0.2742384672164917, "epoch": 6.057371581054036, "step": 18160 }, { "epoch": 6.057371581054036, "ref_ce_loss": 0.12048876285552979, "step": 18160 }, { "epoch": 6.057371581054036, "loss": 0.8331379890441895, "step": 18160 }, { "ce_loss": 0.16268602013587952, "epoch": 6.057371581054036, "step": 18160 }, { "distill_loss": 0.24811343848705292, "epoch": 6.057371581054036, "step": 18160 }, { "epoch": 6.057371581054036, "ref_ce_loss": 0.15608720481395721, "step": 18160 }, { "epoch": 6.060707138092061, "loss": 0.6458, "step": 18170 }, { "epoch": 6.060707138092061, "grad_norm": 1.2573291063308716, "step": 18170 }, { "epoch": 6.060707138092061, "learning_rate": 0.00028373106727294276, "step": 18170 }, { "epoch": 6.060707138092061, "loss": 0.9425279498100281, "step": 18170 }, { "ce_loss": 0.17792847752571106, "epoch": 6.060707138092061, "step": 18170 }, { "distill_loss": 0.3310987949371338, "epoch": 6.060707138092061, "step": 18170 }, { "epoch": 6.060707138092061, "ref_ce_loss": 0.12824629247188568, "step": 18170 }, { "epoch": 6.060707138092061, "loss": 0.5275480151176453, "step": 18170 }, { "ce_loss": 0.11334899067878723, "epoch": 6.060707138092061, "step": 18170 }, { "distill_loss": 0.27897968888282776, "epoch": 6.060707138092061, "step": 18170 }, { "epoch": 6.060707138092061, "ref_ce_loss": 0.13500186800956726, "step": 18170 }, { "epoch": 6.0640426951300865, "loss": 0.6116, "step": 18180 }, { "epoch": 6.0640426951300865, "grad_norm": 1.9847238063812256, "step": 18180 }, { "epoch": 6.0640426951300865, "learning_rate": 0.000283317662433956, "step": 18180 }, { "epoch": 6.0640426951300865, "loss": 0.5519295334815979, "step": 18180 }, { "ce_loss": 0.14950774610042572, "epoch": 6.0640426951300865, "step": 18180 }, { "distill_loss": 0.26175040006637573, "epoch": 6.0640426951300865, "step": 18180 }, { "epoch": 6.0640426951300865, "ref_ce_loss": 0.14031194150447845, "step": 18180 }, { "epoch": 6.0640426951300865, "loss": 0.4585942029953003, "step": 18180 }, { "ce_loss": 0.0969991460442543, "epoch": 6.0640426951300865, "step": 18180 }, { "distill_loss": 0.2302173674106598, "epoch": 6.0640426951300865, "step": 18180 }, { "epoch": 6.0640426951300865, "ref_ce_loss": 0.09909547120332718, "step": 18180 }, { "epoch": 6.067378252168112, "loss": 0.6563, "step": 18190 }, { "epoch": 6.067378252168112, "grad_norm": 1.5907666683197021, "step": 18190 }, { "epoch": 6.067378252168112, "learning_rate": 0.00028290439377581263, "step": 18190 }, { "epoch": 6.067378252168112, "loss": 0.4612014591693878, "step": 18190 }, { "ce_loss": 0.10405223071575165, "epoch": 6.067378252168112, "step": 18190 }, { "distill_loss": 0.2557392120361328, "epoch": 6.067378252168112, "step": 18190 }, { "epoch": 6.067378252168112, "ref_ce_loss": 0.10129746794700623, "step": 18190 }, { "epoch": 6.067378252168112, "loss": 0.5775993466377258, "step": 18190 }, { "ce_loss": 0.1516268253326416, "epoch": 6.067378252168112, "step": 18190 }, { "distill_loss": 0.3129112720489502, "epoch": 6.067378252168112, "step": 18190 }, { "epoch": 6.067378252168112, "ref_ce_loss": 0.11288562417030334, "step": 18190 }, { "epoch": 6.070713809206137, "loss": 0.5735, "step": 18200 }, { "epoch": 6.070713809206137, "grad_norm": 1.4031167030334473, "step": 18200 }, { "epoch": 6.070713809206137, "learning_rate": 0.0002824912617808418, "step": 18200 }, { "epoch": 6.070713809206137, "loss": 0.7670556902885437, "step": 18200 }, { "ce_loss": 0.18870723247528076, "epoch": 6.070713809206137, "step": 18200 }, { "distill_loss": 0.37014707922935486, "epoch": 6.070713809206137, "step": 18200 }, { "epoch": 6.070713809206137, "ref_ce_loss": 0.16599838435649872, "step": 18200 }, { "epoch": 6.070713809206137, "loss": 0.6280184984207153, "step": 18200 }, { "ce_loss": 0.13828735053539276, "epoch": 6.070713809206137, "step": 18200 }, { "distill_loss": 0.30158185958862305, "epoch": 6.070713809206137, "step": 18200 }, { "epoch": 6.070713809206137, "ref_ce_loss": 0.12955552339553833, "step": 18200 }, { "epoch": 6.074049366244163, "loss": 0.7286, "step": 18210 }, { "epoch": 6.074049366244163, "grad_norm": 1.8095005750656128, "step": 18210 }, { "epoch": 6.074049366244163, "learning_rate": 0.00028207826693121287, "step": 18210 }, { "epoch": 6.074049366244163, "loss": 0.7221490144729614, "step": 18210 }, { "ce_loss": 0.17217539250850677, "epoch": 6.074049366244163, "step": 18210 }, { "distill_loss": 0.30401742458343506, "epoch": 6.074049366244163, "step": 18210 }, { "epoch": 6.074049366244163, "ref_ce_loss": 0.12987874448299408, "step": 18210 }, { "epoch": 6.074049366244163, "loss": 0.6990206241607666, "step": 18210 }, { "ce_loss": 0.1532217115163803, "epoch": 6.074049366244163, "step": 18210 }, { "distill_loss": 0.342460036277771, "epoch": 6.074049366244163, "step": 18210 }, { "epoch": 6.074049366244163, "ref_ce_loss": 0.15716324746608734, "step": 18210 }, { "epoch": 6.077384923282188, "loss": 0.6538, "step": 18220 }, { "epoch": 6.077384923282188, "grad_norm": 1.2875056266784668, "step": 18220 }, { "epoch": 6.077384923282188, "learning_rate": 0.0002816654097089354, "step": 18220 }, { "epoch": 6.077384923282188, "loss": 0.9460160732269287, "step": 18220 }, { "ce_loss": 0.1782992035150528, "epoch": 6.077384923282188, "step": 18220 }, { "distill_loss": 0.30229833722114563, "epoch": 6.077384923282188, "step": 18220 }, { "epoch": 6.077384923282188, "ref_ce_loss": 0.11988240480422974, "step": 18220 }, { "epoch": 6.077384923282188, "loss": 0.6548318862915039, "step": 18220 }, { "ce_loss": 0.14591868221759796, "epoch": 6.077384923282188, "step": 18220 }, { "distill_loss": 0.3423047363758087, "epoch": 6.077384923282188, "step": 18220 }, { "epoch": 6.077384923282188, "ref_ce_loss": 0.11783643066883087, "step": 18220 }, { "epoch": 6.080720480320213, "loss": 0.7018, "step": 18230 }, { "epoch": 6.080720480320213, "grad_norm": 1.6654572486877441, "step": 18230 }, { "epoch": 6.080720480320213, "learning_rate": 0.0002812526905958581, "step": 18230 }, { "epoch": 6.080720480320213, "loss": 0.6765292882919312, "step": 18230 }, { "ce_loss": 0.13385047018527985, "epoch": 6.080720480320213, "step": 18230 }, { "distill_loss": 0.3190274238586426, "epoch": 6.080720480320213, "step": 18230 }, { "epoch": 6.080720480320213, "ref_ce_loss": 0.12536045908927917, "step": 18230 }, { "epoch": 6.080720480320213, "loss": 0.4414009153842926, "step": 18230 }, { "ce_loss": 0.0661691203713417, "epoch": 6.080720480320213, "step": 18230 }, { "distill_loss": 0.160070538520813, "epoch": 6.080720480320213, "step": 18230 }, { "epoch": 6.080720480320213, "ref_ce_loss": 0.08345791697502136, "step": 18230 }, { "epoch": 6.084056037358239, "loss": 0.6399, "step": 18240 }, { "epoch": 6.084056037358239, "grad_norm": 1.5034816265106201, "step": 18240 }, { "epoch": 6.084056037358239, "learning_rate": 0.0002808401100736687, "step": 18240 }, { "epoch": 6.084056037358239, "loss": 0.8535445332527161, "step": 18240 }, { "ce_loss": 0.22564417123794556, "epoch": 6.084056037358239, "step": 18240 }, { "distill_loss": 0.3347397446632385, "epoch": 6.084056037358239, "step": 18240 }, { "epoch": 6.084056037358239, "ref_ce_loss": 0.13953503966331482, "step": 18240 }, { "epoch": 6.084056037358239, "loss": 0.6334425806999207, "step": 18240 }, { "ce_loss": 0.18967677652835846, "epoch": 6.084056037358239, "step": 18240 }, { "distill_loss": 0.3053939938545227, "epoch": 6.084056037358239, "step": 18240 }, { "epoch": 6.084056037358239, "ref_ce_loss": 0.138227179646492, "step": 18240 }, { "epoch": 6.087391594396264, "loss": 0.5813, "step": 18250 }, { "epoch": 6.087391594396264, "grad_norm": 2.3025712966918945, "step": 18250 }, { "epoch": 6.087391594396264, "learning_rate": 0.0002804276686238928, "step": 18250 }, { "epoch": 6.087391594396264, "loss": 0.4579137861728668, "step": 18250 }, { "ce_loss": 0.10760626941919327, "epoch": 6.087391594396264, "step": 18250 }, { "distill_loss": 0.22375169396400452, "epoch": 6.087391594396264, "step": 18250 }, { "epoch": 6.087391594396264, "ref_ce_loss": 0.12626978754997253, "step": 18250 }, { "epoch": 6.087391594396264, "loss": 0.5572217702865601, "step": 18250 }, { "ce_loss": 0.14132961630821228, "epoch": 6.087391594396264, "step": 18250 }, { "distill_loss": 0.25528982281684875, "epoch": 6.087391594396264, "step": 18250 }, { "epoch": 6.087391594396264, "ref_ce_loss": 0.1243145763874054, "step": 18250 }, { "epoch": 6.090727151434289, "loss": 0.5813, "step": 18260 }, { "epoch": 6.090727151434289, "grad_norm": 1.3174196481704712, "step": 18260 }, { "epoch": 6.090727151434289, "learning_rate": 0.00028001536672789414, "step": 18260 }, { "epoch": 6.090727151434289, "loss": 0.5696294903755188, "step": 18260 }, { "ce_loss": 0.15350523591041565, "epoch": 6.090727151434289, "step": 18260 }, { "distill_loss": 0.26622551679611206, "epoch": 6.090727151434289, "step": 18260 }, { "epoch": 6.090727151434289, "ref_ce_loss": 0.12345647066831589, "step": 18260 }, { "epoch": 6.090727151434289, "loss": 0.5882666707038879, "step": 18260 }, { "ce_loss": 0.0760028064250946, "epoch": 6.090727151434289, "step": 18260 }, { "distill_loss": 0.24794188141822815, "epoch": 6.090727151434289, "step": 18260 }, { "epoch": 6.090727151434289, "ref_ce_loss": 0.11833299696445465, "step": 18260 }, { "epoch": 6.094062708472315, "loss": 0.5987, "step": 18270 }, { "epoch": 6.094062708472315, "grad_norm": 1.450992226600647, "step": 18270 }, { "epoch": 6.094062708472315, "learning_rate": 0.0002796032048668734, "step": 18270 }, { "epoch": 6.094062708472315, "loss": 0.6237502694129944, "step": 18270 }, { "ce_loss": 0.18267591297626495, "epoch": 6.094062708472315, "step": 18270 }, { "distill_loss": 0.29955023527145386, "epoch": 6.094062708472315, "step": 18270 }, { "epoch": 6.094062708472315, "ref_ce_loss": 0.14133091270923615, "step": 18270 }, { "epoch": 6.094062708472315, "loss": 0.5703005790710449, "step": 18270 }, { "ce_loss": 0.12525999546051025, "epoch": 6.094062708472315, "step": 18270 }, { "distill_loss": 0.2633492350578308, "epoch": 6.094062708472315, "step": 18270 }, { "epoch": 6.094062708472315, "ref_ce_loss": 0.14371177554130554, "step": 18270 }, { "epoch": 6.09739826551034, "loss": 0.6531, "step": 18280 }, { "epoch": 6.09739826551034, "grad_norm": 1.9310940504074097, "step": 18280 }, { "epoch": 6.09739826551034, "learning_rate": 0.000279191183521868, "step": 18280 }, { "epoch": 6.09739826551034, "loss": 0.6407101154327393, "step": 18280 }, { "ce_loss": 0.13469745218753815, "epoch": 6.09739826551034, "step": 18280 }, { "distill_loss": 0.302370548248291, "epoch": 6.09739826551034, "step": 18280 }, { "epoch": 6.09739826551034, "ref_ce_loss": 0.15269644558429718, "step": 18280 }, { "epoch": 6.09739826551034, "loss": 0.462386816740036, "step": 18280 }, { "ce_loss": 0.0804935097694397, "epoch": 6.09739826551034, "step": 18280 }, { "distill_loss": 0.26779067516326904, "epoch": 6.09739826551034, "step": 18280 }, { "epoch": 6.09739826551034, "ref_ce_loss": 0.08050373941659927, "step": 18280 }, { "epoch": 6.100733822548365, "loss": 0.6526, "step": 18290 }, { "epoch": 6.100733822548365, "grad_norm": 1.4544947147369385, "step": 18290 }, { "epoch": 6.100733822548365, "learning_rate": 0.00027877930317375086, "step": 18290 }, { "epoch": 6.100733822548365, "loss": 0.5759723782539368, "step": 18290 }, { "ce_loss": 0.0904451534152031, "epoch": 6.100733822548365, "step": 18290 }, { "distill_loss": 0.31251609325408936, "epoch": 6.100733822548365, "step": 18290 }, { "epoch": 6.100733822548365, "ref_ce_loss": 0.13719967007637024, "step": 18290 }, { "epoch": 6.100733822548365, "loss": 0.5790067911148071, "step": 18290 }, { "ce_loss": 0.1277799904346466, "epoch": 6.100733822548365, "step": 18290 }, { "distill_loss": 0.30773675441741943, "epoch": 6.100733822548365, "step": 18290 }, { "epoch": 6.100733822548365, "ref_ce_loss": 0.14318691194057465, "step": 18290 }, { "epoch": 6.104069379586391, "loss": 0.6527, "step": 18300 }, { "epoch": 6.104069379586391, "grad_norm": 1.80874502658844, "step": 18300 }, { "epoch": 6.104069379586391, "learning_rate": 0.0002783675643032308, "step": 18300 }, { "epoch": 6.104069379586391, "loss": 0.570338249206543, "step": 18300 }, { "ce_loss": 0.13978049159049988, "epoch": 6.104069379586391, "step": 18300 }, { "distill_loss": 0.26612356305122375, "epoch": 6.104069379586391, "step": 18300 }, { "epoch": 6.104069379586391, "ref_ce_loss": 0.13457608222961426, "step": 18300 }, { "epoch": 6.104069379586391, "loss": 0.5437963008880615, "step": 18300 }, { "ce_loss": 0.10671614110469818, "epoch": 6.104069379586391, "step": 18300 }, { "distill_loss": 0.2908667325973511, "epoch": 6.104069379586391, "step": 18300 }, { "epoch": 6.104069379586391, "ref_ce_loss": 0.10786082595586777, "step": 18300 }, { "epoch": 6.107404936624416, "loss": 0.7081, "step": 18310 }, { "epoch": 6.107404936624416, "grad_norm": 1.6988565921783447, "step": 18310 }, { "epoch": 6.107404936624416, "learning_rate": 0.0002779559673908514, "step": 18310 }, { "epoch": 6.107404936624416, "loss": 0.5674381256103516, "step": 18310 }, { "ce_loss": 0.12075648456811905, "epoch": 6.107404936624416, "step": 18310 }, { "distill_loss": 0.2612060308456421, "epoch": 6.107404936624416, "step": 18310 }, { "epoch": 6.107404936624416, "ref_ce_loss": 0.09225589036941528, "step": 18310 }, { "epoch": 6.107404936624416, "loss": 0.8771853446960449, "step": 18310 }, { "ce_loss": 0.17971204221248627, "epoch": 6.107404936624416, "step": 18310 }, { "distill_loss": 0.2916775941848755, "epoch": 6.107404936624416, "step": 18310 }, { "epoch": 6.107404936624416, "ref_ce_loss": 0.1181989461183548, "step": 18310 }, { "epoch": 6.110740493662441, "loss": 0.6473, "step": 18320 }, { "epoch": 6.110740493662441, "grad_norm": 1.7148319482803345, "step": 18320 }, { "epoch": 6.110740493662441, "learning_rate": 0.00027754451291699063, "step": 18320 }, { "epoch": 6.110740493662441, "loss": 0.5265355110168457, "step": 18320 }, { "ce_loss": 0.10530180484056473, "epoch": 6.110740493662441, "step": 18320 }, { "distill_loss": 0.2391783744096756, "epoch": 6.110740493662441, "step": 18320 }, { "epoch": 6.110740493662441, "ref_ce_loss": 0.12191746383905411, "step": 18320 }, { "epoch": 6.110740493662441, "loss": 0.5689677000045776, "step": 18320 }, { "ce_loss": 0.10074944794178009, "epoch": 6.110740493662441, "step": 18320 }, { "distill_loss": 0.3271622061729431, "epoch": 6.110740493662441, "step": 18320 }, { "epoch": 6.110740493662441, "ref_ce_loss": 0.11129195988178253, "step": 18320 }, { "epoch": 6.114076050700467, "loss": 0.6375, "step": 18330 }, { "epoch": 6.114076050700467, "grad_norm": 1.5958406925201416, "step": 18330 }, { "epoch": 6.114076050700467, "learning_rate": 0.0002771332013618599, "step": 18330 }, { "epoch": 6.114076050700467, "loss": 0.5165610909461975, "step": 18330 }, { "ce_loss": 0.09935726225376129, "epoch": 6.114076050700467, "step": 18330 }, { "distill_loss": 0.2832036018371582, "epoch": 6.114076050700467, "step": 18330 }, { "epoch": 6.114076050700467, "ref_ce_loss": 0.13384084403514862, "step": 18330 }, { "epoch": 6.114076050700467, "loss": 0.5234933495521545, "step": 18330 }, { "ce_loss": 0.07920798659324646, "epoch": 6.114076050700467, "step": 18330 }, { "distill_loss": 0.2518523037433624, "epoch": 6.114076050700467, "step": 18330 }, { "epoch": 6.114076050700467, "ref_ce_loss": 0.11808153241872787, "step": 18330 }, { "epoch": 6.117411607738492, "loss": 0.6802, "step": 18340 }, { "epoch": 6.117411607738492, "grad_norm": 1.1820212602615356, "step": 18340 }, { "epoch": 6.117411607738492, "learning_rate": 0.00027672203320550434, "step": 18340 }, { "epoch": 6.117411607738492, "loss": 0.6534271240234375, "step": 18340 }, { "ce_loss": 0.18147516250610352, "epoch": 6.117411607738492, "step": 18340 }, { "distill_loss": 0.27505460381507874, "epoch": 6.117411607738492, "step": 18340 }, { "epoch": 6.117411607738492, "ref_ce_loss": 0.11951899528503418, "step": 18340 }, { "epoch": 6.117411607738492, "loss": 0.7164322733879089, "step": 18340 }, { "ce_loss": 0.1533847153186798, "epoch": 6.117411607738492, "step": 18340 }, { "distill_loss": 0.22848907113075256, "epoch": 6.117411607738492, "step": 18340 }, { "epoch": 6.117411607738492, "ref_ce_loss": 0.10289295017719269, "step": 18340 }, { "epoch": 6.1207471647765175, "loss": 0.6214, "step": 18350 }, { "epoch": 6.1207471647765175, "grad_norm": 1.0374150276184082, "step": 18350 }, { "epoch": 6.1207471647765175, "learning_rate": 0.00027631100892780116, "step": 18350 }, { "epoch": 6.1207471647765175, "loss": 0.5371941328048706, "step": 18350 }, { "ce_loss": 0.18888607621192932, "epoch": 6.1207471647765175, "step": 18350 }, { "distill_loss": 0.23849518597126007, "epoch": 6.1207471647765175, "step": 18350 }, { "epoch": 6.1207471647765175, "ref_ce_loss": 0.10944174975156784, "step": 18350 }, { "epoch": 6.1207471647765175, "loss": 0.40661856532096863, "step": 18350 }, { "ce_loss": 0.09315002709627151, "epoch": 6.1207471647765175, "step": 18350 }, { "distill_loss": 0.20016992092132568, "epoch": 6.1207471647765175, "step": 18350 }, { "epoch": 6.1207471647765175, "ref_ce_loss": 0.11312264204025269, "step": 18350 }, { "epoch": 6.124082721814543, "loss": 0.5196, "step": 18360 }, { "epoch": 6.124082721814543, "grad_norm": 1.4926507472991943, "step": 18360 }, { "epoch": 6.124082721814543, "learning_rate": 0.00027590012900846, "step": 18360 }, { "epoch": 6.124082721814543, "loss": 0.41106167435646057, "step": 18360 }, { "ce_loss": 0.08867556601762772, "epoch": 6.124082721814543, "step": 18360 }, { "distill_loss": 0.19158530235290527, "epoch": 6.124082721814543, "step": 18360 }, { "epoch": 6.124082721814543, "ref_ce_loss": 0.09265054017305374, "step": 18360 }, { "epoch": 6.124082721814543, "loss": 0.4671410620212555, "step": 18360 }, { "ce_loss": 0.13322880864143372, "epoch": 6.124082721814543, "step": 18360 }, { "distill_loss": 0.2158261090517044, "epoch": 6.124082721814543, "step": 18360 }, { "epoch": 6.124082721814543, "ref_ce_loss": 0.11775711923837662, "step": 18360 }, { "epoch": 6.127418278852568, "loss": 0.5083, "step": 18370 }, { "epoch": 6.127418278852568, "grad_norm": 1.4729706048965454, "step": 18370 }, { "epoch": 6.127418278852568, "learning_rate": 0.0002754893939270221, "step": 18370 }, { "epoch": 6.127418278852568, "loss": 0.5703041553497314, "step": 18370 }, { "ce_loss": 0.11886131763458252, "epoch": 6.127418278852568, "step": 18370 }, { "distill_loss": 0.2752629220485687, "epoch": 6.127418278852568, "step": 18370 }, { "epoch": 6.127418278852568, "ref_ce_loss": 0.10208799690008163, "step": 18370 }, { "epoch": 6.127418278852568, "loss": 1.0630515813827515, "step": 18370 }, { "ce_loss": 0.12333641946315765, "epoch": 6.127418278852568, "step": 18370 }, { "distill_loss": 0.27853256464004517, "epoch": 6.127418278852568, "step": 18370 }, { "epoch": 6.127418278852568, "ref_ce_loss": 0.08770790696144104, "step": 18370 }, { "epoch": 6.1307538358905935, "loss": 0.7476, "step": 18380 }, { "epoch": 6.1307538358905935, "grad_norm": 1.7509486675262451, "step": 18380 }, { "epoch": 6.1307538358905935, "learning_rate": 0.0002750788041628593, "step": 18380 }, { "epoch": 6.1307538358905935, "loss": 0.5735849142074585, "step": 18380 }, { "ce_loss": 0.16998191177845, "epoch": 6.1307538358905935, "step": 18380 }, { "distill_loss": 0.3000338077545166, "epoch": 6.1307538358905935, "step": 18380 }, { "epoch": 6.1307538358905935, "ref_ce_loss": 0.07679232954978943, "step": 18380 }, { "epoch": 6.1307538358905935, "loss": 0.6713649034500122, "step": 18380 }, { "ce_loss": 0.12456465512514114, "epoch": 6.1307538358905935, "step": 18380 }, { "distill_loss": 0.32041725516319275, "epoch": 6.1307538358905935, "step": 18380 }, { "epoch": 6.1307538358905935, "ref_ce_loss": 0.1309889256954193, "step": 18380 }, { "epoch": 6.134089392928619, "loss": 0.717, "step": 18390 }, { "epoch": 6.134089392928619, "grad_norm": 1.6819425821304321, "step": 18390 }, { "epoch": 6.134089392928619, "learning_rate": 0.0002746683601951743, "step": 18390 }, { "epoch": 6.134089392928619, "loss": 0.5657597780227661, "step": 18390 }, { "ce_loss": 0.10514745861291885, "epoch": 6.134089392928619, "step": 18390 }, { "distill_loss": 0.26283934712409973, "epoch": 6.134089392928619, "step": 18390 }, { "epoch": 6.134089392928619, "ref_ce_loss": 0.12309848517179489, "step": 18390 }, { "epoch": 6.134089392928619, "loss": 0.8041913509368896, "step": 18390 }, { "ce_loss": 0.17058131098747253, "epoch": 6.134089392928619, "step": 18390 }, { "distill_loss": 0.33370333909988403, "epoch": 6.134089392928619, "step": 18390 }, { "epoch": 6.134089392928619, "ref_ce_loss": 0.13370048999786377, "step": 18390 }, { "epoch": 6.137424949966644, "loss": 0.6625, "step": 18400 }, { "epoch": 6.137424949966644, "grad_norm": 1.8284673690795898, "step": 18400 }, { "epoch": 6.137424949966644, "learning_rate": 0.00027425806250299897, "step": 18400 }, { "epoch": 6.137424949966644, "loss": 0.6972324848175049, "step": 18400 }, { "ce_loss": 0.17571482062339783, "epoch": 6.137424949966644, "step": 18400 }, { "distill_loss": 0.3037504255771637, "epoch": 6.137424949966644, "step": 18400 }, { "epoch": 6.137424949966644, "ref_ce_loss": 0.1777459979057312, "step": 18400 }, { "epoch": 6.137424949966644, "loss": 0.6110414862632751, "step": 18400 }, { "ce_loss": 0.14571216702461243, "epoch": 6.137424949966644, "step": 18400 }, { "distill_loss": 0.33484402298927307, "epoch": 6.137424949966644, "step": 18400 }, { "epoch": 6.137424949966644, "ref_ce_loss": 0.13004137575626373, "step": 18400 }, { "epoch": 6.14076050700467, "loss": 0.5864, "step": 18410 }, { "epoch": 6.14076050700467, "grad_norm": 1.7372549772262573, "step": 18410 }, { "epoch": 6.14076050700467, "learning_rate": 0.0002738479115651953, "step": 18410 }, { "epoch": 6.14076050700467, "loss": 0.43518373370170593, "step": 18410 }, { "ce_loss": 0.11351682245731354, "epoch": 6.14076050700467, "step": 18410 }, { "distill_loss": 0.19356505572795868, "epoch": 6.14076050700467, "step": 18410 }, { "epoch": 6.14076050700467, "ref_ce_loss": 0.12793107330799103, "step": 18410 }, { "epoch": 6.14076050700467, "loss": 0.5647701025009155, "step": 18410 }, { "ce_loss": 0.13868297636508942, "epoch": 6.14076050700467, "step": 18410 }, { "distill_loss": 0.25933837890625, "epoch": 6.14076050700467, "step": 18410 }, { "epoch": 6.14076050700467, "ref_ce_loss": 0.133583664894104, "step": 18410 }, { "epoch": 6.144096064042695, "loss": 0.6318, "step": 18420 }, { "epoch": 6.144096064042695, "grad_norm": 1.194664478302002, "step": 18420 }, { "epoch": 6.144096064042695, "learning_rate": 0.0002734379078604532, "step": 18420 }, { "epoch": 6.144096064042695, "loss": 0.4552597105503082, "step": 18420 }, { "ce_loss": 0.09872724860906601, "epoch": 6.144096064042695, "step": 18420 }, { "distill_loss": 0.23186928033828735, "epoch": 6.144096064042695, "step": 18420 }, { "epoch": 6.144096064042695, "ref_ce_loss": 0.0990760400891304, "step": 18420 }, { "epoch": 6.144096064042695, "loss": 0.7257994413375854, "step": 18420 }, { "ce_loss": 0.2151937484741211, "epoch": 6.144096064042695, "step": 18420 }, { "distill_loss": 0.26309436559677124, "epoch": 6.144096064042695, "step": 18420 }, { "epoch": 6.144096064042695, "ref_ce_loss": 0.14622631669044495, "step": 18420 }, { "epoch": 6.14743162108072, "loss": 0.6626, "step": 18430 }, { "epoch": 6.14743162108072, "grad_norm": 2.8523292541503906, "step": 18430 }, { "epoch": 6.14743162108072, "learning_rate": 0.00027302805186729136, "step": 18430 }, { "epoch": 6.14743162108072, "loss": 0.6845064759254456, "step": 18430 }, { "ce_loss": 0.13927432894706726, "epoch": 6.14743162108072, "step": 18430 }, { "distill_loss": 0.3564937114715576, "epoch": 6.14743162108072, "step": 18430 }, { "epoch": 6.14743162108072, "ref_ce_loss": 0.14247988164424896, "step": 18430 }, { "epoch": 6.14743162108072, "loss": 0.6814566254615784, "step": 18430 }, { "ce_loss": 0.19597093760967255, "epoch": 6.14743162108072, "step": 18430 }, { "distill_loss": 0.3532121181488037, "epoch": 6.14743162108072, "step": 18430 }, { "epoch": 6.14743162108072, "ref_ce_loss": 0.13141068816184998, "step": 18430 }, { "epoch": 6.150767178118746, "loss": 0.6815, "step": 18440 }, { "epoch": 6.150767178118746, "grad_norm": 2.0527751445770264, "step": 18440 }, { "epoch": 6.150767178118746, "learning_rate": 0.0002726183440640557, "step": 18440 }, { "epoch": 6.150767178118746, "loss": 0.6522032022476196, "step": 18440 }, { "ce_loss": 0.1364910751581192, "epoch": 6.150767178118746, "step": 18440 }, { "distill_loss": 0.2703036665916443, "epoch": 6.150767178118746, "step": 18440 }, { "epoch": 6.150767178118746, "ref_ce_loss": 0.13433896005153656, "step": 18440 }, { "epoch": 6.150767178118746, "loss": 0.6549949645996094, "step": 18440 }, { "ce_loss": 0.15011869370937347, "epoch": 6.150767178118746, "step": 18440 }, { "distill_loss": 0.3180496394634247, "epoch": 6.150767178118746, "step": 18440 }, { "epoch": 6.150767178118746, "ref_ce_loss": 0.12303827702999115, "step": 18440 }, { "epoch": 6.154102735156771, "loss": 0.6876, "step": 18450 }, { "epoch": 6.154102735156771, "grad_norm": 1.4469850063323975, "step": 18450 }, { "epoch": 6.154102735156771, "learning_rate": 0.0002722087849289194, "step": 18450 }, { "epoch": 6.154102735156771, "loss": 0.8605791330337524, "step": 18450 }, { "ce_loss": 0.18606171011924744, "epoch": 6.154102735156771, "step": 18450 }, { "distill_loss": 0.3016606867313385, "epoch": 6.154102735156771, "step": 18450 }, { "epoch": 6.154102735156771, "ref_ce_loss": 0.1333146095275879, "step": 18450 }, { "epoch": 6.154102735156771, "loss": 0.6281872391700745, "step": 18450 }, { "ce_loss": 0.130160853266716, "epoch": 6.154102735156771, "step": 18450 }, { "distill_loss": 0.3120817244052887, "epoch": 6.154102735156771, "step": 18450 }, { "epoch": 6.154102735156771, "ref_ce_loss": 0.11217764019966125, "step": 18450 }, { "epoch": 6.157438292194796, "loss": 0.594, "step": 18460 }, { "epoch": 6.157438292194796, "grad_norm": 1.4657039642333984, "step": 18460 }, { "epoch": 6.157438292194796, "learning_rate": 0.0002717993749398819, "step": 18460 }, { "epoch": 6.157438292194796, "loss": 0.6093101501464844, "step": 18460 }, { "ce_loss": 0.12690822780132294, "epoch": 6.157438292194796, "step": 18460 }, { "distill_loss": 0.2589327096939087, "epoch": 6.157438292194796, "step": 18460 }, { "epoch": 6.157438292194796, "ref_ce_loss": 0.1347208470106125, "step": 18460 }, { "epoch": 6.157438292194796, "loss": 0.6367437243461609, "step": 18460 }, { "ce_loss": 0.15475201606750488, "epoch": 6.157438292194796, "step": 18460 }, { "distill_loss": 0.29213953018188477, "epoch": 6.157438292194796, "step": 18460 }, { "epoch": 6.157438292194796, "ref_ce_loss": 0.133981853723526, "step": 18460 }, { "epoch": 6.160773849232822, "loss": 0.6148, "step": 18470 }, { "epoch": 6.160773849232822, "grad_norm": 2.235093116760254, "step": 18470 }, { "epoch": 6.160773849232822, "learning_rate": 0.0002713901145747687, "step": 18470 }, { "epoch": 6.160773849232822, "loss": 1.0750973224639893, "step": 18470 }, { "ce_loss": 0.1199803277850151, "epoch": 6.160773849232822, "step": 18470 }, { "distill_loss": 0.21382658183574677, "epoch": 6.160773849232822, "step": 18470 }, { "epoch": 6.160773849232822, "ref_ce_loss": 0.10405635833740234, "step": 18470 }, { "epoch": 6.160773849232822, "loss": 0.7381834983825684, "step": 18470 }, { "ce_loss": 0.11732760816812515, "epoch": 6.160773849232822, "step": 18470 }, { "distill_loss": 0.19893065094947815, "epoch": 6.160773849232822, "step": 18470 }, { "epoch": 6.160773849232822, "ref_ce_loss": 0.14228032529354095, "step": 18470 }, { "epoch": 6.164109406270847, "loss": 0.6314, "step": 18480 }, { "epoch": 6.164109406270847, "grad_norm": 1.4539425373077393, "step": 18480 }, { "epoch": 6.164109406270847, "learning_rate": 0.00027098100431123095, "step": 18480 }, { "epoch": 6.164109406270847, "loss": 0.617901623249054, "step": 18480 }, { "ce_loss": 0.16611261665821075, "epoch": 6.164109406270847, "step": 18480 }, { "distill_loss": 0.2678413391113281, "epoch": 6.164109406270847, "step": 18480 }, { "epoch": 6.164109406270847, "ref_ce_loss": 0.11507727205753326, "step": 18480 }, { "epoch": 6.164109406270847, "loss": 1.0038886070251465, "step": 18480 }, { "ce_loss": 0.17187930643558502, "epoch": 6.164109406270847, "step": 18480 }, { "distill_loss": 0.27773842215538025, "epoch": 6.164109406270847, "step": 18480 }, { "epoch": 6.164109406270847, "ref_ce_loss": 0.16759958863258362, "step": 18480 }, { "epoch": 6.167444963308872, "loss": 0.6242, "step": 18490 }, { "epoch": 6.167444963308872, "grad_norm": 1.6894118785858154, "step": 18490 }, { "epoch": 6.167444963308872, "learning_rate": 0.0002705720446267442, "step": 18490 }, { "epoch": 6.167444963308872, "loss": 0.5968695878982544, "step": 18490 }, { "ce_loss": 0.13667964935302734, "epoch": 6.167444963308872, "step": 18490 }, { "distill_loss": 0.23305979371070862, "epoch": 6.167444963308872, "step": 18490 }, { "epoch": 6.167444963308872, "ref_ce_loss": 0.12112811952829361, "step": 18490 }, { "epoch": 6.167444963308872, "loss": 0.5195899605751038, "step": 18490 }, { "ce_loss": 0.11936408281326294, "epoch": 6.167444963308872, "step": 18490 }, { "distill_loss": 0.269527405500412, "epoch": 6.167444963308872, "step": 18490 }, { "epoch": 6.167444963308872, "ref_ce_loss": 0.097865030169487, "step": 18490 }, { "epoch": 6.170780520346898, "loss": 0.6507, "step": 18500 }, { "epoch": 6.170780520346898, "grad_norm": 1.6241803169250488, "step": 18500 }, { "epoch": 6.170780520346898, "learning_rate": 0.0002701632359986083, "step": 18500 }, { "epoch": 6.170780520346898, "loss": 0.7157676815986633, "step": 18500 }, { "ce_loss": 0.1360265463590622, "epoch": 6.170780520346898, "step": 18500 }, { "distill_loss": 0.2981744110584259, "epoch": 6.170780520346898, "step": 18500 }, { "epoch": 6.170780520346898, "ref_ce_loss": 0.1476881355047226, "step": 18500 }, { "epoch": 6.170780520346898, "loss": 0.630430281162262, "step": 18500 }, { "ce_loss": 0.1654825359582901, "epoch": 6.170780520346898, "step": 18500 }, { "distill_loss": 0.23207513988018036, "epoch": 6.170780520346898, "step": 18500 }, { "epoch": 6.170780520346898, "ref_ce_loss": 0.123602956533432, "step": 18500 }, { "epoch": 6.174116077384923, "loss": 0.6731, "step": 18510 }, { "epoch": 6.174116077384923, "grad_norm": 2.015723466873169, "step": 18510 }, { "epoch": 6.174116077384923, "learning_rate": 0.0002697545789039472, "step": 18510 }, { "epoch": 6.174116077384923, "loss": 0.5642631649971008, "step": 18510 }, { "ce_loss": 0.1319328248500824, "epoch": 6.174116077384923, "step": 18510 }, { "distill_loss": 0.295981228351593, "epoch": 6.174116077384923, "step": 18510 }, { "epoch": 6.174116077384923, "ref_ce_loss": 0.13301639258861542, "step": 18510 }, { "epoch": 6.174116077384923, "loss": 0.7798033952713013, "step": 18510 }, { "ce_loss": 0.11101414263248444, "epoch": 6.174116077384923, "step": 18510 }, { "distill_loss": 0.22768959403038025, "epoch": 6.174116077384923, "step": 18510 }, { "epoch": 6.174116077384923, "ref_ce_loss": 0.13759620487689972, "step": 18510 }, { "epoch": 6.177451634422948, "loss": 0.6435, "step": 18520 }, { "epoch": 6.177451634422948, "grad_norm": 3.946340560913086, "step": 18520 }, { "epoch": 6.177451634422948, "learning_rate": 0.00026934607381970735, "step": 18520 }, { "epoch": 6.177451634422948, "loss": 0.5880672931671143, "step": 18520 }, { "ce_loss": 0.1387503296136856, "epoch": 6.177451634422948, "step": 18520 }, { "distill_loss": 0.2876739501953125, "epoch": 6.177451634422948, "step": 18520 }, { "epoch": 6.177451634422948, "ref_ce_loss": 0.12377411127090454, "step": 18520 }, { "epoch": 6.177451634422948, "loss": 0.6849022507667542, "step": 18520 }, { "ce_loss": 0.18922367691993713, "epoch": 6.177451634422948, "step": 18520 }, { "distill_loss": 0.3700937330722809, "epoch": 6.177451634422948, "step": 18520 }, { "epoch": 6.177451634422948, "ref_ce_loss": 0.1246335506439209, "step": 18520 }, { "epoch": 6.180787191460974, "loss": 0.5849, "step": 18530 }, { "epoch": 6.180787191460974, "grad_norm": 1.7427726984024048, "step": 18530 }, { "epoch": 6.180787191460974, "learning_rate": 0.0002689377212226583, "step": 18530 }, { "epoch": 6.180787191460974, "loss": 0.5720622539520264, "step": 18530 }, { "ce_loss": 0.09880984574556351, "epoch": 6.180787191460974, "step": 18530 }, { "distill_loss": 0.18975934386253357, "epoch": 6.180787191460974, "step": 18530 }, { "epoch": 6.180787191460974, "ref_ce_loss": 0.12640048563480377, "step": 18530 }, { "epoch": 6.180787191460974, "loss": 0.5118294954299927, "step": 18530 }, { "ce_loss": 0.11647960543632507, "epoch": 6.180787191460974, "step": 18530 }, { "distill_loss": 0.23939764499664307, "epoch": 6.180787191460974, "step": 18530 }, { "epoch": 6.180787191460974, "ref_ce_loss": 0.09006281197071075, "step": 18530 }, { "epoch": 6.184122748498999, "loss": 0.5788, "step": 18540 }, { "epoch": 6.184122748498999, "grad_norm": 2.905853509902954, "step": 18540 }, { "epoch": 6.184122748498999, "learning_rate": 0.0002685295215893915, "step": 18540 }, { "epoch": 6.184122748498999, "loss": 0.7311304807662964, "step": 18540 }, { "ce_loss": 0.10905653983354568, "epoch": 6.184122748498999, "step": 18540 }, { "distill_loss": 0.24801619350910187, "epoch": 6.184122748498999, "step": 18540 }, { "epoch": 6.184122748498999, "ref_ce_loss": 0.11523216217756271, "step": 18540 }, { "epoch": 6.184122748498999, "loss": 0.8149339556694031, "step": 18540 }, { "ce_loss": 0.19311216473579407, "epoch": 6.184122748498999, "step": 18540 }, { "distill_loss": 0.29600635170936584, "epoch": 6.184122748498999, "step": 18540 }, { "epoch": 6.184122748498999, "ref_ce_loss": 0.13978679478168488, "step": 18540 }, { "epoch": 6.1874583055370245, "loss": 0.6132, "step": 18550 }, { "epoch": 6.1874583055370245, "grad_norm": 1.2603939771652222, "step": 18550 }, { "epoch": 6.1874583055370245, "learning_rate": 0.0002681214753963198, "step": 18550 }, { "epoch": 6.1874583055370245, "loss": 0.41944119334220886, "step": 18550 }, { "ce_loss": 0.07490330934524536, "epoch": 6.1874583055370245, "step": 18550 }, { "distill_loss": 0.21885143220424652, "epoch": 6.1874583055370245, "step": 18550 }, { "epoch": 6.1874583055370245, "ref_ce_loss": 0.09496767073869705, "step": 18550 }, { "epoch": 6.1874583055370245, "loss": 0.7222049832344055, "step": 18550 }, { "ce_loss": 0.12777599692344666, "epoch": 6.1874583055370245, "step": 18550 }, { "distill_loss": 0.27243223786354065, "epoch": 6.1874583055370245, "step": 18550 }, { "epoch": 6.1874583055370245, "ref_ce_loss": 0.07724441587924957, "step": 18550 }, { "epoch": 6.19079386257505, "loss": 0.6462, "step": 18560 }, { "epoch": 6.19079386257505, "grad_norm": 2.3181214332580566, "step": 18560 }, { "epoch": 6.19079386257505, "learning_rate": 0.0002677135831196771, "step": 18560 }, { "epoch": 6.19079386257505, "loss": 0.43296733498573303, "step": 18560 }, { "ce_loss": 0.08375538140535355, "epoch": 6.19079386257505, "step": 18560 }, { "distill_loss": 0.23206079006195068, "epoch": 6.19079386257505, "step": 18560 }, { "epoch": 6.19079386257505, "ref_ce_loss": 0.09651433676481247, "step": 18560 }, { "epoch": 6.19079386257505, "loss": 0.809476912021637, "step": 18560 }, { "ce_loss": 0.15913410484790802, "epoch": 6.19079386257505, "step": 18560 }, { "distill_loss": 0.3202086091041565, "epoch": 6.19079386257505, "step": 18560 }, { "epoch": 6.19079386257505, "ref_ce_loss": 0.12562784552574158, "step": 18560 }, { "epoch": 6.194129419613075, "loss": 0.6267, "step": 18570 }, { "epoch": 6.194129419613075, "grad_norm": 1.503143548965454, "step": 18570 }, { "epoch": 6.194129419613075, "learning_rate": 0.00026730584523551744, "step": 18570 }, { "epoch": 6.194129419613075, "loss": 0.6719030141830444, "step": 18570 }, { "ce_loss": 0.11842752993106842, "epoch": 6.194129419613075, "step": 18570 }, { "distill_loss": 0.2962538003921509, "epoch": 6.194129419613075, "step": 18570 }, { "epoch": 6.194129419613075, "ref_ce_loss": 0.12865594029426575, "step": 18570 }, { "epoch": 6.194129419613075, "loss": 0.6024667620658875, "step": 18570 }, { "ce_loss": 0.15397170186042786, "epoch": 6.194129419613075, "step": 18570 }, { "distill_loss": 0.2786436975002289, "epoch": 6.194129419613075, "step": 18570 }, { "epoch": 6.194129419613075, "ref_ce_loss": 0.1400149017572403, "step": 18570 }, { "epoch": 6.1974649766511005, "loss": 0.6434, "step": 18580 }, { "epoch": 6.1974649766511005, "grad_norm": 1.6247886419296265, "step": 18580 }, { "epoch": 6.1974649766511005, "learning_rate": 0.0002668982622197148, "step": 18580 }, { "epoch": 6.1974649766511005, "loss": 0.6952130794525146, "step": 18580 }, { "ce_loss": 0.08196988701820374, "epoch": 6.1974649766511005, "step": 18580 }, { "distill_loss": 0.2731947898864746, "epoch": 6.1974649766511005, "step": 18580 }, { "epoch": 6.1974649766511005, "ref_ce_loss": 0.08738795667886734, "step": 18580 }, { "epoch": 6.1974649766511005, "loss": 0.5540481209754944, "step": 18580 }, { "ce_loss": 0.11001595109701157, "epoch": 6.1974649766511005, "step": 18580 }, { "distill_loss": 0.24366402626037598, "epoch": 6.1974649766511005, "step": 18580 }, { "epoch": 6.1974649766511005, "ref_ce_loss": 0.12300706654787064, "step": 18580 }, { "epoch": 6.200800533689126, "loss": 0.6145, "step": 18590 }, { "epoch": 6.200800533689126, "grad_norm": 1.7628047466278076, "step": 18590 }, { "epoch": 6.200800533689126, "learning_rate": 0.0002664908345479625, "step": 18590 }, { "epoch": 6.200800533689126, "loss": 0.5917359590530396, "step": 18590 }, { "ce_loss": 0.12417437136173248, "epoch": 6.200800533689126, "step": 18590 }, { "distill_loss": 0.24398832023143768, "epoch": 6.200800533689126, "step": 18590 }, { "epoch": 6.200800533689126, "ref_ce_loss": 0.11655285954475403, "step": 18590 }, { "epoch": 6.200800533689126, "loss": 0.5952635407447815, "step": 18590 }, { "ce_loss": 0.1612391471862793, "epoch": 6.200800533689126, "step": 18590 }, { "distill_loss": 0.2423071712255478, "epoch": 6.200800533689126, "step": 18590 }, { "epoch": 6.200800533689126, "ref_ce_loss": 0.1544157713651657, "step": 18590 }, { "epoch": 6.204136090727151, "loss": 0.6139, "step": 18600 }, { "epoch": 6.204136090727151, "grad_norm": 1.9439404010772705, "step": 18600 }, { "epoch": 6.204136090727151, "learning_rate": 0.0002660835626957726, "step": 18600 }, { "epoch": 6.204136090727151, "loss": 0.4388802647590637, "step": 18600 }, { "ce_loss": 0.08696887642145157, "epoch": 6.204136090727151, "step": 18600 }, { "distill_loss": 0.17513489723205566, "epoch": 6.204136090727151, "step": 18600 }, { "epoch": 6.204136090727151, "ref_ce_loss": 0.11585807055234909, "step": 18600 }, { "epoch": 6.204136090727151, "loss": 0.3982605040073395, "step": 18600 }, { "ce_loss": 0.08773314952850342, "epoch": 6.204136090727151, "step": 18600 }, { "distill_loss": 0.19973807036876678, "epoch": 6.204136090727151, "step": 18600 }, { "epoch": 6.204136090727151, "ref_ce_loss": 0.11063937097787857, "step": 18600 }, { "epoch": 6.207471647765177, "loss": 0.5959, "step": 18610 }, { "epoch": 6.207471647765177, "grad_norm": 2.576935291290283, "step": 18610 }, { "epoch": 6.207471647765177, "learning_rate": 0.0002656764471384749, "step": 18610 }, { "epoch": 6.207471647765177, "loss": 0.4419233202934265, "step": 18610 }, { "ce_loss": 0.08523976802825928, "epoch": 6.207471647765177, "step": 18610 }, { "distill_loss": 0.16447491943836212, "epoch": 6.207471647765177, "step": 18610 }, { "epoch": 6.207471647765177, "ref_ce_loss": 0.0948585495352745, "step": 18610 }, { "epoch": 6.207471647765177, "loss": 0.5787880420684814, "step": 18610 }, { "ce_loss": 0.12167631834745407, "epoch": 6.207471647765177, "step": 18610 }, { "distill_loss": 0.20232316851615906, "epoch": 6.207471647765177, "step": 18610 }, { "epoch": 6.207471647765177, "ref_ce_loss": 0.1111878976225853, "step": 18610 }, { "epoch": 6.210807204803202, "loss": 0.6047, "step": 18620 }, { "epoch": 6.210807204803202, "grad_norm": 1.482259750366211, "step": 18620 }, { "epoch": 6.210807204803202, "learning_rate": 0.0002652694883512173, "step": 18620 }, { "epoch": 6.210807204803202, "loss": 0.6412470936775208, "step": 18620 }, { "ce_loss": 0.1887206733226776, "epoch": 6.210807204803202, "step": 18620 }, { "distill_loss": 0.2719080448150635, "epoch": 6.210807204803202, "step": 18620 }, { "epoch": 6.210807204803202, "ref_ce_loss": 0.1438676118850708, "step": 18620 }, { "epoch": 6.210807204803202, "loss": 0.6414347290992737, "step": 18620 }, { "ce_loss": 0.1531907618045807, "epoch": 6.210807204803202, "step": 18620 }, { "distill_loss": 0.25418105721473694, "epoch": 6.210807204803202, "step": 18620 }, { "epoch": 6.210807204803202, "ref_ce_loss": 0.12958569824695587, "step": 18620 }, { "epoch": 6.214142761841227, "loss": 0.5762, "step": 18630 }, { "epoch": 6.214142761841227, "grad_norm": 1.4014415740966797, "step": 18630 }, { "epoch": 6.214142761841227, "learning_rate": 0.0002648626868089644, "step": 18630 }, { "epoch": 6.214142761841227, "loss": 0.6205796003341675, "step": 18630 }, { "ce_loss": 0.12997017800807953, "epoch": 6.214142761841227, "step": 18630 }, { "distill_loss": 0.25296303629875183, "epoch": 6.214142761841227, "step": 18630 }, { "epoch": 6.214142761841227, "ref_ce_loss": 0.12407161295413971, "step": 18630 }, { "epoch": 6.214142761841227, "loss": 0.5109315514564514, "step": 18630 }, { "ce_loss": 0.11794036626815796, "epoch": 6.214142761841227, "step": 18630 }, { "distill_loss": 0.2793790400028229, "epoch": 6.214142761841227, "step": 18630 }, { "epoch": 6.214142761841227, "ref_ce_loss": 0.08065132796764374, "step": 18630 }, { "epoch": 6.217478318879253, "loss": 0.6147, "step": 18640 }, { "epoch": 6.217478318879253, "grad_norm": 1.4799383878707886, "step": 18640 }, { "epoch": 6.217478318879253, "learning_rate": 0.00026445604298649727, "step": 18640 }, { "epoch": 6.217478318879253, "loss": 0.517666220664978, "step": 18640 }, { "ce_loss": 0.12360420823097229, "epoch": 6.217478318879253, "step": 18640 }, { "distill_loss": 0.24259185791015625, "epoch": 6.217478318879253, "step": 18640 }, { "epoch": 6.217478318879253, "ref_ce_loss": 0.11556177586317062, "step": 18640 }, { "epoch": 6.217478318879253, "loss": 0.6571850776672363, "step": 18640 }, { "ce_loss": 0.09585979580879211, "epoch": 6.217478318879253, "step": 18640 }, { "distill_loss": 0.2677266597747803, "epoch": 6.217478318879253, "step": 18640 }, { "epoch": 6.217478318879253, "ref_ce_loss": 0.10569890588521957, "step": 18640 }, { "epoch": 6.220813875917278, "loss": 0.5984, "step": 18650 }, { "epoch": 6.220813875917278, "grad_norm": 1.4804229736328125, "step": 18650 }, { "epoch": 6.220813875917278, "learning_rate": 0.00026404955735841325, "step": 18650 }, { "epoch": 6.220813875917278, "loss": 0.5628892183303833, "step": 18650 }, { "ce_loss": 0.11343202739953995, "epoch": 6.220813875917278, "step": 18650 }, { "distill_loss": 0.29269713163375854, "epoch": 6.220813875917278, "step": 18650 }, { "epoch": 6.220813875917278, "ref_ce_loss": 0.1565801501274109, "step": 18650 }, { "epoch": 6.220813875917278, "loss": 0.7792778015136719, "step": 18650 }, { "ce_loss": 0.14788763225078583, "epoch": 6.220813875917278, "step": 18650 }, { "distill_loss": 0.3236067295074463, "epoch": 6.220813875917278, "step": 18650 }, { "epoch": 6.220813875917278, "ref_ce_loss": 0.14538931846618652, "step": 18650 }, { "epoch": 6.224149432955303, "loss": 0.6762, "step": 18660 }, { "epoch": 6.224149432955303, "grad_norm": 1.1050039529800415, "step": 18660 }, { "epoch": 6.224149432955303, "learning_rate": 0.0002636432303991245, "step": 18660 }, { "epoch": 6.224149432955303, "loss": 0.5967389345169067, "step": 18660 }, { "ce_loss": 0.11555395275354385, "epoch": 6.224149432955303, "step": 18660 }, { "distill_loss": 0.27795520424842834, "epoch": 6.224149432955303, "step": 18660 }, { "epoch": 6.224149432955303, "ref_ce_loss": 0.11349157243967056, "step": 18660 }, { "epoch": 6.224149432955303, "loss": 0.5443832874298096, "step": 18660 }, { "ce_loss": 0.15342643857002258, "epoch": 6.224149432955303, "step": 18660 }, { "distill_loss": 0.2519240081310272, "epoch": 6.224149432955303, "step": 18660 }, { "epoch": 6.224149432955303, "ref_ce_loss": 0.09871362149715424, "step": 18660 }, { "epoch": 6.227484989993329, "loss": 0.6513, "step": 18670 }, { "epoch": 6.227484989993329, "grad_norm": 6.464293956756592, "step": 18670 }, { "epoch": 6.227484989993329, "learning_rate": 0.00026323706258285864, "step": 18670 }, { "epoch": 6.227484989993329, "loss": 0.5531576871871948, "step": 18670 }, { "ce_loss": 0.07649177312850952, "epoch": 6.227484989993329, "step": 18670 }, { "distill_loss": 0.3307036757469177, "epoch": 6.227484989993329, "step": 18670 }, { "epoch": 6.227484989993329, "ref_ce_loss": 0.10162324458360672, "step": 18670 }, { "epoch": 6.227484989993329, "loss": 0.5954231023788452, "step": 18670 }, { "ce_loss": 0.1320202797651291, "epoch": 6.227484989993329, "step": 18670 }, { "distill_loss": 0.23801815509796143, "epoch": 6.227484989993329, "step": 18670 }, { "epoch": 6.227484989993329, "ref_ce_loss": 0.11234842240810394, "step": 18670 }, { "epoch": 6.230820547031354, "loss": 0.5936, "step": 18680 }, { "epoch": 6.230820547031354, "grad_norm": 2.4922399520874023, "step": 18680 }, { "epoch": 6.230820547031354, "learning_rate": 0.00026283105438365697, "step": 18680 }, { "epoch": 6.230820547031354, "loss": 0.6484638452529907, "step": 18680 }, { "ce_loss": 0.1432270109653473, "epoch": 6.230820547031354, "step": 18680 }, { "distill_loss": 0.338364839553833, "epoch": 6.230820547031354, "step": 18680 }, { "epoch": 6.230820547031354, "ref_ce_loss": 0.13439004123210907, "step": 18680 }, { "epoch": 6.230820547031354, "loss": 0.6381657123565674, "step": 18680 }, { "ce_loss": 0.12378459423780441, "epoch": 6.230820547031354, "step": 18680 }, { "distill_loss": 0.3475711941719055, "epoch": 6.230820547031354, "step": 18680 }, { "epoch": 6.230820547031354, "ref_ce_loss": 0.12904280424118042, "step": 18680 }, { "epoch": 6.234156104069379, "loss": 0.6054, "step": 18690 }, { "epoch": 6.234156104069379, "grad_norm": 1.3250396251678467, "step": 18690 }, { "epoch": 6.234156104069379, "learning_rate": 0.00026242520627537465, "step": 18690 }, { "epoch": 6.234156104069379, "loss": 0.43528658151626587, "step": 18690 }, { "ce_loss": 0.08029738813638687, "epoch": 6.234156104069379, "step": 18690 }, { "distill_loss": 0.2519910931587219, "epoch": 6.234156104069379, "step": 18690 }, { "epoch": 6.234156104069379, "ref_ce_loss": 0.1028536707162857, "step": 18690 }, { "epoch": 6.234156104069379, "loss": 0.5063867568969727, "step": 18690 }, { "ce_loss": 0.08997374773025513, "epoch": 6.234156104069379, "step": 18690 }, { "distill_loss": 0.2542618215084076, "epoch": 6.234156104069379, "step": 18690 }, { "epoch": 6.234156104069379, "ref_ce_loss": 0.11495349556207657, "step": 18690 }, { "epoch": 6.237491661107405, "loss": 0.5642, "step": 18700 }, { "epoch": 6.237491661107405, "grad_norm": 2.5319509506225586, "step": 18700 }, { "epoch": 6.237491661107405, "learning_rate": 0.0002620195187316805, "step": 18700 }, { "epoch": 6.237491661107405, "loss": 0.5081156492233276, "step": 18700 }, { "ce_loss": 0.10081464052200317, "epoch": 6.237491661107405, "step": 18700 }, { "distill_loss": 0.23042288422584534, "epoch": 6.237491661107405, "step": 18700 }, { "epoch": 6.237491661107405, "ref_ce_loss": 0.08740917593240738, "step": 18700 }, { "epoch": 6.237491661107405, "loss": 0.6952100396156311, "step": 18700 }, { "ce_loss": 0.1750820428133011, "epoch": 6.237491661107405, "step": 18700 }, { "distill_loss": 0.2979185879230499, "epoch": 6.237491661107405, "step": 18700 }, { "epoch": 6.237491661107405, "ref_ce_loss": 0.12627197802066803, "step": 18700 }, { "epoch": 6.24082721814543, "loss": 0.6507, "step": 18710 }, { "epoch": 6.24082721814543, "grad_norm": 1.9645556211471558, "step": 18710 }, { "epoch": 6.24082721814543, "learning_rate": 0.00026161399222605523, "step": 18710 }, { "epoch": 6.24082721814543, "loss": 0.7466223239898682, "step": 18710 }, { "ce_loss": 0.21908186376094818, "epoch": 6.24082721814543, "step": 18710 }, { "distill_loss": 0.2823524475097656, "epoch": 6.24082721814543, "step": 18710 }, { "epoch": 6.24082721814543, "ref_ce_loss": 0.1386713981628418, "step": 18710 }, { "epoch": 6.24082721814543, "loss": 0.5161977410316467, "step": 18710 }, { "ce_loss": 0.08397981524467468, "epoch": 6.24082721814543, "step": 18710 }, { "distill_loss": 0.2072904109954834, "epoch": 6.24082721814543, "step": 18710 }, { "epoch": 6.24082721814543, "ref_ce_loss": 0.11395914852619171, "step": 18710 }, { "epoch": 6.244162775183455, "loss": 0.5942, "step": 18720 }, { "epoch": 6.244162775183455, "grad_norm": 1.0424362421035767, "step": 18720 }, { "epoch": 6.244162775183455, "learning_rate": 0.00026120862723179203, "step": 18720 }, { "epoch": 6.244162775183455, "loss": 0.6296104192733765, "step": 18720 }, { "ce_loss": 0.16182675957679749, "epoch": 6.244162775183455, "step": 18720 }, { "distill_loss": 0.281975656747818, "epoch": 6.244162775183455, "step": 18720 }, { "epoch": 6.244162775183455, "ref_ce_loss": 0.15327884256839752, "step": 18720 }, { "epoch": 6.244162775183455, "loss": 0.5066336989402771, "step": 18720 }, { "ce_loss": 0.13477244973182678, "epoch": 6.244162775183455, "step": 18720 }, { "distill_loss": 0.2248753309249878, "epoch": 6.244162775183455, "step": 18720 }, { "epoch": 6.244162775183455, "ref_ce_loss": 0.11454130709171295, "step": 18720 }, { "epoch": 6.247498332221481, "loss": 0.5769, "step": 18730 }, { "epoch": 6.247498332221481, "grad_norm": 1.3180054426193237, "step": 18730 }, { "epoch": 6.247498332221481, "learning_rate": 0.00026080342422199536, "step": 18730 }, { "epoch": 6.247498332221481, "loss": 0.5453575849533081, "step": 18730 }, { "ce_loss": 0.12045739591121674, "epoch": 6.247498332221481, "step": 18730 }, { "distill_loss": 0.2378164380788803, "epoch": 6.247498332221481, "step": 18730 }, { "epoch": 6.247498332221481, "ref_ce_loss": 0.11338043212890625, "step": 18730 }, { "epoch": 6.247498332221481, "loss": 0.8507588505744934, "step": 18730 }, { "ce_loss": 0.16890116035938263, "epoch": 6.247498332221481, "step": 18730 }, { "distill_loss": 0.3025205135345459, "epoch": 6.247498332221481, "step": 18730 }, { "epoch": 6.247498332221481, "ref_ce_loss": 0.14612118899822235, "step": 18730 }, { "epoch": 6.250833889259506, "loss": 0.6153, "step": 18740 }, { "epoch": 6.250833889259506, "grad_norm": 1.7155362367630005, "step": 18740 }, { "epoch": 6.250833889259506, "learning_rate": 0.00026039838366958087, "step": 18740 }, { "epoch": 6.250833889259506, "loss": 0.7768467664718628, "step": 18740 }, { "ce_loss": 0.10443000495433807, "epoch": 6.250833889259506, "step": 18740 }, { "distill_loss": 0.3253715932369232, "epoch": 6.250833889259506, "step": 18740 }, { "epoch": 6.250833889259506, "ref_ce_loss": 0.09035629034042358, "step": 18740 }, { "epoch": 6.250833889259506, "loss": 0.5813509225845337, "step": 18740 }, { "ce_loss": 0.1278662383556366, "epoch": 6.250833889259506, "step": 18740 }, { "distill_loss": 0.28240811824798584, "epoch": 6.250833889259506, "step": 18740 }, { "epoch": 6.250833889259506, "ref_ce_loss": 0.12653017044067383, "step": 18740 }, { "epoch": 6.2541694462975315, "loss": 0.6759, "step": 18750 }, { "epoch": 6.2541694462975315, "grad_norm": 1.5573009252548218, "step": 18750 }, { "epoch": 6.2541694462975315, "learning_rate": 0.0002599935060472743, "step": 18750 }, { "epoch": 6.2541694462975315, "loss": 0.6010094881057739, "step": 18750 }, { "ce_loss": 0.1629936397075653, "epoch": 6.2541694462975315, "step": 18750 }, { "distill_loss": 0.27384552359580994, "epoch": 6.2541694462975315, "step": 18750 }, { "epoch": 6.2541694462975315, "ref_ce_loss": 0.13688445091247559, "step": 18750 }, { "epoch": 6.2541694462975315, "loss": 0.8907907605171204, "step": 18750 }, { "ce_loss": 0.1886061728000641, "epoch": 6.2541694462975315, "step": 18750 }, { "distill_loss": 0.3372310996055603, "epoch": 6.2541694462975315, "step": 18750 }, { "epoch": 6.2541694462975315, "ref_ce_loss": 0.16175268590450287, "step": 18750 }, { "epoch": 6.257505003335557, "loss": 0.6196, "step": 18760 }, { "epoch": 6.257505003335557, "grad_norm": 1.3572313785552979, "step": 18760 }, { "epoch": 6.257505003335557, "learning_rate": 0.0002595887918276116, "step": 18760 }, { "epoch": 6.257505003335557, "loss": 0.5978425741195679, "step": 18760 }, { "ce_loss": 0.17033596336841583, "epoch": 6.257505003335557, "step": 18760 }, { "distill_loss": 0.2939685583114624, "epoch": 6.257505003335557, "step": 18760 }, { "epoch": 6.257505003335557, "ref_ce_loss": 0.10807149857282639, "step": 18760 }, { "epoch": 6.257505003335557, "loss": 0.5586941838264465, "step": 18760 }, { "ce_loss": 0.09725350141525269, "epoch": 6.257505003335557, "step": 18760 }, { "distill_loss": 0.2762550711631775, "epoch": 6.257505003335557, "step": 18760 }, { "epoch": 6.257505003335557, "ref_ce_loss": 0.13775742053985596, "step": 18760 }, { "epoch": 6.260840560373582, "loss": 0.5952, "step": 18770 }, { "epoch": 6.260840560373582, "grad_norm": 1.4143478870391846, "step": 18770 }, { "epoch": 6.260840560373582, "learning_rate": 0.0002591842414829376, "step": 18770 }, { "epoch": 6.260840560373582, "loss": 0.36823388934135437, "step": 18770 }, { "ce_loss": 0.06665971130132675, "epoch": 6.260840560373582, "step": 18770 }, { "distill_loss": 0.20049230754375458, "epoch": 6.260840560373582, "step": 18770 }, { "epoch": 6.260840560373582, "ref_ce_loss": 0.1006406843662262, "step": 18770 }, { "epoch": 6.260840560373582, "loss": 0.4194163382053375, "step": 18770 }, { "ce_loss": 0.09155339002609253, "epoch": 6.260840560373582, "step": 18770 }, { "distill_loss": 0.22316360473632812, "epoch": 6.260840560373582, "step": 18770 }, { "epoch": 6.260840560373582, "ref_ce_loss": 0.10426671802997589, "step": 18770 }, { "epoch": 6.2641761174116075, "loss": 0.5954, "step": 18780 }, { "epoch": 6.2641761174116075, "grad_norm": 2.267418622970581, "step": 18780 }, { "epoch": 6.2641761174116075, "learning_rate": 0.0002587798554854063, "step": 18780 }, { "epoch": 6.2641761174116075, "loss": 0.579855740070343, "step": 18780 }, { "ce_loss": 0.14227893948554993, "epoch": 6.2641761174116075, "step": 18780 }, { "distill_loss": 0.2690858244895935, "epoch": 6.2641761174116075, "step": 18780 }, { "epoch": 6.2641761174116075, "ref_ce_loss": 0.1261933445930481, "step": 18780 }, { "epoch": 6.2641761174116075, "loss": 0.5423168540000916, "step": 18780 }, { "ce_loss": 0.10789535194635391, "epoch": 6.2641761174116075, "step": 18780 }, { "distill_loss": 0.243142232298851, "epoch": 6.2641761174116075, "step": 18780 }, { "epoch": 6.2641761174116075, "ref_ce_loss": 0.13346213102340698, "step": 18780 }, { "epoch": 6.267511674449633, "loss": 0.6002, "step": 18790 }, { "epoch": 6.267511674449633, "grad_norm": 1.1807934045791626, "step": 18790 }, { "epoch": 6.267511674449633, "learning_rate": 0.00025837563430697953, "step": 18790 }, { "epoch": 6.267511674449633, "loss": 0.3969920575618744, "step": 18790 }, { "ce_loss": 0.0700957402586937, "epoch": 6.267511674449633, "step": 18790 }, { "distill_loss": 0.196085587143898, "epoch": 6.267511674449633, "step": 18790 }, { "epoch": 6.267511674449633, "ref_ce_loss": 0.09772870689630508, "step": 18790 }, { "epoch": 6.267511674449633, "loss": 0.8665177822113037, "step": 18790 }, { "ce_loss": 0.10357936471700668, "epoch": 6.267511674449633, "step": 18790 }, { "distill_loss": 0.2201247364282608, "epoch": 6.267511674449633, "step": 18790 }, { "epoch": 6.267511674449633, "ref_ce_loss": 0.09220992028713226, "step": 18790 }, { "epoch": 6.270847231487658, "loss": 0.6304, "step": 18800 }, { "epoch": 6.270847231487658, "grad_norm": 1.4579709768295288, "step": 18800 }, { "epoch": 6.270847231487658, "learning_rate": 0.00025797157841942674, "step": 18800 }, { "epoch": 6.270847231487658, "loss": 0.5386521816253662, "step": 18800 }, { "ce_loss": 0.13596661388874054, "epoch": 6.270847231487658, "step": 18800 }, { "distill_loss": 0.257935106754303, "epoch": 6.270847231487658, "step": 18800 }, { "epoch": 6.270847231487658, "ref_ce_loss": 0.11433033645153046, "step": 18800 }, { "epoch": 6.270847231487658, "loss": 1.0312557220458984, "step": 18800 }, { "ce_loss": 0.10936436057090759, "epoch": 6.270847231487658, "step": 18800 }, { "distill_loss": 0.25603824853897095, "epoch": 6.270847231487658, "step": 18800 }, { "epoch": 6.270847231487658, "ref_ce_loss": 0.10420667380094528, "step": 18800 }, { "epoch": 6.274182788525684, "loss": 0.6355, "step": 18810 }, { "epoch": 6.274182788525684, "grad_norm": 1.4766558408737183, "step": 18810 }, { "epoch": 6.274182788525684, "learning_rate": 0.00025756768829432496, "step": 18810 }, { "epoch": 6.274182788525684, "loss": 0.7762137055397034, "step": 18810 }, { "ce_loss": 0.14689825475215912, "epoch": 6.274182788525684, "step": 18810 }, { "distill_loss": 0.23592166602611542, "epoch": 6.274182788525684, "step": 18810 }, { "epoch": 6.274182788525684, "ref_ce_loss": 0.13268359005451202, "step": 18810 }, { "epoch": 6.274182788525684, "loss": 0.5970646739006042, "step": 18810 }, { "ce_loss": 0.16350924968719482, "epoch": 6.274182788525684, "step": 18810 }, { "distill_loss": 0.2649400532245636, "epoch": 6.274182788525684, "step": 18810 }, { "epoch": 6.274182788525684, "ref_ce_loss": 0.12570339441299438, "step": 18810 }, { "epoch": 6.277518345563709, "loss": 0.6227, "step": 18820 }, { "epoch": 6.277518345563709, "grad_norm": 1.5304725170135498, "step": 18820 }, { "epoch": 6.277518345563709, "learning_rate": 0.0002571639644030574, "step": 18820 }, { "epoch": 6.277518345563709, "loss": 0.5977932214736938, "step": 18820 }, { "ce_loss": 0.13757960498332977, "epoch": 6.277518345563709, "step": 18820 }, { "distill_loss": 0.2880479097366333, "epoch": 6.277518345563709, "step": 18820 }, { "epoch": 6.277518345563709, "ref_ce_loss": 0.13617762923240662, "step": 18820 }, { "epoch": 6.277518345563709, "loss": 0.6033172011375427, "step": 18820 }, { "ce_loss": 0.14228224754333496, "epoch": 6.277518345563709, "step": 18820 }, { "distill_loss": 0.28515636920928955, "epoch": 6.277518345563709, "step": 18820 }, { "epoch": 6.277518345563709, "ref_ce_loss": 0.12235098332166672, "step": 18820 }, { "epoch": 6.280853902601734, "loss": 0.6056, "step": 18830 }, { "epoch": 6.280853902601734, "grad_norm": 4.085591793060303, "step": 18830 }, { "epoch": 6.280853902601734, "learning_rate": 0.00025676040721681303, "step": 18830 }, { "epoch": 6.280853902601734, "loss": 1.065446376800537, "step": 18830 }, { "ce_loss": 0.13691724836826324, "epoch": 6.280853902601734, "step": 18830 }, { "distill_loss": 0.28984662890434265, "epoch": 6.280853902601734, "step": 18830 }, { "epoch": 6.280853902601734, "ref_ce_loss": 0.1384049654006958, "step": 18830 }, { "epoch": 6.280853902601734, "loss": 0.8404492139816284, "step": 18830 }, { "ce_loss": 0.18925227224826813, "epoch": 6.280853902601734, "step": 18830 }, { "distill_loss": 0.3313180208206177, "epoch": 6.280853902601734, "step": 18830 }, { "epoch": 6.280853902601734, "ref_ce_loss": 0.14136530458927155, "step": 18830 }, { "epoch": 6.28418945963976, "loss": 0.6549, "step": 18840 }, { "epoch": 6.28418945963976, "grad_norm": 1.796584129333496, "step": 18840 }, { "epoch": 6.28418945963976, "learning_rate": 0.00025635701720658677, "step": 18840 }, { "epoch": 6.28418945963976, "loss": 0.5860927700996399, "step": 18840 }, { "ce_loss": 0.1099044606089592, "epoch": 6.28418945963976, "step": 18840 }, { "distill_loss": 0.24969011545181274, "epoch": 6.28418945963976, "step": 18840 }, { "epoch": 6.28418945963976, "ref_ce_loss": 0.1132139191031456, "step": 18840 }, { "epoch": 6.28418945963976, "loss": 0.6870462894439697, "step": 18840 }, { "ce_loss": 0.15823398530483246, "epoch": 6.28418945963976, "step": 18840 }, { "distill_loss": 0.3364105820655823, "epoch": 6.28418945963976, "step": 18840 }, { "epoch": 6.28418945963976, "ref_ce_loss": 0.1049603670835495, "step": 18840 }, { "epoch": 6.287525016677785, "loss": 0.6795, "step": 18850 }, { "epoch": 6.287525016677785, "grad_norm": 2.2475714683532715, "step": 18850 }, { "epoch": 6.287525016677785, "learning_rate": 0.000255953794843178, "step": 18850 }, { "epoch": 6.287525016677785, "loss": 0.4239915609359741, "step": 18850 }, { "ce_loss": 0.07156173139810562, "epoch": 6.287525016677785, "step": 18850 }, { "distill_loss": 0.2099079191684723, "epoch": 6.287525016677785, "step": 18850 }, { "epoch": 6.287525016677785, "ref_ce_loss": 0.11280690878629684, "step": 18850 }, { "epoch": 6.287525016677785, "loss": 0.4179385006427765, "step": 18850 }, { "ce_loss": 0.0768141821026802, "epoch": 6.287525016677785, "step": 18850 }, { "distill_loss": 0.20585425198078156, "epoch": 6.287525016677785, "step": 18850 }, { "epoch": 6.287525016677785, "ref_ce_loss": 0.10265933722257614, "step": 18850 }, { "epoch": 6.29086057371581, "loss": 0.7121, "step": 18860 }, { "epoch": 6.29086057371581, "grad_norm": 3.4591379165649414, "step": 18860 }, { "epoch": 6.29086057371581, "learning_rate": 0.00025555074059719073, "step": 18860 }, { "epoch": 6.29086057371581, "loss": 0.3988853096961975, "step": 18860 }, { "ce_loss": 0.07139135897159576, "epoch": 6.29086057371581, "step": 18860 }, { "distill_loss": 0.21063552796840668, "epoch": 6.29086057371581, "step": 18860 }, { "epoch": 6.29086057371581, "ref_ce_loss": 0.084518663585186, "step": 18860 }, { "epoch": 6.29086057371581, "loss": 0.6735532879829407, "step": 18860 }, { "ce_loss": 0.16425824165344238, "epoch": 6.29086057371581, "step": 18860 }, { "distill_loss": 0.33580243587493896, "epoch": 6.29086057371581, "step": 18860 }, { "epoch": 6.29086057371581, "ref_ce_loss": 0.12752461433410645, "step": 18860 }, { "epoch": 6.294196130753836, "loss": 0.6134, "step": 18870 }, { "epoch": 6.294196130753836, "grad_norm": 1.5801092386245728, "step": 18870 }, { "epoch": 6.294196130753836, "learning_rate": 0.0002551478549390325, "step": 18870 }, { "epoch": 6.294196130753836, "loss": 0.38322246074676514, "step": 18870 }, { "ce_loss": 0.07935591042041779, "epoch": 6.294196130753836, "step": 18870 }, { "distill_loss": 0.22366979718208313, "epoch": 6.294196130753836, "step": 18870 }, { "epoch": 6.294196130753836, "ref_ce_loss": 0.06230417639017105, "step": 18870 }, { "epoch": 6.294196130753836, "loss": 1.041303277015686, "step": 18870 }, { "ce_loss": 0.13986216485500336, "epoch": 6.294196130753836, "step": 18870 }, { "distill_loss": 0.27043458819389343, "epoch": 6.294196130753836, "step": 18870 }, { "epoch": 6.294196130753836, "ref_ce_loss": 0.10601771622896194, "step": 18870 }, { "epoch": 6.297531687791861, "loss": 0.6444, "step": 18880 }, { "epoch": 6.297531687791861, "grad_norm": 4.337099552154541, "step": 18880 }, { "epoch": 6.297531687791861, "learning_rate": 0.00025474513833891434, "step": 18880 }, { "epoch": 6.297531687791861, "loss": 0.6033490300178528, "step": 18880 }, { "ce_loss": 0.16330893337726593, "epoch": 6.297531687791861, "step": 18880 }, { "distill_loss": 0.25854185223579407, "epoch": 6.297531687791861, "step": 18880 }, { "epoch": 6.297531687791861, "ref_ce_loss": 0.1479196399450302, "step": 18880 }, { "epoch": 6.297531687791861, "loss": 0.40364956855773926, "step": 18880 }, { "ce_loss": 0.08617854118347168, "epoch": 6.297531687791861, "step": 18880 }, { "distill_loss": 0.18217800557613373, "epoch": 6.297531687791861, "step": 18880 }, { "epoch": 6.297531687791861, "ref_ce_loss": 0.11525102704763412, "step": 18880 }, { "epoch": 6.300867244829886, "loss": 0.5843, "step": 18890 }, { "epoch": 6.300867244829886, "grad_norm": 1.3575999736785889, "step": 18890 }, { "epoch": 6.300867244829886, "learning_rate": 0.00025434259126684973, "step": 18890 }, { "epoch": 6.300867244829886, "loss": 0.7759307622909546, "step": 18890 }, { "ce_loss": 0.171718567609787, "epoch": 6.300867244829886, "step": 18890 }, { "distill_loss": 0.26840507984161377, "epoch": 6.300867244829886, "step": 18890 }, { "epoch": 6.300867244829886, "ref_ce_loss": 0.12806598842144012, "step": 18890 }, { "epoch": 6.300867244829886, "loss": 0.7826226949691772, "step": 18890 }, { "ce_loss": 0.16975845396518707, "epoch": 6.300867244829886, "step": 18890 }, { "distill_loss": 0.3657705783843994, "epoch": 6.300867244829886, "step": 18890 }, { "epoch": 6.300867244829886, "ref_ce_loss": 0.15103502571582794, "step": 18890 }, { "epoch": 6.304202801867912, "loss": 0.634, "step": 18900 }, { "epoch": 6.304202801867912, "grad_norm": 1.8236695528030396, "step": 18900 }, { "epoch": 6.304202801867912, "learning_rate": 0.0002539402141926546, "step": 18900 }, { "epoch": 6.304202801867912, "loss": 0.6085480451583862, "step": 18900 }, { "ce_loss": 0.10115724802017212, "epoch": 6.304202801867912, "step": 18900 }, { "distill_loss": 0.3330099880695343, "epoch": 6.304202801867912, "step": 18900 }, { "epoch": 6.304202801867912, "ref_ce_loss": 0.1174900159239769, "step": 18900 }, { "epoch": 6.304202801867912, "loss": 0.46579235792160034, "step": 18900 }, { "ce_loss": 0.10644608736038208, "epoch": 6.304202801867912, "step": 18900 }, { "distill_loss": 0.20871976017951965, "epoch": 6.304202801867912, "step": 18900 }, { "epoch": 6.304202801867912, "ref_ce_loss": 0.10825169831514359, "step": 18900 }, { "epoch": 6.307538358905937, "loss": 0.5919, "step": 18910 }, { "epoch": 6.307538358905937, "grad_norm": 1.4280165433883667, "step": 18910 }, { "epoch": 6.307538358905937, "learning_rate": 0.000253538007585946, "step": 18910 }, { "epoch": 6.307538358905937, "loss": 0.5173388123512268, "step": 18910 }, { "ce_loss": 0.12827540934085846, "epoch": 6.307538358905937, "step": 18910 }, { "distill_loss": 0.2661437392234802, "epoch": 6.307538358905937, "step": 18910 }, { "epoch": 6.307538358905937, "ref_ce_loss": 0.12170099467039108, "step": 18910 }, { "epoch": 6.307538358905937, "loss": 0.5308101177215576, "step": 18910 }, { "ce_loss": 0.11813155561685562, "epoch": 6.307538358905937, "step": 18910 }, { "distill_loss": 0.24241910874843597, "epoch": 6.307538358905937, "step": 18910 }, { "epoch": 6.307538358905937, "ref_ce_loss": 0.12783372402191162, "step": 18910 }, { "epoch": 6.310873915943962, "loss": 0.6298, "step": 18920 }, { "epoch": 6.310873915943962, "grad_norm": 1.4830825328826904, "step": 18920 }, { "epoch": 6.310873915943962, "learning_rate": 0.0002531359719161426, "step": 18920 }, { "epoch": 6.310873915943962, "loss": 0.6821950674057007, "step": 18920 }, { "ce_loss": 0.16001774370670319, "epoch": 6.310873915943962, "step": 18920 }, { "distill_loss": 0.2875783145427704, "epoch": 6.310873915943962, "step": 18920 }, { "epoch": 6.310873915943962, "ref_ce_loss": 0.13389131426811218, "step": 18920 }, { "epoch": 6.310873915943962, "loss": 0.7266680002212524, "step": 18920 }, { "ce_loss": 0.20016908645629883, "epoch": 6.310873915943962, "step": 18920 }, { "distill_loss": 0.37826067209243774, "epoch": 6.310873915943962, "step": 18920 }, { "epoch": 6.310873915943962, "ref_ce_loss": 0.14504815638065338, "step": 18920 }, { "epoch": 6.314209472981988, "loss": 0.6279, "step": 18930 }, { "epoch": 6.314209472981988, "grad_norm": 1.6696503162384033, "step": 18930 }, { "epoch": 6.314209472981988, "learning_rate": 0.0002527341076524633, "step": 18930 }, { "epoch": 6.314209472981988, "loss": 0.5761136412620544, "step": 18930 }, { "ce_loss": 0.11060384660959244, "epoch": 6.314209472981988, "step": 18930 }, { "distill_loss": 0.25381311774253845, "epoch": 6.314209472981988, "step": 18930 }, { "epoch": 6.314209472981988, "ref_ce_loss": 0.1130882278084755, "step": 18930 }, { "epoch": 6.314209472981988, "loss": 0.8104597330093384, "step": 18930 }, { "ce_loss": 0.16445434093475342, "epoch": 6.314209472981988, "step": 18930 }, { "distill_loss": 0.33795028924942017, "epoch": 6.314209472981988, "step": 18930 }, { "epoch": 6.314209472981988, "ref_ce_loss": 0.14992178976535797, "step": 18930 }, { "epoch": 6.317545030020013, "loss": 0.657, "step": 18940 }, { "epoch": 6.317545030020013, "grad_norm": 1.4184163808822632, "step": 18940 }, { "epoch": 6.317545030020013, "learning_rate": 0.00025233241526392673, "step": 18940 }, { "epoch": 6.317545030020013, "loss": 0.5488178730010986, "step": 18940 }, { "ce_loss": 0.15158149600028992, "epoch": 6.317545030020013, "step": 18940 }, { "distill_loss": 0.29170292615890503, "epoch": 6.317545030020013, "step": 18940 }, { "epoch": 6.317545030020013, "ref_ce_loss": 0.10527343302965164, "step": 18940 }, { "epoch": 6.317545030020013, "loss": 0.5522985458374023, "step": 18940 }, { "ce_loss": 0.10687598586082458, "epoch": 6.317545030020013, "step": 18940 }, { "distill_loss": 0.28288522362709045, "epoch": 6.317545030020013, "step": 18940 }, { "epoch": 6.317545030020013, "ref_ce_loss": 0.11455663293600082, "step": 18940 }, { "epoch": 6.3208805870580385, "loss": 0.6514, "step": 18950 }, { "epoch": 6.3208805870580385, "grad_norm": 2.5147652626037598, "step": 18950 }, { "epoch": 6.3208805870580385, "learning_rate": 0.0002519308952193513, "step": 18950 }, { "epoch": 6.3208805870580385, "loss": 0.6704400777816772, "step": 18950 }, { "ce_loss": 0.1592797189950943, "epoch": 6.3208805870580385, "step": 18950 }, { "distill_loss": 0.24356558918952942, "epoch": 6.3208805870580385, "step": 18950 }, { "epoch": 6.3208805870580385, "ref_ce_loss": 0.0978129580616951, "step": 18950 }, { "epoch": 6.3208805870580385, "loss": 0.5375022292137146, "step": 18950 }, { "ce_loss": 0.11981920897960663, "epoch": 6.3208805870580385, "step": 18950 }, { "distill_loss": 0.24265658855438232, "epoch": 6.3208805870580385, "step": 18950 }, { "epoch": 6.3208805870580385, "ref_ce_loss": 0.14514128863811493, "step": 18950 }, { "epoch": 6.324216144096064, "loss": 0.5806, "step": 18960 }, { "epoch": 6.324216144096064, "grad_norm": 2.096982717514038, "step": 18960 }, { "epoch": 6.324216144096064, "learning_rate": 0.000251529547987354, "step": 18960 }, { "epoch": 6.324216144096064, "loss": 0.6203213930130005, "step": 18960 }, { "ce_loss": 0.10694719105958939, "epoch": 6.324216144096064, "step": 18960 }, { "distill_loss": 0.21681025624275208, "epoch": 6.324216144096064, "step": 18960 }, { "epoch": 6.324216144096064, "ref_ce_loss": 0.13992996513843536, "step": 18960 }, { "epoch": 6.324216144096064, "loss": 0.6286050081253052, "step": 18960 }, { "ce_loss": 0.1949644386768341, "epoch": 6.324216144096064, "step": 18960 }, { "distill_loss": 0.21977798640727997, "epoch": 6.324216144096064, "step": 18960 }, { "epoch": 6.324216144096064, "ref_ce_loss": 0.14627158641815186, "step": 18960 }, { "epoch": 6.327551701134089, "loss": 0.6171, "step": 18970 }, { "epoch": 6.327551701134089, "grad_norm": 1.7369863986968994, "step": 18970 }, { "epoch": 6.327551701134089, "learning_rate": 0.0002511283740363504, "step": 18970 }, { "epoch": 6.327551701134089, "loss": 0.6021354794502258, "step": 18970 }, { "ce_loss": 0.14903834462165833, "epoch": 6.327551701134089, "step": 18970 }, { "distill_loss": 0.27035075426101685, "epoch": 6.327551701134089, "step": 18970 }, { "epoch": 6.327551701134089, "ref_ce_loss": 0.13529963791370392, "step": 18970 }, { "epoch": 6.327551701134089, "loss": 0.6092727184295654, "step": 18970 }, { "ce_loss": 0.12355395406484604, "epoch": 6.327551701134089, "step": 18970 }, { "distill_loss": 0.305045485496521, "epoch": 6.327551701134089, "step": 18970 }, { "epoch": 6.327551701134089, "ref_ce_loss": 0.13894686102867126, "step": 18970 }, { "epoch": 6.3308872581721145, "loss": 0.6264, "step": 18980 }, { "epoch": 6.3308872581721145, "grad_norm": 1.227952003479004, "step": 18980 }, { "epoch": 6.3308872581721145, "learning_rate": 0.0002507273738345534, "step": 18980 }, { "epoch": 6.3308872581721145, "loss": 0.5542557239532471, "step": 18980 }, { "ce_loss": 0.08878917247056961, "epoch": 6.3308872581721145, "step": 18980 }, { "distill_loss": 0.2355813980102539, "epoch": 6.3308872581721145, "step": 18980 }, { "epoch": 6.3308872581721145, "ref_ce_loss": 0.09402923285961151, "step": 18980 }, { "epoch": 6.3308872581721145, "loss": 0.6132735013961792, "step": 18980 }, { "ce_loss": 0.15250413119792938, "epoch": 6.3308872581721145, "step": 18980 }, { "distill_loss": 0.26554346084594727, "epoch": 6.3308872581721145, "step": 18980 }, { "epoch": 6.3308872581721145, "ref_ce_loss": 0.09666182845830917, "step": 18980 }, { "epoch": 6.33422281521014, "loss": 0.6429, "step": 18990 }, { "epoch": 6.33422281521014, "grad_norm": 1.4251363277435303, "step": 18990 }, { "epoch": 6.33422281521014, "learning_rate": 0.0002503265478499736, "step": 18990 }, { "epoch": 6.33422281521014, "loss": 0.4695286154747009, "step": 18990 }, { "ce_loss": 0.10755734145641327, "epoch": 6.33422281521014, "step": 18990 }, { "distill_loss": 0.19876904785633087, "epoch": 6.33422281521014, "step": 18990 }, { "epoch": 6.33422281521014, "ref_ce_loss": 0.1363854706287384, "step": 18990 }, { "epoch": 6.33422281521014, "loss": 0.4856092929840088, "step": 18990 }, { "ce_loss": 0.10343562811613083, "epoch": 6.33422281521014, "step": 18990 }, { "distill_loss": 0.24028657376766205, "epoch": 6.33422281521014, "step": 18990 }, { "epoch": 6.33422281521014, "ref_ce_loss": 0.10447201877832413, "step": 18990 }, { "epoch": 6.337558372248165, "loss": 0.6261, "step": 19000 }, { "epoch": 6.337558372248165, "grad_norm": 10.998090744018555, "step": 19000 }, { "epoch": 6.337558372248165, "learning_rate": 0.0002499258965504179, "step": 19000 }, { "epoch": 6.337558372248165, "loss": 0.7924450635910034, "step": 19000 }, { "ce_loss": 0.16777165234088898, "epoch": 6.337558372248165, "step": 19000 }, { "distill_loss": 0.2919318675994873, "epoch": 6.337558372248165, "step": 19000 }, { "epoch": 6.337558372248165, "ref_ce_loss": 0.16161774098873138, "step": 19000 }, { "epoch": 6.337558372248165, "loss": 0.6410017013549805, "step": 19000 }, { "ce_loss": 0.15153133869171143, "epoch": 6.337558372248165, "step": 19000 }, { "distill_loss": 0.2678484320640564, "epoch": 6.337558372248165, "step": 19000 }, { "epoch": 6.337558372248165, "ref_ce_loss": 0.10910286009311676, "step": 19000 }, { "epoch": 6.3408939292861906, "loss": 0.6259, "step": 19010 }, { "epoch": 6.3408939292861906, "grad_norm": 1.927211046218872, "step": 19010 }, { "epoch": 6.3408939292861906, "learning_rate": 0.0002495254204034897, "step": 19010 }, { "epoch": 6.3408939292861906, "loss": 0.5969608426094055, "step": 19010 }, { "ce_loss": 0.1397782862186432, "epoch": 6.3408939292861906, "step": 19010 }, { "distill_loss": 0.2647230327129364, "epoch": 6.3408939292861906, "step": 19010 }, { "epoch": 6.3408939292861906, "ref_ce_loss": 0.10247133672237396, "step": 19010 }, { "epoch": 6.3408939292861906, "loss": 0.7212368249893188, "step": 19010 }, { "ce_loss": 0.14784960448741913, "epoch": 6.3408939292861906, "step": 19010 }, { "distill_loss": 0.2756213843822479, "epoch": 6.3408939292861906, "step": 19010 }, { "epoch": 6.3408939292861906, "ref_ce_loss": 0.14659282565116882, "step": 19010 }, { "epoch": 6.344229486324216, "loss": 0.6313, "step": 19020 }, { "epoch": 6.344229486324216, "grad_norm": 2.6346116065979004, "step": 19020 }, { "epoch": 6.344229486324216, "learning_rate": 0.00024912511987658744, "step": 19020 }, { "epoch": 6.344229486324216, "loss": 0.6190280318260193, "step": 19020 }, { "ce_loss": 0.14975640177726746, "epoch": 6.344229486324216, "step": 19020 }, { "distill_loss": 0.2882692217826843, "epoch": 6.344229486324216, "step": 19020 }, { "epoch": 6.344229486324216, "ref_ce_loss": 0.13756629824638367, "step": 19020 }, { "epoch": 6.344229486324216, "loss": 0.7044951915740967, "step": 19020 }, { "ce_loss": 0.1121923103928566, "epoch": 6.344229486324216, "step": 19020 }, { "distill_loss": 0.2728697955608368, "epoch": 6.344229486324216, "step": 19020 }, { "epoch": 6.344229486324216, "ref_ce_loss": 0.1473977267742157, "step": 19020 }, { "epoch": 6.347565043362241, "loss": 0.6377, "step": 19030 }, { "epoch": 6.347565043362241, "grad_norm": 1.7768129110336304, "step": 19030 }, { "epoch": 6.347565043362241, "learning_rate": 0.00024872499543690524, "step": 19030 }, { "epoch": 6.347565043362241, "loss": 0.7369954586029053, "step": 19030 }, { "ce_loss": 0.13455504179000854, "epoch": 6.347565043362241, "step": 19030 }, { "distill_loss": 0.23038426041603088, "epoch": 6.347565043362241, "step": 19030 }, { "epoch": 6.347565043362241, "ref_ce_loss": 0.13557906448841095, "step": 19030 }, { "epoch": 6.347565043362241, "loss": 0.5275669097900391, "step": 19030 }, { "ce_loss": 0.06495196372270584, "epoch": 6.347565043362241, "step": 19030 }, { "distill_loss": 0.23524709045886993, "epoch": 6.347565043362241, "step": 19030 }, { "epoch": 6.347565043362241, "ref_ce_loss": 0.110582135617733, "step": 19030 }, { "epoch": 6.350900600400267, "loss": 0.594, "step": 19040 }, { "epoch": 6.350900600400267, "grad_norm": 1.8504769802093506, "step": 19040 }, { "epoch": 6.350900600400267, "learning_rate": 0.00024832504755143114, "step": 19040 }, { "epoch": 6.350900600400267, "loss": 0.6440969705581665, "step": 19040 }, { "ce_loss": 0.1938813179731369, "epoch": 6.350900600400267, "step": 19040 }, { "distill_loss": 0.3125225305557251, "epoch": 6.350900600400267, "step": 19040 }, { "epoch": 6.350900600400267, "ref_ce_loss": 0.13713249564170837, "step": 19040 }, { "epoch": 6.350900600400267, "loss": 0.5212450623512268, "step": 19040 }, { "ce_loss": 0.1180388480424881, "epoch": 6.350900600400267, "step": 19040 }, { "distill_loss": 0.28184014558792114, "epoch": 6.350900600400267, "step": 19040 }, { "epoch": 6.350900600400267, "ref_ce_loss": 0.08356638252735138, "step": 19040 }, { "epoch": 6.354236157438292, "loss": 0.6343, "step": 19050 }, { "epoch": 6.354236157438292, "grad_norm": 1.3376898765563965, "step": 19050 }, { "epoch": 6.354236157438292, "learning_rate": 0.0002479252766869476, "step": 19050 }, { "epoch": 6.354236157438292, "loss": 0.46507495641708374, "step": 19050 }, { "ce_loss": 0.11169010400772095, "epoch": 6.354236157438292, "step": 19050 }, { "distill_loss": 0.24842992424964905, "epoch": 6.354236157438292, "step": 19050 }, { "epoch": 6.354236157438292, "ref_ce_loss": 0.1047307476401329, "step": 19050 }, { "epoch": 6.354236157438292, "loss": 1.4413424730300903, "step": 19050 }, { "ce_loss": 0.1381653994321823, "epoch": 6.354236157438292, "step": 19050 }, { "distill_loss": 0.33976855874061584, "epoch": 6.354236157438292, "step": 19050 }, { "epoch": 6.354236157438292, "ref_ce_loss": 0.10790744423866272, "step": 19050 }, { "epoch": 6.357571714476317, "loss": 0.6824, "step": 19060 }, { "epoch": 6.357571714476317, "grad_norm": 1.5171325206756592, "step": 19060 }, { "epoch": 6.357571714476317, "learning_rate": 0.00024752568331003, "step": 19060 }, { "epoch": 6.357571714476317, "loss": 0.6507919430732727, "step": 19060 }, { "ce_loss": 0.10762443393468857, "epoch": 6.357571714476317, "step": 19060 }, { "distill_loss": 0.2979751527309418, "epoch": 6.357571714476317, "step": 19060 }, { "epoch": 6.357571714476317, "ref_ce_loss": 0.1204538568854332, "step": 19060 }, { "epoch": 6.357571714476317, "loss": 0.6927547454833984, "step": 19060 }, { "ce_loss": 0.17206089198589325, "epoch": 6.357571714476317, "step": 19060 }, { "distill_loss": 0.4154990613460541, "epoch": 6.357571714476317, "step": 19060 }, { "epoch": 6.357571714476317, "ref_ce_loss": 0.10477491468191147, "step": 19060 }, { "epoch": 6.360907271514343, "loss": 0.6538, "step": 19070 }, { "epoch": 6.360907271514343, "grad_norm": 1.3551887273788452, "step": 19070 }, { "epoch": 6.360907271514343, "learning_rate": 0.0002471262678870469, "step": 19070 }, { "epoch": 6.360907271514343, "loss": 0.6207748055458069, "step": 19070 }, { "ce_loss": 0.13332809507846832, "epoch": 6.360907271514343, "step": 19070 }, { "distill_loss": 0.32056692242622375, "epoch": 6.360907271514343, "step": 19070 }, { "epoch": 6.360907271514343, "ref_ce_loss": 0.13821867108345032, "step": 19070 }, { "epoch": 6.360907271514343, "loss": 0.44919517636299133, "step": 19070 }, { "ce_loss": 0.05316970869898796, "epoch": 6.360907271514343, "step": 19070 }, { "distill_loss": 0.24726107716560364, "epoch": 6.360907271514343, "step": 19070 }, { "epoch": 6.360907271514343, "ref_ce_loss": 0.10930788516998291, "step": 19070 }, { "epoch": 6.364242828552368, "loss": 0.6372, "step": 19080 }, { "epoch": 6.364242828552368, "grad_norm": 6.933979511260986, "step": 19080 }, { "epoch": 6.364242828552368, "learning_rate": 0.000246727030884159, "step": 19080 }, { "epoch": 6.364242828552368, "loss": 0.485307514667511, "step": 19080 }, { "ce_loss": 0.10739357769489288, "epoch": 6.364242828552368, "step": 19080 }, { "distill_loss": 0.2198185920715332, "epoch": 6.364242828552368, "step": 19080 }, { "epoch": 6.364242828552368, "ref_ce_loss": 0.11307939141988754, "step": 19080 }, { "epoch": 6.364242828552368, "loss": 0.7215542197227478, "step": 19080 }, { "ce_loss": 0.2537827491760254, "epoch": 6.364242828552368, "step": 19080 }, { "distill_loss": 0.30832213163375854, "epoch": 6.364242828552368, "step": 19080 }, { "epoch": 6.364242828552368, "ref_ce_loss": 0.15916509926319122, "step": 19080 }, { "epoch": 6.367578385590393, "loss": 0.651, "step": 19090 }, { "epoch": 6.367578385590393, "grad_norm": 1.340692162513733, "step": 19090 }, { "epoch": 6.367578385590393, "learning_rate": 0.000246327972767319, "step": 19090 }, { "epoch": 6.367578385590393, "loss": 0.510690450668335, "step": 19090 }, { "ce_loss": 0.1185477003455162, "epoch": 6.367578385590393, "step": 19090 }, { "distill_loss": 0.2782058119773865, "epoch": 6.367578385590393, "step": 19090 }, { "epoch": 6.367578385590393, "ref_ce_loss": 0.11369862407445908, "step": 19090 }, { "epoch": 6.367578385590393, "loss": 0.4796235263347626, "step": 19090 }, { "ce_loss": 0.08776428550481796, "epoch": 6.367578385590393, "step": 19090 }, { "distill_loss": 0.2346237152814865, "epoch": 6.367578385590393, "step": 19090 }, { "epoch": 6.367578385590393, "ref_ce_loss": 0.0737283006310463, "step": 19090 }, { "epoch": 6.370913942628419, "loss": 0.6061, "step": 19100 }, { "epoch": 6.370913942628419, "grad_norm": 1.6497814655303955, "step": 19100 }, { "epoch": 6.370913942628419, "learning_rate": 0.0002459290940022705, "step": 19100 }, { "epoch": 6.370913942628419, "loss": 0.5280488133430481, "step": 19100 }, { "ce_loss": 0.10927163064479828, "epoch": 6.370913942628419, "step": 19100 }, { "distill_loss": 0.19644902646541595, "epoch": 6.370913942628419, "step": 19100 }, { "epoch": 6.370913942628419, "ref_ce_loss": 0.12360698729753494, "step": 19100 }, { "epoch": 6.370913942628419, "loss": 0.5164461731910706, "step": 19100 }, { "ce_loss": 0.11721847951412201, "epoch": 6.370913942628419, "step": 19100 }, { "distill_loss": 0.2527116537094116, "epoch": 6.370913942628419, "step": 19100 }, { "epoch": 6.370913942628419, "ref_ce_loss": 0.14623260498046875, "step": 19100 }, { "epoch": 6.374249499666444, "loss": 0.6256, "step": 19110 }, { "epoch": 6.374249499666444, "grad_norm": 3.0138490200042725, "step": 19110 }, { "epoch": 6.374249499666444, "learning_rate": 0.0002455303950545482, "step": 19110 }, { "epoch": 6.374249499666444, "loss": 0.6398062109947205, "step": 19110 }, { "ce_loss": 0.14219415187835693, "epoch": 6.374249499666444, "step": 19110 }, { "distill_loss": 0.3258860409259796, "epoch": 6.374249499666444, "step": 19110 }, { "epoch": 6.374249499666444, "ref_ce_loss": 0.13845248520374298, "step": 19110 }, { "epoch": 6.374249499666444, "loss": 0.47664421796798706, "step": 19110 }, { "ce_loss": 0.08407106250524521, "epoch": 6.374249499666444, "step": 19110 }, { "distill_loss": 0.2631029784679413, "epoch": 6.374249499666444, "step": 19110 }, { "epoch": 6.374249499666444, "ref_ce_loss": 0.09293530881404877, "step": 19110 }, { "epoch": 6.377585056704469, "loss": 0.5997, "step": 19120 }, { "epoch": 6.377585056704469, "grad_norm": 1.3262351751327515, "step": 19120 }, { "epoch": 6.377585056704469, "learning_rate": 0.00024513187638947634, "step": 19120 }, { "epoch": 6.377585056704469, "loss": 0.600747287273407, "step": 19120 }, { "ce_loss": 0.169780895113945, "epoch": 6.377585056704469, "step": 19120 }, { "distill_loss": 0.2887939214706421, "epoch": 6.377585056704469, "step": 19120 }, { "epoch": 6.377585056704469, "ref_ce_loss": 0.11096259951591492, "step": 19120 }, { "epoch": 6.377585056704469, "loss": 0.5601503849029541, "step": 19120 }, { "ce_loss": 0.11062528938055038, "epoch": 6.377585056704469, "step": 19120 }, { "distill_loss": 0.23963096737861633, "epoch": 6.377585056704469, "step": 19120 }, { "epoch": 6.377585056704469, "ref_ce_loss": 0.11627501249313354, "step": 19120 }, { "epoch": 6.380920613742495, "loss": 0.6067, "step": 19130 }, { "epoch": 6.380920613742495, "grad_norm": 2.5720584392547607, "step": 19130 }, { "epoch": 6.380920613742495, "learning_rate": 0.00024473353847216927, "step": 19130 }, { "epoch": 6.380920613742495, "loss": 0.9681487083435059, "step": 19130 }, { "ce_loss": 0.13055378198623657, "epoch": 6.380920613742495, "step": 19130 }, { "distill_loss": 0.22957968711853027, "epoch": 6.380920613742495, "step": 19130 }, { "epoch": 6.380920613742495, "ref_ce_loss": 0.1326015293598175, "step": 19130 }, { "epoch": 6.380920613742495, "loss": 0.7964392304420471, "step": 19130 }, { "ce_loss": 0.23628319799900055, "epoch": 6.380920613742495, "step": 19130 }, { "distill_loss": 0.41514474153518677, "epoch": 6.380920613742495, "step": 19130 }, { "epoch": 6.380920613742495, "ref_ce_loss": 0.11686284840106964, "step": 19130 }, { "epoch": 6.38425617078052, "loss": 0.6259, "step": 19140 }, { "epoch": 6.38425617078052, "grad_norm": 1.1457382440567017, "step": 19140 }, { "epoch": 6.38425617078052, "learning_rate": 0.00024433538176753, "step": 19140 }, { "epoch": 6.38425617078052, "loss": 0.47281309962272644, "step": 19140 }, { "ce_loss": 0.09508227556943893, "epoch": 6.38425617078052, "step": 19140 }, { "distill_loss": 0.21699288487434387, "epoch": 6.38425617078052, "step": 19140 }, { "epoch": 6.38425617078052, "ref_ce_loss": 0.09583717584609985, "step": 19140 }, { "epoch": 6.38425617078052, "loss": 0.5777966976165771, "step": 19140 }, { "ce_loss": 0.1204901710152626, "epoch": 6.38425617078052, "step": 19140 }, { "distill_loss": 0.2619478404521942, "epoch": 6.38425617078052, "step": 19140 }, { "epoch": 6.38425617078052, "ref_ce_loss": 0.10629819333553314, "step": 19140 }, { "epoch": 6.3875917278185455, "loss": 0.5729, "step": 19150 }, { "epoch": 6.3875917278185455, "grad_norm": 1.4310050010681152, "step": 19150 }, { "epoch": 6.3875917278185455, "learning_rate": 0.00024393740674025054, "step": 19150 }, { "epoch": 6.3875917278185455, "loss": 0.566169023513794, "step": 19150 }, { "ce_loss": 0.1300867199897766, "epoch": 6.3875917278185455, "step": 19150 }, { "distill_loss": 0.2211119830608368, "epoch": 6.3875917278185455, "step": 19150 }, { "epoch": 6.3875917278185455, "ref_ce_loss": 0.10674677044153214, "step": 19150 }, { "epoch": 6.3875917278185455, "loss": 0.4511871337890625, "step": 19150 }, { "ce_loss": 0.12526176869869232, "epoch": 6.3875917278185455, "step": 19150 }, { "distill_loss": 0.2322850525379181, "epoch": 6.3875917278185455, "step": 19150 }, { "epoch": 6.3875917278185455, "ref_ce_loss": 0.09337151795625687, "step": 19150 }, { "epoch": 6.390927284856571, "loss": 0.6005, "step": 19160 }, { "epoch": 6.390927284856571, "grad_norm": 1.8587944507598877, "step": 19160 }, { "epoch": 6.390927284856571, "learning_rate": 0.0002435396138548104, "step": 19160 }, { "epoch": 6.390927284856571, "loss": 0.7587735056877136, "step": 19160 }, { "ce_loss": 0.1708332747220993, "epoch": 6.390927284856571, "step": 19160 }, { "distill_loss": 0.2876565456390381, "epoch": 6.390927284856571, "step": 19160 }, { "epoch": 6.390927284856571, "ref_ce_loss": 0.14317108690738678, "step": 19160 }, { "epoch": 6.390927284856571, "loss": 0.49477240443229675, "step": 19160 }, { "ce_loss": 0.15645787119865417, "epoch": 6.390927284856571, "step": 19160 }, { "distill_loss": 0.19042925536632538, "epoch": 6.390927284856571, "step": 19160 }, { "epoch": 6.390927284856571, "ref_ce_loss": 0.1474367380142212, "step": 19160 }, { "epoch": 6.394262841894596, "loss": 0.6025, "step": 19170 }, { "epoch": 6.394262841894596, "grad_norm": 1.6388176679611206, "step": 19170 }, { "epoch": 6.394262841894596, "learning_rate": 0.00024314200357547684, "step": 19170 }, { "epoch": 6.394262841894596, "loss": 0.5780409574508667, "step": 19170 }, { "ce_loss": 0.1522267609834671, "epoch": 6.394262841894596, "step": 19170 }, { "distill_loss": 0.2433975338935852, "epoch": 6.394262841894596, "step": 19170 }, { "epoch": 6.394262841894596, "ref_ce_loss": 0.12864993512630463, "step": 19170 }, { "epoch": 6.394262841894596, "loss": 0.922885000705719, "step": 19170 }, { "ce_loss": 0.12585236132144928, "epoch": 6.394262841894596, "step": 19170 }, { "distill_loss": 0.24935932457447052, "epoch": 6.394262841894596, "step": 19170 }, { "epoch": 6.394262841894596, "ref_ce_loss": 0.14388763904571533, "step": 19170 }, { "epoch": 6.3975983989326215, "loss": 0.6638, "step": 19180 }, { "epoch": 6.3975983989326215, "grad_norm": 1.546163558959961, "step": 19180 }, { "epoch": 6.3975983989326215, "learning_rate": 0.00024274457636630365, "step": 19180 }, { "epoch": 6.3975983989326215, "loss": 0.5836116671562195, "step": 19180 }, { "ce_loss": 0.12790940701961517, "epoch": 6.3975983989326215, "step": 19180 }, { "distill_loss": 0.2516058087348938, "epoch": 6.3975983989326215, "step": 19180 }, { "epoch": 6.3975983989326215, "ref_ce_loss": 0.10985349118709564, "step": 19180 }, { "epoch": 6.3975983989326215, "loss": 0.3911502957344055, "step": 19180 }, { "ce_loss": 0.0864064171910286, "epoch": 6.3975983989326215, "step": 19180 }, { "distill_loss": 0.16342835128307343, "epoch": 6.3975983989326215, "step": 19180 }, { "epoch": 6.3975983989326215, "ref_ce_loss": 0.10683442652225494, "step": 19180 }, { "epoch": 6.400933955970647, "loss": 0.5883, "step": 19190 }, { "epoch": 6.400933955970647, "grad_norm": 3.076702833175659, "step": 19190 }, { "epoch": 6.400933955970647, "learning_rate": 0.00024234733269113128, "step": 19190 }, { "epoch": 6.400933955970647, "loss": 0.5868853330612183, "step": 19190 }, { "ce_loss": 0.1519070714712143, "epoch": 6.400933955970647, "step": 19190 }, { "distill_loss": 0.2489883005619049, "epoch": 6.400933955970647, "step": 19190 }, { "epoch": 6.400933955970647, "ref_ce_loss": 0.09567808359861374, "step": 19190 }, { "epoch": 6.400933955970647, "loss": 0.4550357460975647, "step": 19190 }, { "ce_loss": 0.1052570566534996, "epoch": 6.400933955970647, "step": 19190 }, { "distill_loss": 0.25030675530433655, "epoch": 6.400933955970647, "step": 19190 }, { "epoch": 6.400933955970647, "ref_ce_loss": 0.09924329817295074, "step": 19190 }, { "epoch": 6.404269513008672, "loss": 0.6101, "step": 19200 }, { "epoch": 6.404269513008672, "grad_norm": 3.8931031227111816, "step": 19200 }, { "epoch": 6.404269513008672, "learning_rate": 0.00024195027301358572, "step": 19200 }, { "epoch": 6.404269513008672, "loss": 0.5110546350479126, "step": 19200 }, { "ce_loss": 0.09627437591552734, "epoch": 6.404269513008672, "step": 19200 }, { "distill_loss": 0.24332520365715027, "epoch": 6.404269513008672, "step": 19200 }, { "epoch": 6.404269513008672, "ref_ce_loss": 0.13109424710273743, "step": 19200 }, { "epoch": 6.404269513008672, "loss": 0.6022369265556335, "step": 19200 }, { "ce_loss": 0.11745914071798325, "epoch": 6.404269513008672, "step": 19200 }, { "distill_loss": 0.22792620956897736, "epoch": 6.404269513008672, "step": 19200 }, { "epoch": 6.404269513008672, "ref_ce_loss": 0.11005257070064545, "step": 19200 }, { "epoch": 6.4076050700466975, "loss": 0.6063, "step": 19210 }, { "epoch": 6.4076050700466975, "grad_norm": 1.5678305625915527, "step": 19210 }, { "epoch": 6.4076050700466975, "learning_rate": 0.00024155339779707852, "step": 19210 }, { "epoch": 6.4076050700466975, "loss": 0.564750611782074, "step": 19210 }, { "ce_loss": 0.07955728471279144, "epoch": 6.4076050700466975, "step": 19210 }, { "distill_loss": 0.23164673149585724, "epoch": 6.4076050700466975, "step": 19210 }, { "epoch": 6.4076050700466975, "ref_ce_loss": 0.1055203229188919, "step": 19210 }, { "epoch": 6.4076050700466975, "loss": 0.6098896861076355, "step": 19210 }, { "ce_loss": 0.14870597422122955, "epoch": 6.4076050700466975, "step": 19210 }, { "distill_loss": 0.2938266098499298, "epoch": 6.4076050700466975, "step": 19210 }, { "epoch": 6.4076050700466975, "ref_ce_loss": 0.13252505660057068, "step": 19210 }, { "epoch": 6.410940627084723, "loss": 0.6006, "step": 19220 }, { "epoch": 6.410940627084723, "grad_norm": 1.7106890678405762, "step": 19220 }, { "epoch": 6.410940627084723, "learning_rate": 0.00024115670750480552, "step": 19220 }, { "epoch": 6.410940627084723, "loss": 0.834604799747467, "step": 19220 }, { "ce_loss": 0.23402222990989685, "epoch": 6.410940627084723, "step": 19220 }, { "distill_loss": 0.28607699275016785, "epoch": 6.410940627084723, "step": 19220 }, { "epoch": 6.410940627084723, "ref_ce_loss": 0.18355098366737366, "step": 19220 }, { "epoch": 6.410940627084723, "loss": 0.6679419279098511, "step": 19220 }, { "ce_loss": 0.176821768283844, "epoch": 6.410940627084723, "step": 19220 }, { "distill_loss": 0.2825266420841217, "epoch": 6.410940627084723, "step": 19220 }, { "epoch": 6.410940627084723, "ref_ce_loss": 0.11217405647039413, "step": 19220 }, { "epoch": 6.414276184122748, "loss": 0.6123, "step": 19230 }, { "epoch": 6.414276184122748, "grad_norm": 4.054162502288818, "step": 19230 }, { "epoch": 6.414276184122748, "learning_rate": 0.00024076020259974722, "step": 19230 }, { "epoch": 6.414276184122748, "loss": 0.9778499603271484, "step": 19230 }, { "ce_loss": 0.2200707495212555, "epoch": 6.414276184122748, "step": 19230 }, { "distill_loss": 0.3484407067298889, "epoch": 6.414276184122748, "step": 19230 }, { "epoch": 6.414276184122748, "ref_ce_loss": 0.1907111555337906, "step": 19230 }, { "epoch": 6.414276184122748, "loss": 0.4964205026626587, "step": 19230 }, { "ce_loss": 0.08122781664133072, "epoch": 6.414276184122748, "step": 19230 }, { "distill_loss": 0.20599791407585144, "epoch": 6.414276184122748, "step": 19230 }, { "epoch": 6.414276184122748, "ref_ce_loss": 0.16448451578617096, "step": 19230 }, { "epoch": 6.417611741160774, "loss": 0.5907, "step": 19240 }, { "epoch": 6.417611741160774, "grad_norm": 1.3038452863693237, "step": 19240 }, { "epoch": 6.417611741160774, "learning_rate": 0.00024036388354466728, "step": 19240 }, { "epoch": 6.417611741160774, "loss": 0.49369338154792786, "step": 19240 }, { "ce_loss": 0.093342624604702, "epoch": 6.417611741160774, "step": 19240 }, { "distill_loss": 0.2022968977689743, "epoch": 6.417611741160774, "step": 19240 }, { "epoch": 6.417611741160774, "ref_ce_loss": 0.10506366193294525, "step": 19240 }, { "epoch": 6.417611741160774, "loss": 0.6517524123191833, "step": 19240 }, { "ce_loss": 0.14238576591014862, "epoch": 6.417611741160774, "step": 19240 }, { "distill_loss": 0.2635531425476074, "epoch": 6.417611741160774, "step": 19240 }, { "epoch": 6.417611741160774, "ref_ce_loss": 0.11066452413797379, "step": 19240 }, { "epoch": 6.420947298198799, "loss": 0.5716, "step": 19250 }, { "epoch": 6.420947298198799, "grad_norm": 1.5243655443191528, "step": 19250 }, { "epoch": 6.420947298198799, "learning_rate": 0.00023996775080211276, "step": 19250 }, { "epoch": 6.420947298198799, "loss": 0.6852512359619141, "step": 19250 }, { "ce_loss": 0.1377100795507431, "epoch": 6.420947298198799, "step": 19250 }, { "distill_loss": 0.2286660373210907, "epoch": 6.420947298198799, "step": 19250 }, { "epoch": 6.420947298198799, "ref_ce_loss": 0.09662225842475891, "step": 19250 }, { "epoch": 6.420947298198799, "loss": 0.41076192259788513, "step": 19250 }, { "ce_loss": 0.09102330356836319, "epoch": 6.420947298198799, "step": 19250 }, { "distill_loss": 0.20911958813667297, "epoch": 6.420947298198799, "step": 19250 }, { "epoch": 6.420947298198799, "ref_ce_loss": 0.11036841571331024, "step": 19250 }, { "epoch": 6.424282855236824, "loss": 0.6531, "step": 19260 }, { "epoch": 6.424282855236824, "grad_norm": 1.2440197467803955, "step": 19260 }, { "epoch": 6.424282855236824, "learning_rate": 0.00023957180483441336, "step": 19260 }, { "epoch": 6.424282855236824, "loss": 0.46109625697135925, "step": 19260 }, { "ce_loss": 0.10086578875780106, "epoch": 6.424282855236824, "step": 19260 }, { "distill_loss": 0.23006482422351837, "epoch": 6.424282855236824, "step": 19260 }, { "epoch": 6.424282855236824, "ref_ce_loss": 0.10876964032649994, "step": 19260 }, { "epoch": 6.424282855236824, "loss": 0.7225244641304016, "step": 19260 }, { "ce_loss": 0.18343444168567657, "epoch": 6.424282855236824, "step": 19260 }, { "distill_loss": 0.28087159991264343, "epoch": 6.424282855236824, "step": 19260 }, { "epoch": 6.424282855236824, "ref_ce_loss": 0.15784253180027008, "step": 19260 }, { "epoch": 6.42761841227485, "loss": 0.6306, "step": 19270 }, { "epoch": 6.42761841227485, "grad_norm": 2.5110762119293213, "step": 19270 }, { "epoch": 6.42761841227485, "learning_rate": 0.00023917604610368049, "step": 19270 }, { "epoch": 6.42761841227485, "loss": 0.5681543946266174, "step": 19270 }, { "ce_loss": 0.15568585693836212, "epoch": 6.42761841227485, "step": 19270 }, { "distill_loss": 0.28800246119499207, "epoch": 6.42761841227485, "step": 19270 }, { "epoch": 6.42761841227485, "ref_ce_loss": 0.12352965772151947, "step": 19270 }, { "epoch": 6.42761841227485, "loss": 0.45578742027282715, "step": 19270 }, { "ce_loss": 0.08018852770328522, "epoch": 6.42761841227485, "step": 19270 }, { "distill_loss": 0.2554260492324829, "epoch": 6.42761841227485, "step": 19270 }, { "epoch": 6.42761841227485, "ref_ce_loss": 0.09290342777967453, "step": 19270 }, { "epoch": 6.430953969312875, "loss": 0.5742, "step": 19280 }, { "epoch": 6.430953969312875, "grad_norm": 1.568680763244629, "step": 19280 }, { "epoch": 6.430953969312875, "learning_rate": 0.00023878047507180718, "step": 19280 }, { "epoch": 6.430953969312875, "loss": 0.6375356912612915, "step": 19280 }, { "ce_loss": 0.19469177722930908, "epoch": 6.430953969312875, "step": 19280 }, { "distill_loss": 0.29809805750846863, "epoch": 6.430953969312875, "step": 19280 }, { "epoch": 6.430953969312875, "ref_ce_loss": 0.11061777174472809, "step": 19280 }, { "epoch": 6.430953969312875, "loss": 0.6644027829170227, "step": 19280 }, { "ce_loss": 0.16229896247386932, "epoch": 6.430953969312875, "step": 19280 }, { "distill_loss": 0.2902613878250122, "epoch": 6.430953969312875, "step": 19280 }, { "epoch": 6.430953969312875, "ref_ce_loss": 0.11752331256866455, "step": 19280 }, { "epoch": 6.4342895263509, "loss": 0.6654, "step": 19290 }, { "epoch": 6.4342895263509, "grad_norm": 4.854502201080322, "step": 19290 }, { "epoch": 6.4342895263509, "learning_rate": 0.0002383850922004674, "step": 19290 }, { "epoch": 6.4342895263509, "loss": 0.6462293863296509, "step": 19290 }, { "ce_loss": 0.15397413074970245, "epoch": 6.4342895263509, "step": 19290 }, { "distill_loss": 0.36222219467163086, "epoch": 6.4342895263509, "step": 19290 }, { "epoch": 6.4342895263509, "ref_ce_loss": 0.09376648813486099, "step": 19290 }, { "epoch": 6.4342895263509, "loss": 0.6333227753639221, "step": 19290 }, { "ce_loss": 0.12945379316806793, "epoch": 6.4342895263509, "step": 19290 }, { "distill_loss": 0.3720274567604065, "epoch": 6.4342895263509, "step": 19290 }, { "epoch": 6.4342895263509, "ref_ce_loss": 0.09307526051998138, "step": 19290 }, { "epoch": 6.437625083388926, "loss": 0.6233, "step": 19300 }, { "epoch": 6.437625083388926, "grad_norm": 1.6913163661956787, "step": 19300 }, { "epoch": 6.437625083388926, "learning_rate": 0.00023798989795111556, "step": 19300 }, { "epoch": 6.437625083388926, "loss": 0.5537348985671997, "step": 19300 }, { "ce_loss": 0.12098418176174164, "epoch": 6.437625083388926, "step": 19300 }, { "distill_loss": 0.2599371075630188, "epoch": 6.437625083388926, "step": 19300 }, { "epoch": 6.437625083388926, "ref_ce_loss": 0.10150939226150513, "step": 19300 }, { "epoch": 6.437625083388926, "loss": 0.6065669655799866, "step": 19300 }, { "ce_loss": 0.13648119568824768, "epoch": 6.437625083388926, "step": 19300 }, { "distill_loss": 0.3049774765968323, "epoch": 6.437625083388926, "step": 19300 }, { "epoch": 6.437625083388926, "ref_ce_loss": 0.14003805816173553, "step": 19300 }, { "epoch": 6.440960640426951, "loss": 0.6065, "step": 19310 }, { "epoch": 6.440960640426951, "grad_norm": 2.4706809520721436, "step": 19310 }, { "epoch": 6.440960640426951, "learning_rate": 0.0002375948927849857, "step": 19310 }, { "epoch": 6.440960640426951, "loss": 0.49890419840812683, "step": 19310 }, { "ce_loss": 0.09926062822341919, "epoch": 6.440960640426951, "step": 19310 }, { "distill_loss": 0.26944243907928467, "epoch": 6.440960640426951, "step": 19310 }, { "epoch": 6.440960640426951, "ref_ce_loss": 0.1298120617866516, "step": 19310 }, { "epoch": 6.440960640426951, "loss": 0.5722777843475342, "step": 19310 }, { "ce_loss": 0.12331241369247437, "epoch": 6.440960640426951, "step": 19310 }, { "distill_loss": 0.3165596127510071, "epoch": 6.440960640426951, "step": 19310 }, { "epoch": 6.440960640426951, "ref_ce_loss": 0.09562243521213531, "step": 19310 }, { "epoch": 6.444296197464976, "loss": 0.5972, "step": 19320 }, { "epoch": 6.444296197464976, "grad_norm": 1.4854732751846313, "step": 19320 }, { "epoch": 6.444296197464976, "learning_rate": 0.0002372000771630916, "step": 19320 }, { "epoch": 6.444296197464976, "loss": 0.6584548354148865, "step": 19320 }, { "ce_loss": 0.12285207957029343, "epoch": 6.444296197464976, "step": 19320 }, { "distill_loss": 0.23300600051879883, "epoch": 6.444296197464976, "step": 19320 }, { "epoch": 6.444296197464976, "ref_ce_loss": 0.14293143153190613, "step": 19320 }, { "epoch": 6.444296197464976, "loss": 0.6606813669204712, "step": 19320 }, { "ce_loss": 0.163771390914917, "epoch": 6.444296197464976, "step": 19320 }, { "distill_loss": 0.263140469789505, "epoch": 6.444296197464976, "step": 19320 }, { "epoch": 6.444296197464976, "ref_ce_loss": 0.12971723079681396, "step": 19320 }, { "epoch": 6.447631754503002, "loss": 0.6658, "step": 19330 }, { "epoch": 6.447631754503002, "grad_norm": 2.2731552124023438, "step": 19330 }, { "epoch": 6.447631754503002, "learning_rate": 0.00023680545154622533, "step": 19330 }, { "epoch": 6.447631754503002, "loss": 0.8783732056617737, "step": 19330 }, { "ce_loss": 0.1531582623720169, "epoch": 6.447631754503002, "step": 19330 }, { "distill_loss": 0.2370985746383667, "epoch": 6.447631754503002, "step": 19330 }, { "epoch": 6.447631754503002, "ref_ce_loss": 0.10351847857236862, "step": 19330 }, { "epoch": 6.447631754503002, "loss": 0.33935779333114624, "step": 19330 }, { "ce_loss": 0.07009600847959518, "epoch": 6.447631754503002, "step": 19330 }, { "distill_loss": 0.18475466966629028, "epoch": 6.447631754503002, "step": 19330 }, { "epoch": 6.447631754503002, "ref_ce_loss": 0.08419663459062576, "step": 19330 }, { "epoch": 6.450967311541027, "loss": 0.5768, "step": 19340 }, { "epoch": 6.450967311541027, "grad_norm": 1.3007348775863647, "step": 19340 }, { "epoch": 6.450967311541027, "learning_rate": 0.0002364110163949577, "step": 19340 }, { "epoch": 6.450967311541027, "loss": 0.5033159255981445, "step": 19340 }, { "ce_loss": 0.10342025011777878, "epoch": 6.450967311541027, "step": 19340 }, { "distill_loss": 0.2683402895927429, "epoch": 6.450967311541027, "step": 19340 }, { "epoch": 6.450967311541027, "ref_ce_loss": 0.1313643902540207, "step": 19340 }, { "epoch": 6.450967311541027, "loss": 1.1450793743133545, "step": 19340 }, { "ce_loss": 0.19282738864421844, "epoch": 6.450967311541027, "step": 19340 }, { "distill_loss": 0.30856090784072876, "epoch": 6.450967311541027, "step": 19340 }, { "epoch": 6.450967311541027, "ref_ce_loss": 0.14692862331867218, "step": 19340 }, { "epoch": 6.454302868579052, "loss": 0.608, "step": 19350 }, { "epoch": 6.454302868579052, "grad_norm": 1.5652614831924438, "step": 19350 }, { "epoch": 6.454302868579052, "learning_rate": 0.00023601677216963674, "step": 19350 }, { "epoch": 6.454302868579052, "loss": 0.8717043399810791, "step": 19350 }, { "ce_loss": 0.18375054001808167, "epoch": 6.454302868579052, "step": 19350 }, { "distill_loss": 0.2988278269767761, "epoch": 6.454302868579052, "step": 19350 }, { "epoch": 6.454302868579052, "ref_ce_loss": 0.14031578600406647, "step": 19350 }, { "epoch": 6.454302868579052, "loss": 0.47830939292907715, "step": 19350 }, { "ce_loss": 0.12546370923519135, "epoch": 6.454302868579052, "step": 19350 }, { "distill_loss": 0.22042587399482727, "epoch": 6.454302868579052, "step": 19350 }, { "epoch": 6.454302868579052, "ref_ce_loss": 0.0987526997923851, "step": 19350 }, { "epoch": 6.457638425617078, "loss": 0.682, "step": 19360 }, { "epoch": 6.457638425617078, "grad_norm": 1.4231470823287964, "step": 19360 }, { "epoch": 6.457638425617078, "learning_rate": 0.0002356227193303879, "step": 19360 }, { "epoch": 6.457638425617078, "loss": 0.6524989604949951, "step": 19360 }, { "ce_loss": 0.11222806572914124, "epoch": 6.457638425617078, "step": 19360 }, { "distill_loss": 0.24415633082389832, "epoch": 6.457638425617078, "step": 19360 }, { "epoch": 6.457638425617078, "ref_ce_loss": 0.1500520259141922, "step": 19360 }, { "epoch": 6.457638425617078, "loss": 0.576420783996582, "step": 19360 }, { "ce_loss": 0.1236271932721138, "epoch": 6.457638425617078, "step": 19360 }, { "distill_loss": 0.2536475658416748, "epoch": 6.457638425617078, "step": 19360 }, { "epoch": 6.457638425617078, "ref_ce_loss": 0.12630252540111542, "step": 19360 }, { "epoch": 6.460973982655103, "loss": 0.5793, "step": 19370 }, { "epoch": 6.460973982655103, "grad_norm": 1.6094447374343872, "step": 19370 }, { "epoch": 6.460973982655103, "learning_rate": 0.00023522885833711339, "step": 19370 }, { "epoch": 6.460973982655103, "loss": 0.5709271430969238, "step": 19370 }, { "ce_loss": 0.17371134459972382, "epoch": 6.460973982655103, "step": 19370 }, { "distill_loss": 0.29568082094192505, "epoch": 6.460973982655103, "step": 19370 }, { "epoch": 6.460973982655103, "ref_ce_loss": 0.1014719232916832, "step": 19370 }, { "epoch": 6.460973982655103, "loss": 0.3881910443305969, "step": 19370 }, { "ce_loss": 0.07997575402259827, "epoch": 6.460973982655103, "step": 19370 }, { "distill_loss": 0.16086122393608093, "epoch": 6.460973982655103, "step": 19370 }, { "epoch": 6.460973982655103, "ref_ce_loss": 0.11414096504449844, "step": 19370 }, { "epoch": 6.4643095396931285, "loss": 0.5655, "step": 19380 }, { "epoch": 6.4643095396931285, "grad_norm": 1.544553279876709, "step": 19380 }, { "epoch": 6.4643095396931285, "learning_rate": 0.0002348351896494914, "step": 19380 }, { "epoch": 6.4643095396931285, "loss": 0.7244067192077637, "step": 19380 }, { "ce_loss": 0.13852331042289734, "epoch": 6.4643095396931285, "step": 19380 }, { "distill_loss": 0.23501816391944885, "epoch": 6.4643095396931285, "step": 19380 }, { "epoch": 6.4643095396931285, "ref_ce_loss": 0.13060513138771057, "step": 19380 }, { "epoch": 6.4643095396931285, "loss": 0.3620413541793823, "step": 19380 }, { "ce_loss": 0.08747374266386032, "epoch": 6.4643095396931285, "step": 19380 }, { "distill_loss": 0.17260627448558807, "epoch": 6.4643095396931285, "step": 19380 }, { "epoch": 6.4643095396931285, "ref_ce_loss": 0.07209240645170212, "step": 19380 }, { "epoch": 6.467645096731154, "loss": 0.5893, "step": 19390 }, { "epoch": 6.467645096731154, "grad_norm": 1.6576930284500122, "step": 19390 }, { "epoch": 6.467645096731154, "learning_rate": 0.00023444171372697547, "step": 19390 }, { "epoch": 6.467645096731154, "loss": 0.5739819407463074, "step": 19390 }, { "ce_loss": 0.133833110332489, "epoch": 6.467645096731154, "step": 19390 }, { "distill_loss": 0.27732205390930176, "epoch": 6.467645096731154, "step": 19390 }, { "epoch": 6.467645096731154, "ref_ce_loss": 0.12894289195537567, "step": 19390 }, { "epoch": 6.467645096731154, "loss": 1.0972142219543457, "step": 19390 }, { "ce_loss": 0.2051316499710083, "epoch": 6.467645096731154, "step": 19390 }, { "distill_loss": 0.2552916407585144, "epoch": 6.467645096731154, "step": 19390 }, { "epoch": 6.467645096731154, "ref_ce_loss": 0.10956212878227234, "step": 19390 }, { "epoch": 6.470980653769179, "loss": 0.6861, "step": 19400 }, { "epoch": 6.470980653769179, "grad_norm": 1.7194758653640747, "step": 19400 }, { "epoch": 6.470980653769179, "learning_rate": 0.00023404843102879452, "step": 19400 }, { "epoch": 6.470980653769179, "loss": 0.6886380910873413, "step": 19400 }, { "ce_loss": 0.1668749749660492, "epoch": 6.470980653769179, "step": 19400 }, { "distill_loss": 0.26839739084243774, "epoch": 6.470980653769179, "step": 19400 }, { "epoch": 6.470980653769179, "ref_ce_loss": 0.1428913176059723, "step": 19400 }, { "epoch": 6.470980653769179, "loss": 0.5019069314002991, "step": 19400 }, { "ce_loss": 0.09855532646179199, "epoch": 6.470980653769179, "step": 19400 }, { "distill_loss": 0.23468101024627686, "epoch": 6.470980653769179, "step": 19400 }, { "epoch": 6.470980653769179, "ref_ce_loss": 0.08430713415145874, "step": 19400 }, { "epoch": 6.4743162108072045, "loss": 0.6133, "step": 19410 }, { "epoch": 6.4743162108072045, "grad_norm": 1.5843451023101807, "step": 19410 }, { "epoch": 6.4743162108072045, "learning_rate": 0.0002336553420139516, "step": 19410 }, { "epoch": 6.4743162108072045, "loss": 0.6567657589912415, "step": 19410 }, { "ce_loss": 0.07436903566122055, "epoch": 6.4743162108072045, "step": 19410 }, { "distill_loss": 0.20002348721027374, "epoch": 6.4743162108072045, "step": 19410 }, { "epoch": 6.4743162108072045, "ref_ce_loss": 0.13849644362926483, "step": 19410 }, { "epoch": 6.4743162108072045, "loss": 0.6510065793991089, "step": 19410 }, { "ce_loss": 0.1610298454761505, "epoch": 6.4743162108072045, "step": 19410 }, { "distill_loss": 0.3236733376979828, "epoch": 6.4743162108072045, "step": 19410 }, { "epoch": 6.4743162108072045, "ref_ce_loss": 0.1232231929898262, "step": 19410 }, { "epoch": 6.47765176784523, "loss": 0.6563, "step": 19420 }, { "epoch": 6.47765176784523, "grad_norm": 1.6471127271652222, "step": 19420 }, { "epoch": 6.47765176784523, "learning_rate": 0.0002332624471412241, "step": 19420 }, { "epoch": 6.47765176784523, "loss": 0.5237685441970825, "step": 19420 }, { "ce_loss": 0.12244009971618652, "epoch": 6.47765176784523, "step": 19420 }, { "distill_loss": 0.22729995846748352, "epoch": 6.47765176784523, "step": 19420 }, { "epoch": 6.47765176784523, "ref_ce_loss": 0.11839036643505096, "step": 19420 }, { "epoch": 6.47765176784523, "loss": 0.8403283357620239, "step": 19420 }, { "ce_loss": 0.15734674036502838, "epoch": 6.47765176784523, "step": 19420 }, { "distill_loss": 0.24373812973499298, "epoch": 6.47765176784523, "step": 19420 }, { "epoch": 6.47765176784523, "ref_ce_loss": 0.12084154784679413, "step": 19420 }, { "epoch": 6.480987324883255, "loss": 0.6131, "step": 19430 }, { "epoch": 6.480987324883255, "grad_norm": 1.3815033435821533, "step": 19430 }, { "epoch": 6.480987324883255, "learning_rate": 0.00023286974686916235, "step": 19430 }, { "epoch": 6.480987324883255, "loss": 0.6015914082527161, "step": 19430 }, { "ce_loss": 0.16842041909694672, "epoch": 6.480987324883255, "step": 19430 }, { "distill_loss": 0.2824459671974182, "epoch": 6.480987324883255, "step": 19430 }, { "epoch": 6.480987324883255, "ref_ce_loss": 0.10923143476247787, "step": 19430 }, { "epoch": 6.480987324883255, "loss": 0.4720430076122284, "step": 19430 }, { "ce_loss": 0.13118982315063477, "epoch": 6.480987324883255, "step": 19430 }, { "distill_loss": 0.21990752220153809, "epoch": 6.480987324883255, "step": 19430 }, { "epoch": 6.480987324883255, "ref_ce_loss": 0.12076015025377274, "step": 19430 }, { "epoch": 6.484322881921281, "loss": 0.5629, "step": 19440 }, { "epoch": 6.484322881921281, "grad_norm": 1.0681235790252686, "step": 19440 }, { "epoch": 6.484322881921281, "learning_rate": 0.00023247724165609, "step": 19440 }, { "epoch": 6.484322881921281, "loss": 0.7614908814430237, "step": 19440 }, { "ce_loss": 0.1435561329126358, "epoch": 6.484322881921281, "step": 19440 }, { "distill_loss": 0.29191237688064575, "epoch": 6.484322881921281, "step": 19440 }, { "epoch": 6.484322881921281, "ref_ce_loss": 0.12050356715917587, "step": 19440 }, { "epoch": 6.484322881921281, "loss": 0.49718719720840454, "step": 19440 }, { "ce_loss": 0.09759809821844101, "epoch": 6.484322881921281, "step": 19440 }, { "distill_loss": 0.24840621650218964, "epoch": 6.484322881921281, "step": 19440 }, { "epoch": 6.484322881921281, "ref_ce_loss": 0.10465782880783081, "step": 19440 }, { "epoch": 6.487658438959306, "loss": 0.612, "step": 19450 }, { "epoch": 6.487658438959306, "grad_norm": 1.1900967359542847, "step": 19450 }, { "epoch": 6.487658438959306, "learning_rate": 0.00023208493196010292, "step": 19450 }, { "epoch": 6.487658438959306, "loss": 0.7231645584106445, "step": 19450 }, { "ce_loss": 0.1517426073551178, "epoch": 6.487658438959306, "step": 19450 }, { "distill_loss": 0.2369062751531601, "epoch": 6.487658438959306, "step": 19450 }, { "epoch": 6.487658438959306, "ref_ce_loss": 0.116610586643219, "step": 19450 }, { "epoch": 6.487658438959306, "loss": 0.6371214389801025, "step": 19450 }, { "ce_loss": 0.1503915637731552, "epoch": 6.487658438959306, "step": 19450 }, { "distill_loss": 0.26254144310951233, "epoch": 6.487658438959306, "step": 19450 }, { "epoch": 6.487658438959306, "ref_ce_loss": 0.11877196282148361, "step": 19450 }, { "epoch": 6.490993995997331, "loss": 0.5708, "step": 19460 }, { "epoch": 6.490993995997331, "grad_norm": 1.619773507118225, "step": 19460 }, { "epoch": 6.490993995997331, "learning_rate": 0.00023169281823906857, "step": 19460 }, { "epoch": 6.490993995997331, "loss": 0.3980375826358795, "step": 19460 }, { "ce_loss": 0.1058371439576149, "epoch": 6.490993995997331, "step": 19460 }, { "distill_loss": 0.19484549760818481, "epoch": 6.490993995997331, "step": 19460 }, { "epoch": 6.490993995997331, "ref_ce_loss": 0.09722219407558441, "step": 19460 }, { "epoch": 6.490993995997331, "loss": 0.5850816369056702, "step": 19460 }, { "ce_loss": 0.16564002633094788, "epoch": 6.490993995997331, "step": 19460 }, { "distill_loss": 0.25721365213394165, "epoch": 6.490993995997331, "step": 19460 }, { "epoch": 6.490993995997331, "ref_ce_loss": 0.11461980640888214, "step": 19460 }, { "epoch": 6.494329553035357, "loss": 0.5651, "step": 19470 }, { "epoch": 6.494329553035357, "grad_norm": 1.083567500114441, "step": 19470 }, { "epoch": 6.494329553035357, "learning_rate": 0.00023130090095062572, "step": 19470 }, { "epoch": 6.494329553035357, "loss": 0.7174849510192871, "step": 19470 }, { "ce_loss": 0.11787024140357971, "epoch": 6.494329553035357, "step": 19470 }, { "distill_loss": 0.2558152675628662, "epoch": 6.494329553035357, "step": 19470 }, { "epoch": 6.494329553035357, "ref_ce_loss": 0.13834647834300995, "step": 19470 }, { "epoch": 6.494329553035357, "loss": 0.40972697734832764, "step": 19470 }, { "ce_loss": 0.07389426976442337, "epoch": 6.494329553035357, "step": 19470 }, { "distill_loss": 0.21317392587661743, "epoch": 6.494329553035357, "step": 19470 }, { "epoch": 6.494329553035357, "ref_ce_loss": 0.08200264722108841, "step": 19470 }, { "epoch": 6.497665110073382, "loss": 0.5995, "step": 19480 }, { "epoch": 6.497665110073382, "grad_norm": 1.8454265594482422, "step": 19480 }, { "epoch": 6.497665110073382, "learning_rate": 0.00023090918055218462, "step": 19480 }, { "epoch": 6.497665110073382, "loss": 0.6001446843147278, "step": 19480 }, { "ce_loss": 0.16934876143932343, "epoch": 6.497665110073382, "step": 19480 }, { "distill_loss": 0.2737237215042114, "epoch": 6.497665110073382, "step": 19480 }, { "epoch": 6.497665110073382, "ref_ce_loss": 0.1263040155172348, "step": 19480 }, { "epoch": 6.497665110073382, "loss": 0.48116031289100647, "step": 19480 }, { "ce_loss": 0.11749963462352753, "epoch": 6.497665110073382, "step": 19480 }, { "distill_loss": 0.2328428030014038, "epoch": 6.497665110073382, "step": 19480 }, { "epoch": 6.497665110073382, "ref_ce_loss": 0.13069671392440796, "step": 19480 }, { "epoch": 6.501000667111407, "loss": 0.5274, "step": 19490 }, { "epoch": 6.501000667111407, "grad_norm": 1.5844311714172363, "step": 19490 }, { "epoch": 6.501000667111407, "learning_rate": 0.00023051765750092454, "step": 19490 }, { "epoch": 6.501000667111407, "loss": 0.5427977442741394, "step": 19490 }, { "ce_loss": 0.1288299262523651, "epoch": 6.501000667111407, "step": 19490 }, { "distill_loss": 0.24468667805194855, "epoch": 6.501000667111407, "step": 19490 }, { "epoch": 6.501000667111407, "ref_ce_loss": 0.13324861228466034, "step": 19490 }, { "epoch": 6.501000667111407, "loss": 0.6738988161087036, "step": 19490 }, { "ce_loss": 0.17870159447193146, "epoch": 6.501000667111407, "step": 19490 }, { "distill_loss": 0.2239747941493988, "epoch": 6.501000667111407, "step": 19490 }, { "epoch": 6.501000667111407, "ref_ce_loss": 0.15819533169269562, "step": 19490 }, { "epoch": 6.504336224149433, "loss": 0.6324, "step": 19500 }, { "epoch": 6.504336224149433, "grad_norm": 1.2954641580581665, "step": 19500 }, { "epoch": 6.504336224149433, "learning_rate": 0.00023012633225379526, "step": 19500 }, { "epoch": 6.504336224149433, "loss": 0.6101667284965515, "step": 19500 }, { "ce_loss": 0.1754041314125061, "epoch": 6.504336224149433, "step": 19500 }, { "distill_loss": 0.2514132559299469, "epoch": 6.504336224149433, "step": 19500 }, { "epoch": 6.504336224149433, "ref_ce_loss": 0.11005375534296036, "step": 19500 }, { "epoch": 6.504336224149433, "loss": 0.5737624764442444, "step": 19500 }, { "ce_loss": 0.13218827545642853, "epoch": 6.504336224149433, "step": 19500 }, { "distill_loss": 0.26241040229797363, "epoch": 6.504336224149433, "step": 19500 }, { "epoch": 6.504336224149433, "ref_ce_loss": 0.15054892003536224, "step": 19500 }, { "epoch": 6.507671781187458, "loss": 0.5694, "step": 19510 }, { "epoch": 6.507671781187458, "grad_norm": 1.4394662380218506, "step": 19510 }, { "epoch": 6.507671781187458, "learning_rate": 0.00022973520526751534, "step": 19510 }, { "epoch": 6.507671781187458, "loss": 0.5179211497306824, "step": 19510 }, { "ce_loss": 0.14793452620506287, "epoch": 6.507671781187458, "step": 19510 }, { "distill_loss": 0.23366346955299377, "epoch": 6.507671781187458, "step": 19510 }, { "epoch": 6.507671781187458, "ref_ce_loss": 0.11347433179616928, "step": 19510 }, { "epoch": 6.507671781187458, "loss": 0.44513532519340515, "step": 19510 }, { "ce_loss": 0.11363400518894196, "epoch": 6.507671781187458, "step": 19510 }, { "distill_loss": 0.25194841623306274, "epoch": 6.507671781187458, "step": 19510 }, { "epoch": 6.507671781187458, "ref_ce_loss": 0.07942268252372742, "step": 19510 }, { "epoch": 6.511007338225483, "loss": 0.5855, "step": 19520 }, { "epoch": 6.511007338225483, "grad_norm": 2.0026657581329346, "step": 19520 }, { "epoch": 6.511007338225483, "learning_rate": 0.00022934427699857212, "step": 19520 }, { "epoch": 6.511007338225483, "loss": 0.41633138060569763, "step": 19520 }, { "ce_loss": 0.08929416537284851, "epoch": 6.511007338225483, "step": 19520 }, { "distill_loss": 0.18606798350811005, "epoch": 6.511007338225483, "step": 19520 }, { "epoch": 6.511007338225483, "ref_ce_loss": 0.10715822130441666, "step": 19520 }, { "epoch": 6.511007338225483, "loss": 0.617181658744812, "step": 19520 }, { "ce_loss": 0.1636533886194229, "epoch": 6.511007338225483, "step": 19520 }, { "distill_loss": 0.24410457909107208, "epoch": 6.511007338225483, "step": 19520 }, { "epoch": 6.511007338225483, "ref_ce_loss": 0.10966235399246216, "step": 19520 }, { "epoch": 6.514342895263509, "loss": 0.6211, "step": 19530 }, { "epoch": 6.514342895263509, "grad_norm": 1.1486506462097168, "step": 19530 }, { "epoch": 6.514342895263509, "learning_rate": 0.00022895354790322122, "step": 19530 }, { "epoch": 6.514342895263509, "loss": 0.5419558882713318, "step": 19530 }, { "ce_loss": 0.0822252556681633, "epoch": 6.514342895263509, "step": 19530 }, { "distill_loss": 0.24467316269874573, "epoch": 6.514342895263509, "step": 19530 }, { "epoch": 6.514342895263509, "ref_ce_loss": 0.1084655374288559, "step": 19530 }, { "epoch": 6.514342895263509, "loss": 0.501002848148346, "step": 19530 }, { "ce_loss": 0.09863227605819702, "epoch": 6.514342895263509, "step": 19530 }, { "distill_loss": 0.250343918800354, "epoch": 6.514342895263509, "step": 19530 }, { "epoch": 6.514342895263509, "ref_ce_loss": 0.12293490767478943, "step": 19530 }, { "epoch": 6.517678452301534, "loss": 0.5737, "step": 19540 }, { "epoch": 6.517678452301534, "grad_norm": 1.5684605836868286, "step": 19540 }, { "epoch": 6.517678452301534, "learning_rate": 0.000228563018437485, "step": 19540 }, { "epoch": 6.517678452301534, "loss": 0.42738595604896545, "step": 19540 }, { "ce_loss": 0.10307467728853226, "epoch": 6.517678452301534, "step": 19540 }, { "distill_loss": 0.18302753567695618, "epoch": 6.517678452301534, "step": 19540 }, { "epoch": 6.517678452301534, "ref_ce_loss": 0.08789996057748795, "step": 19540 }, { "epoch": 6.517678452301534, "loss": 0.5575722455978394, "step": 19540 }, { "ce_loss": 0.10519887506961823, "epoch": 6.517678452301534, "step": 19540 }, { "distill_loss": 0.1517784595489502, "epoch": 6.517678452301534, "step": 19540 }, { "epoch": 6.517678452301534, "ref_ce_loss": 0.07769259810447693, "step": 19540 }, { "epoch": 6.521014009339559, "loss": 0.582, "step": 19550 }, { "epoch": 6.521014009339559, "grad_norm": 1.8316733837127686, "step": 19550 }, { "epoch": 6.521014009339559, "learning_rate": 0.0002281726890571537, "step": 19550 }, { "epoch": 6.521014009339559, "loss": 0.5858330726623535, "step": 19550 }, { "ce_loss": 0.14364883303642273, "epoch": 6.521014009339559, "step": 19550 }, { "distill_loss": 0.2579484283924103, "epoch": 6.521014009339559, "step": 19550 }, { "epoch": 6.521014009339559, "ref_ce_loss": 0.14223860204219818, "step": 19550 }, { "epoch": 6.521014009339559, "loss": 0.5081810355186462, "step": 19550 }, { "ce_loss": 0.0988958552479744, "epoch": 6.521014009339559, "step": 19550 }, { "distill_loss": 0.21971705555915833, "epoch": 6.521014009339559, "step": 19550 }, { "epoch": 6.521014009339559, "ref_ce_loss": 0.1278819441795349, "step": 19550 }, { "epoch": 6.524349566377585, "loss": 0.5486, "step": 19560 }, { "epoch": 6.524349566377585, "grad_norm": 1.4172102212905884, "step": 19560 }, { "epoch": 6.524349566377585, "learning_rate": 0.00022778256021778367, "step": 19560 }, { "epoch": 6.524349566377585, "loss": 0.5101405382156372, "step": 19560 }, { "ce_loss": 0.15842664241790771, "epoch": 6.524349566377585, "step": 19560 }, { "distill_loss": 0.2162400782108307, "epoch": 6.524349566377585, "step": 19560 }, { "epoch": 6.524349566377585, "ref_ce_loss": 0.10916368663311005, "step": 19560 }, { "epoch": 6.524349566377585, "loss": 0.6007856130599976, "step": 19560 }, { "ce_loss": 0.12771476805210114, "epoch": 6.524349566377585, "step": 19560 }, { "distill_loss": 0.230327770113945, "epoch": 6.524349566377585, "step": 19560 }, { "epoch": 6.524349566377585, "ref_ce_loss": 0.11886480450630188, "step": 19560 }, { "epoch": 6.52768512341561, "loss": 0.5405, "step": 19570 }, { "epoch": 6.52768512341561, "grad_norm": 1.4749727249145508, "step": 19570 }, { "epoch": 6.52768512341561, "learning_rate": 0.00022739263237469732, "step": 19570 }, { "epoch": 6.52768512341561, "loss": 0.4140733778476715, "step": 19570 }, { "ce_loss": 0.12525179982185364, "epoch": 6.52768512341561, "step": 19570 }, { "distill_loss": 0.18022418022155762, "epoch": 6.52768512341561, "step": 19570 }, { "epoch": 6.52768512341561, "ref_ce_loss": 0.10837776958942413, "step": 19570 }, { "epoch": 6.52768512341561, "loss": 0.6587346792221069, "step": 19570 }, { "ce_loss": 0.11886288970708847, "epoch": 6.52768512341561, "step": 19570 }, { "distill_loss": 0.23400680720806122, "epoch": 6.52768512341561, "step": 19570 }, { "epoch": 6.52768512341561, "ref_ce_loss": 0.13027873635292053, "step": 19570 }, { "epoch": 6.5310206804536355, "loss": 0.6594, "step": 19580 }, { "epoch": 6.5310206804536355, "grad_norm": 2.201953887939453, "step": 19580 }, { "epoch": 6.5310206804536355, "learning_rate": 0.00022700290598298204, "step": 19580 }, { "epoch": 6.5310206804536355, "loss": 0.6995164752006531, "step": 19580 }, { "ce_loss": 0.1675536185503006, "epoch": 6.5310206804536355, "step": 19580 }, { "distill_loss": 0.347374826669693, "epoch": 6.5310206804536355, "step": 19580 }, { "epoch": 6.5310206804536355, "ref_ce_loss": 0.12447948008775711, "step": 19580 }, { "epoch": 6.5310206804536355, "loss": 0.8357738256454468, "step": 19580 }, { "ce_loss": 0.16174152493476868, "epoch": 6.5310206804536355, "step": 19580 }, { "distill_loss": 0.3044124245643616, "epoch": 6.5310206804536355, "step": 19580 }, { "epoch": 6.5310206804536355, "ref_ce_loss": 0.13059574365615845, "step": 19580 }, { "epoch": 6.534356237491661, "loss": 0.6612, "step": 19590 }, { "epoch": 6.534356237491661, "grad_norm": 1.8671090602874756, "step": 19590 }, { "epoch": 6.534356237491661, "learning_rate": 0.0002266133814974909, "step": 19590 }, { "epoch": 6.534356237491661, "loss": 0.5545954704284668, "step": 19590 }, { "ce_loss": 0.12362905591726303, "epoch": 6.534356237491661, "step": 19590 }, { "distill_loss": 0.2608444392681122, "epoch": 6.534356237491661, "step": 19590 }, { "epoch": 6.534356237491661, "ref_ce_loss": 0.1253921389579773, "step": 19590 }, { "epoch": 6.534356237491661, "loss": 0.532317042350769, "step": 19590 }, { "ce_loss": 0.12566885352134705, "epoch": 6.534356237491661, "step": 19590 }, { "distill_loss": 0.24860218167304993, "epoch": 6.534356237491661, "step": 19590 }, { "epoch": 6.534356237491661, "ref_ce_loss": 0.1408577263355255, "step": 19590 }, { "epoch": 6.537691794529686, "loss": 0.5622, "step": 19600 }, { "epoch": 6.537691794529686, "grad_norm": 1.6233079433441162, "step": 19600 }, { "epoch": 6.537691794529686, "learning_rate": 0.00022622405937284087, "step": 19600 }, { "epoch": 6.537691794529686, "loss": 0.6238809823989868, "step": 19600 }, { "ce_loss": 0.12666717171669006, "epoch": 6.537691794529686, "step": 19600 }, { "distill_loss": 0.23891451954841614, "epoch": 6.537691794529686, "step": 19600 }, { "epoch": 6.537691794529686, "ref_ce_loss": 0.12072195112705231, "step": 19600 }, { "epoch": 6.537691794529686, "loss": 0.9023510813713074, "step": 19600 }, { "ce_loss": 0.19780804216861725, "epoch": 6.537691794529686, "step": 19600 }, { "distill_loss": 0.34094032645225525, "epoch": 6.537691794529686, "step": 19600 }, { "epoch": 6.537691794529686, "ref_ce_loss": 0.14249184727668762, "step": 19600 }, { "epoch": 6.5410273515677115, "loss": 0.561, "step": 19610 }, { "epoch": 6.5410273515677115, "grad_norm": 2.4079267978668213, "step": 19610 }, { "epoch": 6.5410273515677115, "learning_rate": 0.00022583494006341243, "step": 19610 }, { "epoch": 6.5410273515677115, "loss": 0.4726938307285309, "step": 19610 }, { "ce_loss": 0.0903717577457428, "epoch": 6.5410273515677115, "step": 19610 }, { "distill_loss": 0.242726668715477, "epoch": 6.5410273515677115, "step": 19610 }, { "epoch": 6.5410273515677115, "ref_ce_loss": 0.1005355715751648, "step": 19610 }, { "epoch": 6.5410273515677115, "loss": 0.5008431673049927, "step": 19610 }, { "ce_loss": 0.09989278763532639, "epoch": 6.5410273515677115, "step": 19610 }, { "distill_loss": 0.23725369572639465, "epoch": 6.5410273515677115, "step": 19610 }, { "epoch": 6.5410273515677115, "ref_ce_loss": 0.12273890525102615, "step": 19610 }, { "epoch": 6.544362908605737, "loss": 0.6709, "step": 19620 }, { "epoch": 6.544362908605737, "grad_norm": 1.5335849523544312, "step": 19620 }, { "epoch": 6.544362908605737, "learning_rate": 0.0002254460240233499, "step": 19620 }, { "epoch": 6.544362908605737, "loss": 0.4559919834136963, "step": 19620 }, { "ce_loss": 0.11252197623252869, "epoch": 6.544362908605737, "step": 19620 }, { "distill_loss": 0.21082544326782227, "epoch": 6.544362908605737, "step": 19620 }, { "epoch": 6.544362908605737, "ref_ce_loss": 0.10504145175218582, "step": 19620 }, { "epoch": 6.544362908605737, "loss": 0.6788507103919983, "step": 19620 }, { "ce_loss": 0.18909797072410583, "epoch": 6.544362908605737, "step": 19620 }, { "distill_loss": 0.3105465769767761, "epoch": 6.544362908605737, "step": 19620 }, { "epoch": 6.544362908605737, "ref_ce_loss": 0.12650969624519348, "step": 19620 }, { "epoch": 6.547698465643762, "loss": 0.5758, "step": 19630 }, { "epoch": 6.547698465643762, "grad_norm": 1.2773675918579102, "step": 19630 }, { "epoch": 6.547698465643762, "learning_rate": 0.0002250573117065601, "step": 19630 }, { "epoch": 6.547698465643762, "loss": 1.0461361408233643, "step": 19630 }, { "ce_loss": 0.13800853490829468, "epoch": 6.547698465643762, "step": 19630 }, { "distill_loss": 0.2769632637500763, "epoch": 6.547698465643762, "step": 19630 }, { "epoch": 6.547698465643762, "ref_ce_loss": 0.12022116780281067, "step": 19630 }, { "epoch": 6.547698465643762, "loss": 0.8002582788467407, "step": 19630 }, { "ce_loss": 0.24044691026210785, "epoch": 6.547698465643762, "step": 19630 }, { "distill_loss": 0.3901602327823639, "epoch": 6.547698465643762, "step": 19630 }, { "epoch": 6.547698465643762, "ref_ce_loss": 0.1429591327905655, "step": 19630 }, { "epoch": 6.551034022681788, "loss": 0.6265, "step": 19640 }, { "epoch": 6.551034022681788, "grad_norm": 1.434065341949463, "step": 19640 }, { "epoch": 6.551034022681788, "learning_rate": 0.00022466880356671233, "step": 19640 }, { "epoch": 6.551034022681788, "loss": 0.8791804313659668, "step": 19640 }, { "ce_loss": 0.1697702407836914, "epoch": 6.551034022681788, "step": 19640 }, { "distill_loss": 0.2402646541595459, "epoch": 6.551034022681788, "step": 19640 }, { "epoch": 6.551034022681788, "ref_ce_loss": 0.14590327441692352, "step": 19640 }, { "epoch": 6.551034022681788, "loss": 0.5269797444343567, "step": 19640 }, { "ce_loss": 0.14256873726844788, "epoch": 6.551034022681788, "step": 19640 }, { "distill_loss": 0.22623835504055023, "epoch": 6.551034022681788, "step": 19640 }, { "epoch": 6.551034022681788, "ref_ce_loss": 0.12891146540641785, "step": 19640 }, { "epoch": 6.554369579719813, "loss": 0.644, "step": 19650 }, { "epoch": 6.554369579719813, "grad_norm": 1.3951469659805298, "step": 19650 }, { "epoch": 6.554369579719813, "learning_rate": 0.0002242805000572371, "step": 19650 }, { "epoch": 6.554369579719813, "loss": 0.6440078616142273, "step": 19650 }, { "ce_loss": 0.18327969312667847, "epoch": 6.554369579719813, "step": 19650 }, { "distill_loss": 0.27862748503685, "epoch": 6.554369579719813, "step": 19650 }, { "epoch": 6.554369579719813, "ref_ce_loss": 0.1434606909751892, "step": 19650 }, { "epoch": 6.554369579719813, "loss": 0.6104453206062317, "step": 19650 }, { "ce_loss": 0.15171493589878082, "epoch": 6.554369579719813, "step": 19650 }, { "distill_loss": 0.28208792209625244, "epoch": 6.554369579719813, "step": 19650 }, { "epoch": 6.554369579719813, "ref_ce_loss": 0.13384582102298737, "step": 19650 }, { "epoch": 6.557705136757838, "loss": 0.6289, "step": 19660 }, { "epoch": 6.557705136757838, "grad_norm": 2.730189561843872, "step": 19660 }, { "epoch": 6.557705136757838, "learning_rate": 0.00022389240163132645, "step": 19660 }, { "epoch": 6.557705136757838, "loss": 0.6251262426376343, "step": 19660 }, { "ce_loss": 0.1534355878829956, "epoch": 6.557705136757838, "step": 19660 }, { "distill_loss": 0.2709234952926636, "epoch": 6.557705136757838, "step": 19660 }, { "epoch": 6.557705136757838, "ref_ce_loss": 0.11832322925329208, "step": 19660 }, { "epoch": 6.557705136757838, "loss": 0.7124171853065491, "step": 19660 }, { "ce_loss": 0.1600910872220993, "epoch": 6.557705136757838, "step": 19660 }, { "distill_loss": 0.3154120147228241, "epoch": 6.557705136757838, "step": 19660 }, { "epoch": 6.557705136757838, "ref_ce_loss": 0.10829029977321625, "step": 19660 }, { "epoch": 6.561040693795864, "loss": 0.6438, "step": 19670 }, { "epoch": 6.561040693795864, "grad_norm": 3.5961387157440186, "step": 19670 }, { "epoch": 6.561040693795864, "learning_rate": 0.0002235045087419331, "step": 19670 }, { "epoch": 6.561040693795864, "loss": 0.9230295419692993, "step": 19670 }, { "ce_loss": 0.16147533059120178, "epoch": 6.561040693795864, "step": 19670 }, { "distill_loss": 0.30377304553985596, "epoch": 6.561040693795864, "step": 19670 }, { "epoch": 6.561040693795864, "ref_ce_loss": 0.12979774177074432, "step": 19670 }, { "epoch": 6.561040693795864, "loss": 0.5350285172462463, "step": 19670 }, { "ce_loss": 0.10126829892396927, "epoch": 6.561040693795864, "step": 19670 }, { "distill_loss": 0.22712314128875732, "epoch": 6.561040693795864, "step": 19670 }, { "epoch": 6.561040693795864, "ref_ce_loss": 0.10902617871761322, "step": 19670 }, { "epoch": 6.564376250833889, "loss": 0.6751, "step": 19680 }, { "epoch": 6.564376250833889, "grad_norm": 1.9512437582015991, "step": 19680 }, { "epoch": 6.564376250833889, "learning_rate": 0.00022311682184176986, "step": 19680 }, { "epoch": 6.564376250833889, "loss": 0.7731413841247559, "step": 19680 }, { "ce_loss": 0.1124635860323906, "epoch": 6.564376250833889, "step": 19680 }, { "distill_loss": 0.2667849659919739, "epoch": 6.564376250833889, "step": 19680 }, { "epoch": 6.564376250833889, "ref_ce_loss": 0.11324404180049896, "step": 19680 }, { "epoch": 6.564376250833889, "loss": 0.52984619140625, "step": 19680 }, { "ce_loss": 0.1323978304862976, "epoch": 6.564376250833889, "step": 19680 }, { "distill_loss": 0.23706260323524475, "epoch": 6.564376250833889, "step": 19680 }, { "epoch": 6.564376250833889, "ref_ce_loss": 0.12182512879371643, "step": 19680 }, { "epoch": 6.567711807871914, "loss": 0.5983, "step": 19690 }, { "epoch": 6.567711807871914, "grad_norm": 1.7688568830490112, "step": 19690 }, { "epoch": 6.567711807871914, "learning_rate": 0.00022272934138330865, "step": 19690 }, { "epoch": 6.567711807871914, "loss": 0.5021007657051086, "step": 19690 }, { "ce_loss": 0.15027810633182526, "epoch": 6.567711807871914, "step": 19690 }, { "distill_loss": 0.23919159173965454, "epoch": 6.567711807871914, "step": 19690 }, { "epoch": 6.567711807871914, "ref_ce_loss": 0.11207623034715652, "step": 19690 }, { "epoch": 6.567711807871914, "loss": 0.4850795269012451, "step": 19690 }, { "ce_loss": 0.1430709809064865, "epoch": 6.567711807871914, "step": 19690 }, { "distill_loss": 0.22025123238563538, "epoch": 6.567711807871914, "step": 19690 }, { "epoch": 6.567711807871914, "ref_ce_loss": 0.09491828083992004, "step": 19690 }, { "epoch": 6.57104736490994, "loss": 0.6223, "step": 19700 }, { "epoch": 6.57104736490994, "grad_norm": 1.7269474267959595, "step": 19700 }, { "epoch": 6.57104736490994, "learning_rate": 0.00022234206781878126, "step": 19700 }, { "epoch": 6.57104736490994, "loss": 0.4886223375797272, "step": 19700 }, { "ce_loss": 0.12366501241922379, "epoch": 6.57104736490994, "step": 19700 }, { "distill_loss": 0.20032623410224915, "epoch": 6.57104736490994, "step": 19700 }, { "epoch": 6.57104736490994, "ref_ce_loss": 0.12621523439884186, "step": 19700 }, { "epoch": 6.57104736490994, "loss": 0.5551965236663818, "step": 19700 }, { "ce_loss": 0.16444770991802216, "epoch": 6.57104736490994, "step": 19700 }, { "distill_loss": 0.25806915760040283, "epoch": 6.57104736490994, "step": 19700 }, { "epoch": 6.57104736490994, "ref_ce_loss": 0.1323830783367157, "step": 19700 }, { "epoch": 6.574382921947965, "loss": 0.6061, "step": 19710 }, { "epoch": 6.574382921947965, "grad_norm": 1.507322907447815, "step": 19710 }, { "epoch": 6.574382921947965, "learning_rate": 0.0002219550016001776, "step": 19710 }, { "epoch": 6.574382921947965, "loss": 0.5559046268463135, "step": 19710 }, { "ce_loss": 0.12786805629730225, "epoch": 6.574382921947965, "step": 19710 }, { "distill_loss": 0.27983155846595764, "epoch": 6.574382921947965, "step": 19710 }, { "epoch": 6.574382921947965, "ref_ce_loss": 0.11760944873094559, "step": 19710 }, { "epoch": 6.574382921947965, "loss": 0.4719151556491852, "step": 19710 }, { "ce_loss": 0.11807572841644287, "epoch": 6.574382921947965, "step": 19710 }, { "distill_loss": 0.21849408745765686, "epoch": 6.574382921947965, "step": 19710 }, { "epoch": 6.574382921947965, "ref_ce_loss": 0.10040108114480972, "step": 19710 }, { "epoch": 6.57771847898599, "loss": 0.6238, "step": 19720 }, { "epoch": 6.57771847898599, "grad_norm": 2.1423234939575195, "step": 19720 }, { "epoch": 6.57771847898599, "learning_rate": 0.00022156814317924562, "step": 19720 }, { "epoch": 6.57771847898599, "loss": 0.810075044631958, "step": 19720 }, { "ce_loss": 0.17612165212631226, "epoch": 6.57771847898599, "step": 19720 }, { "distill_loss": 0.32744908332824707, "epoch": 6.57771847898599, "step": 19720 }, { "epoch": 6.57771847898599, "ref_ce_loss": 0.19267144799232483, "step": 19720 }, { "epoch": 6.57771847898599, "loss": 0.6173214912414551, "step": 19720 }, { "ce_loss": 0.1226503923535347, "epoch": 6.57771847898599, "step": 19720 }, { "distill_loss": 0.25212758779525757, "epoch": 6.57771847898599, "step": 19720 }, { "epoch": 6.57771847898599, "ref_ce_loss": 0.13104918599128723, "step": 19720 }, { "epoch": 6.581054036024016, "loss": 0.6279, "step": 19730 }, { "epoch": 6.581054036024016, "grad_norm": 1.9452593326568604, "step": 19730 }, { "epoch": 6.581054036024016, "learning_rate": 0.00022118149300749047, "step": 19730 }, { "epoch": 6.581054036024016, "loss": 0.6032732725143433, "step": 19730 }, { "ce_loss": 0.14312177896499634, "epoch": 6.581054036024016, "step": 19730 }, { "distill_loss": 0.2730581760406494, "epoch": 6.581054036024016, "step": 19730 }, { "epoch": 6.581054036024016, "ref_ce_loss": 0.1491086333990097, "step": 19730 }, { "epoch": 6.581054036024016, "loss": 0.6786301136016846, "step": 19730 }, { "ce_loss": 0.13837352395057678, "epoch": 6.581054036024016, "step": 19730 }, { "distill_loss": 0.2833654582500458, "epoch": 6.581054036024016, "step": 19730 }, { "epoch": 6.581054036024016, "ref_ce_loss": 0.12843720614910126, "step": 19730 }, { "epoch": 6.584389593062041, "loss": 0.6008, "step": 19740 }, { "epoch": 6.584389593062041, "grad_norm": 1.8112263679504395, "step": 19740 }, { "epoch": 6.584389593062041, "learning_rate": 0.00022079505153617466, "step": 19740 }, { "epoch": 6.584389593062041, "loss": 0.6665618419647217, "step": 19740 }, { "ce_loss": 0.14172880351543427, "epoch": 6.584389593062041, "step": 19740 }, { "distill_loss": 0.2878859341144562, "epoch": 6.584389593062041, "step": 19740 }, { "epoch": 6.584389593062041, "ref_ce_loss": 0.15004181861877441, "step": 19740 }, { "epoch": 6.584389593062041, "loss": 0.4591222107410431, "step": 19740 }, { "ce_loss": 0.09805697947740555, "epoch": 6.584389593062041, "step": 19740 }, { "distill_loss": 0.23241685330867767, "epoch": 6.584389593062041, "step": 19740 }, { "epoch": 6.584389593062041, "ref_ce_loss": 0.0994115024805069, "step": 19740 }, { "epoch": 6.587725150100066, "loss": 0.5594, "step": 19750 }, { "epoch": 6.587725150100066, "grad_norm": 1.4407625198364258, "step": 19750 }, { "epoch": 6.587725150100066, "learning_rate": 0.00022040881921631692, "step": 19750 }, { "epoch": 6.587725150100066, "loss": 0.6755169034004211, "step": 19750 }, { "ce_loss": 0.18521958589553833, "epoch": 6.587725150100066, "step": 19750 }, { "distill_loss": 0.3709476590156555, "epoch": 6.587725150100066, "step": 19750 }, { "epoch": 6.587725150100066, "ref_ce_loss": 0.1117776557803154, "step": 19750 }, { "epoch": 6.587725150100066, "loss": 0.7607985734939575, "step": 19750 }, { "ce_loss": 0.12100774049758911, "epoch": 6.587725150100066, "step": 19750 }, { "distill_loss": 0.32476189732551575, "epoch": 6.587725150100066, "step": 19750 }, { "epoch": 6.587725150100066, "ref_ce_loss": 0.09815852344036102, "step": 19750 }, { "epoch": 6.591060707138092, "loss": 0.5864, "step": 19760 }, { "epoch": 6.591060707138092, "grad_norm": 1.4858148097991943, "step": 19760 }, { "epoch": 6.591060707138092, "learning_rate": 0.00022002279649869214, "step": 19760 }, { "epoch": 6.591060707138092, "loss": 0.6402103900909424, "step": 19760 }, { "ce_loss": 0.10261153429746628, "epoch": 6.591060707138092, "step": 19760 }, { "distill_loss": 0.18682360649108887, "epoch": 6.591060707138092, "step": 19760 }, { "epoch": 6.591060707138092, "ref_ce_loss": 0.08958607912063599, "step": 19760 }, { "epoch": 6.591060707138092, "loss": 0.5752401947975159, "step": 19760 }, { "ce_loss": 0.13084650039672852, "epoch": 6.591060707138092, "step": 19760 }, { "distill_loss": 0.27911341190338135, "epoch": 6.591060707138092, "step": 19760 }, { "epoch": 6.591060707138092, "ref_ce_loss": 0.09355991333723068, "step": 19760 }, { "epoch": 6.594396264176117, "loss": 0.6532, "step": 19770 }, { "epoch": 6.594396264176117, "grad_norm": 1.2409772872924805, "step": 19770 }, { "epoch": 6.594396264176117, "learning_rate": 0.00021963698383383005, "step": 19770 }, { "epoch": 6.594396264176117, "loss": 0.5927982330322266, "step": 19770 }, { "ce_loss": 0.1433178186416626, "epoch": 6.594396264176117, "step": 19770 }, { "distill_loss": 0.302937388420105, "epoch": 6.594396264176117, "step": 19770 }, { "epoch": 6.594396264176117, "ref_ce_loss": 0.14607828855514526, "step": 19770 }, { "epoch": 6.594396264176117, "loss": 0.5983748435974121, "step": 19770 }, { "ce_loss": 0.16430151462554932, "epoch": 6.594396264176117, "step": 19770 }, { "distill_loss": 0.23618659377098083, "epoch": 6.594396264176117, "step": 19770 }, { "epoch": 6.594396264176117, "ref_ce_loss": 0.1571560800075531, "step": 19770 }, { "epoch": 6.5977318212141425, "loss": 0.6228, "step": 19780 }, { "epoch": 6.5977318212141425, "grad_norm": 1.5878171920776367, "step": 19780 }, { "epoch": 6.5977318212141425, "learning_rate": 0.00021925138167201564, "step": 19780 }, { "epoch": 6.5977318212141425, "loss": 0.701454222202301, "step": 19780 }, { "ce_loss": 0.1458064168691635, "epoch": 6.5977318212141425, "step": 19780 }, { "distill_loss": 0.2965890169143677, "epoch": 6.5977318212141425, "step": 19780 }, { "epoch": 6.5977318212141425, "ref_ce_loss": 0.12389638274908066, "step": 19780 }, { "epoch": 6.5977318212141425, "loss": 0.636038601398468, "step": 19780 }, { "ce_loss": 0.17314723134040833, "epoch": 6.5977318212141425, "step": 19780 }, { "distill_loss": 0.31819021701812744, "epoch": 6.5977318212141425, "step": 19780 }, { "epoch": 6.5977318212141425, "ref_ce_loss": 0.14442434906959534, "step": 19780 }, { "epoch": 6.601067378252168, "loss": 0.6227, "step": 19790 }, { "epoch": 6.601067378252168, "grad_norm": 2.4692835807800293, "step": 19790 }, { "epoch": 6.601067378252168, "learning_rate": 0.00021886599046328824, "step": 19790 }, { "epoch": 6.601067378252168, "loss": 0.656324565410614, "step": 19790 }, { "ce_loss": 0.15108564496040344, "epoch": 6.601067378252168, "step": 19790 }, { "distill_loss": 0.28433552384376526, "epoch": 6.601067378252168, "step": 19790 }, { "epoch": 6.601067378252168, "ref_ce_loss": 0.11606331914663315, "step": 19790 }, { "epoch": 6.601067378252168, "loss": 0.6400468349456787, "step": 19790 }, { "ce_loss": 0.1558593362569809, "epoch": 6.601067378252168, "step": 19790 }, { "distill_loss": 0.34761950373649597, "epoch": 6.601067378252168, "step": 19790 }, { "epoch": 6.601067378252168, "ref_ce_loss": 0.1162491887807846, "step": 19790 }, { "epoch": 6.604402935290193, "loss": 0.6978, "step": 19800 }, { "epoch": 6.604402935290193, "grad_norm": 3.7015187740325928, "step": 19800 }, { "epoch": 6.604402935290193, "learning_rate": 0.00021848081065744076, "step": 19800 }, { "epoch": 6.604402935290193, "loss": 0.9426319599151611, "step": 19800 }, { "ce_loss": 0.08227524161338806, "epoch": 6.604402935290193, "step": 19800 }, { "distill_loss": 0.24910229444503784, "epoch": 6.604402935290193, "step": 19800 }, { "epoch": 6.604402935290193, "ref_ce_loss": 0.10845942050218582, "step": 19800 }, { "epoch": 6.604402935290193, "loss": 0.5317683219909668, "step": 19800 }, { "ce_loss": 0.11848925799131393, "epoch": 6.604402935290193, "step": 19800 }, { "distill_loss": 0.2813676595687866, "epoch": 6.604402935290193, "step": 19800 }, { "epoch": 6.604402935290193, "ref_ce_loss": 0.1314525604248047, "step": 19800 }, { "epoch": 6.6077384923282185, "loss": 0.6737, "step": 19810 }, { "epoch": 6.6077384923282185, "grad_norm": 3.4987027645111084, "step": 19810 }, { "epoch": 6.6077384923282185, "learning_rate": 0.0002180958427040195, "step": 19810 }, { "epoch": 6.6077384923282185, "loss": 0.5265316367149353, "step": 19810 }, { "ce_loss": 0.11821822822093964, "epoch": 6.6077384923282185, "step": 19810 }, { "distill_loss": 0.28454506397247314, "epoch": 6.6077384923282185, "step": 19810 }, { "epoch": 6.6077384923282185, "ref_ce_loss": 0.1234145313501358, "step": 19810 }, { "epoch": 6.6077384923282185, "loss": 0.7258698344230652, "step": 19810 }, { "ce_loss": 0.19357775151729584, "epoch": 6.6077384923282185, "step": 19810 }, { "distill_loss": 0.33576950430870056, "epoch": 6.6077384923282185, "step": 19810 }, { "epoch": 6.6077384923282185, "ref_ce_loss": 0.1354198008775711, "step": 19810 }, { "epoch": 6.611074049366244, "loss": 0.6058, "step": 19820 }, { "epoch": 6.611074049366244, "grad_norm": 4.5951972007751465, "step": 19820 }, { "epoch": 6.611074049366244, "learning_rate": 0.00021771108705232356, "step": 19820 }, { "epoch": 6.611074049366244, "loss": 0.45154914259910583, "step": 19820 }, { "ce_loss": 0.13545911014080048, "epoch": 6.611074049366244, "step": 19820 }, { "distill_loss": 0.22132493555545807, "epoch": 6.611074049366244, "step": 19820 }, { "epoch": 6.611074049366244, "ref_ce_loss": 0.09376255422830582, "step": 19820 }, { "epoch": 6.611074049366244, "loss": 0.5493002533912659, "step": 19820 }, { "ce_loss": 0.12436985969543457, "epoch": 6.611074049366244, "step": 19820 }, { "distill_loss": 0.30755358934402466, "epoch": 6.611074049366244, "step": 19820 }, { "epoch": 6.611074049366244, "ref_ce_loss": 0.11703960597515106, "step": 19820 }, { "epoch": 6.614409606404269, "loss": 0.6026, "step": 19830 }, { "epoch": 6.614409606404269, "grad_norm": 1.9668054580688477, "step": 19830 }, { "epoch": 6.614409606404269, "learning_rate": 0.00021732654415140425, "step": 19830 }, { "epoch": 6.614409606404269, "loss": 0.5193825364112854, "step": 19830 }, { "ce_loss": 0.13210050761699677, "epoch": 6.614409606404269, "step": 19830 }, { "distill_loss": 0.29486167430877686, "epoch": 6.614409606404269, "step": 19830 }, { "epoch": 6.614409606404269, "ref_ce_loss": 0.09222698956727982, "step": 19830 }, { "epoch": 6.614409606404269, "loss": 0.5412237644195557, "step": 19830 }, { "ce_loss": 0.11810997873544693, "epoch": 6.614409606404269, "step": 19830 }, { "distill_loss": 0.27205947041511536, "epoch": 6.614409606404269, "step": 19830 }, { "epoch": 6.614409606404269, "ref_ce_loss": 0.12044981867074966, "step": 19830 }, { "epoch": 6.617745163442295, "loss": 0.5936, "step": 19840 }, { "epoch": 6.617745163442295, "grad_norm": 1.418731689453125, "step": 19840 }, { "epoch": 6.617745163442295, "learning_rate": 0.00021694221445006426, "step": 19840 }, { "epoch": 6.617745163442295, "loss": 0.44266384840011597, "step": 19840 }, { "ce_loss": 0.11144088953733444, "epoch": 6.617745163442295, "step": 19840 }, { "distill_loss": 0.20961447060108185, "epoch": 6.617745163442295, "step": 19840 }, { "epoch": 6.617745163442295, "ref_ce_loss": 0.0956166684627533, "step": 19840 }, { "epoch": 6.617745163442295, "loss": 1.132645845413208, "step": 19840 }, { "ce_loss": 0.15278828144073486, "epoch": 6.617745163442295, "step": 19840 }, { "distill_loss": 0.2940169870853424, "epoch": 6.617745163442295, "step": 19840 }, { "epoch": 6.617745163442295, "ref_ce_loss": 0.1412506103515625, "step": 19840 }, { "epoch": 6.62108072048032, "loss": 0.5931, "step": 19850 }, { "epoch": 6.62108072048032, "grad_norm": 2.844691276550293, "step": 19850 }, { "epoch": 6.62108072048032, "learning_rate": 0.00021655809839685782, "step": 19850 }, { "epoch": 6.62108072048032, "loss": 0.6301782131195068, "step": 19850 }, { "ce_loss": 0.15369366109371185, "epoch": 6.62108072048032, "step": 19850 }, { "distill_loss": 0.25507915019989014, "epoch": 6.62108072048032, "step": 19850 }, { "epoch": 6.62108072048032, "ref_ce_loss": 0.10438395291566849, "step": 19850 }, { "epoch": 6.62108072048032, "loss": 0.7400352954864502, "step": 19850 }, { "ce_loss": 0.16687007248401642, "epoch": 6.62108072048032, "step": 19850 }, { "distill_loss": 0.2876301407814026, "epoch": 6.62108072048032, "step": 19850 }, { "epoch": 6.62108072048032, "ref_ce_loss": 0.11608906835317612, "step": 19850 }, { "epoch": 6.624416277518345, "loss": 0.5765, "step": 19860 }, { "epoch": 6.624416277518345, "grad_norm": 1.8998688459396362, "step": 19860 }, { "epoch": 6.624416277518345, "learning_rate": 0.00021617419644008972, "step": 19860 }, { "epoch": 6.624416277518345, "loss": 0.6107144951820374, "step": 19860 }, { "ce_loss": 0.11949118226766586, "epoch": 6.624416277518345, "step": 19860 }, { "distill_loss": 0.22605156898498535, "epoch": 6.624416277518345, "step": 19860 }, { "epoch": 6.624416277518345, "ref_ce_loss": 0.11626297235488892, "step": 19860 }, { "epoch": 6.624416277518345, "loss": 0.5601349472999573, "step": 19860 }, { "ce_loss": 0.1702495664358139, "epoch": 6.624416277518345, "step": 19860 }, { "distill_loss": 0.2407875508069992, "epoch": 6.624416277518345, "step": 19860 }, { "epoch": 6.624416277518345, "ref_ce_loss": 0.11027728021144867, "step": 19860 }, { "epoch": 6.627751834556371, "loss": 0.6048, "step": 19870 }, { "epoch": 6.627751834556371, "grad_norm": 1.0039499998092651, "step": 19870 }, { "epoch": 6.627751834556371, "learning_rate": 0.00021579050902781498, "step": 19870 }, { "epoch": 6.627751834556371, "loss": 0.5014527440071106, "step": 19870 }, { "ce_loss": 0.12715910375118256, "epoch": 6.627751834556371, "step": 19870 }, { "distill_loss": 0.23520439863204956, "epoch": 6.627751834556371, "step": 19870 }, { "epoch": 6.627751834556371, "ref_ce_loss": 0.11276977509260178, "step": 19870 }, { "epoch": 6.627751834556371, "loss": 0.5735328793525696, "step": 19870 }, { "ce_loss": 0.14892864227294922, "epoch": 6.627751834556371, "step": 19870 }, { "distill_loss": 0.24366481602191925, "epoch": 6.627751834556371, "step": 19870 }, { "epoch": 6.627751834556371, "ref_ce_loss": 0.11686741560697556, "step": 19870 }, { "epoch": 6.631087391594396, "loss": 0.6034, "step": 19880 }, { "epoch": 6.631087391594396, "grad_norm": 2.24519419670105, "step": 19880 }, { "epoch": 6.631087391594396, "learning_rate": 0.00021540703660783783, "step": 19880 }, { "epoch": 6.631087391594396, "loss": 0.7075437307357788, "step": 19880 }, { "ce_loss": 0.12260802090167999, "epoch": 6.631087391594396, "step": 19880 }, { "distill_loss": 0.2838866710662842, "epoch": 6.631087391594396, "step": 19880 }, { "epoch": 6.631087391594396, "ref_ce_loss": 0.11997436732053757, "step": 19880 }, { "epoch": 6.631087391594396, "loss": 0.6045704483985901, "step": 19880 }, { "ce_loss": 0.16746434569358826, "epoch": 6.631087391594396, "step": 19880 }, { "distill_loss": 0.28688085079193115, "epoch": 6.631087391594396, "step": 19880 }, { "epoch": 6.631087391594396, "ref_ce_loss": 0.12655295431613922, "step": 19880 }, { "epoch": 6.634422948632421, "loss": 0.6433, "step": 19890 }, { "epoch": 6.634422948632421, "grad_norm": 1.102996826171875, "step": 19890 }, { "epoch": 6.634422948632421, "learning_rate": 0.00021502377962771198, "step": 19890 }, { "epoch": 6.634422948632421, "loss": 0.6029108762741089, "step": 19890 }, { "ce_loss": 0.1609131395816803, "epoch": 6.634422948632421, "step": 19890 }, { "distill_loss": 0.2370425909757614, "epoch": 6.634422948632421, "step": 19890 }, { "epoch": 6.634422948632421, "ref_ce_loss": 0.1219489723443985, "step": 19890 }, { "epoch": 6.634422948632421, "loss": 0.6594313979148865, "step": 19890 }, { "ce_loss": 0.19880236685276031, "epoch": 6.634422948632421, "step": 19890 }, { "distill_loss": 0.30750787258148193, "epoch": 6.634422948632421, "step": 19890 }, { "epoch": 6.634422948632421, "ref_ce_loss": 0.12001223862171173, "step": 19890 }, { "epoch": 6.637758505670447, "loss": 0.6094, "step": 19900 }, { "epoch": 6.637758505670447, "grad_norm": 0.9801173210144043, "step": 19900 }, { "epoch": 6.637758505670447, "learning_rate": 0.0002146407385347396, "step": 19900 }, { "epoch": 6.637758505670447, "loss": 0.5906522870063782, "step": 19900 }, { "ce_loss": 0.11420835554599762, "epoch": 6.637758505670447, "step": 19900 }, { "distill_loss": 0.2585413157939911, "epoch": 6.637758505670447, "step": 19900 }, { "epoch": 6.637758505670447, "ref_ce_loss": 0.15924397110939026, "step": 19900 }, { "epoch": 6.637758505670447, "loss": 0.5947010517120361, "step": 19900 }, { "ce_loss": 0.1298929899930954, "epoch": 6.637758505670447, "step": 19900 }, { "distill_loss": 0.2795071601867676, "epoch": 6.637758505670447, "step": 19900 }, { "epoch": 6.637758505670447, "ref_ce_loss": 0.14532022178173065, "step": 19900 }, { "epoch": 6.641094062708472, "loss": 0.624, "step": 19910 }, { "epoch": 6.641094062708472, "grad_norm": 2.43601131439209, "step": 19910 }, { "epoch": 6.641094062708472, "learning_rate": 0.00021425791377597072, "step": 19910 }, { "epoch": 6.641094062708472, "loss": 0.5134724974632263, "step": 19910 }, { "ce_loss": 0.14351071417331696, "epoch": 6.641094062708472, "step": 19910 }, { "distill_loss": 0.22447390854358673, "epoch": 6.641094062708472, "step": 19910 }, { "epoch": 6.641094062708472, "ref_ce_loss": 0.14510710537433624, "step": 19910 }, { "epoch": 6.641094062708472, "loss": 0.5465179085731506, "step": 19910 }, { "ce_loss": 0.12087228894233704, "epoch": 6.641094062708472, "step": 19910 }, { "distill_loss": 0.19813315570354462, "epoch": 6.641094062708472, "step": 19910 }, { "epoch": 6.641094062708472, "ref_ce_loss": 0.1294873207807541, "step": 19910 }, { "epoch": 6.644429619746497, "loss": 0.6308, "step": 19920 }, { "epoch": 6.644429619746497, "grad_norm": 2.103402614593506, "step": 19920 }, { "epoch": 6.644429619746497, "learning_rate": 0.0002138753057982033, "step": 19920 }, { "epoch": 6.644429619746497, "loss": 0.5028867721557617, "step": 19920 }, { "ce_loss": 0.11792866885662079, "epoch": 6.644429619746497, "step": 19920 }, { "distill_loss": 0.23079612851142883, "epoch": 6.644429619746497, "step": 19920 }, { "epoch": 6.644429619746497, "ref_ce_loss": 0.1540123075246811, "step": 19920 }, { "epoch": 6.644429619746497, "loss": 1.1755242347717285, "step": 19920 }, { "ce_loss": 0.16438809037208557, "epoch": 6.644429619746497, "step": 19920 }, { "distill_loss": 0.2752496898174286, "epoch": 6.644429619746497, "step": 19920 }, { "epoch": 6.644429619746497, "ref_ce_loss": 0.10607973486185074, "step": 19920 }, { "epoch": 6.647765176784523, "loss": 0.5982, "step": 19930 }, { "epoch": 6.647765176784523, "grad_norm": 1.9858911037445068, "step": 19930 }, { "epoch": 6.647765176784523, "learning_rate": 0.00021349291504798177, "step": 19930 }, { "epoch": 6.647765176784523, "loss": 0.5443336963653564, "step": 19930 }, { "ce_loss": 0.15421469509601593, "epoch": 6.647765176784523, "step": 19930 }, { "distill_loss": 0.2539355754852295, "epoch": 6.647765176784523, "step": 19930 }, { "epoch": 6.647765176784523, "ref_ce_loss": 0.09973197430372238, "step": 19930 }, { "epoch": 6.647765176784523, "loss": 0.6012151837348938, "step": 19930 }, { "ce_loss": 0.14680509269237518, "epoch": 6.647765176784523, "step": 19930 }, { "distill_loss": 0.29390209913253784, "epoch": 6.647765176784523, "step": 19930 }, { "epoch": 6.647765176784523, "ref_ce_loss": 0.11073038727045059, "step": 19930 }, { "epoch": 6.651100733822548, "loss": 0.6459, "step": 19940 }, { "epoch": 6.651100733822548, "grad_norm": 1.4681084156036377, "step": 19940 }, { "epoch": 6.651100733822548, "learning_rate": 0.00021311074197159736, "step": 19940 }, { "epoch": 6.651100733822548, "loss": 0.9424260854721069, "step": 19940 }, { "ce_loss": 0.13379743695259094, "epoch": 6.651100733822548, "step": 19940 }, { "distill_loss": 0.2306695282459259, "epoch": 6.651100733822548, "step": 19940 }, { "epoch": 6.651100733822548, "ref_ce_loss": 0.11070249229669571, "step": 19940 }, { "epoch": 6.651100733822548, "loss": 0.47121453285217285, "step": 19940 }, { "ce_loss": 0.0640735775232315, "epoch": 6.651100733822548, "step": 19940 }, { "distill_loss": 0.21163296699523926, "epoch": 6.651100733822548, "step": 19940 }, { "epoch": 6.651100733822548, "ref_ce_loss": 0.10320694744586945, "step": 19940 }, { "epoch": 6.654436290860573, "loss": 0.6308, "step": 19950 }, { "epoch": 6.654436290860573, "grad_norm": 2.588499069213867, "step": 19950 }, { "epoch": 6.654436290860573, "learning_rate": 0.00021272878701508735, "step": 19950 }, { "epoch": 6.654436290860573, "loss": 0.6308241486549377, "step": 19950 }, { "ce_loss": 0.19566896557807922, "epoch": 6.654436290860573, "step": 19950 }, { "distill_loss": 0.2965598702430725, "epoch": 6.654436290860573, "step": 19950 }, { "epoch": 6.654436290860573, "ref_ce_loss": 0.11354543268680573, "step": 19950 }, { "epoch": 6.654436290860573, "loss": 0.42993810772895813, "step": 19950 }, { "ce_loss": 0.08403245359659195, "epoch": 6.654436290860573, "step": 19950 }, { "distill_loss": 0.21457430720329285, "epoch": 6.654436290860573, "step": 19950 }, { "epoch": 6.654436290860573, "ref_ce_loss": 0.08863157033920288, "step": 19950 }, { "epoch": 6.657771847898599, "loss": 0.569, "step": 19960 }, { "epoch": 6.657771847898599, "grad_norm": 1.2746546268463135, "step": 19960 }, { "epoch": 6.657771847898599, "learning_rate": 0.000212347050624234, "step": 19960 }, { "epoch": 6.657771847898599, "loss": 0.5694778561592102, "step": 19960 }, { "ce_loss": 0.1148284450173378, "epoch": 6.657771847898599, "step": 19960 }, { "distill_loss": 0.30706483125686646, "epoch": 6.657771847898599, "step": 19960 }, { "epoch": 6.657771847898599, "ref_ce_loss": 0.10699665546417236, "step": 19960 }, { "epoch": 6.657771847898599, "loss": 0.5459879636764526, "step": 19960 }, { "ce_loss": 0.11935421824455261, "epoch": 6.657771847898599, "step": 19960 }, { "distill_loss": 0.22676879167556763, "epoch": 6.657771847898599, "step": 19960 }, { "epoch": 6.657771847898599, "ref_ce_loss": 0.0830860361456871, "step": 19960 }, { "epoch": 6.661107404936624, "loss": 0.6038, "step": 19970 }, { "epoch": 6.661107404936624, "grad_norm": 2.4813232421875, "step": 19970 }, { "epoch": 6.661107404936624, "learning_rate": 0.00021196553324456482, "step": 19970 }, { "epoch": 6.661107404936624, "loss": 0.7138746976852417, "step": 19970 }, { "ce_loss": 0.214167058467865, "epoch": 6.661107404936624, "step": 19970 }, { "distill_loss": 0.3329528570175171, "epoch": 6.661107404936624, "step": 19970 }, { "epoch": 6.661107404936624, "ref_ce_loss": 0.1282276213169098, "step": 19970 }, { "epoch": 6.661107404936624, "loss": 0.4795456528663635, "step": 19970 }, { "ce_loss": 0.13675269484519958, "epoch": 6.661107404936624, "step": 19970 }, { "distill_loss": 0.22916845977306366, "epoch": 6.661107404936624, "step": 19970 }, { "epoch": 6.661107404936624, "ref_ce_loss": 0.11339738219976425, "step": 19970 }, { "epoch": 6.6644429619746495, "loss": 0.6919, "step": 19980 }, { "epoch": 6.6644429619746495, "grad_norm": 1.3644931316375732, "step": 19980 }, { "epoch": 6.6644429619746495, "learning_rate": 0.0002115842353213517, "step": 19980 }, { "epoch": 6.6644429619746495, "loss": 0.5393396615982056, "step": 19980 }, { "ce_loss": 0.10573761910200119, "epoch": 6.6644429619746495, "step": 19980 }, { "distill_loss": 0.2217199206352234, "epoch": 6.6644429619746495, "step": 19980 }, { "epoch": 6.6644429619746495, "ref_ce_loss": 0.09572423249483109, "step": 19980 }, { "epoch": 6.6644429619746495, "loss": 0.4343937039375305, "step": 19980 }, { "ce_loss": 0.0948638767004013, "epoch": 6.6644429619746495, "step": 19980 }, { "distill_loss": 0.2013542652130127, "epoch": 6.6644429619746495, "step": 19980 }, { "epoch": 6.6644429619746495, "ref_ce_loss": 0.10309243202209473, "step": 19980 }, { "epoch": 6.667778519012675, "loss": 0.6172, "step": 19990 }, { "epoch": 6.667778519012675, "grad_norm": 1.7576813697814941, "step": 19990 }, { "epoch": 6.667778519012675, "learning_rate": 0.0002112031572996105, "step": 19990 }, { "epoch": 6.667778519012675, "loss": 0.7140644788742065, "step": 19990 }, { "ce_loss": 0.2293837070465088, "epoch": 6.667778519012675, "step": 19990 }, { "distill_loss": 0.3485688865184784, "epoch": 6.667778519012675, "step": 19990 }, { "epoch": 6.667778519012675, "ref_ce_loss": 0.1356680691242218, "step": 19990 }, { "epoch": 6.667778519012675, "loss": 0.6210101246833801, "step": 19990 }, { "ce_loss": 0.16044512391090393, "epoch": 6.667778519012675, "step": 19990 }, { "distill_loss": 0.2541531026363373, "epoch": 6.667778519012675, "step": 19990 }, { "epoch": 6.667778519012675, "ref_ce_loss": 0.13174085319042206, "step": 19990 }, { "epoch": 6.6711140760507, "loss": 0.6575, "step": 20000 }, { "epoch": 6.6711140760507, "grad_norm": 2.1928982734680176, "step": 20000 }, { "epoch": 6.6711140760507, "learning_rate": 0.00021082229962409997, "step": 20000 }, { "epoch": 6.6711140760507, "loss": 0.5633940696716309, "step": 20000 }, { "ce_loss": 0.19303011894226074, "epoch": 6.6711140760507, "step": 20000 }, { "distill_loss": 0.2460951954126358, "epoch": 6.6711140760507, "step": 20000 }, { "epoch": 6.6711140760507, "ref_ce_loss": 0.12412123382091522, "step": 20000 }, { "epoch": 6.6711140760507, "loss": 0.39880824089050293, "step": 20000 }, { "ce_loss": 0.08768723905086517, "epoch": 6.6711140760507, "step": 20000 }, { "distill_loss": 0.1907358467578888, "epoch": 6.6711140760507, "step": 20000 }, { "epoch": 6.6711140760507, "ref_ce_loss": 0.12010857462882996, "step": 20000 }, { "epoch": 6.6744496330887255, "loss": 0.5422, "step": 20010 }, { "epoch": 6.6744496330887255, "grad_norm": 1.323743224143982, "step": 20010 }, { "epoch": 6.6744496330887255, "learning_rate": 0.00021044166273932212, "step": 20010 }, { "epoch": 6.6744496330887255, "loss": 0.6675091981887817, "step": 20010 }, { "ce_loss": 0.1589205414056778, "epoch": 6.6744496330887255, "step": 20010 }, { "distill_loss": 0.24982638657093048, "epoch": 6.6744496330887255, "step": 20010 }, { "epoch": 6.6744496330887255, "ref_ce_loss": 0.158026322722435, "step": 20010 }, { "epoch": 6.6744496330887255, "loss": 0.42034339904785156, "step": 20010 }, { "ce_loss": 0.09510764479637146, "epoch": 6.6744496330887255, "step": 20010 }, { "distill_loss": 0.2156810611486435, "epoch": 6.6744496330887255, "step": 20010 }, { "epoch": 6.6744496330887255, "ref_ce_loss": 0.10906819999217987, "step": 20010 }, { "epoch": 6.677785190126751, "loss": 0.5977, "step": 20020 }, { "epoch": 6.677785190126751, "grad_norm": 1.453233003616333, "step": 20020 }, { "epoch": 6.677785190126751, "learning_rate": 0.00021006124708952117, "step": 20020 }, { "epoch": 6.677785190126751, "loss": 0.5579141974449158, "step": 20020 }, { "ce_loss": 0.15179507434368134, "epoch": 6.677785190126751, "step": 20020 }, { "distill_loss": 0.28201889991760254, "epoch": 6.677785190126751, "step": 20020 }, { "epoch": 6.677785190126751, "ref_ce_loss": 0.12390779703855515, "step": 20020 }, { "epoch": 6.677785190126751, "loss": 0.5579657554626465, "step": 20020 }, { "ce_loss": 0.10925626009702682, "epoch": 6.677785190126751, "step": 20020 }, { "distill_loss": 0.33345967531204224, "epoch": 6.677785190126751, "step": 20020 }, { "epoch": 6.677785190126751, "ref_ce_loss": 0.09767402708530426, "step": 20020 }, { "epoch": 6.681120747164776, "loss": 0.5548, "step": 20030 }, { "epoch": 6.681120747164776, "grad_norm": 1.407572865486145, "step": 20030 }, { "epoch": 6.681120747164776, "learning_rate": 0.00020968105311868312, "step": 20030 }, { "epoch": 6.681120747164776, "loss": 0.7544924020767212, "step": 20030 }, { "ce_loss": 0.15766873955726624, "epoch": 6.681120747164776, "step": 20030 }, { "distill_loss": 0.4101894497871399, "epoch": 6.681120747164776, "step": 20030 }, { "epoch": 6.681120747164776, "ref_ce_loss": 0.14876404404640198, "step": 20030 }, { "epoch": 6.681120747164776, "loss": 0.6088945865631104, "step": 20030 }, { "ce_loss": 0.09153804928064346, "epoch": 6.681120747164776, "step": 20030 }, { "distill_loss": 0.22568279504776, "epoch": 6.681120747164776, "step": 20030 }, { "epoch": 6.681120747164776, "ref_ce_loss": 0.10213732719421387, "step": 20030 }, { "epoch": 6.684456304202802, "loss": 0.5674, "step": 20040 }, { "epoch": 6.684456304202802, "grad_norm": 1.58797025680542, "step": 20040 }, { "epoch": 6.684456304202802, "learning_rate": 0.00020930108127053526, "step": 20040 }, { "epoch": 6.684456304202802, "loss": 0.4611407518386841, "step": 20040 }, { "ce_loss": 0.11111941933631897, "epoch": 6.684456304202802, "step": 20040 }, { "distill_loss": 0.2381780445575714, "epoch": 6.684456304202802, "step": 20040 }, { "epoch": 6.684456304202802, "ref_ce_loss": 0.08858481049537659, "step": 20040 }, { "epoch": 6.684456304202802, "loss": 0.5269935131072998, "step": 20040 }, { "ce_loss": 0.12398573756217957, "epoch": 6.684456304202802, "step": 20040 }, { "distill_loss": 0.20361027121543884, "epoch": 6.684456304202802, "step": 20040 }, { "epoch": 6.684456304202802, "ref_ce_loss": 0.10248218476772308, "step": 20040 }, { "epoch": 6.687791861240827, "loss": 0.5685, "step": 20050 }, { "epoch": 6.687791861240827, "grad_norm": 2.0134479999542236, "step": 20050 }, { "epoch": 6.687791861240827, "learning_rate": 0.0002089213319885456, "step": 20050 }, { "epoch": 6.687791861240827, "loss": 0.41977277398109436, "step": 20050 }, { "ce_loss": 0.07606784999370575, "epoch": 6.687791861240827, "step": 20050 }, { "distill_loss": 0.1722736656665802, "epoch": 6.687791861240827, "step": 20050 }, { "epoch": 6.687791861240827, "ref_ce_loss": 0.08966904133558273, "step": 20050 }, { "epoch": 6.687791861240827, "loss": 0.8288323879241943, "step": 20050 }, { "ce_loss": 0.20224134624004364, "epoch": 6.687791861240827, "step": 20050 }, { "distill_loss": 0.2906288802623749, "epoch": 6.687791861240827, "step": 20050 }, { "epoch": 6.687791861240827, "ref_ce_loss": 0.1143127977848053, "step": 20050 }, { "epoch": 6.691127418278852, "loss": 0.5795, "step": 20060 }, { "epoch": 6.691127418278852, "grad_norm": 2.094135046005249, "step": 20060 }, { "epoch": 6.691127418278852, "learning_rate": 0.00020854180571592244, "step": 20060 }, { "epoch": 6.691127418278852, "loss": 0.6016823053359985, "step": 20060 }, { "ce_loss": 0.10965229570865631, "epoch": 6.691127418278852, "step": 20060 }, { "distill_loss": 0.26941967010498047, "epoch": 6.691127418278852, "step": 20060 }, { "epoch": 6.691127418278852, "ref_ce_loss": 0.11607593297958374, "step": 20060 }, { "epoch": 6.691127418278852, "loss": 0.5134243369102478, "step": 20060 }, { "ce_loss": 0.12020660936832428, "epoch": 6.691127418278852, "step": 20060 }, { "distill_loss": 0.22750511765480042, "epoch": 6.691127418278852, "step": 20060 }, { "epoch": 6.691127418278852, "ref_ce_loss": 0.1258605271577835, "step": 20060 }, { "epoch": 6.694462975316878, "loss": 0.6106, "step": 20070 }, { "epoch": 6.694462975316878, "grad_norm": 2.0959811210632324, "step": 20070 }, { "epoch": 6.694462975316878, "learning_rate": 0.00020816250289561387, "step": 20070 }, { "epoch": 6.694462975316878, "loss": 0.4728214144706726, "step": 20070 }, { "ce_loss": 0.1056213229894638, "epoch": 6.694462975316878, "step": 20070 }, { "distill_loss": 0.2268831729888916, "epoch": 6.694462975316878, "step": 20070 }, { "epoch": 6.694462975316878, "ref_ce_loss": 0.10493393987417221, "step": 20070 }, { "epoch": 6.694462975316878, "loss": 0.7689107060432434, "step": 20070 }, { "ce_loss": 0.15808147192001343, "epoch": 6.694462975316878, "step": 20070 }, { "distill_loss": 0.30683696269989014, "epoch": 6.694462975316878, "step": 20070 }, { "epoch": 6.694462975316878, "ref_ce_loss": 0.10166820138692856, "step": 20070 }, { "epoch": 6.697798532354903, "loss": 0.6243, "step": 20080 }, { "epoch": 6.697798532354903, "grad_norm": 2.2686753273010254, "step": 20080 }, { "epoch": 6.697798532354903, "learning_rate": 0.00020778342397030693, "step": 20080 }, { "epoch": 6.697798532354903, "loss": 0.639522910118103, "step": 20080 }, { "ce_loss": 0.1620551347732544, "epoch": 6.697798532354903, "step": 20080 }, { "distill_loss": 0.3112872540950775, "epoch": 6.697798532354903, "step": 20080 }, { "epoch": 6.697798532354903, "ref_ce_loss": 0.1349463313817978, "step": 20080 }, { "epoch": 6.697798532354903, "loss": 0.5782184600830078, "step": 20080 }, { "ce_loss": 0.14392095804214478, "epoch": 6.697798532354903, "step": 20080 }, { "distill_loss": 0.2486431896686554, "epoch": 6.697798532354903, "step": 20080 }, { "epoch": 6.697798532354903, "ref_ce_loss": 0.1579214185476303, "step": 20080 }, { "epoch": 6.701134089392928, "loss": 0.6096, "step": 20090 }, { "epoch": 6.701134089392928, "grad_norm": 1.7706823348999023, "step": 20090 }, { "epoch": 6.701134089392928, "learning_rate": 0.0002074045693824275, "step": 20090 }, { "epoch": 6.701134089392928, "loss": 0.7955951690673828, "step": 20090 }, { "ce_loss": 0.10051006078720093, "epoch": 6.701134089392928, "step": 20090 }, { "distill_loss": 0.2936019003391266, "epoch": 6.701134089392928, "step": 20090 }, { "epoch": 6.701134089392928, "ref_ce_loss": 0.11262084543704987, "step": 20090 }, { "epoch": 6.701134089392928, "loss": 0.4961718022823334, "step": 20090 }, { "ce_loss": 0.1312670260667801, "epoch": 6.701134089392928, "step": 20090 }, { "distill_loss": 0.2476268708705902, "epoch": 6.701134089392928, "step": 20090 }, { "epoch": 6.701134089392928, "ref_ce_loss": 0.09604895859956741, "step": 20090 }, { "epoch": 6.704469646430954, "loss": 0.6018, "step": 20100 }, { "epoch": 6.704469646430954, "grad_norm": 1.0696649551391602, "step": 20100 }, { "epoch": 6.704469646430954, "learning_rate": 0.00020702593957413971, "step": 20100 }, { "epoch": 6.704469646430954, "loss": 0.5171288847923279, "step": 20100 }, { "ce_loss": 0.16199396550655365, "epoch": 6.704469646430954, "step": 20100 }, { "distill_loss": 0.2498839944601059, "epoch": 6.704469646430954, "step": 20100 }, { "epoch": 6.704469646430954, "ref_ce_loss": 0.10504694283008575, "step": 20100 }, { "epoch": 6.704469646430954, "loss": 0.6891152858734131, "step": 20100 }, { "ce_loss": 0.16890086233615875, "epoch": 6.704469646430954, "step": 20100 }, { "distill_loss": 0.27810660004615784, "epoch": 6.704469646430954, "step": 20100 }, { "epoch": 6.704469646430954, "ref_ce_loss": 0.13627681136131287, "step": 20100 }, { "epoch": 6.707805203468979, "loss": 0.636, "step": 20110 }, { "epoch": 6.707805203468979, "grad_norm": 1.3927689790725708, "step": 20110 }, { "epoch": 6.707805203468979, "learning_rate": 0.00020664753498734554, "step": 20110 }, { "epoch": 6.707805203468979, "loss": 0.44129008054733276, "step": 20110 }, { "ce_loss": 0.10400844365358353, "epoch": 6.707805203468979, "step": 20110 }, { "distill_loss": 0.2058953046798706, "epoch": 6.707805203468979, "step": 20110 }, { "epoch": 6.707805203468979, "ref_ce_loss": 0.1013561561703682, "step": 20110 }, { "epoch": 6.707805203468979, "loss": 0.5037030577659607, "step": 20110 }, { "ce_loss": 0.140580952167511, "epoch": 6.707805203468979, "step": 20110 }, { "distill_loss": 0.20188310742378235, "epoch": 6.707805203468979, "step": 20110 }, { "epoch": 6.707805203468979, "ref_ce_loss": 0.13261297345161438, "step": 20110 }, { "epoch": 6.711140760507004, "loss": 0.5781, "step": 20120 }, { "epoch": 6.711140760507004, "grad_norm": 1.409254550933838, "step": 20120 }, { "epoch": 6.711140760507004, "learning_rate": 0.00020626935606368342, "step": 20120 }, { "epoch": 6.711140760507004, "loss": 0.45771175622940063, "step": 20120 }, { "ce_loss": 0.08707324415445328, "epoch": 6.711140760507004, "step": 20120 }, { "distill_loss": 0.2113952487707138, "epoch": 6.711140760507004, "step": 20120 }, { "epoch": 6.711140760507004, "ref_ce_loss": 0.11662886291742325, "step": 20120 }, { "epoch": 6.711140760507004, "loss": 0.6429270505905151, "step": 20120 }, { "ce_loss": 0.10872305184602737, "epoch": 6.711140760507004, "step": 20120 }, { "distill_loss": 0.2824891209602356, "epoch": 6.711140760507004, "step": 20120 }, { "epoch": 6.711140760507004, "ref_ce_loss": 0.12237393110990524, "step": 20120 }, { "epoch": 6.71447631754503, "loss": 0.6371, "step": 20130 }, { "epoch": 6.71447631754503, "grad_norm": 1.7113875150680542, "step": 20130 }, { "epoch": 6.71447631754503, "learning_rate": 0.0002058914032445289, "step": 20130 }, { "epoch": 6.71447631754503, "loss": 0.5755742192268372, "step": 20130 }, { "ce_loss": 0.15556345880031586, "epoch": 6.71447631754503, "step": 20130 }, { "distill_loss": 0.23803101480007172, "epoch": 6.71447631754503, "step": 20130 }, { "epoch": 6.71447631754503, "ref_ce_loss": 0.1206558421254158, "step": 20130 }, { "epoch": 6.71447631754503, "loss": 0.6326802372932434, "step": 20130 }, { "ce_loss": 0.1521034687757492, "epoch": 6.71447631754503, "step": 20130 }, { "distill_loss": 0.32943302392959595, "epoch": 6.71447631754503, "step": 20130 }, { "epoch": 6.71447631754503, "ref_ce_loss": 0.12123904377222061, "step": 20130 }, { "epoch": 6.717811874583055, "loss": 0.6306, "step": 20140 }, { "epoch": 6.717811874583055, "grad_norm": 1.4116154909133911, "step": 20140 }, { "epoch": 6.717811874583055, "learning_rate": 0.00020551367697099404, "step": 20140 }, { "epoch": 6.717811874583055, "loss": 0.6386064291000366, "step": 20140 }, { "ce_loss": 0.1630381941795349, "epoch": 6.717811874583055, "step": 20140 }, { "distill_loss": 0.359139621257782, "epoch": 6.717811874583055, "step": 20140 }, { "epoch": 6.717811874583055, "ref_ce_loss": 0.11624115705490112, "step": 20140 }, { "epoch": 6.717811874583055, "loss": 0.7267446517944336, "step": 20140 }, { "ce_loss": 0.09767545014619827, "epoch": 6.717811874583055, "step": 20140 }, { "distill_loss": 0.2750421464443207, "epoch": 6.717811874583055, "step": 20140 }, { "epoch": 6.717811874583055, "ref_ce_loss": 0.11414022743701935, "step": 20140 }, { "epoch": 6.72114743162108, "loss": 0.6079, "step": 20150 }, { "epoch": 6.72114743162108, "grad_norm": 1.8769910335540771, "step": 20150 }, { "epoch": 6.72114743162108, "learning_rate": 0.00020513617768392562, "step": 20150 }, { "epoch": 6.72114743162108, "loss": 0.6491705775260925, "step": 20150 }, { "ce_loss": 0.08372032642364502, "epoch": 6.72114743162108, "step": 20150 }, { "distill_loss": 0.2563663721084595, "epoch": 6.72114743162108, "step": 20150 }, { "epoch": 6.72114743162108, "ref_ce_loss": 0.10501579195261002, "step": 20150 }, { "epoch": 6.72114743162108, "loss": 0.7487512230873108, "step": 20150 }, { "ce_loss": 0.1768907606601715, "epoch": 6.72114743162108, "step": 20150 }, { "distill_loss": 0.263735830783844, "epoch": 6.72114743162108, "step": 20150 }, { "epoch": 6.72114743162108, "ref_ce_loss": 0.1748792678117752, "step": 20150 }, { "epoch": 6.724482988659106, "loss": 0.5921, "step": 20160 }, { "epoch": 6.724482988659106, "grad_norm": 1.0785341262817383, "step": 20160 }, { "epoch": 6.724482988659106, "learning_rate": 0.00020475890582390607, "step": 20160 }, { "epoch": 6.724482988659106, "loss": 0.6199877262115479, "step": 20160 }, { "ce_loss": 0.14046601951122284, "epoch": 6.724482988659106, "step": 20160 }, { "distill_loss": 0.25729498267173767, "epoch": 6.724482988659106, "step": 20160 }, { "epoch": 6.724482988659106, "ref_ce_loss": 0.1453024297952652, "step": 20160 }, { "epoch": 6.724482988659106, "loss": 0.5459374785423279, "step": 20160 }, { "ce_loss": 0.12143059819936752, "epoch": 6.724482988659106, "step": 20160 }, { "distill_loss": 0.26327502727508545, "epoch": 6.724482988659106, "step": 20160 }, { "epoch": 6.724482988659106, "ref_ce_loss": 0.12969057261943817, "step": 20160 }, { "epoch": 6.727818545697131, "loss": 0.6833, "step": 20170 }, { "epoch": 6.727818545697131, "grad_norm": 4.0255961418151855, "step": 20170 }, { "epoch": 6.727818545697131, "learning_rate": 0.0002043818618312522, "step": 20170 }, { "epoch": 6.727818545697131, "loss": 0.5204952955245972, "step": 20170 }, { "ce_loss": 0.12066006660461426, "epoch": 6.727818545697131, "step": 20170 }, { "distill_loss": 0.2361370325088501, "epoch": 6.727818545697131, "step": 20170 }, { "epoch": 6.727818545697131, "ref_ce_loss": 0.13006658852100372, "step": 20170 }, { "epoch": 6.727818545697131, "loss": 0.6388102173805237, "step": 20170 }, { "ce_loss": 0.1897311508655548, "epoch": 6.727818545697131, "step": 20170 }, { "distill_loss": 0.31053489446640015, "epoch": 6.727818545697131, "step": 20170 }, { "epoch": 6.727818545697131, "ref_ce_loss": 0.11395397782325745, "step": 20170 }, { "epoch": 6.7311541027351565, "loss": 0.5882, "step": 20180 }, { "epoch": 6.7311541027351565, "grad_norm": 2.0025501251220703, "step": 20180 }, { "epoch": 6.7311541027351565, "learning_rate": 0.00020400504614601515, "step": 20180 }, { "epoch": 6.7311541027351565, "loss": 0.6728408336639404, "step": 20180 }, { "ce_loss": 0.18863923847675323, "epoch": 6.7311541027351565, "step": 20180 }, { "distill_loss": 0.32389456033706665, "epoch": 6.7311541027351565, "step": 20180 }, { "epoch": 6.7311541027351565, "ref_ce_loss": 0.15970559418201447, "step": 20180 }, { "epoch": 6.7311541027351565, "loss": 0.545183539390564, "step": 20180 }, { "ce_loss": 0.11113359034061432, "epoch": 6.7311541027351565, "step": 20180 }, { "distill_loss": 0.23649749159812927, "epoch": 6.7311541027351565, "step": 20180 }, { "epoch": 6.7311541027351565, "ref_ce_loss": 0.12272656708955765, "step": 20180 }, { "epoch": 6.734489659773182, "loss": 0.6417, "step": 20190 }, { "epoch": 6.734489659773182, "grad_norm": 1.689151644706726, "step": 20190 }, { "epoch": 6.734489659773182, "learning_rate": 0.00020362845920797898, "step": 20190 }, { "epoch": 6.734489659773182, "loss": 0.6169806718826294, "step": 20190 }, { "ce_loss": 0.1869346797466278, "epoch": 6.734489659773182, "step": 20190 }, { "distill_loss": 0.2971252501010895, "epoch": 6.734489659773182, "step": 20190 }, { "epoch": 6.734489659773182, "ref_ce_loss": 0.10956262052059174, "step": 20190 }, { "epoch": 6.734489659773182, "loss": 0.43868401646614075, "step": 20190 }, { "ce_loss": 0.11213511228561401, "epoch": 6.734489659773182, "step": 20190 }, { "distill_loss": 0.22156301140785217, "epoch": 6.734489659773182, "step": 20190 }, { "epoch": 6.734489659773182, "ref_ce_loss": 0.10480619966983795, "step": 20190 }, { "epoch": 6.737825216811207, "loss": 0.5495, "step": 20200 }, { "epoch": 6.737825216811207, "grad_norm": 1.1433922052383423, "step": 20200 }, { "epoch": 6.737825216811207, "learning_rate": 0.0002032521014566614, "step": 20200 }, { "epoch": 6.737825216811207, "loss": 0.6528211236000061, "step": 20200 }, { "ce_loss": 0.16544964909553528, "epoch": 6.737825216811207, "step": 20200 }, { "distill_loss": 0.2605472803115845, "epoch": 6.737825216811207, "step": 20200 }, { "epoch": 6.737825216811207, "ref_ce_loss": 0.12842920422554016, "step": 20200 }, { "epoch": 6.737825216811207, "loss": 0.6164642572402954, "step": 20200 }, { "ce_loss": 0.1673274040222168, "epoch": 6.737825216811207, "step": 20200 }, { "distill_loss": 0.2415827512741089, "epoch": 6.737825216811207, "step": 20200 }, { "epoch": 6.737825216811207, "ref_ce_loss": 0.128843292593956, "step": 20200 }, { "epoch": 6.7411607738492325, "loss": 0.5752, "step": 20210 }, { "epoch": 6.7411607738492325, "grad_norm": 1.4512262344360352, "step": 20210 }, { "epoch": 6.7411607738492325, "learning_rate": 0.00020287597333131232, "step": 20210 }, { "epoch": 6.7411607738492325, "loss": 0.5911370515823364, "step": 20210 }, { "ce_loss": 0.10620848089456558, "epoch": 6.7411607738492325, "step": 20210 }, { "distill_loss": 0.28432372212409973, "epoch": 6.7411607738492325, "step": 20210 }, { "epoch": 6.7411607738492325, "ref_ce_loss": 0.11449485272169113, "step": 20210 }, { "epoch": 6.7411607738492325, "loss": 0.5357170104980469, "step": 20210 }, { "ce_loss": 0.12888075411319733, "epoch": 6.7411607738492325, "step": 20210 }, { "distill_loss": 0.1979178935289383, "epoch": 6.7411607738492325, "step": 20210 }, { "epoch": 6.7411607738492325, "ref_ce_loss": 0.1619427651166916, "step": 20210 }, { "epoch": 6.744496330887258, "loss": 0.5962, "step": 20220 }, { "epoch": 6.744496330887258, "grad_norm": 1.2024363279342651, "step": 20220 }, { "epoch": 6.744496330887258, "learning_rate": 0.000202500075270914, "step": 20220 }, { "epoch": 6.744496330887258, "loss": 0.3185579776763916, "step": 20220 }, { "ce_loss": 0.03577226400375366, "epoch": 6.744496330887258, "step": 20220 }, { "distill_loss": 0.1970852166414261, "epoch": 6.744496330887258, "step": 20220 }, { "epoch": 6.744496330887258, "ref_ce_loss": 0.06377988308668137, "step": 20220 }, { "epoch": 6.744496330887258, "loss": 0.4299435019493103, "step": 20220 }, { "ce_loss": 0.08698124438524246, "epoch": 6.744496330887258, "step": 20220 }, { "distill_loss": 0.2009291648864746, "epoch": 6.744496330887258, "step": 20220 }, { "epoch": 6.744496330887258, "ref_ce_loss": 0.08196726441383362, "step": 20220 }, { "epoch": 6.747831887925283, "loss": 0.5674, "step": 20230 }, { "epoch": 6.747831887925283, "grad_norm": 3.3245155811309814, "step": 20230 }, { "epoch": 6.747831887925283, "learning_rate": 0.00020212440771417956, "step": 20230 }, { "epoch": 6.747831887925283, "loss": 0.5869963765144348, "step": 20230 }, { "ce_loss": 0.16733090579509735, "epoch": 6.747831887925283, "step": 20230 }, { "distill_loss": 0.29322487115859985, "epoch": 6.747831887925283, "step": 20230 }, { "epoch": 6.747831887925283, "ref_ce_loss": 0.12613563239574432, "step": 20230 }, { "epoch": 6.747831887925283, "loss": 0.5164973735809326, "step": 20230 }, { "ce_loss": 0.11459538340568542, "epoch": 6.747831887925283, "step": 20230 }, { "distill_loss": 0.22587640583515167, "epoch": 6.747831887925283, "step": 20230 }, { "epoch": 6.747831887925283, "ref_ce_loss": 0.0881464034318924, "step": 20230 }, { "epoch": 6.751167444963309, "loss": 0.6506, "step": 20240 }, { "epoch": 6.751167444963309, "grad_norm": 1.2759590148925781, "step": 20240 }, { "epoch": 6.751167444963309, "learning_rate": 0.00020174897109955338, "step": 20240 }, { "epoch": 6.751167444963309, "loss": 0.6824588179588318, "step": 20240 }, { "ce_loss": 0.10708875209093094, "epoch": 6.751167444963309, "step": 20240 }, { "distill_loss": 0.37594008445739746, "epoch": 6.751167444963309, "step": 20240 }, { "epoch": 6.751167444963309, "ref_ce_loss": 0.1465224176645279, "step": 20240 }, { "epoch": 6.751167444963309, "loss": 0.5953068733215332, "step": 20240 }, { "ce_loss": 0.08849772065877914, "epoch": 6.751167444963309, "step": 20240 }, { "distill_loss": 0.26119664311408997, "epoch": 6.751167444963309, "step": 20240 }, { "epoch": 6.751167444963309, "ref_ce_loss": 0.11912015080451965, "step": 20240 }, { "epoch": 6.754503002001334, "loss": 0.6006, "step": 20250 }, { "epoch": 6.754503002001334, "grad_norm": 1.7669841051101685, "step": 20250 }, { "epoch": 6.754503002001334, "learning_rate": 0.00020137376586521085, "step": 20250 }, { "epoch": 6.754503002001334, "loss": 0.6398548483848572, "step": 20250 }, { "ce_loss": 0.16085104644298553, "epoch": 6.754503002001334, "step": 20250 }, { "distill_loss": 0.26059287786483765, "epoch": 6.754503002001334, "step": 20250 }, { "epoch": 6.754503002001334, "ref_ce_loss": 0.11758947372436523, "step": 20250 }, { "epoch": 6.754503002001334, "loss": 0.5975475907325745, "step": 20250 }, { "ce_loss": 0.14858491718769073, "epoch": 6.754503002001334, "step": 20250 }, { "distill_loss": 0.23770475387573242, "epoch": 6.754503002001334, "step": 20250 }, { "epoch": 6.754503002001334, "ref_ce_loss": 0.10475146770477295, "step": 20250 }, { "epoch": 6.757838559039359, "loss": 0.7459, "step": 20260 }, { "epoch": 6.757838559039359, "grad_norm": 1.4378308057785034, "step": 20260 }, { "epoch": 6.757838559039359, "learning_rate": 0.00020099879244905676, "step": 20260 }, { "epoch": 6.757838559039359, "loss": 0.46980538964271545, "step": 20260 }, { "ce_loss": 0.10932157188653946, "epoch": 6.757838559039359, "step": 20260 }, { "distill_loss": 0.26287272572517395, "epoch": 6.757838559039359, "step": 20260 }, { "epoch": 6.757838559039359, "ref_ce_loss": 0.09724202007055283, "step": 20260 }, { "epoch": 6.757838559039359, "loss": 0.44305968284606934, "step": 20260 }, { "ce_loss": 0.07696262001991272, "epoch": 6.757838559039359, "step": 20260 }, { "distill_loss": 0.24827659130096436, "epoch": 6.757838559039359, "step": 20260 }, { "epoch": 6.757838559039359, "ref_ce_loss": 0.11764057725667953, "step": 20260 }, { "epoch": 6.761174116077385, "loss": 0.6272, "step": 20270 }, { "epoch": 6.761174116077385, "grad_norm": 1.1700968742370605, "step": 20270 }, { "epoch": 6.761174116077385, "learning_rate": 0.0002006240512887251, "step": 20270 }, { "epoch": 6.761174116077385, "loss": 0.5167644023895264, "step": 20270 }, { "ce_loss": 0.13465654850006104, "epoch": 6.761174116077385, "step": 20270 }, { "distill_loss": 0.25756293535232544, "epoch": 6.761174116077385, "step": 20270 }, { "epoch": 6.761174116077385, "ref_ce_loss": 0.09358154237270355, "step": 20270 }, { "epoch": 6.761174116077385, "loss": 0.6173653602600098, "step": 20270 }, { "ce_loss": 0.1697811782360077, "epoch": 6.761174116077385, "step": 20270 }, { "distill_loss": 0.2595403790473938, "epoch": 6.761174116077385, "step": 20270 }, { "epoch": 6.761174116077385, "ref_ce_loss": 0.11999509483575821, "step": 20270 }, { "epoch": 6.76450967311541, "loss": 0.5765, "step": 20280 }, { "epoch": 6.76450967311541, "grad_norm": 1.1367998123168945, "step": 20280 }, { "epoch": 6.76450967311541, "learning_rate": 0.0002002495428215794, "step": 20280 }, { "epoch": 6.76450967311541, "loss": 0.4597811698913574, "step": 20280 }, { "ce_loss": 0.10254833847284317, "epoch": 6.76450967311541, "step": 20280 }, { "distill_loss": 0.23351740837097168, "epoch": 6.76450967311541, "step": 20280 }, { "epoch": 6.76450967311541, "ref_ce_loss": 0.12356647849082947, "step": 20280 }, { "epoch": 6.76450967311541, "loss": 0.7911397218704224, "step": 20280 }, { "ce_loss": 0.1131599172949791, "epoch": 6.76450967311541, "step": 20280 }, { "distill_loss": 0.26167136430740356, "epoch": 6.76450967311541, "step": 20280 }, { "epoch": 6.76450967311541, "ref_ce_loss": 0.12388217449188232, "step": 20280 }, { "epoch": 6.767845230153435, "loss": 0.5609, "step": 20290 }, { "epoch": 6.767845230153435, "grad_norm": 1.8873783349990845, "step": 20290 }, { "epoch": 6.767845230153435, "learning_rate": 0.0001998752674847112, "step": 20290 }, { "epoch": 6.767845230153435, "loss": 0.6501887440681458, "step": 20290 }, { "ce_loss": 0.18052825331687927, "epoch": 6.767845230153435, "step": 20290 }, { "distill_loss": 0.2731967568397522, "epoch": 6.767845230153435, "step": 20290 }, { "epoch": 6.767845230153435, "ref_ce_loss": 0.14176741242408752, "step": 20290 }, { "epoch": 6.767845230153435, "loss": 0.7093862295150757, "step": 20290 }, { "ce_loss": 0.16083455085754395, "epoch": 6.767845230153435, "step": 20290 }, { "distill_loss": 0.24816177785396576, "epoch": 6.767845230153435, "step": 20290 }, { "epoch": 6.767845230153435, "ref_ce_loss": 0.15041442215442657, "step": 20290 }, { "epoch": 6.771180787191461, "loss": 0.6082, "step": 20300 }, { "epoch": 6.771180787191461, "grad_norm": 2.3813726902008057, "step": 20300 }, { "epoch": 6.771180787191461, "learning_rate": 0.00019950122571494038, "step": 20300 }, { "epoch": 6.771180787191461, "loss": 0.5191268920898438, "step": 20300 }, { "ce_loss": 0.1417926847934723, "epoch": 6.771180787191461, "step": 20300 }, { "distill_loss": 0.25416481494903564, "epoch": 6.771180787191461, "step": 20300 }, { "epoch": 6.771180787191461, "ref_ce_loss": 0.12228623032569885, "step": 20300 }, { "epoch": 6.771180787191461, "loss": 0.7120752334594727, "step": 20300 }, { "ce_loss": 0.10820872336626053, "epoch": 6.771180787191461, "step": 20300 }, { "distill_loss": 0.21667304635047913, "epoch": 6.771180787191461, "step": 20300 }, { "epoch": 6.771180787191461, "ref_ce_loss": 0.12783846259117126, "step": 20300 }, { "epoch": 6.774516344229486, "loss": 0.6154, "step": 20310 }, { "epoch": 6.774516344229486, "grad_norm": 1.7450374364852905, "step": 20310 }, { "epoch": 6.774516344229486, "learning_rate": 0.00019912741794881348, "step": 20310 }, { "epoch": 6.774516344229486, "loss": 0.42318326234817505, "step": 20310 }, { "ce_loss": 0.0868317186832428, "epoch": 6.774516344229486, "step": 20310 }, { "distill_loss": 0.2112046629190445, "epoch": 6.774516344229486, "step": 20310 }, { "epoch": 6.774516344229486, "ref_ce_loss": 0.10673084110021591, "step": 20310 }, { "epoch": 6.774516344229486, "loss": 0.5567744374275208, "step": 20310 }, { "ce_loss": 0.1336977630853653, "epoch": 6.774516344229486, "step": 20310 }, { "distill_loss": 0.23806677758693695, "epoch": 6.774516344229486, "step": 20310 }, { "epoch": 6.774516344229486, "ref_ce_loss": 0.11441931873559952, "step": 20310 }, { "epoch": 6.777851901267511, "loss": 0.5744, "step": 20320 }, { "epoch": 6.777851901267511, "grad_norm": 1.237932801246643, "step": 20320 }, { "epoch": 6.777851901267511, "learning_rate": 0.00019875384462260466, "step": 20320 }, { "epoch": 6.777851901267511, "loss": 0.34857824444770813, "step": 20320 }, { "ce_loss": 0.062375832349061966, "epoch": 6.777851901267511, "step": 20320 }, { "distill_loss": 0.19724702835083008, "epoch": 6.777851901267511, "step": 20320 }, { "epoch": 6.777851901267511, "ref_ce_loss": 0.08858238160610199, "step": 20320 }, { "epoch": 6.777851901267511, "loss": 0.5977160930633545, "step": 20320 }, { "ce_loss": 0.16513806581497192, "epoch": 6.777851901267511, "step": 20320 }, { "distill_loss": 0.2866596281528473, "epoch": 6.777851901267511, "step": 20320 }, { "epoch": 6.777851901267511, "ref_ce_loss": 0.08822454512119293, "step": 20320 }, { "epoch": 6.781187458305537, "loss": 0.5947, "step": 20330 }, { "epoch": 6.781187458305537, "grad_norm": 1.3662400245666504, "step": 20330 }, { "epoch": 6.781187458305537, "learning_rate": 0.00019838050617231417, "step": 20330 }, { "epoch": 6.781187458305537, "loss": 0.7048400640487671, "step": 20330 }, { "ce_loss": 0.1667744368314743, "epoch": 6.781187458305537, "step": 20330 }, { "distill_loss": 0.27122920751571655, "epoch": 6.781187458305537, "step": 20330 }, { "epoch": 6.781187458305537, "ref_ce_loss": 0.12117403745651245, "step": 20330 }, { "epoch": 6.781187458305537, "loss": 0.835236668586731, "step": 20330 }, { "ce_loss": 0.20711590349674225, "epoch": 6.781187458305537, "step": 20330 }, { "distill_loss": 0.27225542068481445, "epoch": 6.781187458305537, "step": 20330 }, { "epoch": 6.781187458305537, "ref_ce_loss": 0.17776216566562653, "step": 20330 }, { "epoch": 6.784523015343562, "loss": 0.5946, "step": 20340 }, { "epoch": 6.784523015343562, "grad_norm": 2.784146785736084, "step": 20340 }, { "epoch": 6.784523015343562, "learning_rate": 0.00019800740303366822, "step": 20340 }, { "epoch": 6.784523015343562, "loss": 0.5680144429206848, "step": 20340 }, { "ce_loss": 0.15996579825878143, "epoch": 6.784523015343562, "step": 20340 }, { "distill_loss": 0.25848570466041565, "epoch": 6.784523015343562, "step": 20340 }, { "epoch": 6.784523015343562, "ref_ce_loss": 0.11828726530075073, "step": 20340 }, { "epoch": 6.784523015343562, "loss": 0.706413745880127, "step": 20340 }, { "ce_loss": 0.11593703180551529, "epoch": 6.784523015343562, "step": 20340 }, { "distill_loss": 0.2646014392375946, "epoch": 6.784523015343562, "step": 20340 }, { "epoch": 6.784523015343562, "ref_ce_loss": 0.10727044194936752, "step": 20340 }, { "epoch": 6.787858572381587, "loss": 0.5514, "step": 20350 }, { "epoch": 6.787858572381587, "grad_norm": 1.3041218519210815, "step": 20350 }, { "epoch": 6.787858572381587, "learning_rate": 0.00019763453564211795, "step": 20350 }, { "epoch": 6.787858572381587, "loss": 0.4949185252189636, "step": 20350 }, { "ce_loss": 0.13301748037338257, "epoch": 6.787858572381587, "step": 20350 }, { "distill_loss": 0.24916112422943115, "epoch": 6.787858572381587, "step": 20350 }, { "epoch": 6.787858572381587, "ref_ce_loss": 0.11259014904499054, "step": 20350 }, { "epoch": 6.787858572381587, "loss": 0.6650046706199646, "step": 20350 }, { "ce_loss": 0.10717565566301346, "epoch": 6.787858572381587, "step": 20350 }, { "distill_loss": 0.2219514697790146, "epoch": 6.787858572381587, "step": 20350 }, { "epoch": 6.787858572381587, "ref_ce_loss": 0.10976817458868027, "step": 20350 }, { "epoch": 6.791194129419613, "loss": 0.6297, "step": 20360 }, { "epoch": 6.791194129419613, "grad_norm": 1.8092106580734253, "step": 20360 }, { "epoch": 6.791194129419613, "learning_rate": 0.00019726190443284018, "step": 20360 }, { "epoch": 6.791194129419613, "loss": 0.4623590111732483, "step": 20360 }, { "ce_loss": 0.09109138697385788, "epoch": 6.791194129419613, "step": 20360 }, { "distill_loss": 0.22240050137043, "epoch": 6.791194129419613, "step": 20360 }, { "epoch": 6.791194129419613, "ref_ce_loss": 0.11351132392883301, "step": 20360 }, { "epoch": 6.791194129419613, "loss": 0.5489964485168457, "step": 20360 }, { "ce_loss": 0.12590204179286957, "epoch": 6.791194129419613, "step": 20360 }, { "distill_loss": 0.2594718933105469, "epoch": 6.791194129419613, "step": 20360 }, { "epoch": 6.791194129419613, "ref_ce_loss": 0.13404986262321472, "step": 20360 }, { "epoch": 6.794529686457638, "loss": 0.6037, "step": 20370 }, { "epoch": 6.794529686457638, "grad_norm": 1.3519389629364014, "step": 20370 }, { "epoch": 6.794529686457638, "learning_rate": 0.00019688950984073563, "step": 20370 }, { "epoch": 6.794529686457638, "loss": 0.47837817668914795, "step": 20370 }, { "ce_loss": 0.09936484694480896, "epoch": 6.794529686457638, "step": 20370 }, { "distill_loss": 0.2567150592803955, "epoch": 6.794529686457638, "step": 20370 }, { "epoch": 6.794529686457638, "ref_ce_loss": 0.1220482885837555, "step": 20370 }, { "epoch": 6.794529686457638, "loss": 0.5348929762840271, "step": 20370 }, { "ce_loss": 0.14279955625534058, "epoch": 6.794529686457638, "step": 20370 }, { "distill_loss": 0.25085532665252686, "epoch": 6.794529686457638, "step": 20370 }, { "epoch": 6.794529686457638, "ref_ce_loss": 0.11229889839887619, "step": 20370 }, { "epoch": 6.7978652434956635, "loss": 0.608, "step": 20380 }, { "epoch": 6.7978652434956635, "grad_norm": 2.1077394485473633, "step": 20380 }, { "epoch": 6.7978652434956635, "learning_rate": 0.00019651735230042852, "step": 20380 }, { "epoch": 6.7978652434956635, "loss": 0.618941605091095, "step": 20380 }, { "ce_loss": 0.12865251302719116, "epoch": 6.7978652434956635, "step": 20380 }, { "distill_loss": 0.22706618905067444, "epoch": 6.7978652434956635, "step": 20380 }, { "epoch": 6.7978652434956635, "ref_ce_loss": 0.10655650496482849, "step": 20380 }, { "epoch": 6.7978652434956635, "loss": 0.5625501275062561, "step": 20380 }, { "ce_loss": 0.09577847272157669, "epoch": 6.7978652434956635, "step": 20380 }, { "distill_loss": 0.2644766569137573, "epoch": 6.7978652434956635, "step": 20380 }, { "epoch": 6.7978652434956635, "ref_ce_loss": 0.14019955694675446, "step": 20380 }, { "epoch": 6.801200800533689, "loss": 0.6264, "step": 20390 }, { "epoch": 6.801200800533689, "grad_norm": 1.5742781162261963, "step": 20390 }, { "epoch": 6.801200800533689, "learning_rate": 0.00019614543224626688, "step": 20390 }, { "epoch": 6.801200800533689, "loss": 0.5375264883041382, "step": 20390 }, { "ce_loss": 0.11992277204990387, "epoch": 6.801200800533689, "step": 20390 }, { "distill_loss": 0.21302179992198944, "epoch": 6.801200800533689, "step": 20390 }, { "epoch": 6.801200800533689, "ref_ce_loss": 0.10849941521883011, "step": 20390 }, { "epoch": 6.801200800533689, "loss": 0.9217984676361084, "step": 20390 }, { "ce_loss": 0.1319456547498703, "epoch": 6.801200800533689, "step": 20390 }, { "distill_loss": 0.23134317994117737, "epoch": 6.801200800533689, "step": 20390 }, { "epoch": 6.801200800533689, "ref_ce_loss": 0.10734369605779648, "step": 20390 }, { "epoch": 6.804536357571714, "loss": 0.5953, "step": 20400 }, { "epoch": 6.804536357571714, "grad_norm": 1.4438633918762207, "step": 20400 }, { "epoch": 6.804536357571714, "learning_rate": 0.00019577375011232154, "step": 20400 }, { "epoch": 6.804536357571714, "loss": 0.5196937322616577, "step": 20400 }, { "ce_loss": 0.08157113939523697, "epoch": 6.804536357571714, "step": 20400 }, { "distill_loss": 0.2544917166233063, "epoch": 6.804536357571714, "step": 20400 }, { "epoch": 6.804536357571714, "ref_ce_loss": 0.1237298771739006, "step": 20400 }, { "epoch": 6.804536357571714, "loss": 1.1017556190490723, "step": 20400 }, { "ce_loss": 0.1311476230621338, "epoch": 6.804536357571714, "step": 20400 }, { "distill_loss": 0.364717960357666, "epoch": 6.804536357571714, "step": 20400 }, { "epoch": 6.804536357571714, "ref_ce_loss": 0.1520640254020691, "step": 20400 }, { "epoch": 6.8078719146097395, "loss": 0.6329, "step": 20410 }, { "epoch": 6.8078719146097395, "grad_norm": 1.2128087282180786, "step": 20410 }, { "epoch": 6.8078719146097395, "learning_rate": 0.0001954023063323854, "step": 20410 }, { "epoch": 6.8078719146097395, "loss": 0.5847972631454468, "step": 20410 }, { "ce_loss": 0.13812997937202454, "epoch": 6.8078719146097395, "step": 20410 }, { "distill_loss": 0.2738398313522339, "epoch": 6.8078719146097395, "step": 20410 }, { "epoch": 6.8078719146097395, "ref_ce_loss": 0.13805897533893585, "step": 20410 }, { "epoch": 6.8078719146097395, "loss": 0.7715336084365845, "step": 20410 }, { "ce_loss": 0.10610752552747726, "epoch": 6.8078719146097395, "step": 20410 }, { "distill_loss": 0.24427151679992676, "epoch": 6.8078719146097395, "step": 20410 }, { "epoch": 6.8078719146097395, "ref_ce_loss": 0.10848834365606308, "step": 20410 }, { "epoch": 6.811207471647765, "loss": 0.6139, "step": 20420 }, { "epoch": 6.811207471647765, "grad_norm": 2.0211548805236816, "step": 20420 }, { "epoch": 6.811207471647765, "learning_rate": 0.00019503110133997357, "step": 20420 }, { "epoch": 6.811207471647765, "loss": 0.6625146269798279, "step": 20420 }, { "ce_loss": 0.12832562625408173, "epoch": 6.811207471647765, "step": 20420 }, { "distill_loss": 0.2269144058227539, "epoch": 6.811207471647765, "step": 20420 }, { "epoch": 6.811207471647765, "ref_ce_loss": 0.11166250705718994, "step": 20420 }, { "epoch": 6.811207471647765, "loss": 0.5218786597251892, "step": 20420 }, { "ce_loss": 0.1277444213628769, "epoch": 6.811207471647765, "step": 20420 }, { "distill_loss": 0.1871224343776703, "epoch": 6.811207471647765, "step": 20420 }, { "epoch": 6.811207471647765, "ref_ce_loss": 0.13162465393543243, "step": 20420 }, { "epoch": 6.81454302868579, "loss": 0.5679, "step": 20430 }, { "epoch": 6.81454302868579, "grad_norm": 1.8764227628707886, "step": 20430 }, { "epoch": 6.81454302868579, "learning_rate": 0.00019466013556832193, "step": 20430 }, { "epoch": 6.81454302868579, "loss": 0.5578723549842834, "step": 20430 }, { "ce_loss": 0.14493246376514435, "epoch": 6.81454302868579, "step": 20430 }, { "distill_loss": 0.270059734582901, "epoch": 6.81454302868579, "step": 20430 }, { "epoch": 6.81454302868579, "ref_ce_loss": 0.11068219691514969, "step": 20430 }, { "epoch": 6.81454302868579, "loss": 0.5604507923126221, "step": 20430 }, { "ce_loss": 0.16456195712089539, "epoch": 6.81454302868579, "step": 20430 }, { "distill_loss": 0.23753252625465393, "epoch": 6.81454302868579, "step": 20430 }, { "epoch": 6.81454302868579, "ref_ce_loss": 0.135604128241539, "step": 20430 }, { "epoch": 6.8178785857238156, "loss": 0.6251, "step": 20440 }, { "epoch": 6.8178785857238156, "grad_norm": 2.8086681365966797, "step": 20440 }, { "epoch": 6.8178785857238156, "learning_rate": 0.0001942894094503875, "step": 20440 }, { "epoch": 6.8178785857238156, "loss": 0.4727604389190674, "step": 20440 }, { "ce_loss": 0.08744503557682037, "epoch": 6.8178785857238156, "step": 20440 }, { "distill_loss": 0.22879493236541748, "epoch": 6.8178785857238156, "step": 20440 }, { "epoch": 6.8178785857238156, "ref_ce_loss": 0.12131603062152863, "step": 20440 }, { "epoch": 6.8178785857238156, "loss": 0.6767879724502563, "step": 20440 }, { "ce_loss": 0.0807797983288765, "epoch": 6.8178785857238156, "step": 20440 }, { "distill_loss": 0.25815528631210327, "epoch": 6.8178785857238156, "step": 20440 }, { "epoch": 6.8178785857238156, "ref_ce_loss": 0.09768194705247879, "step": 20440 }, { "epoch": 6.821214142761841, "loss": 0.6603, "step": 20450 }, { "epoch": 6.821214142761841, "grad_norm": 2.1304433345794678, "step": 20450 }, { "epoch": 6.821214142761841, "learning_rate": 0.00019391892341884766, "step": 20450 }, { "epoch": 6.821214142761841, "loss": 0.4594484865665436, "step": 20450 }, { "ce_loss": 0.10429593920707703, "epoch": 6.821214142761841, "step": 20450 }, { "distill_loss": 0.22468678653240204, "epoch": 6.821214142761841, "step": 20450 }, { "epoch": 6.821214142761841, "ref_ce_loss": 0.0894937589764595, "step": 20450 }, { "epoch": 6.821214142761841, "loss": 0.5158959627151489, "step": 20450 }, { "ce_loss": 0.10562293976545334, "epoch": 6.821214142761841, "step": 20450 }, { "distill_loss": 0.25338807702064514, "epoch": 6.821214142761841, "step": 20450 }, { "epoch": 6.821214142761841, "ref_ce_loss": 0.1233699768781662, "step": 20450 }, { "epoch": 6.824549699799866, "loss": 0.5414, "step": 20460 }, { "epoch": 6.824549699799866, "grad_norm": 1.297013759613037, "step": 20460 }, { "epoch": 6.824549699799866, "learning_rate": 0.0001935486779060994, "step": 20460 }, { "epoch": 6.824549699799866, "loss": 0.4934908449649811, "step": 20460 }, { "ce_loss": 0.11889277398586273, "epoch": 6.824549699799866, "step": 20460 }, { "distill_loss": 0.24133098125457764, "epoch": 6.824549699799866, "step": 20460 }, { "epoch": 6.824549699799866, "ref_ce_loss": 0.09454958140850067, "step": 20460 }, { "epoch": 6.824549699799866, "loss": 0.5913838744163513, "step": 20460 }, { "ce_loss": 0.11880073696374893, "epoch": 6.824549699799866, "step": 20460 }, { "distill_loss": 0.23581376671791077, "epoch": 6.824549699799866, "step": 20460 }, { "epoch": 6.824549699799866, "ref_ce_loss": 0.11633745580911636, "step": 20460 }, { "epoch": 6.827885256837892, "loss": 0.597, "step": 20470 }, { "epoch": 6.827885256837892, "grad_norm": 2.1039810180664062, "step": 20470 }, { "epoch": 6.827885256837892, "learning_rate": 0.00019317867334425913, "step": 20470 }, { "epoch": 6.827885256837892, "loss": 0.5328623652458191, "step": 20470 }, { "ce_loss": 0.13337953388690948, "epoch": 6.827885256837892, "step": 20470 }, { "distill_loss": 0.20354673266410828, "epoch": 6.827885256837892, "step": 20470 }, { "epoch": 6.827885256837892, "ref_ce_loss": 0.1669895201921463, "step": 20470 }, { "epoch": 6.827885256837892, "loss": 0.9063245058059692, "step": 20470 }, { "ce_loss": 0.07711745798587799, "epoch": 6.827885256837892, "step": 20470 }, { "distill_loss": 0.24881622195243835, "epoch": 6.827885256837892, "step": 20470 }, { "epoch": 6.827885256837892, "ref_ce_loss": 0.10024960339069366, "step": 20470 }, { "epoch": 6.831220813875917, "loss": 0.5842, "step": 20480 }, { "epoch": 6.831220813875917, "grad_norm": 1.2597817182540894, "step": 20480 }, { "epoch": 6.831220813875917, "learning_rate": 0.00019280891016516195, "step": 20480 }, { "epoch": 6.831220813875917, "loss": 0.4338398873806, "step": 20480 }, { "ce_loss": 0.08800597488880157, "epoch": 6.831220813875917, "step": 20480 }, { "distill_loss": 0.22484973073005676, "epoch": 6.831220813875917, "step": 20480 }, { "epoch": 6.831220813875917, "ref_ce_loss": 0.09363920986652374, "step": 20480 }, { "epoch": 6.831220813875917, "loss": 0.6309686303138733, "step": 20480 }, { "ce_loss": 0.12850897014141083, "epoch": 6.831220813875917, "step": 20480 }, { "distill_loss": 0.24589794874191284, "epoch": 6.831220813875917, "step": 20480 }, { "epoch": 6.831220813875917, "ref_ce_loss": 0.1430305540561676, "step": 20480 }, { "epoch": 6.834556370913942, "loss": 0.6174, "step": 20490 }, { "epoch": 6.834556370913942, "grad_norm": 1.2456542253494263, "step": 20490 }, { "epoch": 6.834556370913942, "learning_rate": 0.00019243938880036149, "step": 20490 }, { "epoch": 6.834556370913942, "loss": 0.7051937580108643, "step": 20490 }, { "ce_loss": 0.1593933254480362, "epoch": 6.834556370913942, "step": 20490 }, { "distill_loss": 0.3085207939147949, "epoch": 6.834556370913942, "step": 20490 }, { "epoch": 6.834556370913942, "ref_ce_loss": 0.15409360826015472, "step": 20490 }, { "epoch": 6.834556370913942, "loss": 0.7055217623710632, "step": 20490 }, { "ce_loss": 0.15836520493030548, "epoch": 6.834556370913942, "step": 20490 }, { "distill_loss": 0.3123631477355957, "epoch": 6.834556370913942, "step": 20490 }, { "epoch": 6.834556370913942, "ref_ce_loss": 0.12352539598941803, "step": 20490 }, { "epoch": 6.837891927951968, "loss": 0.5777, "step": 20500 }, { "epoch": 6.837891927951968, "grad_norm": 1.9752660989761353, "step": 20500 }, { "epoch": 6.837891927951968, "learning_rate": 0.00019207010968112854, "step": 20500 }, { "epoch": 6.837891927951968, "loss": 0.4566480815410614, "step": 20500 }, { "ce_loss": 0.11850826442241669, "epoch": 6.837891927951968, "step": 20500 }, { "distill_loss": 0.2331375777721405, "epoch": 6.837891927951968, "step": 20500 }, { "epoch": 6.837891927951968, "ref_ce_loss": 0.10483713448047638, "step": 20500 }, { "epoch": 6.837891927951968, "loss": 0.5652526021003723, "step": 20500 }, { "ce_loss": 0.14450666308403015, "epoch": 6.837891927951968, "step": 20500 }, { "distill_loss": 0.23598337173461914, "epoch": 6.837891927951968, "step": 20500 }, { "epoch": 6.837891927951968, "ref_ce_loss": 0.12301039695739746, "step": 20500 }, { "epoch": 6.841227484989993, "loss": 0.5996, "step": 20510 }, { "epoch": 6.841227484989993, "grad_norm": 1.2535284757614136, "step": 20510 }, { "epoch": 6.841227484989993, "learning_rate": 0.0001917010732384518, "step": 20510 }, { "epoch": 6.841227484989993, "loss": 0.6502880454063416, "step": 20510 }, { "ce_loss": 0.1329495757818222, "epoch": 6.841227484989993, "step": 20510 }, { "distill_loss": 0.30579808354377747, "epoch": 6.841227484989993, "step": 20510 }, { "epoch": 6.841227484989993, "ref_ce_loss": 0.11312013864517212, "step": 20510 }, { "epoch": 6.841227484989993, "loss": 0.4479178488254547, "step": 20510 }, { "ce_loss": 0.06486482918262482, "epoch": 6.841227484989993, "step": 20510 }, { "distill_loss": 0.24453657865524292, "epoch": 6.841227484989993, "step": 20510 }, { "epoch": 6.841227484989993, "ref_ce_loss": 0.10726383328437805, "step": 20510 }, { "epoch": 6.844563042028018, "loss": 0.6205, "step": 20520 }, { "epoch": 6.844563042028018, "grad_norm": 1.237539529800415, "step": 20520 }, { "epoch": 6.844563042028018, "learning_rate": 0.00019133227990303646, "step": 20520 }, { "epoch": 6.844563042028018, "loss": 0.5721306204795837, "step": 20520 }, { "ce_loss": 0.13463234901428223, "epoch": 6.844563042028018, "step": 20520 }, { "distill_loss": 0.21573877334594727, "epoch": 6.844563042028018, "step": 20520 }, { "epoch": 6.844563042028018, "ref_ce_loss": 0.07603821903467178, "step": 20520 }, { "epoch": 6.844563042028018, "loss": 0.5392432808876038, "step": 20520 }, { "ce_loss": 0.0802914947271347, "epoch": 6.844563042028018, "step": 20520 }, { "distill_loss": 0.2541607916355133, "epoch": 6.844563042028018, "step": 20520 }, { "epoch": 6.844563042028018, "ref_ce_loss": 0.07735491544008255, "step": 20520 }, { "epoch": 6.847898599066044, "loss": 0.5919, "step": 20530 }, { "epoch": 6.847898599066044, "grad_norm": 1.5495575666427612, "step": 20530 }, { "epoch": 6.847898599066044, "learning_rate": 0.00019096373010530422, "step": 20530 }, { "epoch": 6.847898599066044, "loss": 0.5933955311775208, "step": 20530 }, { "ce_loss": 0.11911506950855255, "epoch": 6.847898599066044, "step": 20530 }, { "distill_loss": 0.29116567969322205, "epoch": 6.847898599066044, "step": 20530 }, { "epoch": 6.847898599066044, "ref_ce_loss": 0.09230422228574753, "step": 20530 }, { "epoch": 6.847898599066044, "loss": 0.7542802691459656, "step": 20530 }, { "ce_loss": 0.11548207700252533, "epoch": 6.847898599066044, "step": 20530 }, { "distill_loss": 0.2644978165626526, "epoch": 6.847898599066044, "step": 20530 }, { "epoch": 6.847898599066044, "ref_ce_loss": 0.13322116434574127, "step": 20530 }, { "epoch": 6.851234156104069, "loss": 0.6136, "step": 20540 }, { "epoch": 6.851234156104069, "grad_norm": 1.5028530359268188, "step": 20540 }, { "epoch": 6.851234156104069, "learning_rate": 0.000190595424275392, "step": 20540 }, { "epoch": 6.851234156104069, "loss": 0.5793891549110413, "step": 20540 }, { "ce_loss": 0.142436683177948, "epoch": 6.851234156104069, "step": 20540 }, { "distill_loss": 0.30143263936042786, "epoch": 6.851234156104069, "step": 20540 }, { "epoch": 6.851234156104069, "ref_ce_loss": 0.1353730857372284, "step": 20540 }, { "epoch": 6.851234156104069, "loss": 0.682435929775238, "step": 20540 }, { "ce_loss": 0.13634717464447021, "epoch": 6.851234156104069, "step": 20540 }, { "distill_loss": 0.32032471895217896, "epoch": 6.851234156104069, "step": 20540 }, { "epoch": 6.851234156104069, "ref_ce_loss": 0.13282988965511322, "step": 20540 }, { "epoch": 6.854569713142094, "loss": 0.6238, "step": 20550 }, { "epoch": 6.854569713142094, "grad_norm": 2.3763959407806396, "step": 20550 }, { "epoch": 6.854569713142094, "learning_rate": 0.00019022736284315256, "step": 20550 }, { "epoch": 6.854569713142094, "loss": 0.5390588045120239, "step": 20550 }, { "ce_loss": 0.13439245522022247, "epoch": 6.854569713142094, "step": 20550 }, { "distill_loss": 0.20202970504760742, "epoch": 6.854569713142094, "step": 20550 }, { "epoch": 6.854569713142094, "ref_ce_loss": 0.11675325036048889, "step": 20550 }, { "epoch": 6.854569713142094, "loss": 0.7126962542533875, "step": 20550 }, { "ce_loss": 0.10502856969833374, "epoch": 6.854569713142094, "step": 20550 }, { "distill_loss": 0.2715202569961548, "epoch": 6.854569713142094, "step": 20550 }, { "epoch": 6.854569713142094, "ref_ce_loss": 0.1214003637433052, "step": 20550 }, { "epoch": 6.85790527018012, "loss": 0.5758, "step": 20560 }, { "epoch": 6.85790527018012, "grad_norm": 1.0272542238235474, "step": 20560 }, { "epoch": 6.85790527018012, "learning_rate": 0.0001898595462381531, "step": 20560 }, { "epoch": 6.85790527018012, "loss": 0.4998115301132202, "step": 20560 }, { "ce_loss": 0.10326630622148514, "epoch": 6.85790527018012, "step": 20560 }, { "distill_loss": 0.23785142600536346, "epoch": 6.85790527018012, "step": 20560 }, { "epoch": 6.85790527018012, "ref_ce_loss": 0.10605143010616302, "step": 20560 }, { "epoch": 6.85790527018012, "loss": 0.5037967562675476, "step": 20560 }, { "ce_loss": 0.13062965869903564, "epoch": 6.85790527018012, "step": 20560 }, { "distill_loss": 0.2593288719654083, "epoch": 6.85790527018012, "step": 20560 }, { "epoch": 6.85790527018012, "ref_ce_loss": 0.09385646879673004, "step": 20560 }, { "epoch": 6.861240827218145, "loss": 0.5603, "step": 20570 }, { "epoch": 6.861240827218145, "grad_norm": 1.4330753087997437, "step": 20570 }, { "epoch": 6.861240827218145, "learning_rate": 0.0001894919748896753, "step": 20570 }, { "epoch": 6.861240827218145, "loss": 0.44807168841362, "step": 20570 }, { "ce_loss": 0.11219009011983871, "epoch": 6.861240827218145, "step": 20570 }, { "distill_loss": 0.21106593310832977, "epoch": 6.861240827218145, "step": 20570 }, { "epoch": 6.861240827218145, "ref_ce_loss": 0.10159025341272354, "step": 20570 }, { "epoch": 6.861240827218145, "loss": 0.6952852010726929, "step": 20570 }, { "ce_loss": 0.17087072134017944, "epoch": 6.861240827218145, "step": 20570 }, { "distill_loss": 0.30461642146110535, "epoch": 6.861240827218145, "step": 20570 }, { "epoch": 6.861240827218145, "ref_ce_loss": 0.1436925232410431, "step": 20570 }, { "epoch": 6.8645763842561704, "loss": 0.5903, "step": 20580 }, { "epoch": 6.8645763842561704, "grad_norm": 2.121004581451416, "step": 20580 }, { "epoch": 6.8645763842561704, "learning_rate": 0.00018912464922671434, "step": 20580 }, { "epoch": 6.8645763842561704, "loss": 0.6659446358680725, "step": 20580 }, { "ce_loss": 0.21983709931373596, "epoch": 6.8645763842561704, "step": 20580 }, { "distill_loss": 0.3302499055862427, "epoch": 6.8645763842561704, "step": 20580 }, { "epoch": 6.8645763842561704, "ref_ce_loss": 0.11565963923931122, "step": 20580 }, { "epoch": 6.8645763842561704, "loss": 0.5692784786224365, "step": 20580 }, { "ce_loss": 0.08923733234405518, "epoch": 6.8645763842561704, "step": 20580 }, { "distill_loss": 0.2060239613056183, "epoch": 6.8645763842561704, "step": 20580 }, { "epoch": 6.8645763842561704, "ref_ce_loss": 0.09318091720342636, "step": 20580 }, { "epoch": 6.867911941294196, "loss": 0.5868, "step": 20590 }, { "epoch": 6.867911941294196, "grad_norm": 1.8310197591781616, "step": 20590 }, { "epoch": 6.867911941294196, "learning_rate": 0.0001887575696779789, "step": 20590 }, { "epoch": 6.867911941294196, "loss": 1.1411585807800293, "step": 20590 }, { "ce_loss": 0.15374459326267242, "epoch": 6.867911941294196, "step": 20590 }, { "distill_loss": 0.2981603443622589, "epoch": 6.867911941294196, "step": 20590 }, { "epoch": 6.867911941294196, "ref_ce_loss": 0.1392323076725006, "step": 20590 }, { "epoch": 6.867911941294196, "loss": 0.4504973888397217, "step": 20590 }, { "ce_loss": 0.11747331917285919, "epoch": 6.867911941294196, "step": 20590 }, { "distill_loss": 0.20415905117988586, "epoch": 6.867911941294196, "step": 20590 }, { "epoch": 6.867911941294196, "ref_ce_loss": 0.10316745191812515, "step": 20590 }, { "epoch": 6.871247498332221, "loss": 0.6374, "step": 20600 }, { "epoch": 6.871247498332221, "grad_norm": 1.2335033416748047, "step": 20600 }, { "epoch": 6.871247498332221, "learning_rate": 0.00018839073667189021, "step": 20600 }, { "epoch": 6.871247498332221, "loss": 0.5924480557441711, "step": 20600 }, { "ce_loss": 0.15742860734462738, "epoch": 6.871247498332221, "step": 20600 }, { "distill_loss": 0.21921710669994354, "epoch": 6.871247498332221, "step": 20600 }, { "epoch": 6.871247498332221, "ref_ce_loss": 0.12174395471811295, "step": 20600 }, { "epoch": 6.871247498332221, "loss": 0.3247009217739105, "step": 20600 }, { "ce_loss": 0.07317493110895157, "epoch": 6.871247498332221, "step": 20600 }, { "distill_loss": 0.1657167673110962, "epoch": 6.871247498332221, "step": 20600 }, { "epoch": 6.871247498332221, "ref_ce_loss": 0.08565723896026611, "step": 20600 }, { "epoch": 6.8745830553702465, "loss": 0.5437, "step": 20610 }, { "epoch": 6.8745830553702465, "grad_norm": 2.104970932006836, "step": 20610 }, { "epoch": 6.8745830553702465, "learning_rate": 0.00018802415063658216, "step": 20610 }, { "epoch": 6.8745830553702465, "loss": 0.5424124002456665, "step": 20610 }, { "ce_loss": 0.16102339327335358, "epoch": 6.8745830553702465, "step": 20610 }, { "distill_loss": 0.2664029896259308, "epoch": 6.8745830553702465, "step": 20610 }, { "epoch": 6.8745830553702465, "ref_ce_loss": 0.08970746397972107, "step": 20610 }, { "epoch": 6.8745830553702465, "loss": 0.5977059006690979, "step": 20610 }, { "ce_loss": 0.0721786767244339, "epoch": 6.8745830553702465, "step": 20610 }, { "distill_loss": 0.20928135514259338, "epoch": 6.8745830553702465, "step": 20610 }, { "epoch": 6.8745830553702465, "ref_ce_loss": 0.09289656579494476, "step": 20610 }, { "epoch": 6.877918612408272, "loss": 0.6608, "step": 20620 }, { "epoch": 6.877918612408272, "grad_norm": 2.158698558807373, "step": 20620 }, { "epoch": 6.877918612408272, "learning_rate": 0.00018765781199989965, "step": 20620 }, { "epoch": 6.877918612408272, "loss": 0.8673412799835205, "step": 20620 }, { "ce_loss": 0.15912854671478271, "epoch": 6.877918612408272, "step": 20620 }, { "distill_loss": 0.2529142498970032, "epoch": 6.877918612408272, "step": 20620 }, { "epoch": 6.877918612408272, "ref_ce_loss": 0.14166447520256042, "step": 20620 }, { "epoch": 6.877918612408272, "loss": 0.6670170426368713, "step": 20620 }, { "ce_loss": 0.13092097640037537, "epoch": 6.877918612408272, "step": 20620 }, { "distill_loss": 0.2830592095851898, "epoch": 6.877918612408272, "step": 20620 }, { "epoch": 6.877918612408272, "ref_ce_loss": 0.08492620289325714, "step": 20620 }, { "epoch": 6.881254169446297, "loss": 0.6021, "step": 20630 }, { "epoch": 6.881254169446297, "grad_norm": 1.5774734020233154, "step": 20630 }, { "epoch": 6.881254169446297, "learning_rate": 0.0001872917211893995, "step": 20630 }, { "epoch": 6.881254169446297, "loss": 0.46152612566947937, "step": 20630 }, { "ce_loss": 0.1152229830622673, "epoch": 6.881254169446297, "step": 20630 }, { "distill_loss": 0.2160659283399582, "epoch": 6.881254169446297, "step": 20630 }, { "epoch": 6.881254169446297, "ref_ce_loss": 0.0975537896156311, "step": 20630 }, { "epoch": 6.881254169446297, "loss": 0.501051127910614, "step": 20630 }, { "ce_loss": 0.12634457647800446, "epoch": 6.881254169446297, "step": 20630 }, { "distill_loss": 0.19545955955982208, "epoch": 6.881254169446297, "step": 20630 }, { "epoch": 6.881254169446297, "ref_ce_loss": 0.10637293010950089, "step": 20630 }, { "epoch": 6.8845897264843225, "loss": 0.5485, "step": 20640 }, { "epoch": 6.8845897264843225, "grad_norm": 1.7691820859909058, "step": 20640 }, { "epoch": 6.8845897264843225, "learning_rate": 0.00018692587863234912, "step": 20640 }, { "epoch": 6.8845897264843225, "loss": 0.5000690817832947, "step": 20640 }, { "ce_loss": 0.11656507849693298, "epoch": 6.8845897264843225, "step": 20640 }, { "distill_loss": 0.2345747947692871, "epoch": 6.8845897264843225, "step": 20640 }, { "epoch": 6.8845897264843225, "ref_ce_loss": 0.11726991087198257, "step": 20640 }, { "epoch": 6.8845897264843225, "loss": 0.5075468420982361, "step": 20640 }, { "ce_loss": 0.10804903507232666, "epoch": 6.8845897264843225, "step": 20640 }, { "distill_loss": 0.27083054184913635, "epoch": 6.8845897264843225, "step": 20640 }, { "epoch": 6.8845897264843225, "ref_ce_loss": 0.12640805542469025, "step": 20640 }, { "epoch": 6.887925283522348, "loss": 0.6349, "step": 20650 }, { "epoch": 6.887925283522348, "grad_norm": 3.4797141551971436, "step": 20650 }, { "epoch": 6.887925283522348, "learning_rate": 0.00018656028475572622, "step": 20650 }, { "epoch": 6.887925283522348, "loss": 0.4004143476486206, "step": 20650 }, { "ce_loss": 0.06690140068531036, "epoch": 6.887925283522348, "step": 20650 }, { "distill_loss": 0.21979433298110962, "epoch": 6.887925283522348, "step": 20650 }, { "epoch": 6.887925283522348, "ref_ce_loss": 0.10764598846435547, "step": 20650 }, { "epoch": 6.887925283522348, "loss": 0.5797668099403381, "step": 20650 }, { "ce_loss": 0.052560314536094666, "epoch": 6.887925283522348, "step": 20650 }, { "distill_loss": 0.2451619654893875, "epoch": 6.887925283522348, "step": 20650 }, { "epoch": 6.887925283522348, "ref_ce_loss": 0.10148611664772034, "step": 20650 }, { "epoch": 6.891260840560373, "loss": 0.6285, "step": 20660 }, { "epoch": 6.891260840560373, "grad_norm": 1.2031223773956299, "step": 20660 }, { "epoch": 6.891260840560373, "learning_rate": 0.00018619493998621795, "step": 20660 }, { "epoch": 6.891260840560373, "loss": 0.8077090978622437, "step": 20660 }, { "ce_loss": 0.14289332926273346, "epoch": 6.891260840560373, "step": 20660 }, { "distill_loss": 0.23440909385681152, "epoch": 6.891260840560373, "step": 20660 }, { "epoch": 6.891260840560373, "ref_ce_loss": 0.14893965423107147, "step": 20660 }, { "epoch": 6.891260840560373, "loss": 0.5020773410797119, "step": 20660 }, { "ce_loss": 0.1051107794046402, "epoch": 6.891260840560373, "step": 20660 }, { "distill_loss": 0.28205716609954834, "epoch": 6.891260840560373, "step": 20660 }, { "epoch": 6.891260840560373, "ref_ce_loss": 0.11419624090194702, "step": 20660 }, { "epoch": 6.894596397598399, "loss": 0.6513, "step": 20670 }, { "epoch": 6.894596397598399, "grad_norm": 2.129122495651245, "step": 20670 }, { "epoch": 6.894596397598399, "learning_rate": 0.0001858298447502211, "step": 20670 }, { "epoch": 6.894596397598399, "loss": 0.5558404922485352, "step": 20670 }, { "ce_loss": 0.1453867107629776, "epoch": 6.894596397598399, "step": 20670 }, { "distill_loss": 0.2337738275527954, "epoch": 6.894596397598399, "step": 20670 }, { "epoch": 6.894596397598399, "ref_ce_loss": 0.12421823292970657, "step": 20670 }, { "epoch": 6.894596397598399, "loss": 0.5864662528038025, "step": 20670 }, { "ce_loss": 0.17098017036914825, "epoch": 6.894596397598399, "step": 20670 }, { "distill_loss": 0.267102986574173, "epoch": 6.894596397598399, "step": 20670 }, { "epoch": 6.894596397598399, "ref_ce_loss": 0.11548590660095215, "step": 20670 }, { "epoch": 6.897931954636424, "loss": 0.5988, "step": 20680 }, { "epoch": 6.897931954636424, "grad_norm": 1.1095306873321533, "step": 20680 }, { "epoch": 6.897931954636424, "learning_rate": 0.00018546499947384105, "step": 20680 }, { "epoch": 6.897931954636424, "loss": 0.4309590756893158, "step": 20680 }, { "ce_loss": 0.10610014945268631, "epoch": 6.897931954636424, "step": 20680 }, { "distill_loss": 0.21524930000305176, "epoch": 6.897931954636424, "step": 20680 }, { "epoch": 6.897931954636424, "ref_ce_loss": 0.08393006771802902, "step": 20680 }, { "epoch": 6.897931954636424, "loss": 0.5816032290458679, "step": 20680 }, { "ce_loss": 0.13894164562225342, "epoch": 6.897931954636424, "step": 20680 }, { "distill_loss": 0.3071152865886688, "epoch": 6.897931954636424, "step": 20680 }, { "epoch": 6.897931954636424, "ref_ce_loss": 0.13415858149528503, "step": 20680 }, { "epoch": 6.901267511674449, "loss": 0.6277, "step": 20690 }, { "epoch": 6.901267511674449, "grad_norm": 1.5599712133407593, "step": 20690 }, { "epoch": 6.901267511674449, "learning_rate": 0.00018510040458289155, "step": 20690 }, { "epoch": 6.901267511674449, "loss": 0.49218297004699707, "step": 20690 }, { "ce_loss": 0.1152329295873642, "epoch": 6.901267511674449, "step": 20690 }, { "distill_loss": 0.22641527652740479, "epoch": 6.901267511674449, "step": 20690 }, { "epoch": 6.901267511674449, "ref_ce_loss": 0.11307185143232346, "step": 20690 }, { "epoch": 6.901267511674449, "loss": 0.4263184070587158, "step": 20690 }, { "ce_loss": 0.09234665334224701, "epoch": 6.901267511674449, "step": 20690 }, { "distill_loss": 0.21400585770606995, "epoch": 6.901267511674449, "step": 20690 }, { "epoch": 6.901267511674449, "ref_ce_loss": 0.08838541060686111, "step": 20690 }, { "epoch": 6.904603068712475, "loss": 0.6255, "step": 20700 }, { "epoch": 6.904603068712475, "grad_norm": 1.3064284324645996, "step": 20700 }, { "epoch": 6.904603068712475, "learning_rate": 0.00018473606050289405, "step": 20700 }, { "epoch": 6.904603068712475, "loss": 0.8105974197387695, "step": 20700 }, { "ce_loss": 0.23844876885414124, "epoch": 6.904603068712475, "step": 20700 }, { "distill_loss": 0.3396758735179901, "epoch": 6.904603068712475, "step": 20700 }, { "epoch": 6.904603068712475, "ref_ce_loss": 0.23215480148792267, "step": 20700 }, { "epoch": 6.904603068712475, "loss": 0.5795097351074219, "step": 20700 }, { "ce_loss": 0.11340656131505966, "epoch": 6.904603068712475, "step": 20700 }, { "distill_loss": 0.29568004608154297, "epoch": 6.904603068712475, "step": 20700 }, { "epoch": 6.904603068712475, "ref_ce_loss": 0.11840888112783432, "step": 20700 }, { "epoch": 6.9079386257505, "loss": 0.6162, "step": 20710 }, { "epoch": 6.9079386257505, "grad_norm": 1.2173100709915161, "step": 20710 }, { "epoch": 6.9079386257505, "learning_rate": 0.00018437196765907728, "step": 20710 }, { "epoch": 6.9079386257505, "loss": 0.7617021799087524, "step": 20710 }, { "ce_loss": 0.17053262889385223, "epoch": 6.9079386257505, "step": 20710 }, { "distill_loss": 0.3001687824726105, "epoch": 6.9079386257505, "step": 20710 }, { "epoch": 6.9079386257505, "ref_ce_loss": 0.17243169248104095, "step": 20710 }, { "epoch": 6.9079386257505, "loss": 0.641623854637146, "step": 20710 }, { "ce_loss": 0.1281440109014511, "epoch": 6.9079386257505, "step": 20710 }, { "distill_loss": 0.320505827665329, "epoch": 6.9079386257505, "step": 20710 }, { "epoch": 6.9079386257505, "ref_ce_loss": 0.11534615606069565, "step": 20710 }, { "epoch": 6.911274182788525, "loss": 0.6241, "step": 20720 }, { "epoch": 6.911274182788525, "grad_norm": 1.6815094947814941, "step": 20720 }, { "epoch": 6.911274182788525, "learning_rate": 0.00018400812647637697, "step": 20720 }, { "epoch": 6.911274182788525, "loss": 0.9845396280288696, "step": 20720 }, { "ce_loss": 0.07850717008113861, "epoch": 6.911274182788525, "step": 20720 }, { "distill_loss": 0.21867156028747559, "epoch": 6.911274182788525, "step": 20720 }, { "epoch": 6.911274182788525, "ref_ce_loss": 0.11635127663612366, "step": 20720 }, { "epoch": 6.911274182788525, "loss": 0.4374261498451233, "step": 20720 }, { "ce_loss": 0.06454581767320633, "epoch": 6.911274182788525, "step": 20720 }, { "distill_loss": 0.268862783908844, "epoch": 6.911274182788525, "step": 20720 }, { "epoch": 6.911274182788525, "ref_ce_loss": 0.10372491180896759, "step": 20720 }, { "epoch": 6.914609739826551, "loss": 0.5813, "step": 20730 }, { "epoch": 6.914609739826551, "grad_norm": 1.5123430490493774, "step": 20730 }, { "epoch": 6.914609739826551, "learning_rate": 0.0001836445373794346, "step": 20730 }, { "epoch": 6.914609739826551, "loss": 0.4976940453052521, "step": 20730 }, { "ce_loss": 0.12870489060878754, "epoch": 6.914609739826551, "step": 20730 }, { "distill_loss": 0.23885779082775116, "epoch": 6.914609739826551, "step": 20730 }, { "epoch": 6.914609739826551, "ref_ce_loss": 0.129926860332489, "step": 20730 }, { "epoch": 6.914609739826551, "loss": 0.671566903591156, "step": 20730 }, { "ce_loss": 0.1277875006198883, "epoch": 6.914609739826551, "step": 20730 }, { "distill_loss": 0.22397693991661072, "epoch": 6.914609739826551, "step": 20730 }, { "epoch": 6.914609739826551, "ref_ce_loss": 0.12168483436107635, "step": 20730 }, { "epoch": 6.917945296864576, "loss": 0.5684, "step": 20740 }, { "epoch": 6.917945296864576, "grad_norm": 3.491680383682251, "step": 20740 }, { "epoch": 6.917945296864576, "learning_rate": 0.00018328120079259792, "step": 20740 }, { "epoch": 6.917945296864576, "loss": 0.5865914225578308, "step": 20740 }, { "ce_loss": 0.16594484448432922, "epoch": 6.917945296864576, "step": 20740 }, { "distill_loss": 0.2859644889831543, "epoch": 6.917945296864576, "step": 20740 }, { "epoch": 6.917945296864576, "ref_ce_loss": 0.13439278304576874, "step": 20740 }, { "epoch": 6.917945296864576, "loss": 0.26961272954940796, "step": 20740 }, { "ce_loss": 0.038990482687950134, "epoch": 6.917945296864576, "step": 20740 }, { "distill_loss": 0.16177760064601898, "epoch": 6.917945296864576, "step": 20740 }, { "epoch": 6.917945296864576, "ref_ce_loss": 0.06862502545118332, "step": 20740 }, { "epoch": 6.921280853902601, "loss": 0.5856, "step": 20750 }, { "epoch": 6.921280853902601, "grad_norm": 1.7987585067749023, "step": 20750 }, { "epoch": 6.921280853902601, "learning_rate": 0.00018291811713991982, "step": 20750 }, { "epoch": 6.921280853902601, "loss": 0.7716285586357117, "step": 20750 }, { "ce_loss": 0.18918243050575256, "epoch": 6.921280853902601, "step": 20750 }, { "distill_loss": 0.26490822434425354, "epoch": 6.921280853902601, "step": 20750 }, { "epoch": 6.921280853902601, "ref_ce_loss": 0.13002419471740723, "step": 20750 }, { "epoch": 6.921280853902601, "loss": 0.44953399896621704, "step": 20750 }, { "ce_loss": 0.07777289301156998, "epoch": 6.921280853902601, "step": 20750 }, { "distill_loss": 0.2360752373933792, "epoch": 6.921280853902601, "step": 20750 }, { "epoch": 6.921280853902601, "ref_ce_loss": 0.1030561625957489, "step": 20750 }, { "epoch": 6.924616410940627, "loss": 0.5994, "step": 20760 }, { "epoch": 6.924616410940627, "grad_norm": 1.4929414987564087, "step": 20760 }, { "epoch": 6.924616410940627, "learning_rate": 0.00018255528684515816, "step": 20760 }, { "epoch": 6.924616410940627, "loss": 0.6236853003501892, "step": 20760 }, { "ce_loss": 0.14601117372512817, "epoch": 6.924616410940627, "step": 20760 }, { "distill_loss": 0.2984391450881958, "epoch": 6.924616410940627, "step": 20760 }, { "epoch": 6.924616410940627, "ref_ce_loss": 0.12102136015892029, "step": 20760 }, { "epoch": 6.924616410940627, "loss": 0.8715800046920776, "step": 20760 }, { "ce_loss": 0.132913276553154, "epoch": 6.924616410940627, "step": 20760 }, { "distill_loss": 0.3128644526004791, "epoch": 6.924616410940627, "step": 20760 }, { "epoch": 6.924616410940627, "ref_ce_loss": 0.1070551872253418, "step": 20760 }, { "epoch": 6.927951967978652, "loss": 0.5638, "step": 20770 }, { "epoch": 6.927951967978652, "grad_norm": 2.015686511993408, "step": 20770 }, { "epoch": 6.927951967978652, "learning_rate": 0.0001821927103317746, "step": 20770 }, { "epoch": 6.927951967978652, "loss": 0.723768413066864, "step": 20770 }, { "ce_loss": 0.17608648538589478, "epoch": 6.927951967978652, "step": 20770 }, { "distill_loss": 0.37144795060157776, "epoch": 6.927951967978652, "step": 20770 }, { "epoch": 6.927951967978652, "ref_ce_loss": 0.11942605674266815, "step": 20770 }, { "epoch": 6.927951967978652, "loss": 0.5449997782707214, "step": 20770 }, { "ce_loss": 0.09751887619495392, "epoch": 6.927951967978652, "step": 20770 }, { "distill_loss": 0.2715526819229126, "epoch": 6.927951967978652, "step": 20770 }, { "epoch": 6.927951967978652, "ref_ce_loss": 0.09283023327589035, "step": 20770 }, { "epoch": 6.931287525016677, "loss": 0.6042, "step": 20780 }, { "epoch": 6.931287525016677, "grad_norm": 1.518445611000061, "step": 20780 }, { "epoch": 6.931287525016677, "learning_rate": 0.0001818303880229351, "step": 20780 }, { "epoch": 6.931287525016677, "loss": 0.7221246957778931, "step": 20780 }, { "ce_loss": 0.16267530620098114, "epoch": 6.931287525016677, "step": 20780 }, { "distill_loss": 0.3028196394443512, "epoch": 6.931287525016677, "step": 20780 }, { "epoch": 6.931287525016677, "ref_ce_loss": 0.1369720995426178, "step": 20780 }, { "epoch": 6.931287525016677, "loss": 0.6272754073143005, "step": 20780 }, { "ce_loss": 0.1474248319864273, "epoch": 6.931287525016677, "step": 20780 }, { "distill_loss": 0.3028441071510315, "epoch": 6.931287525016677, "step": 20780 }, { "epoch": 6.931287525016677, "ref_ce_loss": 0.1394912451505661, "step": 20780 }, { "epoch": 6.934623082054703, "loss": 0.6315, "step": 20790 }, { "epoch": 6.934623082054703, "grad_norm": 2.4681501388549805, "step": 20790 }, { "epoch": 6.934623082054703, "learning_rate": 0.00018146832034150867, "step": 20790 }, { "epoch": 6.934623082054703, "loss": 0.5065264105796814, "step": 20790 }, { "ce_loss": 0.09136957675218582, "epoch": 6.934623082054703, "step": 20790 }, { "distill_loss": 0.2777387499809265, "epoch": 6.934623082054703, "step": 20790 }, { "epoch": 6.934623082054703, "ref_ce_loss": 0.0956275686621666, "step": 20790 }, { "epoch": 6.934623082054703, "loss": 0.5836489200592041, "step": 20790 }, { "ce_loss": 0.11539237946271896, "epoch": 6.934623082054703, "step": 20790 }, { "distill_loss": 0.27783769369125366, "epoch": 6.934623082054703, "step": 20790 }, { "epoch": 6.934623082054703, "ref_ce_loss": 0.11264175921678543, "step": 20790 }, { "epoch": 6.937958639092728, "loss": 0.634, "step": 20800 }, { "epoch": 6.937958639092728, "grad_norm": 2.4432549476623535, "step": 20800 }, { "epoch": 6.937958639092728, "learning_rate": 0.00018110650771006772, "step": 20800 }, { "epoch": 6.937958639092728, "loss": 0.6803032159805298, "step": 20800 }, { "ce_loss": 0.1700230836868286, "epoch": 6.937958639092728, "step": 20800 }, { "distill_loss": 0.3196053206920624, "epoch": 6.937958639092728, "step": 20800 }, { "epoch": 6.937958639092728, "ref_ce_loss": 0.14488141238689423, "step": 20800 }, { "epoch": 6.937958639092728, "loss": 0.4583851993083954, "step": 20800 }, { "ce_loss": 0.1260090172290802, "epoch": 6.937958639092728, "step": 20800 }, { "distill_loss": 0.2324456125497818, "epoch": 6.937958639092728, "step": 20800 }, { "epoch": 6.937958639092728, "ref_ce_loss": 0.09967635571956635, "step": 20800 }, { "epoch": 6.9412941961307535, "loss": 0.5952, "step": 20810 }, { "epoch": 6.9412941961307535, "grad_norm": 2.0505409240722656, "step": 20810 }, { "epoch": 6.9412941961307535, "learning_rate": 0.00018074495055088598, "step": 20810 }, { "epoch": 6.9412941961307535, "loss": 1.0997909307479858, "step": 20810 }, { "ce_loss": 0.10792729258537292, "epoch": 6.9412941961307535, "step": 20810 }, { "distill_loss": 0.28397682309150696, "epoch": 6.9412941961307535, "step": 20810 }, { "epoch": 6.9412941961307535, "ref_ce_loss": 0.10799378156661987, "step": 20810 }, { "epoch": 6.9412941961307535, "loss": 0.6416740417480469, "step": 20810 }, { "ce_loss": 0.10900291800498962, "epoch": 6.9412941961307535, "step": 20810 }, { "distill_loss": 0.2863513231277466, "epoch": 6.9412941961307535, "step": 20810 }, { "epoch": 6.9412941961307535, "ref_ce_loss": 0.11546456813812256, "step": 20810 }, { "epoch": 6.944629753168779, "loss": 0.6465, "step": 20820 }, { "epoch": 6.944629753168779, "grad_norm": 3.0398612022399902, "step": 20820 }, { "epoch": 6.944629753168779, "learning_rate": 0.0001803836492859398, "step": 20820 }, { "epoch": 6.944629753168779, "loss": 0.6857156753540039, "step": 20820 }, { "ce_loss": 0.13258329033851624, "epoch": 6.944629753168779, "step": 20820 }, { "distill_loss": 0.24844446778297424, "epoch": 6.944629753168779, "step": 20820 }, { "epoch": 6.944629753168779, "ref_ce_loss": 0.11770449578762054, "step": 20820 }, { "epoch": 6.944629753168779, "loss": 0.7357656359672546, "step": 20820 }, { "ce_loss": 0.19352516531944275, "epoch": 6.944629753168779, "step": 20820 }, { "distill_loss": 0.30092155933380127, "epoch": 6.944629753168779, "step": 20820 }, { "epoch": 6.944629753168779, "ref_ce_loss": 0.13199712336063385, "step": 20820 }, { "epoch": 6.947965310206804, "loss": 0.595, "step": 20830 }, { "epoch": 6.947965310206804, "grad_norm": 1.429013729095459, "step": 20830 }, { "epoch": 6.947965310206804, "learning_rate": 0.00018002260433690656, "step": 20830 }, { "epoch": 6.947965310206804, "loss": 0.5082059502601624, "step": 20830 }, { "ce_loss": 0.12579958140850067, "epoch": 6.947965310206804, "step": 20830 }, { "distill_loss": 0.23692786693572998, "epoch": 6.947965310206804, "step": 20830 }, { "epoch": 6.947965310206804, "ref_ce_loss": 0.1452646404504776, "step": 20830 }, { "epoch": 6.947965310206804, "loss": 0.5477200746536255, "step": 20830 }, { "ce_loss": 0.08218104392290115, "epoch": 6.947965310206804, "step": 20830 }, { "distill_loss": 0.23850448429584503, "epoch": 6.947965310206804, "step": 20830 }, { "epoch": 6.947965310206804, "ref_ce_loss": 0.10290637612342834, "step": 20830 }, { "epoch": 6.9513008672448295, "loss": 0.5805, "step": 20840 }, { "epoch": 6.9513008672448295, "grad_norm": 2.0708327293395996, "step": 20840 }, { "epoch": 6.9513008672448295, "learning_rate": 0.00017966181612516478, "step": 20840 }, { "epoch": 6.9513008672448295, "loss": 0.5284128785133362, "step": 20840 }, { "ce_loss": 0.13292750716209412, "epoch": 6.9513008672448295, "step": 20840 }, { "distill_loss": 0.254790335893631, "epoch": 6.9513008672448295, "step": 20840 }, { "epoch": 6.9513008672448295, "ref_ce_loss": 0.11771418154239655, "step": 20840 }, { "epoch": 6.9513008672448295, "loss": 0.5159513354301453, "step": 20840 }, { "ce_loss": 0.10002075135707855, "epoch": 6.9513008672448295, "step": 20840 }, { "distill_loss": 0.22777974605560303, "epoch": 6.9513008672448295, "step": 20840 }, { "epoch": 6.9513008672448295, "ref_ce_loss": 0.10233756899833679, "step": 20840 }, { "epoch": 6.954636424282855, "loss": 0.5243, "step": 20850 }, { "epoch": 6.954636424282855, "grad_norm": 1.4802135229110718, "step": 20850 }, { "epoch": 6.954636424282855, "learning_rate": 0.00017930128507179281, "step": 20850 }, { "epoch": 6.954636424282855, "loss": 0.605171799659729, "step": 20850 }, { "ce_loss": 0.16204921901226044, "epoch": 6.954636424282855, "step": 20850 }, { "distill_loss": 0.31319138407707214, "epoch": 6.954636424282855, "step": 20850 }, { "epoch": 6.954636424282855, "ref_ce_loss": 0.09180094301700592, "step": 20850 }, { "epoch": 6.954636424282855, "loss": 0.4061069190502167, "step": 20850 }, { "ce_loss": 0.1162717342376709, "epoch": 6.954636424282855, "step": 20850 }, { "distill_loss": 0.17830415070056915, "epoch": 6.954636424282855, "step": 20850 }, { "epoch": 6.954636424282855, "ref_ce_loss": 0.08810706436634064, "step": 20850 }, { "epoch": 6.95797198132088, "loss": 0.5665, "step": 20860 }, { "epoch": 6.95797198132088, "grad_norm": 1.2751325368881226, "step": 20860 }, { "epoch": 6.95797198132088, "learning_rate": 0.00017894101159756932, "step": 20860 }, { "epoch": 6.95797198132088, "loss": 1.0285208225250244, "step": 20860 }, { "ce_loss": 0.2166115641593933, "epoch": 6.95797198132088, "step": 20860 }, { "distill_loss": 0.24041934311389923, "epoch": 6.95797198132088, "step": 20860 }, { "epoch": 6.95797198132088, "ref_ce_loss": 0.19319362938404083, "step": 20860 }, { "epoch": 6.95797198132088, "loss": 0.6077567934989929, "step": 20860 }, { "ce_loss": 0.18720628321170807, "epoch": 6.95797198132088, "step": 20860 }, { "distill_loss": 0.28821271657943726, "epoch": 6.95797198132088, "step": 20860 }, { "epoch": 6.95797198132088, "ref_ce_loss": 0.13216865062713623, "step": 20860 }, { "epoch": 6.961307538358906, "loss": 0.6149, "step": 20870 }, { "epoch": 6.961307538358906, "grad_norm": 1.9878860712051392, "step": 20870 }, { "epoch": 6.961307538358906, "learning_rate": 0.00017858099612297226, "step": 20870 }, { "epoch": 6.961307538358906, "loss": 0.7590935230255127, "step": 20870 }, { "ce_loss": 0.14209294319152832, "epoch": 6.961307538358906, "step": 20870 }, { "distill_loss": 0.2697787284851074, "epoch": 6.961307538358906, "step": 20870 }, { "epoch": 6.961307538358906, "ref_ce_loss": 0.12165077030658722, "step": 20870 }, { "epoch": 6.961307538358906, "loss": 0.7105188965797424, "step": 20870 }, { "ce_loss": 0.13031932711601257, "epoch": 6.961307538358906, "step": 20870 }, { "distill_loss": 0.2726278007030487, "epoch": 6.961307538358906, "step": 20870 }, { "epoch": 6.961307538358906, "ref_ce_loss": 0.14885155856609344, "step": 20870 }, { "epoch": 6.964643095396931, "loss": 0.5828, "step": 20880 }, { "epoch": 6.964643095396931, "grad_norm": 1.4370341300964355, "step": 20880 }, { "epoch": 6.964643095396931, "learning_rate": 0.00017822123906817848, "step": 20880 }, { "epoch": 6.964643095396931, "loss": 0.5326735377311707, "step": 20880 }, { "ce_loss": 0.14173398911952972, "epoch": 6.964643095396931, "step": 20880 }, { "distill_loss": 0.29813286662101746, "epoch": 6.964643095396931, "step": 20880 }, { "epoch": 6.964643095396931, "ref_ce_loss": 0.09259048104286194, "step": 20880 }, { "epoch": 6.964643095396931, "loss": 0.5765227675437927, "step": 20880 }, { "ce_loss": 0.15944123268127441, "epoch": 6.964643095396931, "step": 20880 }, { "distill_loss": 0.268841028213501, "epoch": 6.964643095396931, "step": 20880 }, { "epoch": 6.964643095396931, "ref_ce_loss": 0.11185889691114426, "step": 20880 }, { "epoch": 6.967978652434956, "loss": 0.602, "step": 20890 }, { "epoch": 6.967978652434956, "grad_norm": 2.715527057647705, "step": 20890 }, { "epoch": 6.967978652434956, "learning_rate": 0.000177861740853063, "step": 20890 }, { "epoch": 6.967978652434956, "loss": 0.7061032056808472, "step": 20890 }, { "ce_loss": 0.14459113776683807, "epoch": 6.967978652434956, "step": 20890 }, { "distill_loss": 0.2508234977722168, "epoch": 6.967978652434956, "step": 20890 }, { "epoch": 6.967978652434956, "ref_ce_loss": 0.14969336986541748, "step": 20890 }, { "epoch": 6.967978652434956, "loss": 0.6149274706840515, "step": 20890 }, { "ce_loss": 0.12304043024778366, "epoch": 6.967978652434956, "step": 20890 }, { "distill_loss": 0.2720663845539093, "epoch": 6.967978652434956, "step": 20890 }, { "epoch": 6.967978652434956, "ref_ce_loss": 0.11349066346883774, "step": 20890 }, { "epoch": 6.971314209472982, "loss": 0.5916, "step": 20900 }, { "epoch": 6.971314209472982, "grad_norm": 1.2655351161956787, "step": 20900 }, { "epoch": 6.971314209472982, "learning_rate": 0.00017750250189719883, "step": 20900 }, { "epoch": 6.971314209472982, "loss": 0.5163251161575317, "step": 20900 }, { "ce_loss": 0.14423733949661255, "epoch": 6.971314209472982, "step": 20900 }, { "distill_loss": 0.21587130427360535, "epoch": 6.971314209472982, "step": 20900 }, { "epoch": 6.971314209472982, "ref_ce_loss": 0.10627759993076324, "step": 20900 }, { "epoch": 6.971314209472982, "loss": 0.5793828964233398, "step": 20900 }, { "ce_loss": 0.09142829477787018, "epoch": 6.971314209472982, "step": 20900 }, { "distill_loss": 0.2316550761461258, "epoch": 6.971314209472982, "step": 20900 }, { "epoch": 6.971314209472982, "ref_ce_loss": 0.12005052715539932, "step": 20900 }, { "epoch": 6.974649766511007, "loss": 0.6028, "step": 20910 }, { "epoch": 6.974649766511007, "grad_norm": 2.137049674987793, "step": 20910 }, { "epoch": 6.974649766511007, "learning_rate": 0.00017714352261985697, "step": 20910 }, { "epoch": 6.974649766511007, "loss": 0.710256814956665, "step": 20910 }, { "ce_loss": 0.17917829751968384, "epoch": 6.974649766511007, "step": 20910 }, { "distill_loss": 0.2954549789428711, "epoch": 6.974649766511007, "step": 20910 }, { "epoch": 6.974649766511007, "ref_ce_loss": 0.13196998834609985, "step": 20910 }, { "epoch": 6.974649766511007, "loss": 0.5921691656112671, "step": 20910 }, { "ce_loss": 0.1159336194396019, "epoch": 6.974649766511007, "step": 20910 }, { "distill_loss": 0.2698439061641693, "epoch": 6.974649766511007, "step": 20910 }, { "epoch": 6.974649766511007, "ref_ce_loss": 0.11621291190385818, "step": 20910 }, { "epoch": 6.977985323549032, "loss": 0.5851, "step": 20920 }, { "epoch": 6.977985323549032, "grad_norm": 4.465473175048828, "step": 20920 }, { "epoch": 6.977985323549032, "learning_rate": 0.00017678480344000442, "step": 20920 }, { "epoch": 6.977985323549032, "loss": 0.7718786001205444, "step": 20920 }, { "ce_loss": 0.10429341346025467, "epoch": 6.977985323549032, "step": 20920 }, { "distill_loss": 0.2350814789533615, "epoch": 6.977985323549032, "step": 20920 }, { "epoch": 6.977985323549032, "ref_ce_loss": 0.11928503215312958, "step": 20920 }, { "epoch": 6.977985323549032, "loss": 0.5180379152297974, "step": 20920 }, { "ce_loss": 0.11934104561805725, "epoch": 6.977985323549032, "step": 20920 }, { "distill_loss": 0.26697003841400146, "epoch": 6.977985323549032, "step": 20920 }, { "epoch": 6.977985323549032, "ref_ce_loss": 0.10038395971059799, "step": 20920 }, { "epoch": 6.981320880587058, "loss": 0.5868, "step": 20930 }, { "epoch": 6.981320880587058, "grad_norm": 1.331379771232605, "step": 20930 }, { "epoch": 6.981320880587058, "learning_rate": 0.00017642634477630517, "step": 20930 }, { "epoch": 6.981320880587058, "loss": 0.5460637211799622, "step": 20930 }, { "ce_loss": 0.1347649246454239, "epoch": 6.981320880587058, "step": 20930 }, { "distill_loss": 0.28631535172462463, "epoch": 6.981320880587058, "step": 20930 }, { "epoch": 6.981320880587058, "ref_ce_loss": 0.12491101026535034, "step": 20930 }, { "epoch": 6.981320880587058, "loss": 0.7150649428367615, "step": 20930 }, { "ce_loss": 0.16883759200572968, "epoch": 6.981320880587058, "step": 20930 }, { "distill_loss": 0.31282228231430054, "epoch": 6.981320880587058, "step": 20930 }, { "epoch": 6.981320880587058, "ref_ce_loss": 0.17268231511116028, "step": 20930 }, { "epoch": 6.984656437625083, "loss": 0.5762, "step": 20940 }, { "epoch": 6.984656437625083, "grad_norm": 1.295901894569397, "step": 20940 }, { "epoch": 6.984656437625083, "learning_rate": 0.00017606814704711915, "step": 20940 }, { "epoch": 6.984656437625083, "loss": 0.6643977761268616, "step": 20940 }, { "ce_loss": 0.1359083652496338, "epoch": 6.984656437625083, "step": 20940 }, { "distill_loss": 0.23353464901447296, "epoch": 6.984656437625083, "step": 20940 }, { "epoch": 6.984656437625083, "ref_ce_loss": 0.11746269464492798, "step": 20940 }, { "epoch": 6.984656437625083, "loss": 0.6325167417526245, "step": 20940 }, { "ce_loss": 0.14522510766983032, "epoch": 6.984656437625083, "step": 20940 }, { "distill_loss": 0.21115204691886902, "epoch": 6.984656437625083, "step": 20940 }, { "epoch": 6.984656437625083, "ref_ce_loss": 0.09273610264062881, "step": 20940 }, { "epoch": 6.987991994663108, "loss": 0.5873, "step": 20950 }, { "epoch": 6.987991994663108, "grad_norm": 2.0272459983825684, "step": 20950 }, { "epoch": 6.987991994663108, "learning_rate": 0.00017571021067050153, "step": 20950 }, { "epoch": 6.987991994663108, "loss": 0.6666278839111328, "step": 20950 }, { "ce_loss": 0.14961141347885132, "epoch": 6.987991994663108, "step": 20950 }, { "distill_loss": 0.2823046147823334, "epoch": 6.987991994663108, "step": 20950 }, { "epoch": 6.987991994663108, "ref_ce_loss": 0.1209111213684082, "step": 20950 }, { "epoch": 6.987991994663108, "loss": 0.5868027806282043, "step": 20950 }, { "ce_loss": 0.14440715312957764, "epoch": 6.987991994663108, "step": 20950 }, { "distill_loss": 0.23055055737495422, "epoch": 6.987991994663108, "step": 20950 }, { "epoch": 6.987991994663108, "ref_ce_loss": 0.1070292592048645, "step": 20950 }, { "epoch": 6.991327551701134, "loss": 0.5565, "step": 20960 }, { "epoch": 6.991327551701134, "grad_norm": 1.3845330476760864, "step": 20960 }, { "epoch": 6.991327551701134, "learning_rate": 0.0001753525360642028, "step": 20960 }, { "epoch": 6.991327551701134, "loss": 0.7456711530685425, "step": 20960 }, { "ce_loss": 0.15054984390735626, "epoch": 6.991327551701134, "step": 20960 }, { "distill_loss": 0.32618165016174316, "epoch": 6.991327551701134, "step": 20960 }, { "epoch": 6.991327551701134, "ref_ce_loss": 0.12339648604393005, "step": 20960 }, { "epoch": 6.991327551701134, "loss": 0.5007907748222351, "step": 20960 }, { "ce_loss": 0.14651557803153992, "epoch": 6.991327551701134, "step": 20960 }, { "distill_loss": 0.2281855344772339, "epoch": 6.991327551701134, "step": 20960 }, { "epoch": 6.991327551701134, "ref_ce_loss": 0.11004862934350967, "step": 20960 }, { "epoch": 6.994663108739159, "loss": 0.6319, "step": 20970 }, { "epoch": 6.994663108739159, "grad_norm": 1.6815906763076782, "step": 20970 }, { "epoch": 6.994663108739159, "learning_rate": 0.0001749951236456674, "step": 20970 }, { "epoch": 6.994663108739159, "loss": 0.5305715799331665, "step": 20970 }, { "ce_loss": 0.08651381731033325, "epoch": 6.994663108739159, "step": 20970 }, { "distill_loss": 0.24971520900726318, "epoch": 6.994663108739159, "step": 20970 }, { "epoch": 6.994663108739159, "ref_ce_loss": 0.10385092347860336, "step": 20970 }, { "epoch": 6.994663108739159, "loss": 0.6306924223899841, "step": 20970 }, { "ce_loss": 0.08177436143159866, "epoch": 6.994663108739159, "step": 20970 }, { "distill_loss": 0.32261404395103455, "epoch": 6.994663108739159, "step": 20970 }, { "epoch": 6.994663108739159, "ref_ce_loss": 0.13266070187091827, "step": 20970 }, { "epoch": 6.997998665777184, "loss": 0.5823, "step": 20980 }, { "epoch": 6.997998665777184, "grad_norm": 2.4300663471221924, "step": 20980 }, { "epoch": 6.997998665777184, "learning_rate": 0.00017463797383203425, "step": 20980 }, { "epoch": 6.997998665777184, "loss": 0.5562842488288879, "step": 20980 }, { "ce_loss": 0.12637288868427277, "epoch": 6.997998665777184, "step": 20980 }, { "distill_loss": 0.27352625131607056, "epoch": 6.997998665777184, "step": 20980 }, { "epoch": 6.997998665777184, "ref_ce_loss": 0.10983546078205109, "step": 20980 }, { "epoch": 6.997998665777184, "loss": 0.5764535069465637, "step": 20980 }, { "ce_loss": 0.151265949010849, "epoch": 6.997998665777184, "step": 20980 }, { "distill_loss": 0.28327447175979614, "epoch": 6.997998665777184, "step": 20980 }, { "epoch": 6.997998665777184, "ref_ce_loss": 0.11406426131725311, "step": 20980 }, { "epoch": 7.00133422281521, "loss": 0.5262, "step": 20990 }, { "epoch": 7.00133422281521, "grad_norm": 1.4760485887527466, "step": 20990 }, { "epoch": 7.00133422281521, "learning_rate": 0.0001742810870401356, "step": 20990 }, { "epoch": 7.00133422281521, "loss": 0.41110774874687195, "step": 20990 }, { "ce_loss": 0.08459983021020889, "epoch": 7.00133422281521, "step": 20990 }, { "distill_loss": 0.23713694512844086, "epoch": 7.00133422281521, "step": 20990 }, { "epoch": 7.00133422281521, "ref_ce_loss": 0.08919335901737213, "step": 20990 }, { "epoch": 7.00133422281521, "loss": 0.8198530673980713, "step": 20990 }, { "ce_loss": 0.17153599858283997, "epoch": 7.00133422281521, "step": 20990 }, { "distill_loss": 0.24566666781902313, "epoch": 7.00133422281521, "step": 20990 }, { "epoch": 7.00133422281521, "ref_ce_loss": 0.12967519462108612, "step": 20990 }, { "epoch": 7.004669779853235, "loss": 0.5345, "step": 21000 }, { "epoch": 7.004669779853235, "grad_norm": 1.178972601890564, "step": 21000 }, { "epoch": 7.004669779853235, "learning_rate": 0.00017392446368649686, "step": 21000 }, { "epoch": 7.004669779853235, "loss": 0.4565522372722626, "step": 21000 }, { "ce_loss": 0.08238189667463303, "epoch": 7.004669779853235, "step": 21000 }, { "distill_loss": 0.2576136887073517, "epoch": 7.004669779853235, "step": 21000 }, { "epoch": 7.004669779853235, "ref_ce_loss": 0.07807490974664688, "step": 21000 }, { "epoch": 7.004669779853235, "loss": 0.3339233100414276, "step": 21000 }, { "ce_loss": 0.05025802552700043, "epoch": 7.004669779853235, "step": 21000 }, { "distill_loss": 0.17898201942443848, "epoch": 7.004669779853235, "step": 21000 }, { "epoch": 7.004669779853235, "ref_ce_loss": 0.06952040642499924, "step": 21000 }, { "epoch": 7.0080053368912605, "loss": 0.5173, "step": 21010 }, { "epoch": 7.0080053368912605, "grad_norm": 1.0856938362121582, "step": 21010 }, { "epoch": 7.0080053368912605, "learning_rate": 0.00017356810418733547, "step": 21010 }, { "epoch": 7.0080053368912605, "loss": 0.4113653302192688, "step": 21010 }, { "ce_loss": 0.07443442195653915, "epoch": 7.0080053368912605, "step": 21010 }, { "distill_loss": 0.20666775107383728, "epoch": 7.0080053368912605, "step": 21010 }, { "epoch": 7.0080053368912605, "ref_ce_loss": 0.09216611832380295, "step": 21010 }, { "epoch": 7.0080053368912605, "loss": 0.41373708844184875, "step": 21010 }, { "ce_loss": 0.07116129994392395, "epoch": 7.0080053368912605, "step": 21010 }, { "distill_loss": 0.21457628905773163, "epoch": 7.0080053368912605, "step": 21010 }, { "epoch": 7.0080053368912605, "ref_ce_loss": 0.1037018895149231, "step": 21010 }, { "epoch": 7.011340893929286, "loss": 0.5155, "step": 21020 }, { "epoch": 7.011340893929286, "grad_norm": 1.611261248588562, "step": 21020 }, { "epoch": 7.011340893929286, "learning_rate": 0.00017321200895856168, "step": 21020 }, { "epoch": 7.011340893929286, "loss": 0.49294358491897583, "step": 21020 }, { "ce_loss": 0.11892196536064148, "epoch": 7.011340893929286, "step": 21020 }, { "distill_loss": 0.1827605664730072, "epoch": 7.011340893929286, "step": 21020 }, { "epoch": 7.011340893929286, "ref_ce_loss": 0.11067811399698257, "step": 21020 }, { "epoch": 7.011340893929286, "loss": 0.5166816711425781, "step": 21020 }, { "ce_loss": 0.07782851159572601, "epoch": 7.011340893929286, "step": 21020 }, { "distill_loss": 0.2171994149684906, "epoch": 7.011340893929286, "step": 21020 }, { "epoch": 7.011340893929286, "ref_ce_loss": 0.0989086925983429, "step": 21020 }, { "epoch": 7.014676450967311, "loss": 0.5386, "step": 21030 }, { "epoch": 7.014676450967311, "grad_norm": 1.1420142650604248, "step": 21030 }, { "epoch": 7.014676450967311, "learning_rate": 0.00017285617841577704, "step": 21030 }, { "epoch": 7.014676450967311, "loss": 0.5629929304122925, "step": 21030 }, { "ce_loss": 0.11045968532562256, "epoch": 7.014676450967311, "step": 21030 }, { "distill_loss": 0.23228420317173004, "epoch": 7.014676450967311, "step": 21030 }, { "epoch": 7.014676450967311, "ref_ce_loss": 0.10397525131702423, "step": 21030 }, { "epoch": 7.014676450967311, "loss": 0.44841301441192627, "step": 21030 }, { "ce_loss": 0.09309506416320801, "epoch": 7.014676450967311, "step": 21030 }, { "distill_loss": 0.21713057160377502, "epoch": 7.014676450967311, "step": 21030 }, { "epoch": 7.014676450967311, "ref_ce_loss": 0.08877697587013245, "step": 21030 }, { "epoch": 7.0180120080053365, "loss": 0.5425, "step": 21040 }, { "epoch": 7.0180120080053365, "grad_norm": 1.5629714727401733, "step": 21040 }, { "epoch": 7.0180120080053365, "learning_rate": 0.00017250061297427368, "step": 21040 }, { "epoch": 7.0180120080053365, "loss": 0.38156411051750183, "step": 21040 }, { "ce_loss": 0.06925290822982788, "epoch": 7.0180120080053365, "step": 21040 }, { "distill_loss": 0.20131511986255646, "epoch": 7.0180120080053365, "step": 21040 }, { "epoch": 7.0180120080053365, "ref_ce_loss": 0.09016577154397964, "step": 21040 }, { "epoch": 7.0180120080053365, "loss": 0.6269943714141846, "step": 21040 }, { "ce_loss": 0.05716466158628464, "epoch": 7.0180120080053365, "step": 21040 }, { "distill_loss": 0.18947528302669525, "epoch": 7.0180120080053365, "step": 21040 }, { "epoch": 7.0180120080053365, "ref_ce_loss": 0.10615312308073044, "step": 21040 }, { "epoch": 7.021347565043362, "loss": 0.5463, "step": 21050 }, { "epoch": 7.021347565043362, "grad_norm": 1.1643590927124023, "step": 21050 }, { "epoch": 7.021347565043362, "learning_rate": 0.00017214531304903492, "step": 21050 }, { "epoch": 7.021347565043362, "loss": 0.49292388558387756, "step": 21050 }, { "ce_loss": 0.09436366707086563, "epoch": 7.021347565043362, "step": 21050 }, { "distill_loss": 0.2636721730232239, "epoch": 7.021347565043362, "step": 21050 }, { "epoch": 7.021347565043362, "ref_ce_loss": 0.1069205105304718, "step": 21050 }, { "epoch": 7.021347565043362, "loss": 0.5487685203552246, "step": 21050 }, { "ce_loss": 0.11184797435998917, "epoch": 7.021347565043362, "step": 21050 }, { "distill_loss": 0.2645128667354584, "epoch": 7.021347565043362, "step": 21050 }, { "epoch": 7.021347565043362, "ref_ce_loss": 0.09122076630592346, "step": 21050 }, { "epoch": 7.024683122081387, "loss": 0.5474, "step": 21060 }, { "epoch": 7.024683122081387, "grad_norm": 3.8690576553344727, "step": 21060 }, { "epoch": 7.024683122081387, "learning_rate": 0.00017179027905473403, "step": 21060 }, { "epoch": 7.024683122081387, "loss": 0.39867842197418213, "step": 21060 }, { "ce_loss": 0.042192067950963974, "epoch": 7.024683122081387, "step": 21060 }, { "distill_loss": 0.2422800064086914, "epoch": 7.024683122081387, "step": 21060 }, { "epoch": 7.024683122081387, "ref_ce_loss": 0.09038354456424713, "step": 21060 }, { "epoch": 7.024683122081387, "loss": 0.4865795373916626, "step": 21060 }, { "ce_loss": 0.07612650096416473, "epoch": 7.024683122081387, "step": 21060 }, { "distill_loss": 0.2698541283607483, "epoch": 7.024683122081387, "step": 21060 }, { "epoch": 7.024683122081387, "ref_ce_loss": 0.10052892565727234, "step": 21060 }, { "epoch": 7.028018679119413, "loss": 0.5082, "step": 21070 }, { "epoch": 7.028018679119413, "grad_norm": 1.5303665399551392, "step": 21070 }, { "epoch": 7.028018679119413, "learning_rate": 0.000171435511405734, "step": 21070 }, { "epoch": 7.028018679119413, "loss": 0.33546188473701477, "step": 21070 }, { "ce_loss": 0.06674647331237793, "epoch": 7.028018679119413, "step": 21070 }, { "distill_loss": 0.1889113336801529, "epoch": 7.028018679119413, "step": 21070 }, { "epoch": 7.028018679119413, "ref_ce_loss": 0.07956317812204361, "step": 21070 }, { "epoch": 7.028018679119413, "loss": 0.5631605982780457, "step": 21070 }, { "ce_loss": 0.15716561675071716, "epoch": 7.028018679119413, "step": 21070 }, { "distill_loss": 0.23613443970680237, "epoch": 7.028018679119413, "step": 21070 }, { "epoch": 7.028018679119413, "ref_ce_loss": 0.11336501687765121, "step": 21070 }, { "epoch": 7.031354236157438, "loss": 0.5358, "step": 21080 }, { "epoch": 7.031354236157438, "grad_norm": 1.5098323822021484, "step": 21080 }, { "epoch": 7.031354236157438, "learning_rate": 0.00017108101051608657, "step": 21080 }, { "epoch": 7.031354236157438, "loss": 0.3946211338043213, "step": 21080 }, { "ce_loss": 0.0674547627568245, "epoch": 7.031354236157438, "step": 21080 }, { "distill_loss": 0.1942397654056549, "epoch": 7.031354236157438, "step": 21080 }, { "epoch": 7.031354236157438, "ref_ce_loss": 0.08900465071201324, "step": 21080 }, { "epoch": 7.031354236157438, "loss": 0.7322803735733032, "step": 21080 }, { "ce_loss": 0.09472820162773132, "epoch": 7.031354236157438, "step": 21080 }, { "distill_loss": 0.2562478184700012, "epoch": 7.031354236157438, "step": 21080 }, { "epoch": 7.031354236157438, "ref_ce_loss": 0.09331964701414108, "step": 21080 }, { "epoch": 7.034689793195463, "loss": 0.5493, "step": 21090 }, { "epoch": 7.034689793195463, "grad_norm": 1.0239431858062744, "step": 21090 }, { "epoch": 7.034689793195463, "learning_rate": 0.0001707267767995326, "step": 21090 }, { "epoch": 7.034689793195463, "loss": 0.43437060713768005, "step": 21090 }, { "ce_loss": 0.08299209922552109, "epoch": 7.034689793195463, "step": 21090 }, { "distill_loss": 0.22612957656383514, "epoch": 7.034689793195463, "step": 21090 }, { "epoch": 7.034689793195463, "ref_ce_loss": 0.1250525861978531, "step": 21090 }, { "epoch": 7.034689793195463, "loss": 0.48301953077316284, "step": 21090 }, { "ce_loss": 0.07930684089660645, "epoch": 7.034689793195463, "step": 21090 }, { "distill_loss": 0.2962947189807892, "epoch": 7.034689793195463, "step": 21090 }, { "epoch": 7.034689793195463, "ref_ce_loss": 0.1071903184056282, "step": 21090 }, { "epoch": 7.038025350233489, "loss": 0.5005, "step": 21100 }, { "epoch": 7.038025350233489, "grad_norm": 1.2620716094970703, "step": 21100 }, { "epoch": 7.038025350233489, "learning_rate": 0.0001703728106695009, "step": 21100 }, { "epoch": 7.038025350233489, "loss": 0.5874612331390381, "step": 21100 }, { "ce_loss": 0.14442382752895355, "epoch": 7.038025350233489, "step": 21100 }, { "distill_loss": 0.25857293605804443, "epoch": 7.038025350233489, "step": 21100 }, { "epoch": 7.038025350233489, "ref_ce_loss": 0.1242932677268982, "step": 21100 }, { "epoch": 7.038025350233489, "loss": 0.30933573842048645, "step": 21100 }, { "ce_loss": 0.038306642323732376, "epoch": 7.038025350233489, "step": 21100 }, { "distill_loss": 0.17048147320747375, "epoch": 7.038025350233489, "step": 21100 }, { "epoch": 7.038025350233489, "ref_ce_loss": 0.03882153332233429, "step": 21100 }, { "epoch": 7.041360907271514, "loss": 0.4931, "step": 21110 }, { "epoch": 7.041360907271514, "grad_norm": 1.7004448175430298, "step": 21110 }, { "epoch": 7.041360907271514, "learning_rate": 0.00017001911253910817, "step": 21110 }, { "epoch": 7.041360907271514, "loss": 0.491072416305542, "step": 21110 }, { "ce_loss": 0.07500729709863663, "epoch": 7.041360907271514, "step": 21110 }, { "distill_loss": 0.23993931710720062, "epoch": 7.041360907271514, "step": 21110 }, { "epoch": 7.041360907271514, "ref_ce_loss": 0.07017786055803299, "step": 21110 }, { "epoch": 7.041360907271514, "loss": 0.5245043635368347, "step": 21110 }, { "ce_loss": 0.09140478074550629, "epoch": 7.041360907271514, "step": 21110 }, { "distill_loss": 0.19429965317249298, "epoch": 7.041360907271514, "step": 21110 }, { "epoch": 7.041360907271514, "ref_ce_loss": 0.0974140390753746, "step": 21110 }, { "epoch": 7.044696464309539, "loss": 0.4823, "step": 21120 }, { "epoch": 7.044696464309539, "grad_norm": 1.2850415706634521, "step": 21120 }, { "epoch": 7.044696464309539, "learning_rate": 0.00016966568282115785, "step": 21120 }, { "epoch": 7.044696464309539, "loss": 0.5977675318717957, "step": 21120 }, { "ce_loss": 0.1628226935863495, "epoch": 7.044696464309539, "step": 21120 }, { "distill_loss": 0.2581752836704254, "epoch": 7.044696464309539, "step": 21120 }, { "epoch": 7.044696464309539, "ref_ce_loss": 0.13995656371116638, "step": 21120 }, { "epoch": 7.044696464309539, "loss": 0.5749611854553223, "step": 21120 }, { "ce_loss": 0.1332138031721115, "epoch": 7.044696464309539, "step": 21120 }, { "distill_loss": 0.277914434671402, "epoch": 7.044696464309539, "step": 21120 }, { "epoch": 7.044696464309539, "ref_ce_loss": 0.12948699295520782, "step": 21120 }, { "epoch": 7.048032021347565, "loss": 0.5306, "step": 21130 }, { "epoch": 7.048032021347565, "grad_norm": 1.5037014484405518, "step": 21130 }, { "epoch": 7.048032021347565, "learning_rate": 0.0001693125219281408, "step": 21130 }, { "epoch": 7.048032021347565, "loss": 0.4484368562698364, "step": 21130 }, { "ce_loss": 0.07722995430231094, "epoch": 7.048032021347565, "step": 21130 }, { "distill_loss": 0.19566670060157776, "epoch": 7.048032021347565, "step": 21130 }, { "epoch": 7.048032021347565, "ref_ce_loss": 0.10668282955884933, "step": 21130 }, { "epoch": 7.048032021347565, "loss": 0.36792391538619995, "step": 21130 }, { "ce_loss": 0.08133158087730408, "epoch": 7.048032021347565, "step": 21130 }, { "distill_loss": 0.20554950833320618, "epoch": 7.048032021347565, "step": 21130 }, { "epoch": 7.048032021347565, "ref_ce_loss": 0.061777785420417786, "step": 21130 }, { "epoch": 7.05136757838559, "loss": 0.5248, "step": 21140 }, { "epoch": 7.05136757838559, "grad_norm": 3.220675468444824, "step": 21140 }, { "epoch": 7.05136757838559, "learning_rate": 0.00016895963027223365, "step": 21140 }, { "epoch": 7.05136757838559, "loss": 0.3645973801612854, "step": 21140 }, { "ce_loss": 0.09136815369129181, "epoch": 7.05136757838559, "step": 21140 }, { "distill_loss": 0.1671840101480484, "epoch": 7.05136757838559, "step": 21140 }, { "epoch": 7.05136757838559, "ref_ce_loss": 0.10586903989315033, "step": 21140 }, { "epoch": 7.05136757838559, "loss": 0.45609939098358154, "step": 21140 }, { "ce_loss": 0.06002714857459068, "epoch": 7.05136757838559, "step": 21140 }, { "distill_loss": 0.19941852986812592, "epoch": 7.05136757838559, "step": 21140 }, { "epoch": 7.05136757838559, "ref_ce_loss": 0.11288649588823318, "step": 21140 }, { "epoch": 7.054703135423615, "loss": 0.5007, "step": 21150 }, { "epoch": 7.054703135423615, "grad_norm": 1.32329261302948, "step": 21150 }, { "epoch": 7.054703135423615, "learning_rate": 0.00016860700826529907, "step": 21150 }, { "epoch": 7.054703135423615, "loss": 0.5643146634101868, "step": 21150 }, { "ce_loss": 0.1046956479549408, "epoch": 7.054703135423615, "step": 21150 }, { "distill_loss": 0.24031804502010345, "epoch": 7.054703135423615, "step": 21150 }, { "epoch": 7.054703135423615, "ref_ce_loss": 0.10297542810440063, "step": 21150 }, { "epoch": 7.054703135423615, "loss": 0.2995739281177521, "step": 21150 }, { "ce_loss": 0.040338970720767975, "epoch": 7.054703135423615, "step": 21150 }, { "distill_loss": 0.1610741913318634, "epoch": 7.054703135423615, "step": 21150 }, { "epoch": 7.054703135423615, "ref_ce_loss": 0.06947105377912521, "step": 21150 }, { "epoch": 7.058038692461641, "loss": 0.5235, "step": 21160 }, { "epoch": 7.058038692461641, "grad_norm": 1.9927266836166382, "step": 21160 }, { "epoch": 7.058038692461641, "learning_rate": 0.0001682546563188846, "step": 21160 }, { "epoch": 7.058038692461641, "loss": 0.3663199245929718, "step": 21160 }, { "ce_loss": 0.06230630725622177, "epoch": 7.058038692461641, "step": 21160 }, { "distill_loss": 0.18640950322151184, "epoch": 7.058038692461641, "step": 21160 }, { "epoch": 7.058038692461641, "ref_ce_loss": 0.08341743797063828, "step": 21160 }, { "epoch": 7.058038692461641, "loss": 0.43763241171836853, "step": 21160 }, { "ce_loss": 0.08718358725309372, "epoch": 7.058038692461641, "step": 21160 }, { "distill_loss": 0.22233569622039795, "epoch": 7.058038692461641, "step": 21160 }, { "epoch": 7.058038692461641, "ref_ce_loss": 0.10211227089166641, "step": 21160 }, { "epoch": 7.061374249499666, "loss": 0.5244, "step": 21170 }, { "epoch": 7.061374249499666, "grad_norm": 4.421433925628662, "step": 21170 }, { "epoch": 7.061374249499666, "learning_rate": 0.0001679025748442231, "step": 21170 }, { "epoch": 7.061374249499666, "loss": 0.4997464120388031, "step": 21170 }, { "ce_loss": 0.10188456624746323, "epoch": 7.061374249499666, "step": 21170 }, { "distill_loss": 0.23916609585285187, "epoch": 7.061374249499666, "step": 21170 }, { "epoch": 7.061374249499666, "ref_ce_loss": 0.11447103321552277, "step": 21170 }, { "epoch": 7.061374249499666, "loss": 0.5505445003509521, "step": 21170 }, { "ce_loss": 0.1000547856092453, "epoch": 7.061374249499666, "step": 21170 }, { "distill_loss": 0.2162446230649948, "epoch": 7.061374249499666, "step": 21170 }, { "epoch": 7.061374249499666, "ref_ce_loss": 0.09203381836414337, "step": 21170 }, { "epoch": 7.064709806537691, "loss": 0.5328, "step": 21180 }, { "epoch": 7.064709806537691, "grad_norm": 2.3827109336853027, "step": 21180 }, { "epoch": 7.064709806537691, "learning_rate": 0.00016755076425223147, "step": 21180 }, { "epoch": 7.064709806537691, "loss": 0.43425941467285156, "step": 21180 }, { "ce_loss": 0.12137731909751892, "epoch": 7.064709806537691, "step": 21180 }, { "distill_loss": 0.22484569251537323, "epoch": 7.064709806537691, "step": 21180 }, { "epoch": 7.064709806537691, "ref_ce_loss": 0.08779873698949814, "step": 21180 }, { "epoch": 7.064709806537691, "loss": 0.4549023509025574, "step": 21180 }, { "ce_loss": 0.07701903581619263, "epoch": 7.064709806537691, "step": 21180 }, { "distill_loss": 0.22629885375499725, "epoch": 7.064709806537691, "step": 21180 }, { "epoch": 7.064709806537691, "ref_ce_loss": 0.08000829070806503, "step": 21180 }, { "epoch": 7.068045363575717, "loss": 0.4869, "step": 21190 }, { "epoch": 7.068045363575717, "grad_norm": 1.7620294094085693, "step": 21190 }, { "epoch": 7.068045363575717, "learning_rate": 0.00016719922495351064, "step": 21190 }, { "epoch": 7.068045363575717, "loss": 0.42543548345565796, "step": 21190 }, { "ce_loss": 0.09694749861955643, "epoch": 7.068045363575717, "step": 21190 }, { "distill_loss": 0.20173221826553345, "epoch": 7.068045363575717, "step": 21190 }, { "epoch": 7.068045363575717, "ref_ce_loss": 0.09933895617723465, "step": 21190 }, { "epoch": 7.068045363575717, "loss": 0.6733165979385376, "step": 21190 }, { "ce_loss": 0.11281838268041611, "epoch": 7.068045363575717, "step": 21190 }, { "distill_loss": 0.21210214495658875, "epoch": 7.068045363575717, "step": 21190 }, { "epoch": 7.068045363575717, "ref_ce_loss": 0.08334226161241531, "step": 21190 }, { "epoch": 7.071380920613742, "loss": 0.5431, "step": 21200 }, { "epoch": 7.071380920613742, "grad_norm": 1.6558399200439453, "step": 21200 }, { "epoch": 7.071380920613742, "learning_rate": 0.00016684795735834453, "step": 21200 }, { "epoch": 7.071380920613742, "loss": 0.41487324237823486, "step": 21200 }, { "ce_loss": 0.07864386588335037, "epoch": 7.071380920613742, "step": 21200 }, { "distill_loss": 0.21175822615623474, "epoch": 7.071380920613742, "step": 21200 }, { "epoch": 7.071380920613742, "ref_ce_loss": 0.0927249863743782, "step": 21200 }, { "epoch": 7.071380920613742, "loss": 0.33858710527420044, "step": 21200 }, { "ce_loss": 0.06142342835664749, "epoch": 7.071380920613742, "step": 21200 }, { "distill_loss": 0.1634354591369629, "epoch": 7.071380920613742, "step": 21200 }, { "epoch": 7.071380920613742, "ref_ce_loss": 0.08492116630077362, "step": 21200 }, { "epoch": 7.0747164776517675, "loss": 0.4871, "step": 21210 }, { "epoch": 7.0747164776517675, "grad_norm": 1.2901017665863037, "step": 21210 }, { "epoch": 7.0747164776517675, "learning_rate": 0.00016649696187670041, "step": 21210 }, { "epoch": 7.0747164776517675, "loss": 0.3924500644207001, "step": 21210 }, { "ce_loss": 0.07660618424415588, "epoch": 7.0747164776517675, "step": 21210 }, { "distill_loss": 0.20859317481517792, "epoch": 7.0747164776517675, "step": 21210 }, { "epoch": 7.0747164776517675, "ref_ce_loss": 0.07732430845499039, "step": 21210 }, { "epoch": 7.0747164776517675, "loss": 0.5552083253860474, "step": 21210 }, { "ce_loss": 0.0491691492497921, "epoch": 7.0747164776517675, "step": 21210 }, { "distill_loss": 0.18049979209899902, "epoch": 7.0747164776517675, "step": 21210 }, { "epoch": 7.0747164776517675, "ref_ce_loss": 0.07998926937580109, "step": 21210 }, { "epoch": 7.078052034689793, "loss": 0.4609, "step": 21220 }, { "epoch": 7.078052034689793, "grad_norm": 1.6316123008728027, "step": 21220 }, { "epoch": 7.078052034689793, "learning_rate": 0.00016614623891822778, "step": 21220 }, { "epoch": 7.078052034689793, "loss": 0.5180302262306213, "step": 21220 }, { "ce_loss": 0.11370652168989182, "epoch": 7.078052034689793, "step": 21220 }, { "distill_loss": 0.23079824447631836, "epoch": 7.078052034689793, "step": 21220 }, { "epoch": 7.078052034689793, "ref_ce_loss": 0.11144647747278214, "step": 21220 }, { "epoch": 7.078052034689793, "loss": 0.4807301163673401, "step": 21220 }, { "ce_loss": 0.12959855794906616, "epoch": 7.078052034689793, "step": 21220 }, { "distill_loss": 0.21934184432029724, "epoch": 7.078052034689793, "step": 21220 }, { "epoch": 7.078052034689793, "ref_ce_loss": 0.10569732636213303, "step": 21220 }, { "epoch": 7.081387591727818, "loss": 0.4896, "step": 21230 }, { "epoch": 7.081387591727818, "grad_norm": 1.6368104219436646, "step": 21230 }, { "epoch": 7.081387591727818, "learning_rate": 0.00016579578889225796, "step": 21230 }, { "epoch": 7.081387591727818, "loss": 0.3786579668521881, "step": 21230 }, { "ce_loss": 0.09192191064357758, "epoch": 7.081387591727818, "step": 21230 }, { "distill_loss": 0.20896805822849274, "epoch": 7.081387591727818, "step": 21230 }, { "epoch": 7.081387591727818, "ref_ce_loss": 0.07722792774438858, "step": 21230 }, { "epoch": 7.081387591727818, "loss": 0.34697994589805603, "step": 21230 }, { "ce_loss": 0.06574396044015884, "epoch": 7.081387591727818, "step": 21230 }, { "distill_loss": 0.1986175775527954, "epoch": 7.081387591727818, "step": 21230 }, { "epoch": 7.081387591727818, "ref_ce_loss": 0.08240870386362076, "step": 21230 }, { "epoch": 7.0847231487658435, "loss": 0.4941, "step": 21240 }, { "epoch": 7.0847231487658435, "grad_norm": 38.46177291870117, "step": 21240 }, { "epoch": 7.0847231487658435, "learning_rate": 0.000165445612207804, "step": 21240 }, { "epoch": 7.0847231487658435, "loss": 0.4953051507472992, "step": 21240 }, { "ce_loss": 0.14665032923221588, "epoch": 7.0847231487658435, "step": 21240 }, { "distill_loss": 0.25713422894477844, "epoch": 7.0847231487658435, "step": 21240 }, { "epoch": 7.0847231487658435, "ref_ce_loss": 0.09130918979644775, "step": 21240 }, { "epoch": 7.0847231487658435, "loss": 0.4350004494190216, "step": 21240 }, { "ce_loss": 0.05916459113359451, "epoch": 7.0847231487658435, "step": 21240 }, { "distill_loss": 0.2253180742263794, "epoch": 7.0847231487658435, "step": 21240 }, { "epoch": 7.0847231487658435, "ref_ce_loss": 0.1087227463722229, "step": 21240 }, { "epoch": 7.088058705803869, "loss": 0.4739, "step": 21250 }, { "epoch": 7.088058705803869, "grad_norm": 1.1803487539291382, "step": 21250 }, { "epoch": 7.088058705803869, "learning_rate": 0.00016509570927355962, "step": 21250 }, { "epoch": 7.088058705803869, "loss": 0.45185840129852295, "step": 21250 }, { "ce_loss": 0.04442691057920456, "epoch": 7.088058705803869, "step": 21250 }, { "distill_loss": 0.16830293834209442, "epoch": 7.088058705803869, "step": 21250 }, { "epoch": 7.088058705803869, "ref_ce_loss": 0.06186540797352791, "step": 21250 }, { "epoch": 7.088058705803869, "loss": 0.4900377094745636, "step": 21250 }, { "ce_loss": 0.11686450242996216, "epoch": 7.088058705803869, "step": 21250 }, { "distill_loss": 0.25501832365989685, "epoch": 7.088058705803869, "step": 21250 }, { "epoch": 7.088058705803869, "ref_ce_loss": 0.11018063873052597, "step": 21250 }, { "epoch": 7.091394262841894, "loss": 0.4635, "step": 21260 }, { "epoch": 7.091394262841894, "grad_norm": 1.046940803527832, "step": 21260 }, { "epoch": 7.091394262841894, "learning_rate": 0.00016474608049789943, "step": 21260 }, { "epoch": 7.091394262841894, "loss": 0.35495126247406006, "step": 21260 }, { "ce_loss": 0.0532936193048954, "epoch": 7.091394262841894, "step": 21260 }, { "distill_loss": 0.1743641197681427, "epoch": 7.091394262841894, "step": 21260 }, { "epoch": 7.091394262841894, "ref_ce_loss": 0.09107319265604019, "step": 21260 }, { "epoch": 7.091394262841894, "loss": 0.43149006366729736, "step": 21260 }, { "ce_loss": 0.0692746564745903, "epoch": 7.091394262841894, "step": 21260 }, { "distill_loss": 0.22161293029785156, "epoch": 7.091394262841894, "step": 21260 }, { "epoch": 7.091394262841894, "ref_ce_loss": 0.0617385059595108, "step": 21260 }, { "epoch": 7.09472981987992, "loss": 0.49, "step": 21270 }, { "epoch": 7.09472981987992, "grad_norm": 1.3952606916427612, "step": 21270 }, { "epoch": 7.09472981987992, "learning_rate": 0.00016439672628887757, "step": 21270 }, { "epoch": 7.09472981987992, "loss": 0.6341660022735596, "step": 21270 }, { "ce_loss": 0.15768156945705414, "epoch": 7.09472981987992, "step": 21270 }, { "distill_loss": 0.27653753757476807, "epoch": 7.09472981987992, "step": 21270 }, { "epoch": 7.09472981987992, "ref_ce_loss": 0.10778310894966125, "step": 21270 }, { "epoch": 7.09472981987992, "loss": 0.42467135190963745, "step": 21270 }, { "ce_loss": 0.07942281663417816, "epoch": 7.09472981987992, "step": 21270 }, { "distill_loss": 0.23953743278980255, "epoch": 7.09472981987992, "step": 21270 }, { "epoch": 7.09472981987992, "ref_ce_loss": 0.08827744424343109, "step": 21270 }, { "epoch": 7.098065376917945, "loss": 0.6013, "step": 21280 }, { "epoch": 7.098065376917945, "grad_norm": 1.2766448259353638, "step": 21280 }, { "epoch": 7.098065376917945, "learning_rate": 0.00016404764705422802, "step": 21280 }, { "epoch": 7.098065376917945, "loss": 0.4827376902103424, "step": 21280 }, { "ce_loss": 0.06714701652526855, "epoch": 7.098065376917945, "step": 21280 }, { "distill_loss": 0.29545265436172485, "epoch": 7.098065376917945, "step": 21280 }, { "epoch": 7.098065376917945, "ref_ce_loss": 0.08229733258485794, "step": 21280 }, { "epoch": 7.098065376917945, "loss": 0.6882094144821167, "step": 21280 }, { "ce_loss": 0.08233946561813354, "epoch": 7.098065376917945, "step": 21280 }, { "distill_loss": 0.2522861659526825, "epoch": 7.098065376917945, "step": 21280 }, { "epoch": 7.098065376917945, "ref_ce_loss": 0.09695421904325485, "step": 21280 }, { "epoch": 7.10140093395597, "loss": 0.5352, "step": 21290 }, { "epoch": 7.10140093395597, "grad_norm": 0.9868097901344299, "step": 21290 }, { "epoch": 7.10140093395597, "learning_rate": 0.00016369884320136392, "step": 21290 }, { "epoch": 7.10140093395597, "loss": 1.093414545059204, "step": 21290 }, { "ce_loss": 0.10553121566772461, "epoch": 7.10140093395597, "step": 21290 }, { "distill_loss": 0.28743916749954224, "epoch": 7.10140093395597, "step": 21290 }, { "epoch": 7.10140093395597, "ref_ce_loss": 0.11427765339612961, "step": 21290 }, { "epoch": 7.10140093395597, "loss": 0.5632927417755127, "step": 21290 }, { "ce_loss": 0.07510565966367722, "epoch": 7.10140093395597, "step": 21290 }, { "distill_loss": 0.2500094473361969, "epoch": 7.10140093395597, "step": 21290 }, { "epoch": 7.10140093395597, "ref_ce_loss": 0.07853087037801743, "step": 21290 }, { "epoch": 7.104736490993996, "loss": 0.5351, "step": 21300 }, { "epoch": 7.104736490993996, "grad_norm": 1.2402269840240479, "step": 21300 }, { "epoch": 7.104736490993996, "learning_rate": 0.00016335031513737687, "step": 21300 }, { "epoch": 7.104736490993996, "loss": 0.43117618560791016, "step": 21300 }, { "ce_loss": 0.08685576915740967, "epoch": 7.104736490993996, "step": 21300 }, { "distill_loss": 0.26075318455696106, "epoch": 7.104736490993996, "step": 21300 }, { "epoch": 7.104736490993996, "ref_ce_loss": 0.08343677967786789, "step": 21300 }, { "epoch": 7.104736490993996, "loss": 0.48218634724617004, "step": 21300 }, { "ce_loss": 0.0882478728890419, "epoch": 7.104736490993996, "step": 21300 }, { "distill_loss": 0.2749446928501129, "epoch": 7.104736490993996, "step": 21300 }, { "epoch": 7.104736490993996, "ref_ce_loss": 0.08692453801631927, "step": 21300 }, { "epoch": 7.108072048032021, "loss": 0.4891, "step": 21310 }, { "epoch": 7.108072048032021, "grad_norm": 1.0379635095596313, "step": 21310 }, { "epoch": 7.108072048032021, "learning_rate": 0.00016300206326903672, "step": 21310 }, { "epoch": 7.108072048032021, "loss": 0.5212631225585938, "step": 21310 }, { "ce_loss": 0.1073252409696579, "epoch": 7.108072048032021, "step": 21310 }, { "distill_loss": 0.27759629487991333, "epoch": 7.108072048032021, "step": 21310 }, { "epoch": 7.108072048032021, "ref_ce_loss": 0.09776793420314789, "step": 21310 }, { "epoch": 7.108072048032021, "loss": 0.4921794533729553, "step": 21310 }, { "ce_loss": 0.10279887169599533, "epoch": 7.108072048032021, "step": 21310 }, { "distill_loss": 0.2897863984107971, "epoch": 7.108072048032021, "step": 21310 }, { "epoch": 7.108072048032021, "ref_ce_loss": 0.09936974197626114, "step": 21310 }, { "epoch": 7.111407605070046, "loss": 0.5527, "step": 21320 }, { "epoch": 7.111407605070046, "grad_norm": 2.3853776454925537, "step": 21320 }, { "epoch": 7.111407605070046, "learning_rate": 0.0001626540880027907, "step": 21320 }, { "epoch": 7.111407605070046, "loss": 0.3984506130218506, "step": 21320 }, { "ce_loss": 0.06048259884119034, "epoch": 7.111407605070046, "step": 21320 }, { "distill_loss": 0.2146771401166916, "epoch": 7.111407605070046, "step": 21320 }, { "epoch": 7.111407605070046, "ref_ce_loss": 0.07563339173793793, "step": 21320 }, { "epoch": 7.111407605070046, "loss": 0.5447241067886353, "step": 21320 }, { "ce_loss": 0.07636505365371704, "epoch": 7.111407605070046, "step": 21320 }, { "distill_loss": 0.22990357875823975, "epoch": 7.111407605070046, "step": 21320 }, { "epoch": 7.111407605070046, "ref_ce_loss": 0.09303893148899078, "step": 21320 }, { "epoch": 7.114743162108072, "loss": 0.5337, "step": 21330 }, { "epoch": 7.114743162108072, "grad_norm": 1.1696761846542358, "step": 21330 }, { "epoch": 7.114743162108072, "learning_rate": 0.00016230638974476337, "step": 21330 }, { "epoch": 7.114743162108072, "loss": 0.5816092491149902, "step": 21330 }, { "ce_loss": 0.11028946191072464, "epoch": 7.114743162108072, "step": 21330 }, { "distill_loss": 0.21065345406532288, "epoch": 7.114743162108072, "step": 21330 }, { "epoch": 7.114743162108072, "ref_ce_loss": 0.07422541826963425, "step": 21330 }, { "epoch": 7.114743162108072, "loss": 0.4654453992843628, "step": 21330 }, { "ce_loss": 0.08083511143922806, "epoch": 7.114743162108072, "step": 21330 }, { "distill_loss": 0.25947773456573486, "epoch": 7.114743162108072, "step": 21330 }, { "epoch": 7.114743162108072, "ref_ce_loss": 0.09762498736381531, "step": 21330 }, { "epoch": 7.118078719146097, "loss": 0.5283, "step": 21340 }, { "epoch": 7.118078719146097, "grad_norm": 1.145909070968628, "step": 21340 }, { "epoch": 7.118078719146097, "learning_rate": 0.00016195896890075617, "step": 21340 }, { "epoch": 7.118078719146097, "loss": 0.9589320421218872, "step": 21340 }, { "ce_loss": 0.08689472824335098, "epoch": 7.118078719146097, "step": 21340 }, { "distill_loss": 0.294956773519516, "epoch": 7.118078719146097, "step": 21340 }, { "epoch": 7.118078719146097, "ref_ce_loss": 0.09442543238401413, "step": 21340 }, { "epoch": 7.118078719146097, "loss": 0.4515564441680908, "step": 21340 }, { "ce_loss": 0.1072135865688324, "epoch": 7.118078719146097, "step": 21340 }, { "distill_loss": 0.25195497274398804, "epoch": 7.118078719146097, "step": 21340 }, { "epoch": 7.118078719146097, "ref_ce_loss": 0.09227581322193146, "step": 21340 }, { "epoch": 7.121414276184122, "loss": 0.5224, "step": 21350 }, { "epoch": 7.121414276184122, "grad_norm": 1.4755406379699707, "step": 21350 }, { "epoch": 7.121414276184122, "learning_rate": 0.0001616118258762465, "step": 21350 }, { "epoch": 7.121414276184122, "loss": 0.9713801145553589, "step": 21350 }, { "ce_loss": 0.16981974244117737, "epoch": 7.121414276184122, "step": 21350 }, { "distill_loss": 0.3541743755340576, "epoch": 7.121414276184122, "step": 21350 }, { "epoch": 7.121414276184122, "ref_ce_loss": 0.12030630558729172, "step": 21350 }, { "epoch": 7.121414276184122, "loss": 0.3955955505371094, "step": 21350 }, { "ce_loss": 0.0644855797290802, "epoch": 7.121414276184122, "step": 21350 }, { "distill_loss": 0.23553195595741272, "epoch": 7.121414276184122, "step": 21350 }, { "epoch": 7.121414276184122, "ref_ce_loss": 0.06928061693906784, "step": 21350 }, { "epoch": 7.124749833222148, "loss": 0.5653, "step": 21360 }, { "epoch": 7.124749833222148, "grad_norm": 1.499052882194519, "step": 21360 }, { "epoch": 7.124749833222148, "learning_rate": 0.00016126496107638766, "step": 21360 }, { "epoch": 7.124749833222148, "loss": 0.4505382180213928, "step": 21360 }, { "ce_loss": 0.09331570565700531, "epoch": 7.124749833222148, "step": 21360 }, { "distill_loss": 0.2338477224111557, "epoch": 7.124749833222148, "step": 21360 }, { "epoch": 7.124749833222148, "ref_ce_loss": 0.09295643866062164, "step": 21360 }, { "epoch": 7.124749833222148, "loss": 0.49329978227615356, "step": 21360 }, { "ce_loss": 0.09330855309963226, "epoch": 7.124749833222148, "step": 21360 }, { "distill_loss": 0.24846550822257996, "epoch": 7.124749833222148, "step": 21360 }, { "epoch": 7.124749833222148, "ref_ce_loss": 0.09859941154718399, "step": 21360 }, { "epoch": 7.128085390260173, "loss": 0.4869, "step": 21370 }, { "epoch": 7.128085390260173, "grad_norm": 1.757819414138794, "step": 21370 }, { "epoch": 7.128085390260173, "learning_rate": 0.0001609183749060082, "step": 21370 }, { "epoch": 7.128085390260173, "loss": 0.4814741015434265, "step": 21370 }, { "ce_loss": 0.11460345983505249, "epoch": 7.128085390260173, "step": 21370 }, { "distill_loss": 0.22094333171844482, "epoch": 7.128085390260173, "step": 21370 }, { "epoch": 7.128085390260173, "ref_ce_loss": 0.11573362350463867, "step": 21370 }, { "epoch": 7.128085390260173, "loss": 0.41160690784454346, "step": 21370 }, { "ce_loss": 0.09539221227169037, "epoch": 7.128085390260173, "step": 21370 }, { "distill_loss": 0.20575033128261566, "epoch": 7.128085390260173, "step": 21370 }, { "epoch": 7.128085390260173, "ref_ce_loss": 0.07427817583084106, "step": 21370 }, { "epoch": 7.131420947298198, "loss": 0.5221, "step": 21380 }, { "epoch": 7.131420947298198, "grad_norm": 1.0497280359268188, "step": 21380 }, { "epoch": 7.131420947298198, "learning_rate": 0.0001605720677696116, "step": 21380 }, { "epoch": 7.131420947298198, "loss": 0.5087268352508545, "step": 21380 }, { "ce_loss": 0.11951367557048798, "epoch": 7.131420947298198, "step": 21380 }, { "distill_loss": 0.24779492616653442, "epoch": 7.131420947298198, "step": 21380 }, { "epoch": 7.131420947298198, "ref_ce_loss": 0.10620385408401489, "step": 21380 }, { "epoch": 7.131420947298198, "loss": 0.4907005727291107, "step": 21380 }, { "ce_loss": 0.03994790464639664, "epoch": 7.131420947298198, "step": 21380 }, { "distill_loss": 0.21446077525615692, "epoch": 7.131420947298198, "step": 21380 }, { "epoch": 7.131420947298198, "ref_ce_loss": 0.0749351978302002, "step": 21380 }, { "epoch": 7.134756504336224, "loss": 0.5462, "step": 21390 }, { "epoch": 7.134756504336224, "grad_norm": 1.3052724599838257, "step": 21390 }, { "epoch": 7.134756504336224, "learning_rate": 0.00016022604007137533, "step": 21390 }, { "epoch": 7.134756504336224, "loss": 0.7197709083557129, "step": 21390 }, { "ce_loss": 0.14835871756076813, "epoch": 7.134756504336224, "step": 21390 }, { "distill_loss": 0.23564325273036957, "epoch": 7.134756504336224, "step": 21390 }, { "epoch": 7.134756504336224, "ref_ce_loss": 0.1125996932387352, "step": 21390 }, { "epoch": 7.134756504336224, "loss": 1.0123602151870728, "step": 21390 }, { "ce_loss": 0.12748490273952484, "epoch": 7.134756504336224, "step": 21390 }, { "distill_loss": 0.2982759177684784, "epoch": 7.134756504336224, "step": 21390 }, { "epoch": 7.134756504336224, "ref_ce_loss": 0.12087155878543854, "step": 21390 }, { "epoch": 7.138092061374249, "loss": 0.5642, "step": 21400 }, { "epoch": 7.138092061374249, "grad_norm": 1.72278892993927, "step": 21400 }, { "epoch": 7.138092061374249, "learning_rate": 0.000159880292215151, "step": 21400 }, { "epoch": 7.138092061374249, "loss": 0.5797778367996216, "step": 21400 }, { "ce_loss": 0.11908047646284103, "epoch": 7.138092061374249, "step": 21400 }, { "distill_loss": 0.26884138584136963, "epoch": 7.138092061374249, "step": 21400 }, { "epoch": 7.138092061374249, "ref_ce_loss": 0.11582252383232117, "step": 21400 }, { "epoch": 7.138092061374249, "loss": 0.4562729299068451, "step": 21400 }, { "ce_loss": 0.06269034743309021, "epoch": 7.138092061374249, "step": 21400 }, { "distill_loss": 0.23549781739711761, "epoch": 7.138092061374249, "step": 21400 }, { "epoch": 7.138092061374249, "ref_ce_loss": 0.1063077449798584, "step": 21400 }, { "epoch": 7.1414276184122745, "loss": 0.5039, "step": 21410 }, { "epoch": 7.1414276184122745, "grad_norm": 1.1651095151901245, "step": 21410 }, { "epoch": 7.1414276184122745, "learning_rate": 0.00015953482460446362, "step": 21410 }, { "epoch": 7.1414276184122745, "loss": 0.33815377950668335, "step": 21410 }, { "ce_loss": 0.06978192925453186, "epoch": 7.1414276184122745, "step": 21410 }, { "distill_loss": 0.17233459651470184, "epoch": 7.1414276184122745, "step": 21410 }, { "epoch": 7.1414276184122745, "ref_ce_loss": 0.07788750529289246, "step": 21410 }, { "epoch": 7.1414276184122745, "loss": 0.49239495396614075, "step": 21410 }, { "ce_loss": 0.10934551805257797, "epoch": 7.1414276184122745, "step": 21410 }, { "distill_loss": 0.23807445168495178, "epoch": 7.1414276184122745, "step": 21410 }, { "epoch": 7.1414276184122745, "ref_ce_loss": 0.1099422425031662, "step": 21410 }, { "epoch": 7.1447631754503, "loss": 0.5337, "step": 21420 }, { "epoch": 7.1447631754503, "grad_norm": 1.296330451965332, "step": 21420 }, { "epoch": 7.1447631754503, "learning_rate": 0.00015918963764251118, "step": 21420 }, { "epoch": 7.1447631754503, "loss": 0.5494571328163147, "step": 21420 }, { "ce_loss": 0.11908654123544693, "epoch": 7.1447631754503, "step": 21420 }, { "distill_loss": 0.2070154845714569, "epoch": 7.1447631754503, "step": 21420 }, { "epoch": 7.1447631754503, "ref_ce_loss": 0.12471811473369598, "step": 21420 }, { "epoch": 7.1447631754503, "loss": 0.5093758702278137, "step": 21420 }, { "ce_loss": 0.11129189282655716, "epoch": 7.1447631754503, "step": 21420 }, { "distill_loss": 0.2689700126647949, "epoch": 7.1447631754503, "step": 21420 }, { "epoch": 7.1447631754503, "ref_ce_loss": 0.1290213018655777, "step": 21420 }, { "epoch": 7.148098732488325, "loss": 0.5399, "step": 21430 }, { "epoch": 7.148098732488325, "grad_norm": 2.2043728828430176, "step": 21430 }, { "epoch": 7.148098732488325, "learning_rate": 0.00015884473173216374, "step": 21430 }, { "epoch": 7.148098732488325, "loss": 0.4269636273384094, "step": 21430 }, { "ce_loss": 0.08743033558130264, "epoch": 7.148098732488325, "step": 21430 }, { "distill_loss": 0.2163121998310089, "epoch": 7.148098732488325, "step": 21430 }, { "epoch": 7.148098732488325, "ref_ce_loss": 0.12303852289915085, "step": 21430 }, { "epoch": 7.148098732488325, "loss": 0.47688212990760803, "step": 21430 }, { "ce_loss": 0.11899134516716003, "epoch": 7.148098732488325, "step": 21430 }, { "distill_loss": 0.20621243119239807, "epoch": 7.148098732488325, "step": 21430 }, { "epoch": 7.148098732488325, "ref_ce_loss": 0.07384546101093292, "step": 21430 }, { "epoch": 7.1514342895263505, "loss": 0.5083, "step": 21440 }, { "epoch": 7.1514342895263505, "grad_norm": 1.2066408395767212, "step": 21440 }, { "epoch": 7.1514342895263505, "learning_rate": 0.00015850010727596375, "step": 21440 }, { "epoch": 7.1514342895263505, "loss": 0.49252405762672424, "step": 21440 }, { "ce_loss": 0.08908739686012268, "epoch": 7.1514342895263505, "step": 21440 }, { "distill_loss": 0.22361807525157928, "epoch": 7.1514342895263505, "step": 21440 }, { "epoch": 7.1514342895263505, "ref_ce_loss": 0.08267315477132797, "step": 21440 }, { "epoch": 7.1514342895263505, "loss": 0.5239272117614746, "step": 21440 }, { "ce_loss": 0.10164398699998856, "epoch": 7.1514342895263505, "step": 21440 }, { "distill_loss": 0.24876506626605988, "epoch": 7.1514342895263505, "step": 21440 }, { "epoch": 7.1514342895263505, "ref_ce_loss": 0.0726809948682785, "step": 21440 }, { "epoch": 7.154769846564376, "loss": 0.4696, "step": 21450 }, { "epoch": 7.154769846564376, "grad_norm": 1.5307971239089966, "step": 21450 }, { "epoch": 7.154769846564376, "learning_rate": 0.00015815576467612504, "step": 21450 }, { "epoch": 7.154769846564376, "loss": 0.3120725452899933, "step": 21450 }, { "ce_loss": 0.05826739966869354, "epoch": 7.154769846564376, "step": 21450 }, { "distill_loss": 0.1749739646911621, "epoch": 7.154769846564376, "step": 21450 }, { "epoch": 7.154769846564376, "ref_ce_loss": 0.058990590274333954, "step": 21450 }, { "epoch": 7.154769846564376, "loss": 0.3526802361011505, "step": 21450 }, { "ce_loss": 0.06731356680393219, "epoch": 7.154769846564376, "step": 21450 }, { "distill_loss": 0.19125425815582275, "epoch": 7.154769846564376, "step": 21450 }, { "epoch": 7.154769846564376, "ref_ce_loss": 0.06343653798103333, "step": 21450 }, { "epoch": 7.158105403602401, "loss": 0.5202, "step": 21460 }, { "epoch": 7.158105403602401, "grad_norm": 1.4682691097259521, "step": 21460 }, { "epoch": 7.158105403602401, "learning_rate": 0.0001578117043345325, "step": 21460 }, { "epoch": 7.158105403602401, "loss": 0.5320351719856262, "step": 21460 }, { "ce_loss": 0.11158326268196106, "epoch": 7.158105403602401, "step": 21460 }, { "distill_loss": 0.22310832142829895, "epoch": 7.158105403602401, "step": 21460 }, { "epoch": 7.158105403602401, "ref_ce_loss": 0.1012367531657219, "step": 21460 }, { "epoch": 7.158105403602401, "loss": 0.38997533917427063, "step": 21460 }, { "ce_loss": 0.06510186195373535, "epoch": 7.158105403602401, "step": 21460 }, { "distill_loss": 0.20934811234474182, "epoch": 7.158105403602401, "step": 21460 }, { "epoch": 7.158105403602401, "ref_ce_loss": 0.0889061838388443, "step": 21460 }, { "epoch": 7.161440960640427, "loss": 0.4985, "step": 21470 }, { "epoch": 7.161440960640427, "grad_norm": 1.1266052722930908, "step": 21470 }, { "epoch": 7.161440960640427, "learning_rate": 0.0001574679266527415, "step": 21470 }, { "epoch": 7.161440960640427, "loss": 0.46760207414627075, "step": 21470 }, { "ce_loss": 0.10974862426519394, "epoch": 7.161440960640427, "step": 21470 }, { "distill_loss": 0.21274375915527344, "epoch": 7.161440960640427, "step": 21470 }, { "epoch": 7.161440960640427, "ref_ce_loss": 0.08004342019557953, "step": 21470 }, { "epoch": 7.161440960640427, "loss": 0.4235474765300751, "step": 21470 }, { "ce_loss": 0.07097727060317993, "epoch": 7.161440960640427, "step": 21470 }, { "distill_loss": 0.18562710285186768, "epoch": 7.161440960640427, "step": 21470 }, { "epoch": 7.161440960640427, "ref_ce_loss": 0.0987720862030983, "step": 21470 }, { "epoch": 7.164776517678452, "loss": 0.452, "step": 21480 }, { "epoch": 7.164776517678452, "grad_norm": 1.5619783401489258, "step": 21480 }, { "epoch": 7.164776517678452, "learning_rate": 0.00015712443203197763, "step": 21480 }, { "epoch": 7.164776517678452, "loss": 0.8044539093971252, "step": 21480 }, { "ce_loss": 0.17771480977535248, "epoch": 7.164776517678452, "step": 21480 }, { "distill_loss": 0.19461336731910706, "epoch": 7.164776517678452, "step": 21480 }, { "epoch": 7.164776517678452, "ref_ce_loss": 0.14369718730449677, "step": 21480 }, { "epoch": 7.164776517678452, "loss": 0.4453772306442261, "step": 21480 }, { "ce_loss": 0.10373017191886902, "epoch": 7.164776517678452, "step": 21480 }, { "distill_loss": 0.1853133887052536, "epoch": 7.164776517678452, "step": 21480 }, { "epoch": 7.164776517678452, "ref_ce_loss": 0.10268549621105194, "step": 21480 }, { "epoch": 7.168112074716477, "loss": 0.4631, "step": 21490 }, { "epoch": 7.168112074716477, "grad_norm": 36.443809509277344, "step": 21490 }, { "epoch": 7.168112074716477, "learning_rate": 0.00015678122087313607, "step": 21490 }, { "epoch": 7.168112074716477, "loss": 0.5804122686386108, "step": 21490 }, { "ce_loss": 0.05610696226358414, "epoch": 7.168112074716477, "step": 21490 }, { "distill_loss": 0.217197448015213, "epoch": 7.168112074716477, "step": 21490 }, { "epoch": 7.168112074716477, "ref_ce_loss": 0.10133112967014313, "step": 21490 }, { "epoch": 7.168112074716477, "loss": 0.564479649066925, "step": 21490 }, { "ce_loss": 0.1543521136045456, "epoch": 7.168112074716477, "step": 21490 }, { "distill_loss": 0.23337118327617645, "epoch": 7.168112074716477, "step": 21490 }, { "epoch": 7.168112074716477, "ref_ce_loss": 0.11876802891492844, "step": 21490 }, { "epoch": 7.171447631754503, "loss": 0.4692, "step": 21500 }, { "epoch": 7.171447631754503, "grad_norm": 1.0185561180114746, "step": 21500 }, { "epoch": 7.171447631754503, "learning_rate": 0.00015643829357678133, "step": 21500 }, { "epoch": 7.171447631754503, "loss": 0.37515789270401, "step": 21500 }, { "ce_loss": 0.05240470916032791, "epoch": 7.171447631754503, "step": 21500 }, { "distill_loss": 0.19510981440544128, "epoch": 7.171447631754503, "step": 21500 }, { "epoch": 7.171447631754503, "ref_ce_loss": 0.08341556787490845, "step": 21500 }, { "epoch": 7.171447631754503, "loss": 0.38667699694633484, "step": 21500 }, { "ce_loss": 0.06962460279464722, "epoch": 7.171447631754503, "step": 21500 }, { "distill_loss": 0.23856179416179657, "epoch": 7.171447631754503, "step": 21500 }, { "epoch": 7.171447631754503, "ref_ce_loss": 0.07823915034532547, "step": 21500 }, { "epoch": 7.174783188792528, "loss": 0.4711, "step": 21510 }, { "epoch": 7.174783188792528, "grad_norm": 1.8188211917877197, "step": 21510 }, { "epoch": 7.174783188792528, "learning_rate": 0.00015609565054314616, "step": 21510 }, { "epoch": 7.174783188792528, "loss": 0.3922473192214966, "step": 21510 }, { "ce_loss": 0.08343679457902908, "epoch": 7.174783188792528, "step": 21510 }, { "distill_loss": 0.21345853805541992, "epoch": 7.174783188792528, "step": 21510 }, { "epoch": 7.174783188792528, "ref_ce_loss": 0.07090325653553009, "step": 21510 }, { "epoch": 7.174783188792528, "loss": 0.48854389786720276, "step": 21510 }, { "ce_loss": 0.14709074795246124, "epoch": 7.174783188792528, "step": 21510 }, { "distill_loss": 0.21674580872058868, "epoch": 7.174783188792528, "step": 21510 }, { "epoch": 7.174783188792528, "ref_ce_loss": 0.12438128143548965, "step": 21510 }, { "epoch": 7.178118745830553, "loss": 0.4811, "step": 21520 }, { "epoch": 7.178118745830553, "grad_norm": 1.4690632820129395, "step": 21520 }, { "epoch": 7.178118745830553, "learning_rate": 0.00015575329217213199, "step": 21520 }, { "epoch": 7.178118745830553, "loss": 0.3819815218448639, "step": 21520 }, { "ce_loss": 0.056364353746175766, "epoch": 7.178118745830553, "step": 21520 }, { "distill_loss": 0.18936829268932343, "epoch": 7.178118745830553, "step": 21520 }, { "epoch": 7.178118745830553, "ref_ce_loss": 0.09404139965772629, "step": 21520 }, { "epoch": 7.178118745830553, "loss": 0.47050580382347107, "step": 21520 }, { "ce_loss": 0.1282368004322052, "epoch": 7.178118745830553, "step": 21520 }, { "distill_loss": 0.21063852310180664, "epoch": 7.178118745830553, "step": 21520 }, { "epoch": 7.178118745830553, "ref_ce_loss": 0.10341715812683105, "step": 21520 }, { "epoch": 7.181454302868579, "loss": 0.467, "step": 21530 }, { "epoch": 7.181454302868579, "grad_norm": 1.3604241609573364, "step": 21530 }, { "epoch": 7.181454302868579, "learning_rate": 0.00015541121886330795, "step": 21530 }, { "epoch": 7.181454302868579, "loss": 0.4614931344985962, "step": 21530 }, { "ce_loss": 0.10464883595705032, "epoch": 7.181454302868579, "step": 21530 }, { "distill_loss": 0.26484405994415283, "epoch": 7.181454302868579, "step": 21530 }, { "epoch": 7.181454302868579, "ref_ce_loss": 0.07564658671617508, "step": 21530 }, { "epoch": 7.181454302868579, "loss": 0.5691704750061035, "step": 21530 }, { "ce_loss": 0.08403594046831131, "epoch": 7.181454302868579, "step": 21530 }, { "distill_loss": 0.21320290863513947, "epoch": 7.181454302868579, "step": 21530 }, { "epoch": 7.181454302868579, "ref_ce_loss": 0.09035076946020126, "step": 21530 }, { "epoch": 7.184789859906604, "loss": 0.4867, "step": 21540 }, { "epoch": 7.184789859906604, "grad_norm": 1.1559165716171265, "step": 21540 }, { "epoch": 7.184789859906604, "learning_rate": 0.00015506943101591038, "step": 21540 }, { "epoch": 7.184789859906604, "loss": 0.5226683020591736, "step": 21540 }, { "ce_loss": 0.10446419566869736, "epoch": 7.184789859906604, "step": 21540 }, { "distill_loss": 0.2686152160167694, "epoch": 7.184789859906604, "step": 21540 }, { "epoch": 7.184789859906604, "ref_ce_loss": 0.11150383204221725, "step": 21540 }, { "epoch": 7.184789859906604, "loss": 0.30404114723205566, "step": 21540 }, { "ce_loss": 0.0629800483584404, "epoch": 7.184789859906604, "step": 21540 }, { "distill_loss": 0.13201919198036194, "epoch": 7.184789859906604, "step": 21540 }, { "epoch": 7.184789859906604, "ref_ce_loss": 0.0805046334862709, "step": 21540 }, { "epoch": 7.188125416944629, "loss": 0.5161, "step": 21550 }, { "epoch": 7.188125416944629, "grad_norm": 2.5077500343322754, "step": 21550 }, { "epoch": 7.188125416944629, "learning_rate": 0.00015472792902884237, "step": 21550 }, { "epoch": 7.188125416944629, "loss": 0.5977288484573364, "step": 21550 }, { "ce_loss": 0.08067172020673752, "epoch": 7.188125416944629, "step": 21550 }, { "distill_loss": 0.2463260442018509, "epoch": 7.188125416944629, "step": 21550 }, { "epoch": 7.188125416944629, "ref_ce_loss": 0.09701764583587646, "step": 21550 }, { "epoch": 7.188125416944629, "loss": 0.40372413396835327, "step": 21550 }, { "ce_loss": 0.08983927965164185, "epoch": 7.188125416944629, "step": 21550 }, { "distill_loss": 0.21180443465709686, "epoch": 7.188125416944629, "step": 21550 }, { "epoch": 7.188125416944629, "ref_ce_loss": 0.07517854869365692, "step": 21550 }, { "epoch": 7.191460973982655, "loss": 0.4764, "step": 21560 }, { "epoch": 7.191460973982655, "grad_norm": 1.196245551109314, "step": 21560 }, { "epoch": 7.191460973982655, "learning_rate": 0.0001543867133006734, "step": 21560 }, { "epoch": 7.191460973982655, "loss": 0.464174747467041, "step": 21560 }, { "ce_loss": 0.08118841797113419, "epoch": 7.191460973982655, "step": 21560 }, { "distill_loss": 0.239656001329422, "epoch": 7.191460973982655, "step": 21560 }, { "epoch": 7.191460973982655, "ref_ce_loss": 0.11102952063083649, "step": 21560 }, { "epoch": 7.191460973982655, "loss": 0.516892671585083, "step": 21560 }, { "ce_loss": 0.1302904486656189, "epoch": 7.191460973982655, "step": 21560 }, { "distill_loss": 0.2542354464530945, "epoch": 7.191460973982655, "step": 21560 }, { "epoch": 7.191460973982655, "ref_ce_loss": 0.09391342103481293, "step": 21560 }, { "epoch": 7.19479653102068, "loss": 0.4568, "step": 21570 }, { "epoch": 7.19479653102068, "grad_norm": 1.5264159440994263, "step": 21570 }, { "epoch": 7.19479653102068, "learning_rate": 0.00015404578422963932, "step": 21570 }, { "epoch": 7.19479653102068, "loss": 0.84468674659729, "step": 21570 }, { "ce_loss": 0.07420466095209122, "epoch": 7.19479653102068, "step": 21570 }, { "distill_loss": 0.18314522504806519, "epoch": 7.19479653102068, "step": 21570 }, { "epoch": 7.19479653102068, "ref_ce_loss": 0.08030659705400467, "step": 21570 }, { "epoch": 7.19479653102068, "loss": 0.5354803800582886, "step": 21570 }, { "ce_loss": 0.09700847417116165, "epoch": 7.19479653102068, "step": 21570 }, { "distill_loss": 0.2780064642429352, "epoch": 7.19479653102068, "step": 21570 }, { "epoch": 7.19479653102068, "ref_ce_loss": 0.11125379055738449, "step": 21570 }, { "epoch": 7.198132088058705, "loss": 0.4866, "step": 21580 }, { "epoch": 7.198132088058705, "grad_norm": 1.6607636213302612, "step": 21580 }, { "epoch": 7.198132088058705, "learning_rate": 0.00015370514221364073, "step": 21580 }, { "epoch": 7.198132088058705, "loss": 0.42872315645217896, "step": 21580 }, { "ce_loss": 0.10537848621606827, "epoch": 7.198132088058705, "step": 21580 }, { "distill_loss": 0.19183389842510223, "epoch": 7.198132088058705, "step": 21580 }, { "epoch": 7.198132088058705, "ref_ce_loss": 0.1090899258852005, "step": 21580 }, { "epoch": 7.198132088058705, "loss": 0.9066057205200195, "step": 21580 }, { "ce_loss": 0.07847505807876587, "epoch": 7.198132088058705, "step": 21580 }, { "distill_loss": 0.21104788780212402, "epoch": 7.198132088058705, "step": 21580 }, { "epoch": 7.198132088058705, "ref_ce_loss": 0.08899524807929993, "step": 21580 }, { "epoch": 7.201467645096731, "loss": 0.5644, "step": 21590 }, { "epoch": 7.201467645096731, "grad_norm": 0.9659031629562378, "step": 21590 }, { "epoch": 7.201467645096731, "learning_rate": 0.00015336478765024358, "step": 21590 }, { "epoch": 7.201467645096731, "loss": 0.41692793369293213, "step": 21590 }, { "ce_loss": 0.09320396929979324, "epoch": 7.201467645096731, "step": 21590 }, { "distill_loss": 0.21223962306976318, "epoch": 7.201467645096731, "step": 21590 }, { "epoch": 7.201467645096731, "ref_ce_loss": 0.08695603907108307, "step": 21590 }, { "epoch": 7.201467645096731, "loss": 0.31187373399734497, "step": 21590 }, { "ce_loss": 0.06352602690458298, "epoch": 7.201467645096731, "step": 21590 }, { "distill_loss": 0.13763265311717987, "epoch": 7.201467645096731, "step": 21590 }, { "epoch": 7.201467645096731, "ref_ce_loss": 0.1104658916592598, "step": 21590 }, { "epoch": 7.204803202134756, "loss": 0.4597, "step": 21600 }, { "epoch": 7.204803202134756, "grad_norm": 1.227384328842163, "step": 21600 }, { "epoch": 7.204803202134756, "learning_rate": 0.00015302472093667828, "step": 21600 }, { "epoch": 7.204803202134756, "loss": 0.3964717388153076, "step": 21600 }, { "ce_loss": 0.07394091784954071, "epoch": 7.204803202134756, "step": 21600 }, { "distill_loss": 0.18971940875053406, "epoch": 7.204803202134756, "step": 21600 }, { "epoch": 7.204803202134756, "ref_ce_loss": 0.09033519774675369, "step": 21600 }, { "epoch": 7.204803202134756, "loss": 0.41900038719177246, "step": 21600 }, { "ce_loss": 0.09434220939874649, "epoch": 7.204803202134756, "step": 21600 }, { "distill_loss": 0.1868446320295334, "epoch": 7.204803202134756, "step": 21600 }, { "epoch": 7.204803202134756, "ref_ce_loss": 0.0968356803059578, "step": 21600 }, { "epoch": 7.2081387591727815, "loss": 0.4705, "step": 21610 }, { "epoch": 7.2081387591727815, "grad_norm": 0.9861631989479065, "step": 21610 }, { "epoch": 7.2081387591727815, "learning_rate": 0.0001526849424698394, "step": 21610 }, { "epoch": 7.2081387591727815, "loss": 0.4758778512477875, "step": 21610 }, { "ce_loss": 0.08692550659179688, "epoch": 7.2081387591727815, "step": 21610 }, { "distill_loss": 0.236834317445755, "epoch": 7.2081387591727815, "step": 21610 }, { "epoch": 7.2081387591727815, "ref_ce_loss": 0.1145910695195198, "step": 21610 }, { "epoch": 7.2081387591727815, "loss": 0.47692006826400757, "step": 21610 }, { "ce_loss": 0.07442447543144226, "epoch": 7.2081387591727815, "step": 21610 }, { "distill_loss": 0.21716845035552979, "epoch": 7.2081387591727815, "step": 21610 }, { "epoch": 7.2081387591727815, "ref_ce_loss": 0.09120145440101624, "step": 21610 }, { "epoch": 7.211474316210807, "loss": 0.4786, "step": 21620 }, { "epoch": 7.211474316210807, "grad_norm": 1.013535737991333, "step": 21620 }, { "epoch": 7.211474316210807, "learning_rate": 0.00015234545264628476, "step": 21620 }, { "epoch": 7.211474316210807, "loss": 0.47235748171806335, "step": 21620 }, { "ce_loss": 0.1114814430475235, "epoch": 7.211474316210807, "step": 21620 }, { "distill_loss": 0.22301355004310608, "epoch": 7.211474316210807, "step": 21620 }, { "epoch": 7.211474316210807, "ref_ce_loss": 0.10892931371927261, "step": 21620 }, { "epoch": 7.211474316210807, "loss": 0.6911946535110474, "step": 21620 }, { "ce_loss": 0.1308412253856659, "epoch": 7.211474316210807, "step": 21620 }, { "distill_loss": 0.22310936450958252, "epoch": 7.211474316210807, "step": 21620 }, { "epoch": 7.211474316210807, "ref_ce_loss": 0.10248128324747086, "step": 21620 }, { "epoch": 7.214809873248832, "loss": 0.4661, "step": 21630 }, { "epoch": 7.214809873248832, "grad_norm": 1.8655632734298706, "step": 21630 }, { "epoch": 7.214809873248832, "learning_rate": 0.00015200625186223565, "step": 21630 }, { "epoch": 7.214809873248832, "loss": 0.5949530005455017, "step": 21630 }, { "ce_loss": 0.10551103949546814, "epoch": 7.214809873248832, "step": 21630 }, { "distill_loss": 0.2142055481672287, "epoch": 7.214809873248832, "step": 21630 }, { "epoch": 7.214809873248832, "ref_ce_loss": 0.0733252689242363, "step": 21630 }, { "epoch": 7.214809873248832, "loss": 0.35121750831604004, "step": 21630 }, { "ce_loss": 0.056218236684799194, "epoch": 7.214809873248832, "step": 21630 }, { "distill_loss": 0.2145787477493286, "epoch": 7.214809873248832, "step": 21630 }, { "epoch": 7.214809873248832, "ref_ce_loss": 0.059078801423311234, "step": 21630 }, { "epoch": 7.2181454302868575, "loss": 0.4914, "step": 21640 }, { "epoch": 7.2181454302868575, "grad_norm": 1.7390278577804565, "step": 21640 }, { "epoch": 7.2181454302868575, "learning_rate": 0.00015166734051357577, "step": 21640 }, { "epoch": 7.2181454302868575, "loss": 0.5395673513412476, "step": 21640 }, { "ce_loss": 0.12450134754180908, "epoch": 7.2181454302868575, "step": 21640 }, { "distill_loss": 0.2334718406200409, "epoch": 7.2181454302868575, "step": 21640 }, { "epoch": 7.2181454302868575, "ref_ce_loss": 0.11392674595117569, "step": 21640 }, { "epoch": 7.2181454302868575, "loss": 0.48015227913856506, "step": 21640 }, { "ce_loss": 0.07428359240293503, "epoch": 7.2181454302868575, "step": 21640 }, { "distill_loss": 0.26726609468460083, "epoch": 7.2181454302868575, "step": 21640 }, { "epoch": 7.2181454302868575, "ref_ce_loss": 0.09658200293779373, "step": 21640 }, { "epoch": 7.221480987324883, "loss": 0.5486, "step": 21650 }, { "epoch": 7.221480987324883, "grad_norm": 1.0495847463607788, "step": 21650 }, { "epoch": 7.221480987324883, "learning_rate": 0.00015132871899585138, "step": 21650 }, { "epoch": 7.221480987324883, "loss": 0.49218320846557617, "step": 21650 }, { "ce_loss": 0.08046775311231613, "epoch": 7.221480987324883, "step": 21650 }, { "distill_loss": 0.2417549341917038, "epoch": 7.221480987324883, "step": 21650 }, { "epoch": 7.221480987324883, "ref_ce_loss": 0.09568624198436737, "step": 21650 }, { "epoch": 7.221480987324883, "loss": 0.5142796039581299, "step": 21650 }, { "ce_loss": 0.10060924291610718, "epoch": 7.221480987324883, "step": 21650 }, { "distill_loss": 0.2038334608078003, "epoch": 7.221480987324883, "step": 21650 }, { "epoch": 7.221480987324883, "ref_ce_loss": 0.10336092859506607, "step": 21650 }, { "epoch": 7.224816544362908, "loss": 0.5344, "step": 21660 }, { "epoch": 7.224816544362908, "grad_norm": 1.4265145063400269, "step": 21660 }, { "epoch": 7.224816544362908, "learning_rate": 0.00015099038770426994, "step": 21660 }, { "epoch": 7.224816544362908, "loss": 0.5247296094894409, "step": 21660 }, { "ce_loss": 0.13266858458518982, "epoch": 7.224816544362908, "step": 21660 }, { "distill_loss": 0.2536264955997467, "epoch": 7.224816544362908, "step": 21660 }, { "epoch": 7.224816544362908, "ref_ce_loss": 0.09633111953735352, "step": 21660 }, { "epoch": 7.224816544362908, "loss": 0.3984815776348114, "step": 21660 }, { "ce_loss": 0.06782064586877823, "epoch": 7.224816544362908, "step": 21660 }, { "distill_loss": 0.21827705204486847, "epoch": 7.224816544362908, "step": 21660 }, { "epoch": 7.224816544362908, "ref_ce_loss": 0.08778180181980133, "step": 21660 }, { "epoch": 7.228152101400934, "loss": 0.5001, "step": 21670 }, { "epoch": 7.228152101400934, "grad_norm": 2.303767204284668, "step": 21670 }, { "epoch": 7.228152101400934, "learning_rate": 0.00015065234703370045, "step": 21670 }, { "epoch": 7.228152101400934, "loss": 0.6092923283576965, "step": 21670 }, { "ce_loss": 0.1271263211965561, "epoch": 7.228152101400934, "step": 21670 }, { "distill_loss": 0.28391289710998535, "epoch": 7.228152101400934, "step": 21670 }, { "epoch": 7.228152101400934, "ref_ce_loss": 0.1012510359287262, "step": 21670 }, { "epoch": 7.228152101400934, "loss": 1.048234462738037, "step": 21670 }, { "ce_loss": 0.1372372806072235, "epoch": 7.228152101400934, "step": 21670 }, { "distill_loss": 0.2900506854057312, "epoch": 7.228152101400934, "step": 21670 }, { "epoch": 7.228152101400934, "ref_ce_loss": 0.09884429723024368, "step": 21670 }, { "epoch": 7.231487658438959, "loss": 0.5523, "step": 21680 }, { "epoch": 7.231487658438959, "grad_norm": 1.5447536706924438, "step": 21680 }, { "epoch": 7.231487658438959, "learning_rate": 0.000150314597378673, "step": 21680 }, { "epoch": 7.231487658438959, "loss": 0.522541344165802, "step": 21680 }, { "ce_loss": 0.0930483341217041, "epoch": 7.231487658438959, "step": 21680 }, { "distill_loss": 0.2165500968694687, "epoch": 7.231487658438959, "step": 21680 }, { "epoch": 7.231487658438959, "ref_ce_loss": 0.10621573030948639, "step": 21680 }, { "epoch": 7.231487658438959, "loss": 0.5200650095939636, "step": 21680 }, { "ce_loss": 0.1290084421634674, "epoch": 7.231487658438959, "step": 21680 }, { "distill_loss": 0.26626408100128174, "epoch": 7.231487658438959, "step": 21680 }, { "epoch": 7.231487658438959, "ref_ce_loss": 0.0937294289469719, "step": 21680 }, { "epoch": 7.234823215476984, "loss": 0.5085, "step": 21690 }, { "epoch": 7.234823215476984, "grad_norm": 1.0944291353225708, "step": 21690 }, { "epoch": 7.234823215476984, "learning_rate": 0.00014997713913337784, "step": 21690 }, { "epoch": 7.234823215476984, "loss": 0.4931850731372833, "step": 21690 }, { "ce_loss": 0.09169874340295792, "epoch": 7.234823215476984, "step": 21690 }, { "distill_loss": 0.2615334689617157, "epoch": 7.234823215476984, "step": 21690 }, { "epoch": 7.234823215476984, "ref_ce_loss": 0.1397051066160202, "step": 21690 }, { "epoch": 7.234823215476984, "loss": 0.5440504550933838, "step": 21690 }, { "ce_loss": 0.14239957928657532, "epoch": 7.234823215476984, "step": 21690 }, { "distill_loss": 0.2655315399169922, "epoch": 7.234823215476984, "step": 21690 }, { "epoch": 7.234823215476984, "ref_ce_loss": 0.11141073703765869, "step": 21690 }, { "epoch": 7.23815877251501, "loss": 0.5402, "step": 21700 }, { "epoch": 7.23815877251501, "grad_norm": 1.2775064706802368, "step": 21700 }, { "epoch": 7.23815877251501, "learning_rate": 0.00014963997269166473, "step": 21700 }, { "epoch": 7.23815877251501, "loss": 0.5907175540924072, "step": 21700 }, { "ce_loss": 0.16186945140361786, "epoch": 7.23815877251501, "step": 21700 }, { "distill_loss": 0.2696630358695984, "epoch": 7.23815877251501, "step": 21700 }, { "epoch": 7.23815877251501, "ref_ce_loss": 0.08794023096561432, "step": 21700 }, { "epoch": 7.23815877251501, "loss": 0.6781678199768066, "step": 21700 }, { "ce_loss": 0.1132517084479332, "epoch": 7.23815877251501, "step": 21700 }, { "distill_loss": 0.2189943492412567, "epoch": 7.23815877251501, "step": 21700 }, { "epoch": 7.23815877251501, "ref_ce_loss": 0.11259305477142334, "step": 21700 }, { "epoch": 7.241494329553035, "loss": 0.4864, "step": 21710 }, { "epoch": 7.241494329553035, "grad_norm": 1.0857279300689697, "step": 21710 }, { "epoch": 7.241494329553035, "learning_rate": 0.00014930309844704334, "step": 21710 }, { "epoch": 7.241494329553035, "loss": 0.3750140964984894, "step": 21710 }, { "ce_loss": 0.08555597811937332, "epoch": 7.241494329553035, "step": 21710 }, { "distill_loss": 0.18509650230407715, "epoch": 7.241494329553035, "step": 21710 }, { "epoch": 7.241494329553035, "ref_ce_loss": 0.0733766034245491, "step": 21710 }, { "epoch": 7.241494329553035, "loss": 0.3149873912334442, "step": 21710 }, { "ce_loss": 0.05044303089380264, "epoch": 7.241494329553035, "step": 21710 }, { "distill_loss": 0.16425633430480957, "epoch": 7.241494329553035, "step": 21710 }, { "epoch": 7.241494329553035, "ref_ce_loss": 0.07339531928300858, "step": 21710 }, { "epoch": 7.24482988659106, "loss": 0.4943, "step": 21720 }, { "epoch": 7.24482988659106, "grad_norm": 1.3234370946884155, "step": 21720 }, { "epoch": 7.24482988659106, "learning_rate": 0.00014896651679268219, "step": 21720 }, { "epoch": 7.24482988659106, "loss": 0.4727824926376343, "step": 21720 }, { "ce_loss": 0.07473330199718475, "epoch": 7.24482988659106, "step": 21720 }, { "distill_loss": 0.2146746814250946, "epoch": 7.24482988659106, "step": 21720 }, { "epoch": 7.24482988659106, "ref_ce_loss": 0.07707130163908005, "step": 21720 }, { "epoch": 7.24482988659106, "loss": 0.42264512181282043, "step": 21720 }, { "ce_loss": 0.06445001810789108, "epoch": 7.24482988659106, "step": 21720 }, { "distill_loss": 0.18711066246032715, "epoch": 7.24482988659106, "step": 21720 }, { "epoch": 7.24482988659106, "ref_ce_loss": 0.08896558731794357, "step": 21720 }, { "epoch": 7.248165443629086, "loss": 0.5333, "step": 21730 }, { "epoch": 7.248165443629086, "grad_norm": 1.3943631649017334, "step": 21730 }, { "epoch": 7.248165443629086, "learning_rate": 0.0001486302281214084, "step": 21730 }, { "epoch": 7.248165443629086, "loss": 0.4616813361644745, "step": 21730 }, { "ce_loss": 0.08090164512395859, "epoch": 7.248165443629086, "step": 21730 }, { "distill_loss": 0.26880764961242676, "epoch": 7.248165443629086, "step": 21730 }, { "epoch": 7.248165443629086, "ref_ce_loss": 0.0775616466999054, "step": 21730 }, { "epoch": 7.248165443629086, "loss": 0.6101661920547485, "step": 21730 }, { "ce_loss": 0.08751758933067322, "epoch": 7.248165443629086, "step": 21730 }, { "distill_loss": 0.22048398852348328, "epoch": 7.248165443629086, "step": 21730 }, { "epoch": 7.248165443629086, "ref_ce_loss": 0.09813635051250458, "step": 21730 }, { "epoch": 7.251501000667111, "loss": 0.5149, "step": 21740 }, { "epoch": 7.251501000667111, "grad_norm": 1.2613996267318726, "step": 21740 }, { "epoch": 7.251501000667111, "learning_rate": 0.0001482942328257067, "step": 21740 }, { "epoch": 7.251501000667111, "loss": 0.4055621027946472, "step": 21740 }, { "ce_loss": 0.05802743881940842, "epoch": 7.251501000667111, "step": 21740 }, { "distill_loss": 0.1827191412448883, "epoch": 7.251501000667111, "step": 21740 }, { "epoch": 7.251501000667111, "ref_ce_loss": 0.09838934242725372, "step": 21740 }, { "epoch": 7.251501000667111, "loss": 0.7149930000305176, "step": 21740 }, { "ce_loss": 0.15911222994327545, "epoch": 7.251501000667111, "step": 21740 }, { "distill_loss": 0.24919471144676208, "epoch": 7.251501000667111, "step": 21740 }, { "epoch": 7.251501000667111, "ref_ce_loss": 0.11354556679725647, "step": 21740 }, { "epoch": 7.254836557705136, "loss": 0.483, "step": 21750 }, { "epoch": 7.254836557705136, "grad_norm": 1.9538707733154297, "step": 21750 }, { "epoch": 7.254836557705136, "learning_rate": 0.00014795853129772, "step": 21750 }, { "epoch": 7.254836557705136, "loss": 0.46994128823280334, "step": 21750 }, { "ce_loss": 0.10398206859827042, "epoch": 7.254836557705136, "step": 21750 }, { "distill_loss": 0.21987509727478027, "epoch": 7.254836557705136, "step": 21750 }, { "epoch": 7.254836557705136, "ref_ce_loss": 0.08994855731725693, "step": 21750 }, { "epoch": 7.254836557705136, "loss": 0.5077059864997864, "step": 21750 }, { "ce_loss": 0.0923563688993454, "epoch": 7.254836557705136, "step": 21750 }, { "distill_loss": 0.23000693321228027, "epoch": 7.254836557705136, "step": 21750 }, { "epoch": 7.254836557705136, "ref_ce_loss": 0.10818903893232346, "step": 21750 }, { "epoch": 7.258172114743162, "loss": 0.4675, "step": 21760 }, { "epoch": 7.258172114743162, "grad_norm": 1.8521201610565186, "step": 21760 }, { "epoch": 7.258172114743162, "learning_rate": 0.00014762312392924804, "step": 21760 }, { "epoch": 7.258172114743162, "loss": 0.6129436492919922, "step": 21760 }, { "ce_loss": 0.10266965627670288, "epoch": 7.258172114743162, "step": 21760 }, { "distill_loss": 0.22881817817687988, "epoch": 7.258172114743162, "step": 21760 }, { "epoch": 7.258172114743162, "ref_ce_loss": 0.08024413883686066, "step": 21760 }, { "epoch": 7.258172114743162, "loss": 0.5525320172309875, "step": 21760 }, { "ce_loss": 0.14949725568294525, "epoch": 7.258172114743162, "step": 21760 }, { "distill_loss": 0.2612256705760956, "epoch": 7.258172114743162, "step": 21760 }, { "epoch": 7.258172114743162, "ref_ce_loss": 0.09284612536430359, "step": 21760 }, { "epoch": 7.261507671781187, "loss": 0.4865, "step": 21770 }, { "epoch": 7.261507671781187, "grad_norm": 1.28142511844635, "step": 21770 }, { "epoch": 7.261507671781187, "learning_rate": 0.00014728801111174754, "step": 21770 }, { "epoch": 7.261507671781187, "loss": 0.44657081365585327, "step": 21770 }, { "ce_loss": 0.06146930530667305, "epoch": 7.261507671781187, "step": 21770 }, { "distill_loss": 0.23916156589984894, "epoch": 7.261507671781187, "step": 21770 }, { "epoch": 7.261507671781187, "ref_ce_loss": 0.10164698213338852, "step": 21770 }, { "epoch": 7.261507671781187, "loss": 0.559298038482666, "step": 21770 }, { "ce_loss": 0.11093275249004364, "epoch": 7.261507671781187, "step": 21770 }, { "distill_loss": 0.28344765305519104, "epoch": 7.261507671781187, "step": 21770 }, { "epoch": 7.261507671781187, "ref_ce_loss": 0.12337548285722733, "step": 21770 }, { "epoch": 7.264843228819212, "loss": 0.4996, "step": 21780 }, { "epoch": 7.264843228819212, "grad_norm": 1.1702762842178345, "step": 21780 }, { "epoch": 7.264843228819212, "learning_rate": 0.00014695319323633065, "step": 21780 }, { "epoch": 7.264843228819212, "loss": 0.5426932573318481, "step": 21780 }, { "ce_loss": 0.1180221438407898, "epoch": 7.264843228819212, "step": 21780 }, { "distill_loss": 0.20660299062728882, "epoch": 7.264843228819212, "step": 21780 }, { "epoch": 7.264843228819212, "ref_ce_loss": 0.09778185933828354, "step": 21780 }, { "epoch": 7.264843228819212, "loss": 0.3912394940853119, "step": 21780 }, { "ce_loss": 0.08908562362194061, "epoch": 7.264843228819212, "step": 21780 }, { "distill_loss": 0.21106182038784027, "epoch": 7.264843228819212, "step": 21780 }, { "epoch": 7.264843228819212, "ref_ce_loss": 0.09095821529626846, "step": 21780 }, { "epoch": 7.268178785857238, "loss": 0.4887, "step": 21790 }, { "epoch": 7.268178785857238, "grad_norm": 1.193352460861206, "step": 21790 }, { "epoch": 7.268178785857238, "learning_rate": 0.00014661867069376636, "step": 21790 }, { "epoch": 7.268178785857238, "loss": 0.505956768989563, "step": 21790 }, { "ce_loss": 0.10640949010848999, "epoch": 7.268178785857238, "step": 21790 }, { "distill_loss": 0.24152949452400208, "epoch": 7.268178785857238, "step": 21790 }, { "epoch": 7.268178785857238, "ref_ce_loss": 0.11776086688041687, "step": 21790 }, { "epoch": 7.268178785857238, "loss": 0.516710638999939, "step": 21790 }, { "ce_loss": 0.1165366917848587, "epoch": 7.268178785857238, "step": 21790 }, { "distill_loss": 0.24122023582458496, "epoch": 7.268178785857238, "step": 21790 }, { "epoch": 7.268178785857238, "ref_ce_loss": 0.10701066255569458, "step": 21790 }, { "epoch": 7.271514342895263, "loss": 0.4608, "step": 21800 }, { "epoch": 7.271514342895263, "grad_norm": 2.8318581581115723, "step": 21800 }, { "epoch": 7.271514342895263, "learning_rate": 0.0001462844438744785, "step": 21800 }, { "epoch": 7.271514342895263, "loss": 0.559289276599884, "step": 21800 }, { "ce_loss": 0.1380220353603363, "epoch": 7.271514342895263, "step": 21800 }, { "distill_loss": 0.21198466420173645, "epoch": 7.271514342895263, "step": 21800 }, { "epoch": 7.271514342895263, "ref_ce_loss": 0.1316009908914566, "step": 21800 }, { "epoch": 7.271514342895263, "loss": 0.329565167427063, "step": 21800 }, { "ce_loss": 0.04673033952713013, "epoch": 7.271514342895263, "step": 21800 }, { "distill_loss": 0.17576806247234344, "epoch": 7.271514342895263, "step": 21800 }, { "epoch": 7.271514342895263, "ref_ce_loss": 0.07755520194768906, "step": 21800 }, { "epoch": 7.2748498999332885, "loss": 0.5114, "step": 21810 }, { "epoch": 7.2748498999332885, "grad_norm": 1.2743655443191528, "step": 21810 }, { "epoch": 7.2748498999332885, "learning_rate": 0.00014595051316854541, "step": 21810 }, { "epoch": 7.2748498999332885, "loss": 0.42324957251548767, "step": 21810 }, { "ce_loss": 0.09732640534639359, "epoch": 7.2748498999332885, "step": 21810 }, { "distill_loss": 0.21347419917583466, "epoch": 7.2748498999332885, "step": 21810 }, { "epoch": 7.2748498999332885, "ref_ce_loss": 0.08264093846082687, "step": 21810 }, { "epoch": 7.2748498999332885, "loss": 0.40716296434402466, "step": 21810 }, { "ce_loss": 0.09525058418512344, "epoch": 7.2748498999332885, "step": 21810 }, { "distill_loss": 0.22439442574977875, "epoch": 7.2748498999332885, "step": 21810 }, { "epoch": 7.2748498999332885, "ref_ce_loss": 0.08725903183221817, "step": 21810 }, { "epoch": 7.278185456971314, "loss": 0.47, "step": 21820 }, { "epoch": 7.278185456971314, "grad_norm": 1.347745656967163, "step": 21820 }, { "epoch": 7.278185456971314, "learning_rate": 0.00014561687896570032, "step": 21820 }, { "epoch": 7.278185456971314, "loss": 0.41976019740104675, "step": 21820 }, { "ce_loss": 0.10885760188102722, "epoch": 7.278185456971314, "step": 21820 }, { "distill_loss": 0.20018614828586578, "epoch": 7.278185456971314, "step": 21820 }, { "epoch": 7.278185456971314, "ref_ce_loss": 0.11058416962623596, "step": 21820 }, { "epoch": 7.278185456971314, "loss": 0.5937715768814087, "step": 21820 }, { "ce_loss": 0.07900398224592209, "epoch": 7.278185456971314, "step": 21820 }, { "distill_loss": 0.14981651306152344, "epoch": 7.278185456971314, "step": 21820 }, { "epoch": 7.278185456971314, "ref_ce_loss": 0.09780242294073105, "step": 21820 }, { "epoch": 7.281521014009339, "loss": 0.5263, "step": 21830 }, { "epoch": 7.281521014009339, "grad_norm": 1.4404274225234985, "step": 21830 }, { "epoch": 7.281521014009339, "learning_rate": 0.0001452835416553302, "step": 21830 }, { "epoch": 7.281521014009339, "loss": 0.5217397212982178, "step": 21830 }, { "ce_loss": 0.131598562002182, "epoch": 7.281521014009339, "step": 21830 }, { "distill_loss": 0.2908930778503418, "epoch": 7.281521014009339, "step": 21830 }, { "epoch": 7.281521014009339, "ref_ce_loss": 0.09916959702968597, "step": 21830 }, { "epoch": 7.281521014009339, "loss": 0.5160905122756958, "step": 21830 }, { "ce_loss": 0.09195408225059509, "epoch": 7.281521014009339, "step": 21830 }, { "distill_loss": 0.23636744916439056, "epoch": 7.281521014009339, "step": 21830 }, { "epoch": 7.281521014009339, "ref_ce_loss": 0.09996692091226578, "step": 21830 }, { "epoch": 7.2848565710473645, "loss": 0.4825, "step": 21840 }, { "epoch": 7.2848565710473645, "grad_norm": 1.2493855953216553, "step": 21840 }, { "epoch": 7.2848565710473645, "learning_rate": 0.00014495050162647565, "step": 21840 }, { "epoch": 7.2848565710473645, "loss": 0.38376322388648987, "step": 21840 }, { "ce_loss": 0.07317432761192322, "epoch": 7.2848565710473645, "step": 21840 }, { "distill_loss": 0.21176281571388245, "epoch": 7.2848565710473645, "step": 21840 }, { "epoch": 7.2848565710473645, "ref_ce_loss": 0.0986928790807724, "step": 21840 }, { "epoch": 7.2848565710473645, "loss": 0.5978377461433411, "step": 21840 }, { "ce_loss": 0.17858096957206726, "epoch": 7.2848565710473645, "step": 21840 }, { "distill_loss": 0.27734702825546265, "epoch": 7.2848565710473645, "step": 21840 }, { "epoch": 7.2848565710473645, "ref_ce_loss": 0.10186594724655151, "step": 21840 }, { "epoch": 7.28819212808539, "loss": 0.5656, "step": 21850 }, { "epoch": 7.28819212808539, "grad_norm": 1.2831820249557495, "step": 21850 }, { "epoch": 7.28819212808539, "learning_rate": 0.00014461775926783026, "step": 21850 }, { "epoch": 7.28819212808539, "loss": 0.5874098539352417, "step": 21850 }, { "ce_loss": 0.06480753421783447, "epoch": 7.28819212808539, "step": 21850 }, { "distill_loss": 0.22294045984745026, "epoch": 7.28819212808539, "step": 21850 }, { "epoch": 7.28819212808539, "ref_ce_loss": 0.0869978591799736, "step": 21850 }, { "epoch": 7.28819212808539, "loss": 0.36357828974723816, "step": 21850 }, { "ce_loss": 0.07771913707256317, "epoch": 7.28819212808539, "step": 21850 }, { "distill_loss": 0.19707082211971283, "epoch": 7.28819212808539, "step": 21850 }, { "epoch": 7.28819212808539, "ref_ce_loss": 0.08844716846942902, "step": 21850 }, { "epoch": 7.291527685123415, "loss": 0.4883, "step": 21860 }, { "epoch": 7.291527685123415, "grad_norm": 1.7892661094665527, "step": 21860 }, { "epoch": 7.291527685123415, "learning_rate": 0.00014428531496773995, "step": 21860 }, { "epoch": 7.291527685123415, "loss": 0.44125548005104065, "step": 21860 }, { "ce_loss": 0.0870680883526802, "epoch": 7.291527685123415, "step": 21860 }, { "distill_loss": 0.1840522587299347, "epoch": 7.291527685123415, "step": 21860 }, { "epoch": 7.291527685123415, "ref_ce_loss": 0.08497877418994904, "step": 21860 }, { "epoch": 7.291527685123415, "loss": 0.4629088044166565, "step": 21860 }, { "ce_loss": 0.12965235114097595, "epoch": 7.291527685123415, "step": 21860 }, { "distill_loss": 0.21723756194114685, "epoch": 7.291527685123415, "step": 21860 }, { "epoch": 7.291527685123415, "ref_ce_loss": 0.09699278324842453, "step": 21860 }, { "epoch": 7.2948632421614406, "loss": 0.4749, "step": 21870 }, { "epoch": 7.2948632421614406, "grad_norm": 1.915931224822998, "step": 21870 }, { "epoch": 7.2948632421614406, "learning_rate": 0.00014395316911420308, "step": 21870 }, { "epoch": 7.2948632421614406, "loss": 0.42724180221557617, "step": 21870 }, { "ce_loss": 0.09100129455327988, "epoch": 7.2948632421614406, "step": 21870 }, { "distill_loss": 0.21688386797904968, "epoch": 7.2948632421614406, "step": 21870 }, { "epoch": 7.2948632421614406, "ref_ce_loss": 0.09163907170295715, "step": 21870 }, { "epoch": 7.2948632421614406, "loss": 0.4978054463863373, "step": 21870 }, { "ce_loss": 0.09396034479141235, "epoch": 7.2948632421614406, "step": 21870 }, { "distill_loss": 0.2810831069946289, "epoch": 7.2948632421614406, "step": 21870 }, { "epoch": 7.2948632421614406, "ref_ce_loss": 0.092870332300663, "step": 21870 }, { "epoch": 7.298198799199466, "loss": 0.5138, "step": 21880 }, { "epoch": 7.298198799199466, "grad_norm": 1.2401480674743652, "step": 21880 }, { "epoch": 7.298198799199466, "learning_rate": 0.00014362132209486968, "step": 21880 }, { "epoch": 7.298198799199466, "loss": 0.473979651927948, "step": 21880 }, { "ce_loss": 0.15663237869739532, "epoch": 7.298198799199466, "step": 21880 }, { "distill_loss": 0.20220325887203217, "epoch": 7.298198799199466, "step": 21880 }, { "epoch": 7.298198799199466, "ref_ce_loss": 0.09212686121463776, "step": 21880 }, { "epoch": 7.298198799199466, "loss": 0.5248937010765076, "step": 21880 }, { "ce_loss": 0.09290619939565659, "epoch": 7.298198799199466, "step": 21880 }, { "distill_loss": 0.22812524437904358, "epoch": 7.298198799199466, "step": 21880 }, { "epoch": 7.298198799199466, "ref_ce_loss": 0.10141429305076599, "step": 21880 }, { "epoch": 7.301534356237491, "loss": 0.4731, "step": 21890 }, { "epoch": 7.301534356237491, "grad_norm": 1.250961422920227, "step": 21890 }, { "epoch": 7.301534356237491, "learning_rate": 0.00014328977429704085, "step": 21890 }, { "epoch": 7.301534356237491, "loss": 0.4021410346031189, "step": 21890 }, { "ce_loss": 0.056410495191812515, "epoch": 7.301534356237491, "step": 21890 }, { "distill_loss": 0.23186610639095306, "epoch": 7.301534356237491, "step": 21890 }, { "epoch": 7.301534356237491, "ref_ce_loss": 0.091607004404068, "step": 21890 }, { "epoch": 7.301534356237491, "loss": 0.5981037020683289, "step": 21890 }, { "ce_loss": 0.08761950582265854, "epoch": 7.301534356237491, "step": 21890 }, { "distill_loss": 0.17893314361572266, "epoch": 7.301534356237491, "step": 21890 }, { "epoch": 7.301534356237491, "ref_ce_loss": 0.09102575480937958, "step": 21890 }, { "epoch": 7.304869913275517, "loss": 0.5045, "step": 21900 }, { "epoch": 7.304869913275517, "grad_norm": 1.5320053100585938, "step": 21900 }, { "epoch": 7.304869913275517, "learning_rate": 0.0001429585261076686, "step": 21900 }, { "epoch": 7.304869913275517, "loss": 0.4536799192428589, "step": 21900 }, { "ce_loss": 0.08082486689090729, "epoch": 7.304869913275517, "step": 21900 }, { "distill_loss": 0.18056809902191162, "epoch": 7.304869913275517, "step": 21900 }, { "epoch": 7.304869913275517, "ref_ce_loss": 0.10206877440214157, "step": 21900 }, { "epoch": 7.304869913275517, "loss": 0.5019956827163696, "step": 21900 }, { "ce_loss": 0.11438484489917755, "epoch": 7.304869913275517, "step": 21900 }, { "distill_loss": 0.21802611649036407, "epoch": 7.304869913275517, "step": 21900 }, { "epoch": 7.304869913275517, "ref_ce_loss": 0.1281689703464508, "step": 21900 }, { "epoch": 7.308205470313542, "loss": 0.5188, "step": 21910 }, { "epoch": 7.308205470313542, "grad_norm": 2.2819159030914307, "step": 21910 }, { "epoch": 7.308205470313542, "learning_rate": 0.00014262757791335515, "step": 21910 }, { "epoch": 7.308205470313542, "loss": 0.5231117010116577, "step": 21910 }, { "ce_loss": 0.12724162638187408, "epoch": 7.308205470313542, "step": 21910 }, { "distill_loss": 0.22880399227142334, "epoch": 7.308205470313542, "step": 21910 }, { "epoch": 7.308205470313542, "ref_ce_loss": 0.0907425805926323, "step": 21910 }, { "epoch": 7.308205470313542, "loss": 0.6519007086753845, "step": 21910 }, { "ce_loss": 0.0975770652294159, "epoch": 7.308205470313542, "step": 21910 }, { "distill_loss": 0.24211576581001282, "epoch": 7.308205470313542, "step": 21910 }, { "epoch": 7.308205470313542, "ref_ce_loss": 0.08263137191534042, "step": 21910 }, { "epoch": 7.311541027351567, "loss": 0.4677, "step": 21920 }, { "epoch": 7.311541027351567, "grad_norm": 1.6301801204681396, "step": 21920 }, { "epoch": 7.311541027351567, "learning_rate": 0.00014229693010035286, "step": 21920 }, { "epoch": 7.311541027351567, "loss": 0.5612131357192993, "step": 21920 }, { "ce_loss": 0.08595165610313416, "epoch": 7.311541027351567, "step": 21920 }, { "distill_loss": 0.259377121925354, "epoch": 7.311541027351567, "step": 21920 }, { "epoch": 7.311541027351567, "ref_ce_loss": 0.1124444529414177, "step": 21920 }, { "epoch": 7.311541027351567, "loss": 0.6012833714485168, "step": 21920 }, { "ce_loss": 0.11187417060136795, "epoch": 7.311541027351567, "step": 21920 }, { "distill_loss": 0.18224146962165833, "epoch": 7.311541027351567, "step": 21920 }, { "epoch": 7.311541027351567, "ref_ce_loss": 0.11612199246883392, "step": 21920 }, { "epoch": 7.314876584389593, "loss": 0.5822, "step": 21930 }, { "epoch": 7.314876584389593, "grad_norm": 1.509248971939087, "step": 21930 }, { "epoch": 7.314876584389593, "learning_rate": 0.00014196658305456303, "step": 21930 }, { "epoch": 7.314876584389593, "loss": 1.277998685836792, "step": 21930 }, { "ce_loss": 0.10564474016427994, "epoch": 7.314876584389593, "step": 21930 }, { "distill_loss": 0.23719510436058044, "epoch": 7.314876584389593, "step": 21930 }, { "epoch": 7.314876584389593, "ref_ce_loss": 0.09459588676691055, "step": 21930 }, { "epoch": 7.314876584389593, "loss": 0.37890195846557617, "step": 21930 }, { "ce_loss": 0.07021559029817581, "epoch": 7.314876584389593, "step": 21930 }, { "distill_loss": 0.17815831303596497, "epoch": 7.314876584389593, "step": 21930 }, { "epoch": 7.314876584389593, "ref_ce_loss": 0.0899161696434021, "step": 21930 }, { "epoch": 7.318212141427618, "loss": 0.5154, "step": 21940 }, { "epoch": 7.318212141427618, "grad_norm": 1.338369607925415, "step": 21940 }, { "epoch": 7.318212141427618, "learning_rate": 0.00014163653716153628, "step": 21940 }, { "epoch": 7.318212141427618, "loss": 0.4284664988517761, "step": 21940 }, { "ce_loss": 0.09836511313915253, "epoch": 7.318212141427618, "step": 21940 }, { "distill_loss": 0.21835702657699585, "epoch": 7.318212141427618, "step": 21940 }, { "epoch": 7.318212141427618, "ref_ce_loss": 0.11144930124282837, "step": 21940 }, { "epoch": 7.318212141427618, "loss": 0.39852064847946167, "step": 21940 }, { "ce_loss": 0.0820104330778122, "epoch": 7.318212141427618, "step": 21940 }, { "distill_loss": 0.20707827806472778, "epoch": 7.318212141427618, "step": 21940 }, { "epoch": 7.318212141427618, "ref_ce_loss": 0.07728994637727737, "step": 21940 }, { "epoch": 7.321547698465643, "loss": 0.4578, "step": 21950 }, { "epoch": 7.321547698465643, "grad_norm": 2.4161667823791504, "step": 21950 }, { "epoch": 7.321547698465643, "learning_rate": 0.00014130679280647174, "step": 21950 }, { "epoch": 7.321547698465643, "loss": 0.564189612865448, "step": 21950 }, { "ce_loss": 0.1051364466547966, "epoch": 7.321547698465643, "step": 21950 }, { "distill_loss": 0.20676258206367493, "epoch": 7.321547698465643, "step": 21950 }, { "epoch": 7.321547698465643, "ref_ce_loss": 0.07331159710884094, "step": 21950 }, { "epoch": 7.321547698465643, "loss": 0.4392548203468323, "step": 21950 }, { "ce_loss": 0.1069713830947876, "epoch": 7.321547698465643, "step": 21950 }, { "distill_loss": 0.2125968188047409, "epoch": 7.321547698465643, "step": 21950 }, { "epoch": 7.321547698465643, "ref_ce_loss": 0.0849190428853035, "step": 21950 }, { "epoch": 7.324883255503669, "loss": 0.5103, "step": 21960 }, { "epoch": 7.324883255503669, "grad_norm": 1.7507930994033813, "step": 21960 }, { "epoch": 7.324883255503669, "learning_rate": 0.00014097735037421668, "step": 21960 }, { "epoch": 7.324883255503669, "loss": 0.5139173269271851, "step": 21960 }, { "ce_loss": 0.07396847009658813, "epoch": 7.324883255503669, "step": 21960 }, { "distill_loss": 0.26365751028060913, "epoch": 7.324883255503669, "step": 21960 }, { "epoch": 7.324883255503669, "ref_ce_loss": 0.09375769644975662, "step": 21960 }, { "epoch": 7.324883255503669, "loss": 0.5542353391647339, "step": 21960 }, { "ce_loss": 0.141982302069664, "epoch": 7.324883255503669, "step": 21960 }, { "distill_loss": 0.20985393226146698, "epoch": 7.324883255503669, "step": 21960 }, { "epoch": 7.324883255503669, "ref_ce_loss": 0.1077088788151741, "step": 21960 }, { "epoch": 7.328218812541694, "loss": 0.5147, "step": 21970 }, { "epoch": 7.328218812541694, "grad_norm": 3.0060346126556396, "step": 21970 }, { "epoch": 7.328218812541694, "learning_rate": 0.00014064821024926553, "step": 21970 }, { "epoch": 7.328218812541694, "loss": 0.534508228302002, "step": 21970 }, { "ce_loss": 0.14316098392009735, "epoch": 7.328218812541694, "step": 21970 }, { "distill_loss": 0.25907862186431885, "epoch": 7.328218812541694, "step": 21970 }, { "epoch": 7.328218812541694, "ref_ce_loss": 0.11218282580375671, "step": 21970 }, { "epoch": 7.328218812541694, "loss": 0.5390993356704712, "step": 21970 }, { "ce_loss": 0.09973648935556412, "epoch": 7.328218812541694, "step": 21970 }, { "distill_loss": 0.22498083114624023, "epoch": 7.328218812541694, "step": 21970 }, { "epoch": 7.328218812541694, "ref_ce_loss": 0.10661143809556961, "step": 21970 }, { "epoch": 7.331554369579719, "loss": 0.5304, "step": 21980 }, { "epoch": 7.331554369579719, "grad_norm": 1.9831620454788208, "step": 21980 }, { "epoch": 7.331554369579719, "learning_rate": 0.0001403193728157605, "step": 21980 }, { "epoch": 7.331554369579719, "loss": 0.5298435091972351, "step": 21980 }, { "ce_loss": 0.0789138600230217, "epoch": 7.331554369579719, "step": 21980 }, { "distill_loss": 0.25244104862213135, "epoch": 7.331554369579719, "step": 21980 }, { "epoch": 7.331554369579719, "ref_ce_loss": 0.08395465463399887, "step": 21980 }, { "epoch": 7.331554369579719, "loss": 0.5483510494232178, "step": 21980 }, { "ce_loss": 0.11609560996294022, "epoch": 7.331554369579719, "step": 21980 }, { "distill_loss": 0.2503870725631714, "epoch": 7.331554369579719, "step": 21980 }, { "epoch": 7.331554369579719, "ref_ce_loss": 0.10731753706932068, "step": 21980 }, { "epoch": 7.334889926617745, "loss": 0.4871, "step": 21990 }, { "epoch": 7.334889926617745, "grad_norm": 1.3999574184417725, "step": 21990 }, { "epoch": 7.334889926617745, "learning_rate": 0.00013999083845749012, "step": 21990 }, { "epoch": 7.334889926617745, "loss": 0.43087154626846313, "step": 21990 }, { "ce_loss": 0.07881233841180801, "epoch": 7.334889926617745, "step": 21990 }, { "distill_loss": 0.18328407406806946, "epoch": 7.334889926617745, "step": 21990 }, { "epoch": 7.334889926617745, "ref_ce_loss": 0.10450533032417297, "step": 21990 }, { "epoch": 7.334889926617745, "loss": 0.3638628423213959, "step": 21990 }, { "ce_loss": 0.051684122532606125, "epoch": 7.334889926617745, "step": 21990 }, { "distill_loss": 0.16024218499660492, "epoch": 7.334889926617745, "step": 21990 }, { "epoch": 7.334889926617745, "ref_ce_loss": 0.07850784063339233, "step": 21990 }, { "epoch": 7.33822548365577, "loss": 0.4752, "step": 22000 }, { "epoch": 7.33822548365577, "grad_norm": 2.5212299823760986, "step": 22000 }, { "epoch": 7.33822548365577, "learning_rate": 0.00013966260755788947, "step": 22000 }, { "epoch": 7.33822548365577, "loss": 1.059046983718872, "step": 22000 }, { "ce_loss": 0.12717647850513458, "epoch": 7.33822548365577, "step": 22000 }, { "distill_loss": 0.31576603651046753, "epoch": 7.33822548365577, "step": 22000 }, { "epoch": 7.33822548365577, "ref_ce_loss": 0.08922392129898071, "step": 22000 }, { "epoch": 7.33822548365577, "loss": 0.354206919670105, "step": 22000 }, { "ce_loss": 0.05530741438269615, "epoch": 7.33822548365577, "step": 22000 }, { "distill_loss": 0.13689683377742767, "epoch": 7.33822548365577, "step": 22000 }, { "epoch": 7.33822548365577, "ref_ce_loss": 0.06636778265237808, "step": 22000 }, { "epoch": 7.3415610406937954, "loss": 0.5179, "step": 22010 }, { "epoch": 7.3415610406937954, "grad_norm": 1.3943521976470947, "step": 22010 }, { "epoch": 7.3415610406937954, "learning_rate": 0.00013933468050003923, "step": 22010 }, { "epoch": 7.3415610406937954, "loss": 0.43112093210220337, "step": 22010 }, { "ce_loss": 0.08019595593214035, "epoch": 7.3415610406937954, "step": 22010 }, { "distill_loss": 0.2375338226556778, "epoch": 7.3415610406937954, "step": 22010 }, { "epoch": 7.3415610406937954, "ref_ce_loss": 0.11313319206237793, "step": 22010 }, { "epoch": 7.3415610406937954, "loss": 0.6591454148292542, "step": 22010 }, { "ce_loss": 0.10027246177196503, "epoch": 7.3415610406937954, "step": 22010 }, { "distill_loss": 0.2757124900817871, "epoch": 7.3415610406937954, "step": 22010 }, { "epoch": 7.3415610406937954, "ref_ce_loss": 0.07839225232601166, "step": 22010 }, { "epoch": 7.344896597731822, "loss": 0.5175, "step": 22020 }, { "epoch": 7.344896597731822, "grad_norm": 1.3006362915039062, "step": 22020 }, { "epoch": 7.344896597731822, "learning_rate": 0.0001390070576666656, "step": 22020 }, { "epoch": 7.344896597731822, "loss": 0.40917250514030457, "step": 22020 }, { "ce_loss": 0.08095649629831314, "epoch": 7.344896597731822, "step": 22020 }, { "distill_loss": 0.22681719064712524, "epoch": 7.344896597731822, "step": 22020 }, { "epoch": 7.344896597731822, "ref_ce_loss": 0.06790400296449661, "step": 22020 }, { "epoch": 7.344896597731822, "loss": 0.5089716911315918, "step": 22020 }, { "ce_loss": 0.11554168909788132, "epoch": 7.344896597731822, "step": 22020 }, { "distill_loss": 0.2949783504009247, "epoch": 7.344896597731822, "step": 22020 }, { "epoch": 7.344896597731822, "ref_ce_loss": 0.09493078291416168, "step": 22020 }, { "epoch": 7.348232154769846, "loss": 0.4836, "step": 22030 }, { "epoch": 7.348232154769846, "grad_norm": 1.0829044580459595, "step": 22030 }, { "epoch": 7.348232154769846, "learning_rate": 0.00013867973944013966, "step": 22030 }, { "epoch": 7.348232154769846, "loss": 0.6953809261322021, "step": 22030 }, { "ce_loss": 0.15069177746772766, "epoch": 7.348232154769846, "step": 22030 }, { "distill_loss": 0.2764919698238373, "epoch": 7.348232154769846, "step": 22030 }, { "epoch": 7.348232154769846, "ref_ce_loss": 0.10267274081707001, "step": 22030 }, { "epoch": 7.348232154769846, "loss": 0.47006797790527344, "step": 22030 }, { "ce_loss": 0.11325942724943161, "epoch": 7.348232154769846, "step": 22030 }, { "distill_loss": 0.20543146133422852, "epoch": 7.348232154769846, "step": 22030 }, { "epoch": 7.348232154769846, "ref_ce_loss": 0.12529228627681732, "step": 22030 }, { "epoch": 7.351567711807872, "loss": 0.5068, "step": 22040 }, { "epoch": 7.351567711807872, "grad_norm": 1.3046250343322754, "step": 22040 }, { "epoch": 7.351567711807872, "learning_rate": 0.00013835272620247717, "step": 22040 }, { "epoch": 7.351567711807872, "loss": 0.3885897696018219, "step": 22040 }, { "ce_loss": 0.07588858157396317, "epoch": 7.351567711807872, "step": 22040 }, { "distill_loss": 0.2265595644712448, "epoch": 7.351567711807872, "step": 22040 }, { "epoch": 7.351567711807872, "ref_ce_loss": 0.08590566366910934, "step": 22040 }, { "epoch": 7.351567711807872, "loss": 0.41721630096435547, "step": 22040 }, { "ce_loss": 0.10608571022748947, "epoch": 7.351567711807872, "step": 22040 }, { "distill_loss": 0.21725106239318848, "epoch": 7.351567711807872, "step": 22040 }, { "epoch": 7.351567711807872, "ref_ce_loss": 0.09329008311033249, "step": 22040 }, { "epoch": 7.354903268845897, "loss": 0.5419, "step": 22050 }, { "epoch": 7.354903268845897, "grad_norm": 1.8390084505081177, "step": 22050 }, { "epoch": 7.354903268845897, "learning_rate": 0.00013802601833533745, "step": 22050 }, { "epoch": 7.354903268845897, "loss": 0.40415939688682556, "step": 22050 }, { "ce_loss": 0.08732908219099045, "epoch": 7.354903268845897, "step": 22050 }, { "distill_loss": 0.20429277420043945, "epoch": 7.354903268845897, "step": 22050 }, { "epoch": 7.354903268845897, "ref_ce_loss": 0.11229567974805832, "step": 22050 }, { "epoch": 7.354903268845897, "loss": 0.35471150279045105, "step": 22050 }, { "ce_loss": 0.05761922150850296, "epoch": 7.354903268845897, "step": 22050 }, { "distill_loss": 0.16647978127002716, "epoch": 7.354903268845897, "step": 22050 }, { "epoch": 7.354903268845897, "ref_ce_loss": 0.09119325131177902, "step": 22050 }, { "epoch": 7.358238825883923, "loss": 0.5271, "step": 22060 }, { "epoch": 7.358238825883923, "grad_norm": 1.500783920288086, "step": 22060 }, { "epoch": 7.358238825883923, "learning_rate": 0.00013769961622002393, "step": 22060 }, { "epoch": 7.358238825883923, "loss": 0.4502326548099518, "step": 22060 }, { "ce_loss": 0.09496644884347916, "epoch": 7.358238825883923, "step": 22060 }, { "distill_loss": 0.21306586265563965, "epoch": 7.358238825883923, "step": 22060 }, { "epoch": 7.358238825883923, "ref_ce_loss": 0.08571892231702805, "step": 22060 }, { "epoch": 7.358238825883923, "loss": 0.4776359796524048, "step": 22060 }, { "ce_loss": 0.1146068349480629, "epoch": 7.358238825883923, "step": 22060 }, { "distill_loss": 0.2597961127758026, "epoch": 7.358238825883923, "step": 22060 }, { "epoch": 7.358238825883923, "ref_ce_loss": 0.07996554672718048, "step": 22060 }, { "epoch": 7.3615743829219475, "loss": 0.5446, "step": 22070 }, { "epoch": 7.3615743829219475, "grad_norm": 3.993879556655884, "step": 22070 }, { "epoch": 7.3615743829219475, "learning_rate": 0.000137373520237483, "step": 22070 }, { "epoch": 7.3615743829219475, "loss": 0.4738246500492096, "step": 22070 }, { "ce_loss": 0.08311747014522552, "epoch": 7.3615743829219475, "step": 22070 }, { "distill_loss": 0.19675718247890472, "epoch": 7.3615743829219475, "step": 22070 }, { "epoch": 7.3615743829219475, "ref_ce_loss": 0.10863441973924637, "step": 22070 }, { "epoch": 7.3615743829219475, "loss": 0.7610086798667908, "step": 22070 }, { "ce_loss": 0.13840071856975555, "epoch": 7.3615743829219475, "step": 22070 }, { "distill_loss": 0.2668496072292328, "epoch": 7.3615743829219475, "step": 22070 }, { "epoch": 7.3615743829219475, "ref_ce_loss": 0.09363804757595062, "step": 22070 }, { "epoch": 7.364909939959974, "loss": 0.5033, "step": 22080 }, { "epoch": 7.364909939959974, "grad_norm": 2.0557427406311035, "step": 22080 }, { "epoch": 7.364909939959974, "learning_rate": 0.00013704773076830378, "step": 22080 }, { "epoch": 7.364909939959974, "loss": 0.7582910656929016, "step": 22080 }, { "ce_loss": 0.1402321755886078, "epoch": 7.364909939959974, "step": 22080 }, { "distill_loss": 0.28201544284820557, "epoch": 7.364909939959974, "step": 22080 }, { "epoch": 7.364909939959974, "ref_ce_loss": 0.0874849334359169, "step": 22080 }, { "epoch": 7.364909939959974, "loss": 0.7125459313392639, "step": 22080 }, { "ce_loss": 0.10863231867551804, "epoch": 7.364909939959974, "step": 22080 }, { "distill_loss": 0.2794286906719208, "epoch": 7.364909939959974, "step": 22080 }, { "epoch": 7.364909939959974, "ref_ce_loss": 0.08346754312515259, "step": 22080 }, { "epoch": 7.368245496997998, "loss": 0.5462, "step": 22090 }, { "epoch": 7.368245496997998, "grad_norm": 1.49296236038208, "step": 22090 }, { "epoch": 7.368245496997998, "learning_rate": 0.0001367222481927175, "step": 22090 }, { "epoch": 7.368245496997998, "loss": 0.4915519952774048, "step": 22090 }, { "ce_loss": 0.09840986132621765, "epoch": 7.368245496997998, "step": 22090 }, { "distill_loss": 0.20584022998809814, "epoch": 7.368245496997998, "step": 22090 }, { "epoch": 7.368245496997998, "ref_ce_loss": 0.1020207479596138, "step": 22090 }, { "epoch": 7.368245496997998, "loss": 0.40518438816070557, "step": 22090 }, { "ce_loss": 0.057568494230508804, "epoch": 7.368245496997998, "step": 22090 }, { "distill_loss": 0.21791799366474152, "epoch": 7.368245496997998, "step": 22090 }, { "epoch": 7.368245496997998, "ref_ce_loss": 0.08963776379823685, "step": 22090 }, { "epoch": 7.3715810540360245, "loss": 0.4666, "step": 22100 }, { "epoch": 7.3715810540360245, "grad_norm": 1.6419566869735718, "step": 22100 }, { "epoch": 7.3715810540360245, "learning_rate": 0.0001363970728905975, "step": 22100 }, { "epoch": 7.3715810540360245, "loss": 0.652350127696991, "step": 22100 }, { "ce_loss": 0.08931543678045273, "epoch": 7.3715810540360245, "step": 22100 }, { "distill_loss": 0.29681864380836487, "epoch": 7.3715810540360245, "step": 22100 }, { "epoch": 7.3715810540360245, "ref_ce_loss": 0.09608305990695953, "step": 22100 }, { "epoch": 7.3715810540360245, "loss": 0.34036093950271606, "step": 22100 }, { "ce_loss": 0.04965618625283241, "epoch": 7.3715810540360245, "step": 22100 }, { "distill_loss": 0.18944749236106873, "epoch": 7.3715810540360245, "step": 22100 }, { "epoch": 7.3715810540360245, "ref_ce_loss": 0.08181827515363693, "step": 22100 }, { "epoch": 7.374916611074049, "loss": 0.513, "step": 22110 }, { "epoch": 7.374916611074049, "grad_norm": 1.180824637413025, "step": 22110 }, { "epoch": 7.374916611074049, "learning_rate": 0.0001360722052414582, "step": 22110 }, { "epoch": 7.374916611074049, "loss": 0.5612049102783203, "step": 22110 }, { "ce_loss": 0.1324518769979477, "epoch": 7.374916611074049, "step": 22110 }, { "distill_loss": 0.2755542993545532, "epoch": 7.374916611074049, "step": 22110 }, { "epoch": 7.374916611074049, "ref_ce_loss": 0.12568174302577972, "step": 22110 }, { "epoch": 7.374916611074049, "loss": 0.576231062412262, "step": 22110 }, { "ce_loss": 0.11681327223777771, "epoch": 7.374916611074049, "step": 22110 }, { "distill_loss": 0.21474020183086395, "epoch": 7.374916611074049, "step": 22110 }, { "epoch": 7.374916611074049, "ref_ce_loss": 0.11753689497709274, "step": 22110 }, { "epoch": 7.378252168112075, "loss": 0.5058, "step": 22120 }, { "epoch": 7.378252168112075, "grad_norm": 1.7695060968399048, "step": 22120 }, { "epoch": 7.378252168112075, "learning_rate": 0.0001357476456244552, "step": 22120 }, { "epoch": 7.378252168112075, "loss": 0.39721181988716125, "step": 22120 }, { "ce_loss": 0.0658101886510849, "epoch": 7.378252168112075, "step": 22120 }, { "distill_loss": 0.22495456039905548, "epoch": 7.378252168112075, "step": 22120 }, { "epoch": 7.378252168112075, "ref_ce_loss": 0.08024078607559204, "step": 22120 }, { "epoch": 7.378252168112075, "loss": 0.5268717408180237, "step": 22120 }, { "ce_loss": 0.1184745505452156, "epoch": 7.378252168112075, "step": 22120 }, { "distill_loss": 0.25744789838790894, "epoch": 7.378252168112075, "step": 22120 }, { "epoch": 7.378252168112075, "ref_ce_loss": 0.0897599533200264, "step": 22120 }, { "epoch": 7.3815877251501, "loss": 0.4758, "step": 22130 }, { "epoch": 7.3815877251501, "grad_norm": 1.3974146842956543, "step": 22130 }, { "epoch": 7.3815877251501, "learning_rate": 0.00013542339441838453, "step": 22130 }, { "epoch": 7.3815877251501, "loss": 0.4541488587856293, "step": 22130 }, { "ce_loss": 0.0709289163351059, "epoch": 7.3815877251501, "step": 22130 }, { "distill_loss": 0.2584846615791321, "epoch": 7.3815877251501, "step": 22130 }, { "epoch": 7.3815877251501, "ref_ce_loss": 0.09032522141933441, "step": 22130 }, { "epoch": 7.3815877251501, "loss": 0.7902992367744446, "step": 22130 }, { "ce_loss": 0.07666203379631042, "epoch": 7.3815877251501, "step": 22130 }, { "distill_loss": 0.26494210958480835, "epoch": 7.3815877251501, "step": 22130 }, { "epoch": 7.3815877251501, "ref_ce_loss": 0.09109896421432495, "step": 22130 }, { "epoch": 7.384923282188126, "loss": 0.4863, "step": 22140 }, { "epoch": 7.384923282188126, "grad_norm": 1.9298646450042725, "step": 22140 }, { "epoch": 7.384923282188126, "learning_rate": 0.00013509945200168217, "step": 22140 }, { "epoch": 7.384923282188126, "loss": 0.3699907064437866, "step": 22140 }, { "ce_loss": 0.0853923037648201, "epoch": 7.384923282188126, "step": 22140 }, { "distill_loss": 0.175503671169281, "epoch": 7.384923282188126, "step": 22140 }, { "epoch": 7.384923282188126, "ref_ce_loss": 0.10878586769104004, "step": 22140 }, { "epoch": 7.384923282188126, "loss": 0.49869394302368164, "step": 22140 }, { "ce_loss": 0.10805847495794296, "epoch": 7.384923282188126, "step": 22140 }, { "distill_loss": 0.2761743664741516, "epoch": 7.384923282188126, "step": 22140 }, { "epoch": 7.384923282188126, "ref_ce_loss": 0.09387902915477753, "step": 22140 }, { "epoch": 7.38825883922615, "loss": 0.5507, "step": 22150 }, { "epoch": 7.38825883922615, "grad_norm": 0.9105316400527954, "step": 22150 }, { "epoch": 7.38825883922615, "learning_rate": 0.00013477581875242391, "step": 22150 }, { "epoch": 7.38825883922615, "loss": 0.5569889545440674, "step": 22150 }, { "ce_loss": 0.09075947850942612, "epoch": 7.38825883922615, "step": 22150 }, { "distill_loss": 0.20240755379199982, "epoch": 7.38825883922615, "step": 22150 }, { "epoch": 7.38825883922615, "ref_ce_loss": 0.09280704706907272, "step": 22150 }, { "epoch": 7.38825883922615, "loss": 0.40518975257873535, "step": 22150 }, { "ce_loss": 0.06980345398187637, "epoch": 7.38825883922615, "step": 22150 }, { "distill_loss": 0.2161484658718109, "epoch": 7.38825883922615, "step": 22150 }, { "epoch": 7.38825883922615, "ref_ce_loss": 0.0939449593424797, "step": 22150 }, { "epoch": 7.391594396264177, "loss": 0.5112, "step": 22160 }, { "epoch": 7.391594396264177, "grad_norm": 1.392278790473938, "step": 22160 }, { "epoch": 7.391594396264177, "learning_rate": 0.00013445249504832435, "step": 22160 }, { "epoch": 7.391594396264177, "loss": 0.4864904284477234, "step": 22160 }, { "ce_loss": 0.1145547479391098, "epoch": 7.391594396264177, "step": 22160 }, { "distill_loss": 0.1910143345594406, "epoch": 7.391594396264177, "step": 22160 }, { "epoch": 7.391594396264177, "ref_ce_loss": 0.09960886090993881, "step": 22160 }, { "epoch": 7.391594396264177, "loss": 0.4335716962814331, "step": 22160 }, { "ce_loss": 0.08965346217155457, "epoch": 7.391594396264177, "step": 22160 }, { "distill_loss": 0.1827930063009262, "epoch": 7.391594396264177, "step": 22160 }, { "epoch": 7.391594396264177, "ref_ce_loss": 0.13129045069217682, "step": 22160 }, { "epoch": 7.394929953302201, "loss": 0.521, "step": 22170 }, { "epoch": 7.394929953302201, "grad_norm": 1.4772255420684814, "step": 22170 }, { "epoch": 7.394929953302201, "learning_rate": 0.00013412948126673716, "step": 22170 }, { "epoch": 7.394929953302201, "loss": 0.48208677768707275, "step": 22170 }, { "ce_loss": 0.09089218825101852, "epoch": 7.394929953302201, "step": 22170 }, { "distill_loss": 0.18222153186798096, "epoch": 7.394929953302201, "step": 22170 }, { "epoch": 7.394929953302201, "ref_ce_loss": 0.08755753189325333, "step": 22170 }, { "epoch": 7.394929953302201, "loss": 0.4618496000766754, "step": 22170 }, { "ce_loss": 0.08060234785079956, "epoch": 7.394929953302201, "step": 22170 }, { "distill_loss": 0.21322056651115417, "epoch": 7.394929953302201, "step": 22170 }, { "epoch": 7.394929953302201, "ref_ce_loss": 0.09799622744321823, "step": 22170 }, { "epoch": 7.398265510340227, "loss": 0.4817, "step": 22180 }, { "epoch": 7.398265510340227, "grad_norm": 1.0283700227737427, "step": 22180 }, { "epoch": 7.398265510340227, "learning_rate": 0.00013380677778465421, "step": 22180 }, { "epoch": 7.398265510340227, "loss": 0.5956416130065918, "step": 22180 }, { "ce_loss": 0.1274511069059372, "epoch": 7.398265510340227, "step": 22180 }, { "distill_loss": 0.23653876781463623, "epoch": 7.398265510340227, "step": 22180 }, { "epoch": 7.398265510340227, "ref_ce_loss": 0.12005682289600372, "step": 22180 }, { "epoch": 7.398265510340227, "loss": 0.5424402952194214, "step": 22180 }, { "ce_loss": 0.09740731120109558, "epoch": 7.398265510340227, "step": 22180 }, { "distill_loss": 0.2374219298362732, "epoch": 7.398265510340227, "step": 22180 }, { "epoch": 7.398265510340227, "ref_ce_loss": 0.12457883358001709, "step": 22180 }, { "epoch": 7.401601067378252, "loss": 0.4967, "step": 22190 }, { "epoch": 7.401601067378252, "grad_norm": 1.1883015632629395, "step": 22190 }, { "epoch": 7.401601067378252, "learning_rate": 0.00013348438497870518, "step": 22190 }, { "epoch": 7.401601067378252, "loss": 0.356072336435318, "step": 22190 }, { "ce_loss": 0.044157691299915314, "epoch": 7.401601067378252, "step": 22190 }, { "distill_loss": 0.2180037796497345, "epoch": 7.401601067378252, "step": 22190 }, { "epoch": 7.401601067378252, "ref_ce_loss": 0.0936390832066536, "step": 22190 }, { "epoch": 7.401601067378252, "loss": 0.47434988617897034, "step": 22190 }, { "ce_loss": 0.10346195101737976, "epoch": 7.401601067378252, "step": 22190 }, { "distill_loss": 0.23020198941230774, "epoch": 7.401601067378252, "step": 22190 }, { "epoch": 7.401601067378252, "ref_ce_loss": 0.10282209515571594, "step": 22190 }, { "epoch": 7.404936624416278, "loss": 0.479, "step": 22200 }, { "epoch": 7.404936624416278, "grad_norm": 1.337052583694458, "step": 22200 }, { "epoch": 7.404936624416278, "learning_rate": 0.0001331623032251572, "step": 22200 }, { "epoch": 7.404936624416278, "loss": 0.38373318314552307, "step": 22200 }, { "ce_loss": 0.1033421978354454, "epoch": 7.404936624416278, "step": 22200 }, { "distill_loss": 0.17759321630001068, "epoch": 7.404936624416278, "step": 22200 }, { "epoch": 7.404936624416278, "ref_ce_loss": 0.1023772805929184, "step": 22200 }, { "epoch": 7.404936624416278, "loss": 0.5399565100669861, "step": 22200 }, { "ce_loss": 0.10520821809768677, "epoch": 7.404936624416278, "step": 22200 }, { "distill_loss": 0.20086219906806946, "epoch": 7.404936624416278, "step": 22200 }, { "epoch": 7.404936624416278, "ref_ce_loss": 0.1283208280801773, "step": 22200 }, { "epoch": 7.408272181454302, "loss": 0.5298, "step": 22210 }, { "epoch": 7.408272181454302, "grad_norm": 1.4969794750213623, "step": 22210 }, { "epoch": 7.408272181454302, "learning_rate": 0.00013284053289991423, "step": 22210 }, { "epoch": 7.408272181454302, "loss": 0.4264291822910309, "step": 22210 }, { "ce_loss": 0.07203280925750732, "epoch": 7.408272181454302, "step": 22210 }, { "distill_loss": 0.21674410998821259, "epoch": 7.408272181454302, "step": 22210 }, { "epoch": 7.408272181454302, "ref_ce_loss": 0.08955416083335876, "step": 22210 }, { "epoch": 7.408272181454302, "loss": 0.46230363845825195, "step": 22210 }, { "ce_loss": 0.07309994846582413, "epoch": 7.408272181454302, "step": 22210 }, { "distill_loss": 0.2028307467699051, "epoch": 7.408272181454302, "step": 22210 }, { "epoch": 7.408272181454302, "ref_ce_loss": 0.09434698522090912, "step": 22210 }, { "epoch": 7.411607738492329, "loss": 0.4573, "step": 22220 }, { "epoch": 7.411607738492329, "grad_norm": 1.280541181564331, "step": 22220 }, { "epoch": 7.411607738492329, "learning_rate": 0.00013251907437851674, "step": 22220 }, { "epoch": 7.411607738492329, "loss": 0.3875102698802948, "step": 22220 }, { "ce_loss": 0.08745839446783066, "epoch": 7.411607738492329, "step": 22220 }, { "distill_loss": 0.19196641445159912, "epoch": 7.411607738492329, "step": 22220 }, { "epoch": 7.411607738492329, "ref_ce_loss": 0.10794602334499359, "step": 22220 }, { "epoch": 7.411607738492329, "loss": 0.7894054651260376, "step": 22220 }, { "ce_loss": 0.07735282927751541, "epoch": 7.411607738492329, "step": 22220 }, { "distill_loss": 0.20096616446971893, "epoch": 7.411607738492329, "step": 22220 }, { "epoch": 7.411607738492329, "ref_ce_loss": 0.08367619663476944, "step": 22220 }, { "epoch": 7.414943295530353, "loss": 0.5061, "step": 22230 }, { "epoch": 7.414943295530353, "grad_norm": 1.6839677095413208, "step": 22230 }, { "epoch": 7.414943295530353, "learning_rate": 0.00013219792803614183, "step": 22230 }, { "epoch": 7.414943295530353, "loss": 0.43914952874183655, "step": 22230 }, { "ce_loss": 0.11902012676000595, "epoch": 7.414943295530353, "step": 22230 }, { "distill_loss": 0.2054997980594635, "epoch": 7.414943295530353, "step": 22230 }, { "epoch": 7.414943295530353, "ref_ce_loss": 0.060020480304956436, "step": 22230 }, { "epoch": 7.414943295530353, "loss": 0.8430578708648682, "step": 22230 }, { "ce_loss": 0.17063359916210175, "epoch": 7.414943295530353, "step": 22230 }, { "distill_loss": 0.29544597864151, "epoch": 7.414943295530353, "step": 22230 }, { "epoch": 7.414943295530353, "ref_ce_loss": 0.1225108951330185, "step": 22230 }, { "epoch": 7.418278852568379, "loss": 0.5245, "step": 22240 }, { "epoch": 7.418278852568379, "grad_norm": 1.309335470199585, "step": 22240 }, { "epoch": 7.418278852568379, "learning_rate": 0.00013187709424760153, "step": 22240 }, { "epoch": 7.418278852568379, "loss": 0.5299336910247803, "step": 22240 }, { "ce_loss": 0.0785478726029396, "epoch": 7.418278852568379, "step": 22240 }, { "distill_loss": 0.30060285329818726, "epoch": 7.418278852568379, "step": 22240 }, { "epoch": 7.418278852568379, "ref_ce_loss": 0.11685214936733246, "step": 22240 }, { "epoch": 7.418278852568379, "loss": 0.3738112449645996, "step": 22240 }, { "ce_loss": 0.07748304307460785, "epoch": 7.418278852568379, "step": 22240 }, { "distill_loss": 0.22111056745052338, "epoch": 7.418278852568379, "step": 22240 }, { "epoch": 7.418278852568379, "ref_ce_loss": 0.07467032223939896, "step": 22240 }, { "epoch": 7.421614409606404, "loss": 0.4995, "step": 22250 }, { "epoch": 7.421614409606404, "grad_norm": 1.4998759031295776, "step": 22250 }, { "epoch": 7.421614409606404, "learning_rate": 0.0001315565733873434, "step": 22250 }, { "epoch": 7.421614409606404, "loss": 0.6699790358543396, "step": 22250 }, { "ce_loss": 0.05923834070563316, "epoch": 7.421614409606404, "step": 22250 }, { "distill_loss": 0.17364928126335144, "epoch": 7.421614409606404, "step": 22250 }, { "epoch": 7.421614409606404, "ref_ce_loss": 0.07846441119909286, "step": 22250 }, { "epoch": 7.421614409606404, "loss": 0.4596516788005829, "step": 22250 }, { "ce_loss": 0.0986219048500061, "epoch": 7.421614409606404, "step": 22250 }, { "distill_loss": 0.24059095978736877, "epoch": 7.421614409606404, "step": 22250 }, { "epoch": 7.421614409606404, "ref_ce_loss": 0.09746402502059937, "step": 22250 }, { "epoch": 7.42494996664443, "loss": 0.4921, "step": 22260 }, { "epoch": 7.42494996664443, "grad_norm": 1.2402822971343994, "step": 22260 }, { "epoch": 7.42494996664443, "learning_rate": 0.00013123636582944984, "step": 22260 }, { "epoch": 7.42494996664443, "loss": 0.500768780708313, "step": 22260 }, { "ce_loss": 0.10541671514511108, "epoch": 7.42494996664443, "step": 22260 }, { "distill_loss": 0.22422003746032715, "epoch": 7.42494996664443, "step": 22260 }, { "epoch": 7.42494996664443, "ref_ce_loss": 0.09545684605836868, "step": 22260 }, { "epoch": 7.42494996664443, "loss": 0.47601836919784546, "step": 22260 }, { "ce_loss": 0.086974136531353, "epoch": 7.42494996664443, "step": 22260 }, { "distill_loss": 0.20608215034008026, "epoch": 7.42494996664443, "step": 22260 }, { "epoch": 7.42494996664443, "ref_ce_loss": 0.09012197703123093, "step": 22260 }, { "epoch": 7.4282855236824545, "loss": 0.4806, "step": 22270 }, { "epoch": 7.4282855236824545, "grad_norm": 1.1672219038009644, "step": 22270 }, { "epoch": 7.4282855236824545, "learning_rate": 0.00013091647194763767, "step": 22270 }, { "epoch": 7.4282855236824545, "loss": 0.3636277914047241, "step": 22270 }, { "ce_loss": 0.05640361085534096, "epoch": 7.4282855236824545, "step": 22270 }, { "distill_loss": 0.15501046180725098, "epoch": 7.4282855236824545, "step": 22270 }, { "epoch": 7.4282855236824545, "ref_ce_loss": 0.103425532579422, "step": 22270 }, { "epoch": 7.4282855236824545, "loss": 0.4259013235569, "step": 22270 }, { "ce_loss": 0.07275062799453735, "epoch": 7.4282855236824545, "step": 22270 }, { "distill_loss": 0.19085094332695007, "epoch": 7.4282855236824545, "step": 22270 }, { "epoch": 7.4282855236824545, "ref_ce_loss": 0.0643162727355957, "step": 22270 }, { "epoch": 7.431621080720481, "loss": 0.4486, "step": 22280 }, { "epoch": 7.431621080720481, "grad_norm": 1.9622478485107422, "step": 22280 }, { "epoch": 7.431621080720481, "learning_rate": 0.0001305968921152572, "step": 22280 }, { "epoch": 7.431621080720481, "loss": 0.4420868456363678, "step": 22280 }, { "ce_loss": 0.13633877038955688, "epoch": 7.431621080720481, "step": 22280 }, { "distill_loss": 0.2052733451128006, "epoch": 7.431621080720481, "step": 22280 }, { "epoch": 7.431621080720481, "ref_ce_loss": 0.08635461330413818, "step": 22280 }, { "epoch": 7.431621080720481, "loss": 0.47934165596961975, "step": 22280 }, { "ce_loss": 0.08335137367248535, "epoch": 7.431621080720481, "step": 22280 }, { "distill_loss": 0.2924102246761322, "epoch": 7.431621080720481, "step": 22280 }, { "epoch": 7.431621080720481, "ref_ce_loss": 0.10293826460838318, "step": 22280 }, { "epoch": 7.434956637758505, "loss": 0.5194, "step": 22290 }, { "epoch": 7.434956637758505, "grad_norm": 1.7894092798233032, "step": 22290 }, { "epoch": 7.434956637758505, "learning_rate": 0.00013027762670529263, "step": 22290 }, { "epoch": 7.434956637758505, "loss": 0.5704688429832458, "step": 22290 }, { "ce_loss": 0.10463081300258636, "epoch": 7.434956637758505, "step": 22290 }, { "distill_loss": 0.22579312324523926, "epoch": 7.434956637758505, "step": 22290 }, { "epoch": 7.434956637758505, "ref_ce_loss": 0.06787216663360596, "step": 22290 }, { "epoch": 7.434956637758505, "loss": 0.3507632911205292, "step": 22290 }, { "ce_loss": 0.08278298377990723, "epoch": 7.434956637758505, "step": 22290 }, { "distill_loss": 0.19265946745872498, "epoch": 7.434956637758505, "step": 22290 }, { "epoch": 7.434956637758505, "ref_ce_loss": 0.07519949972629547, "step": 22290 }, { "epoch": 7.4382921947965315, "loss": 0.4832, "step": 22300 }, { "epoch": 7.4382921947965315, "grad_norm": 1.4802080392837524, "step": 22300 }, { "epoch": 7.4382921947965315, "learning_rate": 0.00012995867609036097, "step": 22300 }, { "epoch": 7.4382921947965315, "loss": 0.40042388439178467, "step": 22300 }, { "ce_loss": 0.09222708642482758, "epoch": 7.4382921947965315, "step": 22300 }, { "distill_loss": 0.20333006978034973, "epoch": 7.4382921947965315, "step": 22300 }, { "epoch": 7.4382921947965315, "ref_ce_loss": 0.06465158611536026, "step": 22300 }, { "epoch": 7.4382921947965315, "loss": 0.4406663775444031, "step": 22300 }, { "ce_loss": 0.08464495092630386, "epoch": 7.4382921947965315, "step": 22300 }, { "distill_loss": 0.22818736732006073, "epoch": 7.4382921947965315, "step": 22300 }, { "epoch": 7.4382921947965315, "ref_ce_loss": 0.09121893346309662, "step": 22300 }, { "epoch": 7.441627751834556, "loss": 0.5065, "step": 22310 }, { "epoch": 7.441627751834556, "grad_norm": 1.4099823236465454, "step": 22310 }, { "epoch": 7.441627751834556, "learning_rate": 0.0001296400406427121, "step": 22310 }, { "epoch": 7.441627751834556, "loss": 0.5115585327148438, "step": 22310 }, { "ce_loss": 0.10224932432174683, "epoch": 7.441627751834556, "step": 22310 }, { "distill_loss": 0.19686448574066162, "epoch": 7.441627751834556, "step": 22310 }, { "epoch": 7.441627751834556, "ref_ce_loss": 0.10360269248485565, "step": 22310 }, { "epoch": 7.441627751834556, "loss": 0.44680503010749817, "step": 22310 }, { "ce_loss": 0.08968012034893036, "epoch": 7.441627751834556, "step": 22310 }, { "distill_loss": 0.22415149211883545, "epoch": 7.441627751834556, "step": 22310 }, { "epoch": 7.441627751834556, "ref_ce_loss": 0.09783569723367691, "step": 22310 }, { "epoch": 7.444963308872582, "loss": 0.4535, "step": 22320 }, { "epoch": 7.444963308872582, "grad_norm": 1.3058733940124512, "step": 22320 }, { "epoch": 7.444963308872582, "learning_rate": 0.00012932172073422765, "step": 22320 }, { "epoch": 7.444963308872582, "loss": 0.3406934142112732, "step": 22320 }, { "ce_loss": 0.07440505921840668, "epoch": 7.444963308872582, "step": 22320 }, { "distill_loss": 0.16044959425926208, "epoch": 7.444963308872582, "step": 22320 }, { "epoch": 7.444963308872582, "ref_ce_loss": 0.06553249061107635, "step": 22320 }, { "epoch": 7.444963308872582, "loss": 0.4342862069606781, "step": 22320 }, { "ce_loss": 0.09907007962465286, "epoch": 7.444963308872582, "step": 22320 }, { "distill_loss": 0.2205602079629898, "epoch": 7.444963308872582, "step": 22320 }, { "epoch": 7.444963308872582, "ref_ce_loss": 0.08764271438121796, "step": 22320 }, { "epoch": 7.448298865910607, "loss": 0.4604, "step": 22330 }, { "epoch": 7.448298865910607, "grad_norm": 4.206222057342529, "step": 22330 }, { "epoch": 7.448298865910607, "learning_rate": 0.00012900371673642112, "step": 22330 }, { "epoch": 7.448298865910607, "loss": 0.5938883423805237, "step": 22330 }, { "ce_loss": 0.1305702179670334, "epoch": 7.448298865910607, "step": 22330 }, { "distill_loss": 0.3110573887825012, "epoch": 7.448298865910607, "step": 22330 }, { "epoch": 7.448298865910607, "ref_ce_loss": 0.11513999104499817, "step": 22330 }, { "epoch": 7.448298865910607, "loss": 0.4503992199897766, "step": 22330 }, { "ce_loss": 0.10052350908517838, "epoch": 7.448298865910607, "step": 22330 }, { "distill_loss": 0.18781507015228271, "epoch": 7.448298865910607, "step": 22330 }, { "epoch": 7.448298865910607, "ref_ce_loss": 0.11676362156867981, "step": 22330 }, { "epoch": 7.451634422948633, "loss": 0.5262, "step": 22340 }, { "epoch": 7.451634422948633, "grad_norm": 3.2831857204437256, "step": 22340 }, { "epoch": 7.451634422948633, "learning_rate": 0.00012868602902043783, "step": 22340 }, { "epoch": 7.451634422948633, "loss": 0.3693366050720215, "step": 22340 }, { "ce_loss": 0.06226786598563194, "epoch": 7.451634422948633, "step": 22340 }, { "distill_loss": 0.18790769577026367, "epoch": 7.451634422948633, "step": 22340 }, { "epoch": 7.451634422948633, "ref_ce_loss": 0.07951121777296066, "step": 22340 }, { "epoch": 7.451634422948633, "loss": 0.3679620921611786, "step": 22340 }, { "ce_loss": 0.08266917616128922, "epoch": 7.451634422948633, "step": 22340 }, { "distill_loss": 0.19653236865997314, "epoch": 7.451634422948633, "step": 22340 }, { "epoch": 7.451634422948633, "ref_ce_loss": 0.08857674896717072, "step": 22340 }, { "epoch": 7.454969979986657, "loss": 0.4677, "step": 22350 }, { "epoch": 7.454969979986657, "grad_norm": 1.5119948387145996, "step": 22350 }, { "epoch": 7.454969979986657, "learning_rate": 0.00012836865795705314, "step": 22350 }, { "epoch": 7.454969979986657, "loss": 0.43310967087745667, "step": 22350 }, { "ce_loss": 0.09349098801612854, "epoch": 7.454969979986657, "step": 22350 }, { "distill_loss": 0.2101289927959442, "epoch": 7.454969979986657, "step": 22350 }, { "epoch": 7.454969979986657, "ref_ce_loss": 0.08622722327709198, "step": 22350 }, { "epoch": 7.454969979986657, "loss": 0.4472948908805847, "step": 22350 }, { "ce_loss": 0.08247264474630356, "epoch": 7.454969979986657, "step": 22350 }, { "distill_loss": 0.20638509094715118, "epoch": 7.454969979986657, "step": 22350 }, { "epoch": 7.454969979986657, "ref_ce_loss": 0.11921993643045425, "step": 22350 }, { "epoch": 7.458305537024684, "loss": 0.4832, "step": 22360 }, { "epoch": 7.458305537024684, "grad_norm": 1.893993854522705, "step": 22360 }, { "epoch": 7.458305537024684, "learning_rate": 0.00012805160391667338, "step": 22360 }, { "epoch": 7.458305537024684, "loss": 0.43647223711013794, "step": 22360 }, { "ce_loss": 0.10963376611471176, "epoch": 7.458305537024684, "step": 22360 }, { "distill_loss": 0.2051556259393692, "epoch": 7.458305537024684, "step": 22360 }, { "epoch": 7.458305537024684, "ref_ce_loss": 0.09101174026727676, "step": 22360 }, { "epoch": 7.458305537024684, "loss": 0.3410559594631195, "step": 22360 }, { "ce_loss": 0.06607978045940399, "epoch": 7.458305537024684, "step": 22360 }, { "distill_loss": 0.19104278087615967, "epoch": 7.458305537024684, "step": 22360 }, { "epoch": 7.458305537024684, "ref_ce_loss": 0.08367554843425751, "step": 22360 }, { "epoch": 7.461641094062708, "loss": 0.513, "step": 22370 }, { "epoch": 7.461641094062708, "grad_norm": 1.6382286548614502, "step": 22370 }, { "epoch": 7.461641094062708, "learning_rate": 0.00012773486726933467, "step": 22370 }, { "epoch": 7.461641094062708, "loss": 0.6054667830467224, "step": 22370 }, { "ce_loss": 0.09302318841218948, "epoch": 7.461641094062708, "step": 22370 }, { "distill_loss": 0.22380737960338593, "epoch": 7.461641094062708, "step": 22370 }, { "epoch": 7.461641094062708, "ref_ce_loss": 0.09679944068193436, "step": 22370 }, { "epoch": 7.461641094062708, "loss": 0.5648404359817505, "step": 22370 }, { "ce_loss": 0.11022025346755981, "epoch": 7.461641094062708, "step": 22370 }, { "distill_loss": 0.2605384588241577, "epoch": 7.461641094062708, "step": 22370 }, { "epoch": 7.461641094062708, "ref_ce_loss": 0.09326773136854172, "step": 22370 }, { "epoch": 7.464976651100734, "loss": 0.5024, "step": 22380 }, { "epoch": 7.464976651100734, "grad_norm": 1.2432814836502075, "step": 22380 }, { "epoch": 7.464976651100734, "learning_rate": 0.00012741844838470284, "step": 22380 }, { "epoch": 7.464976651100734, "loss": 0.5672869682312012, "step": 22380 }, { "ce_loss": 0.13024158775806427, "epoch": 7.464976651100734, "step": 22380 }, { "distill_loss": 0.2523437738418579, "epoch": 7.464976651100734, "step": 22380 }, { "epoch": 7.464976651100734, "ref_ce_loss": 0.10419787466526031, "step": 22380 }, { "epoch": 7.464976651100734, "loss": 0.3623162806034088, "step": 22380 }, { "ce_loss": 0.0839012935757637, "epoch": 7.464976651100734, "step": 22380 }, { "distill_loss": 0.15986678004264832, "epoch": 7.464976651100734, "step": 22380 }, { "epoch": 7.464976651100734, "ref_ce_loss": 0.06418856233358383, "step": 22380 }, { "epoch": 7.468312208138759, "loss": 0.4916, "step": 22390 }, { "epoch": 7.468312208138759, "grad_norm": 1.126115322113037, "step": 22390 }, { "epoch": 7.468312208138759, "learning_rate": 0.00012710234763207282, "step": 22390 }, { "epoch": 7.468312208138759, "loss": 0.4593971073627472, "step": 22390 }, { "ce_loss": 0.07878857105970383, "epoch": 7.468312208138759, "step": 22390 }, { "distill_loss": 0.16794341802597046, "epoch": 7.468312208138759, "step": 22390 }, { "epoch": 7.468312208138759, "ref_ce_loss": 0.09075488150119781, "step": 22390 }, { "epoch": 7.468312208138759, "loss": 0.7162830829620361, "step": 22390 }, { "ce_loss": 0.11342699080705643, "epoch": 7.468312208138759, "step": 22390 }, { "distill_loss": 0.2433580458164215, "epoch": 7.468312208138759, "step": 22390 }, { "epoch": 7.468312208138759, "ref_ce_loss": 0.10004798322916031, "step": 22390 }, { "epoch": 7.471647765176785, "loss": 0.4843, "step": 22400 }, { "epoch": 7.471647765176785, "grad_norm": 1.6590230464935303, "step": 22400 }, { "epoch": 7.471647765176785, "learning_rate": 0.00012678656538036803, "step": 22400 }, { "epoch": 7.471647765176785, "loss": 0.5215333700180054, "step": 22400 }, { "ce_loss": 0.11010608822107315, "epoch": 7.471647765176785, "step": 22400 }, { "distill_loss": 0.23093946278095245, "epoch": 7.471647765176785, "step": 22400 }, { "epoch": 7.471647765176785, "ref_ce_loss": 0.09201352298259735, "step": 22400 }, { "epoch": 7.471647765176785, "loss": 0.41143471002578735, "step": 22400 }, { "ce_loss": 0.05778210982680321, "epoch": 7.471647765176785, "step": 22400 }, { "distill_loss": 0.20460692048072815, "epoch": 7.471647765176785, "step": 22400 }, { "epoch": 7.471647765176785, "ref_ce_loss": 0.10184299200773239, "step": 22400 }, { "epoch": 7.474983322214809, "loss": 0.488, "step": 22410 }, { "epoch": 7.474983322214809, "grad_norm": 1.2910398244857788, "step": 22410 }, { "epoch": 7.474983322214809, "learning_rate": 0.0001264711019981404, "step": 22410 }, { "epoch": 7.474983322214809, "loss": 0.3339017331600189, "step": 22410 }, { "ce_loss": 0.050458505749702454, "epoch": 7.474983322214809, "step": 22410 }, { "distill_loss": 0.17623841762542725, "epoch": 7.474983322214809, "step": 22410 }, { "epoch": 7.474983322214809, "ref_ce_loss": 0.06894265860319138, "step": 22410 }, { "epoch": 7.474983322214809, "loss": 0.7849916815757751, "step": 22410 }, { "ce_loss": 0.13765177130699158, "epoch": 7.474983322214809, "step": 22410 }, { "distill_loss": 0.2127397358417511, "epoch": 7.474983322214809, "step": 22410 }, { "epoch": 7.474983322214809, "ref_ce_loss": 0.1041124016046524, "step": 22410 }, { "epoch": 7.478318879252836, "loss": 0.4916, "step": 22420 }, { "epoch": 7.478318879252836, "grad_norm": 1.5999616384506226, "step": 22420 }, { "epoch": 7.478318879252836, "learning_rate": 0.00012615595785356963, "step": 22420 }, { "epoch": 7.478318879252836, "loss": 0.29595646262168884, "step": 22420 }, { "ce_loss": 0.05818495899438858, "epoch": 7.478318879252836, "step": 22420 }, { "distill_loss": 0.15219548344612122, "epoch": 7.478318879252836, "step": 22420 }, { "epoch": 7.478318879252836, "ref_ce_loss": 0.08528938889503479, "step": 22420 }, { "epoch": 7.478318879252836, "loss": 0.3686578869819641, "step": 22420 }, { "ce_loss": 0.08551814407110214, "epoch": 7.478318879252836, "step": 22420 }, { "distill_loss": 0.17895744740962982, "epoch": 7.478318879252836, "step": 22420 }, { "epoch": 7.478318879252836, "ref_ce_loss": 0.07805319130420685, "step": 22420 }, { "epoch": 7.48165443629086, "loss": 0.4361, "step": 22430 }, { "epoch": 7.48165443629086, "grad_norm": 1.2545150518417358, "step": 22430 }, { "epoch": 7.48165443629086, "learning_rate": 0.00012584113331446303, "step": 22430 }, { "epoch": 7.48165443629086, "loss": 0.3689488470554352, "step": 22430 }, { "ce_loss": 0.06294714659452438, "epoch": 7.48165443629086, "step": 22430 }, { "distill_loss": 0.16390585899353027, "epoch": 7.48165443629086, "step": 22430 }, { "epoch": 7.48165443629086, "ref_ce_loss": 0.09834092855453491, "step": 22430 }, { "epoch": 7.48165443629086, "loss": 0.4330061674118042, "step": 22430 }, { "ce_loss": 0.0886654183268547, "epoch": 7.48165443629086, "step": 22430 }, { "distill_loss": 0.2353941649198532, "epoch": 7.48165443629086, "step": 22430 }, { "epoch": 7.48165443629086, "ref_ce_loss": 0.0741928294301033, "step": 22430 }, { "epoch": 7.484989993328886, "loss": 0.4324, "step": 22440 }, { "epoch": 7.484989993328886, "grad_norm": 1.5502991676330566, "step": 22440 }, { "epoch": 7.484989993328886, "learning_rate": 0.00012552662874825432, "step": 22440 }, { "epoch": 7.484989993328886, "loss": 0.7424986362457275, "step": 22440 }, { "ce_loss": 0.0946795865893364, "epoch": 7.484989993328886, "step": 22440 }, { "distill_loss": 0.20105278491973877, "epoch": 7.484989993328886, "step": 22440 }, { "epoch": 7.484989993328886, "ref_ce_loss": 0.10257663577795029, "step": 22440 }, { "epoch": 7.484989993328886, "loss": 0.4984404444694519, "step": 22440 }, { "ce_loss": 0.09149035066366196, "epoch": 7.484989993328886, "step": 22440 }, { "distill_loss": 0.28340229392051697, "epoch": 7.484989993328886, "step": 22440 }, { "epoch": 7.484989993328886, "ref_ce_loss": 0.12336947023868561, "step": 22440 }, { "epoch": 7.488325550366911, "loss": 0.4556, "step": 22450 }, { "epoch": 7.488325550366911, "grad_norm": 2.0209858417510986, "step": 22450 }, { "epoch": 7.488325550366911, "learning_rate": 0.00012521244452200455, "step": 22450 }, { "epoch": 7.488325550366911, "loss": 0.47479161620140076, "step": 22450 }, { "ce_loss": 0.08662200719118118, "epoch": 7.488325550366911, "step": 22450 }, { "distill_loss": 0.2342468798160553, "epoch": 7.488325550366911, "step": 22450 }, { "epoch": 7.488325550366911, "ref_ce_loss": 0.0724748894572258, "step": 22450 }, { "epoch": 7.488325550366911, "loss": 0.5271512866020203, "step": 22450 }, { "ce_loss": 0.09744291752576828, "epoch": 7.488325550366911, "step": 22450 }, { "distill_loss": 0.23693248629570007, "epoch": 7.488325550366911, "step": 22450 }, { "epoch": 7.488325550366911, "ref_ce_loss": 0.1411687731742859, "step": 22450 }, { "epoch": 7.491661107404937, "loss": 0.4946, "step": 22460 }, { "epoch": 7.491661107404937, "grad_norm": 1.4775241613388062, "step": 22460 }, { "epoch": 7.491661107404937, "learning_rate": 0.0001248985810024005, "step": 22460 }, { "epoch": 7.491661107404937, "loss": 0.58427894115448, "step": 22460 }, { "ce_loss": 0.09811204671859741, "epoch": 7.491661107404937, "step": 22460 }, { "distill_loss": 0.19793188571929932, "epoch": 7.491661107404937, "step": 22460 }, { "epoch": 7.491661107404937, "ref_ce_loss": 0.09271074831485748, "step": 22460 }, { "epoch": 7.491661107404937, "loss": 0.4697480797767639, "step": 22460 }, { "ce_loss": 0.05923609435558319, "epoch": 7.491661107404937, "step": 22460 }, { "distill_loss": 0.23396554589271545, "epoch": 7.491661107404937, "step": 22460 }, { "epoch": 7.491661107404937, "ref_ce_loss": 0.10728515684604645, "step": 22460 }, { "epoch": 7.4949966644429615, "loss": 0.4636, "step": 22470 }, { "epoch": 7.4949966644429615, "grad_norm": 1.045827865600586, "step": 22470 }, { "epoch": 7.4949966644429615, "learning_rate": 0.00012458503855575446, "step": 22470 }, { "epoch": 7.4949966644429615, "loss": 0.5289336442947388, "step": 22470 }, { "ce_loss": 0.06923147290945053, "epoch": 7.4949966644429615, "step": 22470 }, { "distill_loss": 0.16591979563236237, "epoch": 7.4949966644429615, "step": 22470 }, { "epoch": 7.4949966644429615, "ref_ce_loss": 0.12149394303560257, "step": 22470 }, { "epoch": 7.4949966644429615, "loss": 0.5726809501647949, "step": 22470 }, { "ce_loss": 0.16617202758789062, "epoch": 7.4949966644429615, "step": 22470 }, { "distill_loss": 0.29169028997421265, "epoch": 7.4949966644429615, "step": 22470 }, { "epoch": 7.4949966644429615, "ref_ce_loss": 0.09331735968589783, "step": 22470 }, { "epoch": 7.498332221480988, "loss": 0.4868, "step": 22480 }, { "epoch": 7.498332221480988, "grad_norm": 1.160749912261963, "step": 22480 }, { "epoch": 7.498332221480988, "learning_rate": 0.0001242718175480043, "step": 22480 }, { "epoch": 7.498332221480988, "loss": 0.39735355973243713, "step": 22480 }, { "ce_loss": 0.0984911397099495, "epoch": 7.498332221480988, "step": 22480 }, { "distill_loss": 0.2119746208190918, "epoch": 7.498332221480988, "step": 22480 }, { "epoch": 7.498332221480988, "ref_ce_loss": 0.06830558180809021, "step": 22480 }, { "epoch": 7.498332221480988, "loss": 0.44706398248672485, "step": 22480 }, { "ce_loss": 0.10186761617660522, "epoch": 7.498332221480988, "step": 22480 }, { "distill_loss": 0.2096789926290512, "epoch": 7.498332221480988, "step": 22480 }, { "epoch": 7.498332221480988, "ref_ce_loss": 0.09839505702257156, "step": 22480 }, { "epoch": 7.501667778519012, "loss": 0.512, "step": 22490 }, { "epoch": 7.501667778519012, "grad_norm": 1.7275261878967285, "step": 22490 }, { "epoch": 7.501667778519012, "learning_rate": 0.0001239589183447126, "step": 22490 }, { "epoch": 7.501667778519012, "loss": 0.5226195454597473, "step": 22490 }, { "ce_loss": 0.15770918130874634, "epoch": 7.501667778519012, "step": 22490 }, { "distill_loss": 0.25905734300613403, "epoch": 7.501667778519012, "step": 22490 }, { "epoch": 7.501667778519012, "ref_ce_loss": 0.10564803332090378, "step": 22490 }, { "epoch": 7.501667778519012, "loss": 0.4982665181159973, "step": 22490 }, { "ce_loss": 0.12002474814653397, "epoch": 7.501667778519012, "step": 22490 }, { "distill_loss": 0.2492271214723587, "epoch": 7.501667778519012, "step": 22490 }, { "epoch": 7.501667778519012, "ref_ce_loss": 0.08589009195566177, "step": 22490 }, { "epoch": 7.5050033355570385, "loss": 0.5313, "step": 22500 }, { "epoch": 7.5050033355570385, "grad_norm": 1.6123205423355103, "step": 22500 }, { "epoch": 7.5050033355570385, "learning_rate": 0.00012364634131106664, "step": 22500 }, { "epoch": 7.5050033355570385, "loss": 0.33314186334609985, "step": 22500 }, { "ce_loss": 0.06740890443325043, "epoch": 7.5050033355570385, "step": 22500 }, { "distill_loss": 0.16940796375274658, "epoch": 7.5050033355570385, "step": 22500 }, { "epoch": 7.5050033355570385, "ref_ce_loss": 0.09602712094783783, "step": 22500 }, { "epoch": 7.5050033355570385, "loss": 0.4375060200691223, "step": 22500 }, { "ce_loss": 0.08420051634311676, "epoch": 7.5050033355570385, "step": 22500 }, { "distill_loss": 0.17328374087810516, "epoch": 7.5050033355570385, "step": 22500 }, { "epoch": 7.5050033355570385, "ref_ce_loss": 0.09019183367490768, "step": 22500 }, { "epoch": 7.508338892595063, "loss": 0.5171, "step": 22510 }, { "epoch": 7.508338892595063, "grad_norm": 0.948650598526001, "step": 22510 }, { "epoch": 7.508338892595063, "learning_rate": 0.00012333408681187709, "step": 22510 }, { "epoch": 7.508338892595063, "loss": 0.41246312856674194, "step": 22510 }, { "ce_loss": 0.09295927733182907, "epoch": 7.508338892595063, "step": 22510 }, { "distill_loss": 0.2076326459646225, "epoch": 7.508338892595063, "step": 22510 }, { "epoch": 7.508338892595063, "ref_ce_loss": 0.07924635708332062, "step": 22510 }, { "epoch": 7.508338892595063, "loss": 0.3275887370109558, "step": 22510 }, { "ce_loss": 0.04823889583349228, "epoch": 7.508338892595063, "step": 22510 }, { "distill_loss": 0.1737799495458603, "epoch": 7.508338892595063, "step": 22510 }, { "epoch": 7.508338892595063, "ref_ce_loss": 0.07115225493907928, "step": 22510 }, { "epoch": 7.511674449633089, "loss": 0.4746, "step": 22520 }, { "epoch": 7.511674449633089, "grad_norm": 1.3547013998031616, "step": 22520 }, { "epoch": 7.511674449633089, "learning_rate": 0.00012302215521157867, "step": 22520 }, { "epoch": 7.511674449633089, "loss": 0.48953017592430115, "step": 22520 }, { "ce_loss": 0.1005236953496933, "epoch": 7.511674449633089, "step": 22520 }, { "distill_loss": 0.27121877670288086, "epoch": 7.511674449633089, "step": 22520 }, { "epoch": 7.511674449633089, "ref_ce_loss": 0.09814045578241348, "step": 22520 }, { "epoch": 7.511674449633089, "loss": 0.43054911494255066, "step": 22520 }, { "ce_loss": 0.08458945900201797, "epoch": 7.511674449633089, "step": 22520 }, { "distill_loss": 0.22104483842849731, "epoch": 7.511674449633089, "step": 22520 }, { "epoch": 7.511674449633089, "ref_ce_loss": 0.08826296031475067, "step": 22520 }, { "epoch": 7.515010006671114, "loss": 0.506, "step": 22530 }, { "epoch": 7.515010006671114, "grad_norm": 2.74025821685791, "step": 22530 }, { "epoch": 7.515010006671114, "learning_rate": 0.0001227105468742292, "step": 22530 }, { "epoch": 7.515010006671114, "loss": 0.7157801389694214, "step": 22530 }, { "ce_loss": 0.11807220429182053, "epoch": 7.515010006671114, "step": 22530 }, { "distill_loss": 0.25563985109329224, "epoch": 7.515010006671114, "step": 22530 }, { "epoch": 7.515010006671114, "ref_ce_loss": 0.1178126335144043, "step": 22530 }, { "epoch": 7.515010006671114, "loss": 0.3893582820892334, "step": 22530 }, { "ce_loss": 0.08098858594894409, "epoch": 7.515010006671114, "step": 22530 }, { "distill_loss": 0.17783185839653015, "epoch": 7.515010006671114, "step": 22530 }, { "epoch": 7.515010006671114, "ref_ce_loss": 0.10213252902030945, "step": 22530 }, { "epoch": 7.51834556370914, "loss": 0.5392, "step": 22540 }, { "epoch": 7.51834556370914, "grad_norm": 1.3929061889648438, "step": 22540 }, { "epoch": 7.51834556370914, "learning_rate": 0.00012239926216350928, "step": 22540 }, { "epoch": 7.51834556370914, "loss": 0.5459983348846436, "step": 22540 }, { "ce_loss": 0.11914133280515671, "epoch": 7.51834556370914, "step": 22540 }, { "distill_loss": 0.24583551287651062, "epoch": 7.51834556370914, "step": 22540 }, { "epoch": 7.51834556370914, "ref_ce_loss": 0.10463111847639084, "step": 22540 }, { "epoch": 7.51834556370914, "loss": 0.3390716016292572, "step": 22540 }, { "ce_loss": 0.056259576231241226, "epoch": 7.51834556370914, "step": 22540 }, { "distill_loss": 0.1948683112859726, "epoch": 7.51834556370914, "step": 22540 }, { "epoch": 7.51834556370914, "ref_ce_loss": 0.08741886168718338, "step": 22540 }, { "epoch": 7.521681120747164, "loss": 0.45, "step": 22550 }, { "epoch": 7.521681120747164, "grad_norm": 1.3042694330215454, "step": 22550 }, { "epoch": 7.521681120747164, "learning_rate": 0.00012208830144272117, "step": 22550 }, { "epoch": 7.521681120747164, "loss": 0.34747618436813354, "step": 22550 }, { "ce_loss": 0.0798603743314743, "epoch": 7.521681120747164, "step": 22550 }, { "distill_loss": 0.1654832810163498, "epoch": 7.521681120747164, "step": 22550 }, { "epoch": 7.521681120747164, "ref_ce_loss": 0.06433495879173279, "step": 22550 }, { "epoch": 7.521681120747164, "loss": 0.39804723858833313, "step": 22550 }, { "ce_loss": 0.11774889379739761, "epoch": 7.521681120747164, "step": 22550 }, { "distill_loss": 0.19477730989456177, "epoch": 7.521681120747164, "step": 22550 }, { "epoch": 7.521681120747164, "ref_ce_loss": 0.08538540452718735, "step": 22550 }, { "epoch": 7.525016677785191, "loss": 0.4811, "step": 22560 }, { "epoch": 7.525016677785191, "grad_norm": 1.6051409244537354, "step": 22560 }, { "epoch": 7.525016677785191, "learning_rate": 0.00012177766507478998, "step": 22560 }, { "epoch": 7.525016677785191, "loss": 0.4381016790866852, "step": 22560 }, { "ce_loss": 0.09013054519891739, "epoch": 7.525016677785191, "step": 22560 }, { "distill_loss": 0.21436019241809845, "epoch": 7.525016677785191, "step": 22560 }, { "epoch": 7.525016677785191, "ref_ce_loss": 0.0902925506234169, "step": 22560 }, { "epoch": 7.525016677785191, "loss": 0.4647827744483948, "step": 22560 }, { "ce_loss": 0.08238541334867477, "epoch": 7.525016677785191, "step": 22560 }, { "distill_loss": 0.24512982368469238, "epoch": 7.525016677785191, "step": 22560 }, { "epoch": 7.525016677785191, "ref_ce_loss": 0.10200794041156769, "step": 22560 }, { "epoch": 7.528352234823215, "loss": 0.4775, "step": 22570 }, { "epoch": 7.528352234823215, "grad_norm": 1.4340500831604004, "step": 22570 }, { "epoch": 7.528352234823215, "learning_rate": 0.00012146735342226158, "step": 22570 }, { "epoch": 7.528352234823215, "loss": 0.6594744920730591, "step": 22570 }, { "ce_loss": 0.14710494875907898, "epoch": 7.528352234823215, "step": 22570 }, { "distill_loss": 0.23049712181091309, "epoch": 7.528352234823215, "step": 22570 }, { "epoch": 7.528352234823215, "ref_ce_loss": 0.10443130135536194, "step": 22570 }, { "epoch": 7.528352234823215, "loss": 0.5185823440551758, "step": 22570 }, { "ce_loss": 0.07006262242794037, "epoch": 7.528352234823215, "step": 22570 }, { "distill_loss": 0.23122458159923553, "epoch": 7.528352234823215, "step": 22570 }, { "epoch": 7.528352234823215, "ref_ce_loss": 0.11255393922328949, "step": 22570 }, { "epoch": 7.531687791861241, "loss": 0.5173, "step": 22580 }, { "epoch": 7.531687791861241, "grad_norm": 1.1945619583129883, "step": 22580 }, { "epoch": 7.531687791861241, "learning_rate": 0.00012115736684730326, "step": 22580 }, { "epoch": 7.531687791861241, "loss": 0.58127361536026, "step": 22580 }, { "ce_loss": 0.06672295182943344, "epoch": 7.531687791861241, "step": 22580 }, { "distill_loss": 0.2515077292919159, "epoch": 7.531687791861241, "step": 22580 }, { "epoch": 7.531687791861241, "ref_ce_loss": 0.08761141449213028, "step": 22580 }, { "epoch": 7.531687791861241, "loss": 0.3402024507522583, "step": 22580 }, { "ce_loss": 0.05126715078949928, "epoch": 7.531687791861241, "step": 22580 }, { "distill_loss": 0.18758653104305267, "epoch": 7.531687791861241, "step": 22580 }, { "epoch": 7.531687791861241, "ref_ce_loss": 0.06539114564657211, "step": 22580 }, { "epoch": 7.535023348899266, "loss": 0.4659, "step": 22590 }, { "epoch": 7.535023348899266, "grad_norm": 1.089540958404541, "step": 22590 }, { "epoch": 7.535023348899266, "learning_rate": 0.00012084770571170234, "step": 22590 }, { "epoch": 7.535023348899266, "loss": 0.5159248113632202, "step": 22590 }, { "ce_loss": 0.10363738983869553, "epoch": 7.535023348899266, "step": 22590 }, { "distill_loss": 0.2532828152179718, "epoch": 7.535023348899266, "step": 22590 }, { "epoch": 7.535023348899266, "ref_ce_loss": 0.11867432296276093, "step": 22590 }, { "epoch": 7.535023348899266, "loss": 0.3509080111980438, "step": 22590 }, { "ce_loss": 0.04871860146522522, "epoch": 7.535023348899266, "step": 22590 }, { "distill_loss": 0.17586302757263184, "epoch": 7.535023348899266, "step": 22590 }, { "epoch": 7.535023348899266, "ref_ce_loss": 0.07077796012163162, "step": 22590 }, { "epoch": 7.538358905937292, "loss": 0.4991, "step": 22600 }, { "epoch": 7.538358905937292, "grad_norm": 1.2875605821609497, "step": 22600 }, { "epoch": 7.538358905937292, "learning_rate": 0.00012053837037686694, "step": 22600 }, { "epoch": 7.538358905937292, "loss": 0.4832446575164795, "step": 22600 }, { "ce_loss": 0.09857060760259628, "epoch": 7.538358905937292, "step": 22600 }, { "distill_loss": 0.2361626923084259, "epoch": 7.538358905937292, "step": 22600 }, { "epoch": 7.538358905937292, "ref_ce_loss": 0.11466442048549652, "step": 22600 }, { "epoch": 7.538358905937292, "loss": 0.5136521458625793, "step": 22600 }, { "ce_loss": 0.09627451747655869, "epoch": 7.538358905937292, "step": 22600 }, { "distill_loss": 0.2690471410751343, "epoch": 7.538358905937292, "step": 22600 }, { "epoch": 7.538358905937292, "ref_ce_loss": 0.1010148823261261, "step": 22600 }, { "epoch": 7.541694462975316, "loss": 0.4601, "step": 22610 }, { "epoch": 7.541694462975316, "grad_norm": 1.381775140762329, "step": 22610 }, { "epoch": 7.541694462975316, "learning_rate": 0.00012022936120382464, "step": 22610 }, { "epoch": 7.541694462975316, "loss": 0.5695637464523315, "step": 22610 }, { "ce_loss": 0.09625787287950516, "epoch": 7.541694462975316, "step": 22610 }, { "distill_loss": 0.2347060889005661, "epoch": 7.541694462975316, "step": 22610 }, { "epoch": 7.541694462975316, "ref_ce_loss": 0.08520832657814026, "step": 22610 }, { "epoch": 7.541694462975316, "loss": 0.6165584921836853, "step": 22610 }, { "ce_loss": 0.13299477100372314, "epoch": 7.541694462975316, "step": 22610 }, { "distill_loss": 0.30004408955574036, "epoch": 7.541694462975316, "step": 22610 }, { "epoch": 7.541694462975316, "ref_ce_loss": 0.09910961985588074, "step": 22610 }, { "epoch": 7.545030020013343, "loss": 0.5293, "step": 22620 }, { "epoch": 7.545030020013343, "grad_norm": 1.9388092756271362, "step": 22620 }, { "epoch": 7.545030020013343, "learning_rate": 0.00011992067855322248, "step": 22620 }, { "epoch": 7.545030020013343, "loss": 0.4749663472175598, "step": 22620 }, { "ce_loss": 0.09989020228385925, "epoch": 7.545030020013343, "step": 22620 }, { "distill_loss": 0.24128346145153046, "epoch": 7.545030020013343, "step": 22620 }, { "epoch": 7.545030020013343, "ref_ce_loss": 0.09536205977201462, "step": 22620 }, { "epoch": 7.545030020013343, "loss": 0.4751257598400116, "step": 22620 }, { "ce_loss": 0.09537438303232193, "epoch": 7.545030020013343, "step": 22620 }, { "distill_loss": 0.2695975601673126, "epoch": 7.545030020013343, "step": 22620 }, { "epoch": 7.545030020013343, "ref_ce_loss": 0.10993235558271408, "step": 22620 }, { "epoch": 7.548365577051367, "loss": 0.4868, "step": 22630 }, { "epoch": 7.548365577051367, "grad_norm": 1.4406224489212036, "step": 22630 }, { "epoch": 7.548365577051367, "learning_rate": 0.00011961232278532608, "step": 22630 }, { "epoch": 7.548365577051367, "loss": 0.36910074949264526, "step": 22630 }, { "ce_loss": 0.07736709713935852, "epoch": 7.548365577051367, "step": 22630 }, { "distill_loss": 0.20966193079948425, "epoch": 7.548365577051367, "step": 22630 }, { "epoch": 7.548365577051367, "ref_ce_loss": 0.0818030834197998, "step": 22630 }, { "epoch": 7.548365577051367, "loss": 0.4263499975204468, "step": 22630 }, { "ce_loss": 0.07568740099668503, "epoch": 7.548365577051367, "step": 22630 }, { "distill_loss": 0.22069811820983887, "epoch": 7.548365577051367, "step": 22630 }, { "epoch": 7.548365577051367, "ref_ce_loss": 0.07900112122297287, "step": 22630 }, { "epoch": 7.551701134089393, "loss": 0.5055, "step": 22640 }, { "epoch": 7.551701134089393, "grad_norm": 1.8832731246948242, "step": 22640 }, { "epoch": 7.551701134089393, "learning_rate": 0.00011930429426001999, "step": 22640 }, { "epoch": 7.551701134089393, "loss": 0.5240845084190369, "step": 22640 }, { "ce_loss": 0.12520739436149597, "epoch": 7.551701134089393, "step": 22640 }, { "distill_loss": 0.215688556432724, "epoch": 7.551701134089393, "step": 22640 }, { "epoch": 7.551701134089393, "ref_ce_loss": 0.15145321190357208, "step": 22640 }, { "epoch": 7.551701134089393, "loss": 0.37638574838638306, "step": 22640 }, { "ce_loss": 0.06634283065795898, "epoch": 7.551701134089393, "step": 22640 }, { "distill_loss": 0.21147775650024414, "epoch": 7.551701134089393, "step": 22640 }, { "epoch": 7.551701134089393, "ref_ce_loss": 0.08255521953105927, "step": 22640 }, { "epoch": 7.555036691127418, "loss": 0.5192, "step": 22650 }, { "epoch": 7.555036691127418, "grad_norm": 1.0957404375076294, "step": 22650 }, { "epoch": 7.555036691127418, "learning_rate": 0.00011899659333680659, "step": 22650 }, { "epoch": 7.555036691127418, "loss": 0.3762626349925995, "step": 22650 }, { "ce_loss": 0.0665406882762909, "epoch": 7.555036691127418, "step": 22650 }, { "distill_loss": 0.20673134922981262, "epoch": 7.555036691127418, "step": 22650 }, { "epoch": 7.555036691127418, "ref_ce_loss": 0.06498983502388, "step": 22650 }, { "epoch": 7.555036691127418, "loss": 0.7602916955947876, "step": 22650 }, { "ce_loss": 0.11462393403053284, "epoch": 7.555036691127418, "step": 22650 }, { "distill_loss": 0.22196552157402039, "epoch": 7.555036691127418, "step": 22650 }, { "epoch": 7.555036691127418, "ref_ce_loss": 0.10610775649547577, "step": 22650 }, { "epoch": 7.558372248165444, "loss": 0.5195, "step": 22660 }, { "epoch": 7.558372248165444, "grad_norm": 1.423340082168579, "step": 22660 }, { "epoch": 7.558372248165444, "learning_rate": 0.00011868922037480601, "step": 22660 }, { "epoch": 7.558372248165444, "loss": 0.41893377900123596, "step": 22660 }, { "ce_loss": 0.08694668114185333, "epoch": 7.558372248165444, "step": 22660 }, { "distill_loss": 0.2006167471408844, "epoch": 7.558372248165444, "step": 22660 }, { "epoch": 7.558372248165444, "ref_ce_loss": 0.09499073773622513, "step": 22660 }, { "epoch": 7.558372248165444, "loss": 0.5745624899864197, "step": 22660 }, { "ce_loss": 0.10483787953853607, "epoch": 7.558372248165444, "step": 22660 }, { "distill_loss": 0.22017048299312592, "epoch": 7.558372248165444, "step": 22660 }, { "epoch": 7.558372248165444, "ref_ce_loss": 0.09431114047765732, "step": 22660 }, { "epoch": 7.5617078052034685, "loss": 0.4536, "step": 22670 }, { "epoch": 7.5617078052034685, "grad_norm": 1.5026270151138306, "step": 22670 }, { "epoch": 7.5617078052034685, "learning_rate": 0.0001183821757327555, "step": 22670 }, { "epoch": 7.5617078052034685, "loss": 0.6381749510765076, "step": 22670 }, { "ce_loss": 0.14865674078464508, "epoch": 7.5617078052034685, "step": 22670 }, { "distill_loss": 0.3198041021823883, "epoch": 7.5617078052034685, "step": 22670 }, { "epoch": 7.5617078052034685, "ref_ce_loss": 0.11032967269420624, "step": 22670 }, { "epoch": 7.5617078052034685, "loss": 0.44358402490615845, "step": 22670 }, { "ce_loss": 0.12820997834205627, "epoch": 7.5617078052034685, "step": 22670 }, { "distill_loss": 0.21085762977600098, "epoch": 7.5617078052034685, "step": 22670 }, { "epoch": 7.5617078052034685, "ref_ce_loss": 0.0794897973537445, "step": 22670 }, { "epoch": 7.565043362241495, "loss": 0.5031, "step": 22680 }, { "epoch": 7.565043362241495, "grad_norm": 1.253045916557312, "step": 22680 }, { "epoch": 7.565043362241495, "learning_rate": 0.00011807545976900929, "step": 22680 }, { "epoch": 7.565043362241495, "loss": 0.5187609195709229, "step": 22680 }, { "ce_loss": 0.13985517621040344, "epoch": 7.565043362241495, "step": 22680 }, { "distill_loss": 0.264445424079895, "epoch": 7.565043362241495, "step": 22680 }, { "epoch": 7.565043362241495, "ref_ce_loss": 0.11417710781097412, "step": 22680 }, { "epoch": 7.565043362241495, "loss": 0.5495779514312744, "step": 22680 }, { "ce_loss": 0.09258265048265457, "epoch": 7.565043362241495, "step": 22680 }, { "distill_loss": 0.26880165934562683, "epoch": 7.565043362241495, "step": 22680 }, { "epoch": 7.565043362241495, "ref_ce_loss": 0.10621566325426102, "step": 22680 }, { "epoch": 7.568378919279519, "loss": 0.4652, "step": 22690 }, { "epoch": 7.568378919279519, "grad_norm": 1.1245099306106567, "step": 22690 }, { "epoch": 7.568378919279519, "learning_rate": 0.00011776907284153793, "step": 22690 }, { "epoch": 7.568378919279519, "loss": 0.4744243323802948, "step": 22690 }, { "ce_loss": 0.0841083750128746, "epoch": 7.568378919279519, "step": 22690 }, { "distill_loss": 0.19078223407268524, "epoch": 7.568378919279519, "step": 22690 }, { "epoch": 7.568378919279519, "ref_ce_loss": 0.09597629308700562, "step": 22690 }, { "epoch": 7.568378919279519, "loss": 0.4273073971271515, "step": 22690 }, { "ce_loss": 0.095107801258564, "epoch": 7.568378919279519, "step": 22690 }, { "distill_loss": 0.24718546867370605, "epoch": 7.568378919279519, "step": 22690 }, { "epoch": 7.568378919279519, "ref_ce_loss": 0.08484120666980743, "step": 22690 }, { "epoch": 7.5717144763175455, "loss": 0.4627, "step": 22700 }, { "epoch": 7.5717144763175455, "grad_norm": 2.2187397480010986, "step": 22700 }, { "epoch": 7.5717144763175455, "learning_rate": 0.00011746301530792779, "step": 22700 }, { "epoch": 7.5717144763175455, "loss": 0.4897652268409729, "step": 22700 }, { "ce_loss": 0.11604621261358261, "epoch": 7.5717144763175455, "step": 22700 }, { "distill_loss": 0.23150309920310974, "epoch": 7.5717144763175455, "step": 22700 }, { "epoch": 7.5717144763175455, "ref_ce_loss": 0.11678127944469452, "step": 22700 }, { "epoch": 7.5717144763175455, "loss": 0.3536388874053955, "step": 22700 }, { "ce_loss": 0.0858709067106247, "epoch": 7.5717144763175455, "step": 22700 }, { "distill_loss": 0.1904067099094391, "epoch": 7.5717144763175455, "step": 22700 }, { "epoch": 7.5717144763175455, "ref_ce_loss": 0.07716457545757294, "step": 22700 }, { "epoch": 7.57505003335557, "loss": 0.4752, "step": 22710 }, { "epoch": 7.57505003335557, "grad_norm": 1.5376068353652954, "step": 22710 }, { "epoch": 7.57505003335557, "learning_rate": 0.00011715728752538102, "step": 22710 }, { "epoch": 7.57505003335557, "loss": 0.48741573095321655, "step": 22710 }, { "ce_loss": 0.09763926267623901, "epoch": 7.57505003335557, "step": 22710 }, { "distill_loss": 0.23203155398368835, "epoch": 7.57505003335557, "step": 22710 }, { "epoch": 7.57505003335557, "ref_ce_loss": 0.09691423177719116, "step": 22710 }, { "epoch": 7.57505003335557, "loss": 0.35795313119888306, "step": 22710 }, { "ce_loss": 0.07701003551483154, "epoch": 7.57505003335557, "step": 22710 }, { "distill_loss": 0.188336580991745, "epoch": 7.57505003335557, "step": 22710 }, { "epoch": 7.57505003335557, "ref_ce_loss": 0.0710507407784462, "step": 22710 }, { "epoch": 7.578385590393596, "loss": 0.4953, "step": 22720 }, { "epoch": 7.578385590393596, "grad_norm": 1.9367997646331787, "step": 22720 }, { "epoch": 7.578385590393596, "learning_rate": 0.00011685188985071485, "step": 22720 }, { "epoch": 7.578385590393596, "loss": 0.6553285717964172, "step": 22720 }, { "ce_loss": 0.16441576182842255, "epoch": 7.578385590393596, "step": 22720 }, { "distill_loss": 0.23933960497379303, "epoch": 7.578385590393596, "step": 22720 }, { "epoch": 7.578385590393596, "ref_ce_loss": 0.11505937576293945, "step": 22720 }, { "epoch": 7.578385590393596, "loss": 0.3881013095378876, "step": 22720 }, { "ce_loss": 0.06341129541397095, "epoch": 7.578385590393596, "step": 22720 }, { "distill_loss": 0.2118532657623291, "epoch": 7.578385590393596, "step": 22720 }, { "epoch": 7.578385590393596, "ref_ce_loss": 0.06373800337314606, "step": 22720 }, { "epoch": 7.581721147431621, "loss": 0.4939, "step": 22730 }, { "epoch": 7.581721147431621, "grad_norm": 1.2959163188934326, "step": 22730 }, { "epoch": 7.581721147431621, "learning_rate": 0.0001165468226403612, "step": 22730 }, { "epoch": 7.581721147431621, "loss": 0.5416849851608276, "step": 22730 }, { "ce_loss": 0.11458203941583633, "epoch": 7.581721147431621, "step": 22730 }, { "distill_loss": 0.2702295184135437, "epoch": 7.581721147431621, "step": 22730 }, { "epoch": 7.581721147431621, "ref_ce_loss": 0.07892525941133499, "step": 22730 }, { "epoch": 7.581721147431621, "loss": 0.35920917987823486, "step": 22730 }, { "ce_loss": 0.03701075538992882, "epoch": 7.581721147431621, "step": 22730 }, { "distill_loss": 0.16796259582042694, "epoch": 7.581721147431621, "step": 22730 }, { "epoch": 7.581721147431621, "ref_ce_loss": 0.0796397402882576, "step": 22730 }, { "epoch": 7.585056704469647, "loss": 0.4879, "step": 22740 }, { "epoch": 7.585056704469647, "grad_norm": 1.5322014093399048, "step": 22740 }, { "epoch": 7.585056704469647, "learning_rate": 0.0001162420862503665, "step": 22740 }, { "epoch": 7.585056704469647, "loss": 0.45357197523117065, "step": 22740 }, { "ce_loss": 0.08511171489953995, "epoch": 7.585056704469647, "step": 22740 }, { "distill_loss": 0.2759219706058502, "epoch": 7.585056704469647, "step": 22740 }, { "epoch": 7.585056704469647, "ref_ce_loss": 0.06644267588853836, "step": 22740 }, { "epoch": 7.585056704469647, "loss": 0.49072521924972534, "step": 22740 }, { "ce_loss": 0.11758553236722946, "epoch": 7.585056704469647, "step": 22740 }, { "distill_loss": 0.2424834817647934, "epoch": 7.585056704469647, "step": 22740 }, { "epoch": 7.585056704469647, "ref_ce_loss": 0.09420862793922424, "step": 22740 }, { "epoch": 7.588392261507671, "loss": 0.4543, "step": 22750 }, { "epoch": 7.588392261507671, "grad_norm": 1.0535856485366821, "step": 22750 }, { "epoch": 7.588392261507671, "learning_rate": 0.00011593768103639062, "step": 22750 }, { "epoch": 7.588392261507671, "loss": 0.5638757944107056, "step": 22750 }, { "ce_loss": 0.14501330256462097, "epoch": 7.588392261507671, "step": 22750 }, { "distill_loss": 0.2349630892276764, "epoch": 7.588392261507671, "step": 22750 }, { "epoch": 7.588392261507671, "ref_ce_loss": 0.1257646083831787, "step": 22750 }, { "epoch": 7.588392261507671, "loss": 0.3980574905872345, "step": 22750 }, { "ce_loss": 0.0741574838757515, "epoch": 7.588392261507671, "step": 22750 }, { "distill_loss": 0.22142386436462402, "epoch": 7.588392261507671, "step": 22750 }, { "epoch": 7.588392261507671, "ref_ce_loss": 0.10224597156047821, "step": 22750 }, { "epoch": 7.591727818545698, "loss": 0.4443, "step": 22760 }, { "epoch": 7.591727818545698, "grad_norm": 1.2894861698150635, "step": 22760 }, { "epoch": 7.591727818545698, "learning_rate": 0.00011563360735370733, "step": 22760 }, { "epoch": 7.591727818545698, "loss": 0.4116136431694031, "step": 22760 }, { "ce_loss": 0.05981724336743355, "epoch": 7.591727818545698, "step": 22760 }, { "distill_loss": 0.21280869841575623, "epoch": 7.591727818545698, "step": 22760 }, { "epoch": 7.591727818545698, "ref_ce_loss": 0.05637165158987045, "step": 22760 }, { "epoch": 7.591727818545698, "loss": 0.40976908802986145, "step": 22760 }, { "ce_loss": 0.1144990473985672, "epoch": 7.591727818545698, "step": 22760 }, { "distill_loss": 0.20817773044109344, "epoch": 7.591727818545698, "step": 22760 }, { "epoch": 7.591727818545698, "ref_ce_loss": 0.08667189627885818, "step": 22760 }, { "epoch": 7.595063375583722, "loss": 0.4725, "step": 22770 }, { "epoch": 7.595063375583722, "grad_norm": 1.313011646270752, "step": 22770 }, { "epoch": 7.595063375583722, "learning_rate": 0.00011532986555720335, "step": 22770 }, { "epoch": 7.595063375583722, "loss": 0.41087859869003296, "step": 22770 }, { "ce_loss": 0.10889429599046707, "epoch": 7.595063375583722, "step": 22770 }, { "distill_loss": 0.19500449299812317, "epoch": 7.595063375583722, "step": 22770 }, { "epoch": 7.595063375583722, "ref_ce_loss": 0.08164864033460617, "step": 22770 }, { "epoch": 7.595063375583722, "loss": 0.6618943214416504, "step": 22770 }, { "ce_loss": 0.11418968439102173, "epoch": 7.595063375583722, "step": 22770 }, { "distill_loss": 0.2610551714897156, "epoch": 7.595063375583722, "step": 22770 }, { "epoch": 7.595063375583722, "ref_ce_loss": 0.11935710161924362, "step": 22770 }, { "epoch": 7.598398932621748, "loss": 0.5124, "step": 22780 }, { "epoch": 7.598398932621748, "grad_norm": 1.1959657669067383, "step": 22780 }, { "epoch": 7.598398932621748, "learning_rate": 0.00011502645600137808, "step": 22780 }, { "epoch": 7.598398932621748, "loss": 0.46471190452575684, "step": 22780 }, { "ce_loss": 0.07454466819763184, "epoch": 7.598398932621748, "step": 22780 }, { "distill_loss": 0.20361733436584473, "epoch": 7.598398932621748, "step": 22780 }, { "epoch": 7.598398932621748, "ref_ce_loss": 0.09685507416725159, "step": 22780 }, { "epoch": 7.598398932621748, "loss": 0.4594840109348297, "step": 22780 }, { "ce_loss": 0.09722603857517242, "epoch": 7.598398932621748, "step": 22780 }, { "distill_loss": 0.21554715931415558, "epoch": 7.598398932621748, "step": 22780 }, { "epoch": 7.598398932621748, "ref_ce_loss": 0.12070967257022858, "step": 22780 }, { "epoch": 7.601734489659773, "loss": 0.5519, "step": 22790 }, { "epoch": 7.601734489659773, "grad_norm": 3.1931025981903076, "step": 22790 }, { "epoch": 7.601734489659773, "learning_rate": 0.00011472337904034302, "step": 22790 }, { "epoch": 7.601734489659773, "loss": 0.6544221639633179, "step": 22790 }, { "ce_loss": 0.1270163208246231, "epoch": 7.601734489659773, "step": 22790 }, { "distill_loss": 0.2164284735918045, "epoch": 7.601734489659773, "step": 22790 }, { "epoch": 7.601734489659773, "ref_ce_loss": 0.08375734090805054, "step": 22790 }, { "epoch": 7.601734489659773, "loss": 0.28890594840049744, "step": 22790 }, { "ce_loss": 0.03669268265366554, "epoch": 7.601734489659773, "step": 22790 }, { "distill_loss": 0.18156400322914124, "epoch": 7.601734489659773, "step": 22790 }, { "epoch": 7.601734489659773, "ref_ce_loss": 0.04691380262374878, "step": 22790 }, { "epoch": 7.605070046697799, "loss": 0.5062, "step": 22800 }, { "epoch": 7.605070046697799, "grad_norm": 6.59785270690918, "step": 22800 }, { "epoch": 7.605070046697799, "learning_rate": 0.00011442063502782167, "step": 22800 }, { "epoch": 7.605070046697799, "loss": 0.7847855091094971, "step": 22800 }, { "ce_loss": 0.05502784252166748, "epoch": 7.605070046697799, "step": 22800 }, { "distill_loss": 0.17512984573841095, "epoch": 7.605070046697799, "step": 22800 }, { "epoch": 7.605070046697799, "ref_ce_loss": 0.06240437552332878, "step": 22800 }, { "epoch": 7.605070046697799, "loss": 0.33179566264152527, "step": 22800 }, { "ce_loss": 0.043317005038261414, "epoch": 7.605070046697799, "step": 22800 }, { "distill_loss": 0.14471793174743652, "epoch": 7.605070046697799, "step": 22800 }, { "epoch": 7.605070046697799, "ref_ce_loss": 0.0970865786075592, "step": 22800 }, { "epoch": 7.608405603735823, "loss": 0.4982, "step": 22810 }, { "epoch": 7.608405603735823, "grad_norm": 1.2467219829559326, "step": 22810 }, { "epoch": 7.608405603735823, "learning_rate": 0.00011411822431714902, "step": 22810 }, { "epoch": 7.608405603735823, "loss": 0.4001379609107971, "step": 22810 }, { "ce_loss": 0.08985906839370728, "epoch": 7.608405603735823, "step": 22810 }, { "distill_loss": 0.1979110985994339, "epoch": 7.608405603735823, "step": 22810 }, { "epoch": 7.608405603735823, "ref_ce_loss": 0.11203141510486603, "step": 22810 }, { "epoch": 7.608405603735823, "loss": 0.4521886706352234, "step": 22810 }, { "ce_loss": 0.052460163831710815, "epoch": 7.608405603735823, "step": 22810 }, { "distill_loss": 0.18328292667865753, "epoch": 7.608405603735823, "step": 22810 }, { "epoch": 7.608405603735823, "ref_ce_loss": 0.08354941755533218, "step": 22810 }, { "epoch": 7.61174116077385, "loss": 0.5096, "step": 22820 }, { "epoch": 7.61174116077385, "grad_norm": 1.4720590114593506, "step": 22820 }, { "epoch": 7.61174116077385, "learning_rate": 0.00011381614726127057, "step": 22820 }, { "epoch": 7.61174116077385, "loss": 0.5751574635505676, "step": 22820 }, { "ce_loss": 0.08937083929777145, "epoch": 7.61174116077385, "step": 22820 }, { "distill_loss": 0.23865702748298645, "epoch": 7.61174116077385, "step": 22820 }, { "epoch": 7.61174116077385, "ref_ce_loss": 0.12281100451946259, "step": 22820 }, { "epoch": 7.61174116077385, "loss": 0.8471077084541321, "step": 22820 }, { "ce_loss": 0.05814126506447792, "epoch": 7.61174116077385, "step": 22820 }, { "distill_loss": 0.1887391358613968, "epoch": 7.61174116077385, "step": 22820 }, { "epoch": 7.61174116077385, "ref_ce_loss": 0.0871066004037857, "step": 22820 }, { "epoch": 7.615076717811874, "loss": 0.5446, "step": 22830 }, { "epoch": 7.615076717811874, "grad_norm": 1.2326645851135254, "step": 22830 }, { "epoch": 7.615076717811874, "learning_rate": 0.00011351440421274296, "step": 22830 }, { "epoch": 7.615076717811874, "loss": 0.8309054374694824, "step": 22830 }, { "ce_loss": 0.07843590527772903, "epoch": 7.615076717811874, "step": 22830 }, { "distill_loss": 0.2470468282699585, "epoch": 7.615076717811874, "step": 22830 }, { "epoch": 7.615076717811874, "ref_ce_loss": 0.08887697756290436, "step": 22830 }, { "epoch": 7.615076717811874, "loss": 0.4266068637371063, "step": 22830 }, { "ce_loss": 0.08766929805278778, "epoch": 7.615076717811874, "step": 22830 }, { "distill_loss": 0.19042043387889862, "epoch": 7.615076717811874, "step": 22830 }, { "epoch": 7.615076717811874, "ref_ce_loss": 0.1035689115524292, "step": 22830 }, { "epoch": 7.6184122748499, "loss": 0.5534, "step": 22840 }, { "epoch": 7.6184122748499, "grad_norm": 4.262497901916504, "step": 22840 }, { "epoch": 7.6184122748499, "learning_rate": 0.00011321299552373274, "step": 22840 }, { "epoch": 7.6184122748499, "loss": 0.6729900240898132, "step": 22840 }, { "ce_loss": 0.14668934047222137, "epoch": 7.6184122748499, "step": 22840 }, { "distill_loss": 0.2730126678943634, "epoch": 7.6184122748499, "step": 22840 }, { "epoch": 7.6184122748499, "ref_ce_loss": 0.09181664884090424, "step": 22840 }, { "epoch": 7.6184122748499, "loss": 0.4466512203216553, "step": 22840 }, { "ce_loss": 0.09699872881174088, "epoch": 7.6184122748499, "step": 22840 }, { "distill_loss": 0.2412225306034088, "epoch": 7.6184122748499, "step": 22840 }, { "epoch": 7.6184122748499, "ref_ce_loss": 0.10813838243484497, "step": 22840 }, { "epoch": 7.621747831887925, "loss": 0.5271, "step": 22850 }, { "epoch": 7.621747831887925, "grad_norm": 1.896168828010559, "step": 22850 }, { "epoch": 7.621747831887925, "learning_rate": 0.00011291192154601642, "step": 22850 }, { "epoch": 7.621747831887925, "loss": 0.45912012457847595, "step": 22850 }, { "ce_loss": 0.10149000585079193, "epoch": 7.621747831887925, "step": 22850 }, { "distill_loss": 0.20684868097305298, "epoch": 7.621747831887925, "step": 22850 }, { "epoch": 7.621747831887925, "ref_ce_loss": 0.11774280667304993, "step": 22850 }, { "epoch": 7.621747831887925, "loss": 0.3641864061355591, "step": 22850 }, { "ce_loss": 0.05166614428162575, "epoch": 7.621747831887925, "step": 22850 }, { "distill_loss": 0.1630660593509674, "epoch": 7.621747831887925, "step": 22850 }, { "epoch": 7.621747831887925, "ref_ce_loss": 0.06195435672998428, "step": 22850 }, { "epoch": 7.625083388925951, "loss": 0.4385, "step": 22860 }, { "epoch": 7.625083388925951, "grad_norm": 1.0913865566253662, "step": 22860 }, { "epoch": 7.625083388925951, "learning_rate": 0.00011261118263097952, "step": 22860 }, { "epoch": 7.625083388925951, "loss": 0.4047800302505493, "step": 22860 }, { "ce_loss": 0.080494225025177, "epoch": 7.625083388925951, "step": 22860 }, { "distill_loss": 0.19724488258361816, "epoch": 7.625083388925951, "step": 22860 }, { "epoch": 7.625083388925951, "ref_ce_loss": 0.10261885076761246, "step": 22860 }, { "epoch": 7.625083388925951, "loss": 0.5060003399848938, "step": 22860 }, { "ce_loss": 0.12229954451322556, "epoch": 7.625083388925951, "step": 22860 }, { "distill_loss": 0.27040326595306396, "epoch": 7.625083388925951, "step": 22860 }, { "epoch": 7.625083388925951, "ref_ce_loss": 0.08427592366933823, "step": 22860 }, { "epoch": 7.6284189459639755, "loss": 0.5, "step": 22870 }, { "epoch": 7.6284189459639755, "grad_norm": 1.7914592027664185, "step": 22870 }, { "epoch": 7.6284189459639755, "learning_rate": 0.00011231077912961678, "step": 22870 }, { "epoch": 7.6284189459639755, "loss": 0.3973727524280548, "step": 22870 }, { "ce_loss": 0.10274583846330643, "epoch": 7.6284189459639755, "step": 22870 }, { "distill_loss": 0.1657116413116455, "epoch": 7.6284189459639755, "step": 22870 }, { "epoch": 7.6284189459639755, "ref_ce_loss": 0.0870896428823471, "step": 22870 }, { "epoch": 7.6284189459639755, "loss": 0.7164053916931152, "step": 22870 }, { "ce_loss": 0.12490220367908478, "epoch": 7.6284189459639755, "step": 22870 }, { "distill_loss": 0.2593473196029663, "epoch": 7.6284189459639755, "step": 22870 }, { "epoch": 7.6284189459639755, "ref_ce_loss": 0.11110673099756241, "step": 22870 }, { "epoch": 7.631754503002002, "loss": 0.4629, "step": 22880 }, { "epoch": 7.631754503002002, "grad_norm": 1.3830268383026123, "step": 22880 }, { "epoch": 7.631754503002002, "learning_rate": 0.00011201071139253132, "step": 22880 }, { "epoch": 7.631754503002002, "loss": 0.42698851227760315, "step": 22880 }, { "ce_loss": 0.12712013721466064, "epoch": 7.631754503002002, "step": 22880 }, { "distill_loss": 0.18044383823871613, "epoch": 7.631754503002002, "step": 22880 }, { "epoch": 7.631754503002002, "ref_ce_loss": 0.11906865984201431, "step": 22880 }, { "epoch": 7.631754503002002, "loss": 0.36526960134506226, "step": 22880 }, { "ce_loss": 0.0703156515955925, "epoch": 7.631754503002002, "step": 22880 }, { "distill_loss": 0.17102131247520447, "epoch": 7.631754503002002, "step": 22880 }, { "epoch": 7.631754503002002, "ref_ce_loss": 0.060693372040987015, "step": 22880 }, { "epoch": 7.635090060040026, "loss": 0.4636, "step": 22890 }, { "epoch": 7.635090060040026, "grad_norm": 1.891458511352539, "step": 22890 }, { "epoch": 7.635090060040026, "learning_rate": 0.0001117109797699348, "step": 22890 }, { "epoch": 7.635090060040026, "loss": 0.38846418261528015, "step": 22890 }, { "ce_loss": 0.09349704533815384, "epoch": 7.635090060040026, "step": 22890 }, { "distill_loss": 0.17836086452007294, "epoch": 7.635090060040026, "step": 22890 }, { "epoch": 7.635090060040026, "ref_ce_loss": 0.08837179094552994, "step": 22890 }, { "epoch": 7.635090060040026, "loss": 0.34821924567222595, "step": 22890 }, { "ce_loss": 0.07081709802150726, "epoch": 7.635090060040026, "step": 22890 }, { "distill_loss": 0.17022114992141724, "epoch": 7.635090060040026, "step": 22890 }, { "epoch": 7.635090060040026, "ref_ce_loss": 0.08378297835588455, "step": 22890 }, { "epoch": 7.6384256170780525, "loss": 0.4368, "step": 22900 }, { "epoch": 7.6384256170780525, "grad_norm": 1.291176676750183, "step": 22900 }, { "epoch": 7.6384256170780525, "learning_rate": 0.000111411584611646, "step": 22900 }, { "epoch": 7.6384256170780525, "loss": 0.46497029066085815, "step": 22900 }, { "ce_loss": 0.08809613436460495, "epoch": 7.6384256170780525, "step": 22900 }, { "distill_loss": 0.18776924908161163, "epoch": 7.6384256170780525, "step": 22900 }, { "epoch": 7.6384256170780525, "ref_ce_loss": 0.07957534492015839, "step": 22900 }, { "epoch": 7.6384256170780525, "loss": 0.3915051221847534, "step": 22900 }, { "ce_loss": 0.07151425629854202, "epoch": 7.6384256170780525, "step": 22900 }, { "distill_loss": 0.18844066560268402, "epoch": 7.6384256170780525, "step": 22900 }, { "epoch": 7.6384256170780525, "ref_ce_loss": 0.0949799045920372, "step": 22900 }, { "epoch": 7.641761174116077, "loss": 0.4634, "step": 22910 }, { "epoch": 7.641761174116077, "grad_norm": 1.6681424379348755, "step": 22910 }, { "epoch": 7.641761174116077, "learning_rate": 0.00011111252626709135, "step": 22910 }, { "epoch": 7.641761174116077, "loss": 0.7183939814567566, "step": 22910 }, { "ce_loss": 0.1378229409456253, "epoch": 7.641761174116077, "step": 22910 }, { "distill_loss": 0.2599409818649292, "epoch": 7.641761174116077, "step": 22910 }, { "epoch": 7.641761174116077, "ref_ce_loss": 0.10532394051551819, "step": 22910 }, { "epoch": 7.641761174116077, "loss": 0.4198286831378937, "step": 22910 }, { "ce_loss": 0.1008639857172966, "epoch": 7.641761174116077, "step": 22910 }, { "distill_loss": 0.22020921111106873, "epoch": 7.641761174116077, "step": 22910 }, { "epoch": 7.641761174116077, "ref_ce_loss": 0.09852571785449982, "step": 22910 }, { "epoch": 7.645096731154103, "loss": 0.4998, "step": 22920 }, { "epoch": 7.645096731154103, "grad_norm": 1.3718444108963013, "step": 22920 }, { "epoch": 7.645096731154103, "learning_rate": 0.00011081380508530413, "step": 22920 }, { "epoch": 7.645096731154103, "loss": 0.5433452129364014, "step": 22920 }, { "ce_loss": 0.0930216908454895, "epoch": 7.645096731154103, "step": 22920 }, { "distill_loss": 0.19634249806404114, "epoch": 7.645096731154103, "step": 22920 }, { "epoch": 7.645096731154103, "ref_ce_loss": 0.09999752044677734, "step": 22920 }, { "epoch": 7.645096731154103, "loss": 0.4787171185016632, "step": 22920 }, { "ce_loss": 0.12058284878730774, "epoch": 7.645096731154103, "step": 22920 }, { "distill_loss": 0.22844330966472626, "epoch": 7.645096731154103, "step": 22920 }, { "epoch": 7.645096731154103, "ref_ce_loss": 0.10536279529333115, "step": 22920 }, { "epoch": 7.648432288192128, "loss": 0.5512, "step": 22930 }, { "epoch": 7.648432288192128, "grad_norm": 2.082670211791992, "step": 22930 }, { "epoch": 7.648432288192128, "learning_rate": 0.00011051542141492422, "step": 22930 }, { "epoch": 7.648432288192128, "loss": 0.5069497227668762, "step": 22930 }, { "ce_loss": 0.08902516216039658, "epoch": 7.648432288192128, "step": 22930 }, { "distill_loss": 0.16418182849884033, "epoch": 7.648432288192128, "step": 22930 }, { "epoch": 7.648432288192128, "ref_ce_loss": 0.09682785719633102, "step": 22930 }, { "epoch": 7.648432288192128, "loss": 0.5495210886001587, "step": 22930 }, { "ce_loss": 0.08924379199743271, "epoch": 7.648432288192128, "step": 22930 }, { "distill_loss": 0.14871746301651, "epoch": 7.648432288192128, "step": 22930 }, { "epoch": 7.648432288192128, "ref_ce_loss": 0.07341916114091873, "step": 22930 }, { "epoch": 7.651767845230154, "loss": 0.4669, "step": 22940 }, { "epoch": 7.651767845230154, "grad_norm": 1.1963962316513062, "step": 22940 }, { "epoch": 7.651767845230154, "learning_rate": 0.00011021737560419718, "step": 22940 }, { "epoch": 7.651767845230154, "loss": 0.4135090708732605, "step": 22940 }, { "ce_loss": 0.05780329555273056, "epoch": 7.651767845230154, "step": 22940 }, { "distill_loss": 0.19066178798675537, "epoch": 7.651767845230154, "step": 22940 }, { "epoch": 7.651767845230154, "ref_ce_loss": 0.08974800258874893, "step": 22940 }, { "epoch": 7.651767845230154, "loss": 0.5159320831298828, "step": 22940 }, { "ce_loss": 0.07121335715055466, "epoch": 7.651767845230154, "step": 22940 }, { "distill_loss": 0.1589433252811432, "epoch": 7.651767845230154, "step": 22940 }, { "epoch": 7.651767845230154, "ref_ce_loss": 0.07944811880588531, "step": 22940 }, { "epoch": 7.655103402268178, "loss": 0.488, "step": 22950 }, { "epoch": 7.655103402268178, "grad_norm": 1.3356716632843018, "step": 22950 }, { "epoch": 7.655103402268178, "learning_rate": 0.00010991966800097473, "step": 22950 }, { "epoch": 7.655103402268178, "loss": 0.9157035946846008, "step": 22950 }, { "ce_loss": 0.14918577671051025, "epoch": 7.655103402268178, "step": 22950 }, { "distill_loss": 0.3011043071746826, "epoch": 7.655103402268178, "step": 22950 }, { "epoch": 7.655103402268178, "ref_ce_loss": 0.12013433128595352, "step": 22950 }, { "epoch": 7.655103402268178, "loss": 0.5391525030136108, "step": 22950 }, { "ce_loss": 0.15401852130889893, "epoch": 7.655103402268178, "step": 22950 }, { "distill_loss": 0.27220389246940613, "epoch": 7.655103402268178, "step": 22950 }, { "epoch": 7.655103402268178, "ref_ce_loss": 0.11245297640562057, "step": 22950 }, { "epoch": 7.6584389593062046, "loss": 0.5002, "step": 22960 }, { "epoch": 7.6584389593062046, "grad_norm": 2.074615478515625, "step": 22960 }, { "epoch": 7.6584389593062046, "learning_rate": 0.00010962229895271367, "step": 22960 }, { "epoch": 7.6584389593062046, "loss": 0.4456467628479004, "step": 22960 }, { "ce_loss": 0.07245982438325882, "epoch": 7.6584389593062046, "step": 22960 }, { "distill_loss": 0.1963823288679123, "epoch": 7.6584389593062046, "step": 22960 }, { "epoch": 7.6584389593062046, "ref_ce_loss": 0.08949127048254013, "step": 22960 }, { "epoch": 7.6584389593062046, "loss": 0.41796499490737915, "step": 22960 }, { "ce_loss": 0.06558441370725632, "epoch": 7.6584389593062046, "step": 22960 }, { "distill_loss": 0.19469647109508514, "epoch": 7.6584389593062046, "step": 22960 }, { "epoch": 7.6584389593062046, "ref_ce_loss": 0.08166325092315674, "step": 22960 }, { "epoch": 7.661774516344229, "loss": 0.4619, "step": 22970 }, { "epoch": 7.661774516344229, "grad_norm": 1.8865573406219482, "step": 22970 }, { "epoch": 7.661774516344229, "learning_rate": 0.00010932526880647582, "step": 22970 }, { "epoch": 7.661774516344229, "loss": 0.4492861032485962, "step": 22970 }, { "ce_loss": 0.0742424726486206, "epoch": 7.661774516344229, "step": 22970 }, { "distill_loss": 0.17198584973812103, "epoch": 7.661774516344229, "step": 22970 }, { "epoch": 7.661774516344229, "ref_ce_loss": 0.07258053123950958, "step": 22970 }, { "epoch": 7.661774516344229, "loss": 0.8455418348312378, "step": 22970 }, { "ce_loss": 0.14300350844860077, "epoch": 7.661774516344229, "step": 22970 }, { "distill_loss": 0.25651904940605164, "epoch": 7.661774516344229, "step": 22970 }, { "epoch": 7.661774516344229, "ref_ce_loss": 0.0984906330704689, "step": 22970 }, { "epoch": 7.665110073382255, "loss": 0.4998, "step": 22980 }, { "epoch": 7.665110073382255, "grad_norm": 1.2946828603744507, "step": 22980 }, { "epoch": 7.665110073382255, "learning_rate": 0.00010902857790892703, "step": 22980 }, { "epoch": 7.665110073382255, "loss": 0.7474709749221802, "step": 22980 }, { "ce_loss": 0.1747969537973404, "epoch": 7.665110073382255, "step": 22980 }, { "distill_loss": 0.27910372614860535, "epoch": 7.665110073382255, "step": 22980 }, { "epoch": 7.665110073382255, "ref_ce_loss": 0.11947666853666306, "step": 22980 }, { "epoch": 7.665110073382255, "loss": 0.43477147817611694, "step": 22980 }, { "ce_loss": 0.10325319319963455, "epoch": 7.665110073382255, "step": 22980 }, { "distill_loss": 0.23486895859241486, "epoch": 7.665110073382255, "step": 22980 }, { "epoch": 7.665110073382255, "ref_ce_loss": 0.09604207426309586, "step": 22980 }, { "epoch": 7.66844563042028, "loss": 0.4873, "step": 22990 }, { "epoch": 7.66844563042028, "grad_norm": 1.5909743309020996, "step": 22990 }, { "epoch": 7.66844563042028, "learning_rate": 0.00010873222660633748, "step": 22990 }, { "epoch": 7.66844563042028, "loss": 0.5011396408081055, "step": 22990 }, { "ce_loss": 0.1381172388792038, "epoch": 7.66844563042028, "step": 22990 }, { "distill_loss": 0.24536283314228058, "epoch": 7.66844563042028, "step": 22990 }, { "epoch": 7.66844563042028, "ref_ce_loss": 0.11745288223028183, "step": 22990 }, { "epoch": 7.66844563042028, "loss": 0.9448994398117065, "step": 22990 }, { "ce_loss": 0.09806367754936218, "epoch": 7.66844563042028, "step": 22990 }, { "distill_loss": 0.17582949995994568, "epoch": 7.66844563042028, "step": 22990 }, { "epoch": 7.66844563042028, "ref_ce_loss": 0.08818396180868149, "step": 22990 }, { "epoch": 7.671781187458306, "loss": 0.5106, "step": 23000 }, { "epoch": 7.671781187458306, "grad_norm": 1.6053009033203125, "step": 23000 }, { "epoch": 7.671781187458306, "learning_rate": 0.00010843621524458148, "step": 23000 }, { "epoch": 7.671781187458306, "loss": 0.4494698643684387, "step": 23000 }, { "ce_loss": 0.10007108747959137, "epoch": 7.671781187458306, "step": 23000 }, { "distill_loss": 0.2064976692199707, "epoch": 7.671781187458306, "step": 23000 }, { "epoch": 7.671781187458306, "ref_ce_loss": 0.10766438394784927, "step": 23000 }, { "epoch": 7.671781187458306, "loss": 0.3754218518733978, "step": 23000 }, { "ce_loss": 0.07610825449228287, "epoch": 7.671781187458306, "step": 23000 }, { "distill_loss": 0.21002225577831268, "epoch": 7.671781187458306, "step": 23000 }, { "epoch": 7.671781187458306, "ref_ce_loss": 0.08917049318552017, "step": 23000 }, { "epoch": 7.67511674449633, "loss": 0.4974, "step": 23010 }, { "epoch": 7.67511674449633, "grad_norm": 1.244104266166687, "step": 23010 }, { "epoch": 7.67511674449633, "learning_rate": 0.0001081405441691357, "step": 23010 }, { "epoch": 7.67511674449633, "loss": 0.43800780177116394, "step": 23010 }, { "ce_loss": 0.08042147010564804, "epoch": 7.67511674449633, "step": 23010 }, { "distill_loss": 0.18267808854579926, "epoch": 7.67511674449633, "step": 23010 }, { "epoch": 7.67511674449633, "ref_ce_loss": 0.09389374405145645, "step": 23010 }, { "epoch": 7.67511674449633, "loss": 0.5576515197753906, "step": 23010 }, { "ce_loss": 0.08384501934051514, "epoch": 7.67511674449633, "step": 23010 }, { "distill_loss": 0.21569280326366425, "epoch": 7.67511674449633, "step": 23010 }, { "epoch": 7.67511674449633, "ref_ce_loss": 0.11242746561765671, "step": 23010 }, { "epoch": 7.678452301534357, "loss": 0.5444, "step": 23020 }, { "epoch": 7.678452301534357, "grad_norm": 1.104618787765503, "step": 23020 }, { "epoch": 7.678452301534357, "learning_rate": 0.00010784521372508027, "step": 23020 }, { "epoch": 7.678452301534357, "loss": 0.5867886543273926, "step": 23020 }, { "ce_loss": 0.11901320517063141, "epoch": 7.678452301534357, "step": 23020 }, { "distill_loss": 0.25423428416252136, "epoch": 7.678452301534357, "step": 23020 }, { "epoch": 7.678452301534357, "ref_ce_loss": 0.12625397741794586, "step": 23020 }, { "epoch": 7.678452301534357, "loss": 0.45270201563835144, "step": 23020 }, { "ce_loss": 0.09595979750156403, "epoch": 7.678452301534357, "step": 23020 }, { "distill_loss": 0.20388580858707428, "epoch": 7.678452301534357, "step": 23020 }, { "epoch": 7.678452301534357, "ref_ce_loss": 0.09884056448936462, "step": 23020 }, { "epoch": 7.681787858572381, "loss": 0.4848, "step": 23030 }, { "epoch": 7.681787858572381, "grad_norm": 1.514387607574463, "step": 23030 }, { "epoch": 7.681787858572381, "learning_rate": 0.00010755022425709755, "step": 23030 }, { "epoch": 7.681787858572381, "loss": 0.5719349980354309, "step": 23030 }, { "ce_loss": 0.14947743713855743, "epoch": 7.681787858572381, "step": 23030 }, { "distill_loss": 0.2445412427186966, "epoch": 7.681787858572381, "step": 23030 }, { "epoch": 7.681787858572381, "ref_ce_loss": 0.10241449624300003, "step": 23030 }, { "epoch": 7.681787858572381, "loss": 0.8717924356460571, "step": 23030 }, { "ce_loss": 0.08390536159276962, "epoch": 7.681787858572381, "step": 23030 }, { "distill_loss": 0.23097392916679382, "epoch": 7.681787858572381, "step": 23030 }, { "epoch": 7.681787858572381, "ref_ce_loss": 0.11411327123641968, "step": 23030 }, { "epoch": 7.685123415610407, "loss": 0.5377, "step": 23040 }, { "epoch": 7.685123415610407, "grad_norm": 1.4054793119430542, "step": 23040 }, { "epoch": 7.685123415610407, "learning_rate": 0.00010725557610947214, "step": 23040 }, { "epoch": 7.685123415610407, "loss": 0.6011195778846741, "step": 23040 }, { "ce_loss": 0.11097917705774307, "epoch": 7.685123415610407, "step": 23040 }, { "distill_loss": 0.2800613343715668, "epoch": 7.685123415610407, "step": 23040 }, { "epoch": 7.685123415610407, "ref_ce_loss": 0.1031576469540596, "step": 23040 }, { "epoch": 7.685123415610407, "loss": 0.4414733648300171, "step": 23040 }, { "ce_loss": 0.08354143053293228, "epoch": 7.685123415610407, "step": 23040 }, { "distill_loss": 0.18651583790779114, "epoch": 7.685123415610407, "step": 23040 }, { "epoch": 7.685123415610407, "ref_ce_loss": 0.10314172506332397, "step": 23040 }, { "epoch": 7.688458972648432, "loss": 0.494, "step": 23050 }, { "epoch": 7.688458972648432, "grad_norm": 2.3914988040924072, "step": 23050 }, { "epoch": 7.688458972648432, "learning_rate": 0.00010696126962608995, "step": 23050 }, { "epoch": 7.688458972648432, "loss": 0.426511287689209, "step": 23050 }, { "ce_loss": 0.07741673290729523, "epoch": 7.688458972648432, "step": 23050 }, { "distill_loss": 0.24165627360343933, "epoch": 7.688458972648432, "step": 23050 }, { "epoch": 7.688458972648432, "ref_ce_loss": 0.10722195357084274, "step": 23050 }, { "epoch": 7.688458972648432, "loss": 0.6378778219223022, "step": 23050 }, { "ce_loss": 0.1301768571138382, "epoch": 7.688458972648432, "step": 23050 }, { "distill_loss": 0.2512298822402954, "epoch": 7.688458972648432, "step": 23050 }, { "epoch": 7.688458972648432, "ref_ce_loss": 0.09250593930482864, "step": 23050 }, { "epoch": 7.691794529686458, "loss": 0.5472, "step": 23060 }, { "epoch": 7.691794529686458, "grad_norm": 2.4647703170776367, "step": 23060 }, { "epoch": 7.691794529686458, "learning_rate": 0.00010666730515043832, "step": 23060 }, { "epoch": 7.691794529686458, "loss": 0.4028063118457794, "step": 23060 }, { "ce_loss": 0.04906738921999931, "epoch": 7.691794529686458, "step": 23060 }, { "distill_loss": 0.17633600533008575, "epoch": 7.691794529686458, "step": 23060 }, { "epoch": 7.691794529686458, "ref_ce_loss": 0.07943486422300339, "step": 23060 }, { "epoch": 7.691794529686458, "loss": 0.32616400718688965, "step": 23060 }, { "ce_loss": 0.04622570052742958, "epoch": 7.691794529686458, "step": 23060 }, { "distill_loss": 0.16948921978473663, "epoch": 7.691794529686458, "step": 23060 }, { "epoch": 7.691794529686458, "ref_ce_loss": 0.08727295696735382, "step": 23060 }, { "epoch": 7.6951300867244825, "loss": 0.4505, "step": 23070 }, { "epoch": 7.6951300867244825, "grad_norm": 1.6886954307556152, "step": 23070 }, { "epoch": 7.6951300867244825, "learning_rate": 0.00010637368302560551, "step": 23070 }, { "epoch": 7.6951300867244825, "loss": 0.4469156861305237, "step": 23070 }, { "ce_loss": 0.1259409338235855, "epoch": 7.6951300867244825, "step": 23070 }, { "distill_loss": 0.20817193388938904, "epoch": 7.6951300867244825, "step": 23070 }, { "epoch": 7.6951300867244825, "ref_ce_loss": 0.08971387892961502, "step": 23070 }, { "epoch": 7.6951300867244825, "loss": 0.605049192905426, "step": 23070 }, { "ce_loss": 0.08226354420185089, "epoch": 7.6951300867244825, "step": 23070 }, { "distill_loss": 0.24582314491271973, "epoch": 7.6951300867244825, "step": 23070 }, { "epoch": 7.6951300867244825, "ref_ce_loss": 0.10886520892381668, "step": 23070 }, { "epoch": 7.698465643762509, "loss": 0.4939, "step": 23080 }, { "epoch": 7.698465643762509, "grad_norm": 1.330239176750183, "step": 23080 }, { "epoch": 7.698465643762509, "learning_rate": 0.00010608040359428008, "step": 23080 }, { "epoch": 7.698465643762509, "loss": 0.47724735736846924, "step": 23080 }, { "ce_loss": 0.09845036268234253, "epoch": 7.698465643762509, "step": 23080 }, { "distill_loss": 0.22484542429447174, "epoch": 7.698465643762509, "step": 23080 }, { "epoch": 7.698465643762509, "ref_ce_loss": 0.09992624819278717, "step": 23080 }, { "epoch": 7.698465643762509, "loss": 0.3705708980560303, "step": 23080 }, { "ce_loss": 0.06413956731557846, "epoch": 7.698465643762509, "step": 23080 }, { "distill_loss": 0.20974738895893097, "epoch": 7.698465643762509, "step": 23080 }, { "epoch": 7.698465643762509, "ref_ce_loss": 0.09660591930150986, "step": 23080 }, { "epoch": 7.701801200800533, "loss": 0.4884, "step": 23090 }, { "epoch": 7.701801200800533, "grad_norm": 1.2784122228622437, "step": 23090 }, { "epoch": 7.701801200800533, "learning_rate": 0.00010578746719875087, "step": 23090 }, { "epoch": 7.701801200800533, "loss": 0.3787055015563965, "step": 23090 }, { "ce_loss": 0.06965330243110657, "epoch": 7.701801200800533, "step": 23090 }, { "distill_loss": 0.20763544738292694, "epoch": 7.701801200800533, "step": 23090 }, { "epoch": 7.701801200800533, "ref_ce_loss": 0.06809075176715851, "step": 23090 }, { "epoch": 7.701801200800533, "loss": 1.2794193029403687, "step": 23090 }, { "ce_loss": 0.11845719069242477, "epoch": 7.701801200800533, "step": 23090 }, { "distill_loss": 0.2328185886144638, "epoch": 7.701801200800533, "step": 23090 }, { "epoch": 7.701801200800533, "ref_ce_loss": 0.11717583239078522, "step": 23090 }, { "epoch": 7.7051367578385594, "loss": 0.5178, "step": 23100 }, { "epoch": 7.7051367578385594, "grad_norm": 1.4460859298706055, "step": 23100 }, { "epoch": 7.7051367578385594, "learning_rate": 0.00010549487418090578, "step": 23100 }, { "epoch": 7.7051367578385594, "loss": 0.30405837297439575, "step": 23100 }, { "ce_loss": 0.037146370857954025, "epoch": 7.7051367578385594, "step": 23100 }, { "distill_loss": 0.19589708745479584, "epoch": 7.7051367578385594, "step": 23100 }, { "epoch": 7.7051367578385594, "ref_ce_loss": 0.07080904394388199, "step": 23100 }, { "epoch": 7.7051367578385594, "loss": 0.8166431188583374, "step": 23100 }, { "ce_loss": 0.12483064830303192, "epoch": 7.7051367578385594, "step": 23100 }, { "distill_loss": 0.2657544016838074, "epoch": 7.7051367578385594, "step": 23100 }, { "epoch": 7.7051367578385594, "ref_ce_loss": 0.07705806940793991, "step": 23100 }, { "epoch": 7.708472314876584, "loss": 0.5018, "step": 23110 }, { "epoch": 7.708472314876584, "grad_norm": 1.9317492246627808, "step": 23110 }, { "epoch": 7.708472314876584, "learning_rate": 0.0001052026248822327, "step": 23110 }, { "epoch": 7.708472314876584, "loss": 0.579699695110321, "step": 23110 }, { "ce_loss": 0.06592089682817459, "epoch": 7.708472314876584, "step": 23110 }, { "distill_loss": 0.22586572170257568, "epoch": 7.708472314876584, "step": 23110 }, { "epoch": 7.708472314876584, "ref_ce_loss": 0.11663997918367386, "step": 23110 }, { "epoch": 7.708472314876584, "loss": 0.3650611639022827, "step": 23110 }, { "ce_loss": 0.049009211361408234, "epoch": 7.708472314876584, "step": 23110 }, { "distill_loss": 0.18839851021766663, "epoch": 7.708472314876584, "step": 23110 }, { "epoch": 7.708472314876584, "ref_ce_loss": 0.08527813106775284, "step": 23110 }, { "epoch": 7.71180787191461, "loss": 0.4888, "step": 23120 }, { "epoch": 7.71180787191461, "grad_norm": 1.1595827341079712, "step": 23120 }, { "epoch": 7.71180787191461, "learning_rate": 0.00010491071964381798, "step": 23120 }, { "epoch": 7.71180787191461, "loss": 0.5201905965805054, "step": 23120 }, { "ce_loss": 0.17336910963058472, "epoch": 7.71180787191461, "step": 23120 }, { "distill_loss": 0.2682691514492035, "epoch": 7.71180787191461, "step": 23120 }, { "epoch": 7.71180787191461, "ref_ce_loss": 0.07831382751464844, "step": 23120 }, { "epoch": 7.71180787191461, "loss": 0.3259257972240448, "step": 23120 }, { "ce_loss": 0.03200289607048035, "epoch": 7.71180787191461, "step": 23120 }, { "distill_loss": 0.175248384475708, "epoch": 7.71180787191461, "step": 23120 }, { "epoch": 7.71180787191461, "ref_ce_loss": 0.08027581870555878, "step": 23120 }, { "epoch": 7.715143428952635, "loss": 0.4828, "step": 23130 }, { "epoch": 7.715143428952635, "grad_norm": 1.531973958015442, "step": 23130 }, { "epoch": 7.715143428952635, "learning_rate": 0.00010461915880634627, "step": 23130 }, { "epoch": 7.715143428952635, "loss": 0.412576824426651, "step": 23130 }, { "ce_loss": 0.06026545166969299, "epoch": 7.715143428952635, "step": 23130 }, { "distill_loss": 0.21231496334075928, "epoch": 7.715143428952635, "step": 23130 }, { "epoch": 7.715143428952635, "ref_ce_loss": 0.07956986874341965, "step": 23130 }, { "epoch": 7.715143428952635, "loss": 0.41848087310791016, "step": 23130 }, { "ce_loss": 0.08960796147584915, "epoch": 7.715143428952635, "step": 23130 }, { "distill_loss": 0.22111055254936218, "epoch": 7.715143428952635, "step": 23130 }, { "epoch": 7.715143428952635, "ref_ce_loss": 0.08399610966444016, "step": 23130 }, { "epoch": 7.718478985990661, "loss": 0.5039, "step": 23140 }, { "epoch": 7.718478985990661, "grad_norm": 1.5510739088058472, "step": 23140 }, { "epoch": 7.718478985990661, "learning_rate": 0.00010432794271010049, "step": 23140 }, { "epoch": 7.718478985990661, "loss": 0.40824779868125916, "step": 23140 }, { "ce_loss": 0.09309175610542297, "epoch": 7.718478985990661, "step": 23140 }, { "distill_loss": 0.19793115556240082, "epoch": 7.718478985990661, "step": 23140 }, { "epoch": 7.718478985990661, "ref_ce_loss": 0.11696568131446838, "step": 23140 }, { "epoch": 7.718478985990661, "loss": 0.4437388777732849, "step": 23140 }, { "ce_loss": 0.08572607487440109, "epoch": 7.718478985990661, "step": 23140 }, { "distill_loss": 0.2218407690525055, "epoch": 7.718478985990661, "step": 23140 }, { "epoch": 7.718478985990661, "ref_ce_loss": 0.0954335629940033, "step": 23140 }, { "epoch": 7.721814543028685, "loss": 0.481, "step": 23150 }, { "epoch": 7.721814543028685, "grad_norm": 1.44486403465271, "step": 23150 }, { "epoch": 7.721814543028685, "learning_rate": 0.00010403707169496124, "step": 23150 }, { "epoch": 7.721814543028685, "loss": 0.4209980070590973, "step": 23150 }, { "ce_loss": 0.04794754460453987, "epoch": 7.721814543028685, "step": 23150 }, { "distill_loss": 0.20606538653373718, "epoch": 7.721814543028685, "step": 23150 }, { "epoch": 7.721814543028685, "ref_ce_loss": 0.06347404420375824, "step": 23150 }, { "epoch": 7.721814543028685, "loss": 0.46626389026641846, "step": 23150 }, { "ce_loss": 0.08045854419469833, "epoch": 7.721814543028685, "step": 23150 }, { "distill_loss": 0.21515415608882904, "epoch": 7.721814543028685, "step": 23150 }, { "epoch": 7.721814543028685, "ref_ce_loss": 0.09117396175861359, "step": 23150 }, { "epoch": 7.7251501000667115, "loss": 0.5281, "step": 23160 }, { "epoch": 7.7251501000667115, "grad_norm": 1.3297337293624878, "step": 23160 }, { "epoch": 7.7251501000667115, "learning_rate": 0.00010374654610040636, "step": 23160 }, { "epoch": 7.7251501000667115, "loss": 0.4131792187690735, "step": 23160 }, { "ce_loss": 0.08460301160812378, "epoch": 7.7251501000667115, "step": 23160 }, { "distill_loss": 0.16719715297222137, "epoch": 7.7251501000667115, "step": 23160 }, { "epoch": 7.7251501000667115, "ref_ce_loss": 0.0832613930106163, "step": 23160 }, { "epoch": 7.7251501000667115, "loss": 0.5897133350372314, "step": 23160 }, { "ce_loss": 0.14093492925167084, "epoch": 7.7251501000667115, "step": 23160 }, { "distill_loss": 0.2610013782978058, "epoch": 7.7251501000667115, "step": 23160 }, { "epoch": 7.7251501000667115, "ref_ce_loss": 0.10729961097240448, "step": 23160 }, { "epoch": 7.728485657104736, "loss": 0.5116, "step": 23170 }, { "epoch": 7.728485657104736, "grad_norm": 1.3210251331329346, "step": 23170 }, { "epoch": 7.728485657104736, "learning_rate": 0.00010345636626551022, "step": 23170 }, { "epoch": 7.728485657104736, "loss": 0.641310453414917, "step": 23170 }, { "ce_loss": 0.06956622749567032, "epoch": 7.728485657104736, "step": 23170 }, { "distill_loss": 0.2333042323589325, "epoch": 7.728485657104736, "step": 23170 }, { "epoch": 7.728485657104736, "ref_ce_loss": 0.1018952876329422, "step": 23170 }, { "epoch": 7.728485657104736, "loss": 0.6080069541931152, "step": 23170 }, { "ce_loss": 0.15181094408035278, "epoch": 7.728485657104736, "step": 23170 }, { "distill_loss": 0.28187862038612366, "epoch": 7.728485657104736, "step": 23170 }, { "epoch": 7.728485657104736, "ref_ce_loss": 0.09707515686750412, "step": 23170 }, { "epoch": 7.731821214142762, "loss": 0.4553, "step": 23180 }, { "epoch": 7.731821214142762, "grad_norm": 1.2260631322860718, "step": 23180 }, { "epoch": 7.731821214142762, "learning_rate": 0.0001031665325289441, "step": 23180 }, { "epoch": 7.731821214142762, "loss": 0.3943283259868622, "step": 23180 }, { "ce_loss": 0.08596097677946091, "epoch": 7.731821214142762, "step": 23180 }, { "distill_loss": 0.19753313064575195, "epoch": 7.731821214142762, "step": 23180 }, { "epoch": 7.731821214142762, "ref_ce_loss": 0.08808321505784988, "step": 23180 }, { "epoch": 7.731821214142762, "loss": 0.43322786688804626, "step": 23180 }, { "ce_loss": 0.12042468786239624, "epoch": 7.731821214142762, "step": 23180 }, { "distill_loss": 0.2224731743335724, "epoch": 7.731821214142762, "step": 23180 }, { "epoch": 7.731821214142762, "ref_ce_loss": 0.069795623421669, "step": 23180 }, { "epoch": 7.735156771180787, "loss": 0.4834, "step": 23190 }, { "epoch": 7.735156771180787, "grad_norm": 2.1221816539764404, "step": 23190 }, { "epoch": 7.735156771180787, "learning_rate": 0.00010287704522897512, "step": 23190 }, { "epoch": 7.735156771180787, "loss": 0.5039787292480469, "step": 23190 }, { "ce_loss": 0.1576055884361267, "epoch": 7.735156771180787, "step": 23190 }, { "distill_loss": 0.24595049023628235, "epoch": 7.735156771180787, "step": 23190 }, { "epoch": 7.735156771180787, "ref_ce_loss": 0.10024640709161758, "step": 23190 }, { "epoch": 7.735156771180787, "loss": 0.7509039640426636, "step": 23190 }, { "ce_loss": 0.1318170428276062, "epoch": 7.735156771180787, "step": 23190 }, { "distill_loss": 0.2708563506603241, "epoch": 7.735156771180787, "step": 23190 }, { "epoch": 7.735156771180787, "ref_ce_loss": 0.1185612827539444, "step": 23190 }, { "epoch": 7.738492328218813, "loss": 0.4798, "step": 23200 }, { "epoch": 7.738492328218813, "grad_norm": 1.3106443881988525, "step": 23200 }, { "epoch": 7.738492328218813, "learning_rate": 0.00010258790470346622, "step": 23200 }, { "epoch": 7.738492328218813, "loss": 0.5049909949302673, "step": 23200 }, { "ce_loss": 0.10278329998254776, "epoch": 7.738492328218813, "step": 23200 }, { "distill_loss": 0.2146678864955902, "epoch": 7.738492328218813, "step": 23200 }, { "epoch": 7.738492328218813, "ref_ce_loss": 0.11462140828371048, "step": 23200 }, { "epoch": 7.738492328218813, "loss": 0.6290625333786011, "step": 23200 }, { "ce_loss": 0.12055748701095581, "epoch": 7.738492328218813, "step": 23200 }, { "distill_loss": 0.19564548134803772, "epoch": 7.738492328218813, "step": 23200 }, { "epoch": 7.738492328218813, "ref_ce_loss": 0.10627972334623337, "step": 23200 }, { "epoch": 7.741827885256837, "loss": 0.4806, "step": 23210 }, { "epoch": 7.741827885256837, "grad_norm": 1.6218887567520142, "step": 23210 }, { "epoch": 7.741827885256837, "learning_rate": 0.00010229911128987515, "step": 23210 }, { "epoch": 7.741827885256837, "loss": 0.5480747222900391, "step": 23210 }, { "ce_loss": 0.11701656132936478, "epoch": 7.741827885256837, "step": 23210 }, { "distill_loss": 0.2518675923347473, "epoch": 7.741827885256837, "step": 23210 }, { "epoch": 7.741827885256837, "ref_ce_loss": 0.11369401961565018, "step": 23210 }, { "epoch": 7.741827885256837, "loss": 0.39638179540634155, "step": 23210 }, { "ce_loss": 0.05360805243253708, "epoch": 7.741827885256837, "step": 23210 }, { "distill_loss": 0.15531179308891296, "epoch": 7.741827885256837, "step": 23210 }, { "epoch": 7.741827885256837, "ref_ce_loss": 0.06214721128344536, "step": 23210 }, { "epoch": 7.745163442294864, "loss": 0.4309, "step": 23220 }, { "epoch": 7.745163442294864, "grad_norm": 1.4390681982040405, "step": 23220 }, { "epoch": 7.745163442294864, "learning_rate": 0.00010201066532525528, "step": 23220 }, { "epoch": 7.745163442294864, "loss": 0.3549351096153259, "step": 23220 }, { "ce_loss": 0.09753035753965378, "epoch": 7.745163442294864, "step": 23220 }, { "distill_loss": 0.18527477979660034, "epoch": 7.745163442294864, "step": 23220 }, { "epoch": 7.745163442294864, "ref_ce_loss": 0.07191254943609238, "step": 23220 }, { "epoch": 7.745163442294864, "loss": 0.3918623626232147, "step": 23220 }, { "ce_loss": 0.07541152834892273, "epoch": 7.745163442294864, "step": 23220 }, { "distill_loss": 0.1978979855775833, "epoch": 7.745163442294864, "step": 23220 }, { "epoch": 7.745163442294864, "ref_ce_loss": 0.08795817196369171, "step": 23220 }, { "epoch": 7.748498999332888, "loss": 0.4934, "step": 23230 }, { "epoch": 7.748498999332888, "grad_norm": 1.57815420627594, "step": 23230 }, { "epoch": 7.748498999332888, "learning_rate": 0.00010172256714625406, "step": 23230 }, { "epoch": 7.748498999332888, "loss": 0.5031512379646301, "step": 23230 }, { "ce_loss": 0.10324139893054962, "epoch": 7.748498999332888, "step": 23230 }, { "distill_loss": 0.26005861163139343, "epoch": 7.748498999332888, "step": 23230 }, { "epoch": 7.748498999332888, "ref_ce_loss": 0.08189309388399124, "step": 23230 }, { "epoch": 7.748498999332888, "loss": 0.44763678312301636, "step": 23230 }, { "ce_loss": 0.10603315383195877, "epoch": 7.748498999332888, "step": 23230 }, { "distill_loss": 0.202081561088562, "epoch": 7.748498999332888, "step": 23230 }, { "epoch": 7.748498999332888, "ref_ce_loss": 0.11808335036039352, "step": 23230 }, { "epoch": 7.751834556370914, "loss": 0.5099, "step": 23240 }, { "epoch": 7.751834556370914, "grad_norm": 0.9400928616523743, "step": 23240 }, { "epoch": 7.751834556370914, "learning_rate": 0.00010143481708911285, "step": 23240 }, { "epoch": 7.751834556370914, "loss": 0.2977118492126465, "step": 23240 }, { "ce_loss": 0.09156695008277893, "epoch": 7.751834556370914, "step": 23240 }, { "distill_loss": 0.12004655599594116, "epoch": 7.751834556370914, "step": 23240 }, { "epoch": 7.751834556370914, "ref_ce_loss": 0.08571289479732513, "step": 23240 }, { "epoch": 7.751834556370914, "loss": 0.7364984750747681, "step": 23240 }, { "ce_loss": 0.056263577193021774, "epoch": 7.751834556370914, "step": 23240 }, { "distill_loss": 0.226426362991333, "epoch": 7.751834556370914, "step": 23240 }, { "epoch": 7.751834556370914, "ref_ce_loss": 0.0868932455778122, "step": 23240 }, { "epoch": 7.755170113408939, "loss": 0.4672, "step": 23250 }, { "epoch": 7.755170113408939, "grad_norm": 1.5790131092071533, "step": 23250 }, { "epoch": 7.755170113408939, "learning_rate": 0.00010114741548966704, "step": 23250 }, { "epoch": 7.755170113408939, "loss": 0.36995837092399597, "step": 23250 }, { "ce_loss": 0.0722150132060051, "epoch": 7.755170113408939, "step": 23250 }, { "distill_loss": 0.16372142732143402, "epoch": 7.755170113408939, "step": 23250 }, { "epoch": 7.755170113408939, "ref_ce_loss": 0.08817657828330994, "step": 23250 }, { "epoch": 7.755170113408939, "loss": 0.4690679907798767, "step": 23250 }, { "ce_loss": 0.08446140587329865, "epoch": 7.755170113408939, "step": 23250 }, { "distill_loss": 0.2306831032037735, "epoch": 7.755170113408939, "step": 23250 }, { "epoch": 7.755170113408939, "ref_ce_loss": 0.07352418452501297, "step": 23250 }, { "epoch": 7.758505670446965, "loss": 0.4484, "step": 23260 }, { "epoch": 7.758505670446965, "grad_norm": 1.297808051109314, "step": 23260 }, { "epoch": 7.758505670446965, "learning_rate": 0.00010086036268334522, "step": 23260 }, { "epoch": 7.758505670446965, "loss": 0.477030485868454, "step": 23260 }, { "ce_loss": 0.098115473985672, "epoch": 7.758505670446965, "step": 23260 }, { "distill_loss": 0.23980093002319336, "epoch": 7.758505670446965, "step": 23260 }, { "epoch": 7.758505670446965, "ref_ce_loss": 0.1157793179154396, "step": 23260 }, { "epoch": 7.758505670446965, "loss": 0.4922773540019989, "step": 23260 }, { "ce_loss": 0.050856947898864746, "epoch": 7.758505670446965, "step": 23260 }, { "distill_loss": 0.2195744812488556, "epoch": 7.758505670446965, "step": 23260 }, { "epoch": 7.758505670446965, "ref_ce_loss": 0.1344345360994339, "step": 23260 }, { "epoch": 7.7618412274849895, "loss": 0.4674, "step": 23270 }, { "epoch": 7.7618412274849895, "grad_norm": 1.9501700401306152, "step": 23270 }, { "epoch": 7.7618412274849895, "learning_rate": 0.0001005736590051689, "step": 23270 }, { "epoch": 7.7618412274849895, "loss": 0.2705991268157959, "step": 23270 }, { "ce_loss": 0.036175936460494995, "epoch": 7.7618412274849895, "step": 23270 }, { "distill_loss": 0.16692174971103668, "epoch": 7.7618412274849895, "step": 23270 }, { "epoch": 7.7618412274849895, "ref_ce_loss": 0.06731446087360382, "step": 23270 }, { "epoch": 7.7618412274849895, "loss": 0.5502997636795044, "step": 23270 }, { "ce_loss": 0.11611030250787735, "epoch": 7.7618412274849895, "step": 23270 }, { "distill_loss": 0.19314511120319366, "epoch": 7.7618412274849895, "step": 23270 }, { "epoch": 7.7618412274849895, "ref_ce_loss": 0.12072230875492096, "step": 23270 }, { "epoch": 7.765176784523016, "loss": 0.5065, "step": 23280 }, { "epoch": 7.765176784523016, "grad_norm": 2.136212110519409, "step": 23280 }, { "epoch": 7.765176784523016, "learning_rate": 0.00010028730478975226, "step": 23280 }, { "epoch": 7.765176784523016, "loss": 0.47727152705192566, "step": 23280 }, { "ce_loss": 0.11116799712181091, "epoch": 7.765176784523016, "step": 23280 }, { "distill_loss": 0.22672784328460693, "epoch": 7.765176784523016, "step": 23280 }, { "epoch": 7.765176784523016, "ref_ce_loss": 0.09270457178354263, "step": 23280 }, { "epoch": 7.765176784523016, "loss": 0.3988770842552185, "step": 23280 }, { "ce_loss": 0.09395449608564377, "epoch": 7.765176784523016, "step": 23280 }, { "distill_loss": 0.17199018597602844, "epoch": 7.765176784523016, "step": 23280 }, { "epoch": 7.765176784523016, "ref_ce_loss": 0.09608203917741776, "step": 23280 }, { "epoch": 7.76851234156104, "loss": 0.4953, "step": 23290 }, { "epoch": 7.76851234156104, "grad_norm": 1.2677170038223267, "step": 23290 }, { "epoch": 7.76851234156104, "learning_rate": 0.00010000130037130122, "step": 23290 }, { "epoch": 7.76851234156104, "loss": 0.33433595299720764, "step": 23290 }, { "ce_loss": 0.072944276034832, "epoch": 7.76851234156104, "step": 23290 }, { "distill_loss": 0.17916573584079742, "epoch": 7.76851234156104, "step": 23290 }, { "epoch": 7.76851234156104, "ref_ce_loss": 0.08193469047546387, "step": 23290 }, { "epoch": 7.76851234156104, "loss": 0.3454050123691559, "step": 23290 }, { "ce_loss": 0.07415476441383362, "epoch": 7.76851234156104, "step": 23290 }, { "distill_loss": 0.1711643487215042, "epoch": 7.76851234156104, "step": 23290 }, { "epoch": 7.76851234156104, "ref_ce_loss": 0.07147339731454849, "step": 23290 }, { "epoch": 7.771847898599066, "loss": 0.494, "step": 23300 }, { "epoch": 7.771847898599066, "grad_norm": 2.2848458290100098, "step": 23300 }, { "epoch": 7.771847898599066, "learning_rate": 9.971564608361386e-05, "step": 23300 }, { "epoch": 7.771847898599066, "loss": 0.7371745109558105, "step": 23300 }, { "ce_loss": 0.07996401935815811, "epoch": 7.771847898599066, "step": 23300 }, { "distill_loss": 0.23706546425819397, "epoch": 7.771847898599066, "step": 23300 }, { "epoch": 7.771847898599066, "ref_ce_loss": 0.08906930685043335, "step": 23300 }, { "epoch": 7.771847898599066, "loss": 0.6135530471801758, "step": 23300 }, { "ce_loss": 0.11487612128257751, "epoch": 7.771847898599066, "step": 23300 }, { "distill_loss": 0.2451004981994629, "epoch": 7.771847898599066, "step": 23300 }, { "epoch": 7.771847898599066, "ref_ce_loss": 0.09790825098752975, "step": 23300 }, { "epoch": 7.775183455637091, "loss": 0.4982, "step": 23310 }, { "epoch": 7.775183455637091, "grad_norm": 1.1426531076431274, "step": 23310 }, { "epoch": 7.775183455637091, "learning_rate": 9.943034226007944e-05, "step": 23310 }, { "epoch": 7.775183455637091, "loss": 0.5762659311294556, "step": 23310 }, { "ce_loss": 0.07822701334953308, "epoch": 7.775183455637091, "step": 23310 }, { "distill_loss": 0.21036364138126373, "epoch": 7.775183455637091, "step": 23310 }, { "epoch": 7.775183455637091, "ref_ce_loss": 0.09202571213245392, "step": 23310 }, { "epoch": 7.775183455637091, "loss": 0.46803978085517883, "step": 23310 }, { "ce_loss": 0.08661315590143204, "epoch": 7.775183455637091, "step": 23310 }, { "distill_loss": 0.26759105920791626, "epoch": 7.775183455637091, "step": 23310 }, { "epoch": 7.775183455637091, "ref_ce_loss": 0.08875215798616409, "step": 23310 }, { "epoch": 7.778519012675117, "loss": 0.5067, "step": 23320 }, { "epoch": 7.778519012675117, "grad_norm": 1.3050298690795898, "step": 23320 }, { "epoch": 7.778519012675117, "learning_rate": 9.914538923367822e-05, "step": 23320 }, { "epoch": 7.778519012675117, "loss": 0.31625789403915405, "step": 23320 }, { "ce_loss": 0.061495281755924225, "epoch": 7.778519012675117, "step": 23320 }, { "distill_loss": 0.16869650781154633, "epoch": 7.778519012675117, "step": 23320 }, { "epoch": 7.778519012675117, "ref_ce_loss": 0.07117399573326111, "step": 23320 }, { "epoch": 7.778519012675117, "loss": 0.4371589720249176, "step": 23320 }, { "ce_loss": 0.09910117834806442, "epoch": 7.778519012675117, "step": 23320 }, { "distill_loss": 0.20337212085723877, "epoch": 7.778519012675117, "step": 23320 }, { "epoch": 7.778519012675117, "ref_ce_loss": 0.09639453142881393, "step": 23320 }, { "epoch": 7.781854569713142, "loss": 0.4238, "step": 23330 }, { "epoch": 7.781854569713142, "grad_norm": 1.3703293800354004, "step": 23330 }, { "epoch": 7.781854569713142, "learning_rate": 9.886078733698108e-05, "step": 23330 }, { "epoch": 7.781854569713142, "loss": 0.6084938645362854, "step": 23330 }, { "ce_loss": 0.09252669662237167, "epoch": 7.781854569713142, "step": 23330 }, { "distill_loss": 0.2027970552444458, "epoch": 7.781854569713142, "step": 23330 }, { "epoch": 7.781854569713142, "ref_ce_loss": 0.0995938628911972, "step": 23330 }, { "epoch": 7.781854569713142, "loss": 0.3872583508491516, "step": 23330 }, { "ce_loss": 0.10837969928979874, "epoch": 7.781854569713142, "step": 23330 }, { "distill_loss": 0.19923055171966553, "epoch": 7.781854569713142, "step": 23330 }, { "epoch": 7.781854569713142, "ref_ce_loss": 0.07942364364862442, "step": 23330 }, { "epoch": 7.785190126751168, "loss": 0.4826, "step": 23340 }, { "epoch": 7.785190126751168, "grad_norm": 1.4044020175933838, "step": 23340 }, { "epoch": 7.785190126751168, "learning_rate": 9.857653690214905e-05, "step": 23340 }, { "epoch": 7.785190126751168, "loss": 0.6372712850570679, "step": 23340 }, { "ce_loss": 0.1294550895690918, "epoch": 7.785190126751168, "step": 23340 }, { "distill_loss": 0.289159893989563, "epoch": 7.785190126751168, "step": 23340 }, { "epoch": 7.785190126751168, "ref_ce_loss": 0.10100501775741577, "step": 23340 }, { "epoch": 7.785190126751168, "loss": 0.5620340704917908, "step": 23340 }, { "ce_loss": 0.15031994879245758, "epoch": 7.785190126751168, "step": 23340 }, { "distill_loss": 0.26740288734436035, "epoch": 7.785190126751168, "step": 23340 }, { "epoch": 7.785190126751168, "ref_ce_loss": 0.11114541441202164, "step": 23340 }, { "epoch": 7.788525683789192, "loss": 0.4902, "step": 23350 }, { "epoch": 7.788525683789192, "grad_norm": 1.316171646118164, "step": 23350 }, { "epoch": 7.788525683789192, "learning_rate": 9.829263826093305e-05, "step": 23350 }, { "epoch": 7.788525683789192, "loss": 0.3973608911037445, "step": 23350 }, { "ce_loss": 0.055136047303676605, "epoch": 7.788525683789192, "step": 23350 }, { "distill_loss": 0.2086581289768219, "epoch": 7.788525683789192, "step": 23350 }, { "epoch": 7.788525683789192, "ref_ce_loss": 0.06788954138755798, "step": 23350 }, { "epoch": 7.788525683789192, "loss": 0.3197157084941864, "step": 23350 }, { "ce_loss": 0.0911700427532196, "epoch": 7.788525683789192, "step": 23350 }, { "distill_loss": 0.13513973355293274, "epoch": 7.788525683789192, "step": 23350 }, { "epoch": 7.788525683789192, "ref_ce_loss": 0.07355178147554398, "step": 23350 }, { "epoch": 7.7918612408272185, "loss": 0.467, "step": 23360 }, { "epoch": 7.7918612408272185, "grad_norm": 1.2956640720367432, "step": 23360 }, { "epoch": 7.7918612408272185, "learning_rate": 9.800909174467317e-05, "step": 23360 }, { "epoch": 7.7918612408272185, "loss": 0.5860046148300171, "step": 23360 }, { "ce_loss": 0.08855269849300385, "epoch": 7.7918612408272185, "step": 23360 }, { "distill_loss": 0.20145806670188904, "epoch": 7.7918612408272185, "step": 23360 }, { "epoch": 7.7918612408272185, "ref_ce_loss": 0.07742108404636383, "step": 23360 }, { "epoch": 7.7918612408272185, "loss": 0.500344455242157, "step": 23360 }, { "ce_loss": 0.06911452859640121, "epoch": 7.7918612408272185, "step": 23360 }, { "distill_loss": 0.24287210404872894, "epoch": 7.7918612408272185, "step": 23360 }, { "epoch": 7.7918612408272185, "ref_ce_loss": 0.07664454728364944, "step": 23360 }, { "epoch": 7.795196797865243, "loss": 0.4986, "step": 23370 }, { "epoch": 7.795196797865243, "grad_norm": 2.773852586746216, "step": 23370 }, { "epoch": 7.795196797865243, "learning_rate": 9.772589768429874e-05, "step": 23370 }, { "epoch": 7.795196797865243, "loss": 0.49430906772613525, "step": 23370 }, { "ce_loss": 0.13668407499790192, "epoch": 7.795196797865243, "step": 23370 }, { "distill_loss": 0.249668151140213, "epoch": 7.795196797865243, "step": 23370 }, { "epoch": 7.795196797865243, "ref_ce_loss": 0.08518452197313309, "step": 23370 }, { "epoch": 7.795196797865243, "loss": 0.3225928843021393, "step": 23370 }, { "ce_loss": 0.06387481838464737, "epoch": 7.795196797865243, "step": 23370 }, { "distill_loss": 0.15631364285945892, "epoch": 7.795196797865243, "step": 23370 }, { "epoch": 7.795196797865243, "ref_ce_loss": 0.07911847531795502, "step": 23370 }, { "epoch": 7.798532354903269, "loss": 0.4476, "step": 23380 }, { "epoch": 7.798532354903269, "grad_norm": 3.633497714996338, "step": 23380 }, { "epoch": 7.798532354903269, "learning_rate": 9.744305641032778e-05, "step": 23380 }, { "epoch": 7.798532354903269, "loss": 0.5017527937889099, "step": 23380 }, { "ce_loss": 0.10943292081356049, "epoch": 7.798532354903269, "step": 23380 }, { "distill_loss": 0.26812535524368286, "epoch": 7.798532354903269, "step": 23380 }, { "epoch": 7.798532354903269, "ref_ce_loss": 0.0913664773106575, "step": 23380 }, { "epoch": 7.798532354903269, "loss": 0.3241008222103119, "step": 23380 }, { "ce_loss": 0.0440908744931221, "epoch": 7.798532354903269, "step": 23380 }, { "distill_loss": 0.20425589382648468, "epoch": 7.798532354903269, "step": 23380 }, { "epoch": 7.798532354903269, "ref_ce_loss": 0.05590224638581276, "step": 23380 }, { "epoch": 7.801867911941294, "loss": 0.4976, "step": 23390 }, { "epoch": 7.801867911941294, "grad_norm": 1.370785117149353, "step": 23390 }, { "epoch": 7.801867911941294, "learning_rate": 9.71605682528666e-05, "step": 23390 }, { "epoch": 7.801867911941294, "loss": 0.47232586145401, "step": 23390 }, { "ce_loss": 0.0808471217751503, "epoch": 7.801867911941294, "step": 23390 }, { "distill_loss": 0.26032713055610657, "epoch": 7.801867911941294, "step": 23390 }, { "epoch": 7.801867911941294, "ref_ce_loss": 0.09770216792821884, "step": 23390 }, { "epoch": 7.801867911941294, "loss": 0.5162336230278015, "step": 23390 }, { "ce_loss": 0.11250855028629303, "epoch": 7.801867911941294, "step": 23390 }, { "distill_loss": 0.22452497482299805, "epoch": 7.801867911941294, "step": 23390 }, { "epoch": 7.801867911941294, "ref_ce_loss": 0.10824142396450043, "step": 23390 }, { "epoch": 7.80520346897932, "loss": 0.5396, "step": 23400 }, { "epoch": 7.80520346897932, "grad_norm": 1.8400545120239258, "step": 23400 }, { "epoch": 7.80520346897932, "learning_rate": 9.687843354160904e-05, "step": 23400 }, { "epoch": 7.80520346897932, "loss": 0.515917956829071, "step": 23400 }, { "ce_loss": 0.10233543813228607, "epoch": 7.80520346897932, "step": 23400 }, { "distill_loss": 0.19212068617343903, "epoch": 7.80520346897932, "step": 23400 }, { "epoch": 7.80520346897932, "ref_ce_loss": 0.109577476978302, "step": 23400 }, { "epoch": 7.80520346897932, "loss": 0.4446874260902405, "step": 23400 }, { "ce_loss": 0.07502096146345139, "epoch": 7.80520346897932, "step": 23400 }, { "distill_loss": 0.2301982194185257, "epoch": 7.80520346897932, "step": 23400 }, { "epoch": 7.80520346897932, "ref_ce_loss": 0.08803768455982208, "step": 23400 }, { "epoch": 7.808539026017344, "loss": 0.5094, "step": 23410 }, { "epoch": 7.808539026017344, "grad_norm": 1.3168748617172241, "step": 23410 }, { "epoch": 7.808539026017344, "learning_rate": 9.65966526058367e-05, "step": 23410 }, { "epoch": 7.808539026017344, "loss": 0.366154283285141, "step": 23410 }, { "ce_loss": 0.06043058633804321, "epoch": 7.808539026017344, "step": 23410 }, { "distill_loss": 0.21017275750637054, "epoch": 7.808539026017344, "step": 23410 }, { "epoch": 7.808539026017344, "ref_ce_loss": 0.06821354478597641, "step": 23410 }, { "epoch": 7.808539026017344, "loss": 0.38487592339515686, "step": 23410 }, { "ce_loss": 0.08029428869485855, "epoch": 7.808539026017344, "step": 23410 }, { "distill_loss": 0.1981252282857895, "epoch": 7.808539026017344, "step": 23410 }, { "epoch": 7.808539026017344, "ref_ce_loss": 0.07559670507907867, "step": 23410 }, { "epoch": 7.811874583055371, "loss": 0.4743, "step": 23420 }, { "epoch": 7.811874583055371, "grad_norm": 1.4600363969802856, "step": 23420 }, { "epoch": 7.811874583055371, "learning_rate": 9.631522577441838e-05, "step": 23420 }, { "epoch": 7.811874583055371, "loss": 0.6461279988288879, "step": 23420 }, { "ce_loss": 0.12234245985746384, "epoch": 7.811874583055371, "step": 23420 }, { "distill_loss": 0.25451117753982544, "epoch": 7.811874583055371, "step": 23420 }, { "epoch": 7.811874583055371, "ref_ce_loss": 0.14721155166625977, "step": 23420 }, { "epoch": 7.811874583055371, "loss": 0.4057418406009674, "step": 23420 }, { "ce_loss": 0.09284445643424988, "epoch": 7.811874583055371, "step": 23420 }, { "distill_loss": 0.217638298869133, "epoch": 7.811874583055371, "step": 23420 }, { "epoch": 7.811874583055371, "ref_ce_loss": 0.0950545147061348, "step": 23420 }, { "epoch": 7.815210140093396, "loss": 0.4539, "step": 23430 }, { "epoch": 7.815210140093396, "grad_norm": 1.27256441116333, "step": 23430 }, { "epoch": 7.815210140093396, "learning_rate": 9.603415337580939e-05, "step": 23430 }, { "epoch": 7.815210140093396, "loss": 0.45491790771484375, "step": 23430 }, { "ce_loss": 0.08659575879573822, "epoch": 7.815210140093396, "step": 23430 }, { "distill_loss": 0.20614565908908844, "epoch": 7.815210140093396, "step": 23430 }, { "epoch": 7.815210140093396, "ref_ce_loss": 0.09821710735559464, "step": 23430 }, { "epoch": 7.815210140093396, "loss": 0.5650123357772827, "step": 23430 }, { "ce_loss": 0.08840961009263992, "epoch": 7.815210140093396, "step": 23430 }, { "distill_loss": 0.2519112825393677, "epoch": 7.815210140093396, "step": 23430 }, { "epoch": 7.815210140093396, "ref_ce_loss": 0.09993168711662292, "step": 23430 }, { "epoch": 7.818545697131421, "loss": 0.4891, "step": 23440 }, { "epoch": 7.818545697131421, "grad_norm": 2.7551286220550537, "step": 23440 }, { "epoch": 7.818545697131421, "learning_rate": 9.575343573805149e-05, "step": 23440 }, { "epoch": 7.818545697131421, "loss": 0.5477972030639648, "step": 23440 }, { "ce_loss": 0.08709275722503662, "epoch": 7.818545697131421, "step": 23440 }, { "distill_loss": 0.22108495235443115, "epoch": 7.818545697131421, "step": 23440 }, { "epoch": 7.818545697131421, "ref_ce_loss": 0.0938587412238121, "step": 23440 }, { "epoch": 7.818545697131421, "loss": 0.28603890538215637, "step": 23440 }, { "ce_loss": 0.04387114942073822, "epoch": 7.818545697131421, "step": 23440 }, { "distill_loss": 0.16789186000823975, "epoch": 7.818545697131421, "step": 23440 }, { "epoch": 7.818545697131421, "ref_ce_loss": 0.07395945489406586, "step": 23440 }, { "epoch": 7.821881254169447, "loss": 0.4996, "step": 23450 }, { "epoch": 7.821881254169447, "grad_norm": 2.3392765522003174, "step": 23450 }, { "epoch": 7.821881254169447, "learning_rate": 9.547307318877234e-05, "step": 23450 }, { "epoch": 7.821881254169447, "loss": 0.5413024425506592, "step": 23450 }, { "ce_loss": 0.11601690202951431, "epoch": 7.821881254169447, "step": 23450 }, { "distill_loss": 0.2168048620223999, "epoch": 7.821881254169447, "step": 23450 }, { "epoch": 7.821881254169447, "ref_ce_loss": 0.11610511690378189, "step": 23450 }, { "epoch": 7.821881254169447, "loss": 0.422008752822876, "step": 23450 }, { "ce_loss": 0.08798611164093018, "epoch": 7.821881254169447, "step": 23450 }, { "distill_loss": 0.23578450083732605, "epoch": 7.821881254169447, "step": 23450 }, { "epoch": 7.821881254169447, "ref_ce_loss": 0.0685184970498085, "step": 23450 }, { "epoch": 7.825216811207472, "loss": 0.4755, "step": 23460 }, { "epoch": 7.825216811207472, "grad_norm": 2.4963676929473877, "step": 23460 }, { "epoch": 7.825216811207472, "learning_rate": 9.519306605518527e-05, "step": 23460 }, { "epoch": 7.825216811207472, "loss": 0.7461239695549011, "step": 23460 }, { "ce_loss": 0.10150115191936493, "epoch": 7.825216811207472, "step": 23460 }, { "distill_loss": 0.17962796986103058, "epoch": 7.825216811207472, "step": 23460 }, { "epoch": 7.825216811207472, "ref_ce_loss": 0.10328700393438339, "step": 23460 }, { "epoch": 7.825216811207472, "loss": 0.3461375832557678, "step": 23460 }, { "ce_loss": 0.07732126116752625, "epoch": 7.825216811207472, "step": 23460 }, { "distill_loss": 0.18031379580497742, "epoch": 7.825216811207472, "step": 23460 }, { "epoch": 7.825216811207472, "ref_ce_loss": 0.07350736856460571, "step": 23460 }, { "epoch": 7.828552368245497, "loss": 0.5119, "step": 23470 }, { "epoch": 7.828552368245497, "grad_norm": 1.6425929069519043, "step": 23470 }, { "epoch": 7.828552368245497, "learning_rate": 9.491341466408882e-05, "step": 23470 }, { "epoch": 7.828552368245497, "loss": 0.44518062472343445, "step": 23470 }, { "ce_loss": 0.11213728785514832, "epoch": 7.828552368245497, "step": 23470 }, { "distill_loss": 0.21456876397132874, "epoch": 7.828552368245497, "step": 23470 }, { "epoch": 7.828552368245497, "ref_ce_loss": 0.11830023676156998, "step": 23470 }, { "epoch": 7.828552368245497, "loss": 0.4567699432373047, "step": 23470 }, { "ce_loss": 0.0971532016992569, "epoch": 7.828552368245497, "step": 23470 }, { "distill_loss": 0.22243893146514893, "epoch": 7.828552368245497, "step": 23470 }, { "epoch": 7.828552368245497, "ref_ce_loss": 0.10531125962734222, "step": 23470 }, { "epoch": 7.831887925283523, "loss": 0.4603, "step": 23480 }, { "epoch": 7.831887925283523, "grad_norm": 1.9173786640167236, "step": 23480 }, { "epoch": 7.831887925283523, "learning_rate": 9.463411934186601e-05, "step": 23480 }, { "epoch": 7.831887925283523, "loss": 0.5281015038490295, "step": 23480 }, { "ce_loss": 0.09305088967084885, "epoch": 7.831887925283523, "step": 23480 }, { "distill_loss": 0.27902358770370483, "epoch": 7.831887925283523, "step": 23480 }, { "epoch": 7.831887925283523, "ref_ce_loss": 0.07683122158050537, "step": 23480 }, { "epoch": 7.831887925283523, "loss": 0.44092512130737305, "step": 23480 }, { "ce_loss": 0.05991402268409729, "epoch": 7.831887925283523, "step": 23480 }, { "distill_loss": 0.261231005191803, "epoch": 7.831887925283523, "step": 23480 }, { "epoch": 7.831887925283523, "ref_ce_loss": 0.08075682073831558, "step": 23480 }, { "epoch": 7.835223482321548, "loss": 0.4746, "step": 23490 }, { "epoch": 7.835223482321548, "grad_norm": 1.47471284866333, "step": 23490 }, { "epoch": 7.835223482321548, "learning_rate": 9.435518041448466e-05, "step": 23490 }, { "epoch": 7.835223482321548, "loss": 0.3915292024612427, "step": 23490 }, { "ce_loss": 0.0853789672255516, "epoch": 7.835223482321548, "step": 23490 }, { "distill_loss": 0.16837775707244873, "epoch": 7.835223482321548, "step": 23490 }, { "epoch": 7.835223482321548, "ref_ce_loss": 0.09448844194412231, "step": 23490 }, { "epoch": 7.835223482321548, "loss": 0.43984946608543396, "step": 23490 }, { "ce_loss": 0.08536282926797867, "epoch": 7.835223482321548, "step": 23490 }, { "distill_loss": 0.207362562417984, "epoch": 7.835223482321548, "step": 23490 }, { "epoch": 7.835223482321548, "ref_ce_loss": 0.11695067584514618, "step": 23490 }, { "epoch": 7.838559039359573, "loss": 0.4805, "step": 23500 }, { "epoch": 7.838559039359573, "grad_norm": 1.0447158813476562, "step": 23500 }, { "epoch": 7.838559039359573, "learning_rate": 9.407659820749648e-05, "step": 23500 }, { "epoch": 7.838559039359573, "loss": 0.5188337564468384, "step": 23500 }, { "ce_loss": 0.06215955317020416, "epoch": 7.838559039359573, "step": 23500 }, { "distill_loss": 0.18601679801940918, "epoch": 7.838559039359573, "step": 23500 }, { "epoch": 7.838559039359573, "ref_ce_loss": 0.08605736494064331, "step": 23500 }, { "epoch": 7.838559039359573, "loss": 0.4018697142601013, "step": 23500 }, { "ce_loss": 0.09234922379255295, "epoch": 7.838559039359573, "step": 23500 }, { "distill_loss": 0.201063871383667, "epoch": 7.838559039359573, "step": 23500 }, { "epoch": 7.838559039359573, "ref_ce_loss": 0.07423745840787888, "step": 23500 }, { "epoch": 7.841894596397599, "loss": 0.447, "step": 23510 }, { "epoch": 7.841894596397599, "grad_norm": 1.3518065214157104, "step": 23510 }, { "epoch": 7.841894596397599, "learning_rate": 9.379837304603705e-05, "step": 23510 }, { "epoch": 7.841894596397599, "loss": 0.3417120575904846, "step": 23510 }, { "ce_loss": 0.07121224701404572, "epoch": 7.841894596397599, "step": 23510 }, { "distill_loss": 0.1719876229763031, "epoch": 7.841894596397599, "step": 23510 }, { "epoch": 7.841894596397599, "ref_ce_loss": 0.062478888779878616, "step": 23510 }, { "epoch": 7.841894596397599, "loss": 0.45312342047691345, "step": 23510 }, { "ce_loss": 0.11944121867418289, "epoch": 7.841894596397599, "step": 23510 }, { "distill_loss": 0.23101180791854858, "epoch": 7.841894596397599, "step": 23510 }, { "epoch": 7.841894596397599, "ref_ce_loss": 0.1022668406367302, "step": 23510 }, { "epoch": 7.845230153435624, "loss": 0.4267, "step": 23520 }, { "epoch": 7.845230153435624, "grad_norm": 1.7903634309768677, "step": 23520 }, { "epoch": 7.845230153435624, "learning_rate": 9.352050525482478e-05, "step": 23520 }, { "epoch": 7.845230153435624, "loss": 0.3670975863933563, "step": 23520 }, { "ce_loss": 0.08908902108669281, "epoch": 7.845230153435624, "step": 23520 }, { "distill_loss": 0.17360833287239075, "epoch": 7.845230153435624, "step": 23520 }, { "epoch": 7.845230153435624, "ref_ce_loss": 0.08187222480773926, "step": 23520 }, { "epoch": 7.845230153435624, "loss": 0.42096009850502014, "step": 23520 }, { "ce_loss": 0.08967997133731842, "epoch": 7.845230153435624, "step": 23520 }, { "distill_loss": 0.20975546538829803, "epoch": 7.845230153435624, "step": 23520 }, { "epoch": 7.845230153435624, "ref_ce_loss": 0.09015170484781265, "step": 23520 }, { "epoch": 7.8485657104736495, "loss": 0.4412, "step": 23530 }, { "epoch": 7.8485657104736495, "grad_norm": 1.3786771297454834, "step": 23530 }, { "epoch": 7.8485657104736495, "learning_rate": 9.324299515816148e-05, "step": 23530 }, { "epoch": 7.8485657104736495, "loss": 0.4102479815483093, "step": 23530 }, { "ce_loss": 0.046972282230854034, "epoch": 7.8485657104736495, "step": 23530 }, { "distill_loss": 0.22596120834350586, "epoch": 7.8485657104736495, "step": 23530 }, { "epoch": 7.8485657104736495, "ref_ce_loss": 0.09007242321968079, "step": 23530 }, { "epoch": 7.8485657104736495, "loss": 0.6155588030815125, "step": 23530 }, { "ce_loss": 0.0814739242196083, "epoch": 7.8485657104736495, "step": 23530 }, { "distill_loss": 0.2271156758069992, "epoch": 7.8485657104736495, "step": 23530 }, { "epoch": 7.8485657104736495, "ref_ce_loss": 0.09563101828098297, "step": 23530 }, { "epoch": 7.851901267511675, "loss": 0.5099, "step": 23540 }, { "epoch": 7.851901267511675, "grad_norm": 1.020455241203308, "step": 23540 }, { "epoch": 7.851901267511675, "learning_rate": 9.296584307993125e-05, "step": 23540 }, { "epoch": 7.851901267511675, "loss": 0.6030544638633728, "step": 23540 }, { "ce_loss": 0.1653348058462143, "epoch": 7.851901267511675, "step": 23540 }, { "distill_loss": 0.2651827335357666, "epoch": 7.851901267511675, "step": 23540 }, { "epoch": 7.851901267511675, "ref_ce_loss": 0.11329934746026993, "step": 23540 }, { "epoch": 7.851901267511675, "loss": 0.6119688153266907, "step": 23540 }, { "ce_loss": 0.13465023040771484, "epoch": 7.851901267511675, "step": 23540 }, { "distill_loss": 0.22328448295593262, "epoch": 7.851901267511675, "step": 23540 }, { "epoch": 7.851901267511675, "ref_ce_loss": 0.1069074496626854, "step": 23540 }, { "epoch": 7.8552368245497, "loss": 0.5109, "step": 23550 }, { "epoch": 7.8552368245497, "grad_norm": 1.1971163749694824, "step": 23550 }, { "epoch": 7.8552368245497, "learning_rate": 9.268904934360039e-05, "step": 23550 }, { "epoch": 7.8552368245497, "loss": 0.4943358600139618, "step": 23550 }, { "ce_loss": 0.14528106153011322, "epoch": 7.8552368245497, "step": 23550 }, { "distill_loss": 0.21824562549591064, "epoch": 7.8552368245497, "step": 23550 }, { "epoch": 7.8552368245497, "ref_ce_loss": 0.0931699350476265, "step": 23550 }, { "epoch": 7.8552368245497, "loss": 0.43538621068000793, "step": 23550 }, { "ce_loss": 0.07891083508729935, "epoch": 7.8552368245497, "step": 23550 }, { "distill_loss": 0.23238444328308105, "epoch": 7.8552368245497, "step": 23550 }, { "epoch": 7.8552368245497, "ref_ce_loss": 0.0957845076918602, "step": 23550 }, { "epoch": 7.8585723815877255, "loss": 0.4925, "step": 23560 }, { "epoch": 7.8585723815877255, "grad_norm": 1.2838505506515503, "step": 23560 }, { "epoch": 7.8585723815877255, "learning_rate": 9.241261427221695e-05, "step": 23560 }, { "epoch": 7.8585723815877255, "loss": 0.37081849575042725, "step": 23560 }, { "ce_loss": 0.06106231361627579, "epoch": 7.8585723815877255, "step": 23560 }, { "distill_loss": 0.17535613477230072, "epoch": 7.8585723815877255, "step": 23560 }, { "epoch": 7.8585723815877255, "ref_ce_loss": 0.07697635889053345, "step": 23560 }, { "epoch": 7.8585723815877255, "loss": 0.43605366349220276, "step": 23560 }, { "ce_loss": 0.08120652288198471, "epoch": 7.8585723815877255, "step": 23560 }, { "distill_loss": 0.20109125971794128, "epoch": 7.8585723815877255, "step": 23560 }, { "epoch": 7.8585723815877255, "ref_ce_loss": 0.11010712385177612, "step": 23560 }, { "epoch": 7.861907938625751, "loss": 0.4525, "step": 23570 }, { "epoch": 7.861907938625751, "grad_norm": 2.322266101837158, "step": 23570 }, { "epoch": 7.861907938625751, "learning_rate": 9.213653818841046e-05, "step": 23570 }, { "epoch": 7.861907938625751, "loss": 0.6685158014297485, "step": 23570 }, { "ce_loss": 0.11310289800167084, "epoch": 7.861907938625751, "step": 23570 }, { "distill_loss": 0.24341696500778198, "epoch": 7.861907938625751, "step": 23570 }, { "epoch": 7.861907938625751, "ref_ce_loss": 0.1057773232460022, "step": 23570 }, { "epoch": 7.861907938625751, "loss": 0.48814332485198975, "step": 23570 }, { "ce_loss": 0.08805195987224579, "epoch": 7.861907938625751, "step": 23570 }, { "distill_loss": 0.2128458321094513, "epoch": 7.861907938625751, "step": 23570 }, { "epoch": 7.861907938625751, "ref_ce_loss": 0.09854722023010254, "step": 23570 }, { "epoch": 7.865243495663776, "loss": 0.5274, "step": 23580 }, { "epoch": 7.865243495663776, "grad_norm": 2.334955930709839, "step": 23580 }, { "epoch": 7.865243495663776, "learning_rate": 9.186082141439145e-05, "step": 23580 }, { "epoch": 7.865243495663776, "loss": 0.43076473474502563, "step": 23580 }, { "ce_loss": 0.10165365785360336, "epoch": 7.865243495663776, "step": 23580 }, { "distill_loss": 0.2332915961742401, "epoch": 7.865243495663776, "step": 23580 }, { "epoch": 7.865243495663776, "ref_ce_loss": 0.09554749727249146, "step": 23580 }, { "epoch": 7.865243495663776, "loss": 0.41907650232315063, "step": 23580 }, { "ce_loss": 0.0959557369351387, "epoch": 7.865243495663776, "step": 23580 }, { "distill_loss": 0.23277854919433594, "epoch": 7.865243495663776, "step": 23580 }, { "epoch": 7.865243495663776, "ref_ce_loss": 0.08993334323167801, "step": 23580 }, { "epoch": 7.868579052701802, "loss": 0.4974, "step": 23590 }, { "epoch": 7.868579052701802, "grad_norm": 1.3074328899383545, "step": 23590 }, { "epoch": 7.868579052701802, "learning_rate": 9.158546427195092e-05, "step": 23590 }, { "epoch": 7.868579052701802, "loss": 0.5405129194259644, "step": 23590 }, { "ce_loss": 0.10693372040987015, "epoch": 7.868579052701802, "step": 23590 }, { "distill_loss": 0.21220289170742035, "epoch": 7.868579052701802, "step": 23590 }, { "epoch": 7.868579052701802, "ref_ce_loss": 0.11354639381170273, "step": 23590 }, { "epoch": 7.868579052701802, "loss": 0.3445146083831787, "step": 23590 }, { "ce_loss": 0.09031592309474945, "epoch": 7.868579052701802, "step": 23590 }, { "distill_loss": 0.16584084928035736, "epoch": 7.868579052701802, "step": 23590 }, { "epoch": 7.868579052701802, "ref_ce_loss": 0.08803331106901169, "step": 23590 }, { "epoch": 7.871914609739827, "loss": 0.4623, "step": 23600 }, { "epoch": 7.871914609739827, "grad_norm": 1.153918743133545, "step": 23600 }, { "epoch": 7.871914609739827, "learning_rate": 9.131046708246036e-05, "step": 23600 }, { "epoch": 7.871914609739827, "loss": 0.3527657985687256, "step": 23600 }, { "ce_loss": 0.07177247107028961, "epoch": 7.871914609739827, "step": 23600 }, { "distill_loss": 0.19940011203289032, "epoch": 7.871914609739827, "step": 23600 }, { "epoch": 7.871914609739827, "ref_ce_loss": 0.08104899525642395, "step": 23600 }, { "epoch": 7.871914609739827, "loss": 0.37144240736961365, "step": 23600 }, { "ce_loss": 0.08155050873756409, "epoch": 7.871914609739827, "step": 23600 }, { "distill_loss": 0.17217794060707092, "epoch": 7.871914609739827, "step": 23600 }, { "epoch": 7.871914609739827, "ref_ce_loss": 0.08980002254247665, "step": 23600 }, { "epoch": 7.875250166777852, "loss": 0.5094, "step": 23610 }, { "epoch": 7.875250166777852, "grad_norm": 2.631946086883545, "step": 23610 }, { "epoch": 7.875250166777852, "learning_rate": 9.103583016687105e-05, "step": 23610 }, { "epoch": 7.875250166777852, "loss": 0.4280019700527191, "step": 23610 }, { "ce_loss": 0.07770437747240067, "epoch": 7.875250166777852, "step": 23610 }, { "distill_loss": 0.22484853863716125, "epoch": 7.875250166777852, "step": 23610 }, { "epoch": 7.875250166777852, "ref_ce_loss": 0.059737708419561386, "step": 23610 }, { "epoch": 7.875250166777852, "loss": 0.37178662419319153, "step": 23610 }, { "ce_loss": 0.0942569151520729, "epoch": 7.875250166777852, "step": 23610 }, { "distill_loss": 0.1954062432050705, "epoch": 7.875250166777852, "step": 23610 }, { "epoch": 7.875250166777852, "ref_ce_loss": 0.08182733505964279, "step": 23610 }, { "epoch": 7.878585723815878, "loss": 0.4368, "step": 23620 }, { "epoch": 7.878585723815878, "grad_norm": 0.9139115810394287, "step": 23620 }, { "epoch": 7.878585723815878, "learning_rate": 9.076155384571387e-05, "step": 23620 }, { "epoch": 7.878585723815878, "loss": 0.6207575798034668, "step": 23620 }, { "ce_loss": 0.09600795805454254, "epoch": 7.878585723815878, "step": 23620 }, { "distill_loss": 0.264886736869812, "epoch": 7.878585723815878, "step": 23620 }, { "epoch": 7.878585723815878, "ref_ce_loss": 0.09426525235176086, "step": 23620 }, { "epoch": 7.878585723815878, "loss": 0.4685150384902954, "step": 23620 }, { "ce_loss": 0.12142255157232285, "epoch": 7.878585723815878, "step": 23620 }, { "distill_loss": 0.2318248301744461, "epoch": 7.878585723815878, "step": 23620 }, { "epoch": 7.878585723815878, "ref_ce_loss": 0.08457332104444504, "step": 23620 }, { "epoch": 7.881921280853903, "loss": 0.5001, "step": 23630 }, { "epoch": 7.881921280853903, "grad_norm": 1.480294942855835, "step": 23630 }, { "epoch": 7.881921280853903, "learning_rate": 9.048763843909891e-05, "step": 23630 }, { "epoch": 7.881921280853903, "loss": 0.5095387101173401, "step": 23630 }, { "ce_loss": 0.11818449944257736, "epoch": 7.881921280853903, "step": 23630 }, { "distill_loss": 0.24988248944282532, "epoch": 7.881921280853903, "step": 23630 }, { "epoch": 7.881921280853903, "ref_ce_loss": 0.09514731168746948, "step": 23630 }, { "epoch": 7.881921280853903, "loss": 0.3793356418609619, "step": 23630 }, { "ce_loss": 0.07153532654047012, "epoch": 7.881921280853903, "step": 23630 }, { "distill_loss": 0.19732148945331573, "epoch": 7.881921280853903, "step": 23630 }, { "epoch": 7.881921280853903, "ref_ce_loss": 0.06991773098707199, "step": 23630 }, { "epoch": 7.885256837891928, "loss": 0.5244, "step": 23640 }, { "epoch": 7.885256837891928, "grad_norm": 2.3088581562042236, "step": 23640 }, { "epoch": 7.885256837891928, "learning_rate": 9.021408426671469e-05, "step": 23640 }, { "epoch": 7.885256837891928, "loss": 0.46456190943717957, "step": 23640 }, { "ce_loss": 0.08094074577093124, "epoch": 7.885256837891928, "step": 23640 }, { "distill_loss": 0.2792062759399414, "epoch": 7.885256837891928, "step": 23640 }, { "epoch": 7.885256837891928, "ref_ce_loss": 0.06169167906045914, "step": 23640 }, { "epoch": 7.885256837891928, "loss": 0.5832566618919373, "step": 23640 }, { "ce_loss": 0.1753084659576416, "epoch": 7.885256837891928, "step": 23640 }, { "distill_loss": 0.2629208266735077, "epoch": 7.885256837891928, "step": 23640 }, { "epoch": 7.885256837891928, "ref_ce_loss": 0.1448318362236023, "step": 23640 }, { "epoch": 7.888592394929954, "loss": 0.5023, "step": 23650 }, { "epoch": 7.888592394929954, "grad_norm": 1.5563539266586304, "step": 23650 }, { "epoch": 7.888592394929954, "learning_rate": 8.994089164782838e-05, "step": 23650 }, { "epoch": 7.888592394929954, "loss": 0.37901008129119873, "step": 23650 }, { "ce_loss": 0.06203204765915871, "epoch": 7.888592394929954, "step": 23650 }, { "distill_loss": 0.17749962210655212, "epoch": 7.888592394929954, "step": 23650 }, { "epoch": 7.888592394929954, "ref_ce_loss": 0.10638806968927383, "step": 23650 }, { "epoch": 7.888592394929954, "loss": 0.39251482486724854, "step": 23650 }, { "ce_loss": 0.06868699938058853, "epoch": 7.888592394929954, "step": 23650 }, { "distill_loss": 0.1591005176305771, "epoch": 7.888592394929954, "step": 23650 }, { "epoch": 7.888592394929954, "ref_ce_loss": 0.08333493769168854, "step": 23650 }, { "epoch": 7.891927951967979, "loss": 0.4642, "step": 23660 }, { "epoch": 7.891927951967979, "grad_norm": 1.2899342775344849, "step": 23660 }, { "epoch": 7.891927951967979, "learning_rate": 8.966806090128543e-05, "step": 23660 }, { "epoch": 7.891927951967979, "loss": 0.3885990083217621, "step": 23660 }, { "ce_loss": 0.06434261053800583, "epoch": 7.891927951967979, "step": 23660 }, { "distill_loss": 0.21467149257659912, "epoch": 7.891927951967979, "step": 23660 }, { "epoch": 7.891927951967979, "ref_ce_loss": 0.07244598120450974, "step": 23660 }, { "epoch": 7.891927951967979, "loss": 0.5642814636230469, "step": 23660 }, { "ce_loss": 0.11021937429904938, "epoch": 7.891927951967979, "step": 23660 }, { "distill_loss": 0.22649425268173218, "epoch": 7.891927951967979, "step": 23660 }, { "epoch": 7.891927951967979, "ref_ce_loss": 0.10326132923364639, "step": 23660 }, { "epoch": 7.895263509006004, "loss": 0.438, "step": 23670 }, { "epoch": 7.895263509006004, "grad_norm": 1.264271855354309, "step": 23670 }, { "epoch": 7.895263509006004, "learning_rate": 8.939559234550845e-05, "step": 23670 }, { "epoch": 7.895263509006004, "loss": 0.43615591526031494, "step": 23670 }, { "ce_loss": 0.07874233275651932, "epoch": 7.895263509006004, "step": 23670 }, { "distill_loss": 0.1742517352104187, "epoch": 7.895263509006004, "step": 23670 }, { "epoch": 7.895263509006004, "ref_ce_loss": 0.09160982072353363, "step": 23670 }, { "epoch": 7.895263509006004, "loss": 0.5795973539352417, "step": 23670 }, { "ce_loss": 0.10307463258504868, "epoch": 7.895263509006004, "step": 23670 }, { "distill_loss": 0.2566601037979126, "epoch": 7.895263509006004, "step": 23670 }, { "epoch": 7.895263509006004, "ref_ce_loss": 0.08958743512630463, "step": 23670 }, { "epoch": 7.89859906604403, "loss": 0.4488, "step": 23680 }, { "epoch": 7.89859906604403, "grad_norm": 1.1816338300704956, "step": 23680 }, { "epoch": 7.89859906604403, "learning_rate": 8.912348629849759e-05, "step": 23680 }, { "epoch": 7.89859906604403, "loss": 0.44713523983955383, "step": 23680 }, { "ce_loss": 0.04218914732336998, "epoch": 7.89859906604403, "step": 23680 }, { "distill_loss": 0.1715305745601654, "epoch": 7.89859906604403, "step": 23680 }, { "epoch": 7.89859906604403, "ref_ce_loss": 0.07068831473588943, "step": 23680 }, { "epoch": 7.89859906604403, "loss": 0.3945850133895874, "step": 23680 }, { "ce_loss": 0.05503353103995323, "epoch": 7.89859906604403, "step": 23680 }, { "distill_loss": 0.20405761897563934, "epoch": 7.89859906604403, "step": 23680 }, { "epoch": 7.89859906604403, "ref_ce_loss": 0.09490940719842911, "step": 23680 }, { "epoch": 7.901934623082055, "loss": 0.4506, "step": 23690 }, { "epoch": 7.901934623082055, "grad_norm": 1.0902290344238281, "step": 23690 }, { "epoch": 7.901934623082055, "learning_rate": 8.88517430778299e-05, "step": 23690 }, { "epoch": 7.901934623082055, "loss": 0.4979480504989624, "step": 23690 }, { "ce_loss": 0.07646001875400543, "epoch": 7.901934623082055, "step": 23690 }, { "distill_loss": 0.20225203037261963, "epoch": 7.901934623082055, "step": 23690 }, { "epoch": 7.901934623082055, "ref_ce_loss": 0.102251335978508, "step": 23690 }, { "epoch": 7.901934623082055, "loss": 0.4533572793006897, "step": 23690 }, { "ce_loss": 0.080350361764431, "epoch": 7.901934623082055, "step": 23690 }, { "distill_loss": 0.1974259465932846, "epoch": 7.901934623082055, "step": 23690 }, { "epoch": 7.901934623082055, "ref_ce_loss": 0.08958312124013901, "step": 23690 }, { "epoch": 7.90527018012008, "loss": 0.4877, "step": 23700 }, { "epoch": 7.90527018012008, "grad_norm": 9.525444984436035, "step": 23700 }, { "epoch": 7.90527018012008, "learning_rate": 8.858036300065911e-05, "step": 23700 }, { "epoch": 7.90527018012008, "loss": 0.37766262888908386, "step": 23700 }, { "ce_loss": 0.08095361292362213, "epoch": 7.90527018012008, "step": 23700 }, { "distill_loss": 0.15454107522964478, "epoch": 7.90527018012008, "step": 23700 }, { "epoch": 7.90527018012008, "ref_ce_loss": 0.10152625292539597, "step": 23700 }, { "epoch": 7.90527018012008, "loss": 0.38847488164901733, "step": 23700 }, { "ce_loss": 0.08247093111276627, "epoch": 7.90527018012008, "step": 23700 }, { "distill_loss": 0.1659182459115982, "epoch": 7.90527018012008, "step": 23700 }, { "epoch": 7.90527018012008, "ref_ce_loss": 0.108720563352108, "step": 23700 }, { "epoch": 7.908605737158106, "loss": 0.4464, "step": 23710 }, { "epoch": 7.908605737158106, "grad_norm": 1.6320754289627075, "step": 23710 }, { "epoch": 7.908605737158106, "learning_rate": 8.830934638371476e-05, "step": 23710 }, { "epoch": 7.908605737158106, "loss": 0.40211835503578186, "step": 23710 }, { "ce_loss": 0.10799682885408401, "epoch": 7.908605737158106, "step": 23710 }, { "distill_loss": 0.19310733675956726, "epoch": 7.908605737158106, "step": 23710 }, { "epoch": 7.908605737158106, "ref_ce_loss": 0.10020504891872406, "step": 23710 }, { "epoch": 7.908605737158106, "loss": 0.32844552397727966, "step": 23710 }, { "ce_loss": 0.0812181755900383, "epoch": 7.908605737158106, "step": 23710 }, { "distill_loss": 0.1739908903837204, "epoch": 7.908605737158106, "step": 23710 }, { "epoch": 7.908605737158106, "ref_ce_loss": 0.07298498600721359, "step": 23710 }, { "epoch": 7.911941294196131, "loss": 0.459, "step": 23720 }, { "epoch": 7.911941294196131, "grad_norm": 1.5958127975463867, "step": 23720 }, { "epoch": 7.911941294196131, "learning_rate": 8.80386935433024e-05, "step": 23720 }, { "epoch": 7.911941294196131, "loss": 0.4221686124801636, "step": 23720 }, { "ce_loss": 0.08290351927280426, "epoch": 7.911941294196131, "step": 23720 }, { "distill_loss": 0.1929956078529358, "epoch": 7.911941294196131, "step": 23720 }, { "epoch": 7.911941294196131, "ref_ce_loss": 0.11231111735105515, "step": 23720 }, { "epoch": 7.911941294196131, "loss": 0.6971574425697327, "step": 23720 }, { "ce_loss": 0.14547531306743622, "epoch": 7.911941294196131, "step": 23720 }, { "distill_loss": 0.2727227210998535, "epoch": 7.911941294196131, "step": 23720 }, { "epoch": 7.911941294196131, "ref_ce_loss": 0.09017284214496613, "step": 23720 }, { "epoch": 7.9152768512341565, "loss": 0.4447, "step": 23730 }, { "epoch": 7.9152768512341565, "grad_norm": 1.317276120185852, "step": 23730 }, { "epoch": 7.9152768512341565, "learning_rate": 8.776840479530317e-05, "step": 23730 }, { "epoch": 7.9152768512341565, "loss": 0.5677560567855835, "step": 23730 }, { "ce_loss": 0.14442239701747894, "epoch": 7.9152768512341565, "step": 23730 }, { "distill_loss": 0.22917179763317108, "epoch": 7.9152768512341565, "step": 23730 }, { "epoch": 7.9152768512341565, "ref_ce_loss": 0.10563664138317108, "step": 23730 }, { "epoch": 7.9152768512341565, "loss": 0.34828436374664307, "step": 23730 }, { "ce_loss": 0.053382810205221176, "epoch": 7.9152768512341565, "step": 23730 }, { "distill_loss": 0.19687311351299286, "epoch": 7.9152768512341565, "step": 23730 }, { "epoch": 7.9152768512341565, "ref_ce_loss": 0.06002996861934662, "step": 23730 }, { "epoch": 7.918612408272182, "loss": 0.4476, "step": 23740 }, { "epoch": 7.918612408272182, "grad_norm": 3.7101621627807617, "step": 23740 }, { "epoch": 7.918612408272182, "learning_rate": 8.749848045517315e-05, "step": 23740 }, { "epoch": 7.918612408272182, "loss": 0.3004143238067627, "step": 23740 }, { "ce_loss": 0.04413343220949173, "epoch": 7.918612408272182, "step": 23740 }, { "distill_loss": 0.14866234362125397, "epoch": 7.918612408272182, "step": 23740 }, { "epoch": 7.918612408272182, "ref_ce_loss": 0.1073392927646637, "step": 23740 }, { "epoch": 7.918612408272182, "loss": 0.581344723701477, "step": 23740 }, { "ce_loss": 0.16669215261936188, "epoch": 7.918612408272182, "step": 23740 }, { "distill_loss": 0.2569911479949951, "epoch": 7.918612408272182, "step": 23740 }, { "epoch": 7.918612408272182, "ref_ce_loss": 0.08630262315273285, "step": 23740 }, { "epoch": 7.921947965310207, "loss": 0.4464, "step": 23750 }, { "epoch": 7.921947965310207, "grad_norm": 1.4706305265426636, "step": 23750 }, { "epoch": 7.921947965310207, "learning_rate": 8.722892083794287e-05, "step": 23750 }, { "epoch": 7.921947965310207, "loss": 0.5540456771850586, "step": 23750 }, { "ce_loss": 0.12081640213727951, "epoch": 7.921947965310207, "step": 23750 }, { "distill_loss": 0.2431809902191162, "epoch": 7.921947965310207, "step": 23750 }, { "epoch": 7.921947965310207, "ref_ce_loss": 0.11178777366876602, "step": 23750 }, { "epoch": 7.921947965310207, "loss": 0.5178357362747192, "step": 23750 }, { "ce_loss": 0.08638735115528107, "epoch": 7.921947965310207, "step": 23750 }, { "distill_loss": 0.21622644364833832, "epoch": 7.921947965310207, "step": 23750 }, { "epoch": 7.921947965310207, "ref_ce_loss": 0.08753465861082077, "step": 23750 }, { "epoch": 7.9252835223482325, "loss": 0.4746, "step": 23760 }, { "epoch": 7.9252835223482325, "grad_norm": 1.4268579483032227, "step": 23760 }, { "epoch": 7.9252835223482325, "learning_rate": 8.695972625821744e-05, "step": 23760 }, { "epoch": 7.9252835223482325, "loss": 0.5005850791931152, "step": 23760 }, { "ce_loss": 0.11384665220975876, "epoch": 7.9252835223482325, "step": 23760 }, { "distill_loss": 0.20936432480812073, "epoch": 7.9252835223482325, "step": 23760 }, { "epoch": 7.9252835223482325, "ref_ce_loss": 0.08197855949401855, "step": 23760 }, { "epoch": 7.9252835223482325, "loss": 0.4651836156845093, "step": 23760 }, { "ce_loss": 0.05032093822956085, "epoch": 7.9252835223482325, "step": 23760 }, { "distill_loss": 0.24174442887306213, "epoch": 7.9252835223482325, "step": 23760 }, { "epoch": 7.9252835223482325, "ref_ce_loss": 0.08633485436439514, "step": 23760 }, { "epoch": 7.928619079386258, "loss": 0.4899, "step": 23770 }, { "epoch": 7.928619079386258, "grad_norm": 1.0803759098052979, "step": 23770 }, { "epoch": 7.928619079386258, "learning_rate": 8.669089703017608e-05, "step": 23770 }, { "epoch": 7.928619079386258, "loss": 0.3728218078613281, "step": 23770 }, { "ce_loss": 0.07326167076826096, "epoch": 7.928619079386258, "step": 23770 }, { "distill_loss": 0.19154828786849976, "epoch": 7.928619079386258, "step": 23770 }, { "epoch": 7.928619079386258, "ref_ce_loss": 0.0846903920173645, "step": 23770 }, { "epoch": 7.928619079386258, "loss": 0.38450494408607483, "step": 23770 }, { "ce_loss": 0.10573725402355194, "epoch": 7.928619079386258, "step": 23770 }, { "distill_loss": 0.18517494201660156, "epoch": 7.928619079386258, "step": 23770 }, { "epoch": 7.928619079386258, "ref_ce_loss": 0.09331288933753967, "step": 23770 }, { "epoch": 7.931954636424283, "loss": 0.476, "step": 23780 }, { "epoch": 7.931954636424283, "grad_norm": 1.3623288869857788, "step": 23780 }, { "epoch": 7.931954636424283, "learning_rate": 8.642243346757144e-05, "step": 23780 }, { "epoch": 7.931954636424283, "loss": 0.33736154437065125, "step": 23780 }, { "ce_loss": 0.0692714974284172, "epoch": 7.931954636424283, "step": 23780 }, { "distill_loss": 0.17541882395744324, "epoch": 7.931954636424283, "step": 23780 }, { "epoch": 7.931954636424283, "ref_ce_loss": 0.07102624326944351, "step": 23780 }, { "epoch": 7.931954636424283, "loss": 0.43745487928390503, "step": 23780 }, { "ce_loss": 0.08982948958873749, "epoch": 7.931954636424283, "step": 23780 }, { "distill_loss": 0.22856579720973969, "epoch": 7.931954636424283, "step": 23780 }, { "epoch": 7.931954636424283, "ref_ce_loss": 0.08436769992113113, "step": 23780 }, { "epoch": 7.935290193462309, "loss": 0.4873, "step": 23790 }, { "epoch": 7.935290193462309, "grad_norm": 0.9369721412658691, "step": 23790 }, { "epoch": 7.935290193462309, "learning_rate": 8.615433588372921e-05, "step": 23790 }, { "epoch": 7.935290193462309, "loss": 0.4392630457878113, "step": 23790 }, { "ce_loss": 0.07594869285821915, "epoch": 7.935290193462309, "step": 23790 }, { "distill_loss": 0.22134481370449066, "epoch": 7.935290193462309, "step": 23790 }, { "epoch": 7.935290193462309, "ref_ce_loss": 0.11620956659317017, "step": 23790 }, { "epoch": 7.935290193462309, "loss": 0.35754644870758057, "step": 23790 }, { "ce_loss": 0.055565547198057175, "epoch": 7.935290193462309, "step": 23790 }, { "distill_loss": 0.18978551030158997, "epoch": 7.935290193462309, "step": 23790 }, { "epoch": 7.935290193462309, "ref_ce_loss": 0.07662378996610641, "step": 23790 }, { "epoch": 7.938625750500334, "loss": 0.4711, "step": 23800 }, { "epoch": 7.938625750500334, "grad_norm": 1.3382545709609985, "step": 23800 }, { "epoch": 7.938625750500334, "learning_rate": 8.588660459154821e-05, "step": 23800 }, { "epoch": 7.938625750500334, "loss": 0.6800578236579895, "step": 23800 }, { "ce_loss": 0.11877725273370743, "epoch": 7.938625750500334, "step": 23800 }, { "distill_loss": 0.25581854581832886, "epoch": 7.938625750500334, "step": 23800 }, { "epoch": 7.938625750500334, "ref_ce_loss": 0.1141149178147316, "step": 23800 }, { "epoch": 7.938625750500334, "loss": 0.4789075255393982, "step": 23800 }, { "ce_loss": 0.06832636147737503, "epoch": 7.938625750500334, "step": 23800 }, { "distill_loss": 0.2092619389295578, "epoch": 7.938625750500334, "step": 23800 }, { "epoch": 7.938625750500334, "ref_ce_loss": 0.070604108273983, "step": 23800 }, { "epoch": 7.941961307538359, "loss": 0.4898, "step": 23810 }, { "epoch": 7.941961307538359, "grad_norm": 1.1001218557357788, "step": 23810 }, { "epoch": 7.941961307538359, "learning_rate": 8.561923990349962e-05, "step": 23810 }, { "epoch": 7.941961307538359, "loss": 0.34215047955513, "step": 23810 }, { "ce_loss": 0.06366624683141708, "epoch": 7.941961307538359, "step": 23810 }, { "distill_loss": 0.13738380372524261, "epoch": 7.941961307538359, "step": 23810 }, { "epoch": 7.941961307538359, "ref_ce_loss": 0.06837518513202667, "step": 23810 }, { "epoch": 7.941961307538359, "loss": 0.4595257043838501, "step": 23810 }, { "ce_loss": 0.12964025139808655, "epoch": 7.941961307538359, "step": 23810 }, { "distill_loss": 0.1967119425535202, "epoch": 7.941961307538359, "step": 23810 }, { "epoch": 7.941961307538359, "ref_ce_loss": 0.10415495932102203, "step": 23810 }, { "epoch": 7.945296864576385, "loss": 0.4967, "step": 23820 }, { "epoch": 7.945296864576385, "grad_norm": 1.7074393033981323, "step": 23820 }, { "epoch": 7.945296864576385, "learning_rate": 8.535224213162694e-05, "step": 23820 }, { "epoch": 7.945296864576385, "loss": 0.7736154198646545, "step": 23820 }, { "ce_loss": 0.12521244585514069, "epoch": 7.945296864576385, "step": 23820 }, { "distill_loss": 0.2687603831291199, "epoch": 7.945296864576385, "step": 23820 }, { "epoch": 7.945296864576385, "ref_ce_loss": 0.0855402946472168, "step": 23820 }, { "epoch": 7.945296864576385, "loss": 0.32968664169311523, "step": 23820 }, { "ce_loss": 0.047487348318099976, "epoch": 7.945296864576385, "step": 23820 }, { "distill_loss": 0.19186483323574066, "epoch": 7.945296864576385, "step": 23820 }, { "epoch": 7.945296864576385, "ref_ce_loss": 0.09002339094877243, "step": 23820 }, { "epoch": 7.94863242161441, "loss": 0.524, "step": 23830 }, { "epoch": 7.94863242161441, "grad_norm": 1.674178957939148, "step": 23830 }, { "epoch": 7.94863242161441, "learning_rate": 8.508561158754508e-05, "step": 23830 }, { "epoch": 7.94863242161441, "loss": 0.5029487609863281, "step": 23830 }, { "ce_loss": 0.10470442473888397, "epoch": 7.94863242161441, "step": 23830 }, { "distill_loss": 0.3090134561061859, "epoch": 7.94863242161441, "step": 23830 }, { "epoch": 7.94863242161441, "ref_ce_loss": 0.08878245949745178, "step": 23830 }, { "epoch": 7.94863242161441, "loss": 0.31632712483406067, "step": 23830 }, { "ce_loss": 0.07198981195688248, "epoch": 7.94863242161441, "step": 23830 }, { "distill_loss": 0.1398410201072693, "epoch": 7.94863242161441, "step": 23830 }, { "epoch": 7.94863242161441, "ref_ce_loss": 0.05910256877541542, "step": 23830 }, { "epoch": 7.951967978652435, "loss": 0.4389, "step": 23840 }, { "epoch": 7.951967978652435, "grad_norm": 1.578067421913147, "step": 23840 }, { "epoch": 7.951967978652435, "learning_rate": 8.481934858244072e-05, "step": 23840 }, { "epoch": 7.951967978652435, "loss": 0.3850739002227783, "step": 23840 }, { "ce_loss": 0.09572809189558029, "epoch": 7.951967978652435, "step": 23840 }, { "distill_loss": 0.2023659646511078, "epoch": 7.951967978652435, "step": 23840 }, { "epoch": 7.951967978652435, "ref_ce_loss": 0.06878726184368134, "step": 23840 }, { "epoch": 7.951967978652435, "loss": 0.33684679865837097, "step": 23840 }, { "ce_loss": 0.05011402815580368, "epoch": 7.951967978652435, "step": 23840 }, { "distill_loss": 0.17439530789852142, "epoch": 7.951967978652435, "step": 23840 }, { "epoch": 7.951967978652435, "ref_ce_loss": 0.08841081708669662, "step": 23840 }, { "epoch": 7.955303535690461, "loss": 0.4765, "step": 23850 }, { "epoch": 7.955303535690461, "grad_norm": 1.7793841361999512, "step": 23850 }, { "epoch": 7.955303535690461, "learning_rate": 8.455345342707138e-05, "step": 23850 }, { "epoch": 7.955303535690461, "loss": 0.46160686016082764, "step": 23850 }, { "ce_loss": 0.07929213345050812, "epoch": 7.955303535690461, "step": 23850 }, { "distill_loss": 0.22592313587665558, "epoch": 7.955303535690461, "step": 23850 }, { "epoch": 7.955303535690461, "ref_ce_loss": 0.10625654458999634, "step": 23850 }, { "epoch": 7.955303535690461, "loss": 0.41478806734085083, "step": 23850 }, { "ce_loss": 0.0628419816493988, "epoch": 7.955303535690461, "step": 23850 }, { "distill_loss": 0.2424730658531189, "epoch": 7.955303535690461, "step": 23850 }, { "epoch": 7.955303535690461, "ref_ce_loss": 0.0736921951174736, "step": 23850 }, { "epoch": 7.958639092728486, "loss": 0.4448, "step": 23860 }, { "epoch": 7.958639092728486, "grad_norm": 1.1721047163009644, "step": 23860 }, { "epoch": 7.958639092728486, "learning_rate": 8.428792643176544e-05, "step": 23860 }, { "epoch": 7.958639092728486, "loss": 0.31136319041252136, "step": 23860 }, { "ce_loss": 0.05320083349943161, "epoch": 7.958639092728486, "step": 23860 }, { "distill_loss": 0.16525880992412567, "epoch": 7.958639092728486, "step": 23860 }, { "epoch": 7.958639092728486, "ref_ce_loss": 0.0927494540810585, "step": 23860 }, { "epoch": 7.958639092728486, "loss": 0.40695562958717346, "step": 23860 }, { "ce_loss": 0.058694299310445786, "epoch": 7.958639092728486, "step": 23860 }, { "distill_loss": 0.23138663172721863, "epoch": 7.958639092728486, "step": 23860 }, { "epoch": 7.958639092728486, "ref_ce_loss": 0.09014415740966797, "step": 23860 }, { "epoch": 7.961974649766511, "loss": 0.4914, "step": 23870 }, { "epoch": 7.961974649766511, "grad_norm": 1.4769777059555054, "step": 23870 }, { "epoch": 7.961974649766511, "learning_rate": 8.402276790642117e-05, "step": 23870 }, { "epoch": 7.961974649766511, "loss": 0.560581624507904, "step": 23870 }, { "ce_loss": 0.07228103280067444, "epoch": 7.961974649766511, "step": 23870 }, { "distill_loss": 0.21539956331253052, "epoch": 7.961974649766511, "step": 23870 }, { "epoch": 7.961974649766511, "ref_ce_loss": 0.1249655932188034, "step": 23870 }, { "epoch": 7.961974649766511, "loss": 0.44131043553352356, "step": 23870 }, { "ce_loss": 0.08430244773626328, "epoch": 7.961974649766511, "step": 23870 }, { "distill_loss": 0.21277786791324615, "epoch": 7.961974649766511, "step": 23870 }, { "epoch": 7.961974649766511, "ref_ce_loss": 0.08507349342107773, "step": 23870 }, { "epoch": 7.965310206804537, "loss": 0.4711, "step": 23880 }, { "epoch": 7.965310206804537, "grad_norm": 1.1926960945129395, "step": 23880 }, { "epoch": 7.965310206804537, "learning_rate": 8.375797816050743e-05, "step": 23880 }, { "epoch": 7.965310206804537, "loss": 0.762226939201355, "step": 23880 }, { "ce_loss": 0.11749958992004395, "epoch": 7.965310206804537, "step": 23880 }, { "distill_loss": 0.2878839373588562, "epoch": 7.965310206804537, "step": 23880 }, { "epoch": 7.965310206804537, "ref_ce_loss": 0.13919147849082947, "step": 23880 }, { "epoch": 7.965310206804537, "loss": 0.4957149922847748, "step": 23880 }, { "ce_loss": 0.09586787223815918, "epoch": 7.965310206804537, "step": 23880 }, { "distill_loss": 0.2319086343050003, "epoch": 7.965310206804537, "step": 23880 }, { "epoch": 7.965310206804537, "ref_ce_loss": 0.1141003966331482, "step": 23880 }, { "epoch": 7.968645763842562, "loss": 0.4884, "step": 23890 }, { "epoch": 7.968645763842562, "grad_norm": 2.1842336654663086, "step": 23890 }, { "epoch": 7.968645763842562, "learning_rate": 8.349355750306233e-05, "step": 23890 }, { "epoch": 7.968645763842562, "loss": 0.4850006401538849, "step": 23890 }, { "ce_loss": 0.1268412321805954, "epoch": 7.968645763842562, "step": 23890 }, { "distill_loss": 0.24083614349365234, "epoch": 7.968645763842562, "step": 23890 }, { "epoch": 7.968645763842562, "ref_ce_loss": 0.0900849848985672, "step": 23890 }, { "epoch": 7.968645763842562, "loss": 0.41669759154319763, "step": 23890 }, { "ce_loss": 0.07890409231185913, "epoch": 7.968645763842562, "step": 23890 }, { "distill_loss": 0.20932383835315704, "epoch": 7.968645763842562, "step": 23890 }, { "epoch": 7.968645763842562, "ref_ce_loss": 0.10311402380466461, "step": 23890 }, { "epoch": 7.971981320880587, "loss": 0.4596, "step": 23900 }, { "epoch": 7.971981320880587, "grad_norm": 1.0587044954299927, "step": 23900 }, { "epoch": 7.971981320880587, "learning_rate": 8.322950624269301e-05, "step": 23900 }, { "epoch": 7.971981320880587, "loss": 0.4549918472766876, "step": 23900 }, { "ce_loss": 0.08084800839424133, "epoch": 7.971981320880587, "step": 23900 }, { "distill_loss": 0.18602995574474335, "epoch": 7.971981320880587, "step": 23900 }, { "epoch": 7.971981320880587, "ref_ce_loss": 0.09495816379785538, "step": 23900 }, { "epoch": 7.971981320880587, "loss": 0.43853050470352173, "step": 23900 }, { "ce_loss": 0.08710458874702454, "epoch": 7.971981320880587, "step": 23900 }, { "distill_loss": 0.22391125559806824, "epoch": 7.971981320880587, "step": 23900 }, { "epoch": 7.971981320880587, "ref_ce_loss": 0.10002166777849197, "step": 23900 }, { "epoch": 7.975316877918613, "loss": 0.4565, "step": 23910 }, { "epoch": 7.975316877918613, "grad_norm": 0.9432169198989868, "step": 23910 }, { "epoch": 7.975316877918613, "learning_rate": 8.296582468757583e-05, "step": 23910 }, { "epoch": 7.975316877918613, "loss": 0.22017309069633484, "step": 23910 }, { "ce_loss": 0.0369831807911396, "epoch": 7.975316877918613, "step": 23910 }, { "distill_loss": 0.10918110609054565, "epoch": 7.975316877918613, "step": 23910 }, { "epoch": 7.975316877918613, "ref_ce_loss": 0.0737534835934639, "step": 23910 }, { "epoch": 7.975316877918613, "loss": 0.4532097280025482, "step": 23910 }, { "ce_loss": 0.10423537343740463, "epoch": 7.975316877918613, "step": 23910 }, { "distill_loss": 0.18550725281238556, "epoch": 7.975316877918613, "step": 23910 }, { "epoch": 7.975316877918613, "ref_ce_loss": 0.08551128953695297, "step": 23910 }, { "epoch": 7.978652434956638, "loss": 0.4653, "step": 23920 }, { "epoch": 7.978652434956638, "grad_norm": 1.5507874488830566, "step": 23920 }, { "epoch": 7.978652434956638, "learning_rate": 8.270251314545557e-05, "step": 23920 }, { "epoch": 7.978652434956638, "loss": 0.4448733329772949, "step": 23920 }, { "ce_loss": 0.07775755971670151, "epoch": 7.978652434956638, "step": 23920 }, { "distill_loss": 0.18209832906723022, "epoch": 7.978652434956638, "step": 23920 }, { "epoch": 7.978652434956638, "ref_ce_loss": 0.08332362771034241, "step": 23920 }, { "epoch": 7.978652434956638, "loss": 0.3152199387550354, "step": 23920 }, { "ce_loss": 0.049137767404317856, "epoch": 7.978652434956638, "step": 23920 }, { "distill_loss": 0.16559302806854248, "epoch": 7.978652434956638, "step": 23920 }, { "epoch": 7.978652434956638, "ref_ce_loss": 0.06261259317398071, "step": 23920 }, { "epoch": 7.9819879919946635, "loss": 0.5149, "step": 23930 }, { "epoch": 7.9819879919946635, "grad_norm": 1.3906818628311157, "step": 23930 }, { "epoch": 7.9819879919946635, "learning_rate": 8.243957192364514e-05, "step": 23930 }, { "epoch": 7.9819879919946635, "loss": 0.4795082211494446, "step": 23930 }, { "ce_loss": 0.07854557782411575, "epoch": 7.9819879919946635, "step": 23930 }, { "distill_loss": 0.19227755069732666, "epoch": 7.9819879919946635, "step": 23930 }, { "epoch": 7.9819879919946635, "ref_ce_loss": 0.12034870684146881, "step": 23930 }, { "epoch": 7.9819879919946635, "loss": 0.3461790978908539, "step": 23930 }, { "ce_loss": 0.08637548983097076, "epoch": 7.9819879919946635, "step": 23930 }, { "distill_loss": 0.1667971909046173, "epoch": 7.9819879919946635, "step": 23930 }, { "epoch": 7.9819879919946635, "ref_ce_loss": 0.09276289492845535, "step": 23930 }, { "epoch": 7.985323549032689, "loss": 0.5295, "step": 23940 }, { "epoch": 7.985323549032689, "grad_norm": 1.211516261100769, "step": 23940 }, { "epoch": 7.985323549032689, "learning_rate": 8.21770013290251e-05, "step": 23940 }, { "epoch": 7.985323549032689, "loss": 0.41299039125442505, "step": 23940 }, { "ce_loss": 0.10126766562461853, "epoch": 7.985323549032689, "step": 23940 }, { "distill_loss": 0.2065594643354416, "epoch": 7.985323549032689, "step": 23940 }, { "epoch": 7.985323549032689, "ref_ce_loss": 0.06785832345485687, "step": 23940 }, { "epoch": 7.985323549032689, "loss": 0.572719395160675, "step": 23940 }, { "ce_loss": 0.07103711366653442, "epoch": 7.985323549032689, "step": 23940 }, { "distill_loss": 0.19799089431762695, "epoch": 7.985323549032689, "step": 23940 }, { "epoch": 7.985323549032689, "ref_ce_loss": 0.07989655435085297, "step": 23940 }, { "epoch": 7.988659106070714, "loss": 0.4702, "step": 23950 }, { "epoch": 7.988659106070714, "grad_norm": 1.3315993547439575, "step": 23950 }, { "epoch": 7.988659106070714, "learning_rate": 8.191480166804368e-05, "step": 23950 }, { "epoch": 7.988659106070714, "loss": 0.639316976070404, "step": 23950 }, { "ce_loss": 0.14909887313842773, "epoch": 7.988659106070714, "step": 23950 }, { "distill_loss": 0.2654898762702942, "epoch": 7.988659106070714, "step": 23950 }, { "epoch": 7.988659106070714, "ref_ce_loss": 0.1123674139380455, "step": 23950 }, { "epoch": 7.988659106070714, "loss": 0.44099435210227966, "step": 23950 }, { "ce_loss": 0.09333381056785583, "epoch": 7.988659106070714, "step": 23950 }, { "distill_loss": 0.198538139462471, "epoch": 7.988659106070714, "step": 23950 }, { "epoch": 7.988659106070714, "ref_ce_loss": 0.10574786365032196, "step": 23950 }, { "epoch": 7.9919946631087395, "loss": 0.5087, "step": 23960 }, { "epoch": 7.9919946631087395, "grad_norm": 1.3975638151168823, "step": 23960 }, { "epoch": 7.9919946631087395, "learning_rate": 8.165297324671608e-05, "step": 23960 }, { "epoch": 7.9919946631087395, "loss": 0.37281206250190735, "step": 23960 }, { "ce_loss": 0.0822332426905632, "epoch": 7.9919946631087395, "step": 23960 }, { "distill_loss": 0.16164526343345642, "epoch": 7.9919946631087395, "step": 23960 }, { "epoch": 7.9919946631087395, "ref_ce_loss": 0.08455748856067657, "step": 23960 }, { "epoch": 7.9919946631087395, "loss": 0.5337703227996826, "step": 23960 }, { "ce_loss": 0.11471060663461685, "epoch": 7.9919946631087395, "step": 23960 }, { "distill_loss": 0.2362850159406662, "epoch": 7.9919946631087395, "step": 23960 }, { "epoch": 7.9919946631087395, "ref_ce_loss": 0.08897367119789124, "step": 23960 }, { "epoch": 7.995330220146765, "loss": 0.4487, "step": 23970 }, { "epoch": 7.995330220146765, "grad_norm": 1.1574140787124634, "step": 23970 }, { "epoch": 7.995330220146765, "learning_rate": 8.13915163706243e-05, "step": 23970 }, { "epoch": 7.995330220146765, "loss": 0.45330312848091125, "step": 23970 }, { "ce_loss": 0.08726263791322708, "epoch": 7.995330220146765, "step": 23970 }, { "distill_loss": 0.2119605392217636, "epoch": 7.995330220146765, "step": 23970 }, { "epoch": 7.995330220146765, "ref_ce_loss": 0.11371081322431564, "step": 23970 }, { "epoch": 7.995330220146765, "loss": 0.501648485660553, "step": 23970 }, { "ce_loss": 0.14696897566318512, "epoch": 7.995330220146765, "step": 23970 }, { "distill_loss": 0.21774685382843018, "epoch": 7.995330220146765, "step": 23970 }, { "epoch": 7.995330220146765, "ref_ce_loss": 0.11590823531150818, "step": 23970 }, { "epoch": 7.99866577718479, "loss": 0.5028, "step": 23980 }, { "epoch": 7.99866577718479, "grad_norm": 1.1483594179153442, "step": 23980 }, { "epoch": 7.99866577718479, "learning_rate": 8.113043134491656e-05, "step": 23980 }, { "epoch": 7.99866577718479, "loss": 0.8916773200035095, "step": 23980 }, { "ce_loss": 0.08570877462625504, "epoch": 7.99866577718479, "step": 23980 }, { "distill_loss": 0.2253543585538864, "epoch": 7.99866577718479, "step": 23980 }, { "epoch": 7.99866577718479, "ref_ce_loss": 0.10541833192110062, "step": 23980 }, { "epoch": 7.99866577718479, "loss": 0.3779972195625305, "step": 23980 }, { "ce_loss": 0.0662795677781105, "epoch": 7.99866577718479, "step": 23980 }, { "distill_loss": 0.21109434962272644, "epoch": 7.99866577718479, "step": 23980 }, { "epoch": 7.99866577718479, "ref_ce_loss": 0.06937018781900406, "step": 23980 }, { "epoch": 8.002001334222815, "loss": 0.4566, "step": 23990 }, { "epoch": 8.002001334222815, "grad_norm": 1.309220552444458, "step": 23990 }, { "epoch": 8.002001334222815, "learning_rate": 8.086971847430728e-05, "step": 23990 }, { "epoch": 8.002001334222815, "loss": 0.5047003626823425, "step": 23990 }, { "ce_loss": 0.09218957275152206, "epoch": 8.002001334222815, "step": 23990 }, { "distill_loss": 0.2730647325515747, "epoch": 8.002001334222815, "step": 23990 }, { "epoch": 8.002001334222815, "ref_ce_loss": 0.07127413153648376, "step": 23990 }, { "epoch": 8.002001334222815, "loss": 0.4207586944103241, "step": 23990 }, { "ce_loss": 0.08874835073947906, "epoch": 8.002001334222815, "step": 23990 }, { "distill_loss": 0.22054798901081085, "epoch": 8.002001334222815, "step": 23990 }, { "epoch": 8.002001334222815, "ref_ce_loss": 0.07755181193351746, "step": 23990 }, { "epoch": 8.005336891260841, "loss": 0.4223, "step": 24000 }, { "epoch": 8.005336891260841, "grad_norm": 1.0361367464065552, "step": 24000 }, { "epoch": 8.005336891260841, "learning_rate": 8.060937806307633e-05, "step": 24000 }, { "epoch": 8.005336891260841, "loss": 0.3582214117050171, "step": 24000 }, { "ce_loss": 0.059342242777347565, "epoch": 8.005336891260841, "step": 24000 }, { "distill_loss": 0.2024124413728714, "epoch": 8.005336891260841, "step": 24000 }, { "epoch": 8.005336891260841, "ref_ce_loss": 0.05704175680875778, "step": 24000 }, { "epoch": 8.005336891260841, "loss": 0.4356797933578491, "step": 24000 }, { "ce_loss": 0.08584046363830566, "epoch": 8.005336891260841, "step": 24000 }, { "distill_loss": 0.19962406158447266, "epoch": 8.005336891260841, "step": 24000 }, { "epoch": 8.005336891260841, "ref_ce_loss": 0.08645645529031754, "step": 24000 }, { "epoch": 8.008672448298865, "loss": 0.4352, "step": 24010 }, { "epoch": 8.008672448298865, "grad_norm": 1.4843693971633911, "step": 24010 }, { "epoch": 8.008672448298865, "learning_rate": 8.034941041506918e-05, "step": 24010 }, { "epoch": 8.008672448298865, "loss": 0.4060533940792084, "step": 24010 }, { "ce_loss": 0.07756105810403824, "epoch": 8.008672448298865, "step": 24010 }, { "distill_loss": 0.20981661975383759, "epoch": 8.008672448298865, "step": 24010 }, { "epoch": 8.008672448298865, "ref_ce_loss": 0.0870758667588234, "step": 24010 }, { "epoch": 8.008672448298865, "loss": 0.9769420027732849, "step": 24010 }, { "ce_loss": 0.052198830991983414, "epoch": 8.008672448298865, "step": 24010 }, { "distill_loss": 0.25216665863990784, "epoch": 8.008672448298865, "step": 24010 }, { "epoch": 8.008672448298865, "ref_ce_loss": 0.06700126826763153, "step": 24010 }, { "epoch": 8.012008005336892, "loss": 0.4767, "step": 24020 }, { "epoch": 8.012008005336892, "grad_norm": 1.925126314163208, "step": 24020 }, { "epoch": 8.012008005336892, "learning_rate": 8.008981583369575e-05, "step": 24020 }, { "epoch": 8.012008005336892, "loss": 0.3252241313457489, "step": 24020 }, { "ce_loss": 0.06078958138823509, "epoch": 8.012008005336892, "step": 24020 }, { "distill_loss": 0.1793786734342575, "epoch": 8.012008005336892, "step": 24020 }, { "epoch": 8.012008005336892, "ref_ce_loss": 0.08483623713254929, "step": 24020 }, { "epoch": 8.012008005336892, "loss": 0.5393207669258118, "step": 24020 }, { "ce_loss": 0.05587855726480484, "epoch": 8.012008005336892, "step": 24020 }, { "distill_loss": 0.22161366045475006, "epoch": 8.012008005336892, "step": 24020 }, { "epoch": 8.012008005336892, "ref_ce_loss": 0.07822628319263458, "step": 24020 }, { "epoch": 8.015343562374916, "loss": 0.4434, "step": 24030 }, { "epoch": 8.015343562374916, "grad_norm": 1.1432065963745117, "step": 24030 }, { "epoch": 8.015343562374916, "learning_rate": 7.983059462193105e-05, "step": 24030 }, { "epoch": 8.015343562374916, "loss": 0.8035485744476318, "step": 24030 }, { "ce_loss": 0.0552186444401741, "epoch": 8.015343562374916, "step": 24030 }, { "distill_loss": 0.22818496823310852, "epoch": 8.015343562374916, "step": 24030 }, { "epoch": 8.015343562374916, "ref_ce_loss": 0.08648039400577545, "step": 24030 }, { "epoch": 8.015343562374916, "loss": 0.5100343823432922, "step": 24030 }, { "ce_loss": 0.05726398900151253, "epoch": 8.015343562374916, "step": 24030 }, { "distill_loss": 0.2203618735074997, "epoch": 8.015343562374916, "step": 24030 }, { "epoch": 8.015343562374916, "ref_ce_loss": 0.07337582111358643, "step": 24030 }, { "epoch": 8.018679119412942, "loss": 0.4535, "step": 24040 }, { "epoch": 8.018679119412942, "grad_norm": 1.6379269361495972, "step": 24040 }, { "epoch": 8.018679119412942, "learning_rate": 7.957174708231404e-05, "step": 24040 }, { "epoch": 8.018679119412942, "loss": 0.5619211196899414, "step": 24040 }, { "ce_loss": 0.06277598440647125, "epoch": 8.018679119412942, "step": 24040 }, { "distill_loss": 0.21573035418987274, "epoch": 8.018679119412942, "step": 24040 }, { "epoch": 8.018679119412942, "ref_ce_loss": 0.0782727375626564, "step": 24040 }, { "epoch": 8.018679119412942, "loss": 0.3220251202583313, "step": 24040 }, { "ce_loss": 0.04112187772989273, "epoch": 8.018679119412942, "step": 24040 }, { "distill_loss": 0.18421748280525208, "epoch": 8.018679119412942, "step": 24040 }, { "epoch": 8.018679119412942, "ref_ce_loss": 0.0625697672367096, "step": 24040 }, { "epoch": 8.022014676450967, "loss": 0.3997, "step": 24050 }, { "epoch": 8.022014676450967, "grad_norm": 1.0753532648086548, "step": 24050 }, { "epoch": 8.022014676450967, "learning_rate": 7.931327351694781e-05, "step": 24050 }, { "epoch": 8.022014676450967, "loss": 0.4898815453052521, "step": 24050 }, { "ce_loss": 0.06757506728172302, "epoch": 8.022014676450967, "step": 24050 }, { "distill_loss": 0.24555155634880066, "epoch": 8.022014676450967, "step": 24050 }, { "epoch": 8.022014676450967, "ref_ce_loss": 0.05489666759967804, "step": 24050 }, { "epoch": 8.022014676450967, "loss": 0.3805431127548218, "step": 24050 }, { "ce_loss": 0.042916931211948395, "epoch": 8.022014676450967, "step": 24050 }, { "distill_loss": 0.18931376934051514, "epoch": 8.022014676450967, "step": 24050 }, { "epoch": 8.022014676450967, "ref_ce_loss": 0.060156628489494324, "step": 24050 }, { "epoch": 8.025350233488993, "loss": 0.397, "step": 24060 }, { "epoch": 8.025350233488993, "grad_norm": 1.094015121459961, "step": 24060 }, { "epoch": 8.025350233488993, "learning_rate": 7.905517422749862e-05, "step": 24060 }, { "epoch": 8.025350233488993, "loss": 0.31888458132743835, "step": 24060 }, { "ce_loss": 0.024669796228408813, "epoch": 8.025350233488993, "step": 24060 }, { "distill_loss": 0.20169229805469513, "epoch": 8.025350233488993, "step": 24060 }, { "epoch": 8.025350233488993, "ref_ce_loss": 0.06511060893535614, "step": 24060 }, { "epoch": 8.025350233488993, "loss": 0.24008479714393616, "step": 24060 }, { "ce_loss": 0.03973004221916199, "epoch": 8.025350233488993, "step": 24060 }, { "distill_loss": 0.13114789128303528, "epoch": 8.025350233488993, "step": 24060 }, { "epoch": 8.025350233488993, "ref_ce_loss": 0.06896114349365234, "step": 24060 }, { "epoch": 8.028685790527017, "loss": 0.4315, "step": 24070 }, { "epoch": 8.028685790527017, "grad_norm": 1.3001312017440796, "step": 24070 }, { "epoch": 8.028685790527017, "learning_rate": 7.879744951519618e-05, "step": 24070 }, { "epoch": 8.028685790527017, "loss": 0.4929823577404022, "step": 24070 }, { "ce_loss": 0.09074297547340393, "epoch": 8.028685790527017, "step": 24070 }, { "distill_loss": 0.21786439418792725, "epoch": 8.028685790527017, "step": 24070 }, { "epoch": 8.028685790527017, "ref_ce_loss": 0.11397624760866165, "step": 24070 }, { "epoch": 8.028685790527017, "loss": 0.48569536209106445, "step": 24070 }, { "ce_loss": 0.06525580585002899, "epoch": 8.028685790527017, "step": 24070 }, { "distill_loss": 0.20845040678977966, "epoch": 8.028685790527017, "step": 24070 }, { "epoch": 8.028685790527017, "ref_ce_loss": 0.06835462898015976, "step": 24070 }, { "epoch": 8.032021347565044, "loss": 0.4435, "step": 24080 }, { "epoch": 8.032021347565044, "grad_norm": 5.342554569244385, "step": 24080 }, { "epoch": 8.032021347565044, "learning_rate": 7.854009968083298e-05, "step": 24080 }, { "epoch": 8.032021347565044, "loss": 0.5509377717971802, "step": 24080 }, { "ce_loss": 0.07140115648508072, "epoch": 8.032021347565044, "step": 24080 }, { "distill_loss": 0.18408143520355225, "epoch": 8.032021347565044, "step": 24080 }, { "epoch": 8.032021347565044, "ref_ce_loss": 0.0856771245598793, "step": 24080 }, { "epoch": 8.032021347565044, "loss": 0.5194607377052307, "step": 24080 }, { "ce_loss": 0.08500373363494873, "epoch": 8.032021347565044, "step": 24080 }, { "distill_loss": 0.180271178483963, "epoch": 8.032021347565044, "step": 24080 }, { "epoch": 8.032021347565044, "ref_ce_loss": 0.07013080269098282, "step": 24080 }, { "epoch": 8.035356904603068, "loss": 0.4726, "step": 24090 }, { "epoch": 8.035356904603068, "grad_norm": 1.3187562227249146, "step": 24090 }, { "epoch": 8.035356904603068, "learning_rate": 7.828312502476397e-05, "step": 24090 }, { "epoch": 8.035356904603068, "loss": 0.34251484274864197, "step": 24090 }, { "ce_loss": 0.06243763864040375, "epoch": 8.035356904603068, "step": 24090 }, { "distill_loss": 0.2157568335533142, "epoch": 8.035356904603068, "step": 24090 }, { "epoch": 8.035356904603068, "ref_ce_loss": 0.06419803947210312, "step": 24090 }, { "epoch": 8.035356904603068, "loss": 0.33506128191947937, "step": 24090 }, { "ce_loss": 0.04710674658417702, "epoch": 8.035356904603068, "step": 24090 }, { "distill_loss": 0.1622208207845688, "epoch": 8.035356904603068, "step": 24090 }, { "epoch": 8.035356904603068, "ref_ce_loss": 0.0668625682592392, "step": 24090 }, { "epoch": 8.038692461641094, "loss": 0.3903, "step": 24100 }, { "epoch": 8.038692461641094, "grad_norm": 1.0231257677078247, "step": 24100 }, { "epoch": 8.038692461641094, "learning_rate": 7.802652584690626e-05, "step": 24100 }, { "epoch": 8.038692461641094, "loss": 0.34682032465934753, "step": 24100 }, { "ce_loss": 0.074663445353508, "epoch": 8.038692461641094, "step": 24100 }, { "distill_loss": 0.19481858611106873, "epoch": 8.038692461641094, "step": 24100 }, { "epoch": 8.038692461641094, "ref_ce_loss": 0.076902836561203, "step": 24100 }, { "epoch": 8.038692461641094, "loss": 0.40327954292297363, "step": 24100 }, { "ce_loss": 0.06977633386850357, "epoch": 8.038692461641094, "step": 24100 }, { "distill_loss": 0.20230472087860107, "epoch": 8.038692461641094, "step": 24100 }, { "epoch": 8.038692461641094, "ref_ce_loss": 0.09668795019388199, "step": 24100 }, { "epoch": 8.042028018679119, "loss": 0.4016, "step": 24110 }, { "epoch": 8.042028018679119, "grad_norm": 1.466809630393982, "step": 24110 }, { "epoch": 8.042028018679119, "learning_rate": 7.777030244673862e-05, "step": 24110 }, { "epoch": 8.042028018679119, "loss": 0.4166834354400635, "step": 24110 }, { "ce_loss": 0.029378900304436684, "epoch": 8.042028018679119, "step": 24110 }, { "distill_loss": 0.20899909734725952, "epoch": 8.042028018679119, "step": 24110 }, { "epoch": 8.042028018679119, "ref_ce_loss": 0.0682293102145195, "step": 24110 }, { "epoch": 8.042028018679119, "loss": 0.43382200598716736, "step": 24110 }, { "ce_loss": 0.0649435818195343, "epoch": 8.042028018679119, "step": 24110 }, { "distill_loss": 0.18221789598464966, "epoch": 8.042028018679119, "step": 24110 }, { "epoch": 8.042028018679119, "ref_ce_loss": 0.07515498250722885, "step": 24110 }, { "epoch": 8.045363575717145, "loss": 0.4193, "step": 24120 }, { "epoch": 8.045363575717145, "grad_norm": 1.700887680053711, "step": 24120 }, { "epoch": 8.045363575717145, "learning_rate": 7.751445512330149e-05, "step": 24120 }, { "epoch": 8.045363575717145, "loss": 0.5536231994628906, "step": 24120 }, { "ce_loss": 0.09126371145248413, "epoch": 8.045363575717145, "step": 24120 }, { "distill_loss": 0.2512921392917633, "epoch": 8.045363575717145, "step": 24120 }, { "epoch": 8.045363575717145, "ref_ce_loss": 0.09933537989854813, "step": 24120 }, { "epoch": 8.045363575717145, "loss": 0.36023950576782227, "step": 24120 }, { "ce_loss": 0.06016818434000015, "epoch": 8.045363575717145, "step": 24120 }, { "distill_loss": 0.1999480277299881, "epoch": 8.045363575717145, "step": 24120 }, { "epoch": 8.045363575717145, "ref_ce_loss": 0.06371976435184479, "step": 24120 }, { "epoch": 8.04869913275517, "loss": 0.4155, "step": 24130 }, { "epoch": 8.04869913275517, "grad_norm": 1.236303448677063, "step": 24130 }, { "epoch": 8.04869913275517, "learning_rate": 7.725898417519601e-05, "step": 24130 }, { "epoch": 8.04869913275517, "loss": 0.3462640643119812, "step": 24130 }, { "ce_loss": 0.08088426291942596, "epoch": 8.04869913275517, "step": 24130 }, { "distill_loss": 0.20738530158996582, "epoch": 8.04869913275517, "step": 24130 }, { "epoch": 8.04869913275517, "ref_ce_loss": 0.04303397238254547, "step": 24130 }, { "epoch": 8.04869913275517, "loss": 0.3455406725406647, "step": 24130 }, { "ce_loss": 0.03262823447585106, "epoch": 8.04869913275517, "step": 24130 }, { "distill_loss": 0.1683516800403595, "epoch": 8.04869913275517, "step": 24130 }, { "epoch": 8.04869913275517, "ref_ce_loss": 0.07156498730182648, "step": 24130 }, { "epoch": 8.052034689793196, "loss": 0.3823, "step": 24140 }, { "epoch": 8.052034689793196, "grad_norm": 1.3769203424453735, "step": 24140 }, { "epoch": 8.052034689793196, "learning_rate": 7.700388990058436e-05, "step": 24140 }, { "epoch": 8.052034689793196, "loss": 0.28749528527259827, "step": 24140 }, { "ce_loss": 0.05094445124268532, "epoch": 8.052034689793196, "step": 24140 }, { "distill_loss": 0.16284839808940887, "epoch": 8.052034689793196, "step": 24140 }, { "epoch": 8.052034689793196, "ref_ce_loss": 0.05306845158338547, "step": 24140 }, { "epoch": 8.052034689793196, "loss": 0.5040078163146973, "step": 24140 }, { "ce_loss": 0.07084736973047256, "epoch": 8.052034689793196, "step": 24140 }, { "distill_loss": 0.2221842110157013, "epoch": 8.052034689793196, "step": 24140 }, { "epoch": 8.052034689793196, "ref_ce_loss": 0.08755800873041153, "step": 24140 }, { "epoch": 8.05537024683122, "loss": 0.4037, "step": 24150 }, { "epoch": 8.05537024683122, "grad_norm": 1.3036253452301025, "step": 24150 }, { "epoch": 8.05537024683122, "learning_rate": 7.674917259718903e-05, "step": 24150 }, { "epoch": 8.05537024683122, "loss": 0.4268152117729187, "step": 24150 }, { "ce_loss": 0.05660969018936157, "epoch": 8.05537024683122, "step": 24150 }, { "distill_loss": 0.1625378578901291, "epoch": 8.05537024683122, "step": 24150 }, { "epoch": 8.05537024683122, "ref_ce_loss": 0.0825534239411354, "step": 24150 }, { "epoch": 8.05537024683122, "loss": 0.40700727701187134, "step": 24150 }, { "ce_loss": 0.042284682393074036, "epoch": 8.05537024683122, "step": 24150 }, { "distill_loss": 0.2171652764081955, "epoch": 8.05537024683122, "step": 24150 }, { "epoch": 8.05537024683122, "ref_ce_loss": 0.07866621762514114, "step": 24150 }, { "epoch": 8.058705803869247, "loss": 0.3902, "step": 24160 }, { "epoch": 8.058705803869247, "grad_norm": 1.0506428480148315, "step": 24160 }, { "epoch": 8.058705803869247, "learning_rate": 7.649483256229251e-05, "step": 24160 }, { "epoch": 8.058705803869247, "loss": 0.3354337811470032, "step": 24160 }, { "ce_loss": 0.07376205921173096, "epoch": 8.058705803869247, "step": 24160 }, { "distill_loss": 0.19081434607505798, "epoch": 8.058705803869247, "step": 24160 }, { "epoch": 8.058705803869247, "ref_ce_loss": 0.07030003517866135, "step": 24160 }, { "epoch": 8.058705803869247, "loss": 0.3053566515445709, "step": 24160 }, { "ce_loss": 0.034138619899749756, "epoch": 8.058705803869247, "step": 24160 }, { "distill_loss": 0.19562920928001404, "epoch": 8.058705803869247, "step": 24160 }, { "epoch": 8.058705803869247, "ref_ce_loss": 0.07531195878982544, "step": 24160 }, { "epoch": 8.062041360907271, "loss": 0.4097, "step": 24170 }, { "epoch": 8.062041360907271, "grad_norm": 2.4474267959594727, "step": 24170 }, { "epoch": 8.062041360907271, "learning_rate": 7.624087009273707e-05, "step": 24170 }, { "epoch": 8.062041360907271, "loss": 0.3829737901687622, "step": 24170 }, { "ce_loss": 0.0542147271335125, "epoch": 8.062041360907271, "step": 24170 }, { "distill_loss": 0.21440204977989197, "epoch": 8.062041360907271, "step": 24170 }, { "epoch": 8.062041360907271, "ref_ce_loss": 0.0830206423997879, "step": 24170 }, { "epoch": 8.062041360907271, "loss": 0.3945467174053192, "step": 24170 }, { "ce_loss": 0.05573704093694687, "epoch": 8.062041360907271, "step": 24170 }, { "distill_loss": 0.2653251886367798, "epoch": 8.062041360907271, "step": 24170 }, { "epoch": 8.062041360907271, "ref_ce_loss": 0.0554734468460083, "step": 24170 }, { "epoch": 8.065376917945297, "loss": 0.4454, "step": 24180 }, { "epoch": 8.065376917945297, "grad_norm": 1.5293794870376587, "step": 24180 }, { "epoch": 8.065376917945297, "learning_rate": 7.598728548492409e-05, "step": 24180 }, { "epoch": 8.065376917945297, "loss": 0.38905197381973267, "step": 24180 }, { "ce_loss": 0.05039578303694725, "epoch": 8.065376917945297, "step": 24180 }, { "distill_loss": 0.21358920633792877, "epoch": 8.065376917945297, "step": 24180 }, { "epoch": 8.065376917945297, "ref_ce_loss": 0.060911811888217926, "step": 24180 }, { "epoch": 8.065376917945297, "loss": 0.3089878559112549, "step": 24180 }, { "ce_loss": 0.05048203095793724, "epoch": 8.065376917945297, "step": 24180 }, { "distill_loss": 0.16113674640655518, "epoch": 8.065376917945297, "step": 24180 }, { "epoch": 8.065376917945297, "ref_ce_loss": 0.048411983996629715, "step": 24180 }, { "epoch": 8.068712474983322, "loss": 0.4158, "step": 24190 }, { "epoch": 8.068712474983322, "grad_norm": 2.302577257156372, "step": 24190 }, { "epoch": 8.068712474983322, "learning_rate": 7.573407903481414e-05, "step": 24190 }, { "epoch": 8.068712474983322, "loss": 0.3494682312011719, "step": 24190 }, { "ce_loss": 0.03522660210728645, "epoch": 8.068712474983322, "step": 24190 }, { "distill_loss": 0.17571492493152618, "epoch": 8.068712474983322, "step": 24190 }, { "epoch": 8.068712474983322, "ref_ce_loss": 0.05610926076769829, "step": 24190 }, { "epoch": 8.068712474983322, "loss": 0.3886391222476959, "step": 24190 }, { "ce_loss": 0.07231699675321579, "epoch": 8.068712474983322, "step": 24190 }, { "distill_loss": 0.22073721885681152, "epoch": 8.068712474983322, "step": 24190 }, { "epoch": 8.068712474983322, "ref_ce_loss": 0.07439137250185013, "step": 24190 }, { "epoch": 8.072048032021348, "loss": 0.4069, "step": 24200 }, { "epoch": 8.072048032021348, "grad_norm": 1.2229115962982178, "step": 24200 }, { "epoch": 8.072048032021348, "learning_rate": 7.54812510379264e-05, "step": 24200 }, { "epoch": 8.072048032021348, "loss": 0.48251187801361084, "step": 24200 }, { "ce_loss": 0.07577910274267197, "epoch": 8.072048032021348, "step": 24200 }, { "distill_loss": 0.21471984684467316, "epoch": 8.072048032021348, "step": 24200 }, { "epoch": 8.072048032021348, "ref_ce_loss": 0.07716456800699234, "step": 24200 }, { "epoch": 8.072048032021348, "loss": 0.5132521390914917, "step": 24200 }, { "ce_loss": 0.11462902277708054, "epoch": 8.072048032021348, "step": 24200 }, { "distill_loss": 0.21093082427978516, "epoch": 8.072048032021348, "step": 24200 }, { "epoch": 8.072048032021348, "ref_ce_loss": 0.08183901757001877, "step": 24200 }, { "epoch": 8.075383589059372, "loss": 0.4118, "step": 24210 }, { "epoch": 8.075383589059372, "grad_norm": 1.0189741849899292, "step": 24210 }, { "epoch": 8.075383589059372, "learning_rate": 7.522880178933838e-05, "step": 24210 }, { "epoch": 8.075383589059372, "loss": 0.33431535959243774, "step": 24210 }, { "ce_loss": 0.060533054172992706, "epoch": 8.075383589059372, "step": 24210 }, { "distill_loss": 0.18593090772628784, "epoch": 8.075383589059372, "step": 24210 }, { "epoch": 8.075383589059372, "ref_ce_loss": 0.05473209545016289, "step": 24210 }, { "epoch": 8.075383589059372, "loss": 0.40760648250579834, "step": 24210 }, { "ce_loss": 0.0526285283267498, "epoch": 8.075383589059372, "step": 24210 }, { "distill_loss": 0.2151063233613968, "epoch": 8.075383589059372, "step": 24210 }, { "epoch": 8.075383589059372, "ref_ce_loss": 0.06521545350551605, "step": 24210 }, { "epoch": 8.078719146097399, "loss": 0.4553, "step": 24220 }, { "epoch": 8.078719146097399, "grad_norm": 1.1608573198318481, "step": 24220 }, { "epoch": 8.078719146097399, "learning_rate": 7.497673158368547e-05, "step": 24220 }, { "epoch": 8.078719146097399, "loss": 0.5061840415000916, "step": 24220 }, { "ce_loss": 0.0969315618276596, "epoch": 8.078719146097399, "step": 24220 }, { "distill_loss": 0.2560502886772156, "epoch": 8.078719146097399, "step": 24220 }, { "epoch": 8.078719146097399, "ref_ce_loss": 0.1011023297905922, "step": 24220 }, { "epoch": 8.078719146097399, "loss": 0.4152766168117523, "step": 24220 }, { "ce_loss": 0.0723932608962059, "epoch": 8.078719146097399, "step": 24220 }, { "distill_loss": 0.19282567501068115, "epoch": 8.078719146097399, "step": 24220 }, { "epoch": 8.078719146097399, "ref_ce_loss": 0.09820578992366791, "step": 24220 }, { "epoch": 8.082054703135423, "loss": 0.4628, "step": 24230 }, { "epoch": 8.082054703135423, "grad_norm": 1.7025099992752075, "step": 24230 }, { "epoch": 8.082054703135423, "learning_rate": 7.472504071516078e-05, "step": 24230 }, { "epoch": 8.082054703135423, "loss": 0.3487241864204407, "step": 24230 }, { "ce_loss": 0.06950736790895462, "epoch": 8.082054703135423, "step": 24230 }, { "distill_loss": 0.18906283378601074, "epoch": 8.082054703135423, "step": 24230 }, { "epoch": 8.082054703135423, "ref_ce_loss": 0.0899842232465744, "step": 24230 }, { "epoch": 8.082054703135423, "loss": 0.46370089054107666, "step": 24230 }, { "ce_loss": 0.05824560672044754, "epoch": 8.082054703135423, "step": 24230 }, { "distill_loss": 0.21629062294960022, "epoch": 8.082054703135423, "step": 24230 }, { "epoch": 8.082054703135423, "ref_ce_loss": 0.07762110233306885, "step": 24230 }, { "epoch": 8.08539026017345, "loss": 0.4067, "step": 24240 }, { "epoch": 8.08539026017345, "grad_norm": 1.4699956178665161, "step": 24240 }, { "epoch": 8.08539026017345, "learning_rate": 7.447372947751468e-05, "step": 24240 }, { "epoch": 8.08539026017345, "loss": 0.2601238489151001, "step": 24240 }, { "ce_loss": 0.019038693979382515, "epoch": 8.08539026017345, "step": 24240 }, { "distill_loss": 0.1835421919822693, "epoch": 8.08539026017345, "step": 24240 }, { "epoch": 8.08539026017345, "ref_ce_loss": 0.057193856686353683, "step": 24240 }, { "epoch": 8.08539026017345, "loss": 0.2840674817562103, "step": 24240 }, { "ce_loss": 0.04014355316758156, "epoch": 8.08539026017345, "step": 24240 }, { "distill_loss": 0.17024365067481995, "epoch": 8.08539026017345, "step": 24240 }, { "epoch": 8.08539026017345, "ref_ce_loss": 0.054361842572689056, "step": 24240 }, { "epoch": 8.088725817211474, "loss": 0.3839, "step": 24250 }, { "epoch": 8.088725817211474, "grad_norm": 1.4079599380493164, "step": 24250 }, { "epoch": 8.088725817211474, "learning_rate": 7.422279816405428e-05, "step": 24250 }, { "epoch": 8.088725817211474, "loss": 0.25054410099983215, "step": 24250 }, { "ce_loss": 0.043765123933553696, "epoch": 8.088725817211474, "step": 24250 }, { "distill_loss": 0.15811416506767273, "epoch": 8.088725817211474, "step": 24250 }, { "epoch": 8.088725817211474, "ref_ce_loss": 0.04835840314626694, "step": 24250 }, { "epoch": 8.088725817211474, "loss": 0.31558120250701904, "step": 24250 }, { "ce_loss": 0.0415969081223011, "epoch": 8.088725817211474, "step": 24250 }, { "distill_loss": 0.18933415412902832, "epoch": 8.088725817211474, "step": 24250 }, { "epoch": 8.088725817211474, "ref_ce_loss": 0.061390385031700134, "step": 24250 }, { "epoch": 8.0920613742495, "loss": 0.4033, "step": 24260 }, { "epoch": 8.0920613742495, "grad_norm": 1.592460036277771, "step": 24260 }, { "epoch": 8.0920613742495, "learning_rate": 7.397224706764351e-05, "step": 24260 }, { "epoch": 8.0920613742495, "loss": 0.5874719619750977, "step": 24260 }, { "ce_loss": 0.04022669792175293, "epoch": 8.0920613742495, "step": 24260 }, { "distill_loss": 0.20707687735557556, "epoch": 8.0920613742495, "step": 24260 }, { "epoch": 8.0920613742495, "ref_ce_loss": 0.06614813953638077, "step": 24260 }, { "epoch": 8.0920613742495, "loss": 0.39668652415275574, "step": 24260 }, { "ce_loss": 0.07553507387638092, "epoch": 8.0920613742495, "step": 24260 }, { "distill_loss": 0.21756426990032196, "epoch": 8.0920613742495, "step": 24260 }, { "epoch": 8.0920613742495, "ref_ce_loss": 0.0744190663099289, "step": 24260 }, { "epoch": 8.095396931287524, "loss": 0.445, "step": 24270 }, { "epoch": 8.095396931287524, "grad_norm": 1.4449187517166138, "step": 24270 }, { "epoch": 8.095396931287524, "learning_rate": 7.372207648070242e-05, "step": 24270 }, { "epoch": 8.095396931287524, "loss": 0.33231836557388306, "step": 24270 }, { "ce_loss": 0.039733968675136566, "epoch": 8.095396931287524, "step": 24270 }, { "distill_loss": 0.18775151669979095, "epoch": 8.095396931287524, "step": 24270 }, { "epoch": 8.095396931287524, "ref_ce_loss": 0.08045101910829544, "step": 24270 }, { "epoch": 8.095396931287524, "loss": 0.552812933921814, "step": 24270 }, { "ce_loss": 0.046159908175468445, "epoch": 8.095396931287524, "step": 24270 }, { "distill_loss": 0.2231673002243042, "epoch": 8.095396931287524, "step": 24270 }, { "epoch": 8.095396931287524, "ref_ce_loss": 0.05768783763051033, "step": 24270 }, { "epoch": 8.09873248832555, "loss": 0.3826, "step": 24280 }, { "epoch": 8.09873248832555, "grad_norm": 1.5561636686325073, "step": 24280 }, { "epoch": 8.09873248832555, "learning_rate": 7.347228669520716e-05, "step": 24280 }, { "epoch": 8.09873248832555, "loss": 0.33394551277160645, "step": 24280 }, { "ce_loss": 0.034008752554655075, "epoch": 8.09873248832555, "step": 24280 }, { "distill_loss": 0.1595076322555542, "epoch": 8.09873248832555, "step": 24280 }, { "epoch": 8.09873248832555, "ref_ce_loss": 0.07015834003686905, "step": 24280 }, { "epoch": 8.09873248832555, "loss": 0.6523812413215637, "step": 24280 }, { "ce_loss": 0.08651169389486313, "epoch": 8.09873248832555, "step": 24280 }, { "distill_loss": 0.19471541047096252, "epoch": 8.09873248832555, "step": 24280 }, { "epoch": 8.09873248832555, "ref_ce_loss": 0.07609721273183823, "step": 24280 }, { "epoch": 8.102068045363575, "loss": 0.382, "step": 24290 }, { "epoch": 8.102068045363575, "grad_norm": 1.1743472814559937, "step": 24290 }, { "epoch": 8.102068045363575, "learning_rate": 7.322287800268908e-05, "step": 24290 }, { "epoch": 8.102068045363575, "loss": 0.3828940689563751, "step": 24290 }, { "ce_loss": 0.06921820342540741, "epoch": 8.102068045363575, "step": 24290 }, { "distill_loss": 0.18600760400295258, "epoch": 8.102068045363575, "step": 24290 }, { "epoch": 8.102068045363575, "ref_ce_loss": 0.0913751944899559, "step": 24290 }, { "epoch": 8.102068045363575, "loss": 0.2832377552986145, "step": 24290 }, { "ce_loss": 0.0540679506957531, "epoch": 8.102068045363575, "step": 24290 }, { "distill_loss": 0.14307625591754913, "epoch": 8.102068045363575, "step": 24290 }, { "epoch": 8.102068045363575, "ref_ce_loss": 0.08550086617469788, "step": 24290 }, { "epoch": 8.105403602401601, "loss": 0.3986, "step": 24300 }, { "epoch": 8.105403602401601, "grad_norm": 1.3745160102844238, "step": 24300 }, { "epoch": 8.105403602401601, "learning_rate": 7.297385069423502e-05, "step": 24300 }, { "epoch": 8.105403602401601, "loss": 0.48997801542282104, "step": 24300 }, { "ce_loss": 0.04027820751070976, "epoch": 8.105403602401601, "step": 24300 }, { "distill_loss": 0.19105367362499237, "epoch": 8.105403602401601, "step": 24300 }, { "epoch": 8.105403602401601, "ref_ce_loss": 0.056856460869312286, "step": 24300 }, { "epoch": 8.105403602401601, "loss": 0.39413440227508545, "step": 24300 }, { "ce_loss": 0.06110284477472305, "epoch": 8.105403602401601, "step": 24300 }, { "distill_loss": 0.20504486560821533, "epoch": 8.105403602401601, "step": 24300 }, { "epoch": 8.105403602401601, "ref_ce_loss": 0.07440733909606934, "step": 24300 }, { "epoch": 8.108739159439626, "loss": 0.4335, "step": 24310 }, { "epoch": 8.108739159439626, "grad_norm": 2.263183116912842, "step": 24310 }, { "epoch": 8.108739159439626, "learning_rate": 7.272520506048653e-05, "step": 24310 }, { "epoch": 8.108739159439626, "loss": 0.42609086632728577, "step": 24310 }, { "ce_loss": 0.06666922569274902, "epoch": 8.108739159439626, "step": 24310 }, { "distill_loss": 0.2181270867586136, "epoch": 8.108739159439626, "step": 24310 }, { "epoch": 8.108739159439626, "ref_ce_loss": 0.08793104439973831, "step": 24310 }, { "epoch": 8.108739159439626, "loss": 0.29217106103897095, "step": 24310 }, { "ce_loss": 0.030751807615160942, "epoch": 8.108739159439626, "step": 24310 }, { "distill_loss": 0.1928568184375763, "epoch": 8.108739159439626, "step": 24310 }, { "epoch": 8.108739159439626, "ref_ce_loss": 0.04765959829092026, "step": 24310 }, { "epoch": 8.112074716477652, "loss": 0.435, "step": 24320 }, { "epoch": 8.112074716477652, "grad_norm": 1.2746638059616089, "step": 24320 }, { "epoch": 8.112074716477652, "learning_rate": 7.247694139164023e-05, "step": 24320 }, { "epoch": 8.112074716477652, "loss": 0.2549442648887634, "step": 24320 }, { "ce_loss": 0.03367112949490547, "epoch": 8.112074716477652, "step": 24320 }, { "distill_loss": 0.16045795381069183, "epoch": 8.112074716477652, "step": 24320 }, { "epoch": 8.112074716477652, "ref_ce_loss": 0.060577113181352615, "step": 24320 }, { "epoch": 8.112074716477652, "loss": 0.37653446197509766, "step": 24320 }, { "ce_loss": 0.08378375321626663, "epoch": 8.112074716477652, "step": 24320 }, { "distill_loss": 0.18870356678962708, "epoch": 8.112074716477652, "step": 24320 }, { "epoch": 8.112074716477652, "ref_ce_loss": 0.07577314972877502, "step": 24320 }, { "epoch": 8.115410273515677, "loss": 0.4013, "step": 24330 }, { "epoch": 8.115410273515677, "grad_norm": 1.232866883277893, "step": 24330 }, { "epoch": 8.115410273515677, "learning_rate": 7.22290599774461e-05, "step": 24330 }, { "epoch": 8.115410273515677, "loss": 0.36797034740448, "step": 24330 }, { "ce_loss": 0.05298295617103577, "epoch": 8.115410273515677, "step": 24330 }, { "distill_loss": 0.147117018699646, "epoch": 8.115410273515677, "step": 24330 }, { "epoch": 8.115410273515677, "ref_ce_loss": 0.10681744664907455, "step": 24330 }, { "epoch": 8.115410273515677, "loss": 0.29472821950912476, "step": 24330 }, { "ce_loss": 0.0275767482817173, "epoch": 8.115410273515677, "step": 24330 }, { "distill_loss": 0.19650474190711975, "epoch": 8.115410273515677, "step": 24330 }, { "epoch": 8.115410273515677, "ref_ce_loss": 0.053246621042490005, "step": 24330 }, { "epoch": 8.118745830553703, "loss": 0.4056, "step": 24340 }, { "epoch": 8.118745830553703, "grad_norm": 1.2795344591140747, "step": 24340 }, { "epoch": 8.118745830553703, "learning_rate": 7.198156110720864e-05, "step": 24340 }, { "epoch": 8.118745830553703, "loss": 0.334178626537323, "step": 24340 }, { "ce_loss": 0.03340744599699974, "epoch": 8.118745830553703, "step": 24340 }, { "distill_loss": 0.17978930473327637, "epoch": 8.118745830553703, "step": 24340 }, { "epoch": 8.118745830553703, "ref_ce_loss": 0.08155786991119385, "step": 24340 }, { "epoch": 8.118745830553703, "loss": 0.2408166229724884, "step": 24340 }, { "ce_loss": 0.029374677687883377, "epoch": 8.118745830553703, "step": 24340 }, { "distill_loss": 0.14466525614261627, "epoch": 8.118745830553703, "step": 24340 }, { "epoch": 8.118745830553703, "ref_ce_loss": 0.04205675050616264, "step": 24340 }, { "epoch": 8.122081387591727, "loss": 0.4237, "step": 24350 }, { "epoch": 8.122081387591727, "grad_norm": 1.5818060636520386, "step": 24350 }, { "epoch": 8.122081387591727, "learning_rate": 7.173444506978557e-05, "step": 24350 }, { "epoch": 8.122081387591727, "loss": 0.32580411434173584, "step": 24350 }, { "ce_loss": 0.03970218077301979, "epoch": 8.122081387591727, "step": 24350 }, { "distill_loss": 0.2042965441942215, "epoch": 8.122081387591727, "step": 24350 }, { "epoch": 8.122081387591727, "ref_ce_loss": 0.05940883979201317, "step": 24350 }, { "epoch": 8.122081387591727, "loss": 0.4163911044597626, "step": 24350 }, { "ce_loss": 0.08233669400215149, "epoch": 8.122081387591727, "step": 24350 }, { "distill_loss": 0.23903019726276398, "epoch": 8.122081387591727, "step": 24350 }, { "epoch": 8.122081387591727, "ref_ce_loss": 0.07528358697891235, "step": 24350 }, { "epoch": 8.125416944629754, "loss": 0.4194, "step": 24360 }, { "epoch": 8.125416944629754, "grad_norm": 1.5836371183395386, "step": 24360 }, { "epoch": 8.125416944629754, "learning_rate": 7.14877121535881e-05, "step": 24360 }, { "epoch": 8.125416944629754, "loss": 0.30909478664398193, "step": 24360 }, { "ce_loss": 0.031062021851539612, "epoch": 8.125416944629754, "step": 24360 }, { "distill_loss": 0.19787774980068207, "epoch": 8.125416944629754, "step": 24360 }, { "epoch": 8.125416944629754, "ref_ce_loss": 0.07995710521936417, "step": 24360 }, { "epoch": 8.125416944629754, "loss": 0.4969050884246826, "step": 24360 }, { "ce_loss": 0.04320429638028145, "epoch": 8.125416944629754, "step": 24360 }, { "distill_loss": 0.1878196746110916, "epoch": 8.125416944629754, "step": 24360 }, { "epoch": 8.125416944629754, "ref_ce_loss": 0.07372452318668365, "step": 24360 }, { "epoch": 8.128752501667778, "loss": 0.4068, "step": 24370 }, { "epoch": 8.128752501667778, "grad_norm": 1.9463021755218506, "step": 24370 }, { "epoch": 8.128752501667778, "learning_rate": 7.124136264657976e-05, "step": 24370 }, { "epoch": 8.128752501667778, "loss": 0.4209724962711334, "step": 24370 }, { "ce_loss": 0.0419982448220253, "epoch": 8.128752501667778, "step": 24370 }, { "distill_loss": 0.1879243701696396, "epoch": 8.128752501667778, "step": 24370 }, { "epoch": 8.128752501667778, "ref_ce_loss": 0.08443693071603775, "step": 24370 }, { "epoch": 8.128752501667778, "loss": 0.39130768179893494, "step": 24370 }, { "ce_loss": 0.08258134126663208, "epoch": 8.128752501667778, "step": 24370 }, { "distill_loss": 0.19457323849201202, "epoch": 8.128752501667778, "step": 24370 }, { "epoch": 8.128752501667778, "ref_ce_loss": 0.08725336939096451, "step": 24370 }, { "epoch": 8.132088058705804, "loss": 0.3772, "step": 24380 }, { "epoch": 8.132088058705804, "grad_norm": 1.717122197151184, "step": 24380 }, { "epoch": 8.132088058705804, "learning_rate": 7.099539683627714e-05, "step": 24380 }, { "epoch": 8.132088058705804, "loss": 0.41437631845474243, "step": 24380 }, { "ce_loss": 0.0480630062520504, "epoch": 8.132088058705804, "step": 24380 }, { "distill_loss": 0.2810288965702057, "epoch": 8.132088058705804, "step": 24380 }, { "epoch": 8.132088058705804, "ref_ce_loss": 0.08493179082870483, "step": 24380 }, { "epoch": 8.132088058705804, "loss": 0.3595638573169708, "step": 24380 }, { "ce_loss": 0.048516351729631424, "epoch": 8.132088058705804, "step": 24380 }, { "distill_loss": 0.14217573404312134, "epoch": 8.132088058705804, "step": 24380 }, { "epoch": 8.132088058705804, "ref_ce_loss": 0.06873290985822678, "step": 24380 }, { "epoch": 8.135423615743829, "loss": 0.4, "step": 24390 }, { "epoch": 8.135423615743829, "grad_norm": 1.4674592018127441, "step": 24390 }, { "epoch": 8.135423615743829, "learning_rate": 7.07498150097488e-05, "step": 24390 }, { "epoch": 8.135423615743829, "loss": 0.40654972195625305, "step": 24390 }, { "ce_loss": 0.09343191236257553, "epoch": 8.135423615743829, "step": 24390 }, { "distill_loss": 0.19275778532028198, "epoch": 8.135423615743829, "step": 24390 }, { "epoch": 8.135423615743829, "ref_ce_loss": 0.08056753128767014, "step": 24390 }, { "epoch": 8.135423615743829, "loss": 0.44827786087989807, "step": 24390 }, { "ce_loss": 0.04539187625050545, "epoch": 8.135423615743829, "step": 24390 }, { "distill_loss": 0.23888753354549408, "epoch": 8.135423615743829, "step": 24390 }, { "epoch": 8.135423615743829, "ref_ce_loss": 0.07933187484741211, "step": 24390 }, { "epoch": 8.138759172781855, "loss": 0.4132, "step": 24400 }, { "epoch": 8.138759172781855, "grad_norm": 0.793762743473053, "step": 24400 }, { "epoch": 8.138759172781855, "learning_rate": 7.05046174536152e-05, "step": 24400 }, { "epoch": 8.138759172781855, "loss": 0.37410956621170044, "step": 24400 }, { "ce_loss": 0.08392191678285599, "epoch": 8.138759172781855, "step": 24400 }, { "distill_loss": 0.19799497723579407, "epoch": 8.138759172781855, "step": 24400 }, { "epoch": 8.138759172781855, "ref_ce_loss": 0.09189742058515549, "step": 24400 }, { "epoch": 8.138759172781855, "loss": 0.3863672912120819, "step": 24400 }, { "ce_loss": 0.0856778547167778, "epoch": 8.138759172781855, "step": 24400 }, { "distill_loss": 0.20787179470062256, "epoch": 8.138759172781855, "step": 24400 }, { "epoch": 8.138759172781855, "ref_ce_loss": 0.09266812354326248, "step": 24400 }, { "epoch": 8.14209472981988, "loss": 0.3746, "step": 24410 }, { "epoch": 8.14209472981988, "grad_norm": 1.1739890575408936, "step": 24410 }, { "epoch": 8.14209472981988, "learning_rate": 7.025980445404811e-05, "step": 24410 }, { "epoch": 8.14209472981988, "loss": 0.6801389455795288, "step": 24410 }, { "ce_loss": 0.09862788766622543, "epoch": 8.14209472981988, "step": 24410 }, { "distill_loss": 0.2498609572649002, "epoch": 8.14209472981988, "step": 24410 }, { "epoch": 8.14209472981988, "ref_ce_loss": 0.09175879508256912, "step": 24410 }, { "epoch": 8.14209472981988, "loss": 0.4861341714859009, "step": 24410 }, { "ce_loss": 0.06381076574325562, "epoch": 8.14209472981988, "step": 24410 }, { "distill_loss": 0.23393845558166504, "epoch": 8.14209472981988, "step": 24410 }, { "epoch": 8.14209472981988, "ref_ce_loss": 0.07300575077533722, "step": 24410 }, { "epoch": 8.145430286857906, "loss": 0.412, "step": 24420 }, { "epoch": 8.145430286857906, "grad_norm": 2.2138359546661377, "step": 24420 }, { "epoch": 8.145430286857906, "learning_rate": 7.001537629677061e-05, "step": 24420 }, { "epoch": 8.145430286857906, "loss": 0.4213656187057495, "step": 24420 }, { "ce_loss": 0.08057521283626556, "epoch": 8.145430286857906, "step": 24420 }, { "distill_loss": 0.2415209710597992, "epoch": 8.145430286857906, "step": 24420 }, { "epoch": 8.145430286857906, "ref_ce_loss": 0.07787420600652695, "step": 24420 }, { "epoch": 8.145430286857906, "loss": 0.3960283100605011, "step": 24420 }, { "ce_loss": 0.06811496615409851, "epoch": 8.145430286857906, "step": 24420 }, { "distill_loss": 0.2203001230955124, "epoch": 8.145430286857906, "step": 24420 }, { "epoch": 8.145430286857906, "ref_ce_loss": 0.0836324393749237, "step": 24420 }, { "epoch": 8.14876584389593, "loss": 0.4497, "step": 24430 }, { "epoch": 8.14876584389593, "grad_norm": 1.6273220777511597, "step": 24430 }, { "epoch": 8.14876584389593, "learning_rate": 6.977133326705701e-05, "step": 24430 }, { "epoch": 8.14876584389593, "loss": 0.4896760582923889, "step": 24430 }, { "ce_loss": 0.03350548446178436, "epoch": 8.14876584389593, "step": 24430 }, { "distill_loss": 0.16317717730998993, "epoch": 8.14876584389593, "step": 24430 }, { "epoch": 8.14876584389593, "ref_ce_loss": 0.06565791368484497, "step": 24430 }, { "epoch": 8.14876584389593, "loss": 0.5192831158638, "step": 24430 }, { "ce_loss": 0.08715259283781052, "epoch": 8.14876584389593, "step": 24430 }, { "distill_loss": 0.22097237408161163, "epoch": 8.14876584389593, "step": 24430 }, { "epoch": 8.14876584389593, "ref_ce_loss": 0.06737726926803589, "step": 24430 }, { "epoch": 8.152101400933956, "loss": 0.434, "step": 24440 }, { "epoch": 8.152101400933956, "grad_norm": 1.2134703397750854, "step": 24440 }, { "epoch": 8.152101400933956, "learning_rate": 6.952767564973149e-05, "step": 24440 }, { "epoch": 8.152101400933956, "loss": 0.42709025740623474, "step": 24440 }, { "ce_loss": 0.06981752067804337, "epoch": 8.152101400933956, "step": 24440 }, { "distill_loss": 0.26825565099716187, "epoch": 8.152101400933956, "step": 24440 }, { "epoch": 8.152101400933956, "ref_ce_loss": 0.06773539632558823, "step": 24440 }, { "epoch": 8.152101400933956, "loss": 0.2758011221885681, "step": 24440 }, { "ce_loss": 0.03051314689218998, "epoch": 8.152101400933956, "step": 24440 }, { "distill_loss": 0.17103226482868195, "epoch": 8.152101400933956, "step": 24440 }, { "epoch": 8.152101400933956, "ref_ce_loss": 0.07407353818416595, "step": 24440 }, { "epoch": 8.15543695797198, "loss": 0.4523, "step": 24450 }, { "epoch": 8.15543695797198, "grad_norm": 0.9241464734077454, "step": 24450 }, { "epoch": 8.15543695797198, "learning_rate": 6.928440372916872e-05, "step": 24450 }, { "epoch": 8.15543695797198, "loss": 0.3098130524158478, "step": 24450 }, { "ce_loss": 0.06479384750127792, "epoch": 8.15543695797198, "step": 24450 }, { "distill_loss": 0.16387423872947693, "epoch": 8.15543695797198, "step": 24450 }, { "epoch": 8.15543695797198, "ref_ce_loss": 0.051972776651382446, "step": 24450 }, { "epoch": 8.15543695797198, "loss": 0.26677024364471436, "step": 24450 }, { "ce_loss": 0.025871606543660164, "epoch": 8.15543695797198, "step": 24450 }, { "distill_loss": 0.17901934683322906, "epoch": 8.15543695797198, "step": 24450 }, { "epoch": 8.15543695797198, "ref_ce_loss": 0.04680072143673897, "step": 24450 }, { "epoch": 8.158772515010007, "loss": 0.4329, "step": 24460 }, { "epoch": 8.158772515010007, "grad_norm": 1.5348268747329712, "step": 24460 }, { "epoch": 8.158772515010007, "learning_rate": 6.904151778929331e-05, "step": 24460 }, { "epoch": 8.158772515010007, "loss": 0.30024340748786926, "step": 24460 }, { "ce_loss": 0.0465410090982914, "epoch": 8.158772515010007, "step": 24460 }, { "distill_loss": 0.17435558140277863, "epoch": 8.158772515010007, "step": 24460 }, { "epoch": 8.158772515010007, "ref_ce_loss": 0.07861968874931335, "step": 24460 }, { "epoch": 8.158772515010007, "loss": 0.621793806552887, "step": 24460 }, { "ce_loss": 0.07838597893714905, "epoch": 8.158772515010007, "step": 24460 }, { "distill_loss": 0.22223365306854248, "epoch": 8.158772515010007, "step": 24460 }, { "epoch": 8.158772515010007, "ref_ce_loss": 0.0761762410402298, "step": 24460 }, { "epoch": 8.162108072048031, "loss": 0.4525, "step": 24470 }, { "epoch": 8.162108072048031, "grad_norm": 0.9262333512306213, "step": 24470 }, { "epoch": 8.162108072048031, "learning_rate": 6.879901811357931e-05, "step": 24470 }, { "epoch": 8.162108072048031, "loss": 0.4530099034309387, "step": 24470 }, { "ce_loss": 0.07020189613103867, "epoch": 8.162108072048031, "step": 24470 }, { "distill_loss": 0.2217385172843933, "epoch": 8.162108072048031, "step": 24470 }, { "epoch": 8.162108072048031, "ref_ce_loss": 0.07607980072498322, "step": 24470 }, { "epoch": 8.162108072048031, "loss": 0.43743398785591125, "step": 24470 }, { "ce_loss": 0.10149736702442169, "epoch": 8.162108072048031, "step": 24470 }, { "distill_loss": 0.26674532890319824, "epoch": 8.162108072048031, "step": 24470 }, { "epoch": 8.162108072048031, "ref_ce_loss": 0.06348459422588348, "step": 24470 }, { "epoch": 8.165443629086058, "loss": 0.4397, "step": 24480 }, { "epoch": 8.165443629086058, "grad_norm": 1.3768205642700195, "step": 24480 }, { "epoch": 8.165443629086058, "learning_rate": 6.85569049850498e-05, "step": 24480 }, { "epoch": 8.165443629086058, "loss": 0.3973981440067291, "step": 24480 }, { "ce_loss": 0.07804957032203674, "epoch": 8.165443629086058, "step": 24480 }, { "distill_loss": 0.22942686080932617, "epoch": 8.165443629086058, "step": 24480 }, { "epoch": 8.165443629086058, "ref_ce_loss": 0.05899369344115257, "step": 24480 }, { "epoch": 8.165443629086058, "loss": 0.6007339954376221, "step": 24480 }, { "ce_loss": 0.05822492390871048, "epoch": 8.165443629086058, "step": 24480 }, { "distill_loss": 0.23770646750926971, "epoch": 8.165443629086058, "step": 24480 }, { "epoch": 8.165443629086058, "ref_ce_loss": 0.09923166036605835, "step": 24480 }, { "epoch": 8.168779186124082, "loss": 0.4639, "step": 24490 }, { "epoch": 8.168779186124082, "grad_norm": 1.7965821027755737, "step": 24490 }, { "epoch": 8.168779186124082, "learning_rate": 6.831517868627693e-05, "step": 24490 }, { "epoch": 8.168779186124082, "loss": 0.46187624335289, "step": 24490 }, { "ce_loss": 0.10025662183761597, "epoch": 8.168779186124082, "step": 24490 }, { "distill_loss": 0.26213783025741577, "epoch": 8.168779186124082, "step": 24490 }, { "epoch": 8.168779186124082, "ref_ce_loss": 0.08308776468038559, "step": 24490 }, { "epoch": 8.168779186124082, "loss": 0.6011335849761963, "step": 24490 }, { "ce_loss": 0.1039145365357399, "epoch": 8.168779186124082, "step": 24490 }, { "distill_loss": 0.31061768531799316, "epoch": 8.168779186124082, "step": 24490 }, { "epoch": 8.168779186124082, "ref_ce_loss": 0.10056736320257187, "step": 24490 }, { "epoch": 8.172114743162108, "loss": 0.4563, "step": 24500 }, { "epoch": 8.172114743162108, "grad_norm": 1.7553499937057495, "step": 24500 }, { "epoch": 8.172114743162108, "learning_rate": 6.80738394993813e-05, "step": 24500 }, { "epoch": 8.172114743162108, "loss": 0.40037602186203003, "step": 24500 }, { "ce_loss": 0.10152076929807663, "epoch": 8.172114743162108, "step": 24500 }, { "distill_loss": 0.21522164344787598, "epoch": 8.172114743162108, "step": 24500 }, { "epoch": 8.172114743162108, "ref_ce_loss": 0.08338447660207748, "step": 24500 }, { "epoch": 8.172114743162108, "loss": 0.2687154710292816, "step": 24500 }, { "ce_loss": 0.03502975404262543, "epoch": 8.172114743162108, "step": 24500 }, { "distill_loss": 0.16856245696544647, "epoch": 8.172114743162108, "step": 24500 }, { "epoch": 8.172114743162108, "ref_ce_loss": 0.04353145882487297, "step": 24500 }, { "epoch": 8.175450300200133, "loss": 0.4511, "step": 24510 }, { "epoch": 8.175450300200133, "grad_norm": 1.439299464225769, "step": 24510 }, { "epoch": 8.175450300200133, "learning_rate": 6.783288770603169e-05, "step": 24510 }, { "epoch": 8.175450300200133, "loss": 0.5104728937149048, "step": 24510 }, { "ce_loss": 0.06655392050743103, "epoch": 8.175450300200133, "step": 24510 }, { "distill_loss": 0.2705954313278198, "epoch": 8.175450300200133, "step": 24510 }, { "epoch": 8.175450300200133, "ref_ce_loss": 0.08169260621070862, "step": 24510 }, { "epoch": 8.175450300200133, "loss": 0.4592511057853699, "step": 24510 }, { "ce_loss": 0.06678766012191772, "epoch": 8.175450300200133, "step": 24510 }, { "distill_loss": 0.23218107223510742, "epoch": 8.175450300200133, "step": 24510 }, { "epoch": 8.175450300200133, "ref_ce_loss": 0.07493381947278976, "step": 24510 }, { "epoch": 8.178785857238159, "loss": 0.483, "step": 24520 }, { "epoch": 8.178785857238159, "grad_norm": 1.1014498472213745, "step": 24520 }, { "epoch": 8.178785857238159, "learning_rate": 6.75923235874449e-05, "step": 24520 }, { "epoch": 8.178785857238159, "loss": 0.49110180139541626, "step": 24520 }, { "ce_loss": 0.102203369140625, "epoch": 8.178785857238159, "step": 24520 }, { "distill_loss": 0.24019083380699158, "epoch": 8.178785857238159, "step": 24520 }, { "epoch": 8.178785857238159, "ref_ce_loss": 0.0786530077457428, "step": 24520 }, { "epoch": 8.178785857238159, "loss": 0.7304254770278931, "step": 24520 }, { "ce_loss": 0.04971720278263092, "epoch": 8.178785857238159, "step": 24520 }, { "distill_loss": 0.17906175553798676, "epoch": 8.178785857238159, "step": 24520 }, { "epoch": 8.178785857238159, "ref_ce_loss": 0.06416261941194534, "step": 24520 }, { "epoch": 8.182121414276184, "loss": 0.4461, "step": 24530 }, { "epoch": 8.182121414276184, "grad_norm": 1.209089756011963, "step": 24530 }, { "epoch": 8.182121414276184, "learning_rate": 6.735214742438482e-05, "step": 24530 }, { "epoch": 8.182121414276184, "loss": 0.8661839962005615, "step": 24530 }, { "ce_loss": 0.030243270099163055, "epoch": 8.182121414276184, "step": 24530 }, { "distill_loss": 0.21225211024284363, "epoch": 8.182121414276184, "step": 24530 }, { "epoch": 8.182121414276184, "ref_ce_loss": 0.06496108323335648, "step": 24530 }, { "epoch": 8.182121414276184, "loss": 0.5166357755661011, "step": 24530 }, { "ce_loss": 0.06101277470588684, "epoch": 8.182121414276184, "step": 24530 }, { "distill_loss": 0.18440604209899902, "epoch": 8.182121414276184, "step": 24530 }, { "epoch": 8.182121414276184, "ref_ce_loss": 0.047844257205724716, "step": 24530 }, { "epoch": 8.18545697131421, "loss": 0.4986, "step": 24540 }, { "epoch": 8.18545697131421, "grad_norm": 1.5618125200271606, "step": 24540 }, { "epoch": 8.18545697131421, "learning_rate": 6.711235949716316e-05, "step": 24540 }, { "epoch": 8.18545697131421, "loss": 0.3254857361316681, "step": 24540 }, { "ce_loss": 0.03264736011624336, "epoch": 8.18545697131421, "step": 24540 }, { "distill_loss": 0.1845029890537262, "epoch": 8.18545697131421, "step": 24540 }, { "epoch": 8.18545697131421, "ref_ce_loss": 0.05590250343084335, "step": 24540 }, { "epoch": 8.18545697131421, "loss": 0.29291635751724243, "step": 24540 }, { "ce_loss": 0.03950078412890434, "epoch": 8.18545697131421, "step": 24540 }, { "distill_loss": 0.19686773419380188, "epoch": 8.18545697131421, "step": 24540 }, { "epoch": 8.18545697131421, "ref_ce_loss": 0.05645206943154335, "step": 24540 }, { "epoch": 8.188792528352234, "loss": 0.4274, "step": 24550 }, { "epoch": 8.188792528352234, "grad_norm": 1.675386667251587, "step": 24550 }, { "epoch": 8.188792528352234, "learning_rate": 6.687296008563828e-05, "step": 24550 }, { "epoch": 8.188792528352234, "loss": 0.5986274480819702, "step": 24550 }, { "ce_loss": 0.03906279802322388, "epoch": 8.188792528352234, "step": 24550 }, { "distill_loss": 0.19594983756542206, "epoch": 8.188792528352234, "step": 24550 }, { "epoch": 8.188792528352234, "ref_ce_loss": 0.0729498490691185, "step": 24550 }, { "epoch": 8.188792528352234, "loss": 0.41088801622390747, "step": 24550 }, { "ce_loss": 0.03986736014485359, "epoch": 8.188792528352234, "step": 24550 }, { "distill_loss": 0.21624164283275604, "epoch": 8.188792528352234, "step": 24550 }, { "epoch": 8.188792528352234, "ref_ce_loss": 0.08400541543960571, "step": 24550 }, { "epoch": 8.19212808539026, "loss": 0.4294, "step": 24560 }, { "epoch": 8.19212808539026, "grad_norm": 1.3876982927322388, "step": 24560 }, { "epoch": 8.19212808539026, "learning_rate": 6.663394946921489e-05, "step": 24560 }, { "epoch": 8.19212808539026, "loss": 0.3449614942073822, "step": 24560 }, { "ce_loss": 0.05582842603325844, "epoch": 8.19212808539026, "step": 24560 }, { "distill_loss": 0.18621015548706055, "epoch": 8.19212808539026, "step": 24560 }, { "epoch": 8.19212808539026, "ref_ce_loss": 0.07234183698892593, "step": 24560 }, { "epoch": 8.19212808539026, "loss": 0.5139371752738953, "step": 24560 }, { "ce_loss": 0.06884564459323883, "epoch": 8.19212808539026, "step": 24560 }, { "distill_loss": 0.23937104642391205, "epoch": 8.19212808539026, "step": 24560 }, { "epoch": 8.19212808539026, "ref_ce_loss": 0.10060980170965195, "step": 24560 }, { "epoch": 8.195463642428285, "loss": 0.3932, "step": 24570 }, { "epoch": 8.195463642428285, "grad_norm": 1.207762360572815, "step": 24570 }, { "epoch": 8.195463642428285, "learning_rate": 6.639532792684406e-05, "step": 24570 }, { "epoch": 8.195463642428285, "loss": 0.3394352197647095, "step": 24570 }, { "ce_loss": 0.05197358503937721, "epoch": 8.195463642428285, "step": 24570 }, { "distill_loss": 0.19060836732387543, "epoch": 8.195463642428285, "step": 24570 }, { "epoch": 8.195463642428285, "ref_ce_loss": 0.06508396565914154, "step": 24570 }, { "epoch": 8.195463642428285, "loss": 0.40705007314682007, "step": 24570 }, { "ce_loss": 0.05480041354894638, "epoch": 8.195463642428285, "step": 24570 }, { "distill_loss": 0.1943550407886505, "epoch": 8.195463642428285, "step": 24570 }, { "epoch": 8.195463642428285, "ref_ce_loss": 0.08861961960792542, "step": 24570 }, { "epoch": 8.198799199466311, "loss": 0.3948, "step": 24580 }, { "epoch": 8.198799199466311, "grad_norm": 1.828460693359375, "step": 24580 }, { "epoch": 8.198799199466311, "learning_rate": 6.615709573702291e-05, "step": 24580 }, { "epoch": 8.198799199466311, "loss": 0.3562234342098236, "step": 24580 }, { "ce_loss": 0.08233077079057693, "epoch": 8.198799199466311, "step": 24580 }, { "distill_loss": 0.1983891725540161, "epoch": 8.198799199466311, "step": 24580 }, { "epoch": 8.198799199466311, "ref_ce_loss": 0.07498323917388916, "step": 24580 }, { "epoch": 8.198799199466311, "loss": 0.33163464069366455, "step": 24580 }, { "ce_loss": 0.04026639088988304, "epoch": 8.198799199466311, "step": 24580 }, { "distill_loss": 0.17978839576244354, "epoch": 8.198799199466311, "step": 24580 }, { "epoch": 8.198799199466311, "ref_ce_loss": 0.0723813846707344, "step": 24580 }, { "epoch": 8.202134756504336, "loss": 0.4389, "step": 24590 }, { "epoch": 8.202134756504336, "grad_norm": 2.6070868968963623, "step": 24590 }, { "epoch": 8.202134756504336, "learning_rate": 6.591925317779412e-05, "step": 24590 }, { "epoch": 8.202134756504336, "loss": 0.28905245661735535, "step": 24590 }, { "ce_loss": 0.02365030162036419, "epoch": 8.202134756504336, "step": 24590 }, { "distill_loss": 0.18448412418365479, "epoch": 8.202134756504336, "step": 24590 }, { "epoch": 8.202134756504336, "ref_ce_loss": 0.060775455087423325, "step": 24590 }, { "epoch": 8.202134756504336, "loss": 0.3054485619068146, "step": 24590 }, { "ce_loss": 0.05717819184064865, "epoch": 8.202134756504336, "step": 24590 }, { "distill_loss": 0.13961108028888702, "epoch": 8.202134756504336, "step": 24590 }, { "epoch": 8.202134756504336, "ref_ce_loss": 0.07123158872127533, "step": 24590 }, { "epoch": 8.205470313542362, "loss": 0.3917, "step": 24600 }, { "epoch": 8.205470313542362, "grad_norm": 1.6249158382415771, "step": 24600 }, { "epoch": 8.205470313542362, "learning_rate": 6.568180052674535e-05, "step": 24600 }, { "epoch": 8.205470313542362, "loss": 0.41753503680229187, "step": 24600 }, { "ce_loss": 0.06948726624250412, "epoch": 8.205470313542362, "step": 24600 }, { "distill_loss": 0.23197221755981445, "epoch": 8.205470313542362, "step": 24600 }, { "epoch": 8.205470313542362, "ref_ce_loss": 0.07913843542337418, "step": 24600 }, { "epoch": 8.205470313542362, "loss": 0.37326788902282715, "step": 24600 }, { "ce_loss": 0.06086521968245506, "epoch": 8.205470313542362, "step": 24600 }, { "distill_loss": 0.20718753337860107, "epoch": 8.205470313542362, "step": 24600 }, { "epoch": 8.205470313542362, "ref_ce_loss": 0.07859325408935547, "step": 24600 }, { "epoch": 8.208805870580386, "loss": 0.4623, "step": 24610 }, { "epoch": 8.208805870580386, "grad_norm": 1.1451125144958496, "step": 24610 }, { "epoch": 8.208805870580386, "learning_rate": 6.544473806100953e-05, "step": 24610 }, { "epoch": 8.208805870580386, "loss": 0.4168807864189148, "step": 24610 }, { "ce_loss": 0.08794716000556946, "epoch": 8.208805870580386, "step": 24610 }, { "distill_loss": 0.17066535353660583, "epoch": 8.208805870580386, "step": 24610 }, { "epoch": 8.208805870580386, "ref_ce_loss": 0.06990622729063034, "step": 24610 }, { "epoch": 8.208805870580386, "loss": 0.29761603474617004, "step": 24610 }, { "ce_loss": 0.05575099587440491, "epoch": 8.208805870580386, "step": 24610 }, { "distill_loss": 0.18124572932720184, "epoch": 8.208805870580386, "step": 24610 }, { "epoch": 8.208805870580386, "ref_ce_loss": 0.060525815933942795, "step": 24610 }, { "epoch": 8.212141427618413, "loss": 0.4032, "step": 24620 }, { "epoch": 8.212141427618413, "grad_norm": 1.6889187097549438, "step": 24620 }, { "epoch": 8.212141427618413, "learning_rate": 6.520806605726399e-05, "step": 24620 }, { "epoch": 8.212141427618413, "loss": 0.3321390151977539, "step": 24620 }, { "ce_loss": 0.04720529541373253, "epoch": 8.212141427618413, "step": 24620 }, { "distill_loss": 0.16193419694900513, "epoch": 8.212141427618413, "step": 24620 }, { "epoch": 8.212141427618413, "ref_ce_loss": 0.08601251244544983, "step": 24620 }, { "epoch": 8.212141427618413, "loss": 0.33572375774383545, "step": 24620 }, { "ce_loss": 0.0730213150382042, "epoch": 8.212141427618413, "step": 24620 }, { "distill_loss": 0.16905918717384338, "epoch": 8.212141427618413, "step": 24620 }, { "epoch": 8.212141427618413, "ref_ce_loss": 0.09331928193569183, "step": 24620 }, { "epoch": 8.215476984656437, "loss": 0.3877, "step": 24630 }, { "epoch": 8.215476984656437, "grad_norm": 1.0236163139343262, "step": 24630 }, { "epoch": 8.215476984656437, "learning_rate": 6.497178479173056e-05, "step": 24630 }, { "epoch": 8.215476984656437, "loss": 0.4932849407196045, "step": 24630 }, { "ce_loss": 0.04189939796924591, "epoch": 8.215476984656437, "step": 24630 }, { "distill_loss": 0.1931006908416748, "epoch": 8.215476984656437, "step": 24630 }, { "epoch": 8.215476984656437, "ref_ce_loss": 0.08937612175941467, "step": 24630 }, { "epoch": 8.215476984656437, "loss": 0.3568384051322937, "step": 24630 }, { "ce_loss": 0.0696152076125145, "epoch": 8.215476984656437, "step": 24630 }, { "distill_loss": 0.18815016746520996, "epoch": 8.215476984656437, "step": 24630 }, { "epoch": 8.215476984656437, "ref_ce_loss": 0.06303508579730988, "step": 24630 }, { "epoch": 8.218812541694463, "loss": 0.4584, "step": 24640 }, { "epoch": 8.218812541694463, "grad_norm": 1.1400558948516846, "step": 24640 }, { "epoch": 8.218812541694463, "learning_rate": 6.473589454017464e-05, "step": 24640 }, { "epoch": 8.218812541694463, "loss": 0.6148601770401001, "step": 24640 }, { "ce_loss": 0.05197329819202423, "epoch": 8.218812541694463, "step": 24640 }, { "distill_loss": 0.21115422248840332, "epoch": 8.218812541694463, "step": 24640 }, { "epoch": 8.218812541694463, "ref_ce_loss": 0.09626834839582443, "step": 24640 }, { "epoch": 8.218812541694463, "loss": 0.3354915976524353, "step": 24640 }, { "ce_loss": 0.0600191093981266, "epoch": 8.218812541694463, "step": 24640 }, { "distill_loss": 0.20265516638755798, "epoch": 8.218812541694463, "step": 24640 }, { "epoch": 8.218812541694463, "ref_ce_loss": 0.07247176766395569, "step": 24640 }, { "epoch": 8.222148098732488, "loss": 0.4794, "step": 24650 }, { "epoch": 8.222148098732488, "grad_norm": 1.4587262868881226, "step": 24650 }, { "epoch": 8.222148098732488, "learning_rate": 6.450039557790577e-05, "step": 24650 }, { "epoch": 8.222148098732488, "loss": 0.6026730537414551, "step": 24650 }, { "ce_loss": 0.08934365212917328, "epoch": 8.222148098732488, "step": 24650 }, { "distill_loss": 0.26244986057281494, "epoch": 8.222148098732488, "step": 24650 }, { "epoch": 8.222148098732488, "ref_ce_loss": 0.08324063569307327, "step": 24650 }, { "epoch": 8.222148098732488, "loss": 0.4086545705795288, "step": 24650 }, { "ce_loss": 0.06538889557123184, "epoch": 8.222148098732488, "step": 24650 }, { "distill_loss": 0.18500006198883057, "epoch": 8.222148098732488, "step": 24650 }, { "epoch": 8.222148098732488, "ref_ce_loss": 0.07706859707832336, "step": 24650 }, { "epoch": 8.225483655770514, "loss": 0.4254, "step": 24660 }, { "epoch": 8.225483655770514, "grad_norm": 0.9976798892021179, "step": 24660 }, { "epoch": 8.225483655770514, "learning_rate": 6.42652881797765e-05, "step": 24660 }, { "epoch": 8.225483655770514, "loss": 0.35616809129714966, "step": 24660 }, { "ce_loss": 0.03588799387216568, "epoch": 8.225483655770514, "step": 24660 }, { "distill_loss": 0.21862131357192993, "epoch": 8.225483655770514, "step": 24660 }, { "epoch": 8.225483655770514, "ref_ce_loss": 0.07292570918798447, "step": 24660 }, { "epoch": 8.225483655770514, "loss": 0.41984128952026367, "step": 24660 }, { "ce_loss": 0.08020157366991043, "epoch": 8.225483655770514, "step": 24660 }, { "distill_loss": 0.21562683582305908, "epoch": 8.225483655770514, "step": 24660 }, { "epoch": 8.225483655770514, "ref_ce_loss": 0.08471453934907913, "step": 24660 }, { "epoch": 8.228819212808538, "loss": 0.389, "step": 24670 }, { "epoch": 8.228819212808538, "grad_norm": 0.9158706665039062, "step": 24670 }, { "epoch": 8.228819212808538, "learning_rate": 6.403057262018259e-05, "step": 24670 }, { "epoch": 8.228819212808538, "loss": 0.3161277770996094, "step": 24670 }, { "ce_loss": 0.05249093845486641, "epoch": 8.228819212808538, "step": 24670 }, { "distill_loss": 0.16578979790210724, "epoch": 8.228819212808538, "step": 24670 }, { "epoch": 8.228819212808538, "ref_ce_loss": 0.05899889022111893, "step": 24670 }, { "epoch": 8.228819212808538, "loss": 0.36990371346473694, "step": 24670 }, { "ce_loss": 0.0632263645529747, "epoch": 8.228819212808538, "step": 24670 }, { "distill_loss": 0.2002817690372467, "epoch": 8.228819212808538, "step": 24670 }, { "epoch": 8.228819212808538, "ref_ce_loss": 0.07740586251020432, "step": 24670 }, { "epoch": 8.232154769846565, "loss": 0.418, "step": 24680 }, { "epoch": 8.232154769846565, "grad_norm": 1.2354545593261719, "step": 24680 }, { "epoch": 8.232154769846565, "learning_rate": 6.379624917306214e-05, "step": 24680 }, { "epoch": 8.232154769846565, "loss": 0.40954896807670593, "step": 24680 }, { "ce_loss": 0.09720776975154877, "epoch": 8.232154769846565, "step": 24680 }, { "distill_loss": 0.20144513249397278, "epoch": 8.232154769846565, "step": 24680 }, { "epoch": 8.232154769846565, "ref_ce_loss": 0.07739008963108063, "step": 24680 }, { "epoch": 8.232154769846565, "loss": 0.2706911861896515, "step": 24680 }, { "ce_loss": 0.03822727128863335, "epoch": 8.232154769846565, "step": 24680 }, { "distill_loss": 0.15807346999645233, "epoch": 8.232154769846565, "step": 24680 }, { "epoch": 8.232154769846565, "ref_ce_loss": 0.07413903623819351, "step": 24680 }, { "epoch": 8.23549032688459, "loss": 0.4484, "step": 24690 }, { "epoch": 8.23549032688459, "grad_norm": 1.1851049661636353, "step": 24690 }, { "epoch": 8.23549032688459, "learning_rate": 6.356231811189593e-05, "step": 24690 }, { "epoch": 8.23549032688459, "loss": 0.30205926299095154, "step": 24690 }, { "ce_loss": 0.0655367523431778, "epoch": 8.23549032688459, "step": 24690 }, { "distill_loss": 0.16531744599342346, "epoch": 8.23549032688459, "step": 24690 }, { "epoch": 8.23549032688459, "ref_ce_loss": 0.07099558413028717, "step": 24690 }, { "epoch": 8.23549032688459, "loss": 0.38187289237976074, "step": 24690 }, { "ce_loss": 0.04009734466671944, "epoch": 8.23549032688459, "step": 24690 }, { "distill_loss": 0.1971883326768875, "epoch": 8.23549032688459, "step": 24690 }, { "epoch": 8.23549032688459, "ref_ce_loss": 0.0627526342868805, "step": 24690 }, { "epoch": 8.238825883922615, "loss": 0.4152, "step": 24700 }, { "epoch": 8.238825883922615, "grad_norm": 1.4340665340423584, "step": 24700 }, { "epoch": 8.238825883922615, "learning_rate": 6.332877970970667e-05, "step": 24700 }, { "epoch": 8.238825883922615, "loss": 0.41680845618247986, "step": 24700 }, { "ce_loss": 0.0698235034942627, "epoch": 8.238825883922615, "step": 24700 }, { "distill_loss": 0.20547881722450256, "epoch": 8.238825883922615, "step": 24700 }, { "epoch": 8.238825883922615, "ref_ce_loss": 0.08544392883777618, "step": 24700 }, { "epoch": 8.238825883922615, "loss": 0.3988693058490753, "step": 24700 }, { "ce_loss": 0.07071752101182938, "epoch": 8.238825883922615, "step": 24700 }, { "distill_loss": 0.21910333633422852, "epoch": 8.238825883922615, "step": 24700 }, { "epoch": 8.238825883922615, "ref_ce_loss": 0.06319587677717209, "step": 24700 }, { "epoch": 8.24216144096064, "loss": 0.4185, "step": 24710 }, { "epoch": 8.24216144096064, "grad_norm": 1.073905110359192, "step": 24710 }, { "epoch": 8.24216144096064, "learning_rate": 6.309563423905891e-05, "step": 24710 }, { "epoch": 8.24216144096064, "loss": 0.46671733260154724, "step": 24710 }, { "ce_loss": 0.047412920743227005, "epoch": 8.24216144096064, "step": 24710 }, { "distill_loss": 0.21610336005687714, "epoch": 8.24216144096064, "step": 24710 }, { "epoch": 8.24216144096064, "ref_ce_loss": 0.05106896907091141, "step": 24710 }, { "epoch": 8.24216144096064, "loss": 0.6524374485015869, "step": 24710 }, { "ce_loss": 0.07691330462694168, "epoch": 8.24216144096064, "step": 24710 }, { "distill_loss": 0.21490544080734253, "epoch": 8.24216144096064, "step": 24710 }, { "epoch": 8.24216144096064, "ref_ce_loss": 0.11045210808515549, "step": 24710 }, { "epoch": 8.245496997998666, "loss": 0.4092, "step": 24720 }, { "epoch": 8.245496997998666, "grad_norm": 1.0162745714187622, "step": 24720 }, { "epoch": 8.245496997998666, "learning_rate": 6.286288197205834e-05, "step": 24720 }, { "epoch": 8.245496997998666, "loss": 0.6139237880706787, "step": 24720 }, { "ce_loss": 0.0492384098470211, "epoch": 8.245496997998666, "step": 24720 }, { "distill_loss": 0.22562669217586517, "epoch": 8.245496997998666, "step": 24720 }, { "epoch": 8.245496997998666, "ref_ce_loss": 0.07977955788373947, "step": 24720 }, { "epoch": 8.245496997998666, "loss": 0.31097501516342163, "step": 24720 }, { "ce_loss": 0.04455602914094925, "epoch": 8.245496997998666, "step": 24720 }, { "distill_loss": 0.17696070671081543, "epoch": 8.245496997998666, "step": 24720 }, { "epoch": 8.245496997998666, "ref_ce_loss": 0.06480378657579422, "step": 24720 }, { "epoch": 8.24883255503669, "loss": 0.4482, "step": 24730 }, { "epoch": 8.24883255503669, "grad_norm": 1.1851487159729004, "step": 24730 }, { "epoch": 8.24883255503669, "learning_rate": 6.26305231803519e-05, "step": 24730 }, { "epoch": 8.24883255503669, "loss": 0.37734466791152954, "step": 24730 }, { "ce_loss": 0.06257500499486923, "epoch": 8.24883255503669, "step": 24730 }, { "distill_loss": 0.20443084836006165, "epoch": 8.24883255503669, "step": 24730 }, { "epoch": 8.24883255503669, "ref_ce_loss": 0.08424118161201477, "step": 24730 }, { "epoch": 8.24883255503669, "loss": 0.28724437952041626, "step": 24730 }, { "ce_loss": 0.035595282912254333, "epoch": 8.24883255503669, "step": 24730 }, { "distill_loss": 0.1421755701303482, "epoch": 8.24883255503669, "step": 24730 }, { "epoch": 8.24883255503669, "ref_ce_loss": 0.06486207991838455, "step": 24730 }, { "epoch": 8.252168112074717, "loss": 0.3924, "step": 24740 }, { "epoch": 8.252168112074717, "grad_norm": 1.8277186155319214, "step": 24740 }, { "epoch": 8.252168112074717, "learning_rate": 6.239855813512741e-05, "step": 24740 }, { "epoch": 8.252168112074717, "loss": 0.5576151013374329, "step": 24740 }, { "ce_loss": 0.07668827474117279, "epoch": 8.252168112074717, "step": 24740 }, { "distill_loss": 0.2058444321155548, "epoch": 8.252168112074717, "step": 24740 }, { "epoch": 8.252168112074717, "ref_ce_loss": 0.09041954576969147, "step": 24740 }, { "epoch": 8.252168112074717, "loss": 0.5620578527450562, "step": 24740 }, { "ce_loss": 0.10907970368862152, "epoch": 8.252168112074717, "step": 24740 }, { "distill_loss": 0.2825700044631958, "epoch": 8.252168112074717, "step": 24740 }, { "epoch": 8.252168112074717, "ref_ce_loss": 0.08636128902435303, "step": 24740 }, { "epoch": 8.255503669112741, "loss": 0.4372, "step": 24750 }, { "epoch": 8.255503669112741, "grad_norm": 2.166013717651367, "step": 24750 }, { "epoch": 8.255503669112741, "learning_rate": 6.216698710711297e-05, "step": 24750 }, { "epoch": 8.255503669112741, "loss": 0.5015909671783447, "step": 24750 }, { "ce_loss": 0.07607283443212509, "epoch": 8.255503669112741, "step": 24750 }, { "distill_loss": 0.24176400899887085, "epoch": 8.255503669112741, "step": 24750 }, { "epoch": 8.255503669112741, "ref_ce_loss": 0.0735190212726593, "step": 24750 }, { "epoch": 8.255503669112741, "loss": 0.4340944290161133, "step": 24750 }, { "ce_loss": 0.04279591515660286, "epoch": 8.255503669112741, "step": 24750 }, { "distill_loss": 0.21969415247440338, "epoch": 8.255503669112741, "step": 24750 }, { "epoch": 8.255503669112741, "ref_ce_loss": 0.07828209549188614, "step": 24750 }, { "epoch": 8.258839226150767, "loss": 0.4027, "step": 24760 }, { "epoch": 8.258839226150767, "grad_norm": 1.5689845085144043, "step": 24760 }, { "epoch": 8.258839226150767, "learning_rate": 6.193581036657694e-05, "step": 24760 }, { "epoch": 8.258839226150767, "loss": 0.3352030813694, "step": 24760 }, { "ce_loss": 0.08149098604917526, "epoch": 8.258839226150767, "step": 24760 }, { "distill_loss": 0.17032968997955322, "epoch": 8.258839226150767, "step": 24760 }, { "epoch": 8.258839226150767, "ref_ce_loss": 0.08317124843597412, "step": 24760 }, { "epoch": 8.258839226150767, "loss": 0.2671343982219696, "step": 24760 }, { "ce_loss": 0.04549345001578331, "epoch": 8.258839226150767, "step": 24760 }, { "distill_loss": 0.158750981092453, "epoch": 8.258839226150767, "step": 24760 }, { "epoch": 8.258839226150767, "ref_ce_loss": 0.06273841112852097, "step": 24760 }, { "epoch": 8.262174783188792, "loss": 0.4228, "step": 24770 }, { "epoch": 8.262174783188792, "grad_norm": 1.2682881355285645, "step": 24770 }, { "epoch": 8.262174783188792, "learning_rate": 6.17050281833274e-05, "step": 24770 }, { "epoch": 8.262174783188792, "loss": 0.4441709816455841, "step": 24770 }, { "ce_loss": 0.058845460414886475, "epoch": 8.262174783188792, "step": 24770 }, { "distill_loss": 0.2046021968126297, "epoch": 8.262174783188792, "step": 24770 }, { "epoch": 8.262174783188792, "ref_ce_loss": 0.09121354669332504, "step": 24770 }, { "epoch": 8.262174783188792, "loss": 0.39193737506866455, "step": 24770 }, { "ce_loss": 0.06523241847753525, "epoch": 8.262174783188792, "step": 24770 }, { "distill_loss": 0.20892859995365143, "epoch": 8.262174783188792, "step": 24770 }, { "epoch": 8.262174783188792, "ref_ce_loss": 0.09341893345117569, "step": 24770 }, { "epoch": 8.265510340226818, "loss": 0.3767, "step": 24780 }, { "epoch": 8.265510340226818, "grad_norm": 1.4384493827819824, "step": 24780 }, { "epoch": 8.265510340226818, "learning_rate": 6.147464082671213e-05, "step": 24780 }, { "epoch": 8.265510340226818, "loss": 0.2718307375907898, "step": 24780 }, { "ce_loss": 0.04708772897720337, "epoch": 8.265510340226818, "step": 24780 }, { "distill_loss": 0.17009615898132324, "epoch": 8.265510340226818, "step": 24780 }, { "epoch": 8.265510340226818, "ref_ce_loss": 0.054384294897317886, "step": 24780 }, { "epoch": 8.265510340226818, "loss": 0.529242992401123, "step": 24780 }, { "ce_loss": 0.0579858236014843, "epoch": 8.265510340226818, "step": 24780 }, { "distill_loss": 0.19909480214118958, "epoch": 8.265510340226818, "step": 24780 }, { "epoch": 8.265510340226818, "ref_ce_loss": 0.08572480082511902, "step": 24780 }, { "epoch": 8.268845897264843, "loss": 0.3977, "step": 24790 }, { "epoch": 8.268845897264843, "grad_norm": 0.9916096925735474, "step": 24790 }, { "epoch": 8.268845897264843, "learning_rate": 6.124464856561774e-05, "step": 24790 }, { "epoch": 8.268845897264843, "loss": 0.4032076299190521, "step": 24790 }, { "ce_loss": 0.09457050263881683, "epoch": 8.268845897264843, "step": 24790 }, { "distill_loss": 0.21368515491485596, "epoch": 8.268845897264843, "step": 24790 }, { "epoch": 8.268845897264843, "ref_ce_loss": 0.07465735077857971, "step": 24790 }, { "epoch": 8.268845897264843, "loss": 0.3135150671005249, "step": 24790 }, { "ce_loss": 0.045039501041173935, "epoch": 8.268845897264843, "step": 24790 }, { "distill_loss": 0.17188379168510437, "epoch": 8.268845897264843, "step": 24790 }, { "epoch": 8.268845897264843, "ref_ce_loss": 0.06974980980157852, "step": 24790 }, { "epoch": 8.272181454302869, "loss": 0.4001, "step": 24800 }, { "epoch": 8.272181454302869, "grad_norm": 1.6688566207885742, "step": 24800 }, { "epoch": 8.272181454302869, "learning_rate": 6.101505166847008e-05, "step": 24800 }, { "epoch": 8.272181454302869, "loss": 0.32536888122558594, "step": 24800 }, { "ce_loss": 0.06018342077732086, "epoch": 8.272181454302869, "step": 24800 }, { "distill_loss": 0.18512201309204102, "epoch": 8.272181454302869, "step": 24800 }, { "epoch": 8.272181454302869, "ref_ce_loss": 0.07981298118829727, "step": 24800 }, { "epoch": 8.272181454302869, "loss": 0.24707356095314026, "step": 24800 }, { "ce_loss": 0.03561161831021309, "epoch": 8.272181454302869, "step": 24800 }, { "distill_loss": 0.12160885334014893, "epoch": 8.272181454302869, "step": 24800 }, { "epoch": 8.272181454302869, "ref_ce_loss": 0.06553605198860168, "step": 24800 }, { "epoch": 8.275517011340893, "loss": 0.3612, "step": 24810 }, { "epoch": 8.275517011340893, "grad_norm": 0.9926279187202454, "step": 24810 }, { "epoch": 8.275517011340893, "learning_rate": 6.078585040323339e-05, "step": 24810 }, { "epoch": 8.275517011340893, "loss": 0.4812214970588684, "step": 24810 }, { "ce_loss": 0.057430241256952286, "epoch": 8.275517011340893, "step": 24810 }, { "distill_loss": 0.22909767925739288, "epoch": 8.275517011340893, "step": 24810 }, { "epoch": 8.275517011340893, "ref_ce_loss": 0.11038453131914139, "step": 24810 }, { "epoch": 8.275517011340893, "loss": 0.28939327597618103, "step": 24810 }, { "ce_loss": 0.04220176488161087, "epoch": 8.275517011340893, "step": 24810 }, { "distill_loss": 0.15939894318580627, "epoch": 8.275517011340893, "step": 24810 }, { "epoch": 8.275517011340893, "ref_ce_loss": 0.05957774072885513, "step": 24810 }, { "epoch": 8.27885256837892, "loss": 0.3925, "step": 24820 }, { "epoch": 8.27885256837892, "grad_norm": 1.9365723133087158, "step": 24820 }, { "epoch": 8.27885256837892, "learning_rate": 6.0557045037410355e-05, "step": 24820 }, { "epoch": 8.27885256837892, "loss": 0.32409194111824036, "step": 24820 }, { "ce_loss": 0.06542002409696579, "epoch": 8.27885256837892, "step": 24820 }, { "distill_loss": 0.17851068079471588, "epoch": 8.27885256837892, "step": 24820 }, { "epoch": 8.27885256837892, "ref_ce_loss": 0.05876639112830162, "step": 24820 }, { "epoch": 8.27885256837892, "loss": 0.43060964345932007, "step": 24820 }, { "ce_loss": 0.11308394372463226, "epoch": 8.27885256837892, "step": 24820 }, { "distill_loss": 0.19883199036121368, "epoch": 8.27885256837892, "step": 24820 }, { "epoch": 8.27885256837892, "ref_ce_loss": 0.08647770434617996, "step": 24820 }, { "epoch": 8.282188125416944, "loss": 0.3985, "step": 24830 }, { "epoch": 8.282188125416944, "grad_norm": 2.2099313735961914, "step": 24830 }, { "epoch": 8.282188125416944, "learning_rate": 6.0328635838041224e-05, "step": 24830 }, { "epoch": 8.282188125416944, "loss": 0.3571459650993347, "step": 24830 }, { "ce_loss": 0.0637267604470253, "epoch": 8.282188125416944, "step": 24830 }, { "distill_loss": 0.16139563918113708, "epoch": 8.282188125416944, "step": 24830 }, { "epoch": 8.282188125416944, "ref_ce_loss": 0.07349435240030289, "step": 24830 }, { "epoch": 8.282188125416944, "loss": 0.36667025089263916, "step": 24830 }, { "ce_loss": 0.04978648200631142, "epoch": 8.282188125416944, "step": 24830 }, { "distill_loss": 0.19708067178726196, "epoch": 8.282188125416944, "step": 24830 }, { "epoch": 8.282188125416944, "ref_ce_loss": 0.06627427786588669, "step": 24830 }, { "epoch": 8.28552368245497, "loss": 0.4546, "step": 24840 }, { "epoch": 8.28552368245497, "grad_norm": 1.1406466960906982, "step": 24840 }, { "epoch": 8.28552368245497, "learning_rate": 6.0100623071704186e-05, "step": 24840 }, { "epoch": 8.28552368245497, "loss": 0.551635205745697, "step": 24840 }, { "ce_loss": 0.06963831186294556, "epoch": 8.28552368245497, "step": 24840 }, { "distill_loss": 0.23795580863952637, "epoch": 8.28552368245497, "step": 24840 }, { "epoch": 8.28552368245497, "ref_ce_loss": 0.0977717936038971, "step": 24840 }, { "epoch": 8.28552368245497, "loss": 0.36534714698791504, "step": 24840 }, { "ce_loss": 0.06684930622577667, "epoch": 8.28552368245497, "step": 24840 }, { "distill_loss": 0.17508380115032196, "epoch": 8.28552368245497, "step": 24840 }, { "epoch": 8.28552368245497, "ref_ce_loss": 0.08289922773838043, "step": 24840 }, { "epoch": 8.288859239492995, "loss": 0.3985, "step": 24850 }, { "epoch": 8.288859239492995, "grad_norm": 0.8758309483528137, "step": 24850 }, { "epoch": 8.288859239492995, "learning_rate": 5.987300700451477e-05, "step": 24850 }, { "epoch": 8.288859239492995, "loss": 0.2846289277076721, "step": 24850 }, { "ce_loss": 0.04146531596779823, "epoch": 8.288859239492995, "step": 24850 }, { "distill_loss": 0.18570080399513245, "epoch": 8.288859239492995, "step": 24850 }, { "epoch": 8.288859239492995, "ref_ce_loss": 0.05710703507065773, "step": 24850 }, { "epoch": 8.288859239492995, "loss": 0.3122004568576813, "step": 24850 }, { "ce_loss": 0.022249093279242516, "epoch": 8.288859239492995, "step": 24850 }, { "distill_loss": 0.1364220380783081, "epoch": 8.288859239492995, "step": 24850 }, { "epoch": 8.288859239492995, "ref_ce_loss": 0.050004757940769196, "step": 24850 }, { "epoch": 8.292194796531021, "loss": 0.3917, "step": 24860 }, { "epoch": 8.292194796531021, "grad_norm": 1.4198230504989624, "step": 24860 }, { "epoch": 8.292194796531021, "learning_rate": 5.9645787902125314e-05, "step": 24860 }, { "epoch": 8.292194796531021, "loss": 0.4237378239631653, "step": 24860 }, { "ce_loss": 0.053649380803108215, "epoch": 8.292194796531021, "step": 24860 }, { "distill_loss": 0.2529350519180298, "epoch": 8.292194796531021, "step": 24860 }, { "epoch": 8.292194796531021, "ref_ce_loss": 0.07835126668214798, "step": 24860 }, { "epoch": 8.292194796531021, "loss": 0.4145413637161255, "step": 24860 }, { "ce_loss": 0.0244712233543396, "epoch": 8.292194796531021, "step": 24860 }, { "distill_loss": 0.16361355781555176, "epoch": 8.292194796531021, "step": 24860 }, { "epoch": 8.292194796531021, "ref_ce_loss": 0.060614414513111115, "step": 24860 }, { "epoch": 8.295530353569045, "loss": 0.3863, "step": 24870 }, { "epoch": 8.295530353569045, "grad_norm": 1.1518481969833374, "step": 24870 }, { "epoch": 8.295530353569045, "learning_rate": 5.941896602972503e-05, "step": 24870 }, { "epoch": 8.295530353569045, "loss": 0.30724936723709106, "step": 24870 }, { "ce_loss": 0.030511852353811264, "epoch": 8.295530353569045, "step": 24870 }, { "distill_loss": 0.1536020040512085, "epoch": 8.295530353569045, "step": 24870 }, { "epoch": 8.295530353569045, "ref_ce_loss": 0.06268143653869629, "step": 24870 }, { "epoch": 8.295530353569045, "loss": 0.404881089925766, "step": 24870 }, { "ce_loss": 0.07836263626813889, "epoch": 8.295530353569045, "step": 24870 }, { "distill_loss": 0.23570379614830017, "epoch": 8.295530353569045, "step": 24870 }, { "epoch": 8.295530353569045, "ref_ce_loss": 0.06730661541223526, "step": 24870 }, { "epoch": 8.298865910607072, "loss": 0.412, "step": 24880 }, { "epoch": 8.298865910607072, "grad_norm": 0.9654243588447571, "step": 24880 }, { "epoch": 8.298865910607072, "learning_rate": 5.91925416520394e-05, "step": 24880 }, { "epoch": 8.298865910607072, "loss": 0.40532171726226807, "step": 24880 }, { "ce_loss": 0.06659356504678726, "epoch": 8.298865910607072, "step": 24880 }, { "distill_loss": 0.23507378995418549, "epoch": 8.298865910607072, "step": 24880 }, { "epoch": 8.298865910607072, "ref_ce_loss": 0.10342125594615936, "step": 24880 }, { "epoch": 8.298865910607072, "loss": 0.5091399550437927, "step": 24880 }, { "ce_loss": 0.07355561852455139, "epoch": 8.298865910607072, "step": 24880 }, { "distill_loss": 0.22285902500152588, "epoch": 8.298865910607072, "step": 24880 }, { "epoch": 8.298865910607072, "ref_ce_loss": 0.08281935751438141, "step": 24880 }, { "epoch": 8.302201467645096, "loss": 0.4575, "step": 24890 }, { "epoch": 8.302201467645096, "grad_norm": 1.3964868783950806, "step": 24890 }, { "epoch": 8.302201467645096, "learning_rate": 5.89665150333301e-05, "step": 24890 }, { "epoch": 8.302201467645096, "loss": 0.7368953227996826, "step": 24890 }, { "ce_loss": 0.08275753259658813, "epoch": 8.302201467645096, "step": 24890 }, { "distill_loss": 0.2730520963668823, "epoch": 8.302201467645096, "step": 24890 }, { "epoch": 8.302201467645096, "ref_ce_loss": 0.0967688113451004, "step": 24890 }, { "epoch": 8.302201467645096, "loss": 0.3053659200668335, "step": 24890 }, { "ce_loss": 0.053972817957401276, "epoch": 8.302201467645096, "step": 24890 }, { "distill_loss": 0.16566911339759827, "epoch": 8.302201467645096, "step": 24890 }, { "epoch": 8.302201467645096, "ref_ce_loss": 0.06530603021383286, "step": 24890 }, { "epoch": 8.305537024683122, "loss": 0.4027, "step": 24900 }, { "epoch": 8.305537024683122, "grad_norm": 1.1332793235778809, "step": 24900 }, { "epoch": 8.305537024683122, "learning_rate": 5.8740886437394526e-05, "step": 24900 }, { "epoch": 8.305537024683122, "loss": 0.5145341157913208, "step": 24900 }, { "ce_loss": 0.06977076828479767, "epoch": 8.305537024683122, "step": 24900 }, { "distill_loss": 0.20493197441101074, "epoch": 8.305537024683122, "step": 24900 }, { "epoch": 8.305537024683122, "ref_ce_loss": 0.08403408527374268, "step": 24900 }, { "epoch": 8.305537024683122, "loss": 0.42330917716026306, "step": 24900 }, { "ce_loss": 0.09833862632513046, "epoch": 8.305537024683122, "step": 24900 }, { "distill_loss": 0.2172425538301468, "epoch": 8.305537024683122, "step": 24900 }, { "epoch": 8.305537024683122, "ref_ce_loss": 0.07472098618745804, "step": 24900 }, { "epoch": 8.308872581721147, "loss": 0.3979, "step": 24910 }, { "epoch": 8.308872581721147, "grad_norm": 1.4151992797851562, "step": 24910 }, { "epoch": 8.308872581721147, "learning_rate": 5.8515656127565445e-05, "step": 24910 }, { "epoch": 8.308872581721147, "loss": 0.408204585313797, "step": 24910 }, { "ce_loss": 0.07390382885932922, "epoch": 8.308872581721147, "step": 24910 }, { "distill_loss": 0.2057056427001953, "epoch": 8.308872581721147, "step": 24910 }, { "epoch": 8.308872581721147, "ref_ce_loss": 0.08743388950824738, "step": 24910 }, { "epoch": 8.308872581721147, "loss": 0.35966911911964417, "step": 24910 }, { "ce_loss": 0.05391908437013626, "epoch": 8.308872581721147, "step": 24910 }, { "distill_loss": 0.23523716628551483, "epoch": 8.308872581721147, "step": 24910 }, { "epoch": 8.308872581721147, "ref_ce_loss": 0.047440554946660995, "step": 24910 }, { "epoch": 8.312208138759173, "loss": 0.4019, "step": 24920 }, { "epoch": 8.312208138759173, "grad_norm": 1.5864213705062866, "step": 24920 }, { "epoch": 8.312208138759173, "learning_rate": 5.829082436671085e-05, "step": 24920 }, { "epoch": 8.312208138759173, "loss": 0.3313003480434418, "step": 24920 }, { "ce_loss": 0.027458081021904945, "epoch": 8.312208138759173, "step": 24920 }, { "distill_loss": 0.1620972454547882, "epoch": 8.312208138759173, "step": 24920 }, { "epoch": 8.312208138759173, "ref_ce_loss": 0.052886199206113815, "step": 24920 }, { "epoch": 8.312208138759173, "loss": 0.4817555844783783, "step": 24920 }, { "ce_loss": 0.06159020587801933, "epoch": 8.312208138759173, "step": 24920 }, { "distill_loss": 0.18925221264362335, "epoch": 8.312208138759173, "step": 24920 }, { "epoch": 8.312208138759173, "ref_ce_loss": 0.0889902412891388, "step": 24920 }, { "epoch": 8.315543695797198, "loss": 0.4349, "step": 24930 }, { "epoch": 8.315543695797198, "grad_norm": 0.8452260494232178, "step": 24930 }, { "epoch": 8.315543695797198, "learning_rate": 5.806639141723364e-05, "step": 24930 }, { "epoch": 8.315543695797198, "loss": 0.2972012460231781, "step": 24930 }, { "ce_loss": 0.07060505449771881, "epoch": 8.315543695797198, "step": 24930 }, { "distill_loss": 0.15588009357452393, "epoch": 8.315543695797198, "step": 24930 }, { "epoch": 8.315543695797198, "ref_ce_loss": 0.07062255591154099, "step": 24930 }, { "epoch": 8.315543695797198, "loss": 0.29503345489501953, "step": 24930 }, { "ce_loss": 0.03660615533590317, "epoch": 8.315543695797198, "step": 24930 }, { "distill_loss": 0.1910068839788437, "epoch": 8.315543695797198, "step": 24930 }, { "epoch": 8.315543695797198, "ref_ce_loss": 0.052066173404455185, "step": 24930 }, { "epoch": 8.318879252835224, "loss": 0.4054, "step": 24940 }, { "epoch": 8.318879252835224, "grad_norm": 1.2340115308761597, "step": 24940 }, { "epoch": 8.318879252835224, "learning_rate": 5.784235754107137e-05, "step": 24940 }, { "epoch": 8.318879252835224, "loss": 0.3457760810852051, "step": 24940 }, { "ce_loss": 0.031064603477716446, "epoch": 8.318879252835224, "step": 24940 }, { "distill_loss": 0.15679000318050385, "epoch": 8.318879252835224, "step": 24940 }, { "epoch": 8.318879252835224, "ref_ce_loss": 0.06945052742958069, "step": 24940 }, { "epoch": 8.318879252835224, "loss": 0.3431711792945862, "step": 24940 }, { "ce_loss": 0.03292974457144737, "epoch": 8.318879252835224, "step": 24940 }, { "distill_loss": 0.1323884278535843, "epoch": 8.318879252835224, "step": 24940 }, { "epoch": 8.318879252835224, "ref_ce_loss": 0.06366308033466339, "step": 24940 }, { "epoch": 8.322214809873248, "loss": 0.3784, "step": 24950 }, { "epoch": 8.322214809873248, "grad_norm": 3.7198262214660645, "step": 24950 }, { "epoch": 8.322214809873248, "learning_rate": 5.7618722999695394e-05, "step": 24950 }, { "epoch": 8.322214809873248, "loss": 0.38841986656188965, "step": 24950 }, { "ce_loss": 0.08898656815290451, "epoch": 8.322214809873248, "step": 24950 }, { "distill_loss": 0.19516724348068237, "epoch": 8.322214809873248, "step": 24950 }, { "epoch": 8.322214809873248, "ref_ce_loss": 0.07287117093801498, "step": 24950 }, { "epoch": 8.322214809873248, "loss": 0.35441359877586365, "step": 24950 }, { "ce_loss": 0.05294131487607956, "epoch": 8.322214809873248, "step": 24950 }, { "distill_loss": 0.2164778709411621, "epoch": 8.322214809873248, "step": 24950 }, { "epoch": 8.322214809873248, "ref_ce_loss": 0.06361507624387741, "step": 24950 }, { "epoch": 8.325550366911274, "loss": 0.411, "step": 24960 }, { "epoch": 8.325550366911274, "grad_norm": 1.1434744596481323, "step": 24960 }, { "epoch": 8.325550366911274, "learning_rate": 5.7395488054111436e-05, "step": 24960 }, { "epoch": 8.325550366911274, "loss": 0.2941511571407318, "step": 24960 }, { "ce_loss": 0.052058592438697815, "epoch": 8.325550366911274, "step": 24960 }, { "distill_loss": 0.17279891669750214, "epoch": 8.325550366911274, "step": 24960 }, { "epoch": 8.325550366911274, "ref_ce_loss": 0.06911136209964752, "step": 24960 }, { "epoch": 8.325550366911274, "loss": 0.30970582365989685, "step": 24960 }, { "ce_loss": 0.04044627025723457, "epoch": 8.325550366911274, "step": 24960 }, { "distill_loss": 0.17380879819393158, "epoch": 8.325550366911274, "step": 24960 }, { "epoch": 8.325550366911274, "ref_ce_loss": 0.0723646730184555, "step": 24960 }, { "epoch": 8.328885923949299, "loss": 0.3814, "step": 24970 }, { "epoch": 8.328885923949299, "grad_norm": 4.01048469543457, "step": 24970 }, { "epoch": 8.328885923949299, "learning_rate": 5.717265296485872e-05, "step": 24970 }, { "epoch": 8.328885923949299, "loss": 0.4379655122756958, "step": 24970 }, { "ce_loss": 0.053386520594358444, "epoch": 8.328885923949299, "step": 24970 }, { "distill_loss": 0.1718374490737915, "epoch": 8.328885923949299, "step": 24970 }, { "epoch": 8.328885923949299, "ref_ce_loss": 0.06681462377309799, "step": 24970 }, { "epoch": 8.328885923949299, "loss": 0.45380282402038574, "step": 24970 }, { "ce_loss": 0.08911381661891937, "epoch": 8.328885923949299, "step": 24970 }, { "distill_loss": 0.23314177989959717, "epoch": 8.328885923949299, "step": 24970 }, { "epoch": 8.328885923949299, "ref_ce_loss": 0.06985948234796524, "step": 24970 }, { "epoch": 8.332221480987325, "loss": 0.3751, "step": 24980 }, { "epoch": 8.332221480987325, "grad_norm": 1.6863759756088257, "step": 24980 }, { "epoch": 8.332221480987325, "learning_rate": 5.69502179920097e-05, "step": 24980 }, { "epoch": 8.332221480987325, "loss": 0.32353562116622925, "step": 24980 }, { "ce_loss": 0.06093441694974899, "epoch": 8.332221480987325, "step": 24980 }, { "distill_loss": 0.15882942080497742, "epoch": 8.332221480987325, "step": 24980 }, { "epoch": 8.332221480987325, "ref_ce_loss": 0.07053432613611221, "step": 24980 }, { "epoch": 8.332221480987325, "loss": 0.4023553431034088, "step": 24980 }, { "ce_loss": 0.08936289697885513, "epoch": 8.332221480987325, "step": 24980 }, { "distill_loss": 0.20839953422546387, "epoch": 8.332221480987325, "step": 24980 }, { "epoch": 8.332221480987325, "ref_ce_loss": 0.07393033057451248, "step": 24980 }, { "epoch": 8.33555703802535, "loss": 0.4224, "step": 24990 }, { "epoch": 8.33555703802535, "grad_norm": 0.9262280464172363, "step": 24990 }, { "epoch": 8.33555703802535, "learning_rate": 5.672818339517001e-05, "step": 24990 }, { "epoch": 8.33555703802535, "loss": 0.3177008032798767, "step": 24990 }, { "ce_loss": 0.06413191556930542, "epoch": 8.33555703802535, "step": 24990 }, { "distill_loss": 0.15126436948776245, "epoch": 8.33555703802535, "step": 24990 }, { "epoch": 8.33555703802535, "ref_ce_loss": 0.10207774490118027, "step": 24990 }, { "epoch": 8.33555703802535, "loss": 0.3172096312046051, "step": 24990 }, { "ce_loss": 0.04232507944107056, "epoch": 8.33555703802535, "step": 24990 }, { "distill_loss": 0.19980588555335999, "epoch": 8.33555703802535, "step": 24990 }, { "epoch": 8.33555703802535, "ref_ce_loss": 0.07482916116714478, "step": 24990 }, { "epoch": 8.338892595063376, "loss": 0.4413, "step": 25000 }, { "epoch": 8.338892595063376, "grad_norm": 1.1610908508300781, "step": 25000 }, { "epoch": 8.338892595063376, "learning_rate": 5.6506549433477865e-05, "step": 25000 }, { "epoch": 8.338892595063376, "loss": 1.097810983657837, "step": 25000 }, { "ce_loss": 0.05602821335196495, "epoch": 8.338892595063376, "step": 25000 }, { "distill_loss": 0.18375152349472046, "epoch": 8.338892595063376, "step": 25000 }, { "epoch": 8.338892595063376, "ref_ce_loss": 0.05691630393266678, "step": 25000 }, { "epoch": 8.338892595063376, "loss": 0.3631144165992737, "step": 25000 }, { "ce_loss": 0.06245829910039902, "epoch": 8.338892595063376, "step": 25000 }, { "distill_loss": 0.19284473359584808, "epoch": 8.338892595063376, "step": 25000 }, { "epoch": 8.338892595063376, "ref_ce_loss": 0.07752401381731033, "step": 25000 }, { "epoch": 8.3422281521014, "loss": 0.4367, "step": 25010 }, { "epoch": 8.3422281521014, "grad_norm": 1.536965012550354, "step": 25010 }, { "epoch": 8.3422281521014, "learning_rate": 5.6285316365604037e-05, "step": 25010 }, { "epoch": 8.3422281521014, "loss": 0.44820597767829895, "step": 25010 }, { "ce_loss": 0.07226640731096268, "epoch": 8.3422281521014, "step": 25010 }, { "distill_loss": 0.20867325365543365, "epoch": 8.3422281521014, "step": 25010 }, { "epoch": 8.3422281521014, "ref_ce_loss": 0.06741487234830856, "step": 25010 }, { "epoch": 8.3422281521014, "loss": 0.3269725739955902, "step": 25010 }, { "ce_loss": 0.062072884291410446, "epoch": 8.3422281521014, "step": 25010 }, { "distill_loss": 0.17925648391246796, "epoch": 8.3422281521014, "step": 25010 }, { "epoch": 8.3422281521014, "ref_ce_loss": 0.08505354076623917, "step": 25010 }, { "epoch": 8.345563709139427, "loss": 0.3696, "step": 25020 }, { "epoch": 8.345563709139427, "grad_norm": 0.9757553935050964, "step": 25020 }, { "epoch": 8.345563709139427, "learning_rate": 5.6064484449751145e-05, "step": 25020 }, { "epoch": 8.345563709139427, "loss": 0.5743061304092407, "step": 25020 }, { "ce_loss": 0.028917063027620316, "epoch": 8.345563709139427, "step": 25020 }, { "distill_loss": 0.19040493667125702, "epoch": 8.345563709139427, "step": 25020 }, { "epoch": 8.345563709139427, "ref_ce_loss": 0.072723887860775, "step": 25020 }, { "epoch": 8.345563709139427, "loss": 0.33696094155311584, "step": 25020 }, { "ce_loss": 0.08223090320825577, "epoch": 8.345563709139427, "step": 25020 }, { "distill_loss": 0.18200477957725525, "epoch": 8.345563709139427, "step": 25020 }, { "epoch": 8.345563709139427, "ref_ce_loss": 0.07257626950740814, "step": 25020 }, { "epoch": 8.348899266177451, "loss": 0.377, "step": 25030 }, { "epoch": 8.348899266177451, "grad_norm": 1.1528825759887695, "step": 25030 }, { "epoch": 8.348899266177451, "learning_rate": 5.584405394365391e-05, "step": 25030 }, { "epoch": 8.348899266177451, "loss": 0.35527855157852173, "step": 25030 }, { "ce_loss": 0.07070120424032211, "epoch": 8.348899266177451, "step": 25030 }, { "distill_loss": 0.1744517683982849, "epoch": 8.348899266177451, "step": 25030 }, { "epoch": 8.348899266177451, "ref_ce_loss": 0.05900336802005768, "step": 25030 }, { "epoch": 8.348899266177451, "loss": 0.40178489685058594, "step": 25030 }, { "ce_loss": 0.05492159724235535, "epoch": 8.348899266177451, "step": 25030 }, { "distill_loss": 0.21854011714458466, "epoch": 8.348899266177451, "step": 25030 }, { "epoch": 8.348899266177451, "ref_ce_loss": 0.09786759316921234, "step": 25030 }, { "epoch": 8.352234823215477, "loss": 0.4241, "step": 25040 }, { "epoch": 8.352234823215477, "grad_norm": 2.2214083671569824, "step": 25040 }, { "epoch": 8.352234823215477, "learning_rate": 5.5624025104578404e-05, "step": 25040 }, { "epoch": 8.352234823215477, "loss": 0.3765566349029541, "step": 25040 }, { "ce_loss": 0.05312661454081535, "epoch": 8.352234823215477, "step": 25040 }, { "distill_loss": 0.2064882069826126, "epoch": 8.352234823215477, "step": 25040 }, { "epoch": 8.352234823215477, "ref_ce_loss": 0.09187427163124084, "step": 25040 }, { "epoch": 8.352234823215477, "loss": 0.4223754107952118, "step": 25040 }, { "ce_loss": 0.10373418778181076, "epoch": 8.352234823215477, "step": 25040 }, { "distill_loss": 0.20074056088924408, "epoch": 8.352234823215477, "step": 25040 }, { "epoch": 8.352234823215477, "ref_ce_loss": 0.09522716701030731, "step": 25040 }, { "epoch": 8.355570380253502, "loss": 0.4533, "step": 25050 }, { "epoch": 8.355570380253502, "grad_norm": 1.3162434101104736, "step": 25050 }, { "epoch": 8.355570380253502, "learning_rate": 5.540439818932202e-05, "step": 25050 }, { "epoch": 8.355570380253502, "loss": 0.5700833797454834, "step": 25050 }, { "ce_loss": 0.07677993178367615, "epoch": 8.355570380253502, "step": 25050 }, { "distill_loss": 0.1616164743900299, "epoch": 8.355570380253502, "step": 25050 }, { "epoch": 8.355570380253502, "ref_ce_loss": 0.09277770668268204, "step": 25050 }, { "epoch": 8.355570380253502, "loss": 0.3516484797000885, "step": 25050 }, { "ce_loss": 0.04538606479763985, "epoch": 8.355570380253502, "step": 25050 }, { "distill_loss": 0.18248510360717773, "epoch": 8.355570380253502, "step": 25050 }, { "epoch": 8.355570380253502, "ref_ce_loss": 0.06455279886722565, "step": 25050 }, { "epoch": 8.358905937291528, "loss": 0.4638, "step": 25060 }, { "epoch": 8.358905937291528, "grad_norm": 1.3298295736312866, "step": 25060 }, { "epoch": 8.358905937291528, "learning_rate": 5.518517345421304e-05, "step": 25060 }, { "epoch": 8.358905937291528, "loss": 0.4533049464225769, "step": 25060 }, { "ce_loss": 0.08425446599721909, "epoch": 8.358905937291528, "step": 25060 }, { "distill_loss": 0.2121516764163971, "epoch": 8.358905937291528, "step": 25060 }, { "epoch": 8.358905937291528, "ref_ce_loss": 0.08103732019662857, "step": 25060 }, { "epoch": 8.358905937291528, "loss": 0.3355601131916046, "step": 25060 }, { "ce_loss": 0.07693885266780853, "epoch": 8.358905937291528, "step": 25060 }, { "distill_loss": 0.15625596046447754, "epoch": 8.358905937291528, "step": 25060 }, { "epoch": 8.358905937291528, "ref_ce_loss": 0.07718048989772797, "step": 25060 }, { "epoch": 8.362241494329552, "loss": 0.3917, "step": 25070 }, { "epoch": 8.362241494329552, "grad_norm": 1.6331768035888672, "step": 25070 }, { "epoch": 8.362241494329552, "learning_rate": 5.496635115511017e-05, "step": 25070 }, { "epoch": 8.362241494329552, "loss": 0.4228060841560364, "step": 25070 }, { "ce_loss": 0.04212617501616478, "epoch": 8.362241494329552, "step": 25070 }, { "distill_loss": 0.18509574234485626, "epoch": 8.362241494329552, "step": 25070 }, { "epoch": 8.362241494329552, "ref_ce_loss": 0.07182523608207703, "step": 25070 }, { "epoch": 8.362241494329552, "loss": 0.39445173740386963, "step": 25070 }, { "ce_loss": 0.07941533625125885, "epoch": 8.362241494329552, "step": 25070 }, { "distill_loss": 0.18916261196136475, "epoch": 8.362241494329552, "step": 25070 }, { "epoch": 8.362241494329552, "ref_ce_loss": 0.07356342673301697, "step": 25070 }, { "epoch": 8.365577051367579, "loss": 0.3803, "step": 25080 }, { "epoch": 8.365577051367579, "grad_norm": 1.9852405786514282, "step": 25080 }, { "epoch": 8.365577051367579, "learning_rate": 5.474793154740257e-05, "step": 25080 }, { "epoch": 8.365577051367579, "loss": 0.30684754252433777, "step": 25080 }, { "ce_loss": 0.04548907279968262, "epoch": 8.365577051367579, "step": 25080 }, { "distill_loss": 0.1826765537261963, "epoch": 8.365577051367579, "step": 25080 }, { "epoch": 8.365577051367579, "ref_ce_loss": 0.0782376378774643, "step": 25080 }, { "epoch": 8.365577051367579, "loss": 0.5661678314208984, "step": 25080 }, { "ce_loss": 0.07862786203622818, "epoch": 8.365577051367579, "step": 25080 }, { "distill_loss": 0.2614433169364929, "epoch": 8.365577051367579, "step": 25080 }, { "epoch": 8.365577051367579, "ref_ce_loss": 0.07493744790554047, "step": 25080 }, { "epoch": 8.368912608405603, "loss": 0.3811, "step": 25090 }, { "epoch": 8.368912608405603, "grad_norm": 0.8753249645233154, "step": 25090 }, { "epoch": 8.368912608405603, "learning_rate": 5.452991488600967e-05, "step": 25090 }, { "epoch": 8.368912608405603, "loss": 0.40100419521331787, "step": 25090 }, { "ce_loss": 0.08868025243282318, "epoch": 8.368912608405603, "step": 25090 }, { "distill_loss": 0.21525098383426666, "epoch": 8.368912608405603, "step": 25090 }, { "epoch": 8.368912608405603, "ref_ce_loss": 0.07010910660028458, "step": 25090 }, { "epoch": 8.368912608405603, "loss": 0.30055487155914307, "step": 25090 }, { "ce_loss": 0.03426504135131836, "epoch": 8.368912608405603, "step": 25090 }, { "distill_loss": 0.16961617767810822, "epoch": 8.368912608405603, "step": 25090 }, { "epoch": 8.368912608405603, "ref_ce_loss": 0.07461228221654892, "step": 25090 }, { "epoch": 8.37224816544363, "loss": 0.3997, "step": 25100 }, { "epoch": 8.37224816544363, "grad_norm": 1.5080701112747192, "step": 25100 }, { "epoch": 8.37224816544363, "learning_rate": 5.431230142538018e-05, "step": 25100 }, { "epoch": 8.37224816544363, "loss": 0.3572412431240082, "step": 25100 }, { "ce_loss": 0.04778499901294708, "epoch": 8.37224816544363, "step": 25100 }, { "distill_loss": 0.18236097693443298, "epoch": 8.37224816544363, "step": 25100 }, { "epoch": 8.37224816544363, "ref_ce_loss": 0.09740842133760452, "step": 25100 }, { "epoch": 8.37224816544363, "loss": 0.334131121635437, "step": 25100 }, { "ce_loss": 0.04791150614619255, "epoch": 8.37224816544363, "step": 25100 }, { "distill_loss": 0.16604946553707123, "epoch": 8.37224816544363, "step": 25100 }, { "epoch": 8.37224816544363, "ref_ce_loss": 0.055136483162641525, "step": 25100 }, { "epoch": 8.375583722481654, "loss": 0.3765, "step": 25110 }, { "epoch": 8.375583722481654, "grad_norm": 1.1067456007003784, "step": 25110 }, { "epoch": 8.375583722481654, "learning_rate": 5.409509141949243e-05, "step": 25110 }, { "epoch": 8.375583722481654, "loss": 0.3909205496311188, "step": 25110 }, { "ce_loss": 0.07044972479343414, "epoch": 8.375583722481654, "step": 25110 }, { "distill_loss": 0.16520169377326965, "epoch": 8.375583722481654, "step": 25110 }, { "epoch": 8.375583722481654, "ref_ce_loss": 0.09196219593286514, "step": 25110 }, { "epoch": 8.375583722481654, "loss": 0.4352046549320221, "step": 25110 }, { "ce_loss": 0.08850784599781036, "epoch": 8.375583722481654, "step": 25110 }, { "distill_loss": 0.24338781833648682, "epoch": 8.375583722481654, "step": 25110 }, { "epoch": 8.375583722481654, "ref_ce_loss": 0.0809473991394043, "step": 25110 }, { "epoch": 8.37891927951968, "loss": 0.375, "step": 25120 }, { "epoch": 8.37891927951968, "grad_norm": 2.305718421936035, "step": 25120 }, { "epoch": 8.37891927951968, "learning_rate": 5.387828512185387e-05, "step": 25120 }, { "epoch": 8.37891927951968, "loss": 0.3594679832458496, "step": 25120 }, { "ce_loss": 0.058741405606269836, "epoch": 8.37891927951968, "step": 25120 }, { "distill_loss": 0.17034927010536194, "epoch": 8.37891927951968, "step": 25120 }, { "epoch": 8.37891927951968, "ref_ce_loss": 0.09841496497392654, "step": 25120 }, { "epoch": 8.37891927951968, "loss": 0.6309322118759155, "step": 25120 }, { "ce_loss": 0.0625331699848175, "epoch": 8.37891927951968, "step": 25120 }, { "distill_loss": 0.22987818717956543, "epoch": 8.37891927951968, "step": 25120 }, { "epoch": 8.37891927951968, "ref_ce_loss": 0.07687834650278091, "step": 25120 }, { "epoch": 8.382254836557705, "loss": 0.3689, "step": 25130 }, { "epoch": 8.382254836557705, "grad_norm": 1.0419330596923828, "step": 25130 }, { "epoch": 8.382254836557705, "learning_rate": 5.366188278550093e-05, "step": 25130 }, { "epoch": 8.382254836557705, "loss": 0.5490450263023376, "step": 25130 }, { "ce_loss": 0.09026575088500977, "epoch": 8.382254836557705, "step": 25130 }, { "distill_loss": 0.27481669187545776, "epoch": 8.382254836557705, "step": 25130 }, { "epoch": 8.382254836557705, "ref_ce_loss": 0.11525463312864304, "step": 25130 }, { "epoch": 8.382254836557705, "loss": 0.42957377433776855, "step": 25130 }, { "ce_loss": 0.07263614237308502, "epoch": 8.382254836557705, "step": 25130 }, { "distill_loss": 0.21777136623859406, "epoch": 8.382254836557705, "step": 25130 }, { "epoch": 8.382254836557705, "ref_ce_loss": 0.08512672781944275, "step": 25130 }, { "epoch": 8.38559039359573, "loss": 0.4376, "step": 25140 }, { "epoch": 8.38559039359573, "grad_norm": 6.886488914489746, "step": 25140 }, { "epoch": 8.38559039359573, "learning_rate": 5.344588466299825e-05, "step": 25140 }, { "epoch": 8.38559039359573, "loss": 0.3433779776096344, "step": 25140 }, { "ce_loss": 0.05504188314080238, "epoch": 8.38559039359573, "step": 25140 }, { "distill_loss": 0.2048121690750122, "epoch": 8.38559039359573, "step": 25140 }, { "epoch": 8.38559039359573, "ref_ce_loss": 0.058514852076768875, "step": 25140 }, { "epoch": 8.38559039359573, "loss": 0.31423816084861755, "step": 25140 }, { "ce_loss": 0.033430345356464386, "epoch": 8.38559039359573, "step": 25140 }, { "distill_loss": 0.16572749614715576, "epoch": 8.38559039359573, "step": 25140 }, { "epoch": 8.38559039359573, "ref_ce_loss": 0.08138911426067352, "step": 25140 }, { "epoch": 8.388925950633755, "loss": 0.3877, "step": 25150 }, { "epoch": 8.388925950633755, "grad_norm": 1.519091248512268, "step": 25150 }, { "epoch": 8.388925950633755, "learning_rate": 5.323029100643888e-05, "step": 25150 }, { "epoch": 8.388925950633755, "loss": 0.47047603130340576, "step": 25150 }, { "ce_loss": 0.10079821199178696, "epoch": 8.388925950633755, "step": 25150 }, { "distill_loss": 0.2616922855377197, "epoch": 8.388925950633755, "step": 25150 }, { "epoch": 8.388925950633755, "ref_ce_loss": 0.07919727265834808, "step": 25150 }, { "epoch": 8.388925950633755, "loss": 0.37185871601104736, "step": 25150 }, { "ce_loss": 0.03542047366499901, "epoch": 8.388925950633755, "step": 25150 }, { "distill_loss": 0.1679682582616806, "epoch": 8.388925950633755, "step": 25150 }, { "epoch": 8.388925950633755, "ref_ce_loss": 0.05478193610906601, "step": 25150 }, { "epoch": 8.392261507671781, "loss": 0.412, "step": 25160 }, { "epoch": 8.392261507671781, "grad_norm": 1.5826023817062378, "step": 25160 }, { "epoch": 8.392261507671781, "learning_rate": 5.301510206744391e-05, "step": 25160 }, { "epoch": 8.392261507671781, "loss": 0.4558910131454468, "step": 25160 }, { "ce_loss": 0.08379810303449631, "epoch": 8.392261507671781, "step": 25160 }, { "distill_loss": 0.26472505927085876, "epoch": 8.392261507671781, "step": 25160 }, { "epoch": 8.392261507671781, "ref_ce_loss": 0.08704212307929993, "step": 25160 }, { "epoch": 8.392261507671781, "loss": 0.331781804561615, "step": 25160 }, { "ce_loss": 0.05914212018251419, "epoch": 8.392261507671781, "step": 25160 }, { "distill_loss": 0.18458394706249237, "epoch": 8.392261507671781, "step": 25160 }, { "epoch": 8.392261507671781, "ref_ce_loss": 0.0879281759262085, "step": 25160 }, { "epoch": 8.395597064709806, "loss": 0.4502, "step": 25170 }, { "epoch": 8.395597064709806, "grad_norm": 1.1389052867889404, "step": 25170 }, { "epoch": 8.395597064709806, "learning_rate": 5.2800318097161997e-05, "step": 25170 }, { "epoch": 8.395597064709806, "loss": 0.4669540226459503, "step": 25170 }, { "ce_loss": 0.08764485269784927, "epoch": 8.395597064709806, "step": 25170 }, { "distill_loss": 0.2544141709804535, "epoch": 8.395597064709806, "step": 25170 }, { "epoch": 8.395597064709806, "ref_ce_loss": 0.0960792750120163, "step": 25170 }, { "epoch": 8.395597064709806, "loss": 0.5467636585235596, "step": 25170 }, { "ce_loss": 0.0679205060005188, "epoch": 8.395597064709806, "step": 25170 }, { "distill_loss": 0.18445956707000732, "epoch": 8.395597064709806, "step": 25170 }, { "epoch": 8.395597064709806, "ref_ce_loss": 0.10022291541099548, "step": 25170 }, { "epoch": 8.398932621747832, "loss": 0.4217, "step": 25180 }, { "epoch": 8.398932621747832, "grad_norm": 1.0912070274353027, "step": 25180 }, { "epoch": 8.398932621747832, "learning_rate": 5.2585939346269055e-05, "step": 25180 }, { "epoch": 8.398932621747832, "loss": 0.44236764311790466, "step": 25180 }, { "ce_loss": 0.07659192383289337, "epoch": 8.398932621747832, "step": 25180 }, { "distill_loss": 0.27285832166671753, "epoch": 8.398932621747832, "step": 25180 }, { "epoch": 8.398932621747832, "ref_ce_loss": 0.09274059534072876, "step": 25180 }, { "epoch": 8.398932621747832, "loss": 0.3482986092567444, "step": 25180 }, { "ce_loss": 0.041937630623579025, "epoch": 8.398932621747832, "step": 25180 }, { "distill_loss": 0.19385190308094025, "epoch": 8.398932621747832, "step": 25180 }, { "epoch": 8.398932621747832, "ref_ce_loss": 0.07759872823953629, "step": 25180 }, { "epoch": 8.402268178785857, "loss": 0.4, "step": 25190 }, { "epoch": 8.402268178785857, "grad_norm": 1.6916354894638062, "step": 25190 }, { "epoch": 8.402268178785857, "learning_rate": 5.237196606496806e-05, "step": 25190 }, { "epoch": 8.402268178785857, "loss": 0.3225415349006653, "step": 25190 }, { "ce_loss": 0.05989629775285721, "epoch": 8.402268178785857, "step": 25190 }, { "distill_loss": 0.18907451629638672, "epoch": 8.402268178785857, "step": 25190 }, { "epoch": 8.402268178785857, "ref_ce_loss": 0.07331027090549469, "step": 25190 }, { "epoch": 8.402268178785857, "loss": 0.5849019289016724, "step": 25190 }, { "ce_loss": 0.061337001621723175, "epoch": 8.402268178785857, "step": 25190 }, { "distill_loss": 0.302169531583786, "epoch": 8.402268178785857, "step": 25190 }, { "epoch": 8.402268178785857, "ref_ce_loss": 0.09328872710466385, "step": 25190 }, { "epoch": 8.405603735823883, "loss": 0.4324, "step": 25200 }, { "epoch": 8.405603735823883, "grad_norm": 1.3808259963989258, "step": 25200 }, { "epoch": 8.405603735823883, "learning_rate": 5.2158398502989116e-05, "step": 25200 }, { "epoch": 8.405603735823883, "loss": 0.3421984612941742, "step": 25200 }, { "ce_loss": 0.07046947628259659, "epoch": 8.405603735823883, "step": 25200 }, { "distill_loss": 0.17851309478282928, "epoch": 8.405603735823883, "step": 25200 }, { "epoch": 8.405603735823883, "ref_ce_loss": 0.06344214826822281, "step": 25200 }, { "epoch": 8.405603735823883, "loss": 0.236195906996727, "step": 25200 }, { "ce_loss": 0.0421786792576313, "epoch": 8.405603735823883, "step": 25200 }, { "distill_loss": 0.1386895775794983, "epoch": 8.405603735823883, "step": 25200 }, { "epoch": 8.405603735823883, "ref_ce_loss": 0.055049993097782135, "step": 25200 }, { "epoch": 8.408939292861907, "loss": 0.3882, "step": 25210 }, { "epoch": 8.408939292861907, "grad_norm": 0.9948466420173645, "step": 25210 }, { "epoch": 8.408939292861907, "learning_rate": 5.194523690958848e-05, "step": 25210 }, { "epoch": 8.408939292861907, "loss": 0.40159645676612854, "step": 25210 }, { "ce_loss": 0.053242526948451996, "epoch": 8.408939292861907, "step": 25210 }, { "distill_loss": 0.1849006712436676, "epoch": 8.408939292861907, "step": 25210 }, { "epoch": 8.408939292861907, "ref_ce_loss": 0.06335292011499405, "step": 25210 }, { "epoch": 8.408939292861907, "loss": 0.4151378571987152, "step": 25210 }, { "ce_loss": 0.05086888372898102, "epoch": 8.408939292861907, "step": 25210 }, { "distill_loss": 0.17231523990631104, "epoch": 8.408939292861907, "step": 25210 }, { "epoch": 8.408939292861907, "ref_ce_loss": 0.07796960324048996, "step": 25210 }, { "epoch": 8.412274849899934, "loss": 0.3849, "step": 25220 }, { "epoch": 8.412274849899934, "grad_norm": 1.9043771028518677, "step": 25220 }, { "epoch": 8.412274849899934, "learning_rate": 5.173248153354853e-05, "step": 25220 }, { "epoch": 8.412274849899934, "loss": 0.43315085768699646, "step": 25220 }, { "ce_loss": 0.09647293388843536, "epoch": 8.412274849899934, "step": 25220 }, { "distill_loss": 0.2354562133550644, "epoch": 8.412274849899934, "step": 25220 }, { "epoch": 8.412274849899934, "ref_ce_loss": 0.07499422878026962, "step": 25220 }, { "epoch": 8.412274849899934, "loss": 0.32602351903915405, "step": 25220 }, { "ce_loss": 0.029574042186141014, "epoch": 8.412274849899934, "step": 25220 }, { "distill_loss": 0.16238999366760254, "epoch": 8.412274849899934, "step": 25220 }, { "epoch": 8.412274849899934, "ref_ce_loss": 0.05871171876788139, "step": 25220 }, { "epoch": 8.415610406937958, "loss": 0.3866, "step": 25230 }, { "epoch": 8.415610406937958, "grad_norm": 1.129930019378662, "step": 25230 }, { "epoch": 8.415610406937958, "learning_rate": 5.152013262317779e-05, "step": 25230 }, { "epoch": 8.415610406937958, "loss": 0.4822864532470703, "step": 25230 }, { "ce_loss": 0.060090694576501846, "epoch": 8.415610406937958, "step": 25230 }, { "distill_loss": 0.2543797791004181, "epoch": 8.415610406937958, "step": 25230 }, { "epoch": 8.415610406937958, "ref_ce_loss": 0.0755380168557167, "step": 25230 }, { "epoch": 8.415610406937958, "loss": 0.4784907102584839, "step": 25230 }, { "ce_loss": 0.05570368841290474, "epoch": 8.415610406937958, "step": 25230 }, { "distill_loss": 0.23093335330486298, "epoch": 8.415610406937958, "step": 25230 }, { "epoch": 8.415610406937958, "ref_ce_loss": 0.09662224352359772, "step": 25230 }, { "epoch": 8.418945963975984, "loss": 0.3674, "step": 25240 }, { "epoch": 8.418945963975984, "grad_norm": 1.370429515838623, "step": 25240 }, { "epoch": 8.418945963975984, "learning_rate": 5.130819042631023e-05, "step": 25240 }, { "epoch": 8.418945963975984, "loss": 0.3596634864807129, "step": 25240 }, { "ce_loss": 0.044010650366544724, "epoch": 8.418945963975984, "step": 25240 }, { "distill_loss": 0.1983024775981903, "epoch": 8.418945963975984, "step": 25240 }, { "epoch": 8.418945963975984, "ref_ce_loss": 0.08885973691940308, "step": 25240 }, { "epoch": 8.418945963975984, "loss": 0.49388378858566284, "step": 25240 }, { "ce_loss": 0.07588797807693481, "epoch": 8.418945963975984, "step": 25240 }, { "distill_loss": 0.19174428284168243, "epoch": 8.418945963975984, "step": 25240 }, { "epoch": 8.418945963975984, "ref_ce_loss": 0.10179644078016281, "step": 25240 }, { "epoch": 8.422281521014009, "loss": 0.4296, "step": 25250 }, { "epoch": 8.422281521014009, "grad_norm": 1.163691759109497, "step": 25250 }, { "epoch": 8.422281521014009, "learning_rate": 5.109665519030538e-05, "step": 25250 }, { "epoch": 8.422281521014009, "loss": 0.3523298501968384, "step": 25250 }, { "ce_loss": 0.06145976111292839, "epoch": 8.422281521014009, "step": 25250 }, { "distill_loss": 0.1971292793750763, "epoch": 8.422281521014009, "step": 25250 }, { "epoch": 8.422281521014009, "ref_ce_loss": 0.07176455110311508, "step": 25250 }, { "epoch": 8.422281521014009, "loss": 0.2701820433139801, "step": 25250 }, { "ce_loss": 0.02976236678659916, "epoch": 8.422281521014009, "step": 25250 }, { "distill_loss": 0.14251725375652313, "epoch": 8.422281521014009, "step": 25250 }, { "epoch": 8.422281521014009, "ref_ce_loss": 0.06620508432388306, "step": 25250 }, { "epoch": 8.425617078052035, "loss": 0.3815, "step": 25260 }, { "epoch": 8.425617078052035, "grad_norm": 0.999703049659729, "step": 25260 }, { "epoch": 8.425617078052035, "learning_rate": 5.088552716204733e-05, "step": 25260 }, { "epoch": 8.425617078052035, "loss": 0.2414764016866684, "step": 25260 }, { "ce_loss": 0.052632272243499756, "epoch": 8.425617078052035, "step": 25260 }, { "distill_loss": 0.11714950948953629, "epoch": 8.425617078052035, "step": 25260 }, { "epoch": 8.425617078052035, "ref_ce_loss": 0.07144191116094589, "step": 25260 }, { "epoch": 8.425617078052035, "loss": 0.4351324141025543, "step": 25260 }, { "ce_loss": 0.07600032538175583, "epoch": 8.425617078052035, "step": 25260 }, { "distill_loss": 0.19265899062156677, "epoch": 8.425617078052035, "step": 25260 }, { "epoch": 8.425617078052035, "ref_ce_loss": 0.06840462237596512, "step": 25260 }, { "epoch": 8.42895263509006, "loss": 0.4025, "step": 25270 }, { "epoch": 8.42895263509006, "grad_norm": 1.1116409301757812, "step": 25270 }, { "epoch": 8.42895263509006, "learning_rate": 5.067480658794539e-05, "step": 25270 }, { "epoch": 8.42895263509006, "loss": 0.4140152037143707, "step": 25270 }, { "ce_loss": 0.06640651077032089, "epoch": 8.42895263509006, "step": 25270 }, { "distill_loss": 0.21821719408035278, "epoch": 8.42895263509006, "step": 25270 }, { "epoch": 8.42895263509006, "ref_ce_loss": 0.06510384380817413, "step": 25270 }, { "epoch": 8.42895263509006, "loss": 0.3618549406528473, "step": 25270 }, { "ce_loss": 0.028401054441928864, "epoch": 8.42895263509006, "step": 25270 }, { "distill_loss": 0.1603056788444519, "epoch": 8.42895263509006, "step": 25270 }, { "epoch": 8.42895263509006, "ref_ce_loss": 0.061352767050266266, "step": 25270 }, { "epoch": 8.432288192128086, "loss": 0.3696, "step": 25280 }, { "epoch": 8.432288192128086, "grad_norm": 1.4946244955062866, "step": 25280 }, { "epoch": 8.432288192128086, "learning_rate": 5.046449371393309e-05, "step": 25280 }, { "epoch": 8.432288192128086, "loss": 0.6138637065887451, "step": 25280 }, { "ce_loss": 0.06309738010168076, "epoch": 8.432288192128086, "step": 25280 }, { "distill_loss": 0.21630330383777618, "epoch": 8.432288192128086, "step": 25280 }, { "epoch": 8.432288192128086, "ref_ce_loss": 0.08831988275051117, "step": 25280 }, { "epoch": 8.432288192128086, "loss": 0.691237211227417, "step": 25280 }, { "ce_loss": 0.05093845725059509, "epoch": 8.432288192128086, "step": 25280 }, { "distill_loss": 0.19174456596374512, "epoch": 8.432288192128086, "step": 25280 }, { "epoch": 8.432288192128086, "ref_ce_loss": 0.05205420404672623, "step": 25280 }, { "epoch": 8.43562374916611, "loss": 0.4186, "step": 25290 }, { "epoch": 8.43562374916611, "grad_norm": 1.1311005353927612, "step": 25290 }, { "epoch": 8.43562374916611, "learning_rate": 5.0254588785468274e-05, "step": 25290 }, { "epoch": 8.43562374916611, "loss": 0.27860555052757263, "step": 25290 }, { "ce_loss": 0.05554281175136566, "epoch": 8.43562374916611, "step": 25290 }, { "distill_loss": 0.1556360125541687, "epoch": 8.43562374916611, "step": 25290 }, { "epoch": 8.43562374916611, "ref_ce_loss": 0.06723165512084961, "step": 25290 }, { "epoch": 8.43562374916611, "loss": 0.38482168316841125, "step": 25290 }, { "ce_loss": 0.044632989913225174, "epoch": 8.43562374916611, "step": 25290 }, { "distill_loss": 0.15955740213394165, "epoch": 8.43562374916611, "step": 25290 }, { "epoch": 8.43562374916611, "ref_ce_loss": 0.07114876061677933, "step": 25290 }, { "epoch": 8.438959306204136, "loss": 0.3979, "step": 25300 }, { "epoch": 8.438959306204136, "grad_norm": 1.1892763376235962, "step": 25300 }, { "epoch": 8.438959306204136, "learning_rate": 5.004509204753238e-05, "step": 25300 }, { "epoch": 8.438959306204136, "loss": 0.3521316945552826, "step": 25300 }, { "ce_loss": 0.09186404198408127, "epoch": 8.438959306204136, "step": 25300 }, { "distill_loss": 0.16879168152809143, "epoch": 8.438959306204136, "step": 25300 }, { "epoch": 8.438959306204136, "ref_ce_loss": 0.09131795912981033, "step": 25300 }, { "epoch": 8.438959306204136, "loss": 0.45177480578422546, "step": 25300 }, { "ce_loss": 0.07362983375787735, "epoch": 8.438959306204136, "step": 25300 }, { "distill_loss": 0.25475966930389404, "epoch": 8.438959306204136, "step": 25300 }, { "epoch": 8.438959306204136, "ref_ce_loss": 0.09969697892665863, "step": 25300 }, { "epoch": 8.44229486324216, "loss": 0.4684, "step": 25310 }, { "epoch": 8.44229486324216, "grad_norm": 2.0051913261413574, "step": 25310 }, { "epoch": 8.44229486324216, "learning_rate": 4.983600374463082e-05, "step": 25310 }, { "epoch": 8.44229486324216, "loss": 0.4478938579559326, "step": 25310 }, { "ce_loss": 0.09970908612012863, "epoch": 8.44229486324216, "step": 25310 }, { "distill_loss": 0.22506193816661835, "epoch": 8.44229486324216, "step": 25310 }, { "epoch": 8.44229486324216, "ref_ce_loss": 0.06599080562591553, "step": 25310 }, { "epoch": 8.44229486324216, "loss": 0.284869521856308, "step": 25310 }, { "ce_loss": 0.02272508665919304, "epoch": 8.44229486324216, "step": 25310 }, { "distill_loss": 0.1496310979127884, "epoch": 8.44229486324216, "step": 25310 }, { "epoch": 8.44229486324216, "ref_ce_loss": 0.05696079134941101, "step": 25310 }, { "epoch": 8.445630420280187, "loss": 0.3802, "step": 25320 }, { "epoch": 8.445630420280187, "grad_norm": 1.4640692472457886, "step": 25320 }, { "epoch": 8.445630420280187, "learning_rate": 4.962732412079221e-05, "step": 25320 }, { "epoch": 8.445630420280187, "loss": 0.4311991035938263, "step": 25320 }, { "ce_loss": 0.08195716887712479, "epoch": 8.445630420280187, "step": 25320 }, { "distill_loss": 0.23977629840373993, "epoch": 8.445630420280187, "step": 25320 }, { "epoch": 8.445630420280187, "ref_ce_loss": 0.07493677735328674, "step": 25320 }, { "epoch": 8.445630420280187, "loss": 0.3129364550113678, "step": 25320 }, { "ce_loss": 0.03413569554686546, "epoch": 8.445630420280187, "step": 25320 }, { "distill_loss": 0.19769978523254395, "epoch": 8.445630420280187, "step": 25320 }, { "epoch": 8.445630420280187, "ref_ce_loss": 0.08082103729248047, "step": 25320 }, { "epoch": 8.448965977318212, "loss": 0.3941, "step": 25330 }, { "epoch": 8.448965977318212, "grad_norm": 2.3397746086120605, "step": 25330 }, { "epoch": 8.448965977318212, "learning_rate": 4.94190534195679e-05, "step": 25330 }, { "epoch": 8.448965977318212, "loss": 0.3372156620025635, "step": 25330 }, { "ce_loss": 0.05968444049358368, "epoch": 8.448965977318212, "step": 25330 }, { "distill_loss": 0.1731799691915512, "epoch": 8.448965977318212, "step": 25330 }, { "epoch": 8.448965977318212, "ref_ce_loss": 0.06552577018737793, "step": 25330 }, { "epoch": 8.448965977318212, "loss": 1.0824205875396729, "step": 25330 }, { "ce_loss": 0.13243527710437775, "epoch": 8.448965977318212, "step": 25330 }, { "distill_loss": 0.29550617933273315, "epoch": 8.448965977318212, "step": 25330 }, { "epoch": 8.448965977318212, "ref_ce_loss": 0.10851672291755676, "step": 25330 }, { "epoch": 8.452301534356238, "loss": 0.4373, "step": 25340 }, { "epoch": 8.452301534356238, "grad_norm": 1.4478459358215332, "step": 25340 }, { "epoch": 8.452301534356238, "learning_rate": 4.921119188403234e-05, "step": 25340 }, { "epoch": 8.452301534356238, "loss": 0.35270383954048157, "step": 25340 }, { "ce_loss": 0.02878500334918499, "epoch": 8.452301534356238, "step": 25340 }, { "distill_loss": 0.15584421157836914, "epoch": 8.452301534356238, "step": 25340 }, { "epoch": 8.452301534356238, "ref_ce_loss": 0.057748857885599136, "step": 25340 }, { "epoch": 8.452301534356238, "loss": 0.3954959213733673, "step": 25340 }, { "ce_loss": 0.0580030120909214, "epoch": 8.452301534356238, "step": 25340 }, { "distill_loss": 0.16948223114013672, "epoch": 8.452301534356238, "step": 25340 }, { "epoch": 8.452301534356238, "ref_ce_loss": 0.06833826750516891, "step": 25340 }, { "epoch": 8.455637091394262, "loss": 0.3913, "step": 25350 }, { "epoch": 8.455637091394262, "grad_norm": 2.2257843017578125, "step": 25350 }, { "epoch": 8.455637091394262, "learning_rate": 4.900373975678227e-05, "step": 25350 }, { "epoch": 8.455637091394262, "loss": 0.4083612561225891, "step": 25350 }, { "ce_loss": 0.09540177881717682, "epoch": 8.455637091394262, "step": 25350 }, { "distill_loss": 0.1899162232875824, "epoch": 8.455637091394262, "step": 25350 }, { "epoch": 8.455637091394262, "ref_ce_loss": 0.08591882884502411, "step": 25350 }, { "epoch": 8.455637091394262, "loss": 0.4242921769618988, "step": 25350 }, { "ce_loss": 0.054686613380908966, "epoch": 8.455637091394262, "step": 25350 }, { "distill_loss": 0.1933739185333252, "epoch": 8.455637091394262, "step": 25350 }, { "epoch": 8.455637091394262, "ref_ce_loss": 0.08469750732183456, "step": 25350 }, { "epoch": 8.458972648432288, "loss": 0.4048, "step": 25360 }, { "epoch": 8.458972648432288, "grad_norm": 1.147485375404358, "step": 25360 }, { "epoch": 8.458972648432288, "learning_rate": 4.8796697279936784e-05, "step": 25360 }, { "epoch": 8.458972648432288, "loss": 0.4345652461051941, "step": 25360 }, { "ce_loss": 0.04452275484800339, "epoch": 8.458972648432288, "step": 25360 }, { "distill_loss": 0.20094962418079376, "epoch": 8.458972648432288, "step": 25360 }, { "epoch": 8.458972648432288, "ref_ce_loss": 0.05491969734430313, "step": 25360 }, { "epoch": 8.458972648432288, "loss": 0.3355385959148407, "step": 25360 }, { "ce_loss": 0.041421085596084595, "epoch": 8.458972648432288, "step": 25360 }, { "distill_loss": 0.2158370316028595, "epoch": 8.458972648432288, "step": 25360 }, { "epoch": 8.458972648432288, "ref_ce_loss": 0.07798238098621368, "step": 25360 }, { "epoch": 8.462308205470313, "loss": 0.4442, "step": 25370 }, { "epoch": 8.462308205470313, "grad_norm": 1.0965797901153564, "step": 25370 }, { "epoch": 8.462308205470313, "learning_rate": 4.8590064695136496e-05, "step": 25370 }, { "epoch": 8.462308205470313, "loss": 0.22678275406360626, "step": 25370 }, { "ce_loss": 0.018734248355031013, "epoch": 8.462308205470313, "step": 25370 }, { "distill_loss": 0.13638636469841003, "epoch": 8.462308205470313, "step": 25370 }, { "epoch": 8.462308205470313, "ref_ce_loss": 0.04462753236293793, "step": 25370 }, { "epoch": 8.462308205470313, "loss": 0.35289931297302246, "step": 25370 }, { "ce_loss": 0.0470467135310173, "epoch": 8.462308205470313, "step": 25370 }, { "distill_loss": 0.17710834741592407, "epoch": 8.462308205470313, "step": 25370 }, { "epoch": 8.462308205470313, "ref_ce_loss": 0.05419645085930824, "step": 25370 }, { "epoch": 8.46564376250834, "loss": 0.4331, "step": 25380 }, { "epoch": 8.46564376250834, "grad_norm": 1.1017558574676514, "step": 25380 }, { "epoch": 8.46564376250834, "learning_rate": 4.8383842243544e-05, "step": 25380 }, { "epoch": 8.46564376250834, "loss": 0.5567941665649414, "step": 25380 }, { "ce_loss": 0.061687298119068146, "epoch": 8.46564376250834, "step": 25380 }, { "distill_loss": 0.2658335566520691, "epoch": 8.46564376250834, "step": 25380 }, { "epoch": 8.46564376250834, "ref_ce_loss": 0.08968020230531693, "step": 25380 }, { "epoch": 8.46564376250834, "loss": 0.5644046664237976, "step": 25380 }, { "ce_loss": 0.08603052794933319, "epoch": 8.46564376250834, "step": 25380 }, { "distill_loss": 0.23047250509262085, "epoch": 8.46564376250834, "step": 25380 }, { "epoch": 8.46564376250834, "ref_ce_loss": 0.08169778436422348, "step": 25380 }, { "epoch": 8.468979319546364, "loss": 0.4409, "step": 25390 }, { "epoch": 8.468979319546364, "grad_norm": 1.4107959270477295, "step": 25390 }, { "epoch": 8.468979319546364, "learning_rate": 4.8178030165843034e-05, "step": 25390 }, { "epoch": 8.468979319546364, "loss": 0.4246959090232849, "step": 25390 }, { "ce_loss": 0.07812031358480453, "epoch": 8.468979319546364, "step": 25390 }, { "distill_loss": 0.2077268809080124, "epoch": 8.468979319546364, "step": 25390 }, { "epoch": 8.468979319546364, "ref_ce_loss": 0.06722721457481384, "step": 25390 }, { "epoch": 8.468979319546364, "loss": 0.4788954257965088, "step": 25390 }, { "ce_loss": 0.11047182232141495, "epoch": 8.468979319546364, "step": 25390 }, { "distill_loss": 0.2312658578157425, "epoch": 8.468979319546364, "step": 25390 }, { "epoch": 8.468979319546364, "ref_ce_loss": 0.13693365454673767, "step": 25390 }, { "epoch": 8.47231487658439, "loss": 0.4272, "step": 25400 }, { "epoch": 8.47231487658439, "grad_norm": 1.23094642162323, "step": 25400 }, { "epoch": 8.47231487658439, "learning_rate": 4.7972628702238484e-05, "step": 25400 }, { "epoch": 8.47231487658439, "loss": 0.34833505749702454, "step": 25400 }, { "ce_loss": 0.09293520450592041, "epoch": 8.47231487658439, "step": 25400 }, { "distill_loss": 0.1796901375055313, "epoch": 8.47231487658439, "step": 25400 }, { "epoch": 8.47231487658439, "ref_ce_loss": 0.07550303637981415, "step": 25400 }, { "epoch": 8.47231487658439, "loss": 0.4009092450141907, "step": 25400 }, { "ce_loss": 0.05195685103535652, "epoch": 8.47231487658439, "step": 25400 }, { "distill_loss": 0.17379993200302124, "epoch": 8.47231487658439, "step": 25400 }, { "epoch": 8.47231487658439, "ref_ce_loss": 0.09474589675664902, "step": 25400 }, { "epoch": 8.475650433622414, "loss": 0.3689, "step": 25410 }, { "epoch": 8.475650433622414, "grad_norm": 1.3589696884155273, "step": 25410 }, { "epoch": 8.475650433622414, "learning_rate": 4.776763809245597e-05, "step": 25410 }, { "epoch": 8.475650433622414, "loss": 0.3397957980632782, "step": 25410 }, { "ce_loss": 0.05533874034881592, "epoch": 8.475650433622414, "step": 25410 }, { "distill_loss": 0.1839795559644699, "epoch": 8.475650433622414, "step": 25410 }, { "epoch": 8.475650433622414, "ref_ce_loss": 0.06962695717811584, "step": 25410 }, { "epoch": 8.475650433622414, "loss": 0.4177882671356201, "step": 25410 }, { "ce_loss": 0.056037165224552155, "epoch": 8.475650433622414, "step": 25410 }, { "distill_loss": 0.1790320724248886, "epoch": 8.475650433622414, "step": 25410 }, { "epoch": 8.475650433622414, "ref_ce_loss": 0.07811678200960159, "step": 25410 }, { "epoch": 8.47898599066044, "loss": 0.394, "step": 25420 }, { "epoch": 8.47898599066044, "grad_norm": 0.9545964598655701, "step": 25420 }, { "epoch": 8.47898599066044, "learning_rate": 4.756305857574157e-05, "step": 25420 }, { "epoch": 8.47898599066044, "loss": 0.4571429193019867, "step": 25420 }, { "ce_loss": 0.1148986965417862, "epoch": 8.47898599066044, "step": 25420 }, { "distill_loss": 0.23694494366645813, "epoch": 8.47898599066044, "step": 25420 }, { "epoch": 8.47898599066044, "ref_ce_loss": 0.0779845267534256, "step": 25420 }, { "epoch": 8.47898599066044, "loss": 0.37947702407836914, "step": 25420 }, { "ce_loss": 0.035717274993658066, "epoch": 8.47898599066044, "step": 25420 }, { "distill_loss": 0.1585291177034378, "epoch": 8.47898599066044, "step": 25420 }, { "epoch": 8.47898599066044, "ref_ce_loss": 0.08000072836875916, "step": 25420 }, { "epoch": 8.482321547698465, "loss": 0.3691, "step": 25430 }, { "epoch": 8.482321547698465, "grad_norm": 1.3420974016189575, "step": 25430 }, { "epoch": 8.482321547698465, "learning_rate": 4.735889039086163e-05, "step": 25430 }, { "epoch": 8.482321547698465, "loss": 0.4631466865539551, "step": 25430 }, { "ce_loss": 0.11675658822059631, "epoch": 8.482321547698465, "step": 25430 }, { "distill_loss": 0.2317403107881546, "epoch": 8.482321547698465, "step": 25430 }, { "epoch": 8.482321547698465, "ref_ce_loss": 0.08134746551513672, "step": 25430 }, { "epoch": 8.482321547698465, "loss": 0.37111377716064453, "step": 25430 }, { "ce_loss": 0.08108817040920258, "epoch": 8.482321547698465, "step": 25430 }, { "distill_loss": 0.19684088230133057, "epoch": 8.482321547698465, "step": 25430 }, { "epoch": 8.482321547698465, "ref_ce_loss": 0.07465078681707382, "step": 25430 }, { "epoch": 8.485657104736491, "loss": 0.4052, "step": 25440 }, { "epoch": 8.485657104736491, "grad_norm": 1.8001788854599, "step": 25440 }, { "epoch": 8.485657104736491, "learning_rate": 4.715513377610239e-05, "step": 25440 }, { "epoch": 8.485657104736491, "loss": 0.42932698130607605, "step": 25440 }, { "ce_loss": 0.061111774295568466, "epoch": 8.485657104736491, "step": 25440 }, { "distill_loss": 0.2266315519809723, "epoch": 8.485657104736491, "step": 25440 }, { "epoch": 8.485657104736491, "ref_ce_loss": 0.06741586327552795, "step": 25440 }, { "epoch": 8.485657104736491, "loss": 0.30838507413864136, "step": 25440 }, { "ce_loss": 0.05005405843257904, "epoch": 8.485657104736491, "step": 25440 }, { "distill_loss": 0.1564617156982422, "epoch": 8.485657104736491, "step": 25440 }, { "epoch": 8.485657104736491, "ref_ce_loss": 0.07379646599292755, "step": 25440 }, { "epoch": 8.488992661774516, "loss": 0.3964, "step": 25450 }, { "epoch": 8.488992661774516, "grad_norm": 0.9869126677513123, "step": 25450 }, { "epoch": 8.488992661774516, "learning_rate": 4.695178896926966e-05, "step": 25450 }, { "epoch": 8.488992661774516, "loss": 0.4669387936592102, "step": 25450 }, { "ce_loss": 0.11110807955265045, "epoch": 8.488992661774516, "step": 25450 }, { "distill_loss": 0.23916535079479218, "epoch": 8.488992661774516, "step": 25450 }, { "epoch": 8.488992661774516, "ref_ce_loss": 0.07803558558225632, "step": 25450 }, { "epoch": 8.488992661774516, "loss": 0.3536241054534912, "step": 25450 }, { "ce_loss": 0.09098926186561584, "epoch": 8.488992661774516, "step": 25450 }, { "distill_loss": 0.17528116703033447, "epoch": 8.488992661774516, "step": 25450 }, { "epoch": 8.488992661774516, "ref_ce_loss": 0.08723273873329163, "step": 25450 }, { "epoch": 8.492328218812542, "loss": 0.4308, "step": 25460 }, { "epoch": 8.492328218812542, "grad_norm": 1.1579896211624146, "step": 25460 }, { "epoch": 8.492328218812542, "learning_rate": 4.674885620768872e-05, "step": 25460 }, { "epoch": 8.492328218812542, "loss": 0.3379400670528412, "step": 25460 }, { "ce_loss": 0.057282764464616776, "epoch": 8.492328218812542, "step": 25460 }, { "distill_loss": 0.19009092450141907, "epoch": 8.492328218812542, "step": 25460 }, { "epoch": 8.492328218812542, "ref_ce_loss": 0.06612122058868408, "step": 25460 }, { "epoch": 8.492328218812542, "loss": 0.4416944980621338, "step": 25460 }, { "ce_loss": 0.08713212609291077, "epoch": 8.492328218812542, "step": 25460 }, { "distill_loss": 0.22129622101783752, "epoch": 8.492328218812542, "step": 25460 }, { "epoch": 8.492328218812542, "ref_ce_loss": 0.09360598027706146, "step": 25460 }, { "epoch": 8.495663775850566, "loss": 0.4013, "step": 25470 }, { "epoch": 8.495663775850566, "grad_norm": 1.344419002532959, "step": 25470 }, { "epoch": 8.495663775850566, "learning_rate": 4.654633572820402e-05, "step": 25470 }, { "epoch": 8.495663775850566, "loss": 0.4243669807910919, "step": 25470 }, { "ce_loss": 0.07619713246822357, "epoch": 8.495663775850566, "step": 25470 }, { "distill_loss": 0.19245898723602295, "epoch": 8.495663775850566, "step": 25470 }, { "epoch": 8.495663775850566, "ref_ce_loss": 0.0890323594212532, "step": 25470 }, { "epoch": 8.495663775850566, "loss": 0.39107850193977356, "step": 25470 }, { "ce_loss": 0.06887102127075195, "epoch": 8.495663775850566, "step": 25470 }, { "distill_loss": 0.18017184734344482, "epoch": 8.495663775850566, "step": 25470 }, { "epoch": 8.495663775850566, "ref_ce_loss": 0.08554864674806595, "step": 25470 }, { "epoch": 8.498999332888593, "loss": 0.4082, "step": 25480 }, { "epoch": 8.498999332888593, "grad_norm": 1.3751683235168457, "step": 25480 }, { "epoch": 8.498999332888593, "learning_rate": 4.634422776717879e-05, "step": 25480 }, { "epoch": 8.498999332888593, "loss": 0.3181149363517761, "step": 25480 }, { "ce_loss": 0.052022214978933334, "epoch": 8.498999332888593, "step": 25480 }, { "distill_loss": 0.18051841855049133, "epoch": 8.498999332888593, "step": 25480 }, { "epoch": 8.498999332888593, "ref_ce_loss": 0.08533704280853271, "step": 25480 }, { "epoch": 8.498999332888593, "loss": 0.47193965315818787, "step": 25480 }, { "ce_loss": 0.024297937750816345, "epoch": 8.498999332888593, "step": 25480 }, { "distill_loss": 0.2265244424343109, "epoch": 8.498999332888593, "step": 25480 }, { "epoch": 8.498999332888593, "ref_ce_loss": 0.05670903995633125, "step": 25480 }, { "epoch": 8.502334889926617, "loss": 0.3771, "step": 25490 }, { "epoch": 8.502334889926617, "grad_norm": 1.2867672443389893, "step": 25490 }, { "epoch": 8.502334889926617, "learning_rate": 4.614253256049459e-05, "step": 25490 }, { "epoch": 8.502334889926617, "loss": 0.33566558361053467, "step": 25490 }, { "ce_loss": 0.06634427607059479, "epoch": 8.502334889926617, "step": 25490 }, { "distill_loss": 0.16398048400878906, "epoch": 8.502334889926617, "step": 25490 }, { "epoch": 8.502334889926617, "ref_ce_loss": 0.07367944717407227, "step": 25490 }, { "epoch": 8.502334889926617, "loss": 0.5042216777801514, "step": 25490 }, { "ce_loss": 0.05847841128706932, "epoch": 8.502334889926617, "step": 25490 }, { "distill_loss": 0.1794266402721405, "epoch": 8.502334889926617, "step": 25490 }, { "epoch": 8.502334889926617, "ref_ce_loss": 0.08029765635728836, "step": 25490 }, { "epoch": 8.505670446964643, "loss": 0.4017, "step": 25500 }, { "epoch": 8.505670446964643, "grad_norm": 1.5184662342071533, "step": 25500 }, { "epoch": 8.505670446964643, "learning_rate": 4.5941250343551546e-05, "step": 25500 }, { "epoch": 8.505670446964643, "loss": 0.39854538440704346, "step": 25500 }, { "ce_loss": 0.06465799361467361, "epoch": 8.505670446964643, "step": 25500 }, { "distill_loss": 0.20663900673389435, "epoch": 8.505670446964643, "step": 25500 }, { "epoch": 8.505670446964643, "ref_ce_loss": 0.06514228135347366, "step": 25500 }, { "epoch": 8.505670446964643, "loss": 0.38145771622657776, "step": 25500 }, { "ce_loss": 0.040179524570703506, "epoch": 8.505670446964643, "step": 25500 }, { "distill_loss": 0.22532419860363007, "epoch": 8.505670446964643, "step": 25500 }, { "epoch": 8.505670446964643, "ref_ce_loss": 0.0818011611700058, "step": 25500 }, { "epoch": 8.509006004002668, "loss": 0.3792, "step": 25510 }, { "epoch": 8.509006004002668, "grad_norm": 0.836586594581604, "step": 25510 }, { "epoch": 8.509006004002668, "learning_rate": 4.574038135126766e-05, "step": 25510 }, { "epoch": 8.509006004002668, "loss": 0.3313855826854706, "step": 25510 }, { "ce_loss": 0.04506858065724373, "epoch": 8.509006004002668, "step": 25510 }, { "distill_loss": 0.18170565366744995, "epoch": 8.509006004002668, "step": 25510 }, { "epoch": 8.509006004002668, "ref_ce_loss": 0.08849178999662399, "step": 25510 }, { "epoch": 8.509006004002668, "loss": 0.44603872299194336, "step": 25510 }, { "ce_loss": 0.034600429236888885, "epoch": 8.509006004002668, "step": 25510 }, { "distill_loss": 0.13978897035121918, "epoch": 8.509006004002668, "step": 25510 }, { "epoch": 8.509006004002668, "ref_ce_loss": 0.06843680888414383, "step": 25510 }, { "epoch": 8.512341561040694, "loss": 0.4108, "step": 25520 }, { "epoch": 8.512341561040694, "grad_norm": 2.0533783435821533, "step": 25520 }, { "epoch": 8.512341561040694, "learning_rate": 4.5539925818078646e-05, "step": 25520 }, { "epoch": 8.512341561040694, "loss": 0.31537896394729614, "step": 25520 }, { "ce_loss": 0.0696769580245018, "epoch": 8.512341561040694, "step": 25520 }, { "distill_loss": 0.1834687441587448, "epoch": 8.512341561040694, "step": 25520 }, { "epoch": 8.512341561040694, "ref_ce_loss": 0.04670481011271477, "step": 25520 }, { "epoch": 8.512341561040694, "loss": 0.9505656957626343, "step": 25520 }, { "ce_loss": 0.1394726186990738, "epoch": 8.512341561040694, "step": 25520 }, { "distill_loss": 0.2532227635383606, "epoch": 8.512341561040694, "step": 25520 }, { "epoch": 8.512341561040694, "ref_ce_loss": 0.07898080348968506, "step": 25520 }, { "epoch": 8.515677118078719, "loss": 0.4574, "step": 25530 }, { "epoch": 8.515677118078719, "grad_norm": 1.2700072526931763, "step": 25530 }, { "epoch": 8.515677118078719, "learning_rate": 4.533988397793767e-05, "step": 25530 }, { "epoch": 8.515677118078719, "loss": 0.4377422034740448, "step": 25530 }, { "ce_loss": 0.03457576036453247, "epoch": 8.515677118078719, "step": 25530 }, { "distill_loss": 0.2131364643573761, "epoch": 8.515677118078719, "step": 25530 }, { "epoch": 8.515677118078719, "ref_ce_loss": 0.09452904015779495, "step": 25530 }, { "epoch": 8.515677118078719, "loss": 0.2893883287906647, "step": 25530 }, { "ce_loss": 0.03002668172121048, "epoch": 8.515677118078719, "step": 25530 }, { "distill_loss": 0.1513487845659256, "epoch": 8.515677118078719, "step": 25530 }, { "epoch": 8.515677118078719, "ref_ce_loss": 0.10772596299648285, "step": 25530 }, { "epoch": 8.519012675116745, "loss": 0.4143, "step": 25540 }, { "epoch": 8.519012675116745, "grad_norm": 1.567345142364502, "step": 25540 }, { "epoch": 8.519012675116745, "learning_rate": 4.5140256064315136e-05, "step": 25540 }, { "epoch": 8.519012675116745, "loss": 0.48999351263046265, "step": 25540 }, { "ce_loss": 0.07218960672616959, "epoch": 8.519012675116745, "step": 25540 }, { "distill_loss": 0.16818466782569885, "epoch": 8.519012675116745, "step": 25540 }, { "epoch": 8.519012675116745, "ref_ce_loss": 0.09462545812129974, "step": 25540 }, { "epoch": 8.519012675116745, "loss": 0.4208908677101135, "step": 25540 }, { "ce_loss": 0.07465120404958725, "epoch": 8.519012675116745, "step": 25540 }, { "distill_loss": 0.1858339011669159, "epoch": 8.519012675116745, "step": 25540 }, { "epoch": 8.519012675116745, "ref_ce_loss": 0.11447934061288834, "step": 25540 }, { "epoch": 8.52234823215477, "loss": 0.4061, "step": 25550 }, { "epoch": 8.52234823215477, "grad_norm": 1.4897160530090332, "step": 25550 }, { "epoch": 8.52234823215477, "learning_rate": 4.494104231019822e-05, "step": 25550 }, { "epoch": 8.52234823215477, "loss": 0.5165577530860901, "step": 25550 }, { "ce_loss": 0.08921578526496887, "epoch": 8.52234823215477, "step": 25550 }, { "distill_loss": 0.1780136078596115, "epoch": 8.52234823215477, "step": 25550 }, { "epoch": 8.52234823215477, "ref_ce_loss": 0.07086190581321716, "step": 25550 }, { "epoch": 8.52234823215477, "loss": 0.49777376651763916, "step": 25550 }, { "ce_loss": 0.07921870797872543, "epoch": 8.52234823215477, "step": 25550 }, { "distill_loss": 0.22613626718521118, "epoch": 8.52234823215477, "step": 25550 }, { "epoch": 8.52234823215477, "ref_ce_loss": 0.09942245483398438, "step": 25550 }, { "epoch": 8.525683789192795, "loss": 0.4983, "step": 25560 }, { "epoch": 8.525683789192795, "grad_norm": 1.7442575693130493, "step": 25560 }, { "epoch": 8.525683789192795, "learning_rate": 4.474224294809095e-05, "step": 25560 }, { "epoch": 8.525683789192795, "loss": 0.3476119637489319, "step": 25560 }, { "ce_loss": 0.07392597943544388, "epoch": 8.525683789192795, "step": 25560 }, { "distill_loss": 0.18355512619018555, "epoch": 8.525683789192795, "step": 25560 }, { "epoch": 8.525683789192795, "ref_ce_loss": 0.07576129585504532, "step": 25560 }, { "epoch": 8.525683789192795, "loss": 0.29269444942474365, "step": 25560 }, { "ce_loss": 0.03631259873509407, "epoch": 8.525683789192795, "step": 25560 }, { "distill_loss": 0.17361724376678467, "epoch": 8.525683789192795, "step": 25560 }, { "epoch": 8.525683789192795, "ref_ce_loss": 0.08257342129945755, "step": 25560 }, { "epoch": 8.52901934623082, "loss": 0.4647, "step": 25570 }, { "epoch": 8.52901934623082, "grad_norm": 1.6793243885040283, "step": 25570 }, { "epoch": 8.52901934623082, "learning_rate": 4.4543858210013414e-05, "step": 25570 }, { "epoch": 8.52901934623082, "loss": 0.4669928252696991, "step": 25570 }, { "ce_loss": 0.03943252190947533, "epoch": 8.52901934623082, "step": 25570 }, { "distill_loss": 0.19234731793403625, "epoch": 8.52901934623082, "step": 25570 }, { "epoch": 8.52901934623082, "ref_ce_loss": 0.09062094986438751, "step": 25570 }, { "epoch": 8.52901934623082, "loss": 0.3977088928222656, "step": 25570 }, { "ce_loss": 0.061139557510614395, "epoch": 8.52901934623082, "step": 25570 }, { "distill_loss": 0.1872185468673706, "epoch": 8.52901934623082, "step": 25570 }, { "epoch": 8.52901934623082, "ref_ce_loss": 0.06907083839178085, "step": 25570 }, { "epoch": 8.532354903268846, "loss": 0.3617, "step": 25580 }, { "epoch": 8.532354903268846, "grad_norm": 1.6442162990570068, "step": 25580 }, { "epoch": 8.532354903268846, "learning_rate": 4.434588832750195e-05, "step": 25580 }, { "epoch": 8.532354903268846, "loss": 0.4794367551803589, "step": 25580 }, { "ce_loss": 0.08833171427249908, "epoch": 8.532354903268846, "step": 25580 }, { "distill_loss": 0.22108300030231476, "epoch": 8.532354903268846, "step": 25580 }, { "epoch": 8.532354903268846, "ref_ce_loss": 0.0778263658285141, "step": 25580 }, { "epoch": 8.532354903268846, "loss": 0.43841928243637085, "step": 25580 }, { "ce_loss": 0.07592614740133286, "epoch": 8.532354903268846, "step": 25580 }, { "distill_loss": 0.209527850151062, "epoch": 8.532354903268846, "step": 25580 }, { "epoch": 8.532354903268846, "ref_ce_loss": 0.07981114089488983, "step": 25580 }, { "epoch": 8.53569046030687, "loss": 0.4094, "step": 25590 }, { "epoch": 8.53569046030687, "grad_norm": 1.0511441230773926, "step": 25590 }, { "epoch": 8.53569046030687, "learning_rate": 4.414833353160885e-05, "step": 25590 }, { "epoch": 8.53569046030687, "loss": 0.3373606204986572, "step": 25590 }, { "ce_loss": 0.0720352753996849, "epoch": 8.53569046030687, "step": 25590 }, { "distill_loss": 0.16815629601478577, "epoch": 8.53569046030687, "step": 25590 }, { "epoch": 8.53569046030687, "ref_ce_loss": 0.06618942320346832, "step": 25590 }, { "epoch": 8.53569046030687, "loss": 0.2670990526676178, "step": 25590 }, { "ce_loss": 0.025431491434574127, "epoch": 8.53569046030687, "step": 25590 }, { "distill_loss": 0.14715948700904846, "epoch": 8.53569046030687, "step": 25590 }, { "epoch": 8.53569046030687, "ref_ce_loss": 0.09422583132982254, "step": 25590 }, { "epoch": 8.539026017344897, "loss": 0.3929, "step": 25600 }, { "epoch": 8.539026017344897, "grad_norm": 1.3763742446899414, "step": 25600 }, { "epoch": 8.539026017344897, "learning_rate": 4.395119405290178e-05, "step": 25600 }, { "epoch": 8.539026017344897, "loss": 0.3327689468860626, "step": 25600 }, { "ce_loss": 0.07418499886989594, "epoch": 8.539026017344897, "step": 25600 }, { "distill_loss": 0.17308631539344788, "epoch": 8.539026017344897, "step": 25600 }, { "epoch": 8.539026017344897, "ref_ce_loss": 0.06540984660387039, "step": 25600 }, { "epoch": 8.539026017344897, "loss": 0.38300207257270813, "step": 25600 }, { "ce_loss": 0.09874434769153595, "epoch": 8.539026017344897, "step": 25600 }, { "distill_loss": 0.17336177825927734, "epoch": 8.539026017344897, "step": 25600 }, { "epoch": 8.539026017344897, "ref_ce_loss": 0.07843245565891266, "step": 25600 }, { "epoch": 8.542361574382921, "loss": 0.3986, "step": 25610 }, { "epoch": 8.542361574382921, "grad_norm": 1.043558955192566, "step": 25610 }, { "epoch": 8.542361574382921, "learning_rate": 4.375447012146361e-05, "step": 25610 }, { "epoch": 8.542361574382921, "loss": 0.3494413197040558, "step": 25610 }, { "ce_loss": 0.0686168521642685, "epoch": 8.542361574382921, "step": 25610 }, { "distill_loss": 0.19644740223884583, "epoch": 8.542361574382921, "step": 25610 }, { "epoch": 8.542361574382921, "ref_ce_loss": 0.06413035839796066, "step": 25610 }, { "epoch": 8.542361574382921, "loss": 0.2559710741043091, "step": 25610 }, { "ce_loss": 0.043748125433921814, "epoch": 8.542361574382921, "step": 25610 }, { "distill_loss": 0.13352961838245392, "epoch": 8.542361574382921, "step": 25610 }, { "epoch": 8.542361574382921, "ref_ce_loss": 0.05490174889564514, "step": 25610 }, { "epoch": 8.545697131420948, "loss": 0.3927, "step": 25620 }, { "epoch": 8.545697131420948, "grad_norm": 0.8407578468322754, "step": 25620 }, { "epoch": 8.545697131420948, "learning_rate": 4.355816196689242e-05, "step": 25620 }, { "epoch": 8.545697131420948, "loss": 0.4915314316749573, "step": 25620 }, { "ce_loss": 0.08458271622657776, "epoch": 8.545697131420948, "step": 25620 }, { "distill_loss": 0.19072310626506805, "epoch": 8.545697131420948, "step": 25620 }, { "epoch": 8.545697131420948, "ref_ce_loss": 0.10890834778547287, "step": 25620 }, { "epoch": 8.545697131420948, "loss": 0.386917382478714, "step": 25620 }, { "ce_loss": 0.06721224635839462, "epoch": 8.545697131420948, "step": 25620 }, { "distill_loss": 0.19537372887134552, "epoch": 8.545697131420948, "step": 25620 }, { "epoch": 8.545697131420948, "ref_ce_loss": 0.06054548919200897, "step": 25620 }, { "epoch": 8.549032688458972, "loss": 0.3957, "step": 25630 }, { "epoch": 8.549032688458972, "grad_norm": 1.364972710609436, "step": 25630 }, { "epoch": 8.549032688458972, "learning_rate": 4.336226981830094e-05, "step": 25630 }, { "epoch": 8.549032688458972, "loss": 0.4775812029838562, "step": 25630 }, { "ce_loss": 0.06949009001255035, "epoch": 8.549032688458972, "step": 25630 }, { "distill_loss": 0.20447465777397156, "epoch": 8.549032688458972, "step": 25630 }, { "epoch": 8.549032688458972, "ref_ce_loss": 0.061440128833055496, "step": 25630 }, { "epoch": 8.549032688458972, "loss": 0.37345072627067566, "step": 25630 }, { "ce_loss": 0.06991104036569595, "epoch": 8.549032688458972, "step": 25630 }, { "distill_loss": 0.15843914449214935, "epoch": 8.549032688458972, "step": 25630 }, { "epoch": 8.549032688458972, "ref_ce_loss": 0.09249599277973175, "step": 25630 }, { "epoch": 8.552368245496998, "loss": 0.402, "step": 25640 }, { "epoch": 8.552368245496998, "grad_norm": 1.4104706048965454, "step": 25640 }, { "epoch": 8.552368245496998, "learning_rate": 4.316679390431637e-05, "step": 25640 }, { "epoch": 8.552368245496998, "loss": 0.4269067347049713, "step": 25640 }, { "ce_loss": 0.05708153545856476, "epoch": 8.552368245496998, "step": 25640 }, { "distill_loss": 0.18109223246574402, "epoch": 8.552368245496998, "step": 25640 }, { "epoch": 8.552368245496998, "ref_ce_loss": 0.07652582973241806, "step": 25640 }, { "epoch": 8.552368245496998, "loss": 0.2876511514186859, "step": 25640 }, { "ce_loss": 0.05271648243069649, "epoch": 8.552368245496998, "step": 25640 }, { "distill_loss": 0.14108356833457947, "epoch": 8.552368245496998, "step": 25640 }, { "epoch": 8.552368245496998, "ref_ce_loss": 0.06557846069335938, "step": 25640 }, { "epoch": 8.555703802535023, "loss": 0.4044, "step": 25650 }, { "epoch": 8.555703802535023, "grad_norm": 1.0042786598205566, "step": 25650 }, { "epoch": 8.555703802535023, "learning_rate": 4.297173445308018e-05, "step": 25650 }, { "epoch": 8.555703802535023, "loss": 0.4267001748085022, "step": 25650 }, { "ce_loss": 0.06814425438642502, "epoch": 8.555703802535023, "step": 25650 }, { "distill_loss": 0.2259957194328308, "epoch": 8.555703802535023, "step": 25650 }, { "epoch": 8.555703802535023, "ref_ce_loss": 0.10608452558517456, "step": 25650 }, { "epoch": 8.555703802535023, "loss": 0.399760901927948, "step": 25650 }, { "ce_loss": 0.05207909271121025, "epoch": 8.555703802535023, "step": 25650 }, { "distill_loss": 0.17557072639465332, "epoch": 8.555703802535023, "step": 25650 }, { "epoch": 8.555703802535023, "ref_ce_loss": 0.08423539251089096, "step": 25650 }, { "epoch": 8.559039359573049, "loss": 0.3729, "step": 25660 }, { "epoch": 8.559039359573049, "grad_norm": 1.627666711807251, "step": 25660 }, { "epoch": 8.559039359573049, "learning_rate": 4.277709169224773e-05, "step": 25660 }, { "epoch": 8.559039359573049, "loss": 0.4594741463661194, "step": 25660 }, { "ce_loss": 0.03643737733364105, "epoch": 8.559039359573049, "step": 25660 }, { "distill_loss": 0.18254266679286957, "epoch": 8.559039359573049, "step": 25660 }, { "epoch": 8.559039359573049, "ref_ce_loss": 0.0540667325258255, "step": 25660 }, { "epoch": 8.559039359573049, "loss": 0.32987087965011597, "step": 25660 }, { "ce_loss": 0.03342566639184952, "epoch": 8.559039359573049, "step": 25660 }, { "distill_loss": 0.17751745879650116, "epoch": 8.559039359573049, "step": 25660 }, { "epoch": 8.559039359573049, "ref_ce_loss": 0.0852920189499855, "step": 25660 }, { "epoch": 8.562374916611073, "loss": 0.4275, "step": 25670 }, { "epoch": 8.562374916611073, "grad_norm": 1.3796634674072266, "step": 25670 }, { "epoch": 8.562374916611073, "learning_rate": 4.2582865848988095e-05, "step": 25670 }, { "epoch": 8.562374916611073, "loss": 0.43835440278053284, "step": 25670 }, { "ce_loss": 0.09459662437438965, "epoch": 8.562374916611073, "step": 25670 }, { "distill_loss": 0.23603783547878265, "epoch": 8.562374916611073, "step": 25670 }, { "epoch": 8.562374916611073, "ref_ce_loss": 0.0750318244099617, "step": 25670 }, { "epoch": 8.562374916611073, "loss": 0.2934839427471161, "step": 25670 }, { "ce_loss": 0.039706867188215256, "epoch": 8.562374916611073, "step": 25670 }, { "distill_loss": 0.14353084564208984, "epoch": 8.562374916611073, "step": 25670 }, { "epoch": 8.562374916611073, "ref_ce_loss": 0.06643503159284592, "step": 25670 }, { "epoch": 8.5657104736491, "loss": 0.4196, "step": 25680 }, { "epoch": 8.5657104736491, "grad_norm": 0.9201450347900391, "step": 25680 }, { "epoch": 8.5657104736491, "learning_rate": 4.238905714998365e-05, "step": 25680 }, { "epoch": 8.5657104736491, "loss": 0.3691575229167938, "step": 25680 }, { "ce_loss": 0.0561094731092453, "epoch": 8.5657104736491, "step": 25680 }, { "distill_loss": 0.17040081322193146, "epoch": 8.5657104736491, "step": 25680 }, { "epoch": 8.5657104736491, "ref_ce_loss": 0.07381340861320496, "step": 25680 }, { "epoch": 8.5657104736491, "loss": 0.38116931915283203, "step": 25680 }, { "ce_loss": 0.07512297481298447, "epoch": 8.5657104736491, "step": 25680 }, { "distill_loss": 0.23427975177764893, "epoch": 8.5657104736491, "step": 25680 }, { "epoch": 8.5657104736491, "ref_ce_loss": 0.07131833583116531, "step": 25680 }, { "epoch": 8.569046030687124, "loss": 0.4138, "step": 25690 }, { "epoch": 8.569046030687124, "grad_norm": 1.5714385509490967, "step": 25690 }, { "epoch": 8.569046030687124, "learning_rate": 4.219566582143002e-05, "step": 25690 }, { "epoch": 8.569046030687124, "loss": 0.34663069248199463, "step": 25690 }, { "ce_loss": 0.05181189998984337, "epoch": 8.569046030687124, "step": 25690 }, { "distill_loss": 0.18649733066558838, "epoch": 8.569046030687124, "step": 25690 }, { "epoch": 8.569046030687124, "ref_ce_loss": 0.08270205557346344, "step": 25690 }, { "epoch": 8.569046030687124, "loss": 0.9217813611030579, "step": 25690 }, { "ce_loss": 0.053318414837121964, "epoch": 8.569046030687124, "step": 25690 }, { "distill_loss": 0.20686563849449158, "epoch": 8.569046030687124, "step": 25690 }, { "epoch": 8.569046030687124, "ref_ce_loss": 0.08267318457365036, "step": 25690 }, { "epoch": 8.57238158772515, "loss": 0.4133, "step": 25700 }, { "epoch": 8.57238158772515, "grad_norm": 1.0986888408660889, "step": 25700 }, { "epoch": 8.57238158772515, "learning_rate": 4.200269208903569e-05, "step": 25700 }, { "epoch": 8.57238158772515, "loss": 0.4715734124183655, "step": 25700 }, { "ce_loss": 0.09321790933609009, "epoch": 8.57238158772515, "step": 25700 }, { "distill_loss": 0.2323947548866272, "epoch": 8.57238158772515, "step": 25700 }, { "epoch": 8.57238158772515, "ref_ce_loss": 0.06556542962789536, "step": 25700 }, { "epoch": 8.57238158772515, "loss": 0.2592449486255646, "step": 25700 }, { "ce_loss": 0.02691858820617199, "epoch": 8.57238158772515, "step": 25700 }, { "distill_loss": 0.16078926622867584, "epoch": 8.57238158772515, "step": 25700 }, { "epoch": 8.57238158772515, "ref_ce_loss": 0.04746861010789871, "step": 25700 }, { "epoch": 8.575717144763175, "loss": 0.4156, "step": 25710 }, { "epoch": 8.575717144763175, "grad_norm": 3.1051816940307617, "step": 25710 }, { "epoch": 8.575717144763175, "learning_rate": 4.181013617802192e-05, "step": 25710 }, { "epoch": 8.575717144763175, "loss": 0.5697551965713501, "step": 25710 }, { "ce_loss": 0.07349631935358047, "epoch": 8.575717144763175, "step": 25710 }, { "distill_loss": 0.2376619577407837, "epoch": 8.575717144763175, "step": 25710 }, { "epoch": 8.575717144763175, "ref_ce_loss": 0.07407592982053757, "step": 25710 }, { "epoch": 8.575717144763175, "loss": 0.354321151971817, "step": 25710 }, { "ce_loss": 0.058924220502376556, "epoch": 8.575717144763175, "step": 25710 }, { "distill_loss": 0.20981810986995697, "epoch": 8.575717144763175, "step": 25710 }, { "epoch": 8.575717144763175, "ref_ce_loss": 0.08513392508029938, "step": 25710 }, { "epoch": 8.579052701801201, "loss": 0.4023, "step": 25720 }, { "epoch": 8.579052701801201, "grad_norm": 1.033412218093872, "step": 25720 }, { "epoch": 8.579052701801201, "learning_rate": 4.1617998313121966e-05, "step": 25720 }, { "epoch": 8.579052701801201, "loss": 0.4880424439907074, "step": 25720 }, { "ce_loss": 0.07538829743862152, "epoch": 8.579052701801201, "step": 25720 }, { "distill_loss": 0.2588784992694855, "epoch": 8.579052701801201, "step": 25720 }, { "epoch": 8.579052701801201, "ref_ce_loss": 0.06673355400562286, "step": 25720 }, { "epoch": 8.579052701801201, "loss": 0.4788103699684143, "step": 25720 }, { "ce_loss": 0.06297007948160172, "epoch": 8.579052701801201, "step": 25720 }, { "distill_loss": 0.20087918639183044, "epoch": 8.579052701801201, "step": 25720 }, { "epoch": 8.579052701801201, "ref_ce_loss": 0.07254141569137573, "step": 25720 }, { "epoch": 8.582388258839226, "loss": 0.3982, "step": 25730 }, { "epoch": 8.582388258839226, "grad_norm": 2.35874342918396, "step": 25730 }, { "epoch": 8.582388258839226, "learning_rate": 4.1426278718581424e-05, "step": 25730 }, { "epoch": 8.582388258839226, "loss": 0.3429741859436035, "step": 25730 }, { "ce_loss": 0.059212736785411835, "epoch": 8.582388258839226, "step": 25730 }, { "distill_loss": 0.1997862607240677, "epoch": 8.582388258839226, "step": 25730 }, { "epoch": 8.582388258839226, "ref_ce_loss": 0.06289255619049072, "step": 25730 }, { "epoch": 8.582388258839226, "loss": 0.2678738534450531, "step": 25730 }, { "ce_loss": 0.03431813791394234, "epoch": 8.582388258839226, "step": 25730 }, { "distill_loss": 0.1464090496301651, "epoch": 8.582388258839226, "step": 25730 }, { "epoch": 8.582388258839226, "ref_ce_loss": 0.06843450665473938, "step": 25730 }, { "epoch": 8.585723815877252, "loss": 0.4033, "step": 25740 }, { "epoch": 8.585723815877252, "grad_norm": 1.263071894645691, "step": 25740 }, { "epoch": 8.585723815877252, "learning_rate": 4.123497761815776e-05, "step": 25740 }, { "epoch": 8.585723815877252, "loss": 0.44260334968566895, "step": 25740 }, { "ce_loss": 0.08691255003213882, "epoch": 8.585723815877252, "step": 25740 }, { "distill_loss": 0.2279985100030899, "epoch": 8.585723815877252, "step": 25740 }, { "epoch": 8.585723815877252, "ref_ce_loss": 0.0896274745464325, "step": 25740 }, { "epoch": 8.585723815877252, "loss": 0.4468820095062256, "step": 25740 }, { "ce_loss": 0.03885522857308388, "epoch": 8.585723815877252, "step": 25740 }, { "distill_loss": 0.14059850573539734, "epoch": 8.585723815877252, "step": 25740 }, { "epoch": 8.585723815877252, "ref_ce_loss": 0.06513622403144836, "step": 25740 }, { "epoch": 8.589059372915276, "loss": 0.3826, "step": 25750 }, { "epoch": 8.589059372915276, "grad_norm": 2.3015244007110596, "step": 25750 }, { "epoch": 8.589059372915276, "learning_rate": 4.1044095235120004e-05, "step": 25750 }, { "epoch": 8.589059372915276, "loss": 0.5193451642990112, "step": 25750 }, { "ce_loss": 0.04930582642555237, "epoch": 8.589059372915276, "step": 25750 }, { "distill_loss": 0.19719862937927246, "epoch": 8.589059372915276, "step": 25750 }, { "epoch": 8.589059372915276, "ref_ce_loss": 0.07023698091506958, "step": 25750 }, { "epoch": 8.589059372915276, "loss": 0.40798452496528625, "step": 25750 }, { "ce_loss": 0.08019145578145981, "epoch": 8.589059372915276, "step": 25750 }, { "distill_loss": 0.20547153055667877, "epoch": 8.589059372915276, "step": 25750 }, { "epoch": 8.589059372915276, "ref_ce_loss": 0.08603661507368088, "step": 25750 }, { "epoch": 8.592394929953302, "loss": 0.3662, "step": 25760 }, { "epoch": 8.592394929953302, "grad_norm": 1.2376877069473267, "step": 25760 }, { "epoch": 8.592394929953302, "learning_rate": 4.085363179224832e-05, "step": 25760 }, { "epoch": 8.592394929953302, "loss": 0.4375184178352356, "step": 25760 }, { "ce_loss": 0.11429724097251892, "epoch": 8.592394929953302, "step": 25760 }, { "distill_loss": 0.23312397301197052, "epoch": 8.592394929953302, "step": 25760 }, { "epoch": 8.592394929953302, "ref_ce_loss": 0.08966071158647537, "step": 25760 }, { "epoch": 8.592394929953302, "loss": 0.4259558618068695, "step": 25760 }, { "ce_loss": 0.09264373034238815, "epoch": 8.592394929953302, "step": 25760 }, { "distill_loss": 0.19215357303619385, "epoch": 8.592394929953302, "step": 25760 }, { "epoch": 8.592394929953302, "ref_ce_loss": 0.06445678323507309, "step": 25760 }, { "epoch": 8.595730486991327, "loss": 0.4045, "step": 25770 }, { "epoch": 8.595730486991327, "grad_norm": 1.1412529945373535, "step": 25770 }, { "epoch": 8.595730486991327, "learning_rate": 4.06635875118341e-05, "step": 25770 }, { "epoch": 8.595730486991327, "loss": 0.28406891226768494, "step": 25770 }, { "ce_loss": 0.05148237571120262, "epoch": 8.595730486991327, "step": 25770 }, { "distill_loss": 0.17395122349262238, "epoch": 8.595730486991327, "step": 25770 }, { "epoch": 8.595730486991327, "ref_ce_loss": 0.03983690217137337, "step": 25770 }, { "epoch": 8.595730486991327, "loss": 0.3416021764278412, "step": 25770 }, { "ce_loss": 0.08922668546438217, "epoch": 8.595730486991327, "step": 25770 }, { "distill_loss": 0.17985227704048157, "epoch": 8.595730486991327, "step": 25770 }, { "epoch": 8.595730486991327, "ref_ce_loss": 0.056731123477220535, "step": 25770 }, { "epoch": 8.599066044029353, "loss": 0.462, "step": 25780 }, { "epoch": 8.599066044029353, "grad_norm": 1.3721615076065063, "step": 25780 }, { "epoch": 8.599066044029353, "learning_rate": 4.047396261567942e-05, "step": 25780 }, { "epoch": 8.599066044029353, "loss": 0.543890118598938, "step": 25780 }, { "ce_loss": 0.05241483822464943, "epoch": 8.599066044029353, "step": 25780 }, { "distill_loss": 0.21588771045207977, "epoch": 8.599066044029353, "step": 25780 }, { "epoch": 8.599066044029353, "ref_ce_loss": 0.07171756029129028, "step": 25780 }, { "epoch": 8.599066044029353, "loss": 0.40631309151649475, "step": 25780 }, { "ce_loss": 0.09465570747852325, "epoch": 8.599066044029353, "step": 25780 }, { "distill_loss": 0.22124746441841125, "epoch": 8.599066044029353, "step": 25780 }, { "epoch": 8.599066044029353, "ref_ce_loss": 0.09005552530288696, "step": 25780 }, { "epoch": 8.602401601067378, "loss": 0.4049, "step": 25790 }, { "epoch": 8.602401601067378, "grad_norm": 1.1606225967407227, "step": 25790 }, { "epoch": 8.602401601067378, "learning_rate": 4.0284757325097066e-05, "step": 25790 }, { "epoch": 8.602401601067378, "loss": 0.218998983502388, "step": 25790 }, { "ce_loss": 0.02312006615102291, "epoch": 8.602401601067378, "step": 25790 }, { "distill_loss": 0.12065310776233673, "epoch": 8.602401601067378, "step": 25790 }, { "epoch": 8.602401601067378, "ref_ce_loss": 0.04846136271953583, "step": 25790 }, { "epoch": 8.602401601067378, "loss": 0.279868483543396, "step": 25790 }, { "ce_loss": 0.04042745381593704, "epoch": 8.602401601067378, "step": 25790 }, { "distill_loss": 0.15828919410705566, "epoch": 8.602401601067378, "step": 25790 }, { "epoch": 8.602401601067378, "ref_ce_loss": 0.05135457217693329, "step": 25790 }, { "epoch": 8.605737158105404, "loss": 0.4405, "step": 25800 }, { "epoch": 8.605737158105404, "grad_norm": 1.4962409734725952, "step": 25800 }, { "epoch": 8.605737158105404, "learning_rate": 4.0095971860909784e-05, "step": 25800 }, { "epoch": 8.605737158105404, "loss": 0.5475419163703918, "step": 25800 }, { "ce_loss": 0.06993673741817474, "epoch": 8.605737158105404, "step": 25800 }, { "distill_loss": 0.1747581511735916, "epoch": 8.605737158105404, "step": 25800 }, { "epoch": 8.605737158105404, "ref_ce_loss": 0.07766447216272354, "step": 25800 }, { "epoch": 8.605737158105404, "loss": 0.4323030114173889, "step": 25800 }, { "ce_loss": 0.0695943534374237, "epoch": 8.605737158105404, "step": 25800 }, { "distill_loss": 0.17964260280132294, "epoch": 8.605737158105404, "step": 25800 }, { "epoch": 8.605737158105404, "ref_ce_loss": 0.08051568269729614, "step": 25800 }, { "epoch": 8.609072715143428, "loss": 0.3955, "step": 25810 }, { "epoch": 8.609072715143428, "grad_norm": 1.6940163373947144, "step": 25810 }, { "epoch": 8.609072715143428, "learning_rate": 3.9907606443450615e-05, "step": 25810 }, { "epoch": 8.609072715143428, "loss": 0.45385682582855225, "step": 25810 }, { "ce_loss": 0.05719995126128197, "epoch": 8.609072715143428, "step": 25810 }, { "distill_loss": 0.19794762134552002, "epoch": 8.609072715143428, "step": 25810 }, { "epoch": 8.609072715143428, "ref_ce_loss": 0.08196888864040375, "step": 25810 }, { "epoch": 8.609072715143428, "loss": 0.33113548159599304, "step": 25810 }, { "ce_loss": 0.07105362415313721, "epoch": 8.609072715143428, "step": 25810 }, { "distill_loss": 0.1685401201248169, "epoch": 8.609072715143428, "step": 25810 }, { "epoch": 8.609072715143428, "ref_ce_loss": 0.06278117746114731, "step": 25810 }, { "epoch": 8.612408272181455, "loss": 0.3778, "step": 25820 }, { "epoch": 8.612408272181455, "grad_norm": 1.8563313484191895, "step": 25820 }, { "epoch": 8.612408272181455, "learning_rate": 3.9719661292562285e-05, "step": 25820 }, { "epoch": 8.612408272181455, "loss": 0.2569780945777893, "step": 25820 }, { "ce_loss": 0.0177643820643425, "epoch": 8.612408272181455, "step": 25820 }, { "distill_loss": 0.14377647638320923, "epoch": 8.612408272181455, "step": 25820 }, { "epoch": 8.612408272181455, "ref_ce_loss": 0.061776284128427505, "step": 25820 }, { "epoch": 8.612408272181455, "loss": 0.5325338244438171, "step": 25820 }, { "ce_loss": 0.0745001882314682, "epoch": 8.612408272181455, "step": 25820 }, { "distill_loss": 0.21417376399040222, "epoch": 8.612408272181455, "step": 25820 }, { "epoch": 8.612408272181455, "ref_ce_loss": 0.08413225412368774, "step": 25820 }, { "epoch": 8.615743829219479, "loss": 0.3739, "step": 25830 }, { "epoch": 8.615743829219479, "grad_norm": 1.0426115989685059, "step": 25830 }, { "epoch": 8.615743829219479, "learning_rate": 3.9532136627597094e-05, "step": 25830 }, { "epoch": 8.615743829219479, "loss": 0.6225663423538208, "step": 25830 }, { "ce_loss": 0.04983127489686012, "epoch": 8.615743829219479, "step": 25830 }, { "distill_loss": 0.183266282081604, "epoch": 8.615743829219479, "step": 25830 }, { "epoch": 8.615743829219479, "ref_ce_loss": 0.07555000483989716, "step": 25830 }, { "epoch": 8.615743829219479, "loss": 0.5039831399917603, "step": 25830 }, { "ce_loss": 0.09830626845359802, "epoch": 8.615743829219479, "step": 25830 }, { "distill_loss": 0.19829033315181732, "epoch": 8.615743829219479, "step": 25830 }, { "epoch": 8.615743829219479, "ref_ce_loss": 0.08459506183862686, "step": 25830 }, { "epoch": 8.619079386257505, "loss": 0.3921, "step": 25840 }, { "epoch": 8.619079386257505, "grad_norm": 1.0925133228302002, "step": 25840 }, { "epoch": 8.619079386257505, "learning_rate": 3.9345032667416295e-05, "step": 25840 }, { "epoch": 8.619079386257505, "loss": 0.45747706294059753, "step": 25840 }, { "ce_loss": 0.0557032972574234, "epoch": 8.619079386257505, "step": 25840 }, { "distill_loss": 0.2225656658411026, "epoch": 8.619079386257505, "step": 25840 }, { "epoch": 8.619079386257505, "ref_ce_loss": 0.05074243247509003, "step": 25840 }, { "epoch": 8.619079386257505, "loss": 0.3209838569164276, "step": 25840 }, { "ce_loss": 0.04327197000384331, "epoch": 8.619079386257505, "step": 25840 }, { "distill_loss": 0.1957104504108429, "epoch": 8.619079386257505, "step": 25840 }, { "epoch": 8.619079386257505, "ref_ce_loss": 0.06365178525447845, "step": 25840 }, { "epoch": 8.62241494329553, "loss": 0.396, "step": 25850 }, { "epoch": 8.62241494329553, "grad_norm": 0.9981101155281067, "step": 25850 }, { "epoch": 8.62241494329553, "learning_rate": 3.91583496303904e-05, "step": 25850 }, { "epoch": 8.62241494329553, "loss": 0.3310213088989258, "step": 25850 }, { "ce_loss": 0.05629245564341545, "epoch": 8.62241494329553, "step": 25850 }, { "distill_loss": 0.20885275304317474, "epoch": 8.62241494329553, "step": 25850 }, { "epoch": 8.62241494329553, "ref_ce_loss": 0.06559545546770096, "step": 25850 }, { "epoch": 8.62241494329553, "loss": 0.3649715483188629, "step": 25850 }, { "ce_loss": 0.09426181763410568, "epoch": 8.62241494329553, "step": 25850 }, { "distill_loss": 0.19951693713665009, "epoch": 8.62241494329553, "step": 25850 }, { "epoch": 8.62241494329553, "ref_ce_loss": 0.07036200910806656, "step": 25850 }, { "epoch": 8.625750500333556, "loss": 0.4081, "step": 25860 }, { "epoch": 8.625750500333556, "grad_norm": 1.180769681930542, "step": 25860 }, { "epoch": 8.625750500333556, "learning_rate": 3.897208773439878e-05, "step": 25860 }, { "epoch": 8.625750500333556, "loss": 0.3026542365550995, "step": 25860 }, { "ce_loss": 0.05167260766029358, "epoch": 8.625750500333556, "step": 25860 }, { "distill_loss": 0.16578106582164764, "epoch": 8.625750500333556, "step": 25860 }, { "epoch": 8.625750500333556, "ref_ce_loss": 0.06161735951900482, "step": 25860 }, { "epoch": 8.625750500333556, "loss": 0.47855818271636963, "step": 25860 }, { "ce_loss": 0.07787732034921646, "epoch": 8.625750500333556, "step": 25860 }, { "distill_loss": 0.21267056465148926, "epoch": 8.625750500333556, "step": 25860 }, { "epoch": 8.625750500333556, "ref_ce_loss": 0.08411581069231033, "step": 25860 }, { "epoch": 8.62908605737158, "loss": 0.4078, "step": 25870 }, { "epoch": 8.62908605737158, "grad_norm": 1.249125599861145, "step": 25870 }, { "epoch": 8.62908605737158, "learning_rate": 3.878624719682891e-05, "step": 25870 }, { "epoch": 8.62908605737158, "loss": 0.5326840281486511, "step": 25870 }, { "ce_loss": 0.09571799635887146, "epoch": 8.62908605737158, "step": 25870 }, { "distill_loss": 0.24862737953662872, "epoch": 8.62908605737158, "step": 25870 }, { "epoch": 8.62908605737158, "ref_ce_loss": 0.09078372269868851, "step": 25870 }, { "epoch": 8.62908605737158, "loss": 0.2817167639732361, "step": 25870 }, { "ce_loss": 0.029775870963931084, "epoch": 8.62908605737158, "step": 25870 }, { "distill_loss": 0.15809574723243713, "epoch": 8.62908605737158, "step": 25870 }, { "epoch": 8.62908605737158, "ref_ce_loss": 0.07221907377243042, "step": 25870 }, { "epoch": 8.632421614409607, "loss": 0.4131, "step": 25880 }, { "epoch": 8.632421614409607, "grad_norm": 1.827296257019043, "step": 25880 }, { "epoch": 8.632421614409607, "learning_rate": 3.8600828234576804e-05, "step": 25880 }, { "epoch": 8.632421614409607, "loss": 0.41917675733566284, "step": 25880 }, { "ce_loss": 0.1008242815732956, "epoch": 8.632421614409607, "step": 25880 }, { "distill_loss": 0.17240183055400848, "epoch": 8.632421614409607, "step": 25880 }, { "epoch": 8.632421614409607, "ref_ce_loss": 0.08299916237592697, "step": 25880 }, { "epoch": 8.632421614409607, "loss": 0.35314908623695374, "step": 25880 }, { "ce_loss": 0.05821432173252106, "epoch": 8.632421614409607, "step": 25880 }, { "distill_loss": 0.17668472230434418, "epoch": 8.632421614409607, "step": 25880 }, { "epoch": 8.632421614409607, "ref_ce_loss": 0.0680556446313858, "step": 25880 }, { "epoch": 8.635757171447631, "loss": 0.4212, "step": 25890 }, { "epoch": 8.635757171447631, "grad_norm": 1.620147943496704, "step": 25890 }, { "epoch": 8.635757171447631, "learning_rate": 3.841583106404629e-05, "step": 25890 }, { "epoch": 8.635757171447631, "loss": 0.31113311648368835, "step": 25890 }, { "ce_loss": 0.05124642327427864, "epoch": 8.635757171447631, "step": 25890 }, { "distill_loss": 0.18941840529441833, "epoch": 8.635757171447631, "step": 25890 }, { "epoch": 8.635757171447631, "ref_ce_loss": 0.07025238871574402, "step": 25890 }, { "epoch": 8.635757171447631, "loss": 0.36012572050094604, "step": 25890 }, { "ce_loss": 0.053658533841371536, "epoch": 8.635757171447631, "step": 25890 }, { "distill_loss": 0.178445965051651, "epoch": 8.635757171447631, "step": 25890 }, { "epoch": 8.635757171447631, "ref_ce_loss": 0.0968189686536789, "step": 25890 }, { "epoch": 8.639092728485657, "loss": 0.3521, "step": 25900 }, { "epoch": 8.639092728485657, "grad_norm": 15.946155548095703, "step": 25900 }, { "epoch": 8.639092728485657, "learning_rate": 3.823125590114907e-05, "step": 25900 }, { "epoch": 8.639092728485657, "loss": 0.30347707867622375, "step": 25900 }, { "ce_loss": 0.043511081486940384, "epoch": 8.639092728485657, "step": 25900 }, { "distill_loss": 0.15417037904262543, "epoch": 8.639092728485657, "step": 25900 }, { "epoch": 8.639092728485657, "ref_ce_loss": 0.0651017352938652, "step": 25900 }, { "epoch": 8.639092728485657, "loss": 0.32813578844070435, "step": 25900 }, { "ce_loss": 0.061788931488990784, "epoch": 8.639092728485657, "step": 25900 }, { "distill_loss": 0.1606302708387375, "epoch": 8.639092728485657, "step": 25900 }, { "epoch": 8.639092728485657, "ref_ce_loss": 0.06535312533378601, "step": 25900 }, { "epoch": 8.642428285523682, "loss": 0.3467, "step": 25910 }, { "epoch": 8.642428285523682, "grad_norm": 1.2270931005477905, "step": 25910 }, { "epoch": 8.642428285523682, "learning_rate": 3.804710296130405e-05, "step": 25910 }, { "epoch": 8.642428285523682, "loss": 0.3107259273529053, "step": 25910 }, { "ce_loss": 0.05802823603153229, "epoch": 8.642428285523682, "step": 25910 }, { "distill_loss": 0.15126903355121613, "epoch": 8.642428285523682, "step": 25910 }, { "epoch": 8.642428285523682, "ref_ce_loss": 0.06690887361764908, "step": 25910 }, { "epoch": 8.642428285523682, "loss": 0.5598396062850952, "step": 25910 }, { "ce_loss": 0.06821523606777191, "epoch": 8.642428285523682, "step": 25910 }, { "distill_loss": 0.22665062546730042, "epoch": 8.642428285523682, "step": 25910 }, { "epoch": 8.642428285523682, "ref_ce_loss": 0.10408192873001099, "step": 25910 }, { "epoch": 8.645763842561708, "loss": 0.4112, "step": 25920 }, { "epoch": 8.645763842561708, "grad_norm": 1.304826259613037, "step": 25920 }, { "epoch": 8.645763842561708, "learning_rate": 3.786337245943763e-05, "step": 25920 }, { "epoch": 8.645763842561708, "loss": 0.3197181820869446, "step": 25920 }, { "ce_loss": 0.042518723756074905, "epoch": 8.645763842561708, "step": 25920 }, { "distill_loss": 0.20466618239879608, "epoch": 8.645763842561708, "step": 25920 }, { "epoch": 8.645763842561708, "ref_ce_loss": 0.05158605799078941, "step": 25920 }, { "epoch": 8.645763842561708, "loss": 0.4145950376987457, "step": 25920 }, { "ce_loss": 0.09057069569826126, "epoch": 8.645763842561708, "step": 25920 }, { "distill_loss": 0.20773470401763916, "epoch": 8.645763842561708, "step": 25920 }, { "epoch": 8.645763842561708, "ref_ce_loss": 0.09864034503698349, "step": 25920 }, { "epoch": 8.649099399599733, "loss": 0.3941, "step": 25930 }, { "epoch": 8.649099399599733, "grad_norm": 1.0435384511947632, "step": 25930 }, { "epoch": 8.649099399599733, "learning_rate": 3.768006460998303e-05, "step": 25930 }, { "epoch": 8.649099399599733, "loss": 0.46100571751594543, "step": 25930 }, { "ce_loss": 0.1150127723813057, "epoch": 8.649099399599733, "step": 25930 }, { "distill_loss": 0.19515320658683777, "epoch": 8.649099399599733, "step": 25930 }, { "epoch": 8.649099399599733, "ref_ce_loss": 0.0839739441871643, "step": 25930 }, { "epoch": 8.649099399599733, "loss": 0.26864948868751526, "step": 25930 }, { "ce_loss": 0.049335312098264694, "epoch": 8.649099399599733, "step": 25930 }, { "distill_loss": 0.15795031189918518, "epoch": 8.649099399599733, "step": 25930 }, { "epoch": 8.649099399599733, "ref_ce_loss": 0.06104101613163948, "step": 25930 }, { "epoch": 8.652434956637759, "loss": 0.3951, "step": 25940 }, { "epoch": 8.652434956637759, "grad_norm": 1.5876872539520264, "step": 25940 }, { "epoch": 8.652434956637759, "learning_rate": 3.749717962688033e-05, "step": 25940 }, { "epoch": 8.652434956637759, "loss": 0.34112781286239624, "step": 25940 }, { "ce_loss": 0.04529694467782974, "epoch": 8.652434956637759, "step": 25940 }, { "distill_loss": 0.15988574922084808, "epoch": 8.652434956637759, "step": 25940 }, { "epoch": 8.652434956637759, "ref_ce_loss": 0.06164150685071945, "step": 25940 }, { "epoch": 8.652434956637759, "loss": 0.3117307424545288, "step": 25940 }, { "ce_loss": 0.036282043904066086, "epoch": 8.652434956637759, "step": 25940 }, { "distill_loss": 0.18413367867469788, "epoch": 8.652434956637759, "step": 25940 }, { "epoch": 8.652434956637759, "ref_ce_loss": 0.06488938629627228, "step": 25940 }, { "epoch": 8.655770513675783, "loss": 0.3999, "step": 25950 }, { "epoch": 8.655770513675783, "grad_norm": 1.2973941564559937, "step": 25950 }, { "epoch": 8.655770513675783, "learning_rate": 3.7314717723575934e-05, "step": 25950 }, { "epoch": 8.655770513675783, "loss": 0.41473087668418884, "step": 25950 }, { "ce_loss": 0.07483311742544174, "epoch": 8.655770513675783, "step": 25950 }, { "distill_loss": 0.17405973374843597, "epoch": 8.655770513675783, "step": 25950 }, { "epoch": 8.655770513675783, "ref_ce_loss": 0.08251118659973145, "step": 25950 }, { "epoch": 8.655770513675783, "loss": 0.46500372886657715, "step": 25950 }, { "ce_loss": 0.04539445415139198, "epoch": 8.655770513675783, "step": 25950 }, { "distill_loss": 0.21199798583984375, "epoch": 8.655770513675783, "step": 25950 }, { "epoch": 8.655770513675783, "ref_ce_loss": 0.06681691110134125, "step": 25950 }, { "epoch": 8.65910607071381, "loss": 0.436, "step": 25960 }, { "epoch": 8.65910607071381, "grad_norm": 1.4216866493225098, "step": 25960 }, { "epoch": 8.65910607071381, "learning_rate": 3.7132679113022385e-05, "step": 25960 }, { "epoch": 8.65910607071381, "loss": 0.35822737216949463, "step": 25960 }, { "ce_loss": 0.04087655246257782, "epoch": 8.65910607071381, "step": 25960 }, { "distill_loss": 0.19274145364761353, "epoch": 8.65910607071381, "step": 25960 }, { "epoch": 8.65910607071381, "ref_ce_loss": 0.0882585272192955, "step": 25960 }, { "epoch": 8.65910607071381, "loss": 0.34755536913871765, "step": 25960 }, { "ce_loss": 0.05266533046960831, "epoch": 8.65910607071381, "step": 25960 }, { "distill_loss": 0.17872610688209534, "epoch": 8.65910607071381, "step": 25960 }, { "epoch": 8.65910607071381, "ref_ce_loss": 0.07696732878684998, "step": 25960 }, { "epoch": 8.662441627751834, "loss": 0.4989, "step": 25970 }, { "epoch": 8.662441627751834, "grad_norm": 3.4985039234161377, "step": 25970 }, { "epoch": 8.662441627751834, "learning_rate": 3.695106400767854e-05, "step": 25970 }, { "epoch": 8.662441627751834, "loss": 0.3335474133491516, "step": 25970 }, { "ce_loss": 0.03969917073845863, "epoch": 8.662441627751834, "step": 25970 }, { "distill_loss": 0.19208571314811707, "epoch": 8.662441627751834, "step": 25970 }, { "epoch": 8.662441627751834, "ref_ce_loss": 0.06494186818599701, "step": 25970 }, { "epoch": 8.662441627751834, "loss": 0.3575483560562134, "step": 25970 }, { "ce_loss": 0.059382472187280655, "epoch": 8.662441627751834, "step": 25970 }, { "distill_loss": 0.16217409074306488, "epoch": 8.662441627751834, "step": 25970 }, { "epoch": 8.662441627751834, "ref_ce_loss": 0.056934986263513565, "step": 25970 }, { "epoch": 8.66577718478986, "loss": 0.3586, "step": 25980 }, { "epoch": 8.66577718478986, "grad_norm": 1.2654645442962646, "step": 25980 }, { "epoch": 8.66577718478986, "learning_rate": 3.676987261950875e-05, "step": 25980 }, { "epoch": 8.66577718478986, "loss": 0.3632966876029968, "step": 25980 }, { "ce_loss": 0.04773798957467079, "epoch": 8.66577718478986, "step": 25980 }, { "distill_loss": 0.16464367508888245, "epoch": 8.66577718478986, "step": 25980 }, { "epoch": 8.66577718478986, "ref_ce_loss": 0.050506629049777985, "step": 25980 }, { "epoch": 8.66577718478986, "loss": 0.2918108403682709, "step": 25980 }, { "ce_loss": 0.043732013553380966, "epoch": 8.66577718478986, "step": 25980 }, { "distill_loss": 0.142713725566864, "epoch": 8.66577718478986, "step": 25980 }, { "epoch": 8.66577718478986, "ref_ce_loss": 0.07189074903726578, "step": 25980 }, { "epoch": 8.669112741827885, "loss": 0.4013, "step": 25990 }, { "epoch": 8.669112741827885, "grad_norm": 1.0764323472976685, "step": 25990 }, { "epoch": 8.669112741827885, "learning_rate": 3.65891051599827e-05, "step": 25990 }, { "epoch": 8.669112741827885, "loss": 0.4605330228805542, "step": 25990 }, { "ce_loss": 0.08776116371154785, "epoch": 8.669112741827885, "step": 25990 }, { "distill_loss": 0.21826300024986267, "epoch": 8.669112741827885, "step": 25990 }, { "epoch": 8.669112741827885, "ref_ce_loss": 0.09678833931684494, "step": 25990 }, { "epoch": 8.669112741827885, "loss": 0.4143109619617462, "step": 25990 }, { "ce_loss": 0.08480346202850342, "epoch": 8.669112741827885, "step": 25990 }, { "distill_loss": 0.19863438606262207, "epoch": 8.669112741827885, "step": 25990 }, { "epoch": 8.669112741827885, "ref_ce_loss": 0.10676558315753937, "step": 25990 }, { "epoch": 8.67244829886591, "loss": 0.381, "step": 26000 }, { "epoch": 8.67244829886591, "grad_norm": 1.4833221435546875, "step": 26000 }, { "epoch": 8.67244829886591, "learning_rate": 3.64087618400756e-05, "step": 26000 }, { "epoch": 8.67244829886591, "loss": 0.2441224455833435, "step": 26000 }, { "ce_loss": 0.02226584032177925, "epoch": 8.67244829886591, "step": 26000 }, { "distill_loss": 0.1351041942834854, "epoch": 8.67244829886591, "step": 26000 }, { "epoch": 8.67244829886591, "ref_ce_loss": 0.0599244050681591, "step": 26000 }, { "epoch": 8.67244829886591, "loss": 0.31449344754219055, "step": 26000 }, { "ce_loss": 0.05308458209037781, "epoch": 8.67244829886591, "step": 26000 }, { "distill_loss": 0.18237219750881195, "epoch": 8.67244829886591, "step": 26000 }, { "epoch": 8.67244829886591, "ref_ce_loss": 0.06047021597623825, "step": 26000 }, { "epoch": 8.675783855903935, "loss": 0.35, "step": 26010 }, { "epoch": 8.675783855903935, "grad_norm": 1.7155975103378296, "step": 26010 }, { "epoch": 8.675783855903935, "learning_rate": 3.622884287026742e-05, "step": 26010 }, { "epoch": 8.675783855903935, "loss": 0.42710235714912415, "step": 26010 }, { "ce_loss": 0.06678865104913712, "epoch": 8.675783855903935, "step": 26010 }, { "distill_loss": 0.18016648292541504, "epoch": 8.675783855903935, "step": 26010 }, { "epoch": 8.675783855903935, "ref_ce_loss": 0.0980868712067604, "step": 26010 }, { "epoch": 8.675783855903935, "loss": 0.5652369260787964, "step": 26010 }, { "ce_loss": 0.031212544068694115, "epoch": 8.675783855903935, "step": 26010 }, { "distill_loss": 0.16358351707458496, "epoch": 8.675783855903935, "step": 26010 }, { "epoch": 8.675783855903935, "ref_ce_loss": 0.0654633492231369, "step": 26010 }, { "epoch": 8.679119412941962, "loss": 0.4155, "step": 26020 }, { "epoch": 8.679119412941962, "grad_norm": 1.209120512008667, "step": 26020 }, { "epoch": 8.679119412941962, "learning_rate": 3.604934846054309e-05, "step": 26020 }, { "epoch": 8.679119412941962, "loss": 0.3771757483482361, "step": 26020 }, { "ce_loss": 0.028000246733427048, "epoch": 8.679119412941962, "step": 26020 }, { "distill_loss": 0.14734534919261932, "epoch": 8.679119412941962, "step": 26020 }, { "epoch": 8.679119412941962, "ref_ce_loss": 0.061381395906209946, "step": 26020 }, { "epoch": 8.679119412941962, "loss": 0.21141190826892853, "step": 26020 }, { "ce_loss": 0.02972903661429882, "epoch": 8.679119412941962, "step": 26020 }, { "distill_loss": 0.13676223158836365, "epoch": 8.679119412941962, "step": 26020 }, { "epoch": 8.679119412941962, "ref_ce_loss": 0.0446452833712101, "step": 26020 }, { "epoch": 8.682454969979986, "loss": 0.3541, "step": 26030 }, { "epoch": 8.682454969979986, "grad_norm": 1.1638330221176147, "step": 26030 }, { "epoch": 8.682454969979986, "learning_rate": 3.5870278820391777e-05, "step": 26030 }, { "epoch": 8.682454969979986, "loss": 0.5268926024436951, "step": 26030 }, { "ce_loss": 0.06840582937002182, "epoch": 8.682454969979986, "step": 26030 }, { "distill_loss": 0.2558664083480835, "epoch": 8.682454969979986, "step": 26030 }, { "epoch": 8.682454969979986, "ref_ce_loss": 0.07109972089529037, "step": 26030 }, { "epoch": 8.682454969979986, "loss": 0.475436270236969, "step": 26030 }, { "ce_loss": 0.056556470692157745, "epoch": 8.682454969979986, "step": 26030 }, { "distill_loss": 0.22109992802143097, "epoch": 8.682454969979986, "step": 26030 }, { "epoch": 8.682454969979986, "ref_ce_loss": 0.059039387851953506, "step": 26030 }, { "epoch": 8.685790527018012, "loss": 0.415, "step": 26040 }, { "epoch": 8.685790527018012, "grad_norm": 2.160926342010498, "step": 26040 }, { "epoch": 8.685790527018012, "learning_rate": 3.569163415880703e-05, "step": 26040 }, { "epoch": 8.685790527018012, "loss": 0.4365619421005249, "step": 26040 }, { "ce_loss": 0.0994822308421135, "epoch": 8.685790527018012, "step": 26040 }, { "distill_loss": 0.21024930477142334, "epoch": 8.685790527018012, "step": 26040 }, { "epoch": 8.685790527018012, "ref_ce_loss": 0.06467576324939728, "step": 26040 }, { "epoch": 8.685790527018012, "loss": 0.24994733929634094, "step": 26040 }, { "ce_loss": 0.029614150524139404, "epoch": 8.685790527018012, "step": 26040 }, { "distill_loss": 0.14794522523880005, "epoch": 8.685790527018012, "step": 26040 }, { "epoch": 8.685790527018012, "ref_ce_loss": 0.07209348678588867, "step": 26040 }, { "epoch": 8.689126084056037, "loss": 0.374, "step": 26050 }, { "epoch": 8.689126084056037, "grad_norm": 1.0623009204864502, "step": 26050 }, { "epoch": 8.689126084056037, "learning_rate": 3.551341468428642e-05, "step": 26050 }, { "epoch": 8.689126084056037, "loss": 0.3026094436645508, "step": 26050 }, { "ce_loss": 0.04317136108875275, "epoch": 8.689126084056037, "step": 26050 }, { "distill_loss": 0.13667577505111694, "epoch": 8.689126084056037, "step": 26050 }, { "epoch": 8.689126084056037, "ref_ce_loss": 0.046481385827064514, "step": 26050 }, { "epoch": 8.689126084056037, "loss": 0.4735656678676605, "step": 26050 }, { "ce_loss": 0.08374132961034775, "epoch": 8.689126084056037, "step": 26050 }, { "distill_loss": 0.2770083546638489, "epoch": 8.689126084056037, "step": 26050 }, { "epoch": 8.689126084056037, "ref_ce_loss": 0.08520796149969101, "step": 26050 }, { "epoch": 8.692461641094063, "loss": 0.4028, "step": 26060 }, { "epoch": 8.692461641094063, "grad_norm": 1.1022357940673828, "step": 26060 }, { "epoch": 8.692461641094063, "learning_rate": 3.533562060483133e-05, "step": 26060 }, { "epoch": 8.692461641094063, "loss": 0.2600478529930115, "step": 26060 }, { "ce_loss": 0.04642507806420326, "epoch": 8.692461641094063, "step": 26060 }, { "distill_loss": 0.15849417448043823, "epoch": 8.692461641094063, "step": 26060 }, { "epoch": 8.692461641094063, "ref_ce_loss": 0.0549236461520195, "step": 26060 }, { "epoch": 8.692461641094063, "loss": 0.5013510584831238, "step": 26060 }, { "ce_loss": 0.0545508898794651, "epoch": 8.692461641094063, "step": 26060 }, { "distill_loss": 0.19478538632392883, "epoch": 8.692461641094063, "step": 26060 }, { "epoch": 8.692461641094063, "ref_ce_loss": 0.06514310836791992, "step": 26060 }, { "epoch": 8.695797198132087, "loss": 0.404, "step": 26070 }, { "epoch": 8.695797198132087, "grad_norm": 1.3652430772781372, "step": 26070 }, { "epoch": 8.695797198132087, "learning_rate": 3.515825212794637e-05, "step": 26070 }, { "epoch": 8.695797198132087, "loss": 0.3231005370616913, "step": 26070 }, { "ce_loss": 0.062078312039375305, "epoch": 8.695797198132087, "step": 26070 }, { "distill_loss": 0.16984239220619202, "epoch": 8.695797198132087, "step": 26070 }, { "epoch": 8.695797198132087, "ref_ce_loss": 0.0763070359826088, "step": 26070 }, { "epoch": 8.695797198132087, "loss": 0.3238498568534851, "step": 26070 }, { "ce_loss": 0.07663042098283768, "epoch": 8.695797198132087, "step": 26070 }, { "distill_loss": 0.17740431427955627, "epoch": 8.695797198132087, "step": 26070 }, { "epoch": 8.695797198132087, "ref_ce_loss": 0.06952373683452606, "step": 26070 }, { "epoch": 8.699132755170114, "loss": 0.3645, "step": 26080 }, { "epoch": 8.699132755170114, "grad_norm": 1.1212877035140991, "step": 26080 }, { "epoch": 8.699132755170114, "learning_rate": 3.498130946063984e-05, "step": 26080 }, { "epoch": 8.699132755170114, "loss": 0.48007920384407043, "step": 26080 }, { "ce_loss": 0.05316156893968582, "epoch": 8.699132755170114, "step": 26080 }, { "distill_loss": 0.20693735778331757, "epoch": 8.699132755170114, "step": 26080 }, { "epoch": 8.699132755170114, "ref_ce_loss": 0.061040036380290985, "step": 26080 }, { "epoch": 8.699132755170114, "loss": 0.43342125415802, "step": 26080 }, { "ce_loss": 0.06705353409051895, "epoch": 8.699132755170114, "step": 26080 }, { "distill_loss": 0.21660195291042328, "epoch": 8.699132755170114, "step": 26080 }, { "epoch": 8.699132755170114, "ref_ce_loss": 0.078488789498806, "step": 26080 }, { "epoch": 8.702468312208138, "loss": 0.3753, "step": 26090 }, { "epoch": 8.702468312208138, "grad_norm": 0.9908813238143921, "step": 26090 }, { "epoch": 8.702468312208138, "learning_rate": 3.4804792809422795e-05, "step": 26090 }, { "epoch": 8.702468312208138, "loss": 0.2521405518054962, "step": 26090 }, { "ce_loss": 0.024630185216665268, "epoch": 8.702468312208138, "step": 26090 }, { "distill_loss": 0.15732663869857788, "epoch": 8.702468312208138, "step": 26090 }, { "epoch": 8.702468312208138, "ref_ce_loss": 0.06997606158256531, "step": 26090 }, { "epoch": 8.702468312208138, "loss": 0.4396592378616333, "step": 26090 }, { "ce_loss": 0.06013498455286026, "epoch": 8.702468312208138, "step": 26090 }, { "distill_loss": 0.20719094574451447, "epoch": 8.702468312208138, "step": 26090 }, { "epoch": 8.702468312208138, "ref_ce_loss": 0.07338641583919525, "step": 26090 }, { "epoch": 8.705803869246164, "loss": 0.3966, "step": 26100 }, { "epoch": 8.705803869246164, "grad_norm": 1.140104055404663, "step": 26100 }, { "epoch": 8.705803869246164, "learning_rate": 3.4628702380309263e-05, "step": 26100 }, { "epoch": 8.705803869246164, "loss": 0.487138032913208, "step": 26100 }, { "ce_loss": 0.06893015652894974, "epoch": 8.705803869246164, "step": 26100 }, { "distill_loss": 0.21911561489105225, "epoch": 8.705803869246164, "step": 26100 }, { "epoch": 8.705803869246164, "ref_ce_loss": 0.06975898146629333, "step": 26100 }, { "epoch": 8.705803869246164, "loss": 0.3690391778945923, "step": 26100 }, { "ce_loss": 0.04879662021994591, "epoch": 8.705803869246164, "step": 26100 }, { "distill_loss": 0.21773314476013184, "epoch": 8.705803869246164, "step": 26100 }, { "epoch": 8.705803869246164, "ref_ce_loss": 0.08336970210075378, "step": 26100 }, { "epoch": 8.709139426284189, "loss": 0.4231, "step": 26110 }, { "epoch": 8.709139426284189, "grad_norm": 1.1086513996124268, "step": 26110 }, { "epoch": 8.709139426284189, "learning_rate": 3.445303837881557e-05, "step": 26110 }, { "epoch": 8.709139426284189, "loss": 0.3410007953643799, "step": 26110 }, { "ce_loss": 0.03825950622558594, "epoch": 8.709139426284189, "step": 26110 }, { "distill_loss": 0.1740851104259491, "epoch": 8.709139426284189, "step": 26110 }, { "epoch": 8.709139426284189, "ref_ce_loss": 0.04727938026189804, "step": 26110 }, { "epoch": 8.709139426284189, "loss": 0.4755092263221741, "step": 26110 }, { "ce_loss": 0.09632537513971329, "epoch": 8.709139426284189, "step": 26110 }, { "distill_loss": 0.20118463039398193, "epoch": 8.709139426284189, "step": 26110 }, { "epoch": 8.709139426284189, "ref_ce_loss": 0.08340964466333389, "step": 26110 }, { "epoch": 8.712474983322215, "loss": 0.4273, "step": 26120 }, { "epoch": 8.712474983322215, "grad_norm": 1.3226609230041504, "step": 26120 }, { "epoch": 8.712474983322215, "learning_rate": 3.427780100996052e-05, "step": 26120 }, { "epoch": 8.712474983322215, "loss": 0.47836196422576904, "step": 26120 }, { "ce_loss": 0.06917423009872437, "epoch": 8.712474983322215, "step": 26120 }, { "distill_loss": 0.19600701332092285, "epoch": 8.712474983322215, "step": 26120 }, { "epoch": 8.712474983322215, "ref_ce_loss": 0.05268669128417969, "step": 26120 }, { "epoch": 8.712474983322215, "loss": 0.41963690519332886, "step": 26120 }, { "ce_loss": 0.06492365151643753, "epoch": 8.712474983322215, "step": 26120 }, { "distill_loss": 0.14877067506313324, "epoch": 8.712474983322215, "step": 26120 }, { "epoch": 8.712474983322215, "ref_ce_loss": 0.06951910257339478, "step": 26120 }, { "epoch": 8.71581054036024, "loss": 0.4161, "step": 26130 }, { "epoch": 8.71581054036024, "grad_norm": 2.280256509780884, "step": 26130 }, { "epoch": 8.71581054036024, "learning_rate": 3.4102990478265086e-05, "step": 26130 }, { "epoch": 8.71581054036024, "loss": 0.32431760430336, "step": 26130 }, { "ce_loss": 0.07514671236276627, "epoch": 8.71581054036024, "step": 26130 }, { "distill_loss": 0.17155833542346954, "epoch": 8.71581054036024, "step": 26130 }, { "epoch": 8.71581054036024, "ref_ce_loss": 0.055567435920238495, "step": 26130 }, { "epoch": 8.71581054036024, "loss": 0.43972310423851013, "step": 26130 }, { "ce_loss": 0.06478568911552429, "epoch": 8.71581054036024, "step": 26130 }, { "distill_loss": 0.19943878054618835, "epoch": 8.71581054036024, "step": 26130 }, { "epoch": 8.71581054036024, "ref_ce_loss": 0.07526401430368423, "step": 26130 }, { "epoch": 8.719146097398266, "loss": 0.396, "step": 26140 }, { "epoch": 8.719146097398266, "grad_norm": 1.3656131029129028, "step": 26140 }, { "epoch": 8.719146097398266, "learning_rate": 3.392860698775193e-05, "step": 26140 }, { "epoch": 8.719146097398266, "loss": 0.42818355560302734, "step": 26140 }, { "ce_loss": 0.10302330553531647, "epoch": 8.719146097398266, "step": 26140 }, { "distill_loss": 0.22370949387550354, "epoch": 8.719146097398266, "step": 26140 }, { "epoch": 8.719146097398266, "ref_ce_loss": 0.0865187719464302, "step": 26140 }, { "epoch": 8.719146097398266, "loss": 0.3065904676914215, "step": 26140 }, { "ce_loss": 0.041094448417425156, "epoch": 8.719146097398266, "step": 26140 }, { "distill_loss": 0.17131191492080688, "epoch": 8.719146097398266, "step": 26140 }, { "epoch": 8.719146097398266, "ref_ce_loss": 0.07021956145763397, "step": 26140 }, { "epoch": 8.72248165443629, "loss": 0.4599, "step": 26150 }, { "epoch": 8.72248165443629, "grad_norm": 2.174772024154663, "step": 26150 }, { "epoch": 8.72248165443629, "learning_rate": 3.3754650741945324e-05, "step": 26150 }, { "epoch": 8.72248165443629, "loss": 0.48152804374694824, "step": 26150 }, { "ce_loss": 0.044403236359357834, "epoch": 8.72248165443629, "step": 26150 }, { "distill_loss": 0.21540836989879608, "epoch": 8.72248165443629, "step": 26150 }, { "epoch": 8.72248165443629, "ref_ce_loss": 0.06427323073148727, "step": 26150 }, { "epoch": 8.72248165443629, "loss": 1.2604830265045166, "step": 26150 }, { "ce_loss": 0.10439719259738922, "epoch": 8.72248165443629, "step": 26150 }, { "distill_loss": 0.23386874794960022, "epoch": 8.72248165443629, "step": 26150 }, { "epoch": 8.72248165443629, "ref_ce_loss": 0.07590841501951218, "step": 26150 }, { "epoch": 8.725817211474316, "loss": 0.4233, "step": 26160 }, { "epoch": 8.725817211474316, "grad_norm": 0.880950391292572, "step": 26160 }, { "epoch": 8.725817211474316, "learning_rate": 3.358112194387086e-05, "step": 26160 }, { "epoch": 8.725817211474316, "loss": 0.34046900272369385, "step": 26160 }, { "ce_loss": 0.041662558913230896, "epoch": 8.725817211474316, "step": 26160 }, { "distill_loss": 0.21017879247665405, "epoch": 8.725817211474316, "step": 26160 }, { "epoch": 8.725817211474316, "ref_ce_loss": 0.05803197994828224, "step": 26160 }, { "epoch": 8.725817211474316, "loss": 0.36259445548057556, "step": 26160 }, { "ce_loss": 0.07198458164930344, "epoch": 8.725817211474316, "step": 26160 }, { "distill_loss": 0.1978611946105957, "epoch": 8.725817211474316, "step": 26160 }, { "epoch": 8.725817211474316, "ref_ce_loss": 0.09256937354803085, "step": 26160 }, { "epoch": 8.729152768512341, "loss": 0.3867, "step": 26170 }, { "epoch": 8.729152768512341, "grad_norm": 1.1352921724319458, "step": 26170 }, { "epoch": 8.729152768512341, "learning_rate": 3.3408020796055425e-05, "step": 26170 }, { "epoch": 8.729152768512341, "loss": 0.5861698389053345, "step": 26170 }, { "ce_loss": 0.06542176008224487, "epoch": 8.729152768512341, "step": 26170 }, { "distill_loss": 0.19351504743099213, "epoch": 8.729152768512341, "step": 26170 }, { "epoch": 8.729152768512341, "ref_ce_loss": 0.07479386776685715, "step": 26170 }, { "epoch": 8.729152768512341, "loss": 0.38491737842559814, "step": 26170 }, { "ce_loss": 0.0400855578482151, "epoch": 8.729152768512341, "step": 26170 }, { "distill_loss": 0.19322989881038666, "epoch": 8.729152768512341, "step": 26170 }, { "epoch": 8.729152768512341, "ref_ce_loss": 0.0906819999217987, "step": 26170 }, { "epoch": 8.732488325550367, "loss": 0.3808, "step": 26180 }, { "epoch": 8.732488325550367, "grad_norm": 1.597434639930725, "step": 26180 }, { "epoch": 8.732488325550367, "learning_rate": 3.323534750052666e-05, "step": 26180 }, { "epoch": 8.732488325550367, "loss": 0.28560367226600647, "step": 26180 }, { "ce_loss": 0.02849467098712921, "epoch": 8.732488325550367, "step": 26180 }, { "distill_loss": 0.19566433131694794, "epoch": 8.732488325550367, "step": 26180 }, { "epoch": 8.732488325550367, "ref_ce_loss": 0.061204612255096436, "step": 26180 }, { "epoch": 8.732488325550367, "loss": 0.273761510848999, "step": 26180 }, { "ce_loss": 0.045868728309869766, "epoch": 8.732488325550367, "step": 26180 }, { "distill_loss": 0.16468042135238647, "epoch": 8.732488325550367, "step": 26180 }, { "epoch": 8.732488325550367, "ref_ce_loss": 0.062920480966568, "step": 26180 }, { "epoch": 8.735823882588392, "loss": 0.3552, "step": 26190 }, { "epoch": 8.735823882588392, "grad_norm": 1.0125452280044556, "step": 26190 }, { "epoch": 8.735823882588392, "learning_rate": 3.306310225881286e-05, "step": 26190 }, { "epoch": 8.735823882588392, "loss": 0.4199540615081787, "step": 26190 }, { "ce_loss": 0.06449312716722488, "epoch": 8.735823882588392, "step": 26190 }, { "distill_loss": 0.19299976527690887, "epoch": 8.735823882588392, "step": 26190 }, { "epoch": 8.735823882588392, "ref_ce_loss": 0.08478783071041107, "step": 26190 }, { "epoch": 8.735823882588392, "loss": 0.8032296895980835, "step": 26190 }, { "ce_loss": 0.07914341241121292, "epoch": 8.735823882588392, "step": 26190 }, { "distill_loss": 0.22294864058494568, "epoch": 8.735823882588392, "step": 26190 }, { "epoch": 8.735823882588392, "ref_ce_loss": 0.06438795477151871, "step": 26190 }, { "epoch": 8.739159439626418, "loss": 0.4116, "step": 26200 }, { "epoch": 8.739159439626418, "grad_norm": 1.4149339199066162, "step": 26200 }, { "epoch": 8.739159439626418, "learning_rate": 3.289128527194279e-05, "step": 26200 }, { "epoch": 8.739159439626418, "loss": 0.2872389554977417, "step": 26200 }, { "ce_loss": 0.05248313397169113, "epoch": 8.739159439626418, "step": 26200 }, { "distill_loss": 0.12529276311397552, "epoch": 8.739159439626418, "step": 26200 }, { "epoch": 8.739159439626418, "ref_ce_loss": 0.07293683290481567, "step": 26200 }, { "epoch": 8.739159439626418, "loss": 0.6678260564804077, "step": 26200 }, { "ce_loss": 0.07362057268619537, "epoch": 8.739159439626418, "step": 26200 }, { "distill_loss": 0.201996847987175, "epoch": 8.739159439626418, "step": 26200 }, { "epoch": 8.739159439626418, "ref_ce_loss": 0.06612657755613327, "step": 26200 }, { "epoch": 8.742494996664442, "loss": 0.3798, "step": 26210 }, { "epoch": 8.742494996664442, "grad_norm": 1.198406457901001, "step": 26210 }, { "epoch": 8.742494996664442, "learning_rate": 3.27198967404454e-05, "step": 26210 }, { "epoch": 8.742494996664442, "loss": 0.400199830532074, "step": 26210 }, { "ce_loss": 0.06638126820325851, "epoch": 8.742494996664442, "step": 26210 }, { "distill_loss": 0.17453718185424805, "epoch": 8.742494996664442, "step": 26210 }, { "epoch": 8.742494996664442, "ref_ce_loss": 0.08012498915195465, "step": 26210 }, { "epoch": 8.742494996664442, "loss": 0.5893149375915527, "step": 26210 }, { "ce_loss": 0.05600981041789055, "epoch": 8.742494996664442, "step": 26210 }, { "distill_loss": 0.15208479762077332, "epoch": 8.742494996664442, "step": 26210 }, { "epoch": 8.742494996664442, "ref_ce_loss": 0.053888577967882156, "step": 26210 }, { "epoch": 8.745830553702469, "loss": 0.4131, "step": 26220 }, { "epoch": 8.745830553702469, "grad_norm": 1.1500005722045898, "step": 26220 }, { "epoch": 8.745830553702469, "learning_rate": 3.254893686434941e-05, "step": 26220 }, { "epoch": 8.745830553702469, "loss": 0.4473916292190552, "step": 26220 }, { "ce_loss": 0.05827895179390907, "epoch": 8.745830553702469, "step": 26220 }, { "distill_loss": 0.23006944358348846, "epoch": 8.745830553702469, "step": 26220 }, { "epoch": 8.745830553702469, "ref_ce_loss": 0.08817873150110245, "step": 26220 }, { "epoch": 8.745830553702469, "loss": 0.5407593250274658, "step": 26220 }, { "ce_loss": 0.10037504881620407, "epoch": 8.745830553702469, "step": 26220 }, { "distill_loss": 0.20246253907680511, "epoch": 8.745830553702469, "step": 26220 }, { "epoch": 8.745830553702469, "ref_ce_loss": 0.09331716597080231, "step": 26220 }, { "epoch": 8.749166110740493, "loss": 0.4097, "step": 26230 }, { "epoch": 8.749166110740493, "grad_norm": 0.9860525131225586, "step": 26230 }, { "epoch": 8.749166110740493, "learning_rate": 3.2378405843183435e-05, "step": 26230 }, { "epoch": 8.749166110740493, "loss": 0.6364589929580688, "step": 26230 }, { "ce_loss": 0.06990605592727661, "epoch": 8.749166110740493, "step": 26230 }, { "distill_loss": 0.1852722465991974, "epoch": 8.749166110740493, "step": 26230 }, { "epoch": 8.749166110740493, "ref_ce_loss": 0.08063516765832901, "step": 26230 }, { "epoch": 8.749166110740493, "loss": 0.260394424200058, "step": 26230 }, { "ce_loss": 0.03426089510321617, "epoch": 8.749166110740493, "step": 26230 }, { "distill_loss": 0.1199391782283783, "epoch": 8.749166110740493, "step": 26230 }, { "epoch": 8.749166110740493, "ref_ce_loss": 0.05145851522684097, "step": 26230 }, { "epoch": 8.75250166777852, "loss": 0.418, "step": 26240 }, { "epoch": 8.75250166777852, "grad_norm": 1.1872379779815674, "step": 26240 }, { "epoch": 8.75250166777852, "learning_rate": 3.22083038759756e-05, "step": 26240 }, { "epoch": 8.75250166777852, "loss": 0.3817587196826935, "step": 26240 }, { "ce_loss": 0.054080668836832047, "epoch": 8.75250166777852, "step": 26240 }, { "distill_loss": 0.20794880390167236, "epoch": 8.75250166777852, "step": 26240 }, { "epoch": 8.75250166777852, "ref_ce_loss": 0.08582180738449097, "step": 26240 }, { "epoch": 8.75250166777852, "loss": 0.29263123869895935, "step": 26240 }, { "ce_loss": 0.0385243184864521, "epoch": 8.75250166777852, "step": 26240 }, { "distill_loss": 0.20065245032310486, "epoch": 8.75250166777852, "step": 26240 }, { "epoch": 8.75250166777852, "ref_ce_loss": 0.0532672181725502, "step": 26240 }, { "epoch": 8.755837224816544, "loss": 0.3957, "step": 26250 }, { "epoch": 8.755837224816544, "grad_norm": 1.3207815885543823, "step": 26250 }, { "epoch": 8.755837224816544, "learning_rate": 3.2038631161253226e-05, "step": 26250 }, { "epoch": 8.755837224816544, "loss": 0.5277407169342041, "step": 26250 }, { "ce_loss": 0.06554209440946579, "epoch": 8.755837224816544, "step": 26250 }, { "distill_loss": 0.21977917850017548, "epoch": 8.755837224816544, "step": 26250 }, { "epoch": 8.755837224816544, "ref_ce_loss": 0.07910329848527908, "step": 26250 }, { "epoch": 8.755837224816544, "loss": 0.3791898787021637, "step": 26250 }, { "ce_loss": 0.04661162197589874, "epoch": 8.755837224816544, "step": 26250 }, { "distill_loss": 0.16154451668262482, "epoch": 8.755837224816544, "step": 26250 }, { "epoch": 8.755837224816544, "ref_ce_loss": 0.06869801133871078, "step": 26250 }, { "epoch": 8.75917278185457, "loss": 0.3538, "step": 26260 }, { "epoch": 8.75917278185457, "grad_norm": 0.9695892333984375, "step": 26260 }, { "epoch": 8.75917278185457, "learning_rate": 3.18693878970425e-05, "step": 26260 }, { "epoch": 8.75917278185457, "loss": 0.30223411321640015, "step": 26260 }, { "ce_loss": 0.0531826950609684, "epoch": 8.75917278185457, "step": 26260 }, { "distill_loss": 0.16183307766914368, "epoch": 8.75917278185457, "step": 26260 }, { "epoch": 8.75917278185457, "ref_ce_loss": 0.05986183509230614, "step": 26260 }, { "epoch": 8.75917278185457, "loss": 0.2744012475013733, "step": 26260 }, { "ce_loss": 0.032525599002838135, "epoch": 8.75917278185457, "step": 26260 }, { "distill_loss": 0.15671776235103607, "epoch": 8.75917278185457, "step": 26260 }, { "epoch": 8.75917278185457, "ref_ce_loss": 0.05136071890592575, "step": 26260 }, { "epoch": 8.762508338892594, "loss": 0.4084, "step": 26270 }, { "epoch": 8.762508338892594, "grad_norm": 1.2642502784729004, "step": 26270 }, { "epoch": 8.762508338892594, "learning_rate": 3.170057428086861e-05, "step": 26270 }, { "epoch": 8.762508338892594, "loss": 0.5678555369377136, "step": 26270 }, { "ce_loss": 0.10616932809352875, "epoch": 8.762508338892594, "step": 26270 }, { "distill_loss": 0.2803439199924469, "epoch": 8.762508338892594, "step": 26270 }, { "epoch": 8.762508338892594, "ref_ce_loss": 0.10130172967910767, "step": 26270 }, { "epoch": 8.762508338892594, "loss": 0.5963680148124695, "step": 26270 }, { "ce_loss": 0.0781753733754158, "epoch": 8.762508338892594, "step": 26270 }, { "distill_loss": 0.18746884167194366, "epoch": 8.762508338892594, "step": 26270 }, { "epoch": 8.762508338892594, "ref_ce_loss": 0.07577872276306152, "step": 26270 }, { "epoch": 8.76584389593062, "loss": 0.3727, "step": 26280 }, { "epoch": 8.76584389593062, "grad_norm": 1.0161960124969482, "step": 26280 }, { "epoch": 8.76584389593062, "learning_rate": 3.15321905097552e-05, "step": 26280 }, { "epoch": 8.76584389593062, "loss": 0.33225592970848083, "step": 26280 }, { "ce_loss": 0.05595911666750908, "epoch": 8.76584389593062, "step": 26280 }, { "distill_loss": 0.20386509597301483, "epoch": 8.76584389593062, "step": 26280 }, { "epoch": 8.76584389593062, "ref_ce_loss": 0.051552511751651764, "step": 26280 }, { "epoch": 8.76584389593062, "loss": 0.2532827854156494, "step": 26280 }, { "ce_loss": 0.030611203983426094, "epoch": 8.76584389593062, "step": 26280 }, { "distill_loss": 0.16473987698554993, "epoch": 8.76584389593062, "step": 26280 }, { "epoch": 8.76584389593062, "ref_ce_loss": 0.0576915368437767, "step": 26280 }, { "epoch": 8.769179452968645, "loss": 0.3891, "step": 26290 }, { "epoch": 8.769179452968645, "grad_norm": 1.3631794452667236, "step": 26290 }, { "epoch": 8.769179452968645, "learning_rate": 3.136423678022422e-05, "step": 26290 }, { "epoch": 8.769179452968645, "loss": 0.382942795753479, "step": 26290 }, { "ce_loss": 0.06846325099468231, "epoch": 8.769179452968645, "step": 26290 }, { "distill_loss": 0.14988799393177032, "epoch": 8.769179452968645, "step": 26290 }, { "epoch": 8.769179452968645, "ref_ce_loss": 0.10606774687767029, "step": 26290 }, { "epoch": 8.769179452968645, "loss": 0.6185066103935242, "step": 26290 }, { "ce_loss": 0.041892632842063904, "epoch": 8.769179452968645, "step": 26290 }, { "distill_loss": 0.1824905127286911, "epoch": 8.769179452968645, "step": 26290 }, { "epoch": 8.769179452968645, "ref_ce_loss": 0.06602640450000763, "step": 26290 }, { "epoch": 8.772515010006671, "loss": 0.3849, "step": 26300 }, { "epoch": 8.772515010006671, "grad_norm": 1.2957016229629517, "step": 26300 }, { "epoch": 8.772515010006671, "learning_rate": 3.119671328829576e-05, "step": 26300 }, { "epoch": 8.772515010006671, "loss": 0.34795233607292175, "step": 26300 }, { "ce_loss": 0.07486037164926529, "epoch": 8.772515010006671, "step": 26300 }, { "distill_loss": 0.17705968022346497, "epoch": 8.772515010006671, "step": 26300 }, { "epoch": 8.772515010006671, "ref_ce_loss": 0.09589973092079163, "step": 26300 }, { "epoch": 8.772515010006671, "loss": 0.36749204993247986, "step": 26300 }, { "ce_loss": 0.08413571119308472, "epoch": 8.772515010006671, "step": 26300 }, { "distill_loss": 0.1869279444217682, "epoch": 8.772515010006671, "step": 26300 }, { "epoch": 8.772515010006671, "ref_ce_loss": 0.07151706516742706, "step": 26300 }, { "epoch": 8.775850567044696, "loss": 0.3681, "step": 26310 }, { "epoch": 8.775850567044696, "grad_norm": 1.7406237125396729, "step": 26310 }, { "epoch": 8.775850567044696, "learning_rate": 3.102962022948779e-05, "step": 26310 }, { "epoch": 8.775850567044696, "loss": 0.4500211179256439, "step": 26310 }, { "ce_loss": 0.04246877133846283, "epoch": 8.775850567044696, "step": 26310 }, { "distill_loss": 0.15817828476428986, "epoch": 8.775850567044696, "step": 26310 }, { "epoch": 8.775850567044696, "ref_ce_loss": 0.06595441699028015, "step": 26310 }, { "epoch": 8.775850567044696, "loss": 0.20872564613819122, "step": 26310 }, { "ce_loss": 0.028664544224739075, "epoch": 8.775850567044696, "step": 26310 }, { "distill_loss": 0.11665079742670059, "epoch": 8.775850567044696, "step": 26310 }, { "epoch": 8.775850567044696, "ref_ce_loss": 0.036202266812324524, "step": 26310 }, { "epoch": 8.779186124082722, "loss": 0.4003, "step": 26320 }, { "epoch": 8.779186124082722, "grad_norm": 1.1860415935516357, "step": 26320 }, { "epoch": 8.779186124082722, "learning_rate": 3.086295779881585e-05, "step": 26320 }, { "epoch": 8.779186124082722, "loss": 0.35605388879776, "step": 26320 }, { "ce_loss": 0.06914496421813965, "epoch": 8.779186124082722, "step": 26320 }, { "distill_loss": 0.2034725397825241, "epoch": 8.779186124082722, "step": 26320 }, { "epoch": 8.779186124082722, "ref_ce_loss": 0.08318885415792465, "step": 26320 }, { "epoch": 8.779186124082722, "loss": 0.29098474979400635, "step": 26320 }, { "ce_loss": 0.07168518006801605, "epoch": 8.779186124082722, "step": 26320 }, { "distill_loss": 0.15839485824108124, "epoch": 8.779186124082722, "step": 26320 }, { "epoch": 8.779186124082722, "ref_ce_loss": 0.06056111305952072, "step": 26320 }, { "epoch": 8.782521681120746, "loss": 0.3944, "step": 26330 }, { "epoch": 8.782521681120746, "grad_norm": 1.4006574153900146, "step": 26330 }, { "epoch": 8.782521681120746, "learning_rate": 3.0696726190793024e-05, "step": 26330 }, { "epoch": 8.782521681120746, "loss": 0.2608528435230255, "step": 26330 }, { "ce_loss": 0.04916700720787048, "epoch": 8.782521681120746, "step": 26330 }, { "distill_loss": 0.14268429577350616, "epoch": 8.782521681120746, "step": 26330 }, { "epoch": 8.782521681120746, "ref_ce_loss": 0.06881540268659592, "step": 26330 }, { "epoch": 8.782521681120746, "loss": 0.41059139370918274, "step": 26330 }, { "ce_loss": 0.08422189950942993, "epoch": 8.782521681120746, "step": 26330 }, { "distill_loss": 0.20686335861682892, "epoch": 8.782521681120746, "step": 26330 }, { "epoch": 8.782521681120746, "ref_ce_loss": 0.08716470003128052, "step": 26330 }, { "epoch": 8.785857238158773, "loss": 0.4231, "step": 26340 }, { "epoch": 8.785857238158773, "grad_norm": 1.3243461847305298, "step": 26340 }, { "epoch": 8.785857238158773, "learning_rate": 3.053092559942932e-05, "step": 26340 }, { "epoch": 8.785857238158773, "loss": 0.3654715418815613, "step": 26340 }, { "ce_loss": 0.05218815058469772, "epoch": 8.785857238158773, "step": 26340 }, { "distill_loss": 0.16668446362018585, "epoch": 8.785857238158773, "step": 26340 }, { "epoch": 8.785857238158773, "ref_ce_loss": 0.07640382647514343, "step": 26340 }, { "epoch": 8.785857238158773, "loss": 0.41766300797462463, "step": 26340 }, { "ce_loss": 0.07735947519540787, "epoch": 8.785857238158773, "step": 26340 }, { "distill_loss": 0.17040522396564484, "epoch": 8.785857238158773, "step": 26340 }, { "epoch": 8.785857238158773, "ref_ce_loss": 0.06796623021364212, "step": 26340 }, { "epoch": 8.789192795196797, "loss": 0.4219, "step": 26350 }, { "epoch": 8.789192795196797, "grad_norm": 1.217641830444336, "step": 26350 }, { "epoch": 8.789192795196797, "learning_rate": 3.0365556218231983e-05, "step": 26350 }, { "epoch": 8.789192795196797, "loss": 0.42024436593055725, "step": 26350 }, { "ce_loss": 0.0894358679652214, "epoch": 8.789192795196797, "step": 26350 }, { "distill_loss": 0.1911226511001587, "epoch": 8.789192795196797, "step": 26350 }, { "epoch": 8.789192795196797, "ref_ce_loss": 0.06873385608196259, "step": 26350 }, { "epoch": 8.789192795196797, "loss": 0.4086933434009552, "step": 26350 }, { "ce_loss": 0.05865989997982979, "epoch": 8.789192795196797, "step": 26350 }, { "distill_loss": 0.13806791603565216, "epoch": 8.789192795196797, "step": 26350 }, { "epoch": 8.789192795196797, "ref_ce_loss": 0.0668598860502243, "step": 26350 }, { "epoch": 8.792528352234823, "loss": 0.3949, "step": 26360 }, { "epoch": 8.792528352234823, "grad_norm": 1.2597485780715942, "step": 26360 }, { "epoch": 8.792528352234823, "learning_rate": 3.020061824020486e-05, "step": 26360 }, { "epoch": 8.792528352234823, "loss": 0.3705188035964966, "step": 26360 }, { "ce_loss": 0.05611937865614891, "epoch": 8.792528352234823, "step": 26360 }, { "distill_loss": 0.18796519935131073, "epoch": 8.792528352234823, "step": 26360 }, { "epoch": 8.792528352234823, "ref_ce_loss": 0.08399634808301926, "step": 26360 }, { "epoch": 8.792528352234823, "loss": 0.29237592220306396, "step": 26360 }, { "ce_loss": 0.04235415533185005, "epoch": 8.792528352234823, "step": 26360 }, { "distill_loss": 0.16000601649284363, "epoch": 8.792528352234823, "step": 26360 }, { "epoch": 8.792528352234823, "ref_ce_loss": 0.07138751447200775, "step": 26360 }, { "epoch": 8.795863909272848, "loss": 0.4432, "step": 26370 }, { "epoch": 8.795863909272848, "grad_norm": 2.4199059009552, "step": 26370 }, { "epoch": 8.795863909272848, "learning_rate": 3.003611185784836e-05, "step": 26370 }, { "epoch": 8.795863909272848, "loss": 0.389001727104187, "step": 26370 }, { "ce_loss": 0.053413379937410355, "epoch": 8.795863909272848, "step": 26370 }, { "distill_loss": 0.17966410517692566, "epoch": 8.795863909272848, "step": 26370 }, { "epoch": 8.795863909272848, "ref_ce_loss": 0.0777275413274765, "step": 26370 }, { "epoch": 8.795863909272848, "loss": 0.272889643907547, "step": 26370 }, { "ce_loss": 0.05025883764028549, "epoch": 8.795863909272848, "step": 26370 }, { "distill_loss": 0.1371891349554062, "epoch": 8.795863909272848, "step": 26370 }, { "epoch": 8.795863909272848, "ref_ce_loss": 0.05508722737431526, "step": 26370 }, { "epoch": 8.799199466310874, "loss": 0.3584, "step": 26380 }, { "epoch": 8.799199466310874, "grad_norm": 1.75760817527771, "step": 26380 }, { "epoch": 8.799199466310874, "learning_rate": 2.987203726315899e-05, "step": 26380 }, { "epoch": 8.799199466310874, "loss": 0.6803490519523621, "step": 26380 }, { "ce_loss": 0.053485460579395294, "epoch": 8.799199466310874, "step": 26380 }, { "distill_loss": 0.1900993436574936, "epoch": 8.799199466310874, "step": 26380 }, { "epoch": 8.799199466310874, "ref_ce_loss": 0.10119712352752686, "step": 26380 }, { "epoch": 8.799199466310874, "loss": 0.37675246596336365, "step": 26380 }, { "ce_loss": 0.06931252777576447, "epoch": 8.799199466310874, "step": 26380 }, { "distill_loss": 0.21957695484161377, "epoch": 8.799199466310874, "step": 26380 }, { "epoch": 8.799199466310874, "ref_ce_loss": 0.08745820820331573, "step": 26380 }, { "epoch": 8.802535023348899, "loss": 0.4442, "step": 26390 }, { "epoch": 8.802535023348899, "grad_norm": 1.3044569492340088, "step": 26390 }, { "epoch": 8.802535023348899, "learning_rate": 2.970839464762958e-05, "step": 26390 }, { "epoch": 8.802535023348899, "loss": 0.23337514698505402, "step": 26390 }, { "ce_loss": 0.03274532034993172, "epoch": 8.802535023348899, "step": 26390 }, { "distill_loss": 0.12590210139751434, "epoch": 8.802535023348899, "step": 26390 }, { "epoch": 8.802535023348899, "ref_ce_loss": 0.05140114203095436, "step": 26390 }, { "epoch": 8.802535023348899, "loss": 0.3965371549129486, "step": 26390 }, { "ce_loss": 0.07348036020994186, "epoch": 8.802535023348899, "step": 26390 }, { "distill_loss": 0.2069893330335617, "epoch": 8.802535023348899, "step": 26390 }, { "epoch": 8.802535023348899, "ref_ce_loss": 0.08539339900016785, "step": 26390 }, { "epoch": 8.805870580386925, "loss": 0.3868, "step": 26400 }, { "epoch": 8.805870580386925, "grad_norm": 1.3240303993225098, "step": 26400 }, { "epoch": 8.805870580386925, "learning_rate": 2.954518420224868e-05, "step": 26400 }, { "epoch": 8.805870580386925, "loss": 0.31848961114883423, "step": 26400 }, { "ce_loss": 0.07669753581285477, "epoch": 8.805870580386925, "step": 26400 }, { "distill_loss": 0.19089281558990479, "epoch": 8.805870580386925, "step": 26400 }, { "epoch": 8.805870580386925, "ref_ce_loss": 0.050712063908576965, "step": 26400 }, { "epoch": 8.805870580386925, "loss": 0.25955644249916077, "step": 26400 }, { "ce_loss": 0.028958193957805634, "epoch": 8.805870580386925, "step": 26400 }, { "distill_loss": 0.16439704596996307, "epoch": 8.805870580386925, "step": 26400 }, { "epoch": 8.805870580386925, "ref_ce_loss": 0.06597182899713516, "step": 26400 }, { "epoch": 8.80920613742495, "loss": 0.4087, "step": 26410 }, { "epoch": 8.80920613742495, "grad_norm": 1.2474710941314697, "step": 26410 }, { "epoch": 8.80920613742495, "learning_rate": 2.938240611750036e-05, "step": 26410 }, { "epoch": 8.80920613742495, "loss": 0.38615283370018005, "step": 26410 }, { "ce_loss": 0.06278189271688461, "epoch": 8.80920613742495, "step": 26410 }, { "distill_loss": 0.1992618292570114, "epoch": 8.80920613742495, "step": 26410 }, { "epoch": 8.80920613742495, "ref_ce_loss": 0.05499156564474106, "step": 26410 }, { "epoch": 8.80920613742495, "loss": 0.2698794901371002, "step": 26410 }, { "ce_loss": 0.056827716529369354, "epoch": 8.80920613742495, "step": 26410 }, { "distill_loss": 0.15084534883499146, "epoch": 8.80920613742495, "step": 26410 }, { "epoch": 8.80920613742495, "ref_ce_loss": 0.061909135431051254, "step": 26410 }, { "epoch": 8.812541694462976, "loss": 0.3241, "step": 26420 }, { "epoch": 8.812541694462976, "grad_norm": 0.8669230341911316, "step": 26420 }, { "epoch": 8.812541694462976, "learning_rate": 2.9220060583364217e-05, "step": 26420 }, { "epoch": 8.812541694462976, "loss": 0.32821089029312134, "step": 26420 }, { "ce_loss": 0.05677766725420952, "epoch": 8.812541694462976, "step": 26420 }, { "distill_loss": 0.1873900294303894, "epoch": 8.812541694462976, "step": 26420 }, { "epoch": 8.812541694462976, "ref_ce_loss": 0.08385615795850754, "step": 26420 }, { "epoch": 8.812541694462976, "loss": 0.3158455193042755, "step": 26420 }, { "ce_loss": 0.05159135162830353, "epoch": 8.812541694462976, "step": 26420 }, { "distill_loss": 0.15918651223182678, "epoch": 8.812541694462976, "step": 26420 }, { "epoch": 8.812541694462976, "ref_ce_loss": 0.05180855467915535, "step": 26420 }, { "epoch": 8.815877251501, "loss": 0.375, "step": 26430 }, { "epoch": 8.815877251501, "grad_norm": 1.4044928550720215, "step": 26430 }, { "epoch": 8.815877251501, "learning_rate": 2.9058147789314903e-05, "step": 26430 }, { "epoch": 8.815877251501, "loss": 0.3125324249267578, "step": 26430 }, { "ce_loss": 0.05299573391675949, "epoch": 8.815877251501, "step": 26430 }, { "distill_loss": 0.14900648593902588, "epoch": 8.815877251501, "step": 26430 }, { "epoch": 8.815877251501, "ref_ce_loss": 0.07863525301218033, "step": 26430 }, { "epoch": 8.815877251501, "loss": 0.3457852005958557, "step": 26430 }, { "ce_loss": 0.055660802870988846, "epoch": 8.815877251501, "step": 26430 }, { "distill_loss": 0.16527312994003296, "epoch": 8.815877251501, "step": 26430 }, { "epoch": 8.815877251501, "ref_ce_loss": 0.07229301333427429, "step": 26430 }, { "epoch": 8.819212808539026, "loss": 0.3643, "step": 26440 }, { "epoch": 8.819212808539026, "grad_norm": 5.930783748626709, "step": 26440 }, { "epoch": 8.819212808539026, "learning_rate": 2.8896667924322153e-05, "step": 26440 }, { "epoch": 8.819212808539026, "loss": 0.34311580657958984, "step": 26440 }, { "ce_loss": 0.06892959773540497, "epoch": 8.819212808539026, "step": 26440 }, { "distill_loss": 0.214860200881958, "epoch": 8.819212808539026, "step": 26440 }, { "epoch": 8.819212808539026, "ref_ce_loss": 0.05908918380737305, "step": 26440 }, { "epoch": 8.819212808539026, "loss": 0.5133229494094849, "step": 26440 }, { "ce_loss": 0.08564013987779617, "epoch": 8.819212808539026, "step": 26440 }, { "distill_loss": 0.2291836142539978, "epoch": 8.819212808539026, "step": 26440 }, { "epoch": 8.819212808539026, "ref_ce_loss": 0.08884364366531372, "step": 26440 }, { "epoch": 8.82254836557705, "loss": 0.3797, "step": 26450 }, { "epoch": 8.82254836557705, "grad_norm": 1.0255451202392578, "step": 26450 }, { "epoch": 8.82254836557705, "learning_rate": 2.8735621176850404e-05, "step": 26450 }, { "epoch": 8.82254836557705, "loss": 0.3774876892566681, "step": 26450 }, { "ce_loss": 0.03586650267243385, "epoch": 8.82254836557705, "step": 26450 }, { "distill_loss": 0.1628786027431488, "epoch": 8.82254836557705, "step": 26450 }, { "epoch": 8.82254836557705, "ref_ce_loss": 0.07327239960432053, "step": 26450 }, { "epoch": 8.82254836557705, "loss": 0.24797579646110535, "step": 26450 }, { "ce_loss": 0.018110867589712143, "epoch": 8.82254836557705, "step": 26450 }, { "distill_loss": 0.14303602278232574, "epoch": 8.82254836557705, "step": 26450 }, { "epoch": 8.82254836557705, "ref_ce_loss": 0.06473203748464584, "step": 26450 }, { "epoch": 8.825883922615077, "loss": 0.4209, "step": 26460 }, { "epoch": 8.825883922615077, "grad_norm": 1.341780424118042, "step": 26460 }, { "epoch": 8.825883922615077, "learning_rate": 2.8575007734858327e-05, "step": 26460 }, { "epoch": 8.825883922615077, "loss": 0.28869760036468506, "step": 26460 }, { "ce_loss": 0.04272546246647835, "epoch": 8.825883922615077, "step": 26460 }, { "distill_loss": 0.14046624302864075, "epoch": 8.825883922615077, "step": 26460 }, { "epoch": 8.825883922615077, "ref_ce_loss": 0.07356305420398712, "step": 26460 }, { "epoch": 8.825883922615077, "loss": 0.2093331515789032, "step": 26460 }, { "ce_loss": 0.016943862661719322, "epoch": 8.825883922615077, "step": 26460 }, { "distill_loss": 0.1336776614189148, "epoch": 8.825883922615077, "step": 26460 }, { "epoch": 8.825883922615077, "ref_ce_loss": 0.042915888130664825, "step": 26460 }, { "epoch": 8.829219479653101, "loss": 0.3831, "step": 26470 }, { "epoch": 8.829219479653101, "grad_norm": 2.0843732357025146, "step": 26470 }, { "epoch": 8.829219479653101, "learning_rate": 2.8414827785799226e-05, "step": 26470 }, { "epoch": 8.829219479653101, "loss": 0.3446524143218994, "step": 26470 }, { "ce_loss": 0.05340450629591942, "epoch": 8.829219479653101, "step": 26470 }, { "distill_loss": 0.17530488967895508, "epoch": 8.829219479653101, "step": 26470 }, { "epoch": 8.829219479653101, "ref_ce_loss": 0.07423309236764908, "step": 26470 }, { "epoch": 8.829219479653101, "loss": 0.44540131092071533, "step": 26470 }, { "ce_loss": 0.10081108659505844, "epoch": 8.829219479653101, "step": 26470 }, { "distill_loss": 0.1769978553056717, "epoch": 8.829219479653101, "step": 26470 }, { "epoch": 8.829219479653101, "ref_ce_loss": 0.08398478478193283, "step": 26470 }, { "epoch": 8.832555036691128, "loss": 0.36, "step": 26480 }, { "epoch": 8.832555036691128, "grad_norm": 1.5165905952453613, "step": 26480 }, { "epoch": 8.832555036691128, "learning_rate": 2.8255081516620307e-05, "step": 26480 }, { "epoch": 8.832555036691128, "loss": 0.3594178557395935, "step": 26480 }, { "ce_loss": 0.05317145213484764, "epoch": 8.832555036691128, "step": 26480 }, { "distill_loss": 0.1571061611175537, "epoch": 8.832555036691128, "step": 26480 }, { "epoch": 8.832555036691128, "ref_ce_loss": 0.07676803320646286, "step": 26480 }, { "epoch": 8.832555036691128, "loss": 0.6816879510879517, "step": 26480 }, { "ce_loss": 0.05865504592657089, "epoch": 8.832555036691128, "step": 26480 }, { "distill_loss": 0.1645684540271759, "epoch": 8.832555036691128, "step": 26480 }, { "epoch": 8.832555036691128, "ref_ce_loss": 0.07839664816856384, "step": 26480 }, { "epoch": 8.835890593729152, "loss": 0.4173, "step": 26490 }, { "epoch": 8.835890593729152, "grad_norm": 1.228607177734375, "step": 26490 }, { "epoch": 8.835890593729152, "learning_rate": 2.809576911376275e-05, "step": 26490 }, { "epoch": 8.835890593729152, "loss": 0.3466040790081024, "step": 26490 }, { "ce_loss": 0.08471109718084335, "epoch": 8.835890593729152, "step": 26490 }, { "distill_loss": 0.18720370531082153, "epoch": 8.835890593729152, "step": 26490 }, { "epoch": 8.835890593729152, "ref_ce_loss": 0.074448361992836, "step": 26490 }, { "epoch": 8.835890593729152, "loss": 1.1317849159240723, "step": 26490 }, { "ce_loss": 0.06539808958768845, "epoch": 8.835890593729152, "step": 26490 }, { "distill_loss": 0.1788891851902008, "epoch": 8.835890593729152, "step": 26490 }, { "epoch": 8.835890593729152, "ref_ce_loss": 0.09472262859344482, "step": 26490 }, { "epoch": 8.839226150767178, "loss": 0.3817, "step": 26500 }, { "epoch": 8.839226150767178, "grad_norm": 2.034001350402832, "step": 26500 }, { "epoch": 8.839226150767178, "learning_rate": 2.7936890763161106e-05, "step": 26500 }, { "epoch": 8.839226150767178, "loss": 0.7755275368690491, "step": 26500 }, { "ce_loss": 0.05330686643719673, "epoch": 8.839226150767178, "step": 26500 }, { "distill_loss": 0.15255343914031982, "epoch": 8.839226150767178, "step": 26500 }, { "epoch": 8.839226150767178, "ref_ce_loss": 0.08559133112430573, "step": 26500 }, { "epoch": 8.839226150767178, "loss": 0.3958842158317566, "step": 26500 }, { "ce_loss": 0.08466436713933945, "epoch": 8.839226150767178, "step": 26500 }, { "distill_loss": 0.2283436357975006, "epoch": 8.839226150767178, "step": 26500 }, { "epoch": 8.839226150767178, "ref_ce_loss": 0.08261482417583466, "step": 26500 }, { "epoch": 8.842561707805203, "loss": 0.3551, "step": 26510 }, { "epoch": 8.842561707805203, "grad_norm": 1.1143865585327148, "step": 26510 }, { "epoch": 8.842561707805203, "learning_rate": 2.7778446650243582e-05, "step": 26510 }, { "epoch": 8.842561707805203, "loss": 0.45422789454460144, "step": 26510 }, { "ce_loss": 0.05119134113192558, "epoch": 8.842561707805203, "step": 26510 }, { "distill_loss": 0.20761536061763763, "epoch": 8.842561707805203, "step": 26510 }, { "epoch": 8.842561707805203, "ref_ce_loss": 0.0848504975438118, "step": 26510 }, { "epoch": 8.842561707805203, "loss": 0.2579534351825714, "step": 26510 }, { "ce_loss": 0.044855259358882904, "epoch": 8.842561707805203, "step": 26510 }, { "distill_loss": 0.12745003402233124, "epoch": 8.842561707805203, "step": 26510 }, { "epoch": 8.842561707805203, "ref_ce_loss": 0.05585559085011482, "step": 26510 }, { "epoch": 8.845897264843229, "loss": 0.3267, "step": 26520 }, { "epoch": 8.845897264843229, "grad_norm": 0.9358294010162354, "step": 26520 }, { "epoch": 8.845897264843229, "learning_rate": 2.762043695993155e-05, "step": 26520 }, { "epoch": 8.845897264843229, "loss": 0.38387444615364075, "step": 26520 }, { "ce_loss": 0.04712322726845741, "epoch": 8.845897264843229, "step": 26520 }, { "distill_loss": 0.1761668622493744, "epoch": 8.845897264843229, "step": 26520 }, { "epoch": 8.845897264843229, "ref_ce_loss": 0.07620155811309814, "step": 26520 }, { "epoch": 8.845897264843229, "loss": 1.0314216613769531, "step": 26520 }, { "ce_loss": 0.0835966244339943, "epoch": 8.845897264843229, "step": 26520 }, { "distill_loss": 0.2500387728214264, "epoch": 8.845897264843229, "step": 26520 }, { "epoch": 8.845897264843229, "ref_ce_loss": 0.11306946724653244, "step": 26520 }, { "epoch": 8.849232821881253, "loss": 0.4203, "step": 26530 }, { "epoch": 8.849232821881253, "grad_norm": 2.512843370437622, "step": 26530 }, { "epoch": 8.849232821881253, "learning_rate": 2.7462861876639223e-05, "step": 26530 }, { "epoch": 8.849232821881253, "loss": 0.36169400811195374, "step": 26530 }, { "ce_loss": 0.08441756665706635, "epoch": 8.849232821881253, "step": 26530 }, { "distill_loss": 0.18359051644802094, "epoch": 8.849232821881253, "step": 26530 }, { "epoch": 8.849232821881253, "ref_ce_loss": 0.09323541074991226, "step": 26530 }, { "epoch": 8.849232821881253, "loss": 0.4454735219478607, "step": 26530 }, { "ce_loss": 0.09442145377397537, "epoch": 8.849232821881253, "step": 26530 }, { "distill_loss": 0.16280341148376465, "epoch": 8.849232821881253, "step": 26530 }, { "epoch": 8.849232821881253, "ref_ce_loss": 0.07556495815515518, "step": 26530 }, { "epoch": 8.85256837891928, "loss": 0.3797, "step": 26540 }, { "epoch": 8.85256837891928, "grad_norm": 1.3537561893463135, "step": 26540 }, { "epoch": 8.85256837891928, "learning_rate": 2.7305721584273802e-05, "step": 26540 }, { "epoch": 8.85256837891928, "loss": 0.34503787755966187, "step": 26540 }, { "ce_loss": 0.05430424585938454, "epoch": 8.85256837891928, "step": 26540 }, { "distill_loss": 0.18627524375915527, "epoch": 8.85256837891928, "step": 26540 }, { "epoch": 8.85256837891928, "ref_ce_loss": 0.06884271651506424, "step": 26540 }, { "epoch": 8.85256837891928, "loss": 0.22353094816207886, "step": 26540 }, { "ce_loss": 0.028932973742485046, "epoch": 8.85256837891928, "step": 26540 }, { "distill_loss": 0.12646234035491943, "epoch": 8.85256837891928, "step": 26540 }, { "epoch": 8.85256837891928, "ref_ce_loss": 0.06783977150917053, "step": 26540 }, { "epoch": 8.855903935957304, "loss": 0.4032, "step": 26550 }, { "epoch": 8.855903935957304, "grad_norm": 1.1834700107574463, "step": 26550 }, { "epoch": 8.855903935957304, "learning_rate": 2.714901626623485e-05, "step": 26550 }, { "epoch": 8.855903935957304, "loss": 0.3856170177459717, "step": 26550 }, { "ce_loss": 0.08056189864873886, "epoch": 8.855903935957304, "step": 26550 }, { "distill_loss": 0.19755177199840546, "epoch": 8.855903935957304, "step": 26550 }, { "epoch": 8.855903935957304, "ref_ce_loss": 0.06736253947019577, "step": 26550 }, { "epoch": 8.855903935957304, "loss": 0.31577253341674805, "step": 26550 }, { "ce_loss": 0.029894186183810234, "epoch": 8.855903935957304, "step": 26550 }, { "distill_loss": 0.16263779997825623, "epoch": 8.855903935957304, "step": 26550 }, { "epoch": 8.855903935957304, "ref_ce_loss": 0.06558357179164886, "step": 26550 }, { "epoch": 8.85923949299533, "loss": 0.3364, "step": 26560 }, { "epoch": 8.85923949299533, "grad_norm": 1.1230621337890625, "step": 26560 }, { "epoch": 8.85923949299533, "learning_rate": 2.699274610541438e-05, "step": 26560 }, { "epoch": 8.85923949299533, "loss": 0.37808358669281006, "step": 26560 }, { "ce_loss": 0.03817775472998619, "epoch": 8.85923949299533, "step": 26560 }, { "distill_loss": 0.16854511201381683, "epoch": 8.85923949299533, "step": 26560 }, { "epoch": 8.85923949299533, "ref_ce_loss": 0.07931677252054214, "step": 26560 }, { "epoch": 8.85923949299533, "loss": 0.38735297322273254, "step": 26560 }, { "ce_loss": 0.06041007861495018, "epoch": 8.85923949299533, "step": 26560 }, { "distill_loss": 0.2303398847579956, "epoch": 8.85923949299533, "step": 26560 }, { "epoch": 8.85923949299533, "ref_ce_loss": 0.09647249430418015, "step": 26560 }, { "epoch": 8.862575050033355, "loss": 0.3923, "step": 26570 }, { "epoch": 8.862575050033355, "grad_norm": 1.7934776544570923, "step": 26570 }, { "epoch": 8.862575050033355, "learning_rate": 2.6836911284196363e-05, "step": 26570 }, { "epoch": 8.862575050033355, "loss": 0.2773299217224121, "step": 26570 }, { "ce_loss": 0.05012989789247513, "epoch": 8.862575050033355, "step": 26570 }, { "distill_loss": 0.1513012945652008, "epoch": 8.862575050033355, "step": 26570 }, { "epoch": 8.862575050033355, "ref_ce_loss": 0.05779888108372688, "step": 26570 }, { "epoch": 8.862575050033355, "loss": 0.5293566584587097, "step": 26570 }, { "ce_loss": 0.09456038475036621, "epoch": 8.862575050033355, "step": 26570 }, { "distill_loss": 0.22474022209644318, "epoch": 8.862575050033355, "step": 26570 }, { "epoch": 8.862575050033355, "ref_ce_loss": 0.06779570877552032, "step": 26570 }, { "epoch": 8.865910607071381, "loss": 0.4038, "step": 26580 }, { "epoch": 8.865910607071381, "grad_norm": 1.2114170789718628, "step": 26580 }, { "epoch": 8.865910607071381, "learning_rate": 2.668151198445692e-05, "step": 26580 }, { "epoch": 8.865910607071381, "loss": 0.7759679555892944, "step": 26580 }, { "ce_loss": 0.08975588530302048, "epoch": 8.865910607071381, "step": 26580 }, { "distill_loss": 0.18606550991535187, "epoch": 8.865910607071381, "step": 26580 }, { "epoch": 8.865910607071381, "ref_ce_loss": 0.08597087115049362, "step": 26580 }, { "epoch": 8.865910607071381, "loss": 0.5669875741004944, "step": 26580 }, { "ce_loss": 0.06825076043605804, "epoch": 8.865910607071381, "step": 26580 }, { "distill_loss": 0.17964768409729004, "epoch": 8.865910607071381, "step": 26580 }, { "epoch": 8.865910607071381, "ref_ce_loss": 0.07940545678138733, "step": 26580 }, { "epoch": 8.869246164109406, "loss": 0.3841, "step": 26590 }, { "epoch": 8.869246164109406, "grad_norm": 1.518477439880371, "step": 26590 }, { "epoch": 8.869246164109406, "learning_rate": 2.6526548387563722e-05, "step": 26590 }, { "epoch": 8.869246164109406, "loss": 0.4557240605354309, "step": 26590 }, { "ce_loss": 0.04701073467731476, "epoch": 8.869246164109406, "step": 26590 }, { "distill_loss": 0.1662449836730957, "epoch": 8.869246164109406, "step": 26590 }, { "epoch": 8.869246164109406, "ref_ce_loss": 0.06742780655622482, "step": 26590 }, { "epoch": 8.869246164109406, "loss": 0.3855040967464447, "step": 26590 }, { "ce_loss": 0.08256342262029648, "epoch": 8.869246164109406, "step": 26590 }, { "distill_loss": 0.18862977623939514, "epoch": 8.869246164109406, "step": 26590 }, { "epoch": 8.869246164109406, "ref_ce_loss": 0.08041725307703018, "step": 26590 }, { "epoch": 8.872581721147432, "loss": 0.3927, "step": 26600 }, { "epoch": 8.872581721147432, "grad_norm": 1.193103313446045, "step": 26600 }, { "epoch": 8.872581721147432, "learning_rate": 2.637202067437605e-05, "step": 26600 }, { "epoch": 8.872581721147432, "loss": 0.2769448161125183, "step": 26600 }, { "ce_loss": 0.021347135305404663, "epoch": 8.872581721147432, "step": 26600 }, { "distill_loss": 0.12924575805664062, "epoch": 8.872581721147432, "step": 26600 }, { "epoch": 8.872581721147432, "ref_ce_loss": 0.05707591772079468, "step": 26600 }, { "epoch": 8.872581721147432, "loss": 0.2787862718105316, "step": 26600 }, { "ce_loss": 0.028742941096425056, "epoch": 8.872581721147432, "step": 26600 }, { "distill_loss": 0.19323575496673584, "epoch": 8.872581721147432, "step": 26600 }, { "epoch": 8.872581721147432, "ref_ce_loss": 0.05664723366498947, "step": 26600 }, { "epoch": 8.875917278185456, "loss": 0.3619, "step": 26610 }, { "epoch": 8.875917278185456, "grad_norm": 0.8265054225921631, "step": 26610 }, { "epoch": 8.875917278185456, "learning_rate": 2.6217929025244182e-05, "step": 26610 }, { "epoch": 8.875917278185456, "loss": 0.38113945722579956, "step": 26610 }, { "ce_loss": 0.037726499140262604, "epoch": 8.875917278185456, "step": 26610 }, { "distill_loss": 0.19174793362617493, "epoch": 8.875917278185456, "step": 26610 }, { "epoch": 8.875917278185456, "ref_ce_loss": 0.06536515057086945, "step": 26610 }, { "epoch": 8.875917278185456, "loss": 0.4439065158367157, "step": 26610 }, { "ce_loss": 0.10350029915571213, "epoch": 8.875917278185456, "step": 26610 }, { "distill_loss": 0.19466392695903778, "epoch": 8.875917278185456, "step": 26610 }, { "epoch": 8.875917278185456, "ref_ce_loss": 0.09374602884054184, "step": 26610 }, { "epoch": 8.879252835223483, "loss": 0.3905, "step": 26620 }, { "epoch": 8.879252835223483, "grad_norm": 1.5056222677230835, "step": 26620 }, { "epoch": 8.879252835223483, "learning_rate": 2.606427362000976e-05, "step": 26620 }, { "epoch": 8.879252835223483, "loss": 0.9306776523590088, "step": 26620 }, { "ce_loss": 0.05697760358452797, "epoch": 8.879252835223483, "step": 26620 }, { "distill_loss": 0.22343315184116364, "epoch": 8.879252835223483, "step": 26620 }, { "epoch": 8.879252835223483, "ref_ce_loss": 0.08988448232412338, "step": 26620 }, { "epoch": 8.879252835223483, "loss": 0.3787762224674225, "step": 26620 }, { "ce_loss": 0.08007381856441498, "epoch": 8.879252835223483, "step": 26620 }, { "distill_loss": 0.2230757623910904, "epoch": 8.879252835223483, "step": 26620 }, { "epoch": 8.879252835223483, "ref_ce_loss": 0.07537058740854263, "step": 26620 }, { "epoch": 8.882588392261507, "loss": 0.4268, "step": 26630 }, { "epoch": 8.882588392261507, "grad_norm": 1.1199023723602295, "step": 26630 }, { "epoch": 8.882588392261507, "learning_rate": 2.5911054638005115e-05, "step": 26630 }, { "epoch": 8.882588392261507, "loss": 0.23840931057929993, "step": 26630 }, { "ce_loss": 0.028699828311800957, "epoch": 8.882588392261507, "step": 26630 }, { "distill_loss": 0.15274080634117126, "epoch": 8.882588392261507, "step": 26630 }, { "epoch": 8.882588392261507, "ref_ce_loss": 0.0567190982401371, "step": 26630 }, { "epoch": 8.882588392261507, "loss": 0.32390138506889343, "step": 26630 }, { "ce_loss": 0.059554897248744965, "epoch": 8.882588392261507, "step": 26630 }, { "distill_loss": 0.1620517522096634, "epoch": 8.882588392261507, "step": 26630 }, { "epoch": 8.882588392261507, "ref_ce_loss": 0.07728970050811768, "step": 26630 }, { "epoch": 8.885923949299533, "loss": 0.3893, "step": 26640 }, { "epoch": 8.885923949299533, "grad_norm": 1.9550362825393677, "step": 26640 }, { "epoch": 8.885923949299533, "learning_rate": 2.5758272258053473e-05, "step": 26640 }, { "epoch": 8.885923949299533, "loss": 0.38129478693008423, "step": 26640 }, { "ce_loss": 0.06399346888065338, "epoch": 8.885923949299533, "step": 26640 }, { "distill_loss": 0.19712619483470917, "epoch": 8.885923949299533, "step": 26640 }, { "epoch": 8.885923949299533, "ref_ce_loss": 0.08902854472398758, "step": 26640 }, { "epoch": 8.885923949299533, "loss": 0.5494421720504761, "step": 26640 }, { "ce_loss": 0.09578592330217361, "epoch": 8.885923949299533, "step": 26640 }, { "distill_loss": 0.1795535534620285, "epoch": 8.885923949299533, "step": 26640 }, { "epoch": 8.885923949299533, "ref_ce_loss": 0.07753363251686096, "step": 26640 }, { "epoch": 8.889259506337558, "loss": 0.3977, "step": 26650 }, { "epoch": 8.889259506337558, "grad_norm": 1.3418828248977661, "step": 26650 }, { "epoch": 8.889259506337558, "learning_rate": 2.5605926658468105e-05, "step": 26650 }, { "epoch": 8.889259506337558, "loss": 0.5159924626350403, "step": 26650 }, { "ce_loss": 0.06970398128032684, "epoch": 8.889259506337558, "step": 26650 }, { "distill_loss": 0.22379133105278015, "epoch": 8.889259506337558, "step": 26650 }, { "epoch": 8.889259506337558, "ref_ce_loss": 0.10952253639698029, "step": 26650 }, { "epoch": 8.889259506337558, "loss": 0.40654951333999634, "step": 26650 }, { "ce_loss": 0.061383120715618134, "epoch": 8.889259506337558, "step": 26650 }, { "distill_loss": 0.17689788341522217, "epoch": 8.889259506337558, "step": 26650 }, { "epoch": 8.889259506337558, "ref_ce_loss": 0.07978083193302155, "step": 26650 }, { "epoch": 8.892595063375584, "loss": 0.4451, "step": 26660 }, { "epoch": 8.892595063375584, "grad_norm": 1.0386784076690674, "step": 26660 }, { "epoch": 8.892595063375584, "learning_rate": 2.5454018017052826e-05, "step": 26660 }, { "epoch": 8.892595063375584, "loss": 0.40768587589263916, "step": 26660 }, { "ce_loss": 0.06482963263988495, "epoch": 8.892595063375584, "step": 26660 }, { "distill_loss": 0.1913122683763504, "epoch": 8.892595063375584, "step": 26660 }, { "epoch": 8.892595063375584, "ref_ce_loss": 0.08091914653778076, "step": 26660 }, { "epoch": 8.892595063375584, "loss": 0.4252420663833618, "step": 26660 }, { "ce_loss": 0.07114198803901672, "epoch": 8.892595063375584, "step": 26660 }, { "distill_loss": 0.18928878009319305, "epoch": 8.892595063375584, "step": 26660 }, { "epoch": 8.892595063375584, "ref_ce_loss": 0.07031705230474472, "step": 26660 }, { "epoch": 8.895930620413608, "loss": 0.362, "step": 26670 }, { "epoch": 8.895930620413608, "grad_norm": 1.1728801727294922, "step": 26670 }, { "epoch": 8.895930620413608, "learning_rate": 2.5302546511101333e-05, "step": 26670 }, { "epoch": 8.895930620413608, "loss": 0.3263809084892273, "step": 26670 }, { "ce_loss": 0.06961500644683838, "epoch": 8.895930620413608, "step": 26670 }, { "distill_loss": 0.18741771578788757, "epoch": 8.895930620413608, "step": 26670 }, { "epoch": 8.895930620413608, "ref_ce_loss": 0.06905804574489594, "step": 26670 }, { "epoch": 8.895930620413608, "loss": 0.4924054443836212, "step": 26670 }, { "ce_loss": 0.12667293846607208, "epoch": 8.895930620413608, "step": 26670 }, { "distill_loss": 0.2042117863893509, "epoch": 8.895930620413608, "step": 26670 }, { "epoch": 8.895930620413608, "ref_ce_loss": 0.12243467569351196, "step": 26670 }, { "epoch": 8.899266177451635, "loss": 0.4095, "step": 26680 }, { "epoch": 8.899266177451635, "grad_norm": 1.0906306505203247, "step": 26680 }, { "epoch": 8.899266177451635, "learning_rate": 2.515151231739723e-05, "step": 26680 }, { "epoch": 8.899266177451635, "loss": 0.36820027232170105, "step": 26680 }, { "ce_loss": 0.07361310720443726, "epoch": 8.899266177451635, "step": 26680 }, { "distill_loss": 0.17105498909950256, "epoch": 8.899266177451635, "step": 26680 }, { "epoch": 8.899266177451635, "ref_ce_loss": 0.09113267064094543, "step": 26680 }, { "epoch": 8.899266177451635, "loss": 0.29275742173194885, "step": 26680 }, { "ce_loss": 0.0631311684846878, "epoch": 8.899266177451635, "step": 26680 }, { "distill_loss": 0.16057291626930237, "epoch": 8.899266177451635, "step": 26680 }, { "epoch": 8.899266177451635, "ref_ce_loss": 0.05351213365793228, "step": 26680 }, { "epoch": 8.902601734489659, "loss": 0.4033, "step": 26690 }, { "epoch": 8.902601734489659, "grad_norm": 1.6852984428405762, "step": 26690 }, { "epoch": 8.902601734489659, "learning_rate": 2.500091561221356e-05, "step": 26690 }, { "epoch": 8.902601734489659, "loss": 0.31909945607185364, "step": 26690 }, { "ce_loss": 0.043618910014629364, "epoch": 8.902601734489659, "step": 26690 }, { "distill_loss": 0.14286518096923828, "epoch": 8.902601734489659, "step": 26690 }, { "epoch": 8.902601734489659, "ref_ce_loss": 0.057593002915382385, "step": 26690 }, { "epoch": 8.902601734489659, "loss": 0.22959819436073303, "step": 26690 }, { "ce_loss": 0.028676237910985947, "epoch": 8.902601734489659, "step": 26690 }, { "distill_loss": 0.12937778234481812, "epoch": 8.902601734489659, "step": 26690 }, { "epoch": 8.902601734489659, "ref_ce_loss": 0.05122470483183861, "step": 26690 }, { "epoch": 8.905937291527685, "loss": 0.3535, "step": 26700 }, { "epoch": 8.905937291527685, "grad_norm": 1.6896381378173828, "step": 26700 }, { "epoch": 8.905937291527685, "learning_rate": 2.4850756571312927e-05, "step": 26700 }, { "epoch": 8.905937291527685, "loss": 0.21083098649978638, "step": 26700 }, { "ce_loss": 0.02656303346157074, "epoch": 8.905937291527685, "step": 26700 }, { "distill_loss": 0.11085355281829834, "epoch": 8.905937291527685, "step": 26700 }, { "epoch": 8.905937291527685, "ref_ce_loss": 0.04174497351050377, "step": 26700 }, { "epoch": 8.905937291527685, "loss": 0.41885051131248474, "step": 26700 }, { "ce_loss": 0.03569251671433449, "epoch": 8.905937291527685, "step": 26700 }, { "distill_loss": 0.15860214829444885, "epoch": 8.905937291527685, "step": 26700 }, { "epoch": 8.905937291527685, "ref_ce_loss": 0.07976032793521881, "step": 26700 }, { "epoch": 8.90927284856571, "loss": 0.4428, "step": 26710 }, { "epoch": 8.90927284856571, "grad_norm": 2.085545778274536, "step": 26710 }, { "epoch": 8.90927284856571, "learning_rate": 2.47010353699471e-05, "step": 26710 }, { "epoch": 8.90927284856571, "loss": 0.35387659072875977, "step": 26710 }, { "ce_loss": 0.04973173886537552, "epoch": 8.90927284856571, "step": 26710 }, { "distill_loss": 0.16496436297893524, "epoch": 8.90927284856571, "step": 26710 }, { "epoch": 8.90927284856571, "ref_ce_loss": 0.06505236029624939, "step": 26710 }, { "epoch": 8.90927284856571, "loss": 0.35952889919281006, "step": 26710 }, { "ce_loss": 0.08064170926809311, "epoch": 8.90927284856571, "step": 26710 }, { "distill_loss": 0.18596360087394714, "epoch": 8.90927284856571, "step": 26710 }, { "epoch": 8.90927284856571, "ref_ce_loss": 0.07266595214605331, "step": 26710 }, { "epoch": 8.912608405603736, "loss": 0.4175, "step": 26720 }, { "epoch": 8.912608405603736, "grad_norm": 2.2813949584960938, "step": 26720 }, { "epoch": 8.912608405603736, "learning_rate": 2.4551752182856923e-05, "step": 26720 }, { "epoch": 8.912608405603736, "loss": 0.408029168844223, "step": 26720 }, { "ce_loss": 0.05037664622068405, "epoch": 8.912608405603736, "step": 26720 }, { "distill_loss": 0.19227397441864014, "epoch": 8.912608405603736, "step": 26720 }, { "epoch": 8.912608405603736, "ref_ce_loss": 0.08918608725070953, "step": 26720 }, { "epoch": 8.912608405603736, "loss": 0.5027357339859009, "step": 26720 }, { "ce_loss": 0.10362458229064941, "epoch": 8.912608405603736, "step": 26720 }, { "distill_loss": 0.19801238179206848, "epoch": 8.912608405603736, "step": 26720 }, { "epoch": 8.912608405603736, "ref_ce_loss": 0.11099687218666077, "step": 26720 }, { "epoch": 8.91594396264176, "loss": 0.4118, "step": 26730 }, { "epoch": 8.91594396264176, "grad_norm": 1.9336222410202026, "step": 26730 }, { "epoch": 8.91594396264176, "learning_rate": 2.4402907184271783e-05, "step": 26730 }, { "epoch": 8.91594396264176, "loss": 0.29472947120666504, "step": 26730 }, { "ce_loss": 0.027242260053753853, "epoch": 8.91594396264176, "step": 26730 }, { "distill_loss": 0.18607719242572784, "epoch": 8.91594396264176, "step": 26730 }, { "epoch": 8.91594396264176, "ref_ce_loss": 0.058557845652103424, "step": 26730 }, { "epoch": 8.91594396264176, "loss": 0.3766292929649353, "step": 26730 }, { "ce_loss": 0.051607754081487656, "epoch": 8.91594396264176, "step": 26730 }, { "distill_loss": 0.19452683627605438, "epoch": 8.91594396264176, "step": 26730 }, { "epoch": 8.91594396264176, "ref_ce_loss": 0.08398887515068054, "step": 26730 }, { "epoch": 8.919279519679787, "loss": 0.3726, "step": 26740 }, { "epoch": 8.919279519679787, "grad_norm": 1.3201515674591064, "step": 26740 }, { "epoch": 8.919279519679787, "learning_rate": 2.4254500547909787e-05, "step": 26740 }, { "epoch": 8.919279519679787, "loss": 0.31672608852386475, "step": 26740 }, { "ce_loss": 0.07216809689998627, "epoch": 8.919279519679787, "step": 26740 }, { "distill_loss": 0.16296908259391785, "epoch": 8.919279519679787, "step": 26740 }, { "epoch": 8.919279519679787, "ref_ce_loss": 0.08134753257036209, "step": 26740 }, { "epoch": 8.919279519679787, "loss": 0.37821874022483826, "step": 26740 }, { "ce_loss": 0.06907200068235397, "epoch": 8.919279519679787, "step": 26740 }, { "distill_loss": 0.2010176181793213, "epoch": 8.919279519679787, "step": 26740 }, { "epoch": 8.919279519679787, "ref_ce_loss": 0.08417536318302155, "step": 26740 }, { "epoch": 8.922615076717811, "loss": 0.4093, "step": 26750 }, { "epoch": 8.922615076717811, "grad_norm": 1.8416671752929688, "step": 26750 }, { "epoch": 8.922615076717811, "learning_rate": 2.4106532446977715e-05, "step": 26750 }, { "epoch": 8.922615076717811, "loss": 0.3364465832710266, "step": 26750 }, { "ce_loss": 0.0730133056640625, "epoch": 8.922615076717811, "step": 26750 }, { "distill_loss": 0.14422747492790222, "epoch": 8.922615076717811, "step": 26750 }, { "epoch": 8.922615076717811, "ref_ce_loss": 0.06264062970876694, "step": 26750 }, { "epoch": 8.922615076717811, "loss": 0.33681172132492065, "step": 26750 }, { "ce_loss": 0.05357135459780693, "epoch": 8.922615076717811, "step": 26750 }, { "distill_loss": 0.16947956383228302, "epoch": 8.922615076717811, "step": 26750 }, { "epoch": 8.922615076717811, "ref_ce_loss": 0.07739750295877457, "step": 26750 }, { "epoch": 8.925950633755837, "loss": 0.3788, "step": 26760 }, { "epoch": 8.925950633755837, "grad_norm": 1.0190787315368652, "step": 26760 }, { "epoch": 8.925950633755837, "learning_rate": 2.395900305416996e-05, "step": 26760 }, { "epoch": 8.925950633755837, "loss": 0.34849247336387634, "step": 26760 }, { "ce_loss": 0.04305924475193024, "epoch": 8.925950633755837, "step": 26760 }, { "distill_loss": 0.20099864900112152, "epoch": 8.925950633755837, "step": 26760 }, { "epoch": 8.925950633755837, "ref_ce_loss": 0.07525210082530975, "step": 26760 }, { "epoch": 8.925950633755837, "loss": 0.42576754093170166, "step": 26760 }, { "ce_loss": 0.05474647879600525, "epoch": 8.925950633755837, "step": 26760 }, { "distill_loss": 0.17419682443141937, "epoch": 8.925950633755837, "step": 26760 }, { "epoch": 8.925950633755837, "ref_ce_loss": 0.05943509563803673, "step": 26760 }, { "epoch": 8.929286190793862, "loss": 0.4248, "step": 26770 }, { "epoch": 8.929286190793862, "grad_norm": 1.1845428943634033, "step": 26770 }, { "epoch": 8.929286190793862, "learning_rate": 2.3811912541669368e-05, "step": 26770 }, { "epoch": 8.929286190793862, "loss": 0.6398705840110779, "step": 26770 }, { "ce_loss": 0.07201068103313446, "epoch": 8.929286190793862, "step": 26770 }, { "distill_loss": 0.18719375133514404, "epoch": 8.929286190793862, "step": 26770 }, { "epoch": 8.929286190793862, "ref_ce_loss": 0.07512786239385605, "step": 26770 }, { "epoch": 8.929286190793862, "loss": 0.5534118413925171, "step": 26770 }, { "ce_loss": 0.058874137699604034, "epoch": 8.929286190793862, "step": 26770 }, { "distill_loss": 0.22906601428985596, "epoch": 8.929286190793862, "step": 26770 }, { "epoch": 8.929286190793862, "ref_ce_loss": 0.09107699245214462, "step": 26770 }, { "epoch": 8.932621747831888, "loss": 0.3822, "step": 26780 }, { "epoch": 8.932621747831888, "grad_norm": 1.1596795320510864, "step": 26780 }, { "epoch": 8.932621747831888, "learning_rate": 2.366526108114635e-05, "step": 26780 }, { "epoch": 8.932621747831888, "loss": 0.37828749418258667, "step": 26780 }, { "ce_loss": 0.09381183981895447, "epoch": 8.932621747831888, "step": 26780 }, { "distill_loss": 0.19912533462047577, "epoch": 8.932621747831888, "step": 26780 }, { "epoch": 8.932621747831888, "ref_ce_loss": 0.037565797567367554, "step": 26780 }, { "epoch": 8.932621747831888, "loss": 0.2733290493488312, "step": 26780 }, { "ce_loss": 0.06196272000670433, "epoch": 8.932621747831888, "step": 26780 }, { "distill_loss": 0.14887866377830505, "epoch": 8.932621747831888, "step": 26780 }, { "epoch": 8.932621747831888, "ref_ce_loss": 0.062240395694971085, "step": 26780 }, { "epoch": 8.935957304869913, "loss": 0.4107, "step": 26790 }, { "epoch": 8.935957304869913, "grad_norm": 1.1044889688491821, "step": 26790 }, { "epoch": 8.935957304869913, "learning_rate": 2.3519048843758973e-05, "step": 26790 }, { "epoch": 8.935957304869913, "loss": 0.27776286005973816, "step": 26790 }, { "ce_loss": 0.04569617286324501, "epoch": 8.935957304869913, "step": 26790 }, { "distill_loss": 0.17505523562431335, "epoch": 8.935957304869913, "step": 26790 }, { "epoch": 8.935957304869913, "ref_ce_loss": 0.05621863901615143, "step": 26790 }, { "epoch": 8.935957304869913, "loss": 0.3137989044189453, "step": 26790 }, { "ce_loss": 0.04200056195259094, "epoch": 8.935957304869913, "step": 26790 }, { "distill_loss": 0.17961399257183075, "epoch": 8.935957304869913, "step": 26790 }, { "epoch": 8.935957304869913, "ref_ce_loss": 0.06352463364601135, "step": 26790 }, { "epoch": 8.939292861907939, "loss": 0.3986, "step": 26800 }, { "epoch": 8.939292861907939, "grad_norm": 1.3312222957611084, "step": 26800 }, { "epoch": 8.939292861907939, "learning_rate": 2.3373276000152645e-05, "step": 26800 }, { "epoch": 8.939292861907939, "loss": 0.7195945978164673, "step": 26800 }, { "ce_loss": 0.05876842886209488, "epoch": 8.939292861907939, "step": 26800 }, { "distill_loss": 0.21101140975952148, "epoch": 8.939292861907939, "step": 26800 }, { "epoch": 8.939292861907939, "ref_ce_loss": 0.09533131867647171, "step": 26800 }, { "epoch": 8.939292861907939, "loss": 0.4126310646533966, "step": 26800 }, { "ce_loss": 0.042031995952129364, "epoch": 8.939292861907939, "step": 26800 }, { "distill_loss": 0.1630733758211136, "epoch": 8.939292861907939, "step": 26800 }, { "epoch": 8.939292861907939, "ref_ce_loss": 0.07871302217245102, "step": 26800 }, { "epoch": 8.942628418945963, "loss": 0.443, "step": 26810 }, { "epoch": 8.942628418945963, "grad_norm": 2.133390426635742, "step": 26810 }, { "epoch": 8.942628418945963, "learning_rate": 2.3227942720459896e-05, "step": 26810 }, { "epoch": 8.942628418945963, "loss": 0.36603066325187683, "step": 26810 }, { "ce_loss": 0.05009675770998001, "epoch": 8.942628418945963, "step": 26810 }, { "distill_loss": 0.21553771197795868, "epoch": 8.942628418945963, "step": 26810 }, { "epoch": 8.942628418945963, "ref_ce_loss": 0.10018293559551239, "step": 26810 }, { "epoch": 8.942628418945963, "loss": 0.30181068181991577, "step": 26810 }, { "ce_loss": 0.0363122820854187, "epoch": 8.942628418945963, "step": 26810 }, { "distill_loss": 0.1500847041606903, "epoch": 8.942628418945963, "step": 26810 }, { "epoch": 8.942628418945963, "ref_ce_loss": 0.08789963275194168, "step": 26810 }, { "epoch": 8.94596397598399, "loss": 0.3884, "step": 26820 }, { "epoch": 8.94596397598399, "grad_norm": 2.869149684906006, "step": 26820 }, { "epoch": 8.94596397598399, "learning_rate": 2.3083049174300332e-05, "step": 26820 }, { "epoch": 8.94596397598399, "loss": 0.3147127032279968, "step": 26820 }, { "ce_loss": 0.02978779561817646, "epoch": 8.94596397598399, "step": 26820 }, { "distill_loss": 0.20026488602161407, "epoch": 8.94596397598399, "step": 26820 }, { "epoch": 8.94596397598399, "ref_ce_loss": 0.0839177593588829, "step": 26820 }, { "epoch": 8.94596397598399, "loss": 0.27547311782836914, "step": 26820 }, { "ce_loss": 0.040116142481565475, "epoch": 8.94596397598399, "step": 26820 }, { "distill_loss": 0.16665637493133545, "epoch": 8.94596397598399, "step": 26820 }, { "epoch": 8.94596397598399, "ref_ce_loss": 0.0683833584189415, "step": 26820 }, { "epoch": 8.949299533022014, "loss": 0.3734, "step": 26830 }, { "epoch": 8.949299533022014, "grad_norm": 1.3682441711425781, "step": 26830 }, { "epoch": 8.949299533022014, "learning_rate": 2.2938595530780325e-05, "step": 26830 }, { "epoch": 8.949299533022014, "loss": 0.37970811128616333, "step": 26830 }, { "ce_loss": 0.042396221309900284, "epoch": 8.949299533022014, "step": 26830 }, { "distill_loss": 0.17948472499847412, "epoch": 8.949299533022014, "step": 26830 }, { "epoch": 8.949299533022014, "ref_ce_loss": 0.08113700896501541, "step": 26830 }, { "epoch": 8.949299533022014, "loss": 0.3110351860523224, "step": 26830 }, { "ce_loss": 0.04640982300043106, "epoch": 8.949299533022014, "step": 26830 }, { "distill_loss": 0.16055357456207275, "epoch": 8.949299533022014, "step": 26830 }, { "epoch": 8.949299533022014, "ref_ce_loss": 0.07340537011623383, "step": 26830 }, { "epoch": 8.95263509006004, "loss": 0.3662, "step": 26840 }, { "epoch": 8.95263509006004, "grad_norm": 1.3402122259140015, "step": 26840 }, { "epoch": 8.95263509006004, "learning_rate": 2.279458195849289e-05, "step": 26840 }, { "epoch": 8.95263509006004, "loss": 0.5659681558609009, "step": 26840 }, { "ce_loss": 0.03422752767801285, "epoch": 8.95263509006004, "step": 26840 }, { "distill_loss": 0.18524156510829926, "epoch": 8.95263509006004, "step": 26840 }, { "epoch": 8.95263509006004, "ref_ce_loss": 0.06659182906150818, "step": 26840 }, { "epoch": 8.95263509006004, "loss": 0.4884268045425415, "step": 26840 }, { "ce_loss": 0.10238605737686157, "epoch": 8.95263509006004, "step": 26840 }, { "distill_loss": 0.224435955286026, "epoch": 8.95263509006004, "step": 26840 }, { "epoch": 8.95263509006004, "ref_ce_loss": 0.10202771425247192, "step": 26840 }, { "epoch": 8.955970647098065, "loss": 0.435, "step": 26850 }, { "epoch": 8.955970647098065, "grad_norm": 1.8279234170913696, "step": 26850 }, { "epoch": 8.955970647098065, "learning_rate": 2.2651008625517168e-05, "step": 26850 }, { "epoch": 8.955970647098065, "loss": 0.3548109531402588, "step": 26850 }, { "ce_loss": 0.059652846306562424, "epoch": 8.955970647098065, "step": 26850 }, { "distill_loss": 0.15193244814872742, "epoch": 8.955970647098065, "step": 26850 }, { "epoch": 8.955970647098065, "ref_ce_loss": 0.0689244195818901, "step": 26850 }, { "epoch": 8.955970647098065, "loss": 0.31553858518600464, "step": 26850 }, { "ce_loss": 0.0573444664478302, "epoch": 8.955970647098065, "step": 26850 }, { "distill_loss": 0.18329612910747528, "epoch": 8.955970647098065, "step": 26850 }, { "epoch": 8.955970647098065, "ref_ce_loss": 0.05450303852558136, "step": 26850 }, { "epoch": 8.959306204136091, "loss": 0.3977, "step": 26860 }, { "epoch": 8.959306204136091, "grad_norm": 1.8529592752456665, "step": 26860 }, { "epoch": 8.959306204136091, "learning_rate": 2.2507875699418855e-05, "step": 26860 }, { "epoch": 8.959306204136091, "loss": 0.35732439160346985, "step": 26860 }, { "ce_loss": 0.06432899832725525, "epoch": 8.959306204136091, "step": 26860 }, { "distill_loss": 0.1751275658607483, "epoch": 8.959306204136091, "step": 26860 }, { "epoch": 8.959306204136091, "ref_ce_loss": 0.0556345172226429, "step": 26860 }, { "epoch": 8.959306204136091, "loss": 0.2738136351108551, "step": 26860 }, { "ce_loss": 0.02206704579293728, "epoch": 8.959306204136091, "step": 26860 }, { "distill_loss": 0.16342735290527344, "epoch": 8.959306204136091, "step": 26860 }, { "epoch": 8.959306204136091, "ref_ce_loss": 0.04390689358115196, "step": 26860 }, { "epoch": 8.962641761174115, "loss": 0.3404, "step": 26870 }, { "epoch": 8.962641761174115, "grad_norm": 1.2462575435638428, "step": 26870 }, { "epoch": 8.962641761174115, "learning_rate": 2.2365183347249486e-05, "step": 26870 }, { "epoch": 8.962641761174115, "loss": 0.34319478273391724, "step": 26870 }, { "ce_loss": 0.06394265592098236, "epoch": 8.962641761174115, "step": 26870 }, { "distill_loss": 0.2054232507944107, "epoch": 8.962641761174115, "step": 26870 }, { "epoch": 8.962641761174115, "ref_ce_loss": 0.05607668310403824, "step": 26870 }, { "epoch": 8.962641761174115, "loss": 0.4374099373817444, "step": 26870 }, { "ce_loss": 0.05983159318566322, "epoch": 8.962641761174115, "step": 26870 }, { "distill_loss": 0.2247617542743683, "epoch": 8.962641761174115, "step": 26870 }, { "epoch": 8.962641761174115, "ref_ce_loss": 0.07600671797990799, "step": 26870 }, { "epoch": 8.965977318212142, "loss": 0.3653, "step": 26880 }, { "epoch": 8.965977318212142, "grad_norm": 1.3279523849487305, "step": 26880 }, { "epoch": 8.965977318212142, "learning_rate": 2.2222931735546327e-05, "step": 26880 }, { "epoch": 8.965977318212142, "loss": 0.263740599155426, "step": 26880 }, { "ce_loss": 0.0419314019382, "epoch": 8.965977318212142, "step": 26880 }, { "distill_loss": 0.14178217947483063, "epoch": 8.965977318212142, "step": 26880 }, { "epoch": 8.965977318212142, "ref_ce_loss": 0.07977408170700073, "step": 26880 }, { "epoch": 8.965977318212142, "loss": 0.33606958389282227, "step": 26880 }, { "ce_loss": 0.0492708757519722, "epoch": 8.965977318212142, "step": 26880 }, { "distill_loss": 0.1874486356973648, "epoch": 8.965977318212142, "step": 26880 }, { "epoch": 8.965977318212142, "ref_ce_loss": 0.07295677810907364, "step": 26880 }, { "epoch": 8.969312875250166, "loss": 0.3834, "step": 26890 }, { "epoch": 8.969312875250166, "grad_norm": 0.9264974594116211, "step": 26890 }, { "epoch": 8.969312875250166, "learning_rate": 2.2081121030332262e-05, "step": 26890 }, { "epoch": 8.969312875250166, "loss": 0.5124914050102234, "step": 26890 }, { "ce_loss": 0.07620886713266373, "epoch": 8.969312875250166, "step": 26890 }, { "distill_loss": 0.25074541568756104, "epoch": 8.969312875250166, "step": 26890 }, { "epoch": 8.969312875250166, "ref_ce_loss": 0.08912394195795059, "step": 26890 }, { "epoch": 8.969312875250166, "loss": 0.2802472412586212, "step": 26890 }, { "ce_loss": 0.044443145394325256, "epoch": 8.969312875250166, "step": 26890 }, { "distill_loss": 0.14432059228420258, "epoch": 8.969312875250166, "step": 26890 }, { "epoch": 8.969312875250166, "ref_ce_loss": 0.06065668910741806, "step": 26890 }, { "epoch": 8.972648432288192, "loss": 0.3987, "step": 26900 }, { "epoch": 8.972648432288192, "grad_norm": 1.4514647722244263, "step": 26900 }, { "epoch": 8.972648432288192, "learning_rate": 2.193975139711575e-05, "step": 26900 }, { "epoch": 8.972648432288192, "loss": 0.48129957914352417, "step": 26900 }, { "ce_loss": 0.05864886939525604, "epoch": 8.972648432288192, "step": 26900 }, { "distill_loss": 0.16718582808971405, "epoch": 8.972648432288192, "step": 26900 }, { "epoch": 8.972648432288192, "ref_ce_loss": 0.06964981555938721, "step": 26900 }, { "epoch": 8.972648432288192, "loss": 0.4144725799560547, "step": 26900 }, { "ce_loss": 0.06540128588676453, "epoch": 8.972648432288192, "step": 26900 }, { "distill_loss": 0.24699874222278595, "epoch": 8.972648432288192, "step": 26900 }, { "epoch": 8.972648432288192, "ref_ce_loss": 0.07367391884326935, "step": 26900 }, { "epoch": 8.975983989326217, "loss": 0.3679, "step": 26910 }, { "epoch": 8.975983989326217, "grad_norm": 1.1033375263214111, "step": 26910 }, { "epoch": 8.975983989326217, "learning_rate": 2.17988230008904e-05, "step": 26910 }, { "epoch": 8.975983989326217, "loss": 0.2607868015766144, "step": 26910 }, { "ce_loss": 0.02850893698632717, "epoch": 8.975983989326217, "step": 26910 }, { "distill_loss": 0.1603696495294571, "epoch": 8.975983989326217, "step": 26910 }, { "epoch": 8.975983989326217, "ref_ce_loss": 0.04794909805059433, "step": 26910 }, { "epoch": 8.975983989326217, "loss": 0.4003443717956543, "step": 26910 }, { "ce_loss": 0.05196845531463623, "epoch": 8.975983989326217, "step": 26910 }, { "distill_loss": 0.1829056739807129, "epoch": 8.975983989326217, "step": 26910 }, { "epoch": 8.975983989326217, "ref_ce_loss": 0.08970125019550323, "step": 26910 }, { "epoch": 8.979319546364243, "loss": 0.3785, "step": 26920 }, { "epoch": 8.979319546364243, "grad_norm": 1.235166072845459, "step": 26920 }, { "epoch": 8.979319546364243, "learning_rate": 2.165833600613465e-05, "step": 26920 }, { "epoch": 8.979319546364243, "loss": 0.2534266710281372, "step": 26920 }, { "ce_loss": 0.03887072950601578, "epoch": 8.979319546364243, "step": 26920 }, { "distill_loss": 0.1585645079612732, "epoch": 8.979319546364243, "step": 26920 }, { "epoch": 8.979319546364243, "ref_ce_loss": 0.05570882931351662, "step": 26920 }, { "epoch": 8.979319546364243, "loss": 0.35729390382766724, "step": 26920 }, { "ce_loss": 0.04741301015019417, "epoch": 8.979319546364243, "step": 26920 }, { "distill_loss": 0.1947993040084839, "epoch": 8.979319546364243, "step": 26920 }, { "epoch": 8.979319546364243, "ref_ce_loss": 0.08480067551136017, "step": 26920 }, { "epoch": 8.982655103402267, "loss": 0.3672, "step": 26930 }, { "epoch": 8.982655103402267, "grad_norm": 1.1416871547698975, "step": 26930 }, { "epoch": 8.982655103402267, "learning_rate": 2.151829057681205e-05, "step": 26930 }, { "epoch": 8.982655103402267, "loss": 0.30041593313217163, "step": 26930 }, { "ce_loss": 0.03660650923848152, "epoch": 8.982655103402267, "step": 26930 }, { "distill_loss": 0.14040933549404144, "epoch": 8.982655103402267, "step": 26930 }, { "epoch": 8.982655103402267, "ref_ce_loss": 0.06650853157043457, "step": 26930 }, { "epoch": 8.982655103402267, "loss": 0.3716757595539093, "step": 26930 }, { "ce_loss": 0.05143218860030174, "epoch": 8.982655103402267, "step": 26930 }, { "distill_loss": 0.1850205957889557, "epoch": 8.982655103402267, "step": 26930 }, { "epoch": 8.982655103402267, "ref_ce_loss": 0.10274700075387955, "step": 26930 }, { "epoch": 8.985990660440294, "loss": 0.385, "step": 26940 }, { "epoch": 8.985990660440294, "grad_norm": 1.246756672859192, "step": 26940 }, { "epoch": 8.985990660440294, "learning_rate": 2.1378686876370656e-05, "step": 26940 }, { "epoch": 8.985990660440294, "loss": 0.2395678460597992, "step": 26940 }, { "ce_loss": 0.02067268081009388, "epoch": 8.985990660440294, "step": 26940 }, { "distill_loss": 0.14293518662452698, "epoch": 8.985990660440294, "step": 26940 }, { "epoch": 8.985990660440294, "ref_ce_loss": 0.04838182032108307, "step": 26940 }, { "epoch": 8.985990660440294, "loss": 0.23595669865608215, "step": 26940 }, { "ce_loss": 0.02198474481701851, "epoch": 8.985990660440294, "step": 26940 }, { "distill_loss": 0.12834428250789642, "epoch": 8.985990660440294, "step": 26940 }, { "epoch": 8.985990660440294, "ref_ce_loss": 0.05260973051190376, "step": 26940 }, { "epoch": 8.989326217478318, "loss": 0.3931, "step": 26950 }, { "epoch": 8.989326217478318, "grad_norm": 1.2465322017669678, "step": 26950 }, { "epoch": 8.989326217478318, "learning_rate": 2.123952506774307e-05, "step": 26950 }, { "epoch": 8.989326217478318, "loss": 0.4282079339027405, "step": 26950 }, { "ce_loss": 0.10361059755086899, "epoch": 8.989326217478318, "step": 26950 }, { "distill_loss": 0.19960245490074158, "epoch": 8.989326217478318, "step": 26950 }, { "epoch": 8.989326217478318, "ref_ce_loss": 0.09390628337860107, "step": 26950 }, { "epoch": 8.989326217478318, "loss": 0.4041498005390167, "step": 26950 }, { "ce_loss": 0.06759005039930344, "epoch": 8.989326217478318, "step": 26950 }, { "distill_loss": 0.20927953720092773, "epoch": 8.989326217478318, "step": 26950 }, { "epoch": 8.989326217478318, "ref_ce_loss": 0.07374574989080429, "step": 26950 }, { "epoch": 8.992661774516344, "loss": 0.3814, "step": 26960 }, { "epoch": 8.992661774516344, "grad_norm": 1.1587697267532349, "step": 26960 }, { "epoch": 8.992661774516344, "learning_rate": 2.1100805313345907e-05, "step": 26960 }, { "epoch": 8.992661774516344, "loss": 0.46586868166923523, "step": 26960 }, { "ce_loss": 0.05573577433824539, "epoch": 8.992661774516344, "step": 26960 }, { "distill_loss": 0.2000192403793335, "epoch": 8.992661774516344, "step": 26960 }, { "epoch": 8.992661774516344, "ref_ce_loss": 0.06393946707248688, "step": 26960 }, { "epoch": 8.992661774516344, "loss": 0.25798797607421875, "step": 26960 }, { "ce_loss": 0.027636967599391937, "epoch": 8.992661774516344, "step": 26960 }, { "distill_loss": 0.1450037807226181, "epoch": 8.992661774516344, "step": 26960 }, { "epoch": 8.992661774516344, "ref_ce_loss": 0.058194465935230255, "step": 26960 }, { "epoch": 8.995997331554369, "loss": 0.3803, "step": 26970 }, { "epoch": 8.995997331554369, "grad_norm": 1.6340363025665283, "step": 26970 }, { "epoch": 8.995997331554369, "learning_rate": 2.0962527775080275e-05, "step": 26970 }, { "epoch": 8.995997331554369, "loss": 0.418100506067276, "step": 26970 }, { "ce_loss": 0.07010974735021591, "epoch": 8.995997331554369, "step": 26970 }, { "distill_loss": 0.2157282531261444, "epoch": 8.995997331554369, "step": 26970 }, { "epoch": 8.995997331554369, "ref_ce_loss": 0.09423127770423889, "step": 26970 }, { "epoch": 8.995997331554369, "loss": 0.7165571451187134, "step": 26970 }, { "ce_loss": 0.06447438895702362, "epoch": 8.995997331554369, "step": 26970 }, { "distill_loss": 0.15642520785331726, "epoch": 8.995997331554369, "step": 26970 }, { "epoch": 8.995997331554369, "ref_ce_loss": 0.08733781427145004, "step": 26970 }, { "epoch": 8.999332888592395, "loss": 0.4126, "step": 26980 }, { "epoch": 8.999332888592395, "grad_norm": 1.3778339624404907, "step": 26980 }, { "epoch": 8.999332888592395, "learning_rate": 2.082469261433082e-05, "step": 26980 }, { "epoch": 8.999332888592395, "loss": 0.5340480208396912, "step": 26980 }, { "ce_loss": 0.07798661291599274, "epoch": 8.999332888592395, "step": 26980 }, { "distill_loss": 0.21405644714832306, "epoch": 8.999332888592395, "step": 26980 }, { "epoch": 8.999332888592395, "ref_ce_loss": 0.08211926370859146, "step": 26980 }, { "epoch": 8.999332888592395, "loss": 0.4084705710411072, "step": 26980 }, { "ce_loss": 0.07932519912719727, "epoch": 8.999332888592395, "step": 26980 }, { "distill_loss": 0.18646341562271118, "epoch": 8.999332888592395, "step": 26980 }, { "epoch": 8.999332888592395, "ref_ce_loss": 0.07593373954296112, "step": 26980 }, { "epoch": 9.00266844563042, "loss": 0.3665, "step": 26990 }, { "epoch": 9.00266844563042, "grad_norm": 0.8887509107589722, "step": 26990 }, { "epoch": 9.00266844563042, "learning_rate": 2.0687299991966104e-05, "step": 26990 }, { "epoch": 9.00266844563042, "loss": 0.2786753177642822, "step": 26990 }, { "ce_loss": 0.03038201667368412, "epoch": 9.00266844563042, "step": 26990 }, { "distill_loss": 0.15614908933639526, "epoch": 9.00266844563042, "step": 26990 }, { "epoch": 9.00266844563042, "ref_ce_loss": 0.053884122520685196, "step": 26990 }, { "epoch": 9.00266844563042, "loss": 0.38568711280822754, "step": 26990 }, { "ce_loss": 0.053761474788188934, "epoch": 9.00266844563042, "step": 26990 }, { "distill_loss": 0.22754529118537903, "epoch": 9.00266844563042, "step": 26990 }, { "epoch": 9.00266844563042, "ref_ce_loss": 0.07468869537115097, "step": 26990 }, { "epoch": 9.006004002668446, "loss": 0.3239, "step": 27000 }, { "epoch": 9.006004002668446, "grad_norm": 0.9528130292892456, "step": 27000 }, { "epoch": 9.006004002668446, "learning_rate": 2.0550350068337987e-05, "step": 27000 }, { "epoch": 9.006004002668446, "loss": 0.32287442684173584, "step": 27000 }, { "ce_loss": 0.03582875803112984, "epoch": 9.006004002668446, "step": 27000 }, { "distill_loss": 0.19158251583576202, "epoch": 9.006004002668446, "step": 27000 }, { "epoch": 9.006004002668446, "ref_ce_loss": 0.06319230049848557, "step": 27000 }, { "epoch": 9.006004002668446, "loss": 0.4253171682357788, "step": 27000 }, { "ce_loss": 0.03548054024577141, "epoch": 9.006004002668446, "step": 27000 }, { "distill_loss": 0.15410447120666504, "epoch": 9.006004002668446, "step": 27000 }, { "epoch": 9.006004002668446, "ref_ce_loss": 0.060177017003297806, "step": 27000 }, { "epoch": 9.00933955970647, "loss": 0.3901, "step": 27010 }, { "epoch": 9.00933955970647, "grad_norm": 1.0065354108810425, "step": 27010 }, { "epoch": 9.00933955970647, "learning_rate": 2.0413843003281818e-05, "step": 27010 }, { "epoch": 9.00933955970647, "loss": 0.22900214791297913, "step": 27010 }, { "ce_loss": 0.020226458087563515, "epoch": 9.00933955970647, "step": 27010 }, { "distill_loss": 0.14763259887695312, "epoch": 9.00933955970647, "step": 27010 }, { "epoch": 9.00933955970647, "ref_ce_loss": 0.051381126046180725, "step": 27010 }, { "epoch": 9.00933955970647, "loss": 0.28583458065986633, "step": 27010 }, { "ce_loss": 0.04080132767558098, "epoch": 9.00933955970647, "step": 27010 }, { "distill_loss": 0.1587204784154892, "epoch": 9.00933955970647, "step": 27010 }, { "epoch": 9.00933955970647, "ref_ce_loss": 0.06302133202552795, "step": 27010 }, { "epoch": 9.012675116744497, "loss": 0.3127, "step": 27020 }, { "epoch": 9.012675116744497, "grad_norm": 1.1079349517822266, "step": 27020 }, { "epoch": 9.012675116744497, "learning_rate": 2.0277778956116023e-05, "step": 27020 }, { "epoch": 9.012675116744497, "loss": 0.29996344447135925, "step": 27020 }, { "ce_loss": 0.03723428398370743, "epoch": 9.012675116744497, "step": 27020 }, { "distill_loss": 0.19037845730781555, "epoch": 9.012675116744497, "step": 27020 }, { "epoch": 9.012675116744497, "ref_ce_loss": 0.05403125286102295, "step": 27020 }, { "epoch": 9.012675116744497, "loss": 0.26005294919013977, "step": 27020 }, { "ce_loss": 0.017900634557008743, "epoch": 9.012675116744497, "step": 27020 }, { "distill_loss": 0.10674063116312027, "epoch": 9.012675116744497, "step": 27020 }, { "epoch": 9.012675116744497, "ref_ce_loss": 0.04297463968396187, "step": 27020 }, { "epoch": 9.016010673782521, "loss": 0.334, "step": 27030 }, { "epoch": 9.016010673782521, "grad_norm": 0.9277923107147217, "step": 27030 }, { "epoch": 9.016010673782521, "learning_rate": 2.0142158085642014e-05, "step": 27030 }, { "epoch": 9.016010673782521, "loss": 0.32305458188056946, "step": 27030 }, { "ce_loss": 0.03489192947745323, "epoch": 9.016010673782521, "step": 27030 }, { "distill_loss": 0.22198455035686493, "epoch": 9.016010673782521, "step": 27030 }, { "epoch": 9.016010673782521, "ref_ce_loss": 0.06594966351985931, "step": 27030 }, { "epoch": 9.016010673782521, "loss": 0.24616390466690063, "step": 27030 }, { "ce_loss": 0.04013339802622795, "epoch": 9.016010673782521, "step": 27030 }, { "distill_loss": 0.14001591503620148, "epoch": 9.016010673782521, "step": 27030 }, { "epoch": 9.016010673782521, "ref_ce_loss": 0.04386579245328903, "step": 27030 }, { "epoch": 9.019346230820547, "loss": 0.3458, "step": 27040 }, { "epoch": 9.019346230820547, "grad_norm": 0.9151402711868286, "step": 27040 }, { "epoch": 9.019346230820547, "learning_rate": 2.0006980550143893e-05, "step": 27040 }, { "epoch": 9.019346230820547, "loss": 0.3370954990386963, "step": 27040 }, { "ce_loss": 0.030183374881744385, "epoch": 9.019346230820547, "step": 27040 }, { "distill_loss": 0.17306430637836456, "epoch": 9.019346230820547, "step": 27040 }, { "epoch": 9.019346230820547, "ref_ce_loss": 0.06660886108875275, "step": 27040 }, { "epoch": 9.019346230820547, "loss": 0.39963436126708984, "step": 27040 }, { "ce_loss": 0.03261478245258331, "epoch": 9.019346230820547, "step": 27040 }, { "distill_loss": 0.18940173089504242, "epoch": 9.019346230820547, "step": 27040 }, { "epoch": 9.019346230820547, "ref_ce_loss": 0.06742677092552185, "step": 27040 }, { "epoch": 9.022681787858572, "loss": 0.3546, "step": 27050 }, { "epoch": 9.022681787858572, "grad_norm": 0.9811869263648987, "step": 27050 }, { "epoch": 9.022681787858572, "learning_rate": 1.9872246507388394e-05, "step": 27050 }, { "epoch": 9.022681787858572, "loss": 0.20993368327617645, "step": 27050 }, { "ce_loss": 0.01603628695011139, "epoch": 9.022681787858572, "step": 27050 }, { "distill_loss": 0.13603952527046204, "epoch": 9.022681787858572, "step": 27050 }, { "epoch": 9.022681787858572, "ref_ce_loss": 0.04286492243409157, "step": 27050 }, { "epoch": 9.022681787858572, "loss": 0.4756922125816345, "step": 27050 }, { "ce_loss": 0.060111287981271744, "epoch": 9.022681787858572, "step": 27050 }, { "distill_loss": 0.2132369875907898, "epoch": 9.022681787858572, "step": 27050 }, { "epoch": 9.022681787858572, "ref_ce_loss": 0.07596529275178909, "step": 27050 }, { "epoch": 9.026017344896598, "loss": 0.3577, "step": 27060 }, { "epoch": 9.026017344896598, "grad_norm": 1.1416993141174316, "step": 27060 }, { "epoch": 9.026017344896598, "learning_rate": 1.9737956114624655e-05, "step": 27060 }, { "epoch": 9.026017344896598, "loss": 0.35747769474983215, "step": 27060 }, { "ce_loss": 0.03079773299396038, "epoch": 9.026017344896598, "step": 27060 }, { "distill_loss": 0.2055409699678421, "epoch": 9.026017344896598, "step": 27060 }, { "epoch": 9.026017344896598, "ref_ce_loss": 0.08336233347654343, "step": 27060 }, { "epoch": 9.026017344896598, "loss": 0.2289038896560669, "step": 27060 }, { "ce_loss": 0.009864287450909615, "epoch": 9.026017344896598, "step": 27060 }, { "distill_loss": 0.12856513261795044, "epoch": 9.026017344896598, "step": 27060 }, { "epoch": 9.026017344896598, "ref_ce_loss": 0.059212226420640945, "step": 27060 }, { "epoch": 9.029352901934622, "loss": 0.36, "step": 27070 }, { "epoch": 9.029352901934622, "grad_norm": 1.1386539936065674, "step": 27070 }, { "epoch": 9.029352901934622, "learning_rate": 1.9604109528584025e-05, "step": 27070 }, { "epoch": 9.029352901934622, "loss": 0.297063946723938, "step": 27070 }, { "ce_loss": 0.017145255580544472, "epoch": 9.029352901934622, "step": 27070 }, { "distill_loss": 0.20598909258842468, "epoch": 9.029352901934622, "step": 27070 }, { "epoch": 9.029352901934622, "ref_ce_loss": 0.07353837788105011, "step": 27070 }, { "epoch": 9.029352901934622, "loss": 0.2827029228210449, "step": 27070 }, { "ce_loss": 0.020517654716968536, "epoch": 9.029352901934622, "step": 27070 }, { "distill_loss": 0.18179626762866974, "epoch": 9.029352901934622, "step": 27070 }, { "epoch": 9.029352901934622, "ref_ce_loss": 0.08018902689218521, "step": 27070 }, { "epoch": 9.032688458972649, "loss": 0.3358, "step": 27080 }, { "epoch": 9.032688458972649, "grad_norm": 1.0653818845748901, "step": 27080 }, { "epoch": 9.032688458972649, "learning_rate": 1.9470706905479897e-05, "step": 27080 }, { "epoch": 9.032688458972649, "loss": 0.2931612432003021, "step": 27080 }, { "ce_loss": 0.04542126879096031, "epoch": 9.032688458972649, "step": 27080 }, { "distill_loss": 0.16980938613414764, "epoch": 9.032688458972649, "step": 27080 }, { "epoch": 9.032688458972649, "ref_ce_loss": 0.054527461528778076, "step": 27080 }, { "epoch": 9.032688458972649, "loss": 0.28220316767692566, "step": 27080 }, { "ce_loss": 0.06899258494377136, "epoch": 9.032688458972649, "step": 27080 }, { "distill_loss": 0.14135973155498505, "epoch": 9.032688458972649, "step": 27080 }, { "epoch": 9.032688458972649, "ref_ce_loss": 0.07163500785827637, "step": 27080 }, { "epoch": 9.036024016010673, "loss": 0.3818, "step": 27090 }, { "epoch": 9.036024016010673, "grad_norm": 0.8686916828155518, "step": 27090 }, { "epoch": 9.036024016010673, "learning_rate": 1.9337748401007418e-05, "step": 27090 }, { "epoch": 9.036024016010673, "loss": 0.3689887523651123, "step": 27090 }, { "ce_loss": 0.041207872331142426, "epoch": 9.036024016010673, "step": 27090 }, { "distill_loss": 0.18242886662483215, "epoch": 9.036024016010673, "step": 27090 }, { "epoch": 9.036024016010673, "ref_ce_loss": 0.06660563498735428, "step": 27090 }, { "epoch": 9.036024016010673, "loss": 0.3805079460144043, "step": 27090 }, { "ce_loss": 0.06088177487254143, "epoch": 9.036024016010673, "step": 27090 }, { "distill_loss": 0.2561386823654175, "epoch": 9.036024016010673, "step": 27090 }, { "epoch": 9.036024016010673, "ref_ce_loss": 0.06328141689300537, "step": 27090 }, { "epoch": 9.0393595730487, "loss": 0.3654, "step": 27100 }, { "epoch": 9.0393595730487, "grad_norm": 1.8930615186691284, "step": 27100 }, { "epoch": 9.0393595730487, "learning_rate": 1.9205234170343567e-05, "step": 27100 }, { "epoch": 9.0393595730487, "loss": 0.2853085994720459, "step": 27100 }, { "ce_loss": 0.028148401528596878, "epoch": 9.0393595730487, "step": 27100 }, { "distill_loss": 0.1661919206380844, "epoch": 9.0393595730487, "step": 27100 }, { "epoch": 9.0393595730487, "ref_ce_loss": 0.055086735635995865, "step": 27100 }, { "epoch": 9.0393595730487, "loss": 0.34833627939224243, "step": 27100 }, { "ce_loss": 0.036774538457393646, "epoch": 9.0393595730487, "step": 27100 }, { "distill_loss": 0.14977332949638367, "epoch": 9.0393595730487, "step": 27100 }, { "epoch": 9.0393595730487, "ref_ce_loss": 0.0676417276263237, "step": 27100 }, { "epoch": 9.042695130086724, "loss": 0.3574, "step": 27110 }, { "epoch": 9.042695130086724, "grad_norm": 0.9850945472717285, "step": 27110 }, { "epoch": 9.042695130086724, "learning_rate": 1.907316436814659e-05, "step": 27110 }, { "epoch": 9.042695130086724, "loss": 0.3692997395992279, "step": 27110 }, { "ce_loss": 0.030617495998740196, "epoch": 9.042695130086724, "step": 27110 }, { "distill_loss": 0.17378318309783936, "epoch": 9.042695130086724, "step": 27110 }, { "epoch": 9.042695130086724, "ref_ce_loss": 0.07603063434362411, "step": 27110 }, { "epoch": 9.042695130086724, "loss": 0.4435376822948456, "step": 27110 }, { "ce_loss": 0.05087882652878761, "epoch": 9.042695130086724, "step": 27110 }, { "distill_loss": 0.19516976177692413, "epoch": 9.042695130086724, "step": 27110 }, { "epoch": 9.042695130086724, "ref_ce_loss": 0.0701480358839035, "step": 27110 }, { "epoch": 9.04603068712475, "loss": 0.3588, "step": 27120 }, { "epoch": 9.04603068712475, "grad_norm": 1.261003017425537, "step": 27120 }, { "epoch": 9.04603068712475, "learning_rate": 1.894153914855625e-05, "step": 27120 }, { "epoch": 9.04603068712475, "loss": 0.2516244947910309, "step": 27120 }, { "ce_loss": 0.01986541971564293, "epoch": 9.04603068712475, "step": 27120 }, { "distill_loss": 0.18128304183483124, "epoch": 9.04603068712475, "step": 27120 }, { "epoch": 9.04603068712475, "ref_ce_loss": 0.04984374716877937, "step": 27120 }, { "epoch": 9.04603068712475, "loss": 0.37950924038887024, "step": 27120 }, { "ce_loss": 0.0460323728621006, "epoch": 9.04603068712475, "step": 27120 }, { "distill_loss": 0.21613475680351257, "epoch": 9.04603068712475, "step": 27120 }, { "epoch": 9.04603068712475, "ref_ce_loss": 0.07548118382692337, "step": 27120 }, { "epoch": 9.049366244162774, "loss": 0.3741, "step": 27130 }, { "epoch": 9.049366244162774, "grad_norm": 1.4003740549087524, "step": 27130 }, { "epoch": 9.049366244162774, "learning_rate": 1.8810358665193273e-05, "step": 27130 }, { "epoch": 9.049366244162774, "loss": 0.24134016036987305, "step": 27130 }, { "ce_loss": 0.01722213812172413, "epoch": 9.049366244162774, "step": 27130 }, { "distill_loss": 0.12212380766868591, "epoch": 9.049366244162774, "step": 27130 }, { "epoch": 9.049366244162774, "ref_ce_loss": 0.04552808403968811, "step": 27130 }, { "epoch": 9.049366244162774, "loss": 0.29700788855552673, "step": 27130 }, { "ce_loss": 0.013893130235373974, "epoch": 9.049366244162774, "step": 27130 }, { "distill_loss": 0.1976793259382248, "epoch": 9.049366244162774, "step": 27130 }, { "epoch": 9.049366244162774, "ref_ce_loss": 0.05386512726545334, "step": 27130 }, { "epoch": 9.0527018012008, "loss": 0.3745, "step": 27140 }, { "epoch": 9.0527018012008, "grad_norm": 1.0930335521697998, "step": 27140 }, { "epoch": 9.0527018012008, "learning_rate": 1.8679623071159535e-05, "step": 27140 }, { "epoch": 9.0527018012008, "loss": 0.2979638874530792, "step": 27140 }, { "ce_loss": 0.038029931485652924, "epoch": 9.0527018012008, "step": 27140 }, { "distill_loss": 0.17696678638458252, "epoch": 9.0527018012008, "step": 27140 }, { "epoch": 9.0527018012008, "ref_ce_loss": 0.0641922727227211, "step": 27140 }, { "epoch": 9.0527018012008, "loss": 0.3175976276397705, "step": 27140 }, { "ce_loss": 0.027088569477200508, "epoch": 9.0527018012008, "step": 27140 }, { "distill_loss": 0.12965548038482666, "epoch": 9.0527018012008, "step": 27140 }, { "epoch": 9.0527018012008, "ref_ce_loss": 0.07547727227210999, "step": 27140 }, { "epoch": 9.056037358238825, "loss": 0.3573, "step": 27150 }, { "epoch": 9.056037358238825, "grad_norm": 1.1945284605026245, "step": 27150 }, { "epoch": 9.056037358238825, "learning_rate": 1.8549332519037344e-05, "step": 27150 }, { "epoch": 9.056037358238825, "loss": 0.369720458984375, "step": 27150 }, { "ce_loss": 0.041470445692539215, "epoch": 9.056037358238825, "step": 27150 }, { "distill_loss": 0.24201340973377228, "epoch": 9.056037358238825, "step": 27150 }, { "epoch": 9.056037358238825, "ref_ce_loss": 0.058247197419404984, "step": 27150 }, { "epoch": 9.056037358238825, "loss": 0.2370813488960266, "step": 27150 }, { "ce_loss": 0.011418032459914684, "epoch": 9.056037358238825, "step": 27150 }, { "distill_loss": 0.13475269079208374, "epoch": 9.056037358238825, "step": 27150 }, { "epoch": 9.056037358238825, "ref_ce_loss": 0.059003476053476334, "step": 27150 }, { "epoch": 9.059372915276851, "loss": 0.3416, "step": 27160 }, { "epoch": 9.059372915276851, "grad_norm": 0.990787923336029, "step": 27160 }, { "epoch": 9.059372915276851, "learning_rate": 1.8419487160889947e-05, "step": 27160 }, { "epoch": 9.059372915276851, "loss": 0.33093833923339844, "step": 27160 }, { "ce_loss": 0.05214123800396919, "epoch": 9.059372915276851, "step": 27160 }, { "distill_loss": 0.19237744808197021, "epoch": 9.059372915276851, "step": 27160 }, { "epoch": 9.059372915276851, "ref_ce_loss": 0.06416904181241989, "step": 27160 }, { "epoch": 9.059372915276851, "loss": 0.3283519446849823, "step": 27160 }, { "ce_loss": 0.037675682455301285, "epoch": 9.059372915276851, "step": 27160 }, { "distill_loss": 0.18677571415901184, "epoch": 9.059372915276851, "step": 27160 }, { "epoch": 9.059372915276851, "ref_ce_loss": 0.07229268550872803, "step": 27160 }, { "epoch": 9.062708472314876, "loss": 0.3212, "step": 27170 }, { "epoch": 9.062708472314876, "grad_norm": 1.0613186359405518, "step": 27170 }, { "epoch": 9.062708472314876, "learning_rate": 1.8290087148260748e-05, "step": 27170 }, { "epoch": 9.062708472314876, "loss": 0.17752593755722046, "step": 27170 }, { "ce_loss": 0.021205492317676544, "epoch": 9.062708472314876, "step": 27170 }, { "distill_loss": 0.10737836360931396, "epoch": 9.062708472314876, "step": 27170 }, { "epoch": 9.062708472314876, "ref_ce_loss": 0.03590012341737747, "step": 27170 }, { "epoch": 9.062708472314876, "loss": 0.38357293605804443, "step": 27170 }, { "ce_loss": 0.07017988711595535, "epoch": 9.062708472314876, "step": 27170 }, { "distill_loss": 0.16169802844524384, "epoch": 9.062708472314876, "step": 27170 }, { "epoch": 9.062708472314876, "ref_ce_loss": 0.07344874739646912, "step": 27170 }, { "epoch": 9.066044029352902, "loss": 0.3508, "step": 27180 }, { "epoch": 9.066044029352902, "grad_norm": 1.3445948362350464, "step": 27180 }, { "epoch": 9.066044029352902, "learning_rate": 1.8161132632173562e-05, "step": 27180 }, { "epoch": 9.066044029352902, "loss": 0.3429279625415802, "step": 27180 }, { "ce_loss": 0.03333231061697006, "epoch": 9.066044029352902, "step": 27180 }, { "distill_loss": 0.1579761505126953, "epoch": 9.066044029352902, "step": 27180 }, { "epoch": 9.066044029352902, "ref_ce_loss": 0.06460767239332199, "step": 27180 }, { "epoch": 9.066044029352902, "loss": 0.53684401512146, "step": 27180 }, { "ce_loss": 0.0448799729347229, "epoch": 9.066044029352902, "step": 27180 }, { "distill_loss": 0.23155061900615692, "epoch": 9.066044029352902, "step": 27180 }, { "epoch": 9.066044029352902, "ref_ce_loss": 0.06540628522634506, "step": 27180 }, { "epoch": 9.069379586390927, "loss": 0.3759, "step": 27190 }, { "epoch": 9.069379586390927, "grad_norm": 1.0961995124816895, "step": 27190 }, { "epoch": 9.069379586390927, "learning_rate": 1.803262376313213e-05, "step": 27190 }, { "epoch": 9.069379586390927, "loss": 0.26204830408096313, "step": 27190 }, { "ce_loss": 0.033201009035110474, "epoch": 9.069379586390927, "step": 27190 }, { "distill_loss": 0.1589648425579071, "epoch": 9.069379586390927, "step": 27190 }, { "epoch": 9.069379586390927, "ref_ce_loss": 0.05141977593302727, "step": 27190 }, { "epoch": 9.069379586390927, "loss": 0.2719113826751709, "step": 27190 }, { "ce_loss": 0.031158355996012688, "epoch": 9.069379586390927, "step": 27190 }, { "distill_loss": 0.160211443901062, "epoch": 9.069379586390927, "step": 27190 }, { "epoch": 9.069379586390927, "ref_ce_loss": 0.05910457298159599, "step": 27190 }, { "epoch": 9.072715143428953, "loss": 0.3287, "step": 27200 }, { "epoch": 9.072715143428953, "grad_norm": 0.9663136005401611, "step": 27200 }, { "epoch": 9.072715143428953, "learning_rate": 1.7904560691120164e-05, "step": 27200 }, { "epoch": 9.072715143428953, "loss": 0.41447609663009644, "step": 27200 }, { "ce_loss": 0.04455767944455147, "epoch": 9.072715143428953, "step": 27200 }, { "distill_loss": 0.2078428715467453, "epoch": 9.072715143428953, "step": 27200 }, { "epoch": 9.072715143428953, "ref_ce_loss": 0.08008597791194916, "step": 27200 }, { "epoch": 9.072715143428953, "loss": 0.2740389406681061, "step": 27200 }, { "ce_loss": 0.026906346902251244, "epoch": 9.072715143428953, "step": 27200 }, { "distill_loss": 0.13863641023635864, "epoch": 9.072715143428953, "step": 27200 }, { "epoch": 9.072715143428953, "ref_ce_loss": 0.0535602904856205, "step": 27200 }, { "epoch": 9.076050700466977, "loss": 0.3602, "step": 27210 }, { "epoch": 9.076050700466977, "grad_norm": 1.1693758964538574, "step": 27210 }, { "epoch": 9.076050700466977, "learning_rate": 1.7776943565601046e-05, "step": 27210 }, { "epoch": 9.076050700466977, "loss": 0.3499804735183716, "step": 27210 }, { "ce_loss": 0.028613731265068054, "epoch": 9.076050700466977, "step": 27210 }, { "distill_loss": 0.1530584990978241, "epoch": 9.076050700466977, "step": 27210 }, { "epoch": 9.076050700466977, "ref_ce_loss": 0.07354908436536789, "step": 27210 }, { "epoch": 9.076050700466977, "loss": 0.6023759245872498, "step": 27210 }, { "ce_loss": 0.03832937777042389, "epoch": 9.076050700466977, "step": 27210 }, { "distill_loss": 0.1743897646665573, "epoch": 9.076050700466977, "step": 27210 }, { "epoch": 9.076050700466977, "ref_ce_loss": 0.055310387164354324, "step": 27210 }, { "epoch": 9.079386257505003, "loss": 0.3774, "step": 27220 }, { "epoch": 9.079386257505003, "grad_norm": 1.050165057182312, "step": 27220 }, { "epoch": 9.079386257505003, "learning_rate": 1.764977253551776e-05, "step": 27220 }, { "epoch": 9.079386257505003, "loss": 0.5067462921142578, "step": 27220 }, { "ce_loss": 0.02488376945257187, "epoch": 9.079386257505003, "step": 27220 }, { "distill_loss": 0.15910297632217407, "epoch": 9.079386257505003, "step": 27220 }, { "epoch": 9.079386257505003, "ref_ce_loss": 0.06450691819190979, "step": 27220 }, { "epoch": 9.079386257505003, "loss": 0.3871625065803528, "step": 27220 }, { "ce_loss": 0.023935766890645027, "epoch": 9.079386257505003, "step": 27220 }, { "distill_loss": 0.14328233897686005, "epoch": 9.079386257505003, "step": 27220 }, { "epoch": 9.079386257505003, "ref_ce_loss": 0.0571419782936573, "step": 27220 }, { "epoch": 9.082721814543028, "loss": 0.3639, "step": 27230 }, { "epoch": 9.082721814543028, "grad_norm": 1.2589771747589111, "step": 27230 }, { "epoch": 9.082721814543028, "learning_rate": 1.7523047749292433e-05, "step": 27230 }, { "epoch": 9.082721814543028, "loss": 0.27285081148147583, "step": 27230 }, { "ce_loss": 0.022948715835809708, "epoch": 9.082721814543028, "step": 27230 }, { "distill_loss": 0.16958534717559814, "epoch": 9.082721814543028, "step": 27230 }, { "epoch": 9.082721814543028, "ref_ce_loss": 0.0546344593167305, "step": 27230 }, { "epoch": 9.082721814543028, "loss": 0.26805174350738525, "step": 27230 }, { "ce_loss": 0.03970019891858101, "epoch": 9.082721814543028, "step": 27230 }, { "distill_loss": 0.16495995223522186, "epoch": 9.082721814543028, "step": 27230 }, { "epoch": 9.082721814543028, "ref_ce_loss": 0.044752802699804306, "step": 27230 }, { "epoch": 9.086057371581054, "loss": 0.343, "step": 27240 }, { "epoch": 9.086057371581054, "grad_norm": 0.9426337480545044, "step": 27240 }, { "epoch": 9.086057371581054, "learning_rate": 1.7396769354826616e-05, "step": 27240 }, { "epoch": 9.086057371581054, "loss": 0.2930073142051697, "step": 27240 }, { "ce_loss": 0.034419164061546326, "epoch": 9.086057371581054, "step": 27240 }, { "distill_loss": 0.18961873650550842, "epoch": 9.086057371581054, "step": 27240 }, { "epoch": 9.086057371581054, "ref_ce_loss": 0.050885505974292755, "step": 27240 }, { "epoch": 9.086057371581054, "loss": 0.36552560329437256, "step": 27240 }, { "ce_loss": 0.039030518382787704, "epoch": 9.086057371581054, "step": 27240 }, { "distill_loss": 0.20410792529582977, "epoch": 9.086057371581054, "step": 27240 }, { "epoch": 9.086057371581054, "ref_ce_loss": 0.07064566016197205, "step": 27240 }, { "epoch": 9.089392928619079, "loss": 0.3392, "step": 27250 }, { "epoch": 9.089392928619079, "grad_norm": 1.2283823490142822, "step": 27250 }, { "epoch": 9.089392928619079, "learning_rate": 1.7270937499500773e-05, "step": 27250 }, { "epoch": 9.089392928619079, "loss": 0.377672016620636, "step": 27250 }, { "ce_loss": 0.039118725806474686, "epoch": 9.089392928619079, "step": 27250 }, { "distill_loss": 0.15970379114151, "epoch": 9.089392928619079, "step": 27250 }, { "epoch": 9.089392928619079, "ref_ce_loss": 0.0498061366379261, "step": 27250 }, { "epoch": 9.089392928619079, "loss": 0.26500624418258667, "step": 27250 }, { "ce_loss": 0.05562635511159897, "epoch": 9.089392928619079, "step": 27250 }, { "distill_loss": 0.15516263246536255, "epoch": 9.089392928619079, "step": 27250 }, { "epoch": 9.089392928619079, "ref_ce_loss": 0.03663584962487221, "step": 27250 }, { "epoch": 9.092728485657105, "loss": 0.3697, "step": 27260 }, { "epoch": 9.092728485657105, "grad_norm": 1.1213407516479492, "step": 27260 }, { "epoch": 9.092728485657105, "learning_rate": 1.7145552330174276e-05, "step": 27260 }, { "epoch": 9.092728485657105, "loss": 0.2562912404537201, "step": 27260 }, { "ce_loss": 0.01841781474649906, "epoch": 9.092728485657105, "step": 27260 }, { "distill_loss": 0.16973228752613068, "epoch": 9.092728485657105, "step": 27260 }, { "epoch": 9.092728485657105, "ref_ce_loss": 0.05428987368941307, "step": 27260 }, { "epoch": 9.092728485657105, "loss": 0.3668363094329834, "step": 27260 }, { "ce_loss": 0.04437238350510597, "epoch": 9.092728485657105, "step": 27260 }, { "distill_loss": 0.17178454995155334, "epoch": 9.092728485657105, "step": 27260 }, { "epoch": 9.092728485657105, "ref_ce_loss": 0.06676838546991348, "step": 27260 }, { "epoch": 9.09606404269513, "loss": 0.4288, "step": 27270 }, { "epoch": 9.09606404269513, "grad_norm": 1.3704133033752441, "step": 27270 }, { "epoch": 9.09606404269513, "learning_rate": 1.7020613993184996e-05, "step": 27270 }, { "epoch": 9.09606404269513, "loss": 1.1396571397781372, "step": 27270 }, { "ce_loss": 0.06560547649860382, "epoch": 9.09606404269513, "step": 27270 }, { "distill_loss": 0.22419370710849762, "epoch": 9.09606404269513, "step": 27270 }, { "epoch": 9.09606404269513, "ref_ce_loss": 0.06717196851968765, "step": 27270 }, { "epoch": 9.09606404269513, "loss": 0.32848432660102844, "step": 27270 }, { "ce_loss": 0.0432603545486927, "epoch": 9.09606404269513, "step": 27270 }, { "distill_loss": 0.16509680449962616, "epoch": 9.09606404269513, "step": 27270 }, { "epoch": 9.09606404269513, "ref_ce_loss": 0.04808349534869194, "step": 27270 }, { "epoch": 9.099399599733156, "loss": 0.3899, "step": 27280 }, { "epoch": 9.099399599733156, "grad_norm": 1.3461713790893555, "step": 27280 }, { "epoch": 9.099399599733156, "learning_rate": 1.68961226343495e-05, "step": 27280 }, { "epoch": 9.099399599733156, "loss": 0.31953686475753784, "step": 27280 }, { "ce_loss": 0.013826312497258186, "epoch": 9.099399599733156, "step": 27280 }, { "distill_loss": 0.17898410558700562, "epoch": 9.099399599733156, "step": 27280 }, { "epoch": 9.099399599733156, "ref_ce_loss": 0.054237280040979385, "step": 27280 }, { "epoch": 9.099399599733156, "loss": 0.29063862562179565, "step": 27280 }, { "ce_loss": 0.025695666670799255, "epoch": 9.099399599733156, "step": 27280 }, { "distill_loss": 0.16870594024658203, "epoch": 9.099399599733156, "step": 27280 }, { "epoch": 9.099399599733156, "ref_ce_loss": 0.06049703061580658, "step": 27280 }, { "epoch": 9.10273515677118, "loss": 0.3537, "step": 27290 }, { "epoch": 9.10273515677118, "grad_norm": 1.5573277473449707, "step": 27290 }, { "epoch": 9.10273515677118, "learning_rate": 1.677207839896253e-05, "step": 27290 }, { "epoch": 9.10273515677118, "loss": 0.29031074047088623, "step": 27290 }, { "ce_loss": 0.02433249168097973, "epoch": 9.10273515677118, "step": 27290 }, { "distill_loss": 0.16359396278858185, "epoch": 9.10273515677118, "step": 27290 }, { "epoch": 9.10273515677118, "ref_ce_loss": 0.03717683628201485, "step": 27290 }, { "epoch": 9.10273515677118, "loss": 0.3187103867530823, "step": 27290 }, { "ce_loss": 0.0356263630092144, "epoch": 9.10273515677118, "step": 27290 }, { "distill_loss": 0.1854030191898346, "epoch": 9.10273515677118, "step": 27290 }, { "epoch": 9.10273515677118, "ref_ce_loss": 0.06397075951099396, "step": 27290 }, { "epoch": 9.106070713809206, "loss": 0.3855, "step": 27300 }, { "epoch": 9.106070713809206, "grad_norm": 2.3191182613372803, "step": 27300 }, { "epoch": 9.106070713809206, "learning_rate": 1.6648481431797135e-05, "step": 27300 }, { "epoch": 9.106070713809206, "loss": 0.3305771052837372, "step": 27300 }, { "ce_loss": 0.03626731410622597, "epoch": 9.106070713809206, "step": 27300 }, { "distill_loss": 0.16633017361164093, "epoch": 9.106070713809206, "step": 27300 }, { "epoch": 9.106070713809206, "ref_ce_loss": 0.039277367293834686, "step": 27300 }, { "epoch": 9.106070713809206, "loss": 0.25371691584587097, "step": 27300 }, { "ce_loss": 0.039535846561193466, "epoch": 9.106070713809206, "step": 27300 }, { "distill_loss": 0.16822320222854614, "epoch": 9.106070713809206, "step": 27300 }, { "epoch": 9.106070713809206, "ref_ce_loss": 0.04572812095284462, "step": 27300 }, { "epoch": 9.10940627084723, "loss": 0.3495, "step": 27310 }, { "epoch": 9.10940627084723, "grad_norm": 0.8396355509757996, "step": 27310 }, { "epoch": 9.10940627084723, "learning_rate": 1.652533187710419e-05, "step": 27310 }, { "epoch": 9.10940627084723, "loss": 0.3427233397960663, "step": 27310 }, { "ce_loss": 0.028757216408848763, "epoch": 9.10940627084723, "step": 27310 }, { "distill_loss": 0.1467723846435547, "epoch": 9.10940627084723, "step": 27310 }, { "epoch": 9.10940627084723, "ref_ce_loss": 0.07151523232460022, "step": 27310 }, { "epoch": 9.10940627084723, "loss": 0.41144073009490967, "step": 27310 }, { "ce_loss": 0.049031734466552734, "epoch": 9.10940627084723, "step": 27310 }, { "distill_loss": 0.20296907424926758, "epoch": 9.10940627084723, "step": 27310 }, { "epoch": 9.10940627084723, "ref_ce_loss": 0.06365089118480682, "step": 27310 }, { "epoch": 9.112741827885257, "loss": 0.3414, "step": 27320 }, { "epoch": 9.112741827885257, "grad_norm": 1.8548704385757446, "step": 27320 }, { "epoch": 9.112741827885257, "learning_rate": 1.6402629878612586e-05, "step": 27320 }, { "epoch": 9.112741827885257, "loss": 0.33567383885383606, "step": 27320 }, { "ce_loss": 0.03355925902724266, "epoch": 9.112741827885257, "step": 27320 }, { "distill_loss": 0.1899872124195099, "epoch": 9.112741827885257, "step": 27320 }, { "epoch": 9.112741827885257, "ref_ce_loss": 0.06817373633384705, "step": 27320 }, { "epoch": 9.112741827885257, "loss": 0.3683999180793762, "step": 27320 }, { "ce_loss": 0.0811447873711586, "epoch": 9.112741827885257, "step": 27320 }, { "distill_loss": 0.1963319629430771, "epoch": 9.112741827885257, "step": 27320 }, { "epoch": 9.112741827885257, "ref_ce_loss": 0.07135666906833649, "step": 27320 }, { "epoch": 9.116077384923281, "loss": 0.3904, "step": 27330 }, { "epoch": 9.116077384923281, "grad_norm": 0.9724132418632507, "step": 27330 }, { "epoch": 9.116077384923281, "learning_rate": 1.6280375579528663e-05, "step": 27330 }, { "epoch": 9.116077384923281, "loss": 0.29246678948402405, "step": 27330 }, { "ce_loss": 0.035326674580574036, "epoch": 9.116077384923281, "step": 27330 }, { "distill_loss": 0.1687965989112854, "epoch": 9.116077384923281, "step": 27330 }, { "epoch": 9.116077384923281, "ref_ce_loss": 0.05702679604291916, "step": 27330 }, { "epoch": 9.116077384923281, "loss": 0.39776480197906494, "step": 27330 }, { "ce_loss": 0.053400568664073944, "epoch": 9.116077384923281, "step": 27330 }, { "distill_loss": 0.20380781590938568, "epoch": 9.116077384923281, "step": 27330 }, { "epoch": 9.116077384923281, "ref_ce_loss": 0.046494729816913605, "step": 27330 }, { "epoch": 9.119412941961308, "loss": 0.3396, "step": 27340 }, { "epoch": 9.119412941961308, "grad_norm": 1.214045763015747, "step": 27340 }, { "epoch": 9.119412941961308, "learning_rate": 1.6158569122536414e-05, "step": 27340 }, { "epoch": 9.119412941961308, "loss": 0.37317508459091187, "step": 27340 }, { "ce_loss": 0.0343099981546402, "epoch": 9.119412941961308, "step": 27340 }, { "distill_loss": 0.16131190955638885, "epoch": 9.119412941961308, "step": 27340 }, { "epoch": 9.119412941961308, "ref_ce_loss": 0.042967405170202255, "step": 27340 }, { "epoch": 9.119412941961308, "loss": 0.3499917685985565, "step": 27340 }, { "ce_loss": 0.04128754511475563, "epoch": 9.119412941961308, "step": 27340 }, { "distill_loss": 0.1890879124403, "epoch": 9.119412941961308, "step": 27340 }, { "epoch": 9.119412941961308, "ref_ce_loss": 0.0573907345533371, "step": 27340 }, { "epoch": 9.122748498999332, "loss": 0.3856, "step": 27350 }, { "epoch": 9.122748498999332, "grad_norm": 1.5197536945343018, "step": 27350 }, { "epoch": 9.122748498999332, "learning_rate": 1.6037210649797063e-05, "step": 27350 }, { "epoch": 9.122748498999332, "loss": 0.3776387572288513, "step": 27350 }, { "ce_loss": 0.08085337281227112, "epoch": 9.122748498999332, "step": 27350 }, { "distill_loss": 0.17287683486938477, "epoch": 9.122748498999332, "step": 27350 }, { "epoch": 9.122748498999332, "ref_ce_loss": 0.06616152077913284, "step": 27350 }, { "epoch": 9.122748498999332, "loss": 0.373631089925766, "step": 27350 }, { "ce_loss": 0.030946599319577217, "epoch": 9.122748498999332, "step": 27350 }, { "distill_loss": 0.19271759688854218, "epoch": 9.122748498999332, "step": 27350 }, { "epoch": 9.122748498999332, "ref_ce_loss": 0.07093948125839233, "step": 27350 }, { "epoch": 9.126084056037358, "loss": 0.3804, "step": 27360 }, { "epoch": 9.126084056037358, "grad_norm": 1.1146163940429688, "step": 27360 }, { "epoch": 9.126084056037358, "learning_rate": 1.5916300302948905e-05, "step": 27360 }, { "epoch": 9.126084056037358, "loss": 0.2942475378513336, "step": 27360 }, { "ce_loss": 0.029155785217881203, "epoch": 9.126084056037358, "step": 27360 }, { "distill_loss": 0.17508697509765625, "epoch": 9.126084056037358, "step": 27360 }, { "epoch": 9.126084056037358, "ref_ce_loss": 0.0685693621635437, "step": 27360 }, { "epoch": 9.126084056037358, "loss": 0.3584308624267578, "step": 27360 }, { "ce_loss": 0.02746543474495411, "epoch": 9.126084056037358, "step": 27360 }, { "distill_loss": 0.19145989418029785, "epoch": 9.126084056037358, "step": 27360 }, { "epoch": 9.126084056037358, "ref_ce_loss": 0.06303809583187103, "step": 27360 }, { "epoch": 9.129419613075383, "loss": 0.3486, "step": 27370 }, { "epoch": 9.129419613075383, "grad_norm": 0.8257373571395874, "step": 27370 }, { "epoch": 9.129419613075383, "learning_rate": 1.579583822310746e-05, "step": 27370 }, { "epoch": 9.129419613075383, "loss": 0.27437683939933777, "step": 27370 }, { "ce_loss": 0.018562039360404015, "epoch": 9.129419613075383, "step": 27370 }, { "distill_loss": 0.14559762179851532, "epoch": 9.129419613075383, "step": 27370 }, { "epoch": 9.129419613075383, "ref_ce_loss": 0.04495834559202194, "step": 27370 }, { "epoch": 9.129419613075383, "loss": 0.36754468083381653, "step": 27370 }, { "ce_loss": 0.05180488899350166, "epoch": 9.129419613075383, "step": 27370 }, { "distill_loss": 0.21629267930984497, "epoch": 9.129419613075383, "step": 27370 }, { "epoch": 9.129419613075383, "ref_ce_loss": 0.07912370562553406, "step": 27370 }, { "epoch": 9.132755170113409, "loss": 0.3541, "step": 27380 }, { "epoch": 9.132755170113409, "grad_norm": 1.5731775760650635, "step": 27380 }, { "epoch": 9.132755170113409, "learning_rate": 1.567582455086494e-05, "step": 27380 }, { "epoch": 9.132755170113409, "loss": 0.3551574945449829, "step": 27380 }, { "ce_loss": 0.03758762776851654, "epoch": 9.132755170113409, "step": 27380 }, { "distill_loss": 0.15888771414756775, "epoch": 9.132755170113409, "step": 27380 }, { "epoch": 9.132755170113409, "ref_ce_loss": 0.06447268277406693, "step": 27380 }, { "epoch": 9.132755170113409, "loss": 0.3179193139076233, "step": 27380 }, { "ce_loss": 0.048470109701156616, "epoch": 9.132755170113409, "step": 27380 }, { "distill_loss": 0.15303857624530792, "epoch": 9.132755170113409, "step": 27380 }, { "epoch": 9.132755170113409, "ref_ce_loss": 0.05130033940076828, "step": 27380 }, { "epoch": 9.136090727151434, "loss": 0.3701, "step": 27390 }, { "epoch": 9.136090727151434, "grad_norm": 1.0357892513275146, "step": 27390 }, { "epoch": 9.136090727151434, "learning_rate": 1.5556259426290086e-05, "step": 27390 }, { "epoch": 9.136090727151434, "loss": 0.29864880442619324, "step": 27390 }, { "ce_loss": 0.05004766955971718, "epoch": 9.136090727151434, "step": 27390 }, { "distill_loss": 0.19834557175636292, "epoch": 9.136090727151434, "step": 27390 }, { "epoch": 9.136090727151434, "ref_ce_loss": 0.04956059902906418, "step": 27390 }, { "epoch": 9.136090727151434, "loss": 0.24494145810604095, "step": 27390 }, { "ce_loss": 0.03270229697227478, "epoch": 9.136090727151434, "step": 27390 }, { "distill_loss": 0.1631958782672882, "epoch": 9.136090727151434, "step": 27390 }, { "epoch": 9.136090727151434, "ref_ce_loss": 0.04877462610602379, "step": 27390 }, { "epoch": 9.13942628418946, "loss": 0.3564, "step": 27400 }, { "epoch": 9.13942628418946, "grad_norm": 0.8601231575012207, "step": 27400 }, { "epoch": 9.13942628418946, "learning_rate": 1.543714298892831e-05, "step": 27400 }, { "epoch": 9.13942628418946, "loss": 0.2472851574420929, "step": 27400 }, { "ce_loss": 0.02401360310614109, "epoch": 9.13942628418946, "step": 27400 }, { "distill_loss": 0.14570125937461853, "epoch": 9.13942628418946, "step": 27400 }, { "epoch": 9.13942628418946, "ref_ce_loss": 0.05537709966301918, "step": 27400 }, { "epoch": 9.13942628418946, "loss": 0.3812285363674164, "step": 27400 }, { "ce_loss": 0.06255284696817398, "epoch": 9.13942628418946, "step": 27400 }, { "distill_loss": 0.2078147828578949, "epoch": 9.13942628418946, "step": 27400 }, { "epoch": 9.13942628418946, "ref_ce_loss": 0.0539514385163784, "step": 27400 }, { "epoch": 9.142761841227484, "loss": 0.3798, "step": 27410 }, { "epoch": 9.142761841227484, "grad_norm": 1.2774498462677002, "step": 27410 }, { "epoch": 9.142761841227484, "learning_rate": 1.5318475377801422e-05, "step": 27410 }, { "epoch": 9.142761841227484, "loss": 0.3671090304851532, "step": 27410 }, { "ce_loss": 0.06406539678573608, "epoch": 9.142761841227484, "step": 27410 }, { "distill_loss": 0.1353074312210083, "epoch": 9.142761841227484, "step": 27410 }, { "epoch": 9.142761841227484, "ref_ce_loss": 0.07601393014192581, "step": 27410 }, { "epoch": 9.142761841227484, "loss": 0.2966614365577698, "step": 27410 }, { "ce_loss": 0.03425230830907822, "epoch": 9.142761841227484, "step": 27410 }, { "distill_loss": 0.16910341382026672, "epoch": 9.142761841227484, "step": 27410 }, { "epoch": 9.142761841227484, "ref_ce_loss": 0.06891260296106339, "step": 27410 }, { "epoch": 9.14609739826551, "loss": 0.3266, "step": 27420 }, { "epoch": 9.14609739826551, "grad_norm": 1.0859674215316772, "step": 27420 }, { "epoch": 9.14609739826551, "learning_rate": 1.5200256731407214e-05, "step": 27420 }, { "epoch": 9.14609739826551, "loss": 0.42090123891830444, "step": 27420 }, { "ce_loss": 0.02037889137864113, "epoch": 9.14609739826551, "step": 27420 }, { "distill_loss": 0.15968488156795502, "epoch": 9.14609739826551, "step": 27420 }, { "epoch": 9.14609739826551, "ref_ce_loss": 0.0486813485622406, "step": 27420 }, { "epoch": 9.14609739826551, "loss": 0.3073991537094116, "step": 27420 }, { "ce_loss": 0.028052043169736862, "epoch": 9.14609739826551, "step": 27420 }, { "distill_loss": 0.18357442319393158, "epoch": 9.14609739826551, "step": 27420 }, { "epoch": 9.14609739826551, "ref_ce_loss": 0.06518913060426712, "step": 27420 }, { "epoch": 9.149432955303535, "loss": 0.3282, "step": 27430 }, { "epoch": 9.149432955303535, "grad_norm": 1.4294742345809937, "step": 27430 }, { "epoch": 9.149432955303535, "learning_rate": 1.5082487187719495e-05, "step": 27430 }, { "epoch": 9.149432955303535, "loss": 0.2739667594432831, "step": 27430 }, { "ce_loss": 0.04306583106517792, "epoch": 9.149432955303535, "step": 27430 }, { "distill_loss": 0.16011951863765717, "epoch": 9.149432955303535, "step": 27430 }, { "epoch": 9.149432955303535, "ref_ce_loss": 0.070659339427948, "step": 27430 }, { "epoch": 9.149432955303535, "loss": 1.037210464477539, "step": 27430 }, { "ce_loss": 0.06925816833972931, "epoch": 9.149432955303535, "step": 27430 }, { "distill_loss": 0.1676364243030548, "epoch": 9.149432955303535, "step": 27430 }, { "epoch": 9.149432955303535, "ref_ce_loss": 0.06458926945924759, "step": 27430 }, { "epoch": 9.152768512341561, "loss": 0.3814, "step": 27440 }, { "epoch": 9.152768512341561, "grad_norm": 0.8522936105728149, "step": 27440 }, { "epoch": 9.152768512341561, "learning_rate": 1.4965166884188097e-05, "step": 27440 }, { "epoch": 9.152768512341561, "loss": 0.29712921380996704, "step": 27440 }, { "ce_loss": 0.05974900349974632, "epoch": 9.152768512341561, "step": 27440 }, { "distill_loss": 0.17028288543224335, "epoch": 9.152768512341561, "step": 27440 }, { "epoch": 9.152768512341561, "ref_ce_loss": 0.06694556027650833, "step": 27440 }, { "epoch": 9.152768512341561, "loss": 0.33882614970207214, "step": 27440 }, { "ce_loss": 0.0511908158659935, "epoch": 9.152768512341561, "step": 27440 }, { "distill_loss": 0.194253608584404, "epoch": 9.152768512341561, "step": 27440 }, { "epoch": 9.152768512341561, "ref_ce_loss": 0.06548138707876205, "step": 27440 }, { "epoch": 9.156104069379586, "loss": 0.3127, "step": 27450 }, { "epoch": 9.156104069379586, "grad_norm": 0.9218608140945435, "step": 27450 }, { "epoch": 9.156104069379586, "learning_rate": 1.4848295957738467e-05, "step": 27450 }, { "epoch": 9.156104069379586, "loss": 0.35625752806663513, "step": 27450 }, { "ce_loss": 0.06627841293811798, "epoch": 9.156104069379586, "step": 27450 }, { "distill_loss": 0.18910615146160126, "epoch": 9.156104069379586, "step": 27450 }, { "epoch": 9.156104069379586, "ref_ce_loss": 0.07055387645959854, "step": 27450 }, { "epoch": 9.156104069379586, "loss": 0.5226238369941711, "step": 27450 }, { "ce_loss": 0.05875205248594284, "epoch": 9.156104069379586, "step": 27450 }, { "distill_loss": 0.19257190823554993, "epoch": 9.156104069379586, "step": 27450 }, { "epoch": 9.156104069379586, "ref_ce_loss": 0.07424885034561157, "step": 27450 }, { "epoch": 9.159439626417612, "loss": 0.3483, "step": 27460 }, { "epoch": 9.159439626417612, "grad_norm": 1.1902096271514893, "step": 27460 }, { "epoch": 9.159439626417612, "learning_rate": 1.4731874544771452e-05, "step": 27460 }, { "epoch": 9.159439626417612, "loss": 0.31832242012023926, "step": 27460 }, { "ce_loss": 0.03065738081932068, "epoch": 9.159439626417612, "step": 27460 }, { "distill_loss": 0.1613398790359497, "epoch": 9.159439626417612, "step": 27460 }, { "epoch": 9.159439626417612, "ref_ce_loss": 0.05969864875078201, "step": 27460 }, { "epoch": 9.159439626417612, "loss": 0.3331111967563629, "step": 27460 }, { "ce_loss": 0.0514845997095108, "epoch": 9.159439626417612, "step": 27460 }, { "distill_loss": 0.19371838867664337, "epoch": 9.159439626417612, "step": 27460 }, { "epoch": 9.159439626417612, "ref_ce_loss": 0.06049351766705513, "step": 27460 }, { "epoch": 9.162775183455636, "loss": 0.3752, "step": 27470 }, { "epoch": 9.162775183455636, "grad_norm": 4.025130748748779, "step": 27470 }, { "epoch": 9.162775183455636, "learning_rate": 1.4615902781163382e-05, "step": 27470 }, { "epoch": 9.162775183455636, "loss": 0.3633132874965668, "step": 27470 }, { "ce_loss": 0.03981828689575195, "epoch": 9.162775183455636, "step": 27470 }, { "distill_loss": 0.14197511970996857, "epoch": 9.162775183455636, "step": 27470 }, { "epoch": 9.162775183455636, "ref_ce_loss": 0.0731300413608551, "step": 27470 }, { "epoch": 9.162775183455636, "loss": 0.31256744265556335, "step": 27470 }, { "ce_loss": 0.0272130835801363, "epoch": 9.162775183455636, "step": 27470 }, { "distill_loss": 0.18780873715877533, "epoch": 9.162775183455636, "step": 27470 }, { "epoch": 9.162775183455636, "ref_ce_loss": 0.06526809185743332, "step": 27470 }, { "epoch": 9.166110740493663, "loss": 0.3465, "step": 27480 }, { "epoch": 9.166110740493663, "grad_norm": 0.8970112204551697, "step": 27480 }, { "epoch": 9.166110740493663, "learning_rate": 1.4500380802265856e-05, "step": 27480 }, { "epoch": 9.166110740493663, "loss": 0.3625814616680145, "step": 27480 }, { "ce_loss": 0.018771648406982422, "epoch": 9.166110740493663, "step": 27480 }, { "distill_loss": 0.16558365523815155, "epoch": 9.166110740493663, "step": 27480 }, { "epoch": 9.166110740493663, "ref_ce_loss": 0.0564962700009346, "step": 27480 }, { "epoch": 9.166110740493663, "loss": 0.31672704219818115, "step": 27480 }, { "ce_loss": 0.030506962910294533, "epoch": 9.166110740493663, "step": 27480 }, { "distill_loss": 0.1810687929391861, "epoch": 9.166110740493663, "step": 27480 }, { "epoch": 9.166110740493663, "ref_ce_loss": 0.07672914117574692, "step": 27480 }, { "epoch": 9.169446297531687, "loss": 0.3435, "step": 27490 }, { "epoch": 9.169446297531687, "grad_norm": 0.8674625158309937, "step": 27490 }, { "epoch": 9.169446297531687, "learning_rate": 1.4385308742905423e-05, "step": 27490 }, { "epoch": 9.169446297531687, "loss": 0.3576408922672272, "step": 27490 }, { "ce_loss": 0.03308350220322609, "epoch": 9.169446297531687, "step": 27490 }, { "distill_loss": 0.18558846414089203, "epoch": 9.169446297531687, "step": 27490 }, { "epoch": 9.169446297531687, "ref_ce_loss": 0.07912314683198929, "step": 27490 }, { "epoch": 9.169446297531687, "loss": 0.41283512115478516, "step": 27490 }, { "ce_loss": 0.0438760407269001, "epoch": 9.169446297531687, "step": 27490 }, { "distill_loss": 0.19519221782684326, "epoch": 9.169446297531687, "step": 27490 }, { "epoch": 9.169446297531687, "ref_ce_loss": 0.07502762228250504, "step": 27490 }, { "epoch": 9.172781854569713, "loss": 0.3492, "step": 27500 }, { "epoch": 9.172781854569713, "grad_norm": 1.755750060081482, "step": 27500 }, { "epoch": 9.172781854569713, "learning_rate": 1.42706867373835e-05, "step": 27500 }, { "epoch": 9.172781854569713, "loss": 0.3151405155658722, "step": 27500 }, { "ce_loss": 0.05989915877580643, "epoch": 9.172781854569713, "step": 27500 }, { "distill_loss": 0.16233427822589874, "epoch": 9.172781854569713, "step": 27500 }, { "epoch": 9.172781854569713, "ref_ce_loss": 0.0649150013923645, "step": 27500 }, { "epoch": 9.172781854569713, "loss": 0.5010256767272949, "step": 27500 }, { "ce_loss": 0.060962971299886703, "epoch": 9.172781854569713, "step": 27500 }, { "distill_loss": 0.2040940821170807, "epoch": 9.172781854569713, "step": 27500 }, { "epoch": 9.172781854569713, "ref_ce_loss": 0.04863875359296799, "step": 27500 }, { "epoch": 9.176117411607738, "loss": 0.3409, "step": 27510 }, { "epoch": 9.176117411607738, "grad_norm": 1.3543940782546997, "step": 27510 }, { "epoch": 9.176117411607738, "learning_rate": 1.4156514919476271e-05, "step": 27510 }, { "epoch": 9.176117411607738, "loss": 0.3677135407924652, "step": 27510 }, { "ce_loss": 0.04805472865700722, "epoch": 9.176117411607738, "step": 27510 }, { "distill_loss": 0.20692524313926697, "epoch": 9.176117411607738, "step": 27510 }, { "epoch": 9.176117411607738, "ref_ce_loss": 0.055242788046598434, "step": 27510 }, { "epoch": 9.176117411607738, "loss": 0.44220152497291565, "step": 27510 }, { "ce_loss": 0.04361943155527115, "epoch": 9.176117411607738, "step": 27510 }, { "distill_loss": 0.193200945854187, "epoch": 9.176117411607738, "step": 27510 }, { "epoch": 9.176117411607738, "ref_ce_loss": 0.055958181619644165, "step": 27510 }, { "epoch": 9.179452968645764, "loss": 0.3342, "step": 27520 }, { "epoch": 9.179452968645764, "grad_norm": 1.265563726425171, "step": 27520 }, { "epoch": 9.179452968645764, "learning_rate": 1.4042793422434707e-05, "step": 27520 }, { "epoch": 9.179452968645764, "loss": 0.44008785486221313, "step": 27520 }, { "ce_loss": 0.02725176140666008, "epoch": 9.179452968645764, "step": 27520 }, { "distill_loss": 0.1367584764957428, "epoch": 9.179452968645764, "step": 27520 }, { "epoch": 9.179452968645764, "ref_ce_loss": 0.0530785396695137, "step": 27520 }, { "epoch": 9.179452968645764, "loss": 0.36948102712631226, "step": 27520 }, { "ce_loss": 0.048951759934425354, "epoch": 9.179452968645764, "step": 27520 }, { "distill_loss": 0.20727337896823883, "epoch": 9.179452968645764, "step": 27520 }, { "epoch": 9.179452968645764, "ref_ce_loss": 0.05927341803908348, "step": 27520 }, { "epoch": 9.182788525683788, "loss": 0.3524, "step": 27530 }, { "epoch": 9.182788525683788, "grad_norm": 1.4308565855026245, "step": 27530 }, { "epoch": 9.182788525683788, "learning_rate": 1.3929522378983928e-05, "step": 27530 }, { "epoch": 9.182788525683788, "loss": 0.41626620292663574, "step": 27530 }, { "ce_loss": 0.03843936324119568, "epoch": 9.182788525683788, "step": 27530 }, { "distill_loss": 0.18540190160274506, "epoch": 9.182788525683788, "step": 27530 }, { "epoch": 9.182788525683788, "ref_ce_loss": 0.061426643282175064, "step": 27530 }, { "epoch": 9.182788525683788, "loss": 0.2769725024700165, "step": 27530 }, { "ce_loss": 0.03272394835948944, "epoch": 9.182788525683788, "step": 27530 }, { "distill_loss": 0.1454697698354721, "epoch": 9.182788525683788, "step": 27530 }, { "epoch": 9.182788525683788, "ref_ce_loss": 0.06404945999383926, "step": 27530 }, { "epoch": 9.186124082721815, "loss": 0.368, "step": 27540 }, { "epoch": 9.186124082721815, "grad_norm": 1.2058454751968384, "step": 27540 }, { "epoch": 9.186124082721815, "learning_rate": 1.3816701921323428e-05, "step": 27540 }, { "epoch": 9.186124082721815, "loss": 0.43945053219795227, "step": 27540 }, { "ce_loss": 0.050008222460746765, "epoch": 9.186124082721815, "step": 27540 }, { "distill_loss": 0.1664680391550064, "epoch": 9.186124082721815, "step": 27540 }, { "epoch": 9.186124082721815, "ref_ce_loss": 0.0641934797167778, "step": 27540 }, { "epoch": 9.186124082721815, "loss": 0.3098108172416687, "step": 27540 }, { "ce_loss": 0.05395625904202461, "epoch": 9.186124082721815, "step": 27540 }, { "distill_loss": 0.14219501614570618, "epoch": 9.186124082721815, "step": 27540 }, { "epoch": 9.186124082721815, "ref_ce_loss": 0.06553469598293304, "step": 27540 }, { "epoch": 9.18945963975984, "loss": 0.3232, "step": 27550 }, { "epoch": 9.18945963975984, "grad_norm": 1.7351081371307373, "step": 27550 }, { "epoch": 9.18945963975984, "learning_rate": 1.3704332181126811e-05, "step": 27550 }, { "epoch": 9.18945963975984, "loss": 0.3305128216743469, "step": 27550 }, { "ce_loss": 0.052353035658597946, "epoch": 9.18945963975984, "step": 27550 }, { "distill_loss": 0.19812655448913574, "epoch": 9.18945963975984, "step": 27550 }, { "epoch": 9.18945963975984, "ref_ce_loss": 0.05271931365132332, "step": 27550 }, { "epoch": 9.18945963975984, "loss": 0.36235523223876953, "step": 27550 }, { "ce_loss": 0.07153531163930893, "epoch": 9.18945963975984, "step": 27550 }, { "distill_loss": 0.21841926872730255, "epoch": 9.18945963975984, "step": 27550 }, { "epoch": 9.18945963975984, "ref_ce_loss": 0.06689158082008362, "step": 27550 }, { "epoch": 9.192795196797865, "loss": 0.3532, "step": 27560 }, { "epoch": 9.192795196797865, "grad_norm": 1.0499604940414429, "step": 27560 }, { "epoch": 9.192795196797865, "learning_rate": 1.3592413289541661e-05, "step": 27560 }, { "epoch": 9.192795196797865, "loss": 0.4369659125804901, "step": 27560 }, { "ce_loss": 0.06519733369350433, "epoch": 9.192795196797865, "step": 27560 }, { "distill_loss": 0.23988565802574158, "epoch": 9.192795196797865, "step": 27560 }, { "epoch": 9.192795196797865, "ref_ce_loss": 0.09593500941991806, "step": 27560 }, { "epoch": 9.192795196797865, "loss": 0.2988507151603699, "step": 27560 }, { "ce_loss": 0.03475344553589821, "epoch": 9.192795196797865, "step": 27560 }, { "distill_loss": 0.17159003019332886, "epoch": 9.192795196797865, "step": 27560 }, { "epoch": 9.192795196797865, "ref_ce_loss": 0.06151943281292915, "step": 27560 }, { "epoch": 9.19613075383589, "loss": 0.3207, "step": 27570 }, { "epoch": 9.19613075383589, "grad_norm": 1.0811963081359863, "step": 27570 }, { "epoch": 9.19613075383589, "learning_rate": 1.3480945377189446e-05, "step": 27570 }, { "epoch": 9.19613075383589, "loss": 0.4045826196670532, "step": 27570 }, { "ce_loss": 0.031967297196388245, "epoch": 9.19613075383589, "step": 27570 }, { "distill_loss": 0.1706383228302002, "epoch": 9.19613075383589, "step": 27570 }, { "epoch": 9.19613075383589, "ref_ce_loss": 0.05827203392982483, "step": 27570 }, { "epoch": 9.19613075383589, "loss": 0.2599678635597229, "step": 27570 }, { "ce_loss": 0.038392551243305206, "epoch": 9.19613075383589, "step": 27570 }, { "distill_loss": 0.12934038043022156, "epoch": 9.19613075383589, "step": 27570 }, { "epoch": 9.19613075383589, "ref_ce_loss": 0.06656457483768463, "step": 27570 }, { "epoch": 9.199466310873916, "loss": 0.3678, "step": 27580 }, { "epoch": 9.199466310873916, "grad_norm": 1.811726450920105, "step": 27580 }, { "epoch": 9.199466310873916, "learning_rate": 1.3369928574165124e-05, "step": 27580 }, { "epoch": 9.199466310873916, "loss": 0.5186915993690491, "step": 27580 }, { "ce_loss": 0.0656609907746315, "epoch": 9.199466310873916, "step": 27580 }, { "distill_loss": 0.22513791918754578, "epoch": 9.199466310873916, "step": 27580 }, { "epoch": 9.199466310873916, "ref_ce_loss": 0.07715293765068054, "step": 27580 }, { "epoch": 9.199466310873916, "loss": 0.2685270309448242, "step": 27580 }, { "ce_loss": 0.030084757134318352, "epoch": 9.199466310873916, "step": 27580 }, { "distill_loss": 0.16327223181724548, "epoch": 9.199466310873916, "step": 27580 }, { "epoch": 9.199466310873916, "ref_ce_loss": 0.0434606596827507, "step": 27580 }, { "epoch": 9.20280186791194, "loss": 0.3612, "step": 27590 }, { "epoch": 9.20280186791194, "grad_norm": 1.1356041431427002, "step": 27590 }, { "epoch": 9.20280186791194, "learning_rate": 1.325936301003723e-05, "step": 27590 }, { "epoch": 9.20280186791194, "loss": 0.33782315254211426, "step": 27590 }, { "ce_loss": 0.03354613110423088, "epoch": 9.20280186791194, "step": 27590 }, { "distill_loss": 0.16796359419822693, "epoch": 9.20280186791194, "step": 27590 }, { "epoch": 9.20280186791194, "ref_ce_loss": 0.05903716757893562, "step": 27590 }, { "epoch": 9.20280186791194, "loss": 0.259998619556427, "step": 27590 }, { "ce_loss": 0.033992376178503036, "epoch": 9.20280186791194, "step": 27590 }, { "distill_loss": 0.18147948384284973, "epoch": 9.20280186791194, "step": 27590 }, { "epoch": 9.20280186791194, "ref_ce_loss": 0.044291913509368896, "step": 27590 }, { "epoch": 9.206137424949967, "loss": 0.3284, "step": 27600 }, { "epoch": 9.206137424949967, "grad_norm": 1.02849280834198, "step": 27600 }, { "epoch": 9.206137424949967, "learning_rate": 1.3149248813847737e-05, "step": 27600 }, { "epoch": 9.206137424949967, "loss": 0.29262685775756836, "step": 27600 }, { "ce_loss": 0.045261502265930176, "epoch": 9.206137424949967, "step": 27600 }, { "distill_loss": 0.154446542263031, "epoch": 9.206137424949967, "step": 27600 }, { "epoch": 9.206137424949967, "ref_ce_loss": 0.060710638761520386, "step": 27600 }, { "epoch": 9.206137424949967, "loss": 0.42080560326576233, "step": 27600 }, { "ce_loss": 0.0515153594315052, "epoch": 9.206137424949967, "step": 27600 }, { "distill_loss": 0.22277718782424927, "epoch": 9.206137424949967, "step": 27600 }, { "epoch": 9.206137424949967, "ref_ce_loss": 0.05690597742795944, "step": 27600 }, { "epoch": 9.209472981987991, "loss": 0.3054, "step": 27610 }, { "epoch": 9.209472981987991, "grad_norm": 1.1911672353744507, "step": 27610 }, { "epoch": 9.209472981987991, "learning_rate": 1.30395861141118e-05, "step": 27610 }, { "epoch": 9.209472981987991, "loss": 0.3175136148929596, "step": 27610 }, { "ce_loss": 0.036621760576963425, "epoch": 9.209472981987991, "step": 27610 }, { "distill_loss": 0.1766699254512787, "epoch": 9.209472981987991, "step": 27610 }, { "epoch": 9.209472981987991, "ref_ce_loss": 0.06541144847869873, "step": 27610 }, { "epoch": 9.209472981987991, "loss": 0.28621190786361694, "step": 27610 }, { "ce_loss": 0.04701051861047745, "epoch": 9.209472981987991, "step": 27610 }, { "distill_loss": 0.16483399271965027, "epoch": 9.209472981987991, "step": 27610 }, { "epoch": 9.209472981987991, "ref_ce_loss": 0.05313900113105774, "step": 27610 }, { "epoch": 9.212808539026017, "loss": 0.3519, "step": 27620 }, { "epoch": 9.212808539026017, "grad_norm": 1.150766372680664, "step": 27620 }, { "epoch": 9.212808539026017, "learning_rate": 1.29303750388174e-05, "step": 27620 }, { "epoch": 9.212808539026017, "loss": 0.39144226908683777, "step": 27620 }, { "ce_loss": 0.03537900000810623, "epoch": 9.212808539026017, "step": 27620 }, { "distill_loss": 0.15409021079540253, "epoch": 9.212808539026017, "step": 27620 }, { "epoch": 9.212808539026017, "ref_ce_loss": 0.058260228484869, "step": 27620 }, { "epoch": 9.212808539026017, "loss": 0.5103629231452942, "step": 27620 }, { "ce_loss": 0.03485684096813202, "epoch": 9.212808539026017, "step": 27620 }, { "distill_loss": 0.2051319181919098, "epoch": 9.212808539026017, "step": 27620 }, { "epoch": 9.212808539026017, "ref_ce_loss": 0.05551000311970711, "step": 27620 }, { "epoch": 9.216144096064042, "loss": 0.3647, "step": 27630 }, { "epoch": 9.216144096064042, "grad_norm": 1.2545127868652344, "step": 27630 }, { "epoch": 9.216144096064042, "learning_rate": 1.2821615715425817e-05, "step": 27630 }, { "epoch": 9.216144096064042, "loss": 0.21375618875026703, "step": 27630 }, { "ce_loss": 0.02147822454571724, "epoch": 9.216144096064042, "step": 27630 }, { "distill_loss": 0.13803425431251526, "epoch": 9.216144096064042, "step": 27630 }, { "epoch": 9.216144096064042, "ref_ce_loss": 0.053897660225629807, "step": 27630 }, { "epoch": 9.216144096064042, "loss": 0.3832044005393982, "step": 27630 }, { "ce_loss": 0.07676784694194794, "epoch": 9.216144096064042, "step": 27630 }, { "distill_loss": 0.18233522772789001, "epoch": 9.216144096064042, "step": 27630 }, { "epoch": 9.216144096064042, "ref_ce_loss": 0.06758108735084534, "step": 27630 }, { "epoch": 9.219479653102068, "loss": 0.3424, "step": 27640 }, { "epoch": 9.219479653102068, "grad_norm": 0.9458558559417725, "step": 27640 }, { "epoch": 9.219479653102068, "learning_rate": 1.271330827087085e-05, "step": 27640 }, { "epoch": 9.219479653102068, "loss": 0.2649664878845215, "step": 27640 }, { "ce_loss": 0.041778381913900375, "epoch": 9.219479653102068, "step": 27640 }, { "distill_loss": 0.17980529367923737, "epoch": 9.219479653102068, "step": 27640 }, { "epoch": 9.219479653102068, "ref_ce_loss": 0.04321242496371269, "step": 27640 }, { "epoch": 9.219479653102068, "loss": 0.40541356801986694, "step": 27640 }, { "ce_loss": 0.03248671814799309, "epoch": 9.219479653102068, "step": 27640 }, { "distill_loss": 0.16780821979045868, "epoch": 9.219479653102068, "step": 27640 }, { "epoch": 9.219479653102068, "ref_ce_loss": 0.038705579936504364, "step": 27640 }, { "epoch": 9.222815210140093, "loss": 0.3235, "step": 27650 }, { "epoch": 9.222815210140093, "grad_norm": 2.606320858001709, "step": 27650 }, { "epoch": 9.222815210140093, "learning_rate": 1.2605452831558896e-05, "step": 27650 }, { "epoch": 9.222815210140093, "loss": 0.4550734758377075, "step": 27650 }, { "ce_loss": 0.09363550692796707, "epoch": 9.222815210140093, "step": 27650 }, { "distill_loss": 0.20949184894561768, "epoch": 9.222815210140093, "step": 27650 }, { "epoch": 9.222815210140093, "ref_ce_loss": 0.07188888639211655, "step": 27650 }, { "epoch": 9.222815210140093, "loss": 0.3436967432498932, "step": 27650 }, { "ce_loss": 0.04648522660136223, "epoch": 9.222815210140093, "step": 27650 }, { "distill_loss": 0.15692740678787231, "epoch": 9.222815210140093, "step": 27650 }, { "epoch": 9.222815210140093, "ref_ce_loss": 0.06392422318458557, "step": 27650 }, { "epoch": 9.226150767178119, "loss": 0.3399, "step": 27660 }, { "epoch": 9.226150767178119, "grad_norm": 1.5653630495071411, "step": 27660 }, { "epoch": 9.226150767178119, "learning_rate": 1.2498049523368816e-05, "step": 27660 }, { "epoch": 9.226150767178119, "loss": 0.3628976345062256, "step": 27660 }, { "ce_loss": 0.06816162168979645, "epoch": 9.226150767178119, "step": 27660 }, { "distill_loss": 0.1749858856201172, "epoch": 9.226150767178119, "step": 27660 }, { "epoch": 9.226150767178119, "ref_ce_loss": 0.06702398508787155, "step": 27660 }, { "epoch": 9.226150767178119, "loss": 0.2408471554517746, "step": 27660 }, { "ce_loss": 0.03688124194741249, "epoch": 9.226150767178119, "step": 27660 }, { "distill_loss": 0.1462167203426361, "epoch": 9.226150767178119, "step": 27660 }, { "epoch": 9.226150767178119, "ref_ce_loss": 0.0576145276427269, "step": 27660 }, { "epoch": 9.229486324216143, "loss": 0.3421, "step": 27670 }, { "epoch": 9.229486324216143, "grad_norm": 1.1789419651031494, "step": 27670 }, { "epoch": 9.229486324216143, "learning_rate": 1.2391098471651896e-05, "step": 27670 }, { "epoch": 9.229486324216143, "loss": 0.33586499094963074, "step": 27670 }, { "ce_loss": 0.05634840950369835, "epoch": 9.229486324216143, "step": 27670 }, { "distill_loss": 0.1920376718044281, "epoch": 9.229486324216143, "step": 27670 }, { "epoch": 9.229486324216143, "ref_ce_loss": 0.06841642409563065, "step": 27670 }, { "epoch": 9.229486324216143, "loss": 0.4046776592731476, "step": 27670 }, { "ce_loss": 0.0436110757291317, "epoch": 9.229486324216143, "step": 27670 }, { "distill_loss": 0.17451398074626923, "epoch": 9.229486324216143, "step": 27670 }, { "epoch": 9.229486324216143, "ref_ce_loss": 0.06450141966342926, "step": 27670 }, { "epoch": 9.23282188125417, "loss": 0.327, "step": 27680 }, { "epoch": 9.23282188125417, "grad_norm": 1.0851166248321533, "step": 27680 }, { "epoch": 9.23282188125417, "learning_rate": 1.2284599801231489e-05, "step": 27680 }, { "epoch": 9.23282188125417, "loss": 0.2801929712295532, "step": 27680 }, { "ce_loss": 0.03620101511478424, "epoch": 9.23282188125417, "step": 27680 }, { "distill_loss": 0.18607118725776672, "epoch": 9.23282188125417, "step": 27680 }, { "epoch": 9.23282188125417, "ref_ce_loss": 0.04491456225514412, "step": 27680 }, { "epoch": 9.23282188125417, "loss": 0.38286706805229187, "step": 27680 }, { "ce_loss": 0.0755334123969078, "epoch": 9.23282188125417, "step": 27680 }, { "distill_loss": 0.2033005952835083, "epoch": 9.23282188125417, "step": 27680 }, { "epoch": 9.23282188125417, "ref_ce_loss": 0.07997366786003113, "step": 27680 }, { "epoch": 9.236157438292194, "loss": 0.3318, "step": 27690 }, { "epoch": 9.236157438292194, "grad_norm": 0.9657601714134216, "step": 27690 }, { "epoch": 9.236157438292194, "learning_rate": 1.2178553636403101e-05, "step": 27690 }, { "epoch": 9.236157438292194, "loss": 0.8263682126998901, "step": 27690 }, { "ce_loss": 0.07217461615800858, "epoch": 9.236157438292194, "step": 27690 }, { "distill_loss": 0.17183834314346313, "epoch": 9.236157438292194, "step": 27690 }, { "epoch": 9.236157438292194, "ref_ce_loss": 0.06462493538856506, "step": 27690 }, { "epoch": 9.236157438292194, "loss": 0.3429591655731201, "step": 27690 }, { "ce_loss": 0.04136383906006813, "epoch": 9.236157438292194, "step": 27690 }, { "distill_loss": 0.18173544108867645, "epoch": 9.236157438292194, "step": 27690 }, { "epoch": 9.236157438292194, "ref_ce_loss": 0.05896923691034317, "step": 27690 }, { "epoch": 9.23949299533022, "loss": 0.3641, "step": 27700 }, { "epoch": 9.23949299533022, "grad_norm": 1.0028430223464966, "step": 27700 }, { "epoch": 9.23949299533022, "learning_rate": 1.207296010093386e-05, "step": 27700 }, { "epoch": 9.23949299533022, "loss": 0.37061363458633423, "step": 27700 }, { "ce_loss": 0.030712340027093887, "epoch": 9.23949299533022, "step": 27700 }, { "distill_loss": 0.1883632242679596, "epoch": 9.23949299533022, "step": 27700 }, { "epoch": 9.23949299533022, "ref_ce_loss": 0.07665921002626419, "step": 27700 }, { "epoch": 9.23949299533022, "loss": 0.3146851658821106, "step": 27700 }, { "ce_loss": 0.055054206401109695, "epoch": 9.23949299533022, "step": 27700 }, { "distill_loss": 0.17789779603481293, "epoch": 9.23949299533022, "step": 27700 }, { "epoch": 9.23949299533022, "ref_ce_loss": 0.059667814522981644, "step": 27700 }, { "epoch": 9.242828552368245, "loss": 0.3384, "step": 27710 }, { "epoch": 9.242828552368245, "grad_norm": 1.1923667192459106, "step": 27710 }, { "epoch": 9.242828552368245, "learning_rate": 1.1967819318062835e-05, "step": 27710 }, { "epoch": 9.242828552368245, "loss": 0.2766267955303192, "step": 27710 }, { "ce_loss": 0.05385156348347664, "epoch": 9.242828552368245, "step": 27710 }, { "distill_loss": 0.14416009187698364, "epoch": 9.242828552368245, "step": 27710 }, { "epoch": 9.242828552368245, "ref_ce_loss": 0.0671229436993599, "step": 27710 }, { "epoch": 9.242828552368245, "loss": 0.3347581923007965, "step": 27710 }, { "ce_loss": 0.05921385437250137, "epoch": 9.242828552368245, "step": 27710 }, { "distill_loss": 0.18549509346485138, "epoch": 9.242828552368245, "step": 27710 }, { "epoch": 9.242828552368245, "ref_ce_loss": 0.0644620805978775, "step": 27710 }, { "epoch": 9.246164109406271, "loss": 0.3363, "step": 27720 }, { "epoch": 9.246164109406271, "grad_norm": 1.4752428531646729, "step": 27720 }, { "epoch": 9.246164109406271, "learning_rate": 1.1863131410500706e-05, "step": 27720 }, { "epoch": 9.246164109406271, "loss": 0.45307832956314087, "step": 27720 }, { "ce_loss": 0.06924276053905487, "epoch": 9.246164109406271, "step": 27720 }, { "distill_loss": 0.18389859795570374, "epoch": 9.246164109406271, "step": 27720 }, { "epoch": 9.246164109406271, "ref_ce_loss": 0.06640004366636276, "step": 27720 }, { "epoch": 9.246164109406271, "loss": 0.3857943117618561, "step": 27720 }, { "ce_loss": 0.05119406431913376, "epoch": 9.246164109406271, "step": 27720 }, { "distill_loss": 0.21899224817752838, "epoch": 9.246164109406271, "step": 27720 }, { "epoch": 9.246164109406271, "ref_ce_loss": 0.06401392817497253, "step": 27720 }, { "epoch": 9.249499666444295, "loss": 0.3581, "step": 27730 }, { "epoch": 9.249499666444295, "grad_norm": 0.9835862517356873, "step": 27730 }, { "epoch": 9.249499666444295, "learning_rate": 1.1758896500429428e-05, "step": 27730 }, { "epoch": 9.249499666444295, "loss": 0.29481619596481323, "step": 27730 }, { "ce_loss": 0.050965506583452225, "epoch": 9.249499666444295, "step": 27730 }, { "distill_loss": 0.15273383259773254, "epoch": 9.249499666444295, "step": 27730 }, { "epoch": 9.249499666444295, "ref_ce_loss": 0.0674981027841568, "step": 27730 }, { "epoch": 9.249499666444295, "loss": 0.3622666895389557, "step": 27730 }, { "ce_loss": 0.034598208963871, "epoch": 9.249499666444295, "step": 27730 }, { "distill_loss": 0.18676534295082092, "epoch": 9.249499666444295, "step": 27730 }, { "epoch": 9.249499666444295, "ref_ce_loss": 0.06467654556035995, "step": 27730 }, { "epoch": 9.252835223482322, "loss": 0.3628, "step": 27740 }, { "epoch": 9.252835223482322, "grad_norm": 0.8673005104064941, "step": 27740 }, { "epoch": 9.252835223482322, "learning_rate": 1.1655114709502447e-05, "step": 27740 }, { "epoch": 9.252835223482322, "loss": 0.29098695516586304, "step": 27740 }, { "ce_loss": 0.02971721440553665, "epoch": 9.252835223482322, "step": 27740 }, { "distill_loss": 0.15608163177967072, "epoch": 9.252835223482322, "step": 27740 }, { "epoch": 9.252835223482322, "ref_ce_loss": 0.052347734570503235, "step": 27740 }, { "epoch": 9.252835223482322, "loss": 0.24176278710365295, "step": 27740 }, { "ce_loss": 0.024755051359534264, "epoch": 9.252835223482322, "step": 27740 }, { "distill_loss": 0.12914715707302094, "epoch": 9.252835223482322, "step": 27740 }, { "epoch": 9.252835223482322, "ref_ce_loss": 0.057348188012838364, "step": 27740 }, { "epoch": 9.256170780520346, "loss": 0.3621, "step": 27750 }, { "epoch": 9.256170780520346, "grad_norm": 1.534041404724121, "step": 27750 }, { "epoch": 9.256170780520346, "learning_rate": 1.1551786158844246e-05, "step": 27750 }, { "epoch": 9.256170780520346, "loss": 0.2658484876155853, "step": 27750 }, { "ce_loss": 0.030037011951208115, "epoch": 9.256170780520346, "step": 27750 }, { "distill_loss": 0.15437012910842896, "epoch": 9.256170780520346, "step": 27750 }, { "epoch": 9.256170780520346, "ref_ce_loss": 0.05552500858902931, "step": 27750 }, { "epoch": 9.256170780520346, "loss": 0.29277652502059937, "step": 27750 }, { "ce_loss": 0.03480086103081703, "epoch": 9.256170780520346, "step": 27750 }, { "distill_loss": 0.18554575741291046, "epoch": 9.256170780520346, "step": 27750 }, { "epoch": 9.256170780520346, "ref_ce_loss": 0.05078282952308655, "step": 27750 }, { "epoch": 9.259506337558372, "loss": 0.3517, "step": 27760 }, { "epoch": 9.259506337558372, "grad_norm": 0.868651807308197, "step": 27760 }, { "epoch": 9.259506337558372, "learning_rate": 1.1448910969050363e-05, "step": 27760 }, { "epoch": 9.259506337558372, "loss": 0.9711523056030273, "step": 27760 }, { "ce_loss": 0.09152238070964813, "epoch": 9.259506337558372, "step": 27760 }, { "distill_loss": 0.22823432087898254, "epoch": 9.259506337558372, "step": 27760 }, { "epoch": 9.259506337558372, "ref_ce_loss": 0.07608745247125626, "step": 27760 }, { "epoch": 9.259506337558372, "loss": 0.34535083174705505, "step": 27760 }, { "ce_loss": 0.03188411146402359, "epoch": 9.259506337558372, "step": 27760 }, { "distill_loss": 0.14663533866405487, "epoch": 9.259506337558372, "step": 27760 }, { "epoch": 9.259506337558372, "ref_ce_loss": 0.0729282945394516, "step": 27760 }, { "epoch": 9.262841894596397, "loss": 0.3681, "step": 27770 }, { "epoch": 9.262841894596397, "grad_norm": 1.2464836835861206, "step": 27770 }, { "epoch": 9.262841894596397, "learning_rate": 1.1346489260187155e-05, "step": 27770 }, { "epoch": 9.262841894596397, "loss": 0.2442484349012375, "step": 27770 }, { "ce_loss": 0.016015250235795975, "epoch": 9.262841894596397, "step": 27770 }, { "distill_loss": 0.1256130486726761, "epoch": 9.262841894596397, "step": 27770 }, { "epoch": 9.262841894596397, "ref_ce_loss": 0.04880331829190254, "step": 27770 }, { "epoch": 9.262841894596397, "loss": 0.2674102783203125, "step": 27770 }, { "ce_loss": 0.01958044432103634, "epoch": 9.262841894596397, "step": 27770 }, { "distill_loss": 0.15196123719215393, "epoch": 9.262841894596397, "step": 27770 }, { "epoch": 9.262841894596397, "ref_ce_loss": 0.05983925610780716, "step": 27770 }, { "epoch": 9.266177451634423, "loss": 0.3175, "step": 27780 }, { "epoch": 9.266177451634423, "grad_norm": 1.0320186614990234, "step": 27780 }, { "epoch": 9.266177451634423, "learning_rate": 1.1244521151791887e-05, "step": 27780 }, { "epoch": 9.266177451634423, "loss": 0.3113320469856262, "step": 27780 }, { "ce_loss": 0.034634802490472794, "epoch": 9.266177451634423, "step": 27780 }, { "distill_loss": 0.19550350308418274, "epoch": 9.266177451634423, "step": 27780 }, { "epoch": 9.266177451634423, "ref_ce_loss": 0.05874716863036156, "step": 27780 }, { "epoch": 9.266177451634423, "loss": 0.2644667327404022, "step": 27780 }, { "ce_loss": 0.03832489624619484, "epoch": 9.266177451634423, "step": 27780 }, { "distill_loss": 0.11675840616226196, "epoch": 9.266177451634423, "step": 27780 }, { "epoch": 9.266177451634423, "ref_ce_loss": 0.06073426082730293, "step": 27780 }, { "epoch": 9.269513008672448, "loss": 0.3293, "step": 27790 }, { "epoch": 9.269513008672448, "grad_norm": 1.3379724025726318, "step": 27790 }, { "epoch": 9.269513008672448, "learning_rate": 1.114300676287221e-05, "step": 27790 }, { "epoch": 9.269513008672448, "loss": 0.42799869179725647, "step": 27790 }, { "ce_loss": 0.07342212647199631, "epoch": 9.269513008672448, "step": 27790 }, { "distill_loss": 0.16694113612174988, "epoch": 9.269513008672448, "step": 27790 }, { "epoch": 9.269513008672448, "ref_ce_loss": 0.07477279007434845, "step": 27790 }, { "epoch": 9.269513008672448, "loss": 0.4181850552558899, "step": 27790 }, { "ce_loss": 0.04979084059596062, "epoch": 9.269513008672448, "step": 27790 }, { "distill_loss": 0.2051783800125122, "epoch": 9.269513008672448, "step": 27790 }, { "epoch": 9.269513008672448, "ref_ce_loss": 0.06781855970621109, "step": 27790 }, { "epoch": 9.272848565710474, "loss": 0.325, "step": 27800 }, { "epoch": 9.272848565710474, "grad_norm": 1.0779036283493042, "step": 27800 }, { "epoch": 9.272848565710474, "learning_rate": 1.1041946211906418e-05, "step": 27800 }, { "epoch": 9.272848565710474, "loss": 0.20625613629817963, "step": 27800 }, { "ce_loss": 0.04753445088863373, "epoch": 9.272848565710474, "step": 27800 }, { "distill_loss": 0.12213581055402756, "epoch": 9.272848565710474, "step": 27800 }, { "epoch": 9.272848565710474, "ref_ce_loss": 0.03632541745901108, "step": 27800 }, { "epoch": 9.272848565710474, "loss": 0.35115480422973633, "step": 27800 }, { "ce_loss": 0.07139122486114502, "epoch": 9.272848565710474, "step": 27800 }, { "distill_loss": 0.17072322964668274, "epoch": 9.272848565710474, "step": 27800 }, { "epoch": 9.272848565710474, "ref_ce_loss": 0.06434401869773865, "step": 27800 }, { "epoch": 9.276184122748498, "loss": 0.3474, "step": 27810 }, { "epoch": 9.276184122748498, "grad_norm": 0.9952632784843445, "step": 27810 }, { "epoch": 9.276184122748498, "learning_rate": 1.0941339616843006e-05, "step": 27810 }, { "epoch": 9.276184122748498, "loss": 0.4912530779838562, "step": 27810 }, { "ce_loss": 0.04322037845849991, "epoch": 9.276184122748498, "step": 27810 }, { "distill_loss": 0.1929270476102829, "epoch": 9.276184122748498, "step": 27810 }, { "epoch": 9.276184122748498, "ref_ce_loss": 0.0678233802318573, "step": 27810 }, { "epoch": 9.276184122748498, "loss": 0.20256903767585754, "step": 27810 }, { "ce_loss": 0.022716665640473366, "epoch": 9.276184122748498, "step": 27810 }, { "distill_loss": 0.120547354221344, "epoch": 9.276184122748498, "step": 27810 }, { "epoch": 9.276184122748498, "ref_ce_loss": 0.04453602060675621, "step": 27810 }, { "epoch": 9.279519679786524, "loss": 0.3334, "step": 27820 }, { "epoch": 9.279519679786524, "grad_norm": 1.1183052062988281, "step": 27820 }, { "epoch": 9.279519679786524, "learning_rate": 1.0841187095100668e-05, "step": 27820 }, { "epoch": 9.279519679786524, "loss": 0.37389904260635376, "step": 27820 }, { "ce_loss": 0.031152600422501564, "epoch": 9.279519679786524, "step": 27820 }, { "distill_loss": 0.13883738219738007, "epoch": 9.279519679786524, "step": 27820 }, { "epoch": 9.279519679786524, "ref_ce_loss": 0.048974327743053436, "step": 27820 }, { "epoch": 9.279519679786524, "loss": 0.40885597467422485, "step": 27820 }, { "ce_loss": 0.08726421743631363, "epoch": 9.279519679786524, "step": 27820 }, { "distill_loss": 0.22419197857379913, "epoch": 9.279519679786524, "step": 27820 }, { "epoch": 9.279519679786524, "ref_ce_loss": 0.09708337485790253, "step": 27820 }, { "epoch": 9.282855236824549, "loss": 0.3368, "step": 27830 }, { "epoch": 9.282855236824549, "grad_norm": 0.9071395397186279, "step": 27830 }, { "epoch": 9.282855236824549, "learning_rate": 1.0741488763568263e-05, "step": 27830 }, { "epoch": 9.282855236824549, "loss": 0.2448447197675705, "step": 27830 }, { "ce_loss": 0.03727051243185997, "epoch": 9.282855236824549, "step": 27830 }, { "distill_loss": 0.12109909951686859, "epoch": 9.282855236824549, "step": 27830 }, { "epoch": 9.282855236824549, "ref_ce_loss": 0.06266836076974869, "step": 27830 }, { "epoch": 9.282855236824549, "loss": 0.4689825177192688, "step": 27830 }, { "ce_loss": 0.033723898231983185, "epoch": 9.282855236824549, "step": 27830 }, { "distill_loss": 0.16918250918388367, "epoch": 9.282855236824549, "step": 27830 }, { "epoch": 9.282855236824549, "ref_ce_loss": 0.046082448214292526, "step": 27830 }, { "epoch": 9.286190793862575, "loss": 0.3384, "step": 27840 }, { "epoch": 9.286190793862575, "grad_norm": 1.576418161392212, "step": 27840 }, { "epoch": 9.286190793862575, "learning_rate": 1.0642244738604356e-05, "step": 27840 }, { "epoch": 9.286190793862575, "loss": 0.2886964976787567, "step": 27840 }, { "ce_loss": 0.04928829148411751, "epoch": 9.286190793862575, "step": 27840 }, { "distill_loss": 0.1505202353000641, "epoch": 9.286190793862575, "step": 27840 }, { "epoch": 9.286190793862575, "ref_ce_loss": 0.059994716197252274, "step": 27840 }, { "epoch": 9.286190793862575, "loss": 0.31100377440452576, "step": 27840 }, { "ce_loss": 0.03669671714305878, "epoch": 9.286190793862575, "step": 27840 }, { "distill_loss": 0.17948545515537262, "epoch": 9.286190793862575, "step": 27840 }, { "epoch": 9.286190793862575, "ref_ce_loss": 0.05879655480384827, "step": 27840 }, { "epoch": 9.2895263509006, "loss": 0.3843, "step": 27850 }, { "epoch": 9.2895263509006, "grad_norm": 1.546885371208191, "step": 27850 }, { "epoch": 9.2895263509006, "learning_rate": 1.0543455136037495e-05, "step": 27850 }, { "epoch": 9.2895263509006, "loss": 0.35940316319465637, "step": 27850 }, { "ce_loss": 0.059567444026470184, "epoch": 9.2895263509006, "step": 27850 }, { "distill_loss": 0.20273186266422272, "epoch": 9.2895263509006, "step": 27850 }, { "epoch": 9.2895263509006, "ref_ce_loss": 0.06466156989336014, "step": 27850 }, { "epoch": 9.2895263509006, "loss": 0.4103323817253113, "step": 27850 }, { "ce_loss": 0.044299185276031494, "epoch": 9.2895263509006, "step": 27850 }, { "distill_loss": 0.15502387285232544, "epoch": 9.2895263509006, "step": 27850 }, { "epoch": 9.2895263509006, "ref_ce_loss": 0.0782664492726326, "step": 27850 }, { "epoch": 9.292861907938626, "loss": 0.4034, "step": 27860 }, { "epoch": 9.292861907938626, "grad_norm": 2.5437092781066895, "step": 27860 }, { "epoch": 9.292861907938626, "learning_rate": 1.0445120071165759e-05, "step": 27860 }, { "epoch": 9.292861907938626, "loss": 0.2795525789260864, "step": 27860 }, { "ce_loss": 0.050536125898361206, "epoch": 9.292861907938626, "step": 27860 }, { "distill_loss": 0.16047285497188568, "epoch": 9.292861907938626, "step": 27860 }, { "epoch": 9.292861907938626, "ref_ce_loss": 0.047581858932971954, "step": 27860 }, { "epoch": 9.292861907938626, "loss": 0.24045586585998535, "step": 27860 }, { "ce_loss": 0.027032705023884773, "epoch": 9.292861907938626, "step": 27860 }, { "distill_loss": 0.15831856429576874, "epoch": 9.292861907938626, "step": 27860 }, { "epoch": 9.292861907938626, "ref_ce_loss": 0.05472426861524582, "step": 27860 }, { "epoch": 9.29619746497665, "loss": 0.3288, "step": 27870 }, { "epoch": 9.29619746497665, "grad_norm": 1.1688228845596313, "step": 27870 }, { "epoch": 9.29619746497665, "learning_rate": 1.034723965875677e-05, "step": 27870 }, { "epoch": 9.29619746497665, "loss": 0.33194631338119507, "step": 27870 }, { "ce_loss": 0.031508613377809525, "epoch": 9.29619746497665, "step": 27870 }, { "distill_loss": 0.1925259530544281, "epoch": 9.29619746497665, "step": 27870 }, { "epoch": 9.29619746497665, "ref_ce_loss": 0.0777515098452568, "step": 27870 }, { "epoch": 9.29619746497665, "loss": 0.2267584502696991, "step": 27870 }, { "ce_loss": 0.01810150034725666, "epoch": 9.29619746497665, "step": 27870 }, { "distill_loss": 0.1645008623600006, "epoch": 9.29619746497665, "step": 27870 }, { "epoch": 9.29619746497665, "ref_ce_loss": 0.04402892664074898, "step": 27870 }, { "epoch": 9.299533022014677, "loss": 0.3294, "step": 27880 }, { "epoch": 9.299533022014677, "grad_norm": 1.1693073511123657, "step": 27880 }, { "epoch": 9.299533022014677, "learning_rate": 1.0249814013047455e-05, "step": 27880 }, { "epoch": 9.299533022014677, "loss": 0.37019437551498413, "step": 27880 }, { "ce_loss": 0.04375609755516052, "epoch": 9.299533022014677, "step": 27880 }, { "distill_loss": 0.15550339221954346, "epoch": 9.299533022014677, "step": 27880 }, { "epoch": 9.299533022014677, "ref_ce_loss": 0.06085462495684624, "step": 27880 }, { "epoch": 9.299533022014677, "loss": 0.30347010493278503, "step": 27880 }, { "ce_loss": 0.05368916317820549, "epoch": 9.299533022014677, "step": 27880 }, { "distill_loss": 0.16740760207176208, "epoch": 9.299533022014677, "step": 27880 }, { "epoch": 9.299533022014677, "ref_ce_loss": 0.08216187357902527, "step": 27880 }, { "epoch": 9.302868579052701, "loss": 0.3902, "step": 27890 }, { "epoch": 9.302868579052701, "grad_norm": 1.1709650754928589, "step": 27890 }, { "epoch": 9.302868579052701, "learning_rate": 1.0152843247744015e-05, "step": 27890 }, { "epoch": 9.302868579052701, "loss": 0.22320252656936646, "step": 27890 }, { "ce_loss": 0.02152726799249649, "epoch": 9.302868579052701, "step": 27890 }, { "distill_loss": 0.14967907965183258, "epoch": 9.302868579052701, "step": 27890 }, { "epoch": 9.302868579052701, "ref_ce_loss": 0.05175358057022095, "step": 27890 }, { "epoch": 9.302868579052701, "loss": 0.23550841212272644, "step": 27890 }, { "ce_loss": 0.024001408368349075, "epoch": 9.302868579052701, "step": 27890 }, { "distill_loss": 0.13121449947357178, "epoch": 9.302868579052701, "step": 27890 }, { "epoch": 9.302868579052701, "ref_ce_loss": 0.0575178898870945, "step": 27890 }, { "epoch": 9.306204136090727, "loss": 0.3341, "step": 27900 }, { "epoch": 9.306204136090727, "grad_norm": 1.2723243236541748, "step": 27900 }, { "epoch": 9.306204136090727, "learning_rate": 1.0056327476021831e-05, "step": 27900 }, { "epoch": 9.306204136090727, "loss": 0.2968706488609314, "step": 27900 }, { "ce_loss": 0.025245482102036476, "epoch": 9.306204136090727, "step": 27900 }, { "distill_loss": 0.14647573232650757, "epoch": 9.306204136090727, "step": 27900 }, { "epoch": 9.306204136090727, "ref_ce_loss": 0.05946638807654381, "step": 27900 }, { "epoch": 9.306204136090727, "loss": 0.2807845175266266, "step": 27900 }, { "ce_loss": 0.030696626752614975, "epoch": 9.306204136090727, "step": 27900 }, { "distill_loss": 0.18408195674419403, "epoch": 9.306204136090727, "step": 27900 }, { "epoch": 9.306204136090727, "ref_ce_loss": 0.06579753011465073, "step": 27900 }, { "epoch": 9.309539693128752, "loss": 0.3317, "step": 27910 }, { "epoch": 9.309539693128752, "grad_norm": 0.8362427353858948, "step": 27910 }, { "epoch": 9.309539693128752, "learning_rate": 9.96026681052511e-06, "step": 27910 }, { "epoch": 9.309539693128752, "loss": 0.5383756160736084, "step": 27910 }, { "ce_loss": 0.03612060472369194, "epoch": 9.309539693128752, "step": 27910 }, { "distill_loss": 0.18417330086231232, "epoch": 9.309539693128752, "step": 27910 }, { "epoch": 9.309539693128752, "ref_ce_loss": 0.05404425412416458, "step": 27910 }, { "epoch": 9.309539693128752, "loss": 0.26968586444854736, "step": 27910 }, { "ce_loss": 0.04281514510512352, "epoch": 9.309539693128752, "step": 27910 }, { "distill_loss": 0.13925480842590332, "epoch": 9.309539693128752, "step": 27910 }, { "epoch": 9.309539693128752, "ref_ce_loss": 0.06350491940975189, "step": 27910 }, { "epoch": 9.312875250166778, "loss": 0.3545, "step": 27920 }, { "epoch": 9.312875250166778, "grad_norm": 1.2448768615722656, "step": 27920 }, { "epoch": 9.312875250166778, "learning_rate": 9.864661363367101e-06, "step": 27920 }, { "epoch": 9.312875250166778, "loss": 0.2633543312549591, "step": 27920 }, { "ce_loss": 0.038385823369026184, "epoch": 9.312875250166778, "step": 27920 }, { "distill_loss": 0.14372439682483673, "epoch": 9.312875250166778, "step": 27920 }, { "epoch": 9.312875250166778, "ref_ce_loss": 0.061395276337862015, "step": 27920 }, { "epoch": 9.312875250166778, "loss": 0.46539098024368286, "step": 27920 }, { "ce_loss": 0.06722792237997055, "epoch": 9.312875250166778, "step": 27920 }, { "distill_loss": 0.1808156967163086, "epoch": 9.312875250166778, "step": 27920 }, { "epoch": 9.312875250166778, "ref_ce_loss": 0.06751332432031631, "step": 27920 }, { "epoch": 9.316210807204802, "loss": 0.357, "step": 27930 }, { "epoch": 9.316210807204802, "grad_norm": 1.2320008277893066, "step": 27930 }, { "epoch": 9.316210807204802, "learning_rate": 9.769511246129526e-06, "step": 27930 }, { "epoch": 9.316210807204802, "loss": 0.35350170731544495, "step": 27930 }, { "ce_loss": 0.01923227868974209, "epoch": 9.316210807204802, "step": 27930 }, { "distill_loss": 0.16920588910579681, "epoch": 9.316210807204802, "step": 27930 }, { "epoch": 9.316210807204802, "ref_ce_loss": 0.057839956134557724, "step": 27930 }, { "epoch": 9.316210807204802, "loss": 0.348351389169693, "step": 27930 }, { "ce_loss": 0.07113120704889297, "epoch": 9.316210807204802, "step": 27930 }, { "distill_loss": 0.17669452726840973, "epoch": 9.316210807204802, "step": 27930 }, { "epoch": 9.316210807204802, "ref_ce_loss": 0.07339446991682053, "step": 27930 }, { "epoch": 9.319546364242829, "loss": 0.3476, "step": 27940 }, { "epoch": 9.319546364242829, "grad_norm": 0.8937927484512329, "step": 27940 }, { "epoch": 9.319546364242829, "learning_rate": 9.674816569862887e-06, "step": 27940 }, { "epoch": 9.319546364242829, "loss": 0.28209495544433594, "step": 27940 }, { "ce_loss": 0.01969868130981922, "epoch": 9.319546364242829, "step": 27940 }, { "distill_loss": 0.13597862422466278, "epoch": 9.319546364242829, "step": 27940 }, { "epoch": 9.319546364242829, "ref_ce_loss": 0.06439182907342911, "step": 27940 }, { "epoch": 9.319546364242829, "loss": 0.36773681640625, "step": 27940 }, { "ce_loss": 0.04736010730266571, "epoch": 9.319546364242829, "step": 27940 }, { "distill_loss": 0.16696158051490784, "epoch": 9.319546364242829, "step": 27940 }, { "epoch": 9.319546364242829, "ref_ce_loss": 0.05129920691251755, "step": 27940 }, { "epoch": 9.322881921280853, "loss": 0.348, "step": 27950 }, { "epoch": 9.322881921280853, "grad_norm": 1.2185635566711426, "step": 27950 }, { "epoch": 9.322881921280853, "learning_rate": 9.580577445086025e-06, "step": 27950 }, { "epoch": 9.322881921280853, "loss": 0.3241177201271057, "step": 27950 }, { "ce_loss": 0.04607256501913071, "epoch": 9.322881921280853, "step": 27950 }, { "distill_loss": 0.1539202630519867, "epoch": 9.322881921280853, "step": 27950 }, { "epoch": 9.322881921280853, "ref_ce_loss": 0.054715596139431, "step": 27950 }, { "epoch": 9.322881921280853, "loss": 0.3932846188545227, "step": 27950 }, { "ce_loss": 0.029749859124422073, "epoch": 9.322881921280853, "step": 27950 }, { "distill_loss": 0.20683810114860535, "epoch": 9.322881921280853, "step": 27950 }, { "epoch": 9.322881921280853, "ref_ce_loss": 0.07392022013664246, "step": 27950 }, { "epoch": 9.32621747831888, "loss": 0.3126, "step": 27960 }, { "epoch": 9.32621747831888, "grad_norm": 0.9083806872367859, "step": 27960 }, { "epoch": 9.32621747831888, "learning_rate": 9.486793981786158e-06, "step": 27960 }, { "epoch": 9.32621747831888, "loss": 0.3950866758823395, "step": 27960 }, { "ce_loss": 0.06246607378125191, "epoch": 9.32621747831888, "step": 27960 }, { "distill_loss": 0.2375527322292328, "epoch": 9.32621747831888, "step": 27960 }, { "epoch": 9.32621747831888, "ref_ce_loss": 0.06229804828763008, "step": 27960 }, { "epoch": 9.32621747831888, "loss": 0.326924592256546, "step": 27960 }, { "ce_loss": 0.03776393085718155, "epoch": 9.32621747831888, "step": 27960 }, { "distill_loss": 0.1792905628681183, "epoch": 9.32621747831888, "step": 27960 }, { "epoch": 9.32621747831888, "ref_ce_loss": 0.053115006536245346, "step": 27960 }, { "epoch": 9.329553035356904, "loss": 0.3583, "step": 27970 }, { "epoch": 9.329553035356904, "grad_norm": 0.990890622138977, "step": 27970 }, { "epoch": 9.329553035356904, "learning_rate": 9.393466289418662e-06, "step": 27970 }, { "epoch": 9.329553035356904, "loss": 0.3162615895271301, "step": 27970 }, { "ce_loss": 0.055841028690338135, "epoch": 9.329553035356904, "step": 27970 }, { "distill_loss": 0.16669586300849915, "epoch": 9.329553035356904, "step": 27970 }, { "epoch": 9.329553035356904, "ref_ce_loss": 0.06520682573318481, "step": 27970 }, { "epoch": 9.329553035356904, "loss": 0.7081807851791382, "step": 27970 }, { "ce_loss": 0.05520046129822731, "epoch": 9.329553035356904, "step": 27970 }, { "distill_loss": 0.17978860437870026, "epoch": 9.329553035356904, "step": 27970 }, { "epoch": 9.329553035356904, "ref_ce_loss": 0.06650884449481964, "step": 27970 }, { "epoch": 9.33288859239493, "loss": 0.3937, "step": 27980 }, { "epoch": 9.33288859239493, "grad_norm": 1.0608779191970825, "step": 27980 }, { "epoch": 9.33288859239493, "learning_rate": 9.300594476907031e-06, "step": 27980 }, { "epoch": 9.33288859239493, "loss": 0.3241865932941437, "step": 27980 }, { "ce_loss": 0.05885559692978859, "epoch": 9.33288859239493, "step": 27980 }, { "distill_loss": 0.13292686641216278, "epoch": 9.33288859239493, "step": 27980 }, { "epoch": 9.33288859239493, "ref_ce_loss": 0.0678175836801529, "step": 27980 }, { "epoch": 9.33288859239493, "loss": 0.2897075414657593, "step": 27980 }, { "ce_loss": 0.01826099492609501, "epoch": 9.33288859239493, "step": 27980 }, { "distill_loss": 0.2000298947095871, "epoch": 9.33288859239493, "step": 27980 }, { "epoch": 9.33288859239493, "ref_ce_loss": 0.050724390894174576, "step": 27980 }, { "epoch": 9.336224149432955, "loss": 0.3584, "step": 27990 }, { "epoch": 9.336224149432955, "grad_norm": 1.5480546951293945, "step": 27990 }, { "epoch": 9.336224149432955, "learning_rate": 9.208178652642651e-06, "step": 27990 }, { "epoch": 9.336224149432955, "loss": 0.39300987124443054, "step": 27990 }, { "ce_loss": 0.020847437903285027, "epoch": 9.336224149432955, "step": 27990 }, { "distill_loss": 0.15032193064689636, "epoch": 9.336224149432955, "step": 27990 }, { "epoch": 9.336224149432955, "ref_ce_loss": 0.06037368252873421, "step": 27990 }, { "epoch": 9.336224149432955, "loss": 1.7793481349945068, "step": 27990 }, { "ce_loss": 0.06162801384925842, "epoch": 9.336224149432955, "step": 27990 }, { "distill_loss": 0.2171134352684021, "epoch": 9.336224149432955, "step": 27990 }, { "epoch": 9.336224149432955, "ref_ce_loss": 0.08113553375005722, "step": 27990 }, { "epoch": 9.33955970647098, "loss": 0.4329, "step": 28000 }, { "epoch": 9.33955970647098, "grad_norm": 2.510063648223877, "step": 28000 }, { "epoch": 9.33955970647098, "learning_rate": 9.11621892448471e-06, "step": 28000 }, { "epoch": 9.33955970647098, "loss": 0.3115657567977905, "step": 28000 }, { "ce_loss": 0.04781182110309601, "epoch": 9.33955970647098, "step": 28000 }, { "distill_loss": 0.19052788615226746, "epoch": 9.33955970647098, "step": 28000 }, { "epoch": 9.33955970647098, "ref_ce_loss": 0.05586477741599083, "step": 28000 }, { "epoch": 9.33955970647098, "loss": 0.3089342415332794, "step": 28000 }, { "ce_loss": 0.039804648607969284, "epoch": 9.33955970647098, "step": 28000 }, { "distill_loss": 0.18772819638252258, "epoch": 9.33955970647098, "step": 28000 }, { "epoch": 9.33955970647098, "ref_ce_loss": 0.05897476524114609, "step": 28000 }, { "epoch": 9.342895263509005, "loss": 0.3561, "step": 28010 }, { "epoch": 9.342895263509005, "grad_norm": 1.1208604574203491, "step": 28010 }, { "epoch": 9.342895263509005, "learning_rate": 9.02471539976011e-06, "step": 28010 }, { "epoch": 9.342895263509005, "loss": 0.3297306299209595, "step": 28010 }, { "ce_loss": 0.055515870451927185, "epoch": 9.342895263509005, "step": 28010 }, { "distill_loss": 0.18339870870113373, "epoch": 9.342895263509005, "step": 28010 }, { "epoch": 9.342895263509005, "ref_ce_loss": 0.06366925686597824, "step": 28010 }, { "epoch": 9.342895263509005, "loss": 0.381563276052475, "step": 28010 }, { "ce_loss": 0.06505367904901505, "epoch": 9.342895263509005, "step": 28010 }, { "distill_loss": 0.2163468599319458, "epoch": 9.342895263509005, "step": 28010 }, { "epoch": 9.342895263509005, "ref_ce_loss": 0.06672647595405579, "step": 28010 }, { "epoch": 9.346230820547031, "loss": 0.3435, "step": 28020 }, { "epoch": 9.346230820547031, "grad_norm": 1.4113551378250122, "step": 28020 }, { "epoch": 9.346230820547031, "learning_rate": 8.933668185263288e-06, "step": 28020 }, { "epoch": 9.346230820547031, "loss": 0.2933332324028015, "step": 28020 }, { "ce_loss": 0.06315771490335464, "epoch": 9.346230820547031, "step": 28020 }, { "distill_loss": 0.15205860137939453, "epoch": 9.346230820547031, "step": 28020 }, { "epoch": 9.346230820547031, "ref_ce_loss": 0.054141364991664886, "step": 28020 }, { "epoch": 9.346230820547031, "loss": 0.2890551686286926, "step": 28020 }, { "ce_loss": 0.031157713383436203, "epoch": 9.346230820547031, "step": 28020 }, { "distill_loss": 0.15079186856746674, "epoch": 9.346230820547031, "step": 28020 }, { "epoch": 9.346230820547031, "ref_ce_loss": 0.05227925628423691, "step": 28020 }, { "epoch": 9.349566377585056, "loss": 0.3245, "step": 28030 }, { "epoch": 9.349566377585056, "grad_norm": 1.0965887308120728, "step": 28030 }, { "epoch": 9.349566377585056, "learning_rate": 8.843077387256271e-06, "step": 28030 }, { "epoch": 9.349566377585056, "loss": 0.2993690073490143, "step": 28030 }, { "ce_loss": 0.04467551410198212, "epoch": 9.349566377585056, "step": 28030 }, { "distill_loss": 0.1874786913394928, "epoch": 9.349566377585056, "step": 28030 }, { "epoch": 9.349566377585056, "ref_ce_loss": 0.06677830219268799, "step": 28030 }, { "epoch": 9.349566377585056, "loss": 0.3004530370235443, "step": 28030 }, { "ce_loss": 0.02141544595360756, "epoch": 9.349566377585056, "step": 28030 }, { "distill_loss": 0.13225886225700378, "epoch": 9.349566377585056, "step": 28030 }, { "epoch": 9.349566377585056, "ref_ce_loss": 0.04972206801176071, "step": 28030 }, { "epoch": 9.352901934623082, "loss": 0.3292, "step": 28040 }, { "epoch": 9.352901934623082, "grad_norm": 1.0150585174560547, "step": 28040 }, { "epoch": 9.352901934623082, "learning_rate": 8.752943111468082e-06, "step": 28040 }, { "epoch": 9.352901934623082, "loss": 0.3618951737880707, "step": 28040 }, { "ce_loss": 0.0385294072329998, "epoch": 9.352901934623082, "step": 28040 }, { "distill_loss": 0.19680960476398468, "epoch": 9.352901934623082, "step": 28040 }, { "epoch": 9.352901934623082, "ref_ce_loss": 0.08438844233751297, "step": 28040 }, { "epoch": 9.352901934623082, "loss": 0.3036952316761017, "step": 28040 }, { "ce_loss": 0.021761497482657433, "epoch": 9.352901934623082, "step": 28040 }, { "distill_loss": 0.15879257023334503, "epoch": 9.352901934623082, "step": 28040 }, { "epoch": 9.352901934623082, "ref_ce_loss": 0.056911952793598175, "step": 28040 }, { "epoch": 9.356237491661107, "loss": 0.3486, "step": 28050 }, { "epoch": 9.356237491661107, "grad_norm": 1.1633007526397705, "step": 28050 }, { "epoch": 9.356237491661107, "learning_rate": 8.663265463095238e-06, "step": 28050 }, { "epoch": 9.356237491661107, "loss": 0.3508574068546295, "step": 28050 }, { "ce_loss": 0.06212170422077179, "epoch": 9.356237491661107, "step": 28050 }, { "distill_loss": 0.20021311938762665, "epoch": 9.356237491661107, "step": 28050 }, { "epoch": 9.356237491661107, "ref_ce_loss": 0.08815666288137436, "step": 28050 }, { "epoch": 9.356237491661107, "loss": 0.2556230425834656, "step": 28050 }, { "ce_loss": 0.03384950011968613, "epoch": 9.356237491661107, "step": 28050 }, { "distill_loss": 0.1471124291419983, "epoch": 9.356237491661107, "step": 28050 }, { "epoch": 9.356237491661107, "ref_ce_loss": 0.05072746053338051, "step": 28050 }, { "epoch": 9.359573048699133, "loss": 0.3159, "step": 28060 }, { "epoch": 9.359573048699133, "grad_norm": 0.8221273422241211, "step": 28060 }, { "epoch": 9.359573048699133, "learning_rate": 8.57404454680113e-06, "step": 28060 }, { "epoch": 9.359573048699133, "loss": 0.6741282343864441, "step": 28060 }, { "ce_loss": 0.10154782235622406, "epoch": 9.359573048699133, "step": 28060 }, { "distill_loss": 0.24765446782112122, "epoch": 9.359573048699133, "step": 28060 }, { "epoch": 9.359573048699133, "ref_ce_loss": 0.09763228893280029, "step": 28060 }, { "epoch": 9.359573048699133, "loss": 0.23623378574848175, "step": 28060 }, { "ce_loss": 0.037810999900102615, "epoch": 9.359573048699133, "step": 28060 }, { "distill_loss": 0.14654332399368286, "epoch": 9.359573048699133, "step": 28060 }, { "epoch": 9.359573048699133, "ref_ce_loss": 0.051665063947439194, "step": 28060 }, { "epoch": 9.362908605737157, "loss": 0.3258, "step": 28070 }, { "epoch": 9.362908605737157, "grad_norm": 0.7860631346702576, "step": 28070 }, { "epoch": 9.362908605737157, "learning_rate": 8.485280466716284e-06, "step": 28070 }, { "epoch": 9.362908605737157, "loss": 0.2561877965927124, "step": 28070 }, { "ce_loss": 0.045687541365623474, "epoch": 9.362908605737157, "step": 28070 }, { "distill_loss": 0.16708163917064667, "epoch": 9.362908605737157, "step": 28070 }, { "epoch": 9.362908605737157, "ref_ce_loss": 0.04313550889492035, "step": 28070 }, { "epoch": 9.362908605737157, "loss": 0.1792113482952118, "step": 28070 }, { "ce_loss": 0.012736702337861061, "epoch": 9.362908605737157, "step": 28070 }, { "distill_loss": 0.1242901012301445, "epoch": 9.362908605737157, "step": 28070 }, { "epoch": 9.362908605737157, "ref_ce_loss": 0.028838559985160828, "step": 28070 }, { "epoch": 9.366244162775184, "loss": 0.347, "step": 28080 }, { "epoch": 9.366244162775184, "grad_norm": 1.0601820945739746, "step": 28080 }, { "epoch": 9.366244162775184, "learning_rate": 8.39697332643783e-06, "step": 28080 }, { "epoch": 9.366244162775184, "loss": 0.4131063222885132, "step": 28080 }, { "ce_loss": 0.040342796593904495, "epoch": 9.366244162775184, "step": 28080 }, { "distill_loss": 0.22460660338401794, "epoch": 9.366244162775184, "step": 28080 }, { "epoch": 9.366244162775184, "ref_ce_loss": 0.08412070572376251, "step": 28080 }, { "epoch": 9.366244162775184, "loss": 0.38089701533317566, "step": 28080 }, { "ce_loss": 0.03375541418790817, "epoch": 9.366244162775184, "step": 28080 }, { "distill_loss": 0.15850619971752167, "epoch": 9.366244162775184, "step": 28080 }, { "epoch": 9.366244162775184, "ref_ce_loss": 0.06123631075024605, "step": 28080 }, { "epoch": 9.369579719813208, "loss": 0.3772, "step": 28090 }, { "epoch": 9.369579719813208, "grad_norm": 1.264539122581482, "step": 28090 }, { "epoch": 9.369579719813208, "learning_rate": 8.30912322902968e-06, "step": 28090 }, { "epoch": 9.369579719813208, "loss": 0.31226345896720886, "step": 28090 }, { "ce_loss": 0.047768257558345795, "epoch": 9.369579719813208, "step": 28090 }, { "distill_loss": 0.17230334877967834, "epoch": 9.369579719813208, "step": 28090 }, { "epoch": 9.369579719813208, "ref_ce_loss": 0.06894991546869278, "step": 28090 }, { "epoch": 9.369579719813208, "loss": 0.3473667502403259, "step": 28090 }, { "ce_loss": 0.05084200203418732, "epoch": 9.369579719813208, "step": 28090 }, { "distill_loss": 0.1941516399383545, "epoch": 9.369579719813208, "step": 28090 }, { "epoch": 9.369579719813208, "ref_ce_loss": 0.07747070491313934, "step": 28090 }, { "epoch": 9.372915276851234, "loss": 0.3396, "step": 28100 }, { "epoch": 9.372915276851234, "grad_norm": 2.1545708179473877, "step": 28100 }, { "epoch": 9.372915276851234, "learning_rate": 8.221730277022488e-06, "step": 28100 }, { "epoch": 9.372915276851234, "loss": 0.8439842462539673, "step": 28100 }, { "ce_loss": 0.044698260724544525, "epoch": 9.372915276851234, "step": 28100 }, { "distill_loss": 0.18024414777755737, "epoch": 9.372915276851234, "step": 28100 }, { "epoch": 9.372915276851234, "ref_ce_loss": 0.049982912838459015, "step": 28100 }, { "epoch": 9.372915276851234, "loss": 0.6536636352539062, "step": 28100 }, { "ce_loss": 0.03565140813589096, "epoch": 9.372915276851234, "step": 28100 }, { "distill_loss": 0.18457381427288055, "epoch": 9.372915276851234, "step": 28100 }, { "epoch": 9.372915276851234, "ref_ce_loss": 0.053002066910266876, "step": 28100 }, { "epoch": 9.376250833889259, "loss": 0.4018, "step": 28110 }, { "epoch": 9.376250833889259, "grad_norm": 1.1988686323165894, "step": 28110 }, { "epoch": 9.376250833889259, "learning_rate": 8.134794572413106e-06, "step": 28110 }, { "epoch": 9.376250833889259, "loss": 0.3881773352622986, "step": 28110 }, { "ce_loss": 0.055975291877985, "epoch": 9.376250833889259, "step": 28110 }, { "distill_loss": 0.1879711151123047, "epoch": 9.376250833889259, "step": 28110 }, { "epoch": 9.376250833889259, "ref_ce_loss": 0.07531239837408066, "step": 28110 }, { "epoch": 9.376250833889259, "loss": 0.4659346342086792, "step": 28110 }, { "ce_loss": 0.045423828065395355, "epoch": 9.376250833889259, "step": 28110 }, { "distill_loss": 0.14331869781017303, "epoch": 9.376250833889259, "step": 28110 }, { "epoch": 9.376250833889259, "ref_ce_loss": 0.08407054096460342, "step": 28110 }, { "epoch": 9.379586390927285, "loss": 0.3772, "step": 28120 }, { "epoch": 9.379586390927285, "grad_norm": 1.0428130626678467, "step": 28120 }, { "epoch": 9.379586390927285, "learning_rate": 8.048316216664908e-06, "step": 28120 }, { "epoch": 9.379586390927285, "loss": 0.40925315022468567, "step": 28120 }, { "ce_loss": 0.036471374332904816, "epoch": 9.379586390927285, "step": 28120 }, { "distill_loss": 0.20590730011463165, "epoch": 9.379586390927285, "step": 28120 }, { "epoch": 9.379586390927285, "ref_ce_loss": 0.07913188636302948, "step": 28120 }, { "epoch": 9.379586390927285, "loss": 0.44879019260406494, "step": 28120 }, { "ce_loss": 0.06290306150913239, "epoch": 9.379586390927285, "step": 28120 }, { "distill_loss": 0.19268600642681122, "epoch": 9.379586390927285, "step": 28120 }, { "epoch": 9.379586390927285, "ref_ce_loss": 0.07037319988012314, "step": 28120 }, { "epoch": 9.38292194796531, "loss": 0.3721, "step": 28130 }, { "epoch": 9.38292194796531, "grad_norm": 1.0024430751800537, "step": 28130 }, { "epoch": 9.38292194796531, "learning_rate": 7.962295310707424e-06, "step": 28130 }, { "epoch": 9.38292194796531, "loss": 0.3708806335926056, "step": 28130 }, { "ce_loss": 0.020602921023964882, "epoch": 9.38292194796531, "step": 28130 }, { "distill_loss": 0.13433831930160522, "epoch": 9.38292194796531, "step": 28130 }, { "epoch": 9.38292194796531, "ref_ce_loss": 0.053088247776031494, "step": 28130 }, { "epoch": 9.38292194796531, "loss": 0.24727138876914978, "step": 28130 }, { "ce_loss": 0.030050475150346756, "epoch": 9.38292194796531, "step": 28130 }, { "distill_loss": 0.14565815031528473, "epoch": 9.38292194796531, "step": 28130 }, { "epoch": 9.38292194796531, "ref_ce_loss": 0.07143155485391617, "step": 28130 }, { "epoch": 9.386257505003336, "loss": 0.3378, "step": 28140 }, { "epoch": 9.386257505003336, "grad_norm": 0.9009705781936646, "step": 28140 }, { "epoch": 9.386257505003336, "learning_rate": 7.876731954936346e-06, "step": 28140 }, { "epoch": 9.386257505003336, "loss": 0.27454906702041626, "step": 28140 }, { "ce_loss": 0.03395192325115204, "epoch": 9.386257505003336, "step": 28140 }, { "distill_loss": 0.13065120577812195, "epoch": 9.386257505003336, "step": 28140 }, { "epoch": 9.386257505003336, "ref_ce_loss": 0.051687318831682205, "step": 28140 }, { "epoch": 9.386257505003336, "loss": 0.3121086657047272, "step": 28140 }, { "ce_loss": 0.04262286424636841, "epoch": 9.386257505003336, "step": 28140 }, { "distill_loss": 0.18090584874153137, "epoch": 9.386257505003336, "step": 28140 }, { "epoch": 9.386257505003336, "ref_ce_loss": 0.06977871805429459, "step": 28140 }, { "epoch": 9.38959306204136, "loss": 0.3314, "step": 28150 }, { "epoch": 9.38959306204136, "grad_norm": 1.156765103340149, "step": 28150 }, { "epoch": 9.38959306204136, "learning_rate": 7.791626249213301e-06, "step": 28150 }, { "epoch": 9.38959306204136, "loss": 0.5182572603225708, "step": 28150 }, { "ce_loss": 0.028388435021042824, "epoch": 9.38959306204136, "step": 28150 }, { "distill_loss": 0.1807260513305664, "epoch": 9.38959306204136, "step": 28150 }, { "epoch": 9.38959306204136, "ref_ce_loss": 0.06478993594646454, "step": 28150 }, { "epoch": 9.38959306204136, "loss": 0.3600061237812042, "step": 28150 }, { "ce_loss": 0.03588123619556427, "epoch": 9.38959306204136, "step": 28150 }, { "distill_loss": 0.21699689328670502, "epoch": 9.38959306204136, "step": 28150 }, { "epoch": 9.38959306204136, "ref_ce_loss": 0.057673607021570206, "step": 28150 }, { "epoch": 9.392928619079386, "loss": 0.3238, "step": 28160 }, { "epoch": 9.392928619079386, "grad_norm": 0.9872157573699951, "step": 28160 }, { "epoch": 9.392928619079386, "learning_rate": 7.70697829286573e-06, "step": 28160 }, { "epoch": 9.392928619079386, "loss": 0.3103258013725281, "step": 28160 }, { "ce_loss": 0.02353808470070362, "epoch": 9.392928619079386, "step": 28160 }, { "distill_loss": 0.17616422474384308, "epoch": 9.392928619079386, "step": 28160 }, { "epoch": 9.392928619079386, "ref_ce_loss": 0.057305362075567245, "step": 28160 }, { "epoch": 9.392928619079386, "loss": 0.2516862452030182, "step": 28160 }, { "ce_loss": 0.015492540784180164, "epoch": 9.392928619079386, "step": 28160 }, { "distill_loss": 0.14997431635856628, "epoch": 9.392928619079386, "step": 28160 }, { "epoch": 9.392928619079386, "ref_ce_loss": 0.05999534949660301, "step": 28160 }, { "epoch": 9.39626417611741, "loss": 0.326, "step": 28170 }, { "epoch": 9.39626417611741, "grad_norm": 0.9308201670646667, "step": 28170 }, { "epoch": 9.39626417611741, "learning_rate": 7.622788184686958e-06, "step": 28170 }, { "epoch": 9.39626417611741, "loss": 0.31401658058166504, "step": 28170 }, { "ce_loss": 0.028999431058764458, "epoch": 9.39626417611741, "step": 28170 }, { "distill_loss": 0.1737707108259201, "epoch": 9.39626417611741, "step": 28170 }, { "epoch": 9.39626417611741, "ref_ce_loss": 0.05012731999158859, "step": 28170 }, { "epoch": 9.39626417611741, "loss": 0.4022290110588074, "step": 28170 }, { "ce_loss": 0.05216839537024498, "epoch": 9.39626417611741, "step": 28170 }, { "distill_loss": 0.21104665100574493, "epoch": 9.39626417611741, "step": 28170 }, { "epoch": 9.39626417611741, "ref_ce_loss": 0.07470600306987762, "step": 28170 }, { "epoch": 9.399599733155437, "loss": 0.3583, "step": 28180 }, { "epoch": 9.399599733155437, "grad_norm": 1.374057412147522, "step": 28180 }, { "epoch": 9.399599733155437, "learning_rate": 7.539056022935986e-06, "step": 28180 }, { "epoch": 9.399599733155437, "loss": 0.38167569041252136, "step": 28180 }, { "ce_loss": 0.055582933127880096, "epoch": 9.399599733155437, "step": 28180 }, { "distill_loss": 0.19373860955238342, "epoch": 9.399599733155437, "step": 28180 }, { "epoch": 9.399599733155437, "ref_ce_loss": 0.062473032623529434, "step": 28180 }, { "epoch": 9.399599733155437, "loss": 0.3486148416996002, "step": 28180 }, { "ce_loss": 0.025748463347554207, "epoch": 9.399599733155437, "step": 28180 }, { "distill_loss": 0.16828617453575134, "epoch": 9.399599733155437, "step": 28180 }, { "epoch": 9.399599733155437, "ref_ce_loss": 0.05271229147911072, "step": 28180 }, { "epoch": 9.402935290193462, "loss": 0.3882, "step": 28190 }, { "epoch": 9.402935290193462, "grad_norm": 1.0406385660171509, "step": 28190 }, { "epoch": 9.402935290193462, "learning_rate": 7.455781905337089e-06, "step": 28190 }, { "epoch": 9.402935290193462, "loss": 0.3039928078651428, "step": 28190 }, { "ce_loss": 0.07042131572961807, "epoch": 9.402935290193462, "step": 28190 }, { "distill_loss": 0.1568891704082489, "epoch": 9.402935290193462, "step": 28190 }, { "epoch": 9.402935290193462, "ref_ce_loss": 0.057233791798353195, "step": 28190 }, { "epoch": 9.402935290193462, "loss": 0.68961101770401, "step": 28190 }, { "ce_loss": 0.022175131365656853, "epoch": 9.402935290193462, "step": 28190 }, { "distill_loss": 0.1406789869070053, "epoch": 9.402935290193462, "step": 28190 }, { "epoch": 9.402935290193462, "ref_ce_loss": 0.05205327644944191, "step": 28190 }, { "epoch": 9.406270847231488, "loss": 0.345, "step": 28200 }, { "epoch": 9.406270847231488, "grad_norm": 1.2920187711715698, "step": 28200 }, { "epoch": 9.406270847231488, "learning_rate": 7.3729659290802555e-06, "step": 28200 }, { "epoch": 9.406270847231488, "loss": 0.4167248010635376, "step": 28200 }, { "ce_loss": 0.0833716168999672, "epoch": 9.406270847231488, "step": 28200 }, { "distill_loss": 0.21213626861572266, "epoch": 9.406270847231488, "step": 28200 }, { "epoch": 9.406270847231488, "ref_ce_loss": 0.1018867939710617, "step": 28200 }, { "epoch": 9.406270847231488, "loss": 0.3711719512939453, "step": 28200 }, { "ce_loss": 0.06522081792354584, "epoch": 9.406270847231488, "step": 28200 }, { "distill_loss": 0.17841891944408417, "epoch": 9.406270847231488, "step": 28200 }, { "epoch": 9.406270847231488, "ref_ce_loss": 0.06807571649551392, "step": 28200 }, { "epoch": 9.409606404269512, "loss": 0.3968, "step": 28210 }, { "epoch": 9.409606404269512, "grad_norm": 0.9074429273605347, "step": 28210 }, { "epoch": 9.409606404269512, "learning_rate": 7.2906081908206135e-06, "step": 28210 }, { "epoch": 9.409606404269512, "loss": 0.23347681760787964, "step": 28210 }, { "ce_loss": 0.034057483077049255, "epoch": 9.409606404269512, "step": 28210 }, { "distill_loss": 0.1385912448167801, "epoch": 9.409606404269512, "step": 28210 }, { "epoch": 9.409606404269512, "ref_ce_loss": 0.06067446991801262, "step": 28210 }, { "epoch": 9.409606404269512, "loss": 0.4569370150566101, "step": 28210 }, { "ce_loss": 0.03701329976320267, "epoch": 9.409606404269512, "step": 28210 }, { "distill_loss": 0.17066524922847748, "epoch": 9.409606404269512, "step": 28210 }, { "epoch": 9.409606404269512, "ref_ce_loss": 0.062454596161842346, "step": 28210 }, { "epoch": 9.412941961307538, "loss": 0.3649, "step": 28220 }, { "epoch": 9.412941961307538, "grad_norm": 0.9702861905097961, "step": 28220 }, { "epoch": 9.412941961307538, "learning_rate": 7.2087087866784755e-06, "step": 28220 }, { "epoch": 9.412941961307538, "loss": 0.3839876055717468, "step": 28220 }, { "ce_loss": 0.0502345971763134, "epoch": 9.412941961307538, "step": 28220 }, { "distill_loss": 0.16278576850891113, "epoch": 9.412941961307538, "step": 28220 }, { "epoch": 9.412941961307538, "ref_ce_loss": 0.07953373342752457, "step": 28220 }, { "epoch": 9.412941961307538, "loss": 0.3641839027404785, "step": 28220 }, { "ce_loss": 0.036535587161779404, "epoch": 9.412941961307538, "step": 28220 }, { "distill_loss": 0.14896996319293976, "epoch": 9.412941961307538, "step": 28220 }, { "epoch": 9.412941961307538, "ref_ce_loss": 0.08135932683944702, "step": 28220 }, { "epoch": 9.416277518345563, "loss": 0.349, "step": 28230 }, { "epoch": 9.416277518345563, "grad_norm": 1.0224323272705078, "step": 28230 }, { "epoch": 9.416277518345563, "learning_rate": 7.127267812239335e-06, "step": 28230 }, { "epoch": 9.416277518345563, "loss": 0.2518828511238098, "step": 28230 }, { "ce_loss": 0.02607305347919464, "epoch": 9.416277518345563, "step": 28230 }, { "distill_loss": 0.14921921491622925, "epoch": 9.416277518345563, "step": 28230 }, { "epoch": 9.416277518345563, "ref_ce_loss": 0.04809458181262016, "step": 28230 }, { "epoch": 9.416277518345563, "loss": 0.430438756942749, "step": 28230 }, { "ce_loss": 0.027637429535388947, "epoch": 9.416277518345563, "step": 28230 }, { "distill_loss": 0.1614387333393097, "epoch": 9.416277518345563, "step": 28230 }, { "epoch": 9.416277518345563, "ref_ce_loss": 0.057256538420915604, "step": 28230 }, { "epoch": 9.41961307538359, "loss": 0.3586, "step": 28240 }, { "epoch": 9.41961307538359, "grad_norm": 1.3031396865844727, "step": 28240 }, { "epoch": 9.41961307538359, "learning_rate": 7.046285362553428e-06, "step": 28240 }, { "epoch": 9.41961307538359, "loss": 0.3229510486125946, "step": 28240 }, { "ce_loss": 0.024988116696476936, "epoch": 9.41961307538359, "step": 28240 }, { "distill_loss": 0.195112943649292, "epoch": 9.41961307538359, "step": 28240 }, { "epoch": 9.41961307538359, "ref_ce_loss": 0.06355010718107224, "step": 28240 }, { "epoch": 9.41961307538359, "loss": 0.1794387400150299, "step": 28240 }, { "ce_loss": 0.008832786232233047, "epoch": 9.41961307538359, "step": 28240 }, { "distill_loss": 0.13273176550865173, "epoch": 9.41961307538359, "step": 28240 }, { "epoch": 9.41961307538359, "ref_ce_loss": 0.037653353065252304, "step": 28240 }, { "epoch": 9.422948632421614, "loss": 0.3492, "step": 28250 }, { "epoch": 9.422948632421614, "grad_norm": 1.0684586763381958, "step": 28250 }, { "epoch": 9.422948632421614, "learning_rate": 6.9657615321361284e-06, "step": 28250 }, { "epoch": 9.422948632421614, "loss": 0.3427239656448364, "step": 28250 }, { "ce_loss": 0.056136444211006165, "epoch": 9.422948632421614, "step": 28250 }, { "distill_loss": 0.19746029376983643, "epoch": 9.422948632421614, "step": 28250 }, { "epoch": 9.422948632421614, "ref_ce_loss": 0.0558629184961319, "step": 28250 }, { "epoch": 9.422948632421614, "loss": 0.36456623673439026, "step": 28250 }, { "ce_loss": 0.04950832575559616, "epoch": 9.422948632421614, "step": 28250 }, { "distill_loss": 0.18908393383026123, "epoch": 9.422948632421614, "step": 28250 }, { "epoch": 9.422948632421614, "ref_ce_loss": 0.06433885544538498, "step": 28250 }, { "epoch": 9.42628418945964, "loss": 0.336, "step": 28260 }, { "epoch": 9.42628418945964, "grad_norm": 0.9209970831871033, "step": 28260 }, { "epoch": 9.42628418945964, "learning_rate": 6.885696414967324e-06, "step": 28260 }, { "epoch": 9.42628418945964, "loss": 0.29922136664390564, "step": 28260 }, { "ce_loss": 0.02344970777630806, "epoch": 9.42628418945964, "step": 28260 }, { "distill_loss": 0.18124236166477203, "epoch": 9.42628418945964, "step": 28260 }, { "epoch": 9.42628418945964, "ref_ce_loss": 0.05942835658788681, "step": 28260 }, { "epoch": 9.42628418945964, "loss": 0.34051281213760376, "step": 28260 }, { "ce_loss": 0.052809569984674454, "epoch": 9.42628418945964, "step": 28260 }, { "distill_loss": 0.17449237406253815, "epoch": 9.42628418945964, "step": 28260 }, { "epoch": 9.42628418945964, "ref_ce_loss": 0.07341314107179642, "step": 28260 }, { "epoch": 9.429619746497664, "loss": 0.326, "step": 28270 }, { "epoch": 9.429619746497664, "grad_norm": 1.575285792350769, "step": 28270 }, { "epoch": 9.429619746497664, "learning_rate": 6.806090104491691e-06, "step": 28270 }, { "epoch": 9.429619746497664, "loss": 0.35301825404167175, "step": 28270 }, { "ce_loss": 0.04694546386599541, "epoch": 9.429619746497664, "step": 28270 }, { "distill_loss": 0.14384940266609192, "epoch": 9.429619746497664, "step": 28270 }, { "epoch": 9.429619746497664, "ref_ce_loss": 0.06755001097917557, "step": 28270 }, { "epoch": 9.429619746497664, "loss": 0.23267331719398499, "step": 28270 }, { "ce_loss": 0.021335484459996223, "epoch": 9.429619746497664, "step": 28270 }, { "distill_loss": 0.1247900053858757, "epoch": 9.429619746497664, "step": 28270 }, { "epoch": 9.429619746497664, "ref_ce_loss": 0.058344725519418716, "step": 28270 }, { "epoch": 9.43295530353569, "loss": 0.3326, "step": 28280 }, { "epoch": 9.43295530353569, "grad_norm": 0.9571400880813599, "step": 28280 }, { "epoch": 9.43295530353569, "learning_rate": 6.726942693618243e-06, "step": 28280 }, { "epoch": 9.43295530353569, "loss": 0.3410034477710724, "step": 28280 }, { "ce_loss": 0.04474620148539543, "epoch": 9.43295530353569, "step": 28280 }, { "distill_loss": 0.19685468077659607, "epoch": 9.43295530353569, "step": 28280 }, { "epoch": 9.43295530353569, "ref_ce_loss": 0.05485452339053154, "step": 28280 }, { "epoch": 9.43295530353569, "loss": 0.3124542534351349, "step": 28280 }, { "ce_loss": 0.04805135726928711, "epoch": 9.43295530353569, "step": 28280 }, { "distill_loss": 0.17606854438781738, "epoch": 9.43295530353569, "step": 28280 }, { "epoch": 9.43295530353569, "ref_ce_loss": 0.06179404631257057, "step": 28280 }, { "epoch": 9.436290860573715, "loss": 0.3536, "step": 28290 }, { "epoch": 9.436290860573715, "grad_norm": 2.1430957317352295, "step": 28290 }, { "epoch": 9.436290860573715, "learning_rate": 6.648254274720644e-06, "step": 28290 }, { "epoch": 9.436290860573715, "loss": 0.5729732513427734, "step": 28290 }, { "ce_loss": 0.0864250436425209, "epoch": 9.436290860573715, "step": 28290 }, { "distill_loss": 0.2062661051750183, "epoch": 9.436290860573715, "step": 28290 }, { "epoch": 9.436290860573715, "ref_ce_loss": 0.10020098090171814, "step": 28290 }, { "epoch": 9.436290860573715, "loss": 0.25513356924057007, "step": 28290 }, { "ce_loss": 0.03391078859567642, "epoch": 9.436290860573715, "step": 28290 }, { "distill_loss": 0.1471998691558838, "epoch": 9.436290860573715, "step": 28290 }, { "epoch": 9.436290860573715, "ref_ce_loss": 0.05318285524845123, "step": 28290 }, { "epoch": 9.439626417611741, "loss": 0.3672, "step": 28300 }, { "epoch": 9.439626417611741, "grad_norm": 3.0576388835906982, "step": 28300 }, { "epoch": 9.439626417611741, "learning_rate": 6.570024939636765e-06, "step": 28300 }, { "epoch": 9.439626417611741, "loss": 0.3838430643081665, "step": 28300 }, { "ce_loss": 0.05252906307578087, "epoch": 9.439626417611741, "step": 28300 }, { "distill_loss": 0.18925431370735168, "epoch": 9.439626417611741, "step": 28300 }, { "epoch": 9.439626417611741, "ref_ce_loss": 0.06236163154244423, "step": 28300 }, { "epoch": 9.439626417611741, "loss": 0.3756336271762848, "step": 28300 }, { "ce_loss": 0.03550528734922409, "epoch": 9.439626417611741, "step": 28300 }, { "distill_loss": 0.1285904347896576, "epoch": 9.439626417611741, "step": 28300 }, { "epoch": 9.439626417611741, "ref_ce_loss": 0.05124359950423241, "step": 28300 }, { "epoch": 9.442961974649766, "loss": 0.4092, "step": 28310 }, { "epoch": 9.442961974649766, "grad_norm": 1.1369801759719849, "step": 28310 }, { "epoch": 9.442961974649766, "learning_rate": 6.49225477966855e-06, "step": 28310 }, { "epoch": 9.442961974649766, "loss": 0.3174965977668762, "step": 28310 }, { "ce_loss": 0.031249184161424637, "epoch": 9.442961974649766, "step": 28310 }, { "distill_loss": 0.143992617726326, "epoch": 9.442961974649766, "step": 28310 }, { "epoch": 9.442961974649766, "ref_ce_loss": 0.04668223857879639, "step": 28310 }, { "epoch": 9.442961974649766, "loss": 0.2718362808227539, "step": 28310 }, { "ce_loss": 0.03560258075594902, "epoch": 9.442961974649766, "step": 28310 }, { "distill_loss": 0.17445442080497742, "epoch": 9.442961974649766, "step": 28310 }, { "epoch": 9.442961974649766, "ref_ce_loss": 0.06162376329302788, "step": 28310 }, { "epoch": 9.446297531687792, "loss": 0.3277, "step": 28320 }, { "epoch": 9.446297531687792, "grad_norm": 1.1759134531021118, "step": 28320 }, { "epoch": 9.446297531687792, "learning_rate": 6.414943885582192e-06, "step": 28320 }, { "epoch": 9.446297531687792, "loss": 0.31244683265686035, "step": 28320 }, { "ce_loss": 0.029821570962667465, "epoch": 9.446297531687792, "step": 28320 }, { "distill_loss": 0.1645803302526474, "epoch": 9.446297531687792, "step": 28320 }, { "epoch": 9.446297531687792, "ref_ce_loss": 0.06684327870607376, "step": 28320 }, { "epoch": 9.446297531687792, "loss": 0.3090226352214813, "step": 28320 }, { "ce_loss": 0.051436956971883774, "epoch": 9.446297531687792, "step": 28320 }, { "distill_loss": 0.178633451461792, "epoch": 9.446297531687792, "step": 28320 }, { "epoch": 9.446297531687792, "ref_ce_loss": 0.07815413177013397, "step": 28320 }, { "epoch": 9.449633088725816, "loss": 0.3709, "step": 28330 }, { "epoch": 9.449633088725816, "grad_norm": 1.9918997287750244, "step": 28330 }, { "epoch": 9.449633088725816, "learning_rate": 6.338092347607782e-06, "step": 28330 }, { "epoch": 9.449633088725816, "loss": 0.4164873957633972, "step": 28330 }, { "ce_loss": 0.06631603091955185, "epoch": 9.449633088725816, "step": 28330 }, { "distill_loss": 0.20923581719398499, "epoch": 9.449633088725816, "step": 28330 }, { "epoch": 9.449633088725816, "ref_ce_loss": 0.05797014757990837, "step": 28330 }, { "epoch": 9.449633088725816, "loss": 0.27249330282211304, "step": 28330 }, { "ce_loss": 0.06557907909154892, "epoch": 9.449633088725816, "step": 28330 }, { "distill_loss": 0.150743305683136, "epoch": 9.449633088725816, "step": 28330 }, { "epoch": 9.449633088725816, "ref_ce_loss": 0.05572926625609398, "step": 28330 }, { "epoch": 9.452968645763843, "loss": 0.3418, "step": 28340 }, { "epoch": 9.452968645763843, "grad_norm": 1.0681581497192383, "step": 28340 }, { "epoch": 9.452968645763843, "learning_rate": 6.26170025543944e-06, "step": 28340 }, { "epoch": 9.452968645763843, "loss": 0.4454483985900879, "step": 28340 }, { "ce_loss": 0.05483581870794296, "epoch": 9.452968645763843, "step": 28340 }, { "distill_loss": 0.1950834095478058, "epoch": 9.452968645763843, "step": 28340 }, { "epoch": 9.452968645763843, "ref_ce_loss": 0.08751071244478226, "step": 28340 }, { "epoch": 9.452968645763843, "loss": 0.28754252195358276, "step": 28340 }, { "ce_loss": 0.01936127245426178, "epoch": 9.452968645763843, "step": 28340 }, { "distill_loss": 0.1522999405860901, "epoch": 9.452968645763843, "step": 28340 }, { "epoch": 9.452968645763843, "ref_ce_loss": 0.08448221534490585, "step": 28340 }, { "epoch": 9.456304202801867, "loss": 0.3307, "step": 28350 }, { "epoch": 9.456304202801867, "grad_norm": 2.2357776165008545, "step": 28350 }, { "epoch": 9.456304202801867, "learning_rate": 6.1857676982348675e-06, "step": 28350 }, { "epoch": 9.456304202801867, "loss": 0.5820572376251221, "step": 28350 }, { "ce_loss": 0.0633431151509285, "epoch": 9.456304202801867, "step": 28350 }, { "distill_loss": 0.19299493730068207, "epoch": 9.456304202801867, "step": 28350 }, { "epoch": 9.456304202801867, "ref_ce_loss": 0.0826629102230072, "step": 28350 }, { "epoch": 9.456304202801867, "loss": 0.29847437143325806, "step": 28350 }, { "ce_loss": 0.025454547256231308, "epoch": 9.456304202801867, "step": 28350 }, { "distill_loss": 0.16801771521568298, "epoch": 9.456304202801867, "step": 28350 }, { "epoch": 9.456304202801867, "ref_ce_loss": 0.06851398944854736, "step": 28350 }, { "epoch": 9.459639759839893, "loss": 0.3551, "step": 28360 }, { "epoch": 9.459639759839893, "grad_norm": 1.0600039958953857, "step": 28360 }, { "epoch": 9.459639759839893, "learning_rate": 6.110294764615576e-06, "step": 28360 }, { "epoch": 9.459639759839893, "loss": 0.3243717849254608, "step": 28360 }, { "ce_loss": 0.034493587911129, "epoch": 9.459639759839893, "step": 28360 }, { "distill_loss": 0.1711745709180832, "epoch": 9.459639759839893, "step": 28360 }, { "epoch": 9.459639759839893, "ref_ce_loss": 0.07968221604824066, "step": 28360 }, { "epoch": 9.459639759839893, "loss": 0.353845477104187, "step": 28360 }, { "ce_loss": 0.04074955731630325, "epoch": 9.459639759839893, "step": 28360 }, { "distill_loss": 0.17961429059505463, "epoch": 9.459639759839893, "step": 28360 }, { "epoch": 9.459639759839893, "ref_ce_loss": 0.0658039078116417, "step": 28360 }, { "epoch": 9.462975316877918, "loss": 0.3469, "step": 28370 }, { "epoch": 9.462975316877918, "grad_norm": 1.6155695915222168, "step": 28370 }, { "epoch": 9.462975316877918, "learning_rate": 6.035281542666571e-06, "step": 28370 }, { "epoch": 9.462975316877918, "loss": 0.28634515404701233, "step": 28370 }, { "ce_loss": 0.03958677500486374, "epoch": 9.462975316877918, "step": 28370 }, { "distill_loss": 0.1375505030155182, "epoch": 9.462975316877918, "step": 28370 }, { "epoch": 9.462975316877918, "ref_ce_loss": 0.07145129144191742, "step": 28370 }, { "epoch": 9.462975316877918, "loss": 0.41217994689941406, "step": 28370 }, { "ce_loss": 0.04383128136396408, "epoch": 9.462975316877918, "step": 28370 }, { "distill_loss": 0.1717100888490677, "epoch": 9.462975316877918, "step": 28370 }, { "epoch": 9.462975316877918, "ref_ce_loss": 0.06356970965862274, "step": 28370 }, { "epoch": 9.466310873915944, "loss": 0.3584, "step": 28380 }, { "epoch": 9.466310873915944, "grad_norm": 0.9267465472221375, "step": 28380 }, { "epoch": 9.466310873915944, "learning_rate": 5.960728119936399e-06, "step": 28380 }, { "epoch": 9.466310873915944, "loss": 0.3397396504878998, "step": 28380 }, { "ce_loss": 0.060050420463085175, "epoch": 9.466310873915944, "step": 28380 }, { "distill_loss": 0.17834344506263733, "epoch": 9.466310873915944, "step": 28380 }, { "epoch": 9.466310873915944, "ref_ce_loss": 0.07434417307376862, "step": 28380 }, { "epoch": 9.466310873915944, "loss": 0.280172735452652, "step": 28380 }, { "ce_loss": 0.04056550934910774, "epoch": 9.466310873915944, "step": 28380 }, { "distill_loss": 0.12052446603775024, "epoch": 9.466310873915944, "step": 28380 }, { "epoch": 9.466310873915944, "ref_ce_loss": 0.054708823561668396, "step": 28380 }, { "epoch": 9.469646430953969, "loss": 0.3581, "step": 28390 }, { "epoch": 9.469646430953969, "grad_norm": 1.1847563982009888, "step": 28390 }, { "epoch": 9.469646430953969, "learning_rate": 5.88663458343679e-06, "step": 28390 }, { "epoch": 9.469646430953969, "loss": 0.36242082715034485, "step": 28390 }, { "ce_loss": 0.05414927378296852, "epoch": 9.469646430953969, "step": 28390 }, { "distill_loss": 0.15785610675811768, "epoch": 9.469646430953969, "step": 28390 }, { "epoch": 9.469646430953969, "ref_ce_loss": 0.06853984296321869, "step": 28390 }, { "epoch": 9.469646430953969, "loss": 0.2403300404548645, "step": 28390 }, { "ce_loss": 0.028485354036092758, "epoch": 9.469646430953969, "step": 28390 }, { "distill_loss": 0.15575310587882996, "epoch": 9.469646430953969, "step": 28390 }, { "epoch": 9.469646430953969, "ref_ce_loss": 0.036536816507577896, "step": 28390 }, { "epoch": 9.472981987991995, "loss": 0.3638, "step": 28400 }, { "epoch": 9.472981987991995, "grad_norm": 1.3311790227890015, "step": 28400 }, { "epoch": 9.472981987991995, "learning_rate": 5.813001019643016e-06, "step": 28400 }, { "epoch": 9.472981987991995, "loss": 0.3263969123363495, "step": 28400 }, { "ce_loss": 0.044232018291950226, "epoch": 9.472981987991995, "step": 28400 }, { "distill_loss": 0.16511671245098114, "epoch": 9.472981987991995, "step": 28400 }, { "epoch": 9.472981987991995, "ref_ce_loss": 0.08508986979722977, "step": 28400 }, { "epoch": 9.472981987991995, "loss": 0.2716098427772522, "step": 28400 }, { "ce_loss": 0.0314282663166523, "epoch": 9.472981987991995, "step": 28400 }, { "distill_loss": 0.15377303957939148, "epoch": 9.472981987991995, "step": 28400 }, { "epoch": 9.472981987991995, "ref_ce_loss": 0.05766654759645462, "step": 28400 }, { "epoch": 9.47631754503002, "loss": 0.3144, "step": 28410 }, { "epoch": 9.47631754503002, "grad_norm": 1.8568049669265747, "step": 28410 }, { "epoch": 9.47631754503002, "learning_rate": 5.739827514493357e-06, "step": 28410 }, { "epoch": 9.47631754503002, "loss": 0.32465481758117676, "step": 28410 }, { "ce_loss": 0.04101501405239105, "epoch": 9.47631754503002, "step": 28410 }, { "distill_loss": 0.14696335792541504, "epoch": 9.47631754503002, "step": 28410 }, { "epoch": 9.47631754503002, "ref_ce_loss": 0.06247838959097862, "step": 28410 }, { "epoch": 9.47631754503002, "loss": 0.3745761215686798, "step": 28410 }, { "ce_loss": 0.05117378756403923, "epoch": 9.47631754503002, "step": 28410 }, { "distill_loss": 0.18519654870033264, "epoch": 9.47631754503002, "step": 28410 }, { "epoch": 9.47631754503002, "ref_ce_loss": 0.06911353766918182, "step": 28410 }, { "epoch": 9.479653102068045, "loss": 0.3539, "step": 28420 }, { "epoch": 9.479653102068045, "grad_norm": 1.837537407875061, "step": 28420 }, { "epoch": 9.479653102068045, "learning_rate": 5.667114153389142e-06, "step": 28420 }, { "epoch": 9.479653102068045, "loss": 0.3297516405582428, "step": 28420 }, { "ce_loss": 0.029736388474702835, "epoch": 9.479653102068045, "step": 28420 }, { "distill_loss": 0.20523138344287872, "epoch": 9.479653102068045, "step": 28420 }, { "epoch": 9.479653102068045, "ref_ce_loss": 0.07494564354419708, "step": 28420 }, { "epoch": 9.479653102068045, "loss": 0.2599937915802002, "step": 28420 }, { "ce_loss": 0.03557970002293587, "epoch": 9.479653102068045, "step": 28420 }, { "distill_loss": 0.16381414234638214, "epoch": 9.479653102068045, "step": 28420 }, { "epoch": 9.479653102068045, "ref_ce_loss": 0.043581776320934296, "step": 28420 }, { "epoch": 9.48298865910607, "loss": 0.3452, "step": 28430 }, { "epoch": 9.48298865910607, "grad_norm": 1.0753458738327026, "step": 28430 }, { "epoch": 9.48298865910607, "learning_rate": 5.594861021194709e-06, "step": 28430 }, { "epoch": 9.48298865910607, "loss": 0.2921322286128998, "step": 28430 }, { "ce_loss": 0.04107680916786194, "epoch": 9.48298865910607, "step": 28430 }, { "distill_loss": 0.1625596284866333, "epoch": 9.48298865910607, "step": 28430 }, { "epoch": 9.48298865910607, "ref_ce_loss": 0.06385013461112976, "step": 28430 }, { "epoch": 9.48298865910607, "loss": 0.22679473459720612, "step": 28430 }, { "ce_loss": 0.01926489546895027, "epoch": 9.48298865910607, "step": 28430 }, { "distill_loss": 0.16145484149456024, "epoch": 9.48298865910607, "step": 28430 }, { "epoch": 9.48298865910607, "ref_ce_loss": 0.045216940343379974, "step": 28430 }, { "epoch": 9.486324216144096, "loss": 0.3659, "step": 28440 }, { "epoch": 9.486324216144096, "grad_norm": 1.2148197889328003, "step": 28440 }, { "epoch": 9.486324216144096, "learning_rate": 5.523068202237136e-06, "step": 28440 }, { "epoch": 9.486324216144096, "loss": 0.42336106300354004, "step": 28440 }, { "ce_loss": 0.05302092432975769, "epoch": 9.486324216144096, "step": 28440 }, { "distill_loss": 0.22245055437088013, "epoch": 9.486324216144096, "step": 28440 }, { "epoch": 9.486324216144096, "ref_ce_loss": 0.06507351249456406, "step": 28440 }, { "epoch": 9.486324216144096, "loss": 0.28218674659729004, "step": 28440 }, { "ce_loss": 0.02374487742781639, "epoch": 9.486324216144096, "step": 28440 }, { "distill_loss": 0.16830646991729736, "epoch": 9.486324216144096, "step": 28440 }, { "epoch": 9.486324216144096, "ref_ce_loss": 0.05579949542880058, "step": 28440 }, { "epoch": 9.48965977318212, "loss": 0.3579, "step": 28450 }, { "epoch": 9.48965977318212, "grad_norm": 0.9573124051094055, "step": 28450 }, { "epoch": 9.48965977318212, "learning_rate": 5.45173578030651e-06, "step": 28450 }, { "epoch": 9.48965977318212, "loss": 0.29650261998176575, "step": 28450 }, { "ce_loss": 0.029416002333164215, "epoch": 9.48965977318212, "step": 28450 }, { "distill_loss": 0.19150586426258087, "epoch": 9.48965977318212, "step": 28450 }, { "epoch": 9.48965977318212, "ref_ce_loss": 0.057308148592710495, "step": 28450 }, { "epoch": 9.48965977318212, "loss": 0.5148458480834961, "step": 28450 }, { "ce_loss": 0.06594638526439667, "epoch": 9.48965977318212, "step": 28450 }, { "distill_loss": 0.19446289539337158, "epoch": 9.48965977318212, "step": 28450 }, { "epoch": 9.48965977318212, "ref_ce_loss": 0.06992293149232864, "step": 28450 }, { "epoch": 9.492995330220147, "loss": 0.3606, "step": 28460 }, { "epoch": 9.492995330220147, "grad_norm": 1.1321545839309692, "step": 28460 }, { "epoch": 9.492995330220147, "learning_rate": 5.380863838655348e-06, "step": 28460 }, { "epoch": 9.492995330220147, "loss": 0.25574859976768494, "step": 28460 }, { "ce_loss": 0.03863326460123062, "epoch": 9.492995330220147, "step": 28460 }, { "distill_loss": 0.1366451531648636, "epoch": 9.492995330220147, "step": 28460 }, { "epoch": 9.492995330220147, "ref_ce_loss": 0.056483346968889236, "step": 28460 }, { "epoch": 9.492995330220147, "loss": 0.3196426331996918, "step": 28460 }, { "ce_loss": 0.043017297983169556, "epoch": 9.492995330220147, "step": 28460 }, { "distill_loss": 0.19430020451545715, "epoch": 9.492995330220147, "step": 28460 }, { "epoch": 9.492995330220147, "ref_ce_loss": 0.0640929564833641, "step": 28460 }, { "epoch": 9.496330887258171, "loss": 0.3627, "step": 28470 }, { "epoch": 9.496330887258171, "grad_norm": 1.075714349746704, "step": 28470 }, { "epoch": 9.496330887258171, "learning_rate": 5.31045245999886e-06, "step": 28470 }, { "epoch": 9.496330887258171, "loss": 0.28867387771606445, "step": 28470 }, { "ce_loss": 0.03961672633886337, "epoch": 9.496330887258171, "step": 28470 }, { "distill_loss": 0.17778590321540833, "epoch": 9.496330887258171, "step": 28470 }, { "epoch": 9.496330887258171, "ref_ce_loss": 0.047913145273923874, "step": 28470 }, { "epoch": 9.496330887258171, "loss": 0.3131311535835266, "step": 28470 }, { "ce_loss": 0.04571129009127617, "epoch": 9.496330887258171, "step": 28470 }, { "distill_loss": 0.18334239721298218, "epoch": 9.496330887258171, "step": 28470 }, { "epoch": 9.496330887258171, "ref_ce_loss": 0.08380914479494095, "step": 28470 }, { "epoch": 9.499666444296198, "loss": 0.371, "step": 28480 }, { "epoch": 9.499666444296198, "grad_norm": 0.9681311845779419, "step": 28480 }, { "epoch": 9.499666444296198, "learning_rate": 5.240501726514735e-06, "step": 28480 }, { "epoch": 9.499666444296198, "loss": 0.3067833185195923, "step": 28480 }, { "ce_loss": 0.030180882662534714, "epoch": 9.499666444296198, "step": 28480 }, { "distill_loss": 0.16230109333992004, "epoch": 9.499666444296198, "step": 28480 }, { "epoch": 9.499666444296198, "ref_ce_loss": 0.055661167949438095, "step": 28480 }, { "epoch": 9.499666444296198, "loss": 0.31161099672317505, "step": 28480 }, { "ce_loss": 0.04685712605714798, "epoch": 9.499666444296198, "step": 28480 }, { "distill_loss": 0.15889328718185425, "epoch": 9.499666444296198, "step": 28480 }, { "epoch": 9.499666444296198, "ref_ce_loss": 0.06073801964521408, "step": 28480 }, { "epoch": 9.503002001334222, "loss": 0.3329, "step": 28490 }, { "epoch": 9.503002001334222, "grad_norm": 1.2925001382827759, "step": 28490 }, { "epoch": 9.503002001334222, "learning_rate": 5.171011719842955e-06, "step": 28490 }, { "epoch": 9.503002001334222, "loss": 0.39370524883270264, "step": 28490 }, { "ce_loss": 0.041692305356264114, "epoch": 9.503002001334222, "step": 28490 }, { "distill_loss": 0.15185731649398804, "epoch": 9.503002001334222, "step": 28490 }, { "epoch": 9.503002001334222, "ref_ce_loss": 0.0779699832201004, "step": 28490 }, { "epoch": 9.503002001334222, "loss": 0.39848214387893677, "step": 28490 }, { "ce_loss": 0.06291890889406204, "epoch": 9.503002001334222, "step": 28490 }, { "distill_loss": 0.19283688068389893, "epoch": 9.503002001334222, "step": 28490 }, { "epoch": 9.503002001334222, "ref_ce_loss": 0.06672590225934982, "step": 28490 }, { "epoch": 9.506337558372248, "loss": 0.3091, "step": 28500 }, { "epoch": 9.506337558372248, "grad_norm": 1.1803241968154907, "step": 28500 }, { "epoch": 9.506337558372248, "learning_rate": 5.101982521085846e-06, "step": 28500 }, { "epoch": 9.506337558372248, "loss": 0.3626993000507355, "step": 28500 }, { "ce_loss": 0.06457323580980301, "epoch": 9.506337558372248, "step": 28500 }, { "distill_loss": 0.20585688948631287, "epoch": 9.506337558372248, "step": 28500 }, { "epoch": 9.506337558372248, "ref_ce_loss": 0.0644054189324379, "step": 28500 }, { "epoch": 9.506337558372248, "loss": 0.22315269708633423, "step": 28500 }, { "ce_loss": 0.021638058125972748, "epoch": 9.506337558372248, "step": 28500 }, { "distill_loss": 0.14434072375297546, "epoch": 9.506337558372248, "step": 28500 }, { "epoch": 9.506337558372248, "ref_ce_loss": 0.046219129115343094, "step": 28500 }, { "epoch": 9.509673115410273, "loss": 0.3629, "step": 28510 }, { "epoch": 9.509673115410273, "grad_norm": 1.8690041303634644, "step": 28510 }, { "epoch": 9.509673115410273, "learning_rate": 5.033414210807896e-06, "step": 28510 }, { "epoch": 9.509673115410273, "loss": 0.3256169557571411, "step": 28510 }, { "ce_loss": 0.04394880309700966, "epoch": 9.509673115410273, "step": 28510 }, { "distill_loss": 0.1812673956155777, "epoch": 9.509673115410273, "step": 28510 }, { "epoch": 9.509673115410273, "ref_ce_loss": 0.06298881769180298, "step": 28510 }, { "epoch": 9.509673115410273, "loss": 0.3566301465034485, "step": 28510 }, { "ce_loss": 0.05997853726148605, "epoch": 9.509673115410273, "step": 28510 }, { "distill_loss": 0.16302239894866943, "epoch": 9.509673115410273, "step": 28510 }, { "epoch": 9.509673115410273, "ref_ce_loss": 0.06253132224082947, "step": 28510 }, { "epoch": 9.513008672448299, "loss": 0.3379, "step": 28520 }, { "epoch": 9.513008672448299, "grad_norm": 0.9525768756866455, "step": 28520 }, { "epoch": 9.513008672448299, "learning_rate": 4.9653068690357575e-06, "step": 28520 }, { "epoch": 9.513008672448299, "loss": 0.5037875175476074, "step": 28520 }, { "ce_loss": 0.03574688360095024, "epoch": 9.513008672448299, "step": 28520 }, { "distill_loss": 0.17958644032478333, "epoch": 9.513008672448299, "step": 28520 }, { "epoch": 9.513008672448299, "ref_ce_loss": 0.051626432687044144, "step": 28520 }, { "epoch": 9.513008672448299, "loss": 0.3209737539291382, "step": 28520 }, { "ce_loss": 0.04713620990514755, "epoch": 9.513008672448299, "step": 28520 }, { "distill_loss": 0.17461776733398438, "epoch": 9.513008672448299, "step": 28520 }, { "epoch": 9.513008672448299, "ref_ce_loss": 0.05999940261244774, "step": 28520 }, { "epoch": 9.516344229486323, "loss": 0.3456, "step": 28530 }, { "epoch": 9.516344229486323, "grad_norm": 1.2907161712646484, "step": 28530 }, { "epoch": 9.516344229486323, "learning_rate": 4.897660575258023e-06, "step": 28530 }, { "epoch": 9.516344229486323, "loss": 0.48610827326774597, "step": 28530 }, { "ce_loss": 0.05657501518726349, "epoch": 9.516344229486323, "step": 28530 }, { "distill_loss": 0.2150333821773529, "epoch": 9.516344229486323, "step": 28530 }, { "epoch": 9.516344229486323, "ref_ce_loss": 0.08219249546527863, "step": 28530 }, { "epoch": 9.516344229486323, "loss": 0.2625681161880493, "step": 28530 }, { "ce_loss": 0.024848783388733864, "epoch": 9.516344229486323, "step": 28530 }, { "distill_loss": 0.15303784608840942, "epoch": 9.516344229486323, "step": 28530 }, { "epoch": 9.516344229486323, "ref_ce_loss": 0.05679015815258026, "step": 28530 }, { "epoch": 9.51967978652435, "loss": 0.3443, "step": 28540 }, { "epoch": 9.51967978652435, "grad_norm": 0.9048293828964233, "step": 28540 }, { "epoch": 9.51967978652435, "learning_rate": 4.830475408425139e-06, "step": 28540 }, { "epoch": 9.51967978652435, "loss": 0.3716161251068115, "step": 28540 }, { "ce_loss": 0.04324284568428993, "epoch": 9.51967978652435, "step": 28540 }, { "distill_loss": 0.15873301029205322, "epoch": 9.51967978652435, "step": 28540 }, { "epoch": 9.51967978652435, "ref_ce_loss": 0.05497406795620918, "step": 28540 }, { "epoch": 9.51967978652435, "loss": 0.40590211749076843, "step": 28540 }, { "ce_loss": 0.06277185678482056, "epoch": 9.51967978652435, "step": 28540 }, { "distill_loss": 0.19825831055641174, "epoch": 9.51967978652435, "step": 28540 }, { "epoch": 9.51967978652435, "ref_ce_loss": 0.059562280774116516, "step": 28540 }, { "epoch": 9.523015343562374, "loss": 0.3976, "step": 28550 }, { "epoch": 9.523015343562374, "grad_norm": 1.2088080644607544, "step": 28550 }, { "epoch": 9.523015343562374, "learning_rate": 4.76375144694945e-06, "step": 28550 }, { "epoch": 9.523015343562374, "loss": 0.36790820956230164, "step": 28550 }, { "ce_loss": 0.051003482192754745, "epoch": 9.523015343562374, "step": 28550 }, { "distill_loss": 0.2250767946243286, "epoch": 9.523015343562374, "step": 28550 }, { "epoch": 9.523015343562374, "ref_ce_loss": 0.06892771273851395, "step": 28550 }, { "epoch": 9.523015343562374, "loss": 0.32154223322868347, "step": 28550 }, { "ce_loss": 0.0505017451941967, "epoch": 9.523015343562374, "step": 28550 }, { "distill_loss": 0.17499235272407532, "epoch": 9.523015343562374, "step": 28550 }, { "epoch": 9.523015343562374, "ref_ce_loss": 0.04032899811863899, "step": 28550 }, { "epoch": 9.5263509006004, "loss": 0.3422, "step": 28560 }, { "epoch": 9.5263509006004, "grad_norm": 1.096779465675354, "step": 28560 }, { "epoch": 9.5263509006004, "learning_rate": 4.697488768705016e-06, "step": 28560 }, { "epoch": 9.5263509006004, "loss": 0.3258456587791443, "step": 28560 }, { "ce_loss": 0.04681219905614853, "epoch": 9.5263509006004, "step": 28560 }, { "distill_loss": 0.15567587316036224, "epoch": 9.5263509006004, "step": 28560 }, { "epoch": 9.5263509006004, "ref_ce_loss": 0.056013528257608414, "step": 28560 }, { "epoch": 9.5263509006004, "loss": 0.3023555278778076, "step": 28560 }, { "ce_loss": 0.033247340470552444, "epoch": 9.5263509006004, "step": 28560 }, { "distill_loss": 0.16405390202999115, "epoch": 9.5263509006004, "step": 28560 }, { "epoch": 9.5263509006004, "ref_ce_loss": 0.06972135603427887, "step": 28560 }, { "epoch": 9.529686457638425, "loss": 0.3583, "step": 28570 }, { "epoch": 9.529686457638425, "grad_norm": 0.7542944550514221, "step": 28570 }, { "epoch": 9.529686457638425, "learning_rate": 4.631687451027489e-06, "step": 28570 }, { "epoch": 9.529686457638425, "loss": 0.29079288244247437, "step": 28570 }, { "ce_loss": 0.029417484998703003, "epoch": 9.529686457638425, "step": 28570 }, { "distill_loss": 0.18318980932235718, "epoch": 9.529686457638425, "step": 28570 }, { "epoch": 9.529686457638425, "ref_ce_loss": 0.05650854483246803, "step": 28570 }, { "epoch": 9.529686457638425, "loss": 0.2807072103023529, "step": 28570 }, { "ce_loss": 0.029622601345181465, "epoch": 9.529686457638425, "step": 28570 }, { "distill_loss": 0.16439847648143768, "epoch": 9.529686457638425, "step": 28570 }, { "epoch": 9.529686457638425, "ref_ce_loss": 0.057991236448287964, "step": 28570 }, { "epoch": 9.533022014676451, "loss": 0.308, "step": 28580 }, { "epoch": 9.533022014676451, "grad_norm": 1.3596519231796265, "step": 28580 }, { "epoch": 9.533022014676451, "learning_rate": 4.566347570714102e-06, "step": 28580 }, { "epoch": 9.533022014676451, "loss": 0.22487899661064148, "step": 28580 }, { "ce_loss": 0.03615245968103409, "epoch": 9.533022014676451, "step": 28580 }, { "distill_loss": 0.14556825160980225, "epoch": 9.533022014676451, "step": 28580 }, { "epoch": 9.533022014676451, "ref_ce_loss": 0.042983442544937134, "step": 28580 }, { "epoch": 9.533022014676451, "loss": 0.5066213607788086, "step": 28580 }, { "ce_loss": 0.055620044469833374, "epoch": 9.533022014676451, "step": 28580 }, { "distill_loss": 0.17550809681415558, "epoch": 9.533022014676451, "step": 28580 }, { "epoch": 9.533022014676451, "ref_ce_loss": 0.07425516843795776, "step": 28580 }, { "epoch": 9.536357571714476, "loss": 0.3642, "step": 28590 }, { "epoch": 9.536357571714476, "grad_norm": 0.8081924319267273, "step": 28590 }, { "epoch": 9.536357571714476, "learning_rate": 4.5014692040235455e-06, "step": 28590 }, { "epoch": 9.536357571714476, "loss": 0.4211300015449524, "step": 28590 }, { "ce_loss": 0.02656661719083786, "epoch": 9.536357571714476, "step": 28590 }, { "distill_loss": 0.15321145951747894, "epoch": 9.536357571714476, "step": 28590 }, { "epoch": 9.536357571714476, "ref_ce_loss": 0.06345057487487793, "step": 28590 }, { "epoch": 9.536357571714476, "loss": 0.3505942225456238, "step": 28590 }, { "ce_loss": 0.01590488851070404, "epoch": 9.536357571714476, "step": 28590 }, { "distill_loss": 0.15594804286956787, "epoch": 9.536357571714476, "step": 28590 }, { "epoch": 9.536357571714476, "ref_ce_loss": 0.0463242270052433, "step": 28590 }, { "epoch": 9.539693128752502, "loss": 0.3481, "step": 28600 }, { "epoch": 9.539693128752502, "grad_norm": 1.1376338005065918, "step": 28600 }, { "epoch": 9.539693128752502, "learning_rate": 4.437052426675781e-06, "step": 28600 }, { "epoch": 9.539693128752502, "loss": 0.3184284567832947, "step": 28600 }, { "ce_loss": 0.025577332824468613, "epoch": 9.539693128752502, "step": 28600 }, { "distill_loss": 0.14735321700572968, "epoch": 9.539693128752502, "step": 28600 }, { "epoch": 9.539693128752502, "ref_ce_loss": 0.05905190855264664, "step": 28600 }, { "epoch": 9.539693128752502, "loss": 1.036750316619873, "step": 28600 }, { "ce_loss": 0.051478635519742966, "epoch": 9.539693128752502, "step": 28600 }, { "distill_loss": 0.19417119026184082, "epoch": 9.539693128752502, "step": 28600 }, { "epoch": 9.539693128752502, "ref_ce_loss": 0.09487366676330566, "step": 28600 }, { "epoch": 9.543028685790526, "loss": 0.3749, "step": 28610 }, { "epoch": 9.543028685790526, "grad_norm": 1.288905143737793, "step": 28610 }, { "epoch": 9.543028685790526, "learning_rate": 4.373097313852182e-06, "step": 28610 }, { "epoch": 9.543028685790526, "loss": 0.33301958441734314, "step": 28610 }, { "ce_loss": 0.03987917676568031, "epoch": 9.543028685790526, "step": 28610 }, { "distill_loss": 0.20812353491783142, "epoch": 9.543028685790526, "step": 28610 }, { "epoch": 9.543028685790526, "ref_ce_loss": 0.0582069493830204, "step": 28610 }, { "epoch": 9.543028685790526, "loss": 0.5744978785514832, "step": 28610 }, { "ce_loss": 0.07136088609695435, "epoch": 9.543028685790526, "step": 28610 }, { "distill_loss": 0.18274860084056854, "epoch": 9.543028685790526, "step": 28610 }, { "epoch": 9.543028685790526, "ref_ce_loss": 0.046957675367593765, "step": 28610 }, { "epoch": 9.546364242828552, "loss": 0.3571, "step": 28620 }, { "epoch": 9.546364242828552, "grad_norm": 1.8873906135559082, "step": 28620 }, { "epoch": 9.546364242828552, "learning_rate": 4.309603940195261e-06, "step": 28620 }, { "epoch": 9.546364242828552, "loss": 0.2871871888637543, "step": 28620 }, { "ce_loss": 0.022181112319231033, "epoch": 9.546364242828552, "step": 28620 }, { "distill_loss": 0.16967681050300598, "epoch": 9.546364242828552, "step": 28620 }, { "epoch": 9.546364242828552, "ref_ce_loss": 0.04003141447901726, "step": 28620 }, { "epoch": 9.546364242828552, "loss": 0.27087387442588806, "step": 28620 }, { "ce_loss": 0.019921233877539635, "epoch": 9.546364242828552, "step": 28620 }, { "distill_loss": 0.1713012307882309, "epoch": 9.546364242828552, "step": 28620 }, { "epoch": 9.546364242828552, "ref_ce_loss": 0.05143344774842262, "step": 28620 }, { "epoch": 9.549699799866577, "loss": 0.3838, "step": 28630 }, { "epoch": 9.549699799866577, "grad_norm": 2.1031012535095215, "step": 28630 }, { "epoch": 9.549699799866577, "learning_rate": 4.246572379808545e-06, "step": 28630 }, { "epoch": 9.549699799866577, "loss": 0.24774429202079773, "step": 28630 }, { "ce_loss": 0.021202141419053078, "epoch": 9.549699799866577, "step": 28630 }, { "distill_loss": 0.16074173152446747, "epoch": 9.549699799866577, "step": 28630 }, { "epoch": 9.549699799866577, "ref_ce_loss": 0.05151132866740227, "step": 28630 }, { "epoch": 9.549699799866577, "loss": 0.5488268733024597, "step": 28630 }, { "ce_loss": 0.046389929950237274, "epoch": 9.549699799866577, "step": 28630 }, { "distill_loss": 0.1870126873254776, "epoch": 9.549699799866577, "step": 28630 }, { "epoch": 9.549699799866577, "ref_ce_loss": 0.06963532418012619, "step": 28630 }, { "epoch": 9.553035356904603, "loss": 0.3577, "step": 28640 }, { "epoch": 9.553035356904603, "grad_norm": 0.8177258968353271, "step": 28640 }, { "epoch": 9.553035356904603, "learning_rate": 4.18400270625674e-06, "step": 28640 }, { "epoch": 9.553035356904603, "loss": 0.2963216304779053, "step": 28640 }, { "ce_loss": 0.038040950894355774, "epoch": 9.553035356904603, "step": 28640 }, { "distill_loss": 0.18547900021076202, "epoch": 9.553035356904603, "step": 28640 }, { "epoch": 9.553035356904603, "ref_ce_loss": 0.05713136866688728, "step": 28640 }, { "epoch": 9.553035356904603, "loss": 0.42804527282714844, "step": 28640 }, { "ce_loss": 0.05795781686902046, "epoch": 9.553035356904603, "step": 28640 }, { "distill_loss": 0.15100808441638947, "epoch": 9.553035356904603, "step": 28640 }, { "epoch": 9.553035356904603, "ref_ce_loss": 0.0495360866189003, "step": 28640 }, { "epoch": 9.556370913942628, "loss": 0.3788, "step": 28650 }, { "epoch": 9.556370913942628, "grad_norm": 1.4477307796478271, "step": 28650 }, { "epoch": 9.556370913942628, "learning_rate": 4.121894992565345e-06, "step": 28650 }, { "epoch": 9.556370913942628, "loss": 0.2551484704017639, "step": 28650 }, { "ce_loss": 0.03109881281852722, "epoch": 9.556370913942628, "step": 28650 }, { "distill_loss": 0.17904944717884064, "epoch": 9.556370913942628, "step": 28650 }, { "epoch": 9.556370913942628, "ref_ce_loss": 0.04486895352602005, "step": 28650 }, { "epoch": 9.556370913942628, "loss": 0.2952268719673157, "step": 28650 }, { "ce_loss": 0.0575919933617115, "epoch": 9.556370913942628, "step": 28650 }, { "distill_loss": 0.17165498435497284, "epoch": 9.556370913942628, "step": 28650 }, { "epoch": 9.556370913942628, "ref_ce_loss": 0.06577247381210327, "step": 28650 }, { "epoch": 9.559706470980654, "loss": 0.319, "step": 28660 }, { "epoch": 9.559706470980654, "grad_norm": 1.4955540895462036, "step": 28660 }, { "epoch": 9.559706470980654, "learning_rate": 4.060249311220687e-06, "step": 28660 }, { "epoch": 9.559706470980654, "loss": 0.3374966084957123, "step": 28660 }, { "ce_loss": 0.044201888144016266, "epoch": 9.559706470980654, "step": 28660 }, { "distill_loss": 0.1826128214597702, "epoch": 9.559706470980654, "step": 28660 }, { "epoch": 9.559706470980654, "ref_ce_loss": 0.05706416070461273, "step": 28660 }, { "epoch": 9.559706470980654, "loss": 0.30890583992004395, "step": 28660 }, { "ce_loss": 0.045475609600543976, "epoch": 9.559706470980654, "step": 28660 }, { "distill_loss": 0.1853654533624649, "epoch": 9.559706470980654, "step": 28660 }, { "epoch": 9.559706470980654, "ref_ce_loss": 0.03793710097670555, "step": 28660 }, { "epoch": 9.563042028018678, "loss": 0.342, "step": 28670 }, { "epoch": 9.563042028018678, "grad_norm": 1.226629614830017, "step": 28670 }, { "epoch": 9.563042028018678, "learning_rate": 3.9990657341700156e-06, "step": 28670 }, { "epoch": 9.563042028018678, "loss": 0.30386072397232056, "step": 28670 }, { "ce_loss": 0.023753361776471138, "epoch": 9.563042028018678, "step": 28670 }, { "distill_loss": 0.14758071303367615, "epoch": 9.563042028018678, "step": 28670 }, { "epoch": 9.563042028018678, "ref_ce_loss": 0.0571836493909359, "step": 28670 }, { "epoch": 9.563042028018678, "loss": 0.2814745008945465, "step": 28670 }, { "ce_loss": 0.040350768715143204, "epoch": 9.563042028018678, "step": 28670 }, { "distill_loss": 0.17561466991901398, "epoch": 9.563042028018678, "step": 28670 }, { "epoch": 9.563042028018678, "ref_ce_loss": 0.06502601504325867, "step": 28670 }, { "epoch": 9.566377585056705, "loss": 0.3655, "step": 28680 }, { "epoch": 9.566377585056705, "grad_norm": 0.9547922015190125, "step": 28680 }, { "epoch": 9.566377585056705, "learning_rate": 3.938344332821053e-06, "step": 28680 }, { "epoch": 9.566377585056705, "loss": 0.32980644702911377, "step": 28680 }, { "ce_loss": 0.03864011913537979, "epoch": 9.566377585056705, "step": 28680 }, { "distill_loss": 0.20437286794185638, "epoch": 9.566377585056705, "step": 28680 }, { "epoch": 9.566377585056705, "ref_ce_loss": 0.08655925840139389, "step": 28680 }, { "epoch": 9.566377585056705, "loss": 0.3018178641796112, "step": 28680 }, { "ce_loss": 0.04785173386335373, "epoch": 9.566377585056705, "step": 28680 }, { "distill_loss": 0.17461104691028595, "epoch": 9.566377585056705, "step": 28680 }, { "epoch": 9.566377585056705, "ref_ce_loss": 0.07911676168441772, "step": 28680 }, { "epoch": 9.569713142094729, "loss": 0.3689, "step": 28690 }, { "epoch": 9.569713142094729, "grad_norm": 1.2437167167663574, "step": 28690 }, { "epoch": 9.569713142094729, "learning_rate": 3.878085178042312e-06, "step": 28690 }, { "epoch": 9.569713142094729, "loss": 0.614569365978241, "step": 28690 }, { "ce_loss": 0.04580362141132355, "epoch": 9.569713142094729, "step": 28690 }, { "distill_loss": 0.16972200572490692, "epoch": 9.569713142094729, "step": 28690 }, { "epoch": 9.569713142094729, "ref_ce_loss": 0.06595150381326675, "step": 28690 }, { "epoch": 9.569713142094729, "loss": 0.22208955883979797, "step": 28690 }, { "ce_loss": 0.020961729809641838, "epoch": 9.569713142094729, "step": 28690 }, { "distill_loss": 0.15288269519805908, "epoch": 9.569713142094729, "step": 28690 }, { "epoch": 9.569713142094729, "ref_ce_loss": 0.04804127290844917, "step": 28690 }, { "epoch": 9.573048699132755, "loss": 0.3584, "step": 28700 }, { "epoch": 9.573048699132755, "grad_norm": 1.0217654705047607, "step": 28700 }, { "epoch": 9.573048699132755, "learning_rate": 3.8182883401626015e-06, "step": 28700 }, { "epoch": 9.573048699132755, "loss": 0.30012840032577515, "step": 28700 }, { "ce_loss": 0.037350449711084366, "epoch": 9.573048699132755, "step": 28700 }, { "distill_loss": 0.1833217740058899, "epoch": 9.573048699132755, "step": 28700 }, { "epoch": 9.573048699132755, "ref_ce_loss": 0.05961844325065613, "step": 28700 }, { "epoch": 9.573048699132755, "loss": 0.36112287640571594, "step": 28700 }, { "ce_loss": 0.06320945918560028, "epoch": 9.573048699132755, "step": 28700 }, { "distill_loss": 0.20540209114551544, "epoch": 9.573048699132755, "step": 28700 }, { "epoch": 9.573048699132755, "ref_ce_loss": 0.06766784191131592, "step": 28700 }, { "epoch": 9.57638425617078, "loss": 0.3146, "step": 28710 }, { "epoch": 9.57638425617078, "grad_norm": 1.1562711000442505, "step": 28710 }, { "epoch": 9.57638425617078, "learning_rate": 3.758953888971295e-06, "step": 28710 }, { "epoch": 9.57638425617078, "loss": 0.3072027564048767, "step": 28710 }, { "ce_loss": 0.04621975123882294, "epoch": 9.57638425617078, "step": 28710 }, { "distill_loss": 0.19785268604755402, "epoch": 9.57638425617078, "step": 28710 }, { "epoch": 9.57638425617078, "ref_ce_loss": 0.06301284581422806, "step": 28710 }, { "epoch": 9.57638425617078, "loss": 0.275651216506958, "step": 28710 }, { "ce_loss": 0.04094674810767174, "epoch": 9.57638425617078, "step": 28710 }, { "distill_loss": 0.13219065964221954, "epoch": 9.57638425617078, "step": 28710 }, { "epoch": 9.57638425617078, "ref_ce_loss": 0.07614676654338837, "step": 28710 }, { "epoch": 9.579719813208806, "loss": 0.2964, "step": 28720 }, { "epoch": 9.579719813208806, "grad_norm": 1.2158347368240356, "step": 28720 }, { "epoch": 9.579719813208806, "learning_rate": 3.7000818937181546e-06, "step": 28720 }, { "epoch": 9.579719813208806, "loss": 0.24544508755207062, "step": 28720 }, { "ce_loss": 0.03466008976101875, "epoch": 9.579719813208806, "step": 28720 }, { "distill_loss": 0.16733793914318085, "epoch": 9.579719813208806, "step": 28720 }, { "epoch": 9.579719813208806, "ref_ce_loss": 0.04320121556520462, "step": 28720 }, { "epoch": 9.579719813208806, "loss": 0.3183065950870514, "step": 28720 }, { "ce_loss": 0.04318279027938843, "epoch": 9.579719813208806, "step": 28720 }, { "distill_loss": 0.16793055832386017, "epoch": 9.579719813208806, "step": 28720 }, { "epoch": 9.579719813208806, "ref_ce_loss": 0.053741853684186935, "step": 28720 }, { "epoch": 9.58305537024683, "loss": 0.3425, "step": 28730 }, { "epoch": 9.58305537024683, "grad_norm": 0.9732711315155029, "step": 28730 }, { "epoch": 9.58305537024683, "learning_rate": 3.6416724231130186e-06, "step": 28730 }, { "epoch": 9.58305537024683, "loss": 0.3047448694705963, "step": 28730 }, { "ce_loss": 0.02543429099023342, "epoch": 9.58305537024683, "step": 28730 }, { "distill_loss": 0.18523433804512024, "epoch": 9.58305537024683, "step": 28730 }, { "epoch": 9.58305537024683, "ref_ce_loss": 0.046858031302690506, "step": 28730 }, { "epoch": 9.58305537024683, "loss": 0.23915736377239227, "step": 28730 }, { "ce_loss": 0.02462298981845379, "epoch": 9.58305537024683, "step": 28730 }, { "distill_loss": 0.16042344272136688, "epoch": 9.58305537024683, "step": 28730 }, { "epoch": 9.58305537024683, "ref_ce_loss": 0.05374724790453911, "step": 28730 }, { "epoch": 9.586390927284857, "loss": 0.3176, "step": 28740 }, { "epoch": 9.586390927284857, "grad_norm": 1.7445679903030396, "step": 28740 }, { "epoch": 9.586390927284857, "learning_rate": 3.5837255453261554e-06, "step": 28740 }, { "epoch": 9.586390927284857, "loss": 0.32195746898651123, "step": 28740 }, { "ce_loss": 0.0703810602426529, "epoch": 9.586390927284857, "step": 28740 }, { "distill_loss": 0.17174682021141052, "epoch": 9.586390927284857, "step": 28740 }, { "epoch": 9.586390927284857, "ref_ce_loss": 0.06239185482263565, "step": 28740 }, { "epoch": 9.586390927284857, "loss": 0.3455526530742645, "step": 28740 }, { "ce_loss": 0.03157772123813629, "epoch": 9.586390927284857, "step": 28740 }, { "distill_loss": 0.17570674419403076, "epoch": 9.586390927284857, "step": 28740 }, { "epoch": 9.586390927284857, "ref_ce_loss": 0.056114934384822845, "step": 28740 }, { "epoch": 9.589726484322881, "loss": 0.3097, "step": 28750 }, { "epoch": 9.589726484322881, "grad_norm": 1.0213404893875122, "step": 28750 }, { "epoch": 9.589726484322881, "learning_rate": 3.5262413279876894e-06, "step": 28750 }, { "epoch": 9.589726484322881, "loss": 0.4088827967643738, "step": 28750 }, { "ce_loss": 0.07769577205181122, "epoch": 9.589726484322881, "step": 28750 }, { "distill_loss": 0.2127758264541626, "epoch": 9.589726484322881, "step": 28750 }, { "epoch": 9.589726484322881, "ref_ce_loss": 0.06413999199867249, "step": 28750 }, { "epoch": 9.589726484322881, "loss": 0.529901385307312, "step": 28750 }, { "ce_loss": 0.03473932668566704, "epoch": 9.589726484322881, "step": 28750 }, { "distill_loss": 0.1991306096315384, "epoch": 9.589726484322881, "step": 28750 }, { "epoch": 9.589726484322881, "ref_ce_loss": 0.06662367284297943, "step": 28750 }, { "epoch": 9.593062041360907, "loss": 0.343, "step": 28760 }, { "epoch": 9.593062041360907, "grad_norm": 1.1475307941436768, "step": 28760 }, { "epoch": 9.593062041360907, "learning_rate": 3.4692198381879536e-06, "step": 28760 }, { "epoch": 9.593062041360907, "loss": 0.32468339800834656, "step": 28760 }, { "ce_loss": 0.04863545671105385, "epoch": 9.593062041360907, "step": 28760 }, { "distill_loss": 0.16399593651294708, "epoch": 9.593062041360907, "step": 28760 }, { "epoch": 9.593062041360907, "ref_ce_loss": 0.0735296681523323, "step": 28760 }, { "epoch": 9.593062041360907, "loss": 0.42887789011001587, "step": 28760 }, { "ce_loss": 0.05044868215918541, "epoch": 9.593062041360907, "step": 28760 }, { "distill_loss": 0.1644851565361023, "epoch": 9.593062041360907, "step": 28760 }, { "epoch": 9.593062041360907, "ref_ce_loss": 0.05538942664861679, "step": 28760 }, { "epoch": 9.596397598398932, "loss": 0.331, "step": 28770 }, { "epoch": 9.596397598398932, "grad_norm": 6.332052230834961, "step": 28770 }, { "epoch": 9.596397598398932, "learning_rate": 3.412661142477136e-06, "step": 28770 }, { "epoch": 9.596397598398932, "loss": 0.30600395798683167, "step": 28770 }, { "ce_loss": 0.036587513983249664, "epoch": 9.596397598398932, "step": 28770 }, { "distill_loss": 0.1614750772714615, "epoch": 9.596397598398932, "step": 28770 }, { "epoch": 9.596397598398932, "ref_ce_loss": 0.07749360054731369, "step": 28770 }, { "epoch": 9.596397598398932, "loss": 0.3054625689983368, "step": 28770 }, { "ce_loss": 0.06228421628475189, "epoch": 9.596397598398932, "step": 28770 }, { "distill_loss": 0.1685221642255783, "epoch": 9.596397598398932, "step": 28770 }, { "epoch": 9.596397598398932, "ref_ce_loss": 0.07439344376325607, "step": 28770 }, { "epoch": 9.599733155436958, "loss": 0.3373, "step": 28780 }, { "epoch": 9.599733155436958, "grad_norm": 1.7799866199493408, "step": 28780 }, { "epoch": 9.599733155436958, "learning_rate": 3.356565306865367e-06, "step": 28780 }, { "epoch": 9.599733155436958, "loss": 0.27912962436676025, "step": 28780 }, { "ce_loss": 0.027723034843802452, "epoch": 9.599733155436958, "step": 28780 }, { "distill_loss": 0.17505702376365662, "epoch": 9.599733155436958, "step": 28780 }, { "epoch": 9.599733155436958, "ref_ce_loss": 0.05043905973434448, "step": 28780 }, { "epoch": 9.599733155436958, "loss": 0.21506954729557037, "step": 28780 }, { "ce_loss": 0.036209072917699814, "epoch": 9.599733155436958, "step": 28780 }, { "distill_loss": 0.12021758407354355, "epoch": 9.599733155436958, "step": 28780 }, { "epoch": 9.599733155436958, "ref_ce_loss": 0.058201637119054794, "step": 28780 }, { "epoch": 9.603068712474983, "loss": 0.3961, "step": 28790 }, { "epoch": 9.603068712474983, "grad_norm": 1.1351579427719116, "step": 28790 }, { "epoch": 9.603068712474983, "learning_rate": 3.300932396822409e-06, "step": 28790 }, { "epoch": 9.603068712474983, "loss": 0.377149760723114, "step": 28790 }, { "ce_loss": 0.02792281098663807, "epoch": 9.603068712474983, "step": 28790 }, { "distill_loss": 0.14689025282859802, "epoch": 9.603068712474983, "step": 28790 }, { "epoch": 9.603068712474983, "ref_ce_loss": 0.05250436067581177, "step": 28790 }, { "epoch": 9.603068712474983, "loss": 0.3787650465965271, "step": 28790 }, { "ce_loss": 0.05750597268342972, "epoch": 9.603068712474983, "step": 28790 }, { "distill_loss": 0.17340369522571564, "epoch": 9.603068712474983, "step": 28790 }, { "epoch": 9.603068712474983, "ref_ce_loss": 0.07258488237857819, "step": 28790 }, { "epoch": 9.606404269513009, "loss": 0.3483, "step": 28800 }, { "epoch": 9.606404269513009, "grad_norm": 1.082783818244934, "step": 28800 }, { "epoch": 9.606404269513009, "learning_rate": 3.245762477277969e-06, "step": 28800 }, { "epoch": 9.606404269513009, "loss": 0.3081613779067993, "step": 28800 }, { "ce_loss": 0.01638505607843399, "epoch": 9.606404269513009, "step": 28800 }, { "distill_loss": 0.18335667252540588, "epoch": 9.606404269513009, "step": 28800 }, { "epoch": 9.606404269513009, "ref_ce_loss": 0.059051476418972015, "step": 28800 }, { "epoch": 9.606404269513009, "loss": 0.28536441922187805, "step": 28800 }, { "ce_loss": 0.04424785077571869, "epoch": 9.606404269513009, "step": 28800 }, { "distill_loss": 0.18336057662963867, "epoch": 9.606404269513009, "step": 28800 }, { "epoch": 9.606404269513009, "ref_ce_loss": 0.05742606520652771, "step": 28800 }, { "epoch": 9.609739826551033, "loss": 0.3274, "step": 28810 }, { "epoch": 9.609739826551033, "grad_norm": 1.2012810707092285, "step": 28810 }, { "epoch": 9.609739826551033, "learning_rate": 3.1910556126212963e-06, "step": 28810 }, { "epoch": 9.609739826551033, "loss": 0.3235844373703003, "step": 28810 }, { "ce_loss": 0.06070598587393761, "epoch": 9.609739826551033, "step": 28810 }, { "distill_loss": 0.1533758044242859, "epoch": 9.609739826551033, "step": 28810 }, { "epoch": 9.609739826551033, "ref_ce_loss": 0.061860065907239914, "step": 28810 }, { "epoch": 9.609739826551033, "loss": 0.30414530634880066, "step": 28810 }, { "ce_loss": 0.03909186273813248, "epoch": 9.609739826551033, "step": 28810 }, { "distill_loss": 0.1880081593990326, "epoch": 9.609739826551033, "step": 28810 }, { "epoch": 9.609739826551033, "ref_ce_loss": 0.046735141426324844, "step": 28810 }, { "epoch": 9.61307538358906, "loss": 0.3513, "step": 28820 }, { "epoch": 9.61307538358906, "grad_norm": 1.2907757759094238, "step": 28820 }, { "epoch": 9.61307538358906, "learning_rate": 3.1368118667010505e-06, "step": 28820 }, { "epoch": 9.61307538358906, "loss": 0.34699732065200806, "step": 28820 }, { "ce_loss": 0.020871445536613464, "epoch": 9.61307538358906, "step": 28820 }, { "distill_loss": 0.1841685175895691, "epoch": 9.61307538358906, "step": 28820 }, { "epoch": 9.61307538358906, "ref_ce_loss": 0.06769346445798874, "step": 28820 }, { "epoch": 9.61307538358906, "loss": 0.4415394067764282, "step": 28820 }, { "ce_loss": 0.028012122958898544, "epoch": 9.61307538358906, "step": 28820 }, { "distill_loss": 0.16002513468265533, "epoch": 9.61307538358906, "step": 28820 }, { "epoch": 9.61307538358906, "ref_ce_loss": 0.05012548342347145, "step": 28820 }, { "epoch": 9.616410940627084, "loss": 0.3138, "step": 28830 }, { "epoch": 9.616410940627084, "grad_norm": 1.590085506439209, "step": 28830 }, { "epoch": 9.616410940627084, "learning_rate": 3.083031302825612e-06, "step": 28830 }, { "epoch": 9.616410940627084, "loss": 0.3491607904434204, "step": 28830 }, { "ce_loss": 0.037974365055561066, "epoch": 9.616410940627084, "step": 28830 }, { "distill_loss": 0.17866151034832, "epoch": 9.616410940627084, "step": 28830 }, { "epoch": 9.616410940627084, "ref_ce_loss": 0.05684041231870651, "step": 28830 }, { "epoch": 9.616410940627084, "loss": 0.36506855487823486, "step": 28830 }, { "ce_loss": 0.05224547162652016, "epoch": 9.616410940627084, "step": 28830 }, { "distill_loss": 0.21156731247901917, "epoch": 9.616410940627084, "step": 28830 }, { "epoch": 9.616410940627084, "ref_ce_loss": 0.07331357151269913, "step": 28830 }, { "epoch": 9.61974649766511, "loss": 0.3419, "step": 28840 }, { "epoch": 9.61974649766511, "grad_norm": 1.2835090160369873, "step": 28840 }, { "epoch": 9.61974649766511, "learning_rate": 3.0297139837627273e-06, "step": 28840 }, { "epoch": 9.61974649766511, "loss": 0.46609291434288025, "step": 28840 }, { "ce_loss": 0.0682363361120224, "epoch": 9.61974649766511, "step": 28840 }, { "distill_loss": 0.17836935818195343, "epoch": 9.61974649766511, "step": 28840 }, { "epoch": 9.61974649766511, "ref_ce_loss": 0.07604022324085236, "step": 28840 }, { "epoch": 9.61974649766511, "loss": 0.2486204355955124, "step": 28840 }, { "ce_loss": 0.028242826461791992, "epoch": 9.61974649766511, "step": 28840 }, { "distill_loss": 0.14386576414108276, "epoch": 9.61974649766511, "step": 28840 }, { "epoch": 9.61974649766511, "ref_ce_loss": 0.05152817815542221, "step": 28840 }, { "epoch": 9.623082054703135, "loss": 0.3885, "step": 28850 }, { "epoch": 9.623082054703135, "grad_norm": 1.1511974334716797, "step": 28850 }, { "epoch": 9.623082054703135, "learning_rate": 2.9768599717393763e-06, "step": 28850 }, { "epoch": 9.623082054703135, "loss": 0.23758172988891602, "step": 28850 }, { "ce_loss": 0.02366476133465767, "epoch": 9.623082054703135, "step": 28850 }, { "distill_loss": 0.15192754566669464, "epoch": 9.623082054703135, "step": 28850 }, { "epoch": 9.623082054703135, "ref_ce_loss": 0.043029189109802246, "step": 28850 }, { "epoch": 9.623082054703135, "loss": 0.29049375653266907, "step": 28850 }, { "ce_loss": 0.04398266226053238, "epoch": 9.623082054703135, "step": 28850 }, { "distill_loss": 0.17389945685863495, "epoch": 9.623082054703135, "step": 28850 }, { "epoch": 9.623082054703135, "ref_ce_loss": 0.05363408103585243, "step": 28850 }, { "epoch": 9.62641761174116, "loss": 0.336, "step": 28860 }, { "epoch": 9.62641761174116, "grad_norm": 1.0026607513427734, "step": 28860 }, { "epoch": 9.62641761174116, "learning_rate": 2.924469328441859e-06, "step": 28860 }, { "epoch": 9.62641761174116, "loss": 0.4305479824542999, "step": 28860 }, { "ce_loss": 0.06890071928501129, "epoch": 9.62641761174116, "step": 28860 }, { "distill_loss": 0.21621493995189667, "epoch": 9.62641761174116, "step": 28860 }, { "epoch": 9.62641761174116, "ref_ce_loss": 0.06498651951551437, "step": 28860 }, { "epoch": 9.62641761174116, "loss": 0.253123939037323, "step": 28860 }, { "ce_loss": 0.03857715055346489, "epoch": 9.62641761174116, "step": 28860 }, { "distill_loss": 0.1470891535282135, "epoch": 9.62641761174116, "step": 28860 }, { "epoch": 9.62641761174116, "ref_ce_loss": 0.053641896694898605, "step": 28860 }, { "epoch": 9.629753168779185, "loss": 0.3424, "step": 28870 }, { "epoch": 9.629753168779185, "grad_norm": 0.844653308391571, "step": 28870 }, { "epoch": 9.629753168779185, "learning_rate": 2.8725421150157527e-06, "step": 28870 }, { "epoch": 9.629753168779185, "loss": 0.3580155074596405, "step": 28870 }, { "ce_loss": 0.07331318408250809, "epoch": 9.629753168779185, "step": 28870 }, { "distill_loss": 0.21698784828186035, "epoch": 9.629753168779185, "step": 28870 }, { "epoch": 9.629753168779185, "ref_ce_loss": 0.053149282932281494, "step": 28870 }, { "epoch": 9.629753168779185, "loss": 0.3565460443496704, "step": 28870 }, { "ce_loss": 0.05684034526348114, "epoch": 9.629753168779185, "step": 28870 }, { "distill_loss": 0.21384520828723907, "epoch": 9.629753168779185, "step": 28870 }, { "epoch": 9.629753168779185, "ref_ce_loss": 0.06192195042967796, "step": 28870 }, { "epoch": 9.633088725817212, "loss": 0.3564, "step": 28880 }, { "epoch": 9.633088725817212, "grad_norm": 1.2454036474227905, "step": 28880 }, { "epoch": 9.633088725817212, "learning_rate": 2.8210783920656903e-06, "step": 28880 }, { "epoch": 9.633088725817212, "loss": 0.5010125041007996, "step": 28880 }, { "ce_loss": 0.03849237039685249, "epoch": 9.633088725817212, "step": 28880 }, { "distill_loss": 0.17564459145069122, "epoch": 9.633088725817212, "step": 28880 }, { "epoch": 9.633088725817212, "ref_ce_loss": 0.07166219502687454, "step": 28880 }, { "epoch": 9.633088725817212, "loss": 0.338174968957901, "step": 28880 }, { "ce_loss": 0.0471452958881855, "epoch": 9.633088725817212, "step": 28880 }, { "distill_loss": 0.16874736547470093, "epoch": 9.633088725817212, "step": 28880 }, { "epoch": 9.633088725817212, "ref_ce_loss": 0.09181161969900131, "step": 28880 }, { "epoch": 9.636424282855236, "loss": 0.3748, "step": 28890 }, { "epoch": 9.636424282855236, "grad_norm": 2.602931261062622, "step": 28890 }, { "epoch": 9.636424282855236, "learning_rate": 2.7700782196553585e-06, "step": 28890 }, { "epoch": 9.636424282855236, "loss": 0.36306819319725037, "step": 28890 }, { "ce_loss": 0.04997847229242325, "epoch": 9.636424282855236, "step": 28890 }, { "distill_loss": 0.22075606882572174, "epoch": 9.636424282855236, "step": 28890 }, { "epoch": 9.636424282855236, "ref_ce_loss": 0.07122057676315308, "step": 28890 }, { "epoch": 9.636424282855236, "loss": 0.4285609722137451, "step": 28890 }, { "ce_loss": 0.0449661985039711, "epoch": 9.636424282855236, "step": 28890 }, { "distill_loss": 0.18159648776054382, "epoch": 9.636424282855236, "step": 28890 }, { "epoch": 9.636424282855236, "ref_ce_loss": 0.05567074567079544, "step": 28890 }, { "epoch": 9.639759839893262, "loss": 0.3606, "step": 28900 }, { "epoch": 9.639759839893262, "grad_norm": 1.0745766162872314, "step": 28900 }, { "epoch": 9.639759839893262, "learning_rate": 2.7195416573074562e-06, "step": 28900 }, { "epoch": 9.639759839893262, "loss": 0.28361329436302185, "step": 28900 }, { "ce_loss": 0.03859997168183327, "epoch": 9.639759839893262, "step": 28900 }, { "distill_loss": 0.1544235497713089, "epoch": 9.639759839893262, "step": 28900 }, { "epoch": 9.639759839893262, "ref_ce_loss": 0.07481688261032104, "step": 28900 }, { "epoch": 9.639759839893262, "loss": 0.5972962379455566, "step": 28900 }, { "ce_loss": 0.06487046927213669, "epoch": 9.639759839893262, "step": 28900 }, { "distill_loss": 0.14648039638996124, "epoch": 9.639759839893262, "step": 28900 }, { "epoch": 9.639759839893262, "ref_ce_loss": 0.097092404961586, "step": 28900 }, { "epoch": 9.643095396931287, "loss": 0.333, "step": 28910 }, { "epoch": 9.643095396931287, "grad_norm": 0.9275277256965637, "step": 28910 }, { "epoch": 9.643095396931287, "learning_rate": 2.6694687640036022e-06, "step": 28910 }, { "epoch": 9.643095396931287, "loss": 0.3020519018173218, "step": 28910 }, { "ce_loss": 0.038193780928850174, "epoch": 9.643095396931287, "step": 28910 }, { "distill_loss": 0.19235727190971375, "epoch": 9.643095396931287, "step": 28910 }, { "epoch": 9.643095396931287, "ref_ce_loss": 0.05098062381148338, "step": 28910 }, { "epoch": 9.643095396931287, "loss": 0.2890506088733673, "step": 28910 }, { "ce_loss": 0.05658218637108803, "epoch": 9.643095396931287, "step": 28910 }, { "distill_loss": 0.18204079568386078, "epoch": 9.643095396931287, "step": 28910 }, { "epoch": 9.643095396931287, "ref_ce_loss": 0.05018565058708191, "step": 28910 }, { "epoch": 9.646430953969313, "loss": 0.3726, "step": 28920 }, { "epoch": 9.646430953969313, "grad_norm": 1.0181881189346313, "step": 28920 }, { "epoch": 9.646430953969313, "learning_rate": 2.6198595981842046e-06, "step": 28920 }, { "epoch": 9.646430953969313, "loss": 0.3459164798259735, "step": 28920 }, { "ce_loss": 0.0480022206902504, "epoch": 9.646430953969313, "step": 28920 }, { "distill_loss": 0.164689838886261, "epoch": 9.646430953969313, "step": 28920 }, { "epoch": 9.646430953969313, "ref_ce_loss": 0.046335261315107346, "step": 28920 }, { "epoch": 9.646430953969313, "loss": 0.2523891031742096, "step": 28920 }, { "ce_loss": 0.028045305982232094, "epoch": 9.646430953969313, "step": 28920 }, { "distill_loss": 0.16294798254966736, "epoch": 9.646430953969313, "step": 28920 }, { "epoch": 9.646430953969313, "ref_ce_loss": 0.06082697585225105, "step": 28920 }, { "epoch": 9.649766511007337, "loss": 0.3963, "step": 28930 }, { "epoch": 9.649766511007337, "grad_norm": 1.255349040031433, "step": 28930 }, { "epoch": 9.649766511007337, "learning_rate": 2.570714217748549e-06, "step": 28930 }, { "epoch": 9.649766511007337, "loss": 0.42087769508361816, "step": 28930 }, { "ce_loss": 0.0513894259929657, "epoch": 9.649766511007337, "step": 28930 }, { "distill_loss": 0.17212828993797302, "epoch": 9.649766511007337, "step": 28930 }, { "epoch": 9.649766511007337, "ref_ce_loss": 0.0597614161670208, "step": 28930 }, { "epoch": 9.649766511007337, "loss": 0.27796125411987305, "step": 28930 }, { "ce_loss": 0.04492616653442383, "epoch": 9.649766511007337, "step": 28930 }, { "distill_loss": 0.17121589183807373, "epoch": 9.649766511007337, "step": 28930 }, { "epoch": 9.649766511007337, "ref_ce_loss": 0.04602546989917755, "step": 28930 }, { "epoch": 9.653102068045364, "loss": 0.3608, "step": 28940 }, { "epoch": 9.653102068045364, "grad_norm": 0.8002476096153259, "step": 28940 }, { "epoch": 9.653102068045364, "learning_rate": 2.5220326800545313e-06, "step": 28940 }, { "epoch": 9.653102068045364, "loss": 0.3676643967628479, "step": 28940 }, { "ce_loss": 0.04760293290019035, "epoch": 9.653102068045364, "step": 28940 }, { "distill_loss": 0.20377562940120697, "epoch": 9.653102068045364, "step": 28940 }, { "epoch": 9.653102068045364, "ref_ce_loss": 0.0758914053440094, "step": 28940 }, { "epoch": 9.653102068045364, "loss": 0.30955246090888977, "step": 28940 }, { "ce_loss": 0.035490572452545166, "epoch": 9.653102068045364, "step": 28940 }, { "distill_loss": 0.18446147441864014, "epoch": 9.653102068045364, "step": 28940 }, { "epoch": 9.653102068045364, "ref_ce_loss": 0.06045921519398689, "step": 28940 }, { "epoch": 9.656437625083388, "loss": 0.3389, "step": 28950 }, { "epoch": 9.656437625083388, "grad_norm": 1.1092398166656494, "step": 28950 }, { "epoch": 9.656437625083388, "learning_rate": 2.473815041918792e-06, "step": 28950 }, { "epoch": 9.656437625083388, "loss": 0.23038047552108765, "step": 28950 }, { "ce_loss": 0.024803156033158302, "epoch": 9.656437625083388, "step": 28950 }, { "distill_loss": 0.13957466185092926, "epoch": 9.656437625083388, "step": 28950 }, { "epoch": 9.656437625083388, "ref_ce_loss": 0.04500986263155937, "step": 28950 }, { "epoch": 9.656437625083388, "loss": 0.31989532709121704, "step": 28950 }, { "ce_loss": 0.042344290763139725, "epoch": 9.656437625083388, "step": 28950 }, { "distill_loss": 0.20331068336963654, "epoch": 9.656437625083388, "step": 28950 }, { "epoch": 9.656437625083388, "ref_ce_loss": 0.05306573957204819, "step": 28950 }, { "epoch": 9.659773182121414, "loss": 0.3883, "step": 28960 }, { "epoch": 9.659773182121414, "grad_norm": 0.9196416139602661, "step": 28960 }, { "epoch": 9.659773182121414, "learning_rate": 2.426061359616494e-06, "step": 28960 }, { "epoch": 9.659773182121414, "loss": 0.276298850774765, "step": 28960 }, { "ce_loss": 0.018789373338222504, "epoch": 9.659773182121414, "step": 28960 }, { "distill_loss": 0.14118222892284393, "epoch": 9.659773182121414, "step": 28960 }, { "epoch": 9.659773182121414, "ref_ce_loss": 0.06556124240159988, "step": 28960 }, { "epoch": 9.659773182121414, "loss": 0.9002341032028198, "step": 28960 }, { "ce_loss": 0.06075263395905495, "epoch": 9.659773182121414, "step": 28960 }, { "distill_loss": 0.17979496717453003, "epoch": 9.659773182121414, "step": 28960 }, { "epoch": 9.659773182121414, "ref_ce_loss": 0.05195245519280434, "step": 28960 }, { "epoch": 9.663108739159439, "loss": 0.3365, "step": 28970 }, { "epoch": 9.663108739159439, "grad_norm": 1.0752198696136475, "step": 28970 }, { "epoch": 9.663108739159439, "learning_rate": 2.3787716888813206e-06, "step": 28970 }, { "epoch": 9.663108739159439, "loss": 0.43198779225349426, "step": 28970 }, { "ce_loss": 0.05001339688897133, "epoch": 9.663108739159439, "step": 28970 }, { "distill_loss": 0.17285940051078796, "epoch": 9.663108739159439, "step": 28970 }, { "epoch": 9.663108739159439, "ref_ce_loss": 0.06730245053768158, "step": 28970 }, { "epoch": 9.663108739159439, "loss": 0.41112181544303894, "step": 28970 }, { "ce_loss": 0.055711500346660614, "epoch": 9.663108739159439, "step": 28970 }, { "distill_loss": 0.17232096195220947, "epoch": 9.663108739159439, "step": 28970 }, { "epoch": 9.663108739159439, "ref_ce_loss": 0.07165464013814926, "step": 28970 }, { "epoch": 9.666444296197465, "loss": 0.3493, "step": 28980 }, { "epoch": 9.666444296197465, "grad_norm": 1.0368164777755737, "step": 28980 }, { "epoch": 9.666444296197465, "learning_rate": 2.3319460849053897e-06, "step": 28980 }, { "epoch": 9.666444296197465, "loss": 0.29087287187576294, "step": 28980 }, { "ce_loss": 0.028599737212061882, "epoch": 9.666444296197465, "step": 28980 }, { "distill_loss": 0.19898228347301483, "epoch": 9.666444296197465, "step": 28980 }, { "epoch": 9.666444296197465, "ref_ce_loss": 0.06319849193096161, "step": 28980 }, { "epoch": 9.666444296197465, "loss": 0.23333315551280975, "step": 28980 }, { "ce_loss": 0.019216326996684074, "epoch": 9.666444296197465, "step": 28980 }, { "distill_loss": 0.13311071693897247, "epoch": 9.666444296197465, "step": 28980 }, { "epoch": 9.666444296197465, "ref_ce_loss": 0.06234074756503105, "step": 28980 }, { "epoch": 9.66977985323549, "loss": 0.3176, "step": 28990 }, { "epoch": 9.66977985323549, "grad_norm": 0.9731513857841492, "step": 28990 }, { "epoch": 9.66977985323549, "learning_rate": 2.2855846023392524e-06, "step": 28990 }, { "epoch": 9.66977985323549, "loss": 0.39693683385849, "step": 28990 }, { "ce_loss": 0.022053180262446404, "epoch": 9.66977985323549, "step": 28990 }, { "distill_loss": 0.17590022087097168, "epoch": 9.66977985323549, "step": 28990 }, { "epoch": 9.66977985323549, "ref_ce_loss": 0.05321211740374565, "step": 28990 }, { "epoch": 9.66977985323549, "loss": 0.340067982673645, "step": 28990 }, { "ce_loss": 0.06889030337333679, "epoch": 9.66977985323549, "step": 28990 }, { "distill_loss": 0.19582036137580872, "epoch": 9.66977985323549, "step": 28990 }, { "epoch": 9.66977985323549, "ref_ce_loss": 0.07497034221887589, "step": 28990 }, { "epoch": 9.673115410273516, "loss": 0.3492, "step": 29000 }, { "epoch": 9.673115410273516, "grad_norm": 1.4342679977416992, "step": 29000 }, { "epoch": 9.673115410273516, "learning_rate": 2.239687295291715e-06, "step": 29000 }, { "epoch": 9.673115410273516, "loss": 0.2772221267223358, "step": 29000 }, { "ce_loss": 0.017711373046040535, "epoch": 9.673115410273516, "step": 29000 }, { "distill_loss": 0.1418878436088562, "epoch": 9.673115410273516, "step": 29000 }, { "epoch": 9.673115410273516, "ref_ce_loss": 0.05305256322026253, "step": 29000 }, { "epoch": 9.673115410273516, "loss": 0.44195717573165894, "step": 29000 }, { "ce_loss": 0.05625804141163826, "epoch": 9.673115410273516, "step": 29000 }, { "distill_loss": 0.17068497836589813, "epoch": 9.673115410273516, "step": 29000 }, { "epoch": 9.673115410273516, "ref_ce_loss": 0.06807989627122879, "step": 29000 }, { "epoch": 9.67645096731154, "loss": 0.3439, "step": 29010 }, { "epoch": 9.67645096731154, "grad_norm": 1.06513249874115, "step": 29010 }, { "epoch": 9.67645096731154, "learning_rate": 2.194254217329883e-06, "step": 29010 }, { "epoch": 9.67645096731154, "loss": 0.323006808757782, "step": 29010 }, { "ce_loss": 0.028976066038012505, "epoch": 9.67645096731154, "step": 29010 }, { "distill_loss": 0.19334059953689575, "epoch": 9.67645096731154, "step": 29010 }, { "epoch": 9.67645096731154, "ref_ce_loss": 0.06735754758119583, "step": 29010 }, { "epoch": 9.67645096731154, "loss": 0.257314532995224, "step": 29010 }, { "ce_loss": 0.02535068616271019, "epoch": 9.67645096731154, "step": 29010 }, { "distill_loss": 0.1416252702474594, "epoch": 9.67645096731154, "step": 29010 }, { "epoch": 9.67645096731154, "ref_ce_loss": 0.06281197816133499, "step": 29010 }, { "epoch": 9.679786524349566, "loss": 0.3346, "step": 29020 }, { "epoch": 9.679786524349566, "grad_norm": 1.426174521446228, "step": 29020 }, { "epoch": 9.679786524349566, "learning_rate": 2.1492854214790304e-06, "step": 29020 }, { "epoch": 9.679786524349566, "loss": 0.28953003883361816, "step": 29020 }, { "ce_loss": 0.044721540063619614, "epoch": 9.679786524349566, "step": 29020 }, { "distill_loss": 0.18095023930072784, "epoch": 9.679786524349566, "step": 29020 }, { "epoch": 9.679786524349566, "ref_ce_loss": 0.04519897699356079, "step": 29020 }, { "epoch": 9.679786524349566, "loss": 0.4611284136772156, "step": 29020 }, { "ce_loss": 0.07004204392433167, "epoch": 9.679786524349566, "step": 29020 }, { "distill_loss": 0.19099056720733643, "epoch": 9.679786524349566, "step": 29020 }, { "epoch": 9.679786524349566, "ref_ce_loss": 0.07121763378381729, "step": 29020 }, { "epoch": 9.683122081387591, "loss": 0.3151, "step": 29030 }, { "epoch": 9.683122081387591, "grad_norm": 0.7787420153617859, "step": 29030 }, { "epoch": 9.683122081387591, "learning_rate": 2.1047809602225966e-06, "step": 29030 }, { "epoch": 9.683122081387591, "loss": 0.283674031496048, "step": 29030 }, { "ce_loss": 0.03510609641671181, "epoch": 9.683122081387591, "step": 29030 }, { "distill_loss": 0.164617657661438, "epoch": 9.683122081387591, "step": 29030 }, { "epoch": 9.683122081387591, "ref_ce_loss": 0.05867642164230347, "step": 29030 }, { "epoch": 9.683122081387591, "loss": 0.39628174901008606, "step": 29030 }, { "ce_loss": 0.07344058156013489, "epoch": 9.683122081387591, "step": 29030 }, { "distill_loss": 0.1842685043811798, "epoch": 9.683122081387591, "step": 29030 }, { "epoch": 9.683122081387591, "ref_ce_loss": 0.076644167304039, "step": 29030 }, { "epoch": 9.686457638425617, "loss": 0.3965, "step": 29040 }, { "epoch": 9.686457638425617, "grad_norm": 1.1861432790756226, "step": 29040 }, { "epoch": 9.686457638425617, "learning_rate": 2.0607408855020995e-06, "step": 29040 }, { "epoch": 9.686457638425617, "loss": 0.2955714166164398, "step": 29040 }, { "ce_loss": 0.03226979076862335, "epoch": 9.686457638425617, "step": 29040 }, { "distill_loss": 0.1686120629310608, "epoch": 9.686457638425617, "step": 29040 }, { "epoch": 9.686457638425617, "ref_ce_loss": 0.07210496068000793, "step": 29040 }, { "epoch": 9.686457638425617, "loss": 0.3625262975692749, "step": 29040 }, { "ce_loss": 0.05599134787917137, "epoch": 9.686457638425617, "step": 29040 }, { "distill_loss": 0.20067884027957916, "epoch": 9.686457638425617, "step": 29040 }, { "epoch": 9.686457638425617, "ref_ce_loss": 0.046158257871866226, "step": 29040 }, { "epoch": 9.689793195463643, "loss": 0.3506, "step": 29050 }, { "epoch": 9.689793195463643, "grad_norm": 0.8858498930931091, "step": 29050 }, { "epoch": 9.689793195463643, "learning_rate": 2.017165248717001e-06, "step": 29050 }, { "epoch": 9.689793195463643, "loss": 0.2790871560573578, "step": 29050 }, { "ce_loss": 0.03252222016453743, "epoch": 9.689793195463643, "step": 29050 }, { "distill_loss": 0.1491430401802063, "epoch": 9.689793195463643, "step": 29050 }, { "epoch": 9.689793195463643, "ref_ce_loss": 0.07085377722978592, "step": 29050 }, { "epoch": 9.689793195463643, "loss": 0.2061745971441269, "step": 29050 }, { "ce_loss": 0.037018779665231705, "epoch": 9.689793195463643, "step": 29050 }, { "distill_loss": 0.12827187776565552, "epoch": 9.689793195463643, "step": 29050 }, { "epoch": 9.689793195463643, "ref_ce_loss": 0.040482133626937866, "step": 29050 }, { "epoch": 9.693128752501668, "loss": 0.3418, "step": 29060 }, { "epoch": 9.693128752501668, "grad_norm": 1.0434974431991577, "step": 29060 }, { "epoch": 9.693128752501668, "learning_rate": 1.9740541007247983e-06, "step": 29060 }, { "epoch": 9.693128752501668, "loss": 0.32193756103515625, "step": 29060 }, { "ce_loss": 0.029255038127303123, "epoch": 9.693128752501668, "step": 29060 }, { "distill_loss": 0.15391921997070312, "epoch": 9.693128752501668, "step": 29060 }, { "epoch": 9.693128752501668, "ref_ce_loss": 0.08128950744867325, "step": 29060 }, { "epoch": 9.693128752501668, "loss": 0.23065324127674103, "step": 29060 }, { "ce_loss": 0.01794644072651863, "epoch": 9.693128752501668, "step": 29060 }, { "distill_loss": 0.13754110038280487, "epoch": 9.693128752501668, "step": 29060 }, { "epoch": 9.693128752501668, "ref_ce_loss": 0.04723289608955383, "step": 29060 }, { "epoch": 9.696464309539692, "loss": 0.3458, "step": 29070 }, { "epoch": 9.696464309539692, "grad_norm": 1.0453249216079712, "step": 29070 }, { "epoch": 9.696464309539692, "learning_rate": 1.931407491840842e-06, "step": 29070 }, { "epoch": 9.696464309539692, "loss": 0.3305038511753082, "step": 29070 }, { "ce_loss": 0.0752817839384079, "epoch": 9.696464309539692, "step": 29070 }, { "distill_loss": 0.16382677853107452, "epoch": 9.696464309539692, "step": 29070 }, { "epoch": 9.696464309539692, "ref_ce_loss": 0.07009885460138321, "step": 29070 }, { "epoch": 9.696464309539692, "loss": 0.3689107894897461, "step": 29070 }, { "ce_loss": 0.04890412092208862, "epoch": 9.696464309539692, "step": 29070 }, { "distill_loss": 0.17433670163154602, "epoch": 9.696464309539692, "step": 29070 }, { "epoch": 9.696464309539692, "ref_ce_loss": 0.08437875658273697, "step": 29070 }, { "epoch": 9.699799866577719, "loss": 0.3394, "step": 29080 }, { "epoch": 9.699799866577719, "grad_norm": 1.406949758529663, "step": 29080 }, { "epoch": 9.699799866577719, "learning_rate": 1.8892254718382962e-06, "step": 29080 }, { "epoch": 9.699799866577719, "loss": 0.4878078103065491, "step": 29080 }, { "ce_loss": 0.050842683762311935, "epoch": 9.699799866577719, "step": 29080 }, { "distill_loss": 0.24324414134025574, "epoch": 9.699799866577719, "step": 29080 }, { "epoch": 9.699799866577719, "ref_ce_loss": 0.07579061388969421, "step": 29080 }, { "epoch": 9.699799866577719, "loss": 0.3403063714504242, "step": 29080 }, { "ce_loss": 0.042193423956632614, "epoch": 9.699799866577719, "step": 29080 }, { "distill_loss": 0.1843198984861374, "epoch": 9.699799866577719, "step": 29080 }, { "epoch": 9.699799866577719, "ref_ce_loss": 0.08377090841531754, "step": 29080 }, { "epoch": 9.703135423615745, "loss": 0.3533, "step": 29090 }, { "epoch": 9.703135423615745, "grad_norm": 0.9102470874786377, "step": 29090 }, { "epoch": 9.703135423615745, "learning_rate": 1.8475080899480913e-06, "step": 29090 }, { "epoch": 9.703135423615745, "loss": 0.7278520464897156, "step": 29090 }, { "ce_loss": 0.09050574898719788, "epoch": 9.703135423615745, "step": 29090 }, { "distill_loss": 0.20973047614097595, "epoch": 9.703135423615745, "step": 29090 }, { "epoch": 9.703135423615745, "ref_ce_loss": 0.07229988276958466, "step": 29090 }, { "epoch": 9.703135423615745, "loss": 0.36448508501052856, "step": 29090 }, { "ce_loss": 0.05310250073671341, "epoch": 9.703135423615745, "step": 29090 }, { "distill_loss": 0.17795789241790771, "epoch": 9.703135423615745, "step": 29090 }, { "epoch": 9.703135423615745, "ref_ce_loss": 0.07669250667095184, "step": 29090 }, { "epoch": 9.70647098065377, "loss": 0.4006, "step": 29100 }, { "epoch": 9.70647098065377, "grad_norm": 1.8485174179077148, "step": 29100 }, { "epoch": 9.70647098065377, "learning_rate": 1.8062553948589244e-06, "step": 29100 }, { "epoch": 9.70647098065377, "loss": 0.33109721541404724, "step": 29100 }, { "ce_loss": 0.03415803238749504, "epoch": 9.70647098065377, "step": 29100 }, { "distill_loss": 0.21031081676483154, "epoch": 9.70647098065377, "step": 29100 }, { "epoch": 9.70647098065377, "ref_ce_loss": 0.062195755541324615, "step": 29100 }, { "epoch": 9.70647098065377, "loss": 0.2148629128932953, "step": 29100 }, { "ce_loss": 0.03016166016459465, "epoch": 9.70647098065377, "step": 29100 }, { "distill_loss": 0.14262332022190094, "epoch": 9.70647098065377, "step": 29100 }, { "epoch": 9.70647098065377, "ref_ce_loss": 0.041959211230278015, "step": 29100 }, { "epoch": 9.709806537691794, "loss": 0.3232, "step": 29110 }, { "epoch": 9.709806537691794, "grad_norm": 1.0245168209075928, "step": 29110 }, { "epoch": 9.709806537691794, "learning_rate": 1.7654674347171719e-06, "step": 29110 }, { "epoch": 9.709806537691794, "loss": 0.3630022406578064, "step": 29110 }, { "ce_loss": 0.03858557716012001, "epoch": 9.709806537691794, "step": 29110 }, { "distill_loss": 0.22540675103664398, "epoch": 9.709806537691794, "step": 29110 }, { "epoch": 9.709806537691794, "ref_ce_loss": 0.07783447206020355, "step": 29110 }, { "epoch": 9.709806537691794, "loss": 0.28904521465301514, "step": 29110 }, { "ce_loss": 0.022058136761188507, "epoch": 9.709806537691794, "step": 29110 }, { "distill_loss": 0.1771065890789032, "epoch": 9.709806537691794, "step": 29110 }, { "epoch": 9.709806537691794, "ref_ce_loss": 0.06527817249298096, "step": 29110 }, { "epoch": 9.71314209472982, "loss": 0.3433, "step": 29120 }, { "epoch": 9.71314209472982, "grad_norm": 0.9284103512763977, "step": 29120 }, { "epoch": 9.71314209472982, "learning_rate": 1.7251442571267095e-06, "step": 29120 }, { "epoch": 9.71314209472982, "loss": 0.37761133909225464, "step": 29120 }, { "ce_loss": 0.01938902959227562, "epoch": 9.71314209472982, "step": 29120 }, { "distill_loss": 0.20532874763011932, "epoch": 9.71314209472982, "step": 29120 }, { "epoch": 9.71314209472982, "ref_ce_loss": 0.06604550033807755, "step": 29120 }, { "epoch": 9.71314209472982, "loss": 0.37872540950775146, "step": 29120 }, { "ce_loss": 0.042316507548093796, "epoch": 9.71314209472982, "step": 29120 }, { "distill_loss": 0.19641393423080444, "epoch": 9.71314209472982, "step": 29120 }, { "epoch": 9.71314209472982, "ref_ce_loss": 0.06613562256097794, "step": 29120 }, { "epoch": 9.716477651767846, "loss": 0.3223, "step": 29130 }, { "epoch": 9.716477651767846, "grad_norm": 0.7625808715820312, "step": 29130 }, { "epoch": 9.716477651767846, "learning_rate": 1.685285909149048e-06, "step": 29130 }, { "epoch": 9.716477651767846, "loss": 0.36743196845054626, "step": 29130 }, { "ce_loss": 0.03555121272802353, "epoch": 9.716477651767846, "step": 29130 }, { "distill_loss": 0.15161210298538208, "epoch": 9.716477651767846, "step": 29130 }, { "epoch": 9.716477651767846, "ref_ce_loss": 0.07639976590871811, "step": 29130 }, { "epoch": 9.716477651767846, "loss": 0.4357321262359619, "step": 29130 }, { "ce_loss": 0.04716123268008232, "epoch": 9.716477651767846, "step": 29130 }, { "distill_loss": 0.2087934911251068, "epoch": 9.716477651767846, "step": 29130 }, { "epoch": 9.716477651767846, "ref_ce_loss": 0.05667942389845848, "step": 29130 }, { "epoch": 9.71981320880587, "loss": 0.3243, "step": 29140 }, { "epoch": 9.71981320880587, "grad_norm": 0.7821916937828064, "step": 29140 }, { "epoch": 9.71981320880587, "learning_rate": 1.6458924373031537e-06, "step": 29140 }, { "epoch": 9.71981320880587, "loss": 0.35661622881889343, "step": 29140 }, { "ce_loss": 0.03580138087272644, "epoch": 9.71981320880587, "step": 29140 }, { "distill_loss": 0.193109393119812, "epoch": 9.71981320880587, "step": 29140 }, { "epoch": 9.71981320880587, "ref_ce_loss": 0.0483783595263958, "step": 29140 }, { "epoch": 9.71981320880587, "loss": 0.32538557052612305, "step": 29140 }, { "ce_loss": 0.05694977939128876, "epoch": 9.71981320880587, "step": 29140 }, { "distill_loss": 0.17848563194274902, "epoch": 9.71981320880587, "step": 29140 }, { "epoch": 9.71981320880587, "ref_ce_loss": 0.06146978586912155, "step": 29140 }, { "epoch": 9.723148765843895, "loss": 0.3345, "step": 29150 }, { "epoch": 9.723148765843895, "grad_norm": 0.824491560459137, "step": 29150 }, { "epoch": 9.723148765843895, "learning_rate": 1.6069638875654491e-06, "step": 29150 }, { "epoch": 9.723148765843895, "loss": 0.6099705100059509, "step": 29150 }, { "ce_loss": 0.058300528675317764, "epoch": 9.723148765843895, "step": 29150 }, { "distill_loss": 0.16668078303337097, "epoch": 9.723148765843895, "step": 29150 }, { "epoch": 9.723148765843895, "ref_ce_loss": 0.08649856597185135, "step": 29150 }, { "epoch": 9.723148765843895, "loss": 0.34289857745170593, "step": 29150 }, { "ce_loss": 0.042254604399204254, "epoch": 9.723148765843895, "step": 29150 }, { "distill_loss": 0.16260452568531036, "epoch": 9.723148765843895, "step": 29150 }, { "epoch": 9.723148765843895, "ref_ce_loss": 0.05944216996431351, "step": 29150 }, { "epoch": 9.726484322881921, "loss": 0.3579, "step": 29160 }, { "epoch": 9.726484322881921, "grad_norm": 1.2022480964660645, "step": 29160 }, { "epoch": 9.726484322881921, "learning_rate": 1.5685003053698134e-06, "step": 29160 }, { "epoch": 9.726484322881921, "loss": 0.35259881615638733, "step": 29160 }, { "ce_loss": 0.044145677238702774, "epoch": 9.726484322881921, "step": 29160 }, { "distill_loss": 0.16626057028770447, "epoch": 9.726484322881921, "step": 29160 }, { "epoch": 9.726484322881921, "ref_ce_loss": 0.05313951522111893, "step": 29160 }, { "epoch": 9.726484322881921, "loss": 0.3314366340637207, "step": 29160 }, { "ce_loss": 0.04013233259320259, "epoch": 9.726484322881921, "step": 29160 }, { "distill_loss": 0.19226589798927307, "epoch": 9.726484322881921, "step": 29160 }, { "epoch": 9.726484322881921, "ref_ce_loss": 0.06861289590597153, "step": 29160 }, { "epoch": 9.729819879919948, "loss": 0.3212, "step": 29170 }, { "epoch": 9.729819879919948, "grad_norm": 1.4517748355865479, "step": 29170 }, { "epoch": 9.729819879919948, "learning_rate": 1.5305017356072704e-06, "step": 29170 }, { "epoch": 9.729819879919948, "loss": 0.2889649569988251, "step": 29170 }, { "ce_loss": 0.054967913776636124, "epoch": 9.729819879919948, "step": 29170 }, { "distill_loss": 0.17594516277313232, "epoch": 9.729819879919948, "step": 29170 }, { "epoch": 9.729819879919948, "ref_ce_loss": 0.05794268473982811, "step": 29170 }, { "epoch": 9.729819879919948, "loss": 0.23234641551971436, "step": 29170 }, { "ce_loss": 0.04126816242933273, "epoch": 9.729819879919948, "step": 29170 }, { "distill_loss": 0.13288410007953644, "epoch": 9.729819879919948, "step": 29170 }, { "epoch": 9.729819879919948, "ref_ce_loss": 0.04256091266870499, "step": 29170 }, { "epoch": 9.733155436957972, "loss": 0.3142, "step": 29180 }, { "epoch": 9.733155436957972, "grad_norm": 0.9493514895439148, "step": 29180 }, { "epoch": 9.733155436957972, "learning_rate": 1.4929682226263009e-06, "step": 29180 }, { "epoch": 9.733155436957972, "loss": 0.25031107664108276, "step": 29180 }, { "ce_loss": 0.016915347427129745, "epoch": 9.733155436957972, "step": 29180 }, { "distill_loss": 0.16258159279823303, "epoch": 9.733155436957972, "step": 29180 }, { "epoch": 9.733155436957972, "ref_ce_loss": 0.042231179773807526, "step": 29180 }, { "epoch": 9.733155436957972, "loss": 0.2727430760860443, "step": 29180 }, { "ce_loss": 0.03402730077505112, "epoch": 9.733155436957972, "step": 29180 }, { "distill_loss": 0.17123806476593018, "epoch": 9.733155436957972, "step": 29180 }, { "epoch": 9.733155436957972, "ref_ce_loss": 0.06728982925415039, "step": 29180 }, { "epoch": 9.736490993995996, "loss": 0.3432, "step": 29190 }, { "epoch": 9.736490993995996, "grad_norm": 1.1605498790740967, "step": 29190 }, { "epoch": 9.736490993995996, "learning_rate": 1.455899810232575e-06, "step": 29190 }, { "epoch": 9.736490993995996, "loss": 0.29889416694641113, "step": 29190 }, { "ce_loss": 0.032608695328235626, "epoch": 9.736490993995996, "step": 29190 }, { "distill_loss": 0.21592484414577484, "epoch": 9.736490993995996, "step": 29190 }, { "epoch": 9.736490993995996, "ref_ce_loss": 0.0500856414437294, "step": 29190 }, { "epoch": 9.736490993995996, "loss": 0.3522995412349701, "step": 29190 }, { "ce_loss": 0.044759076088666916, "epoch": 9.736490993995996, "step": 29190 }, { "distill_loss": 0.17451868951320648, "epoch": 9.736490993995996, "step": 29190 }, { "epoch": 9.736490993995996, "ref_ce_loss": 0.055287107825279236, "step": 29190 }, { "epoch": 9.739826551034023, "loss": 0.3752, "step": 29200 }, { "epoch": 9.739826551034023, "grad_norm": 1.1452367305755615, "step": 29200 }, { "epoch": 9.739826551034023, "learning_rate": 1.4192965416888637e-06, "step": 29200 }, { "epoch": 9.739826551034023, "loss": 0.24558773636817932, "step": 29200 }, { "ce_loss": 0.033938828855752945, "epoch": 9.739826551034023, "step": 29200 }, { "distill_loss": 0.1440381407737732, "epoch": 9.739826551034023, "step": 29200 }, { "epoch": 9.739826551034023, "ref_ce_loss": 0.04648728296160698, "step": 29200 }, { "epoch": 9.739826551034023, "loss": 0.37278178334236145, "step": 29200 }, { "ce_loss": 0.052839286625385284, "epoch": 9.739826551034023, "step": 29200 }, { "distill_loss": 0.150975301861763, "epoch": 9.739826551034023, "step": 29200 }, { "epoch": 9.739826551034023, "ref_ce_loss": 0.07346734404563904, "step": 29200 }, { "epoch": 9.743162108072049, "loss": 0.309, "step": 29210 }, { "epoch": 9.743162108072049, "grad_norm": 1.1472444534301758, "step": 29210 }, { "epoch": 9.743162108072049, "learning_rate": 1.3831584597151282e-06, "step": 29210 }, { "epoch": 9.743162108072049, "loss": 0.2397063970565796, "step": 29210 }, { "ce_loss": 0.03823701664805412, "epoch": 9.743162108072049, "step": 29210 }, { "distill_loss": 0.16035953164100647, "epoch": 9.743162108072049, "step": 29210 }, { "epoch": 9.743162108072049, "ref_ce_loss": 0.040390852838754654, "step": 29210 }, { "epoch": 9.743162108072049, "loss": 0.2995006740093231, "step": 29210 }, { "ce_loss": 0.04192734509706497, "epoch": 9.743162108072049, "step": 29210 }, { "distill_loss": 0.17175507545471191, "epoch": 9.743162108072049, "step": 29210 }, { "epoch": 9.743162108072049, "ref_ce_loss": 0.06877624988555908, "step": 29210 }, { "epoch": 9.746497665110073, "loss": 0.3703, "step": 29220 }, { "epoch": 9.746497665110073, "grad_norm": 1.6166616678237915, "step": 29220 }, { "epoch": 9.746497665110073, "learning_rate": 1.3474856064884745e-06, "step": 29220 }, { "epoch": 9.746497665110073, "loss": 0.309572696685791, "step": 29220 }, { "ce_loss": 0.028945567086338997, "epoch": 9.746497665110073, "step": 29220 }, { "distill_loss": 0.1187836080789566, "epoch": 9.746497665110073, "step": 29220 }, { "epoch": 9.746497665110073, "ref_ce_loss": 0.04301934316754341, "step": 29220 }, { "epoch": 9.746497665110073, "loss": 0.2944486439228058, "step": 29220 }, { "ce_loss": 0.036328088492155075, "epoch": 9.746497665110073, "step": 29220 }, { "distill_loss": 0.19853277504444122, "epoch": 9.746497665110073, "step": 29220 }, { "epoch": 9.746497665110073, "ref_ce_loss": 0.059367354959249496, "step": 29220 }, { "epoch": 9.749833222148098, "loss": 0.3794, "step": 29230 }, { "epoch": 9.749833222148098, "grad_norm": 1.2834672927856445, "step": 29230 }, { "epoch": 9.749833222148098, "learning_rate": 1.3122780236428433e-06, "step": 29230 }, { "epoch": 9.749833222148098, "loss": 0.2742766737937927, "step": 29230 }, { "ce_loss": 0.030356809496879578, "epoch": 9.749833222148098, "step": 29230 }, { "distill_loss": 0.15357893705368042, "epoch": 9.749833222148098, "step": 29230 }, { "epoch": 9.749833222148098, "ref_ce_loss": 0.06088166683912277, "step": 29230 }, { "epoch": 9.749833222148098, "loss": 0.3923237919807434, "step": 29230 }, { "ce_loss": 0.029607567936182022, "epoch": 9.749833222148098, "step": 29230 }, { "distill_loss": 0.19450455904006958, "epoch": 9.749833222148098, "step": 29230 }, { "epoch": 9.749833222148098, "ref_ce_loss": 0.06190716102719307, "step": 29230 }, { "epoch": 9.753168779186124, "loss": 0.3846, "step": 29240 }, { "epoch": 9.753168779186124, "grad_norm": 2.3320555686950684, "step": 29240 }, { "epoch": 9.753168779186124, "learning_rate": 1.2775357522693653e-06, "step": 29240 }, { "epoch": 9.753168779186124, "loss": 0.34208378195762634, "step": 29240 }, { "ce_loss": 0.051376692950725555, "epoch": 9.753168779186124, "step": 29240 }, { "distill_loss": 0.2006101906299591, "epoch": 9.753168779186124, "step": 29240 }, { "epoch": 9.753168779186124, "ref_ce_loss": 0.06693777441978455, "step": 29240 }, { "epoch": 9.753168779186124, "loss": 0.277177095413208, "step": 29240 }, { "ce_loss": 0.034729231148958206, "epoch": 9.753168779186124, "step": 29240 }, { "distill_loss": 0.14436738193035126, "epoch": 9.753168779186124, "step": 29240 }, { "epoch": 9.753168779186124, "ref_ce_loss": 0.058638881891965866, "step": 29240 }, { "epoch": 9.75650433622415, "loss": 0.3215, "step": 29250 }, { "epoch": 9.75650433622415, "grad_norm": 0.9842942357063293, "step": 29250 }, { "epoch": 9.75650433622415, "learning_rate": 1.243258832915961e-06, "step": 29250 }, { "epoch": 9.75650433622415, "loss": 0.22968745231628418, "step": 29250 }, { "ce_loss": 0.024926142767071724, "epoch": 9.75650433622415, "step": 29250 }, { "distill_loss": 0.1374337375164032, "epoch": 9.75650433622415, "step": 29250 }, { "epoch": 9.75650433622415, "ref_ce_loss": 0.04320191964507103, "step": 29250 }, { "epoch": 9.75650433622415, "loss": 0.4632025361061096, "step": 29250 }, { "ce_loss": 0.08126223832368851, "epoch": 9.75650433622415, "step": 29250 }, { "distill_loss": 0.21030139923095703, "epoch": 9.75650433622415, "step": 29250 }, { "epoch": 9.75650433622415, "ref_ce_loss": 0.0613284595310688, "step": 29250 }, { "epoch": 9.759839893262175, "loss": 0.3143, "step": 29260 }, { "epoch": 9.759839893262175, "grad_norm": 0.7296186685562134, "step": 29260 }, { "epoch": 9.759839893262175, "learning_rate": 1.2094473055875188e-06, "step": 29260 }, { "epoch": 9.759839893262175, "loss": 0.2869877517223358, "step": 29260 }, { "ce_loss": 0.05808643996715546, "epoch": 9.759839893262175, "step": 29260 }, { "distill_loss": 0.164188951253891, "epoch": 9.759839893262175, "step": 29260 }, { "epoch": 9.759839893262175, "ref_ce_loss": 0.06443438678979874, "step": 29260 }, { "epoch": 9.759839893262175, "loss": 0.41061264276504517, "step": 29260 }, { "ce_loss": 0.039066676050424576, "epoch": 9.759839893262175, "step": 29260 }, { "distill_loss": 0.19832192361354828, "epoch": 9.759839893262175, "step": 29260 }, { "epoch": 9.759839893262175, "ref_ce_loss": 0.06585456430912018, "step": 29260 }, { "epoch": 9.7631754503002, "loss": 0.3178, "step": 29270 }, { "epoch": 9.7631754503002, "grad_norm": 0.836146354675293, "step": 29270 }, { "epoch": 9.7631754503002, "learning_rate": 1.176101209745717e-06, "step": 29270 }, { "epoch": 9.7631754503002, "loss": 0.40682804584503174, "step": 29270 }, { "ce_loss": 0.05214698240160942, "epoch": 9.7631754503002, "step": 29270 }, { "distill_loss": 0.13995395600795746, "epoch": 9.7631754503002, "step": 29270 }, { "epoch": 9.7631754503002, "ref_ce_loss": 0.05854807794094086, "step": 29270 }, { "epoch": 9.7631754503002, "loss": 0.3249422013759613, "step": 29270 }, { "ce_loss": 0.06329483538866043, "epoch": 9.7631754503002, "step": 29270 }, { "distill_loss": 0.19355738162994385, "epoch": 9.7631754503002, "step": 29270 }, { "epoch": 9.7631754503002, "ref_ce_loss": 0.06793557852506638, "step": 29270 }, { "epoch": 9.766511007338226, "loss": 0.3547, "step": 29280 }, { "epoch": 9.766511007338226, "grad_norm": 1.3675034046173096, "step": 29280 }, { "epoch": 9.766511007338226, "learning_rate": 1.1432205843090237e-06, "step": 29280 }, { "epoch": 9.766511007338226, "loss": 0.35455620288848877, "step": 29280 }, { "ce_loss": 0.04943772777915001, "epoch": 9.766511007338226, "step": 29280 }, { "distill_loss": 0.1921720802783966, "epoch": 9.766511007338226, "step": 29280 }, { "epoch": 9.766511007338226, "ref_ce_loss": 0.049962118268013, "step": 29280 }, { "epoch": 9.766511007338226, "loss": 0.26242080330848694, "step": 29280 }, { "ce_loss": 0.029609311372041702, "epoch": 9.766511007338226, "step": 29280 }, { "distill_loss": 0.16324296593666077, "epoch": 9.766511007338226, "step": 29280 }, { "epoch": 9.766511007338226, "ref_ce_loss": 0.0418899767100811, "step": 29280 }, { "epoch": 9.769846564376252, "loss": 0.3091, "step": 29290 }, { "epoch": 9.769846564376252, "grad_norm": 0.939579963684082, "step": 29290 }, { "epoch": 9.769846564376252, "learning_rate": 1.1108054676526535e-06, "step": 29290 }, { "epoch": 9.769846564376252, "loss": 0.31739896535873413, "step": 29290 }, { "ce_loss": 0.04685492068529129, "epoch": 9.769846564376252, "step": 29290 }, { "distill_loss": 0.17787881195545197, "epoch": 9.769846564376252, "step": 29290 }, { "epoch": 9.769846564376252, "ref_ce_loss": 0.06475568562746048, "step": 29290 }, { "epoch": 9.769846564376252, "loss": 0.29376351833343506, "step": 29290 }, { "ce_loss": 0.02153054066002369, "epoch": 9.769846564376252, "step": 29290 }, { "distill_loss": 0.19466844201087952, "epoch": 9.769846564376252, "step": 29290 }, { "epoch": 9.769846564376252, "ref_ce_loss": 0.07677976787090302, "step": 29290 }, { "epoch": 9.773182121414276, "loss": 0.3034, "step": 29300 }, { "epoch": 9.773182121414276, "grad_norm": 0.784531831741333, "step": 29300 }, { "epoch": 9.773182121414276, "learning_rate": 1.078855897608566e-06, "step": 29300 }, { "epoch": 9.773182121414276, "loss": 0.37095391750335693, "step": 29300 }, { "ce_loss": 0.06039052456617355, "epoch": 9.773182121414276, "step": 29300 }, { "distill_loss": 0.19763991236686707, "epoch": 9.773182121414276, "step": 29300 }, { "epoch": 9.773182121414276, "ref_ce_loss": 0.0946519672870636, "step": 29300 }, { "epoch": 9.773182121414276, "loss": 0.40771812200546265, "step": 29300 }, { "ce_loss": 0.027861036360263824, "epoch": 9.773182121414276, "step": 29300 }, { "distill_loss": 0.18898458778858185, "epoch": 9.773182121414276, "step": 29300 }, { "epoch": 9.773182121414276, "ref_ce_loss": 0.0597851425409317, "step": 29300 }, { "epoch": 9.7765176784523, "loss": 0.3896, "step": 29310 }, { "epoch": 9.7765176784523, "grad_norm": 0.9433816075325012, "step": 29310 }, { "epoch": 9.7765176784523, "learning_rate": 1.0473719114653336e-06, "step": 29310 }, { "epoch": 9.7765176784523, "loss": 0.36913833022117615, "step": 29310 }, { "ce_loss": 0.03785061836242676, "epoch": 9.7765176784523, "step": 29310 }, { "distill_loss": 0.18730568885803223, "epoch": 9.7765176784523, "step": 29310 }, { "epoch": 9.7765176784523, "ref_ce_loss": 0.05920045077800751, "step": 29310 }, { "epoch": 9.7765176784523, "loss": 0.3134559094905853, "step": 29310 }, { "ce_loss": 0.0518287718296051, "epoch": 9.7765176784523, "step": 29310 }, { "distill_loss": 0.16476958990097046, "epoch": 9.7765176784523, "step": 29310 }, { "epoch": 9.7765176784523, "ref_ce_loss": 0.04951930791139603, "step": 29310 }, { "epoch": 9.779853235490327, "loss": 0.3495, "step": 29320 }, { "epoch": 9.779853235490327, "grad_norm": 1.077572226524353, "step": 29320 }, { "epoch": 9.779853235490327, "learning_rate": 1.016353545968185e-06, "step": 29320 }, { "epoch": 9.779853235490327, "loss": 0.20148417353630066, "step": 29320 }, { "ce_loss": 0.01355352159589529, "epoch": 9.779853235490327, "step": 29320 }, { "distill_loss": 0.12288177013397217, "epoch": 9.779853235490327, "step": 29320 }, { "epoch": 9.779853235490327, "ref_ce_loss": 0.03917378932237625, "step": 29320 }, { "epoch": 9.779853235490327, "loss": 0.32212889194488525, "step": 29320 }, { "ce_loss": 0.05061560869216919, "epoch": 9.779853235490327, "step": 29320 }, { "distill_loss": 0.17453105747699738, "epoch": 9.779853235490327, "step": 29320 }, { "epoch": 9.779853235490327, "ref_ce_loss": 0.06870347261428833, "step": 29320 }, { "epoch": 9.783188792528353, "loss": 0.3211, "step": 29330 }, { "epoch": 9.783188792528353, "grad_norm": 1.481539011001587, "step": 29330 }, { "epoch": 9.783188792528353, "learning_rate": 9.858008373188288e-07, "step": 29330 }, { "epoch": 9.783188792528353, "loss": 0.2580861747264862, "step": 29330 }, { "ce_loss": 0.034626785665750504, "epoch": 9.783188792528353, "step": 29330 }, { "distill_loss": 0.13589392602443695, "epoch": 9.783188792528353, "step": 29330 }, { "epoch": 9.783188792528353, "ref_ce_loss": 0.06425262242555618, "step": 29330 }, { "epoch": 9.783188792528353, "loss": 0.28824493288993835, "step": 29330 }, { "ce_loss": 0.03179427236318588, "epoch": 9.783188792528353, "step": 29330 }, { "distill_loss": 0.16806454956531525, "epoch": 9.783188792528353, "step": 29330 }, { "epoch": 9.783188792528353, "ref_ce_loss": 0.06407367438077927, "step": 29330 }, { "epoch": 9.786524349566378, "loss": 0.3656, "step": 29340 }, { "epoch": 9.786524349566378, "grad_norm": 1.1869555711746216, "step": 29340 }, { "epoch": 9.786524349566378, "learning_rate": 9.5571382117563e-07, "step": 29340 }, { "epoch": 9.786524349566378, "loss": 0.49091100692749023, "step": 29340 }, { "ce_loss": 0.032899271696805954, "epoch": 9.786524349566378, "step": 29340 }, { "distill_loss": 0.1702505201101303, "epoch": 9.786524349566378, "step": 29340 }, { "epoch": 9.786524349566378, "ref_ce_loss": 0.06909888237714767, "step": 29340 }, { "epoch": 9.786524349566378, "loss": 0.31385013461112976, "step": 29340 }, { "ce_loss": 0.04734751954674721, "epoch": 9.786524349566378, "step": 29340 }, { "distill_loss": 0.17034968733787537, "epoch": 9.786524349566378, "step": 29340 }, { "epoch": 9.786524349566378, "ref_ce_loss": 0.07363146543502808, "step": 29340 }, { "epoch": 9.789859906604402, "loss": 0.3686, "step": 29350 }, { "epoch": 9.789859906604402, "grad_norm": 0.7241499423980713, "step": 29350 }, { "epoch": 9.789859906604402, "learning_rate": 9.260925326533443e-07, "step": 29350 }, { "epoch": 9.789859906604402, "loss": 0.4805721342563629, "step": 29350 }, { "ce_loss": 0.02742915414273739, "epoch": 9.789859906604402, "step": 29350 }, { "distill_loss": 0.2021547257900238, "epoch": 9.789859906604402, "step": 29350 }, { "epoch": 9.789859906604402, "ref_ce_loss": 0.05579015612602234, "step": 29350 }, { "epoch": 9.789859906604402, "loss": 0.39362767338752747, "step": 29350 }, { "ce_loss": 0.08456600457429886, "epoch": 9.789859906604402, "step": 29350 }, { "distill_loss": 0.2172761857509613, "epoch": 9.789859906604402, "step": 29350 }, { "epoch": 9.789859906604402, "ref_ce_loss": 0.0690891370177269, "step": 29350 }, { "epoch": 9.793195463642428, "loss": 0.3328, "step": 29360 }, { "epoch": 9.793195463642428, "grad_norm": 1.407702922821045, "step": 29360 }, { "epoch": 9.793195463642428, "learning_rate": 8.969370063231619e-07, "step": 29360 }, { "epoch": 9.793195463642428, "loss": 0.4750968813896179, "step": 29360 }, { "ce_loss": 0.08430177718400955, "epoch": 9.793195463642428, "step": 29360 }, { "distill_loss": 0.17226916551589966, "epoch": 9.793195463642428, "step": 29360 }, { "epoch": 9.793195463642428, "ref_ce_loss": 0.08060526847839355, "step": 29360 }, { "epoch": 9.793195463642428, "loss": 0.3968356251716614, "step": 29360 }, { "ce_loss": 0.05416855216026306, "epoch": 9.793195463642428, "step": 29360 }, { "distill_loss": 0.16487491130828857, "epoch": 9.793195463642428, "step": 29360 }, { "epoch": 9.793195463642428, "ref_ce_loss": 0.04678542539477348, "step": 29360 }, { "epoch": 9.796531020680455, "loss": 0.3292, "step": 29370 }, { "epoch": 9.796531020680455, "grad_norm": 1.0663679838180542, "step": 29370 }, { "epoch": 9.796531020680455, "learning_rate": 8.682472762127969e-07, "step": 29370 }, { "epoch": 9.796531020680455, "loss": 0.4573275148868561, "step": 29370 }, { "ce_loss": 0.03016098216176033, "epoch": 9.796531020680455, "step": 29370 }, { "distill_loss": 0.20741283893585205, "epoch": 9.796531020680455, "step": 29370 }, { "epoch": 9.796531020680455, "ref_ce_loss": 0.055127277970314026, "step": 29370 }, { "epoch": 9.796531020680455, "loss": 0.35826027393341064, "step": 29370 }, { "ce_loss": 0.026030803099274635, "epoch": 9.796531020680455, "step": 29370 }, { "distill_loss": 0.17437167465686798, "epoch": 9.796531020680455, "step": 29370 }, { "epoch": 9.796531020680455, "ref_ce_loss": 0.06781844049692154, "step": 29370 }, { "epoch": 9.799866577718479, "loss": 0.3586, "step": 29380 }, { "epoch": 9.799866577718479, "grad_norm": 1.5492557287216187, "step": 29380 }, { "epoch": 9.799866577718479, "learning_rate": 8.400233758062203e-07, "step": 29380 }, { "epoch": 9.799866577718479, "loss": 0.3331243097782135, "step": 29380 }, { "ce_loss": 0.02931608445942402, "epoch": 9.799866577718479, "step": 29380 }, { "distill_loss": 0.19186915457248688, "epoch": 9.799866577718479, "step": 29380 }, { "epoch": 9.799866577718479, "ref_ce_loss": 0.08439023792743683, "step": 29380 }, { "epoch": 9.799866577718479, "loss": 0.44308149814605713, "step": 29380 }, { "ce_loss": 0.05596522241830826, "epoch": 9.799866577718479, "step": 29380 }, { "distill_loss": 0.20761224627494812, "epoch": 9.799866577718479, "step": 29380 }, { "epoch": 9.799866577718479, "ref_ce_loss": 0.07270487397909164, "step": 29380 }, { "epoch": 9.803202134756503, "loss": 0.356, "step": 29390 }, { "epoch": 9.803202134756503, "grad_norm": 1.0058664083480835, "step": 29390 }, { "epoch": 9.803202134756503, "learning_rate": 8.122653380437494e-07, "step": 29390 }, { "epoch": 9.803202134756503, "loss": 0.4317482113838196, "step": 29390 }, { "ce_loss": 0.09004373848438263, "epoch": 9.803202134756503, "step": 29390 }, { "distill_loss": 0.22809693217277527, "epoch": 9.803202134756503, "step": 29390 }, { "epoch": 9.803202134756503, "ref_ce_loss": 0.07940634340047836, "step": 29390 }, { "epoch": 9.803202134756503, "loss": 0.3935472071170807, "step": 29390 }, { "ce_loss": 0.031198523938655853, "epoch": 9.803202134756503, "step": 29390 }, { "distill_loss": 0.1570068597793579, "epoch": 9.803202134756503, "step": 29390 }, { "epoch": 9.803202134756503, "ref_ce_loss": 0.06303267925977707, "step": 29390 }, { "epoch": 9.80653769179453, "loss": 0.3451, "step": 29400 }, { "epoch": 9.80653769179453, "grad_norm": 1.0701510906219482, "step": 29400 }, { "epoch": 9.80653769179453, "learning_rate": 7.849731953219586e-07, "step": 29400 }, { "epoch": 9.80653769179453, "loss": 0.4929094612598419, "step": 29400 }, { "ce_loss": 0.06670527160167694, "epoch": 9.80653769179453, "step": 29400 }, { "distill_loss": 0.21560506522655487, "epoch": 9.80653769179453, "step": 29400 }, { "epoch": 9.80653769179453, "ref_ce_loss": 0.0714222639799118, "step": 29400 }, { "epoch": 9.80653769179453, "loss": 0.5444076657295227, "step": 29400 }, { "ce_loss": 0.05342886596918106, "epoch": 9.80653769179453, "step": 29400 }, { "distill_loss": 0.2137109786272049, "epoch": 9.80653769179453, "step": 29400 }, { "epoch": 9.80653769179453, "ref_ce_loss": 0.07326829433441162, "step": 29400 }, { "epoch": 9.809873248832556, "loss": 0.3673, "step": 29410 }, { "epoch": 9.809873248832556, "grad_norm": 1.2126073837280273, "step": 29410 }, { "epoch": 9.809873248832556, "learning_rate": 7.581469794938123e-07, "step": 29410 }, { "epoch": 9.809873248832556, "loss": 0.24275337159633636, "step": 29410 }, { "ce_loss": 0.02250901237130165, "epoch": 9.809873248832556, "step": 29410 }, { "distill_loss": 0.11409185826778412, "epoch": 9.809873248832556, "step": 29410 }, { "epoch": 9.809873248832556, "ref_ce_loss": 0.054690055549144745, "step": 29410 }, { "epoch": 9.809873248832556, "loss": 0.28774747252464294, "step": 29410 }, { "ce_loss": 0.033412329852581024, "epoch": 9.809873248832556, "step": 29410 }, { "distill_loss": 0.205092191696167, "epoch": 9.809873248832556, "step": 29410 }, { "epoch": 9.809873248832556, "ref_ce_loss": 0.04910941794514656, "step": 29410 }, { "epoch": 9.81320880587058, "loss": 0.2915, "step": 29420 }, { "epoch": 9.81320880587058, "grad_norm": 1.0646889209747314, "step": 29420 }, { "epoch": 9.81320880587058, "learning_rate": 7.317867218683549e-07, "step": 29420 }, { "epoch": 9.81320880587058, "loss": 0.3459312319755554, "step": 29420 }, { "ce_loss": 0.05426276847720146, "epoch": 9.81320880587058, "step": 29420 }, { "distill_loss": 0.17972438037395477, "epoch": 9.81320880587058, "step": 29420 }, { "epoch": 9.81320880587058, "ref_ce_loss": 0.04935836419463158, "step": 29420 }, { "epoch": 9.81320880587058, "loss": 0.4190525412559509, "step": 29420 }, { "ce_loss": 0.023804601281881332, "epoch": 9.81320880587058, "step": 29420 }, { "distill_loss": 0.1546304076910019, "epoch": 9.81320880587058, "step": 29420 }, { "epoch": 9.81320880587058, "ref_ce_loss": 0.07818574458360672, "step": 29420 }, { "epoch": 9.816544362908605, "loss": 0.3582, "step": 29430 }, { "epoch": 9.816544362908605, "grad_norm": 1.1533007621765137, "step": 29430 }, { "epoch": 9.816544362908605, "learning_rate": 7.058924532107991e-07, "step": 29430 }, { "epoch": 9.816544362908605, "loss": 0.33537954092025757, "step": 29430 }, { "ce_loss": 0.04029664024710655, "epoch": 9.816544362908605, "step": 29430 }, { "distill_loss": 0.1852310448884964, "epoch": 9.816544362908605, "step": 29430 }, { "epoch": 9.816544362908605, "ref_ce_loss": 0.07313943654298782, "step": 29430 }, { "epoch": 9.816544362908605, "loss": 0.22467778623104095, "step": 29430 }, { "ce_loss": 0.02593729831278324, "epoch": 9.816544362908605, "step": 29430 }, { "distill_loss": 0.12661072611808777, "epoch": 9.816544362908605, "step": 29430 }, { "epoch": 9.816544362908605, "ref_ce_loss": 0.049755774438381195, "step": 29430 }, { "epoch": 9.819879919946631, "loss": 0.3504, "step": 29440 }, { "epoch": 9.819879919946631, "grad_norm": 1.1615389585494995, "step": 29440 }, { "epoch": 9.819879919946631, "learning_rate": 6.804642037425701e-07, "step": 29440 }, { "epoch": 9.819879919946631, "loss": 0.4558985233306885, "step": 29440 }, { "ce_loss": 0.023333707824349403, "epoch": 9.819879919946631, "step": 29440 }, { "distill_loss": 0.177892804145813, "epoch": 9.819879919946631, "step": 29440 }, { "epoch": 9.819879919946631, "ref_ce_loss": 0.06867125630378723, "step": 29440 }, { "epoch": 9.819879919946631, "loss": 0.33130738139152527, "step": 29440 }, { "ce_loss": 0.023025374859571457, "epoch": 9.819879919946631, "step": 29440 }, { "distill_loss": 0.16406647861003876, "epoch": 9.819879919946631, "step": 29440 }, { "epoch": 9.819879919946631, "ref_ce_loss": 0.06003274768590927, "step": 29440 }, { "epoch": 9.823215476984657, "loss": 0.3357, "step": 29450 }, { "epoch": 9.823215476984657, "grad_norm": 1.0378285646438599, "step": 29450 }, { "epoch": 9.823215476984657, "learning_rate": 6.555020031412173e-07, "step": 29450 }, { "epoch": 9.823215476984657, "loss": 0.26800641417503357, "step": 29450 }, { "ce_loss": 0.015543187037110329, "epoch": 9.823215476984657, "step": 29450 }, { "distill_loss": 0.15196271240711212, "epoch": 9.823215476984657, "step": 29450 }, { "epoch": 9.823215476984657, "ref_ce_loss": 0.04392324015498161, "step": 29450 }, { "epoch": 9.823215476984657, "loss": 0.37747907638549805, "step": 29450 }, { "ce_loss": 0.057008299976587296, "epoch": 9.823215476984657, "step": 29450 }, { "distill_loss": 0.20629453659057617, "epoch": 9.823215476984657, "step": 29450 }, { "epoch": 9.823215476984657, "ref_ce_loss": 0.05720047280192375, "step": 29450 }, { "epoch": 9.826551034022682, "loss": 0.3457, "step": 29460 }, { "epoch": 9.826551034022682, "grad_norm": 1.5450215339660645, "step": 29460 }, { "epoch": 9.826551034022682, "learning_rate": 6.310058805402364e-07, "step": 29460 }, { "epoch": 9.826551034022682, "loss": 0.1829114556312561, "step": 29460 }, { "ce_loss": 0.016622690483927727, "epoch": 9.826551034022682, "step": 29460 }, { "distill_loss": 0.12692323327064514, "epoch": 9.826551034022682, "step": 29460 }, { "epoch": 9.826551034022682, "ref_ce_loss": 0.039235811680555344, "step": 29460 }, { "epoch": 9.826551034022682, "loss": 0.23720866441726685, "step": 29460 }, { "ce_loss": 0.0072109573520720005, "epoch": 9.826551034022682, "step": 29460 }, { "distill_loss": 0.1425161361694336, "epoch": 9.826551034022682, "step": 29460 }, { "epoch": 9.826551034022682, "ref_ce_loss": 0.03943220153450966, "step": 29460 }, { "epoch": 9.829886591060706, "loss": 0.3084, "step": 29470 }, { "epoch": 9.829886591060706, "grad_norm": 1.1493730545043945, "step": 29470 }, { "epoch": 9.829886591060706, "learning_rate": 6.069758645292911e-07, "step": 29470 }, { "epoch": 9.829886591060706, "loss": 0.2494208961725235, "step": 29470 }, { "ce_loss": 0.032901324331760406, "epoch": 9.829886591060706, "step": 29470 }, { "distill_loss": 0.15291579067707062, "epoch": 9.829886591060706, "step": 29470 }, { "epoch": 9.829886591060706, "ref_ce_loss": 0.04548323526978493, "step": 29470 }, { "epoch": 9.829886591060706, "loss": 0.3046186566352844, "step": 29470 }, { "ce_loss": 0.03033779375255108, "epoch": 9.829886591060706, "step": 29470 }, { "distill_loss": 0.1353941112756729, "epoch": 9.829886591060706, "step": 29470 }, { "epoch": 9.829886591060706, "ref_ce_loss": 0.05030620098114014, "step": 29470 }, { "epoch": 9.833222148098733, "loss": 0.3507, "step": 29480 }, { "epoch": 9.833222148098733, "grad_norm": 1.4322553873062134, "step": 29480 }, { "epoch": 9.833222148098733, "learning_rate": 5.834119831539476e-07, "step": 29480 }, { "epoch": 9.833222148098733, "loss": 0.2620355486869812, "step": 29480 }, { "ce_loss": 0.03061460703611374, "epoch": 9.833222148098733, "step": 29480 }, { "distill_loss": 0.15820306539535522, "epoch": 9.833222148098733, "step": 29480 }, { "epoch": 9.833222148098733, "ref_ce_loss": 0.07300352305173874, "step": 29480 }, { "epoch": 9.833222148098733, "loss": 0.37205538153648376, "step": 29480 }, { "ce_loss": 0.06081470847129822, "epoch": 9.833222148098733, "step": 29480 }, { "distill_loss": 0.19428209960460663, "epoch": 9.833222148098733, "step": 29480 }, { "epoch": 9.833222148098733, "ref_ce_loss": 0.0742083266377449, "step": 29480 }, { "epoch": 9.836557705136759, "loss": 0.33, "step": 29490 }, { "epoch": 9.836557705136759, "grad_norm": 0.9687492251396179, "step": 29490 }, { "epoch": 9.836557705136759, "learning_rate": 5.603142639158509e-07, "step": 29490 }, { "epoch": 9.836557705136759, "loss": 0.3235549032688141, "step": 29490 }, { "ce_loss": 0.07004064321517944, "epoch": 9.836557705136759, "step": 29490 }, { "distill_loss": 0.17677612602710724, "epoch": 9.836557705136759, "step": 29490 }, { "epoch": 9.836557705136759, "ref_ce_loss": 0.05695893242955208, "step": 29490 }, { "epoch": 9.836557705136759, "loss": 0.3133566081523895, "step": 29490 }, { "ce_loss": 0.026781374588608742, "epoch": 9.836557705136759, "step": 29490 }, { "distill_loss": 0.15334056317806244, "epoch": 9.836557705136759, "step": 29490 }, { "epoch": 9.836557705136759, "ref_ce_loss": 0.06433262676000595, "step": 29490 }, { "epoch": 9.839893262174783, "loss": 0.345, "step": 29500 }, { "epoch": 9.839893262174783, "grad_norm": 2.4552054405212402, "step": 29500 }, { "epoch": 9.839893262174783, "learning_rate": 5.376827337725043e-07, "step": 29500 }, { "epoch": 9.839893262174783, "loss": 0.2624857723712921, "step": 29500 }, { "ce_loss": 0.03923223912715912, "epoch": 9.839893262174783, "step": 29500 }, { "distill_loss": 0.13083603978157043, "epoch": 9.839893262174783, "step": 29500 }, { "epoch": 9.839893262174783, "ref_ce_loss": 0.07049775868654251, "step": 29500 }, { "epoch": 9.839893262174783, "loss": 0.2594766914844513, "step": 29500 }, { "ce_loss": 0.02417309582233429, "epoch": 9.839893262174783, "step": 29500 }, { "distill_loss": 0.1470203399658203, "epoch": 9.839893262174783, "step": 29500 }, { "epoch": 9.839893262174783, "ref_ce_loss": 0.053291480988264084, "step": 29500 }, { "epoch": 9.843228819212808, "loss": 0.3229, "step": 29510 }, { "epoch": 9.843228819212808, "grad_norm": 0.9992929100990295, "step": 29510 }, { "epoch": 9.843228819212808, "learning_rate": 5.155174191373125e-07, "step": 29510 }, { "epoch": 9.843228819212808, "loss": 0.37495696544647217, "step": 29510 }, { "ce_loss": 0.03893393650650978, "epoch": 9.843228819212808, "step": 29510 }, { "distill_loss": 0.20713326334953308, "epoch": 9.843228819212808, "step": 29510 }, { "epoch": 9.843228819212808, "ref_ce_loss": 0.0703553631901741, "step": 29510 }, { "epoch": 9.843228819212808, "loss": 0.2708454728126526, "step": 29510 }, { "ce_loss": 0.018474353477358818, "epoch": 9.843228819212808, "step": 29510 }, { "distill_loss": 0.14013908803462982, "epoch": 9.843228819212808, "step": 29510 }, { "epoch": 9.843228819212808, "ref_ce_loss": 0.049369096755981445, "step": 29510 }, { "epoch": 9.846564376250834, "loss": 0.3381, "step": 29520 }, { "epoch": 9.846564376250834, "grad_norm": 1.5160677433013916, "step": 29520 }, { "epoch": 9.846564376250834, "learning_rate": 4.938183458796263e-07, "step": 29520 }, { "epoch": 9.846564376250834, "loss": 0.3293970227241516, "step": 29520 }, { "ce_loss": 0.06448846310377121, "epoch": 9.846564376250834, "step": 29520 }, { "distill_loss": 0.17817185819149017, "epoch": 9.846564376250834, "step": 29520 }, { "epoch": 9.846564376250834, "ref_ce_loss": 0.0654769167304039, "step": 29520 }, { "epoch": 9.846564376250834, "loss": 0.2610923945903778, "step": 29520 }, { "ce_loss": 0.028385812416672707, "epoch": 9.846564376250834, "step": 29520 }, { "distill_loss": 0.12585677206516266, "epoch": 9.846564376250834, "step": 29520 }, { "epoch": 9.846564376250834, "ref_ce_loss": 0.05476205796003342, "step": 29520 }, { "epoch": 9.84989993328886, "loss": 0.3552, "step": 29530 }, { "epoch": 9.84989993328886, "grad_norm": 1.071336030960083, "step": 29530 }, { "epoch": 9.84989993328886, "learning_rate": 4.7258553932456597e-07, "step": 29530 }, { "epoch": 9.84989993328886, "loss": 0.3657957911491394, "step": 29530 }, { "ce_loss": 0.05615170672535896, "epoch": 9.84989993328886, "step": 29530 }, { "distill_loss": 0.2162410318851471, "epoch": 9.84989993328886, "step": 29530 }, { "epoch": 9.84989993328886, "ref_ce_loss": 0.07879635691642761, "step": 29530 }, { "epoch": 9.84989993328886, "loss": 0.34869688749313354, "step": 29530 }, { "ce_loss": 0.017612233757972717, "epoch": 9.84989993328886, "step": 29530 }, { "distill_loss": 0.1881450116634369, "epoch": 9.84989993328886, "step": 29530 }, { "epoch": 9.84989993328886, "ref_ce_loss": 0.06109461188316345, "step": 29530 }, { "epoch": 9.853235490326885, "loss": 0.3863, "step": 29540 }, { "epoch": 9.853235490326885, "grad_norm": 1.999192237854004, "step": 29540 }, { "epoch": 9.853235490326885, "learning_rate": 4.518190242531084e-07, "step": 29540 }, { "epoch": 9.853235490326885, "loss": 0.30932730436325073, "step": 29540 }, { "ce_loss": 0.03714052587747574, "epoch": 9.853235490326885, "step": 29540 }, { "distill_loss": 0.15744534134864807, "epoch": 9.853235490326885, "step": 29540 }, { "epoch": 9.853235490326885, "ref_ce_loss": 0.047130778431892395, "step": 29540 }, { "epoch": 9.853235490326885, "loss": 0.4077582359313965, "step": 29540 }, { "ce_loss": 0.05509041249752045, "epoch": 9.853235490326885, "step": 29540 }, { "distill_loss": 0.18503007292747498, "epoch": 9.853235490326885, "step": 29540 }, { "epoch": 9.853235490326885, "ref_ce_loss": 0.0741603821516037, "step": 29540 }, { "epoch": 9.856571047364909, "loss": 0.3228, "step": 29550 }, { "epoch": 9.856571047364909, "grad_norm": 1.1742290258407593, "step": 29550 }, { "epoch": 9.856571047364909, "learning_rate": 4.315188249019997e-07, "step": 29550 }, { "epoch": 9.856571047364909, "loss": 0.2508600950241089, "step": 29550 }, { "ce_loss": 0.031531572341918945, "epoch": 9.856571047364909, "step": 29550 }, { "distill_loss": 0.1460273116827011, "epoch": 9.856571047364909, "step": 29550 }, { "epoch": 9.856571047364909, "ref_ce_loss": 0.05061808601021767, "step": 29550 }, { "epoch": 9.856571047364909, "loss": 0.34020158648490906, "step": 29550 }, { "ce_loss": 0.042917292565107346, "epoch": 9.856571047364909, "step": 29550 }, { "distill_loss": 0.18208904564380646, "epoch": 9.856571047364909, "step": 29550 }, { "epoch": 9.856571047364909, "ref_ce_loss": 0.07280907779932022, "step": 29550 }, { "epoch": 9.859906604402935, "loss": 0.3637, "step": 29560 }, { "epoch": 9.859906604402935, "grad_norm": 1.4106426239013672, "step": 29560 }, { "epoch": 9.859906604402935, "learning_rate": 4.116849649637544e-07, "step": 29560 }, { "epoch": 9.859906604402935, "loss": 0.27583619952201843, "step": 29560 }, { "ce_loss": 0.019909940659999847, "epoch": 9.859906604402935, "step": 29560 }, { "distill_loss": 0.1528017371892929, "epoch": 9.859906604402935, "step": 29560 }, { "epoch": 9.859906604402935, "ref_ce_loss": 0.04741690680384636, "step": 29560 }, { "epoch": 9.859906604402935, "loss": 0.39467301964759827, "step": 29560 }, { "ce_loss": 0.05224955081939697, "epoch": 9.859906604402935, "step": 29560 }, { "distill_loss": 0.21950089931488037, "epoch": 9.859906604402935, "step": 29560 }, { "epoch": 9.859906604402935, "ref_ce_loss": 0.0884372740983963, "step": 29560 }, { "epoch": 9.863242161440962, "loss": 0.3691, "step": 29570 }, { "epoch": 9.863242161440962, "grad_norm": 1.08582603931427, "step": 29570 }, { "epoch": 9.863242161440962, "learning_rate": 3.923174675866559e-07, "step": 29570 }, { "epoch": 9.863242161440962, "loss": 0.297434002161026, "step": 29570 }, { "ce_loss": 0.018246673047542572, "epoch": 9.863242161440962, "step": 29570 }, { "distill_loss": 0.16588199138641357, "epoch": 9.863242161440962, "step": 29570 }, { "epoch": 9.863242161440962, "ref_ce_loss": 0.05477264150977135, "step": 29570 }, { "epoch": 9.863242161440962, "loss": 0.19365330040454865, "step": 29570 }, { "ce_loss": 0.0199178084731102, "epoch": 9.863242161440962, "step": 29570 }, { "distill_loss": 0.1329023540019989, "epoch": 9.863242161440962, "step": 29570 }, { "epoch": 9.863242161440962, "ref_ce_loss": 0.040626198053359985, "step": 29570 }, { "epoch": 9.866577718478986, "loss": 0.3131, "step": 29580 }, { "epoch": 9.866577718478986, "grad_norm": 1.564894199371338, "step": 29580 }, { "epoch": 9.866577718478986, "learning_rate": 3.734163553746672e-07, "step": 29580 }, { "epoch": 9.866577718478986, "loss": 0.3847261667251587, "step": 29580 }, { "ce_loss": 0.028989629819989204, "epoch": 9.866577718478986, "step": 29580 }, { "distill_loss": 0.1808268129825592, "epoch": 9.866577718478986, "step": 29580 }, { "epoch": 9.866577718478986, "ref_ce_loss": 0.06549061834812164, "step": 29580 }, { "epoch": 9.866577718478986, "loss": 0.3929581046104431, "step": 29580 }, { "ce_loss": 0.037682678550481796, "epoch": 9.866577718478986, "step": 29580 }, { "distill_loss": 0.21550612151622772, "epoch": 9.866577718478986, "step": 29580 }, { "epoch": 9.866577718478986, "ref_ce_loss": 0.07935473322868347, "step": 29580 }, { "epoch": 9.86991327551701, "loss": 0.3386, "step": 29590 }, { "epoch": 9.86991327551701, "grad_norm": 1.890181064605713, "step": 29590 }, { "epoch": 9.86991327551701, "learning_rate": 3.5498165038734263e-07, "step": 29590 }, { "epoch": 9.86991327551701, "loss": 0.31606292724609375, "step": 29590 }, { "ce_loss": 0.06806986778974533, "epoch": 9.86991327551701, "step": 29590 }, { "distill_loss": 0.18640464544296265, "epoch": 9.86991327551701, "step": 29590 }, { "epoch": 9.86991327551701, "ref_ce_loss": 0.061429791152477264, "step": 29590 }, { "epoch": 9.86991327551701, "loss": 0.31619423627853394, "step": 29590 }, { "ce_loss": 0.0290507934987545, "epoch": 9.86991327551701, "step": 29590 }, { "distill_loss": 0.1894863247871399, "epoch": 9.86991327551701, "step": 29590 }, { "epoch": 9.86991327551701, "ref_ce_loss": 0.08042974025011063, "step": 29590 }, { "epoch": 9.873248832555037, "loss": 0.3569, "step": 29600 }, { "epoch": 9.873248832555037, "grad_norm": 1.0221617221832275, "step": 29600 }, { "epoch": 9.873248832555037, "learning_rate": 3.370133741400494e-07, "step": 29600 }, { "epoch": 9.873248832555037, "loss": 0.25058314204216003, "step": 29600 }, { "ce_loss": 0.011658146977424622, "epoch": 9.873248832555037, "step": 29600 }, { "distill_loss": 0.15176154673099518, "epoch": 9.873248832555037, "step": 29600 }, { "epoch": 9.873248832555037, "ref_ce_loss": 0.062388718128204346, "step": 29600 }, { "epoch": 9.873248832555037, "loss": 0.3523558974266052, "step": 29600 }, { "ce_loss": 0.056740108877420425, "epoch": 9.873248832555037, "step": 29600 }, { "distill_loss": 0.1592649668455124, "epoch": 9.873248832555037, "step": 29600 }, { "epoch": 9.873248832555037, "ref_ce_loss": 0.0709981843829155, "step": 29600 }, { "epoch": 9.876584389593063, "loss": 0.3249, "step": 29610 }, { "epoch": 9.876584389593063, "grad_norm": 0.8258252143859863, "step": 29610 }, { "epoch": 9.876584389593063, "learning_rate": 3.1951154760365696e-07, "step": 29610 }, { "epoch": 9.876584389593063, "loss": 0.35080069303512573, "step": 29610 }, { "ce_loss": 0.03846856951713562, "epoch": 9.876584389593063, "step": 29610 }, { "distill_loss": 0.17402790486812592, "epoch": 9.876584389593063, "step": 29610 }, { "epoch": 9.876584389593063, "ref_ce_loss": 0.06167008355259895, "step": 29610 }, { "epoch": 9.876584389593063, "loss": 0.31475287675857544, "step": 29610 }, { "ce_loss": 0.03450953960418701, "epoch": 9.876584389593063, "step": 29610 }, { "distill_loss": 0.19979970157146454, "epoch": 9.876584389593063, "step": 29610 }, { "epoch": 9.876584389593063, "ref_ce_loss": 0.05786199867725372, "step": 29610 }, { "epoch": 9.879919946631087, "loss": 0.3241, "step": 29620 }, { "epoch": 9.879919946631087, "grad_norm": 0.9958456754684448, "step": 29620 }, { "epoch": 9.879919946631087, "learning_rate": 3.024761912046703e-07, "step": 29620 }, { "epoch": 9.879919946631087, "loss": 0.3097935914993286, "step": 29620 }, { "ce_loss": 0.035024844110012054, "epoch": 9.879919946631087, "step": 29620 }, { "distill_loss": 0.19233094155788422, "epoch": 9.879919946631087, "step": 29620 }, { "epoch": 9.879919946631087, "ref_ce_loss": 0.0557713583111763, "step": 29620 }, { "epoch": 9.879919946631087, "loss": 0.2509743869304657, "step": 29620 }, { "ce_loss": 0.041433319449424744, "epoch": 9.879919946631087, "step": 29620 }, { "distill_loss": 0.14673636853694916, "epoch": 9.879919946631087, "step": 29620 }, { "epoch": 9.879919946631087, "ref_ce_loss": 0.05089586228132248, "step": 29620 }, { "epoch": 9.883255503669112, "loss": 0.3602, "step": 29630 }, { "epoch": 9.883255503669112, "grad_norm": 1.3219614028930664, "step": 29630 }, { "epoch": 9.883255503669112, "learning_rate": 2.8590732482522977e-07, "step": 29630 }, { "epoch": 9.883255503669112, "loss": 0.29133516550064087, "step": 29630 }, { "ce_loss": 0.054918576031923294, "epoch": 9.883255503669112, "step": 29630 }, { "distill_loss": 0.16251620650291443, "epoch": 9.883255503669112, "step": 29630 }, { "epoch": 9.883255503669112, "ref_ce_loss": 0.05042388662695885, "step": 29630 }, { "epoch": 9.883255503669112, "loss": 0.27201393246650696, "step": 29630 }, { "ce_loss": 0.04357181861996651, "epoch": 9.883255503669112, "step": 29630 }, { "distill_loss": 0.14555370807647705, "epoch": 9.883255503669112, "step": 29630 }, { "epoch": 9.883255503669112, "ref_ce_loss": 0.05804718658328056, "step": 29630 }, { "epoch": 9.886591060707138, "loss": 0.3212, "step": 29640 }, { "epoch": 9.886591060707138, "grad_norm": 0.9410602450370789, "step": 29640 }, { "epoch": 9.886591060707138, "learning_rate": 2.698049678029335e-07, "step": 29640 }, { "epoch": 9.886591060707138, "loss": 0.3896907567977905, "step": 29640 }, { "ce_loss": 0.027953682467341423, "epoch": 9.886591060707138, "step": 29640 }, { "distill_loss": 0.23215749859809875, "epoch": 9.886591060707138, "step": 29640 }, { "epoch": 9.886591060707138, "ref_ce_loss": 0.0854712501168251, "step": 29640 }, { "epoch": 9.886591060707138, "loss": 0.2254749983549118, "step": 29640 }, { "ce_loss": 0.03735177218914032, "epoch": 9.886591060707138, "step": 29640 }, { "distill_loss": 0.12590903043746948, "epoch": 9.886591060707138, "step": 29640 }, { "epoch": 9.886591060707138, "ref_ce_loss": 0.06130973994731903, "step": 29640 }, { "epoch": 9.889926617745164, "loss": 0.3663, "step": 29650 }, { "epoch": 9.889926617745164, "grad_norm": 1.0769139528274536, "step": 29650 }, { "epoch": 9.889926617745164, "learning_rate": 2.5416913893101526e-07, "step": 29650 }, { "epoch": 9.889926617745164, "loss": 0.31788861751556396, "step": 29650 }, { "ce_loss": 0.026286710053682327, "epoch": 9.889926617745164, "step": 29650 }, { "distill_loss": 0.17324940860271454, "epoch": 9.889926617745164, "step": 29650 }, { "epoch": 9.889926617745164, "ref_ce_loss": 0.058500248938798904, "step": 29650 }, { "epoch": 9.889926617745164, "loss": 0.32835084199905396, "step": 29650 }, { "ce_loss": 0.02977113611996174, "epoch": 9.889926617745164, "step": 29650 }, { "distill_loss": 0.15367326140403748, "epoch": 9.889926617745164, "step": 29650 }, { "epoch": 9.889926617745164, "ref_ce_loss": 0.04256702959537506, "step": 29650 }, { "epoch": 9.893262174783189, "loss": 0.3122, "step": 29660 }, { "epoch": 9.893262174783189, "grad_norm": 1.0725162029266357, "step": 29660 }, { "epoch": 9.893262174783189, "learning_rate": 2.389998564581664e-07, "step": 29660 }, { "epoch": 9.893262174783189, "loss": 0.29371413588523865, "step": 29660 }, { "ce_loss": 0.0522097647190094, "epoch": 9.893262174783189, "step": 29660 }, { "distill_loss": 0.1740894466638565, "epoch": 9.893262174783189, "step": 29660 }, { "epoch": 9.893262174783189, "ref_ce_loss": 0.039615560322999954, "step": 29660 }, { "epoch": 9.893262174783189, "loss": 0.36313530802726746, "step": 29660 }, { "ce_loss": 0.05687631294131279, "epoch": 9.893262174783189, "step": 29660 }, { "distill_loss": 0.23256993293762207, "epoch": 9.893262174783189, "step": 29660 }, { "epoch": 9.893262174783189, "ref_ce_loss": 0.0732324868440628, "step": 29660 }, { "epoch": 9.896597731821213, "loss": 0.3409, "step": 29670 }, { "epoch": 9.896597731821213, "grad_norm": 0.787322998046875, "step": 29670 }, { "epoch": 9.896597731821213, "learning_rate": 2.2429713808849174e-07, "step": 29670 }, { "epoch": 9.896597731821213, "loss": 0.29244524240493774, "step": 29670 }, { "ce_loss": 0.04288395494222641, "epoch": 9.896597731821213, "step": 29670 }, { "distill_loss": 0.18360666930675507, "epoch": 9.896597731821213, "step": 29670 }, { "epoch": 9.896597731821213, "ref_ce_loss": 0.06513893604278564, "step": 29670 }, { "epoch": 9.896597731821213, "loss": 0.3685804009437561, "step": 29670 }, { "ce_loss": 0.04656131938099861, "epoch": 9.896597731821213, "step": 29670 }, { "distill_loss": 0.165052130818367, "epoch": 9.896597731821213, "step": 29670 }, { "epoch": 9.896597731821213, "ref_ce_loss": 0.0664345920085907, "step": 29670 }, { "epoch": 9.89993328885924, "loss": 0.3623, "step": 29680 }, { "epoch": 9.89993328885924, "grad_norm": 0.9299017786979675, "step": 29680 }, { "epoch": 9.89993328885924, "learning_rate": 2.1006100098173164e-07, "step": 29680 }, { "epoch": 9.89993328885924, "loss": 0.3087444305419922, "step": 29680 }, { "ce_loss": 0.0318371057510376, "epoch": 9.89993328885924, "step": 29680 }, { "distill_loss": 0.1628321409225464, "epoch": 9.89993328885924, "step": 29680 }, { "epoch": 9.89993328885924, "ref_ce_loss": 0.07229151576757431, "step": 29680 }, { "epoch": 9.89993328885924, "loss": 0.7314703464508057, "step": 29680 }, { "ce_loss": 0.02534298598766327, "epoch": 9.89993328885924, "step": 29680 }, { "distill_loss": 0.21285401284694672, "epoch": 9.89993328885924, "step": 29680 }, { "epoch": 9.89993328885924, "ref_ce_loss": 0.06899610161781311, "step": 29680 }, { "epoch": 9.903268845897266, "loss": 0.3585, "step": 29690 }, { "epoch": 9.903268845897266, "grad_norm": 1.0982376337051392, "step": 29690 }, { "epoch": 9.903268845897266, "learning_rate": 1.9629146175295098e-07, "step": 29690 }, { "epoch": 9.903268845897266, "loss": 0.3895324170589447, "step": 29690 }, { "ce_loss": 0.054450660943984985, "epoch": 9.903268845897266, "step": 29690 }, { "distill_loss": 0.18156380951404572, "epoch": 9.903268845897266, "step": 29690 }, { "epoch": 9.903268845897266, "ref_ce_loss": 0.08041262626647949, "step": 29690 }, { "epoch": 9.903268845897266, "loss": 0.26195472478866577, "step": 29690 }, { "ce_loss": 0.04622939974069595, "epoch": 9.903268845897266, "step": 29690 }, { "distill_loss": 0.13963152468204498, "epoch": 9.903268845897266, "step": 29690 }, { "epoch": 9.903268845897266, "ref_ce_loss": 0.05275070294737816, "step": 29690 }, { "epoch": 9.90660440293529, "loss": 0.3262, "step": 29700 }, { "epoch": 9.90660440293529, "grad_norm": 2.3947901725769043, "step": 29700 }, { "epoch": 9.90660440293529, "learning_rate": 1.8298853647267245e-07, "step": 29700 }, { "epoch": 9.90660440293529, "loss": 0.24890460073947906, "step": 29700 }, { "ce_loss": 0.023720307275652885, "epoch": 9.90660440293529, "step": 29700 }, { "distill_loss": 0.15437108278274536, "epoch": 9.90660440293529, "step": 29700 }, { "epoch": 9.90660440293529, "ref_ce_loss": 0.0504058375954628, "step": 29700 }, { "epoch": 9.90660440293529, "loss": 0.28565430641174316, "step": 29700 }, { "ce_loss": 0.034484151750802994, "epoch": 9.90660440293529, "step": 29700 }, { "distill_loss": 0.15133896470069885, "epoch": 9.90660440293529, "step": 29700 }, { "epoch": 9.90660440293529, "ref_ce_loss": 0.06469812989234924, "step": 29700 }, { "epoch": 9.909939959973315, "loss": 0.312, "step": 29710 }, { "epoch": 9.909939959973315, "grad_norm": 1.3704745769500732, "step": 29710 }, { "epoch": 9.909939959973315, "learning_rate": 1.7015224066692092e-07, "step": 29710 }, { "epoch": 9.909939959973315, "loss": 0.4401421546936035, "step": 29710 }, { "ce_loss": 0.07302363961935043, "epoch": 9.909939959973315, "step": 29710 }, { "distill_loss": 0.22405001521110535, "epoch": 9.909939959973315, "step": 29710 }, { "epoch": 9.909939959973315, "ref_ce_loss": 0.06941098719835281, "step": 29710 }, { "epoch": 9.909939959973315, "loss": 0.2773750424385071, "step": 29710 }, { "ce_loss": 0.037247832864522934, "epoch": 9.909939959973315, "step": 29710 }, { "distill_loss": 0.15443992614746094, "epoch": 9.909939959973315, "step": 29710 }, { "epoch": 9.909939959973315, "ref_ce_loss": 0.062117595225572586, "step": 29710 }, { "epoch": 9.913275517011341, "loss": 0.3324, "step": 29720 }, { "epoch": 9.913275517011341, "grad_norm": 1.1411337852478027, "step": 29720 }, { "epoch": 9.913275517011341, "learning_rate": 1.577825893169127e-07, "step": 29720 }, { "epoch": 9.913275517011341, "loss": 0.32584357261657715, "step": 29720 }, { "ce_loss": 0.04434728994965553, "epoch": 9.913275517011341, "step": 29720 }, { "distill_loss": 0.18526092171669006, "epoch": 9.913275517011341, "step": 29720 }, { "epoch": 9.913275517011341, "ref_ce_loss": 0.06991927325725555, "step": 29720 }, { "epoch": 9.913275517011341, "loss": 0.2466956079006195, "step": 29720 }, { "ce_loss": 0.04231082275509834, "epoch": 9.913275517011341, "step": 29720 }, { "distill_loss": 0.16381202638149261, "epoch": 9.913275517011341, "step": 29720 }, { "epoch": 9.913275517011341, "ref_ce_loss": 0.04034992679953575, "step": 29720 }, { "epoch": 9.916611074049367, "loss": 0.3259, "step": 29730 }, { "epoch": 9.916611074049367, "grad_norm": 0.9959713816642761, "step": 29730 }, { "epoch": 9.916611074049367, "learning_rate": 1.4587959685945508e-07, "step": 29730 }, { "epoch": 9.916611074049367, "loss": 0.2603667378425598, "step": 29730 }, { "ce_loss": 0.026754045858979225, "epoch": 9.916611074049367, "step": 29730 }, { "distill_loss": 0.17127679288387299, "epoch": 9.916611074049367, "step": 29730 }, { "epoch": 9.916611074049367, "ref_ce_loss": 0.038138411939144135, "step": 29730 }, { "epoch": 9.916611074049367, "loss": 0.29234760999679565, "step": 29730 }, { "ce_loss": 0.03642908111214638, "epoch": 9.916611074049367, "step": 29730 }, { "distill_loss": 0.16722798347473145, "epoch": 9.916611074049367, "step": 29730 }, { "epoch": 9.916611074049367, "ref_ce_loss": 0.052527159452438354, "step": 29730 }, { "epoch": 9.919946631087392, "loss": 0.3273, "step": 29740 }, { "epoch": 9.919946631087392, "grad_norm": 0.7950869798660278, "step": 29740 }, { "epoch": 9.919946631087392, "learning_rate": 1.3444327718659112e-07, "step": 29740 }, { "epoch": 9.919946631087392, "loss": 0.3004519045352936, "step": 29740 }, { "ce_loss": 0.03613626956939697, "epoch": 9.919946631087392, "step": 29740 }, { "distill_loss": 0.18733999133110046, "epoch": 9.919946631087392, "step": 29740 }, { "epoch": 9.919946631087392, "ref_ce_loss": 0.05546070635318756, "step": 29740 }, { "epoch": 9.919946631087392, "loss": 0.23486210405826569, "step": 29740 }, { "ce_loss": 0.030922196805477142, "epoch": 9.919946631087392, "step": 29740 }, { "distill_loss": 0.11665789783000946, "epoch": 9.919946631087392, "step": 29740 }, { "epoch": 9.919946631087392, "ref_ce_loss": 0.06070510670542717, "step": 29740 }, { "epoch": 9.923282188125416, "loss": 0.3393, "step": 29750 }, { "epoch": 9.923282188125416, "grad_norm": 0.9183117747306824, "step": 29750 }, { "epoch": 9.923282188125416, "learning_rate": 1.2347364364573288e-07, "step": 29750 }, { "epoch": 9.923282188125416, "loss": 0.27825382351875305, "step": 29750 }, { "ce_loss": 0.05024456977844238, "epoch": 9.923282188125416, "step": 29750 }, { "distill_loss": 0.18313361704349518, "epoch": 9.923282188125416, "step": 29750 }, { "epoch": 9.923282188125416, "ref_ce_loss": 0.04465365782380104, "step": 29750 }, { "epoch": 9.923282188125416, "loss": 0.28916436433792114, "step": 29750 }, { "ce_loss": 0.04393957555294037, "epoch": 9.923282188125416, "step": 29750 }, { "distill_loss": 0.17849382758140564, "epoch": 9.923282188125416, "step": 29750 }, { "epoch": 9.923282188125416, "ref_ce_loss": 0.06638529151678085, "step": 29750 }, { "epoch": 9.926617745163442, "loss": 0.3188, "step": 29760 }, { "epoch": 9.926617745163442, "grad_norm": 1.2711544036865234, "step": 29760 }, { "epoch": 9.926617745163442, "learning_rate": 1.1297070903966145e-07, "step": 29760 }, { "epoch": 9.926617745163442, "loss": 0.37991073727607727, "step": 29760 }, { "ce_loss": 0.051403578370809555, "epoch": 9.926617745163442, "step": 29760 }, { "distill_loss": 0.20777808129787445, "epoch": 9.926617745163442, "step": 29760 }, { "epoch": 9.926617745163442, "ref_ce_loss": 0.07273232191801071, "step": 29760 }, { "epoch": 9.926617745163442, "loss": 0.3107052147388458, "step": 29760 }, { "ce_loss": 0.018291166052222252, "epoch": 9.926617745163442, "step": 29760 }, { "distill_loss": 0.18891987204551697, "epoch": 9.926617745163442, "step": 29760 }, { "epoch": 9.926617745163442, "ref_ce_loss": 0.07224489748477936, "step": 29760 }, { "epoch": 9.929953302201469, "loss": 0.3146, "step": 29770 }, { "epoch": 9.929953302201469, "grad_norm": 1.1790691614151, "step": 29770 }, { "epoch": 9.929953302201469, "learning_rate": 1.0293448562634922e-07, "step": 29770 }, { "epoch": 9.929953302201469, "loss": 0.29821261763572693, "step": 29770 }, { "ce_loss": 0.03646909445524216, "epoch": 9.929953302201469, "step": 29770 }, { "distill_loss": 0.18316808342933655, "epoch": 9.929953302201469, "step": 29770 }, { "epoch": 9.929953302201469, "ref_ce_loss": 0.05794721841812134, "step": 29770 }, { "epoch": 9.929953302201469, "loss": 0.2972831130027771, "step": 29770 }, { "ce_loss": 0.05520421639084816, "epoch": 9.929953302201469, "step": 29770 }, { "distill_loss": 0.15047815442085266, "epoch": 9.929953302201469, "step": 29770 }, { "epoch": 9.929953302201469, "ref_ce_loss": 0.07132155448198318, "step": 29770 }, { "epoch": 9.933288859239493, "loss": 0.3568, "step": 29780 }, { "epoch": 9.933288859239493, "grad_norm": 1.081078052520752, "step": 29780 }, { "epoch": 9.933288859239493, "learning_rate": 9.336498511922643e-08, "step": 29780 }, { "epoch": 9.933288859239493, "loss": 0.3281066119670868, "step": 29780 }, { "ce_loss": 0.0696646198630333, "epoch": 9.933288859239493, "step": 29780 }, { "distill_loss": 0.16033565998077393, "epoch": 9.933288859239493, "step": 29780 }, { "epoch": 9.933288859239493, "ref_ce_loss": 0.0567629374563694, "step": 29780 }, { "epoch": 9.933288859239493, "loss": 0.20319396257400513, "step": 29780 }, { "ce_loss": 0.025403108447790146, "epoch": 9.933288859239493, "step": 29780 }, { "distill_loss": 0.11004454642534256, "epoch": 9.933288859239493, "step": 29780 }, { "epoch": 9.933288859239493, "ref_ce_loss": 0.052880749106407166, "step": 29780 }, { "epoch": 9.936624416277517, "loss": 0.3406, "step": 29790 }, { "epoch": 9.936624416277517, "grad_norm": 1.4391337633132935, "step": 29790 }, { "epoch": 9.936624416277517, "learning_rate": 8.426221868687023e-08, "step": 29790 }, { "epoch": 9.936624416277517, "loss": 0.34950166940689087, "step": 29790 }, { "ce_loss": 0.06122155115008354, "epoch": 9.936624416277517, "step": 29790 }, { "distill_loss": 0.17466667294502258, "epoch": 9.936624416277517, "step": 29790 }, { "epoch": 9.936624416277517, "ref_ce_loss": 0.08305674046278, "step": 29790 }, { "epoch": 9.936624416277517, "loss": 0.28921905159950256, "step": 29790 }, { "ce_loss": 0.030770935118198395, "epoch": 9.936624416277517, "step": 29790 }, { "distill_loss": 0.1881738156080246, "epoch": 9.936624416277517, "step": 29790 }, { "epoch": 9.936624416277517, "ref_ce_loss": 0.04746009409427643, "step": 29790 }, { "epoch": 9.939959973315544, "loss": 0.3373, "step": 29800 }, { "epoch": 9.939959973315544, "grad_norm": 2.557058334350586, "step": 29800 }, { "epoch": 9.939959973315544, "learning_rate": 7.562619695327122e-08, "step": 29800 }, { "epoch": 9.939959973315544, "loss": 0.42486026883125305, "step": 29800 }, { "ce_loss": 0.056674372404813766, "epoch": 9.939959973315544, "step": 29800 }, { "distill_loss": 0.19006691873073578, "epoch": 9.939959973315544, "step": 29800 }, { "epoch": 9.939959973315544, "ref_ce_loss": 0.06043754145503044, "step": 29800 }, { "epoch": 9.939959973315544, "loss": 0.3627516031265259, "step": 29800 }, { "ce_loss": 0.041415825486183167, "epoch": 9.939959973315544, "step": 29800 }, { "distill_loss": 0.19671253859996796, "epoch": 9.939959973315544, "step": 29800 }, { "epoch": 9.939959973315544, "ref_ce_loss": 0.08194684237241745, "step": 29800 }, { "epoch": 9.94329553035357, "loss": 0.348, "step": 29810 }, { "epoch": 9.94329553035357, "grad_norm": 1.006632924079895, "step": 29810 }, { "epoch": 9.94329553035357, "learning_rate": 6.74569299974781e-08, "step": 29810 }, { "epoch": 9.94329553035357, "loss": 0.22724904119968414, "step": 29810 }, { "ce_loss": 0.025708623230457306, "epoch": 9.94329553035357, "step": 29810 }, { "distill_loss": 0.13873682916164398, "epoch": 9.94329553035357, "step": 29810 }, { "epoch": 9.94329553035357, "ref_ce_loss": 0.04845819249749184, "step": 29810 }, { "epoch": 9.94329553035357, "loss": 0.23277434706687927, "step": 29810 }, { "ce_loss": 0.016086967661976814, "epoch": 9.94329553035357, "step": 29810 }, { "distill_loss": 0.16063126921653748, "epoch": 9.94329553035357, "step": 29810 }, { "epoch": 9.94329553035357, "ref_ce_loss": 0.03908000513911247, "step": 29810 }, { "epoch": 9.946631087391594, "loss": 0.3277, "step": 29820 }, { "epoch": 9.946631087391594, "grad_norm": 1.4111415147781372, "step": 29820 }, { "epoch": 9.946631087391594, "learning_rate": 5.975442735404179e-08, "step": 29820 }, { "epoch": 9.946631087391594, "loss": 0.27478665113449097, "step": 29820 }, { "ce_loss": 0.033356938511133194, "epoch": 9.946631087391594, "step": 29820 }, { "distill_loss": 0.1275174617767334, "epoch": 9.946631087391594, "step": 29820 }, { "epoch": 9.946631087391594, "ref_ce_loss": 0.07789397239685059, "step": 29820 }, { "epoch": 9.946631087391594, "loss": 0.2878682613372803, "step": 29820 }, { "ce_loss": 0.041646961122751236, "epoch": 9.946631087391594, "step": 29820 }, { "distill_loss": 0.16942068934440613, "epoch": 9.946631087391594, "step": 29820 }, { "epoch": 9.946631087391594, "ref_ce_loss": 0.05287106707692146, "step": 29820 }, { "epoch": 9.949966644429619, "loss": 0.3172, "step": 29830 }, { "epoch": 9.949966644429619, "grad_norm": 1.569103479385376, "step": 29830 }, { "epoch": 9.949966644429619, "learning_rate": 5.251869801248255e-08, "step": 29830 }, { "epoch": 9.949966644429619, "loss": 0.2843485474586487, "step": 29830 }, { "ce_loss": 0.02224954031407833, "epoch": 9.949966644429619, "step": 29830 }, { "distill_loss": 0.14809651672840118, "epoch": 9.949966644429619, "step": 29830 }, { "epoch": 9.949966644429619, "ref_ce_loss": 0.05193183571100235, "step": 29830 }, { "epoch": 9.949966644429619, "loss": 0.2797556519508362, "step": 29830 }, { "ce_loss": 0.042071759700775146, "epoch": 9.949966644429619, "step": 29830 }, { "distill_loss": 0.1801556646823883, "epoch": 9.949966644429619, "step": 29830 }, { "epoch": 9.949966644429619, "ref_ce_loss": 0.057339444756507874, "step": 29830 }, { "epoch": 9.953302201467645, "loss": 0.3358, "step": 29840 }, { "epoch": 9.953302201467645, "grad_norm": 1.1686184406280518, "step": 29840 }, { "epoch": 9.953302201467645, "learning_rate": 4.5749750417733994e-08, "step": 29840 }, { "epoch": 9.953302201467645, "loss": 0.34728512167930603, "step": 29840 }, { "ce_loss": 0.028365690261125565, "epoch": 9.953302201467645, "step": 29840 }, { "distill_loss": 0.2081349641084671, "epoch": 9.953302201467645, "step": 29840 }, { "epoch": 9.953302201467645, "ref_ce_loss": 0.07218490540981293, "step": 29840 }, { "epoch": 9.953302201467645, "loss": 0.23002469539642334, "step": 29840 }, { "ce_loss": 0.03352981060743332, "epoch": 9.953302201467645, "step": 29840 }, { "distill_loss": 0.14592669904232025, "epoch": 9.953302201467645, "step": 29840 }, { "epoch": 9.953302201467645, "ref_ce_loss": 0.03543630242347717, "step": 29840 }, { "epoch": 9.956637758505671, "loss": 0.3346, "step": 29850 }, { "epoch": 9.956637758505671, "grad_norm": 0.8662817478179932, "step": 29850 }, { "epoch": 9.956637758505671, "learning_rate": 3.944759246992113e-08, "step": 29850 }, { "epoch": 9.956637758505671, "loss": 0.345037043094635, "step": 29850 }, { "ce_loss": 0.04612068086862564, "epoch": 9.956637758505671, "step": 29850 }, { "distill_loss": 0.19077233970165253, "epoch": 9.956637758505671, "step": 29850 }, { "epoch": 9.956637758505671, "ref_ce_loss": 0.056209057569503784, "step": 29850 }, { "epoch": 9.956637758505671, "loss": 0.3140699565410614, "step": 29850 }, { "ce_loss": 0.04668711870908737, "epoch": 9.956637758505671, "step": 29850 }, { "distill_loss": 0.18919126689434052, "epoch": 9.956637758505671, "step": 29850 }, { "epoch": 9.956637758505671, "ref_ce_loss": 0.05338788405060768, "step": 29850 }, { "epoch": 9.959973315543696, "loss": 0.351, "step": 29860 }, { "epoch": 9.959973315543696, "grad_norm": 0.947829008102417, "step": 29860 }, { "epoch": 9.959973315543696, "learning_rate": 3.361223152427151e-08, "step": 29860 }, { "epoch": 9.959973315543696, "loss": 0.21806228160858154, "step": 29860 }, { "ce_loss": 0.03290942311286926, "epoch": 9.959973315543696, "step": 29860 }, { "distill_loss": 0.140326589345932, "epoch": 9.959973315543696, "step": 29860 }, { "epoch": 9.959973315543696, "ref_ce_loss": 0.04470537230372429, "step": 29860 }, { "epoch": 9.959973315543696, "loss": 0.37957972288131714, "step": 29860 }, { "ce_loss": 0.04370058327913284, "epoch": 9.959973315543696, "step": 29860 }, { "distill_loss": 0.18686765432357788, "epoch": 9.959973315543696, "step": 29860 }, { "epoch": 9.959973315543696, "ref_ce_loss": 0.07017166912555695, "step": 29860 }, { "epoch": 9.96330887258172, "loss": 0.3735, "step": 29870 }, { "epoch": 9.96330887258172, "grad_norm": 1.2191232442855835, "step": 29870 }, { "epoch": 9.96330887258172, "learning_rate": 2.824367439133724e-08, "step": 29870 }, { "epoch": 9.96330887258172, "loss": 0.3440263867378235, "step": 29870 }, { "ce_loss": 0.05631376802921295, "epoch": 9.96330887258172, "step": 29870 }, { "distill_loss": 0.19631989300251007, "epoch": 9.96330887258172, "step": 29870 }, { "epoch": 9.96330887258172, "ref_ce_loss": 0.07191712409257889, "step": 29870 }, { "epoch": 9.96330887258172, "loss": 0.2361232340335846, "step": 29870 }, { "ce_loss": 0.023900402709841728, "epoch": 9.96330887258172, "step": 29870 }, { "distill_loss": 0.15150588750839233, "epoch": 9.96330887258172, "step": 29870 }, { "epoch": 9.96330887258172, "ref_ce_loss": 0.04112594574689865, "step": 29870 }, { "epoch": 9.966644429619747, "loss": 0.3224, "step": 29880 }, { "epoch": 9.966644429619747, "grad_norm": 0.8823342323303223, "step": 29880 }, { "epoch": 9.966644429619747, "learning_rate": 2.3341927336772986e-08, "step": 29880 }, { "epoch": 9.966644429619747, "loss": 0.42281824350357056, "step": 29880 }, { "ce_loss": 0.03692351654171944, "epoch": 9.966644429619747, "step": 29880 }, { "distill_loss": 0.22779785096645355, "epoch": 9.966644429619747, "step": 29880 }, { "epoch": 9.966644429619747, "ref_ce_loss": 0.06586943566799164, "step": 29880 }, { "epoch": 9.966644429619747, "loss": 0.2905508279800415, "step": 29880 }, { "ce_loss": 0.011483977548778057, "epoch": 9.966644429619747, "step": 29880 }, { "distill_loss": 0.18474937975406647, "epoch": 9.966644429619747, "step": 29880 }, { "epoch": 9.966644429619747, "ref_ce_loss": 0.07037250697612762, "step": 29880 }, { "epoch": 9.969979986657773, "loss": 0.3249, "step": 29890 }, { "epoch": 9.969979986657773, "grad_norm": 0.8157691359519958, "step": 29890 }, { "epoch": 9.969979986657773, "learning_rate": 1.8906996081424765e-08, "step": 29890 }, { "epoch": 9.969979986657773, "loss": 0.9363209009170532, "step": 29890 }, { "ce_loss": 0.04010238125920296, "epoch": 9.969979986657773, "step": 29890 }, { "distill_loss": 0.12882179021835327, "epoch": 9.969979986657773, "step": 29890 }, { "epoch": 9.969979986657773, "ref_ce_loss": 0.042074572294950485, "step": 29890 }, { "epoch": 9.969979986657773, "loss": 0.20374956727027893, "step": 29890 }, { "ce_loss": 0.018373459577560425, "epoch": 9.969979986657773, "step": 29890 }, { "distill_loss": 0.14011499285697937, "epoch": 9.969979986657773, "step": 29890 }, { "epoch": 9.969979986657773, "ref_ce_loss": 0.030234120786190033, "step": 29890 }, { "epoch": 9.973315543695797, "loss": 0.3742, "step": 29900 }, { "epoch": 9.973315543695797, "grad_norm": 0.848705530166626, "step": 29900 }, { "epoch": 9.973315543695797, "learning_rate": 1.493888580137437e-08, "step": 29900 }, { "epoch": 9.973315543695797, "loss": 0.2602086365222931, "step": 29900 }, { "ce_loss": 0.03535810858011246, "epoch": 9.973315543695797, "step": 29900 }, { "distill_loss": 0.14325125515460968, "epoch": 9.973315543695797, "step": 29900 }, { "epoch": 9.973315543695797, "ref_ce_loss": 0.053540270775556564, "step": 29900 }, { "epoch": 9.973315543695797, "loss": 0.22060787677764893, "step": 29900 }, { "ce_loss": 0.02888793684542179, "epoch": 9.973315543695797, "step": 29900 }, { "distill_loss": 0.13869284093379974, "epoch": 9.973315543695797, "step": 29900 }, { "epoch": 9.973315543695797, "ref_ce_loss": 0.0529271699488163, "step": 29900 }, { "epoch": 9.976651100733822, "loss": 0.3507, "step": 29910 }, { "epoch": 9.976651100733822, "grad_norm": 1.8673821687698364, "step": 29910 }, { "epoch": 9.976651100733822, "learning_rate": 1.1437601127850527e-08, "step": 29910 }, { "epoch": 9.976651100733822, "loss": 0.5325973033905029, "step": 29910 }, { "ce_loss": 0.08878213167190552, "epoch": 9.976651100733822, "step": 29910 }, { "distill_loss": 0.230632483959198, "epoch": 9.976651100733822, "step": 29910 }, { "epoch": 9.976651100733822, "ref_ce_loss": 0.08264369517564774, "step": 29910 }, { "epoch": 9.976651100733822, "loss": 0.3474941551685333, "step": 29910 }, { "ce_loss": 0.038783978670835495, "epoch": 9.976651100733822, "step": 29910 }, { "distill_loss": 0.18959704041481018, "epoch": 9.976651100733822, "step": 29910 }, { "epoch": 9.976651100733822, "ref_ce_loss": 0.05137256532907486, "step": 29910 }, { "epoch": 9.979986657771848, "loss": 0.3746, "step": 29920 }, { "epoch": 9.979986657771848, "grad_norm": 0.875569224357605, "step": 29920 }, { "epoch": 9.979986657771848, "learning_rate": 8.403146147140106e-09, "step": 29920 }, { "epoch": 9.979986657771848, "loss": 0.2730117440223694, "step": 29920 }, { "ce_loss": 0.03304222598671913, "epoch": 9.979986657771848, "step": 29920 }, { "distill_loss": 0.17824025452136993, "epoch": 9.979986657771848, "step": 29920 }, { "epoch": 9.979986657771848, "ref_ce_loss": 0.06153935194015503, "step": 29920 }, { "epoch": 9.979986657771848, "loss": 0.2830379605293274, "step": 29920 }, { "ce_loss": 0.024901097640395164, "epoch": 9.979986657771848, "step": 29920 }, { "distill_loss": 0.16472214460372925, "epoch": 9.979986657771848, "step": 29920 }, { "epoch": 9.979986657771848, "ref_ce_loss": 0.05382291227579117, "step": 29920 }, { "epoch": 9.983322214809874, "loss": 0.3348, "step": 29930 }, { "epoch": 9.983322214809874, "grad_norm": 1.073372721672058, "step": 29930 }, { "epoch": 9.983322214809874, "learning_rate": 5.835524400854553e-09, "step": 29930 }, { "epoch": 9.983322214809874, "loss": 0.3343769907951355, "step": 29930 }, { "ce_loss": 0.04512088745832443, "epoch": 9.983322214809874, "step": 29930 }, { "distill_loss": 0.13358773291110992, "epoch": 9.983322214809874, "step": 29930 }, { "epoch": 9.983322214809874, "ref_ce_loss": 0.07107693701982498, "step": 29930 }, { "epoch": 9.983322214809874, "loss": 0.29759302735328674, "step": 29930 }, { "ce_loss": 0.025530952960252762, "epoch": 9.983322214809874, "step": 29930 }, { "distill_loss": 0.16844472289085388, "epoch": 9.983322214809874, "step": 29930 }, { "epoch": 9.983322214809874, "ref_ce_loss": 0.060609109699726105, "step": 29930 }, { "epoch": 9.986657771847899, "loss": 0.3355, "step": 29940 }, { "epoch": 9.986657771847899, "grad_norm": 9.1760835647583, "step": 29940 }, { "epoch": 9.986657771847899, "learning_rate": 3.7347388857078554e-09, "step": 29940 }, { "epoch": 9.986657771847899, "loss": 0.29515504837036133, "step": 29940 }, { "ce_loss": 0.037243470549583435, "epoch": 9.986657771847899, "step": 29940 }, { "distill_loss": 0.1859862208366394, "epoch": 9.986657771847899, "step": 29940 }, { "epoch": 9.986657771847899, "ref_ce_loss": 0.04966844245791435, "step": 29940 }, { "epoch": 9.986657771847899, "loss": 0.3100915849208832, "step": 29940 }, { "ce_loss": 0.039968203753232956, "epoch": 9.986657771847899, "step": 29940 }, { "distill_loss": 0.19227924942970276, "epoch": 9.986657771847899, "step": 29940 }, { "epoch": 9.986657771847899, "ref_ce_loss": 0.038896676152944565, "step": 29940 }, { "epoch": 9.989993328885923, "loss": 0.37, "step": 29950 }, { "epoch": 9.989993328885923, "grad_norm": 1.4707958698272705, "step": 29950 }, { "epoch": 9.989993328885923, "learning_rate": 2.1007920534277248e-09, "step": 29950 }, { "epoch": 9.989993328885923, "loss": 0.4190083146095276, "step": 29950 }, { "ce_loss": 0.021757889539003372, "epoch": 9.989993328885923, "step": 29950 }, { "distill_loss": 0.17043279111385345, "epoch": 9.989993328885923, "step": 29950 }, { "epoch": 9.989993328885923, "ref_ce_loss": 0.0661436915397644, "step": 29950 }, { "epoch": 9.989993328885923, "loss": 0.4909171462059021, "step": 29950 }, { "ce_loss": 0.02767323888838291, "epoch": 9.989993328885923, "step": 29950 }, { "distill_loss": 0.1564040184020996, "epoch": 9.989993328885923, "step": 29950 }, { "epoch": 9.989993328885923, "ref_ce_loss": 0.06023760512471199, "step": 29950 }, { "epoch": 9.99332888592395, "loss": 0.3398, "step": 29960 }, { "epoch": 9.99332888592395, "grad_norm": 1.0020558834075928, "step": 29960 }, { "epoch": 9.99332888592395, "learning_rate": 9.336858111552715e-10, "step": 29960 }, { "epoch": 9.99332888592395, "loss": 0.2585368752479553, "step": 29960 }, { "ce_loss": 0.032630253583192825, "epoch": 9.99332888592395, "step": 29960 }, { "distill_loss": 0.1443522572517395, "epoch": 9.99332888592395, "step": 29960 }, { "epoch": 9.99332888592395, "ref_ce_loss": 0.030707091093063354, "step": 29960 }, { "epoch": 9.99332888592395, "loss": 0.31847187876701355, "step": 29960 }, { "ce_loss": 0.02757086232304573, "epoch": 9.99332888592395, "step": 29960 }, { "distill_loss": 0.18047651648521423, "epoch": 9.99332888592395, "step": 29960 }, { "epoch": 9.99332888592395, "ref_ce_loss": 0.07202086597681046, "step": 29960 }, { "epoch": 9.996664442961976, "loss": 0.3786, "step": 29970 }, { "epoch": 9.996664442961976, "grad_norm": 1.3275409936904907, "step": 29970 }, { "epoch": 9.996664442961976, "learning_rate": 2.3342152091210265e-10, "step": 29970 }, { "epoch": 9.996664442961976, "loss": 0.26913413405418396, "step": 29970 }, { "ce_loss": 0.058668166399002075, "epoch": 9.996664442961976, "step": 29970 }, { "distill_loss": 0.15468889474868774, "epoch": 9.996664442961976, "step": 29970 }, { "epoch": 9.996664442961976, "ref_ce_loss": 0.039234500378370285, "step": 29970 }, { "epoch": 9.996664442961976, "loss": 0.5042160749435425, "step": 29970 }, { "ce_loss": 0.036590516567230225, "epoch": 9.996664442961976, "step": 29970 }, { "distill_loss": 0.20055048167705536, "epoch": 9.996664442961976, "step": 29970 }, { "epoch": 9.996664442961976, "ref_ce_loss": 0.051263727247714996, "step": 29970 }, { "epoch": 10.0, "loss": 0.3577, "step": 29980 }, { "epoch": 10.0, "grad_norm": 1.942399501800537, "step": 29980 }, { "epoch": 10.0, "learning_rate": 0.0, "step": 29980 }, { "epoch": 10.0, "step": 29980, "train_runtime": 69684.4226 }, { "epoch": 10.0, "step": 29980, "train_samples_per_second": 55.066 }, { "epoch": 10.0, "step": 29980, "train_steps_per_second": 0.43 }, { "epoch": 10.0, "step": 29980, "total_flos": 0.0 }, { "epoch": 10.0, "step": 29980, "train_loss": 0.8395808646009316 } ], "logging_steps": 10, "max_steps": 29980, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }