{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5767012687427913, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014417531718569781, "grad_norm": 3808.0, "kd_loss": 0.4765625, "learning_rate": 1.3333333333333334e-06, "loss": 2.4865, "step": 5, "student_loss": 1.2782293558120728, "teacher_loss": 0.0020202866289764643 }, { "epoch": 0.0028835063437139563, "grad_norm": 496.0, "kd_loss": 0.453125, "learning_rate": 3e-06, "loss": 2.0957, "step": 10, "student_loss": 1.0292338132858276, "teacher_loss": 0.005245466250926256 }, { "epoch": 0.004325259515570935, "grad_norm": 238.0, "kd_loss": 0.4453125, "learning_rate": 4.666666666666667e-06, "loss": 1.9295, "step": 15, "student_loss": 0.631219208240509, "teacher_loss": 0.0013347615022212267 }, { "epoch": 0.0057670126874279125, "grad_norm": 133.0, "kd_loss": 0.44140625, "learning_rate": 6.333333333333333e-06, "loss": 1.8503, "step": 20, "student_loss": 1.4391331672668457, "teacher_loss": 0.0005473981145769358 }, { "epoch": 0.00720876585928489, "grad_norm": 75.5, "kd_loss": 0.3984375, "learning_rate": 8.000000000000001e-06, "loss": 1.0972, "step": 25, "student_loss": 0.18605084717273712, "teacher_loss": 0.0011649713851511478 }, { "epoch": 0.00865051903114187, "grad_norm": 24.25, "kd_loss": 0.376953125, "learning_rate": 9.666666666666667e-06, "loss": 0.5883, "step": 30, "student_loss": 0.05895603448152542, "teacher_loss": 0.030797353014349937 }, { "epoch": 0.010092272202998846, "grad_norm": 9.1875, "kd_loss": 0.318359375, "learning_rate": 9.99958042442916e-06, "loss": 0.4657, "step": 35, "student_loss": 0.00801500491797924, "teacher_loss": 0.04946871101856232 }, { "epoch": 0.011534025374855825, "grad_norm": 7.09375, "kd_loss": 0.296875, "learning_rate": 9.997876019358083e-06, "loss": 0.402, "step": 40, "student_loss": 0.27407729625701904, "teacher_loss": 0.04900167137384415 }, { "epoch": 0.012975778546712802, "grad_norm": 10.6875, "kd_loss": 0.265625, "learning_rate": 9.99486100792044e-06, "loss": 0.3281, "step": 45, "student_loss": 0.3285456597805023, "teacher_loss": 0.003808467648923397 }, { "epoch": 0.01441753171856978, "grad_norm": 21.875, "kd_loss": 0.2578125, "learning_rate": 9.990536180750724e-06, "loss": 0.351, "step": 50, "student_loss": 0.03528054431080818, "teacher_loss": 0.04187293350696564 }, { "epoch": 0.015859284890426758, "grad_norm": 228.0, "kd_loss": 0.2421875, "learning_rate": 9.984902671959911e-06, "loss": 0.3368, "step": 55, "student_loss": 0.004587164148688316, "teacher_loss": 0.0026238495483994484 }, { "epoch": 0.01730103806228374, "grad_norm": 6.4375, "kd_loss": 0.23046875, "learning_rate": 9.97796195883804e-06, "loss": 0.3291, "step": 60, "student_loss": 0.048888176679611206, "teacher_loss": 0.0033420324325561523 }, { "epoch": 0.018742791234140715, "grad_norm": 6.375, "kd_loss": 0.2099609375, "learning_rate": 9.969715861466839e-06, "loss": 0.3147, "step": 65, "student_loss": 0.11189773678779602, "teacher_loss": 0.0413309670984745 }, { "epoch": 0.020184544405997693, "grad_norm": 4.09375, "kd_loss": 0.208984375, "learning_rate": 9.96016654224243e-06, "loss": 0.3096, "step": 70, "student_loss": 0.026729928329586983, "teacher_loss": 0.0019827873911708593 }, { "epoch": 0.02162629757785467, "grad_norm": 4.25, "kd_loss": 0.1748046875, "learning_rate": 9.94931650530827e-06, "loss": 0.2729, "step": 75, "student_loss": 0.006473238579928875, "teacher_loss": 0.005465318448841572 }, { "epoch": 0.02306805074971165, "grad_norm": 3.859375, "kd_loss": 0.16796875, "learning_rate": 9.93716859589851e-06, "loss": 0.2662, "step": 80, "student_loss": 0.00662571657449007, "teacher_loss": 0.004144964274019003 }, { "epoch": 0.024509803921568627, "grad_norm": 2.78125, "kd_loss": 0.1640625, "learning_rate": 9.923725999591846e-06, "loss": 0.2261, "step": 85, "student_loss": 0.004623747896403074, "teacher_loss": 0.002723712706938386 }, { "epoch": 0.025951557093425604, "grad_norm": 5.5625, "kd_loss": 0.1943359375, "learning_rate": 9.908992241476189e-06, "loss": 0.2543, "step": 90, "student_loss": 0.11273087561130524, "teacher_loss": 0.0015502618625760078 }, { "epoch": 0.027393310265282585, "grad_norm": 3.4375, "kd_loss": 0.189453125, "learning_rate": 9.892971185224244e-06, "loss": 0.2267, "step": 95, "student_loss": 0.006197327747941017, "teacher_loss": 0.008993545547127724 }, { "epoch": 0.02883506343713956, "grad_norm": 6.34375, "kd_loss": 0.134765625, "learning_rate": 9.875667032080354e-06, "loss": 0.2274, "step": 100, "student_loss": 0.0032730416860431433, "teacher_loss": 0.0036007578019052744 }, { "epoch": 0.03027681660899654, "grad_norm": 4.0, "kd_loss": 0.146484375, "learning_rate": 9.857084319758772e-06, "loss": 0.2421, "step": 105, "student_loss": 0.04058241471648216, "teacher_loss": 0.0012296679196879268 }, { "epoch": 0.031718569780853516, "grad_norm": 4.6875, "kd_loss": 0.1787109375, "learning_rate": 9.837227921253747e-06, "loss": 0.2273, "step": 110, "student_loss": 0.004547884222120047, "teacher_loss": 0.023880567401647568 }, { "epoch": 0.03316032295271049, "grad_norm": 2.625, "kd_loss": 0.14453125, "learning_rate": 9.816103043561648e-06, "loss": 0.2142, "step": 115, "student_loss": 0.001855566632002592, "teacher_loss": 0.0016716865357011557 }, { "epoch": 0.03460207612456748, "grad_norm": 3.078125, "kd_loss": 0.1357421875, "learning_rate": 9.79371522631553e-06, "loss": 0.2149, "step": 120, "student_loss": 0.019737211987376213, "teacher_loss": 0.0027425403241068125 }, { "epoch": 0.036043829296424454, "grad_norm": 3.390625, "kd_loss": 0.146484375, "learning_rate": 9.770070340332457e-06, "loss": 0.1956, "step": 125, "student_loss": 0.10938042402267456, "teacher_loss": 0.001064821844920516 }, { "epoch": 0.03748558246828143, "grad_norm": 3.734375, "kd_loss": 0.146484375, "learning_rate": 9.745174586073982e-06, "loss": 0.2099, "step": 130, "student_loss": 0.0035836249589920044, "teacher_loss": 0.002439548959955573 }, { "epoch": 0.03892733564013841, "grad_norm": 3.0, "kd_loss": 0.1337890625, "learning_rate": 9.719034492020183e-06, "loss": 0.202, "step": 135, "student_loss": 0.003862213110551238, "teacher_loss": 0.0010834896238520741 }, { "epoch": 0.040369088811995385, "grad_norm": 7.1875, "kd_loss": 0.1357421875, "learning_rate": 9.691656912957686e-06, "loss": 0.218, "step": 140, "student_loss": 0.0022195407655090094, "teacher_loss": 0.0014228483196347952 }, { "epoch": 0.04181084198385236, "grad_norm": 3.703125, "kd_loss": 0.150390625, "learning_rate": 9.663049028182112e-06, "loss": 0.2077, "step": 145, "student_loss": 0.11333022266626358, "teacher_loss": 0.00682886503636837 }, { "epoch": 0.04325259515570934, "grad_norm": 2.859375, "kd_loss": 0.14453125, "learning_rate": 9.633218339615433e-06, "loss": 0.1935, "step": 150, "student_loss": 0.0032606760505586863, "teacher_loss": 0.0031273479107767344 }, { "epoch": 0.04469434832756632, "grad_norm": 3.578125, "kd_loss": 0.12451171875, "learning_rate": 9.602172669838721e-06, "loss": 0.2199, "step": 155, "student_loss": 0.0088576041162014, "teacher_loss": 0.0016166985733434558 }, { "epoch": 0.0461361014994233, "grad_norm": 4.09375, "kd_loss": 0.1708984375, "learning_rate": 9.569920160040815e-06, "loss": 0.2018, "step": 160, "student_loss": 0.13294154405593872, "teacher_loss": 0.03791189566254616 }, { "epoch": 0.04757785467128028, "grad_norm": 3.921875, "kd_loss": 0.138671875, "learning_rate": 9.536469267883432e-06, "loss": 0.208, "step": 165, "student_loss": 0.002772042527794838, "teacher_loss": 0.005522818770259619 }, { "epoch": 0.049019607843137254, "grad_norm": 5.03125, "kd_loss": 0.126953125, "learning_rate": 9.501828765283295e-06, "loss": 0.1962, "step": 170, "student_loss": 0.003656906308606267, "teacher_loss": 0.0018494409741833806 }, { "epoch": 0.05046136101499423, "grad_norm": 3.515625, "kd_loss": 0.12451171875, "learning_rate": 9.466007736111846e-06, "loss": 0.1935, "step": 175, "student_loss": 0.017079656943678856, "teacher_loss": 0.0010717228287830949 }, { "epoch": 0.05190311418685121, "grad_norm": 5.03125, "kd_loss": 0.11669921875, "learning_rate": 9.429015573813163e-06, "loss": 0.1861, "step": 180, "student_loss": 0.003456867765635252, "teacher_loss": 0.0010596277425065637 }, { "epoch": 0.05334486735870819, "grad_norm": 4.0625, "kd_loss": 0.1455078125, "learning_rate": 9.390861978940687e-06, "loss": 0.1921, "step": 185, "student_loss": 0.31187787652015686, "teacher_loss": 0.0008243238553404808 }, { "epoch": 0.05478662053056517, "grad_norm": 5.09375, "kd_loss": 0.1416015625, "learning_rate": 9.351556956613423e-06, "loss": 0.2044, "step": 190, "student_loss": 0.011734717525541782, "teacher_loss": 0.0015390698099508882 }, { "epoch": 0.056228373702422146, "grad_norm": 4.5, "kd_loss": 0.1337890625, "learning_rate": 9.31111081389227e-06, "loss": 0.1778, "step": 195, "student_loss": 0.05741060897707939, "teacher_loss": 0.0007789382943883538 }, { "epoch": 0.05767012687427912, "grad_norm": 2.28125, "kd_loss": 0.12890625, "learning_rate": 9.269534157077177e-06, "loss": 0.1743, "step": 200, "student_loss": 0.0014264394994825125, "teacher_loss": 0.0006533891428261995 }, { "epoch": 0.0591118800461361, "grad_norm": 4.25, "kd_loss": 0.1376953125, "learning_rate": 9.226837888925813e-06, "loss": 0.1969, "step": 205, "student_loss": 0.0015782959526404738, "teacher_loss": 0.0368424728512764 }, { "epoch": 0.06055363321799308, "grad_norm": 7.5625, "kd_loss": 0.1123046875, "learning_rate": 9.183033205794525e-06, "loss": 0.1836, "step": 210, "student_loss": 0.01342203002423048, "teacher_loss": 0.0011842605890706182 }, { "epoch": 0.061995386389850055, "grad_norm": 3.0, "kd_loss": 0.126953125, "learning_rate": 9.13813159470227e-06, "loss": 0.1824, "step": 215, "student_loss": 0.0014404732501134276, "teacher_loss": 0.0007589462329633534 }, { "epoch": 0.06343713956170703, "grad_norm": 7.125, "kd_loss": 0.1328125, "learning_rate": 9.092144830318357e-06, "loss": 0.21, "step": 220, "student_loss": 0.2732444703578949, "teacher_loss": 0.00960276648402214 }, { "epoch": 0.06487889273356401, "grad_norm": 6.15625, "kd_loss": 0.1083984375, "learning_rate": 9.045084971874738e-06, "loss": 0.1941, "step": 225, "student_loss": 0.2691424489021301, "teacher_loss": 0.003315337933599949 }, { "epoch": 0.06632064590542099, "grad_norm": 2.46875, "kd_loss": 0.10986328125, "learning_rate": 8.99696436000368e-06, "loss": 0.1702, "step": 230, "student_loss": 0.08001423627138138, "teacher_loss": 0.009347192943096161 }, { "epoch": 0.06776239907727798, "grad_norm": 3.734375, "kd_loss": 0.1337890625, "learning_rate": 8.947795613501658e-06, "loss": 0.1778, "step": 235, "student_loss": 0.003426821669563651, "teacher_loss": 0.0008243515621870756 }, { "epoch": 0.06920415224913495, "grad_norm": 3.421875, "kd_loss": 0.10205078125, "learning_rate": 8.897591626020284e-06, "loss": 0.1928, "step": 240, "student_loss": 0.004231320694088936, "teacher_loss": 0.0010777115821838379 }, { "epoch": 0.07064590542099193, "grad_norm": 3.640625, "kd_loss": 0.11181640625, "learning_rate": 8.846365562685178e-06, "loss": 0.1721, "step": 245, "student_loss": 0.003189836163073778, "teacher_loss": 0.0029278292786329985 }, { "epoch": 0.07208765859284891, "grad_norm": 3.28125, "kd_loss": 0.107421875, "learning_rate": 8.794130856643635e-06, "loss": 0.1624, "step": 250, "student_loss": 0.0030161093454807997, "teacher_loss": 0.0015030049253255129 }, { "epoch": 0.07352941176470588, "grad_norm": 2.671875, "kd_loss": 0.1171875, "learning_rate": 8.74090120554202e-06, "loss": 0.181, "step": 255, "student_loss": 0.0010447532404214144, "teacher_loss": 0.0014572968939319253 }, { "epoch": 0.07497116493656286, "grad_norm": 3.03125, "kd_loss": 0.125, "learning_rate": 8.686690567933803e-06, "loss": 0.18, "step": 260, "student_loss": 0.002235305029898882, "teacher_loss": 0.03705403953790665 }, { "epoch": 0.07641291810841984, "grad_norm": 3.84375, "kd_loss": 0.10107421875, "learning_rate": 8.63151315961915e-06, "loss": 0.1751, "step": 265, "student_loss": 0.0019888102542608976, "teacher_loss": 0.0012628042604774237 }, { "epoch": 0.07785467128027682, "grad_norm": 4.03125, "kd_loss": 0.103515625, "learning_rate": 8.575383449917103e-06, "loss": 0.1698, "step": 270, "student_loss": 0.009670126251876354, "teacher_loss": 0.0018007074249908328 }, { "epoch": 0.07929642445213379, "grad_norm": 5.0625, "kd_loss": 0.11669921875, "learning_rate": 8.518316157871232e-06, "loss": 0.1792, "step": 275, "student_loss": 0.0027291348669677973, "teacher_loss": 0.03865275904536247 }, { "epoch": 0.08073817762399077, "grad_norm": 6.40625, "kd_loss": 0.12451171875, "learning_rate": 8.460326248389825e-06, "loss": 0.1868, "step": 280, "student_loss": 0.0005779159837402403, "teacher_loss": 0.0004988706787116826 }, { "epoch": 0.08217993079584775, "grad_norm": 5.03125, "kd_loss": 0.1083984375, "learning_rate": 8.401428928321607e-06, "loss": 0.1777, "step": 285, "student_loss": 0.00322159961797297, "teacher_loss": 0.0016653644852340221 }, { "epoch": 0.08362168396770472, "grad_norm": 4.15625, "kd_loss": 0.119140625, "learning_rate": 8.341639642468002e-06, "loss": 0.2245, "step": 290, "student_loss": 0.025423452258110046, "teacher_loss": 0.006107364781200886 }, { "epoch": 0.0850634371395617, "grad_norm": 2.21875, "kd_loss": 0.10498046875, "learning_rate": 8.280974069532999e-06, "loss": 0.1742, "step": 295, "student_loss": 0.0032805479131639004, "teacher_loss": 0.002079744590446353 }, { "epoch": 0.08650519031141868, "grad_norm": 4.875, "kd_loss": 0.1220703125, "learning_rate": 8.219448118011687e-06, "loss": 0.1698, "step": 300, "student_loss": 0.05386965721845627, "teacher_loss": 0.0015291464515030384 }, { "epoch": 0.08794694348327567, "grad_norm": 2.703125, "kd_loss": 0.09375, "learning_rate": 8.157077922018537e-06, "loss": 0.1735, "step": 305, "student_loss": 0.007909238338470459, "teacher_loss": 0.0032228778582066298 }, { "epoch": 0.08938869665513265, "grad_norm": 6.375, "kd_loss": 0.091796875, "learning_rate": 8.093879837056486e-06, "loss": 0.1662, "step": 310, "student_loss": 0.0014559343690052629, "teacher_loss": 0.0014570873463526368 }, { "epoch": 0.09083044982698962, "grad_norm": 6.09375, "kd_loss": 0.09716796875, "learning_rate": 8.029870435728018e-06, "loss": 0.1905, "step": 315, "student_loss": 0.13904070854187012, "teacher_loss": 0.00045576939010061324 }, { "epoch": 0.0922722029988466, "grad_norm": 8.75, "kd_loss": 0.091796875, "learning_rate": 7.965066503389264e-06, "loss": 0.1801, "step": 320, "student_loss": 0.0017298327293246984, "teacher_loss": 0.001036101020872593 }, { "epoch": 0.09371395617070358, "grad_norm": 5.53125, "kd_loss": 0.15625, "learning_rate": 7.89948503374835e-06, "loss": 0.1636, "step": 325, "student_loss": 0.0033407427836209536, "teacher_loss": 0.02077825367450714 }, { "epoch": 0.09515570934256055, "grad_norm": 5.09375, "kd_loss": 0.10693359375, "learning_rate": 7.833143224409076e-06, "loss": 0.1884, "step": 330, "student_loss": 0.006418874487280846, "teacher_loss": 0.0011637036222964525 }, { "epoch": 0.09659746251441753, "grad_norm": 4.71875, "kd_loss": 0.09716796875, "learning_rate": 7.766058472361154e-06, "loss": 0.1577, "step": 335, "student_loss": 0.0016794800758361816, "teacher_loss": 0.0023754944559186697 }, { "epoch": 0.09803921568627451, "grad_norm": 5.09375, "kd_loss": 0.09326171875, "learning_rate": 7.698248369418146e-06, "loss": 0.1589, "step": 340, "student_loss": 0.044694170355796814, "teacher_loss": 0.007826481945812702 }, { "epoch": 0.09948096885813149, "grad_norm": 5.375, "kd_loss": 0.0966796875, "learning_rate": 7.629730697604314e-06, "loss": 0.1807, "step": 345, "student_loss": 0.09194417297840118, "teacher_loss": 0.0007945778197608888 }, { "epoch": 0.10092272202998846, "grad_norm": 5.875, "kd_loss": 0.11865234375, "learning_rate": 7.560523424491595e-06, "loss": 0.1526, "step": 350, "student_loss": 0.005946993827819824, "teacher_loss": 0.0006145219667814672 }, { "epoch": 0.10236447520184544, "grad_norm": 9.1875, "kd_loss": 0.10302734375, "learning_rate": 7.490644698487909e-06, "loss": 0.1627, "step": 355, "student_loss": 0.0015843416331335902, "teacher_loss": 0.0014968032483011484 }, { "epoch": 0.10380622837370242, "grad_norm": 12.625, "kd_loss": 0.10791015625, "learning_rate": 7.420112844078066e-06, "loss": 0.1682, "step": 360, "student_loss": 0.01987134851515293, "teacher_loss": 0.001595525536686182 }, { "epoch": 0.1052479815455594, "grad_norm": 6.5625, "kd_loss": 0.103515625, "learning_rate": 7.348946357018479e-06, "loss": 0.1509, "step": 365, "student_loss": 0.006010106764733791, "teacher_loss": 0.032394833862781525 }, { "epoch": 0.10668973471741638, "grad_norm": 6.6875, "kd_loss": 0.1181640625, "learning_rate": 7.277163899486975e-06, "loss": 0.1623, "step": 370, "student_loss": 0.15845070779323578, "teacher_loss": 0.0004756299313157797 }, { "epoch": 0.10813148788927336, "grad_norm": 4.625, "kd_loss": 0.1240234375, "learning_rate": 7.204784295188959e-06, "loss": 0.1506, "step": 375, "student_loss": 0.10649572312831879, "teacher_loss": 0.02242193929851055 }, { "epoch": 0.10957324106113034, "grad_norm": 3.4375, "kd_loss": 0.1025390625, "learning_rate": 7.1318265244212305e-06, "loss": 0.1752, "step": 380, "student_loss": 0.00281524658203125, "teacher_loss": 0.0015117195434868336 }, { "epoch": 0.11101499423298732, "grad_norm": 2.359375, "kd_loss": 0.10205078125, "learning_rate": 7.05830971909472e-06, "loss": 0.1547, "step": 385, "student_loss": 0.0016335069667547941, "teacher_loss": 0.0012311713071539998 }, { "epoch": 0.11245674740484429, "grad_norm": 4.125, "kd_loss": 0.10205078125, "learning_rate": 6.9842531577174865e-06, "loss": 0.1538, "step": 390, "student_loss": 0.0012884392635896802, "teacher_loss": 0.001418368425220251 }, { "epoch": 0.11389850057670127, "grad_norm": 5.1875, "kd_loss": 0.1025390625, "learning_rate": 6.9096762603392595e-06, "loss": 0.1698, "step": 395, "student_loss": 0.0018499374855309725, "teacher_loss": 0.0013690440682694316 }, { "epoch": 0.11534025374855825, "grad_norm": 6.53125, "kd_loss": 0.10546875, "learning_rate": 6.834598583458862e-06, "loss": 0.16, "step": 400, "student_loss": 0.0014036521315574646, "teacher_loss": 0.00040830764919519424 }, { "epoch": 0.11678200692041522, "grad_norm": 4.71875, "kd_loss": 0.0888671875, "learning_rate": 6.7590398148958625e-06, "loss": 0.1718, "step": 405, "student_loss": 0.10261467099189758, "teacher_loss": 0.0006754493806511164 }, { "epoch": 0.1182237600922722, "grad_norm": 4.3125, "kd_loss": 0.171875, "learning_rate": 6.6830197686277945e-06, "loss": 0.1878, "step": 410, "student_loss": 0.4882833659648895, "teacher_loss": 0.00981883890926838 }, { "epoch": 0.11966551326412918, "grad_norm": 2.8125, "kd_loss": 0.1083984375, "learning_rate": 6.6065583795942625e-06, "loss": 0.182, "step": 415, "student_loss": 0.03729023039340973, "teacher_loss": 0.0042837257497012615 }, { "epoch": 0.12110726643598616, "grad_norm": 4.65625, "kd_loss": 0.1064453125, "learning_rate": 6.52967569846937e-06, "loss": 0.1607, "step": 420, "student_loss": 0.05881139263510704, "teacher_loss": 0.024456653743982315 }, { "epoch": 0.12254901960784313, "grad_norm": 4.25, "kd_loss": 0.10009765625, "learning_rate": 6.452391886403767e-06, "loss": 0.1674, "step": 425, "student_loss": 0.05037780851125717, "teacher_loss": 0.0040146904066205025 }, { "epoch": 0.12399077277970011, "grad_norm": 4.75, "kd_loss": 0.1748046875, "learning_rate": 6.374727209737743e-06, "loss": 0.1766, "step": 430, "student_loss": 0.00238221138715744, "teacher_loss": 0.06439146399497986 }, { "epoch": 0.1254325259515571, "grad_norm": 5.53125, "kd_loss": 0.1015625, "learning_rate": 6.296702034686726e-06, "loss": 0.1714, "step": 435, "student_loss": 0.002659996272996068, "teacher_loss": 0.0022907655220478773 }, { "epoch": 0.12687427912341406, "grad_norm": 3.078125, "kd_loss": 0.1845703125, "learning_rate": 6.218336822000598e-06, "loss": 0.1775, "step": 440, "student_loss": 0.46329638361930847, "teacher_loss": 0.008188321255147457 }, { "epoch": 0.12831603229527105, "grad_norm": 3.953125, "kd_loss": 0.1259765625, "learning_rate": 6.139652121598219e-06, "loss": 0.1769, "step": 445, "student_loss": 0.0006292742909863591, "teacher_loss": 0.02016839198768139 }, { "epoch": 0.12975778546712802, "grad_norm": 3.53125, "kd_loss": 0.10400390625, "learning_rate": 6.060668567178561e-06, "loss": 0.1663, "step": 450, "student_loss": 0.002717025112360716, "teacher_loss": 0.0016874197172001004 }, { "epoch": 0.131199538638985, "grad_norm": 2.671875, "kd_loss": 0.087890625, "learning_rate": 5.981406870809889e-06, "loss": 0.1748, "step": 455, "student_loss": 0.012300008907914162, "teacher_loss": 0.0016890015685930848 }, { "epoch": 0.13264129181084197, "grad_norm": 5.1875, "kd_loss": 0.10791015625, "learning_rate": 5.9018878174983674e-06, "loss": 0.17, "step": 460, "student_loss": 0.03240777552127838, "teacher_loss": 0.0010722745209932327 }, { "epoch": 0.13408304498269896, "grad_norm": 2.765625, "kd_loss": 0.1328125, "learning_rate": 5.822132259737565e-06, "loss": 0.1858, "step": 465, "student_loss": 0.0023128872271627188, "teacher_loss": 0.0006816239329054952 }, { "epoch": 0.13552479815455595, "grad_norm": 5.3125, "kd_loss": 0.09521484375, "learning_rate": 5.742161112040237e-06, "loss": 0.1887, "step": 470, "student_loss": 0.0013243159046396613, "teacher_loss": 0.0008191668312065303 }, { "epoch": 0.13696655132641292, "grad_norm": 4.34375, "kd_loss": 0.12890625, "learning_rate": 5.661995345453867e-06, "loss": 0.1479, "step": 475, "student_loss": 0.0022922754287719727, "teacher_loss": 0.0007053640438243747 }, { "epoch": 0.1384083044982699, "grad_norm": 4.1875, "kd_loss": 0.10888671875, "learning_rate": 5.581655982061367e-06, "loss": 0.2052, "step": 480, "student_loss": 0.016067378222942352, "teacher_loss": 0.0467948317527771 }, { "epoch": 0.13985005767012687, "grad_norm": 4.625, "kd_loss": 0.09521484375, "learning_rate": 5.501164089468406e-06, "loss": 0.1535, "step": 485, "student_loss": 0.001838831347413361, "teacher_loss": 0.0018535954877734184 }, { "epoch": 0.14129181084198386, "grad_norm": 4.3125, "kd_loss": 0.099609375, "learning_rate": 5.4205407752787884e-06, "loss": 0.1702, "step": 490, "student_loss": 0.0021060549188405275, "teacher_loss": 0.0013811348471790552 }, { "epoch": 0.14273356401384082, "grad_norm": 5.0, "kd_loss": 0.12109375, "learning_rate": 5.339807181559359e-06, "loss": 0.1698, "step": 495, "student_loss": 0.004670781549066305, "teacher_loss": 0.0008499641553498805 }, { "epoch": 0.14417531718569782, "grad_norm": 7.28125, "kd_loss": 0.1005859375, "learning_rate": 5.258984479295853e-06, "loss": 0.1663, "step": 500, "student_loss": 0.0009078571456484497, "teacher_loss": 0.0008732817368581891 }, { "epoch": 0.14561707035755478, "grad_norm": 4.8125, "kd_loss": 0.09521484375, "learning_rate": 5.1780938628411795e-06, "loss": 0.1857, "step": 505, "student_loss": 0.002319552004337311, "teacher_loss": 0.0009417013498023152 }, { "epoch": 0.14705882352941177, "grad_norm": 5.5, "kd_loss": 0.09130859375, "learning_rate": 5.097156544357567e-06, "loss": 0.168, "step": 510, "student_loss": 0.001904567121528089, "teacher_loss": 0.0011415554909035563 }, { "epoch": 0.14850057670126873, "grad_norm": 4.9375, "kd_loss": 0.11474609375, "learning_rate": 5.016193748254045e-06, "loss": 0.1561, "step": 515, "student_loss": 0.004430091939866543, "teacher_loss": 0.000705283775459975 }, { "epoch": 0.14994232987312572, "grad_norm": 4.0, "kd_loss": 0.1005859375, "learning_rate": 4.935226705620699e-06, "loss": 0.1742, "step": 520, "student_loss": 0.4650050103664398, "teacher_loss": 0.011486685834825039 }, { "epoch": 0.1513840830449827, "grad_norm": 2.28125, "kd_loss": 0.09423828125, "learning_rate": 4.8542766486612035e-06, "loss": 0.1568, "step": 525, "student_loss": 0.004688178189098835, "teacher_loss": 0.0005817305063828826 }, { "epoch": 0.15282583621683968, "grad_norm": 6.21875, "kd_loss": 0.1025390625, "learning_rate": 4.773364805125025e-06, "loss": 0.1569, "step": 530, "student_loss": 0.002902889158576727, "teacher_loss": 0.0036108619533479214 }, { "epoch": 0.15426758938869667, "grad_norm": 2.8125, "kd_loss": 0.0947265625, "learning_rate": 4.6925123927408265e-06, "loss": 0.146, "step": 535, "student_loss": 0.004958340898156166, "teacher_loss": 0.0009314365452155471 }, { "epoch": 0.15570934256055363, "grad_norm": 5.09375, "kd_loss": 0.1083984375, "learning_rate": 4.611740613652485e-06, "loss": 0.1485, "step": 540, "student_loss": 0.022316506132483482, "teacher_loss": 0.0009606878156773746 }, { "epoch": 0.15715109573241062, "grad_norm": 5.90625, "kd_loss": 0.095703125, "learning_rate": 4.531070648859186e-06, "loss": 0.171, "step": 545, "student_loss": 0.005919112823903561, "teacher_loss": 0.016547029837965965 }, { "epoch": 0.15859284890426759, "grad_norm": 4.375, "kd_loss": 0.1123046875, "learning_rate": 4.450523652661086e-06, "loss": 0.142, "step": 550, "student_loss": 0.0007885328959673643, "teacher_loss": 0.0045303236693143845 }, { "epoch": 0.16003460207612458, "grad_norm": 4.03125, "kd_loss": 0.09326171875, "learning_rate": 4.370120747111956e-06, "loss": 0.1566, "step": 555, "student_loss": 0.0045122369192540646, "teacher_loss": 0.0012258175993338227 }, { "epoch": 0.16147635524798154, "grad_norm": 5.1875, "kd_loss": 0.09423828125, "learning_rate": 4.289883016480291e-06, "loss": 0.1694, "step": 560, "student_loss": 0.038154710084199905, "teacher_loss": 0.00046423348248936236 }, { "epoch": 0.16291810841983853, "grad_norm": 3.34375, "kd_loss": 0.109375, "learning_rate": 4.209831501720328e-06, "loss": 0.1557, "step": 565, "student_loss": 0.018078487366437912, "teacher_loss": 0.021091489121317863 }, { "epoch": 0.1643598615916955, "grad_norm": 5.21875, "kd_loss": 0.1689453125, "learning_rate": 4.129987194954421e-06, "loss": 0.17, "step": 570, "student_loss": 0.15178009867668152, "teacher_loss": 0.0086033521220088 }, { "epoch": 0.16580161476355249, "grad_norm": 2.890625, "kd_loss": 0.08544921875, "learning_rate": 4.050371033968216e-06, "loss": 0.1651, "step": 575, "student_loss": 0.0016716659301891923, "teacher_loss": 0.0008001797832548618 }, { "epoch": 0.16724336793540945, "grad_norm": 4.1875, "kd_loss": 0.236328125, "learning_rate": 3.9710038967200825e-06, "loss": 0.1443, "step": 580, "student_loss": 0.004638470709323883, "teacher_loss": 0.006588623858988285 }, { "epoch": 0.16868512110726644, "grad_norm": 2.890625, "kd_loss": 0.0966796875, "learning_rate": 3.89190659586623e-06, "loss": 0.1551, "step": 585, "student_loss": 0.00187311926856637, "teacher_loss": 0.0005596915725618601 }, { "epoch": 0.1701268742791234, "grad_norm": 6.65625, "kd_loss": 0.091796875, "learning_rate": 3.8130998733029517e-06, "loss": 0.1722, "step": 590, "student_loss": 0.017516393214464188, "teacher_loss": 0.002362610539421439 }, { "epoch": 0.1715686274509804, "grad_norm": 2.234375, "kd_loss": 0.09423828125, "learning_rate": 3.734604394727419e-06, "loss": 0.1736, "step": 595, "student_loss": 0.0015100985765457153, "teacher_loss": 0.0012370356125757098 }, { "epoch": 0.17301038062283736, "grad_norm": 5.375, "kd_loss": 0.1064453125, "learning_rate": 3.656440744218464e-06, "loss": 0.1822, "step": 600, "student_loss": 0.3471376895904541, "teacher_loss": 0.006922336760908365 }, { "epoch": 0.17445213379469435, "grad_norm": 5.65625, "kd_loss": 0.10400390625, "learning_rate": 3.578629418838757e-06, "loss": 0.1706, "step": 605, "student_loss": 0.09560892730951309, "teacher_loss": 0.04084807634353638 }, { "epoch": 0.17589388696655134, "grad_norm": 3.609375, "kd_loss": 0.10888671875, "learning_rate": 3.5011908232598124e-06, "loss": 0.1418, "step": 610, "student_loss": 0.0035140912514179945, "teacher_loss": 0.0005105194286443293 }, { "epoch": 0.1773356401384083, "grad_norm": 3.375, "kd_loss": 0.10498046875, "learning_rate": 3.4241452644112085e-06, "loss": 0.1453, "step": 615, "student_loss": 0.0014288002857938409, "teacher_loss": 0.001070382189936936 }, { "epoch": 0.1787773933102653, "grad_norm": 2.578125, "kd_loss": 0.1416015625, "learning_rate": 3.3475129461554567e-06, "loss": 0.1677, "step": 620, "student_loss": 0.0047634500078856945, "teacher_loss": 0.009211473166942596 }, { "epoch": 0.18021914648212226, "grad_norm": 3.578125, "kd_loss": 0.09814453125, "learning_rate": 3.271313963989886e-06, "loss": 0.1556, "step": 625, "student_loss": 0.019517898559570312, "teacher_loss": 0.004466219339519739 }, { "epoch": 0.18166089965397925, "grad_norm": 4.21875, "kd_loss": 0.10009765625, "learning_rate": 3.195568299776945e-06, "loss": 0.1587, "step": 630, "student_loss": 0.09341763705015182, "teacher_loss": 0.0017769263358786702 }, { "epoch": 0.1831026528258362, "grad_norm": 4.21875, "kd_loss": 0.09130859375, "learning_rate": 3.1202958165043053e-06, "loss": 0.1877, "step": 635, "student_loss": 0.0012313922634348273, "teacher_loss": 0.0007036713068373501 }, { "epoch": 0.1845444059976932, "grad_norm": 5.84375, "kd_loss": 0.0966796875, "learning_rate": 3.045516253076137e-06, "loss": 0.1654, "step": 640, "student_loss": 0.001555976108647883, "teacher_loss": 0.0010528129059821367 }, { "epoch": 0.18598615916955016, "grad_norm": 7.96875, "kd_loss": 0.09228515625, "learning_rate": 2.9712492191369245e-06, "loss": 0.1564, "step": 645, "student_loss": 0.0033667683601379395, "teacher_loss": 0.0009755496867001057 }, { "epoch": 0.18742791234140715, "grad_norm": 2.921875, "kd_loss": 0.11376953125, "learning_rate": 2.8975141899291777e-06, "loss": 0.1552, "step": 650, "student_loss": 0.001696955063380301, "teacher_loss": 0.0012513434048742056 }, { "epoch": 0.18886966551326412, "grad_norm": 3.234375, "kd_loss": 0.08544921875, "learning_rate": 2.8243305011863843e-06, "loss": 0.1481, "step": 655, "student_loss": 0.027264071628451347, "teacher_loss": 0.0005043753772042692 }, { "epoch": 0.1903114186851211, "grad_norm": 3.84375, "kd_loss": 0.0966796875, "learning_rate": 2.751717344062552e-06, "loss": 0.1658, "step": 660, "student_loss": 0.006026037037372589, "teacher_loss": 0.0037035837303847075 }, { "epoch": 0.19175317185697807, "grad_norm": 3.96875, "kd_loss": 0.11181640625, "learning_rate": 2.6796937600996587e-06, "loss": 0.1585, "step": 665, "student_loss": 0.0023006678093224764, "teacher_loss": 0.0006673650932498276 }, { "epoch": 0.19319492502883506, "grad_norm": 4.125, "kd_loss": 0.08837890625, "learning_rate": 2.6082786362343377e-06, "loss": 0.1818, "step": 670, "student_loss": 0.0015634546289220452, "teacher_loss": 0.0005979883135296404 }, { "epoch": 0.19463667820069205, "grad_norm": 3.59375, "kd_loss": 0.09814453125, "learning_rate": 2.5374906998451094e-06, "loss": 0.1598, "step": 675, "student_loss": 0.0016033351421356201, "teacher_loss": 0.001516613527201116 }, { "epoch": 0.19607843137254902, "grad_norm": 3.625, "kd_loss": 0.162109375, "learning_rate": 2.467348513841447e-06, "loss": 0.1566, "step": 680, "student_loss": 0.15181653201580048, "teacher_loss": 0.04114415496587753 }, { "epoch": 0.197520184544406, "grad_norm": 2.703125, "kd_loss": 0.15625, "learning_rate": 2.3978704717959777e-06, "loss": 0.154, "step": 685, "student_loss": 0.0007377453148365021, "teacher_loss": 0.0339120589196682 }, { "epoch": 0.19896193771626297, "grad_norm": 3.15625, "kd_loss": 0.09521484375, "learning_rate": 2.329074793121085e-06, "loss": 0.1582, "step": 690, "student_loss": 0.0044479165226221085, "teacher_loss": 0.012265580706298351 }, { "epoch": 0.20040369088811996, "grad_norm": 3.234375, "kd_loss": 0.10693359375, "learning_rate": 2.260979518291186e-06, "loss": 0.1724, "step": 695, "student_loss": 0.015444566495716572, "teacher_loss": 0.010763188824057579 }, { "epoch": 0.20184544405997693, "grad_norm": 3.75, "kd_loss": 0.091796875, "learning_rate": 2.1936025041119268e-06, "loss": 0.1753, "step": 700, "student_loss": 0.0019369354704394937, "teacher_loss": 0.0009062191820703447 }, { "epoch": 0.20328719723183392, "grad_norm": 3.296875, "kd_loss": 0.1044921875, "learning_rate": 2.1269614190375477e-06, "loss": 0.1584, "step": 705, "student_loss": 0.001297777402214706, "teacher_loss": 0.0018579652532935143 }, { "epoch": 0.20472895040369088, "grad_norm": 3.75, "kd_loss": 0.10400390625, "learning_rate": 2.061073738537635e-06, "loss": 0.1901, "step": 710, "student_loss": 0.08087821304798126, "teacher_loss": 0.004622929729521275 }, { "epoch": 0.20617070357554787, "grad_norm": 4.1875, "kd_loss": 0.08935546875, "learning_rate": 1.9959567405144825e-06, "loss": 0.1863, "step": 715, "student_loss": 0.009472950361669064, "teacher_loss": 0.007570087444037199 }, { "epoch": 0.20761245674740483, "grad_norm": 4.34375, "kd_loss": 0.11767578125, "learning_rate": 1.931627500772263e-06, "loss": 0.1746, "step": 720, "student_loss": 0.001279592514038086, "teacher_loss": 0.004464911296963692 }, { "epoch": 0.20905420991926182, "grad_norm": 2.25, "kd_loss": 0.10888671875, "learning_rate": 1.8681028885391905e-06, "loss": 0.1528, "step": 725, "student_loss": 0.0024647831451147795, "teacher_loss": 0.0011802453082054853 }, { "epoch": 0.2104959630911188, "grad_norm": 2.671875, "kd_loss": 0.0986328125, "learning_rate": 1.8053995620438625e-06, "loss": 0.152, "step": 730, "student_loss": 0.04315745085477829, "teacher_loss": 0.00156076205894351 }, { "epoch": 0.21193771626297578, "grad_norm": 4.78125, "kd_loss": 0.10498046875, "learning_rate": 1.743533964146924e-06, "loss": 0.1704, "step": 735, "student_loss": 0.0016925001982599497, "teacher_loss": 0.0004609136376529932 }, { "epoch": 0.21337946943483277, "grad_norm": 3.9375, "kd_loss": 0.1142578125, "learning_rate": 1.6825223180292138e-06, "loss": 0.1432, "step": 740, "student_loss": 0.012965809553861618, "teacher_loss": 0.0004022814682684839 }, { "epoch": 0.21482122260668973, "grad_norm": 5.1875, "kd_loss": 0.12890625, "learning_rate": 1.6223806229375182e-06, "loss": 0.1491, "step": 745, "student_loss": 0.1358025223016739, "teacher_loss": 0.02106391452252865 }, { "epoch": 0.21626297577854672, "grad_norm": 4.4375, "kd_loss": 0.11328125, "learning_rate": 1.563124649989043e-06, "loss": 0.1605, "step": 750, "student_loss": 0.10271821916103363, "teacher_loss": 0.004584586247801781 }, { "epoch": 0.2177047289504037, "grad_norm": 4.28125, "kd_loss": 0.091796875, "learning_rate": 1.5047699380357134e-06, "loss": 0.1681, "step": 755, "student_loss": 0.1378186047077179, "teacher_loss": 0.006503281649202108 }, { "epoch": 0.21914648212226068, "grad_norm": 9.5, "kd_loss": 0.0908203125, "learning_rate": 1.4473317895893773e-06, "loss": 0.16, "step": 760, "student_loss": 0.4880536198616028, "teacher_loss": 0.00078756851144135 }, { "epoch": 0.22058823529411764, "grad_norm": 6.96875, "kd_loss": 0.11328125, "learning_rate": 1.39082526680899e-06, "loss": 0.1728, "step": 765, "student_loss": 0.07314120978116989, "teacher_loss": 0.0007181121036410332 }, { "epoch": 0.22202998846597463, "grad_norm": 2.65625, "kd_loss": 0.09912109375, "learning_rate": 1.3352651875508204e-06, "loss": 0.1513, "step": 770, "student_loss": 0.004254591651260853, "teacher_loss": 0.0007984668482095003 }, { "epoch": 0.2234717416378316, "grad_norm": 3.640625, "kd_loss": 0.1044921875, "learning_rate": 1.2806661214827286e-06, "loss": 0.1587, "step": 775, "student_loss": 0.002741985023021698, "teacher_loss": 0.0007013682625256479 }, { "epoch": 0.22491349480968859, "grad_norm": 4.53125, "kd_loss": 0.09765625, "learning_rate": 1.2270423862635188e-06, "loss": 0.1708, "step": 780, "student_loss": 0.0015033041127026081, "teacher_loss": 0.0006990543915890157 }, { "epoch": 0.22635524798154555, "grad_norm": 3.90625, "kd_loss": 0.09423828125, "learning_rate": 1.1744080437883859e-06, "loss": 0.1409, "step": 785, "student_loss": 0.001736114383675158, "teacher_loss": 0.001990710385143757 }, { "epoch": 0.22779700115340254, "grad_norm": 9.25, "kd_loss": 0.11083984375, "learning_rate": 1.1227768965014246e-06, "loss": 0.1804, "step": 790, "student_loss": 0.03133748471736908, "teacher_loss": 0.008059236221015453 }, { "epoch": 0.2292387543252595, "grad_norm": 3.84375, "kd_loss": 0.11474609375, "learning_rate": 1.0721624837761768e-06, "loss": 0.1703, "step": 795, "student_loss": 0.005942783784121275, "teacher_loss": 0.0006947139045223594 }, { "epoch": 0.2306805074971165, "grad_norm": 3.921875, "kd_loss": 0.166015625, "learning_rate": 1.0225780783651689e-06, "loss": 0.1879, "step": 800, "student_loss": 0.05473716929554939, "teacher_loss": 0.03766282647848129 }, { "epoch": 0.23212226066897348, "grad_norm": 4.34375, "kd_loss": 0.08642578125, "learning_rate": 9.740366829193587e-07, "loss": 0.1824, "step": 805, "student_loss": 0.001553440117277205, "teacher_loss": 0.0012171101989224553 }, { "epoch": 0.23356401384083045, "grad_norm": 7.34375, "kd_loss": 0.09423828125, "learning_rate": 9.265510265784189e-07, "loss": 0.1771, "step": 810, "student_loss": 0.0017513898201286793, "teacher_loss": 0.0006000488647259772 }, { "epoch": 0.23500576701268744, "grad_norm": 5.46875, "kd_loss": 0.09912109375, "learning_rate": 8.801335616327378e-07, "loss": 0.1664, "step": 815, "student_loss": 0.007318615913391113, "teacher_loss": 0.010634765028953552 }, { "epoch": 0.2364475201845444, "grad_norm": 3.90625, "kd_loss": 0.11474609375, "learning_rate": 8.347964602580245e-07, "loss": 0.1615, "step": 820, "student_loss": 0.04161018878221512, "teacher_loss": 0.001969601958990097 }, { "epoch": 0.2378892733564014, "grad_norm": 4.65625, "kd_loss": 0.1142578125, "learning_rate": 7.905516113233652e-07, "loss": 0.1532, "step": 825, "student_loss": 0.000943031394854188, "teacher_loss": 0.020420216023921967 }, { "epoch": 0.23933102652825836, "grad_norm": 2.875, "kd_loss": 0.09521484375, "learning_rate": 7.474106172735746e-07, "loss": 0.1601, "step": 830, "student_loss": 0.018866391852498055, "teacher_loss": 0.0037704347632825375 }, { "epoch": 0.24077277970011535, "grad_norm": 5.09375, "kd_loss": 0.0888671875, "learning_rate": 7.053847910866513e-07, "loss": 0.1552, "step": 835, "student_loss": 0.12261331081390381, "teacher_loss": 0.005213484168052673 }, { "epoch": 0.2422145328719723, "grad_norm": 2.84375, "kd_loss": 0.11767578125, "learning_rate": 6.644851533071556e-07, "loss": 0.1478, "step": 840, "student_loss": 0.0071019199676811695, "teacher_loss": 0.0005135077517479658 }, { "epoch": 0.2436562860438293, "grad_norm": 4.6875, "kd_loss": 0.1337890625, "learning_rate": 6.24722429156251e-07, "loss": 0.228, "step": 845, "student_loss": 0.0022525617387145758, "teacher_loss": 0.0012849880149587989 }, { "epoch": 0.24509803921568626, "grad_norm": 6.5625, "kd_loss": 0.0966796875, "learning_rate": 5.861070457192081e-07, "loss": 0.1695, "step": 850, "student_loss": 0.06699959933757782, "teacher_loss": 0.0007787467329762876 }, { "epoch": 0.24653979238754326, "grad_norm": 2.46875, "kd_loss": 0.10546875, "learning_rate": 5.486491292110796e-07, "loss": 0.1498, "step": 855, "student_loss": 0.0011905976571142673, "teacher_loss": 0.0006796009838581085 }, { "epoch": 0.24798154555940022, "grad_norm": 3.40625, "kd_loss": 0.10107421875, "learning_rate": 5.123585023212785e-07, "loss": 0.1846, "step": 860, "student_loss": 0.005176758859306574, "teacher_loss": 0.0015740481903776526 }, { "epoch": 0.2494232987312572, "grad_norm": 3.46875, "kd_loss": 0.095703125, "learning_rate": 4.772446816377408e-07, "loss": 0.1519, "step": 865, "student_loss": 0.0017797622131183743, "teacher_loss": 0.001129323965869844 }, { "epoch": 0.2508650519031142, "grad_norm": 2.515625, "kd_loss": 0.09326171875, "learning_rate": 4.4331687515137614e-07, "loss": 0.1724, "step": 870, "student_loss": 0.0048367022536695, "teacher_loss": 0.0008864006958901882 }, { "epoch": 0.25230680507497116, "grad_norm": 2.390625, "kd_loss": 0.09619140625, "learning_rate": 4.1058397984142405e-07, "loss": 0.1396, "step": 875, "student_loss": 0.0008503241115249693, "teacher_loss": 0.0008235117420554161 }, { "epoch": 0.2537485582468281, "grad_norm": 3.0625, "kd_loss": 0.09423828125, "learning_rate": 3.790545793423761e-07, "loss": 0.1662, "step": 880, "student_loss": 0.0019780993461608887, "teacher_loss": 0.0007267682813107967 }, { "epoch": 0.25519031141868515, "grad_norm": 3.25, "kd_loss": 0.1025390625, "learning_rate": 3.4873694169306915e-07, "loss": 0.1567, "step": 885, "student_loss": 0.020344872027635574, "teacher_loss": 0.056762393563985825 }, { "epoch": 0.2566320645905421, "grad_norm": 2.796875, "kd_loss": 0.09326171875, "learning_rate": 3.196390171685343e-07, "loss": 0.1636, "step": 890, "student_loss": 0.001583437086082995, "teacher_loss": 0.00122374901548028 }, { "epoch": 0.25807381776239907, "grad_norm": 4.09375, "kd_loss": 0.11083984375, "learning_rate": 2.917684361951728e-07, "loss": 0.1583, "step": 895, "student_loss": 0.11338726431131363, "teacher_loss": 0.006426448002457619 }, { "epoch": 0.25951557093425603, "grad_norm": 3.34375, "kd_loss": 0.1201171875, "learning_rate": 2.65132507349814e-07, "loss": 0.1934, "step": 900, "student_loss": 0.0020212531089782715, "teacher_loss": 0.03192909434437752 }, { "epoch": 0.26095732410611305, "grad_norm": 3.3125, "kd_loss": 0.09716796875, "learning_rate": 2.397382154431621e-07, "loss": 0.1627, "step": 905, "student_loss": 0.0032915188930928707, "teacher_loss": 0.0014988789334893227 }, { "epoch": 0.26239907727797, "grad_norm": 4.0, "kd_loss": 0.0986328125, "learning_rate": 2.1559221968815547e-07, "loss": 0.182, "step": 910, "student_loss": 0.001489490270614624, "teacher_loss": 0.0012580000329762697 }, { "epoch": 0.263840830449827, "grad_norm": 3.390625, "kd_loss": 0.09716796875, "learning_rate": 1.9270085195370048e-07, "loss": 0.143, "step": 915, "student_loss": 0.04323554039001465, "teacher_loss": 0.001784435473382473 }, { "epoch": 0.26528258362168394, "grad_norm": 2.546875, "kd_loss": 0.1318359375, "learning_rate": 1.7107011510424766e-07, "loss": 0.1721, "step": 920, "student_loss": 0.01360052265226841, "teacher_loss": 0.018217744305729866 }, { "epoch": 0.26672433679354096, "grad_norm": 3.71875, "kd_loss": 0.087890625, "learning_rate": 1.5070568142564912e-07, "loss": 0.1489, "step": 925, "student_loss": 0.0011945515871047974, "teacher_loss": 0.0009807685855776072 }, { "epoch": 0.2681660899653979, "grad_norm": 3.40625, "kd_loss": 0.1142578125, "learning_rate": 1.3161289113769405e-07, "loss": 0.1539, "step": 930, "student_loss": 0.055781442672014236, "teacher_loss": 0.0011405627010390162 }, { "epoch": 0.2696078431372549, "grad_norm": 2.84375, "kd_loss": 0.10009765625, "learning_rate": 1.1379675099373489e-07, "loss": 0.1501, "step": 935, "student_loss": 0.005637112073600292, "teacher_loss": 0.002441459335386753 }, { "epoch": 0.2710495963091119, "grad_norm": 5.625, "kd_loss": 0.18359375, "learning_rate": 9.726193296774767e-08, "loss": 0.1684, "step": 940, "student_loss": 0.011978531256318092, "teacher_loss": 0.009902331046760082 }, { "epoch": 0.27249134948096887, "grad_norm": 5.375, "kd_loss": 0.0966796875, "learning_rate": 8.201277302919086e-08, "loss": 0.1661, "step": 945, "student_loss": 0.12388397753238678, "teacher_loss": 0.0019162542885169387 }, { "epoch": 0.27393310265282583, "grad_norm": 3.046875, "kd_loss": 0.12451171875, "learning_rate": 6.805327000596995e-08, "loss": 0.1539, "step": 950, "student_loss": 0.004596967715770006, "teacher_loss": 0.0005623517790809274 }, { "epoch": 0.2753748558246828, "grad_norm": 4.3125, "kd_loss": 0.083984375, "learning_rate": 5.538708453581787e-08, "loss": 0.1616, "step": 955, "student_loss": 0.007824474945664406, "teacher_loss": 0.0011617924319580197 }, { "epoch": 0.2768166089965398, "grad_norm": 8.0625, "kd_loss": 0.0947265625, "learning_rate": 4.40175381063529e-08, "loss": 0.1586, "step": 960, "student_loss": 0.0018909722566604614, "teacher_loss": 0.00250077061355114 }, { "epoch": 0.2782583621683968, "grad_norm": 4.40625, "kd_loss": 0.162109375, "learning_rate": 3.394761218407705e-08, "loss": 0.1666, "step": 965, "student_loss": 0.11850693821907043, "teacher_loss": 0.008037789724767208 }, { "epoch": 0.27970011534025374, "grad_norm": 3.1875, "kd_loss": 0.11083984375, "learning_rate": 2.5179947432540376e-08, "loss": 0.1665, "step": 970, "student_loss": 0.0006995275616645813, "teacher_loss": 0.0004927213303744793 }, { "epoch": 0.2811418685121107, "grad_norm": 2.9375, "kd_loss": 0.087890625, "learning_rate": 1.7716843019867646e-08, "loss": 0.1614, "step": 975, "student_loss": 0.11272090673446655, "teacher_loss": 0.002362866187468171 }, { "epoch": 0.2825836216839677, "grad_norm": 4.6875, "kd_loss": 0.0927734375, "learning_rate": 1.156025601584676e-08, "loss": 0.1578, "step": 980, "student_loss": 0.002314644167199731, "teacher_loss": 0.0005237645236775279 }, { "epoch": 0.2840253748558247, "grad_norm": 5.28125, "kd_loss": 0.09814453125, "learning_rate": 6.711800878718144e-09, "loss": 0.1708, "step": 985, "student_loss": 0.0012468647910282016, "teacher_loss": 0.0008435134077444673 }, { "epoch": 0.28546712802768165, "grad_norm": 9.3125, "kd_loss": 0.09130859375, "learning_rate": 3.1727490318111953e-09, "loss": 0.1632, "step": 990, "student_loss": 0.002036402700468898, "teacher_loss": 0.0007938549388200045 }, { "epoch": 0.2869088811995386, "grad_norm": 4.5, "kd_loss": 0.10986328125, "learning_rate": 9.440285301370865e-10, "loss": 0.183, "step": 995, "student_loss": 0.0015997332520782948, "teacher_loss": 0.00439803209155798 }, { "epoch": 0.28835063437139563, "grad_norm": 2.375, "kd_loss": 0.09423828125, "learning_rate": 2.622381702066523e-11, "loss": 0.1477, "step": 1000, "student_loss": 0.07939934730529785, "teacher_loss": 0.0005344336968846619 }, { "epoch": 0.28835063437139563, "kd_loss": 0.09423828125, "step": 1000, "student_loss": 0.07939934730529785, "teacher_loss": 0.0005344336968846619, "total_flos": 0.0, "train_loss": 0.22623604363203048, "train_runtime": 7596.2458, "train_samples_per_second": 2.106, "train_steps_per_second": 0.132 }, { "epoch": 0.2897923875432526, "grad_norm": 3.265625, "kd_loss": 0.1083984375, "learning_rate": 7.75705864825114e-06, "loss": 0.1639, "step": 1005, "student_loss": 0.021362992003560066, "teacher_loss": 0.07951661199331284 }, { "epoch": 0.29123414071510956, "grad_norm": 7.375, "kd_loss": 0.09326171875, "learning_rate": 7.734502946076656e-06, "loss": 0.1608, "step": 1010, "student_loss": 0.0017111932393163443, "teacher_loss": 0.004124164581298828 }, { "epoch": 0.2926758938869666, "grad_norm": 5.3125, "kd_loss": 0.146484375, "learning_rate": 7.711867567242769e-06, "loss": 0.1511, "step": 1015, "student_loss": 0.0037448785733431578, "teacher_loss": 0.01143695879727602 }, { "epoch": 0.29411764705882354, "grad_norm": 3.046875, "kd_loss": 0.1767578125, "learning_rate": 7.689153171288487e-06, "loss": 0.1481, "step": 1020, "student_loss": 0.018935445696115494, "teacher_loss": 0.03223176300525665 }, { "epoch": 0.2955594002306805, "grad_norm": 5.03125, "kd_loss": 0.10107421875, "learning_rate": 7.666360420055188e-06, "loss": 0.1648, "step": 1025, "student_loss": 0.00270785391330719, "teacher_loss": 0.0004231084603816271 }, { "epoch": 0.29700115340253747, "grad_norm": 4.71875, "kd_loss": 0.0908203125, "learning_rate": 7.643489977667327e-06, "loss": 0.1659, "step": 1030, "student_loss": 0.02544678933918476, "teacher_loss": 0.0005848580040037632 }, { "epoch": 0.2984429065743945, "grad_norm": 4.75, "kd_loss": 0.0986328125, "learning_rate": 7.6205425105130855e-06, "loss": 0.1671, "step": 1035, "student_loss": 0.009377697482705116, "teacher_loss": 0.002407669322565198 }, { "epoch": 0.29988465974625145, "grad_norm": 6.0, "kd_loss": 0.1005859375, "learning_rate": 7.597518687224959e-06, "loss": 0.1854, "step": 1040, "student_loss": 0.09456347674131393, "teacher_loss": 0.0008634831756353378 }, { "epoch": 0.3013264129181084, "grad_norm": 5.03125, "kd_loss": 0.07958984375, "learning_rate": 7.574419178660269e-06, "loss": 0.1669, "step": 1045, "student_loss": 0.0017204463947564363, "teacher_loss": 0.0008596886764280498 }, { "epoch": 0.3027681660899654, "grad_norm": 6.21875, "kd_loss": 0.095703125, "learning_rate": 7.551244657881618e-06, "loss": 0.1942, "step": 1050, "student_loss": 0.16669750213623047, "teacher_loss": 0.0011612839298322797 }, { "epoch": 0.3042099192618224, "grad_norm": 2.484375, "kd_loss": 0.09375, "learning_rate": 7.527995800137287e-06, "loss": 0.1475, "step": 1055, "student_loss": 0.0016981420340016484, "teacher_loss": 0.001001509721390903 }, { "epoch": 0.30565167243367936, "grad_norm": 5.21875, "kd_loss": 0.09326171875, "learning_rate": 7.504673282841544e-06, "loss": 0.1647, "step": 1060, "student_loss": 0.09439224749803543, "teacher_loss": 0.0003985276853200048 }, { "epoch": 0.3070934256055363, "grad_norm": 4.875, "kd_loss": 0.1044921875, "learning_rate": 7.481277785554918e-06, "loss": 0.161, "step": 1065, "student_loss": 0.059324074536561966, "teacher_loss": 0.0028861502651125193 }, { "epoch": 0.30853517877739334, "grad_norm": 6.78125, "kd_loss": 0.11474609375, "learning_rate": 7.457809989964393e-06, "loss": 0.1812, "step": 1070, "student_loss": 0.0447225496172905, "teacher_loss": 0.00039993959944695234 }, { "epoch": 0.3099769319492503, "grad_norm": 3.84375, "kd_loss": 0.099609375, "learning_rate": 7.434270579863549e-06, "loss": 0.1539, "step": 1075, "student_loss": 0.0011834139004349709, "teacher_loss": 0.0010074133751913905 }, { "epoch": 0.31141868512110726, "grad_norm": 4.21875, "kd_loss": 0.1005859375, "learning_rate": 7.4106602411326345e-06, "loss": 0.1642, "step": 1080, "student_loss": 0.003048022510483861, "teacher_loss": 0.00988290086388588 }, { "epoch": 0.3128604382929642, "grad_norm": 4.0625, "kd_loss": 0.09765625, "learning_rate": 7.386979661718585e-06, "loss": 0.1702, "step": 1085, "student_loss": 0.003489202819764614, "teacher_loss": 0.0008156410767696798 }, { "epoch": 0.31430219146482125, "grad_norm": 5.65625, "kd_loss": 0.0888671875, "learning_rate": 7.363229531614973e-06, "loss": 0.1515, "step": 1090, "student_loss": 0.2183372676372528, "teacher_loss": 0.004773187451064587 }, { "epoch": 0.3157439446366782, "grad_norm": 4.125, "kd_loss": 0.1171875, "learning_rate": 7.339410542841906e-06, "loss": 0.1799, "step": 1095, "student_loss": 0.13511748611927032, "teacher_loss": 0.00648617185652256 }, { "epoch": 0.31718569780853517, "grad_norm": 5.03125, "kd_loss": 0.10498046875, "learning_rate": 7.315523389425867e-06, "loss": 0.1607, "step": 1100, "student_loss": 0.0012231277069076896, "teacher_loss": 0.0004083520616404712 }, { "epoch": 0.31862745098039214, "grad_norm": 4.0625, "kd_loss": 0.0859375, "learning_rate": 7.291568767379484e-06, "loss": 0.144, "step": 1105, "student_loss": 0.016774829477071762, "teacher_loss": 0.0005346160614863038 }, { "epoch": 0.32006920415224915, "grad_norm": 3.984375, "kd_loss": 0.095703125, "learning_rate": 7.267547374681259e-06, "loss": 0.1602, "step": 1110, "student_loss": 0.024096982553601265, "teacher_loss": 0.0008525612065568566 }, { "epoch": 0.3215109573241061, "grad_norm": 4.59375, "kd_loss": 0.10400390625, "learning_rate": 7.24345991125522e-06, "loss": 0.1532, "step": 1115, "student_loss": 0.0033125807531177998, "teacher_loss": 0.0005502361455000937 }, { "epoch": 0.3229527104959631, "grad_norm": 5.59375, "kd_loss": 0.107421875, "learning_rate": 7.219307078950536e-06, "loss": 0.1625, "step": 1120, "student_loss": 0.0204778965562582, "teacher_loss": 0.004392318893224001 }, { "epoch": 0.32439446366782004, "grad_norm": 10.0625, "kd_loss": 0.0986328125, "learning_rate": 7.195089581521064e-06, "loss": 0.1654, "step": 1125, "student_loss": 0.19389592111110687, "teacher_loss": 0.0214696004986763 }, { "epoch": 0.32583621683967706, "grad_norm": 3.859375, "kd_loss": 0.10986328125, "learning_rate": 7.170808124604842e-06, "loss": 0.1556, "step": 1130, "student_loss": 0.03847292810678482, "teacher_loss": 0.0006047665374353528 }, { "epoch": 0.327277970011534, "grad_norm": 4.96875, "kd_loss": 0.091796875, "learning_rate": 7.14646341570353e-06, "loss": 0.1696, "step": 1135, "student_loss": 0.06856270879507065, "teacher_loss": 0.01667657122015953 }, { "epoch": 0.328719723183391, "grad_norm": 3.90625, "kd_loss": 0.1142578125, "learning_rate": 7.122056164161795e-06, "loss": 0.1778, "step": 1140, "student_loss": 0.021477092057466507, "teacher_loss": 0.012335257604718208 }, { "epoch": 0.330161476355248, "grad_norm": 11.3125, "kd_loss": 0.080078125, "learning_rate": 7.097587081146636e-06, "loss": 0.1589, "step": 1145, "student_loss": 0.045279014855623245, "teacher_loss": 0.0029319608584046364 }, { "epoch": 0.33160322952710497, "grad_norm": 4.125, "kd_loss": 0.08203125, "learning_rate": 7.073056879626681e-06, "loss": 0.204, "step": 1150, "student_loss": 0.002648564986884594, "teacher_loss": 0.0008686608052812517 }, { "epoch": 0.33304498269896193, "grad_norm": 2.640625, "kd_loss": 0.09814453125, "learning_rate": 7.048466274351389e-06, "loss": 0.1497, "step": 1155, "student_loss": 0.058320529758930206, "teacher_loss": 0.00035479728830978274 }, { "epoch": 0.3344867358708189, "grad_norm": 6.4375, "kd_loss": 0.09716796875, "learning_rate": 7.023815981830236e-06, "loss": 0.1904, "step": 1160, "student_loss": 0.0025387869682163, "teacher_loss": 0.028956690803170204 }, { "epoch": 0.3359284890426759, "grad_norm": 3.53125, "kd_loss": 0.10302734375, "learning_rate": 6.999106720311846e-06, "loss": 0.1704, "step": 1165, "student_loss": 0.08381687104701996, "teacher_loss": 0.000761769013479352 }, { "epoch": 0.3373702422145329, "grad_norm": 4.03125, "kd_loss": 0.11669921875, "learning_rate": 6.974339209763043e-06, "loss": 0.1536, "step": 1170, "student_loss": 0.021977189928293228, "teacher_loss": 0.02045821212232113 }, { "epoch": 0.33881199538638984, "grad_norm": 5.125, "kd_loss": 0.09033203125, "learning_rate": 6.949514171847891e-06, "loss": 0.1685, "step": 1175, "student_loss": 0.004976021591573954, "teacher_loss": 0.0024228477850556374 }, { "epoch": 0.3402537485582468, "grad_norm": 4.90625, "kd_loss": 0.09423828125, "learning_rate": 6.924632329906657e-06, "loss": 0.1613, "step": 1180, "student_loss": 0.008308586664497852, "teacher_loss": 0.0010117895435541868 }, { "epoch": 0.3416955017301038, "grad_norm": 2.96875, "kd_loss": 0.09423828125, "learning_rate": 6.899694408934734e-06, "loss": 0.1462, "step": 1185, "student_loss": 0.0045226323418319225, "teacher_loss": 0.000678456446621567 }, { "epoch": 0.3431372549019608, "grad_norm": 2.359375, "kd_loss": 0.09912109375, "learning_rate": 6.874701135561524e-06, "loss": 0.1473, "step": 1190, "student_loss": 0.0010705965105444193, "teacher_loss": 0.0005625460762530565 }, { "epoch": 0.34457900807381775, "grad_norm": 3.90625, "kd_loss": 0.0869140625, "learning_rate": 6.849653238029261e-06, "loss": 0.144, "step": 1195, "student_loss": 0.03853433579206467, "teacher_loss": 0.0004980422672815621 }, { "epoch": 0.3460207612456747, "grad_norm": 2.109375, "kd_loss": 0.08984375, "learning_rate": 6.824551446171788e-06, "loss": 0.2125, "step": 1200, "student_loss": 0.0008995746029540896, "teacher_loss": 0.0007796635036356747 }, { "epoch": 0.34746251441753173, "grad_norm": 4.03125, "kd_loss": 0.0888671875, "learning_rate": 6.7993964913932975e-06, "loss": 0.1821, "step": 1205, "student_loss": 0.07766856998205185, "teacher_loss": 0.0004231579077895731 }, { "epoch": 0.3489042675893887, "grad_norm": 3.3125, "kd_loss": 0.091796875, "learning_rate": 6.774189106647021e-06, "loss": 0.1555, "step": 1210, "student_loss": 0.002076697302982211, "teacher_loss": 0.0008232980617322028 }, { "epoch": 0.35034602076124566, "grad_norm": 5.65625, "kd_loss": 0.095703125, "learning_rate": 6.748930026413865e-06, "loss": 0.1712, "step": 1215, "student_loss": 0.11520007997751236, "teacher_loss": 0.0005120415589772165 }, { "epoch": 0.3517877739331027, "grad_norm": 4.125, "kd_loss": 0.11279296875, "learning_rate": 6.7236199866810185e-06, "loss": 0.164, "step": 1220, "student_loss": 0.06622859835624695, "teacher_loss": 0.010332350619137287 }, { "epoch": 0.35322952710495964, "grad_norm": 4.75, "kd_loss": 0.10595703125, "learning_rate": 6.698259724920503e-06, "loss": 0.1654, "step": 1225, "student_loss": 0.03172338008880615, "teacher_loss": 0.0024025817401707172 }, { "epoch": 0.3546712802768166, "grad_norm": 4.96875, "kd_loss": 0.0986328125, "learning_rate": 6.672849980067685e-06, "loss": 0.1646, "step": 1230, "student_loss": 0.0014958116225898266, "teacher_loss": 0.00136648362968117 }, { "epoch": 0.35611303344867357, "grad_norm": 7.4375, "kd_loss": 0.10791015625, "learning_rate": 6.647391492499746e-06, "loss": 0.1467, "step": 1235, "student_loss": 0.001844382262788713, "teacher_loss": 0.0012089475058019161 }, { "epoch": 0.3575547866205306, "grad_norm": 5.125, "kd_loss": 0.09375, "learning_rate": 6.621885004014113e-06, "loss": 0.1856, "step": 1240, "student_loss": 0.0013189911842346191, "teacher_loss": 0.0011863983236253262 }, { "epoch": 0.35899653979238755, "grad_norm": 3.6875, "kd_loss": 0.10986328125, "learning_rate": 6.596331257806837e-06, "loss": 0.1588, "step": 1245, "student_loss": 0.0016421154141426086, "teacher_loss": 0.001257838448509574 }, { "epoch": 0.3604382929642445, "grad_norm": 5.1875, "kd_loss": 0.1083984375, "learning_rate": 6.570730998450945e-06, "loss": 0.1663, "step": 1250, "student_loss": 0.19827650487422943, "teacher_loss": 0.002391376066952944 }, { "epoch": 0.3618800461361015, "grad_norm": 4.84375, "kd_loss": 0.109375, "learning_rate": 6.545084971874738e-06, "loss": 0.1693, "step": 1255, "student_loss": 0.033441461622714996, "teacher_loss": 0.007763924542814493 }, { "epoch": 0.3633217993079585, "grad_norm": 4.875, "kd_loss": 0.10400390625, "learning_rate": 6.519393925340067e-06, "loss": 0.1687, "step": 1260, "student_loss": 0.0005883485428057611, "teacher_loss": 0.0006072800024412572 }, { "epoch": 0.36476355247981546, "grad_norm": 5.0, "kd_loss": 0.1123046875, "learning_rate": 6.49365860742055e-06, "loss": 0.1807, "step": 1265, "student_loss": 0.3620021343231201, "teacher_loss": 0.0192008875310421 }, { "epoch": 0.3662053056516724, "grad_norm": 2.546875, "kd_loss": 0.09033203125, "learning_rate": 6.467879767979764e-06, "loss": 0.1367, "step": 1270, "student_loss": 0.0019303744193166494, "teacher_loss": 0.0008835737244226038 }, { "epoch": 0.36764705882352944, "grad_norm": 4.84375, "kd_loss": 0.09765625, "learning_rate": 6.442058158149396e-06, "loss": 0.1364, "step": 1275, "student_loss": 0.017311925068497658, "teacher_loss": 0.010822150856256485 }, { "epoch": 0.3690888119953864, "grad_norm": 2.390625, "kd_loss": 0.08837890625, "learning_rate": 6.4161945303073535e-06, "loss": 0.1339, "step": 1280, "student_loss": 0.0016525188693776727, "teacher_loss": 0.00045569639769382775 }, { "epoch": 0.37053056516724336, "grad_norm": 2.59375, "kd_loss": 0.09716796875, "learning_rate": 6.390289638055851e-06, "loss": 0.1514, "step": 1285, "student_loss": 0.0017534851795062423, "teacher_loss": 0.000792986829765141 }, { "epoch": 0.3719723183391003, "grad_norm": 5.15625, "kd_loss": 0.08203125, "learning_rate": 6.364344236199441e-06, "loss": 0.1544, "step": 1290, "student_loss": 0.4242388606071472, "teacher_loss": 0.015242666937410831 }, { "epoch": 0.37341407151095735, "grad_norm": 4.0, "kd_loss": 0.08251953125, "learning_rate": 6.3383590807230264e-06, "loss": 0.1732, "step": 1295, "student_loss": 0.0017893314361572266, "teacher_loss": 0.005244513973593712 }, { "epoch": 0.3748558246828143, "grad_norm": 4.875, "kd_loss": 0.09765625, "learning_rate": 6.3123349287698345e-06, "loss": 0.1343, "step": 1300, "student_loss": 0.0016616806387901306, "teacher_loss": 0.000766773009672761 }, { "epoch": 0.3762975778546713, "grad_norm": 4.34375, "kd_loss": 0.0986328125, "learning_rate": 6.286272538619351e-06, "loss": 0.1656, "step": 1305, "student_loss": 0.001378720044158399, "teacher_loss": 0.2784559726715088 }, { "epoch": 0.37773933102652824, "grad_norm": 4.78125, "kd_loss": 0.09326171875, "learning_rate": 6.260172669665233e-06, "loss": 0.1376, "step": 1310, "student_loss": 0.0015898743877187371, "teacher_loss": 0.001270298846065998 }, { "epoch": 0.37918108419838525, "grad_norm": 6.34375, "kd_loss": 0.09716796875, "learning_rate": 6.234036082393171e-06, "loss": 0.1719, "step": 1315, "student_loss": 0.2977891266345978, "teacher_loss": 0.0018072956008836627 }, { "epoch": 0.3806228373702422, "grad_norm": 4.5, "kd_loss": 0.09814453125, "learning_rate": 6.207863538358741e-06, "loss": 0.166, "step": 1320, "student_loss": 0.002190067432820797, "teacher_loss": 0.0004770367522723973 }, { "epoch": 0.3820645905420992, "grad_norm": 4.625, "kd_loss": 0.10546875, "learning_rate": 6.181655800165207e-06, "loss": 0.1752, "step": 1325, "student_loss": 0.014989044517278671, "teacher_loss": 0.0011476778890937567 }, { "epoch": 0.38350634371395614, "grad_norm": 3.609375, "kd_loss": 0.08740234375, "learning_rate": 6.155413631441307e-06, "loss": 0.1513, "step": 1330, "student_loss": 0.04820695146918297, "teacher_loss": 0.0003813351795542985 }, { "epoch": 0.38494809688581316, "grad_norm": 7.1875, "kd_loss": 0.09228515625, "learning_rate": 6.129137796818997e-06, "loss": 0.149, "step": 1335, "student_loss": 0.0018507987260818481, "teacher_loss": 0.0005417931824922562 }, { "epoch": 0.3863898500576701, "grad_norm": 5.1875, "kd_loss": 0.1728515625, "learning_rate": 6.102829061911176e-06, "loss": 0.1629, "step": 1340, "student_loss": 0.0006290597375482321, "teacher_loss": 0.007059386931359768 }, { "epoch": 0.3878316032295271, "grad_norm": 4.53125, "kd_loss": 0.09375, "learning_rate": 6.076488193289375e-06, "loss": 0.154, "step": 1345, "student_loss": 0.001199022983200848, "teacher_loss": 0.001256449380889535 }, { "epoch": 0.3892733564013841, "grad_norm": 3.796875, "kd_loss": 0.09912109375, "learning_rate": 6.050115958461423e-06, "loss": 0.1423, "step": 1350, "student_loss": 0.028213880956172943, "teacher_loss": 0.0014634531689807773 }, { "epoch": 0.39071510957324107, "grad_norm": 5.1875, "kd_loss": 0.09814453125, "learning_rate": 6.02371312584908e-06, "loss": 0.1606, "step": 1355, "student_loss": 0.046754222363233566, "teacher_loss": 0.0003306324942968786 }, { "epoch": 0.39215686274509803, "grad_norm": 6.75, "kd_loss": 0.1025390625, "learning_rate": 5.997280464765655e-06, "loss": 0.1687, "step": 1360, "student_loss": 0.01089841965585947, "teacher_loss": 0.0005644945777021348 }, { "epoch": 0.393598615916955, "grad_norm": 3.734375, "kd_loss": 0.1025390625, "learning_rate": 5.970818745393579e-06, "loss": 0.1514, "step": 1365, "student_loss": 0.012727648951113224, "teacher_loss": 0.04049056023359299 }, { "epoch": 0.395040369088812, "grad_norm": 4.59375, "kd_loss": 0.095703125, "learning_rate": 5.9443287387619754e-06, "loss": 0.1645, "step": 1370, "student_loss": 0.3426652252674103, "teacher_loss": 0.02165866084396839 }, { "epoch": 0.396482122260669, "grad_norm": 6.65625, "kd_loss": 0.0966796875, "learning_rate": 5.9178112167241805e-06, "loss": 0.1544, "step": 1375, "student_loss": 0.09628524631261826, "teacher_loss": 0.0004050543357152492 }, { "epoch": 0.39792387543252594, "grad_norm": 2.828125, "kd_loss": 0.099609375, "learning_rate": 5.8912669519352725e-06, "loss": 0.1484, "step": 1380, "student_loss": 0.024134894832968712, "teacher_loss": 0.0030764644034206867 }, { "epoch": 0.3993656286043829, "grad_norm": 4.5625, "kd_loss": 0.0849609375, "learning_rate": 5.864696717829539e-06, "loss": 0.1566, "step": 1385, "student_loss": 0.06617551296949387, "teacher_loss": 0.011502874083817005 }, { "epoch": 0.4008073817762399, "grad_norm": 4.6875, "kd_loss": 0.09619140625, "learning_rate": 5.838101288597951e-06, "loss": 0.1487, "step": 1390, "student_loss": 0.0014513310743495822, "teacher_loss": 0.0005679084570147097 }, { "epoch": 0.4022491349480969, "grad_norm": 3.421875, "kd_loss": 0.08984375, "learning_rate": 5.8114814391656046e-06, "loss": 0.1609, "step": 1395, "student_loss": 0.001117706298828125, "teacher_loss": 0.00049404869787395 }, { "epoch": 0.40369088811995385, "grad_norm": 3.375, "kd_loss": 0.091796875, "learning_rate": 5.78483794516914e-06, "loss": 0.1509, "step": 1400, "student_loss": 0.005229848902672529, "teacher_loss": 0.00046830569044686854 }, { "epoch": 0.40513264129181087, "grad_norm": 3.28125, "kd_loss": 0.10107421875, "learning_rate": 5.75817158293414e-06, "loss": 0.1549, "step": 1405, "student_loss": 0.000877993879839778, "teacher_loss": 0.0005521044950000942 }, { "epoch": 0.40657439446366783, "grad_norm": 4.25, "kd_loss": 0.0927734375, "learning_rate": 5.731483129452514e-06, "loss": 0.1684, "step": 1410, "student_loss": 0.008468794636428356, "teacher_loss": 0.0004955396871082485 }, { "epoch": 0.4080161476355248, "grad_norm": 4.78125, "kd_loss": 0.10009765625, "learning_rate": 5.704773362359854e-06, "loss": 0.1529, "step": 1415, "student_loss": 0.023767048493027687, "teacher_loss": 0.014984571374952793 }, { "epoch": 0.40945790080738176, "grad_norm": 4.9375, "kd_loss": 0.103515625, "learning_rate": 5.678043059912776e-06, "loss": 0.1818, "step": 1420, "student_loss": 0.0011405398836359382, "teacher_loss": 0.0018354527419432998 }, { "epoch": 0.4108996539792388, "grad_norm": 3.859375, "kd_loss": 0.0849609375, "learning_rate": 5.6512930009662524e-06, "loss": 0.1643, "step": 1425, "student_loss": 0.20638686418533325, "teacher_loss": 0.0004497423942666501 }, { "epoch": 0.41234140715109574, "grad_norm": 5.15625, "kd_loss": 0.111328125, "learning_rate": 5.624523964950903e-06, "loss": 0.1493, "step": 1430, "student_loss": 0.003926432225853205, "teacher_loss": 0.0005563534796237946 }, { "epoch": 0.4137831603229527, "grad_norm": 2.734375, "kd_loss": 0.08935546875, "learning_rate": 5.597736731850295e-06, "loss": 0.164, "step": 1435, "student_loss": 0.012115873396396637, "teacher_loss": 0.00135420064907521 }, { "epoch": 0.41522491349480967, "grad_norm": 8.375, "kd_loss": 0.095703125, "learning_rate": 5.570932082178219e-06, "loss": 0.1733, "step": 1440, "student_loss": 0.05656226724386215, "teacher_loss": 0.0003561509947758168 }, { "epoch": 0.4166666666666667, "grad_norm": 5.25, "kd_loss": 0.1005859375, "learning_rate": 5.5441107969559315e-06, "loss": 0.1578, "step": 1445, "student_loss": 0.2771185636520386, "teacher_loss": 0.002755317836999893 }, { "epoch": 0.41810841983852365, "grad_norm": 4.1875, "kd_loss": 0.138671875, "learning_rate": 5.517273657689419e-06, "loss": 0.1413, "step": 1450, "student_loss": 0.004985239822417498, "teacher_loss": 0.002304993337020278 }, { "epoch": 0.4195501730103806, "grad_norm": 5.625, "kd_loss": 0.10595703125, "learning_rate": 5.490421446346608e-06, "loss": 0.1495, "step": 1455, "student_loss": 0.002044258639216423, "teacher_loss": 0.0006097870063968003 }, { "epoch": 0.4209919261822376, "grad_norm": 3.890625, "kd_loss": 0.1123046875, "learning_rate": 5.463554945334589e-06, "loss": 0.1499, "step": 1460, "student_loss": 0.001577138900756836, "teacher_loss": 0.00036811313475482166 }, { "epoch": 0.4224336793540946, "grad_norm": 4.46875, "kd_loss": 0.126953125, "learning_rate": 5.43667493747682e-06, "loss": 0.1629, "step": 1465, "student_loss": 0.0005169888027012348, "teacher_loss": 0.015567619353532791 }, { "epoch": 0.42387543252595156, "grad_norm": 5.75, "kd_loss": 0.0859375, "learning_rate": 5.409782205990317e-06, "loss": 0.1757, "step": 1470, "student_loss": 0.0012229635613039136, "teacher_loss": 0.0044844611547887325 }, { "epoch": 0.4253171856978085, "grad_norm": 6.8125, "kd_loss": 0.0888671875, "learning_rate": 5.3828775344628245e-06, "loss": 0.1525, "step": 1475, "student_loss": 0.0009603102807886899, "teacher_loss": 0.0009318754309788346 }, { "epoch": 0.42675893886966554, "grad_norm": 3.734375, "kd_loss": 0.0908203125, "learning_rate": 5.355961706829997e-06, "loss": 0.1638, "step": 1480, "student_loss": 0.0015584760112687945, "teacher_loss": 0.0012691307347267866 }, { "epoch": 0.4282006920415225, "grad_norm": 9.375, "kd_loss": 0.10009765625, "learning_rate": 5.329035507352548e-06, "loss": 0.1616, "step": 1485, "student_loss": 0.0008603151072748005, "teacher_loss": 0.0008554637315683067 }, { "epoch": 0.42964244521337946, "grad_norm": 8.375, "kd_loss": 0.09814453125, "learning_rate": 5.3020997205933985e-06, "loss": 0.1486, "step": 1490, "student_loss": 0.10303473472595215, "teacher_loss": 0.005800185259431601 }, { "epoch": 0.43108419838523643, "grad_norm": 3.4375, "kd_loss": 0.09033203125, "learning_rate": 5.275155131394825e-06, "loss": 0.1504, "step": 1495, "student_loss": 0.002375382697209716, "teacher_loss": 0.001016065594740212 }, { "epoch": 0.43252595155709345, "grad_norm": 4.65625, "kd_loss": 0.0927734375, "learning_rate": 5.248202524855578e-06, "loss": 0.1497, "step": 1500, "student_loss": 0.046541083604097366, "teacher_loss": 0.0038300170563161373 }, { "epoch": 0.4339677047289504, "grad_norm": 2.84375, "kd_loss": 0.08642578125, "learning_rate": 5.221242686308019e-06, "loss": 0.1424, "step": 1505, "student_loss": 0.04092458263039589, "teacher_loss": 0.0005184438778087497 }, { "epoch": 0.4354094579008074, "grad_norm": 5.9375, "kd_loss": 0.14453125, "learning_rate": 5.194276401295231e-06, "loss": 0.1581, "step": 1510, "student_loss": 0.11372507363557816, "teacher_loss": 0.012486970983445644 }, { "epoch": 0.43685121107266434, "grad_norm": 5.6875, "kd_loss": 0.103515625, "learning_rate": 5.167304455548128e-06, "loss": 0.1542, "step": 1515, "student_loss": 0.0004513502062764019, "teacher_loss": 0.0004412997222971171 }, { "epoch": 0.43829296424452135, "grad_norm": 4.46875, "kd_loss": 0.09326171875, "learning_rate": 5.14032763496257e-06, "loss": 0.1503, "step": 1520, "student_loss": 0.016323139891028404, "teacher_loss": 0.0005559992277994752 }, { "epoch": 0.4397347174163783, "grad_norm": 2.875, "kd_loss": 0.1171875, "learning_rate": 5.11334672557645e-06, "loss": 0.1516, "step": 1525, "student_loss": 0.009535513818264008, "teacher_loss": 0.001467025140300393 }, { "epoch": 0.4411764705882353, "grad_norm": 3.203125, "kd_loss": 0.146484375, "learning_rate": 5.086362513546807e-06, "loss": 0.1389, "step": 1530, "student_loss": 0.1711445301771164, "teacher_loss": 0.008548562414944172 }, { "epoch": 0.4426182237600923, "grad_norm": 3.96875, "kd_loss": 0.09033203125, "learning_rate": 5.059375785126907e-06, "loss": 0.1367, "step": 1535, "student_loss": 0.09691781550645828, "teacher_loss": 0.011890435591340065 }, { "epoch": 0.44405997693194926, "grad_norm": 3.15625, "kd_loss": 0.1025390625, "learning_rate": 5.032387326643331e-06, "loss": 0.15, "step": 1540, "student_loss": 0.11253131926059723, "teacher_loss": 0.00038647381006740034 }, { "epoch": 0.4455017301038062, "grad_norm": 4.28125, "kd_loss": 0.0986328125, "learning_rate": 5.005397924473082e-06, "loss": 0.1453, "step": 1545, "student_loss": 0.0029098070226609707, "teacher_loss": 0.007037348113954067 }, { "epoch": 0.4469434832756632, "grad_norm": 4.5625, "kd_loss": 0.08935546875, "learning_rate": 4.978408365020651e-06, "loss": 0.1724, "step": 1550, "student_loss": 0.09735474735498428, "teacher_loss": 0.0007218251703307033 }, { "epoch": 0.4483852364475202, "grad_norm": 5.40625, "kd_loss": 0.10888671875, "learning_rate": 4.951419434695115e-06, "loss": 0.1426, "step": 1555, "student_loss": 0.024885384365916252, "teacher_loss": 0.0010013995924964547 }, { "epoch": 0.44982698961937717, "grad_norm": 4.34375, "kd_loss": 0.0927734375, "learning_rate": 4.924431919887216e-06, "loss": 0.1592, "step": 1560, "student_loss": 0.0011940286494791508, "teacher_loss": 0.0006662014056928456 }, { "epoch": 0.45126874279123413, "grad_norm": 4.78125, "kd_loss": 0.095703125, "learning_rate": 4.897446606946459e-06, "loss": 0.1363, "step": 1565, "student_loss": 0.008483109064400196, "teacher_loss": 0.0011815401958301663 }, { "epoch": 0.4527104959630911, "grad_norm": 6.09375, "kd_loss": 0.1337890625, "learning_rate": 4.870464282158184e-06, "loss": 0.1435, "step": 1570, "student_loss": 0.0016101751243695617, "teacher_loss": 0.03063173033297062 }, { "epoch": 0.4541522491349481, "grad_norm": 4.75, "kd_loss": 0.1337890625, "learning_rate": 4.84348573172067e-06, "loss": 0.1472, "step": 1575, "student_loss": 0.000616877747233957, "teacher_loss": 0.02498156577348709 }, { "epoch": 0.4555940023068051, "grad_norm": 6.65625, "kd_loss": 0.0791015625, "learning_rate": 4.816511741722215e-06, "loss": 0.1727, "step": 1580, "student_loss": 0.09299268573522568, "teacher_loss": 0.000702059711329639 }, { "epoch": 0.45703575547866204, "grad_norm": 5.3125, "kd_loss": 0.09423828125, "learning_rate": 4.7895430981182415e-06, "loss": 0.1725, "step": 1585, "student_loss": 0.001776401768438518, "teacher_loss": 0.0011725560761988163 }, { "epoch": 0.458477508650519, "grad_norm": 5.375, "kd_loss": 0.0966796875, "learning_rate": 4.762580586708389e-06, "loss": 0.1547, "step": 1590, "student_loss": 0.0018952718237414956, "teacher_loss": 0.0010945210233330727 }, { "epoch": 0.459919261822376, "grad_norm": 4.4375, "kd_loss": 0.09619140625, "learning_rate": 4.73562499311362e-06, "loss": 0.1534, "step": 1595, "student_loss": 0.0008283228962682188, "teacher_loss": 0.0009313338669016957 }, { "epoch": 0.461361014994233, "grad_norm": 3.984375, "kd_loss": 0.08984375, "learning_rate": 4.708677102753331e-06, "loss": 0.1371, "step": 1600, "student_loss": 0.04035179316997528, "teacher_loss": 0.002971302019432187 }, { "epoch": 0.46280276816608995, "grad_norm": 5.4375, "kd_loss": 0.10888671875, "learning_rate": 4.681737700822464e-06, "loss": 0.1731, "step": 1605, "student_loss": 0.0004709873755928129, "teacher_loss": 0.025587571784853935 }, { "epoch": 0.46424452133794697, "grad_norm": 3.015625, "kd_loss": 0.142578125, "learning_rate": 4.654807572268628e-06, "loss": 0.1602, "step": 1610, "student_loss": 0.002385765314102173, "teacher_loss": 0.0010947687551379204 }, { "epoch": 0.46568627450980393, "grad_norm": 4.65625, "kd_loss": 0.10009765625, "learning_rate": 4.627887501769231e-06, "loss": 0.1628, "step": 1615, "student_loss": 0.004139338154345751, "teacher_loss": 0.009530629962682724 }, { "epoch": 0.4671280276816609, "grad_norm": 3.9375, "kd_loss": 0.119140625, "learning_rate": 4.600978273708612e-06, "loss": 0.153, "step": 1620, "student_loss": 0.003188611473888159, "teacher_loss": 0.001373080536723137 }, { "epoch": 0.46856978085351786, "grad_norm": 4.28125, "kd_loss": 0.1025390625, "learning_rate": 4.574080672155189e-06, "loss": 0.1591, "step": 1625, "student_loss": 0.0014868304133415222, "teacher_loss": 0.000639898469671607 }, { "epoch": 0.4700115340253749, "grad_norm": 3.21875, "kd_loss": 0.0810546875, "learning_rate": 4.547195480838612e-06, "loss": 0.1515, "step": 1630, "student_loss": 0.040516145527362823, "teacher_loss": 0.0004312426899559796 }, { "epoch": 0.47145328719723184, "grad_norm": 4.84375, "kd_loss": 0.087890625, "learning_rate": 4.520323483126928e-06, "loss": 0.1862, "step": 1635, "student_loss": 0.12052398920059204, "teacher_loss": 0.0003846465260721743 }, { "epoch": 0.4728950403690888, "grad_norm": 4.65625, "kd_loss": 0.0908203125, "learning_rate": 4.493465462003756e-06, "loss": 0.1453, "step": 1640, "student_loss": 0.0010070661082863808, "teacher_loss": 0.0012260322691872716 }, { "epoch": 0.47433679354094577, "grad_norm": 2.875, "kd_loss": 0.09375, "learning_rate": 4.4666222000454685e-06, "loss": 0.1545, "step": 1645, "student_loss": 0.0013779783621430397, "teacher_loss": 0.00042751312139444053 }, { "epoch": 0.4757785467128028, "grad_norm": 7.15625, "kd_loss": 0.0869140625, "learning_rate": 4.4397944793983946e-06, "loss": 0.1599, "step": 1650, "student_loss": 0.0005161279696039855, "teacher_loss": 0.0007879316690377891 }, { "epoch": 0.47722029988465975, "grad_norm": 3.421875, "kd_loss": 0.08203125, "learning_rate": 4.4129830817560284e-06, "loss": 0.1627, "step": 1655, "student_loss": 0.003220248268917203, "teacher_loss": 0.0008016406209208071 }, { "epoch": 0.4786620530565167, "grad_norm": 4.125, "kd_loss": 0.1015625, "learning_rate": 4.386188788336251e-06, "loss": 0.1404, "step": 1660, "student_loss": 0.08471440523862839, "teacher_loss": 0.0003611688152886927 }, { "epoch": 0.4801038062283737, "grad_norm": 4.0, "kd_loss": 0.09716796875, "learning_rate": 4.359412379858569e-06, "loss": 0.1428, "step": 1665, "student_loss": 0.0006392439245246351, "teacher_loss": 0.00034953776048496366 }, { "epoch": 0.4815455594002307, "grad_norm": 6.0, "kd_loss": 0.1484375, "learning_rate": 4.332654636521365e-06, "loss": 0.1493, "step": 1670, "student_loss": 0.13810043036937714, "teacher_loss": 0.005655170418322086 }, { "epoch": 0.48298731257208766, "grad_norm": 2.9375, "kd_loss": 0.1015625, "learning_rate": 4.3059163379791676e-06, "loss": 0.1588, "step": 1675, "student_loss": 0.0011410564184188843, "teacher_loss": 0.0009165616356767714 }, { "epoch": 0.4844290657439446, "grad_norm": 3.96875, "kd_loss": 0.103515625, "learning_rate": 4.279198263319932e-06, "loss": 0.1983, "step": 1680, "student_loss": 0.042820997536182404, "teacher_loss": 0.0006943390471860766 }, { "epoch": 0.48587081891580164, "grad_norm": 4.65625, "kd_loss": 0.0927734375, "learning_rate": 4.252501191042334e-06, "loss": 0.1458, "step": 1685, "student_loss": 0.001107779797166586, "teacher_loss": 0.000587086018640548 }, { "epoch": 0.4873125720876586, "grad_norm": 3.84375, "kd_loss": 0.08154296875, "learning_rate": 4.2258258990331015e-06, "loss": 0.1505, "step": 1690, "student_loss": 0.0010096587939187884, "teacher_loss": 0.0006587179377675056 }, { "epoch": 0.48875432525951557, "grad_norm": 3.296875, "kd_loss": 0.09423828125, "learning_rate": 4.199173164544331e-06, "loss": 0.1301, "step": 1695, "student_loss": 0.0007151216268539429, "teacher_loss": 0.0005271242698654532 }, { "epoch": 0.49019607843137253, "grad_norm": 2.75, "kd_loss": 0.111328125, "learning_rate": 4.1725437641708535e-06, "loss": 0.1292, "step": 1700, "student_loss": 0.0004782706964761019, "teacher_loss": 0.00025354631361551583 }, { "epoch": 0.49163783160322955, "grad_norm": 4.84375, "kd_loss": 0.0908203125, "learning_rate": 4.145938473827598e-06, "loss": 0.1694, "step": 1705, "student_loss": 0.03137379139661789, "teacher_loss": 0.0014872003812342882 }, { "epoch": 0.4930795847750865, "grad_norm": 3.890625, "kd_loss": 0.0859375, "learning_rate": 4.1193580687269896e-06, "loss": 0.1799, "step": 1710, "student_loss": 0.13360068202018738, "teacher_loss": 0.0003576852031983435 }, { "epoch": 0.4945213379469435, "grad_norm": 3.6875, "kd_loss": 0.0849609375, "learning_rate": 4.092803323356357e-06, "loss": 0.1568, "step": 1715, "student_loss": 0.0007690335623919964, "teacher_loss": 0.0004268670454621315 }, { "epoch": 0.49596309111880044, "grad_norm": 4.53125, "kd_loss": 0.09423828125, "learning_rate": 4.066275011455369e-06, "loss": 0.1345, "step": 1720, "student_loss": 0.0010869682300835848, "teacher_loss": 0.0008715330623090267 }, { "epoch": 0.49740484429065746, "grad_norm": 3.78125, "kd_loss": 0.10302734375, "learning_rate": 4.039773905993486e-06, "loss": 0.1661, "step": 1725, "student_loss": 0.001568131148815155, "teacher_loss": 0.0013625255087390542 }, { "epoch": 0.4988465974625144, "grad_norm": 3.234375, "kd_loss": 0.103515625, "learning_rate": 4.013300779147445e-06, "loss": 0.1311, "step": 1730, "student_loss": 0.003118544816970825, "teacher_loss": 0.027190769091248512 }, { "epoch": 0.5002883506343714, "grad_norm": 3.984375, "kd_loss": 0.1337890625, "learning_rate": 3.98685640227875e-06, "loss": 0.1465, "step": 1735, "student_loss": 0.004482457414269447, "teacher_loss": 0.06759393215179443 }, { "epoch": 0.5017301038062284, "grad_norm": 4.84375, "kd_loss": 0.1123046875, "learning_rate": 3.960441545911205e-06, "loss": 0.1692, "step": 1740, "student_loss": 0.0023235215339809656, "teacher_loss": 0.027365142479538918 }, { "epoch": 0.5031718569780853, "grad_norm": 2.78125, "kd_loss": 0.11474609375, "learning_rate": 3.934056979708456e-06, "loss": 0.1393, "step": 1745, "student_loss": 0.0015286001143977046, "teacher_loss": 0.02260914258658886 }, { "epoch": 0.5046136101499423, "grad_norm": 6.3125, "kd_loss": 0.10009765625, "learning_rate": 3.907703472451574e-06, "loss": 0.1627, "step": 1750, "student_loss": 0.009829165413975716, "teacher_loss": 0.0006399175035767257 }, { "epoch": 0.5060553633217993, "grad_norm": 7.96875, "kd_loss": 0.099609375, "learning_rate": 3.881381792016645e-06, "loss": 0.1749, "step": 1755, "student_loss": 0.0006391415954567492, "teacher_loss": 0.0003294479101896286 }, { "epoch": 0.5074971164936563, "grad_norm": 3.53125, "kd_loss": 0.1220703125, "learning_rate": 3.8550927053523994e-06, "loss": 0.1389, "step": 1760, "student_loss": 0.0007123491377569735, "teacher_loss": 0.046919528394937515 }, { "epoch": 0.5089388696655133, "grad_norm": 5.09375, "kd_loss": 0.083984375, "learning_rate": 3.828836978457868e-06, "loss": 0.1522, "step": 1765, "student_loss": 0.0016679943073540926, "teacher_loss": 0.000926964043173939 }, { "epoch": 0.5103806228373703, "grad_norm": 3.671875, "kd_loss": 0.1357421875, "learning_rate": 3.8026153763600603e-06, "loss": 0.1477, "step": 1770, "student_loss": 0.022712958976626396, "teacher_loss": 0.029349761083722115 }, { "epoch": 0.5118223760092272, "grad_norm": 3.625, "kd_loss": 0.09228515625, "learning_rate": 3.7764286630916704e-06, "loss": 0.1425, "step": 1775, "student_loss": 0.0012370613403618336, "teacher_loss": 0.0007929064449854195 }, { "epoch": 0.5132641291810842, "grad_norm": 3.234375, "kd_loss": 0.0947265625, "learning_rate": 3.7502776016688234e-06, "loss": 0.1589, "step": 1780, "student_loss": 0.008692040108144283, "teacher_loss": 0.0190599225461483 }, { "epoch": 0.5147058823529411, "grad_norm": 4.46875, "kd_loss": 0.1181640625, "learning_rate": 3.724162954068835e-06, "loss": 0.1568, "step": 1785, "student_loss": 0.1760350614786148, "teacher_loss": 0.028955036774277687 }, { "epoch": 0.5161476355247981, "grad_norm": 3.28125, "kd_loss": 0.083984375, "learning_rate": 3.6980854812080097e-06, "loss": 0.1497, "step": 1790, "student_loss": 0.0018669115379452705, "teacher_loss": 0.0008881228277459741 }, { "epoch": 0.5175893886966552, "grad_norm": 4.6875, "kd_loss": 0.08984375, "learning_rate": 3.6720459429194743e-06, "loss": 0.1635, "step": 1795, "student_loss": 0.1518515795469284, "teacher_loss": 0.00034799822606146336 }, { "epoch": 0.5190311418685121, "grad_norm": 2.828125, "kd_loss": 0.0869140625, "learning_rate": 3.646045097931037e-06, "loss": 0.1584, "step": 1800, "student_loss": 0.0007797479629516602, "teacher_loss": 0.0005545561434701085 }, { "epoch": 0.5204728950403691, "grad_norm": 5.5, "kd_loss": 0.08056640625, "learning_rate": 3.620083703843077e-06, "loss": 0.1433, "step": 1805, "student_loss": 0.03987161070108414, "teacher_loss": 0.00046788767213001847 }, { "epoch": 0.5219146482122261, "grad_norm": 4.0, "kd_loss": 0.095703125, "learning_rate": 3.594162517106472e-06, "loss": 0.1646, "step": 1810, "student_loss": 0.0409666933119297, "teacher_loss": 0.0038134430069476366 }, { "epoch": 0.523356401384083, "grad_norm": 3.71875, "kd_loss": 0.10693359375, "learning_rate": 3.5682822930005567e-06, "loss": 0.1424, "step": 1815, "student_loss": 0.30990689992904663, "teacher_loss": 0.019313883036375046 }, { "epoch": 0.52479815455594, "grad_norm": 6.0625, "kd_loss": 0.1015625, "learning_rate": 3.542443785611117e-06, "loss": 0.1455, "step": 1820, "student_loss": 0.05496774613857269, "teacher_loss": 0.003318265313282609 }, { "epoch": 0.526239907727797, "grad_norm": 7.9375, "kd_loss": 0.103515625, "learning_rate": 3.516647747808417e-06, "loss": 0.1445, "step": 1825, "student_loss": 0.0014039704110473394, "teacher_loss": 0.004902483429759741 }, { "epoch": 0.527681660899654, "grad_norm": 3.84375, "kd_loss": 0.08642578125, "learning_rate": 3.4908949312252593e-06, "loss": 0.1453, "step": 1830, "student_loss": 0.001090447069145739, "teacher_loss": 0.0005476956139318645 }, { "epoch": 0.529123414071511, "grad_norm": 3.828125, "kd_loss": 0.10595703125, "learning_rate": 3.4651860862350893e-06, "loss": 0.1355, "step": 1835, "student_loss": 0.00827399455010891, "teacher_loss": 0.0005034086061641574 }, { "epoch": 0.5305651672433679, "grad_norm": 3.375, "kd_loss": 0.083984375, "learning_rate": 3.4395219619301288e-06, "loss": 0.1429, "step": 1840, "student_loss": 0.02982058748602867, "teacher_loss": 0.003948381636291742 }, { "epoch": 0.5320069204152249, "grad_norm": 4.0625, "kd_loss": 0.10791015625, "learning_rate": 3.4139033060995484e-06, "loss": 0.1606, "step": 1845, "student_loss": 0.0009567984379827976, "teacher_loss": 0.0006957401055842638 }, { "epoch": 0.5334486735870819, "grad_norm": 3.484375, "kd_loss": 0.0859375, "learning_rate": 3.388330865207681e-06, "loss": 0.1516, "step": 1850, "student_loss": 0.09060114622116089, "teacher_loss": 0.011022915132343769 }, { "epoch": 0.5348904267589388, "grad_norm": 3.8125, "kd_loss": 0.09326171875, "learning_rate": 3.3628053843722674e-06, "loss": 0.1586, "step": 1855, "student_loss": 0.0023815552704036236, "teacher_loss": 0.0009834859520196915 }, { "epoch": 0.5363321799307958, "grad_norm": 2.890625, "kd_loss": 0.08642578125, "learning_rate": 3.337327607342753e-06, "loss": 0.1443, "step": 1860, "student_loss": 0.0012727677822113037, "teacher_loss": 0.0003461229207459837 }, { "epoch": 0.5377739331026529, "grad_norm": 3.796875, "kd_loss": 0.09765625, "learning_rate": 3.3118982764786055e-06, "loss": 0.1753, "step": 1865, "student_loss": 0.12450817972421646, "teacher_loss": 0.00035991144250147045 }, { "epoch": 0.5392156862745098, "grad_norm": 6.3125, "kd_loss": 0.099609375, "learning_rate": 3.2865181327277007e-06, "loss": 0.1487, "step": 1870, "student_loss": 0.14028604328632355, "teacher_loss": 0.0013080085627734661 }, { "epoch": 0.5406574394463668, "grad_norm": 3.25, "kd_loss": 0.08984375, "learning_rate": 3.2611879156047147e-06, "loss": 0.1471, "step": 1875, "student_loss": 0.0018398945685476065, "teacher_loss": 0.0011750732082873583 }, { "epoch": 0.5420991926182238, "grad_norm": 4.53125, "kd_loss": 0.11376953125, "learning_rate": 3.2359083631695897e-06, "loss": 0.1327, "step": 1880, "student_loss": 0.03754269704222679, "teacher_loss": 0.0011901544639840722 }, { "epoch": 0.5435409457900807, "grad_norm": 4.625, "kd_loss": 0.10400390625, "learning_rate": 3.2106802120060197e-06, "loss": 0.1568, "step": 1885, "student_loss": 0.003520218888297677, "teacher_loss": 0.0013501073699444532 }, { "epoch": 0.5449826989619377, "grad_norm": 4.1875, "kd_loss": 0.08203125, "learning_rate": 3.185504197199999e-06, "loss": 0.1376, "step": 1890, "student_loss": 0.008974825032055378, "teacher_loss": 0.00042682504863478243 }, { "epoch": 0.5464244521337946, "grad_norm": 5.1875, "kd_loss": 0.0966796875, "learning_rate": 3.160381052318393e-06, "loss": 0.1649, "step": 1895, "student_loss": 0.0789928063750267, "teacher_loss": 0.0007635668735019863 }, { "epoch": 0.5478662053056517, "grad_norm": 2.859375, "kd_loss": 0.0791015625, "learning_rate": 3.1353115093875676e-06, "loss": 0.1554, "step": 1900, "student_loss": 0.002470338949933648, "teacher_loss": 0.0004501968214754015 }, { "epoch": 0.5493079584775087, "grad_norm": 4.46875, "kd_loss": 0.09375, "learning_rate": 3.1102962988720615e-06, "loss": 0.1432, "step": 1905, "student_loss": 0.04193798825144768, "teacher_loss": 0.0009529749513603747 }, { "epoch": 0.5507497116493656, "grad_norm": 5.25, "kd_loss": 0.09716796875, "learning_rate": 3.085336149653303e-06, "loss": 0.1487, "step": 1910, "student_loss": 0.000636325916275382, "teacher_loss": 0.0005562056903727353 }, { "epoch": 0.5521914648212226, "grad_norm": 3.359375, "kd_loss": 0.10107421875, "learning_rate": 3.060431789008368e-06, "loss": 0.1681, "step": 1915, "student_loss": 0.0016948822885751724, "teacher_loss": 0.0023001739755272865 }, { "epoch": 0.5536332179930796, "grad_norm": 3.578125, "kd_loss": 0.08447265625, "learning_rate": 3.035583942588791e-06, "loss": 0.1655, "step": 1920, "student_loss": 0.055358272045850754, "teacher_loss": 0.00030110430088825524 }, { "epoch": 0.5550749711649365, "grad_norm": 4.125, "kd_loss": 0.09814453125, "learning_rate": 3.0107933343994233e-06, "loss": 0.1582, "step": 1925, "student_loss": 0.008619318716228008, "teacher_loss": 0.0032901125960052013 }, { "epoch": 0.5565167243367936, "grad_norm": 6.75, "kd_loss": 0.08349609375, "learning_rate": 2.9860606867773323e-06, "loss": 0.1394, "step": 1930, "student_loss": 0.03425801545381546, "teacher_loss": 0.00030506699113175273 }, { "epoch": 0.5579584775086506, "grad_norm": 4.3125, "kd_loss": 0.0908203125, "learning_rate": 2.9613867203707627e-06, "loss": 0.1535, "step": 1935, "student_loss": 0.14860902726650238, "teacher_loss": 0.021592382341623306 }, { "epoch": 0.5594002306805075, "grad_norm": 4.96875, "kd_loss": 0.10546875, "learning_rate": 2.936772154118129e-06, "loss": 0.1545, "step": 1940, "student_loss": 0.007172099314630032, "teacher_loss": 0.000852234719786793 }, { "epoch": 0.5608419838523645, "grad_norm": 3.828125, "kd_loss": 0.09619140625, "learning_rate": 2.912217705227075e-06, "loss": 0.1493, "step": 1945, "student_loss": 0.04466139152646065, "teacher_loss": 0.028232689946889877 }, { "epoch": 0.5622837370242214, "grad_norm": 8.1875, "kd_loss": 0.08740234375, "learning_rate": 2.88772408915357e-06, "loss": 0.1749, "step": 1950, "student_loss": 0.0008998726261779666, "teacher_loss": 0.0005217011785134673 }, { "epoch": 0.5637254901960784, "grad_norm": 4.0625, "kd_loss": 0.10498046875, "learning_rate": 2.863292019581071e-06, "loss": 0.1535, "step": 1955, "student_loss": 0.23264119029045105, "teacher_loss": 0.0003505937347654253 }, { "epoch": 0.5651672433679354, "grad_norm": 4.5, "kd_loss": 0.11572265625, "learning_rate": 2.838922208399712e-06, "loss": 0.1646, "step": 1960, "student_loss": 0.005253693088889122, "teacher_loss": 0.0008160973084159195 }, { "epoch": 0.5666089965397924, "grad_norm": 3.359375, "kd_loss": 0.08935546875, "learning_rate": 2.8146153656855858e-06, "loss": 0.1571, "step": 1965, "student_loss": 0.0008905039285309613, "teacher_loss": 0.00038642369327135384 }, { "epoch": 0.5680507497116494, "grad_norm": 6.96875, "kd_loss": 0.10107421875, "learning_rate": 2.7903721996800248e-06, "loss": 0.1488, "step": 1970, "student_loss": 0.001944546471349895, "teacher_loss": 0.0004150049644522369 }, { "epoch": 0.5694925028835064, "grad_norm": 2.453125, "kd_loss": 0.09716796875, "learning_rate": 2.7661934167689887e-06, "loss": 0.1556, "step": 1975, "student_loss": 0.0032470019068568945, "teacher_loss": 0.0009415296372026205 }, { "epoch": 0.5709342560553633, "grad_norm": 2.25, "kd_loss": 0.08642578125, "learning_rate": 2.742079721462471e-06, "loss": 0.1674, "step": 1980, "student_loss": 0.05945152789354324, "teacher_loss": 0.0008091035415418446 }, { "epoch": 0.5723760092272203, "grad_norm": 3.65625, "kd_loss": 0.0771484375, "learning_rate": 2.7180318163739704e-06, "loss": 0.1519, "step": 1985, "student_loss": 0.0015980260213837028, "teacher_loss": 0.0005768106202594936 }, { "epoch": 0.5738177623990772, "grad_norm": 2.65625, "kd_loss": 0.115234375, "learning_rate": 2.6940504022000248e-06, "loss": 0.1546, "step": 1990, "student_loss": 0.0632084533572197, "teacher_loss": 0.015749456360936165 }, { "epoch": 0.5752595155709342, "grad_norm": 5.4375, "kd_loss": 0.0869140625, "learning_rate": 2.67013617769979e-06, "loss": 0.153, "step": 1995, "student_loss": 0.0011626326013356447, "teacher_loss": 0.000745030993130058 }, { "epoch": 0.5767012687427913, "grad_norm": 4.0, "kd_loss": 0.09912109375, "learning_rate": 2.6462898396746783e-06, "loss": 0.1493, "step": 2000, "student_loss": 0.002248254604637623, "teacher_loss": 0.0006202560034580529 } ], "logging_steps": 5, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }