sirui6011's picture
add checkpoints/codi-single-3b
8a4bc09 verified
Raw
History Blame Contribute Delete
114 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5767012687427913,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014417531718569781,
"grad_norm": 3808.0,
"kd_loss": 0.4765625,
"learning_rate": 1.3333333333333334e-06,
"loss": 2.4865,
"step": 5,
"student_loss": 1.2782293558120728,
"teacher_loss": 0.0020202866289764643
},
{
"epoch": 0.0028835063437139563,
"grad_norm": 496.0,
"kd_loss": 0.453125,
"learning_rate": 3e-06,
"loss": 2.0957,
"step": 10,
"student_loss": 1.0292338132858276,
"teacher_loss": 0.005245466250926256
},
{
"epoch": 0.004325259515570935,
"grad_norm": 238.0,
"kd_loss": 0.4453125,
"learning_rate": 4.666666666666667e-06,
"loss": 1.9295,
"step": 15,
"student_loss": 0.631219208240509,
"teacher_loss": 0.0013347615022212267
},
{
"epoch": 0.0057670126874279125,
"grad_norm": 133.0,
"kd_loss": 0.44140625,
"learning_rate": 6.333333333333333e-06,
"loss": 1.8503,
"step": 20,
"student_loss": 1.4391331672668457,
"teacher_loss": 0.0005473981145769358
},
{
"epoch": 0.00720876585928489,
"grad_norm": 75.5,
"kd_loss": 0.3984375,
"learning_rate": 8.000000000000001e-06,
"loss": 1.0972,
"step": 25,
"student_loss": 0.18605084717273712,
"teacher_loss": 0.0011649713851511478
},
{
"epoch": 0.00865051903114187,
"grad_norm": 24.25,
"kd_loss": 0.376953125,
"learning_rate": 9.666666666666667e-06,
"loss": 0.5883,
"step": 30,
"student_loss": 0.05895603448152542,
"teacher_loss": 0.030797353014349937
},
{
"epoch": 0.010092272202998846,
"grad_norm": 9.1875,
"kd_loss": 0.318359375,
"learning_rate": 9.99958042442916e-06,
"loss": 0.4657,
"step": 35,
"student_loss": 0.00801500491797924,
"teacher_loss": 0.04946871101856232
},
{
"epoch": 0.011534025374855825,
"grad_norm": 7.09375,
"kd_loss": 0.296875,
"learning_rate": 9.997876019358083e-06,
"loss": 0.402,
"step": 40,
"student_loss": 0.27407729625701904,
"teacher_loss": 0.04900167137384415
},
{
"epoch": 0.012975778546712802,
"grad_norm": 10.6875,
"kd_loss": 0.265625,
"learning_rate": 9.99486100792044e-06,
"loss": 0.3281,
"step": 45,
"student_loss": 0.3285456597805023,
"teacher_loss": 0.003808467648923397
},
{
"epoch": 0.01441753171856978,
"grad_norm": 21.875,
"kd_loss": 0.2578125,
"learning_rate": 9.990536180750724e-06,
"loss": 0.351,
"step": 50,
"student_loss": 0.03528054431080818,
"teacher_loss": 0.04187293350696564
},
{
"epoch": 0.015859284890426758,
"grad_norm": 228.0,
"kd_loss": 0.2421875,
"learning_rate": 9.984902671959911e-06,
"loss": 0.3368,
"step": 55,
"student_loss": 0.004587164148688316,
"teacher_loss": 0.0026238495483994484
},
{
"epoch": 0.01730103806228374,
"grad_norm": 6.4375,
"kd_loss": 0.23046875,
"learning_rate": 9.97796195883804e-06,
"loss": 0.3291,
"step": 60,
"student_loss": 0.048888176679611206,
"teacher_loss": 0.0033420324325561523
},
{
"epoch": 0.018742791234140715,
"grad_norm": 6.375,
"kd_loss": 0.2099609375,
"learning_rate": 9.969715861466839e-06,
"loss": 0.3147,
"step": 65,
"student_loss": 0.11189773678779602,
"teacher_loss": 0.0413309670984745
},
{
"epoch": 0.020184544405997693,
"grad_norm": 4.09375,
"kd_loss": 0.208984375,
"learning_rate": 9.96016654224243e-06,
"loss": 0.3096,
"step": 70,
"student_loss": 0.026729928329586983,
"teacher_loss": 0.0019827873911708593
},
{
"epoch": 0.02162629757785467,
"grad_norm": 4.25,
"kd_loss": 0.1748046875,
"learning_rate": 9.94931650530827e-06,
"loss": 0.2729,
"step": 75,
"student_loss": 0.006473238579928875,
"teacher_loss": 0.005465318448841572
},
{
"epoch": 0.02306805074971165,
"grad_norm": 3.859375,
"kd_loss": 0.16796875,
"learning_rate": 9.93716859589851e-06,
"loss": 0.2662,
"step": 80,
"student_loss": 0.00662571657449007,
"teacher_loss": 0.004144964274019003
},
{
"epoch": 0.024509803921568627,
"grad_norm": 2.78125,
"kd_loss": 0.1640625,
"learning_rate": 9.923725999591846e-06,
"loss": 0.2261,
"step": 85,
"student_loss": 0.004623747896403074,
"teacher_loss": 0.002723712706938386
},
{
"epoch": 0.025951557093425604,
"grad_norm": 5.5625,
"kd_loss": 0.1943359375,
"learning_rate": 9.908992241476189e-06,
"loss": 0.2543,
"step": 90,
"student_loss": 0.11273087561130524,
"teacher_loss": 0.0015502618625760078
},
{
"epoch": 0.027393310265282585,
"grad_norm": 3.4375,
"kd_loss": 0.189453125,
"learning_rate": 9.892971185224244e-06,
"loss": 0.2267,
"step": 95,
"student_loss": 0.006197327747941017,
"teacher_loss": 0.008993545547127724
},
{
"epoch": 0.02883506343713956,
"grad_norm": 6.34375,
"kd_loss": 0.134765625,
"learning_rate": 9.875667032080354e-06,
"loss": 0.2274,
"step": 100,
"student_loss": 0.0032730416860431433,
"teacher_loss": 0.0036007578019052744
},
{
"epoch": 0.03027681660899654,
"grad_norm": 4.0,
"kd_loss": 0.146484375,
"learning_rate": 9.857084319758772e-06,
"loss": 0.2421,
"step": 105,
"student_loss": 0.04058241471648216,
"teacher_loss": 0.0012296679196879268
},
{
"epoch": 0.031718569780853516,
"grad_norm": 4.6875,
"kd_loss": 0.1787109375,
"learning_rate": 9.837227921253747e-06,
"loss": 0.2273,
"step": 110,
"student_loss": 0.004547884222120047,
"teacher_loss": 0.023880567401647568
},
{
"epoch": 0.03316032295271049,
"grad_norm": 2.625,
"kd_loss": 0.14453125,
"learning_rate": 9.816103043561648e-06,
"loss": 0.2142,
"step": 115,
"student_loss": 0.001855566632002592,
"teacher_loss": 0.0016716865357011557
},
{
"epoch": 0.03460207612456748,
"grad_norm": 3.078125,
"kd_loss": 0.1357421875,
"learning_rate": 9.79371522631553e-06,
"loss": 0.2149,
"step": 120,
"student_loss": 0.019737211987376213,
"teacher_loss": 0.0027425403241068125
},
{
"epoch": 0.036043829296424454,
"grad_norm": 3.390625,
"kd_loss": 0.146484375,
"learning_rate": 9.770070340332457e-06,
"loss": 0.1956,
"step": 125,
"student_loss": 0.10938042402267456,
"teacher_loss": 0.001064821844920516
},
{
"epoch": 0.03748558246828143,
"grad_norm": 3.734375,
"kd_loss": 0.146484375,
"learning_rate": 9.745174586073982e-06,
"loss": 0.2099,
"step": 130,
"student_loss": 0.0035836249589920044,
"teacher_loss": 0.002439548959955573
},
{
"epoch": 0.03892733564013841,
"grad_norm": 3.0,
"kd_loss": 0.1337890625,
"learning_rate": 9.719034492020183e-06,
"loss": 0.202,
"step": 135,
"student_loss": 0.003862213110551238,
"teacher_loss": 0.0010834896238520741
},
{
"epoch": 0.040369088811995385,
"grad_norm": 7.1875,
"kd_loss": 0.1357421875,
"learning_rate": 9.691656912957686e-06,
"loss": 0.218,
"step": 140,
"student_loss": 0.0022195407655090094,
"teacher_loss": 0.0014228483196347952
},
{
"epoch": 0.04181084198385236,
"grad_norm": 3.703125,
"kd_loss": 0.150390625,
"learning_rate": 9.663049028182112e-06,
"loss": 0.2077,
"step": 145,
"student_loss": 0.11333022266626358,
"teacher_loss": 0.00682886503636837
},
{
"epoch": 0.04325259515570934,
"grad_norm": 2.859375,
"kd_loss": 0.14453125,
"learning_rate": 9.633218339615433e-06,
"loss": 0.1935,
"step": 150,
"student_loss": 0.0032606760505586863,
"teacher_loss": 0.0031273479107767344
},
{
"epoch": 0.04469434832756632,
"grad_norm": 3.578125,
"kd_loss": 0.12451171875,
"learning_rate": 9.602172669838721e-06,
"loss": 0.2199,
"step": 155,
"student_loss": 0.0088576041162014,
"teacher_loss": 0.0016166985733434558
},
{
"epoch": 0.0461361014994233,
"grad_norm": 4.09375,
"kd_loss": 0.1708984375,
"learning_rate": 9.569920160040815e-06,
"loss": 0.2018,
"step": 160,
"student_loss": 0.13294154405593872,
"teacher_loss": 0.03791189566254616
},
{
"epoch": 0.04757785467128028,
"grad_norm": 3.921875,
"kd_loss": 0.138671875,
"learning_rate": 9.536469267883432e-06,
"loss": 0.208,
"step": 165,
"student_loss": 0.002772042527794838,
"teacher_loss": 0.005522818770259619
},
{
"epoch": 0.049019607843137254,
"grad_norm": 5.03125,
"kd_loss": 0.126953125,
"learning_rate": 9.501828765283295e-06,
"loss": 0.1962,
"step": 170,
"student_loss": 0.003656906308606267,
"teacher_loss": 0.0018494409741833806
},
{
"epoch": 0.05046136101499423,
"grad_norm": 3.515625,
"kd_loss": 0.12451171875,
"learning_rate": 9.466007736111846e-06,
"loss": 0.1935,
"step": 175,
"student_loss": 0.017079656943678856,
"teacher_loss": 0.0010717228287830949
},
{
"epoch": 0.05190311418685121,
"grad_norm": 5.03125,
"kd_loss": 0.11669921875,
"learning_rate": 9.429015573813163e-06,
"loss": 0.1861,
"step": 180,
"student_loss": 0.003456867765635252,
"teacher_loss": 0.0010596277425065637
},
{
"epoch": 0.05334486735870819,
"grad_norm": 4.0625,
"kd_loss": 0.1455078125,
"learning_rate": 9.390861978940687e-06,
"loss": 0.1921,
"step": 185,
"student_loss": 0.31187787652015686,
"teacher_loss": 0.0008243238553404808
},
{
"epoch": 0.05478662053056517,
"grad_norm": 5.09375,
"kd_loss": 0.1416015625,
"learning_rate": 9.351556956613423e-06,
"loss": 0.2044,
"step": 190,
"student_loss": 0.011734717525541782,
"teacher_loss": 0.0015390698099508882
},
{
"epoch": 0.056228373702422146,
"grad_norm": 4.5,
"kd_loss": 0.1337890625,
"learning_rate": 9.31111081389227e-06,
"loss": 0.1778,
"step": 195,
"student_loss": 0.05741060897707939,
"teacher_loss": 0.0007789382943883538
},
{
"epoch": 0.05767012687427912,
"grad_norm": 2.28125,
"kd_loss": 0.12890625,
"learning_rate": 9.269534157077177e-06,
"loss": 0.1743,
"step": 200,
"student_loss": 0.0014264394994825125,
"teacher_loss": 0.0006533891428261995
},
{
"epoch": 0.0591118800461361,
"grad_norm": 4.25,
"kd_loss": 0.1376953125,
"learning_rate": 9.226837888925813e-06,
"loss": 0.1969,
"step": 205,
"student_loss": 0.0015782959526404738,
"teacher_loss": 0.0368424728512764
},
{
"epoch": 0.06055363321799308,
"grad_norm": 7.5625,
"kd_loss": 0.1123046875,
"learning_rate": 9.183033205794525e-06,
"loss": 0.1836,
"step": 210,
"student_loss": 0.01342203002423048,
"teacher_loss": 0.0011842605890706182
},
{
"epoch": 0.061995386389850055,
"grad_norm": 3.0,
"kd_loss": 0.126953125,
"learning_rate": 9.13813159470227e-06,
"loss": 0.1824,
"step": 215,
"student_loss": 0.0014404732501134276,
"teacher_loss": 0.0007589462329633534
},
{
"epoch": 0.06343713956170703,
"grad_norm": 7.125,
"kd_loss": 0.1328125,
"learning_rate": 9.092144830318357e-06,
"loss": 0.21,
"step": 220,
"student_loss": 0.2732444703578949,
"teacher_loss": 0.00960276648402214
},
{
"epoch": 0.06487889273356401,
"grad_norm": 6.15625,
"kd_loss": 0.1083984375,
"learning_rate": 9.045084971874738e-06,
"loss": 0.1941,
"step": 225,
"student_loss": 0.2691424489021301,
"teacher_loss": 0.003315337933599949
},
{
"epoch": 0.06632064590542099,
"grad_norm": 2.46875,
"kd_loss": 0.10986328125,
"learning_rate": 8.99696436000368e-06,
"loss": 0.1702,
"step": 230,
"student_loss": 0.08001423627138138,
"teacher_loss": 0.009347192943096161
},
{
"epoch": 0.06776239907727798,
"grad_norm": 3.734375,
"kd_loss": 0.1337890625,
"learning_rate": 8.947795613501658e-06,
"loss": 0.1778,
"step": 235,
"student_loss": 0.003426821669563651,
"teacher_loss": 0.0008243515621870756
},
{
"epoch": 0.06920415224913495,
"grad_norm": 3.421875,
"kd_loss": 0.10205078125,
"learning_rate": 8.897591626020284e-06,
"loss": 0.1928,
"step": 240,
"student_loss": 0.004231320694088936,
"teacher_loss": 0.0010777115821838379
},
{
"epoch": 0.07064590542099193,
"grad_norm": 3.640625,
"kd_loss": 0.11181640625,
"learning_rate": 8.846365562685178e-06,
"loss": 0.1721,
"step": 245,
"student_loss": 0.003189836163073778,
"teacher_loss": 0.0029278292786329985
},
{
"epoch": 0.07208765859284891,
"grad_norm": 3.28125,
"kd_loss": 0.107421875,
"learning_rate": 8.794130856643635e-06,
"loss": 0.1624,
"step": 250,
"student_loss": 0.0030161093454807997,
"teacher_loss": 0.0015030049253255129
},
{
"epoch": 0.07352941176470588,
"grad_norm": 2.671875,
"kd_loss": 0.1171875,
"learning_rate": 8.74090120554202e-06,
"loss": 0.181,
"step": 255,
"student_loss": 0.0010447532404214144,
"teacher_loss": 0.0014572968939319253
},
{
"epoch": 0.07497116493656286,
"grad_norm": 3.03125,
"kd_loss": 0.125,
"learning_rate": 8.686690567933803e-06,
"loss": 0.18,
"step": 260,
"student_loss": 0.002235305029898882,
"teacher_loss": 0.03705403953790665
},
{
"epoch": 0.07641291810841984,
"grad_norm": 3.84375,
"kd_loss": 0.10107421875,
"learning_rate": 8.63151315961915e-06,
"loss": 0.1751,
"step": 265,
"student_loss": 0.0019888102542608976,
"teacher_loss": 0.0012628042604774237
},
{
"epoch": 0.07785467128027682,
"grad_norm": 4.03125,
"kd_loss": 0.103515625,
"learning_rate": 8.575383449917103e-06,
"loss": 0.1698,
"step": 270,
"student_loss": 0.009670126251876354,
"teacher_loss": 0.0018007074249908328
},
{
"epoch": 0.07929642445213379,
"grad_norm": 5.0625,
"kd_loss": 0.11669921875,
"learning_rate": 8.518316157871232e-06,
"loss": 0.1792,
"step": 275,
"student_loss": 0.0027291348669677973,
"teacher_loss": 0.03865275904536247
},
{
"epoch": 0.08073817762399077,
"grad_norm": 6.40625,
"kd_loss": 0.12451171875,
"learning_rate": 8.460326248389825e-06,
"loss": 0.1868,
"step": 280,
"student_loss": 0.0005779159837402403,
"teacher_loss": 0.0004988706787116826
},
{
"epoch": 0.08217993079584775,
"grad_norm": 5.03125,
"kd_loss": 0.1083984375,
"learning_rate": 8.401428928321607e-06,
"loss": 0.1777,
"step": 285,
"student_loss": 0.00322159961797297,
"teacher_loss": 0.0016653644852340221
},
{
"epoch": 0.08362168396770472,
"grad_norm": 4.15625,
"kd_loss": 0.119140625,
"learning_rate": 8.341639642468002e-06,
"loss": 0.2245,
"step": 290,
"student_loss": 0.025423452258110046,
"teacher_loss": 0.006107364781200886
},
{
"epoch": 0.0850634371395617,
"grad_norm": 2.21875,
"kd_loss": 0.10498046875,
"learning_rate": 8.280974069532999e-06,
"loss": 0.1742,
"step": 295,
"student_loss": 0.0032805479131639004,
"teacher_loss": 0.002079744590446353
},
{
"epoch": 0.08650519031141868,
"grad_norm": 4.875,
"kd_loss": 0.1220703125,
"learning_rate": 8.219448118011687e-06,
"loss": 0.1698,
"step": 300,
"student_loss": 0.05386965721845627,
"teacher_loss": 0.0015291464515030384
},
{
"epoch": 0.08794694348327567,
"grad_norm": 2.703125,
"kd_loss": 0.09375,
"learning_rate": 8.157077922018537e-06,
"loss": 0.1735,
"step": 305,
"student_loss": 0.007909238338470459,
"teacher_loss": 0.0032228778582066298
},
{
"epoch": 0.08938869665513265,
"grad_norm": 6.375,
"kd_loss": 0.091796875,
"learning_rate": 8.093879837056486e-06,
"loss": 0.1662,
"step": 310,
"student_loss": 0.0014559343690052629,
"teacher_loss": 0.0014570873463526368
},
{
"epoch": 0.09083044982698962,
"grad_norm": 6.09375,
"kd_loss": 0.09716796875,
"learning_rate": 8.029870435728018e-06,
"loss": 0.1905,
"step": 315,
"student_loss": 0.13904070854187012,
"teacher_loss": 0.00045576939010061324
},
{
"epoch": 0.0922722029988466,
"grad_norm": 8.75,
"kd_loss": 0.091796875,
"learning_rate": 7.965066503389264e-06,
"loss": 0.1801,
"step": 320,
"student_loss": 0.0017298327293246984,
"teacher_loss": 0.001036101020872593
},
{
"epoch": 0.09371395617070358,
"grad_norm": 5.53125,
"kd_loss": 0.15625,
"learning_rate": 7.89948503374835e-06,
"loss": 0.1636,
"step": 325,
"student_loss": 0.0033407427836209536,
"teacher_loss": 0.02077825367450714
},
{
"epoch": 0.09515570934256055,
"grad_norm": 5.09375,
"kd_loss": 0.10693359375,
"learning_rate": 7.833143224409076e-06,
"loss": 0.1884,
"step": 330,
"student_loss": 0.006418874487280846,
"teacher_loss": 0.0011637036222964525
},
{
"epoch": 0.09659746251441753,
"grad_norm": 4.71875,
"kd_loss": 0.09716796875,
"learning_rate": 7.766058472361154e-06,
"loss": 0.1577,
"step": 335,
"student_loss": 0.0016794800758361816,
"teacher_loss": 0.0023754944559186697
},
{
"epoch": 0.09803921568627451,
"grad_norm": 5.09375,
"kd_loss": 0.09326171875,
"learning_rate": 7.698248369418146e-06,
"loss": 0.1589,
"step": 340,
"student_loss": 0.044694170355796814,
"teacher_loss": 0.007826481945812702
},
{
"epoch": 0.09948096885813149,
"grad_norm": 5.375,
"kd_loss": 0.0966796875,
"learning_rate": 7.629730697604314e-06,
"loss": 0.1807,
"step": 345,
"student_loss": 0.09194417297840118,
"teacher_loss": 0.0007945778197608888
},
{
"epoch": 0.10092272202998846,
"grad_norm": 5.875,
"kd_loss": 0.11865234375,
"learning_rate": 7.560523424491595e-06,
"loss": 0.1526,
"step": 350,
"student_loss": 0.005946993827819824,
"teacher_loss": 0.0006145219667814672
},
{
"epoch": 0.10236447520184544,
"grad_norm": 9.1875,
"kd_loss": 0.10302734375,
"learning_rate": 7.490644698487909e-06,
"loss": 0.1627,
"step": 355,
"student_loss": 0.0015843416331335902,
"teacher_loss": 0.0014968032483011484
},
{
"epoch": 0.10380622837370242,
"grad_norm": 12.625,
"kd_loss": 0.10791015625,
"learning_rate": 7.420112844078066e-06,
"loss": 0.1682,
"step": 360,
"student_loss": 0.01987134851515293,
"teacher_loss": 0.001595525536686182
},
{
"epoch": 0.1052479815455594,
"grad_norm": 6.5625,
"kd_loss": 0.103515625,
"learning_rate": 7.348946357018479e-06,
"loss": 0.1509,
"step": 365,
"student_loss": 0.006010106764733791,
"teacher_loss": 0.032394833862781525
},
{
"epoch": 0.10668973471741638,
"grad_norm": 6.6875,
"kd_loss": 0.1181640625,
"learning_rate": 7.277163899486975e-06,
"loss": 0.1623,
"step": 370,
"student_loss": 0.15845070779323578,
"teacher_loss": 0.0004756299313157797
},
{
"epoch": 0.10813148788927336,
"grad_norm": 4.625,
"kd_loss": 0.1240234375,
"learning_rate": 7.204784295188959e-06,
"loss": 0.1506,
"step": 375,
"student_loss": 0.10649572312831879,
"teacher_loss": 0.02242193929851055
},
{
"epoch": 0.10957324106113034,
"grad_norm": 3.4375,
"kd_loss": 0.1025390625,
"learning_rate": 7.1318265244212305e-06,
"loss": 0.1752,
"step": 380,
"student_loss": 0.00281524658203125,
"teacher_loss": 0.0015117195434868336
},
{
"epoch": 0.11101499423298732,
"grad_norm": 2.359375,
"kd_loss": 0.10205078125,
"learning_rate": 7.05830971909472e-06,
"loss": 0.1547,
"step": 385,
"student_loss": 0.0016335069667547941,
"teacher_loss": 0.0012311713071539998
},
{
"epoch": 0.11245674740484429,
"grad_norm": 4.125,
"kd_loss": 0.10205078125,
"learning_rate": 6.9842531577174865e-06,
"loss": 0.1538,
"step": 390,
"student_loss": 0.0012884392635896802,
"teacher_loss": 0.001418368425220251
},
{
"epoch": 0.11389850057670127,
"grad_norm": 5.1875,
"kd_loss": 0.1025390625,
"learning_rate": 6.9096762603392595e-06,
"loss": 0.1698,
"step": 395,
"student_loss": 0.0018499374855309725,
"teacher_loss": 0.0013690440682694316
},
{
"epoch": 0.11534025374855825,
"grad_norm": 6.53125,
"kd_loss": 0.10546875,
"learning_rate": 6.834598583458862e-06,
"loss": 0.16,
"step": 400,
"student_loss": 0.0014036521315574646,
"teacher_loss": 0.00040830764919519424
},
{
"epoch": 0.11678200692041522,
"grad_norm": 4.71875,
"kd_loss": 0.0888671875,
"learning_rate": 6.7590398148958625e-06,
"loss": 0.1718,
"step": 405,
"student_loss": 0.10261467099189758,
"teacher_loss": 0.0006754493806511164
},
{
"epoch": 0.1182237600922722,
"grad_norm": 4.3125,
"kd_loss": 0.171875,
"learning_rate": 6.6830197686277945e-06,
"loss": 0.1878,
"step": 410,
"student_loss": 0.4882833659648895,
"teacher_loss": 0.00981883890926838
},
{
"epoch": 0.11966551326412918,
"grad_norm": 2.8125,
"kd_loss": 0.1083984375,
"learning_rate": 6.6065583795942625e-06,
"loss": 0.182,
"step": 415,
"student_loss": 0.03729023039340973,
"teacher_loss": 0.0042837257497012615
},
{
"epoch": 0.12110726643598616,
"grad_norm": 4.65625,
"kd_loss": 0.1064453125,
"learning_rate": 6.52967569846937e-06,
"loss": 0.1607,
"step": 420,
"student_loss": 0.05881139263510704,
"teacher_loss": 0.024456653743982315
},
{
"epoch": 0.12254901960784313,
"grad_norm": 4.25,
"kd_loss": 0.10009765625,
"learning_rate": 6.452391886403767e-06,
"loss": 0.1674,
"step": 425,
"student_loss": 0.05037780851125717,
"teacher_loss": 0.0040146904066205025
},
{
"epoch": 0.12399077277970011,
"grad_norm": 4.75,
"kd_loss": 0.1748046875,
"learning_rate": 6.374727209737743e-06,
"loss": 0.1766,
"step": 430,
"student_loss": 0.00238221138715744,
"teacher_loss": 0.06439146399497986
},
{
"epoch": 0.1254325259515571,
"grad_norm": 5.53125,
"kd_loss": 0.1015625,
"learning_rate": 6.296702034686726e-06,
"loss": 0.1714,
"step": 435,
"student_loss": 0.002659996272996068,
"teacher_loss": 0.0022907655220478773
},
{
"epoch": 0.12687427912341406,
"grad_norm": 3.078125,
"kd_loss": 0.1845703125,
"learning_rate": 6.218336822000598e-06,
"loss": 0.1775,
"step": 440,
"student_loss": 0.46329638361930847,
"teacher_loss": 0.008188321255147457
},
{
"epoch": 0.12831603229527105,
"grad_norm": 3.953125,
"kd_loss": 0.1259765625,
"learning_rate": 6.139652121598219e-06,
"loss": 0.1769,
"step": 445,
"student_loss": 0.0006292742909863591,
"teacher_loss": 0.02016839198768139
},
{
"epoch": 0.12975778546712802,
"grad_norm": 3.53125,
"kd_loss": 0.10400390625,
"learning_rate": 6.060668567178561e-06,
"loss": 0.1663,
"step": 450,
"student_loss": 0.002717025112360716,
"teacher_loss": 0.0016874197172001004
},
{
"epoch": 0.131199538638985,
"grad_norm": 2.671875,
"kd_loss": 0.087890625,
"learning_rate": 5.981406870809889e-06,
"loss": 0.1748,
"step": 455,
"student_loss": 0.012300008907914162,
"teacher_loss": 0.0016890015685930848
},
{
"epoch": 0.13264129181084197,
"grad_norm": 5.1875,
"kd_loss": 0.10791015625,
"learning_rate": 5.9018878174983674e-06,
"loss": 0.17,
"step": 460,
"student_loss": 0.03240777552127838,
"teacher_loss": 0.0010722745209932327
},
{
"epoch": 0.13408304498269896,
"grad_norm": 2.765625,
"kd_loss": 0.1328125,
"learning_rate": 5.822132259737565e-06,
"loss": 0.1858,
"step": 465,
"student_loss": 0.0023128872271627188,
"teacher_loss": 0.0006816239329054952
},
{
"epoch": 0.13552479815455595,
"grad_norm": 5.3125,
"kd_loss": 0.09521484375,
"learning_rate": 5.742161112040237e-06,
"loss": 0.1887,
"step": 470,
"student_loss": 0.0013243159046396613,
"teacher_loss": 0.0008191668312065303
},
{
"epoch": 0.13696655132641292,
"grad_norm": 4.34375,
"kd_loss": 0.12890625,
"learning_rate": 5.661995345453867e-06,
"loss": 0.1479,
"step": 475,
"student_loss": 0.0022922754287719727,
"teacher_loss": 0.0007053640438243747
},
{
"epoch": 0.1384083044982699,
"grad_norm": 4.1875,
"kd_loss": 0.10888671875,
"learning_rate": 5.581655982061367e-06,
"loss": 0.2052,
"step": 480,
"student_loss": 0.016067378222942352,
"teacher_loss": 0.0467948317527771
},
{
"epoch": 0.13985005767012687,
"grad_norm": 4.625,
"kd_loss": 0.09521484375,
"learning_rate": 5.501164089468406e-06,
"loss": 0.1535,
"step": 485,
"student_loss": 0.001838831347413361,
"teacher_loss": 0.0018535954877734184
},
{
"epoch": 0.14129181084198386,
"grad_norm": 4.3125,
"kd_loss": 0.099609375,
"learning_rate": 5.4205407752787884e-06,
"loss": 0.1702,
"step": 490,
"student_loss": 0.0021060549188405275,
"teacher_loss": 0.0013811348471790552
},
{
"epoch": 0.14273356401384082,
"grad_norm": 5.0,
"kd_loss": 0.12109375,
"learning_rate": 5.339807181559359e-06,
"loss": 0.1698,
"step": 495,
"student_loss": 0.004670781549066305,
"teacher_loss": 0.0008499641553498805
},
{
"epoch": 0.14417531718569782,
"grad_norm": 7.28125,
"kd_loss": 0.1005859375,
"learning_rate": 5.258984479295853e-06,
"loss": 0.1663,
"step": 500,
"student_loss": 0.0009078571456484497,
"teacher_loss": 0.0008732817368581891
},
{
"epoch": 0.14561707035755478,
"grad_norm": 4.8125,
"kd_loss": 0.09521484375,
"learning_rate": 5.1780938628411795e-06,
"loss": 0.1857,
"step": 505,
"student_loss": 0.002319552004337311,
"teacher_loss": 0.0009417013498023152
},
{
"epoch": 0.14705882352941177,
"grad_norm": 5.5,
"kd_loss": 0.09130859375,
"learning_rate": 5.097156544357567e-06,
"loss": 0.168,
"step": 510,
"student_loss": 0.001904567121528089,
"teacher_loss": 0.0011415554909035563
},
{
"epoch": 0.14850057670126873,
"grad_norm": 4.9375,
"kd_loss": 0.11474609375,
"learning_rate": 5.016193748254045e-06,
"loss": 0.1561,
"step": 515,
"student_loss": 0.004430091939866543,
"teacher_loss": 0.000705283775459975
},
{
"epoch": 0.14994232987312572,
"grad_norm": 4.0,
"kd_loss": 0.1005859375,
"learning_rate": 4.935226705620699e-06,
"loss": 0.1742,
"step": 520,
"student_loss": 0.4650050103664398,
"teacher_loss": 0.011486685834825039
},
{
"epoch": 0.1513840830449827,
"grad_norm": 2.28125,
"kd_loss": 0.09423828125,
"learning_rate": 4.8542766486612035e-06,
"loss": 0.1568,
"step": 525,
"student_loss": 0.004688178189098835,
"teacher_loss": 0.0005817305063828826
},
{
"epoch": 0.15282583621683968,
"grad_norm": 6.21875,
"kd_loss": 0.1025390625,
"learning_rate": 4.773364805125025e-06,
"loss": 0.1569,
"step": 530,
"student_loss": 0.002902889158576727,
"teacher_loss": 0.0036108619533479214
},
{
"epoch": 0.15426758938869667,
"grad_norm": 2.8125,
"kd_loss": 0.0947265625,
"learning_rate": 4.6925123927408265e-06,
"loss": 0.146,
"step": 535,
"student_loss": 0.004958340898156166,
"teacher_loss": 0.0009314365452155471
},
{
"epoch": 0.15570934256055363,
"grad_norm": 5.09375,
"kd_loss": 0.1083984375,
"learning_rate": 4.611740613652485e-06,
"loss": 0.1485,
"step": 540,
"student_loss": 0.022316506132483482,
"teacher_loss": 0.0009606878156773746
},
{
"epoch": 0.15715109573241062,
"grad_norm": 5.90625,
"kd_loss": 0.095703125,
"learning_rate": 4.531070648859186e-06,
"loss": 0.171,
"step": 545,
"student_loss": 0.005919112823903561,
"teacher_loss": 0.016547029837965965
},
{
"epoch": 0.15859284890426759,
"grad_norm": 4.375,
"kd_loss": 0.1123046875,
"learning_rate": 4.450523652661086e-06,
"loss": 0.142,
"step": 550,
"student_loss": 0.0007885328959673643,
"teacher_loss": 0.0045303236693143845
},
{
"epoch": 0.16003460207612458,
"grad_norm": 4.03125,
"kd_loss": 0.09326171875,
"learning_rate": 4.370120747111956e-06,
"loss": 0.1566,
"step": 555,
"student_loss": 0.0045122369192540646,
"teacher_loss": 0.0012258175993338227
},
{
"epoch": 0.16147635524798154,
"grad_norm": 5.1875,
"kd_loss": 0.09423828125,
"learning_rate": 4.289883016480291e-06,
"loss": 0.1694,
"step": 560,
"student_loss": 0.038154710084199905,
"teacher_loss": 0.00046423348248936236
},
{
"epoch": 0.16291810841983853,
"grad_norm": 3.34375,
"kd_loss": 0.109375,
"learning_rate": 4.209831501720328e-06,
"loss": 0.1557,
"step": 565,
"student_loss": 0.018078487366437912,
"teacher_loss": 0.021091489121317863
},
{
"epoch": 0.1643598615916955,
"grad_norm": 5.21875,
"kd_loss": 0.1689453125,
"learning_rate": 4.129987194954421e-06,
"loss": 0.17,
"step": 570,
"student_loss": 0.15178009867668152,
"teacher_loss": 0.0086033521220088
},
{
"epoch": 0.16580161476355249,
"grad_norm": 2.890625,
"kd_loss": 0.08544921875,
"learning_rate": 4.050371033968216e-06,
"loss": 0.1651,
"step": 575,
"student_loss": 0.0016716659301891923,
"teacher_loss": 0.0008001797832548618
},
{
"epoch": 0.16724336793540945,
"grad_norm": 4.1875,
"kd_loss": 0.236328125,
"learning_rate": 3.9710038967200825e-06,
"loss": 0.1443,
"step": 580,
"student_loss": 0.004638470709323883,
"teacher_loss": 0.006588623858988285
},
{
"epoch": 0.16868512110726644,
"grad_norm": 2.890625,
"kd_loss": 0.0966796875,
"learning_rate": 3.89190659586623e-06,
"loss": 0.1551,
"step": 585,
"student_loss": 0.00187311926856637,
"teacher_loss": 0.0005596915725618601
},
{
"epoch": 0.1701268742791234,
"grad_norm": 6.65625,
"kd_loss": 0.091796875,
"learning_rate": 3.8130998733029517e-06,
"loss": 0.1722,
"step": 590,
"student_loss": 0.017516393214464188,
"teacher_loss": 0.002362610539421439
},
{
"epoch": 0.1715686274509804,
"grad_norm": 2.234375,
"kd_loss": 0.09423828125,
"learning_rate": 3.734604394727419e-06,
"loss": 0.1736,
"step": 595,
"student_loss": 0.0015100985765457153,
"teacher_loss": 0.0012370356125757098
},
{
"epoch": 0.17301038062283736,
"grad_norm": 5.375,
"kd_loss": 0.1064453125,
"learning_rate": 3.656440744218464e-06,
"loss": 0.1822,
"step": 600,
"student_loss": 0.3471376895904541,
"teacher_loss": 0.006922336760908365
},
{
"epoch": 0.17445213379469435,
"grad_norm": 5.65625,
"kd_loss": 0.10400390625,
"learning_rate": 3.578629418838757e-06,
"loss": 0.1706,
"step": 605,
"student_loss": 0.09560892730951309,
"teacher_loss": 0.04084807634353638
},
{
"epoch": 0.17589388696655134,
"grad_norm": 3.609375,
"kd_loss": 0.10888671875,
"learning_rate": 3.5011908232598124e-06,
"loss": 0.1418,
"step": 610,
"student_loss": 0.0035140912514179945,
"teacher_loss": 0.0005105194286443293
},
{
"epoch": 0.1773356401384083,
"grad_norm": 3.375,
"kd_loss": 0.10498046875,
"learning_rate": 3.4241452644112085e-06,
"loss": 0.1453,
"step": 615,
"student_loss": 0.0014288002857938409,
"teacher_loss": 0.001070382189936936
},
{
"epoch": 0.1787773933102653,
"grad_norm": 2.578125,
"kd_loss": 0.1416015625,
"learning_rate": 3.3475129461554567e-06,
"loss": 0.1677,
"step": 620,
"student_loss": 0.0047634500078856945,
"teacher_loss": 0.009211473166942596
},
{
"epoch": 0.18021914648212226,
"grad_norm": 3.578125,
"kd_loss": 0.09814453125,
"learning_rate": 3.271313963989886e-06,
"loss": 0.1556,
"step": 625,
"student_loss": 0.019517898559570312,
"teacher_loss": 0.004466219339519739
},
{
"epoch": 0.18166089965397925,
"grad_norm": 4.21875,
"kd_loss": 0.10009765625,
"learning_rate": 3.195568299776945e-06,
"loss": 0.1587,
"step": 630,
"student_loss": 0.09341763705015182,
"teacher_loss": 0.0017769263358786702
},
{
"epoch": 0.1831026528258362,
"grad_norm": 4.21875,
"kd_loss": 0.09130859375,
"learning_rate": 3.1202958165043053e-06,
"loss": 0.1877,
"step": 635,
"student_loss": 0.0012313922634348273,
"teacher_loss": 0.0007036713068373501
},
{
"epoch": 0.1845444059976932,
"grad_norm": 5.84375,
"kd_loss": 0.0966796875,
"learning_rate": 3.045516253076137e-06,
"loss": 0.1654,
"step": 640,
"student_loss": 0.001555976108647883,
"teacher_loss": 0.0010528129059821367
},
{
"epoch": 0.18598615916955016,
"grad_norm": 7.96875,
"kd_loss": 0.09228515625,
"learning_rate": 2.9712492191369245e-06,
"loss": 0.1564,
"step": 645,
"student_loss": 0.0033667683601379395,
"teacher_loss": 0.0009755496867001057
},
{
"epoch": 0.18742791234140715,
"grad_norm": 2.921875,
"kd_loss": 0.11376953125,
"learning_rate": 2.8975141899291777e-06,
"loss": 0.1552,
"step": 650,
"student_loss": 0.001696955063380301,
"teacher_loss": 0.0012513434048742056
},
{
"epoch": 0.18886966551326412,
"grad_norm": 3.234375,
"kd_loss": 0.08544921875,
"learning_rate": 2.8243305011863843e-06,
"loss": 0.1481,
"step": 655,
"student_loss": 0.027264071628451347,
"teacher_loss": 0.0005043753772042692
},
{
"epoch": 0.1903114186851211,
"grad_norm": 3.84375,
"kd_loss": 0.0966796875,
"learning_rate": 2.751717344062552e-06,
"loss": 0.1658,
"step": 660,
"student_loss": 0.006026037037372589,
"teacher_loss": 0.0037035837303847075
},
{
"epoch": 0.19175317185697807,
"grad_norm": 3.96875,
"kd_loss": 0.11181640625,
"learning_rate": 2.6796937600996587e-06,
"loss": 0.1585,
"step": 665,
"student_loss": 0.0023006678093224764,
"teacher_loss": 0.0006673650932498276
},
{
"epoch": 0.19319492502883506,
"grad_norm": 4.125,
"kd_loss": 0.08837890625,
"learning_rate": 2.6082786362343377e-06,
"loss": 0.1818,
"step": 670,
"student_loss": 0.0015634546289220452,
"teacher_loss": 0.0005979883135296404
},
{
"epoch": 0.19463667820069205,
"grad_norm": 3.59375,
"kd_loss": 0.09814453125,
"learning_rate": 2.5374906998451094e-06,
"loss": 0.1598,
"step": 675,
"student_loss": 0.0016033351421356201,
"teacher_loss": 0.001516613527201116
},
{
"epoch": 0.19607843137254902,
"grad_norm": 3.625,
"kd_loss": 0.162109375,
"learning_rate": 2.467348513841447e-06,
"loss": 0.1566,
"step": 680,
"student_loss": 0.15181653201580048,
"teacher_loss": 0.04114415496587753
},
{
"epoch": 0.197520184544406,
"grad_norm": 2.703125,
"kd_loss": 0.15625,
"learning_rate": 2.3978704717959777e-06,
"loss": 0.154,
"step": 685,
"student_loss": 0.0007377453148365021,
"teacher_loss": 0.0339120589196682
},
{
"epoch": 0.19896193771626297,
"grad_norm": 3.15625,
"kd_loss": 0.09521484375,
"learning_rate": 2.329074793121085e-06,
"loss": 0.1582,
"step": 690,
"student_loss": 0.0044479165226221085,
"teacher_loss": 0.012265580706298351
},
{
"epoch": 0.20040369088811996,
"grad_norm": 3.234375,
"kd_loss": 0.10693359375,
"learning_rate": 2.260979518291186e-06,
"loss": 0.1724,
"step": 695,
"student_loss": 0.015444566495716572,
"teacher_loss": 0.010763188824057579
},
{
"epoch": 0.20184544405997693,
"grad_norm": 3.75,
"kd_loss": 0.091796875,
"learning_rate": 2.1936025041119268e-06,
"loss": 0.1753,
"step": 700,
"student_loss": 0.0019369354704394937,
"teacher_loss": 0.0009062191820703447
},
{
"epoch": 0.20328719723183392,
"grad_norm": 3.296875,
"kd_loss": 0.1044921875,
"learning_rate": 2.1269614190375477e-06,
"loss": 0.1584,
"step": 705,
"student_loss": 0.001297777402214706,
"teacher_loss": 0.0018579652532935143
},
{
"epoch": 0.20472895040369088,
"grad_norm": 3.75,
"kd_loss": 0.10400390625,
"learning_rate": 2.061073738537635e-06,
"loss": 0.1901,
"step": 710,
"student_loss": 0.08087821304798126,
"teacher_loss": 0.004622929729521275
},
{
"epoch": 0.20617070357554787,
"grad_norm": 4.1875,
"kd_loss": 0.08935546875,
"learning_rate": 1.9959567405144825e-06,
"loss": 0.1863,
"step": 715,
"student_loss": 0.009472950361669064,
"teacher_loss": 0.007570087444037199
},
{
"epoch": 0.20761245674740483,
"grad_norm": 4.34375,
"kd_loss": 0.11767578125,
"learning_rate": 1.931627500772263e-06,
"loss": 0.1746,
"step": 720,
"student_loss": 0.001279592514038086,
"teacher_loss": 0.004464911296963692
},
{
"epoch": 0.20905420991926182,
"grad_norm": 2.25,
"kd_loss": 0.10888671875,
"learning_rate": 1.8681028885391905e-06,
"loss": 0.1528,
"step": 725,
"student_loss": 0.0024647831451147795,
"teacher_loss": 0.0011802453082054853
},
{
"epoch": 0.2104959630911188,
"grad_norm": 2.671875,
"kd_loss": 0.0986328125,
"learning_rate": 1.8053995620438625e-06,
"loss": 0.152,
"step": 730,
"student_loss": 0.04315745085477829,
"teacher_loss": 0.00156076205894351
},
{
"epoch": 0.21193771626297578,
"grad_norm": 4.78125,
"kd_loss": 0.10498046875,
"learning_rate": 1.743533964146924e-06,
"loss": 0.1704,
"step": 735,
"student_loss": 0.0016925001982599497,
"teacher_loss": 0.0004609136376529932
},
{
"epoch": 0.21337946943483277,
"grad_norm": 3.9375,
"kd_loss": 0.1142578125,
"learning_rate": 1.6825223180292138e-06,
"loss": 0.1432,
"step": 740,
"student_loss": 0.012965809553861618,
"teacher_loss": 0.0004022814682684839
},
{
"epoch": 0.21482122260668973,
"grad_norm": 5.1875,
"kd_loss": 0.12890625,
"learning_rate": 1.6223806229375182e-06,
"loss": 0.1491,
"step": 745,
"student_loss": 0.1358025223016739,
"teacher_loss": 0.02106391452252865
},
{
"epoch": 0.21626297577854672,
"grad_norm": 4.4375,
"kd_loss": 0.11328125,
"learning_rate": 1.563124649989043e-06,
"loss": 0.1605,
"step": 750,
"student_loss": 0.10271821916103363,
"teacher_loss": 0.004584586247801781
},
{
"epoch": 0.2177047289504037,
"grad_norm": 4.28125,
"kd_loss": 0.091796875,
"learning_rate": 1.5047699380357134e-06,
"loss": 0.1681,
"step": 755,
"student_loss": 0.1378186047077179,
"teacher_loss": 0.006503281649202108
},
{
"epoch": 0.21914648212226068,
"grad_norm": 9.5,
"kd_loss": 0.0908203125,
"learning_rate": 1.4473317895893773e-06,
"loss": 0.16,
"step": 760,
"student_loss": 0.4880536198616028,
"teacher_loss": 0.00078756851144135
},
{
"epoch": 0.22058823529411764,
"grad_norm": 6.96875,
"kd_loss": 0.11328125,
"learning_rate": 1.39082526680899e-06,
"loss": 0.1728,
"step": 765,
"student_loss": 0.07314120978116989,
"teacher_loss": 0.0007181121036410332
},
{
"epoch": 0.22202998846597463,
"grad_norm": 2.65625,
"kd_loss": 0.09912109375,
"learning_rate": 1.3352651875508204e-06,
"loss": 0.1513,
"step": 770,
"student_loss": 0.004254591651260853,
"teacher_loss": 0.0007984668482095003
},
{
"epoch": 0.2234717416378316,
"grad_norm": 3.640625,
"kd_loss": 0.1044921875,
"learning_rate": 1.2806661214827286e-06,
"loss": 0.1587,
"step": 775,
"student_loss": 0.002741985023021698,
"teacher_loss": 0.0007013682625256479
},
{
"epoch": 0.22491349480968859,
"grad_norm": 4.53125,
"kd_loss": 0.09765625,
"learning_rate": 1.2270423862635188e-06,
"loss": 0.1708,
"step": 780,
"student_loss": 0.0015033041127026081,
"teacher_loss": 0.0006990543915890157
},
{
"epoch": 0.22635524798154555,
"grad_norm": 3.90625,
"kd_loss": 0.09423828125,
"learning_rate": 1.1744080437883859e-06,
"loss": 0.1409,
"step": 785,
"student_loss": 0.001736114383675158,
"teacher_loss": 0.001990710385143757
},
{
"epoch": 0.22779700115340254,
"grad_norm": 9.25,
"kd_loss": 0.11083984375,
"learning_rate": 1.1227768965014246e-06,
"loss": 0.1804,
"step": 790,
"student_loss": 0.03133748471736908,
"teacher_loss": 0.008059236221015453
},
{
"epoch": 0.2292387543252595,
"grad_norm": 3.84375,
"kd_loss": 0.11474609375,
"learning_rate": 1.0721624837761768e-06,
"loss": 0.1703,
"step": 795,
"student_loss": 0.005942783784121275,
"teacher_loss": 0.0006947139045223594
},
{
"epoch": 0.2306805074971165,
"grad_norm": 3.921875,
"kd_loss": 0.166015625,
"learning_rate": 1.0225780783651689e-06,
"loss": 0.1879,
"step": 800,
"student_loss": 0.05473716929554939,
"teacher_loss": 0.03766282647848129
},
{
"epoch": 0.23212226066897348,
"grad_norm": 4.34375,
"kd_loss": 0.08642578125,
"learning_rate": 9.740366829193587e-07,
"loss": 0.1824,
"step": 805,
"student_loss": 0.001553440117277205,
"teacher_loss": 0.0012171101989224553
},
{
"epoch": 0.23356401384083045,
"grad_norm": 7.34375,
"kd_loss": 0.09423828125,
"learning_rate": 9.265510265784189e-07,
"loss": 0.1771,
"step": 810,
"student_loss": 0.0017513898201286793,
"teacher_loss": 0.0006000488647259772
},
{
"epoch": 0.23500576701268744,
"grad_norm": 5.46875,
"kd_loss": 0.09912109375,
"learning_rate": 8.801335616327378e-07,
"loss": 0.1664,
"step": 815,
"student_loss": 0.007318615913391113,
"teacher_loss": 0.010634765028953552
},
{
"epoch": 0.2364475201845444,
"grad_norm": 3.90625,
"kd_loss": 0.11474609375,
"learning_rate": 8.347964602580245e-07,
"loss": 0.1615,
"step": 820,
"student_loss": 0.04161018878221512,
"teacher_loss": 0.001969601958990097
},
{
"epoch": 0.2378892733564014,
"grad_norm": 4.65625,
"kd_loss": 0.1142578125,
"learning_rate": 7.905516113233652e-07,
"loss": 0.1532,
"step": 825,
"student_loss": 0.000943031394854188,
"teacher_loss": 0.020420216023921967
},
{
"epoch": 0.23933102652825836,
"grad_norm": 2.875,
"kd_loss": 0.09521484375,
"learning_rate": 7.474106172735746e-07,
"loss": 0.1601,
"step": 830,
"student_loss": 0.018866391852498055,
"teacher_loss": 0.0037704347632825375
},
{
"epoch": 0.24077277970011535,
"grad_norm": 5.09375,
"kd_loss": 0.0888671875,
"learning_rate": 7.053847910866513e-07,
"loss": 0.1552,
"step": 835,
"student_loss": 0.12261331081390381,
"teacher_loss": 0.005213484168052673
},
{
"epoch": 0.2422145328719723,
"grad_norm": 2.84375,
"kd_loss": 0.11767578125,
"learning_rate": 6.644851533071556e-07,
"loss": 0.1478,
"step": 840,
"student_loss": 0.0071019199676811695,
"teacher_loss": 0.0005135077517479658
},
{
"epoch": 0.2436562860438293,
"grad_norm": 4.6875,
"kd_loss": 0.1337890625,
"learning_rate": 6.24722429156251e-07,
"loss": 0.228,
"step": 845,
"student_loss": 0.0022525617387145758,
"teacher_loss": 0.0012849880149587989
},
{
"epoch": 0.24509803921568626,
"grad_norm": 6.5625,
"kd_loss": 0.0966796875,
"learning_rate": 5.861070457192081e-07,
"loss": 0.1695,
"step": 850,
"student_loss": 0.06699959933757782,
"teacher_loss": 0.0007787467329762876
},
{
"epoch": 0.24653979238754326,
"grad_norm": 2.46875,
"kd_loss": 0.10546875,
"learning_rate": 5.486491292110796e-07,
"loss": 0.1498,
"step": 855,
"student_loss": 0.0011905976571142673,
"teacher_loss": 0.0006796009838581085
},
{
"epoch": 0.24798154555940022,
"grad_norm": 3.40625,
"kd_loss": 0.10107421875,
"learning_rate": 5.123585023212785e-07,
"loss": 0.1846,
"step": 860,
"student_loss": 0.005176758859306574,
"teacher_loss": 0.0015740481903776526
},
{
"epoch": 0.2494232987312572,
"grad_norm": 3.46875,
"kd_loss": 0.095703125,
"learning_rate": 4.772446816377408e-07,
"loss": 0.1519,
"step": 865,
"student_loss": 0.0017797622131183743,
"teacher_loss": 0.001129323965869844
},
{
"epoch": 0.2508650519031142,
"grad_norm": 2.515625,
"kd_loss": 0.09326171875,
"learning_rate": 4.4331687515137614e-07,
"loss": 0.1724,
"step": 870,
"student_loss": 0.0048367022536695,
"teacher_loss": 0.0008864006958901882
},
{
"epoch": 0.25230680507497116,
"grad_norm": 2.390625,
"kd_loss": 0.09619140625,
"learning_rate": 4.1058397984142405e-07,
"loss": 0.1396,
"step": 875,
"student_loss": 0.0008503241115249693,
"teacher_loss": 0.0008235117420554161
},
{
"epoch": 0.2537485582468281,
"grad_norm": 3.0625,
"kd_loss": 0.09423828125,
"learning_rate": 3.790545793423761e-07,
"loss": 0.1662,
"step": 880,
"student_loss": 0.0019780993461608887,
"teacher_loss": 0.0007267682813107967
},
{
"epoch": 0.25519031141868515,
"grad_norm": 3.25,
"kd_loss": 0.1025390625,
"learning_rate": 3.4873694169306915e-07,
"loss": 0.1567,
"step": 885,
"student_loss": 0.020344872027635574,
"teacher_loss": 0.056762393563985825
},
{
"epoch": 0.2566320645905421,
"grad_norm": 2.796875,
"kd_loss": 0.09326171875,
"learning_rate": 3.196390171685343e-07,
"loss": 0.1636,
"step": 890,
"student_loss": 0.001583437086082995,
"teacher_loss": 0.00122374901548028
},
{
"epoch": 0.25807381776239907,
"grad_norm": 4.09375,
"kd_loss": 0.11083984375,
"learning_rate": 2.917684361951728e-07,
"loss": 0.1583,
"step": 895,
"student_loss": 0.11338726431131363,
"teacher_loss": 0.006426448002457619
},
{
"epoch": 0.25951557093425603,
"grad_norm": 3.34375,
"kd_loss": 0.1201171875,
"learning_rate": 2.65132507349814e-07,
"loss": 0.1934,
"step": 900,
"student_loss": 0.0020212531089782715,
"teacher_loss": 0.03192909434437752
},
{
"epoch": 0.26095732410611305,
"grad_norm": 3.3125,
"kd_loss": 0.09716796875,
"learning_rate": 2.397382154431621e-07,
"loss": 0.1627,
"step": 905,
"student_loss": 0.0032915188930928707,
"teacher_loss": 0.0014988789334893227
},
{
"epoch": 0.26239907727797,
"grad_norm": 4.0,
"kd_loss": 0.0986328125,
"learning_rate": 2.1559221968815547e-07,
"loss": 0.182,
"step": 910,
"student_loss": 0.001489490270614624,
"teacher_loss": 0.0012580000329762697
},
{
"epoch": 0.263840830449827,
"grad_norm": 3.390625,
"kd_loss": 0.09716796875,
"learning_rate": 1.9270085195370048e-07,
"loss": 0.143,
"step": 915,
"student_loss": 0.04323554039001465,
"teacher_loss": 0.001784435473382473
},
{
"epoch": 0.26528258362168394,
"grad_norm": 2.546875,
"kd_loss": 0.1318359375,
"learning_rate": 1.7107011510424766e-07,
"loss": 0.1721,
"step": 920,
"student_loss": 0.01360052265226841,
"teacher_loss": 0.018217744305729866
},
{
"epoch": 0.26672433679354096,
"grad_norm": 3.71875,
"kd_loss": 0.087890625,
"learning_rate": 1.5070568142564912e-07,
"loss": 0.1489,
"step": 925,
"student_loss": 0.0011945515871047974,
"teacher_loss": 0.0009807685855776072
},
{
"epoch": 0.2681660899653979,
"grad_norm": 3.40625,
"kd_loss": 0.1142578125,
"learning_rate": 1.3161289113769405e-07,
"loss": 0.1539,
"step": 930,
"student_loss": 0.055781442672014236,
"teacher_loss": 0.0011405627010390162
},
{
"epoch": 0.2696078431372549,
"grad_norm": 2.84375,
"kd_loss": 0.10009765625,
"learning_rate": 1.1379675099373489e-07,
"loss": 0.1501,
"step": 935,
"student_loss": 0.005637112073600292,
"teacher_loss": 0.002441459335386753
},
{
"epoch": 0.2710495963091119,
"grad_norm": 5.625,
"kd_loss": 0.18359375,
"learning_rate": 9.726193296774767e-08,
"loss": 0.1684,
"step": 940,
"student_loss": 0.011978531256318092,
"teacher_loss": 0.009902331046760082
},
{
"epoch": 0.27249134948096887,
"grad_norm": 5.375,
"kd_loss": 0.0966796875,
"learning_rate": 8.201277302919086e-08,
"loss": 0.1661,
"step": 945,
"student_loss": 0.12388397753238678,
"teacher_loss": 0.0019162542885169387
},
{
"epoch": 0.27393310265282583,
"grad_norm": 3.046875,
"kd_loss": 0.12451171875,
"learning_rate": 6.805327000596995e-08,
"loss": 0.1539,
"step": 950,
"student_loss": 0.004596967715770006,
"teacher_loss": 0.0005623517790809274
},
{
"epoch": 0.2753748558246828,
"grad_norm": 4.3125,
"kd_loss": 0.083984375,
"learning_rate": 5.538708453581787e-08,
"loss": 0.1616,
"step": 955,
"student_loss": 0.007824474945664406,
"teacher_loss": 0.0011617924319580197
},
{
"epoch": 0.2768166089965398,
"grad_norm": 8.0625,
"kd_loss": 0.0947265625,
"learning_rate": 4.40175381063529e-08,
"loss": 0.1586,
"step": 960,
"student_loss": 0.0018909722566604614,
"teacher_loss": 0.00250077061355114
},
{
"epoch": 0.2782583621683968,
"grad_norm": 4.40625,
"kd_loss": 0.162109375,
"learning_rate": 3.394761218407705e-08,
"loss": 0.1666,
"step": 965,
"student_loss": 0.11850693821907043,
"teacher_loss": 0.008037789724767208
},
{
"epoch": 0.27970011534025374,
"grad_norm": 3.1875,
"kd_loss": 0.11083984375,
"learning_rate": 2.5179947432540376e-08,
"loss": 0.1665,
"step": 970,
"student_loss": 0.0006995275616645813,
"teacher_loss": 0.0004927213303744793
},
{
"epoch": 0.2811418685121107,
"grad_norm": 2.9375,
"kd_loss": 0.087890625,
"learning_rate": 1.7716843019867646e-08,
"loss": 0.1614,
"step": 975,
"student_loss": 0.11272090673446655,
"teacher_loss": 0.002362866187468171
},
{
"epoch": 0.2825836216839677,
"grad_norm": 4.6875,
"kd_loss": 0.0927734375,
"learning_rate": 1.156025601584676e-08,
"loss": 0.1578,
"step": 980,
"student_loss": 0.002314644167199731,
"teacher_loss": 0.0005237645236775279
},
{
"epoch": 0.2840253748558247,
"grad_norm": 5.28125,
"kd_loss": 0.09814453125,
"learning_rate": 6.711800878718144e-09,
"loss": 0.1708,
"step": 985,
"student_loss": 0.0012468647910282016,
"teacher_loss": 0.0008435134077444673
},
{
"epoch": 0.28546712802768165,
"grad_norm": 9.3125,
"kd_loss": 0.09130859375,
"learning_rate": 3.1727490318111953e-09,
"loss": 0.1632,
"step": 990,
"student_loss": 0.002036402700468898,
"teacher_loss": 0.0007938549388200045
},
{
"epoch": 0.2869088811995386,
"grad_norm": 4.5,
"kd_loss": 0.10986328125,
"learning_rate": 9.440285301370865e-10,
"loss": 0.183,
"step": 995,
"student_loss": 0.0015997332520782948,
"teacher_loss": 0.00439803209155798
},
{
"epoch": 0.28835063437139563,
"grad_norm": 2.375,
"kd_loss": 0.09423828125,
"learning_rate": 2.622381702066523e-11,
"loss": 0.1477,
"step": 1000,
"student_loss": 0.07939934730529785,
"teacher_loss": 0.0005344336968846619
},
{
"epoch": 0.28835063437139563,
"kd_loss": 0.09423828125,
"step": 1000,
"student_loss": 0.07939934730529785,
"teacher_loss": 0.0005344336968846619,
"total_flos": 0.0,
"train_loss": 0.22623604363203048,
"train_runtime": 7596.2458,
"train_samples_per_second": 2.106,
"train_steps_per_second": 0.132
},
{
"epoch": 0.2897923875432526,
"grad_norm": 3.265625,
"kd_loss": 0.1083984375,
"learning_rate": 7.75705864825114e-06,
"loss": 0.1639,
"step": 1005,
"student_loss": 0.021362992003560066,
"teacher_loss": 0.07951661199331284
},
{
"epoch": 0.29123414071510956,
"grad_norm": 7.375,
"kd_loss": 0.09326171875,
"learning_rate": 7.734502946076656e-06,
"loss": 0.1608,
"step": 1010,
"student_loss": 0.0017111932393163443,
"teacher_loss": 0.004124164581298828
},
{
"epoch": 0.2926758938869666,
"grad_norm": 5.3125,
"kd_loss": 0.146484375,
"learning_rate": 7.711867567242769e-06,
"loss": 0.1511,
"step": 1015,
"student_loss": 0.0037448785733431578,
"teacher_loss": 0.01143695879727602
},
{
"epoch": 0.29411764705882354,
"grad_norm": 3.046875,
"kd_loss": 0.1767578125,
"learning_rate": 7.689153171288487e-06,
"loss": 0.1481,
"step": 1020,
"student_loss": 0.018935445696115494,
"teacher_loss": 0.03223176300525665
},
{
"epoch": 0.2955594002306805,
"grad_norm": 5.03125,
"kd_loss": 0.10107421875,
"learning_rate": 7.666360420055188e-06,
"loss": 0.1648,
"step": 1025,
"student_loss": 0.00270785391330719,
"teacher_loss": 0.0004231084603816271
},
{
"epoch": 0.29700115340253747,
"grad_norm": 4.71875,
"kd_loss": 0.0908203125,
"learning_rate": 7.643489977667327e-06,
"loss": 0.1659,
"step": 1030,
"student_loss": 0.02544678933918476,
"teacher_loss": 0.0005848580040037632
},
{
"epoch": 0.2984429065743945,
"grad_norm": 4.75,
"kd_loss": 0.0986328125,
"learning_rate": 7.6205425105130855e-06,
"loss": 0.1671,
"step": 1035,
"student_loss": 0.009377697482705116,
"teacher_loss": 0.002407669322565198
},
{
"epoch": 0.29988465974625145,
"grad_norm": 6.0,
"kd_loss": 0.1005859375,
"learning_rate": 7.597518687224959e-06,
"loss": 0.1854,
"step": 1040,
"student_loss": 0.09456347674131393,
"teacher_loss": 0.0008634831756353378
},
{
"epoch": 0.3013264129181084,
"grad_norm": 5.03125,
"kd_loss": 0.07958984375,
"learning_rate": 7.574419178660269e-06,
"loss": 0.1669,
"step": 1045,
"student_loss": 0.0017204463947564363,
"teacher_loss": 0.0008596886764280498
},
{
"epoch": 0.3027681660899654,
"grad_norm": 6.21875,
"kd_loss": 0.095703125,
"learning_rate": 7.551244657881618e-06,
"loss": 0.1942,
"step": 1050,
"student_loss": 0.16669750213623047,
"teacher_loss": 0.0011612839298322797
},
{
"epoch": 0.3042099192618224,
"grad_norm": 2.484375,
"kd_loss": 0.09375,
"learning_rate": 7.527995800137287e-06,
"loss": 0.1475,
"step": 1055,
"student_loss": 0.0016981420340016484,
"teacher_loss": 0.001001509721390903
},
{
"epoch": 0.30565167243367936,
"grad_norm": 5.21875,
"kd_loss": 0.09326171875,
"learning_rate": 7.504673282841544e-06,
"loss": 0.1647,
"step": 1060,
"student_loss": 0.09439224749803543,
"teacher_loss": 0.0003985276853200048
},
{
"epoch": 0.3070934256055363,
"grad_norm": 4.875,
"kd_loss": 0.1044921875,
"learning_rate": 7.481277785554918e-06,
"loss": 0.161,
"step": 1065,
"student_loss": 0.059324074536561966,
"teacher_loss": 0.0028861502651125193
},
{
"epoch": 0.30853517877739334,
"grad_norm": 6.78125,
"kd_loss": 0.11474609375,
"learning_rate": 7.457809989964393e-06,
"loss": 0.1812,
"step": 1070,
"student_loss": 0.0447225496172905,
"teacher_loss": 0.00039993959944695234
},
{
"epoch": 0.3099769319492503,
"grad_norm": 3.84375,
"kd_loss": 0.099609375,
"learning_rate": 7.434270579863549e-06,
"loss": 0.1539,
"step": 1075,
"student_loss": 0.0011834139004349709,
"teacher_loss": 0.0010074133751913905
},
{
"epoch": 0.31141868512110726,
"grad_norm": 4.21875,
"kd_loss": 0.1005859375,
"learning_rate": 7.4106602411326345e-06,
"loss": 0.1642,
"step": 1080,
"student_loss": 0.003048022510483861,
"teacher_loss": 0.00988290086388588
},
{
"epoch": 0.3128604382929642,
"grad_norm": 4.0625,
"kd_loss": 0.09765625,
"learning_rate": 7.386979661718585e-06,
"loss": 0.1702,
"step": 1085,
"student_loss": 0.003489202819764614,
"teacher_loss": 0.0008156410767696798
},
{
"epoch": 0.31430219146482125,
"grad_norm": 5.65625,
"kd_loss": 0.0888671875,
"learning_rate": 7.363229531614973e-06,
"loss": 0.1515,
"step": 1090,
"student_loss": 0.2183372676372528,
"teacher_loss": 0.004773187451064587
},
{
"epoch": 0.3157439446366782,
"grad_norm": 4.125,
"kd_loss": 0.1171875,
"learning_rate": 7.339410542841906e-06,
"loss": 0.1799,
"step": 1095,
"student_loss": 0.13511748611927032,
"teacher_loss": 0.00648617185652256
},
{
"epoch": 0.31718569780853517,
"grad_norm": 5.03125,
"kd_loss": 0.10498046875,
"learning_rate": 7.315523389425867e-06,
"loss": 0.1607,
"step": 1100,
"student_loss": 0.0012231277069076896,
"teacher_loss": 0.0004083520616404712
},
{
"epoch": 0.31862745098039214,
"grad_norm": 4.0625,
"kd_loss": 0.0859375,
"learning_rate": 7.291568767379484e-06,
"loss": 0.144,
"step": 1105,
"student_loss": 0.016774829477071762,
"teacher_loss": 0.0005346160614863038
},
{
"epoch": 0.32006920415224915,
"grad_norm": 3.984375,
"kd_loss": 0.095703125,
"learning_rate": 7.267547374681259e-06,
"loss": 0.1602,
"step": 1110,
"student_loss": 0.024096982553601265,
"teacher_loss": 0.0008525612065568566
},
{
"epoch": 0.3215109573241061,
"grad_norm": 4.59375,
"kd_loss": 0.10400390625,
"learning_rate": 7.24345991125522e-06,
"loss": 0.1532,
"step": 1115,
"student_loss": 0.0033125807531177998,
"teacher_loss": 0.0005502361455000937
},
{
"epoch": 0.3229527104959631,
"grad_norm": 5.59375,
"kd_loss": 0.107421875,
"learning_rate": 7.219307078950536e-06,
"loss": 0.1625,
"step": 1120,
"student_loss": 0.0204778965562582,
"teacher_loss": 0.004392318893224001
},
{
"epoch": 0.32439446366782004,
"grad_norm": 10.0625,
"kd_loss": 0.0986328125,
"learning_rate": 7.195089581521064e-06,
"loss": 0.1654,
"step": 1125,
"student_loss": 0.19389592111110687,
"teacher_loss": 0.0214696004986763
},
{
"epoch": 0.32583621683967706,
"grad_norm": 3.859375,
"kd_loss": 0.10986328125,
"learning_rate": 7.170808124604842e-06,
"loss": 0.1556,
"step": 1130,
"student_loss": 0.03847292810678482,
"teacher_loss": 0.0006047665374353528
},
{
"epoch": 0.327277970011534,
"grad_norm": 4.96875,
"kd_loss": 0.091796875,
"learning_rate": 7.14646341570353e-06,
"loss": 0.1696,
"step": 1135,
"student_loss": 0.06856270879507065,
"teacher_loss": 0.01667657122015953
},
{
"epoch": 0.328719723183391,
"grad_norm": 3.90625,
"kd_loss": 0.1142578125,
"learning_rate": 7.122056164161795e-06,
"loss": 0.1778,
"step": 1140,
"student_loss": 0.021477092057466507,
"teacher_loss": 0.012335257604718208
},
{
"epoch": 0.330161476355248,
"grad_norm": 11.3125,
"kd_loss": 0.080078125,
"learning_rate": 7.097587081146636e-06,
"loss": 0.1589,
"step": 1145,
"student_loss": 0.045279014855623245,
"teacher_loss": 0.0029319608584046364
},
{
"epoch": 0.33160322952710497,
"grad_norm": 4.125,
"kd_loss": 0.08203125,
"learning_rate": 7.073056879626681e-06,
"loss": 0.204,
"step": 1150,
"student_loss": 0.002648564986884594,
"teacher_loss": 0.0008686608052812517
},
{
"epoch": 0.33304498269896193,
"grad_norm": 2.640625,
"kd_loss": 0.09814453125,
"learning_rate": 7.048466274351389e-06,
"loss": 0.1497,
"step": 1155,
"student_loss": 0.058320529758930206,
"teacher_loss": 0.00035479728830978274
},
{
"epoch": 0.3344867358708189,
"grad_norm": 6.4375,
"kd_loss": 0.09716796875,
"learning_rate": 7.023815981830236e-06,
"loss": 0.1904,
"step": 1160,
"student_loss": 0.0025387869682163,
"teacher_loss": 0.028956690803170204
},
{
"epoch": 0.3359284890426759,
"grad_norm": 3.53125,
"kd_loss": 0.10302734375,
"learning_rate": 6.999106720311846e-06,
"loss": 0.1704,
"step": 1165,
"student_loss": 0.08381687104701996,
"teacher_loss": 0.000761769013479352
},
{
"epoch": 0.3373702422145329,
"grad_norm": 4.03125,
"kd_loss": 0.11669921875,
"learning_rate": 6.974339209763043e-06,
"loss": 0.1536,
"step": 1170,
"student_loss": 0.021977189928293228,
"teacher_loss": 0.02045821212232113
},
{
"epoch": 0.33881199538638984,
"grad_norm": 5.125,
"kd_loss": 0.09033203125,
"learning_rate": 6.949514171847891e-06,
"loss": 0.1685,
"step": 1175,
"student_loss": 0.004976021591573954,
"teacher_loss": 0.0024228477850556374
},
{
"epoch": 0.3402537485582468,
"grad_norm": 4.90625,
"kd_loss": 0.09423828125,
"learning_rate": 6.924632329906657e-06,
"loss": 0.1613,
"step": 1180,
"student_loss": 0.008308586664497852,
"teacher_loss": 0.0010117895435541868
},
{
"epoch": 0.3416955017301038,
"grad_norm": 2.96875,
"kd_loss": 0.09423828125,
"learning_rate": 6.899694408934734e-06,
"loss": 0.1462,
"step": 1185,
"student_loss": 0.0045226323418319225,
"teacher_loss": 0.000678456446621567
},
{
"epoch": 0.3431372549019608,
"grad_norm": 2.359375,
"kd_loss": 0.09912109375,
"learning_rate": 6.874701135561524e-06,
"loss": 0.1473,
"step": 1190,
"student_loss": 0.0010705965105444193,
"teacher_loss": 0.0005625460762530565
},
{
"epoch": 0.34457900807381775,
"grad_norm": 3.90625,
"kd_loss": 0.0869140625,
"learning_rate": 6.849653238029261e-06,
"loss": 0.144,
"step": 1195,
"student_loss": 0.03853433579206467,
"teacher_loss": 0.0004980422672815621
},
{
"epoch": 0.3460207612456747,
"grad_norm": 2.109375,
"kd_loss": 0.08984375,
"learning_rate": 6.824551446171788e-06,
"loss": 0.2125,
"step": 1200,
"student_loss": 0.0008995746029540896,
"teacher_loss": 0.0007796635036356747
},
{
"epoch": 0.34746251441753173,
"grad_norm": 4.03125,
"kd_loss": 0.0888671875,
"learning_rate": 6.7993964913932975e-06,
"loss": 0.1821,
"step": 1205,
"student_loss": 0.07766856998205185,
"teacher_loss": 0.0004231579077895731
},
{
"epoch": 0.3489042675893887,
"grad_norm": 3.3125,
"kd_loss": 0.091796875,
"learning_rate": 6.774189106647021e-06,
"loss": 0.1555,
"step": 1210,
"student_loss": 0.002076697302982211,
"teacher_loss": 0.0008232980617322028
},
{
"epoch": 0.35034602076124566,
"grad_norm": 5.65625,
"kd_loss": 0.095703125,
"learning_rate": 6.748930026413865e-06,
"loss": 0.1712,
"step": 1215,
"student_loss": 0.11520007997751236,
"teacher_loss": 0.0005120415589772165
},
{
"epoch": 0.3517877739331027,
"grad_norm": 4.125,
"kd_loss": 0.11279296875,
"learning_rate": 6.7236199866810185e-06,
"loss": 0.164,
"step": 1220,
"student_loss": 0.06622859835624695,
"teacher_loss": 0.010332350619137287
},
{
"epoch": 0.35322952710495964,
"grad_norm": 4.75,
"kd_loss": 0.10595703125,
"learning_rate": 6.698259724920503e-06,
"loss": 0.1654,
"step": 1225,
"student_loss": 0.03172338008880615,
"teacher_loss": 0.0024025817401707172
},
{
"epoch": 0.3546712802768166,
"grad_norm": 4.96875,
"kd_loss": 0.0986328125,
"learning_rate": 6.672849980067685e-06,
"loss": 0.1646,
"step": 1230,
"student_loss": 0.0014958116225898266,
"teacher_loss": 0.00136648362968117
},
{
"epoch": 0.35611303344867357,
"grad_norm": 7.4375,
"kd_loss": 0.10791015625,
"learning_rate": 6.647391492499746e-06,
"loss": 0.1467,
"step": 1235,
"student_loss": 0.001844382262788713,
"teacher_loss": 0.0012089475058019161
},
{
"epoch": 0.3575547866205306,
"grad_norm": 5.125,
"kd_loss": 0.09375,
"learning_rate": 6.621885004014113e-06,
"loss": 0.1856,
"step": 1240,
"student_loss": 0.0013189911842346191,
"teacher_loss": 0.0011863983236253262
},
{
"epoch": 0.35899653979238755,
"grad_norm": 3.6875,
"kd_loss": 0.10986328125,
"learning_rate": 6.596331257806837e-06,
"loss": 0.1588,
"step": 1245,
"student_loss": 0.0016421154141426086,
"teacher_loss": 0.001257838448509574
},
{
"epoch": 0.3604382929642445,
"grad_norm": 5.1875,
"kd_loss": 0.1083984375,
"learning_rate": 6.570730998450945e-06,
"loss": 0.1663,
"step": 1250,
"student_loss": 0.19827650487422943,
"teacher_loss": 0.002391376066952944
},
{
"epoch": 0.3618800461361015,
"grad_norm": 4.84375,
"kd_loss": 0.109375,
"learning_rate": 6.545084971874738e-06,
"loss": 0.1693,
"step": 1255,
"student_loss": 0.033441461622714996,
"teacher_loss": 0.007763924542814493
},
{
"epoch": 0.3633217993079585,
"grad_norm": 4.875,
"kd_loss": 0.10400390625,
"learning_rate": 6.519393925340067e-06,
"loss": 0.1687,
"step": 1260,
"student_loss": 0.0005883485428057611,
"teacher_loss": 0.0006072800024412572
},
{
"epoch": 0.36476355247981546,
"grad_norm": 5.0,
"kd_loss": 0.1123046875,
"learning_rate": 6.49365860742055e-06,
"loss": 0.1807,
"step": 1265,
"student_loss": 0.3620021343231201,
"teacher_loss": 0.0192008875310421
},
{
"epoch": 0.3662053056516724,
"grad_norm": 2.546875,
"kd_loss": 0.09033203125,
"learning_rate": 6.467879767979764e-06,
"loss": 0.1367,
"step": 1270,
"student_loss": 0.0019303744193166494,
"teacher_loss": 0.0008835737244226038
},
{
"epoch": 0.36764705882352944,
"grad_norm": 4.84375,
"kd_loss": 0.09765625,
"learning_rate": 6.442058158149396e-06,
"loss": 0.1364,
"step": 1275,
"student_loss": 0.017311925068497658,
"teacher_loss": 0.010822150856256485
},
{
"epoch": 0.3690888119953864,
"grad_norm": 2.390625,
"kd_loss": 0.08837890625,
"learning_rate": 6.4161945303073535e-06,
"loss": 0.1339,
"step": 1280,
"student_loss": 0.0016525188693776727,
"teacher_loss": 0.00045569639769382775
},
{
"epoch": 0.37053056516724336,
"grad_norm": 2.59375,
"kd_loss": 0.09716796875,
"learning_rate": 6.390289638055851e-06,
"loss": 0.1514,
"step": 1285,
"student_loss": 0.0017534851795062423,
"teacher_loss": 0.000792986829765141
},
{
"epoch": 0.3719723183391003,
"grad_norm": 5.15625,
"kd_loss": 0.08203125,
"learning_rate": 6.364344236199441e-06,
"loss": 0.1544,
"step": 1290,
"student_loss": 0.4242388606071472,
"teacher_loss": 0.015242666937410831
},
{
"epoch": 0.37341407151095735,
"grad_norm": 4.0,
"kd_loss": 0.08251953125,
"learning_rate": 6.3383590807230264e-06,
"loss": 0.1732,
"step": 1295,
"student_loss": 0.0017893314361572266,
"teacher_loss": 0.005244513973593712
},
{
"epoch": 0.3748558246828143,
"grad_norm": 4.875,
"kd_loss": 0.09765625,
"learning_rate": 6.3123349287698345e-06,
"loss": 0.1343,
"step": 1300,
"student_loss": 0.0016616806387901306,
"teacher_loss": 0.000766773009672761
},
{
"epoch": 0.3762975778546713,
"grad_norm": 4.34375,
"kd_loss": 0.0986328125,
"learning_rate": 6.286272538619351e-06,
"loss": 0.1656,
"step": 1305,
"student_loss": 0.001378720044158399,
"teacher_loss": 0.2784559726715088
},
{
"epoch": 0.37773933102652824,
"grad_norm": 4.78125,
"kd_loss": 0.09326171875,
"learning_rate": 6.260172669665233e-06,
"loss": 0.1376,
"step": 1310,
"student_loss": 0.0015898743877187371,
"teacher_loss": 0.001270298846065998
},
{
"epoch": 0.37918108419838525,
"grad_norm": 6.34375,
"kd_loss": 0.09716796875,
"learning_rate": 6.234036082393171e-06,
"loss": 0.1719,
"step": 1315,
"student_loss": 0.2977891266345978,
"teacher_loss": 0.0018072956008836627
},
{
"epoch": 0.3806228373702422,
"grad_norm": 4.5,
"kd_loss": 0.09814453125,
"learning_rate": 6.207863538358741e-06,
"loss": 0.166,
"step": 1320,
"student_loss": 0.002190067432820797,
"teacher_loss": 0.0004770367522723973
},
{
"epoch": 0.3820645905420992,
"grad_norm": 4.625,
"kd_loss": 0.10546875,
"learning_rate": 6.181655800165207e-06,
"loss": 0.1752,
"step": 1325,
"student_loss": 0.014989044517278671,
"teacher_loss": 0.0011476778890937567
},
{
"epoch": 0.38350634371395614,
"grad_norm": 3.609375,
"kd_loss": 0.08740234375,
"learning_rate": 6.155413631441307e-06,
"loss": 0.1513,
"step": 1330,
"student_loss": 0.04820695146918297,
"teacher_loss": 0.0003813351795542985
},
{
"epoch": 0.38494809688581316,
"grad_norm": 7.1875,
"kd_loss": 0.09228515625,
"learning_rate": 6.129137796818997e-06,
"loss": 0.149,
"step": 1335,
"student_loss": 0.0018507987260818481,
"teacher_loss": 0.0005417931824922562
},
{
"epoch": 0.3863898500576701,
"grad_norm": 5.1875,
"kd_loss": 0.1728515625,
"learning_rate": 6.102829061911176e-06,
"loss": 0.1629,
"step": 1340,
"student_loss": 0.0006290597375482321,
"teacher_loss": 0.007059386931359768
},
{
"epoch": 0.3878316032295271,
"grad_norm": 4.53125,
"kd_loss": 0.09375,
"learning_rate": 6.076488193289375e-06,
"loss": 0.154,
"step": 1345,
"student_loss": 0.001199022983200848,
"teacher_loss": 0.001256449380889535
},
{
"epoch": 0.3892733564013841,
"grad_norm": 3.796875,
"kd_loss": 0.09912109375,
"learning_rate": 6.050115958461423e-06,
"loss": 0.1423,
"step": 1350,
"student_loss": 0.028213880956172943,
"teacher_loss": 0.0014634531689807773
},
{
"epoch": 0.39071510957324107,
"grad_norm": 5.1875,
"kd_loss": 0.09814453125,
"learning_rate": 6.02371312584908e-06,
"loss": 0.1606,
"step": 1355,
"student_loss": 0.046754222363233566,
"teacher_loss": 0.0003306324942968786
},
{
"epoch": 0.39215686274509803,
"grad_norm": 6.75,
"kd_loss": 0.1025390625,
"learning_rate": 5.997280464765655e-06,
"loss": 0.1687,
"step": 1360,
"student_loss": 0.01089841965585947,
"teacher_loss": 0.0005644945777021348
},
{
"epoch": 0.393598615916955,
"grad_norm": 3.734375,
"kd_loss": 0.1025390625,
"learning_rate": 5.970818745393579e-06,
"loss": 0.1514,
"step": 1365,
"student_loss": 0.012727648951113224,
"teacher_loss": 0.04049056023359299
},
{
"epoch": 0.395040369088812,
"grad_norm": 4.59375,
"kd_loss": 0.095703125,
"learning_rate": 5.9443287387619754e-06,
"loss": 0.1645,
"step": 1370,
"student_loss": 0.3426652252674103,
"teacher_loss": 0.02165866084396839
},
{
"epoch": 0.396482122260669,
"grad_norm": 6.65625,
"kd_loss": 0.0966796875,
"learning_rate": 5.9178112167241805e-06,
"loss": 0.1544,
"step": 1375,
"student_loss": 0.09628524631261826,
"teacher_loss": 0.0004050543357152492
},
{
"epoch": 0.39792387543252594,
"grad_norm": 2.828125,
"kd_loss": 0.099609375,
"learning_rate": 5.8912669519352725e-06,
"loss": 0.1484,
"step": 1380,
"student_loss": 0.024134894832968712,
"teacher_loss": 0.0030764644034206867
},
{
"epoch": 0.3993656286043829,
"grad_norm": 4.5625,
"kd_loss": 0.0849609375,
"learning_rate": 5.864696717829539e-06,
"loss": 0.1566,
"step": 1385,
"student_loss": 0.06617551296949387,
"teacher_loss": 0.011502874083817005
},
{
"epoch": 0.4008073817762399,
"grad_norm": 4.6875,
"kd_loss": 0.09619140625,
"learning_rate": 5.838101288597951e-06,
"loss": 0.1487,
"step": 1390,
"student_loss": 0.0014513310743495822,
"teacher_loss": 0.0005679084570147097
},
{
"epoch": 0.4022491349480969,
"grad_norm": 3.421875,
"kd_loss": 0.08984375,
"learning_rate": 5.8114814391656046e-06,
"loss": 0.1609,
"step": 1395,
"student_loss": 0.001117706298828125,
"teacher_loss": 0.00049404869787395
},
{
"epoch": 0.40369088811995385,
"grad_norm": 3.375,
"kd_loss": 0.091796875,
"learning_rate": 5.78483794516914e-06,
"loss": 0.1509,
"step": 1400,
"student_loss": 0.005229848902672529,
"teacher_loss": 0.00046830569044686854
},
{
"epoch": 0.40513264129181087,
"grad_norm": 3.28125,
"kd_loss": 0.10107421875,
"learning_rate": 5.75817158293414e-06,
"loss": 0.1549,
"step": 1405,
"student_loss": 0.000877993879839778,
"teacher_loss": 0.0005521044950000942
},
{
"epoch": 0.40657439446366783,
"grad_norm": 4.25,
"kd_loss": 0.0927734375,
"learning_rate": 5.731483129452514e-06,
"loss": 0.1684,
"step": 1410,
"student_loss": 0.008468794636428356,
"teacher_loss": 0.0004955396871082485
},
{
"epoch": 0.4080161476355248,
"grad_norm": 4.78125,
"kd_loss": 0.10009765625,
"learning_rate": 5.704773362359854e-06,
"loss": 0.1529,
"step": 1415,
"student_loss": 0.023767048493027687,
"teacher_loss": 0.014984571374952793
},
{
"epoch": 0.40945790080738176,
"grad_norm": 4.9375,
"kd_loss": 0.103515625,
"learning_rate": 5.678043059912776e-06,
"loss": 0.1818,
"step": 1420,
"student_loss": 0.0011405398836359382,
"teacher_loss": 0.0018354527419432998
},
{
"epoch": 0.4108996539792388,
"grad_norm": 3.859375,
"kd_loss": 0.0849609375,
"learning_rate": 5.6512930009662524e-06,
"loss": 0.1643,
"step": 1425,
"student_loss": 0.20638686418533325,
"teacher_loss": 0.0004497423942666501
},
{
"epoch": 0.41234140715109574,
"grad_norm": 5.15625,
"kd_loss": 0.111328125,
"learning_rate": 5.624523964950903e-06,
"loss": 0.1493,
"step": 1430,
"student_loss": 0.003926432225853205,
"teacher_loss": 0.0005563534796237946
},
{
"epoch": 0.4137831603229527,
"grad_norm": 2.734375,
"kd_loss": 0.08935546875,
"learning_rate": 5.597736731850295e-06,
"loss": 0.164,
"step": 1435,
"student_loss": 0.012115873396396637,
"teacher_loss": 0.00135420064907521
},
{
"epoch": 0.41522491349480967,
"grad_norm": 8.375,
"kd_loss": 0.095703125,
"learning_rate": 5.570932082178219e-06,
"loss": 0.1733,
"step": 1440,
"student_loss": 0.05656226724386215,
"teacher_loss": 0.0003561509947758168
},
{
"epoch": 0.4166666666666667,
"grad_norm": 5.25,
"kd_loss": 0.1005859375,
"learning_rate": 5.5441107969559315e-06,
"loss": 0.1578,
"step": 1445,
"student_loss": 0.2771185636520386,
"teacher_loss": 0.002755317836999893
},
{
"epoch": 0.41810841983852365,
"grad_norm": 4.1875,
"kd_loss": 0.138671875,
"learning_rate": 5.517273657689419e-06,
"loss": 0.1413,
"step": 1450,
"student_loss": 0.004985239822417498,
"teacher_loss": 0.002304993337020278
},
{
"epoch": 0.4195501730103806,
"grad_norm": 5.625,
"kd_loss": 0.10595703125,
"learning_rate": 5.490421446346608e-06,
"loss": 0.1495,
"step": 1455,
"student_loss": 0.002044258639216423,
"teacher_loss": 0.0006097870063968003
},
{
"epoch": 0.4209919261822376,
"grad_norm": 3.890625,
"kd_loss": 0.1123046875,
"learning_rate": 5.463554945334589e-06,
"loss": 0.1499,
"step": 1460,
"student_loss": 0.001577138900756836,
"teacher_loss": 0.00036811313475482166
},
{
"epoch": 0.4224336793540946,
"grad_norm": 4.46875,
"kd_loss": 0.126953125,
"learning_rate": 5.43667493747682e-06,
"loss": 0.1629,
"step": 1465,
"student_loss": 0.0005169888027012348,
"teacher_loss": 0.015567619353532791
},
{
"epoch": 0.42387543252595156,
"grad_norm": 5.75,
"kd_loss": 0.0859375,
"learning_rate": 5.409782205990317e-06,
"loss": 0.1757,
"step": 1470,
"student_loss": 0.0012229635613039136,
"teacher_loss": 0.0044844611547887325
},
{
"epoch": 0.4253171856978085,
"grad_norm": 6.8125,
"kd_loss": 0.0888671875,
"learning_rate": 5.3828775344628245e-06,
"loss": 0.1525,
"step": 1475,
"student_loss": 0.0009603102807886899,
"teacher_loss": 0.0009318754309788346
},
{
"epoch": 0.42675893886966554,
"grad_norm": 3.734375,
"kd_loss": 0.0908203125,
"learning_rate": 5.355961706829997e-06,
"loss": 0.1638,
"step": 1480,
"student_loss": 0.0015584760112687945,
"teacher_loss": 0.0012691307347267866
},
{
"epoch": 0.4282006920415225,
"grad_norm": 9.375,
"kd_loss": 0.10009765625,
"learning_rate": 5.329035507352548e-06,
"loss": 0.1616,
"step": 1485,
"student_loss": 0.0008603151072748005,
"teacher_loss": 0.0008554637315683067
},
{
"epoch": 0.42964244521337946,
"grad_norm": 8.375,
"kd_loss": 0.09814453125,
"learning_rate": 5.3020997205933985e-06,
"loss": 0.1486,
"step": 1490,
"student_loss": 0.10303473472595215,
"teacher_loss": 0.005800185259431601
},
{
"epoch": 0.43108419838523643,
"grad_norm": 3.4375,
"kd_loss": 0.09033203125,
"learning_rate": 5.275155131394825e-06,
"loss": 0.1504,
"step": 1495,
"student_loss": 0.002375382697209716,
"teacher_loss": 0.001016065594740212
},
{
"epoch": 0.43252595155709345,
"grad_norm": 4.65625,
"kd_loss": 0.0927734375,
"learning_rate": 5.248202524855578e-06,
"loss": 0.1497,
"step": 1500,
"student_loss": 0.046541083604097366,
"teacher_loss": 0.0038300170563161373
},
{
"epoch": 0.4339677047289504,
"grad_norm": 2.84375,
"kd_loss": 0.08642578125,
"learning_rate": 5.221242686308019e-06,
"loss": 0.1424,
"step": 1505,
"student_loss": 0.04092458263039589,
"teacher_loss": 0.0005184438778087497
},
{
"epoch": 0.4354094579008074,
"grad_norm": 5.9375,
"kd_loss": 0.14453125,
"learning_rate": 5.194276401295231e-06,
"loss": 0.1581,
"step": 1510,
"student_loss": 0.11372507363557816,
"teacher_loss": 0.012486970983445644
},
{
"epoch": 0.43685121107266434,
"grad_norm": 5.6875,
"kd_loss": 0.103515625,
"learning_rate": 5.167304455548128e-06,
"loss": 0.1542,
"step": 1515,
"student_loss": 0.0004513502062764019,
"teacher_loss": 0.0004412997222971171
},
{
"epoch": 0.43829296424452135,
"grad_norm": 4.46875,
"kd_loss": 0.09326171875,
"learning_rate": 5.14032763496257e-06,
"loss": 0.1503,
"step": 1520,
"student_loss": 0.016323139891028404,
"teacher_loss": 0.0005559992277994752
},
{
"epoch": 0.4397347174163783,
"grad_norm": 2.875,
"kd_loss": 0.1171875,
"learning_rate": 5.11334672557645e-06,
"loss": 0.1516,
"step": 1525,
"student_loss": 0.009535513818264008,
"teacher_loss": 0.001467025140300393
},
{
"epoch": 0.4411764705882353,
"grad_norm": 3.203125,
"kd_loss": 0.146484375,
"learning_rate": 5.086362513546807e-06,
"loss": 0.1389,
"step": 1530,
"student_loss": 0.1711445301771164,
"teacher_loss": 0.008548562414944172
},
{
"epoch": 0.4426182237600923,
"grad_norm": 3.96875,
"kd_loss": 0.09033203125,
"learning_rate": 5.059375785126907e-06,
"loss": 0.1367,
"step": 1535,
"student_loss": 0.09691781550645828,
"teacher_loss": 0.011890435591340065
},
{
"epoch": 0.44405997693194926,
"grad_norm": 3.15625,
"kd_loss": 0.1025390625,
"learning_rate": 5.032387326643331e-06,
"loss": 0.15,
"step": 1540,
"student_loss": 0.11253131926059723,
"teacher_loss": 0.00038647381006740034
},
{
"epoch": 0.4455017301038062,
"grad_norm": 4.28125,
"kd_loss": 0.0986328125,
"learning_rate": 5.005397924473082e-06,
"loss": 0.1453,
"step": 1545,
"student_loss": 0.0029098070226609707,
"teacher_loss": 0.007037348113954067
},
{
"epoch": 0.4469434832756632,
"grad_norm": 4.5625,
"kd_loss": 0.08935546875,
"learning_rate": 4.978408365020651e-06,
"loss": 0.1724,
"step": 1550,
"student_loss": 0.09735474735498428,
"teacher_loss": 0.0007218251703307033
},
{
"epoch": 0.4483852364475202,
"grad_norm": 5.40625,
"kd_loss": 0.10888671875,
"learning_rate": 4.951419434695115e-06,
"loss": 0.1426,
"step": 1555,
"student_loss": 0.024885384365916252,
"teacher_loss": 0.0010013995924964547
},
{
"epoch": 0.44982698961937717,
"grad_norm": 4.34375,
"kd_loss": 0.0927734375,
"learning_rate": 4.924431919887216e-06,
"loss": 0.1592,
"step": 1560,
"student_loss": 0.0011940286494791508,
"teacher_loss": 0.0006662014056928456
},
{
"epoch": 0.45126874279123413,
"grad_norm": 4.78125,
"kd_loss": 0.095703125,
"learning_rate": 4.897446606946459e-06,
"loss": 0.1363,
"step": 1565,
"student_loss": 0.008483109064400196,
"teacher_loss": 0.0011815401958301663
},
{
"epoch": 0.4527104959630911,
"grad_norm": 6.09375,
"kd_loss": 0.1337890625,
"learning_rate": 4.870464282158184e-06,
"loss": 0.1435,
"step": 1570,
"student_loss": 0.0016101751243695617,
"teacher_loss": 0.03063173033297062
},
{
"epoch": 0.4541522491349481,
"grad_norm": 4.75,
"kd_loss": 0.1337890625,
"learning_rate": 4.84348573172067e-06,
"loss": 0.1472,
"step": 1575,
"student_loss": 0.000616877747233957,
"teacher_loss": 0.02498156577348709
},
{
"epoch": 0.4555940023068051,
"grad_norm": 6.65625,
"kd_loss": 0.0791015625,
"learning_rate": 4.816511741722215e-06,
"loss": 0.1727,
"step": 1580,
"student_loss": 0.09299268573522568,
"teacher_loss": 0.000702059711329639
},
{
"epoch": 0.45703575547866204,
"grad_norm": 5.3125,
"kd_loss": 0.09423828125,
"learning_rate": 4.7895430981182415e-06,
"loss": 0.1725,
"step": 1585,
"student_loss": 0.001776401768438518,
"teacher_loss": 0.0011725560761988163
},
{
"epoch": 0.458477508650519,
"grad_norm": 5.375,
"kd_loss": 0.0966796875,
"learning_rate": 4.762580586708389e-06,
"loss": 0.1547,
"step": 1590,
"student_loss": 0.0018952718237414956,
"teacher_loss": 0.0010945210233330727
},
{
"epoch": 0.459919261822376,
"grad_norm": 4.4375,
"kd_loss": 0.09619140625,
"learning_rate": 4.73562499311362e-06,
"loss": 0.1534,
"step": 1595,
"student_loss": 0.0008283228962682188,
"teacher_loss": 0.0009313338669016957
},
{
"epoch": 0.461361014994233,
"grad_norm": 3.984375,
"kd_loss": 0.08984375,
"learning_rate": 4.708677102753331e-06,
"loss": 0.1371,
"step": 1600,
"student_loss": 0.04035179316997528,
"teacher_loss": 0.002971302019432187
},
{
"epoch": 0.46280276816608995,
"grad_norm": 5.4375,
"kd_loss": 0.10888671875,
"learning_rate": 4.681737700822464e-06,
"loss": 0.1731,
"step": 1605,
"student_loss": 0.0004709873755928129,
"teacher_loss": 0.025587571784853935
},
{
"epoch": 0.46424452133794697,
"grad_norm": 3.015625,
"kd_loss": 0.142578125,
"learning_rate": 4.654807572268628e-06,
"loss": 0.1602,
"step": 1610,
"student_loss": 0.002385765314102173,
"teacher_loss": 0.0010947687551379204
},
{
"epoch": 0.46568627450980393,
"grad_norm": 4.65625,
"kd_loss": 0.10009765625,
"learning_rate": 4.627887501769231e-06,
"loss": 0.1628,
"step": 1615,
"student_loss": 0.004139338154345751,
"teacher_loss": 0.009530629962682724
},
{
"epoch": 0.4671280276816609,
"grad_norm": 3.9375,
"kd_loss": 0.119140625,
"learning_rate": 4.600978273708612e-06,
"loss": 0.153,
"step": 1620,
"student_loss": 0.003188611473888159,
"teacher_loss": 0.001373080536723137
},
{
"epoch": 0.46856978085351786,
"grad_norm": 4.28125,
"kd_loss": 0.1025390625,
"learning_rate": 4.574080672155189e-06,
"loss": 0.1591,
"step": 1625,
"student_loss": 0.0014868304133415222,
"teacher_loss": 0.000639898469671607
},
{
"epoch": 0.4700115340253749,
"grad_norm": 3.21875,
"kd_loss": 0.0810546875,
"learning_rate": 4.547195480838612e-06,
"loss": 0.1515,
"step": 1630,
"student_loss": 0.040516145527362823,
"teacher_loss": 0.0004312426899559796
},
{
"epoch": 0.47145328719723184,
"grad_norm": 4.84375,
"kd_loss": 0.087890625,
"learning_rate": 4.520323483126928e-06,
"loss": 0.1862,
"step": 1635,
"student_loss": 0.12052398920059204,
"teacher_loss": 0.0003846465260721743
},
{
"epoch": 0.4728950403690888,
"grad_norm": 4.65625,
"kd_loss": 0.0908203125,
"learning_rate": 4.493465462003756e-06,
"loss": 0.1453,
"step": 1640,
"student_loss": 0.0010070661082863808,
"teacher_loss": 0.0012260322691872716
},
{
"epoch": 0.47433679354094577,
"grad_norm": 2.875,
"kd_loss": 0.09375,
"learning_rate": 4.4666222000454685e-06,
"loss": 0.1545,
"step": 1645,
"student_loss": 0.0013779783621430397,
"teacher_loss": 0.00042751312139444053
},
{
"epoch": 0.4757785467128028,
"grad_norm": 7.15625,
"kd_loss": 0.0869140625,
"learning_rate": 4.4397944793983946e-06,
"loss": 0.1599,
"step": 1650,
"student_loss": 0.0005161279696039855,
"teacher_loss": 0.0007879316690377891
},
{
"epoch": 0.47722029988465975,
"grad_norm": 3.421875,
"kd_loss": 0.08203125,
"learning_rate": 4.4129830817560284e-06,
"loss": 0.1627,
"step": 1655,
"student_loss": 0.003220248268917203,
"teacher_loss": 0.0008016406209208071
},
{
"epoch": 0.4786620530565167,
"grad_norm": 4.125,
"kd_loss": 0.1015625,
"learning_rate": 4.386188788336251e-06,
"loss": 0.1404,
"step": 1660,
"student_loss": 0.08471440523862839,
"teacher_loss": 0.0003611688152886927
},
{
"epoch": 0.4801038062283737,
"grad_norm": 4.0,
"kd_loss": 0.09716796875,
"learning_rate": 4.359412379858569e-06,
"loss": 0.1428,
"step": 1665,
"student_loss": 0.0006392439245246351,
"teacher_loss": 0.00034953776048496366
},
{
"epoch": 0.4815455594002307,
"grad_norm": 6.0,
"kd_loss": 0.1484375,
"learning_rate": 4.332654636521365e-06,
"loss": 0.1493,
"step": 1670,
"student_loss": 0.13810043036937714,
"teacher_loss": 0.005655170418322086
},
{
"epoch": 0.48298731257208766,
"grad_norm": 2.9375,
"kd_loss": 0.1015625,
"learning_rate": 4.3059163379791676e-06,
"loss": 0.1588,
"step": 1675,
"student_loss": 0.0011410564184188843,
"teacher_loss": 0.0009165616356767714
},
{
"epoch": 0.4844290657439446,
"grad_norm": 3.96875,
"kd_loss": 0.103515625,
"learning_rate": 4.279198263319932e-06,
"loss": 0.1983,
"step": 1680,
"student_loss": 0.042820997536182404,
"teacher_loss": 0.0006943390471860766
},
{
"epoch": 0.48587081891580164,
"grad_norm": 4.65625,
"kd_loss": 0.0927734375,
"learning_rate": 4.252501191042334e-06,
"loss": 0.1458,
"step": 1685,
"student_loss": 0.001107779797166586,
"teacher_loss": 0.000587086018640548
},
{
"epoch": 0.4873125720876586,
"grad_norm": 3.84375,
"kd_loss": 0.08154296875,
"learning_rate": 4.2258258990331015e-06,
"loss": 0.1505,
"step": 1690,
"student_loss": 0.0010096587939187884,
"teacher_loss": 0.0006587179377675056
},
{
"epoch": 0.48875432525951557,
"grad_norm": 3.296875,
"kd_loss": 0.09423828125,
"learning_rate": 4.199173164544331e-06,
"loss": 0.1301,
"step": 1695,
"student_loss": 0.0007151216268539429,
"teacher_loss": 0.0005271242698654532
},
{
"epoch": 0.49019607843137253,
"grad_norm": 2.75,
"kd_loss": 0.111328125,
"learning_rate": 4.1725437641708535e-06,
"loss": 0.1292,
"step": 1700,
"student_loss": 0.0004782706964761019,
"teacher_loss": 0.00025354631361551583
},
{
"epoch": 0.49163783160322955,
"grad_norm": 4.84375,
"kd_loss": 0.0908203125,
"learning_rate": 4.145938473827598e-06,
"loss": 0.1694,
"step": 1705,
"student_loss": 0.03137379139661789,
"teacher_loss": 0.0014872003812342882
},
{
"epoch": 0.4930795847750865,
"grad_norm": 3.890625,
"kd_loss": 0.0859375,
"learning_rate": 4.1193580687269896e-06,
"loss": 0.1799,
"step": 1710,
"student_loss": 0.13360068202018738,
"teacher_loss": 0.0003576852031983435
},
{
"epoch": 0.4945213379469435,
"grad_norm": 3.6875,
"kd_loss": 0.0849609375,
"learning_rate": 4.092803323356357e-06,
"loss": 0.1568,
"step": 1715,
"student_loss": 0.0007690335623919964,
"teacher_loss": 0.0004268670454621315
},
{
"epoch": 0.49596309111880044,
"grad_norm": 4.53125,
"kd_loss": 0.09423828125,
"learning_rate": 4.066275011455369e-06,
"loss": 0.1345,
"step": 1720,
"student_loss": 0.0010869682300835848,
"teacher_loss": 0.0008715330623090267
},
{
"epoch": 0.49740484429065746,
"grad_norm": 3.78125,
"kd_loss": 0.10302734375,
"learning_rate": 4.039773905993486e-06,
"loss": 0.1661,
"step": 1725,
"student_loss": 0.001568131148815155,
"teacher_loss": 0.0013625255087390542
},
{
"epoch": 0.4988465974625144,
"grad_norm": 3.234375,
"kd_loss": 0.103515625,
"learning_rate": 4.013300779147445e-06,
"loss": 0.1311,
"step": 1730,
"student_loss": 0.003118544816970825,
"teacher_loss": 0.027190769091248512
},
{
"epoch": 0.5002883506343714,
"grad_norm": 3.984375,
"kd_loss": 0.1337890625,
"learning_rate": 3.98685640227875e-06,
"loss": 0.1465,
"step": 1735,
"student_loss": 0.004482457414269447,
"teacher_loss": 0.06759393215179443
},
{
"epoch": 0.5017301038062284,
"grad_norm": 4.84375,
"kd_loss": 0.1123046875,
"learning_rate": 3.960441545911205e-06,
"loss": 0.1692,
"step": 1740,
"student_loss": 0.0023235215339809656,
"teacher_loss": 0.027365142479538918
},
{
"epoch": 0.5031718569780853,
"grad_norm": 2.78125,
"kd_loss": 0.11474609375,
"learning_rate": 3.934056979708456e-06,
"loss": 0.1393,
"step": 1745,
"student_loss": 0.0015286001143977046,
"teacher_loss": 0.02260914258658886
},
{
"epoch": 0.5046136101499423,
"grad_norm": 6.3125,
"kd_loss": 0.10009765625,
"learning_rate": 3.907703472451574e-06,
"loss": 0.1627,
"step": 1750,
"student_loss": 0.009829165413975716,
"teacher_loss": 0.0006399175035767257
},
{
"epoch": 0.5060553633217993,
"grad_norm": 7.96875,
"kd_loss": 0.099609375,
"learning_rate": 3.881381792016645e-06,
"loss": 0.1749,
"step": 1755,
"student_loss": 0.0006391415954567492,
"teacher_loss": 0.0003294479101896286
},
{
"epoch": 0.5074971164936563,
"grad_norm": 3.53125,
"kd_loss": 0.1220703125,
"learning_rate": 3.8550927053523994e-06,
"loss": 0.1389,
"step": 1760,
"student_loss": 0.0007123491377569735,
"teacher_loss": 0.046919528394937515
},
{
"epoch": 0.5089388696655133,
"grad_norm": 5.09375,
"kd_loss": 0.083984375,
"learning_rate": 3.828836978457868e-06,
"loss": 0.1522,
"step": 1765,
"student_loss": 0.0016679943073540926,
"teacher_loss": 0.000926964043173939
},
{
"epoch": 0.5103806228373703,
"grad_norm": 3.671875,
"kd_loss": 0.1357421875,
"learning_rate": 3.8026153763600603e-06,
"loss": 0.1477,
"step": 1770,
"student_loss": 0.022712958976626396,
"teacher_loss": 0.029349761083722115
},
{
"epoch": 0.5118223760092272,
"grad_norm": 3.625,
"kd_loss": 0.09228515625,
"learning_rate": 3.7764286630916704e-06,
"loss": 0.1425,
"step": 1775,
"student_loss": 0.0012370613403618336,
"teacher_loss": 0.0007929064449854195
},
{
"epoch": 0.5132641291810842,
"grad_norm": 3.234375,
"kd_loss": 0.0947265625,
"learning_rate": 3.7502776016688234e-06,
"loss": 0.1589,
"step": 1780,
"student_loss": 0.008692040108144283,
"teacher_loss": 0.0190599225461483
},
{
"epoch": 0.5147058823529411,
"grad_norm": 4.46875,
"kd_loss": 0.1181640625,
"learning_rate": 3.724162954068835e-06,
"loss": 0.1568,
"step": 1785,
"student_loss": 0.1760350614786148,
"teacher_loss": 0.028955036774277687
},
{
"epoch": 0.5161476355247981,
"grad_norm": 3.28125,
"kd_loss": 0.083984375,
"learning_rate": 3.6980854812080097e-06,
"loss": 0.1497,
"step": 1790,
"student_loss": 0.0018669115379452705,
"teacher_loss": 0.0008881228277459741
},
{
"epoch": 0.5175893886966552,
"grad_norm": 4.6875,
"kd_loss": 0.08984375,
"learning_rate": 3.6720459429194743e-06,
"loss": 0.1635,
"step": 1795,
"student_loss": 0.1518515795469284,
"teacher_loss": 0.00034799822606146336
},
{
"epoch": 0.5190311418685121,
"grad_norm": 2.828125,
"kd_loss": 0.0869140625,
"learning_rate": 3.646045097931037e-06,
"loss": 0.1584,
"step": 1800,
"student_loss": 0.0007797479629516602,
"teacher_loss": 0.0005545561434701085
},
{
"epoch": 0.5204728950403691,
"grad_norm": 5.5,
"kd_loss": 0.08056640625,
"learning_rate": 3.620083703843077e-06,
"loss": 0.1433,
"step": 1805,
"student_loss": 0.03987161070108414,
"teacher_loss": 0.00046788767213001847
},
{
"epoch": 0.5219146482122261,
"grad_norm": 4.0,
"kd_loss": 0.095703125,
"learning_rate": 3.594162517106472e-06,
"loss": 0.1646,
"step": 1810,
"student_loss": 0.0409666933119297,
"teacher_loss": 0.0038134430069476366
},
{
"epoch": 0.523356401384083,
"grad_norm": 3.71875,
"kd_loss": 0.10693359375,
"learning_rate": 3.5682822930005567e-06,
"loss": 0.1424,
"step": 1815,
"student_loss": 0.30990689992904663,
"teacher_loss": 0.019313883036375046
},
{
"epoch": 0.52479815455594,
"grad_norm": 6.0625,
"kd_loss": 0.1015625,
"learning_rate": 3.542443785611117e-06,
"loss": 0.1455,
"step": 1820,
"student_loss": 0.05496774613857269,
"teacher_loss": 0.003318265313282609
},
{
"epoch": 0.526239907727797,
"grad_norm": 7.9375,
"kd_loss": 0.103515625,
"learning_rate": 3.516647747808417e-06,
"loss": 0.1445,
"step": 1825,
"student_loss": 0.0014039704110473394,
"teacher_loss": 0.004902483429759741
},
{
"epoch": 0.527681660899654,
"grad_norm": 3.84375,
"kd_loss": 0.08642578125,
"learning_rate": 3.4908949312252593e-06,
"loss": 0.1453,
"step": 1830,
"student_loss": 0.001090447069145739,
"teacher_loss": 0.0005476956139318645
},
{
"epoch": 0.529123414071511,
"grad_norm": 3.828125,
"kd_loss": 0.10595703125,
"learning_rate": 3.4651860862350893e-06,
"loss": 0.1355,
"step": 1835,
"student_loss": 0.00827399455010891,
"teacher_loss": 0.0005034086061641574
},
{
"epoch": 0.5305651672433679,
"grad_norm": 3.375,
"kd_loss": 0.083984375,
"learning_rate": 3.4395219619301288e-06,
"loss": 0.1429,
"step": 1840,
"student_loss": 0.02982058748602867,
"teacher_loss": 0.003948381636291742
},
{
"epoch": 0.5320069204152249,
"grad_norm": 4.0625,
"kd_loss": 0.10791015625,
"learning_rate": 3.4139033060995484e-06,
"loss": 0.1606,
"step": 1845,
"student_loss": 0.0009567984379827976,
"teacher_loss": 0.0006957401055842638
},
{
"epoch": 0.5334486735870819,
"grad_norm": 3.484375,
"kd_loss": 0.0859375,
"learning_rate": 3.388330865207681e-06,
"loss": 0.1516,
"step": 1850,
"student_loss": 0.09060114622116089,
"teacher_loss": 0.011022915132343769
},
{
"epoch": 0.5348904267589388,
"grad_norm": 3.8125,
"kd_loss": 0.09326171875,
"learning_rate": 3.3628053843722674e-06,
"loss": 0.1586,
"step": 1855,
"student_loss": 0.0023815552704036236,
"teacher_loss": 0.0009834859520196915
},
{
"epoch": 0.5363321799307958,
"grad_norm": 2.890625,
"kd_loss": 0.08642578125,
"learning_rate": 3.337327607342753e-06,
"loss": 0.1443,
"step": 1860,
"student_loss": 0.0012727677822113037,
"teacher_loss": 0.0003461229207459837
},
{
"epoch": 0.5377739331026529,
"grad_norm": 3.796875,
"kd_loss": 0.09765625,
"learning_rate": 3.3118982764786055e-06,
"loss": 0.1753,
"step": 1865,
"student_loss": 0.12450817972421646,
"teacher_loss": 0.00035991144250147045
},
{
"epoch": 0.5392156862745098,
"grad_norm": 6.3125,
"kd_loss": 0.099609375,
"learning_rate": 3.2865181327277007e-06,
"loss": 0.1487,
"step": 1870,
"student_loss": 0.14028604328632355,
"teacher_loss": 0.0013080085627734661
},
{
"epoch": 0.5406574394463668,
"grad_norm": 3.25,
"kd_loss": 0.08984375,
"learning_rate": 3.2611879156047147e-06,
"loss": 0.1471,
"step": 1875,
"student_loss": 0.0018398945685476065,
"teacher_loss": 0.0011750732082873583
},
{
"epoch": 0.5420991926182238,
"grad_norm": 4.53125,
"kd_loss": 0.11376953125,
"learning_rate": 3.2359083631695897e-06,
"loss": 0.1327,
"step": 1880,
"student_loss": 0.03754269704222679,
"teacher_loss": 0.0011901544639840722
},
{
"epoch": 0.5435409457900807,
"grad_norm": 4.625,
"kd_loss": 0.10400390625,
"learning_rate": 3.2106802120060197e-06,
"loss": 0.1568,
"step": 1885,
"student_loss": 0.003520218888297677,
"teacher_loss": 0.0013501073699444532
},
{
"epoch": 0.5449826989619377,
"grad_norm": 4.1875,
"kd_loss": 0.08203125,
"learning_rate": 3.185504197199999e-06,
"loss": 0.1376,
"step": 1890,
"student_loss": 0.008974825032055378,
"teacher_loss": 0.00042682504863478243
},
{
"epoch": 0.5464244521337946,
"grad_norm": 5.1875,
"kd_loss": 0.0966796875,
"learning_rate": 3.160381052318393e-06,
"loss": 0.1649,
"step": 1895,
"student_loss": 0.0789928063750267,
"teacher_loss": 0.0007635668735019863
},
{
"epoch": 0.5478662053056517,
"grad_norm": 2.859375,
"kd_loss": 0.0791015625,
"learning_rate": 3.1353115093875676e-06,
"loss": 0.1554,
"step": 1900,
"student_loss": 0.002470338949933648,
"teacher_loss": 0.0004501968214754015
},
{
"epoch": 0.5493079584775087,
"grad_norm": 4.46875,
"kd_loss": 0.09375,
"learning_rate": 3.1102962988720615e-06,
"loss": 0.1432,
"step": 1905,
"student_loss": 0.04193798825144768,
"teacher_loss": 0.0009529749513603747
},
{
"epoch": 0.5507497116493656,
"grad_norm": 5.25,
"kd_loss": 0.09716796875,
"learning_rate": 3.085336149653303e-06,
"loss": 0.1487,
"step": 1910,
"student_loss": 0.000636325916275382,
"teacher_loss": 0.0005562056903727353
},
{
"epoch": 0.5521914648212226,
"grad_norm": 3.359375,
"kd_loss": 0.10107421875,
"learning_rate": 3.060431789008368e-06,
"loss": 0.1681,
"step": 1915,
"student_loss": 0.0016948822885751724,
"teacher_loss": 0.0023001739755272865
},
{
"epoch": 0.5536332179930796,
"grad_norm": 3.578125,
"kd_loss": 0.08447265625,
"learning_rate": 3.035583942588791e-06,
"loss": 0.1655,
"step": 1920,
"student_loss": 0.055358272045850754,
"teacher_loss": 0.00030110430088825524
},
{
"epoch": 0.5550749711649365,
"grad_norm": 4.125,
"kd_loss": 0.09814453125,
"learning_rate": 3.0107933343994233e-06,
"loss": 0.1582,
"step": 1925,
"student_loss": 0.008619318716228008,
"teacher_loss": 0.0032901125960052013
},
{
"epoch": 0.5565167243367936,
"grad_norm": 6.75,
"kd_loss": 0.08349609375,
"learning_rate": 2.9860606867773323e-06,
"loss": 0.1394,
"step": 1930,
"student_loss": 0.03425801545381546,
"teacher_loss": 0.00030506699113175273
},
{
"epoch": 0.5579584775086506,
"grad_norm": 4.3125,
"kd_loss": 0.0908203125,
"learning_rate": 2.9613867203707627e-06,
"loss": 0.1535,
"step": 1935,
"student_loss": 0.14860902726650238,
"teacher_loss": 0.021592382341623306
},
{
"epoch": 0.5594002306805075,
"grad_norm": 4.96875,
"kd_loss": 0.10546875,
"learning_rate": 2.936772154118129e-06,
"loss": 0.1545,
"step": 1940,
"student_loss": 0.007172099314630032,
"teacher_loss": 0.000852234719786793
},
{
"epoch": 0.5608419838523645,
"grad_norm": 3.828125,
"kd_loss": 0.09619140625,
"learning_rate": 2.912217705227075e-06,
"loss": 0.1493,
"step": 1945,
"student_loss": 0.04466139152646065,
"teacher_loss": 0.028232689946889877
},
{
"epoch": 0.5622837370242214,
"grad_norm": 8.1875,
"kd_loss": 0.08740234375,
"learning_rate": 2.88772408915357e-06,
"loss": 0.1749,
"step": 1950,
"student_loss": 0.0008998726261779666,
"teacher_loss": 0.0005217011785134673
},
{
"epoch": 0.5637254901960784,
"grad_norm": 4.0625,
"kd_loss": 0.10498046875,
"learning_rate": 2.863292019581071e-06,
"loss": 0.1535,
"step": 1955,
"student_loss": 0.23264119029045105,
"teacher_loss": 0.0003505937347654253
},
{
"epoch": 0.5651672433679354,
"grad_norm": 4.5,
"kd_loss": 0.11572265625,
"learning_rate": 2.838922208399712e-06,
"loss": 0.1646,
"step": 1960,
"student_loss": 0.005253693088889122,
"teacher_loss": 0.0008160973084159195
},
{
"epoch": 0.5666089965397924,
"grad_norm": 3.359375,
"kd_loss": 0.08935546875,
"learning_rate": 2.8146153656855858e-06,
"loss": 0.1571,
"step": 1965,
"student_loss": 0.0008905039285309613,
"teacher_loss": 0.00038642369327135384
},
{
"epoch": 0.5680507497116494,
"grad_norm": 6.96875,
"kd_loss": 0.10107421875,
"learning_rate": 2.7903721996800248e-06,
"loss": 0.1488,
"step": 1970,
"student_loss": 0.001944546471349895,
"teacher_loss": 0.0004150049644522369
},
{
"epoch": 0.5694925028835064,
"grad_norm": 2.453125,
"kd_loss": 0.09716796875,
"learning_rate": 2.7661934167689887e-06,
"loss": 0.1556,
"step": 1975,
"student_loss": 0.0032470019068568945,
"teacher_loss": 0.0009415296372026205
},
{
"epoch": 0.5709342560553633,
"grad_norm": 2.25,
"kd_loss": 0.08642578125,
"learning_rate": 2.742079721462471e-06,
"loss": 0.1674,
"step": 1980,
"student_loss": 0.05945152789354324,
"teacher_loss": 0.0008091035415418446
},
{
"epoch": 0.5723760092272203,
"grad_norm": 3.65625,
"kd_loss": 0.0771484375,
"learning_rate": 2.7180318163739704e-06,
"loss": 0.1519,
"step": 1985,
"student_loss": 0.0015980260213837028,
"teacher_loss": 0.0005768106202594936
},
{
"epoch": 0.5738177623990772,
"grad_norm": 2.65625,
"kd_loss": 0.115234375,
"learning_rate": 2.6940504022000248e-06,
"loss": 0.1546,
"step": 1990,
"student_loss": 0.0632084533572197,
"teacher_loss": 0.015749456360936165
},
{
"epoch": 0.5752595155709342,
"grad_norm": 5.4375,
"kd_loss": 0.0869140625,
"learning_rate": 2.67013617769979e-06,
"loss": 0.153,
"step": 1995,
"student_loss": 0.0011626326013356447,
"teacher_loss": 0.000745030993130058
},
{
"epoch": 0.5767012687427913,
"grad_norm": 4.0,
"kd_loss": 0.09912109375,
"learning_rate": 2.6462898396746783e-06,
"loss": 0.1493,
"step": 2000,
"student_loss": 0.002248254604637623,
"teacher_loss": 0.0006202560034580529
}
],
"logging_steps": 5,
"max_steps": 3000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}