UMSR-Reasoner-7B / trainer_state.json
NorthernTribe-Research's picture
Autonomous Space trainer update
b39845b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 25,
"global_step": 256,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"distill_ce_loss": 10.82673168182373,
"distill_ce_weight": 0.35,
"distill_kd_loss": -0.0361328125,
"distill_kd_weight": 0.65,
"distill_temperature": 2.5,
"epoch": 0.00390625,
"grad_norm": 0.0181884765625,
"learning_rate": 0.0,
"loss": 3.765918493270874,
"step": 1
},
{
"distill_ce_loss": 10.829505920410156,
"distill_ce_weight": 0.3505859375,
"distill_kd_loss": -0.0361328125,
"distill_kd_weight": 0.6494140625,
"distill_temperature": 2.494921875,
"epoch": 0.0078125,
"grad_norm": 0.004180908203125,
"learning_rate": 1.25e-05,
"loss": 3.7732350826263428,
"step": 2
},
{
"distill_ce_loss": 10.825729370117188,
"distill_ce_weight": 0.35117187499999997,
"distill_kd_loss": -0.032470703125,
"distill_kd_weight": 0.648828125,
"distill_temperature": 2.48984375,
"epoch": 0.01171875,
"grad_norm": 0.0125732421875,
"learning_rate": 2.5e-05,
"loss": 3.780573606491089,
"step": 3
},
{
"distill_ce_loss": 10.830185890197754,
"distill_ce_weight": 0.3517578125,
"distill_kd_loss": -0.03125,
"distill_kd_weight": 0.6482421875000001,
"distill_temperature": 2.484765625,
"epoch": 0.015625,
"grad_norm": 0.00628662109375,
"learning_rate": 3.7500000000000003e-05,
"loss": 3.789339065551758,
"step": 4
},
{
"distill_ce_loss": 10.824288368225098,
"distill_ce_weight": 0.35234374999999996,
"distill_kd_loss": -0.0361328125,
"distill_kd_weight": 0.64765625,
"distill_temperature": 2.4796875,
"epoch": 0.01953125,
"grad_norm": 0.017333984375,
"learning_rate": 5e-05,
"loss": 3.7904326915740967,
"step": 5
},
{
"distill_ce_loss": 10.823088645935059,
"distill_ce_weight": 0.3529296875,
"distill_kd_loss": -0.031005859375,
"distill_kd_weight": 0.6470703125,
"distill_temperature": 2.474609375,
"epoch": 0.0234375,
"grad_norm": 0.01312255859375,
"learning_rate": 6.25e-05,
"loss": 3.799769639968872,
"step": 6
},
{
"distill_ce_loss": 10.827218055725098,
"distill_ce_weight": 0.353515625,
"distill_kd_loss": -0.031494140625,
"distill_kd_weight": 0.646484375,
"distill_temperature": 2.46953125,
"epoch": 0.02734375,
"grad_norm": 0.00823974609375,
"learning_rate": 7.500000000000001e-05,
"loss": 3.8072049617767334,
"step": 7
},
{
"distill_ce_loss": 10.827470779418945,
"distill_ce_weight": 0.35410156249999997,
"distill_kd_loss": -0.0296630859375,
"distill_kd_weight": 0.6458984375,
"distill_temperature": 2.464453125,
"epoch": 0.03125,
"grad_norm": 0.002349853515625,
"learning_rate": 8.75e-05,
"loss": 3.814859390258789,
"step": 8
},
{
"distill_ce_loss": 10.827388763427734,
"distill_ce_weight": 0.3546875,
"distill_kd_loss": -0.03271484375,
"distill_kd_weight": 0.6453125000000001,
"distill_temperature": 2.459375,
"epoch": 0.03515625,
"grad_norm": 0.01153564453125,
"learning_rate": 0.0001,
"loss": 3.8192214965820312,
"step": 9
},
{
"distill_ce_loss": 10.828258514404297,
"distill_ce_weight": 0.35527343749999996,
"distill_kd_loss": -0.031982421875,
"distill_kd_weight": 0.6447265625,
"distill_temperature": 2.454296875,
"epoch": 0.0390625,
"grad_norm": 0.00823974609375,
"learning_rate": 9.95967741935484e-05,
"loss": 3.8263626098632812,
"step": 10
},
{
"distill_ce_loss": 10.82706069946289,
"distill_ce_weight": 0.355859375,
"distill_kd_loss": -0.031982421875,
"distill_kd_weight": 0.644140625,
"distill_temperature": 2.44921875,
"epoch": 0.04296875,
"grad_norm": 0.010498046875,
"learning_rate": 9.919354838709678e-05,
"loss": 3.8322811126708984,
"step": 11
},
{
"distill_ce_loss": 10.826086044311523,
"distill_ce_weight": 0.3564453125,
"distill_kd_loss": -0.0341796875,
"distill_kd_weight": 0.6435546875,
"distill_temperature": 2.444140625,
"epoch": 0.046875,
"grad_norm": 0.004302978515625,
"learning_rate": 9.879032258064517e-05,
"loss": 3.836935043334961,
"step": 12
},
{
"distill_ce_loss": 10.82719612121582,
"distill_ce_weight": 0.35703124999999997,
"distill_kd_loss": -0.0306396484375,
"distill_kd_weight": 0.64296875,
"distill_temperature": 2.4390625,
"epoch": 0.05078125,
"grad_norm": 0.0322265625,
"learning_rate": 9.838709677419355e-05,
"loss": 3.845993995666504,
"step": 13
},
{
"distill_ce_loss": 10.827816009521484,
"distill_ce_weight": 0.3576171875,
"distill_kd_loss": -0.030029296875,
"distill_kd_weight": 0.6423828125000001,
"distill_temperature": 2.433984375,
"epoch": 0.0546875,
"grad_norm": 0.00165557861328125,
"learning_rate": 9.798387096774194e-05,
"loss": 3.852926015853882,
"step": 14
},
{
"distill_ce_loss": 10.826566696166992,
"distill_ce_weight": 0.35820312499999996,
"distill_kd_loss": -0.033935546875,
"distill_kd_weight": 0.641796875,
"distill_temperature": 2.42890625,
"epoch": 0.05859375,
"grad_norm": 0.0146484375,
"learning_rate": 9.758064516129033e-05,
"loss": 3.856381416320801,
"step": 15
},
{
"distill_ce_loss": 10.825676918029785,
"distill_ce_weight": 0.3587890625,
"distill_kd_loss": -0.032470703125,
"distill_kd_weight": 0.6412109375,
"distill_temperature": 2.423828125,
"epoch": 0.0625,
"grad_norm": 0.0007781982421875,
"learning_rate": 9.717741935483872e-05,
"loss": 3.863260269165039,
"step": 16
},
{
"distill_ce_loss": 10.827549934387207,
"distill_ce_weight": 0.359375,
"distill_kd_loss": -0.03369140625,
"distill_kd_weight": 0.640625,
"distill_temperature": 2.41875,
"epoch": 0.06640625,
"grad_norm": 0.00131988525390625,
"learning_rate": 9.677419354838711e-05,
"loss": 3.869544267654419,
"step": 17
},
{
"distill_ce_loss": 10.827980995178223,
"distill_ce_weight": 0.35996093749999997,
"distill_kd_loss": -0.03369140625,
"distill_kd_weight": 0.6400390625,
"distill_temperature": 2.413671875,
"epoch": 0.0703125,
"grad_norm": 0.000782012939453125,
"learning_rate": 9.63709677419355e-05,
"loss": 3.8760437965393066,
"step": 18
},
{
"distill_ce_loss": 10.82925033569336,
"distill_ce_weight": 0.360546875,
"distill_kd_loss": -0.03369140625,
"distill_kd_weight": 0.6394531250000001,
"distill_temperature": 2.40859375,
"epoch": 0.07421875,
"grad_norm": 0.000431060791015625,
"learning_rate": 9.596774193548387e-05,
"loss": 3.8829681873321533,
"step": 19
},
{
"distill_ce_loss": 10.827664375305176,
"distill_ce_weight": 0.36113281249999996,
"distill_kd_loss": -0.03369140625,
"distill_kd_weight": 0.6388671875,
"distill_temperature": 2.403515625,
"epoch": 0.078125,
"grad_norm": 0.000698089599609375,
"learning_rate": 9.556451612903226e-05,
"loss": 3.888740301132202,
"step": 20
},
{
"distill_ce_loss": 10.823915481567383,
"distill_ce_weight": 0.36171875,
"distill_kd_loss": -0.027587890625,
"distill_kd_weight": 0.63828125,
"distill_temperature": 2.3984375,
"epoch": 0.08203125,
"grad_norm": 0.0120849609375,
"learning_rate": 9.516129032258065e-05,
"loss": 3.897634983062744,
"step": 21
},
{
"distill_ce_loss": 10.826141357421875,
"distill_ce_weight": 0.3623046875,
"distill_kd_loss": -0.03173828125,
"distill_kd_weight": 0.6376953125,
"distill_temperature": 2.393359375,
"epoch": 0.0859375,
"grad_norm": 0.00787353515625,
"learning_rate": 9.475806451612904e-05,
"loss": 3.9020981788635254,
"step": 22
},
{
"distill_ce_loss": 10.828008651733398,
"distill_ce_weight": 0.36289062499999997,
"distill_kd_loss": -0.031494140625,
"distill_kd_weight": 0.637109375,
"distill_temperature": 2.38828125,
"epoch": 0.08984375,
"grad_norm": 0.0072021484375,
"learning_rate": 9.435483870967743e-05,
"loss": 3.909363269805908,
"step": 23
},
{
"distill_ce_loss": 10.826719284057617,
"distill_ce_weight": 0.3634765625,
"distill_kd_loss": -0.031982421875,
"distill_kd_weight": 0.6365234375000001,
"distill_temperature": 2.383203125,
"epoch": 0.09375,
"grad_norm": 0.0021514892578125,
"learning_rate": 9.395161290322582e-05,
"loss": 3.9148731231689453,
"step": 24
},
{
"distill_ce_loss": 10.828035354614258,
"distill_ce_weight": 0.36406249999999996,
"distill_kd_loss": -0.03271484375,
"distill_kd_weight": 0.6359375,
"distill_temperature": 2.378125,
"epoch": 0.09765625,
"grad_norm": 0.0015411376953125,
"learning_rate": 9.35483870967742e-05,
"loss": 3.9213294982910156,
"step": 25
},
{
"distill_ce_loss": 10.826539039611816,
"distill_ce_weight": 0.3646484375,
"distill_kd_loss": -0.032470703125,
"distill_kd_weight": 0.6353515625,
"distill_temperature": 2.373046875,
"epoch": 0.09765625,
"eval_loss": 3.927885055541992,
"eval_runtime": 15.0677,
"eval_samples_per_second": 4.247,
"eval_steps_per_second": 4.247,
"step": 25
},
{
"distill_ce_loss": 10.82087516784668,
"distill_ce_weight": 0.3646484375,
"distill_kd_loss": -0.03173828125,
"distill_kd_weight": 0.6353515625,
"distill_temperature": 2.373046875,
"epoch": 0.1015625,
"grad_norm": 0.003143310546875,
"learning_rate": 9.314516129032259e-05,
"loss": 3.925673484802246,
"step": 26
},
{
"distill_ce_loss": 10.827393531799316,
"distill_ce_weight": 0.365234375,
"distill_kd_loss": -0.0322265625,
"distill_kd_weight": 0.634765625,
"distill_temperature": 2.36796875,
"epoch": 0.10546875,
"grad_norm": 0.00021648406982421875,
"learning_rate": 9.274193548387096e-05,
"loss": 3.934028387069702,
"step": 27
},
{
"distill_ce_loss": 10.828241348266602,
"distill_ce_weight": 0.36582031249999997,
"distill_kd_loss": -0.0302734375,
"distill_kd_weight": 0.6341796875,
"distill_temperature": 2.362890625,
"epoch": 0.109375,
"grad_norm": 0.005126953125,
"learning_rate": 9.233870967741935e-05,
"loss": 3.942025661468506,
"step": 28
},
{
"distill_ce_loss": 10.829521179199219,
"distill_ce_weight": 0.36640625,
"distill_kd_loss": -0.029052734375,
"distill_kd_weight": 0.6335937500000001,
"distill_temperature": 2.3578125,
"epoch": 0.11328125,
"grad_norm": 0.0032806396484375,
"learning_rate": 9.193548387096774e-05,
"loss": 3.9495718479156494,
"step": 29
},
{
"distill_ce_loss": 10.825637817382812,
"distill_ce_weight": 0.36699218749999996,
"distill_kd_loss": -0.0311279296875,
"distill_kd_weight": 0.6330078125,
"distill_temperature": 2.352734375,
"epoch": 0.1171875,
"grad_norm": 0.00286865234375,
"learning_rate": 9.153225806451613e-05,
"loss": 3.9532711505889893,
"step": 30
},
{
"distill_ce_loss": 10.827566146850586,
"distill_ce_weight": 0.367578125,
"distill_kd_loss": -0.03173828125,
"distill_kd_weight": 0.632421875,
"distill_temperature": 2.34765625,
"epoch": 0.12109375,
"grad_norm": 0.0004177093505859375,
"learning_rate": 9.112903225806452e-05,
"loss": 3.9599568843841553,
"step": 31
},
{
"distill_ce_loss": 10.824216842651367,
"distill_ce_weight": 0.3681640625,
"distill_kd_loss": -0.0301513671875,
"distill_kd_weight": 0.6318359375,
"distill_temperature": 2.342578125,
"epoch": 0.125,
"grad_norm": 0.00482177734375,
"learning_rate": 9.072580645161291e-05,
"loss": 3.9660446643829346,
"step": 32
},
{
"distill_ce_loss": 10.827281951904297,
"distill_ce_weight": 0.36874999999999997,
"distill_kd_loss": -0.0303955078125,
"distill_kd_weight": 0.63125,
"distill_temperature": 2.3375,
"epoch": 0.12890625,
"grad_norm": 0.00250244140625,
"learning_rate": 9.032258064516129e-05,
"loss": 3.973395347595215,
"step": 33
},
{
"distill_ce_loss": 10.82607364654541,
"distill_ce_weight": 0.3693359375,
"distill_kd_loss": -0.028564453125,
"distill_kd_weight": 0.6306640625000001,
"distill_temperature": 2.332421875,
"epoch": 0.1328125,
"grad_norm": 0.01165771484375,
"learning_rate": 8.991935483870968e-05,
"loss": 3.98039174079895,
"step": 34
},
{
"distill_ce_loss": 10.825122833251953,
"distill_ce_weight": 0.36992187499999996,
"distill_kd_loss": -0.0299072265625,
"distill_kd_weight": 0.630078125,
"distill_temperature": 2.32734375,
"epoch": 0.13671875,
"grad_norm": 0.0021514892578125,
"learning_rate": 8.951612903225806e-05,
"loss": 3.9856510162353516,
"step": 35
},
{
"distill_ce_loss": 10.829912185668945,
"distill_ce_weight": 0.3705078125,
"distill_kd_loss": -0.03076171875,
"distill_kd_weight": 0.6294921875,
"distill_temperature": 2.322265625,
"epoch": 0.140625,
"grad_norm": 0.0008697509765625,
"learning_rate": 8.911290322580645e-05,
"loss": 3.9931578636169434,
"step": 36
},
{
"distill_ce_loss": 10.827726364135742,
"distill_ce_weight": 0.37109375,
"distill_kd_loss": -0.03076171875,
"distill_kd_weight": 0.62890625,
"distill_temperature": 2.3171875,
"epoch": 0.14453125,
"grad_norm": 0.0003376007080078125,
"learning_rate": 8.870967741935484e-05,
"loss": 3.998814582824707,
"step": 37
},
{
"distill_ce_loss": 10.82931137084961,
"distill_ce_weight": 0.37167968749999997,
"distill_kd_loss": -0.0301513671875,
"distill_kd_weight": 0.6283203125,
"distill_temperature": 2.312109375,
"epoch": 0.1484375,
"grad_norm": 0.004302978515625,
"learning_rate": 8.830645161290323e-05,
"loss": 4.0061140060424805,
"step": 38
},
{
"distill_ce_loss": 10.828788757324219,
"distill_ce_weight": 0.372265625,
"distill_kd_loss": -0.02880859375,
"distill_kd_weight": 0.6277343750000001,
"distill_temperature": 2.30703125,
"epoch": 0.15234375,
"grad_norm": 0.00543212890625,
"learning_rate": 8.790322580645162e-05,
"loss": 4.013119697570801,
"step": 39
},
{
"distill_ce_loss": 10.826772689819336,
"distill_ce_weight": 0.37285156249999996,
"distill_kd_loss": -0.0294189453125,
"distill_kd_weight": 0.6271484375,
"distill_temperature": 2.301953125,
"epoch": 0.15625,
"grad_norm": 0.004791259765625,
"learning_rate": 8.75e-05,
"loss": 4.018346309661865,
"step": 40
},
{
"distill_ce_loss": 10.826981544494629,
"distill_ce_weight": 0.3734375,
"distill_kd_loss": -0.02734375,
"distill_kd_weight": 0.6265625,
"distill_temperature": 2.296875,
"epoch": 0.16015625,
"grad_norm": 0.0167236328125,
"learning_rate": 8.709677419354839e-05,
"loss": 4.026111125946045,
"step": 41
},
{
"distill_ce_loss": 10.826519966125488,
"distill_ce_weight": 0.3740234375,
"distill_kd_loss": -0.02978515625,
"distill_kd_weight": 0.6259765625,
"distill_temperature": 2.291796875,
"epoch": 0.1640625,
"grad_norm": 0.00041961669921875,
"learning_rate": 8.669354838709678e-05,
"loss": 4.03069543838501,
"step": 42
},
{
"distill_ce_loss": 10.828641891479492,
"distill_ce_weight": 0.37460937499999997,
"distill_kd_loss": -0.0302734375,
"distill_kd_weight": 0.625390625,
"distill_temperature": 2.28671875,
"epoch": 0.16796875,
"grad_norm": 0.000732421875,
"learning_rate": 8.629032258064517e-05,
"loss": 4.037590026855469,
"step": 43
},
{
"distill_ce_loss": 10.826750755310059,
"distill_ce_weight": 0.3751953125,
"distill_kd_loss": -0.0296630859375,
"distill_kd_weight": 0.6248046875000001,
"distill_temperature": 2.281640625,
"epoch": 0.171875,
"grad_norm": 0.0003414154052734375,
"learning_rate": 8.588709677419356e-05,
"loss": 4.043591499328613,
"step": 44
},
{
"distill_ce_loss": 10.826141357421875,
"distill_ce_weight": 0.37578124999999996,
"distill_kd_loss": -0.030029296875,
"distill_kd_weight": 0.62421875,
"distill_temperature": 2.2765625,
"epoch": 0.17578125,
"grad_norm": 0.0005035400390625,
"learning_rate": 8.548387096774195e-05,
"loss": 4.049461841583252,
"step": 45
},
{
"distill_ce_loss": 10.829056739807129,
"distill_ce_weight": 0.3763671875,
"distill_kd_loss": -0.02880859375,
"distill_kd_weight": 0.6236328125,
"distill_temperature": 2.271484375,
"epoch": 0.1796875,
"grad_norm": 0.000476837158203125,
"learning_rate": 8.508064516129032e-05,
"loss": 4.057757377624512,
"step": 46
},
{
"distill_ce_loss": 10.82729721069336,
"distill_ce_weight": 0.376953125,
"distill_kd_loss": -0.0296630859375,
"distill_kd_weight": 0.623046875,
"distill_temperature": 2.26640625,
"epoch": 0.18359375,
"grad_norm": 0.00677490234375,
"learning_rate": 8.467741935483871e-05,
"loss": 4.06295108795166,
"step": 47
},
{
"distill_ce_loss": 10.822839736938477,
"distill_ce_weight": 0.37753906249999997,
"distill_kd_loss": -0.0291748046875,
"distill_kd_weight": 0.6224609375,
"distill_temperature": 2.261328125,
"epoch": 0.1875,
"grad_norm": 0.000530242919921875,
"learning_rate": 8.42741935483871e-05,
"loss": 4.067856311798096,
"step": 48
},
{
"distill_ce_loss": 10.82568073272705,
"distill_ce_weight": 0.378125,
"distill_kd_loss": -0.029052734375,
"distill_kd_weight": 0.6218750000000001,
"distill_temperature": 2.25625,
"epoch": 0.19140625,
"grad_norm": 0.0003108978271484375,
"learning_rate": 8.387096774193549e-05,
"loss": 4.075394153594971,
"step": 49
},
{
"distill_ce_loss": 10.828625679016113,
"distill_ce_weight": 0.37871093749999996,
"distill_kd_loss": -0.0291748046875,
"distill_kd_weight": 0.6212890625,
"distill_temperature": 2.251171875,
"epoch": 0.1953125,
"grad_norm": 0.0003566741943359375,
"learning_rate": 8.346774193548388e-05,
"loss": 4.082852363586426,
"step": 50
},
{
"distill_ce_loss": 10.82653522491455,
"distill_ce_weight": 0.379296875,
"distill_kd_loss": -0.02880859375,
"distill_kd_weight": 0.620703125,
"distill_temperature": 2.24609375,
"epoch": 0.1953125,
"eval_loss": 4.088868141174316,
"eval_runtime": 13.2468,
"eval_samples_per_second": 4.831,
"eval_steps_per_second": 4.831,
"step": 50
},
{
"distill_ce_loss": 10.827178955078125,
"distill_ce_weight": 0.379296875,
"distill_kd_loss": -0.0289306640625,
"distill_kd_weight": 0.620703125,
"distill_temperature": 2.24609375,
"epoch": 0.19921875,
"grad_norm": 6.198883056640625e-05,
"learning_rate": 8.306451612903227e-05,
"loss": 4.088770866394043,
"step": 51
},
{
"distill_ce_loss": 10.826313018798828,
"distill_ce_weight": 0.3798828125,
"distill_kd_loss": -0.0291748046875,
"distill_kd_weight": 0.6201171875,
"distill_temperature": 2.241015625,
"epoch": 0.203125,
"grad_norm": 0.00010395050048828125,
"learning_rate": 8.266129032258066e-05,
"loss": 4.094663619995117,
"step": 52
},
{
"distill_ce_loss": 10.829632759094238,
"distill_ce_weight": 0.38046874999999997,
"distill_kd_loss": -0.02880859375,
"distill_kd_weight": 0.61953125,
"distill_temperature": 2.2359375,
"epoch": 0.20703125,
"grad_norm": 0.000644683837890625,
"learning_rate": 8.225806451612904e-05,
"loss": 4.102514743804932,
"step": 53
},
{
"distill_ce_loss": 10.825312614440918,
"distill_ce_weight": 0.3810546875,
"distill_kd_loss": -0.02880859375,
"distill_kd_weight": 0.6189453125000001,
"distill_temperature": 2.230859375,
"epoch": 0.2109375,
"grad_norm": 0.0079345703125,
"learning_rate": 8.185483870967743e-05,
"loss": 4.107213973999023,
"step": 54
},
{
"distill_ce_loss": 10.826827049255371,
"distill_ce_weight": 0.38164062499999996,
"distill_kd_loss": -0.02734375,
"distill_kd_weight": 0.618359375,
"distill_temperature": 2.22578125,
"epoch": 0.21484375,
"grad_norm": 0.00110626220703125,
"learning_rate": 8.145161290322582e-05,
"loss": 4.114989280700684,
"step": 55
},
{
"distill_ce_loss": 10.83178997039795,
"distill_ce_weight": 0.3822265625,
"distill_kd_loss": -0.0281982421875,
"distill_kd_weight": 0.6177734375,
"distill_temperature": 2.220703125,
"epoch": 0.21875,
"grad_norm": 0.0059814453125,
"learning_rate": 8.104838709677419e-05,
"loss": 4.12274169921875,
"step": 56
},
{
"distill_ce_loss": 10.824783325195312,
"distill_ce_weight": 0.3828125,
"distill_kd_loss": -0.0281982421875,
"distill_kd_weight": 0.6171875,
"distill_temperature": 2.215625,
"epoch": 0.22265625,
"grad_norm": 0.000339508056640625,
"learning_rate": 8.064516129032258e-05,
"loss": 4.126406192779541,
"step": 57
},
{
"distill_ce_loss": 10.824844360351562,
"distill_ce_weight": 0.38339843749999997,
"distill_kd_loss": -0.02685546875,
"distill_kd_weight": 0.6166015625,
"distill_temperature": 2.210546875,
"epoch": 0.2265625,
"grad_norm": 0.00390625,
"learning_rate": 8.024193548387097e-05,
"loss": 4.133626937866211,
"step": 58
},
{
"distill_ce_loss": 10.829590797424316,
"distill_ce_weight": 0.383984375,
"distill_kd_loss": -0.0277099609375,
"distill_kd_weight": 0.6160156250000001,
"distill_temperature": 2.20546875,
"epoch": 0.23046875,
"grad_norm": 0.006011962890625,
"learning_rate": 7.983870967741936e-05,
"loss": 4.141304016113281,
"step": 59
},
{
"distill_ce_loss": 10.826620101928711,
"distill_ce_weight": 0.38457031249999996,
"distill_kd_loss": -0.025146484375,
"distill_kd_weight": 0.6154296875,
"distill_temperature": 2.200390625,
"epoch": 0.234375,
"grad_norm": 0.0019378662109375,
"learning_rate": 7.943548387096774e-05,
"loss": 4.1480937004089355,
"step": 60
},
{
"distill_ce_loss": 10.827092170715332,
"distill_ce_weight": 0.38515625,
"distill_kd_loss": -0.0277099609375,
"distill_kd_weight": 0.61484375,
"distill_temperature": 2.1953125,
"epoch": 0.23828125,
"grad_norm": 0.0003719329833984375,
"learning_rate": 7.903225806451613e-05,
"loss": 4.153032302856445,
"step": 61
},
{
"distill_ce_loss": 10.82585334777832,
"distill_ce_weight": 0.3857421875,
"distill_kd_loss": -0.0274658203125,
"distill_kd_weight": 0.6142578125,
"distill_temperature": 2.190234375,
"epoch": 0.2421875,
"grad_norm": 0.0002956390380859375,
"learning_rate": 7.862903225806451e-05,
"loss": 4.15914249420166,
"step": 62
},
{
"distill_ce_loss": 10.82608413696289,
"distill_ce_weight": 0.38632812499999997,
"distill_kd_loss": -0.027099609375,
"distill_kd_weight": 0.613671875,
"distill_temperature": 2.18515625,
"epoch": 0.24609375,
"grad_norm": 0.002288818359375,
"learning_rate": 7.82258064516129e-05,
"loss": 4.16581916809082,
"step": 63
},
{
"distill_ce_loss": 10.821599960327148,
"distill_ce_weight": 0.3869140625,
"distill_kd_loss": -0.0274658203125,
"distill_kd_weight": 0.6130859375000001,
"distill_temperature": 2.180078125,
"epoch": 0.25,
"grad_norm": 0.0004634857177734375,
"learning_rate": 7.78225806451613e-05,
"loss": 4.1701836585998535,
"step": 64
},
{
"distill_ce_loss": 10.82644271850586,
"distill_ce_weight": 0.38749999999999996,
"distill_kd_loss": -0.02685546875,
"distill_kd_weight": 0.6125,
"distill_temperature": 2.175,
"epoch": 0.25390625,
"grad_norm": 0.00035858154296875,
"learning_rate": 7.741935483870968e-05,
"loss": 4.17876672744751,
"step": 65
},
{
"distill_ce_loss": 10.825672149658203,
"distill_ce_weight": 0.3880859375,
"distill_kd_loss": -0.0274658203125,
"distill_kd_weight": 0.6119140625,
"distill_temperature": 2.169921875,
"epoch": 0.2578125,
"grad_norm": 0.0016632080078125,
"learning_rate": 7.701612903225807e-05,
"loss": 4.184445381164551,
"step": 66
},
{
"distill_ce_loss": 10.82388973236084,
"distill_ce_weight": 0.388671875,
"distill_kd_loss": -0.026611328125,
"distill_kd_weight": 0.611328125,
"distill_temperature": 2.16484375,
"epoch": 0.26171875,
"grad_norm": 0.00138092041015625,
"learning_rate": 7.661290322580645e-05,
"loss": 4.190706253051758,
"step": 67
},
{
"distill_ce_loss": 10.82534408569336,
"distill_ce_weight": 0.38925781249999997,
"distill_kd_loss": -0.0267333984375,
"distill_kd_weight": 0.6107421875,
"distill_temperature": 2.159765625,
"epoch": 0.265625,
"grad_norm": 0.0003376007080078125,
"learning_rate": 7.620967741935484e-05,
"loss": 4.197492599487305,
"step": 68
},
{
"distill_ce_loss": 10.828094482421875,
"distill_ce_weight": 0.38984375,
"distill_kd_loss": -0.0269775390625,
"distill_kd_weight": 0.6101562500000001,
"distill_temperature": 2.1546875,
"epoch": 0.26953125,
"grad_norm": 0.000881195068359375,
"learning_rate": 7.580645161290323e-05,
"loss": 4.2047858238220215,
"step": 69
},
{
"distill_ce_loss": 10.828548431396484,
"distill_ce_weight": 0.39042968749999996,
"distill_kd_loss": -0.0263671875,
"distill_kd_weight": 0.6095703125,
"distill_temperature": 2.149609375,
"epoch": 0.2734375,
"grad_norm": 0.00139617919921875,
"learning_rate": 7.540322580645162e-05,
"loss": 4.211673259735107,
"step": 70
},
{
"distill_ce_loss": 10.827938079833984,
"distill_ce_weight": 0.391015625,
"distill_kd_loss": -0.02490234375,
"distill_kd_weight": 0.608984375,
"distill_temperature": 2.14453125,
"epoch": 0.27734375,
"grad_norm": 0.0054931640625,
"learning_rate": 7.500000000000001e-05,
"loss": 4.218756198883057,
"step": 71
},
{
"distill_ce_loss": 10.829117774963379,
"distill_ce_weight": 0.3916015625,
"distill_kd_loss": -0.025390625,
"distill_kd_weight": 0.6083984375,
"distill_temperature": 2.139453125,
"epoch": 0.28125,
"grad_norm": 0.0019073486328125,
"learning_rate": 7.45967741935484e-05,
"loss": 4.225257396697998,
"step": 72
},
{
"distill_ce_loss": 10.828458786010742,
"distill_ce_weight": 0.39218749999999997,
"distill_kd_loss": -0.026123046875,
"distill_kd_weight": 0.6078125,
"distill_temperature": 2.134375,
"epoch": 0.28515625,
"grad_norm": 0.00016880035400390625,
"learning_rate": 7.419354838709677e-05,
"loss": 4.230916976928711,
"step": 73
},
{
"distill_ce_loss": 10.827070236206055,
"distill_ce_weight": 0.3927734375,
"distill_kd_loss": -0.026123046875,
"distill_kd_weight": 0.6072265625000001,
"distill_temperature": 2.129296875,
"epoch": 0.2890625,
"grad_norm": 0.00238037109375,
"learning_rate": 7.379032258064516e-05,
"loss": 4.2367167472839355,
"step": 74
},
{
"distill_ce_loss": 10.826481819152832,
"distill_ce_weight": 0.39335937499999996,
"distill_kd_loss": -0.02587890625,
"distill_kd_weight": 0.606640625,
"distill_temperature": 2.12421875,
"epoch": 0.29296875,
"grad_norm": 0.000629425048828125,
"learning_rate": 7.338709677419355e-05,
"loss": 4.242950916290283,
"step": 75
},
{
"distill_ce_loss": 10.826537132263184,
"distill_ce_weight": 0.3939453125,
"distill_kd_loss": -0.025634765625,
"distill_kd_weight": 0.6060546875,
"distill_temperature": 2.119140625,
"epoch": 0.29296875,
"eval_loss": 4.2497992515563965,
"eval_runtime": 14.844,
"eval_samples_per_second": 4.312,
"eval_steps_per_second": 4.312,
"step": 75
},
{
"distill_ce_loss": 10.827073097229004,
"distill_ce_weight": 0.3939453125,
"distill_kd_loss": -0.025634765625,
"distill_kd_weight": 0.6060546875,
"distill_temperature": 2.119140625,
"epoch": 0.296875,
"grad_norm": 0.000728607177734375,
"learning_rate": 7.298387096774194e-05,
"loss": 4.249710559844971,
"step": 76
},
{
"distill_ce_loss": 10.827564239501953,
"distill_ce_weight": 0.39453125,
"distill_kd_loss": -0.0252685546875,
"distill_kd_weight": 0.60546875,
"distill_temperature": 2.1140625,
"epoch": 0.30078125,
"grad_norm": 0.0002899169921875,
"learning_rate": 7.258064516129033e-05,
"loss": 4.256492614746094,
"step": 77
},
{
"distill_ce_loss": 10.82872486114502,
"distill_ce_weight": 0.39511718749999997,
"distill_kd_loss": -0.0255126953125,
"distill_kd_weight": 0.6048828125,
"distill_temperature": 2.108984375,
"epoch": 0.3046875,
"grad_norm": 0.00021839141845703125,
"learning_rate": 7.217741935483872e-05,
"loss": 4.263173580169678,
"step": 78
},
{
"distill_ce_loss": 10.823871612548828,
"distill_ce_weight": 0.395703125,
"distill_kd_loss": -0.0242919921875,
"distill_kd_weight": 0.6042968750000001,
"distill_temperature": 2.10390625,
"epoch": 0.30859375,
"grad_norm": 0.00262451171875,
"learning_rate": 7.177419354838711e-05,
"loss": 4.2683305740356445,
"step": 79
},
{
"distill_ce_loss": 10.828267097473145,
"distill_ce_weight": 0.39628906249999996,
"distill_kd_loss": -0.025146484375,
"distill_kd_weight": 0.6037109375,
"distill_temperature": 2.098828125,
"epoch": 0.3125,
"grad_norm": 0.00029754638671875,
"learning_rate": 7.137096774193549e-05,
"loss": 4.275926113128662,
"step": 80
},
{
"distill_ce_loss": 10.82632827758789,
"distill_ce_weight": 0.396875,
"distill_kd_loss": -0.0250244140625,
"distill_kd_weight": 0.603125,
"distill_temperature": 2.09375,
"epoch": 0.31640625,
"grad_norm": 0.000240325927734375,
"learning_rate": 7.096774193548388e-05,
"loss": 4.281623363494873,
"step": 81
},
{
"distill_ce_loss": 10.827393531799316,
"distill_ce_weight": 0.3974609375,
"distill_kd_loss": -0.025146484375,
"distill_kd_weight": 0.6025390625,
"distill_temperature": 2.088671875,
"epoch": 0.3203125,
"grad_norm": 0.000492095947265625,
"learning_rate": 7.056451612903226e-05,
"loss": 4.288329124450684,
"step": 82
},
{
"distill_ce_loss": 10.82739543914795,
"distill_ce_weight": 0.39804687499999997,
"distill_kd_loss": -0.024658203125,
"distill_kd_weight": 0.601953125,
"distill_temperature": 2.08359375,
"epoch": 0.32421875,
"grad_norm": 0.006439208984375,
"learning_rate": 7.016129032258065e-05,
"loss": 4.294979572296143,
"step": 83
},
{
"distill_ce_loss": 10.826316833496094,
"distill_ce_weight": 0.3986328125,
"distill_kd_loss": -0.0240478515625,
"distill_kd_weight": 0.6013671875000001,
"distill_temperature": 2.078515625,
"epoch": 0.328125,
"grad_norm": 0.00177764892578125,
"learning_rate": 6.975806451612904e-05,
"loss": 4.301259994506836,
"step": 84
},
{
"distill_ce_loss": 10.825986862182617,
"distill_ce_weight": 0.39921874999999996,
"distill_kd_loss": -0.024169921875,
"distill_kd_weight": 0.60078125,
"distill_temperature": 2.0734375,
"epoch": 0.33203125,
"grad_norm": 0.0010528564453125,
"learning_rate": 6.935483870967743e-05,
"loss": 4.30741024017334,
"step": 85
},
{
"distill_ce_loss": 10.822548866271973,
"distill_ce_weight": 0.3998046875,
"distill_kd_loss": -0.02392578125,
"distill_kd_weight": 0.6001953125,
"distill_temperature": 2.068359375,
"epoch": 0.3359375,
"grad_norm": 0.000331878662109375,
"learning_rate": 6.895161290322581e-05,
"loss": 4.312562465667725,
"step": 86
},
{
"distill_ce_loss": 10.8272066116333,
"distill_ce_weight": 0.400390625,
"distill_kd_loss": -0.0242919921875,
"distill_kd_weight": 0.599609375,
"distill_temperature": 2.06328125,
"epoch": 0.33984375,
"grad_norm": 0.0120849609375,
"learning_rate": 6.854838709677419e-05,
"loss": 4.3205246925354,
"step": 87
},
{
"distill_ce_loss": 10.828274726867676,
"distill_ce_weight": 0.40097656249999997,
"distill_kd_loss": -0.0233154296875,
"distill_kd_weight": 0.5990234375,
"distill_temperature": 2.058203125,
"epoch": 0.34375,
"grad_norm": 0.00982666015625,
"learning_rate": 6.814516129032257e-05,
"loss": 4.327907562255859,
"step": 88
},
{
"distill_ce_loss": 10.824246406555176,
"distill_ce_weight": 0.4015625,
"distill_kd_loss": -0.023681640625,
"distill_kd_weight": 0.5984375,
"distill_temperature": 2.053125,
"epoch": 0.34765625,
"grad_norm": 0.0019378662109375,
"learning_rate": 6.774193548387096e-05,
"loss": 4.332451343536377,
"step": 89
},
{
"distill_ce_loss": 10.828485488891602,
"distill_ce_weight": 0.40214843749999996,
"distill_kd_loss": -0.0233154296875,
"distill_kd_weight": 0.5978515625,
"distill_temperature": 2.048046875,
"epoch": 0.3515625,
"grad_norm": 0.00112152099609375,
"learning_rate": 6.733870967741935e-05,
"loss": 4.340742588043213,
"step": 90
},
{
"distill_ce_loss": 10.828805923461914,
"distill_ce_weight": 0.402734375,
"distill_kd_loss": -0.0224609375,
"distill_kd_weight": 0.597265625,
"distill_temperature": 2.04296875,
"epoch": 0.35546875,
"grad_norm": 0.00023174285888671875,
"learning_rate": 6.693548387096774e-05,
"loss": 4.3477044105529785,
"step": 91
},
{
"distill_ce_loss": 10.826385498046875,
"distill_ce_weight": 0.4033203125,
"distill_kd_loss": -0.02294921875,
"distill_kd_weight": 0.5966796875,
"distill_temperature": 2.037890625,
"epoch": 0.359375,
"grad_norm": 0.00016689300537109375,
"learning_rate": 6.653225806451613e-05,
"loss": 4.352829456329346,
"step": 92
},
{
"distill_ce_loss": 10.828919410705566,
"distill_ce_weight": 0.40390624999999997,
"distill_kd_loss": -0.022705078125,
"distill_kd_weight": 0.59609375,
"distill_temperature": 2.0328125,
"epoch": 0.36328125,
"grad_norm": 0.000606536865234375,
"learning_rate": 6.612903225806452e-05,
"loss": 4.360318660736084,
"step": 93
},
{
"distill_ce_loss": 10.828105926513672,
"distill_ce_weight": 0.4044921875,
"distill_kd_loss": -0.022705078125,
"distill_kd_weight": 0.5955078125,
"distill_temperature": 2.027734375,
"epoch": 0.3671875,
"grad_norm": 0.00011110305786132812,
"learning_rate": 6.57258064516129e-05,
"loss": 4.366334438323975,
"step": 94
},
{
"distill_ce_loss": 10.829119682312012,
"distill_ce_weight": 0.40507812499999996,
"distill_kd_loss": -0.021484375,
"distill_kd_weight": 0.594921875,
"distill_temperature": 2.02265625,
"epoch": 0.37109375,
"grad_norm": 0.000762939453125,
"learning_rate": 6.532258064516129e-05,
"loss": 4.373883247375488,
"step": 95
},
{
"distill_ce_loss": 10.825207710266113,
"distill_ce_weight": 0.4056640625,
"distill_kd_loss": -0.0220947265625,
"distill_kd_weight": 0.5943359375,
"distill_temperature": 2.017578125,
"epoch": 0.375,
"grad_norm": 0.0001430511474609375,
"learning_rate": 6.491935483870968e-05,
"loss": 4.378274917602539,
"step": 96
},
{
"distill_ce_loss": 10.826506614685059,
"distill_ce_weight": 0.40625,
"distill_kd_loss": -0.0216064453125,
"distill_kd_weight": 0.59375,
"distill_temperature": 2.0125,
"epoch": 0.37890625,
"grad_norm": 0.0086669921875,
"learning_rate": 6.451612903225807e-05,
"loss": 4.385450839996338,
"step": 97
},
{
"distill_ce_loss": 10.829300880432129,
"distill_ce_weight": 0.40683593749999997,
"distill_kd_loss": -0.021484375,
"distill_kd_weight": 0.5931640625,
"distill_temperature": 2.007421875,
"epoch": 0.3828125,
"grad_norm": 0.000446319580078125,
"learning_rate": 6.411290322580646e-05,
"loss": 4.3929924964904785,
"step": 98
},
{
"distill_ce_loss": 10.826828002929688,
"distill_ce_weight": 0.407421875,
"distill_kd_loss": -0.02099609375,
"distill_kd_weight": 0.592578125,
"distill_temperature": 2.00234375,
"epoch": 0.38671875,
"grad_norm": 0.0084228515625,
"learning_rate": 6.370967741935485e-05,
"loss": 4.398635387420654,
"step": 99
},
{
"distill_ce_loss": 10.824376106262207,
"distill_ce_weight": 0.40800781249999996,
"distill_kd_loss": -0.021728515625,
"distill_kd_weight": 0.5919921875,
"distill_temperature": 1.9972656249999998,
"epoch": 0.390625,
"grad_norm": 0.0003108978271484375,
"learning_rate": 6.330645161290322e-05,
"loss": 4.4035515785217285,
"step": 100
},
{
"distill_ce_loss": 10.826534271240234,
"distill_ce_weight": 0.40859375,
"distill_kd_loss": -0.0213623046875,
"distill_kd_weight": 0.59140625,
"distill_temperature": 1.9921875,
"epoch": 0.390625,
"eval_loss": 4.411334991455078,
"eval_runtime": 15.556,
"eval_samples_per_second": 4.114,
"eval_steps_per_second": 4.114,
"step": 100
},
{
"distill_ce_loss": 10.824360847473145,
"distill_ce_weight": 0.40859375,
"distill_kd_loss": -0.021240234375,
"distill_kd_weight": 0.59140625,
"distill_temperature": 1.9921875,
"epoch": 0.39453125,
"grad_norm": 0.0004673004150390625,
"learning_rate": 6.290322580645161e-05,
"loss": 4.410192966461182,
"step": 101
},
{
"distill_ce_loss": 10.825730323791504,
"distill_ce_weight": 0.4091796875,
"distill_kd_loss": -0.0211181640625,
"distill_kd_weight": 0.5908203125,
"distill_temperature": 1.987109375,
"epoch": 0.3984375,
"grad_norm": 0.0002460479736328125,
"learning_rate": 6.25e-05,
"loss": 4.41721773147583,
"step": 102
},
{
"distill_ce_loss": 10.828606605529785,
"distill_ce_weight": 0.40976562499999997,
"distill_kd_loss": -0.0213623046875,
"distill_kd_weight": 0.590234375,
"distill_temperature": 1.98203125,
"epoch": 0.40234375,
"grad_norm": 0.000484466552734375,
"learning_rate": 6.209677419354839e-05,
"loss": 4.424556732177734,
"step": 103
},
{
"distill_ce_loss": 10.827896118164062,
"distill_ce_weight": 0.4103515625,
"distill_kd_loss": -0.0205078125,
"distill_kd_weight": 0.5896484375,
"distill_temperature": 1.976953125,
"epoch": 0.40625,
"grad_norm": 0.000240325927734375,
"learning_rate": 6.169354838709678e-05,
"loss": 4.431159496307373,
"step": 104
},
{
"distill_ce_loss": 10.82929801940918,
"distill_ce_weight": 0.41093749999999996,
"distill_kd_loss": -0.018798828125,
"distill_kd_weight": 0.5890625,
"distill_temperature": 1.9718749999999998,
"epoch": 0.41015625,
"grad_norm": 0.016357421875,
"learning_rate": 6.129032258064517e-05,
"loss": 4.439116954803467,
"step": 105
},
{
"distill_ce_loss": 10.8274564743042,
"distill_ce_weight": 0.4115234375,
"distill_kd_loss": -0.0205078125,
"distill_kd_weight": 0.5884765625,
"distill_temperature": 1.966796875,
"epoch": 0.4140625,
"grad_norm": 0.00054168701171875,
"learning_rate": 6.088709677419355e-05,
"loss": 4.443666934967041,
"step": 106
},
{
"distill_ce_loss": 10.824851036071777,
"distill_ce_weight": 0.412109375,
"distill_kd_loss": -0.019775390625,
"distill_kd_weight": 0.587890625,
"distill_temperature": 1.96171875,
"epoch": 0.41796875,
"grad_norm": 0.0004425048828125,
"learning_rate": 6.048387096774194e-05,
"loss": 4.44942569732666,
"step": 107
},
{
"distill_ce_loss": 10.825765609741211,
"distill_ce_weight": 0.41269531249999997,
"distill_kd_loss": -0.02001953125,
"distill_kd_weight": 0.5873046875,
"distill_temperature": 1.956640625,
"epoch": 0.421875,
"grad_norm": 0.00150299072265625,
"learning_rate": 6.0080645161290325e-05,
"loss": 4.455963134765625,
"step": 108
},
{
"distill_ce_loss": 10.827280044555664,
"distill_ce_weight": 0.41328125,
"distill_kd_loss": -0.0196533203125,
"distill_kd_weight": 0.58671875,
"distill_temperature": 1.9515625,
"epoch": 0.42578125,
"grad_norm": 0.00075531005859375,
"learning_rate": 5.9677419354838715e-05,
"loss": 4.463176250457764,
"step": 109
},
{
"distill_ce_loss": 10.828229904174805,
"distill_ce_weight": 0.41386718749999996,
"distill_kd_loss": -0.0191650390625,
"distill_kd_weight": 0.5861328125,
"distill_temperature": 1.9464843749999998,
"epoch": 0.4296875,
"grad_norm": 0.0005645751953125,
"learning_rate": 5.9274193548387104e-05,
"loss": 4.470218658447266,
"step": 110
},
{
"distill_ce_loss": 10.823951721191406,
"distill_ce_weight": 0.414453125,
"distill_kd_loss": -0.0186767578125,
"distill_kd_weight": 0.585546875,
"distill_temperature": 1.94140625,
"epoch": 0.43359375,
"grad_norm": 0.033935546875,
"learning_rate": 5.887096774193549e-05,
"loss": 4.475095272064209,
"step": 111
},
{
"distill_ce_loss": 10.825326919555664,
"distill_ce_weight": 0.4150390625,
"distill_kd_loss": -0.0191650390625,
"distill_kd_weight": 0.5849609375,
"distill_temperature": 1.936328125,
"epoch": 0.4375,
"grad_norm": 0.0002918243408203125,
"learning_rate": 5.8467741935483876e-05,
"loss": 4.481703281402588,
"step": 112
},
{
"distill_ce_loss": 10.831355094909668,
"distill_ce_weight": 0.415625,
"distill_kd_loss": -0.0184326171875,
"distill_kd_weight": 0.584375,
"distill_temperature": 1.93125,
"epoch": 0.44140625,
"grad_norm": 0.0032196044921875,
"learning_rate": 5.8064516129032266e-05,
"loss": 4.491039752960205,
"step": 113
},
{
"distill_ce_loss": 10.82675838470459,
"distill_ce_weight": 0.4162109375,
"distill_kd_loss": -0.018310546875,
"distill_kd_weight": 0.5837890625,
"distill_temperature": 1.926171875,
"epoch": 0.4453125,
"grad_norm": 0.00494384765625,
"learning_rate": 5.7661290322580655e-05,
"loss": 4.495534420013428,
"step": 114
},
{
"distill_ce_loss": 10.830732345581055,
"distill_ce_weight": 0.41679687499999996,
"distill_kd_loss": -0.0174560546875,
"distill_kd_weight": 0.583203125,
"distill_temperature": 1.9210937499999998,
"epoch": 0.44921875,
"grad_norm": 0.01043701171875,
"learning_rate": 5.725806451612904e-05,
"loss": 4.504022598266602,
"step": 115
},
{
"distill_ce_loss": 10.830469131469727,
"distill_ce_weight": 0.4173828125,
"distill_kd_loss": -0.0174560546875,
"distill_kd_weight": 0.5826171875,
"distill_temperature": 1.916015625,
"epoch": 0.453125,
"grad_norm": 0.000820159912109375,
"learning_rate": 5.685483870967743e-05,
"loss": 4.510258674621582,
"step": 116
},
{
"distill_ce_loss": 10.826086044311523,
"distill_ce_weight": 0.41796875,
"distill_kd_loss": -0.0172119140625,
"distill_kd_weight": 0.58203125,
"distill_temperature": 1.9109375,
"epoch": 0.45703125,
"grad_norm": 0.005523681640625,
"learning_rate": 5.645161290322582e-05,
"loss": 4.514955997467041,
"step": 117
},
{
"distill_ce_loss": 10.82542610168457,
"distill_ce_weight": 0.4185546875,
"distill_kd_loss": -0.017578125,
"distill_kd_weight": 0.5814453125,
"distill_temperature": 1.905859375,
"epoch": 0.4609375,
"grad_norm": 0.001007080078125,
"learning_rate": 5.604838709677419e-05,
"loss": 4.520840167999268,
"step": 118
},
{
"distill_ce_loss": 10.828349113464355,
"distill_ce_weight": 0.419140625,
"distill_kd_loss": -0.0164794921875,
"distill_kd_weight": 0.580859375,
"distill_temperature": 1.90078125,
"epoch": 0.46484375,
"grad_norm": 0.001495361328125,
"learning_rate": 5.5645161290322576e-05,
"loss": 4.529018402099609,
"step": 119
},
{
"distill_ce_loss": 10.82563591003418,
"distill_ce_weight": 0.41972656249999996,
"distill_kd_loss": -0.0169677734375,
"distill_kd_weight": 0.5802734375,
"distill_temperature": 1.8957031249999998,
"epoch": 0.46875,
"grad_norm": 0.0025787353515625,
"learning_rate": 5.5241935483870966e-05,
"loss": 4.533980369567871,
"step": 120
},
{
"distill_ce_loss": 10.827801704406738,
"distill_ce_weight": 0.4203125,
"distill_kd_loss": -0.015625,
"distill_kd_weight": 0.5796875,
"distill_temperature": 1.890625,
"epoch": 0.47265625,
"grad_norm": 0.000392913818359375,
"learning_rate": 5.4838709677419355e-05,
"loss": 4.542026996612549,
"step": 121
},
{
"distill_ce_loss": 10.826509475708008,
"distill_ce_weight": 0.4208984375,
"distill_kd_loss": -0.01611328125,
"distill_kd_weight": 0.5791015625,
"distill_temperature": 1.885546875,
"epoch": 0.4765625,
"grad_norm": 0.000789642333984375,
"learning_rate": 5.443548387096774e-05,
"loss": 4.54752254486084,
"step": 122
},
{
"distill_ce_loss": 10.82880687713623,
"distill_ce_weight": 0.421484375,
"distill_kd_loss": -0.0159912109375,
"distill_kd_weight": 0.578515625,
"distill_temperature": 1.88046875,
"epoch": 0.48046875,
"grad_norm": 0.0004596710205078125,
"learning_rate": 5.403225806451613e-05,
"loss": 4.554895401000977,
"step": 123
},
{
"distill_ce_loss": 10.826998710632324,
"distill_ce_weight": 0.4220703125,
"distill_kd_loss": -0.0135498046875,
"distill_kd_weight": 0.5779296875,
"distill_temperature": 1.875390625,
"epoch": 0.484375,
"grad_norm": 0.0016021728515625,
"learning_rate": 5.362903225806452e-05,
"loss": 4.5619425773620605,
"step": 124
},
{
"distill_ce_loss": 10.827878952026367,
"distill_ce_weight": 0.42265624999999996,
"distill_kd_loss": -0.01495361328125,
"distill_kd_weight": 0.57734375,
"distill_temperature": 1.8703124999999998,
"epoch": 0.48828125,
"grad_norm": 0.000545501708984375,
"learning_rate": 5.32258064516129e-05,
"loss": 4.567864418029785,
"step": 125
},
{
"distill_ce_loss": 10.82653522491455,
"distill_ce_weight": 0.4232421875,
"distill_kd_loss": -0.014892578125,
"distill_kd_weight": 0.5767578125,
"distill_temperature": 1.865234375,
"epoch": 0.48828125,
"eval_loss": 4.573975086212158,
"eval_runtime": 12.3534,
"eval_samples_per_second": 5.181,
"eval_steps_per_second": 5.181,
"step": 125
},
{
"distill_ce_loss": 10.823301315307617,
"distill_ce_weight": 0.4232421875,
"distill_kd_loss": -0.014892578125,
"distill_kd_weight": 0.5767578125,
"distill_temperature": 1.865234375,
"epoch": 0.4921875,
"grad_norm": 0.003753662109375,
"learning_rate": 5.282258064516129e-05,
"loss": 4.572271823883057,
"step": 126
},
{
"distill_ce_loss": 10.826366424560547,
"distill_ce_weight": 0.423828125,
"distill_kd_loss": -0.0142822265625,
"distill_kd_weight": 0.576171875,
"distill_temperature": 1.86015625,
"epoch": 0.49609375,
"grad_norm": 0.00138092041015625,
"learning_rate": 5.241935483870968e-05,
"loss": 4.5802788734436035,
"step": 127
},
{
"distill_ce_loss": 10.8292818069458,
"distill_ce_weight": 0.4244140625,
"distill_kd_loss": -0.01397705078125,
"distill_kd_weight": 0.5755859375,
"distill_temperature": 1.855078125,
"epoch": 0.5,
"grad_norm": 0.00010061264038085938,
"learning_rate": 5.201612903225807e-05,
"loss": 4.588042736053467,
"step": 128
},
{
"distill_ce_loss": 10.823813438415527,
"distill_ce_weight": 0.425,
"distill_kd_loss": -0.01318359375,
"distill_kd_weight": 0.575,
"distill_temperature": 1.85,
"epoch": 0.50390625,
"grad_norm": 0.00028228759765625,
"learning_rate": 5.161290322580645e-05,
"loss": 4.592552661895752,
"step": 129
},
{
"distill_ce_loss": 10.830474853515625,
"distill_ce_weight": 0.42558593749999996,
"distill_kd_loss": -0.01300048828125,
"distill_kd_weight": 0.5744140625,
"distill_temperature": 1.8449218749999998,
"epoch": 0.5078125,
"grad_norm": 0.00055694580078125,
"learning_rate": 5.120967741935484e-05,
"loss": 4.601820945739746,
"step": 130
},
{
"distill_ce_loss": 10.827618598937988,
"distill_ce_weight": 0.426171875,
"distill_kd_loss": -0.0125732421875,
"distill_kd_weight": 0.573828125,
"distill_temperature": 1.83984375,
"epoch": 0.51171875,
"grad_norm": 0.0162353515625,
"learning_rate": 5.080645161290323e-05,
"loss": 4.607224464416504,
"step": 131
},
{
"distill_ce_loss": 10.828215599060059,
"distill_ce_weight": 0.4267578125,
"distill_kd_loss": -0.01190185546875,
"distill_kd_weight": 0.5732421875,
"distill_temperature": 1.834765625,
"epoch": 0.515625,
"grad_norm": 0.000640869140625,
"learning_rate": 5.040322580645161e-05,
"loss": 4.614189624786377,
"step": 132
},
{
"distill_ce_loss": 10.828178405761719,
"distill_ce_weight": 0.42734375,
"distill_kd_loss": -0.01080322265625,
"distill_kd_weight": 0.57265625,
"distill_temperature": 1.8296875,
"epoch": 0.51953125,
"grad_norm": 0.014892578125,
"learning_rate": 5e-05,
"loss": 4.621159553527832,
"step": 133
},
{
"distill_ce_loss": 10.828635215759277,
"distill_ce_weight": 0.4279296875,
"distill_kd_loss": -0.01104736328125,
"distill_kd_weight": 0.5720703125,
"distill_temperature": 1.824609375,
"epoch": 0.5234375,
"grad_norm": 0.000926971435546875,
"learning_rate": 4.959677419354839e-05,
"loss": 4.627577304840088,
"step": 134
},
{
"distill_ce_loss": 10.828261375427246,
"distill_ce_weight": 0.42851562499999996,
"distill_kd_loss": -0.0107421875,
"distill_kd_weight": 0.571484375,
"distill_temperature": 1.8195312499999998,
"epoch": 0.52734375,
"grad_norm": 0.00141143798828125,
"learning_rate": 4.9193548387096775e-05,
"loss": 4.633944988250732,
"step": 135
},
{
"distill_ce_loss": 10.830928802490234,
"distill_ce_weight": 0.4291015625,
"distill_kd_loss": -0.01116943359375,
"distill_kd_weight": 0.5708984375,
"distill_temperature": 1.814453125,
"epoch": 0.53125,
"grad_norm": 0.00958251953125,
"learning_rate": 4.8790322580645164e-05,
"loss": 4.641190052032471,
"step": 136
},
{
"distill_ce_loss": 10.82923412322998,
"distill_ce_weight": 0.4296875,
"distill_kd_loss": -0.00958251953125,
"distill_kd_weight": 0.5703125,
"distill_temperature": 1.809375,
"epoch": 0.53515625,
"grad_norm": 0.0004596710205078125,
"learning_rate": 4.8387096774193554e-05,
"loss": 4.64772367477417,
"step": 137
},
{
"distill_ce_loss": 10.826897621154785,
"distill_ce_weight": 0.4302734375,
"distill_kd_loss": -0.009521484375,
"distill_kd_weight": 0.5697265625,
"distill_temperature": 1.804296875,
"epoch": 0.5390625,
"grad_norm": 0.00054931640625,
"learning_rate": 4.7983870967741937e-05,
"loss": 4.653094291687012,
"step": 138
},
{
"distill_ce_loss": 10.824013710021973,
"distill_ce_weight": 0.430859375,
"distill_kd_loss": -0.0086669921875,
"distill_kd_weight": 0.569140625,
"distill_temperature": 1.79921875,
"epoch": 0.54296875,
"grad_norm": 0.000835418701171875,
"learning_rate": 4.7580645161290326e-05,
"loss": 4.658684253692627,
"step": 139
},
{
"distill_ce_loss": 10.827019691467285,
"distill_ce_weight": 0.43144531249999996,
"distill_kd_loss": -0.008544921875,
"distill_kd_weight": 0.5685546875,
"distill_temperature": 1.7941406249999998,
"epoch": 0.546875,
"grad_norm": 0.0004291534423828125,
"learning_rate": 4.7177419354838716e-05,
"loss": 4.666414260864258,
"step": 140
},
{
"distill_ce_loss": 10.82970142364502,
"distill_ce_weight": 0.43203125,
"distill_kd_loss": -0.0089111328125,
"distill_kd_weight": 0.56796875,
"distill_temperature": 1.7890625,
"epoch": 0.55078125,
"grad_norm": 0.0013275146484375,
"learning_rate": 4.67741935483871e-05,
"loss": 4.673703670501709,
"step": 141
},
{
"distill_ce_loss": 10.828492164611816,
"distill_ce_weight": 0.4326171875,
"distill_kd_loss": -0.00762939453125,
"distill_kd_weight": 0.5673828125,
"distill_temperature": 1.783984375,
"epoch": 0.5546875,
"grad_norm": 0.0255126953125,
"learning_rate": 4.637096774193548e-05,
"loss": 4.680258274078369,
"step": 142
},
{
"distill_ce_loss": 10.826722145080566,
"distill_ce_weight": 0.433203125,
"distill_kd_loss": -0.007781982421875,
"distill_kd_weight": 0.566796875,
"distill_temperature": 1.77890625,
"epoch": 0.55859375,
"grad_norm": 0.000545501708984375,
"learning_rate": 4.596774193548387e-05,
"loss": 4.685744762420654,
"step": 143
},
{
"distill_ce_loss": 10.829012870788574,
"distill_ce_weight": 0.4337890625,
"distill_kd_loss": -0.006591796875,
"distill_kd_weight": 0.5662109375,
"distill_temperature": 1.773828125,
"epoch": 0.5625,
"grad_norm": 0.0024566650390625,
"learning_rate": 4.556451612903226e-05,
"loss": 4.6937689781188965,
"step": 144
},
{
"distill_ce_loss": 10.826948165893555,
"distill_ce_weight": 0.43437499999999996,
"distill_kd_loss": -0.00634765625,
"distill_kd_weight": 0.565625,
"distill_temperature": 1.7687499999999998,
"epoch": 0.56640625,
"grad_norm": 0.000736236572265625,
"learning_rate": 4.516129032258064e-05,
"loss": 4.699369430541992,
"step": 145
},
{
"distill_ce_loss": 10.826860427856445,
"distill_ce_weight": 0.4349609375,
"distill_kd_loss": -0.0057373046875,
"distill_kd_weight": 0.5650390625,
"distill_temperature": 1.763671875,
"epoch": 0.5703125,
"grad_norm": 0.00116729736328125,
"learning_rate": 4.475806451612903e-05,
"loss": 4.706026554107666,
"step": 146
},
{
"distill_ce_loss": 10.827485084533691,
"distill_ce_weight": 0.435546875,
"distill_kd_loss": -0.00482177734375,
"distill_kd_weight": 0.564453125,
"distill_temperature": 1.75859375,
"epoch": 0.57421875,
"grad_norm": 0.004547119140625,
"learning_rate": 4.435483870967742e-05,
"loss": 4.713160991668701,
"step": 147
},
{
"distill_ce_loss": 10.83022403717041,
"distill_ce_weight": 0.4361328125,
"distill_kd_loss": -0.00531005859375,
"distill_kd_weight": 0.5638671875,
"distill_temperature": 1.753515625,
"epoch": 0.578125,
"grad_norm": 0.00909423828125,
"learning_rate": 4.395161290322581e-05,
"loss": 4.720425605773926,
"step": 148
},
{
"distill_ce_loss": 10.825448989868164,
"distill_ce_weight": 0.43671875,
"distill_kd_loss": -0.004119873046875,
"distill_kd_weight": 0.56328125,
"distill_temperature": 1.7484375,
"epoch": 0.58203125,
"grad_norm": 0.01239013671875,
"learning_rate": 4.3548387096774194e-05,
"loss": 4.725357532501221,
"step": 149
},
{
"distill_ce_loss": 10.825879096984863,
"distill_ce_weight": 0.4373046875,
"distill_kd_loss": -0.0040283203125,
"distill_kd_weight": 0.5626953125,
"distill_temperature": 1.7433593749999998,
"epoch": 0.5859375,
"grad_norm": 0.0004177093505859375,
"learning_rate": 4.3145161290322584e-05,
"loss": 4.731934070587158,
"step": 150
},
{
"distill_ce_loss": 10.826534271240234,
"distill_ce_weight": 0.437890625,
"distill_kd_loss": -0.0031280517578125,
"distill_kd_weight": 0.562109375,
"distill_temperature": 1.73828125,
"epoch": 0.5859375,
"eval_loss": 4.739337921142578,
"eval_runtime": 14.2325,
"eval_samples_per_second": 4.497,
"eval_steps_per_second": 4.497,
"step": 150
},
{
"distill_ce_loss": 10.829754829406738,
"distill_ce_weight": 0.437890625,
"distill_kd_loss": -0.00311279296875,
"distill_kd_weight": 0.562109375,
"distill_temperature": 1.73828125,
"epoch": 0.58984375,
"grad_norm": 0.00042724609375,
"learning_rate": 4.2741935483870973e-05,
"loss": 4.7405009269714355,
"step": 151
},
{
"distill_ce_loss": 10.823193550109863,
"distill_ce_weight": 0.4384765625,
"distill_kd_loss": -0.0029144287109375,
"distill_kd_weight": 0.5615234375,
"distill_temperature": 1.733203125,
"epoch": 0.59375,
"grad_norm": 0.0010528564453125,
"learning_rate": 4.2338709677419356e-05,
"loss": 4.744076251983643,
"step": 152
},
{
"distill_ce_loss": 10.824686050415039,
"distill_ce_weight": 0.4390625,
"distill_kd_loss": -0.0017852783203125,
"distill_kd_weight": 0.5609375,
"distill_temperature": 1.728125,
"epoch": 0.59765625,
"grad_norm": 0.00069427490234375,
"learning_rate": 4.1935483870967746e-05,
"loss": 4.75171422958374,
"step": 153
},
{
"distill_ce_loss": 10.827430725097656,
"distill_ce_weight": 0.4396484375,
"distill_kd_loss": -0.00174713134765625,
"distill_kd_weight": 0.5603515625,
"distill_temperature": 1.723046875,
"epoch": 0.6015625,
"grad_norm": 0.000179290771484375,
"learning_rate": 4.1532258064516135e-05,
"loss": 4.759286403656006,
"step": 154
},
{
"distill_ce_loss": 10.828149795532227,
"distill_ce_weight": 0.440234375,
"distill_kd_loss": -0.00072479248046875,
"distill_kd_weight": 0.559765625,
"distill_temperature": 1.7179687499999998,
"epoch": 0.60546875,
"grad_norm": 0.000392913818359375,
"learning_rate": 4.112903225806452e-05,
"loss": 4.766517162322998,
"step": 155
},
{
"distill_ce_loss": 10.82726001739502,
"distill_ce_weight": 0.4408203125,
"distill_kd_loss": -0.000560760498046875,
"distill_kd_weight": 0.5591796875,
"distill_temperature": 1.712890625,
"epoch": 0.609375,
"grad_norm": 0.0005035400390625,
"learning_rate": 4.072580645161291e-05,
"loss": 4.772563457489014,
"step": 156
},
{
"distill_ce_loss": 10.829056739807129,
"distill_ce_weight": 0.44140625,
"distill_kd_loss": -0.0004673004150390625,
"distill_kd_weight": 0.55859375,
"distill_temperature": 1.7078125,
"epoch": 0.61328125,
"grad_norm": 0.00738525390625,
"learning_rate": 4.032258064516129e-05,
"loss": 4.779752254486084,
"step": 157
},
{
"distill_ce_loss": 10.826592445373535,
"distill_ce_weight": 0.4419921875,
"distill_kd_loss": 0.00078582763671875,
"distill_kd_weight": 0.5580078125,
"distill_temperature": 1.702734375,
"epoch": 0.6171875,
"grad_norm": 0.000579833984375,
"learning_rate": 3.991935483870968e-05,
"loss": 4.785707950592041,
"step": 158
},
{
"distill_ce_loss": 10.825506210327148,
"distill_ce_weight": 0.442578125,
"distill_kd_loss": 0.000885009765625,
"distill_kd_weight": 0.557421875,
"distill_temperature": 1.69765625,
"epoch": 0.62109375,
"grad_norm": 0.0031890869140625,
"learning_rate": 3.951612903225806e-05,
"loss": 4.791624546051025,
"step": 159
},
{
"distill_ce_loss": 10.829129219055176,
"distill_ce_weight": 0.4431640625,
"distill_kd_loss": 0.0016937255859375,
"distill_kd_weight": 0.5568359375,
"distill_temperature": 1.6925781249999998,
"epoch": 0.625,
"grad_norm": 0.00151824951171875,
"learning_rate": 3.911290322580645e-05,
"loss": 4.800023078918457,
"step": 160
},
{
"distill_ce_loss": 10.828397750854492,
"distill_ce_weight": 0.44375,
"distill_kd_loss": 0.001953125,
"distill_kd_weight": 0.55625,
"distill_temperature": 1.6875,
"epoch": 0.62890625,
"grad_norm": 0.0003986358642578125,
"learning_rate": 3.870967741935484e-05,
"loss": 4.806184768676758,
"step": 161
},
{
"distill_ce_loss": 10.826486587524414,
"distill_ce_weight": 0.4443359375,
"distill_kd_loss": 0.00311279296875,
"distill_kd_weight": 0.5556640625,
"distill_temperature": 1.682421875,
"epoch": 0.6328125,
"grad_norm": 0.0008087158203125,
"learning_rate": 3.8306451612903224e-05,
"loss": 4.812328815460205,
"step": 162
},
{
"distill_ce_loss": 10.826553344726562,
"distill_ce_weight": 0.444921875,
"distill_kd_loss": 0.003265380859375,
"distill_kd_weight": 0.555078125,
"distill_temperature": 1.67734375,
"epoch": 0.63671875,
"grad_norm": 0.000675201416015625,
"learning_rate": 3.7903225806451614e-05,
"loss": 4.818786144256592,
"step": 163
},
{
"distill_ce_loss": 10.82434368133545,
"distill_ce_weight": 0.4455078125,
"distill_kd_loss": 0.00457763671875,
"distill_kd_weight": 0.5544921875,
"distill_temperature": 1.672265625,
"epoch": 0.640625,
"grad_norm": 0.0003795623779296875,
"learning_rate": 3.7500000000000003e-05,
"loss": 4.824862957000732,
"step": 164
},
{
"distill_ce_loss": 10.830697059631348,
"distill_ce_weight": 0.44609375,
"distill_kd_loss": 0.004638671875,
"distill_kd_weight": 0.55390625,
"distill_temperature": 1.6671874999999998,
"epoch": 0.64453125,
"grad_norm": 0.00185394287109375,
"learning_rate": 3.7096774193548386e-05,
"loss": 4.834069728851318,
"step": 165
},
{
"distill_ce_loss": 10.830793380737305,
"distill_ce_weight": 0.4466796875,
"distill_kd_loss": 0.005645751953125,
"distill_kd_weight": 0.5533203125,
"distill_temperature": 1.662109375,
"epoch": 0.6484375,
"grad_norm": 0.0081787109375,
"learning_rate": 3.6693548387096776e-05,
"loss": 4.8410234451293945,
"step": 166
},
{
"distill_ce_loss": 10.82587718963623,
"distill_ce_weight": 0.447265625,
"distill_kd_loss": 0.0054931640625,
"distill_kd_weight": 0.552734375,
"distill_temperature": 1.65703125,
"epoch": 0.65234375,
"grad_norm": 0.0010528564453125,
"learning_rate": 3.6290322580645165e-05,
"loss": 4.84507942199707,
"step": 167
},
{
"distill_ce_loss": 10.825627326965332,
"distill_ce_weight": 0.4478515625,
"distill_kd_loss": 0.0068359375,
"distill_kd_weight": 0.5521484375,
"distill_temperature": 1.651953125,
"epoch": 0.65625,
"grad_norm": 0.0022430419921875,
"learning_rate": 3.5887096774193555e-05,
"loss": 4.852043151855469,
"step": 168
},
{
"distill_ce_loss": 10.826099395751953,
"distill_ce_weight": 0.4484375,
"distill_kd_loss": 0.0068359375,
"distill_kd_weight": 0.5515625,
"distill_temperature": 1.646875,
"epoch": 0.66015625,
"grad_norm": 0.0025634765625,
"learning_rate": 3.548387096774194e-05,
"loss": 4.858598232269287,
"step": 169
},
{
"distill_ce_loss": 10.825976371765137,
"distill_ce_weight": 0.4490234375,
"distill_kd_loss": 0.00811767578125,
"distill_kd_weight": 0.5509765625,
"distill_temperature": 1.6417968749999998,
"epoch": 0.6640625,
"grad_norm": 0.0033111572265625,
"learning_rate": 3.508064516129033e-05,
"loss": 4.865602970123291,
"step": 170
},
{
"distill_ce_loss": 10.824938774108887,
"distill_ce_weight": 0.449609375,
"distill_kd_loss": 0.00848388671875,
"distill_kd_weight": 0.550390625,
"distill_temperature": 1.63671875,
"epoch": 0.66796875,
"grad_norm": 0.0005950927734375,
"learning_rate": 3.467741935483872e-05,
"loss": 4.8716630935668945,
"step": 171
},
{
"distill_ce_loss": 10.831235885620117,
"distill_ce_weight": 0.4501953125,
"distill_kd_loss": 0.0098876953125,
"distill_kd_weight": 0.5498046875,
"distill_temperature": 1.631640625,
"epoch": 0.671875,
"grad_norm": 0.0186767578125,
"learning_rate": 3.427419354838709e-05,
"loss": 4.881603717803955,
"step": 172
},
{
"distill_ce_loss": 10.82729721069336,
"distill_ce_weight": 0.45078125,
"distill_kd_loss": 0.00982666015625,
"distill_kd_weight": 0.54921875,
"distill_temperature": 1.6265625,
"epoch": 0.67578125,
"grad_norm": 0.0003757476806640625,
"learning_rate": 3.387096774193548e-05,
"loss": 4.886144161224365,
"step": 173
},
{
"distill_ce_loss": 10.82698917388916,
"distill_ce_weight": 0.4513671875,
"distill_kd_loss": 0.01019287109375,
"distill_kd_weight": 0.5486328125,
"distill_temperature": 1.621484375,
"epoch": 0.6796875,
"grad_norm": 0.0003185272216796875,
"learning_rate": 3.346774193548387e-05,
"loss": 4.8925323486328125,
"step": 174
},
{
"distill_ce_loss": 10.829041481018066,
"distill_ce_weight": 0.451953125,
"distill_kd_loss": 0.01068115234375,
"distill_kd_weight": 0.548046875,
"distill_temperature": 1.6164062499999998,
"epoch": 0.68359375,
"grad_norm": 0.0005950927734375,
"learning_rate": 3.306451612903226e-05,
"loss": 4.900078296661377,
"step": 175
},
{
"distill_ce_loss": 10.826534271240234,
"distill_ce_weight": 0.4525390625,
"distill_kd_loss": 0.01141357421875,
"distill_kd_weight": 0.5474609375,
"distill_temperature": 1.611328125,
"epoch": 0.68359375,
"eval_loss": 4.90593957901001,
"eval_runtime": 13.6074,
"eval_samples_per_second": 4.703,
"eval_steps_per_second": 4.703,
"step": 175
},
{
"distill_ce_loss": 10.825757026672363,
"distill_ce_weight": 0.4525390625,
"distill_kd_loss": 0.01123046875,
"distill_kd_weight": 0.5474609375,
"distill_temperature": 1.611328125,
"epoch": 0.6875,
"grad_norm": 0.002685546875,
"learning_rate": 3.2661290322580644e-05,
"loss": 4.905211925506592,
"step": 176
},
{
"distill_ce_loss": 10.827515602111816,
"distill_ce_weight": 0.453125,
"distill_kd_loss": 0.01202392578125,
"distill_kd_weight": 0.546875,
"distill_temperature": 1.60625,
"epoch": 0.69140625,
"grad_norm": 0.0003662109375,
"learning_rate": 3.2258064516129034e-05,
"loss": 4.912779331207275,
"step": 177
},
{
"distill_ce_loss": 10.826964378356934,
"distill_ce_weight": 0.4537109375,
"distill_kd_loss": 0.012939453125,
"distill_kd_weight": 0.5462890625,
"distill_temperature": 1.601171875,
"epoch": 0.6953125,
"grad_norm": 0.00616455078125,
"learning_rate": 3.185483870967742e-05,
"loss": 4.919392108917236,
"step": 178
},
{
"distill_ce_loss": 10.825935363769531,
"distill_ce_weight": 0.454296875,
"distill_kd_loss": 0.0128173828125,
"distill_kd_weight": 0.545703125,
"distill_temperature": 1.59609375,
"epoch": 0.69921875,
"grad_norm": 0.000652313232421875,
"learning_rate": 3.1451612903225806e-05,
"loss": 4.925177097320557,
"step": 179
},
{
"distill_ce_loss": 10.826337814331055,
"distill_ce_weight": 0.4548828125,
"distill_kd_loss": 0.013916015625,
"distill_kd_weight": 0.5451171875,
"distill_temperature": 1.5910156249999998,
"epoch": 0.703125,
"grad_norm": 0.000934600830078125,
"learning_rate": 3.1048387096774195e-05,
"loss": 4.932313919067383,
"step": 180
},
{
"distill_ce_loss": 10.827921867370605,
"distill_ce_weight": 0.45546875,
"distill_kd_loss": 0.013916015625,
"distill_kd_weight": 0.54453125,
"distill_temperature": 1.5859375,
"epoch": 0.70703125,
"grad_norm": 0.000762939453125,
"learning_rate": 3.0645161290322585e-05,
"loss": 4.939348220825195,
"step": 181
},
{
"distill_ce_loss": 10.8247709274292,
"distill_ce_weight": 0.4560546875,
"distill_kd_loss": 0.0150146484375,
"distill_kd_weight": 0.5439453125,
"distill_temperature": 1.580859375,
"epoch": 0.7109375,
"grad_norm": 0.00848388671875,
"learning_rate": 3.024193548387097e-05,
"loss": 4.944866180419922,
"step": 182
},
{
"distill_ce_loss": 10.825705528259277,
"distill_ce_weight": 0.456640625,
"distill_kd_loss": 0.01495361328125,
"distill_kd_weight": 0.543359375,
"distill_temperature": 1.57578125,
"epoch": 0.71484375,
"grad_norm": 0.00201416015625,
"learning_rate": 2.9838709677419357e-05,
"loss": 4.951574802398682,
"step": 183
},
{
"distill_ce_loss": 10.822872161865234,
"distill_ce_weight": 0.4572265625,
"distill_kd_loss": 0.0166015625,
"distill_kd_weight": 0.5427734375,
"distill_temperature": 1.570703125,
"epoch": 0.71875,
"grad_norm": 0.00110626220703125,
"learning_rate": 2.9435483870967743e-05,
"loss": 4.95753812789917,
"step": 184
},
{
"distill_ce_loss": 10.825102806091309,
"distill_ce_weight": 0.4578125,
"distill_kd_loss": 0.016357421875,
"distill_kd_weight": 0.5421875,
"distill_temperature": 1.5656249999999998,
"epoch": 0.72265625,
"grad_norm": 0.01507568359375,
"learning_rate": 2.9032258064516133e-05,
"loss": 4.964717388153076,
"step": 185
},
{
"distill_ce_loss": 10.828420639038086,
"distill_ce_weight": 0.4583984375,
"distill_kd_loss": 0.0179443359375,
"distill_kd_weight": 0.5416015625,
"distill_temperature": 1.560546875,
"epoch": 0.7265625,
"grad_norm": 0.00130462646484375,
"learning_rate": 2.862903225806452e-05,
"loss": 4.973435401916504,
"step": 186
},
{
"distill_ce_loss": 10.829689979553223,
"distill_ce_weight": 0.458984375,
"distill_kd_loss": 0.01806640625,
"distill_kd_weight": 0.541015625,
"distill_temperature": 1.55546875,
"epoch": 0.73046875,
"grad_norm": 0.0002574920654296875,
"learning_rate": 2.822580645161291e-05,
"loss": 4.980423927307129,
"step": 187
},
{
"distill_ce_loss": 10.828069686889648,
"distill_ce_weight": 0.4595703125,
"distill_kd_loss": 0.0194091796875,
"distill_kd_weight": 0.5404296875,
"distill_temperature": 1.550390625,
"epoch": 0.734375,
"grad_norm": 0.01904296875,
"learning_rate": 2.7822580645161288e-05,
"loss": 4.986757278442383,
"step": 188
},
{
"distill_ce_loss": 10.82535457611084,
"distill_ce_weight": 0.46015625,
"distill_kd_loss": 0.0196533203125,
"distill_kd_weight": 0.53984375,
"distill_temperature": 1.5453125,
"epoch": 0.73828125,
"grad_norm": 0.00075531005859375,
"learning_rate": 2.7419354838709678e-05,
"loss": 4.991974830627441,
"step": 189
},
{
"distill_ce_loss": 10.818794250488281,
"distill_ce_weight": 0.4607421875,
"distill_kd_loss": 0.0196533203125,
"distill_kd_weight": 0.5392578125,
"distill_temperature": 1.5402343749999998,
"epoch": 0.7421875,
"grad_norm": 0.0062255859375,
"learning_rate": 2.7016129032258064e-05,
"loss": 4.99529504776001,
"step": 190
},
{
"distill_ce_loss": 10.827811241149902,
"distill_ce_weight": 0.461328125,
"distill_kd_loss": 0.0216064453125,
"distill_kd_weight": 0.538671875,
"distill_temperature": 1.53515625,
"epoch": 0.74609375,
"grad_norm": 0.018310546875,
"learning_rate": 2.661290322580645e-05,
"loss": 5.006831645965576,
"step": 191
},
{
"distill_ce_loss": 10.825983047485352,
"distill_ce_weight": 0.4619140625,
"distill_kd_loss": 0.02099609375,
"distill_kd_weight": 0.5380859375,
"distill_temperature": 1.530078125,
"epoch": 0.75,
"grad_norm": 0.00162506103515625,
"learning_rate": 2.620967741935484e-05,
"loss": 5.011965274810791,
"step": 192
},
{
"distill_ce_loss": 10.828679084777832,
"distill_ce_weight": 0.4625,
"distill_kd_loss": 0.0223388671875,
"distill_kd_weight": 0.5375,
"distill_temperature": 1.525,
"epoch": 0.75390625,
"grad_norm": 0.00045013427734375,
"learning_rate": 2.5806451612903226e-05,
"loss": 5.020287990570068,
"step": 193
},
{
"distill_ce_loss": 10.829313278198242,
"distill_ce_weight": 0.4630859375,
"distill_kd_loss": 0.0223388671875,
"distill_kd_weight": 0.5369140625,
"distill_temperature": 1.519921875,
"epoch": 0.7578125,
"grad_norm": 0.004302978515625,
"learning_rate": 2.5403225806451615e-05,
"loss": 5.026926517486572,
"step": 194
},
{
"distill_ce_loss": 10.825671195983887,
"distill_ce_weight": 0.463671875,
"distill_kd_loss": 0.0230712890625,
"distill_kd_weight": 0.536328125,
"distill_temperature": 1.5148437499999998,
"epoch": 0.76171875,
"grad_norm": 0.0091552734375,
"learning_rate": 2.5e-05,
"loss": 5.031949043273926,
"step": 195
},
{
"distill_ce_loss": 10.828692436218262,
"distill_ce_weight": 0.4642578125,
"distill_kd_loss": 0.0242919921875,
"distill_kd_weight": 0.5357421875,
"distill_temperature": 1.509765625,
"epoch": 0.765625,
"grad_norm": 0.0004253387451171875,
"learning_rate": 2.4596774193548387e-05,
"loss": 5.0403056144714355,
"step": 196
},
{
"distill_ce_loss": 10.827781677246094,
"distill_ce_weight": 0.46484375,
"distill_kd_loss": 0.024658203125,
"distill_kd_weight": 0.53515625,
"distill_temperature": 1.5046875,
"epoch": 0.76953125,
"grad_norm": 0.015625,
"learning_rate": 2.4193548387096777e-05,
"loss": 5.046410083770752,
"step": 197
},
{
"distill_ce_loss": 10.828465461730957,
"distill_ce_weight": 0.4654296875,
"distill_kd_loss": 0.025390625,
"distill_kd_weight": 0.5345703125,
"distill_temperature": 1.499609375,
"epoch": 0.7734375,
"grad_norm": 0.000553131103515625,
"learning_rate": 2.3790322580645163e-05,
"loss": 5.053439140319824,
"step": 198
},
{
"distill_ce_loss": 10.828309059143066,
"distill_ce_weight": 0.466015625,
"distill_kd_loss": 0.0250244140625,
"distill_kd_weight": 0.533984375,
"distill_temperature": 1.4945312499999999,
"epoch": 0.77734375,
"grad_norm": 0.0012054443359375,
"learning_rate": 2.338709677419355e-05,
"loss": 5.05952787399292,
"step": 199
},
{
"distill_ce_loss": 10.825377464294434,
"distill_ce_weight": 0.4666015625,
"distill_kd_loss": 0.026123046875,
"distill_kd_weight": 0.5333984375,
"distill_temperature": 1.489453125,
"epoch": 0.78125,
"grad_norm": 0.00109100341796875,
"learning_rate": 2.2983870967741935e-05,
"loss": 5.065053939819336,
"step": 200
},
{
"distill_ce_loss": 10.826531410217285,
"distill_ce_weight": 0.4671875,
"distill_kd_loss": 0.0274658203125,
"distill_kd_weight": 0.5328125,
"distill_temperature": 1.484375,
"epoch": 0.78125,
"eval_loss": 5.072885513305664,
"eval_runtime": 15.1986,
"eval_samples_per_second": 4.211,
"eval_steps_per_second": 4.211,
"step": 200
},
{
"distill_ce_loss": 10.828163146972656,
"distill_ce_weight": 0.4671875,
"distill_kd_loss": 0.027587890625,
"distill_kd_weight": 0.5328125,
"distill_temperature": 1.484375,
"epoch": 0.78515625,
"grad_norm": 0.003814697265625,
"learning_rate": 2.258064516129032e-05,
"loss": 5.073492050170898,
"step": 201
},
{
"distill_ce_loss": 10.826192855834961,
"distill_ce_weight": 0.4677734375,
"distill_kd_loss": 0.02734375,
"distill_kd_weight": 0.5322265625,
"distill_temperature": 1.479296875,
"epoch": 0.7890625,
"grad_norm": 0.00019931793212890625,
"learning_rate": 2.217741935483871e-05,
"loss": 5.078732013702393,
"step": 202
},
{
"distill_ce_loss": 10.825226783752441,
"distill_ce_weight": 0.468359375,
"distill_kd_loss": 0.028076171875,
"distill_kd_weight": 0.531640625,
"distill_temperature": 1.47421875,
"epoch": 0.79296875,
"grad_norm": 0.000347137451171875,
"learning_rate": 2.1774193548387097e-05,
"loss": 5.085050106048584,
"step": 203
},
{
"distill_ce_loss": 10.82629680633545,
"distill_ce_weight": 0.4689453125,
"distill_kd_loss": 0.0281982421875,
"distill_kd_weight": 0.5310546875,
"distill_temperature": 1.4691406249999999,
"epoch": 0.796875,
"grad_norm": 0.007110595703125,
"learning_rate": 2.1370967741935487e-05,
"loss": 5.09189510345459,
"step": 204
},
{
"distill_ce_loss": 10.826847076416016,
"distill_ce_weight": 0.46953125,
"distill_kd_loss": 0.029541015625,
"distill_kd_weight": 0.53046875,
"distill_temperature": 1.4640625,
"epoch": 0.80078125,
"grad_norm": 0.00113677978515625,
"learning_rate": 2.0967741935483873e-05,
"loss": 5.099167823791504,
"step": 205
},
{
"distill_ce_loss": 10.829412460327148,
"distill_ce_weight": 0.4701171875,
"distill_kd_loss": 0.030517578125,
"distill_kd_weight": 0.5298828125,
"distill_temperature": 1.458984375,
"epoch": 0.8046875,
"grad_norm": 0.000423431396484375,
"learning_rate": 2.056451612903226e-05,
"loss": 5.107206344604492,
"step": 206
},
{
"distill_ce_loss": 10.826083183288574,
"distill_ce_weight": 0.470703125,
"distill_kd_loss": 0.0301513671875,
"distill_kd_weight": 0.529296875,
"distill_temperature": 1.45390625,
"epoch": 0.80859375,
"grad_norm": 0.0023040771484375,
"learning_rate": 2.0161290322580645e-05,
"loss": 5.1118621826171875,
"step": 207
},
{
"distill_ce_loss": 10.826690673828125,
"distill_ce_weight": 0.4712890625,
"distill_kd_loss": 0.03076171875,
"distill_kd_weight": 0.5287109375,
"distill_temperature": 1.448828125,
"epoch": 0.8125,
"grad_norm": 0.005859375,
"learning_rate": 1.975806451612903e-05,
"loss": 5.118736267089844,
"step": 208
},
{
"distill_ce_loss": 10.82703685760498,
"distill_ce_weight": 0.471875,
"distill_kd_loss": 0.0308837890625,
"distill_kd_weight": 0.528125,
"distill_temperature": 1.4437499999999999,
"epoch": 0.81640625,
"grad_norm": 0.00032806396484375,
"learning_rate": 1.935483870967742e-05,
"loss": 5.125365734100342,
"step": 209
},
{
"distill_ce_loss": 10.826313972473145,
"distill_ce_weight": 0.4724609375,
"distill_kd_loss": 0.031494140625,
"distill_kd_weight": 0.5275390625,
"distill_temperature": 1.438671875,
"epoch": 0.8203125,
"grad_norm": 0.00182342529296875,
"learning_rate": 1.8951612903225807e-05,
"loss": 5.1316118240356445,
"step": 210
},
{
"distill_ce_loss": 10.829218864440918,
"distill_ce_weight": 0.473046875,
"distill_kd_loss": 0.03271484375,
"distill_kd_weight": 0.526953125,
"distill_temperature": 1.43359375,
"epoch": 0.82421875,
"grad_norm": 0.00592041015625,
"learning_rate": 1.8548387096774193e-05,
"loss": 5.139939785003662,
"step": 211
},
{
"distill_ce_loss": 10.826104164123535,
"distill_ce_weight": 0.4736328125,
"distill_kd_loss": 0.03271484375,
"distill_kd_weight": 0.5263671875,
"distill_temperature": 1.428515625,
"epoch": 0.828125,
"grad_norm": 0.0030364990234375,
"learning_rate": 1.8145161290322583e-05,
"loss": 5.144810199737549,
"step": 212
},
{
"distill_ce_loss": 10.83273696899414,
"distill_ce_weight": 0.47421875,
"distill_kd_loss": 0.033935546875,
"distill_kd_weight": 0.52578125,
"distill_temperature": 1.4234375,
"epoch": 0.83203125,
"grad_norm": 0.000957489013671875,
"learning_rate": 1.774193548387097e-05,
"loss": 5.154909133911133,
"step": 213
},
{
"distill_ce_loss": 10.829684257507324,
"distill_ce_weight": 0.4748046875,
"distill_kd_loss": 0.03466796875,
"distill_kd_weight": 0.5251953125,
"distill_temperature": 1.4183593749999999,
"epoch": 0.8359375,
"grad_norm": 0.0009765625,
"learning_rate": 1.733870967741936e-05,
"loss": 5.160173416137695,
"step": 214
},
{
"distill_ce_loss": 10.829045295715332,
"distill_ce_weight": 0.47539062499999996,
"distill_kd_loss": 0.03466796875,
"distill_kd_weight": 0.524609375,
"distill_temperature": 1.41328125,
"epoch": 0.83984375,
"grad_norm": 0.0004596710205078125,
"learning_rate": 1.693548387096774e-05,
"loss": 5.166214942932129,
"step": 215
},
{
"distill_ce_loss": 10.825078964233398,
"distill_ce_weight": 0.4759765625,
"distill_kd_loss": 0.035400390625,
"distill_kd_weight": 0.5240234375,
"distill_temperature": 1.408203125,
"epoch": 0.84375,
"grad_norm": 0.0006256103515625,
"learning_rate": 1.653225806451613e-05,
"loss": 5.171038627624512,
"step": 216
},
{
"distill_ce_loss": 10.829195976257324,
"distill_ce_weight": 0.4765625,
"distill_kd_loss": 0.0361328125,
"distill_kd_weight": 0.5234375,
"distill_temperature": 1.403125,
"epoch": 0.84765625,
"grad_norm": 0.000316619873046875,
"learning_rate": 1.6129032258064517e-05,
"loss": 5.179709434509277,
"step": 217
},
{
"distill_ce_loss": 10.827247619628906,
"distill_ce_weight": 0.4771484375,
"distill_kd_loss": 0.035888671875,
"distill_kd_weight": 0.5228515625,
"distill_temperature": 1.398046875,
"epoch": 0.8515625,
"grad_norm": 0.0021514892578125,
"learning_rate": 1.5725806451612903e-05,
"loss": 5.185003280639648,
"step": 218
},
{
"distill_ce_loss": 10.824097633361816,
"distill_ce_weight": 0.477734375,
"distill_kd_loss": 0.0361328125,
"distill_kd_weight": 0.522265625,
"distill_temperature": 1.3929687499999999,
"epoch": 0.85546875,
"grad_norm": 0.0019073486328125,
"learning_rate": 1.5322580645161292e-05,
"loss": 5.189964771270752,
"step": 219
},
{
"distill_ce_loss": 10.828665733337402,
"distill_ce_weight": 0.47832031249999996,
"distill_kd_loss": 0.03759765625,
"distill_kd_weight": 0.5216796875,
"distill_temperature": 1.387890625,
"epoch": 0.859375,
"grad_norm": 0.0026092529296875,
"learning_rate": 1.4919354838709679e-05,
"loss": 5.19922399520874,
"step": 220
},
{
"distill_ce_loss": 10.828384399414062,
"distill_ce_weight": 0.47890625,
"distill_kd_loss": 0.037109375,
"distill_kd_weight": 0.52109375,
"distill_temperature": 1.3828125,
"epoch": 0.86328125,
"grad_norm": 0.0140380859375,
"learning_rate": 1.4516129032258066e-05,
"loss": 5.205068111419678,
"step": 221
},
{
"distill_ce_loss": 10.828526496887207,
"distill_ce_weight": 0.4794921875,
"distill_kd_loss": 0.03857421875,
"distill_kd_weight": 0.5205078125,
"distill_temperature": 1.377734375,
"epoch": 0.8671875,
"grad_norm": 0.000518798828125,
"learning_rate": 1.4112903225806454e-05,
"loss": 5.212213516235352,
"step": 222
},
{
"distill_ce_loss": 10.830307960510254,
"distill_ce_weight": 0.480078125,
"distill_kd_loss": 0.038330078125,
"distill_kd_weight": 0.519921875,
"distill_temperature": 1.37265625,
"epoch": 0.87109375,
"grad_norm": 0.0032196044921875,
"learning_rate": 1.3709677419354839e-05,
"loss": 5.219291687011719,
"step": 223
},
{
"distill_ce_loss": 10.827667236328125,
"distill_ce_weight": 0.4806640625,
"distill_kd_loss": 0.039306640625,
"distill_kd_weight": 0.5193359375,
"distill_temperature": 1.3675781249999999,
"epoch": 0.875,
"grad_norm": 0.016357421875,
"learning_rate": 1.3306451612903225e-05,
"loss": 5.224856376647949,
"step": 224
},
{
"distill_ce_loss": 10.828167915344238,
"distill_ce_weight": 0.48125,
"distill_kd_loss": 0.03955078125,
"distill_kd_weight": 0.51875,
"distill_temperature": 1.3625,
"epoch": 0.87890625,
"grad_norm": 0.0008087158203125,
"learning_rate": 1.2903225806451613e-05,
"loss": 5.231563568115234,
"step": 225
},
{
"distill_ce_loss": 10.826531410217285,
"distill_ce_weight": 0.4818359375,
"distill_kd_loss": 0.040283203125,
"distill_kd_weight": 0.5181640625,
"distill_temperature": 1.357421875,
"epoch": 0.87890625,
"eval_loss": 5.237745761871338,
"eval_runtime": 12.9169,
"eval_samples_per_second": 4.955,
"eval_steps_per_second": 4.955,
"step": 225
},
{
"distill_ce_loss": 10.8232421875,
"distill_ce_weight": 0.4818359375,
"distill_kd_loss": 0.040283203125,
"distill_kd_weight": 0.5181640625,
"distill_temperature": 1.357421875,
"epoch": 0.8828125,
"grad_norm": 0.0020599365234375,
"learning_rate": 1.25e-05,
"loss": 5.23590087890625,
"step": 226
},
{
"distill_ce_loss": 10.828264236450195,
"distill_ce_weight": 0.482421875,
"distill_kd_loss": 0.04052734375,
"distill_kd_weight": 0.517578125,
"distill_temperature": 1.35234375,
"epoch": 0.88671875,
"grad_norm": 0.030029296875,
"learning_rate": 1.2096774193548388e-05,
"loss": 5.244787693023682,
"step": 227
},
{
"distill_ce_loss": 10.828550338745117,
"distill_ce_weight": 0.4830078125,
"distill_kd_loss": 0.040283203125,
"distill_kd_weight": 0.5169921875,
"distill_temperature": 1.347265625,
"epoch": 0.890625,
"grad_norm": 0.01513671875,
"learning_rate": 1.1693548387096775e-05,
"loss": 5.251148700714111,
"step": 228
},
{
"distill_ce_loss": 10.828879356384277,
"distill_ce_weight": 0.48359375,
"distill_kd_loss": 0.041015625,
"distill_kd_weight": 0.51640625,
"distill_temperature": 1.3421874999999999,
"epoch": 0.89453125,
"grad_norm": 0.002685546875,
"learning_rate": 1.129032258064516e-05,
"loss": 5.258018970489502,
"step": 229
},
{
"distill_ce_loss": 10.825315475463867,
"distill_ce_weight": 0.4841796875,
"distill_kd_loss": 0.041748046875,
"distill_kd_weight": 0.5158203125,
"distill_temperature": 1.337109375,
"epoch": 0.8984375,
"grad_norm": 0.000537872314453125,
"learning_rate": 1.0887096774193549e-05,
"loss": 5.262882232666016,
"step": 230
},
{
"distill_ce_loss": 10.828563690185547,
"distill_ce_weight": 0.484765625,
"distill_kd_loss": 0.042236328125,
"distill_kd_weight": 0.515234375,
"distill_temperature": 1.33203125,
"epoch": 0.90234375,
"grad_norm": 0.00592041015625,
"learning_rate": 1.0483870967741936e-05,
"loss": 5.27104377746582,
"step": 231
},
{
"distill_ce_loss": 10.82773208618164,
"distill_ce_weight": 0.4853515625,
"distill_kd_loss": 0.042236328125,
"distill_kd_weight": 0.5146484375,
"distill_temperature": 1.326953125,
"epoch": 0.90625,
"grad_norm": 0.00063323974609375,
"learning_rate": 1.0080645161290323e-05,
"loss": 5.276985168457031,
"step": 232
},
{
"distill_ce_loss": 10.827362060546875,
"distill_ce_weight": 0.4859375,
"distill_kd_loss": 0.04296875,
"distill_kd_weight": 0.5140625,
"distill_temperature": 1.321875,
"epoch": 0.91015625,
"grad_norm": 0.0025482177734375,
"learning_rate": 9.67741935483871e-06,
"loss": 5.283515930175781,
"step": 233
},
{
"distill_ce_loss": 10.828764915466309,
"distill_ce_weight": 0.4865234375,
"distill_kd_loss": 0.04345703125,
"distill_kd_weight": 0.5134765625,
"distill_temperature": 1.3167968749999999,
"epoch": 0.9140625,
"grad_norm": 0.0027618408203125,
"learning_rate": 9.274193548387097e-06,
"loss": 5.2907867431640625,
"step": 234
},
{
"distill_ce_loss": 10.827953338623047,
"distill_ce_weight": 0.487109375,
"distill_kd_loss": 0.04443359375,
"distill_kd_weight": 0.512890625,
"distill_temperature": 1.31171875,
"epoch": 0.91796875,
"grad_norm": 0.00012874603271484375,
"learning_rate": 8.870967741935484e-06,
"loss": 5.297224521636963,
"step": 235
},
{
"distill_ce_loss": 10.826726913452148,
"distill_ce_weight": 0.4876953125,
"distill_kd_loss": 0.044677734375,
"distill_kd_weight": 0.5123046875,
"distill_temperature": 1.306640625,
"epoch": 0.921875,
"grad_norm": 0.0003662109375,
"learning_rate": 8.46774193548387e-06,
"loss": 5.303092956542969,
"step": 236
},
{
"distill_ce_loss": 10.828742980957031,
"distill_ce_weight": 0.48828125,
"distill_kd_loss": 0.04443359375,
"distill_kd_weight": 0.51171875,
"distill_temperature": 1.3015625,
"epoch": 0.92578125,
"grad_norm": 0.00115966796875,
"learning_rate": 8.064516129032258e-06,
"loss": 5.310177326202393,
"step": 237
},
{
"distill_ce_loss": 10.826173782348633,
"distill_ce_weight": 0.4888671875,
"distill_kd_loss": 0.044677734375,
"distill_kd_weight": 0.5111328125,
"distill_temperature": 1.296484375,
"epoch": 0.9296875,
"grad_norm": 0.0027313232421875,
"learning_rate": 7.661290322580646e-06,
"loss": 5.315388202667236,
"step": 238
},
{
"distill_ce_loss": 10.829797744750977,
"distill_ce_weight": 0.489453125,
"distill_kd_loss": 0.045654296875,
"distill_kd_weight": 0.510546875,
"distill_temperature": 1.2914062499999999,
"epoch": 0.93359375,
"grad_norm": 0.00051116943359375,
"learning_rate": 7.258064516129033e-06,
"loss": 5.323993682861328,
"step": 239
},
{
"distill_ce_loss": 10.828352928161621,
"distill_ce_weight": 0.4900390625,
"distill_kd_loss": 0.0458984375,
"distill_kd_weight": 0.5099609375,
"distill_temperature": 1.286328125,
"epoch": 0.9375,
"grad_norm": 0.00052642822265625,
"learning_rate": 6.854838709677419e-06,
"loss": 5.329753398895264,
"step": 240
},
{
"distill_ce_loss": 10.826845169067383,
"distill_ce_weight": 0.490625,
"distill_kd_loss": 0.045654296875,
"distill_kd_weight": 0.509375,
"distill_temperature": 1.28125,
"epoch": 0.94140625,
"grad_norm": 0.00087738037109375,
"learning_rate": 6.451612903225806e-06,
"loss": 5.335236072540283,
"step": 241
},
{
"distill_ce_loss": 10.826852798461914,
"distill_ce_weight": 0.4912109375,
"distill_kd_loss": 0.045654296875,
"distill_kd_weight": 0.5087890625,
"distill_temperature": 1.276171875,
"epoch": 0.9453125,
"grad_norm": 0.00107574462890625,
"learning_rate": 6.048387096774194e-06,
"loss": 5.341461658477783,
"step": 242
},
{
"distill_ce_loss": 10.829383850097656,
"distill_ce_weight": 0.491796875,
"distill_kd_loss": 0.0458984375,
"distill_kd_weight": 0.508203125,
"distill_temperature": 1.27109375,
"epoch": 0.94921875,
"grad_norm": 0.0030670166015625,
"learning_rate": 5.64516129032258e-06,
"loss": 5.349172592163086,
"step": 243
},
{
"distill_ce_loss": 10.826229095458984,
"distill_ce_weight": 0.4923828125,
"distill_kd_loss": 0.046875,
"distill_kd_weight": 0.5076171875,
"distill_temperature": 1.2660156249999999,
"epoch": 0.953125,
"grad_norm": 0.006622314453125,
"learning_rate": 5.241935483870968e-06,
"loss": 5.354453086853027,
"step": 244
},
{
"distill_ce_loss": 10.825600624084473,
"distill_ce_weight": 0.49296875,
"distill_kd_loss": 0.04736328125,
"distill_kd_weight": 0.50703125,
"distill_temperature": 1.2609375,
"epoch": 0.95703125,
"grad_norm": 0.00091552734375,
"learning_rate": 4.838709677419355e-06,
"loss": 5.3607306480407715,
"step": 245
},
{
"distill_ce_loss": 10.824897766113281,
"distill_ce_weight": 0.4935546875,
"distill_kd_loss": 0.0478515625,
"distill_kd_weight": 0.5064453125,
"distill_temperature": 1.255859375,
"epoch": 0.9609375,
"grad_norm": 0.000568389892578125,
"learning_rate": 4.435483870967742e-06,
"loss": 5.366971015930176,
"step": 246
},
{
"distill_ce_loss": 10.827354431152344,
"distill_ce_weight": 0.494140625,
"distill_kd_loss": 0.047607421875,
"distill_kd_weight": 0.505859375,
"distill_temperature": 1.25078125,
"epoch": 0.96484375,
"grad_norm": 0.000457763671875,
"learning_rate": 4.032258064516129e-06,
"loss": 5.374283313751221,
"step": 247
},
{
"distill_ce_loss": 10.826087951660156,
"distill_ce_weight": 0.4947265625,
"distill_kd_loss": 0.0478515625,
"distill_kd_weight": 0.5052734375,
"distill_temperature": 1.245703125,
"epoch": 0.96875,
"grad_norm": 0.0004558563232421875,
"learning_rate": 3.6290322580645166e-06,
"loss": 5.380123138427734,
"step": 248
},
{
"distill_ce_loss": 10.825430870056152,
"distill_ce_weight": 0.4953125,
"distill_kd_loss": 0.048095703125,
"distill_kd_weight": 0.5046875,
"distill_temperature": 1.2406249999999999,
"epoch": 0.97265625,
"grad_norm": 0.000820159912109375,
"learning_rate": 3.225806451612903e-06,
"loss": 5.386263370513916,
"step": 249
},
{
"distill_ce_loss": 10.829666137695312,
"distill_ce_weight": 0.4958984375,
"distill_kd_loss": 0.048095703125,
"distill_kd_weight": 0.5041015625,
"distill_temperature": 1.235546875,
"epoch": 0.9765625,
"grad_norm": 0.00061798095703125,
"learning_rate": 2.82258064516129e-06,
"loss": 5.3947062492370605,
"step": 250
},
{
"distill_ce_loss": 10.826531410217285,
"distill_ce_weight": 0.496484375,
"distill_kd_loss": 0.04833984375,
"distill_kd_weight": 0.503515625,
"distill_temperature": 1.23046875,
"epoch": 0.9765625,
"eval_loss": 5.399942398071289,
"eval_runtime": 13.5232,
"eval_samples_per_second": 4.733,
"eval_steps_per_second": 4.733,
"step": 250
},
{
"distill_ce_loss": 10.832559585571289,
"distill_ce_weight": 0.496484375,
"distill_kd_loss": 0.04833984375,
"distill_kd_weight": 0.503515625,
"distill_temperature": 1.23046875,
"epoch": 0.98046875,
"grad_norm": 0.001220703125,
"learning_rate": 2.4193548387096776e-06,
"loss": 5.402488708496094,
"step": 251
},
{
"distill_ce_loss": 10.828208923339844,
"distill_ce_weight": 0.4970703125,
"distill_kd_loss": 0.048828125,
"distill_kd_weight": 0.5029296875,
"distill_temperature": 1.225390625,
"epoch": 0.984375,
"grad_norm": 0.0022430419921875,
"learning_rate": 2.0161290322580646e-06,
"loss": 5.406917095184326,
"step": 252
},
{
"distill_ce_loss": 10.82684326171875,
"distill_ce_weight": 0.49765625,
"distill_kd_loss": 0.0498046875,
"distill_kd_weight": 0.50234375,
"distill_temperature": 1.2203125,
"epoch": 0.98828125,
"grad_norm": 0.000362396240234375,
"learning_rate": 1.6129032258064516e-06,
"loss": 5.4130706787109375,
"step": 253
},
{
"distill_ce_loss": 10.826776504516602,
"distill_ce_weight": 0.4982421875,
"distill_kd_loss": 0.04931640625,
"distill_kd_weight": 0.5017578125,
"distill_temperature": 1.2152343749999999,
"epoch": 0.9921875,
"grad_norm": 0.00125885009765625,
"learning_rate": 1.2096774193548388e-06,
"loss": 5.419137001037598,
"step": 254
},
{
"distill_ce_loss": 10.826760292053223,
"distill_ce_weight": 0.498828125,
"distill_kd_loss": 0.04931640625,
"distill_kd_weight": 0.501171875,
"distill_temperature": 1.21015625,
"epoch": 0.99609375,
"grad_norm": 0.0029449462890625,
"learning_rate": 8.064516129032258e-07,
"loss": 5.425350666046143,
"step": 255
},
{
"distill_ce_loss": 10.825718879699707,
"distill_ce_weight": 0.4994140625,
"distill_kd_loss": 0.04931640625,
"distill_kd_weight": 0.5005859375,
"distill_temperature": 1.205078125,
"epoch": 1.0,
"grad_norm": 0.00159454345703125,
"learning_rate": 4.032258064516129e-07,
"loss": 5.431174278259277,
"step": 256
},
{
"distill_ce_loss": 10.825718879699707,
"distill_ce_weight": 0.4994140625,
"distill_kd_loss": 0.04931640625,
"distill_kd_weight": 0.5005859375,
"distill_temperature": 1.205078125,
"epoch": 1.0,
"step": 256,
"total_flos": 42322071132.0,
"train_loss": 4.595640664920211,
"train_runtime": 251.849,
"train_samples_per_second": 1.016,
"train_steps_per_second": 1.016
}
],
"logging_steps": 1,
"max_steps": 256,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 42322071132.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}