| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 25, | |
| "global_step": 256, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "distill_ce_loss": 10.82673168182373, | |
| "distill_ce_weight": 0.35, | |
| "distill_kd_loss": -0.0361328125, | |
| "distill_kd_weight": 0.65, | |
| "distill_temperature": 2.5, | |
| "epoch": 0.00390625, | |
| "grad_norm": 0.0181884765625, | |
| "learning_rate": 0.0, | |
| "loss": 3.765918493270874, | |
| "step": 1 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829505920410156, | |
| "distill_ce_weight": 0.3505859375, | |
| "distill_kd_loss": -0.0361328125, | |
| "distill_kd_weight": 0.6494140625, | |
| "distill_temperature": 2.494921875, | |
| "epoch": 0.0078125, | |
| "grad_norm": 0.004180908203125, | |
| "learning_rate": 1.25e-05, | |
| "loss": 3.7732350826263428, | |
| "step": 2 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825729370117188, | |
| "distill_ce_weight": 0.35117187499999997, | |
| "distill_kd_loss": -0.032470703125, | |
| "distill_kd_weight": 0.648828125, | |
| "distill_temperature": 2.48984375, | |
| "epoch": 0.01171875, | |
| "grad_norm": 0.0125732421875, | |
| "learning_rate": 2.5e-05, | |
| "loss": 3.780573606491089, | |
| "step": 3 | |
| }, | |
| { | |
| "distill_ce_loss": 10.830185890197754, | |
| "distill_ce_weight": 0.3517578125, | |
| "distill_kd_loss": -0.03125, | |
| "distill_kd_weight": 0.6482421875000001, | |
| "distill_temperature": 2.484765625, | |
| "epoch": 0.015625, | |
| "grad_norm": 0.00628662109375, | |
| "learning_rate": 3.7500000000000003e-05, | |
| "loss": 3.789339065551758, | |
| "step": 4 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824288368225098, | |
| "distill_ce_weight": 0.35234374999999996, | |
| "distill_kd_loss": -0.0361328125, | |
| "distill_kd_weight": 0.64765625, | |
| "distill_temperature": 2.4796875, | |
| "epoch": 0.01953125, | |
| "grad_norm": 0.017333984375, | |
| "learning_rate": 5e-05, | |
| "loss": 3.7904326915740967, | |
| "step": 5 | |
| }, | |
| { | |
| "distill_ce_loss": 10.823088645935059, | |
| "distill_ce_weight": 0.3529296875, | |
| "distill_kd_loss": -0.031005859375, | |
| "distill_kd_weight": 0.6470703125, | |
| "distill_temperature": 2.474609375, | |
| "epoch": 0.0234375, | |
| "grad_norm": 0.01312255859375, | |
| "learning_rate": 6.25e-05, | |
| "loss": 3.799769639968872, | |
| "step": 6 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827218055725098, | |
| "distill_ce_weight": 0.353515625, | |
| "distill_kd_loss": -0.031494140625, | |
| "distill_kd_weight": 0.646484375, | |
| "distill_temperature": 2.46953125, | |
| "epoch": 0.02734375, | |
| "grad_norm": 0.00823974609375, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 3.8072049617767334, | |
| "step": 7 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827470779418945, | |
| "distill_ce_weight": 0.35410156249999997, | |
| "distill_kd_loss": -0.0296630859375, | |
| "distill_kd_weight": 0.6458984375, | |
| "distill_temperature": 2.464453125, | |
| "epoch": 0.03125, | |
| "grad_norm": 0.002349853515625, | |
| "learning_rate": 8.75e-05, | |
| "loss": 3.814859390258789, | |
| "step": 8 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827388763427734, | |
| "distill_ce_weight": 0.3546875, | |
| "distill_kd_loss": -0.03271484375, | |
| "distill_kd_weight": 0.6453125000000001, | |
| "distill_temperature": 2.459375, | |
| "epoch": 0.03515625, | |
| "grad_norm": 0.01153564453125, | |
| "learning_rate": 0.0001, | |
| "loss": 3.8192214965820312, | |
| "step": 9 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828258514404297, | |
| "distill_ce_weight": 0.35527343749999996, | |
| "distill_kd_loss": -0.031982421875, | |
| "distill_kd_weight": 0.6447265625, | |
| "distill_temperature": 2.454296875, | |
| "epoch": 0.0390625, | |
| "grad_norm": 0.00823974609375, | |
| "learning_rate": 9.95967741935484e-05, | |
| "loss": 3.8263626098632812, | |
| "step": 10 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82706069946289, | |
| "distill_ce_weight": 0.355859375, | |
| "distill_kd_loss": -0.031982421875, | |
| "distill_kd_weight": 0.644140625, | |
| "distill_temperature": 2.44921875, | |
| "epoch": 0.04296875, | |
| "grad_norm": 0.010498046875, | |
| "learning_rate": 9.919354838709678e-05, | |
| "loss": 3.8322811126708984, | |
| "step": 11 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826086044311523, | |
| "distill_ce_weight": 0.3564453125, | |
| "distill_kd_loss": -0.0341796875, | |
| "distill_kd_weight": 0.6435546875, | |
| "distill_temperature": 2.444140625, | |
| "epoch": 0.046875, | |
| "grad_norm": 0.004302978515625, | |
| "learning_rate": 9.879032258064517e-05, | |
| "loss": 3.836935043334961, | |
| "step": 12 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82719612121582, | |
| "distill_ce_weight": 0.35703124999999997, | |
| "distill_kd_loss": -0.0306396484375, | |
| "distill_kd_weight": 0.64296875, | |
| "distill_temperature": 2.4390625, | |
| "epoch": 0.05078125, | |
| "grad_norm": 0.0322265625, | |
| "learning_rate": 9.838709677419355e-05, | |
| "loss": 3.845993995666504, | |
| "step": 13 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827816009521484, | |
| "distill_ce_weight": 0.3576171875, | |
| "distill_kd_loss": -0.030029296875, | |
| "distill_kd_weight": 0.6423828125000001, | |
| "distill_temperature": 2.433984375, | |
| "epoch": 0.0546875, | |
| "grad_norm": 0.00165557861328125, | |
| "learning_rate": 9.798387096774194e-05, | |
| "loss": 3.852926015853882, | |
| "step": 14 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826566696166992, | |
| "distill_ce_weight": 0.35820312499999996, | |
| "distill_kd_loss": -0.033935546875, | |
| "distill_kd_weight": 0.641796875, | |
| "distill_temperature": 2.42890625, | |
| "epoch": 0.05859375, | |
| "grad_norm": 0.0146484375, | |
| "learning_rate": 9.758064516129033e-05, | |
| "loss": 3.856381416320801, | |
| "step": 15 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825676918029785, | |
| "distill_ce_weight": 0.3587890625, | |
| "distill_kd_loss": -0.032470703125, | |
| "distill_kd_weight": 0.6412109375, | |
| "distill_temperature": 2.423828125, | |
| "epoch": 0.0625, | |
| "grad_norm": 0.0007781982421875, | |
| "learning_rate": 9.717741935483872e-05, | |
| "loss": 3.863260269165039, | |
| "step": 16 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827549934387207, | |
| "distill_ce_weight": 0.359375, | |
| "distill_kd_loss": -0.03369140625, | |
| "distill_kd_weight": 0.640625, | |
| "distill_temperature": 2.41875, | |
| "epoch": 0.06640625, | |
| "grad_norm": 0.00131988525390625, | |
| "learning_rate": 9.677419354838711e-05, | |
| "loss": 3.869544267654419, | |
| "step": 17 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827980995178223, | |
| "distill_ce_weight": 0.35996093749999997, | |
| "distill_kd_loss": -0.03369140625, | |
| "distill_kd_weight": 0.6400390625, | |
| "distill_temperature": 2.413671875, | |
| "epoch": 0.0703125, | |
| "grad_norm": 0.000782012939453125, | |
| "learning_rate": 9.63709677419355e-05, | |
| "loss": 3.8760437965393066, | |
| "step": 18 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82925033569336, | |
| "distill_ce_weight": 0.360546875, | |
| "distill_kd_loss": -0.03369140625, | |
| "distill_kd_weight": 0.6394531250000001, | |
| "distill_temperature": 2.40859375, | |
| "epoch": 0.07421875, | |
| "grad_norm": 0.000431060791015625, | |
| "learning_rate": 9.596774193548387e-05, | |
| "loss": 3.8829681873321533, | |
| "step": 19 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827664375305176, | |
| "distill_ce_weight": 0.36113281249999996, | |
| "distill_kd_loss": -0.03369140625, | |
| "distill_kd_weight": 0.6388671875, | |
| "distill_temperature": 2.403515625, | |
| "epoch": 0.078125, | |
| "grad_norm": 0.000698089599609375, | |
| "learning_rate": 9.556451612903226e-05, | |
| "loss": 3.888740301132202, | |
| "step": 20 | |
| }, | |
| { | |
| "distill_ce_loss": 10.823915481567383, | |
| "distill_ce_weight": 0.36171875, | |
| "distill_kd_loss": -0.027587890625, | |
| "distill_kd_weight": 0.63828125, | |
| "distill_temperature": 2.3984375, | |
| "epoch": 0.08203125, | |
| "grad_norm": 0.0120849609375, | |
| "learning_rate": 9.516129032258065e-05, | |
| "loss": 3.897634983062744, | |
| "step": 21 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826141357421875, | |
| "distill_ce_weight": 0.3623046875, | |
| "distill_kd_loss": -0.03173828125, | |
| "distill_kd_weight": 0.6376953125, | |
| "distill_temperature": 2.393359375, | |
| "epoch": 0.0859375, | |
| "grad_norm": 0.00787353515625, | |
| "learning_rate": 9.475806451612904e-05, | |
| "loss": 3.9020981788635254, | |
| "step": 22 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828008651733398, | |
| "distill_ce_weight": 0.36289062499999997, | |
| "distill_kd_loss": -0.031494140625, | |
| "distill_kd_weight": 0.637109375, | |
| "distill_temperature": 2.38828125, | |
| "epoch": 0.08984375, | |
| "grad_norm": 0.0072021484375, | |
| "learning_rate": 9.435483870967743e-05, | |
| "loss": 3.909363269805908, | |
| "step": 23 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826719284057617, | |
| "distill_ce_weight": 0.3634765625, | |
| "distill_kd_loss": -0.031982421875, | |
| "distill_kd_weight": 0.6365234375000001, | |
| "distill_temperature": 2.383203125, | |
| "epoch": 0.09375, | |
| "grad_norm": 0.0021514892578125, | |
| "learning_rate": 9.395161290322582e-05, | |
| "loss": 3.9148731231689453, | |
| "step": 24 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828035354614258, | |
| "distill_ce_weight": 0.36406249999999996, | |
| "distill_kd_loss": -0.03271484375, | |
| "distill_kd_weight": 0.6359375, | |
| "distill_temperature": 2.378125, | |
| "epoch": 0.09765625, | |
| "grad_norm": 0.0015411376953125, | |
| "learning_rate": 9.35483870967742e-05, | |
| "loss": 3.9213294982910156, | |
| "step": 25 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826539039611816, | |
| "distill_ce_weight": 0.3646484375, | |
| "distill_kd_loss": -0.032470703125, | |
| "distill_kd_weight": 0.6353515625, | |
| "distill_temperature": 2.373046875, | |
| "epoch": 0.09765625, | |
| "eval_loss": 3.927885055541992, | |
| "eval_runtime": 15.0677, | |
| "eval_samples_per_second": 4.247, | |
| "eval_steps_per_second": 4.247, | |
| "step": 25 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82087516784668, | |
| "distill_ce_weight": 0.3646484375, | |
| "distill_kd_loss": -0.03173828125, | |
| "distill_kd_weight": 0.6353515625, | |
| "distill_temperature": 2.373046875, | |
| "epoch": 0.1015625, | |
| "grad_norm": 0.003143310546875, | |
| "learning_rate": 9.314516129032259e-05, | |
| "loss": 3.925673484802246, | |
| "step": 26 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827393531799316, | |
| "distill_ce_weight": 0.365234375, | |
| "distill_kd_loss": -0.0322265625, | |
| "distill_kd_weight": 0.634765625, | |
| "distill_temperature": 2.36796875, | |
| "epoch": 0.10546875, | |
| "grad_norm": 0.00021648406982421875, | |
| "learning_rate": 9.274193548387096e-05, | |
| "loss": 3.934028387069702, | |
| "step": 27 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828241348266602, | |
| "distill_ce_weight": 0.36582031249999997, | |
| "distill_kd_loss": -0.0302734375, | |
| "distill_kd_weight": 0.6341796875, | |
| "distill_temperature": 2.362890625, | |
| "epoch": 0.109375, | |
| "grad_norm": 0.005126953125, | |
| "learning_rate": 9.233870967741935e-05, | |
| "loss": 3.942025661468506, | |
| "step": 28 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829521179199219, | |
| "distill_ce_weight": 0.36640625, | |
| "distill_kd_loss": -0.029052734375, | |
| "distill_kd_weight": 0.6335937500000001, | |
| "distill_temperature": 2.3578125, | |
| "epoch": 0.11328125, | |
| "grad_norm": 0.0032806396484375, | |
| "learning_rate": 9.193548387096774e-05, | |
| "loss": 3.9495718479156494, | |
| "step": 29 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825637817382812, | |
| "distill_ce_weight": 0.36699218749999996, | |
| "distill_kd_loss": -0.0311279296875, | |
| "distill_kd_weight": 0.6330078125, | |
| "distill_temperature": 2.352734375, | |
| "epoch": 0.1171875, | |
| "grad_norm": 0.00286865234375, | |
| "learning_rate": 9.153225806451613e-05, | |
| "loss": 3.9532711505889893, | |
| "step": 30 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827566146850586, | |
| "distill_ce_weight": 0.367578125, | |
| "distill_kd_loss": -0.03173828125, | |
| "distill_kd_weight": 0.632421875, | |
| "distill_temperature": 2.34765625, | |
| "epoch": 0.12109375, | |
| "grad_norm": 0.0004177093505859375, | |
| "learning_rate": 9.112903225806452e-05, | |
| "loss": 3.9599568843841553, | |
| "step": 31 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824216842651367, | |
| "distill_ce_weight": 0.3681640625, | |
| "distill_kd_loss": -0.0301513671875, | |
| "distill_kd_weight": 0.6318359375, | |
| "distill_temperature": 2.342578125, | |
| "epoch": 0.125, | |
| "grad_norm": 0.00482177734375, | |
| "learning_rate": 9.072580645161291e-05, | |
| "loss": 3.9660446643829346, | |
| "step": 32 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827281951904297, | |
| "distill_ce_weight": 0.36874999999999997, | |
| "distill_kd_loss": -0.0303955078125, | |
| "distill_kd_weight": 0.63125, | |
| "distill_temperature": 2.3375, | |
| "epoch": 0.12890625, | |
| "grad_norm": 0.00250244140625, | |
| "learning_rate": 9.032258064516129e-05, | |
| "loss": 3.973395347595215, | |
| "step": 33 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82607364654541, | |
| "distill_ce_weight": 0.3693359375, | |
| "distill_kd_loss": -0.028564453125, | |
| "distill_kd_weight": 0.6306640625000001, | |
| "distill_temperature": 2.332421875, | |
| "epoch": 0.1328125, | |
| "grad_norm": 0.01165771484375, | |
| "learning_rate": 8.991935483870968e-05, | |
| "loss": 3.98039174079895, | |
| "step": 34 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825122833251953, | |
| "distill_ce_weight": 0.36992187499999996, | |
| "distill_kd_loss": -0.0299072265625, | |
| "distill_kd_weight": 0.630078125, | |
| "distill_temperature": 2.32734375, | |
| "epoch": 0.13671875, | |
| "grad_norm": 0.0021514892578125, | |
| "learning_rate": 8.951612903225806e-05, | |
| "loss": 3.9856510162353516, | |
| "step": 35 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829912185668945, | |
| "distill_ce_weight": 0.3705078125, | |
| "distill_kd_loss": -0.03076171875, | |
| "distill_kd_weight": 0.6294921875, | |
| "distill_temperature": 2.322265625, | |
| "epoch": 0.140625, | |
| "grad_norm": 0.0008697509765625, | |
| "learning_rate": 8.911290322580645e-05, | |
| "loss": 3.9931578636169434, | |
| "step": 36 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827726364135742, | |
| "distill_ce_weight": 0.37109375, | |
| "distill_kd_loss": -0.03076171875, | |
| "distill_kd_weight": 0.62890625, | |
| "distill_temperature": 2.3171875, | |
| "epoch": 0.14453125, | |
| "grad_norm": 0.0003376007080078125, | |
| "learning_rate": 8.870967741935484e-05, | |
| "loss": 3.998814582824707, | |
| "step": 37 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82931137084961, | |
| "distill_ce_weight": 0.37167968749999997, | |
| "distill_kd_loss": -0.0301513671875, | |
| "distill_kd_weight": 0.6283203125, | |
| "distill_temperature": 2.312109375, | |
| "epoch": 0.1484375, | |
| "grad_norm": 0.004302978515625, | |
| "learning_rate": 8.830645161290323e-05, | |
| "loss": 4.0061140060424805, | |
| "step": 38 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828788757324219, | |
| "distill_ce_weight": 0.372265625, | |
| "distill_kd_loss": -0.02880859375, | |
| "distill_kd_weight": 0.6277343750000001, | |
| "distill_temperature": 2.30703125, | |
| "epoch": 0.15234375, | |
| "grad_norm": 0.00543212890625, | |
| "learning_rate": 8.790322580645162e-05, | |
| "loss": 4.013119697570801, | |
| "step": 39 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826772689819336, | |
| "distill_ce_weight": 0.37285156249999996, | |
| "distill_kd_loss": -0.0294189453125, | |
| "distill_kd_weight": 0.6271484375, | |
| "distill_temperature": 2.301953125, | |
| "epoch": 0.15625, | |
| "grad_norm": 0.004791259765625, | |
| "learning_rate": 8.75e-05, | |
| "loss": 4.018346309661865, | |
| "step": 40 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826981544494629, | |
| "distill_ce_weight": 0.3734375, | |
| "distill_kd_loss": -0.02734375, | |
| "distill_kd_weight": 0.6265625, | |
| "distill_temperature": 2.296875, | |
| "epoch": 0.16015625, | |
| "grad_norm": 0.0167236328125, | |
| "learning_rate": 8.709677419354839e-05, | |
| "loss": 4.026111125946045, | |
| "step": 41 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826519966125488, | |
| "distill_ce_weight": 0.3740234375, | |
| "distill_kd_loss": -0.02978515625, | |
| "distill_kd_weight": 0.6259765625, | |
| "distill_temperature": 2.291796875, | |
| "epoch": 0.1640625, | |
| "grad_norm": 0.00041961669921875, | |
| "learning_rate": 8.669354838709678e-05, | |
| "loss": 4.03069543838501, | |
| "step": 42 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828641891479492, | |
| "distill_ce_weight": 0.37460937499999997, | |
| "distill_kd_loss": -0.0302734375, | |
| "distill_kd_weight": 0.625390625, | |
| "distill_temperature": 2.28671875, | |
| "epoch": 0.16796875, | |
| "grad_norm": 0.000732421875, | |
| "learning_rate": 8.629032258064517e-05, | |
| "loss": 4.037590026855469, | |
| "step": 43 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826750755310059, | |
| "distill_ce_weight": 0.3751953125, | |
| "distill_kd_loss": -0.0296630859375, | |
| "distill_kd_weight": 0.6248046875000001, | |
| "distill_temperature": 2.281640625, | |
| "epoch": 0.171875, | |
| "grad_norm": 0.0003414154052734375, | |
| "learning_rate": 8.588709677419356e-05, | |
| "loss": 4.043591499328613, | |
| "step": 44 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826141357421875, | |
| "distill_ce_weight": 0.37578124999999996, | |
| "distill_kd_loss": -0.030029296875, | |
| "distill_kd_weight": 0.62421875, | |
| "distill_temperature": 2.2765625, | |
| "epoch": 0.17578125, | |
| "grad_norm": 0.0005035400390625, | |
| "learning_rate": 8.548387096774195e-05, | |
| "loss": 4.049461841583252, | |
| "step": 45 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829056739807129, | |
| "distill_ce_weight": 0.3763671875, | |
| "distill_kd_loss": -0.02880859375, | |
| "distill_kd_weight": 0.6236328125, | |
| "distill_temperature": 2.271484375, | |
| "epoch": 0.1796875, | |
| "grad_norm": 0.000476837158203125, | |
| "learning_rate": 8.508064516129032e-05, | |
| "loss": 4.057757377624512, | |
| "step": 46 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82729721069336, | |
| "distill_ce_weight": 0.376953125, | |
| "distill_kd_loss": -0.0296630859375, | |
| "distill_kd_weight": 0.623046875, | |
| "distill_temperature": 2.26640625, | |
| "epoch": 0.18359375, | |
| "grad_norm": 0.00677490234375, | |
| "learning_rate": 8.467741935483871e-05, | |
| "loss": 4.06295108795166, | |
| "step": 47 | |
| }, | |
| { | |
| "distill_ce_loss": 10.822839736938477, | |
| "distill_ce_weight": 0.37753906249999997, | |
| "distill_kd_loss": -0.0291748046875, | |
| "distill_kd_weight": 0.6224609375, | |
| "distill_temperature": 2.261328125, | |
| "epoch": 0.1875, | |
| "grad_norm": 0.000530242919921875, | |
| "learning_rate": 8.42741935483871e-05, | |
| "loss": 4.067856311798096, | |
| "step": 48 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82568073272705, | |
| "distill_ce_weight": 0.378125, | |
| "distill_kd_loss": -0.029052734375, | |
| "distill_kd_weight": 0.6218750000000001, | |
| "distill_temperature": 2.25625, | |
| "epoch": 0.19140625, | |
| "grad_norm": 0.0003108978271484375, | |
| "learning_rate": 8.387096774193549e-05, | |
| "loss": 4.075394153594971, | |
| "step": 49 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828625679016113, | |
| "distill_ce_weight": 0.37871093749999996, | |
| "distill_kd_loss": -0.0291748046875, | |
| "distill_kd_weight": 0.6212890625, | |
| "distill_temperature": 2.251171875, | |
| "epoch": 0.1953125, | |
| "grad_norm": 0.0003566741943359375, | |
| "learning_rate": 8.346774193548388e-05, | |
| "loss": 4.082852363586426, | |
| "step": 50 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82653522491455, | |
| "distill_ce_weight": 0.379296875, | |
| "distill_kd_loss": -0.02880859375, | |
| "distill_kd_weight": 0.620703125, | |
| "distill_temperature": 2.24609375, | |
| "epoch": 0.1953125, | |
| "eval_loss": 4.088868141174316, | |
| "eval_runtime": 13.2468, | |
| "eval_samples_per_second": 4.831, | |
| "eval_steps_per_second": 4.831, | |
| "step": 50 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827178955078125, | |
| "distill_ce_weight": 0.379296875, | |
| "distill_kd_loss": -0.0289306640625, | |
| "distill_kd_weight": 0.620703125, | |
| "distill_temperature": 2.24609375, | |
| "epoch": 0.19921875, | |
| "grad_norm": 6.198883056640625e-05, | |
| "learning_rate": 8.306451612903227e-05, | |
| "loss": 4.088770866394043, | |
| "step": 51 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826313018798828, | |
| "distill_ce_weight": 0.3798828125, | |
| "distill_kd_loss": -0.0291748046875, | |
| "distill_kd_weight": 0.6201171875, | |
| "distill_temperature": 2.241015625, | |
| "epoch": 0.203125, | |
| "grad_norm": 0.00010395050048828125, | |
| "learning_rate": 8.266129032258066e-05, | |
| "loss": 4.094663619995117, | |
| "step": 52 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829632759094238, | |
| "distill_ce_weight": 0.38046874999999997, | |
| "distill_kd_loss": -0.02880859375, | |
| "distill_kd_weight": 0.61953125, | |
| "distill_temperature": 2.2359375, | |
| "epoch": 0.20703125, | |
| "grad_norm": 0.000644683837890625, | |
| "learning_rate": 8.225806451612904e-05, | |
| "loss": 4.102514743804932, | |
| "step": 53 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825312614440918, | |
| "distill_ce_weight": 0.3810546875, | |
| "distill_kd_loss": -0.02880859375, | |
| "distill_kd_weight": 0.6189453125000001, | |
| "distill_temperature": 2.230859375, | |
| "epoch": 0.2109375, | |
| "grad_norm": 0.0079345703125, | |
| "learning_rate": 8.185483870967743e-05, | |
| "loss": 4.107213973999023, | |
| "step": 54 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826827049255371, | |
| "distill_ce_weight": 0.38164062499999996, | |
| "distill_kd_loss": -0.02734375, | |
| "distill_kd_weight": 0.618359375, | |
| "distill_temperature": 2.22578125, | |
| "epoch": 0.21484375, | |
| "grad_norm": 0.00110626220703125, | |
| "learning_rate": 8.145161290322582e-05, | |
| "loss": 4.114989280700684, | |
| "step": 55 | |
| }, | |
| { | |
| "distill_ce_loss": 10.83178997039795, | |
| "distill_ce_weight": 0.3822265625, | |
| "distill_kd_loss": -0.0281982421875, | |
| "distill_kd_weight": 0.6177734375, | |
| "distill_temperature": 2.220703125, | |
| "epoch": 0.21875, | |
| "grad_norm": 0.0059814453125, | |
| "learning_rate": 8.104838709677419e-05, | |
| "loss": 4.12274169921875, | |
| "step": 56 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824783325195312, | |
| "distill_ce_weight": 0.3828125, | |
| "distill_kd_loss": -0.0281982421875, | |
| "distill_kd_weight": 0.6171875, | |
| "distill_temperature": 2.215625, | |
| "epoch": 0.22265625, | |
| "grad_norm": 0.000339508056640625, | |
| "learning_rate": 8.064516129032258e-05, | |
| "loss": 4.126406192779541, | |
| "step": 57 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824844360351562, | |
| "distill_ce_weight": 0.38339843749999997, | |
| "distill_kd_loss": -0.02685546875, | |
| "distill_kd_weight": 0.6166015625, | |
| "distill_temperature": 2.210546875, | |
| "epoch": 0.2265625, | |
| "grad_norm": 0.00390625, | |
| "learning_rate": 8.024193548387097e-05, | |
| "loss": 4.133626937866211, | |
| "step": 58 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829590797424316, | |
| "distill_ce_weight": 0.383984375, | |
| "distill_kd_loss": -0.0277099609375, | |
| "distill_kd_weight": 0.6160156250000001, | |
| "distill_temperature": 2.20546875, | |
| "epoch": 0.23046875, | |
| "grad_norm": 0.006011962890625, | |
| "learning_rate": 7.983870967741936e-05, | |
| "loss": 4.141304016113281, | |
| "step": 59 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826620101928711, | |
| "distill_ce_weight": 0.38457031249999996, | |
| "distill_kd_loss": -0.025146484375, | |
| "distill_kd_weight": 0.6154296875, | |
| "distill_temperature": 2.200390625, | |
| "epoch": 0.234375, | |
| "grad_norm": 0.0019378662109375, | |
| "learning_rate": 7.943548387096774e-05, | |
| "loss": 4.1480937004089355, | |
| "step": 60 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827092170715332, | |
| "distill_ce_weight": 0.38515625, | |
| "distill_kd_loss": -0.0277099609375, | |
| "distill_kd_weight": 0.61484375, | |
| "distill_temperature": 2.1953125, | |
| "epoch": 0.23828125, | |
| "grad_norm": 0.0003719329833984375, | |
| "learning_rate": 7.903225806451613e-05, | |
| "loss": 4.153032302856445, | |
| "step": 61 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82585334777832, | |
| "distill_ce_weight": 0.3857421875, | |
| "distill_kd_loss": -0.0274658203125, | |
| "distill_kd_weight": 0.6142578125, | |
| "distill_temperature": 2.190234375, | |
| "epoch": 0.2421875, | |
| "grad_norm": 0.0002956390380859375, | |
| "learning_rate": 7.862903225806451e-05, | |
| "loss": 4.15914249420166, | |
| "step": 62 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82608413696289, | |
| "distill_ce_weight": 0.38632812499999997, | |
| "distill_kd_loss": -0.027099609375, | |
| "distill_kd_weight": 0.613671875, | |
| "distill_temperature": 2.18515625, | |
| "epoch": 0.24609375, | |
| "grad_norm": 0.002288818359375, | |
| "learning_rate": 7.82258064516129e-05, | |
| "loss": 4.16581916809082, | |
| "step": 63 | |
| }, | |
| { | |
| "distill_ce_loss": 10.821599960327148, | |
| "distill_ce_weight": 0.3869140625, | |
| "distill_kd_loss": -0.0274658203125, | |
| "distill_kd_weight": 0.6130859375000001, | |
| "distill_temperature": 2.180078125, | |
| "epoch": 0.25, | |
| "grad_norm": 0.0004634857177734375, | |
| "learning_rate": 7.78225806451613e-05, | |
| "loss": 4.1701836585998535, | |
| "step": 64 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82644271850586, | |
| "distill_ce_weight": 0.38749999999999996, | |
| "distill_kd_loss": -0.02685546875, | |
| "distill_kd_weight": 0.6125, | |
| "distill_temperature": 2.175, | |
| "epoch": 0.25390625, | |
| "grad_norm": 0.00035858154296875, | |
| "learning_rate": 7.741935483870968e-05, | |
| "loss": 4.17876672744751, | |
| "step": 65 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825672149658203, | |
| "distill_ce_weight": 0.3880859375, | |
| "distill_kd_loss": -0.0274658203125, | |
| "distill_kd_weight": 0.6119140625, | |
| "distill_temperature": 2.169921875, | |
| "epoch": 0.2578125, | |
| "grad_norm": 0.0016632080078125, | |
| "learning_rate": 7.701612903225807e-05, | |
| "loss": 4.184445381164551, | |
| "step": 66 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82388973236084, | |
| "distill_ce_weight": 0.388671875, | |
| "distill_kd_loss": -0.026611328125, | |
| "distill_kd_weight": 0.611328125, | |
| "distill_temperature": 2.16484375, | |
| "epoch": 0.26171875, | |
| "grad_norm": 0.00138092041015625, | |
| "learning_rate": 7.661290322580645e-05, | |
| "loss": 4.190706253051758, | |
| "step": 67 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82534408569336, | |
| "distill_ce_weight": 0.38925781249999997, | |
| "distill_kd_loss": -0.0267333984375, | |
| "distill_kd_weight": 0.6107421875, | |
| "distill_temperature": 2.159765625, | |
| "epoch": 0.265625, | |
| "grad_norm": 0.0003376007080078125, | |
| "learning_rate": 7.620967741935484e-05, | |
| "loss": 4.197492599487305, | |
| "step": 68 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828094482421875, | |
| "distill_ce_weight": 0.38984375, | |
| "distill_kd_loss": -0.0269775390625, | |
| "distill_kd_weight": 0.6101562500000001, | |
| "distill_temperature": 2.1546875, | |
| "epoch": 0.26953125, | |
| "grad_norm": 0.000881195068359375, | |
| "learning_rate": 7.580645161290323e-05, | |
| "loss": 4.2047858238220215, | |
| "step": 69 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828548431396484, | |
| "distill_ce_weight": 0.39042968749999996, | |
| "distill_kd_loss": -0.0263671875, | |
| "distill_kd_weight": 0.6095703125, | |
| "distill_temperature": 2.149609375, | |
| "epoch": 0.2734375, | |
| "grad_norm": 0.00139617919921875, | |
| "learning_rate": 7.540322580645162e-05, | |
| "loss": 4.211673259735107, | |
| "step": 70 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827938079833984, | |
| "distill_ce_weight": 0.391015625, | |
| "distill_kd_loss": -0.02490234375, | |
| "distill_kd_weight": 0.608984375, | |
| "distill_temperature": 2.14453125, | |
| "epoch": 0.27734375, | |
| "grad_norm": 0.0054931640625, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 4.218756198883057, | |
| "step": 71 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829117774963379, | |
| "distill_ce_weight": 0.3916015625, | |
| "distill_kd_loss": -0.025390625, | |
| "distill_kd_weight": 0.6083984375, | |
| "distill_temperature": 2.139453125, | |
| "epoch": 0.28125, | |
| "grad_norm": 0.0019073486328125, | |
| "learning_rate": 7.45967741935484e-05, | |
| "loss": 4.225257396697998, | |
| "step": 72 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828458786010742, | |
| "distill_ce_weight": 0.39218749999999997, | |
| "distill_kd_loss": -0.026123046875, | |
| "distill_kd_weight": 0.6078125, | |
| "distill_temperature": 2.134375, | |
| "epoch": 0.28515625, | |
| "grad_norm": 0.00016880035400390625, | |
| "learning_rate": 7.419354838709677e-05, | |
| "loss": 4.230916976928711, | |
| "step": 73 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827070236206055, | |
| "distill_ce_weight": 0.3927734375, | |
| "distill_kd_loss": -0.026123046875, | |
| "distill_kd_weight": 0.6072265625000001, | |
| "distill_temperature": 2.129296875, | |
| "epoch": 0.2890625, | |
| "grad_norm": 0.00238037109375, | |
| "learning_rate": 7.379032258064516e-05, | |
| "loss": 4.2367167472839355, | |
| "step": 74 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826481819152832, | |
| "distill_ce_weight": 0.39335937499999996, | |
| "distill_kd_loss": -0.02587890625, | |
| "distill_kd_weight": 0.606640625, | |
| "distill_temperature": 2.12421875, | |
| "epoch": 0.29296875, | |
| "grad_norm": 0.000629425048828125, | |
| "learning_rate": 7.338709677419355e-05, | |
| "loss": 4.242950916290283, | |
| "step": 75 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826537132263184, | |
| "distill_ce_weight": 0.3939453125, | |
| "distill_kd_loss": -0.025634765625, | |
| "distill_kd_weight": 0.6060546875, | |
| "distill_temperature": 2.119140625, | |
| "epoch": 0.29296875, | |
| "eval_loss": 4.2497992515563965, | |
| "eval_runtime": 14.844, | |
| "eval_samples_per_second": 4.312, | |
| "eval_steps_per_second": 4.312, | |
| "step": 75 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827073097229004, | |
| "distill_ce_weight": 0.3939453125, | |
| "distill_kd_loss": -0.025634765625, | |
| "distill_kd_weight": 0.6060546875, | |
| "distill_temperature": 2.119140625, | |
| "epoch": 0.296875, | |
| "grad_norm": 0.000728607177734375, | |
| "learning_rate": 7.298387096774194e-05, | |
| "loss": 4.249710559844971, | |
| "step": 76 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827564239501953, | |
| "distill_ce_weight": 0.39453125, | |
| "distill_kd_loss": -0.0252685546875, | |
| "distill_kd_weight": 0.60546875, | |
| "distill_temperature": 2.1140625, | |
| "epoch": 0.30078125, | |
| "grad_norm": 0.0002899169921875, | |
| "learning_rate": 7.258064516129033e-05, | |
| "loss": 4.256492614746094, | |
| "step": 77 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82872486114502, | |
| "distill_ce_weight": 0.39511718749999997, | |
| "distill_kd_loss": -0.0255126953125, | |
| "distill_kd_weight": 0.6048828125, | |
| "distill_temperature": 2.108984375, | |
| "epoch": 0.3046875, | |
| "grad_norm": 0.00021839141845703125, | |
| "learning_rate": 7.217741935483872e-05, | |
| "loss": 4.263173580169678, | |
| "step": 78 | |
| }, | |
| { | |
| "distill_ce_loss": 10.823871612548828, | |
| "distill_ce_weight": 0.395703125, | |
| "distill_kd_loss": -0.0242919921875, | |
| "distill_kd_weight": 0.6042968750000001, | |
| "distill_temperature": 2.10390625, | |
| "epoch": 0.30859375, | |
| "grad_norm": 0.00262451171875, | |
| "learning_rate": 7.177419354838711e-05, | |
| "loss": 4.2683305740356445, | |
| "step": 79 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828267097473145, | |
| "distill_ce_weight": 0.39628906249999996, | |
| "distill_kd_loss": -0.025146484375, | |
| "distill_kd_weight": 0.6037109375, | |
| "distill_temperature": 2.098828125, | |
| "epoch": 0.3125, | |
| "grad_norm": 0.00029754638671875, | |
| "learning_rate": 7.137096774193549e-05, | |
| "loss": 4.275926113128662, | |
| "step": 80 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82632827758789, | |
| "distill_ce_weight": 0.396875, | |
| "distill_kd_loss": -0.0250244140625, | |
| "distill_kd_weight": 0.603125, | |
| "distill_temperature": 2.09375, | |
| "epoch": 0.31640625, | |
| "grad_norm": 0.000240325927734375, | |
| "learning_rate": 7.096774193548388e-05, | |
| "loss": 4.281623363494873, | |
| "step": 81 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827393531799316, | |
| "distill_ce_weight": 0.3974609375, | |
| "distill_kd_loss": -0.025146484375, | |
| "distill_kd_weight": 0.6025390625, | |
| "distill_temperature": 2.088671875, | |
| "epoch": 0.3203125, | |
| "grad_norm": 0.000492095947265625, | |
| "learning_rate": 7.056451612903226e-05, | |
| "loss": 4.288329124450684, | |
| "step": 82 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82739543914795, | |
| "distill_ce_weight": 0.39804687499999997, | |
| "distill_kd_loss": -0.024658203125, | |
| "distill_kd_weight": 0.601953125, | |
| "distill_temperature": 2.08359375, | |
| "epoch": 0.32421875, | |
| "grad_norm": 0.006439208984375, | |
| "learning_rate": 7.016129032258065e-05, | |
| "loss": 4.294979572296143, | |
| "step": 83 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826316833496094, | |
| "distill_ce_weight": 0.3986328125, | |
| "distill_kd_loss": -0.0240478515625, | |
| "distill_kd_weight": 0.6013671875000001, | |
| "distill_temperature": 2.078515625, | |
| "epoch": 0.328125, | |
| "grad_norm": 0.00177764892578125, | |
| "learning_rate": 6.975806451612904e-05, | |
| "loss": 4.301259994506836, | |
| "step": 84 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825986862182617, | |
| "distill_ce_weight": 0.39921874999999996, | |
| "distill_kd_loss": -0.024169921875, | |
| "distill_kd_weight": 0.60078125, | |
| "distill_temperature": 2.0734375, | |
| "epoch": 0.33203125, | |
| "grad_norm": 0.0010528564453125, | |
| "learning_rate": 6.935483870967743e-05, | |
| "loss": 4.30741024017334, | |
| "step": 85 | |
| }, | |
| { | |
| "distill_ce_loss": 10.822548866271973, | |
| "distill_ce_weight": 0.3998046875, | |
| "distill_kd_loss": -0.02392578125, | |
| "distill_kd_weight": 0.6001953125, | |
| "distill_temperature": 2.068359375, | |
| "epoch": 0.3359375, | |
| "grad_norm": 0.000331878662109375, | |
| "learning_rate": 6.895161290322581e-05, | |
| "loss": 4.312562465667725, | |
| "step": 86 | |
| }, | |
| { | |
| "distill_ce_loss": 10.8272066116333, | |
| "distill_ce_weight": 0.400390625, | |
| "distill_kd_loss": -0.0242919921875, | |
| "distill_kd_weight": 0.599609375, | |
| "distill_temperature": 2.06328125, | |
| "epoch": 0.33984375, | |
| "grad_norm": 0.0120849609375, | |
| "learning_rate": 6.854838709677419e-05, | |
| "loss": 4.3205246925354, | |
| "step": 87 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828274726867676, | |
| "distill_ce_weight": 0.40097656249999997, | |
| "distill_kd_loss": -0.0233154296875, | |
| "distill_kd_weight": 0.5990234375, | |
| "distill_temperature": 2.058203125, | |
| "epoch": 0.34375, | |
| "grad_norm": 0.00982666015625, | |
| "learning_rate": 6.814516129032257e-05, | |
| "loss": 4.327907562255859, | |
| "step": 88 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824246406555176, | |
| "distill_ce_weight": 0.4015625, | |
| "distill_kd_loss": -0.023681640625, | |
| "distill_kd_weight": 0.5984375, | |
| "distill_temperature": 2.053125, | |
| "epoch": 0.34765625, | |
| "grad_norm": 0.0019378662109375, | |
| "learning_rate": 6.774193548387096e-05, | |
| "loss": 4.332451343536377, | |
| "step": 89 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828485488891602, | |
| "distill_ce_weight": 0.40214843749999996, | |
| "distill_kd_loss": -0.0233154296875, | |
| "distill_kd_weight": 0.5978515625, | |
| "distill_temperature": 2.048046875, | |
| "epoch": 0.3515625, | |
| "grad_norm": 0.00112152099609375, | |
| "learning_rate": 6.733870967741935e-05, | |
| "loss": 4.340742588043213, | |
| "step": 90 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828805923461914, | |
| "distill_ce_weight": 0.402734375, | |
| "distill_kd_loss": -0.0224609375, | |
| "distill_kd_weight": 0.597265625, | |
| "distill_temperature": 2.04296875, | |
| "epoch": 0.35546875, | |
| "grad_norm": 0.00023174285888671875, | |
| "learning_rate": 6.693548387096774e-05, | |
| "loss": 4.3477044105529785, | |
| "step": 91 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826385498046875, | |
| "distill_ce_weight": 0.4033203125, | |
| "distill_kd_loss": -0.02294921875, | |
| "distill_kd_weight": 0.5966796875, | |
| "distill_temperature": 2.037890625, | |
| "epoch": 0.359375, | |
| "grad_norm": 0.00016689300537109375, | |
| "learning_rate": 6.653225806451613e-05, | |
| "loss": 4.352829456329346, | |
| "step": 92 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828919410705566, | |
| "distill_ce_weight": 0.40390624999999997, | |
| "distill_kd_loss": -0.022705078125, | |
| "distill_kd_weight": 0.59609375, | |
| "distill_temperature": 2.0328125, | |
| "epoch": 0.36328125, | |
| "grad_norm": 0.000606536865234375, | |
| "learning_rate": 6.612903225806452e-05, | |
| "loss": 4.360318660736084, | |
| "step": 93 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828105926513672, | |
| "distill_ce_weight": 0.4044921875, | |
| "distill_kd_loss": -0.022705078125, | |
| "distill_kd_weight": 0.5955078125, | |
| "distill_temperature": 2.027734375, | |
| "epoch": 0.3671875, | |
| "grad_norm": 0.00011110305786132812, | |
| "learning_rate": 6.57258064516129e-05, | |
| "loss": 4.366334438323975, | |
| "step": 94 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829119682312012, | |
| "distill_ce_weight": 0.40507812499999996, | |
| "distill_kd_loss": -0.021484375, | |
| "distill_kd_weight": 0.594921875, | |
| "distill_temperature": 2.02265625, | |
| "epoch": 0.37109375, | |
| "grad_norm": 0.000762939453125, | |
| "learning_rate": 6.532258064516129e-05, | |
| "loss": 4.373883247375488, | |
| "step": 95 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825207710266113, | |
| "distill_ce_weight": 0.4056640625, | |
| "distill_kd_loss": -0.0220947265625, | |
| "distill_kd_weight": 0.5943359375, | |
| "distill_temperature": 2.017578125, | |
| "epoch": 0.375, | |
| "grad_norm": 0.0001430511474609375, | |
| "learning_rate": 6.491935483870968e-05, | |
| "loss": 4.378274917602539, | |
| "step": 96 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826506614685059, | |
| "distill_ce_weight": 0.40625, | |
| "distill_kd_loss": -0.0216064453125, | |
| "distill_kd_weight": 0.59375, | |
| "distill_temperature": 2.0125, | |
| "epoch": 0.37890625, | |
| "grad_norm": 0.0086669921875, | |
| "learning_rate": 6.451612903225807e-05, | |
| "loss": 4.385450839996338, | |
| "step": 97 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829300880432129, | |
| "distill_ce_weight": 0.40683593749999997, | |
| "distill_kd_loss": -0.021484375, | |
| "distill_kd_weight": 0.5931640625, | |
| "distill_temperature": 2.007421875, | |
| "epoch": 0.3828125, | |
| "grad_norm": 0.000446319580078125, | |
| "learning_rate": 6.411290322580646e-05, | |
| "loss": 4.3929924964904785, | |
| "step": 98 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826828002929688, | |
| "distill_ce_weight": 0.407421875, | |
| "distill_kd_loss": -0.02099609375, | |
| "distill_kd_weight": 0.592578125, | |
| "distill_temperature": 2.00234375, | |
| "epoch": 0.38671875, | |
| "grad_norm": 0.0084228515625, | |
| "learning_rate": 6.370967741935485e-05, | |
| "loss": 4.398635387420654, | |
| "step": 99 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824376106262207, | |
| "distill_ce_weight": 0.40800781249999996, | |
| "distill_kd_loss": -0.021728515625, | |
| "distill_kd_weight": 0.5919921875, | |
| "distill_temperature": 1.9972656249999998, | |
| "epoch": 0.390625, | |
| "grad_norm": 0.0003108978271484375, | |
| "learning_rate": 6.330645161290322e-05, | |
| "loss": 4.4035515785217285, | |
| "step": 100 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826534271240234, | |
| "distill_ce_weight": 0.40859375, | |
| "distill_kd_loss": -0.0213623046875, | |
| "distill_kd_weight": 0.59140625, | |
| "distill_temperature": 1.9921875, | |
| "epoch": 0.390625, | |
| "eval_loss": 4.411334991455078, | |
| "eval_runtime": 15.556, | |
| "eval_samples_per_second": 4.114, | |
| "eval_steps_per_second": 4.114, | |
| "step": 100 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824360847473145, | |
| "distill_ce_weight": 0.40859375, | |
| "distill_kd_loss": -0.021240234375, | |
| "distill_kd_weight": 0.59140625, | |
| "distill_temperature": 1.9921875, | |
| "epoch": 0.39453125, | |
| "grad_norm": 0.0004673004150390625, | |
| "learning_rate": 6.290322580645161e-05, | |
| "loss": 4.410192966461182, | |
| "step": 101 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825730323791504, | |
| "distill_ce_weight": 0.4091796875, | |
| "distill_kd_loss": -0.0211181640625, | |
| "distill_kd_weight": 0.5908203125, | |
| "distill_temperature": 1.987109375, | |
| "epoch": 0.3984375, | |
| "grad_norm": 0.0002460479736328125, | |
| "learning_rate": 6.25e-05, | |
| "loss": 4.41721773147583, | |
| "step": 102 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828606605529785, | |
| "distill_ce_weight": 0.40976562499999997, | |
| "distill_kd_loss": -0.0213623046875, | |
| "distill_kd_weight": 0.590234375, | |
| "distill_temperature": 1.98203125, | |
| "epoch": 0.40234375, | |
| "grad_norm": 0.000484466552734375, | |
| "learning_rate": 6.209677419354839e-05, | |
| "loss": 4.424556732177734, | |
| "step": 103 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827896118164062, | |
| "distill_ce_weight": 0.4103515625, | |
| "distill_kd_loss": -0.0205078125, | |
| "distill_kd_weight": 0.5896484375, | |
| "distill_temperature": 1.976953125, | |
| "epoch": 0.40625, | |
| "grad_norm": 0.000240325927734375, | |
| "learning_rate": 6.169354838709678e-05, | |
| "loss": 4.431159496307373, | |
| "step": 104 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82929801940918, | |
| "distill_ce_weight": 0.41093749999999996, | |
| "distill_kd_loss": -0.018798828125, | |
| "distill_kd_weight": 0.5890625, | |
| "distill_temperature": 1.9718749999999998, | |
| "epoch": 0.41015625, | |
| "grad_norm": 0.016357421875, | |
| "learning_rate": 6.129032258064517e-05, | |
| "loss": 4.439116954803467, | |
| "step": 105 | |
| }, | |
| { | |
| "distill_ce_loss": 10.8274564743042, | |
| "distill_ce_weight": 0.4115234375, | |
| "distill_kd_loss": -0.0205078125, | |
| "distill_kd_weight": 0.5884765625, | |
| "distill_temperature": 1.966796875, | |
| "epoch": 0.4140625, | |
| "grad_norm": 0.00054168701171875, | |
| "learning_rate": 6.088709677419355e-05, | |
| "loss": 4.443666934967041, | |
| "step": 106 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824851036071777, | |
| "distill_ce_weight": 0.412109375, | |
| "distill_kd_loss": -0.019775390625, | |
| "distill_kd_weight": 0.587890625, | |
| "distill_temperature": 1.96171875, | |
| "epoch": 0.41796875, | |
| "grad_norm": 0.0004425048828125, | |
| "learning_rate": 6.048387096774194e-05, | |
| "loss": 4.44942569732666, | |
| "step": 107 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825765609741211, | |
| "distill_ce_weight": 0.41269531249999997, | |
| "distill_kd_loss": -0.02001953125, | |
| "distill_kd_weight": 0.5873046875, | |
| "distill_temperature": 1.956640625, | |
| "epoch": 0.421875, | |
| "grad_norm": 0.00150299072265625, | |
| "learning_rate": 6.0080645161290325e-05, | |
| "loss": 4.455963134765625, | |
| "step": 108 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827280044555664, | |
| "distill_ce_weight": 0.41328125, | |
| "distill_kd_loss": -0.0196533203125, | |
| "distill_kd_weight": 0.58671875, | |
| "distill_temperature": 1.9515625, | |
| "epoch": 0.42578125, | |
| "grad_norm": 0.00075531005859375, | |
| "learning_rate": 5.9677419354838715e-05, | |
| "loss": 4.463176250457764, | |
| "step": 109 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828229904174805, | |
| "distill_ce_weight": 0.41386718749999996, | |
| "distill_kd_loss": -0.0191650390625, | |
| "distill_kd_weight": 0.5861328125, | |
| "distill_temperature": 1.9464843749999998, | |
| "epoch": 0.4296875, | |
| "grad_norm": 0.0005645751953125, | |
| "learning_rate": 5.9274193548387104e-05, | |
| "loss": 4.470218658447266, | |
| "step": 110 | |
| }, | |
| { | |
| "distill_ce_loss": 10.823951721191406, | |
| "distill_ce_weight": 0.414453125, | |
| "distill_kd_loss": -0.0186767578125, | |
| "distill_kd_weight": 0.585546875, | |
| "distill_temperature": 1.94140625, | |
| "epoch": 0.43359375, | |
| "grad_norm": 0.033935546875, | |
| "learning_rate": 5.887096774193549e-05, | |
| "loss": 4.475095272064209, | |
| "step": 111 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825326919555664, | |
| "distill_ce_weight": 0.4150390625, | |
| "distill_kd_loss": -0.0191650390625, | |
| "distill_kd_weight": 0.5849609375, | |
| "distill_temperature": 1.936328125, | |
| "epoch": 0.4375, | |
| "grad_norm": 0.0002918243408203125, | |
| "learning_rate": 5.8467741935483876e-05, | |
| "loss": 4.481703281402588, | |
| "step": 112 | |
| }, | |
| { | |
| "distill_ce_loss": 10.831355094909668, | |
| "distill_ce_weight": 0.415625, | |
| "distill_kd_loss": -0.0184326171875, | |
| "distill_kd_weight": 0.584375, | |
| "distill_temperature": 1.93125, | |
| "epoch": 0.44140625, | |
| "grad_norm": 0.0032196044921875, | |
| "learning_rate": 5.8064516129032266e-05, | |
| "loss": 4.491039752960205, | |
| "step": 113 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82675838470459, | |
| "distill_ce_weight": 0.4162109375, | |
| "distill_kd_loss": -0.018310546875, | |
| "distill_kd_weight": 0.5837890625, | |
| "distill_temperature": 1.926171875, | |
| "epoch": 0.4453125, | |
| "grad_norm": 0.00494384765625, | |
| "learning_rate": 5.7661290322580655e-05, | |
| "loss": 4.495534420013428, | |
| "step": 114 | |
| }, | |
| { | |
| "distill_ce_loss": 10.830732345581055, | |
| "distill_ce_weight": 0.41679687499999996, | |
| "distill_kd_loss": -0.0174560546875, | |
| "distill_kd_weight": 0.583203125, | |
| "distill_temperature": 1.9210937499999998, | |
| "epoch": 0.44921875, | |
| "grad_norm": 0.01043701171875, | |
| "learning_rate": 5.725806451612904e-05, | |
| "loss": 4.504022598266602, | |
| "step": 115 | |
| }, | |
| { | |
| "distill_ce_loss": 10.830469131469727, | |
| "distill_ce_weight": 0.4173828125, | |
| "distill_kd_loss": -0.0174560546875, | |
| "distill_kd_weight": 0.5826171875, | |
| "distill_temperature": 1.916015625, | |
| "epoch": 0.453125, | |
| "grad_norm": 0.000820159912109375, | |
| "learning_rate": 5.685483870967743e-05, | |
| "loss": 4.510258674621582, | |
| "step": 116 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826086044311523, | |
| "distill_ce_weight": 0.41796875, | |
| "distill_kd_loss": -0.0172119140625, | |
| "distill_kd_weight": 0.58203125, | |
| "distill_temperature": 1.9109375, | |
| "epoch": 0.45703125, | |
| "grad_norm": 0.005523681640625, | |
| "learning_rate": 5.645161290322582e-05, | |
| "loss": 4.514955997467041, | |
| "step": 117 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82542610168457, | |
| "distill_ce_weight": 0.4185546875, | |
| "distill_kd_loss": -0.017578125, | |
| "distill_kd_weight": 0.5814453125, | |
| "distill_temperature": 1.905859375, | |
| "epoch": 0.4609375, | |
| "grad_norm": 0.001007080078125, | |
| "learning_rate": 5.604838709677419e-05, | |
| "loss": 4.520840167999268, | |
| "step": 118 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828349113464355, | |
| "distill_ce_weight": 0.419140625, | |
| "distill_kd_loss": -0.0164794921875, | |
| "distill_kd_weight": 0.580859375, | |
| "distill_temperature": 1.90078125, | |
| "epoch": 0.46484375, | |
| "grad_norm": 0.001495361328125, | |
| "learning_rate": 5.5645161290322576e-05, | |
| "loss": 4.529018402099609, | |
| "step": 119 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82563591003418, | |
| "distill_ce_weight": 0.41972656249999996, | |
| "distill_kd_loss": -0.0169677734375, | |
| "distill_kd_weight": 0.5802734375, | |
| "distill_temperature": 1.8957031249999998, | |
| "epoch": 0.46875, | |
| "grad_norm": 0.0025787353515625, | |
| "learning_rate": 5.5241935483870966e-05, | |
| "loss": 4.533980369567871, | |
| "step": 120 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827801704406738, | |
| "distill_ce_weight": 0.4203125, | |
| "distill_kd_loss": -0.015625, | |
| "distill_kd_weight": 0.5796875, | |
| "distill_temperature": 1.890625, | |
| "epoch": 0.47265625, | |
| "grad_norm": 0.000392913818359375, | |
| "learning_rate": 5.4838709677419355e-05, | |
| "loss": 4.542026996612549, | |
| "step": 121 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826509475708008, | |
| "distill_ce_weight": 0.4208984375, | |
| "distill_kd_loss": -0.01611328125, | |
| "distill_kd_weight": 0.5791015625, | |
| "distill_temperature": 1.885546875, | |
| "epoch": 0.4765625, | |
| "grad_norm": 0.000789642333984375, | |
| "learning_rate": 5.443548387096774e-05, | |
| "loss": 4.54752254486084, | |
| "step": 122 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82880687713623, | |
| "distill_ce_weight": 0.421484375, | |
| "distill_kd_loss": -0.0159912109375, | |
| "distill_kd_weight": 0.578515625, | |
| "distill_temperature": 1.88046875, | |
| "epoch": 0.48046875, | |
| "grad_norm": 0.0004596710205078125, | |
| "learning_rate": 5.403225806451613e-05, | |
| "loss": 4.554895401000977, | |
| "step": 123 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826998710632324, | |
| "distill_ce_weight": 0.4220703125, | |
| "distill_kd_loss": -0.0135498046875, | |
| "distill_kd_weight": 0.5779296875, | |
| "distill_temperature": 1.875390625, | |
| "epoch": 0.484375, | |
| "grad_norm": 0.0016021728515625, | |
| "learning_rate": 5.362903225806452e-05, | |
| "loss": 4.5619425773620605, | |
| "step": 124 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827878952026367, | |
| "distill_ce_weight": 0.42265624999999996, | |
| "distill_kd_loss": -0.01495361328125, | |
| "distill_kd_weight": 0.57734375, | |
| "distill_temperature": 1.8703124999999998, | |
| "epoch": 0.48828125, | |
| "grad_norm": 0.000545501708984375, | |
| "learning_rate": 5.32258064516129e-05, | |
| "loss": 4.567864418029785, | |
| "step": 125 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82653522491455, | |
| "distill_ce_weight": 0.4232421875, | |
| "distill_kd_loss": -0.014892578125, | |
| "distill_kd_weight": 0.5767578125, | |
| "distill_temperature": 1.865234375, | |
| "epoch": 0.48828125, | |
| "eval_loss": 4.573975086212158, | |
| "eval_runtime": 12.3534, | |
| "eval_samples_per_second": 5.181, | |
| "eval_steps_per_second": 5.181, | |
| "step": 125 | |
| }, | |
| { | |
| "distill_ce_loss": 10.823301315307617, | |
| "distill_ce_weight": 0.4232421875, | |
| "distill_kd_loss": -0.014892578125, | |
| "distill_kd_weight": 0.5767578125, | |
| "distill_temperature": 1.865234375, | |
| "epoch": 0.4921875, | |
| "grad_norm": 0.003753662109375, | |
| "learning_rate": 5.282258064516129e-05, | |
| "loss": 4.572271823883057, | |
| "step": 126 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826366424560547, | |
| "distill_ce_weight": 0.423828125, | |
| "distill_kd_loss": -0.0142822265625, | |
| "distill_kd_weight": 0.576171875, | |
| "distill_temperature": 1.86015625, | |
| "epoch": 0.49609375, | |
| "grad_norm": 0.00138092041015625, | |
| "learning_rate": 5.241935483870968e-05, | |
| "loss": 4.5802788734436035, | |
| "step": 127 | |
| }, | |
| { | |
| "distill_ce_loss": 10.8292818069458, | |
| "distill_ce_weight": 0.4244140625, | |
| "distill_kd_loss": -0.01397705078125, | |
| "distill_kd_weight": 0.5755859375, | |
| "distill_temperature": 1.855078125, | |
| "epoch": 0.5, | |
| "grad_norm": 0.00010061264038085938, | |
| "learning_rate": 5.201612903225807e-05, | |
| "loss": 4.588042736053467, | |
| "step": 128 | |
| }, | |
| { | |
| "distill_ce_loss": 10.823813438415527, | |
| "distill_ce_weight": 0.425, | |
| "distill_kd_loss": -0.01318359375, | |
| "distill_kd_weight": 0.575, | |
| "distill_temperature": 1.85, | |
| "epoch": 0.50390625, | |
| "grad_norm": 0.00028228759765625, | |
| "learning_rate": 5.161290322580645e-05, | |
| "loss": 4.592552661895752, | |
| "step": 129 | |
| }, | |
| { | |
| "distill_ce_loss": 10.830474853515625, | |
| "distill_ce_weight": 0.42558593749999996, | |
| "distill_kd_loss": -0.01300048828125, | |
| "distill_kd_weight": 0.5744140625, | |
| "distill_temperature": 1.8449218749999998, | |
| "epoch": 0.5078125, | |
| "grad_norm": 0.00055694580078125, | |
| "learning_rate": 5.120967741935484e-05, | |
| "loss": 4.601820945739746, | |
| "step": 130 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827618598937988, | |
| "distill_ce_weight": 0.426171875, | |
| "distill_kd_loss": -0.0125732421875, | |
| "distill_kd_weight": 0.573828125, | |
| "distill_temperature": 1.83984375, | |
| "epoch": 0.51171875, | |
| "grad_norm": 0.0162353515625, | |
| "learning_rate": 5.080645161290323e-05, | |
| "loss": 4.607224464416504, | |
| "step": 131 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828215599060059, | |
| "distill_ce_weight": 0.4267578125, | |
| "distill_kd_loss": -0.01190185546875, | |
| "distill_kd_weight": 0.5732421875, | |
| "distill_temperature": 1.834765625, | |
| "epoch": 0.515625, | |
| "grad_norm": 0.000640869140625, | |
| "learning_rate": 5.040322580645161e-05, | |
| "loss": 4.614189624786377, | |
| "step": 132 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828178405761719, | |
| "distill_ce_weight": 0.42734375, | |
| "distill_kd_loss": -0.01080322265625, | |
| "distill_kd_weight": 0.57265625, | |
| "distill_temperature": 1.8296875, | |
| "epoch": 0.51953125, | |
| "grad_norm": 0.014892578125, | |
| "learning_rate": 5e-05, | |
| "loss": 4.621159553527832, | |
| "step": 133 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828635215759277, | |
| "distill_ce_weight": 0.4279296875, | |
| "distill_kd_loss": -0.01104736328125, | |
| "distill_kd_weight": 0.5720703125, | |
| "distill_temperature": 1.824609375, | |
| "epoch": 0.5234375, | |
| "grad_norm": 0.000926971435546875, | |
| "learning_rate": 4.959677419354839e-05, | |
| "loss": 4.627577304840088, | |
| "step": 134 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828261375427246, | |
| "distill_ce_weight": 0.42851562499999996, | |
| "distill_kd_loss": -0.0107421875, | |
| "distill_kd_weight": 0.571484375, | |
| "distill_temperature": 1.8195312499999998, | |
| "epoch": 0.52734375, | |
| "grad_norm": 0.00141143798828125, | |
| "learning_rate": 4.9193548387096775e-05, | |
| "loss": 4.633944988250732, | |
| "step": 135 | |
| }, | |
| { | |
| "distill_ce_loss": 10.830928802490234, | |
| "distill_ce_weight": 0.4291015625, | |
| "distill_kd_loss": -0.01116943359375, | |
| "distill_kd_weight": 0.5708984375, | |
| "distill_temperature": 1.814453125, | |
| "epoch": 0.53125, | |
| "grad_norm": 0.00958251953125, | |
| "learning_rate": 4.8790322580645164e-05, | |
| "loss": 4.641190052032471, | |
| "step": 136 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82923412322998, | |
| "distill_ce_weight": 0.4296875, | |
| "distill_kd_loss": -0.00958251953125, | |
| "distill_kd_weight": 0.5703125, | |
| "distill_temperature": 1.809375, | |
| "epoch": 0.53515625, | |
| "grad_norm": 0.0004596710205078125, | |
| "learning_rate": 4.8387096774193554e-05, | |
| "loss": 4.64772367477417, | |
| "step": 137 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826897621154785, | |
| "distill_ce_weight": 0.4302734375, | |
| "distill_kd_loss": -0.009521484375, | |
| "distill_kd_weight": 0.5697265625, | |
| "distill_temperature": 1.804296875, | |
| "epoch": 0.5390625, | |
| "grad_norm": 0.00054931640625, | |
| "learning_rate": 4.7983870967741937e-05, | |
| "loss": 4.653094291687012, | |
| "step": 138 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824013710021973, | |
| "distill_ce_weight": 0.430859375, | |
| "distill_kd_loss": -0.0086669921875, | |
| "distill_kd_weight": 0.569140625, | |
| "distill_temperature": 1.79921875, | |
| "epoch": 0.54296875, | |
| "grad_norm": 0.000835418701171875, | |
| "learning_rate": 4.7580645161290326e-05, | |
| "loss": 4.658684253692627, | |
| "step": 139 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827019691467285, | |
| "distill_ce_weight": 0.43144531249999996, | |
| "distill_kd_loss": -0.008544921875, | |
| "distill_kd_weight": 0.5685546875, | |
| "distill_temperature": 1.7941406249999998, | |
| "epoch": 0.546875, | |
| "grad_norm": 0.0004291534423828125, | |
| "learning_rate": 4.7177419354838716e-05, | |
| "loss": 4.666414260864258, | |
| "step": 140 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82970142364502, | |
| "distill_ce_weight": 0.43203125, | |
| "distill_kd_loss": -0.0089111328125, | |
| "distill_kd_weight": 0.56796875, | |
| "distill_temperature": 1.7890625, | |
| "epoch": 0.55078125, | |
| "grad_norm": 0.0013275146484375, | |
| "learning_rate": 4.67741935483871e-05, | |
| "loss": 4.673703670501709, | |
| "step": 141 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828492164611816, | |
| "distill_ce_weight": 0.4326171875, | |
| "distill_kd_loss": -0.00762939453125, | |
| "distill_kd_weight": 0.5673828125, | |
| "distill_temperature": 1.783984375, | |
| "epoch": 0.5546875, | |
| "grad_norm": 0.0255126953125, | |
| "learning_rate": 4.637096774193548e-05, | |
| "loss": 4.680258274078369, | |
| "step": 142 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826722145080566, | |
| "distill_ce_weight": 0.433203125, | |
| "distill_kd_loss": -0.007781982421875, | |
| "distill_kd_weight": 0.566796875, | |
| "distill_temperature": 1.77890625, | |
| "epoch": 0.55859375, | |
| "grad_norm": 0.000545501708984375, | |
| "learning_rate": 4.596774193548387e-05, | |
| "loss": 4.685744762420654, | |
| "step": 143 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829012870788574, | |
| "distill_ce_weight": 0.4337890625, | |
| "distill_kd_loss": -0.006591796875, | |
| "distill_kd_weight": 0.5662109375, | |
| "distill_temperature": 1.773828125, | |
| "epoch": 0.5625, | |
| "grad_norm": 0.0024566650390625, | |
| "learning_rate": 4.556451612903226e-05, | |
| "loss": 4.6937689781188965, | |
| "step": 144 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826948165893555, | |
| "distill_ce_weight": 0.43437499999999996, | |
| "distill_kd_loss": -0.00634765625, | |
| "distill_kd_weight": 0.565625, | |
| "distill_temperature": 1.7687499999999998, | |
| "epoch": 0.56640625, | |
| "grad_norm": 0.000736236572265625, | |
| "learning_rate": 4.516129032258064e-05, | |
| "loss": 4.699369430541992, | |
| "step": 145 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826860427856445, | |
| "distill_ce_weight": 0.4349609375, | |
| "distill_kd_loss": -0.0057373046875, | |
| "distill_kd_weight": 0.5650390625, | |
| "distill_temperature": 1.763671875, | |
| "epoch": 0.5703125, | |
| "grad_norm": 0.00116729736328125, | |
| "learning_rate": 4.475806451612903e-05, | |
| "loss": 4.706026554107666, | |
| "step": 146 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827485084533691, | |
| "distill_ce_weight": 0.435546875, | |
| "distill_kd_loss": -0.00482177734375, | |
| "distill_kd_weight": 0.564453125, | |
| "distill_temperature": 1.75859375, | |
| "epoch": 0.57421875, | |
| "grad_norm": 0.004547119140625, | |
| "learning_rate": 4.435483870967742e-05, | |
| "loss": 4.713160991668701, | |
| "step": 147 | |
| }, | |
| { | |
| "distill_ce_loss": 10.83022403717041, | |
| "distill_ce_weight": 0.4361328125, | |
| "distill_kd_loss": -0.00531005859375, | |
| "distill_kd_weight": 0.5638671875, | |
| "distill_temperature": 1.753515625, | |
| "epoch": 0.578125, | |
| "grad_norm": 0.00909423828125, | |
| "learning_rate": 4.395161290322581e-05, | |
| "loss": 4.720425605773926, | |
| "step": 148 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825448989868164, | |
| "distill_ce_weight": 0.43671875, | |
| "distill_kd_loss": -0.004119873046875, | |
| "distill_kd_weight": 0.56328125, | |
| "distill_temperature": 1.7484375, | |
| "epoch": 0.58203125, | |
| "grad_norm": 0.01239013671875, | |
| "learning_rate": 4.3548387096774194e-05, | |
| "loss": 4.725357532501221, | |
| "step": 149 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825879096984863, | |
| "distill_ce_weight": 0.4373046875, | |
| "distill_kd_loss": -0.0040283203125, | |
| "distill_kd_weight": 0.5626953125, | |
| "distill_temperature": 1.7433593749999998, | |
| "epoch": 0.5859375, | |
| "grad_norm": 0.0004177093505859375, | |
| "learning_rate": 4.3145161290322584e-05, | |
| "loss": 4.731934070587158, | |
| "step": 150 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826534271240234, | |
| "distill_ce_weight": 0.437890625, | |
| "distill_kd_loss": -0.0031280517578125, | |
| "distill_kd_weight": 0.562109375, | |
| "distill_temperature": 1.73828125, | |
| "epoch": 0.5859375, | |
| "eval_loss": 4.739337921142578, | |
| "eval_runtime": 14.2325, | |
| "eval_samples_per_second": 4.497, | |
| "eval_steps_per_second": 4.497, | |
| "step": 150 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829754829406738, | |
| "distill_ce_weight": 0.437890625, | |
| "distill_kd_loss": -0.00311279296875, | |
| "distill_kd_weight": 0.562109375, | |
| "distill_temperature": 1.73828125, | |
| "epoch": 0.58984375, | |
| "grad_norm": 0.00042724609375, | |
| "learning_rate": 4.2741935483870973e-05, | |
| "loss": 4.7405009269714355, | |
| "step": 151 | |
| }, | |
| { | |
| "distill_ce_loss": 10.823193550109863, | |
| "distill_ce_weight": 0.4384765625, | |
| "distill_kd_loss": -0.0029144287109375, | |
| "distill_kd_weight": 0.5615234375, | |
| "distill_temperature": 1.733203125, | |
| "epoch": 0.59375, | |
| "grad_norm": 0.0010528564453125, | |
| "learning_rate": 4.2338709677419356e-05, | |
| "loss": 4.744076251983643, | |
| "step": 152 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824686050415039, | |
| "distill_ce_weight": 0.4390625, | |
| "distill_kd_loss": -0.0017852783203125, | |
| "distill_kd_weight": 0.5609375, | |
| "distill_temperature": 1.728125, | |
| "epoch": 0.59765625, | |
| "grad_norm": 0.00069427490234375, | |
| "learning_rate": 4.1935483870967746e-05, | |
| "loss": 4.75171422958374, | |
| "step": 153 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827430725097656, | |
| "distill_ce_weight": 0.4396484375, | |
| "distill_kd_loss": -0.00174713134765625, | |
| "distill_kd_weight": 0.5603515625, | |
| "distill_temperature": 1.723046875, | |
| "epoch": 0.6015625, | |
| "grad_norm": 0.000179290771484375, | |
| "learning_rate": 4.1532258064516135e-05, | |
| "loss": 4.759286403656006, | |
| "step": 154 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828149795532227, | |
| "distill_ce_weight": 0.440234375, | |
| "distill_kd_loss": -0.00072479248046875, | |
| "distill_kd_weight": 0.559765625, | |
| "distill_temperature": 1.7179687499999998, | |
| "epoch": 0.60546875, | |
| "grad_norm": 0.000392913818359375, | |
| "learning_rate": 4.112903225806452e-05, | |
| "loss": 4.766517162322998, | |
| "step": 155 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82726001739502, | |
| "distill_ce_weight": 0.4408203125, | |
| "distill_kd_loss": -0.000560760498046875, | |
| "distill_kd_weight": 0.5591796875, | |
| "distill_temperature": 1.712890625, | |
| "epoch": 0.609375, | |
| "grad_norm": 0.0005035400390625, | |
| "learning_rate": 4.072580645161291e-05, | |
| "loss": 4.772563457489014, | |
| "step": 156 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829056739807129, | |
| "distill_ce_weight": 0.44140625, | |
| "distill_kd_loss": -0.0004673004150390625, | |
| "distill_kd_weight": 0.55859375, | |
| "distill_temperature": 1.7078125, | |
| "epoch": 0.61328125, | |
| "grad_norm": 0.00738525390625, | |
| "learning_rate": 4.032258064516129e-05, | |
| "loss": 4.779752254486084, | |
| "step": 157 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826592445373535, | |
| "distill_ce_weight": 0.4419921875, | |
| "distill_kd_loss": 0.00078582763671875, | |
| "distill_kd_weight": 0.5580078125, | |
| "distill_temperature": 1.702734375, | |
| "epoch": 0.6171875, | |
| "grad_norm": 0.000579833984375, | |
| "learning_rate": 3.991935483870968e-05, | |
| "loss": 4.785707950592041, | |
| "step": 158 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825506210327148, | |
| "distill_ce_weight": 0.442578125, | |
| "distill_kd_loss": 0.000885009765625, | |
| "distill_kd_weight": 0.557421875, | |
| "distill_temperature": 1.69765625, | |
| "epoch": 0.62109375, | |
| "grad_norm": 0.0031890869140625, | |
| "learning_rate": 3.951612903225806e-05, | |
| "loss": 4.791624546051025, | |
| "step": 159 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829129219055176, | |
| "distill_ce_weight": 0.4431640625, | |
| "distill_kd_loss": 0.0016937255859375, | |
| "distill_kd_weight": 0.5568359375, | |
| "distill_temperature": 1.6925781249999998, | |
| "epoch": 0.625, | |
| "grad_norm": 0.00151824951171875, | |
| "learning_rate": 3.911290322580645e-05, | |
| "loss": 4.800023078918457, | |
| "step": 160 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828397750854492, | |
| "distill_ce_weight": 0.44375, | |
| "distill_kd_loss": 0.001953125, | |
| "distill_kd_weight": 0.55625, | |
| "distill_temperature": 1.6875, | |
| "epoch": 0.62890625, | |
| "grad_norm": 0.0003986358642578125, | |
| "learning_rate": 3.870967741935484e-05, | |
| "loss": 4.806184768676758, | |
| "step": 161 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826486587524414, | |
| "distill_ce_weight": 0.4443359375, | |
| "distill_kd_loss": 0.00311279296875, | |
| "distill_kd_weight": 0.5556640625, | |
| "distill_temperature": 1.682421875, | |
| "epoch": 0.6328125, | |
| "grad_norm": 0.0008087158203125, | |
| "learning_rate": 3.8306451612903224e-05, | |
| "loss": 4.812328815460205, | |
| "step": 162 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826553344726562, | |
| "distill_ce_weight": 0.444921875, | |
| "distill_kd_loss": 0.003265380859375, | |
| "distill_kd_weight": 0.555078125, | |
| "distill_temperature": 1.67734375, | |
| "epoch": 0.63671875, | |
| "grad_norm": 0.000675201416015625, | |
| "learning_rate": 3.7903225806451614e-05, | |
| "loss": 4.818786144256592, | |
| "step": 163 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82434368133545, | |
| "distill_ce_weight": 0.4455078125, | |
| "distill_kd_loss": 0.00457763671875, | |
| "distill_kd_weight": 0.5544921875, | |
| "distill_temperature": 1.672265625, | |
| "epoch": 0.640625, | |
| "grad_norm": 0.0003795623779296875, | |
| "learning_rate": 3.7500000000000003e-05, | |
| "loss": 4.824862957000732, | |
| "step": 164 | |
| }, | |
| { | |
| "distill_ce_loss": 10.830697059631348, | |
| "distill_ce_weight": 0.44609375, | |
| "distill_kd_loss": 0.004638671875, | |
| "distill_kd_weight": 0.55390625, | |
| "distill_temperature": 1.6671874999999998, | |
| "epoch": 0.64453125, | |
| "grad_norm": 0.00185394287109375, | |
| "learning_rate": 3.7096774193548386e-05, | |
| "loss": 4.834069728851318, | |
| "step": 165 | |
| }, | |
| { | |
| "distill_ce_loss": 10.830793380737305, | |
| "distill_ce_weight": 0.4466796875, | |
| "distill_kd_loss": 0.005645751953125, | |
| "distill_kd_weight": 0.5533203125, | |
| "distill_temperature": 1.662109375, | |
| "epoch": 0.6484375, | |
| "grad_norm": 0.0081787109375, | |
| "learning_rate": 3.6693548387096776e-05, | |
| "loss": 4.8410234451293945, | |
| "step": 166 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82587718963623, | |
| "distill_ce_weight": 0.447265625, | |
| "distill_kd_loss": 0.0054931640625, | |
| "distill_kd_weight": 0.552734375, | |
| "distill_temperature": 1.65703125, | |
| "epoch": 0.65234375, | |
| "grad_norm": 0.0010528564453125, | |
| "learning_rate": 3.6290322580645165e-05, | |
| "loss": 4.84507942199707, | |
| "step": 167 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825627326965332, | |
| "distill_ce_weight": 0.4478515625, | |
| "distill_kd_loss": 0.0068359375, | |
| "distill_kd_weight": 0.5521484375, | |
| "distill_temperature": 1.651953125, | |
| "epoch": 0.65625, | |
| "grad_norm": 0.0022430419921875, | |
| "learning_rate": 3.5887096774193555e-05, | |
| "loss": 4.852043151855469, | |
| "step": 168 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826099395751953, | |
| "distill_ce_weight": 0.4484375, | |
| "distill_kd_loss": 0.0068359375, | |
| "distill_kd_weight": 0.5515625, | |
| "distill_temperature": 1.646875, | |
| "epoch": 0.66015625, | |
| "grad_norm": 0.0025634765625, | |
| "learning_rate": 3.548387096774194e-05, | |
| "loss": 4.858598232269287, | |
| "step": 169 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825976371765137, | |
| "distill_ce_weight": 0.4490234375, | |
| "distill_kd_loss": 0.00811767578125, | |
| "distill_kd_weight": 0.5509765625, | |
| "distill_temperature": 1.6417968749999998, | |
| "epoch": 0.6640625, | |
| "grad_norm": 0.0033111572265625, | |
| "learning_rate": 3.508064516129033e-05, | |
| "loss": 4.865602970123291, | |
| "step": 170 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824938774108887, | |
| "distill_ce_weight": 0.449609375, | |
| "distill_kd_loss": 0.00848388671875, | |
| "distill_kd_weight": 0.550390625, | |
| "distill_temperature": 1.63671875, | |
| "epoch": 0.66796875, | |
| "grad_norm": 0.0005950927734375, | |
| "learning_rate": 3.467741935483872e-05, | |
| "loss": 4.8716630935668945, | |
| "step": 171 | |
| }, | |
| { | |
| "distill_ce_loss": 10.831235885620117, | |
| "distill_ce_weight": 0.4501953125, | |
| "distill_kd_loss": 0.0098876953125, | |
| "distill_kd_weight": 0.5498046875, | |
| "distill_temperature": 1.631640625, | |
| "epoch": 0.671875, | |
| "grad_norm": 0.0186767578125, | |
| "learning_rate": 3.427419354838709e-05, | |
| "loss": 4.881603717803955, | |
| "step": 172 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82729721069336, | |
| "distill_ce_weight": 0.45078125, | |
| "distill_kd_loss": 0.00982666015625, | |
| "distill_kd_weight": 0.54921875, | |
| "distill_temperature": 1.6265625, | |
| "epoch": 0.67578125, | |
| "grad_norm": 0.0003757476806640625, | |
| "learning_rate": 3.387096774193548e-05, | |
| "loss": 4.886144161224365, | |
| "step": 173 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82698917388916, | |
| "distill_ce_weight": 0.4513671875, | |
| "distill_kd_loss": 0.01019287109375, | |
| "distill_kd_weight": 0.5486328125, | |
| "distill_temperature": 1.621484375, | |
| "epoch": 0.6796875, | |
| "grad_norm": 0.0003185272216796875, | |
| "learning_rate": 3.346774193548387e-05, | |
| "loss": 4.8925323486328125, | |
| "step": 174 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829041481018066, | |
| "distill_ce_weight": 0.451953125, | |
| "distill_kd_loss": 0.01068115234375, | |
| "distill_kd_weight": 0.548046875, | |
| "distill_temperature": 1.6164062499999998, | |
| "epoch": 0.68359375, | |
| "grad_norm": 0.0005950927734375, | |
| "learning_rate": 3.306451612903226e-05, | |
| "loss": 4.900078296661377, | |
| "step": 175 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826534271240234, | |
| "distill_ce_weight": 0.4525390625, | |
| "distill_kd_loss": 0.01141357421875, | |
| "distill_kd_weight": 0.5474609375, | |
| "distill_temperature": 1.611328125, | |
| "epoch": 0.68359375, | |
| "eval_loss": 4.90593957901001, | |
| "eval_runtime": 13.6074, | |
| "eval_samples_per_second": 4.703, | |
| "eval_steps_per_second": 4.703, | |
| "step": 175 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825757026672363, | |
| "distill_ce_weight": 0.4525390625, | |
| "distill_kd_loss": 0.01123046875, | |
| "distill_kd_weight": 0.5474609375, | |
| "distill_temperature": 1.611328125, | |
| "epoch": 0.6875, | |
| "grad_norm": 0.002685546875, | |
| "learning_rate": 3.2661290322580644e-05, | |
| "loss": 4.905211925506592, | |
| "step": 176 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827515602111816, | |
| "distill_ce_weight": 0.453125, | |
| "distill_kd_loss": 0.01202392578125, | |
| "distill_kd_weight": 0.546875, | |
| "distill_temperature": 1.60625, | |
| "epoch": 0.69140625, | |
| "grad_norm": 0.0003662109375, | |
| "learning_rate": 3.2258064516129034e-05, | |
| "loss": 4.912779331207275, | |
| "step": 177 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826964378356934, | |
| "distill_ce_weight": 0.4537109375, | |
| "distill_kd_loss": 0.012939453125, | |
| "distill_kd_weight": 0.5462890625, | |
| "distill_temperature": 1.601171875, | |
| "epoch": 0.6953125, | |
| "grad_norm": 0.00616455078125, | |
| "learning_rate": 3.185483870967742e-05, | |
| "loss": 4.919392108917236, | |
| "step": 178 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825935363769531, | |
| "distill_ce_weight": 0.454296875, | |
| "distill_kd_loss": 0.0128173828125, | |
| "distill_kd_weight": 0.545703125, | |
| "distill_temperature": 1.59609375, | |
| "epoch": 0.69921875, | |
| "grad_norm": 0.000652313232421875, | |
| "learning_rate": 3.1451612903225806e-05, | |
| "loss": 4.925177097320557, | |
| "step": 179 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826337814331055, | |
| "distill_ce_weight": 0.4548828125, | |
| "distill_kd_loss": 0.013916015625, | |
| "distill_kd_weight": 0.5451171875, | |
| "distill_temperature": 1.5910156249999998, | |
| "epoch": 0.703125, | |
| "grad_norm": 0.000934600830078125, | |
| "learning_rate": 3.1048387096774195e-05, | |
| "loss": 4.932313919067383, | |
| "step": 180 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827921867370605, | |
| "distill_ce_weight": 0.45546875, | |
| "distill_kd_loss": 0.013916015625, | |
| "distill_kd_weight": 0.54453125, | |
| "distill_temperature": 1.5859375, | |
| "epoch": 0.70703125, | |
| "grad_norm": 0.000762939453125, | |
| "learning_rate": 3.0645161290322585e-05, | |
| "loss": 4.939348220825195, | |
| "step": 181 | |
| }, | |
| { | |
| "distill_ce_loss": 10.8247709274292, | |
| "distill_ce_weight": 0.4560546875, | |
| "distill_kd_loss": 0.0150146484375, | |
| "distill_kd_weight": 0.5439453125, | |
| "distill_temperature": 1.580859375, | |
| "epoch": 0.7109375, | |
| "grad_norm": 0.00848388671875, | |
| "learning_rate": 3.024193548387097e-05, | |
| "loss": 4.944866180419922, | |
| "step": 182 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825705528259277, | |
| "distill_ce_weight": 0.456640625, | |
| "distill_kd_loss": 0.01495361328125, | |
| "distill_kd_weight": 0.543359375, | |
| "distill_temperature": 1.57578125, | |
| "epoch": 0.71484375, | |
| "grad_norm": 0.00201416015625, | |
| "learning_rate": 2.9838709677419357e-05, | |
| "loss": 4.951574802398682, | |
| "step": 183 | |
| }, | |
| { | |
| "distill_ce_loss": 10.822872161865234, | |
| "distill_ce_weight": 0.4572265625, | |
| "distill_kd_loss": 0.0166015625, | |
| "distill_kd_weight": 0.5427734375, | |
| "distill_temperature": 1.570703125, | |
| "epoch": 0.71875, | |
| "grad_norm": 0.00110626220703125, | |
| "learning_rate": 2.9435483870967743e-05, | |
| "loss": 4.95753812789917, | |
| "step": 184 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825102806091309, | |
| "distill_ce_weight": 0.4578125, | |
| "distill_kd_loss": 0.016357421875, | |
| "distill_kd_weight": 0.5421875, | |
| "distill_temperature": 1.5656249999999998, | |
| "epoch": 0.72265625, | |
| "grad_norm": 0.01507568359375, | |
| "learning_rate": 2.9032258064516133e-05, | |
| "loss": 4.964717388153076, | |
| "step": 185 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828420639038086, | |
| "distill_ce_weight": 0.4583984375, | |
| "distill_kd_loss": 0.0179443359375, | |
| "distill_kd_weight": 0.5416015625, | |
| "distill_temperature": 1.560546875, | |
| "epoch": 0.7265625, | |
| "grad_norm": 0.00130462646484375, | |
| "learning_rate": 2.862903225806452e-05, | |
| "loss": 4.973435401916504, | |
| "step": 186 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829689979553223, | |
| "distill_ce_weight": 0.458984375, | |
| "distill_kd_loss": 0.01806640625, | |
| "distill_kd_weight": 0.541015625, | |
| "distill_temperature": 1.55546875, | |
| "epoch": 0.73046875, | |
| "grad_norm": 0.0002574920654296875, | |
| "learning_rate": 2.822580645161291e-05, | |
| "loss": 4.980423927307129, | |
| "step": 187 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828069686889648, | |
| "distill_ce_weight": 0.4595703125, | |
| "distill_kd_loss": 0.0194091796875, | |
| "distill_kd_weight": 0.5404296875, | |
| "distill_temperature": 1.550390625, | |
| "epoch": 0.734375, | |
| "grad_norm": 0.01904296875, | |
| "learning_rate": 2.7822580645161288e-05, | |
| "loss": 4.986757278442383, | |
| "step": 188 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82535457611084, | |
| "distill_ce_weight": 0.46015625, | |
| "distill_kd_loss": 0.0196533203125, | |
| "distill_kd_weight": 0.53984375, | |
| "distill_temperature": 1.5453125, | |
| "epoch": 0.73828125, | |
| "grad_norm": 0.00075531005859375, | |
| "learning_rate": 2.7419354838709678e-05, | |
| "loss": 4.991974830627441, | |
| "step": 189 | |
| }, | |
| { | |
| "distill_ce_loss": 10.818794250488281, | |
| "distill_ce_weight": 0.4607421875, | |
| "distill_kd_loss": 0.0196533203125, | |
| "distill_kd_weight": 0.5392578125, | |
| "distill_temperature": 1.5402343749999998, | |
| "epoch": 0.7421875, | |
| "grad_norm": 0.0062255859375, | |
| "learning_rate": 2.7016129032258064e-05, | |
| "loss": 4.99529504776001, | |
| "step": 190 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827811241149902, | |
| "distill_ce_weight": 0.461328125, | |
| "distill_kd_loss": 0.0216064453125, | |
| "distill_kd_weight": 0.538671875, | |
| "distill_temperature": 1.53515625, | |
| "epoch": 0.74609375, | |
| "grad_norm": 0.018310546875, | |
| "learning_rate": 2.661290322580645e-05, | |
| "loss": 5.006831645965576, | |
| "step": 191 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825983047485352, | |
| "distill_ce_weight": 0.4619140625, | |
| "distill_kd_loss": 0.02099609375, | |
| "distill_kd_weight": 0.5380859375, | |
| "distill_temperature": 1.530078125, | |
| "epoch": 0.75, | |
| "grad_norm": 0.00162506103515625, | |
| "learning_rate": 2.620967741935484e-05, | |
| "loss": 5.011965274810791, | |
| "step": 192 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828679084777832, | |
| "distill_ce_weight": 0.4625, | |
| "distill_kd_loss": 0.0223388671875, | |
| "distill_kd_weight": 0.5375, | |
| "distill_temperature": 1.525, | |
| "epoch": 0.75390625, | |
| "grad_norm": 0.00045013427734375, | |
| "learning_rate": 2.5806451612903226e-05, | |
| "loss": 5.020287990570068, | |
| "step": 193 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829313278198242, | |
| "distill_ce_weight": 0.4630859375, | |
| "distill_kd_loss": 0.0223388671875, | |
| "distill_kd_weight": 0.5369140625, | |
| "distill_temperature": 1.519921875, | |
| "epoch": 0.7578125, | |
| "grad_norm": 0.004302978515625, | |
| "learning_rate": 2.5403225806451615e-05, | |
| "loss": 5.026926517486572, | |
| "step": 194 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825671195983887, | |
| "distill_ce_weight": 0.463671875, | |
| "distill_kd_loss": 0.0230712890625, | |
| "distill_kd_weight": 0.536328125, | |
| "distill_temperature": 1.5148437499999998, | |
| "epoch": 0.76171875, | |
| "grad_norm": 0.0091552734375, | |
| "learning_rate": 2.5e-05, | |
| "loss": 5.031949043273926, | |
| "step": 195 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828692436218262, | |
| "distill_ce_weight": 0.4642578125, | |
| "distill_kd_loss": 0.0242919921875, | |
| "distill_kd_weight": 0.5357421875, | |
| "distill_temperature": 1.509765625, | |
| "epoch": 0.765625, | |
| "grad_norm": 0.0004253387451171875, | |
| "learning_rate": 2.4596774193548387e-05, | |
| "loss": 5.0403056144714355, | |
| "step": 196 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827781677246094, | |
| "distill_ce_weight": 0.46484375, | |
| "distill_kd_loss": 0.024658203125, | |
| "distill_kd_weight": 0.53515625, | |
| "distill_temperature": 1.5046875, | |
| "epoch": 0.76953125, | |
| "grad_norm": 0.015625, | |
| "learning_rate": 2.4193548387096777e-05, | |
| "loss": 5.046410083770752, | |
| "step": 197 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828465461730957, | |
| "distill_ce_weight": 0.4654296875, | |
| "distill_kd_loss": 0.025390625, | |
| "distill_kd_weight": 0.5345703125, | |
| "distill_temperature": 1.499609375, | |
| "epoch": 0.7734375, | |
| "grad_norm": 0.000553131103515625, | |
| "learning_rate": 2.3790322580645163e-05, | |
| "loss": 5.053439140319824, | |
| "step": 198 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828309059143066, | |
| "distill_ce_weight": 0.466015625, | |
| "distill_kd_loss": 0.0250244140625, | |
| "distill_kd_weight": 0.533984375, | |
| "distill_temperature": 1.4945312499999999, | |
| "epoch": 0.77734375, | |
| "grad_norm": 0.0012054443359375, | |
| "learning_rate": 2.338709677419355e-05, | |
| "loss": 5.05952787399292, | |
| "step": 199 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825377464294434, | |
| "distill_ce_weight": 0.4666015625, | |
| "distill_kd_loss": 0.026123046875, | |
| "distill_kd_weight": 0.5333984375, | |
| "distill_temperature": 1.489453125, | |
| "epoch": 0.78125, | |
| "grad_norm": 0.00109100341796875, | |
| "learning_rate": 2.2983870967741935e-05, | |
| "loss": 5.065053939819336, | |
| "step": 200 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826531410217285, | |
| "distill_ce_weight": 0.4671875, | |
| "distill_kd_loss": 0.0274658203125, | |
| "distill_kd_weight": 0.5328125, | |
| "distill_temperature": 1.484375, | |
| "epoch": 0.78125, | |
| "eval_loss": 5.072885513305664, | |
| "eval_runtime": 15.1986, | |
| "eval_samples_per_second": 4.211, | |
| "eval_steps_per_second": 4.211, | |
| "step": 200 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828163146972656, | |
| "distill_ce_weight": 0.4671875, | |
| "distill_kd_loss": 0.027587890625, | |
| "distill_kd_weight": 0.5328125, | |
| "distill_temperature": 1.484375, | |
| "epoch": 0.78515625, | |
| "grad_norm": 0.003814697265625, | |
| "learning_rate": 2.258064516129032e-05, | |
| "loss": 5.073492050170898, | |
| "step": 201 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826192855834961, | |
| "distill_ce_weight": 0.4677734375, | |
| "distill_kd_loss": 0.02734375, | |
| "distill_kd_weight": 0.5322265625, | |
| "distill_temperature": 1.479296875, | |
| "epoch": 0.7890625, | |
| "grad_norm": 0.00019931793212890625, | |
| "learning_rate": 2.217741935483871e-05, | |
| "loss": 5.078732013702393, | |
| "step": 202 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825226783752441, | |
| "distill_ce_weight": 0.468359375, | |
| "distill_kd_loss": 0.028076171875, | |
| "distill_kd_weight": 0.531640625, | |
| "distill_temperature": 1.47421875, | |
| "epoch": 0.79296875, | |
| "grad_norm": 0.000347137451171875, | |
| "learning_rate": 2.1774193548387097e-05, | |
| "loss": 5.085050106048584, | |
| "step": 203 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82629680633545, | |
| "distill_ce_weight": 0.4689453125, | |
| "distill_kd_loss": 0.0281982421875, | |
| "distill_kd_weight": 0.5310546875, | |
| "distill_temperature": 1.4691406249999999, | |
| "epoch": 0.796875, | |
| "grad_norm": 0.007110595703125, | |
| "learning_rate": 2.1370967741935487e-05, | |
| "loss": 5.09189510345459, | |
| "step": 204 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826847076416016, | |
| "distill_ce_weight": 0.46953125, | |
| "distill_kd_loss": 0.029541015625, | |
| "distill_kd_weight": 0.53046875, | |
| "distill_temperature": 1.4640625, | |
| "epoch": 0.80078125, | |
| "grad_norm": 0.00113677978515625, | |
| "learning_rate": 2.0967741935483873e-05, | |
| "loss": 5.099167823791504, | |
| "step": 205 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829412460327148, | |
| "distill_ce_weight": 0.4701171875, | |
| "distill_kd_loss": 0.030517578125, | |
| "distill_kd_weight": 0.5298828125, | |
| "distill_temperature": 1.458984375, | |
| "epoch": 0.8046875, | |
| "grad_norm": 0.000423431396484375, | |
| "learning_rate": 2.056451612903226e-05, | |
| "loss": 5.107206344604492, | |
| "step": 206 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826083183288574, | |
| "distill_ce_weight": 0.470703125, | |
| "distill_kd_loss": 0.0301513671875, | |
| "distill_kd_weight": 0.529296875, | |
| "distill_temperature": 1.45390625, | |
| "epoch": 0.80859375, | |
| "grad_norm": 0.0023040771484375, | |
| "learning_rate": 2.0161290322580645e-05, | |
| "loss": 5.1118621826171875, | |
| "step": 207 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826690673828125, | |
| "distill_ce_weight": 0.4712890625, | |
| "distill_kd_loss": 0.03076171875, | |
| "distill_kd_weight": 0.5287109375, | |
| "distill_temperature": 1.448828125, | |
| "epoch": 0.8125, | |
| "grad_norm": 0.005859375, | |
| "learning_rate": 1.975806451612903e-05, | |
| "loss": 5.118736267089844, | |
| "step": 208 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82703685760498, | |
| "distill_ce_weight": 0.471875, | |
| "distill_kd_loss": 0.0308837890625, | |
| "distill_kd_weight": 0.528125, | |
| "distill_temperature": 1.4437499999999999, | |
| "epoch": 0.81640625, | |
| "grad_norm": 0.00032806396484375, | |
| "learning_rate": 1.935483870967742e-05, | |
| "loss": 5.125365734100342, | |
| "step": 209 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826313972473145, | |
| "distill_ce_weight": 0.4724609375, | |
| "distill_kd_loss": 0.031494140625, | |
| "distill_kd_weight": 0.5275390625, | |
| "distill_temperature": 1.438671875, | |
| "epoch": 0.8203125, | |
| "grad_norm": 0.00182342529296875, | |
| "learning_rate": 1.8951612903225807e-05, | |
| "loss": 5.1316118240356445, | |
| "step": 210 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829218864440918, | |
| "distill_ce_weight": 0.473046875, | |
| "distill_kd_loss": 0.03271484375, | |
| "distill_kd_weight": 0.526953125, | |
| "distill_temperature": 1.43359375, | |
| "epoch": 0.82421875, | |
| "grad_norm": 0.00592041015625, | |
| "learning_rate": 1.8548387096774193e-05, | |
| "loss": 5.139939785003662, | |
| "step": 211 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826104164123535, | |
| "distill_ce_weight": 0.4736328125, | |
| "distill_kd_loss": 0.03271484375, | |
| "distill_kd_weight": 0.5263671875, | |
| "distill_temperature": 1.428515625, | |
| "epoch": 0.828125, | |
| "grad_norm": 0.0030364990234375, | |
| "learning_rate": 1.8145161290322583e-05, | |
| "loss": 5.144810199737549, | |
| "step": 212 | |
| }, | |
| { | |
| "distill_ce_loss": 10.83273696899414, | |
| "distill_ce_weight": 0.47421875, | |
| "distill_kd_loss": 0.033935546875, | |
| "distill_kd_weight": 0.52578125, | |
| "distill_temperature": 1.4234375, | |
| "epoch": 0.83203125, | |
| "grad_norm": 0.000957489013671875, | |
| "learning_rate": 1.774193548387097e-05, | |
| "loss": 5.154909133911133, | |
| "step": 213 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829684257507324, | |
| "distill_ce_weight": 0.4748046875, | |
| "distill_kd_loss": 0.03466796875, | |
| "distill_kd_weight": 0.5251953125, | |
| "distill_temperature": 1.4183593749999999, | |
| "epoch": 0.8359375, | |
| "grad_norm": 0.0009765625, | |
| "learning_rate": 1.733870967741936e-05, | |
| "loss": 5.160173416137695, | |
| "step": 214 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829045295715332, | |
| "distill_ce_weight": 0.47539062499999996, | |
| "distill_kd_loss": 0.03466796875, | |
| "distill_kd_weight": 0.524609375, | |
| "distill_temperature": 1.41328125, | |
| "epoch": 0.83984375, | |
| "grad_norm": 0.0004596710205078125, | |
| "learning_rate": 1.693548387096774e-05, | |
| "loss": 5.166214942932129, | |
| "step": 215 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825078964233398, | |
| "distill_ce_weight": 0.4759765625, | |
| "distill_kd_loss": 0.035400390625, | |
| "distill_kd_weight": 0.5240234375, | |
| "distill_temperature": 1.408203125, | |
| "epoch": 0.84375, | |
| "grad_norm": 0.0006256103515625, | |
| "learning_rate": 1.653225806451613e-05, | |
| "loss": 5.171038627624512, | |
| "step": 216 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829195976257324, | |
| "distill_ce_weight": 0.4765625, | |
| "distill_kd_loss": 0.0361328125, | |
| "distill_kd_weight": 0.5234375, | |
| "distill_temperature": 1.403125, | |
| "epoch": 0.84765625, | |
| "grad_norm": 0.000316619873046875, | |
| "learning_rate": 1.6129032258064517e-05, | |
| "loss": 5.179709434509277, | |
| "step": 217 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827247619628906, | |
| "distill_ce_weight": 0.4771484375, | |
| "distill_kd_loss": 0.035888671875, | |
| "distill_kd_weight": 0.5228515625, | |
| "distill_temperature": 1.398046875, | |
| "epoch": 0.8515625, | |
| "grad_norm": 0.0021514892578125, | |
| "learning_rate": 1.5725806451612903e-05, | |
| "loss": 5.185003280639648, | |
| "step": 218 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824097633361816, | |
| "distill_ce_weight": 0.477734375, | |
| "distill_kd_loss": 0.0361328125, | |
| "distill_kd_weight": 0.522265625, | |
| "distill_temperature": 1.3929687499999999, | |
| "epoch": 0.85546875, | |
| "grad_norm": 0.0019073486328125, | |
| "learning_rate": 1.5322580645161292e-05, | |
| "loss": 5.189964771270752, | |
| "step": 219 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828665733337402, | |
| "distill_ce_weight": 0.47832031249999996, | |
| "distill_kd_loss": 0.03759765625, | |
| "distill_kd_weight": 0.5216796875, | |
| "distill_temperature": 1.387890625, | |
| "epoch": 0.859375, | |
| "grad_norm": 0.0026092529296875, | |
| "learning_rate": 1.4919354838709679e-05, | |
| "loss": 5.19922399520874, | |
| "step": 220 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828384399414062, | |
| "distill_ce_weight": 0.47890625, | |
| "distill_kd_loss": 0.037109375, | |
| "distill_kd_weight": 0.52109375, | |
| "distill_temperature": 1.3828125, | |
| "epoch": 0.86328125, | |
| "grad_norm": 0.0140380859375, | |
| "learning_rate": 1.4516129032258066e-05, | |
| "loss": 5.205068111419678, | |
| "step": 221 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828526496887207, | |
| "distill_ce_weight": 0.4794921875, | |
| "distill_kd_loss": 0.03857421875, | |
| "distill_kd_weight": 0.5205078125, | |
| "distill_temperature": 1.377734375, | |
| "epoch": 0.8671875, | |
| "grad_norm": 0.000518798828125, | |
| "learning_rate": 1.4112903225806454e-05, | |
| "loss": 5.212213516235352, | |
| "step": 222 | |
| }, | |
| { | |
| "distill_ce_loss": 10.830307960510254, | |
| "distill_ce_weight": 0.480078125, | |
| "distill_kd_loss": 0.038330078125, | |
| "distill_kd_weight": 0.519921875, | |
| "distill_temperature": 1.37265625, | |
| "epoch": 0.87109375, | |
| "grad_norm": 0.0032196044921875, | |
| "learning_rate": 1.3709677419354839e-05, | |
| "loss": 5.219291687011719, | |
| "step": 223 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827667236328125, | |
| "distill_ce_weight": 0.4806640625, | |
| "distill_kd_loss": 0.039306640625, | |
| "distill_kd_weight": 0.5193359375, | |
| "distill_temperature": 1.3675781249999999, | |
| "epoch": 0.875, | |
| "grad_norm": 0.016357421875, | |
| "learning_rate": 1.3306451612903225e-05, | |
| "loss": 5.224856376647949, | |
| "step": 224 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828167915344238, | |
| "distill_ce_weight": 0.48125, | |
| "distill_kd_loss": 0.03955078125, | |
| "distill_kd_weight": 0.51875, | |
| "distill_temperature": 1.3625, | |
| "epoch": 0.87890625, | |
| "grad_norm": 0.0008087158203125, | |
| "learning_rate": 1.2903225806451613e-05, | |
| "loss": 5.231563568115234, | |
| "step": 225 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826531410217285, | |
| "distill_ce_weight": 0.4818359375, | |
| "distill_kd_loss": 0.040283203125, | |
| "distill_kd_weight": 0.5181640625, | |
| "distill_temperature": 1.357421875, | |
| "epoch": 0.87890625, | |
| "eval_loss": 5.237745761871338, | |
| "eval_runtime": 12.9169, | |
| "eval_samples_per_second": 4.955, | |
| "eval_steps_per_second": 4.955, | |
| "step": 225 | |
| }, | |
| { | |
| "distill_ce_loss": 10.8232421875, | |
| "distill_ce_weight": 0.4818359375, | |
| "distill_kd_loss": 0.040283203125, | |
| "distill_kd_weight": 0.5181640625, | |
| "distill_temperature": 1.357421875, | |
| "epoch": 0.8828125, | |
| "grad_norm": 0.0020599365234375, | |
| "learning_rate": 1.25e-05, | |
| "loss": 5.23590087890625, | |
| "step": 226 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828264236450195, | |
| "distill_ce_weight": 0.482421875, | |
| "distill_kd_loss": 0.04052734375, | |
| "distill_kd_weight": 0.517578125, | |
| "distill_temperature": 1.35234375, | |
| "epoch": 0.88671875, | |
| "grad_norm": 0.030029296875, | |
| "learning_rate": 1.2096774193548388e-05, | |
| "loss": 5.244787693023682, | |
| "step": 227 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828550338745117, | |
| "distill_ce_weight": 0.4830078125, | |
| "distill_kd_loss": 0.040283203125, | |
| "distill_kd_weight": 0.5169921875, | |
| "distill_temperature": 1.347265625, | |
| "epoch": 0.890625, | |
| "grad_norm": 0.01513671875, | |
| "learning_rate": 1.1693548387096775e-05, | |
| "loss": 5.251148700714111, | |
| "step": 228 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828879356384277, | |
| "distill_ce_weight": 0.48359375, | |
| "distill_kd_loss": 0.041015625, | |
| "distill_kd_weight": 0.51640625, | |
| "distill_temperature": 1.3421874999999999, | |
| "epoch": 0.89453125, | |
| "grad_norm": 0.002685546875, | |
| "learning_rate": 1.129032258064516e-05, | |
| "loss": 5.258018970489502, | |
| "step": 229 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825315475463867, | |
| "distill_ce_weight": 0.4841796875, | |
| "distill_kd_loss": 0.041748046875, | |
| "distill_kd_weight": 0.5158203125, | |
| "distill_temperature": 1.337109375, | |
| "epoch": 0.8984375, | |
| "grad_norm": 0.000537872314453125, | |
| "learning_rate": 1.0887096774193549e-05, | |
| "loss": 5.262882232666016, | |
| "step": 230 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828563690185547, | |
| "distill_ce_weight": 0.484765625, | |
| "distill_kd_loss": 0.042236328125, | |
| "distill_kd_weight": 0.515234375, | |
| "distill_temperature": 1.33203125, | |
| "epoch": 0.90234375, | |
| "grad_norm": 0.00592041015625, | |
| "learning_rate": 1.0483870967741936e-05, | |
| "loss": 5.27104377746582, | |
| "step": 231 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82773208618164, | |
| "distill_ce_weight": 0.4853515625, | |
| "distill_kd_loss": 0.042236328125, | |
| "distill_kd_weight": 0.5146484375, | |
| "distill_temperature": 1.326953125, | |
| "epoch": 0.90625, | |
| "grad_norm": 0.00063323974609375, | |
| "learning_rate": 1.0080645161290323e-05, | |
| "loss": 5.276985168457031, | |
| "step": 232 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827362060546875, | |
| "distill_ce_weight": 0.4859375, | |
| "distill_kd_loss": 0.04296875, | |
| "distill_kd_weight": 0.5140625, | |
| "distill_temperature": 1.321875, | |
| "epoch": 0.91015625, | |
| "grad_norm": 0.0025482177734375, | |
| "learning_rate": 9.67741935483871e-06, | |
| "loss": 5.283515930175781, | |
| "step": 233 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828764915466309, | |
| "distill_ce_weight": 0.4865234375, | |
| "distill_kd_loss": 0.04345703125, | |
| "distill_kd_weight": 0.5134765625, | |
| "distill_temperature": 1.3167968749999999, | |
| "epoch": 0.9140625, | |
| "grad_norm": 0.0027618408203125, | |
| "learning_rate": 9.274193548387097e-06, | |
| "loss": 5.2907867431640625, | |
| "step": 234 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827953338623047, | |
| "distill_ce_weight": 0.487109375, | |
| "distill_kd_loss": 0.04443359375, | |
| "distill_kd_weight": 0.512890625, | |
| "distill_temperature": 1.31171875, | |
| "epoch": 0.91796875, | |
| "grad_norm": 0.00012874603271484375, | |
| "learning_rate": 8.870967741935484e-06, | |
| "loss": 5.297224521636963, | |
| "step": 235 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826726913452148, | |
| "distill_ce_weight": 0.4876953125, | |
| "distill_kd_loss": 0.044677734375, | |
| "distill_kd_weight": 0.5123046875, | |
| "distill_temperature": 1.306640625, | |
| "epoch": 0.921875, | |
| "grad_norm": 0.0003662109375, | |
| "learning_rate": 8.46774193548387e-06, | |
| "loss": 5.303092956542969, | |
| "step": 236 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828742980957031, | |
| "distill_ce_weight": 0.48828125, | |
| "distill_kd_loss": 0.04443359375, | |
| "distill_kd_weight": 0.51171875, | |
| "distill_temperature": 1.3015625, | |
| "epoch": 0.92578125, | |
| "grad_norm": 0.00115966796875, | |
| "learning_rate": 8.064516129032258e-06, | |
| "loss": 5.310177326202393, | |
| "step": 237 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826173782348633, | |
| "distill_ce_weight": 0.4888671875, | |
| "distill_kd_loss": 0.044677734375, | |
| "distill_kd_weight": 0.5111328125, | |
| "distill_temperature": 1.296484375, | |
| "epoch": 0.9296875, | |
| "grad_norm": 0.0027313232421875, | |
| "learning_rate": 7.661290322580646e-06, | |
| "loss": 5.315388202667236, | |
| "step": 238 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829797744750977, | |
| "distill_ce_weight": 0.489453125, | |
| "distill_kd_loss": 0.045654296875, | |
| "distill_kd_weight": 0.510546875, | |
| "distill_temperature": 1.2914062499999999, | |
| "epoch": 0.93359375, | |
| "grad_norm": 0.00051116943359375, | |
| "learning_rate": 7.258064516129033e-06, | |
| "loss": 5.323993682861328, | |
| "step": 239 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828352928161621, | |
| "distill_ce_weight": 0.4900390625, | |
| "distill_kd_loss": 0.0458984375, | |
| "distill_kd_weight": 0.5099609375, | |
| "distill_temperature": 1.286328125, | |
| "epoch": 0.9375, | |
| "grad_norm": 0.00052642822265625, | |
| "learning_rate": 6.854838709677419e-06, | |
| "loss": 5.329753398895264, | |
| "step": 240 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826845169067383, | |
| "distill_ce_weight": 0.490625, | |
| "distill_kd_loss": 0.045654296875, | |
| "distill_kd_weight": 0.509375, | |
| "distill_temperature": 1.28125, | |
| "epoch": 0.94140625, | |
| "grad_norm": 0.00087738037109375, | |
| "learning_rate": 6.451612903225806e-06, | |
| "loss": 5.335236072540283, | |
| "step": 241 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826852798461914, | |
| "distill_ce_weight": 0.4912109375, | |
| "distill_kd_loss": 0.045654296875, | |
| "distill_kd_weight": 0.5087890625, | |
| "distill_temperature": 1.276171875, | |
| "epoch": 0.9453125, | |
| "grad_norm": 0.00107574462890625, | |
| "learning_rate": 6.048387096774194e-06, | |
| "loss": 5.341461658477783, | |
| "step": 242 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829383850097656, | |
| "distill_ce_weight": 0.491796875, | |
| "distill_kd_loss": 0.0458984375, | |
| "distill_kd_weight": 0.508203125, | |
| "distill_temperature": 1.27109375, | |
| "epoch": 0.94921875, | |
| "grad_norm": 0.0030670166015625, | |
| "learning_rate": 5.64516129032258e-06, | |
| "loss": 5.349172592163086, | |
| "step": 243 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826229095458984, | |
| "distill_ce_weight": 0.4923828125, | |
| "distill_kd_loss": 0.046875, | |
| "distill_kd_weight": 0.5076171875, | |
| "distill_temperature": 1.2660156249999999, | |
| "epoch": 0.953125, | |
| "grad_norm": 0.006622314453125, | |
| "learning_rate": 5.241935483870968e-06, | |
| "loss": 5.354453086853027, | |
| "step": 244 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825600624084473, | |
| "distill_ce_weight": 0.49296875, | |
| "distill_kd_loss": 0.04736328125, | |
| "distill_kd_weight": 0.50703125, | |
| "distill_temperature": 1.2609375, | |
| "epoch": 0.95703125, | |
| "grad_norm": 0.00091552734375, | |
| "learning_rate": 4.838709677419355e-06, | |
| "loss": 5.3607306480407715, | |
| "step": 245 | |
| }, | |
| { | |
| "distill_ce_loss": 10.824897766113281, | |
| "distill_ce_weight": 0.4935546875, | |
| "distill_kd_loss": 0.0478515625, | |
| "distill_kd_weight": 0.5064453125, | |
| "distill_temperature": 1.255859375, | |
| "epoch": 0.9609375, | |
| "grad_norm": 0.000568389892578125, | |
| "learning_rate": 4.435483870967742e-06, | |
| "loss": 5.366971015930176, | |
| "step": 246 | |
| }, | |
| { | |
| "distill_ce_loss": 10.827354431152344, | |
| "distill_ce_weight": 0.494140625, | |
| "distill_kd_loss": 0.047607421875, | |
| "distill_kd_weight": 0.505859375, | |
| "distill_temperature": 1.25078125, | |
| "epoch": 0.96484375, | |
| "grad_norm": 0.000457763671875, | |
| "learning_rate": 4.032258064516129e-06, | |
| "loss": 5.374283313751221, | |
| "step": 247 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826087951660156, | |
| "distill_ce_weight": 0.4947265625, | |
| "distill_kd_loss": 0.0478515625, | |
| "distill_kd_weight": 0.5052734375, | |
| "distill_temperature": 1.245703125, | |
| "epoch": 0.96875, | |
| "grad_norm": 0.0004558563232421875, | |
| "learning_rate": 3.6290322580645166e-06, | |
| "loss": 5.380123138427734, | |
| "step": 248 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825430870056152, | |
| "distill_ce_weight": 0.4953125, | |
| "distill_kd_loss": 0.048095703125, | |
| "distill_kd_weight": 0.5046875, | |
| "distill_temperature": 1.2406249999999999, | |
| "epoch": 0.97265625, | |
| "grad_norm": 0.000820159912109375, | |
| "learning_rate": 3.225806451612903e-06, | |
| "loss": 5.386263370513916, | |
| "step": 249 | |
| }, | |
| { | |
| "distill_ce_loss": 10.829666137695312, | |
| "distill_ce_weight": 0.4958984375, | |
| "distill_kd_loss": 0.048095703125, | |
| "distill_kd_weight": 0.5041015625, | |
| "distill_temperature": 1.235546875, | |
| "epoch": 0.9765625, | |
| "grad_norm": 0.00061798095703125, | |
| "learning_rate": 2.82258064516129e-06, | |
| "loss": 5.3947062492370605, | |
| "step": 250 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826531410217285, | |
| "distill_ce_weight": 0.496484375, | |
| "distill_kd_loss": 0.04833984375, | |
| "distill_kd_weight": 0.503515625, | |
| "distill_temperature": 1.23046875, | |
| "epoch": 0.9765625, | |
| "eval_loss": 5.399942398071289, | |
| "eval_runtime": 13.5232, | |
| "eval_samples_per_second": 4.733, | |
| "eval_steps_per_second": 4.733, | |
| "step": 250 | |
| }, | |
| { | |
| "distill_ce_loss": 10.832559585571289, | |
| "distill_ce_weight": 0.496484375, | |
| "distill_kd_loss": 0.04833984375, | |
| "distill_kd_weight": 0.503515625, | |
| "distill_temperature": 1.23046875, | |
| "epoch": 0.98046875, | |
| "grad_norm": 0.001220703125, | |
| "learning_rate": 2.4193548387096776e-06, | |
| "loss": 5.402488708496094, | |
| "step": 251 | |
| }, | |
| { | |
| "distill_ce_loss": 10.828208923339844, | |
| "distill_ce_weight": 0.4970703125, | |
| "distill_kd_loss": 0.048828125, | |
| "distill_kd_weight": 0.5029296875, | |
| "distill_temperature": 1.225390625, | |
| "epoch": 0.984375, | |
| "grad_norm": 0.0022430419921875, | |
| "learning_rate": 2.0161290322580646e-06, | |
| "loss": 5.406917095184326, | |
| "step": 252 | |
| }, | |
| { | |
| "distill_ce_loss": 10.82684326171875, | |
| "distill_ce_weight": 0.49765625, | |
| "distill_kd_loss": 0.0498046875, | |
| "distill_kd_weight": 0.50234375, | |
| "distill_temperature": 1.2203125, | |
| "epoch": 0.98828125, | |
| "grad_norm": 0.000362396240234375, | |
| "learning_rate": 1.6129032258064516e-06, | |
| "loss": 5.4130706787109375, | |
| "step": 253 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826776504516602, | |
| "distill_ce_weight": 0.4982421875, | |
| "distill_kd_loss": 0.04931640625, | |
| "distill_kd_weight": 0.5017578125, | |
| "distill_temperature": 1.2152343749999999, | |
| "epoch": 0.9921875, | |
| "grad_norm": 0.00125885009765625, | |
| "learning_rate": 1.2096774193548388e-06, | |
| "loss": 5.419137001037598, | |
| "step": 254 | |
| }, | |
| { | |
| "distill_ce_loss": 10.826760292053223, | |
| "distill_ce_weight": 0.498828125, | |
| "distill_kd_loss": 0.04931640625, | |
| "distill_kd_weight": 0.501171875, | |
| "distill_temperature": 1.21015625, | |
| "epoch": 0.99609375, | |
| "grad_norm": 0.0029449462890625, | |
| "learning_rate": 8.064516129032258e-07, | |
| "loss": 5.425350666046143, | |
| "step": 255 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825718879699707, | |
| "distill_ce_weight": 0.4994140625, | |
| "distill_kd_loss": 0.04931640625, | |
| "distill_kd_weight": 0.5005859375, | |
| "distill_temperature": 1.205078125, | |
| "epoch": 1.0, | |
| "grad_norm": 0.00159454345703125, | |
| "learning_rate": 4.032258064516129e-07, | |
| "loss": 5.431174278259277, | |
| "step": 256 | |
| }, | |
| { | |
| "distill_ce_loss": 10.825718879699707, | |
| "distill_ce_weight": 0.4994140625, | |
| "distill_kd_loss": 0.04931640625, | |
| "distill_kd_weight": 0.5005859375, | |
| "distill_temperature": 1.205078125, | |
| "epoch": 1.0, | |
| "step": 256, | |
| "total_flos": 42322071132.0, | |
| "train_loss": 4.595640664920211, | |
| "train_runtime": 251.849, | |
| "train_samples_per_second": 1.016, | |
| "train_steps_per_second": 1.016 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 256, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 25, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 42322071132.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |