/home/yuqian_fu here1 here2 {'gpu': '0', 'data': 'mnist', 'ntr': None, 'translate': None, 'autoaug': 'CA_multiple', 'n': 3, 'stride': 3, 'factor_num': 14, 'epochs': 500, 'nbatch': 100, 'batchsize': 32, 'lr': 0.0001, 'lr_scheduler': 'Step', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep500_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1_eps3', 'clsadapt': True, 'lambda_causal': 1.0, 'lambda_re': 1.0, 'randm': True, 'randn': True, 'network': 'resnet18'} stride: 3 --------------------------CA_multiple-------------------------- ---------------------------14 factors----------------- randm: True randn: True n: 3 randm: False Epoch 1, weight, value: tensor([[-0.0094, -0.0244, 0.0308, ..., 0.0211, 0.0115, 0.0051], [-0.0293, 0.0002, 0.0169, ..., 0.0102, -0.0280, 0.0046], [ 0.0241, 0.0062, 0.0289, ..., 0.0189, -0.0016, -0.0282], ..., [ 0.0229, -0.0107, -0.0058, ..., 0.0187, 0.0176, -0.0031], [ 0.0023, -0.0012, 0.0263, ..., -0.0122, -0.0144, 0.0034], [-0.0090, -0.0070, 0.0110, ..., 0.0217, 0.0004, -0.0025]], device='cuda:0'), grad: None Epoch 1, bias, value: tensor([-0.0112, -0.0106, -0.0244, -0.0266, -0.0124, 0.0053, 0.0089, -0.0239, -0.0148, 0.0090], device='cuda:0'), grad: None 100 0.0001 changing lr ---------------------saving model at epoch 0---------------------------------------------------- epoch 0, time 223.10, cls_loss 1.1838 cls_loss_mapping 1.7620 cls_loss_causal 2.2054 re_mapping 0.1758 re_causal 0.1904 /// teacc 87.96 lr 0.00010000 Epoch 2, weight, value: tensor([[-0.0102, -0.0302, 0.0347, ..., 0.0147, 0.0101, 0.0058], [-0.0288, 0.0037, 0.0112, ..., 0.0028, -0.0296, 0.0049], [ 0.0232, 0.0029, 0.0223, ..., 0.0115, -0.0036, -0.0281], ..., [ 0.0221, -0.0058, -0.0028, ..., 0.0245, 0.0157, -0.0037], [ 0.0014, -0.0071, 0.0280, ..., -0.0174, -0.0159, 0.0027], [-0.0098, -0.0099, 0.0150, ..., 0.0273, -0.0014, -0.0019]], device='cuda:0'), grad: tensor([[ 0.0000, -0.0006, -0.0244, ..., -0.0132, 0.0000, 0.0000], [ 0.0000, -0.0006, 0.0034, ..., 0.0035, 0.0000, 0.0000], [ 0.0000, 0.0057, 0.0125, ..., 0.0074, 0.0000, 0.0000], ..., [ 0.0000, -0.0370, -0.0328, ..., -0.0488, 0.0000, 0.0000], [ 0.0000, 0.0048, -0.0117, ..., 0.0083, 0.0000, 0.0000], [ 0.0000, 0.0233, 0.0095, ..., 0.0180, 0.0000, 0.0000]], device='cuda:0') Epoch 2, bias, value: tensor([-0.0133, -0.0088, -0.0258, -0.0262, -0.0123, 0.0054, 0.0089, -0.0243, -0.0152, 0.0095], device='cuda:0'), grad: tensor([-0.0223, 0.0052, -0.0102, 0.0014, 0.0158, 0.0041, 0.0057, -0.0280, 0.0059, 0.0224], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 1---------------------------------------------------- epoch 1, time 222.14, cls_loss 0.3420 cls_loss_mapping 0.6896 cls_loss_causal 1.8949 re_mapping 0.2106 re_causal 0.2836 /// teacc 93.35 lr 0.00010000 Epoch 3, weight, value: tensor([[-0.0102, -0.0333, 0.0370, ..., 0.0127, 0.0100, 0.0040], [-0.0288, 0.0042, 0.0085, ..., -0.0009, -0.0296, 0.0020], [ 0.0232, 0.0022, 0.0201, ..., 0.0104, -0.0037, -0.0317], ..., [ 0.0221, -0.0019, -0.0017, ..., 0.0269, 0.0157, -0.0065], [ 0.0014, -0.0097, 0.0313, ..., -0.0202, -0.0159, 0.0017], [-0.0098, -0.0135, 0.0188, ..., 0.0291, -0.0014, -0.0024]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.3562e-04, -1.3481e-02, ..., 1.4420e-03, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -1.4663e-04, -1.0521e-02, ..., 3.3236e-04, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.9564e-04, 4.7150e-03, ..., 2.2984e-03, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -5.7411e-03, 7.5054e-04, ..., -4.2439e-05, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.5196e-03, 2.0004e-02, ..., 9.8267e-03, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 8.3160e-03, -1.0891e-03, ..., 8.2626e-03, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 3, bias, value: tensor([-0.0136, -0.0088, -0.0263, -0.0266, -0.0123, 0.0060, 0.0084, -0.0246, -0.0148, 0.0101], device='cuda:0'), grad: tensor([-0.0090, -0.0100, 0.0161, -0.0004, -0.0300, -0.0151, 0.0059, 0.0083, 0.0104, 0.0238], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 2---------------------------------------------------- epoch 2, time 222.14, cls_loss 0.2206 cls_loss_mapping 0.4132 cls_loss_causal 1.6410 re_mapping 0.1505 re_causal 0.2434 /// teacc 95.27 lr 0.00010000 Epoch 4, weight, value: tensor([[-0.0102, -0.0355, 0.0397, ..., 0.0110, 0.0100, 0.0043], [-0.0288, 0.0033, 0.0065, ..., -0.0033, -0.0296, 0.0016], [ 0.0232, 0.0019, 0.0186, ..., 0.0091, -0.0037, -0.0322], ..., [ 0.0221, 0.0012, -0.0015, ..., 0.0283, 0.0157, -0.0069], [ 0.0014, -0.0113, 0.0332, ..., -0.0215, -0.0160, 0.0015], [-0.0098, -0.0165, 0.0203, ..., 0.0301, -0.0014, -0.0025]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -4.3035e-04, -4.9553e-03, ..., 7.9489e-04, 0.0000e+00, 1.5929e-05], [ 0.0000e+00, 3.3784e-04, -2.4147e-03, ..., 1.3103e-03, 0.0000e+00, 4.8950e-06], [ 0.0000e+00, 2.2662e-04, -6.0081e-03, ..., 1.0223e-03, 0.0000e+00, 4.8205e-06], ..., [ 0.0000e+00, -4.8876e-04, 2.9030e-03, ..., -3.5572e-04, 0.0000e+00, 2.2855e-06], [ 0.0000e+00, 1.3504e-03, 7.3395e-03, ..., 5.1498e-03, 0.0000e+00, -1.3342e-03], [ 0.0000e+00, 1.2960e-03, -1.7517e-02, ..., -1.7960e-02, 0.0000e+00, 1.3202e-05]], device='cuda:0') Epoch 4, bias, value: tensor([-0.0129, -0.0087, -0.0266, -0.0261, -0.0122, 0.0059, 0.0081, -0.0248, -0.0148, 0.0098], device='cuda:0'), grad: tensor([ 0.0034, -0.0030, -0.0192, 0.0032, 0.0040, -0.0036, 0.0095, 0.0063, 0.0117, -0.0123], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 3---------------------------------------------------- epoch 3, time 222.10, cls_loss 0.1628 cls_loss_mapping 0.2806 cls_loss_causal 1.4610 re_mapping 0.1172 re_causal 0.2083 /// teacc 96.23 lr 0.00010000 Epoch 5, weight, value: tensor([[-0.0102, -0.0371, 0.0407, ..., 0.0088, 0.0085, 0.0027], [-0.0288, 0.0017, 0.0046, ..., -0.0058, -0.0314, -0.0096], [ 0.0232, 0.0019, 0.0177, ..., 0.0077, -0.0053, -0.0411], ..., [ 0.0221, 0.0029, -0.0011, ..., 0.0296, 0.0140, -0.0111], [ 0.0014, -0.0124, 0.0343, ..., -0.0232, -0.0198, 0.0009], [-0.0098, -0.0180, 0.0221, ..., 0.0316, -0.0029, -0.0077]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.4312e-04, 1.6203e-03, ..., 1.5535e-03, 0.0000e+00, 3.6502e-04], [ 0.0000e+00, 2.3639e-04, -1.1816e-03, ..., 5.6601e-04, 0.0000e+00, 8.8155e-05], [ 0.0000e+00, 1.7805e-03, 2.1248e-03, ..., 1.9646e-03, 0.0000e+00, 1.6415e-04], ..., [ 0.0000e+00, -2.6798e-03, 1.2650e-02, ..., 4.8103e-03, 0.0000e+00, 1.2474e-03], [ 0.0000e+00, 1.2999e-03, 7.7782e-03, ..., 5.3101e-03, 0.0000e+00, 5.9986e-04], [ 0.0000e+00, 2.3403e-03, -2.6749e-02, ..., -2.0981e-02, 0.0000e+00, -1.9569e-03]], device='cuda:0') Epoch 5, bias, value: tensor([-0.0128, -0.0091, -0.0265, -0.0258, -0.0123, 0.0058, 0.0079, -0.0247, -0.0145, 0.0098], device='cuda:0'), grad: tensor([ 0.0049, -0.0069, 0.0021, -0.0152, 0.0052, 0.0053, 0.0014, 0.0087, 0.0106, -0.0161], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 4---------------------------------------------------- epoch 4, time 222.46, cls_loss 0.1178 cls_loss_mapping 0.2042 cls_loss_causal 1.3494 re_mapping 0.0931 re_causal 0.1854 /// teacc 96.63 lr 0.00010000 Epoch 6, weight, value: tensor([[-0.0102, -0.0377, 0.0420, ..., 0.0074, 0.0108, 0.0013], [-0.0288, 0.0008, 0.0034, ..., -0.0066, -0.0332, -0.0179], [ 0.0232, 0.0011, 0.0173, ..., 0.0065, -0.0113, -0.0469], ..., [ 0.0221, 0.0053, -0.0012, ..., 0.0306, 0.0100, -0.0171], [ 0.0014, -0.0133, 0.0352, ..., -0.0243, -0.0199, 0.0053], [-0.0098, -0.0200, 0.0240, ..., 0.0326, -0.0083, -0.0096]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.0282e-04, -5.1117e-04, ..., 1.9741e-04, -3.6335e-04, 3.5167e-05], [ 0.0000e+00, 5.9986e-04, 1.0345e-02, ..., 1.1072e-03, 4.6417e-06, 1.5659e-03], [ 0.0000e+00, 1.9169e-03, 7.7591e-03, ..., 2.5730e-03, 9.7036e-05, 7.6771e-05], ..., [ 0.0000e+00, -3.6316e-03, -2.2903e-02, ..., -7.1907e-03, 5.6893e-05, -9.1743e-04], [ 0.0000e+00, 1.2264e-03, -1.7147e-03, ..., 2.7752e-03, 5.6714e-05, -1.0956e-02], [ 0.0000e+00, 1.1806e-03, 1.5430e-03, ..., -3.5839e-03, 6.1214e-05, 1.1034e-03]], device='cuda:0') Epoch 6, bias, value: tensor([-0.0128, -0.0090, -0.0265, -0.0258, -0.0120, 0.0053, 0.0076, -0.0248, -0.0144, 0.0101], device='cuda:0'), grad: tensor([ 0.0010, 0.0238, -0.0146, -0.0152, 0.0044, 0.0085, 0.0042, -0.0163, 0.0046, -0.0004], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 5---------------------------------------------------- epoch 5, time 222.53, cls_loss 0.1038 cls_loss_mapping 0.1739 cls_loss_causal 1.2566 re_mapping 0.0784 re_causal 0.1645 /// teacc 97.31 lr 0.00010000 Epoch 7, weight, value: tensor([[-0.0102, -0.0384, 0.0434, ..., 0.0062, 0.0153, 0.0021], [-0.0288, 0.0005, 0.0026, ..., -0.0087, -0.0386, -0.0227], [ 0.0232, 0.0002, 0.0165, ..., 0.0048, -0.0176, -0.0535], ..., [ 0.0221, 0.0068, -0.0014, ..., 0.0317, 0.0040, -0.0231], [ 0.0014, -0.0133, 0.0366, ..., -0.0250, -0.0232, 0.0117], [-0.0098, -0.0219, 0.0251, ..., 0.0333, -0.0145, -0.0131]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.7881e-03, -4.2877e-03, ..., 2.2125e-04, 1.0414e-02, 4.4227e-05], [ 0.0000e+00, 2.7919e-04, 1.3857e-03, ..., 1.9670e-04, 2.4247e-04, 2.6569e-03], [ 0.0000e+00, 4.4327e-03, 4.9744e-03, ..., 4.5853e-03, 7.8964e-04, 2.3270e-03], ..., [ 0.0000e+00, -7.0381e-03, -3.9024e-03, ..., -8.3694e-03, 3.3855e-04, 8.6784e-04], [ 0.0000e+00, 4.2295e-04, -3.5324e-03, ..., -4.2677e-04, 2.3890e-04, 1.1196e-03], [ 0.0000e+00, 1.8501e-03, 2.3727e-03, ..., 2.0180e-03, 2.3007e-04, 1.9274e-03]], device='cuda:0') Epoch 7, bias, value: tensor([-0.0128, -0.0089, -0.0266, -0.0257, -0.0118, 0.0051, 0.0076, -0.0250, -0.0140, 0.0098], device='cuda:0'), grad: tensor([ 0.0036, 0.0059, 0.0101, 0.0082, 0.0014, -0.0275, -0.0049, -0.0062, 0.0035, 0.0059], device='cuda:0') 100 0.0001 changing lr epoch 6, time 221.76, cls_loss 0.1073 cls_loss_mapping 0.1726 cls_loss_causal 1.2434 re_mapping 0.0640 re_causal 0.1467 /// teacc 97.28 lr 0.00010000 Epoch 8, weight, value: tensor([[-1.0243e-02, -3.9061e-02, 4.4652e-02, ..., 5.0099e-03, 1.7334e-02, 2.3563e-03], [-2.8781e-02, 4.5420e-05, 1.5794e-03, ..., -9.1317e-03, -4.5100e-02, -2.4899e-02], [ 2.3198e-02, 4.5113e-05, 1.5047e-02, ..., 3.7007e-03, -2.2945e-02, -6.0259e-02], ..., [ 2.2054e-02, 8.2435e-03, -1.5907e-03, ..., 3.2357e-02, -3.4719e-03, -2.6924e-02], [ 1.4234e-03, -1.4464e-02, 3.8041e-02, ..., -2.5661e-02, -2.9706e-02, 1.5350e-02], [-9.8281e-03, -2.2710e-02, 2.5992e-02, ..., 3.4042e-02, -2.3881e-02, -1.5346e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3997e-05, 6.1493e-03, ..., 3.3417e-03, -9.2983e-06, 3.4695e-03], [ 0.0000e+00, 2.1029e-04, 4.6468e-04, ..., 3.7789e-04, 1.6636e-07, 1.3638e-04], [ 0.0000e+00, 4.2892e-04, 3.8929e-03, ..., 1.2379e-03, 3.1888e-06, 1.6861e-03], ..., [ 0.0000e+00, -7.6628e-04, 7.6218e-03, ..., 2.9945e-03, 2.5518e-06, 4.9782e-03], [ 0.0000e+00, 1.6892e-04, -4.7607e-02, ..., -1.8585e-02, 3.1339e-07, -3.2349e-02], [ 0.0000e+00, 8.4877e-04, 1.2276e-02, ..., 8.6746e-03, 4.1490e-07, 9.2850e-03]], device='cuda:0') Epoch 8, bias, value: tensor([-0.0126, -0.0092, -0.0264, -0.0255, -0.0119, 0.0051, 0.0074, -0.0256, -0.0137, 0.0099], device='cuda:0'), grad: tensor([ 0.0132, 0.0012, 0.0076, -0.0029, -0.0009, 0.0133, -0.0075, 0.0056, -0.0480, 0.0183], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 7---------------------------------------------------- epoch 7, time 222.34, cls_loss 0.0913 cls_loss_mapping 0.1452 cls_loss_causal 1.1394 re_mapping 0.0562 re_causal 0.1284 /// teacc 97.83 lr 0.00010000 Epoch 9, weight, value: tensor([[-0.0102, -0.0393, 0.0454, ..., 0.0038, 0.0197, 0.0022], [-0.0288, -0.0001, 0.0007, ..., -0.0098, -0.0513, -0.0270], [ 0.0232, -0.0011, 0.0141, ..., 0.0022, -0.0221, -0.0638], ..., [ 0.0221, 0.0100, -0.0016, ..., 0.0340, -0.0119, -0.0300], [ 0.0014, -0.0152, 0.0391, ..., -0.0262, -0.0419, 0.0188], [-0.0098, -0.0246, 0.0267, ..., 0.0341, -0.0353, -0.0173]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.8358e-04, -7.5722e-03, ..., 1.1629e-04, 6.6519e-05, -1.9970e-03], [ 0.0000e+00, 4.9263e-05, 6.8521e-04, ..., 8.8751e-05, 1.5154e-05, 3.6860e-04], [ 0.0000e+00, -1.7989e-04, 2.6560e-04, ..., 6.4969e-05, 3.0234e-05, 1.6956e-03], ..., [ 0.0000e+00, -6.5374e-04, -3.3045e-04, ..., -1.7166e-03, 3.1859e-05, 3.7551e-04], [ 0.0000e+00, 1.2016e-04, -5.6419e-03, ..., -9.7215e-05, 8.0729e-04, -4.3335e-03], [ 0.0000e+00, 5.5838e-04, 3.0613e-03, ..., 2.4567e-03, 1.7536e-04, 2.4929e-03]], device='cuda:0') Epoch 9, bias, value: tensor([-0.0129, -0.0092, -0.0264, -0.0250, -0.0117, 0.0045, 0.0074, -0.0252, -0.0135, 0.0094], device='cuda:0'), grad: tensor([-0.0085, -0.0006, -0.0001, 0.0066, -0.0010, -0.0028, 0.0025, -0.0002, -0.0042, 0.0083], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 8---------------------------------------------------- epoch 8, time 222.29, cls_loss 0.0680 cls_loss_mapping 0.1142 cls_loss_causal 1.1269 re_mapping 0.0524 re_causal 0.1266 /// teacc 98.02 lr 0.00010000 Epoch 10, weight, value: tensor([[-1.0243e-02, -4.0390e-02, 4.6042e-02, ..., 2.5107e-03, 2.0465e-02, 3.1441e-03], [-2.8781e-02, -3.7975e-04, -5.2456e-06, ..., -9.3388e-03, -5.1943e-02, -2.9051e-02], [ 2.3198e-02, -8.8327e-04, 1.3205e-02, ..., 1.3086e-03, -2.2454e-02, -6.7880e-02], ..., [ 2.2054e-02, 1.0954e-02, -1.4651e-03, ..., 3.4792e-02, -1.3332e-02, -3.1514e-02], [ 1.4234e-03, -1.5573e-02, 4.0345e-02, ..., -2.6844e-02, -4.4691e-02, 2.1960e-02], [-9.8281e-03, -2.5898e-02, 2.7957e-02, ..., 3.4514e-02, -3.8534e-02, -1.8399e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -3.5465e-05, -1.1673e-03, ..., 1.3840e-04, -9.9480e-05, -8.4257e-04], [ 0.0000e+00, -6.3610e-04, 7.6056e-05, ..., 1.6302e-05, 6.7428e-07, 4.5300e-05], [ 0.0000e+00, 1.3542e-04, 4.6039e-04, ..., 1.9436e-03, 7.3351e-06, 2.2686e-04], ..., [ 0.0000e+00, -4.0233e-05, 1.3661e-04, ..., 1.3895e-03, 1.9968e-06, 1.1212e-04], [ 0.0000e+00, 7.7367e-05, 5.9962e-05, ..., 3.4189e-04, 5.2825e-06, 4.2152e-04], [ 0.0000e+00, 2.6083e-04, 2.4348e-05, ..., 2.0161e-03, 1.7941e-05, 1.7071e-04]], device='cuda:0') Epoch 10, bias, value: tensor([-0.0131, -0.0091, -0.0261, -0.0250, -0.0117, 0.0044, 0.0071, -0.0253, -0.0132, 0.0094], device='cuda:0'), grad: tensor([-0.0010, -0.0046, 0.0032, 0.0024, -0.0084, 0.0006, 0.0001, 0.0027, 0.0013, 0.0037], device='cuda:0') 100 0.0001 changing lr epoch 9, time 221.74, cls_loss 0.0747 cls_loss_mapping 0.1213 cls_loss_causal 1.0785 re_mapping 0.0462 re_causal 0.1112 /// teacc 97.36 lr 0.00010000 Epoch 11, weight, value: tensor([[-0.0102, -0.0406, 0.0467, ..., 0.0018, 0.0202, 0.0046], [-0.0288, -0.0011, -0.0010, ..., -0.0100, -0.0526, -0.0331], [ 0.0232, -0.0017, 0.0123, ..., -0.0002, -0.0234, -0.0721], ..., [ 0.0221, 0.0119, -0.0016, ..., 0.0356, -0.0156, -0.0328], [ 0.0014, -0.0156, 0.0417, ..., -0.0263, -0.0482, 0.0246], [-0.0098, -0.0269, 0.0285, ..., 0.0349, -0.0431, -0.0213]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.4687e-04, 3.8457e-04, ..., 5.7316e-04, -3.8981e-04, -2.1148e-04], [ 0.0000e+00, 9.1267e-04, 3.9077e-04, ..., 3.1400e-04, 2.1607e-05, 4.4227e-04], [ 0.0000e+00, -1.0710e-03, 3.6383e-04, ..., 3.7408e-04, 2.8744e-05, -3.4380e-04], ..., [ 0.0000e+00, -6.2895e-04, -2.0733e-03, ..., -2.6417e-03, 1.9297e-05, -2.7657e-04], [ 0.0000e+00, 4.4417e-04, 2.3518e-03, ..., 1.2255e-03, 3.2067e-04, 1.2140e-03], [ 0.0000e+00, 9.9754e-04, -1.2360e-03, ..., -5.3978e-04, 9.8825e-05, 8.8155e-05]], device='cuda:0') Epoch 11, bias, value: tensor([-0.0130, -0.0093, -0.0265, -0.0248, -0.0120, 0.0043, 0.0072, -0.0252, -0.0127, 0.0092], device='cuda:0'), grad: tensor([ 0.0005, 0.0089, -0.0119, -0.0398, -0.0009, 0.0359, 0.0014, -0.0014, 0.0058, 0.0016], device='cuda:0') 100 0.0001 changing lr epoch 10, time 221.48, cls_loss 0.0622 cls_loss_mapping 0.1063 cls_loss_causal 1.0394 re_mapping 0.0428 re_causal 0.1072 /// teacc 98.01 lr 0.00010000 Epoch 12, weight, value: tensor([[-0.0102, -0.0417, 0.0473, ..., 0.0008, 0.0199, 0.0058], [-0.0288, -0.0018, -0.0016, ..., -0.0113, -0.0525, -0.0337], [ 0.0232, -0.0021, 0.0113, ..., -0.0007, -0.0249, -0.0759], ..., [ 0.0221, 0.0130, -0.0017, ..., 0.0364, -0.0178, -0.0331], [ 0.0014, -0.0160, 0.0422, ..., -0.0266, -0.0512, 0.0266], [-0.0098, -0.0281, 0.0297, ..., 0.0354, -0.0478, -0.0228]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.1873e-04, 2.4378e-04, ..., 8.0109e-04, 1.6270e-03, 2.8782e-03], [ 0.0000e+00, 6.4325e-04, 1.1129e-03, ..., 8.1253e-04, 6.8963e-05, 4.5037e-04], [ 0.0000e+00, 4.9782e-04, 1.1263e-03, ..., 6.9571e-04, 2.4700e-04, 1.0624e-03], ..., [ 0.0000e+00, -1.7910e-03, -1.0061e-03, ..., -1.2445e-03, 4.2677e-04, 1.3103e-03], [ 0.0000e+00, 1.3471e-04, -1.2875e-03, ..., 4.4084e-04, 5.3883e-04, -3.1395e-03], [ 0.0000e+00, 3.3593e-04, 3.3140e-05, ..., -8.6832e-04, 1.4770e-04, 1.7357e-03]], device='cuda:0') Epoch 12, bias, value: tensor([-0.0129, -0.0096, -0.0263, -0.0248, -0.0120, 0.0045, 0.0070, -0.0253, -0.0127, 0.0093], device='cuda:0'), grad: tensor([ 0.0106, -0.0064, 0.0107, -0.0079, -0.0087, -0.0131, 0.0101, 0.0008, -0.0002, 0.0042], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 11---------------------------------------------------- epoch 11, time 222.29, cls_loss 0.0581 cls_loss_mapping 0.0946 cls_loss_causal 1.0517 re_mapping 0.0386 re_causal 0.1013 /// teacc 98.12 lr 0.00010000 Epoch 13, weight, value: tensor([[-0.0102, -0.0425, 0.0478, ..., -0.0002, 0.0206, 0.0062], [-0.0288, -0.0027, -0.0025, ..., -0.0111, -0.0523, -0.0354], [ 0.0232, -0.0030, 0.0104, ..., -0.0019, -0.0256, -0.0799], ..., [ 0.0221, 0.0142, -0.0018, ..., 0.0373, -0.0201, -0.0333], [ 0.0014, -0.0163, 0.0431, ..., -0.0273, -0.0562, 0.0287], [-0.0098, -0.0293, 0.0304, ..., 0.0358, -0.0521, -0.0235]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.0327e-04, -2.8992e-04, ..., 2.6178e-04, 4.5955e-05, -5.0008e-05], [ 0.0000e+00, 2.4772e-04, 2.6894e-04, ..., 3.1757e-04, 9.8068e-07, 5.8293e-05], [ 0.0000e+00, -3.2091e-04, 6.8426e-04, ..., 4.2105e-04, 8.8988e-07, 2.6679e-04], ..., [ 0.0000e+00, -2.7485e-03, -1.0834e-03, ..., -3.9902e-03, 2.5332e-06, 1.6785e-04], [ 0.0000e+00, 2.2590e-04, -3.7861e-03, ..., -6.0081e-04, 4.6194e-06, -2.9564e-03], [ 0.0000e+00, 8.8596e-04, 2.4223e-03, ..., 1.1959e-03, 1.3411e-05, 2.3785e-03]], device='cuda:0') Epoch 13, bias, value: tensor([-0.0133, -0.0098, -0.0264, -0.0244, -0.0123, 0.0046, 0.0073, -0.0250, -0.0126, 0.0090], device='cuda:0'), grad: tensor([ 0.0007, 0.0013, -0.0017, -0.0024, -0.0036, 0.0010, 0.0030, -0.0036, -0.0003, 0.0055], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 12---------------------------------------------------- epoch 12, time 222.24, cls_loss 0.0468 cls_loss_mapping 0.0808 cls_loss_causal 0.9970 re_mapping 0.0358 re_causal 0.0954 /// teacc 98.38 lr 0.00010000 Epoch 14, weight, value: tensor([[-0.0102, -0.0435, 0.0486, ..., -0.0006, 0.0206, 0.0072], [-0.0288, -0.0035, -0.0030, ..., -0.0124, -0.0525, -0.0352], [ 0.0232, -0.0044, 0.0095, ..., -0.0031, -0.0261, -0.0835], ..., [ 0.0221, 0.0157, -0.0019, ..., 0.0379, -0.0215, -0.0345], [ 0.0014, -0.0169, 0.0440, ..., -0.0278, -0.0580, 0.0307], [-0.0098, -0.0304, 0.0311, ..., 0.0363, -0.0545, -0.0237]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3723e-05, -2.0707e-04, ..., 9.2506e-05, 1.0991e-04, 1.0672e-03], [ 0.0000e+00, 6.0320e-05, 3.0851e-04, ..., 3.9291e-04, 6.9924e-06, 6.1274e-05], [ 0.0000e+00, 7.6711e-05, 2.6870e-04, ..., 1.1873e-04, 1.7613e-05, 2.1064e-04], ..., [ 0.0000e+00, -1.1902e-03, 7.3147e-04, ..., 1.2236e-03, 2.6867e-05, 2.1708e-04], [ 0.0000e+00, 8.5354e-05, 1.2941e-03, ..., 2.3389e-04, 4.4250e-04, 4.0321e-03], [ 0.0000e+00, 2.5368e-04, -4.0627e-04, ..., 1.2279e-04, 6.9320e-05, 8.9693e-04]], device='cuda:0') Epoch 14, bias, value: tensor([-0.0131, -0.0099, -0.0267, -0.0243, -0.0121, 0.0046, 0.0069, -0.0249, -0.0123, 0.0088], device='cuda:0'), grad: tensor([ 0.0024, 0.0007, -0.0007, 0.0017, -0.0054, -0.0066, -0.0025, 0.0021, 0.0051, 0.0031], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 13---------------------------------------------------- epoch 13, time 222.28, cls_loss 0.0430 cls_loss_mapping 0.0745 cls_loss_causal 0.9732 re_mapping 0.0335 re_causal 0.0909 /// teacc 98.40 lr 0.00010000 Epoch 15, weight, value: tensor([[-0.0102, -0.0444, 0.0491, ..., -0.0013, 0.0218, 0.0075], [-0.0288, -0.0042, -0.0037, ..., -0.0124, -0.0522, -0.0362], [ 0.0232, -0.0046, 0.0087, ..., -0.0032, -0.0263, -0.0867], ..., [ 0.0221, 0.0172, -0.0019, ..., 0.0384, -0.0242, -0.0355], [ 0.0014, -0.0177, 0.0452, ..., -0.0284, -0.0609, 0.0333], [-0.0098, -0.0316, 0.0318, ..., 0.0367, -0.0590, -0.0250]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.8052e-04, -8.1863e-03, ..., 2.5582e-04, -5.2631e-05, -7.6141e-03], [ 0.0000e+00, 8.3542e-04, 8.0299e-04, ..., 6.4850e-05, 9.5926e-07, 3.5310e-04], [ 0.0000e+00, 1.4954e-03, 1.4963e-03, ..., 2.3413e-04, 1.2465e-05, 2.9230e-04], ..., [ 0.0000e+00, 4.3945e-03, 1.9350e-03, ..., -5.7125e-04, 2.4829e-06, 1.5748e-04], [ 0.0000e+00, 1.2960e-03, 1.8108e-04, ..., -2.8163e-05, 1.7472e-06, -3.0541e-04], [ 0.0000e+00, 2.2564e-03, 1.3342e-03, ..., -2.6393e-04, 4.8243e-06, 7.5626e-04]], device='cuda:0') Epoch 15, bias, value: tensor([-0.0132, -0.0101, -0.0264, -0.0247, -0.0122, 0.0043, 0.0070, -0.0249, -0.0117, 0.0089], device='cuda:0'), grad: tensor([-0.0105, 0.0027, 0.0030, -0.0222, 0.0007, -0.0007, 0.0106, 0.0087, 0.0029, 0.0048], device='cuda:0') 100 0.0001 changing lr epoch 14, time 221.47, cls_loss 0.0342 cls_loss_mapping 0.0605 cls_loss_causal 0.9607 re_mapping 0.0322 re_causal 0.0886 /// teacc 98.11 lr 0.00010000 Epoch 16, weight, value: tensor([[-0.0102, -0.0449, 0.0498, ..., -0.0018, 0.0219, 0.0077], [-0.0288, -0.0049, -0.0041, ..., -0.0133, -0.0520, -0.0359], [ 0.0232, -0.0049, 0.0076, ..., -0.0037, -0.0258, -0.0894], ..., [ 0.0221, 0.0181, -0.0017, ..., 0.0391, -0.0293, -0.0375], [ 0.0014, -0.0184, 0.0457, ..., -0.0286, -0.0664, 0.0346], [-0.0098, -0.0327, 0.0324, ..., 0.0369, -0.0652, -0.0253]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 8.3745e-05, 2.6202e-04, ..., 3.7241e-04, 1.4625e-05, 7.4387e-05], [ 0.0000e+00, 5.4687e-05, 1.7703e-04, ..., 2.3663e-04, 2.6412e-06, 2.8238e-06], [ 0.0000e+00, 9.6381e-05, 3.3587e-05, ..., 5.7077e-04, -7.0989e-05, 1.8865e-05], ..., [ 0.0000e+00, 3.5477e-04, 1.6785e-03, ..., 1.3418e-03, 9.2462e-06, 1.6689e-04], [ 0.0000e+00, 1.4234e-04, 1.0500e-03, ..., 7.0190e-04, 4.6223e-05, 2.5535e-04], [ 0.0000e+00, 1.8969e-03, 3.7651e-03, ..., 3.4885e-03, 8.2850e-06, -1.5825e-05]], device='cuda:0') Epoch 16, bias, value: tensor([-0.0131, -0.0101, -0.0264, -0.0244, -0.0120, 0.0041, 0.0070, -0.0251, -0.0120, 0.0089], device='cuda:0'), grad: tensor([ 0.0012, 0.0006, 0.0004, -0.0017, -0.0162, -0.0011, 0.0003, 0.0035, 0.0018, 0.0113], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 15---------------------------------------------------- epoch 15, time 222.11, cls_loss 0.0443 cls_loss_mapping 0.0782 cls_loss_causal 0.9163 re_mapping 0.0300 re_causal 0.0841 /// teacc 98.44 lr 0.00010000 Epoch 17, weight, value: tensor([[-0.0102, -0.0455, 0.0501, ..., -0.0025, 0.0221, 0.0078], [-0.0288, -0.0056, -0.0051, ..., -0.0138, -0.0538, -0.0371], [ 0.0232, -0.0057, 0.0074, ..., -0.0044, -0.0257, -0.0922], ..., [ 0.0221, 0.0192, -0.0020, ..., 0.0397, -0.0285, -0.0393], [ 0.0014, -0.0189, 0.0464, ..., -0.0289, -0.0722, 0.0369], [-0.0098, -0.0340, 0.0333, ..., 0.0372, -0.0695, -0.0258]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.4150e-04, -3.8409e-04, ..., 1.5032e-04, 9.3281e-06, -3.8886e-04], [ 0.0000e+00, 1.0180e-04, 1.0270e-04, ..., 1.2255e-04, 1.0476e-05, 2.9042e-05], [ 0.0000e+00, 5.6458e-04, 5.9462e-04, ..., 5.7316e-04, -8.3208e-05, 3.2616e-04], ..., [ 0.0000e+00, -1.4238e-03, -9.2649e-04, ..., -1.4534e-03, 1.3828e-05, 1.0973e-04], [ 0.0000e+00, 9.9719e-05, -1.1759e-03, ..., 1.1027e-04, 4.4070e-06, -2.2488e-03], [ 0.0000e+00, 2.6369e-04, 2.3103e-04, ..., 2.5463e-04, 1.7453e-06, 1.2076e-04]], device='cuda:0') Epoch 17, bias, value: tensor([-0.0136, -0.0107, -0.0264, -0.0245, -0.0119, 0.0044, 0.0071, -0.0249, -0.0117, 0.0090], device='cuda:0'), grad: tensor([ 6.9253e-06, 2.5582e-04, 3.7146e-04, 4.9257e-04, 1.6689e-04, 1.1673e-03, 1.0824e-03, -2.4261e-03, -1.8377e-03, 7.2098e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 16---------------------------------------------------- epoch 16, time 221.91, cls_loss 0.0319 cls_loss_mapping 0.0588 cls_loss_causal 0.8806 re_mapping 0.0283 re_causal 0.0804 /// teacc 98.46 lr 0.00010000 Epoch 18, weight, value: tensor([[-0.0102, -0.0465, 0.0506, ..., -0.0028, 0.0226, 0.0079], [-0.0288, -0.0058, -0.0056, ..., -0.0141, -0.0553, -0.0380], [ 0.0232, -0.0061, 0.0066, ..., -0.0050, -0.0243, -0.0947], ..., [ 0.0221, 0.0203, -0.0017, ..., 0.0405, -0.0273, -0.0393], [ 0.0014, -0.0192, 0.0472, ..., -0.0290, -0.0772, 0.0380], [-0.0098, -0.0353, 0.0337, ..., 0.0375, -0.0725, -0.0267]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.0206e-05, -5.5504e-04, ..., 6.8724e-05, -1.5691e-05, -1.1492e-04], [ 0.0000e+00, 4.6015e-05, 9.7096e-05, ..., 3.9756e-05, 7.0892e-06, 6.7616e-04], [ 0.0000e+00, 8.9288e-05, 1.1814e-04, ..., -2.0134e-04, -1.7738e-04, 3.2926e-04], ..., [ 0.0000e+00, -5.9652e-04, -1.0383e-04, ..., -3.8409e-04, 4.5002e-05, 1.9348e-04], [ 0.0000e+00, 3.7491e-05, 6.1989e-06, ..., 7.7426e-05, 1.3299e-05, 5.4884e-04], [ 0.0000e+00, 1.5652e-04, -1.1599e-04, ..., -3.4198e-06, 2.8864e-05, 6.5041e-04]], device='cuda:0') Epoch 18, bias, value: tensor([-0.0134, -0.0107, -0.0265, -0.0244, -0.0119, 0.0043, 0.0071, -0.0249, -0.0116, 0.0088], device='cuda:0'), grad: tensor([-9.1255e-05, 1.1120e-03, -7.4911e-04, 4.8599e-03, 2.0962e-03, -1.9485e-02, 9.4376e-03, 9.1076e-05, 1.5030e-03, 1.2341e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 17---------------------------------------------------- epoch 17, time 222.01, cls_loss 0.0265 cls_loss_mapping 0.0527 cls_loss_causal 0.8679 re_mapping 0.0277 re_causal 0.0802 /// teacc 98.50 lr 0.00010000 Epoch 19, weight, value: tensor([[-0.0102, -0.0473, 0.0511, ..., -0.0034, 0.0230, 0.0084], [-0.0288, -0.0067, -0.0060, ..., -0.0148, -0.0554, -0.0373], [ 0.0232, -0.0068, 0.0060, ..., -0.0057, -0.0252, -0.0978], ..., [ 0.0221, 0.0218, -0.0015, ..., 0.0411, -0.0286, -0.0400], [ 0.0014, -0.0197, 0.0477, ..., -0.0296, -0.0798, 0.0395], [-0.0098, -0.0363, 0.0342, ..., 0.0377, -0.0779, -0.0278]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.1723e-06, 3.1024e-05, ..., 1.6451e-05, 5.0128e-05, 3.0565e-04], [ 0.0000e+00, 2.3067e-05, -5.0105e-07, ..., 3.1412e-05, 1.4268e-05, -8.7976e-05], [ 0.0000e+00, 1.1903e-04, 7.6115e-05, ..., 3.2097e-05, -8.6904e-05, 9.2924e-05], ..., [ 0.0000e+00, -2.3037e-05, 1.9670e-04, ..., 1.9526e-04, 3.1829e-05, 1.2803e-04], [ 0.0000e+00, 8.6248e-05, 1.4961e-04, ..., 4.9496e-04, 5.8591e-05, 1.2565e-04], [ 0.0000e+00, 3.8385e-05, -7.0238e-04, ..., -3.6860e-04, 2.2307e-05, -2.0492e-04]], device='cuda:0') Epoch 19, bias, value: tensor([-0.0133, -0.0110, -0.0264, -0.0244, -0.0117, 0.0040, 0.0073, -0.0247, -0.0115, 0.0084], device='cuda:0'), grad: tensor([ 5.4359e-04, 6.8069e-05, -2.5749e-04, -2.0370e-03, -9.8324e-04, 6.1464e-04, -3.4690e-04, 5.2881e-04, 1.2436e-03, 6.2704e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 18---------------------------------------------------- epoch 18, time 222.27, cls_loss 0.0275 cls_loss_mapping 0.0500 cls_loss_causal 0.8878 re_mapping 0.0265 re_causal 0.0817 /// teacc 98.56 lr 0.00010000 Epoch 20, weight, value: tensor([[-0.0102, -0.0482, 0.0515, ..., -0.0039, 0.0231, 0.0086], [-0.0288, -0.0073, -0.0069, ..., -0.0155, -0.0566, -0.0388], [ 0.0232, -0.0073, 0.0057, ..., -0.0063, -0.0241, -0.1003], ..., [ 0.0221, 0.0227, -0.0018, ..., 0.0415, -0.0269, -0.0404], [ 0.0014, -0.0200, 0.0482, ..., -0.0298, -0.0831, 0.0404], [-0.0098, -0.0370, 0.0350, ..., 0.0381, -0.0803, -0.0277]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.8965e-05, -8.6784e-04, ..., 7.6413e-05, -4.4167e-05, -7.4863e-04], [ 0.0000e+00, 3.3170e-05, 1.5628e-04, ..., 8.8513e-05, 3.1516e-06, 1.4079e-04], [ 0.0000e+00, 1.0902e-04, 5.0926e-04, ..., 1.2493e-04, -8.2552e-06, 5.9223e-04], ..., [ 0.0000e+00, -9.5797e-04, -5.8222e-04, ..., -8.4066e-04, 4.0308e-06, 2.9349e-04], [ 0.0000e+00, 4.3184e-05, 2.7218e-03, ..., 1.4601e-03, 5.0254e-06, 1.4830e-03], [ 0.0000e+00, 1.4985e-04, -4.3488e-03, ..., -1.7738e-03, 3.0752e-06, -2.4090e-03]], device='cuda:0') Epoch 20, bias, value: tensor([-0.0136, -0.0113, -0.0261, -0.0245, -0.0117, 0.0042, 0.0075, -0.0250, -0.0116, 0.0088], device='cuda:0'), grad: tensor([-2.1019e-03, -1.4037e-05, 1.1406e-03, 2.3365e-03, -2.6321e-04, 5.6219e-04, 1.2922e-04, -6.2561e-04, 2.7504e-03, -3.9139e-03], device='cuda:0') 100 0.0001 changing lr epoch 19, time 221.37, cls_loss 0.0294 cls_loss_mapping 0.0511 cls_loss_causal 0.8758 re_mapping 0.0254 re_causal 0.0775 /// teacc 98.38 lr 0.00010000 Epoch 21, weight, value: tensor([[-0.0102, -0.0490, 0.0513, ..., -0.0043, 0.0226, 0.0078], [-0.0288, -0.0076, -0.0074, ..., -0.0160, -0.0544, -0.0402], [ 0.0232, -0.0075, 0.0051, ..., -0.0069, -0.0235, -0.1032], ..., [ 0.0221, 0.0236, -0.0018, ..., 0.0423, -0.0275, -0.0407], [ 0.0014, -0.0207, 0.0489, ..., -0.0300, -0.0842, 0.0419], [-0.0098, -0.0383, 0.0359, ..., 0.0382, -0.0802, -0.0268]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.3575e-05, -8.9931e-04, ..., 1.6764e-05, 2.3270e-04, -1.3380e-03], [ 0.0000e+00, 4.0650e-05, 1.3466e-03, ..., 3.7938e-05, 1.2722e-06, 1.7948e-03], [ 0.0000e+00, 1.1492e-04, 2.7037e-04, ..., 8.3089e-05, -8.6021e-04, 3.0541e-04], ..., [ 0.0000e+00, -3.6716e-04, 1.6463e-04, ..., -3.9577e-05, 8.7261e-05, 1.5914e-04], [ 0.0000e+00, -2.3134e-06, -2.0008e-03, ..., 8.0109e-05, 5.5408e-04, -2.4719e-03], [ 0.0000e+00, 1.4555e-04, -4.9210e-04, ..., 1.8239e-04, 1.2815e-04, 6.6090e-04]], device='cuda:0') Epoch 21, bias, value: tensor([-0.0140, -0.0114, -0.0263, -0.0247, -0.0120, 0.0045, 0.0074, -0.0246, -0.0112, 0.0089], device='cuda:0'), grad: tensor([-0.0014, 0.0018, -0.0033, 0.0011, 0.0004, -0.0018, 0.0030, 0.0004, -0.0015, 0.0013], device='cuda:0') 100 0.0001 changing lr epoch 20, time 221.26, cls_loss 0.0270 cls_loss_mapping 0.0525 cls_loss_causal 0.8307 re_mapping 0.0250 re_causal 0.0737 /// teacc 98.31 lr 0.00010000 Epoch 22, weight, value: tensor([[-0.0102, -0.0498, 0.0518, ..., -0.0049, 0.0233, 0.0084], [-0.0288, -0.0079, -0.0076, ..., -0.0164, -0.0557, -0.0399], [ 0.0232, -0.0085, 0.0044, ..., -0.0078, -0.0234, -0.1054], ..., [ 0.0221, 0.0244, -0.0019, ..., 0.0431, -0.0264, -0.0416], [ 0.0014, -0.0205, 0.0493, ..., -0.0300, -0.0867, 0.0425], [-0.0098, -0.0404, 0.0363, ..., 0.0382, -0.0836, -0.0279]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 8.1837e-05, -4.0829e-05, ..., 9.5129e-05, -3.0577e-05, 9.2447e-05], [ 0.0000e+00, 6.0022e-05, 3.1173e-05, ..., 8.6486e-05, 5.2527e-06, -6.1691e-05], [ 0.0000e+00, 1.8489e-04, -1.3180e-05, ..., 1.5008e-04, -1.6296e-04, 4.0650e-05], ..., [ 0.0000e+00, -5.5695e-04, -2.2018e-04, ..., -4.5252e-04, 7.9155e-05, 2.0489e-06], [ 0.0000e+00, 4.2409e-05, 3.7217e-04, ..., 2.9445e-04, 1.9014e-05, -4.4405e-05], [ 0.0000e+00, 1.0085e-04, -6.4898e-04, ..., -1.7512e-04, 1.1630e-05, 4.3750e-05]], device='cuda:0') Epoch 22, bias, value: tensor([-0.0138, -0.0113, -0.0263, -0.0243, -0.0119, 0.0046, 0.0073, -0.0245, -0.0115, 0.0085], device='cuda:0'), grad: tensor([ 2.8992e-04, -4.5031e-05, -8.3733e-04, -2.7802e-02, -5.4502e-04, 2.8412e-02, -3.0071e-05, -6.5863e-05, 4.6039e-04, 1.6022e-04], device='cuda:0') 100 0.0001 changing lr epoch 21, time 221.62, cls_loss 0.0261 cls_loss_mapping 0.0515 cls_loss_causal 0.8687 re_mapping 0.0234 re_causal 0.0715 /// teacc 98.51 lr 0.00010000 Epoch 23, weight, value: tensor([[-0.0102, -0.0506, 0.0523, ..., -0.0053, 0.0250, 0.0095], [-0.0288, -0.0085, -0.0084, ..., -0.0166, -0.0559, -0.0399], [ 0.0232, -0.0094, 0.0036, ..., -0.0082, -0.0229, -0.1096], ..., [ 0.0221, 0.0255, -0.0018, ..., 0.0435, -0.0270, -0.0424], [ 0.0014, -0.0209, 0.0500, ..., -0.0303, -0.0877, 0.0441], [-0.0098, -0.0414, 0.0370, ..., 0.0384, -0.0868, -0.0282]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.4938e-05, -2.0289e-04, ..., 1.0766e-05, 1.3031e-05, -1.7858e-04], [ 0.0000e+00, 5.3942e-05, 8.0824e-05, ..., 5.1022e-05, 3.4384e-06, -1.8632e-04], [ 0.0000e+00, -3.3826e-06, 1.1587e-04, ..., 1.2445e-04, -1.2016e-04, 9.3937e-05], ..., [ 0.0000e+00, -1.8501e-04, -9.9063e-05, ..., -1.2553e-04, 5.1558e-05, 1.3196e-04], [ 0.0000e+00, 1.0586e-04, 1.0729e-05, ..., 4.8280e-05, 4.2707e-05, 1.0508e-04], [ 0.0000e+00, 5.3085e-08, -3.1495e-04, ..., -2.6751e-04, 2.4717e-06, 1.0109e-04]], device='cuda:0') Epoch 23, bias, value: tensor([-0.0137, -0.0111, -0.0266, -0.0245, -0.0119, 0.0046, 0.0070, -0.0244, -0.0112, 0.0085], device='cuda:0'), grad: tensor([-1.9997e-05, -3.1829e-04, -5.3644e-04, 1.9932e-04, 2.1565e-04, 3.0422e-04, -3.6550e-04, 4.3797e-04, 4.6587e-04, -3.8385e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 22---------------------------------------------------- epoch 22, time 221.51, cls_loss 0.0279 cls_loss_mapping 0.0520 cls_loss_causal 0.8301 re_mapping 0.0219 re_causal 0.0648 /// teacc 98.57 lr 0.00010000 Epoch 24, weight, value: tensor([[-0.0102, -0.0518, 0.0532, ..., -0.0057, 0.0253, 0.0104], [-0.0288, -0.0090, -0.0089, ..., -0.0166, -0.0560, -0.0405], [ 0.0232, -0.0099, 0.0031, ..., -0.0088, -0.0231, -0.1120], ..., [ 0.0221, 0.0258, -0.0017, ..., 0.0438, -0.0273, -0.0424], [ 0.0014, -0.0205, 0.0506, ..., -0.0303, -0.0886, 0.0453], [-0.0098, -0.0423, 0.0371, ..., 0.0386, -0.0881, -0.0294]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.1705e-05, -1.3173e-04, ..., 7.6056e-05, -1.0140e-05, -8.1122e-05], [ 0.0000e+00, 1.8048e-04, 3.1447e-04, ..., 3.6120e-04, 5.3411e-07, -1.6272e-04], [ 0.0000e+00, 7.5579e-05, 6.5684e-05, ..., 2.2805e-04, 4.2394e-06, 8.4758e-05], ..., [ 0.0000e+00, 2.4853e-03, 6.3171e-03, ..., 7.3166e-03, 1.2470e-06, 3.6359e-05], [ 0.0000e+00, 2.5940e-04, 3.8362e-04, ..., 3.0899e-04, 1.2089e-06, 3.7217e-04], [ 0.0000e+00, -3.5915e-03, -8.5449e-03, ..., -9.8877e-03, 1.5832e-06, -8.9705e-06]], device='cuda:0') Epoch 24, bias, value: tensor([-0.0133, -0.0114, -0.0265, -0.0245, -0.0113, 0.0046, 0.0069, -0.0247, -0.0110, 0.0079], device='cuda:0'), grad: tensor([-7.3314e-05, 3.2139e-04, -4.4799e-04, 7.0095e-04, 1.7262e-03, 2.1040e-04, -2.3746e-04, 1.2276e-02, 1.2541e-03, -1.5732e-02], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 23---------------------------------------------------- epoch 23, time 222.21, cls_loss 0.0224 cls_loss_mapping 0.0467 cls_loss_causal 0.8327 re_mapping 0.0222 re_causal 0.0680 /// teacc 98.68 lr 0.00010000 Epoch 25, weight, value: tensor([[-0.0102, -0.0529, 0.0533, ..., -0.0063, 0.0254, 0.0104], [-0.0288, -0.0090, -0.0095, ..., -0.0177, -0.0568, -0.0403], [ 0.0232, -0.0107, 0.0025, ..., -0.0095, -0.0227, -0.1139], ..., [ 0.0221, 0.0266, -0.0019, ..., 0.0443, -0.0268, -0.0427], [ 0.0014, -0.0206, 0.0513, ..., -0.0304, -0.0897, 0.0461], [-0.0098, -0.0429, 0.0380, ..., 0.0390, -0.0892, -0.0295]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3007e-05, -2.3861e-03, ..., 1.4015e-05, -8.4639e-06, -1.1292e-03], [ 0.0000e+00, 2.9892e-05, 5.7310e-05, ..., 2.1964e-05, 3.5688e-06, -1.2865e-03], [ 0.0000e+00, 3.1352e-05, 3.3283e-03, ..., 2.8238e-05, 8.7023e-04, 2.1267e-03], ..., [ 0.0000e+00, -6.5804e-05, -1.8626e-05, ..., -1.5366e-04, 3.3796e-05, 1.3363e-04], [ 0.0000e+00, 5.9843e-04, -2.0199e-03, ..., 3.1829e-05, -9.1505e-04, -1.3609e-03], [ 0.0000e+00, 1.5478e-03, 3.5038e-03, ..., 1.3399e-03, 1.7047e-05, 1.0071e-03]], device='cuda:0') Epoch 25, bias, value: tensor([-0.0139, -0.0118, -0.0264, -0.0243, -0.0112, 0.0045, 0.0070, -0.0245, -0.0109, 0.0081], device='cuda:0'), grad: tensor([-0.0035, -0.0016, 0.0030, -0.0037, -0.0021, -0.0012, 0.0013, 0.0002, -0.0005, 0.0080], device='cuda:0') 100 0.0001 changing lr epoch 24, time 221.01, cls_loss 0.0255 cls_loss_mapping 0.0496 cls_loss_causal 0.8450 re_mapping 0.0209 re_causal 0.0660 /// teacc 98.47 lr 0.00010000 Epoch 26, weight, value: tensor([[-0.0102, -0.0539, 0.0537, ..., -0.0067, 0.0249, 0.0106], [-0.0288, -0.0092, -0.0100, ..., -0.0185, -0.0586, -0.0400], [ 0.0232, -0.0121, 0.0020, ..., -0.0100, -0.0213, -0.1154], ..., [ 0.0221, 0.0273, -0.0019, ..., 0.0450, -0.0254, -0.0427], [ 0.0014, -0.0216, 0.0516, ..., -0.0308, -0.0923, 0.0471], [-0.0098, -0.0443, 0.0384, ..., 0.0392, -0.0927, -0.0295]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6819e-05, 3.4332e-04, ..., 6.5506e-05, 1.9884e-04, 6.8045e-04], [ 0.0000e+00, 2.9549e-05, 1.7715e-04, ..., 8.5950e-05, 3.0100e-05, 2.0909e-04], [ 0.0000e+00, 1.4603e-05, 1.0929e-03, ..., 1.8132e-04, 3.8266e-04, 1.8244e-03], ..., [ 0.0000e+00, -2.6560e-04, 7.7534e-04, ..., 4.4227e-04, 8.2135e-05, 2.4557e-04], [ 0.0000e+00, -9.1456e-07, -2.6932e-03, ..., -2.5129e-04, -1.1272e-03, -5.0163e-03], [ 0.0000e+00, -1.8370e-04, -1.5631e-03, ..., -1.4019e-03, 7.7367e-05, 3.6216e-04]], device='cuda:0') Epoch 26, bias, value: tensor([-0.0141, -0.0120, -0.0265, -0.0236, -0.0116, 0.0041, 0.0066, -0.0243, -0.0109, 0.0084], device='cuda:0'), grad: tensor([ 0.0012, 0.0004, 0.0029, 0.0019, 0.0005, -0.0004, 0.0018, 0.0011, -0.0079, -0.0015], device='cuda:0') 100 0.0001 changing lr epoch 25, time 221.25, cls_loss 0.0216 cls_loss_mapping 0.0436 cls_loss_causal 0.8224 re_mapping 0.0205 re_causal 0.0617 /// teacc 98.66 lr 0.00010000 Epoch 27, weight, value: tensor([[-0.0102, -0.0549, 0.0541, ..., -0.0074, 0.0250, 0.0110], [-0.0288, -0.0085, -0.0102, ..., -0.0186, -0.0591, -0.0400], [ 0.0232, -0.0131, 0.0015, ..., -0.0109, -0.0205, -0.1181], ..., [ 0.0221, 0.0281, -0.0018, ..., 0.0462, -0.0253, -0.0435], [ 0.0014, -0.0223, 0.0521, ..., -0.0311, -0.0936, 0.0477], [-0.0098, -0.0450, 0.0393, ..., 0.0395, -0.0941, -0.0288]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.0115e-05, -3.4261e-04, ..., -4.3988e-05, -9.0718e-05, 5.3883e-05], [ 0.0000e+00, 8.3685e-05, 1.1092e-04, ..., 7.2718e-05, 1.6056e-06, 3.2008e-05], [ 0.0000e+00, 1.7560e-04, 2.5487e-04, ..., 1.7416e-04, 2.3004e-06, 6.8247e-05], ..., [ 0.0000e+00, 2.6779e-03, 1.9569e-03, ..., -6.3562e-04, 3.0659e-06, 3.5048e-05], [ 0.0000e+00, 1.1474e-04, -1.1069e-04, ..., 2.5436e-05, 8.8215e-06, -9.9778e-05], [ 0.0000e+00, 1.5569e-04, -1.4976e-05, ..., 7.6473e-05, 5.0254e-06, 5.3793e-05]], device='cuda:0') Epoch 27, bias, value: tensor([-0.0143, -0.0117, -0.0266, -0.0240, -0.0115, 0.0045, 0.0063, -0.0243, -0.0110, 0.0087], device='cuda:0'), grad: tensor([-0.0002, 0.0002, 0.0002, -0.0031, 0.0002, 0.0004, -0.0005, 0.0025, 0.0002, 0.0001], device='cuda:0') 100 0.0001 changing lr epoch 26, time 221.30, cls_loss 0.0206 cls_loss_mapping 0.0410 cls_loss_causal 0.7796 re_mapping 0.0205 re_causal 0.0605 /// teacc 98.52 lr 0.00010000 Epoch 28, weight, value: tensor([[-0.0102, -0.0561, 0.0545, ..., -0.0078, 0.0252, 0.0111], [-0.0288, -0.0082, -0.0103, ..., -0.0184, -0.0592, -0.0402], [ 0.0232, -0.0138, 0.0012, ..., -0.0113, -0.0202, -0.1188], ..., [ 0.0221, 0.0291, -0.0018, ..., 0.0465, -0.0255, -0.0441], [ 0.0014, -0.0231, 0.0521, ..., -0.0313, -0.0942, 0.0485], [-0.0098, -0.0468, 0.0397, ..., 0.0396, -0.0956, -0.0293]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -4.9174e-06, -3.0255e-04, ..., 3.3528e-07, -2.2009e-05, -1.6284e-04], [ 0.0000e+00, 6.2771e-06, 2.7522e-05, ..., 2.8729e-05, 5.6531e-07, -9.9850e-04], [ 0.0000e+00, 2.4378e-05, 1.0425e-04, ..., 4.8250e-05, -4.5672e-06, 1.1075e-04], ..., [ 0.0000e+00, 3.0823e-03, 4.6730e-03, ..., 5.3825e-03, 4.8839e-06, 1.1694e-04], [ 0.0000e+00, 6.6519e-05, -2.9016e-04, ..., 9.7230e-06, 5.6513e-06, -1.3411e-05], [ 0.0000e+00, -3.2902e-03, -5.0278e-03, ..., -6.0005e-03, 4.2506e-06, 4.1246e-04]], device='cuda:0') Epoch 28, bias, value: tensor([-0.0144, -0.0121, -0.0260, -0.0239, -0.0113, 0.0045, 0.0065, -0.0246, -0.0111, 0.0086], device='cuda:0'), grad: tensor([-0.0003, -0.0039, 0.0002, 0.0009, 0.0009, 0.0006, -0.0001, 0.0070, 0.0013, -0.0066], device='cuda:0') 100 0.0001 changing lr epoch 27, time 221.17, cls_loss 0.0174 cls_loss_mapping 0.0388 cls_loss_causal 0.7790 re_mapping 0.0193 re_causal 0.0599 /// teacc 98.65 lr 0.00010000 Epoch 29, weight, value: tensor([[-0.0194, -0.0569, 0.0551, ..., -0.0084, 0.0259, 0.0119], [-0.0262, -0.0091, -0.0108, ..., -0.0192, -0.0599, -0.0397], [ 0.0264, -0.0149, 0.0007, ..., -0.0119, -0.0198, -0.1209], ..., [ 0.0160, 0.0306, -0.0018, ..., 0.0473, -0.0254, -0.0444], [-0.0017, -0.0234, 0.0524, ..., -0.0315, -0.0951, 0.0492], [-0.0144, -0.0475, 0.0401, ..., 0.0398, -0.0979, -0.0293]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.3210e-05, -1.9586e-04, ..., 1.0684e-05, -3.8385e-05, -2.1803e-04], [ 0.0000e+00, 2.9996e-05, 2.1845e-05, ..., 9.4324e-06, 3.4384e-06, -1.0200e-05], [ 0.0000e+00, 3.3230e-05, 4.0293e-05, ..., 9.6858e-06, -2.2352e-05, 7.6473e-05], ..., [ 0.0000e+00, -8.5533e-05, 1.1072e-05, ..., -5.1409e-05, 1.1429e-05, 4.3631e-05], [ 0.0000e+00, 2.4557e-04, 3.1680e-05, ..., 1.1630e-05, 1.4156e-05, 1.6510e-04], [ 0.0000e+00, 1.0693e-04, -9.4235e-05, ..., -7.0512e-05, 9.6932e-06, 1.0651e-04]], device='cuda:0') Epoch 29, bias, value: tensor([-0.0141, -0.0119, -0.0263, -0.0243, -0.0113, 0.0044, 0.0066, -0.0241, -0.0111, 0.0085], device='cuda:0'), grad: tensor([-2.4962e-04, -4.2289e-05, 8.6010e-05, -4.1313e-03, 1.5509e-04, 3.3150e-03, -1.5783e-04, 9.6083e-05, 7.7343e-04, 1.5187e-04], device='cuda:0') 100 0.0001 changing lr epoch 28, time 221.43, cls_loss 0.0179 cls_loss_mapping 0.0371 cls_loss_causal 0.7640 re_mapping 0.0196 re_causal 0.0604 /// teacc 98.64 lr 0.00010000 Epoch 30, weight, value: tensor([[-0.0195, -0.0580, 0.0552, ..., -0.0094, 0.0258, 0.0122], [-0.0264, -0.0085, -0.0109, ..., -0.0195, -0.0601, -0.0395], [ 0.0266, -0.0157, 0.0001, ..., -0.0118, -0.0169, -0.1236], ..., [ 0.0160, 0.0312, -0.0019, ..., 0.0477, -0.0267, -0.0453], [-0.0017, -0.0238, 0.0532, ..., -0.0314, -0.0963, 0.0502], [-0.0144, -0.0483, 0.0406, ..., 0.0399, -0.1018, -0.0300]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.5423e-05, -9.0957e-05, ..., 2.9832e-05, -7.8306e-06, 2.0489e-05], [ 0.0000e+00, 3.0017e-04, 1.1319e-04, ..., 2.1625e-04, 9.5427e-05, -1.2469e-04], [ 0.0000e+00, 9.4354e-05, 3.4404e-04, ..., 3.4785e-04, -1.7583e-04, 3.4142e-04], ..., [ 0.0000e+00, -4.3845e-04, 1.3075e-03, ..., 3.6573e-04, 9.4622e-06, 8.6129e-05], [ 0.0000e+00, 6.3241e-05, -8.0681e-04, ..., -3.8981e-04, 2.3216e-05, -2.5773e-04], [ 0.0000e+00, -6.6805e-04, -1.7977e-03, ..., -1.4868e-03, 6.3553e-06, 1.1146e-05]], device='cuda:0') Epoch 30, bias, value: tensor([-0.0143, -0.0117, -0.0256, -0.0243, -0.0114, 0.0044, 0.0064, -0.0245, -0.0108, 0.0081], device='cuda:0'), grad: tensor([ 8.3029e-05, 6.9523e-04, -1.8454e-04, 1.5488e-03, 7.8773e-04, 1.6317e-03, -2.0504e-03, 8.2731e-04, 1.0705e-04, -3.4447e-03], device='cuda:0') 100 0.0001 changing lr epoch 29, time 221.19, cls_loss 0.0218 cls_loss_mapping 0.0440 cls_loss_causal 0.7997 re_mapping 0.0187 re_causal 0.0597 /// teacc 98.61 lr 0.00010000 Epoch 31, weight, value: tensor([[-0.0197, -0.0586, 0.0550, ..., -0.0101, 0.0277, 0.0113], [-0.0286, -0.0090, -0.0115, ..., -0.0206, -0.0612, -0.0401], [ 0.0293, -0.0160, -0.0002, ..., -0.0122, -0.0164, -0.1254], ..., [ 0.0157, 0.0321, -0.0015, ..., 0.0486, -0.0265, -0.0452], [-0.0015, -0.0248, 0.0535, ..., -0.0316, -0.0991, 0.0514], [-0.0144, -0.0494, 0.0415, ..., 0.0401, -0.1082, -0.0300]], device='cuda:0'), grad: tensor([[ 2.5309e-07, 1.9300e-04, 3.3557e-05, ..., 2.2495e-04, -7.7784e-06, -3.0786e-05], [ 1.0850e-07, 1.4603e-04, 5.0813e-05, ..., 1.2219e-04, 8.9128e-07, -1.1069e-04], [ 1.0547e-07, -2.9774e-03, -1.0437e-04, ..., -2.0905e-03, -1.8045e-05, 1.3649e-04], ..., [ 2.8010e-07, 3.1815e-03, 1.0748e-03, ..., 1.6909e-03, 6.7838e-06, 9.0981e-04], [ 1.0831e-06, -1.7805e-03, -1.5593e-03, ..., -3.0828e-04, 3.1888e-06, -1.4696e-03], [ 2.4959e-06, 2.4939e-04, -4.9543e-04, ..., -3.3641e-04, 2.2165e-06, 1.6916e-04]], device='cuda:0') Epoch 31, bias, value: tensor([-0.0147, -0.0120, -0.0258, -0.0243, -0.0113, 0.0042, 0.0065, -0.0244, -0.0105, 0.0083], device='cuda:0'), grad: tensor([ 0.0005, -0.0006, -0.0063, 0.0019, 0.0007, 0.0001, -0.0003, 0.0067, -0.0025, -0.0002], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 30---------------------------------------------------- epoch 30, time 221.83, cls_loss 0.0148 cls_loss_mapping 0.0338 cls_loss_causal 0.7667 re_mapping 0.0195 re_causal 0.0595 /// teacc 98.69 lr 0.00010000 Epoch 32, weight, value: tensor([[-0.0205, -0.0593, 0.0555, ..., -0.0108, 0.0282, 0.0117], [-0.0316, -0.0090, -0.0120, ..., -0.0208, -0.0615, -0.0402], [ 0.0339, -0.0166, -0.0006, ..., -0.0129, -0.0152, -0.1265], ..., [ 0.0146, 0.0330, -0.0016, ..., 0.0492, -0.0267, -0.0460], [-0.0008, -0.0250, 0.0543, ..., -0.0310, -0.0998, 0.0525], [-0.0145, -0.0512, 0.0416, ..., 0.0399, -0.1103, -0.0310]], device='cuda:0'), grad: tensor([[ 1.4901e-07, 1.1846e-05, 1.8626e-05, ..., 7.5065e-06, 5.4054e-06, -6.5327e-05], [ 8.1956e-08, 3.2157e-05, 8.8513e-05, ..., 1.0088e-05, 3.2037e-06, -5.9038e-05], [-2.1998e-06, 5.4985e-05, -3.3855e-04, ..., 8.8215e-06, -8.2612e-05, 6.9737e-05], ..., [ 1.6950e-06, -1.8597e-04, 2.1291e-04, ..., -1.9014e-04, 1.7837e-05, 3.3069e-04], [-6.5425e-07, -6.8426e-04, -4.3631e-04, ..., 1.7673e-05, 5.4687e-05, -5.0497e-04], [ 1.9302e-07, 3.4511e-05, -2.6369e-04, ..., -1.0091e-04, 6.5863e-06, -7.7784e-05]], device='cuda:0') Epoch 32, bias, value: tensor([-0.0147, -0.0121, -0.0256, -0.0242, -0.0112, 0.0039, 0.0064, -0.0245, -0.0099, 0.0081], device='cuda:0'), grad: tensor([ 2.8992e-04, 7.9572e-05, -1.1978e-03, 9.0361e-04, 1.4985e-04, 3.5048e-05, 4.2766e-06, 3.1257e-04, -4.1127e-04, -1.6403e-04], device='cuda:0') 100 0.0001 changing lr epoch 31, time 221.61, cls_loss 0.0135 cls_loss_mapping 0.0301 cls_loss_causal 0.7681 re_mapping 0.0186 re_causal 0.0595 /// teacc 98.60 lr 0.00010000 Epoch 33, weight, value: tensor([[-0.0221, -0.0600, 0.0557, ..., -0.0113, 0.0289, 0.0119], [-0.0324, -0.0081, -0.0120, ..., -0.0212, -0.0619, -0.0406], [ 0.0352, -0.0180, -0.0012, ..., -0.0140, -0.0147, -0.1286], ..., [ 0.0129, 0.0341, -0.0014, ..., 0.0498, -0.0270, -0.0466], [ 0.0021, -0.0252, 0.0547, ..., -0.0312, -0.1003, 0.0530], [-0.0148, -0.0524, 0.0422, ..., 0.0401, -0.1115, -0.0306]], device='cuda:0'), grad: tensor([[ 1.0012e-07, 1.0692e-05, -4.0078e-04, ..., -1.3016e-05, -5.0813e-05, -5.8651e-04], [ 3.5437e-07, 1.3433e-05, 3.0577e-05, ..., -3.1441e-05, 8.7097e-06, 6.0806e-03], [-3.2820e-06, -1.8883e-04, 3.9607e-05, ..., 2.4691e-05, -4.6879e-05, 1.2016e-04], ..., [ 9.9000e-07, -2.8515e-04, -1.3471e-04, ..., -1.7047e-04, 1.5453e-05, 1.3578e-04], [ 1.4286e-06, 4.4674e-05, -1.3602e-04, ..., -4.8429e-05, 1.5095e-05, 2.6488e-04], [ 2.3050e-08, 1.4186e-04, 1.3971e-04, ..., 9.7275e-05, 7.1861e-06, 3.9601e-04]], device='cuda:0') Epoch 33, bias, value: tensor([-0.0148, -0.0120, -0.0259, -0.0245, -0.0114, 0.0041, 0.0066, -0.0243, -0.0100, 0.0083], device='cuda:0'), grad: tensor([-0.0011, 0.0077, -0.0008, 0.0033, 0.0002, -0.0115, 0.0005, 0.0001, 0.0007, 0.0008], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 32---------------------------------------------------- epoch 32, time 221.79, cls_loss 0.0131 cls_loss_mapping 0.0304 cls_loss_causal 0.7708 re_mapping 0.0173 re_causal 0.0567 /// teacc 98.74 lr 0.00010000 Epoch 34, weight, value: tensor([[-0.0246, -0.0612, 0.0562, ..., -0.0124, 0.0300, 0.0126], [-0.0339, -0.0087, -0.0124, ..., -0.0215, -0.0620, -0.0413], [ 0.0370, -0.0185, -0.0018, ..., -0.0144, -0.0147, -0.1306], ..., [ 0.0116, 0.0345, -0.0017, ..., 0.0502, -0.0274, -0.0473], [ 0.0024, -0.0258, 0.0551, ..., -0.0313, -0.1008, 0.0538], [-0.0150, -0.0533, 0.0425, ..., 0.0403, -0.1137, -0.0309]], device='cuda:0'), grad: tensor([[ 1.2051e-06, 6.5118e-06, -1.9896e-04, ..., 4.5076e-06, -3.6359e-05, -1.5211e-04], [ 3.1829e-04, 2.6420e-05, -1.7136e-05, ..., 8.0094e-06, 4.3511e-06, -1.4496e-04], [-3.3617e-04, 1.5640e-04, 2.0623e-05, ..., -2.1920e-05, -9.6500e-05, 3.1352e-05], ..., [ 7.5288e-06, 1.3721e-04, -6.4850e-05, ..., -2.4843e-04, 9.7573e-05, 5.3227e-05], [ 3.5986e-06, 1.0744e-05, -2.4632e-05, ..., 1.2591e-05, 3.4422e-05, 1.2122e-05], [ 4.4331e-07, 1.2201e-04, 1.3971e-04, ..., 2.5153e-04, 5.5321e-06, 4.3541e-05]], device='cuda:0') Epoch 34, bias, value: tensor([-0.0147, -0.0123, -0.0259, -0.0242, -0.0114, 0.0040, 0.0066, -0.0245, -0.0098, 0.0082], device='cuda:0'), grad: tensor([-1.8144e-04, 7.2384e-04, -1.2417e-03, -3.3712e-04, -4.0650e-05, -7.5400e-06, 1.5974e-04, 3.8600e-04, 1.5426e-04, 3.8528e-04], device='cuda:0') 100 0.0001 changing lr epoch 33, time 221.33, cls_loss 0.0133 cls_loss_mapping 0.0301 cls_loss_causal 0.7677 re_mapping 0.0168 re_causal 0.0551 /// teacc 98.72 lr 0.00010000 Epoch 35, weight, value: tensor([[-0.0267, -0.0620, 0.0567, ..., -0.0130, 0.0312, 0.0131], [-0.0323, -0.0095, -0.0131, ..., -0.0220, -0.0620, -0.0417], [ 0.0375, -0.0190, -0.0023, ..., -0.0151, -0.0143, -0.1320], ..., [ 0.0099, 0.0358, -0.0015, ..., 0.0512, -0.0279, -0.0480], [ 0.0020, -0.0264, 0.0555, ..., -0.0315, -0.1017, 0.0549], [-0.0152, -0.0546, 0.0429, ..., 0.0403, -0.1121, -0.0308]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 6.5982e-05, -5.9223e-04, ..., 1.4104e-05, 0.0000e+00, -9.1600e-04], [ 4.4238e-09, 9.1076e-05, 1.2800e-05, ..., 1.1414e-04, 0.0000e+00, -1.9157e-04], [ 1.4435e-08, 4.8599e-03, 1.7393e-04, ..., 5.9128e-03, 0.0000e+00, 1.9312e-04], ..., [ 5.8208e-09, -4.2458e-03, -9.8825e-05, ..., -5.8784e-03, 0.0000e+00, 1.6546e-04], [-6.5193e-08, 1.0853e-03, 2.3115e-04, ..., 1.7846e-04, 0.0000e+00, 5.3930e-04], [ 1.1874e-08, 4.1270e-04, 2.0742e-04, ..., 1.4269e-04, 0.0000e+00, 2.5964e-04]], device='cuda:0') Epoch 35, bias, value: tensor([-0.0146, -0.0124, -0.0256, -0.0247, -0.0110, 0.0040, 0.0067, -0.0242, -0.0097, 0.0079], device='cuda:0'), grad: tensor([-1.1101e-03, -2.8968e-04, 1.0834e-02, -4.5624e-03, -9.6321e-04, 8.5533e-05, 1.5430e-03, -9.3384e-03, 2.6493e-03, 1.1597e-03], device='cuda:0') 100 0.0001 changing lr epoch 34, time 221.31, cls_loss 0.0132 cls_loss_mapping 0.0335 cls_loss_causal 0.7453 re_mapping 0.0172 re_causal 0.0540 /// teacc 98.62 lr 0.00010000 Epoch 36, weight, value: tensor([[-0.0301, -0.0628, 0.0568, ..., -0.0136, 0.0319, 0.0132], [-0.0326, -0.0096, -0.0135, ..., -0.0226, -0.0622, -0.0414], [ 0.0382, -0.0195, -0.0027, ..., -0.0158, -0.0139, -0.1334], ..., [ 0.0091, 0.0362, -0.0013, ..., 0.0521, -0.0283, -0.0488], [ 0.0017, -0.0278, 0.0558, ..., -0.0316, -0.1026, 0.0558], [-0.0154, -0.0559, 0.0431, ..., 0.0403, -0.1131, -0.0317]], device='cuda:0'), grad: tensor([[ 6.9151e-08, 1.2517e-05, -1.9759e-05, ..., 1.5184e-05, -1.0252e-05, -4.7654e-05], [ 1.1176e-08, 6.8605e-05, 5.4777e-05, ..., 4.6581e-05, 4.0070e-07, -3.3408e-05], [-6.5146e-07, 5.4121e-05, 6.7234e-05, ..., 3.4422e-05, 1.4203e-07, 2.2784e-05], ..., [ 1.2410e-07, -2.3246e-04, -8.3089e-05, ..., -1.7607e-04, 6.2399e-07, 3.4243e-05], [ 3.7043e-07, 2.7940e-05, 5.3406e-05, ..., 3.7342e-05, 1.0617e-06, 1.2986e-05], [ 1.6298e-09, 8.9586e-05, -1.4734e-04, ..., -8.4817e-05, 6.7148e-07, -4.2111e-05]], device='cuda:0') Epoch 36, bias, value: tensor([-0.0148, -0.0123, -0.0257, -0.0245, -0.0111, 0.0042, 0.0067, -0.0242, -0.0096, 0.0075], device='cuda:0'), grad: tensor([-1.8492e-05, 1.1659e-04, 1.3888e-04, -3.0732e-04, 7.0989e-05, 1.7130e-04, -3.7760e-05, -1.8656e-04, 1.2314e-04, -7.1228e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 35---------------------------------------------------- epoch 35, time 222.16, cls_loss 0.0109 cls_loss_mapping 0.0317 cls_loss_causal 0.7748 re_mapping 0.0169 re_causal 0.0552 /// teacc 98.78 lr 0.00010000 Epoch 37, weight, value: tensor([[-0.0307, -0.0636, 0.0572, ..., -0.0140, 0.0322, 0.0139], [-0.0327, -0.0098, -0.0138, ..., -0.0234, -0.0622, -0.0410], [ 0.0384, -0.0204, -0.0033, ..., -0.0163, -0.0137, -0.1348], ..., [ 0.0087, 0.0373, -0.0012, ..., 0.0525, -0.0284, -0.0497], [ 0.0013, -0.0273, 0.0563, ..., -0.0317, -0.1028, 0.0562], [-0.0156, -0.0574, 0.0441, ..., 0.0410, -0.1135, -0.0314]], device='cuda:0'), grad: tensor([[ 1.2562e-05, 2.2352e-05, -9.6083e-05, ..., 2.3186e-05, -4.6287e-07, 1.7989e-04], [ 3.6269e-05, -8.1778e-04, -3.7742e-04, ..., 3.6210e-05, -7.4208e-05, 4.9770e-05], [ 9.0599e-05, -1.4639e-03, -7.6532e-05, ..., -1.0023e-03, 1.9986e-06, 1.3771e-03], ..., [ 1.5691e-05, 1.8358e-03, 4.3106e-04, ..., 7.2050e-04, 3.6247e-06, 5.5742e-04], [-4.2367e-04, 5.1379e-05, 1.8871e-04, ..., 5.0187e-05, 8.7358e-07, -5.3825e-03], [ 5.9493e-06, 2.2531e-04, -6.4659e-04, ..., -6.4611e-04, 6.0834e-06, 2.2399e-04]], device='cuda:0') Epoch 37, bias, value: tensor([-0.0146, -0.0123, -0.0259, -0.0249, -0.0113, 0.0042, 0.0069, -0.0240, -0.0096, 0.0078], device='cuda:0'), grad: tensor([ 0.0003, -0.0019, -0.0008, 0.0008, 0.0011, 0.0051, -0.0004, 0.0050, -0.0089, -0.0002], device='cuda:0') 100 0.0001 changing lr epoch 36, time 221.58, cls_loss 0.0102 cls_loss_mapping 0.0251 cls_loss_causal 0.7310 re_mapping 0.0160 re_causal 0.0516 /// teacc 98.78 lr 0.00010000 Epoch 38, weight, value: tensor([[-0.0343, -0.0640, 0.0577, ..., -0.0146, 0.0328, 0.0146], [-0.0330, -0.0106, -0.0142, ..., -0.0236, -0.0623, -0.0407], [ 0.0390, -0.0212, -0.0037, ..., -0.0166, -0.0134, -0.1356], ..., [ 0.0043, 0.0385, -0.0016, ..., 0.0525, -0.0285, -0.0517], [ 0.0042, -0.0271, 0.0569, ..., -0.0314, -0.1034, 0.0572], [-0.0161, -0.0583, 0.0447, ..., 0.0413, -0.1137, -0.0320]], device='cuda:0'), grad: tensor([[ 1.8477e-05, 1.2711e-05, -4.5896e-05, ..., 1.6704e-05, 8.0466e-06, 2.1124e-04], [ 1.2433e-07, 4.7356e-05, -3.8207e-05, ..., 8.4341e-05, 3.5507e-07, -1.9467e-04], [ 3.4668e-07, 4.9382e-05, 7.3075e-05, ..., 4.1306e-05, 2.6952e-06, 8.8573e-05], ..., [ 1.1958e-06, -3.1400e-04, -1.0407e-04, ..., -3.1900e-04, 1.9139e-07, 3.8773e-05], [ 1.3765e-06, 2.3797e-05, 3.7462e-05, ..., 2.6777e-05, 9.0618e-07, -3.6061e-06], [ 4.3958e-06, -1.3304e-04, -1.4811e-03, ..., 5.5456e-04, 2.7637e-07, -3.5095e-04]], device='cuda:0') Epoch 38, bias, value: tensor([-0.0143, -0.0126, -0.0257, -0.0252, -0.0107, 0.0039, 0.0067, -0.0241, -0.0093, 0.0076], device='cuda:0'), grad: tensor([ 4.0197e-04, -5.9795e-04, 3.4046e-04, 2.3384e-03, -4.4775e-04, -8.0943e-05, -1.0216e-04, -3.0184e-04, 2.4116e-04, -1.7891e-03], device='cuda:0') 100 0.0001 changing lr epoch 37, time 221.47, cls_loss 0.0107 cls_loss_mapping 0.0243 cls_loss_causal 0.7298 re_mapping 0.0164 re_causal 0.0516 /// teacc 98.73 lr 0.00010000 Epoch 39, weight, value: tensor([[-0.0412, -0.0649, 0.0580, ..., -0.0154, 0.0333, 0.0150], [-0.0333, -0.0113, -0.0148, ..., -0.0250, -0.0623, -0.0410], [ 0.0397, -0.0221, -0.0041, ..., -0.0174, -0.0132, -0.1370], ..., [-0.0011, 0.0398, -0.0015, ..., 0.0536, -0.0287, -0.0524], [ 0.0039, -0.0278, 0.0573, ..., -0.0312, -0.1036, 0.0572], [-0.0191, -0.0596, 0.0450, ..., 0.0413, -0.1142, -0.0323]], device='cuda:0'), grad: tensor([[ 1.0328e-06, 2.6560e-04, 9.8705e-04, ..., 6.0606e-04, 6.2864e-09, 1.2302e-04], [ 3.1129e-07, -4.2394e-06, 2.9039e-04, ..., 4.0054e-04, 3.1432e-08, 1.3447e-04], [ 2.5006e-07, 1.8883e-04, 3.2473e-04, ..., 2.0611e-04, 5.3551e-08, 4.4793e-05], ..., [ 3.3621e-06, 3.4630e-05, 2.3804e-03, ..., 1.3609e-03, 4.9826e-08, 4.5276e-04], [ 6.0610e-06, 1.1253e-04, 2.6894e-04, ..., 2.9969e-04, 8.7544e-08, 2.3246e-04], [ 2.0862e-06, -8.6641e-04, -3.8643e-03, ..., -1.7633e-03, 1.7695e-07, 6.5446e-05]], device='cuda:0') Epoch 39, bias, value: tensor([-0.0147, -0.0129, -0.0261, -0.0249, -0.0108, 0.0039, 0.0072, -0.0235, -0.0095, 0.0074], device='cuda:0'), grad: tensor([ 2.0428e-03, 3.9744e-04, 7.0000e-04, 2.5463e-04, -1.6680e-03, -3.3170e-05, 1.2058e-04, 5.0659e-03, 7.4530e-04, -7.6294e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 38---------------------------------------------------- epoch 38, time 222.39, cls_loss 0.0096 cls_loss_mapping 0.0223 cls_loss_causal 0.7305 re_mapping 0.0154 re_causal 0.0492 /// teacc 98.80 lr 0.00010000 Epoch 40, weight, value: tensor([[-0.0438, -0.0652, 0.0586, ..., -0.0160, 0.0338, 0.0153], [-0.0333, -0.0111, -0.0151, ..., -0.0257, -0.0626, -0.0414], [ 0.0399, -0.0227, -0.0047, ..., -0.0179, -0.0131, -0.1387], ..., [-0.0018, 0.0404, -0.0015, ..., 0.0544, -0.0286, -0.0529], [ 0.0036, -0.0284, 0.0580, ..., -0.0312, -0.1040, 0.0578], [-0.0197, -0.0610, 0.0453, ..., 0.0414, -0.1147, -0.0331]], device='cuda:0'), grad: tensor([[ 2.8163e-06, 7.1637e-06, -1.2100e-04, ..., 7.0035e-05, -1.5339e-06, -5.6028e-05], [ 4.4610e-07, 2.3186e-05, 1.5177e-05, ..., 9.8109e-05, 2.8871e-08, -3.6168e-04], [ 3.8883e-07, 5.2512e-05, 3.1054e-05, ..., 1.5700e-04, 1.3318e-07, 5.9307e-05], ..., [ 3.6345e-07, -9.5308e-05, 1.7834e-04, ..., 4.9496e-04, 1.0710e-07, 4.5329e-05], [-7.3537e-06, 1.4693e-05, 5.0783e-05, ..., 1.3006e-04, 1.9628e-07, 2.7478e-05], [ 1.5311e-06, -3.4183e-05, -4.9210e-04, ..., -1.8740e-04, 3.1199e-07, -1.9252e-05]], device='cuda:0') Epoch 40, bias, value: tensor([-0.0144, -0.0129, -0.0260, -0.0249, -0.0106, 0.0039, 0.0074, -0.0236, -0.0094, 0.0070], device='cuda:0'), grad: tensor([ 0.0002, -0.0002, 0.0005, 0.0001, -0.0027, 0.0001, 0.0006, 0.0011, 0.0003, -0.0001], device='cuda:0') 100 0.0001 changing lr epoch 39, time 221.18, cls_loss 0.0095 cls_loss_mapping 0.0264 cls_loss_causal 0.7367 re_mapping 0.0145 re_causal 0.0485 /// teacc 98.74 lr 0.00010000 Epoch 41, weight, value: tensor([[-0.0500, -0.0665, 0.0590, ..., -0.0164, 0.0340, 0.0156], [-0.0335, -0.0115, -0.0160, ..., -0.0263, -0.0626, -0.0413], [ 0.0404, -0.0230, -0.0048, ..., -0.0181, -0.0129, -0.1402], ..., [-0.0066, 0.0414, -0.0016, ..., 0.0547, -0.0286, -0.0538], [ 0.0023, -0.0291, 0.0585, ..., -0.0314, -0.1043, 0.0589], [-0.0222, -0.0621, 0.0461, ..., 0.0424, -0.1148, -0.0334]], device='cuda:0'), grad: tensor([[ 5.7705e-06, 3.2037e-06, -4.7636e-04, ..., 1.1912e-06, -1.4508e-04, -3.5238e-04], [ 1.3988e-06, -4.5848e-04, -5.0068e-05, ..., 1.3717e-05, 6.2399e-08, -2.5797e-04], [ 2.9653e-06, 2.2817e-04, 2.4959e-05, ..., 5.0068e-06, 4.4750e-07, 6.7472e-05], ..., [ 2.4643e-06, -2.5377e-05, -3.0100e-05, ..., -6.4313e-05, 1.5576e-07, 9.4414e-05], [ 7.4804e-06, 3.6925e-05, -1.6546e-04, ..., 9.8944e-06, 8.1491e-07, 1.9535e-05], [ 2.2456e-05, 1.9163e-05, 3.1412e-05, ..., -6.5342e-06, 1.6391e-06, 1.4639e-04]], device='cuda:0') Epoch 41, bias, value: tensor([-0.0144, -0.0131, -0.0257, -0.0252, -0.0110, 0.0040, 0.0072, -0.0236, -0.0092, 0.0073], device='cuda:0'), grad: tensor([-0.0007, -0.0036, 0.0016, 0.0003, 0.0001, -0.0002, 0.0009, 0.0013, 0.0002, 0.0002], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 40---------------------------------------------------- epoch 40, time 221.97, cls_loss 0.0080 cls_loss_mapping 0.0236 cls_loss_causal 0.7013 re_mapping 0.0157 re_causal 0.0509 /// teacc 98.86 lr 0.00010000 Epoch 42, weight, value: tensor([[-5.6239e-02, -6.7219e-02, 5.9017e-02, ..., -1.6961e-02, 3.3974e-02, 1.5501e-02], [-3.3698e-02, -1.2168e-02, -1.6550e-02, ..., -2.6520e-02, -6.2712e-02, -4.0966e-02], [ 3.9960e-02, -2.3497e-02, -5.2284e-03, ..., -1.8055e-02, -1.2972e-02, -1.4215e-01], ..., [-8.7664e-03, 4.2172e-02, -1.7169e-03, ..., 5.4585e-02, -2.8594e-02, -5.4580e-02], [ 6.7347e-05, -2.9791e-02, 5.8638e-02, ..., -3.1699e-02, -1.0404e-01, 5.9027e-02], [-2.0229e-02, -6.2979e-02, 4.6852e-02, ..., 4.2775e-02, -1.1516e-01, -3.3228e-02]], device='cuda:0'), grad: tensor([[ 9.9316e-06, 1.2390e-05, -8.3089e-05, ..., 2.5090e-06, -1.2731e-06, -3.7253e-05], [ 1.8235e-06, -1.6105e-04, 1.2957e-05, ..., 4.0419e-06, 2.7008e-08, -1.0687e-04], [-2.8275e-06, 7.4029e-05, 6.0827e-05, ..., 5.0738e-06, 3.1409e-07, 2.4632e-05], ..., [ 6.2995e-06, 5.6553e-04, 9.5248e-05, ..., 5.8487e-07, 1.7905e-07, 3.0145e-05], [ 1.0751e-05, 3.1114e-05, 1.8984e-05, ..., 8.6799e-06, 6.8918e-08, 3.1739e-05], [ 4.0442e-05, 2.0057e-05, -1.1593e-04, ..., 1.4877e-04, 4.0396e-07, 1.0341e-04]], device='cuda:0') Epoch 42, bias, value: tensor([-0.0147, -0.0128, -0.0259, -0.0248, -0.0108, 0.0039, 0.0073, -0.0238, -0.0097, 0.0075], device='cuda:0'), grad: tensor([-8.8930e-05, -6.5756e-04, -6.7997e-04, -4.8542e-04, -1.6618e-04, -3.7026e-04, 3.7861e-04, 1.4973e-03, 1.1820e-04, 4.5419e-04], device='cuda:0') 100 0.0001 changing lr epoch 41, time 221.03, cls_loss 0.0079 cls_loss_mapping 0.0219 cls_loss_causal 0.6764 re_mapping 0.0151 re_causal 0.0472 /// teacc 98.72 lr 0.00010000 Epoch 43, weight, value: tensor([[-0.0547, -0.0680, 0.0595, ..., -0.0174, 0.0343, 0.0161], [-0.0339, -0.0122, -0.0167, ..., -0.0272, -0.0631, -0.0407], [ 0.0400, -0.0242, -0.0057, ..., -0.0184, -0.0128, -0.1433], ..., [-0.0137, 0.0430, -0.0013, ..., 0.0552, -0.0283, -0.0548], [ 0.0032, -0.0302, 0.0591, ..., -0.0319, -0.1047, 0.0598], [-0.0207, -0.0640, 0.0471, ..., 0.0428, -0.1158, -0.0337]], device='cuda:0'), grad: tensor([[ 1.0361e-07, 4.7609e-06, 6.1244e-06, ..., 6.6720e-06, -5.5246e-06, -8.9109e-06], [ 1.3737e-08, 1.3578e-04, 1.1951e-04, ..., 1.9741e-04, 2.7241e-07, -1.6205e-07], [ 1.1642e-08, 1.5020e-05, 1.4198e-04, ..., 1.5482e-05, 2.5555e-06, 7.3075e-05], ..., [ 4.6799e-08, -4.2653e-04, -2.1100e-04, ..., -6.1893e-04, 4.0047e-07, 2.1055e-05], [ 2.5658e-07, 6.4038e-06, -6.2101e-06, ..., 1.1571e-05, 1.3039e-06, -3.5822e-05], [ 4.1001e-07, 7.2122e-05, -9.2220e-04, ..., 9.1717e-06, 4.8429e-07, -4.0770e-04]], device='cuda:0') Epoch 43, bias, value: tensor([-0.0144, -0.0129, -0.0257, -0.0253, -0.0108, 0.0038, 0.0072, -0.0234, -0.0094, 0.0071], device='cuda:0'), grad: tensor([ 0.0001, -0.0012, 0.0008, 0.0008, 0.0023, 0.0006, 0.0002, -0.0009, 0.0001, -0.0026], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 42---------------------------------------------------- epoch 42, time 221.86, cls_loss 0.0076 cls_loss_mapping 0.0227 cls_loss_causal 0.7138 re_mapping 0.0146 re_causal 0.0502 /// teacc 98.90 lr 0.00010000 Epoch 44, weight, value: tensor([[-0.0561, -0.0691, 0.0599, ..., -0.0177, 0.0343, 0.0166], [-0.0341, -0.0121, -0.0170, ..., -0.0279, -0.0632, -0.0409], [ 0.0400, -0.0245, -0.0062, ..., -0.0188, -0.0130, -0.1446], ..., [-0.0158, 0.0435, -0.0015, ..., 0.0556, -0.0284, -0.0553], [ 0.0019, -0.0300, 0.0598, ..., -0.0322, -0.1049, 0.0601], [-0.0203, -0.0646, 0.0473, ..., 0.0427, -0.1161, -0.0340]], device='cuda:0'), grad: tensor([[ 8.7544e-06, 1.1437e-05, 1.5154e-05, ..., 8.8662e-06, 7.4622e-08, 3.0458e-05], [ 2.1197e-06, 4.9882e-06, 5.0850e-06, ..., 5.0366e-06, 2.8056e-08, -4.8392e-06], [ 1.4855e-06, 4.6268e-06, 1.5080e-05, ..., 3.2280e-06, -1.1567e-06, 3.6091e-05], ..., [ 4.4592e-06, -5.9903e-05, -2.8312e-05, ..., -4.3750e-05, 8.8941e-07, -2.4941e-06], [ 3.4064e-05, 1.1800e-06, -8.2731e-05, ..., 5.6252e-06, 7.1479e-08, -3.9846e-05], [ 1.0304e-05, 3.6448e-05, -2.7612e-05, ..., -5.2154e-06, 4.6566e-09, 3.6687e-05]], device='cuda:0') Epoch 44, bias, value: tensor([-0.0142, -0.0126, -0.0259, -0.0252, -0.0101, 0.0038, 0.0071, -0.0235, -0.0093, 0.0065], device='cuda:0'), grad: tensor([ 9.9182e-05, -8.5384e-06, -2.4462e-04, 5.5599e-04, 1.9297e-05, -6.1417e-04, 1.1289e-04, -8.4490e-06, 5.8383e-05, 2.9683e-05], device='cuda:0') 100 0.0001 changing lr epoch 43, time 221.11, cls_loss 0.0085 cls_loss_mapping 0.0243 cls_loss_causal 0.7035 re_mapping 0.0144 re_causal 0.0463 /// teacc 98.69 lr 0.00010000 Epoch 45, weight, value: tensor([[-0.0580, -0.0699, 0.0598, ..., -0.0184, 0.0345, 0.0167], [-0.0345, -0.0123, -0.0172, ..., -0.0285, -0.0632, -0.0405], [ 0.0399, -0.0245, -0.0065, ..., -0.0192, -0.0127, -0.1460], ..., [-0.0206, 0.0436, -0.0020, ..., 0.0561, -0.0285, -0.0568], [ 0.0002, -0.0292, 0.0604, ..., -0.0325, -0.1051, 0.0610], [-0.0167, -0.0659, 0.0487, ..., 0.0435, -0.1163, -0.0332]], device='cuda:0'), grad: tensor([[ 3.8370e-07, 1.9401e-05, -7.5638e-05, ..., 2.1800e-05, -4.1467e-07, -5.0783e-05], [ 5.7276e-07, 9.9540e-06, 2.1040e-05, ..., 1.2621e-05, 2.2002e-07, 5.1372e-06], [ 2.3609e-07, -1.1221e-05, 5.6267e-05, ..., -8.0466e-06, -5.2154e-06, 4.3571e-05], ..., [ 9.0525e-07, -1.4317e-04, -7.0572e-05, ..., -1.2660e-04, 3.8892e-06, 1.2442e-05], [ 3.6322e-06, 2.8893e-05, -3.2276e-05, ..., 3.7760e-05, 2.1956e-07, -3.0220e-05], [-3.5726e-06, 6.0141e-05, -5.8532e-05, ..., -1.6108e-05, 7.8231e-08, 3.0264e-05]], device='cuda:0') Epoch 45, bias, value: tensor([-0.0145, -0.0126, -0.0254, -0.0252, -0.0112, 0.0040, 0.0069, -0.0239, -0.0091, 0.0073], device='cuda:0'), grad: tensor([-8.3864e-05, 4.0621e-05, -4.4322e-04, 1.7130e-04, 1.1468e-04, 7.6368e-06, -9.5963e-05, -1.2362e-04, 4.1008e-04, 3.3677e-06], device='cuda:0') 100 0.0001 changing lr epoch 44, time 221.22, cls_loss 0.0085 cls_loss_mapping 0.0249 cls_loss_causal 0.7140 re_mapping 0.0140 re_causal 0.0458 /// teacc 98.78 lr 0.00010000 Epoch 46, weight, value: tensor([[-0.0626, -0.0710, 0.0602, ..., -0.0188, 0.0345, 0.0169], [-0.0350, -0.0125, -0.0175, ..., -0.0293, -0.0637, -0.0408], [ 0.0395, -0.0257, -0.0069, ..., -0.0198, -0.0123, -0.1469], ..., [-0.0248, 0.0445, -0.0018, ..., 0.0570, -0.0287, -0.0571], [ 0.0007, -0.0296, 0.0607, ..., -0.0328, -0.1053, 0.0614], [-0.0198, -0.0673, 0.0491, ..., 0.0431, -0.1166, -0.0335]], device='cuda:0'), grad: tensor([[ 7.2002e-05, 1.0945e-05, -1.5516e-03, ..., 3.4332e-05, 6.8080e-07, -3.1948e-03], [ 5.8115e-05, 1.9699e-05, 3.2872e-05, ..., 2.1517e-05, 3.0012e-07, -1.4365e-04], [ 1.6582e-04, 1.3006e-04, 2.7609e-04, ..., 9.2328e-05, 1.2480e-05, 5.9128e-04], ..., [ 1.7837e-05, -4.5359e-05, 6.7241e-07, ..., -3.6806e-05, 1.5218e-06, 4.0501e-05], [ 2.2066e-04, 2.8205e-04, -1.7595e-04, ..., -4.7982e-05, 7.7859e-07, -5.4741e-04], [ 1.1355e-04, 1.5736e-04, 5.5361e-04, ..., 4.1537e-06, 2.0210e-06, 7.2050e-04]], device='cuda:0') Epoch 46, bias, value: tensor([-0.0144, -0.0127, -0.0255, -0.0253, -0.0107, 0.0041, 0.0072, -0.0236, -0.0094, 0.0067], device='cuda:0'), grad: tensor([-5.1613e-03, -3.4833e-04, 1.4801e-03, -1.9684e-03, 1.8816e-03, 8.9455e-04, 2.0638e-03, 8.7678e-05, -2.2757e-04, 1.2980e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 45---------------------------------------------------- epoch 45, time 221.83, cls_loss 0.0078 cls_loss_mapping 0.0193 cls_loss_causal 0.6629 re_mapping 0.0132 re_causal 0.0425 /// teacc 98.94 lr 0.00010000 Epoch 47, weight, value: tensor([[-0.0678, -0.0724, 0.0607, ..., -0.0192, 0.0343, 0.0166], [-0.0351, -0.0126, -0.0178, ..., -0.0296, -0.0649, -0.0417], [ 0.0378, -0.0266, -0.0071, ..., -0.0202, -0.0119, -0.1493], ..., [-0.0261, 0.0456, -0.0017, ..., 0.0574, -0.0285, -0.0575], [-0.0024, -0.0301, 0.0609, ..., -0.0331, -0.1059, 0.0619], [-0.0193, -0.0684, 0.0494, ..., 0.0433, -0.1168, -0.0336]], device='cuda:0'), grad: tensor([[ 4.8101e-05, 6.6422e-06, 1.0334e-05, ..., 1.9580e-05, 1.3737e-08, 1.6904e-04], [ 1.4074e-05, 1.1846e-05, 1.8448e-05, ..., 1.5616e-05, 9.3132e-10, 1.0677e-05], [ 5.7429e-05, 3.5822e-05, 3.1859e-05, ..., 4.7803e-05, 1.3970e-09, 7.0214e-05], ..., [ 5.9716e-06, -9.5665e-05, -1.7807e-05, ..., -6.8188e-05, 0.0000e+00, 3.1441e-05], [ 3.7163e-05, -4.2468e-06, 2.6798e-04, ..., 9.7826e-06, 1.5367e-08, 1.0481e-03], [ 2.0862e-05, 5.1498e-05, 2.7448e-05, ..., 2.4140e-06, 1.1642e-09, 6.8009e-05]], device='cuda:0') Epoch 47, bias, value: tensor([-0.0146, -0.0124, -0.0260, -0.0253, -0.0105, 0.0043, 0.0070, -0.0235, -0.0093, 0.0068], device='cuda:0'), grad: tensor([ 5.0163e-04, -1.2755e-04, 5.4836e-04, 5.7757e-05, 2.6817e-03, 4.1237e-03, -9.1476e-03, 4.8250e-05, 1.0233e-03, 2.8729e-04], device='cuda:0') 100 0.0001 changing lr epoch 46, time 220.87, cls_loss 0.0079 cls_loss_mapping 0.0207 cls_loss_causal 0.7199 re_mapping 0.0135 re_causal 0.0434 /// teacc 98.84 lr 0.00010000 Epoch 48, weight, value: tensor([[-0.0696, -0.0736, 0.0604, ..., -0.0205, 0.0344, 0.0164], [-0.0355, -0.0130, -0.0184, ..., -0.0307, -0.0651, -0.0419], [ 0.0380, -0.0273, -0.0077, ..., -0.0213, -0.0116, -0.1505], ..., [-0.0290, 0.0459, -0.0019, ..., 0.0580, -0.0285, -0.0583], [-0.0040, -0.0303, 0.0614, ..., -0.0330, -0.1061, 0.0620], [-0.0209, -0.0694, 0.0503, ..., 0.0439, -0.1170, -0.0340]], device='cuda:0'), grad: tensor([[ 2.4289e-06, 3.0901e-06, -1.2469e-04, ..., 3.3975e-06, -2.2456e-05, -5.7071e-06], [ 1.6704e-05, 1.8567e-05, 1.5640e-04, ..., 1.4506e-05, 2.6263e-07, -3.3545e-04], [ 5.3167e-05, 6.2346e-05, 3.9011e-05, ..., 4.2319e-05, 3.2932e-06, 8.2314e-05], ..., [-2.2054e-05, -2.6369e-04, -2.0742e-04, ..., -3.1447e-04, 1.4724e-06, 2.9594e-05], [-3.7819e-05, 1.2353e-05, -2.7227e-04, ..., 2.3901e-05, 7.6089e-07, -4.1389e-04], [ 2.6274e-04, 1.5259e-04, 1.2529e-04, ..., 2.6393e-04, 9.2387e-06, 4.4227e-05]], device='cuda:0') Epoch 48, bias, value: tensor([-0.0154, -0.0126, -0.0259, -0.0249, -0.0107, 0.0043, 0.0078, -0.0238, -0.0096, 0.0071], device='cuda:0'), grad: tensor([-0.0001, -0.0008, 0.0001, 0.0002, -0.0004, 0.0002, 0.0005, -0.0004, -0.0002, 0.0009], device='cuda:0') 100 0.0001 changing lr epoch 47, time 221.62, cls_loss 0.0068 cls_loss_mapping 0.0191 cls_loss_causal 0.6630 re_mapping 0.0129 re_causal 0.0418 /// teacc 98.72 lr 0.00010000 Epoch 49, weight, value: tensor([[-0.0697, -0.0746, 0.0609, ..., -0.0207, 0.0346, 0.0165], [-0.0357, -0.0139, -0.0192, ..., -0.0320, -0.0656, -0.0420], [ 0.0381, -0.0281, -0.0081, ..., -0.0216, -0.0115, -0.1518], ..., [-0.0306, 0.0473, -0.0018, ..., 0.0587, -0.0282, -0.0589], [-0.0043, -0.0305, 0.0620, ..., -0.0331, -0.1068, 0.0637], [-0.0205, -0.0706, 0.0506, ..., 0.0439, -0.1174, -0.0345]], device='cuda:0'), grad: tensor([[ 3.7136e-07, 3.8415e-05, 1.1377e-05, ..., 1.5646e-06, 1.6913e-06, 2.9162e-05], [ 5.2992e-07, -1.9705e-04, 2.1309e-05, ..., 3.1870e-06, 1.6764e-07, -1.5318e-04], [ 1.2293e-07, -6.8426e-04, -3.9554e-04, ..., -6.5416e-06, -6.5327e-05, 2.4021e-05], ..., [ 4.5402e-07, 1.5783e-04, 7.7665e-05, ..., -5.3644e-06, 7.1060e-07, 2.5883e-05], [ 4.4629e-06, 2.9898e-04, 1.1462e-04, ..., 1.0826e-05, 5.8174e-05, -2.1780e-04], [ 2.9150e-07, 6.1914e-06, -8.9228e-05, ..., -7.3016e-05, 1.5786e-07, 4.2021e-05]], device='cuda:0') Epoch 49, bias, value: tensor([-0.0153, -0.0131, -0.0263, -0.0253, -0.0104, 0.0042, 0.0074, -0.0233, -0.0087, 0.0069], device='cuda:0'), grad: tensor([ 2.0373e-04, -1.4105e-03, -1.3838e-03, 1.2455e-03, 2.2590e-04, -8.2374e-05, 1.2457e-04, 5.1308e-04, 5.7220e-04, -6.8992e-06], device='cuda:0') 100 0.0001 changing lr epoch 48, time 221.23, cls_loss 0.0086 cls_loss_mapping 0.0210 cls_loss_causal 0.7020 re_mapping 0.0132 re_causal 0.0422 /// teacc 98.91 lr 0.00010000 Epoch 50, weight, value: tensor([[-0.0701, -0.0758, 0.0608, ..., -0.0218, 0.0349, 0.0169], [-0.0358, -0.0138, -0.0204, ..., -0.0329, -0.0659, -0.0426], [ 0.0380, -0.0284, -0.0087, ..., -0.0223, -0.0106, -0.1529], ..., [-0.0311, 0.0484, -0.0013, ..., 0.0595, -0.0283, -0.0594], [-0.0048, -0.0307, 0.0626, ..., -0.0331, -0.1078, 0.0641], [-0.0204, -0.0721, 0.0513, ..., 0.0441, -0.1177, -0.0345]], device='cuda:0'), grad: tensor([[ 7.4692e-07, 2.1979e-05, 6.2287e-06, ..., 9.5293e-06, 6.3479e-06, 4.3213e-05], [ 1.8207e-07, 9.4399e-06, 9.7081e-06, ..., 1.0863e-05, 1.3970e-05, 4.0233e-05], [ 2.1770e-07, 6.8545e-05, 5.6505e-05, ..., 5.6803e-05, 2.1622e-05, 1.4174e-04], ..., [ 1.2312e-06, -1.1814e-04, -6.1274e-05, ..., -1.1331e-04, 8.9966e-07, 2.1562e-05], [ 5.6215e-06, 1.1957e-04, 3.1382e-05, ..., -1.6540e-05, 1.7554e-05, 2.4557e-04], [ 9.3728e-06, 4.0740e-05, 5.7727e-05, ..., 5.6237e-05, 5.6392e-07, 4.8637e-05]], device='cuda:0') Epoch 50, bias, value: tensor([-0.0151, -0.0135, -0.0256, -0.0256, -0.0105, 0.0038, 0.0072, -0.0231, -0.0087, 0.0071], device='cuda:0'), grad: tensor([ 9.6619e-05, 8.7440e-05, 3.7432e-04, -8.1253e-04, -3.7819e-05, 1.8501e-04, -5.6791e-04, -1.0949e-04, 6.1655e-04, 1.6749e-04], device='cuda:0') 100 0.0001 changing lr epoch 49, time 220.86, cls_loss 0.0061 cls_loss_mapping 0.0193 cls_loss_causal 0.6482 re_mapping 0.0129 re_causal 0.0405 /// teacc 98.86 lr 0.00010000 Epoch 51, weight, value: tensor([[-0.0708, -0.0765, 0.0611, ..., -0.0224, 0.0350, 0.0173], [-0.0358, -0.0142, -0.0208, ..., -0.0334, -0.0659, -0.0419], [ 0.0381, -0.0291, -0.0092, ..., -0.0229, -0.0105, -0.1544], ..., [-0.0313, 0.0493, -0.0012, ..., 0.0599, -0.0284, -0.0600], [-0.0056, -0.0314, 0.0628, ..., -0.0332, -0.1082, 0.0642], [-0.0208, -0.0730, 0.0518, ..., 0.0443, -0.1179, -0.0351]], device='cuda:0'), grad: tensor([[ 2.6114e-06, 2.5332e-05, 2.6852e-05, ..., 2.0102e-05, 7.0259e-06, 7.3731e-05], [ 4.4750e-07, 5.3614e-05, 1.2793e-05, ..., 1.9699e-05, 2.4997e-06, -1.1808e-04], [ 8.4843e-07, -1.6940e-04, 3.6836e-05, ..., -7.2718e-05, 3.5763e-06, 1.1784e-04], ..., [ 4.3102e-06, 2.5058e-04, 7.5698e-05, ..., -8.0228e-05, 3.3178e-07, 4.3273e-05], [ 4.9397e-06, 1.1081e-04, 1.2290e-04, ..., 4.9502e-05, 1.4722e-05, 1.0192e-04], [-2.3823e-06, 7.5996e-05, -1.5700e-04, ..., 2.2721e-04, 1.2117e-06, -1.1140e-04]], device='cuda:0') Epoch 51, bias, value: tensor([-0.0150, -0.0135, -0.0257, -0.0255, -0.0103, 0.0036, 0.0076, -0.0231, -0.0090, 0.0070], device='cuda:0'), grad: tensor([ 0.0002, -0.0002, -0.0004, -0.0007, -0.0005, 0.0002, -0.0003, 0.0007, 0.0004, 0.0005], device='cuda:0') 100 0.0001 changing lr epoch 50, time 221.53, cls_loss 0.0066 cls_loss_mapping 0.0181 cls_loss_causal 0.7141 re_mapping 0.0126 re_causal 0.0411 /// teacc 98.62 lr 0.00010000 Epoch 52, weight, value: tensor([[-0.0731, -0.0775, 0.0612, ..., -0.0229, 0.0350, 0.0169], [-0.0359, -0.0136, -0.0207, ..., -0.0326, -0.0658, -0.0405], [ 0.0382, -0.0291, -0.0096, ..., -0.0231, -0.0105, -0.1560], ..., [-0.0319, 0.0495, -0.0014, ..., 0.0597, -0.0284, -0.0614], [-0.0062, -0.0321, 0.0636, ..., -0.0331, -0.1085, 0.0655], [-0.0216, -0.0742, 0.0522, ..., 0.0447, -0.1183, -0.0363]], device='cuda:0'), grad: tensor([[ 5.1036e-07, 2.3186e-05, 1.4916e-05, ..., 1.1228e-05, 6.6757e-06, 4.2409e-05], [ 5.3318e-08, 2.5749e-04, 2.0051e-04, ..., 1.0580e-04, 1.6196e-06, 7.2002e-05], [ 8.0327e-08, 4.4316e-05, 4.6521e-05, ..., 1.8224e-05, -1.0453e-05, 5.2452e-05], ..., [ 3.2969e-07, -6.3562e-04, -4.3464e-04, ..., -2.1458e-04, 4.2096e-06, -2.2173e-04], [ 8.6334e-07, 6.1572e-05, 2.8722e-06, ..., 3.4034e-05, 1.2200e-06, -7.4387e-05], [ 1.0580e-06, 1.4544e-04, 2.2650e-05, ..., 2.2680e-05, 1.4645e-07, 4.2558e-05]], device='cuda:0') Epoch 52, bias, value: tensor([-0.0156, -0.0128, -0.0252, -0.0252, -0.0106, 0.0033, 0.0074, -0.0238, -0.0084, 0.0070], device='cuda:0'), grad: tensor([ 1.1647e-04, 1.9860e-04, 7.9274e-05, 1.6809e-04, 3.7909e-05, 6.8367e-05, -8.9169e-05, -7.4673e-04, 1.1258e-05, 1.5557e-04], device='cuda:0') 100 0.0001 changing lr epoch 51, time 221.36, cls_loss 0.0068 cls_loss_mapping 0.0206 cls_loss_causal 0.6648 re_mapping 0.0126 re_causal 0.0403 /// teacc 98.77 lr 0.00010000 Epoch 53, weight, value: tensor([[-0.0744, -0.0780, 0.0614, ..., -0.0235, 0.0350, 0.0165], [-0.0360, -0.0141, -0.0212, ..., -0.0335, -0.0665, -0.0406], [ 0.0381, -0.0298, -0.0103, ..., -0.0235, -0.0105, -0.1582], ..., [-0.0331, 0.0504, -0.0013, ..., 0.0605, -0.0278, -0.0617], [-0.0073, -0.0324, 0.0645, ..., -0.0332, -0.1087, 0.0663], [-0.0214, -0.0752, 0.0527, ..., 0.0448, -0.1185, -0.0368]], device='cuda:0'), grad: tensor([[ 1.3024e-05, 4.5523e-06, 7.9349e-07, ..., 4.4852e-06, 0.0000e+00, 2.9296e-05], [ 1.7844e-06, 3.6471e-06, 3.2801e-06, ..., 4.4554e-06, 0.0000e+00, -2.7686e-05], [ 1.2189e-05, 1.4424e-05, 1.2815e-05, ..., 1.2532e-05, 0.0000e+00, 2.2545e-05], ..., [ 1.1615e-05, -5.3287e-05, 4.0233e-06, ..., -1.5929e-05, 0.0000e+00, 1.6630e-05], [ 1.8984e-05, 5.2825e-06, -4.6194e-05, ..., -4.3362e-06, 0.0000e+00, -2.4408e-05], [ 1.4710e-04, 2.9117e-05, -6.2764e-05, ..., -6.5029e-05, 0.0000e+00, 1.6117e-04]], device='cuda:0') Epoch 53, bias, value: tensor([-0.0161, -0.0131, -0.0257, -0.0253, -0.0105, 0.0043, 0.0065, -0.0235, -0.0080, 0.0070], device='cuda:0'), grad: tensor([ 7.5221e-05, -7.5817e-05, 2.4855e-05, 9.7752e-04, 8.0466e-05, -1.5793e-03, -3.8266e-05, 3.4332e-05, 7.9393e-05, 4.2200e-04], device='cuda:0') 100 0.0001 changing lr epoch 52, time 221.39, cls_loss 0.0060 cls_loss_mapping 0.0156 cls_loss_causal 0.6599 re_mapping 0.0127 re_causal 0.0395 /// teacc 98.84 lr 0.00010000 Epoch 54, weight, value: tensor([[-0.0755, -0.0786, 0.0617, ..., -0.0239, 0.0350, 0.0169], [-0.0362, -0.0144, -0.0216, ..., -0.0342, -0.0666, -0.0406], [ 0.0384, -0.0305, -0.0108, ..., -0.0239, -0.0105, -0.1609], ..., [-0.0343, 0.0511, -0.0013, ..., 0.0605, -0.0279, -0.0622], [-0.0082, -0.0329, 0.0651, ..., -0.0328, -0.1089, 0.0666], [-0.0210, -0.0760, 0.0529, ..., 0.0449, -0.1185, -0.0377]], device='cuda:0'), grad: tensor([[ 1.0327e-05, 1.8477e-05, 4.9407e-07, ..., 1.0826e-05, 4.5411e-06, 3.1084e-05], [ 4.5784e-06, 2.2531e-05, 1.4871e-05, ..., 1.0774e-05, 2.0489e-06, 1.2107e-05], [ 8.4750e-07, -4.7088e-05, 5.2273e-05, ..., 1.4409e-05, -8.2612e-05, 3.3736e-05], ..., [ 8.9779e-06, 1.2732e-04, 1.1295e-04, ..., 5.0992e-05, 2.6196e-05, 1.0198e-04], [ 9.0450e-06, 8.4519e-05, -2.0540e-04, ..., -1.3030e-04, 9.7379e-06, -1.7512e-04], [ 1.8024e-04, 4.5121e-05, 7.5996e-05, ..., 1.1820e-04, 9.6858e-07, 4.2057e-04]], device='cuda:0') Epoch 54, bias, value: tensor([-0.0156, -0.0134, -0.0262, -0.0251, -0.0101, 0.0049, 0.0066, -0.0237, -0.0080, 0.0066], device='cuda:0'), grad: tensor([ 8.8871e-05, 4.5061e-05, -3.6025e-04, -1.7996e-03, 5.6028e-06, 4.1699e-04, 4.1199e-04, 4.4823e-04, -1.3120e-05, 7.5626e-04], device='cuda:0') 100 0.0001 changing lr epoch 53, time 221.18, cls_loss 0.0048 cls_loss_mapping 0.0144 cls_loss_causal 0.6426 re_mapping 0.0127 re_causal 0.0399 /// teacc 98.82 lr 0.00010000 Epoch 55, weight, value: tensor([[-0.0772, -0.0785, 0.0622, ..., -0.0243, 0.0358, 0.0177], [-0.0364, -0.0147, -0.0218, ..., -0.0349, -0.0668, -0.0401], [ 0.0381, -0.0309, -0.0113, ..., -0.0244, -0.0101, -0.1619], ..., [-0.0360, 0.0516, -0.0015, ..., 0.0606, -0.0279, -0.0626], [-0.0095, -0.0333, 0.0650, ..., -0.0332, -0.1092, 0.0667], [-0.0197, -0.0766, 0.0537, ..., 0.0454, -0.1189, -0.0377]], device='cuda:0'), grad: tensor([[ 3.0478e-07, 3.0249e-06, -6.8367e-05, ..., 3.2801e-06, 6.3842e-07, -2.8387e-05], [ 1.0841e-06, 1.8343e-05, 8.5905e-06, ..., 1.2703e-05, 5.3691e-07, -1.4491e-05], [ 6.7614e-07, 6.9201e-05, 5.6088e-05, ..., 2.5347e-05, 1.2971e-05, 8.8751e-05], ..., [ 3.5703e-05, -6.9618e-05, -2.5213e-05, ..., 1.2457e-04, 1.1129e-07, 1.4164e-05], [ 3.3248e-06, 1.0207e-05, -4.9472e-05, ..., 1.4957e-06, 9.2667e-07, 2.1029e-06], [ 1.0245e-05, 1.9163e-05, -3.7588e-06, ..., 3.6985e-05, 1.3923e-07, 2.6509e-05]], device='cuda:0') Epoch 55, bias, value: tensor([-0.0153, -0.0132, -0.0263, -0.0251, -0.0102, 0.0045, 0.0071, -0.0240, -0.0087, 0.0070], device='cuda:0'), grad: tensor([-8.7321e-05, 1.7032e-05, 6.2609e-04, -3.2616e-04, -2.7552e-05, 3.3766e-05, -7.9727e-04, 3.1352e-04, 8.8632e-05, 1.5903e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 54---------------------------------------------------- epoch 54, time 222.17, cls_loss 0.0061 cls_loss_mapping 0.0194 cls_loss_causal 0.6924 re_mapping 0.0119 re_causal 0.0388 /// teacc 99.00 lr 0.00010000 Epoch 56, weight, value: tensor([[-0.0803, -0.0795, 0.0623, ..., -0.0250, 0.0358, 0.0177], [-0.0370, -0.0156, -0.0228, ..., -0.0361, -0.0668, -0.0400], [ 0.0383, -0.0315, -0.0119, ..., -0.0247, -0.0101, -0.1634], ..., [-0.0371, 0.0523, -0.0009, ..., 0.0612, -0.0279, -0.0627], [-0.0116, -0.0337, 0.0655, ..., -0.0334, -0.1094, 0.0670], [-0.0191, -0.0782, 0.0541, ..., 0.0457, -0.1190, -0.0377]], device='cuda:0'), grad: tensor([[ 8.8587e-06, 5.5581e-05, 1.6764e-05, ..., 3.4004e-05, 0.0000e+00, 2.5421e-05], [ 8.5682e-07, 4.1202e-06, 3.5577e-06, ..., 9.3162e-05, 0.0000e+00, 9.2387e-06], [ 6.0583e-07, -4.1723e-05, -7.5519e-05, ..., 2.2814e-05, 0.0000e+00, 2.0444e-05], ..., [-1.9029e-05, -1.2058e-04, -3.8743e-05, ..., -5.4479e-05, 0.0000e+00, -1.1343e-04], [ 8.4117e-06, 2.3007e-05, 3.7670e-05, ..., 2.6405e-05, 0.0000e+00, 3.3706e-05], [ 1.2759e-06, 6.9812e-06, -1.2413e-05, ..., 3.6657e-05, 0.0000e+00, 2.2486e-05]], device='cuda:0') Epoch 56, bias, value: tensor([-0.0154, -0.0137, -0.0265, -0.0245, -0.0101, 0.0042, 0.0076, -0.0235, -0.0088, 0.0067], device='cuda:0'), grad: tensor([ 1.8227e-04, 6.0177e-04, -3.6645e-04, 1.3912e-04, -1.5125e-03, 1.1253e-04, 1.8454e-04, -9.2983e-06, 4.1962e-04, 2.4819e-04], device='cuda:0') 100 0.0001 changing lr epoch 55, time 220.79, cls_loss 0.0069 cls_loss_mapping 0.0173 cls_loss_causal 0.6373 re_mapping 0.0120 re_causal 0.0365 /// teacc 98.92 lr 0.00010000 Epoch 57, weight, value: tensor([[-0.0815, -0.0810, 0.0626, ..., -0.0255, 0.0357, 0.0180], [-0.0374, -0.0161, -0.0234, ..., -0.0368, -0.0670, -0.0407], [ 0.0393, -0.0317, -0.0126, ..., -0.0246, -0.0099, -0.1650], ..., [-0.0398, 0.0525, -0.0007, ..., 0.0619, -0.0278, -0.0635], [-0.0128, -0.0341, 0.0660, ..., -0.0335, -0.1096, 0.0679], [-0.0179, -0.0793, 0.0545, ..., 0.0460, -0.1191, -0.0382]], device='cuda:0'), grad: tensor([[ 4.3074e-07, 8.4098e-07, -2.1473e-05, ..., 1.0896e-06, 0.0000e+00, -3.6927e-07], [ 7.9395e-08, -1.4696e-06, 7.8231e-06, ..., 2.0657e-06, 0.0000e+00, 2.2247e-05], [ 5.9139e-08, 4.9844e-06, 2.0877e-05, ..., 4.5076e-06, 0.0000e+00, 6.8665e-05], ..., [ 7.2271e-07, -1.7881e-05, -4.5262e-06, ..., -1.2711e-05, 0.0000e+00, 2.7612e-05], [ 2.3656e-07, 5.2759e-07, -1.8525e-04, ..., -2.6152e-06, 0.0000e+00, -7.7152e-04], [-3.9153e-06, 1.8366e-06, 1.4573e-05, ..., -3.7942e-06, 0.0000e+00, 3.3885e-05]], device='cuda:0') Epoch 57, bias, value: tensor([-0.0152, -0.0144, -0.0259, -0.0242, -0.0104, 0.0042, 0.0079, -0.0235, -0.0086, 0.0065], device='cuda:0'), grad: tensor([ 6.7689e-06, 3.1501e-05, 1.0920e-04, 4.6444e-04, 5.6362e-04, 1.0377e-04, -2.2873e-05, 6.6578e-05, -1.3742e-03, 5.2333e-05], device='cuda:0') 100 0.0001 changing lr epoch 56, time 221.14, cls_loss 0.0050 cls_loss_mapping 0.0183 cls_loss_causal 0.6524 re_mapping 0.0121 re_causal 0.0392 /// teacc 98.84 lr 0.00010000 Epoch 58, weight, value: tensor([[-0.0823, -0.0824, 0.0629, ..., -0.0265, 0.0357, 0.0182], [-0.0376, -0.0164, -0.0238, ..., -0.0372, -0.0671, -0.0407], [ 0.0394, -0.0321, -0.0132, ..., -0.0251, -0.0097, -0.1664], ..., [-0.0412, 0.0533, -0.0006, ..., 0.0621, -0.0278, -0.0639], [-0.0136, -0.0344, 0.0665, ..., -0.0336, -0.1100, 0.0689], [-0.0170, -0.0802, 0.0553, ..., 0.0465, -0.1191, -0.0386]], device='cuda:0'), grad: tensor([[ 1.2806e-06, 2.6263e-06, 4.1217e-05, ..., 4.7386e-06, 1.6997e-08, 7.3195e-05], [ 5.3085e-07, 5.6922e-06, 9.4473e-06, ..., 1.1161e-05, 1.0245e-08, 7.4469e-06], [-5.8450e-06, -2.5168e-05, 1.1511e-05, ..., -2.6956e-05, -3.3760e-07, 3.8415e-05], ..., [ 5.3719e-06, -8.2478e-06, 2.8029e-05, ..., 1.5333e-05, 1.9162e-07, 1.5140e-05], [ 4.2886e-05, 4.9084e-05, 7.9041e-03, ..., 3.2902e-04, 7.8697e-08, 1.2505e-02], [-9.0957e-05, -1.6121e-06, -5.7077e-04, ..., -4.2486e-04, 3.4925e-09, -2.3305e-04]], device='cuda:0') Epoch 58, bias, value: tensor([-0.0154, -0.0144, -0.0261, -0.0246, -0.0106, 0.0043, 0.0081, -0.0234, -0.0083, 0.0066], device='cuda:0'), grad: tensor([ 6.4433e-05, 9.7930e-05, -2.0325e-04, 6.9514e-06, -1.7977e-04, 1.7905e-04, -1.4740e-02, 1.0967e-04, 1.5236e-02, -5.7220e-04], device='cuda:0') 100 0.0001 changing lr epoch 57, time 220.81, cls_loss 0.0049 cls_loss_mapping 0.0150 cls_loss_causal 0.6467 re_mapping 0.0116 re_causal 0.0371 /// teacc 98.95 lr 0.00010000 Epoch 59, weight, value: tensor([[-0.0838, -0.0838, 0.0632, ..., -0.0271, 0.0356, 0.0186], [-0.0379, -0.0166, -0.0240, ..., -0.0377, -0.0672, -0.0405], [ 0.0396, -0.0322, -0.0136, ..., -0.0251, -0.0095, -0.1681], ..., [-0.0442, 0.0538, -0.0006, ..., 0.0624, -0.0278, -0.0647], [-0.0158, -0.0348, 0.0666, ..., -0.0338, -0.1103, 0.0686], [-0.0178, -0.0811, 0.0555, ..., 0.0463, -0.1192, -0.0396]], device='cuda:0'), grad: tensor([[-5.0012e-07, 6.9104e-07, -4.0047e-06, ..., 1.6000e-06, 3.9185e-07, 4.9740e-05], [-2.6021e-06, 6.6981e-06, 1.3918e-05, ..., 1.4246e-05, 2.7940e-09, -1.4722e-04], [ 1.1288e-06, 1.1377e-05, 1.2279e-05, ..., -1.8347e-07, 4.1910e-09, 2.5630e-05], ..., [ 6.7707e-07, -1.0744e-05, -3.2075e-06, ..., -7.8678e-06, 6.9849e-10, 1.2897e-05], [-2.7061e-05, 7.0445e-06, -9.9599e-05, ..., -8.6203e-06, 8.6147e-09, -2.0063e-04], [ 1.0487e-06, 1.0669e-05, 2.6897e-05, ..., 2.0638e-05, 9.3132e-10, 2.5347e-05]], device='cuda:0') Epoch 59, bias, value: tensor([-0.0155, -0.0147, -0.0256, -0.0248, -0.0101, 0.0049, 0.0085, -0.0238, -0.0089, 0.0063], device='cuda:0'), grad: tensor([ 7.3791e-05, -1.1742e-04, 4.5806e-05, 3.7719e-06, -2.3782e-04, 1.1724e-04, 5.0694e-05, 6.2764e-05, -1.0830e-04, 1.0967e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 58---------------------------------------------------- epoch 58, time 221.73, cls_loss 0.0053 cls_loss_mapping 0.0166 cls_loss_causal 0.6714 re_mapping 0.0116 re_causal 0.0375 /// teacc 99.04 lr 0.00010000 Epoch 60, weight, value: tensor([[-0.0848, -0.0845, 0.0634, ..., -0.0282, 0.0356, 0.0191], [-0.0379, -0.0171, -0.0245, ..., -0.0381, -0.0674, -0.0398], [ 0.0395, -0.0342, -0.0141, ..., -0.0254, -0.0095, -0.1694], ..., [-0.0457, 0.0555, -0.0003, ..., 0.0629, -0.0275, -0.0654], [-0.0163, -0.0351, 0.0671, ..., -0.0338, -0.1104, 0.0692], [-0.0173, -0.0821, 0.0559, ..., 0.0464, -0.1192, -0.0399]], device='cuda:0'), grad: tensor([[ 2.4810e-06, 1.0990e-06, -1.1295e-04, ..., 9.3412e-07, 0.0000e+00, -1.7166e-05], [ 6.0536e-06, 1.7732e-06, 7.7635e-06, ..., 2.5518e-06, 0.0000e+00, 3.2485e-06], [ 6.5472e-07, 2.5213e-05, 4.6551e-05, ..., 4.3400e-06, 0.0000e+00, 1.4655e-05], ..., [ 6.0797e-06, -9.8795e-06, 9.4026e-06, ..., -1.2688e-05, 0.0000e+00, 1.4648e-05], [ 1.2688e-05, 5.6550e-06, 7.6368e-06, ..., 7.6890e-06, 0.0000e+00, 1.7881e-05], [-6.6683e-06, 1.2808e-05, -1.1817e-05, ..., -2.2754e-05, 0.0000e+00, 1.3858e-05]], device='cuda:0') Epoch 60, bias, value: tensor([-0.0153, -0.0143, -0.0266, -0.0250, -0.0101, 0.0046, 0.0084, -0.0230, -0.0087, 0.0062], device='cuda:0'), grad: tensor([-1.9073e-04, -8.6248e-05, 1.9014e-04, -5.2840e-05, 6.4611e-05, -7.3254e-05, -1.1716e-06, 6.6340e-05, 6.0350e-05, 2.2903e-05], device='cuda:0') 100 0.0001 changing lr epoch 59, time 221.07, cls_loss 0.0050 cls_loss_mapping 0.0152 cls_loss_causal 0.6401 re_mapping 0.0115 re_causal 0.0357 /// teacc 98.70 lr 0.00010000 Epoch 61, weight, value: tensor([[-8.7586e-02, -8.5348e-02, 6.3669e-02, ..., -2.8630e-02, 3.5521e-02, 1.8728e-02], [-3.7998e-02, -1.7688e-02, -2.5157e-02, ..., -3.8419e-02, -6.7636e-02, -3.9765e-02], [ 3.9595e-02, -3.4645e-02, -1.4802e-02, ..., -2.5841e-02, -9.1996e-03, -1.7051e-01], ..., [-4.7898e-02, 5.6211e-02, -9.4824e-05, ..., 6.3397e-02, -2.7647e-02, -6.6495e-02], [-1.7915e-02, -3.5388e-02, 6.7867e-02, ..., -3.3921e-02, -1.1050e-01, 7.0252e-02], [-1.6809e-02, -8.3604e-02, 5.5888e-02, ..., 4.6394e-02, -1.1937e-01, -4.0386e-02]], device='cuda:0'), grad: tensor([[ 4.2804e-06, 7.5027e-06, 6.4075e-05, ..., 2.4103e-06, -1.3653e-06, 9.1970e-05], [ 3.5129e-06, 2.2724e-06, 1.1182e-04, ..., 4.1462e-06, 9.6159e-08, 9.9897e-05], [ 8.4788e-06, 2.5898e-05, 2.7871e-04, ..., 3.2112e-06, -1.2163e-06, 3.5381e-04], ..., [ 3.9004e-06, -2.0638e-05, 7.6294e-05, ..., -6.0722e-07, 8.5495e-07, 1.2088e-04], [-1.5885e-05, -3.9965e-05, -6.2704e-04, ..., 3.5949e-06, 4.1490e-07, -7.7486e-04], [ 6.9849e-06, 1.3098e-05, 4.3549e-06, ..., 2.2918e-05, 1.2829e-07, 2.0817e-05]], device='cuda:0') Epoch 61, bias, value: tensor([-0.0151, -0.0139, -0.0270, -0.0251, -0.0104, 0.0051, 0.0078, -0.0229, -0.0079, 0.0057], device='cuda:0'), grad: tensor([ 1.4710e-04, -1.8030e-05, 5.7602e-04, -8.5950e-05, -7.0274e-05, 1.9622e-04, 5.2750e-05, 2.5415e-04, -1.1501e-03, 9.7215e-05], device='cuda:0') 100 0.0001 changing lr epoch 60, time 221.05, cls_loss 0.0050 cls_loss_mapping 0.0139 cls_loss_causal 0.6380 re_mapping 0.0114 re_causal 0.0361 /// teacc 98.90 lr 0.00010000 Epoch 62, weight, value: tensor([[-8.9935e-02, -8.6741e-02, 6.3638e-02, ..., -2.9091e-02, 3.5424e-02, 1.8451e-02], [-3.8314e-02, -1.7649e-02, -2.5083e-02, ..., -3.7885e-02, -6.7814e-02, -3.9270e-02], [ 3.9670e-02, -3.5146e-02, -1.5428e-02, ..., -2.6617e-02, -8.9856e-03, -1.7238e-01], ..., [-4.8691e-02, 5.6915e-02, -1.2902e-04, ..., 6.3803e-02, -2.7468e-02, -6.6767e-02], [-1.8037e-02, -3.5685e-02, 6.8185e-02, ..., -3.4353e-02, -1.1063e-01, 7.1167e-02], [-1.5477e-02, -8.4803e-02, 5.6596e-02, ..., 4.6860e-02, -1.1947e-01, -4.0722e-02]], device='cuda:0'), grad: tensor([[ 1.5497e-05, 6.2473e-06, 1.5497e-04, ..., 6.3255e-06, 1.0765e-04, 4.2009e-04], [ 5.0664e-07, 3.4511e-05, 2.4676e-05, ..., 3.4511e-05, 7.1526e-07, -3.3528e-05], [ 3.2061e-07, 1.7226e-05, 1.2994e-05, ..., 1.3366e-05, -6.7949e-06, 6.2026e-06], ..., [ 8.8383e-07, -1.0099e-03, -1.0519e-03, ..., -1.0061e-03, 1.6466e-06, -7.7039e-06], [ 8.3894e-06, 1.6272e-05, 3.2097e-05, ..., 2.1622e-05, 8.3745e-06, 5.2243e-05], [ 5.7667e-06, 8.7309e-04, 9.2268e-04, ..., 9.9754e-04, 2.7735e-06, 4.3213e-05]], device='cuda:0') Epoch 62, bias, value: tensor([-0.0156, -0.0130, -0.0269, -0.0251, -0.0110, 0.0048, 0.0078, -0.0231, -0.0077, 0.0059], device='cuda:0'), grad: tensor([ 5.1689e-04, -6.9976e-05, 6.9058e-07, 1.4412e-04, -3.5620e-04, 1.1826e-04, -6.5422e-04, -2.2984e-03, 1.2827e-04, 2.4681e-03], device='cuda:0') 100 0.0001 changing lr epoch 61, time 220.74, cls_loss 0.0050 cls_loss_mapping 0.0175 cls_loss_causal 0.6167 re_mapping 0.0113 re_causal 0.0350 /// teacc 98.81 lr 0.00010000 Epoch 63, weight, value: tensor([[-0.0911, -0.0882, 0.0642, ..., -0.0294, 0.0358, 0.0187], [-0.0388, -0.0173, -0.0250, ..., -0.0378, -0.0683, -0.0389], [ 0.0396, -0.0360, -0.0157, ..., -0.0269, -0.0084, -0.1735], ..., [-0.0487, 0.0578, -0.0003, ..., 0.0639, -0.0268, -0.0672], [-0.0201, -0.0362, 0.0682, ..., -0.0346, -0.1119, 0.0715], [-0.0134, -0.0861, 0.0572, ..., 0.0472, -0.1200, -0.0402]], device='cuda:0'), grad: tensor([[ 3.4249e-07, 1.5190e-06, 1.9418e-07, ..., 7.9162e-07, 2.8359e-07, 2.5053e-06], [ 6.3796e-08, 8.6576e-06, 3.4738e-06, ..., 3.9898e-06, 7.1246e-08, -3.4384e-06], [ 1.1967e-07, 6.7130e-06, 1.8880e-05, ..., 6.3609e-07, 3.8883e-08, 2.2277e-05], ..., [ 4.8522e-07, -2.5436e-05, -6.9439e-06, ..., -1.3456e-05, 4.4238e-09, 3.6173e-06], [ 3.4040e-07, 1.9576e-06, -3.1948e-05, ..., 4.6147e-07, 2.3562e-07, -4.1008e-05], [-1.2922e-07, 8.4192e-06, 2.8294e-06, ..., 4.1048e-07, 2.3982e-08, 7.6182e-06]], device='cuda:0') Epoch 63, bias, value: tensor([-0.0154, -0.0130, -0.0267, -0.0252, -0.0111, 0.0043, 0.0079, -0.0232, -0.0077, 0.0064], device='cuda:0'), grad: tensor([ 3.9786e-06, 1.4156e-06, 3.8862e-05, 3.9488e-06, 3.1795e-06, -3.8929e-06, 2.8480e-06, -1.7554e-05, -4.9859e-05, 1.7032e-05], device='cuda:0') 100 0.0001 changing lr epoch 62, time 220.62, cls_loss 0.0035 cls_loss_mapping 0.0118 cls_loss_causal 0.6114 re_mapping 0.0110 re_causal 0.0341 /// teacc 98.94 lr 0.00010000 Epoch 64, weight, value: tensor([[-0.0920, -0.0888, 0.0641, ..., -0.0308, 0.0357, 0.0191], [-0.0395, -0.0178, -0.0255, ..., -0.0386, -0.0689, -0.0389], [ 0.0403, -0.0365, -0.0161, ..., -0.0271, -0.0082, -0.1743], ..., [-0.0491, 0.0585, -0.0003, ..., 0.0644, -0.0270, -0.0677], [-0.0213, -0.0367, 0.0684, ..., -0.0347, -0.1121, 0.0718], [-0.0142, -0.0870, 0.0577, ..., 0.0471, -0.1202, -0.0407]], device='cuda:0'), grad: tensor([[ 9.9614e-06, 9.5963e-06, -4.3452e-05, ..., 9.2909e-06, -7.4739e-07, -3.3885e-05], [ 4.6581e-05, 6.9812e-06, 6.6049e-06, ..., 1.1690e-05, 8.7917e-06, 2.6878e-06], [-5.1111e-05, 1.7792e-05, 1.9968e-05, ..., 5.5492e-05, 2.2799e-05, 1.1109e-05], ..., [-2.7232e-06, -5.7817e-05, -7.0557e-06, ..., -2.2173e-04, 7.8753e-06, -1.0513e-05], [ 1.5736e-05, 2.2426e-05, 2.2545e-05, ..., 6.7204e-06, 1.5087e-06, 1.1288e-05], [ 7.4804e-05, 1.6272e-05, 3.7700e-05, ..., -3.9250e-05, 8.2329e-07, 9.6500e-05]], device='cuda:0') Epoch 64, bias, value: tensor([-0.0157, -0.0132, -0.0268, -0.0252, -0.0099, 0.0046, 0.0076, -0.0231, -0.0079, 0.0059], device='cuda:0'), grad: tensor([-9.4414e-05, 5.0545e-04, -2.8753e-04, 4.5300e-05, 1.7357e-04, -1.8716e-04, 5.1796e-05, -4.3631e-04, 7.7307e-05, 1.5175e-04], device='cuda:0') 100 0.0001 changing lr epoch 63, time 221.45, cls_loss 0.0045 cls_loss_mapping 0.0141 cls_loss_causal 0.6705 re_mapping 0.0113 re_causal 0.0358 /// teacc 98.92 lr 0.00010000 Epoch 65, weight, value: tensor([[-0.0931, -0.0889, 0.0644, ..., -0.0311, 0.0357, 0.0196], [-0.0399, -0.0187, -0.0261, ..., -0.0393, -0.0693, -0.0387], [ 0.0401, -0.0367, -0.0165, ..., -0.0274, -0.0079, -0.1755], ..., [-0.0498, 0.0582, -0.0005, ..., 0.0646, -0.0266, -0.0690], [-0.0223, -0.0371, 0.0692, ..., -0.0341, -0.1128, 0.0726], [-0.0140, -0.0877, 0.0581, ..., 0.0474, -0.1207, -0.0415]], device='cuda:0'), grad: tensor([[ 1.1288e-06, -2.1867e-06, -6.5804e-05, ..., 8.2701e-07, 1.4983e-07, -5.1796e-05], [ 7.2364e-07, 1.4678e-06, 4.3251e-06, ..., 2.2091e-06, 8.2422e-08, -8.7768e-06], [ 3.3062e-07, 2.3749e-06, 1.3240e-05, ..., 1.2387e-06, -1.1250e-05, 1.4104e-05], ..., [ 2.7437e-06, 2.8331e-06, 2.4378e-05, ..., 1.5929e-05, 3.1409e-07, 1.8209e-05], [ 7.1004e-06, 2.2501e-05, 4.2409e-05, ..., 9.0227e-06, 9.7901e-06, 2.6941e-05], [-1.2711e-05, 2.6420e-05, -5.5403e-05, ..., -4.9025e-05, 6.5193e-09, -1.6555e-05]], device='cuda:0') Epoch 65, bias, value: tensor([-0.0155, -0.0140, -0.0261, -0.0248, -0.0098, 0.0046, 0.0072, -0.0236, -0.0076, 0.0059], device='cuda:0'), grad: tensor([-1.0228e-04, -1.3299e-06, -7.1287e-05, -7.7128e-05, 1.5363e-05, 3.8669e-06, 1.2077e-05, 7.1824e-05, 1.7154e-04, -2.2575e-05], device='cuda:0') 100 0.0001 changing lr epoch 64, time 220.71, cls_loss 0.0052 cls_loss_mapping 0.0170 cls_loss_causal 0.6611 re_mapping 0.0108 re_causal 0.0334 /// teacc 98.82 lr 0.00010000 Epoch 66, weight, value: tensor([[-9.5910e-02, -8.9588e-02, 6.4716e-02, ..., -3.1470e-02, 3.5619e-02, 1.9589e-02], [-4.0356e-02, -1.8216e-02, -2.6165e-02, ..., -4.0121e-02, -6.9409e-02, -3.8593e-02], [ 3.9682e-02, -3.8073e-02, -1.7227e-02, ..., -2.8051e-02, -7.5368e-03, -1.7700e-01], ..., [-5.1391e-02, 5.9070e-02, -1.7051e-04, ..., 6.5523e-02, -2.6714e-02, -6.9117e-02], [-2.0215e-02, -3.7696e-02, 6.9631e-02, ..., -3.4256e-02, -1.1316e-01, 7.4453e-02], [-1.4236e-02, -8.9063e-02, 5.8154e-02, ..., 4.7135e-02, -1.2092e-01, -4.2342e-02]], device='cuda:0'), grad: tensor([[ 3.0547e-06, 3.5763e-06, -9.6336e-06, ..., 3.5707e-06, 3.8766e-07, -3.9414e-06], [ 5.4911e-06, 4.1649e-06, 2.3320e-06, ..., 3.2745e-06, -5.2527e-07, -6.2771e-06], [-7.6175e-05, 7.6182e-06, 4.7088e-06, ..., 2.1923e-06, -1.2922e-08, 2.9299e-06], ..., [ 8.0690e-06, -2.3261e-05, -1.2346e-05, ..., -1.5885e-05, 3.5274e-07, 3.5539e-06], [ 3.1024e-05, 9.6709e-06, -6.4857e-06, ..., 3.6396e-06, 6.8638e-07, -9.5069e-06], [ 2.4401e-06, 3.6597e-05, 3.1382e-05, ..., 2.9945e-04, 1.2200e-07, 9.7603e-06]], device='cuda:0') Epoch 66, bias, value: tensor([-0.0157, -0.0138, -0.0266, -0.0249, -0.0094, 0.0043, 0.0070, -0.0229, -0.0068, 0.0050], device='cuda:0'), grad: tensor([ 1.5691e-05, 3.3647e-05, -6.1035e-04, -1.0931e-04, -5.5122e-04, 2.1446e-04, 6.3121e-05, 8.3566e-05, 2.6560e-04, 5.9366e-04], device='cuda:0') 100 0.0001 changing lr epoch 65, time 220.94, cls_loss 0.0043 cls_loss_mapping 0.0141 cls_loss_causal 0.6327 re_mapping 0.0108 re_causal 0.0334 /// teacc 98.86 lr 0.00010000 Epoch 67, weight, value: tensor([[-0.0966, -0.0902, 0.0650, ..., -0.0322, 0.0358, 0.0194], [-0.0405, -0.0189, -0.0266, ..., -0.0406, -0.0700, -0.0385], [ 0.0396, -0.0382, -0.0176, ..., -0.0281, -0.0068, -0.1782], ..., [-0.0520, 0.0601, 0.0002, ..., 0.0662, -0.0267, -0.0695], [-0.0206, -0.0385, 0.0702, ..., -0.0340, -0.1151, 0.0751], [-0.0144, -0.0909, 0.0583, ..., 0.0473, -0.1216, -0.0436]], device='cuda:0'), grad: tensor([[ 1.6214e-06, 9.0674e-06, 2.1085e-06, ..., 1.3962e-05, 1.1467e-07, 1.3992e-05], [ 3.0436e-06, 1.9348e-04, 6.0201e-05, ..., 2.5797e-04, 7.6182e-07, -4.0698e-04], [ 5.6997e-07, 4.9919e-05, 1.6585e-05, ..., 1.5900e-05, -2.5537e-06, 1.9521e-05], ..., [ 2.2314e-06, -3.5954e-04, -1.4818e-04, ..., -4.5300e-04, 4.4424e-07, -2.6727e-04], [ 7.9572e-06, 1.0258e-04, 8.2195e-05, ..., 1.4198e-04, 7.3947e-07, 4.2200e-04], [-5.2415e-06, 4.5002e-05, -1.7476e-04, ..., 1.5945e-03, 2.3283e-08, 5.4884e-04]], device='cuda:0') Epoch 67, bias, value: tensor([-0.0158, -0.0139, -0.0265, -0.0249, -0.0097, 0.0043, 0.0075, -0.0227, -0.0065, 0.0045], device='cuda:0'), grad: tensor([ 3.9697e-05, -2.0936e-05, 1.4150e-04, -2.2328e-04, -4.0092e-03, 1.4412e-04, 1.1539e-04, -9.1124e-04, 7.5865e-04, 3.9635e-03], device='cuda:0') 100 0.0001 changing lr epoch 66, time 220.89, cls_loss 0.0036 cls_loss_mapping 0.0115 cls_loss_causal 0.6132 re_mapping 0.0105 re_causal 0.0332 /// teacc 98.82 lr 0.00010000 Epoch 68, weight, value: tensor([[-0.0973, -0.0910, 0.0652, ..., -0.0326, 0.0356, 0.0193], [-0.0416, -0.0193, -0.0271, ..., -0.0413, -0.0700, -0.0387], [ 0.0395, -0.0389, -0.0180, ..., -0.0285, -0.0057, -0.1788], ..., [-0.0528, 0.0606, -0.0002, ..., 0.0661, -0.0270, -0.0699], [-0.0213, -0.0392, 0.0701, ..., -0.0348, -0.1161, 0.0754], [-0.0131, -0.0914, 0.0597, ..., 0.0483, -0.1223, -0.0436]], device='cuda:0'), grad: tensor([[ 1.4054e-06, 1.2666e-05, -1.0245e-05, ..., 4.8392e-06, -1.3718e-06, -1.1474e-05], [ 5.8208e-07, 9.3281e-06, 5.0403e-06, ..., 2.3210e-04, 4.3074e-08, -5.4762e-06], [-7.4580e-06, 6.6662e-04, 6.8471e-06, ..., 3.0339e-05, 2.5658e-07, 1.4514e-05], ..., [ 2.2277e-06, -1.2562e-05, -1.6555e-05, ..., -1.7017e-05, 1.5192e-07, 4.7833e-06], [ 2.6226e-06, 3.7223e-05, -1.2740e-05, ..., 1.2107e-05, 9.0455e-08, -1.8924e-05], [-3.1367e-06, 2.9102e-05, 1.8124e-06, ..., 3.7104e-05, 2.0128e-07, 8.2254e-06]], device='cuda:0') Epoch 68, bias, value: tensor([-0.0161, -0.0140, -0.0266, -0.0247, -0.0098, 0.0045, 0.0075, -0.0230, -0.0070, 0.0053], device='cuda:0'), grad: tensor([ 2.7135e-05, 1.0939e-03, 1.6813e-03, -2.0046e-03, -1.3571e-03, 7.3731e-05, 3.6329e-05, 1.1641e-04, 1.6308e-04, 1.6844e-04], device='cuda:0') 100 0.0001 changing lr epoch 67, time 220.91, cls_loss 0.0052 cls_loss_mapping 0.0138 cls_loss_causal 0.6095 re_mapping 0.0107 re_causal 0.0315 /// teacc 98.89 lr 0.00010000 Epoch 69, weight, value: tensor([[-9.9328e-02, -9.2960e-02, 6.5479e-02, ..., -3.3219e-02, 3.5898e-02, 1.9430e-02], [-4.2223e-02, -1.9435e-02, -2.7322e-02, ..., -4.2476e-02, -7.0213e-02, -3.7225e-02], [ 3.8645e-02, -3.9803e-02, -1.8525e-02, ..., -2.8716e-02, -5.1369e-03, -1.7998e-01], ..., [-5.4270e-02, 6.1442e-02, 7.7591e-05, ..., 6.6688e-02, -2.7054e-02, -7.0322e-02], [-2.2508e-02, -3.9859e-02, 6.9951e-02, ..., -3.5444e-02, -1.1677e-01, 7.5236e-02], [-1.1081e-02, -9.2710e-02, 6.0678e-02, ..., 4.8831e-02, -1.2263e-01, -4.2861e-02]], device='cuda:0'), grad: tensor([[ 1.4659e-06, 7.4245e-06, -1.0222e-04, ..., 6.0163e-06, -2.5183e-06, -4.1306e-05], [ 4.2003e-07, 1.2025e-05, 1.6406e-05, ..., 1.8757e-06, -8.2329e-06, -2.2724e-06], [ 8.0233e-07, 2.1607e-05, 2.3246e-05, ..., 7.9945e-06, 1.7844e-06, 1.2398e-05], ..., [ 1.6708e-06, -8.1360e-05, -3.8087e-05, ..., -9.9182e-05, 3.5577e-06, 9.6187e-06], [ 6.3470e-07, 7.2047e-06, -9.1121e-06, ..., 9.1502e-08, 6.6916e-07, -1.4260e-05], [ 1.2539e-05, 7.4625e-05, 5.9277e-05, ..., 4.8429e-05, 1.4780e-06, 2.9534e-05]], device='cuda:0') Epoch 69, bias, value: tensor([-0.0162, -0.0134, -0.0271, -0.0241, -0.0100, 0.0040, 0.0072, -0.0230, -0.0078, 0.0062], device='cuda:0'), grad: tensor([-1.9825e-04, -1.3798e-05, 7.6771e-05, -7.8380e-05, 3.5167e-05, 1.5199e-05, 2.6479e-05, -9.0897e-05, 2.3872e-05, 2.0397e-04], device='cuda:0') 100 0.0001 changing lr epoch 68, time 221.14, cls_loss 0.0041 cls_loss_mapping 0.0124 cls_loss_causal 0.6249 re_mapping 0.0107 re_causal 0.0326 /// teacc 98.93 lr 0.00010000 Epoch 70, weight, value: tensor([[-1.0028e-01, -9.4065e-02, 6.5735e-02, ..., -3.4036e-02, 3.5948e-02, 1.9471e-02], [-4.3422e-02, -1.9520e-02, -2.7693e-02, ..., -4.3268e-02, -7.0333e-02, -3.7810e-02], [ 3.8321e-02, -4.0124e-02, -1.8931e-02, ..., -2.8981e-02, -4.3879e-03, -1.8022e-01], ..., [-5.5846e-02, 6.1808e-02, -2.8262e-05, ..., 6.7153e-02, -2.7870e-02, -7.1521e-02], [-2.5063e-02, -4.0118e-02, 7.0309e-02, ..., -3.5769e-02, -1.1718e-01, 7.5464e-02], [-1.0742e-02, -9.3598e-02, 6.1329e-02, ..., 4.9233e-02, -1.2344e-01, -4.3237e-02]], device='cuda:0'), grad: tensor([[ 9.6634e-06, 7.7533e-08, 1.5721e-05, ..., 1.4164e-05, 9.7556e-08, 1.2398e-05], [ 2.0526e-06, 3.9837e-07, 1.7062e-05, ..., 1.5780e-05, -1.3085e-06, 3.6228e-06], [ 8.8364e-06, 7.2550e-07, -6.6519e-05, ..., -9.9897e-05, 3.7206e-07, 1.9699e-05], ..., [ 3.5353e-06, -1.7118e-06, 4.2021e-05, ..., 4.0144e-05, 1.8510e-07, 1.3083e-05], [ 3.1125e-06, 8.7777e-08, 5.0575e-05, ..., 7.3612e-05, 1.8766e-07, -4.8503e-06], [-4.8280e-05, 9.4669e-07, -1.5652e-04, ..., -1.0288e-04, 3.8417e-08, -1.0014e-04]], device='cuda:0') Epoch 70, bias, value: tensor([-0.0163, -0.0143, -0.0261, -0.0243, -0.0101, 0.0049, 0.0071, -0.0233, -0.0081, 0.0062], device='cuda:0'), grad: tensor([ 1.4114e-04, 2.3353e-04, -1.7042e-03, 2.2817e-04, 1.6201e-04, 4.4554e-05, 7.4744e-05, 4.4107e-04, 7.7963e-04, -3.9983e-04], device='cuda:0') 100 0.0001 changing lr epoch 69, time 221.15, cls_loss 0.0042 cls_loss_mapping 0.0139 cls_loss_causal 0.6302 re_mapping 0.0103 re_causal 0.0303 /// teacc 99.03 lr 0.00010000 Epoch 71, weight, value: tensor([[-1.0128e-01, -9.5025e-02, 6.6271e-02, ..., -3.4453e-02, 3.5453e-02, 1.9490e-02], [-4.4875e-02, -1.9944e-02, -2.8387e-02, ..., -4.3862e-02, -7.0604e-02, -3.8275e-02], [ 3.9023e-02, -4.0689e-02, -1.9194e-02, ..., -2.9701e-02, -4.1942e-03, -1.8142e-01], ..., [-5.8773e-02, 6.2668e-02, 1.5886e-04, ..., 6.7803e-02, -2.7503e-02, -7.1914e-02], [-2.5152e-02, -4.0960e-02, 7.0609e-02, ..., -3.6206e-02, -1.1760e-01, 7.6072e-02], [-1.1733e-02, -9.4554e-02, 6.1799e-02, ..., 4.9529e-02, -1.2386e-01, -4.4147e-02]], device='cuda:0'), grad: tensor([[-3.9667e-05, 4.1761e-06, -2.1553e-04, ..., -9.2527e-07, 0.0000e+00, -1.1730e-04], [ 1.1874e-06, 7.5363e-06, 9.7752e-06, ..., 5.8077e-06, 0.0000e+00, -1.0312e-05], [ 8.1509e-06, 5.5432e-06, 3.7372e-05, ..., 7.1637e-06, 0.0000e+00, 3.2157e-05], ..., [ 5.1782e-06, -1.0139e-04, -8.8155e-05, ..., -5.9903e-05, 0.0000e+00, 6.8098e-06], [-1.6034e-05, 2.6226e-05, -4.5270e-05, ..., -5.3048e-06, 0.0000e+00, -7.4148e-05], [ 3.0905e-05, 2.3305e-05, 2.0158e-04, ..., 1.3709e-06, 0.0000e+00, 1.5104e-04]], device='cuda:0') Epoch 71, bias, value: tensor([-0.0158, -0.0146, -0.0259, -0.0252, -0.0101, 0.0058, 0.0073, -0.0232, -0.0081, 0.0058], device='cuda:0'), grad: tensor([-3.5024e-04, -4.1485e-05, 9.1970e-05, 5.7399e-05, 3.3557e-05, 7.9155e-05, -5.0098e-05, -1.3900e-04, -2.5943e-05, 3.4428e-04], device='cuda:0') 100 0.0001 changing lr epoch 70, time 221.02, cls_loss 0.0040 cls_loss_mapping 0.0122 cls_loss_causal 0.6702 re_mapping 0.0098 re_causal 0.0307 /// teacc 98.85 lr 0.00010000 Epoch 72, weight, value: tensor([[-1.0383e-01, -9.6180e-02, 6.5927e-02, ..., -3.6494e-02, 3.5432e-02, 1.9657e-02], [-4.5761e-02, -1.9981e-02, -2.8816e-02, ..., -4.4818e-02, -7.0526e-02, -3.8264e-02], [ 3.9723e-02, -4.1471e-02, -1.9756e-02, ..., -3.0688e-02, -4.1340e-03, -1.8225e-01], ..., [-6.2494e-02, 6.3408e-02, -7.6233e-05, ..., 6.8104e-02, -2.7516e-02, -7.2993e-02], [-2.5614e-02, -4.1368e-02, 7.1568e-02, ..., -3.5893e-02, -1.1780e-01, 7.6823e-02], [-1.1123e-02, -9.5848e-02, 6.2688e-02, ..., 5.0189e-02, -1.2414e-01, -4.4614e-02]], device='cuda:0'), grad: tensor([[ 9.2201e-07, 8.3074e-06, -7.1347e-05, ..., 1.1548e-06, -4.1956e-07, -4.2945e-05], [ 1.0841e-06, 1.7822e-05, 8.2627e-06, ..., 4.9025e-05, 1.6834e-07, 5.4613e-06], [ 1.8198e-06, 3.8087e-05, 1.8492e-05, ..., 3.3639e-06, -1.1781e-07, 1.5005e-05], ..., [ 2.1011e-06, 3.0965e-05, 1.5914e-05, ..., 4.3958e-06, 1.9884e-07, 6.2399e-06], [-2.2948e-05, 8.3387e-05, -8.0690e-06, ..., 2.7679e-06, 2.8871e-08, -1.7154e-04], [-2.0877e-05, 3.9607e-05, 1.0163e-05, ..., -5.3287e-05, 1.0314e-07, 2.7716e-05]], device='cuda:0') Epoch 72, bias, value: tensor([-0.0165, -0.0139, -0.0266, -0.0252, -0.0098, 0.0054, 0.0076, -0.0235, -0.0077, 0.0060], device='cuda:0'), grad: tensor([-1.1939e-04, 3.4308e-04, 7.6294e-05, -4.5586e-04, -2.9516e-04, 2.3723e-04, 4.7684e-05, 1.1444e-04, -2.1517e-05, 7.2956e-05], device='cuda:0') 100 0.0001 changing lr epoch 71, time 221.38, cls_loss 0.0029 cls_loss_mapping 0.0105 cls_loss_causal 0.6300 re_mapping 0.0102 re_causal 0.0316 /// teacc 98.96 lr 0.00010000 Epoch 73, weight, value: tensor([[-1.0472e-01, -9.7094e-02, 6.6779e-02, ..., -3.6808e-02, 3.5866e-02, 2.0704e-02], [-4.6260e-02, -1.9846e-02, -2.9030e-02, ..., -4.5041e-02, -7.0563e-02, -3.8531e-02], [ 3.9378e-02, -4.2009e-02, -2.0163e-02, ..., -3.0945e-02, -3.9697e-03, -1.8269e-01], ..., [-6.5326e-02, 6.3799e-02, 7.3742e-05, ..., 6.8669e-02, -2.7593e-02, -7.3399e-02], [-2.5925e-02, -4.1601e-02, 7.1880e-02, ..., -3.6257e-02, -1.1793e-01, 7.7332e-02], [-1.1195e-02, -9.6822e-02, 6.2974e-02, ..., 5.0153e-02, -1.2440e-01, -4.4980e-02]], device='cuda:0'), grad: tensor([[ 5.5879e-08, 3.8147e-06, 9.4948e-07, ..., 9.6038e-06, 4.5868e-08, 4.9211e-06], [ 5.3411e-07, 9.9242e-05, 4.6790e-05, ..., 8.5711e-05, 1.3597e-07, 6.7353e-06], [ 4.3074e-08, 8.4221e-05, 5.1677e-05, ..., 2.9095e-06, -4.2794e-07, 9.9316e-06], ..., [ 3.5367e-07, -2.2817e-04, -1.3697e-04, ..., -1.2994e-04, 7.9162e-08, 3.1348e-06], [ 7.7765e-08, 2.4110e-05, 1.8850e-05, ..., 2.2829e-05, 3.5623e-08, -1.1146e-05], [-1.1623e-06, 1.1042e-05, -5.0757e-07, ..., 1.4216e-05, 1.0943e-08, 1.6475e-06]], device='cuda:0') Epoch 73, bias, value: tensor([-0.0158, -0.0137, -0.0265, -0.0254, -0.0096, 0.0055, 0.0076, -0.0236, -0.0077, 0.0056], device='cuda:0'), grad: tensor([ 5.1349e-05, 1.9932e-04, -1.0353e-04, 9.8422e-06, 4.2510e-04, 1.0258e-04, -6.3467e-04, -1.4627e-04, 4.9919e-05, 4.5598e-05], device='cuda:0') 100 0.0001 changing lr epoch 72, time 220.79, cls_loss 0.0034 cls_loss_mapping 0.0105 cls_loss_causal 0.6402 re_mapping 0.0099 re_causal 0.0318 /// teacc 98.96 lr 0.00010000 Epoch 74, weight, value: tensor([[-1.0622e-01, -9.8084e-02, 6.7103e-02, ..., -3.7010e-02, 3.5898e-02, 2.0848e-02], [-4.6850e-02, -2.0325e-02, -2.9660e-02, ..., -4.5662e-02, -7.0578e-02, -3.8688e-02], [ 3.9122e-02, -4.2322e-02, -2.0700e-02, ..., -3.1431e-02, -3.8751e-03, -1.8379e-01], ..., [-6.5980e-02, 6.4523e-02, 7.6562e-05, ..., 6.9139e-02, -2.7647e-02, -7.3702e-02], [-2.5934e-02, -4.2124e-02, 7.2241e-02, ..., -3.6740e-02, -1.1800e-01, 7.8442e-02], [-1.0924e-02, -9.7798e-02, 6.3615e-02, ..., 5.0389e-02, -1.2446e-01, -4.5077e-02]], device='cuda:0'), grad: tensor([[ 8.1491e-07, 8.2701e-07, -1.5783e-04, ..., -1.2696e-05, -4.8429e-05, -1.0395e-04], [ 3.1125e-06, 6.3479e-06, 4.0419e-06, ..., 7.9125e-06, 9.8161e-07, 3.0212e-06], [ 1.7155e-06, 2.2545e-05, 9.9838e-05, ..., 3.0667e-05, 2.3216e-05, 7.4863e-05], ..., [ 7.8371e-07, -4.6939e-05, 9.9763e-06, ..., -3.7044e-05, 1.0990e-05, 1.5289e-05], [ 2.9262e-06, 2.6915e-06, 5.1707e-06, ..., 1.0645e-06, 9.1270e-07, 8.4341e-06], [ 4.3884e-06, 8.9854e-06, 1.1161e-05, ..., 1.0498e-05, 3.7495e-06, 9.6858e-06]], device='cuda:0') Epoch 74, bias, value: tensor([-0.0154, -0.0140, -0.0266, -0.0257, -0.0094, 0.0054, 0.0070, -0.0235, -0.0070, 0.0055], device='cuda:0'), grad: tensor([-3.8552e-04, 7.0572e-05, 3.4356e-04, 4.8071e-05, -5.1558e-05, -6.3926e-06, -1.1837e-04, -5.2191e-06, 3.0965e-05, 7.3433e-05], device='cuda:0') 100 0.0001 changing lr epoch 73, time 221.18, cls_loss 0.0044 cls_loss_mapping 0.0136 cls_loss_causal 0.5990 re_mapping 0.0093 re_causal 0.0275 /// teacc 98.93 lr 0.00010000 Epoch 75, weight, value: tensor([[-0.1078, -0.0998, 0.0673, ..., -0.0377, 0.0358, 0.0214], [-0.0468, -0.0204, -0.0301, ..., -0.0460, -0.0706, -0.0382], [ 0.0393, -0.0429, -0.0208, ..., -0.0320, -0.0035, -0.1850], ..., [-0.0673, 0.0662, 0.0021, ..., 0.0711, -0.0280, -0.0727], [-0.0272, -0.0434, 0.0721, ..., -0.0375, -0.1182, 0.0786], [-0.0099, -0.1006, 0.0633, ..., 0.0499, -0.1247, -0.0460]], device='cuda:0'), grad: tensor([[ 7.0967e-07, 1.2731e-06, -1.2591e-05, ..., 1.9465e-06, 1.0710e-08, -1.3702e-05], [ 1.4789e-06, 8.8215e-06, 9.3365e-07, ..., 5.6326e-06, 1.7276e-07, 3.6359e-06], [ 6.0536e-07, 1.3039e-05, 1.1511e-05, ..., 7.0781e-06, 1.5530e-07, 1.5438e-05], ..., [ 7.0296e-06, -2.0951e-05, 5.7012e-05, ..., 1.9029e-05, -8.7777e-07, 8.5756e-06], [ 1.7826e-06, 6.4448e-07, -1.7896e-05, ..., -4.1015e-06, 2.0256e-08, -1.7926e-05], [ 7.0743e-06, -3.0547e-05, -3.0637e-05, ..., 9.0837e-05, 1.3039e-08, 1.0520e-05]], device='cuda:0') Epoch 75, bias, value: tensor([-0.0158, -0.0137, -0.0263, -0.0247, -0.0099, 0.0040, 0.0076, -0.0221, -0.0077, 0.0046], device='cuda:0'), grad: tensor([-1.9521e-05, -3.2485e-06, 4.9204e-05, 4.3303e-05, -2.5463e-04, 3.3945e-05, -7.7307e-05, 4.6074e-05, -1.0580e-06, 1.8299e-04], device='cuda:0') 100 0.0001 changing lr epoch 74, time 220.59, cls_loss 0.0033 cls_loss_mapping 0.0101 cls_loss_causal 0.6326 re_mapping 0.0092 re_causal 0.0293 /// teacc 98.90 lr 0.00010000 Epoch 76, weight, value: tensor([[-0.1085, -0.1008, 0.0676, ..., -0.0383, 0.0359, 0.0224], [-0.0479, -0.0204, -0.0307, ..., -0.0469, -0.0703, -0.0379], [ 0.0403, -0.0425, -0.0214, ..., -0.0323, -0.0036, -0.1860], ..., [-0.0693, 0.0664, 0.0020, ..., 0.0716, -0.0282, -0.0735], [-0.0285, -0.0440, 0.0722, ..., -0.0380, -0.1184, 0.0786], [-0.0101, -0.1014, 0.0641, ..., 0.0501, -0.1250, -0.0460]], device='cuda:0'), grad: tensor([[-8.6799e-06, 9.8627e-07, -1.7092e-05, ..., 3.2037e-06, 6.6403e-07, -7.4446e-05], [ 8.3586e-07, -1.8165e-05, 1.4165e-06, ..., -1.4706e-06, -1.2070e-05, -2.9318e-06], [ 1.1036e-06, 5.0887e-06, 4.5411e-06, ..., 5.0776e-06, -2.0256e-07, 7.2867e-06], ..., [ 3.6340e-06, -2.0862e-07, -9.2248e-07, ..., 4.6864e-06, 6.3367e-06, 5.9046e-06], [ 7.5437e-06, 1.4715e-06, -9.9540e-06, ..., 1.1377e-05, 2.1197e-06, -4.0978e-06], [-3.3919e-06, 3.6247e-06, -1.3530e-05, ..., -9.7007e-06, 1.8161e-07, 8.3596e-06]], device='cuda:0') Epoch 76, bias, value: tensor([-0.0151, -0.0135, -0.0259, -0.0249, -0.0095, 0.0043, 0.0072, -0.0226, -0.0081, 0.0045], device='cuda:0'), grad: tensor([-8.6725e-05, -1.6594e-04, 2.5973e-05, 2.6539e-05, -9.2626e-05, 8.2493e-05, 3.9995e-05, 1.2612e-04, 4.2826e-05, 1.4910e-06], device='cuda:0') 100 0.0001 changing lr epoch 75, time 220.28, cls_loss 0.0050 cls_loss_mapping 0.0130 cls_loss_causal 0.6295 re_mapping 0.0091 re_causal 0.0292 /// teacc 98.90 lr 0.00010000 Epoch 77, weight, value: tensor([[-0.1096, -0.1016, 0.0675, ..., -0.0389, 0.0359, 0.0207], [-0.0496, -0.0199, -0.0293, ..., -0.0449, -0.0701, -0.0381], [ 0.0401, -0.0447, -0.0237, ..., -0.0329, -0.0036, -0.1892], ..., [-0.0706, 0.0669, 0.0015, ..., 0.0710, -0.0280, -0.0729], [-0.0316, -0.0421, 0.0733, ..., -0.0385, -0.1186, 0.0775], [-0.0090, -0.1022, 0.0648, ..., 0.0503, -0.1251, -0.0461]], device='cuda:0'), grad: tensor([[ 4.4182e-06, 3.7029e-06, 8.2478e-06, ..., 7.6666e-06, 5.0757e-08, -1.9111e-06], [ 1.0014e-05, 4.2580e-06, 2.6941e-05, ..., 1.4380e-05, 4.7497e-08, 2.7958e-06], [ 8.9360e-07, 3.0398e-06, 3.6862e-06, ..., 1.2899e-06, -1.7053e-06, 4.4517e-06], ..., [ 4.1842e-05, 2.7001e-05, 1.4150e-04, ..., 7.5519e-05, 9.0711e-07, 5.6848e-06], [-2.2769e-05, 2.4159e-06, 7.2941e-06, ..., 1.2539e-05, 4.5355e-07, -1.0157e-04], [-8.5831e-05, -6.8247e-05, -3.2806e-04, ..., -1.8466e-04, 4.8894e-09, 6.0536e-06]], device='cuda:0') Epoch 77, bias, value: tensor([-0.0164, -0.0131, -0.0273, -0.0258, -0.0093, 0.0058, 0.0086, -0.0227, -0.0082, 0.0045], device='cuda:0'), grad: tensor([ 2.4438e-05, 2.2888e-05, 3.5968e-06, 1.2350e-04, 3.1292e-05, 1.7726e-04, 3.6448e-05, 3.1805e-04, -8.8692e-05, -6.4898e-04], device='cuda:0') 100 0.0001 changing lr epoch 76, time 220.71, cls_loss 0.0032 cls_loss_mapping 0.0093 cls_loss_causal 0.5902 re_mapping 0.0098 re_causal 0.0301 /// teacc 98.88 lr 0.00010000 Epoch 78, weight, value: tensor([[-0.1101, -0.1025, 0.0678, ..., -0.0393, 0.0359, 0.0209], [-0.0504, -0.0203, -0.0302, ..., -0.0458, -0.0703, -0.0385], [ 0.0400, -0.0452, -0.0241, ..., -0.0333, -0.0030, -0.1895], ..., [-0.0708, 0.0671, 0.0018, ..., 0.0717, -0.0282, -0.0731], [-0.0319, -0.0426, 0.0739, ..., -0.0386, -0.1188, 0.0782], [-0.0082, -0.1031, 0.0653, ..., 0.0506, -0.1255, -0.0462]], device='cuda:0'), grad: tensor([[ 2.7404e-07, 1.5542e-05, 2.7299e-05, ..., 2.3078e-06, 1.1824e-05, 3.3796e-05], [ 4.6240e-07, 6.9499e-05, 3.1710e-05, ..., 3.4243e-05, 2.6412e-06, 6.3330e-06], [ 5.9605e-08, -5.7876e-05, 4.9710e-05, ..., 4.5329e-05, -1.5199e-04, 1.8016e-05], ..., [ 9.3225e-07, -2.6345e-04, -1.0407e-04, ..., -1.3864e-04, 1.8045e-05, 4.5523e-06], [ 3.2000e-06, 2.8908e-05, -1.1230e-04, ..., 4.5598e-06, 2.1726e-05, -1.4293e-04], [-5.8487e-06, 1.3582e-05, -1.7196e-05, ..., -1.0751e-05, 9.7416e-07, 8.6948e-06]], device='cuda:0') Epoch 78, bias, value: tensor([-0.0162, -0.0137, -0.0267, -0.0253, -0.0092, 0.0050, 0.0086, -0.0227, -0.0081, 0.0046], device='cuda:0'), grad: tensor([ 1.1289e-04, 1.2720e-04, -3.6693e-04, 5.8460e-04, 5.5879e-05, 8.1956e-06, 1.9133e-05, -3.6001e-04, -1.7643e-04, -4.9211e-06], device='cuda:0') 100 0.0001 changing lr epoch 77, time 220.51, cls_loss 0.0033 cls_loss_mapping 0.0126 cls_loss_causal 0.6020 re_mapping 0.0096 re_causal 0.0293 /// teacc 99.01 lr 0.00010000 Epoch 79, weight, value: tensor([[-0.1123, -0.1038, 0.0674, ..., -0.0405, 0.0361, 0.0211], [-0.0514, -0.0207, -0.0307, ..., -0.0463, -0.0712, -0.0389], [ 0.0400, -0.0453, -0.0245, ..., -0.0336, -0.0024, -0.1901], ..., [-0.0723, 0.0681, 0.0023, ..., 0.0724, -0.0277, -0.0734], [-0.0331, -0.0432, 0.0741, ..., -0.0389, -0.1194, 0.0784], [-0.0074, -0.1048, 0.0661, ..., 0.0507, -0.1259, -0.0464]], device='cuda:0'), grad: tensor([[ 1.4296e-07, 7.6648e-07, -4.8317e-06, ..., 4.9686e-07, -8.2422e-07, -2.5239e-06], [ 3.8533e-07, 6.2361e-06, 2.0377e-06, ..., 3.8706e-06, 9.5926e-08, -5.0589e-06], [ 4.0862e-07, 1.0297e-05, 4.1053e-06, ..., 2.8946e-06, 3.8673e-07, 3.0342e-06], ..., [ 2.7614e-07, -1.5222e-05, -1.1273e-05, ..., -2.3320e-05, 4.2166e-07, 2.6282e-06], [ 2.6282e-06, 6.1169e-06, 6.7465e-06, ..., 6.1952e-06, 7.6322e-07, 6.9328e-06], [-1.8403e-06, 1.3672e-05, -2.7977e-06, ..., -4.3213e-07, 2.0326e-07, 1.0408e-07]], device='cuda:0') Epoch 79, bias, value: tensor([-0.0168, -0.0141, -0.0265, -0.0252, -0.0093, 0.0052, 0.0087, -0.0223, -0.0083, 0.0047], device='cuda:0'), grad: tensor([-6.4410e-06, -1.7518e-06, 1.7568e-05, -5.4896e-05, 6.5900e-06, 2.9638e-05, -1.4208e-05, -1.6540e-05, 2.5243e-05, 1.4775e-05], device='cuda:0') 100 0.0001 changing lr epoch 78, time 220.98, cls_loss 0.0034 cls_loss_mapping 0.0095 cls_loss_causal 0.5775 re_mapping 0.0090 re_causal 0.0276 /// teacc 98.96 lr 0.00010000 Epoch 80, weight, value: tensor([[-0.1131, -0.1047, 0.0676, ..., -0.0410, 0.0362, 0.0214], [-0.0526, -0.0206, -0.0312, ..., -0.0467, -0.0718, -0.0380], [ 0.0396, -0.0456, -0.0247, ..., -0.0337, -0.0021, -0.1907], ..., [-0.0735, 0.0678, 0.0023, ..., 0.0726, -0.0278, -0.0738], [-0.0338, -0.0437, 0.0742, ..., -0.0392, -0.1197, 0.0780], [-0.0063, -0.1059, 0.0668, ..., 0.0513, -0.1266, -0.0465]], device='cuda:0'), grad: tensor([[ 3.0873e-07, 1.0915e-06, -5.1521e-06, ..., 9.9279e-07, -8.2422e-08, -1.1675e-05], [ 6.1933e-08, 1.3098e-05, 5.5432e-06, ..., 1.0014e-05, 1.8775e-06, 7.2300e-05], [-7.8185e-07, 2.6319e-06, 1.1791e-06, ..., 8.0094e-07, -1.6809e-05, -9.1493e-05], ..., [ 5.7556e-07, -3.5077e-05, -1.2867e-05, ..., -2.6822e-05, 1.0477e-06, 7.6182e-07], [ 7.7765e-07, 4.3809e-06, 4.3772e-08, ..., 3.1982e-06, 9.2387e-07, 2.1860e-05], [-2.7590e-07, 5.6624e-06, -3.7104e-06, ..., -6.2771e-07, 6.3935e-07, 2.9355e-06]], device='cuda:0') Epoch 80, bias, value: tensor([-0.0171, -0.0139, -0.0266, -0.0241, -0.0097, 0.0051, 0.0091, -0.0226, -0.0091, 0.0050], device='cuda:0'), grad: tensor([-4.3325e-06, 6.6519e-04, -8.3733e-04, 5.3979e-06, 3.1330e-06, 4.4823e-05, 1.6749e-05, -4.3303e-05, 1.3638e-04, 1.3039e-05], device='cuda:0') 100 0.0001 changing lr epoch 79, time 220.47, cls_loss 0.0042 cls_loss_mapping 0.0111 cls_loss_causal 0.6112 re_mapping 0.0090 re_causal 0.0272 /// teacc 98.86 lr 0.00010000 Epoch 81, weight, value: tensor([[-0.1132, -0.1052, 0.0684, ..., -0.0415, 0.0368, 0.0224], [-0.0532, -0.0213, -0.0320, ..., -0.0472, -0.0717, -0.0381], [ 0.0394, -0.0465, -0.0255, ..., -0.0335, -0.0010, -0.1917], ..., [-0.0745, 0.0693, 0.0024, ..., 0.0732, -0.0282, -0.0744], [-0.0342, -0.0441, 0.0746, ..., -0.0396, -0.1205, 0.0785], [-0.0064, -0.1063, 0.0678, ..., 0.0515, -0.1269, -0.0473]], device='cuda:0'), grad: tensor([[-8.4098e-07, 1.5721e-05, -2.5943e-05, ..., 3.3174e-06, -4.2990e-06, -2.4796e-05], [-1.5569e-04, -3.0708e-03, -1.5659e-03, ..., -4.8518e-04, 6.9756e-07, -1.0565e-05], [ 1.6373e-06, 5.8556e-04, 1.8179e-05, ..., 1.8328e-05, -4.0568e-06, 1.1146e-05], ..., [ 8.8274e-05, 8.5354e-04, 8.8406e-04, ..., 2.1017e-04, 1.1111e-06, 9.1419e-06], [-3.0901e-06, 1.0394e-05, -2.0549e-05, ..., -3.6545e-06, 2.5742e-06, -2.8953e-05], [ 6.4850e-05, 1.4420e-03, 6.4659e-04, ..., 2.3103e-04, 2.3888e-07, 2.5004e-05]], device='cuda:0') Epoch 81, bias, value: tensor([-0.0165, -0.0153, -0.0253, -0.0249, -0.0095, 0.0047, 0.0088, -0.0222, -0.0089, 0.0054], device='cuda:0'), grad: tensor([-1.9699e-05, -8.4839e-03, 7.6628e-04, 2.4486e-04, 1.1927e-04, 3.3647e-05, 1.3381e-05, 3.5477e-03, 1.2200e-06, 3.7804e-03], device='cuda:0') 100 0.0001 changing lr epoch 80, time 220.65, cls_loss 0.0032 cls_loss_mapping 0.0105 cls_loss_causal 0.6311 re_mapping 0.0091 re_causal 0.0287 /// teacc 98.89 lr 0.00010000 Epoch 82, weight, value: tensor([[-0.1150, -0.1059, 0.0685, ..., -0.0421, 0.0365, 0.0224], [-0.0537, -0.0215, -0.0321, ..., -0.0477, -0.0719, -0.0380], [ 0.0381, -0.0472, -0.0264, ..., -0.0341, -0.0010, -0.1939], ..., [-0.0756, 0.0704, 0.0025, ..., 0.0740, -0.0288, -0.0750], [-0.0335, -0.0443, 0.0760, ..., -0.0390, -0.1198, 0.0802], [-0.0059, -0.1076, 0.0679, ..., 0.0515, -0.1292, -0.0484]], device='cuda:0'), grad: tensor([[ 3.9525e-06, 6.5565e-07, -2.9981e-05, ..., 1.6224e-06, -5.3179e-07, -2.2411e-05], [ 7.5996e-07, 1.3364e-07, 2.3358e-06, ..., 8.4490e-06, -3.0827e-07, -4.8019e-06], [-2.3887e-05, 3.8370e-06, 1.2465e-05, ..., 4.0121e-06, 1.6275e-07, 1.0967e-05], ..., [ 9.1922e-07, -1.7017e-05, 4.9397e-06, ..., 2.4736e-06, 2.9011e-07, 7.9051e-06], [ 1.4894e-05, 1.0021e-06, -7.0371e-06, ..., 5.0366e-06, 5.2154e-08, -1.0237e-05], [ 7.9302e-07, 3.7700e-06, -5.5879e-06, ..., 1.5751e-05, 7.8231e-08, 6.1505e-06]], device='cuda:0') Epoch 82, bias, value: tensor([-0.0168, -0.0151, -0.0261, -0.0258, -0.0094, 0.0049, 0.0084, -0.0218, -0.0071, 0.0048], device='cuda:0'), grad: tensor([-6.1035e-05, 1.6466e-05, -1.0848e-04, 3.3677e-05, -1.4806e-04, 2.8566e-05, 5.1185e-06, 4.4912e-05, 8.6963e-05, 1.0163e-04], device='cuda:0') 100 0.0001 changing lr epoch 81, time 220.60, cls_loss 0.0034 cls_loss_mapping 0.0094 cls_loss_causal 0.5551 re_mapping 0.0092 re_causal 0.0262 /// teacc 98.89 lr 0.00010000 Epoch 83, weight, value: tensor([[-0.1156, -0.1067, 0.0689, ..., -0.0425, 0.0365, 0.0226], [-0.0541, -0.0216, -0.0323, ..., -0.0480, -0.0716, -0.0379], [ 0.0379, -0.0476, -0.0267, ..., -0.0345, -0.0008, -0.1944], ..., [-0.0766, 0.0708, 0.0025, ..., 0.0742, -0.0289, -0.0755], [-0.0340, -0.0447, 0.0762, ..., -0.0394, -0.1200, 0.0804], [-0.0057, -0.1083, 0.0688, ..., 0.0523, -0.1293, -0.0487]], device='cuda:0'), grad: tensor([[ 1.6671e-07, 1.3607e-06, -1.2882e-05, ..., 9.5740e-07, 2.9765e-06, 2.6412e-06], [ 1.1385e-07, 2.5108e-06, -2.1402e-06, ..., 2.1085e-06, 1.3621e-07, -7.6368e-06], [ 1.4715e-07, 6.6273e-06, 3.7700e-06, ..., 2.5574e-06, -5.6848e-06, 1.2018e-05], ..., [ 2.8731e-07, -2.7671e-05, -8.6874e-06, ..., -2.3350e-05, 4.9919e-07, 3.4850e-06], [ 1.0490e-05, 4.9844e-06, -5.9903e-06, ..., 9.3598e-07, 1.7509e-07, 2.6435e-05], [-1.1530e-06, 6.8247e-06, -4.7009e-07, ..., -1.3970e-07, 1.4668e-07, 9.8944e-06]], device='cuda:0') Epoch 83, bias, value: tensor([-0.0167, -0.0149, -0.0260, -0.0253, -0.0094, 0.0042, 0.0090, -0.0221, -0.0075, 0.0050], device='cuda:0'), grad: tensor([ 6.5207e-05, -3.8326e-05, 4.5486e-06, 8.8587e-06, 5.0485e-05, 1.4162e-04, -2.9874e-04, -2.6450e-05, 6.7890e-05, 2.4796e-05], device='cuda:0') 100 0.0001 changing lr epoch 82, time 220.69, cls_loss 0.0029 cls_loss_mapping 0.0110 cls_loss_causal 0.5669 re_mapping 0.0090 re_causal 0.0272 /// teacc 98.92 lr 0.00010000 Epoch 84, weight, value: tensor([[-1.1605e-01, -1.0887e-01, 6.9048e-02, ..., -4.3175e-02, 3.6572e-02, 2.2682e-02], [-5.4631e-02, -2.0724e-02, -3.2177e-02, ..., -4.8267e-02, -7.1158e-02, -3.7393e-02], [ 3.7713e-02, -4.9078e-02, -2.7272e-02, ..., -3.5733e-02, 1.1970e-04, -1.9499e-01], ..., [-7.6891e-02, 7.1293e-02, 2.4593e-03, ..., 7.5045e-02, -2.8202e-02, -7.6081e-02], [-3.4615e-02, -4.5182e-02, 7.6742e-02, ..., -3.9170e-02, -1.2020e-01, 8.0557e-02], [-6.3753e-03, -1.0903e-01, 6.9221e-02, ..., 5.2737e-02, -1.2944e-01, -4.9051e-02]], device='cuda:0'), grad: tensor([[ 5.1223e-09, 1.0128e-07, -5.8254e-07, ..., 4.7428e-07, 1.8161e-08, -3.5460e-07], [ 1.0943e-08, 7.7579e-07, 7.7020e-07, ..., 8.7358e-07, 4.6566e-10, 9.3132e-08], [ 2.3283e-09, 3.2373e-06, 1.5274e-06, ..., 1.1809e-06, -2.4913e-08, 1.4529e-06], ..., [ 8.8941e-08, -2.2873e-06, 5.2201e-07, ..., -1.4026e-06, 5.1223e-09, 7.0501e-07], [ 4.7963e-08, -9.5461e-07, -1.1483e-06, ..., 2.3972e-06, 2.0023e-08, -4.8578e-06], [-2.8848e-07, 7.1619e-07, -5.2482e-05, ..., -4.5419e-05, 2.5611e-09, 8.8615e-07]], device='cuda:0') Epoch 84, bias, value: tensor([-0.0166, -0.0146, -0.0263, -0.0255, -0.0099, 0.0044, 0.0091, -0.0220, -0.0076, 0.0052], device='cuda:0'), grad: tensor([ 3.3062e-06, 1.9115e-07, 3.2876e-06, 1.1688e-06, 1.0478e-04, 9.1866e-06, -1.1154e-05, 1.9409e-06, 2.3842e-06, -1.1533e-04], device='cuda:0') 100 0.0001 changing lr epoch 83, time 220.94, cls_loss 0.0027 cls_loss_mapping 0.0101 cls_loss_causal 0.5791 re_mapping 0.0091 re_causal 0.0271 /// teacc 98.86 lr 0.00010000 Epoch 85, weight, value: tensor([[-0.1167, -0.1092, 0.0692, ..., -0.0437, 0.0365, 0.0232], [-0.0547, -0.0210, -0.0323, ..., -0.0486, -0.0710, -0.0373], [ 0.0379, -0.0496, -0.0278, ..., -0.0362, 0.0006, -0.1962], ..., [-0.0783, 0.0720, 0.0025, ..., 0.0755, -0.0284, -0.0767], [-0.0345, -0.0454, 0.0777, ..., -0.0388, -0.1204, 0.0815], [-0.0060, -0.1096, 0.0694, ..., 0.0524, -0.1298, -0.0499]], device='cuda:0'), grad: tensor([[-1.0366e-06, 1.2212e-05, -3.5852e-05, ..., 8.6501e-06, 2.3283e-10, -2.0161e-05], [ 5.5833e-07, 4.6529e-06, 2.7753e-06, ..., 4.0866e-06, 6.9849e-10, -4.3027e-06], [ 1.3458e-06, 3.3770e-06, 1.5497e-05, ..., 3.7402e-06, 6.9849e-10, 1.2554e-05], ..., [-3.7160e-06, -4.1574e-05, -1.4625e-05, ..., -1.7956e-05, 1.3970e-09, -6.8620e-06], [ 1.6466e-05, 1.1623e-06, 2.0787e-05, ..., 2.1607e-05, 4.6566e-10, 1.0937e-05], [-2.1815e-05, 4.1015e-06, -2.7984e-05, ..., -5.5432e-06, 1.6298e-09, -1.2308e-05]], device='cuda:0') Epoch 85, bias, value: tensor([-0.0167, -0.0147, -0.0259, -0.0260, -0.0095, 0.0039, 0.0098, -0.0221, -0.0069, 0.0046], device='cuda:0'), grad: tensor([-8.1837e-05, 5.9009e-06, 4.3511e-05, 5.8591e-05, -6.6638e-05, 1.3880e-05, 4.1239e-06, -4.7386e-05, 5.7936e-05, 1.1690e-05], device='cuda:0') 100 0.0001 changing lr epoch 84, time 221.02, cls_loss 0.0030 cls_loss_mapping 0.0090 cls_loss_causal 0.5993 re_mapping 0.0089 re_causal 0.0276 /// teacc 99.00 lr 0.00010000 Epoch 86, weight, value: tensor([[-0.1174, -0.1091, 0.0698, ..., -0.0441, 0.0365, 0.0243], [-0.0548, -0.0211, -0.0325, ..., -0.0490, -0.0709, -0.0372], [ 0.0377, -0.0490, -0.0267, ..., -0.0368, 0.0032, -0.1969], ..., [-0.0772, 0.0727, 0.0029, ..., 0.0773, -0.0282, -0.0773], [-0.0351, -0.0467, 0.0771, ..., -0.0394, -0.1234, 0.0819], [-0.0053, -0.1113, 0.0701, ..., 0.0525, -0.1300, -0.0502]], device='cuda:0'), grad: tensor([[ 1.5171e-06, 2.0233e-07, -4.9211e-06, ..., 1.3143e-05, 4.9137e-06, -7.3388e-06], [ 6.8126e-07, 1.4575e-07, 4.4033e-06, ..., 2.3600e-06, 7.5810e-07, 1.6680e-06], [-1.8124e-06, 2.0079e-06, -1.5453e-05, ..., -4.4435e-05, -1.5065e-05, 2.9802e-06], ..., [ 7.9675e-07, -4.2617e-06, 8.5607e-06, ..., 1.9386e-05, 9.5554e-07, 3.2559e-06], [ 2.7735e-06, 3.1665e-07, 2.9281e-06, ..., 9.0525e-06, 3.6750e-06, -6.9775e-06], [-3.2876e-06, 1.6009e-06, -2.0579e-05, ..., -9.8944e-06, 8.1398e-07, 1.9595e-06]], device='cuda:0') Epoch 86, bias, value: tensor([-0.0155, -0.0145, -0.0256, -0.0265, -0.0104, 0.0034, 0.0097, -0.0213, -0.0073, 0.0046], device='cuda:0'), grad: tensor([ 4.3243e-05, 4.2133e-06, -1.8573e-04, 9.9540e-06, 6.0610e-06, 9.6619e-05, -6.8367e-05, 7.0512e-05, 3.4630e-05, -1.1630e-05], device='cuda:0') 100 0.0001 changing lr epoch 85, time 220.61, cls_loss 0.0032 cls_loss_mapping 0.0099 cls_loss_causal 0.5822 re_mapping 0.0090 re_causal 0.0263 /// teacc 98.91 lr 0.00010000 Epoch 87, weight, value: tensor([[-0.1201, -0.1115, 0.0699, ..., -0.0444, 0.0363, 0.0239], [-0.0552, -0.0207, -0.0326, ..., -0.0489, -0.0713, -0.0377], [ 0.0368, -0.0493, -0.0271, ..., -0.0373, 0.0034, -0.1981], ..., [-0.0781, 0.0730, 0.0033, ..., 0.0780, -0.0276, -0.0780], [-0.0350, -0.0471, 0.0777, ..., -0.0395, -0.1236, 0.0828], [-0.0036, -0.1131, 0.0700, ..., 0.0524, -0.1303, -0.0505]], device='cuda:0'), grad: tensor([[-3.3760e-08, 4.9686e-07, 1.5823e-06, ..., 6.5565e-07, 4.0680e-06, 2.6166e-05], [ 1.3388e-07, 3.0287e-06, 4.6372e-04, ..., 2.6915e-06, 3.3691e-07, 1.1606e-03], [ 7.0315e-08, 2.0768e-06, 3.7503e-04, ..., 1.0068e-06, 5.6438e-07, 9.3889e-04], ..., [ 3.9442e-07, -2.7075e-05, 1.8165e-05, ..., -2.3142e-05, 1.2876e-07, 6.4552e-05], [ 1.1805e-07, 4.6217e-07, -9.0790e-04, ..., 6.3144e-07, 3.9823e-06, -2.2640e-03], [-7.2643e-07, 1.4991e-05, 4.8019e-06, ..., 7.5623e-06, 6.6403e-07, 9.7156e-06]], device='cuda:0') Epoch 87, bias, value: tensor([-0.0161, -0.0139, -0.0260, -0.0264, -0.0104, 0.0032, 0.0102, -0.0215, -0.0069, 0.0045], device='cuda:0'), grad: tensor([ 7.2777e-05, 2.1763e-03, 1.7605e-03, 4.0680e-05, 8.7321e-06, 7.3910e-05, -6.0111e-05, 8.4400e-05, -4.2000e-03, 4.3720e-05], device='cuda:0') 100 0.0001 changing lr epoch 86, time 220.47, cls_loss 0.0029 cls_loss_mapping 0.0108 cls_loss_causal 0.5926 re_mapping 0.0082 re_causal 0.0251 /// teacc 98.97 lr 0.00010000 Epoch 88, weight, value: tensor([[-0.1209, -0.1122, 0.0702, ..., -0.0446, 0.0363, 0.0235], [-0.0557, -0.0203, -0.0332, ..., -0.0496, -0.0714, -0.0383], [ 0.0364, -0.0498, -0.0279, ..., -0.0366, 0.0034, -0.2001], ..., [-0.0790, 0.0734, 0.0033, ..., 0.0784, -0.0268, -0.0788], [-0.0355, -0.0473, 0.0783, ..., -0.0401, -0.1237, 0.0837], [-0.0036, -0.1143, 0.0706, ..., 0.0524, -0.1307, -0.0511]], device='cuda:0'), grad: tensor([[ 1.5949e-07, 1.0364e-05, -7.0818e-06, ..., 3.9581e-07, 5.5647e-08, -8.9705e-06], [ 1.0221e-07, 5.4911e-06, 2.4401e-06, ..., 3.3788e-06, 1.3104e-06, 2.3525e-06], [ 9.3598e-07, 9.4175e-05, 2.9653e-06, ..., 2.2724e-06, 3.4668e-07, 4.8131e-06], ..., [ 4.9081e-07, 8.4639e-06, 2.7847e-07, ..., 1.8626e-08, 1.0878e-06, 1.8394e-06], [ 4.9062e-06, 1.8589e-06, -1.5214e-05, ..., -6.5267e-06, 1.1083e-06, -1.3843e-05], [ 5.5879e-07, 3.5577e-06, 6.4373e-06, ..., 2.6040e-06, 3.1688e-07, 9.8869e-06]], device='cuda:0') Epoch 88, bias, value: tensor([-0.0167, -0.0140, -0.0263, -0.0265, -0.0100, 0.0038, 0.0106, -0.0218, -0.0064, 0.0040], device='cuda:0'), grad: tensor([ 7.9125e-06, 2.7239e-05, 2.7418e-04, -3.8600e-04, -2.4721e-05, 4.4465e-05, -4.5355e-07, 3.5971e-05, -6.1765e-06, 2.6971e-05], device='cuda:0') 100 0.0001 changing lr epoch 87, time 220.13, cls_loss 0.0029 cls_loss_mapping 0.0103 cls_loss_causal 0.6152 re_mapping 0.0084 re_causal 0.0263 /// teacc 98.98 lr 0.00010000 Epoch 89, weight, value: tensor([[-0.1219, -0.1127, 0.0708, ..., -0.0450, 0.0364, 0.0236], [-0.0564, -0.0209, -0.0336, ..., -0.0506, -0.0722, -0.0383], [ 0.0363, -0.0505, -0.0281, ..., -0.0373, 0.0040, -0.2011], ..., [-0.0812, 0.0744, 0.0033, ..., 0.0791, -0.0269, -0.0793], [-0.0365, -0.0477, 0.0785, ..., -0.0405, -0.1239, 0.0846], [-0.0019, -0.1149, 0.0713, ..., 0.0528, -0.1313, -0.0512]], device='cuda:0'), grad: tensor([[ 8.6846e-07, 1.7649e-06, -8.9556e-06, ..., 1.5087e-06, -1.6382e-06, -1.1049e-05], [ 3.0641e-07, 4.0904e-06, 3.3416e-06, ..., 2.9039e-06, -3.4436e-07, -2.6566e-07], [ 1.2647e-06, -2.6077e-05, 4.6641e-05, ..., -2.1160e-05, -3.2480e-07, 4.2677e-05], ..., [ 1.1679e-06, 1.1019e-05, 2.7427e-07, ..., 1.9431e-05, 4.4028e-07, 5.0180e-06], [ 6.0052e-06, 3.6377e-06, -8.9586e-05, ..., -4.5836e-05, 1.5064e-07, -8.2076e-05], [-3.7309e-06, 9.2201e-07, 2.6584e-05, ..., 3.2216e-05, 2.5611e-07, 4.8012e-05]], device='cuda:0') Epoch 89, bias, value: tensor([-0.0166, -0.0148, -0.0258, -0.0265, -0.0101, 0.0036, 0.0100, -0.0213, -0.0060, 0.0042], device='cuda:0'), grad: tensor([-9.0897e-06, 9.0376e-06, -1.2887e-04, 2.2635e-05, 7.3969e-05, -9.9540e-06, -4.4674e-05, 1.7655e-04, -1.4770e-04, 5.7995e-05], device='cuda:0') 100 0.0001 changing lr epoch 88, time 220.83, cls_loss 0.0035 cls_loss_mapping 0.0114 cls_loss_causal 0.5983 re_mapping 0.0087 re_causal 0.0249 /// teacc 98.83 lr 0.00010000 Epoch 90, weight, value: tensor([[-0.1236, -0.1138, 0.0712, ..., -0.0454, 0.0359, 0.0236], [-0.0578, -0.0208, -0.0340, ..., -0.0510, -0.0715, -0.0386], [ 0.0367, -0.0500, -0.0284, ..., -0.0379, 0.0039, -0.2017], ..., [-0.0809, 0.0744, 0.0036, ..., 0.0802, -0.0271, -0.0802], [-0.0382, -0.0479, 0.0792, ..., -0.0405, -0.1239, 0.0848], [-0.0036, -0.1166, 0.0711, ..., 0.0525, -0.1328, -0.0530]], device='cuda:0'), grad: tensor([[ 1.3423e-07, 4.8662e-07, -7.0818e-06, ..., 3.0966e-07, -5.9837e-08, -6.3777e-06], [ 1.1292e-07, 1.4510e-06, 1.2200e-06, ..., 1.3532e-06, 4.6566e-09, -1.2256e-06], [ 8.2538e-08, 1.4044e-06, 3.0659e-06, ..., 7.8417e-07, -5.4482e-08, 4.2394e-06], ..., [ 5.7183e-06, -8.2701e-06, 3.0771e-06, ..., -1.2377e-06, 1.5716e-08, 2.2501e-06], [ 2.0731e-06, 3.7141e-06, -4.0792e-06, ..., 2.4401e-06, 3.6671e-08, -4.2915e-06], [-1.1295e-05, 4.5933e-06, -3.3807e-06, ..., -1.2033e-05, 7.6834e-09, 3.0342e-06]], device='cuda:0') Epoch 90, bias, value: tensor([-0.0169, -0.0146, -0.0250, -0.0257, -0.0100, 0.0041, 0.0101, -0.0220, -0.0063, 0.0033], device='cuda:0'), grad: tensor([-1.2539e-05, -2.7604e-06, 4.7237e-06, -1.0610e-05, 7.8976e-06, 4.3422e-05, -3.4451e-05, 6.3181e-06, 4.1388e-06, -6.1728e-06], device='cuda:0') 100 0.0001 changing lr epoch 89, time 220.52, cls_loss 0.0024 cls_loss_mapping 0.0062 cls_loss_causal 0.6230 re_mapping 0.0084 re_causal 0.0251 /// teacc 98.76 lr 0.00010000 Epoch 91, weight, value: tensor([[-0.1243, -0.1148, 0.0716, ..., -0.0458, 0.0360, 0.0240], [-0.0583, -0.0212, -0.0344, ..., -0.0515, -0.0712, -0.0387], [ 0.0366, -0.0508, -0.0291, ..., -0.0386, 0.0041, -0.2028], ..., [-0.0813, 0.0751, 0.0036, ..., 0.0804, -0.0273, -0.0807], [-0.0393, -0.0480, 0.0797, ..., -0.0411, -0.1241, 0.0851], [-0.0043, -0.1172, 0.0715, ..., 0.0526, -0.1331, -0.0539]], device='cuda:0'), grad: tensor([[ 7.0548e-08, 1.8150e-05, 5.6960e-06, ..., 1.3694e-05, 2.9220e-08, 5.9744e-07], [ 6.2166e-08, 5.5432e-05, 2.3142e-05, ..., 2.0489e-05, -1.9907e-08, -1.0347e-06], [ 4.6217e-08, 2.8923e-05, 9.2313e-06, ..., 1.4469e-05, -1.4703e-07, 1.2843e-06], ..., [ 5.8487e-07, -1.6117e-04, -4.3929e-05, ..., 3.5167e-05, 1.4005e-07, 1.2359e-06], [ 3.6415e-07, 2.2724e-06, 2.0757e-05, ..., 2.1219e-05, 6.1118e-08, -4.0941e-06], [-3.6918e-06, 3.1799e-05, -5.8025e-05, ..., 4.0323e-05, 7.4506e-09, 9.8534e-07]], device='cuda:0') Epoch 91, bias, value: tensor([-0.0166, -0.0149, -0.0253, -0.0255, -0.0093, 0.0042, 0.0101, -0.0221, -0.0063, 0.0030], device='cuda:0'), grad: tensor([ 4.6045e-05, 1.0115e-04, 5.1498e-05, 7.3195e-05, -5.2500e-04, 2.9907e-05, 2.8014e-06, 1.9267e-05, 7.1824e-05, 1.2887e-04], device='cuda:0') 100 0.0001 changing lr epoch 90, time 220.76, cls_loss 0.0024 cls_loss_mapping 0.0091 cls_loss_causal 0.5834 re_mapping 0.0085 re_causal 0.0252 /// teacc 98.78 lr 0.00010000 Epoch 92, weight, value: tensor([[-0.1248, -0.1154, 0.0722, ..., -0.0463, 0.0360, 0.0247], [-0.0591, -0.0214, -0.0348, ..., -0.0521, -0.0712, -0.0385], [ 0.0362, -0.0515, -0.0299, ..., -0.0392, 0.0042, -0.2037], ..., [-0.0818, 0.0758, 0.0039, ..., 0.0810, -0.0274, -0.0810], [-0.0396, -0.0483, 0.0804, ..., -0.0413, -0.1241, 0.0858], [-0.0038, -0.1182, 0.0721, ..., 0.0529, -0.1335, -0.0544]], device='cuda:0'), grad: tensor([[-9.4296e-08, 3.5018e-07, -6.7055e-06, ..., 2.6077e-08, 6.4494e-08, 6.7115e-05], [ 3.1525e-07, -1.3232e-04, -1.2022e-04, ..., -2.4334e-05, -3.3667e-07, -2.0361e-04], [-3.9376e-06, 6.2920e-06, 2.8554e-06, ..., 3.0585e-06, -7.0315e-08, 3.1590e-05], ..., [ 3.3551e-07, 8.0884e-05, 7.7367e-05, ..., 1.1928e-05, 9.9186e-08, 1.2982e-04], [ 5.6531e-07, 1.4447e-05, 1.3553e-05, ..., 3.3323e-06, 1.0361e-07, 2.0039e-04], [ 2.3283e-07, 2.7820e-05, 2.1935e-05, ..., 2.8051e-06, 2.4913e-08, 4.3392e-05]], device='cuda:0') Epoch 92, bias, value: tensor([-0.0147, -0.0154, -0.0257, -0.0258, -0.0094, 0.0041, 0.0093, -0.0218, -0.0059, 0.0031], device='cuda:0'), grad: tensor([ 1.0765e-04, -5.0068e-04, 1.6108e-05, 2.0534e-05, -1.4174e-04, 1.2231e-04, -5.4121e-04, 4.0603e-04, 3.5095e-04, 1.6022e-04], device='cuda:0') 100 0.0001 changing lr epoch 91, time 220.52, cls_loss 0.0033 cls_loss_mapping 0.0102 cls_loss_causal 0.5862 re_mapping 0.0085 re_causal 0.0253 /// teacc 98.87 lr 0.00010000 Epoch 93, weight, value: tensor([[-0.1253, -0.1170, 0.0728, ..., -0.0466, 0.0359, 0.0227], [-0.0597, -0.0219, -0.0355, ..., -0.0526, -0.0714, -0.0371], [ 0.0355, -0.0520, -0.0307, ..., -0.0388, 0.0041, -0.2055], ..., [-0.0805, 0.0772, 0.0048, ..., 0.0826, -0.0268, -0.0816], [-0.0398, -0.0488, 0.0815, ..., -0.0412, -0.1242, 0.0863], [-0.0029, -0.1205, 0.0717, ..., 0.0523, -0.1339, -0.0555]], device='cuda:0'), grad: tensor([[ 1.0431e-07, 1.7593e-06, -1.0960e-05, ..., 9.4529e-07, 1.1642e-09, -2.8014e-06], [ 4.4005e-08, -3.6001e-05, 2.4643e-06, ..., 1.8105e-06, 3.4925e-09, -1.6659e-05], [ 4.4471e-08, 4.4197e-05, 7.8455e-06, ..., 2.8089e-06, 3.4925e-09, 1.6212e-05], ..., [-7.7672e-07, -1.3538e-05, -1.2070e-06, ..., -9.6262e-06, 1.6764e-08, 1.7164e-06], [ 4.5798e-07, 1.2390e-05, 4.7870e-06, ..., 1.7304e-06, 6.9849e-10, -4.6194e-06], [ 3.7532e-07, 7.3016e-06, -9.7528e-06, ..., -5.1260e-06, 1.3970e-09, 1.7444e-06]], device='cuda:0') Epoch 93, bias, value: tensor([-0.0160, -0.0150, -0.0259, -0.0263, -0.0095, 0.0038, 0.0108, -0.0208, -0.0059, 0.0022], device='cuda:0'), grad: tensor([-1.7121e-05, -5.1022e-04, 4.6182e-04, -1.7539e-05, 1.5814e-06, 1.5885e-05, 6.4485e-06, 3.2693e-05, 3.3200e-05, -7.1153e-06], device='cuda:0') 100 0.0001 changing lr epoch 92, time 220.44, cls_loss 0.0034 cls_loss_mapping 0.0092 cls_loss_causal 0.6051 re_mapping 0.0083 re_causal 0.0247 /// teacc 98.78 lr 0.00010000 Epoch 94, weight, value: tensor([[-0.1259, -0.1175, 0.0730, ..., -0.0471, 0.0359, 0.0226], [-0.0603, -0.0218, -0.0362, ..., -0.0530, -0.0712, -0.0373], [ 0.0362, -0.0525, -0.0309, ..., -0.0392, 0.0043, -0.2061], ..., [-0.0814, 0.0775, 0.0046, ..., 0.0829, -0.0269, -0.0820], [-0.0401, -0.0491, 0.0822, ..., -0.0415, -0.1243, 0.0871], [-0.0013, -0.1215, 0.0733, ..., 0.0533, -0.1343, -0.0557]], device='cuda:0'), grad: tensor([[ 1.5832e-06, 2.6412e-06, 5.4799e-06, ..., 4.5151e-06, 7.2177e-09, 3.7141e-06], [ 2.6897e-06, 2.6841e-06, 5.0329e-06, ..., 7.6741e-06, 3.7253e-09, 4.3795e-07], [ 5.2992e-07, 4.4256e-06, 5.4426e-06, ..., 6.8471e-06, -5.5879e-08, 2.3469e-06], ..., [ 4.4778e-06, -5.5581e-05, -6.7592e-05, ..., -8.7857e-05, 3.1199e-08, -2.3663e-05], [ 1.2226e-05, 1.0140e-05, 1.2942e-05, ..., 2.2337e-05, 8.8476e-09, 4.5868e-07], [ 1.3806e-05, 2.2173e-05, -7.3195e-05, ..., 6.7592e-05, 6.9849e-10, -9.1866e-06]], device='cuda:0') Epoch 94, bias, value: tensor([-0.0160, -0.0156, -0.0252, -0.0260, -0.0105, 0.0031, 0.0101, -0.0212, -0.0057, 0.0040], device='cuda:0'), grad: tensor([ 1.9506e-05, 3.5286e-05, 2.0623e-05, 7.3135e-05, -2.2054e-04, 8.5354e-05, 2.5854e-05, -1.3101e-04, 6.1512e-05, 3.0428e-05], device='cuda:0') 100 0.0001 changing lr epoch 93, time 220.77, cls_loss 0.0027 cls_loss_mapping 0.0080 cls_loss_causal 0.5983 re_mapping 0.0082 re_causal 0.0238 /// teacc 98.98 lr 0.00010000 Epoch 95, weight, value: tensor([[-0.1266, -0.1188, 0.0727, ..., -0.0479, 0.0358, 0.0223], [-0.0603, -0.0230, -0.0365, ..., -0.0538, -0.0716, -0.0368], [ 0.0365, -0.0533, -0.0311, ..., -0.0398, 0.0045, -0.2069], ..., [-0.0822, 0.0791, 0.0048, ..., 0.0837, -0.0270, -0.0824], [-0.0406, -0.0497, 0.0825, ..., -0.0421, -0.1245, 0.0875], [-0.0013, -0.1227, 0.0746, ..., 0.0539, -0.1347, -0.0561]], device='cuda:0'), grad: tensor([[ 1.1735e-07, 7.2457e-07, -1.0859e-06, ..., 6.4541e-07, 2.6077e-08, 3.9488e-07], [ 6.5193e-08, 9.3356e-06, 8.4424e-07, ..., 6.4857e-06, -1.8114e-07, -2.2184e-06], [ 3.6974e-07, 1.3590e-05, 1.2644e-05, ..., 9.4920e-06, -4.4331e-07, 1.4879e-05], ..., [ 1.6880e-07, -2.3127e-05, -5.8021e-07, ..., -1.8939e-05, 3.1036e-07, -3.0920e-07], [-2.6692e-06, 1.5981e-06, -3.4988e-05, ..., -1.8533e-06, 5.4250e-08, -5.2482e-05], [ 1.0780e-07, 1.7602e-06, 6.5193e-06, ..., 5.3365e-07, 2.6310e-08, 1.5393e-05]], device='cuda:0') Epoch 95, bias, value: tensor([-0.0167, -0.0160, -0.0249, -0.0260, -0.0109, 0.0027, 0.0105, -0.0206, -0.0058, 0.0043], device='cuda:0'), grad: tensor([ 2.1625e-06, 7.1079e-06, 4.4078e-05, -9.9540e-06, 2.1383e-06, 2.2113e-05, 1.4469e-05, -2.6688e-05, -7.7248e-05, 2.1830e-05], device='cuda:0') 100 0.0001 changing lr epoch 94, time 220.64, cls_loss 0.0037 cls_loss_mapping 0.0085 cls_loss_causal 0.5913 re_mapping 0.0077 re_causal 0.0233 /// teacc 98.89 lr 0.00010000 Epoch 96, weight, value: tensor([[-0.1273, -0.1200, 0.0718, ..., -0.0497, 0.0366, 0.0224], [-0.0624, -0.0230, -0.0367, ..., -0.0541, -0.0717, -0.0369], [ 0.0372, -0.0542, -0.0316, ..., -0.0404, 0.0047, -0.2077], ..., [-0.0826, 0.0798, 0.0043, ..., 0.0839, -0.0269, -0.0836], [-0.0412, -0.0503, 0.0826, ..., -0.0425, -0.1247, 0.0872], [-0.0007, -0.1244, 0.0765, ..., 0.0546, -0.1352, -0.0558]], device='cuda:0'), grad: tensor([[ 3.9279e-07, 9.1270e-07, -4.8503e-06, ..., 9.5181e-07, 3.0268e-09, -3.2503e-06], [ 3.3737e-07, 4.6380e-06, 1.2619e-06, ..., 3.9414e-06, 4.1211e-08, 5.0385e-07], [ 7.0967e-07, 5.7817e-06, 1.7863e-06, ..., 2.5146e-06, -7.1479e-08, 2.1271e-06], ..., [ 4.6683e-07, -2.5794e-05, -5.7183e-06, ..., -4.7237e-05, 1.3039e-08, 1.8086e-06], [ 1.4395e-05, 4.3958e-06, -5.5507e-07, ..., 9.9372e-07, 6.7521e-09, 1.6913e-05], [ 9.3952e-06, 1.6674e-05, 2.2706e-06, ..., 1.7136e-05, 1.3970e-09, 1.4402e-05]], device='cuda:0') Epoch 96, bias, value: tensor([-0.0173, -0.0156, -0.0252, -0.0241, -0.0109, 0.0010, 0.0113, -0.0213, -0.0067, 0.0052], device='cuda:0'), grad: tensor([ 4.5309e-07, 1.3232e-05, -3.4600e-05, 4.0740e-05, 3.0249e-05, -8.5235e-05, -5.1968e-06, -4.1544e-05, 2.8446e-05, 5.3346e-05], device='cuda:0') 100 0.0001 changing lr epoch 95, time 220.42, cls_loss 0.0026 cls_loss_mapping 0.0079 cls_loss_causal 0.5762 re_mapping 0.0079 re_causal 0.0240 /// teacc 98.91 lr 0.00010000 Epoch 97, weight, value: tensor([[-0.1276, -0.1210, 0.0721, ..., -0.0499, 0.0365, 0.0227], [-0.0636, -0.0231, -0.0373, ..., -0.0547, -0.0717, -0.0368], [ 0.0369, -0.0544, -0.0319, ..., -0.0409, 0.0048, -0.2085], ..., [-0.0827, 0.0801, 0.0037, ..., 0.0841, -0.0269, -0.0846], [-0.0416, -0.0507, 0.0830, ..., -0.0422, -0.1248, 0.0877], [-0.0008, -0.1254, 0.0772, ..., 0.0549, -0.1353, -0.0560]], device='cuda:0'), grad: tensor([[ 1.6931e-06, 5.0198e-07, -1.4892e-06, ..., 1.0980e-06, 0.0000e+00, 2.3004e-06], [ 5.9232e-07, 1.9539e-06, 1.6242e-06, ..., 1.5171e-06, 0.0000e+00, 1.0151e-06], [ 2.0582e-06, 1.0923e-05, 8.2776e-06, ..., 8.1509e-06, 0.0000e+00, 1.7546e-06], ..., [ 7.4394e-06, -1.6600e-05, -6.9588e-06, ..., -1.0639e-05, 0.0000e+00, 1.7568e-05], [ 3.9876e-05, 6.4820e-06, 4.4674e-05, ..., 2.6584e-05, 0.0000e+00, 3.9607e-05], [-5.9038e-05, 2.6319e-06, -9.1732e-05, ..., -5.0157e-05, 0.0000e+00, -1.6615e-05]], device='cuda:0') Epoch 97, bias, value: tensor([-0.0172, -0.0158, -0.0251, -0.0245, -0.0106, 0.0022, 0.0106, -0.0220, -0.0066, 0.0054], device='cuda:0'), grad: tensor([ 4.3213e-06, 8.2701e-06, 2.7850e-05, 4.6104e-05, 4.4256e-05, -1.4156e-05, -6.8605e-05, 6.1095e-06, 1.4162e-04, -1.9574e-04], device='cuda:0') 100 0.0001 changing lr epoch 96, time 220.47, cls_loss 0.0027 cls_loss_mapping 0.0080 cls_loss_causal 0.5928 re_mapping 0.0080 re_causal 0.0239 /// teacc 98.83 lr 0.00010000 Epoch 98, weight, value: tensor([[-1.2902e-01, -1.2323e-01, 7.1964e-02, ..., -5.0926e-02, 3.6483e-02, 2.2723e-02], [-6.5255e-02, -2.3615e-02, -3.7695e-02, ..., -5.5340e-02, -7.1735e-02, -3.7051e-02], [ 3.6466e-02, -5.5314e-02, -3.2395e-02, ..., -4.1621e-02, 4.8801e-03, -2.0956e-01], ..., [-8.3021e-02, 8.1728e-02, 4.1624e-03, ..., 8.5183e-02, -2.6903e-02, -8.5333e-02], [-4.1732e-02, -5.1501e-02, 8.3464e-02, ..., -4.2543e-02, -1.2482e-01, 8.8447e-02], [-1.8180e-04, -1.2707e-01, 7.7528e-02, ..., 5.4682e-02, -1.3581e-01, -5.6596e-02]], device='cuda:0'), grad: tensor([[ 1.7099e-06, 4.5518e-07, -8.6650e-06, ..., 2.0899e-06, 2.3283e-10, -3.5316e-06], [ 7.2876e-07, 8.6101e-07, 1.8170e-06, ..., 3.6061e-06, 4.6566e-10, 2.5192e-07], [ 4.2804e-06, 1.2591e-06, 7.4469e-06, ..., 4.8801e-06, 2.0955e-09, 3.0901e-06], ..., [ 3.5286e-05, -1.0263e-06, 4.8727e-05, ..., 4.1187e-05, 4.6566e-09, 1.9342e-05], [ 1.8273e-06, 3.1441e-06, -1.3955e-05, ..., 3.5446e-06, 4.6566e-10, -1.2614e-05], [-5.5999e-05, 1.7453e-06, -6.1870e-05, ..., -1.2249e-05, 9.3132e-10, -1.9819e-05]], device='cuda:0') Epoch 98, bias, value: tensor([-0.0173, -0.0163, -0.0256, -0.0250, -0.0104, 0.0022, 0.0108, -0.0210, -0.0063, 0.0050], device='cuda:0'), grad: tensor([-7.8008e-06, 9.8869e-06, 2.3887e-05, 2.5313e-06, -1.3268e-04, 1.7166e-05, 5.5172e-06, 1.3983e-04, -1.0185e-05, -4.8161e-05], device='cuda:0') 100 0.0001 changing lr epoch 97, time 220.78, cls_loss 0.0025 cls_loss_mapping 0.0076 cls_loss_causal 0.5469 re_mapping 0.0079 re_causal 0.0223 /// teacc 98.95 lr 0.00010000 Epoch 99, weight, value: tensor([[-1.2927e-01, -1.2396e-01, 7.2616e-02, ..., -5.1338e-02, 3.6392e-02, 2.2933e-02], [-6.5970e-02, -2.4011e-02, -3.8223e-02, ..., -5.6125e-02, -7.1555e-02, -3.7248e-02], [ 3.6582e-02, -5.5851e-02, -3.2798e-02, ..., -4.2062e-02, 5.0595e-03, -2.1055e-01], ..., [-8.4358e-02, 8.2614e-02, 4.2962e-03, ..., 8.5965e-02, -2.6940e-02, -8.5672e-02], [-4.2361e-02, -5.1977e-02, 8.4030e-02, ..., -4.2685e-02, -1.2489e-01, 8.9035e-02], [ 9.0722e-05, -1.2806e-01, 7.7720e-02, ..., 5.4486e-02, -1.3632e-01, -5.7351e-02]], device='cuda:0'), grad: tensor([[ 7.6648e-07, 1.9674e-07, -3.6731e-06, ..., 9.8487e-08, 1.1898e-07, -1.1930e-06], [ 5.0198e-07, 7.4599e-07, 6.4261e-07, ..., 1.8789e-07, 1.2596e-07, -3.0408e-07], [-6.3956e-05, -1.1243e-05, 8.3726e-07, ..., -1.3476e-06, -1.3784e-06, 1.7984e-06], ..., [ 6.6264e-07, 4.3698e-06, 5.4296e-07, ..., 1.3616e-06, 2.4471e-07, 9.3877e-07], [ 1.1018e-06, 1.9856e-06, 1.2806e-06, ..., 5.5972e-07, 5.3830e-07, 8.4341e-06], [ 6.2346e-05, 6.3591e-06, 8.4378e-07, ..., -9.7509e-07, 2.2585e-08, 3.7104e-06]], device='cuda:0') Epoch 99, bias, value: tensor([-0.0172, -0.0164, -0.0257, -0.0251, -0.0102, 0.0023, 0.0107, -0.0208, -0.0060, 0.0047], device='cuda:0'), grad: tensor([ 4.9826e-08, 4.0904e-06, -3.4523e-04, 3.0577e-05, 3.5204e-06, -9.1195e-05, 4.9025e-05, 2.1890e-05, 2.9325e-05, 2.9826e-04], device='cuda:0') 100 0.0001 changing lr epoch 98, time 221.06, cls_loss 0.0027 cls_loss_mapping 0.0059 cls_loss_causal 0.5309 re_mapping 0.0075 re_causal 0.0221 /// teacc 98.88 lr 0.00010000 Epoch 100, weight, value: tensor([[-0.1297, -0.1245, 0.0729, ..., -0.0516, 0.0367, 0.0231], [-0.0667, -0.0243, -0.0388, ..., -0.0565, -0.0719, -0.0367], [ 0.0366, -0.0562, -0.0330, ..., -0.0419, 0.0055, -0.2112], ..., [-0.0848, 0.0833, 0.0046, ..., 0.0864, -0.0259, -0.0862], [-0.0432, -0.0528, 0.0843, ..., -0.0430, -0.1251, 0.0892], [-0.0024, -0.1292, 0.0778, ..., 0.0533, -0.1371, -0.0586]], device='cuda:0'), grad: tensor([[ 2.5821e-07, 5.2666e-07, -1.5604e-04, ..., 1.4110e-07, -2.4736e-06, -1.6475e-04], [ 6.9849e-08, 1.3849e-06, 2.7213e-06, ..., 2.4028e-07, 6.1700e-08, -6.1020e-06], [ 5.3318e-08, 4.8056e-07, 7.6517e-06, ..., 1.6298e-07, 4.3050e-07, 1.4357e-05], ..., [ 3.3784e-07, 7.9069e-07, 3.8184e-06, ..., 8.5495e-07, 2.1094e-07, 4.1425e-06], [ 1.7462e-06, 4.1723e-06, -2.9773e-05, ..., 5.8580e-07, 1.9185e-07, -4.8429e-05], [-4.2818e-07, 2.0433e-06, 1.1936e-05, ..., -4.5970e-06, 5.4482e-07, 1.9372e-05]], device='cuda:0') Epoch 100, bias, value: tensor([-0.0172, -0.0161, -0.0255, -0.0248, -0.0087, 0.0030, 0.0097, -0.0207, -0.0064, 0.0031], device='cuda:0'), grad: tensor([-3.7503e-04, -3.2872e-05, 2.2814e-05, 2.9393e-06, 1.2517e-05, 9.8571e-06, 3.7479e-04, 1.5616e-05, -5.6922e-05, 2.6435e-05], device='cuda:0') 100 0.0001 changing lr epoch 99, time 220.53, cls_loss 0.0023 cls_loss_mapping 0.0060 cls_loss_causal 0.5899 re_mapping 0.0072 re_causal 0.0225 /// teacc 98.98 lr 0.00010000 Epoch 101, weight, value: tensor([[-0.1291, -0.1247, 0.0739, ..., -0.0519, 0.0368, 0.0237], [-0.0690, -0.0246, -0.0392, ..., -0.0569, -0.0719, -0.0367], [ 0.0361, -0.0566, -0.0335, ..., -0.0424, 0.0056, -0.2124], ..., [-0.0856, 0.0837, 0.0046, ..., 0.0869, -0.0259, -0.0868], [-0.0439, -0.0532, 0.0847, ..., -0.0432, -0.1251, 0.0899], [-0.0015, -0.1301, 0.0783, ..., 0.0537, -0.1375, -0.0587]], device='cuda:0'), grad: tensor([[ 3.1702e-06, 3.5405e-05, -8.6240e-07, ..., 8.7321e-06, 6.9849e-10, 3.7737e-06], [ 2.3637e-06, -7.8797e-05, 7.0315e-07, ..., 6.2399e-06, 4.6566e-10, -1.4752e-05], [ 8.0280e-07, 1.0036e-05, 4.5076e-06, ..., 1.7593e-06, -7.6834e-09, 1.6436e-05], ..., [-8.7693e-06, -5.9515e-05, 1.9595e-06, ..., -3.5614e-05, 5.3551e-09, 1.2875e-05], [ 1.2033e-05, 5.2787e-06, 2.1420e-06, ..., 1.1828e-06, 9.3132e-10, 2.2709e-05], [-3.9220e-05, 1.4715e-05, -1.9550e-05, ..., -3.4750e-05, 0.0000e+00, -1.7151e-05]], device='cuda:0') Epoch 101, bias, value: tensor([-0.0166, -0.0163, -0.0255, -0.0250, -0.0091, 0.0027, 0.0101, -0.0208, -0.0062, 0.0035], device='cuda:0'), grad: tensor([ 6.2048e-05, -1.9860e-04, 9.4473e-05, 4.3929e-05, 4.7565e-05, 7.1526e-05, -1.8048e-04, 3.7283e-05, 6.6042e-05, -4.3392e-05], device='cuda:0') 100 0.0001 changing lr epoch 100, time 220.87, cls_loss 0.0027 cls_loss_mapping 0.0089 cls_loss_causal 0.5637 re_mapping 0.0078 re_causal 0.0227 /// teacc 98.88 lr 0.00010000 Epoch 102, weight, value: tensor([[-0.1293, -0.1255, 0.0745, ..., -0.0523, 0.0368, 0.0246], [-0.0698, -0.0251, -0.0397, ..., -0.0577, -0.0722, -0.0367], [ 0.0361, -0.0569, -0.0339, ..., -0.0423, 0.0057, -0.2128], ..., [-0.0878, 0.0847, 0.0047, ..., 0.0869, -0.0253, -0.0869], [-0.0454, -0.0541, 0.0847, ..., -0.0439, -0.1255, 0.0910], [-0.0011, -0.1309, 0.0790, ..., 0.0539, -0.1376, -0.0590]], device='cuda:0'), grad: tensor([[-2.6617e-06, 1.8580e-07, -6.1467e-06, ..., 2.1397e-07, 3.9022e-07, -1.8954e-05], [ 4.9826e-07, 1.7639e-06, 5.5553e-07, ..., 5.4808e-07, -2.7986e-07, 4.2804e-06], [ 1.4622e-06, 9.7692e-05, 2.0087e-05, ..., 1.0079e-04, -4.2818e-07, -4.0643e-06], ..., [ 8.4657e-07, -1.0109e-04, -1.7703e-05, ..., -1.0222e-04, 2.1607e-07, 6.6943e-06], [ 1.2163e-06, 2.4238e-07, 2.0787e-06, ..., 1.4603e-06, 2.7195e-07, 5.5283e-06], [ 5.7369e-06, 1.8142e-06, -2.7250e-06, ..., -1.8254e-06, 3.0035e-08, 5.3234e-06]], device='cuda:0') Epoch 102, bias, value: tensor([-0.0153, -0.0167, -0.0249, -0.0256, -0.0088, 0.0027, 0.0087, -0.0208, -0.0058, 0.0034], device='cuda:0'), grad: tensor([-3.1769e-05, 3.6716e-05, 1.6963e-04, 8.7693e-06, 2.0135e-06, 1.1154e-05, -7.6666e-06, -2.1636e-04, 1.5870e-05, 1.1511e-05], device='cuda:0') 100 0.0001 changing lr epoch 101, time 220.33, cls_loss 0.0028 cls_loss_mapping 0.0085 cls_loss_causal 0.5693 re_mapping 0.0075 re_causal 0.0222 /// teacc 98.86 lr 0.00010000 Epoch 103, weight, value: tensor([[-0.1311, -0.1249, 0.0747, ..., -0.0527, 0.0361, 0.0244], [-0.0710, -0.0256, -0.0403, ..., -0.0587, -0.0727, -0.0365], [ 0.0356, -0.0576, -0.0343, ..., -0.0428, 0.0057, -0.2135], ..., [-0.0900, 0.0850, 0.0034, ..., 0.0870, -0.0246, -0.0897], [-0.0463, -0.0533, 0.0864, ..., -0.0425, -0.1254, 0.0912], [-0.0005, -0.1321, 0.0796, ..., 0.0542, -0.1385, -0.0594]], device='cuda:0'), grad: tensor([[ 2.1383e-06, 1.0803e-06, 3.4254e-06, ..., 3.3919e-06, 4.8894e-08, 2.6003e-06], [ 1.0189e-06, 3.1926e-06, 2.3786e-06, ..., 4.2766e-06, -3.2596e-08, -6.7689e-06], [ 8.5542e-07, 3.0156e-06, 2.9523e-06, ..., 3.3360e-06, 4.8894e-08, 3.6284e-06], ..., [ 2.0433e-06, -2.1309e-05, -7.4394e-06, ..., -1.8463e-05, 1.1409e-08, -1.6075e-06], [ 1.4767e-05, 2.1998e-06, 2.7686e-05, ..., 1.8761e-05, 1.0012e-08, 9.0301e-06], [ 1.3137e-04, 7.9051e-06, -5.6624e-05, ..., 3.9244e-04, 6.0536e-09, -1.8954e-05]], device='cuda:0') Epoch 103, bias, value: tensor([-0.0157, -0.0170, -0.0249, -0.0258, -0.0087, 0.0022, 0.0103, -0.0218, -0.0054, 0.0037], device='cuda:0'), grad: tensor([ 1.2659e-05, -1.2286e-05, 1.5959e-05, 3.8296e-05, -9.6655e-04, 7.0706e-06, 1.5870e-06, -2.8253e-05, 6.3777e-05, 8.6880e-04], device='cuda:0') 100 0.0001 changing lr epoch 102, time 220.72, cls_loss 0.0025 cls_loss_mapping 0.0070 cls_loss_causal 0.5962 re_mapping 0.0073 re_causal 0.0225 /// teacc 98.85 lr 0.00010000 Epoch 104, weight, value: tensor([[-0.1334, -0.1256, 0.0752, ..., -0.0538, 0.0365, 0.0242], [-0.0717, -0.0259, -0.0407, ..., -0.0593, -0.0728, -0.0363], [ 0.0345, -0.0584, -0.0349, ..., -0.0435, 0.0059, -0.2148], ..., [-0.0906, 0.0859, 0.0034, ..., 0.0876, -0.0247, -0.0902], [-0.0478, -0.0538, 0.0864, ..., -0.0428, -0.1254, 0.0911], [-0.0004, -0.1333, 0.0804, ..., 0.0541, -0.1389, -0.0594]], device='cuda:0'), grad: tensor([[ 5.4110e-07, 3.0305e-06, -2.1188e-08, ..., 3.2485e-06, -1.1222e-07, 5.2247e-07], [ 5.7369e-07, 6.5304e-06, 6.6347e-06, ..., 7.2159e-06, 3.8417e-08, 2.3507e-06], [ 1.3807e-07, 1.0885e-05, 1.0714e-05, ..., 1.1757e-05, 6.5193e-08, 4.4256e-06], ..., [ 5.4762e-07, -6.7234e-05, -6.3300e-05, ..., -7.5340e-05, 1.3271e-08, -1.8835e-05], [ 7.0333e-06, 1.4901e-05, 1.1653e-05, ..., 1.2338e-05, 2.2352e-08, 1.0997e-05], [ 8.1817e-07, 3.3945e-05, 3.1859e-05, ..., 3.6329e-05, 7.7067e-08, 1.1928e-05]], device='cuda:0') Epoch 104, bias, value: tensor([-0.0162, -0.0171, -0.0252, -0.0261, -0.0082, 0.0033, 0.0102, -0.0218, -0.0059, 0.0036], device='cuda:0'), grad: tensor([ 5.8375e-06, 1.6510e-05, 3.0547e-05, -2.7884e-06, 1.5870e-05, -1.3076e-05, -1.9103e-05, -1.6904e-04, 4.6581e-05, 8.8573e-05], device='cuda:0') 100 0.0001 changing lr epoch 103, time 220.80, cls_loss 0.0027 cls_loss_mapping 0.0076 cls_loss_causal 0.5501 re_mapping 0.0077 re_causal 0.0210 /// teacc 98.92 lr 0.00010000 Epoch 105, weight, value: tensor([[-0.1337, -0.1268, 0.0750, ..., -0.0559, 0.0380, 0.0256], [-0.0717, -0.0271, -0.0412, ..., -0.0605, -0.0728, -0.0363], [ 0.0341, -0.0592, -0.0356, ..., -0.0445, 0.0059, -0.2156], ..., [-0.0909, 0.0876, 0.0033, ..., 0.0884, -0.0247, -0.0908], [-0.0484, -0.0546, 0.0866, ..., -0.0432, -0.1256, 0.0915], [-0.0003, -0.1346, 0.0818, ..., 0.0548, -0.1399, -0.0597]], device='cuda:0'), grad: tensor([[ 7.9582e-07, 1.3532e-06, -1.3387e-04, ..., 2.6990e-06, -9.0659e-05, -2.0897e-04], [ 9.0718e-05, 4.3698e-06, 4.3184e-05, ..., 7.6219e-06, 1.3886e-06, 3.5852e-05], [ 1.0908e-05, -1.4789e-05, 1.6734e-05, ..., -4.9114e-05, 6.0536e-06, 2.2054e-05], ..., [ 2.3358e-06, 7.6322e-07, 3.6359e-06, ..., 2.0489e-05, 2.5835e-06, 8.1137e-06], [ 1.9232e-07, 9.6206e-07, -9.3654e-06, ..., 1.7751e-06, 4.5337e-06, -3.4094e-05], [-1.4532e-04, 4.5113e-06, -6.1095e-05, ..., 4.9546e-06, 2.8592e-06, -3.9846e-05]], device='cuda:0') Epoch 105, bias, value: tensor([-0.0154, -0.0181, -0.0251, -0.0265, -0.0083, 0.0030, 0.0098, -0.0212, -0.0059, 0.0041], device='cuda:0'), grad: tensor([-5.8270e-04, 4.1127e-04, -1.3196e-04, 1.8656e-04, 1.1760e-04, 3.9339e-04, 2.8729e-05, 1.4770e-04, -2.7940e-05, -5.4264e-04], device='cuda:0') 100 0.0001 changing lr epoch 104, time 220.28, cls_loss 0.0037 cls_loss_mapping 0.0084 cls_loss_causal 0.5590 re_mapping 0.0073 re_causal 0.0208 /// teacc 98.98 lr 0.00010000 Epoch 106, weight, value: tensor([[-1.3398e-01, -1.2755e-01, 7.5774e-02, ..., -5.6171e-02, 3.8864e-02, 2.6090e-02], [-7.2445e-02, -2.8048e-02, -4.2173e-02, ..., -5.8669e-02, -7.2861e-02, -3.6340e-02], [ 3.3669e-02, -6.1068e-02, -3.6191e-02, ..., -4.6640e-02, 5.8702e-03, -2.1739e-01], ..., [-9.1139e-02, 8.9187e-02, 3.6625e-03, ..., 8.8551e-02, -2.4946e-02, -9.1034e-02], [-4.9131e-02, -5.5057e-02, 8.6511e-02, ..., -4.3750e-02, -1.2588e-01, 9.2437e-02], [-8.9680e-06, -1.3590e-01, 8.2141e-02, ..., 5.4388e-02, -1.4077e-01, -6.0066e-02]], device='cuda:0'), grad: tensor([[ 6.5658e-08, 1.3877e-06, 1.0636e-06, ..., 1.4761e-06, 2.0210e-07, 1.6531e-06], [ 1.8859e-08, 7.0259e-06, 4.1611e-06, ..., 5.5432e-06, -9.1828e-07, -1.7434e-05], [ 2.0722e-08, 9.1493e-06, 4.2319e-06, ..., 3.5800e-06, -1.1390e-06, 2.1253e-06], ..., [ 1.1339e-07, -5.7131e-05, -2.9117e-05, ..., -3.2395e-05, 2.3656e-07, -9.5740e-07], [ 4.1421e-07, 1.9073e-06, 3.7551e-05, ..., 2.1830e-05, 8.2981e-07, 2.6926e-05], [-5.0664e-07, 1.1228e-05, -6.6161e-05, ..., -2.3931e-05, 4.8894e-08, -3.4034e-05]], device='cuda:0') Epoch 106, bias, value: tensor([-0.0150, -0.0183, -0.0267, -0.0272, -0.0077, 0.0031, 0.0101, -0.0203, -0.0051, 0.0028], device='cuda:0'), grad: tensor([ 8.2701e-06, -1.7747e-05, 1.1280e-05, 4.1336e-05, 2.5123e-05, 1.1891e-05, 9.8161e-07, -9.0599e-05, 7.8738e-05, -6.9439e-05], device='cuda:0') 100 0.0001 changing lr epoch 105, time 220.72, cls_loss 0.0020 cls_loss_mapping 0.0057 cls_loss_causal 0.5607 re_mapping 0.0075 re_causal 0.0216 /// teacc 98.87 lr 0.00010000 Epoch 107, weight, value: tensor([[-0.1342, -0.1283, 0.0762, ..., -0.0564, 0.0403, 0.0264], [-0.0730, -0.0282, -0.0425, ..., -0.0589, -0.0733, -0.0364], [ 0.0334, -0.0617, -0.0366, ..., -0.0471, 0.0060, -0.2175], ..., [-0.0920, 0.0896, 0.0037, ..., 0.0887, -0.0250, -0.0915], [-0.0499, -0.0557, 0.0864, ..., -0.0443, -0.1260, 0.0924], [ 0.0003, -0.1364, 0.0830, ..., 0.0549, -0.1414, -0.0601]], device='cuda:0'), grad: tensor([[ 1.9986e-06, 6.0303e-08, 6.9616e-07, ..., 7.0874e-07, 2.3283e-10, 3.3416e-06], [ 1.1995e-06, 1.5181e-07, 1.0720e-06, ..., 1.7146e-06, 0.0000e+00, -2.2296e-06], [ 1.1260e-06, -8.6380e-07, 5.5879e-07, ..., 5.5134e-07, 0.0000e+00, 2.3227e-06], ..., [ 6.8471e-06, 5.2061e-07, 3.8110e-06, ..., 5.4911e-06, 0.0000e+00, 6.4075e-06], [-2.1644e-06, 1.2643e-07, -2.0146e-05, ..., 1.5777e-06, 2.3283e-10, -5.7995e-05], [-2.9225e-06, 1.5553e-07, -1.6838e-05, ..., 1.8239e-05, 0.0000e+00, 3.4291e-06]], device='cuda:0') Epoch 107, bias, value: tensor([-0.0148, -0.0186, -0.0261, -0.0268, -0.0080, 0.0032, 0.0101, -0.0203, -0.0059, 0.0031], device='cuda:0'), grad: tensor([ 1.0282e-05, -1.7080e-06, -7.6443e-06, 2.3961e-05, -8.6904e-05, 2.2590e-05, 2.9862e-05, 3.4809e-05, -7.7605e-05, 5.2303e-05], device='cuda:0') 100 0.0001 changing lr epoch 106, time 220.60, cls_loss 0.0024 cls_loss_mapping 0.0074 cls_loss_causal 0.5928 re_mapping 0.0072 re_causal 0.0220 /// teacc 98.80 lr 0.00010000 Epoch 108, weight, value: tensor([[-0.1346, -0.1290, 0.0763, ..., -0.0567, 0.0405, 0.0262], [-0.0735, -0.0275, -0.0426, ..., -0.0593, -0.0739, -0.0354], [ 0.0331, -0.0622, -0.0370, ..., -0.0478, 0.0061, -0.2185], ..., [-0.0927, 0.0894, 0.0037, ..., 0.0892, -0.0239, -0.0917], [-0.0510, -0.0565, 0.0866, ..., -0.0453, -0.1268, 0.0923], [ 0.0005, -0.1371, 0.0836, ..., 0.0554, -0.1416, -0.0600]], device='cuda:0'), grad: tensor([[ 1.6456e-06, 2.0210e-06, 6.1393e-05, ..., 1.6287e-05, 2.3283e-10, 5.7518e-05], [ 2.8824e-07, 1.3374e-06, 4.1351e-06, ..., 1.6978e-06, 0.0000e+00, -4.1239e-06], [ 6.0350e-07, 2.7977e-06, 2.0698e-05, ..., 5.6438e-06, -9.3132e-09, 1.8343e-05], ..., [-6.2957e-06, -5.0634e-05, 6.9151e-07, ..., -7.2062e-05, 6.9849e-10, 1.8492e-05], [-2.2754e-05, 1.4612e-06, -1.4019e-03, ..., -3.4213e-04, 2.3283e-09, -1.2980e-03], [ 2.2531e-05, 2.6807e-05, 1.2207e-03, ..., 3.0398e-04, 0.0000e+00, 1.1215e-03]], device='cuda:0') Epoch 108, bias, value: tensor([-0.0152, -0.0177, -0.0265, -0.0264, -0.0084, 0.0025, 0.0112, -0.0207, -0.0064, 0.0035], device='cuda:0'), grad: tensor([ 1.2338e-04, -9.8422e-06, 4.9233e-05, 1.4186e-04, 1.0002e-04, 2.8327e-05, 9.6858e-06, -1.0961e-04, -2.7370e-03, 2.4071e-03], device='cuda:0') 100 0.0001 changing lr epoch 107, time 220.69, cls_loss 0.0026 cls_loss_mapping 0.0068 cls_loss_causal 0.5484 re_mapping 0.0077 re_causal 0.0215 /// teacc 99.01 lr 0.00010000 Epoch 109, weight, value: tensor([[-0.1346, -0.1293, 0.0764, ..., -0.0571, 0.0406, 0.0260], [-0.0739, -0.0285, -0.0431, ..., -0.0596, -0.0741, -0.0356], [ 0.0329, -0.0635, -0.0377, ..., -0.0483, 0.0061, -0.2195], ..., [-0.0933, 0.0909, 0.0029, ..., 0.0892, -0.0237, -0.0925], [-0.0529, -0.0569, 0.0876, ..., -0.0451, -0.1269, 0.0930], [ 0.0005, -0.1374, 0.0844, ..., 0.0559, -0.1417, -0.0613]], device='cuda:0'), grad: tensor([[ 8.1491e-09, 6.1560e-07, -6.1980e-07, ..., 6.4541e-07, 6.9849e-10, -1.1013e-07], [ 9.5461e-09, 2.1290e-06, 1.1548e-06, ..., 2.7679e-06, 1.1642e-09, -2.9579e-06], [ 4.6566e-09, 1.8552e-06, 1.7704e-06, ..., 3.7509e-07, -5.2387e-08, 2.1439e-06], ..., [ 3.0268e-08, -2.1175e-05, -7.1786e-06, ..., -2.0087e-05, 5.1223e-09, 2.1420e-06], [ 3.0734e-08, 1.0766e-06, -3.0510e-06, ..., 2.5285e-07, 4.0745e-08, -5.3346e-06], [-3.2550e-07, 8.1211e-06, 4.2580e-06, ..., 7.8529e-06, 2.3283e-10, 2.9337e-06]], device='cuda:0') Epoch 109, bias, value: tensor([-0.0159, -0.0186, -0.0269, -0.0274, -0.0087, 0.0029, 0.0126, -0.0204, -0.0059, 0.0039], device='cuda:0'), grad: tensor([ 2.3209e-06, -6.3963e-06, -2.0459e-05, 5.4315e-06, 1.3359e-05, 6.4149e-06, -3.6433e-06, -1.4745e-05, -3.9041e-06, 2.1636e-05], device='cuda:0') 100 0.0001 changing lr epoch 108, time 220.86, cls_loss 0.0025 cls_loss_mapping 0.0077 cls_loss_causal 0.5597 re_mapping 0.0070 re_causal 0.0205 /// teacc 98.94 lr 0.00010000 Epoch 110, weight, value: tensor([[-0.1347, -0.1303, 0.0768, ..., -0.0574, 0.0406, 0.0260], [-0.0742, -0.0280, -0.0432, ..., -0.0591, -0.0741, -0.0346], [ 0.0324, -0.0644, -0.0384, ..., -0.0491, 0.0063, -0.2210], ..., [-0.0937, 0.0912, 0.0028, ..., 0.0893, -0.0238, -0.0941], [-0.0532, -0.0573, 0.0878, ..., -0.0456, -0.1269, 0.0944], [ 0.0009, -0.1383, 0.0849, ..., 0.0562, -0.1418, -0.0617]], device='cuda:0'), grad: tensor([[ 8.1062e-06, 3.0897e-07, 4.7013e-06, ..., 6.7335e-07, 1.0105e-07, 1.6969e-06], [ 4.8988e-06, 1.8813e-07, 3.2336e-06, ..., 8.8057e-07, 2.1188e-08, -5.3737e-07], [ 6.3121e-05, -5.3272e-07, 3.8564e-05, ..., 6.5286e-07, -1.6037e-06, 1.0550e-05], ..., [ 2.9467e-06, -2.1197e-06, 8.5495e-07, ..., -2.4699e-06, 3.9837e-07, 1.0515e-06], [ 9.8944e-06, 3.3062e-07, 5.8115e-06, ..., 1.3513e-06, 2.9593e-07, 1.0338e-06], [-1.2884e-03, 1.0692e-06, -4.4823e-04, ..., -1.4181e-03, 3.2829e-08, -5.7173e-04]], device='cuda:0') Epoch 110, bias, value: tensor([-0.0160, -0.0178, -0.0272, -0.0278, -0.0090, 0.0025, 0.0125, -0.0209, -0.0049, 0.0039], device='cuda:0'), grad: tensor([ 2.9787e-05, 1.3575e-05, 2.2233e-04, 1.7416e-04, 4.1466e-03, 7.6652e-05, 5.7936e-05, 8.9854e-06, 3.5733e-05, -4.7684e-03], device='cuda:0') 100 0.0001 changing lr epoch 109, time 220.86, cls_loss 0.0023 cls_loss_mapping 0.0071 cls_loss_causal 0.5800 re_mapping 0.0070 re_causal 0.0206 /// teacc 98.85 lr 0.00010000 Epoch 111, weight, value: tensor([[-0.1357, -0.1313, 0.0767, ..., -0.0580, 0.0407, 0.0260], [-0.0762, -0.0277, -0.0437, ..., -0.0597, -0.0743, -0.0348], [ 0.0336, -0.0649, -0.0389, ..., -0.0496, 0.0065, -0.2220], ..., [-0.0945, 0.0911, 0.0030, ..., 0.0901, -0.0239, -0.0946], [-0.0540, -0.0576, 0.0883, ..., -0.0459, -0.1270, 0.0950], [ 0.0012, -0.1395, 0.0853, ..., 0.0561, -0.1421, -0.0618]], device='cuda:0'), grad: tensor([[ 4.2724e-07, 3.2526e-07, -1.4789e-06, ..., 1.7565e-06, -4.0978e-08, -1.4296e-06], [ 5.8766e-07, 1.4305e-06, 1.6503e-06, ..., 3.5204e-06, 3.2596e-09, 3.0152e-07], [ 3.3760e-07, 1.2526e-06, 2.8498e-06, ..., 2.2165e-06, 4.8894e-09, 2.0619e-06], ..., [ 4.7311e-06, -6.1877e-06, 7.5474e-06, ..., 9.6485e-06, 5.1223e-09, 6.4820e-07], [ 2.2799e-06, 2.9802e-08, 3.5875e-06, ..., 8.0690e-06, 3.2596e-09, -1.6149e-06], [-4.4107e-05, 1.6075e-06, -8.7619e-05, ..., -1.4758e-04, 3.4925e-09, 3.7742e-07]], device='cuda:0') Epoch 111, bias, value: tensor([-0.0160, -0.0174, -0.0270, -0.0279, -0.0086, 0.0030, 0.0122, -0.0214, -0.0049, 0.0037], device='cuda:0'), grad: tensor([-3.0501e-08, 8.5384e-06, 1.9316e-06, 2.0146e-05, 2.6441e-04, 5.4091e-06, -3.1609e-06, 3.5763e-05, 1.7524e-05, -3.5095e-04], device='cuda:0') 100 0.0001 changing lr epoch 110, time 220.83, cls_loss 0.0026 cls_loss_mapping 0.0070 cls_loss_causal 0.5335 re_mapping 0.0072 re_causal 0.0197 /// teacc 98.94 lr 0.00010000 Epoch 112, weight, value: tensor([[-0.1359, -0.1307, 0.0757, ..., -0.0606, 0.0407, 0.0270], [-0.0766, -0.0280, -0.0443, ..., -0.0603, -0.0750, -0.0350], [ 0.0332, -0.0656, -0.0401, ..., -0.0500, 0.0072, -0.2230], ..., [-0.0952, 0.0918, 0.0030, ..., 0.0906, -0.0240, -0.0951], [-0.0551, -0.0583, 0.0888, ..., -0.0462, -0.1271, 0.0955], [ 0.0018, -0.1416, 0.0869, ..., 0.0569, -0.1422, -0.0625]], device='cuda:0'), grad: tensor([[ 1.2908e-06, 1.2224e-07, -7.8678e-06, ..., 7.6951e-08, -2.1153e-07, -5.8375e-06], [ 1.5628e-06, 3.3434e-07, 6.4000e-06, ..., 2.2934e-07, 1.2002e-07, 3.2634e-06], [ 8.7991e-06, -5.8971e-06, 2.4393e-05, ..., 1.8964e-07, -2.3097e-06, 7.7933e-06], ..., [ 9.1316e-07, 1.1399e-06, 2.5537e-06, ..., -6.1560e-07, 6.6031e-07, 1.4147e-06], [-8.7261e-05, 7.2643e-07, -2.5654e-04, ..., 1.0477e-07, 4.4843e-07, -4.8667e-05], [ 4.8637e-05, 8.5030e-07, 1.2082e-04, ..., -3.7858e-07, 3.4226e-08, 3.7223e-05]], device='cuda:0') Epoch 112, bias, value: tensor([-0.0166, -0.0175, -0.0273, -0.0276, -0.0084, 0.0030, 0.0114, -0.0214, -0.0048, 0.0043], device='cuda:0'), grad: tensor([-6.2734e-06, 1.5676e-05, 4.5806e-05, 1.8620e-04, -3.5972e-08, 2.8148e-05, -1.3344e-05, 1.1258e-05, -5.5408e-04, 2.8658e-04], device='cuda:0') 100 0.0001 changing lr epoch 111, time 220.65, cls_loss 0.0020 cls_loss_mapping 0.0050 cls_loss_causal 0.5534 re_mapping 0.0067 re_causal 0.0204 /// teacc 98.93 lr 0.00010000 Epoch 113, weight, value: tensor([[-0.1359, -0.1312, 0.0761, ..., -0.0607, 0.0427, 0.0277], [-0.0773, -0.0287, -0.0453, ..., -0.0605, -0.0752, -0.0350], [ 0.0323, -0.0662, -0.0405, ..., -0.0504, 0.0074, -0.2239], ..., [-0.0957, 0.0922, 0.0031, ..., 0.0909, -0.0242, -0.0958], [-0.0564, -0.0583, 0.0897, ..., -0.0462, -0.1273, 0.0956], [ 0.0011, -0.1427, 0.0869, ..., 0.0571, -0.1432, -0.0637]], device='cuda:0'), grad: tensor([[ 5.3644e-07, 1.1770e-07, 3.7774e-06, ..., 3.4962e-06, -6.4145e-08, 4.2492e-07], [ 8.7544e-08, 5.8580e-07, 7.6462e-07, ..., 6.2305e-07, 1.9989e-07, -1.0328e-06], [ 7.9861e-08, -5.6997e-07, 2.7958e-06, ..., 1.1884e-06, -9.5461e-07, 2.2706e-06], ..., [ 1.3039e-06, 3.8510e-07, 1.7434e-05, ..., 1.3441e-05, 5.9232e-07, 2.4159e-06], [ 7.7765e-07, -3.1013e-07, -5.9456e-06, ..., 4.4284e-07, 6.8219e-08, -9.3505e-06], [ 3.9116e-06, 1.1902e-06, -3.4809e-05, ..., -2.7657e-05, 1.1874e-08, 1.4432e-05]], device='cuda:0') Epoch 113, bias, value: tensor([-0.0161, -0.0181, -0.0275, -0.0274, -0.0082, 0.0039, 0.0109, -0.0212, -0.0047, 0.0038], device='cuda:0'), grad: tensor([ 8.9407e-06, -1.6391e-06, -1.6838e-06, 1.2994e-05, 1.1757e-05, -1.7166e-05, 5.1484e-06, 4.2319e-05, -1.0118e-05, -5.0694e-05], device='cuda:0') 100 0.0001 changing lr epoch 112, time 221.31, cls_loss 0.0021 cls_loss_mapping 0.0055 cls_loss_causal 0.5629 re_mapping 0.0070 re_causal 0.0204 /// teacc 98.92 lr 0.00010000 Epoch 114, weight, value: tensor([[-0.1365, -0.1316, 0.0765, ..., -0.0609, 0.0428, 0.0280], [-0.0777, -0.0284, -0.0454, ..., -0.0611, -0.0754, -0.0350], [ 0.0319, -0.0667, -0.0411, ..., -0.0509, 0.0079, -0.2247], ..., [-0.0960, 0.0923, 0.0028, ..., 0.0915, -0.0243, -0.0962], [-0.0574, -0.0587, 0.0904, ..., -0.0463, -0.1277, 0.0960], [ 0.0013, -0.1440, 0.0870, ..., 0.0568, -0.1434, -0.0644]], device='cuda:0'), grad: tensor([[-8.5915e-08, 1.7113e-07, -1.6140e-06, ..., 4.8243e-07, 6.8266e-07, 1.1269e-07], [ 2.5611e-08, 7.7765e-08, 1.2601e-06, ..., 4.7544e-07, 2.2687e-06, 5.2117e-06], [ 3.8650e-08, -8.3074e-07, 1.6242e-06, ..., 7.9814e-07, 1.6531e-07, 2.0098e-06], ..., [ 1.6904e-07, -3.9372e-07, 2.7586e-06, ..., 1.0459e-06, 4.5169e-08, 2.2575e-06], [ 3.0850e-07, -1.0384e-07, -9.1866e-06, ..., -2.1607e-06, 8.9873e-08, -8.7991e-06], [ 1.9395e-07, 4.7125e-07, 1.1898e-07, ..., 2.1257e-07, 3.2829e-08, 2.6207e-06]], device='cuda:0') Epoch 114, bias, value: tensor([-0.0159, -0.0183, -0.0270, -0.0272, -0.0078, 0.0038, 0.0110, -0.0215, -0.0045, 0.0035], device='cuda:0'), grad: tensor([ 5.3123e-06, -4.6998e-05, -7.7486e-06, 4.7460e-06, 3.2663e-05, 4.6976e-06, -2.4855e-05, 1.8731e-05, 3.3788e-06, 9.9838e-06], device='cuda:0') 100 0.0001 changing lr epoch 113, time 220.79, cls_loss 0.0021 cls_loss_mapping 0.0060 cls_loss_causal 0.5457 re_mapping 0.0067 re_causal 0.0196 /// teacc 99.00 lr 0.00010000 Epoch 115, weight, value: tensor([[-0.1368, -0.1324, 0.0763, ..., -0.0614, 0.0427, 0.0278], [-0.0783, -0.0285, -0.0455, ..., -0.0614, -0.0757, -0.0352], [ 0.0324, -0.0673, -0.0417, ..., -0.0516, 0.0091, -0.2256], ..., [-0.0964, 0.0925, 0.0024, ..., 0.0919, -0.0244, -0.0976], [-0.0585, -0.0587, 0.0906, ..., -0.0472, -0.1289, 0.0959], [ 0.0012, -0.1448, 0.0881, ..., 0.0573, -0.1437, -0.0639]], device='cuda:0'), grad: tensor([[ 6.8359e-06, -6.5705e-07, 2.6543e-08, ..., 1.3644e-06, -3.1665e-08, 3.3919e-06], [ 2.6124e-07, 5.4343e-07, 2.3637e-06, ..., 1.3094e-06, 2.8405e-08, 9.5367e-06], [-1.3493e-05, 1.2433e-06, 1.6525e-05, ..., 7.2196e-06, -1.5199e-06, 1.5914e-05], ..., [ 5.0059e-07, -5.2080e-06, 8.2701e-07, ..., -9.6112e-07, 3.3481e-07, 2.5183e-06], [ 5.9009e-06, 1.6540e-06, -1.4448e-04, ..., -6.7532e-05, 2.2165e-07, -2.4486e-04], [ 1.0416e-05, 7.7160e-07, 9.3699e-05, ..., 6.0827e-05, 5.9139e-08, 1.0049e-04]], device='cuda:0') Epoch 115, bias, value: tensor([-0.0164, -0.0183, -0.0268, -0.0272, -0.0079, 0.0042, 0.0114, -0.0217, -0.0053, 0.0040], device='cuda:0'), grad: tensor([ 5.4002e-05, 1.7673e-05, -8.1062e-05, 9.0361e-05, -2.9892e-05, 6.5923e-05, 4.0889e-05, 4.5970e-06, -4.2152e-04, 2.5916e-04], device='cuda:0') 100 0.0001 changing lr epoch 114, time 220.71, cls_loss 0.0017 cls_loss_mapping 0.0052 cls_loss_causal 0.5714 re_mapping 0.0067 re_causal 0.0207 /// teacc 98.98 lr 0.00010000 Epoch 116, weight, value: tensor([[-0.1372, -0.1338, 0.0762, ..., -0.0618, 0.0426, 0.0279], [-0.0785, -0.0290, -0.0458, ..., -0.0617, -0.0761, -0.0352], [ 0.0323, -0.0676, -0.0418, ..., -0.0514, 0.0094, -0.2261], ..., [-0.0967, 0.0932, 0.0020, ..., 0.0920, -0.0245, -0.0982], [-0.0591, -0.0589, 0.0911, ..., -0.0475, -0.1292, 0.0962], [ 0.0013, -0.1453, 0.0887, ..., 0.0577, -0.1440, -0.0642]], device='cuda:0'), grad: tensor([[ 5.3877e-07, 2.8331e-06, 3.9348e-07, ..., 2.5406e-06, 1.1176e-07, -6.9384e-08], [ 2.0117e-07, 1.1876e-05, 5.9754e-06, ..., 6.5751e-06, 1.3039e-08, -6.0815e-07], [ 1.7369e-07, 1.1131e-05, 5.5879e-06, ..., 6.6087e-06, -1.3281e-06, 1.1856e-06], ..., [ 1.0058e-07, -7.0274e-05, -3.1501e-05, ..., -3.6180e-05, 1.5367e-07, 2.6636e-07], [ 2.2668e-06, 1.7956e-05, 1.0803e-05, ..., 2.8647e-06, 5.1083e-07, 8.7917e-07], [-1.9185e-06, 1.2331e-05, 7.8930e-07, ..., 9.5293e-06, 6.1002e-08, -8.8476e-08]], device='cuda:0') Epoch 116, bias, value: tensor([-0.0169, -0.0185, -0.0265, -0.0272, -0.0083, 0.0042, 0.0115, -0.0215, -0.0053, 0.0042], device='cuda:0'), grad: tensor([ 1.0833e-05, 2.0668e-05, 2.9132e-06, 1.8612e-05, -3.7163e-05, 1.4529e-06, 2.0042e-05, -1.1253e-04, 5.0515e-05, 2.4676e-05], device='cuda:0') 100 0.0001 changing lr epoch 115, time 220.72, cls_loss 0.0020 cls_loss_mapping 0.0063 cls_loss_causal 0.5355 re_mapping 0.0072 re_causal 0.0199 /// teacc 98.99 lr 0.00010000 Epoch 117, weight, value: tensor([[-0.1375, -0.1366, 0.0763, ..., -0.0620, 0.0425, 0.0279], [-0.0792, -0.0300, -0.0462, ..., -0.0621, -0.0765, -0.0351], [ 0.0321, -0.0683, -0.0421, ..., -0.0521, 0.0107, -0.2265], ..., [-0.0965, 0.0960, 0.0023, ..., 0.0931, -0.0238, -0.0986], [-0.0605, -0.0608, 0.0912, ..., -0.0478, -0.1309, 0.0964], [ 0.0014, -0.1474, 0.0893, ..., 0.0580, -0.1442, -0.0644]], device='cuda:0'), grad: tensor([[ 2.1420e-08, 9.2667e-08, -1.6764e-06, ..., 8.3353e-08, 4.1910e-09, -2.2016e-06], [ 1.0245e-08, -8.2403e-06, 3.0827e-07, ..., 1.2666e-07, -2.3283e-09, 1.0524e-07], [ 2.5146e-08, 3.6974e-07, 2.4866e-07, ..., 2.0955e-07, 1.3970e-08, 3.5390e-07], ..., [ 5.1688e-08, -9.3644e-07, -8.4285e-08, ..., -1.2182e-06, 1.7695e-08, 4.0140e-07], [ 5.7789e-07, 1.2573e-07, 2.1327e-07, ..., 1.2899e-07, 2.2817e-08, 1.7863e-06], [-4.7171e-07, 6.8638e-07, -1.4678e-06, ..., -1.0058e-06, 7.9162e-09, 4.2189e-07]], device='cuda:0') Epoch 117, bias, value: tensor([-0.0177, -0.0191, -0.0266, -0.0278, -0.0091, 0.0040, 0.0114, -0.0195, -0.0057, 0.0042], device='cuda:0'), grad: tensor([-4.7013e-06, -4.8488e-05, 1.2554e-06, 7.5735e-06, 4.7415e-05, -1.4365e-05, 7.4059e-06, 1.3374e-06, 3.2037e-06, -6.5146e-07], device='cuda:0') 100 0.0001 changing lr epoch 116, time 220.67, cls_loss 0.0024 cls_loss_mapping 0.0073 cls_loss_causal 0.5962 re_mapping 0.0067 re_causal 0.0200 /// teacc 98.98 lr 0.00010000 Epoch 118, weight, value: tensor([[-0.1378, -0.1368, 0.0759, ..., -0.0634, 0.0424, 0.0282], [-0.0799, -0.0303, -0.0468, ..., -0.0624, -0.0749, -0.0341], [ 0.0321, -0.0684, -0.0429, ..., -0.0526, 0.0107, -0.2282], ..., [-0.0969, 0.0965, 0.0024, ..., 0.0936, -0.0240, -0.0994], [-0.0614, -0.0613, 0.0919, ..., -0.0482, -0.1312, 0.0970], [ 0.0014, -0.1480, 0.0901, ..., 0.0584, -0.1447, -0.0651]], device='cuda:0'), grad: tensor([[ 4.6380e-07, 2.8173e-07, -1.3039e-08, ..., 6.5193e-07, 4.6566e-10, 1.6866e-06], [ 9.4110e-07, 1.3281e-06, 1.1884e-05, ..., 1.3215e-06, 4.6566e-10, 2.7642e-05], [ 4.6287e-07, 1.3541e-06, 1.0610e-04, ..., 8.7125e-07, -8.3819e-09, 2.7204e-04], ..., [ 1.6615e-06, -1.4901e-07, 4.6268e-06, ..., 2.8871e-06, 2.7940e-09, 3.4459e-06], [ 1.7229e-06, 8.0978e-07, -1.2279e-04, ..., 2.0359e-06, 2.3283e-09, -3.2043e-04], [-6.6400e-05, 3.2969e-07, -4.1336e-05, ..., -5.0604e-05, 0.0000e+00, -3.7737e-06]], device='cuda:0') Epoch 118, bias, value: tensor([-0.0178, -0.0190, -0.0267, -0.0283, -0.0090, 0.0042, 0.0107, -0.0195, -0.0055, 0.0043], device='cuda:0'), grad: tensor([ 7.1339e-06, 8.3745e-05, 6.9618e-04, 1.7583e-05, 1.2875e-04, 3.2842e-05, 1.0893e-05, 2.2724e-05, -8.2970e-04, -1.7047e-04], device='cuda:0') 100 0.0001 changing lr epoch 117, time 220.75, cls_loss 0.0022 cls_loss_mapping 0.0058 cls_loss_causal 0.5815 re_mapping 0.0065 re_causal 0.0194 /// teacc 98.94 lr 0.00010000 Epoch 119, weight, value: tensor([[-0.1382, -0.1370, 0.0761, ..., -0.0636, 0.0433, 0.0283], [-0.0800, -0.0303, -0.0472, ..., -0.0628, -0.0753, -0.0346], [ 0.0316, -0.0688, -0.0434, ..., -0.0530, 0.0111, -0.2306], ..., [-0.0976, 0.0968, 0.0022, ..., 0.0939, -0.0238, -0.0999], [-0.0618, -0.0616, 0.0926, ..., -0.0488, -0.1315, 0.0984], [ 0.0020, -0.1488, 0.0907, ..., 0.0588, -0.1461, -0.0659]], device='cuda:0'), grad: tensor([[ 1.5926e-07, 1.5050e-06, -3.4692e-07, ..., 5.8860e-07, 1.7136e-07, -9.9931e-07], [ 1.1921e-07, 5.3048e-06, 1.7900e-06, ..., 1.1967e-06, 5.2154e-08, 4.1910e-09], [ 4.0000e-07, 2.9728e-06, 1.0747e-06, ..., -6.9803e-07, -7.7719e-07, 1.8813e-07], ..., [ 8.3679e-07, -1.6436e-05, -5.4128e-06, ..., -1.7788e-06, 4.1444e-07, 3.4971e-07], [ 1.1586e-06, 1.2480e-06, 3.5437e-07, ..., 7.2736e-07, 6.3796e-08, 1.4976e-06], [ 2.8759e-06, 5.5358e-06, 2.0489e-08, ..., 4.6305e-06, 1.0245e-08, 8.4937e-07]], device='cuda:0') Epoch 119, bias, value: tensor([-0.0179, -0.0188, -0.0272, -0.0286, -0.0090, 0.0042, 0.0107, -0.0198, -0.0042, 0.0043], device='cuda:0'), grad: tensor([ 1.0423e-05, 1.0565e-05, -2.4550e-06, -1.2163e-06, -1.6376e-05, -3.2149e-06, -1.1325e-05, -1.9982e-05, 7.5586e-06, 2.6077e-05], device='cuda:0') 100 0.0001 changing lr epoch 118, time 221.05, cls_loss 0.0028 cls_loss_mapping 0.0061 cls_loss_causal 0.5739 re_mapping 0.0072 re_causal 0.0197 /// teacc 99.02 lr 0.00010000 Epoch 120, weight, value: tensor([[-0.1392, -0.1375, 0.0762, ..., -0.0636, 0.0431, 0.0279], [-0.0807, -0.0303, -0.0475, ..., -0.0633, -0.0762, -0.0337], [ 0.0305, -0.0695, -0.0439, ..., -0.0535, 0.0119, -0.2326], ..., [-0.0984, 0.0972, 0.0022, ..., 0.0943, -0.0246, -0.1009], [-0.0622, -0.0622, 0.0938, ..., -0.0490, -0.1321, 0.0997], [ 0.0025, -0.1503, 0.0908, ..., 0.0586, -0.1477, -0.0672]], device='cuda:0'), grad: tensor([[ 2.7008e-07, 1.5292e-06, -5.0431e-07, ..., 2.3656e-07, 1.5320e-07, 6.3423e-07], [ 3.0221e-07, 1.6298e-06, 3.8045e-07, ..., 3.6554e-07, 1.6410e-06, 9.9279e-07], [ 5.5274e-07, 3.8221e-06, 2.9383e-07, ..., 2.1420e-07, 3.9581e-07, 1.3858e-06], ..., [ 8.4843e-07, 2.9728e-06, 1.7341e-06, ..., 8.1630e-07, 3.8138e-07, 1.6363e-06], [ 4.9509e-06, 2.7433e-05, 1.2470e-06, ..., 2.9225e-06, 1.0151e-07, -5.0098e-05], [-1.1511e-06, 3.2634e-06, -9.5293e-06, ..., -6.1691e-06, 6.0070e-07, -1.1642e-06]], device='cuda:0') Epoch 120, bias, value: tensor([-0.0182, -0.0184, -0.0277, -0.0287, -0.0085, 0.0037, 0.0113, -0.0199, -0.0034, 0.0036], device='cuda:0'), grad: tensor([ 5.1931e-06, 2.2113e-05, 1.0520e-05, -9.5749e-04, -4.3201e-04, 9.4175e-04, 4.4227e-04, 1.2778e-05, -4.0859e-05, -5.5581e-06], device='cuda:0') 100 0.0001 changing lr epoch 119, time 220.53, cls_loss 0.0027 cls_loss_mapping 0.0078 cls_loss_causal 0.5718 re_mapping 0.0068 re_causal 0.0192 /// teacc 98.86 lr 0.00010000 Epoch 121, weight, value: tensor([[-0.1391, -0.1383, 0.0761, ..., -0.0644, 0.0428, 0.0279], [-0.0816, -0.0308, -0.0482, ..., -0.0639, -0.0765, -0.0340], [ 0.0327, -0.0711, -0.0444, ..., -0.0541, 0.0118, -0.2337], ..., [-0.1003, 0.0976, 0.0022, ..., 0.0949, -0.0246, -0.1016], [-0.0637, -0.0640, 0.0938, ..., -0.0495, -0.1322, 0.1002], [ 0.0026, -0.1519, 0.0915, ..., 0.0590, -0.1481, -0.0673]], device='cuda:0'), grad: tensor([[-5.5879e-09, 7.7533e-07, -6.0583e-07, ..., 1.2433e-07, 0.0000e+00, -1.3132e-06], [ 4.1910e-09, 2.2706e-06, 2.5891e-07, ..., 2.2678e-07, 4.6566e-10, -1.0496e-06], [ 1.8626e-09, 3.4533e-06, 9.6299e-07, ..., 2.9653e-06, -2.1886e-08, 2.1979e-07], ..., [ 1.6764e-08, 1.6773e-04, 7.2062e-05, ..., -7.0371e-06, 7.9162e-09, 3.3155e-07], [ 1.7881e-07, 2.0321e-06, 9.8906e-07, ..., 4.4284e-07, 6.5193e-09, 4.4517e-07], [-6.8918e-08, 1.6969e-06, -1.1958e-06, ..., -1.0654e-06, 9.3132e-10, 2.1560e-07]], device='cuda:0') Epoch 121, bias, value: tensor([-0.0188, -0.0193, -0.0269, -0.0262, -0.0080, 0.0030, 0.0115, -0.0199, -0.0038, 0.0033], device='cuda:0'), grad: tensor([-1.8124e-06, 1.1519e-05, -1.7017e-05, -1.8764e-04, 2.1476e-06, 1.4631e-06, 2.1830e-06, 1.8442e-04, 4.8317e-06, -3.4506e-07], device='cuda:0') 100 0.0001 changing lr epoch 120, time 220.98, cls_loss 0.0032 cls_loss_mapping 0.0063 cls_loss_causal 0.5652 re_mapping 0.0070 re_causal 0.0196 /// teacc 99.01 lr 0.00010000 Epoch 122, weight, value: tensor([[-0.1396, -0.1387, 0.0763, ..., -0.0647, 0.0428, 0.0281], [-0.0798, -0.0312, -0.0481, ..., -0.0634, -0.0766, -0.0303], [ 0.0323, -0.0711, -0.0452, ..., -0.0554, 0.0120, -0.2358], ..., [-0.1007, 0.0988, 0.0023, ..., 0.0952, -0.0247, -0.1032], [-0.0648, -0.0660, 0.0944, ..., -0.0497, -0.1325, 0.1003], [ 0.0033, -0.1534, 0.0919, ..., 0.0591, -0.1485, -0.0678]], device='cuda:0'), grad: tensor([[ 1.1642e-08, 2.5611e-08, -6.2399e-07, ..., 6.5193e-08, 1.3970e-09, -2.6124e-07], [ 2.5611e-08, 1.5460e-07, 1.2713e-07, ..., 1.1688e-07, -6.0070e-08, -7.0455e-07], [-2.1420e-07, 3.7272e-06, 1.4249e-06, ..., 2.3693e-06, -1.9558e-08, 1.3085e-07], ..., [ 7.4692e-07, -4.5188e-06, -1.5199e-06, ..., -2.9635e-06, 5.1223e-09, 2.8126e-06], [ 2.4252e-06, 4.2003e-07, 3.6275e-07, ..., 3.3947e-07, 5.7276e-08, 8.3819e-06], [-1.2619e-07, 1.0477e-07, -7.5530e-07, ..., -3.2736e-07, 4.6566e-10, 4.8894e-08]], device='cuda:0') Epoch 122, bias, value: tensor([-0.0189, -0.0181, -0.0275, -0.0271, -0.0079, 0.0028, 0.0110, -0.0197, -0.0048, 0.0034], device='cuda:0'), grad: tensor([-5.3365e-07, -9.1409e-07, -6.5565e-06, 3.3397e-06, 8.9454e-07, -9.9838e-06, 6.5984e-07, -3.1404e-06, 1.6302e-05, -1.0384e-07], device='cuda:0') 100 0.0001 changing lr epoch 121, time 220.59, cls_loss 0.0024 cls_loss_mapping 0.0043 cls_loss_causal 0.5396 re_mapping 0.0068 re_causal 0.0193 /// teacc 98.93 lr 0.00010000 Epoch 123, weight, value: tensor([[-0.1400, -0.1389, 0.0766, ..., -0.0648, 0.0427, 0.0282], [-0.0802, -0.0306, -0.0482, ..., -0.0638, -0.0769, -0.0301], [ 0.0332, -0.0694, -0.0445, ..., -0.0534, 0.0121, -0.2362], ..., [-0.1015, 0.0976, 0.0018, ..., 0.0950, -0.0244, -0.1037], [-0.0652, -0.0671, 0.0949, ..., -0.0501, -0.1328, 0.1008], [ 0.0035, -0.1544, 0.0920, ..., 0.0589, -0.1487, -0.0678]], device='cuda:0'), grad: tensor([[-1.6717e-07, 9.4855e-07, -1.2815e-05, ..., 4.2422e-07, -4.3772e-08, -8.6650e-06], [ 2.2352e-08, 6.9523e-07, 1.2182e-06, ..., 4.5775e-07, 1.3970e-09, 3.2177e-07], [ 5.2620e-08, 2.0508e-06, 2.7362e-06, ..., 1.0887e-06, 3.7253e-09, 2.1011e-06], ..., [-4.3306e-08, -1.2465e-05, -2.7623e-06, ..., -6.3740e-06, 7.9162e-09, 1.6745e-06], [ 1.2945e-07, -3.4785e-07, -8.2180e-06, ..., 7.6974e-07, 1.8626e-09, -1.2189e-05], [ 3.4925e-08, 6.0610e-06, 3.8594e-06, ..., 1.3635e-06, 1.2107e-08, 5.1782e-06]], device='cuda:0') Epoch 123, bias, value: tensor([-0.0188, -0.0176, -0.0254, -0.0272, -0.0071, 0.0023, 0.0110, -0.0210, -0.0048, 0.0027], device='cuda:0'), grad: tensor([-5.2214e-05, 2.6990e-06, 1.0259e-05, 3.9071e-05, -1.2685e-06, 9.4324e-06, 6.8061e-06, -1.2174e-05, -2.1979e-05, 1.9401e-05], device='cuda:0') 100 0.0001 changing lr epoch 122, time 220.15, cls_loss 0.0022 cls_loss_mapping 0.0060 cls_loss_causal 0.5607 re_mapping 0.0067 re_causal 0.0192 /// teacc 98.93 lr 0.00010000 Epoch 124, weight, value: tensor([[-0.1400, -0.1389, 0.0768, ..., -0.0650, 0.0427, 0.0280], [-0.0807, -0.0309, -0.0486, ..., -0.0644, -0.0769, -0.0302], [ 0.0330, -0.0698, -0.0449, ..., -0.0541, 0.0122, -0.2367], ..., [-0.1022, 0.0983, 0.0025, ..., 0.0960, -0.0245, -0.1038], [-0.0659, -0.0676, 0.0953, ..., -0.0504, -0.1328, 0.1013], [ 0.0028, -0.1573, 0.0922, ..., 0.0585, -0.1488, -0.0693]], device='cuda:0'), grad: tensor([[ 2.7381e-06, 1.1455e-07, 1.5035e-05, ..., 1.6406e-05, 7.9162e-09, -2.8824e-07], [ 1.2266e-06, 4.8941e-07, 4.7162e-06, ..., 1.0863e-05, 4.1910e-09, -1.3644e-07], [ 3.2708e-06, 2.5239e-07, 3.6694e-06, ..., 1.2062e-05, -4.6100e-08, 4.5635e-08], ..., [ 2.2184e-06, -1.0114e-06, 1.4342e-06, ..., 1.4889e-04, 5.1223e-09, 1.3690e-07], [ 5.2117e-06, 9.5041e-07, 2.4270e-06, ..., 1.8358e-05, 1.3039e-08, 3.5856e-08], [-9.4175e-05, 9.0385e-07, -6.2883e-05, ..., -6.7592e-05, 1.3970e-09, 1.9791e-07]], device='cuda:0') Epoch 124, bias, value: tensor([-0.0188, -0.0180, -0.0255, -0.0293, -0.0068, 0.0045, 0.0111, -0.0205, -0.0046, 0.0019], device='cuda:0'), grad: tensor([ 6.7294e-05, 3.3557e-05, 3.6687e-05, -2.6628e-05, -3.7074e-04, 9.0182e-05, 1.4067e-04, 3.3498e-04, 5.2303e-05, -3.5763e-04], device='cuda:0') 100 0.0001 changing lr epoch 123, time 220.41, cls_loss 0.0023 cls_loss_mapping 0.0063 cls_loss_causal 0.5327 re_mapping 0.0066 re_causal 0.0184 /// teacc 99.00 lr 0.00010000 Epoch 125, weight, value: tensor([[-0.1401, -0.1391, 0.0771, ..., -0.0651, 0.0427, 0.0280], [-0.0812, -0.0309, -0.0494, ..., -0.0652, -0.0771, -0.0308], [ 0.0332, -0.0700, -0.0454, ..., -0.0545, 0.0125, -0.2376], ..., [-0.1029, 0.0989, 0.0034, ..., 0.0971, -0.0246, -0.1039], [-0.0673, -0.0680, 0.0957, ..., -0.0508, -0.1330, 0.1030], [ 0.0026, -0.1599, 0.0921, ..., 0.0582, -0.1488, -0.0701]], device='cuda:0'), grad: tensor([[-3.4599e-07, 5.0897e-07, -4.7162e-06, ..., 4.4703e-08, -5.2573e-07, -6.1980e-07], [ 1.0990e-06, 4.6678e-06, 5.2974e-06, ..., 8.9873e-07, 1.8626e-09, 4.2468e-06], [ 1.0058e-07, 8.7768e-06, 1.8820e-05, ..., -3.0268e-08, 8.3353e-08, 1.3389e-05], ..., [ 9.2201e-08, 2.2221e-06, 3.7998e-06, ..., 2.7148e-07, 9.3132e-10, 2.5928e-06], [ 1.6997e-07, -1.5251e-05, -3.4034e-05, ..., 1.4203e-07, 2.5611e-08, -2.4691e-05], [ 2.9942e-07, 1.0077e-06, 1.1940e-06, ..., 1.4063e-07, 2.1886e-08, 9.3924e-07]], device='cuda:0') Epoch 125, bias, value: tensor([-0.0188, -0.0180, -0.0257, -0.0288, -0.0069, 0.0039, 0.0103, -0.0203, -0.0032, 0.0015], device='cuda:0'), grad: tensor([-1.2077e-05, 3.7879e-05, 4.5300e-05, -2.1756e-06, -1.9580e-05, 1.6987e-05, -4.2170e-06, 1.6004e-05, -8.4937e-05, 6.8657e-06], device='cuda:0') 100 0.0001 changing lr epoch 124, time 220.41, cls_loss 0.0022 cls_loss_mapping 0.0056 cls_loss_causal 0.5346 re_mapping 0.0066 re_causal 0.0180 /// teacc 99.03 lr 0.00010000 Epoch 126, weight, value: tensor([[-0.1399, -0.1392, 0.0796, ..., -0.0640, 0.0429, 0.0300], [-0.0817, -0.0315, -0.0500, ..., -0.0660, -0.0784, -0.0315], [ 0.0336, -0.0703, -0.0460, ..., -0.0548, 0.0124, -0.2386], ..., [-0.1035, 0.0993, 0.0023, ..., 0.0976, -0.0246, -0.1050], [-0.0697, -0.0685, 0.0954, ..., -0.0516, -0.1330, 0.1021], [ 0.0032, -0.1606, 0.0923, ..., 0.0586, -0.1490, -0.0714]], device='cuda:0'), grad: tensor([[ 3.5856e-07, 2.2119e-07, -1.3970e-08, ..., 2.0955e-08, 0.0000e+00, 4.6473e-07], [ 1.9791e-07, 1.3551e-07, 2.2817e-08, ..., 2.5146e-08, 0.0000e+00, 2.8033e-07], [ 5.2899e-07, -3.9628e-07, 9.4529e-08, ..., 7.9628e-08, 0.0000e+00, 7.3621e-07], ..., [ 3.1665e-07, 9.4995e-08, -7.9628e-08, ..., -1.1362e-07, 0.0000e+00, 3.9535e-07], [ 1.7229e-06, 6.8499e-07, 1.2061e-07, ..., 8.5682e-08, 4.6566e-10, 2.6375e-06], [ 2.3469e-06, 8.2236e-07, -3.3015e-07, ..., -1.4063e-07, 0.0000e+00, 2.8722e-06]], device='cuda:0') Epoch 126, bias, value: tensor([-0.0160, -0.0186, -0.0258, -0.0281, -0.0072, 0.0037, 0.0112, -0.0204, -0.0044, 0.0010], device='cuda:0'), grad: tensor([ 1.9427e-06, 1.0775e-06, -4.8243e-06, 1.7047e-05, -4.6473e-07, -2.3663e-05, -4.8652e-06, 2.1737e-06, 6.4485e-06, 5.1297e-06], device='cuda:0') 100 0.0001 changing lr epoch 125, time 220.53, cls_loss 0.0021 cls_loss_mapping 0.0046 cls_loss_causal 0.5821 re_mapping 0.0065 re_causal 0.0189 /// teacc 98.86 lr 0.00010000 Epoch 127, weight, value: tensor([[-0.1414, -0.1394, 0.0796, ..., -0.0642, 0.0432, 0.0299], [-0.0821, -0.0316, -0.0515, ..., -0.0669, -0.0788, -0.0319], [ 0.0338, -0.0706, -0.0467, ..., -0.0551, 0.0126, -0.2397], ..., [-0.1041, 0.0998, 0.0023, ..., 0.0982, -0.0242, -0.1053], [-0.0708, -0.0696, 0.0951, ..., -0.0528, -0.1332, 0.1024], [ 0.0041, -0.1612, 0.0938, ..., 0.0595, -0.1494, -0.0704]], device='cuda:0'), grad: tensor([[ 3.7858e-07, 1.2107e-07, 6.0983e-06, ..., 9.4995e-08, 0.0000e+00, 2.7418e-05], [ 1.8682e-06, 5.2527e-07, -4.1962e-05, ..., 3.6601e-07, 0.0000e+00, -1.0389e-04], [ 4.3027e-06, 3.5763e-06, 1.6615e-06, ..., 3.0994e-06, 0.0000e+00, 1.1063e-04], ..., [ 3.2270e-07, -4.6045e-06, 1.5991e-06, ..., -4.0457e-06, 0.0000e+00, 1.3940e-05], [-2.1830e-05, 1.8766e-07, 2.1547e-05, ..., 1.2806e-07, 0.0000e+00, -3.7503e-04], [ 1.4575e-07, 4.4238e-07, 8.6520e-07, ..., 2.9337e-08, 0.0000e+00, 5.5358e-06]], device='cuda:0') Epoch 127, bias, value: tensor([-0.0162, -0.0191, -0.0257, -0.0275, -0.0079, 0.0031, 0.0112, -0.0202, -0.0047, 0.0023], device='cuda:0'), grad: tensor([ 7.9274e-05, -3.8099e-04, 2.2626e-04, 1.9819e-05, 3.0011e-05, 3.6454e-04, 2.5702e-04, 3.2127e-05, -6.4230e-04, 1.4432e-05], device='cuda:0') 100 0.0001 changing lr epoch 126, time 220.99, cls_loss 0.0021 cls_loss_mapping 0.0065 cls_loss_causal 0.5581 re_mapping 0.0064 re_causal 0.0186 /// teacc 98.88 lr 0.00010000 Epoch 128, weight, value: tensor([[-0.1416, -0.1397, 0.0798, ..., -0.0645, 0.0431, 0.0299], [-0.0823, -0.0316, -0.0517, ..., -0.0673, -0.0789, -0.0315], [ 0.0339, -0.0713, -0.0474, ..., -0.0553, 0.0127, -0.2407], ..., [-0.1045, 0.1000, 0.0006, ..., 0.0967, -0.0242, -0.1057], [-0.0711, -0.0699, 0.0954, ..., -0.0531, -0.1333, 0.1034], [ 0.0041, -0.1627, 0.0961, ..., 0.0623, -0.1496, -0.0709]], device='cuda:0'), grad: tensor([[-2.5611e-08, 2.5239e-07, -3.3993e-08, ..., 2.3562e-07, 1.3039e-08, 3.2596e-09], [ 1.8626e-09, 1.3165e-05, 2.9374e-06, ..., 5.4091e-06, 6.6124e-08, -7.7300e-08], [ 2.3283e-09, 1.2042e-06, 5.4296e-07, ..., 7.1293e-07, 7.5437e-08, 4.2235e-07], ..., [ 5.1223e-09, -3.7462e-05, -1.1541e-05, ..., -2.1189e-05, 1.3039e-08, 1.9511e-07], [ 1.8626e-09, 2.2352e-06, -4.2608e-07, ..., 1.0431e-06, 1.5367e-08, -4.8522e-07], [ 4.6566e-09, 8.3596e-06, 4.8354e-06, ..., 8.9034e-06, 4.6566e-09, 2.1094e-07]], device='cuda:0') Epoch 128, bias, value: tensor([-0.0166, -0.0189, -0.0258, -0.0271, -0.0088, 0.0031, 0.0104, -0.0214, -0.0040, 0.0043], device='cuda:0'), grad: tensor([ 9.2899e-07, 2.6315e-05, -2.0303e-07, 1.5691e-05, 8.2105e-06, 4.0159e-06, -8.1733e-06, -7.6354e-05, 6.7353e-06, 2.2933e-05], device='cuda:0') 100 0.0001 changing lr epoch 127, time 220.50, cls_loss 0.0025 cls_loss_mapping 0.0060 cls_loss_causal 0.5463 re_mapping 0.0065 re_causal 0.0183 /// teacc 99.00 lr 0.00010000 Epoch 129, weight, value: tensor([[-1.4290e-01, -1.4000e-01, 7.9997e-02, ..., -6.4629e-02, 4.3052e-02, 2.9482e-02], [-8.2983e-02, -3.2171e-02, -5.2553e-02, ..., -6.8332e-02, -7.9693e-02, -3.1947e-02], [ 3.3566e-02, -7.1794e-02, -4.8039e-02, ..., -5.5922e-02, 1.2678e-02, -2.4167e-01], ..., [-1.0481e-01, 9.9471e-02, 1.1697e-04, ..., 9.7235e-02, -2.3901e-02, -1.0618e-01], [-7.4304e-02, -7.0558e-02, 9.6268e-02, ..., -5.3398e-02, -1.3333e-01, 1.0218e-01], [ 4.1234e-03, -1.6389e-01, 9.6092e-02, ..., 6.2147e-02, -1.4965e-01, -7.1429e-02]], device='cuda:0'), grad: tensor([[ 2.8592e-07, 1.4119e-06, 1.5860e-06, ..., 2.3097e-06, 0.0000e+00, 7.5251e-07], [ 8.0559e-08, 1.9576e-06, 3.0287e-06, ..., 4.8727e-06, 0.0000e+00, 1.6708e-06], [ 2.0955e-07, 2.5406e-06, 2.9355e-06, ..., 4.5225e-06, -4.6566e-10, 4.0345e-06], ..., [ 3.3388e-07, -1.2064e-04, -1.9801e-04, ..., -3.2663e-04, 0.0000e+00, 5.7230e-07], [ 1.7472e-06, 7.2382e-06, 7.6890e-06, ..., 8.5011e-06, 0.0000e+00, 2.7418e-06], [ 1.8580e-07, 1.0020e-04, 1.7166e-04, ..., 2.9206e-04, 0.0000e+00, 4.8848e-07]], device='cuda:0') Epoch 129, bias, value: tensor([-0.0164, -0.0193, -0.0261, -0.0253, -0.0086, 0.0023, 0.0124, -0.0216, -0.0049, 0.0038], device='cuda:0'), grad: tensor([ 1.2085e-05, 1.5177e-05, 6.7651e-05, 2.0057e-05, 5.3018e-05, -1.8980e-06, -1.4222e-04, -6.8712e-04, 3.8147e-05, 6.2513e-04], device='cuda:0') 100 0.0001 changing lr epoch 128, time 221.14, cls_loss 0.0021 cls_loss_mapping 0.0053 cls_loss_causal 0.5017 re_mapping 0.0067 re_causal 0.0181 /// teacc 99.01 lr 0.00010000 Epoch 130, weight, value: tensor([[-0.1436, -0.1402, 0.0802, ..., -0.0650, 0.0431, 0.0297], [-0.0840, -0.0324, -0.0543, ..., -0.0698, -0.0798, -0.0345], [ 0.0319, -0.0723, -0.0486, ..., -0.0567, 0.0126, -0.2436], ..., [-0.1057, 0.1000, 0.0011, ..., 0.0982, -0.0238, -0.1038], [-0.0756, -0.0709, 0.0965, ..., -0.0538, -0.1334, 0.1021], [ 0.0043, -0.1648, 0.0963, ..., 0.0621, -0.1498, -0.0718]], device='cuda:0'), grad: tensor([[ 2.4028e-07, 1.2107e-07, -5.9046e-07, ..., 1.0012e-07, 1.0245e-08, -5.6904e-07], [ 1.2806e-07, 4.9919e-07, 6.1747e-07, ..., 2.4028e-07, 9.7789e-09, 5.0757e-08], [ 1.6717e-07, 1.6727e-06, 1.5348e-06, ..., 1.8580e-07, 4.3772e-08, 9.2899e-07], ..., [ 1.9837e-07, -4.0680e-06, -3.2652e-06, ..., -6.0424e-06, 2.4214e-08, 8.5169e-07], [ 2.4633e-07, 3.0156e-06, -2.7463e-05, ..., -1.3798e-05, 9.7789e-09, -3.2693e-05], [ 5.9046e-07, 4.9174e-06, 2.9474e-05, ..., 1.7807e-05, 3.6322e-08, 2.9176e-05]], device='cuda:0') Epoch 130, bias, value: tensor([-0.0165, -0.0199, -0.0266, -0.0263, -0.0085, 0.0033, 0.0130, -0.0209, -0.0053, 0.0036], device='cuda:0'), grad: tensor([-8.0699e-07, -9.3319e-07, 6.0424e-06, -1.1459e-05, 1.5423e-06, -2.3432e-06, 2.8852e-06, -6.4336e-06, -6.4611e-05, 7.6115e-05], device='cuda:0') 100 0.0001 changing lr epoch 129, time 221.12, cls_loss 0.0020 cls_loss_mapping 0.0049 cls_loss_causal 0.5496 re_mapping 0.0064 re_causal 0.0181 /// teacc 98.96 lr 0.00010000 Epoch 131, weight, value: tensor([[-0.1440, -0.1407, 0.0803, ..., -0.0652, 0.0431, 0.0298], [-0.0847, -0.0327, -0.0550, ..., -0.0703, -0.0798, -0.0345], [ 0.0318, -0.0721, -0.0487, ..., -0.0572, 0.0127, -0.2441], ..., [-0.1063, 0.1007, 0.0020, ..., 0.0989, -0.0239, -0.1034], [-0.0763, -0.0724, 0.0963, ..., -0.0548, -0.1334, 0.1021], [ 0.0046, -0.1664, 0.0964, ..., 0.0619, -0.1499, -0.0722]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 2.8238e-06, 2.2147e-06, ..., 2.9709e-07, 0.0000e+00, 3.5614e-06], [ 2.3283e-09, 4.6492e-06, 8.6473e-07, ..., 7.1386e-07, 0.0000e+00, -3.1339e-07], [ 1.8626e-09, 5.4352e-06, 1.2644e-05, ..., 2.7288e-07, -9.3132e-10, 2.1085e-05], ..., [ 1.0710e-08, 1.2359e-06, 3.1758e-06, ..., -2.6561e-06, 0.0000e+00, 5.4836e-06], [ 6.9849e-09, 3.4105e-06, -4.3988e-05, ..., 6.1467e-08, 0.0000e+00, -7.0572e-05], [-5.1223e-09, 4.8913e-06, 1.8030e-06, ..., 2.5667e-06, 0.0000e+00, 1.6317e-06]], device='cuda:0') Epoch 131, bias, value: tensor([-0.0164, -0.0199, -0.0265, -0.0266, -0.0081, 0.0036, 0.0128, -0.0206, -0.0058, 0.0033], device='cuda:0'), grad: tensor([ 1.0528e-05, 5.5991e-06, 3.7044e-05, -2.8038e-04, 8.8150e-07, 2.7895e-04, 7.7253e-07, 1.5587e-05, -8.4937e-05, 1.5676e-05], device='cuda:0') 100 0.0001 changing lr epoch 130, time 221.11, cls_loss 0.0024 cls_loss_mapping 0.0056 cls_loss_causal 0.5521 re_mapping 0.0066 re_causal 0.0179 /// teacc 99.00 lr 0.00010000 Epoch 132, weight, value: tensor([[-0.1463, -0.1413, 0.0804, ..., -0.0655, 0.0430, 0.0294], [-0.0852, -0.0330, -0.0555, ..., -0.0711, -0.0798, -0.0348], [ 0.0317, -0.0727, -0.0494, ..., -0.0578, 0.0127, -0.2452], ..., [-0.1068, 0.1015, 0.0025, ..., 0.0997, -0.0239, -0.1034], [-0.0772, -0.0734, 0.0968, ..., -0.0553, -0.1335, 0.1027], [ 0.0041, -0.1687, 0.0964, ..., 0.0619, -0.1499, -0.0733]], device='cuda:0'), grad: tensor([[ 4.5933e-06, 3.6741e-07, 1.7896e-05, ..., 1.1340e-05, 0.0000e+00, 7.1898e-06], [ 2.5611e-08, -1.6158e-07, 7.5949e-07, ..., 1.7947e-06, 0.0000e+00, -1.2875e-05], [ 2.0070e-07, 1.1064e-06, 9.7975e-07, ..., 2.3320e-06, -4.6566e-10, 5.4855e-07], ..., [ 1.4994e-07, -4.9733e-06, 3.3733e-06, ..., 5.1931e-06, 0.0000e+00, 1.3877e-06], [ 5.5833e-07, 9.4017e-07, 1.2387e-06, ..., 2.1402e-06, 4.6566e-10, 3.3658e-06], [-1.3568e-05, 1.2778e-06, -6.6936e-05, ..., -6.0916e-05, 0.0000e+00, -2.7176e-06]], device='cuda:0') Epoch 132, bias, value: tensor([-0.0168, -0.0200, -0.0269, -0.0259, -0.0081, 0.0049, 0.0120, -0.0202, -0.0055, 0.0025], device='cuda:0'), grad: tensor([ 4.8548e-05, -3.4213e-05, 9.7007e-06, 8.3959e-07, 6.9916e-05, -2.3559e-05, 2.4229e-05, 2.1845e-05, 1.9372e-05, -1.3673e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 131---------------------------------------------------- epoch 131, time 221.44, cls_loss 0.0022 cls_loss_mapping 0.0061 cls_loss_causal 0.5876 re_mapping 0.0070 re_causal 0.0194 /// teacc 99.06 lr 0.00010000 Epoch 133, weight, value: tensor([[-0.1459, -0.1404, 0.0813, ..., -0.0659, 0.0430, 0.0296], [-0.0860, -0.0331, -0.0559, ..., -0.0715, -0.0819, -0.0356], [ 0.0314, -0.0731, -0.0504, ..., -0.0582, 0.0128, -0.2465], ..., [-0.1076, 0.1018, 0.0027, ..., 0.1002, -0.0237, -0.1038], [-0.0776, -0.0730, 0.0981, ..., -0.0557, -0.1335, 0.1042], [ 0.0022, -0.1716, 0.0944, ..., 0.0616, -0.1500, -0.0746]], device='cuda:0'), grad: tensor([[ 4.1910e-09, -1.1362e-06, -4.0010e-06, ..., 1.0943e-07, 0.0000e+00, -3.7570e-06], [ 2.7940e-09, 3.4645e-07, 5.4622e-07, ..., 8.8476e-08, 0.0000e+00, -9.3132e-08], [ 4.6566e-10, 4.7917e-07, 5.6112e-07, ..., -5.2620e-08, -9.3132e-10, 6.3237e-07], ..., [ 1.7229e-08, -2.6030e-07, 5.7556e-07, ..., 1.1595e-07, 0.0000e+00, 4.5495e-07], [ 5.5879e-09, 2.4959e-07, 2.8405e-07, ..., 3.8790e-07, 0.0000e+00, -4.9546e-07], [-1.8952e-07, 4.8196e-07, -2.4587e-06, ..., 3.2466e-06, 0.0000e+00, 3.7719e-07]], device='cuda:0') Epoch 133, bias, value: tensor([-0.0159, -0.0204, -0.0272, -0.0242, -0.0069, 0.0041, 0.0122, -0.0202, -0.0042, 0.0003], device='cuda:0'), grad: tensor([-8.0541e-06, -8.6520e-07, 1.4007e-06, 2.1923e-06, -2.0474e-05, 3.0342e-06, -7.6788e-07, 2.2538e-06, 1.2536e-06, 2.0042e-05], device='cuda:0') 100 0.0001 changing lr epoch 132, time 220.42, cls_loss 0.0021 cls_loss_mapping 0.0059 cls_loss_causal 0.5416 re_mapping 0.0064 re_causal 0.0173 /// teacc 98.90 lr 0.00010000 Epoch 134, weight, value: tensor([[-0.1462, -0.1409, 0.0816, ..., -0.0663, 0.0430, 0.0300], [-0.0867, -0.0329, -0.0556, ..., -0.0721, -0.0825, -0.0354], [ 0.0309, -0.0731, -0.0511, ..., -0.0586, 0.0132, -0.2478], ..., [-0.1077, 0.1020, 0.0026, ..., 0.1007, -0.0233, -0.1046], [-0.0787, -0.0731, 0.0986, ..., -0.0560, -0.1337, 0.1046], [ 0.0021, -0.1721, 0.0946, ..., 0.0613, -0.1502, -0.0750]], device='cuda:0'), grad: tensor([[ 4.0978e-08, 2.3004e-07, 5.9837e-07, ..., 9.0944e-07, 1.3970e-09, 8.5728e-07], [ 1.3039e-08, 5.3644e-07, 2.0023e-07, ..., 7.2969e-07, -1.9558e-08, -1.4314e-06], [ 2.7008e-08, 2.5388e-06, 3.6154e-06, ..., -4.5123e-07, 3.7253e-09, 1.2638e-06], ..., [ 1.9558e-08, 3.0100e-06, 1.5832e-07, ..., 9.5228e-07, 4.1910e-09, 6.3889e-07], [ 4.6100e-08, 7.5297e-07, 5.3644e-06, ..., 4.0717e-06, 3.7253e-09, 2.1867e-06], [ 1.7509e-07, 5.4715e-07, -1.2748e-05, ..., -1.6028e-06, 9.3132e-10, -3.0287e-06]], device='cuda:0') Epoch 134, bias, value: tensor([-1.5717e-02, -2.0013e-02, -2.7000e-02, -2.4380e-02, -6.2750e-03, 4.3196e-03, 1.1450e-02, -2.0538e-02, -3.7213e-03, -8.6620e-05], device='cuda:0'), grad: tensor([ 5.7071e-06, -4.2701e-07, 1.6596e-06, -1.9744e-05, -1.9029e-05, 1.2740e-05, -5.3160e-06, 1.2703e-05, 2.0817e-05, -9.1568e-06], device='cuda:0') 100 0.0001 changing lr epoch 133, time 220.73, cls_loss 0.0023 cls_loss_mapping 0.0052 cls_loss_causal 0.5491 re_mapping 0.0063 re_causal 0.0178 /// teacc 98.97 lr 0.00010000 Epoch 135, weight, value: tensor([[-0.1465, -0.1417, 0.0815, ..., -0.0667, 0.0433, 0.0301], [-0.0868, -0.0336, -0.0563, ..., -0.0731, -0.0832, -0.0364], [ 0.0318, -0.0752, -0.0527, ..., -0.0600, 0.0140, -0.2490], ..., [-0.1080, 0.1041, 0.0040, ..., 0.1026, -0.0225, -0.1036], [-0.0788, -0.0751, 0.0986, ..., -0.0573, -0.1342, 0.1050], [ 0.0023, -0.1735, 0.0949, ..., 0.0612, -0.1507, -0.0758]], device='cuda:0'), grad: tensor([[ 4.4238e-09, 2.0233e-07, 1.0058e-06, ..., 2.4773e-07, 3.9139e-07, 3.2168e-06], [ 4.4238e-09, 1.0291e-07, 1.1618e-07, ..., 8.7311e-08, -5.4250e-08, -1.4831e-07], [ 9.3132e-10, -2.1812e-06, 4.3749e-07, ..., -1.4938e-06, 2.1630e-07, 1.5264e-06], ..., [ 8.3819e-09, 1.3001e-06, 9.4995e-08, ..., 9.7137e-07, 5.6578e-08, 2.8964e-07], [ 1.6228e-07, -2.4983e-07, -2.9076e-06, ..., -3.6415e-07, -1.3434e-07, -4.8615e-06], [-2.9337e-08, 1.0803e-07, -1.6540e-06, ..., -1.1744e-06, 4.2375e-08, 7.7393e-07]], device='cuda:0') Epoch 135, bias, value: tensor([-0.0160, -0.0204, -0.0276, -0.0248, -0.0073, 0.0054, 0.0108, -0.0191, -0.0039, -0.0002], device='cuda:0'), grad: tensor([ 1.1310e-05, 6.4261e-08, -1.0446e-05, 6.2585e-06, 2.4289e-05, 1.4799e-06, -3.2395e-05, 1.1675e-05, -1.0476e-05, -1.8254e-06], device='cuda:0') 100 0.0001 changing lr epoch 134, time 220.67, cls_loss 0.0024 cls_loss_mapping 0.0056 cls_loss_causal 0.5407 re_mapping 0.0064 re_causal 0.0177 /// teacc 98.97 lr 0.00010000 Epoch 136, weight, value: tensor([[-0.1467, -0.1420, 0.0815, ..., -0.0671, 0.0433, 0.0301], [-0.0874, -0.0355, -0.0584, ..., -0.0734, -0.0835, -0.0368], [ 0.0314, -0.0755, -0.0534, ..., -0.0603, 0.0142, -0.2500], ..., [-0.1086, 0.1063, 0.0060, ..., 0.1032, -0.0227, -0.1017], [-0.0799, -0.0755, 0.0984, ..., -0.0580, -0.1343, 0.1050], [ 0.0030, -0.1743, 0.0956, ..., 0.0620, -0.1510, -0.0757]], device='cuda:0'), grad: tensor([[ 1.4529e-07, 2.4145e-07, 6.3796e-07, ..., 7.3295e-07, 0.0000e+00, 5.7975e-08], [ 1.7965e-06, 6.5472e-07, 9.2015e-06, ..., 1.6913e-05, 0.0000e+00, -4.1351e-07], [-1.1874e-07, 1.7667e-06, -1.8068e-07, ..., 1.5739e-06, -1.1642e-09, -3.0734e-08], ..., [ 6.4783e-06, -5.0701e-06, 3.2037e-05, ..., 2.2113e-05, 4.6566e-10, 3.0291e-07], [ 4.6985e-07, 3.1246e-07, 2.5202e-06, ..., 3.7905e-06, 2.3283e-10, 2.4540e-07], [-9.2685e-06, -9.1083e-07, -4.8965e-05, ..., -1.8314e-05, 0.0000e+00, -2.1560e-07]], device='cuda:0') Epoch 136, bias, value: tensor([-0.0163, -0.0216, -0.0278, -0.0254, -0.0084, 0.0057, 0.0111, -0.0172, -0.0046, 0.0004], device='cuda:0'), grad: tensor([ 3.3528e-06, 5.3912e-05, -4.7944e-06, 8.6278e-06, -8.5950e-05, 4.1127e-06, 1.3774e-06, 9.3877e-05, 1.4573e-05, -8.8930e-05], device='cuda:0') 100 0.0001 changing lr epoch 135, time 220.55, cls_loss 0.0021 cls_loss_mapping 0.0041 cls_loss_causal 0.5660 re_mapping 0.0060 re_causal 0.0177 /// teacc 99.01 lr 0.00010000 Epoch 137, weight, value: tensor([[-0.1471, -0.1424, 0.0818, ..., -0.0673, 0.0434, 0.0304], [-0.0877, -0.0352, -0.0581, ..., -0.0739, -0.0836, -0.0363], [ 0.0315, -0.0753, -0.0539, ..., -0.0590, 0.0143, -0.2507], ..., [-0.1090, 0.1063, 0.0058, ..., 0.1032, -0.0229, -0.1021], [-0.0803, -0.0762, 0.0987, ..., -0.0584, -0.1343, 0.1055], [ 0.0033, -0.1749, 0.0959, ..., 0.0614, -0.1514, -0.0763]], device='cuda:0'), grad: tensor([[ 7.0082e-08, 1.6764e-08, 6.5193e-09, ..., 8.3819e-09, 2.4028e-06, 3.5539e-06], [ 1.5832e-08, 1.3807e-07, 4.6566e-08, ..., 8.7544e-08, 1.7742e-07, -4.9081e-07], [ 1.0943e-08, 5.1921e-08, 1.2806e-08, ..., 2.2585e-08, 3.2131e-07, 5.1688e-07], ..., [ 6.7754e-08, -3.6834e-07, -1.2456e-07, ..., -2.9383e-07, 3.8650e-08, 6.6403e-07], [ 2.4363e-06, 2.2305e-07, 7.0781e-08, ..., 3.7020e-08, 3.4049e-06, 9.3505e-06], [ 2.0140e-07, 1.6321e-07, -2.3306e-07, ..., 1.3970e-09, 7.8417e-07, 1.6401e-06]], device='cuda:0') Epoch 137, bias, value: tensor([-0.0159, -0.0208, -0.0279, -0.0255, -0.0077, 0.0056, 0.0108, -0.0178, -0.0043, -0.0001], device='cuda:0'), grad: tensor([ 2.0191e-05, -1.3532e-06, 3.0957e-06, 1.3568e-05, 3.9749e-06, 3.5353e-06, -8.6725e-05, 2.0638e-06, 3.4332e-05, 7.2829e-06], device='cuda:0') 100 0.0001 changing lr epoch 136, time 220.22, cls_loss 0.0020 cls_loss_mapping 0.0059 cls_loss_causal 0.5620 re_mapping 0.0060 re_causal 0.0180 /// teacc 98.98 lr 0.00010000 Epoch 138, weight, value: tensor([[-0.1478, -0.1426, 0.0818, ..., -0.0675, 0.0433, 0.0301], [-0.0885, -0.0353, -0.0583, ..., -0.0743, -0.0834, -0.0363], [ 0.0315, -0.0754, -0.0540, ..., -0.0592, 0.0144, -0.2513], ..., [-0.1096, 0.1060, 0.0052, ..., 0.1036, -0.0227, -0.1023], [-0.0829, -0.0766, 0.0992, ..., -0.0589, -0.1346, 0.1050], [ 0.0037, -0.1752, 0.0962, ..., 0.0613, -0.1518, -0.0763]], device='cuda:0'), grad: tensor([[ 3.3062e-08, 2.4680e-07, -9.5181e-07, ..., 2.7940e-07, 9.7789e-09, -9.7090e-08], [ 4.8894e-09, -8.6240e-07, 7.5437e-08, ..., 5.7276e-07, -3.7951e-08, -1.0341e-05], [ 4.4238e-09, 1.5143e-06, 2.0047e-07, ..., 1.0580e-06, -1.8766e-07, 5.6392e-07], ..., [ 1.9558e-08, -1.9576e-06, 3.1060e-07, ..., -2.1271e-06, 2.5146e-08, 4.3847e-06], [ 2.0000e-07, 2.7567e-07, 8.5449e-08, ..., 1.4203e-07, 1.5181e-07, 2.2687e-06], [ 4.3074e-08, 4.2631e-07, -5.3179e-07, ..., -6.1281e-07, 2.7940e-09, 1.3635e-06]], device='cuda:0') Epoch 138, bias, value: tensor([-1.6052e-02, -2.1034e-02, -2.7399e-02, -2.5140e-02, -7.1120e-03, 6.0711e-03, 1.0604e-02, -1.8083e-02, -5.0000e-03, -7.7302e-05], device='cuda:0'), grad: tensor([-1.2033e-06, -4.0114e-05, 2.1365e-06, 2.4773e-06, 2.8964e-06, 5.3085e-07, 2.8647e-06, 1.5132e-05, 9.7752e-06, 5.5395e-06], device='cuda:0') 100 0.0001 changing lr epoch 137, time 220.39, cls_loss 0.0015 cls_loss_mapping 0.0041 cls_loss_causal 0.5559 re_mapping 0.0061 re_causal 0.0178 /// teacc 98.95 lr 0.00010000 Epoch 139, weight, value: tensor([[-0.1479, -0.1430, 0.0819, ..., -0.0677, 0.0433, 0.0300], [-0.0888, -0.0353, -0.0584, ..., -0.0747, -0.0833, -0.0359], [ 0.0314, -0.0756, -0.0542, ..., -0.0593, 0.0145, -0.2519], ..., [-0.1103, 0.1063, 0.0053, ..., 0.1042, -0.0227, -0.1024], [-0.0832, -0.0768, 0.0994, ..., -0.0596, -0.1349, 0.1053], [ 0.0040, -0.1757, 0.0964, ..., 0.0612, -0.1520, -0.0765]], device='cuda:0'), grad: tensor([[ 3.0035e-08, 3.5390e-08, -1.2098e-06, ..., 5.6112e-08, 0.0000e+00, -4.7721e-06], [ 1.7695e-08, 9.4762e-08, 5.6345e-08, ..., 9.4762e-08, 0.0000e+00, -1.6997e-07], [ 8.6147e-09, 1.1967e-07, 3.8720e-07, ..., 8.1491e-08, 0.0000e+00, 6.0257e-07], ..., [ 7.2410e-08, -4.9081e-07, 2.6077e-07, ..., -1.4040e-07, 0.0000e+00, 5.6997e-07], [ 1.7649e-07, -6.1467e-08, -7.2364e-07, ..., 1.0198e-07, 0.0000e+00, -5.0617e-07], [-2.6869e-07, 1.7113e-07, -6.5984e-07, ..., -6.6496e-07, 0.0000e+00, 5.0664e-07]], device='cuda:0') Epoch 139, bias, value: tensor([-0.0163, -0.0210, -0.0270, -0.0253, -0.0068, 0.0058, 0.0107, -0.0180, -0.0050, -0.0003], device='cuda:0'), grad: tensor([-1.5691e-05, -2.4028e-07, 2.2892e-06, 4.9695e-06, -9.2108e-07, -8.1165e-07, 8.3596e-06, 2.1122e-06, -1.5320e-07, 4.6566e-08], device='cuda:0') 100 0.0001 changing lr epoch 138, time 220.44, cls_loss 0.0018 cls_loss_mapping 0.0051 cls_loss_causal 0.5168 re_mapping 0.0063 re_causal 0.0178 /// teacc 98.98 lr 0.00010000 Epoch 140, weight, value: tensor([[-0.1481, -0.1432, 0.0820, ..., -0.0680, 0.0433, 0.0305], [-0.0885, -0.0334, -0.0586, ..., -0.0756, -0.0838, -0.0357], [ 0.0311, -0.0761, -0.0553, ..., -0.0596, 0.0145, -0.2537], ..., [-0.1107, 0.1048, 0.0054, ..., 0.1048, -0.0226, -0.1025], [-0.0835, -0.0759, 0.0999, ..., -0.0600, -0.1350, 0.1056], [ 0.0045, -0.1763, 0.0968, ..., 0.0613, -0.1522, -0.0763]], device='cuda:0'), grad: tensor([[ 3.2387e-07, 3.4808e-07, -4.2794e-07, ..., 9.9419e-08, 6.2864e-09, 7.1526e-07], [-3.0771e-06, 3.9786e-06, 1.5134e-07, ..., 3.1311e-06, 6.9849e-10, -7.1302e-06], [ 6.6357e-08, 2.5816e-06, 4.4820e-07, ..., -2.3516e-08, -1.1642e-08, 2.8312e-07], ..., [ 2.8801e-07, -2.8629e-06, 1.8161e-07, ..., -3.2391e-06, 3.9581e-09, 2.0838e-07], [ 4.5635e-07, 2.1234e-06, 3.9116e-07, ..., 1.9209e-07, 1.8626e-09, 8.8196e-07], [-1.2722e-06, 6.3796e-07, -2.0657e-06, ..., -1.4137e-06, 2.3283e-10, 1.5739e-07]], device='cuda:0') Epoch 140, bias, value: tensor([-0.0161, -0.0194, -0.0275, -0.0260, -0.0068, 0.0060, 0.0112, -0.0193, -0.0048, -0.0003], device='cuda:0'), grad: tensor([ 3.8482e-06, -5.5969e-05, 3.9607e-05, -3.6925e-05, 1.1854e-05, 2.5257e-05, 7.2159e-06, -1.6354e-06, 8.4266e-06, -1.7677e-06], device='cuda:0') 100 0.0001 changing lr epoch 139, time 220.35, cls_loss 0.0020 cls_loss_mapping 0.0052 cls_loss_causal 0.5619 re_mapping 0.0057 re_causal 0.0173 /// teacc 98.92 lr 0.00010000 Epoch 141, weight, value: tensor([[-0.1484, -0.1436, 0.0820, ..., -0.0684, 0.0433, 0.0306], [-0.0890, -0.0336, -0.0588, ..., -0.0763, -0.0837, -0.0356], [ 0.0309, -0.0754, -0.0557, ..., -0.0596, 0.0147, -0.2545], ..., [-0.1112, 0.1050, 0.0054, ..., 0.1051, -0.0225, -0.1025], [-0.0833, -0.0763, 0.1004, ..., -0.0611, -0.1351, 0.1063], [ 0.0051, -0.1767, 0.0976, ..., 0.0622, -0.1523, -0.0749]], device='cuda:0'), grad: tensor([[ 4.8894e-09, 1.0049e-06, -2.1211e-07, ..., 6.7102e-07, 1.0245e-08, -1.0291e-07], [ 6.5193e-09, 1.0841e-05, 4.7265e-08, ..., 7.4282e-06, -1.4435e-08, -1.1194e-06], [ 6.9849e-10, 4.0770e-05, 2.1886e-07, ..., 2.8208e-05, -1.3853e-07, 3.5926e-07], ..., [ 5.3551e-09, -6.1393e-05, 7.4506e-08, ..., -4.2617e-05, 8.8476e-09, 3.4249e-07], [ 2.4214e-08, 1.2554e-06, 5.0012e-07, ..., 1.1120e-06, 3.1199e-08, 1.1176e-07], [-1.0524e-07, 4.7125e-06, -1.0896e-06, ..., 2.4997e-06, 3.4925e-09, 3.6787e-08]], device='cuda:0') Epoch 141, bias, value: tensor([-0.0161, -0.0197, -0.0266, -0.0258, -0.0074, 0.0048, 0.0109, -0.0193, -0.0044, 0.0003], device='cuda:0'), grad: tensor([ 2.3413e-06, 1.9997e-05, 8.8215e-05, 5.7034e-06, 2.5891e-06, 1.3337e-06, 4.1979e-07, -1.3518e-04, 5.2452e-06, 9.1866e-06], device='cuda:0') 100 0.0001 changing lr epoch 140, time 220.56, cls_loss 0.0018 cls_loss_mapping 0.0042 cls_loss_causal 0.5496 re_mapping 0.0060 re_causal 0.0174 /// teacc 98.97 lr 0.00010000 Epoch 142, weight, value: tensor([[-0.1488, -0.1438, 0.0823, ..., -0.0685, 0.0432, 0.0307], [-0.0892, -0.0338, -0.0593, ..., -0.0769, -0.0836, -0.0354], [ 0.0308, -0.0758, -0.0563, ..., -0.0601, 0.0146, -0.2555], ..., [-0.1118, 0.1054, 0.0052, ..., 0.1055, -0.0223, -0.1028], [-0.0836, -0.0765, 0.1012, ..., -0.0616, -0.1352, 0.1068], [ 0.0054, -0.1770, 0.0980, ..., 0.0622, -0.1524, -0.0754]], device='cuda:0'), grad: tensor([[ 8.6846e-08, 2.9919e-07, -4.2003e-07, ..., 7.9861e-08, 3.0035e-08, -5.3830e-07], [ 5.9139e-08, 3.7788e-07, 1.7486e-07, ..., 6.6776e-07, 4.6566e-09, -2.0415e-06], [ 6.0303e-08, 6.7614e-07, 1.6554e-07, ..., 4.6100e-08, 5.1921e-08, 1.7229e-06], ..., [ 1.0966e-07, 1.2806e-08, 4.1444e-07, ..., 3.3900e-07, 9.0804e-09, 1.5786e-07], [ 5.0105e-07, 4.7660e-07, 3.4412e-07, ..., 8.3726e-07, 8.8476e-09, 1.7649e-07], [-1.8626e-09, 2.0070e-07, -1.8701e-06, ..., -1.1660e-06, 1.2806e-08, 2.9500e-07]], device='cuda:0') Epoch 142, bias, value: tensor([-0.0160, -0.0197, -0.0264, -0.0260, -0.0072, 0.0046, 0.0110, -0.0193, -0.0043, 0.0004], device='cuda:0'), grad: tensor([-1.2387e-06, -4.2394e-06, 3.8818e-06, -1.9401e-05, -3.5502e-06, 2.5600e-05, -7.0259e-06, 3.0994e-06, 3.5334e-06, -6.5099e-07], device='cuda:0') 100 0.0001 changing lr epoch 141, time 220.51, cls_loss 0.0022 cls_loss_mapping 0.0053 cls_loss_causal 0.5694 re_mapping 0.0057 re_causal 0.0169 /// teacc 99.00 lr 0.00010000 Epoch 143, weight, value: tensor([[-0.1484, -0.1441, 0.0825, ..., -0.0693, 0.0432, 0.0315], [-0.0892, -0.0338, -0.0596, ..., -0.0773, -0.0830, -0.0351], [ 0.0305, -0.0760, -0.0572, ..., -0.0605, 0.0147, -0.2572], ..., [-0.1129, 0.1056, 0.0052, ..., 0.1060, -0.0225, -0.1030], [-0.0842, -0.0769, 0.1018, ..., -0.0621, -0.1354, 0.1072], [ 0.0059, -0.1775, 0.0986, ..., 0.0624, -0.1528, -0.0759]], device='cuda:0'), grad: tensor([[ 4.5449e-06, 5.8208e-08, -1.7986e-05, ..., 1.3039e-08, 3.0757e-07, -3.5278e-06], [ 9.6299e-07, 3.1758e-07, 2.8941e-07, ..., 1.6554e-07, 1.3760e-07, 1.3057e-06], [ 6.8806e-06, 1.7323e-07, 3.3923e-07, ..., 2.6776e-08, 4.4191e-07, 9.1940e-06], ..., [ 8.0653e-07, 3.7951e-08, 2.7427e-07, ..., 5.9139e-08, 1.7323e-07, 1.1912e-06], [ 6.6273e-06, 4.7963e-07, -4.4703e-07, ..., 5.7276e-08, 3.4738e-07, 7.1190e-06], [ 3.9418e-07, 2.3586e-07, 1.6645e-05, ..., -9.9884e-08, 5.9372e-08, 9.6187e-06]], device='cuda:0') Epoch 143, bias, value: tensor([-0.0155, -0.0186, -0.0284, -0.0257, -0.0074, 0.0039, 0.0104, -0.0193, -0.0042, 0.0006], device='cuda:0'), grad: tensor([-2.9385e-05, 6.9253e-06, 1.8850e-05, 1.3113e-05, 7.6219e-06, 5.0992e-05, -1.4865e-04, 4.0233e-06, 2.8133e-05, 4.8429e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 142---------------------------------------------------- epoch 142, time 221.18, cls_loss 0.0023 cls_loss_mapping 0.0047 cls_loss_causal 0.5552 re_mapping 0.0061 re_causal 0.0165 /// teacc 99.08 lr 0.00010000 Epoch 144, weight, value: tensor([[-0.1486, -0.1444, 0.0836, ..., -0.0695, 0.0435, 0.0335], [-0.0907, -0.0339, -0.0598, ..., -0.0777, -0.0836, -0.0350], [ 0.0307, -0.0762, -0.0579, ..., -0.0608, 0.0153, -0.2580], ..., [-0.1140, 0.1058, 0.0052, ..., 0.1062, -0.0223, -0.1033], [-0.0847, -0.0778, 0.1021, ..., -0.0624, -0.1357, 0.1077], [ 0.0061, -0.1776, 0.0988, ..., 0.0627, -0.1536, -0.0768]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 8.9640e-08, -2.4051e-07, ..., 4.5868e-08, 1.3504e-08, -4.4773e-07], [ 9.3132e-10, 2.4796e-07, 1.4366e-07, ..., 9.8022e-08, 9.3132e-09, -8.0280e-07], [ 2.3283e-10, 1.8510e-07, 1.1316e-07, ..., 6.1700e-08, -5.8440e-08, 7.4971e-08], ..., [ 7.4506e-09, -8.2422e-08, 1.4878e-07, ..., 1.0454e-07, 1.2573e-08, 2.4354e-07], [ 4.1910e-09, 1.0012e-06, 3.5088e-07, ..., 3.3528e-08, 2.5611e-09, -5.6811e-08], [-9.3132e-09, 1.5059e-06, 3.4343e-07, ..., -2.2002e-07, 3.2596e-09, 5.3784e-08]], device='cuda:0') Epoch 144, bias, value: tensor([-0.0133, -0.0182, -0.0286, -0.0243, -0.0076, 0.0024, 0.0089, -0.0195, -0.0043, 0.0005], device='cuda:0'), grad: tensor([-1.9115e-07, -1.6093e-05, -8.3167e-07, -6.1207e-06, 1.2666e-05, 1.4547e-06, -3.5251e-07, 4.0904e-06, 2.3544e-06, 3.0380e-06], device='cuda:0') 100 0.0001 changing lr epoch 143, time 220.33, cls_loss 0.0020 cls_loss_mapping 0.0051 cls_loss_causal 0.5599 re_mapping 0.0061 re_causal 0.0173 /// teacc 98.94 lr 0.00010000 Epoch 145, weight, value: tensor([[-0.1490, -0.1447, 0.0836, ..., -0.0698, 0.0434, 0.0327], [-0.0919, -0.0336, -0.0596, ..., -0.0789, -0.0838, -0.0346], [ 0.0307, -0.0766, -0.0584, ..., -0.0612, 0.0155, -0.2588], ..., [-0.1148, 0.1062, 0.0047, ..., 0.1060, -0.0224, -0.1036], [-0.0851, -0.0784, 0.1022, ..., -0.0631, -0.1358, 0.1079], [ 0.0063, -0.1787, 0.0996, ..., 0.0632, -0.1540, -0.0769]], device='cuda:0'), grad: tensor([[-5.3085e-08, 6.0769e-07, -9.1851e-05, ..., -9.5591e-06, 6.9849e-09, -1.9923e-05], [ 8.3819e-09, 2.8722e-06, 1.6559e-06, ..., 1.4286e-06, 6.0536e-09, 3.5670e-07], [ 1.1642e-08, 1.4370e-06, 2.2948e-06, ..., 2.0629e-07, -3.6787e-08, 1.3607e-06], ..., [ 1.6298e-08, -1.8269e-05, -6.9886e-06, ..., -1.3068e-05, 6.0536e-09, 3.6834e-07], [ 1.2107e-07, 4.0559e-07, 4.0233e-06, ..., 1.5181e-07, 3.2596e-09, 3.7495e-06], [ 7.5903e-08, 6.7502e-06, 8.3506e-05, ..., 1.3299e-05, 3.2596e-09, 1.2971e-05]], device='cuda:0') Epoch 145, bias, value: tensor([-0.0143, -0.0177, -0.0286, -0.0249, -0.0075, 0.0027, 0.0095, -0.0199, -0.0046, 0.0008], device='cuda:0'), grad: tensor([-1.8263e-04, 6.3144e-06, 7.5623e-06, 9.8944e-06, 2.9113e-06, 6.7167e-06, -6.3553e-06, -2.8014e-05, 1.1906e-05, 1.7190e-04], device='cuda:0') 100 0.0001 changing lr epoch 144, time 220.51, cls_loss 0.0018 cls_loss_mapping 0.0045 cls_loss_causal 0.5484 re_mapping 0.0064 re_causal 0.0172 /// teacc 98.94 lr 0.00010000 Epoch 146, weight, value: tensor([[-0.1493, -0.1450, 0.0839, ..., -0.0700, 0.0434, 0.0328], [-0.0935, -0.0332, -0.0590, ..., -0.0794, -0.0839, -0.0341], [ 0.0304, -0.0771, -0.0590, ..., -0.0618, 0.0155, -0.2597], ..., [-0.1170, 0.1063, 0.0043, ..., 0.1065, -0.0225, -0.1045], [-0.0856, -0.0785, 0.1025, ..., -0.0633, -0.1359, 0.1082], [ 0.0072, -0.1797, 0.1001, ..., 0.0632, -0.1542, -0.0770]], device='cuda:0'), grad: tensor([[ 1.0710e-08, 7.8185e-07, 3.5577e-07, ..., 6.2445e-07, 4.6566e-10, 2.1840e-07], [ 3.2596e-09, 1.2191e-06, 1.0617e-06, ..., 1.0058e-06, 0.0000e+00, 1.7267e-06], [ 2.7940e-09, 1.2526e-07, 1.0394e-06, ..., 6.0536e-08, -7.4506e-09, 3.1926e-06], ..., [ 5.1223e-09, -3.5428e-06, -1.5981e-06, ..., -2.9095e-06, 4.1910e-09, -4.4517e-07], [ 2.7008e-08, 8.2888e-08, -1.9670e-06, ..., 4.3306e-08, 1.3970e-09, -6.0275e-06], [ 5.1223e-09, 7.8743e-07, 3.6415e-07, ..., 6.3982e-07, 0.0000e+00, 1.8114e-07]], device='cuda:0') Epoch 146, bias, value: tensor([-0.0143, -0.0173, -0.0287, -0.0251, -0.0074, 0.0026, 0.0099, -0.0204, -0.0044, 0.0009], device='cuda:0'), grad: tensor([ 1.7118e-06, 5.2936e-06, 7.0184e-06, 1.9837e-06, 3.8929e-07, 8.5309e-07, -3.4738e-07, -5.8711e-06, -1.2733e-05, 1.6540e-06], device='cuda:0') 100 0.0001 changing lr epoch 145, time 220.16, cls_loss 0.0015 cls_loss_mapping 0.0044 cls_loss_causal 0.5361 re_mapping 0.0061 re_causal 0.0176 /// teacc 99.04 lr 0.00010000 Epoch 147, weight, value: tensor([[-0.1492, -0.1454, 0.0840, ..., -0.0704, 0.0435, 0.0333], [-0.0935, -0.0333, -0.0593, ..., -0.0800, -0.0841, -0.0338], [ 0.0303, -0.0776, -0.0595, ..., -0.0625, 0.0158, -0.2602], ..., [-0.1183, 0.1068, 0.0043, ..., 0.1070, -0.0224, -0.1046], [-0.0863, -0.0789, 0.1021, ..., -0.0642, -0.1360, 0.1082], [ 0.0077, -0.1802, 0.1007, ..., 0.0633, -0.1545, -0.0770]], device='cuda:0'), grad: tensor([[ 3.0780e-07, 1.1874e-07, 1.4575e-07, ..., 9.4902e-07, 0.0000e+00, 1.8766e-07], [ 2.3469e-06, 7.5623e-07, 5.6578e-07, ..., 7.5549e-06, 0.0000e+00, 2.1840e-07], [ 3.6834e-07, 1.3895e-06, 5.3085e-07, ..., 2.7996e-06, 0.0000e+00, 1.1362e-07], ..., [ 8.8215e-06, -3.6210e-06, -1.9372e-07, ..., 2.0310e-05, 0.0000e+00, 1.3085e-07], [ 3.4319e-07, 3.3993e-08, -8.2003e-07, ..., 9.2294e-07, 0.0000e+00, -2.6524e-06], [ 1.8311e-04, 9.4157e-07, 1.8403e-05, ..., 5.1737e-04, 0.0000e+00, 1.3411e-07]], device='cuda:0') Epoch 147, bias, value: tensor([-0.0141, -0.0173, -0.0288, -0.0251, -0.0073, 0.0024, 0.0097, -0.0202, -0.0048, 0.0011], device='cuda:0'), grad: tensor([ 2.9057e-06, 1.9938e-05, 7.3835e-06, 1.5832e-06, -1.4210e-03, 2.0768e-06, 3.5018e-06, 5.3018e-05, -2.0731e-06, 1.3332e-03], device='cuda:0') 100 0.0001 changing lr epoch 146, time 220.19, cls_loss 0.0021 cls_loss_mapping 0.0047 cls_loss_causal 0.5633 re_mapping 0.0056 re_causal 0.0161 /// teacc 99.02 lr 0.00010000 Epoch 148, weight, value: tensor([[-0.1511, -0.1460, 0.0841, ..., -0.0708, 0.0435, 0.0329], [-0.0925, -0.0338, -0.0599, ..., -0.0818, -0.0841, -0.0339], [ 0.0298, -0.0782, -0.0603, ..., -0.0633, 0.0159, -0.2619], ..., [-0.1189, 0.1077, 0.0045, ..., 0.1081, -0.0225, -0.1045], [-0.0874, -0.0793, 0.1025, ..., -0.0648, -0.1362, 0.1085], [ 0.0076, -0.1812, 0.1013, ..., 0.0629, -0.1547, -0.0776]], device='cuda:0'), grad: tensor([[ 2.2817e-07, 1.1083e-06, 4.8801e-07, ..., 2.4680e-07, -8.3819e-09, 7.7020e-07], [ 1.8626e-07, 2.1920e-05, 2.2016e-06, ..., 3.2969e-06, 9.3132e-10, 2.4252e-06], [ 3.2131e-08, 5.8711e-06, 1.0803e-06, ..., 1.1874e-06, 4.6566e-10, 6.0583e-07], ..., [ 1.2759e-07, -6.0320e-05, -2.5444e-06, ..., -1.0878e-05, 4.6566e-10, 5.2527e-07], [ 1.4901e-06, 1.4175e-06, -5.3495e-06, ..., 1.4948e-07, 4.6566e-10, -5.7369e-06], [ 1.1874e-07, 8.0094e-06, 1.6484e-06, ..., 1.7565e-06, 1.8626e-09, 1.4650e-06]], device='cuda:0') Epoch 148, bias, value: tensor([-0.0147, -0.0176, -0.0287, -0.0255, -0.0071, 0.0030, 0.0095, -0.0197, -0.0051, 0.0007], device='cuda:0'), grad: tensor([ 1.1586e-05, 4.5478e-05, -1.0423e-05, 3.0845e-05, 8.0690e-06, -1.3575e-05, 2.0996e-05, -1.0127e-04, -9.5367e-06, 1.7703e-05], device='cuda:0') 100 0.0001 changing lr epoch 147, time 220.29, cls_loss 0.0018 cls_loss_mapping 0.0046 cls_loss_causal 0.5142 re_mapping 0.0059 re_causal 0.0163 /// teacc 99.02 lr 0.00010000 Epoch 149, weight, value: tensor([[-0.1518, -0.1464, 0.0845, ..., -0.0710, 0.0438, 0.0326], [-0.0910, -0.0340, -0.0603, ..., -0.0825, -0.0845, -0.0333], [ 0.0308, -0.0785, -0.0605, ..., -0.0639, 0.0159, -0.2603], ..., [-0.1195, 0.1079, 0.0042, ..., 0.1084, -0.0224, -0.1047], [-0.0881, -0.0796, 0.1026, ..., -0.0654, -0.1363, 0.1083], [ 0.0085, -0.1816, 0.1024, ..., 0.0634, -0.1552, -0.0768]], device='cuda:0'), grad: tensor([[ 2.1933e-07, 2.0955e-08, -1.8626e-09, ..., 4.2841e-08, 0.0000e+00, 1.9511e-07], [ 5.9139e-07, 3.1712e-07, 4.1956e-07, ..., 3.0082e-07, 0.0000e+00, 9.9186e-07], [ 1.0012e-07, 2.6263e-07, 7.3109e-08, ..., -1.7462e-07, 0.0000e+00, 1.3597e-07], ..., [ 4.5542e-07, -3.9348e-07, 3.9395e-07, ..., -1.4901e-08, 0.0000e+00, 5.3365e-07], [ 1.6205e-06, 2.8359e-07, 5.3458e-07, ..., 6.4215e-07, 0.0000e+00, 6.6310e-07], [ 3.0287e-06, 2.4959e-07, -5.5097e-06, ..., -2.5257e-06, 0.0000e+00, 3.4589e-06]], device='cuda:0') Epoch 149, bias, value: tensor([-0.0153, -0.0176, -0.0278, -0.0251, -0.0080, 0.0026, 0.0096, -0.0197, -0.0060, 0.0014], device='cuda:0'), grad: tensor([ 6.4820e-07, 2.8387e-06, -2.0899e-06, 7.2010e-06, 4.8056e-06, -1.8701e-05, 7.9861e-07, 2.4028e-06, 4.7348e-06, -2.6729e-06], device='cuda:0') 100 0.0001 changing lr epoch 148, time 220.19, cls_loss 0.0019 cls_loss_mapping 0.0059 cls_loss_causal 0.5536 re_mapping 0.0058 re_causal 0.0165 /// teacc 98.97 lr 0.00010000 Epoch 150, weight, value: tensor([[-0.1520, -0.1466, 0.0847, ..., -0.0713, 0.0452, 0.0330], [-0.0904, -0.0346, -0.0622, ..., -0.0831, -0.0845, -0.0341], [ 0.0310, -0.0790, -0.0612, ..., -0.0649, 0.0158, -0.2610], ..., [-0.1209, 0.1087, 0.0051, ..., 0.1090, -0.0225, -0.1042], [-0.0885, -0.0797, 0.1033, ..., -0.0652, -0.1366, 0.1089], [ 0.0086, -0.1823, 0.1027, ..., 0.0629, -0.1559, -0.0770]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 7.5437e-08, -1.7695e-08, ..., 4.2841e-08, 5.1223e-09, -3.4925e-08], [ 8.3819e-09, 3.3667e-07, 1.0105e-07, ..., 9.3598e-08, 1.9092e-08, -3.9116e-08], [ 1.3970e-09, 6.2305e-07, 3.6322e-07, ..., 1.0245e-07, 1.9558e-08, 7.0315e-07], ..., [ 5.5414e-08, 2.3004e-07, 4.3819e-07, ..., 2.2491e-07, 3.5390e-08, 2.8126e-07], [ 4.4703e-08, 2.5239e-07, -5.8580e-07, ..., 1.4901e-07, 2.6077e-08, -1.7844e-06], [-2.8824e-07, 3.3807e-07, -2.4419e-06, ..., -1.8105e-06, 7.9162e-09, -2.5658e-07]], device='cuda:0') Epoch 150, bias, value: tensor([-0.0149, -0.0184, -0.0278, -0.0247, -0.0076, 0.0017, 0.0099, -0.0189, -0.0056, 0.0009], device='cuda:0'), grad: tensor([ 1.1642e-07, 4.9267e-07, 2.0750e-06, -1.0878e-05, 2.9895e-06, 8.0168e-06, 2.2631e-07, 2.0079e-06, -1.1558e-06, -3.8892e-06], device='cuda:0') 100 0.0001 changing lr epoch 149, time 220.13, cls_loss 0.0020 cls_loss_mapping 0.0046 cls_loss_causal 0.5281 re_mapping 0.0056 re_causal 0.0164 /// teacc 99.08 lr 0.00010000 Epoch 151, weight, value: tensor([[-0.1529, -0.1473, 0.0848, ..., -0.0715, 0.0452, 0.0330], [-0.0906, -0.0356, -0.0637, ..., -0.0850, -0.0846, -0.0360], [ 0.0310, -0.0797, -0.0616, ..., -0.0654, 0.0158, -0.2620], ..., [-0.1232, 0.1100, 0.0063, ..., 0.1101, -0.0225, -0.1026], [-0.0892, -0.0796, 0.1039, ..., -0.0650, -0.1366, 0.1098], [ 0.0087, -0.1830, 0.1029, ..., 0.0626, -0.1561, -0.0770]], device='cuda:0'), grad: tensor([[ 1.6764e-08, 6.7055e-08, -1.3784e-07, ..., 4.0978e-08, 1.3970e-09, -1.3830e-07], [ 2.1141e-07, 2.5034e-06, 2.0824e-06, ..., 1.0571e-06, -8.5682e-08, 4.8429e-08], [ 7.9162e-09, -2.7986e-07, 1.1455e-07, ..., 2.3749e-08, 2.7940e-09, 9.4064e-08], ..., [ 6.2399e-08, -4.7348e-06, -3.0156e-06, ..., -1.4286e-06, 7.4040e-08, 2.1886e-07], [ 1.9930e-07, 2.9290e-07, 1.4622e-07, ..., 2.7241e-07, 4.6566e-10, -4.7637e-07], [-7.4366e-07, 2.4913e-07, -1.4212e-06, ..., -9.6764e-07, 2.3283e-09, -2.7847e-07]], device='cuda:0') Epoch 151, bias, value: tensor([-0.0149, -0.0192, -0.0279, -0.0252, -0.0075, 0.0023, 0.0094, -0.0179, -0.0049, 0.0005], device='cuda:0'), grad: tensor([-1.8161e-08, 5.5842e-06, -1.0118e-05, 6.9551e-06, 8.8662e-07, 1.8999e-07, -4.0326e-07, -6.2957e-06, 5.1409e-06, -1.9558e-06], device='cuda:0') 100 0.0001 changing lr epoch 150, time 220.57, cls_loss 0.0017 cls_loss_mapping 0.0031 cls_loss_causal 0.5114 re_mapping 0.0057 re_causal 0.0162 /// teacc 98.97 lr 0.00010000 Epoch 152, weight, value: tensor([[-0.1540, -0.1478, 0.0851, ..., -0.0716, 0.0458, 0.0327], [-0.0914, -0.0357, -0.0648, ..., -0.0854, -0.0845, -0.0373], [ 0.0307, -0.0800, -0.0622, ..., -0.0658, 0.0158, -0.2627], ..., [-0.1242, 0.1101, 0.0064, ..., 0.1104, -0.0226, -0.1028], [-0.0898, -0.0794, 0.1056, ..., -0.0653, -0.1367, 0.1113], [ 0.0089, -0.1834, 0.1034, ..., 0.0624, -0.1566, -0.0770]], device='cuda:0'), grad: tensor([[ 1.1642e-08, 1.4435e-08, 1.8254e-07, ..., 1.9558e-07, 0.0000e+00, 2.0815e-07], [ 4.6566e-09, 2.2398e-07, -8.5160e-06, ..., 2.1048e-07, 0.0000e+00, -6.6422e-06], [ 1.8626e-09, 1.8813e-07, 1.7742e-07, ..., 1.3597e-07, 0.0000e+00, 1.5413e-07], ..., [ 1.8626e-08, -3.7206e-07, 4.4405e-06, ..., 5.9791e-07, 0.0000e+00, 3.4012e-06], [ 1.5832e-08, 5.8673e-08, -3.3760e-07, ..., 2.2678e-07, 0.0000e+00, -1.0012e-06], [-1.5367e-08, 9.4995e-08, 2.9095e-06, ..., 3.7365e-06, 0.0000e+00, 2.8089e-06]], device='cuda:0') Epoch 152, bias, value: tensor([-0.0150, -0.0200, -0.0280, -0.0255, -0.0077, 0.0027, 0.0096, -0.0180, -0.0030, 0.0008], device='cuda:0'), grad: tensor([ 2.8647e-06, -7.4625e-05, -4.6752e-07, 1.3327e-06, -1.1809e-05, 3.3760e-07, 4.8010e-07, 4.0233e-05, 1.1846e-06, 4.0561e-05], device='cuda:0') 100 0.0001 changing lr epoch 151, time 220.41, cls_loss 0.0018 cls_loss_mapping 0.0044 cls_loss_causal 0.5545 re_mapping 0.0057 re_causal 0.0165 /// teacc 98.97 lr 0.00010000 Epoch 153, weight, value: tensor([[-0.1542, -0.1483, 0.0851, ..., -0.0719, 0.0458, 0.0310], [-0.0919, -0.0357, -0.0652, ..., -0.0860, -0.0845, -0.0376], [ 0.0310, -0.0803, -0.0627, ..., -0.0662, 0.0162, -0.2633], ..., [-0.1249, 0.1101, 0.0060, ..., 0.1116, -0.0227, -0.1030], [-0.0911, -0.0798, 0.1048, ..., -0.0657, -0.1370, 0.1114], [ 0.0096, -0.1839, 0.1047, ..., 0.0626, -0.1567, -0.0758]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.9558e-08, -3.3993e-08, ..., 9.3132e-09, 0.0000e+00, 4.3772e-08], [ 0.0000e+00, 2.7381e-07, 2.2352e-08, ..., 1.0384e-07, 0.0000e+00, -3.6787e-08], [ 0.0000e+00, 5.6811e-08, 8.8476e-09, ..., 3.7253e-09, -9.3132e-10, 8.2888e-08], ..., [ 4.6566e-10, -2.9244e-07, -4.2375e-08, ..., -2.4633e-07, 4.6566e-10, 4.7497e-08], [ 4.6566e-10, 2.7148e-07, 3.4459e-08, ..., 1.0710e-08, 0.0000e+00, 1.7229e-07], [-6.9849e-09, 4.6007e-07, 3.2596e-09, ..., 1.3784e-07, 0.0000e+00, 4.7497e-08]], device='cuda:0') Epoch 153, bias, value: tensor([-0.0166, -0.0200, -0.0280, -0.0253, -0.0083, 0.0026, 0.0105, -0.0179, -0.0032, 0.0012], device='cuda:0'), grad: tensor([ 3.7206e-07, 5.7369e-07, 3.7672e-07, -3.3025e-06, 1.0312e-05, 2.5108e-06, -1.3031e-05, -2.1979e-07, 1.0477e-06, 1.3197e-06], device='cuda:0') 100 0.0001 changing lr epoch 152, time 220.43, cls_loss 0.0017 cls_loss_mapping 0.0041 cls_loss_causal 0.5196 re_mapping 0.0057 re_causal 0.0156 /// teacc 99.08 lr 0.00010000 Epoch 154, weight, value: tensor([[-0.1550, -0.1499, 0.0854, ..., -0.0722, 0.0460, 0.0310], [-0.0917, -0.0357, -0.0648, ..., -0.0868, -0.0844, -0.0375], [ 0.0308, -0.0807, -0.0632, ..., -0.0667, 0.0166, -0.2637], ..., [-0.1257, 0.1105, 0.0057, ..., 0.1124, -0.0227, -0.1034], [-0.0921, -0.0804, 0.1049, ..., -0.0662, -0.1374, 0.1115], [ 0.0092, -0.1848, 0.1048, ..., 0.0625, -0.1572, -0.0763]], device='cuda:0'), grad: tensor([[ 1.3085e-07, 1.2983e-06, -4.8662e-07, ..., 7.1200e-07, -1.3970e-08, -9.2667e-07], [ 8.9873e-08, 5.5246e-06, 8.1956e-07, ..., 1.9027e-06, 9.3132e-10, 4.6566e-08], [ 3.2596e-08, 1.7256e-05, 2.0303e-06, ..., 5.1223e-06, 4.6566e-10, 1.3877e-07], ..., [ 4.6333e-07, -3.1447e-04, -3.4004e-05, ..., -8.9347e-05, 4.6566e-10, -3.7998e-07], [ 1.8161e-07, 8.8587e-06, 1.0077e-06, ..., 2.6189e-06, 4.6566e-10, 1.7462e-07], [-6.8545e-07, 2.0918e-06, -2.2929e-06, ..., -3.1348e-06, 4.1910e-09, 4.7544e-07]], device='cuda:0') Epoch 154, bias, value: tensor([-0.0168, -0.0198, -0.0278, -0.0254, -0.0084, 0.0033, 0.0103, -0.0180, -0.0035, 0.0009], device='cuda:0'), grad: tensor([-1.6354e-06, 9.9167e-06, 2.1890e-05, 3.9911e-04, 2.1234e-06, 3.5465e-06, 4.7311e-06, -4.5085e-04, 1.3858e-05, -3.3360e-06], device='cuda:0') 100 0.0001 changing lr epoch 153, time 220.69, cls_loss 0.0019 cls_loss_mapping 0.0040 cls_loss_causal 0.5296 re_mapping 0.0059 re_causal 0.0163 /// teacc 98.97 lr 0.00010000 Epoch 155, weight, value: tensor([[-0.1563, -0.1501, 0.0859, ..., -0.0724, 0.0462, 0.0326], [-0.0920, -0.0360, -0.0650, ..., -0.0875, -0.0846, -0.0372], [ 0.0303, -0.0805, -0.0636, ..., -0.0672, 0.0168, -0.2646], ..., [-0.1267, 0.1110, 0.0056, ..., 0.1128, -0.0228, -0.1038], [-0.0937, -0.0812, 0.1052, ..., -0.0669, -0.1375, 0.1121], [ 0.0089, -0.1852, 0.1054, ..., 0.0622, -0.1577, -0.0771]], device='cuda:0'), grad: tensor([[ 6.6590e-08, 4.0652e-07, 2.7008e-08, ..., 9.4529e-08, 3.4459e-08, 4.6566e-08], [ 2.3283e-08, 1.2815e-06, 2.1793e-07, ..., 5.9558e-07, 6.8452e-08, -2.9383e-07], [-1.1409e-07, 1.4119e-06, 9.6392e-08, ..., -4.3400e-07, -1.9046e-07, 4.2841e-08], ..., [ 8.9407e-08, -1.8999e-06, -9.4529e-07, ..., -2.4885e-06, 3.5623e-07, 2.1607e-07], [ 9.0804e-08, 5.7742e-07, 6.1002e-08, ..., 2.6030e-07, 8.8476e-08, 9.3598e-08], [ 1.2945e-07, 2.1774e-06, 3.5251e-07, ..., 8.9640e-07, 8.8476e-09, 1.2200e-07]], device='cuda:0') Epoch 155, bias, value: tensor([-0.0150, -0.0200, -0.0271, -0.0258, -0.0079, 0.0056, 0.0063, -0.0181, -0.0031, 0.0004], device='cuda:0'), grad: tensor([ 8.8096e-05, 2.1607e-06, 4.9263e-05, -1.7500e-04, 1.3448e-06, 8.6054e-06, 1.3314e-05, 1.6280e-06, 4.6678e-06, 5.8115e-06], device='cuda:0') 100 0.0001 changing lr epoch 154, time 220.77, cls_loss 0.0014 cls_loss_mapping 0.0042 cls_loss_causal 0.5198 re_mapping 0.0057 re_causal 0.0163 /// teacc 98.86 lr 0.00010000 Epoch 156, weight, value: tensor([[-0.1566, -0.1506, 0.0857, ..., -0.0725, 0.0461, 0.0323], [-0.0920, -0.0361, -0.0651, ..., -0.0878, -0.0845, -0.0372], [ 0.0302, -0.0801, -0.0639, ..., -0.0671, 0.0171, -0.2650], ..., [-0.1271, 0.1111, 0.0057, ..., 0.1129, -0.0232, -0.1039], [-0.0950, -0.0824, 0.1054, ..., -0.0673, -0.1375, 0.1122], [ 0.0089, -0.1857, 0.1058, ..., 0.0622, -0.1578, -0.0772]], device='cuda:0'), grad: tensor([[ 8.0373e-07, 2.3888e-07, 2.7381e-06, ..., 2.7344e-06, -1.1176e-08, 3.3248e-07], [ 2.8033e-07, 3.5623e-07, 1.0515e-06, ..., 1.0645e-06, 5.5879e-09, -6.6077e-07], [ 6.7335e-07, -2.0862e-07, 2.4289e-06, ..., 4.0904e-06, -7.4506e-09, 1.4855e-07], ..., [ 2.5593e-06, -9.5926e-08, 9.0897e-06, ..., -6.0238e-06, 3.7253e-09, 2.4633e-07], [ 7.9023e-07, 2.4959e-07, 1.2200e-06, ..., 1.1511e-06, 9.3132e-10, 9.4855e-07], [-7.1637e-06, 4.8801e-07, -2.6867e-05, ..., -2.4378e-05, 2.3283e-09, 1.4529e-07]], device='cuda:0') Epoch 156, bias, value: tensor([-0.0154, -0.0199, -0.0267, -0.0252, -0.0076, 0.0057, 0.0062, -0.0183, -0.0034, 0.0001], device='cuda:0'), grad: tensor([ 1.4357e-05, 3.8669e-06, 1.0364e-05, 4.8280e-06, 5.8591e-05, 4.6007e-06, -3.4813e-06, 8.0317e-06, 7.1526e-06, -1.0818e-04], device='cuda:0') 100 0.0001 changing lr epoch 155, time 220.52, cls_loss 0.0026 cls_loss_mapping 0.0048 cls_loss_causal 0.5196 re_mapping 0.0057 re_causal 0.0152 /// teacc 99.08 lr 0.00010000 Epoch 157, weight, value: tensor([[-0.1570, -0.1512, 0.0850, ..., -0.0739, 0.0460, 0.0322], [-0.0922, -0.0363, -0.0662, ..., -0.0890, -0.0849, -0.0372], [ 0.0303, -0.0827, -0.0652, ..., -0.0689, 0.0168, -0.2659], ..., [-0.1279, 0.1119, 0.0033, ..., 0.1126, -0.0234, -0.1041], [-0.0957, -0.0831, 0.1071, ..., -0.0667, -0.1377, 0.1127], [ 0.0097, -0.1869, 0.1087, ..., 0.0632, -0.1584, -0.0790]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 7.1712e-08, -1.3951e-06, ..., 1.2852e-07, 0.0000e+00, -1.5711e-06], [ 1.3039e-08, 7.9442e-07, 3.4273e-07, ..., 3.2503e-07, 9.3132e-10, -2.8126e-07], [ 2.7940e-09, 5.1595e-07, 1.7602e-07, ..., 2.9802e-07, -5.5879e-09, 1.1269e-07], ..., [ 5.8673e-08, -1.5749e-06, 4.8708e-07, ..., -2.0396e-07, 2.7940e-09, 2.5053e-07], [ 4.0978e-08, 4.0699e-07, -3.2503e-07, ..., 3.5204e-07, 9.3132e-10, -9.9279e-07], [-1.7975e-07, 2.7101e-07, -5.5850e-05, ..., -7.2002e-05, 0.0000e+00, 7.1526e-07]], device='cuda:0') Epoch 157, bias, value: tensor([-0.0159, -0.0203, -0.0279, -0.0244, -0.0077, 0.0053, 0.0067, -0.0193, -0.0033, 0.0021], device='cuda:0'), grad: tensor([-4.9621e-06, 1.1409e-06, -1.6075e-06, 7.1712e-07, 1.5318e-04, 2.9989e-06, 2.2687e-06, 1.0040e-06, 4.2468e-07, -1.5497e-04], device='cuda:0') 100 0.0001 changing lr epoch 156, time 220.57, cls_loss 0.0015 cls_loss_mapping 0.0032 cls_loss_causal 0.5212 re_mapping 0.0055 re_causal 0.0158 /// teacc 98.98 lr 0.00010000 Epoch 158, weight, value: tensor([[-0.1573, -0.1516, 0.0852, ..., -0.0742, 0.0461, 0.0323], [-0.0924, -0.0365, -0.0664, ..., -0.0894, -0.0850, -0.0372], [ 0.0302, -0.0833, -0.0663, ..., -0.0700, 0.0170, -0.2665], ..., [-0.1283, 0.1125, 0.0035, ..., 0.1136, -0.0235, -0.1041], [-0.0960, -0.0838, 0.1077, ..., -0.0669, -0.1378, 0.1129], [ 0.0090, -0.1875, 0.1087, ..., 0.0625, -0.1587, -0.0797]], device='cuda:0'), grad: tensor([[ 6.5193e-09, 1.5832e-08, -7.4506e-08, ..., 9.2201e-08, -6.5193e-09, 2.0310e-05], [ 2.2352e-08, 7.2643e-08, 1.5553e-07, ..., 1.8999e-07, 7.4506e-09, 1.2666e-07], [ 1.8626e-09, -1.5553e-07, 2.7008e-08, ..., 1.3039e-07, -4.3027e-07, 7.9162e-08], ..., [ 5.6811e-08, 2.6822e-07, 1.1986e-06, ..., 1.2424e-06, 4.8429e-08, 7.0781e-08], [ 8.6613e-08, 2.3376e-07, 3.3807e-07, ..., 3.2131e-07, 2.2631e-07, 1.6019e-07], [-4.1537e-07, -3.3155e-07, -3.4701e-06, ..., 5.8115e-07, 1.8626e-09, 1.2200e-07]], device='cuda:0') Epoch 158, bias, value: tensor([-0.0160, -0.0204, -0.0279, -0.0244, -0.0073, 0.0053, 0.0070, -0.0189, -0.0034, 0.0015], device='cuda:0'), grad: tensor([ 4.4823e-05, 1.0058e-06, -4.2319e-06, 1.6624e-06, -9.1344e-06, 1.5572e-05, -5.9336e-05, 4.4592e-06, 3.3639e-06, 1.8319e-06], device='cuda:0') 100 0.0001 changing lr epoch 157, time 220.31, cls_loss 0.0020 cls_loss_mapping 0.0043 cls_loss_causal 0.5309 re_mapping 0.0055 re_causal 0.0152 /// teacc 99.05 lr 0.00010000 Epoch 159, weight, value: tensor([[-0.1576, -0.1520, 0.0853, ..., -0.0745, 0.0462, 0.0324], [-0.0928, -0.0365, -0.0665, ..., -0.0901, -0.0849, -0.0368], [ 0.0301, -0.0843, -0.0670, ..., -0.0715, 0.0171, -0.2671], ..., [-0.1294, 0.1132, 0.0026, ..., 0.1139, -0.0236, -0.1046], [-0.0972, -0.0843, 0.1078, ..., -0.0686, -0.1379, 0.1129], [ 0.0092, -0.1890, 0.1100, ..., 0.0635, -0.1589, -0.0806]], device='cuda:0'), grad: tensor([[ 1.2107e-08, 1.4249e-07, -1.0710e-07, ..., 1.5832e-08, 0.0000e+00, 1.3784e-07], [ 1.9651e-07, 8.4843e-07, 2.2631e-07, ..., 4.2003e-07, 0.0000e+00, 4.1910e-07], [ 6.5193e-09, 1.7416e-07, 9.7789e-08, ..., 5.8673e-08, -1.3039e-08, 1.2014e-07], ..., [ 4.0047e-08, -1.9651e-07, -3.1013e-07, ..., -8.5309e-07, 3.7253e-09, 1.9558e-07], [ 2.0973e-06, 1.9260e-06, -3.3062e-07, ..., 2.8871e-08, 9.3132e-10, 3.4831e-06], [ 7.4320e-07, 3.9209e-07, 7.5437e-08, ..., 3.2876e-07, 9.3132e-10, 8.8010e-07]], device='cuda:0') Epoch 159, bias, value: tensor([-0.0161, -0.0201, -0.0281, -0.0248, -0.0080, 0.0062, 0.0066, -0.0193, -0.0040, 0.0023], device='cuda:0'), grad: tensor([ 4.4145e-07, 2.1625e-06, 2.5146e-07, -6.0499e-05, 6.1467e-07, 4.5002e-05, 3.5018e-07, -1.5646e-07, 9.4622e-06, 2.2389e-06], device='cuda:0') 100 0.0001 changing lr epoch 158, time 220.26, cls_loss 0.0015 cls_loss_mapping 0.0031 cls_loss_causal 0.5131 re_mapping 0.0060 re_causal 0.0155 /// teacc 98.99 lr 0.00010000 Epoch 160, weight, value: tensor([[-0.1578, -0.1525, 0.0854, ..., -0.0748, 0.0462, 0.0324], [-0.0931, -0.0365, -0.0664, ..., -0.0905, -0.0848, -0.0367], [ 0.0299, -0.0846, -0.0674, ..., -0.0721, 0.0173, -0.2674], ..., [-0.1297, 0.1136, 0.0030, ..., 0.1148, -0.0236, -0.1047], [-0.0976, -0.0860, 0.1079, ..., -0.0688, -0.1380, 0.1130], [ 0.0090, -0.1905, 0.1099, ..., 0.0629, -0.1589, -0.0811]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.1642e-07, -7.4506e-09, ..., 3.1665e-08, -9.3132e-10, 1.3690e-07], [ 3.7253e-09, 6.7707e-07, 1.2387e-07, ..., 1.8161e-07, 9.3132e-10, 1.2480e-07], [ 0.0000e+00, 3.5111e-07, 4.7497e-08, ..., 8.3819e-08, -2.7940e-09, 4.1910e-08], ..., [ 5.5879e-09, -1.8952e-06, -2.3097e-07, ..., -6.5845e-07, 1.8626e-09, 1.9558e-08], [ 2.5146e-08, 1.7788e-07, 6.7055e-08, ..., 9.5926e-08, 0.0000e+00, 1.2387e-07], [-6.2399e-08, 1.1642e-07, -2.4866e-07, ..., -6.2399e-08, 0.0000e+00, 9.3132e-09]], device='cuda:0') Epoch 160, bias, value: tensor([-0.0159, -0.0200, -0.0282, -0.0246, -0.0077, 0.0061, 0.0068, -0.0192, -0.0042, 0.0018], device='cuda:0'), grad: tensor([ 7.7672e-07, 1.6438e-06, 6.8825e-07, 3.7532e-07, 1.5246e-06, 1.3607e-06, -4.0568e-06, -3.1888e-06, 1.0645e-06, -1.9465e-07], device='cuda:0') 100 0.0001 changing lr epoch 159, time 220.70, cls_loss 0.0015 cls_loss_mapping 0.0041 cls_loss_causal 0.5278 re_mapping 0.0057 re_causal 0.0156 /// teacc 99.07 lr 0.00010000 Epoch 161, weight, value: tensor([[-0.1582, -0.1529, 0.0855, ..., -0.0750, 0.0463, 0.0323], [-0.0934, -0.0367, -0.0667, ..., -0.0910, -0.0848, -0.0367], [ 0.0296, -0.0850, -0.0677, ..., -0.0726, 0.0175, -0.2679], ..., [-0.1302, 0.1140, 0.0031, ..., 0.1158, -0.0237, -0.1049], [-0.0979, -0.0865, 0.1082, ..., -0.0691, -0.1381, 0.1133], [ 0.0092, -0.1909, 0.1104, ..., 0.0629, -0.1591, -0.0812]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 2.7008e-08, -8.3819e-09, ..., 3.0734e-08, 4.6566e-09, 8.6665e-05], [ 1.8626e-09, 3.5390e-08, 1.6764e-08, ..., 1.1642e-07, -2.7474e-07, 1.6540e-06], [ 0.0000e+00, 2.6077e-07, 6.3330e-08, ..., 1.2666e-07, 1.8626e-08, 1.7602e-07], ..., [ 5.5879e-09, -3.7253e-07, -8.7544e-08, ..., -1.8347e-07, 1.1083e-07, 3.2410e-07], [ 2.7940e-09, 1.9558e-08, 1.0245e-08, ..., 3.7253e-08, 1.3970e-08, 7.7579e-07], [-2.3283e-08, 5.0291e-08, -5.8673e-08, ..., 5.9325e-07, 5.5879e-09, 1.6671e-07]], device='cuda:0') Epoch 161, bias, value: tensor([-0.0163, -0.0201, -0.0283, -0.0240, -0.0083, 0.0055, 0.0068, -0.0189, -0.0043, 0.0023], device='cuda:0'), grad: tensor([ 3.8171e-04, 7.1526e-06, 7.6834e-07, 2.3469e-07, 3.0756e-05, 1.3616e-06, -4.3201e-04, 1.6661e-06, 3.7253e-06, 5.2191e-06], device='cuda:0') 100 0.0001 changing lr epoch 160, time 220.06, cls_loss 0.0016 cls_loss_mapping 0.0034 cls_loss_causal 0.5431 re_mapping 0.0058 re_causal 0.0156 /// teacc 99.01 lr 0.00010000 Epoch 162, weight, value: tensor([[-0.1594, -0.1534, 0.0857, ..., -0.0753, 0.0465, 0.0318], [-0.0937, -0.0365, -0.0667, ..., -0.0914, -0.0877, -0.0375], [ 0.0294, -0.0855, -0.0686, ..., -0.0731, 0.0176, -0.2687], ..., [-0.1308, 0.1142, 0.0033, ..., 0.1164, -0.0238, -0.1049], [-0.0982, -0.0874, 0.1083, ..., -0.0707, -0.1383, 0.1136], [ 0.0093, -0.1917, 0.1105, ..., 0.0626, -0.1598, -0.0819]], device='cuda:0'), grad: tensor([[ 9.3132e-09, 1.0710e-07, -2.8126e-07, ..., 8.3260e-07, -1.8813e-07, -1.3942e-06], [ 8.3819e-09, 3.6228e-07, 6.6776e-07, ..., 1.0487e-06, 3.7253e-09, 1.1176e-07], [ 1.2107e-08, 2.8498e-07, 3.4459e-07, ..., 2.4121e-07, 3.3528e-08, 4.9639e-07], ..., [ 2.4214e-08, -2.0154e-06, 5.9605e-06, ..., 1.0334e-05, 2.2352e-08, 2.5705e-07], [-5.3085e-08, 8.8476e-08, 6.6217e-07, ..., 1.5842e-06, 1.9558e-08, -1.8552e-06], [-1.7602e-07, 4.0419e-07, -2.1160e-05, ..., -3.6895e-05, 1.3970e-08, 4.8988e-07]], device='cuda:0') Epoch 162, bias, value: tensor([-0.0170, -0.0201, -0.0284, -0.0241, -0.0079, 0.0055, 0.0079, -0.0190, -0.0046, 0.0019], device='cuda:0'), grad: tensor([-1.2591e-06, 3.7104e-06, 2.3227e-06, 1.4521e-05, 4.9084e-05, 6.1542e-06, 1.8105e-06, 3.2723e-05, 2.3171e-06, -1.1140e-04], device='cuda:0') 100 0.0001 changing lr epoch 161, time 220.08, cls_loss 0.0016 cls_loss_mapping 0.0035 cls_loss_causal 0.5141 re_mapping 0.0056 re_causal 0.0154 /// teacc 98.99 lr 0.00010000 Epoch 163, weight, value: tensor([[-0.1596, -0.1532, 0.0860, ..., -0.0755, 0.0467, 0.0315], [-0.0937, -0.0367, -0.0668, ..., -0.0922, -0.0876, -0.0373], [ 0.0292, -0.0856, -0.0692, ..., -0.0731, 0.0178, -0.2691], ..., [-0.1313, 0.1146, 0.0034, ..., 0.1166, -0.0243, -0.1051], [-0.0987, -0.0876, 0.1090, ..., -0.0711, -0.1386, 0.1138], [ 0.0091, -0.1925, 0.1105, ..., 0.0615, -0.1604, -0.0829]], device='cuda:0'), grad: tensor([[ 1.3970e-08, 1.3225e-07, -9.3132e-08, ..., 8.5682e-08, 0.0000e+00, -4.3772e-08], [ 6.0536e-08, 2.6785e-06, 8.4843e-07, ..., 1.8151e-06, 0.0000e+00, 4.1910e-08], [ 3.7253e-09, 4.1816e-07, 9.7789e-08, ..., 2.5891e-07, 0.0000e+00, 1.1269e-07], ..., [ 5.9605e-08, -1.0610e-05, -1.8794e-06, ..., -6.7018e-06, 0.0000e+00, 8.3819e-08], [ 5.3085e-08, 8.2329e-07, 2.6915e-07, ..., 2.5798e-07, 0.0000e+00, 9.4343e-07], [-1.5814e-06, 1.4808e-06, -7.5325e-06, ..., -4.2431e-06, 0.0000e+00, -2.0899e-06]], device='cuda:0') Epoch 163, bias, value: tensor([-0.0175, -0.0200, -0.0280, -0.0245, -0.0067, 0.0055, 0.0086, -0.0191, -0.0050, 0.0009], device='cuda:0'), grad: tensor([ 8.2981e-07, 5.7705e-06, -3.1386e-07, 8.8587e-06, 2.5243e-05, 3.1330e-06, -3.7067e-06, -1.8701e-05, 4.3325e-06, -2.5481e-05], device='cuda:0') 100 0.0001 changing lr epoch 162, time 220.18, cls_loss 0.0013 cls_loss_mapping 0.0031 cls_loss_causal 0.5428 re_mapping 0.0056 re_causal 0.0157 /// teacc 99.06 lr 0.00010000 Epoch 164, weight, value: tensor([[-0.1605, -0.1540, 0.0862, ..., -0.0757, 0.0478, 0.0316], [-0.0938, -0.0367, -0.0670, ..., -0.0927, -0.0878, -0.0370], [ 0.0291, -0.0858, -0.0699, ..., -0.0736, 0.0185, -0.2712], ..., [-0.1320, 0.1149, 0.0035, ..., 0.1168, -0.0245, -0.1052], [-0.0991, -0.0885, 0.1091, ..., -0.0719, -0.1396, 0.1141], [ 0.0093, -0.1929, 0.1108, ..., 0.0608, -0.1624, -0.0832]], device='cuda:0'), grad: tensor([[ 1.1176e-08, -2.9337e-07, -1.5786e-06, ..., 1.1176e-08, -1.0617e-07, -5.9325e-07], [ 1.3039e-08, 3.9116e-08, 6.2399e-08, ..., 3.9116e-08, -3.7253e-09, -2.1420e-08], [ 2.7940e-09, 7.5437e-08, 4.0047e-07, ..., 4.4703e-08, 1.8626e-08, 2.0862e-07], ..., [ 3.0734e-08, -6.7055e-08, 1.3970e-07, ..., -1.4901e-08, 8.3819e-09, 9.6858e-08], [ 7.2643e-08, 9.6858e-08, -3.7998e-06, ..., -1.3830e-06, 1.3970e-08, -2.7083e-06], [-7.4506e-09, 4.5635e-08, 3.3304e-06, ..., 1.0142e-06, 4.6566e-09, 2.6077e-06]], device='cuda:0') Epoch 164, bias, value: tensor([-0.0175, -0.0196, -0.0288, -0.0252, -0.0057, 0.0058, 0.0088, -0.0192, -0.0051, 0.0004], device='cuda:0'), grad: tensor([-2.7679e-06, -3.1479e-07, 1.0096e-06, 2.1607e-06, 5.9325e-07, -6.4354e-07, 1.6019e-07, 3.9209e-07, -8.0168e-06, 7.4357e-06], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 163---------------------------------------------------- epoch 163, time 220.89, cls_loss 0.0016 cls_loss_mapping 0.0031 cls_loss_causal 0.5121 re_mapping 0.0056 re_causal 0.0154 /// teacc 99.11 lr 0.00010000 Epoch 165, weight, value: tensor([[-0.1603, -0.1545, 0.0868, ..., -0.0760, 0.0480, 0.0318], [-0.0940, -0.0344, -0.0644, ..., -0.0933, -0.0878, -0.0371], [ 0.0289, -0.0859, -0.0704, ..., -0.0736, 0.0187, -0.2724], ..., [-0.1326, 0.1129, 0.0016, ..., 0.1171, -0.0246, -0.1053], [-0.0997, -0.0891, 0.1093, ..., -0.0728, -0.1399, 0.1144], [ 0.0102, -0.1938, 0.1112, ..., 0.0611, -0.1628, -0.0834]], device='cuda:0'), grad: tensor([[ 4.0978e-08, 2.8685e-07, -4.5542e-07, ..., 1.0896e-07, 0.0000e+00, -3.9767e-07], [ 5.4110e-07, 1.1260e-06, 1.2452e-06, ..., 1.0412e-06, -9.3132e-10, 1.2107e-07], [-6.1560e-07, 4.7721e-06, 8.3819e-08, ..., -3.1926e-06, 0.0000e+00, 5.5879e-08], ..., [ 1.7695e-07, -2.6077e-07, 4.4890e-07, ..., -8.2422e-07, 9.3132e-10, -2.6077e-08], [ 2.6263e-07, 8.1211e-07, 2.8778e-07, ..., 2.0675e-07, 0.0000e+00, 6.9849e-08], [-1.6496e-05, 8.4341e-06, -3.4302e-05, ..., -2.3678e-05, 0.0000e+00, -1.6261e-06]], device='cuda:0') Epoch 165, bias, value: tensor([-0.0174, -0.0177, -0.0287, -0.0253, -0.0059, 0.0057, 0.0093, -0.0212, -0.0053, 0.0006], device='cuda:0'), grad: tensor([-5.6066e-07, 6.6385e-06, -2.7746e-05, -2.7463e-05, 1.3375e-04, 3.7514e-06, 7.0594e-07, 7.1526e-07, 2.2184e-06, -9.1970e-05], device='cuda:0') 100 0.0001 changing lr epoch 164, time 220.31, cls_loss 0.0014 cls_loss_mapping 0.0031 cls_loss_causal 0.5319 re_mapping 0.0055 re_causal 0.0158 /// teacc 99.01 lr 0.00010000 Epoch 166, weight, value: tensor([[-0.1605, -0.1551, 0.0871, ..., -0.0761, 0.0483, 0.0320], [-0.0943, -0.0347, -0.0649, ..., -0.0942, -0.0876, -0.0370], [ 0.0287, -0.0860, -0.0711, ..., -0.0739, 0.0195, -0.2728], ..., [-0.1333, 0.1135, 0.0020, ..., 0.1175, -0.0249, -0.1054], [-0.1003, -0.0907, 0.1093, ..., -0.0734, -0.1407, 0.1147], [ 0.0103, -0.1945, 0.1116, ..., 0.0613, -0.1642, -0.0837]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 1.3039e-08, -5.7742e-08, ..., 5.5879e-09, 2.8871e-08, -9.3132e-09], [ 8.3819e-09, -8.7395e-06, 3.8184e-08, ..., 4.9360e-08, 3.7253e-08, -7.1712e-08], [ 1.8626e-09, 1.7034e-06, 4.5635e-08, ..., 7.2643e-08, -1.4342e-07, 4.6566e-08], ..., [ 1.2107e-08, 6.7428e-06, -5.5879e-08, ..., -1.4156e-07, 1.2107e-08, 3.6322e-08], [ 2.1793e-07, 2.5146e-08, 8.5682e-08, ..., 2.1607e-07, 3.8184e-08, -3.1665e-08], [-3.9861e-07, 3.4459e-08, -4.6100e-07, ..., -4.7404e-07, 2.7940e-09, -1.2387e-07]], device='cuda:0') Epoch 166, bias, value: tensor([-0.0174, -0.0179, -0.0280, -0.0258, -0.0060, 0.0057, 0.0093, -0.0210, -0.0056, 0.0008], device='cuda:0'), grad: tensor([ 8.2608e-07, -3.1263e-05, 9.8161e-07, 1.2079e-06, 7.1712e-07, 2.9150e-07, -7.0781e-08, 2.5630e-05, 1.7928e-06, -1.9185e-07], device='cuda:0') 100 0.0001 changing lr epoch 165, time 220.15, cls_loss 0.0016 cls_loss_mapping 0.0028 cls_loss_causal 0.5153 re_mapping 0.0055 re_causal 0.0154 /// teacc 98.96 lr 0.00010000 Epoch 167, weight, value: tensor([[-0.1606, -0.1553, 0.0874, ..., -0.0763, 0.0486, 0.0320], [-0.0945, -0.0351, -0.0653, ..., -0.0950, -0.0875, -0.0366], [ 0.0285, -0.0867, -0.0728, ..., -0.0743, 0.0194, -0.2755], ..., [-0.1337, 0.1144, 0.0025, ..., 0.1182, -0.0249, -0.1056], [-0.1007, -0.0906, 0.1103, ..., -0.0737, -0.1407, 0.1153], [ 0.0104, -0.1958, 0.1116, ..., 0.0610, -0.1649, -0.0843]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.3784e-07, 1.3709e-06, ..., 7.3761e-07, 0.0000e+00, 5.5507e-07], [ 0.0000e+00, 7.7859e-07, 6.5751e-07, ..., 6.1933e-07, -9.3132e-09, 3.8184e-08], [ 0.0000e+00, 5.9605e-07, 2.4959e-07, ..., 6.2864e-07, 0.0000e+00, 1.4529e-07], ..., [ 0.0000e+00, -1.7220e-06, 2.3283e-07, ..., -8.2143e-07, 6.5193e-09, -1.0524e-07], [ 1.8626e-09, 6.1467e-08, 2.9709e-07, ..., 2.4959e-07, 0.0000e+00, -2.4494e-07], [ 9.3132e-10, 8.1956e-08, -5.7481e-06, ..., -2.7586e-06, 0.0000e+00, 1.9092e-07]], device='cuda:0') Epoch 167, bias, value: tensor([-0.0173, -0.0180, -0.0282, -0.0260, -0.0060, 0.0053, 0.0100, -0.0206, -0.0055, 0.0004], device='cuda:0'), grad: tensor([ 5.6326e-06, 3.9600e-06, -7.0967e-06, 1.5963e-06, 5.7332e-06, 1.6335e-06, -2.3842e-06, 2.6301e-06, 9.6019e-07, -1.2696e-05], device='cuda:0') 100 0.0001 changing lr epoch 166, time 220.15, cls_loss 0.0016 cls_loss_mapping 0.0033 cls_loss_causal 0.5337 re_mapping 0.0055 re_causal 0.0159 /// teacc 99.04 lr 0.00010000 Epoch 168, weight, value: tensor([[-0.1608, -0.1556, 0.0876, ..., -0.0766, 0.0486, 0.0321], [-0.0948, -0.0351, -0.0652, ..., -0.0958, -0.0874, -0.0366], [ 0.0283, -0.0887, -0.0735, ..., -0.0762, 0.0194, -0.2759], ..., [-0.1340, 0.1152, 0.0025, ..., 0.1189, -0.0250, -0.1057], [-0.1028, -0.0910, 0.1104, ..., -0.0745, -0.1409, 0.1143], [ 0.0109, -0.1975, 0.1119, ..., 0.0610, -0.1653, -0.0843]], device='cuda:0'), grad: tensor([[ 1.8626e-08, 8.5682e-08, 9.4995e-08, ..., 6.5193e-08, -1.8626e-09, 7.7300e-08], [ 1.1735e-07, 3.5390e-07, 4.4703e-07, ..., 5.0664e-07, -9.3132e-10, 7.5437e-07], [ 8.3819e-09, 3.9581e-07, 2.4214e-07, ..., 1.2573e-07, -9.3132e-10, 2.1886e-07], ..., [ 6.4261e-08, -8.3633e-07, 1.3132e-07, ..., -2.5146e-07, 0.0000e+00, 2.3562e-07], [-4.3772e-07, -1.0990e-07, -3.3118e-06, ..., -1.2759e-07, 0.0000e+00, -9.0450e-06], [ 6.0052e-06, 2.4773e-07, 1.0822e-06, ..., 1.1466e-05, 0.0000e+00, 4.0755e-06]], device='cuda:0') Epoch 168, bias, value: tensor([-0.0173, -0.0181, -0.0280, -0.0261, -0.0058, 0.0051, 0.0104, -0.0204, -0.0062, 0.0003], device='cuda:0'), grad: tensor([ 4.5635e-07, 4.0233e-06, 1.0030e-06, 2.5108e-06, -4.2439e-05, 2.9020e-06, 1.4538e-06, 1.4342e-07, -1.6093e-05, 4.6074e-05], device='cuda:0') 100 0.0001 changing lr epoch 167, time 220.36, cls_loss 0.0020 cls_loss_mapping 0.0038 cls_loss_causal 0.5034 re_mapping 0.0053 re_causal 0.0146 /// teacc 99.11 lr 0.00010000 Epoch 169, weight, value: tensor([[-0.1611, -0.1570, 0.0876, ..., -0.0774, 0.0489, 0.0321], [-0.0960, -0.0357, -0.0658, ..., -0.0977, -0.0873, -0.0367], [ 0.0278, -0.0916, -0.0740, ..., -0.0791, 0.0196, -0.2763], ..., [-0.1360, 0.1171, 0.0031, ..., 0.1213, -0.0254, -0.1060], [-0.1034, -0.0916, 0.1107, ..., -0.0756, -0.1409, 0.1145], [ 0.0103, -0.2001, 0.1120, ..., 0.0607, -0.1655, -0.0849]], device='cuda:0'), grad: tensor([[-5.5879e-09, 1.2666e-07, -1.0803e-07, ..., 1.2107e-08, 9.3132e-10, -1.1642e-07], [ 1.8626e-09, 2.3283e-06, 6.0536e-08, ..., 3.0454e-07, 1.4901e-08, -1.0272e-06], [ 9.3132e-10, 2.3544e-06, 3.7253e-08, ..., 9.2201e-08, -2.1420e-08, 4.1910e-08], ..., [ 2.7940e-09, 6.8955e-06, -3.3341e-07, ..., -1.1139e-06, 2.7940e-09, 3.4831e-07], [-9.4064e-08, 4.3027e-07, -4.6380e-07, ..., 1.2107e-08, 9.3132e-10, -8.6240e-07], [ 3.7253e-09, 6.2771e-07, 2.7847e-07, ..., 2.8871e-07, 0.0000e+00, 4.3493e-07]], device='cuda:0') Epoch 169, bias, value: tensor([-0.0174, -0.0187, -0.0288, -0.0257, -0.0056, 0.0046, 0.0100, -0.0191, -0.0063, 0.0001], device='cuda:0'), grad: tensor([ 1.2014e-07, 1.8636e-06, 2.3469e-06, -2.4214e-05, 1.7555e-06, 3.6061e-06, -7.6927e-07, 1.2845e-05, -1.6484e-07, 2.6114e-06], device='cuda:0') 100 0.0001 changing lr epoch 168, time 220.34, cls_loss 0.0017 cls_loss_mapping 0.0035 cls_loss_causal 0.5102 re_mapping 0.0052 re_causal 0.0144 /// teacc 99.06 lr 0.00010000 Epoch 170, weight, value: tensor([[-0.1618, -0.1575, 0.0875, ..., -0.0777, 0.0494, 0.0320], [-0.0963, -0.0353, -0.0655, ..., -0.0986, -0.0874, -0.0360], [ 0.0271, -0.0918, -0.0746, ..., -0.0792, 0.0201, -0.2783], ..., [-0.1370, 0.1168, 0.0022, ..., 0.1206, -0.0257, -0.1073], [-0.1042, -0.0915, 0.1118, ..., -0.0743, -0.1413, 0.1154], [ 0.0104, -0.2013, 0.1129, ..., 0.0610, -0.1660, -0.0857]], device='cuda:0'), grad: tensor([[ 3.7253e-07, 1.9707e-06, 9.6206e-07, ..., 3.3621e-07, 0.0000e+00, 5.1223e-07], [ 1.8068e-07, 1.4398e-06, 7.1786e-06, ..., 5.3830e-07, -1.8626e-09, 9.0897e-06], [ 1.5739e-07, 2.1420e-06, 6.9663e-07, ..., 3.1386e-07, 0.0000e+00, 6.2771e-07], ..., [ 1.4249e-06, 8.1062e-06, 3.0957e-06, ..., -6.0815e-07, 9.3132e-10, 1.9036e-06], [ 1.6233e-06, 2.9802e-07, -1.0133e-05, ..., 2.6375e-06, 0.0000e+00, -1.7554e-05], [-1.9222e-06, 7.8380e-06, -2.5406e-06, ..., -2.9393e-06, 0.0000e+00, 1.6317e-06]], device='cuda:0') Epoch 170, bias, value: tensor([-0.0179, -0.0184, -0.0281, -0.0260, -0.0056, 0.0050, 0.0104, -0.0202, -0.0060, 0.0003], device='cuda:0'), grad: tensor([ 6.2324e-06, 2.7701e-05, 8.7693e-06, -4.1485e-05, -1.7241e-05, 7.5139e-06, 3.6843e-06, 2.0072e-05, -3.0145e-05, 1.4886e-05], device='cuda:0') 100 0.0001 changing lr epoch 169, time 220.43, cls_loss 0.0020 cls_loss_mapping 0.0041 cls_loss_causal 0.5085 re_mapping 0.0050 re_causal 0.0137 /// teacc 98.94 lr 0.00010000 Epoch 171, weight, value: tensor([[-0.1622, -0.1582, 0.0868, ..., -0.0791, 0.0495, 0.0321], [-0.0966, -0.0359, -0.0670, ..., -0.1018, -0.0873, -0.0358], [ 0.0262, -0.0927, -0.0754, ..., -0.0800, 0.0202, -0.2794], ..., [-0.1378, 0.1176, 0.0030, ..., 0.1219, -0.0257, -0.1076], [-0.1035, -0.0919, 0.1127, ..., -0.0749, -0.1414, 0.1162], [ 0.0106, -0.2022, 0.1136, ..., 0.0618, -0.1663, -0.0866]], device='cuda:0'), grad: tensor([[ 7.3574e-08, 6.5193e-09, -5.8487e-07, ..., 8.5682e-08, -1.9558e-08, -4.4517e-07], [ 8.1956e-08, 9.9652e-08, 2.4214e-07, ..., 1.8813e-07, 1.8626e-09, 5.0291e-08], [ 8.8476e-08, -1.9558e-08, 3.4459e-07, ..., 2.3376e-07, 1.8626e-09, 1.4994e-07], ..., [ 1.0524e-07, 9.2201e-08, 7.5698e-06, ..., 6.3777e-06, 9.3132e-10, 2.0396e-07], [ 6.8825e-07, 2.1420e-08, 4.9826e-07, ..., 4.8056e-07, 9.3132e-10, 5.3924e-07], [ 5.9977e-07, 1.3970e-08, -1.0386e-05, ..., -9.0599e-06, 5.5879e-09, 5.5041e-07]], device='cuda:0') Epoch 171, bias, value: tensor([-0.0184, -0.0193, -0.0281, -0.0257, -0.0061, 0.0048, 0.0105, -0.0194, -0.0055, 0.0007], device='cuda:0'), grad: tensor([-1.7565e-06, 1.4110e-06, -2.3395e-06, 2.3767e-06, 5.5358e-06, -6.7316e-06, 2.3972e-06, 2.1666e-05, 2.7195e-06, -2.5257e-05], device='cuda:0') 100 0.0001 changing lr epoch 170, time 220.26, cls_loss 0.0021 cls_loss_mapping 0.0041 cls_loss_causal 0.5368 re_mapping 0.0052 re_causal 0.0145 /// teacc 98.96 lr 0.00010000 Epoch 172, weight, value: tensor([[-0.1634, -0.1588, 0.0863, ..., -0.0804, 0.0496, 0.0319], [-0.0983, -0.0361, -0.0674, ..., -0.1036, -0.0875, -0.0358], [ 0.0263, -0.0937, -0.0761, ..., -0.0813, 0.0204, -0.2802], ..., [-0.1391, 0.1181, 0.0027, ..., 0.1224, -0.0259, -0.1080], [-0.1046, -0.0929, 0.1127, ..., -0.0773, -0.1416, 0.1165], [ 0.0122, -0.2031, 0.1149, ..., 0.0638, -0.1667, -0.0845]], device='cuda:0'), grad: tensor([[ 1.3970e-08, 3.7253e-09, 1.6764e-08, ..., 1.8440e-07, -9.3132e-10, 2.9802e-08], [ 5.5879e-09, -4.6566e-09, -5.1968e-07, ..., 5.1223e-07, -5.5879e-09, -7.4878e-07], [ 3.7253e-09, 1.0245e-08, 2.1420e-08, ..., 1.6298e-07, 0.0000e+00, 3.1665e-08], ..., [ 3.9116e-08, 2.3283e-08, 2.6915e-07, ..., 1.2787e-06, 3.7253e-09, 3.3341e-07], [ 1.0338e-07, 1.3970e-08, 1.3318e-07, ..., 7.6927e-07, 0.0000e+00, 1.6857e-07], [-5.1502e-07, 3.3528e-08, -8.0746e-07, ..., 6.2063e-06, 0.0000e+00, 3.7160e-07]], device='cuda:0') Epoch 172, bias, value: tensor([-0.0192, -0.0195, -0.0286, -0.0256, -0.0071, 0.0043, 0.0106, -0.0196, -0.0061, 0.0025], device='cuda:0'), grad: tensor([ 7.3668e-07, -3.5577e-07, 7.6182e-07, -5.2154e-08, -3.8475e-05, 1.3951e-06, 2.4028e-06, 5.6885e-06, 3.3639e-06, 2.4572e-05], device='cuda:0') 100 0.0001 changing lr epoch 171, time 220.23, cls_loss 0.0018 cls_loss_mapping 0.0031 cls_loss_causal 0.5273 re_mapping 0.0054 re_causal 0.0148 /// teacc 99.09 lr 0.00010000 Epoch 173, weight, value: tensor([[-0.1653, -0.1593, 0.0865, ..., -0.0811, 0.0499, 0.0315], [-0.0989, -0.0363, -0.0675, ..., -0.1046, -0.0876, -0.0358], [ 0.0259, -0.0941, -0.0773, ..., -0.0828, 0.0221, -0.2812], ..., [-0.1401, 0.1189, 0.0031, ..., 0.1243, -0.0261, -0.1084], [-0.1056, -0.0942, 0.1129, ..., -0.0780, -0.1434, 0.1166], [ 0.0125, -0.2055, 0.1151, ..., 0.0636, -0.1670, -0.0836]], device='cuda:0'), grad: tensor([[ 4.1910e-08, 1.0058e-07, 1.8626e-08, ..., 9.1270e-08, 0.0000e+00, 9.3132e-09], [ 1.3411e-07, -4.7125e-06, 3.7253e-07, ..., 5.6811e-07, -1.3039e-08, 2.0489e-07], [ 2.0489e-08, 3.1553e-06, 7.1116e-06, ..., 3.3900e-07, 1.8626e-09, 9.3579e-06], ..., [ 6.4168e-07, -4.7348e-06, 1.1409e-06, ..., -5.9903e-06, 1.8626e-09, 6.3051e-07], [-1.2200e-07, -3.0417e-06, -8.9109e-06, ..., -9.7789e-08, 0.0000e+00, -1.2539e-05], [-1.5860e-06, 8.8103e-07, -1.8720e-06, ..., -4.6939e-07, 0.0000e+00, 2.1420e-07]], device='cuda:0') Epoch 173, bias, value: tensor([-0.0196, -0.0198, -0.0284, -0.0258, -0.0072, 0.0044, 0.0106, -0.0190, -0.0067, 0.0026], device='cuda:0'), grad: tensor([ 2.1830e-06, -1.8597e-05, -4.6939e-07, 2.8610e-06, 3.1471e-05, 4.2319e-06, -6.4448e-07, 1.4998e-05, -3.7193e-05, 1.2564e-06], device='cuda:0') 100 0.0001 changing lr epoch 172, time 220.21, cls_loss 0.0014 cls_loss_mapping 0.0032 cls_loss_causal 0.5246 re_mapping 0.0050 re_causal 0.0149 /// teacc 98.99 lr 0.00010000 Epoch 174, weight, value: tensor([[-0.1658, -0.1600, 0.0868, ..., -0.0814, 0.0504, 0.0319], [-0.0992, -0.0366, -0.0676, ..., -0.1056, -0.0875, -0.0357], [ 0.0255, -0.0943, -0.0777, ..., -0.0832, 0.0226, -0.2817], ..., [-0.1416, 0.1196, 0.0033, ..., 0.1253, -0.0264, -0.1086], [-0.1066, -0.0947, 0.1134, ..., -0.0783, -0.1439, 0.1168], [ 0.0123, -0.2065, 0.1152, ..., 0.0635, -0.1676, -0.0840]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 1.9558e-08, -4.1071e-07, ..., 9.3132e-10, 0.0000e+00, -3.4925e-07], [ 2.5146e-08, -1.0246e-04, -5.2959e-05, ..., 4.2841e-08, 0.0000e+00, 3.9116e-08], [ 1.2107e-08, 1.2387e-07, 1.9372e-07, ..., 5.5879e-09, 1.8626e-09, 2.5332e-07], ..., [ 1.2107e-08, 1.0133e-04, 5.2512e-05, ..., -2.2352e-08, 9.3132e-10, 9.5926e-08], [ 1.3597e-07, -6.0629e-07, -5.6252e-07, ..., -4.3772e-08, 0.0000e+00, -9.9372e-07], [ 5.1223e-08, 4.7404e-07, 3.2410e-07, ..., 1.9558e-08, 0.0000e+00, 1.9837e-07]], device='cuda:0') Epoch 174, bias, value: tensor([-0.0186, -0.0199, -0.0281, -0.0261, -0.0073, 0.0045, 0.0103, -0.0187, -0.0070, 0.0024], device='cuda:0'), grad: tensor([-1.4454e-06, -2.8515e-04, 9.1176e-07, 5.3048e-06, -3.4831e-07, -3.0212e-06, 1.7276e-06, 2.8253e-04, -2.8014e-06, 1.8403e-06], device='cuda:0') 100 0.0001 changing lr epoch 173, time 220.45, cls_loss 0.0014 cls_loss_mapping 0.0035 cls_loss_causal 0.4998 re_mapping 0.0054 re_causal 0.0148 /// teacc 99.08 lr 0.00010000 Epoch 175, weight, value: tensor([[-0.1662, -0.1594, 0.0873, ..., -0.0817, 0.0506, 0.0327], [-0.0996, -0.0372, -0.0684, ..., -0.1067, -0.0875, -0.0356], [ 0.0249, -0.0945, -0.0782, ..., -0.0832, 0.0226, -0.2826], ..., [-0.1428, 0.1203, 0.0037, ..., 0.1257, -0.0267, -0.1091], [-0.1069, -0.0951, 0.1138, ..., -0.0787, -0.1440, 0.1171], [ 0.0122, -0.2071, 0.1154, ..., 0.0636, -0.1682, -0.0846]], device='cuda:0'), grad: tensor([[ 8.1025e-08, 1.6112e-07, -3.9116e-08, ..., 4.4703e-08, 2.5146e-08, 7.3574e-08], [ 3.4459e-08, 6.4261e-08, 2.7940e-08, ..., 5.9605e-08, 1.8626e-09, 2.7008e-08], [ 3.6322e-08, 4.9639e-07, 2.4121e-07, ..., 1.5832e-08, -1.6764e-07, 9.2387e-07], ..., [ 3.6322e-08, 2.0564e-06, 1.1455e-06, ..., -1.8626e-08, 4.9360e-08, 4.5486e-06], [ 5.4576e-07, -3.0082e-07, -1.7472e-06, ..., 2.0768e-07, 1.4901e-08, -6.5491e-06], [-1.7323e-07, 1.7229e-07, -3.2783e-07, ..., 1.8720e-06, 6.5193e-09, 6.0536e-08]], device='cuda:0') Epoch 175, bias, value: tensor([-0.0176, -0.0205, -0.0279, -0.0265, -0.0075, 0.0049, 0.0103, -0.0183, -0.0069, 0.0025], device='cuda:0'), grad: tensor([ 1.3746e-06, 5.1223e-07, -1.6391e-07, -1.1757e-05, -7.9423e-06, 4.4778e-06, 1.1176e-06, 8.8960e-06, -2.4326e-06, 5.8897e-06], device='cuda:0') 100 0.0001 changing lr epoch 174, time 220.33, cls_loss 0.0016 cls_loss_mapping 0.0035 cls_loss_causal 0.5040 re_mapping 0.0052 re_causal 0.0143 /// teacc 99.08 lr 0.00010000 Epoch 176, weight, value: tensor([[-0.1666, -0.1606, 0.0892, ..., -0.0812, 0.0511, 0.0359], [-0.0999, -0.0376, -0.0688, ..., -0.1077, -0.0874, -0.0355], [ 0.0246, -0.0949, -0.0790, ..., -0.0840, 0.0223, -0.2836], ..., [-0.1436, 0.1209, 0.0042, ..., 0.1273, -0.0270, -0.1093], [-0.1083, -0.0955, 0.1139, ..., -0.0797, -0.1441, 0.1171], [ 0.0118, -0.2083, 0.1152, ..., 0.0633, -0.1688, -0.0853]], device='cuda:0'), grad: tensor([[ 2.7101e-07, 9.3132e-10, -1.1288e-06, ..., 4.0159e-06, 2.1141e-07, 2.5220e-06], [ 4.4703e-08, 5.5879e-09, 1.8626e-08, ..., 2.8219e-07, 6.9849e-08, -3.5297e-07], [ 2.4214e-08, 1.8626e-09, 5.8115e-07, ..., 6.8452e-07, 6.1747e-07, 6.7055e-07], ..., [ 1.8720e-07, -6.5193e-09, 6.7055e-08, ..., 2.1979e-07, 5.4948e-08, 5.7090e-07], [ 5.8021e-07, 1.8626e-09, 6.2399e-08, ..., 6.9663e-07, 1.8626e-07, 1.0626e-06], [ 4.3865e-07, 3.7253e-09, -8.9407e-08, ..., 8.8476e-07, 2.7288e-07, 1.2321e-06]], device='cuda:0') Epoch 176, bias, value: tensor([-0.0144, -0.0206, -0.0281, -0.0265, -0.0075, 0.0052, 0.0088, -0.0177, -0.0075, 0.0019], device='cuda:0'), grad: tensor([ 1.0081e-05, -2.1085e-06, 7.7933e-06, 2.5295e-06, 2.0951e-05, 1.4871e-05, -6.7234e-05, 2.6412e-06, 4.6417e-06, 5.7928e-06], device='cuda:0') 100 0.0001 changing lr epoch 175, time 220.37, cls_loss 0.0015 cls_loss_mapping 0.0030 cls_loss_causal 0.4922 re_mapping 0.0051 re_causal 0.0143 /// teacc 99.08 lr 0.00010000 Epoch 177, weight, value: tensor([[-0.1668, -0.1614, 0.0895, ..., -0.0817, 0.0510, 0.0360], [-0.1003, -0.0372, -0.0687, ..., -0.1084, -0.0873, -0.0355], [ 0.0241, -0.0952, -0.0794, ..., -0.0845, 0.0222, -0.2841], ..., [-0.1446, 0.1210, 0.0041, ..., 0.1275, -0.0272, -0.1096], [-0.1100, -0.0961, 0.1144, ..., -0.0799, -0.1443, 0.1172], [ 0.0117, -0.2102, 0.1153, ..., 0.0626, -0.1692, -0.0858]], device='cuda:0'), grad: tensor([[ 4.7684e-07, 1.0571e-07, 2.3749e-08, ..., 3.7719e-08, 0.0000e+00, 2.0973e-06], [ 6.4727e-08, 2.8126e-07, 9.8255e-08, ..., 1.1828e-07, -1.2573e-08, 7.5391e-07], [ 1.4901e-08, 1.9884e-07, 1.5832e-07, ..., 9.4995e-08, 9.3132e-10, 2.1933e-07], ..., [ 1.1642e-08, -9.0757e-07, -1.2899e-07, ..., -4.9919e-07, -3.3993e-08, 8.8476e-08], [ 1.8021e-07, 3.6322e-08, -2.5015e-06, ..., 1.3970e-09, 2.3283e-09, -2.7586e-06], [ 1.6205e-07, 3.0361e-07, 1.2852e-06, ..., 9.0338e-08, 4.6566e-10, 1.9502e-06]], device='cuda:0') Epoch 177, bias, value: tensor([-0.0143, -0.0202, -0.0279, -0.0273, -0.0066, 0.0065, 0.0085, -0.0182, -0.0077, 0.0012], device='cuda:0'), grad: tensor([ 9.0450e-06, 4.0047e-06, 9.4296e-07, 4.3446e-07, 6.6496e-06, 3.2634e-05, -5.2214e-05, -1.0943e-06, -5.6773e-06, 5.2713e-06], device='cuda:0') 100 0.0001 changing lr epoch 176, time 220.14, cls_loss 0.0015 cls_loss_mapping 0.0030 cls_loss_causal 0.5403 re_mapping 0.0050 re_causal 0.0146 /// teacc 98.96 lr 0.00010000 Epoch 178, weight, value: tensor([[-0.1677, -0.1623, 0.0890, ..., -0.0823, 0.0508, 0.0351], [-0.1012, -0.0374, -0.0689, ..., -0.1096, -0.0867, -0.0353], [ 0.0239, -0.0953, -0.0797, ..., -0.0841, 0.0225, -0.2845], ..., [-0.1459, 0.1215, 0.0040, ..., 0.1278, -0.0275, -0.1098], [-0.1102, -0.0966, 0.1149, ..., -0.0805, -0.1446, 0.1176], [ 0.0124, -0.2117, 0.1159, ..., 0.0628, -0.1694, -0.0860]], device='cuda:0'), grad: tensor([[ 4.0047e-08, 2.0582e-07, 1.8626e-09, ..., 1.6298e-08, 0.0000e+00, 2.8871e-08], [ 4.3306e-08, 5.5414e-07, 2.2305e-07, ..., 7.1246e-08, 0.0000e+00, -4.7404e-07], [ 1.0710e-08, 3.1199e-08, 8.6147e-08, ..., 1.7695e-08, 0.0000e+00, 2.1840e-07], ..., [ 2.5146e-08, 2.3749e-07, 5.0757e-08, ..., -1.4994e-07, 0.0000e+00, 6.0070e-08], [ 3.3062e-08, 2.5239e-07, -7.2224e-07, ..., 4.1910e-09, 0.0000e+00, -8.1770e-07], [-6.5984e-07, 1.9651e-06, -1.1176e-08, ..., 3.7253e-08, 0.0000e+00, 3.0594e-07]], device='cuda:0') Epoch 178, bias, value: tensor([-0.0154, -0.0205, -0.0274, -0.0272, -0.0067, 0.0065, 0.0088, -0.0182, -0.0077, 0.0014], device='cuda:0'), grad: tensor([ 1.2033e-06, 2.8852e-06, -7.5847e-06, -5.0925e-06, -6.2399e-08, 2.2035e-06, 1.1539e-06, 3.6024e-06, -6.7800e-07, 2.3264e-06], device='cuda:0') 100 0.0001 changing lr epoch 177, time 219.98, cls_loss 0.0016 cls_loss_mapping 0.0041 cls_loss_causal 0.5235 re_mapping 0.0049 re_causal 0.0141 /// teacc 99.09 lr 0.00010000 Epoch 179, weight, value: tensor([[-0.1681, -0.1632, 0.0884, ..., -0.0833, 0.0508, 0.0351], [-0.1016, -0.0386, -0.0690, ..., -0.1120, -0.0866, -0.0350], [ 0.0238, -0.0959, -0.0805, ..., -0.0847, 0.0226, -0.2853], ..., [-0.1465, 0.1231, 0.0043, ..., 0.1292, -0.0277, -0.1101], [-0.1109, -0.0980, 0.1153, ..., -0.0813, -0.1446, 0.1184], [ 0.0128, -0.2131, 0.1162, ..., 0.0629, -0.1696, -0.0861]], device='cuda:0'), grad: tensor([[ 3.2596e-08, 2.9337e-08, 2.7940e-09, ..., 2.5611e-08, 0.0000e+00, 3.0734e-08], [ 3.5856e-08, 1.4622e-07, 1.1642e-08, ..., 3.1665e-08, -3.2596e-09, -1.2759e-07], [ 1.2713e-07, -1.2089e-06, 8.8476e-09, ..., -2.1011e-06, 0.0000e+00, 3.3528e-08], ..., [ 9.7789e-08, 2.1867e-06, 2.4214e-08, ..., 2.0172e-06, 9.3132e-10, 8.1025e-08], [ 8.8010e-08, 1.1921e-07, -6.3330e-08, ..., 1.0105e-07, 9.3132e-10, -3.7253e-08], [-2.0489e-08, 2.4121e-07, -1.8394e-07, ..., 1.9073e-06, 0.0000e+00, 4.5635e-08]], device='cuda:0') Epoch 179, bias, value: tensor([-0.0157, -0.0212, -0.0275, -0.0272, -0.0070, 0.0064, 0.0088, -0.0171, -0.0073, 0.0014], device='cuda:0'), grad: tensor([ 4.3260e-07, -2.1234e-06, -7.1898e-06, -3.2242e-06, -7.8082e-06, -6.9849e-08, -2.8545e-07, 1.0483e-05, 1.0580e-06, 8.7023e-06], device='cuda:0') 100 0.0001 changing lr epoch 178, time 220.54, cls_loss 0.0011 cls_loss_mapping 0.0029 cls_loss_causal 0.4874 re_mapping 0.0053 re_causal 0.0145 /// teacc 98.99 lr 0.00010000 Epoch 180, weight, value: tensor([[-0.1682, -0.1636, 0.0891, ..., -0.0836, 0.0508, 0.0352], [-0.1018, -0.0388, -0.0692, ..., -0.1127, -0.0866, -0.0350], [ 0.0239, -0.0963, -0.0811, ..., -0.0850, 0.0228, -0.2857], ..., [-0.1469, 0.1236, 0.0045, ..., 0.1300, -0.0278, -0.1103], [-0.1111, -0.0980, 0.1158, ..., -0.0815, -0.1446, 0.1188], [ 0.0131, -0.2140, 0.1161, ..., 0.0629, -0.1698, -0.0867]], device='cuda:0'), grad: tensor([[-4.0978e-08, 8.3819e-09, -1.6913e-06, ..., 6.3330e-08, 0.0000e+00, -1.9372e-06], [ 3.7253e-08, 5.4948e-08, 7.5437e-08, ..., 3.0734e-08, 0.0000e+00, 4.5635e-08], [-3.1348e-06, 1.2945e-07, -2.1607e-07, ..., -5.1176e-07, 0.0000e+00, 3.7765e-07], ..., [ 2.6077e-08, -1.0012e-07, 1.2200e-06, ..., -1.2014e-07, 0.0000e+00, 1.3486e-06], [ 1.6810e-07, 8.3819e-09, 1.6298e-08, ..., 1.0896e-07, 0.0000e+00, -1.5041e-07], [ 2.6822e-06, 4.3772e-08, -8.4937e-07, ..., -1.0282e-06, 0.0000e+00, 6.0070e-08]], device='cuda:0') Epoch 180, bias, value: tensor([-0.0157, -0.0213, -0.0272, -0.0274, -0.0071, 0.0061, 0.0088, -0.0169, -0.0068, 0.0011], device='cuda:0'), grad: tensor([-4.7944e-06, 2.9523e-07, -1.5795e-05, 1.7257e-06, 3.9116e-06, -2.2836e-06, 1.8701e-06, 4.0978e-06, 1.4212e-06, 9.5442e-06], device='cuda:0') 100 0.0001 changing lr epoch 179, time 220.06, cls_loss 0.0016 cls_loss_mapping 0.0036 cls_loss_causal 0.5433 re_mapping 0.0055 re_causal 0.0148 /// teacc 98.98 lr 0.00010000 Epoch 181, weight, value: tensor([[-0.1683, -0.1644, 0.0897, ..., -0.0838, 0.0508, 0.0353], [-0.1022, -0.0392, -0.0693, ..., -0.1141, -0.0869, -0.0351], [ 0.0239, -0.0969, -0.0821, ..., -0.0857, 0.0229, -0.2864], ..., [-0.1473, 0.1243, 0.0044, ..., 0.1306, -0.0282, -0.1107], [-0.1112, -0.0986, 0.1167, ..., -0.0817, -0.1448, 0.1202], [ 0.0132, -0.2148, 0.1164, ..., 0.0628, -0.1700, -0.0870]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.7695e-08, -1.9073e-06, ..., 2.4680e-08, 4.6566e-10, -7.1712e-08], [ 0.0000e+00, 5.7742e-08, 1.0012e-07, ..., 3.9581e-08, -4.6566e-10, -2.8871e-08], [ 0.0000e+00, -1.7956e-06, 1.1176e-07, ..., -6.8545e-07, -6.9849e-09, 2.0489e-08], ..., [ 4.6566e-10, 1.5013e-06, 7.4971e-08, ..., 6.6543e-07, 2.3283e-09, 3.8650e-08], [ 1.3970e-09, 1.2247e-07, 3.7253e-08, ..., 5.0291e-08, 2.3283e-09, -5.9139e-08], [ 4.6566e-10, 2.2352e-08, 1.0617e-06, ..., 7.3574e-08, 0.0000e+00, 3.4925e-08]], device='cuda:0') Epoch 181, bias, value: tensor([-0.0156, -0.0218, -0.0271, -0.0272, -0.0069, 0.0031, 0.0116, -0.0166, -0.0061, 0.0010], device='cuda:0'), grad: tensor([-3.7979e-06, -2.6077e-08, -7.4022e-06, 8.7265e-07, -1.2293e-06, 2.7055e-07, 1.7975e-07, 7.8529e-06, 6.4261e-07, 2.6319e-06], device='cuda:0') 100 0.0001 changing lr epoch 180, time 220.54, cls_loss 0.0020 cls_loss_mapping 0.0034 cls_loss_causal 0.5467 re_mapping 0.0049 re_causal 0.0141 /// teacc 99.04 lr 0.00010000 Epoch 182, weight, value: tensor([[-0.1684, -0.1648, 0.0900, ..., -0.0844, 0.0508, 0.0355], [-0.1027, -0.0398, -0.0699, ..., -0.1152, -0.0869, -0.0347], [ 0.0235, -0.0984, -0.0828, ..., -0.0869, 0.0230, -0.2869], ..., [-0.1486, 0.1253, 0.0040, ..., 0.1309, -0.0283, -0.1110], [-0.1120, -0.0991, 0.1170, ..., -0.0821, -0.1449, 0.1202], [ 0.0149, -0.2157, 0.1175, ..., 0.0642, -0.1702, -0.0874]], device='cuda:0'), grad: tensor([[ 2.3283e-09, 8.9407e-08, 1.0943e-07, ..., 4.6985e-07, 0.0000e+00, 0.0000e+00], [ 2.7940e-09, 8.1537e-07, 5.1130e-07, ..., 1.0831e-06, 0.0000e+00, 1.5832e-08], [ 4.6566e-10, 3.8091e-06, 9.1270e-08, ..., 1.2945e-07, -2.3283e-09, 2.7940e-08], ..., [ 8.8476e-09, 7.2382e-06, 4.6790e-06, ..., 1.0841e-05, 4.6566e-10, 2.4214e-08], [ 4.2841e-08, 2.1188e-07, 2.2585e-07, ..., 8.5169e-07, 9.3132e-10, -1.5181e-07], [-6.5193e-09, -5.9791e-07, -1.2323e-05, ..., -2.8566e-05, 0.0000e+00, 8.5216e-08]], device='cuda:0') Epoch 182, bias, value: tensor([-0.0149, -0.0223, -0.0281, -0.0280, -0.0084, 0.0037, 0.0114, -0.0161, -0.0064, 0.0023], device='cuda:0'), grad: tensor([ 1.6578e-06, 5.2825e-06, 8.7470e-06, -1.8477e-05, 4.2856e-05, 4.3996e-06, 8.9360e-07, 5.5552e-05, 3.2466e-06, -1.0425e-04], device='cuda:0') 100 0.0001 changing lr epoch 181, time 220.20, cls_loss 0.0014 cls_loss_mapping 0.0027 cls_loss_causal 0.5216 re_mapping 0.0050 re_causal 0.0139 /// teacc 98.96 lr 0.00010000 Epoch 183, weight, value: tensor([[-0.1688, -0.1661, 0.0906, ..., -0.0846, 0.0508, 0.0356], [-0.1028, -0.0405, -0.0703, ..., -0.1156, -0.0869, -0.0342], [ 0.0237, -0.0971, -0.0831, ..., -0.0866, 0.0232, -0.2874], ..., [-0.1496, 0.1254, 0.0041, ..., 0.1309, -0.0286, -0.1114], [-0.1126, -0.0996, 0.1176, ..., -0.0823, -0.1449, 0.1203], [ 0.0147, -0.2162, 0.1176, ..., 0.0640, -0.1704, -0.0879]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 4.7963e-08, -7.8697e-08, ..., 2.1420e-08, 1.8626e-09, -3.8184e-08], [ 4.6566e-10, 4.0000e-07, 2.0349e-07, ..., 2.3423e-07, 4.6566e-10, -9.4064e-08], [ 0.0000e+00, -7.4040e-08, 2.7940e-08, ..., 3.5856e-08, -1.9092e-08, 1.7229e-08], ..., [ 6.5193e-09, -1.9092e-07, -2.3143e-07, ..., -5.0478e-07, 1.8626e-09, 5.3551e-08], [ 5.5879e-09, 4.6566e-08, -1.1036e-07, ..., 1.7695e-08, 9.3132e-10, -1.0990e-07], [ 6.3330e-08, 1.8114e-07, 5.3085e-08, ..., 9.6392e-08, 4.6566e-10, 7.5903e-08]], device='cuda:0') Epoch 183, bias, value: tensor([-0.0148, -0.0225, -0.0266, -0.0279, -0.0080, 0.0040, 0.0111, -0.0164, -0.0068, 0.0020], device='cuda:0'), grad: tensor([ 3.2596e-08, 4.6706e-07, -1.1446e-06, -4.0419e-07, 9.1270e-08, 8.7079e-08, 1.3271e-07, 3.4925e-07, -3.5856e-08, 4.3679e-07], device='cuda:0') 100 0.0001 changing lr epoch 182, time 220.31, cls_loss 0.0013 cls_loss_mapping 0.0029 cls_loss_causal 0.4984 re_mapping 0.0050 re_causal 0.0143 /// teacc 99.03 lr 0.00010000 Epoch 184, weight, value: tensor([[-0.1694, -0.1665, 0.0915, ..., -0.0848, 0.0507, 0.0359], [-0.1038, -0.0400, -0.0699, ..., -0.1162, -0.0869, -0.0336], [ 0.0234, -0.0960, -0.0830, ..., -0.0865, 0.0264, -0.2879], ..., [-0.1505, 0.1251, 0.0039, ..., 0.1316, -0.0299, -0.1121], [-0.1129, -0.1028, 0.1181, ..., -0.0826, -0.1481, 0.1209], [ 0.0150, -0.2174, 0.1174, ..., 0.0637, -0.1707, -0.0882]], device='cuda:0'), grad: tensor([[-2.3749e-08, 1.2433e-07, 1.2107e-08, ..., 1.9930e-07, 0.0000e+00, -4.9360e-08], [ 1.9558e-08, 1.7649e-07, 8.5682e-08, ..., 1.4622e-07, 0.0000e+00, -2.9290e-07], [ 7.9162e-09, 2.8312e-07, 1.2945e-07, ..., 1.3784e-07, 0.0000e+00, 8.6613e-08], ..., [ 1.0245e-08, -2.4438e-06, -4.1090e-06, ..., -4.4107e-06, 0.0000e+00, 7.4040e-08], [ 3.2596e-09, 8.6613e-08, 1.0291e-07, ..., 9.5461e-08, 0.0000e+00, 1.2852e-07], [ 9.2667e-08, 2.0172e-06, 3.3174e-06, ..., 3.6918e-06, 0.0000e+00, 3.2596e-08]], device='cuda:0') Epoch 184, bias, value: tensor([-0.0144, -0.0218, -0.0255, -0.0279, -0.0078, 0.0037, 0.0111, -0.0171, -0.0075, 0.0018], device='cuda:0'), grad: tensor([ 5.5181e-07, -1.9073e-06, 1.2238e-06, -7.7626e-07, -2.9383e-07, 6.0443e-07, 4.1910e-09, -7.3574e-06, 7.1339e-07, 7.2494e-06], device='cuda:0') 100 0.0001 changing lr epoch 183, time 220.41, cls_loss 0.0016 cls_loss_mapping 0.0031 cls_loss_causal 0.5063 re_mapping 0.0046 re_causal 0.0133 /// teacc 99.04 lr 0.00010000 Epoch 185, weight, value: tensor([[-0.1697, -0.1677, 0.0937, ..., -0.0845, 0.0507, 0.0372], [-0.1037, -0.0395, -0.0694, ..., -0.1169, -0.0869, -0.0333], [ 0.0229, -0.0962, -0.0842, ..., -0.0868, 0.0264, -0.2887], ..., [-0.1517, 0.1248, 0.0036, ..., 0.1322, -0.0301, -0.1131], [-0.1143, -0.1034, 0.1197, ..., -0.0827, -0.1481, 0.1217], [ 0.0149, -0.2180, 0.1174, ..., 0.0636, -0.1710, -0.0895]], device='cuda:0'), grad: tensor([[ 6.2073e-07, 1.1846e-06, 7.4320e-07, ..., 7.5065e-07, -8.8476e-09, 7.1200e-07], [ 1.9604e-07, 3.1432e-07, 2.3516e-07, ..., 2.1001e-07, 9.3132e-10, 1.5227e-07], [ 4.9267e-07, 2.1420e-07, 2.0955e-07, ..., 1.3877e-07, 0.0000e+00, 3.9302e-07], ..., [ 2.6962e-07, -1.1474e-05, -8.2403e-06, ..., -7.4022e-06, 0.0000e+00, 1.4901e-07], [ 5.0366e-06, 5.0385e-07, -6.4261e-08, ..., 6.7987e-08, 0.0000e+00, 2.5239e-06], [ 6.7800e-07, 9.3803e-06, 6.1579e-06, ..., 6.3665e-06, 0.0000e+00, 8.3679e-07]], device='cuda:0') Epoch 185, bias, value: tensor([-0.0119, -0.0212, -0.0253, -0.0282, -0.0079, 0.0040, 0.0103, -0.0177, -0.0073, 0.0015], device='cuda:0'), grad: tensor([ 5.7705e-06, 1.3970e-06, 1.6959e-06, 7.2062e-05, 5.5879e-06, -9.5844e-05, -3.0994e-06, -2.0236e-05, 1.2651e-05, 1.9923e-05], device='cuda:0') 100 0.0001 changing lr epoch 184, time 220.00, cls_loss 0.0014 cls_loss_mapping 0.0032 cls_loss_causal 0.5273 re_mapping 0.0048 re_causal 0.0147 /// teacc 99.05 lr 0.00010000 Epoch 186, weight, value: tensor([[-0.1703, -0.1684, 0.0937, ..., -0.0850, 0.0507, 0.0371], [-0.1058, -0.0394, -0.0694, ..., -0.1174, -0.0870, -0.0337], [ 0.0245, -0.0964, -0.0852, ..., -0.0870, 0.0265, -0.2893], ..., [-0.1529, 0.1248, 0.0032, ..., 0.1320, -0.0304, -0.1132], [-0.1174, -0.1038, 0.1207, ..., -0.0834, -0.1481, 0.1201], [ 0.0154, -0.2188, 0.1186, ..., 0.0652, -0.1712, -0.0900]], device='cuda:0'), grad: tensor([[-7.3574e-08, 1.6298e-08, -5.0105e-06, ..., 2.1653e-07, 0.0000e+00, -2.3823e-06], [ 4.4238e-08, 5.2201e-07, 1.3690e-07, ..., 2.1840e-07, 0.0000e+00, -3.6368e-07], [ 1.1176e-08, 1.2340e-07, 1.6345e-07, ..., -1.3821e-06, -1.3970e-09, 1.3411e-07], ..., [ 2.1886e-08, -7.6229e-07, -1.1129e-07, ..., 1.0654e-06, 1.3970e-09, 1.6438e-07], [ 2.0564e-06, -1.4855e-07, 1.8682e-06, ..., 1.5516e-06, 0.0000e+00, 1.4110e-06], [-5.4762e-07, 1.7602e-07, 2.2240e-06, ..., -1.9334e-06, 0.0000e+00, 1.4473e-06]], device='cuda:0') Epoch 186, bias, value: tensor([-0.0121, -0.0210, -0.0254, -0.0284, -0.0093, 0.0051, 0.0102, -0.0181, -0.0085, 0.0027], device='cuda:0'), grad: tensor([-8.3074e-06, 1.9092e-08, -7.7263e-06, 1.0438e-05, 4.9360e-07, -1.2636e-05, -1.0785e-06, 6.6645e-06, 8.5384e-06, 3.5726e-06], device='cuda:0') 100 0.0001 changing lr epoch 185, time 220.50, cls_loss 0.0013 cls_loss_mapping 0.0036 cls_loss_causal 0.5169 re_mapping 0.0050 re_causal 0.0146 /// teacc 98.96 lr 0.00010000 Epoch 187, weight, value: tensor([[-0.1709, -0.1701, 0.0939, ..., -0.0853, 0.0507, 0.0374], [-0.1077, -0.0393, -0.0695, ..., -0.1183, -0.0871, -0.0337], [ 0.0247, -0.0965, -0.0859, ..., -0.0872, 0.0265, -0.2899], ..., [-0.1549, 0.1249, 0.0031, ..., 0.1322, -0.0306, -0.1134], [-0.1180, -0.1043, 0.1213, ..., -0.0837, -0.1481, 0.1205], [ 0.0147, -0.2200, 0.1188, ..., 0.0648, -0.1713, -0.0904]], device='cuda:0'), grad: tensor([[ 6.9849e-09, 1.5367e-08, -3.3062e-08, ..., 9.3132e-09, 9.3132e-10, -1.4901e-08], [ 3.8184e-08, 1.4529e-07, 2.6077e-08, ..., 8.8010e-08, 4.6566e-10, -2.1886e-08], [ 6.0536e-09, 2.9104e-07, 1.2573e-08, ..., 1.9092e-08, -1.4435e-08, 1.6764e-08], ..., [ 1.2573e-08, 1.2573e-07, 1.5367e-08, ..., 1.3039e-08, 2.7940e-09, 2.1886e-08], [ 3.7719e-08, 2.3749e-08, -4.1910e-09, ..., 2.7008e-08, 9.3132e-10, 3.2596e-09], [ 2.6077e-08, 3.7253e-08, -1.1129e-07, ..., -3.3528e-08, 4.6566e-10, 1.6298e-08]], device='cuda:0') Epoch 187, bias, value: tensor([-0.0121, -0.0210, -0.0250, -0.0282, -0.0086, 0.0054, 0.0100, -0.0185, -0.0086, 0.0023], device='cuda:0'), grad: tensor([ 9.9186e-08, 6.6310e-07, 8.7870e-07, -1.3169e-06, -1.8645e-06, -1.7807e-06, 1.9334e-06, 7.5949e-07, 2.1746e-07, 3.9861e-07], device='cuda:0') 100 0.0001 changing lr epoch 186, time 220.75, cls_loss 0.0020 cls_loss_mapping 0.0038 cls_loss_causal 0.4810 re_mapping 0.0054 re_causal 0.0143 /// teacc 99.05 lr 0.00010000 Epoch 188, weight, value: tensor([[-0.1718, -0.1717, 0.0940, ..., -0.0857, 0.0507, 0.0374], [-0.1084, -0.0395, -0.0697, ..., -0.1191, -0.0871, -0.0337], [ 0.0239, -0.0971, -0.0869, ..., -0.0872, 0.0265, -0.2909], ..., [-0.1567, 0.1252, 0.0034, ..., 0.1330, -0.0308, -0.1135], [-0.1184, -0.1050, 0.1219, ..., -0.0840, -0.1482, 0.1209], [ 0.0152, -0.2223, 0.1195, ..., 0.0646, -0.1714, -0.0909]], device='cuda:0'), grad: tensor([[ 2.3283e-09, 8.8476e-09, 1.2806e-07, ..., 9.3132e-09, 0.0000e+00, 1.9092e-08], [ 3.5856e-08, 2.1420e-08, -1.2875e-05, ..., 1.1642e-08, 0.0000e+00, 3.1991e-07], [ 1.3039e-08, 1.2852e-07, 6.1467e-07, ..., 9.6392e-08, -4.6566e-10, 1.5041e-07], ..., [ 1.8626e-09, -1.9139e-07, 1.8384e-06, ..., -3.2503e-07, 0.0000e+00, 3.4925e-08], [-4.4797e-07, 1.1036e-07, -8.1817e-07, ..., 3.2596e-09, 1.8626e-09, -5.0105e-06], [ 2.3283e-09, 1.1967e-07, 9.8497e-06, ..., 1.5646e-07, 0.0000e+00, 2.5146e-08]], device='cuda:0') Epoch 188, bias, value: tensor([-0.0126, -0.0205, -0.0256, -0.0297, -0.0087, 0.0069, 0.0099, -0.0185, -0.0088, 0.0019], device='cuda:0'), grad: tensor([ 1.0710e-06, -8.1122e-05, 3.3490e-06, -2.3143e-07, 3.3416e-06, -7.1637e-06, 1.1764e-05, 1.1951e-05, -4.8429e-06, 6.1810e-05], device='cuda:0') 100 0.0001 changing lr epoch 187, time 220.54, cls_loss 0.0020 cls_loss_mapping 0.0036 cls_loss_causal 0.4965 re_mapping 0.0047 re_causal 0.0134 /// teacc 98.97 lr 0.00010000 Epoch 189, weight, value: tensor([[-0.1724, -0.1724, 0.0946, ..., -0.0864, 0.0507, 0.0376], [-0.1088, -0.0397, -0.0702, ..., -0.1205, -0.0871, -0.0337], [ 0.0232, -0.0976, -0.0888, ..., -0.0881, 0.0265, -0.2917], ..., [-0.1580, 0.1260, 0.0033, ..., 0.1332, -0.0308, -0.1139], [-0.1190, -0.1054, 0.1232, ..., -0.0848, -0.1482, 0.1221], [ 0.0153, -0.2255, 0.1204, ..., 0.0652, -0.1714, -0.0916]], device='cuda:0'), grad: tensor([[-5.0245e-07, 2.3283e-09, -3.1888e-06, ..., 1.1967e-07, 0.0000e+00, -4.3176e-06], [ 2.6450e-07, 1.1642e-08, 8.3819e-08, ..., 5.0990e-07, 0.0000e+00, 8.1956e-08], [ 2.0117e-07, 2.8871e-08, 9.3179e-07, ..., 6.4261e-08, 0.0000e+00, 1.3364e-06], ..., [ 8.8010e-08, 4.1910e-08, 4.0932e-07, ..., 2.7567e-07, 0.0000e+00, 3.3900e-07], [ 3.0221e-07, 1.8626e-09, 3.2131e-08, ..., 1.0896e-07, 0.0000e+00, 6.7428e-07], [ 2.8126e-07, 1.4901e-08, 9.6858e-08, ..., 2.0163e-07, 0.0000e+00, 1.0245e-06]], device='cuda:0') Epoch 189, bias, value: tensor([-0.0126, -0.0191, -0.0281, -0.0295, -0.0089, 0.0068, 0.0095, -0.0186, -0.0084, 0.0021], device='cuda:0'), grad: tensor([-1.1392e-05, 5.4352e-06, 4.0568e-06, 1.8831e-06, -1.2696e-05, -5.5619e-06, 8.3670e-06, 2.4177e-06, 2.1737e-06, 5.2750e-06], device='cuda:0') 100 0.0001 changing lr epoch 188, time 220.38, cls_loss 0.0014 cls_loss_mapping 0.0030 cls_loss_causal 0.5036 re_mapping 0.0048 re_causal 0.0137 /// teacc 99.05 lr 0.00010000 Epoch 190, weight, value: tensor([[-0.1728, -0.1735, 0.0946, ..., -0.0868, 0.0507, 0.0375], [-0.1097, -0.0398, -0.0705, ..., -0.1213, -0.0871, -0.0338], [ 0.0231, -0.0983, -0.0900, ..., -0.0890, 0.0265, -0.2923], ..., [-0.1594, 0.1265, 0.0029, ..., 0.1334, -0.0309, -0.1141], [-0.1197, -0.1058, 0.1238, ..., -0.0851, -0.1482, 0.1221], [ 0.0161, -0.2266, 0.1212, ..., 0.0658, -0.1715, -0.0919]], device='cuda:0'), grad: tensor([[ 1.9930e-07, 1.3504e-08, -4.7963e-08, ..., 2.6543e-08, 0.0000e+00, 1.8394e-07], [ 3.4925e-08, 1.6578e-07, 2.2352e-08, ..., 6.4727e-08, 0.0000e+00, -2.3283e-09], [ 6.1467e-08, -4.4703e-08, 2.2352e-08, ..., -7.7253e-07, 0.0000e+00, 5.5879e-08], ..., [ 1.0990e-07, 3.3062e-08, -7.8231e-08, ..., 6.4867e-07, 0.0000e+00, 1.4482e-07], [ 8.7917e-07, 1.0384e-07, 2.8405e-08, ..., 6.9849e-08, 0.0000e+00, 7.9582e-07], [ 2.4168e-07, 4.0513e-08, 5.5879e-09, ..., 4.6566e-10, 0.0000e+00, 2.1793e-07]], device='cuda:0') Epoch 190, bias, value: tensor([-0.0128, -0.0193, -0.0283, -0.0293, -0.0094, 0.0067, 0.0099, -0.0187, -0.0088, 0.0027], device='cuda:0'), grad: tensor([ 1.6801e-06, 2.6748e-06, -1.3679e-05, 6.0312e-06, 2.1420e-07, -1.3903e-05, 2.8629e-06, 9.4697e-06, 3.5428e-06, 1.0589e-06], device='cuda:0') 100 0.0001 changing lr epoch 189, time 220.51, cls_loss 0.0014 cls_loss_mapping 0.0029 cls_loss_causal 0.5046 re_mapping 0.0048 re_causal 0.0135 /// teacc 99.07 lr 0.00010000 Epoch 191, weight, value: tensor([[-0.1731, -0.1744, 0.0948, ..., -0.0875, 0.0507, 0.0374], [-0.1099, -0.0404, -0.0712, ..., -0.1226, -0.0871, -0.0337], [ 0.0231, -0.0993, -0.0917, ..., -0.0901, 0.0265, -0.2932], ..., [-0.1602, 0.1280, 0.0045, ..., 0.1354, -0.0309, -0.1144], [-0.1205, -0.1060, 0.1244, ..., -0.0855, -0.1482, 0.1223], [ 0.0143, -0.2294, 0.1203, ..., 0.0645, -0.1715, -0.0923]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 8.8057e-07, 1.9418e-07, ..., 1.2573e-08, 0.0000e+00, 1.3644e-07], [ 2.3283e-09, -6.1467e-07, -1.4286e-06, ..., 8.8476e-09, 0.0000e+00, -2.5406e-06], [ 9.3132e-10, 1.5041e-06, 3.4459e-07, ..., 3.7253e-09, 0.0000e+00, 3.3341e-07], ..., [ 1.3039e-08, 1.1465e-06, 9.0804e-07, ..., 9.3132e-10, 0.0000e+00, 1.4175e-06], [ 4.6566e-09, 6.0676e-07, -5.0291e-07, ..., 1.0710e-08, 0.0000e+00, -1.4044e-06], [-1.5600e-07, 1.1399e-06, 1.0850e-07, ..., -1.1455e-07, 0.0000e+00, 2.4773e-07]], device='cuda:0') Epoch 191, bias, value: tensor([-0.0132, -0.0198, -0.0285, -0.0294, -0.0081, 0.0067, 0.0101, -0.0175, -0.0087, 0.0011], device='cuda:0'), grad: tensor([ 4.8652e-06, -1.6347e-05, 8.7321e-06, -2.3827e-05, 2.2613e-06, 2.2966e-06, 4.4294e-06, 1.2971e-05, -1.9167e-06, 6.4746e-06], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 190---------------------------------------------------- epoch 190, time 221.48, cls_loss 0.0018 cls_loss_mapping 0.0038 cls_loss_causal 0.5507 re_mapping 0.0048 re_causal 0.0140 /// teacc 99.12 lr 0.00010000 Epoch 192, weight, value: tensor([[-0.1735, -0.1753, 0.0951, ..., -0.0883, 0.0506, 0.0373], [-0.1106, -0.0400, -0.0704, ..., -0.1249, -0.0871, -0.0334], [ 0.0231, -0.0997, -0.0925, ..., -0.0904, 0.0265, -0.2935], ..., [-0.1607, 0.1282, 0.0041, ..., 0.1367, -0.0310, -0.1146], [-0.1211, -0.1065, 0.1242, ..., -0.0868, -0.1482, 0.1224], [ 0.0142, -0.2317, 0.1206, ..., 0.0641, -0.1715, -0.0924]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 7.1712e-07, 1.3504e-07, ..., 7.2643e-08, 0.0000e+00, -1.9558e-08], [ 1.1176e-08, 1.1735e-06, 5.0850e-07, ..., 3.4645e-07, 0.0000e+00, -1.8626e-08], [ 1.8626e-09, 1.5059e-06, 3.9861e-07, ..., 8.3819e-08, 0.0000e+00, 1.3039e-08], ..., [ 2.0489e-08, -1.0049e-06, -8.0839e-07, ..., -9.9093e-07, 0.0000e+00, 2.3283e-08], [ 3.0734e-08, 8.0373e-07, 4.3679e-07, ..., 3.1292e-07, 0.0000e+00, 1.0710e-07], [-2.2631e-07, 1.1148e-06, -2.6729e-07, ..., -2.3376e-07, 0.0000e+00, 1.0245e-08]], device='cuda:0') Epoch 192, bias, value: tensor([-0.0128, -0.0196, -0.0281, -0.0290, -0.0077, 0.0063, 0.0104, -0.0177, -0.0093, 0.0005], device='cuda:0'), grad: tensor([ 2.6245e-06, 3.9153e-06, 5.6960e-06, -1.9446e-05, 1.0971e-06, 1.8273e-06, -3.2503e-07, -1.1874e-06, 3.0771e-06, 2.7530e-06], device='cuda:0') 100 0.0001 changing lr epoch 191, time 220.54, cls_loss 0.0012 cls_loss_mapping 0.0035 cls_loss_causal 0.5186 re_mapping 0.0047 re_causal 0.0138 /// teacc 99.08 lr 0.00010000 Epoch 193, weight, value: tensor([[-0.1751, -0.1760, 0.0952, ..., -0.0892, 0.0507, 0.0371], [-0.1118, -0.0406, -0.0713, ..., -0.1268, -0.0872, -0.0330], [ 0.0229, -0.0999, -0.0931, ..., -0.0903, 0.0265, -0.2940], ..., [-0.1615, 0.1292, 0.0049, ..., 0.1379, -0.0310, -0.1151], [-0.1213, -0.1071, 0.1253, ..., -0.0874, -0.1482, 0.1233], [ 0.0169, -0.2331, 0.1214, ..., 0.0646, -0.1716, -0.0921]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 6.5193e-08, ..., 2.2352e-08, 0.0000e+00, 4.0792e-07], [ 1.8626e-09, 5.5879e-09, 4.0345e-06, ..., 4.1910e-08, 0.0000e+00, 5.8599e-06], [ 0.0000e+00, 9.3132e-10, 4.2934e-07, ..., 3.3528e-08, 0.0000e+00, 5.8580e-07], ..., [ 1.4901e-08, -9.3132e-09, 1.2778e-06, ..., 2.8033e-07, 0.0000e+00, 1.5087e-06], [ 1.5832e-08, 0.0000e+00, -6.6794e-06, ..., -2.0675e-07, 0.0000e+00, -9.5367e-06], [-4.2841e-08, 9.3132e-10, -6.2399e-08, ..., -4.1910e-08, 0.0000e+00, 7.4506e-08]], device='cuda:0') Epoch 193, bias, value: tensor([-0.0133, -0.0199, -0.0282, -0.0291, -0.0093, 0.0064, 0.0102, -0.0170, -0.0090, 0.0016], device='cuda:0'), grad: tensor([ 2.3842e-07, 9.5442e-06, 1.1232e-06, 4.4703e-07, -7.6089e-07, 3.0641e-07, 1.4575e-06, 2.6394e-06, -1.5222e-05, 1.8068e-07], device='cuda:0') 100 0.0001 changing lr epoch 192, time 220.28, cls_loss 0.0013 cls_loss_mapping 0.0022 cls_loss_causal 0.4918 re_mapping 0.0050 re_causal 0.0134 /// teacc 99.03 lr 0.00010000 Epoch 194, weight, value: tensor([[-0.1754, -0.1770, 0.0949, ..., -0.0898, 0.0507, 0.0369], [-0.1123, -0.0403, -0.0711, ..., -0.1275, -0.0871, -0.0331], [ 0.0224, -0.1001, -0.0948, ..., -0.0900, 0.0266, -0.2952], ..., [-0.1617, 0.1291, 0.0048, ..., 0.1378, -0.0314, -0.1155], [-0.1216, -0.1073, 0.1271, ..., -0.0880, -0.1482, 0.1248], [ 0.0168, -0.2338, 0.1215, ..., 0.0647, -0.1717, -0.0926]], device='cuda:0'), grad: tensor([[ 2.6077e-08, 8.3819e-09, -1.1176e-07, ..., 5.5879e-09, 0.0000e+00, -1.9651e-06], [ 2.9802e-08, 2.4959e-07, 9.3132e-10, ..., 1.7881e-07, 0.0000e+00, -4.6566e-09], [ 8.3819e-09, 2.4028e-07, 2.4214e-08, ..., 1.7509e-07, 0.0000e+00, 2.2352e-08], ..., [ 8.3819e-09, -5.4948e-07, -3.4459e-08, ..., -4.1537e-07, 0.0000e+00, 3.6322e-08], [ 5.4017e-08, 1.1176e-08, -1.3039e-08, ..., 0.0000e+00, 0.0000e+00, 3.1665e-08], [ 2.2352e-08, 2.7940e-08, 5.5879e-09, ..., 1.4901e-08, 0.0000e+00, 3.1665e-08]], device='cuda:0') Epoch 194, bias, value: tensor([-0.0138, -0.0196, -0.0280, -0.0291, -0.0092, 0.0064, 0.0097, -0.0174, -0.0076, 0.0015], device='cuda:0'), grad: tensor([-2.8480e-06, 1.2647e-06, -1.5469e-06, 3.0734e-06, 7.4506e-08, -2.2147e-06, 2.2575e-06, -7.1526e-07, 3.1479e-07, 3.0082e-07], device='cuda:0') 100 0.0001 changing lr epoch 193, time 220.17, cls_loss 0.0018 cls_loss_mapping 0.0033 cls_loss_causal 0.5275 re_mapping 0.0047 re_causal 0.0130 /// teacc 99.06 lr 0.00010000 Epoch 195, weight, value: tensor([[-0.1757, -0.1778, 0.0952, ..., -0.0908, 0.0507, 0.0352], [-0.1130, -0.0400, -0.0710, ..., -0.1296, -0.0875, -0.0335], [ 0.0224, -0.1010, -0.0960, ..., -0.0910, 0.0266, -0.2963], ..., [-0.1641, 0.1293, 0.0052, ..., 0.1384, -0.0311, -0.1158], [-0.1226, -0.1077, 0.1285, ..., -0.0886, -0.1483, 0.1254], [ 0.0169, -0.2355, 0.1212, ..., 0.0648, -0.1720, -0.0927]], device='cuda:0'), grad: tensor([[-2.2352e-08, 2.7940e-09, -1.7323e-07, ..., 3.7253e-09, 0.0000e+00, -1.4063e-07], [ 1.8626e-09, -1.2349e-06, 1.2107e-08, ..., -9.5647e-07, 0.0000e+00, -2.8871e-07], [ 9.3132e-10, 6.5193e-08, 2.6077e-08, ..., 2.7940e-08, 0.0000e+00, 1.7695e-08], ..., [ 5.5879e-09, 1.1763e-06, -6.5193e-09, ..., 9.4622e-07, 0.0000e+00, 2.7567e-07], [ 3.5390e-08, 2.8871e-08, 2.3283e-08, ..., 4.8429e-08, 0.0000e+00, 2.5146e-08], [-1.9558e-08, 1.0245e-08, -5.9605e-08, ..., -3.2596e-08, 0.0000e+00, 1.5832e-08]], device='cuda:0') Epoch 195, bias, value: tensor([-0.0162, -0.0196, -0.0283, -0.0287, -0.0090, 0.0064, 0.0105, -0.0177, -0.0075, 0.0012], device='cuda:0'), grad: tensor([-5.9512e-07, -5.2340e-06, 1.9930e-07, -1.2387e-07, -2.4401e-07, 1.9185e-07, 4.4238e-07, 5.1782e-06, 1.9651e-07, 3.0734e-08], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 194---------------------------------------------------- epoch 194, time 221.37, cls_loss 0.0012 cls_loss_mapping 0.0019 cls_loss_causal 0.5006 re_mapping 0.0049 re_causal 0.0135 /// teacc 99.15 lr 0.00010000 Epoch 196, weight, value: tensor([[-0.1757, -0.1783, 0.0964, ..., -0.0912, 0.0508, 0.0359], [-0.1132, -0.0402, -0.0712, ..., -0.1305, -0.0877, -0.0335], [ 0.0222, -0.1013, -0.0965, ..., -0.0913, 0.0267, -0.2968], ..., [-0.1649, 0.1297, 0.0053, ..., 0.1389, -0.0314, -0.1165], [-0.1231, -0.1078, 0.1290, ..., -0.0896, -0.1483, 0.1258], [ 0.0173, -0.2360, 0.1213, ..., 0.0648, -0.1724, -0.0933]], device='cuda:0'), grad: tensor([[-2.1514e-06, 3.4459e-08, 4.6566e-09, ..., 5.4948e-08, 4.6566e-09, -3.9600e-06], [ 5.5879e-09, 4.9919e-07, -3.9116e-08, ..., 7.5717e-07, 4.4703e-08, -1.0803e-07], [ 1.5832e-08, 1.6335e-06, 2.8871e-08, ..., 2.6841e-06, 3.3528e-08, 6.5193e-08], ..., [ 3.1758e-07, -8.4490e-06, -1.6578e-07, ..., -1.4067e-05, -3.1665e-08, 6.9570e-07], [ 6.5193e-09, 1.0896e-07, 2.7940e-09, ..., 1.3784e-07, 5.0571e-07, 1.4184e-06], [ 1.3970e-08, 3.3155e-06, 5.6811e-08, ..., 5.6252e-06, 4.6566e-09, 7.4506e-08]], device='cuda:0') Epoch 196, bias, value: tensor([-0.0153, -0.0198, -0.0282, -0.0288, -0.0090, 0.0064, 0.0104, -0.0176, -0.0076, 0.0012], device='cuda:0'), grad: tensor([-9.5740e-06, 1.3402e-06, 6.1169e-06, 5.1130e-07, 1.0043e-05, 7.9870e-06, -5.7593e-06, -2.9400e-05, 5.9642e-06, 1.2733e-05], device='cuda:0') 100 0.0001 changing lr epoch 195, time 220.40, cls_loss 0.0011 cls_loss_mapping 0.0028 cls_loss_causal 0.4992 re_mapping 0.0048 re_causal 0.0135 /// teacc 99.04 lr 0.00010000 Epoch 197, weight, value: tensor([[-0.1758, -0.1787, 0.0965, ..., -0.0914, 0.0510, 0.0359], [-0.1133, -0.0401, -0.0711, ..., -0.1308, -0.0882, -0.0334], [ 0.0221, -0.1016, -0.0967, ..., -0.0914, 0.0268, -0.2977], ..., [-0.1654, 0.1297, 0.0051, ..., 0.1391, -0.0324, -0.1169], [-0.1233, -0.1078, 0.1297, ..., -0.0898, -0.1484, 0.1264], [ 0.0172, -0.2363, 0.1219, ..., 0.0656, -0.1732, -0.0940]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 2.9802e-08, 8.3819e-09, ..., 1.0245e-08, 0.0000e+00, 3.7253e-09], [ 4.6566e-09, 9.4101e-06, 1.8207e-06, ..., 7.9814e-07, 1.8626e-09, -5.4017e-08], [ 9.3132e-10, 5.6345e-07, 1.1083e-07, ..., 5.6811e-08, 1.8626e-09, 8.3819e-09], ..., [-1.7975e-07, -1.1459e-05, -2.1271e-06, ..., -1.5628e-06, 1.8626e-09, 1.6764e-08], [ 2.7940e-09, 4.0047e-08, -6.5193e-09, ..., 1.6764e-08, 3.7253e-09, -2.7940e-08], [-3.9116e-08, 5.1223e-08, -9.9652e-08, ..., -1.6764e-08, 1.8626e-09, 2.5146e-08]], device='cuda:0') Epoch 197, bias, value: tensor([-0.0154, -0.0197, -0.0276, -0.0289, -0.0100, 0.0065, 0.0104, -0.0179, -0.0073, 0.0018], device='cuda:0'), grad: tensor([ 9.1270e-08, 1.6272e-05, 8.5589e-07, 1.6922e-06, 9.6019e-07, 2.7567e-07, -2.7940e-08, -2.0236e-05, 1.1921e-07, -1.7695e-08], device='cuda:0') 100 0.0001 changing lr epoch 196, time 220.05, cls_loss 0.0012 cls_loss_mapping 0.0032 cls_loss_causal 0.4959 re_mapping 0.0047 re_causal 0.0135 /// teacc 99.07 lr 0.00010000 Epoch 198, weight, value: tensor([[-0.1760, -0.1791, 0.0967, ..., -0.0917, 0.0509, 0.0358], [-0.1133, -0.0406, -0.0716, ..., -0.1319, -0.0885, -0.0335], [ 0.0220, -0.1018, -0.0974, ..., -0.0913, 0.0269, -0.2984], ..., [-0.1659, 0.1302, 0.0053, ..., 0.1393, -0.0341, -0.1168], [-0.1237, -0.1080, 0.1306, ..., -0.0904, -0.1484, 0.1279], [ 0.0174, -0.2367, 0.1222, ..., 0.0657, -0.1741, -0.0942]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 9.3132e-10, 1.8626e-09, ..., 1.8626e-09, 0.0000e+00, 1.1176e-08], [ 9.3132e-09, 7.4506e-09, 1.5832e-08, ..., 7.4506e-09, 0.0000e+00, 9.3132e-10], [ 9.3132e-10, -5.5879e-09, 1.5832e-08, ..., 1.8626e-09, -2.2352e-08, 1.9558e-08], ..., [ 7.4506e-09, 1.6037e-06, 4.6492e-06, ..., 7.6741e-07, 5.5879e-09, 3.6806e-06], [ 1.2666e-07, -1.6242e-06, -4.7460e-06, ..., -7.7952e-07, 1.7695e-08, -3.6061e-06], [ 1.9558e-08, 7.4506e-09, 8.3819e-09, ..., 3.3528e-08, 9.3132e-10, 4.0978e-08]], device='cuda:0') Epoch 198, bias, value: tensor([-0.0155, -0.0201, -0.0271, -0.0289, -0.0100, 0.0063, 0.0104, -0.0179, -0.0063, 0.0019], device='cuda:0'), grad: tensor([ 3.8184e-08, -5.4948e-08, -7.5437e-08, 1.0710e-07, -1.0617e-07, -3.5763e-07, 7.2643e-08, 7.4878e-06, -7.2792e-06, 1.8906e-07], device='cuda:0') 100 0.0001 changing lr epoch 197, time 220.58, cls_loss 0.0014 cls_loss_mapping 0.0027 cls_loss_causal 0.5047 re_mapping 0.0049 re_causal 0.0134 /// teacc 99.06 lr 0.00010000 Epoch 199, weight, value: tensor([[-0.1763, -0.1790, 0.0970, ..., -0.0921, 0.0511, 0.0357], [-0.1144, -0.0403, -0.0714, ..., -0.1338, -0.0891, -0.0340], [ 0.0216, -0.1021, -0.0987, ..., -0.0920, 0.0270, -0.3006], ..., [-0.1665, 0.1304, 0.0055, ..., 0.1408, -0.0326, -0.1171], [-0.1247, -0.1081, 0.1315, ..., -0.0910, -0.1484, 0.1284], [ 0.0175, -0.2378, 0.1222, ..., 0.0651, -0.1755, -0.0947]], device='cuda:0'), grad: tensor([[ 2.4214e-08, 1.0245e-08, 2.7940e-08, ..., 1.0896e-07, 3.7253e-09, 1.0179e-06], [ 3.8184e-08, 1.6671e-07, 5.4948e-08, ..., 2.4866e-07, 6.5193e-09, 1.9372e-07], [ 4.6566e-09, 3.9563e-06, 6.9756e-07, ..., 2.5295e-06, 7.4506e-09, 6.9290e-07], ..., [ 5.1223e-08, -4.1537e-06, -3.0734e-07, ..., -2.5071e-06, 6.5193e-09, 5.7835e-07], [ 1.1269e-07, -1.7695e-08, -3.9767e-07, ..., 2.2072e-07, 1.3690e-07, -1.2275e-06], [ 5.7556e-06, 6.5193e-08, -1.4054e-06, ..., 1.8612e-05, 1.8626e-09, 2.0489e-08]], device='cuda:0') Epoch 199, bias, value: tensor([-0.0156, -0.0201, -0.0264, -0.0290, -0.0097, 0.0064, 0.0104, -0.0177, -0.0058, 0.0011], device='cuda:0'), grad: tensor([ 5.9828e-06, 1.8701e-06, 1.0327e-05, 2.6822e-07, -6.0886e-05, -2.1420e-06, -9.1717e-06, -8.4713e-06, -6.3796e-07, 6.2943e-05], device='cuda:0') 100 0.0001 changing lr epoch 198, time 220.51, cls_loss 0.0012 cls_loss_mapping 0.0026 cls_loss_causal 0.5130 re_mapping 0.0049 re_causal 0.0139 /// teacc 99.10 lr 0.00010000 Epoch 200, weight, value: tensor([[-0.1766, -0.1798, 0.0970, ..., -0.0927, 0.0511, 0.0357], [-0.1145, -0.0419, -0.0727, ..., -0.1348, -0.0887, -0.0339], [ 0.0218, -0.1022, -0.0996, ..., -0.0924, 0.0270, -0.3018], ..., [-0.1677, 0.1319, 0.0060, ..., 0.1409, -0.0332, -0.1175], [-0.1250, -0.1081, 0.1323, ..., -0.0916, -0.1484, 0.1291], [ 0.0175, -0.2380, 0.1226, ..., 0.0649, -0.1762, -0.0951]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.5879e-09, -3.9954e-07, ..., 9.3132e-09, 9.3132e-10, 1.0245e-08], [ 0.0000e+00, 1.7695e-08, 2.5146e-08, ..., 2.4214e-08, 1.8626e-09, -6.5193e-08], [ 0.0000e+00, -2.1420e-08, 6.1467e-08, ..., -1.1828e-07, -2.2352e-08, 5.5879e-08], ..., [ 3.7253e-09, -4.9360e-08, 1.8626e-09, ..., 4.4703e-08, 1.6764e-08, 5.6811e-08], [ 2.4214e-08, 3.7253e-09, -1.0617e-07, ..., -1.9558e-08, 9.3132e-10, -1.1921e-07], [ 3.7253e-09, 2.7008e-08, 3.3248e-07, ..., 8.6613e-08, 9.3132e-10, 6.5193e-08]], device='cuda:0') Epoch 200, bias, value: tensor([-0.0157, -0.0212, -0.0263, -0.0291, -0.0093, 0.0064, 0.0104, -0.0168, -0.0053, 0.0009], device='cuda:0'), grad: tensor([-9.3225e-07, -1.2871e-06, -6.8452e-07, 1.6391e-07, 1.5542e-05, -7.5437e-08, -1.5125e-05, 1.3607e-06, -2.3562e-07, 1.2768e-06], device='cuda:0') 100 0.0001 changing lr epoch 199, time 220.45, cls_loss 0.0018 cls_loss_mapping 0.0031 cls_loss_causal 0.4862 re_mapping 0.0050 re_causal 0.0133 /// teacc 99.01 lr 0.00010000 Epoch 201, weight, value: tensor([[-0.1766, -0.1800, 0.0973, ..., -0.0931, 0.0509, 0.0358], [-0.1165, -0.0454, -0.0765, ..., -0.1356, -0.0892, -0.0345], [ 0.0216, -0.1013, -0.1013, ..., -0.0935, 0.0272, -0.3036], ..., [-0.1678, 0.1355, 0.0092, ..., 0.1417, -0.0353, -0.1177], [-0.1260, -0.1088, 0.1330, ..., -0.0927, -0.1485, 0.1296], [ 0.0175, -0.2385, 0.1228, ..., 0.0645, -0.1792, -0.0955]], device='cuda:0'), grad: tensor([[-5.5879e-09, 9.3132e-10, -4.7497e-08, ..., 9.8720e-08, 9.3132e-10, -2.0675e-07], [-1.9278e-07, -1.3970e-08, -1.9185e-07, ..., -2.2855e-06, 9.3132e-10, -7.4506e-09], [ 3.7253e-09, 3.1665e-08, 8.5682e-08, ..., 6.5193e-09, 9.3132e-10, 6.2473e-06], ..., [ 1.2293e-07, 2.7008e-08, 2.6543e-07, ..., 2.2780e-06, 0.0000e+00, 1.7360e-06], [ 7.2643e-08, -1.3039e-08, 2.4214e-08, ..., 3.2596e-08, 1.8626e-09, -1.2547e-05], [-8.4750e-08, 4.6566e-09, -8.0746e-07, ..., -4.5728e-07, 0.0000e+00, -7.4506e-08]], device='cuda:0') Epoch 201, bias, value: tensor([-0.0158, -0.0244, -0.0243, -0.0306, -0.0088, 0.0065, 0.0102, -0.0135, -0.0044, 0.0006], device='cuda:0'), grad: tensor([ 4.8429e-08, -4.5657e-05, 1.8120e-05, 4.0978e-06, 1.1735e-06, 7.9498e-06, 2.3842e-06, 4.9621e-05, -3.5912e-05, -1.8990e-06], device='cuda:0') 100 0.0001 changing lr epoch 200, time 220.82, cls_loss 0.0014 cls_loss_mapping 0.0028 cls_loss_causal 0.5121 re_mapping 0.0048 re_causal 0.0137 /// teacc 99.00 lr 0.00010000 Epoch 202, weight, value: tensor([[-0.1768, -0.1800, 0.0972, ..., -0.0936, 0.0512, 0.0357], [-0.1169, -0.0450, -0.0762, ..., -0.1368, -0.0873, -0.0344], [ 0.0233, -0.1020, -0.1033, ..., -0.0947, 0.0271, -0.3051], ..., [-0.1687, 0.1353, 0.0089, ..., 0.1424, -0.0369, -0.1182], [-0.1262, -0.1091, 0.1354, ..., -0.0927, -0.1484, 0.1306], [ 0.0173, -0.2396, 0.1227, ..., 0.0642, -0.1816, -0.0970]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 2.3283e-08, 6.5193e-09, ..., 1.4901e-08, 0.0000e+00, 7.4506e-09], [ 1.8626e-09, 4.2841e-08, 8.9407e-08, ..., 1.2293e-07, 0.0000e+00, -6.6124e-08], [ 1.8626e-09, 2.0768e-07, 3.4459e-08, ..., 1.1735e-07, 0.0000e+00, 1.6764e-08], ..., [ 1.3039e-08, -5.2620e-07, -2.0862e-07, ..., -3.5670e-07, 0.0000e+00, 6.7055e-08], [ 3.7253e-08, 3.3528e-08, -8.3819e-09, ..., 2.9802e-08, 0.0000e+00, 1.8626e-09], [ 2.5146e-08, 1.2666e-07, -4.8429e-08, ..., -3.9116e-08, 0.0000e+00, 2.3283e-08]], device='cuda:0') Epoch 202, bias, value: tensor([-1.6109e-02, -2.3727e-02, -2.4842e-02, -3.0751e-02, -8.5017e-03, 6.5119e-03, 1.0222e-02, -1.3897e-02, -3.2987e-03, -4.6578e-06], device='cuda:0'), grad: tensor([ 1.3970e-07, -9.1828e-07, 2.0396e-07, 4.2655e-07, 3.3900e-07, -3.5204e-07, 1.8626e-09, 1.4435e-07, 1.2945e-07, -1.0151e-07], device='cuda:0') 100 0.0001 changing lr epoch 201, time 220.28, cls_loss 0.0012 cls_loss_mapping 0.0025 cls_loss_causal 0.4771 re_mapping 0.0048 re_causal 0.0131 /// teacc 98.98 lr 0.00010000 Epoch 203, weight, value: tensor([[-0.1768, -0.1804, 0.0975, ..., -0.0940, 0.0510, 0.0358], [-0.1172, -0.0448, -0.0761, ..., -0.1376, -0.0884, -0.0343], [ 0.0236, -0.1025, -0.1050, ..., -0.0957, 0.0273, -0.3059], ..., [-0.1694, 0.1349, 0.0082, ..., 0.1426, -0.0377, -0.1187], [-0.1265, -0.1093, 0.1364, ..., -0.0934, -0.1485, 0.1314], [ 0.0174, -0.2400, 0.1236, ..., 0.0641, -0.1824, -0.0972]], device='cuda:0'), grad: tensor([[ 1.4901e-08, 3.7253e-09, 1.7695e-08, ..., 1.3970e-08, 0.0000e+00, 4.6566e-09], [ 7.4506e-09, 4.8131e-06, 5.0291e-08, ..., 3.9712e-06, 9.3132e-10, -3.1386e-07], [ 6.6124e-08, 8.0094e-08, 8.4750e-08, ..., 1.1362e-07, 9.3132e-10, 6.5193e-09], ..., [ 3.7625e-07, -4.9546e-06, 3.7346e-07, ..., -3.8259e-06, -4.6566e-09, 9.5926e-08], [ 1.0245e-08, 7.4506e-09, 1.8626e-09, ..., 1.3970e-08, 0.0000e+00, -2.7940e-09], [-1.0990e-06, -5.4017e-08, -1.2470e-06, ..., -8.0187e-07, 0.0000e+00, 3.7253e-09]], device='cuda:0') Epoch 203, bias, value: tensor([-1.6049e-02, -2.3541e-02, -2.4778e-02, -3.0594e-02, -8.1404e-03, 6.4418e-03, 1.0046e-02, -1.4275e-02, -2.7867e-03, -3.0324e-05], device='cuda:0'), grad: tensor([ 8.9407e-08, 9.5889e-06, 3.5856e-07, 7.2084e-07, 6.3051e-07, 1.2023e-06, 8.1398e-07, -9.3430e-06, 7.5437e-08, -4.1425e-06], device='cuda:0') 100 0.0001 changing lr epoch 202, time 220.70, cls_loss 0.0012 cls_loss_mapping 0.0028 cls_loss_causal 0.5164 re_mapping 0.0048 re_causal 0.0136 /// teacc 99.09 lr 0.00010000 Epoch 204, weight, value: tensor([[-0.1772, -0.1814, 0.0977, ..., -0.0946, 0.0509, 0.0356], [-0.1164, -0.0448, -0.0762, ..., -0.1389, -0.0889, -0.0336], [ 0.0235, -0.1029, -0.1058, ..., -0.0964, 0.0273, -0.3078], ..., [-0.1701, 0.1347, 0.0081, ..., 0.1439, -0.0381, -0.1197], [-0.1270, -0.1093, 0.1369, ..., -0.0944, -0.1485, 0.1320], [ 0.0165, -0.2419, 0.1238, ..., 0.0640, -0.1828, -0.0988]], device='cuda:0'), grad: tensor([[ 3.8184e-08, 3.7532e-07, -2.0210e-07, ..., 1.5832e-08, 0.0000e+00, -1.1083e-07], [ 2.2911e-07, -2.4587e-05, -7.2159e-06, ..., 3.8184e-08, 0.0000e+00, -1.0066e-05], [-7.6815e-06, 1.6484e-07, 6.3330e-08, ..., -1.8347e-07, 0.0000e+00, 7.5437e-08], ..., [ 3.1292e-07, 2.2799e-05, 6.6906e-06, ..., -1.1176e-07, 0.0000e+00, 9.3430e-06], [ 1.0338e-07, 9.6858e-08, 1.5739e-07, ..., 1.0245e-08, 0.0000e+00, 1.5832e-07], [ 5.7705e-06, 1.0636e-06, 2.6077e-07, ..., 1.8068e-07, 0.0000e+00, 4.6007e-07]], device='cuda:0') Epoch 204, bias, value: tensor([-0.0163, -0.0235, -0.0248, -0.0303, -0.0080, 0.0064, 0.0101, -0.0143, -0.0026, -0.0005], device='cuda:0'), grad: tensor([ 9.5833e-07, -8.6904e-05, -5.7727e-05, 8.2701e-06, 1.2806e-06, 3.0734e-07, 2.6077e-07, 8.7678e-05, 2.4550e-06, 4.3601e-05], device='cuda:0') 100 0.0001 changing lr epoch 203, time 220.46, cls_loss 0.0014 cls_loss_mapping 0.0029 cls_loss_causal 0.5090 re_mapping 0.0045 re_causal 0.0129 /// teacc 99.06 lr 0.00010000 Epoch 205, weight, value: tensor([[-0.1774, -0.1819, 0.0990, ..., -0.0950, 0.0507, 0.0365], [-0.1169, -0.0445, -0.0760, ..., -0.1397, -0.0918, -0.0331], [ 0.0239, -0.1035, -0.1066, ..., -0.0967, 0.0283, -0.3086], ..., [-0.1705, 0.1345, 0.0078, ..., 0.1445, -0.0401, -0.1207], [-0.1273, -0.1097, 0.1373, ..., -0.0955, -0.1486, 0.1323], [ 0.0165, -0.2428, 0.1238, ..., 0.0640, -0.1834, -0.0994]], device='cuda:0'), grad: tensor([[ 1.0245e-08, 5.5879e-09, -3.7253e-09, ..., 7.4506e-09, -1.8626e-09, 3.0734e-08], [-2.7288e-07, 4.2841e-08, 2.4214e-08, ..., 4.7497e-08, 0.0000e+00, -1.1204e-06], [ 4.6566e-09, 1.6764e-07, 2.9802e-08, ..., 2.2352e-07, 0.0000e+00, 4.8429e-08], ..., [ 8.3819e-09, -2.6822e-07, -1.9558e-08, ..., -3.0920e-07, 0.0000e+00, 8.1956e-08], [ 2.0955e-07, 3.7253e-09, -2.7660e-07, ..., -1.6764e-08, 0.0000e+00, 6.5193e-08], [ 3.7253e-09, 2.9802e-08, -5.3085e-08, ..., -2.7008e-08, 0.0000e+00, 3.5390e-08]], device='cuda:0') Epoch 205, bias, value: tensor([-0.0149, -0.0232, -0.0241, -0.0303, -0.0082, 0.0065, 0.0098, -0.0148, -0.0029, -0.0010], device='cuda:0'), grad: tensor([ 1.1735e-07, -2.9206e-06, 5.1502e-07, 9.3132e-08, 1.6484e-07, 8.1398e-07, 6.4075e-07, -2.2538e-07, 8.2236e-07, -1.1176e-08], device='cuda:0') 100 0.0001 changing lr epoch 204, time 220.84, cls_loss 0.0010 cls_loss_mapping 0.0020 cls_loss_causal 0.4972 re_mapping 0.0046 re_causal 0.0131 /// teacc 99.01 lr 0.00010000 Epoch 206, weight, value: tensor([[-0.1778, -0.1825, 0.0999, ..., -0.0955, 0.0507, 0.0367], [-0.1174, -0.0445, -0.0760, ..., -0.1416, -0.0919, -0.0331], [ 0.0238, -0.1040, -0.1071, ..., -0.0978, 0.0284, -0.3091], ..., [-0.1713, 0.1347, 0.0078, ..., 0.1461, -0.0405, -0.1210], [-0.1274, -0.1098, 0.1374, ..., -0.0960, -0.1486, 0.1327], [ 0.0166, -0.2433, 0.1238, ..., 0.0639, -0.1834, -0.0996]], device='cuda:0'), grad: tensor([[-4.0978e-08, 2.9895e-07, -1.1455e-07, ..., 1.6764e-08, 0.0000e+00, -5.0664e-07], [ 3.7253e-09, 5.5134e-07, 1.2945e-07, ..., 1.9558e-07, 1.8626e-09, 1.6578e-07], [ 1.8626e-09, 3.9004e-06, 1.6764e-07, ..., 1.2107e-07, -1.8626e-09, 2.5425e-07], ..., [ 1.0245e-08, 1.1042e-05, 1.3784e-07, ..., -1.3039e-08, 0.0000e+00, 2.3842e-07], [ 1.8626e-09, 3.1162e-06, -1.8915e-06, ..., 1.0431e-07, 0.0000e+00, -2.8443e-06], [ 9.3132e-10, 2.8685e-07, 5.8115e-07, ..., -9.3132e-08, 0.0000e+00, 6.0629e-07]], device='cuda:0') Epoch 206, bias, value: tensor([-0.0145, -0.0232, -0.0242, -0.0303, -0.0084, 0.0064, 0.0097, -0.0147, -0.0028, -0.0011], device='cuda:0'), grad: tensor([-6.5938e-07, 1.1129e-06, 5.5917e-06, -2.1607e-05, 7.6927e-07, 3.4906e-06, -1.2834e-06, 1.3351e-05, -3.5241e-06, 2.7344e-06], device='cuda:0') 100 0.0001 changing lr epoch 205, time 220.53, cls_loss 0.0012 cls_loss_mapping 0.0025 cls_loss_causal 0.4775 re_mapping 0.0048 re_causal 0.0132 /// teacc 99.03 lr 0.00010000 Epoch 207, weight, value: tensor([[-0.1780, -0.1831, 0.0987, ..., -0.0980, 0.0507, 0.0368], [-0.1169, -0.0447, -0.0761, ..., -0.1428, -0.0920, -0.0329], [ 0.0236, -0.1045, -0.1076, ..., -0.0989, 0.0286, -0.3097], ..., [-0.1718, 0.1350, 0.0079, ..., 0.1459, -0.0414, -0.1213], [-0.1276, -0.1102, 0.1377, ..., -0.0963, -0.1486, 0.1331], [ 0.0167, -0.2437, 0.1243, ..., 0.0623, -0.1837, -0.1000]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.7940e-09, -3.2969e-07, ..., 2.4214e-08, 0.0000e+00, -3.7812e-07], [ 0.0000e+00, 1.0245e-08, 9.4995e-08, ..., 4.3772e-08, 0.0000e+00, 4.4703e-08], [ 0.0000e+00, 5.5879e-09, 1.8068e-07, ..., 5.5879e-09, -1.8626e-09, 2.7101e-07], ..., [ 0.0000e+00, -4.1910e-08, 1.3001e-06, ..., 8.4378e-07, 0.0000e+00, 1.9185e-07], [ 0.0000e+00, 9.3132e-10, -5.5600e-07, ..., 7.1712e-08, 0.0000e+00, -9.7603e-07], [ 0.0000e+00, 3.7253e-09, -1.4408e-06, ..., -1.2470e-06, 0.0000e+00, 3.7905e-07]], device='cuda:0') Epoch 207, bias, value: tensor([-0.0151, -0.0232, -0.0242, -0.0302, -0.0067, 0.0061, 0.0105, -0.0147, -0.0028, -0.0021], device='cuda:0'), grad: tensor([-6.9011e-07, 8.9407e-08, 7.3761e-07, 5.5321e-07, 5.6066e-07, 9.0525e-07, -2.7753e-07, 3.0454e-06, -2.2911e-06, -2.6375e-06], device='cuda:0') 100 0.0001 changing lr epoch 206, time 220.70, cls_loss 0.0014 cls_loss_mapping 0.0031 cls_loss_causal 0.5256 re_mapping 0.0048 re_causal 0.0132 /// teacc 98.96 lr 0.00010000 Epoch 208, weight, value: tensor([[-0.1786, -0.1836, 0.0990, ..., -0.0984, 0.0509, 0.0368], [-0.1168, -0.0447, -0.0762, ..., -0.1435, -0.0926, -0.0328], [ 0.0231, -0.1052, -0.1093, ..., -0.0998, 0.0286, -0.3109], ..., [-0.1723, 0.1350, 0.0080, ..., 0.1461, -0.0412, -0.1219], [-0.1279, -0.1106, 0.1384, ..., -0.0967, -0.1487, 0.1338], [ 0.0174, -0.2442, 0.1249, ..., 0.0623, -0.1842, -0.1004]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-09, -5.2154e-08, ..., 2.5146e-08, 1.8626e-09, 1.1548e-07], [ 0.0000e+00, 1.9558e-08, 7.4506e-09, ..., 9.5926e-08, 3.5390e-08, 2.7008e-08], [ 0.0000e+00, 2.7381e-07, 3.0734e-08, ..., 1.9558e-08, -1.5832e-08, 1.1176e-07], ..., [ 0.0000e+00, 4.0047e-08, 8.3819e-09, ..., 6.5193e-09, 5.5879e-09, 4.5635e-08], [ 0.0000e+00, 2.1420e-08, -4.7497e-08, ..., 3.2596e-08, 0.0000e+00, -1.9558e-08], [-9.3132e-10, 1.0245e-08, 1.8626e-09, ..., 6.4261e-08, 3.7253e-09, 7.7300e-08]], device='cuda:0') Epoch 208, bias, value: tensor([-0.0153, -0.0231, -0.0242, -0.0302, -0.0070, 0.0063, 0.0098, -0.0149, -0.0027, -0.0018], device='cuda:0'), grad: tensor([ 1.9651e-06, 1.7127e-06, 1.6307e-06, -2.9895e-07, 9.1270e-06, 1.3262e-06, -1.7822e-05, 6.4168e-07, 1.0123e-06, 6.9104e-07], device='cuda:0') 100 0.0001 changing lr epoch 207, time 220.48, cls_loss 0.0012 cls_loss_mapping 0.0025 cls_loss_causal 0.4879 re_mapping 0.0046 re_causal 0.0127 /// teacc 99.08 lr 0.00010000 Epoch 209, weight, value: tensor([[-0.1790, -0.1840, 0.0987, ..., -0.0992, 0.0510, 0.0365], [-0.1169, -0.0447, -0.0764, ..., -0.1454, -0.0927, -0.0326], [ 0.0229, -0.1055, -0.1106, ..., -0.1001, 0.0286, -0.3112], ..., [-0.1727, 0.1351, 0.0079, ..., 0.1463, -0.0412, -0.1228], [-0.1284, -0.1113, 0.1387, ..., -0.0976, -0.1487, 0.1338], [ 0.0180, -0.2445, 0.1260, ..., 0.0627, -0.1845, -0.1008]], device='cuda:0'), grad: tensor([[ 2.7008e-08, 3.1665e-08, -5.0943e-07, ..., 4.4703e-08, -2.7940e-09, -1.5274e-07], [ 7.4506e-09, 6.3330e-08, 5.0291e-08, ..., 8.6613e-08, 0.0000e+00, -4.4703e-08], [ 5.5879e-09, 4.1910e-08, 3.0827e-07, ..., 5.3085e-08, 0.0000e+00, 1.4901e-07], ..., [ 2.3283e-08, -5.6997e-07, -4.3772e-08, ..., -7.2177e-07, 0.0000e+00, 5.7742e-08], [ 8.4750e-08, 2.9802e-08, 2.5146e-08, ..., 8.0094e-08, 0.0000e+00, 1.1176e-08], [-1.4622e-07, 1.2759e-07, -1.1362e-07, ..., 3.8184e-08, 0.0000e+00, -5.8673e-08]], device='cuda:0') Epoch 209, bias, value: tensor([-0.0158, -0.0230, -0.0239, -0.0302, -0.0072, 0.0061, 0.0103, -0.0151, -0.0034, -0.0010], device='cuda:0'), grad: tensor([-1.4110e-06, 5.0291e-08, 1.1306e-06, 2.5518e-07, 9.2853e-07, -1.0245e-06, 1.3765e-06, -1.6363e-06, 1.5646e-07, 1.5087e-07], device='cuda:0') 100 0.0001 changing lr epoch 208, time 220.01, cls_loss 0.0014 cls_loss_mapping 0.0032 cls_loss_causal 0.5218 re_mapping 0.0046 re_causal 0.0128 /// teacc 99.04 lr 0.00010000 Epoch 210, weight, value: tensor([[-0.1804, -0.1852, 0.0988, ..., -0.0995, 0.0510, 0.0353], [-0.1149, -0.0448, -0.0764, ..., -0.1461, -0.0927, -0.0310], [ 0.0224, -0.1066, -0.1119, ..., -0.1009, 0.0286, -0.3126], ..., [-0.1739, 0.1352, 0.0083, ..., 0.1474, -0.0413, -0.1234], [-0.1295, -0.1133, 0.1398, ..., -0.0984, -0.1487, 0.1339], [ 0.0179, -0.2465, 0.1253, ..., 0.0612, -0.1851, -0.1016]], device='cuda:0'), grad: tensor([[ 6.3237e-07, 6.5193e-09, 7.4506e-09, ..., 7.4506e-09, 0.0000e+00, 1.0319e-06], [ 3.8091e-07, 7.5437e-08, 8.3819e-09, ..., 4.6566e-08, 0.0000e+00, 5.8208e-07], [ 1.1269e-07, 3.8836e-07, 1.3039e-08, ..., 1.2573e-07, 0.0000e+00, 1.9092e-07], ..., [ 1.5739e-07, -2.2724e-07, -9.3132e-09, ..., -9.2201e-08, 0.0000e+00, 2.8126e-07], [ 1.1563e-05, 3.0734e-08, 6.1467e-08, ..., 2.7847e-07, 0.0000e+00, 2.5123e-05], [ 1.4994e-07, 3.2596e-08, -1.6764e-07, ..., 3.4459e-07, 0.0000e+00, 3.1479e-07]], device='cuda:0') Epoch 210, bias, value: tensor([-0.0168, -0.0228, -0.0242, -0.0301, -0.0062, 0.0063, 0.0105, -0.0151, -0.0039, -0.0023], device='cuda:0'), grad: tensor([ 2.2128e-06, 1.8142e-06, 1.1278e-06, 8.1807e-06, -1.9576e-06, -5.0217e-05, -3.0957e-06, 4.7684e-07, 3.8892e-05, 2.5742e-06], device='cuda:0') 100 0.0001 changing lr epoch 209, time 220.47, cls_loss 0.0010 cls_loss_mapping 0.0022 cls_loss_causal 0.4678 re_mapping 0.0046 re_causal 0.0131 /// teacc 99.07 lr 0.00010000 Epoch 211, weight, value: tensor([[-0.1812, -0.1856, 0.0983, ..., -0.1009, 0.0510, 0.0356], [-0.1161, -0.0448, -0.0765, ..., -0.1473, -0.0927, -0.0312], [ 0.0220, -0.1071, -0.1125, ..., -0.1014, 0.0286, -0.3144], ..., [-0.1747, 0.1356, 0.0085, ..., 0.1487, -0.0414, -0.1234], [-0.1331, -0.1138, 0.1403, ..., -0.0996, -0.1487, 0.1319], [ 0.0179, -0.2471, 0.1256, ..., 0.0613, -0.1852, -0.1022]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 9.3132e-10, -4.6566e-08, ..., 0.0000e+00, 0.0000e+00, -1.8626e-08], [ 0.0000e+00, 2.7008e-08, 5.5879e-09, ..., 4.6566e-09, 0.0000e+00, 2.7940e-09], [ 9.3132e-10, 3.0734e-08, 3.3528e-08, ..., 1.8626e-09, 0.0000e+00, 1.5832e-08], ..., [ 9.3132e-10, 3.7253e-08, 3.7253e-09, ..., -7.4506e-09, 0.0000e+00, 6.5193e-09], [ 1.2107e-08, 6.5193e-09, 2.8871e-08, ..., 2.5146e-08, 0.0000e+00, 1.5832e-08], [-5.5879e-09, 6.5193e-09, -5.0291e-08, ..., -2.9802e-08, 0.0000e+00, -1.2107e-08]], device='cuda:0') Epoch 211, bias, value: tensor([-0.0170, -0.0228, -0.0245, -0.0305, -0.0063, 0.0071, 0.0108, -0.0150, -0.0064, -0.0024], device='cuda:0'), grad: tensor([-8.9407e-08, 8.3819e-08, 1.3970e-08, -1.9278e-07, 5.7742e-08, 1.2107e-08, -6.7987e-08, 9.7789e-08, 1.6764e-07, -8.0094e-08], device='cuda:0') 100 0.0001 changing lr epoch 210, time 220.49, cls_loss 0.0012 cls_loss_mapping 0.0032 cls_loss_causal 0.5046 re_mapping 0.0045 re_causal 0.0126 /// teacc 99.08 lr 0.00010000 Epoch 212, weight, value: tensor([[-0.1815, -0.1871, 0.0955, ..., -0.1044, 0.0510, 0.0360], [-0.1161, -0.0449, -0.0767, ..., -0.1488, -0.0927, -0.0309], [ 0.0218, -0.1078, -0.1137, ..., -0.1035, 0.0286, -0.3156], ..., [-0.1755, 0.1358, 0.0079, ..., 0.1489, -0.0415, -0.1255], [-0.1335, -0.1143, 0.1406, ..., -0.1012, -0.1487, 0.1326], [ 0.0179, -0.2476, 0.1288, ..., 0.0626, -0.1853, -0.1006]], device='cuda:0'), grad: tensor([[ 5.1223e-08, 6.9849e-09, 1.1967e-07, ..., 1.7695e-08, 0.0000e+00, -4.6566e-10], [ 3.8650e-08, 5.6345e-08, 1.0477e-07, ..., 3.0734e-08, -5.1223e-09, -4.6566e-08], [ 3.8883e-07, 4.1444e-08, 9.2946e-07, ..., 1.1222e-07, -9.3132e-10, 2.3283e-09], ..., [ 1.3039e-08, -6.2399e-08, 4.6566e-10, ..., -4.1910e-08, 6.5193e-09, 1.2573e-08], [ 7.5437e-08, 1.7695e-08, 1.8766e-07, ..., 3.6787e-08, 9.3132e-10, 8.8476e-09], [-1.0598e-06, 2.3283e-08, -2.5276e-06, ..., -3.0175e-07, 4.6566e-10, -4.1910e-09]], device='cuda:0') Epoch 212, bias, value: tensor([-0.0188, -0.0228, -0.0247, -0.0307, -0.0063, 0.0072, 0.0106, -0.0152, -0.0060, -0.0007], device='cuda:0'), grad: tensor([ 4.9500e-07, 6.0955e-07, 2.4494e-06, 9.9000e-07, 1.0021e-06, 1.3448e-06, 8.3214e-07, 2.3050e-07, 6.9616e-07, -8.6576e-06], device='cuda:0') 100 0.0001 changing lr epoch 211, time 220.59, cls_loss 0.0012 cls_loss_mapping 0.0029 cls_loss_causal 0.5070 re_mapping 0.0047 re_causal 0.0129 /// teacc 99.08 lr 0.00010000 Epoch 213, weight, value: tensor([[-0.1818, -0.1886, 0.0958, ..., -0.1049, 0.0482, 0.0362], [-0.1162, -0.0447, -0.0768, ..., -0.1499, -0.0930, -0.0312], [ 0.0217, -0.1083, -0.1147, ..., -0.1035, 0.0290, -0.3172], ..., [-0.1756, 0.1358, 0.0079, ..., 0.1502, -0.0407, -0.1256], [-0.1341, -0.1149, 0.1411, ..., -0.1034, -0.1489, 0.1333], [ 0.0181, -0.2483, 0.1290, ..., 0.0624, -0.1859, -0.1010]], device='cuda:0'), grad: tensor([[ 8.3819e-09, 3.2596e-09, 1.1753e-06, ..., 9.3924e-07, 4.6566e-10, -5.5879e-09], [ 7.4506e-09, 2.0629e-07, 1.3504e-07, ..., 1.0198e-07, 4.6566e-10, -2.6077e-08], [ 0.0000e+00, 1.1176e-08, 6.8918e-08, ..., 5.3085e-08, -4.6566e-10, 1.9558e-08], ..., [ 6.5193e-09, 1.0729e-06, 3.2876e-07, ..., 2.1560e-07, 1.3970e-09, 4.6566e-08], [ 1.1176e-08, 3.7253e-09, -9.7789e-09, ..., 2.3283e-09, 0.0000e+00, 2.3283e-09], [ 8.8476e-09, 9.5461e-08, -2.1867e-06, ..., -1.7043e-06, 0.0000e+00, 1.3970e-08]], device='cuda:0') Epoch 213, bias, value: tensor([-0.0183, -0.0226, -0.0247, -0.0307, -0.0064, 0.0071, 0.0107, -0.0153, -0.0056, -0.0009], device='cuda:0'), grad: tensor([ 2.2613e-06, 2.8498e-07, 9.2667e-08, -1.8645e-06, 1.3560e-06, -1.2061e-07, -8.2888e-08, 2.0452e-06, 5.6345e-08, -4.0270e-06], device='cuda:0') 100 0.0001 changing lr epoch 212, time 220.41, cls_loss 0.0010 cls_loss_mapping 0.0019 cls_loss_causal 0.4699 re_mapping 0.0045 re_causal 0.0125 /// teacc 99.08 lr 0.00010000 Epoch 214, weight, value: tensor([[-0.1820, -0.1893, 0.0960, ..., -0.1050, 0.0482, 0.0363], [-0.1159, -0.0448, -0.0768, ..., -0.1505, -0.0932, -0.0311], [ 0.0214, -0.1088, -0.1159, ..., -0.1039, 0.0290, -0.3181], ..., [-0.1762, 0.1359, 0.0079, ..., 0.1508, -0.0408, -0.1258], [-0.1343, -0.1156, 0.1422, ..., -0.1042, -0.1489, 0.1339], [ 0.0181, -0.2485, 0.1291, ..., 0.0625, -0.1862, -0.1018]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 2.1886e-08, 5.5879e-09, ..., 1.8626e-08, 0.0000e+00, 3.2596e-09], [ 4.6566e-10, 3.3528e-08, 7.4506e-09, ..., 2.7008e-08, 0.0000e+00, 2.7940e-09], [ 0.0000e+00, 3.3528e-08, 7.9162e-09, ..., 1.8626e-08, 0.0000e+00, 2.3283e-09], ..., [ 9.3132e-10, -1.0291e-07, -1.9092e-08, ..., -7.3574e-08, 0.0000e+00, 2.7940e-09], [ 6.0536e-09, -4.6566e-10, -1.9558e-08, ..., -4.6566e-10, 0.0000e+00, 1.8626e-08], [ 2.3283e-09, 1.2107e-08, -1.3039e-08, ..., -1.2573e-08, 0.0000e+00, 6.0536e-09]], device='cuda:0') Epoch 214, bias, value: tensor([-0.0181, -0.0226, -0.0248, -0.0306, -0.0060, 0.0069, 0.0101, -0.0153, -0.0053, -0.0009], device='cuda:0'), grad: tensor([ 7.3574e-08, 7.3109e-08, 3.8650e-08, 3.2131e-08, 1.1642e-08, 1.2247e-07, -2.9942e-07, -1.4203e-07, 1.1735e-07, -1.3504e-08], device='cuda:0') 100 0.0001 changing lr epoch 213, time 220.43, cls_loss 0.0011 cls_loss_mapping 0.0022 cls_loss_causal 0.4958 re_mapping 0.0045 re_causal 0.0126 /// teacc 99.03 lr 0.00010000 Epoch 215, weight, value: tensor([[-0.1838, -0.1901, 0.0958, ..., -0.1050, 0.0482, 0.0359], [-0.1163, -0.0448, -0.0769, ..., -0.1511, -0.0933, -0.0312], [ 0.0223, -0.1098, -0.1174, ..., -0.1046, 0.0290, -0.3199], ..., [-0.1771, 0.1362, 0.0080, ..., 0.1513, -0.0409, -0.1260], [-0.1339, -0.1158, 0.1443, ..., -0.1032, -0.1489, 0.1361], [ 0.0169, -0.2497, 0.1288, ..., 0.0625, -0.1862, -0.1039]], device='cuda:0'), grad: tensor([[ 1.0710e-08, 1.0710e-08, 1.2573e-08, ..., 3.2131e-08, 0.0000e+00, 9.3132e-10], [ 3.4925e-08, 6.9849e-08, 3.2131e-08, ..., 1.2992e-07, 0.0000e+00, 1.8626e-09], [ 1.2573e-08, 6.2399e-08, 1.4901e-08, ..., 2.1420e-08, 0.0000e+00, 5.1223e-09], ..., [ 2.5611e-08, 6.4727e-08, 5.8208e-08, ..., 9.2201e-08, 0.0000e+00, 2.3283e-09], [ 6.3330e-08, 5.1223e-08, 1.2014e-07, ..., 1.2293e-07, 0.0000e+00, 2.0489e-08], [-7.0315e-08, 2.5611e-08, -5.8953e-07, ..., -1.6624e-07, 0.0000e+00, 4.6566e-09]], device='cuda:0') Epoch 215, bias, value: tensor([-0.0184, -0.0227, -0.0251, -0.0308, -0.0061, 0.0071, 0.0104, -0.0152, -0.0035, -0.0014], device='cuda:0'), grad: tensor([ 1.6904e-07, 1.0002e-06, -5.9605e-08, 6.4587e-07, -2.6561e-06, -3.2783e-07, 6.7241e-07, 5.7882e-07, 7.3528e-07, -7.3435e-07], device='cuda:0') 100 0.0001 changing lr epoch 214, time 220.73, cls_loss 0.0013 cls_loss_mapping 0.0030 cls_loss_causal 0.5174 re_mapping 0.0044 re_causal 0.0124 /// teacc 98.97 lr 0.00010000 Epoch 216, weight, value: tensor([[-0.1842, -0.1905, 0.0960, ..., -0.1052, 0.0470, 0.0359], [-0.1166, -0.0449, -0.0776, ..., -0.1529, -0.0934, -0.0314], [ 0.0221, -0.1120, -0.1182, ..., -0.1067, 0.0293, -0.3207], ..., [-0.1781, 0.1366, 0.0076, ..., 0.1514, -0.0418, -0.1263], [-0.1341, -0.1162, 0.1450, ..., -0.1042, -0.1489, 0.1366], [ 0.0145, -0.2502, 0.1303, ..., 0.0602, -0.1865, -0.1043]], device='cuda:0'), grad: tensor([[ 1.8161e-08, 1.0710e-08, 2.3283e-08, ..., 5.4017e-08, 0.0000e+00, 3.8184e-08], [ 5.1223e-09, 4.3912e-07, 4.7497e-08, ..., 7.1013e-07, 0.0000e+00, 3.2596e-09], [ 2.7940e-09, 8.8429e-07, 8.8476e-09, ..., 1.3709e-06, 0.0000e+00, 5.1223e-09], ..., [ 1.5832e-08, -1.5404e-06, 8.1863e-07, ..., -1.6093e-06, 0.0000e+00, 1.0245e-08], [ 2.2352e-08, 3.3993e-08, 9.7323e-08, ..., 1.2526e-07, 0.0000e+00, 2.5611e-08], [-1.7695e-08, 5.5414e-08, -1.6121e-06, ..., -1.4864e-06, 0.0000e+00, 2.3749e-08]], device='cuda:0') Epoch 216, bias, value: tensor([-0.0183, -0.0230, -0.0252, -0.0307, -0.0034, 0.0071, 0.0103, -0.0151, -0.0034, -0.0035], device='cuda:0'), grad: tensor([ 1.9558e-07, 1.5311e-06, 2.8815e-06, 6.9477e-07, 1.6764e-06, -1.5926e-07, -1.9092e-07, -2.4550e-06, 4.5402e-07, -4.6268e-06], device='cuda:0') 100 0.0001 changing lr epoch 215, time 220.75, cls_loss 0.0011 cls_loss_mapping 0.0029 cls_loss_causal 0.4941 re_mapping 0.0045 re_causal 0.0125 /// teacc 98.97 lr 0.00010000 Epoch 217, weight, value: tensor([[-0.1847, -0.1914, 0.0958, ..., -0.1060, 0.0461, 0.0361], [-0.1169, -0.0450, -0.0777, ..., -0.1554, -0.0936, -0.0314], [ 0.0222, -0.1124, -0.1192, ..., -0.1071, 0.0297, -0.3214], ..., [-0.1791, 0.1368, 0.0075, ..., 0.1520, -0.0428, -0.1265], [-0.1341, -0.1169, 0.1460, ..., -0.1053, -0.1490, 0.1381], [ 0.0148, -0.2505, 0.1309, ..., 0.0606, -0.1866, -0.1052]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 4.6566e-10, 4.2375e-08, ..., 4.3772e-08, 0.0000e+00, -6.0536e-09], [ 5.1223e-09, 3.7253e-09, -1.8766e-07, ..., 2.5798e-07, 0.0000e+00, -2.1933e-07], [ 0.0000e+00, 3.7253e-09, 4.0047e-08, ..., 1.1176e-08, 4.6566e-10, 6.3796e-08], ..., [ 1.8626e-09, 4.1910e-09, 3.4217e-06, ..., 2.6431e-06, 4.6566e-10, 1.0710e-07], [ 1.1642e-08, 1.2107e-08, 8.3353e-08, ..., 4.7637e-07, 0.0000e+00, -2.6915e-07], [-1.5181e-07, 1.3970e-09, -6.0722e-06, ..., -5.7146e-06, 0.0000e+00, 1.4575e-07]], device='cuda:0') Epoch 217, bias, value: tensor([-0.0184, -0.0232, -0.0252, -0.0310, -0.0037, 0.0073, 0.0098, -0.0151, -0.0023, -0.0031], device='cuda:0'), grad: tensor([ 9.4064e-08, -2.9579e-06, 5.4296e-07, 1.2144e-06, 5.4725e-06, 2.1374e-07, 5.2294e-07, 9.3803e-06, 1.1967e-07, -1.4633e-05], device='cuda:0') 100 0.0001 changing lr epoch 216, time 220.31, cls_loss 0.0014 cls_loss_mapping 0.0028 cls_loss_causal 0.5040 re_mapping 0.0046 re_causal 0.0122 /// teacc 99.09 lr 0.00010000 Epoch 218, weight, value: tensor([[-0.1850, -0.1924, 0.0958, ..., -0.1063, 0.0472, 0.0360], [-0.1169, -0.0452, -0.0792, ..., -0.1592, -0.0937, -0.0316], [ 0.0220, -0.1127, -0.1219, ..., -0.1075, 0.0299, -0.3226], ..., [-0.1797, 0.1371, 0.0095, ..., 0.1548, -0.0431, -0.1266], [-0.1346, -0.1189, 0.1461, ..., -0.1110, -0.1492, 0.1387], [ 0.0155, -0.2519, 0.1309, ..., 0.0613, -0.1876, -0.1048]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 2.2352e-08, 2.6077e-08, ..., 9.3132e-10, 0.0000e+00, 1.8626e-09], [ 4.6566e-10, 8.3819e-09, 9.3132e-10, ..., 1.2107e-08, 0.0000e+00, -2.4214e-08], [ 0.0000e+00, 4.5169e-08, 5.4017e-08, ..., 4.7963e-08, 0.0000e+00, 5.5879e-09], ..., [ 3.7253e-09, -3.8650e-08, -2.4680e-08, ..., -3.8650e-08, 0.0000e+00, 1.7695e-08], [ 6.5193e-09, 2.3795e-07, 3.3900e-07, ..., 1.3039e-08, 4.6566e-10, -2.7940e-08], [ 2.7940e-09, 4.1910e-09, -4.3772e-08, ..., -4.1444e-08, 0.0000e+00, 1.2573e-08]], device='cuda:0') Epoch 218, bias, value: tensor([-0.0186, -0.0233, -0.0259, -0.0309, -0.0048, 0.0073, 0.0105, -0.0144, -0.0029, -0.0026], device='cuda:0'), grad: tensor([ 1.7462e-07, -4.7963e-08, 3.0734e-08, -3.0007e-06, -1.3970e-09, 1.0785e-06, 2.4680e-08, 5.9139e-08, 1.7481e-06, -5.5414e-08], device='cuda:0') 100 0.0001 changing lr epoch 217, time 220.42, cls_loss 0.0013 cls_loss_mapping 0.0027 cls_loss_causal 0.4942 re_mapping 0.0043 re_causal 0.0122 /// teacc 99.05 lr 0.00010000 Epoch 219, weight, value: tensor([[-0.1855, -0.1929, 0.0962, ..., -0.1064, 0.0472, 0.0363], [-0.1166, -0.0453, -0.0792, ..., -0.1607, -0.0938, -0.0315], [ 0.0217, -0.1130, -0.1235, ..., -0.1082, 0.0300, -0.3232], ..., [-0.1808, 0.1374, 0.0096, ..., 0.1563, -0.0426, -0.1268], [-0.1350, -0.1196, 0.1467, ..., -0.1120, -0.1493, 0.1397], [ 0.0153, -0.2530, 0.1309, ..., 0.0616, -0.1879, -0.1059]], device='cuda:0'), grad: tensor([[ 9.2667e-08, 6.0536e-09, -1.2107e-08, ..., 8.8476e-09, 0.0000e+00, 3.7625e-07], [ 3.5996e-07, 4.0047e-08, 1.8161e-08, ..., 5.5414e-08, 0.0000e+00, 1.6065e-06], [ 7.4506e-09, 3.7253e-08, 2.4680e-08, ..., 3.6322e-08, -2.7940e-09, 3.4925e-08], ..., [ 5.1223e-09, -1.7788e-07, -4.8429e-08, ..., -2.3562e-07, 1.8626e-09, 2.0489e-08], [ 2.2352e-08, 1.3970e-09, -9.0804e-08, ..., 4.1910e-09, 0.0000e+00, 9.5461e-08], [ 2.9802e-08, 7.0781e-08, -6.5193e-09, ..., 4.4703e-08, 0.0000e+00, 4.0047e-08]], device='cuda:0') Epoch 219, bias, value: tensor([-0.0183, -0.0232, -0.0259, -0.0308, -0.0052, 0.0073, 0.0102, -0.0143, -0.0027, -0.0026], device='cuda:0'), grad: tensor([ 1.2480e-06, 5.1484e-06, -4.8755e-07, 7.4646e-07, 5.0664e-07, 1.3970e-06, -8.7470e-06, -3.6787e-07, 3.5809e-07, 2.1886e-07], device='cuda:0') 100 0.0001 changing lr epoch 218, time 219.89, cls_loss 0.0010 cls_loss_mapping 0.0018 cls_loss_causal 0.4738 re_mapping 0.0044 re_causal 0.0119 /// teacc 99.11 lr 0.00010000 Epoch 220, weight, value: tensor([[-0.1859, -0.1933, 0.0955, ..., -0.1072, 0.0472, 0.0365], [-0.1168, -0.0454, -0.0795, ..., -0.1616, -0.0939, -0.0314], [ 0.0217, -0.1132, -0.1243, ..., -0.1084, 0.0300, -0.3237], ..., [-0.1820, 0.1376, 0.0098, ..., 0.1568, -0.0424, -0.1270], [-0.1358, -0.1200, 0.1473, ..., -0.1124, -0.1493, 0.1381], [ 0.0156, -0.2533, 0.1315, ..., 0.0617, -0.1882, -0.1062]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 6.9849e-09, -7.9162e-09, ..., 6.5193e-09, 2.0955e-08, 2.1029e-06], [ 1.3970e-09, 6.9384e-08, 3.6322e-08, ..., 6.1933e-08, 1.3644e-07, 3.7253e-08], [ 4.6566e-10, 3.0734e-08, 8.3819e-09, ..., 7.9162e-09, 2.4680e-08, 3.0734e-08], ..., [ 1.8626e-09, -1.6019e-07, -9.9186e-08, ..., -1.7183e-07, 2.8405e-08, 3.4459e-08], [ 8.4285e-08, 4.7963e-08, 2.9663e-07, ..., 1.9325e-07, 4.0978e-08, 3.7579e-07], [-6.2864e-08, 1.0803e-07, -2.8173e-07, ..., -1.1409e-07, 2.7008e-08, 4.6100e-08]], device='cuda:0') Epoch 220, bias, value: tensor([-0.0188, -0.0233, -0.0259, -0.0307, -0.0053, 0.0072, 0.0112, -0.0143, -0.0040, -0.0024], device='cuda:0'), grad: tensor([ 1.0267e-05, 1.6084e-06, 1.0571e-07, -1.0757e-07, -2.6338e-06, 1.4864e-06, -1.3664e-05, -4.6100e-08, 2.8424e-06, 1.4948e-07], device='cuda:0') 100 0.0001 changing lr epoch 219, time 220.35, cls_loss 0.0012 cls_loss_mapping 0.0020 cls_loss_causal 0.4763 re_mapping 0.0046 re_causal 0.0123 /// teacc 99.08 lr 0.00010000 Epoch 221, weight, value: tensor([[-0.1885, -0.1950, 0.0960, ..., -0.1072, 0.0473, 0.0365], [-0.1171, -0.0455, -0.0798, ..., -0.1626, -0.0941, -0.0311], [ 0.0212, -0.1135, -0.1264, ..., -0.1088, 0.0297, -0.3254], ..., [-0.1837, 0.1378, 0.0098, ..., 0.1568, -0.0429, -0.1273], [-0.1355, -0.1206, 0.1475, ..., -0.1129, -0.1493, 0.1389], [ 0.0159, -0.2538, 0.1324, ..., 0.0619, -0.1890, -0.1064]], device='cuda:0'), grad: tensor([[ 1.1409e-07, 4.1910e-09, 0.0000e+00, ..., 6.9849e-09, 0.0000e+00, 3.3528e-07], [ 7.0781e-08, 3.4925e-08, 9.3132e-09, ..., 2.8871e-08, 0.0000e+00, 9.9186e-08], [ 1.7229e-08, 3.2131e-08, 9.3132e-09, ..., 2.4214e-08, 0.0000e+00, 2.5146e-08], ..., [ 4.1444e-08, -1.1595e-07, -4.8894e-08, ..., -8.5682e-08, 0.0000e+00, 3.9581e-08], [ 6.7521e-07, 9.7789e-09, -4.1910e-09, ..., 1.8626e-09, 0.0000e+00, 6.9337e-07], [ 1.8021e-07, 3.8650e-08, 2.7940e-09, ..., 1.8859e-07, 0.0000e+00, 1.3039e-07]], device='cuda:0') Epoch 221, bias, value: tensor([-0.0182, -0.0233, -0.0260, -0.0311, -0.0052, 0.0072, 0.0109, -0.0144, -0.0035, -0.0020], device='cuda:0'), grad: tensor([ 1.1493e-06, 3.6601e-07, -3.7253e-09, 2.0452e-06, 7.7393e-07, -3.0845e-06, -3.6974e-06, -1.1129e-07, 1.6838e-06, 8.7684e-07], device='cuda:0') 100 0.0001 changing lr epoch 220, time 219.98, cls_loss 0.0014 cls_loss_mapping 0.0027 cls_loss_causal 0.4747 re_mapping 0.0046 re_causal 0.0121 /// teacc 99.10 lr 0.00010000 Epoch 222, weight, value: tensor([[-0.1907, -0.1962, 0.0963, ..., -0.1074, 0.0473, 0.0363], [-0.1176, -0.0457, -0.0801, ..., -0.1640, -0.0943, -0.0308], [ 0.0210, -0.1144, -0.1292, ..., -0.1099, 0.0298, -0.3272], ..., [-0.1845, 0.1381, 0.0100, ..., 0.1575, -0.0432, -0.1276], [-0.1359, -0.1218, 0.1477, ..., -0.1136, -0.1493, 0.1394], [ 0.0156, -0.2553, 0.1326, ..., 0.0618, -0.1893, -0.1069]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 4.6566e-10, -6.5193e-09, ..., 1.4901e-08, 0.0000e+00, -2.7940e-09], [ 2.3283e-09, 3.6322e-08, 2.5611e-08, ..., 3.0641e-07, 0.0000e+00, 8.3819e-09], [ 0.0000e+00, 4.6566e-09, 2.7940e-09, ..., 1.6764e-08, 0.0000e+00, 1.8626e-09], ..., [ 7.4506e-09, -3.3993e-08, -1.3970e-08, ..., 3.3062e-08, 0.0000e+00, 5.5879e-09], [ 2.7940e-09, 9.3132e-09, -2.2212e-07, ..., -8.7544e-08, 0.0000e+00, -2.1653e-07], [ 1.1502e-07, 4.6566e-09, 1.9651e-07, ..., 1.1967e-06, 0.0000e+00, 2.0629e-07]], device='cuda:0') Epoch 222, bias, value: tensor([-0.0182, -0.0234, -0.0261, -0.0310, -0.0049, 0.0072, 0.0105, -0.0144, -0.0025, -0.0023], device='cuda:0'), grad: tensor([ 5.1688e-08, 1.8226e-06, -9.8720e-07, -8.8476e-09, -7.0482e-06, -1.3039e-08, 4.0699e-07, 1.4715e-07, 4.6054e-07, 5.1856e-06], device='cuda:0') 100 0.0001 changing lr epoch 221, time 220.05, cls_loss 0.0010 cls_loss_mapping 0.0021 cls_loss_causal 0.4874 re_mapping 0.0043 re_causal 0.0121 /// teacc 99.08 lr 0.00010000 Epoch 223, weight, value: tensor([[-0.1909, -0.1967, 0.0964, ..., -0.1075, 0.0472, 0.0366], [-0.1175, -0.0458, -0.0801, ..., -0.1654, -0.0944, -0.0307], [ 0.0212, -0.1154, -0.1300, ..., -0.1115, 0.0297, -0.3276], ..., [-0.1850, 0.1385, 0.0099, ..., 0.1580, -0.0421, -0.1279], [-0.1362, -0.1225, 0.1474, ..., -0.1140, -0.1494, 0.1395], [ 0.0156, -0.2555, 0.1330, ..., 0.0618, -0.1896, -0.1071]], device='cuda:0'), grad: tensor([[ 4.6566e-09, 6.9849e-09, -3.2131e-08, ..., 7.4506e-09, 4.6566e-10, -1.9558e-08], [ 1.3970e-09, 4.6100e-08, 1.0710e-08, ..., 2.3283e-08, 0.0000e+00, 2.7940e-09], [ 4.6566e-10, 5.0291e-08, 1.6298e-08, ..., 3.3993e-08, -1.3970e-09, 1.5832e-08], ..., [ 3.7253e-09, -6.7055e-08, -9.3132e-09, ..., -5.9605e-08, 4.6566e-10, 1.3504e-08], [ 5.5879e-09, 9.0338e-08, -1.4901e-08, ..., 9.3132e-10, 4.6566e-10, -1.5367e-08], [ 5.5879e-09, 1.4901e-08, -7.4506e-09, ..., 1.0710e-08, 0.0000e+00, 1.9092e-08]], device='cuda:0') Epoch 223, bias, value: tensor([-0.0182, -0.0236, -0.0265, -0.0310, -0.0047, 0.0071, 0.0107, -0.0140, -0.0028, -0.0024], device='cuda:0'), grad: tensor([-4.1910e-08, 1.2852e-07, 1.5087e-07, -7.5921e-06, -1.4156e-07, 6.8061e-06, 3.9814e-07, -6.9849e-08, 2.3143e-07, 1.4529e-07], device='cuda:0') 100 0.0001 changing lr epoch 222, time 220.28, cls_loss 0.0010 cls_loss_mapping 0.0021 cls_loss_causal 0.5020 re_mapping 0.0044 re_causal 0.0127 /// teacc 99.07 lr 0.00010000 Epoch 224, weight, value: tensor([[-0.1912, -0.1973, 0.0963, ..., -0.1079, 0.0471, 0.0348], [-0.1176, -0.0459, -0.0802, ..., -0.1662, -0.0936, -0.0308], [ 0.0215, -0.1160, -0.1305, ..., -0.1123, 0.0299, -0.3281], ..., [-0.1856, 0.1387, 0.0098, ..., 0.1585, -0.0438, -0.1282], [-0.1361, -0.1234, 0.1474, ..., -0.1144, -0.1495, 0.1412], [ 0.0157, -0.2563, 0.1335, ..., 0.0618, -0.1899, -0.1073]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 1.3970e-09, -1.1176e-08, ..., 1.3970e-09, 0.0000e+00, 9.7789e-09], [ 1.3970e-09, 8.3353e-08, -2.3982e-07, ..., 7.9162e-08, 1.9558e-08, -6.8732e-07], [ 9.3132e-10, 1.6764e-08, 6.9849e-09, ..., 1.4435e-08, 4.1910e-09, 1.0710e-08], ..., [ 6.9849e-09, -8.9873e-08, 8.6613e-08, ..., -1.0105e-07, -3.3062e-08, 2.6356e-07], [ 1.3970e-09, 9.4064e-08, 1.6717e-07, ..., 2.3283e-09, 4.6566e-10, 3.7299e-07], [ 1.8626e-08, 2.1420e-08, -1.1176e-08, ..., 3.5856e-08, 9.3132e-10, 3.3528e-08]], device='cuda:0') Epoch 224, bias, value: tensor([-0.0193, -0.0236, -0.0266, -0.0309, -0.0047, 0.0067, 0.0110, -0.0141, -0.0017, -0.0023], device='cuda:0'), grad: tensor([ 5.7230e-07, -3.2596e-09, -4.5486e-06, 1.7555e-07, -1.6391e-07, -6.1467e-08, 7.9162e-08, 1.5236e-06, 1.4007e-06, 1.0021e-06], device='cuda:0') 100 0.0001 changing lr epoch 223, time 220.42, cls_loss 0.0010 cls_loss_mapping 0.0017 cls_loss_causal 0.5024 re_mapping 0.0043 re_causal 0.0121 /// teacc 99.11 lr 0.00010000 Epoch 225, weight, value: tensor([[-0.1917, -0.1984, 0.0965, ..., -0.1080, 0.0471, 0.0349], [-0.1173, -0.0461, -0.0804, ..., -0.1672, -0.0938, -0.0306], [ 0.0214, -0.1163, -0.1314, ..., -0.1125, 0.0301, -0.3287], ..., [-0.1863, 0.1389, 0.0099, ..., 0.1592, -0.0433, -0.1285], [-0.1364, -0.1241, 0.1482, ..., -0.1141, -0.1496, 0.1417], [ 0.0159, -0.2570, 0.1340, ..., 0.0620, -0.1902, -0.1075]], device='cuda:0'), grad: tensor([[ 6.9849e-09, 1.3970e-09, -3.1665e-07, ..., 1.5832e-08, -4.6566e-10, -3.7532e-07], [ 4.6566e-10, 2.5611e-08, 1.7229e-08, ..., 4.7032e-08, 0.0000e+00, -3.9116e-07], [ 0.0000e+00, 3.3062e-08, 2.4214e-08, ..., 3.9116e-08, 0.0000e+00, 2.3283e-08], ..., [ 4.1910e-09, -2.0210e-07, -5.7276e-08, ..., -1.9837e-07, 0.0000e+00, 2.1048e-07], [ 8.8476e-09, 1.4901e-08, 2.1746e-07, ..., 1.3970e-08, 0.0000e+00, 2.8266e-07], [-2.3749e-08, 1.1222e-07, 2.9802e-08, ..., 3.5912e-06, 0.0000e+00, 1.8161e-08]], device='cuda:0') Epoch 225, bias, value: tensor([-0.0193, -0.0236, -0.0266, -0.0307, -0.0049, 0.0066, 0.0107, -0.0141, -0.0014, -0.0022], device='cuda:0'), grad: tensor([-9.6485e-07, -4.3213e-06, 2.5099e-07, 5.5879e-08, -5.9456e-06, -1.2247e-06, 1.5851e-06, 2.0452e-06, 8.0932e-07, 7.6815e-06], device='cuda:0') 100 0.0001 changing lr epoch 224, time 221.00, cls_loss 0.0011 cls_loss_mapping 0.0023 cls_loss_causal 0.4924 re_mapping 0.0043 re_causal 0.0122 /// teacc 99.05 lr 0.00010000 Epoch 226, weight, value: tensor([[-0.1923, -0.1990, 0.0968, ..., -0.1081, 0.0470, 0.0352], [-0.1180, -0.0464, -0.0805, ..., -0.1694, -0.0939, -0.0297], [ 0.0220, -0.1170, -0.1319, ..., -0.1133, 0.0302, -0.3297], ..., [-0.1868, 0.1394, 0.0100, ..., 0.1608, -0.0435, -0.1291], [-0.1366, -0.1246, 0.1483, ..., -0.1146, -0.1496, 0.1418], [ 0.0161, -0.2576, 0.1342, ..., 0.0620, -0.1904, -0.1077]], device='cuda:0'), grad: tensor([[ 2.3283e-09, 0.0000e+00, -4.1910e-09, ..., 2.7940e-09, 4.6566e-10, -7.9162e-09], [ 3.2596e-09, 1.8626e-09, 1.9558e-08, ..., 1.3039e-08, 1.3970e-09, -2.0023e-08], [ 0.0000e+00, 4.6566e-10, 3.3528e-08, ..., 0.0000e+00, -4.0513e-08, 1.9185e-07], ..., [ 5.5879e-09, 1.8626e-09, 2.7008e-08, ..., 1.8161e-08, 3.8184e-08, 2.1886e-08], [ 5.1223e-09, 4.6566e-10, -3.3993e-08, ..., 5.1223e-09, 9.3132e-10, -2.3050e-07], [-9.7789e-09, 4.6566e-10, -1.0477e-07, ..., -6.9849e-08, 0.0000e+00, 8.8476e-09]], device='cuda:0') Epoch 226, bias, value: tensor([-0.0192, -0.0237, -0.0259, -0.0309, -0.0051, 0.0068, 0.0106, -0.0140, -0.0018, -0.0022], device='cuda:0'), grad: tensor([ 2.6543e-08, -5.5879e-09, -1.8347e-06, 2.4959e-07, 3.2596e-09, -4.2841e-08, 1.7509e-07, 3.8743e-07, 1.2610e-06, -1.9977e-07], device='cuda:0') 100 0.0001 changing lr epoch 225, time 220.00, cls_loss 0.0012 cls_loss_mapping 0.0028 cls_loss_causal 0.5211 re_mapping 0.0040 re_causal 0.0119 /// teacc 99.01 lr 0.00010000 Epoch 227, weight, value: tensor([[-0.1925, -0.1997, 0.0971, ..., -0.1085, 0.0466, 0.0357], [-0.1185, -0.0465, -0.0807, ..., -0.1701, -0.0940, -0.0301], [ 0.0216, -0.1178, -0.1325, ..., -0.1140, 0.0312, -0.3312], ..., [-0.1874, 0.1396, 0.0101, ..., 0.1616, -0.0464, -0.1295], [-0.1368, -0.1262, 0.1485, ..., -0.1148, -0.1497, 0.1415], [ 0.0172, -0.2584, 0.1348, ..., 0.0623, -0.1906, -0.1081]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 1.9558e-08, 6.7567e-07, ..., 1.0245e-08, 0.0000e+00, 8.2981e-07], [ 1.3970e-09, 8.7544e-08, 3.3993e-08, ..., 6.5193e-09, 0.0000e+00, -3.1665e-08], [ 0.0000e+00, 4.2375e-08, 1.0245e-07, ..., 9.3132e-10, 0.0000e+00, 1.2387e-07], ..., [ 9.3132e-10, 4.0047e-08, 9.7789e-09, ..., -3.7253e-09, 0.0000e+00, 1.8626e-08], [ 2.3283e-09, 7.0315e-08, -1.0170e-06, ..., 4.6566e-10, 0.0000e+00, -1.2685e-06], [-1.9558e-08, 2.0489e-08, 1.4994e-07, ..., -5.8208e-08, 0.0000e+00, 2.8871e-07]], device='cuda:0') Epoch 227, bias, value: tensor([-0.0190, -0.0239, -0.0259, -0.0301, -0.0055, 0.0065, 0.0102, -0.0139, -0.0023, -0.0019], device='cuda:0'), grad: tensor([ 1.5870e-06, 1.2526e-07, -2.0210e-07, -6.1281e-07, 1.7229e-07, 1.0477e-07, -7.1712e-08, 1.4622e-07, -1.6391e-06, 3.8836e-07], device='cuda:0') 100 0.0001 changing lr epoch 226, time 220.54, cls_loss 0.0011 cls_loss_mapping 0.0024 cls_loss_causal 0.5220 re_mapping 0.0042 re_causal 0.0125 /// teacc 99.06 lr 0.00010000 Epoch 228, weight, value: tensor([[-0.1926, -0.2009, 0.0977, ..., -0.1087, 0.0465, 0.0361], [-0.1191, -0.0456, -0.0798, ..., -0.1719, -0.0944, -0.0294], [ 0.0217, -0.1183, -0.1314, ..., -0.1147, 0.0312, -0.3316], ..., [-0.1893, 0.1390, 0.0094, ..., 0.1631, -0.0452, -0.1288], [-0.1373, -0.1278, 0.1476, ..., -0.1176, -0.1499, 0.1410], [ 0.0170, -0.2592, 0.1349, ..., 0.0622, -0.1907, -0.1093]], device='cuda:0'), grad: tensor([[ 6.0536e-09, 1.7229e-08, 3.7253e-09, ..., 1.3504e-08, 0.0000e+00, 1.2107e-08], [ 9.3132e-10, 1.6205e-07, 3.7719e-08, ..., 1.3271e-07, 0.0000e+00, -2.0955e-08], [ 0.0000e+00, 1.0896e-07, 1.3970e-08, ..., 1.0105e-07, 0.0000e+00, 9.3132e-09], ..., [ 1.8626e-09, -3.7206e-07, -5.7742e-08, ..., -3.3481e-07, 0.0000e+00, 2.0489e-08], [ 6.0536e-09, 1.0245e-07, 1.7695e-08, ..., 4.1444e-08, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 1.2806e-07, -3.6787e-08, ..., -4.6566e-09, 0.0000e+00, 5.1223e-09]], device='cuda:0') Epoch 228, bias, value: tensor([-0.0186, -0.0229, -0.0257, -0.0302, -0.0054, 0.0068, 0.0100, -0.0148, -0.0034, -0.0021], device='cuda:0'), grad: tensor([ 1.0477e-07, 3.1153e-07, 8.9873e-08, -4.9919e-07, 6.4727e-08, 2.2445e-07, -7.6368e-08, -6.4494e-07, 2.6962e-07, 1.6578e-07], device='cuda:0') 100 0.0001 changing lr epoch 227, time 220.18, cls_loss 0.0013 cls_loss_mapping 0.0024 cls_loss_causal 0.5015 re_mapping 0.0043 re_causal 0.0122 /// teacc 99.09 lr 0.00010000 Epoch 229, weight, value: tensor([[-0.1927, -0.2022, 0.0979, ..., -0.1089, 0.0465, 0.0333], [-0.1196, -0.0465, -0.0816, ..., -0.1729, -0.0944, -0.0313], [ 0.0219, -0.1198, -0.1332, ..., -0.1154, 0.0313, -0.3320], ..., [-0.1899, 0.1400, 0.0110, ..., 0.1638, -0.0454, -0.1272], [-0.1379, -0.1288, 0.1475, ..., -0.1186, -0.1499, 0.1408], [ 0.0171, -0.2599, 0.1352, ..., 0.0623, -0.1909, -0.1095]], device='cuda:0'), grad: tensor([[ 1.8161e-08, 9.3132e-10, -4.3958e-07, ..., 9.3132e-10, 4.6566e-10, -4.0838e-07], [ 7.9162e-09, 6.0536e-09, 6.0536e-09, ..., 5.1223e-09, 4.6566e-10, -5.5879e-08], [ 4.6566e-09, 1.3970e-09, 3.4319e-07, ..., 4.6566e-10, -5.4482e-08, 3.2783e-07], ..., [ 1.5367e-08, 4.6566e-10, 2.9337e-08, ..., 1.6764e-08, 4.6566e-10, 4.4238e-08], [ 7.1712e-08, 4.1910e-09, 2.0023e-08, ..., 8.3819e-09, 0.0000e+00, 8.0094e-08], [ 2.5099e-07, 2.3283e-09, 5.1223e-09, ..., -2.2352e-08, 0.0000e+00, 2.6217e-07]], device='cuda:0') Epoch 229, bias, value: tensor([-0.0206, -0.0238, -0.0251, -0.0297, -0.0053, 0.0061, 0.0130, -0.0143, -0.0045, -0.0021], device='cuda:0'), grad: tensor([-1.7285e-06, -1.4389e-07, -2.3236e-07, 8.2422e-08, 1.5693e-06, -1.6363e-06, 1.0338e-06, 2.1979e-07, 2.3749e-07, 5.9837e-07], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 228---------------------------------------------------- epoch 228, time 220.68, cls_loss 0.0010 cls_loss_mapping 0.0015 cls_loss_causal 0.4948 re_mapping 0.0046 re_causal 0.0124 /// teacc 99.16 lr 0.00010000 Epoch 230, weight, value: tensor([[-0.1932, -0.2027, 0.0976, ..., -0.1092, 0.0464, 0.0334], [-0.1200, -0.0466, -0.0817, ..., -0.1735, -0.0943, -0.0315], [ 0.0216, -0.1197, -0.1337, ..., -0.1151, 0.0313, -0.3324], ..., [-0.1949, 0.1400, 0.0110, ..., 0.1636, -0.0455, -0.1277], [-0.1389, -0.1294, 0.1478, ..., -0.1194, -0.1500, 0.1406], [ 0.0170, -0.2606, 0.1358, ..., 0.0625, -0.1910, -0.1102]], device='cuda:0'), grad: tensor([[ 1.0245e-08, -1.0245e-08, -9.7323e-08, ..., 3.7253e-09, 0.0000e+00, -5.2107e-07], [ 5.1223e-09, 6.0536e-09, 4.6566e-09, ..., 9.3132e-09, 0.0000e+00, 1.0245e-08], [ 4.6566e-10, 7.9162e-09, 2.3283e-08, ..., 8.8476e-09, 0.0000e+00, 2.7008e-08], ..., [ 5.5879e-09, -7.9162e-09, 8.8476e-09, ..., 0.0000e+00, 0.0000e+00, 1.4901e-08], [ 7.9162e-09, 7.5903e-08, 7.9162e-09, ..., 2.7474e-08, 0.0000e+00, 3.7253e-09], [ 2.2817e-08, 1.3039e-08, -7.7300e-08, ..., -5.4482e-08, 0.0000e+00, 3.2596e-08]], device='cuda:0') Epoch 230, bias, value: tensor([-0.0207, -0.0238, -0.0248, -0.0299, -0.0052, 0.0066, 0.0129, -0.0146, -0.0049, -0.0020], device='cuda:0'), grad: tensor([-6.0117e-07, 6.6590e-08, 1.0896e-07, 3.7253e-08, 3.5763e-07, -3.3854e-07, 7.9628e-08, 5.8673e-08, 2.1653e-07, 1.4435e-08], device='cuda:0') 100 0.0001 changing lr epoch 229, time 220.43, cls_loss 0.0010 cls_loss_mapping 0.0021 cls_loss_causal 0.5204 re_mapping 0.0043 re_causal 0.0126 /// teacc 99.15 lr 0.00010000 Epoch 231, weight, value: tensor([[-0.1931, -0.2032, 0.0983, ..., -0.1090, 0.0464, 0.0343], [-0.1194, -0.0467, -0.0819, ..., -0.1745, -0.0944, -0.0312], [ 0.0220, -0.1202, -0.1344, ..., -0.1157, 0.0313, -0.3330], ..., [-0.1955, 0.1404, 0.0114, ..., 0.1646, -0.0456, -0.1277], [-0.1395, -0.1298, 0.1476, ..., -0.1201, -0.1500, 0.1409], [ 0.0168, -0.2612, 0.1358, ..., 0.0624, -0.1914, -0.1108]], device='cuda:0'), grad: tensor([[ 2.1886e-08, 7.5903e-08, 7.9162e-09, ..., 4.1910e-09, 0.0000e+00, 2.8405e-08], [ 3.4785e-07, 8.3819e-08, 2.9802e-08, ..., 1.1642e-08, 0.0000e+00, 5.4576e-07], [-1.1176e-08, 8.8010e-08, 5.2154e-08, ..., 4.6566e-09, 0.0000e+00, 3.6787e-08], ..., [ 7.2177e-08, 1.1688e-07, 1.9558e-08, ..., -1.5367e-08, 0.0000e+00, 7.0315e-08], [ 1.2871e-06, 1.0384e-07, -1.1269e-07, ..., 1.1642e-08, 0.0000e+00, 1.8924e-06], [ 7.2224e-07, 4.9779e-07, 7.9162e-09, ..., -1.2387e-07, 0.0000e+00, 6.4448e-07]], device='cuda:0') Epoch 231, bias, value: tensor([-0.0197, -0.0238, -0.0249, -0.0300, -0.0052, 0.0066, 0.0124, -0.0144, -0.0050, -0.0021], device='cuda:0'), grad: tensor([ 3.6508e-07, 1.3243e-06, -3.7253e-09, -3.6154e-06, 7.4925e-07, -7.8157e-06, 8.8057e-07, 6.7055e-07, 3.7700e-06, 3.6787e-06], device='cuda:0') 100 0.0001 changing lr epoch 230, time 220.56, cls_loss 0.0008 cls_loss_mapping 0.0018 cls_loss_causal 0.4989 re_mapping 0.0044 re_causal 0.0124 /// teacc 99.14 lr 0.00010000 Epoch 232, weight, value: tensor([[-0.1931, -0.2033, 0.0989, ..., -0.1088, 0.0462, 0.0341], [-0.1198, -0.0469, -0.0822, ..., -0.1753, -0.0940, -0.0315], [ 0.0220, -0.1201, -0.1351, ..., -0.1156, 0.0316, -0.3335], ..., [-0.1961, 0.1408, 0.0116, ..., 0.1650, -0.0460, -0.1277], [-0.1398, -0.1300, 0.1478, ..., -0.1202, -0.1502, 0.1410], [ 0.0168, -0.2619, 0.1362, ..., 0.0625, -0.1921, -0.1111]], device='cuda:0'), grad: tensor([[ 1.1642e-09, 1.6298e-09, -4.1910e-08, ..., 1.6298e-09, 0.0000e+00, -1.1642e-08], [ 1.1409e-08, 8.6147e-09, 3.7253e-09, ..., 6.7521e-09, -1.1642e-09, 1.4668e-08], [ 2.3283e-10, 1.7928e-08, 3.9581e-09, ..., -1.4901e-08, 2.3283e-10, 3.4925e-09], ..., [ 1.1642e-09, 2.1886e-08, 4.6566e-10, ..., 2.3283e-09, 4.6566e-10, 4.4238e-09], [ 1.2573e-08, 6.9849e-10, -6.0536e-09, ..., 3.4925e-09, 0.0000e+00, -4.6566e-10], [ 1.8626e-09, 4.6566e-09, 1.9092e-08, ..., 6.9849e-09, 4.6566e-10, 1.0943e-08]], device='cuda:0') Epoch 232, bias, value: tensor([-0.0196, -0.0239, -0.0247, -0.0301, -0.0053, 0.0067, 0.0127, -0.0144, -0.0051, -0.0021], device='cuda:0'), grad: tensor([-4.1444e-08, 1.3504e-07, -7.3388e-07, 2.7241e-08, 7.3109e-08, -5.9372e-08, 7.7998e-08, 3.3388e-07, 7.9861e-08, 1.2270e-07], device='cuda:0') 100 0.0001 changing lr epoch 231, time 220.56, cls_loss 0.0011 cls_loss_mapping 0.0021 cls_loss_causal 0.4844 re_mapping 0.0043 re_causal 0.0120 /// teacc 99.11 lr 0.00010000 Epoch 233, weight, value: tensor([[-0.1924, -0.2036, 0.1000, ..., -0.1088, 0.0476, 0.0344], [-0.1206, -0.0482, -0.0837, ..., -0.1783, -0.0949, -0.0324], [ 0.0224, -0.1201, -0.1363, ..., -0.1158, 0.0317, -0.3346], ..., [-0.1964, 0.1423, 0.0133, ..., 0.1679, -0.0463, -0.1265], [-0.1403, -0.1302, 0.1478, ..., -0.1205, -0.1502, 0.1409], [ 0.0168, -0.2631, 0.1361, ..., 0.0625, -0.1949, -0.1116]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 2.7940e-09, -2.3283e-09, ..., 2.7940e-09, 4.6566e-10, 2.3283e-10], [ 2.3283e-10, 1.0710e-08, 2.0955e-09, ..., 3.2363e-08, 8.6147e-09, -1.4668e-08], [ 0.0000e+00, 2.4913e-08, 6.7521e-09, ..., 1.3737e-08, 0.0000e+00, 2.5611e-09], ..., [ 1.3970e-09, -6.9151e-08, -1.3039e-08, ..., -2.6543e-08, 5.5879e-09, 4.1910e-09], [ 3.7253e-09, 2.3283e-09, 2.0955e-09, ..., 7.2876e-08, 2.2817e-08, 6.9849e-09], [-4.6566e-10, 7.2177e-09, -4.8894e-09, ..., 2.0722e-08, 6.0536e-09, 4.1910e-09]], device='cuda:0') Epoch 233, bias, value: tensor([-0.0188, -0.0249, -0.0245, -0.0306, -0.0049, 0.0067, 0.0124, -0.0134, -0.0055, -0.0022], device='cuda:0'), grad: tensor([ 2.2119e-08, 1.6484e-07, 4.5169e-08, 2.5146e-08, -1.1446e-06, 2.6776e-08, -1.1409e-08, 6.3330e-08, 6.2678e-07, 1.7812e-07], device='cuda:0') 100 0.0001 changing lr epoch 232, time 220.37, cls_loss 0.0010 cls_loss_mapping 0.0028 cls_loss_causal 0.5050 re_mapping 0.0043 re_causal 0.0121 /// teacc 99.09 lr 0.00010000 Epoch 234, weight, value: tensor([[-0.1919, -0.2045, 0.1010, ..., -0.1089, 0.0474, 0.0344], [-0.1208, -0.0485, -0.0840, ..., -0.1788, -0.0931, -0.0315], [ 0.0225, -0.1203, -0.1370, ..., -0.1159, 0.0314, -0.3357], ..., [-0.1967, 0.1426, 0.0134, ..., 0.1682, -0.0467, -0.1265], [-0.1411, -0.1307, 0.1477, ..., -0.1207, -0.1502, 0.1406], [ 0.0168, -0.2636, 0.1364, ..., 0.0625, -0.1951, -0.1122]], device='cuda:0'), grad: tensor([[ 3.0268e-09, 2.2352e-08, 2.3283e-10, ..., 1.7928e-08, 0.0000e+00, -3.7253e-09], [ 6.7521e-09, 9.9884e-08, 3.2829e-08, ..., 7.2410e-08, 0.0000e+00, 0.0000e+00], [-4.6566e-10, 7.3388e-07, 6.7521e-09, ..., 1.0477e-08, 0.0000e+00, 6.7521e-09], ..., [ 6.9849e-09, 7.1479e-08, 7.6834e-09, ..., 7.2177e-08, 0.0000e+00, 3.7253e-09], [ 2.0023e-08, 4.4238e-08, 7.2177e-09, ..., 3.0268e-08, 0.0000e+00, 1.1176e-08], [-6.8452e-08, 3.3528e-08, -2.3353e-07, ..., 2.8890e-06, 0.0000e+00, 4.6566e-09]], device='cuda:0') Epoch 234, bias, value: tensor([-0.0182, -0.0249, -0.0239, -0.0307, -0.0048, 0.0063, 0.0130, -0.0135, -0.0063, -0.0024], device='cuda:0'), grad: tensor([ 1.3807e-07, 8.2236e-07, 1.7807e-06, -5.8599e-06, -1.1623e-05, 2.2389e-06, 7.7998e-07, 6.9616e-07, 2.7148e-07, 1.0736e-05], device='cuda:0') 100 0.0001 changing lr epoch 233, time 220.09, cls_loss 0.0010 cls_loss_mapping 0.0022 cls_loss_causal 0.4984 re_mapping 0.0043 re_causal 0.0123 /// teacc 99.02 lr 0.00010000 Epoch 235, weight, value: tensor([[-0.1921, -0.2056, 0.1019, ..., -0.1090, 0.0474, 0.0345], [-0.1214, -0.0486, -0.0842, ..., -0.1790, -0.0929, -0.0316], [ 0.0221, -0.1206, -0.1378, ..., -0.1162, 0.0314, -0.3362], ..., [-0.1981, 0.1427, 0.0134, ..., 0.1684, -0.0469, -0.1266], [-0.1415, -0.1310, 0.1482, ..., -0.1210, -0.1503, 0.1409], [ 0.0168, -0.2639, 0.1365, ..., 0.0625, -0.1953, -0.1134]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 3.9581e-09, 1.6531e-08, ..., 2.3283e-10, 0.0000e+00, -5.7742e-08], [ 1.6298e-09, 2.8173e-08, 9.0804e-09, ..., 1.2573e-08, 0.0000e+00, -6.2864e-09], [ 1.3970e-09, 1.6065e-08, -8.6240e-07, ..., 1.6298e-09, -4.6566e-10, 6.0536e-09], ..., [ 2.2585e-08, 1.7975e-07, 1.5600e-08, ..., -1.1409e-08, 4.6566e-10, 4.4238e-09], [ 4.4238e-09, 2.7474e-08, -5.1316e-07, ..., 1.3970e-09, 0.0000e+00, -3.1702e-06], [-3.4925e-09, 3.7253e-09, -7.6834e-09, ..., -8.6147e-09, 0.0000e+00, 4.4238e-09]], device='cuda:0') Epoch 235, bias, value: tensor([-0.0179, -0.0240, -0.0254, -0.0305, -0.0047, 0.0062, 0.0131, -0.0135, -0.0063, -0.0027], device='cuda:0'), grad: tensor([ 1.4598e-07, -7.7765e-08, -4.4778e-06, -2.7497e-07, 3.7951e-08, 3.2689e-07, 6.7204e-06, 4.3376e-07, -2.8424e-06, 1.1409e-08], device='cuda:0') 100 0.0001 changing lr epoch 234, time 220.59, cls_loss 0.0009 cls_loss_mapping 0.0025 cls_loss_causal 0.4984 re_mapping 0.0044 re_causal 0.0123 /// teacc 98.97 lr 0.00010000 Epoch 236, weight, value: tensor([[-0.1922, -0.2063, 0.1023, ..., -0.1091, 0.0474, 0.0346], [-0.1225, -0.0487, -0.0844, ..., -0.1793, -0.0929, -0.0318], [ 0.0222, -0.1216, -0.1403, ..., -0.1183, 0.0314, -0.3365], ..., [-0.1988, 0.1431, 0.0137, ..., 0.1692, -0.0468, -0.1269], [-0.1418, -0.1312, 0.1488, ..., -0.1213, -0.1503, 0.1412], [ 0.0169, -0.2657, 0.1367, ..., 0.0622, -0.1954, -0.1137]], device='cuda:0'), grad: tensor([[ 3.2596e-09, 1.1642e-09, 2.0722e-08, ..., 3.6089e-08, 0.0000e+00, 1.2224e-07], [ 2.3586e-07, 4.1910e-09, 1.5879e-06, ..., 3.4925e-09, 0.0000e+00, 2.3060e-06], [ 3.9581e-09, 5.8208e-09, 2.9104e-08, ..., 2.7940e-09, 0.0000e+00, 4.0280e-08], ..., [ 1.7928e-08, -8.3819e-09, 1.1618e-07, ..., -1.3039e-08, 0.0000e+00, 1.7509e-07], [-2.2165e-07, 4.6566e-10, -2.2929e-06, ..., 9.3132e-10, 0.0000e+00, -3.1907e-06], [ 2.7940e-08, 5.1223e-09, 1.9325e-07, ..., 5.3551e-09, 0.0000e+00, 2.8475e-07]], device='cuda:0') Epoch 236, bias, value: tensor([-0.0177, -0.0235, -0.0266, -0.0304, -0.0048, 0.0064, 0.0123, -0.0132, -0.0061, -0.0027], device='cuda:0'), grad: tensor([ 7.2923e-07, 2.5257e-06, 1.0040e-06, 2.5821e-07, 2.1677e-07, 1.8114e-07, -3.6368e-07, 2.4606e-06, -7.6964e-06, 7.0035e-07], device='cuda:0') 100 0.0001 changing lr epoch 235, time 220.87, cls_loss 0.0010 cls_loss_mapping 0.0026 cls_loss_causal 0.5046 re_mapping 0.0041 re_causal 0.0119 /// teacc 99.02 lr 0.00010000 Epoch 237, weight, value: tensor([[-0.1925, -0.2072, 0.1041, ..., -0.1092, 0.0476, 0.0347], [-0.1229, -0.0492, -0.0846, ..., -0.1801, -0.0929, -0.0321], [ 0.0226, -0.1226, -0.1426, ..., -0.1204, 0.0315, -0.3369], ..., [-0.1994, 0.1439, 0.0140, ..., 0.1705, -0.0464, -0.1270], [-0.1433, -0.1316, 0.1497, ..., -0.1218, -0.1503, 0.1412], [ 0.0169, -0.2675, 0.1363, ..., 0.0621, -0.1955, -0.1140]], device='cuda:0'), grad: tensor([[ 5.3551e-09, 2.8638e-08, 8.6147e-09, ..., 2.1886e-08, 0.0000e+00, 1.2107e-08], [ 4.1910e-09, 1.8794e-06, 1.7928e-08, ..., 1.3029e-06, 0.0000e+00, 5.6112e-08], [ 6.9849e-10, 2.1979e-06, 1.8626e-09, ..., 1.5162e-06, 0.0000e+00, 3.7253e-09], ..., [ 1.6065e-08, -4.5858e-06, 5.2853e-08, ..., -3.1460e-06, 0.0000e+00, 6.2864e-09], [ 8.6846e-08, 1.8626e-08, 1.5274e-07, ..., 1.1013e-07, 0.0000e+00, 6.2631e-08], [-6.6124e-08, 8.6846e-08, -3.5646e-07, ..., -1.6182e-07, 0.0000e+00, 1.1176e-08]], device='cuda:0') Epoch 237, bias, value: tensor([-0.0168, -0.0240, -0.0267, -0.0297, -0.0048, 0.0058, 0.0123, -0.0126, -0.0062, -0.0031], device='cuda:0'), grad: tensor([ 2.5355e-07, 6.1989e-06, 4.6752e-06, 6.8359e-07, 5.3179e-07, -1.7579e-07, -3.4403e-06, -9.3207e-06, 1.1427e-06, -5.3737e-07], device='cuda:0') 100 0.0001 changing lr epoch 236, time 220.48, cls_loss 0.0011 cls_loss_mapping 0.0020 cls_loss_causal 0.4970 re_mapping 0.0040 re_causal 0.0115 /// teacc 99.16 lr 0.00010000 Epoch 238, weight, value: tensor([[-0.1926, -0.2083, 0.1044, ..., -0.1095, 0.0475, 0.0350], [-0.1236, -0.0494, -0.0849, ..., -0.1807, -0.0901, -0.0322], [ 0.0223, -0.1229, -0.1432, ..., -0.1207, 0.0304, -0.3375], ..., [-0.1997, 0.1443, 0.0143, ..., 0.1712, -0.0471, -0.1272], [-0.1475, -0.1320, 0.1501, ..., -0.1222, -0.1503, 0.1400], [ 0.0172, -0.2683, 0.1368, ..., 0.0620, -0.1960, -0.1144]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 2.3982e-08, 8.3819e-09, ..., 1.6531e-08, 0.0000e+00, 7.6834e-09], [ 4.6566e-10, 4.0699e-07, 1.3807e-07, ..., 4.0024e-07, 2.3283e-10, 5.5879e-09], [ 2.3283e-10, 2.4494e-07, 2.4214e-08, ..., 5.5647e-08, -9.3132e-10, 1.1176e-08], ..., [ 2.3283e-10, -6.5379e-07, -3.0245e-07, ..., -9.7416e-07, 4.6566e-10, 3.2596e-09], [ 0.0000e+00, 1.3853e-07, -4.0745e-08, ..., 1.1409e-08, 2.3283e-10, -6.1933e-08], [ 2.3283e-10, 2.0186e-07, 6.7521e-08, ..., 2.6426e-07, 0.0000e+00, 6.9849e-10]], device='cuda:0') Epoch 238, bias, value: tensor([-0.0163, -0.0240, -0.0268, -0.0304, -0.0048, 0.0075, 0.0118, -0.0125, -0.0084, -0.0032], device='cuda:0'), grad: tensor([ 1.0594e-07, 1.1530e-06, 6.2585e-07, -2.1681e-06, 3.2014e-07, 9.9838e-07, 1.5879e-07, -2.2072e-06, 1.1805e-07, 9.0245e-07], device='cuda:0') 100 0.0001 changing lr epoch 237, time 220.48, cls_loss 0.0009 cls_loss_mapping 0.0026 cls_loss_causal 0.5089 re_mapping 0.0041 re_causal 0.0120 /// teacc 98.99 lr 0.00010000 Epoch 239, weight, value: tensor([[-0.1917, -0.2098, 0.1051, ..., -0.1100, 0.0477, 0.0363], [-0.1239, -0.0495, -0.0850, ..., -0.1808, -0.0901, -0.0325], [ 0.0254, -0.1232, -0.1436, ..., -0.1204, 0.0306, -0.3379], ..., [-0.2002, 0.1444, 0.0143, ..., 0.1715, -0.0473, -0.1275], [-0.1477, -0.1329, 0.1510, ..., -0.1226, -0.1503, 0.1410], [ 0.0157, -0.2698, 0.1373, ..., 0.0620, -0.1960, -0.1147]], device='cuda:0'), grad: tensor([[ 3.2596e-09, 6.9849e-09, -9.9419e-08, ..., 1.5832e-08, -1.0477e-07, -9.5228e-08], [ 3.9581e-09, 2.8638e-08, 7.9162e-09, ..., 1.6205e-07, 6.9849e-09, 1.7462e-08], [ 2.3283e-10, 7.2410e-08, 4.7497e-08, ..., 7.2177e-09, 4.9826e-08, 5.6345e-08], ..., [ 2.3283e-09, 7.0781e-08, 4.8894e-09, ..., 3.4692e-08, 2.0955e-09, 4.6566e-09], [ 5.0990e-08, 3.4925e-09, 3.2596e-09, ..., 2.1118e-07, 3.2596e-09, 2.7381e-07], [ 8.6147e-09, 1.0012e-08, 4.6566e-10, ..., 5.6252e-07, 2.2817e-08, 2.8173e-08]], device='cuda:0') Epoch 239, bias, value: tensor([-0.0150, -0.0240, -0.0261, -0.0302, -0.0048, 0.0072, 0.0109, -0.0125, -0.0079, -0.0040], device='cuda:0'), grad: tensor([-5.5879e-07, 1.0636e-06, 5.2433e-07, -3.3830e-07, -5.6922e-06, 1.8328e-05, -1.8597e-05, 4.0140e-07, 1.6149e-06, 3.2485e-06], device='cuda:0') 100 0.0001 changing lr epoch 238, time 220.56, cls_loss 0.0011 cls_loss_mapping 0.0021 cls_loss_causal 0.5179 re_mapping 0.0041 re_causal 0.0117 /// teacc 99.05 lr 0.00010000 Epoch 240, weight, value: tensor([[-0.1919, -0.2102, 0.1055, ..., -0.1101, 0.0479, 0.0364], [-0.1243, -0.0495, -0.0851, ..., -0.1811, -0.0903, -0.0332], [ 0.0253, -0.1238, -0.1459, ..., -0.1208, 0.0313, -0.3415], ..., [-0.2011, 0.1445, 0.0143, ..., 0.1718, -0.0477, -0.1284], [-0.1486, -0.1332, 0.1542, ..., -0.1232, -0.1508, 0.1426], [ 0.0158, -0.2707, 0.1375, ..., 0.0617, -0.1964, -0.1153]], device='cuda:0'), grad: tensor([[ 8.6147e-09, 3.7253e-09, 1.0361e-07, ..., 5.8208e-08, 6.5193e-09, 2.2585e-08], [ 1.5134e-08, 1.9558e-08, 3.2596e-08, ..., 4.8894e-08, 1.1642e-09, -9.5461e-09], [ 2.5611e-09, 1.0012e-08, 1.2573e-08, ..., 1.2107e-08, -1.5832e-08, 4.8894e-09], ..., [ 2.7008e-08, 8.8476e-09, 1.5018e-07, ..., 1.1642e-07, 2.3283e-09, 2.7707e-08], [ 5.1921e-08, 1.6298e-09, 1.0924e-06, ..., 5.1968e-07, 2.3283e-09, 1.9697e-07], [ 1.1642e-07, 1.2573e-08, -3.6657e-06, ..., -1.1167e-06, 6.9849e-10, -6.3796e-07]], device='cuda:0') Epoch 240, bias, value: tensor([-0.0148, -0.0241, -0.0262, -0.0300, -0.0044, 0.0070, 0.0111, -0.0126, -0.0068, -0.0043], device='cuda:0'), grad: tensor([ 4.2864e-07, 2.6217e-07, -2.8173e-08, 5.1297e-06, -7.2233e-06, 6.4913e-07, 4.4773e-07, 8.8103e-07, 2.8647e-06, -3.4012e-06], device='cuda:0') 100 0.0001 changing lr epoch 239, time 221.02, cls_loss 0.0009 cls_loss_mapping 0.0020 cls_loss_causal 0.4587 re_mapping 0.0043 re_causal 0.0118 /// teacc 99.07 lr 0.00010000 Epoch 241, weight, value: tensor([[-0.1920, -0.2122, 0.1056, ..., -0.1102, 0.0478, 0.0371], [-0.1246, -0.0500, -0.0855, ..., -0.1814, -0.0905, -0.0339], [ 0.0253, -0.1239, -0.1477, ..., -0.1207, 0.0322, -0.3435], ..., [-0.2014, 0.1451, 0.0144, ..., 0.1720, -0.0495, -0.1290], [-0.1490, -0.1333, 0.1562, ..., -0.1241, -0.1510, 0.1435], [ 0.0160, -0.2710, 0.1383, ..., 0.0616, -0.1967, -0.1153]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -1.1642e-09, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 6.9849e-10, 2.5611e-09, ..., 0.0000e+00, -1.8626e-09, -3.2596e-09], [ 0.0000e+00, 1.8626e-09, 4.2841e-08, ..., 0.0000e+00, 1.1642e-09, 4.0513e-08], ..., [ 6.9849e-10, 1.3970e-09, 1.5367e-08, ..., 7.2177e-09, 4.6566e-10, 8.8476e-09], [ 2.3283e-10, 3.2596e-09, -7.4971e-08, ..., 0.0000e+00, 0.0000e+00, -6.7754e-08], [-1.1642e-09, 4.6566e-10, -1.0245e-08, ..., -1.1642e-08, 0.0000e+00, 2.3283e-09]], device='cuda:0') Epoch 241, bias, value: tensor([-0.0143, -0.0245, -0.0261, -0.0301, -0.0041, 0.0069, 0.0107, -0.0124, -0.0063, -0.0044], device='cuda:0'), grad: tensor([ 8.1491e-09, -3.9116e-08, 1.2363e-07, 2.1188e-08, 1.2340e-08, 1.8859e-08, -4.6566e-10, 6.3563e-08, -1.7765e-07, -1.8859e-08], device='cuda:0') 100 0.0001 changing lr epoch 240, time 220.68, cls_loss 0.0008 cls_loss_mapping 0.0022 cls_loss_causal 0.4840 re_mapping 0.0044 re_causal 0.0122 /// teacc 99.09 lr 0.00010000 Epoch 242, weight, value: tensor([[-0.1922, -0.2132, 0.1058, ..., -0.1103, 0.0478, 0.0371], [-0.1252, -0.0500, -0.0856, ..., -0.1815, -0.0906, -0.0337], [ 0.0253, -0.1239, -0.1473, ..., -0.1208, 0.0326, -0.3435], ..., [-0.2021, 0.1452, 0.0142, ..., 0.1719, -0.0502, -0.1292], [-0.1493, -0.1341, 0.1561, ..., -0.1245, -0.1511, 0.1437], [ 0.0162, -0.2710, 0.1395, ..., 0.0618, -0.1967, -0.1154]], device='cuda:0'), grad: tensor([[ 8.8476e-09, 1.5832e-08, -1.3015e-07, ..., 3.0268e-09, 0.0000e+00, 2.4214e-06], [ 4.9826e-08, 8.6147e-08, 5.8208e-09, ..., 4.1910e-09, 0.0000e+00, 3.0664e-07], [ 3.7253e-09, 6.9151e-07, 1.0012e-08, ..., 1.8626e-09, -4.6566e-10, 2.6776e-08], ..., [ 1.8394e-08, 3.1362e-07, 5.3551e-09, ..., -1.1595e-07, 0.0000e+00, 2.9104e-08], [ 8.2422e-07, 3.5390e-08, -4.4703e-08, ..., 2.3283e-10, 0.0000e+00, 9.2154e-07], [ 1.9325e-08, 2.0256e-08, -6.5193e-09, ..., 6.2864e-09, 0.0000e+00, 3.0268e-08]], device='cuda:0') Epoch 242, bias, value: tensor([-0.0142, -0.0245, -0.0260, -0.0297, -0.0040, 0.0065, 0.0105, -0.0125, -0.0065, -0.0042], device='cuda:0'), grad: tensor([ 1.7077e-05, 2.0638e-06, 1.8394e-06, -3.3285e-06, 6.6962e-07, -1.1390e-06, -1.9938e-05, 9.8534e-07, 1.6233e-06, 1.2456e-07], device='cuda:0') 100 0.0001 changing lr epoch 241, time 220.23, cls_loss 0.0010 cls_loss_mapping 0.0024 cls_loss_causal 0.5155 re_mapping 0.0043 re_causal 0.0121 /// teacc 99.16 lr 0.00010000 Epoch 243, weight, value: tensor([[-0.1928, -0.2141, 0.1061, ..., -0.1107, 0.0476, 0.0371], [-0.1258, -0.0505, -0.0857, ..., -0.1820, -0.0901, -0.0335], [ 0.0252, -0.1244, -0.1473, ..., -0.1212, 0.0328, -0.3439], ..., [-0.2025, 0.1457, 0.0141, ..., 0.1724, -0.0510, -0.1296], [-0.1503, -0.1345, 0.1564, ..., -0.1249, -0.1513, 0.1438], [ 0.0162, -0.2715, 0.1405, ..., 0.0621, -0.1968, -0.1158]], device='cuda:0'), grad: tensor([[-1.3970e-09, 4.6566e-10, -1.7229e-08, ..., 1.1642e-09, 0.0000e+00, -2.2585e-08], [ 0.0000e+00, 2.5611e-09, 1.3970e-09, ..., 3.4925e-09, 2.3283e-10, -4.6566e-08], [ 2.3283e-10, 6.9849e-10, -1.6298e-09, ..., 0.0000e+00, -1.1642e-09, 7.9162e-09], ..., [ 0.0000e+00, 2.3283e-10, 1.0012e-08, ..., 6.5193e-09, 6.9849e-10, 3.0268e-09], [ 4.6566e-10, 3.7253e-09, 2.0955e-09, ..., 2.3283e-10, 0.0000e+00, 2.7707e-08], [ 0.0000e+00, 1.1642e-09, -1.9791e-08, ..., 2.2817e-08, 0.0000e+00, 3.4925e-09]], device='cuda:0') Epoch 243, bias, value: tensor([-0.0141, -0.0248, -0.0259, -0.0299, -0.0042, 0.0066, 0.0108, -0.0122, -0.0069, -0.0040], device='cuda:0'), grad: tensor([ 1.4435e-08, -1.3993e-07, -3.5344e-07, 6.4960e-08, -2.1770e-07, 2.9802e-08, 7.1712e-08, 1.2969e-07, 2.0908e-07, 2.0349e-07], device='cuda:0') 100 0.0001 changing lr epoch 242, time 220.14, cls_loss 0.0009 cls_loss_mapping 0.0022 cls_loss_causal 0.4903 re_mapping 0.0041 re_causal 0.0116 /// teacc 99.06 lr 0.00010000 Epoch 244, weight, value: tensor([[-0.1927, -0.2157, 0.1067, ..., -0.1108, 0.0486, 0.0372], [-0.1261, -0.0506, -0.0858, ..., -0.1821, -0.0905, -0.0334], [ 0.0252, -0.1240, -0.1475, ..., -0.1212, 0.0337, -0.3442], ..., [-0.2029, 0.1458, 0.0142, ..., 0.1724, -0.0520, -0.1298], [-0.1508, -0.1354, 0.1554, ..., -0.1274, -0.1514, 0.1434], [ 0.0163, -0.2721, 0.1418, ..., 0.0624, -0.1975, -0.1157]], device='cuda:0'), grad: tensor([[ 3.9116e-08, 1.3970e-09, -3.9116e-08, ..., 9.3132e-10, 0.0000e+00, -3.7253e-09], [ 1.4901e-08, 1.7695e-08, 1.4901e-08, ..., 1.8626e-08, 0.0000e+00, 1.0710e-08], [ 3.7719e-08, 5.1223e-09, 1.0710e-08, ..., 2.7940e-09, 0.0000e+00, 2.4214e-08], ..., [ 8.5589e-07, -2.7474e-08, -3.2596e-09, ..., -2.5611e-08, 4.6566e-10, 4.0932e-07], [ 1.2619e-07, 6.5193e-09, -4.1910e-09, ..., 3.7253e-09, 0.0000e+00, 3.6787e-08], [ 4.9360e-08, 8.8476e-09, -4.0513e-08, ..., -2.3283e-08, 0.0000e+00, 2.7474e-08]], device='cuda:0') Epoch 244, bias, value: tensor([-0.0141, -0.0249, -0.0255, -0.0299, -0.0042, 0.0066, 0.0112, -0.0124, -0.0077, -0.0039], device='cuda:0'), grad: tensor([ 1.7229e-08, 9.5926e-08, 1.7881e-07, 4.9500e-07, 7.5111e-07, -5.5507e-06, 3.1432e-07, 3.1479e-06, 4.5821e-07, 1.0803e-07], device='cuda:0') 100 0.0001 changing lr epoch 243, time 220.42, cls_loss 0.0009 cls_loss_mapping 0.0022 cls_loss_causal 0.4849 re_mapping 0.0041 re_causal 0.0115 /// teacc 99.04 lr 0.00010000 Epoch 245, weight, value: tensor([[-0.1930, -0.2167, 0.1067, ..., -0.1112, 0.0485, 0.0370], [-0.1264, -0.0507, -0.0860, ..., -0.1823, -0.0900, -0.0335], [ 0.0252, -0.1244, -0.1479, ..., -0.1216, 0.0335, -0.3445], ..., [-0.2033, 0.1460, 0.0141, ..., 0.1727, -0.0519, -0.1300], [-0.1510, -0.1362, 0.1559, ..., -0.1282, -0.1515, 0.1440], [ 0.0163, -0.2728, 0.1429, ..., 0.0627, -0.1977, -0.1166]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 8.3819e-09, -1.2573e-08, ..., 9.3132e-10, 4.6566e-10, 1.3970e-09], [-5.3551e-08, 1.3970e-08, -6.8452e-08, ..., 2.7940e-09, -1.0245e-08, -3.7858e-07], [ 9.3132e-10, 4.6100e-08, 2.7940e-09, ..., 4.6566e-10, 5.5879e-09, 1.5832e-08], ..., [ 1.4435e-08, 4.6566e-09, 2.9337e-08, ..., -2.7940e-09, 1.3970e-09, 1.0058e-07], [ 1.8161e-08, 9.3132e-10, -4.8429e-08, ..., -6.0536e-09, 0.0000e+00, 4.6566e-09], [ 5.1223e-09, 4.6566e-09, 3.2131e-08, ..., 1.8626e-09, 0.0000e+00, 6.1933e-08]], device='cuda:0') Epoch 245, bias, value: tensor([-0.0145, -0.0249, -0.0256, -0.0298, -0.0042, 0.0066, 0.0112, -0.0124, -0.0075, -0.0037], device='cuda:0'), grad: tensor([ 1.4901e-08, -1.3057e-06, 2.1979e-07, -6.7754e-07, 1.9558e-08, 8.9360e-07, 2.2631e-07, 3.9209e-07, 6.9849e-09, 2.0396e-07], device='cuda:0') 100 0.0001 changing lr epoch 244, time 220.04, cls_loss 0.0012 cls_loss_mapping 0.0025 cls_loss_causal 0.4995 re_mapping 0.0043 re_causal 0.0117 /// teacc 99.01 lr 0.00010000 Epoch 246, weight, value: tensor([[-0.1957, -0.2184, 0.1069, ..., -0.1115, 0.0483, 0.0367], [-0.1269, -0.0511, -0.0866, ..., -0.1829, -0.0889, -0.0335], [ 0.0252, -0.1249, -0.1484, ..., -0.1219, 0.0333, -0.3450], ..., [-0.2037, 0.1465, 0.0141, ..., 0.1731, -0.0523, -0.1309], [-0.1523, -0.1372, 0.1571, ..., -0.1287, -0.1516, 0.1437], [ 0.0165, -0.2731, 0.1448, ..., 0.0632, -0.1979, -0.1175]], device='cuda:0'), grad: tensor([[-1.7602e-07, 4.9826e-08, -9.2201e-07, ..., 4.2375e-08, 0.0000e+00, -8.6287e-07], [ 6.0536e-09, 2.3143e-07, 9.4995e-08, ..., 1.7835e-07, 0.0000e+00, -4.0978e-08], [ 3.2596e-09, 2.0349e-07, 6.7987e-08, ..., 2.3004e-07, 0.0000e+00, 4.0047e-08], ..., [ 1.1176e-08, 6.0536e-09, -1.0245e-07, ..., -8.2841e-07, 0.0000e+00, 1.0757e-07], [ 5.2620e-08, 5.4017e-08, 9.0804e-08, ..., 1.5832e-08, 0.0000e+00, 1.3784e-07], [ 3.8184e-08, 2.4401e-07, 1.7229e-07, ..., 1.0291e-07, 0.0000e+00, 1.8533e-07]], device='cuda:0') Epoch 246, bias, value: tensor([-0.0151, -0.0245, -0.0263, -0.0297, -0.0048, 0.0071, 0.0107, -0.0125, -0.0081, -0.0030], device='cuda:0'), grad: tensor([-2.7493e-06, 7.5810e-07, -2.4214e-07, -1.6475e-06, -1.0170e-06, 1.5479e-06, 1.5795e-06, 6.0536e-08, 5.7137e-07, 1.1390e-06], device='cuda:0') 100 0.0001 changing lr epoch 245, time 220.49, cls_loss 0.0012 cls_loss_mapping 0.0022 cls_loss_causal 0.4990 re_mapping 0.0041 re_causal 0.0115 /// teacc 99.08 lr 0.00010000 Epoch 247, weight, value: tensor([[-0.1962, -0.2215, 0.1068, ..., -0.1122, 0.0480, 0.0368], [-0.1271, -0.0513, -0.0868, ..., -0.1833, -0.0880, -0.0313], [ 0.0252, -0.1272, -0.1498, ..., -0.1242, 0.0328, -0.3460], ..., [-0.2039, 0.1473, 0.0144, ..., 0.1744, -0.0525, -0.1320], [-0.1528, -0.1383, 0.1581, ..., -0.1302, -0.1516, 0.1446], [ 0.0166, -0.2744, 0.1452, ..., 0.0633, -0.1980, -0.1181]], device='cuda:0'), grad: tensor([[ 3.2596e-09, 2.3283e-09, 8.8476e-09, ..., 1.3970e-09, 0.0000e+00, 8.8476e-09], [ 3.7253e-09, 5.2107e-07, 1.5926e-07, ..., 4.1444e-07, 0.0000e+00, 6.5193e-09], [ 4.6566e-10, 8.1956e-08, 7.8697e-08, ..., 6.2864e-08, 0.0000e+00, 5.5879e-08], ..., [ 1.8626e-09, -6.7567e-07, -1.9837e-07, ..., -5.3644e-07, 0.0000e+00, 8.3819e-09], [ 4.7963e-08, 8.3819e-09, -9.4064e-08, ..., 2.7940e-09, 0.0000e+00, -9.2201e-08], [-2.0629e-07, 9.3132e-09, -3.3574e-07, ..., 2.1420e-08, 0.0000e+00, -8.3819e-08]], device='cuda:0') Epoch 247, bias, value: tensor([-0.0157, -0.0245, -0.0260, -0.0276, -0.0049, 0.0048, 0.0094, -0.0120, -0.0080, -0.0032], device='cuda:0'), grad: tensor([ 3.9116e-08, 1.2061e-06, 3.3714e-07, 5.3132e-07, 1.6391e-07, 1.0105e-07, 3.0734e-08, -1.4789e-06, -2.0396e-07, -7.2364e-07], device='cuda:0') 100 0.0001 changing lr epoch 246, time 220.24, cls_loss 0.0008 cls_loss_mapping 0.0019 cls_loss_causal 0.4755 re_mapping 0.0041 re_causal 0.0116 /// teacc 99.07 lr 0.00010000 Epoch 248, weight, value: tensor([[-0.1963, -0.2227, 0.1073, ..., -0.1124, 0.0481, 0.0367], [-0.1272, -0.0513, -0.0869, ..., -0.1835, -0.0880, -0.0313], [ 0.0252, -0.1273, -0.1501, ..., -0.1243, 0.0329, -0.3462], ..., [-0.2051, 0.1474, 0.0145, ..., 0.1747, -0.0525, -0.1329], [-0.1532, -0.1405, 0.1580, ..., -0.1311, -0.1518, 0.1445], [ 0.0165, -0.2753, 0.1454, ..., 0.0633, -0.1983, -0.1186]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, -6.9384e-08, ..., 1.3970e-09, 0.0000e+00, -4.3772e-08], [ 5.5414e-08, 3.3062e-08, 3.6322e-08, ..., 2.1886e-08, 0.0000e+00, -1.7695e-08], [ 1.1642e-08, 4.6566e-09, 3.9581e-08, ..., 3.7253e-09, 0.0000e+00, 2.2352e-08], ..., [ 1.8626e-09, -6.5658e-08, -2.5146e-08, ..., -5.1223e-08, 0.0000e+00, 8.3819e-09], [ 6.9849e-09, 2.7940e-09, 2.0023e-08, ..., 2.7940e-09, 0.0000e+00, 1.0710e-08], [-3.5064e-07, 1.0245e-08, -2.2631e-07, ..., 7.9162e-09, 4.6566e-10, 2.0489e-08]], device='cuda:0') Epoch 248, bias, value: tensor([-0.0155, -0.0245, -0.0259, -0.0275, -0.0050, 0.0050, 0.0097, -0.0121, -0.0087, -0.0033], device='cuda:0'), grad: tensor([-1.7881e-07, 8.0559e-08, 1.7043e-07, 5.1782e-07, 4.2049e-07, 2.7148e-07, -2.7753e-07, -4.7032e-08, 7.9628e-08, -1.0207e-06], device='cuda:0') 100 0.0001 changing lr epoch 247, time 220.08, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4633 re_mapping 0.0042 re_causal 0.0118 /// teacc 99.11 lr 0.00010000 Epoch 249, weight, value: tensor([[-0.1963, -0.2231, 0.1078, ..., -0.1125, 0.0478, 0.0369], [-0.1280, -0.0508, -0.0860, ..., -0.1836, -0.0880, -0.0311], [ 0.0252, -0.1277, -0.1508, ..., -0.1246, 0.0332, -0.3466], ..., [-0.2063, 0.1471, 0.0138, ..., 0.1751, -0.0526, -0.1335], [-0.1535, -0.1411, 0.1588, ..., -0.1314, -0.1518, 0.1447], [ 0.0168, -0.2771, 0.1458, ..., 0.0633, -0.1985, -0.1196]], device='cuda:0'), grad: tensor([[ 6.9849e-09, 4.6566e-09, 3.5390e-08, ..., 6.5193e-09, 0.0000e+00, 5.2154e-08], [ 1.3970e-09, 3.7253e-09, 8.3819e-09, ..., 9.3132e-09, 0.0000e+00, 9.3132e-10], [ 1.3970e-09, 1.3970e-09, 1.1176e-08, ..., -1.0710e-08, 0.0000e+00, 1.0245e-08], ..., [ 1.3970e-09, -1.6298e-08, 7.4506e-09, ..., -4.1910e-09, 0.0000e+00, 1.0710e-08], [ 1.3504e-08, -1.8626e-08, -7.9162e-08, ..., 2.3749e-08, 0.0000e+00, -1.5367e-07], [-1.4063e-07, 9.3132e-09, -6.1141e-07, ..., -3.3574e-07, 0.0000e+00, 4.1910e-09]], device='cuda:0') Epoch 249, bias, value: tensor([-0.0153, -0.0238, -0.0261, -0.0259, -0.0054, 0.0035, 0.0097, -0.0127, -0.0087, -0.0033], device='cuda:0'), grad: tensor([ 1.9232e-07, 4.9826e-08, -1.4063e-07, 3.2829e-07, 1.4957e-06, 3.7905e-07, -4.5355e-07, 6.3330e-08, -3.2829e-07, -1.5721e-06], device='cuda:0') 100 0.0001 changing lr epoch 248, time 220.76, cls_loss 0.0011 cls_loss_mapping 0.0038 cls_loss_causal 0.5015 re_mapping 0.0040 re_causal 0.0113 /// teacc 99.16 lr 0.00010000 Epoch 250, weight, value: tensor([[-0.1964, -0.2242, 0.1084, ..., -0.1132, 0.0477, 0.0371], [-0.1284, -0.0509, -0.0862, ..., -0.1841, -0.0884, -0.0309], [ 0.0252, -0.1281, -0.1509, ..., -0.1249, 0.0344, -0.3472], ..., [-0.2066, 0.1478, 0.0145, ..., 0.1763, -0.0526, -0.1339], [-0.1538, -0.1415, 0.1602, ..., -0.1316, -0.1519, 0.1457], [ 0.0175, -0.2790, 0.1490, ..., 0.0650, -0.1987, -0.1213]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, -1.3504e-08, ..., 0.0000e+00, -3.7253e-09, -4.6566e-10], [ 0.0000e+00, 6.5193e-09, 1.3970e-09, ..., 6.0536e-09, 9.3132e-10, 1.3970e-09], [ 0.0000e+00, 2.1420e-08, 1.2573e-08, ..., 2.0489e-08, 5.1223e-09, 2.7940e-09], ..., [ 4.6566e-10, -2.9337e-08, 9.3132e-10, ..., -2.7940e-08, -3.7253e-09, 3.7253e-09], [ 2.3283e-09, 4.6566e-10, -1.6298e-08, ..., 0.0000e+00, 0.0000e+00, -1.3970e-08], [ 4.6566e-10, 9.3132e-10, 4.6566e-10, ..., -9.3132e-10, 0.0000e+00, 1.8626e-09]], device='cuda:0') Epoch 250, bias, value: tensor([-0.0153, -0.0240, -0.0254, -0.0261, -0.0071, 0.0035, 0.0093, -0.0125, -0.0080, -0.0022], device='cuda:0'), grad: tensor([-4.3772e-08, 1.8626e-08, 6.3796e-08, 5.4482e-08, 1.8161e-08, -4.4703e-08, 5.1223e-09, -4.0978e-08, -2.9802e-08, 5.5879e-09], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 249---------------------------------------------------- epoch 249, time 221.21, cls_loss 0.0008 cls_loss_mapping 0.0021 cls_loss_causal 0.4744 re_mapping 0.0040 re_causal 0.0116 /// teacc 99.18 lr 0.00010000 Epoch 251, weight, value: tensor([[-0.1968, -0.2256, 0.1079, ..., -0.1139, 0.0475, 0.0367], [-0.1286, -0.0520, -0.0879, ..., -0.1844, -0.0885, -0.0320], [ 0.0252, -0.1284, -0.1516, ..., -0.1250, 0.0344, -0.3480], ..., [-0.2071, 0.1491, 0.0163, ..., 0.1769, -0.0523, -0.1326], [-0.1537, -0.1419, 0.1621, ..., -0.1318, -0.1518, 0.1469], [ 0.0175, -0.2797, 0.1488, ..., 0.0650, -0.1989, -0.1231]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 3.0268e-08, 1.3970e-09, ..., 2.7940e-09, 1.8626e-09, -3.7253e-09], [ 3.7253e-09, 2.1840e-07, 7.9628e-08, ..., 2.2817e-08, 3.2596e-09, 4.6566e-10], [ 4.6566e-10, 2.8964e-07, 1.1316e-07, ..., -2.3283e-09, -2.4214e-08, 3.2596e-09], ..., [ 8.3819e-09, -5.9418e-07, -2.1374e-07, ..., 5.0291e-08, 1.3039e-08, 0.0000e+00], [ 9.3132e-10, 2.3283e-09, 6.0536e-09, ..., 6.5193e-09, 2.7940e-09, 3.7253e-09], [ 2.0489e-08, 2.8405e-08, -3.2596e-09, ..., 9.9652e-08, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 251, bias, value: tensor([-0.0159, -0.0247, -0.0252, -0.0263, -0.0071, 0.0033, 0.0095, -0.0117, -0.0069, -0.0025], device='cuda:0'), grad: tensor([ 6.9384e-08, 5.0059e-07, 3.0221e-07, 5.2154e-08, -7.1758e-07, 5.0757e-08, 1.8906e-07, -8.4843e-07, 7.5903e-08, 3.3202e-07], device='cuda:0') 100 0.0001 changing lr epoch 250, time 220.36, cls_loss 0.0011 cls_loss_mapping 0.0020 cls_loss_causal 0.4910 re_mapping 0.0039 re_causal 0.0108 /// teacc 99.10 lr 0.00010000 Epoch 252, weight, value: tensor([[-0.1970, -0.2264, 0.1080, ..., -0.1142, 0.0482, 0.0368], [-0.1293, -0.0523, -0.0883, ..., -0.1849, -0.0889, -0.0322], [ 0.0253, -0.1282, -0.1522, ..., -0.1250, 0.0345, -0.3483], ..., [-0.2084, 0.1494, 0.0166, ..., 0.1773, -0.0516, -0.1330], [-0.1539, -0.1423, 0.1627, ..., -0.1327, -0.1519, 0.1473], [ 0.0187, -0.2803, 0.1499, ..., 0.0649, -0.1995, -0.1220]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 4.4238e-08, ..., 1.8626e-09, 0.0000e+00, 1.8161e-08], [ 1.6298e-08, 1.3970e-09, 6.0536e-09, ..., 3.3528e-08, 0.0000e+00, 2.3283e-09], [ 1.0245e-08, 9.3132e-10, -5.0291e-08, ..., 5.1223e-09, 0.0000e+00, 8.8476e-09], ..., [-2.0023e-08, -1.7555e-07, 2.5146e-08, ..., -1.6997e-07, 0.0000e+00, 1.4901e-08], [ 9.3132e-10, 0.0000e+00, -1.5274e-07, ..., 9.3132e-10, 0.0000e+00, -1.3411e-07], [ 4.5868e-07, 1.3970e-09, -6.0536e-08, ..., 1.2117e-06, 0.0000e+00, 3.3528e-08]], device='cuda:0') Epoch 252, bias, value: tensor([-0.0160, -0.0249, -0.0248, -0.0263, -0.0061, 0.0032, 0.0088, -0.0118, -0.0070, -0.0027], device='cuda:0'), grad: tensor([ 3.3760e-07, 5.8487e-07, -1.3020e-06, 7.5437e-08, -3.7551e-06, 5.6578e-07, 5.1223e-08, -2.1141e-07, 9.6858e-08, 3.5539e-06], device='cuda:0') 100 0.0001 changing lr epoch 251, time 220.51, cls_loss 0.0009 cls_loss_mapping 0.0021 cls_loss_causal 0.4835 re_mapping 0.0038 re_causal 0.0108 /// teacc 99.12 lr 0.00010000 Epoch 253, weight, value: tensor([[-0.1971, -0.2270, 0.1088, ..., -0.1144, 0.0483, 0.0369], [-0.1296, -0.0524, -0.0883, ..., -0.1852, -0.0891, -0.0320], [ 0.0254, -0.1285, -0.1532, ..., -0.1252, 0.0349, -0.3491], ..., [-0.2090, 0.1496, 0.0167, ..., 0.1777, -0.0526, -0.1332], [-0.1543, -0.1427, 0.1630, ..., -0.1332, -0.1519, 0.1476], [ 0.0187, -0.2810, 0.1499, ..., 0.0646, -0.1996, -0.1226]], device='cuda:0'), grad: tensor([[ 6.9849e-09, 6.4261e-08, -1.0878e-06, ..., 5.4482e-08, 0.0000e+00, -2.3516e-07], [ 1.0245e-08, -1.4100e-06, -8.8243e-07, ..., 1.0449e-06, 0.0000e+00, 3.1665e-08], [ 1.4435e-08, 1.8813e-07, 6.7055e-07, ..., 1.4249e-07, 0.0000e+00, 1.7369e-07], ..., [ 3.5856e-08, 2.4820e-07, 5.8394e-07, ..., -1.6689e-06, 0.0000e+00, 1.1455e-07], [-4.1677e-07, 8.7544e-08, -1.0533e-06, ..., 6.9849e-08, 0.0000e+00, -1.3635e-06], [ 6.3796e-08, 4.4284e-07, 2.7055e-07, ..., 3.6135e-07, 0.0000e+00, 2.3609e-07]], device='cuda:0') Epoch 253, bias, value: tensor([-0.0158, -0.0245, -0.0249, -0.0264, -0.0063, 0.0033, 0.0087, -0.0117, -0.0070, -0.0033], device='cuda:0'), grad: tensor([-3.9972e-06, -6.5528e-06, 2.7642e-06, 1.4342e-06, -1.1642e-07, 1.7975e-06, 6.2445e-07, 3.6415e-06, -3.1069e-06, 3.5092e-06], device='cuda:0') 100 0.0001 changing lr epoch 252, time 220.31, cls_loss 0.0009 cls_loss_mapping 0.0020 cls_loss_causal 0.5040 re_mapping 0.0039 re_causal 0.0115 /// teacc 99.07 lr 0.00010000 Epoch 254, weight, value: tensor([[-0.1972, -0.2276, 0.1087, ..., -0.1152, 0.0483, 0.0367], [-0.1310, -0.0524, -0.0885, ..., -0.1854, -0.0891, -0.0323], [ 0.0253, -0.1286, -0.1537, ..., -0.1252, 0.0351, -0.3496], ..., [-0.2095, 0.1497, 0.0163, ..., 0.1770, -0.0528, -0.1337], [-0.1548, -0.1427, 0.1634, ..., -0.1337, -0.1520, 0.1478], [ 0.0193, -0.2816, 0.1525, ..., 0.0665, -0.1997, -0.1234]], device='cuda:0'), grad: tensor([[-4.6566e-10, 0.0000e+00, -3.2876e-07, ..., 0.0000e+00, 0.0000e+00, -1.1781e-07], [ 0.0000e+00, 1.3970e-09, 2.2817e-08, ..., 2.1420e-08, 0.0000e+00, 1.3039e-08], [ 4.6566e-10, 1.8626e-09, 3.2131e-08, ..., 2.3283e-09, 0.0000e+00, 2.2352e-08], ..., [ 4.6566e-10, -3.7253e-09, 4.6566e-09, ..., -1.3970e-09, 0.0000e+00, 5.1223e-09], [ 4.6566e-10, 0.0000e+00, -3.7719e-08, ..., 0.0000e+00, 0.0000e+00, -5.8673e-08], [ 9.3132e-10, 9.3132e-10, 2.3283e-08, ..., 6.3796e-08, 0.0000e+00, 1.2107e-08]], device='cuda:0') Epoch 254, bias, value: tensor([-0.0161, -0.0247, -0.0241, -0.0272, -0.0077, 0.0042, 0.0089, -0.0125, -0.0071, -0.0015], device='cuda:0'), grad: tensor([-1.0105e-06, 1.3970e-07, 1.1595e-07, 5.0757e-08, -2.8918e-07, 6.5193e-08, 7.1805e-07, 2.6543e-08, -1.2899e-07, 3.1572e-07], device='cuda:0') 100 0.0001 changing lr epoch 253, time 220.02, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4937 re_mapping 0.0039 re_causal 0.0115 /// teacc 99.11 lr 0.00010000 Epoch 255, weight, value: tensor([[-0.1976, -0.2281, 0.1077, ..., -0.1163, 0.0483, 0.0364], [-0.1315, -0.0524, -0.0885, ..., -0.1856, -0.0892, -0.0322], [ 0.0253, -0.1290, -0.1546, ..., -0.1254, 0.0351, -0.3501], ..., [-0.2065, 0.1497, 0.0163, ..., 0.1775, -0.0527, -0.1343], [-0.1553, -0.1429, 0.1635, ..., -0.1345, -0.1520, 0.1480], [ 0.0194, -0.2818, 0.1534, ..., 0.0668, -0.1997, -0.1232]], device='cuda:0'), grad: tensor([[ 4.2841e-08, 8.7544e-08, 3.6322e-08, ..., 7.8697e-08, 0.0000e+00, 2.3283e-09], [ 4.8429e-08, 2.5565e-07, 1.0664e-07, ..., 1.9651e-07, 0.0000e+00, 1.3504e-08], [ 4.9360e-08, 1.1642e-07, 4.3772e-08, ..., 1.0105e-07, 0.0000e+00, 2.7940e-09], ..., [-2.8266e-07, -7.9395e-07, -2.8685e-07, ..., -6.6636e-07, 0.0000e+00, 5.1223e-09], [ 3.6787e-08, 1.3970e-09, -1.0990e-07, ..., 2.3283e-09, 0.0000e+00, -2.0443e-07], [ 2.2817e-08, 4.7963e-08, 7.9162e-09, ..., 1.0990e-07, 0.0000e+00, 1.0710e-08]], device='cuda:0') Epoch 255, bias, value: tensor([-0.0173, -0.0247, -0.0241, -0.0277, -0.0080, 0.0050, 0.0095, -0.0125, -0.0072, -0.0013], device='cuda:0'), grad: tensor([ 2.8638e-07, 6.4308e-07, 3.6089e-07, 8.5356e-07, -2.3423e-07, -4.0513e-08, 3.0734e-07, -2.2650e-06, -3.3062e-07, 4.1397e-07], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 254---------------------------------------------------- epoch 254, time 221.06, cls_loss 0.0009 cls_loss_mapping 0.0017 cls_loss_causal 0.4680 re_mapping 0.0042 re_causal 0.0112 /// teacc 99.19 lr 0.00010000 Epoch 256, weight, value: tensor([[-0.1977, -0.2293, 0.1080, ..., -0.1165, 0.0483, 0.0365], [-0.1318, -0.0525, -0.0886, ..., -0.1861, -0.0892, -0.0319], [ 0.0252, -0.1295, -0.1551, ..., -0.1259, 0.0351, -0.3505], ..., [-0.2037, 0.1500, 0.0163, ..., 0.1785, -0.0527, -0.1347], [-0.1559, -0.1432, 0.1637, ..., -0.1350, -0.1520, 0.1479], [ 0.0193, -0.2824, 0.1536, ..., 0.0667, -0.1997, -0.1235]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 0.0000e+00, 1.3970e-09, 9.3132e-10, ..., 1.8626e-09, 0.0000e+00, -2.5146e-08], [ 0.0000e+00, 1.3970e-09, 1.3970e-09, ..., 2.3283e-09, 0.0000e+00, 9.3132e-10], ..., [ 4.6566e-10, -6.5193e-09, -5.1223e-09, ..., -7.9162e-09, 0.0000e+00, 1.3504e-08], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.2596e-09], [ 0.0000e+00, 1.8626e-09, 1.3970e-09, ..., 4.1910e-09, 0.0000e+00, 4.6566e-09]], device='cuda:0') Epoch 256, bias, value: tensor([-0.0168, -0.0257, -0.0226, -0.0278, -0.0082, 0.0050, 0.0096, -0.0123, -0.0075, -0.0014], device='cuda:0'), grad: tensor([ 3.3993e-08, -1.4016e-07, -3.0268e-08, 2.0955e-08, 3.9116e-08, -1.3970e-09, -4.7963e-08, 7.4040e-08, 1.7695e-08, 3.9116e-08], device='cuda:0') 100 0.0001 changing lr epoch 255, time 220.65, cls_loss 0.0009 cls_loss_mapping 0.0018 cls_loss_causal 0.4391 re_mapping 0.0040 re_causal 0.0108 /// teacc 99.15 lr 0.00010000 Epoch 257, weight, value: tensor([[-0.1977, -0.2301, 0.1081, ..., -0.1166, 0.0483, 0.0364], [-0.1328, -0.0526, -0.0886, ..., -0.1866, -0.0892, -0.0312], [ 0.0255, -0.1298, -0.1555, ..., -0.1261, 0.0351, -0.3509], ..., [-0.2041, 0.1503, 0.0165, ..., 0.1787, -0.0526, -0.1354], [-0.1566, -0.1436, 0.1639, ..., -0.1355, -0.1520, 0.1477], [ 0.0189, -0.2842, 0.1536, ..., 0.0662, -0.1997, -0.1244]], device='cuda:0'), grad: tensor([[-9.3132e-10, 4.6566e-10, -8.8476e-09, ..., 4.6566e-10, 0.0000e+00, -2.0955e-08], [ 0.0000e+00, 7.1712e-08, 1.5832e-08, ..., 5.4017e-08, 1.1176e-08, -5.1223e-09], [ 0.0000e+00, 4.7963e-08, 1.0710e-08, ..., 3.5856e-08, 7.4506e-09, 5.5879e-09], ..., [ 0.0000e+00, -1.4715e-07, -2.5146e-08, ..., -1.0896e-07, -2.2817e-08, 6.0536e-09], [ 0.0000e+00, 9.3132e-10, -1.4901e-08, ..., 9.3132e-10, 0.0000e+00, -2.0023e-08], [-1.3970e-09, 3.7253e-09, -3.7253e-09, ..., -9.3132e-10, 4.6566e-10, 1.3970e-09]], device='cuda:0') Epoch 257, bias, value: tensor([-0.0168, -0.0258, -0.0224, -0.0285, -0.0073, 0.0058, 0.0095, -0.0124, -0.0080, -0.0022], device='cuda:0'), grad: tensor([-3.4925e-08, 3.4412e-07, -2.7148e-07, 3.4459e-08, 3.4459e-08, 5.4482e-08, 1.5367e-08, -1.7881e-07, 4.6566e-10, 4.6566e-09], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 256---------------------------------------------------- epoch 256, time 221.31, cls_loss 0.0009 cls_loss_mapping 0.0020 cls_loss_causal 0.5087 re_mapping 0.0041 re_causal 0.0115 /// teacc 99.20 lr 0.00010000 Epoch 258, weight, value: tensor([[-0.1979, -0.2308, 0.1083, ..., -0.1168, 0.0483, 0.0364], [-0.1328, -0.0523, -0.0885, ..., -0.1868, -0.0891, -0.0312], [ 0.0254, -0.1300, -0.1557, ..., -0.1261, 0.0351, -0.3512], ..., [-0.2046, 0.1502, 0.0164, ..., 0.1788, -0.0527, -0.1357], [-0.1576, -0.1439, 0.1642, ..., -0.1358, -0.1519, 0.1473], [ 0.0190, -0.2856, 0.1540, ..., 0.0662, -0.1997, -0.1247]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 3.2596e-08, -9.4064e-08, ..., 2.1886e-08, 0.0000e+00, 1.9465e-07], [ 2.7940e-09, 1.4249e-07, 8.7079e-08, ..., 9.7323e-08, 0.0000e+00, -4.0978e-08], [ 9.3132e-10, 2.1933e-07, 1.7090e-07, ..., 1.4994e-07, 0.0000e+00, 1.5367e-08], ..., [ 4.6566e-10, -6.7055e-07, -3.9069e-07, ..., -4.6333e-07, 0.0000e+00, 1.0710e-08], [ 1.8626e-09, 6.0536e-08, 5.6811e-08, ..., 4.0978e-08, 0.0000e+00, 3.3993e-08], [ 1.7649e-07, 2.2259e-07, 8.6613e-08, ..., 9.4995e-08, 0.0000e+00, 1.0151e-07]], device='cuda:0') Epoch 258, bias, value: tensor([-0.0170, -0.0247, -0.0229, -0.0289, -0.0077, 0.0067, 0.0083, -0.0128, -0.0084, -0.0024], device='cuda:0'), grad: tensor([ 6.7707e-07, 4.0047e-07, 9.4576e-07, 3.1050e-06, 5.9372e-07, -2.7996e-06, -2.5705e-06, -1.5814e-06, 3.7672e-07, 8.4611e-07], device='cuda:0') 100 0.0001 changing lr epoch 257, time 220.23, cls_loss 0.0008 cls_loss_mapping 0.0017 cls_loss_causal 0.4767 re_mapping 0.0038 re_causal 0.0110 /// teacc 99.09 lr 0.00010000 Epoch 259, weight, value: tensor([[-0.1981, -0.2315, 0.1080, ..., -0.1173, 0.0483, 0.0361], [-0.1330, -0.0525, -0.0887, ..., -0.1872, -0.0891, -0.0311], [ 0.0254, -0.1304, -0.1564, ..., -0.1269, 0.0350, -0.3517], ..., [-0.2048, 0.1510, 0.0176, ..., 0.1808, -0.0527, -0.1359], [-0.1578, -0.1441, 0.1643, ..., -0.1368, -0.1519, 0.1475], [ 0.0199, -0.2892, 0.1534, ..., 0.0652, -0.1997, -0.1247]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 1.3970e-09, 1.3970e-09, ..., 4.6566e-10, 0.0000e+00, 2.3283e-09], [ 0.0000e+00, 2.2352e-08, 8.3819e-09, ..., 8.8476e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.2596e-09, 1.8626e-09, ..., 1.3970e-09, 0.0000e+00, 1.3970e-09], ..., [ 4.6566e-10, -3.6322e-08, -1.3970e-08, ..., -1.5367e-08, 0.0000e+00, 9.3132e-10], [-4.6566e-09, 4.6566e-10, -4.5169e-08, ..., 0.0000e+00, 0.0000e+00, -5.2154e-08], [ 2.7940e-09, 2.7940e-09, 6.5193e-09, ..., 1.8626e-09, 0.0000e+00, 8.3819e-09]], device='cuda:0') Epoch 259, bias, value: tensor([-0.0176, -0.0247, -0.0231, -0.0290, -0.0079, 0.0063, 0.0096, -0.0119, -0.0087, -0.0031], device='cuda:0'), grad: tensor([ 3.9116e-08, 3.2363e-07, -4.5821e-07, 4.7032e-08, 1.2107e-08, 4.0513e-08, 3.3528e-08, 2.7940e-09, -7.2177e-08, 3.4459e-08], device='cuda:0') 100 0.0001 changing lr epoch 258, time 220.70, cls_loss 0.0009 cls_loss_mapping 0.0025 cls_loss_causal 0.4658 re_mapping 0.0041 re_causal 0.0111 /// teacc 99.19 lr 0.00010000 Epoch 260, weight, value: tensor([[-0.1983, -0.2329, 0.1087, ..., -0.1176, 0.0483, 0.0363], [-0.1332, -0.0525, -0.0888, ..., -0.1874, -0.0891, -0.0307], [ 0.0254, -0.1308, -0.1571, ..., -0.1273, 0.0351, -0.3524], ..., [-0.2056, 0.1511, 0.0176, ..., 0.1810, -0.0527, -0.1363], [-0.1585, -0.1443, 0.1644, ..., -0.1376, -0.1520, 0.1477], [ 0.0201, -0.2899, 0.1539, ..., 0.0653, -0.1997, -0.1254]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 4.6566e-10, 2.7940e-09, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 1.8161e-08, 9.3132e-09, ..., 1.6764e-08, 0.0000e+00, -1.3970e-09], [ 0.0000e+00, 3.7253e-09, 9.3132e-10, ..., 4.6566e-10, 0.0000e+00, 9.3132e-10], ..., [ 4.6566e-10, -3.9116e-08, -1.5367e-08, ..., -5.1688e-08, 0.0000e+00, 4.6566e-10], [ 1.3970e-09, 4.1910e-09, 5.5879e-09, ..., 3.7253e-09, 0.0000e+00, -1.3504e-08], [-5.5879e-09, 3.8184e-08, -2.8871e-08, ..., 1.5367e-08, 0.0000e+00, -1.3970e-09]], device='cuda:0') Epoch 260, bias, value: tensor([-0.0172, -0.0245, -0.0233, -0.0291, -0.0082, 0.0063, 0.0106, -0.0120, -0.0089, -0.0032], device='cuda:0'), grad: tensor([ 1.3970e-08, 6.2864e-08, -1.1642e-07, -1.6857e-07, 5.6345e-08, 1.9372e-07, 6.5193e-09, -7.1246e-08, 1.7695e-08, 2.2817e-08], device='cuda:0') 100 0.0001 changing lr epoch 259, time 220.39, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4810 re_mapping 0.0040 re_causal 0.0113 /// teacc 99.12 lr 0.00010000 Epoch 261, weight, value: tensor([[-0.1986, -0.2335, 0.1090, ..., -0.1180, 0.0483, 0.0351], [-0.1337, -0.0526, -0.0887, ..., -0.1877, -0.0891, -0.0299], [ 0.0253, -0.1310, -0.1577, ..., -0.1275, 0.0351, -0.3528], ..., [-0.2059, 0.1514, 0.0178, ..., 0.1817, -0.0527, -0.1370], [-0.1589, -0.1446, 0.1643, ..., -0.1383, -0.1520, 0.1477], [ 0.0194, -0.2913, 0.1539, ..., 0.0642, -0.1998, -0.1258]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 4.6566e-10, -9.3132e-10, ..., 4.6566e-10, 0.0000e+00, 1.8626e-09], [ 0.0000e+00, 7.4506e-08, 4.8429e-08, ..., 5.6345e-08, 0.0000e+00, 4.2841e-08], [ 0.0000e+00, 1.1642e-08, 8.7544e-08, ..., 9.3132e-09, 0.0000e+00, 1.2014e-07], ..., [ 0.0000e+00, -1.9930e-07, -4.3772e-08, ..., -1.5181e-07, 0.0000e+00, 9.3132e-09], [ 0.0000e+00, 2.3283e-09, -1.6578e-07, ..., 1.8626e-09, 0.0000e+00, -2.6030e-07], [ 9.3132e-10, 3.6787e-08, 1.1176e-08, ..., 2.7940e-08, 0.0000e+00, 6.5193e-09]], device='cuda:0') Epoch 261, bias, value: tensor([-0.0187, -0.0241, -0.0233, -0.0291, -0.0073, 0.0065, 0.0114, -0.0121, -0.0093, -0.0043], device='cuda:0'), grad: tensor([-8.6799e-06, 2.9011e-07, 1.1306e-06, 3.0361e-07, 1.7975e-06, 4.1910e-09, 5.9418e-06, -3.7486e-07, -8.3260e-07, 4.2003e-07], device='cuda:0') 100 0.0001 changing lr epoch 260, time 221.00, cls_loss 0.0010 cls_loss_mapping 0.0022 cls_loss_causal 0.4950 re_mapping 0.0039 re_causal 0.0111 /// teacc 99.05 lr 0.00010000 Epoch 262, weight, value: tensor([[-0.1983, -0.2345, 0.1106, ..., -0.1183, 0.0483, 0.0360], [-0.1345, -0.0527, -0.0890, ..., -0.1880, -0.0891, -0.0303], [ 0.0252, -0.1315, -0.1588, ..., -0.1277, 0.0351, -0.3581], ..., [-0.2062, 0.1516, 0.0179, ..., 0.1819, -0.0527, -0.1379], [-0.1593, -0.1449, 0.1652, ..., -0.1391, -0.1520, 0.1508], [ 0.0197, -0.2918, 0.1542, ..., 0.0641, -0.1998, -0.1264]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.1223e-09, 2.3283e-09, ..., 4.6566e-09, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 2.7474e-08, 3.2596e-09, ..., 1.7695e-08, 0.0000e+00, -3.2596e-09], [-4.6566e-10, 1.0664e-07, 4.6566e-09, ..., 3.9581e-08, 0.0000e+00, 2.7940e-09], ..., [ 9.3132e-10, -1.7416e-07, 7.9162e-09, ..., -5.4948e-08, 0.0000e+00, 3.2596e-09], [ 9.3132e-10, 2.3283e-09, -1.8626e-09, ..., 6.9849e-09, 0.0000e+00, -3.7719e-08], [-3.7253e-09, 5.1223e-09, -3.6787e-08, ..., 8.4611e-07, 0.0000e+00, 2.6543e-08]], device='cuda:0') Epoch 262, bias, value: tensor([-0.0167, -0.0240, -0.0237, -0.0293, -0.0070, 0.0061, 0.0110, -0.0124, -0.0069, -0.0046], device='cuda:0'), grad: tensor([ 2.7940e-08, 6.5658e-08, 1.4994e-07, 5.5879e-08, -2.4308e-06, 2.7474e-08, -5.5879e-09, -2.5379e-07, -8.3819e-09, 2.3842e-06], device='cuda:0') 100 0.0001 changing lr epoch 261, time 220.38, cls_loss 0.0009 cls_loss_mapping 0.0022 cls_loss_causal 0.4957 re_mapping 0.0039 re_causal 0.0111 /// teacc 99.13 lr 0.00010000 Epoch 263, weight, value: tensor([[-0.1987, -0.2357, 0.1099, ..., -0.1185, 0.0483, 0.0354], [-0.1348, -0.0527, -0.0890, ..., -0.1883, -0.0891, -0.0294], [ 0.0252, -0.1324, -0.1593, ..., -0.1287, 0.0351, -0.3585], ..., [-0.2063, 0.1518, 0.0180, ..., 0.1824, -0.0527, -0.1386], [-0.1600, -0.1455, 0.1649, ..., -0.1404, -0.1520, 0.1509], [ 0.0195, -0.2921, 0.1547, ..., 0.0639, -0.1998, -0.1273]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -9.7323e-08, ..., 2.7008e-08, 0.0000e+00, -2.5658e-07], [ 9.3132e-10, 1.8626e-09, 0.0000e+00, ..., 1.3132e-07, 0.0000e+00, -2.3283e-09], [ 0.0000e+00, 7.9162e-09, 4.6566e-10, ..., 2.1420e-08, 0.0000e+00, 4.6566e-10], ..., [ 1.8626e-09, -2.3283e-09, 4.6566e-10, ..., 1.9046e-07, 0.0000e+00, 3.2596e-09], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 1.9558e-08, 0.0000e+00, 4.6566e-10], [ 4.6566e-08, 1.3970e-09, 0.0000e+00, ..., 5.5805e-06, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 263, bias, value: tensor([-0.0181, -0.0237, -0.0239, -0.0295, -0.0068, 0.0067, 0.0111, -0.0125, -0.0073, -0.0048], device='cuda:0'), grad: tensor([-6.6590e-07, 4.2142e-07, -3.8650e-08, -1.2619e-07, -1.8418e-05, 2.7660e-07, 8.7172e-07, 6.7474e-07, 7.8231e-08, 1.6928e-05], device='cuda:0') 100 0.0001 changing lr epoch 262, time 220.25, cls_loss 0.0008 cls_loss_mapping 0.0021 cls_loss_causal 0.4853 re_mapping 0.0039 re_causal 0.0115 /// teacc 99.12 lr 0.00010000 Epoch 264, weight, value: tensor([[-0.1992, -0.2373, 0.1104, ..., -0.1187, 0.0484, 0.0355], [-0.1351, -0.0545, -0.0901, ..., -0.1891, -0.0891, -0.0290], [ 0.0251, -0.1332, -0.1600, ..., -0.1290, 0.0351, -0.3588], ..., [-0.2064, 0.1538, 0.0190, ..., 0.1836, -0.0528, -0.1391], [-0.1604, -0.1460, 0.1653, ..., -0.1405, -0.1520, 0.1511], [ 0.0194, -0.2936, 0.1549, ..., 0.0636, -0.1998, -0.1278]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.7020e-08, 2.3283e-10, ..., 1.6298e-09, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 7.0548e-08, 5.8208e-09, ..., 1.1409e-08, 0.0000e+00, -7.4506e-09], [ 6.9849e-10, 1.1805e-07, 8.8476e-09, ..., 1.7462e-08, 0.0000e+00, 4.6566e-09], ..., [ 0.0000e+00, 8.1491e-09, -9.5461e-09, ..., -2.0955e-08, 0.0000e+00, 5.1223e-09], [-6.9849e-10, 1.8626e-08, -3.0268e-09, ..., -1.3970e-09, 0.0000e+00, -6.9849e-10], [ 2.3283e-10, 5.2154e-08, 2.3283e-09, ..., 1.2107e-08, 0.0000e+00, 1.3970e-09]], device='cuda:0') Epoch 264, bias, value: tensor([-0.0180, -0.0249, -0.0238, -0.0294, -0.0070, 0.0078, 0.0090, -0.0109, -0.0075, -0.0052], device='cuda:0'), grad: tensor([ 1.1991e-07, 1.6065e-07, 3.5926e-07, -9.4250e-07, -1.0757e-07, 9.8487e-08, -1.6997e-08, 7.6601e-08, 6.1467e-08, 1.9674e-07], device='cuda:0') 100 0.0001 changing lr epoch 263, time 220.24, cls_loss 0.0006 cls_loss_mapping 0.0020 cls_loss_causal 0.4835 re_mapping 0.0041 re_causal 0.0117 /// teacc 99.13 lr 0.00010000 Epoch 265, weight, value: tensor([[-0.1992, -0.2366, 0.1107, ..., -0.1187, 0.0484, 0.0360], [-0.1352, -0.0546, -0.0902, ..., -0.1892, -0.0891, -0.0290], [ 0.0251, -0.1334, -0.1603, ..., -0.1292, 0.0351, -0.3590], ..., [-0.2065, 0.1538, 0.0190, ..., 0.1837, -0.0528, -0.1393], [-0.1605, -0.1464, 0.1656, ..., -0.1407, -0.1520, 0.1513], [ 0.0194, -0.2938, 0.1551, ..., 0.0635, -0.1998, -0.1281]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 4.6566e-10, ..., 2.3283e-10, 0.0000e+00, 2.3283e-10], [ 1.1642e-09, 6.9849e-09, 1.3970e-09, ..., 6.5193e-09, 0.0000e+00, 6.9849e-10], [ 0.0000e+00, 9.3132e-10, 2.3283e-10, ..., 6.9849e-10, -9.3132e-10, 0.0000e+00], ..., [ 2.3283e-10, -6.0536e-09, 4.6566e-10, ..., -5.3551e-09, 6.9849e-10, 2.3283e-10], [ 3.7253e-09, 2.3283e-10, 4.6566e-10, ..., 2.3283e-10, 2.3283e-10, 3.0268e-09], [-1.6298e-09, 2.3283e-10, -2.2352e-08, ..., -1.2573e-08, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 265, bias, value: tensor([-0.0176, -0.0249, -0.0237, -0.0291, -0.0068, 0.0072, 0.0092, -0.0110, -0.0074, -0.0053], device='cuda:0'), grad: tensor([ 5.5879e-09, -1.0198e-06, 1.5134e-08, 4.8662e-08, 8.3819e-07, -5.4715e-08, 3.9581e-09, 2.0163e-07, 1.6065e-08, -3.2829e-08], device='cuda:0') 100 0.0001 changing lr epoch 264, time 220.79, cls_loss 0.0009 cls_loss_mapping 0.0022 cls_loss_causal 0.4789 re_mapping 0.0041 re_causal 0.0112 /// teacc 99.12 lr 0.00010000 Epoch 266, weight, value: tensor([[-0.1994, -0.2375, 0.1111, ..., -0.1188, 0.0483, 0.0363], [-0.1354, -0.0546, -0.0902, ..., -0.1893, -0.0891, -0.0287], [ 0.0251, -0.1338, -0.1612, ..., -0.1294, 0.0353, -0.3594], ..., [-0.2067, 0.1539, 0.0191, ..., 0.1838, -0.0531, -0.1399], [-0.1609, -0.1470, 0.1673, ..., -0.1410, -0.1522, 0.1518], [ 0.0197, -0.2940, 0.1555, ..., 0.0637, -0.1998, -0.1288]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 5.1223e-09, -8.2422e-08, ..., 9.3132e-09, 6.9849e-10, -5.5181e-08], [ 1.3039e-08, 1.2526e-07, 3.9348e-08, ..., 1.3527e-07, 2.0955e-09, -2.3283e-10], [ 9.3132e-10, 4.4238e-08, 6.2864e-09, ..., 3.5390e-08, 7.2177e-08, 2.0955e-09], ..., [ 6.9849e-09, -4.5449e-07, -3.3062e-08, ..., -3.8906e-07, 0.0000e+00, 1.6298e-09], [ 2.2119e-08, 1.6298e-09, 1.1595e-07, ..., 5.1688e-08, 1.3970e-09, 4.6333e-08], [-1.3672e-06, 1.0361e-07, -2.9337e-06, ..., -2.7716e-06, 0.0000e+00, 3.0268e-09]], device='cuda:0') Epoch 266, bias, value: tensor([-0.0173, -0.0245, -0.0239, -0.0291, -0.0069, 0.0066, 0.0104, -0.0113, -0.0072, -0.0053], device='cuda:0'), grad: tensor([-3.6554e-08, 7.6322e-07, 1.0222e-05, 3.5553e-07, 1.0110e-05, 1.3458e-07, -1.1295e-05, -1.1409e-06, 5.7090e-07, -9.7305e-06], device='cuda:0') 100 0.0001 changing lr epoch 265, time 220.71, cls_loss 0.0008 cls_loss_mapping 0.0017 cls_loss_causal 0.4769 re_mapping 0.0040 re_causal 0.0116 /// teacc 99.10 lr 0.00010000 Epoch 267, weight, value: tensor([[-0.1995, -0.2388, 0.1114, ..., -0.1189, 0.0483, 0.0363], [-0.1364, -0.0550, -0.0909, ..., -0.1899, -0.0891, -0.0289], [ 0.0250, -0.1343, -0.1618, ..., -0.1295, 0.0353, -0.3597], ..., [-0.2069, 0.1542, 0.0195, ..., 0.1842, -0.0531, -0.1395], [-0.1626, -0.1483, 0.1707, ..., -0.1412, -0.1522, 0.1529], [ 0.0184, -0.2949, 0.1548, ..., 0.0632, -0.1998, -0.1317]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.9849e-10, 6.9849e-10, ..., 2.3283e-10, 0.0000e+00, 4.6566e-10], [ 2.3283e-10, 8.3819e-09, 1.6298e-09, ..., 6.9849e-10, 0.0000e+00, -7.9162e-09], [ 0.0000e+00, 3.9581e-09, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], ..., [ 0.0000e+00, 1.5134e-08, 2.7940e-09, ..., 4.6566e-10, 0.0000e+00, 2.5611e-09], [ 4.4703e-08, 9.3132e-10, 2.1886e-08, ..., 8.6147e-09, 0.0000e+00, 6.3563e-08], [-6.2864e-09, 2.0955e-09, -6.0070e-08, ..., -2.1188e-08, 0.0000e+00, -3.4925e-08]], device='cuda:0') Epoch 267, bias, value: tensor([-0.0173, -0.0247, -0.0240, -0.0287, -0.0066, 0.0088, 0.0075, -0.0111, -0.0065, -0.0062], device='cuda:0'), grad: tensor([ 5.5879e-09, -7.6834e-09, -4.7730e-08, -3.3993e-08, -7.6834e-09, -4.4238e-09, 1.9558e-08, 5.7044e-08, 1.7928e-07, -1.5437e-07], device='cuda:0') 100 0.0001 changing lr epoch 266, time 220.05, cls_loss 0.0009 cls_loss_mapping 0.0016 cls_loss_causal 0.4829 re_mapping 0.0041 re_causal 0.0111 /// teacc 99.08 lr 0.00010000 Epoch 268, weight, value: tensor([[-0.1995, -0.2406, 0.1116, ..., -0.1191, 0.0483, 0.0364], [-0.1368, -0.0552, -0.0910, ..., -0.1902, -0.0891, -0.0290], [ 0.0250, -0.1341, -0.1626, ..., -0.1298, 0.0355, -0.3604], ..., [-0.2069, 0.1544, 0.0196, ..., 0.1846, -0.0531, -0.1398], [-0.1630, -0.1507, 0.1713, ..., -0.1425, -0.1524, 0.1542], [ 0.0183, -0.2957, 0.1551, ..., 0.0631, -0.1998, -0.1321]], device='cuda:0'), grad: tensor([[ 5.3551e-09, 6.9849e-10, 3.4925e-09, ..., 1.6065e-08, 0.0000e+00, 1.0617e-07], [ 4.8894e-09, 8.6147e-09, 2.0349e-07, ..., 4.1444e-08, 0.0000e+00, 2.3586e-07], [ 2.3283e-09, -9.5461e-09, 2.0489e-08, ..., 9.5461e-09, 0.0000e+00, 2.5611e-08], ..., [ 3.5623e-08, 4.4238e-09, 1.8161e-08, ..., 9.0804e-09, 0.0000e+00, 2.9104e-08], [ 1.8626e-08, 9.3132e-10, -3.9325e-07, ..., 2.0955e-09, 0.0000e+00, -1.1874e-08], [ 1.0477e-08, 2.0955e-09, 3.2829e-08, ..., 1.1059e-07, 0.0000e+00, 2.0117e-07]], device='cuda:0') Epoch 268, bias, value: tensor([-0.0174, -0.0247, -0.0239, -0.0280, -0.0066, 0.0084, 0.0072, -0.0111, -0.0060, -0.0063], device='cuda:0'), grad: tensor([ 4.4634e-07, 6.4913e-07, 1.3132e-07, 1.1828e-07, -7.2271e-07, 1.6958e-05, -1.9118e-05, 6.0443e-07, -3.4459e-07, 1.2498e-06], device='cuda:0') 100 0.0001 changing lr epoch 267, time 220.31, cls_loss 0.0010 cls_loss_mapping 0.0019 cls_loss_causal 0.4768 re_mapping 0.0040 re_causal 0.0104 /// teacc 99.15 lr 0.00010000 Epoch 269, weight, value: tensor([[-0.1997, -0.2422, 0.1115, ..., -0.1193, 0.0483, 0.0361], [-0.1375, -0.0553, -0.0912, ..., -0.1905, -0.0892, -0.0292], [ 0.0248, -0.1349, -0.1632, ..., -0.1302, 0.0355, -0.3613], ..., [-0.2061, 0.1546, 0.0195, ..., 0.1851, -0.0532, -0.1407], [-0.1643, -0.1510, 0.1732, ..., -0.1428, -0.1524, 0.1560], [ 0.0177, -0.2962, 0.1553, ..., 0.0631, -0.1999, -0.1333]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 0.0000e+00, -3.5390e-08, ..., 7.4506e-09, 0.0000e+00, 4.0513e-08], [ 1.5832e-08, 1.3970e-09, 1.4016e-07, ..., 2.4680e-08, 0.0000e+00, 1.0757e-07], [ 4.6566e-10, 4.6566e-10, 6.0536e-09, ..., 2.3283e-09, 0.0000e+00, -6.0583e-07], ..., [ 9.3132e-10, -5.5879e-09, 2.7940e-09, ..., -4.1910e-09, 0.0000e+00, 1.5832e-07], [ 1.3970e-09, 4.6566e-10, 4.0513e-08, ..., 3.0268e-08, 0.0000e+00, 1.3225e-07], [-4.1910e-09, 9.3132e-10, -2.4494e-07, ..., -7.4971e-08, 0.0000e+00, 1.1316e-07]], device='cuda:0') Epoch 269, bias, value: tensor([-0.0183, -0.0247, -0.0240, -0.0284, -0.0067, 0.0085, 0.0078, -0.0112, -0.0047, -0.0068], device='cuda:0'), grad: tensor([ 9.3644e-07, 1.4734e-06, -8.3297e-06, 1.7043e-07, 3.3528e-08, 1.3970e-07, 1.9325e-07, 2.0210e-06, 2.3134e-06, 1.0468e-06], device='cuda:0') 100 0.0001 changing lr epoch 268, time 220.81, cls_loss 0.0009 cls_loss_mapping 0.0026 cls_loss_causal 0.4686 re_mapping 0.0040 re_causal 0.0109 /// teacc 99.18 lr 0.00010000 Epoch 270, weight, value: tensor([[-0.1998, -0.2435, 0.1120, ..., -0.1196, 0.0482, 0.0364], [-0.1377, -0.0559, -0.0915, ..., -0.1911, -0.0893, -0.0306], [ 0.0248, -0.1357, -0.1636, ..., -0.1305, 0.0355, -0.3616], ..., [-0.2052, 0.1551, 0.0196, ..., 0.1858, -0.0536, -0.1410], [-0.1646, -0.1514, 0.1741, ..., -0.1432, -0.1525, 0.1573], [ 0.0177, -0.2968, 0.1555, ..., 0.0631, -0.1999, -0.1336]], device='cuda:0'), grad: tensor([[ 6.0536e-09, 2.3283e-09, -1.2573e-08, ..., 4.6566e-10, 0.0000e+00, -4.6566e-10], [ 8.2422e-08, 1.9558e-07, 4.1910e-09, ..., 1.2852e-07, 0.0000e+00, 4.1910e-08], [ 1.0245e-08, 8.4750e-08, 4.6566e-10, ..., 1.3039e-08, 0.0000e+00, 5.1223e-09], ..., [ 9.7789e-09, 4.5123e-07, -1.3970e-09, ..., -1.5041e-07, 0.0000e+00, 4.6566e-09], [ 8.0559e-08, 1.0245e-08, 4.6566e-10, ..., 9.3132e-10, 0.0000e+00, 4.0978e-08], [ 1.6810e-07, 1.2573e-08, -2.0955e-08, ..., -7.9162e-09, 0.0000e+00, 8.4285e-08]], device='cuda:0') Epoch 270, bias, value: tensor([-0.0180, -0.0253, -0.0236, -0.0283, -0.0067, 0.0085, 0.0075, -0.0109, -0.0041, -0.0069], device='cuda:0'), grad: tensor([-1.8161e-08, 6.5612e-07, 1.4761e-07, 6.3740e-06, 7.3109e-08, -8.1584e-06, 1.2247e-07, 2.5798e-07, 1.9930e-07, 3.6508e-07], device='cuda:0') 100 0.0001 changing lr epoch 269, time 220.54, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4827 re_mapping 0.0039 re_causal 0.0113 /// teacc 98.95 lr 0.00010000 Epoch 271, weight, value: tensor([[-0.1999, -0.2440, 0.1113, ..., -0.1202, 0.0483, 0.0354], [-0.1380, -0.0558, -0.0915, ..., -0.1913, -0.0893, -0.0306], [ 0.0248, -0.1357, -0.1636, ..., -0.1308, 0.0356, -0.3623], ..., [-0.2055, 0.1550, 0.0195, ..., 0.1860, -0.0536, -0.1411], [-0.1648, -0.1523, 0.1754, ..., -0.1448, -0.1525, 0.1586], [ 0.0176, -0.2970, 0.1557, ..., 0.0632, -0.1999, -0.1360]], device='cuda:0'), grad: tensor([[ 8.3819e-09, 0.0000e+00, -1.9232e-07, ..., 0.0000e+00, 0.0000e+00, -1.2107e-07], [ 2.7940e-09, 1.8161e-08, 1.0245e-08, ..., 1.1176e-08, 0.0000e+00, -5.1223e-09], [ 1.1176e-08, 8.3819e-09, 1.2107e-08, ..., 5.1223e-09, 0.0000e+00, 2.5611e-08], ..., [ 2.5611e-08, -2.9802e-08, -7.4506e-09, ..., 1.3970e-08, 5.5879e-09, 4.7497e-08], [ 4.2841e-08, 4.6566e-10, 1.5739e-07, ..., 4.6566e-10, 0.0000e+00, 1.8161e-07], [ 8.8802e-07, 2.3283e-09, 9.3132e-10, ..., 2.8405e-08, 4.6566e-10, 1.3690e-06]], device='cuda:0') Epoch 271, bias, value: tensor([-0.0189, -0.0252, -0.0233, -0.0282, -0.0064, 0.0083, 0.0077, -0.0112, -0.0034, -0.0073], device='cuda:0'), grad: tensor([-4.2655e-07, -1.0338e-06, 5.6578e-07, 5.2759e-07, -7.4506e-09, -5.4576e-06, 5.7276e-07, 4.2887e-07, 6.4820e-07, 4.1537e-06], device='cuda:0') 100 0.0001 changing lr epoch 270, time 220.29, cls_loss 0.0009 cls_loss_mapping 0.0020 cls_loss_causal 0.4727 re_mapping 0.0040 re_causal 0.0111 /// teacc 99.08 lr 0.00010000 Epoch 272, weight, value: tensor([[-0.2004, -0.2452, 0.1112, ..., -0.1205, 0.0483, 0.0352], [-0.1394, -0.0557, -0.0914, ..., -0.1916, -0.0893, -0.0303], [ 0.0245, -0.1380, -0.1644, ..., -0.1328, 0.0355, -0.3629], ..., [-0.2079, 0.1555, 0.0195, ..., 0.1865, -0.0541, -0.1416], [-0.1652, -0.1529, 0.1764, ..., -0.1456, -0.1525, 0.1592], [ 0.0138, -0.2973, 0.1558, ..., 0.0617, -0.2000, -0.1378]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -2.9802e-08, ..., 4.6566e-10, 0.0000e+00, -1.3970e-08], [ 0.0000e+00, 1.1642e-08, 5.5879e-09, ..., 9.7789e-09, 0.0000e+00, 2.3283e-09], [ 0.0000e+00, -9.8255e-08, 4.6566e-10, ..., -1.1781e-07, 0.0000e+00, -1.9418e-07], ..., [ 0.0000e+00, 4.8429e-08, -1.9558e-08, ..., 7.4971e-08, 0.0000e+00, 1.9325e-07], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 2.1886e-08, 3.2131e-08, ..., 1.6298e-08, 0.0000e+00, 1.3039e-08]], device='cuda:0') Epoch 272, bias, value: tensor([-0.0196, -0.0250, -0.0234, -0.0286, -0.0052, 0.0091, 0.0075, -0.0115, -0.0030, -0.0090], device='cuda:0'), grad: tensor([ 9.3132e-09, 4.6613e-07, -5.5134e-06, 1.5181e-07, 9.7789e-08, 4.9826e-08, -8.4285e-08, 4.4852e-06, 7.2177e-08, 2.6869e-07], device='cuda:0') 100 0.0001 changing lr epoch 271, time 220.48, cls_loss 0.0010 cls_loss_mapping 0.0016 cls_loss_causal 0.4775 re_mapping 0.0039 re_causal 0.0108 /// teacc 99.05 lr 0.00010000 Epoch 273, weight, value: tensor([[-0.2006, -0.2479, 0.1115, ..., -0.1208, 0.0483, 0.0352], [-0.1397, -0.0560, -0.0918, ..., -0.1921, -0.0893, -0.0306], [ 0.0242, -0.1389, -0.1653, ..., -0.1338, 0.0355, -0.3639], ..., [-0.2083, 0.1565, 0.0197, ..., 0.1875, -0.0541, -0.1426], [-0.1657, -0.1532, 0.1777, ..., -0.1438, -0.1525, 0.1601], [ 0.0139, -0.2976, 0.1560, ..., 0.0615, -0.2000, -0.1388]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, -2.6589e-07, ..., 0.0000e+00, 0.0000e+00, -8.7079e-08], [ 0.0000e+00, 6.1933e-08, 2.7474e-08, ..., 3.7719e-08, 0.0000e+00, 3.7253e-09], [ 0.0000e+00, 5.0291e-08, 6.0536e-09, ..., -3.0268e-08, 0.0000e+00, 1.8626e-09], ..., [ 0.0000e+00, -2.1840e-07, -2.9802e-08, ..., -6.1002e-08, 0.0000e+00, 2.3283e-09], [ 0.0000e+00, 2.3283e-09, 3.7253e-09, ..., 1.3970e-09, 0.0000e+00, 1.3970e-09], [-4.6566e-10, 4.1910e-09, 1.0710e-08, ..., 9.3132e-10, 0.0000e+00, 4.1910e-09]], device='cuda:0') Epoch 273, bias, value: tensor([-0.0190, -0.0234, -0.0255, -0.0295, -0.0051, 0.0094, 0.0076, -0.0111, -0.0022, -0.0094], device='cuda:0'), grad: tensor([-8.4937e-07, 8.8476e-09, -1.1129e-06, 1.5274e-07, -9.7789e-08, 2.5146e-08, 8.0559e-07, 8.7824e-07, 1.4622e-07, 4.8894e-08], device='cuda:0') 100 0.0001 changing lr epoch 272, time 220.50, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4886 re_mapping 0.0040 re_causal 0.0109 /// teacc 99.11 lr 0.00010000 Epoch 274, weight, value: tensor([[-0.2006, -0.2485, 0.1124, ..., -0.1211, 0.0483, 0.0355], [-0.1409, -0.0561, -0.0923, ..., -0.1926, -0.0893, -0.0306], [ 0.0241, -0.1387, -0.1657, ..., -0.1339, 0.0355, -0.3642], ..., [-0.2088, 0.1572, 0.0209, ..., 0.1890, -0.0541, -0.1434], [-0.1662, -0.1535, 0.1783, ..., -0.1441, -0.1525, 0.1604], [ 0.0139, -0.3005, 0.1550, ..., 0.0611, -0.2000, -0.1395]], device='cuda:0'), grad: tensor([[-7.4506e-09, -1.1642e-08, -7.4971e-08, ..., 4.6566e-10, -4.6566e-10, -4.5635e-08], [ 0.0000e+00, 2.3283e-09, 8.3819e-09, ..., 2.3283e-09, 0.0000e+00, 7.4506e-09], [ 9.3132e-10, 5.5879e-09, 1.2107e-08, ..., 9.3132e-10, 0.0000e+00, 7.9162e-09], ..., [ 1.8626e-09, 3.7253e-09, 3.7253e-08, ..., 2.1420e-08, 0.0000e+00, 1.1642e-08], [ 7.4506e-09, 1.8626e-09, 1.2247e-07, ..., 8.8476e-08, 0.0000e+00, 2.2817e-08], [-6.0536e-09, 0.0000e+00, -1.6531e-07, ..., -1.3690e-07, 0.0000e+00, 8.8476e-09]], device='cuda:0') Epoch 274, bias, value: tensor([-0.0186, -0.0233, -0.0254, -0.0302, -0.0053, 0.0098, 0.0076, -0.0107, -0.0022, -0.0100], device='cuda:0'), grad: tensor([-9.1270e-08, 4.0978e-08, -6.0070e-08, 2.0489e-08, 7.9162e-08, 6.6357e-07, -7.8836e-07, 1.2480e-07, 3.9162e-07, -3.8370e-07], device='cuda:0') 100 0.0001 changing lr epoch 273, time 220.60, cls_loss 0.0008 cls_loss_mapping 0.0018 cls_loss_causal 0.4632 re_mapping 0.0038 re_causal 0.0108 /// teacc 99.12 lr 0.00010000 Epoch 275, weight, value: tensor([[-0.2007, -0.2492, 0.1120, ..., -0.1224, 0.0483, 0.0354], [-0.1410, -0.0559, -0.0924, ..., -0.1928, -0.0893, -0.0300], [ 0.0241, -0.1393, -0.1662, ..., -0.1346, 0.0355, -0.3652], ..., [-0.2089, 0.1572, 0.0208, ..., 0.1901, -0.0541, -0.1442], [-0.1666, -0.1541, 0.1786, ..., -0.1443, -0.1525, 0.1606], [ 0.0140, -0.3009, 0.1556, ..., 0.0611, -0.2000, -0.1394]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 2.7940e-09, 4.6566e-10, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 7.7300e-08, 0.0000e+00, ..., 8.1491e-08, 0.0000e+00, -1.8626e-09], [ 0.0000e+00, 3.5465e-06, 0.0000e+00, ..., 3.7942e-06, 0.0000e+00, 4.6566e-10], ..., [ 4.6566e-10, -3.6471e-06, 0.0000e+00, ..., -3.9078e-06, 0.0000e+00, 1.8626e-09], [ 9.7789e-09, 1.3970e-09, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 7.9162e-09], [ 9.3132e-10, 1.7695e-08, 9.3132e-10, ..., 7.4506e-09, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 275, bias, value: tensor([-0.0187, -0.0234, -0.0252, -0.0301, -0.0059, 0.0098, 0.0078, -0.0106, -0.0021, -0.0101], device='cuda:0'), grad: tensor([ 9.7789e-09, 1.6764e-07, 8.9109e-06, 3.0408e-07, 5.1223e-09, -3.3574e-07, -9.3132e-10, -9.1344e-06, 3.3528e-08, 4.1444e-08], device='cuda:0') 100 0.0001 changing lr epoch 274, time 220.14, cls_loss 0.0008 cls_loss_mapping 0.0022 cls_loss_causal 0.4683 re_mapping 0.0040 re_causal 0.0108 /// teacc 99.16 lr 0.00010000 Epoch 276, weight, value: tensor([[-0.2010, -0.2502, 0.1114, ..., -0.1249, 0.0483, 0.0356], [-0.1410, -0.0564, -0.0931, ..., -0.1937, -0.0894, -0.0307], [ 0.0241, -0.1406, -0.1670, ..., -0.1369, 0.0355, -0.3656], ..., [-0.2092, 0.1579, 0.0213, ..., 0.1912, -0.0541, -0.1437], [-0.1669, -0.1549, 0.1801, ..., -0.1446, -0.1525, 0.1626], [ 0.0140, -0.3017, 0.1563, ..., 0.0613, -0.2000, -0.1395]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 6.0536e-09, 4.6566e-09, ..., 6.0536e-09, 0.0000e+00, 1.2107e-08], [ 0.0000e+00, 9.5926e-08, 2.8405e-08, ..., 0.0000e+00, 0.0000e+00, 1.3411e-07], ..., [ 4.6566e-10, -2.3283e-09, -5.5879e-09, ..., -1.4901e-08, 0.0000e+00, 4.1910e-09], [ 4.6566e-10, 4.6566e-10, -8.2422e-08, ..., 0.0000e+00, 0.0000e+00, -3.8976e-07], [ 1.8626e-09, 9.7789e-09, 4.1910e-09, ..., 1.0245e-08, 0.0000e+00, 1.8626e-09]], device='cuda:0') Epoch 276, bias, value: tensor([-0.0192, -0.0238, -0.0250, -0.0299, -0.0059, 0.0095, 0.0071, -0.0103, -0.0003, -0.0101], device='cuda:0'), grad: tensor([ 4.6566e-10, 3.6787e-08, 4.6473e-07, -1.8626e-09, -2.3283e-09, 4.2003e-07, 2.1420e-08, 1.0710e-08, -9.7137e-07, 2.3283e-08], device='cuda:0') 100 0.0001 changing lr epoch 275, time 221.13, cls_loss 0.0009 cls_loss_mapping 0.0023 cls_loss_causal 0.4923 re_mapping 0.0038 re_causal 0.0109 /// teacc 99.07 lr 0.00010000 Epoch 277, weight, value: tensor([[-0.2012, -0.2507, 0.1125, ..., -0.1251, 0.0487, 0.0363], [-0.1415, -0.0579, -0.0943, ..., -0.1972, -0.0894, -0.0308], [ 0.0241, -0.1408, -0.1675, ..., -0.1371, 0.0354, -0.3659], ..., [-0.2087, 0.1598, 0.0226, ..., 0.1949, -0.0541, -0.1439], [-0.1690, -0.1558, 0.1801, ..., -0.1456, -0.1526, 0.1621], [ 0.0140, -0.3025, 0.1563, ..., 0.0609, -0.2000, -0.1398]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 1.8720e-07, ..., 1.1129e-07, 0.0000e+00, -4.6566e-09], [ 4.6566e-10, -3.2596e-09, 1.3784e-07, ..., 7.4971e-08, 0.0000e+00, -1.8626e-09], [ 0.0000e+00, 9.3132e-10, 1.4901e-08, ..., -6.5193e-09, 0.0000e+00, 1.3970e-09], ..., [ 4.6566e-10, 7.4506e-09, 2.3786e-06, ..., 1.3039e-06, 0.0000e+00, 4.1910e-09], [ 4.6566e-10, 0.0000e+00, 2.4214e-08, ..., 1.5367e-08, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, -3.2019e-06, ..., -1.7490e-06, 0.0000e+00, 1.3970e-09]], device='cuda:0') Epoch 277, bias, value: tensor([-0.0183, -0.0252, -0.0250, -0.0300, -0.0059, 0.0081, 0.0087, -0.0081, -0.0010, -0.0106], device='cuda:0'), grad: tensor([ 5.1968e-07, 3.1665e-07, -3.0268e-08, 6.9151e-07, 1.0990e-07, 4.4284e-07, -1.8161e-08, 6.4969e-06, 6.6124e-08, -8.5607e-06], device='cuda:0') 100 0.0001 changing lr epoch 276, time 220.53, cls_loss 0.0008 cls_loss_mapping 0.0013 cls_loss_causal 0.4916 re_mapping 0.0040 re_causal 0.0111 /// teacc 99.05 lr 0.00010000 Epoch 278, weight, value: tensor([[-0.2014, -0.2514, 0.1135, ..., -0.1255, 0.0489, 0.0369], [-0.1420, -0.0583, -0.0944, ..., -0.1976, -0.0894, -0.0307], [ 0.0246, -0.1414, -0.1680, ..., -0.1379, 0.0352, -0.3664], ..., [-0.2092, 0.1604, 0.0226, ..., 0.1953, -0.0542, -0.1443], [-0.1696, -0.1569, 0.1804, ..., -0.1463, -0.1528, 0.1622], [ 0.0139, -0.3028, 0.1568, ..., 0.0608, -0.2002, -0.1400]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, -1.0245e-08, ..., 8.3819e-09, 0.0000e+00, 9.3132e-10], [ 4.6566e-10, 1.6904e-07, 1.7649e-07, ..., 4.7730e-07, 0.0000e+00, -6.0536e-09], [ 9.3132e-10, 0.0000e+00, 4.6566e-09, ..., 3.2596e-09, 0.0000e+00, 2.7940e-09], ..., [ 1.5832e-08, -1.8114e-07, -1.8720e-07, ..., -4.5542e-07, 0.0000e+00, 2.4214e-08], [ 2.7940e-09, 0.0000e+00, 9.3132e-10, ..., 2.3283e-09, 0.0000e+00, 4.1910e-09], [ 1.9558e-08, 1.2573e-08, 1.6298e-08, ..., 2.2575e-06, 0.0000e+00, 2.5611e-08]], device='cuda:0') Epoch 278, bias, value: tensor([-0.0176, -0.0254, -0.0249, -0.0298, -0.0054, 0.0080, 0.0087, -0.0079, -0.0014, -0.0108], device='cuda:0'), grad: tensor([ 6.5193e-09, 9.4529e-07, 4.3772e-08, 1.2293e-07, -7.8753e-06, -2.5984e-07, 5.5414e-08, -7.7765e-07, 2.2817e-08, 7.7263e-06], device='cuda:0') 100 0.0001 changing lr epoch 277, time 220.52, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.5067 re_mapping 0.0037 re_causal 0.0108 /// teacc 99.13 lr 0.00010000 Epoch 279, weight, value: tensor([[-0.2016, -0.2514, 0.1139, ..., -0.1263, 0.0490, 0.0372], [-0.1425, -0.0583, -0.0947, ..., -0.1976, -0.0892, -0.0307], [ 0.0245, -0.1418, -0.1686, ..., -0.1386, 0.0351, -0.3670], ..., [-0.2095, 0.1606, 0.0225, ..., 0.1954, -0.0539, -0.1446], [-0.1698, -0.1576, 0.1807, ..., -0.1471, -0.1529, 0.1624], [ 0.0140, -0.3030, 0.1574, ..., 0.0608, -0.2002, -0.1405]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -1.6298e-08, ..., 7.4506e-09, -4.6566e-10, -1.0245e-08], [ 0.0000e+00, 1.8626e-09, 4.1910e-09, ..., 4.6566e-09, 0.0000e+00, -4.6566e-10], [ 0.0000e+00, 4.6566e-10, 3.7253e-09, ..., 4.6566e-10, -9.3132e-10, 9.3132e-10], ..., [ 0.0000e+00, -9.3132e-09, 4.6566e-09, ..., -8.3819e-09, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 4.6566e-10, 9.7789e-09, ..., 8.8476e-09, 9.3132e-10, -3.7253e-09], [ 0.0000e+00, 9.7789e-09, -4.6100e-08, ..., -4.1444e-08, 4.6566e-10, 4.1910e-09]], device='cuda:0') Epoch 279, bias, value: tensor([-0.0171, -0.0252, -0.0249, -0.0300, -0.0053, 0.0079, 0.0088, -0.0083, -0.0014, -0.0107], device='cuda:0'), grad: tensor([-4.4703e-08, 1.1176e-08, -4.1910e-09, -3.8184e-08, 5.2154e-08, 6.3796e-08, 1.6298e-08, 3.7253e-09, 2.7940e-08, -8.7079e-08], device='cuda:0') 100 0.0001 changing lr epoch 278, time 220.88, cls_loss 0.0007 cls_loss_mapping 0.0015 cls_loss_causal 0.4800 re_mapping 0.0036 re_causal 0.0106 /// teacc 99.07 lr 0.00010000 Epoch 280, weight, value: tensor([[-0.2023, -0.2525, 0.1146, ..., -0.1268, 0.0490, 0.0373], [-0.1433, -0.0584, -0.0948, ..., -0.1977, -0.0893, -0.0306], [ 0.0244, -0.1416, -0.1692, ..., -0.1389, 0.0351, -0.3677], ..., [-0.2108, 0.1606, 0.0224, ..., 0.1954, -0.0538, -0.1451], [-0.1704, -0.1585, 0.1808, ..., -0.1475, -0.1529, 0.1626], [ 0.0160, -0.3031, 0.1587, ..., 0.0614, -0.2003, -0.1407]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 4.6566e-09, 3.2596e-09, ..., 3.2596e-09, 0.0000e+00, 9.3132e-10], [ 0.0000e+00, 5.1782e-07, 5.5879e-09, ..., 5.9139e-07, 0.0000e+00, -2.7940e-09], [ 0.0000e+00, 1.4622e-07, 6.0536e-09, ..., 1.5739e-07, 0.0000e+00, 1.3970e-09], ..., [ 4.6566e-10, -6.8592e-07, -2.0023e-08, ..., -7.7393e-07, 0.0000e+00, 4.6566e-10], [ 4.6566e-10, 1.2573e-08, 6.0536e-09, ..., 3.2596e-09, 0.0000e+00, 1.3970e-09], [ 1.8626e-09, 4.0513e-08, 2.7940e-09, ..., 2.3283e-09, 0.0000e+00, 2.3283e-09]], device='cuda:0') Epoch 280, bias, value: tensor([-0.0166, -0.0252, -0.0247, -0.0300, -0.0050, 0.0078, 0.0085, -0.0086, -0.0014, -0.0102], device='cuda:0'), grad: tensor([ 1.9558e-08, 1.2107e-06, 3.6135e-07, -1.3690e-07, 2.1420e-08, 2.6543e-08, -5.1223e-09, -1.6121e-06, 4.1910e-08, 8.2422e-08], device='cuda:0') 100 0.0001 changing lr epoch 279, time 220.50, cls_loss 0.0010 cls_loss_mapping 0.0018 cls_loss_causal 0.5053 re_mapping 0.0037 re_causal 0.0104 /// teacc 99.10 lr 0.00010000 Epoch 281, weight, value: tensor([[-0.2037, -0.2532, 0.1143, ..., -0.1276, 0.0484, 0.0369], [-0.1457, -0.0569, -0.0946, ..., -0.1979, -0.0891, -0.0300], [ 0.0216, -0.1422, -0.1713, ..., -0.1396, 0.0339, -0.3710], ..., [-0.2113, 0.1596, 0.0220, ..., 0.1954, -0.0538, -0.1465], [-0.1721, -0.1594, 0.1809, ..., -0.1484, -0.1531, 0.1626], [ 0.0164, -0.3033, 0.1614, ..., 0.0637, -0.2003, -0.1406]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 1.8626e-09, -1.1642e-07, ..., 6.5193e-09, 0.0000e+00, -5.8673e-08], [ 9.3132e-10, 3.1199e-08, 2.3749e-08, ..., 2.8405e-08, 0.0000e+00, 1.1176e-08], [ 0.0000e+00, -7.2643e-08, 8.3819e-09, ..., -7.2177e-08, 0.0000e+00, -7.9162e-09], ..., [ 1.3970e-09, 4.9360e-08, 5.8208e-08, ..., 8.3353e-08, 0.0000e+00, 2.7008e-08], [ 7.4506e-09, 5.5879e-09, 3.3993e-08, ..., 2.5146e-08, 0.0000e+00, 2.2352e-08], [-9.3132e-10, 9.3132e-10, -2.5099e-07, ..., -3.6322e-07, 0.0000e+00, -1.2573e-08]], device='cuda:0') Epoch 281, bias, value: tensor([-0.0163, -0.0245, -0.0253, -0.0294, -0.0065, 0.0080, 0.0083, -0.0099, -0.0018, -0.0081], device='cuda:0'), grad: tensor([-3.0873e-07, 3.7532e-07, -1.2759e-06, 2.7940e-08, 5.5647e-07, 1.2573e-08, 4.0047e-08, 1.0990e-06, 1.4668e-07, -6.7474e-07], device='cuda:0') 100 0.0001 changing lr epoch 280, time 221.02, cls_loss 0.0007 cls_loss_mapping 0.0012 cls_loss_causal 0.4744 re_mapping 0.0037 re_causal 0.0109 /// teacc 99.11 lr 0.00010000 Epoch 282, weight, value: tensor([[-0.2039, -0.2538, 0.1131, ..., -0.1302, 0.0484, 0.0364], [-0.1459, -0.0569, -0.0946, ..., -0.1980, -0.0891, -0.0297], [ 0.0216, -0.1422, -0.1721, ..., -0.1388, 0.0341, -0.3708], ..., [-0.2117, 0.1596, 0.0220, ..., 0.1953, -0.0542, -0.1477], [-0.1722, -0.1597, 0.1811, ..., -0.1490, -0.1531, 0.1629], [ 0.0164, -0.3035, 0.1622, ..., 0.0631, -0.2010, -0.1409]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 1.3970e-09, -1.9092e-08, ..., 1.4901e-08, 0.0000e+00, -9.3132e-09], [ 1.3970e-09, 1.0245e-08, 3.7253e-09, ..., 5.4482e-08, 0.0000e+00, -1.8626e-09], [ 1.3970e-09, 1.8161e-08, 6.0536e-09, ..., 2.9337e-08, 0.0000e+00, 1.8626e-09], ..., [ 4.6566e-09, 1.0245e-08, 1.3504e-08, ..., 3.0734e-08, 0.0000e+00, 5.1223e-09], [ 2.3283e-09, 4.1910e-09, 1.1735e-07, ..., 9.8720e-08, 0.0000e+00, 2.7474e-08], [-2.2352e-08, 3.7253e-09, -2.2585e-07, ..., 1.3039e-07, 0.0000e+00, -5.1223e-08]], device='cuda:0') Epoch 282, bias, value: tensor([-0.0177, -0.0244, -0.0250, -0.0295, -0.0058, 0.0079, 0.0086, -0.0103, -0.0017, -0.0085], device='cuda:0'), grad: tensor([-1.1176e-08, 1.6671e-07, 1.0943e-07, -2.5937e-07, -1.2312e-06, 2.5472e-07, 6.4261e-08, 1.4668e-07, 2.6124e-07, 5.0152e-07], device='cuda:0') 100 0.0001 changing lr epoch 281, time 220.64, cls_loss 0.0010 cls_loss_mapping 0.0021 cls_loss_causal 0.4430 re_mapping 0.0039 re_causal 0.0105 /// teacc 99.13 lr 0.00010000 Epoch 283, weight, value: tensor([[-0.2044, -0.2545, 0.1155, ..., -0.1310, 0.0485, 0.0372], [-0.1462, -0.0569, -0.0948, ..., -0.1982, -0.0888, -0.0294], [ 0.0216, -0.1426, -0.1755, ..., -0.1394, 0.0338, -0.3711], ..., [-0.2118, 0.1597, 0.0221, ..., 0.1956, -0.0544, -0.1483], [-0.1726, -0.1601, 0.1813, ..., -0.1496, -0.1532, 0.1630], [ 0.0162, -0.3040, 0.1618, ..., 0.0623, -0.2018, -0.1422]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, -6.3563e-07, ..., 1.3970e-09, 0.0000e+00, -1.7276e-07], [ 0.0000e+00, 2.7008e-08, 1.7695e-08, ..., 1.8626e-08, 0.0000e+00, 2.3283e-09], [ 0.0000e+00, 1.3085e-07, 6.2585e-07, ..., 5.9139e-08, 0.0000e+00, 1.2480e-07], ..., [ 0.0000e+00, -2.4494e-07, -1.0803e-07, ..., -1.3458e-07, 0.0000e+00, 2.7940e-09], [ 9.3132e-10, 5.3085e-08, 2.7008e-08, ..., 2.3283e-08, 0.0000e+00, 2.4214e-08], [-6.9849e-09, 2.5146e-08, 2.6543e-08, ..., 2.4214e-08, 0.0000e+00, 6.0536e-09]], device='cuda:0') Epoch 283, bias, value: tensor([-0.0162, -0.0243, -0.0251, -0.0294, -0.0059, 0.0095, 0.0078, -0.0105, -0.0018, -0.0098], device='cuda:0'), grad: tensor([-1.8757e-06, 8.7079e-08, 1.5115e-06, 1.1828e-07, 1.6298e-08, 3.2596e-08, 1.9558e-08, -5.2946e-07, 4.9733e-07, 1.2713e-07], device='cuda:0') 100 0.0001 changing lr epoch 282, time 220.32, cls_loss 0.0007 cls_loss_mapping 0.0017 cls_loss_causal 0.4963 re_mapping 0.0039 re_causal 0.0109 /// teacc 99.12 lr 0.00010000 Epoch 284, weight, value: tensor([[-0.2056, -0.2549, 0.1167, ..., -0.1311, 0.0487, 0.0374], [-0.1464, -0.0569, -0.0948, ..., -0.1983, -0.0887, -0.0290], [ 0.0215, -0.1428, -0.1773, ..., -0.1394, 0.0338, -0.3716], ..., [-0.2119, 0.1598, 0.0222, ..., 0.1957, -0.0547, -0.1486], [-0.1733, -0.1610, 0.1812, ..., -0.1511, -0.1532, 0.1631], [ 0.0160, -0.3043, 0.1618, ..., 0.0622, -0.2020, -0.1426]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -7.9162e-09, ..., 4.6566e-10, 0.0000e+00, 2.6971e-06], [ 0.0000e+00, 1.3970e-09, 4.6566e-10, ..., 1.8626e-09, 0.0000e+00, -1.3039e-08], [ 0.0000e+00, -4.6566e-09, 0.0000e+00, ..., -9.7789e-09, 0.0000e+00, 2.7940e-09], ..., [ 0.0000e+00, 2.3283e-09, 2.1420e-08, ..., 1.9092e-08, 0.0000e+00, 1.3970e-09], [-9.3132e-10, 0.0000e+00, -4.1910e-09, ..., 0.0000e+00, 0.0000e+00, 6.0536e-09], [ 4.6566e-10, 0.0000e+00, -2.4680e-08, ..., -9.7789e-09, 0.0000e+00, 4.6566e-09]], device='cuda:0') Epoch 284, bias, value: tensor([-0.0156, -0.0243, -0.0250, -0.0296, -0.0056, 0.0096, 0.0077, -0.0107, -0.0023, -0.0102], device='cuda:0'), grad: tensor([ 9.8869e-06, -4.1910e-08, -7.2643e-08, 1.2107e-08, 2.7940e-08, -4.6100e-08, -9.9093e-06, 1.0291e-07, 4.1444e-08, -1.1642e-08], device='cuda:0') 100 0.0001 changing lr epoch 283, time 220.40, cls_loss 0.0009 cls_loss_mapping 0.0016 cls_loss_causal 0.4982 re_mapping 0.0037 re_causal 0.0105 /// teacc 99.05 lr 0.00010000 Epoch 285, weight, value: tensor([[-0.2060, -0.2568, 0.1177, ..., -0.1315, 0.0489, 0.0379], [-0.1469, -0.0570, -0.0949, ..., -0.1984, -0.0887, -0.0287], [ 0.0217, -0.1436, -0.1778, ..., -0.1398, 0.0339, -0.3719], ..., [-0.2123, 0.1601, 0.0226, ..., 0.1960, -0.0548, -0.1490], [-0.1741, -0.1621, 0.1808, ..., -0.1523, -0.1532, 0.1630], [ 0.0156, -0.3054, 0.1618, ..., 0.0606, -0.2020, -0.1430]], device='cuda:0'), grad: tensor([[ 3.2596e-09, 9.3132e-10, -4.6566e-10, ..., 2.7940e-09, 0.0000e+00, 2.7940e-09], [ 3.2596e-09, 1.1642e-08, 3.7253e-09, ..., 1.5832e-08, 0.0000e+00, 2.3283e-09], [ 3.3062e-08, 7.2643e-08, 1.8626e-09, ..., 8.8941e-08, 0.0000e+00, 2.3283e-09], ..., [-3.9581e-08, -1.2107e-07, -9.3132e-09, ..., -1.5972e-07, 0.0000e+00, 1.3970e-09], [ 1.1176e-08, 4.6566e-10, -3.5390e-08, ..., 9.3132e-10, 0.0000e+00, -2.5146e-08], [ 2.8266e-07, 1.6764e-08, 3.2596e-08, ..., 1.3970e-07, 0.0000e+00, 1.8626e-07]], device='cuda:0') Epoch 285, bias, value: tensor([-0.0150, -0.0243, -0.0250, -0.0290, -0.0045, 0.0096, 0.0077, -0.0106, -0.0027, -0.0118], device='cuda:0'), grad: tensor([ 1.5832e-08, 6.3330e-08, 3.1479e-07, 1.0151e-07, -4.1956e-07, -6.6776e-07, 3.8184e-08, -5.0524e-07, -3.7719e-08, 1.1064e-06], device='cuda:0') 100 0.0001 changing lr epoch 284, time 220.21, cls_loss 0.0008 cls_loss_mapping 0.0026 cls_loss_causal 0.4685 re_mapping 0.0036 re_causal 0.0104 /// teacc 99.08 lr 0.00010000 Epoch 286, weight, value: tensor([[-0.2062, -0.2578, 0.1156, ..., -0.1341, 0.0487, 0.0380], [-0.1473, -0.0574, -0.0953, ..., -0.1986, -0.0887, -0.0287], [ 0.0217, -0.1445, -0.1786, ..., -0.1408, 0.0341, -0.3721], ..., [-0.2123, 0.1606, 0.0228, ..., 0.1964, -0.0548, -0.1493], [-0.1746, -0.1627, 0.1810, ..., -0.1526, -0.1532, 0.1628], [ 0.0156, -0.3057, 0.1637, ..., 0.0612, -0.2020, -0.1431]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 4.6566e-10, -9.8441e-07, ..., 9.7789e-09, 0.0000e+00, -1.3905e-06], [ 1.3970e-09, 2.3283e-09, 1.0710e-08, ..., 2.7940e-09, 0.0000e+00, 1.3970e-08], [ 4.6566e-10, 9.3132e-10, 4.6566e-09, ..., 4.1910e-09, 0.0000e+00, 6.5193e-09], ..., [ 4.6566e-09, 2.3283e-09, 3.0268e-08, ..., 4.1910e-09, 0.0000e+00, 3.7253e-08], [ 9.3132e-10, 9.3132e-10, 3.9442e-07, ..., 1.3970e-09, 0.0000e+00, 5.5460e-07], [-5.0291e-08, 9.3132e-09, -6.9849e-08, ..., 3.5064e-07, 0.0000e+00, 1.7229e-08]], device='cuda:0') Epoch 286, bias, value: tensor([-0.0164, -0.0245, -0.0247, -0.0292, -0.0053, 0.0096, 0.0077, -0.0102, -0.0033, -0.0111], device='cuda:0'), grad: tensor([-3.2671e-06, 5.0291e-08, 3.1199e-08, -1.4529e-07, -1.1008e-06, 7.9349e-07, 1.2573e-06, 1.1222e-07, 1.3318e-06, 9.4250e-07], device='cuda:0') 100 0.0001 changing lr epoch 285, time 220.38, cls_loss 0.0008 cls_loss_mapping 0.0015 cls_loss_causal 0.4817 re_mapping 0.0037 re_causal 0.0105 /// teacc 99.04 lr 0.00010000 Epoch 287, weight, value: tensor([[-0.2064, -0.2580, 0.1160, ..., -0.1342, 0.0488, 0.0386], [-0.1492, -0.0577, -0.0959, ..., -0.1991, -0.0879, -0.0289], [ 0.0218, -0.1449, -0.1793, ..., -0.1409, 0.0337, -0.3720], ..., [-0.2131, 0.1608, 0.0228, ..., 0.1969, -0.0549, -0.1501], [-0.1748, -0.1629, 0.1816, ..., -0.1524, -0.1532, 0.1631], [ 0.0159, -0.3061, 0.1643, ..., 0.0613, -0.2020, -0.1437]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 6.7987e-08, 4.9826e-08, ..., 7.1246e-08, 0.0000e+00, -4.6566e-10], [ 0.0000e+00, 4.6566e-10, -1.3970e-09, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 4.6566e-10, -8.8941e-08, -6.4261e-08, ..., -9.2667e-08, 0.0000e+00, -1.3970e-09], [ 4.6566e-10, 5.5879e-09, 5.1223e-09, ..., 6.0536e-09, 0.0000e+00, 4.6566e-10], [-4.1910e-09, 9.7789e-09, 1.3970e-09, ..., 9.7789e-09, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 287, bias, value: tensor([-0.0161, -0.0249, -0.0244, -0.0293, -0.0055, 0.0096, 0.0078, -0.0101, -0.0034, -0.0110], device='cuda:0'), grad: tensor([ 3.7719e-08, 1.8300e-07, -3.7253e-08, 9.3132e-09, 1.0096e-06, 1.5134e-07, -1.1791e-06, -2.5611e-07, 4.0047e-08, 4.0047e-08], device='cuda:0') 100 0.0001 changing lr epoch 286, time 220.25, cls_loss 0.0007 cls_loss_mapping 0.0014 cls_loss_causal 0.4385 re_mapping 0.0038 re_causal 0.0101 /// teacc 99.12 lr 0.00010000 Epoch 288, weight, value: tensor([[-0.2072, -0.2590, 0.1158, ..., -0.1346, 0.0487, 0.0382], [-0.1502, -0.0578, -0.0961, ..., -0.1992, -0.0877, -0.0287], [ 0.0218, -0.1453, -0.1797, ..., -0.1413, 0.0336, -0.3722], ..., [-0.2133, 0.1609, 0.0228, ..., 0.1970, -0.0550, -0.1504], [-0.1756, -0.1632, 0.1817, ..., -0.1528, -0.1532, 0.1630], [ 0.0156, -0.3065, 0.1647, ..., 0.0610, -0.2021, -0.1440]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 3.2596e-09, 4.6566e-10, ..., 1.3970e-09, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., -4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 3.4925e-08, 2.7940e-09, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 9.3132e-10, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 9.3132e-10, -3.7253e-09, ..., -2.7940e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 288, bias, value: tensor([-0.0166, -0.0248, -0.0244, -0.0293, -0.0048, 0.0095, 0.0078, -0.0103, -0.0037, -0.0115], device='cuda:0'), grad: tensor([ 5.1223e-09, 1.1409e-07, -1.1735e-07, -7.6368e-08, 3.7253e-09, 2.3749e-08, -1.8626e-09, 5.8208e-08, 6.9849e-09, -3.2596e-09], device='cuda:0') 100 0.0001 changing lr epoch 287, time 220.48, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4865 re_mapping 0.0036 re_causal 0.0105 /// teacc 99.09 lr 0.00010000 Epoch 289, weight, value: tensor([[-0.2088, -0.2603, 0.1156, ..., -0.1349, 0.0487, 0.0378], [-0.1507, -0.0580, -0.0962, ..., -0.1993, -0.0876, -0.0293], [ 0.0214, -0.1450, -0.1808, ..., -0.1418, 0.0336, -0.3727], ..., [-0.2137, 0.1610, 0.0228, ..., 0.1972, -0.0551, -0.1506], [-0.1760, -0.1636, 0.1819, ..., -0.1533, -0.1532, 0.1635], [ 0.0161, -0.3068, 0.1663, ..., 0.0619, -0.2021, -0.1442]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.5611e-09, 6.7521e-09, ..., 9.3132e-10, 0.0000e+00, 2.3516e-08], [ 9.3132e-10, 9.5228e-08, 6.9384e-08, ..., 6.4960e-08, 0.0000e+00, 2.4005e-07], [ 0.0000e+00, 3.5856e-08, 3.4925e-09, ..., 2.5146e-08, -1.1642e-09, 1.1874e-08], ..., [ 2.3283e-10, -1.1944e-07, 1.0710e-08, ..., -9.1502e-08, 9.3132e-10, 3.7951e-08], [ 1.6298e-09, -8.0559e-08, -2.3865e-07, ..., 9.3132e-10, 0.0000e+00, -8.3260e-07], [ 1.2340e-08, 4.6566e-09, -1.5367e-08, ..., 2.5914e-07, 0.0000e+00, 2.3749e-08]], device='cuda:0') Epoch 289, bias, value: tensor([-0.0172, -0.0250, -0.0239, -0.0291, -0.0051, 0.0095, 0.0078, -0.0104, -0.0041, -0.0111], device='cuda:0'), grad: tensor([ 8.3121e-08, 8.5169e-07, 9.8255e-08, 5.4948e-07, -1.3374e-06, 8.1584e-07, 2.3586e-07, -2.9337e-08, -2.6934e-06, 1.4259e-06], device='cuda:0') 100 0.0001 changing lr epoch 288, time 220.84, cls_loss 0.0009 cls_loss_mapping 0.0016 cls_loss_causal 0.4893 re_mapping 0.0036 re_causal 0.0103 /// teacc 99.07 lr 0.00010000 Epoch 290, weight, value: tensor([[-0.2095, -0.2627, 0.1153, ..., -0.1352, 0.0486, 0.0372], [-0.1515, -0.0579, -0.0963, ..., -0.1995, -0.0876, -0.0296], [ 0.0214, -0.1452, -0.1812, ..., -0.1426, 0.0337, -0.3729], ..., [-0.2139, 0.1611, 0.0229, ..., 0.1976, -0.0551, -0.1509], [-0.1770, -0.1649, 0.1834, ..., -0.1538, -0.1532, 0.1641], [ 0.0162, -0.3078, 0.1666, ..., 0.0620, -0.2021, -0.1461]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 6.9849e-10, 2.7940e-09, ..., 2.2352e-08, 0.0000e+00, 3.7253e-09], [ 6.3796e-08, 1.6997e-08, 8.6147e-09, ..., 2.0629e-07, 0.0000e+00, 6.5193e-09], [ 3.2596e-09, 4.1910e-09, 1.0012e-08, ..., 2.3283e-08, 0.0000e+00, 2.0955e-09], ..., [ 6.1234e-08, 1.0012e-07, 1.2503e-07, ..., 2.7358e-07, 0.0000e+00, 3.7253e-09], [ 8.3819e-09, 4.6566e-10, 3.7253e-09, ..., 2.9337e-08, 0.0000e+00, -1.8626e-09], [ 3.3760e-08, 5.8208e-09, -1.9278e-07, ..., -4.4471e-08, 0.0000e+00, 3.4925e-09]], device='cuda:0') Epoch 290, bias, value: tensor([-0.0184, -0.0250, -0.0237, -0.0293, -0.0054, 0.0095, 0.0079, -0.0105, -0.0037, -0.0116], device='cuda:0'), grad: tensor([ 1.1176e-07, 9.7323e-07, 7.5903e-08, -1.5576e-07, -3.4329e-06, 2.3632e-07, 6.7707e-07, 1.2554e-06, 1.2689e-07, 1.4156e-07], device='cuda:0') 100 0.0001 changing lr epoch 289, time 220.83, cls_loss 0.0011 cls_loss_mapping 0.0018 cls_loss_causal 0.4774 re_mapping 0.0035 re_causal 0.0100 /// teacc 99.04 lr 0.00010000 Epoch 291, weight, value: tensor([[-0.2100, -0.2657, 0.1152, ..., -0.1356, 0.0487, 0.0372], [-0.1486, -0.0604, -0.0975, ..., -0.2011, -0.0875, -0.0299], [ 0.0207, -0.1461, -0.1830, ..., -0.1442, 0.0337, -0.3734], ..., [-0.2174, 0.1636, 0.0234, ..., 0.1992, -0.0552, -0.1520], [-0.1780, -0.1656, 0.1854, ..., -0.1535, -0.1532, 0.1655], [ 0.0171, -0.3081, 0.1677, ..., 0.0617, -0.2021, -0.1477]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 1.3271e-08, 4.1910e-09, ..., 9.3132e-10, 0.0000e+00, 3.2596e-09], [ 1.1642e-09, 1.8161e-08, 3.0268e-09, ..., 4.6566e-10, 0.0000e+00, 2.4680e-07], [ 4.6566e-10, 3.9581e-09, 3.4925e-09, ..., 2.3283e-10, 0.0000e+00, 1.3504e-08], ..., [ 4.6566e-10, 1.9092e-08, 1.8626e-09, ..., 6.9849e-10, 0.0000e+00, 1.1874e-08], [ 2.0955e-09, 2.8173e-08, -9.7789e-09, ..., 4.6566e-10, 0.0000e+00, -1.5022e-06], [-1.9791e-08, 4.1910e-09, -4.4703e-08, ..., -1.0477e-08, 0.0000e+00, 7.5204e-08]], device='cuda:0') Epoch 291, bias, value: tensor([-0.0188, -0.0263, -0.0238, -0.0300, -0.0050, 0.0095, 0.0079, -0.0092, -0.0024, -0.0114], device='cuda:0'), grad: tensor([ 1.4063e-07, 7.9582e-07, -1.3150e-06, -1.5041e-06, 1.3458e-07, 3.1125e-06, 2.2016e-06, 1.2806e-07, -3.7737e-06, 8.8708e-08], device='cuda:0') 100 0.0001 changing lr epoch 290, time 221.39, cls_loss 0.0010 cls_loss_mapping 0.0015 cls_loss_causal 0.4808 re_mapping 0.0037 re_causal 0.0101 /// teacc 99.06 lr 0.00010000 Epoch 292, weight, value: tensor([[-0.2113, -0.2670, 0.1153, ..., -0.1358, 0.0487, 0.0372], [-0.1482, -0.0607, -0.0981, ..., -0.2013, -0.0873, -0.0304], [ 0.0207, -0.1466, -0.1838, ..., -0.1446, 0.0336, -0.3737], ..., [-0.2187, 0.1638, 0.0236, ..., 0.1993, -0.0553, -0.1526], [-0.1828, -0.1670, 0.1858, ..., -0.1539, -0.1533, 0.1623], [ 0.0172, -0.3087, 0.1686, ..., 0.0620, -0.2021, -0.1482]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, -1.2806e-08, ..., 0.0000e+00, -2.5611e-09, -1.2806e-08], [ 0.0000e+00, 4.4238e-09, 2.3283e-10, ..., 6.9849e-10, 0.0000e+00, -1.8859e-08], [ 0.0000e+00, -1.6764e-08, 5.5879e-09, ..., -1.7462e-08, 9.3132e-10, 1.2573e-08], ..., [ 0.0000e+00, 2.0256e-08, -2.3283e-10, ..., 1.6065e-08, 0.0000e+00, 5.1223e-09], [ 0.0000e+00, 9.3132e-10, -1.0245e-08, ..., -3.0268e-09, 2.3283e-10, -1.1874e-08], [ 2.3283e-10, 2.7940e-09, 7.9162e-09, ..., 2.5611e-09, 2.3283e-10, 1.0012e-08]], device='cuda:0') Epoch 292, bias, value: tensor([-0.0191, -0.0259, -0.0238, -0.0299, -0.0062, 0.0099, 0.0078, -0.0100, -0.0059, -0.0113], device='cuda:0'), grad: tensor([-4.1677e-08, -5.4715e-08, -7.4273e-08, -5.4017e-08, 2.6776e-08, 5.4948e-08, -4.4238e-09, 1.4040e-07, -1.4668e-08, 2.8405e-08], device='cuda:0') 100 0.0001 changing lr epoch 291, time 220.68, cls_loss 0.0007 cls_loss_mapping 0.0011 cls_loss_causal 0.4471 re_mapping 0.0036 re_causal 0.0101 /// teacc 99.19 lr 0.00010000 Epoch 293, weight, value: tensor([[-0.2115, -0.2677, 0.1153, ..., -0.1359, 0.0486, 0.0371], [-0.1483, -0.0606, -0.0979, ..., -0.2014, -0.0872, -0.0290], [ 0.0206, -0.1469, -0.1845, ..., -0.1452, 0.0335, -0.3738], ..., [-0.2186, 0.1638, 0.0233, ..., 0.1995, -0.0550, -0.1542], [-0.1829, -0.1678, 0.1862, ..., -0.1542, -0.1533, 0.1623], [ 0.0175, -0.3089, 0.1691, ..., 0.0623, -0.2022, -0.1485]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 8.4750e-08, 6.9849e-10, ..., 1.3970e-09, 0.0000e+00, 9.3132e-10], [ 6.9849e-10, 1.2317e-07, 2.3283e-10, ..., 1.3970e-09, 0.0000e+00, -1.9558e-08], [ 6.9849e-10, 2.8242e-07, 1.1642e-09, ..., 2.3283e-09, 0.0000e+00, 6.9849e-10], ..., [ 6.5193e-09, 9.8255e-08, 2.3283e-09, ..., 5.1223e-09, 0.0000e+00, 2.2585e-08], [ 1.3970e-09, 2.3050e-08, 1.3970e-09, ..., 3.0268e-09, 0.0000e+00, -1.6298e-09], [ 3.2596e-08, 2.7707e-08, -2.7008e-08, ..., 4.0513e-08, 0.0000e+00, 1.2340e-08]], device='cuda:0') Epoch 293, bias, value: tensor([-0.0193, -0.0255, -0.0236, -0.0295, -0.0066, 0.0099, 0.0078, -0.0104, -0.0060, -0.0111], device='cuda:0'), grad: tensor([ 3.3597e-07, 3.0082e-07, 9.7696e-07, -2.6394e-06, -1.9395e-07, 1.8626e-09, 1.6973e-07, 6.6590e-07, 1.0105e-07, 2.7963e-07], device='cuda:0') 100 0.0001 changing lr epoch 292, time 220.97, cls_loss 0.0007 cls_loss_mapping 0.0015 cls_loss_causal 0.4761 re_mapping 0.0036 re_causal 0.0102 /// teacc 99.10 lr 0.00010000 Epoch 294, weight, value: tensor([[-0.2116, -0.2683, 0.1154, ..., -0.1363, 0.0487, 0.0377], [-0.1490, -0.0607, -0.0980, ..., -0.2015, -0.0872, -0.0289], [ 0.0204, -0.1478, -0.1855, ..., -0.1461, 0.0334, -0.3742], ..., [-0.2184, 0.1641, 0.0231, ..., 0.1998, -0.0548, -0.1548], [-0.1829, -0.1683, 0.1865, ..., -0.1547, -0.1534, 0.1624], [ 0.0177, -0.3092, 0.1698, ..., 0.0625, -0.2022, -0.1486]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 4.6566e-09, ..., 4.6566e-10, 0.0000e+00, 7.2177e-09], [ 6.9849e-10, 1.7229e-08, 4.4238e-09, ..., 1.4901e-08, -2.3283e-10, -4.6566e-09], [ 2.3283e-10, 1.3970e-09, 4.6566e-10, ..., 9.3132e-10, 0.0000e+00, 1.6298e-09], ..., [ 6.9849e-10, -1.7229e-08, -2.3283e-10, ..., -1.3737e-08, 0.0000e+00, 2.5611e-09], [ 9.3132e-10, 2.3283e-10, -6.9849e-10, ..., 2.0955e-09, 0.0000e+00, -7.4506e-09], [-1.7928e-08, 9.3132e-10, -5.4482e-08, ..., -1.5367e-08, 0.0000e+00, 2.3283e-09]], device='cuda:0') Epoch 294, bias, value: tensor([-0.0189, -0.0255, -0.0240, -0.0300, -0.0068, 0.0099, 0.0078, -0.0102, -0.0060, -0.0110], device='cuda:0'), grad: tensor([ 1.5134e-08, 1.2806e-08, 1.2573e-08, 7.0548e-08, 4.5169e-08, -9.3132e-10, -1.9092e-08, -2.3283e-08, 1.8626e-09, -9.6159e-08], device='cuda:0') 100 0.0001 changing lr epoch 293, time 220.61, cls_loss 0.0008 cls_loss_mapping 0.0013 cls_loss_causal 0.5011 re_mapping 0.0035 re_causal 0.0103 /// teacc 99.02 lr 0.00010000 Epoch 295, weight, value: tensor([[-0.2120, -0.2694, 0.1156, ..., -0.1368, 0.0487, 0.0380], [-0.1491, -0.0618, -0.0981, ..., -0.2028, -0.0871, -0.0278], [ 0.0203, -0.1486, -0.1865, ..., -0.1469, 0.0334, -0.3746], ..., [-0.2189, 0.1654, 0.0232, ..., 0.2012, -0.0548, -0.1562], [-0.1829, -0.1684, 0.1873, ..., -0.1550, -0.1530, 0.1624], [ 0.0179, -0.3096, 0.1700, ..., 0.0625, -0.2023, -0.1490]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, 5.8208e-09, ..., 3.4925e-09, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 2.0955e-09, 2.0955e-09, ..., 1.8626e-09, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 9.5461e-09, 2.6776e-08, ..., 1.3039e-08, 0.0000e+00, 2.4447e-08], ..., [-2.5611e-09, -1.6298e-08, 3.0268e-09, ..., -1.7229e-08, 0.0000e+00, 2.0955e-09], [ 0.0000e+00, 6.9849e-10, -5.7975e-08, ..., -1.3039e-08, 0.0000e+00, -5.4250e-08], [ 2.5611e-09, 6.5193e-09, -1.9791e-08, ..., -3.7253e-09, 0.0000e+00, 4.1910e-09]], device='cuda:0') Epoch 295, bias, value: tensor([-0.0189, -0.0250, -0.0249, -0.0303, -0.0069, 0.0098, 0.0078, -0.0098, -0.0059, -0.0111], device='cuda:0'), grad: tensor([ 1.5134e-08, 7.6834e-09, 1.0221e-07, 3.8883e-08, 6.5193e-09, 1.5367e-08, 7.1712e-08, -2.3516e-08, -1.7881e-07, -2.9337e-08], device='cuda:0') 100 0.0001 changing lr epoch 294, time 220.69, cls_loss 0.0007 cls_loss_mapping 0.0015 cls_loss_causal 0.4783 re_mapping 0.0034 re_causal 0.0102 /// teacc 99.11 lr 0.00010000 Epoch 296, weight, value: tensor([[-0.2123, -0.2695, 0.1160, ..., -0.1369, 0.0485, 0.0386], [-0.1493, -0.0624, -0.0990, ..., -0.2028, -0.0869, -0.0285], [ 0.0203, -0.1492, -0.1872, ..., -0.1474, 0.0333, -0.3750], ..., [-0.2189, 0.1660, 0.0239, ..., 0.2013, -0.0549, -0.1556], [-0.1830, -0.1688, 0.1877, ..., -0.1551, -0.1530, 0.1625], [ 0.0175, -0.3098, 0.1700, ..., 0.0625, -0.2023, -0.1498]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 2.3283e-09, ..., 1.3970e-09, 0.0000e+00, 6.0536e-09], [ 0.0000e+00, 8.1491e-09, -9.3132e-09, ..., 6.2864e-09, 0.0000e+00, -1.9209e-07], [ 0.0000e+00, 3.9581e-09, 4.6566e-10, ..., 6.9849e-10, 0.0000e+00, 2.3283e-09], ..., [ 6.9849e-10, -1.6298e-09, 3.4459e-08, ..., 2.6310e-08, 0.0000e+00, 6.2864e-09], [ 4.1910e-09, 4.8894e-09, 3.0035e-08, ..., 2.5611e-08, 0.0000e+00, 7.5437e-08], [-6.9849e-09, 6.7521e-09, -9.2434e-08, ..., -8.4052e-08, 0.0000e+00, -2.4447e-08]], device='cuda:0') Epoch 296, bias, value: tensor([-0.0186, -0.0257, -0.0242, -0.0307, -0.0070, 0.0099, 0.0078, -0.0096, -0.0059, -0.0114], device='cuda:0'), grad: tensor([ 3.5623e-08, -7.6834e-07, -4.2608e-08, -7.1898e-07, 1.9325e-08, 7.3155e-07, 4.8522e-07, 1.3295e-07, 3.8696e-07, -2.5542e-07], device='cuda:0') 100 0.0001 changing lr epoch 295, time 220.73, cls_loss 0.0007 cls_loss_mapping 0.0016 cls_loss_causal 0.5134 re_mapping 0.0036 re_causal 0.0108 /// teacc 99.12 lr 0.00010000 Epoch 297, weight, value: tensor([[-0.2125, -0.2697, 0.1161, ..., -0.1370, 0.0487, 0.0384], [-0.1493, -0.0624, -0.0989, ..., -0.2030, -0.0865, -0.0268], [ 0.0200, -0.1495, -0.1885, ..., -0.1481, 0.0333, -0.3753], ..., [-0.2190, 0.1662, 0.0239, ..., 0.2016, -0.0550, -0.1561], [-0.1832, -0.1698, 0.1875, ..., -0.1557, -0.1534, 0.1624], [ 0.0173, -0.3101, 0.1706, ..., 0.0624, -0.2024, -0.1503]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 2.3283e-09, ..., 6.9849e-10, 0.0000e+00, 1.1874e-08], [ 6.9849e-10, 0.0000e+00, 5.3551e-09, ..., 2.3283e-10, 0.0000e+00, 9.0804e-09], [-1.2806e-08, 0.0000e+00, 1.0245e-08, ..., 4.6566e-10, 0.0000e+00, 1.6997e-08], ..., [ 1.1642e-09, 0.0000e+00, 3.9581e-09, ..., 2.7940e-09, 0.0000e+00, 2.3283e-09], [ 3.2596e-09, 0.0000e+00, -1.2573e-07, ..., 6.9849e-10, 0.0000e+00, -3.0175e-07], [ 1.3039e-08, 0.0000e+00, -6.9849e-09, ..., -7.2177e-09, 0.0000e+00, 5.8208e-09]], device='cuda:0') Epoch 297, bias, value: tensor([-0.0189, -0.0248, -0.0249, -0.0323, -0.0067, 0.0100, 0.0078, -0.0099, -0.0063, -0.0116], device='cuda:0'), grad: tensor([ 5.5647e-08, 3.3062e-08, -9.8022e-08, 5.5879e-08, 2.2119e-08, 9.3365e-08, 5.1269e-07, 1.6531e-08, -7.9721e-07, 1.0571e-07], device='cuda:0') 100 0.0001 changing lr epoch 296, time 220.81, cls_loss 0.0008 cls_loss_mapping 0.0017 cls_loss_causal 0.4788 re_mapping 0.0037 re_causal 0.0099 /// teacc 99.12 lr 0.00010000 Epoch 298, weight, value: tensor([[-0.2130, -0.2704, 0.1161, ..., -0.1377, 0.0487, 0.0385], [-0.1497, -0.0628, -0.0991, ..., -0.2042, -0.0863, -0.0270], [ 0.0199, -0.1500, -0.1890, ..., -0.1493, 0.0333, -0.3756], ..., [-0.2194, 0.1666, 0.0237, ..., 0.2027, -0.0548, -0.1571], [-0.1833, -0.1699, 0.1881, ..., -0.1558, -0.1536, 0.1623], [ 0.0180, -0.3105, 0.1715, ..., 0.0629, -0.2026, -0.1503]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 2.3283e-10, 3.9581e-09, ..., 2.0955e-09, 4.6566e-10, 4.6566e-09], [ 3.4925e-09, 4.1910e-09, 1.2573e-08, ..., 4.4238e-09, -3.0268e-09, 7.2177e-09], [ 6.9849e-10, 4.6566e-10, 2.5611e-09, ..., 4.6566e-10, 0.0000e+00, 3.7253e-09], ..., [ 4.4238e-08, 4.1910e-09, 1.2596e-07, ..., 8.5915e-08, 2.3283e-10, 3.2596e-08], [-5.4482e-08, 2.3283e-10, -1.7020e-07, ..., 2.5611e-09, 2.3283e-10, -2.6333e-07], [-5.0757e-08, -1.2107e-08, -1.4366e-07, ..., -1.4668e-07, 0.0000e+00, 4.6100e-08]], device='cuda:0') Epoch 298, bias, value: tensor([-0.0192, -0.0252, -0.0249, -0.0323, -0.0069, 0.0100, 0.0078, -0.0097, -0.0064, -0.0110], device='cuda:0'), grad: tensor([ 2.0955e-08, 2.6543e-08, 4.8894e-09, 9.4762e-08, 1.6228e-07, 4.2398e-07, -5.2387e-08, 4.0722e-07, -6.7893e-07, -4.0350e-07], device='cuda:0') 100 0.0001 changing lr epoch 297, time 220.38, cls_loss 0.0007 cls_loss_mapping 0.0017 cls_loss_causal 0.4921 re_mapping 0.0038 re_causal 0.0104 /// teacc 99.13 lr 0.00010000 Epoch 299, weight, value: tensor([[-0.2133, -0.2713, 0.1166, ..., -0.1381, 0.0491, 0.0388], [-0.1497, -0.0627, -0.0992, ..., -0.2042, -0.0862, -0.0260], [ 0.0199, -0.1502, -0.1904, ..., -0.1497, 0.0334, -0.3759], ..., [-0.2195, 0.1666, 0.0236, ..., 0.2027, -0.0552, -0.1585], [-0.1834, -0.1699, 0.1885, ..., -0.1560, -0.1536, 0.1623], [ 0.0174, -0.3109, 0.1721, ..., 0.0621, -0.2027, -0.1506]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, -3.7253e-09, ..., 4.6566e-10, 0.0000e+00, -4.1910e-09], [ 2.3283e-09, 2.3283e-09, 2.3283e-09, ..., 3.2596e-09, 0.0000e+00, 2.3283e-09], [ 0.0000e+00, 4.6566e-10, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 9.3132e-10], ..., [ 4.6566e-09, -2.7940e-09, 8.3819e-09, ..., 2.3283e-09, 0.0000e+00, 1.3970e-09], [ 4.6566e-10, -4.6566e-10, -2.7940e-09, ..., -4.6566e-10, 0.0000e+00, -9.3132e-10], [-1.3970e-08, 4.6566e-10, -2.0023e-08, ..., -1.2107e-08, 0.0000e+00, 1.3970e-09]], device='cuda:0') Epoch 299, bias, value: tensor([-0.0190, -0.0250, -0.0246, -0.0323, -0.0058, 0.0100, 0.0078, -0.0100, -0.0064, -0.0118], device='cuda:0'), grad: tensor([-1.8161e-08, 1.6764e-08, 3.2596e-09, 1.0245e-08, 2.4680e-08, 7.9162e-08, -7.9162e-08, 1.7695e-08, -2.3283e-09, -5.4482e-08], device='cuda:0') 100 0.0001 changing lr epoch 298, time 220.30, cls_loss 0.0008 cls_loss_mapping 0.0018 cls_loss_causal 0.4586 re_mapping 0.0036 re_causal 0.0099 /// teacc 99.08 lr 0.00010000 Epoch 300, weight, value: tensor([[-0.2136, -0.2722, 0.1184, ..., -0.1376, 0.0491, 0.0407], [-0.1509, -0.0625, -0.1001, ..., -0.2042, -0.0862, -0.0264], [ 0.0199, -0.1518, -0.1923, ..., -0.1517, 0.0337, -0.3764], ..., [-0.2192, 0.1667, 0.0234, ..., 0.2030, -0.0555, -0.1598], [-0.1834, -0.1700, 0.1905, ..., -0.1565, -0.1539, 0.1625], [ 0.0179, -0.3111, 0.1714, ..., 0.0621, -0.2027, -0.1534]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, -9.7789e-09, -2.4680e-08, ..., 0.0000e+00, 0.0000e+00, -3.4459e-08], [-9.3132e-10, 1.8626e-09, 0.0000e+00, ..., -9.3132e-10, 0.0000e+00, -4.6566e-10], ..., [ 4.6566e-10, 9.3132e-09, 1.9092e-08, ..., 1.3970e-09, 0.0000e+00, 2.3749e-08], [ 0.0000e+00, 0.0000e+00, -1.3970e-09, ..., 4.6566e-10, 0.0000e+00, -2.3283e-09], [-4.6566e-10, 3.2596e-09, 3.2596e-09, ..., -1.8626e-09, 0.0000e+00, 9.3132e-09]], device='cuda:0') Epoch 300, bias, value: tensor([-0.0172, -0.0251, -0.0249, -0.0322, -0.0059, 0.0100, 0.0078, -0.0101, -0.0063, -0.0128], device='cuda:0'), grad: tensor([ 4.6566e-09, -7.1712e-08, -2.3469e-07, 1.2107e-08, 1.3504e-08, 3.3528e-08, 6.1933e-08, 1.4622e-07, 2.3283e-09, 4.2841e-08], device='cuda:0') 100 0.0001 changing lr epoch 299, time 220.57, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4561 re_mapping 0.0035 re_causal 0.0100 /// teacc 99.08 lr 0.00010000 Epoch 301, weight, value: tensor([[-0.2141, -0.2731, 0.1176, ..., -0.1399, 0.0498, 0.0411], [-0.1509, -0.0625, -0.1001, ..., -0.2043, -0.0862, -0.0240], [ 0.0197, -0.1527, -0.1936, ..., -0.1524, 0.0337, -0.3770], ..., [-0.2194, 0.1671, 0.0233, ..., 0.2032, -0.0555, -0.1616], [-0.1836, -0.1705, 0.1907, ..., -0.1573, -0.1539, 0.1624], [ 0.0174, -0.3115, 0.1726, ..., 0.0628, -0.2028, -0.1549]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 0.0000e+00, -1.7975e-07, ..., 0.0000e+00, 0.0000e+00, 3.2596e-09], [ 2.3283e-09, 4.6566e-10, 8.4285e-08, ..., 0.0000e+00, 0.0000e+00, -5.5879e-08], [ 9.3132e-10, 0.0000e+00, 5.1223e-09, ..., 0.0000e+00, 0.0000e+00, 8.3819e-09], ..., [ 2.3283e-09, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 1.4435e-08], [ 1.0291e-07, 0.0000e+00, 6.0536e-09, ..., 1.8626e-09, 0.0000e+00, 8.3353e-08], [ 3.2596e-09, 0.0000e+00, -9.3132e-09, ..., -6.5193e-09, 0.0000e+00, 1.3970e-08]], device='cuda:0') Epoch 301, bias, value: tensor([-0.0178, -0.0244, -0.0250, -0.0326, -0.0059, 0.0100, 0.0078, -0.0107, -0.0066, -0.0123], device='cuda:0'), grad: tensor([-1.1269e-06, 2.6077e-07, 7.9162e-08, 2.0815e-07, 5.5414e-08, -4.4983e-07, 5.9512e-07, 7.8231e-08, 2.4866e-07, 4.9826e-08], device='cuda:0') 100 0.0001 changing lr epoch 300, time 220.59, cls_loss 0.0011 cls_loss_mapping 0.0021 cls_loss_causal 0.5051 re_mapping 0.0036 re_causal 0.0099 /// teacc 99.08 lr 0.00010000 Epoch 302, weight, value: tensor([[-0.2160, -0.2781, 0.1176, ..., -0.1403, 0.0488, 0.0409], [-0.1510, -0.0632, -0.0995, ..., -0.2051, -0.0841, -0.0206], [ 0.0195, -0.1567, -0.1957, ..., -0.1547, 0.0328, -0.3782], ..., [-0.2197, 0.1692, 0.0245, ..., 0.2048, -0.0568, -0.1623], [-0.1839, -0.1742, 0.1886, ..., -0.1581, -0.1545, 0.1619], [ 0.0173, -0.3146, 0.1728, ..., 0.0622, -0.2033, -0.1556]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.3970e-09, -2.1886e-08, ..., 1.3970e-09, 0.0000e+00, 4.6566e-10], [ 4.6566e-10, -2.2398e-07, -4.6100e-08, ..., 2.0955e-08, -2.3283e-09, -4.0047e-08], [ 0.0000e+00, 5.1223e-09, 5.1223e-09, ..., -2.7940e-09, 4.6566e-10, 1.6764e-08], ..., [ 4.6566e-10, 1.8673e-07, 2.7008e-08, ..., -3.1665e-08, 1.3970e-09, 1.1176e-08], [ 3.2596e-09, 0.0000e+00, 1.3970e-09, ..., 0.0000e+00, 9.3132e-10, 1.1176e-08], [-4.6566e-09, 2.1420e-08, 0.0000e+00, ..., 6.5193e-09, 0.0000e+00, 1.8626e-09]], device='cuda:0') Epoch 302, bias, value: tensor([-0.0181, -0.0231, -0.0267, -0.0321, -0.0052, 0.0100, 0.0078, -0.0099, -0.0081, -0.0130], device='cuda:0'), grad: tensor([-2.9802e-08, -9.8161e-07, -5.2527e-07, 9.0804e-08, 9.3132e-08, -4.6194e-06, 4.9770e-06, 8.8103e-07, 4.0513e-08, 6.3330e-08], device='cuda:0') 100 0.0001 changing lr epoch 301, time 220.28, cls_loss 0.0009 cls_loss_mapping 0.0018 cls_loss_causal 0.4722 re_mapping 0.0035 re_causal 0.0097 /// teacc 99.09 lr 0.00010000 Epoch 303, weight, value: tensor([[-0.2162, -0.2793, 0.1177, ..., -0.1405, 0.0490, 0.0412], [-0.1511, -0.0633, -0.0997, ..., -0.2052, -0.0841, -0.0206], [ 0.0196, -0.1570, -0.1957, ..., -0.1547, 0.0324, -0.3771], ..., [-0.2196, 0.1696, 0.0250, ..., 0.2052, -0.0557, -0.1628], [-0.1838, -0.1748, 0.1895, ..., -0.1580, -0.1546, 0.1620], [ 0.0176, -0.3155, 0.1727, ..., 0.0614, -0.2035, -0.1566]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 1.2573e-08, 4.4238e-08, ..., 2.2817e-08, 0.0000e+00, 5.9605e-08], [ 5.5879e-09, 9.3132e-09, 3.5856e-08, ..., 1.3504e-08, 0.0000e+00, 3.9581e-08], [ 2.3283e-09, 1.7695e-08, 4.8894e-08, ..., 1.3970e-08, 0.0000e+00, 4.6566e-08], ..., [ 6.9849e-09, -1.1967e-07, -1.8673e-07, ..., -1.6345e-07, 0.0000e+00, 3.7719e-08], [ 3.0268e-08, 4.6566e-09, -7.7579e-07, ..., 2.6543e-08, 0.0000e+00, -1.0431e-06], [-9.8255e-08, 4.1910e-08, 2.0023e-08, ..., -1.5879e-07, 0.0000e+00, 1.2759e-07]], device='cuda:0') Epoch 303, bias, value: tensor([-0.0181, -0.0231, -0.0259, -0.0321, -0.0044, 0.0100, 0.0078, -0.0099, -0.0082, -0.0141], device='cuda:0'), grad: tensor([ 2.9104e-07, 1.5227e-07, 1.8300e-07, 7.5437e-08, 7.4320e-07, 2.2314e-06, -8.5309e-07, -4.8755e-07, -2.2687e-06, -6.8452e-08], device='cuda:0') 100 0.0001 changing lr epoch 302, time 220.39, cls_loss 0.0007 cls_loss_mapping 0.0011 cls_loss_causal 0.4841 re_mapping 0.0037 re_causal 0.0103 /// teacc 99.06 lr 0.00010000 Epoch 304, weight, value: tensor([[-0.2164, -0.2822, 0.1176, ..., -0.1408, 0.0488, 0.0410], [-0.1513, -0.0633, -0.0997, ..., -0.2054, -0.0838, -0.0208], [ 0.0196, -0.1576, -0.1973, ..., -0.1556, 0.0327, -0.3778], ..., [-0.2197, 0.1700, 0.0252, ..., 0.2057, -0.0560, -0.1628], [-0.1839, -0.1749, 0.1916, ..., -0.1584, -0.1546, 0.1623], [ 0.0176, -0.3163, 0.1727, ..., 0.0609, -0.2039, -0.1581]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, -1.8626e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 4.1910e-09], ..., [ 9.3132e-10, 4.6566e-10, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 5.5879e-09], [ 2.3283e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09], [-1.8626e-09, 0.0000e+00, -3.2596e-09, ..., -1.3970e-09, 0.0000e+00, 1.8626e-09]], device='cuda:0') Epoch 304, bias, value: tensor([-0.0185, -0.0231, -0.0259, -0.0321, -0.0037, 0.0100, 0.0078, -0.0098, -0.0076, -0.0149], device='cuda:0'), grad: tensor([ 1.6298e-08, -7.0874e-07, 1.5832e-07, 5.2620e-08, 3.5437e-07, -2.2817e-08, 2.3283e-09, 1.2992e-07, 1.2573e-08, 1.4435e-08], device='cuda:0') 100 0.0001 changing lr epoch 303, time 220.54, cls_loss 0.0006 cls_loss_mapping 0.0013 cls_loss_causal 0.4616 re_mapping 0.0037 re_causal 0.0101 /// teacc 99.07 lr 0.00010000 Epoch 305, weight, value: tensor([[-0.2167, -0.2837, 0.1176, ..., -0.1409, 0.0484, 0.0409], [-0.1512, -0.0634, -0.0998, ..., -0.2054, -0.0833, -0.0207], [ 0.0196, -0.1576, -0.1981, ..., -0.1556, 0.0325, -0.3787], ..., [-0.2200, 0.1702, 0.0247, ..., 0.2056, -0.0562, -0.1634], [-0.1840, -0.1750, 0.1919, ..., -0.1588, -0.1546, 0.1624], [ 0.0178, -0.3169, 0.1731, ..., 0.0612, -0.2040, -0.1583]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.1176e-08, 1.3970e-09, ..., 3.2596e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.2596e-09, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, -9.3132e-10, ..., -3.2596e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 305, bias, value: tensor([-0.0188, -0.0230, -0.0262, -0.0305, -0.0039, 0.0099, 0.0078, -0.0101, -0.0076, -0.0150], device='cuda:0'), grad: tensor([ 6.9849e-09, 2.8871e-08, 6.5193e-09, -4.0978e-08, 4.7032e-08, 6.0536e-09, -5.9605e-08, 6.0536e-09, 2.3283e-09, 5.5879e-09], device='cuda:0') 100 0.0001 changing lr epoch 304, time 220.62, cls_loss 0.0007 cls_loss_mapping 0.0016 cls_loss_causal 0.4944 re_mapping 0.0035 re_causal 0.0102 /// teacc 99.00 lr 0.00010000 Epoch 306, weight, value: tensor([[-0.2170, -0.2869, 0.1176, ..., -0.1411, 0.0484, 0.0411], [-0.1512, -0.0655, -0.1013, ..., -0.2058, -0.0835, -0.0206], [ 0.0196, -0.1579, -0.1993, ..., -0.1557, 0.0309, -0.3792], ..., [-0.2203, 0.1726, 0.0261, ..., 0.2059, -0.0564, -0.1636], [-0.1840, -0.1752, 0.1924, ..., -0.1587, -0.1547, 0.1625], [ 0.0178, -0.3178, 0.1733, ..., 0.0609, -0.2040, -0.1588]], device='cuda:0'), grad: tensor([[ 4.1444e-08, 1.8626e-09, 4.6566e-08, ..., 9.7789e-09, 0.0000e+00, 8.8941e-08], [ 1.8626e-09, 1.8161e-08, 2.0023e-08, ..., 9.3132e-09, 0.0000e+00, 1.6764e-08], [-6.0536e-09, 5.9139e-08, 2.4727e-07, ..., 3.6787e-08, -4.6566e-10, 2.7008e-07], ..., [ 2.3283e-09, 1.0710e-08, 1.5460e-06, ..., 2.7986e-07, 0.0000e+00, 1.7043e-06], [ 4.6566e-09, 4.6566e-10, -2.3581e-06, ..., -4.6706e-07, 0.0000e+00, -2.5630e-06], [ 1.1176e-08, 1.5832e-08, 3.9302e-07, ..., 1.0291e-07, 0.0000e+00, 4.1490e-07]], device='cuda:0') Epoch 306, bias, value: tensor([-0.0190, -0.0241, -0.0263, -0.0307, -0.0034, 0.0099, 0.0079, -0.0085, -0.0075, -0.0155], device='cuda:0'), grad: tensor([ 2.1188e-07, 1.3830e-07, 4.0140e-07, 3.4273e-07, 3.1665e-08, -5.1921e-07, 1.3690e-07, 3.5018e-06, -5.1931e-06, 9.4110e-07], device='cuda:0') 100 0.0001 changing lr epoch 305, time 220.55, cls_loss 0.0013 cls_loss_mapping 0.0021 cls_loss_causal 0.4690 re_mapping 0.0035 re_causal 0.0093 /// teacc 99.08 lr 0.00010000 Epoch 307, weight, value: tensor([[-0.2206, -0.2879, 0.1178, ..., -0.1413, 0.0465, 0.0394], [-0.1520, -0.0657, -0.1016, ..., -0.2059, -0.0837, -0.0210], [ 0.0182, -0.1587, -0.2003, ..., -0.1564, 0.0299, -0.3815], ..., [-0.2179, 0.1730, 0.0260, ..., 0.2083, -0.0564, -0.1675], [-0.1851, -0.1755, 0.1938, ..., -0.1582, -0.1557, 0.1623], [ 0.0166, -0.3188, 0.1738, ..., 0.0604, -0.2045, -0.1604]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 3.7253e-09], [ 0.0000e+00, 6.5658e-08, 0.0000e+00, ..., 4.6566e-09, 0.0000e+00, 1.8626e-09], [ 0.0000e+00, 1.0710e-08, 0.0000e+00, ..., -1.3970e-08, 0.0000e+00, 1.3970e-09], ..., [ 4.6566e-10, -1.1269e-07, 0.0000e+00, ..., 3.7253e-09, 0.0000e+00, 4.6566e-10], [ 3.2596e-09, 0.0000e+00, -1.8626e-09, ..., 2.7940e-09, 0.0000e+00, 1.8626e-09], [ 1.3504e-08, 2.1886e-08, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 1.0710e-08]], device='cuda:0') Epoch 307, bias, value: tensor([-0.0196, -0.0251, -0.0246, -0.0315, -0.0050, 0.0101, 0.0079, -0.0073, -0.0083, -0.0166], device='cuda:0'), grad: tensor([ 1.2200e-07, 8.0047e-07, -2.1718e-06, 1.5181e-07, 7.2643e-08, 1.0151e-07, -9.3132e-10, 3.6135e-07, 4.0792e-07, 1.6531e-07], device='cuda:0') 100 0.0001 changing lr epoch 306, time 220.36, cls_loss 0.0007 cls_loss_mapping 0.0018 cls_loss_causal 0.4795 re_mapping 0.0034 re_causal 0.0100 /// teacc 99.10 lr 0.00010000 Epoch 308, weight, value: tensor([[-0.2209, -0.2887, 0.1178, ..., -0.1415, 0.0465, 0.0392], [-0.1514, -0.0659, -0.1018, ..., -0.2062, -0.0837, -0.0208], [ 0.0182, -0.1591, -0.2008, ..., -0.1568, 0.0299, -0.3815], ..., [-0.2181, 0.1733, 0.0258, ..., 0.2085, -0.0563, -0.1683], [-0.1851, -0.1758, 0.1939, ..., -0.1587, -0.1558, 0.1625], [ 0.0166, -0.3191, 0.1743, ..., 0.0608, -0.2045, -0.1602]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-09], [ 0.0000e+00, 4.6566e-10, -1.3970e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 308, bias, value: tensor([-0.0198, -0.0249, -0.0245, -0.0314, -0.0050, 0.0100, 0.0079, -0.0076, -0.0083, -0.0164], device='cuda:0'), grad: tensor([ 4.6566e-10, 1.8626e-09, 9.3132e-10, 1.8626e-09, -2.7940e-09, 1.3970e-09, -6.0536e-09, 4.6566e-10, 1.1642e-08, 1.3970e-09], device='cuda:0') 100 0.0001 changing lr epoch 307, time 220.87, cls_loss 0.0006 cls_loss_mapping 0.0016 cls_loss_causal 0.4396 re_mapping 0.0034 re_causal 0.0098 /// teacc 99.05 lr 0.00010000 Epoch 309, weight, value: tensor([[-0.2211, -0.2892, 0.1178, ..., -0.1416, 0.0464, 0.0391], [-0.1516, -0.0665, -0.1023, ..., -0.2065, -0.0838, -0.0213], [ 0.0182, -0.1593, -0.2011, ..., -0.1571, 0.0298, -0.3816], ..., [-0.2183, 0.1737, 0.0255, ..., 0.2085, -0.0560, -0.1684], [-0.1853, -0.1762, 0.1945, ..., -0.1598, -0.1558, 0.1627], [ 0.0170, -0.3194, 0.1748, ..., 0.0613, -0.2046, -0.1604]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -2.3283e-09, ..., 9.3132e-10, 0.0000e+00, -3.7253e-09], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 1.1642e-08, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.0291e-08, 0.0000e+00, 9.3132e-10], [ 4.6566e-10, 0.0000e+00, -1.3970e-09, ..., 0.0000e+00, 0.0000e+00, -2.7940e-09], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 2.2352e-07, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 309, bias, value: tensor([-0.0200, -0.0252, -0.0244, -0.0310, -0.0051, 0.0100, 0.0079, -0.0075, -0.0081, -0.0160], device='cuda:0'), grad: tensor([-8.3819e-09, -4.8429e-08, -5.1223e-09, 7.4506e-09, -5.9186e-07, 3.7253e-09, 2.0489e-08, 1.6997e-07, -5.1223e-09, 4.7358e-07], device='cuda:0') 100 0.0001 changing lr epoch 308, time 220.49, cls_loss 0.0009 cls_loss_mapping 0.0026 cls_loss_causal 0.4588 re_mapping 0.0036 re_causal 0.0099 /// teacc 99.16 lr 0.00010000 Epoch 310, weight, value: tensor([[-0.2211, -0.2901, 0.1184, ..., -0.1418, 0.0464, 0.0415], [-0.1517, -0.0668, -0.1026, ..., -0.2082, -0.0838, -0.0198], [ 0.0181, -0.1601, -0.2015, ..., -0.1577, 0.0297, -0.3819], ..., [-0.2183, 0.1741, 0.0252, ..., 0.2093, -0.0562, -0.1716], [-0.1854, -0.1796, 0.1945, ..., -0.1610, -0.1560, 0.1628], [ 0.0170, -0.3199, 0.1753, ..., 0.0608, -0.2046, -0.1609]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.7253e-09, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 5.2201e-07, 4.6566e-10, ..., 2.3283e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.2375e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 4.6566e-10, -6.6264e-07, 1.3970e-09, ..., -9.3132e-10, 0.0000e+00, 4.6566e-10], [ 4.6566e-10, 4.6566e-10, 9.3132e-10, ..., 1.3970e-09, 0.0000e+00, -9.3132e-10], [-2.7940e-09, 1.0384e-07, -1.1642e-08, ..., -9.3132e-09, 0.0000e+00, -9.3132e-10]], device='cuda:0') Epoch 310, bias, value: tensor([-0.0182, -0.0252, -0.0244, -0.0306, -0.0041, 0.0099, 0.0079, -0.0083, -0.0085, -0.0163], device='cuda:0'), grad: tensor([ 3.3062e-08, 1.7677e-06, 8.7544e-08, -1.4110e-07, 1.0291e-07, 1.1595e-07, -9.2201e-08, -2.1812e-06, 6.9849e-09, 3.0501e-07], device='cuda:0') 100 0.0001 changing lr epoch 309, time 220.90, cls_loss 0.0006 cls_loss_mapping 0.0012 cls_loss_causal 0.4638 re_mapping 0.0036 re_causal 0.0100 /// teacc 99.15 lr 0.00010000 Epoch 311, weight, value: tensor([[-0.2213, -0.2903, 0.1183, ..., -0.1419, 0.0464, 0.0411], [-0.1517, -0.0676, -0.1039, ..., -0.2091, -0.0838, -0.0214], [ 0.0180, -0.1605, -0.2021, ..., -0.1580, 0.0299, -0.3822], ..., [-0.2183, 0.1748, 0.0252, ..., 0.2097, -0.0564, -0.1713], [-0.1854, -0.1798, 0.1959, ..., -0.1607, -0.1560, 0.1632], [ 0.0169, -0.3201, 0.1757, ..., 0.0611, -0.2046, -0.1615]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 9.3132e-10, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 6.9849e-08, 3.0268e-08, ..., 5.3551e-08, -4.6566e-10, -2.7940e-09], [ 0.0000e+00, 1.3970e-09, 4.6566e-10, ..., 9.3132e-10, 4.6566e-10, 1.3970e-09], ..., [ 0.0000e+00, -1.2061e-07, -6.2864e-08, ..., -1.2154e-07, 4.6566e-10, 9.3132e-10], [ 4.6566e-10, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 4.1910e-09], [ 4.6566e-10, 4.6100e-08, 2.4214e-08, ..., 5.8673e-08, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 311, bias, value: tensor([-0.0184, -0.0259, -0.0244, -0.0304, -0.0041, 0.0098, 0.0080, -0.0081, -0.0080, -0.0158], device='cuda:0'), grad: tensor([ 2.1886e-08, 1.7742e-07, -1.7323e-07, 3.5390e-08, 1.3411e-07, -3.8184e-08, -2.4214e-08, -3.2550e-07, 4.8429e-08, 1.3690e-07], device='cuda:0') 100 0.0001 changing lr epoch 310, time 220.13, cls_loss 0.0007 cls_loss_mapping 0.0019 cls_loss_causal 0.4751 re_mapping 0.0034 re_causal 0.0098 /// teacc 99.05 lr 0.00010000 Epoch 312, weight, value: tensor([[-0.2215, -0.2905, 0.1183, ..., -0.1420, 0.0464, 0.0409], [-0.1517, -0.0685, -0.1052, ..., -0.2100, -0.0838, -0.0214], [ 0.0180, -0.1610, -0.2026, ..., -0.1586, 0.0300, -0.3825], ..., [-0.2184, 0.1757, 0.0262, ..., 0.2104, -0.0566, -0.1711], [-0.1855, -0.1799, 0.1961, ..., -0.1609, -0.1560, 0.1631], [ 0.0171, -0.3204, 0.1761, ..., 0.0615, -0.2047, -0.1619]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 1.3970e-09, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 1.4435e-08, 1.0710e-08, ..., 2.4680e-08, 0.0000e+00, 2.2817e-08], [ 0.0000e+00, 4.1910e-09, 2.3283e-09, ..., 5.5879e-09, 0.0000e+00, 4.1910e-09], ..., [ 9.3132e-10, -2.2817e-08, 6.9849e-09, ..., -3.3528e-08, 0.0000e+00, 4.6566e-10], [ 3.2596e-09, 0.0000e+00, -7.1712e-08, ..., 1.3970e-09, 0.0000e+00, -1.8347e-07], [ 1.8626e-09, 1.8626e-09, -2.7474e-08, ..., -2.0489e-08, 0.0000e+00, 6.5193e-09]], device='cuda:0') Epoch 312, bias, value: tensor([-0.0187, -0.0261, -0.0244, -0.0304, -0.0045, 0.0098, 0.0080, -0.0080, -0.0082, -0.0156], device='cuda:0'), grad: tensor([ 9.3132e-09, 1.5507e-07, 2.7474e-08, 1.6205e-07, 4.2375e-08, 2.0955e-08, 3.7625e-07, -9.5461e-08, -6.4168e-07, -4.9360e-08], device='cuda:0') 100 0.0001 changing lr epoch 311, time 220.66, cls_loss 0.0007 cls_loss_mapping 0.0012 cls_loss_causal 0.4662 re_mapping 0.0032 re_causal 0.0094 /// teacc 99.06 lr 0.00010000 Epoch 313, weight, value: tensor([[-0.2217, -0.2906, 0.1182, ..., -0.1424, 0.0464, 0.0409], [-0.1518, -0.0688, -0.1054, ..., -0.2104, -0.0832, -0.0215], [ 0.0179, -0.1634, -0.2033, ..., -0.1618, 0.0297, -0.3832], ..., [-0.2184, 0.1764, 0.0262, ..., 0.2111, -0.0566, -0.1713], [-0.1855, -0.1800, 0.1966, ..., -0.1613, -0.1561, 0.1633], [ 0.0181, -0.3207, 0.1770, ..., 0.0620, -0.2047, -0.1622]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 2.3283e-09, 1.3970e-09, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 1.3970e-09, 1.8626e-09, ..., 9.3132e-10, 0.0000e+00, 1.8626e-09], [ 0.0000e+00, 5.1223e-09, 3.2596e-09, ..., 4.6566e-09, 0.0000e+00, 9.3132e-10], ..., [ 4.6566e-10, -6.9849e-09, -1.8626e-09, ..., -6.5193e-09, 0.0000e+00, 1.3970e-09], [ 4.1910e-09, 4.6566e-10, -3.5856e-08, ..., 9.3132e-10, 0.0000e+00, -7.0781e-08], [ 5.5879e-09, 1.8626e-09, -1.3970e-08, ..., -1.0710e-08, 0.0000e+00, 6.9849e-09]], device='cuda:0') Epoch 313, bias, value: tensor([-0.0188, -0.0263, -0.0247, -0.0304, -0.0051, 0.0098, 0.0081, -0.0077, -0.0079, -0.0150], device='cuda:0'), grad: tensor([ 9.7789e-09, 9.3132e-10, 5.5879e-09, -4.1910e-09, 3.9581e-08, -7.4878e-07, 9.0478e-07, 8.8476e-09, -1.9372e-07, -1.1642e-08], device='cuda:0') 100 0.0001 changing lr epoch 312, time 220.35, cls_loss 0.0006 cls_loss_mapping 0.0012 cls_loss_causal 0.5017 re_mapping 0.0035 re_causal 0.0102 /// teacc 99.16 lr 0.00010000 Epoch 314, weight, value: tensor([[-0.2218, -0.2908, 0.1183, ..., -0.1425, 0.0463, 0.0409], [-0.1519, -0.0690, -0.1056, ..., -0.2107, -0.0833, -0.0215], [ 0.0179, -0.1639, -0.2039, ..., -0.1622, 0.0306, -0.3838], ..., [-0.2184, 0.1767, 0.0262, ..., 0.2113, -0.0587, -0.1715], [-0.1856, -0.1801, 0.1974, ..., -0.1614, -0.1568, 0.1635], [ 0.0182, -0.3209, 0.1772, ..., 0.0622, -0.2047, -0.1626]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 2.3283e-09, 5.5879e-09, ..., 5.1223e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.4214e-08, 1.3970e-09, ..., 2.1886e-08, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 3.7719e-08, 1.3970e-09, ..., 4.0513e-08, 0.0000e+00, 0.0000e+00], ..., [ 2.7940e-09, -5.2620e-08, 1.5832e-08, ..., -5.2620e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-5.5879e-09, 9.3132e-10, -3.3528e-08, ..., -2.2817e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 314, bias, value: tensor([-0.0189, -0.0265, -0.0245, -0.0303, -0.0052, 0.0098, 0.0081, -0.0078, -0.0076, -0.0150], device='cuda:0'), grad: tensor([ 2.1886e-08, 3.0734e-08, 1.2247e-07, -2.6878e-06, 2.2817e-08, 2.6505e-06, 9.3132e-10, -9.2667e-08, 1.2573e-08, -7.0315e-08], device='cuda:0') 100 0.0001 changing lr epoch 313, time 220.82, cls_loss 0.0007 cls_loss_mapping 0.0010 cls_loss_causal 0.4663 re_mapping 0.0036 re_causal 0.0098 /// teacc 99.07 lr 0.00010000 Epoch 315, weight, value: tensor([[-0.2219, -0.2911, 0.1184, ..., -0.1426, 0.0464, 0.0412], [-0.1520, -0.0694, -0.1060, ..., -0.2111, -0.0833, -0.0214], [ 0.0179, -0.1644, -0.2045, ..., -0.1626, 0.0306, -0.3841], ..., [-0.2183, 0.1771, 0.0264, ..., 0.2118, -0.0588, -0.1716], [-0.1858, -0.1805, 0.1978, ..., -0.1617, -0.1571, 0.1635], [ 0.0184, -0.3212, 0.1775, ..., 0.0623, -0.2048, -0.1630]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 1.3970e-09, 8.3819e-09, ..., 3.2596e-09, 0.0000e+00, 9.3132e-10], [ 9.3132e-10, 9.3132e-10, 1.4296e-07, ..., 3.7253e-09, 0.0000e+00, 4.6566e-10], [ 9.3132e-10, 0.0000e+00, 1.8626e-09, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 3.2596e-09, -7.4506e-09, 4.6566e-10, ..., -1.3970e-09, 0.0000e+00, -1.3970e-09], [ 9.3132e-10, 0.0000e+00, 3.7253e-09, ..., 1.3970e-09, 0.0000e+00, 9.3132e-10], [-2.4680e-08, 3.2596e-09, -3.4925e-07, ..., -5.4948e-08, 0.0000e+00, 2.7940e-09]], device='cuda:0') Epoch 315, bias, value: tensor([-0.0187, -0.0264, -0.0249, -0.0303, -0.0053, 0.0098, 0.0081, -0.0076, -0.0078, -0.0149], device='cuda:0'), grad: tensor([ 3.7253e-08, 5.7695e-07, -1.7695e-08, 1.3784e-07, 6.7009e-07, -4.5169e-08, 9.3132e-09, 1.4901e-08, 1.9092e-08, -1.3970e-06], device='cuda:0') 100 0.0001 changing lr epoch 314, time 220.43, cls_loss 0.0007 cls_loss_mapping 0.0016 cls_loss_causal 0.4788 re_mapping 0.0032 re_causal 0.0097 /// teacc 99.14 lr 0.00010000 Epoch 316, weight, value: tensor([[-0.2221, -0.2913, 0.1186, ..., -0.1427, 0.0465, 0.0416], [-0.1520, -0.0697, -0.1062, ..., -0.2114, -0.0833, -0.0213], [ 0.0179, -0.1648, -0.2039, ..., -0.1631, 0.0307, -0.3842], ..., [-0.2183, 0.1775, 0.0265, ..., 0.2122, -0.0588, -0.1717], [-0.1858, -0.1806, 0.1978, ..., -0.1618, -0.1571, 0.1636], [ 0.0184, -0.3216, 0.1777, ..., 0.0618, -0.2049, -0.1633]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 1.8626e-09, 4.6566e-10, ..., 9.3132e-10, 2.3283e-09, 0.0000e+00], [-3.7253e-09, -1.3970e-09, 4.6566e-10, ..., 9.3132e-10, 4.6566e-10, 0.0000e+00], ..., [ 0.0000e+00, -5.5879e-09, -2.3283e-09, ..., -3.7253e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 3.7253e-09, 5.1223e-09, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 316, bias, value: tensor([-0.0185, -0.0262, -0.0246, -0.0305, -0.0045, 0.0098, 0.0080, -0.0079, -0.0078, -0.0155], device='cuda:0'), grad: tensor([ 1.9558e-08, 3.0268e-08, -1.9325e-07, 6.9849e-09, -1.2852e-07, 3.1665e-08, 2.9802e-08, -4.6566e-09, 1.8626e-09, 2.1094e-07], device='cuda:0') 100 0.0001 changing lr epoch 315, time 220.70, cls_loss 0.0009 cls_loss_mapping 0.0028 cls_loss_causal 0.4688 re_mapping 0.0033 re_causal 0.0093 /// teacc 99.06 lr 0.00010000 Epoch 317, weight, value: tensor([[-0.2223, -0.2917, 0.1187, ..., -0.1428, 0.0466, 0.0417], [-0.1522, -0.0734, -0.1098, ..., -0.2118, -0.0834, -0.0242], [ 0.0178, -0.1659, -0.2053, ..., -0.1639, 0.0311, -0.3848], ..., [-0.2184, 0.1814, 0.0299, ..., 0.2126, -0.0590, -0.1689], [-0.1872, -0.1808, 0.1986, ..., -0.1619, -0.1571, 0.1630], [ 0.0182, -0.3230, 0.1777, ..., 0.0613, -0.2052, -0.1642]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, -2.9337e-08, ..., -3.2596e-09, 0.0000e+00, -1.0245e-08], [ 1.8626e-09, 4.6566e-10, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, -6.9849e-09], [ 4.6566e-10, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], ..., [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 7.4506e-09], [ 4.6566e-09, 0.0000e+00, 3.7253e-09, ..., 4.6566e-10, 0.0000e+00, 4.6566e-09], [ 3.7253e-09, 0.0000e+00, 1.8626e-08, ..., 1.3970e-09, 0.0000e+00, 9.7789e-09]], device='cuda:0') Epoch 317, bias, value: tensor([-0.0185, -0.0291, -0.0251, -0.0298, -0.0039, 0.0097, 0.0082, -0.0047, -0.0085, -0.0164], device='cuda:0'), grad: tensor([-6.5658e-08, -1.5972e-07, 1.2573e-08, 1.1176e-08, -4.2375e-08, -1.8859e-07, 1.6717e-07, 1.8394e-07, 1.9092e-08, 6.4727e-08], device='cuda:0') 100 0.0001 changing lr epoch 316, time 220.64, cls_loss 0.0007 cls_loss_mapping 0.0015 cls_loss_causal 0.4742 re_mapping 0.0036 re_causal 0.0099 /// teacc 99.13 lr 0.00010000 Epoch 318, weight, value: tensor([[-0.2228, -0.2923, 0.1188, ..., -0.1429, 0.0466, 0.0416], [-0.1526, -0.0736, -0.1099, ..., -0.2119, -0.0836, -0.0244], [ 0.0175, -0.1664, -0.2075, ..., -0.1645, 0.0311, -0.3856], ..., [-0.2186, 0.1816, 0.0300, ..., 0.2130, -0.0592, -0.1688], [-0.1881, -0.1810, 0.1993, ..., -0.1621, -0.1575, 0.1625], [ 0.0178, -0.3244, 0.1778, ..., 0.0611, -0.2054, -0.1648]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -1.3970e-08, ..., 4.6566e-10, 0.0000e+00, -8.3819e-09], [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 9.3132e-10, -2.7940e-09, -8.8476e-09], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, -9.3132e-10, 4.6566e-10], ..., [ 0.0000e+00, 0.0000e+00, 1.9092e-08, ..., 1.3504e-08, 1.3970e-09, 2.7940e-09], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 1.8626e-09, 0.0000e+00, 9.3132e-10], [-9.3132e-10, 0.0000e+00, -3.3062e-08, ..., -2.9337e-08, 0.0000e+00, 6.9849e-09]], device='cuda:0') Epoch 318, bias, value: tensor([-0.0185, -0.0292, -0.0255, -0.0297, -0.0039, 0.0098, 0.0081, -0.0046, -0.0089, -0.0168], device='cuda:0'), grad: tensor([-3.3993e-08, -4.5169e-08, -4.6566e-10, 6.0536e-09, 5.1688e-08, 1.1176e-08, -7.9162e-09, 8.3819e-08, 1.3504e-08, -6.7521e-08], device='cuda:0') 100 0.0001 changing lr epoch 317, time 220.74, cls_loss 0.0009 cls_loss_mapping 0.0014 cls_loss_causal 0.4557 re_mapping 0.0035 re_causal 0.0094 /// teacc 99.13 lr 0.00010000 Epoch 319, weight, value: tensor([[-0.2234, -0.2930, 0.1191, ..., -0.1430, 0.0461, 0.0416], [-0.1530, -0.0736, -0.1099, ..., -0.2120, -0.0809, -0.0244], [ 0.0172, -0.1671, -0.2095, ..., -0.1648, 0.0303, -0.3862], ..., [-0.2188, 0.1817, 0.0300, ..., 0.2131, -0.0621, -0.1689], [-0.1883, -0.1818, 0.1996, ..., -0.1621, -0.1581, 0.1625], [ 0.0174, -0.3252, 0.1785, ..., 0.0615, -0.2055, -0.1660]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -6.4960e-08, ..., 0.0000e+00, 0.0000e+00, -1.7462e-08], [ 0.0000e+00, -1.0920e-07, 8.1491e-09, ..., 0.0000e+00, 0.0000e+00, 7.2177e-09], [ 0.0000e+00, 0.0000e+00, 1.9814e-07, ..., 0.0000e+00, 0.0000e+00, 1.6089e-07], ..., [ 0.0000e+00, 1.0384e-07, 8.8476e-09, ..., 2.3283e-10, 0.0000e+00, 7.6834e-09], [ 0.0000e+00, 2.3283e-10, -1.6205e-07, ..., 0.0000e+00, 0.0000e+00, -1.7066e-07], [ 0.0000e+00, 4.1910e-09, 2.5611e-09, ..., 0.0000e+00, 0.0000e+00, 2.3283e-09]], device='cuda:0') Epoch 319, bias, value: tensor([-0.0177, -0.0291, -0.0260, -0.0297, -0.0042, 0.0099, 0.0081, -0.0047, -0.0091, -0.0167], device='cuda:0'), grad: tensor([-2.0303e-07, -1.0105e-06, 7.0781e-07, 1.8626e-08, -1.4855e-07, 1.3039e-08, 2.3982e-08, 1.0738e-06, -5.8766e-07, 1.2759e-07], device='cuda:0') 100 0.0001 changing lr epoch 318, time 220.36, cls_loss 0.0008 cls_loss_mapping 0.0021 cls_loss_causal 0.4661 re_mapping 0.0032 re_causal 0.0096 /// teacc 99.12 lr 0.00010000 Epoch 320, weight, value: tensor([[-0.2245, -0.2933, 0.1182, ..., -0.1431, 0.0462, 0.0406], [-0.1538, -0.0736, -0.1100, ..., -0.2122, -0.0809, -0.0245], [ 0.0170, -0.1692, -0.2120, ..., -0.1662, 0.0303, -0.3872], ..., [-0.2188, 0.1817, 0.0300, ..., 0.2136, -0.0621, -0.1689], [-0.1884, -0.1822, 0.2000, ..., -0.1620, -0.1581, 0.1627], [ 0.0182, -0.3265, 0.1807, ..., 0.0614, -0.2056, -0.1635]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 2.3283e-10, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 4.6566e-10], [ 2.3283e-10, 1.1642e-09, 6.9849e-10, ..., 1.1642e-09, 0.0000e+00, 2.3283e-10], [ 2.3283e-10, 2.3283e-09, 1.3970e-09, ..., 2.3283e-09, 0.0000e+00, 2.3283e-10], ..., [ 4.6566e-10, -9.0804e-09, -3.7253e-09, ..., -6.9849e-09, 0.0000e+00, 2.3283e-10], [ 8.1491e-09, 0.0000e+00, -2.3283e-09, ..., 2.3283e-10, 0.0000e+00, 5.3551e-09], [ 3.7253e-09, 2.5611e-09, -6.9849e-10, ..., 2.3283e-10, 0.0000e+00, 5.1223e-09]], device='cuda:0') Epoch 320, bias, value: tensor([-0.0191, -0.0291, -0.0269, -0.0313, -0.0044, 0.0107, 0.0081, -0.0047, -0.0090, -0.0147], device='cuda:0'), grad: tensor([ 3.9581e-09, 4.6566e-09, 7.9162e-09, 3.3993e-08, 1.0012e-08, -6.2631e-08, 1.1642e-09, -1.8161e-08, 1.3504e-08, 1.2806e-08], device='cuda:0') 100 0.0001 changing lr epoch 319, time 220.61, cls_loss 0.0009 cls_loss_mapping 0.0014 cls_loss_causal 0.4883 re_mapping 0.0033 re_causal 0.0092 /// teacc 99.10 lr 0.00010000 Epoch 321, weight, value: tensor([[-0.2249, -0.2936, 0.1186, ..., -0.1431, 0.0463, 0.0408], [-0.1533, -0.0737, -0.1101, ..., -0.2125, -0.0810, -0.0245], [ 0.0169, -0.1701, -0.2136, ..., -0.1668, 0.0303, -0.3877], ..., [-0.2189, 0.1818, 0.0301, ..., 0.2142, -0.0620, -0.1689], [-0.1886, -0.1831, 0.2006, ..., -0.1626, -0.1582, 0.1628], [ 0.0185, -0.3281, 0.1811, ..., 0.0610, -0.2056, -0.1638]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 7.2177e-09, 2.0955e-09, ..., 4.4238e-09, 4.6566e-10, 4.6566e-10], [ 0.0000e+00, 1.8626e-09, 1.8626e-09, ..., 1.1642e-09, 2.3283e-10, 2.0955e-09], ..., [ 0.0000e+00, -9.5461e-09, -2.3283e-09, ..., -5.8208e-09, -6.9849e-10, 2.3283e-10], [ 2.3283e-10, 0.0000e+00, -1.1176e-08, ..., 0.0000e+00, 0.0000e+00, -1.8161e-08], [ 0.0000e+00, 6.9849e-10, 9.3132e-10, ..., -1.3970e-09, 0.0000e+00, 6.5193e-09]], device='cuda:0') Epoch 321, bias, value: tensor([-0.0188, -0.0289, -0.0277, -0.0332, -0.0050, 0.0119, 0.0081, -0.0047, -0.0090, -0.0149], device='cuda:0'), grad: tensor([ 1.6298e-09, 1.1176e-08, 9.5461e-09, 5.1223e-09, 1.6065e-08, -2.0955e-08, 4.4703e-08, -1.4668e-08, -3.7020e-08, 7.6834e-09], device='cuda:0') 100 0.0001 changing lr epoch 320, time 220.23, cls_loss 0.0007 cls_loss_mapping 0.0014 cls_loss_causal 0.4644 re_mapping 0.0034 re_causal 0.0097 /// teacc 99.17 lr 0.00010000 Epoch 322, weight, value: tensor([[-0.2254, -0.2938, 0.1186, ..., -0.1432, 0.0463, 0.0407], [-0.1512, -0.0737, -0.1100, ..., -0.2125, -0.0810, -0.0245], [ 0.0168, -0.1706, -0.2142, ..., -0.1673, 0.0304, -0.3881], ..., [-0.2192, 0.1819, 0.0301, ..., 0.2144, -0.0621, -0.1689], [-0.1887, -0.1833, 0.2009, ..., -0.1634, -0.1582, 0.1629], [ 0.0211, -0.3284, 0.1824, ..., 0.0616, -0.2056, -0.1639]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 2.5611e-09, 1.8626e-09, ..., 1.6298e-09, 0.0000e+00, 2.3283e-10], [ 4.6566e-10, 1.1176e-08, 8.1491e-09, ..., 9.3132e-09, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 2.7940e-09, 1.1642e-09, ..., 1.1642e-09, 0.0000e+00, 0.0000e+00], ..., [ 6.9849e-10, -7.6834e-08, -6.7055e-08, ..., -7.9861e-08, 0.0000e+00, 0.0000e+00], [ 4.6566e-09, 1.3970e-08, 3.7253e-09, ..., 2.5611e-09, 0.0000e+00, 0.0000e+00], [-9.3132e-10, 6.3796e-08, -1.7439e-07, ..., -6.4960e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 322, bias, value: tensor([-0.0188, -0.0288, -0.0279, -0.0332, -0.0073, 0.0120, 0.0081, -0.0047, -0.0090, -0.0133], device='cuda:0'), grad: tensor([ 1.1642e-08, 3.7719e-08, 8.6147e-09, -8.7544e-07, 7.0315e-07, 8.5495e-07, -6.0536e-09, -2.6100e-07, 4.5402e-08, -5.0105e-07], device='cuda:0') 100 0.0001 changing lr epoch 321, time 220.59, cls_loss 0.0007 cls_loss_mapping 0.0016 cls_loss_causal 0.4961 re_mapping 0.0034 re_causal 0.0101 /// teacc 99.17 lr 0.00010000 Epoch 323, weight, value: tensor([[-0.2260, -0.2941, 0.1186, ..., -0.1433, 0.0463, 0.0405], [-0.1513, -0.0736, -0.1100, ..., -0.2126, -0.0810, -0.0245], [ 0.0169, -0.1707, -0.2132, ..., -0.1674, 0.0305, -0.3881], ..., [-0.2193, 0.1819, 0.0301, ..., 0.2147, -0.0621, -0.1689], [-0.1890, -0.1836, 0.2007, ..., -0.1644, -0.1582, 0.1630], [ 0.0213, -0.3297, 0.1828, ..., 0.0618, -0.2056, -0.1638]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.3970e-09, 4.6566e-10, ..., 9.7789e-09, 0.0000e+00, 0.0000e+00], [ 2.7940e-09, 1.0245e-08, 5.5879e-09, ..., 2.3283e-08, 0.0000e+00, 4.6566e-10], [-4.1910e-09, 2.3283e-09, 9.3132e-10, ..., -3.5856e-08, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -2.3283e-08, 2.2352e-08, ..., 3.4459e-08, 0.0000e+00, 0.0000e+00], [-1.8626e-09, 0.0000e+00, -6.9849e-09, ..., 9.3132e-10, 0.0000e+00, -1.1642e-08], [ 0.0000e+00, 6.5193e-09, -5.6345e-08, ..., -7.2177e-08, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 323, bias, value: tensor([-0.0190, -0.0286, -0.0274, -0.0332, -0.0073, 0.0120, 0.0081, -0.0049, -0.0093, -0.0135], device='cuda:0'), grad: tensor([ 2.3376e-07, 4.8196e-07, -1.1176e-06, 3.0268e-08, 9.8255e-08, 3.5856e-08, 4.9826e-08, 3.5809e-07, 3.2596e-09, -1.7369e-07], device='cuda:0') 100 0.0001 changing lr epoch 322, time 220.35, cls_loss 0.0009 cls_loss_mapping 0.0020 cls_loss_causal 0.4623 re_mapping 0.0033 re_causal 0.0092 /// teacc 99.06 lr 0.00010000 Epoch 324, weight, value: tensor([[-0.2266, -0.2951, 0.1186, ..., -0.1442, 0.0463, 0.0407], [-0.1476, -0.0736, -0.1100, ..., -0.2127, -0.0811, -0.0246], [ 0.0171, -0.1712, -0.2141, ..., -0.1678, 0.0306, -0.3886], ..., [-0.2200, 0.1819, 0.0300, ..., 0.2150, -0.0621, -0.1690], [-0.1890, -0.1841, 0.2020, ..., -0.1648, -0.1583, 0.1639], [ 0.0179, -0.3307, 0.1826, ..., 0.0618, -0.2057, -0.1667]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -2.2352e-08, ..., 0.0000e+00, 0.0000e+00, 4.6566e-09], [ 0.0000e+00, 4.6566e-10, 9.3132e-09, ..., 0.0000e+00, 0.0000e+00, -6.0070e-08], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 8.8476e-09], ..., [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 4.6566e-10, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 1.1176e-08, 0.0000e+00, 3.2596e-09, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09]], device='cuda:0') Epoch 324, bias, value: tensor([-0.0190, -0.0283, -0.0275, -0.0332, -0.0076, 0.0120, 0.0080, -0.0051, -0.0077, -0.0155], device='cuda:0'), grad: tensor([-1.1642e-08, -4.9174e-07, 8.6147e-08, 2.7940e-09, 7.4506e-08, 1.2573e-08, 2.7474e-07, 1.5367e-08, 1.3970e-08, 2.7008e-08], device='cuda:0') 100 0.0001 changing lr epoch 323, time 220.65, cls_loss 0.0012 cls_loss_mapping 0.0018 cls_loss_causal 0.4841 re_mapping 0.0032 re_causal 0.0092 /// teacc 99.18 lr 0.00010000 Epoch 325, weight, value: tensor([[-0.2274, -0.2958, 0.1185, ..., -0.1449, 0.0465, 0.0406], [-0.1476, -0.0736, -0.1100, ..., -0.2131, -0.0814, -0.0245], [ 0.0171, -0.1727, -0.2152, ..., -0.1690, 0.0358, -0.3898], ..., [-0.2203, 0.1819, 0.0300, ..., 0.2154, -0.0663, -0.1691], [-0.1899, -0.1849, 0.2022, ..., -0.1667, -0.1583, 0.1640], [ 0.0184, -0.3314, 0.1837, ..., 0.0595, -0.2058, -0.1668]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, -1.8859e-07, ..., 0.0000e+00, 4.6566e-10, -2.4820e-07], [ 4.6566e-10, 4.6566e-10, 2.0955e-08, ..., 4.6566e-10, 2.3283e-09, 3.2131e-08], [ 4.6566e-10, 4.6566e-10, 8.8476e-09, ..., -4.1910e-09, -1.4435e-08, 1.2107e-08], ..., [ 4.6566e-10, 9.3132e-10, 5.2620e-08, ..., 2.7940e-09, 1.0245e-08, 7.0315e-08], [-2.3283e-09, 2.3283e-09, -3.0268e-08, ..., -2.3283e-09, 0.0000e+00, -1.7602e-07], [ 1.3970e-09, 4.6566e-10, 3.7719e-08, ..., 0.0000e+00, 0.0000e+00, 5.1688e-08]], device='cuda:0') Epoch 325, bias, value: tensor([-0.0194, -0.0281, -0.0269, -0.0330, -0.0050, 0.0120, 0.0078, -0.0053, -0.0083, -0.0172], device='cuda:0'), grad: tensor([-1.8142e-06, 1.7229e-07, 2.7474e-08, 2.0536e-07, 6.1002e-08, 5.4017e-07, 1.9511e-07, 5.6019e-07, -3.2876e-07, 3.6974e-07], device='cuda:0') 100 0.0001 changing lr epoch 324, time 220.78, cls_loss 0.0008 cls_loss_mapping 0.0011 cls_loss_causal 0.4363 re_mapping 0.0032 re_causal 0.0094 /// teacc 99.15 lr 0.00010000 Epoch 326, weight, value: tensor([[-0.2287, -0.2960, 0.1192, ..., -0.1450, 0.0463, 0.0415], [-0.1478, -0.0737, -0.1100, ..., -0.2134, -0.0820, -0.0245], [ 0.0175, -0.1727, -0.2164, ..., -0.1694, 0.0395, -0.3906], ..., [-0.2204, 0.1819, 0.0300, ..., 0.2161, -0.0694, -0.1691], [-0.1906, -0.1854, 0.2028, ..., -0.1675, -0.1586, 0.1640], [ 0.0184, -0.3337, 0.1837, ..., 0.0592, -0.2063, -0.1669]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 6.0536e-09, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, -1.3970e-09], [ 3.8650e-08, 1.0151e-07, 3.0268e-08, ..., 7.3574e-08, 0.0000e+00, 4.6566e-08], [ 4.6566e-10, 1.8626e-08, 4.6566e-09, ..., 9.3132e-10, 0.0000e+00, 4.1910e-09], ..., [ 1.3970e-09, -1.8440e-07, -6.0070e-08, ..., -1.7183e-07, 0.0000e+00, 1.3970e-09], [ 2.1653e-07, 1.4901e-08, 3.7253e-09, ..., 8.3819e-09, 0.0000e+00, 2.3842e-07], [ 5.7276e-08, 4.0513e-08, 6.9849e-09, ..., 2.3283e-09, 0.0000e+00, 3.6322e-08]], device='cuda:0') Epoch 326, bias, value: tensor([-0.0175, -0.0281, -0.0259, -0.0330, -0.0051, 0.0121, 0.0077, -0.0054, -0.0086, -0.0175], device='cuda:0'), grad: tensor([ 1.2107e-08, 4.1071e-07, 1.0431e-07, -4.0233e-07, 4.0699e-07, -1.0263e-06, 4.1816e-07, -6.4727e-07, 5.0385e-07, 2.3982e-07], device='cuda:0') 100 0.0001 changing lr epoch 325, time 220.55, cls_loss 0.0007 cls_loss_mapping 0.0011 cls_loss_causal 0.4543 re_mapping 0.0033 re_causal 0.0095 /// teacc 99.12 lr 0.00010000 Epoch 327, weight, value: tensor([[-0.2289, -0.2963, 0.1192, ..., -0.1451, 0.0461, 0.0413], [-0.1479, -0.0737, -0.1100, ..., -0.2138, -0.0821, -0.0245], [ 0.0176, -0.1734, -0.2169, ..., -0.1704, 0.0399, -0.3913], ..., [-0.2205, 0.1820, 0.0300, ..., 0.2170, -0.0696, -0.1692], [-0.1920, -0.1875, 0.2028, ..., -0.1682, -0.1587, 0.1634], [ 0.0184, -0.3363, 0.1838, ..., 0.0590, -0.2064, -0.1670]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -6.1002e-08, ..., 0.0000e+00, 0.0000e+00, -6.1467e-08], [ 0.0000e+00, 1.8626e-09, 4.6566e-09, ..., 9.3132e-10, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 2.3283e-09, ..., 0.0000e+00, 0.0000e+00, 2.3283e-09], ..., [ 0.0000e+00, -1.8626e-09, 4.6566e-10, ..., -4.6566e-10, 0.0000e+00, 2.3283e-09], [ 3.7253e-09, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09], [ 9.3132e-10, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 3.2596e-09]], device='cuda:0') Epoch 327, bias, value: tensor([-0.0178, -0.0281, -0.0263, -0.0330, -0.0051, 0.0122, 0.0077, -0.0054, -0.0095, -0.0178], device='cuda:0'), grad: tensor([-5.6624e-07, 2.1420e-08, 2.1420e-08, 3.1199e-08, 6.0536e-09, 3.7719e-08, 4.1351e-07, 1.3504e-08, 1.2573e-08, 2.2352e-08], device='cuda:0') 100 0.0001 changing lr epoch 326, time 220.86, cls_loss 0.0007 cls_loss_mapping 0.0013 cls_loss_causal 0.4539 re_mapping 0.0032 re_causal 0.0093 /// teacc 99.15 lr 0.00010000 Epoch 328, weight, value: tensor([[-0.2293, -0.2964, 0.1192, ..., -0.1454, 0.0456, 0.0412], [-0.1480, -0.0737, -0.1101, ..., -0.2139, -0.0821, -0.0245], [ 0.0176, -0.1735, -0.2184, ..., -0.1704, 0.0400, -0.3921], ..., [-0.2206, 0.1820, 0.0300, ..., 0.2170, -0.0696, -0.1692], [-0.1935, -0.1877, 0.2040, ..., -0.1689, -0.1587, 0.1629], [ 0.0185, -0.3367, 0.1844, ..., 0.0592, -0.2064, -0.1671]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [-2.3283e-09, 5.5879e-09, 0.0000e+00, ..., 5.1223e-09, 0.0000e+00, -4.6566e-09], [ 0.0000e+00, 1.0245e-08, 1.3970e-09, ..., 9.7789e-09, 0.0000e+00, 4.6566e-10], ..., [ 1.3970e-09, -1.8161e-08, -1.3970e-09, ..., -1.6764e-08, 0.0000e+00, 3.7253e-09], [ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 9.3132e-10, 2.7940e-09, -2.7940e-09, ..., -9.3132e-10, 0.0000e+00, 1.3970e-09]], device='cuda:0') Epoch 328, bias, value: tensor([-0.0177, -0.0281, -0.0262, -0.0330, -0.0052, 0.0123, 0.0075, -0.0054, -0.0101, -0.0175], device='cuda:0'), grad: tensor([ 1.3970e-09, -9.0059e-07, 6.9849e-07, 2.4214e-08, 2.1886e-08, -3.3993e-08, 1.0245e-08, 1.8068e-07, 4.1910e-09, 4.6566e-10], device='cuda:0') 100 0.0001 changing lr epoch 327, time 220.27, cls_loss 0.0008 cls_loss_mapping 0.0015 cls_loss_causal 0.4823 re_mapping 0.0033 re_causal 0.0096 /// teacc 99.15 lr 0.00010000 Epoch 329, weight, value: tensor([[-0.2302, -0.2976, 0.1189, ..., -0.1464, 0.0449, 0.0410], [-0.1481, -0.0740, -0.1103, ..., -0.2157, -0.0824, -0.0245], [ 0.0177, -0.1742, -0.2185, ..., -0.1712, 0.0403, -0.3924], ..., [-0.2207, 0.1824, 0.0302, ..., 0.2185, -0.0697, -0.1692], [-0.1935, -0.1879, 0.2058, ..., -0.1697, -0.1589, 0.1636], [ 0.0185, -0.3372, 0.1850, ..., 0.0596, -0.2066, -0.1673]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -8.3819e-09, ..., 0.0000e+00, 0.0000e+00, -1.3504e-08], [ 4.6566e-10, 2.3283e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-9.3132e-10, 1.2573e-08, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], ..., [ 0.0000e+00, 2.8871e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 329, bias, value: tensor([-0.0184, -0.0283, -0.0259, -0.0330, -0.0053, 0.0123, 0.0075, -0.0052, -0.0091, -0.0174], device='cuda:0'), grad: tensor([-5.4017e-08, 7.6834e-08, -1.2061e-07, -9.2201e-08, 3.2596e-09, 1.9558e-08, 2.9802e-08, 1.0710e-07, 1.9092e-08, 1.7229e-08], device='cuda:0') 100 0.0001 changing lr epoch 328, time 220.16, cls_loss 0.0008 cls_loss_mapping 0.0014 cls_loss_causal 0.4913 re_mapping 0.0032 re_causal 0.0094 /// teacc 99.10 lr 0.00010000 Epoch 330, weight, value: tensor([[-0.2306, -0.2987, 0.1187, ..., -0.1470, 0.0449, 0.0407], [-0.1481, -0.0741, -0.1103, ..., -0.2160, -0.0826, -0.0245], [ 0.0180, -0.1762, -0.2199, ..., -0.1729, 0.0404, -0.3929], ..., [-0.2208, 0.1825, 0.0302, ..., 0.2189, -0.0698, -0.1692], [-0.1936, -0.1879, 0.2063, ..., -0.1708, -0.1590, 0.1639], [ 0.0186, -0.3377, 0.1868, ..., 0.0609, -0.2072, -0.1674]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, -4.6566e-10, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -3.7253e-09, ..., 0.0000e+00, 0.0000e+00, -3.7253e-09], [ 2.3283e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-08, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 330, bias, value: tensor([-0.0189, -0.0283, -0.0265, -0.0330, -0.0061, 0.0123, 0.0076, -0.0052, -0.0089, -0.0165], device='cuda:0'), grad: tensor([ 4.1910e-09, 3.9581e-08, -9.2201e-08, 5.4482e-08, -4.3772e-08, -7.9162e-09, -2.9802e-08, 1.7229e-08, -1.3970e-09, 6.4261e-08], device='cuda:0') 100 0.0001 changing lr epoch 329, time 220.89, cls_loss 0.0007 cls_loss_mapping 0.0016 cls_loss_causal 0.4467 re_mapping 0.0034 re_causal 0.0095 /// teacc 99.08 lr 0.00010000 Epoch 331, weight, value: tensor([[-0.2325, -0.2988, 0.1184, ..., -0.1479, 0.0449, 0.0401], [-0.1481, -0.0741, -0.1103, ..., -0.2161, -0.0826, -0.0244], [ 0.0179, -0.1759, -0.2201, ..., -0.1726, 0.0404, -0.3931], ..., [-0.2208, 0.1825, 0.0302, ..., 0.2189, -0.0698, -0.1693], [-0.1939, -0.1883, 0.2052, ..., -0.1737, -0.1592, 0.1637], [ 0.0183, -0.3382, 0.1878, ..., 0.0614, -0.2073, -0.1673]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -9.3132e-09, ..., 0.0000e+00, -1.3970e-09, -4.6566e-10], [ 0.0000e+00, 1.7229e-08, 5.5879e-09, ..., 1.3970e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -4.6566e-10, 5.5879e-09, ..., 0.0000e+00, 9.3132e-10, 9.3132e-10], ..., [ 0.0000e+00, -1.9092e-08, -4.1910e-09, ..., -1.4901e-08, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 4.6566e-10, ..., 1.8626e-09, 4.6566e-10, -4.6566e-09], [ 0.0000e+00, 9.3132e-10, -8.3819e-09, ..., -4.6566e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 331, bias, value: tensor([-0.0194, -0.0283, -0.0258, -0.0331, -0.0063, 0.0124, 0.0075, -0.0052, -0.0095, -0.0162], device='cuda:0'), grad: tensor([-2.5611e-08, 4.7032e-08, 2.0023e-08, 1.4435e-08, 1.3504e-08, 6.0536e-09, 3.7253e-09, -4.4703e-08, -5.5879e-09, -1.8626e-08], device='cuda:0') 100 0.0001 changing lr epoch 330, time 220.46, cls_loss 0.0008 cls_loss_mapping 0.0015 cls_loss_causal 0.4762 re_mapping 0.0033 re_causal 0.0093 /// teacc 99.07 lr 0.00010000 Epoch 332, weight, value: tensor([[-0.2327, -0.2991, 0.1183, ..., -0.1485, 0.0445, 0.0401], [-0.1482, -0.0741, -0.1103, ..., -0.2163, -0.0827, -0.0245], [ 0.0176, -0.1783, -0.2220, ..., -0.1745, 0.0406, -0.3937], ..., [-0.2210, 0.1827, 0.0302, ..., 0.2197, -0.0699, -0.1693], [-0.1940, -0.1886, 0.2080, ..., -0.1741, -0.1596, 0.1642], [ 0.0183, -0.3387, 0.1884, ..., 0.0614, -0.2073, -0.1675]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 4.6566e-10, 1.3970e-09, 9.3132e-10, ..., 1.3970e-09, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -4.1910e-09, -2.7940e-09, ..., -5.5879e-09, 0.0000e+00, 0.0000e+00], [ 3.7253e-09, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 3.2596e-09], [ 4.6566e-10, 4.1910e-09, 1.3970e-09, ..., 4.6566e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 332, bias, value: tensor([-0.0198, -0.0283, -0.0264, -0.0331, -0.0069, 0.0124, 0.0076, -0.0051, -0.0086, -0.0162], device='cuda:0'), grad: tensor([ 1.3970e-09, 3.8650e-08, -1.8161e-08, 6.0536e-09, 5.0291e-08, -4.4703e-08, -2.7474e-08, -8.3819e-09, 7.4506e-09, 7.4506e-09], device='cuda:0') 100 0.0001 changing lr epoch 331, time 220.32, cls_loss 0.0007 cls_loss_mapping 0.0011 cls_loss_causal 0.4814 re_mapping 0.0032 re_causal 0.0092 /// teacc 99.05 lr 0.00010000 Epoch 333, weight, value: tensor([[-0.2331, -0.2996, 0.1188, ..., -0.1487, 0.0445, 0.0407], [-0.1483, -0.0744, -0.1103, ..., -0.2190, -0.0827, -0.0246], [ 0.0175, -0.1783, -0.2224, ..., -0.1748, 0.0407, -0.3940], ..., [-0.2211, 0.1831, 0.0302, ..., 0.2224, -0.0699, -0.1693], [-0.1942, -0.1888, 0.2084, ..., -0.1744, -0.1597, 0.1644], [ 0.0180, -0.3405, 0.1882, ..., 0.0611, -0.2073, -0.1677]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.9558e-08, 0.0000e+00, ..., 5.1223e-09, 0.0000e+00, 9.3132e-10], [-9.3132e-10, 1.0338e-07, 0.0000e+00, ..., 2.8871e-08, 0.0000e+00, -3.2596e-09], [ 0.0000e+00, 5.1223e-09, 1.8626e-09, ..., 1.3970e-09, 0.0000e+00, 1.8626e-09], ..., [ 4.6566e-10, -2.2771e-07, 0.0000e+00, ..., -6.3330e-08, 0.0000e+00, 1.8626e-09], [ 0.0000e+00, 1.3970e-09, -3.7253e-09, ..., 4.6566e-10, 0.0000e+00, -1.8626e-09], [ 0.0000e+00, 9.7789e-09, 4.6566e-10, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 333, bias, value: tensor([-0.0189, -0.0288, -0.0262, -0.0332, -0.0068, 0.0125, 0.0075, -0.0047, -0.0086, -0.0166], device='cuda:0'), grad: tensor([ 5.0757e-08, 2.0117e-07, 1.9092e-08, 3.8184e-08, 5.5879e-08, 1.5320e-07, -5.0757e-08, -4.9407e-07, -9.3132e-10, 2.4214e-08], device='cuda:0') 100 0.0001 changing lr epoch 332, time 220.32, cls_loss 0.0007 cls_loss_mapping 0.0015 cls_loss_causal 0.4914 re_mapping 0.0031 re_causal 0.0094 /// teacc 99.13 lr 0.00010000 Epoch 334, weight, value: tensor([[-0.2333, -0.2998, 0.1189, ..., -0.1490, 0.0445, 0.0409], [-0.1484, -0.0750, -0.1110, ..., -0.2192, -0.0828, -0.0243], [ 0.0175, -0.1785, -0.2226, ..., -0.1750, 0.0407, -0.3942], ..., [-0.2211, 0.1837, 0.0309, ..., 0.2225, -0.0700, -0.1696], [-0.1943, -0.1890, 0.2089, ..., -0.1749, -0.1597, 0.1645], [ 0.0180, -0.3411, 0.1886, ..., 0.0614, -0.2074, -0.1679]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 4.6566e-09], [ 0.0000e+00, 4.6566e-10, -2.0070e-07, ..., 9.3132e-10, 0.0000e+00, -2.9476e-07], [ 0.0000e+00, 0.0000e+00, 8.3819e-09, ..., 0.0000e+00, 0.0000e+00, 1.2573e-08], ..., [ 0.0000e+00, 2.7940e-09, 3.8184e-08, ..., 1.8626e-09, 0.0000e+00, 5.3551e-08], [ 4.6566e-10, 4.6566e-10, 1.3644e-07, ..., 0.0000e+00, 0.0000e+00, 2.0023e-07], [-9.3132e-10, 4.6566e-10, 3.7253e-09, ..., -8.3819e-09, 0.0000e+00, 1.2107e-08]], device='cuda:0') Epoch 334, bias, value: tensor([-0.0191, -0.0294, -0.0257, -0.0332, -0.0068, 0.0125, 0.0075, -0.0042, -0.0086, -0.0165], device='cuda:0'), grad: tensor([ 5.6345e-08, -3.3844e-06, 1.3970e-07, -2.7940e-09, 5.7276e-08, 4.4703e-08, 2.8405e-08, 6.2771e-07, 2.3041e-06, 1.2247e-07], device='cuda:0') 100 0.0001 changing lr epoch 333, time 220.64, cls_loss 0.0007 cls_loss_mapping 0.0013 cls_loss_causal 0.4603 re_mapping 0.0033 re_causal 0.0095 /// teacc 99.07 lr 0.00010000 Epoch 335, weight, value: tensor([[-0.2334, -0.3015, 0.1190, ..., -0.1495, 0.0445, 0.0409], [-0.1487, -0.0750, -0.1111, ..., -0.2198, -0.0828, -0.0243], [ 0.0174, -0.1787, -0.2232, ..., -0.1755, 0.0407, -0.3946], ..., [-0.2212, 0.1838, 0.0309, ..., 0.2231, -0.0700, -0.1696], [-0.1943, -0.1893, 0.2095, ..., -0.1764, -0.1598, 0.1648], [ 0.0186, -0.3424, 0.1892, ..., 0.0618, -0.2074, -0.1677]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 0.0000e+00, 4.1910e-09, 0.0000e+00, ..., 2.7940e-09, 0.0000e+00, -1.8626e-09], [ 0.0000e+00, 2.3283e-09, 1.0245e-08, ..., 1.3970e-09, 0.0000e+00, 1.8626e-08], ..., [ 0.0000e+00, -5.1223e-09, -4.6566e-10, ..., -6.0536e-09, 0.0000e+00, 9.3132e-10], [ 4.6566e-10, 0.0000e+00, -1.5832e-08, ..., 0.0000e+00, 0.0000e+00, -2.5611e-08], [ 0.0000e+00, 9.3132e-10, 2.3283e-09, ..., 0.0000e+00, 0.0000e+00, 5.5879e-09]], device='cuda:0') Epoch 335, bias, value: tensor([-0.0193, -0.0295, -0.0259, -0.0332, -0.0063, 0.0125, 0.0075, -0.0041, -0.0088, -0.0163], device='cuda:0'), grad: tensor([ 3.2596e-08, -6.6590e-08, 1.2666e-07, 9.3132e-10, 3.0734e-08, 3.5856e-08, -1.1828e-07, 7.4506e-09, -6.1467e-08, 1.8161e-08], device='cuda:0') 100 0.0001 changing lr epoch 334, time 220.45, cls_loss 0.0007 cls_loss_mapping 0.0013 cls_loss_causal 0.4741 re_mapping 0.0031 re_causal 0.0091 /// teacc 99.14 lr 0.00010000 Epoch 336, weight, value: tensor([[-0.2340, -0.3024, 0.1190, ..., -0.1497, 0.0445, 0.0403], [-0.1488, -0.0751, -0.1111, ..., -0.2199, -0.0828, -0.0242], [ 0.0158, -0.1793, -0.2242, ..., -0.1762, 0.0407, -0.3960], ..., [-0.2213, 0.1839, 0.0310, ..., 0.2235, -0.0700, -0.1697], [-0.1945, -0.1896, 0.2098, ..., -0.1771, -0.1598, 0.1648], [ 0.0182, -0.3439, 0.1896, ..., 0.0620, -0.2074, -0.1682]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 3.7253e-09, 4.6566e-10, ..., 2.3283e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -5.5879e-09, 2.7940e-09, ..., -1.8626e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.7940e-09, -6.5193e-09, ..., -3.2596e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 336, bias, value: tensor([-0.0199, -0.0294, -0.0264, -0.0332, -0.0065, 0.0125, 0.0076, -0.0041, -0.0089, -0.0164], device='cuda:0'), grad: tensor([ 6.0536e-09, 9.3132e-09, 4.1910e-09, 1.8626e-09, 3.7253e-09, 8.3819e-09, -1.4901e-08, -4.6566e-09, 2.7940e-09, -1.3039e-08], device='cuda:0') 100 0.0001 changing lr epoch 335, time 220.35, cls_loss 0.0007 cls_loss_mapping 0.0014 cls_loss_causal 0.4657 re_mapping 0.0031 re_causal 0.0090 /// teacc 99.12 lr 0.00010000 Epoch 337, weight, value: tensor([[-0.2341, -0.3032, 0.1190, ..., -0.1501, 0.0445, 0.0401], [-0.1487, -0.0751, -0.1111, ..., -0.2201, -0.0829, -0.0242], [ 0.0153, -0.1794, -0.2251, ..., -0.1762, 0.0407, -0.3968], ..., [-0.2213, 0.1840, 0.0309, ..., 0.2235, -0.0700, -0.1697], [-0.1946, -0.1916, 0.2104, ..., -0.1773, -0.1598, 0.1651], [ 0.0182, -0.3448, 0.1899, ..., 0.0622, -0.2075, -0.1683]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -2.1420e-08, ..., 0.0000e+00, 0.0000e+00, -8.3819e-09], [ 0.0000e+00, 4.6566e-10, 2.7940e-09, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], ..., [ 0.0000e+00, 4.6566e-10, 1.3970e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 6.5193e-09, ..., 0.0000e+00, 0.0000e+00, 2.3283e-09], [-4.6566e-10, 0.0000e+00, 2.7940e-09, ..., -2.3283e-09, 0.0000e+00, 1.8626e-09]], device='cuda:0') Epoch 337, bias, value: tensor([-0.0203, -0.0294, -0.0266, -0.0332, -0.0064, 0.0125, 0.0076, -0.0042, -0.0087, -0.0164], device='cuda:0'), grad: tensor([ 4.6566e-09, 2.9337e-08, -2.6263e-07, 1.5181e-07, 9.7789e-09, 3.2596e-09, 8.3819e-09, 2.4680e-08, 1.4901e-08, 1.9558e-08], device='cuda:0') 100 0.0001 changing lr epoch 336, time 220.33, cls_loss 0.0007 cls_loss_mapping 0.0018 cls_loss_causal 0.4826 re_mapping 0.0032 re_causal 0.0095 /// teacc 99.19 lr 0.00010000 Epoch 338, weight, value: tensor([[-0.2342, -0.3021, 0.1195, ..., -0.1503, 0.0444, 0.0402], [-0.1488, -0.0751, -0.1111, ..., -0.2205, -0.0828, -0.0241], [ 0.0149, -0.1803, -0.2271, ..., -0.1765, 0.0407, -0.3978], ..., [-0.2214, 0.1840, 0.0309, ..., 0.2240, -0.0700, -0.1698], [-0.1946, -0.1920, 0.2113, ..., -0.1770, -0.1598, 0.1655], [ 0.0183, -0.3475, 0.1899, ..., 0.0622, -0.2076, -0.1685]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -1.3970e-09, ..., 0.0000e+00, 0.0000e+00, -4.6566e-10], [ 0.0000e+00, 4.6566e-10, 9.3132e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, -1.1642e-08, ..., 0.0000e+00, 0.0000e+00, -6.5193e-09], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 2.3283e-09]], device='cuda:0') Epoch 338, bias, value: tensor([-0.0199, -0.0292, -0.0270, -0.0331, -0.0064, 0.0125, 0.0076, -0.0043, -0.0082, -0.0167], device='cuda:0'), grad: tensor([-3.7253e-09, 3.2596e-09, 1.8626e-09, 2.5146e-08, 1.3970e-09, -1.2107e-08, 7.9162e-09, 1.8626e-09, -2.6077e-08, 9.3132e-09], device='cuda:0') 100 0.0001 changing lr epoch 337, time 221.02, cls_loss 0.0006 cls_loss_mapping 0.0011 cls_loss_causal 0.4641 re_mapping 0.0033 re_causal 0.0096 /// teacc 99.12 lr 0.00010000 Epoch 339, weight, value: tensor([[-0.2344, -0.3023, 0.1197, ..., -0.1504, 0.0444, 0.0403], [-0.1488, -0.0751, -0.1111, ..., -0.2207, -0.0825, -0.0241], [ 0.0145, -0.1811, -0.2280, ..., -0.1775, 0.0404, -0.3992], ..., [-0.2214, 0.1842, 0.0310, ..., 0.2249, -0.0701, -0.1698], [-0.1947, -0.1923, 0.2119, ..., -0.1763, -0.1602, 0.1656], [ 0.0184, -0.3496, 0.1900, ..., 0.0625, -0.2076, -0.1687]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 8.8476e-09, 1.8626e-09, ..., 8.8476e-09, 0.0000e+00, -1.3970e-09], [ 0.0000e+00, 9.3132e-10, 1.3970e-09, ..., 9.3132e-10, 0.0000e+00, 9.3132e-10], ..., [ 0.0000e+00, -2.2817e-08, -5.1223e-09, ..., -2.3283e-08, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, -1.3970e-09, ..., 0.0000e+00, 0.0000e+00, -9.3132e-10], [ 0.0000e+00, 1.0710e-08, 2.7940e-09, ..., 1.0710e-08, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 339, bias, value: tensor([-0.0198, -0.0293, -0.0277, -0.0331, -0.0071, 0.0124, 0.0078, -0.0042, -0.0084, -0.0166], device='cuda:0'), grad: tensor([ 2.7940e-09, -9.1735e-08, -2.3050e-07, 3.2596e-09, 1.3318e-07, 6.0536e-09, 4.6566e-09, -5.2620e-08, 1.8999e-07, 4.3306e-08], device='cuda:0') 100 0.0001 changing lr epoch 338, time 221.01, cls_loss 0.0007 cls_loss_mapping 0.0012 cls_loss_causal 0.4713 re_mapping 0.0031 re_causal 0.0089 /// teacc 99.20 lr 0.00010000 Epoch 340, weight, value: tensor([[-0.2347, -0.3029, 0.1197, ..., -0.1511, 0.0441, 0.0403], [-0.1489, -0.0751, -0.1111, ..., -0.2209, -0.0826, -0.0241], [ 0.0115, -0.1846, -0.2290, ..., -0.1782, 0.0403, -0.3999], ..., [-0.2215, 0.1842, 0.0310, ..., 0.2253, -0.0701, -0.1698], [-0.1948, -0.1929, 0.2127, ..., -0.1767, -0.1603, 0.1663], [ 0.0183, -0.3512, 0.1904, ..., 0.0619, -0.2076, -0.1689]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -5.5879e-09, 4.6566e-10, ..., -2.7940e-09, 0.0000e+00, 4.6566e-10], [ 1.3970e-09, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 4.6566e-10, -9.3132e-10, ..., -1.3970e-09, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 340, bias, value: tensor([-0.0200, -0.0293, -0.0304, -0.0319, -0.0067, 0.0122, 0.0078, -0.0043, -0.0079, -0.0172], device='cuda:0'), grad: tensor([ 3.2596e-09, 2.6776e-07, -2.6822e-07, -9.3132e-10, 3.7253e-09, -2.3283e-09, 5.1223e-09, -6.0536e-09, 5.1223e-09, -4.6566e-10], device='cuda:0') 100 0.0001 changing lr epoch 339, time 220.47, cls_loss 0.0008 cls_loss_mapping 0.0014 cls_loss_causal 0.4786 re_mapping 0.0031 re_causal 0.0089 /// teacc 99.12 lr 0.00010000 Epoch 341, weight, value: tensor([[-0.2349, -0.3036, 0.1200, ..., -0.1514, 0.0438, 0.0400], [-0.1490, -0.0752, -0.1111, ..., -0.2211, -0.0790, -0.0239], [ 0.0115, -0.1855, -0.2303, ..., -0.1798, 0.0379, -0.4018], ..., [-0.2215, 0.1844, 0.0310, ..., 0.2257, -0.0711, -0.1699], [-0.1949, -0.1945, 0.2124, ..., -0.1774, -0.1615, 0.1667], [ 0.0182, -0.3522, 0.1906, ..., 0.0618, -0.2077, -0.1690]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09], [ 0.0000e+00, 8.8476e-09, 0.0000e+00, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, -4.1910e-09], ..., [ 0.0000e+00, -3.7253e-09, 0.0000e+00, ..., -3.2596e-09, 0.0000e+00, 1.3970e-09], [ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.2596e-09], [ 4.6566e-10, 4.6566e-10, -1.3970e-09, ..., -4.6566e-10, 0.0000e+00, 1.3970e-09]], device='cuda:0') Epoch 341, bias, value: tensor([-0.0203, -0.0290, -0.0316, -0.0318, -0.0063, 0.0121, 0.0079, -0.0043, -0.0078, -0.0175], device='cuda:0'), grad: tensor([ 2.9337e-08, 2.9337e-08, -1.0105e-07, -1.2573e-08, 3.2596e-09, 1.0710e-08, -1.1176e-08, 2.0023e-08, 3.7719e-08, 4.1910e-09], device='cuda:0') 100 0.0001 changing lr epoch 340, time 220.69, cls_loss 0.0007 cls_loss_mapping 0.0013 cls_loss_causal 0.4767 re_mapping 0.0031 re_causal 0.0090 /// teacc 99.13 lr 0.00010000 Epoch 342, weight, value: tensor([[-0.2361, -0.3021, 0.1205, ..., -0.1515, 0.0438, 0.0394], [-0.1490, -0.0753, -0.1112, ..., -0.2213, -0.0790, -0.0238], [ 0.0116, -0.1852, -0.2307, ..., -0.1790, 0.0378, -0.4024], ..., [-0.2216, 0.1844, 0.0310, ..., 0.2257, -0.0711, -0.1700], [-0.1950, -0.1949, 0.2136, ..., -0.1774, -0.1616, 0.1671], [ 0.0181, -0.3531, 0.1909, ..., 0.0621, -0.2077, -0.1692]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 1.3970e-09, 4.6566e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 1.5832e-08, 2.7940e-09, ..., 1.1642e-08, 0.0000e+00, 9.3132e-10], [ 0.0000e+00, 1.2107e-08, 9.3132e-10, ..., 8.3819e-09, 0.0000e+00, 4.6566e-10], ..., [ 0.0000e+00, -2.1979e-07, -3.0734e-08, ..., -1.9278e-07, 0.0000e+00, -1.8626e-09], [ 7.9162e-09, 9.3132e-10, 5.5879e-09, ..., 4.1910e-09, 0.0000e+00, -9.3132e-10], [-2.7008e-08, 9.3132e-10, -2.7474e-08, ..., -1.2107e-08, 0.0000e+00, -3.2596e-09]], device='cuda:0') Epoch 342, bias, value: tensor([-0.0207, -0.0300, -0.0285, -0.0318, -0.0063, 0.0121, 0.0079, -0.0044, -0.0077, -0.0175], device='cuda:0'), grad: tensor([ 9.3132e-09, 3.8184e-08, 4.6566e-08, 5.5879e-09, 4.8662e-07, 6.6590e-08, 3.2596e-09, -5.4762e-07, 3.3993e-08, -1.3364e-07], device='cuda:0') 100 0.0001 changing lr epoch 341, time 220.76, cls_loss 0.0006 cls_loss_mapping 0.0011 cls_loss_causal 0.4716 re_mapping 0.0031 re_causal 0.0092 /// teacc 99.11 lr 0.00010000 Epoch 343, weight, value: tensor([[-0.2375, -0.3028, 0.1204, ..., -0.1518, 0.0432, 0.0386], [-0.1491, -0.0753, -0.1112, ..., -0.2215, -0.0760, -0.0238], [ 0.0112, -0.1852, -0.2314, ..., -0.1793, 0.0348, -0.4036], ..., [-0.2217, 0.1845, 0.0310, ..., 0.2261, -0.0716, -0.1700], [-0.1952, -0.1952, 0.2147, ..., -0.1778, -0.1619, 0.1673], [ 0.0165, -0.3543, 0.1913, ..., 0.0597, -0.2078, -0.1695]], device='cuda:0'), grad: tensor([[ 4.8894e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 5.3551e-09], [ 4.6566e-10, 1.6298e-09, 2.3283e-10, ..., 9.3132e-10, 0.0000e+00, 4.6566e-10], [ 4.6566e-10, 2.3283e-10, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, 4.6566e-10], ..., [ 9.3132e-10, -1.5832e-08, -2.7940e-09, ..., -1.4668e-08, 0.0000e+00, 9.3132e-10], [ 1.0012e-08, -9.3132e-10, -1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 3.9581e-09], [ 1.1642e-09, 2.3283e-10, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09]], device='cuda:0') Epoch 343, bias, value: tensor([-0.0215, -0.0298, -0.0293, -0.0318, -0.0036, 0.0121, 0.0079, -0.0044, -0.0077, -0.0202], device='cuda:0'), grad: tensor([ 1.4668e-08, 6.7521e-09, -4.6566e-10, 1.5367e-08, 3.2596e-08, -3.5111e-07, 3.0315e-07, -2.5844e-08, 9.3132e-09, 4.8894e-09], device='cuda:0') 100 0.0001 changing lr epoch 342, time 220.81, cls_loss 0.0005 cls_loss_mapping 0.0010 cls_loss_causal 0.4470 re_mapping 0.0031 re_causal 0.0094 /// teacc 99.14 lr 0.00010000 Epoch 344, weight, value: tensor([[-0.2380, -0.3031, 0.1194, ..., -0.1542, 0.0432, 0.0364], [-0.1491, -0.0754, -0.1112, ..., -0.2216, -0.0760, -0.0238], [ 0.0113, -0.1849, -0.2315, ..., -0.1788, 0.0349, -0.4036], ..., [-0.2218, 0.1846, 0.0310, ..., 0.2265, -0.0716, -0.1700], [-0.1954, -0.1972, 0.2145, ..., -0.1782, -0.1625, 0.1672], [ 0.0165, -0.3549, 0.1922, ..., 0.0601, -0.2078, -0.1695]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, -1.8626e-09, ..., 2.3283e-10, -2.3283e-10, -1.8626e-09], [ 0.0000e+00, 4.6566e-10, 4.6566e-10, ..., 2.3283e-10, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-10, 4.6566e-10, ..., 6.9849e-10, 0.0000e+00, 0.0000e+00], [ 6.9849e-10, 2.3283e-10, 2.3283e-10, ..., 9.3132e-10, 0.0000e+00, 6.9849e-10], [ 2.3283e-10, 6.9849e-10, -4.1910e-09, ..., -8.3819e-09, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 344, bias, value: tensor([-0.0246, -0.0298, -0.0290, -0.0317, -0.0041, 0.0120, 0.0084, -0.0043, -0.0083, -0.0200], device='cuda:0'), grad: tensor([-5.8208e-09, 1.2340e-08, -5.8208e-09, -7.6834e-09, 1.2107e-08, 1.1176e-08, -1.2573e-08, 4.6566e-09, 8.1491e-09, -1.5134e-08], device='cuda:0') 100 0.0001 changing lr epoch 343, time 220.47, cls_loss 0.0006 cls_loss_mapping 0.0010 cls_loss_causal 0.4618 re_mapping 0.0030 re_causal 0.0089 /// teacc 99.15 lr 0.00010000 Epoch 345, weight, value: tensor([[-0.2381, -0.3032, 0.1199, ..., -0.1541, 0.0432, 0.0370], [-0.1492, -0.0754, -0.1112, ..., -0.2218, -0.0760, -0.0238], [ 0.0115, -0.1850, -0.2325, ..., -0.1789, 0.0349, -0.4042], ..., [-0.2218, 0.1846, 0.0310, ..., 0.2264, -0.0716, -0.1701], [-0.1956, -0.1975, 0.2148, ..., -0.1791, -0.1628, 0.1674], [ 0.0165, -0.3553, 0.1925, ..., 0.0604, -0.2078, -0.1700]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 4.6566e-10, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 2.7940e-09, 6.9849e-10, ..., 1.1642e-09, 0.0000e+00, -6.9849e-10], [ 9.3132e-10, 2.0955e-09, 2.3283e-10, ..., 4.6566e-10, 0.0000e+00, 4.6566e-10], ..., [ 9.3132e-10, -1.1176e-08, -5.3551e-09, ..., -7.4506e-09, 0.0000e+00, 2.3283e-10], [ 2.5611e-09, 2.0955e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 1.6298e-09, 1.4668e-08, -9.5228e-08, ..., -2.2375e-07, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 345, bias, value: tensor([-0.0244, -0.0298, -0.0290, -0.0317, -0.0041, 0.0120, 0.0084, -0.0044, -0.0084, -0.0199], device='cuda:0'), grad: tensor([ 1.1642e-09, -5.1223e-09, 1.0710e-08, -8.5915e-08, 4.4028e-07, 5.3551e-08, 2.7940e-09, -1.4668e-08, 8.8476e-09, -4.0978e-07], device='cuda:0') 100 0.0001 changing lr epoch 344, time 220.82, cls_loss 0.0007 cls_loss_mapping 0.0011 cls_loss_causal 0.4953 re_mapping 0.0030 re_causal 0.0092 /// teacc 99.10 lr 0.00010000 Epoch 346, weight, value: tensor([[-0.2382, -0.3038, 0.1179, ..., -0.1566, 0.0430, 0.0368], [-0.1493, -0.0754, -0.1113, ..., -0.2219, -0.0758, -0.0242], [ 0.0115, -0.1851, -0.2328, ..., -0.1791, 0.0347, -0.4044], ..., [-0.2219, 0.1846, 0.0309, ..., 0.2263, -0.0718, -0.1702], [-0.1967, -0.1982, 0.2147, ..., -0.1801, -0.1629, 0.1668], [ 0.0167, -0.3560, 0.1951, ..., 0.0617, -0.2079, -0.1699]], device='cuda:0'), grad: tensor([[ 6.2864e-09, 1.1642e-09, 4.6566e-10, ..., 6.9849e-10, 0.0000e+00, 2.7940e-09], [ 1.2806e-08, 2.8638e-08, -1.8626e-08, ..., 2.7940e-08, 0.0000e+00, -3.8184e-08], [ 2.1886e-08, 6.2864e-08, 4.4238e-09, ..., 6.1002e-08, 0.0000e+00, 7.6834e-09], ..., [ 1.0012e-08, -2.0862e-07, -1.1409e-08, ..., -2.1211e-07, 0.0000e+00, 7.2177e-09], [ 6.8918e-08, 6.7521e-09, 1.6531e-08, ..., 1.3970e-09, 0.0000e+00, 6.7288e-08], [ 3.1921e-07, 1.4435e-07, -1.5600e-08, ..., 7.9861e-08, 0.0000e+00, 1.1176e-07]], device='cuda:0') Epoch 346, bias, value: tensor([-0.0258, -0.0299, -0.0290, -0.0316, -0.0041, 0.0120, 0.0086, -0.0045, -0.0096, -0.0189], device='cuda:0'), grad: tensor([ 2.2817e-08, -1.7905e-07, 2.2654e-07, 3.0603e-06, 1.5856e-07, -3.9935e-06, 8.8243e-08, -6.2864e-07, 3.6834e-07, 8.7172e-07], device='cuda:0') 100 0.0001 changing lr epoch 345, time 220.68, cls_loss 0.0006 cls_loss_mapping 0.0011 cls_loss_causal 0.4507 re_mapping 0.0031 re_causal 0.0089 /// teacc 99.11 lr 0.00010000 Epoch 347, weight, value: tensor([[-0.2385, -0.3045, 0.1175, ..., -0.1572, 0.0429, 0.0367], [-0.1494, -0.0754, -0.1113, ..., -0.2221, -0.0758, -0.0241], [ 0.0118, -0.1852, -0.2333, ..., -0.1796, 0.0347, -0.4052], ..., [-0.2221, 0.1846, 0.0309, ..., 0.2265, -0.0718, -0.1702], [-0.1981, -0.1994, 0.2145, ..., -0.1811, -0.1629, 0.1664], [ 0.0164, -0.3571, 0.1960, ..., 0.0620, -0.2079, -0.1701]], device='cuda:0'), grad: tensor([[ 2.3283e-10, -1.6298e-09, -1.4203e-08, ..., 0.0000e+00, 0.0000e+00, -1.8626e-09], [ 6.9849e-10, 6.0536e-09, 5.8208e-09, ..., 1.1642e-09, 0.0000e+00, -6.9849e-10], [-1.1642e-09, 1.6298e-09, 2.5611e-09, ..., 4.6566e-10, 0.0000e+00, 4.6566e-10], ..., [ 2.3283e-10, -6.5193e-09, 9.3132e-10, ..., -2.0955e-09, 0.0000e+00, 1.3970e-09], [ 2.0955e-09, 2.3283e-10, 4.6566e-10, ..., 2.3283e-10, 0.0000e+00, 1.3970e-09], [ 8.3819e-09, 9.3132e-10, -1.1409e-08, ..., -1.4203e-08, 0.0000e+00, 6.2864e-09]], device='cuda:0') Epoch 347, bias, value: tensor([-0.0262, -0.0297, -0.0291, -0.0315, -0.0040, 0.0121, 0.0086, -0.0045, -0.0109, -0.0189], device='cuda:0'), grad: tensor([-3.6322e-08, 6.9384e-08, -8.9966e-07, 8.3121e-08, 2.3912e-07, -6.7521e-08, 1.4203e-08, 7.1246e-08, 5.5414e-07, -1.8161e-08], device='cuda:0') 100 0.0001 changing lr epoch 346, time 220.28, cls_loss 0.0008 cls_loss_mapping 0.0014 cls_loss_causal 0.4360 re_mapping 0.0030 re_causal 0.0084 /// teacc 99.09 lr 0.00010000 Epoch 348, weight, value: tensor([[-0.2386, -0.3047, 0.1178, ..., -0.1573, 0.0429, 0.0369], [-0.1497, -0.0754, -0.1113, ..., -0.2223, -0.0758, -0.0241], [ 0.0117, -0.1853, -0.2340, ..., -0.1797, 0.0347, -0.4055], ..., [-0.2224, 0.1849, 0.0311, ..., 0.2278, -0.0719, -0.1703], [-0.1986, -0.1997, 0.2148, ..., -0.1817, -0.1629, 0.1663], [ 0.0163, -0.3582, 0.1975, ..., 0.0636, -0.2079, -0.1699]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-09], [ 0.0000e+00, 6.9849e-10, 2.3283e-10, ..., 6.9849e-10, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -9.7789e-09, -7.4506e-09, ..., -1.4901e-08, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 8.3819e-09, 6.5193e-09, ..., 1.3504e-08, 0.0000e+00, 2.3283e-10]], device='cuda:0') Epoch 348, bias, value: tensor([-0.0262, -0.0297, -0.0289, -0.0317, -0.0053, 0.0121, 0.0086, -0.0044, -0.0116, -0.0179], device='cuda:0'), grad: tensor([ 1.2084e-07, 4.4238e-09, 2.0955e-09, 6.9849e-10, 1.6997e-08, 6.0536e-08, -2.0256e-07, -3.1432e-08, 2.7940e-09, 3.3062e-08], device='cuda:0') 100 0.0001 changing lr epoch 347, time 220.47, cls_loss 0.0009 cls_loss_mapping 0.0015 cls_loss_causal 0.4739 re_mapping 0.0030 re_causal 0.0085 /// teacc 99.13 lr 0.00010000 Epoch 349, weight, value: tensor([[-0.2390, -0.3049, 0.1181, ..., -0.1574, 0.0430, 0.0366], [-0.1491, -0.0756, -0.1113, ..., -0.2230, -0.0733, -0.0237], [ 0.0118, -0.1857, -0.2351, ..., -0.1804, 0.0323, -0.4055], ..., [-0.2227, 0.1851, 0.0311, ..., 0.2287, -0.0722, -0.1704], [-0.1995, -0.1999, 0.2155, ..., -0.1819, -0.1634, 0.1663], [ 0.0163, -0.3587, 0.1995, ..., 0.0654, -0.2080, -0.1704]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, -1.6764e-08, ..., 0.0000e+00, 0.0000e+00, -1.6764e-08], [ 7.6834e-09, 2.3283e-10, 1.1176e-08, ..., 2.3283e-10, 0.0000e+00, 3.7719e-08], [ 6.9849e-09, 0.0000e+00, 5.8208e-09, ..., 0.0000e+00, 0.0000e+00, 3.0268e-08], ..., [ 1.8626e-09, 2.3283e-10, 3.9581e-09, ..., 6.2864e-09, 0.0000e+00, 9.3132e-09], [-3.5041e-07, 0.0000e+00, -2.5774e-07, ..., 4.4238e-09, 0.0000e+00, -1.4678e-06], [-2.3283e-09, 2.3283e-10, -5.8208e-09, ..., -6.9849e-09, 0.0000e+00, -4.8894e-09]], device='cuda:0') Epoch 349, bias, value: tensor([-0.0265, -0.0290, -0.0300, -0.0317, -0.0071, 0.0121, 0.0085, -0.0043, -0.0118, -0.0165], device='cuda:0'), grad: tensor([-6.9151e-08, 1.2456e-07, 9.5461e-08, -4.5868e-08, -2.7940e-08, 3.9414e-06, 3.1106e-07, 5.9837e-08, -4.3511e-06, -1.5600e-08], device='cuda:0') 100 0.0001 changing lr epoch 348, time 220.19, cls_loss 0.0009 cls_loss_mapping 0.0013 cls_loss_causal 0.4619 re_mapping 0.0031 re_causal 0.0087 /// teacc 98.99 lr 0.00010000 Epoch 350, weight, value: tensor([[-0.2392, -0.3050, 0.1184, ..., -0.1573, 0.0430, 0.0377], [-0.1493, -0.0766, -0.1129, ..., -0.2234, -0.0732, -0.0266], [ 0.0120, -0.1861, -0.2358, ..., -0.1813, 0.0323, -0.4071], ..., [-0.2228, 0.1862, 0.0326, ..., 0.2291, -0.0726, -0.1679], [-0.1996, -0.2023, 0.2165, ..., -0.1829, -0.1640, 0.1677], [ 0.0163, -0.3592, 0.2004, ..., 0.0659, -0.2080, -0.1706]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.4925e-09, 0.0000e+00, ..., 1.1642e-09, 0.0000e+00, -1.2573e-08], [ 0.0000e+00, 3.0268e-09, 2.3283e-10, ..., 4.1910e-09, 0.0000e+00, 1.6298e-09], ..., [ 0.0000e+00, 3.3760e-07, 1.4435e-08, ..., 4.6566e-10, 0.0000e+00, 1.0477e-08], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 1.1642e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 6.0536e-09, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 350, bias, value: tensor([-0.0260, -0.0307, -0.0298, -0.0317, -0.0071, 0.0120, 0.0084, -0.0027, -0.0107, -0.0162], device='cuda:0'), grad: tensor([ 1.0012e-08, -1.6182e-07, 5.8440e-08, -5.7649e-07, -1.5530e-07, 3.0966e-08, 6.0536e-09, 7.2038e-07, 1.3271e-08, 6.2864e-08], device='cuda:0') 100 0.0001 changing lr epoch 349, time 220.15, cls_loss 0.0007 cls_loss_mapping 0.0015 cls_loss_causal 0.4501 re_mapping 0.0032 re_causal 0.0090 /// teacc 99.10 lr 0.00010000 Epoch 351, weight, value: tensor([[-0.2394, -0.3046, 0.1185, ..., -0.1573, 0.0431, 0.0375], [-0.1494, -0.0766, -0.1129, ..., -0.2236, -0.0733, -0.0267], [ 0.0119, -0.1857, -0.2363, ..., -0.1819, 0.0324, -0.4093], ..., [-0.2229, 0.1863, 0.0326, ..., 0.2294, -0.0726, -0.1680], [-0.1996, -0.2027, 0.2167, ..., -0.1830, -0.1642, 0.1682], [ 0.0163, -0.3595, 0.2005, ..., 0.0654, -0.2080, -0.1708]], device='cuda:0'), grad: tensor([[ 6.9849e-10, 4.1910e-09, 2.7940e-09, ..., 2.3283e-10, 0.0000e+00, 4.6566e-10], [ 2.3283e-10, -6.3563e-08, 7.2177e-09, ..., 3.7253e-09, 1.3970e-09, 2.3283e-10], [ 4.6566e-10, 5.4250e-08, 2.5146e-08, ..., 9.3132e-10, -1.6298e-09, 2.3283e-10], ..., [ 2.3283e-10, 5.8208e-08, 1.8626e-09, ..., -6.9849e-09, 0.0000e+00, 2.3283e-10], [ 1.9325e-08, 4.7265e-08, 2.9802e-08, ..., 0.0000e+00, 0.0000e+00, 4.1910e-09], [ 3.0734e-08, 2.3283e-09, 1.3970e-09, ..., 4.6566e-10, 0.0000e+00, 1.7462e-08]], device='cuda:0') Epoch 351, bias, value: tensor([-0.0263, -0.0305, -0.0296, -0.0318, -0.0067, 0.0116, 0.0092, -0.0031, -0.0107, -0.0167], device='cuda:0'), grad: tensor([ 1.7229e-08, -2.9709e-07, 9.1502e-08, -2.6356e-07, 3.7253e-09, -8.5682e-08, 1.3504e-08, 3.3691e-07, 1.4203e-07, 4.8196e-08], device='cuda:0') 100 0.0001 changing lr epoch 350, time 220.33, cls_loss 0.0010 cls_loss_mapping 0.0016 cls_loss_causal 0.4870 re_mapping 0.0031 re_causal 0.0086 /// teacc 99.16 lr 0.00010000 Epoch 352, weight, value: tensor([[-0.2406, -0.3048, 0.1183, ..., -0.1583, 0.0418, 0.0375], [-0.1498, -0.0765, -0.1131, ..., -0.2239, -0.0733, -0.0267], [ 0.0115, -0.1858, -0.2370, ..., -0.1824, 0.0322, -0.4108], ..., [-0.2230, 0.1862, 0.0327, ..., 0.2298, -0.0727, -0.1681], [-0.2005, -0.2032, 0.2169, ..., -0.1830, -0.1647, 0.1681], [ 0.0175, -0.3603, 0.2017, ..., 0.0656, -0.2082, -0.1715]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 4.4238e-09, 1.6298e-09, ..., 6.0536e-09, 0.0000e+00, -6.9849e-10], [ 3.2596e-09, 1.7136e-07, 6.1700e-08, ..., 2.0722e-07, 0.0000e+00, 1.6298e-09], [ 1.3970e-09, 2.4214e-08, 6.2864e-09, ..., 2.8173e-08, 0.0000e+00, 2.3283e-10], ..., [ 4.8894e-09, -4.5472e-07, -1.2666e-07, ..., -6.7102e-07, 0.0000e+00, 2.0955e-09], [ 8.8476e-09, 8.6147e-09, 2.3283e-09, ..., 5.1223e-09, 0.0000e+00, 7.9162e-09], [ 6.0536e-09, 7.6136e-08, -3.8417e-08, ..., 6.6590e-08, 0.0000e+00, 3.7253e-09]], device='cuda:0') Epoch 352, bias, value: tensor([-0.0272, -0.0302, -0.0288, -0.0319, -0.0082, 0.0116, 0.0094, -0.0037, -0.0118, -0.0150], device='cuda:0'), grad: tensor([ 4.3306e-08, 1.1642e-06, -1.0533e-06, 3.8324e-07, 1.7192e-06, 4.6031e-07, 7.5204e-08, -3.2559e-06, 6.5425e-08, 4.0815e-07], device='cuda:0') 100 0.0001 changing lr epoch 351, time 220.59, cls_loss 0.0009 cls_loss_mapping 0.0014 cls_loss_causal 0.4645 re_mapping 0.0030 re_causal 0.0087 /// teacc 99.07 lr 0.00010000 Epoch 353, weight, value: tensor([[-0.2408, -0.3050, 0.1191, ..., -0.1583, 0.0418, 0.0383], [-0.1499, -0.0769, -0.1133, ..., -0.2254, -0.0733, -0.0269], [ 0.0115, -0.1868, -0.2382, ..., -0.1849, 0.0322, -0.4116], ..., [-0.2231, 0.1868, 0.0330, ..., 0.2320, -0.0727, -0.1681], [-0.2008, -0.2050, 0.2175, ..., -0.1848, -0.1647, 0.1688], [ 0.0188, -0.3615, 0.2024, ..., 0.0655, -0.2082, -0.1718]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 2.3283e-10, -1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 1.7928e-08, 6.0536e-09, ..., 7.4506e-09, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 3.4925e-09, 1.1642e-09, ..., 1.3970e-09, 0.0000e+00, 4.6566e-10], ..., [ 0.0000e+00, -2.1420e-08, -8.6147e-09, ..., -1.1176e-08, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 6.9849e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 7.6834e-09, 3.9581e-09, 1.3970e-09, ..., 1.1642e-09, 0.0000e+00, 1.0245e-08]], device='cuda:0') Epoch 353, bias, value: tensor([-0.0269, -0.0306, -0.0285, -0.0319, -0.0097, 0.0114, 0.0096, -0.0035, -0.0120, -0.0135], device='cuda:0'), grad: tensor([-3.9581e-09, 4.2375e-08, 9.0804e-09, 1.8394e-08, 6.5193e-09, -5.9372e-08, 1.0943e-08, -4.9593e-08, 6.0536e-09, 2.7707e-08], device='cuda:0') 100 0.0001 changing lr epoch 352, time 220.33, cls_loss 0.0006 cls_loss_mapping 0.0010 cls_loss_causal 0.4427 re_mapping 0.0031 re_causal 0.0088 /// teacc 99.09 lr 0.00010000 Epoch 354, weight, value: tensor([[-0.2404, -0.3051, 0.1193, ..., -0.1583, 0.0418, 0.0391], [-0.1500, -0.0771, -0.1133, ..., -0.2258, -0.0733, -0.0269], [ 0.0118, -0.1868, -0.2382, ..., -0.1847, 0.0322, -0.4119], ..., [-0.2231, 0.1870, 0.0330, ..., 0.2323, -0.0727, -0.1681], [-0.2010, -0.2052, 0.2176, ..., -0.1852, -0.1647, 0.1690], [ 0.0188, -0.3620, 0.2024, ..., 0.0655, -0.2082, -0.1720]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, 6.9849e-10, ..., 4.6566e-10, 0.0000e+00, 4.6566e-10], [ 4.6566e-10, 6.2864e-09, 3.0268e-09, ..., 5.3551e-09, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 8.3819e-09, 2.3283e-10, ..., 7.6834e-09, 0.0000e+00, 2.3283e-10], ..., [ 0.0000e+00, -2.1188e-08, 5.1223e-09, ..., 2.3097e-07, 0.0000e+00, 4.4238e-09], [ 9.3132e-10, 1.6298e-09, -1.3970e-09, ..., 6.9849e-10, 0.0000e+00, -6.9849e-10], [-1.7229e-08, 4.6566e-10, -7.6368e-08, ..., -2.0955e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 354, bias, value: tensor([-0.0265, -0.0315, -0.0266, -0.0319, -0.0096, 0.0114, 0.0096, -0.0035, -0.0123, -0.0135], device='cuda:0'), grad: tensor([ 7.6834e-09, 2.4913e-08, 2.4680e-08, 1.3970e-08, -4.3330e-07, 4.7497e-08, -4.7265e-08, 7.5251e-07, 4.1910e-08, -4.4378e-07], device='cuda:0') 100 0.0001 changing lr epoch 353, time 220.72, cls_loss 0.0006 cls_loss_mapping 0.0011 cls_loss_causal 0.4593 re_mapping 0.0032 re_causal 0.0090 /// teacc 99.17 lr 0.00010000 Epoch 355, weight, value: tensor([[-0.2404, -0.3051, 0.1194, ..., -0.1583, 0.0418, 0.0393], [-0.1500, -0.0771, -0.1134, ..., -0.2259, -0.0732, -0.0269], [ 0.0118, -0.1869, -0.2394, ..., -0.1848, 0.0322, -0.4125], ..., [-0.2233, 0.1870, 0.0330, ..., 0.2323, -0.0727, -0.1682], [-0.2028, -0.2055, 0.2182, ..., -0.1857, -0.1648, 0.1682], [ 0.0188, -0.3622, 0.2025, ..., 0.0656, -0.2083, -0.1722]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, 4.0559e-07, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 1.6298e-08, 5.8208e-09, ..., 2.3283e-09, 0.0000e+00, -4.6566e-10], [-9.3132e-10, 2.3283e-09, 3.4925e-09, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 2.3283e-10, -1.9092e-08, -7.4506e-09, ..., -3.2596e-09, 0.0000e+00, 4.6566e-10], [ 2.3283e-10, 2.3283e-10, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 1.1642e-09, 2.0955e-09, -4.1374e-07, ..., 4.6566e-10, 0.0000e+00, 2.3283e-10]], device='cuda:0') Epoch 355, bias, value: tensor([-0.0265, -0.0315, -0.0272, -0.0319, -0.0098, 0.0115, 0.0101, -0.0036, -0.0130, -0.0135], device='cuda:0'), grad: tensor([ 8.6147e-07, 2.1420e-08, 1.0710e-08, -2.3050e-08, -1.3271e-08, 1.6997e-08, 1.3039e-08, -2.4447e-08, 2.5611e-09, -8.6613e-07], device='cuda:0') 100 0.0001 changing lr epoch 354, time 221.33, cls_loss 0.0006 cls_loss_mapping 0.0012 cls_loss_causal 0.4350 re_mapping 0.0032 re_causal 0.0088 /// teacc 99.12 lr 0.00010000 Epoch 356, weight, value: tensor([[-0.2412, -0.3053, 0.1195, ..., -0.1583, 0.0417, 0.0393], [-0.1500, -0.0771, -0.1134, ..., -0.2261, -0.0732, -0.0269], [ 0.0113, -0.1870, -0.2398, ..., -0.1849, 0.0322, -0.4137], ..., [-0.2239, 0.1870, 0.0330, ..., 0.2325, -0.0727, -0.1682], [-0.2030, -0.2057, 0.2193, ..., -0.1852, -0.1649, 0.1686], [ 0.0192, -0.3635, 0.2026, ..., 0.0657, -0.2083, -0.1729]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, -1.6764e-08, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [-5.0757e-08, 1.2806e-08, 1.1642e-09, ..., 1.0245e-08, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 1.1642e-09, 3.0268e-09, ..., 9.3132e-10, 0.0000e+00, 2.3283e-10], ..., [ 1.3504e-08, -1.9558e-08, 2.0955e-09, ..., -5.5879e-09, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 0.0000e+00, -4.6566e-10, ..., 4.6566e-10, 0.0000e+00, -1.1642e-09], [ 2.5611e-08, 2.3283e-09, 1.8626e-09, ..., 2.1863e-07, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 356, bias, value: tensor([-0.0266, -0.0315, -0.0270, -0.0315, -0.0098, 0.0113, 0.0101, -0.0037, -0.0126, -0.0136], device='cuda:0'), grad: tensor([-4.4005e-08, -6.4494e-07, 2.3749e-08, 5.1223e-08, -9.7696e-07, 1.0012e-07, -1.8370e-07, 1.9139e-07, 1.4668e-08, 1.4687e-06], device='cuda:0') 100 0.0001 changing lr epoch 355, time 220.53, cls_loss 0.0006 cls_loss_mapping 0.0012 cls_loss_causal 0.4610 re_mapping 0.0033 re_causal 0.0093 /// teacc 99.15 lr 0.00010000 Epoch 357, weight, value: tensor([[-0.2424, -0.3053, 0.1198, ..., -0.1583, 0.0417, 0.0393], [-0.1497, -0.0771, -0.1134, ..., -0.2262, -0.0732, -0.0268], [ 0.0112, -0.1872, -0.2408, ..., -0.1851, 0.0322, -0.4140], ..., [-0.2243, 0.1871, 0.0330, ..., 0.2327, -0.0727, -0.1683], [-0.2039, -0.2057, 0.2203, ..., -0.1852, -0.1649, 0.1685], [ 0.0192, -0.3645, 0.2026, ..., 0.0655, -0.2083, -0.1739]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 1.3970e-09, -9.3132e-10, ..., 4.4238e-09, 0.0000e+00, -2.3283e-10], [ 0.0000e+00, 1.3970e-09, -1.6298e-09, ..., 9.5461e-09, 0.0000e+00, -3.9581e-09], [ 2.3283e-10, 1.4901e-08, 4.4238e-09, ..., 2.7940e-09, 0.0000e+00, 4.6566e-10], ..., [ 0.0000e+00, 4.4238e-09, 2.0955e-09, ..., 6.0536e-09, 0.0000e+00, 1.1642e-09], [ 4.1910e-09, 1.8626e-09, 1.1642e-09, ..., 1.3970e-09, 0.0000e+00, 3.2596e-09], [ 1.4435e-08, 1.6298e-09, -3.0268e-09, ..., 9.7323e-08, 0.0000e+00, 9.5461e-09]], device='cuda:0') Epoch 357, bias, value: tensor([-0.0265, -0.0315, -0.0270, -0.0315, -0.0097, 0.0113, 0.0102, -0.0038, -0.0128, -0.0137], device='cuda:0'), grad: tensor([ 2.6543e-08, 1.6065e-08, 5.9837e-08, -8.1491e-08, -8.1351e-07, 9.5461e-09, 1.2433e-07, 5.6578e-08, 2.9337e-08, 5.7276e-07], device='cuda:0') 100 0.0001 changing lr epoch 356, time 220.38, cls_loss 0.0007 cls_loss_mapping 0.0012 cls_loss_causal 0.4666 re_mapping 0.0032 re_causal 0.0090 /// teacc 99.15 lr 0.00010000 Epoch 358, weight, value: tensor([[-0.2427, -0.3061, 0.1199, ..., -0.1584, 0.0417, 0.0391], [-0.1491, -0.0776, -0.1135, ..., -0.2273, -0.0732, -0.0268], [ 0.0112, -0.1876, -0.2412, ..., -0.1859, 0.0322, -0.4141], ..., [-0.2260, 0.1878, 0.0332, ..., 0.2346, -0.0728, -0.1684], [-0.2047, -0.2059, 0.2211, ..., -0.1854, -0.1649, 0.1685], [ 0.0184, -0.3685, 0.2026, ..., 0.0651, -0.2083, -0.1756]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -4.1910e-09, ..., 0.0000e+00, 0.0000e+00, -2.7940e-09], [ 0.0000e+00, 1.8626e-09, 6.9849e-10, ..., 9.3132e-10, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 6.9849e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 9.3132e-10, ..., -1.6298e-09, 0.0000e+00, 6.9849e-10], [ 0.0000e+00, 6.9849e-10, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 9.3132e-10, 4.6566e-10, ..., 2.3283e-10, 0.0000e+00, 2.3283e-10]], device='cuda:0') Epoch 358, bias, value: tensor([-0.0268, -0.0317, -0.0269, -0.0315, -0.0096, 0.0114, 0.0103, -0.0035, -0.0131, -0.0139], device='cuda:0'), grad: tensor([-1.0245e-08, -4.6566e-09, -5.5879e-09, -2.1188e-08, 2.0955e-09, 1.7229e-08, -5.3551e-09, 2.6543e-08, 1.8626e-09, 4.1910e-09], device='cuda:0') 100 0.0001 changing lr epoch 357, time 220.24, cls_loss 0.0006 cls_loss_mapping 0.0010 cls_loss_causal 0.4926 re_mapping 0.0031 re_causal 0.0093 /// teacc 99.12 lr 0.00010000 Epoch 359, weight, value: tensor([[-0.2426, -0.3061, 0.1201, ..., -0.1584, 0.0417, 0.0395], [-0.1496, -0.0777, -0.1136, ..., -0.2281, -0.0732, -0.0269], [ 0.0108, -0.1878, -0.2414, ..., -0.1861, 0.0322, -0.4142], ..., [-0.2279, 0.1879, 0.0332, ..., 0.2354, -0.0728, -0.1685], [-0.2054, -0.2063, 0.2215, ..., -0.1850, -0.1649, 0.1686], [ 0.0183, -0.3696, 0.2026, ..., 0.0650, -0.2084, -0.1763]], device='cuda:0'), grad: tensor([[-2.3283e-10, 0.0000e+00, -1.0245e-08, ..., 0.0000e+00, 0.0000e+00, -3.4459e-08], [ 0.0000e+00, 3.4925e-09, 9.3132e-10, ..., 3.0268e-09, 0.0000e+00, 1.8626e-09], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], ..., [ 0.0000e+00, -6.9849e-10, 0.0000e+00, ..., -1.3970e-09, 0.0000e+00, 2.3283e-10], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 5.3551e-09], [ 2.3283e-10, 1.8626e-09, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 359, bias, value: tensor([-0.0266, -0.0319, -0.0267, -0.0313, -0.0096, 0.0112, 0.0105, -0.0035, -0.0133, -0.0141], device='cuda:0'), grad: tensor([-9.1502e-08, 2.1188e-08, -5.1223e-09, -6.0536e-09, 1.6298e-09, 2.9569e-08, 2.5146e-08, 2.3283e-10, 3.2363e-08, 6.2864e-09], device='cuda:0') 100 0.0001 changing lr epoch 358, time 220.48, cls_loss 0.0007 cls_loss_mapping 0.0017 cls_loss_causal 0.4444 re_mapping 0.0030 re_causal 0.0085 /// teacc 99.16 lr 0.00010000 Epoch 360, weight, value: tensor([[-0.2429, -0.3061, 0.1231, ..., -0.1585, 0.0417, 0.0418], [-0.1499, -0.0779, -0.1137, ..., -0.2287, -0.0733, -0.0271], [ 0.0107, -0.1879, -0.2422, ..., -0.1863, 0.0321, -0.4144], ..., [-0.2281, 0.1881, 0.0333, ..., 0.2361, -0.0728, -0.1686], [-0.2058, -0.2066, 0.2216, ..., -0.1854, -0.1650, 0.1687], [ 0.0183, -0.3706, 0.2006, ..., 0.0649, -0.2084, -0.1778]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 0.0000e+00, 4.4238e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -2.3283e-10], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 1.3271e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, -1.1642e-09, ..., 0.0000e+00, 0.0000e+00, -1.6298e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 360, bias, value: tensor([-0.0238, -0.0319, -0.0266, -0.0312, -0.0095, 0.0111, 0.0108, -0.0036, -0.0136, -0.0146], device='cuda:0'), grad: tensor([ 1.0477e-08, 5.3551e-09, 1.6298e-09, -4.5868e-08, 4.6566e-09, 3.9814e-08, -5.3551e-08, 4.1444e-08, 9.3132e-10, 2.5611e-09], device='cuda:0') 100 0.0001 changing lr epoch 359, time 220.50, cls_loss 0.0007 cls_loss_mapping 0.0014 cls_loss_causal 0.4619 re_mapping 0.0030 re_causal 0.0087 /// teacc 99.12 lr 0.00010000 Epoch 361, weight, value: tensor([[-0.2431, -0.3063, 0.1232, ..., -0.1585, 0.0417, 0.0420], [-0.1503, -0.0776, -0.1137, ..., -0.2291, -0.0733, -0.0273], [ 0.0108, -0.1878, -0.2424, ..., -0.1859, 0.0321, -0.4145], ..., [-0.2283, 0.1878, 0.0333, ..., 0.2365, -0.0728, -0.1686], [-0.2068, -0.2068, 0.2228, ..., -0.1854, -0.1650, 0.1654], [ 0.0184, -0.3712, 0.2007, ..., 0.0649, -0.2084, -0.1784]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, 6.9849e-10, ..., 2.3283e-10, 0.0000e+00, 2.3283e-10], [ 2.3283e-10, -6.0536e-09, 1.6298e-09, ..., 6.9849e-10, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 1.1176e-08, 6.9849e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 2.3283e-10, 7.4506e-09, 6.9849e-10, ..., 4.6566e-10, 0.0000e+00, 2.3283e-10], [ 1.8626e-09, 9.5461e-09, 3.2596e-08, ..., 1.9092e-08, 0.0000e+00, 1.0012e-08], [ 9.3132e-10, 0.0000e+00, -3.7253e-08, ..., -2.6310e-08, 0.0000e+00, -1.1874e-08]], device='cuda:0') Epoch 361, bias, value: tensor([-0.0236, -0.0317, -0.0262, -0.0312, -0.0095, 0.0110, 0.0124, -0.0042, -0.0186, -0.0146], device='cuda:0'), grad: tensor([ 6.7521e-09, -1.7835e-07, 3.7020e-08, -6.9384e-08, 1.3062e-07, -3.9581e-09, 2.3516e-08, 6.2864e-08, 1.3947e-07, -1.4110e-07], device='cuda:0') 100 0.0001 changing lr epoch 360, time 220.66, cls_loss 0.0006 cls_loss_mapping 0.0014 cls_loss_causal 0.4665 re_mapping 0.0032 re_causal 0.0091 /// teacc 99.15 lr 0.00010000 Epoch 362, weight, value: tensor([[-0.2433, -0.3065, 0.1232, ..., -0.1587, 0.0417, 0.0421], [-0.1500, -0.0777, -0.1138, ..., -0.2299, -0.0733, -0.0273], [ 0.0108, -0.1884, -0.2445, ..., -0.1867, 0.0321, -0.4154], ..., [-0.2283, 0.1880, 0.0334, ..., 0.2376, -0.0728, -0.1686], [-0.2077, -0.2069, 0.2239, ..., -0.1860, -0.1651, 0.1656], [ 0.0183, -0.3722, 0.2008, ..., 0.0649, -0.2084, -0.1786]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, -4.6566e-10], [ 0.0000e+00, 7.6834e-09, 1.6298e-09, ..., 4.6566e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -7.6834e-09, -1.6298e-09, ..., -5.5879e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 2.3283e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 362, bias, value: tensor([-0.0235, -0.0318, -0.0264, -0.0311, -0.0096, 0.0110, 0.0124, -0.0040, -0.0186, -0.0146], device='cuda:0'), grad: tensor([ 2.7940e-09, -9.3132e-10, 3.0501e-08, 2.3283e-10, -1.8626e-09, 1.6298e-08, -4.0513e-08, -1.2107e-08, 4.6566e-09, 1.4435e-08], device='cuda:0') 100 0.0001 changing lr epoch 361, time 220.63, cls_loss 0.0005 cls_loss_mapping 0.0013 cls_loss_causal 0.4606 re_mapping 0.0033 re_causal 0.0091 /// teacc 99.16 lr 0.00010000 Epoch 363, weight, value: tensor([[-0.2435, -0.3067, 0.1242, ..., -0.1587, 0.0419, 0.0423], [-0.1500, -0.0777, -0.1139, ..., -0.2303, -0.0733, -0.0270], [ 0.0107, -0.1889, -0.2473, ..., -0.1875, 0.0321, -0.4177], ..., [-0.2284, 0.1878, 0.0334, ..., 0.2381, -0.0728, -0.1686], [-0.2082, -0.2070, 0.2248, ..., -0.1860, -0.1651, 0.1657], [ 0.0185, -0.3726, 0.2005, ..., 0.0650, -0.2084, -0.1790]], device='cuda:0'), grad: tensor([[ 1.1642e-10, 1.5134e-09, 5.8208e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 7.6834e-09, 3.0268e-09, ..., 5.1223e-09, 0.0000e+00, 1.0477e-09], [ 0.0000e+00, 1.8626e-08, 3.8417e-09, ..., 2.2817e-08, 0.0000e+00, 1.1642e-10], ..., [ 1.1642e-09, -3.5623e-08, -5.2387e-09, ..., -4.8196e-08, 0.0000e+00, 0.0000e+00], [ 1.6298e-09, 6.4028e-09, 1.1642e-10, ..., 1.1642e-10, 0.0000e+00, -3.2596e-09], [ 5.1223e-09, 2.3516e-08, 8.6147e-09, ..., 1.0477e-08, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 363, bias, value: tensor([-0.0227, -0.0303, -0.0282, -0.0311, -0.0097, 0.0111, 0.0124, -0.0048, -0.0185, -0.0147], device='cuda:0'), grad: tensor([ 3.9581e-09, 2.5379e-08, 4.9360e-08, -8.9174e-08, 1.6065e-08, 2.7008e-08, 8.1491e-10, -1.1094e-07, 1.3039e-08, 6.6590e-08], device='cuda:0') 100 0.0001 changing lr epoch 362, time 220.50, cls_loss 0.0006 cls_loss_mapping 0.0010 cls_loss_causal 0.4738 re_mapping 0.0031 re_causal 0.0091 /// teacc 99.16 lr 0.00010000 Epoch 364, weight, value: tensor([[-0.2437, -0.3069, 0.1243, ..., -0.1587, 0.0419, 0.0422], [-0.1501, -0.0776, -0.1139, ..., -0.2306, -0.0733, -0.0271], [ 0.0108, -0.1888, -0.2471, ..., -0.1871, 0.0321, -0.4179], ..., [-0.2285, 0.1878, 0.0334, ..., 0.2385, -0.0728, -0.1688], [-0.2083, -0.2076, 0.2257, ..., -0.1872, -0.1651, 0.1659], [ 0.0182, -0.3732, 0.2006, ..., 0.0651, -0.2084, -0.1797]], device='cuda:0'), grad: tensor([[2.3283e-10, 9.1968e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [8.1491e-10, 2.8173e-08, 0.0000e+00, ..., 5.8208e-10, 0.0000e+00, 6.9849e-10], [1.1642e-10, 1.9209e-08, 1.1642e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [1.1642e-10, 9.5461e-09, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 1.1642e-10], [6.5193e-09, 4.3074e-09, 0.0000e+00, ..., 0.0000e+00, 1.1642e-10, 5.7044e-09], [1.2806e-09, 2.2119e-09, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 8.1491e-10]], device='cuda:0') Epoch 364, bias, value: tensor([-0.0228, -0.0301, -0.0277, -0.0311, -0.0097, 0.0110, 0.0126, -0.0051, -0.0187, -0.0147], device='cuda:0'), grad: tensor([ 3.5274e-08, 1.0617e-07, 6.8103e-08, -2.7521e-07, 7.2177e-09, -7.3691e-08, 6.2864e-08, 3.7951e-08, 2.9220e-08, 1.2806e-08], device='cuda:0') 100 0.0001 changing lr epoch 363, time 220.52, cls_loss 0.0006 cls_loss_mapping 0.0014 cls_loss_causal 0.4351 re_mapping 0.0031 re_causal 0.0087 /// teacc 99.06 lr 0.00010000 Epoch 365, weight, value: tensor([[-0.2438, -0.3077, 0.1243, ..., -0.1588, 0.0419, 0.0421], [-0.1502, -0.0779, -0.1139, ..., -0.2312, -0.0732, -0.0271], [ 0.0108, -0.1891, -0.2475, ..., -0.1877, 0.0320, -0.4181], ..., [-0.2285, 0.1880, 0.0334, ..., 0.2394, -0.0728, -0.1688], [-0.2092, -0.2084, 0.2257, ..., -0.1873, -0.1651, 0.1656], [ 0.0186, -0.3746, 0.2008, ..., 0.0646, -0.2084, -0.1799]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.9849e-10, 1.0477e-09, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 2.6776e-09, 9.3132e-10, ..., 1.1642e-10, 0.0000e+00, 1.0477e-09], [ 0.0000e+00, 6.9849e-10, 2.2119e-09, ..., 0.0000e+00, 0.0000e+00, 5.1223e-09], ..., [ 1.1642e-10, 7.7998e-09, 8.1491e-10, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 1.1642e-10, 2.4447e-09, -1.5483e-08, ..., 0.0000e+00, 0.0000e+00, -2.0955e-08], [ 0.0000e+00, 4.6566e-10, 3.3760e-09, ..., 0.0000e+00, 0.0000e+00, 4.5402e-09]], device='cuda:0') Epoch 365, bias, value: tensor([-0.0231, -0.0302, -0.0277, -0.0310, -0.0096, 0.0110, 0.0127, -0.0050, -0.0190, -0.0148], device='cuda:0'), grad: tensor([ 8.6147e-09, 1.1642e-09, 5.7044e-09, -4.9546e-07, 8.1491e-10, 4.6776e-07, 1.4086e-08, 3.5274e-08, -5.6112e-08, 1.5716e-08], device='cuda:0') 100 0.0001 changing lr epoch 364, time 220.20, cls_loss 0.0007 cls_loss_mapping 0.0011 cls_loss_causal 0.4496 re_mapping 0.0030 re_causal 0.0085 /// teacc 98.98 lr 0.00010000 Epoch 366, weight, value: tensor([[-0.2441, -0.3080, 0.1242, ..., -0.1588, 0.0419, 0.0419], [-0.1505, -0.0779, -0.1140, ..., -0.2314, -0.0732, -0.0271], [ 0.0109, -0.1898, -0.2483, ..., -0.1882, 0.0320, -0.4185], ..., [-0.2288, 0.1878, 0.0334, ..., 0.2395, -0.0728, -0.1689], [-0.2096, -0.2092, 0.2262, ..., -0.1879, -0.1651, 0.1657], [ 0.0185, -0.3750, 0.2012, ..., 0.0631, -0.2084, -0.1799]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.9849e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.0955e-09, 2.3283e-10, ..., 4.6566e-10, 0.0000e+00, -3.7253e-09], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], ..., [ 0.0000e+00, 6.2864e-09, 6.9849e-10, ..., 2.5611e-09, 0.0000e+00, 3.4925e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, -1.8626e-09, ..., -1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 366, bias, value: tensor([-0.0233, -0.0302, -0.0278, -0.0306, -0.0088, 0.0109, 0.0128, -0.0054, -0.0192, -0.0155], device='cuda:0'), grad: tensor([ 1.8626e-09, -1.0640e-07, 9.3132e-09, -2.7241e-08, -3.9581e-09, 8.1491e-09, 1.1642e-09, 1.2876e-07, 2.3283e-10, -6.7521e-09], device='cuda:0') 100 0.0001 changing lr epoch 365, time 220.52, cls_loss 0.0006 cls_loss_mapping 0.0012 cls_loss_causal 0.4382 re_mapping 0.0031 re_causal 0.0087 /// teacc 99.18 lr 0.00010000 Epoch 367, weight, value: tensor([[-0.2444, -0.3082, 0.1243, ..., -0.1588, 0.0419, 0.0420], [-0.1506, -0.0779, -0.1140, ..., -0.2315, -0.0732, -0.0271], [ 0.0111, -0.1899, -0.2482, ..., -0.1884, 0.0320, -0.4188], ..., [-0.2293, 0.1877, 0.0334, ..., 0.2395, -0.0728, -0.1691], [-0.2098, -0.2095, 0.2266, ..., -0.1879, -0.1651, 0.1659], [ 0.0184, -0.3753, 0.2014, ..., 0.0631, -0.2085, -0.1803]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, -2.5611e-09, ..., 0.0000e+00, 0.0000e+00, -1.1642e-09], [ 0.0000e+00, 1.3970e-09, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.1642e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 1.5832e-08, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 1.1642e-09, 6.9849e-10, ..., 6.9849e-10, 0.0000e+00, 4.6566e-10], [-3.7253e-09, 2.3283e-10, -1.5367e-08, ..., -2.3749e-08, 0.0000e+00, -4.6566e-10]], device='cuda:0') Epoch 367, bias, value: tensor([-0.0233, -0.0301, -0.0274, -0.0306, -0.0088, 0.0111, 0.0126, -0.0056, -0.0192, -0.0155], device='cuda:0'), grad: tensor([-6.5193e-09, 4.8894e-09, 3.4925e-09, -4.3772e-08, 7.2410e-08, 4.6566e-09, 6.9849e-09, 3.7719e-08, 5.5879e-09, -8.2655e-08], device='cuda:0') 100 0.0001 changing lr epoch 366, time 220.54, cls_loss 0.0006 cls_loss_mapping 0.0012 cls_loss_causal 0.4675 re_mapping 0.0029 re_causal 0.0085 /// teacc 99.17 lr 0.00010000 Epoch 368, weight, value: tensor([[-0.2446, -0.3087, 0.1267, ..., -0.1574, 0.0420, 0.0420], [-0.1507, -0.0783, -0.1144, ..., -0.2323, -0.0732, -0.0272], [ 0.0111, -0.1902, -0.2487, ..., -0.1888, 0.0320, -0.4189], ..., [-0.2293, 0.1880, 0.0337, ..., 0.2404, -0.0728, -0.1690], [-0.2102, -0.2102, 0.2271, ..., -0.1881, -0.1651, 0.1660], [ 0.0180, -0.3764, 0.1994, ..., 0.0627, -0.2085, -0.1811]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 2.3283e-10, 4.6566e-10, ..., 2.3283e-10, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 5.8906e-08, 4.6566e-09, ..., 7.2643e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.2340e-08, -4.6566e-10, ..., 1.1642e-08, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -1.0547e-07, -7.9162e-09, ..., -1.2619e-07, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 2.0955e-09, 1.8626e-09, ..., 2.7940e-09, 0.0000e+00, 2.3283e-10], [-2.3283e-10, 6.9849e-10, -2.3283e-09, ..., -6.9849e-10, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 368, bias, value: tensor([-0.0212, -0.0305, -0.0274, -0.0303, -0.0087, 0.0110, 0.0126, -0.0054, -0.0192, -0.0162], device='cuda:0'), grad: tensor([ 5.5879e-09, 2.9383e-07, 2.4913e-08, 1.3970e-09, 1.6671e-07, -8.9221e-07, 8.9779e-07, -5.0012e-07, 2.0023e-08, -2.0955e-09], device='cuda:0') 100 0.0001 changing lr epoch 367, time 220.65, cls_loss 0.0007 cls_loss_mapping 0.0015 cls_loss_causal 0.4804 re_mapping 0.0028 re_causal 0.0084 /// teacc 99.16 lr 0.00010000 Epoch 369, weight, value: tensor([[-0.2450, -0.3089, 0.1268, ..., -0.1574, 0.0420, 0.0424], [-0.1511, -0.0784, -0.1144, ..., -0.2331, -0.0732, -0.0272], [ 0.0110, -0.1927, -0.2521, ..., -0.1922, 0.0320, -0.4203], ..., [-0.2295, 0.1885, 0.0337, ..., 0.2423, -0.0728, -0.1691], [-0.2100, -0.2105, 0.2308, ..., -0.1875, -0.1651, 0.1673], [ 0.0180, -0.3774, 0.1995, ..., 0.0623, -0.2085, -0.1818]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 2.3283e-10, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 4.1910e-09, 1.6298e-09, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 6.9849e-10, -3.0268e-09, ..., 6.9849e-10, 0.0000e+00, -3.2596e-09], ..., [ 0.0000e+00, -6.0536e-09, 2.3283e-10, ..., -5.3551e-09, 0.0000e+00, 4.6566e-10], [ 2.3283e-10, 0.0000e+00, 4.6566e-10, ..., 2.3283e-10, 0.0000e+00, 6.9849e-10], [ 0.0000e+00, 2.3283e-10, -4.8894e-08, ..., -5.4250e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 369, bias, value: tensor([-0.0211, -0.0306, -0.0282, -0.0303, -0.0085, 0.0110, 0.0123, -0.0049, -0.0183, -0.0164], device='cuda:0'), grad: tensor([ 2.9802e-08, 1.1944e-07, -1.7090e-07, 1.1642e-09, 1.8859e-07, 9.3132e-09, 2.3283e-09, -4.8894e-09, 7.6834e-09, -1.8789e-07], device='cuda:0') 100 0.0001 changing lr epoch 368, time 220.42, cls_loss 0.0007 cls_loss_mapping 0.0012 cls_loss_causal 0.4754 re_mapping 0.0029 re_causal 0.0086 /// teacc 99.19 lr 0.00010000 Epoch 370, weight, value: tensor([[-0.2452, -0.3093, 0.1269, ..., -0.1575, 0.0420, 0.0426], [-0.1513, -0.0792, -0.1146, ..., -0.2364, -0.0732, -0.0273], [ 0.0110, -0.1935, -0.2533, ..., -0.1936, 0.0320, -0.4209], ..., [-0.2296, 0.1894, 0.0338, ..., 0.2456, -0.0728, -0.1691], [-0.2101, -0.2111, 0.2333, ..., -0.1878, -0.1651, 0.1683], [ 0.0180, -0.3804, 0.1997, ..., 0.0625, -0.2085, -0.1829]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.3551e-09, -1.3970e-09, ..., 3.2596e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 8.3819e-09, 2.3283e-10, ..., 5.1223e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -1.1828e-07, 0.0000e+00, ..., -7.3807e-08, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 9.9419e-08, 0.0000e+00, ..., 6.2166e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 370, bias, value: tensor([-0.0211, -0.0305, -0.0289, -0.0301, -0.0086, 0.0104, 0.0128, -0.0043, -0.0179, -0.0164], device='cuda:0'), grad: tensor([ 1.1735e-07, 2.0931e-07, -3.0119e-06, 1.0291e-06, 3.5623e-08, -9.6485e-07, 2.8173e-08, 2.1830e-06, 3.5577e-07, 2.3982e-08], device='cuda:0') 100 0.0001 changing lr epoch 369, time 220.21, cls_loss 0.0005 cls_loss_mapping 0.0011 cls_loss_causal 0.4676 re_mapping 0.0029 re_causal 0.0091 /// teacc 99.09 lr 0.00010000 Epoch 371, weight, value: tensor([[-0.2454, -0.3096, 0.1269, ..., -0.1576, 0.0421, 0.0427], [-0.1514, -0.0793, -0.1147, ..., -0.2370, -0.0732, -0.0274], [ 0.0107, -0.1939, -0.2537, ..., -0.1947, 0.0320, -0.4210], ..., [-0.2297, 0.1896, 0.0339, ..., 0.2477, -0.0728, -0.1691], [-0.2101, -0.2113, 0.2348, ..., -0.1882, -0.1651, 0.1692], [ 0.0182, -0.3826, 0.1997, ..., 0.0620, -0.2085, -0.1838]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -1.6298e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 371, bias, value: tensor([-0.0212, -0.0307, -0.0288, -0.0300, -0.0087, 0.0102, 0.0129, -0.0041, -0.0174, -0.0166], device='cuda:0'), grad: tensor([ 5.5414e-08, -3.6787e-08, 4.1979e-07, -1.0710e-08, -5.2899e-07, 1.0477e-08, 2.9569e-08, 5.1688e-08, 9.3132e-10, 5.3551e-09], device='cuda:0') 100 0.0001 changing lr epoch 370, time 220.79, cls_loss 0.0007 cls_loss_mapping 0.0016 cls_loss_causal 0.4774 re_mapping 0.0029 re_causal 0.0086 /// teacc 99.20 lr 0.00010000 Epoch 372, weight, value: tensor([[-0.2458, -0.3102, 0.1270, ..., -0.1578, 0.0424, 0.0429], [-0.1520, -0.0793, -0.1148, ..., -0.2374, -0.0732, -0.0275], [ 0.0106, -0.1942, -0.2556, ..., -0.1951, 0.0319, -0.4228], ..., [-0.2299, 0.1896, 0.0339, ..., 0.2481, -0.0728, -0.1693], [-0.2102, -0.2111, 0.2368, ..., -0.1897, -0.1651, 0.1707], [ 0.0183, -0.3832, 0.2000, ..., 0.0602, -0.2085, -0.1844]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 3.9581e-09, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 1.3970e-09, 0.0000e+00, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], [ 2.3283e-10, 0.0000e+00, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], ..., [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 1.1642e-08, ..., 0.0000e+00, 0.0000e+00, -2.7940e-09], [ 6.0536e-09, 0.0000e+00, -1.7695e-08, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09]], device='cuda:0') Epoch 372, bias, value: tensor([-0.0211, -0.0312, -0.0279, -0.0304, -0.0069, 0.0106, 0.0124, -0.0043, -0.0166, -0.0182], device='cuda:0'), grad: tensor([ 8.8476e-09, 1.0477e-08, -4.8894e-09, 3.0734e-07, 3.2596e-09, -3.2643e-07, 4.1910e-09, 3.4925e-09, 1.8626e-08, -1.9092e-08], device='cuda:0') 100 0.0001 changing lr epoch 371, time 220.54, cls_loss 0.0007 cls_loss_mapping 0.0010 cls_loss_causal 0.4394 re_mapping 0.0029 re_causal 0.0083 /// teacc 99.14 lr 0.00010000 Epoch 373, weight, value: tensor([[-0.2472, -0.3102, 0.1271, ..., -0.1579, 0.0425, 0.0426], [-0.1522, -0.0794, -0.1149, ..., -0.2379, -0.0733, -0.0275], [ 0.0104, -0.1935, -0.2561, ..., -0.1935, 0.0320, -0.4232], ..., [-0.2301, 0.1897, 0.0339, ..., 0.2485, -0.0729, -0.1693], [-0.2115, -0.2113, 0.2361, ..., -0.1918, -0.1651, 0.1707], [ 0.0186, -0.3845, 0.2006, ..., 0.0604, -0.2085, -0.1847]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -1.0733e-07, 6.9849e-10, ..., 1.6298e-09, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 2.0023e-08, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 1.1642e-09], ..., [ 0.0000e+00, 8.7777e-08, 6.9849e-10, ..., 4.6566e-10, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, -3.0268e-09, ..., 2.3283e-10, 0.0000e+00, -5.1223e-09], [ 2.3283e-10, 4.6566e-10, -2.0955e-09, ..., -2.0955e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 373, bias, value: tensor([-0.0212, -0.0316, -0.0263, -0.0306, -0.0069, 0.0103, 0.0129, -0.0047, -0.0172, -0.0181], device='cuda:0'), grad: tensor([ 3.0268e-09, -9.2667e-07, 1.7532e-07, 1.2340e-08, 4.4238e-09, 5.8208e-09, 3.0268e-09, 7.5111e-07, -9.5461e-09, -5.8208e-09], device='cuda:0') 100 0.0001 changing lr epoch 372, time 220.71, cls_loss 0.0008 cls_loss_mapping 0.0011 cls_loss_causal 0.4515 re_mapping 0.0029 re_causal 0.0082 /// teacc 99.15 lr 0.00010000 Epoch 374, weight, value: tensor([[-0.2485, -0.3106, 0.1258, ..., -0.1580, 0.0425, 0.0395], [-0.1521, -0.0794, -0.1150, ..., -0.2384, -0.0733, -0.0271], [ 0.0102, -0.1939, -0.2580, ..., -0.1939, 0.0320, -0.4250], ..., [-0.2303, 0.1898, 0.0340, ..., 0.2491, -0.0729, -0.1694], [-0.2120, -0.2115, 0.2401, ..., -0.1926, -0.1651, 0.1731], [ 0.0159, -0.3848, 0.2010, ..., 0.0605, -0.2085, -0.1858]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -6.9849e-10, -4.8894e-09, ..., 0.0000e+00, 0.0000e+00, -1.1642e-09], [ 0.0000e+00, 2.3283e-10, 1.6298e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 2.3283e-09, ..., 0.0000e+00, 0.0000e+00, 6.9849e-10], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, -2.5611e-09, ..., 0.0000e+00, 0.0000e+00, -3.2596e-09], [-4.6566e-10, 2.3283e-10, -2.7940e-09, ..., -2.3283e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 374, bias, value: tensor([-0.0225, -0.0311, -0.0267, -0.0306, -0.0069, 0.0105, 0.0126, -0.0049, -0.0152, -0.0184], device='cuda:0'), grad: tensor([-1.3039e-08, -1.3970e-09, 8.6147e-09, 5.3551e-09, 5.8208e-09, 6.5193e-09, 6.9849e-09, 4.1910e-09, -1.3737e-08, -2.3283e-09], device='cuda:0') 100 0.0001 changing lr epoch 373, time 221.14, cls_loss 0.0009 cls_loss_mapping 0.0014 cls_loss_causal 0.4618 re_mapping 0.0029 re_causal 0.0080 /// teacc 99.13 lr 0.00010000 Epoch 375, weight, value: tensor([[-0.2505, -0.3110, 0.1266, ..., -0.1595, 0.0425, 0.0418], [-0.1497, -0.0797, -0.1151, ..., -0.2394, -0.0727, -0.0262], [ 0.0102, -0.1942, -0.2588, ..., -0.1935, 0.0313, -0.4266], ..., [-0.2307, 0.1902, 0.0340, ..., 0.2502, -0.0731, -0.1695], [-0.2128, -0.2121, 0.2405, ..., -0.1926, -0.1652, 0.1731], [ 0.0156, -0.3866, 0.2016, ..., 0.0596, -0.2085, -0.1864]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, -1.1874e-08, ..., 0.0000e+00, 0.0000e+00, -1.1409e-08], [ 2.3283e-10, 1.3970e-09, 1.8626e-09, ..., 9.3132e-10, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 4.6566e-10, 2.0955e-09, ..., 0.0000e+00, 0.0000e+00, 5.3551e-09], ..., [ 4.4238e-09, -4.6566e-10, 0.0000e+00, ..., -1.3970e-09, 0.0000e+00, 4.6566e-09], [ 4.6566e-10, 0.0000e+00, -3.7253e-09, ..., 0.0000e+00, 0.0000e+00, -1.8626e-08], [ 2.0955e-09, 6.9849e-10, 6.9849e-10, ..., 4.6566e-10, 0.0000e+00, 2.3283e-09]], device='cuda:0') Epoch 375, bias, value: tensor([-0.0212, -0.0305, -0.0271, -0.0303, -0.0067, 0.0101, 0.0121, -0.0048, -0.0154, -0.0187], device='cuda:0'), grad: tensor([-4.4238e-08, 1.9092e-08, 2.7940e-09, 1.8626e-09, 1.8626e-09, 5.8208e-09, 4.1444e-08, 1.3970e-08, -4.1444e-08, 8.6147e-09], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 374---------------------------------------------------- epoch 374, time 221.15, cls_loss 0.0007 cls_loss_mapping 0.0010 cls_loss_causal 0.4212 re_mapping 0.0029 re_causal 0.0083 /// teacc 99.25 lr 0.00010000 Epoch 376, weight, value: tensor([[-0.2510, -0.3111, 0.1266, ..., -0.1597, 0.0423, 0.0417], [-0.1499, -0.0797, -0.1149, ..., -0.2410, -0.0728, -0.0262], [ 0.0100, -0.1944, -0.2592, ..., -0.1936, 0.0313, -0.4273], ..., [-0.2312, 0.1903, 0.0338, ..., 0.2514, -0.0732, -0.1696], [-0.2133, -0.2122, 0.2409, ..., -0.1927, -0.1652, 0.1735], [ 0.0158, -0.3870, 0.2018, ..., 0.0595, -0.2085, -0.1867]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -7.9162e-09, ..., 0.0000e+00, 0.0000e+00, -4.6566e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 6.9849e-10], ..., [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 1.1642e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8324e-07, 0.0000e+00, 2.0955e-09, ..., 1.1642e-09, 0.0000e+00, 5.9372e-08]], device='cuda:0') Epoch 376, bias, value: tensor([-0.0214, -0.0299, -0.0270, -0.0304, -0.0073, 0.0101, 0.0132, -0.0055, -0.0154, -0.0188], device='cuda:0'), grad: tensor([-2.2119e-08, 6.9849e-10, 3.0268e-09, -4.4238e-09, -3.9581e-09, -2.5542e-07, 6.2864e-09, 3.4925e-09, 2.3283e-10, 2.7288e-07], device='cuda:0') 100 0.0001 changing lr epoch 375, time 220.24, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4595 re_mapping 0.0029 re_causal 0.0083 /// teacc 99.20 lr 0.00010000 Epoch 377, weight, value: tensor([[-0.2519, -0.3115, 0.1266, ..., -0.1598, 0.0425, 0.0417], [-0.1501, -0.0807, -0.1152, ..., -0.2436, -0.0728, -0.0263], [ 0.0101, -0.1948, -0.2604, ..., -0.1941, 0.0313, -0.4285], ..., [-0.2313, 0.1911, 0.0339, ..., 0.2534, -0.0733, -0.1697], [-0.2146, -0.2127, 0.2415, ..., -0.1927, -0.1652, 0.1706], [ 0.0156, -0.3873, 0.2022, ..., 0.0559, -0.2086, -0.1873]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], ..., [ 0.0000e+00, 0.0000e+00, 7.7067e-08, ..., 0.0000e+00, 0.0000e+00, 8.8476e-08], [ 2.3283e-10, 0.0000e+00, -7.8231e-08, ..., 0.0000e+00, 0.0000e+00, -8.9640e-08], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10]], device='cuda:0') Epoch 377, bias, value: tensor([-0.0207, -0.0307, -0.0272, -0.0303, -0.0058, 0.0125, 0.0142, -0.0051, -0.0184, -0.0203], device='cuda:0'), grad: tensor([ 2.8405e-08, 3.2596e-09, 3.4925e-09, -2.3283e-09, 7.5903e-08, 6.2864e-09, -1.1385e-07, 1.4040e-07, -1.3388e-07, 9.3132e-10], device='cuda:0') 100 0.0001 changing lr epoch 376, time 220.44, cls_loss 0.0006 cls_loss_mapping 0.0014 cls_loss_causal 0.4587 re_mapping 0.0030 re_causal 0.0088 /// teacc 99.20 lr 0.00010000 Epoch 378, weight, value: tensor([[-0.2523, -0.3120, 0.1268, ..., -0.1599, 0.0429, 0.0420], [-0.1500, -0.0810, -0.1157, ..., -0.2444, -0.0728, -0.0272], [ 0.0099, -0.1950, -0.2621, ..., -0.1941, 0.0313, -0.4304], ..., [-0.2314, 0.1916, 0.0340, ..., 0.2542, -0.0734, -0.1698], [-0.2150, -0.2129, 0.2438, ..., -0.1928, -0.1653, 0.1711], [ 0.0156, -0.3880, 0.2021, ..., 0.0556, -0.2086, -0.1890]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 6.9849e-10, 6.9849e-10, ..., 9.3132e-10, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 2.3283e-10, 2.7940e-09, ..., 2.3283e-10, 0.0000e+00, 5.1223e-09], ..., [ 0.0000e+00, 0.0000e+00, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, -7.6834e-09, ..., 0.0000e+00, 0.0000e+00, -1.3504e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 378, bias, value: tensor([-0.0206, -0.0309, -0.0276, -0.0305, -0.0057, 0.0131, 0.0136, -0.0049, -0.0179, -0.0205], device='cuda:0'), grad: tensor([ 4.6566e-10, 3.2596e-09, 1.3737e-08, -4.8894e-09, -2.7940e-09, 1.9092e-08, 2.3283e-10, 1.3970e-09, -3.1432e-08, 2.3283e-10], device='cuda:0') 100 0.0001 changing lr epoch 377, time 220.11, cls_loss 0.0005 cls_loss_mapping 0.0010 cls_loss_causal 0.4538 re_mapping 0.0030 re_causal 0.0087 /// teacc 99.14 lr 0.00010000 Epoch 379, weight, value: tensor([[-0.2535, -0.3121, 0.1268, ..., -0.1600, 0.0430, 0.0416], [-0.1501, -0.0816, -0.1161, ..., -0.2459, -0.0728, -0.0273], [ 0.0098, -0.1952, -0.2624, ..., -0.1944, 0.0313, -0.4306], ..., [-0.2315, 0.1922, 0.0344, ..., 0.2556, -0.0733, -0.1698], [-0.2151, -0.2132, 0.2443, ..., -0.1928, -0.1653, 0.1713], [ 0.0156, -0.3882, 0.2023, ..., 0.0556, -0.2086, -0.1895]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 2.3283e-09, 4.6566e-09, ..., 6.5193e-09, 0.0000e+00, -2.0955e-09], [ 3.4925e-09, 2.7241e-08, 7.5204e-08, ..., 8.8010e-08, 0.0000e+00, 1.8626e-09], [ 2.0955e-09, 2.3283e-09, 9.0804e-09, ..., 4.6566e-09, 0.0000e+00, 2.3283e-10], ..., [-2.0210e-07, -1.8976e-07, -9.2480e-07, ..., -5.6066e-07, 0.0000e+00, -1.3970e-09], [ 0.0000e+00, 0.0000e+00, -2.5611e-09, ..., 0.0000e+00, 0.0000e+00, -5.1223e-09], [ 1.6298e-07, 1.3364e-07, 7.0129e-07, ..., 3.8790e-07, 0.0000e+00, 2.0955e-09]], device='cuda:0') Epoch 379, bias, value: tensor([-0.0209, -0.0314, -0.0274, -0.0305, -0.0057, 0.0131, 0.0136, -0.0045, -0.0179, -0.0205], device='cuda:0'), grad: tensor([ 1.1176e-08, 2.5635e-07, 3.0268e-08, 1.8859e-08, 4.1910e-09, 3.2550e-07, 1.4668e-08, -2.4457e-06, -1.4203e-08, 1.8040e-06], device='cuda:0') 100 0.0001 changing lr epoch 378, time 220.42, cls_loss 0.0007 cls_loss_mapping 0.0009 cls_loss_causal 0.4214 re_mapping 0.0029 re_causal 0.0081 /// teacc 99.08 lr 0.00010000 Epoch 380, weight, value: tensor([[-0.2552, -0.3124, 0.1269, ..., -0.1601, 0.0432, 0.0411], [-0.1501, -0.0817, -0.1164, ..., -0.2463, -0.0728, -0.0271], [ 0.0104, -0.1956, -0.2636, ..., -0.1950, 0.0313, -0.4317], ..., [-0.2314, 0.1925, 0.0346, ..., 0.2563, -0.0728, -0.1699], [-0.2153, -0.2134, 0.2448, ..., -0.1929, -0.1653, 0.1714], [ 0.0156, -0.3894, 0.2025, ..., 0.0555, -0.2086, -0.1896]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -4.0722e-07, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, -6.3330e-08], [ 0.0000e+00, 2.5611e-09, 0.0000e+00, ..., 2.3283e-10, 2.3283e-10, 2.3283e-10], ..., [ 0.0000e+00, 4.0000e-07, 2.3283e-10, ..., -1.1642e-09, -6.9849e-10, 6.2399e-08], [ 2.3283e-10, 0.0000e+00, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 2.3283e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 380, bias, value: tensor([-0.0211, -0.0313, -0.0274, -0.0308, -0.0057, 0.0132, 0.0135, -0.0044, -0.0177, -0.0206], device='cuda:0'), grad: tensor([ 4.4238e-09, -2.8554e-06, 1.8859e-08, 3.5157e-08, 6.7521e-09, -4.0513e-08, 1.5367e-08, 2.8107e-06, 1.6298e-09, 3.7253e-09], device='cuda:0') 100 0.0001 changing lr epoch 379, time 220.83, cls_loss 0.0008 cls_loss_mapping 0.0013 cls_loss_causal 0.4788 re_mapping 0.0029 re_causal 0.0085 /// teacc 99.12 lr 0.00010000 Epoch 381, weight, value: tensor([[-0.2553, -0.3129, 0.1270, ..., -0.1603, 0.0442, 0.0414], [-0.1516, -0.0820, -0.1193, ..., -0.2468, -0.0727, -0.0273], [ 0.0101, -0.1968, -0.2665, ..., -0.1962, 0.0312, -0.4341], ..., [-0.2315, 0.1929, 0.0320, ..., 0.2539, -0.0728, -0.1700], [-0.2158, -0.2139, 0.2458, ..., -0.1926, -0.1653, 0.1716], [ 0.0161, -0.3900, 0.2073, ..., 0.0574, -0.2087, -0.1905]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 4.6566e-10, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -4.1910e-09, -1.8626e-09, ..., -4.1910e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 381, bias, value: tensor([-0.0210, -0.0334, -0.0287, -0.0302, -0.0058, 0.0134, 0.0132, -0.0065, -0.0175, -0.0175], device='cuda:0'), grad: tensor([ 1.8626e-09, 3.7253e-09, -1.8626e-09, 2.3283e-09, 1.3970e-09, 2.7940e-09, -2.7940e-09, -6.9849e-09, 0.0000e+00, 3.2596e-09], device='cuda:0') 100 0.0001 changing lr epoch 380, time 221.17, cls_loss 0.0007 cls_loss_mapping 0.0010 cls_loss_causal 0.4408 re_mapping 0.0029 re_causal 0.0084 /// teacc 99.12 lr 0.00010000 Epoch 382, weight, value: tensor([[-0.2553, -0.3131, 0.1273, ..., -0.1605, 0.0461, 0.0419], [-0.1520, -0.0820, -0.1194, ..., -0.2472, -0.0727, -0.0284], [ 0.0101, -0.1972, -0.2678, ..., -0.1968, 0.0312, -0.4351], ..., [-0.2316, 0.1930, 0.0321, ..., 0.2543, -0.0730, -0.1701], [-0.2161, -0.2137, 0.2471, ..., -0.1928, -0.1654, 0.1718], [ 0.0160, -0.3907, 0.2074, ..., 0.0573, -0.2088, -0.1910]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 7.1712e-08, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 7.1712e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 382, bias, value: tensor([-0.0209, -0.0336, -0.0289, -0.0303, -0.0058, 0.0135, 0.0133, -0.0065, -0.0173, -0.0176], device='cuda:0'), grad: tensor([ 4.0047e-08, 7.4506e-09, 5.1223e-09, -7.6834e-08, -1.3597e-06, 8.9407e-08, 5.1688e-08, 1.0245e-08, 6.1793e-07, 6.1700e-07], device='cuda:0') 100 0.0001 changing lr epoch 381, time 220.93, cls_loss 0.0009 cls_loss_mapping 0.0017 cls_loss_causal 0.4756 re_mapping 0.0029 re_causal 0.0081 /// teacc 99.11 lr 0.00010000 Epoch 383, weight, value: tensor([[-0.2554, -0.3136, 0.1284, ..., -0.1607, 0.0468, 0.0473], [-0.1521, -0.0824, -0.1195, ..., -0.2486, -0.0727, -0.0283], [ 0.0100, -0.1976, -0.2698, ..., -0.1973, 0.0311, -0.4362], ..., [-0.2320, 0.1936, 0.0323, ..., 0.2551, -0.0730, -0.1704], [-0.2163, -0.2143, 0.2473, ..., -0.1934, -0.1654, 0.1720], [ 0.0161, -0.3930, 0.2073, ..., 0.0572, -0.2089, -0.1915]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.3283e-09, ..., 9.3132e-10, 0.0000e+00, 9.3132e-10], [-7.9162e-09, 1.3970e-09, 4.6566e-10, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -3.6322e-08, 9.3132e-10, ..., -4.1910e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 6.9849e-09, ..., 1.8626e-09, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, -1.3504e-08, ..., -4.1910e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 383, bias, value: tensor([-0.0163, -0.0336, -0.0295, -0.0302, -0.0059, 0.0132, 0.0121, -0.0064, -0.0172, -0.0177], device='cuda:0'), grad: tensor([ 2.4214e-08, -3.4925e-08, 7.4506e-09, 8.3819e-09, 1.0291e-07, 1.6764e-08, -4.4703e-08, -1.0571e-07, 3.6322e-08, -1.4435e-08], device='cuda:0') 100 0.0001 changing lr epoch 382, time 220.95, cls_loss 0.0006 cls_loss_mapping 0.0008 cls_loss_causal 0.4514 re_mapping 0.0029 re_causal 0.0082 /// teacc 99.16 lr 0.00010000 Epoch 384, weight, value: tensor([[-0.2576, -0.3140, 0.1285, ..., -0.1609, 0.0468, 0.0472], [-0.1522, -0.0827, -0.1195, ..., -0.2492, -0.0727, -0.0283], [ 0.0081, -0.1992, -0.2704, ..., -0.1982, 0.0311, -0.4379], ..., [-0.2321, 0.1939, 0.0323, ..., 0.2554, -0.0730, -0.1704], [-0.2166, -0.2146, 0.2474, ..., -0.1936, -0.1654, 0.1720], [ 0.0160, -0.3931, 0.2074, ..., 0.0573, -0.2089, -0.1917]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, -1.3970e-09, ..., -1.3970e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 384, bias, value: tensor([-0.0165, -0.0336, -0.0300, -0.0312, -0.0060, 0.0140, 0.0121, -0.0065, -0.0172, -0.0177], device='cuda:0'), grad: tensor([ 6.0536e-09, 9.3132e-10, -2.1420e-08, 4.1910e-09, 1.0245e-08, 2.3283e-09, -6.0536e-09, 6.5193e-09, 4.6566e-09, -4.6566e-10], device='cuda:0') 100 0.0001 changing lr epoch 383, time 220.64, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.4698 re_mapping 0.0031 re_causal 0.0083 /// teacc 99.06 lr 0.00010000 Epoch 385, weight, value: tensor([[-0.2576, -0.3142, 0.1286, ..., -0.1611, 0.0468, 0.0472], [-0.1524, -0.0830, -0.1196, ..., -0.2507, -0.0727, -0.0281], [ 0.0082, -0.2025, -0.2730, ..., -0.2019, 0.0310, -0.4381], ..., [-0.2323, 0.1958, 0.0327, ..., 0.2578, -0.0730, -0.1705], [-0.2170, -0.2149, 0.2475, ..., -0.1940, -0.1655, 0.1720], [ 0.0159, -0.3937, 0.2075, ..., 0.0573, -0.2089, -0.1921]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 0.0000e+00, 4.2841e-08, ..., 2.2817e-08, 0.0000e+00, 4.6566e-10], [ 4.6566e-10, 4.6566e-10, 1.3039e-08, ..., 6.0536e-09, 0.0000e+00, 2.3283e-09], [ 0.0000e+00, 0.0000e+00, 3.5530e-07, ..., 2.7940e-09, 0.0000e+00, 3.0641e-07], ..., [ 4.6566e-10, 0.0000e+00, 1.0338e-07, ..., 9.3132e-09, 0.0000e+00, 7.6368e-08], [ 1.3970e-09, 0.0000e+00, -4.0140e-07, ..., 4.0513e-08, 0.0000e+00, -3.9442e-07], [-7.9162e-09, 0.0000e+00, -2.9150e-07, ..., -5.3085e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 385, bias, value: tensor([-0.0165, -0.0336, -0.0326, -0.0314, -0.0061, 0.0139, 0.0123, -0.0047, -0.0173, -0.0178], device='cuda:0'), grad: tensor([ 1.7462e-07, 4.8429e-08, 1.0114e-06, 4.4843e-07, -5.9418e-07, 9.6858e-08, 3.2596e-08, 3.1944e-07, -9.8720e-07, -5.3877e-07], device='cuda:0') 100 0.0001 changing lr epoch 384, time 220.59, cls_loss 0.0007 cls_loss_mapping 0.0010 cls_loss_causal 0.4586 re_mapping 0.0028 re_causal 0.0080 /// teacc 99.12 lr 0.00010000 Epoch 386, weight, value: tensor([[-0.2577, -0.3146, 0.1286, ..., -0.1612, 0.0467, 0.0473], [-0.1524, -0.0849, -0.1202, ..., -0.2527, -0.0727, -0.0281], [ 0.0082, -0.2028, -0.2738, ..., -0.2022, 0.0309, -0.4389], ..., [-0.2324, 0.1976, 0.0334, ..., 0.2585, -0.0729, -0.1706], [-0.2173, -0.2153, 0.2481, ..., -0.1940, -0.1655, 0.1721], [ 0.0159, -0.3939, 0.2076, ..., 0.0573, -0.2089, -0.1923]], device='cuda:0'), grad: tensor([[0.0000e+00, 3.2596e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 8.3819e-09, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [0.0000e+00, 2.6543e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [0.0000e+00, 2.8871e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 1.1642e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 386, bias, value: tensor([-0.0165, -0.0347, -0.0328, -0.0314, -0.0060, 0.0139, 0.0123, -0.0033, -0.0173, -0.0178], device='cuda:0'), grad: tensor([ 1.1642e-08, 3.1199e-08, 9.3132e-08, -5.2527e-07, 4.1910e-09, 2.5379e-07, -1.8626e-09, 9.9652e-08, 3.9116e-08, 4.1910e-09], device='cuda:0') 100 0.0001 changing lr epoch 385, time 220.74, cls_loss 0.0007 cls_loss_mapping 0.0012 cls_loss_causal 0.4737 re_mapping 0.0028 re_causal 0.0081 /// teacc 99.07 lr 0.00010000 Epoch 387, weight, value: tensor([[-0.2579, -0.3151, 0.1286, ..., -0.1615, 0.0468, 0.0471], [-0.1526, -0.0859, -0.1207, ..., -0.2540, -0.0727, -0.0283], [ 0.0094, -0.2034, -0.2741, ..., -0.2023, 0.0309, -0.4392], ..., [-0.2326, 0.1986, 0.0339, ..., 0.2591, -0.0728, -0.1707], [-0.2175, -0.2159, 0.2491, ..., -0.1942, -0.1655, 0.1723], [ 0.0157, -0.3944, 0.2076, ..., 0.0573, -0.2089, -0.1933]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.0536e-08, 2.4680e-08, ..., 5.4948e-08, 0.0000e+00, 0.0000e+00], [ 2.9802e-08, 1.6764e-08, 6.0536e-09, ..., 1.4435e-08, 0.0000e+00, 3.0268e-08], [ 0.0000e+00, 5.0757e-08, 2.0489e-08, ..., 4.5635e-08, 0.0000e+00, 0.0000e+00], ..., [ 4.6566e-10, -1.8580e-07, -6.8452e-08, ..., -1.5553e-07, 0.0000e+00, 4.6566e-10], [ 7.4506e-09, 0.0000e+00, -4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 6.0536e-09], [ 9.3132e-10, 2.0489e-08, 8.3819e-09, ..., 1.8161e-08, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 387, bias, value: tensor([-0.0172, -0.0354, -0.0329, -0.0311, -0.0061, 0.0138, 0.0126, -0.0025, -0.0172, -0.0179], device='cuda:0'), grad: tensor([ 2.3143e-07, 1.3877e-07, 1.9465e-07, 7.1712e-08, 5.1223e-09, -2.6962e-07, 1.9046e-07, -6.6357e-07, 1.5832e-08, 8.1491e-08], device='cuda:0') 100 0.0001 changing lr epoch 386, time 221.28, cls_loss 0.0006 cls_loss_mapping 0.0010 cls_loss_causal 0.4724 re_mapping 0.0026 re_causal 0.0080 /// teacc 99.18 lr 0.00010000 Epoch 388, weight, value: tensor([[-0.2580, -0.3160, 0.1286, ..., -0.1617, 0.0468, 0.0471], [-0.1521, -0.0862, -0.1207, ..., -0.2548, -0.0727, -0.0282], [ 0.0097, -0.2041, -0.2744, ..., -0.2020, 0.0309, -0.4396], ..., [-0.2326, 0.1988, 0.0339, ..., 0.2593, -0.0728, -0.1708], [-0.2180, -0.2163, 0.2496, ..., -0.1942, -0.1655, 0.1725], [ 0.0157, -0.3947, 0.2077, ..., 0.0574, -0.2089, -0.1935]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], [ 4.6566e-10, 2.5565e-07, 2.3283e-09, ..., 1.7975e-07, 0.0000e+00, 2.7940e-09], [ 0.0000e+00, 4.1910e-09, 0.0000e+00, ..., 3.2596e-09, 0.0000e+00, 4.6566e-10], ..., [ 0.0000e+00, -2.7567e-07, -4.6566e-10, ..., -1.9372e-07, 0.0000e+00, 4.6566e-10], [ 4.6566e-10, 0.0000e+00, -2.3283e-08, ..., 0.0000e+00, 0.0000e+00, -3.2131e-08], [ 4.6566e-10, 0.0000e+00, 4.6566e-10, ..., 1.3970e-09, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 388, bias, value: tensor([-0.0173, -0.0351, -0.0331, -0.0305, -0.0063, 0.0137, 0.0125, -0.0026, -0.0171, -0.0179], device='cuda:0'), grad: tensor([ 7.4506e-09, 6.1747e-07, 1.2107e-08, 7.9628e-08, -4.1910e-09, 1.5367e-08, 6.1933e-08, -6.4820e-07, -1.4948e-07, 1.2107e-08], device='cuda:0') 100 0.0001 changing lr epoch 387, time 220.49, cls_loss 0.0008 cls_loss_mapping 0.0017 cls_loss_causal 0.4601 re_mapping 0.0028 re_causal 0.0081 /// teacc 99.18 lr 0.00010000 Epoch 389, weight, value: tensor([[-0.2585, -0.3183, 0.1283, ..., -0.1622, 0.0468, 0.0469], [-0.1522, -0.0866, -0.1211, ..., -0.2559, -0.0727, -0.0288], [ 0.0095, -0.2042, -0.2718, ..., -0.2020, 0.0309, -0.4377], ..., [-0.2328, 0.1992, 0.0341, ..., 0.2598, -0.0729, -0.1709], [-0.2191, -0.2186, 0.2501, ..., -0.1943, -0.1655, 0.1726], [ 0.0158, -0.3953, 0.2078, ..., 0.0574, -0.2089, -0.1938]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.2107e-08, 0.0000e+00, ..., 7.4506e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.3062e-08, 0.0000e+00, ..., 2.0955e-08, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -4.4703e-08, 4.6566e-10, ..., -2.7940e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, -1.8626e-09, ..., -1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 389, bias, value: tensor([-0.0179, -0.0361, -0.0313, -0.0274, -0.0062, 0.0109, 0.0123, -0.0024, -0.0171, -0.0180], device='cuda:0'), grad: tensor([ 9.3132e-10, 3.2131e-08, 8.5682e-08, 5.5879e-09, 6.9849e-09, 6.0536e-09, -6.9849e-09, -1.1269e-07, 9.3132e-10, -6.5193e-09], device='cuda:0') 100 0.0001 changing lr epoch 388, time 221.15, cls_loss 0.0006 cls_loss_mapping 0.0014 cls_loss_causal 0.4542 re_mapping 0.0030 re_causal 0.0085 /// teacc 99.17 lr 0.00010000 Epoch 390, weight, value: tensor([[-0.2588, -0.3186, 0.1283, ..., -0.1623, 0.0468, 0.0469], [-0.1523, -0.0866, -0.1211, ..., -0.2562, -0.0727, -0.0287], [ 0.0094, -0.2044, -0.2727, ..., -0.2021, 0.0309, -0.4392], ..., [-0.2331, 0.1988, 0.0340, ..., 0.2599, -0.0729, -0.1711], [-0.2196, -0.2190, 0.2513, ..., -0.1944, -0.1655, 0.1728], [ 0.0138, -0.3956, 0.2079, ..., 0.0558, -0.2089, -0.1961]], device='cuda:0'), grad: tensor([[9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], ..., [0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [1.7695e-08, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.2107e-08], [5.1223e-09, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.2596e-09]], device='cuda:0') Epoch 390, bias, value: tensor([-0.0180, -0.0358, -0.0316, -0.0275, -0.0051, 0.0109, 0.0123, -0.0028, -0.0170, -0.0191], device='cuda:0'), grad: tensor([ 2.7940e-09, 5.5879e-09, 5.1223e-09, 1.6019e-06, 4.6566e-10, -1.6885e-06, 4.1910e-09, 4.1910e-09, 5.1688e-08, 1.2107e-08], device='cuda:0') 100 0.0001 changing lr epoch 389, time 220.79, cls_loss 0.0006 cls_loss_mapping 0.0019 cls_loss_causal 0.4765 re_mapping 0.0029 re_causal 0.0085 /// teacc 99.15 lr 0.00010000 Epoch 391, weight, value: tensor([[-0.2589, -0.3190, 0.1283, ..., -0.1625, 0.0468, 0.0469], [-0.1523, -0.0864, -0.1211, ..., -0.2566, -0.0728, -0.0286], [ 0.0094, -0.2044, -0.2728, ..., -0.2020, 0.0310, -0.4393], ..., [-0.2332, 0.1988, 0.0341, ..., 0.2604, -0.0730, -0.1712], [-0.2202, -0.2188, 0.2516, ..., -0.1946, -0.1655, 0.1728], [ 0.0138, -0.3969, 0.2079, ..., 0.0557, -0.2090, -0.1963]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 1.3970e-09, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -2.7940e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 4.6566e-10, 1.3970e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 1.6764e-08, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.0955e-08], [ 5.5879e-09, 4.6566e-10, -5.1223e-09, ..., -2.3283e-09, 0.0000e+00, 3.7253e-09]], device='cuda:0') Epoch 391, bias, value: tensor([-0.0181, -0.0355, -0.0315, -0.0275, -0.0051, 0.0109, 0.0122, -0.0030, -0.0169, -0.0191], device='cuda:0'), grad: tensor([ 3.7253e-09, 8.8476e-09, -2.5146e-08, 7.4506e-09, 4.1910e-09, -6.6217e-07, 5.8720e-07, 1.1176e-08, 8.3819e-08, -6.9849e-09], device='cuda:0') 100 0.0001 changing lr epoch 390, time 220.81, cls_loss 0.0006 cls_loss_mapping 0.0011 cls_loss_causal 0.4527 re_mapping 0.0029 re_causal 0.0080 /// teacc 99.17 lr 0.00010000 Epoch 392, weight, value: tensor([[-0.2590, -0.3190, 0.1285, ..., -0.1625, 0.0468, 0.0469], [-0.1535, -0.0865, -0.1211, ..., -0.2567, -0.0728, -0.0289], [ 0.0093, -0.2045, -0.2729, ..., -0.2020, 0.0310, -0.4395], ..., [-0.2334, 0.1988, 0.0341, ..., 0.2604, -0.0730, -0.1714], [-0.2212, -0.2189, 0.2521, ..., -0.1947, -0.1655, 0.1730], [ 0.0137, -0.3972, 0.2080, ..., 0.0559, -0.2090, -0.1965]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -5.7742e-08, ..., 0.0000e+00, 0.0000e+00, -3.2596e-09], [ 0.0000e+00, 4.6566e-10, 3.2596e-09, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.3993e-08, ..., -1.8626e-09, 0.0000e+00, 2.3283e-09]], device='cuda:0') Epoch 392, bias, value: tensor([-0.0181, -0.0356, -0.0312, -0.0275, -0.0053, 0.0110, 0.0121, -0.0032, -0.0168, -0.0191], device='cuda:0'), grad: tensor([-1.7323e-07, -6.4261e-08, 7.4506e-09, 3.4459e-08, -3.7253e-09, 1.8626e-09, 1.2107e-08, 8.0094e-08, 4.1910e-09, 1.1269e-07], device='cuda:0') 100 0.0001 changing lr epoch 391, time 220.55, cls_loss 0.0007 cls_loss_mapping 0.0008 cls_loss_causal 0.4605 re_mapping 0.0028 re_causal 0.0081 /// teacc 99.13 lr 0.00010000 Epoch 393, weight, value: tensor([[-0.2591, -0.3192, 0.1287, ..., -0.1626, 0.0468, 0.0469], [-0.1536, -0.0858, -0.1212, ..., -0.2574, -0.0728, -0.0287], [ 0.0094, -0.2051, -0.2738, ..., -0.2027, 0.0310, -0.4400], ..., [-0.2335, 0.1982, 0.0342, ..., 0.2610, -0.0730, -0.1716], [-0.2214, -0.2189, 0.2529, ..., -0.1948, -0.1655, 0.1731], [ 0.0137, -0.3976, 0.2080, ..., 0.0559, -0.2090, -0.1967]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 3.2596e-09, ..., 2.7940e-09, 0.0000e+00, 4.6566e-10], [ 1.3970e-09, 0.0000e+00, 1.3970e-09, ..., 5.1223e-09, 0.0000e+00, -9.3132e-09], [-4.1910e-09, 0.0000e+00, -2.2352e-08, ..., -2.0489e-08, 0.0000e+00, -3.7253e-09]], device='cuda:0') Epoch 393, bias, value: tensor([-0.0182, -0.0345, -0.0314, -0.0275, -0.0053, 0.0110, 0.0121, -0.0044, -0.0167, -0.0191], device='cuda:0'), grad: tensor([ 1.8626e-09, 4.6566e-09, -6.5193e-09, 1.0710e-08, 3.7253e-08, 2.2352e-08, 2.7940e-09, 1.2107e-08, -4.6566e-10, -8.2422e-08], device='cuda:0') 100 0.0001 changing lr epoch 392, time 220.99, cls_loss 0.0006 cls_loss_mapping 0.0013 cls_loss_causal 0.4416 re_mapping 0.0029 re_causal 0.0082 /// teacc 99.16 lr 0.00010000 Epoch 394, weight, value: tensor([[-0.2591, -0.3194, 0.1287, ..., -0.1627, 0.0468, 0.0468], [-0.1547, -0.0859, -0.1213, ..., -0.2581, -0.0728, -0.0291], [ 0.0094, -0.2055, -0.2743, ..., -0.2029, 0.0310, -0.4414], ..., [-0.2335, 0.1984, 0.0344, ..., 0.2615, -0.0730, -0.1717], [-0.2217, -0.2193, 0.2538, ..., -0.1948, -0.1655, 0.1734], [ 0.0137, -0.3978, 0.2082, ..., 0.0559, -0.2090, -0.1969]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 0.0000e+00, 8.3819e-09, ..., 3.2596e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-4.1444e-08, 0.0000e+00, -2.8685e-07, ..., -1.1083e-07, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 394, bias, value: tensor([-0.0183, -0.0347, -0.0316, -0.0275, -0.0053, 0.0110, 0.0122, -0.0043, -0.0165, -0.0191], device='cuda:0'), grad: tensor([ 1.9558e-08, 3.2596e-09, -6.9849e-09, 3.7253e-09, 6.0350e-07, 3.4459e-08, 1.3970e-08, 4.1910e-09, 0.0000e+00, -6.7428e-07], device='cuda:0') 100 0.0001 changing lr epoch 393, time 220.70, cls_loss 0.0006 cls_loss_mapping 0.0009 cls_loss_causal 0.4572 re_mapping 0.0028 re_causal 0.0084 /// teacc 99.14 lr 0.00010000 Epoch 395, weight, value: tensor([[-0.2593, -0.3193, 0.1287, ..., -0.1628, 0.0468, 0.0468], [-0.1547, -0.0860, -0.1213, ..., -0.2591, -0.0728, -0.0289], [ 0.0093, -0.2061, -0.2746, ..., -0.2033, 0.0310, -0.4426], ..., [-0.2336, 0.1988, 0.0344, ..., 0.2622, -0.0730, -0.1719], [-0.2221, -0.2196, 0.2538, ..., -0.1948, -0.1655, 0.1736], [ 0.0141, -0.3988, 0.2085, ..., 0.0560, -0.2090, -0.1969]], device='cuda:0'), grad: tensor([[ 3.2596e-09, 0.0000e+00, -7.9162e-09, ..., 0.0000e+00, 0.0000e+00, -9.3132e-10], [ 9.3132e-10, 4.6566e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], ..., [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09], [ 4.6566e-10, 0.0000e+00, -1.2107e-08, ..., 0.0000e+00, 0.0000e+00, -1.1642e-08], [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 395, bias, value: tensor([-0.0184, -0.0347, -0.0319, -0.0275, -0.0055, 0.0109, 0.0122, -0.0042, -0.0163, -0.0189], device='cuda:0'), grad: tensor([-9.3132e-09, 1.1642e-08, 3.7253e-09, 3.2596e-09, 2.7008e-08, 4.2841e-08, -4.3772e-08, 9.3132e-09, -3.3993e-08, 4.6566e-09], device='cuda:0') 100 0.0001 changing lr epoch 394, time 220.47, cls_loss 0.0006 cls_loss_mapping 0.0010 cls_loss_causal 0.4652 re_mapping 0.0028 re_causal 0.0083 /// teacc 99.01 lr 0.00010000 Epoch 396, weight, value: tensor([[-0.2595, -0.3195, 0.1285, ..., -0.1629, 0.0468, 0.0467], [-0.1548, -0.0860, -0.1213, ..., -0.2594, -0.0728, -0.0289], [ 0.0092, -0.2063, -0.2749, ..., -0.2034, 0.0309, -0.4431], ..., [-0.2337, 0.1988, 0.0344, ..., 0.2625, -0.0729, -0.1719], [-0.2228, -0.2202, 0.2539, ..., -0.1951, -0.1658, 0.1737], [ 0.0139, -0.3997, 0.2089, ..., 0.0561, -0.2090, -0.1976]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, -4.6566e-10], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 396, bias, value: tensor([-0.0188, -0.0346, -0.0320, -0.0275, -0.0057, 0.0111, 0.0116, -0.0043, -0.0163, -0.0188], device='cuda:0'), grad: tensor([ 9.3132e-10, -2.1979e-07, 5.9605e-08, -6.5193e-09, 0.0000e+00, 6.9849e-09, 4.6566e-10, 1.6298e-07, 0.0000e+00, 4.6566e-10], device='cuda:0') 100 0.0001 changing lr epoch 395, time 220.72, cls_loss 0.0007 cls_loss_mapping 0.0014 cls_loss_causal 0.4588 re_mapping 0.0028 re_causal 0.0080 /// teacc 99.14 lr 0.00010000 Epoch 397, weight, value: tensor([[-0.2596, -0.3209, 0.1285, ..., -0.1632, 0.0469, 0.0467], [-0.1550, -0.0865, -0.1214, ..., -0.2602, -0.0728, -0.0287], [ 0.0091, -0.2064, -0.2752, ..., -0.2035, 0.0310, -0.4436], ..., [-0.2340, 0.1995, 0.0343, ..., 0.2628, -0.0729, -0.1723], [-0.2259, -0.2205, 0.2545, ..., -0.1952, -0.1658, 0.1736], [ 0.0138, -0.4008, 0.2093, ..., 0.0561, -0.2090, -0.1984]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, -1.8626e-09, 2.5611e-08, ..., 0.0000e+00, 0.0000e+00, -1.7695e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], ..., [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [-2.7940e-09, 0.0000e+00, -3.8650e-08, ..., 0.0000e+00, 0.0000e+00, -1.3970e-09]], device='cuda:0') Epoch 397, bias, value: tensor([-0.0189, -0.0348, -0.0319, -0.0275, -0.0057, 0.0111, 0.0116, -0.0042, -0.0164, -0.0188], device='cuda:0'), grad: tensor([ 5.1223e-09, -2.5239e-07, 2.6543e-08, 1.0710e-08, 4.3306e-08, 6.0536e-09, 1.8673e-07, 6.8918e-08, 2.0023e-08, -1.1316e-07], device='cuda:0') 100 0.0001 changing lr epoch 396, time 220.72, cls_loss 0.0006 cls_loss_mapping 0.0014 cls_loss_causal 0.4805 re_mapping 0.0029 re_causal 0.0084 /// teacc 99.09 lr 0.00010000 Epoch 398, weight, value: tensor([[-0.2596, -0.3212, 0.1268, ..., -0.1653, 0.0469, 0.0468], [-0.1551, -0.0862, -0.1213, ..., -0.2606, -0.0728, -0.0261], [ 0.0084, -0.2065, -0.2757, ..., -0.2036, 0.0310, -0.4442], ..., [-0.2340, 0.1994, 0.0343, ..., 0.2632, -0.0729, -0.1750], [-0.2262, -0.2206, 0.2546, ..., -0.1954, -0.1658, 0.1737], [ 0.0146, -0.4014, 0.2101, ..., 0.0555, -0.2090, -0.1988]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.3970e-08], [ 0.0000e+00, 1.3970e-09, 9.3132e-10, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -9.7789e-09, 0.0000e+00, ..., -2.3283e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 1.2107e-08, 1.8626e-09, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-9.3132e-09, 0.0000e+00, -2.0955e-08, ..., -2.0955e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 398, bias, value: tensor([-0.0194, -0.0336, -0.0319, -0.0275, -0.0043, 0.0110, 0.0117, -0.0053, -0.0164, -0.0196], device='cuda:0'), grad: tensor([ 1.3877e-07, 2.3283e-08, -6.4727e-08, 5.8673e-08, 6.5658e-08, 7.8743e-07, -1.2629e-06, 8.7544e-08, 2.3097e-07, -6.7521e-08], device='cuda:0') 100 0.0001 changing lr epoch 397, time 220.75, cls_loss 0.0005 cls_loss_mapping 0.0012 cls_loss_causal 0.4498 re_mapping 0.0029 re_causal 0.0083 /// teacc 99.10 lr 0.00010000 Epoch 399, weight, value: tensor([[-0.2598, -0.3214, 0.1269, ..., -0.1655, 0.0468, 0.0469], [-0.1551, -0.0857, -0.1213, ..., -0.2606, -0.0728, -0.0248], [ 0.0080, -0.2065, -0.2758, ..., -0.2036, 0.0308, -0.4455], ..., [-0.2342, 0.1991, 0.0342, ..., 0.2634, -0.0729, -0.1763], [-0.2266, -0.2209, 0.2553, ..., -0.1955, -0.1663, 0.1738], [ 0.0161, -0.4032, 0.2108, ..., 0.0572, -0.2090, -0.1996]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 1.3970e-09, -1.7229e-08, ..., 9.3132e-10, 0.0000e+00, -2.3283e-08], [ 4.6566e-10, 9.3132e-10, 4.6566e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 5.5879e-09, 4.6566e-10, ..., 5.1223e-09, 0.0000e+00, 9.3132e-10], ..., [ 4.6566e-10, -9.3132e-09, -1.3970e-09, ..., -8.3819e-09, 0.0000e+00, -9.3132e-10], [ 3.2596e-09, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 1.8626e-09], [ 1.3970e-09, 4.6566e-10, 1.8626e-09, ..., 4.6566e-10, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 399, bias, value: tensor([-0.0194, -0.0329, -0.0320, -0.0275, -0.0056, 0.0111, 0.0117, -0.0060, -0.0164, -0.0186], device='cuda:0'), grad: tensor([-6.5658e-08, 3.7253e-09, 1.5367e-08, 9.1270e-08, 2.7940e-09, -1.0198e-07, 5.5414e-08, -1.9092e-08, 5.5879e-09, 8.8476e-09], device='cuda:0') 100 0.0001 changing lr epoch 398, time 220.77, cls_loss 0.0005 cls_loss_mapping 0.0009 cls_loss_causal 0.4224 re_mapping 0.0027 re_causal 0.0077 /// teacc 99.18 lr 0.00010000 Epoch 400, weight, value: tensor([[-0.2599, -0.3215, 0.1272, ..., -0.1655, 0.0468, 0.0471], [-0.1551, -0.0858, -0.1213, ..., -0.2610, -0.0728, -0.0249], [ 0.0082, -0.2065, -0.2759, ..., -0.2036, 0.0308, -0.4456], ..., [-0.2346, 0.1994, 0.0343, ..., 0.2640, -0.0729, -0.1763], [-0.2269, -0.2204, 0.2557, ..., -0.1955, -0.1663, 0.1738], [ 0.0161, -0.4048, 0.2108, ..., 0.0570, -0.2090, -0.1998]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [-4.6566e-10, 0.0000e+00, -3.2596e-09, ..., 0.0000e+00, 0.0000e+00, -7.4506e-09], [ 4.6566e-10, 0.0000e+00, 2.7940e-09, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 400, bias, value: tensor([-0.0192, -0.0328, -0.0320, -0.0275, -0.0055, 0.0111, 0.0115, -0.0059, -0.0164, -0.0188], device='cuda:0'), grad: tensor([ 1.8626e-09, 6.0070e-08, -1.5832e-08, 3.7253e-09, 1.3113e-06, 6.9849e-09, -1.3448e-06, 2.7940e-09, -2.0955e-08, 9.7789e-09], device='cuda:0') 100 0.0001 changing lr epoch 399, time 220.62, cls_loss 0.0006 cls_loss_mapping 0.0009 cls_loss_causal 0.4706 re_mapping 0.0028 re_causal 0.0082 /// teacc 99.14 lr 0.00001000 Epoch 401, weight, value: tensor([[-0.2608, -0.3219, 0.1270, ..., -0.1658, 0.0469, 0.0470], [-0.1562, -0.0858, -0.1216, ..., -0.2614, -0.0728, -0.0249], [ 0.0081, -0.2066, -0.2763, ..., -0.2037, 0.0308, -0.4459], ..., [-0.2349, 0.1994, 0.0342, ..., 0.2641, -0.0729, -0.1763], [-0.2272, -0.2207, 0.2557, ..., -0.1958, -0.1663, 0.1739], [ 0.0168, -0.4057, 0.2114, ..., 0.0571, -0.2090, -0.1999]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 1.6298e-09, 2.5611e-09, ..., 1.6298e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.1910e-09, 2.5611e-09, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 4.6566e-10, ..., 1.1642e-09, 0.0000e+00, 0.0000e+00], ..., [ 2.3283e-10, -1.2573e-08, -8.3819e-09, ..., -1.0477e-08, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 4.6566e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 2.3283e-10, 6.7521e-09, 3.0268e-09, ..., 3.9581e-09, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 401, bias, value: tensor([-0.0194, -0.0332, -0.0319, -0.0275, -0.0057, 0.0111, 0.0116, -0.0061, -0.0164, -0.0184], device='cuda:0'), grad: tensor([ 7.4506e-09, 1.0012e-08, 3.2596e-09, 1.6298e-09, 2.5611e-09, -6.5193e-09, 2.5611e-09, -3.0734e-08, 2.3283e-09, 1.3737e-08], device='cuda:0') 100 1e-05 changing lr epoch 400, time 220.37, cls_loss 0.0005 cls_loss_mapping 0.0008 cls_loss_causal 0.4582 re_mapping 0.0028 re_causal 0.0084 /// teacc 99.19 lr 0.00001000 Epoch 402, weight, value: tensor([[-0.2608, -0.3219, 0.1270, ..., -0.1658, 0.0469, 0.0470], [-0.1562, -0.0857, -0.1215, ..., -0.2614, -0.0728, -0.0249], [ 0.0081, -0.2066, -0.2763, ..., -0.2037, 0.0308, -0.4459], ..., [-0.2349, 0.1993, 0.0342, ..., 0.2641, -0.0729, -0.1764], [-0.2273, -0.2207, 0.2557, ..., -0.1958, -0.1663, 0.1739], [ 0.0167, -0.4057, 0.2114, ..., 0.0571, -0.2090, -0.1999]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, -1.2340e-08, ..., 0.0000e+00, -4.6566e-10, -6.7521e-09], [ 0.0000e+00, 1.9791e-08, 9.3132e-09, ..., 1.3970e-08, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -2.1420e-08, -1.0012e-08, ..., -1.5600e-08, 0.0000e+00, 0.0000e+00], [ 5.5879e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09], [ 1.8161e-08, 2.3283e-10, 1.8626e-09, ..., 9.3132e-10, 0.0000e+00, 9.5461e-09]], device='cuda:0') Epoch 402, bias, value: tensor([-0.0194, -0.0331, -0.0319, -0.0275, -0.0056, 0.0111, 0.0116, -0.0061, -0.0164, -0.0184], device='cuda:0'), grad: tensor([-3.7020e-08, 4.5635e-08, 2.3283e-10, 3.4925e-09, -1.3970e-09, -3.7951e-08, 4.0745e-08, -4.7497e-08, 9.3132e-09, 3.3062e-08], device='cuda:0') 100 1e-05 changing lr epoch 401, time 220.67, cls_loss 0.0005 cls_loss_mapping 0.0005 cls_loss_causal 0.4141 re_mapping 0.0027 re_causal 0.0079 /// teacc 99.21 lr 0.00001000 Epoch 403, weight, value: tensor([[-0.2608, -0.3219, 0.1270, ..., -0.1658, 0.0469, 0.0470], [-0.1563, -0.0857, -0.1215, ..., -0.2615, -0.0728, -0.0249], [ 0.0081, -0.2066, -0.2763, ..., -0.2037, 0.0308, -0.4459], ..., [-0.2350, 0.1994, 0.0342, ..., 0.2642, -0.0729, -0.1764], [-0.2273, -0.2207, 0.2558, ..., -0.1958, -0.1663, 0.1739], [ 0.0165, -0.4058, 0.2114, ..., 0.0570, -0.2090, -0.2001]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -4.6566e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.7940e-09, 5.1223e-09, ..., 2.7940e-09, 0.0000e+00, -4.6566e-10], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -4.6566e-09, -9.3132e-10, ..., -3.7253e-09, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, -4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., -1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 403, bias, value: tensor([-0.0194, -0.0331, -0.0319, -0.0275, -0.0055, 0.0111, 0.0116, -0.0061, -0.0164, -0.0185], device='cuda:0'), grad: tensor([-1.0710e-08, 8.3819e-09, 1.8626e-09, 6.0536e-09, 2.7940e-09, -8.3819e-09, 1.4901e-08, -5.5879e-09, -9.3132e-10, -4.6566e-09], device='cuda:0') 100 1e-05 changing lr epoch 402, time 220.85, cls_loss 0.0005 cls_loss_mapping 0.0005 cls_loss_causal 0.4405 re_mapping 0.0027 re_causal 0.0079 /// teacc 99.22 lr 0.00001000 Epoch 404, weight, value: tensor([[-0.2608, -0.3219, 0.1270, ..., -0.1658, 0.0468, 0.0470], [-0.1564, -0.0857, -0.1216, ..., -0.2616, -0.0728, -0.0249], [ 0.0081, -0.2066, -0.2763, ..., -0.2037, 0.0308, -0.4460], ..., [-0.2350, 0.1994, 0.0342, ..., 0.2642, -0.0729, -0.1764], [-0.2276, -0.2207, 0.2558, ..., -0.1959, -0.1663, 0.1739], [ 0.0164, -0.4058, 0.2114, ..., 0.0570, -0.2090, -0.2002]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -1.3970e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3039e-08, 4.1910e-09, ..., 9.3132e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], ..., [ 0.0000e+00, -2.7474e-08, -8.8476e-09, ..., -1.6298e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 5.1223e-09, 3.7253e-09, ..., 3.2596e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 404, bias, value: tensor([-0.0194, -0.0331, -0.0319, -0.0275, -0.0055, 0.0111, 0.0116, -0.0061, -0.0164, -0.0185], device='cuda:0'), grad: tensor([-3.7253e-09, 1.0571e-07, 3.7253e-09, 1.6764e-08, -7.5903e-08, 2.3283e-09, 2.7940e-09, -5.9139e-08, 4.6566e-10, 1.8626e-08], device='cuda:0') 100 1e-05 changing lr epoch 403, time 220.28, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4493 re_mapping 0.0026 re_causal 0.0079 /// teacc 99.21 lr 0.00001000 Epoch 405, weight, value: tensor([[-0.2609, -0.3219, 0.1270, ..., -0.1658, 0.0468, 0.0470], [-0.1564, -0.0857, -0.1215, ..., -0.2616, -0.0728, -0.0249], [ 0.0081, -0.2066, -0.2763, ..., -0.2037, 0.0308, -0.4460], ..., [-0.2350, 0.1994, 0.0342, ..., 0.2642, -0.0729, -0.1764], [-0.2276, -0.2207, 0.2558, ..., -0.1959, -0.1663, 0.1739], [ 0.0163, -0.4059, 0.2114, ..., 0.0570, -0.2090, -0.2003]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, -5.5879e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 3.7253e-09, 9.3132e-10, 5.1223e-09, ..., 2.0955e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 405, bias, value: tensor([-0.0195, -0.0331, -0.0319, -0.0275, -0.0055, 0.0111, 0.0115, -0.0061, -0.0164, -0.0185], device='cuda:0'), grad: tensor([-8.8476e-09, 7.9162e-09, -4.6566e-10, -1.6764e-08, -1.3784e-07, 8.3819e-09, 4.1910e-09, 6.0536e-09, 1.8626e-09, 1.3877e-07], device='cuda:0') 100 1e-05 changing lr epoch 404, time 220.43, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4240 re_mapping 0.0024 re_causal 0.0077 /// teacc 99.19 lr 0.00001000 Epoch 406, weight, value: tensor([[-0.2609, -0.3220, 0.1270, ..., -0.1658, 0.0468, 0.0470], [-0.1565, -0.0858, -0.1216, ..., -0.2617, -0.0728, -0.0249], [ 0.0080, -0.2066, -0.2764, ..., -0.2037, 0.0308, -0.4461], ..., [-0.2350, 0.1994, 0.0342, ..., 0.2643, -0.0729, -0.1764], [-0.2278, -0.2208, 0.2558, ..., -0.1959, -0.1664, 0.1738], [ 0.0162, -0.4059, 0.2114, ..., 0.0570, -0.2090, -0.2005]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, -4.6566e-10, ..., 9.3132e-10, 0.0000e+00, 4.6566e-10], [ 4.6566e-10, 0.0000e+00, 3.2596e-09, ..., 0.0000e+00, 0.0000e+00, 3.2596e-09], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., -9.3132e-10, 0.0000e+00, 0.0000e+00], [-1.3970e-09, 0.0000e+00, -6.5193e-09, ..., 0.0000e+00, 0.0000e+00, -6.9849e-09], [-9.3132e-10, 9.3132e-10, -4.1910e-09, ..., 1.2573e-08, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 406, bias, value: tensor([-0.0195, -0.0332, -0.0319, -0.0275, -0.0055, 0.0111, 0.0115, -0.0061, -0.0164, -0.0185], device='cuda:0'), grad: tensor([ 2.7940e-09, -1.0710e-08, 1.5367e-08, 9.3132e-09, -7.6368e-08, -3.0501e-07, 3.2037e-07, 1.6298e-08, -2.7008e-08, 5.9605e-08], device='cuda:0') 100 1e-05 changing lr epoch 405, time 220.71, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4448 re_mapping 0.0024 re_causal 0.0078 /// teacc 99.21 lr 0.00001000 Epoch 407, weight, value: tensor([[-0.2609, -0.3220, 0.1270, ..., -0.1658, 0.0468, 0.0470], [-0.1565, -0.0858, -0.1216, ..., -0.2618, -0.0728, -0.0249], [ 0.0080, -0.2067, -0.2764, ..., -0.2037, 0.0308, -0.4461], ..., [-0.2350, 0.1994, 0.0342, ..., 0.2643, -0.0729, -0.1764], [-0.2278, -0.2208, 0.2559, ..., -0.1959, -0.1664, 0.1739], [ 0.0161, -0.4060, 0.2114, ..., 0.0570, -0.2090, -0.2006]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, -4.6566e-10], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], ..., [ 0.0000e+00, 3.2596e-09, 1.8626e-09, ..., -9.3132e-10, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, -4.6566e-09, ..., -4.6566e-10, 0.0000e+00, -3.7253e-09], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 6.5193e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 407, bias, value: tensor([-0.0195, -0.0332, -0.0319, -0.0275, -0.0055, 0.0111, 0.0115, -0.0061, -0.0164, -0.0186], device='cuda:0'), grad: tensor([ 9.3132e-10, 4.6566e-10, 3.7253e-09, -7.4506e-09, -5.2620e-08, -1.8626e-08, 2.8871e-08, 6.5193e-09, -1.5832e-08, 4.6566e-08], device='cuda:0') 100 1e-05 changing lr epoch 406, time 220.57, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4385 re_mapping 0.0024 re_causal 0.0077 /// teacc 99.20 lr 0.00001000 Epoch 408, weight, value: tensor([[-0.2609, -0.3220, 0.1270, ..., -0.1658, 0.0468, 0.0470], [-0.1566, -0.0858, -0.1216, ..., -0.2619, -0.0728, -0.0249], [ 0.0080, -0.2067, -0.2764, ..., -0.2037, 0.0308, -0.4461], ..., [-0.2351, 0.1995, 0.0342, ..., 0.2644, -0.0729, -0.1764], [-0.2278, -0.2208, 0.2559, ..., -0.1959, -0.1664, 0.1739], [ 0.0161, -0.4061, 0.2114, ..., 0.0569, -0.2090, -0.2006]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -3.7253e-09, ..., 0.0000e+00, -4.6566e-10, -1.3970e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -4.6566e-10, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 1.3970e-09, 4.6566e-10, 1.8626e-09, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [-9.3132e-10, 0.0000e+00, -5.5879e-09, ..., 0.0000e+00, 0.0000e+00, -5.1223e-09], [-3.7253e-09, 0.0000e+00, -4.6566e-09, ..., -1.3970e-09, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 408, bias, value: tensor([-0.0195, -0.0332, -0.0319, -0.0275, -0.0055, 0.0111, 0.0115, -0.0061, -0.0164, -0.0186], device='cuda:0'), grad: tensor([-1.4435e-08, 1.3970e-09, -3.7253e-09, 3.2596e-09, -3.2596e-09, 1.6764e-08, 1.0710e-08, 1.3039e-08, -1.2573e-08, -7.4506e-09], device='cuda:0') 100 1e-05 changing lr epoch 407, time 220.66, cls_loss 0.0005 cls_loss_mapping 0.0005 cls_loss_causal 0.4437 re_mapping 0.0023 re_causal 0.0077 /// teacc 99.20 lr 0.00001000 Epoch 409, weight, value: tensor([[-0.2609, -0.3220, 0.1270, ..., -0.1659, 0.0468, 0.0470], [-0.1566, -0.0858, -0.1216, ..., -0.2619, -0.0728, -0.0250], [ 0.0080, -0.2067, -0.2764, ..., -0.2038, 0.0308, -0.4461], ..., [-0.2351, 0.1995, 0.0342, ..., 0.2644, -0.0729, -0.1764], [-0.2279, -0.2208, 0.2559, ..., -0.1959, -0.1664, 0.1739], [ 0.0160, -0.4061, 0.2115, ..., 0.0569, -0.2090, -0.2007]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 1.2107e-08, 4.6566e-09, ..., 1.0245e-08, 0.0000e+00, 0.0000e+00], [-4.6566e-10, -1.3970e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 4.6566e-10, -1.4901e-08, -6.0536e-09, ..., -1.3504e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 1.8626e-09, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 409, bias, value: tensor([-0.0195, -0.0332, -0.0319, -0.0275, -0.0054, 0.0111, 0.0115, -0.0061, -0.0164, -0.0186], device='cuda:0'), grad: tensor([ 1.3970e-09, 3.7253e-08, -1.3504e-08, 1.3970e-08, -1.8626e-09, -9.7789e-09, 1.8626e-09, -3.3993e-08, 9.3132e-10, 1.4435e-08], device='cuda:0') 100 1e-05 changing lr epoch 408, time 220.53, cls_loss 0.0005 cls_loss_mapping 0.0005 cls_loss_causal 0.4458 re_mapping 0.0023 re_causal 0.0075 /// teacc 99.20 lr 0.00001000 Epoch 410, weight, value: tensor([[-0.2609, -0.3220, 0.1270, ..., -0.1659, 0.0468, 0.0470], [-0.1567, -0.0858, -0.1216, ..., -0.2620, -0.0728, -0.0250], [ 0.0080, -0.2067, -0.2765, ..., -0.2038, 0.0308, -0.4462], ..., [-0.2351, 0.1995, 0.0342, ..., 0.2645, -0.0729, -0.1764], [-0.2280, -0.2208, 0.2559, ..., -0.1960, -0.1664, 0.1739], [ 0.0160, -0.4062, 0.2115, ..., 0.0569, -0.2090, -0.2007]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 4.6566e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [-4.6566e-10, 9.3132e-10, 4.6566e-10, ..., 9.3132e-10, 0.0000e+00, -9.3132e-10], [ 0.0000e+00, 9.3132e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], ..., [ 0.0000e+00, 4.6566e-10, 1.3970e-09, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 4.6566e-10, -4.6566e-10, ..., 4.6566e-10, 0.0000e+00, -4.6566e-10], [ 9.3132e-10, 4.6566e-10, -6.1467e-08, ..., -7.4971e-08, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 410, bias, value: tensor([-0.0195, -0.0332, -0.0319, -0.0275, -0.0054, 0.0111, 0.0114, -0.0061, -0.0164, -0.0186], device='cuda:0'), grad: tensor([ 1.1176e-08, 1.1502e-07, -1.4575e-07, 6.9849e-09, 2.7847e-07, 3.4459e-08, -4.9826e-08, 2.3749e-08, 2.7008e-08, -2.9523e-07], device='cuda:0') 100 1e-05 changing lr epoch 409, time 220.40, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4504 re_mapping 0.0023 re_causal 0.0077 /// teacc 99.19 lr 0.00001000 Epoch 411, weight, value: tensor([[-0.2610, -0.3220, 0.1270, ..., -0.1659, 0.0468, 0.0470], [-0.1567, -0.0858, -0.1216, ..., -0.2621, -0.0728, -0.0250], [ 0.0079, -0.2067, -0.2765, ..., -0.2038, 0.0308, -0.4462], ..., [-0.2351, 0.1996, 0.0342, ..., 0.2646, -0.0729, -0.1764], [-0.2281, -0.2208, 0.2559, ..., -0.1960, -0.1664, 0.1738], [ 0.0160, -0.4062, 0.2115, ..., 0.0569, -0.2090, -0.2007]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 0.0000e+00, 0.0000e+00, -2.7940e-09], [ 0.0000e+00, -3.7253e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 4.6566e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [-4.6566e-10, 0.0000e+00, -1.3970e-09, ..., -9.3132e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 411, bias, value: tensor([-0.0195, -0.0332, -0.0319, -0.0275, -0.0054, 0.0111, 0.0114, -0.0061, -0.0165, -0.0186], device='cuda:0'), grad: tensor([-1.4901e-08, -2.0489e-08, -6.9849e-09, -1.4435e-08, 9.3132e-10, 1.8161e-08, 1.1642e-08, 2.7474e-08, 2.3283e-09, -3.7253e-09], device='cuda:0') 100 1e-05 changing lr epoch 410, time 220.34, cls_loss 0.0005 cls_loss_mapping 0.0005 cls_loss_causal 0.4415 re_mapping 0.0024 re_causal 0.0077 /// teacc 99.18 lr 0.00001000 Epoch 412, weight, value: tensor([[-0.2610, -0.3220, 0.1271, ..., -0.1659, 0.0468, 0.0470], [-0.1567, -0.0859, -0.1216, ..., -0.2622, -0.0728, -0.0250], [ 0.0079, -0.2068, -0.2765, ..., -0.2038, 0.0308, -0.4463], ..., [-0.2351, 0.1996, 0.0342, ..., 0.2646, -0.0729, -0.1764], [-0.2281, -0.2208, 0.2559, ..., -0.1960, -0.1664, 0.1739], [ 0.0160, -0.4062, 0.2115, ..., 0.0569, -0.2090, -0.2008]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 9.3132e-10, 0.0000e+00, 9.3132e-10], ..., [ 0.0000e+00, -3.1665e-08, 2.3283e-09, ..., -3.6275e-07, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, -4.6566e-10, -5.1223e-09, ..., -1.8626e-09, 0.0000e+00, -3.2596e-09], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 412, bias, value: tensor([-0.0195, -0.0332, -0.0319, -0.0275, -0.0054, 0.0111, 0.0114, -0.0061, -0.0165, -0.0186], device='cuda:0'), grad: tensor([ 9.3132e-10, 3.2596e-09, 6.0536e-09, 2.7940e-09, 9.9093e-07, 1.1642e-08, 6.9849e-09, -1.0002e-06, -1.3039e-08, 5.1223e-09], device='cuda:0') 100 1e-05 changing lr epoch 411, time 220.51, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4385 re_mapping 0.0023 re_causal 0.0074 /// teacc 99.19 lr 0.00001000 Epoch 413, weight, value: tensor([[-0.2610, -0.3221, 0.1271, ..., -0.1659, 0.0468, 0.0470], [-0.1567, -0.0858, -0.1216, ..., -0.2622, -0.0728, -0.0250], [ 0.0079, -0.2068, -0.2765, ..., -0.2039, 0.0308, -0.4463], ..., [-0.2351, 0.1996, 0.0342, ..., 0.2647, -0.0729, -0.1764], [-0.2281, -0.2208, 0.2560, ..., -0.1960, -0.1664, 0.1739], [ 0.0159, -0.4063, 0.2115, ..., 0.0569, -0.2090, -0.2009]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09], [ 0.0000e+00, 9.3132e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -2.7940e-09, -2.3283e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 9.3132e-10], [ 0.0000e+00, 4.6566e-10, 4.6566e-10, ..., 6.0536e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 413, bias, value: tensor([-0.0194, -0.0332, -0.0319, -0.0275, -0.0054, 0.0111, 0.0114, -0.0061, -0.0165, -0.0187], device='cuda:0'), grad: tensor([ 9.3132e-09, 3.7253e-09, 2.7940e-09, 4.6566e-10, -6.7055e-08, -2.5053e-07, 2.4540e-07, -1.8626e-09, 1.1176e-08, 5.1688e-08], device='cuda:0') 100 1e-05 changing lr epoch 412, time 220.16, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4277 re_mapping 0.0023 re_causal 0.0075 /// teacc 99.19 lr 0.00001000 Epoch 414, weight, value: tensor([[-0.2610, -0.3221, 0.1271, ..., -0.1659, 0.0468, 0.0470], [-0.1567, -0.0859, -0.1216, ..., -0.2624, -0.0728, -0.0250], [ 0.0079, -0.2068, -0.2766, ..., -0.2039, 0.0308, -0.4464], ..., [-0.2352, 0.1997, 0.0342, ..., 0.2648, -0.0729, -0.1764], [-0.2282, -0.2208, 0.2560, ..., -0.1960, -0.1664, 0.1739], [ 0.0159, -0.4063, 0.2116, ..., 0.0569, -0.2090, -0.2009]], device='cuda:0'), grad: tensor([[1.3970e-09, 9.3132e-10, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 2.3283e-09, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 414, bias, value: tensor([-0.0194, -0.0332, -0.0319, -0.0275, -0.0054, 0.0111, 0.0114, -0.0061, -0.0165, -0.0187], device='cuda:0'), grad: tensor([ 8.3819e-09, 4.1910e-09, 2.3283e-09, -5.3551e-08, 1.3970e-09, 2.8871e-08, 9.3132e-10, 0.0000e+00, 1.3504e-08, 1.3970e-09], device='cuda:0') 100 1e-05 changing lr epoch 413, time 220.50, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4228 re_mapping 0.0022 re_causal 0.0073 /// teacc 99.19 lr 0.00001000 Epoch 415, weight, value: tensor([[-0.2610, -0.3221, 0.1271, ..., -0.1659, 0.0468, 0.0470], [-0.1567, -0.0859, -0.1216, ..., -0.2625, -0.0728, -0.0250], [ 0.0078, -0.2068, -0.2766, ..., -0.2039, 0.0307, -0.4465], ..., [-0.2352, 0.1997, 0.0342, ..., 0.2649, -0.0729, -0.1764], [-0.2282, -0.2208, 0.2560, ..., -0.1961, -0.1665, 0.1739], [ 0.0159, -0.4063, 0.2116, ..., 0.0569, -0.2090, -0.2009]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 4.6566e-10, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 2.3283e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [-4.6566e-10, 0.0000e+00, -2.7940e-09, ..., -1.3970e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 415, bias, value: tensor([-0.0194, -0.0332, -0.0319, -0.0275, -0.0054, 0.0111, 0.0114, -0.0061, -0.0165, -0.0187], device='cuda:0'), grad: tensor([ 4.1910e-09, 1.2899e-07, -2.1793e-07, 8.3819e-09, 8.3819e-08, -1.1642e-08, -7.9162e-09, 3.2596e-09, 1.3970e-08, -7.4506e-09], device='cuda:0') 100 1e-05 changing lr epoch 414, time 220.49, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4249 re_mapping 0.0022 re_causal 0.0070 /// teacc 99.16 lr 0.00001000 Epoch 416, weight, value: tensor([[-0.2610, -0.3221, 0.1271, ..., -0.1659, 0.0468, 0.0470], [-0.1567, -0.0860, -0.1217, ..., -0.2626, -0.0728, -0.0250], [ 0.0078, -0.2069, -0.2766, ..., -0.2039, 0.0307, -0.4465], ..., [-0.2352, 0.1998, 0.0342, ..., 0.2649, -0.0729, -0.1764], [-0.2283, -0.2208, 0.2560, ..., -0.1961, -0.1665, 0.1739], [ 0.0159, -0.4064, 0.2116, ..., 0.0569, -0.2090, -0.2010]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-09, -3.7253e-09, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3516e-07, 4.2375e-08, ..., 3.8184e-08, 0.0000e+00, -1.7229e-08], [ 0.0000e+00, 2.7940e-09, 4.1910e-09, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -2.8219e-07, -5.2620e-08, ..., -4.5635e-08, 0.0000e+00, 1.6298e-08], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 2.7940e-09, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 416, bias, value: tensor([-0.0194, -0.0333, -0.0319, -0.0275, -0.0054, 0.0111, 0.0114, -0.0061, -0.0165, -0.0187], device='cuda:0'), grad: tensor([-1.0710e-08, 4.0978e-07, 2.5611e-08, 2.5146e-08, 8.8476e-09, 3.3993e-08, 4.6566e-09, -5.1083e-07, 4.1910e-09, 7.9162e-09], device='cuda:0') 100 1e-05 changing lr epoch 415, time 220.76, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4488 re_mapping 0.0022 re_causal 0.0072 /// teacc 99.17 lr 0.00001000 Epoch 417, weight, value: tensor([[-0.2611, -0.3221, 0.1272, ..., -0.1659, 0.0468, 0.0470], [-0.1568, -0.0860, -0.1217, ..., -0.2626, -0.0728, -0.0250], [ 0.0078, -0.2069, -0.2766, ..., -0.2039, 0.0307, -0.4465], ..., [-0.2352, 0.1998, 0.0342, ..., 0.2649, -0.0729, -0.1764], [-0.2284, -0.2208, 0.2560, ..., -0.1961, -0.1665, 0.1739], [ 0.0158, -0.4064, 0.2116, ..., 0.0569, -0.2090, -0.2011]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.7940e-09, 9.3132e-10, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -4.1910e-09, -1.3970e-09, ..., -3.7253e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 417, bias, value: tensor([-0.0194, -0.0333, -0.0319, -0.0275, -0.0054, 0.0111, 0.0113, -0.0060, -0.0165, -0.0187], device='cuda:0'), grad: tensor([ 1.3970e-09, -2.0815e-07, 1.0990e-07, 1.3970e-09, 1.1176e-08, 1.8626e-09, 2.7940e-09, 7.4040e-08, 4.6566e-09, 1.8626e-09], device='cuda:0') 100 1e-05 changing lr epoch 416, time 220.01, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4483 re_mapping 0.0021 re_causal 0.0074 /// teacc 99.17 lr 0.00001000 Epoch 418, weight, value: tensor([[-0.2611, -0.3221, 0.1272, ..., -0.1659, 0.0468, 0.0470], [-0.1569, -0.0860, -0.1217, ..., -0.2627, -0.0728, -0.0250], [ 0.0078, -0.2069, -0.2766, ..., -0.2039, 0.0307, -0.4465], ..., [-0.2352, 0.1998, 0.0342, ..., 0.2650, -0.0729, -0.1764], [-0.2285, -0.2209, 0.2561, ..., -0.1961, -0.1665, 0.1739], [ 0.0158, -0.4065, 0.2116, ..., 0.0569, -0.2090, -0.2011]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 0.0000e+00, -1.0245e-08, ..., 0.0000e+00, 0.0000e+00, -9.3132e-10], [ 9.3132e-09, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09], [ 3.2596e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], ..., [ 6.9849e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 5.1223e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], [ 9.7789e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09]], device='cuda:0') Epoch 418, bias, value: tensor([-0.0194, -0.0333, -0.0318, -0.0275, -0.0054, 0.0111, 0.0113, -0.0061, -0.0165, -0.0187], device='cuda:0'), grad: tensor([-2.7474e-08, 3.8650e-08, 7.9162e-09, 3.4506e-07, 1.4435e-08, -5.4715e-07, 1.0664e-07, 1.7695e-08, 1.3039e-08, 3.1665e-08], device='cuda:0') 100 1e-05 changing lr epoch 417, time 220.58, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4427 re_mapping 0.0021 re_causal 0.0073 /// teacc 99.17 lr 0.00001000 Epoch 419, weight, value: tensor([[-0.2611, -0.3222, 0.1272, ..., -0.1659, 0.0468, 0.0470], [-0.1570, -0.0860, -0.1217, ..., -0.2628, -0.0728, -0.0250], [ 0.0078, -0.2069, -0.2767, ..., -0.2039, 0.0307, -0.4466], ..., [-0.2353, 0.1999, 0.0343, ..., 0.2651, -0.0729, -0.1764], [-0.2286, -0.2209, 0.2561, ..., -0.1961, -0.1665, 0.1738], [ 0.0157, -0.4067, 0.2116, ..., 0.0569, -0.2090, -0.2012]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -4.6566e-10, ..., 0.0000e+00, 0.0000e+00, -1.3970e-09], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 419, bias, value: tensor([-0.0194, -0.0333, -0.0319, -0.0275, -0.0054, 0.0111, 0.0112, -0.0060, -0.0165, -0.0187], device='cuda:0'), grad: tensor([ 4.6566e-10, 6.5193e-09, -4.6566e-09, 9.3132e-10, 2.3283e-09, 1.8626e-09, 4.6566e-09, 0.0000e+00, -3.2596e-09, 9.3132e-10], device='cuda:0') 100 1e-05 changing lr epoch 418, time 220.49, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4269 re_mapping 0.0022 re_causal 0.0072 /// teacc 99.19 lr 0.00001000 Epoch 420, weight, value: tensor([[-0.2611, -0.3222, 0.1272, ..., -0.1660, 0.0468, 0.0470], [-0.1570, -0.0861, -0.1217, ..., -0.2629, -0.0728, -0.0250], [ 0.0078, -0.2069, -0.2767, ..., -0.2040, 0.0307, -0.4466], ..., [-0.2353, 0.2000, 0.0343, ..., 0.2652, -0.0729, -0.1764], [-0.2286, -0.2209, 0.2561, ..., -0.1962, -0.1665, 0.1739], [ 0.0157, -0.4068, 0.2117, ..., 0.0569, -0.2090, -0.2014]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -9.3132e-10, 0.0000e+00, ..., -4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 3.2596e-09, 0.0000e+00, 0.0000e+00, ..., 2.3283e-09, 0.0000e+00, 2.3283e-09]], device='cuda:0') Epoch 420, bias, value: tensor([-0.0194, -0.0333, -0.0318, -0.0275, -0.0054, 0.0111, 0.0111, -0.0060, -0.0165, -0.0187], device='cuda:0'), grad: tensor([ 3.2596e-09, 6.0536e-09, -4.3306e-08, 2.7940e-08, -4.7963e-08, -8.3819e-09, 2.7940e-09, 9.7789e-09, 5.5879e-09, 3.6322e-08], device='cuda:0') 100 1e-05 changing lr epoch 419, time 221.21, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.3942 re_mapping 0.0022 re_causal 0.0070 /// teacc 99.17 lr 0.00001000 Epoch 421, weight, value: tensor([[-0.2611, -0.3222, 0.1272, ..., -0.1660, 0.0468, 0.0470], [-0.1570, -0.0861, -0.1217, ..., -0.2629, -0.0728, -0.0250], [ 0.0078, -0.2069, -0.2767, ..., -0.2040, 0.0307, -0.4467], ..., [-0.2353, 0.2000, 0.0343, ..., 0.2652, -0.0729, -0.1764], [-0.2287, -0.2209, 0.2561, ..., -0.1962, -0.1665, 0.1739], [ 0.0156, -0.4069, 0.2117, ..., 0.0569, -0.2090, -0.2015]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-09, 4.6566e-10, ..., 4.1910e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 4.6566e-10, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -5.1223e-09, -1.3970e-09, ..., -7.4506e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 421, bias, value: tensor([-0.0194, -0.0333, -0.0318, -0.0275, -0.0054, 0.0111, 0.0111, -0.0060, -0.0165, -0.0187], device='cuda:0'), grad: tensor([ 4.6566e-10, 2.2817e-08, 1.3970e-09, 9.3132e-10, -2.2817e-08, 3.2596e-09, 1.3970e-09, -1.8626e-08, 9.3132e-10, 1.3970e-08], device='cuda:0') 100 1e-05 changing lr epoch 420, time 221.22, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4547 re_mapping 0.0022 re_causal 0.0075 /// teacc 99.17 lr 0.00001000 Epoch 422, weight, value: tensor([[-0.2612, -0.3222, 0.1271, ..., -0.1661, 0.0468, 0.0470], [-0.1570, -0.0861, -0.1217, ..., -0.2630, -0.0728, -0.0250], [ 0.0078, -0.2069, -0.2767, ..., -0.2040, 0.0307, -0.4468], ..., [-0.2353, 0.2000, 0.0343, ..., 0.2653, -0.0729, -0.1764], [-0.2287, -0.2209, 0.2561, ..., -0.1962, -0.1665, 0.1739], [ 0.0156, -0.4069, 0.2118, ..., 0.0569, -0.2090, -0.2015]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 4.6566e-10, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-2.3283e-09, 0.0000e+00, -7.4506e-09, ..., -6.5193e-09, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 422, bias, value: tensor([-0.0195, -0.0333, -0.0318, -0.0275, -0.0053, 0.0112, 0.0111, -0.0060, -0.0165, -0.0187], device='cuda:0'), grad: tensor([ 8.8476e-09, 9.3132e-09, -5.1223e-09, 1.4435e-08, -5.6345e-08, -9.3132e-10, 2.6077e-08, 4.6566e-09, 4.6566e-10, 2.3283e-09], device='cuda:0') 100 1e-05 changing lr epoch 421, time 220.41, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4403 re_mapping 0.0022 re_causal 0.0074 /// teacc 99.17 lr 0.00001000 Epoch 423, weight, value: tensor([[-0.2612, -0.3223, 0.1271, ..., -0.1661, 0.0467, 0.0470], [-0.1570, -0.0861, -0.1218, ..., -0.2631, -0.0728, -0.0250], [ 0.0077, -0.2070, -0.2768, ..., -0.2040, 0.0307, -0.4469], ..., [-0.2354, 0.2001, 0.0343, ..., 0.2653, -0.0729, -0.1764], [-0.2288, -0.2209, 0.2562, ..., -0.1962, -0.1665, 0.1739], [ 0.0155, -0.4069, 0.2118, ..., 0.0569, -0.2090, -0.2016]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -6.9849e-09, -2.5611e-09, ..., -7.4506e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 2.3283e-10, 0.0000e+00, 2.3283e-10], [-1.1642e-09, 6.5193e-09, -1.3970e-09, ..., 3.4925e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 423, bias, value: tensor([-0.0195, -0.0333, -0.0319, -0.0275, -0.0054, 0.0112, 0.0111, -0.0060, -0.0165, -0.0187], device='cuda:0'), grad: tensor([ 1.3970e-09, 4.4238e-09, -6.7521e-09, 6.9849e-09, 7.9162e-09, 6.9849e-09, -6.7521e-09, -1.2573e-08, 5.1223e-09, -9.3132e-10], device='cuda:0') 100 1e-05 changing lr epoch 422, time 220.23, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4176 re_mapping 0.0021 re_causal 0.0071 /// teacc 99.14 lr 0.00001000 Epoch 424, weight, value: tensor([[-0.2613, -0.3223, 0.1271, ..., -0.1661, 0.0467, 0.0470], [-0.1571, -0.0862, -0.1218, ..., -0.2632, -0.0728, -0.0250], [ 0.0077, -0.2069, -0.2768, ..., -0.2040, 0.0307, -0.4469], ..., [-0.2354, 0.2001, 0.0343, ..., 0.2654, -0.0729, -0.1764], [-0.2288, -0.2209, 0.2562, ..., -0.1962, -0.1665, 0.1739], [ 0.0153, -0.4070, 0.2118, ..., 0.0569, -0.2090, -0.2018]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, -1.8626e-09, ..., 2.3283e-10, -6.9849e-10, -1.3970e-09], [ 2.0955e-09, 7.6368e-08, 6.4727e-08, ..., 3.0966e-08, 0.0000e+00, 5.1223e-09], [ 2.3283e-10, 3.4925e-09, 3.0268e-09, ..., 1.1642e-09, 0.0000e+00, 6.9849e-10], ..., [ 4.6566e-10, -7.6136e-08, -6.4727e-08, ..., -2.9337e-08, 0.0000e+00, 9.3132e-10], [ 1.1642e-09, -1.1642e-08, -1.5600e-08, ..., 1.3970e-09, 0.0000e+00, -3.0035e-08], [ 1.8626e-09, 4.6566e-10, 2.3283e-10, ..., 2.0955e-09, 2.3283e-10, 1.3970e-09]], device='cuda:0') Epoch 424, bias, value: tensor([-0.0195, -0.0334, -0.0318, -0.0275, -0.0054, 0.0112, 0.0111, -0.0060, -0.0165, -0.0187], device='cuda:0'), grad: tensor([-4.6566e-09, 2.3772e-07, 1.3039e-08, 1.3039e-08, -1.3295e-07, 6.0303e-08, 8.6613e-08, -2.0000e-07, -8.7079e-08, 2.0023e-08], device='cuda:0') 100 1e-05 changing lr epoch 423, time 220.89, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4180 re_mapping 0.0021 re_causal 0.0072 /// teacc 99.15 lr 0.00001000 Epoch 425, weight, value: tensor([[-0.2613, -0.3223, 0.1271, ..., -0.1661, 0.0467, 0.0470], [-0.1571, -0.0862, -0.1218, ..., -0.2632, -0.0728, -0.0251], [ 0.0077, -0.2070, -0.2768, ..., -0.2040, 0.0307, -0.4469], ..., [-0.2354, 0.2002, 0.0343, ..., 0.2654, -0.0729, -0.1764], [-0.2288, -0.2209, 0.2562, ..., -0.1962, -0.1665, 0.1739], [ 0.0153, -0.4071, 0.2119, ..., 0.0569, -0.2090, -0.2018]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 6.7521e-09, 2.3283e-09, ..., 4.8894e-09, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 2.0955e-08, 1.3970e-09, ..., 1.3970e-08, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -2.7707e-08, -3.4925e-09, ..., -2.0256e-08, 0.0000e+00, -2.3283e-10], [ 6.9849e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 6.9849e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 425, bias, value: tensor([-0.0195, -0.0334, -0.0318, -0.0275, -0.0054, 0.0112, 0.0111, -0.0060, -0.0165, -0.0187], device='cuda:0'), grad: tensor([ 6.9849e-10, 1.5832e-08, 4.9360e-08, -6.9849e-10, 2.3283e-09, -8.8476e-09, 1.0012e-08, -6.2631e-08, 2.0955e-09, 2.3283e-10], device='cuda:0') 100 1e-05 changing lr epoch 424, time 220.38, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4032 re_mapping 0.0021 re_causal 0.0072 /// teacc 99.14 lr 0.00001000 Epoch 426, weight, value: tensor([[-0.2614, -0.3223, 0.1272, ..., -0.1661, 0.0467, 0.0470], [-0.1571, -0.0862, -0.1218, ..., -0.2633, -0.0728, -0.0251], [ 0.0076, -0.2070, -0.2769, ..., -0.2040, 0.0307, -0.4470], ..., [-0.2355, 0.2002, 0.0343, ..., 0.2655, -0.0729, -0.1764], [-0.2289, -0.2209, 0.2563, ..., -0.1963, -0.1665, 0.1739], [ 0.0152, -0.4071, 0.2119, ..., 0.0569, -0.2090, -0.2019]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 4.6566e-10, 1.3970e-09, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, -2.6310e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 2.3283e-10, 2.4680e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 2.3283e-10, 1.1642e-09, ..., 6.9849e-10, 0.0000e+00, 6.9849e-10], [-6.9849e-09, 0.0000e+00, -9.5461e-09, ..., -8.6147e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 426, bias, value: tensor([-0.0195, -0.0334, -0.0318, -0.0275, -0.0054, 0.0112, 0.0111, -0.0060, -0.0166, -0.0187], device='cuda:0'), grad: tensor([ 3.4925e-09, 9.5461e-09, -1.2456e-07, 2.6543e-08, 2.7241e-08, -1.4901e-08, -9.3132e-10, 1.1781e-07, 9.0804e-09, -4.4005e-08], device='cuda:0') 100 1e-05 changing lr epoch 425, time 220.71, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4473 re_mapping 0.0021 re_causal 0.0074 /// teacc 99.17 lr 0.00001000 Epoch 427, weight, value: tensor([[-0.2614, -0.3223, 0.1272, ..., -0.1662, 0.0467, 0.0470], [-0.1571, -0.0863, -0.1218, ..., -0.2633, -0.0728, -0.0251], [ 0.0076, -0.2070, -0.2769, ..., -0.2040, 0.0306, -0.4471], ..., [-0.2355, 0.2002, 0.0343, ..., 0.2655, -0.0729, -0.1764], [-0.2290, -0.2209, 0.2563, ..., -0.1963, -0.1666, 0.1739], [ 0.0151, -0.4072, 0.2120, ..., 0.0570, -0.2090, -0.2020]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.9849e-10, 0.0000e+00, 9.3132e-10], [ 1.1642e-09, 2.3283e-10, 4.6566e-10, ..., 2.3283e-10, 0.0000e+00, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, 2.3283e-10, ..., -1.6298e-09, 0.0000e+00, -1.8626e-09], ..., [ 0.0000e+00, 0.0000e+00, 2.3283e-10, ..., 4.6566e-10, 0.0000e+00, 9.3132e-10], [ 9.3132e-10, 0.0000e+00, -9.3132e-10, ..., 0.0000e+00, 0.0000e+00, -4.6566e-10], [-4.6566e-10, 0.0000e+00, -9.3132e-10, ..., -2.3283e-10, 0.0000e+00, 2.3283e-10]], device='cuda:0') Epoch 427, bias, value: tensor([-0.0195, -0.0334, -0.0318, -0.0275, -0.0054, 0.0112, 0.0110, -0.0060, -0.0166, -0.0187], device='cuda:0'), grad: tensor([ 2.5146e-08, 1.6531e-08, 7.9162e-09, 2.0023e-08, -1.8859e-08, -3.9348e-08, -1.4435e-08, 1.5367e-08, 1.8626e-09, -1.6298e-09], device='cuda:0') 100 1e-05 changing lr epoch 426, time 220.27, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.3990 re_mapping 0.0021 re_causal 0.0072 /// teacc 99.17 lr 0.00001000 Epoch 428, weight, value: tensor([[-0.2615, -0.3224, 0.1271, ..., -0.1663, 0.0467, 0.0470], [-0.1571, -0.0863, -0.1218, ..., -0.2634, -0.0728, -0.0250], [ 0.0075, -0.2070, -0.2769, ..., -0.2040, 0.0306, -0.4471], ..., [-0.2355, 0.2003, 0.0344, ..., 0.2656, -0.0729, -0.1765], [-0.2290, -0.2210, 0.2563, ..., -0.1963, -0.1666, 0.1739], [ 0.0151, -0.4072, 0.2120, ..., 0.0570, -0.2090, -0.2021]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -6.9849e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, 6.9849e-10, ..., 2.3283e-10, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 428, bias, value: tensor([-0.0196, -0.0334, -0.0318, -0.0275, -0.0054, 0.0112, 0.0110, -0.0059, -0.0166, -0.0187], device='cuda:0'), grad: tensor([-3.0268e-09, 1.3970e-09, 1.8626e-09, 2.5611e-09, 1.3970e-09, 1.6298e-09, 1.6298e-09, 1.6298e-09, 1.1642e-09, 5.1223e-09], device='cuda:0') 100 1e-05 changing lr epoch 427, time 220.36, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4038 re_mapping 0.0021 re_causal 0.0071 /// teacc 99.18 lr 0.00001000 Epoch 429, weight, value: tensor([[-0.2616, -0.3224, 0.1269, ..., -0.1665, 0.0467, 0.0470], [-0.1571, -0.0863, -0.1218, ..., -0.2635, -0.0728, -0.0250], [ 0.0075, -0.2070, -0.2770, ..., -0.2040, 0.0306, -0.4472], ..., [-0.2356, 0.2004, 0.0344, ..., 0.2657, -0.0729, -0.1765], [-0.2290, -0.2210, 0.2563, ..., -0.1963, -0.1666, 0.1739], [ 0.0151, -0.4073, 0.2122, ..., 0.0570, -0.2090, -0.2021]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 1.3970e-09, 2.3283e-10, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 1.1642e-09, 3.0268e-09, 6.9849e-10, ..., 3.2596e-09, 0.0000e+00, -2.3283e-10], [-1.0710e-08, 1.8626e-09, 4.6566e-10, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], ..., [ 9.3132e-10, -9.7789e-09, -9.3132e-10, ..., -9.3132e-09, 0.0000e+00, 0.0000e+00], [ 5.1223e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.7940e-09, -6.9849e-10, ..., 1.6298e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 429, bias, value: tensor([-0.0196, -0.0334, -0.0318, -0.0275, -0.0054, 0.0112, 0.0110, -0.0059, -0.0166, -0.0187], device='cuda:0'), grad: tensor([ 6.2864e-09, 1.0943e-08, -4.2375e-08, 2.7940e-09, 1.0943e-08, 2.3283e-09, 3.7253e-09, -1.9558e-08, 2.3749e-08, 4.4238e-09], device='cuda:0') 100 1e-05 changing lr epoch 428, time 220.39, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4428 re_mapping 0.0021 re_causal 0.0073 /// teacc 99.17 lr 0.00001000 Epoch 430, weight, value: tensor([[-0.2616, -0.3224, 0.1269, ..., -0.1665, 0.0467, 0.0470], [-0.1572, -0.0864, -0.1219, ..., -0.2635, -0.0728, -0.0251], [ 0.0075, -0.2070, -0.2770, ..., -0.2041, 0.0306, -0.4472], ..., [-0.2356, 0.2004, 0.0344, ..., 0.2658, -0.0729, -0.1765], [-0.2291, -0.2210, 0.2563, ..., -0.1963, -0.1666, 0.1739], [ 0.0151, -0.4074, 0.2122, ..., 0.0570, -0.2090, -0.2022]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, -1.3970e-09, ..., 2.3283e-10, 0.0000e+00, -2.3283e-10], [ 2.3283e-10, -1.8626e-09, 6.9849e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 6.9849e-10, 6.9849e-10, ..., 4.6566e-10, 0.0000e+00, 2.3283e-10], ..., [ 0.0000e+00, -4.6566e-10, 2.3283e-10, ..., -2.0955e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-1.3970e-09, 2.3283e-10, -3.2596e-09, ..., -1.6298e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 430, bias, value: tensor([-0.0196, -0.0335, -0.0318, -0.0275, -0.0055, 0.0112, 0.0110, -0.0059, -0.0166, -0.0187], device='cuda:0'), grad: tensor([-2.0955e-09, -7.9162e-09, 4.4238e-09, 1.3970e-09, 1.0245e-08, 1.3970e-09, 4.6566e-10, 3.9581e-09, 2.3283e-10, -9.7789e-09], device='cuda:0') 100 1e-05 changing lr epoch 429, time 219.93, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4230 re_mapping 0.0021 re_causal 0.0074 /// teacc 99.16 lr 0.00001000 Epoch 431, weight, value: tensor([[-0.2616, -0.3224, 0.1269, ..., -0.1665, 0.0467, 0.0470], [-0.1573, -0.0864, -0.1219, ..., -0.2636, -0.0728, -0.0251], [ 0.0075, -0.2071, -0.2770, ..., -0.2041, 0.0306, -0.4472], ..., [-0.2356, 0.2004, 0.0344, ..., 0.2658, -0.0729, -0.1765], [-0.2292, -0.2210, 0.2563, ..., -0.1964, -0.1666, 0.1739], [ 0.0151, -0.4074, 0.2122, ..., 0.0570, -0.2090, -0.2022]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -1.0012e-08, ..., 0.0000e+00, 0.0000e+00, -6.9849e-10], [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 0.0000e+00, 0.0000e+00, -2.3283e-10], [ 0.0000e+00, 0.0000e+00, 4.4238e-09, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-1.1642e-09, 0.0000e+00, -1.1642e-09, ..., 0.0000e+00, 0.0000e+00, -3.0268e-09], [ 0.0000e+00, 0.0000e+00, 2.3283e-09, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 431, bias, value: tensor([-0.0196, -0.0335, -0.0318, -0.0275, -0.0054, 0.0112, 0.0110, -0.0059, -0.0166, -0.0187], device='cuda:0'), grad: tensor([-1.0431e-07, 5.1223e-09, 4.9826e-08, 4.1910e-09, 1.2806e-08, 2.0955e-09, 2.2817e-08, 3.2596e-09, -6.7521e-09, 5.5879e-09], device='cuda:0') 100 1e-05 changing lr epoch 430, time 220.27, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4234 re_mapping 0.0021 re_causal 0.0071 /// teacc 99.16 lr 0.00001000 Epoch 432, weight, value: tensor([[-0.2616, -0.3224, 0.1269, ..., -0.1665, 0.0467, 0.0471], [-0.1573, -0.0864, -0.1219, ..., -0.2636, -0.0728, -0.0251], [ 0.0075, -0.2071, -0.2771, ..., -0.2041, 0.0306, -0.4472], ..., [-0.2356, 0.2004, 0.0344, ..., 0.2658, -0.0729, -0.1765], [-0.2293, -0.2210, 0.2564, ..., -0.1964, -0.1666, 0.1739], [ 0.0151, -0.4074, 0.2123, ..., 0.0571, -0.2090, -0.2023]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, -1.1176e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 6.9849e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 6.0536e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-09]], device='cuda:0') Epoch 432, bias, value: tensor([-0.0196, -0.0335, -0.0318, -0.0275, -0.0054, 0.0112, 0.0109, -0.0060, -0.0166, -0.0186], device='cuda:0'), grad: tensor([ 5.8208e-09, -3.9814e-08, 3.0268e-09, -1.9325e-08, -3.0268e-09, 2.2585e-08, -7.4506e-09, 3.4925e-09, 2.6776e-08, 1.1874e-08], device='cuda:0') 100 1e-05 changing lr epoch 431, time 220.41, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4287 re_mapping 0.0021 re_causal 0.0073 /// teacc 99.17 lr 0.00001000 Epoch 433, weight, value: tensor([[-0.2616, -0.3224, 0.1269, ..., -0.1666, 0.0467, 0.0471], [-0.1573, -0.0864, -0.1219, ..., -0.2637, -0.0728, -0.0251], [ 0.0076, -0.2071, -0.2771, ..., -0.2041, 0.0306, -0.4473], ..., [-0.2356, 0.2005, 0.0344, ..., 0.2659, -0.0729, -0.1765], [-0.2294, -0.2210, 0.2564, ..., -0.1964, -0.1666, 0.1739], [ 0.0151, -0.4076, 0.2123, ..., 0.0571, -0.2090, -0.2023]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 6.9849e-10, ..., 6.9849e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 7.2177e-08, 6.4727e-08, ..., 2.8173e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.1910e-09, 3.2596e-09, ..., 2.0955e-09, 0.0000e+00, 2.3283e-10], ..., [ 0.0000e+00, -7.8231e-08, -4.4936e-08, ..., -1.3039e-08, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 6.9849e-10, 3.0268e-09, ..., 2.0955e-09, 0.0000e+00, 0.0000e+00], [-1.8626e-09, 4.6566e-10, -6.1467e-08, ..., -4.7497e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 433, bias, value: tensor([-0.0196, -0.0335, -0.0318, -0.0275, -0.0054, 0.0112, 0.0109, -0.0059, -0.0167, -0.0187], device='cuda:0'), grad: tensor([ 3.4925e-09, 2.0117e-07, 1.1642e-08, 3.1898e-08, 8.8708e-08, -2.1653e-08, 2.8173e-08, -1.3178e-07, 1.0245e-08, -2.1514e-07], device='cuda:0') 100 1e-05 changing lr epoch 432, time 220.36, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4437 re_mapping 0.0021 re_causal 0.0074 /// teacc 99.17 lr 0.00001000 Epoch 434, weight, value: tensor([[-0.2616, -0.3224, 0.1269, ..., -0.1666, 0.0467, 0.0471], [-0.1574, -0.0865, -0.1219, ..., -0.2638, -0.0728, -0.0251], [ 0.0076, -0.2071, -0.2771, ..., -0.2041, 0.0306, -0.4473], ..., [-0.2356, 0.2005, 0.0344, ..., 0.2659, -0.0729, -0.1765], [-0.2294, -0.2210, 0.2564, ..., -0.1964, -0.1666, 0.1739], [ 0.0150, -0.4076, 0.2124, ..., 0.0571, -0.2090, -0.2024]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -1.5367e-08, ..., 0.0000e+00, 0.0000e+00, -3.4925e-09], [ 0.0000e+00, 0.0000e+00, 2.0955e-09, ..., 0.0000e+00, 0.0000e+00, 1.1642e-09], [ 0.0000e+00, 0.0000e+00, 6.7521e-09, ..., 0.0000e+00, 0.0000e+00, 6.9849e-10], ..., [ 0.0000e+00, 0.0000e+00, 6.9849e-10, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 2.3283e-10, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 4.6566e-10, 0.0000e+00, 6.9849e-10, ..., 1.6298e-09, 0.0000e+00, 2.3283e-10]], device='cuda:0') Epoch 434, bias, value: tensor([-0.0196, -0.0335, -0.0317, -0.0275, -0.0054, 0.0112, 0.0109, -0.0060, -0.0167, -0.0186], device='cuda:0'), grad: tensor([-3.9814e-08, 9.5461e-09, 1.4435e-08, 8.8476e-09, -4.4238e-09, 1.4203e-08, -1.7695e-08, 3.0268e-09, 8.8476e-09, 9.0804e-09], device='cuda:0') 100 1e-05 changing lr epoch 433, time 220.22, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4170 re_mapping 0.0020 re_causal 0.0073 /// teacc 99.17 lr 0.00001000 Epoch 435, weight, value: tensor([[-0.2617, -0.3224, 0.1270, ..., -0.1666, 0.0467, 0.0471], [-0.1574, -0.0865, -0.1219, ..., -0.2639, -0.0728, -0.0251], [ 0.0076, -0.2071, -0.2772, ..., -0.2041, 0.0306, -0.4473], ..., [-0.2356, 0.2005, 0.0344, ..., 0.2660, -0.0729, -0.1765], [-0.2295, -0.2211, 0.2564, ..., -0.1964, -0.1666, 0.1739], [ 0.0150, -0.4076, 0.2124, ..., 0.0571, -0.2090, -0.2025]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 2.3283e-10], [ 3.2596e-09, 2.3283e-10, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, -5.5879e-09], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 1.8626e-09], ..., [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 4.4238e-09, 0.0000e+00, 4.6566e-10], [ 2.0955e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 6.5193e-09], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 3.4226e-08, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 435, bias, value: tensor([-0.0196, -0.0336, -0.0316, -0.0275, -0.0055, 0.0112, 0.0109, -0.0060, -0.0167, -0.0187], device='cuda:0'), grad: tensor([ 3.7253e-09, -8.3121e-08, 2.3516e-08, 1.8161e-08, -1.8044e-07, -3.3062e-08, 8.6147e-09, 2.4680e-08, 6.4494e-08, 1.5832e-07], device='cuda:0') 100 1e-05 changing lr epoch 434, time 220.74, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4147 re_mapping 0.0021 re_causal 0.0072 /// teacc 99.16 lr 0.00001000 Epoch 436, weight, value: tensor([[-0.2617, -0.3225, 0.1270, ..., -0.1666, 0.0467, 0.0471], [-0.1575, -0.0866, -0.1220, ..., -0.2640, -0.0728, -0.0251], [ 0.0076, -0.2071, -0.2772, ..., -0.2041, 0.0306, -0.4473], ..., [-0.2357, 0.2006, 0.0345, ..., 0.2661, -0.0729, -0.1765], [-0.2296, -0.2211, 0.2564, ..., -0.1964, -0.1666, 0.1739], [ 0.0150, -0.4077, 0.2124, ..., 0.0570, -0.2090, -0.2025]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 6.9849e-10], [ 0.0000e+00, 1.6298e-09, 1.1642e-09, ..., 2.0955e-09, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -1.3970e-09, -6.9849e-10, ..., -1.6298e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 1.1642e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 436, bias, value: tensor([-0.0196, -0.0336, -0.0316, -0.0275, -0.0055, 0.0112, 0.0108, -0.0060, -0.0167, -0.0187], device='cuda:0'), grad: tensor([ 2.3283e-09, 6.2864e-09, -1.0477e-08, 6.5193e-09, -1.1642e-09, -7.5437e-08, 7.4739e-08, -3.2596e-09, 3.0268e-09, 3.7253e-09], device='cuda:0') 100 1e-05 changing lr epoch 435, time 220.66, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4283 re_mapping 0.0020 re_causal 0.0072 /// teacc 99.17 lr 0.00001000 Epoch 437, weight, value: tensor([[-0.2617, -0.3225, 0.1270, ..., -0.1667, 0.0467, 0.0471], [-0.1575, -0.0866, -0.1220, ..., -0.2641, -0.0728, -0.0252], [ 0.0076, -0.2071, -0.2772, ..., -0.2041, 0.0306, -0.4473], ..., [-0.2357, 0.2006, 0.0345, ..., 0.2661, -0.0729, -0.1765], [-0.2297, -0.2211, 0.2565, ..., -0.1964, -0.1666, 0.1739], [ 0.0150, -0.4078, 0.2124, ..., 0.0570, -0.2090, -0.2026]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -2.3283e-10], [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 0.0000e+00, 0.0000e+00, 3.9581e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, -7.4506e-09, ..., 0.0000e+00, 0.0000e+00, -3.9581e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 437, bias, value: tensor([-0.0196, -0.0336, -0.0316, -0.0275, -0.0055, 0.0112, 0.0108, -0.0060, -0.0167, -0.0187], device='cuda:0'), grad: tensor([ 0.0000e+00, -3.7253e-09, 1.5134e-08, 4.6566e-10, 6.9849e-10, 2.7940e-09, 1.1642e-09, 3.4925e-09, -1.4901e-08, 6.9849e-10], device='cuda:0') 100 1e-05 changing lr epoch 436, time 220.66, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4060 re_mapping 0.0020 re_causal 0.0069 /// teacc 99.17 lr 0.00001000 Epoch 438, weight, value: tensor([[-0.2617, -0.3225, 0.1270, ..., -0.1667, 0.0467, 0.0471], [-0.1575, -0.0868, -0.1221, ..., -0.2644, -0.0728, -0.0251], [ 0.0077, -0.2071, -0.2773, ..., -0.2041, 0.0306, -0.4474], ..., [-0.2357, 0.2008, 0.0346, ..., 0.2663, -0.0729, -0.1765], [-0.2297, -0.2211, 0.2566, ..., -0.1965, -0.1666, 0.1739], [ 0.0150, -0.4080, 0.2125, ..., 0.0570, -0.2090, -0.2027]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 4.6566e-10, 4.6566e-10, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, -1.3970e-09, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, -2.0955e-09], [ 0.0000e+00, -2.3283e-09, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 2.3283e-10], ..., [ 0.0000e+00, 2.7940e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 2.3283e-10, 0.0000e+00, -2.7940e-09, ..., 2.3283e-10, 0.0000e+00, -7.6834e-09], [ 2.3283e-10, 4.6566e-10, -1.6298e-09, ..., -9.3132e-10, 0.0000e+00, 2.3283e-10]], device='cuda:0') Epoch 438, bias, value: tensor([-0.0196, -0.0337, -0.0315, -0.0275, -0.0054, 0.0112, 0.0108, -0.0059, -0.0168, -0.0187], device='cuda:0'), grad: tensor([ 4.8894e-09, -1.7928e-08, -3.5157e-08, 1.3039e-08, 3.7253e-09, -2.3283e-10, 3.1898e-08, 3.7486e-08, -3.0268e-08, 1.3970e-09], device='cuda:0') 100 1e-05 changing lr epoch 437, time 220.97, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4003 re_mapping 0.0020 re_causal 0.0069 /// teacc 99.18 lr 0.00001000 Epoch 439, weight, value: tensor([[-0.2618, -0.3225, 0.1270, ..., -0.1667, 0.0467, 0.0471], [-0.1576, -0.0869, -0.1221, ..., -0.2645, -0.0728, -0.0251], [ 0.0076, -0.2071, -0.2773, ..., -0.2041, 0.0306, -0.4475], ..., [-0.2357, 0.2009, 0.0346, ..., 0.2664, -0.0729, -0.1765], [-0.2298, -0.2212, 0.2566, ..., -0.1965, -0.1666, 0.1739], [ 0.0150, -0.4080, 0.2125, ..., 0.0571, -0.2090, -0.2027]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 2.3283e-10, -3.2596e-09, ..., 7.4506e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 6.9849e-10, 2.3283e-10, ..., 4.6566e-10, 0.0000e+00, 2.3283e-10], ..., [ 2.3283e-10, -3.9581e-09, 2.7940e-09, ..., -9.5461e-09, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-1.0710e-08, 2.3283e-09, -1.8859e-08, ..., -2.2119e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 439, bias, value: tensor([-0.0196, -0.0337, -0.0315, -0.0275, -0.0055, 0.0112, 0.0107, -0.0059, -0.0168, -0.0187], device='cuda:0'), grad: tensor([ 1.6298e-09, -5.1921e-08, 3.0268e-09, 3.7253e-09, 7.3807e-08, 8.1491e-09, 4.6566e-10, 4.9593e-08, 1.3970e-09, -8.0094e-08], device='cuda:0') 100 1e-05 changing lr epoch 438, time 220.34, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4363 re_mapping 0.0020 re_causal 0.0072 /// teacc 99.18 lr 0.00001000 Epoch 440, weight, value: tensor([[-0.2619, -0.3225, 0.1271, ..., -0.1667, 0.0467, 0.0471], [-0.1576, -0.0869, -0.1221, ..., -0.2646, -0.0728, -0.0251], [ 0.0074, -0.2072, -0.2773, ..., -0.2041, 0.0306, -0.4477], ..., [-0.2357, 0.2009, 0.0346, ..., 0.2665, -0.0729, -0.1765], [-0.2299, -0.2212, 0.2566, ..., -0.1965, -0.1666, 0.1738], [ 0.0150, -0.4081, 0.2125, ..., 0.0571, -0.2090, -0.2027]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 6.9849e-10, 0.0000e+00, -2.5611e-09], [ 0.0000e+00, -2.6310e-08, 0.0000e+00, ..., -1.8161e-08, 0.0000e+00, 2.3283e-10], ..., [ 6.9849e-10, 2.5844e-08, 1.1642e-09, ..., 1.8161e-08, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 2.3283e-10, 1.1642e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-3.0268e-09, 0.0000e+00, -5.1223e-09, ..., -1.6298e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 440, bias, value: tensor([-0.0196, -0.0337, -0.0315, -0.0275, -0.0055, 0.0113, 0.0107, -0.0059, -0.0168, -0.0187], device='cuda:0'), grad: tensor([ 2.0955e-09, -1.0245e-08, -1.7462e-07, 6.9849e-10, 2.5611e-09, 5.8208e-09, 1.6997e-08, 1.7462e-07, 4.1910e-09, -1.3504e-08], device='cuda:0') 100 1e-05 changing lr epoch 439, time 220.84, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4151 re_mapping 0.0020 re_causal 0.0069 /// teacc 99.19 lr 0.00001000 Epoch 441, weight, value: tensor([[-0.2619, -0.3226, 0.1271, ..., -0.1667, 0.0467, 0.0471], [-0.1576, -0.0869, -0.1221, ..., -0.2646, -0.0728, -0.0251], [ 0.0074, -0.2072, -0.2774, ..., -0.2042, 0.0306, -0.4478], ..., [-0.2358, 0.2010, 0.0346, ..., 0.2666, -0.0729, -0.1765], [-0.2300, -0.2212, 0.2566, ..., -0.1965, -0.1666, 0.1738], [ 0.0150, -0.4082, 0.2126, ..., 0.0571, -0.2090, -0.2028]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 4.6566e-10, 4.6566e-10, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, 2.3283e-10], [ 2.3283e-10, 9.3132e-10, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 2.3283e-10, 6.9849e-10, 1.8626e-09, ..., 9.3132e-10, 0.0000e+00, 2.0955e-09], [ 6.9849e-10, 4.6566e-10, -2.3283e-09, ..., -1.1642e-09, 0.0000e+00, -2.5611e-09], [ 1.3970e-09, 1.1642e-09, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 1.1642e-09]], device='cuda:0') Epoch 441, bias, value: tensor([-0.0196, -0.0337, -0.0314, -0.0276, -0.0055, 0.0113, 0.0106, -0.0059, -0.0169, -0.0187], device='cuda:0'), grad: tensor([ 2.5611e-09, 3.9581e-09, 3.2596e-09, -1.2107e-08, 2.5611e-09, -2.3283e-10, -1.1176e-08, 9.3132e-09, -3.9581e-09, 7.4506e-09], device='cuda:0') 100 1e-05 changing lr epoch 440, time 220.74, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4120 re_mapping 0.0019 re_causal 0.0069 /// teacc 99.16 lr 0.00001000 Epoch 442, weight, value: tensor([[-0.2619, -0.3226, 0.1271, ..., -0.1667, 0.0467, 0.0471], [-0.1577, -0.0870, -0.1222, ..., -0.2648, -0.0728, -0.0251], [ 0.0074, -0.2072, -0.2774, ..., -0.2042, 0.0306, -0.4478], ..., [-0.2358, 0.2011, 0.0346, ..., 0.2667, -0.0729, -0.1765], [-0.2301, -0.2212, 0.2566, ..., -0.1965, -0.1666, 0.1738], [ 0.0150, -0.4082, 0.2126, ..., 0.0571, -0.2090, -0.2028]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -6.9849e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 6.9849e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [-1.3970e-09, 0.0000e+00, -1.8626e-09, ..., -1.3970e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 442, bias, value: tensor([-0.0196, -0.0337, -0.0314, -0.0276, -0.0056, 0.0113, 0.0106, -0.0059, -0.0169, -0.0186], device='cuda:0'), grad: tensor([ 6.9849e-10, 1.3970e-09, -3.4925e-09, 3.7951e-08, 6.7521e-09, -4.7265e-08, 1.1176e-08, 2.3283e-09, 2.3283e-09, -6.7521e-09], device='cuda:0') 100 1e-05 changing lr epoch 441, time 220.58, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4422 re_mapping 0.0020 re_causal 0.0071 /// teacc 99.18 lr 0.00001000 Epoch 443, weight, value: tensor([[-0.2619, -0.3226, 0.1272, ..., -0.1667, 0.0467, 0.0471], [-0.1578, -0.0871, -0.1222, ..., -0.2649, -0.0728, -0.0252], [ 0.0074, -0.2072, -0.2774, ..., -0.2042, 0.0306, -0.4478], ..., [-0.2358, 0.2011, 0.0347, ..., 0.2667, -0.0729, -0.1766], [-0.2302, -0.2213, 0.2566, ..., -0.1966, -0.1666, 0.1738], [ 0.0150, -0.4082, 0.2127, ..., 0.0572, -0.2090, -0.2028]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], [ 0.0000e+00, 1.1642e-10, 4.6566e-10, ..., 1.1642e-10, 0.0000e+00, 6.9849e-10], [ 0.0000e+00, 1.1642e-10, 1.1642e-10, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], ..., [ 0.0000e+00, 1.1642e-10, 5.8208e-10, ..., 2.3283e-10, 0.0000e+00, 1.0477e-09], [ 0.0000e+00, 1.1642e-10, -2.5611e-09, ..., 0.0000e+00, 0.0000e+00, -4.0745e-09], [ 0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 1.1525e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 443, bias, value: tensor([-0.0196, -0.0338, -0.0313, -0.0276, -0.0056, 0.0113, 0.0106, -0.0059, -0.0170, -0.0186], device='cuda:0'), grad: tensor([ 5.8208e-10, 3.4925e-09, -1.4086e-08, -4.3074e-09, -3.2480e-08, 2.1886e-08, 3.2596e-09, 5.1223e-09, -1.4552e-08, 3.3411e-08], device='cuda:0') 100 1e-05 changing lr epoch 442, time 220.64, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4424 re_mapping 0.0020 re_causal 0.0071 /// teacc 99.17 lr 0.00001000 Epoch 444, weight, value: tensor([[-0.2620, -0.3226, 0.1271, ..., -0.1668, 0.0467, 0.0471], [-0.1578, -0.0872, -0.1223, ..., -0.2650, -0.0728, -0.0252], [ 0.0074, -0.2072, -0.2775, ..., -0.2042, 0.0306, -0.4478], ..., [-0.2358, 0.2012, 0.0347, ..., 0.2668, -0.0729, -0.1766], [-0.2303, -0.2213, 0.2565, ..., -0.1966, -0.1666, 0.1738], [ 0.0151, -0.4083, 0.2128, ..., 0.0572, -0.2090, -0.2028]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, 8.1491e-10, ..., 4.6566e-10, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, 2.3283e-10, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 6.9849e-10, 0.0000e+00, 2.9104e-09, ..., 1.7462e-09, 0.0000e+00, 9.3132e-10], [ 3.9581e-09, 0.0000e+00, -4.5402e-09, ..., -2.6776e-09, 0.0000e+00, 1.6298e-09]], device='cuda:0') Epoch 444, bias, value: tensor([-0.0197, -0.0338, -0.0313, -0.0276, -0.0056, 0.0113, 0.0106, -0.0059, -0.0170, -0.0185], device='cuda:0'), grad: tensor([ 2.2119e-09, 5.8208e-10, 1.1642e-10, 1.7462e-09, 4.4238e-09, -1.0245e-08, 1.9791e-09, 4.6566e-10, 7.5670e-09, -3.7253e-09], device='cuda:0') 100 1e-05 changing lr epoch 443, time 220.45, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4196 re_mapping 0.0020 re_causal 0.0070 /// teacc 99.22 lr 0.00001000 Epoch 445, weight, value: tensor([[-0.2620, -0.3227, 0.1271, ..., -0.1668, 0.0467, 0.0471], [-0.1578, -0.0873, -0.1223, ..., -0.2652, -0.0728, -0.0252], [ 0.0074, -0.2072, -0.2775, ..., -0.2042, 0.0306, -0.4479], ..., [-0.2358, 0.2013, 0.0348, ..., 0.2669, -0.0729, -0.1766], [-0.2303, -0.2213, 0.2566, ..., -0.1967, -0.1666, 0.1738], [ 0.0151, -0.4083, 0.2129, ..., 0.0573, -0.2090, -0.2029]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 4.6566e-10, 0.0000e+00, ..., 3.4925e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], ..., [ 1.1642e-10, 2.3283e-10, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 1.1642e-10, -3.4925e-10, ..., 0.0000e+00, 0.0000e+00, -8.1491e-10], [ 9.3132e-10, 2.3283e-10, -2.7940e-09, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 445, bias, value: tensor([-0.0197, -0.0339, -0.0312, -0.0276, -0.0057, 0.0113, 0.0106, -0.0059, -0.0170, -0.0185], device='cuda:0'), grad: tensor([ 1.5134e-09, 3.4925e-09, 3.0268e-09, -5.0059e-09, -2.5495e-08, 6.8685e-09, 4.6566e-09, 2.2119e-09, -1.3970e-09, 9.4296e-09], device='cuda:0') 100 1e-05 changing lr epoch 444, time 220.32, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4099 re_mapping 0.0019 re_causal 0.0070 /// teacc 99.19 lr 0.00001000 Epoch 446, weight, value: tensor([[-0.2620, -0.3227, 0.1272, ..., -0.1668, 0.0467, 0.0471], [-0.1578, -0.0873, -0.1224, ..., -0.2652, -0.0728, -0.0252], [ 0.0074, -0.2072, -0.2775, ..., -0.2042, 0.0305, -0.4479], ..., [-0.2359, 0.2013, 0.0348, ..., 0.2670, -0.0729, -0.1766], [-0.2304, -0.2213, 0.2566, ..., -0.1967, -0.1666, 0.1738], [ 0.0151, -0.4084, 0.2129, ..., 0.0573, -0.2090, -0.2030]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.4925e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 8.1491e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -1.1642e-09, 0.0000e+00, ..., -1.7462e-09, 0.0000e+00, 0.0000e+00], [ 8.1491e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 2.3283e-10, 3.4925e-10, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 1.1642e-10]], device='cuda:0') Epoch 446, bias, value: tensor([-0.0197, -0.0339, -0.0312, -0.0276, -0.0057, 0.0113, 0.0106, -0.0059, -0.0170, -0.0185], device='cuda:0'), grad: tensor([ 1.2806e-09, 1.8626e-09, 2.2119e-09, 3.6089e-09, -4.7730e-09, -2.5611e-09, -1.5134e-09, -2.7940e-09, 2.0955e-09, 8.3819e-09], device='cuda:0') 100 1e-05 changing lr epoch 445, time 220.54, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4225 re_mapping 0.0020 re_causal 0.0071 /// teacc 99.21 lr 0.00001000 Epoch 447, weight, value: tensor([[-0.2620, -0.3227, 0.1272, ..., -0.1668, 0.0467, 0.0472], [-0.1579, -0.0873, -0.1224, ..., -0.2653, -0.0728, -0.0253], [ 0.0074, -0.2073, -0.2776, ..., -0.2042, 0.0305, -0.4480], ..., [-0.2359, 0.2014, 0.0348, ..., 0.2670, -0.0729, -0.1766], [-0.2305, -0.2213, 0.2566, ..., -0.1967, -0.1666, 0.1738], [ 0.0151, -0.4084, 0.2129, ..., 0.0573, -0.2090, -0.2030]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.1642e-10, 2.3283e-10, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 1.1642e-10], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], ..., [ 1.1642e-10, 4.6566e-10, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 0.0000e+00, 3.4925e-10, ..., 4.6566e-10, 0.0000e+00, 1.1642e-10], [-6.9849e-10, 0.0000e+00, -1.6298e-09, ..., -2.2119e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 447, bias, value: tensor([-0.0196, -0.0339, -0.0312, -0.0276, -0.0056, 0.0113, 0.0105, -0.0059, -0.0171, -0.0185], device='cuda:0'), grad: tensor([ 4.6566e-10, 1.5134e-09, 1.0477e-09, 2.0955e-09, 8.0327e-09, -8.1491e-10, -1.6298e-09, 2.9104e-09, 2.3283e-09, -6.7521e-09], device='cuda:0') 100 1e-05 changing lr epoch 446, time 220.14, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4263 re_mapping 0.0019 re_causal 0.0070 /// teacc 99.18 lr 0.00001000 Epoch 448, weight, value: tensor([[-0.2621, -0.3227, 0.1273, ..., -0.1668, 0.0467, 0.0472], [-0.1579, -0.0873, -0.1224, ..., -0.2654, -0.0728, -0.0252], [ 0.0074, -0.2073, -0.2776, ..., -0.2042, 0.0305, -0.4481], ..., [-0.2359, 0.2014, 0.0348, ..., 0.2670, -0.0729, -0.1766], [-0.2306, -0.2213, 0.2567, ..., -0.1967, -0.1666, 0.1738], [ 0.0151, -0.4085, 0.2130, ..., 0.0573, -0.2090, -0.2031]], device='cuda:0'), grad: tensor([[ 3.4925e-10, 0.0000e+00, 3.4925e-10, ..., 1.1642e-10, 0.0000e+00, 2.3283e-10], [ 2.3283e-10, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], ..., [ 5.8208e-10, 1.1642e-10, 1.8626e-09, ..., 8.1491e-10, 0.0000e+00, 1.1642e-10], [ 3.9581e-09, 1.1642e-10, 1.5134e-09, ..., 1.1642e-09, 0.0000e+00, 2.2119e-09], [ 1.1409e-08, 0.0000e+00, -5.0059e-09, ..., -3.0268e-09, 0.0000e+00, 8.6147e-09]], device='cuda:0') Epoch 448, bias, value: tensor([-0.0196, -0.0339, -0.0311, -0.0276, -0.0057, 0.0113, 0.0105, -0.0059, -0.0171, -0.0185], device='cuda:0'), grad: tensor([ 1.3970e-09, 2.9104e-09, 4.6566e-10, 8.0559e-08, 5.8208e-09, -1.1746e-07, 9.0804e-09, 4.6566e-09, 1.1292e-08, 1.5250e-08], device='cuda:0') 100 1e-05 changing lr epoch 447, time 220.69, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.3927 re_mapping 0.0020 re_causal 0.0067 /// teacc 99.21 lr 0.00001000 Epoch 449, weight, value: tensor([[-0.2621, -0.3226, 0.1273, ..., -0.1668, 0.0467, 0.0472], [-0.1580, -0.0874, -0.1224, ..., -0.2655, -0.0728, -0.0252], [ 0.0074, -0.2073, -0.2776, ..., -0.2042, 0.0305, -0.4481], ..., [-0.2359, 0.2015, 0.0348, ..., 0.2671, -0.0729, -0.1766], [-0.2306, -0.2214, 0.2567, ..., -0.1967, -0.1666, 0.1738], [ 0.0151, -0.4085, 0.2131, ..., 0.0573, -0.2090, -0.2031]], device='cuda:0'), grad: tensor([[1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], [1.1642e-10, 1.1642e-10, 1.1642e-10, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], [1.1642e-10, 1.1642e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [2.3283e-10, 1.1642e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], [3.4925e-09, 1.1642e-10, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 1.9791e-09], [4.0745e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-09]], device='cuda:0') Epoch 449, bias, value: tensor([-0.0196, -0.0340, -0.0311, -0.0276, -0.0057, 0.0113, 0.0105, -0.0059, -0.0171, -0.0185], device='cuda:0'), grad: tensor([ 6.0536e-09, 1.9791e-09, 1.1642e-09, 1.5832e-08, -2.7940e-09, -2.9569e-08, -3.2596e-09, 1.3970e-09, 8.7311e-09, 9.4296e-09], device='cuda:0') 100 1e-05 changing lr epoch 448, time 220.50, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4387 re_mapping 0.0020 re_causal 0.0072 /// teacc 99.20 lr 0.00001000 Epoch 450, weight, value: tensor([[-0.2621, -0.3227, 0.1273, ..., -0.1668, 0.0467, 0.0472], [-0.1580, -0.0875, -0.1225, ..., -0.2655, -0.0728, -0.0252], [ 0.0073, -0.2074, -0.2777, ..., -0.2043, 0.0305, -0.4482], ..., [-0.2360, 0.2016, 0.0348, ..., 0.2672, -0.0729, -0.1766], [-0.2307, -0.2214, 0.2567, ..., -0.1967, -0.1666, 0.1738], [ 0.0151, -0.4085, 0.2131, ..., 0.0573, -0.2090, -0.2032]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 3.4925e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 450, bias, value: tensor([-0.0196, -0.0340, -0.0311, -0.0276, -0.0057, 0.0113, 0.0105, -0.0059, -0.0171, -0.0185], device='cuda:0'), grad: tensor([ 2.3283e-10, 1.2806e-09, -1.1642e-09, -7.4506e-09, 1.1642e-10, 2.4447e-09, 1.1642e-10, 1.8626e-09, 1.1642e-09, 8.1491e-10], device='cuda:0') 100 1e-05 changing lr epoch 449, time 220.58, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4178 re_mapping 0.0020 re_causal 0.0070 /// teacc 99.19 lr 0.00001000 Epoch 451, weight, value: tensor([[-0.2621, -0.3227, 0.1274, ..., -0.1668, 0.0467, 0.0472], [-0.1580, -0.0875, -0.1225, ..., -0.2656, -0.0728, -0.0252], [ 0.0073, -0.2074, -0.2777, ..., -0.2043, 0.0305, -0.4482], ..., [-0.2360, 0.2016, 0.0349, ..., 0.2673, -0.0729, -0.1766], [-0.2307, -0.2214, 0.2567, ..., -0.1968, -0.1666, 0.1738], [ 0.0152, -0.4086, 0.2132, ..., 0.0573, -0.2090, -0.2032]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], [1.7462e-09, 0.0000e+00, 0.0000e+00, ..., 6.4028e-09, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 3.4925e-10, 0.0000e+00, 0.0000e+00], [2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], [2.5611e-09, 0.0000e+00, 0.0000e+00, ..., 9.4296e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 451, bias, value: tensor([-0.0195, -0.0341, -0.0311, -0.0276, -0.0056, 0.0114, 0.0105, -0.0059, -0.0171, -0.0184], device='cuda:0'), grad: tensor([ 9.1968e-09, 4.5053e-08, -1.1059e-08, 1.2806e-09, -1.1432e-07, 1.2806e-09, 4.0745e-09, 6.7521e-09, 4.6566e-10, 6.6822e-08], device='cuda:0') 100 1e-05 changing lr epoch 450, time 220.50, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4171 re_mapping 0.0019 re_causal 0.0071 /// teacc 99.21 lr 0.00001000 Epoch 452, weight, value: tensor([[-0.2621, -0.3227, 0.1274, ..., -0.1669, 0.0467, 0.0473], [-0.1580, -0.0875, -0.1225, ..., -0.2657, -0.0728, -0.0252], [ 0.0073, -0.2074, -0.2778, ..., -0.2043, 0.0305, -0.4482], ..., [-0.2360, 0.2016, 0.0349, ..., 0.2673, -0.0729, -0.1767], [-0.2308, -0.2214, 0.2567, ..., -0.1968, -0.1666, 0.1739], [ 0.0152, -0.4086, 0.2133, ..., 0.0574, -0.2090, -0.2033]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.1642e-10, -5.2387e-09, 1.1642e-10, ..., 3.4925e-10, 0.0000e+00, -1.9791e-09], [ 0.0000e+00, 1.1642e-10, 3.4925e-10, ..., 1.1642e-10, 0.0000e+00, 2.3283e-10], ..., [ 1.1642e-10, 1.1642e-09, -2.6776e-09, ..., -3.8417e-09, 0.0000e+00, 1.3970e-09], [ 1.0012e-08, 1.1642e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 7.4506e-09], [ 9.3132e-10, 3.9581e-09, 3.1432e-09, ..., 4.6566e-09, 0.0000e+00, 8.1491e-10]], device='cuda:0') Epoch 452, bias, value: tensor([-0.0195, -0.0340, -0.0310, -0.0276, -0.0056, 0.0113, 0.0105, -0.0060, -0.0172, -0.0184], device='cuda:0'), grad: tensor([ 1.0477e-09, -2.8638e-08, 1.5134e-09, 5.9372e-09, 3.7253e-09, -2.6193e-08, 4.7730e-09, 1.5716e-08, 1.6298e-08, 1.2922e-08], device='cuda:0') 100 1e-05 changing lr epoch 451, time 220.47, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4111 re_mapping 0.0019 re_causal 0.0070 /// teacc 99.20 lr 0.00001000 Epoch 453, weight, value: tensor([[-0.2621, -0.3227, 0.1275, ..., -0.1669, 0.0467, 0.0473], [-0.1581, -0.0876, -0.1226, ..., -0.2658, -0.0728, -0.0252], [ 0.0073, -0.2075, -0.2778, ..., -0.2044, 0.0305, -0.4483], ..., [-0.2360, 0.2017, 0.0349, ..., 0.2674, -0.0729, -0.1767], [-0.2309, -0.2214, 0.2567, ..., -0.1968, -0.1666, 0.1738], [ 0.0152, -0.4087, 0.2134, ..., 0.0574, -0.2090, -0.2033]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.2119e-09, ..., 0.0000e+00, 0.0000e+00, 3.2596e-09], [ 0.0000e+00, 0.0000e+00, 2.3283e-10, ..., 1.1642e-10, 0.0000e+00, 3.4925e-10], [ 0.0000e+00, 1.1642e-10, 1.1642e-10, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.8208e-10, 0.0000e+00, 0.0000e+00], [ 1.1642e-10, 0.0000e+00, -1.1642e-09, ..., 1.1642e-10, 0.0000e+00, -2.6776e-09], [ 1.1642e-10, 0.0000e+00, 5.8208e-10, ..., 3.4925e-10, 0.0000e+00, 1.1642e-09]], device='cuda:0') Epoch 453, bias, value: tensor([-0.0195, -0.0341, -0.0310, -0.0276, -0.0056, 0.0114, 0.0105, -0.0060, -0.0172, -0.0184], device='cuda:0'), grad: tensor([ 2.5728e-08, 3.7253e-09, -1.3737e-08, 5.9372e-09, -2.4447e-09, 1.6764e-08, -4.5518e-08, 4.1910e-09, -2.9104e-09, 1.4319e-08], device='cuda:0') 100 1e-05 changing lr epoch 452, time 220.39, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4163 re_mapping 0.0019 re_causal 0.0070 /// teacc 99.18 lr 0.00001000 Epoch 454, weight, value: tensor([[-0.2621, -0.3227, 0.1275, ..., -0.1669, 0.0467, 0.0473], [-0.1581, -0.0876, -0.1226, ..., -0.2659, -0.0728, -0.0252], [ 0.0073, -0.2075, -0.2779, ..., -0.2044, 0.0305, -0.4483], ..., [-0.2360, 0.2018, 0.0349, ..., 0.2675, -0.0729, -0.1767], [-0.2309, -0.2214, 0.2567, ..., -0.1968, -0.1666, 0.1738], [ 0.0152, -0.4087, 0.2134, ..., 0.0574, -0.2090, -0.2034]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-4.6566e-10, 0.0000e+00, 1.1642e-10, ..., 0.0000e+00, 0.0000e+00, -1.3970e-09], [ 0.0000e+00, 0.0000e+00, 1.1642e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.1642e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.1642e-10, ..., 8.1491e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 454, bias, value: tensor([-0.0195, -0.0341, -0.0309, -0.0276, -0.0057, 0.0114, 0.0104, -0.0060, -0.0173, -0.0184], device='cuda:0'), grad: tensor([ 1.5134e-09, -6.5193e-09, 1.1642e-09, 2.3283e-10, -1.0477e-09, 1.9791e-09, 7.3342e-09, 5.8208e-10, 6.9849e-10, 3.9581e-09], device='cuda:0') 100 1e-05 changing lr epoch 453, time 220.47, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4152 re_mapping 0.0019 re_causal 0.0068 /// teacc 99.19 lr 0.00001000 Epoch 455, weight, value: tensor([[-0.2621, -0.3227, 0.1276, ..., -0.1669, 0.0467, 0.0473], [-0.1581, -0.0877, -0.1226, ..., -0.2660, -0.0728, -0.0252], [ 0.0073, -0.2075, -0.2779, ..., -0.2044, 0.0305, -0.4483], ..., [-0.2361, 0.2018, 0.0349, ..., 0.2676, -0.0729, -0.1767], [-0.2310, -0.2214, 0.2567, ..., -0.1968, -0.1666, 0.1739], [ 0.0153, -0.4088, 0.2135, ..., 0.0575, -0.2090, -0.2034]], device='cuda:0'), grad: tensor([[ 1.1642e-10, 1.1642e-10, -1.8626e-09, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 2.4447e-09, 2.5611e-09, ..., 3.3760e-09, 0.0000e+00, 1.8626e-09], [ 1.1642e-10, 5.8208e-10, 4.1910e-09, ..., 5.8208e-10, 0.0000e+00, 6.8685e-09], ..., [ 6.9849e-10, -5.1223e-09, 5.8208e-10, ..., -1.9791e-09, 0.0000e+00, 3.4925e-10], [ 6.4028e-09, 0.0000e+00, -5.8208e-09, ..., 2.3283e-10, 0.0000e+00, -7.9162e-09], [-2.0955e-09, 2.3283e-10, -8.7311e-09, ..., -1.3737e-08, 0.0000e+00, 6.9849e-10]], device='cuda:0') Epoch 455, bias, value: tensor([-0.0194, -0.0341, -0.0310, -0.0276, -0.0057, 0.0114, 0.0104, -0.0060, -0.0173, -0.0183], device='cuda:0'), grad: tensor([-4.3772e-08, 4.3889e-08, 3.9116e-08, 3.4692e-08, 1.6764e-08, -4.1095e-08, 1.9209e-08, -5.0059e-09, -2.0722e-08, -3.3877e-08], device='cuda:0') 100 1e-05 changing lr epoch 454, time 220.49, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4294 re_mapping 0.0019 re_causal 0.0072 /// teacc 99.19 lr 0.00001000 Epoch 456, weight, value: tensor([[-0.2622, -0.3227, 0.1276, ..., -0.1669, 0.0467, 0.0473], [-0.1581, -0.0877, -0.1227, ..., -0.2661, -0.0727, -0.0253], [ 0.0073, -0.2075, -0.2780, ..., -0.2044, 0.0305, -0.4484], ..., [-0.2361, 0.2018, 0.0350, ..., 0.2676, -0.0729, -0.1767], [-0.2311, -0.2215, 0.2568, ..., -0.1969, -0.1666, 0.1738], [ 0.0153, -0.4089, 0.2136, ..., 0.0575, -0.2090, -0.2034]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.1642e-10, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, 5.8208e-10], [-5.8208e-10, 8.1491e-10, 3.4925e-10, ..., 6.9849e-10, 0.0000e+00, -5.5879e-09], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], ..., [ 0.0000e+00, 1.1642e-10, 6.9849e-10, ..., 8.1491e-10, 0.0000e+00, 4.6566e-10], [ 3.4925e-10, 0.0000e+00, 1.1642e-10, ..., 1.1642e-10, 0.0000e+00, 3.2596e-09], [ 1.1642e-10, 8.1491e-10, -1.1874e-08, ..., -1.4435e-08, 0.0000e+00, 1.2806e-09]], device='cuda:0') Epoch 456, bias, value: tensor([-0.0194, -0.0341, -0.0309, -0.0276, -0.0057, 0.0114, 0.0104, -0.0061, -0.0173, -0.0183], device='cuda:0'), grad: tensor([ 3.7253e-09, -1.9441e-08, 1.6298e-09, -1.1642e-10, 4.8778e-08, 5.2387e-09, 9.3132e-10, 6.9849e-09, 1.3504e-08, -4.7032e-08], device='cuda:0') 100 1e-05 changing lr epoch 455, time 220.49, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4543 re_mapping 0.0019 re_causal 0.0073 /// teacc 99.19 lr 0.00001000 Epoch 457, weight, value: tensor([[-0.2622, -0.3227, 0.1277, ..., -0.1669, 0.0467, 0.0473], [-0.1582, -0.0878, -0.1228, ..., -0.2662, -0.0727, -0.0253], [ 0.0073, -0.2075, -0.2780, ..., -0.2044, 0.0305, -0.4485], ..., [-0.2361, 0.2019, 0.0350, ..., 0.2678, -0.0729, -0.1767], [-0.2311, -0.2215, 0.2568, ..., -0.1969, -0.1666, 0.1738], [ 0.0153, -0.4090, 0.2136, ..., 0.0574, -0.2090, -0.2035]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 6.9849e-10, 3.4925e-10, ..., 3.1432e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 2.0955e-09, ..., 2.3283e-10, 0.0000e+00, 1.5134e-09], ..., [ 0.0000e+00, 1.1642e-10, 5.8208e-10, ..., 1.6298e-09, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 8.1491e-10, -3.1432e-09, ..., 0.0000e+00, 0.0000e+00, -2.2119e-09], [ 0.0000e+00, 1.1642e-10, 1.1642e-10, ..., 8.1491e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 457, bias, value: tensor([-0.0194, -0.0343, -0.0308, -0.0276, -0.0057, 0.0114, 0.0106, -0.0061, -0.0174, -0.0184], device='cuda:0'), grad: tensor([ 3.4925e-10, 1.4086e-08, 6.1700e-09, -2.9104e-09, -3.7020e-08, 2.2119e-09, 1.5716e-08, 8.4983e-09, -4.3074e-09, 3.8417e-09], device='cuda:0') 100 1e-05 changing lr epoch 456, time 220.33, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4151 re_mapping 0.0019 re_causal 0.0072 /// teacc 99.19 lr 0.00001000 Epoch 458, weight, value: tensor([[-0.2622, -0.3227, 0.1277, ..., -0.1669, 0.0467, 0.0474], [-0.1582, -0.0879, -0.1228, ..., -0.2662, -0.0727, -0.0253], [ 0.0073, -0.2076, -0.2781, ..., -0.2045, 0.0305, -0.4485], ..., [-0.2361, 0.2020, 0.0350, ..., 0.2678, -0.0729, -0.1767], [-0.2312, -0.2215, 0.2568, ..., -0.1969, -0.1666, 0.1739], [ 0.0153, -0.4090, 0.2137, ..., 0.0574, -0.2090, -0.2035]], device='cuda:0'), grad: tensor([[ 1.1642e-10, 5.8208e-10, 1.1642e-10, ..., 2.3283e-10, 0.0000e+00, 1.1642e-10], [ 2.3283e-10, 3.2596e-09, 5.8208e-10, ..., 1.0477e-09, 0.0000e+00, 0.0000e+00], [-2.3283e-10, 2.4447e-09, 2.3283e-10, ..., 4.6566e-10, 0.0000e+00, -1.1642e-10], ..., [ 1.1642e-10, 2.2375e-07, 0.0000e+00, ..., 3.4808e-08, 0.0000e+00, 1.1642e-10], [ 5.8208e-10, 1.1642e-10, 4.6566e-10, ..., 6.9849e-10, 0.0000e+00, 1.1642e-10], [-6.0536e-09, 6.6357e-09, -5.8208e-09, ..., -5.5879e-09, 0.0000e+00, -1.6298e-09]], device='cuda:0') Epoch 458, bias, value: tensor([-0.0194, -0.0343, -0.0307, -0.0276, -0.0057, 0.0113, 0.0106, -0.0061, -0.0174, -0.0183], device='cuda:0'), grad: tensor([ 3.4925e-09, 9.6625e-09, -5.2387e-09, -4.5076e-07, 1.4668e-08, 2.2119e-08, 1.3970e-09, 4.1630e-07, 4.5402e-09, -1.3039e-08], device='cuda:0') 100 1e-05 changing lr epoch 457, time 220.56, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4032 re_mapping 0.0020 re_causal 0.0071 /// teacc 99.20 lr 0.00001000 Epoch 459, weight, value: tensor([[-0.2622, -0.3227, 0.1278, ..., -0.1670, 0.0467, 0.0474], [-0.1582, -0.0879, -0.1228, ..., -0.2664, -0.0727, -0.0253], [ 0.0073, -0.2076, -0.2781, ..., -0.2045, 0.0305, -0.4485], ..., [-0.2362, 0.2020, 0.0350, ..., 0.2679, -0.0729, -0.1768], [-0.2312, -0.2215, 0.2569, ..., -0.1969, -0.1666, 0.1739], [ 0.0154, -0.4091, 0.2138, ..., 0.0575, -0.2090, -0.2035]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -3.4925e-10, -7.6834e-09, ..., 1.1642e-10, 0.0000e+00, -9.3132e-10], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 3.4925e-10], [ 0.0000e+00, 1.1642e-10, 3.2596e-09, ..., 0.0000e+00, 0.0000e+00, 2.6776e-09], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], [ 1.1642e-10, 0.0000e+00, -5.8208e-10, ..., 1.1642e-10, 0.0000e+00, -3.9581e-09], [ 0.0000e+00, 0.0000e+00, -5.8208e-10, ..., -6.9849e-10, 0.0000e+00, 8.1491e-10]], device='cuda:0') Epoch 459, bias, value: tensor([-0.0193, -0.0343, -0.0306, -0.0276, -0.0057, 0.0113, 0.0106, -0.0062, -0.0174, -0.0183], device='cuda:0'), grad: tensor([-1.9441e-08, 2.2119e-09, 1.2806e-08, 7.7998e-09, 2.5611e-09, -5.8208e-10, 2.3283e-09, 3.1432e-09, -5.7044e-09, -2.3283e-10], device='cuda:0') 100 1e-05 changing lr epoch 458, time 220.77, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.3960 re_mapping 0.0019 re_causal 0.0068 /// teacc 99.19 lr 0.00001000 Epoch 460, weight, value: tensor([[-0.2622, -0.3228, 0.1279, ..., -0.1670, 0.0467, 0.0474], [-0.1582, -0.0879, -0.1228, ..., -0.2664, -0.0727, -0.0253], [ 0.0073, -0.2076, -0.2781, ..., -0.2045, 0.0305, -0.4486], ..., [-0.2362, 0.2020, 0.0350, ..., 0.2679, -0.0729, -0.1768], [-0.2312, -0.2215, 0.2569, ..., -0.1969, -0.1666, 0.1739], [ 0.0153, -0.4092, 0.2138, ..., 0.0575, -0.2090, -0.2036]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.0477e-09, ..., 3.4925e-10, 0.0000e+00, -1.8626e-09], [ 0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -3.2596e-09, 2.3283e-10, ..., -6.5193e-09, 0.0000e+00, 6.9849e-10], [ 1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.5134e-09, 2.3283e-10, ..., 3.1432e-09, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 460, bias, value: tensor([-0.0193, -0.0344, -0.0305, -0.0276, -0.0057, 0.0114, 0.0106, -0.0062, -0.0174, -0.0183], device='cuda:0'), grad: tensor([ 2.3283e-10, -6.8685e-09, 5.8208e-10, 3.4925e-10, 1.6298e-09, 5.0059e-09, 1.6298e-09, -7.5670e-09, 3.4925e-10, 7.2177e-09], device='cuda:0') 100 1e-05 changing lr epoch 459, time 220.39, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4120 re_mapping 0.0019 re_causal 0.0068 /// teacc 99.21 lr 0.00001000 Epoch 461, weight, value: tensor([[-0.2623, -0.3228, 0.1279, ..., -0.1670, 0.0467, 0.0475], [-0.1582, -0.0879, -0.1228, ..., -0.2664, -0.0727, -0.0253], [ 0.0073, -0.2076, -0.2782, ..., -0.2045, 0.0305, -0.4486], ..., [-0.2362, 0.2019, 0.0350, ..., 0.2679, -0.0729, -0.1768], [-0.2313, -0.2215, 0.2569, ..., -0.1969, -0.1666, 0.1739], [ 0.0153, -0.4092, 0.2139, ..., 0.0575, -0.2090, -0.2036]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 2.3283e-10, 9.3132e-10, ..., 8.1491e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 8.1491e-10, 3.4925e-10, ..., 8.1491e-10, 0.0000e+00, 0.0000e+00], [-3.3760e-09, -1.1642e-09, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], ..., [ 1.1642e-10, -5.0059e-09, -1.6298e-09, ..., -3.7253e-09, 0.0000e+00, 0.0000e+00], [ 1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 2.7940e-09, 1.8626e-09, -1.0477e-09, ..., 1.0827e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 461, bias, value: tensor([-0.0192, -0.0344, -0.0305, -0.0276, -0.0057, 0.0114, 0.0105, -0.0063, -0.0174, -0.0183], device='cuda:0'), grad: tensor([ 6.6357e-09, 3.0268e-09, -2.6659e-08, 7.4506e-09, -6.2981e-08, 1.7462e-09, 2.5611e-09, -8.4983e-09, 1.1642e-09, 7.8930e-08], device='cuda:0') 100 1e-05 changing lr epoch 460, time 220.64, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4124 re_mapping 0.0019 re_causal 0.0070 /// teacc 99.20 lr 0.00001000 Epoch 462, weight, value: tensor([[-0.2623, -0.3228, 0.1279, ..., -0.1670, 0.0467, 0.0475], [-0.1582, -0.0880, -0.1229, ..., -0.2665, -0.0727, -0.0253], [ 0.0073, -0.2076, -0.2782, ..., -0.2045, 0.0305, -0.4486], ..., [-0.2362, 0.2020, 0.0350, ..., 0.2680, -0.0729, -0.1768], [-0.2313, -0.2215, 0.2570, ..., -0.1969, -0.1666, 0.1740], [ 0.0153, -0.4093, 0.2140, ..., 0.0575, -0.2090, -0.2037]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -1.3388e-08, ..., 0.0000e+00, 0.0000e+00, -5.5879e-09], [ 0.0000e+00, 2.3283e-10, 1.1642e-10, ..., 3.4925e-10, 0.0000e+00, 1.1642e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [-1.5134e-09, 0.0000e+00, -4.5402e-09, ..., 0.0000e+00, 0.0000e+00, -4.7730e-09], [ 1.1642e-10, 1.1642e-10, 2.3283e-10, ..., 1.1642e-10, 0.0000e+00, 2.3283e-10]], device='cuda:0') Epoch 462, bias, value: tensor([-0.0192, -0.0344, -0.0302, -0.0277, -0.0057, 0.0114, 0.0105, -0.0063, -0.0175, -0.0183], device='cuda:0'), grad: tensor([-6.0769e-08, 2.7940e-09, 2.3283e-10, 6.9849e-10, -3.0268e-09, 2.4447e-09, 7.6485e-08, 3.0268e-09, -2.2352e-08, 2.2119e-09], device='cuda:0') 100 1e-05 changing lr epoch 461, time 220.19, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4267 re_mapping 0.0019 re_causal 0.0070 /// teacc 99.20 lr 0.00001000 Epoch 463, weight, value: tensor([[-0.2623, -0.3228, 0.1280, ..., -0.1670, 0.0467, 0.0475], [-0.1582, -0.0881, -0.1229, ..., -0.2667, -0.0727, -0.0253], [ 0.0073, -0.2076, -0.2783, ..., -0.2045, 0.0305, -0.4487], ..., [-0.2362, 0.2020, 0.0350, ..., 0.2681, -0.0729, -0.1768], [-0.2314, -0.2216, 0.2571, ..., -0.1970, -0.1666, 0.1740], [ 0.0153, -0.4093, 0.2140, ..., 0.0575, -0.2090, -0.2037]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 8.1491e-10, 2.3283e-10, ..., 9.3132e-10, 3.0268e-09, 0.0000e+00], [ 0.0000e+00, 2.2701e-09, 5.2387e-10, ..., 2.9104e-09, 0.0000e+00, 0.0000e+00], [ 5.8208e-11, 8.0909e-09, 2.9104e-10, ..., 1.1059e-09, 1.7462e-10, 0.0000e+00], ..., [ 0.0000e+00, -5.1223e-09, -1.3388e-09, ..., -5.4133e-09, 0.0000e+00, 0.0000e+00], [ 8.1491e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 5.8208e-11, 6.9849e-10], [ 5.8208e-11, 2.1537e-09, 6.4028e-10, ..., 3.2596e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 463, bias, value: tensor([-0.0192, -0.0345, -0.0302, -0.0277, -0.0057, 0.0114, 0.0105, -0.0064, -0.0175, -0.0183], device='cuda:0'), grad: tensor([ 1.1787e-07, 1.3446e-08, 3.3644e-08, -3.9523e-08, -6.1700e-09, 3.0443e-08, -1.4773e-07, -1.3446e-08, 2.7358e-09, 1.2049e-08], device='cuda:0') 100 1e-05 changing lr epoch 462, time 220.34, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4351 re_mapping 0.0019 re_causal 0.0072 /// teacc 99.20 lr 0.00001000 Epoch 464, weight, value: tensor([[-0.2623, -0.3228, 0.1281, ..., -0.1670, 0.0467, 0.0476], [-0.1583, -0.0881, -0.1229, ..., -0.2667, -0.0727, -0.0253], [ 0.0074, -0.2077, -0.2783, ..., -0.2045, 0.0305, -0.4487], ..., [-0.2363, 0.2021, 0.0350, ..., 0.2681, -0.0729, -0.1768], [-0.2314, -0.2216, 0.2571, ..., -0.1970, -0.1666, 0.1741], [ 0.0153, -0.4094, 0.2141, ..., 0.0575, -0.2090, -0.2038]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.4925e-10, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.4925e-10, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -2.0373e-09, 0.0000e+00, ..., -1.5134e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.9104e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.7462e-10, 0.0000e+00, ..., 5.8208e-11, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 464, bias, value: tensor([-0.0191, -0.0345, -0.0300, -0.0277, -0.0057, 0.0114, 0.0104, -0.0064, -0.0175, -0.0183], device='cuda:0'), grad: tensor([ 1.7462e-10, 8.7311e-10, 8.7311e-10, -3.3120e-08, 2.6193e-09, 3.5157e-08, 4.6566e-10, -3.2014e-09, 6.9849e-10, 4.6566e-10], device='cuda:0') 100 1e-05 changing lr epoch 463, time 220.41, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4037 re_mapping 0.0019 re_causal 0.0067 /// teacc 99.22 lr 0.00001000 Epoch 465, weight, value: tensor([[-0.2623, -0.3228, 0.1281, ..., -0.1670, 0.0467, 0.0476], [-0.1583, -0.0882, -0.1230, ..., -0.2669, -0.0727, -0.0253], [ 0.0074, -0.2077, -0.2783, ..., -0.2045, 0.0305, -0.4488], ..., [-0.2363, 0.2022, 0.0351, ..., 0.2682, -0.0729, -0.1768], [-0.2314, -0.2216, 0.2571, ..., -0.1970, -0.1666, 0.1741], [ 0.0153, -0.4094, 0.2141, ..., 0.0575, -0.2090, -0.2039]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.8208e-11, 0.0000e+00, 0.0000e+00], [ 5.8208e-11, 1.2224e-09, 2.9104e-10, ..., 1.1642e-09, 0.0000e+00, 5.8208e-11], [ 0.0000e+00, -1.4552e-09, 0.0000e+00, ..., -1.4435e-08, 0.0000e+00, -4.7148e-09], ..., [ 5.8208e-11, -4.0745e-10, -4.0745e-10, ..., 1.2456e-08, 0.0000e+00, 4.6566e-09], [ 5.8208e-10, 0.0000e+00, 0.0000e+00, ..., 5.8208e-11, 0.0000e+00, 4.0745e-10], [ 1.5134e-09, 0.0000e+00, 1.1642e-10, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 465, bias, value: tensor([-0.0191, -0.0346, -0.0299, -0.0277, -0.0057, 0.0114, 0.0104, -0.0064, -0.0175, -0.0184], device='cuda:0'), grad: tensor([ 5.2387e-10, 3.9581e-09, -6.1293e-08, 4.0163e-09, 1.0477e-09, -4.0745e-09, -2.0955e-09, 5.0291e-08, 4.1910e-09, 3.4925e-09], device='cuda:0') 100 1e-05 changing lr epoch 464, time 220.67, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4162 re_mapping 0.0019 re_causal 0.0069 /// teacc 99.21 lr 0.00001000 Epoch 466, weight, value: tensor([[-0.2623, -0.3228, 0.1281, ..., -0.1671, 0.0467, 0.0476], [-0.1583, -0.0883, -0.1231, ..., -0.2670, -0.0727, -0.0253], [ 0.0074, -0.2077, -0.2784, ..., -0.2046, 0.0305, -0.4488], ..., [-0.2363, 0.2023, 0.0351, ..., 0.2683, -0.0729, -0.1768], [-0.2315, -0.2216, 0.2572, ..., -0.1970, -0.1666, 0.1741], [ 0.0153, -0.4095, 0.2143, ..., 0.0576, -0.2090, -0.2039]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, -9.7207e-09, ..., 5.8208e-11, 0.0000e+00, -2.9104e-09], [ 2.3283e-10, 1.7462e-10, 7.5670e-09, ..., 2.3283e-10, 0.0000e+00, 1.9791e-09], [ 2.2119e-09, 5.8208e-11, 2.6193e-09, ..., 5.8208e-10, 0.0000e+00, 1.1642e-10], ..., [ 4.6566e-10, -1.1642e-09, 2.3283e-10, ..., -1.3970e-09, 0.0000e+00, 5.8208e-11], [ 9.3132e-10, 0.0000e+00, 3.4925e-10, ..., 1.1642e-10, 0.0000e+00, 6.4028e-10], [-9.0222e-09, 5.2387e-10, -8.3237e-09, ..., -2.0955e-09, 0.0000e+00, 3.4925e-10]], device='cuda:0') Epoch 466, bias, value: tensor([-0.0191, -0.0347, -0.0298, -0.0277, -0.0057, 0.0114, 0.0103, -0.0064, -0.0176, -0.0183], device='cuda:0'), grad: tensor([-2.4971e-08, 2.0082e-08, 1.0128e-08, 1.8685e-08, 8.9058e-09, 1.1642e-10, 5.4715e-09, -8.1491e-10, 3.7835e-09, -3.2713e-08], device='cuda:0') 100 1e-05 changing lr epoch 465, time 220.47, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4149 re_mapping 0.0019 re_causal 0.0069 /// teacc 99.20 lr 0.00001000 Epoch 467, weight, value: tensor([[-0.2623, -0.3228, 0.1282, ..., -0.1671, 0.0467, 0.0477], [-0.1584, -0.0884, -0.1231, ..., -0.2671, -0.0727, -0.0253], [ 0.0074, -0.2077, -0.2784, ..., -0.2045, 0.0305, -0.4488], ..., [-0.2363, 0.2023, 0.0350, ..., 0.2683, -0.0729, -0.1769], [-0.2316, -0.2216, 0.2573, ..., -0.1970, -0.1666, 0.1742], [ 0.0152, -0.4095, 0.2144, ..., 0.0576, -0.2090, -0.2040]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -1.2224e-09, ..., 5.8208e-11, 0.0000e+00, 5.8208e-11], [-5.8208e-11, 1.1642e-10, 1.1642e-10, ..., 3.4925e-10, 0.0000e+00, -6.4028e-10], [ 0.0000e+00, 0.0000e+00, 3.0850e-09, ..., -2.4447e-09, 0.0000e+00, 5.3551e-09], ..., [ 0.0000e+00, 2.9104e-10, 3.4925e-10, ..., 1.9209e-09, 0.0000e+00, 5.2387e-10], [ 0.0000e+00, 0.0000e+00, -4.3074e-09, ..., 0.0000e+00, 0.0000e+00, -7.5088e-09], [ 2.3283e-10, 5.8208e-11, 1.1642e-10, ..., 5.2387e-10, 0.0000e+00, 5.8208e-11]], device='cuda:0') Epoch 467, bias, value: tensor([-0.0190, -0.0347, -0.0296, -0.0277, -0.0057, 0.0114, 0.0101, -0.0065, -0.0175, -0.0183], device='cuda:0'), grad: tensor([-2.9104e-09, -2.3283e-10, -1.3039e-08, 6.0536e-09, -2.0373e-09, 2.8522e-09, 6.5193e-09, 2.4796e-08, -2.0082e-08, 3.4925e-09], device='cuda:0') 100 1e-05 changing lr epoch 466, time 220.11, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4157 re_mapping 0.0019 re_causal 0.0069 /// teacc 99.20 lr 0.00001000 Epoch 468, weight, value: tensor([[-0.2623, -0.3228, 0.1282, ..., -0.1671, 0.0467, 0.0477], [-0.1584, -0.0884, -0.1232, ..., -0.2671, -0.0727, -0.0253], [ 0.0074, -0.2077, -0.2785, ..., -0.2045, 0.0304, -0.4489], ..., [-0.2363, 0.2023, 0.0351, ..., 0.2684, -0.0729, -0.1769], [-0.2316, -0.2216, 0.2574, ..., -0.1970, -0.1666, 0.1743], [ 0.0152, -0.4095, 0.2144, ..., 0.0576, -0.2090, -0.2041]], device='cuda:0'), grad: tensor([[ 5.8208e-11, 0.0000e+00, 2.3283e-10, ..., 1.1642e-10, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 1.7462e-10, 1.1642e-10, ..., 5.8208e-11, 0.0000e+00, 1.1642e-10], [ 5.8208e-11, -1.7462e-10, 5.8208e-11, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], ..., [ 5.8208e-11, 2.3283e-10, 7.5670e-10, ..., 4.6566e-10, 0.0000e+00, 1.1642e-10], [-1.6880e-09, 1.1642e-10, -3.1432e-09, ..., 5.8208e-11, 0.0000e+00, -4.5984e-09], [ 1.2224e-09, 0.0000e+00, 7.5670e-10, ..., -9.8953e-10, 0.0000e+00, 2.9104e-09]], device='cuda:0') Epoch 468, bias, value: tensor([-0.0190, -0.0348, -0.0295, -0.0277, -0.0057, 0.0114, 0.0101, -0.0066, -0.0174, -0.0183], device='cuda:0'), grad: tensor([ 1.3970e-09, 4.9477e-09, -3.2596e-09, 2.7358e-09, 2.4447e-09, 3.7719e-08, -3.1025e-08, 5.5879e-09, -9.6625e-09, 3.0268e-09], device='cuda:0') 100 1e-05 changing lr epoch 467, time 220.04, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4119 re_mapping 0.0019 re_causal 0.0068 /// teacc 99.21 lr 0.00001000 Epoch 469, weight, value: tensor([[-0.2623, -0.3228, 0.1283, ..., -0.1672, 0.0467, 0.0478], [-0.1584, -0.0885, -0.1232, ..., -0.2672, -0.0727, -0.0253], [ 0.0075, -0.2077, -0.2785, ..., -0.2046, 0.0304, -0.4489], ..., [-0.2364, 0.2024, 0.0351, ..., 0.2685, -0.0729, -0.1769], [-0.2316, -0.2216, 0.2575, ..., -0.1970, -0.1666, 0.1743], [ 0.0152, -0.4096, 0.2145, ..., 0.0576, -0.2090, -0.2042]], device='cuda:0'), grad: tensor([[ 5.8208e-11, 0.0000e+00, 0.0000e+00, ..., 5.8208e-11, 0.0000e+00, 5.8208e-11], [ 5.8208e-11, -3.7835e-09, -6.5193e-09, ..., 0.0000e+00, 0.0000e+00, -1.1642e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 5.8208e-11], ..., [ 5.8208e-11, 3.5507e-09, 6.1700e-09, ..., 9.3132e-10, 0.0000e+00, 1.1001e-08], [ 2.9104e-10, 5.8208e-11, 1.1642e-10, ..., 0.0000e+00, 0.0000e+00, 1.6880e-09], [ 5.8208e-11, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 5.8208e-11]], device='cuda:0') Epoch 469, bias, value: tensor([-0.0190, -0.0348, -0.0294, -0.0277, -0.0057, 0.0115, 0.0101, -0.0066, -0.0174, -0.0184], device='cuda:0'), grad: tensor([ 1.1642e-09, -6.0827e-08, 1.3388e-09, 4.3656e-09, -9.0222e-09, 2.8522e-09, -1.0710e-08, 6.1700e-08, 1.7288e-08, 4.6566e-10], device='cuda:0') 100 1e-05 changing lr epoch 468, time 220.57, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4249 re_mapping 0.0019 re_causal 0.0069 /// teacc 99.21 lr 0.00001000 Epoch 470, weight, value: tensor([[-0.2623, -0.3228, 0.1283, ..., -0.1672, 0.0467, 0.0478], [-0.1584, -0.0885, -0.1232, ..., -0.2673, -0.0727, -0.0253], [ 0.0074, -0.2077, -0.2786, ..., -0.2046, 0.0304, -0.4490], ..., [-0.2364, 0.2024, 0.0351, ..., 0.2686, -0.0729, -0.1769], [-0.2316, -0.2216, 0.2575, ..., -0.1970, -0.1666, 0.1744], [ 0.0151, -0.4097, 0.2146, ..., 0.0577, -0.2090, -0.2043]], device='cuda:0'), grad: tensor([[ 5.8208e-11, 0.0000e+00, 0.0000e+00, ..., 5.8208e-11, 0.0000e+00, 1.1642e-10], [ 1.2806e-09, 7.7998e-09, 3.4925e-09, ..., 1.1292e-08, 1.1642e-10, 1.7462e-10], [ 5.8208e-11, 2.3283e-10, 1.1642e-10, ..., 2.3283e-10, 0.0000e+00, 1.7462e-10], ..., [ 1.7462e-10, -8.2655e-09, -3.5507e-09, ..., -6.9267e-09, 0.0000e+00, 0.0000e+00], [ 6.4028e-10, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.1642e-10, 8.7311e-10], [ 1.5716e-09, 1.1642e-10, 5.8208e-11, ..., 5.4133e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 470, bias, value: tensor([-0.0189, -0.0349, -0.0294, -0.0277, -0.0057, 0.0114, 0.0102, -0.0067, -0.0174, -0.0184], device='cuda:0'), grad: tensor([ 1.5134e-09, 4.3074e-08, 2.2701e-09, 1.9791e-09, -6.1525e-08, 1.1793e-07, -1.1799e-07, -1.7812e-08, 9.6625e-09, 2.7474e-08], device='cuda:0') 100 1e-05 changing lr epoch 469, time 220.27, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4142 re_mapping 0.0019 re_causal 0.0067 /// teacc 99.19 lr 0.00001000 Epoch 471, weight, value: tensor([[-0.2624, -0.3229, 0.1283, ..., -0.1672, 0.0467, 0.0479], [-0.1585, -0.0886, -0.1233, ..., -0.2674, -0.0727, -0.0253], [ 0.0075, -0.2078, -0.2786, ..., -0.2046, 0.0304, -0.4490], ..., [-0.2364, 0.2024, 0.0351, ..., 0.2686, -0.0729, -0.1770], [-0.2317, -0.2216, 0.2576, ..., -0.1970, -0.1666, 0.1745], [ 0.0152, -0.4097, 0.2147, ..., 0.0577, -0.2090, -0.2043]], device='cuda:0'), grad: tensor([[ 5.8208e-11, 0.0000e+00, -4.4238e-09, ..., 5.8208e-10, 0.0000e+00, -1.8626e-09], [ 1.1642e-10, 3.9581e-09, 1.1059e-09, ..., 3.4925e-09, 0.0000e+00, 5.8208e-11], [ 5.8208e-11, 0.0000e+00, 2.9104e-10, ..., 5.8208e-11, 0.0000e+00, 1.1642e-10], ..., [ 5.8208e-11, -4.4820e-09, -1.1059e-09, ..., -4.0163e-09, 0.0000e+00, 5.8208e-11], [ 4.0745e-10, 0.0000e+00, 4.8894e-09, ..., 0.0000e+00, 0.0000e+00, 2.4447e-09], [ 2.3283e-10, 5.8208e-11, 8.1491e-10, ..., 1.7462e-10, 0.0000e+00, 5.2387e-10]], device='cuda:0') Epoch 471, bias, value: tensor([-0.0189, -0.0350, -0.0292, -0.0277, -0.0057, 0.0114, 0.0102, -0.0067, -0.0174, -0.0183], device='cuda:0'), grad: tensor([-5.3551e-09, 1.0827e-08, 1.6880e-09, 7.9162e-09, -2.9686e-09, -6.4028e-09, 3.4343e-09, -9.4878e-09, 1.3446e-08, 3.2014e-09], device='cuda:0') 100 1e-05 changing lr epoch 470, time 220.83, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4075 re_mapping 0.0018 re_causal 0.0066 /// teacc 99.20 lr 0.00001000 Epoch 472, weight, value: tensor([[-0.2624, -0.3229, 0.1284, ..., -0.1672, 0.0467, 0.0479], [-0.1586, -0.0887, -0.1234, ..., -0.2675, -0.0727, -0.0254], [ 0.0075, -0.2078, -0.2787, ..., -0.2046, 0.0304, -0.4491], ..., [-0.2365, 0.2025, 0.0351, ..., 0.2687, -0.0729, -0.1770], [-0.2318, -0.2216, 0.2577, ..., -0.1971, -0.1666, 0.1745], [ 0.0152, -0.4097, 0.2149, ..., 0.0579, -0.2090, -0.2044]], device='cuda:0'), grad: tensor([[ 5.8208e-11, 5.2387e-10, -5.0641e-09, ..., 6.9849e-10, 0.0000e+00, -4.1327e-09], [ 1.7462e-10, 2.0373e-09, 4.4238e-09, ..., 1.4552e-09, 0.0000e+00, 4.0745e-09], [ 5.8208e-11, 2.6193e-09, 1.3970e-09, ..., 1.3970e-09, 0.0000e+00, 6.9849e-10], ..., [ 1.1642e-10, -5.6461e-09, -5.1805e-09, ..., -6.0536e-09, 0.0000e+00, 1.7462e-10], [ 4.3074e-09, 1.7462e-10, -4.8196e-08, ..., 0.0000e+00, 0.0000e+00, -5.9488e-08], [ 2.9104e-10, 1.4552e-09, 4.0978e-08, ..., 7.5670e-10, 0.0000e+00, 5.3435e-08]], device='cuda:0') Epoch 472, bias, value: tensor([-0.0188, -0.0352, -0.0292, -0.0277, -0.0058, 0.0115, 0.0099, -0.0067, -0.0174, -0.0182], device='cuda:0'), grad: tensor([-2.4738e-08, 1.2573e-08, 9.5461e-09, 1.7462e-09, 1.5716e-09, 1.2224e-08, 2.8347e-08, -1.5600e-08, -1.3434e-07, 1.2270e-07], device='cuda:0') 100 1e-05 changing lr epoch 471, time 220.43, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4303 re_mapping 0.0018 re_causal 0.0069 /// teacc 99.22 lr 0.00001000 Epoch 473, weight, value: tensor([[-0.2624, -0.3229, 0.1284, ..., -0.1673, 0.0467, 0.0480], [-0.1586, -0.0888, -0.1235, ..., -0.2677, -0.0727, -0.0254], [ 0.0075, -0.2079, -0.2787, ..., -0.2047, 0.0304, -0.4492], ..., [-0.2365, 0.2027, 0.0351, ..., 0.2687, -0.0729, -0.1770], [-0.2319, -0.2217, 0.2578, ..., -0.1971, -0.1666, 0.1746], [ 0.0153, -0.4098, 0.2151, ..., 0.0580, -0.2090, -0.2045]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 5.8208e-11, ..., 0.0000e+00, 0.0000e+00, 5.8208e-11], [0.0000e+00, 1.7462e-10, 5.8208e-11, ..., 2.9104e-10, 0.0000e+00, 0.0000e+00], [0.0000e+00, 1.7462e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [1.7462e-10, 5.8208e-11, 8.7311e-10, ..., 6.4028e-10, 0.0000e+00, 5.8208e-11], [5.8208e-11, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], [1.1642e-10, 1.1642e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10]], device='cuda:0') Epoch 473, bias, value: tensor([-0.0189, -0.0353, -0.0292, -0.0277, -0.0058, 0.0115, 0.0099, -0.0067, -0.0174, -0.0181], device='cuda:0'), grad: tensor([ 3.7253e-09, 4.3248e-08, 1.3970e-09, 2.3865e-09, -5.3493e-08, 5.4133e-09, -5.5297e-09, 4.7730e-09, 4.2492e-09, 4.8894e-09], device='cuda:0') 100 1e-05 changing lr epoch 472, time 220.39, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4139 re_mapping 0.0019 re_causal 0.0068 /// teacc 99.20 lr 0.00001000 Epoch 474, weight, value: tensor([[-0.2624, -0.3229, 0.1285, ..., -0.1673, 0.0467, 0.0480], [-0.1586, -0.0889, -0.1235, ..., -0.2678, -0.0727, -0.0254], [ 0.0075, -0.2079, -0.2788, ..., -0.2047, 0.0304, -0.4493], ..., [-0.2365, 0.2027, 0.0350, ..., 0.2688, -0.0729, -0.1770], [-0.2320, -0.2217, 0.2579, ..., -0.1971, -0.1666, 0.1746], [ 0.0152, -0.4099, 0.2153, ..., 0.0580, -0.2090, -0.2046]], device='cuda:0'), grad: tensor([[ 1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 5.8208e-11], [ 1.7462e-10, 8.1491e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], [ 1.7462e-10, 5.8208e-11, 5.8208e-11, ..., 0.0000e+00, 0.0000e+00, 5.8208e-11], ..., [ 0.0000e+00, -2.7940e-09, 5.8208e-11, ..., -1.2515e-08, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 0.0000e+00, 5.8208e-11, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 5.8208e-11, 5.8208e-11, 1.1642e-10, ..., 0.0000e+00, 0.0000e+00, 5.8208e-11]], device='cuda:0') Epoch 474, bias, value: tensor([-0.0188, -0.0354, -0.0292, -0.0277, -0.0058, 0.0115, 0.0098, -0.0068, -0.0175, -0.0180], device='cuda:0'), grad: tensor([ 1.5716e-09, 4.1327e-09, 1.7462e-09, -3.4925e-09, 3.2713e-08, 1.1059e-08, -1.0768e-08, -2.9162e-08, 3.2014e-09, 6.4028e-10], device='cuda:0') 100 1e-05 changing lr epoch 473, time 220.19, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4239 re_mapping 0.0019 re_causal 0.0068 /// teacc 99.21 lr 0.00001000 Epoch 475, weight, value: tensor([[-0.2624, -0.3229, 0.1285, ..., -0.1673, 0.0467, 0.0480], [-0.1587, -0.0890, -0.1236, ..., -0.2679, -0.0727, -0.0255], [ 0.0074, -0.2080, -0.2788, ..., -0.2047, 0.0304, -0.4493], ..., [-0.2365, 0.2029, 0.0351, ..., 0.2689, -0.0729, -0.1770], [-0.2321, -0.2217, 0.2580, ..., -0.1971, -0.1666, 0.1746], [ 0.0153, -0.4099, 0.2154, ..., 0.0580, -0.2090, -0.2047]], device='cuda:0'), grad: tensor([[ 5.8208e-11, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 5.8208e-11], [ 5.8208e-11, 9.8953e-10, 2.9104e-10, ..., 1.0477e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -1.3388e-09, 5.8208e-11, ..., 5.8208e-11, 0.0000e+00, 0.0000e+00], ..., [ 1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 5.8208e-11, 0.0000e+00, 5.8208e-11], [ 1.1642e-10, 1.2806e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 5.8208e-11, 1.7462e-10, 5.8208e-11, ..., 1.7462e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 475, bias, value: tensor([-0.0188, -0.0355, -0.0293, -0.0277, -0.0058, 0.0116, 0.0097, -0.0068, -0.0175, -0.0179], device='cuda:0'), grad: tensor([ 2.3283e-09, 3.2596e-09, -1.4494e-08, 1.3388e-09, 2.1537e-09, 6.5193e-09, -5.0059e-09, 7.5670e-10, 1.4319e-08, 9.8953e-10], device='cuda:0') 100 1e-05 changing lr epoch 474, time 220.24, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4219 re_mapping 0.0019 re_causal 0.0069 /// teacc 99.20 lr 0.00001000 Epoch 476, weight, value: tensor([[-0.2624, -0.3229, 0.1286, ..., -0.1673, 0.0467, 0.0481], [-0.1587, -0.0892, -0.1237, ..., -0.2680, -0.0727, -0.0254], [ 0.0074, -0.2080, -0.2789, ..., -0.2048, 0.0304, -0.4494], ..., [-0.2366, 0.2031, 0.0352, ..., 0.2691, -0.0729, -0.1770], [-0.2322, -0.2217, 0.2580, ..., -0.1971, -0.1666, 0.1746], [ 0.0153, -0.4100, 0.2155, ..., 0.0581, -0.2090, -0.2047]], device='cuda:0'), grad: tensor([[ 5.8208e-11, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 3.4925e-10, 2.3283e-10, 1.7462e-10, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 5.8208e-11, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 1.1642e-10, 0.0000e+00, 2.3283e-10, ..., 1.1642e-10, 0.0000e+00, 1.7462e-10], [ 1.7462e-10, 0.0000e+00, 5.8208e-11, ..., 0.0000e+00, 0.0000e+00, 5.8208e-11], [-6.9849e-10, 1.1642e-10, -9.8953e-10, ..., -4.6566e-10, 0.0000e+00, 5.8208e-11]], device='cuda:0') Epoch 476, bias, value: tensor([-0.0188, -0.0356, -0.0293, -0.0277, -0.0058, 0.0116, 0.0096, -0.0066, -0.0175, -0.0179], device='cuda:0'), grad: tensor([ 6.4028e-10, 1.0070e-08, -1.0477e-08, 1.2806e-09, -2.9686e-09, -9.8953e-10, 3.9581e-09, 4.2492e-09, 3.4925e-10, -2.7358e-09], device='cuda:0') 100 1e-05 changing lr epoch 475, time 220.49, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4242 re_mapping 0.0018 re_causal 0.0067 /// teacc 99.19 lr 0.00001000 Epoch 477, weight, value: tensor([[-0.2625, -0.3229, 0.1286, ..., -0.1673, 0.0467, 0.0481], [-0.1588, -0.0893, -0.1238, ..., -0.2681, -0.0727, -0.0254], [ 0.0074, -0.2081, -0.2789, ..., -0.2048, 0.0304, -0.4495], ..., [-0.2366, 0.2031, 0.0352, ..., 0.2692, -0.0729, -0.1771], [-0.2322, -0.2218, 0.2581, ..., -0.1971, -0.1666, 0.1747], [ 0.0153, -0.4101, 0.2156, ..., 0.0581, -0.2090, -0.2047]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [1.2806e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 8.1491e-10], [1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 477, bias, value: tensor([-0.0187, -0.0357, -0.0294, -0.0277, -0.0059, 0.0116, 0.0095, -0.0066, -0.0175, -0.0179], device='cuda:0'), grad: tensor([ 2.3283e-10, 6.9849e-10, 1.1642e-10, 8.1491e-10, 1.1642e-10, -2.3283e-09, 2.0955e-09, 8.1491e-10, 1.9791e-09, 8.1491e-10], device='cuda:0') 100 1e-05 changing lr epoch 476, time 220.66, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4191 re_mapping 0.0018 re_causal 0.0069 /// teacc 99.20 lr 0.00001000 Epoch 478, weight, value: tensor([[-0.2625, -0.3230, 0.1287, ..., -0.1673, 0.0467, 0.0481], [-0.1588, -0.0894, -0.1238, ..., -0.2682, -0.0727, -0.0254], [ 0.0074, -0.2081, -0.2790, ..., -0.2049, 0.0304, -0.4495], ..., [-0.2366, 0.2032, 0.0352, ..., 0.2693, -0.0729, -0.1771], [-0.2322, -0.2218, 0.2581, ..., -0.1971, -0.1666, 0.1747], [ 0.0152, -0.4102, 0.2156, ..., 0.0580, -0.2090, -0.2049]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.3970e-09, 5.8208e-10, ..., 1.1642e-10, 1.1642e-10, 0.0000e+00], [ 0.0000e+00, 1.0477e-09, 3.4925e-10, ..., 9.3132e-10, 1.0477e-09, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 1.1642e-10, 9.1968e-09, 0.0000e+00], ..., [ 0.0000e+00, -1.3970e-09, -5.8208e-10, ..., -1.3970e-09, 0.0000e+00, 0.0000e+00], [ 1.1642e-10, 6.9849e-10, 3.4925e-10, ..., 0.0000e+00, 6.9849e-10, 1.1642e-10], [ 1.1642e-10, 8.1491e-10, 3.4925e-10, ..., 8.1491e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 478, bias, value: tensor([-0.0188, -0.0358, -0.0293, -0.0277, -0.0059, 0.0116, 0.0096, -0.0066, -0.0175, -0.0180], device='cuda:0'), grad: tensor([ 4.5402e-09, 8.4983e-09, 5.0291e-08, -1.7462e-09, 1.0710e-08, 9.1968e-09, -7.1712e-08, -3.2596e-09, 5.9372e-09, 2.3283e-09], device='cuda:0') 100 1e-05 changing lr epoch 477, time 220.67, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4027 re_mapping 0.0019 re_causal 0.0067 /// teacc 99.18 lr 0.00001000 Epoch 479, weight, value: tensor([[-0.2625, -0.3230, 0.1287, ..., -0.1674, 0.0467, 0.0481], [-0.1588, -0.0894, -0.1239, ..., -0.2684, -0.0727, -0.0254], [ 0.0074, -0.2082, -0.2790, ..., -0.2049, 0.0303, -0.4496], ..., [-0.2366, 0.2034, 0.0353, ..., 0.2695, -0.0729, -0.1771], [-0.2323, -0.2218, 0.2582, ..., -0.1972, -0.1666, 0.1747], [ 0.0153, -0.4103, 0.2157, ..., 0.0581, -0.2090, -0.2049]], device='cuda:0'), grad: tensor([[-2.4447e-09, 0.0000e+00, -1.0245e-08, ..., 0.0000e+00, 0.0000e+00, -2.1188e-08], [ 5.8208e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 5.8208e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 2.4447e-09], [ 1.9791e-09, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 1.5134e-09]], device='cuda:0') Epoch 479, bias, value: tensor([-0.0188, -0.0358, -0.0294, -0.0277, -0.0059, 0.0117, 0.0094, -0.0066, -0.0176, -0.0179], device='cuda:0'), grad: tensor([-7.0548e-08, 1.9791e-09, 2.3283e-10, 1.0827e-07, -3.1432e-09, -1.1781e-07, 6.6124e-08, 9.3132e-10, 6.4028e-09, 7.9162e-09], device='cuda:0') 100 1e-05 changing lr epoch 478, time 220.68, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4252 re_mapping 0.0019 re_causal 0.0070 /// teacc 99.17 lr 0.00001000 Epoch 480, weight, value: tensor([[-0.2625, -0.3230, 0.1288, ..., -0.1674, 0.0467, 0.0481], [-0.1589, -0.0895, -0.1239, ..., -0.2684, -0.0727, -0.0254], [ 0.0074, -0.2082, -0.2791, ..., -0.2049, 0.0303, -0.4496], ..., [-0.2366, 0.2034, 0.0353, ..., 0.2695, -0.0729, -0.1771], [-0.2324, -0.2218, 0.2582, ..., -0.1972, -0.1666, 0.1747], [ 0.0154, -0.4103, 0.2158, ..., 0.0581, -0.2090, -0.2049]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], ..., [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 480, bias, value: tensor([-0.0187, -0.0358, -0.0293, -0.0277, -0.0060, 0.0117, 0.0093, -0.0066, -0.0176, -0.0179], device='cuda:0'), grad: tensor([ 2.4447e-09, 1.8626e-09, 1.1642e-09, 6.9849e-10, -3.4925e-09, 1.3970e-09, 3.2596e-09, 6.9849e-10, 4.6566e-10, 8.1491e-10], device='cuda:0') 100 1e-05 changing lr epoch 479, time 220.64, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4246 re_mapping 0.0019 re_causal 0.0068 /// teacc 99.18 lr 0.00001000 Epoch 481, weight, value: tensor([[-0.2625, -0.3230, 0.1288, ..., -0.1674, 0.0467, 0.0482], [-0.1589, -0.0895, -0.1240, ..., -0.2685, -0.0727, -0.0255], [ 0.0075, -0.2083, -0.2791, ..., -0.2050, 0.0303, -0.4497], ..., [-0.2367, 0.2034, 0.0353, ..., 0.2696, -0.0729, -0.1771], [-0.2324, -0.2218, 0.2583, ..., -0.1972, -0.1666, 0.1748], [ 0.0153, -0.4103, 0.2159, ..., 0.0581, -0.2090, -0.2050]], device='cuda:0'), grad: tensor([[ 3.4925e-10, 1.1642e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.4925e-10], [ 2.3283e-10, -2.5611e-09, -3.4925e-10, ..., 2.3283e-10, 0.0000e+00, -1.2806e-09], [ 1.1642e-10, 0.0000e+00, 1.1642e-10, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], ..., [ 5.8208e-10, 6.9849e-10, 2.3283e-10, ..., -1.2806e-09, 0.0000e+00, 1.3970e-09], [ 1.9791e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.6298e-09], [ 9.3132e-10, 5.8208e-10, -9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 1.5134e-09]], device='cuda:0') Epoch 481, bias, value: tensor([-0.0187, -0.0358, -0.0292, -0.0278, -0.0060, 0.0117, 0.0093, -0.0067, -0.0176, -0.0179], device='cuda:0'), grad: tensor([ 2.0955e-09, -1.3155e-08, 2.3283e-09, 8.1491e-09, 6.9849e-10, -1.0675e-07, 8.7428e-08, 1.2456e-08, 6.8685e-09, 3.9581e-09], device='cuda:0') 100 1e-05 changing lr epoch 480, time 220.62, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4308 re_mapping 0.0018 re_causal 0.0069 /// teacc 99.20 lr 0.00001000 Epoch 482, weight, value: tensor([[-0.2626, -0.3230, 0.1288, ..., -0.1674, 0.0467, 0.0482], [-0.1589, -0.0895, -0.1240, ..., -0.2686, -0.0727, -0.0254], [ 0.0074, -0.2083, -0.2791, ..., -0.2050, 0.0303, -0.4498], ..., [-0.2367, 0.2034, 0.0353, ..., 0.2696, -0.0729, -0.1772], [-0.2325, -0.2218, 0.2583, ..., -0.1972, -0.1666, 0.1748], [ 0.0153, -0.4103, 0.2160, ..., 0.0581, -0.2090, -0.2050]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.1642e-10, -1.1642e-10, ..., 2.3283e-10, 0.0000e+00, -5.8208e-10], [ 0.0000e+00, 3.4925e-10, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 8.1491e-10, 4.6566e-10, ..., 2.3283e-10, 0.0000e+00, 5.8208e-10], [ 0.0000e+00, 3.4925e-10, 1.1642e-10, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [-6.9849e-10, 3.4925e-10, -2.2119e-09, ..., -3.6089e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 482, bias, value: tensor([-0.0188, -0.0358, -0.0291, -0.0278, -0.0061, 0.0118, 0.0093, -0.0069, -0.0177, -0.0179], device='cuda:0'), grad: tensor([ 5.8208e-10, -5.1223e-09, 1.7462e-09, -2.9104e-09, 9.8953e-09, 3.7253e-09, 1.7462e-09, 8.0327e-09, 1.7462e-09, -1.0594e-08], device='cuda:0') 100 1e-05 changing lr epoch 481, time 220.69, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4029 re_mapping 0.0019 re_causal 0.0066 /// teacc 99.16 lr 0.00001000 Epoch 483, weight, value: tensor([[-0.2626, -0.3230, 0.1289, ..., -0.1674, 0.0467, 0.0482], [-0.1590, -0.0897, -0.1240, ..., -0.2687, -0.0727, -0.0255], [ 0.0074, -0.2083, -0.2792, ..., -0.2050, 0.0303, -0.4498], ..., [-0.2367, 0.2035, 0.0353, ..., 0.2697, -0.0729, -0.1772], [-0.2326, -0.2219, 0.2584, ..., -0.1972, -0.1666, 0.1748], [ 0.0153, -0.4104, 0.2161, ..., 0.0582, -0.2090, -0.2051]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -1.0827e-08, ..., 0.0000e+00, -2.3283e-10, -1.4203e-08], [ 1.1642e-10, 3.4925e-10, 6.9849e-10, ..., 1.1642e-10, 0.0000e+00, 3.4925e-10], [-2.3283e-09, -4.6566e-10, 2.6776e-09, ..., 0.0000e+00, 0.0000e+00, 2.9104e-09], ..., [ 0.0000e+00, 2.3283e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 8.1491e-10, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 9.3132e-10, 4.6566e-10, 5.8208e-10, ..., 0.0000e+00, 0.0000e+00, 5.8208e-10]], device='cuda:0') Epoch 483, bias, value: tensor([-0.0188, -0.0359, -0.0290, -0.0278, -0.0061, 0.0119, 0.0092, -0.0069, -0.0179, -0.0179], device='cuda:0'), grad: tensor([-5.5530e-08, 4.3074e-09, -9.8953e-09, 3.1432e-09, 1.5134e-09, 2.0838e-08, 2.4564e-08, 3.2596e-09, 3.8417e-09, 1.2340e-08], device='cuda:0') 100 1e-05 changing lr epoch 482, time 220.32, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4415 re_mapping 0.0019 re_causal 0.0069 /// teacc 99.17 lr 0.00001000 Epoch 484, weight, value: tensor([[-0.2626, -0.3230, 0.1289, ..., -0.1674, 0.0467, 0.0482], [-0.1590, -0.0898, -0.1241, ..., -0.2688, -0.0727, -0.0255], [ 0.0074, -0.2084, -0.2792, ..., -0.2050, 0.0303, -0.4499], ..., [-0.2367, 0.2036, 0.0353, ..., 0.2698, -0.0729, -0.1772], [-0.2327, -0.2219, 0.2584, ..., -0.1972, -0.1666, 0.1748], [ 0.0154, -0.4104, 0.2163, ..., 0.0582, -0.2090, -0.2051]], device='cuda:0'), grad: tensor([[2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], [2.3283e-10, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], [1.1642e-10, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [2.3283e-10, 1.1642e-10, 1.1642e-10, ..., 2.3283e-10, 0.0000e+00, 1.1642e-10], [1.5134e-09, 3.4925e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [6.0536e-09, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.8417e-09]], device='cuda:0') Epoch 484, bias, value: tensor([-0.0188, -0.0361, -0.0289, -0.0278, -0.0063, 0.0119, 0.0092, -0.0069, -0.0178, -0.0177], device='cuda:0'), grad: tensor([ 6.9849e-10, 1.2806e-09, 1.0477e-09, -3.1083e-08, 2.0955e-09, 8.0327e-09, 6.4028e-09, 1.7462e-09, 3.4925e-09, 1.3039e-08], device='cuda:0') 100 1e-05 changing lr epoch 483, time 220.57, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4020 re_mapping 0.0019 re_causal 0.0066 /// teacc 99.19 lr 0.00001000 Epoch 485, weight, value: tensor([[-0.2626, -0.3230, 0.1290, ..., -0.1674, 0.0467, 0.0482], [-0.1591, -0.0899, -0.1243, ..., -0.2690, -0.0727, -0.0255], [ 0.0074, -0.2085, -0.2793, ..., -0.2051, 0.0303, -0.4499], ..., [-0.2368, 0.2037, 0.0354, ..., 0.2700, -0.0729, -0.1772], [-0.2328, -0.2219, 0.2584, ..., -0.1972, -0.1666, 0.1748], [ 0.0154, -0.4105, 0.2164, ..., 0.0583, -0.2090, -0.2052]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -9.8953e-09, ..., -2.0955e-09, 0.0000e+00, -8.1491e-09], [ 0.0000e+00, 6.9849e-10, 2.3283e-09, ..., 1.5134e-09, 0.0000e+00, 8.1491e-10], [ 0.0000e+00, -8.1491e-10, 5.8208e-10, ..., 1.1642e-10, 0.0000e+00, 8.1491e-10], ..., [ 0.0000e+00, 6.9849e-10, 1.6531e-08, ..., 1.3621e-08, 0.0000e+00, 1.2806e-09], [ 2.3283e-10, 2.6776e-09, 2.3283e-10, ..., 4.6566e-10, 0.0000e+00, -1.5134e-09], [ 2.3283e-10, 9.3132e-10, -3.0152e-08, ..., -2.8522e-08, 0.0000e+00, 2.5611e-09]], device='cuda:0') Epoch 485, bias, value: tensor([-0.0188, -0.0363, -0.0289, -0.0278, -0.0063, 0.0120, 0.0090, -0.0068, -0.0180, -0.0176], device='cuda:0'), grad: tensor([-3.5157e-08, 1.0477e-08, -2.0606e-08, -7.4506e-09, 4.6799e-08, 5.5181e-08, -2.1770e-08, 6.4028e-08, 2.2934e-08, -1.0629e-07], device='cuda:0') 100 1e-05 changing lr epoch 484, time 220.50, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4091 re_mapping 0.0018 re_causal 0.0068 /// teacc 99.18 lr 0.00001000 Epoch 486, weight, value: tensor([[-0.2626, -0.3230, 0.1291, ..., -0.1675, 0.0467, 0.0483], [-0.1591, -0.0900, -0.1243, ..., -0.2691, -0.0727, -0.0256], [ 0.0074, -0.2085, -0.2793, ..., -0.2051, 0.0303, -0.4500], ..., [-0.2368, 0.2038, 0.0354, ..., 0.2701, -0.0729, -0.1772], [-0.2329, -0.2219, 0.2585, ..., -0.1972, -0.1666, 0.1748], [ 0.0154, -0.4106, 0.2166, ..., 0.0583, -0.2090, -0.2052]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.1642e-10, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1642e-10], [ 0.0000e+00, -1.2806e-09, -1.5134e-09, ..., -1.1642e-09, 0.0000e+00, -1.1642e-09], ..., [ 0.0000e+00, 2.3283e-10, 5.8208e-10, ..., 4.6566e-10, 0.0000e+00, 3.4925e-10], [ 2.3283e-10, 3.4925e-10, 6.9849e-10, ..., 5.8208e-10, 0.0000e+00, 6.9849e-10], [ 0.0000e+00, 1.1642e-10, 1.1642e-10, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 486, bias, value: tensor([-0.0187, -0.0364, -0.0289, -0.0278, -0.0063, 0.0119, 0.0091, -0.0069, -0.0181, -0.0176], device='cuda:0'), grad: tensor([ 2.0955e-09, 2.1886e-08, -3.5157e-08, 2.4447e-09, 1.8626e-09, 2.7940e-09, -2.7940e-09, 5.0059e-09, 6.4028e-09, 5.8208e-10], device='cuda:0') 100 1e-05 changing lr epoch 485, time 220.61, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4117 re_mapping 0.0019 re_causal 0.0067 /// teacc 99.18 lr 0.00001000 Epoch 487, weight, value: tensor([[-0.2627, -0.3230, 0.1291, ..., -0.1675, 0.0467, 0.0484], [-0.1591, -0.0900, -0.1244, ..., -0.2691, -0.0727, -0.0256], [ 0.0074, -0.2085, -0.2793, ..., -0.2051, 0.0303, -0.4500], ..., [-0.2368, 0.2038, 0.0354, ..., 0.2702, -0.0729, -0.1773], [-0.2329, -0.2220, 0.2585, ..., -0.1972, -0.1666, 0.1748], [ 0.0155, -0.4106, 0.2167, ..., 0.0584, -0.2090, -0.2053]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.1642e-10, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.0477e-09, ..., 1.5134e-09, 0.0000e+00, 0.0000e+00], [ 1.1642e-10, 0.0000e+00, 1.1642e-10, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], [ 1.1642e-10, 0.0000e+00, -2.4098e-08, ..., -3.3993e-08, 0.0000e+00, 1.1642e-10]], device='cuda:0') Epoch 487, bias, value: tensor([-0.0186, -0.0365, -0.0288, -0.0278, -0.0065, 0.0119, 0.0090, -0.0070, -0.0181, -0.0175], device='cuda:0'), grad: tensor([ 4.6566e-10, 1.9791e-09, 1.8626e-09, 1.9791e-09, 8.8592e-08, 1.1642e-09, 9.1968e-09, 6.5193e-09, 9.3132e-10, -9.9652e-08], device='cuda:0') 100 1e-05 changing lr epoch 486, time 220.27, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4382 re_mapping 0.0019 re_causal 0.0068 /// teacc 99.20 lr 0.00001000 Epoch 488, weight, value: tensor([[-0.2627, -0.3230, 0.1292, ..., -0.1675, 0.0467, 0.0484], [-0.1592, -0.0901, -0.1245, ..., -0.2692, -0.0727, -0.0255], [ 0.0074, -0.2086, -0.2794, ..., -0.2052, 0.0303, -0.4500], ..., [-0.2368, 0.2039, 0.0354, ..., 0.2702, -0.0729, -0.1773], [-0.2329, -0.2220, 0.2585, ..., -0.1972, -0.1666, 0.1748], [ 0.0155, -0.4107, 0.2169, ..., 0.0585, -0.2090, -0.2053]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 1.2806e-09, 4.6566e-10, ..., 9.3132e-10, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, -3.4925e-10, -6.9849e-10, ..., 0.0000e+00, 1.7462e-08, 0.0000e+00], ..., [ 0.0000e+00, -2.4447e-09, -8.1491e-10, ..., -1.0477e-09, 0.0000e+00, -3.4925e-10], [ 0.0000e+00, 1.6298e-09, 0.0000e+00, ..., 2.3283e-10, 1.1642e-09, -1.2806e-09], [ 0.0000e+00, 3.4925e-10, -3.9581e-09, ..., -3.7253e-09, 0.0000e+00, 1.1642e-10]], device='cuda:0') Epoch 488, bias, value: tensor([-0.0185, -0.0365, -0.0287, -0.0278, -0.0067, 0.0119, 0.0091, -0.0071, -0.0182, -0.0174], device='cuda:0'), grad: tensor([ 2.4447e-09, 1.4086e-08, 2.4913e-08, 6.7055e-08, 2.0140e-08, 5.1572e-08, -1.6892e-07, -1.9791e-09, 6.9849e-09, -9.5461e-09], device='cuda:0') 100 1e-05 changing lr epoch 487, time 220.21, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4160 re_mapping 0.0018 re_causal 0.0066 /// teacc 99.18 lr 0.00001000 Epoch 489, weight, value: tensor([[-0.2627, -0.3231, 0.1293, ..., -0.1675, 0.0467, 0.0485], [-0.1592, -0.0901, -0.1245, ..., -0.2694, -0.0727, -0.0256], [ 0.0075, -0.2086, -0.2794, ..., -0.2052, 0.0303, -0.4501], ..., [-0.2369, 0.2040, 0.0353, ..., 0.2704, -0.0729, -0.1773], [-0.2330, -0.2220, 0.2586, ..., -0.1973, -0.1666, 0.1749], [ 0.0155, -0.4108, 0.2171, ..., 0.0586, -0.2090, -0.2054]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 0.0000e+00, 3.4925e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, -6.9849e-10], [ 0.0000e+00, 2.3283e-10, 1.1642e-10, ..., 3.4925e-10, 0.0000e+00, 2.3283e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 1.1642e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.1642e-10, ..., 1.8626e-09, 0.0000e+00, 1.1642e-10]], device='cuda:0') Epoch 489, bias, value: tensor([-0.0185, -0.0366, -0.0286, -0.0279, -0.0068, 0.0120, 0.0090, -0.0071, -0.0182, -0.0173], device='cuda:0'), grad: tensor([ 3.3760e-09, -4.6566e-09, 9.1968e-09, 3.4925e-10, -8.3819e-09, 5.1223e-09, -9.3132e-09, 2.6776e-09, 1.8626e-09, 9.4296e-09], device='cuda:0') 100 1e-05 changing lr epoch 488, time 220.51, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4301 re_mapping 0.0018 re_causal 0.0067 /// teacc 99.18 lr 0.00001000 Epoch 490, weight, value: tensor([[-0.2627, -0.3231, 0.1294, ..., -0.1675, 0.0467, 0.0486], [-0.1592, -0.0902, -0.1246, ..., -0.2695, -0.0727, -0.0255], [ 0.0074, -0.2087, -0.2795, ..., -0.2053, 0.0303, -0.4502], ..., [-0.2369, 0.2041, 0.0353, ..., 0.2705, -0.0729, -0.1773], [-0.2331, -0.2220, 0.2586, ..., -0.1973, -0.1666, 0.1748], [ 0.0155, -0.4109, 0.2173, ..., 0.0587, -0.2090, -0.2055]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.8208e-10, 1.1642e-10, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 1.1642e-10, -8.4983e-09, 2.6776e-09, ..., 5.8208e-09, 0.0000e+00, -2.3283e-09], [-3.4925e-10, 2.4447e-09, 5.8208e-10, ..., 1.2806e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 1.0477e-09, -3.6089e-09, ..., -8.1491e-09, 0.0000e+00, 1.9791e-09], [ 2.3283e-10, 2.3283e-10, 4.6566e-10, ..., 3.4925e-10, 0.0000e+00, 0.0000e+00], [-5.8208e-10, 1.2806e-09, -9.3132e-10, ..., -4.6566e-10, 0.0000e+00, 1.1642e-10]], device='cuda:0') Epoch 490, bias, value: tensor([-0.0183, -0.0366, -0.0286, -0.0279, -0.0069, 0.0120, 0.0089, -0.0071, -0.0184, -0.0172], device='cuda:0'), grad: tensor([ 2.9104e-09, -7.7649e-08, 3.4925e-09, 6.0536e-09, 5.4715e-09, 4.5402e-09, 2.7940e-09, 4.9127e-08, 3.6089e-09, 2.6776e-09], device='cuda:0') 100 1e-05 changing lr epoch 489, time 220.47, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4347 re_mapping 0.0019 re_causal 0.0068 /// teacc 99.19 lr 0.00001000 Epoch 491, weight, value: tensor([[-0.2627, -0.3231, 0.1296, ..., -0.1675, 0.0467, 0.0487], [-0.1592, -0.0902, -0.1247, ..., -0.2696, -0.0727, -0.0256], [ 0.0075, -0.2087, -0.2795, ..., -0.2053, 0.0303, -0.4503], ..., [-0.2369, 0.2041, 0.0354, ..., 0.2706, -0.0729, -0.1774], [-0.2332, -0.2220, 0.2586, ..., -0.1973, -0.1666, 0.1748], [ 0.0154, -0.4109, 0.2174, ..., 0.0588, -0.2090, -0.2056]], device='cuda:0'), grad: tensor([[ 1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.1642e-10, 1.1642e-10], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 4.6566e-10, -9.0804e-09, -7.9162e-09], [ 0.0000e+00, 1.9791e-09, 0.0000e+00, ..., 0.0000e+00, 4.8894e-09, 7.7998e-09], ..., [ 1.1642e-10, 1.1642e-10, 1.1642e-10, ..., 1.1642e-10, 3.7253e-09, 1.1642e-10], [ 1.0477e-09, 1.5134e-09, 6.9849e-10, ..., 0.0000e+00, 1.1642e-10, 9.3132e-10], [ 0.0000e+00, 4.6566e-10, 1.1642e-10, ..., 2.3283e-10, 0.0000e+00, 1.1642e-10]], device='cuda:0') Epoch 491, bias, value: tensor([-0.0181, -0.0367, -0.0283, -0.0280, -0.0071, 0.0121, 0.0090, -0.0072, -0.0186, -0.0172], device='cuda:0'), grad: tensor([ 2.9104e-09, -2.4168e-07, 1.6158e-07, 4.5402e-09, 2.7940e-09, -5.1921e-08, 3.3178e-08, 8.2888e-08, 9.4296e-09, 4.0745e-09], device='cuda:0') 100 1e-05 changing lr epoch 490, time 220.82, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4055 re_mapping 0.0018 re_causal 0.0067 /// teacc 99.18 lr 0.00001000 Epoch 492, weight, value: tensor([[-0.2627, -0.3231, 0.1297, ..., -0.1675, 0.0467, 0.0487], [-0.1593, -0.0903, -0.1247, ..., -0.2696, -0.0727, -0.0255], [ 0.0075, -0.2087, -0.2796, ..., -0.2053, 0.0302, -0.4504], ..., [-0.2369, 0.2042, 0.0354, ..., 0.2707, -0.0729, -0.1774], [-0.2333, -0.2221, 0.2587, ..., -0.1973, -0.1666, 0.1748], [ 0.0154, -0.4110, 0.2176, ..., 0.0589, -0.2090, -0.2056]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.2806e-09, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 2.3283e-10, 0.0000e+00, 1.1642e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.3283e-10, ..., -3.4925e-10, 0.0000e+00, 2.3283e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 6.9849e-10, 0.0000e+00, 1.0012e-08, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 1.8626e-09, 1.1642e-10, -1.2573e-08, ..., 3.0268e-09, 0.0000e+00, -1.3970e-09]], device='cuda:0') Epoch 492, bias, value: tensor([-0.0180, -0.0367, -0.0283, -0.0280, -0.0073, 0.0121, 0.0089, -0.0073, -0.0188, -0.0171], device='cuda:0'), grad: tensor([ 3.3760e-09, 2.5611e-09, -2.6776e-09, 2.5611e-09, -2.0838e-08, 2.7940e-09, 5.9372e-09, 3.9581e-09, 2.3632e-08, -1.2224e-08], device='cuda:0') 100 1e-05 changing lr epoch 491, time 220.64, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4043 re_mapping 0.0018 re_causal 0.0067 /// teacc 99.18 lr 0.00001000 Epoch 493, weight, value: tensor([[-0.2627, -0.3231, 0.1298, ..., -0.1675, 0.0467, 0.0488], [-0.1593, -0.0903, -0.1248, ..., -0.2697, -0.0727, -0.0255], [ 0.0075, -0.2088, -0.2797, ..., -0.2053, 0.0302, -0.4505], ..., [-0.2370, 0.2042, 0.0354, ..., 0.2707, -0.0729, -0.1775], [-0.2334, -0.2221, 0.2587, ..., -0.1973, -0.1666, 0.1748], [ 0.0154, -0.4110, 0.2177, ..., 0.0590, -0.2090, -0.2057]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 1.7462e-09, ..., 1.3970e-09, 0.0000e+00, 1.1642e-10], [ 0.0000e+00, 1.1642e-10, 3.4925e-10, ..., 1.1642e-10, 0.0000e+00, -1.8626e-09], [ 0.0000e+00, 2.3283e-10, 6.9849e-10, ..., 2.3283e-10, 1.1642e-10, 1.3970e-09], ..., [ 0.0000e+00, 1.1642e-10, 1.6298e-09, ..., 1.0477e-09, 0.0000e+00, 5.8208e-10], [ 0.0000e+00, -3.4925e-10, -2.6776e-09, ..., -9.3132e-10, 0.0000e+00, -1.5134e-09], [-8.1491e-10, 0.0000e+00, -4.0745e-09, ..., -3.3760e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 493, bias, value: tensor([-0.0178, -0.0365, -0.0284, -0.0280, -0.0074, 0.0122, 0.0088, -0.0075, -0.0189, -0.0170], device='cuda:0'), grad: tensor([ 6.0536e-09, -2.8173e-08, 1.9441e-08, 1.1642e-09, 2.6776e-09, 2.5611e-09, 5.2387e-09, 7.3342e-09, -3.3760e-09, -1.0827e-08], device='cuda:0') 100 1e-05 changing lr epoch 492, time 220.60, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4123 re_mapping 0.0018 re_causal 0.0067 /// teacc 99.20 lr 0.00001000 Epoch 494, weight, value: tensor([[-0.2628, -0.3231, 0.1299, ..., -0.1675, 0.0467, 0.0489], [-0.1594, -0.0904, -0.1249, ..., -0.2698, -0.0727, -0.0255], [ 0.0075, -0.2089, -0.2797, ..., -0.2054, 0.0302, -0.4505], ..., [-0.2370, 0.2043, 0.0354, ..., 0.2708, -0.0729, -0.1775], [-0.2335, -0.2221, 0.2587, ..., -0.1974, -0.1666, 0.1748], [ 0.0155, -0.4111, 0.2179, ..., 0.0592, -0.2090, -0.2058]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.8208e-10, -6.9849e-10, ..., 1.1642e-10, -1.1642e-10, -8.1491e-10], [ 0.0000e+00, -2.0256e-08, 4.6566e-10, ..., 1.0477e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 2.3283e-10, ..., 1.1642e-10, 0.0000e+00, 1.1642e-10], ..., [ 0.0000e+00, 1.7462e-08, -1.7462e-09, ..., -3.2596e-09, 0.0000e+00, 0.0000e+00], [ 1.1642e-10, 3.4925e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.3283e-10], [ 1.1642e-10, 3.3760e-09, 1.6298e-09, ..., 2.4447e-09, 0.0000e+00, 1.1642e-10]], device='cuda:0') Epoch 494, bias, value: tensor([-0.0177, -0.0366, -0.0284, -0.0280, -0.0075, 0.0122, 0.0089, -0.0075, -0.0190, -0.0168], device='cuda:0'), grad: tensor([-2.2119e-09, -1.3667e-07, 4.3074e-09, 8.6147e-09, 8.1491e-10, -5.4715e-09, 3.3760e-09, 1.2596e-07, 1.3970e-09, 9.1968e-09], device='cuda:0') 100 1e-05 changing lr epoch 493, time 220.59, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4101 re_mapping 0.0018 re_causal 0.0067 /// teacc 99.19 lr 0.00001000 Epoch 495, weight, value: tensor([[-0.2628, -0.3231, 0.1299, ..., -0.1676, 0.0467, 0.0489], [-0.1594, -0.0904, -0.1249, ..., -0.2698, -0.0727, -0.0255], [ 0.0075, -0.2089, -0.2797, ..., -0.2054, 0.0302, -0.4505], ..., [-0.2370, 0.2043, 0.0354, ..., 0.2709, -0.0729, -0.1775], [-0.2335, -0.2221, 0.2588, ..., -0.1974, -0.1666, 0.1749], [ 0.0155, -0.4111, 0.2180, ..., 0.0592, -0.2090, -0.2058]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 495, bias, value: tensor([-0.0176, -0.0367, -0.0283, -0.0280, -0.0076, 0.0122, 0.0089, -0.0076, -0.0191, -0.0168], device='cuda:0'), grad: tensor([ 1.1642e-09, 2.9104e-09, -3.7253e-09, 4.6566e-10, 3.4925e-10, 5.8208e-10, 2.3283e-10, 9.3132e-10, 1.1642e-10, 2.3283e-10], device='cuda:0') 100 1e-05 changing lr epoch 494, time 221.21, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4283 re_mapping 0.0019 re_causal 0.0069 /// teacc 99.20 lr 0.00001000 Epoch 496, weight, value: tensor([[-0.2628, -0.3231, 0.1301, ..., -0.1676, 0.0467, 0.0490], [-0.1594, -0.0905, -0.1250, ..., -0.2699, -0.0727, -0.0256], [ 0.0075, -0.2090, -0.2798, ..., -0.2055, 0.0302, -0.4505], ..., [-0.2370, 0.2043, 0.0354, ..., 0.2709, -0.0729, -0.1776], [-0.2336, -0.2221, 0.2588, ..., -0.1974, -0.1666, 0.1749], [ 0.0155, -0.4112, 0.2181, ..., 0.0593, -0.2090, -0.2059]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.3970e-09, -3.2596e-09, ..., -5.8208e-10, 0.0000e+00, -4.3074e-09], [ 0.0000e+00, 1.3970e-09, 3.4925e-10, ..., 6.9849e-10, 0.0000e+00, 1.1642e-10], [ 0.0000e+00, 3.7253e-09, 9.3132e-10, ..., 1.3970e-09, 0.0000e+00, 6.9849e-10], ..., [ 4.6566e-10, -5.8208e-10, 2.3283e-09, ..., -1.1642e-09, 0.0000e+00, 5.8208e-10], [ 0.0000e+00, 2.3283e-10, -1.8626e-09, ..., 0.0000e+00, 0.0000e+00, -1.9791e-09], [-5.8208e-10, 3.4925e-10, -2.4447e-09, ..., -1.9791e-09, 0.0000e+00, 6.9849e-10]], device='cuda:0') Epoch 496, bias, value: tensor([-0.0174, -0.0367, -0.0282, -0.0280, -0.0076, 0.0121, 0.0089, -0.0077, -0.0193, -0.0168], device='cuda:0'), grad: tensor([-1.0827e-08, 4.8894e-09, 1.3504e-08, -2.6892e-08, 3.6089e-09, 1.5716e-08, 4.8894e-09, 8.2655e-09, -6.6357e-09, -2.7940e-09], device='cuda:0') 100 1e-05 changing lr epoch 495, time 220.80, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4348 re_mapping 0.0018 re_causal 0.0068 /// teacc 99.18 lr 0.00001000 Epoch 497, weight, value: tensor([[-0.2628, -0.3231, 0.1302, ..., -0.1676, 0.0467, 0.0491], [-0.1594, -0.0905, -0.1250, ..., -0.2700, -0.0727, -0.0256], [ 0.0075, -0.2090, -0.2798, ..., -0.2055, 0.0302, -0.4506], ..., [-0.2370, 0.2043, 0.0354, ..., 0.2710, -0.0729, -0.1776], [-0.2337, -0.2222, 0.2589, ..., -0.1974, -0.1666, 0.1749], [ 0.0155, -0.4112, 0.2182, ..., 0.0593, -0.2090, -0.2060]], device='cuda:0'), grad: tensor([[ 4.0745e-10, 0.0000e+00, 5.2387e-10, ..., 5.8208e-11, 0.0000e+00, 4.3074e-09], [ 5.0641e-09, 5.8208e-11, 5.8208e-09, ..., 2.3283e-10, 0.0000e+00, 5.8208e-11], [ 1.1642e-10, 0.0000e+00, 1.1642e-10, ..., 0.0000e+00, 1.7462e-10, 0.0000e+00], ..., [ 5.8208e-11, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 1.7462e-10, 1.1642e-10, 2.3283e-10, ..., 0.0000e+00, 0.0000e+00, 5.8208e-11], [-1.0710e-08, 0.0000e+00, -1.2398e-08, ..., 4.6566e-10, 0.0000e+00, 5.8208e-11]], device='cuda:0') Epoch 497, bias, value: tensor([-0.0173, -0.0367, -0.0282, -0.0280, -0.0077, 0.0122, 0.0089, -0.0078, -0.0193, -0.0168], device='cuda:0'), grad: tensor([ 3.8242e-08, 4.0105e-08, 2.7940e-09, 1.3446e-08, 1.5774e-08, 3.0268e-08, -5.2736e-08, 2.2701e-09, 1.9209e-09, -8.1898e-08], device='cuda:0') 100 1e-05 changing lr epoch 496, time 220.10, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4339 re_mapping 0.0018 re_causal 0.0069 /// teacc 99.21 lr 0.00001000 Epoch 498, weight, value: tensor([[-0.2628, -0.3231, 0.1302, ..., -0.1676, 0.0467, 0.0491], [-0.1595, -0.0905, -0.1250, ..., -0.2700, -0.0727, -0.0256], [ 0.0075, -0.2090, -0.2798, ..., -0.2055, 0.0302, -0.4506], ..., [-0.2371, 0.2044, 0.0354, ..., 0.2710, -0.0729, -0.1776], [-0.2337, -0.2222, 0.2589, ..., -0.1974, -0.1666, 0.1749], [ 0.0154, -0.4113, 0.2182, ..., 0.0593, -0.2090, -0.2061]], device='cuda:0'), grad: tensor([[ 5.8208e-11, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.7462e-10], [-5.8208e-11, 1.1642e-10, 5.8208e-11, ..., 1.1642e-10, 0.0000e+00, -4.0745e-10], [ 0.0000e+00, 5.8208e-11, 0.0000e+00, ..., 5.8208e-11, 0.0000e+00, 5.8208e-11], ..., [ 5.8208e-11, 0.0000e+00, 3.4925e-10, ..., 1.7462e-10, 0.0000e+00, 5.8208e-11], [ 0.0000e+00, 0.0000e+00, 1.1642e-10, ..., 5.8208e-11, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 498, bias, value: tensor([-0.0173, -0.0367, -0.0281, -0.0280, -0.0076, 0.0122, 0.0088, -0.0078, -0.0193, -0.0169], device='cuda:0'), grad: tensor([ 1.8626e-09, -2.8522e-09, 5.2387e-10, 1.5716e-09, 1.6298e-09, 1.3970e-09, 5.8208e-10, 1.9209e-09, 5.8208e-10, 3.4925e-10], device='cuda:0') 100 1e-05 changing lr epoch 497, time 219.92, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4140 re_mapping 0.0018 re_causal 0.0066 /// teacc 99.21 lr 0.00001000 Epoch 499, weight, value: tensor([[-0.2628, -0.3231, 0.1303, ..., -0.1676, 0.0467, 0.0492], [-0.1595, -0.0905, -0.1251, ..., -0.2701, -0.0727, -0.0256], [ 0.0075, -0.2091, -0.2799, ..., -0.2055, 0.0302, -0.4507], ..., [-0.2371, 0.2044, 0.0354, ..., 0.2711, -0.0729, -0.1776], [-0.2338, -0.2222, 0.2590, ..., -0.1974, -0.1666, 0.1750], [ 0.0155, -0.4114, 0.2183, ..., 0.0594, -0.2090, -0.2062]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.1642e-10, 0.0000e+00, 5.8208e-11, ..., 5.8208e-11, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 5.8208e-11, ..., 0.0000e+00, 0.0000e+00, 5.8208e-11], ..., [ 0.0000e+00, 1.1642e-10, 5.8208e-11, ..., 1.7462e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-1.1525e-08, 0.0000e+00, -8.2073e-09, ..., -2.6193e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 499, bias, value: tensor([-0.0171, -0.0369, -0.0280, -0.0281, -0.0075, 0.0123, 0.0087, -0.0079, -0.0194, -0.0169], device='cuda:0'), grad: tensor([ 2.3283e-10, 7.9744e-09, -7.1013e-09, 2.9104e-10, 3.9756e-08, 5.2387e-10, 8.7311e-10, 2.2701e-09, 1.1642e-10, -3.6962e-08], device='cuda:0') 100 1e-05 changing lr epoch 498, time 219.92, cls_loss 0.0004 cls_loss_mapping 0.0004 cls_loss_causal 0.4347 re_mapping 0.0018 re_causal 0.0068 /// teacc 99.19 lr 0.00001000 Epoch 500, weight, value: tensor([[-0.2628, -0.3231, 0.1304, ..., -0.1676, 0.0467, 0.0492], [-0.1595, -0.0905, -0.1251, ..., -0.2702, -0.0727, -0.0256], [ 0.0075, -0.2091, -0.2799, ..., -0.2055, 0.0302, -0.4507], ..., [-0.2371, 0.2044, 0.0354, ..., 0.2712, -0.0729, -0.1777], [-0.2339, -0.2222, 0.2590, ..., -0.1974, -0.1666, 0.1750], [ 0.0154, -0.4114, 0.2185, ..., 0.0595, -0.2090, -0.2062]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -5.2387e-10, ..., 0.0000e+00, 0.0000e+00, -1.2806e-09], [ 1.1642e-10, 2.3283e-10, 0.0000e+00, ..., 1.1642e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.9104e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 3.4925e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 5.8208e-11, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 5.8208e-11], [ 1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 1.7462e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 500, bias, value: tensor([-0.0170, -0.0369, -0.0279, -0.0281, -0.0076, 0.0123, 0.0087, -0.0081, -0.0195, -0.0169], device='cuda:0'), grad: tensor([-3.2596e-09, 2.6776e-09, 1.0477e-09, 9.8953e-10, -5.8790e-09, 5.4715e-09, 5.8208e-09, 1.6298e-09, 5.8208e-10, 2.0955e-09], device='cuda:0') 100 1e-05 changing lr epoch 499, time 219.98, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4091 re_mapping 0.0018 re_causal 0.0065 /// teacc 99.19 lr 0.00001000 ---------------------saving last model at epoch 499---------------------------------------------------- /home/yuqian_fu {'gpu': '0', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep500_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1_eps3', 'svpath': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep500_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1_eps3/14factor_best.csv', 'channels': 3, 'factor_num': 14, 'stride': 3, 'epoch': 'best', 'eval_mapping': True} loading weight of best randm: False stride: 3 loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best Using downloaded and verified file: /home/yuqian_fu/.pytorch/SVHN/test_32x32.mat mnist mnist_FA ... usps_FA Avg ShearX 98.790001 99.040001 ... 87.045341 69.751637 ShearY 98.809998 98.830002 ... 87.045341 65.730553 AutoContrast 98.799995 99.190002 ... 87.045341 56.790876 Invert 98.699997 95.540001 ... 87.045341 55.481846 Equalize 98.000000 98.369995 ... 87.045341 68.588643 Solarize 98.150002 98.220001 ... 87.045341 56.272267 SolarizeAdd 98.449997 98.119995 ... 87.045341 66.457808 Posterize 98.930000 99.049995 ... 87.045341 73.027880 Contrast 98.979996 99.209999 ... 87.045341 68.391936 Color 99.010002 99.250000 ... 87.045341 63.124621 Brightness 98.940002 99.239998 ... 87.045341 67.506813 Sharpness 99.059998 99.129997 ... 87.045341 71.598598 NoiseSalt 99.150002 99.180000 ... 87.045341 60.771899 NoiseGaussian 99.070000 99.250000 ... 87.045341 58.463491 w/o do (original x) 99.250000 0.000000 ... 0.000000 71.863251 [15 rows x 11 columns] mnist svhn mnist_m syndigit usps Avg do 99.08 66.345267 78.791245 75.379462 85.749875 76.566463