/home/yuqian_fu here1 here2 {'gpu': '0', 'data': 'mnist', 'ntr': None, 'translate': None, 'autoaug': 'CA_multiple', 'n': 3, 'stride': 3, 'factor_num': 14, 'epochs': 500, 'nbatch': 100, 'batchsize': 32, 'lr': 0.0001, 'lr_scheduler': 'Step', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep500_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1', 'clsadapt': True, 'lambda_causal': 1.0, 'lambda_re': 1.0, 'randm': True, 'randn': True, 'network': 'resnet18'} stride: 3 --------------------------CA_multiple-------------------------- ---------------------------14 factors----------------- randm: True randn: True n: 3 randm: False Epoch 1, weight, value: tensor([[-0.0256, 0.0311, 0.0202, ..., -0.0092, 0.0157, 0.0004], [ 0.0206, 0.0280, -0.0197, ..., -0.0287, -0.0107, 0.0182], [ 0.0262, 0.0067, -0.0170, ..., -0.0002, -0.0249, 0.0204], ..., [ 0.0079, 0.0292, 0.0166, ..., 0.0049, 0.0199, 0.0240], [ 0.0082, -0.0053, -0.0057, ..., -0.0270, -0.0140, 0.0196], [ 0.0090, 0.0293, -0.0138, ..., -0.0019, -0.0216, 0.0208]], device='cuda:0'), grad: None Epoch 1, bias, value: tensor([-0.0081, -0.0085, -0.0046, -0.0105, -0.0169, -0.0071, 0.0164, -0.0019, 0.0246, 0.0014], device='cuda:0'), grad: None 100 0.0001 changing lr ---------------------saving model at epoch 0---------------------------------------------------- epoch 0, time 225.02, cls_loss 1.7388 cls_loss_mapping 2.0137 cls_loss_causal 2.2418 re_mapping 0.0700 re_causal 0.0682 /// teacc 79.37 lr 0.00010000 Epoch 2, weight, value: tensor([[-0.0355, 0.0352, 0.0261, ..., -0.0077, 0.0137, -0.0090], [ 0.0269, 0.0208, -0.0298, ..., -0.0371, -0.0168, 0.0250], [ 0.0299, 0.0008, -0.0227, ..., -0.0027, -0.0262, 0.0209], ..., [ 0.0082, 0.0308, 0.0243, ..., 0.0127, 0.0284, 0.0257], [ 0.0117, -0.0102, -0.0127, ..., -0.0320, -0.0156, 0.0231], [ 0.0038, 0.0307, -0.0128, ..., -0.0043, -0.0193, 0.0209]], device='cuda:0'), grad: tensor([[ 2.6875e-03, 1.0078e-02, 5.2795e-03, ..., 1.4557e-02, 2.4967e-03, 1.6403e-03], [-1.4076e-02, 2.6509e-05, 1.7376e-03, ..., -5.1956e-03, -3.9673e-03, -2.7588e-02], [ 2.3007e-04, -1.8082e-02, 1.5087e-03, ..., 1.2903e-03, -8.3876e-04, 1.2703e-02], ..., [-1.5198e-02, 1.3838e-03, -3.1250e-02, ..., -2.1011e-02, -3.7079e-02, -1.4610e-02], [-1.9958e-02, 1.3199e-02, 9.2392e-03, ..., 1.4076e-02, 7.1068e-03, -1.3199e-02], [ 1.6708e-02, -4.0627e-03, 5.7678e-03, ..., 1.8570e-02, 2.8183e-02, 3.6713e-02]], device='cuda:0') Epoch 2, bias, value: tensor([-0.0083, -0.0080, -0.0044, -0.0110, -0.0160, -0.0072, 0.0160, -0.0009, 0.0239, 0.0007], device='cuda:0'), grad: tensor([ 0.0194, -0.0116, -0.0517, 0.0079, -0.0030, -0.0464, 0.0831, -0.0077, -0.0031, 0.0132], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 1---------------------------------------------------- epoch 1, time 231.02, cls_loss 0.5377 cls_loss_mapping 0.8528 cls_loss_causal 1.9007 re_mapping 0.2121 re_causal 0.2516 /// teacc 91.60 lr 0.00010000 Epoch 3, weight, value: tensor([[-0.0381, 0.0368, 0.0271, ..., -0.0083, 0.0109, -0.0109], [ 0.0276, 0.0206, -0.0329, ..., -0.0403, -0.0184, 0.0273], [ 0.0315, -0.0022, -0.0229, ..., -0.0030, -0.0253, 0.0185], ..., [ 0.0103, 0.0296, 0.0270, ..., 0.0141, 0.0320, 0.0264], [ 0.0138, -0.0134, -0.0159, ..., -0.0359, -0.0176, 0.0251], [ 0.0027, 0.0326, -0.0096, ..., -0.0036, -0.0175, 0.0213]], device='cuda:0'), grad: tensor([[ 1.2026e-03, -1.1848e-02, -1.3016e-02, ..., -4.4212e-03, 6.9389e-03, 1.2054e-03], [ 3.7785e-03, 2.2049e-03, 1.7500e-03, ..., 1.8940e-03, 9.8896e-04, 5.9509e-03], [-2.5040e-02, -1.4849e-03, 2.0199e-03, ..., -1.7673e-05, 8.8739e-04, -2.8656e-02], ..., [ 2.3890e-04, 1.3123e-02, 1.4961e-02, ..., 1.1703e-02, 3.7632e-03, 5.4703e-03], [-1.3817e-02, 7.8011e-03, 4.6577e-03, ..., -4.8370e-03, 1.2150e-03, -1.4687e-02], [ 3.4981e-03, -4.7668e-02, -8.7158e-02, ..., -6.1340e-02, -4.0375e-02, 4.7150e-03]], device='cuda:0') Epoch 3, bias, value: tensor([-0.0084, -0.0081, -0.0047, -0.0105, -0.0162, -0.0060, 0.0152, -0.0014, 0.0237, 0.0007], device='cuda:0'), grad: tensor([-0.0068, 0.0059, -0.0256, 0.0390, -0.0054, 0.0425, -0.0387, 0.0103, -0.0045, -0.0167], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 2---------------------------------------------------- epoch 2, time 231.25, cls_loss 0.3231 cls_loss_mapping 0.4890 cls_loss_causal 1.6727 re_mapping 0.1563 re_causal 0.2430 /// teacc 93.14 lr 0.00010000 Epoch 4, weight, value: tensor([[-0.0399, 0.0377, 0.0280, ..., -0.0088, 0.0094, -0.0126], [ 0.0285, 0.0211, -0.0349, ..., -0.0418, -0.0185, 0.0287], [ 0.0327, -0.0044, -0.0236, ..., -0.0032, -0.0251, 0.0173], ..., [ 0.0119, 0.0292, 0.0291, ..., 0.0155, 0.0348, 0.0265], [ 0.0147, -0.0160, -0.0187, ..., -0.0386, -0.0189, 0.0267], [ 0.0009, 0.0343, -0.0074, ..., -0.0033, -0.0172, 0.0215]], device='cuda:0'), grad: tensor([[-6.9427e-03, -2.3895e-02, -2.1408e-02, ..., -2.5192e-02, 6.3658e-04, -2.8877e-03], [ 3.5095e-03, 3.6955e-04, 3.7460e-03, ..., 3.1166e-03, 8.3876e-04, 9.4593e-05], [-2.3926e-02, 4.6272e-03, -1.8311e-03, ..., -1.1002e-02, 6.0368e-04, -1.2459e-02], ..., [-1.6832e-03, -2.8286e-03, -2.7435e-02, ..., -3.6316e-03, -3.2745e-02, -1.8036e-02], [ 1.2085e-02, 1.2169e-02, 1.8158e-02, ..., 1.3412e-02, 4.6997e-03, 1.4366e-02], [ 8.0414e-03, -1.5160e-02, 3.3545e-04, ..., 1.1925e-02, 1.6388e-02, -6.1378e-03]], device='cuda:0') Epoch 4, bias, value: tensor([-0.0083, -0.0077, -0.0046, -0.0106, -0.0162, -0.0057, 0.0149, -0.0011, 0.0233, 0.0007], device='cuda:0'), grad: tensor([-0.0356, 0.0061, -0.0238, 0.0141, 0.0317, -0.0174, 0.0058, -0.0032, 0.0307, -0.0085], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 3---------------------------------------------------- epoch 3, time 230.96, cls_loss 0.2490 cls_loss_mapping 0.3600 cls_loss_causal 1.5010 re_mapping 0.1211 re_causal 0.2153 /// teacc 94.93 lr 0.00010000 Epoch 5, weight, value: tensor([[-0.0417, 0.0385, 0.0286, ..., -0.0085, 0.0079, -0.0141], [ 0.0292, 0.0212, -0.0366, ..., -0.0427, -0.0190, 0.0299], [ 0.0345, -0.0061, -0.0237, ..., -0.0029, -0.0248, 0.0163], ..., [ 0.0129, 0.0290, 0.0308, ..., 0.0162, 0.0375, 0.0260], [ 0.0153, -0.0186, -0.0224, ..., -0.0411, -0.0205, 0.0281], [-0.0012, 0.0359, -0.0057, ..., -0.0040, -0.0174, 0.0220]], device='cuda:0'), grad: tensor([[ 0.0006, -0.0004, -0.0003, ..., -0.0010, 0.0004, 0.0006], [-0.0272, -0.0137, 0.0006, ..., -0.0071, -0.0086, -0.0281], [ 0.0099, 0.0067, 0.0028, ..., 0.0053, 0.0053, 0.0085], ..., [-0.0113, -0.0112, -0.0245, ..., -0.0173, -0.0207, -0.0051], [ 0.0141, 0.0128, 0.0042, ..., 0.0067, 0.0031, 0.0190], [ 0.0095, 0.0090, 0.0152, ..., 0.0122, 0.0122, 0.0054]], device='cuda:0') Epoch 5, bias, value: tensor([-0.0082, -0.0076, -0.0043, -0.0104, -0.0162, -0.0062, 0.0144, -0.0010, 0.0234, 0.0007], device='cuda:0'), grad: tensor([ 0.0015, -0.0276, 0.0137, 0.0022, 0.0031, -0.0129, -0.0070, -0.0186, 0.0291, 0.0164], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 4---------------------------------------------------- epoch 4, time 230.80, cls_loss 0.1967 cls_loss_mapping 0.2618 cls_loss_causal 1.3837 re_mapping 0.0988 re_causal 0.1978 /// teacc 95.61 lr 0.00010000 Epoch 6, weight, value: tensor([[-0.0435, 0.0389, 0.0284, ..., -0.0084, 0.0068, -0.0149], [ 0.0297, 0.0227, -0.0381, ..., -0.0433, -0.0191, 0.0309], [ 0.0356, -0.0082, -0.0247, ..., -0.0029, -0.0255, 0.0153], ..., [ 0.0142, 0.0284, 0.0324, ..., 0.0169, 0.0394, 0.0261], [ 0.0164, -0.0208, -0.0246, ..., -0.0438, -0.0212, 0.0292], [-0.0038, 0.0377, -0.0044, ..., -0.0040, -0.0179, 0.0223]], device='cuda:0'), grad: tensor([[ 4.8447e-03, -1.4887e-03, -1.7977e-03, ..., -2.4700e-03, 1.3971e-04, 3.6097e-04], [-1.0719e-03, -1.5701e-02, -1.3733e-02, ..., -8.9111e-03, -6.4313e-05, -1.3161e-02], [ 1.1597e-03, 1.5335e-03, 1.5078e-03, ..., 1.4095e-03, 1.8072e-04, 1.7118e-03], ..., [-6.5136e-04, 4.1733e-03, 9.7656e-04, ..., 3.4618e-04, -3.2387e-03, 3.8948e-03], [-1.2444e-02, 5.9891e-03, 5.8174e-03, ..., 4.2038e-03, 2.2519e-04, -2.7447e-03], [ 5.8842e-04, 1.0742e-02, 6.9809e-03, ..., 3.3832e-04, 1.7824e-03, 9.4604e-03]], device='cuda:0') Epoch 6, bias, value: tensor([-0.0081, -0.0071, -0.0044, -0.0106, -0.0164, -0.0062, 0.0141, -0.0006, 0.0236, 0.0004], device='cuda:0'), grad: tensor([ 0.0181, -0.0152, 0.0045, 0.0120, -0.0160, 0.0135, 0.0078, 0.0042, -0.0415, 0.0128], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 5---------------------------------------------------- epoch 5, time 231.00, cls_loss 0.1648 cls_loss_mapping 0.2202 cls_loss_causal 1.3461 re_mapping 0.0845 re_causal 0.1866 /// teacc 96.50 lr 0.00010000 Epoch 7, weight, value: tensor([[-0.0448, 0.0392, 0.0286, ..., -0.0078, 0.0058, -0.0157], [ 0.0295, 0.0236, -0.0394, ..., -0.0445, -0.0196, 0.0320], [ 0.0365, -0.0102, -0.0257, ..., -0.0033, -0.0253, 0.0148], ..., [ 0.0148, 0.0279, 0.0336, ..., 0.0178, 0.0410, 0.0253], [ 0.0174, -0.0230, -0.0271, ..., -0.0458, -0.0221, 0.0303], [-0.0062, 0.0395, -0.0034, ..., -0.0043, -0.0187, 0.0222]], device='cuda:0'), grad: tensor([[ 0.0008, -0.0011, 0.0002, ..., 0.0009, 0.0003, 0.0003], [-0.0026, 0.0005, 0.0012, ..., 0.0008, -0.0003, -0.0047], [-0.0034, 0.0010, -0.0062, ..., -0.0117, 0.0001, 0.0027], ..., [-0.0081, -0.0045, -0.0128, ..., -0.0055, -0.0092, -0.0030], [-0.0015, -0.0029, 0.0020, ..., 0.0019, 0.0009, -0.0036], [ 0.0040, 0.0009, 0.0016, ..., 0.0020, 0.0028, 0.0036]], device='cuda:0') Epoch 7, bias, value: tensor([-0.0079, -0.0070, -0.0044, -0.0104, -0.0159, -0.0069, 0.0138, -0.0006, 0.0234, 0.0006], device='cuda:0'), grad: tensor([ 0.0025, -0.0015, -0.0120, 0.0113, 0.0019, 0.0014, 0.0009, -0.0065, -0.0031, 0.0051], device='cuda:0') 100 0.0001 changing lr epoch 6, time 214.86, cls_loss 0.1555 cls_loss_mapping 0.1990 cls_loss_causal 1.2760 re_mapping 0.0740 re_causal 0.1701 /// teacc 96.28 lr 0.00010000 Epoch 8, weight, value: tensor([[-0.0462, 0.0399, 0.0280, ..., -0.0080, 0.0046, -0.0164], [ 0.0300, 0.0240, -0.0402, ..., -0.0445, -0.0188, 0.0324], [ 0.0372, -0.0110, -0.0254, ..., -0.0028, -0.0252, 0.0140], ..., [ 0.0157, 0.0273, 0.0343, ..., 0.0181, 0.0423, 0.0252], [ 0.0183, -0.0244, -0.0287, ..., -0.0469, -0.0229, 0.0315], [-0.0080, 0.0407, -0.0023, ..., -0.0052, -0.0191, 0.0222]], device='cuda:0'), grad: tensor([[ 0.0011, 0.0012, 0.0027, ..., 0.0022, 0.0015, 0.0002], [-0.0056, -0.0027, 0.0010, ..., -0.0013, 0.0011, -0.0086], [-0.0023, 0.0006, -0.0048, ..., -0.0166, 0.0025, 0.0034], ..., [-0.0025, -0.0020, -0.0053, ..., 0.0010, -0.0102, 0.0034], [-0.0135, 0.0025, 0.0015, ..., -0.0071, 0.0009, -0.0045], [ 0.0027, -0.0085, -0.0126, ..., -0.0011, -0.0026, -0.0025]], device='cuda:0') Epoch 8, bias, value: tensor([-0.0077, -0.0069, -0.0042, -0.0106, -0.0161, -0.0070, 0.0135, -0.0008, 0.0238, 0.0007], device='cuda:0'), grad: tensor([ 0.0035, -0.0025, -0.0062, 0.0287, 0.0078, 0.0044, 0.0069, -0.0029, -0.0362, -0.0034], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 7---------------------------------------------------- epoch 7, time 231.65, cls_loss 0.1254 cls_loss_mapping 0.1577 cls_loss_causal 1.2108 re_mapping 0.0669 re_causal 0.1563 /// teacc 96.58 lr 0.00010000 Epoch 9, weight, value: tensor([[-0.0470, 0.0400, 0.0276, ..., -0.0083, 0.0035, -0.0170], [ 0.0305, 0.0250, -0.0412, ..., -0.0451, -0.0183, 0.0331], [ 0.0379, -0.0122, -0.0260, ..., -0.0023, -0.0251, 0.0135], ..., [ 0.0167, 0.0267, 0.0354, ..., 0.0189, 0.0437, 0.0247], [ 0.0190, -0.0264, -0.0298, ..., -0.0482, -0.0240, 0.0325], [-0.0099, 0.0426, -0.0013, ..., -0.0058, -0.0196, 0.0225]], device='cuda:0'), grad: tensor([[ 5.0783e-04, -3.1548e-03, 1.0719e-03, ..., -1.7424e-03, 4.3869e-04, 6.3467e-04], [-1.5078e-03, -7.1144e-04, 4.3678e-04, ..., 1.5050e-05, -2.7180e-03, -2.8667e-03], [ 1.8272e-03, 2.2106e-03, 1.0233e-03, ..., 9.5415e-04, 1.4057e-03, 2.2430e-03], ..., [-4.6272e-03, -1.1772e-02, -1.9196e-02, ..., -4.1428e-03, -8.1482e-03, -3.4571e-04], [-5.5122e-03, 3.2735e-04, 1.1435e-03, ..., 1.0223e-03, 4.0555e-04, -4.4250e-03], [ 4.7150e-03, 4.5509e-03, 1.2329e-02, ..., 3.3875e-03, 6.4201e-03, -9.6436e-03]], device='cuda:0') Epoch 9, bias, value: tensor([-0.0074, -0.0067, -0.0041, -0.0111, -0.0161, -0.0073, 0.0133, -0.0008, 0.0241, 0.0009], device='cuda:0'), grad: tensor([-0.0112, -0.0030, 0.0077, 0.0049, 0.0041, 0.0046, 0.0076, -0.0107, -0.0057, 0.0017], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 8---------------------------------------------------- epoch 8, time 231.29, cls_loss 0.1204 cls_loss_mapping 0.1486 cls_loss_causal 1.1595 re_mapping 0.0601 re_causal 0.1435 /// teacc 97.07 lr 0.00010000 Epoch 10, weight, value: tensor([[-0.0482, 0.0401, 0.0272, ..., -0.0080, 0.0025, -0.0177], [ 0.0310, 0.0252, -0.0426, ..., -0.0458, -0.0184, 0.0337], [ 0.0383, -0.0147, -0.0269, ..., -0.0029, -0.0250, 0.0128], ..., [ 0.0177, 0.0272, 0.0366, ..., 0.0199, 0.0451, 0.0249], [ 0.0199, -0.0275, -0.0313, ..., -0.0497, -0.0242, 0.0336], [-0.0117, 0.0436, -0.0004, ..., -0.0065, -0.0201, 0.0219]], device='cuda:0'), grad: tensor([[ 7.2479e-05, 1.6737e-03, 1.6737e-03, ..., 2.4676e-04, 1.2600e-04, 8.1182e-05], [-5.8770e-05, 2.7704e-04, 2.9683e-04, ..., 2.1422e-04, 1.1049e-05, -1.8883e-04], [ 4.5204e-04, 1.6556e-03, 1.3800e-03, ..., 3.4046e-04, 6.3229e-04, 2.7323e-04], ..., [-1.4143e-03, -8.2111e-04, -4.1695e-03, ..., -1.6098e-03, -3.2959e-03, 1.6832e-04], [-2.5392e-04, 1.6994e-03, 8.0061e-04, ..., 4.7112e-04, 1.5676e-04, -6.0558e-04], [ 4.5323e-04, -7.3242e-03, -3.7174e-03, ..., 1.0544e-04, 1.7042e-03, -1.7319e-03]], device='cuda:0') Epoch 10, bias, value: tensor([-0.0071, -0.0065, -0.0046, -0.0109, -0.0157, -0.0079, 0.0130, -0.0005, 0.0243, 0.0006], device='cuda:0'), grad: tensor([ 0.0020, 0.0005, 0.0015, 0.0006, -0.0028, 0.0010, 0.0036, -0.0012, 0.0010, -0.0061], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 9---------------------------------------------------- epoch 9, time 230.84, cls_loss 0.0973 cls_loss_mapping 0.1216 cls_loss_causal 1.1107 re_mapping 0.0573 re_causal 0.1388 /// teacc 97.32 lr 0.00010000 Epoch 11, weight, value: tensor([[-0.0497, 0.0407, 0.0269, ..., -0.0075, 0.0013, -0.0185], [ 0.0313, 0.0260, -0.0437, ..., -0.0469, -0.0182, 0.0344], [ 0.0391, -0.0161, -0.0273, ..., -0.0023, -0.0243, 0.0124], ..., [ 0.0183, 0.0273, 0.0372, ..., 0.0203, 0.0460, 0.0247], [ 0.0208, -0.0291, -0.0327, ..., -0.0513, -0.0251, 0.0345], [-0.0126, 0.0451, 0.0008, ..., -0.0067, -0.0204, 0.0220]], device='cuda:0'), grad: tensor([[ 0.0003, 0.0006, 0.0009, ..., 0.0006, 0.0002, 0.0001], [ 0.0005, 0.0008, 0.0016, ..., 0.0014, 0.0002, -0.0009], [-0.0022, 0.0008, 0.0005, ..., -0.0016, -0.0003, 0.0008], ..., [ 0.0019, 0.0077, 0.0087, ..., 0.0037, 0.0040, 0.0027], [-0.0005, 0.0013, 0.0009, ..., 0.0010, 0.0004, -0.0011], [-0.0018, -0.0242, -0.0300, ..., -0.0141, -0.0067, -0.0032]], device='cuda:0') Epoch 11, bias, value: tensor([-0.0072, -0.0066, -0.0044, -0.0112, -0.0158, -0.0083, 0.0130, -0.0002, 0.0245, 0.0009], device='cuda:0'), grad: tensor([ 1.2264e-03, 1.3876e-03, -2.6345e-04, 1.5327e-02, 1.1435e-03, -6.1560e-04, 6.2585e-06, 9.3460e-03, 1.1883e-03, -2.8748e-02], device='cuda:0') 100 0.0001 changing lr epoch 10, time 214.76, cls_loss 0.1025 cls_loss_mapping 0.1267 cls_loss_causal 1.0994 re_mapping 0.0517 re_causal 0.1271 /// teacc 97.25 lr 0.00010000 Epoch 12, weight, value: tensor([[-0.0506, 0.0410, 0.0268, ..., -0.0069, 0.0003, -0.0190], [ 0.0314, 0.0264, -0.0445, ..., -0.0479, -0.0181, 0.0352], [ 0.0394, -0.0171, -0.0282, ..., -0.0022, -0.0242, 0.0118], ..., [ 0.0191, 0.0271, 0.0380, ..., 0.0208, 0.0472, 0.0246], [ 0.0214, -0.0304, -0.0338, ..., -0.0526, -0.0251, 0.0351], [-0.0141, 0.0459, 0.0013, ..., -0.0076, -0.0210, 0.0218]], device='cuda:0'), grad: tensor([[ 0.0001, 0.0003, -0.0008, ..., -0.0008, 0.0004, 0.0002], [ 0.0002, 0.0030, 0.0004, ..., 0.0003, 0.0003, 0.0042], [ 0.0002, 0.0016, 0.0005, ..., 0.0006, 0.0003, 0.0004], ..., [-0.0008, -0.0014, -0.0041, ..., -0.0011, -0.0033, 0.0001], [-0.0010, -0.0093, 0.0003, ..., 0.0006, 0.0001, -0.0193], [ 0.0004, 0.0052, 0.0024, ..., 0.0021, 0.0013, 0.0009]], device='cuda:0') Epoch 12, bias, value: tensor([-0.0069, -0.0068, -0.0044, -0.0111, -0.0158, -0.0084, 0.0129, -0.0001, 0.0246, 0.0007], device='cuda:0'), grad: tensor([ 0.0011, 0.0072, 0.0021, 0.0070, 0.0148, 0.0092, -0.0173, -0.0011, -0.0288, 0.0056], device='cuda:0') 100 0.0001 changing lr epoch 11, time 214.93, cls_loss 0.0883 cls_loss_mapping 0.1132 cls_loss_causal 1.0790 re_mapping 0.0483 re_causal 0.1220 /// teacc 97.21 lr 0.00010000 Epoch 13, weight, value: tensor([[-0.0524, 0.0410, 0.0263, ..., -0.0069, -0.0006, -0.0197], [ 0.0314, 0.0276, -0.0446, ..., -0.0483, -0.0179, 0.0358], [ 0.0399, -0.0178, -0.0293, ..., -0.0023, -0.0247, 0.0115], ..., [ 0.0200, 0.0264, 0.0385, ..., 0.0213, 0.0485, 0.0246], [ 0.0223, -0.0316, -0.0339, ..., -0.0531, -0.0255, 0.0359], [-0.0153, 0.0468, 0.0020, ..., -0.0086, -0.0214, 0.0218]], device='cuda:0'), grad: tensor([[ 7.5245e-04, -9.6283e-03, 4.6682e-04, ..., -3.3455e-03, 1.1218e-04, 1.1814e-04], [-1.4229e-02, -5.9128e-03, -2.3689e-03, ..., -1.8585e-02, -4.9973e-04, -1.4820e-03], [ 9.7733e-03, 4.9896e-03, 3.9215e-03, ..., 1.5129e-02, -7.0190e-04, 1.3924e-03], ..., [ 1.3885e-03, 2.5005e-03, 1.3628e-03, ..., 1.0033e-03, -9.8586e-05, 8.6641e-04], [-1.4214e-02, -2.7370e-03, -6.0577e-03, ..., -1.2451e-02, 1.8382e-04, -6.4659e-04], [ 1.1415e-03, 1.2169e-02, 6.2828e-03, ..., 2.3041e-03, 8.1396e-04, 4.0970e-03]], device='cuda:0') Epoch 13, bias, value: tensor([-0.0071, -0.0070, -0.0043, -0.0110, -0.0158, -0.0087, 0.0126, -0.0001, 0.0252, 0.0007], device='cuda:0'), grad: tensor([-0.0119, -0.0182, 0.0153, 0.0171, -0.0175, 0.0057, 0.0061, 0.0044, -0.0178, 0.0168], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 12---------------------------------------------------- epoch 12, time 230.44, cls_loss 0.0744 cls_loss_mapping 0.0954 cls_loss_causal 1.0119 re_mapping 0.0455 re_causal 0.1126 /// teacc 97.58 lr 0.00010000 Epoch 14, weight, value: tensor([[-0.0533, 0.0419, 0.0258, ..., -0.0068, -0.0013, -0.0202], [ 0.0317, 0.0282, -0.0450, ..., -0.0482, -0.0177, 0.0364], [ 0.0400, -0.0189, -0.0298, ..., -0.0018, -0.0246, 0.0105], ..., [ 0.0204, 0.0264, 0.0389, ..., 0.0212, 0.0491, 0.0247], [ 0.0230, -0.0328, -0.0350, ..., -0.0540, -0.0263, 0.0367], [-0.0160, 0.0474, 0.0025, ..., -0.0094, -0.0214, 0.0217]], device='cuda:0'), grad: tensor([[ 7.3135e-05, -7.8630e-04, -1.0347e-04, ..., -1.8823e-04, 2.2933e-05, 1.6570e-05], [-8.0407e-05, 1.1033e-04, 3.6120e-05, ..., 1.1736e-04, -4.0054e-05, -1.3793e-04], [-2.5606e-04, 4.8375e-04, 9.1016e-05, ..., -3.5316e-05, -1.4627e-04, 1.4687e-04], ..., [ 1.3506e-04, 3.0541e-04, 1.1182e-04, ..., 1.6987e-04, 4.2200e-05, 1.1498e-04], [-2.9206e-05, 1.0691e-03, 3.1042e-04, ..., 5.6458e-04, 6.8843e-05, 7.4267e-05], [ 2.0444e-05, 2.6836e-03, 1.8024e-04, ..., 2.3401e-04, -1.8269e-05, 7.7105e-04]], device='cuda:0') Epoch 14, bias, value: tensor([-0.0070, -0.0068, -0.0042, -0.0110, -0.0159, -0.0088, 0.0126, -0.0005, 0.0252, 0.0008], device='cuda:0'), grad: tensor([ 1.5869e-03, 7.6532e-05, 2.9492e-04, -1.7052e-03, -3.9368e-03, 1.1196e-03, -2.7504e-03, 4.6372e-04, 1.4658e-03, 3.3798e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 13---------------------------------------------------- epoch 13, time 231.16, cls_loss 0.0683 cls_loss_mapping 0.0915 cls_loss_causal 0.9978 re_mapping 0.0413 re_causal 0.1072 /// teacc 97.60 lr 0.00010000 Epoch 15, weight, value: tensor([[-0.0540, 0.0426, 0.0253, ..., -0.0064, -0.0019, -0.0206], [ 0.0318, 0.0288, -0.0462, ..., -0.0492, -0.0174, 0.0369], [ 0.0405, -0.0204, -0.0305, ..., -0.0019, -0.0250, 0.0104], ..., [ 0.0206, 0.0265, 0.0396, ..., 0.0215, 0.0499, 0.0248], [ 0.0233, -0.0343, -0.0361, ..., -0.0548, -0.0268, 0.0372], [-0.0172, 0.0483, 0.0031, ..., -0.0102, -0.0217, 0.0217]], device='cuda:0'), grad: tensor([[ 1.2565e-04, -1.2379e-03, 2.7037e-04, ..., -4.1246e-04, 5.3525e-05, 2.1979e-05], [ 4.0889e-04, 2.0313e-03, 8.0442e-04, ..., 5.1832e-04, 3.1686e-04, 1.5650e-03], [-2.6417e-03, 1.0185e-03, 5.3501e-04, ..., -1.2131e-03, -1.4223e-05, -7.8082e-05], ..., [-1.0300e-03, 3.9673e-04, -9.6035e-04, ..., -1.4746e-04, -1.4067e-03, 1.4699e-04], [ 1.8425e-03, 2.0390e-03, 1.2455e-03, ..., 2.8286e-03, 3.1614e-04, 5.8746e-04], [ 4.7565e-04, -1.2352e-02, -4.1509e-04, ..., 5.9967e-03, 2.6536e-04, -1.3580e-02]], device='cuda:0') Epoch 15, bias, value: tensor([-0.0067, -0.0066, -0.0042, -0.0109, -0.0162, -0.0089, 0.0127, -0.0004, 0.0251, 0.0008], device='cuda:0'), grad: tensor([-1.2589e-03, 2.4986e-03, -1.6842e-03, -2.6016e-02, 1.1261e-02, 1.7929e-02, 1.0891e-03, -8.9407e-06, 4.2648e-03, -8.0795e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 14---------------------------------------------------- epoch 14, time 230.77, cls_loss 0.0694 cls_loss_mapping 0.0901 cls_loss_causal 0.9835 re_mapping 0.0401 re_causal 0.0973 /// teacc 97.68 lr 0.00010000 Epoch 16, weight, value: tensor([[-0.0557, 0.0433, 0.0249, ..., -0.0059, -0.0029, -0.0214], [ 0.0328, 0.0289, -0.0474, ..., -0.0494, -0.0171, 0.0373], [ 0.0407, -0.0212, -0.0309, ..., -0.0011, -0.0252, 0.0098], ..., [ 0.0208, 0.0268, 0.0406, ..., 0.0221, 0.0510, 0.0243], [ 0.0243, -0.0349, -0.0370, ..., -0.0561, -0.0271, 0.0381], [-0.0181, 0.0490, 0.0037, ..., -0.0109, -0.0217, 0.0217]], device='cuda:0'), grad: tensor([[ 1.6618e-04, -2.1785e-05, 3.9876e-05, ..., 7.1339e-06, 4.5657e-05, 1.4472e-04], [ 6.7043e-04, 1.0163e-04, 1.6773e-04, ..., 4.0936e-04, 2.3460e-04, 2.0373e-04], [-5.0545e-04, 4.2963e-04, 3.2592e-04, ..., -5.2786e-04, 1.1396e-04, 3.3998e-04], ..., [-7.0333e-04, -7.5245e-04, -1.3685e-03, ..., -4.3869e-04, -1.2436e-03, 1.2720e-04], [-7.4863e-04, 1.6184e-03, 2.1064e-04, ..., 4.7326e-04, 1.5318e-04, -4.7016e-04], [ 3.2043e-04, 1.3819e-03, 4.9829e-04, ..., 4.3893e-04, 5.2500e-04, 9.8133e-04]], device='cuda:0') Epoch 16, bias, value: tensor([-0.0066, -0.0066, -0.0042, -0.0112, -0.0160, -0.0092, 0.0129, -0.0004, 0.0254, 0.0006], device='cuda:0'), grad: tensor([ 0.0003, 0.0010, -0.0002, 0.0006, -0.0020, -0.0059, 0.0025, -0.0006, 0.0024, 0.0019], device='cuda:0') 100 0.0001 changing lr epoch 15, time 214.52, cls_loss 0.0726 cls_loss_mapping 0.0841 cls_loss_causal 0.9889 re_mapping 0.0384 re_causal 0.0922 /// teacc 97.36 lr 0.00010000 Epoch 17, weight, value: tensor([[-0.0567, 0.0438, 0.0244, ..., -0.0060, -0.0036, -0.0220], [ 0.0319, 0.0302, -0.0477, ..., -0.0504, -0.0175, 0.0378], [ 0.0416, -0.0222, -0.0317, ..., -0.0017, -0.0251, 0.0094], ..., [ 0.0219, 0.0267, 0.0413, ..., 0.0231, 0.0525, 0.0244], [ 0.0250, -0.0364, -0.0384, ..., -0.0571, -0.0275, 0.0389], [-0.0193, 0.0492, 0.0041, ..., -0.0117, -0.0222, 0.0216]], device='cuda:0'), grad: tensor([[ 6.0081e-04, -4.0114e-05, 4.6706e-04, ..., 2.2781e-04, 2.5439e-04, 3.6311e-04], [ 2.0905e-03, -2.8634e-04, 1.0929e-03, ..., 1.1454e-03, 1.3280e-04, 7.5865e-04], [-1.2680e-02, 8.1587e-04, 6.7854e-04, ..., -2.0523e-03, -1.8263e-03, -6.0387e-03], ..., [ 1.1154e-02, 4.1771e-03, 1.2299e-02, ..., 1.0269e-02, 2.7580e-03, 8.2092e-03], [ 4.1618e-03, 1.1501e-03, 2.4166e-03, ..., 2.4395e-03, 1.4820e-03, 2.2926e-03], [ 8.6021e-04, -3.0098e-03, -3.3550e-03, ..., 4.6182e-04, -1.5199e-04, -8.7833e-04]], device='cuda:0') Epoch 17, bias, value: tensor([-6.4189e-03, -6.6149e-03, -4.1205e-03, -1.0993e-02, -1.5864e-02, -9.0205e-03, 1.2469e-02, 3.3429e-07, 2.5280e-02, 9.3300e-05], device='cuda:0'), grad: tensor([ 0.0005, 0.0015, -0.0091, -0.0127, 0.0009, 0.0003, 0.0004, 0.0164, 0.0046, -0.0027], device='cuda:0') 100 0.0001 changing lr epoch 16, time 214.84, cls_loss 0.0616 cls_loss_mapping 0.0751 cls_loss_causal 0.9772 re_mapping 0.0374 re_causal 0.0935 /// teacc 97.67 lr 0.00010000 Epoch 18, weight, value: tensor([[-0.0576, 0.0438, 0.0241, ..., -0.0059, -0.0045, -0.0224], [ 0.0315, 0.0304, -0.0481, ..., -0.0503, -0.0180, 0.0379], [ 0.0422, -0.0228, -0.0325, ..., -0.0017, -0.0248, 0.0091], ..., [ 0.0223, 0.0262, 0.0416, ..., 0.0229, 0.0533, 0.0243], [ 0.0257, -0.0375, -0.0396, ..., -0.0579, -0.0281, 0.0398], [-0.0199, 0.0500, 0.0047, ..., -0.0121, -0.0220, 0.0214]], device='cuda:0'), grad: tensor([[ 5.5122e-04, 1.3185e-04, 4.3225e-04, ..., 7.4196e-04, 2.1279e-04, 1.6320e-04], [ 4.0174e-04, 1.4865e-04, 5.5933e-04, ..., 5.3883e-04, 1.7798e-04, 1.1009e-04], [-1.0710e-03, -6.2287e-06, 4.3035e-04, ..., -1.4610e-03, 2.5845e-04, 1.7989e-04], ..., [-2.1827e-04, 8.3160e-03, 1.0284e-02, ..., 2.5177e-03, 2.5291e-03, -1.4210e-03], [-3.0923e-04, -2.1374e-04, 3.4475e-04, ..., 7.7200e-04, 5.5933e-04, -1.8721e-03], [-1.1892e-03, -7.8430e-03, -1.4580e-02, ..., -5.0621e-03, -4.8561e-03, 2.3766e-03]], device='cuda:0') Epoch 18, bias, value: tensor([-0.0068, -0.0068, -0.0042, -0.0109, -0.0155, -0.0094, 0.0129, -0.0003, 0.0253, 0.0003], device='cuda:0'), grad: tensor([ 0.0014, 0.0008, -0.0028, 0.0030, -0.0028, -0.0002, 0.0006, 0.0062, -0.0006, -0.0056], device='cuda:0') 100 0.0001 changing lr epoch 17, time 214.73, cls_loss 0.0637 cls_loss_mapping 0.0804 cls_loss_causal 0.9249 re_mapping 0.0351 re_causal 0.0871 /// teacc 97.64 lr 0.00010000 Epoch 19, weight, value: tensor([[-0.0584, 0.0441, 0.0236, ..., -0.0055, -0.0056, -0.0231], [ 0.0320, 0.0305, -0.0487, ..., -0.0510, -0.0174, 0.0386], [ 0.0423, -0.0231, -0.0330, ..., -0.0015, -0.0245, 0.0083], ..., [ 0.0226, 0.0261, 0.0423, ..., 0.0230, 0.0541, 0.0243], [ 0.0262, -0.0385, -0.0407, ..., -0.0593, -0.0288, 0.0410], [-0.0209, 0.0509, 0.0053, ..., -0.0125, -0.0224, 0.0211]], device='cuda:0'), grad: tensor([[ 5.8621e-05, -4.5657e-05, 1.1694e-04, ..., 4.2677e-05, 9.8467e-05, 4.5121e-05], [-2.3483e-02, -6.8054e-03, -1.3374e-02, ..., -6.3057e-03, -1.6922e-02, -1.5793e-02], [-1.1110e-03, 2.7800e-04, 4.5538e-04, ..., 6.1369e-04, 1.7941e-04, -9.8133e-04], ..., [ 2.2278e-02, 6.1302e-03, 1.1856e-02, ..., 5.8594e-03, 1.5190e-02, 1.5152e-02], [-5.2243e-05, 2.6441e-04, 3.9816e-04, ..., 7.2670e-04, 6.2346e-05, -2.3544e-04], [ 4.2343e-04, 2.2125e-04, 4.9782e-04, ..., 4.9639e-04, 5.4312e-04, 2.3580e-04]], device='cuda:0') Epoch 19, bias, value: tensor([-6.8119e-03, -7.0288e-03, -3.9785e-03, -1.0953e-02, -1.5697e-02, -8.7681e-03, 1.2069e-02, -5.2090e-05, 2.5437e-02, 2.9523e-04], device='cuda:0'), grad: tensor([ 3.4094e-05, -2.1469e-02, -2.9678e-03, -1.0614e-03, 6.9475e-04, 1.4849e-03, 3.3379e-04, 2.0935e-02, 1.0843e-03, 9.2363e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 18---------------------------------------------------- epoch 18, time 229.04, cls_loss 0.0529 cls_loss_mapping 0.0677 cls_loss_causal 0.9468 re_mapping 0.0340 re_causal 0.0894 /// teacc 98.05 lr 0.00010000 Epoch 20, weight, value: tensor([[-0.0595, 0.0443, 0.0231, ..., -0.0056, -0.0066, -0.0236], [ 0.0318, 0.0311, -0.0496, ..., -0.0520, -0.0176, 0.0392], [ 0.0430, -0.0233, -0.0335, ..., -0.0009, -0.0238, 0.0079], ..., [ 0.0233, 0.0260, 0.0430, ..., 0.0235, 0.0550, 0.0243], [ 0.0267, -0.0392, -0.0415, ..., -0.0605, -0.0294, 0.0416], [-0.0223, 0.0512, 0.0055, ..., -0.0134, -0.0226, 0.0210]], device='cuda:0'), grad: tensor([[ 2.7275e-04, -3.9649e-04, -3.7462e-05, ..., -5.5283e-05, 4.6194e-05, 1.8466e-04], [ 1.2046e-04, 1.6308e-04, 1.5426e-04, ..., 2.1279e-04, -1.3256e-04, 1.1355e-04], [-1.1170e-04, -1.1605e-04, -1.1516e-04, ..., -6.2275e-04, -3.0184e-04, 8.7214e-04], ..., [ 3.0947e-04, 1.5295e-04, -2.1052e-04, ..., 1.4770e-04, -3.2216e-05, 3.4142e-04], [-9.4070e-03, 1.6975e-03, 5.5361e-04, ..., 1.8990e-04, 6.0469e-05, -1.0239e-02], [ 3.1528e-03, 2.6665e-03, 8.5771e-05, ..., 1.3828e-04, 9.7930e-05, 9.3460e-03]], device='cuda:0') Epoch 20, bias, value: tensor([-7.0737e-03, -7.1977e-03, -3.4984e-03, -1.1235e-02, -1.5931e-02, -8.6086e-03, 1.2708e-02, 5.7766e-05, 2.5508e-02, -1.2062e-04], device='cuda:0'), grad: tensor([-0.0003, 0.0005, 0.0014, 0.0006, -0.0129, 0.0043, 0.0090, 0.0008, -0.0152, 0.0118], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 19---------------------------------------------------- epoch 19, time 231.29, cls_loss 0.0562 cls_loss_mapping 0.0771 cls_loss_causal 0.9101 re_mapping 0.0320 re_causal 0.0845 /// teacc 98.20 lr 0.00010000 Epoch 21, weight, value: tensor([[-0.0604, 0.0448, 0.0226, ..., -0.0057, -0.0071, -0.0240], [ 0.0323, 0.0310, -0.0506, ..., -0.0520, -0.0171, 0.0397], [ 0.0430, -0.0238, -0.0340, ..., -0.0010, -0.0237, 0.0074], ..., [ 0.0233, 0.0261, 0.0436, ..., 0.0237, 0.0557, 0.0242], [ 0.0280, -0.0399, -0.0420, ..., -0.0610, -0.0291, 0.0425], [-0.0232, 0.0516, 0.0055, ..., -0.0142, -0.0235, 0.0209]], device='cuda:0'), grad: tensor([[ 1.3626e-04, 1.2457e-04, 3.1203e-05, ..., 2.7746e-05, 2.6926e-05, 4.7982e-05], [ 7.6413e-05, 1.8179e-05, 8.6963e-05, ..., 4.3869e-05, 1.2573e-07, 2.8476e-05], [-1.0881e-03, 4.1187e-05, -2.3639e-04, ..., -5.9748e-04, -7.5483e-04, 6.0529e-05], ..., [ 5.7077e-04, 8.2076e-05, 2.2316e-04, ..., 3.5930e-04, 5.4169e-04, 1.7071e-04], [ 4.1366e-04, 1.0395e-03, 8.0395e-04, ..., 2.1255e-04, 1.1539e-04, 9.6464e-04], [ 3.8986e-03, 9.1400e-03, 8.7204e-03, ..., 4.3654e-04, 9.4223e-04, 1.2726e-02]], device='cuda:0') Epoch 21, bias, value: tensor([-7.1765e-03, -7.1599e-03, -3.7100e-03, -1.1132e-02, -1.6029e-02, -8.8079e-03, 1.2442e-02, 1.2523e-04, 2.5943e-02, -4.6145e-05], device='cuda:0'), grad: tensor([ 6.2180e-04, 2.8467e-04, -1.1215e-03, -2.9355e-05, -1.9073e-02, 1.1311e-03, -3.1433e-03, 7.9393e-04, 2.4815e-03, 1.8051e-02], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 20---------------------------------------------------- epoch 20, time 230.81, cls_loss 0.0493 cls_loss_mapping 0.0664 cls_loss_causal 0.9091 re_mapping 0.0309 re_causal 0.0811 /// teacc 98.22 lr 0.00010000 Epoch 22, weight, value: tensor([[-0.0610, 0.0451, 0.0222, ..., -0.0057, -0.0072, -0.0247], [ 0.0321, 0.0313, -0.0517, ..., -0.0523, -0.0173, 0.0400], [ 0.0429, -0.0245, -0.0348, ..., -0.0013, -0.0238, 0.0068], ..., [ 0.0241, 0.0260, 0.0441, ..., 0.0236, 0.0565, 0.0244], [ 0.0287, -0.0407, -0.0428, ..., -0.0614, -0.0292, 0.0430], [-0.0242, 0.0520, 0.0057, ..., -0.0146, -0.0238, 0.0209]], device='cuda:0'), grad: tensor([[ 6.6161e-05, -3.5739e-04, 1.5116e-04, ..., 9.6858e-05, 1.2660e-04, 1.3269e-05], [ 3.0231e-04, 4.7594e-05, 1.6415e-04, ..., 1.4138e-04, 3.2520e-04, 4.8161e-05], [-4.8971e-04, 2.4283e-04, -4.0643e-06, ..., -6.6614e-04, -3.5858e-04, 1.3697e-04], ..., [-7.1287e-04, -3.3188e-04, -7.2908e-04, ..., 2.6658e-05, -1.1215e-03, -2.2137e-04], [ 6.4373e-05, 2.3580e-04, 1.9383e-04, ..., 1.5819e-04, 1.5402e-04, -2.5320e-04], [ 1.1927e-04, 1.1778e-03, 1.2827e-03, ..., 1.0891e-03, 1.7524e-04, -2.6003e-05]], device='cuda:0') Epoch 22, bias, value: tensor([-6.8432e-03, -7.1374e-03, -4.0128e-03, -1.0986e-02, -1.5977e-02, -8.7012e-03, 1.2376e-02, 3.2097e-05, 2.5943e-02, -1.4922e-04], device='cuda:0'), grad: tensor([-0.0006, 0.0003, -0.0004, 0.0016, 0.0007, -0.0070, 0.0004, -0.0002, 0.0005, 0.0046], device='cuda:0') 100 0.0001 changing lr epoch 21, time 214.81, cls_loss 0.0481 cls_loss_mapping 0.0610 cls_loss_causal 0.9001 re_mapping 0.0299 re_causal 0.0779 /// teacc 98.17 lr 0.00010000 Epoch 23, weight, value: tensor([[-0.0619, 0.0453, 0.0218, ..., -0.0059, -0.0080, -0.0254], [ 0.0322, 0.0322, -0.0524, ..., -0.0523, -0.0169, 0.0407], [ 0.0436, -0.0254, -0.0347, ..., -0.0012, -0.0232, 0.0063], ..., [ 0.0245, 0.0263, 0.0450, ..., 0.0241, 0.0577, 0.0242], [ 0.0294, -0.0414, -0.0435, ..., -0.0622, -0.0300, 0.0435], [-0.0243, 0.0528, 0.0060, ..., -0.0152, -0.0243, 0.0211]], device='cuda:0'), grad: tensor([[ 3.0923e-04, 2.0742e-04, 5.5408e-04, ..., 2.8658e-04, 1.9538e-04, 1.0885e-05], [-2.9698e-05, -3.3665e-04, 4.4346e-05, ..., 1.0705e-04, -1.7357e-04, -3.3450e-04], [ 4.1847e-03, 1.3971e-03, 7.7782e-03, ..., 4.7569e-03, 6.5498e-03, 6.1393e-05], ..., [-5.3406e-03, -1.8063e-03, -9.9258e-03, ..., -6.0349e-03, -7.4959e-03, 2.2089e-04], [-1.2958e-04, 7.9393e-05, 1.0169e-04, ..., 6.3121e-05, -7.5817e-05, -2.0885e-04], [ 2.2733e-04, 2.3854e-04, 5.9336e-05, ..., 2.5010e-04, 1.9979e-04, 3.2425e-04]], device='cuda:0') Epoch 23, bias, value: tensor([-0.0069, -0.0070, -0.0038, -0.0115, -0.0162, -0.0091, 0.0125, 0.0001, 0.0261, 0.0003], device='cuda:0'), grad: tensor([ 5.7507e-04, -4.0817e-04, 7.2021e-03, 7.6532e-04, -1.8239e-04, 4.1866e-04, -1.4460e-04, -8.8196e-03, -5.9634e-05, 6.5804e-04], device='cuda:0') 100 0.0001 changing lr epoch 22, time 215.05, cls_loss 0.0522 cls_loss_mapping 0.0684 cls_loss_causal 0.8888 re_mapping 0.0288 re_causal 0.0762 /// teacc 98.10 lr 0.00010000 Epoch 24, weight, value: tensor([[-0.0627, 0.0454, 0.0212, ..., -0.0062, -0.0088, -0.0260], [ 0.0326, 0.0327, -0.0535, ..., -0.0515, -0.0169, 0.0413], [ 0.0434, -0.0267, -0.0356, ..., -0.0016, -0.0238, 0.0059], ..., [ 0.0248, 0.0263, 0.0455, ..., 0.0243, 0.0583, 0.0238], [ 0.0303, -0.0424, -0.0441, ..., -0.0625, -0.0293, 0.0444], [-0.0250, 0.0538, 0.0065, ..., -0.0157, -0.0245, 0.0209]], device='cuda:0'), grad: tensor([[ 4.9442e-05, -1.1921e-04, 1.6665e-04, ..., 4.4137e-05, 7.5459e-05, 1.0774e-05], [ 4.2105e-04, 1.1176e-04, 1.4591e-04, ..., 3.2115e-04, 2.8920e-04, 1.5008e-04], [-7.2432e-04, 1.7536e-04, 1.6749e-04, ..., -5.9652e-04, -5.0259e-04, -1.2338e-04], ..., [-1.3375e-04, 9.4295e-05, -6.2037e-04, ..., -1.5116e-04, -3.8743e-04, 1.0544e-04], [-1.2875e-04, 2.3794e-04, 2.1100e-04, ..., 2.0850e-04, 1.0431e-04, -2.6059e-04], [ 1.1539e-04, 5.7745e-04, 3.5572e-04, ..., 1.1551e-04, 5.2691e-04, 3.7998e-05]], device='cuda:0') Epoch 24, bias, value: tensor([-7.1215e-03, -6.3597e-03, -4.2000e-03, -1.1755e-02, -1.6257e-02, -8.9121e-03, 1.2129e-02, 5.5141e-05, 2.6057e-02, 7.3294e-04], device='cuda:0'), grad: tensor([-9.0122e-05, 6.2180e-04, -7.4482e-04, 8.1396e-04, -1.4801e-03, -3.3021e-04, 7.3195e-05, 1.2644e-05, 9.2268e-05, 1.0300e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 23---------------------------------------------------- epoch 23, time 231.37, cls_loss 0.0410 cls_loss_mapping 0.0539 cls_loss_causal 0.8602 re_mapping 0.0289 re_causal 0.0751 /// teacc 98.40 lr 0.00010000 Epoch 25, weight, value: tensor([[-0.0635, 0.0456, 0.0209, ..., -0.0061, -0.0095, -0.0263], [ 0.0323, 0.0335, -0.0534, ..., -0.0520, -0.0169, 0.0416], [ 0.0441, -0.0278, -0.0364, ..., -0.0016, -0.0242, 0.0057], ..., [ 0.0251, 0.0264, 0.0461, ..., 0.0248, 0.0592, 0.0234], [ 0.0306, -0.0438, -0.0448, ..., -0.0635, -0.0293, 0.0451], [-0.0253, 0.0540, 0.0065, ..., -0.0165, -0.0248, 0.0207]], device='cuda:0'), grad: tensor([[ 2.1890e-05, -2.5463e-04, 1.2660e-04, ..., 5.0962e-06, 1.3530e-05, 3.2838e-06], [ 1.5364e-03, 6.7663e-04, 2.3985e-04, ..., 9.4271e-04, 7.8154e-04, 3.7694e-04], [-1.4496e-03, -5.2595e-04, -1.6701e-04, ..., -9.9945e-04, -8.6784e-04, 3.0696e-05], ..., [ 1.7262e-04, 6.0129e-04, 7.5102e-04, ..., 3.4976e-04, 1.0836e-04, 4.7743e-05], [-1.8859e-04, 8.1491e-04, 6.9618e-04, ..., 3.7408e-04, 1.1504e-04, -7.3576e-04], [-2.3890e-04, 1.0139e-02, 1.4694e-02, ..., 8.2092e-03, -2.2948e-04, 8.7202e-05]], device='cuda:0') Epoch 25, bias, value: tensor([-0.0072, -0.0067, -0.0037, -0.0117, -0.0162, -0.0090, 0.0124, 0.0003, 0.0258, 0.0004], device='cuda:0'), grad: tensor([-0.0004, 0.0021, -0.0017, -0.0133, 0.0013, -0.0030, 0.0002, 0.0008, 0.0008, 0.0133], device='cuda:0') 100 0.0001 changing lr epoch 24, time 214.65, cls_loss 0.0394 cls_loss_mapping 0.0513 cls_loss_causal 0.8296 re_mapping 0.0283 re_causal 0.0723 /// teacc 98.19 lr 0.00010000 Epoch 26, weight, value: tensor([[-0.0645, 0.0453, 0.0205, ..., -0.0064, -0.0103, -0.0265], [ 0.0320, 0.0336, -0.0542, ..., -0.0527, -0.0175, 0.0416], [ 0.0443, -0.0283, -0.0369, ..., -0.0013, -0.0244, 0.0052], ..., [ 0.0257, 0.0264, 0.0466, ..., 0.0251, 0.0601, 0.0235], [ 0.0316, -0.0443, -0.0450, ..., -0.0640, -0.0287, 0.0460], [-0.0261, 0.0545, 0.0069, ..., -0.0172, -0.0251, 0.0206]], device='cuda:0'), grad: tensor([[-2.6083e-04, -3.7408e-04, 1.2624e-04, ..., 5.8353e-05, 1.1152e-04, -1.2302e-04], [ 7.9803e-03, 4.4098e-03, 5.7411e-03, ..., 3.2310e-03, 6.7482e-03, 9.7656e-03], [-1.1140e-04, 3.2806e-04, 4.7374e-04, ..., 3.1209e-04, 9.0718e-05, 3.3236e-04], ..., [-2.4246e-02, -1.8143e-02, -3.1860e-02, ..., -2.0416e-02, -2.4261e-02, -2.1194e-02], [ 4.3716e-03, 3.3264e-03, 3.6850e-03, ..., 2.1420e-03, 2.9812e-03, 4.9934e-03], [ 1.3056e-03, 8.2970e-04, 1.5440e-03, ..., 1.2951e-03, 1.1797e-03, 8.7833e-04]], device='cuda:0') Epoch 26, bias, value: tensor([-0.0076, -0.0070, -0.0038, -0.0122, -0.0160, -0.0085, 0.0124, 0.0007, 0.0261, 0.0003], device='cuda:0'), grad: tensor([-0.0008, 0.0095, 0.0003, 0.0159, 0.0009, -0.0002, -0.0009, -0.0333, 0.0066, 0.0019], device='cuda:0') 100 0.0001 changing lr epoch 25, time 214.61, cls_loss 0.0370 cls_loss_mapping 0.0490 cls_loss_causal 0.7954 re_mapping 0.0276 re_causal 0.0724 /// teacc 98.31 lr 0.00010000 Epoch 27, weight, value: tensor([[-0.0652, 0.0455, 0.0201, ..., -0.0065, -0.0108, -0.0269], [ 0.0319, 0.0343, -0.0543, ..., -0.0527, -0.0174, 0.0420], [ 0.0449, -0.0295, -0.0378, ..., -0.0014, -0.0245, 0.0050], ..., [ 0.0261, 0.0266, 0.0472, ..., 0.0256, 0.0609, 0.0236], [ 0.0315, -0.0452, -0.0460, ..., -0.0651, -0.0293, 0.0463], [-0.0266, 0.0546, 0.0072, ..., -0.0178, -0.0252, 0.0203]], device='cuda:0'), grad: tensor([[ 6.1095e-05, -1.2684e-03, 8.1882e-06, ..., -6.0177e-04, 4.5687e-05, 1.2428e-05], [ 2.2113e-04, -3.4511e-05, 2.0540e-04, ..., 3.7289e-04, 1.7321e-04, -1.4591e-04], [-4.6616e-03, -9.0361e-04, -3.5667e-03, ..., -6.3057e-03, -4.2610e-03, 1.4150e-04], ..., [ 4.0779e-03, 1.4105e-03, 3.1509e-03, ..., 5.8746e-03, 3.7022e-03, 1.4913e-04], [-6.7616e-04, 2.0623e-04, -1.2880e-06, ..., -1.7345e-05, 3.4332e-05, -1.0176e-03], [ 7.1049e-05, 6.2990e-04, 3.3426e-04, ..., 4.6206e-04, 4.5687e-05, 8.4043e-05]], device='cuda:0') Epoch 27, bias, value: tensor([-0.0076, -0.0067, -0.0039, -0.0122, -0.0157, -0.0085, 0.0123, 0.0009, 0.0258, 0.0001], device='cuda:0'), grad: tensor([-1.5726e-03, 2.5845e-04, -5.2605e-03, 6.0654e-04, -3.4547e-04, 4.4417e-04, 6.1512e-05, 5.6343e-03, -7.5197e-04, 9.2077e-04], device='cuda:0') 100 0.0001 changing lr epoch 26, time 215.10, cls_loss 0.0322 cls_loss_mapping 0.0477 cls_loss_causal 0.8245 re_mapping 0.0261 re_causal 0.0701 /// teacc 98.33 lr 0.00010000 Epoch 28, weight, value: tensor([[-0.0661, 0.0457, 0.0197, ..., -0.0064, -0.0111, -0.0275], [ 0.0315, 0.0348, -0.0550, ..., -0.0530, -0.0177, 0.0423], [ 0.0452, -0.0302, -0.0380, ..., -0.0012, -0.0245, 0.0048], ..., [ 0.0265, 0.0264, 0.0476, ..., 0.0257, 0.0616, 0.0239], [ 0.0321, -0.0462, -0.0468, ..., -0.0657, -0.0295, 0.0470], [-0.0267, 0.0552, 0.0078, ..., -0.0186, -0.0253, 0.0201]], device='cuda:0'), grad: tensor([[ 1.6257e-05, -4.2588e-05, 3.6955e-05, ..., -2.0284e-06, 2.2322e-05, 3.6508e-06], [ 2.9374e-06, -3.2157e-05, 2.7061e-05, ..., 3.2902e-05, 1.1928e-05, -4.3213e-05], [-8.9128e-07, 5.0008e-05, 4.8310e-05, ..., 1.4547e-06, 4.3511e-05, 1.5251e-05], ..., [-1.7500e-04, -1.1057e-04, -3.1281e-04, ..., -1.9705e-04, -3.1233e-04, 2.0698e-05], [-1.8299e-05, 1.2141e-04, 9.6083e-05, ..., 1.4448e-04, 2.6613e-05, -3.7789e-05], [ 7.1943e-05, 1.0633e-04, 7.7188e-05, ..., 2.1970e-04, 9.6202e-05, 2.6107e-05]], device='cuda:0') Epoch 28, bias, value: tensor([-0.0076, -0.0069, -0.0040, -0.0120, -0.0158, -0.0088, 0.0124, 0.0009, 0.0257, 0.0004], device='cuda:0'), grad: tensor([ 3.9041e-05, -1.1899e-05, 7.7784e-05, 1.2512e-03, -9.6440e-05, -1.4696e-03, -2.6643e-05, -2.3746e-04, 1.8024e-04, 2.9516e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 27---------------------------------------------------- epoch 27, time 231.11, cls_loss 0.0360 cls_loss_mapping 0.0535 cls_loss_causal 0.8226 re_mapping 0.0262 re_causal 0.0724 /// teacc 98.44 lr 0.00010000 Epoch 29, weight, value: tensor([[-0.0668, 0.0459, 0.0193, ..., -0.0065, -0.0116, -0.0278], [ 0.0313, 0.0348, -0.0556, ..., -0.0536, -0.0177, 0.0427], [ 0.0455, -0.0307, -0.0387, ..., -0.0009, -0.0248, 0.0045], ..., [ 0.0268, 0.0262, 0.0479, ..., 0.0257, 0.0622, 0.0237], [ 0.0324, -0.0469, -0.0472, ..., -0.0668, -0.0299, 0.0476], [-0.0277, 0.0559, 0.0081, ..., -0.0191, -0.0257, 0.0198]], device='cuda:0'), grad: tensor([[ 8.4698e-05, -8.8501e-04, -3.2735e-04, ..., -3.0041e-04, 2.1327e-06, 5.5760e-05], [-2.8563e-04, -3.3641e-04, -3.9101e-04, ..., 9.1136e-05, -3.6049e-04, -4.5013e-04], [-1.6012e-03, 1.2553e-04, 4.9353e-05, ..., -1.1244e-03, 1.8969e-05, -2.2876e-04], ..., [ 5.2023e-04, 3.7003e-04, 3.3092e-04, ..., 1.0610e-04, 2.3699e-04, 4.7898e-04], [ 1.9360e-04, -2.0303e-06, 2.2089e-04, ..., 8.0013e-04, 4.2886e-05, -9.2602e-04], [ 2.4140e-04, 5.1165e-04, 7.3835e-06, ..., 1.6809e-04, -3.3796e-05, 1.6022e-04]], device='cuda:0') Epoch 29, bias, value: tensor([-0.0077, -0.0071, -0.0038, -0.0120, -0.0152, -0.0089, 0.0128, 0.0007, 0.0255, 0.0003], device='cuda:0'), grad: tensor([-8.5020e-04, -2.5630e-04, -2.3289e-03, 7.7200e-04, 7.9870e-05, -4.7016e-04, 8.8978e-04, 8.6784e-04, 3.9196e-04, 9.0504e-04], device='cuda:0') 100 0.0001 changing lr epoch 28, time 214.83, cls_loss 0.0353 cls_loss_mapping 0.0468 cls_loss_causal 0.8024 re_mapping 0.0253 re_causal 0.0663 /// teacc 98.34 lr 0.00010000 Epoch 30, weight, value: tensor([[-0.0677, 0.0464, 0.0189, ..., -0.0064, -0.0124, -0.0282], [ 0.0316, 0.0354, -0.0563, ..., -0.0540, -0.0171, 0.0432], [ 0.0457, -0.0309, -0.0390, ..., -0.0009, -0.0250, 0.0041], ..., [ 0.0270, 0.0266, 0.0484, ..., 0.0260, 0.0630, 0.0234], [ 0.0330, -0.0477, -0.0475, ..., -0.0671, -0.0297, 0.0484], [-0.0284, 0.0564, 0.0082, ..., -0.0195, -0.0261, 0.0198]], device='cuda:0'), grad: tensor([[ 5.0902e-05, -1.0490e-04, 5.7459e-05, ..., -1.3486e-05, 4.7088e-05, 6.9365e-06], [ 1.3582e-05, -1.9848e-04, 9.1910e-05, ..., 6.2108e-05, 5.6595e-05, -2.6536e-04], [-2.7679e-02, -4.2419e-03, -2.6520e-02, ..., -9.3842e-03, -2.2354e-02, 4.6998e-05], ..., [ 2.7039e-02, 4.2877e-03, 2.5955e-02, ..., 9.1400e-03, 2.1805e-02, 8.1718e-05], [ 2.3365e-05, 3.0780e-04, 2.7251e-04, ..., 1.1343e-04, 5.0247e-05, -1.8701e-05], [ 8.8453e-05, -6.4659e-04, -6.5136e-04, ..., 4.4703e-05, 1.8775e-05, -7.3135e-05]], device='cuda:0') Epoch 30, bias, value: tensor([-0.0076, -0.0070, -0.0036, -0.0123, -0.0156, -0.0087, 0.0121, 0.0010, 0.0257, 0.0003], device='cuda:0'), grad: tensor([-3.1978e-05, -1.6284e-04, -2.5864e-02, 2.2182e-03, 1.0481e-03, -1.8415e-03, -3.3450e-04, 2.5436e-02, 4.0889e-04, -8.7261e-04], device='cuda:0') 100 0.0001 changing lr epoch 29, time 214.76, cls_loss 0.0360 cls_loss_mapping 0.0462 cls_loss_causal 0.8216 re_mapping 0.0244 re_causal 0.0637 /// teacc 98.39 lr 0.00010000 Epoch 31, weight, value: tensor([[-0.0689, 0.0470, 0.0187, ..., -0.0064, -0.0132, -0.0288], [ 0.0314, 0.0360, -0.0571, ..., -0.0539, -0.0170, 0.0437], [ 0.0468, -0.0313, -0.0393, ..., -0.0012, -0.0243, 0.0038], ..., [ 0.0272, 0.0266, 0.0491, ..., 0.0261, 0.0637, 0.0232], [ 0.0330, -0.0489, -0.0484, ..., -0.0679, -0.0300, 0.0490], [-0.0281, 0.0565, 0.0086, ..., -0.0199, -0.0265, 0.0197]], device='cuda:0'), grad: tensor([[ 6.2525e-05, 2.5034e-05, 7.7248e-05, ..., 5.7817e-06, 6.6876e-05, 3.4600e-05], [ 2.3469e-05, -1.2946e-04, 1.6987e-04, ..., 6.0618e-05, 8.0287e-05, -3.2401e-04], [ 1.5554e-03, 1.1473e-03, 2.4166e-03, ..., 8.2588e-04, 2.3098e-03, 1.3483e-04], ..., [-3.7365e-03, -2.5158e-03, -5.5923e-03, ..., -1.6718e-03, -5.1994e-03, 5.3793e-05], [-1.5986e-04, 1.9503e-04, 2.8968e-04, ..., 1.2720e-04, 2.1219e-04, -3.8004e-04], [ 1.8044e-03, 1.8587e-03, 2.8572e-03, ..., 1.0195e-03, 2.0752e-03, 4.6396e-04]], device='cuda:0') Epoch 31, bias, value: tensor([-7.5410e-03, -7.1090e-03, -3.3202e-03, -1.2278e-02, -1.5424e-02, -8.4791e-03, 1.2319e-02, 1.0175e-03, 2.5229e-02, 9.9798e-05], device='cuda:0'), grad: tensor([ 0.0002, -0.0001, 0.0028, 0.0004, -0.0008, 0.0023, -0.0027, -0.0055, -0.0003, 0.0038], device='cuda:0') 100 0.0001 changing lr epoch 30, time 214.77, cls_loss 0.0350 cls_loss_mapping 0.0442 cls_loss_causal 0.7742 re_mapping 0.0233 re_causal 0.0607 /// teacc 98.38 lr 0.00010000 Epoch 32, weight, value: tensor([[-0.0698, 0.0471, 0.0180, ..., -0.0063, -0.0140, -0.0294], [ 0.0312, 0.0365, -0.0574, ..., -0.0544, -0.0170, 0.0439], [ 0.0470, -0.0316, -0.0396, ..., -0.0010, -0.0241, 0.0039], ..., [ 0.0279, 0.0261, 0.0493, ..., 0.0265, 0.0646, 0.0231], [ 0.0338, -0.0493, -0.0490, ..., -0.0687, -0.0303, 0.0501], [-0.0289, 0.0567, 0.0087, ..., -0.0203, -0.0270, 0.0192]], device='cuda:0'), grad: tensor([[ 6.7174e-05, 6.9439e-05, 1.6212e-04, ..., 7.9989e-05, 6.5923e-05, 2.4542e-05], [ 4.0591e-05, -3.8475e-05, 2.2411e-04, ..., 1.3733e-04, 1.0449e-04, -1.2481e-04], [ 7.4625e-04, 3.6907e-04, 1.5097e-03, ..., 1.1606e-03, 3.1352e-04, 1.1253e-04], ..., [ 2.9640e-03, 4.4346e-04, 5.8556e-03, ..., 5.1460e-03, 4.7231e-04, 4.2975e-05], [ 4.9400e-04, 4.3368e-04, 7.8583e-04, ..., 8.7214e-04, 4.0233e-05, -2.6393e-04], [ 7.9572e-05, 3.7384e-03, 2.6093e-03, ..., 9.5701e-04, 9.1374e-05, 3.2067e-05]], device='cuda:0') Epoch 32, bias, value: tensor([-0.0077, -0.0074, -0.0029, -0.0124, -0.0151, -0.0085, 0.0121, 0.0012, 0.0254, -0.0003], device='cuda:0'), grad: tensor([ 0.0003, 0.0001, 0.0016, -0.0063, -0.0119, 0.0005, 0.0003, 0.0045, 0.0012, 0.0097], device='cuda:0') 100 0.0001 changing lr epoch 31, time 214.71, cls_loss 0.0250 cls_loss_mapping 0.0352 cls_loss_causal 0.8064 re_mapping 0.0222 re_causal 0.0629 /// teacc 98.33 lr 0.00010000 Epoch 33, weight, value: tensor([[-0.0699, 0.0477, 0.0177, ..., -0.0065, -0.0144, -0.0296], [ 0.0305, 0.0366, -0.0582, ..., -0.0548, -0.0170, 0.0441], [ 0.0467, -0.0323, -0.0402, ..., -0.0011, -0.0248, 0.0033], ..., [ 0.0283, 0.0263, 0.0498, ..., 0.0267, 0.0655, 0.0230], [ 0.0348, -0.0502, -0.0500, ..., -0.0694, -0.0304, 0.0508], [-0.0289, 0.0572, 0.0091, ..., -0.0206, -0.0270, 0.0191]], device='cuda:0'), grad: tensor([[ 6.2704e-05, -1.7917e-04, 6.0737e-05, ..., -9.1612e-05, 4.6223e-05, 3.1084e-05], [ 2.8777e-04, 3.2216e-05, 1.0037e-04, ..., 5.9694e-05, 8.0585e-05, 2.3520e-04], [ 3.9139e-03, 1.2655e-03, 4.5052e-03, ..., 3.4695e-03, 4.5700e-03, 3.4356e-04], ..., [-4.1618e-03, -1.5039e-03, -5.4169e-03, ..., -3.7212e-03, -5.4474e-03, 9.5963e-05], [-1.5249e-03, -3.2634e-05, 1.4699e-04, ..., 7.2539e-05, 4.5419e-05, -1.7605e-03], [ 4.2319e-04, 1.8346e-04, 4.4370e-04, ..., 2.1029e-04, 4.2057e-04, 1.4031e-04]], device='cuda:0') Epoch 33, bias, value: tensor([-0.0074, -0.0077, -0.0033, -0.0123, -0.0152, -0.0087, 0.0124, 0.0014, 0.0255, -0.0002], device='cuda:0'), grad: tensor([-0.0001, 0.0005, 0.0054, 0.0008, 0.0003, 0.0002, 0.0002, -0.0057, -0.0024, 0.0009], device='cuda:0') 100 0.0001 changing lr epoch 32, time 214.74, cls_loss 0.0275 cls_loss_mapping 0.0347 cls_loss_causal 0.7906 re_mapping 0.0228 re_causal 0.0609 /// teacc 98.28 lr 0.00010000 Epoch 34, weight, value: tensor([[-0.0709, 0.0481, 0.0177, ..., -0.0064, -0.0148, -0.0302], [ 0.0306, 0.0370, -0.0588, ..., -0.0549, -0.0168, 0.0445], [ 0.0468, -0.0333, -0.0409, ..., -0.0014, -0.0252, 0.0028], ..., [ 0.0289, 0.0266, 0.0504, ..., 0.0267, 0.0662, 0.0232], [ 0.0354, -0.0512, -0.0508, ..., -0.0701, -0.0307, 0.0514], [-0.0297, 0.0576, 0.0095, ..., -0.0214, -0.0273, 0.0190]], device='cuda:0'), grad: tensor([[ 1.1760e-04, -2.8815e-06, 4.7266e-05, ..., 9.4697e-06, 1.6481e-05, 3.3885e-05], [ 2.0540e-04, -2.6841e-06, 4.9859e-05, ..., 5.8830e-05, 8.5384e-06, -2.1487e-05], [ 1.0481e-03, 5.2273e-05, -6.3926e-06, ..., -8.1420e-05, -1.2085e-05, 3.4189e-04], ..., [ 1.2308e-05, 1.4961e-04, -1.1742e-04, ..., -8.9630e-06, -1.2934e-04, 1.1003e-04], [-2.4261e-03, 1.1659e-04, 9.6858e-05, ..., 6.0111e-05, 1.2480e-05, -6.4850e-04], [ 5.0932e-05, -2.7943e-04, -1.6146e-03, ..., -6.5470e-04, 4.1068e-05, 2.6631e-04]], device='cuda:0') Epoch 34, bias, value: tensor([-0.0074, -0.0076, -0.0037, -0.0121, -0.0155, -0.0088, 0.0127, 0.0017, 0.0257, -0.0005], device='cuda:0'), grad: tensor([ 0.0004, 0.0008, 0.0039, 0.0021, -0.0017, 0.0010, 0.0020, 0.0004, -0.0085, -0.0003], device='cuda:0') 100 0.0001 changing lr epoch 33, time 214.67, cls_loss 0.0309 cls_loss_mapping 0.0412 cls_loss_causal 0.8196 re_mapping 0.0216 re_causal 0.0615 /// teacc 98.43 lr 0.00010000 Epoch 35, weight, value: tensor([[-0.0719, 0.0486, 0.0175, ..., -0.0064, -0.0152, -0.0309], [ 0.0312, 0.0369, -0.0593, ..., -0.0551, -0.0163, 0.0449], [ 0.0470, -0.0330, -0.0410, ..., -0.0009, -0.0248, 0.0024], ..., [ 0.0287, 0.0263, 0.0505, ..., 0.0263, 0.0665, 0.0230], [ 0.0361, -0.0520, -0.0518, ..., -0.0708, -0.0309, 0.0523], [-0.0303, 0.0579, 0.0097, ..., -0.0219, -0.0275, 0.0185]], device='cuda:0'), grad: tensor([[ 5.1826e-05, 1.6764e-05, 6.0409e-05, ..., 2.8551e-05, 4.4227e-05, 3.0726e-05], [-1.5390e-04, -8.7976e-04, -2.2483e-04, ..., -4.4727e-04, -3.1996e-04, -4.4012e-04], [ 4.3011e-04, 7.3147e-04, 4.7565e-04, ..., 4.6015e-04, 4.3297e-04, 4.7827e-04], ..., [-2.1095e-03, -4.2248e-04, -3.4962e-03, ..., -2.0218e-03, -2.5349e-03, 3.0264e-05], [-7.8201e-04, -5.9843e-04, 1.0914e-04, ..., 8.2433e-05, 7.7248e-05, -1.2703e-03], [ 1.5342e-04, -3.3646e-03, -5.2214e-04, ..., 1.0538e-04, 1.0419e-04, -1.7605e-03]], device='cuda:0') Epoch 35, bias, value: tensor([-0.0073, -0.0081, -0.0029, -0.0120, -0.0154, -0.0085, 0.0123, 0.0011, 0.0259, -0.0006], device='cuda:0'), grad: tensor([ 1.0675e-04, -1.6880e-03, 1.5955e-03, 3.0899e-03, 5.2834e-03, 4.7255e-04, -5.4296e-07, -2.3842e-03, -1.5516e-03, -4.9248e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 34---------------------------------------------------- epoch 34, time 231.04, cls_loss 0.0304 cls_loss_mapping 0.0418 cls_loss_causal 0.7979 re_mapping 0.0222 re_causal 0.0616 /// teacc 98.56 lr 0.00010000 Epoch 36, weight, value: tensor([[-0.0730, 0.0484, 0.0171, ..., -0.0069, -0.0159, -0.0313], [ 0.0321, 0.0383, -0.0588, ..., -0.0552, -0.0152, 0.0458], [ 0.0469, -0.0335, -0.0417, ..., -0.0012, -0.0251, 0.0018], ..., [ 0.0287, 0.0257, 0.0506, ..., 0.0264, 0.0669, 0.0224], [ 0.0365, -0.0532, -0.0524, ..., -0.0715, -0.0314, 0.0530], [-0.0306, 0.0583, 0.0097, ..., -0.0224, -0.0276, 0.0184]], device='cuda:0'), grad: tensor([[ 6.3002e-05, 4.0680e-05, 5.7071e-05, ..., 4.7743e-05, 6.7711e-05, 2.5213e-05], [ 1.9205e-04, -2.1141e-06, 8.7500e-05, ..., 2.0361e-04, 2.1350e-04, -2.6822e-05], [-6.2513e-04, 9.3222e-05, 5.3465e-05, ..., -4.2105e-04, -4.6396e-04, 3.7074e-05], ..., [-2.0754e-04, -2.6846e-04, -5.7697e-04, ..., -2.6846e-04, -4.4823e-04, -6.9216e-06], [-4.7803e-04, -3.7700e-05, 7.4863e-05, ..., 2.6256e-05, 1.9944e-04, -1.1587e-03], [ 2.2185e-04, 2.1666e-05, -1.5870e-05, ..., 5.5820e-05, 5.4687e-05, 1.9550e-04]], device='cuda:0') Epoch 36, bias, value: tensor([-0.0076, -0.0072, -0.0029, -0.0118, -0.0155, -0.0083, 0.0120, 0.0008, 0.0256, -0.0006], device='cuda:0'), grad: tensor([ 0.0003, 0.0003, -0.0009, 0.0011, -0.0001, 0.0005, -0.0002, -0.0003, -0.0011, 0.0004], device='cuda:0') 100 0.0001 changing lr epoch 35, time 215.06, cls_loss 0.0228 cls_loss_mapping 0.0312 cls_loss_causal 0.7089 re_mapping 0.0209 re_causal 0.0552 /// teacc 98.38 lr 0.00010000 Epoch 37, weight, value: tensor([[-0.0733, 0.0489, 0.0169, ..., -0.0071, -0.0165, -0.0317], [ 0.0318, 0.0382, -0.0595, ..., -0.0556, -0.0155, 0.0460], [ 0.0470, -0.0343, -0.0423, ..., -0.0013, -0.0255, 0.0012], ..., [ 0.0294, 0.0260, 0.0510, ..., 0.0267, 0.0677, 0.0226], [ 0.0371, -0.0542, -0.0533, ..., -0.0723, -0.0318, 0.0538], [-0.0312, 0.0586, 0.0101, ..., -0.0229, -0.0271, 0.0182]], device='cuda:0'), grad: tensor([[ 9.1642e-06, -2.4486e-04, 1.3269e-05, ..., -2.5868e-04, 6.7167e-06, 1.7229e-06], [-1.6969e-06, -2.4468e-05, 2.2501e-05, ..., 2.1994e-05, 8.4341e-06, -6.8605e-05], [-2.4825e-05, 1.8418e-04, 3.2854e-04, ..., 1.8382e-04, 2.0158e-04, -4.2021e-05], ..., [-4.8232e-04, -1.6838e-05, -4.9639e-04, ..., -3.9983e-04, -4.9210e-04, 1.5527e-05], [ 2.4676e-04, 6.3241e-05, 4.5896e-05, ..., 1.3518e-04, 9.6917e-05, 5.5909e-05], [-3.2574e-05, -5.4836e-04, -4.9591e-04, ..., 1.0610e-04, -6.3419e-05, -1.0327e-05]], device='cuda:0') Epoch 37, bias, value: tensor([-0.0071, -0.0075, -0.0032, -0.0118, -0.0153, -0.0084, 0.0120, 0.0009, 0.0255, -0.0005], device='cuda:0'), grad: tensor([-5.8031e-04, -2.1681e-05, 5.2035e-05, 4.7255e-04, 6.8855e-04, -8.4758e-05, -5.0098e-05, -3.1781e-04, 4.3988e-04, -5.9700e-04], device='cuda:0') 100 0.0001 changing lr epoch 36, time 215.13, cls_loss 0.0253 cls_loss_mapping 0.0362 cls_loss_causal 0.7899 re_mapping 0.0213 re_causal 0.0576 /// teacc 98.35 lr 0.00010000 Epoch 38, weight, value: tensor([[-0.0744, 0.0490, 0.0165, ..., -0.0070, -0.0171, -0.0323], [ 0.0321, 0.0389, -0.0592, ..., -0.0561, -0.0152, 0.0466], [ 0.0471, -0.0355, -0.0426, ..., -0.0014, -0.0256, 0.0007], ..., [ 0.0299, 0.0263, 0.0515, ..., 0.0271, 0.0689, 0.0226], [ 0.0375, -0.0548, -0.0542, ..., -0.0733, -0.0323, 0.0546], [-0.0321, 0.0589, 0.0100, ..., -0.0234, -0.0279, 0.0177]], device='cuda:0'), grad: tensor([[ 1.6379e-04, -5.1212e-04, 8.9109e-06, ..., -5.0497e-04, 2.3082e-05, 7.5996e-05], [-1.6159e-02, -1.1612e-02, -6.9847e-03, ..., 3.4499e-04, -1.4000e-02, -7.3586e-03], [-7.4053e-04, -6.2525e-05, 1.7691e-04, ..., -8.7643e-04, -5.4932e-04, 2.3234e-04], ..., [ 1.4206e-02, 9.9258e-03, 5.6267e-03, ..., 9.1732e-05, 1.2054e-02, 6.2943e-03], [-1.6861e-03, -1.0710e-03, 2.9027e-05, ..., 1.6868e-04, 9.5367e-05, -2.0237e-03], [ 1.2951e-03, 1.4420e-03, 5.8031e-04, ..., 3.2592e-04, 1.1873e-03, 6.1178e-04]], device='cuda:0') Epoch 38, bias, value: tensor([-0.0073, -0.0073, -0.0035, -0.0114, -0.0150, -0.0088, 0.0121, 0.0013, 0.0254, -0.0008], device='cuda:0'), grad: tensor([-0.0021, -0.0184, -0.0019, 0.0010, 0.0014, 0.0021, 0.0027, 0.0165, -0.0046, 0.0034], device='cuda:0') 100 0.0001 changing lr epoch 37, time 215.39, cls_loss 0.0241 cls_loss_mapping 0.0367 cls_loss_causal 0.7490 re_mapping 0.0199 re_causal 0.0560 /// teacc 98.36 lr 0.00010000 Epoch 39, weight, value: tensor([[-0.0750, 0.0490, 0.0162, ..., -0.0068, -0.0175, -0.0327], [ 0.0323, 0.0394, -0.0593, ..., -0.0563, -0.0147, 0.0469], [ 0.0472, -0.0360, -0.0432, ..., -0.0013, -0.0259, 0.0004], ..., [ 0.0300, 0.0258, 0.0517, ..., 0.0272, 0.0694, 0.0222], [ 0.0379, -0.0554, -0.0547, ..., -0.0740, -0.0326, 0.0556], [-0.0325, 0.0590, 0.0107, ..., -0.0237, -0.0278, 0.0177]], device='cuda:0'), grad: tensor([[ 5.4955e-05, -3.7909e-05, 6.2346e-05, ..., 2.9787e-05, 4.7237e-05, 2.5570e-05], [ 2.5177e-03, 1.6041e-03, 2.5501e-03, ..., 7.4244e-04, 2.7542e-03, 7.4863e-04], [ 9.1267e-04, 6.5994e-04, 1.0662e-03, ..., 7.1192e-04, 7.1096e-04, 2.3913e-04], ..., [-3.6392e-03, -2.2011e-03, -3.5324e-03, ..., -4.3249e-04, -4.5776e-03, -1.0405e-03], [ 2.0790e-04, 3.7098e-04, 3.6383e-04, ..., 1.4448e-04, 3.3402e-04, 9.5189e-05], [ 3.6168e-04, -4.7493e-04, 3.5048e-04, ..., 1.4150e-04, 3.8052e-04, 2.4242e-03]], device='cuda:0') Epoch 39, bias, value: tensor([-0.0074, -0.0073, -0.0033, -0.0114, -0.0145, -0.0093, 0.0122, 0.0009, 0.0255, -0.0009], device='cuda:0'), grad: tensor([-1.5527e-05, 3.6716e-03, 1.6241e-03, -9.7656e-04, -2.1248e-03, 3.0279e-04, 2.2936e-04, -4.6768e-03, 6.5851e-04, 1.3046e-03], device='cuda:0') 100 0.0001 changing lr epoch 38, time 214.81, cls_loss 0.0247 cls_loss_mapping 0.0314 cls_loss_causal 0.7198 re_mapping 0.0199 re_causal 0.0542 /// teacc 98.49 lr 0.00010000 Epoch 40, weight, value: tensor([[-7.5558e-02, 4.8734e-02, 1.5599e-02, ..., -7.1549e-03, -1.8099e-02, -3.3183e-02], [ 3.1851e-02, 3.9503e-02, -6.0699e-02, ..., -5.7021e-02, -1.5786e-02, 4.7270e-02], [ 4.7919e-02, -3.6365e-02, -4.3390e-02, ..., -1.0510e-03, -2.5506e-02, -5.8012e-05], ..., [ 3.0494e-02, 2.6070e-02, 5.2435e-02, ..., 2.7253e-02, 7.0422e-02, 2.2360e-02], [ 3.7992e-02, -5.6698e-02, -5.5239e-02, ..., -7.5137e-02, -3.2685e-02, 5.6172e-02], [-3.2974e-02, 5.9861e-02, 1.1100e-02, ..., -2.4012e-02, -2.8156e-02, 1.7345e-02]], device='cuda:0'), grad: tensor([[ 1.0490e-05, -1.3232e-04, 7.2233e-06, ..., -5.8487e-06, 7.6033e-06, 6.0275e-06], [ 9.4414e-05, 2.3052e-05, 6.5625e-05, ..., 6.0558e-05, 7.7546e-05, 1.5363e-05], [ 2.1398e-05, 2.2963e-05, 1.0818e-04, ..., -2.8923e-05, 6.1989e-05, 7.6175e-05], ..., [-4.9019e-04, -4.7058e-05, -3.9291e-04, ..., -2.2662e-04, -4.5180e-04, -1.7464e-04], [ 3.6538e-05, 2.0325e-04, 1.0800e-04, ..., 6.7472e-05, 9.2566e-05, -4.2230e-05], [ 1.3918e-05, -4.0698e-04, -1.3936e-04, ..., 3.7044e-05, 9.9540e-06, -1.0854e-04]], device='cuda:0') Epoch 40, bias, value: tensor([-0.0080, -0.0077, -0.0030, -0.0118, -0.0149, -0.0085, 0.0125, 0.0011, 0.0250, -0.0004], device='cuda:0'), grad: tensor([-1.5724e-04, 1.3161e-04, -4.2439e-05, 1.3936e-04, 1.5295e-04, 3.7885e-04, 1.3280e-04, -3.6764e-04, 3.9530e-04, -7.6246e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 39---------------------------------------------------- epoch 39, time 230.86, cls_loss 0.0248 cls_loss_mapping 0.0329 cls_loss_causal 0.7405 re_mapping 0.0202 re_causal 0.0538 /// teacc 98.58 lr 0.00010000 Epoch 41, weight, value: tensor([[-0.0764, 0.0492, 0.0153, ..., -0.0071, -0.0188, -0.0339], [ 0.0321, 0.0401, -0.0606, ..., -0.0572, -0.0154, 0.0476], [ 0.0480, -0.0371, -0.0439, ..., -0.0009, -0.0253, -0.0009], ..., [ 0.0313, 0.0259, 0.0529, ..., 0.0275, 0.0709, 0.0221], [ 0.0383, -0.0574, -0.0562, ..., -0.0763, -0.0333, 0.0571], [-0.0334, 0.0602, 0.0113, ..., -0.0248, -0.0285, 0.0173]], device='cuda:0'), grad: tensor([[ 9.9778e-05, 5.3108e-05, 2.1935e-05, ..., 1.3304e-04, 1.9208e-05, 3.8326e-05], [ 2.0134e-04, 1.2434e-04, 1.3518e-04, ..., 9.9063e-05, 1.3590e-04, 1.1492e-04], [-8.2731e-04, -4.7892e-05, -1.3447e-03, ..., -1.7967e-03, -9.0408e-04, 4.2176e-04], ..., [ 1.7238e-04, -1.5903e-04, 7.2002e-05, ..., 5.0402e-04, -6.8426e-05, -8.5831e-05], [-8.4066e-04, 7.6532e-05, 6.1154e-05, ..., 4.3154e-04, 4.4644e-05, -7.3195e-04], [ 1.5211e-04, 9.5487e-05, 8.9288e-05, ..., 1.1963e-04, 7.3314e-05, 7.5698e-05]], device='cuda:0') Epoch 41, bias, value: tensor([-0.0079, -0.0075, -0.0033, -0.0120, -0.0149, -0.0082, 0.0119, 0.0014, 0.0251, -0.0002], device='cuda:0'), grad: tensor([ 4.3464e-04, 4.3344e-04, -1.5450e-03, 9.8648e-03, 2.4819e-04, -1.3222e-02, 2.8954e-03, 4.6110e-04, -2.4676e-05, 4.4894e-04], device='cuda:0') 100 0.0001 changing lr epoch 40, time 214.65, cls_loss 0.0305 cls_loss_mapping 0.0361 cls_loss_causal 0.7446 re_mapping 0.0195 re_causal 0.0523 /// teacc 98.38 lr 0.00010000 Epoch 42, weight, value: tensor([[-0.0776, 0.0496, 0.0148, ..., -0.0073, -0.0194, -0.0346], [ 0.0327, 0.0410, -0.0609, ..., -0.0574, -0.0147, 0.0481], [ 0.0481, -0.0381, -0.0448, ..., -0.0014, -0.0255, -0.0015], ..., [ 0.0317, 0.0262, 0.0535, ..., 0.0280, 0.0720, 0.0225], [ 0.0387, -0.0582, -0.0568, ..., -0.0773, -0.0341, 0.0579], [-0.0344, 0.0607, 0.0117, ..., -0.0256, -0.0281, 0.0170]], device='cuda:0'), grad: tensor([[ 8.4937e-06, -4.3809e-05, 1.6764e-05, ..., 4.9844e-06, 1.4760e-05, 2.1290e-06], [ 4.3958e-05, -2.4319e-05, 1.7077e-05, ..., 2.7329e-05, 3.0905e-05, -1.5274e-05], [-2.1294e-05, 2.9296e-05, 6.3181e-05, ..., -1.5959e-05, 2.7195e-05, 1.8582e-05], ..., [-1.3912e-04, -6.8903e-05, -2.2995e-04, ..., -1.2720e-04, -1.8382e-04, 4.9844e-06], [-5.5730e-05, 2.9385e-05, 1.4126e-05, ..., 2.2411e-05, -1.3009e-05, -9.1374e-05], [ 3.4362e-05, 1.3463e-05, 3.3796e-05, ..., 3.4094e-05, 3.6895e-05, 4.8056e-06]], device='cuda:0') Epoch 42, bias, value: tensor([-0.0080, -0.0073, -0.0036, -0.0123, -0.0154, -0.0074, 0.0115, 0.0020, 0.0251, -0.0004], device='cuda:0'), grad: tensor([ 3.1233e-05, 2.8715e-05, -1.1124e-05, -8.8394e-05, 1.4640e-05, -3.9268e-04, 4.9639e-04, -1.3244e-04, -6.5267e-06, 6.0380e-05], device='cuda:0') 100 0.0001 changing lr epoch 41, time 214.55, cls_loss 0.0251 cls_loss_mapping 0.0297 cls_loss_causal 0.7324 re_mapping 0.0191 re_causal 0.0524 /// teacc 98.43 lr 0.00010000 Epoch 43, weight, value: tensor([[-0.0780, 0.0499, 0.0145, ..., -0.0072, -0.0197, -0.0350], [ 0.0331, 0.0414, -0.0613, ..., -0.0578, -0.0139, 0.0489], [ 0.0483, -0.0382, -0.0449, ..., -0.0009, -0.0254, -0.0021], ..., [ 0.0318, 0.0263, 0.0539, ..., 0.0278, 0.0726, 0.0224], [ 0.0390, -0.0591, -0.0573, ..., -0.0779, -0.0344, 0.0584], [-0.0351, 0.0608, 0.0117, ..., -0.0266, -0.0285, 0.0167]], device='cuda:0'), grad: tensor([[ 4.6402e-05, 1.2726e-05, 2.7597e-05, ..., 5.2184e-05, 2.1279e-05, 1.8686e-05], [ 3.6478e-05, 6.3665e-06, 3.7581e-05, ..., 5.4359e-05, 3.0234e-05, -9.7305e-06], [ 1.4133e-03, 5.6505e-04, 7.1764e-04, ..., 1.5917e-03, 6.1989e-04, 7.0333e-04], ..., [ 3.3998e-04, 1.2755e-04, 1.3864e-04, ..., 3.9172e-04, 1.1224e-04, 1.7643e-04], [ 2.1362e-03, 9.0933e-04, 1.1168e-03, ..., 2.4967e-03, 9.6512e-04, 9.7609e-04], [ 3.3587e-05, -6.2168e-05, -5.9843e-05, ..., 4.2260e-05, 1.7300e-05, 1.4491e-05]], device='cuda:0') Epoch 43, bias, value: tensor([-0.0079, -0.0072, -0.0029, -0.0126, -0.0156, -0.0072, 0.0118, 0.0020, 0.0247, -0.0009], device='cuda:0'), grad: tensor([ 9.0063e-05, 5.1528e-05, 2.2335e-03, -6.4735e-03, 2.1756e-05, 1.0878e-04, -6.3539e-05, 5.3453e-04, 3.5114e-03, -8.6352e-06], device='cuda:0') 100 0.0001 changing lr epoch 42, time 214.78, cls_loss 0.0230 cls_loss_mapping 0.0354 cls_loss_causal 0.7725 re_mapping 0.0187 re_causal 0.0532 /// teacc 98.47 lr 0.00010000 Epoch 44, weight, value: tensor([[-0.0779, 0.0508, 0.0140, ..., -0.0070, -0.0201, -0.0354], [ 0.0330, 0.0412, -0.0617, ..., -0.0582, -0.0141, 0.0493], [ 0.0482, -0.0393, -0.0455, ..., -0.0009, -0.0257, -0.0028], ..., [ 0.0320, 0.0262, 0.0541, ..., 0.0281, 0.0731, 0.0219], [ 0.0401, -0.0594, -0.0566, ..., -0.0780, -0.0331, 0.0595], [-0.0353, 0.0610, 0.0117, ..., -0.0271, -0.0288, 0.0166]], device='cuda:0'), grad: tensor([[ 2.5845e-04, -6.9857e-04, 2.4974e-05, ..., 1.3149e-04, 1.3280e-04, 5.4501e-06], [-2.4295e-04, -7.0620e-04, 9.8228e-05, ..., 6.3479e-05, -2.3150e-04, -4.5633e-04], [-3.8099e-04, 1.3816e-04, 1.0812e-04, ..., -4.3035e-04, -1.0073e-04, 6.3002e-05], ..., [-5.2691e-04, -1.0943e-04, -8.2731e-04, ..., -1.6499e-04, -5.5647e-04, 3.8266e-05], [ 2.7752e-04, 2.8324e-04, 1.4508e-04, ..., 7.1168e-05, 2.0766e-04, 9.6083e-05], [ 2.5749e-04, 3.2353e-04, 3.5405e-04, ..., 2.1088e-04, 2.3115e-04, 4.2289e-05]], device='cuda:0') Epoch 44, bias, value: tensor([-0.0073, -0.0074, -0.0032, -0.0125, -0.0152, -0.0075, 0.0114, 0.0017, 0.0250, -0.0006], device='cuda:0'), grad: tensor([-0.0007, -0.0009, -0.0013, 0.0001, 0.0004, 0.0005, 0.0011, -0.0006, 0.0008, 0.0007], device='cuda:0') 100 0.0001 changing lr epoch 43, time 214.55, cls_loss 0.0217 cls_loss_mapping 0.0293 cls_loss_causal 0.7475 re_mapping 0.0185 re_causal 0.0512 /// teacc 98.48 lr 0.00010000 Epoch 45, weight, value: tensor([[-0.0786, 0.0509, 0.0138, ..., -0.0070, -0.0206, -0.0359], [ 0.0328, 0.0416, -0.0619, ..., -0.0589, -0.0145, 0.0497], [ 0.0483, -0.0404, -0.0463, ..., -0.0010, -0.0268, -0.0031], ..., [ 0.0327, 0.0266, 0.0547, ..., 0.0287, 0.0747, 0.0218], [ 0.0404, -0.0603, -0.0570, ..., -0.0787, -0.0334, 0.0602], [-0.0362, 0.0610, 0.0116, ..., -0.0278, -0.0294, 0.0163]], device='cuda:0'), grad: tensor([[ 3.1829e-04, 4.2510e-04, 5.9120e-06, ..., 4.8161e-04, 7.4580e-06, 2.7493e-06], [ 1.3649e-05, 1.2442e-05, 1.2279e-05, ..., 3.5048e-05, 2.1070e-05, -1.9804e-05], [-2.4343e-04, 3.6091e-05, -7.1049e-05, ..., -2.1183e-04, -2.1911e-04, 1.6779e-05], ..., [ 1.2827e-04, 9.1910e-05, 1.2532e-05, ..., 1.5247e-04, 4.9800e-05, 1.3113e-05], [ 1.7154e-04, 2.5010e-04, 2.5854e-05, ..., 3.1400e-04, 3.5256e-05, -2.5019e-05], [ 3.2395e-05, 9.0361e-05, -2.9966e-05, ..., 4.6700e-05, -7.5437e-07, 2.9653e-05]], device='cuda:0') Epoch 45, bias, value: tensor([-0.0079, -0.0076, -0.0035, -0.0121, -0.0156, -0.0075, 0.0118, 0.0025, 0.0249, -0.0009], device='cuda:0'), grad: tensor([ 1.9970e-03, 8.7321e-05, -2.4378e-04, 1.2884e-03, -9.3579e-05, -4.1428e-03, -8.9931e-04, 4.8137e-04, 1.2684e-03, 2.5797e-04], device='cuda:0') 100 0.0001 changing lr epoch 44, time 214.67, cls_loss 0.0242 cls_loss_mapping 0.0346 cls_loss_causal 0.7296 re_mapping 0.0174 re_causal 0.0490 /// teacc 98.28 lr 0.00010000 Epoch 46, weight, value: tensor([[-0.0793, 0.0512, 0.0133, ..., -0.0071, -0.0215, -0.0364], [ 0.0323, 0.0413, -0.0627, ..., -0.0593, -0.0153, 0.0499], [ 0.0483, -0.0404, -0.0465, ..., -0.0010, -0.0266, -0.0038], ..., [ 0.0334, 0.0266, 0.0548, ..., 0.0287, 0.0754, 0.0223], [ 0.0411, -0.0608, -0.0574, ..., -0.0789, -0.0333, 0.0611], [-0.0362, 0.0615, 0.0125, ..., -0.0283, -0.0290, 0.0162]], device='cuda:0'), grad: tensor([[ 3.4720e-05, 1.9222e-05, 2.4945e-05, ..., 4.0084e-05, 2.4185e-05, 1.5244e-05], [ 8.4281e-05, -2.8539e-04, 6.4611e-05, ..., -1.2141e-04, -1.8418e-04, -4.9084e-05], [ 6.1572e-05, 4.9019e-04, 3.4642e-04, ..., 4.7922e-04, 1.3626e-04, -2.0170e-04], ..., [ 2.0809e-03, 1.8606e-03, 2.0218e-03, ..., 3.0270e-03, 1.1311e-03, 1.6797e-04], [ 1.0021e-05, 1.7309e-04, 1.5986e-04, ..., 2.1136e-04, 9.4712e-05, -1.2016e-04], [ 4.6343e-05, -1.6367e-04, -2.0659e-04, ..., 3.7372e-05, -8.5533e-05, 4.1872e-05]], device='cuda:0') Epoch 46, bias, value: tensor([-0.0078, -0.0080, -0.0035, -0.0125, -0.0157, -0.0071, 0.0117, 0.0028, 0.0250, -0.0008], device='cuda:0'), grad: tensor([ 7.5817e-05, -6.5565e-04, 5.6314e-04, -3.8166e-03, 5.6088e-05, 2.1076e-04, 4.3303e-05, 3.5591e-03, 1.2922e-04, -1.6224e-04], device='cuda:0') 100 0.0001 changing lr epoch 45, time 214.80, cls_loss 0.0215 cls_loss_mapping 0.0312 cls_loss_causal 0.7238 re_mapping 0.0181 re_causal 0.0491 /// teacc 98.48 lr 0.00010000 Epoch 47, weight, value: tensor([[-0.0802, 0.0516, 0.0132, ..., -0.0071, -0.0222, -0.0372], [ 0.0328, 0.0422, -0.0629, ..., -0.0592, -0.0151, 0.0509], [ 0.0485, -0.0409, -0.0469, ..., -0.0009, -0.0268, -0.0040], ..., [ 0.0340, 0.0266, 0.0553, ..., 0.0289, 0.0764, 0.0221], [ 0.0410, -0.0617, -0.0581, ..., -0.0796, -0.0338, 0.0616], [-0.0373, 0.0620, 0.0131, ..., -0.0289, -0.0294, 0.0159]], device='cuda:0'), grad: tensor([[ 4.8280e-05, 2.5466e-05, 3.0115e-05, ..., 2.5675e-05, 2.6032e-05, 6.8955e-06], [ 6.1131e-04, 2.9516e-04, 2.0361e-04, ..., 1.3888e-04, 4.4560e-04, 1.2410e-04], [-3.1567e-03, 3.8177e-05, 2.8536e-05, ..., -2.3142e-05, -2.1515e-03, -7.4673e-04], ..., [ 1.4114e-04, -1.5056e-04, -2.7275e-04, ..., 5.8353e-05, -7.2122e-05, 5.8919e-05], [ 2.1877e-03, 1.0765e-04, 5.7310e-05, ..., 1.0645e-04, 1.4563e-03, 4.9543e-04], [ 2.1827e-04, 8.6737e-04, 1.2660e-04, ..., 1.4567e-04, 1.3077e-04, 1.8263e-04]], device='cuda:0') Epoch 47, bias, value: tensor([-0.0077, -0.0073, -0.0034, -0.0128, -0.0155, -0.0066, 0.0110, 0.0028, 0.0242, -0.0008], device='cuda:0'), grad: tensor([ 0.0003, 0.0010, -0.0026, 0.0044, -0.0014, -0.0034, -0.0026, 0.0002, 0.0024, 0.0018], device='cuda:0') 100 0.0001 changing lr epoch 46, time 214.83, cls_loss 0.0205 cls_loss_mapping 0.0262 cls_loss_causal 0.6983 re_mapping 0.0171 re_causal 0.0473 /// teacc 98.55 lr 0.00010000 Epoch 48, weight, value: tensor([[-0.0801, 0.0523, 0.0129, ..., -0.0068, -0.0221, -0.0376], [ 0.0330, 0.0423, -0.0631, ..., -0.0589, -0.0147, 0.0517], [ 0.0485, -0.0414, -0.0470, ..., -0.0009, -0.0272, -0.0047], ..., [ 0.0341, 0.0268, 0.0556, ..., 0.0285, 0.0772, 0.0218], [ 0.0418, -0.0624, -0.0584, ..., -0.0800, -0.0337, 0.0623], [-0.0380, 0.0622, 0.0131, ..., -0.0293, -0.0298, 0.0157]], device='cuda:0'), grad: tensor([[ 2.2054e-05, -2.1681e-05, 3.4541e-05, ..., 2.8908e-05, 2.1026e-05, 1.4275e-05], [-1.0805e-03, -1.8368e-03, 3.9548e-05, ..., 2.9492e-04, 1.8215e-04, -1.9484e-03], [-1.8132e-04, 9.0182e-05, 9.4831e-05, ..., -1.5008e-04, -1.3983e-04, 4.9412e-05], ..., [-4.6611e-04, -1.3046e-03, -2.4986e-03, ..., -3.2592e-04, -1.5802e-03, 5.8830e-05], [ 9.5987e-04, 1.6556e-03, 1.5306e-04, ..., 5.9545e-05, 8.5890e-05, 1.4820e-03], [ 5.1308e-04, 1.6270e-03, 2.2430e-03, ..., 6.1941e-04, 1.4133e-03, 1.5652e-04]], device='cuda:0') Epoch 48, bias, value: tensor([-0.0072, -0.0070, -0.0034, -0.0126, -0.0153, -0.0075, 0.0111, 0.0027, 0.0245, -0.0009], device='cuda:0'), grad: tensor([ 2.0996e-05, -3.0918e-03, -2.3282e-04, -8.3160e-04, 6.3702e-06, 3.7408e-04, 7.6473e-05, -2.2411e-03, 2.9831e-03, 2.9354e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 47---------------------------------------------------- epoch 47, time 231.10, cls_loss 0.0233 cls_loss_mapping 0.0282 cls_loss_causal 0.7449 re_mapping 0.0168 re_causal 0.0472 /// teacc 98.60 lr 0.00010000 Epoch 49, weight, value: tensor([[-0.0807, 0.0530, 0.0129, ..., -0.0062, -0.0224, -0.0382], [ 0.0330, 0.0429, -0.0629, ..., -0.0597, -0.0145, 0.0521], [ 0.0493, -0.0424, -0.0476, ..., -0.0008, -0.0275, -0.0048], ..., [ 0.0341, 0.0269, 0.0560, ..., 0.0291, 0.0779, 0.0213], [ 0.0418, -0.0633, -0.0590, ..., -0.0811, -0.0343, 0.0631], [-0.0381, 0.0621, 0.0131, ..., -0.0297, -0.0302, 0.0151]], device='cuda:0'), grad: tensor([[ 2.5127e-06, -3.2723e-05, 7.4804e-06, ..., -1.1906e-05, 3.1870e-06, 9.3952e-06], [ 3.4660e-05, 1.1368e-03, 1.9228e-04, ..., 4.2111e-05, 3.5554e-05, 3.3736e-04], [ 6.0022e-05, 7.6115e-05, 5.1767e-05, ..., 3.1203e-05, 5.1886e-05, 2.6092e-05], ..., [-1.2445e-04, 4.1395e-05, -1.0788e-04, ..., -4.7952e-05, -1.2946e-04, 4.5775e-07], [-3.8324e-07, 4.9496e-04, 1.6391e-05, ..., 2.2888e-04, 2.9318e-06, 1.5810e-05], [ 1.2375e-05, -2.1133e-03, -3.3116e-04, ..., 3.5018e-05, 2.0787e-05, -6.7806e-04]], device='cuda:0') Epoch 49, bias, value: tensor([-0.0070, -0.0068, -0.0033, -0.0126, -0.0153, -0.0076, 0.0114, 0.0026, 0.0242, -0.0012], device='cuda:0'), grad: tensor([-9.8646e-05, 1.4915e-03, 1.6713e-04, -5.3673e-03, 1.0939e-03, 3.8357e-03, 3.3522e-04, 1.8133e-06, 1.1120e-03, -2.5654e-03], device='cuda:0') 100 0.0001 changing lr epoch 48, time 214.72, cls_loss 0.0182 cls_loss_mapping 0.0259 cls_loss_causal 0.6848 re_mapping 0.0174 re_causal 0.0469 /// teacc 98.60 lr 0.00010000 Epoch 50, weight, value: tensor([[-0.0813, 0.0534, 0.0126, ..., -0.0062, -0.0228, -0.0388], [ 0.0330, 0.0431, -0.0634, ..., -0.0602, -0.0148, 0.0527], [ 0.0496, -0.0426, -0.0474, ..., -0.0009, -0.0272, -0.0056], ..., [ 0.0341, 0.0269, 0.0560, ..., 0.0295, 0.0785, 0.0212], [ 0.0427, -0.0639, -0.0596, ..., -0.0814, -0.0345, 0.0637], [-0.0385, 0.0624, 0.0135, ..., -0.0304, -0.0304, 0.0151]], device='cuda:0'), grad: tensor([[ 1.2085e-05, 2.5600e-05, 3.6091e-05, ..., 2.7508e-05, 6.6087e-06, 2.7686e-05], [ 2.8715e-05, 3.8922e-05, 2.3350e-05, ..., 3.1799e-05, 2.3857e-05, 4.7743e-05], [-4.3631e-05, -2.0891e-05, 2.4587e-05, ..., -1.3255e-05, -3.9637e-05, -5.8502e-05], ..., [ 3.5834e-04, 1.2712e-03, 1.7319e-03, ..., 6.4373e-04, 8.3590e-04, 5.7369e-05], [-1.6201e-04, -5.2750e-05, 4.2945e-05, ..., -8.2791e-05, 1.1340e-05, -6.3181e-04], [-3.6693e-04, -9.4032e-04, -1.5163e-03, ..., -3.7408e-04, -8.7023e-04, 8.4698e-05]], device='cuda:0') Epoch 50, bias, value: tensor([-0.0069, -0.0068, -0.0034, -0.0124, -0.0153, -0.0079, 0.0116, 0.0026, 0.0245, -0.0013], device='cuda:0'), grad: tensor([ 1.0800e-04, 1.2326e-04, 1.7917e-04, -1.4997e-04, -6.8307e-05, 1.5426e-04, -8.5533e-05, 1.8368e-03, -8.1730e-04, -1.2789e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 49---------------------------------------------------- epoch 49, time 230.92, cls_loss 0.0142 cls_loss_mapping 0.0208 cls_loss_causal 0.6810 re_mapping 0.0177 re_causal 0.0478 /// teacc 98.68 lr 0.00010000 Epoch 51, weight, value: tensor([[-0.0820, 0.0531, 0.0123, ..., -0.0062, -0.0232, -0.0394], [ 0.0329, 0.0432, -0.0639, ..., -0.0601, -0.0150, 0.0529], [ 0.0493, -0.0429, -0.0480, ..., -0.0012, -0.0277, -0.0062], ..., [ 0.0349, 0.0269, 0.0564, ..., 0.0298, 0.0792, 0.0214], [ 0.0431, -0.0646, -0.0601, ..., -0.0819, -0.0346, 0.0644], [-0.0387, 0.0633, 0.0140, ..., -0.0305, -0.0300, 0.0150]], device='cuda:0'), grad: tensor([[ 5.3197e-06, -2.3283e-06, 9.4175e-06, ..., 2.6152e-06, 1.1027e-05, 9.1568e-06], [ 8.7395e-06, 1.2852e-05, 2.4498e-05, ..., 1.2144e-05, 2.9802e-05, -1.7388e-06], [-5.1081e-05, 2.3261e-05, -4.5411e-06, ..., -5.7399e-05, 2.2780e-06, -2.2333e-06], ..., [-2.5183e-05, 4.8780e-04, 2.1565e-04, ..., -1.0759e-05, 4.5180e-04, 3.7193e-04], [ 1.8150e-05, 6.6638e-05, 5.1796e-05, ..., 6.4611e-05, 4.8876e-05, 2.2128e-05], [ 2.4185e-05, 3.0479e-03, 1.6193e-03, ..., 1.8388e-05, 2.9831e-03, 2.2697e-03]], device='cuda:0') Epoch 51, bias, value: tensor([-0.0074, -0.0069, -0.0037, -0.0128, -0.0150, -0.0079, 0.0109, 0.0028, 0.0247, -0.0007], device='cuda:0'), grad: tensor([ 1.5855e-05, 4.8816e-05, -7.3433e-05, -3.3307e-04, -7.4081e-03, 3.6192e-04, -2.1458e-05, 9.9277e-04, 1.9515e-04, 6.2256e-03], device='cuda:0') 100 0.0001 changing lr epoch 50, time 215.02, cls_loss 0.0143 cls_loss_mapping 0.0205 cls_loss_causal 0.6864 re_mapping 0.0160 re_causal 0.0453 /// teacc 98.57 lr 0.00010000 Epoch 52, weight, value: tensor([[-0.0827, 0.0535, 0.0120, ..., -0.0065, -0.0239, -0.0399], [ 0.0330, 0.0435, -0.0644, ..., -0.0603, -0.0150, 0.0532], [ 0.0495, -0.0434, -0.0484, ..., -0.0012, -0.0278, -0.0066], ..., [ 0.0353, 0.0271, 0.0569, ..., 0.0301, 0.0799, 0.0217], [ 0.0432, -0.0656, -0.0606, ..., -0.0827, -0.0349, 0.0647], [-0.0392, 0.0633, 0.0141, ..., -0.0310, -0.0304, 0.0146]], device='cuda:0'), grad: tensor([[ 5.0592e-04, 8.2636e-04, 5.5075e-04, ..., 9.2030e-04, 3.9525e-06, 2.3365e-05], [ 3.7581e-05, 3.9995e-05, 3.8058e-05, ..., 7.8559e-05, 1.6898e-05, -4.4793e-05], [ 3.2615e-06, 1.9979e-04, 1.9622e-04, ..., 4.4155e-04, -1.7539e-05, 1.0684e-05], ..., [ 6.5446e-05, 1.4937e-04, -2.4632e-05, ..., 9.9361e-05, -6.9976e-05, 6.1810e-05], [ 3.1424e-04, 9.3651e-04, 4.3893e-04, ..., 6.7854e-04, 1.5795e-05, 2.0337e-04], [-1.6141e-04, -1.5917e-03, 7.9989e-05, ..., 3.3951e-04, 2.0638e-05, -1.5812e-03]], device='cuda:0') Epoch 52, bias, value: tensor([-0.0074, -0.0069, -0.0039, -0.0127, -0.0148, -0.0073, 0.0108, 0.0032, 0.0241, -0.0010], device='cuda:0'), grad: tensor([ 0.0030, 0.0003, 0.0006, -0.0046, 0.0038, -0.0033, 0.0007, 0.0008, 0.0030, -0.0043], device='cuda:0') 100 0.0001 changing lr epoch 51, time 215.02, cls_loss 0.0176 cls_loss_mapping 0.0229 cls_loss_causal 0.6949 re_mapping 0.0161 re_causal 0.0464 /// teacc 98.56 lr 0.00010000 Epoch 53, weight, value: tensor([[-0.0835, 0.0539, 0.0117, ..., -0.0067, -0.0243, -0.0404], [ 0.0331, 0.0440, -0.0642, ..., -0.0607, -0.0153, 0.0538], [ 0.0496, -0.0439, -0.0491, ..., -0.0010, -0.0283, -0.0070], ..., [ 0.0361, 0.0271, 0.0576, ..., 0.0308, 0.0811, 0.0218], [ 0.0434, -0.0662, -0.0613, ..., -0.0838, -0.0353, 0.0653], [-0.0398, 0.0632, 0.0139, ..., -0.0317, -0.0308, 0.0143]], device='cuda:0'), grad: tensor([[-1.1735e-05, 3.4256e-03, 4.7731e-04, ..., 3.8803e-05, -6.3896e-05, -9.6932e-06], [ 2.0221e-05, 5.4449e-05, 2.8029e-05, ..., 2.7269e-05, 2.7463e-05, -1.8075e-05], [ 1.1644e-03, 1.2465e-03, 1.8425e-03, ..., 2.2507e-03, -2.6673e-06, 2.8670e-05], ..., [ 5.5656e-06, 7.7248e-05, -4.4644e-05, ..., 3.1829e-05, -3.1255e-06, -2.0123e-04], [ 2.7561e-04, 2.9039e-04, 4.5657e-04, ..., 5.7745e-04, 7.0520e-06, -1.7762e-05], [ 3.1978e-05, -3.8052e-03, -4.3869e-04, ..., 4.9323e-05, 1.2778e-05, 1.4253e-05]], device='cuda:0') Epoch 53, bias, value: tensor([-0.0074, -0.0067, -0.0040, -0.0132, -0.0146, -0.0072, 0.0112, 0.0038, 0.0238, -0.0015], device='cuda:0'), grad: tensor([ 0.0038, 0.0002, 0.0046, -0.0052, 0.0058, 0.0004, -0.0018, -0.0054, 0.0011, -0.0037], device='cuda:0') 100 0.0001 changing lr epoch 52, time 214.82, cls_loss 0.0161 cls_loss_mapping 0.0250 cls_loss_causal 0.6895 re_mapping 0.0163 re_causal 0.0460 /// teacc 98.55 lr 0.00010000 Epoch 54, weight, value: tensor([[-0.0840, 0.0536, 0.0114, ..., -0.0066, -0.0245, -0.0411], [ 0.0329, 0.0444, -0.0647, ..., -0.0608, -0.0157, 0.0538], [ 0.0496, -0.0444, -0.0496, ..., -0.0010, -0.0282, -0.0074], ..., [ 0.0361, 0.0275, 0.0578, ..., 0.0306, 0.0814, 0.0209], [ 0.0446, -0.0671, -0.0610, ..., -0.0842, -0.0338, 0.0669], [-0.0407, 0.0634, 0.0139, ..., -0.0321, -0.0316, 0.0139]], device='cuda:0'), grad: tensor([[ 1.2159e-04, 2.4724e-04, 1.0289e-05, ..., 2.0608e-05, 9.5367e-06, 1.6582e-04], [-8.2159e-04, -1.1520e-03, 1.3721e-04, ..., 5.8711e-05, 1.3852e-04, -1.1740e-03], [ 6.2764e-05, 5.1439e-05, 2.5943e-05, ..., 1.5572e-05, 2.3007e-05, 1.0133e-04], ..., [-1.5962e-04, -3.1561e-05, -2.1887e-04, ..., -8.4639e-05, -1.9765e-04, -4.0144e-05], [ 3.6478e-04, 8.3065e-04, 1.5616e-05, ..., -1.3840e-04, 9.2089e-06, 4.4203e-04], [ 4.7147e-05, 1.2243e-04, -1.5050e-05, ..., 2.8476e-05, 2.0579e-05, 1.6499e-04]], device='cuda:0') Epoch 54, bias, value: tensor([-0.0078, -0.0067, -0.0041, -0.0126, -0.0146, -0.0075, 0.0115, 0.0038, 0.0240, -0.0017], device='cuda:0'), grad: tensor([ 8.6355e-04, -2.9869e-03, 2.0957e-04, 3.2115e-04, -4.3654e-04, 4.9210e-04, -5.4687e-05, -1.9327e-05, 1.2550e-03, 3.5501e-04], device='cuda:0') 100 0.0001 changing lr epoch 53, time 215.21, cls_loss 0.0133 cls_loss_mapping 0.0192 cls_loss_causal 0.6994 re_mapping 0.0158 re_causal 0.0451 /// teacc 98.61 lr 0.00010000 Epoch 55, weight, value: tensor([[-0.0845, 0.0539, 0.0110, ..., -0.0069, -0.0249, -0.0416], [ 0.0330, 0.0444, -0.0650, ..., -0.0608, -0.0155, 0.0540], [ 0.0498, -0.0454, -0.0501, ..., -0.0011, -0.0286, -0.0075], ..., [ 0.0364, 0.0274, 0.0583, ..., 0.0308, 0.0819, 0.0208], [ 0.0449, -0.0677, -0.0616, ..., -0.0847, -0.0339, 0.0674], [-0.0412, 0.0636, 0.0145, ..., -0.0325, -0.0318, 0.0136]], device='cuda:0'), grad: tensor([[ 6.7614e-06, -1.0043e-04, 7.3612e-06, ..., -2.2873e-05, 4.9919e-06, 4.1574e-06], [-5.8636e-06, -4.6611e-05, 2.0340e-05, ..., 3.2365e-05, 2.0206e-05, -9.7990e-05], [-1.2646e-03, -1.8239e-04, -4.9686e-04, ..., -1.0614e-03, -8.4686e-04, 3.0398e-05], ..., [ 4.7803e-04, -3.1066e-04, -8.7309e-04, ..., 1.0080e-03, 1.6820e-04, -1.9634e-04], [-2.7984e-05, 7.7188e-05, 1.3375e-04, ..., 2.1309e-05, 5.9783e-05, -8.3447e-05], [ 6.4659e-04, 4.4513e-04, 1.1139e-03, ..., 2.5094e-05, 5.4073e-04, 1.9932e-04]], device='cuda:0') Epoch 55, bias, value: tensor([-0.0074, -0.0068, -0.0042, -0.0128, -0.0145, -0.0074, 0.0114, 0.0038, 0.0238, -0.0016], device='cuda:0'), grad: tensor([-1.0079e-04, -1.2827e-04, -1.6117e-03, 9.5904e-05, -1.1615e-05, 7.0214e-05, 1.4174e-04, 2.5654e-04, 4.3839e-05, 1.2445e-03], device='cuda:0') 100 0.0001 changing lr epoch 54, time 214.93, cls_loss 0.0145 cls_loss_mapping 0.0236 cls_loss_causal 0.6786 re_mapping 0.0157 re_causal 0.0445 /// teacc 98.56 lr 0.00010000 Epoch 56, weight, value: tensor([[-0.0852, 0.0540, 0.0103, ..., -0.0070, -0.0255, -0.0423], [ 0.0334, 0.0450, -0.0649, ..., -0.0611, -0.0149, 0.0549], [ 0.0500, -0.0456, -0.0503, ..., -0.0006, -0.0283, -0.0081], ..., [ 0.0362, 0.0265, 0.0581, ..., 0.0303, 0.0818, 0.0201], [ 0.0454, -0.0686, -0.0620, ..., -0.0852, -0.0341, 0.0680], [-0.0417, 0.0645, 0.0150, ..., -0.0329, -0.0315, 0.0136]], device='cuda:0'), grad: tensor([[ 6.0536e-06, -4.3735e-06, 1.4983e-05, ..., -1.5013e-05, 5.6215e-06, 6.6049e-06], [ 1.7926e-05, -1.6494e-06, 3.0965e-05, ..., 1.2048e-05, 2.3365e-05, -1.2510e-05], [ 2.8253e-05, 3.3766e-05, 2.5213e-05, ..., 1.1995e-05, 1.5467e-05, 4.7833e-05], ..., [-9.1493e-05, -9.4712e-05, -1.9979e-04, ..., -1.7285e-05, -1.8990e-04, 3.4682e-06], [-8.1539e-05, 2.3022e-05, 6.9857e-05, ..., 7.5161e-05, 4.6700e-05, -1.8251e-04], [ 1.1837e-04, -7.5459e-05, 6.5088e-05, ..., 6.4731e-05, 1.4448e-04, 7.3493e-05]], device='cuda:0') Epoch 56, bias, value: tensor([-0.0077, -0.0065, -0.0040, -0.0125, -0.0139, -0.0077, 0.0107, 0.0030, 0.0241, -0.0012], device='cuda:0'), grad: tensor([-3.7998e-05, 1.6451e-05, 8.8215e-05, -1.1206e-04, 1.1635e-04, 2.2084e-05, 3.1203e-05, -1.2231e-04, -5.2214e-05, 5.0306e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 55---------------------------------------------------- epoch 55, time 225.16, cls_loss 0.0174 cls_loss_mapping 0.0217 cls_loss_causal 0.6880 re_mapping 0.0148 re_causal 0.0411 /// teacc 98.69 lr 0.00010000 Epoch 57, weight, value: tensor([[-0.0858, 0.0543, 0.0095, ..., -0.0070, -0.0261, -0.0433], [ 0.0331, 0.0454, -0.0654, ..., -0.0617, -0.0150, 0.0556], [ 0.0502, -0.0458, -0.0508, ..., -0.0005, -0.0285, -0.0083], ..., [ 0.0368, 0.0266, 0.0587, ..., 0.0308, 0.0827, 0.0200], [ 0.0458, -0.0694, -0.0625, ..., -0.0859, -0.0343, 0.0688], [-0.0421, 0.0649, 0.0154, ..., -0.0333, -0.0318, 0.0135]], device='cuda:0'), grad: tensor([[ 6.9812e-06, 1.1392e-05, 1.0960e-05, ..., 3.9563e-06, 6.8396e-06, 1.2949e-05], [-4.7588e-04, -4.1604e-04, 1.1820e-04, ..., 4.8950e-06, 7.8678e-05, -9.3794e-04], [ 3.3200e-05, 5.0128e-05, 3.1471e-05, ..., -1.6004e-05, 1.5333e-05, 6.7890e-05], ..., [ 1.6940e-04, 4.4179e-04, 4.4584e-04, ..., 4.2375e-07, 2.9135e-04, 4.1628e-04], [ 3.8648e-04, 4.3344e-04, 4.8459e-05, ..., 1.5795e-05, 2.0862e-05, 7.6246e-04], [-2.1935e-04, -6.6662e-04, -8.6355e-04, ..., 1.9535e-05, -5.7602e-04, -5.5647e-04]], device='cuda:0') Epoch 57, bias, value: tensor([-0.0074, -0.0064, -0.0037, -0.0128, -0.0141, -0.0080, 0.0108, 0.0031, 0.0237, -0.0010], device='cuda:0'), grad: tensor([ 6.8569e-04, -1.2522e-03, 8.6427e-05, 1.3277e-05, 3.5715e-04, -1.2732e-04, -6.8092e-04, 7.1049e-04, 1.1387e-03, -9.3031e-04], device='cuda:0') 100 0.0001 changing lr epoch 56, time 214.72, cls_loss 0.0144 cls_loss_mapping 0.0217 cls_loss_causal 0.6888 re_mapping 0.0152 re_causal 0.0425 /// teacc 98.63 lr 0.00010000 Epoch 58, weight, value: tensor([[-0.0867, 0.0547, 0.0092, ..., -0.0068, -0.0266, -0.0443], [ 0.0330, 0.0457, -0.0658, ..., -0.0620, -0.0152, 0.0562], [ 0.0502, -0.0465, -0.0515, ..., -0.0008, -0.0287, -0.0089], ..., [ 0.0371, 0.0267, 0.0589, ..., 0.0304, 0.0835, 0.0198], [ 0.0461, -0.0702, -0.0632, ..., -0.0869, -0.0346, 0.0696], [-0.0428, 0.0648, 0.0155, ..., -0.0339, -0.0321, 0.0126]], device='cuda:0'), grad: tensor([[ 5.8681e-05, -7.4990e-06, 1.2226e-05, ..., 9.6262e-06, 5.5023e-06, 1.0710e-06], [ 4.9412e-05, -3.8415e-05, 1.5557e-05, ..., 1.1243e-05, 6.6385e-06, -8.5473e-05], [ 3.5038e-03, 5.9992e-05, 4.3780e-05, ..., 1.4361e-06, 1.2808e-05, 6.9201e-05], ..., [-5.2631e-05, -2.6554e-05, -1.3769e-04, ..., -3.7640e-05, -1.0413e-04, 2.1029e-06], [-4.3259e-03, 3.3498e-05, 3.5495e-05, ..., 3.2604e-05, 4.3809e-06, 4.6492e-06], [ 2.3067e-05, -7.8976e-05, -4.3750e-05, ..., 1.9222e-05, 2.1398e-05, -1.8179e-05]], device='cuda:0') Epoch 58, bias, value: tensor([-0.0075, -0.0063, -0.0042, -0.0120, -0.0141, -0.0076, 0.0116, 0.0027, 0.0235, -0.0016], device='cuda:0'), grad: tensor([ 1.9288e-04, 1.2312e-06, 1.1772e-02, 1.9157e-04, 1.5736e-04, 4.7016e-04, 1.6079e-03, -7.3195e-05, -1.4244e-02, -7.2539e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 57---------------------------------------------------- epoch 57, time 224.63, cls_loss 0.0151 cls_loss_mapping 0.0209 cls_loss_causal 0.6313 re_mapping 0.0150 re_causal 0.0407 /// teacc 98.73 lr 0.00010000 Epoch 59, weight, value: tensor([[-0.0872, 0.0549, 0.0088, ..., -0.0067, -0.0270, -0.0450], [ 0.0326, 0.0454, -0.0661, ..., -0.0624, -0.0160, 0.0565], [ 0.0505, -0.0464, -0.0520, ..., -0.0003, -0.0283, -0.0093], ..., [ 0.0373, 0.0267, 0.0592, ..., 0.0310, 0.0840, 0.0193], [ 0.0467, -0.0707, -0.0633, ..., -0.0877, -0.0345, 0.0701], [-0.0424, 0.0648, 0.0158, ..., -0.0345, -0.0320, 0.0126]], device='cuda:0'), grad: tensor([[ 8.6874e-06, 2.3320e-05, 2.5019e-05, ..., 8.4937e-06, 1.3910e-05, 3.0678e-06], [ 2.2933e-05, 1.0467e-04, 7.7665e-05, ..., 2.2486e-05, 4.1783e-05, -1.1802e-05], [ 6.7353e-05, 6.8069e-05, 1.5175e-04, ..., -5.9813e-05, 1.0592e-04, 1.1154e-05], ..., [-2.7323e-04, -1.3089e-04, -7.8964e-04, ..., -2.2554e-04, -4.9353e-04, 9.6858e-06], [-6.0499e-06, 1.2720e-04, 1.3971e-04, ..., 1.3478e-05, 1.8090e-05, -9.5144e-06], [ 4.0203e-05, -1.2207e-04, -4.2021e-05, ..., 3.7849e-05, 9.8884e-05, -5.4449e-05]], device='cuda:0') Epoch 59, bias, value: tensor([-0.0071, -0.0068, -0.0039, -0.0120, -0.0139, -0.0077, 0.0110, 0.0026, 0.0240, -0.0017], device='cuda:0'), grad: tensor([ 1.6117e-04, 3.9220e-04, -3.4761e-04, 4.6468e-04, 4.8161e-04, 2.1422e-04, -6.6376e-04, -8.6498e-04, 2.4152e-04, -7.7665e-05], device='cuda:0') 100 0.0001 changing lr epoch 58, time 214.76, cls_loss 0.0121 cls_loss_mapping 0.0176 cls_loss_causal 0.6447 re_mapping 0.0155 re_causal 0.0415 /// teacc 98.70 lr 0.00010000 Epoch 60, weight, value: tensor([[-0.0879, 0.0552, 0.0085, ..., -0.0066, -0.0274, -0.0456], [ 0.0323, 0.0455, -0.0665, ..., -0.0626, -0.0160, 0.0566], [ 0.0506, -0.0468, -0.0528, ..., -0.0003, -0.0285, -0.0096], ..., [ 0.0372, 0.0265, 0.0596, ..., 0.0311, 0.0846, 0.0188], [ 0.0477, -0.0709, -0.0632, ..., -0.0881, -0.0345, 0.0711], [-0.0428, 0.0646, 0.0159, ..., -0.0352, -0.0323, 0.0123]], device='cuda:0'), grad: tensor([[ 2.5984e-07, -1.8373e-05, 1.7928e-06, ..., -3.0920e-06, 1.8720e-07, 6.7521e-07], [-2.5973e-05, -2.6271e-05, 9.5740e-06, ..., 1.3553e-05, -4.0904e-06, -5.0426e-05], [ 5.3644e-05, 4.6194e-05, 9.6560e-05, ..., 1.4615e-04, 1.6866e-06, 1.5780e-05], ..., [ 1.3500e-05, 1.5497e-05, 4.3586e-06, ..., 1.4015e-05, -2.6748e-06, 1.8597e-05], [ 1.0645e-06, 1.6496e-05, 1.2629e-05, ..., 1.7643e-05, 5.4576e-07, -1.4491e-06], [ 4.5560e-06, 5.5656e-06, -7.8380e-06, ..., 5.9269e-06, 2.3823e-06, 9.0003e-06]], device='cuda:0') Epoch 60, bias, value: tensor([-0.0072, -0.0070, -0.0040, -0.0122, -0.0132, -0.0074, 0.0109, 0.0024, 0.0245, -0.0022], device='cuda:0'), grad: tensor([-2.7090e-05, -7.4327e-05, 1.3757e-04, -1.4317e-04, 5.0515e-06, -6.3658e-05, 5.1647e-05, 4.2975e-05, 4.6194e-05, 2.4796e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 59---------------------------------------------------- epoch 59, time 230.72, cls_loss 0.0110 cls_loss_mapping 0.0184 cls_loss_causal 0.6622 re_mapping 0.0146 re_causal 0.0414 /// teacc 98.81 lr 0.00010000 Epoch 61, weight, value: tensor([[-0.0885, 0.0550, 0.0079, ..., -0.0068, -0.0277, -0.0461], [ 0.0324, 0.0456, -0.0669, ..., -0.0630, -0.0160, 0.0570], [ 0.0508, -0.0471, -0.0533, ..., -0.0002, -0.0287, -0.0100], ..., [ 0.0377, 0.0261, 0.0600, ..., 0.0310, 0.0851, 0.0185], [ 0.0479, -0.0715, -0.0637, ..., -0.0885, -0.0347, 0.0716], [-0.0434, 0.0651, 0.0157, ..., -0.0357, -0.0326, 0.0121]], device='cuda:0'), grad: tensor([[ 2.7314e-05, 9.9912e-06, 1.5363e-05, ..., 2.6524e-05, 1.9129e-06, 1.6978e-06], [-7.2527e-04, -2.0707e-04, 4.4107e-06, ..., 2.5108e-05, -2.7609e-04, -3.9172e-04], [ 3.7014e-05, 9.7573e-05, 5.4576e-06, ..., -4.2057e-04, 2.2733e-04, 2.6393e-04], ..., [ 6.0499e-05, 2.2441e-05, -9.7007e-06, ..., 3.0786e-05, -3.1535e-06, 2.4706e-05], [ 3.0351e-04, 2.1374e-04, 1.3137e-04, ..., 2.3842e-04, 2.7180e-05, 3.8505e-05], [ 6.6757e-05, -7.8738e-05, -9.3102e-05, ..., 1.2577e-04, 5.9940e-06, 1.9208e-05]], device='cuda:0') Epoch 61, bias, value: tensor([-0.0075, -0.0071, -0.0041, -0.0120, -0.0131, -0.0066, 0.0105, 0.0023, 0.0243, -0.0023], device='cuda:0'), grad: tensor([ 9.1910e-05, -1.0405e-03, -3.8004e-04, 1.9932e-03, 5.9366e-05, -2.1114e-03, 1.2529e-04, 1.5521e-04, 1.0223e-03, 8.7261e-05], device='cuda:0') 100 0.0001 changing lr epoch 60, time 214.73, cls_loss 0.0159 cls_loss_mapping 0.0203 cls_loss_causal 0.6741 re_mapping 0.0141 re_causal 0.0397 /// teacc 98.74 lr 0.00010000 Epoch 62, weight, value: tensor([[-0.0893, 0.0549, 0.0075, ..., -0.0072, -0.0282, -0.0467], [ 0.0321, 0.0461, -0.0673, ..., -0.0637, -0.0163, 0.0574], [ 0.0512, -0.0472, -0.0537, ..., 0.0004, -0.0291, -0.0100], ..., [ 0.0383, 0.0260, 0.0603, ..., 0.0312, 0.0861, 0.0184], [ 0.0482, -0.0723, -0.0642, ..., -0.0897, -0.0348, 0.0722], [-0.0438, 0.0648, 0.0157, ..., -0.0365, -0.0330, 0.0118]], device='cuda:0'), grad: tensor([[ 1.5807e-04, 5.9557e-04, 2.4843e-04, ..., 5.1641e-04, 3.1257e-04, 2.5675e-05], [ 8.0967e-04, 9.8526e-05, 1.4901e-04, ..., 5.6934e-04, 1.0500e-03, -6.9380e-05], [-8.0681e-04, 9.9897e-05, 8.8751e-05, ..., -4.8923e-04, -8.7595e-04, 8.1003e-05], ..., [-1.3673e-04, -4.2439e-04, -6.8521e-04, ..., -2.9087e-04, -7.6914e-04, 1.9699e-05], [-9.8467e-05, 4.9442e-05, 5.1826e-05, ..., 5.7846e-05, 4.8995e-05, -1.5175e-04], [ 2.8819e-05, -4.7016e-04, -7.2289e-04, ..., 5.0277e-05, -3.4904e-04, 1.3404e-05]], device='cuda:0') Epoch 62, bias, value: tensor([-0.0080, -0.0075, -0.0033, -0.0119, -0.0127, -0.0062, 0.0105, 0.0027, 0.0239, -0.0030], device='cuda:0'), grad: tensor([ 1.1158e-03, 2.0332e-03, -1.7595e-03, 8.9216e-04, 8.9884e-04, -1.4858e-03, -3.5256e-05, -7.6437e-04, -1.0419e-04, -7.9298e-04], device='cuda:0') 100 0.0001 changing lr epoch 61, time 214.72, cls_loss 0.0144 cls_loss_mapping 0.0211 cls_loss_causal 0.6536 re_mapping 0.0148 re_causal 0.0424 /// teacc 98.67 lr 0.00010000 Epoch 63, weight, value: tensor([[-9.0131e-02, 5.5090e-02, 6.9804e-03, ..., -7.2474e-03, -2.9012e-02, -4.8310e-02], [ 3.1593e-02, 4.6702e-02, -6.8191e-02, ..., -6.4467e-02, -1.7217e-02, 5.7917e-02], [ 5.0951e-02, -4.7940e-02, -5.5087e-02, ..., -2.7335e-05, -3.0230e-02, -1.0337e-02], ..., [ 3.8590e-02, 2.6356e-02, 6.0410e-02, ..., 3.1009e-02, 8.7287e-02, 1.8017e-02], [ 4.7961e-02, -7.3184e-02, -6.5033e-02, ..., -9.0978e-02, -3.5257e-02, 7.2934e-02], [-4.3146e-02, 6.5725e-02, 1.6971e-02, ..., -3.6201e-02, -3.2671e-02, 1.3038e-02]], device='cuda:0'), grad: tensor([[ 2.4870e-05, -2.6599e-05, 1.4096e-05, ..., 1.0443e-04, 1.1340e-05, -5.0198e-07], [ 1.2353e-05, 2.3320e-05, 3.4958e-05, ..., 4.5359e-05, 1.6689e-05, -1.3737e-06], [-3.5334e-04, 5.3257e-05, -3.6895e-05, ..., -5.0354e-04, -2.6274e-04, 1.0327e-05], ..., [ 1.6832e-04, 2.2084e-05, 8.2850e-05, ..., 2.0289e-04, 1.6689e-04, 7.1488e-06], [ 3.3855e-05, 4.4048e-05, 4.2647e-05, ..., 6.0171e-05, 8.4713e-06, 1.3597e-05], [ 1.3262e-05, 7.0333e-04, -2.0757e-05, ..., 1.1660e-05, 2.0847e-05, 7.4434e-04]], device='cuda:0') Epoch 63, bias, value: tensor([-0.0085, -0.0074, -0.0036, -0.0118, -0.0133, -0.0059, 0.0104, 0.0024, 0.0233, -0.0017], device='cuda:0'), grad: tensor([ 1.4734e-04, 8.0764e-05, -9.2649e-04, 2.5702e-04, -1.7815e-03, 2.6107e-05, 3.0965e-05, 3.5286e-04, 1.6940e-04, 1.6422e-03], device='cuda:0') 100 0.0001 changing lr epoch 62, time 214.97, cls_loss 0.0128 cls_loss_mapping 0.0191 cls_loss_causal 0.6625 re_mapping 0.0145 re_causal 0.0408 /// teacc 98.68 lr 0.00010000 Epoch 64, weight, value: tensor([[-0.0912, 0.0551, 0.0066, ..., -0.0071, -0.0297, -0.0492], [ 0.0314, 0.0470, -0.0684, ..., -0.0650, -0.0174, 0.0584], [ 0.0521, -0.0485, -0.0546, ..., 0.0004, -0.0294, -0.0111], ..., [ 0.0384, 0.0268, 0.0608, ..., 0.0314, 0.0875, 0.0177], [ 0.0481, -0.0745, -0.0653, ..., -0.0917, -0.0349, 0.0735], [-0.0430, 0.0662, 0.0172, ..., -0.0370, -0.0326, 0.0133]], device='cuda:0'), grad: tensor([[ 2.2292e-05, -1.4916e-05, 3.9265e-06, ..., 1.7695e-06, 2.7493e-06, 3.4064e-05], [ 8.7404e-04, -4.9174e-05, 6.4895e-06, ..., 4.5970e-06, 2.9188e-06, 1.5402e-03], [ 2.4706e-05, 2.4274e-05, 1.1846e-05, ..., -7.1637e-06, -2.6785e-06, 9.0957e-05], ..., [-2.0146e-05, 5.0031e-06, -4.6164e-05, ..., -1.9565e-05, -3.1590e-05, 4.6670e-05], [-1.0118e-03, 1.3866e-05, 1.6928e-05, ..., 6.2585e-06, 4.7125e-06, -1.8845e-03], [ 1.2577e-05, -5.6475e-06, -3.1944e-07, ..., 1.3590e-05, 5.7966e-06, 5.8338e-06]], device='cuda:0') Epoch 64, bias, value: tensor([-0.0085, -0.0073, -0.0031, -0.0120, -0.0138, -0.0059, 0.0108, 0.0025, 0.0227, -0.0014], device='cuda:0'), grad: tensor([ 3.5346e-05, 1.7958e-03, 1.1325e-04, 3.0905e-05, 8.4758e-05, 9.7811e-05, 4.3184e-05, 1.1034e-05, -2.2221e-03, 9.1717e-06], device='cuda:0') 100 0.0001 changing lr epoch 63, time 215.05, cls_loss 0.0120 cls_loss_mapping 0.0181 cls_loss_causal 0.6824 re_mapping 0.0143 re_causal 0.0408 /// teacc 98.69 lr 0.00010000 Epoch 65, weight, value: tensor([[-9.1802e-02, 5.5720e-02, 6.3943e-03, ..., -7.2533e-03, -3.0065e-02, -4.9806e-02], [ 3.1856e-02, 4.7580e-02, -6.7240e-02, ..., -6.4421e-02, -1.6967e-02, 5.9152e-02], [ 5.1922e-02, -4.8785e-02, -5.5649e-02, ..., 6.0204e-05, -3.0386e-02, -1.1354e-02], ..., [ 3.9207e-02, 2.6878e-02, 6.1477e-02, ..., 3.1801e-02, 8.8768e-02, 1.7651e-02], [ 4.8599e-02, -7.4676e-02, -6.5740e-02, ..., -9.2264e-02, -3.5223e-02, 7.4609e-02], [-4.4124e-02, 6.5959e-02, 1.6824e-02, ..., -3.7561e-02, -3.3331e-02, 1.2376e-02]], device='cuda:0'), grad: tensor([[ 2.4557e-05, -1.3094e-06, 1.6481e-05, ..., 1.3359e-05, 2.5824e-05, 8.9481e-06], [ 4.0501e-05, 2.6807e-05, 2.1839e-04, ..., 1.6093e-04, 2.0933e-04, -1.3900e-04], [-3.8290e-04, 4.4167e-05, -1.8871e-04, ..., -2.4307e-04, -4.8018e-04, 2.3708e-05], ..., [-2.1183e-04, -7.7200e-04, -8.9359e-04, ..., -5.5408e-04, -6.1178e-04, 1.4663e-05], [ 2.1279e-04, 2.2602e-04, 2.0528e-04, ..., 1.9884e-04, 1.1170e-04, 2.1803e-04], [ 3.2711e-04, 4.3344e-04, 5.9414e-04, ..., 4.5490e-04, 5.8985e-04, 8.0764e-06]], device='cuda:0') Epoch 65, bias, value: tensor([-0.0084, -0.0067, -0.0030, -0.0120, -0.0140, -0.0066, 0.0108, 0.0028, 0.0232, -0.0021], device='cuda:0'), grad: tensor([ 9.2506e-05, 2.3997e-04, -7.3338e-04, -1.6046e-04, -3.3355e-04, 1.7262e-04, 2.4605e-04, -1.2217e-03, 5.3549e-04, 1.1597e-03], device='cuda:0') 100 0.0001 changing lr epoch 64, time 215.02, cls_loss 0.0138 cls_loss_mapping 0.0204 cls_loss_causal 0.6667 re_mapping 0.0140 re_causal 0.0384 /// teacc 98.74 lr 0.00010000 Epoch 66, weight, value: tensor([[-9.2671e-02, 5.6513e-02, 6.0798e-03, ..., -6.9074e-03, -3.1055e-02, -5.0347e-02], [ 3.1625e-02, 4.7524e-02, -6.7843e-02, ..., -6.5017e-02, -1.7943e-02, 5.9382e-02], [ 5.1893e-02, -4.9488e-02, -5.5866e-02, ..., -2.9052e-05, -3.0425e-02, -1.2488e-02], ..., [ 3.8996e-02, 2.6471e-02, 6.0450e-02, ..., 3.1353e-02, 8.8967e-02, 1.8121e-02], [ 4.8972e-02, -7.5253e-02, -6.6376e-02, ..., -9.2968e-02, -3.5481e-02, 7.5510e-02], [-4.3719e-02, 6.6098e-02, 1.8009e-02, ..., -3.7854e-02, -3.2249e-02, 1.1444e-02]], device='cuda:0'), grad: tensor([[-2.1942e-06, -3.5584e-05, 2.7083e-06, ..., -7.1749e-06, 3.5968e-06, 3.1162e-06], [ 4.3102e-06, -5.4359e-05, 4.4674e-05, ..., 1.1288e-05, 2.6315e-05, -1.3769e-04], [ 1.4439e-05, 1.4715e-05, 1.8314e-05, ..., 6.1840e-06, 1.5393e-05, 1.0341e-05], ..., [-1.0681e-03, -2.5415e-04, -1.6079e-03, ..., -3.9673e-04, -1.2474e-03, 3.2261e-06], [ 3.1084e-05, 5.8919e-05, 2.8700e-05, ..., 1.0252e-05, 2.0742e-05, 8.2672e-05], [ 9.4461e-04, 2.2328e-04, 1.3971e-03, ..., 3.5357e-04, 1.0967e-03, 2.0370e-05]], device='cuda:0') Epoch 66, bias, value: tensor([-0.0077, -0.0068, -0.0034, -0.0119, -0.0133, -0.0062, 0.0108, 0.0018, 0.0230, -0.0020], device='cuda:0'), grad: tensor([-3.3855e-05, -1.2422e-04, 4.9472e-05, 1.1826e-04, 3.6925e-05, 2.0236e-05, -1.0383e-04, -1.5221e-03, 1.8656e-04, 1.3733e-03], device='cuda:0') 100 0.0001 changing lr epoch 65, time 214.75, cls_loss 0.0121 cls_loss_mapping 0.0171 cls_loss_causal 0.6589 re_mapping 0.0136 re_causal 0.0390 /// teacc 98.64 lr 0.00010000 Epoch 67, weight, value: tensor([[-9.4084e-02, 5.6880e-02, 5.7173e-03, ..., -6.5814e-03, -3.1736e-02, -5.2937e-02], [ 3.1793e-02, 4.7365e-02, -6.8117e-02, ..., -6.5509e-02, -1.7528e-02, 5.9585e-02], [ 5.2018e-02, -4.9698e-02, -5.6268e-02, ..., 2.7254e-05, -3.0388e-02, -1.2801e-02], ..., [ 3.9262e-02, 2.5856e-02, 6.0914e-02, ..., 3.1889e-02, 8.9248e-02, 1.7148e-02], [ 4.9730e-02, -7.5219e-02, -6.6653e-02, ..., -9.3494e-02, -3.5677e-02, 7.7292e-02], [-4.4559e-02, 6.6297e-02, 1.7968e-02, ..., -3.8633e-02, -3.2572e-02, 1.0745e-02]], device='cuda:0'), grad: tensor([[ 1.0110e-05, 5.8413e-05, 6.1154e-05, ..., 6.3837e-05, 1.1817e-05, 7.7114e-06], [ 3.6240e-05, 6.1393e-05, 1.0538e-04, ..., 8.6427e-05, 4.7892e-05, -1.3351e-05], [ 2.5138e-05, 7.3910e-05, 8.3506e-05, ..., 8.3268e-05, 2.9832e-05, 1.3001e-05], ..., [-1.5104e-04, 2.5034e-04, -7.2300e-05, ..., 1.2743e-04, -2.2149e-04, 1.2837e-05], [-3.8981e-05, 7.5459e-05, 2.8944e-04, ..., 3.9721e-04, 8.1658e-06, -1.9267e-05], [ 4.0442e-05, -8.3804e-05, -3.7849e-05, ..., 8.3327e-05, 6.9618e-05, 5.8301e-06]], device='cuda:0') Epoch 67, bias, value: tensor([-0.0075, -0.0073, -0.0033, -0.0121, -0.0128, -0.0060, 0.0109, 0.0016, 0.0238, -0.0024], device='cuda:0'), grad: tensor([ 0.0014, 0.0010, 0.0012, -0.0016, 0.0020, -0.0133, 0.0007, 0.0061, 0.0013, 0.0013], device='cuda:0') 100 0.0001 changing lr epoch 66, time 214.81, cls_loss 0.0109 cls_loss_mapping 0.0158 cls_loss_causal 0.6325 re_mapping 0.0141 re_causal 0.0367 /// teacc 98.68 lr 0.00010000 Epoch 68, weight, value: tensor([[-0.0948, 0.0571, 0.0054, ..., -0.0066, -0.0322, -0.0537], [ 0.0315, 0.0475, -0.0687, ..., -0.0662, -0.0179, 0.0595], [ 0.0517, -0.0507, -0.0574, ..., -0.0002, -0.0313, -0.0135], ..., [ 0.0400, 0.0262, 0.0618, ..., 0.0328, 0.0902, 0.0166], [ 0.0500, -0.0762, -0.0673, ..., -0.0946, -0.0359, 0.0782], [-0.0453, 0.0667, 0.0180, ..., -0.0396, -0.0330, 0.0107]], device='cuda:0'), grad: tensor([[ 5.0999e-06, 1.0878e-05, 7.9423e-06, ..., 5.9083e-06, 5.4576e-06, 1.9476e-05], [-1.2159e-04, -2.5606e-04, 2.3693e-05, ..., 2.5257e-05, -1.9765e-04, -7.3528e-04], [ 1.0765e-04, 8.5115e-05, 1.8334e-04, ..., 2.2244e-04, 5.8711e-05, 1.5545e-04], ..., [ 3.3617e-05, 7.1824e-05, 4.7207e-05, ..., 1.8090e-05, 5.4508e-05, 1.0985e-04], [ 3.5107e-05, 6.9499e-05, 5.9783e-05, ..., 6.2406e-05, 2.9176e-05, 1.2994e-04], [ 6.3330e-06, -3.3283e-04, -2.9469e-04, ..., 1.0379e-05, -9.7394e-05, 1.5691e-05]], device='cuda:0') Epoch 68, bias, value: tensor([-0.0076, -0.0077, -0.0036, -0.0123, -0.0127, -0.0055, 0.0110, 0.0020, 0.0235, -0.0023], device='cuda:0'), grad: tensor([ 5.5939e-05, -1.4257e-03, 5.8746e-04, -4.3416e-04, 5.4789e-04, 1.9526e-04, 2.7561e-04, 2.7657e-04, 3.4571e-04, -4.2415e-04], device='cuda:0') 100 0.0001 changing lr epoch 67, time 214.68, cls_loss 0.0116 cls_loss_mapping 0.0155 cls_loss_causal 0.6524 re_mapping 0.0135 re_causal 0.0361 /// teacc 98.79 lr 0.00010000 Epoch 69, weight, value: tensor([[-0.0955, 0.0572, 0.0053, ..., -0.0065, -0.0327, -0.0546], [ 0.0310, 0.0479, -0.0698, ..., -0.0665, -0.0183, 0.0598], [ 0.0519, -0.0513, -0.0577, ..., -0.0003, -0.0315, -0.0137], ..., [ 0.0410, 0.0269, 0.0631, ..., 0.0333, 0.0917, 0.0165], [ 0.0502, -0.0773, -0.0681, ..., -0.0956, -0.0363, 0.0790], [-0.0462, 0.0668, 0.0177, ..., -0.0405, -0.0339, 0.0105]], device='cuda:0'), grad: tensor([[ 7.1749e-06, -2.4140e-06, 1.0736e-05, ..., 4.6194e-06, 1.1288e-05, 1.9241e-06], [-1.1816e-03, -1.8013e-04, -1.1883e-03, ..., -1.9178e-03, -2.4033e-03, -2.1644e-06], [ 9.5940e-04, 1.6081e-04, 1.0014e-03, ..., 1.4763e-03, 1.8845e-03, 1.5117e-05], ..., [-2.2566e-04, -8.2910e-05, -4.0340e-04, ..., 3.0547e-05, -1.4603e-04, -8.6010e-05], [ 5.2303e-05, 7.7665e-05, 1.3137e-04, ..., 3.1918e-05, 6.0409e-05, 4.3035e-05], [ 5.2273e-05, -2.7561e-04, -2.3007e-04, ..., 2.6360e-05, 7.2896e-05, -1.4162e-04]], device='cuda:0') Epoch 69, bias, value: tensor([-0.0079, -0.0077, -0.0040, -0.0125, -0.0129, -0.0048, 0.0113, 0.0027, 0.0229, -0.0026], device='cuda:0'), grad: tensor([ 3.6545e-06, -4.7455e-03, 3.6755e-03, 1.1730e-03, 3.4869e-05, 3.3826e-05, 1.5602e-05, 2.1860e-05, 1.9765e-04, -4.0603e-04], device='cuda:0') 100 0.0001 changing lr epoch 68, time 214.88, cls_loss 0.0116 cls_loss_mapping 0.0173 cls_loss_causal 0.6437 re_mapping 0.0137 re_causal 0.0377 /// teacc 98.71 lr 0.00010000 Epoch 70, weight, value: tensor([[-9.5797e-02, 5.7788e-02, 6.6002e-03, ..., -6.4182e-03, -3.3385e-02, -5.5504e-02], [ 3.1207e-02, 4.8217e-02, -6.9822e-02, ..., -6.6095e-02, -1.7887e-02, 6.0344e-02], [ 5.2696e-02, -5.2102e-02, -5.7815e-02, ..., -3.6206e-05, -3.1531e-02, -1.2868e-02], ..., [ 4.1461e-02, 2.7217e-02, 6.3732e-02, ..., 3.3716e-02, 9.2492e-02, 1.6250e-02], [ 4.9684e-02, -7.7900e-02, -6.9060e-02, ..., -9.6234e-02, -3.7097e-02, 7.8802e-02], [-4.6658e-02, 6.7132e-02, 1.7915e-02, ..., -4.1135e-02, -3.3683e-02, 1.0405e-02]], device='cuda:0'), grad: tensor([[ 9.0823e-06, 6.8881e-06, 4.8392e-06, ..., 1.1874e-06, 5.1335e-06, 1.4916e-05], [-3.1263e-05, 3.7217e-04, 1.7333e-04, ..., 3.5577e-06, 1.1241e-04, 1.2107e-05], [ 1.2457e-05, 2.0906e-05, 6.0610e-06, ..., -4.5411e-06, 1.1958e-05, 3.3677e-05], ..., [-2.7686e-05, 1.4782e-04, -1.1094e-05, ..., -2.5094e-05, 3.0715e-06, 7.1883e-05], [-6.1691e-05, -4.5925e-05, 6.2697e-06, ..., 2.7250e-06, 5.7071e-06, -3.1543e-04], [ 1.9342e-05, -7.9060e-04, -3.0565e-04, ..., 1.1012e-05, -1.6642e-04, -1.9252e-04]], device='cuda:0') Epoch 70, bias, value: tensor([-0.0077, -0.0073, -0.0037, -0.0125, -0.0133, -0.0054, 0.0117, 0.0029, 0.0223, -0.0027], device='cuda:0'), grad: tensor([ 2.9817e-05, 3.5238e-04, 3.7223e-05, 1.5426e-04, 5.4985e-05, 2.0266e-04, 2.0432e-04, 1.4651e-04, -3.6955e-04, -8.1301e-04], device='cuda:0') 100 0.0001 changing lr epoch 69, time 214.85, cls_loss 0.0095 cls_loss_mapping 0.0135 cls_loss_causal 0.6541 re_mapping 0.0128 re_causal 0.0376 /// teacc 98.77 lr 0.00010000 Epoch 71, weight, value: tensor([[-0.0963, 0.0580, 0.0063, ..., -0.0062, -0.0339, -0.0561], [ 0.0312, 0.0485, -0.0701, ..., -0.0665, -0.0178, 0.0605], [ 0.0524, -0.0527, -0.0587, ..., -0.0003, -0.0318, -0.0134], ..., [ 0.0421, 0.0278, 0.0646, ..., 0.0343, 0.0935, 0.0161], [ 0.0499, -0.0783, -0.0696, ..., -0.0967, -0.0374, 0.0794], [-0.0472, 0.0671, 0.0179, ..., -0.0417, -0.0342, 0.0100]], device='cuda:0'), grad: tensor([[ 1.6645e-05, 2.9709e-06, 1.5721e-06, ..., -8.8755e-07, 7.3574e-07, 3.3349e-05], [ 1.0677e-05, 5.1320e-05, 5.8971e-06, ..., 5.7258e-06, 3.2261e-06, 5.8021e-07], [-1.7798e-04, -5.0575e-05, 2.9951e-06, ..., -6.8545e-05, -5.2810e-05, -7.3574e-07], ..., [ 7.9334e-05, 2.2531e-05, 8.6129e-06, ..., 4.7356e-05, 4.4137e-05, 5.7332e-06], [-5.2512e-05, -3.9190e-05, 3.7197e-06, ..., 6.9663e-06, 1.0571e-06, -1.8442e-04], [ 6.9559e-05, 1.9467e-04, 5.0664e-06, ..., 2.5302e-05, 3.6228e-07, 4.3422e-05]], device='cuda:0') Epoch 71, bias, value: tensor([-0.0078, -0.0073, -0.0041, -0.0124, -0.0135, -0.0054, 0.0122, 0.0036, 0.0223, -0.0030], device='cuda:0'), grad: tensor([-2.4676e-05, 2.2268e-04, -8.8453e-04, 3.0088e-04, 1.8954e-04, -5.7125e-04, 3.5000e-04, 1.7214e-04, -2.6393e-04, 5.0831e-04], device='cuda:0') 100 0.0001 changing lr epoch 70, time 215.06, cls_loss 0.0093 cls_loss_mapping 0.0134 cls_loss_causal 0.6893 re_mapping 0.0127 re_causal 0.0378 /// teacc 98.68 lr 0.00010000 Epoch 72, weight, value: tensor([[-0.0968, 0.0581, 0.0057, ..., -0.0063, -0.0345, -0.0565], [ 0.0308, 0.0482, -0.0707, ..., -0.0668, -0.0181, 0.0604], [ 0.0522, -0.0536, -0.0594, ..., -0.0001, -0.0323, -0.0140], ..., [ 0.0427, 0.0279, 0.0650, ..., 0.0343, 0.0942, 0.0164], [ 0.0501, -0.0788, -0.0703, ..., -0.0975, -0.0377, 0.0804], [-0.0472, 0.0674, 0.0182, ..., -0.0422, -0.0345, 0.0100]], device='cuda:0'), grad: tensor([[ 4.5300e-06, 3.0920e-06, 8.4564e-06, ..., 4.6119e-06, 3.2689e-06, 2.2091e-06], [ 7.3552e-05, 4.6372e-05, 5.6356e-05, ..., 2.6882e-05, 8.0466e-05, 2.2464e-06], [ 3.1686e-04, 4.9204e-05, 1.8239e-04, ..., 1.4746e-04, 2.7514e-04, 1.0066e-05], ..., [-4.3964e-04, -1.9789e-04, -2.8872e-04, ..., -1.5759e-04, -4.6039e-04, -4.3176e-06], [-1.4566e-05, 2.9057e-05, 4.1217e-05, ..., 1.6481e-05, 5.4017e-06, -3.8415e-05], [ 1.3411e-05, 3.8803e-05, -7.8604e-06, ..., 1.0207e-05, 4.2379e-05, 3.6880e-06]], device='cuda:0') Epoch 72, bias, value: tensor([-0.0079, -0.0077, -0.0043, -0.0125, -0.0136, -0.0050, 0.0122, 0.0038, 0.0222, -0.0028], device='cuda:0'), grad: tensor([ 1.5110e-05, 1.3900e-04, 2.7275e-04, 1.4752e-05, 5.1647e-05, 3.8803e-05, 1.1645e-05, -6.4182e-04, 2.0236e-05, 7.8380e-05], device='cuda:0') 100 0.0001 changing lr epoch 71, time 214.90, cls_loss 0.0088 cls_loss_mapping 0.0123 cls_loss_causal 0.6377 re_mapping 0.0128 re_causal 0.0374 /// teacc 98.69 lr 0.00010000 Epoch 73, weight, value: tensor([[-0.0973, 0.0580, 0.0053, ..., -0.0068, -0.0349, -0.0569], [ 0.0307, 0.0483, -0.0717, ..., -0.0662, -0.0185, 0.0607], [ 0.0517, -0.0546, -0.0596, ..., -0.0004, -0.0327, -0.0148], ..., [ 0.0430, 0.0280, 0.0649, ..., 0.0340, 0.0946, 0.0167], [ 0.0509, -0.0787, -0.0707, ..., -0.0979, -0.0378, 0.0814], [-0.0470, 0.0680, 0.0188, ..., -0.0426, -0.0342, 0.0096]], device='cuda:0'), grad: tensor([[-1.6959e-06, -6.9588e-06, 2.1346e-06, ..., 3.5577e-07, -2.5574e-06, 4.4629e-06], [ 8.5175e-05, 1.6046e-04, 2.0564e-06, ..., 1.6978e-06, 1.9693e-04, -3.7346e-07], [-7.4506e-05, -1.5306e-04, 2.7083e-06, ..., -1.5898e-06, -1.9455e-04, 1.4551e-05], ..., [-1.3955e-05, 1.3366e-05, -8.7991e-06, ..., -1.2420e-05, -1.6019e-05, 7.1786e-06], [-1.2589e-04, 1.0423e-05, 1.9521e-06, ..., 1.8999e-06, 2.1048e-06, -1.9085e-04], [ 2.0504e-05, 3.8600e-04, -2.4125e-05, ..., 2.5108e-06, 7.9814e-07, 2.3639e-04]], device='cuda:0') Epoch 73, bias, value: tensor([-0.0082, -0.0074, -0.0051, -0.0124, -0.0140, -0.0047, 0.0123, 0.0034, 0.0226, -0.0023], device='cuda:0'), grad: tensor([-2.0303e-06, 4.3654e-04, -3.9864e-04, 1.0622e-04, -6.7091e-04, -7.3016e-05, 2.6107e-04, 1.6972e-05, -3.4857e-04, 6.7377e-04], device='cuda:0') 100 0.0001 changing lr epoch 72, time 214.92, cls_loss 0.0094 cls_loss_mapping 0.0141 cls_loss_causal 0.6511 re_mapping 0.0134 re_causal 0.0380 /// teacc 98.73 lr 0.00010000 Epoch 74, weight, value: tensor([[-0.0983, 0.0584, 0.0049, ..., -0.0070, -0.0359, -0.0574], [ 0.0306, 0.0481, -0.0723, ..., -0.0675, -0.0186, 0.0612], [ 0.0509, -0.0547, -0.0611, ..., -0.0013, -0.0347, -0.0150], ..., [ 0.0438, 0.0286, 0.0657, ..., 0.0355, 0.0960, 0.0159], [ 0.0521, -0.0795, -0.0705, ..., -0.0981, -0.0370, 0.0824], [-0.0474, 0.0675, 0.0190, ..., -0.0428, -0.0347, 0.0088]], device='cuda:0'), grad: tensor([[ 1.7704e-06, -1.7130e-04, 3.4347e-06, ..., 1.8291e-06, 2.6654e-06, 2.9150e-06], [-2.5239e-07, 1.7248e-06, 9.3281e-06, ..., 3.4552e-06, 4.0010e-06, -8.4266e-06], [ 2.2091e-06, 6.8508e-06, 5.1744e-06, ..., -1.1176e-07, 3.3639e-06, 3.1777e-06], ..., [-5.7042e-05, -3.1382e-05, -6.9022e-05, ..., -2.3693e-05, -9.4235e-05, -1.2964e-05], [ 3.2485e-06, 1.9550e-05, 1.4067e-05, ..., 7.2531e-06, 3.9898e-06, 2.1979e-05], [ 1.8820e-05, -6.7949e-05, -9.1553e-05, ..., -1.7345e-05, -2.4408e-05, -1.8068e-07]], device='cuda:0') Epoch 74, bias, value: tensor([-0.0081, -0.0078, -0.0053, -0.0130, -0.0130, -0.0046, 0.0119, 0.0041, 0.0227, -0.0029], device='cuda:0'), grad: tensor([-2.7108e-04, 1.2726e-05, 1.3225e-05, -2.0251e-05, 1.4985e-04, -8.6606e-05, 2.7156e-04, -6.7651e-05, 8.4937e-05, -8.7440e-05], device='cuda:0') 100 0.0001 changing lr epoch 73, time 214.80, cls_loss 0.0112 cls_loss_mapping 0.0172 cls_loss_causal 0.6517 re_mapping 0.0124 re_causal 0.0359 /// teacc 98.63 lr 0.00010000 Epoch 75, weight, value: tensor([[-0.0987, 0.0589, 0.0044, ..., -0.0071, -0.0368, -0.0576], [ 0.0306, 0.0491, -0.0729, ..., -0.0685, -0.0182, 0.0622], [ 0.0515, -0.0553, -0.0617, ..., -0.0017, -0.0348, -0.0144], ..., [ 0.0440, 0.0287, 0.0673, ..., 0.0363, 0.0968, 0.0150], [ 0.0518, -0.0801, -0.0711, ..., -0.0986, -0.0376, 0.0827], [-0.0471, 0.0671, 0.0186, ..., -0.0430, -0.0351, 0.0083]], device='cuda:0'), grad: tensor([[ 6.0499e-06, -2.4334e-05, 1.2964e-06, ..., -6.8992e-06, 2.0172e-06, 5.1297e-06], [ 2.2352e-05, 6.6683e-06, 1.9535e-05, ..., 1.0237e-05, 2.8372e-05, -1.2271e-05], [-4.9084e-05, 6.4969e-06, 5.5954e-06, ..., -3.8177e-05, -1.1288e-05, 1.6838e-05], ..., [-2.4602e-05, -8.3372e-06, -4.0442e-05, ..., -2.9430e-07, -5.0396e-05, 1.0468e-05], [ 1.3985e-05, 1.1049e-05, 4.5970e-06, ..., 3.2067e-05, 1.1779e-05, -3.5673e-05], [ 2.5064e-05, 1.4216e-05, -8.9407e-07, ..., 1.3143e-05, 8.7470e-06, 2.2203e-06]], device='cuda:0') Epoch 75, bias, value: tensor([-0.0077, -0.0081, -0.0051, -0.0132, -0.0128, -0.0047, 0.0120, 0.0046, 0.0221, -0.0031], device='cuda:0'), grad: tensor([-4.2021e-05, 2.6390e-05, -3.5286e-05, -6.6161e-05, 5.5507e-06, -2.2814e-05, 3.3081e-05, -1.1891e-05, 2.6137e-05, 8.6904e-05], device='cuda:0') 100 0.0001 changing lr epoch 74, time 215.15, cls_loss 0.0081 cls_loss_mapping 0.0163 cls_loss_causal 0.6193 re_mapping 0.0127 re_causal 0.0374 /// teacc 98.58 lr 0.00010000 Epoch 76, weight, value: tensor([[-0.0992, 0.0593, 0.0043, ..., -0.0068, -0.0376, -0.0582], [ 0.0302, 0.0488, -0.0734, ..., -0.0687, -0.0186, 0.0620], [ 0.0516, -0.0554, -0.0619, ..., -0.0014, -0.0348, -0.0150], ..., [ 0.0445, 0.0296, 0.0681, ..., 0.0364, 0.0976, 0.0149], [ 0.0524, -0.0804, -0.0712, ..., -0.0990, -0.0376, 0.0838], [-0.0479, 0.0670, 0.0183, ..., -0.0439, -0.0359, 0.0081]], device='cuda:0'), grad: tensor([[ 2.5127e-06, 1.1707e-06, 1.5115e-06, ..., 5.0552e-06, 8.9221e-07, 6.4261e-07], [ 5.8338e-06, 8.2105e-06, 9.6336e-06, ..., 1.1720e-05, 6.0722e-06, -3.1460e-06], [-8.7768e-06, 8.0094e-06, 3.0752e-06, ..., -9.1419e-06, -2.7604e-06, 6.5751e-06], ..., [-6.0536e-06, 2.2352e-07, -9.2313e-06, ..., 3.9898e-06, -1.1474e-05, 2.2482e-06], [-3.7663e-06, 1.5885e-05, 5.0701e-06, ..., 9.6709e-06, 8.9314e-07, -1.1735e-05], [ 3.7551e-06, 3.5409e-06, 5.2433e-07, ..., 6.8918e-06, 2.8927e-06, 2.6561e-06]], device='cuda:0') Epoch 76, bias, value: tensor([-0.0073, -0.0087, -0.0049, -0.0136, -0.0131, -0.0047, 0.0122, 0.0053, 0.0224, -0.0035], device='cuda:0'), grad: tensor([ 1.2010e-05, 2.3037e-05, 1.3430e-06, -2.4533e-04, -9.5218e-06, 1.4472e-04, 3.2067e-05, 3.6880e-06, 2.2218e-05, 1.5363e-05], device='cuda:0') 100 0.0001 changing lr epoch 75, time 215.04, cls_loss 0.0077 cls_loss_mapping 0.0111 cls_loss_causal 0.6284 re_mapping 0.0125 re_causal 0.0356 /// teacc 98.72 lr 0.00010000 Epoch 77, weight, value: tensor([[-0.1000, 0.0596, 0.0041, ..., -0.0067, -0.0378, -0.0591], [ 0.0302, 0.0487, -0.0736, ..., -0.0685, -0.0185, 0.0622], [ 0.0519, -0.0555, -0.0617, ..., -0.0007, -0.0348, -0.0151], ..., [ 0.0450, 0.0297, 0.0686, ..., 0.0366, 0.0983, 0.0147], [ 0.0528, -0.0809, -0.0715, ..., -0.0996, -0.0378, 0.0849], [-0.0486, 0.0670, 0.0182, ..., -0.0444, -0.0362, 0.0077]], device='cuda:0'), grad: tensor([[ 1.3597e-05, 8.5831e-05, 7.6771e-05, ..., 1.8671e-05, 1.3225e-06, 5.4538e-05], [-1.8552e-06, 4.3660e-05, 4.4644e-05, ..., 1.1042e-05, 2.8778e-07, -1.5333e-05], [ 3.1352e-05, 6.6817e-05, 4.4495e-05, ..., 5.2810e-05, 8.5309e-07, 5.9038e-05], ..., [ 6.7428e-07, 2.8014e-04, 2.4533e-04, ..., 4.9211e-06, 2.0266e-04, 2.4870e-05], [ 9.3400e-05, 2.4962e-04, 1.2136e-04, ..., 3.4285e-04, 3.3341e-06, 6.4492e-05], [ 2.3656e-06, -1.0786e-03, -9.6416e-04, ..., 7.4506e-06, -2.4939e-04, -3.4237e-04]], device='cuda:0') Epoch 77, bias, value: tensor([-0.0071, -0.0087, -0.0045, -0.0136, -0.0131, -0.0050, 0.0121, 0.0054, 0.0226, -0.0038], device='cuda:0'), grad: tensor([ 2.2173e-04, 7.7367e-05, 1.9455e-04, 1.1677e-04, 1.5485e-04, 4.3559e-04, 2.8044e-05, 4.7708e-04, 5.8079e-04, -2.2869e-03], device='cuda:0') 100 0.0001 changing lr epoch 76, time 215.06, cls_loss 0.0086 cls_loss_mapping 0.0131 cls_loss_causal 0.6295 re_mapping 0.0118 re_causal 0.0340 /// teacc 98.71 lr 0.00010000 Epoch 78, weight, value: tensor([[-0.1014, 0.0595, 0.0038, ..., -0.0076, -0.0384, -0.0608], [ 0.0306, 0.0492, -0.0739, ..., -0.0685, -0.0180, 0.0629], [ 0.0514, -0.0560, -0.0622, ..., -0.0008, -0.0353, -0.0155], ..., [ 0.0450, 0.0290, 0.0685, ..., 0.0361, 0.0984, 0.0140], [ 0.0534, -0.0817, -0.0719, ..., -0.1004, -0.0381, 0.0857], [-0.0490, 0.0673, 0.0183, ..., -0.0448, -0.0364, 0.0073]], device='cuda:0'), grad: tensor([[ 3.3733e-06, 1.6987e-04, 7.4506e-05, ..., 1.9461e-05, 6.8592e-07, 2.1476e-06], [-3.0976e-06, -5.6297e-05, -5.1185e-06, ..., 6.4857e-06, -8.2925e-06, -5.1141e-05], [ 4.2981e-07, 1.3483e-04, 7.7128e-05, ..., 3.4481e-05, -4.7274e-06, 4.4294e-06], ..., [ 4.0419e-06, 4.8965e-05, 2.5362e-05, ..., 8.0839e-06, 2.1867e-06, 1.5810e-05], [ 3.0458e-05, 2.5466e-05, 1.7241e-05, ..., 1.0230e-05, 2.2314e-06, 2.3097e-06], [ 5.4762e-06, -9.4509e-04, -4.3654e-04, ..., -4.8399e-05, 2.1886e-07, -1.1297e-06]], device='cuda:0') Epoch 78, bias, value: tensor([-0.0081, -0.0081, -0.0048, -0.0131, -0.0132, -0.0044, 0.0118, 0.0050, 0.0223, -0.0036], device='cuda:0'), grad: tensor([ 0.0003, -0.0002, 0.0003, 0.0006, 0.0001, 0.0008, -0.0010, 0.0001, 0.0003, -0.0014], device='cuda:0') 100 0.0001 changing lr epoch 77, time 215.07, cls_loss 0.0065 cls_loss_mapping 0.0110 cls_loss_causal 0.5932 re_mapping 0.0124 re_causal 0.0350 /// teacc 98.68 lr 0.00010000 Epoch 79, weight, value: tensor([[-0.1018, 0.0595, 0.0036, ..., -0.0077, -0.0388, -0.0611], [ 0.0309, 0.0492, -0.0742, ..., -0.0685, -0.0179, 0.0630], [ 0.0512, -0.0562, -0.0626, ..., -0.0008, -0.0356, -0.0158], ..., [ 0.0452, 0.0290, 0.0689, ..., 0.0362, 0.0988, 0.0139], [ 0.0538, -0.0821, -0.0722, ..., -0.1007, -0.0381, 0.0863], [-0.0492, 0.0678, 0.0185, ..., -0.0450, -0.0361, 0.0072]], device='cuda:0'), grad: tensor([[ 1.0908e-05, -2.0973e-06, 5.8375e-06, ..., 5.4650e-06, 7.1116e-06, 7.4096e-06], [ 1.3697e-04, 2.5928e-05, 1.1516e-04, ..., 7.7426e-05, 1.6737e-04, -3.4720e-06], [-2.2009e-05, 7.7337e-06, 2.3857e-05, ..., -3.0726e-05, 2.2128e-05, 6.5714e-06], ..., [-2.8777e-04, -6.4790e-05, -2.8253e-04, ..., -1.6701e-04, -3.4499e-04, 1.1161e-05], [-1.5068e-04, -6.7353e-05, 1.2174e-05, ..., 2.5749e-05, 1.6883e-05, -3.4428e-04], [ 1.8820e-05, -3.0905e-05, -2.2635e-05, ..., 1.2644e-05, 1.3493e-05, 3.8780e-06]], device='cuda:0') Epoch 79, bias, value: tensor([-0.0080, -0.0082, -0.0046, -0.0133, -0.0129, -0.0043, 0.0113, 0.0051, 0.0220, -0.0034], device='cuda:0'), grad: tensor([ 1.9416e-05, 2.2316e-04, -6.8247e-05, 6.2227e-04, -2.3887e-05, 2.4423e-05, -3.1441e-05, -4.1318e-04, -3.3879e-04, -1.4596e-05], device='cuda:0') 100 0.0001 changing lr epoch 78, time 215.06, cls_loss 0.0082 cls_loss_mapping 0.0136 cls_loss_causal 0.6094 re_mapping 0.0122 re_causal 0.0341 /// teacc 98.73 lr 0.00010000 Epoch 80, weight, value: tensor([[-0.1029, 0.0588, 0.0033, ..., -0.0077, -0.0398, -0.0617], [ 0.0308, 0.0495, -0.0745, ..., -0.0689, -0.0183, 0.0637], [ 0.0513, -0.0569, -0.0632, ..., -0.0009, -0.0353, -0.0162], ..., [ 0.0459, 0.0291, 0.0693, ..., 0.0363, 0.0995, 0.0138], [ 0.0541, -0.0830, -0.0725, ..., -0.1007, -0.0383, 0.0867], [-0.0497, 0.0686, 0.0186, ..., -0.0453, -0.0365, 0.0066]], device='cuda:0'), grad: tensor([[ 3.9518e-05, 2.4036e-05, 1.4640e-06, ..., 9.6411e-06, 6.2212e-06, 6.0737e-05], [-3.8934e-04, -5.1737e-04, 3.4347e-06, ..., 2.1234e-06, -6.8665e-05, -7.3814e-04], [ 5.3376e-05, 9.6500e-05, 3.1516e-06, ..., -5.4657e-05, 1.7658e-05, 1.6117e-04], ..., [ 1.2904e-05, 2.7478e-05, -8.4490e-06, ..., 1.9819e-06, -9.1568e-06, 3.7789e-05], [ 1.9884e-04, 3.0684e-04, 6.5342e-06, ..., 2.3693e-05, 3.8415e-05, 3.5667e-04], [ 1.1601e-05, -5.4576e-06, -1.9029e-05, ..., 5.1707e-06, -2.7902e-06, 1.9297e-05]], device='cuda:0') Epoch 80, bias, value: tensor([-0.0088, -0.0082, -0.0047, -0.0139, -0.0128, -0.0042, 0.0113, 0.0055, 0.0219, -0.0030], device='cuda:0'), grad: tensor([ 1.0753e-04, -1.1368e-03, 5.2989e-05, 7.7009e-05, 3.6955e-05, 5.7489e-05, 5.0038e-05, 6.5029e-05, 6.8188e-04, 9.3058e-06], device='cuda:0') 100 0.0001 changing lr epoch 79, time 214.88, cls_loss 0.0094 cls_loss_mapping 0.0140 cls_loss_causal 0.6520 re_mapping 0.0118 re_causal 0.0348 /// teacc 98.73 lr 0.00010000 Epoch 81, weight, value: tensor([[-0.1040, 0.0592, 0.0041, ..., -0.0080, -0.0406, -0.0625], [ 0.0310, 0.0501, -0.0745, ..., -0.0691, -0.0181, 0.0646], [ 0.0518, -0.0576, -0.0632, ..., -0.0005, -0.0348, -0.0166], ..., [ 0.0457, 0.0286, 0.0693, ..., 0.0363, 0.0995, 0.0135], [ 0.0543, -0.0841, -0.0726, ..., -0.1011, -0.0386, 0.0872], [-0.0496, 0.0691, 0.0188, ..., -0.0457, -0.0366, 0.0063]], device='cuda:0'), grad: tensor([[ 1.8388e-05, -2.3901e-04, 1.1906e-05, ..., -2.2739e-05, 1.0930e-05, 3.7216e-06], [ 1.7118e-04, 2.5302e-05, 3.5912e-05, ..., 7.3254e-05, 1.4246e-04, -3.3509e-06], [-1.8561e-04, 4.7117e-05, 1.8284e-05, ..., -6.1691e-05, -9.3818e-05, -1.2647e-06], ..., [-1.4842e-04, -1.5080e-04, -2.8896e-04, ..., -9.1717e-06, -3.4404e-04, 3.0193e-06], [ 8.7619e-06, 1.9863e-05, 1.0245e-05, ..., 7.4469e-06, 4.5635e-06, 2.1663e-06], [ 9.6977e-05, 2.1291e-04, 1.5986e-04, ..., 1.6496e-05, 2.3055e-04, 8.6939e-07]], device='cuda:0') Epoch 81, bias, value: tensor([-0.0085, -0.0078, -0.0045, -0.0132, -0.0124, -0.0051, 0.0114, 0.0050, 0.0213, -0.0028], device='cuda:0'), grad: tensor([-4.6968e-04, 3.0351e-04, -2.3174e-04, 6.3837e-05, 5.5134e-05, 3.3706e-05, 5.2840e-05, -3.6907e-04, 5.3585e-05, 5.0783e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 80---------------------------------------------------- epoch 80, time 231.50, cls_loss 0.0073 cls_loss_mapping 0.0132 cls_loss_causal 0.6114 re_mapping 0.0116 re_causal 0.0325 /// teacc 98.84 lr 0.00010000 Epoch 82, weight, value: tensor([[-0.1048, 0.0603, 0.0038, ..., -0.0067, -0.0416, -0.0629], [ 0.0310, 0.0507, -0.0746, ..., -0.0695, -0.0178, 0.0646], [ 0.0519, -0.0581, -0.0635, ..., -0.0003, -0.0346, -0.0173], ..., [ 0.0457, 0.0279, 0.0695, ..., 0.0364, 0.0995, 0.0129], [ 0.0553, -0.0842, -0.0728, ..., -0.1016, -0.0384, 0.0888], [-0.0501, 0.0693, 0.0190, ..., -0.0461, -0.0367, 0.0059]], device='cuda:0'), grad: tensor([[ 3.1590e-06, -3.9823e-06, 6.5304e-06, ..., 4.0047e-06, 7.3668e-07, 1.3625e-06], [ 1.7695e-08, -2.5347e-05, 1.2204e-05, ..., 7.4841e-06, 3.0827e-07, -3.9965e-05], [ 5.2452e-05, 5.2713e-06, 1.0842e-04, ..., 6.7115e-05, 9.2909e-06, 4.7795e-06], ..., [ 1.4257e-04, 1.1273e-05, 2.9922e-04, ..., 1.8549e-04, 1.8895e-05, 1.2405e-05], [ 6.8098e-06, 4.8913e-06, 1.5274e-05, ..., 9.0152e-06, 1.8422e-06, 3.0454e-06], [ 1.9856e-06, 4.7311e-06, -7.1991e-07, ..., 2.2128e-06, 3.5996e-07, 7.2643e-06]], device='cuda:0') Epoch 82, bias, value: tensor([-0.0077, -0.0080, -0.0045, -0.0139, -0.0118, -0.0051, 0.0107, 0.0047, 0.0220, -0.0028], device='cuda:0'), grad: tensor([ 7.5623e-07, -6.3598e-05, 1.2046e-04, -4.4727e-04, 1.1146e-05, 7.3090e-06, 2.5034e-06, 3.2616e-04, 2.5824e-05, 1.6734e-05], device='cuda:0') 100 0.0001 changing lr epoch 81, time 215.03, cls_loss 0.0090 cls_loss_mapping 0.0148 cls_loss_causal 0.6320 re_mapping 0.0121 re_causal 0.0328 /// teacc 98.64 lr 0.00010000 Epoch 83, weight, value: tensor([[-0.1059, 0.0606, 0.0035, ..., -0.0072, -0.0426, -0.0637], [ 0.0301, 0.0507, -0.0759, ..., -0.0701, -0.0189, 0.0645], [ 0.0523, -0.0585, -0.0638, ..., 0.0003, -0.0342, -0.0175], ..., [ 0.0467, 0.0281, 0.0705, ..., 0.0367, 0.1007, 0.0132], [ 0.0557, -0.0846, -0.0732, ..., -0.1025, -0.0386, 0.0898], [-0.0506, 0.0698, 0.0193, ..., -0.0468, -0.0371, 0.0057]], device='cuda:0'), grad: tensor([[ 1.6084e-06, 5.5833e-07, 1.6587e-06, ..., 1.0571e-06, 9.4017e-07, 2.4252e-06], [-5.2869e-05, -6.4015e-05, 3.5539e-06, ..., 6.1914e-06, 3.9786e-06, -9.9599e-05], [ 4.4155e-04, 1.6719e-05, 4.2176e-04, ..., 3.3236e-04, 7.4530e-04, 1.1578e-05], ..., [-4.5013e-04, 7.9814e-07, -4.3225e-04, ..., -3.4308e-04, -7.6675e-04, 1.2226e-05], [ 1.2293e-05, 2.6748e-05, 4.1202e-06, ..., 4.9435e-06, 1.5134e-06, 2.1458e-05], [ 1.0692e-05, 1.7643e-05, 2.0981e-05, ..., 1.8179e-05, 4.2021e-06, 1.5825e-05]], device='cuda:0') Epoch 83, bias, value: tensor([-0.0076, -0.0090, -0.0040, -0.0142, -0.0122, -0.0054, 0.0110, 0.0053, 0.0221, -0.0027], device='cuda:0'), grad: tensor([ 1.0870e-05, -1.7822e-04, 7.2670e-04, -3.3200e-05, -1.4231e-05, 1.8582e-05, 5.2392e-05, -7.1335e-04, 6.9857e-05, 6.0618e-05], device='cuda:0') 100 0.0001 changing lr epoch 82, time 214.93, cls_loss 0.0071 cls_loss_mapping 0.0120 cls_loss_causal 0.5977 re_mapping 0.0121 re_causal 0.0341 /// teacc 98.60 lr 0.00010000 Epoch 84, weight, value: tensor([[-0.1066, 0.0606, 0.0031, ..., -0.0074, -0.0433, -0.0643], [ 0.0301, 0.0508, -0.0760, ..., -0.0704, -0.0190, 0.0650], [ 0.0518, -0.0593, -0.0648, ..., -0.0004, -0.0349, -0.0179], ..., [ 0.0476, 0.0283, 0.0713, ..., 0.0375, 0.1018, 0.0131], [ 0.0559, -0.0852, -0.0739, ..., -0.1033, -0.0389, 0.0902], [-0.0514, 0.0696, 0.0192, ..., -0.0475, -0.0379, 0.0051]], device='cuda:0'), grad: tensor([[-6.8219e-07, -3.4831e-06, 3.3174e-06, ..., 3.6787e-06, -7.4832e-07, 1.6550e-06], [ 4.0978e-06, 2.4572e-05, 4.0904e-06, ..., 4.7423e-06, 3.0808e-06, 4.7265e-07], [-4.0621e-05, -3.2216e-05, -6.7234e-05, ..., -2.6727e-04, -2.8342e-05, 1.2971e-05], ..., [ 3.4958e-05, 3.7730e-05, 2.6107e-05, ..., 2.5898e-05, 2.0906e-05, 1.8030e-05], [-2.3469e-05, 1.3851e-05, 7.7114e-06, ..., 1.4886e-05, -2.5295e-06, -2.8104e-05], [ 2.0340e-05, -5.9545e-05, -6.0260e-05, ..., 7.2718e-06, -1.0282e-06, 2.8521e-05]], device='cuda:0') Epoch 84, bias, value: tensor([-0.0080, -0.0089, -0.0048, -0.0142, -0.0116, -0.0050, 0.0114, 0.0059, 0.0218, -0.0032], device='cuda:0'), grad: tensor([ 5.4426e-06, 1.0514e-04, -3.8910e-04, 4.1866e-04, -2.3544e-04, -1.5521e-04, 8.7619e-05, 1.4770e-04, 3.7938e-05, -2.2978e-05], device='cuda:0') 100 0.0001 changing lr epoch 83, time 214.94, cls_loss 0.0094 cls_loss_mapping 0.0134 cls_loss_causal 0.6151 re_mapping 0.0117 re_causal 0.0323 /// teacc 98.73 lr 0.00010000 Epoch 85, weight, value: tensor([[-0.1078, 0.0607, 0.0026, ..., -0.0077, -0.0442, -0.0652], [ 0.0303, 0.0517, -0.0758, ..., -0.0712, -0.0185, 0.0659], [ 0.0513, -0.0604, -0.0666, ..., -0.0010, -0.0357, -0.0180], ..., [ 0.0482, 0.0282, 0.0719, ..., 0.0380, 0.1030, 0.0122], [ 0.0565, -0.0857, -0.0747, ..., -0.1040, -0.0394, 0.0910], [-0.0522, 0.0696, 0.0194, ..., -0.0480, -0.0386, 0.0047]], device='cuda:0'), grad: tensor([[ 5.7757e-05, 1.9491e-05, 3.0413e-05, ..., 2.0251e-05, 3.5256e-05, 3.5018e-05], [ 2.1338e-04, 3.8177e-05, 1.3685e-04, ..., 3.2216e-05, 1.5676e-04, 1.3125e-04], [ 1.4150e-04, 2.8938e-05, 3.8475e-05, ..., -6.1810e-05, 6.5446e-05, 1.7953e-04], ..., [ 9.8228e-04, 1.5390e-04, 6.8665e-04, ..., 1.4973e-04, 7.5817e-04, 5.7554e-04], [-2.8276e-04, -1.9968e-04, 1.3173e-04, ..., -3.3468e-05, 2.9624e-05, -3.9721e-04], [-1.7071e-03, -1.9598e-04, -1.2560e-03, ..., -1.8954e-04, -1.3876e-03, -9.5081e-04]], device='cuda:0') Epoch 85, bias, value: tensor([-0.0083, -0.0087, -0.0055, -0.0141, -0.0108, -0.0045, 0.0107, 0.0058, 0.0220, -0.0035], device='cuda:0'), grad: tensor([ 1.1903e-04, 4.8351e-04, 1.7512e-04, 4.4727e-04, 4.5013e-04, 1.0210e-04, 4.0084e-05, 2.4204e-03, -3.5137e-05, -4.2000e-03], device='cuda:0') 100 0.0001 changing lr epoch 84, time 214.85, cls_loss 0.0105 cls_loss_mapping 0.0155 cls_loss_causal 0.5995 re_mapping 0.0115 re_causal 0.0320 /// teacc 98.80 lr 0.00010000 Epoch 86, weight, value: tensor([[-0.1090, 0.0602, 0.0016, ..., -0.0082, -0.0454, -0.0657], [ 0.0304, 0.0512, -0.0759, ..., -0.0718, -0.0185, 0.0667], [ 0.0512, -0.0596, -0.0673, ..., -0.0010, -0.0360, -0.0180], ..., [ 0.0483, 0.0282, 0.0720, ..., 0.0373, 0.1035, 0.0121], [ 0.0575, -0.0858, -0.0746, ..., -0.1044, -0.0397, 0.0920], [-0.0523, 0.0714, 0.0199, ..., -0.0488, -0.0387, 0.0060]], device='cuda:0'), grad: tensor([[ 2.9206e-06, -2.2918e-05, 2.4419e-06, ..., -9.3430e-06, 2.2966e-06, 1.0384e-06], [ 1.4625e-05, -5.1931e-06, 2.2098e-05, ..., 1.3784e-05, 2.2471e-05, -3.8028e-05], [ 4.8423e-04, 1.9407e-04, 3.6526e-04, ..., 2.5272e-04, 3.6240e-04, 2.3633e-05], ..., [-5.7793e-04, -1.8823e-04, -4.3368e-04, ..., -2.9159e-04, -4.4107e-04, 1.3709e-06], [ 1.6585e-05, 1.1787e-05, 1.5542e-05, ..., 9.7305e-06, 1.3366e-05, 2.3004e-06], [ 8.4192e-06, 4.3869e-05, -2.0452e-06, ..., 5.8748e-06, 7.3276e-06, 2.8741e-06]], device='cuda:0') Epoch 86, bias, value: tensor([-0.0091, -0.0089, -0.0049, -0.0137, -0.0126, -0.0051, 0.0102, 0.0055, 0.0224, -0.0015], device='cuda:0'), grad: tensor([-5.1767e-05, -1.0192e-05, 8.5545e-04, 4.3988e-05, -1.6224e-04, 1.3627e-05, 4.8488e-05, -8.7070e-04, 3.9846e-05, 9.3102e-05], device='cuda:0') 100 0.0001 changing lr epoch 85, time 214.78, cls_loss 0.0070 cls_loss_mapping 0.0097 cls_loss_causal 0.6229 re_mapping 0.0112 re_causal 0.0327 /// teacc 98.78 lr 0.00010000 Epoch 87, weight, value: tensor([[-0.1096, 0.0611, 0.0017, ..., -0.0063, -0.0460, -0.0662], [ 0.0306, 0.0514, -0.0765, ..., -0.0722, -0.0183, 0.0673], [ 0.0511, -0.0604, -0.0674, ..., -0.0010, -0.0360, -0.0187], ..., [ 0.0485, 0.0281, 0.0722, ..., 0.0371, 0.1039, 0.0121], [ 0.0576, -0.0864, -0.0753, ..., -0.1052, -0.0400, 0.0926], [-0.0526, 0.0712, 0.0195, ..., -0.0495, -0.0394, 0.0055]], device='cuda:0'), grad: tensor([[ 2.5835e-06, -2.2948e-04, 9.7603e-07, ..., 1.4529e-06, 9.3691e-07, -5.4359e-05], [ 4.9127e-07, 3.9041e-06, 3.2205e-06, ..., 2.8778e-06, 1.4752e-06, -9.0823e-06], [-4.5925e-05, -1.2718e-05, 1.2601e-06, ..., -3.4422e-05, -1.8448e-05, 3.3267e-06], ..., [ 4.0978e-06, 1.7397e-06, -1.1154e-05, ..., 6.2324e-06, -4.0047e-07, 3.8594e-06], [ 2.6450e-06, 2.8700e-05, 1.5618e-06, ..., 7.6666e-06, 3.2764e-06, -5.9940e-06], [ 6.6571e-06, 1.9744e-05, 8.4415e-06, ..., 4.5523e-06, 4.9062e-06, 9.8348e-06]], device='cuda:0') Epoch 87, bias, value: tensor([-0.0078, -0.0087, -0.0052, -0.0137, -0.0126, -0.0050, 0.0100, 0.0053, 0.0220, -0.0018], device='cuda:0'), grad: tensor([-3.7336e-04, 6.9067e-06, -1.7571e-04, 3.3140e-05, 2.7597e-05, 3.4451e-05, 3.0375e-04, 2.4885e-05, 6.9499e-05, 4.8488e-05], device='cuda:0') 100 0.0001 changing lr epoch 86, time 215.13, cls_loss 0.0069 cls_loss_mapping 0.0112 cls_loss_causal 0.5931 re_mapping 0.0111 re_causal 0.0319 /// teacc 98.79 lr 0.00010000 Epoch 88, weight, value: tensor([[-0.1110, 0.0609, 0.0012, ..., -0.0070, -0.0467, -0.0677], [ 0.0304, 0.0517, -0.0770, ..., -0.0725, -0.0183, 0.0678], [ 0.0513, -0.0608, -0.0678, ..., -0.0010, -0.0362, -0.0190], ..., [ 0.0490, 0.0282, 0.0732, ..., 0.0376, 0.1049, 0.0122], [ 0.0578, -0.0871, -0.0760, ..., -0.1062, -0.0404, 0.0931], [-0.0529, 0.0712, 0.0191, ..., -0.0513, -0.0401, 0.0053]], device='cuda:0'), grad: tensor([[ 6.6459e-06, 6.9151e-07, 3.9376e-06, ..., 2.9895e-06, 1.4147e-06, 3.9637e-06], [-3.6974e-06, -4.7944e-06, 1.1250e-06, ..., 1.3970e-06, -1.5227e-06, -8.6203e-06], [ 5.8621e-05, 3.1888e-06, 3.7640e-05, ..., 1.1116e-05, -8.9183e-06, 5.1320e-05], ..., [ 4.4882e-05, -5.3504e-07, 1.5810e-05, ..., 2.2426e-05, 1.6838e-05, 1.5289e-05], [-1.4496e-04, 4.4294e-06, -7.9811e-05, ..., -4.8429e-05, -1.8120e-05, -8.0109e-05], [ 2.3674e-06, 1.0204e-04, 1.8075e-05, ..., 8.6203e-06, 2.4904e-06, 1.2374e-04]], device='cuda:0') Epoch 88, bias, value: tensor([-0.0082, -0.0083, -0.0053, -0.0131, -0.0128, -0.0052, 0.0101, 0.0057, 0.0217, -0.0021], device='cuda:0'), grad: tensor([ 1.4171e-05, -1.2368e-05, 1.0520e-04, 6.8784e-05, -1.9395e-04, -3.0071e-05, -1.6600e-05, 7.5579e-05, -2.1935e-04, 2.0790e-04], device='cuda:0') 100 0.0001 changing lr epoch 87, time 215.02, cls_loss 0.0071 cls_loss_mapping 0.0099 cls_loss_causal 0.6009 re_mapping 0.0112 re_causal 0.0325 /// teacc 98.74 lr 0.00010000 Epoch 89, weight, value: tensor([[-0.1122, 0.0605, 0.0008, ..., -0.0070, -0.0472, -0.0687], [ 0.0300, 0.0521, -0.0774, ..., -0.0730, -0.0185, 0.0684], [ 0.0514, -0.0611, -0.0679, ..., -0.0008, -0.0360, -0.0199], ..., [ 0.0497, 0.0279, 0.0733, ..., 0.0377, 0.1055, 0.0125], [ 0.0580, -0.0878, -0.0760, ..., -0.1073, -0.0406, 0.0938], [-0.0536, 0.0721, 0.0195, ..., -0.0518, -0.0403, 0.0048]], device='cuda:0'), grad: tensor([[-1.0091e-04, -1.4603e-04, 6.1654e-06, ..., 2.0400e-05, 6.4634e-07, -6.3241e-05], [-6.7711e-05, -8.5115e-05, -5.1647e-05, ..., 1.5646e-05, -6.6340e-05, -1.2910e-04], [ 1.9744e-05, 3.2157e-05, 3.9369e-05, ..., 9.8169e-05, 2.1264e-05, 1.0058e-05], ..., [-1.3225e-05, 1.1623e-05, -2.1383e-05, ..., -2.3291e-05, -2.3827e-05, 1.9863e-05], [ 1.4269e-04, 5.3740e-04, 3.9649e-04, ..., 6.0177e-04, 3.9972e-06, 4.5776e-04], [ 5.6028e-05, 9.5248e-05, 4.8161e-05, ..., 1.7807e-05, 5.2005e-05, 1.0008e-04]], device='cuda:0') Epoch 89, bias, value: tensor([-0.0089, -0.0081, -0.0054, -0.0130, -0.0135, -0.0049, 0.0104, 0.0059, 0.0212, -0.0015], device='cuda:0'), grad: tensor([-3.4761e-04, -1.8466e-04, 1.4138e-04, -1.5907e-03, 3.0851e-04, -9.8133e-04, 5.6458e-04, 3.3211e-06, 1.8568e-03, 2.3007e-04], device='cuda:0') 100 0.0001 changing lr epoch 88, time 214.74, cls_loss 0.0096 cls_loss_mapping 0.0159 cls_loss_causal 0.6375 re_mapping 0.0112 re_causal 0.0314 /// teacc 98.64 lr 0.00010000 Epoch 90, weight, value: tensor([[-0.1131, 0.0606, 0.0001, ..., -0.0071, -0.0489, -0.0697], [ 0.0307, 0.0518, -0.0776, ..., -0.0732, -0.0181, 0.0690], [ 0.0517, -0.0613, -0.0675, ..., -0.0002, -0.0365, -0.0200], ..., [ 0.0500, 0.0289, 0.0748, ..., 0.0378, 0.1071, 0.0119], [ 0.0583, -0.0885, -0.0767, ..., -0.1080, -0.0408, 0.0946], [-0.0546, 0.0717, 0.0185, ..., -0.0524, -0.0412, 0.0044]], device='cuda:0'), grad: tensor([[ 4.6268e-06, 2.9970e-06, 6.0759e-06, ..., 3.5577e-06, 6.2920e-06, 2.6431e-06], [-2.1681e-06, 2.6360e-05, 9.0152e-06, ..., 7.6517e-06, 7.5027e-06, -1.9427e-06], [ 4.7356e-05, 2.1979e-05, 3.4362e-05, ..., 8.0988e-06, 2.9147e-05, 1.3880e-05], ..., [-8.4400e-05, 2.0459e-05, -4.8041e-05, ..., -9.5069e-06, -7.4983e-05, 2.2307e-05], [ 1.5527e-05, 2.5302e-05, 2.4423e-05, ..., 4.7266e-05, 6.2473e-06, 2.2456e-05], [ 9.5814e-06, 6.6876e-05, 2.3529e-05, ..., 1.5691e-05, 1.3337e-05, 4.9978e-05]], device='cuda:0') Epoch 90, bias, value: tensor([-0.0091, -0.0080, -0.0047, -0.0130, -0.0133, -0.0050, 0.0103, 0.0066, 0.0209, -0.0025], device='cuda:0'), grad: tensor([ 1.9252e-05, 9.8288e-05, 9.6440e-05, -3.3736e-04, -5.9843e-04, 3.7432e-04, 1.1325e-05, 8.4192e-06, 9.2387e-05, 2.3401e-04], device='cuda:0') 100 0.0001 changing lr epoch 89, time 215.06, cls_loss 0.0063 cls_loss_mapping 0.0101 cls_loss_causal 0.5909 re_mapping 0.0110 re_causal 0.0315 /// teacc 98.71 lr 0.00010000 Epoch 91, weight, value: tensor([[-1.1349e-01, 6.1119e-02, -8.3829e-05, ..., -6.9841e-03, -4.9288e-02, -7.0300e-02], [ 3.0311e-02, 5.1560e-02, -7.8058e-02, ..., -7.3620e-02, -1.8435e-02, 6.8914e-02], [ 5.1805e-02, -6.1402e-02, -6.8099e-02, ..., -2.2289e-04, -3.6480e-02, -2.0132e-02], ..., [ 5.0269e-02, 2.8600e-02, 7.5160e-02, ..., 3.8152e-02, 1.0761e-01, 1.1717e-02], [ 5.8803e-02, -8.8817e-02, -7.6880e-02, ..., -1.0861e-01, -4.0962e-02, 9.5204e-02], [-5.4935e-02, 7.1068e-02, 1.8568e-02, ..., -5.2845e-02, -4.1878e-02, 3.8089e-03]], device='cuda:0'), grad: tensor([[ 3.4869e-06, 1.1558e-06, 1.3234e-06, ..., 1.0999e-06, 1.2405e-06, 1.8319e-06], [ 1.8790e-05, -3.5372e-06, 1.9401e-05, ..., 2.7437e-06, -2.6468e-06, -7.0751e-05], [-3.4541e-05, -1.2420e-05, 9.8944e-06, ..., 3.2037e-07, 2.0757e-05, 4.0591e-05], ..., [-4.0174e-05, 3.3323e-06, -4.1008e-05, ..., -8.3521e-06, -4.3660e-05, 1.0036e-05], [ 2.8595e-05, 1.5408e-05, 5.5544e-06, ..., 2.8573e-06, 4.0159e-06, 5.6699e-06], [ 1.2010e-05, 3.2842e-05, -3.6741e-07, ..., 3.8445e-06, 4.5784e-06, 4.4554e-05]], device='cuda:0') Epoch 91, bias, value: tensor([-0.0087, -0.0086, -0.0046, -0.0134, -0.0122, -0.0044, 0.0107, 0.0065, 0.0207, -0.0034], device='cuda:0'), grad: tensor([ 4.4167e-05, -1.1420e-04, -4.1533e-04, 1.2651e-05, -6.1452e-05, 1.1809e-05, 3.7905e-06, -3.2455e-05, 3.5310e-04, 1.9801e-04], device='cuda:0') 100 0.0001 changing lr epoch 90, time 215.07, cls_loss 0.0068 cls_loss_mapping 0.0091 cls_loss_causal 0.6113 re_mapping 0.0107 re_causal 0.0312 /// teacc 98.74 lr 0.00010000 Epoch 92, weight, value: tensor([[-0.1140, 0.0608, -0.0003, ..., -0.0070, -0.0498, -0.0708], [ 0.0310, 0.0524, -0.0776, ..., -0.0743, -0.0181, 0.0703], [ 0.0525, -0.0617, -0.0687, ..., -0.0002, -0.0361, -0.0207], ..., [ 0.0496, 0.0279, 0.0752, ..., 0.0386, 0.1075, 0.0105], [ 0.0595, -0.0892, -0.0773, ..., -0.1091, -0.0412, 0.0962], [-0.0553, 0.0717, 0.0191, ..., -0.0533, -0.0418, 0.0036]], device='cuda:0'), grad: tensor([[ 8.6278e-06, -3.7644e-06, 5.3942e-06, ..., 2.9095e-06, 9.5442e-06, 2.0163e-07], [ 1.3247e-05, 1.8058e-06, 8.7246e-06, ..., 7.8827e-06, 1.5378e-05, -5.2974e-06], [-1.5616e-04, -1.7405e-05, -2.9355e-05, ..., -5.8323e-05, -1.4293e-04, 1.7276e-06], ..., [ 3.8028e-05, 1.4892e-06, -1.7956e-05, ..., 1.9357e-05, 2.3782e-05, 3.6461e-07], [ 1.0878e-05, 2.0415e-06, 5.9567e-06, ..., 6.8769e-06, 1.5408e-05, -2.8387e-06], [ 2.2486e-05, 1.0312e-05, 2.0593e-05, ..., 6.3367e-06, 1.9327e-05, 8.4564e-07]], device='cuda:0') Epoch 92, bias, value: tensor([-0.0091, -0.0081, -0.0039, -0.0140, -0.0126, -0.0042, 0.0104, 0.0059, 0.0213, -0.0030], device='cuda:0'), grad: tensor([ 1.4238e-05, 3.3885e-05, -3.7384e-04, 9.1970e-05, 1.2100e-05, 2.2724e-05, 4.1574e-06, 9.9957e-05, 3.6895e-05, 5.7667e-05], device='cuda:0') 100 0.0001 changing lr epoch 91, time 215.05, cls_loss 0.0056 cls_loss_mapping 0.0104 cls_loss_causal 0.5917 re_mapping 0.0102 re_causal 0.0309 /// teacc 98.78 lr 0.00010000 Epoch 93, weight, value: tensor([[-1.1490e-01, 6.1446e-02, -4.2594e-04, ..., -7.0450e-03, -5.0205e-02, -7.1270e-02], [ 3.0594e-02, 5.2816e-02, -7.7803e-02, ..., -7.4437e-02, -1.8102e-02, 7.0136e-02], [ 5.3685e-02, -6.2480e-02, -6.8219e-02, ..., 9.7826e-05, -3.5198e-02, -2.0515e-02], ..., [ 4.9382e-02, 2.7617e-02, 7.5143e-02, ..., 3.8585e-02, 1.0728e-01, 1.0451e-02], [ 5.9562e-02, -9.0459e-02, -7.7996e-02, ..., -1.0960e-01, -4.1483e-02, 9.6639e-02], [-5.5637e-02, 7.1233e-02, 1.9046e-02, ..., -5.3717e-02, -4.2198e-02, 3.2040e-03]], device='cuda:0'), grad: tensor([[ 5.9232e-07, -2.3985e-04, 4.6939e-07, ..., 5.6345e-08, 4.8243e-07, 5.3924e-07], [ 6.1274e-05, 3.0994e-06, 5.2094e-05, ..., 1.4968e-05, 6.3837e-05, 1.2718e-05], [ 9.2015e-06, 5.9158e-06, 7.1935e-06, ..., 1.9930e-06, 8.5086e-06, 2.4308e-06], ..., [-7.8797e-05, -2.2165e-06, -6.8724e-05, ..., -1.9431e-05, -8.3387e-05, -1.6034e-05], [-1.5693e-07, 1.8682e-06, 2.0862e-06, ..., 7.6182e-07, 1.8161e-06, -1.5432e-06], [ 5.8897e-06, 2.3949e-04, 4.0010e-06, ..., 1.5013e-06, 5.6513e-06, 2.2966e-06]], device='cuda:0') Epoch 93, bias, value: tensor([-0.0088, -0.0086, -0.0030, -0.0142, -0.0123, -0.0036, 0.0105, 0.0057, 0.0207, -0.0035], device='cuda:0'), grad: tensor([-3.8481e-04, 9.8169e-05, 3.5435e-05, 4.2804e-06, -2.6114e-06, 3.6895e-05, -1.0693e-04, -1.1885e-04, 2.9609e-05, 4.0841e-04], device='cuda:0') 100 0.0001 changing lr epoch 92, time 214.53, cls_loss 0.0078 cls_loss_mapping 0.0117 cls_loss_causal 0.6235 re_mapping 0.0104 re_causal 0.0310 /// teacc 98.74 lr 0.00010000 Epoch 94, weight, value: tensor([[-0.1160, 0.0617, -0.0009, ..., -0.0073, -0.0511, -0.0719], [ 0.0293, 0.0531, -0.0799, ..., -0.0763, -0.0199, 0.0703], [ 0.0529, -0.0635, -0.0696, ..., -0.0010, -0.0363, -0.0213], ..., [ 0.0500, 0.0285, 0.0758, ..., 0.0403, 0.1089, 0.0103], [ 0.0596, -0.0912, -0.0787, ..., -0.1102, -0.0418, 0.0967], [-0.0538, 0.0704, 0.0199, ..., -0.0541, -0.0415, 0.0037]], device='cuda:0'), grad: tensor([[ 9.0292e-07, -2.8968e-05, 1.3113e-05, ..., 1.2806e-07, 9.2993e-07, 4.4331e-07], [ 2.0210e-06, 1.2964e-06, 8.5607e-06, ..., 1.9427e-06, 6.1952e-06, -6.5193e-06], [ 1.2107e-08, 4.0084e-06, 1.0274e-05, ..., -1.5832e-06, 1.4780e-06, 3.5111e-06], ..., [-1.7926e-05, 1.2830e-05, 4.1351e-06, ..., -8.6129e-06, -2.5168e-05, -7.1898e-07], [ 6.0396e-07, 5.7817e-06, 2.1551e-06, ..., 1.4268e-06, 1.3588e-06, -1.2591e-06], [ 5.7481e-06, -2.9966e-05, -7.6473e-05, ..., 2.3209e-06, 7.6443e-06, 2.0433e-06]], device='cuda:0') Epoch 94, bias, value: tensor([-0.0089, -0.0095, -0.0038, -0.0143, -0.0115, -0.0037, 0.0110, 0.0065, 0.0203, -0.0034], device='cuda:0'), grad: tensor([-5.7101e-05, 3.6694e-06, 9.8571e-06, 4.3094e-05, 1.9148e-05, -3.8773e-05, 7.8917e-05, 2.1815e-05, 2.0742e-05, -1.0163e-04], device='cuda:0') 100 0.0001 changing lr epoch 93, time 214.42, cls_loss 0.0058 cls_loss_mapping 0.0085 cls_loss_causal 0.5990 re_mapping 0.0103 re_causal 0.0301 /// teacc 98.83 lr 0.00010000 Epoch 95, weight, value: tensor([[-0.1166, 0.0623, -0.0008, ..., -0.0074, -0.0520, -0.0726], [ 0.0292, 0.0532, -0.0804, ..., -0.0765, -0.0201, 0.0707], [ 0.0528, -0.0639, -0.0698, ..., -0.0010, -0.0364, -0.0222], ..., [ 0.0502, 0.0282, 0.0762, ..., 0.0403, 0.1094, 0.0102], [ 0.0599, -0.0913, -0.0791, ..., -0.1107, -0.0419, 0.0981], [-0.0541, 0.0703, 0.0199, ..., -0.0546, -0.0419, 0.0038]], device='cuda:0'), grad: tensor([[ 8.2375e-07, -2.7791e-05, 1.6829e-06, ..., 7.0687e-07, 4.2170e-06, 4.9127e-07], [-2.6450e-07, -5.4855e-07, 3.1367e-06, ..., 5.8347e-07, 1.8561e-06, -8.2701e-06], [-2.7474e-06, 4.6305e-06, 1.5497e-06, ..., -9.8422e-06, -1.2405e-05, 3.1721e-06], ..., [ 2.1625e-04, 1.1575e-04, 4.8208e-04, ..., 1.0096e-06, 3.1400e-04, 2.3264e-06], [-1.0118e-05, 1.3500e-05, 9.8571e-06, ..., -2.4494e-06, 2.7325e-06, -1.9580e-05], [-2.2566e-04, -1.7285e-04, -5.5313e-04, ..., 2.8461e-06, -3.2091e-04, -1.3039e-08]], device='cuda:0') Epoch 95, bias, value: tensor([-0.0083, -0.0094, -0.0040, -0.0146, -0.0112, -0.0036, 0.0111, 0.0065, 0.0207, -0.0040], device='cuda:0'), grad: tensor([-6.1750e-05, -3.3416e-06, -4.4376e-05, 3.5316e-05, 1.3089e-04, 3.6657e-05, 9.8720e-06, 6.0940e-04, 1.4305e-05, -7.2718e-04], device='cuda:0') 100 0.0001 changing lr epoch 94, time 214.70, cls_loss 0.0076 cls_loss_mapping 0.0116 cls_loss_causal 0.6043 re_mapping 0.0102 re_causal 0.0302 /// teacc 98.82 lr 0.00010000 Epoch 96, weight, value: tensor([[-0.1179, 0.0638, -0.0005, ..., -0.0075, -0.0539, -0.0747], [ 0.0296, 0.0535, -0.0802, ..., -0.0765, -0.0195, 0.0720], [ 0.0526, -0.0646, -0.0704, ..., -0.0011, -0.0368, -0.0243], ..., [ 0.0502, 0.0280, 0.0758, ..., 0.0398, 0.1095, 0.0099], [ 0.0613, -0.0939, -0.0807, ..., -0.1110, -0.0424, 0.0989], [-0.0543, 0.0706, 0.0203, ..., -0.0551, -0.0420, 0.0043]], device='cuda:0'), grad: tensor([[ 1.5059e-06, -7.7300e-08, 3.3341e-07, ..., 2.0023e-07, 3.1758e-07, 5.4995e-07], [-2.6133e-06, -8.7470e-06, 1.0552e-06, ..., 6.2259e-07, -1.1243e-05, -2.6643e-05], [ 5.0897e-07, 1.2890e-06, -4.6566e-09, ..., -2.6189e-06, -7.1432e-07, 2.7679e-06], ..., [-1.1241e-06, 4.5262e-06, -4.5262e-06, ..., -1.1362e-07, 1.8552e-06, 1.1541e-05], [ 5.0008e-05, 2.5313e-06, 1.4203e-06, ..., 1.4445e-06, 1.2647e-06, -2.2864e-07], [ 3.4738e-06, 1.1928e-05, -1.1334e-06, ..., 1.3364e-06, 8.6278e-06, 1.4663e-05]], device='cuda:0') Epoch 96, bias, value: tensor([-0.0076, -0.0085, -0.0047, -0.0137, -0.0111, -0.0040, 0.0100, 0.0060, 0.0202, -0.0036], device='cuda:0'), grad: tensor([ 4.0680e-05, -2.9728e-05, 5.3719e-06, 2.2352e-06, -1.2644e-05, 1.3514e-03, -2.8839e-03, 1.6481e-05, 1.4477e-03, 6.2048e-05], device='cuda:0') 100 0.0001 changing lr epoch 95, time 215.15, cls_loss 0.0073 cls_loss_mapping 0.0107 cls_loss_causal 0.6109 re_mapping 0.0099 re_causal 0.0303 /// teacc 98.79 lr 0.00010000 Epoch 97, weight, value: tensor([[-0.1189, 0.0638, -0.0009, ..., -0.0075, -0.0549, -0.0754], [ 0.0283, 0.0540, -0.0811, ..., -0.0768, -0.0200, 0.0716], [ 0.0522, -0.0656, -0.0713, ..., -0.0015, -0.0376, -0.0255], ..., [ 0.0512, 0.0279, 0.0764, ..., 0.0403, 0.1107, 0.0105], [ 0.0617, -0.0940, -0.0816, ..., -0.1114, -0.0429, 0.0999], [-0.0538, 0.0704, 0.0205, ..., -0.0560, -0.0420, 0.0041]], device='cuda:0'), grad: tensor([[ 9.4771e-06, 3.2652e-06, 5.3160e-06, ..., 1.1781e-06, 3.2745e-06, 1.1742e-05], [ 1.9029e-05, 4.8894e-07, 2.8193e-05, ..., 6.8136e-06, 1.6585e-05, -9.9186e-07], [ 1.2286e-05, 2.3302e-06, 1.7986e-05, ..., 2.4904e-06, 7.5288e-06, 4.8429e-06], ..., [-1.1826e-03, 5.9716e-06, -1.7252e-03, ..., -4.0889e-04, -1.0014e-03, -1.0067e-04], [-1.0431e-07, -1.6734e-05, 4.5389e-05, ..., 1.0647e-05, 2.6062e-05, -4.9978e-05], [ 1.0767e-03, 8.3327e-05, 1.5478e-03, ..., 3.7026e-04, 9.3031e-04, 1.5438e-04]], device='cuda:0') Epoch 97, bias, value: tensor([-0.0076, -0.0092, -0.0055, -0.0137, -0.0111, -0.0035, 0.0102, 0.0065, 0.0200, -0.0036], device='cuda:0'), grad: tensor([ 3.0264e-05, 3.5495e-05, 2.6271e-05, 9.1255e-05, -1.7846e-04, 4.5538e-05, -2.5406e-05, -2.1400e-03, -2.5064e-05, 2.1420e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 96---------------------------------------------------- epoch 96, time 231.73, cls_loss 0.0051 cls_loss_mapping 0.0064 cls_loss_causal 0.5691 re_mapping 0.0100 re_causal 0.0285 /// teacc 98.86 lr 0.00010000 Epoch 98, weight, value: tensor([[-0.1199, 0.0634, -0.0016, ..., -0.0078, -0.0556, -0.0786], [ 0.0286, 0.0539, -0.0808, ..., -0.0771, -0.0200, 0.0721], [ 0.0517, -0.0661, -0.0724, ..., -0.0016, -0.0384, -0.0259], ..., [ 0.0517, 0.0281, 0.0772, ..., 0.0411, 0.1118, 0.0102], [ 0.0622, -0.0936, -0.0817, ..., -0.1121, -0.0433, 0.1015], [-0.0544, 0.0705, 0.0201, ..., -0.0568, -0.0427, 0.0033]], device='cuda:0'), grad: tensor([[ 2.9989e-07, -8.0884e-05, 1.6764e-07, ..., -2.7955e-05, 3.6601e-07, 9.1968e-07], [ 4.7358e-07, 1.5600e-07, 3.7765e-07, ..., 2.9476e-07, 3.5670e-07, -1.6084e-06], [-1.7211e-06, 5.5693e-06, 3.9395e-07, ..., -6.9384e-07, -1.0962e-06, 4.9099e-06], ..., [-6.9244e-07, 1.7444e-06, -1.7332e-06, ..., -2.9523e-07, -1.6205e-06, 1.1493e-06], [-1.2018e-05, 8.7321e-06, 2.3078e-06, ..., 2.9523e-06, -4.3400e-06, -2.2024e-05], [ 1.8002e-06, 8.3819e-06, -5.9791e-07, ..., 2.5406e-06, 1.0636e-06, 1.1344e-06]], device='cuda:0') Epoch 98, bias, value: tensor([-0.0084, -0.0089, -0.0058, -0.0140, -0.0107, -0.0038, 0.0098, 0.0070, 0.0208, -0.0039], device='cuda:0'), grad: tensor([-2.1243e-04, 1.3672e-06, 3.7514e-06, 3.1684e-06, 1.7643e-05, 7.4744e-05, 8.4043e-05, 4.2655e-06, 2.5574e-06, 2.1055e-05], device='cuda:0') 100 0.0001 changing lr epoch 97, time 214.75, cls_loss 0.0058 cls_loss_mapping 0.0100 cls_loss_causal 0.5821 re_mapping 0.0102 re_causal 0.0297 /// teacc 98.70 lr 0.00010000 Epoch 99, weight, value: tensor([[-0.1207, 0.0631, -0.0021, ..., -0.0079, -0.0567, -0.0793], [ 0.0291, 0.0548, -0.0806, ..., -0.0774, -0.0189, 0.0730], [ 0.0516, -0.0668, -0.0725, ..., -0.0013, -0.0389, -0.0266], ..., [ 0.0520, 0.0277, 0.0777, ..., 0.0413, 0.1121, 0.0099], [ 0.0624, -0.0941, -0.0823, ..., -0.1129, -0.0437, 0.1021], [-0.0549, 0.0706, 0.0197, ..., -0.0577, -0.0433, 0.0028]], device='cuda:0'), grad: tensor([[ 1.2189e-05, 3.3641e-04, 1.4007e-04, ..., 3.4310e-06, 8.8736e-06, 1.3389e-05], [ 1.9073e-03, 1.1425e-03, 8.6880e-04, ..., 5.7779e-06, 1.6899e-03, 2.3632e-03], [-2.6627e-03, -1.5144e-03, -1.1663e-03, ..., -7.5340e-05, -2.2945e-03, -3.1700e-03], ..., [ 4.7421e-04, 3.0565e-04, 2.2030e-04, ..., 4.0792e-06, 4.1890e-04, 5.9843e-04], [ 1.0848e-04, 8.0764e-05, 5.8681e-05, ..., 1.6496e-05, 8.5652e-05, 1.2600e-04], [ 6.9976e-05, -2.3568e-04, -1.5450e-04, ..., 5.6177e-06, 6.0201e-05, 2.5439e-04]], device='cuda:0') Epoch 99, bias, value: tensor([-0.0088, -0.0080, -0.0058, -0.0137, -0.0108, -0.0039, 0.0098, 0.0071, 0.0205, -0.0042], device='cuda:0'), grad: tensor([ 5.5790e-04, 3.7460e-03, -5.5008e-03, 4.9591e-04, -4.8041e-04, -1.8552e-05, -4.2692e-06, 9.9277e-04, 3.2973e-04, -1.1545e-04], device='cuda:0') 100 0.0001 changing lr epoch 98, time 214.44, cls_loss 0.0064 cls_loss_mapping 0.0094 cls_loss_causal 0.5831 re_mapping 0.0103 re_causal 0.0286 /// teacc 98.77 lr 0.00010000 Epoch 100, weight, value: tensor([[-0.1219, 0.0630, -0.0026, ..., -0.0085, -0.0577, -0.0805], [ 0.0266, 0.0540, -0.0828, ..., -0.0776, -0.0220, 0.0704], [ 0.0535, -0.0665, -0.0719, ..., 0.0002, -0.0367, -0.0254], ..., [ 0.0528, 0.0281, 0.0784, ..., 0.0400, 0.1133, 0.0119], [ 0.0627, -0.0944, -0.0826, ..., -0.1134, -0.0432, 0.1026], [-0.0551, 0.0706, 0.0197, ..., -0.0584, -0.0436, 0.0023]], device='cuda:0'), grad: tensor([[ 1.0230e-05, 2.5965e-06, 1.0975e-05, ..., 1.7853e-06, 1.4745e-05, 1.0960e-05], [-7.4530e-04, -2.6298e-04, 8.9705e-06, ..., 2.6803e-06, -1.8573e-04, -1.6642e-03], [ 3.5197e-05, 8.3745e-06, 2.8491e-05, ..., 2.1651e-05, 4.3303e-05, 2.1189e-05], ..., [-5.2750e-05, -3.9162e-07, -4.9621e-05, ..., -5.0455e-05, -1.3494e-04, 5.2661e-05], [ 4.9305e-04, 1.8644e-04, 1.9908e-05, ..., 2.8946e-06, 1.4031e-04, 1.0109e-03], [ 2.1875e-04, 6.0230e-05, -1.5795e-04, ..., 1.6943e-05, 1.3113e-04, 4.5395e-04]], device='cuda:0') Epoch 100, bias, value: tensor([-0.0092, -0.0102, -0.0038, -0.0131, -0.0111, -0.0026, 0.0094, 0.0075, 0.0196, -0.0042], device='cuda:0'), grad: tensor([ 2.4885e-05, -2.3937e-03, 7.7546e-05, 1.9157e-04, -5.2035e-05, -3.8475e-05, 1.7929e-04, 1.4573e-05, 1.5125e-03, 4.8208e-04], device='cuda:0') 100 0.0001 changing lr epoch 99, time 214.79, cls_loss 0.0052 cls_loss_mapping 0.0083 cls_loss_causal 0.6080 re_mapping 0.0100 re_causal 0.0307 /// teacc 98.85 lr 0.00010000 Epoch 101, weight, value: tensor([[-1.2243e-01, 6.3021e-02, -2.6671e-03, ..., -8.7930e-03, -5.8369e-02, -8.0959e-02], [ 2.6563e-02, 5.4652e-02, -8.2717e-02, ..., -7.7451e-02, -2.1994e-02, 7.0961e-02], [ 5.3388e-02, -6.6882e-02, -7.2237e-02, ..., 1.0859e-04, -3.6889e-02, -2.6186e-02], ..., [ 5.2747e-02, 2.7832e-02, 7.8331e-02, ..., 4.0245e-02, 1.1347e-01, 1.1596e-02], [ 6.3279e-02, -9.4487e-02, -8.2951e-02, ..., -1.1371e-01, -4.3377e-02, 1.0349e-01], [-5.4881e-02, 7.1502e-02, 2.1022e-02, ..., -5.9062e-02, -4.2726e-02, 1.6109e-03]], device='cuda:0'), grad: tensor([[ 2.4419e-06, -1.2964e-06, 1.0589e-06, ..., -5.0478e-07, 3.2736e-07, 1.9018e-06], [ 8.2282e-07, 9.4529e-07, 2.0824e-06, ..., 3.2829e-07, 7.3807e-07, -2.4568e-06], [ 4.1611e-06, 3.2969e-06, 3.9004e-06, ..., 3.5856e-06, 2.5351e-06, 1.6950e-06], ..., [ 1.8224e-05, 1.2979e-05, 4.4107e-05, ..., -9.4064e-07, 2.5719e-05, 4.1723e-06], [ 7.9349e-06, 9.7975e-06, 8.3596e-06, ..., 7.1758e-07, 4.8168e-06, 3.5111e-06], [ 3.0160e-05, -7.3388e-06, -1.2577e-04, ..., 3.8557e-07, -4.6462e-05, 5.0902e-05]], device='cuda:0') Epoch 101, bias, value: tensor([-0.0095, -0.0097, -0.0042, -0.0134, -0.0119, -0.0028, 0.0098, 0.0071, 0.0197, -0.0031], device='cuda:0'), grad: tensor([ 2.8480e-06, 4.9137e-06, 1.8403e-05, 1.7136e-05, -3.8147e-04, 2.9132e-06, 2.3693e-05, 7.8559e-05, 6.1512e-05, 1.7166e-04], device='cuda:0') 100 0.0001 changing lr epoch 100, time 214.83, cls_loss 0.0051 cls_loss_mapping 0.0087 cls_loss_causal 0.5521 re_mapping 0.0103 re_causal 0.0288 /// teacc 98.68 lr 0.00010000 Epoch 102, weight, value: tensor([[-0.1231, 0.0632, -0.0029, ..., -0.0090, -0.0592, -0.0813], [ 0.0261, 0.0548, -0.0837, ..., -0.0789, -0.0227, 0.0711], [ 0.0531, -0.0673, -0.0726, ..., 0.0002, -0.0373, -0.0264], ..., [ 0.0525, 0.0279, 0.0784, ..., 0.0408, 0.1140, 0.0108], [ 0.0649, -0.0947, -0.0824, ..., -0.1130, -0.0424, 0.1051], [-0.0543, 0.0713, 0.0216, ..., -0.0599, -0.0424, 0.0010]], device='cuda:0'), grad: tensor([[ 3.5902e-07, -2.8759e-05, 4.2515e-07, ..., -2.7064e-06, 3.6322e-07, 1.1818e-06], [ 4.5076e-06, 6.1141e-07, 4.9435e-06, ..., 7.7672e-07, 4.7386e-06, -1.0997e-04], [-7.0315e-08, 3.8790e-07, -7.5018e-07, ..., -1.7807e-06, -2.0806e-06, 2.3872e-05], ..., [-2.2203e-05, 2.2836e-06, -2.3365e-05, ..., -1.5721e-06, -2.1830e-05, 2.0817e-05], [ 3.9898e-06, 1.4296e-06, 6.9328e-06, ..., 1.2089e-06, 5.8301e-06, 1.5900e-05], [ 1.1519e-05, 1.8030e-05, 1.0058e-05, ..., 2.6170e-06, 1.1496e-05, 2.4289e-05]], device='cuda:0') Epoch 102, bias, value: tensor([-0.0093, -0.0100, -0.0046, -0.0124, -0.0116, -0.0037, 0.0096, 0.0069, 0.0203, -0.0032], device='cuda:0'), grad: tensor([-4.1872e-05, -2.2173e-04, 4.4405e-05, 1.5736e-05, 2.3052e-05, 1.6004e-05, 1.6853e-05, 4.1962e-05, 3.9011e-05, 6.6638e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 101---------------------------------------------------- epoch 101, time 231.01, cls_loss 0.0065 cls_loss_mapping 0.0099 cls_loss_causal 0.6120 re_mapping 0.0098 re_causal 0.0283 /// teacc 98.91 lr 0.00010000 Epoch 103, weight, value: tensor([[-0.1239, 0.0637, -0.0034, ..., -0.0095, -0.0605, -0.0808], [ 0.0261, 0.0543, -0.0841, ..., -0.0791, -0.0225, 0.0712], [ 0.0534, -0.0679, -0.0727, ..., 0.0010, -0.0373, -0.0262], ..., [ 0.0524, 0.0280, 0.0787, ..., 0.0410, 0.1141, 0.0105], [ 0.0649, -0.0952, -0.0833, ..., -0.1148, -0.0429, 0.1054], [-0.0538, 0.0710, 0.0217, ..., -0.0611, -0.0423, 0.0008]], device='cuda:0'), grad: tensor([[ 5.6345e-07, -1.8135e-05, 9.8627e-07, ..., -1.1260e-06, 4.0233e-07, 2.1048e-07], [ 5.6066e-07, 0.0000e+00, 1.2061e-06, ..., 6.6776e-07, 9.0152e-07, -9.5088e-07], [ 6.3032e-06, 2.9206e-06, 6.4969e-05, ..., 7.5758e-05, -1.4249e-07, 1.4538e-06], ..., [-9.5069e-06, 1.5590e-06, -6.3851e-06, ..., 1.4538e-06, -1.1191e-05, -3.8557e-06], [ 7.8157e-06, 6.4671e-06, 9.2536e-06, ..., 4.2580e-06, 4.7982e-06, 2.4717e-06], [ 8.7824e-07, -6.5677e-06, -1.5661e-05, ..., 3.3937e-06, 2.8871e-06, 4.5635e-07]], device='cuda:0') Epoch 103, bias, value: tensor([-0.0090, -0.0104, -0.0040, -0.0130, -0.0101, -0.0031, 0.0081, 0.0066, 0.0197, -0.0032], device='cuda:0'), grad: tensor([-2.8223e-05, 1.9521e-06, 7.1585e-05, -7.5877e-05, 2.8074e-05, -1.3679e-05, 9.1121e-06, -5.0776e-06, 3.5703e-05, -2.3678e-05], device='cuda:0') 100 0.0001 changing lr epoch 102, time 214.92, cls_loss 0.0062 cls_loss_mapping 0.0104 cls_loss_causal 0.5684 re_mapping 0.0103 re_causal 0.0278 /// teacc 98.72 lr 0.00010000 Epoch 104, weight, value: tensor([[-0.1247, 0.0646, -0.0036, ..., -0.0094, -0.0613, -0.0812], [ 0.0258, 0.0539, -0.0848, ..., -0.0793, -0.0226, 0.0712], [ 0.0537, -0.0682, -0.0729, ..., 0.0014, -0.0372, -0.0265], ..., [ 0.0526, 0.0279, 0.0791, ..., 0.0410, 0.1144, 0.0103], [ 0.0649, -0.0956, -0.0839, ..., -0.1157, -0.0432, 0.1056], [-0.0536, 0.0715, 0.0221, ..., -0.0616, -0.0425, 0.0016]], device='cuda:0'), grad: tensor([[ 9.3505e-07, -5.8800e-05, 4.1388e-06, ..., -2.3082e-05, 2.4643e-06, 7.6182e-07], [ 3.0547e-06, -1.7779e-06, 4.3325e-06, ..., 2.4773e-06, 2.4736e-06, -3.9376e-06], [ 2.3007e-05, 3.4198e-06, 1.9521e-05, ..., 1.1355e-05, 1.4558e-05, 9.7230e-06], ..., [-7.5877e-05, 2.1994e-05, -1.2314e-04, ..., -3.8803e-05, -6.5684e-05, 6.9290e-06], [-6.0797e-05, -3.3155e-07, 7.6070e-06, ..., -4.1611e-06, -2.6271e-05, -5.8979e-05], [ 6.7532e-05, 1.3605e-05, 4.7266e-05, ..., 2.9087e-05, 4.2856e-05, 3.1114e-05]], device='cuda:0') Epoch 104, bias, value: tensor([-0.0082, -0.0106, -0.0037, -0.0133, -0.0104, -0.0034, 0.0082, 0.0064, 0.0192, -0.0025], device='cuda:0'), grad: tensor([-1.0371e-04, 1.5814e-06, 5.0753e-05, 4.0054e-05, 1.9908e-05, 2.6360e-05, 2.7478e-05, -1.0484e-04, -1.0526e-04, 1.4746e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 103---------------------------------------------------- epoch 103, time 232.48, cls_loss 0.0041 cls_loss_mapping 0.0069 cls_loss_causal 0.5946 re_mapping 0.0098 re_causal 0.0282 /// teacc 98.99 lr 0.00010000 Epoch 105, weight, value: tensor([[-0.1264, 0.0646, -0.0039, ..., -0.0097, -0.0619, -0.0820], [ 0.0256, 0.0542, -0.0854, ..., -0.0799, -0.0227, 0.0715], [ 0.0532, -0.0686, -0.0737, ..., 0.0012, -0.0375, -0.0275], ..., [ 0.0534, 0.0277, 0.0801, ..., 0.0418, 0.1151, 0.0105], [ 0.0652, -0.0957, -0.0847, ..., -0.1163, -0.0437, 0.1061], [-0.0540, 0.0713, 0.0221, ..., -0.0626, -0.0431, 0.0008]], device='cuda:0'), grad: tensor([[ 8.6799e-07, -3.2336e-05, 6.5099e-07, ..., -1.3523e-06, 8.4471e-07, 8.9314e-07], [ 3.1274e-06, -7.8231e-07, 2.9225e-06, ..., 5.6215e-06, 4.9360e-06, -4.7795e-06], [-4.9174e-05, 2.4773e-06, -1.9804e-05, ..., -5.0485e-05, -4.4137e-05, 2.3320e-06], ..., [ 3.2604e-05, 5.3272e-06, 1.3351e-05, ..., 3.3289e-05, 3.0547e-05, 5.9195e-06], [-5.5321e-06, 3.0845e-06, 1.7369e-06, ..., -4.2282e-06, 2.6301e-06, -1.6287e-05], [ 1.4352e-06, 7.5400e-05, 2.1860e-05, ..., 2.6692e-06, 2.1473e-05, 5.4479e-05]], device='cuda:0') Epoch 105, bias, value: tensor([-0.0086, -0.0107, -0.0041, -0.0136, -0.0102, -0.0031, 0.0089, 0.0070, 0.0190, -0.0031], device='cuda:0'), grad: tensor([-5.8353e-05, 4.6231e-06, -8.2135e-05, 4.6164e-05, -1.4043e-04, 5.4091e-06, 3.7290e-06, 7.0035e-05, -1.4819e-05, 1.6582e-04], device='cuda:0') 100 0.0001 changing lr epoch 104, time 214.64, cls_loss 0.0060 cls_loss_mapping 0.0097 cls_loss_causal 0.5615 re_mapping 0.0097 re_causal 0.0274 /// teacc 98.86 lr 0.00010000 Epoch 106, weight, value: tensor([[-0.1281, 0.0645, -0.0045, ..., -0.0096, -0.0628, -0.0826], [ 0.0260, 0.0549, -0.0852, ..., -0.0802, -0.0226, 0.0725], [ 0.0529, -0.0690, -0.0742, ..., 0.0011, -0.0375, -0.0285], ..., [ 0.0538, 0.0274, 0.0806, ..., 0.0426, 0.1154, 0.0102], [ 0.0655, -0.0961, -0.0854, ..., -0.1168, -0.0440, 0.1067], [-0.0546, 0.0710, 0.0221, ..., -0.0642, -0.0437, -0.0003]], device='cuda:0'), grad: tensor([[ 3.6180e-05, 3.4004e-05, 8.0094e-06, ..., 1.8477e-06, 6.5342e-06, 8.8811e-05], [ 3.5971e-05, 1.7273e-04, 1.8096e-04, ..., 3.4850e-06, 1.3459e-04, 1.1940e-06], [ 9.0674e-06, 1.4357e-05, 2.0787e-06, ..., -2.9624e-05, -1.2919e-05, 3.6985e-05], ..., [-8.9407e-05, -4.0722e-04, -4.3941e-04, ..., -1.3085e-06, -3.2282e-04, 5.3179e-07], [-6.6698e-05, -4.6939e-05, 6.2175e-06, ..., 3.0380e-06, 5.6662e-06, -1.7607e-04], [ 4.5121e-05, 1.5008e-04, 1.4317e-04, ..., 3.6433e-06, 1.0788e-04, 4.3780e-05]], device='cuda:0') Epoch 106, bias, value: tensor([-0.0087, -0.0101, -0.0047, -0.0137, -0.0095, -0.0032, 0.0095, 0.0071, 0.0189, -0.0038], device='cuda:0'), grad: tensor([ 2.1243e-04, 4.1437e-04, 3.2365e-05, 1.4710e-04, 8.0764e-05, -5.5507e-06, 8.8811e-06, -9.5701e-04, -3.5667e-04, 4.2272e-04], device='cuda:0') 100 0.0001 changing lr epoch 105, time 214.34, cls_loss 0.0057 cls_loss_mapping 0.0074 cls_loss_causal 0.5831 re_mapping 0.0099 re_causal 0.0279 /// teacc 98.88 lr 0.00010000 Epoch 107, weight, value: tensor([[-1.2887e-01, 6.4654e-02, -5.2609e-03, ..., -9.8035e-03, -6.4007e-02, -8.3582e-02], [ 2.5826e-02, 5.5312e-02, -8.5839e-02, ..., -8.1350e-02, -2.2914e-02, 7.3066e-02], [ 5.2679e-02, -7.0137e-02, -7.5388e-02, ..., 7.3491e-04, -3.8034e-02, -2.8988e-02], ..., [ 5.3936e-02, 2.8049e-02, 8.1363e-02, ..., 4.3794e-02, 1.1619e-01, 9.7921e-03], [ 6.5762e-02, -9.6593e-02, -8.5960e-02, ..., -1.1750e-01, -4.4540e-02, 1.0723e-01], [-5.3784e-02, 7.1393e-02, 2.2733e-02, ..., -6.4972e-02, -4.3043e-02, 9.7140e-05]], device='cuda:0'), grad: tensor([[ 1.4901e-07, -1.1064e-05, 2.1700e-07, ..., 3.7439e-07, 2.4308e-07, -7.7710e-06], [ 8.5682e-08, 1.4603e-06, 5.5321e-07, ..., 3.9954e-07, 2.0303e-07, -2.0117e-06], [-0.0000e+00, 3.7253e-06, 1.2228e-06, ..., 2.5332e-07, -5.5600e-07, 2.0489e-06], ..., [ 4.9360e-07, 2.3283e-06, 4.2804e-06, ..., 4.8690e-06, 9.8255e-07, 7.2084e-07], [ 3.0175e-07, 7.0743e-06, 1.8133e-06, ..., 1.7295e-06, 6.2305e-07, 4.1351e-06], [ 4.7404e-07, 5.2433e-07, -3.2093e-06, ..., 5.5786e-07, 4.3679e-07, 1.3923e-06]], device='cuda:0') Epoch 107, bias, value: tensor([-0.0090, -0.0100, -0.0052, -0.0143, -0.0100, -0.0032, 0.0096, 0.0072, 0.0188, -0.0029], device='cuda:0'), grad: tensor([-5.8621e-05, 8.2403e-06, 1.8075e-05, -7.2643e-07, -4.0263e-05, -1.6302e-05, 2.5615e-05, 1.4529e-05, 3.9399e-05, 1.0058e-05], device='cuda:0') 100 0.0001 changing lr epoch 106, time 214.56, cls_loss 0.0040 cls_loss_mapping 0.0072 cls_loss_causal 0.5961 re_mapping 0.0093 re_causal 0.0281 /// teacc 98.92 lr 0.00010000 Epoch 108, weight, value: tensor([[-1.2956e-01, 6.5283e-02, -5.5275e-03, ..., -9.7910e-03, -6.4728e-02, -8.3802e-02], [ 2.5746e-02, 5.5225e-02, -8.5800e-02, ..., -8.1675e-02, -2.2938e-02, 7.3292e-02], [ 5.2822e-02, -7.0412e-02, -7.5613e-02, ..., 7.4171e-04, -3.7971e-02, -2.9272e-02], ..., [ 5.3924e-02, 2.8354e-02, 8.1749e-02, ..., 4.4249e-02, 1.1659e-01, 9.5030e-03], [ 6.5823e-02, -9.6769e-02, -8.6358e-02, ..., -1.1824e-01, -4.4775e-02, 1.0756e-01], [-5.3801e-02, 7.1343e-02, 2.2677e-02, ..., -6.5766e-02, -4.3392e-02, 3.6780e-05]], device='cuda:0'), grad: tensor([[ 1.5330e-06, -1.8813e-07, 5.0664e-07, ..., 1.0030e-06, 6.2771e-07, 1.6773e-06], [ 1.2271e-05, 6.7614e-06, 1.5348e-05, ..., 1.1116e-05, 2.3037e-05, -2.2724e-05], [ 2.4866e-07, -4.3064e-05, -1.0669e-05, ..., -6.7353e-05, -6.1691e-06, 1.4454e-05], ..., [-4.6581e-05, -7.9572e-06, -3.6806e-05, ..., -2.3901e-05, -5.3585e-05, 3.5111e-07], [-2.3156e-05, -1.0855e-05, 1.8142e-06, ..., 2.4103e-06, 2.2091e-06, -3.9369e-05], [ 1.5169e-05, 3.2991e-05, 1.1638e-05, ..., 3.9995e-05, 1.3225e-05, 8.0988e-06]], device='cuda:0') Epoch 108, bias, value: tensor([-0.0083, -0.0100, -0.0054, -0.0143, -0.0100, -0.0030, 0.0095, 0.0071, 0.0184, -0.0030], device='cuda:0'), grad: tensor([ 4.6939e-06, 2.8852e-06, -1.4842e-04, 1.1772e-04, -1.1683e-05, 3.9816e-05, -2.3339e-06, -5.1707e-05, -7.8082e-05, 1.2743e-04], device='cuda:0') 100 0.0001 changing lr epoch 107, time 214.22, cls_loss 0.0045 cls_loss_mapping 0.0080 cls_loss_causal 0.5918 re_mapping 0.0097 re_causal 0.0281 /// teacc 98.86 lr 0.00010000 Epoch 109, weight, value: tensor([[-0.1307, 0.0654, -0.0058, ..., -0.0100, -0.0654, -0.0842], [ 0.0260, 0.0559, -0.0860, ..., -0.0816, -0.0227, 0.0738], [ 0.0528, -0.0709, -0.0757, ..., 0.0013, -0.0381, -0.0298], ..., [ 0.0542, 0.0279, 0.0819, ..., 0.0442, 0.1168, 0.0094], [ 0.0655, -0.0973, -0.0870, ..., -0.1190, -0.0451, 0.1078], [-0.0544, 0.0714, 0.0225, ..., -0.0673, -0.0439, -0.0008]], device='cuda:0'), grad: tensor([[ 2.2855e-06, 6.4075e-07, 2.5686e-06, ..., 4.8801e-07, 1.4817e-06, 2.3097e-07], [ 2.3339e-06, -1.3597e-07, 2.8852e-06, ..., 1.1204e-06, 1.7742e-06, -1.7490e-06], [ 9.5129e-05, 1.4752e-06, 7.7546e-05, ..., 2.3752e-05, 6.6757e-05, 1.3839e-06], ..., [-2.6846e-04, -1.1012e-05, -2.6035e-04, ..., -1.2040e-04, -2.2101e-04, -4.0531e-06], [ 4.6521e-05, 7.6964e-06, 4.2558e-05, ..., 2.0042e-05, 2.3887e-05, 1.0040e-06], [ 4.2528e-05, 7.4916e-06, 3.4183e-05, ..., 1.5870e-05, 6.6698e-05, 8.3968e-06]], device='cuda:0') Epoch 109, bias, value: tensor([-0.0083, -0.0094, -0.0054, -0.0142, -0.0097, -0.0027, 0.0100, 0.0071, 0.0175, -0.0034], device='cuda:0'), grad: tensor([ 5.6550e-06, 2.6524e-06, 9.1732e-05, 9.1910e-05, 6.4820e-06, -7.7665e-05, 4.5478e-05, -3.2043e-04, 9.2149e-05, 6.1929e-05], device='cuda:0') 100 0.0001 changing lr epoch 108, time 214.61, cls_loss 0.0066 cls_loss_mapping 0.0074 cls_loss_causal 0.5725 re_mapping 0.0096 re_causal 0.0274 /// teacc 98.72 lr 0.00010000 Epoch 110, weight, value: tensor([[-1.3215e-01, 6.4564e-02, -6.5148e-03, ..., -1.1245e-02, -6.6529e-02, -8.5058e-02], [ 2.5540e-02, 5.4919e-02, -8.6100e-02, ..., -8.2607e-02, -2.3124e-02, 7.3633e-02], [ 5.2793e-02, -7.0152e-02, -7.7243e-02, ..., 4.1386e-05, -3.7448e-02, -2.8473e-02], ..., [ 5.4851e-02, 2.8397e-02, 8.3007e-02, ..., 4.4650e-02, 1.1775e-01, 9.2202e-03], [ 6.6402e-02, -9.7765e-02, -8.7675e-02, ..., -1.1985e-01, -4.5636e-02, 1.0880e-01], [-5.5372e-02, 7.1734e-02, 2.2360e-02, ..., -6.9406e-02, -4.5352e-02, -1.5588e-03]], device='cuda:0'), grad: tensor([[ 1.4342e-07, 1.9372e-05, 1.3970e-07, ..., -1.4510e-06, 1.3970e-07, 3.1199e-07], [ 8.0187e-07, 1.8151e-06, 2.7549e-06, ..., 1.8803e-06, 1.3104e-06, -3.1199e-07], [ 3.4831e-07, 7.8324e-07, 2.4494e-07, ..., -1.0030e-06, -1.0217e-06, 6.0257e-07], ..., [-3.6117e-06, 9.2015e-07, -2.6450e-07, ..., 2.4252e-06, -4.2245e-06, 5.7183e-07], [-1.1541e-05, 1.0552e-06, 3.1050e-06, ..., 2.4587e-06, -4.2655e-06, -1.5870e-05], [ 1.1988e-05, 1.2502e-05, 1.0289e-05, ..., 7.7263e-06, 8.8587e-06, 1.4290e-05]], device='cuda:0') Epoch 110, bias, value: tensor([-0.0094, -0.0106, -0.0052, -0.0144, -0.0100, -0.0015, 0.0104, 0.0078, 0.0178, -0.0038], device='cuda:0'), grad: tensor([ 6.9141e-05, 6.2808e-06, 1.4603e-06, -3.7789e-05, 4.2439e-05, 3.2395e-05, -1.4460e-04, 1.9027e-06, -2.1368e-05, 5.0336e-05], device='cuda:0') 100 0.0001 changing lr epoch 109, time 214.36, cls_loss 0.0054 cls_loss_mapping 0.0080 cls_loss_causal 0.5582 re_mapping 0.0092 re_causal 0.0266 /// teacc 98.76 lr 0.00010000 Epoch 111, weight, value: tensor([[-0.1331, 0.0642, -0.0069, ..., -0.0106, -0.0687, -0.0857], [ 0.0256, 0.0546, -0.0864, ..., -0.0826, -0.0229, 0.0739], [ 0.0528, -0.0707, -0.0776, ..., -0.0003, -0.0375, -0.0287], ..., [ 0.0550, 0.0278, 0.0831, ..., 0.0448, 0.1179, 0.0091], [ 0.0665, -0.0984, -0.0883, ..., -0.1208, -0.0460, 0.1092], [-0.0555, 0.0719, 0.0226, ..., -0.0703, -0.0458, -0.0023]], device='cuda:0'), grad: tensor([[ 2.3842e-07, -8.9593e-07, 5.0943e-07, ..., 3.8836e-07, 5.7090e-07, 1.7043e-07], [-1.2508e-06, -4.4033e-06, 4.1071e-07, ..., 3.9395e-07, -4.2878e-06, -1.0870e-05], [ 2.3395e-06, 3.4943e-06, 1.6978e-06, ..., 1.2107e-06, 5.3942e-06, 8.6799e-06], ..., [-7.7248e-05, -4.1366e-05, -1.8632e-04, ..., -3.6240e-05, -1.5855e-04, 1.1921e-06], [-2.6729e-06, -1.9632e-06, 2.5891e-07, ..., -5.1875e-07, 3.6508e-07, -5.1260e-06], [ 6.6400e-05, 4.1872e-05, 1.6725e-04, ..., 2.1443e-05, 1.3733e-04, 8.5682e-07]], device='cuda:0') Epoch 111, bias, value: tensor([-0.0100, -0.0107, -0.0054, -0.0144, -0.0099, -0.0013, 0.0105, 0.0084, 0.0174, -0.0039], device='cuda:0'), grad: tensor([ 2.4866e-07, -1.6987e-05, 1.6823e-05, 1.7434e-05, 3.0696e-06, 8.1956e-06, -2.9206e-06, -2.2268e-04, -6.3032e-06, 2.0289e-04], device='cuda:0') 100 0.0001 changing lr epoch 110, time 214.95, cls_loss 0.0049 cls_loss_mapping 0.0083 cls_loss_causal 0.5674 re_mapping 0.0097 re_causal 0.0276 /// teacc 98.90 lr 0.00010000 Epoch 112, weight, value: tensor([[-0.1340, 0.0644, -0.0079, ..., -0.0113, -0.0694, -0.0863], [ 0.0256, 0.0552, -0.0867, ..., -0.0828, -0.0227, 0.0743], [ 0.0535, -0.0712, -0.0779, ..., 0.0002, -0.0372, -0.0288], ..., [ 0.0548, 0.0276, 0.0833, ..., 0.0447, 0.1179, 0.0088], [ 0.0667, -0.0986, -0.0888, ..., -0.1218, -0.0464, 0.1100], [-0.0555, 0.0716, 0.0228, ..., -0.0716, -0.0458, -0.0027]], device='cuda:0'), grad: tensor([[ 4.9360e-08, -4.0643e-06, 1.5926e-07, ..., -1.5013e-06, 5.1223e-08, 2.6077e-08], [-6.2399e-08, -4.9360e-08, 5.4110e-07, ..., 7.3016e-07, 6.7055e-08, -9.7603e-07], [ 9.4064e-08, 9.6206e-07, 9.2108e-07, ..., -3.9116e-08, -3.4552e-07, 2.0210e-07], ..., [-1.8720e-07, 6.0536e-07, 2.6915e-07, ..., 9.4343e-07, -3.3248e-07, 2.7660e-07], [ 5.4017e-08, 1.1669e-06, 5.7463e-07, ..., 1.0263e-06, 3.5390e-08, 4.9360e-08], [ 3.0827e-07, 2.5164e-06, 9.6206e-07, ..., 1.3663e-06, 1.7509e-07, 2.2445e-07]], device='cuda:0') Epoch 112, bias, value: tensor([-0.0099, -0.0103, -0.0052, -0.0138, -0.0097, -0.0018, 0.0111, 0.0081, 0.0174, -0.0043], device='cuda:0'), grad: tensor([-4.2096e-06, 3.6228e-07, 2.2389e-06, -5.0217e-06, 1.6168e-06, 2.6435e-05, -4.1962e-05, 2.2165e-06, 9.4846e-06, 8.7619e-06], device='cuda:0') 100 0.0001 changing lr epoch 111, time 214.83, cls_loss 0.0041 cls_loss_mapping 0.0068 cls_loss_causal 0.5715 re_mapping 0.0088 re_causal 0.0273 /// teacc 98.81 lr 0.00010000 Epoch 113, weight, value: tensor([[-1.3488e-01, 6.4776e-02, -8.3121e-03, ..., -1.0928e-02, -7.0167e-02, -8.6715e-02], [ 2.5152e-02, 5.5399e-02, -8.6895e-02, ..., -8.3144e-02, -2.2846e-02, 7.4103e-02], [ 5.3771e-02, -7.1466e-02, -7.8507e-02, ..., 7.1163e-05, -3.7105e-02, -2.8292e-02], ..., [ 5.4838e-02, 2.7564e-02, 8.3689e-02, ..., 4.5115e-02, 1.1820e-01, 8.5024e-03], [ 6.7641e-02, -9.9024e-02, -8.9214e-02, ..., -1.2225e-01, -4.6433e-02, 1.1102e-01], [-5.5626e-02, 7.1637e-02, 2.3037e-02, ..., -7.2606e-02, -4.5843e-02, -2.9646e-03]], device='cuda:0'), grad: tensor([[ 7.2550e-07, -1.4079e-04, 4.3865e-07, ..., 6.2585e-07, 5.6345e-07, 2.7604e-06], [ 2.3559e-05, 4.5914e-07, 1.0245e-06, ..., 5.5321e-07, 2.5973e-05, -3.2485e-06], [-3.4571e-05, 1.4063e-06, 9.5367e-07, ..., -1.5637e-06, -3.5763e-05, 1.2852e-07], ..., [-4.6603e-06, 8.6892e-07, -1.3702e-05, ..., -5.5395e-06, -5.3607e-06, 9.7975e-07], [ 1.5022e-06, 4.0047e-06, 1.1213e-06, ..., 6.9663e-07, 1.3039e-06, 1.0710e-06], [ 5.8040e-06, 1.4222e-04, 1.0118e-05, ..., 4.2282e-06, 6.3106e-06, 1.5963e-06]], device='cuda:0') Epoch 113, bias, value: tensor([-0.0096, -0.0109, -0.0047, -0.0134, -0.0101, -0.0019, 0.0114, 0.0079, 0.0177, -0.0044], device='cuda:0'), grad: tensor([-1.9121e-04, 9.1374e-05, -1.3173e-04, 1.6719e-05, -4.0144e-05, -5.8766e-07, -1.7717e-05, 1.6158e-06, 1.9297e-05, 2.5177e-04], device='cuda:0') 100 0.0001 changing lr epoch 112, time 214.88, cls_loss 0.0063 cls_loss_mapping 0.0076 cls_loss_causal 0.5927 re_mapping 0.0093 re_causal 0.0269 /// teacc 98.89 lr 0.00010000 Epoch 114, weight, value: tensor([[-0.1357, 0.0648, -0.0085, ..., -0.0112, -0.0713, -0.0869], [ 0.0282, 0.0548, -0.0873, ..., -0.0790, -0.0197, 0.0743], [ 0.0503, -0.0712, -0.0791, ..., -0.0021, -0.0407, -0.0287], ..., [ 0.0551, 0.0275, 0.0842, ..., 0.0454, 0.1185, 0.0085], [ 0.0681, -0.0994, -0.0895, ..., -0.1229, -0.0468, 0.1118], [-0.0560, 0.0717, 0.0228, ..., -0.0736, -0.0461, -0.0034]], device='cuda:0'), grad: tensor([[ 5.2340e-07, -7.4841e-06, 2.4494e-06, ..., 1.1055e-06, 2.2817e-07, 5.0850e-07], [ 2.2221e-06, -1.3784e-07, 1.7164e-06, ..., 1.2154e-06, 2.7902e-06, -4.8149e-07], [-4.4145e-06, 9.8255e-07, 4.7050e-06, ..., 7.2643e-08, -6.9253e-06, 7.3574e-07], ..., [ 3.9116e-07, 1.5516e-06, 1.0513e-05, ..., 5.5134e-06, 9.1083e-07, 7.3481e-07], [-7.1079e-06, 4.0568e-06, 2.7776e-05, ..., 1.2122e-05, 5.1316e-07, -1.0759e-05], [ 2.9132e-06, 2.5824e-05, 2.6631e-04, ..., 1.1253e-04, 8.9779e-07, 4.4703e-06]], device='cuda:0') Epoch 114, bias, value: tensor([-0.0099, -0.0083, -0.0072, -0.0139, -0.0099, -0.0016, 0.0113, 0.0080, 0.0177, -0.0045], device='cuda:0'), grad: tensor([-1.3210e-05, 5.0589e-06, -3.7663e-06, -3.5930e-04, 1.0822e-06, 2.5481e-05, 6.7316e-06, 1.6645e-05, 2.0415e-05, 3.0112e-04], device='cuda:0') 100 0.0001 changing lr epoch 113, time 214.89, cls_loss 0.0062 cls_loss_mapping 0.0073 cls_loss_causal 0.5676 re_mapping 0.0088 re_causal 0.0270 /// teacc 98.84 lr 0.00010000 Epoch 115, weight, value: tensor([[-0.1388, 0.0652, -0.0088, ..., -0.0113, -0.0721, -0.0897], [ 0.0309, 0.0545, -0.0859, ..., -0.0748, -0.0177, 0.0764], [ 0.0474, -0.0710, -0.0815, ..., -0.0054, -0.0428, -0.0317], ..., [ 0.0555, 0.0274, 0.0848, ..., 0.0456, 0.1191, 0.0084], [ 0.0692, -0.0990, -0.0898, ..., -0.1235, -0.0473, 0.1134], [-0.0564, 0.0719, 0.0227, ..., -0.0743, -0.0465, -0.0037]], device='cuda:0'), grad: tensor([[ 2.7474e-07, 6.9197e-07, 7.6741e-07, ..., 1.6112e-07, 3.1292e-07, 1.5926e-07], [ 3.8370e-07, 3.8091e-07, 6.5099e-07, ..., 4.8243e-07, 5.1130e-07, 2.2817e-07], [ 1.3039e-08, 2.9150e-07, 1.8906e-07, ..., -6.6124e-07, -6.3330e-08, 3.0920e-07], ..., [-7.0110e-06, 5.0701e-06, -1.2718e-05, ..., -2.0377e-06, -1.3858e-05, 6.4224e-06], [ 7.5903e-07, 1.8738e-06, 2.0266e-06, ..., 1.4799e-06, 5.5693e-07, 6.4448e-07], [ 4.8615e-06, -1.2584e-05, -4.8839e-06, ..., 2.5388e-06, 1.1258e-05, 1.5730e-06]], device='cuda:0') Epoch 115, bias, value: tensor([-0.0101, -0.0056, -0.0097, -0.0145, -0.0099, -0.0016, 0.0108, 0.0082, 0.0185, -0.0046], device='cuda:0'), grad: tensor([ 1.7257e-06, 1.6615e-06, -8.3353e-07, -2.1420e-07, 3.6173e-06, 2.4997e-06, 1.2564e-06, -8.1062e-06, 6.3628e-06, -7.9349e-06], device='cuda:0') 100 0.0001 changing lr epoch 114, time 214.63, cls_loss 0.0044 cls_loss_mapping 0.0074 cls_loss_causal 0.5560 re_mapping 0.0085 re_causal 0.0258 /// teacc 98.91 lr 0.00010000 Epoch 116, weight, value: tensor([[-0.1393, 0.0652, -0.0093, ..., -0.0115, -0.0740, -0.0900], [ 0.0307, 0.0549, -0.0867, ..., -0.0751, -0.0182, 0.0768], [ 0.0472, -0.0714, -0.0818, ..., -0.0058, -0.0434, -0.0318], ..., [ 0.0569, 0.0274, 0.0856, ..., 0.0472, 0.1210, 0.0085], [ 0.0699, -0.0991, -0.0907, ..., -0.1244, -0.0480, 0.1144], [-0.0570, 0.0720, 0.0227, ..., -0.0760, -0.0470, -0.0043]], device='cuda:0'), grad: tensor([[ 5.3197e-06, -2.9316e-03, 7.3984e-06, ..., -5.3365e-07, -4.9025e-05, 2.6524e-06], [ 6.6280e-05, 4.8280e-06, 8.2627e-06, ..., 5.5917e-06, 1.4931e-05, 1.1736e-04], [-1.5900e-05, 1.0319e-05, 6.5826e-06, ..., -9.0003e-06, -3.9846e-05, 7.5847e-06], ..., [-2.0221e-05, 5.3905e-06, -2.0266e-06, ..., -1.7896e-05, -2.7701e-05, -2.2128e-06], [-9.8765e-05, 2.9728e-06, 1.0937e-05, ..., -3.2037e-06, 3.5465e-06, -2.1231e-04], [ 3.9488e-05, 2.8419e-03, 4.0114e-05, ..., 2.1458e-05, 7.9036e-05, 3.0428e-05]], device='cuda:0') Epoch 116, bias, value: tensor([-0.0103, -0.0058, -0.0098, -0.0142, -0.0101, -0.0020, 0.0106, 0.0092, 0.0187, -0.0050], device='cuda:0'), grad: tensor([-4.9896e-03, 1.9252e-04, 4.6879e-05, 3.2544e-04, 1.2362e-04, -1.3437e-03, 7.3016e-05, 4.9973e-04, -8.3029e-05, 5.1575e-03], device='cuda:0') 100 0.0001 changing lr epoch 115, time 214.72, cls_loss 0.0058 cls_loss_mapping 0.0062 cls_loss_causal 0.5531 re_mapping 0.0089 re_causal 0.0248 /// teacc 98.77 lr 0.00010000 Epoch 117, weight, value: tensor([[-0.1403, 0.0662, -0.0098, ..., -0.0116, -0.0759, -0.0903], [ 0.0309, 0.0538, -0.0868, ..., -0.0749, -0.0179, 0.0771], [ 0.0469, -0.0701, -0.0824, ..., -0.0059, -0.0438, -0.0324], ..., [ 0.0575, 0.0271, 0.0860, ..., 0.0473, 0.1216, 0.0088], [ 0.0702, -0.0996, -0.0917, ..., -0.1253, -0.0484, 0.1150], [-0.0572, 0.0724, 0.0233, ..., -0.0767, -0.0476, -0.0036]], device='cuda:0'), grad: tensor([[ 6.5286e-07, -1.6436e-05, 5.1223e-07, ..., 2.4028e-07, 4.0606e-07, -1.7695e-08], [ 1.5378e-05, 1.0449e-06, 1.0647e-05, ..., 5.6960e-06, 7.5884e-06, 4.0606e-06], [ 6.6012e-06, 1.2582e-06, 5.9530e-06, ..., -4.2468e-07, 1.5823e-06, 1.6866e-06], ..., [-1.6421e-05, 1.8151e-06, -2.4468e-05, ..., -1.0036e-05, -1.6600e-05, 1.2092e-05], [-2.4080e-05, 2.3898e-06, 2.2780e-06, ..., 1.3085e-06, 1.1632e-06, -2.9862e-05], [ 3.5372e-06, 1.7043e-07, -1.9185e-06, ..., 8.8383e-07, 1.6028e-06, 2.0899e-06]], device='cuda:0') Epoch 117, bias, value: tensor([-0.0096, -0.0059, -0.0094, -0.0146, -0.0113, -0.0016, 0.0100, 0.0093, 0.0184, -0.0046], device='cuda:0'), grad: tensor([-4.1783e-05, 2.7061e-05, 7.5176e-06, 7.4804e-06, 7.3835e-06, 1.2204e-05, 2.0504e-05, -2.0280e-05, -2.5511e-05, 5.2787e-06], device='cuda:0') 100 0.0001 changing lr epoch 116, time 214.80, cls_loss 0.0048 cls_loss_mapping 0.0063 cls_loss_causal 0.5858 re_mapping 0.0089 re_causal 0.0266 /// teacc 98.90 lr 0.00010000 Epoch 118, weight, value: tensor([[-0.1412, 0.0664, -0.0105, ..., -0.0127, -0.0768, -0.0908], [ 0.0310, 0.0554, -0.0864, ..., -0.0750, -0.0179, 0.0780], [ 0.0471, -0.0702, -0.0823, ..., -0.0057, -0.0435, -0.0326], ..., [ 0.0577, 0.0255, 0.0862, ..., 0.0472, 0.1217, 0.0081], [ 0.0710, -0.0998, -0.0927, ..., -0.1261, -0.0491, 0.1160], [-0.0578, 0.0715, 0.0228, ..., -0.0774, -0.0481, -0.0052]], device='cuda:0'), grad: tensor([[ 3.8296e-06, -3.2969e-07, 1.7723e-06, ..., 1.5832e-07, 4.0680e-06, 1.6391e-06], [ 4.1466e-03, 1.0179e-06, 1.8196e-03, ..., 4.6901e-06, 4.3755e-03, 1.6356e-03], [ 9.2015e-06, 3.9004e-06, 1.6987e-05, ..., -5.7444e-06, 1.2912e-05, 2.0012e-05], ..., [-4.2801e-03, 9.1493e-06, -1.8911e-03, ..., 5.7649e-07, -4.5166e-03, -1.6918e-03], [ 2.0400e-05, 7.1339e-06, 9.4995e-06, ..., 1.3532e-06, 2.0012e-05, 1.0215e-05], [ 1.4693e-05, 1.2852e-05, 4.9435e-06, ..., 4.1164e-07, 1.6332e-05, 2.3976e-05]], device='cuda:0') Epoch 118, bias, value: tensor([-0.0099, -0.0056, -0.0092, -0.0140, -0.0110, -0.0020, 0.0096, 0.0094, 0.0186, -0.0054], device='cuda:0'), grad: tensor([ 6.7502e-06, 7.0381e-03, 3.3289e-05, 1.5926e-04, -7.4208e-05, -2.8566e-05, 4.6100e-07, -7.2556e-03, 5.0962e-05, 6.8367e-05], device='cuda:0') 100 0.0001 changing lr epoch 117, time 214.99, cls_loss 0.0034 cls_loss_mapping 0.0078 cls_loss_causal 0.5768 re_mapping 0.0091 re_causal 0.0257 /// teacc 98.88 lr 0.00010000 Epoch 119, weight, value: tensor([[-0.1416, 0.0666, -0.0108, ..., -0.0129, -0.0771, -0.0911], [ 0.0304, 0.0554, -0.0874, ..., -0.0751, -0.0184, 0.0777], [ 0.0475, -0.0702, -0.0819, ..., -0.0053, -0.0431, -0.0327], ..., [ 0.0582, 0.0251, 0.0866, ..., 0.0471, 0.1222, 0.0084], [ 0.0719, -0.0999, -0.0931, ..., -0.1266, -0.0495, 0.1175], [-0.0580, 0.0712, 0.0231, ..., -0.0778, -0.0482, -0.0058]], device='cuda:0'), grad: tensor([[ 9.2834e-06, 4.1388e-06, 2.2035e-06, ..., 6.9849e-07, 1.4771e-06, 1.2524e-05], [-1.2422e-04, -1.1867e-04, 1.7006e-06, ..., 1.7341e-06, -4.6521e-05, -2.1970e-04], [ 4.9993e-06, 3.1777e-06, -7.1432e-07, ..., -6.0722e-07, -5.4762e-06, 4.7088e-05], ..., [ 2.1726e-05, 1.7092e-05, -1.2917e-06, ..., -7.9628e-07, 8.7544e-06, 5.2631e-05], [-2.1309e-05, 7.1347e-05, 1.1176e-06, ..., -4.3474e-06, 3.2514e-05, 5.0038e-05], [ 1.5721e-05, 8.6799e-06, -6.2846e-06, ..., -1.8254e-07, 4.9882e-06, 3.5375e-05]], device='cuda:0') Epoch 119, bias, value: tensor([-0.0099, -0.0062, -0.0088, -0.0144, -0.0107, -0.0018, 0.0097, 0.0096, 0.0193, -0.0058], device='cuda:0'), grad: tensor([ 3.1412e-05, -3.4285e-04, 1.8847e-04, 9.9838e-05, -4.0960e-04, 2.8938e-05, 1.1617e-04, 1.5664e-04, 4.6968e-05, 8.3923e-05], device='cuda:0') 100 0.0001 changing lr epoch 118, time 214.55, cls_loss 0.0035 cls_loss_mapping 0.0060 cls_loss_causal 0.5798 re_mapping 0.0087 re_causal 0.0265 /// teacc 98.91 lr 0.00010000 Epoch 120, weight, value: tensor([[-0.1421, 0.0665, -0.0110, ..., -0.0133, -0.0781, -0.0915], [ 0.0302, 0.0557, -0.0878, ..., -0.0752, -0.0188, 0.0779], [ 0.0486, -0.0703, -0.0809, ..., -0.0049, -0.0422, -0.0321], ..., [ 0.0580, 0.0251, 0.0867, ..., 0.0471, 0.1223, 0.0084], [ 0.0708, -0.1003, -0.0934, ..., -0.1270, -0.0503, 0.1172], [-0.0589, 0.0711, 0.0231, ..., -0.0786, -0.0488, -0.0063]], device='cuda:0'), grad: tensor([[ 1.1986e-06, 3.4608e-06, 4.4852e-06, ..., 2.3469e-07, 5.0571e-07, 7.2233e-06], [-3.1203e-05, 3.0136e-04, 3.5310e-04, ..., 1.7006e-06, -2.2613e-06, 3.8171e-04], [ 6.6347e-06, 1.4022e-05, 1.4544e-05, ..., -1.5814e-06, -2.8759e-06, 2.7448e-05], ..., [ 1.5739e-07, 1.4372e-05, 6.9179e-06, ..., -2.5854e-06, -2.6207e-06, 2.2456e-05], [ 1.7196e-05, 5.1796e-05, 2.6733e-05, ..., -5.2713e-07, 3.9227e-06, 7.3612e-05], [ 3.0790e-06, -9.1457e-04, -9.6369e-04, ..., 6.8359e-07, 1.0375e-06, -1.2274e-03]], device='cuda:0') Epoch 120, bias, value: tensor([-0.0100, -0.0065, -0.0078, -0.0145, -0.0110, -0.0016, 0.0096, 0.0096, 0.0183, -0.0061], device='cuda:0'), grad: tensor([ 1.6421e-05, 1.1120e-03, 5.6505e-05, 1.6394e-03, 4.1455e-05, 2.1982e-04, 2.4438e-06, 4.4674e-05, 1.5497e-04, -3.2864e-03], device='cuda:0') 100 0.0001 changing lr epoch 119, time 214.90, cls_loss 0.0036 cls_loss_mapping 0.0054 cls_loss_causal 0.5874 re_mapping 0.0083 re_causal 0.0261 /// teacc 98.84 lr 0.00010000 Epoch 121, weight, value: tensor([[-0.1424, 0.0664, -0.0113, ..., -0.0130, -0.0791, -0.0917], [ 0.0304, 0.0558, -0.0875, ..., -0.0753, -0.0186, 0.0784], [ 0.0488, -0.0706, -0.0809, ..., -0.0049, -0.0421, -0.0323], ..., [ 0.0574, 0.0247, 0.0864, ..., 0.0472, 0.1221, 0.0075], [ 0.0716, -0.1009, -0.0931, ..., -0.1273, -0.0496, 0.1182], [-0.0593, 0.0717, 0.0234, ..., -0.0790, -0.0493, -0.0063]], device='cuda:0'), grad: tensor([[ 1.1791e-06, -2.0996e-05, 1.5264e-06, ..., -3.1106e-07, 1.0580e-06, 4.2561e-07], [ 1.1817e-05, 5.8673e-07, 1.7300e-05, ..., 2.6394e-06, 1.2539e-05, 2.9057e-06], [ 6.3144e-06, 1.4156e-06, 8.1658e-06, ..., 1.0012e-06, 5.6587e-06, 1.9316e-06], ..., [-1.0002e-04, 9.0003e-06, -1.5390e-04, ..., -2.2918e-05, -8.7440e-05, 3.2246e-05], [ 3.4925e-06, 1.7947e-06, 8.2776e-06, ..., 1.4817e-06, 6.2175e-06, -2.7474e-07], [ 6.7294e-05, 9.1121e-06, 8.4817e-05, ..., 1.3545e-05, 7.3254e-05, 1.4916e-05]], device='cuda:0') Epoch 121, bias, value: tensor([-0.0101, -0.0062, -0.0078, -0.0148, -0.0110, -0.0022, 0.0104, 0.0089, 0.0183, -0.0056], device='cuda:0'), grad: tensor([-3.6329e-05, 3.0279e-05, 2.1949e-05, 4.1425e-05, -2.1756e-04, 2.8592e-07, -1.3672e-05, -5.5507e-06, 2.5049e-05, 1.5402e-04], device='cuda:0') 100 0.0001 changing lr epoch 120, time 214.78, cls_loss 0.0040 cls_loss_mapping 0.0066 cls_loss_causal 0.5680 re_mapping 0.0084 re_causal 0.0251 /// teacc 98.87 lr 0.00010000 Epoch 122, weight, value: tensor([[-0.1430, 0.0675, -0.0117, ..., -0.0131, -0.0799, -0.0919], [ 0.0305, 0.0556, -0.0872, ..., -0.0754, -0.0185, 0.0787], [ 0.0491, -0.0706, -0.0808, ..., -0.0044, -0.0419, -0.0325], ..., [ 0.0573, 0.0248, 0.0869, ..., 0.0472, 0.1224, 0.0074], [ 0.0719, -0.1021, -0.0939, ..., -0.1280, -0.0498, 0.1193], [-0.0602, 0.0725, 0.0233, ..., -0.0797, -0.0500, -0.0063]], device='cuda:0'), grad: tensor([[ 8.5123e-07, 5.7369e-07, 1.0617e-06, ..., 6.3330e-07, 1.0468e-06, 3.0920e-07], [ 1.3877e-07, -1.5553e-07, 2.0266e-06, ..., 1.6941e-06, 9.1549e-07, -4.0829e-06], [-4.9453e-07, 3.1330e-06, 3.1721e-06, ..., -1.0217e-06, -1.2694e-06, 2.3693e-06], ..., [-1.0714e-05, -6.8061e-06, -1.5825e-05, ..., -9.6560e-06, -1.5363e-05, -1.0254e-06], [-2.7996e-06, 3.5129e-06, 2.8554e-06, ..., 1.5870e-06, 2.0396e-06, -5.0478e-06], [ 5.3309e-06, -3.1721e-06, -3.8929e-07, ..., 4.9621e-06, 6.8061e-06, 1.3476e-06]], device='cuda:0') Epoch 122, bias, value: tensor([-0.0089, -0.0061, -0.0074, -0.0154, -0.0116, -0.0022, 0.0095, 0.0090, 0.0180, -0.0054], device='cuda:0'), grad: tensor([ 5.2489e-06, -2.2911e-06, 1.0937e-05, 4.9680e-05, 2.4974e-05, -4.4852e-05, -6.0461e-06, -2.2307e-05, 7.2047e-06, -2.2575e-05], device='cuda:0') 100 0.0001 changing lr epoch 121, time 214.61, cls_loss 0.0041 cls_loss_mapping 0.0065 cls_loss_causal 0.5779 re_mapping 0.0091 re_causal 0.0244 /// teacc 98.89 lr 0.00010000 Epoch 123, weight, value: tensor([[-0.1437, 0.0677, -0.0122, ..., -0.0132, -0.0809, -0.0924], [ 0.0303, 0.0563, -0.0876, ..., -0.0755, -0.0187, 0.0791], [ 0.0494, -0.0709, -0.0806, ..., -0.0041, -0.0415, -0.0326], ..., [ 0.0572, 0.0240, 0.0869, ..., 0.0467, 0.1224, 0.0068], [ 0.0733, -0.1028, -0.0935, ..., -0.1292, -0.0487, 0.1210], [-0.0607, 0.0728, 0.0237, ..., -0.0806, -0.0505, -0.0071]], device='cuda:0'), grad: tensor([[ 1.3784e-07, -2.2165e-06, 1.3318e-07, ..., -1.6158e-06, 1.9651e-07, 2.0210e-07], [-5.0291e-07, 1.1567e-06, 3.0641e-07, ..., 3.7067e-07, 4.4052e-07, -2.4829e-06], [ 7.9162e-07, 5.7928e-07, 1.5544e-06, ..., 7.6741e-07, 6.5379e-07, 7.4785e-07], ..., [-1.5339e-06, 9.3877e-07, -2.1476e-06, ..., -1.1865e-06, -2.5313e-06, 2.0545e-06], [ 1.6019e-07, 1.0496e-06, 1.9278e-07, ..., 8.0187e-07, 1.4435e-07, 7.9256e-07], [ 1.2945e-07, 1.0077e-06, -7.4506e-08, ..., 4.6100e-07, 1.2573e-07, 7.3668e-07]], device='cuda:0') Epoch 123, bias, value: tensor([-0.0091, -0.0063, -0.0071, -0.0155, -0.0118, -0.0022, 0.0103, 0.0086, 0.0183, -0.0055], device='cuda:0'), grad: tensor([-4.4927e-06, 1.6857e-07, 2.6636e-06, 1.3672e-06, -1.0036e-05, -1.1828e-06, 1.1194e-06, 8.2422e-07, 6.6161e-06, 2.9169e-06], device='cuda:0') 100 0.0001 changing lr epoch 122, time 214.63, cls_loss 0.0044 cls_loss_mapping 0.0070 cls_loss_causal 0.5524 re_mapping 0.0088 re_causal 0.0257 /// teacc 98.85 lr 0.00010000 Epoch 124, weight, value: tensor([[-0.1442, 0.0682, -0.0125, ..., -0.0124, -0.0809, -0.0922], [ 0.0304, 0.0566, -0.0876, ..., -0.0762, -0.0189, 0.0797], [ 0.0494, -0.0713, -0.0815, ..., -0.0043, -0.0418, -0.0326], ..., [ 0.0572, 0.0232, 0.0873, ..., 0.0482, 0.1233, 0.0057], [ 0.0737, -0.1036, -0.0941, ..., -0.1302, -0.0491, 0.1217], [-0.0610, 0.0722, 0.0236, ..., -0.0821, -0.0507, -0.0078]], device='cuda:0'), grad: tensor([[ 4.7497e-07, 6.3106e-06, 1.1735e-06, ..., 7.8138e-07, 3.8743e-07, 8.8811e-06], [ 1.2722e-06, 2.5984e-06, 2.8033e-06, ..., 1.2089e-06, 1.8580e-06, 2.0266e-06], [ 3.5614e-06, 7.2643e-06, 8.5905e-06, ..., 8.2627e-06, 2.6617e-06, 1.1161e-05], ..., [-2.4959e-06, 3.8482e-06, 5.6624e-07, ..., 3.7719e-06, -3.8408e-06, 4.7237e-06], [ 4.1686e-06, 4.7460e-06, 9.3728e-06, ..., 8.3223e-06, 3.4198e-06, 7.3463e-06], [ 2.0079e-06, -1.4752e-05, -6.2771e-06, ..., 1.1409e-06, 1.7928e-06, -6.9030e-06]], device='cuda:0') Epoch 124, bias, value: tensor([-0.0083, -0.0063, -0.0071, -0.0147, -0.0113, -0.0024, 0.0111, 0.0082, 0.0178, -0.0063], device='cuda:0'), grad: tensor([ 3.8207e-05, 1.1228e-05, 3.9220e-05, -3.5614e-05, -1.0973e-04, 4.0308e-06, 2.8774e-05, 1.4007e-05, 1.9968e-05, -1.0207e-05], device='cuda:0') 100 0.0001 changing lr epoch 123, time 214.54, cls_loss 0.0051 cls_loss_mapping 0.0078 cls_loss_causal 0.5790 re_mapping 0.0088 re_causal 0.0261 /// teacc 98.81 lr 0.00010000 Epoch 125, weight, value: tensor([[-0.1448, 0.0668, -0.0127, ..., -0.0142, -0.0814, -0.0930], [ 0.0301, 0.0566, -0.0881, ..., -0.0765, -0.0192, 0.0795], [ 0.0497, -0.0718, -0.0815, ..., -0.0043, -0.0415, -0.0325], ..., [ 0.0571, 0.0229, 0.0868, ..., 0.0475, 0.1237, 0.0053], [ 0.0738, -0.1044, -0.0953, ..., -0.1314, -0.0495, 0.1220], [-0.0617, 0.0726, 0.0224, ..., -0.0808, -0.0515, -0.0090]], device='cuda:0'), grad: tensor([[ 3.5949e-07, -1.1474e-06, 7.1898e-07, ..., 2.8312e-07, 7.0129e-07, -4.3772e-08], [ 3.1088e-06, 4.4797e-07, 5.8636e-06, ..., 3.7756e-06, 6.0014e-06, -1.0738e-06], [ 6.7167e-06, 3.8557e-07, 1.3053e-05, ..., 7.9572e-06, 1.3813e-05, 2.5705e-07], ..., [-2.8536e-05, 2.8219e-07, -5.5075e-05, ..., -3.4302e-05, -5.5045e-05, 2.3376e-07], [-1.6391e-07, 4.4145e-06, 7.3574e-07, ..., 1.0533e-06, 5.0757e-07, -1.6298e-07], [ 1.2899e-06, 5.6159e-07, 2.5947e-06, ..., 1.9129e-06, 2.2836e-06, 4.6939e-07]], device='cuda:0') Epoch 125, bias, value: tensor([-0.0104, -0.0067, -0.0070, -0.0166, -0.0093, 0.0005, 0.0108, 0.0078, 0.0173, -0.0058], device='cuda:0'), grad: tensor([-1.6857e-07, 8.6501e-06, 1.6987e-05, 5.6267e-05, 2.3320e-06, -9.1255e-05, 5.0902e-05, -6.6400e-05, 1.8269e-05, 4.4331e-06], device='cuda:0') 100 0.0001 changing lr epoch 124, time 214.41, cls_loss 0.0038 cls_loss_mapping 0.0062 cls_loss_causal 0.5496 re_mapping 0.0084 re_causal 0.0243 /// teacc 98.93 lr 0.00010000 Epoch 126, weight, value: tensor([[-0.1453, 0.0663, -0.0137, ..., -0.0149, -0.0820, -0.0931], [ 0.0300, 0.0568, -0.0889, ..., -0.0766, -0.0194, 0.0796], [ 0.0496, -0.0722, -0.0819, ..., -0.0043, -0.0415, -0.0328], ..., [ 0.0575, 0.0223, 0.0873, ..., 0.0477, 0.1241, 0.0052], [ 0.0746, -0.1055, -0.0966, ..., -0.1333, -0.0497, 0.1229], [-0.0617, 0.0739, 0.0232, ..., -0.0816, -0.0517, -0.0083]], device='cuda:0'), grad: tensor([[ 6.3796e-07, 7.3798e-06, 1.0636e-06, ..., 3.1292e-07, 6.5099e-07, 1.3504e-06], [ 3.8892e-06, -6.8806e-06, 4.0792e-06, ..., 2.4810e-06, 4.1015e-06, -2.8223e-05], [ 8.8364e-06, 3.4254e-06, 1.1049e-05, ..., 6.3404e-06, 1.2197e-05, 9.2313e-06], ..., [-1.9848e-05, 2.5760e-06, -2.7820e-05, ..., -1.5602e-05, -3.0905e-05, 2.8778e-06], [-3.0845e-05, 8.9929e-06, 1.1930e-06, ..., 8.4005e-07, 7.4878e-07, -3.1829e-05], [ 1.4575e-06, 2.5183e-05, 5.8860e-07, ..., 1.7462e-06, 1.7378e-06, 1.2016e-04]], device='cuda:0') Epoch 126, bias, value: tensor([-1.0898e-02, -6.7702e-03, -7.0332e-03, -1.5952e-02, -9.9472e-03, -5.1618e-05, 1.0435e-02, 7.8232e-03, 1.6963e-02, -4.7845e-03], device='cuda:0'), grad: tensor([ 4.7743e-05, -4.7207e-05, 3.4213e-05, 2.7090e-05, -3.4380e-04, 6.2823e-05, 7.0572e-05, -2.4021e-05, -2.0549e-05, 1.9348e-04], device='cuda:0') 100 0.0001 changing lr epoch 125, time 214.35, cls_loss 0.0042 cls_loss_mapping 0.0068 cls_loss_causal 0.5487 re_mapping 0.0082 re_causal 0.0238 /// teacc 98.87 lr 0.00010000 Epoch 127, weight, value: tensor([[-0.1461, 0.0661, -0.0139, ..., -0.0148, -0.0828, -0.0941], [ 0.0303, 0.0576, -0.0890, ..., -0.0766, -0.0191, 0.0811], [ 0.0493, -0.0730, -0.0827, ..., -0.0047, -0.0419, -0.0342], ..., [ 0.0576, 0.0214, 0.0876, ..., 0.0477, 0.1244, 0.0046], [ 0.0747, -0.1079, -0.0975, ..., -0.1344, -0.0500, 0.1232], [-0.0620, 0.0749, 0.0234, ..., -0.0826, -0.0519, -0.0085]], device='cuda:0'), grad: tensor([[ 1.3970e-07, -2.5127e-06, 1.5097e-06, ..., 1.7472e-06, 2.4866e-07, 4.6566e-08], [ 5.3830e-07, 8.4098e-07, 2.6524e-06, ..., 2.5835e-06, 1.0682e-06, -7.9162e-08], [-1.9670e-06, 7.3798e-06, 2.5570e-05, ..., 3.0160e-05, -3.5632e-06, 2.1607e-07], ..., [-1.7500e-04, -9.0241e-05, -4.7684e-04, ..., -3.9309e-05, -4.3845e-04, 1.0617e-07], [-1.0096e-06, 1.6615e-06, 2.0005e-06, ..., 2.3842e-06, 3.8929e-07, -1.5590e-06], [ 1.7548e-04, 9.1434e-05, 4.7493e-04, ..., 4.3184e-05, 4.3821e-04, 3.9954e-07]], device='cuda:0') Epoch 127, bias, value: tensor([-0.0110, -0.0063, -0.0076, -0.0147, -0.0102, -0.0010, 0.0109, 0.0076, 0.0155, -0.0040], device='cuda:0'), grad: tensor([-2.6990e-06, 7.5586e-06, 5.0902e-05, -7.5042e-05, 3.8892e-06, 1.6403e-04, -1.6320e-04, -6.8521e-04, 7.7263e-06, 6.9237e-04], device='cuda:0') 100 0.0001 changing lr epoch 126, time 214.78, cls_loss 0.0037 cls_loss_mapping 0.0065 cls_loss_causal 0.5467 re_mapping 0.0085 re_causal 0.0241 /// teacc 98.86 lr 0.00010000 Epoch 128, weight, value: tensor([[-0.1465, 0.0663, -0.0143, ..., -0.0150, -0.0836, -0.0944], [ 0.0300, 0.0577, -0.0894, ..., -0.0767, -0.0192, 0.0809], [ 0.0493, -0.0733, -0.0834, ..., -0.0049, -0.0420, -0.0343], ..., [ 0.0571, 0.0213, 0.0875, ..., 0.0475, 0.1241, 0.0034], [ 0.0775, -0.1084, -0.0948, ..., -0.1327, -0.0474, 0.1256], [-0.0642, 0.0745, 0.0225, ..., -0.0832, -0.0537, -0.0091]], device='cuda:0'), grad: tensor([[ 7.7020e-07, -6.4634e-07, 1.2536e-06, ..., 4.8615e-07, 1.2834e-06, 8.8476e-08], [ 3.3349e-05, 9.6709e-06, 5.4866e-05, ..., 3.4660e-05, 6.7115e-05, -1.1772e-06], [ 6.8583e-06, 2.9504e-06, 1.0230e-05, ..., 1.8338e-06, 1.1399e-05, 5.2992e-07], ..., [-1.7989e-04, -5.6863e-05, -2.7418e-04, ..., -1.3602e-04, -3.3307e-04, -1.5348e-06], [ 2.8890e-06, 1.7453e-06, 8.1360e-06, ..., 4.6454e-06, 9.9540e-06, -2.3302e-06], [ 8.3923e-05, 1.8075e-05, 1.2010e-04, ..., 5.7787e-05, 1.5318e-04, 6.0257e-07]], device='cuda:0') Epoch 128, bias, value: tensor([-0.0110, -0.0065, -0.0078, -0.0147, -0.0099, -0.0010, 0.0109, 0.0077, 0.0175, -0.0052], device='cuda:0'), grad: tensor([ 5.9083e-06, 1.1575e-04, 1.9982e-05, 1.1176e-04, 5.8740e-05, 1.2882e-05, -1.2353e-05, -5.7125e-04, 1.3717e-05, 2.4533e-04], device='cuda:0') 100 0.0001 changing lr epoch 127, time 214.54, cls_loss 0.0034 cls_loss_mapping 0.0055 cls_loss_causal 0.5744 re_mapping 0.0080 re_causal 0.0250 /// teacc 98.73 lr 0.00010000 Epoch 129, weight, value: tensor([[-0.1472, 0.0663, -0.0146, ..., -0.0151, -0.0844, -0.0949], [ 0.0295, 0.0577, -0.0902, ..., -0.0773, -0.0198, 0.0810], [ 0.0493, -0.0734, -0.0839, ..., -0.0049, -0.0422, -0.0343], ..., [ 0.0579, 0.0217, 0.0886, ..., 0.0484, 0.1252, 0.0033], [ 0.0776, -0.1088, -0.0950, ..., -0.1331, -0.0475, 0.1260], [-0.0647, 0.0747, 0.0223, ..., -0.0839, -0.0542, -0.0090]], device='cuda:0'), grad: tensor([[ 2.7008e-07, -1.8343e-05, 1.9092e-07, ..., -8.1137e-06, 2.0489e-08, 7.7393e-07], [-1.1353e-06, -1.7919e-06, 3.7439e-07, ..., 4.6846e-07, -1.8626e-09, -4.1649e-06], [-6.8918e-08, 4.1723e-06, 2.3004e-07, ..., 1.7025e-06, -1.4156e-07, 4.2003e-07], ..., [ 2.5425e-07, 2.6189e-06, 2.5425e-06, ..., 3.8091e-07, 1.9744e-07, 8.6799e-07], [ 6.3330e-08, 2.3916e-06, 1.5534e-06, ..., 8.3819e-07, 6.5193e-08, 4.9174e-07], [ 6.9849e-08, 2.9225e-06, -1.2204e-05, ..., 4.9770e-06, -5.4855e-07, -1.2107e-07]], device='cuda:0') Epoch 129, bias, value: tensor([-0.0113, -0.0069, -0.0078, -0.0150, -0.0098, -0.0009, 0.0114, 0.0084, 0.0173, -0.0052], device='cuda:0'), grad: tensor([-4.7863e-05, -4.5598e-06, 1.0766e-05, 5.3272e-06, 1.4193e-05, -5.5656e-06, 3.3565e-06, 6.4671e-06, 6.1207e-06, 1.1623e-05], device='cuda:0') 100 0.0001 changing lr epoch 128, time 214.67, cls_loss 0.0034 cls_loss_mapping 0.0059 cls_loss_causal 0.5587 re_mapping 0.0076 re_causal 0.0229 /// teacc 98.93 lr 0.00010000 Epoch 130, weight, value: tensor([[-0.1476, 0.0661, -0.0149, ..., -0.0156, -0.0852, -0.0952], [ 0.0282, 0.0575, -0.0910, ..., -0.0774, -0.0210, 0.0799], [ 0.0496, -0.0736, -0.0840, ..., -0.0046, -0.0420, -0.0343], ..., [ 0.0590, 0.0213, 0.0890, ..., 0.0485, 0.1265, 0.0043], [ 0.0777, -0.1093, -0.0953, ..., -0.1338, -0.0477, 0.1264], [-0.0648, 0.0749, 0.0230, ..., -0.0846, -0.0542, -0.0091]], device='cuda:0'), grad: tensor([[ 1.0151e-07, -2.6766e-06, 1.2107e-07, ..., -8.7917e-07, 6.0536e-08, 1.7043e-07], [-4.0419e-07, -1.9614e-06, 9.8720e-08, ..., 9.0338e-08, -8.4843e-07, -4.8392e-06], [ 4.7125e-07, 5.2247e-07, 6.2399e-08, ..., -2.5891e-07, -7.8231e-08, 1.5134e-06], ..., [ 4.6473e-07, 5.8487e-07, -1.4994e-07, ..., -5.4017e-08, 2.5798e-07, 1.3784e-06], [-2.8666e-06, 1.0934e-06, 5.9418e-07, ..., 3.9022e-07, -1.6019e-07, -3.1944e-06], [ 7.1898e-07, 1.1355e-05, 2.3358e-06, ..., 2.9858e-06, -3.1851e-07, 8.5309e-07]], device='cuda:0') Epoch 130, bias, value: tensor([-0.0115, -0.0081, -0.0075, -0.0149, -0.0099, -0.0008, 0.0121, 0.0093, 0.0169, -0.0052], device='cuda:0'), grad: tensor([-4.2394e-06, -6.8806e-06, 1.9725e-06, 1.2748e-05, 2.2538e-06, -7.2241e-05, 4.1127e-05, 2.6040e-06, -1.7900e-06, 2.4483e-05], device='cuda:0') 100 0.0001 changing lr epoch 129, time 214.55, cls_loss 0.0035 cls_loss_mapping 0.0056 cls_loss_causal 0.5515 re_mapping 0.0080 re_causal 0.0236 /// teacc 98.69 lr 0.00010000 Epoch 131, weight, value: tensor([[-0.1480, 0.0664, -0.0151, ..., -0.0151, -0.0861, -0.0955], [ 0.0289, 0.0592, -0.0887, ..., -0.0775, -0.0201, 0.0812], [ 0.0495, -0.0739, -0.0846, ..., -0.0046, -0.0420, -0.0344], ..., [ 0.0588, 0.0195, 0.0888, ..., 0.0494, 0.1262, 0.0030], [ 0.0779, -0.1094, -0.0956, ..., -0.1344, -0.0478, 0.1269], [-0.0654, 0.0751, 0.0227, ..., -0.0858, -0.0551, -0.0086]], device='cuda:0'), grad: tensor([[ 7.3835e-06, 2.0582e-07, 2.7772e-06, ..., 2.3749e-07, 5.6531e-07, 9.1344e-06], [ 3.7923e-06, 1.6261e-06, 3.1590e-06, ..., 9.4622e-07, 1.6177e-06, 3.9674e-06], [ 6.6385e-06, 9.3728e-06, 3.9265e-06, ..., -3.8184e-07, 1.2619e-06, 1.4998e-05], ..., [-5.3570e-06, 3.5930e-06, -7.1451e-06, ..., -2.9132e-06, -6.2995e-06, 5.0813e-06], [-4.9919e-05, 1.3197e-06, -1.2077e-05, ..., 2.7008e-07, 9.8534e-07, -6.1393e-05], [ 3.0696e-05, -1.4830e-04, -2.0409e-04, ..., 6.3796e-07, -4.9680e-05, -7.9274e-05]], device='cuda:0') Epoch 131, bias, value: tensor([-0.0111, -0.0070, -0.0075, -0.0148, -0.0104, -0.0011, 0.0116, 0.0085, 0.0170, -0.0050], device='cuda:0'), grad: tensor([ 2.2277e-05, 1.4901e-05, 4.8280e-05, 9.0599e-06, 3.4547e-04, 7.6145e-06, -4.3549e-06, 3.3975e-06, -1.2457e-04, -3.2258e-04], device='cuda:0') 100 0.0001 changing lr epoch 130, time 214.69, cls_loss 0.0032 cls_loss_mapping 0.0055 cls_loss_causal 0.5380 re_mapping 0.0081 re_causal 0.0234 /// teacc 98.90 lr 0.00010000 Epoch 132, weight, value: tensor([[-0.1484, 0.0673, -0.0154, ..., -0.0148, -0.0865, -0.0959], [ 0.0288, 0.0591, -0.0887, ..., -0.0777, -0.0203, 0.0812], [ 0.0499, -0.0742, -0.0853, ..., -0.0046, -0.0418, -0.0342], ..., [ 0.0589, 0.0194, 0.0891, ..., 0.0496, 0.1264, 0.0029], [ 0.0777, -0.1102, -0.0959, ..., -0.1353, -0.0478, 0.1273], [-0.0657, 0.0759, 0.0233, ..., -0.0861, -0.0553, -0.0076]], device='cuda:0'), grad: tensor([[ 5.3365e-07, -2.6263e-06, 1.7695e-07, ..., 8.2888e-08, 2.4587e-07, 7.0035e-07], [ 1.1027e-06, -4.3306e-07, 1.0002e-06, ..., 5.3830e-07, 4.7218e-07, -1.7220e-06], [ 3.4403e-06, 1.1567e-06, 7.3109e-07, ..., -9.8348e-07, -4.9453e-07, 6.1616e-06], ..., [-4.1537e-07, 2.6729e-06, -5.4203e-06, ..., -1.2973e-06, -3.4627e-06, 8.0913e-06], [-2.3651e-04, 4.5113e-06, 1.5981e-06, ..., 3.1013e-07, 2.5891e-07, -1.9431e-04], [ 1.8269e-05, 7.4282e-06, 4.3400e-07, ..., 8.5775e-07, 1.6410e-06, 2.6569e-05]], device='cuda:0') Epoch 132, bias, value: tensor([-0.0098, -0.0073, -0.0071, -0.0147, -0.0115, -0.0012, 0.0104, 0.0085, 0.0165, -0.0041], device='cuda:0'), grad: tensor([-3.7029e-06, -1.5991e-06, 9.1717e-06, 4.9099e-06, -4.4048e-05, 5.8126e-04, 5.2974e-06, 8.5607e-06, -6.1798e-04, 5.7995e-05], device='cuda:0') 100 0.0001 changing lr epoch 131, time 214.58, cls_loss 0.0033 cls_loss_mapping 0.0060 cls_loss_causal 0.5468 re_mapping 0.0078 re_causal 0.0230 /// teacc 98.89 lr 0.00010000 Epoch 133, weight, value: tensor([[-0.1489, 0.0680, -0.0157, ..., -0.0146, -0.0874, -0.0964], [ 0.0289, 0.0591, -0.0888, ..., -0.0779, -0.0203, 0.0815], [ 0.0503, -0.0748, -0.0855, ..., -0.0048, -0.0413, -0.0343], ..., [ 0.0586, 0.0189, 0.0888, ..., 0.0496, 0.1261, 0.0028], [ 0.0778, -0.1110, -0.0961, ..., -0.1360, -0.0479, 0.1275], [-0.0660, 0.0748, 0.0232, ..., -0.0864, -0.0552, -0.0088]], device='cuda:0'), grad: tensor([[ 1.0431e-07, -2.1961e-06, 3.1944e-07, ..., -4.4890e-07, 1.3411e-07, 8.5682e-08], [ 4.0233e-07, 1.8626e-09, 6.1840e-07, ..., 4.2655e-07, 7.2829e-07, -6.4354e-07], [-1.6615e-06, 4.9174e-07, 9.8441e-07, ..., -7.1898e-07, -1.6419e-06, 3.3062e-07], ..., [-1.6587e-06, 4.0941e-06, 9.9242e-06, ..., 2.9188e-06, 5.2713e-07, 2.6543e-07], [ 5.1130e-07, 1.4510e-06, 1.2871e-06, ..., 7.6834e-07, 7.7020e-07, 5.7742e-08], [ 2.1514e-07, -9.2387e-07, -2.1681e-05, ..., -6.5453e-06, -4.3660e-06, 2.0675e-06]], device='cuda:0') Epoch 133, bias, value: tensor([-0.0090, -0.0073, -0.0070, -0.0146, -0.0109, -0.0014, 0.0106, 0.0083, 0.0161, -0.0047], device='cuda:0'), grad: tensor([-6.2212e-07, 1.6056e-06, -4.7684e-06, 2.1279e-05, -1.7941e-05, -3.2216e-05, 1.6987e-05, 2.0728e-05, 8.7917e-06, -1.3866e-05], device='cuda:0') 100 0.0001 changing lr epoch 132, time 214.49, cls_loss 0.0041 cls_loss_mapping 0.0058 cls_loss_causal 0.5755 re_mapping 0.0078 re_causal 0.0231 /// teacc 98.98 lr 0.00010000 Epoch 134, weight, value: tensor([[-0.1492, 0.0674, -0.0160, ..., -0.0149, -0.0879, -0.0970], [ 0.0288, 0.0581, -0.0893, ..., -0.0782, -0.0205, 0.0817], [ 0.0502, -0.0747, -0.0861, ..., -0.0047, -0.0415, -0.0344], ..., [ 0.0590, 0.0178, 0.0892, ..., 0.0500, 0.1266, 0.0027], [ 0.0782, -0.1110, -0.0963, ..., -0.1366, -0.0480, 0.1284], [-0.0667, 0.0758, 0.0235, ..., -0.0870, -0.0554, -0.0091]], device='cuda:0'), grad: tensor([[ 3.0641e-07, -6.4913e-07, 4.8429e-08, ..., -2.1420e-08, 2.2072e-07, 1.7602e-07], [-1.9502e-06, -1.2051e-06, 6.6124e-08, ..., 2.4214e-07, -9.3505e-07, -4.4852e-06], [-5.4017e-07, 6.3330e-07, 5.7090e-07, ..., -4.5542e-07, -9.8534e-07, 1.2163e-06], ..., [ 1.7444e-06, 1.3625e-06, 5.0291e-08, ..., 4.0699e-07, 9.2480e-07, 2.7176e-06], [-6.9197e-07, 6.9942e-07, 1.3318e-07, ..., 2.4494e-07, 2.3842e-07, -5.9791e-07], [ 3.5763e-07, 1.5832e-06, 4.5635e-08, ..., 1.9837e-07, 1.4156e-07, 1.1390e-06]], device='cuda:0') Epoch 134, bias, value: tensor([-0.0097, -0.0080, -0.0067, -0.0149, -0.0109, -0.0011, 0.0101, 0.0083, 0.0166, -0.0040], device='cuda:0'), grad: tensor([ 2.9299e-06, -8.5682e-06, -1.3504e-07, 7.3686e-06, 1.0610e-05, -6.0834e-06, -2.3082e-05, 8.7991e-06, 1.9353e-06, 6.2063e-06], device='cuda:0') 100 0.0001 changing lr epoch 133, time 214.55, cls_loss 0.0039 cls_loss_mapping 0.0058 cls_loss_causal 0.5726 re_mapping 0.0079 re_causal 0.0225 /// teacc 98.85 lr 0.00010000 Epoch 135, weight, value: tensor([[-0.1494, 0.0681, -0.0166, ..., -0.0148, -0.0887, -0.0980], [ 0.0287, 0.0577, -0.0898, ..., -0.0784, -0.0205, 0.0818], [ 0.0503, -0.0753, -0.0865, ..., -0.0045, -0.0415, -0.0346], ..., [ 0.0594, 0.0175, 0.0900, ..., 0.0503, 0.1271, 0.0026], [ 0.0785, -0.1115, -0.0965, ..., -0.1371, -0.0481, 0.1291], [-0.0674, 0.0753, 0.0233, ..., -0.0879, -0.0559, -0.0095]], device='cuda:0'), grad: tensor([[ 4.6100e-07, -3.5405e-05, 2.6189e-06, ..., 5.4017e-06, 1.0245e-07, 2.1420e-07], [ 1.6559e-06, 5.6587e-06, 8.4843e-07, ..., 1.3830e-06, 2.0117e-07, 4.4852e-06], [-5.3458e-06, 2.0817e-05, 2.2396e-05, ..., 4.5478e-05, -6.4913e-07, 8.0373e-07], ..., [-1.4715e-07, 3.4962e-06, -4.4145e-07, ..., 1.9111e-06, -4.7311e-07, 1.8552e-06], [-1.7807e-06, 3.4776e-06, 3.0994e-06, ..., 6.9067e-06, 1.6298e-07, -4.2617e-06], [ 9.3691e-07, 5.3376e-05, 4.3660e-06, ..., 8.4937e-06, 1.4529e-07, 1.1444e-05]], device='cuda:0') Epoch 135, bias, value: tensor([-0.0096, -0.0082, -0.0067, -0.0151, -0.0104, -0.0012, 0.0108, 0.0086, 0.0167, -0.0047], device='cuda:0'), grad: tensor([-6.6221e-05, 1.6108e-05, 1.3053e-04, 1.3185e-04, -4.8727e-05, -3.3903e-04, 1.5989e-05, 1.1109e-05, 1.6659e-05, 1.3173e-04], device='cuda:0') 100 0.0001 changing lr epoch 134, time 214.84, cls_loss 0.0041 cls_loss_mapping 0.0059 cls_loss_causal 0.5543 re_mapping 0.0079 re_causal 0.0227 /// teacc 98.86 lr 0.00010000 Epoch 136, weight, value: tensor([[-0.1499, 0.0682, -0.0174, ..., -0.0144, -0.0892, -0.0985], [ 0.0275, 0.0581, -0.0916, ..., -0.0785, -0.0219, 0.0804], [ 0.0506, -0.0756, -0.0872, ..., -0.0043, -0.0415, -0.0347], ..., [ 0.0605, 0.0168, 0.0912, ..., 0.0501, 0.1288, 0.0041], [ 0.0787, -0.1118, -0.0967, ..., -0.1383, -0.0481, 0.1300], [-0.0678, 0.0755, 0.0236, ..., -0.0890, -0.0562, -0.0097]], device='cuda:0'), grad: tensor([[ 1.2144e-06, 1.7695e-08, 5.9232e-07, ..., 8.1304e-07, 1.0533e-06, 1.3243e-06], [ 3.9428e-05, -9.4343e-07, 2.6166e-05, ..., 3.9786e-06, 4.8667e-05, 2.4602e-05], [-2.0653e-05, 1.2293e-06, 5.4762e-06, ..., -6.0946e-06, -1.1012e-05, -2.8908e-05], ..., [-4.8488e-05, 5.3737e-07, -3.8594e-05, ..., 1.2824e-06, -6.6519e-05, -2.4945e-05], [ 1.7405e-05, 2.7753e-07, 5.5395e-06, ..., 8.2552e-06, 1.4618e-05, 1.9640e-05], [ 1.0710e-06, -3.0827e-07, 5.0291e-07, ..., 5.3179e-07, 1.4240e-06, 7.4413e-07]], device='cuda:0') Epoch 136, bias, value: tensor([-0.0096, -0.0095, -0.0065, -0.0154, -0.0103, -0.0010, 0.0104, 0.0099, 0.0169, -0.0048], device='cuda:0'), grad: tensor([ 3.1665e-06, 7.4148e-05, -3.7640e-05, -5.9605e-08, 2.6710e-06, 3.2857e-06, 1.0990e-06, -8.8274e-05, 3.9876e-05, 1.8952e-06], device='cuda:0') 100 0.0001 changing lr epoch 135, time 214.58, cls_loss 0.0034 cls_loss_mapping 0.0057 cls_loss_causal 0.5186 re_mapping 0.0082 re_causal 0.0228 /// teacc 98.89 lr 0.00010000 Epoch 137, weight, value: tensor([[-0.1505, 0.0686, -0.0178, ..., -0.0143, -0.0892, -0.0993], [ 0.0274, 0.0588, -0.0921, ..., -0.0786, -0.0220, 0.0808], [ 0.0506, -0.0759, -0.0874, ..., -0.0041, -0.0414, -0.0353], ..., [ 0.0607, 0.0160, 0.0920, ..., 0.0503, 0.1290, 0.0039], [ 0.0791, -0.1130, -0.0971, ..., -0.1388, -0.0482, 0.1309], [-0.0680, 0.0754, 0.0241, ..., -0.0895, -0.0563, -0.0097]], device='cuda:0'), grad: tensor([[ 7.4226e-07, -6.6124e-08, 1.0394e-06, ..., 2.4587e-07, 5.5972e-07, 4.5449e-07], [ 4.2953e-06, 2.1979e-06, 1.9204e-06, ..., 5.6624e-07, 1.2107e-07, 2.5164e-06], [-1.9912e-06, 4.0829e-06, 5.9232e-07, ..., -8.4564e-07, -1.3970e-06, 5.1335e-06], ..., [-5.1595e-06, 8.4341e-06, -7.3016e-06, ..., -3.6117e-06, -5.5619e-06, 5.7966e-06], [ 8.3297e-06, 1.4968e-05, 1.5320e-06, ..., 6.4820e-07, 1.5572e-06, 1.9982e-05], [ 1.8194e-05, 1.1079e-05, -4.1984e-06, ..., 2.7530e-06, 2.1234e-06, 3.6120e-05]], device='cuda:0') Epoch 137, bias, value: tensor([-0.0092, -0.0095, -0.0065, -0.0159, -0.0105, -0.0006, 0.0105, 0.0099, 0.0170, -0.0048], device='cuda:0'), grad: tensor([ 1.5087e-07, 4.2468e-06, 5.0962e-06, 4.0680e-06, -1.0067e-04, 1.5227e-06, 3.5372e-06, 9.0227e-06, 3.6448e-05, 3.6627e-05], device='cuda:0') 100 0.0001 changing lr epoch 136, time 214.77, cls_loss 0.0027 cls_loss_mapping 0.0050 cls_loss_causal 0.5337 re_mapping 0.0082 re_causal 0.0244 /// teacc 98.93 lr 0.00010000 Epoch 138, weight, value: tensor([[-0.1511, 0.0685, -0.0182, ..., -0.0145, -0.0903, -0.1000], [ 0.0274, 0.0590, -0.0921, ..., -0.0788, -0.0220, 0.0810], [ 0.0504, -0.0764, -0.0894, ..., -0.0044, -0.0419, -0.0355], ..., [ 0.0612, 0.0168, 0.0930, ..., 0.0514, 0.1297, 0.0038], [ 0.0791, -0.1146, -0.0973, ..., -0.1396, -0.0483, 0.1308], [-0.0681, 0.0756, 0.0241, ..., -0.0900, -0.0565, -0.0095]], device='cuda:0'), grad: tensor([[ 1.9893e-06, -1.5959e-05, 2.5332e-07, ..., -1.4435e-06, 9.8720e-08, -4.1090e-06], [ 1.8999e-05, 1.7043e-06, 2.6207e-06, ..., 1.4547e-06, 5.0105e-06, 1.7717e-05], [ 1.9088e-05, 1.3150e-06, 8.8476e-07, ..., -1.2945e-07, -2.2613e-06, 2.2382e-05], ..., [ 2.5362e-05, 1.3009e-05, 1.5646e-07, ..., 2.3451e-06, 5.5097e-06, 3.5942e-05], [-1.2636e-04, 3.3043e-06, 2.7996e-06, ..., 2.7213e-06, 7.5065e-07, -1.2505e-04], [ 3.5483e-06, 3.2187e-06, 1.3411e-07, ..., 1.0235e-06, 8.6240e-07, 6.4261e-06]], device='cuda:0') Epoch 138, bias, value: tensor([-0.0095, -0.0093, -0.0068, -0.0164, -0.0107, -0.0003, 0.0104, 0.0106, 0.0162, -0.0046], device='cuda:0'), grad: tensor([-5.4330e-05, 3.1263e-05, 3.0607e-05, -1.9938e-05, -4.9055e-05, 8.6427e-05, 5.6207e-05, 6.6876e-05, -1.6463e-04, 1.6570e-05], device='cuda:0') 100 0.0001 changing lr epoch 137, time 214.63, cls_loss 0.0038 cls_loss_mapping 0.0049 cls_loss_causal 0.5768 re_mapping 0.0081 re_causal 0.0227 /// teacc 98.78 lr 0.00010000 Epoch 139, weight, value: tensor([[-0.1511, 0.0671, -0.0181, ..., -0.0146, -0.0883, -0.0993], [ 0.0275, 0.0593, -0.0924, ..., -0.0789, -0.0218, 0.0817], [ 0.0508, -0.0770, -0.0897, ..., -0.0041, -0.0416, -0.0358], ..., [ 0.0610, 0.0156, 0.0933, ..., 0.0513, 0.1296, 0.0032], [ 0.0792, -0.1154, -0.0975, ..., -0.1402, -0.0484, 0.1313], [-0.0684, 0.0771, 0.0242, ..., -0.0908, -0.0570, -0.0097]], device='cuda:0'), grad: tensor([[ 2.4587e-07, -2.0675e-07, 5.9325e-07, ..., 2.9057e-07, 2.5332e-07, 3.3155e-07], [ 4.8056e-07, -2.8964e-07, 2.5686e-06, ..., 8.9221e-07, 8.5868e-07, -4.9621e-06], [ 7.1712e-07, 1.4575e-06, 1.1893e-06, ..., 8.3447e-07, 6.5472e-07, 2.5816e-06], ..., [-2.9001e-06, 7.1246e-07, -4.5337e-06, ..., -1.6401e-06, -4.3251e-06, 6.6496e-07], [-1.4612e-06, 1.2722e-06, 1.4463e-06, ..., 7.1805e-07, 1.9651e-07, -2.5630e-06], [ 9.8161e-07, -2.4922e-06, -2.7083e-06, ..., 9.4064e-07, 1.2936e-06, 5.3085e-08]], device='cuda:0') Epoch 139, bias, value: tensor([-0.0106, -0.0091, -0.0065, -0.0163, -0.0105, -0.0003, 0.0098, 0.0102, 0.0159, -0.0037], device='cuda:0'), grad: tensor([ 3.0994e-06, -7.4692e-07, 1.3731e-05, -2.3410e-05, -7.9155e-05, 2.7210e-05, 6.1870e-05, -4.3213e-06, 1.8235e-06, -1.8533e-07], device='cuda:0') 100 0.0001 changing lr epoch 138, time 214.46, cls_loss 0.0025 cls_loss_mapping 0.0040 cls_loss_causal 0.5689 re_mapping 0.0076 re_causal 0.0235 /// teacc 98.91 lr 0.00010000 Epoch 140, weight, value: tensor([[-0.1521, 0.0675, -0.0186, ..., -0.0147, -0.0884, -0.1003], [ 0.0275, 0.0599, -0.0927, ..., -0.0791, -0.0220, 0.0822], [ 0.0508, -0.0773, -0.0903, ..., -0.0042, -0.0415, -0.0358], ..., [ 0.0614, 0.0154, 0.0940, ..., 0.0512, 0.1301, 0.0030], [ 0.0794, -0.1155, -0.0977, ..., -0.1410, -0.0485, 0.1322], [-0.0695, 0.0767, 0.0233, ..., -0.0922, -0.0581, -0.0104]], device='cuda:0'), grad: tensor([[ 3.1106e-07, 4.9360e-07, 2.2724e-07, ..., 4.9360e-08, 6.9849e-08, 1.7229e-07], [ 7.6741e-07, -2.6114e-06, 1.7919e-06, ..., 1.4864e-06, 1.0207e-06, -3.8892e-06], [ 3.7625e-07, 4.3586e-07, 8.4005e-07, ..., 5.4669e-07, 3.4459e-07, 6.7987e-08], ..., [-1.6913e-06, 2.2110e-06, 3.4552e-07, ..., -7.8510e-07, -1.3197e-06, 8.7824e-07], [ 3.1590e-06, 9.8050e-06, 1.4910e-06, ..., 7.8045e-07, 4.7870e-07, 1.2340e-06], [ 4.7404e-07, -3.6657e-06, -5.7220e-06, ..., 6.3702e-07, 2.8312e-07, 3.1292e-07]], device='cuda:0') Epoch 140, bias, value: tensor([-0.0104, -0.0090, -0.0064, -0.0156, -0.0102, -0.0010, 0.0094, 0.0103, 0.0162, -0.0044], device='cuda:0'), grad: tensor([ 1.8878e-06, -4.7684e-07, 1.9632e-06, -5.0180e-06, 7.6815e-06, -3.1888e-05, 2.9542e-06, 2.3488e-06, 2.7955e-05, -7.4692e-06], device='cuda:0') 100 0.0001 changing lr epoch 139, time 214.59, cls_loss 0.0033 cls_loss_mapping 0.0061 cls_loss_causal 0.5438 re_mapping 0.0081 re_causal 0.0239 /// teacc 98.94 lr 0.00010000 Epoch 141, weight, value: tensor([[-0.1529, 0.0686, -0.0192, ..., -0.0140, -0.0888, -0.1011], [ 0.0271, 0.0593, -0.0932, ..., -0.0797, -0.0225, 0.0819], [ 0.0517, -0.0764, -0.0897, ..., -0.0030, -0.0405, -0.0357], ..., [ 0.0612, 0.0154, 0.0942, ..., 0.0504, 0.1301, 0.0033], [ 0.0798, -0.1165, -0.0983, ..., -0.1422, -0.0486, 0.1329], [-0.0698, 0.0768, 0.0232, ..., -0.0935, -0.0583, -0.0107]], device='cuda:0'), grad: tensor([[ 5.4482e-07, -5.2862e-06, 2.1793e-07, ..., -2.5332e-07, 3.7253e-09, 1.5199e-06], [ 2.5719e-05, 9.2294e-07, 2.6915e-07, ..., 2.7474e-07, 1.3039e-08, 7.5221e-05], [ 1.3225e-06, 5.8580e-07, 3.0268e-07, ..., 9.1270e-08, -2.6356e-07, 4.4294e-06], ..., [ 7.0781e-07, 5.1223e-07, -1.3970e-07, ..., 1.3877e-07, 9.9652e-08, 2.0098e-06], [-3.0234e-05, 1.4063e-07, 1.0626e-06, ..., 7.3574e-07, -7.7300e-08, -8.9645e-05], [ 2.2911e-07, 6.6683e-06, 7.1246e-07, ..., 2.0098e-06, 3.2596e-08, 1.0319e-06]], device='cuda:0') Epoch 141, bias, value: tensor([-0.0098, -0.0098, -0.0051, -0.0157, -0.0101, -0.0008, 0.0081, 0.0102, 0.0164, -0.0044], device='cuda:0'), grad: tensor([-5.3421e-06, 7.9751e-05, 5.3272e-06, -3.4451e-05, -7.0129e-07, 4.9055e-05, -2.0996e-05, 3.0175e-06, -8.5890e-05, 1.0148e-05], device='cuda:0') 100 0.0001 changing lr epoch 140, time 214.70, cls_loss 0.0027 cls_loss_mapping 0.0051 cls_loss_causal 0.5738 re_mapping 0.0077 re_causal 0.0234 /// teacc 98.87 lr 0.00010000 Epoch 142, weight, value: tensor([[-0.1546, 0.0680, -0.0212, ..., -0.0148, -0.0908, -0.1016], [ 0.0272, 0.0597, -0.0933, ..., -0.0798, -0.0224, 0.0823], [ 0.0516, -0.0770, -0.0901, ..., -0.0030, -0.0407, -0.0362], ..., [ 0.0616, 0.0154, 0.0950, ..., 0.0513, 0.1305, 0.0032], [ 0.0801, -0.1152, -0.0992, ..., -0.1436, -0.0489, 0.1339], [-0.0702, 0.0764, 0.0228, ..., -0.0952, -0.0586, -0.0111]], device='cuda:0'), grad: tensor([[ 9.1270e-08, -5.5842e-06, 3.2596e-08, ..., -1.1548e-07, 5.6811e-08, 3.6508e-07], [-1.1353e-06, -5.3085e-06, 1.4435e-07, ..., 1.2200e-07, -4.4052e-07, -6.3591e-06], [ 3.1479e-07, 6.5472e-07, 1.9744e-07, ..., -4.5355e-07, -4.2841e-08, 9.6671e-07], ..., [ 2.9337e-07, 1.8468e-06, -1.5460e-07, ..., -2.9802e-08, -5.4948e-08, 2.1290e-06], [-1.7621e-06, 5.7276e-07, 1.5087e-07, ..., 4.2841e-08, 6.5193e-08, -2.2799e-06], [ 2.7288e-07, 3.6769e-06, -1.6261e-06, ..., 4.2934e-07, 1.3970e-07, 6.8452e-07]], device='cuda:0') Epoch 142, bias, value: tensor([-0.0106, -0.0096, -0.0054, -0.0153, -0.0101, -0.0019, 0.0098, 0.0105, 0.0173, -0.0048], device='cuda:0'), grad: tensor([-5.9977e-06, -1.1265e-05, 3.1479e-07, 2.7865e-06, 5.8338e-06, 1.2908e-06, -2.8778e-07, 3.9265e-06, -2.1756e-06, 5.5432e-06], device='cuda:0') 100 0.0001 changing lr epoch 141, time 214.55, cls_loss 0.0033 cls_loss_mapping 0.0044 cls_loss_causal 0.5623 re_mapping 0.0075 re_causal 0.0228 /// teacc 98.90 lr 0.00010000 Epoch 143, weight, value: tensor([[-0.1552, 0.0684, -0.0218, ..., -0.0137, -0.0916, -0.1019], [ 0.0273, 0.0609, -0.0933, ..., -0.0799, -0.0222, 0.0832], [ 0.0510, -0.0778, -0.0926, ..., -0.0039, -0.0415, -0.0365], ..., [ 0.0621, 0.0147, 0.0963, ..., 0.0525, 0.1312, 0.0029], [ 0.0801, -0.1164, -0.0997, ..., -0.1446, -0.0490, 0.1340], [-0.0706, 0.0763, 0.0233, ..., -0.0959, -0.0589, -0.0113]], device='cuda:0'), grad: tensor([[ 1.1269e-07, 3.7719e-08, 9.3132e-08, ..., -8.2422e-08, 1.8161e-07, 1.1502e-07], [ 1.4249e-07, 2.4438e-06, 3.8650e-07, ..., 1.5367e-07, 2.3562e-07, -6.3982e-07], [-7.4692e-07, 1.3988e-06, 1.3504e-07, ..., -7.8883e-07, -7.4925e-07, 3.1525e-07], ..., [-2.2352e-07, 2.6003e-06, -2.9132e-06, ..., -3.4459e-07, -1.7276e-06, 2.8685e-07], [-3.8650e-08, 8.6501e-06, 1.5739e-07, ..., 4.6380e-07, 4.0838e-07, -1.5479e-06], [ 1.0477e-07, 4.2111e-05, 3.9116e-07, ..., 2.4308e-07, 2.2911e-07, 6.1234e-07]], device='cuda:0') Epoch 143, bias, value: tensor([-0.0103, -0.0092, -0.0063, -0.0158, -0.0101, -0.0008, 0.0091, 0.0108, 0.0166, -0.0049], device='cuda:0'), grad: tensor([ 1.9148e-05, 1.0580e-05, 2.0266e-06, 5.7034e-06, -3.1471e-05, 3.0145e-05, -1.4615e-04, 9.7416e-07, 3.6478e-05, 7.2420e-05], device='cuda:0') 100 0.0001 changing lr epoch 142, time 214.57, cls_loss 0.0032 cls_loss_mapping 0.0057 cls_loss_causal 0.5412 re_mapping 0.0080 re_causal 0.0228 /// teacc 98.89 lr 0.00010000 Epoch 144, weight, value: tensor([[-0.1556, 0.0682, -0.0223, ..., -0.0137, -0.0923, -0.1022], [ 0.0280, 0.0612, -0.0923, ..., -0.0798, -0.0216, 0.0839], [ 0.0507, -0.0780, -0.0926, ..., -0.0035, -0.0418, -0.0368], ..., [ 0.0621, 0.0145, 0.0960, ..., 0.0521, 0.1314, 0.0026], [ 0.0803, -0.1162, -0.1000, ..., -0.1454, -0.0491, 0.1355], [-0.0716, 0.0767, 0.0230, ..., -0.0965, -0.0602, -0.0120]], device='cuda:0'), grad: tensor([[ 1.7090e-07, 6.3963e-06, 1.8999e-06, ..., 1.4948e-07, 1.4435e-07, 7.3155e-07], [ 3.1460e-06, 6.3372e-04, 1.5032e-04, ..., 2.7139e-06, 4.4554e-06, 3.9041e-05], [ 5.1782e-07, 4.6007e-06, 1.1474e-06, ..., 2.7288e-07, -3.4925e-08, 1.3299e-06], ..., [-4.7088e-06, 4.4614e-05, 7.2494e-06, ..., -4.0010e-06, -7.0520e-06, 5.4613e-06], [-4.6305e-06, 7.3314e-06, 1.5236e-06, ..., -1.7630e-06, -2.1467e-07, -3.7048e-06], [ 7.6368e-07, -1.0443e-03, -2.6917e-04, ..., 8.3167e-07, 9.6206e-07, -4.8429e-05]], device='cuda:0') Epoch 144, bias, value: tensor([-0.0106, -0.0082, -0.0061, -0.0157, -0.0109, -0.0010, 0.0096, 0.0103, 0.0171, -0.0052], device='cuda:0'), grad: tensor([ 1.6600e-05, 1.4591e-03, 1.4253e-05, 5.3197e-05, 5.8699e-04, 4.8518e-05, 4.1910e-06, 1.3041e-04, 1.4260e-05, -2.3270e-03], device='cuda:0') 100 0.0001 changing lr epoch 143, time 214.70, cls_loss 0.0025 cls_loss_mapping 0.0051 cls_loss_causal 0.5730 re_mapping 0.0075 re_causal 0.0229 /// teacc 98.91 lr 0.00010000 Epoch 145, weight, value: tensor([[-0.1559, 0.0686, -0.0227, ..., -0.0138, -0.0927, -0.1026], [ 0.0279, 0.0613, -0.0925, ..., -0.0800, -0.0216, 0.0846], [ 0.0514, -0.0781, -0.0918, ..., -0.0028, -0.0409, -0.0368], ..., [ 0.0618, 0.0144, 0.0961, ..., 0.0517, 0.1312, 0.0024], [ 0.0804, -0.1170, -0.1002, ..., -0.1462, -0.0492, 0.1358], [-0.0720, 0.0766, 0.0231, ..., -0.0973, -0.0610, -0.0123]], device='cuda:0'), grad: tensor([[ 2.7940e-07, -1.6671e-07, 2.8778e-07, ..., 9.4995e-08, 1.4249e-07, 6.0163e-07], [ 3.4682e-06, 7.5996e-06, 3.5129e-06, ..., 1.5516e-06, 2.6524e-06, 9.2536e-06], [ 9.5442e-06, 3.8650e-07, 2.1085e-06, ..., 8.1956e-07, 1.3569e-06, 8.1658e-06], ..., [-1.3933e-05, -2.3730e-06, -1.8269e-05, ..., -7.9498e-06, -1.3918e-05, -2.1197e-06], [-2.2292e-05, 3.2298e-06, 2.6375e-06, ..., 1.0757e-06, 1.7192e-06, -1.8269e-05], [ 3.5632e-06, -1.5423e-06, 2.7940e-07, ..., 1.4910e-06, 2.5462e-06, 2.2780e-06]], device='cuda:0') Epoch 145, bias, value: tensor([-0.0105, -0.0083, -0.0053, -0.0141, -0.0109, -0.0023, 0.0096, 0.0100, 0.0168, -0.0054], device='cuda:0'), grad: tensor([ 2.6785e-06, 4.1455e-05, 2.0742e-05, 2.1994e-05, 5.3085e-07, 6.1214e-05, -9.7215e-05, -2.0832e-05, -3.2932e-05, 2.2985e-06], device='cuda:0') 100 0.0001 changing lr epoch 144, time 214.47, cls_loss 0.0025 cls_loss_mapping 0.0049 cls_loss_causal 0.5270 re_mapping 0.0075 re_causal 0.0223 /// teacc 98.83 lr 0.00010000 Epoch 146, weight, value: tensor([[-0.1566, 0.0688, -0.0230, ..., -0.0138, -0.0930, -0.1033], [ 0.0279, 0.0620, -0.0928, ..., -0.0800, -0.0216, 0.0851], [ 0.0514, -0.0781, -0.0921, ..., -0.0028, -0.0409, -0.0374], ..., [ 0.0622, 0.0139, 0.0966, ..., 0.0519, 0.1315, 0.0026], [ 0.0810, -0.1179, -0.1010, ..., -0.1467, -0.0495, 0.1368], [-0.0725, 0.0767, 0.0237, ..., -0.0979, -0.0613, -0.0126]], device='cuda:0'), grad: tensor([[ 5.0291e-07, -3.5558e-06, 5.4948e-08, ..., -5.8487e-07, 3.4552e-07, 3.8184e-08], [ 1.1344e-06, 2.0489e-07, 1.4715e-07, ..., 1.7518e-06, 1.0990e-06, 1.8068e-07], [-9.9614e-06, 6.7614e-07, -1.8626e-07, ..., -1.4283e-05, -5.4613e-06, 5.0291e-08], ..., [ 1.1055e-06, 1.3690e-06, 8.8103e-07, ..., 1.8422e-06, 1.3430e-06, 6.1002e-07], [ 4.6752e-07, 4.3493e-07, 2.3842e-07, ..., 1.2256e-06, 4.6659e-07, -9.9465e-07], [ 3.8184e-07, -4.1053e-06, -8.3596e-06, ..., 8.9314e-07, 2.3190e-07, 9.8255e-07]], device='cuda:0') Epoch 146, bias, value: tensor([-0.0104, -0.0082, -0.0053, -0.0143, -0.0107, -0.0022, 0.0079, 0.0101, 0.0174, -0.0052], device='cuda:0'), grad: tensor([-6.4522e-06, 5.5768e-06, -4.0919e-05, 2.4915e-05, 8.5831e-06, -3.6620e-06, 4.8392e-06, 8.1137e-06, 2.9728e-06, -4.0010e-06], device='cuda:0') 100 0.0001 changing lr epoch 145, time 214.40, cls_loss 0.0031 cls_loss_mapping 0.0045 cls_loss_causal 0.5260 re_mapping 0.0074 re_causal 0.0220 /// teacc 98.99 lr 0.00010000 Epoch 147, weight, value: tensor([[-0.1569, 0.0689, -0.0237, ..., -0.0139, -0.0934, -0.1037], [ 0.0273, 0.0621, -0.0939, ..., -0.0813, -0.0225, 0.0852], [ 0.0512, -0.0790, -0.0934, ..., -0.0038, -0.0417, -0.0378], ..., [ 0.0633, 0.0141, 0.0983, ..., 0.0540, 0.1332, 0.0028], [ 0.0810, -0.1184, -0.1014, ..., -0.1478, -0.0496, 0.1370], [-0.0732, 0.0767, 0.0232, ..., -0.0996, -0.0621, -0.0128]], device='cuda:0'), grad: tensor([[ 2.8592e-07, -5.5939e-05, -1.8075e-05, ..., 1.3607e-06, 1.1157e-06, 2.0582e-07], [ 1.8887e-06, -3.2317e-06, 4.4610e-07, ..., 2.0228e-06, 1.9912e-06, -8.3521e-06], [-3.6895e-05, 2.0172e-06, -1.2875e-05, ..., -5.4777e-05, -5.8830e-05, 1.8328e-06], ..., [ 2.4959e-05, -3.5197e-05, -7.7009e-05, ..., 2.7329e-05, -8.2433e-05, 6.7335e-07], [ 4.0680e-06, 1.4622e-06, 8.8010e-07, ..., 3.6247e-06, 3.3304e-06, -6.7987e-07], [ 1.0598e-06, 3.5375e-05, 1.1690e-05, ..., 1.8161e-06, 3.3714e-06, -5.0198e-07]], device='cuda:0') Epoch 147, bias, value: tensor([-0.0105, -0.0088, -0.0060, -0.0143, -0.0107, -0.0012, 0.0079, 0.0114, 0.0160, -0.0054], device='cuda:0'), grad: tensor([-1.1617e-04, -7.3612e-06, -1.1706e-04, 2.3007e-05, 1.9038e-04, 1.2785e-05, 1.9237e-05, -9.7990e-05, 1.4000e-05, 7.8857e-05], device='cuda:0') 100 0.0001 changing lr epoch 146, time 214.70, cls_loss 0.0038 cls_loss_mapping 0.0059 cls_loss_causal 0.5834 re_mapping 0.0071 re_causal 0.0223 /// teacc 98.92 lr 0.00010000 Epoch 148, weight, value: tensor([[-0.1581, 0.0686, -0.0247, ..., -0.0135, -0.0947, -0.1046], [ 0.0267, 0.0625, -0.0949, ..., -0.0825, -0.0232, 0.0850], [ 0.0508, -0.0796, -0.0943, ..., -0.0036, -0.0422, -0.0381], ..., [ 0.0648, 0.0140, 0.0999, ..., 0.0557, 0.1347, 0.0036], [ 0.0814, -0.1196, -0.1018, ..., -0.1492, -0.0498, 0.1379], [-0.0740, 0.0793, 0.0242, ..., -0.1009, -0.0625, -0.0108]], device='cuda:0'), grad: tensor([[ 6.0257e-07, -4.2655e-06, 2.1905e-06, ..., 6.6776e-07, 5.4017e-08, 2.8498e-07], [-1.3169e-06, -2.0325e-05, 3.7625e-07, ..., -3.1944e-07, 5.0291e-08, -3.7074e-05], [ 5.9567e-06, 1.1802e-05, 1.1865e-06, ..., 1.6904e-06, 2.6822e-07, 2.6375e-05], ..., [-2.1048e-06, 2.7008e-06, -7.8827e-06, ..., -4.2208e-06, -3.0175e-07, 3.0566e-06], [-9.1940e-06, 1.6429e-06, 5.4110e-07, ..., -1.2098e-06, -3.6135e-07, -9.0301e-06], [ 2.5183e-06, -2.2262e-05, -5.0783e-05, ..., -1.0453e-05, 1.0710e-07, 5.6140e-06]], device='cuda:0') Epoch 148, bias, value: tensor([-0.0108, -0.0095, -0.0061, -0.0148, -0.0130, -0.0016, 0.0073, 0.0128, 0.0169, -0.0035], device='cuda:0'), grad: tensor([-3.6322e-06, -8.4579e-05, 6.3002e-05, 6.6996e-05, 1.0140e-05, 2.0415e-05, -3.0294e-05, -4.4443e-06, -8.1211e-06, -2.9564e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 147---------------------------------------------------- epoch 147, time 230.51, cls_loss 0.0028 cls_loss_mapping 0.0051 cls_loss_causal 0.5440 re_mapping 0.0071 re_causal 0.0214 /// teacc 99.02 lr 0.00010000 Epoch 149, weight, value: tensor([[-0.1587, 0.0694, -0.0249, ..., -0.0129, -0.0951, -0.1053], [ 0.0276, 0.0633, -0.0943, ..., -0.0825, -0.0224, 0.0864], [ 0.0505, -0.0806, -0.0947, ..., -0.0038, -0.0425, -0.0390], ..., [ 0.0643, 0.0130, 0.0998, ..., 0.0560, 0.1344, 0.0025], [ 0.0819, -0.1205, -0.1020, ..., -0.1501, -0.0499, 0.1389], [-0.0748, 0.0785, 0.0241, ..., -0.1022, -0.0629, -0.0120]], device='cuda:0'), grad: tensor([[ 1.6578e-07, -4.8019e-06, 1.0990e-07, ..., -2.0135e-06, 1.6764e-07, 2.7008e-08], [ 2.8964e-07, 1.9651e-07, 2.1141e-07, ..., 8.9128e-07, 1.9465e-07, -5.7742e-07], [-2.3227e-06, 5.4296e-07, -8.4378e-07, ..., -5.1409e-06, -1.8906e-06, 3.0734e-07], ..., [ 3.3993e-07, 4.5355e-07, -6.0443e-07, ..., 1.6056e-06, 2.1420e-07, 5.0198e-07], [ 4.6007e-07, 1.1073e-06, 3.2969e-07, ..., 1.5693e-06, 4.2468e-07, 3.2596e-08], [ 1.4249e-07, 1.8161e-06, 9.0338e-08, ..., 1.0235e-06, 1.4063e-07, 1.7136e-07]], device='cuda:0') Epoch 149, bias, value: tensor([-0.0102, -0.0081, -0.0067, -0.0151, -0.0120, -0.0012, 0.0072, 0.0118, 0.0171, -0.0046], device='cuda:0'), grad: tensor([-1.5289e-05, 1.3290e-06, -6.1616e-06, 3.5726e-06, -1.2042e-06, 1.4901e-06, 8.6706e-07, 3.6322e-06, 5.6811e-06, 6.0461e-06], device='cuda:0') 100 0.0001 changing lr epoch 148, time 214.66, cls_loss 0.0031 cls_loss_mapping 0.0047 cls_loss_causal 0.5254 re_mapping 0.0076 re_causal 0.0216 /// teacc 98.97 lr 0.00010000 Epoch 150, weight, value: tensor([[-0.1596, 0.0697, -0.0248, ..., -0.0120, -0.0956, -0.1061], [ 0.0274, 0.0635, -0.0947, ..., -0.0831, -0.0224, 0.0867], [ 0.0504, -0.0809, -0.0951, ..., -0.0037, -0.0425, -0.0397], ..., [ 0.0638, 0.0129, 0.0999, ..., 0.0565, 0.1344, 0.0014], [ 0.0840, -0.1207, -0.1010, ..., -0.1492, -0.0490, 0.1417], [-0.0755, 0.0782, 0.0242, ..., -0.1035, -0.0632, -0.0126]], device='cuda:0'), grad: tensor([[ 5.3272e-07, -1.1218e-04, 1.2089e-06, ..., -2.4602e-05, 2.0023e-07, 1.1465e-06], [ 2.6952e-06, 6.9737e-06, 4.5113e-06, ..., 6.0312e-06, 1.5022e-06, 6.4820e-06], [-1.8645e-06, 2.8703e-06, 7.3537e-06, ..., 2.6841e-06, -9.7882e-07, 2.2370e-06], ..., [-8.8155e-05, 1.4439e-05, -8.9884e-05, ..., -5.4777e-05, -3.5256e-05, -3.1572e-06], [ 4.6313e-05, 1.8299e-05, 5.8174e-05, ..., 3.6120e-05, 1.8373e-05, 1.5497e-05], [ 1.5637e-06, 3.5620e-04, -2.2221e-06, ..., 5.7742e-06, 7.8045e-07, 2.4438e-04]], device='cuda:0') Epoch 150, bias, value: tensor([-0.0104, -0.0083, -0.0068, -0.0149, -0.0118, -0.0019, 0.0085, 0.0115, 0.0186, -0.0052], device='cuda:0'), grad: tensor([-4.3988e-04, 3.4988e-05, 1.8343e-05, 5.9642e-06, -1.2760e-03, 2.7150e-05, 4.6182e-04, -1.2481e-04, 1.6856e-04, 1.1234e-03], device='cuda:0') 100 0.0001 changing lr epoch 149, time 214.43, cls_loss 0.0028 cls_loss_mapping 0.0041 cls_loss_causal 0.5267 re_mapping 0.0075 re_causal 0.0219 /// teacc 99.00 lr 0.00010000 Epoch 151, weight, value: tensor([[-0.1605, 0.0698, -0.0253, ..., -0.0118, -0.0961, -0.1066], [ 0.0257, 0.0637, -0.0951, ..., -0.0836, -0.0236, 0.0852], [ 0.0498, -0.0812, -0.0955, ..., -0.0037, -0.0429, -0.0401], ..., [ 0.0651, 0.0124, 0.1000, ..., 0.0567, 0.1359, 0.0022], [ 0.0855, -0.1209, -0.1006, ..., -0.1505, -0.0492, 0.1431], [-0.0759, 0.0781, 0.0243, ..., -0.1040, -0.0635, -0.0130]], device='cuda:0'), grad: tensor([[ 7.8231e-08, -6.3702e-07, 4.6566e-09, ..., -1.1362e-07, 8.6613e-08, 1.3039e-08], [-2.2188e-05, -8.7544e-08, 2.0489e-08, ..., -3.4064e-05, -2.7433e-05, -2.2165e-07], [ 2.0191e-05, 4.1910e-08, 1.7695e-08, ..., 3.0026e-05, 2.4140e-05, 8.3912e-07], ..., [ 1.8505e-06, 1.7509e-07, -5.5879e-09, ..., 2.8163e-06, 2.2426e-06, 1.6019e-07], [-8.4750e-07, 1.3225e-07, 3.5390e-08, ..., 1.4808e-07, 6.7055e-08, -1.0272e-06], [ 1.2387e-07, -2.1420e-08, -1.8161e-07, ..., 1.7695e-07, 7.6368e-08, 1.0524e-07]], device='cuda:0') Epoch 151, bias, value: tensor([-0.0103, -0.0096, -0.0072, -0.0142, -0.0116, -0.0023, 0.0085, 0.0125, 0.0193, -0.0054], device='cuda:0'), grad: tensor([-1.0217e-06, -1.1641e-04, 1.0335e-04, 2.8498e-06, 3.5018e-07, 4.0606e-07, 4.3306e-07, 9.9912e-06, -5.3737e-07, 5.8860e-07], device='cuda:0') 100 0.0001 changing lr epoch 150, time 214.80, cls_loss 0.0024 cls_loss_mapping 0.0047 cls_loss_causal 0.5567 re_mapping 0.0076 re_causal 0.0223 /// teacc 98.97 lr 0.00010000 Epoch 152, weight, value: tensor([[-0.1611, 0.0702, -0.0254, ..., -0.0121, -0.0964, -0.1068], [ 0.0255, 0.0638, -0.0953, ..., -0.0838, -0.0237, 0.0854], [ 0.0501, -0.0815, -0.0958, ..., -0.0030, -0.0429, -0.0405], ..., [ 0.0653, 0.0117, 0.1003, ..., 0.0567, 0.1361, 0.0021], [ 0.0854, -0.1211, -0.1008, ..., -0.1512, -0.0494, 0.1435], [-0.0762, 0.0779, 0.0247, ..., -0.1046, -0.0637, -0.0132]], device='cuda:0'), grad: tensor([[ 9.3132e-08, -2.3935e-06, 3.3993e-07, ..., -3.0361e-07, 4.6566e-09, 1.5553e-07], [-2.0877e-05, -4.1910e-08, 4.8615e-07, ..., 2.6915e-07, 2.0303e-07, -3.4660e-05], [ 5.8673e-06, 4.8801e-07, 1.0040e-06, ..., 1.0058e-06, 1.1269e-07, 9.5218e-06], ..., [ 7.6741e-06, 1.9558e-07, -1.2051e-06, ..., -4.1258e-07, -6.5751e-07, 1.3746e-05], [ 4.0159e-06, 5.6718e-07, 4.6846e-07, ..., 1.8906e-07, 2.0489e-08, 6.1654e-06], [ 1.9241e-06, -3.4180e-07, -7.9721e-07, ..., 6.3889e-07, 2.8498e-07, 2.6189e-06]], device='cuda:0') Epoch 152, bias, value: tensor([-0.0103, -0.0097, -0.0067, -0.0155, -0.0111, -0.0014, 0.0084, 0.0124, 0.0191, -0.0057], device='cuda:0'), grad: tensor([-3.3416e-06, -6.3956e-05, 1.9401e-05, 6.7167e-06, 4.6045e-06, -5.8264e-06, -2.9430e-07, 2.4498e-05, 1.3039e-05, 4.9956e-06], device='cuda:0') 100 0.0001 changing lr epoch 151, time 214.67, cls_loss 0.0037 cls_loss_mapping 0.0061 cls_loss_causal 0.5193 re_mapping 0.0078 re_causal 0.0219 /// teacc 98.96 lr 0.00010000 Epoch 153, weight, value: tensor([[-0.1620, 0.0706, -0.0263, ..., -0.0123, -0.0969, -0.1075], [ 0.0254, 0.0637, -0.0952, ..., -0.0840, -0.0239, 0.0857], [ 0.0514, -0.0817, -0.0949, ..., -0.0008, -0.0421, -0.0419], ..., [ 0.0636, 0.0134, 0.0990, ..., 0.0542, 0.1360, 0.0019], [ 0.0862, -0.1218, -0.1006, ..., -0.1515, -0.0489, 0.1446], [-0.0762, 0.0777, 0.0247, ..., -0.1051, -0.0640, -0.0134]], device='cuda:0'), grad: tensor([[ 2.2538e-07, -9.0618e-07, 1.4156e-07, ..., 6.5193e-08, 7.1712e-08, 1.6112e-07], [ 4.4517e-06, -6.2678e-07, 3.9376e-06, ..., 6.6776e-07, 1.6894e-06, -2.4270e-06], [ 4.3958e-07, 3.1758e-07, 2.5034e-06, ..., 1.5739e-07, 8.3726e-07, -7.7765e-07], ..., [-1.2279e-05, 4.0140e-07, -1.1384e-05, ..., -2.1569e-06, -5.2825e-06, 6.7428e-07], [ 1.8664e-06, 4.8149e-07, 1.0198e-06, ..., 6.0070e-07, 5.1502e-07, 8.8569e-07], [ 3.6582e-06, 1.5926e-07, 1.6913e-06, ..., 4.3400e-07, 1.6019e-06, 2.1700e-07]], device='cuda:0') Epoch 153, bias, value: tensor([-0.0106, -0.0097, -0.0055, -0.0149, -0.0118, -0.0018, 0.0089, 0.0113, 0.0201, -0.0060], device='cuda:0'), grad: tensor([-1.8859e-06, 4.4629e-06, -2.9057e-07, 1.1874e-06, -2.6152e-05, 1.0720e-06, 2.4691e-05, -1.5058e-05, 5.2154e-06, 6.7726e-06], device='cuda:0') 100 0.0001 changing lr epoch 152, time 214.67, cls_loss 0.0028 cls_loss_mapping 0.0040 cls_loss_causal 0.5416 re_mapping 0.0076 re_causal 0.0219 /// teacc 98.94 lr 0.00010000 Epoch 154, weight, value: tensor([[-1.6294e-01, 7.1158e-02, -2.6980e-02, ..., -1.1700e-02, -9.7284e-02, -1.0796e-01], [ 2.4883e-02, 6.3166e-02, -9.6171e-02, ..., -8.4620e-02, -2.4299e-02, 8.5606e-02], [ 5.2276e-02, -8.2091e-02, -9.4249e-02, ..., 1.1465e-04, -4.1739e-02, -4.2401e-02], ..., [ 6.3415e-02, 1.4235e-02, 9.9814e-02, ..., 5.3723e-02, 1.3648e-01, 2.4348e-03], [ 8.6179e-02, -1.2237e-01, -1.0100e-01, ..., -1.5249e-01, -4.9038e-02, 1.4487e-01], [-7.7673e-02, 7.7732e-02, 2.4149e-02, ..., -1.0675e-01, -6.5447e-02, -1.3673e-02]], device='cuda:0'), grad: tensor([[ 2.3376e-07, -2.1476e-06, 9.6858e-08, ..., 2.2929e-06, 4.2561e-07, 1.6112e-07], [ 1.1493e-06, -3.3993e-07, 1.3411e-07, ..., 1.1967e-06, 4.5914e-07, -4.4145e-07], [-2.0549e-05, 1.7136e-07, -9.0338e-07, ..., -2.4915e-05, -6.7018e-06, 6.1374e-07], ..., [ 1.0736e-05, 4.1071e-07, 2.8312e-07, ..., 6.3516e-06, 2.5686e-06, 1.7695e-07], [ 4.4964e-06, 1.3784e-07, 3.7346e-07, ..., 4.9509e-06, 1.8869e-06, -2.2706e-06], [ 7.6182e-07, 6.8732e-07, -5.6531e-07, ..., 4.0885e-07, 1.0990e-07, 1.2238e-06]], device='cuda:0') Epoch 154, bias, value: tensor([-0.0103, -0.0102, -0.0048, -0.0157, -0.0119, -0.0012, 0.0090, 0.0116, 0.0197, -0.0063], device='cuda:0'), grad: tensor([ 1.9800e-06, 2.5667e-06, -6.4731e-05, 2.3365e-05, -2.1141e-07, 2.3860e-06, 5.3421e-06, 1.6972e-05, 8.8364e-06, 3.4813e-06], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 153---------------------------------------------------- epoch 153, time 231.30, cls_loss 0.0024 cls_loss_mapping 0.0041 cls_loss_causal 0.5377 re_mapping 0.0075 re_causal 0.0209 /// teacc 99.09 lr 0.00010000 Epoch 155, weight, value: tensor([[-0.1637, 0.0717, -0.0276, ..., -0.0107, -0.0975, -0.1085], [ 0.0247, 0.0637, -0.0966, ..., -0.0853, -0.0245, 0.0860], [ 0.0524, -0.0827, -0.0946, ..., 0.0005, -0.0419, -0.0426], ..., [ 0.0639, 0.0138, 0.1003, ..., 0.0540, 0.1369, 0.0025], [ 0.0859, -0.1231, -0.1022, ..., -0.1543, -0.0493, 0.1448], [-0.0782, 0.0774, 0.0239, ..., -0.1080, -0.0658, -0.0139]], device='cuda:0'), grad: tensor([[ 7.8883e-07, -5.3756e-06, 1.8217e-06, ..., -2.3842e-07, 7.6834e-07, 2.0899e-06], [-3.8648e-04, -6.5041e-04, -9.2983e-04, ..., 5.6438e-07, -3.7599e-04, -1.1387e-03], [-3.1497e-06, 1.5935e-06, 1.4622e-06, ..., -2.2948e-06, -3.4999e-06, 2.3637e-06], ..., [ 3.6025e-04, 6.0177e-04, 8.5974e-04, ..., 1.4538e-06, 3.5095e-04, 1.0529e-03], [ 2.8498e-07, 1.0179e-06, 8.8569e-07, ..., 1.4435e-07, 3.8184e-07, 4.6752e-07], [ 1.5348e-05, 2.7597e-05, 3.5644e-05, ..., 3.0641e-07, 1.4871e-05, 4.4793e-05]], device='cuda:0') Epoch 155, bias, value: tensor([-0.0100, -0.0102, -0.0042, -0.0162, -0.0115, -0.0015, 0.0097, 0.0118, 0.0188, -0.0067], device='cuda:0'), grad: tensor([-1.2584e-05, -1.8759e-03, -3.7868e-06, 2.2743e-06, 5.4479e-05, 3.7327e-06, 5.7109e-06, 1.7424e-03, 2.3134e-06, 8.0764e-05], device='cuda:0') 100 0.0001 changing lr epoch 154, time 214.46, cls_loss 0.0020 cls_loss_mapping 0.0036 cls_loss_causal 0.5628 re_mapping 0.0073 re_causal 0.0221 /// teacc 98.92 lr 0.00010000 Epoch 156, weight, value: tensor([[-0.1650, 0.0721, -0.0285, ..., -0.0106, -0.0980, -0.1095], [ 0.0252, 0.0650, -0.0957, ..., -0.0854, -0.0241, 0.0873], [ 0.0524, -0.0831, -0.0948, ..., 0.0005, -0.0420, -0.0432], ..., [ 0.0638, 0.0131, 0.1002, ..., 0.0542, 0.1369, 0.0016], [ 0.0860, -0.1232, -0.1025, ..., -0.1548, -0.0495, 0.1450], [-0.0787, 0.0772, 0.0238, ..., -0.1087, -0.0662, -0.0143]], device='cuda:0'), grad: tensor([[ 5.1782e-07, 1.0896e-07, 2.6543e-07, ..., 1.2387e-07, 3.4180e-07, 5.4110e-07], [ 4.2543e-06, -1.7695e-08, 5.0291e-07, ..., 2.9709e-07, 3.4813e-06, 3.3341e-06], [-3.6120e-05, 2.9244e-07, 6.6403e-07, ..., -2.1886e-06, -2.6926e-05, -3.1322e-05], ..., [ 8.9854e-06, 1.8068e-07, -1.0142e-06, ..., 2.8033e-07, 6.1654e-06, 9.1344e-06], [ 1.9535e-05, 1.1772e-06, 1.7863e-06, ..., 2.3991e-06, 1.5602e-05, 1.6227e-05], [ 9.8161e-07, 3.5428e-06, 6.1877e-06, ..., 3.0864e-06, 3.0268e-07, 1.5246e-06]], device='cuda:0') Epoch 156, bias, value: tensor([-0.0101, -0.0094, -0.0043, -0.0162, -0.0114, -0.0017, 0.0100, 0.0115, 0.0188, -0.0071], device='cuda:0'), grad: tensor([ 1.5255e-06, 9.4920e-06, -7.7426e-05, -1.7792e-05, 1.3951e-06, 1.3430e-06, 8.8941e-07, 1.9982e-05, 4.6879e-05, 1.3679e-05], device='cuda:0') 100 0.0001 changing lr epoch 155, time 214.74, cls_loss 0.0029 cls_loss_mapping 0.0049 cls_loss_causal 0.5499 re_mapping 0.0078 re_causal 0.0219 /// teacc 98.95 lr 0.00010000 Epoch 157, weight, value: tensor([[-0.1660, 0.0723, -0.0291, ..., -0.0109, -0.0985, -0.1100], [ 0.0258, 0.0673, -0.0957, ..., -0.0862, -0.0243, 0.0889], [ 0.0520, -0.0838, -0.0954, ..., 0.0002, -0.0429, -0.0435], ..., [ 0.0639, 0.0112, 0.1009, ..., 0.0550, 0.1379, 0.0002], [ 0.0865, -0.1223, -0.1029, ..., -0.1563, -0.0494, 0.1462], [-0.0797, 0.0772, 0.0237, ..., -0.1093, -0.0668, -0.0145]], device='cuda:0'), grad: tensor([[ 1.1642e-07, -2.1886e-07, 6.0629e-07, ..., 1.3597e-07, 8.4750e-08, 6.0536e-08], [ 9.7230e-07, 1.4165e-06, 1.8217e-06, ..., 6.2399e-07, 9.8348e-07, 8.0094e-08], [ 3.2783e-07, 1.4780e-06, 2.4140e-06, ..., 1.7229e-07, 8.7544e-08, 2.5332e-07], ..., [-2.1085e-06, 2.0992e-06, -2.8368e-06, ..., -1.0664e-06, -2.2929e-06, 8.1025e-07], [-9.8273e-06, 4.6417e-06, 4.1798e-06, ..., 2.1327e-07, 2.0023e-07, -4.5411e-06], [ 5.3644e-07, -1.6019e-05, -1.2830e-05, ..., 4.4424e-07, 3.7439e-07, -4.7404e-07]], device='cuda:0') Epoch 157, bias, value: tensor([-0.0100, -0.0086, -0.0048, -0.0166, -0.0113, -0.0022, 0.0100, 0.0114, 0.0197, -0.0073], device='cuda:0'), grad: tensor([-9.1270e-07, 5.6773e-06, 4.2170e-06, 2.0102e-05, -4.1164e-07, 1.2495e-05, 2.9519e-05, 1.7108e-06, -3.7432e-05, -3.4899e-05], device='cuda:0') 100 0.0001 changing lr epoch 156, time 214.65, cls_loss 0.0025 cls_loss_mapping 0.0063 cls_loss_causal 0.5276 re_mapping 0.0076 re_causal 0.0218 /// teacc 98.95 lr 0.00010000 Epoch 158, weight, value: tensor([[-1.6648e-01, 7.2482e-02, -2.9594e-02, ..., -1.1325e-02, -9.8867e-02, -1.1017e-01], [ 2.6267e-02, 6.8086e-02, -9.6103e-02, ..., -8.7093e-02, -2.4636e-02, 9.0681e-02], [ 5.1938e-02, -8.4105e-02, -9.6005e-02, ..., -5.4659e-06, -4.3686e-02, -4.3579e-02], ..., [ 6.4309e-02, 1.1079e-02, 1.0168e-01, ..., 5.5606e-02, 1.3898e-01, 1.5304e-04], [ 8.5047e-02, -1.2398e-01, -1.0392e-01, ..., -1.5903e-01, -4.9659e-02, 1.4417e-01], [-8.0093e-02, 7.7007e-02, 2.3471e-02, ..., -1.1011e-01, -6.7483e-02, -1.4686e-02]], device='cuda:0'), grad: tensor([[ 2.1979e-06, -3.3006e-06, 1.0058e-06, ..., -2.5984e-07, 1.6093e-06, 1.1362e-06], [ 1.0881e-03, -3.6601e-07, 7.7391e-04, ..., 3.1944e-07, 7.8917e-04, 5.6410e-04], [ 3.0160e-05, 2.3842e-07, 2.2665e-05, ..., 6.8173e-07, 2.1607e-05, 1.5482e-05], ..., [-1.1930e-03, 9.8627e-07, -8.4925e-04, ..., -1.8496e-06, -8.6498e-04, -6.1750e-04], [ 5.3197e-05, 3.3062e-07, 3.8654e-05, ..., 1.7975e-07, 3.9428e-05, 2.6479e-05], [ 9.1717e-06, 1.4156e-06, 5.8822e-06, ..., 4.5821e-07, 6.5006e-06, 5.2005e-06]], device='cuda:0') Epoch 158, bias, value: tensor([-0.0100, -0.0076, -0.0050, -0.0167, -0.0111, -0.0015, 0.0102, 0.0119, 0.0172, -0.0076], device='cuda:0'), grad: tensor([-2.5854e-06, 2.1343e-03, 5.8860e-05, 1.4573e-05, 9.2108e-07, 1.5022e-06, 2.0601e-06, -2.3365e-03, 1.0425e-04, 2.1592e-05], device='cuda:0') 100 0.0001 changing lr epoch 157, time 214.28, cls_loss 0.0040 cls_loss_mapping 0.0054 cls_loss_causal 0.5501 re_mapping 0.0078 re_causal 0.0209 /// teacc 98.94 lr 0.00010000 Epoch 159, weight, value: tensor([[-0.1674, 0.0713, -0.0301, ..., -0.0108, -0.0996, -0.1110], [ 0.0260, 0.0686, -0.0966, ..., -0.0876, -0.0250, 0.0918], [ 0.0522, -0.0849, -0.0959, ..., 0.0003, -0.0433, -0.0441], ..., [ 0.0647, 0.0103, 0.1021, ..., 0.0554, 0.1394, -0.0002], [ 0.0845, -0.1257, -0.1043, ..., -0.1599, -0.0499, 0.1435], [-0.0810, 0.0782, 0.0233, ..., -0.1116, -0.0680, -0.0146]], device='cuda:0'), grad: tensor([[ 2.6077e-08, -3.5614e-06, 2.9802e-08, ..., -8.1677e-07, 1.3970e-08, 1.0710e-07], [-5.5395e-06, 6.9216e-06, 4.0513e-07, ..., -3.2149e-06, -4.9248e-06, -2.5854e-06], [ 4.9472e-06, 4.9733e-07, 1.0524e-07, ..., 2.0918e-06, 5.0478e-06, 8.3596e-06], ..., [-1.0990e-06, 8.0932e-07, -2.6263e-06, ..., -1.8124e-06, -1.7295e-06, 8.5589e-07], [ 3.0734e-07, 1.2666e-06, 1.6987e-06, ..., 8.8848e-07, 4.5635e-08, 3.8836e-07], [ 8.0094e-08, 2.2426e-06, -1.9595e-06, ..., 3.6694e-07, 6.4261e-08, 6.5751e-07]], device='cuda:0') Epoch 159, bias, value: tensor([-0.0113, -0.0074, -0.0051, -0.0170, -0.0111, -0.0013, 0.0108, 0.0118, 0.0167, -0.0067], device='cuda:0'), grad: tensor([-6.6049e-06, -2.6412e-06, 1.8999e-05, 3.5912e-06, -2.5302e-05, 1.4091e-06, 2.0005e-06, -1.0058e-07, 5.8748e-06, 2.7884e-06], device='cuda:0') 100 0.0001 changing lr epoch 158, time 214.69, cls_loss 0.0027 cls_loss_mapping 0.0047 cls_loss_causal 0.5370 re_mapping 0.0075 re_causal 0.0218 /// teacc 98.90 lr 0.00010000 Epoch 160, weight, value: tensor([[-1.6793e-01, 7.1738e-02, -3.0766e-02, ..., -9.9862e-03, -9.9903e-02, -1.1138e-01], [ 2.5850e-02, 6.7588e-02, -9.7046e-02, ..., -8.8755e-02, -2.5160e-02, 9.1904e-02], [ 5.2039e-02, -8.4225e-02, -9.6477e-02, ..., 7.0035e-05, -4.3605e-02, -4.4739e-02], ..., [ 6.5010e-02, 1.0158e-02, 1.0269e-01, ..., 5.5871e-02, 1.3984e-01, -2.4710e-04], [ 8.4581e-02, -1.2606e-01, -1.0469e-01, ..., -1.6075e-01, -5.0027e-02, 1.4359e-01], [-8.1196e-02, 7.7011e-02, 2.3064e-02, ..., -1.1242e-01, -6.8750e-02, -1.5213e-02]], device='cuda:0'), grad: tensor([[ 3.9488e-07, -3.8557e-07, 6.6124e-08, ..., -1.2293e-07, 1.3784e-07, 2.8219e-07], [ 2.0657e-06, 3.3621e-07, 5.8487e-07, ..., 9.9279e-07, 2.5891e-07, 1.1455e-06], [-5.1335e-06, 1.0869e-06, 2.8405e-07, ..., -3.3937e-06, -6.0070e-07, 5.1502e-07], ..., [-1.8999e-07, 3.4366e-07, -1.1679e-06, ..., -4.9453e-07, -6.1002e-07, 2.3562e-07], [-2.8219e-06, 2.7008e-07, 1.1176e-08, ..., 1.8980e-06, 4.3213e-07, -5.8301e-06], [ 7.0035e-07, 3.0901e-06, 7.1712e-08, ..., 5.5507e-07, 7.4506e-08, 1.5385e-06]], device='cuda:0') Epoch 160, bias, value: tensor([-0.0109, -0.0084, -0.0044, -0.0159, -0.0095, -0.0022, 0.0101, 0.0120, 0.0166, -0.0077], device='cuda:0'), grad: tensor([ 2.6226e-06, 8.1882e-06, -1.8135e-05, 4.5337e-06, -4.9695e-06, 2.5764e-05, -2.3544e-05, 8.8941e-07, -5.2452e-06, 9.7901e-06], device='cuda:0') 100 0.0001 changing lr epoch 159, time 214.89, cls_loss 0.0031 cls_loss_mapping 0.0047 cls_loss_causal 0.5505 re_mapping 0.0074 re_causal 0.0207 /// teacc 98.97 lr 0.00010000 Epoch 161, weight, value: tensor([[-0.1689, 0.0718, -0.0316, ..., -0.0105, -0.1006, -0.1120], [ 0.0271, 0.0698, -0.0970, ..., -0.0893, -0.0240, 0.0928], [ 0.0519, -0.0845, -0.0966, ..., 0.0002, -0.0438, -0.0453], ..., [ 0.0644, 0.0081, 0.1031, ..., 0.0562, 0.1394, -0.0013], [ 0.0847, -0.1267, -0.1051, ..., -0.1616, -0.0502, 0.1437], [-0.0821, 0.0749, 0.0219, ..., -0.1142, -0.0704, -0.0163]], device='cuda:0'), grad: tensor([[ 3.7346e-07, -5.1409e-07, 2.7511e-06, ..., 2.0042e-06, 4.8522e-07, 1.3039e-08], [ 1.8431e-06, 3.8557e-06, 4.8019e-06, ..., 1.8664e-06, 2.5872e-06, -3.0268e-07], [-1.6764e-07, 3.1013e-06, 5.3309e-06, ..., 3.4552e-06, -6.4261e-08, 7.7300e-08], ..., [-3.9749e-06, -4.1425e-06, -4.8988e-06, ..., -4.5542e-07, -5.4687e-06, 1.8533e-07], [ 4.3958e-07, 3.0845e-06, 3.1646e-06, ..., 1.5013e-06, 7.0781e-08, 2.5611e-07], [ 1.0384e-06, -3.5428e-06, -4.1910e-06, ..., 8.4285e-07, 1.4175e-06, -8.1025e-08]], device='cuda:0') Epoch 161, bias, value: tensor([-0.0110, -0.0071, -0.0046, -0.0162, -0.0070, -0.0015, 0.0092, 0.0112, 0.0162, -0.0098], device='cuda:0'), grad: tensor([ 2.9616e-06, 1.1161e-05, 1.1846e-05, -2.7776e-05, 1.1474e-05, -4.2059e-06, -4.6864e-06, -8.6650e-06, 1.3985e-05, -6.0946e-06], device='cuda:0') 100 0.0001 changing lr epoch 160, time 214.56, cls_loss 0.0027 cls_loss_mapping 0.0045 cls_loss_causal 0.5481 re_mapping 0.0073 re_causal 0.0204 /// teacc 98.85 lr 0.00010000 Epoch 162, weight, value: tensor([[-0.1700, 0.0718, -0.0323, ..., -0.0107, -0.1011, -0.1126], [ 0.0267, 0.0700, -0.0973, ..., -0.0898, -0.0246, 0.0930], [ 0.0517, -0.0853, -0.0970, ..., 0.0004, -0.0442, -0.0463], ..., [ 0.0652, 0.0079, 0.1036, ..., 0.0564, 0.1403, -0.0009], [ 0.0845, -0.1272, -0.1056, ..., -0.1622, -0.0507, 0.1437], [-0.0826, 0.0749, 0.0223, ..., -0.1146, -0.0706, -0.0165]], device='cuda:0'), grad: tensor([[ 4.3120e-07, 1.2284e-06, 2.3469e-07, ..., 1.5246e-06, 5.5693e-07, 4.6566e-09], [ 8.1304e-07, 4.8429e-08, 6.5193e-07, ..., 9.4716e-07, 1.1632e-06, -1.1083e-07], [-5.6438e-06, 9.1270e-08, -2.7828e-06, ..., -6.0089e-06, -8.4415e-06, 3.6322e-08], ..., [ 1.0980e-06, 1.4715e-07, 4.5914e-07, ..., 1.2573e-06, 1.6931e-06, 6.0536e-08], [ 1.1511e-06, 4.8149e-07, 7.1060e-07, ..., 1.2293e-06, 1.5385e-06, -3.2596e-08], [ 1.3784e-07, 1.3877e-07, 1.6205e-07, ..., 3.4552e-07, 1.6950e-07, 1.4901e-08]], device='cuda:0') Epoch 162, bias, value: tensor([-0.0111, -0.0071, -0.0050, -0.0165, -0.0069, -0.0015, 0.0092, 0.0118, 0.0160, -0.0098], device='cuda:0'), grad: tensor([ 7.6964e-06, 3.5949e-06, -2.4378e-05, 1.1265e-05, 4.6287e-07, -1.0721e-05, -3.6769e-06, 5.7034e-06, 8.7619e-06, 1.2759e-06], device='cuda:0') 100 0.0001 changing lr epoch 161, time 214.67, cls_loss 0.0016 cls_loss_mapping 0.0037 cls_loss_causal 0.5458 re_mapping 0.0069 re_causal 0.0213 /// teacc 98.97 lr 0.00010000 Epoch 163, weight, value: tensor([[-0.1703, 0.0726, -0.0325, ..., -0.0106, -0.1013, -0.1130], [ 0.0265, 0.0697, -0.0977, ..., -0.0902, -0.0248, 0.0929], [ 0.0518, -0.0857, -0.0970, ..., 0.0005, -0.0440, -0.0464], ..., [ 0.0653, 0.0078, 0.1038, ..., 0.0564, 0.1405, -0.0009], [ 0.0845, -0.1277, -0.1060, ..., -0.1626, -0.0508, 0.1438], [-0.0827, 0.0748, 0.0231, ..., -0.1150, -0.0706, -0.0165]], device='cuda:0'), grad: tensor([[ 3.1386e-07, 1.0617e-07, 2.7288e-07, ..., 2.7657e-05, 3.3528e-07, 5.3458e-07], [-1.1347e-05, -5.3383e-06, -5.7071e-06, ..., 6.0722e-07, -1.2577e-05, -2.8253e-05], [ 1.7164e-06, 8.8662e-07, 2.2743e-06, ..., 3.0696e-05, 1.5926e-06, 5.9307e-06], ..., [ 7.6294e-06, 4.1276e-06, 2.7604e-06, ..., -3.5111e-07, 8.0839e-06, 1.9908e-05], [-1.0990e-07, 5.6718e-07, 7.5344e-07, ..., 2.2631e-06, 3.6322e-07, -3.6880e-07], [ 7.2550e-07, -3.1199e-07, -7.0315e-07, ..., 6.1281e-07, 5.4389e-07, 9.0804e-07]], device='cuda:0') Epoch 163, bias, value: tensor([-0.0104, -0.0074, -0.0048, -0.0167, -0.0066, -0.0012, 0.0084, 0.0117, 0.0157, -0.0099], device='cuda:0'), grad: tensor([ 4.6164e-05, -4.8578e-05, 5.9366e-05, -1.1116e-04, 1.7779e-06, 8.5160e-06, 4.4592e-06, 3.4839e-05, 4.2357e-06, 4.6566e-07], device='cuda:0') 100 0.0001 changing lr epoch 162, time 214.83, cls_loss 0.0031 cls_loss_mapping 0.0052 cls_loss_causal 0.5354 re_mapping 0.0069 re_causal 0.0201 /// teacc 98.75 lr 0.00010000 Epoch 164, weight, value: tensor([[-0.1708, 0.0727, -0.0332, ..., -0.0109, -0.1018, -0.1138], [ 0.0268, 0.0696, -0.0964, ..., -0.0897, -0.0246, 0.0932], [ 0.0519, -0.0862, -0.0971, ..., 0.0005, -0.0438, -0.0467], ..., [ 0.0651, 0.0075, 0.1028, ..., 0.0559, 0.1403, -0.0013], [ 0.0847, -0.1284, -0.1063, ..., -0.1632, -0.0509, 0.1440], [-0.0855, 0.0737, 0.0213, ..., -0.1164, -0.0713, -0.0175]], device='cuda:0'), grad: tensor([[ 3.7905e-07, -3.9022e-07, 4.9733e-07, ..., 2.0117e-07, 2.8126e-07, 4.5169e-07], [ 9.2462e-06, 2.0899e-06, 6.6049e-06, ..., 4.1388e-06, 6.7018e-06, -2.7213e-06], [ 1.5222e-05, 2.2426e-06, 9.8273e-06, ..., 7.0818e-06, 1.2025e-05, 6.4224e-06], ..., [-6.4969e-05, 2.5537e-06, -3.8832e-05, ..., -2.0102e-05, -4.1068e-05, -1.5959e-05], [ 2.8253e-05, 1.1725e-06, 1.5885e-05, ..., 5.4464e-06, 1.3746e-05, 1.0602e-05], [ 1.4529e-06, -2.8536e-06, -6.7391e-06, ..., 3.6601e-07, 7.3668e-07, 4.8317e-06]], device='cuda:0') Epoch 164, bias, value: tensor([-0.0107, -0.0069, -0.0048, -0.0160, -0.0046, -0.0015, 0.0099, 0.0110, 0.0154, -0.0118], device='cuda:0'), grad: tensor([ 3.8091e-07, 1.3463e-05, 2.8774e-05, -8.8364e-06, -5.1349e-05, 3.5167e-05, 5.4836e-06, -7.3791e-05, 3.6240e-05, 1.4424e-05], device='cuda:0') 100 0.0001 changing lr epoch 163, time 214.71, cls_loss 0.0027 cls_loss_mapping 0.0039 cls_loss_causal 0.5417 re_mapping 0.0070 re_causal 0.0199 /// teacc 98.99 lr 0.00010000 Epoch 165, weight, value: tensor([[-0.1718, 0.0729, -0.0340, ..., -0.0111, -0.1025, -0.1147], [ 0.0261, 0.0694, -0.0969, ..., -0.0903, -0.0249, 0.0926], [ 0.0518, -0.0867, -0.0974, ..., 0.0004, -0.0438, -0.0471], ..., [ 0.0655, 0.0068, 0.1029, ..., 0.0564, 0.1408, -0.0012], [ 0.0849, -0.1289, -0.1068, ..., -0.1639, -0.0511, 0.1443], [-0.0851, 0.0735, 0.0229, ..., -0.1169, -0.0706, -0.0178]], device='cuda:0'), grad: tensor([[ 3.4412e-07, -2.7288e-07, 4.1258e-07, ..., 1.1316e-07, 5.1688e-08, 2.1886e-08], [ 4.4750e-07, 3.3248e-07, 9.3877e-07, ..., 3.0082e-07, 2.4028e-07, 5.8673e-08], [-5.1921e-07, 9.9652e-08, 2.9569e-07, ..., -7.1665e-07, -1.6764e-07, 6.2399e-08], ..., [-5.1819e-06, -6.3702e-07, -8.3521e-06, ..., -2.6133e-06, -2.6617e-06, 1.3504e-08], [ 1.4529e-07, 4.2887e-07, 4.6939e-07, ..., 3.9395e-07, 2.3004e-07, -7.2224e-07], [ 1.8943e-06, -2.3305e-05, -1.2815e-05, ..., 4.4750e-07, 1.2703e-06, 9.5926e-08]], device='cuda:0') Epoch 165, bias, value: tensor([-0.0106, -0.0078, -0.0048, -0.0160, -0.0043, -0.0012, 0.0114, 0.0109, 0.0151, -0.0117], device='cuda:0'), grad: tensor([-7.9162e-09, 1.4910e-06, -2.0191e-06, 2.3507e-06, 1.5058e-05, 3.1888e-05, -1.0105e-07, -9.6560e-06, 1.5413e-06, -4.0561e-05], device='cuda:0') 100 0.0001 changing lr epoch 164, time 214.55, cls_loss 0.0034 cls_loss_mapping 0.0042 cls_loss_causal 0.5508 re_mapping 0.0072 re_causal 0.0200 /// teacc 98.96 lr 0.00010000 Epoch 166, weight, value: tensor([[-0.1725, 0.0726, -0.0349, ..., -0.0112, -0.1028, -0.1151], [ 0.0256, 0.0689, -0.0975, ..., -0.0909, -0.0253, 0.0920], [ 0.0519, -0.0871, -0.0976, ..., 0.0004, -0.0437, -0.0475], ..., [ 0.0659, 0.0060, 0.1030, ..., 0.0566, 0.1413, -0.0010], [ 0.0855, -0.1290, -0.1076, ..., -0.1645, -0.0512, 0.1449], [-0.0850, 0.0726, 0.0209, ..., -0.1190, -0.0713, -0.0173]], device='cuda:0'), grad: tensor([[-2.3488e-06, -7.7114e-06, -8.5682e-08, ..., -2.6776e-07, -7.4785e-07, 2.6543e-08], [ 2.5518e-07, 5.0012e-07, 8.5216e-08, ..., 4.7497e-08, 1.6158e-07, -1.1595e-06], [-1.4948e-07, 3.3714e-07, 2.2817e-08, ..., 1.0710e-08, -1.9791e-07, 3.3062e-08], ..., [ 8.2422e-08, 3.8510e-07, -9.3132e-10, ..., 4.8894e-08, 8.2422e-08, 7.7765e-08], [ 1.8710e-06, 5.9754e-06, 6.7428e-07, ..., 1.1362e-07, 5.6392e-07, 3.0920e-07], [ 3.3528e-08, -4.8503e-06, -5.0962e-06, ..., 6.4261e-08, 4.7497e-08, -7.1106e-07]], device='cuda:0') Epoch 166, bias, value: tensor([-0.0109, -0.0089, -0.0048, -0.0162, -0.0026, -0.0011, 0.0126, 0.0109, 0.0155, -0.0128], device='cuda:0'), grad: tensor([-2.3246e-05, 1.3644e-07, 7.0455e-07, 1.9781e-06, 1.4499e-05, -3.0547e-07, 1.6149e-06, 1.3402e-06, 1.8686e-05, -1.5408e-05], device='cuda:0') 100 0.0001 changing lr epoch 165, time 214.69, cls_loss 0.0024 cls_loss_mapping 0.0037 cls_loss_causal 0.5292 re_mapping 0.0069 re_causal 0.0206 /// teacc 99.05 lr 0.00010000 Epoch 167, weight, value: tensor([[-0.1730, 0.0728, -0.0355, ..., -0.0113, -0.1031, -0.1156], [ 0.0244, 0.0687, -0.0987, ..., -0.0922, -0.0269, 0.0918], [ 0.0518, -0.0874, -0.0978, ..., 0.0005, -0.0442, -0.0478], ..., [ 0.0667, 0.0059, 0.1036, ..., 0.0570, 0.1428, -0.0012], [ 0.0852, -0.1295, -0.1080, ..., -0.1652, -0.0512, 0.1447], [-0.0847, 0.0728, 0.0214, ..., -0.1196, -0.0711, -0.0175]], device='cuda:0'), grad: tensor([[ 8.4285e-08, -7.3910e-06, 1.0571e-07, ..., 8.1025e-08, 2.4680e-08, 3.7253e-09], [ 1.5087e-07, 2.6077e-08, 2.7614e-07, ..., 1.3225e-07, 1.3551e-07, -3.3993e-08], [-2.6077e-08, 1.9139e-07, 3.0827e-07, ..., 3.5856e-08, -3.9581e-08, 1.7695e-08], ..., [-7.2690e-07, 6.9849e-07, -1.2843e-06, ..., -4.0419e-07, -6.0862e-07, 1.4435e-08], [ 2.1886e-08, 2.0601e-06, 1.1548e-07, ..., 8.1956e-08, 2.7474e-08, -1.5646e-07], [ 1.4435e-07, 9.0105e-07, -3.0128e-07, ..., 1.1828e-07, 1.1548e-07, 4.3772e-08]], device='cuda:0') Epoch 167, bias, value: tensor([-0.0105, -0.0099, -0.0050, -0.0157, -0.0029, -0.0016, 0.0142, 0.0113, 0.0148, -0.0124], device='cuda:0'), grad: tensor([-2.7344e-05, 4.0093e-07, 3.8929e-07, 1.0859e-06, 1.0543e-06, 1.7077e-05, -5.8115e-06, 1.6438e-06, 8.8513e-06, 2.6915e-06], device='cuda:0') 100 0.0001 changing lr epoch 166, time 214.81, cls_loss 0.0019 cls_loss_mapping 0.0043 cls_loss_causal 0.5338 re_mapping 0.0071 re_causal 0.0206 /// teacc 98.88 lr 0.00010000 Epoch 168, weight, value: tensor([[-0.1739, 0.0729, -0.0362, ..., -0.0115, -0.1036, -0.1160], [ 0.0240, 0.0683, -0.0992, ..., -0.0927, -0.0271, 0.0915], [ 0.0518, -0.0880, -0.0981, ..., 0.0005, -0.0442, -0.0479], ..., [ 0.0671, 0.0059, 0.1046, ..., 0.0575, 0.1434, -0.0013], [ 0.0859, -0.1303, -0.1085, ..., -0.1660, -0.0512, 0.1454], [-0.0851, 0.0734, 0.0218, ..., -0.1204, -0.0717, -0.0174]], device='cuda:0'), grad: tensor([[ 5.9232e-07, -3.0417e-06, 1.3458e-07, ..., 9.1270e-08, 3.0268e-08, 1.1697e-06], [-8.4639e-06, -4.9397e-06, 3.5092e-06, ..., 2.3004e-06, 1.2107e-06, -1.4305e-05], [ 3.4105e-06, 2.0117e-06, 5.5460e-07, ..., 2.9802e-07, 2.8405e-08, 5.5879e-06], ..., [-7.9945e-06, 1.2061e-07, -1.4871e-05, ..., -9.6560e-06, -5.3532e-06, -3.2745e-06], [ 4.2580e-06, 2.4531e-06, 1.1018e-06, ..., 1.1288e-06, 2.8964e-07, 2.1979e-05], [ 7.2690e-07, 6.1095e-07, -1.0012e-07, ..., 2.1607e-07, 4.2841e-08, 1.0533e-06]], device='cuda:0') Epoch 168, bias, value: tensor([-0.0105, -0.0103, -0.0050, -0.0159, -0.0032, -0.0020, 0.0141, 0.0117, 0.0147, -0.0118], device='cuda:0'), grad: tensor([-1.3560e-06, -3.1769e-05, 1.5900e-05, 1.2524e-05, 2.1700e-06, 3.8117e-05, -1.7679e-04, -1.6525e-05, 1.5342e-04, 3.9153e-06], device='cuda:0') 100 0.0001 changing lr epoch 167, time 214.94, cls_loss 0.0025 cls_loss_mapping 0.0044 cls_loss_causal 0.5463 re_mapping 0.0064 re_causal 0.0198 /// teacc 98.84 lr 0.00010000 Epoch 169, weight, value: tensor([[-0.1753, 0.0741, -0.0375, ..., -0.0118, -0.1045, -0.1167], [ 0.0249, 0.0700, -0.0989, ..., -0.0930, -0.0263, 0.0927], [ 0.0522, -0.0885, -0.0980, ..., 0.0008, -0.0438, -0.0483], ..., [ 0.0663, 0.0048, 0.1042, ..., 0.0574, 0.1427, -0.0026], [ 0.0858, -0.1318, -0.1089, ..., -0.1669, -0.0514, 0.1453], [-0.0848, 0.0734, 0.0223, ..., -0.1213, -0.0720, -0.0174]], device='cuda:0'), grad: tensor([[ 1.5413e-07, -8.2748e-07, 3.2596e-07, ..., 7.9162e-08, 1.8161e-07, 3.6787e-08], [ 2.7642e-06, 7.6881e-07, 4.5225e-06, ..., 7.6788e-07, 3.1609e-06, 4.7497e-08], [ 2.3842e-07, 4.6799e-07, 7.3249e-07, ..., 5.1223e-08, 2.3469e-07, 8.7079e-08], ..., [-4.0941e-06, -4.1677e-07, -6.4597e-06, ..., -1.2564e-06, -4.7609e-06, 3.1712e-07], [-8.3167e-07, 3.6368e-07, 2.3935e-07, ..., 3.7719e-08, 6.2864e-08, -1.1222e-06], [ 5.5367e-07, -4.1276e-06, -2.9355e-06, ..., 1.4622e-07, 6.1234e-07, 1.3234e-06]], device='cuda:0') Epoch 169, bias, value: tensor([-0.0095, -0.0094, -0.0045, -0.0159, -0.0035, -0.0024, 0.0140, 0.0107, 0.0141, -0.0116], device='cuda:0'), grad: tensor([ 5.3830e-07, 6.8285e-06, 1.4044e-06, 8.1435e-06, -2.2575e-06, 2.9914e-06, -1.8850e-06, -7.8529e-06, -9.6764e-07, -6.9924e-06], device='cuda:0') 100 0.0001 changing lr epoch 168, time 215.11, cls_loss 0.0024 cls_loss_mapping 0.0043 cls_loss_causal 0.5554 re_mapping 0.0068 re_causal 0.0199 /// teacc 98.97 lr 0.00010000 Epoch 170, weight, value: tensor([[-0.1766, 0.0753, -0.0390, ..., -0.0119, -0.1056, -0.1178], [ 0.0248, 0.0700, -0.0991, ..., -0.0933, -0.0264, 0.0929], [ 0.0520, -0.0886, -0.0983, ..., 0.0007, -0.0438, -0.0492], ..., [ 0.0659, 0.0047, 0.1039, ..., 0.0576, 0.1426, -0.0035], [ 0.0881, -0.1319, -0.1066, ..., -0.1671, -0.0497, 0.1467], [-0.0855, 0.0734, 0.0225, ..., -0.1225, -0.0723, -0.0177]], device='cuda:0'), grad: tensor([[ 2.3656e-06, 1.3597e-07, 4.4424e-07, ..., 9.3132e-09, 4.6566e-09, 3.3490e-06], [ 3.5092e-06, 6.7055e-08, 3.8929e-07, ..., 3.6322e-08, 3.3528e-08, 5.5917e-06], [ 3.4943e-06, 1.2014e-07, 3.2224e-07, ..., 4.0047e-08, 6.7055e-08, 6.2212e-06], ..., [ 5.9605e-07, 8.4750e-08, -8.9407e-08, ..., -5.9605e-08, -1.2480e-07, 1.2843e-06], [-3.1173e-05, -1.8636e-06, -4.3362e-06, ..., 2.4214e-08, -3.3528e-08, -4.7654e-05], [ 6.3609e-07, 9.3039e-07, -7.4506e-09, ..., 3.9116e-08, 5.5879e-09, 1.1725e-06]], device='cuda:0') Epoch 170, bias, value: tensor([-0.0086, -0.0098, -0.0042, -0.0148, -0.0037, -0.0043, 0.0139, 0.0103, 0.0160, -0.0118], device='cuda:0'), grad: tensor([ 9.6038e-06, 1.4924e-05, 1.5721e-05, 2.8446e-05, 2.5202e-06, 3.4094e-05, 1.7479e-05, 3.1106e-06, -1.3018e-04, 4.1872e-06], device='cuda:0') 100 0.0001 changing lr epoch 169, time 214.91, cls_loss 0.0028 cls_loss_mapping 0.0045 cls_loss_causal 0.5394 re_mapping 0.0067 re_causal 0.0200 /// teacc 98.99 lr 0.00010000 Epoch 171, weight, value: tensor([[-0.1777, 0.0755, -0.0397, ..., -0.0119, -0.1061, -0.1191], [ 0.0243, 0.0692, -0.0992, ..., -0.0935, -0.0265, 0.0916], [ 0.0519, -0.0889, -0.0986, ..., 0.0006, -0.0438, -0.0498], ..., [ 0.0660, 0.0043, 0.1041, ..., 0.0577, 0.1428, -0.0036], [ 0.0889, -0.1319, -0.1070, ..., -0.1679, -0.0497, 0.1475], [-0.0864, 0.0736, 0.0229, ..., -0.1229, -0.0724, -0.0181]], device='cuda:0'), grad: tensor([[ 5.4389e-07, -5.7332e-06, 2.0582e-07, ..., 1.8161e-07, 4.7404e-07, 1.3402e-06], [-2.3767e-06, -4.3306e-07, 1.0263e-06, ..., 9.8534e-07, -7.3873e-06, -1.6093e-05], [ 1.0557e-05, 6.7987e-08, 2.0280e-05, ..., -1.6065e-06, 8.5980e-06, 6.5006e-06], ..., [-2.5019e-05, 1.6857e-07, -3.1412e-05, ..., -5.0776e-06, -1.1683e-05, 4.1388e-06], [ 6.7651e-06, 4.6100e-07, 3.7961e-06, ..., 2.6394e-06, 3.4831e-06, 1.3951e-06], [ 5.0291e-08, -1.5367e-06, -2.0489e-06, ..., 4.7497e-08, 2.0862e-07, 5.2899e-07]], device='cuda:0') Epoch 171, bias, value: tensor([-0.0087, -0.0120, -0.0043, -0.0145, -0.0041, -0.0050, 0.0164, 0.0103, 0.0171, -0.0117], device='cuda:0'), grad: tensor([-1.7628e-05, -4.3720e-05, 1.8775e-05, 1.0528e-05, -5.1372e-06, 3.2056e-06, 3.0085e-05, -1.0841e-05, 1.6063e-05, -1.3271e-06], device='cuda:0') 100 0.0001 changing lr epoch 170, time 214.56, cls_loss 0.0025 cls_loss_mapping 0.0031 cls_loss_causal 0.5325 re_mapping 0.0065 re_causal 0.0189 /// teacc 98.98 lr 0.00010000 Epoch 172, weight, value: tensor([[-0.1794, 0.0758, -0.0415, ..., -0.0123, -0.1073, -0.1196], [ 0.0239, 0.0691, -0.1002, ..., -0.0942, -0.0270, 0.0917], [ 0.0521, -0.0899, -0.0989, ..., 0.0008, -0.0442, -0.0501], ..., [ 0.0665, 0.0043, 0.1048, ..., 0.0579, 0.1436, -0.0035], [ 0.0888, -0.1326, -0.1076, ..., -0.1694, -0.0499, 0.1475], [-0.0867, 0.0740, 0.0234, ..., -0.1234, -0.0726, -0.0182]], device='cuda:0'), grad: tensor([[ 6.8918e-08, -8.8196e-07, 7.7300e-08, ..., 3.4459e-08, 4.2841e-08, 4.8429e-08], [ 2.6822e-07, -8.3819e-07, 8.2236e-07, ..., 3.1851e-07, 2.2072e-07, -1.6931e-06], [-9.7789e-08, 1.5367e-07, 2.1793e-07, ..., 4.3772e-08, 2.2352e-08, 9.7789e-08], ..., [-1.2256e-06, 4.0792e-07, -1.5218e-06, ..., -4.9546e-07, -1.1977e-06, 6.0443e-07], [ 2.1327e-07, 4.9826e-07, 3.9022e-07, ..., 2.2352e-07, 1.4622e-07, 8.4750e-08], [ 3.9022e-07, 1.2191e-06, -6.1933e-07, ..., 1.8533e-07, 3.3248e-07, 1.0459e-06]], device='cuda:0') Epoch 172, bias, value: tensor([-0.0085, -0.0121, -0.0038, -0.0155, -0.0045, -0.0040, 0.0162, 0.0105, 0.0166, -0.0114], device='cuda:0'), grad: tensor([-1.1846e-06, -2.1029e-06, 2.3562e-07, 1.5078e-06, -1.1539e-06, -2.5183e-06, 9.3132e-07, -4.1816e-07, 1.5525e-06, 3.1460e-06], device='cuda:0') 100 0.0001 changing lr epoch 171, time 214.84, cls_loss 0.0025 cls_loss_mapping 0.0040 cls_loss_causal 0.5421 re_mapping 0.0069 re_causal 0.0195 /// teacc 98.94 lr 0.00010000 Epoch 173, weight, value: tensor([[-0.1826, 0.0755, -0.0431, ..., -0.0126, -0.1083, -0.1201], [ 0.0239, 0.0692, -0.1002, ..., -0.0945, -0.0270, 0.0920], [ 0.0524, -0.0905, -0.0993, ..., 0.0008, -0.0442, -0.0500], ..., [ 0.0668, 0.0042, 0.1054, ..., 0.0584, 0.1440, -0.0037], [ 0.0884, -0.1335, -0.1081, ..., -0.1711, -0.0502, 0.1477], [-0.0873, 0.0742, 0.0234, ..., -0.1245, -0.0733, -0.0187]], device='cuda:0'), grad: tensor([[ 2.4401e-07, 2.1048e-07, 2.2911e-07, ..., 3.7253e-08, 7.6368e-08, 2.4214e-08], [ 1.1221e-05, 3.7160e-07, 5.8040e-06, ..., 2.5257e-06, 6.7912e-06, 2.7958e-06], [-1.4529e-07, 6.9756e-07, 9.5833e-07, ..., 2.9057e-07, 4.4610e-07, 6.3330e-08], ..., [-2.0087e-05, 1.0803e-07, -1.0081e-05, ..., -4.6864e-06, -1.2301e-05, -4.9546e-06], [ 5.2489e-06, 1.1604e-06, 3.1516e-06, ..., 1.1344e-06, 3.0827e-06, 1.1306e-06], [ 1.3178e-06, -5.8487e-06, -2.8573e-06, ..., 3.3714e-07, 7.4226e-07, 3.4552e-07]], device='cuda:0') Epoch 173, bias, value: tensor([-0.0086, -0.0120, -0.0035, -0.0140, -0.0044, -0.0053, 0.0160, 0.0107, 0.0162, -0.0115], device='cuda:0'), grad: tensor([ 1.0896e-06, 1.7405e-05, 1.5181e-07, 7.5437e-06, 2.1346e-06, 1.1083e-06, 7.8231e-08, -2.9013e-05, 1.0766e-05, -1.1310e-05], device='cuda:0') 100 0.0001 changing lr epoch 172, time 214.96, cls_loss 0.0027 cls_loss_mapping 0.0040 cls_loss_causal 0.5096 re_mapping 0.0065 re_causal 0.0190 /// teacc 98.95 lr 0.00010000 Epoch 174, weight, value: tensor([[-0.1850, 0.0757, -0.0439, ..., -0.0132, -0.1091, -0.1208], [ 0.0230, 0.0690, -0.1012, ..., -0.0953, -0.0283, 0.0921], [ 0.0514, -0.0935, -0.1001, ..., 0.0004, -0.0460, -0.0502], ..., [ 0.0682, 0.0044, 0.1065, ..., 0.0590, 0.1460, -0.0040], [ 0.0886, -0.1336, -0.1083, ..., -0.1716, -0.0504, 0.1482], [-0.0876, 0.0746, 0.0239, ..., -0.1250, -0.0730, -0.0180]], device='cuda:0'), grad: tensor([[ 5.5879e-08, 2.3283e-08, 6.4261e-08, ..., 3.7253e-08, 3.6322e-08, 1.2107e-08], [ 4.0513e-07, -6.0536e-08, 3.7905e-07, ..., 1.3411e-07, 3.7346e-07, -2.8405e-07], [-1.0990e-07, 3.6322e-08, 8.6613e-08, ..., -6.8918e-08, -4.7497e-08, 5.2154e-08], ..., [-7.5065e-07, 8.3819e-08, -6.2399e-07, ..., -1.6112e-07, -6.8545e-07, 7.4506e-08], [-5.6811e-08, 1.1735e-07, 1.2293e-07, ..., 4.2841e-08, 6.4261e-08, -1.0803e-07], [ 1.9465e-07, -5.4389e-07, -7.1712e-07, ..., 3.9116e-08, 1.3039e-07, 8.9407e-08]], device='cuda:0') Epoch 174, bias, value: tensor([-0.0086, -0.0126, -0.0052, -0.0142, -0.0048, -0.0042, 0.0156, 0.0119, 0.0162, -0.0111], device='cuda:0'), grad: tensor([ 3.5390e-07, 3.1013e-07, -1.3690e-07, 1.0338e-07, 1.0589e-06, 2.9709e-07, -4.5914e-07, -7.2550e-07, 2.1234e-07, -1.0319e-06], device='cuda:0') 100 0.0001 changing lr epoch 173, time 214.69, cls_loss 0.0027 cls_loss_mapping 0.0046 cls_loss_causal 0.5881 re_mapping 0.0067 re_causal 0.0209 /// teacc 98.85 lr 0.00010000 Epoch 175, weight, value: tensor([[-0.1859, 0.0753, -0.0448, ..., -0.0125, -0.1097, -0.1212], [ 0.0218, 0.0694, -0.1030, ..., -0.0957, -0.0296, 0.0915], [ 0.0522, -0.0938, -0.1002, ..., 0.0010, -0.0458, -0.0505], ..., [ 0.0692, 0.0040, 0.1080, ..., 0.0593, 0.1471, -0.0029], [ 0.0877, -0.1342, -0.1091, ..., -0.1750, -0.0508, 0.1483], [-0.0892, 0.0746, 0.0239, ..., -0.1268, -0.0745, -0.0183]], device='cuda:0'), grad: tensor([[ 8.3819e-08, -4.0978e-08, 1.6857e-07, ..., 1.3970e-07, 1.9558e-08, 8.4750e-08], [-1.5553e-07, -6.6403e-07, 8.2795e-07, ..., 7.6368e-07, -2.3190e-07, -1.9800e-06], [ 1.9465e-07, 1.7136e-07, 5.3085e-07, ..., 4.7311e-07, -8.2888e-08, 2.8592e-07], ..., [ 3.7253e-07, 6.6683e-07, 1.8012e-06, ..., 2.9244e-07, 8.7917e-07, 3.5949e-07], [-2.7008e-06, 1.7136e-07, 4.1630e-07, ..., 3.8184e-07, -2.0768e-07, -3.4459e-06], [ 1.7956e-06, -8.7079e-07, -2.3339e-06, ..., 8.1025e-08, -8.0466e-07, 2.4494e-06]], device='cuda:0') Epoch 175, bias, value: tensor([-0.0089, -0.0133, -0.0044, -0.0143, -0.0054, -0.0019, 0.0141, 0.0128, 0.0155, -0.0114], device='cuda:0'), grad: tensor([ 8.5961e-07, -8.0839e-07, 1.5246e-06, -2.0772e-05, 2.7753e-06, 1.4849e-05, 2.0117e-06, 4.1388e-06, -5.2378e-06, 5.8115e-07], device='cuda:0') 100 0.0001 changing lr epoch 174, time 214.72, cls_loss 0.0032 cls_loss_mapping 0.0062 cls_loss_causal 0.5662 re_mapping 0.0068 re_causal 0.0188 /// teacc 98.93 lr 0.00010000 Epoch 176, weight, value: tensor([[-0.1881, 0.0754, -0.0467, ..., -0.0133, -0.1108, -0.1236], [ 0.0214, 0.0695, -0.1036, ..., -0.0971, -0.0303, 0.0920], [ 0.0520, -0.0933, -0.1007, ..., 0.0013, -0.0458, -0.0537], ..., [ 0.0700, 0.0039, 0.1091, ..., 0.0602, 0.1480, -0.0024], [ 0.0901, -0.1363, -0.1097, ..., -0.1761, -0.0513, 0.1506], [-0.0900, 0.0750, 0.0239, ..., -0.1287, -0.0756, -0.0186]], device='cuda:0'), grad: tensor([[ 4.6007e-06, -3.5856e-07, 4.0699e-07, ..., -1.3039e-08, 1.2107e-08, 7.9796e-06], [ 5.7554e-04, 5.4296e-07, 7.2736e-07, ..., 1.7602e-07, -1.4994e-07, 9.8801e-04], [ 1.4789e-05, 1.9465e-07, 8.3819e-08, ..., -2.9430e-07, -1.1083e-07, 2.6003e-05], ..., [ 1.1168e-05, 1.3724e-05, 2.4348e-05, ..., 5.5879e-09, 7.8231e-08, 2.0429e-05], [-6.3753e-04, 1.7677e-06, 1.0543e-06, ..., 8.7544e-08, 6.7055e-08, -1.0939e-03], [ 1.3569e-06, -1.7929e-03, -3.5262e-04, ..., 4.9360e-08, 2.3283e-08, -1.5087e-03]], device='cuda:0') Epoch 176, bias, value: tensor([-0.0091, -0.0137, -0.0036, -0.0146, -0.0056, -0.0019, 0.0119, 0.0134, 0.0180, -0.0113], device='cuda:0'), grad: tensor([ 1.1511e-05, 1.5020e-03, 3.9220e-05, 2.5406e-05, 5.2719e-03, -7.6070e-06, 6.0052e-05, 7.2002e-05, -1.6565e-03, -5.3215e-03], device='cuda:0') 100 0.0001 changing lr epoch 175, time 214.91, cls_loss 0.0023 cls_loss_mapping 0.0029 cls_loss_causal 0.4960 re_mapping 0.0068 re_causal 0.0188 /// teacc 98.88 lr 0.00010000 Epoch 177, weight, value: tensor([[-0.1898, 0.0758, -0.0474, ..., -0.0138, -0.1122, -0.1242], [ 0.0213, 0.0697, -0.1037, ..., -0.0973, -0.0302, 0.0919], [ 0.0522, -0.0937, -0.1008, ..., 0.0016, -0.0456, -0.0539], ..., [ 0.0700, 0.0037, 0.1094, ..., 0.0603, 0.1481, -0.0026], [ 0.0904, -0.1371, -0.1101, ..., -0.1767, -0.0515, 0.1511], [-0.0903, 0.0758, 0.0242, ..., -0.1294, -0.0759, -0.0170]], device='cuda:0'), grad: tensor([[ 3.0398e-06, 1.4501e-06, 2.1327e-06, ..., 1.3160e-06, 4.5039e-06, 1.2852e-07], [ 8.5980e-06, 9.4716e-07, 7.2643e-07, ..., 1.3057e-06, 2.6301e-06, 7.4469e-06], [-9.9689e-06, 5.8766e-07, 6.8825e-07, ..., -7.5549e-06, -5.0813e-06, 5.1782e-07], ..., [-2.3376e-07, -1.7611e-06, -5.2080e-06, ..., 9.5926e-08, -7.3984e-06, 2.9430e-07], [ 1.7822e-05, 2.3916e-05, 4.8801e-07, ..., 1.0803e-06, 1.0254e-06, -1.0662e-05], [ 1.3160e-06, -7.6834e-07, -2.0489e-06, ..., 6.0163e-07, 7.1060e-07, 4.8149e-07]], device='cuda:0') Epoch 177, bias, value: tensor([-0.0086, -0.0137, -0.0034, -0.0141, -0.0066, -0.0025, 0.0121, 0.0132, 0.0179, -0.0104], device='cuda:0'), grad: tensor([ 1.4789e-05, 2.0817e-05, -2.2501e-05, 5.4091e-05, 8.9556e-06, -2.2578e-04, 9.0599e-06, -1.1362e-05, 1.4997e-04, 2.0452e-06], device='cuda:0') 100 0.0001 changing lr epoch 176, time 214.89, cls_loss 0.0026 cls_loss_mapping 0.0039 cls_loss_causal 0.5085 re_mapping 0.0064 re_causal 0.0183 /// teacc 99.00 lr 0.00010000 Epoch 178, weight, value: tensor([[-0.1906, 0.0761, -0.0484, ..., -0.0140, -0.1134, -0.1247], [ 0.0220, 0.0705, -0.1028, ..., -0.0961, -0.0293, 0.0932], [ 0.0521, -0.0940, -0.1013, ..., 0.0017, -0.0457, -0.0549], ..., [ 0.0697, 0.0031, 0.1093, ..., 0.0602, 0.1478, -0.0040], [ 0.0902, -0.1380, -0.1108, ..., -0.1778, -0.0519, 0.1511], [-0.0904, 0.0754, 0.0247, ..., -0.1300, -0.0762, -0.0175]], device='cuda:0'), grad: tensor([[ 5.6624e-07, -3.9302e-06, 3.1944e-07, ..., 4.2375e-07, 3.3062e-07, 1.0896e-07], [ 2.4557e-05, 8.3353e-07, 3.3975e-05, ..., 1.2994e-05, 3.1471e-05, 1.2435e-05], [-6.0499e-06, 3.7719e-07, 2.8294e-06, ..., -8.1807e-06, 5.8021e-07, 9.5647e-07], ..., [-3.9667e-05, 2.5425e-07, -5.8413e-05, ..., -1.8805e-05, -5.2571e-05, -2.0429e-05], [ 8.5682e-06, 9.3691e-07, 1.2353e-05, ..., 5.5172e-06, 1.1154e-05, 3.7365e-06], [ 4.5076e-06, 2.9728e-06, 8.7246e-06, ..., 3.6489e-06, 6.1579e-06, 2.4531e-06]], device='cuda:0') Epoch 178, bias, value: tensor([-0.0088, -0.0128, -0.0034, -0.0145, -0.0062, -0.0021, 0.0123, 0.0123, 0.0174, -0.0110], device='cuda:0'), grad: tensor([-5.9567e-06, 6.0827e-05, -2.5198e-05, 1.8686e-05, -2.0713e-06, 1.0639e-05, -5.8599e-06, -9.2983e-05, 2.3887e-05, 1.7956e-05], device='cuda:0') 100 0.0001 changing lr epoch 177, time 214.75, cls_loss 0.0023 cls_loss_mapping 0.0041 cls_loss_causal 0.5267 re_mapping 0.0066 re_causal 0.0194 /// teacc 98.95 lr 0.00010000 Epoch 179, weight, value: tensor([[-0.1915, 0.0764, -0.0495, ..., -0.0132, -0.1140, -0.1245], [ 0.0223, 0.0709, -0.1022, ..., -0.0951, -0.0289, 0.0938], [ 0.0526, -0.0928, -0.1011, ..., 0.0015, -0.0455, -0.0543], ..., [ 0.0694, 0.0025, 0.1092, ..., 0.0599, 0.1476, -0.0046], [ 0.0902, -0.1385, -0.1111, ..., -0.1784, -0.0521, 0.1513], [-0.0913, 0.0753, 0.0250, ..., -0.1315, -0.0770, -0.0179]], device='cuda:0'), grad: tensor([[ 5.4017e-08, -9.3132e-07, 2.7101e-07, ..., 2.5891e-07, 5.8673e-08, 1.2107e-08], [ 4.4703e-08, 6.4261e-08, 3.3900e-07, ..., 2.8498e-07, 4.9360e-08, -6.9849e-08], [-1.6391e-07, 3.9302e-07, 1.9372e-06, ..., 1.6065e-06, -1.9092e-07, 2.0489e-08], ..., [-1.7043e-07, 5.8301e-06, 1.3374e-05, ..., 3.9767e-07, -1.2293e-07, 3.9116e-08], [-3.1665e-08, 1.2545e-06, 2.3544e-06, ..., 1.7630e-06, 2.8871e-08, -1.0524e-07], [-8.3819e-09, -1.8433e-05, -2.6494e-05, ..., 5.1130e-07, 2.7008e-08, -3.7719e-07]], device='cuda:0') Epoch 179, bias, value: tensor([-0.0084, -0.0126, -0.0022, -0.0147, -0.0062, -0.0019, 0.0122, 0.0114, 0.0173, -0.0112], device='cuda:0'), grad: tensor([-9.0431e-07, 5.3737e-07, 2.4680e-06, -7.8380e-06, 2.7448e-05, 2.9802e-06, 2.9895e-07, 2.3603e-05, 4.6566e-06, -5.3227e-05], device='cuda:0') 100 0.0001 changing lr epoch 178, time 214.99, cls_loss 0.0021 cls_loss_mapping 0.0036 cls_loss_causal 0.5361 re_mapping 0.0067 re_causal 0.0203 /// teacc 98.90 lr 0.00010000 Epoch 180, weight, value: tensor([[-0.1920, 0.0767, -0.0508, ..., -0.0128, -0.1146, -0.1244], [ 0.0221, 0.0711, -0.1025, ..., -0.0955, -0.0291, 0.0939], [ 0.0526, -0.0934, -0.1013, ..., 0.0015, -0.0456, -0.0554], ..., [ 0.0695, 0.0021, 0.1092, ..., 0.0603, 0.1479, -0.0047], [ 0.0904, -0.1390, -0.1116, ..., -0.1795, -0.0519, 0.1515], [-0.0911, 0.0757, 0.0262, ..., -0.1326, -0.0762, -0.0179]], device='cuda:0'), grad: tensor([[ 2.0619e-06, -2.3041e-06, 9.2201e-08, ..., 4.6194e-07, 2.2352e-08, 8.8476e-08], [ 4.4797e-07, 4.3474e-06, 2.8666e-06, ..., 2.0638e-06, 8.7824e-07, 2.6599e-06], [-3.5260e-06, 4.6659e-07, 5.1875e-07, ..., -6.4261e-07, 6.8918e-08, 3.7998e-07], ..., [-1.6103e-06, 4.0382e-06, 7.8045e-07, ..., 3.1386e-07, -4.5914e-07, 2.6133e-06], [ 1.6177e-06, 1.4845e-06, 1.2778e-06, ..., 6.2305e-07, 1.5646e-07, 1.2610e-06], [ 1.4622e-07, 2.4121e-06, 6.7707e-07, ..., 1.3793e-06, 4.9919e-07, 8.7079e-07]], device='cuda:0') Epoch 180, bias, value: tensor([-0.0083, -0.0126, -0.0024, -0.0139, -0.0066, -0.0025, 0.0122, 0.0113, 0.0172, -0.0105], device='cuda:0'), grad: tensor([ 2.5444e-06, 1.5706e-05, -1.2234e-05, 2.8256e-06, -4.1813e-05, 1.6354e-06, 6.0350e-07, 1.1936e-05, 1.0878e-05, 7.8753e-06], device='cuda:0') 100 0.0001 changing lr epoch 179, time 214.67, cls_loss 0.0016 cls_loss_mapping 0.0030 cls_loss_causal 0.5335 re_mapping 0.0062 re_causal 0.0192 /// teacc 98.91 lr 0.00010000 Epoch 181, weight, value: tensor([[-0.1929, 0.0767, -0.0516, ..., -0.0129, -0.1151, -0.1256], [ 0.0221, 0.0716, -0.1028, ..., -0.0958, -0.0292, 0.0946], [ 0.0525, -0.0940, -0.1015, ..., 0.0017, -0.0457, -0.0565], ..., [ 0.0699, 0.0022, 0.1099, ..., 0.0608, 0.1483, -0.0048], [ 0.0904, -0.1395, -0.1121, ..., -0.1802, -0.0521, 0.1515], [-0.0919, 0.0757, 0.0262, ..., -0.1344, -0.0770, -0.0182]], device='cuda:0'), grad: tensor([[ 1.0431e-07, 5.3830e-07, 2.5146e-07, ..., 1.6764e-08, 3.8184e-08, 4.1816e-07], [-2.5146e-08, -2.8964e-06, 2.7101e-07, ..., 1.4994e-07, 6.2678e-07, -3.9823e-06], [-4.1444e-07, 2.2445e-07, 2.1420e-07, ..., -2.8871e-08, -5.3458e-07, 2.7567e-07], ..., [-4.1258e-07, 3.7160e-07, -7.6927e-07, ..., -1.5274e-07, -5.4017e-07, 4.4983e-07], [ 8.8476e-08, 6.5099e-07, 1.0673e-06, ..., 4.1910e-08, 7.0781e-08, 4.7497e-07], [ 2.8592e-07, -1.5656e-06, -3.0473e-06, ..., 8.9407e-08, 2.3656e-07, 5.7742e-07]], device='cuda:0') Epoch 181, bias, value: tensor([-0.0086, -0.0124, -0.0025, -0.0127, -0.0067, -0.0036, 0.0123, 0.0117, 0.0171, -0.0106], device='cuda:0'), grad: tensor([ 1.6280e-06, -6.6981e-06, -1.1707e-06, -1.4715e-07, 5.3272e-06, 8.3819e-09, 2.8685e-06, 3.9674e-07, 3.6098e-06, -5.8264e-06], device='cuda:0') 100 0.0001 changing lr epoch 180, time 214.98, cls_loss 0.0021 cls_loss_mapping 0.0034 cls_loss_causal 0.5126 re_mapping 0.0061 re_causal 0.0178 /// teacc 98.92 lr 0.00010000 Epoch 182, weight, value: tensor([[-0.1942, 0.0770, -0.0538, ..., -0.0135, -0.1169, -0.1252], [ 0.0219, 0.0719, -0.1033, ..., -0.0965, -0.0294, 0.0949], [ 0.0527, -0.0944, -0.1019, ..., 0.0017, -0.0457, -0.0573], ..., [ 0.0703, 0.0022, 0.1108, ..., 0.0614, 0.1488, -0.0046], [ 0.0905, -0.1403, -0.1124, ..., -0.1808, -0.0522, 0.1517], [-0.0926, 0.0756, 0.0262, ..., -0.1355, -0.0777, -0.0188]], device='cuda:0'), grad: tensor([[ 1.3588e-06, 8.1584e-06, 1.8906e-07, ..., 1.4016e-06, 4.7497e-08, 2.5984e-06], [ 2.2929e-06, -3.4552e-07, 5.7463e-07, ..., 1.1064e-06, 2.3805e-06, -1.9595e-06], [-7.7859e-06, 2.6170e-07, 3.3621e-07, ..., -6.2473e-06, -3.3751e-06, 4.8894e-07], ..., [ 1.8626e-08, 2.5053e-07, -4.2375e-07, ..., 3.4180e-07, -3.8370e-07, -1.2480e-07], [ 4.5635e-07, 1.6699e-06, 1.6680e-06, ..., 4.2096e-07, 1.3225e-07, 5.6531e-07], [ 1.8626e-07, -6.6698e-05, -7.2777e-05, ..., 1.7509e-07, 3.8184e-08, -2.5690e-05]], device='cuda:0') Epoch 182, bias, value: tensor([-0.0086, -0.0125, -0.0023, -0.0130, -0.0066, -0.0035, 0.0122, 0.0122, 0.0171, -0.0108], device='cuda:0'), grad: tensor([ 5.0724e-05, 4.1239e-06, -2.1622e-05, 8.5235e-06, 2.0504e-04, -1.6510e-05, -1.7151e-05, 1.7807e-06, 7.0520e-06, -2.2161e-04], device='cuda:0') 100 0.0001 changing lr epoch 181, time 214.80, cls_loss 0.0024 cls_loss_mapping 0.0035 cls_loss_causal 0.5388 re_mapping 0.0065 re_causal 0.0190 /// teacc 98.97 lr 0.00010000 Epoch 183, weight, value: tensor([[-0.1962, 0.0774, -0.0560, ..., -0.0139, -0.1183, -0.1261], [ 0.0211, 0.0721, -0.1047, ..., -0.0983, -0.0308, 0.0955], [ 0.0515, -0.0953, -0.1034, ..., 0.0005, -0.0473, -0.0575], ..., [ 0.0715, 0.0022, 0.1122, ..., 0.0630, 0.1504, -0.0053], [ 0.0908, -0.1410, -0.1124, ..., -0.1811, -0.0517, 0.1517], [-0.0929, 0.0759, 0.0268, ..., -0.1359, -0.0778, -0.0190]], device='cuda:0'), grad: tensor([[ 1.2666e-07, -3.9078e-06, 1.1362e-07, ..., -1.4901e-07, 1.0990e-07, 2.9244e-07], [-1.6214e-06, -2.6543e-06, 5.5972e-07, ..., 3.6415e-07, 5.5600e-07, -1.3523e-05], [ 1.1772e-06, 7.3668e-07, 1.4296e-06, ..., 6.5006e-07, 1.4249e-06, 1.7369e-06], ..., [-2.5220e-06, 2.4363e-06, -3.6489e-06, ..., -1.9707e-06, -3.6918e-06, 6.2212e-07], [ 8.6054e-07, 1.2768e-06, 1.1455e-07, ..., 5.2154e-08, 5.9605e-08, 5.4017e-06], [ 2.5984e-07, 1.6578e-07, 1.5460e-07, ..., 1.7043e-07, 2.7195e-07, 4.0326e-07]], device='cuda:0') Epoch 183, bias, value: tensor([-0.0084, -0.0130, -0.0034, -0.0133, -0.0068, -0.0032, 0.0121, 0.0132, 0.0170, -0.0106], device='cuda:0'), grad: tensor([-8.6650e-06, -1.7673e-05, 4.7833e-06, 1.4137e-06, 2.4550e-06, 1.3500e-05, -1.0267e-05, 2.8964e-06, 9.7156e-06, 1.7779e-06], device='cuda:0') 100 0.0001 changing lr epoch 182, time 214.63, cls_loss 0.0025 cls_loss_mapping 0.0043 cls_loss_causal 0.5092 re_mapping 0.0065 re_causal 0.0183 /// teacc 98.96 lr 0.00010000 Epoch 184, weight, value: tensor([[-0.1985, 0.0777, -0.0594, ..., -0.0147, -0.1211, -0.1272], [ 0.0201, 0.0715, -0.1060, ..., -0.1005, -0.0325, 0.0961], [ 0.0526, -0.0967, -0.1028, ..., 0.0013, -0.0470, -0.0579], ..., [ 0.0714, 0.0027, 0.1125, ..., 0.0636, 0.1512, -0.0059], [ 0.0909, -0.1411, -0.1127, ..., -0.1819, -0.0517, 0.1520], [-0.0932, 0.0759, 0.0272, ..., -0.1368, -0.0783, -0.0193]], device='cuda:0'), grad: tensor([[ 1.8533e-07, -2.3842e-07, 5.1223e-08, ..., 1.6568e-06, 1.2582e-06, 6.6124e-08], [-2.7940e-08, -9.0338e-08, 1.6205e-07, ..., 8.5030e-07, 7.2923e-07, -1.3728e-06], [-5.1782e-07, 7.4506e-08, 1.1642e-07, ..., -6.1132e-06, -4.5374e-06, 2.3004e-07], ..., [-3.0641e-07, 1.7602e-07, -7.0501e-07, ..., 7.2643e-07, 9.6858e-08, 5.2992e-07], [-1.0617e-07, 3.0641e-07, 1.2759e-07, ..., 2.9802e-07, 2.4028e-07, 1.4063e-07], [ 2.8592e-07, 7.4599e-07, -1.9558e-08, ..., 3.0268e-07, 4.5542e-07, 1.4286e-06]], device='cuda:0') Epoch 184, bias, value: tensor([-0.0083, -0.0138, -0.0030, -0.0142, -0.0068, -0.0022, 0.0121, 0.0134, 0.0170, -0.0105], device='cuda:0'), grad: tensor([ 3.7365e-06, 2.7288e-07, -1.4573e-05, 5.9344e-06, -4.6082e-06, -7.8231e-06, 8.1137e-06, 2.1700e-06, 2.1160e-06, 4.5933e-06], device='cuda:0') 100 0.0001 changing lr epoch 183, time 214.89, cls_loss 0.0026 cls_loss_mapping 0.0032 cls_loss_causal 0.5184 re_mapping 0.0064 re_causal 0.0182 /// teacc 98.87 lr 0.00010000 Epoch 185, weight, value: tensor([[-0.2001, 0.0779, -0.0613, ..., -0.0152, -0.1231, -0.1289], [ 0.0197, 0.0717, -0.1066, ..., -0.1007, -0.0328, 0.0962], [ 0.0519, -0.0973, -0.1035, ..., 0.0009, -0.0475, -0.0602], ..., [ 0.0721, 0.0028, 0.1132, ..., 0.0637, 0.1518, -0.0054], [ 0.0911, -0.1419, -0.1133, ..., -0.1824, -0.0520, 0.1525], [-0.0935, 0.0755, 0.0276, ..., -0.1378, -0.0787, -0.0195]], device='cuda:0'), grad: tensor([[ 4.7404e-07, 8.3540e-07, 1.6671e-07, ..., 2.3656e-07, 2.3562e-07, 2.4717e-06], [ 2.9448e-06, 1.2159e-05, 8.4098e-07, ..., 1.0869e-06, 3.9302e-07, 1.8656e-05], [-1.2740e-06, 5.7556e-07, 2.6692e-06, ..., 1.9893e-06, -1.6177e-06, 1.1176e-06], ..., [ 1.0747e-06, 4.0159e-06, 6.0443e-07, ..., 1.0058e-06, 2.5146e-07, 5.3011e-06], [-1.5900e-05, -5.5991e-06, -7.9423e-06, ..., -7.1824e-06, 1.3970e-07, -2.2352e-05], [ 1.1856e-06, 7.7844e-05, 2.8044e-05, ..., 4.5635e-07, 2.2445e-07, 3.6180e-05]], device='cuda:0') Epoch 185, bias, value: tensor([-0.0084, -0.0139, -0.0037, -0.0149, -0.0068, -0.0017, 0.0124, 0.0139, 0.0170, -0.0108], device='cuda:0'), grad: tensor([ 8.4117e-06, 3.2604e-05, 1.5255e-06, 2.8461e-05, -2.0063e-04, 3.2075e-06, 7.0855e-06, 1.2062e-05, -4.2170e-05, 1.4913e-04], device='cuda:0') 100 0.0001 changing lr epoch 184, time 214.80, cls_loss 0.0018 cls_loss_mapping 0.0039 cls_loss_causal 0.5091 re_mapping 0.0063 re_causal 0.0183 /// teacc 98.86 lr 0.00010000 Epoch 186, weight, value: tensor([[-0.1980, 0.0777, -0.0628, ..., -0.0154, -0.1240, -0.1272], [ 0.0198, 0.0715, -0.1065, ..., -0.1007, -0.0328, 0.0965], [ 0.0520, -0.0972, -0.1031, ..., 0.0014, -0.0473, -0.0611], ..., [ 0.0721, 0.0024, 0.1131, ..., 0.0637, 0.1518, -0.0060], [ 0.0911, -0.1432, -0.1136, ..., -0.1829, -0.0522, 0.1528], [-0.0938, 0.0759, 0.0275, ..., -0.1408, -0.0801, -0.0196]], device='cuda:0'), grad: tensor([[ 1.9465e-07, -5.3924e-07, 5.0291e-08, ..., 7.9162e-08, 2.0489e-08, -7.4506e-09], [-1.5631e-05, -2.3972e-06, 7.3574e-08, ..., 6.6124e-08, -1.4640e-05, -2.8417e-05], [-1.4016e-06, 4.2561e-07, 3.2596e-08, ..., -7.0687e-07, -7.3574e-08, 5.7649e-07], ..., [ 1.5013e-05, 2.7400e-06, -8.7544e-08, ..., 8.3819e-09, 1.3888e-05, 2.7165e-05], [ 1.0272e-06, 3.8035e-06, 1.9688e-06, ..., 2.8498e-06, 3.7346e-07, -9.0618e-07], [ 3.3062e-07, -5.3011e-06, 3.1665e-08, ..., 2.3190e-07, 2.3935e-07, 4.5635e-08]], device='cuda:0') Epoch 186, bias, value: tensor([-0.0085, -0.0138, -0.0030, -0.0153, -0.0066, -0.0015, 0.0125, 0.0136, 0.0169, -0.0110], device='cuda:0'), grad: tensor([-8.6986e-07, -4.6879e-05, -2.0042e-06, 1.2808e-05, 8.4639e-06, -2.4796e-05, 2.0191e-06, 4.6045e-05, 1.5676e-05, -1.0461e-05], device='cuda:0') 100 0.0001 changing lr epoch 185, time 214.55, cls_loss 0.0018 cls_loss_mapping 0.0028 cls_loss_causal 0.5255 re_mapping 0.0061 re_causal 0.0189 /// teacc 99.01 lr 0.00010000 Epoch 187, weight, value: tensor([[-0.1985, 0.0778, -0.0641, ..., -0.0157, -0.1248, -0.1276], [ 0.0190, 0.0715, -0.1072, ..., -0.1009, -0.0332, 0.0963], [ 0.0535, -0.0980, -0.1019, ..., 0.0025, -0.0463, -0.0623], ..., [ 0.0714, 0.0024, 0.1127, ..., 0.0630, 0.1515, -0.0059], [ 0.0917, -0.1438, -0.1133, ..., -0.1836, -0.0515, 0.1534], [-0.0940, 0.0761, 0.0276, ..., -0.1412, -0.0807, -0.0195]], device='cuda:0'), grad: tensor([[ 2.1234e-07, -5.3179e-07, 2.5891e-07, ..., 1.0151e-07, 1.8347e-07, 3.0734e-08], [ 2.5332e-07, 1.9744e-07, 3.6601e-07, ..., 1.2666e-07, 2.5239e-07, -2.2165e-07], [-6.7987e-08, 2.9244e-07, 1.9185e-07, ..., -2.5798e-07, 1.0151e-07, 1.1176e-07], ..., [-9.7137e-07, 5.1688e-07, -1.2619e-06, ..., -2.7008e-07, -9.6112e-07, 1.5087e-07], [ 1.4622e-07, 2.5984e-06, 1.8161e-07, ..., 2.0023e-07, 8.7544e-08, 4.2841e-08], [ 7.0781e-07, 1.0788e-05, 3.0734e-07, ..., 1.3039e-07, 2.3842e-07, 1.4771e-06]], device='cuda:0') Epoch 187, bias, value: tensor([-0.0086, -0.0142, -0.0023, -0.0154, -0.0066, -0.0014, 0.0126, 0.0133, 0.0169, -0.0108], device='cuda:0'), grad: tensor([ 1.6078e-05, 2.6673e-06, 2.3264e-06, 3.3490e-06, 3.2127e-05, -3.4660e-05, -6.1810e-05, 3.7067e-07, 1.1638e-05, 2.7895e-05], device='cuda:0') 100 0.0001 changing lr epoch 186, time 214.60, cls_loss 0.0019 cls_loss_mapping 0.0031 cls_loss_causal 0.5490 re_mapping 0.0058 re_causal 0.0182 /// teacc 98.91 lr 0.00010000 Epoch 188, weight, value: tensor([[-0.1990, 0.0777, -0.0654, ..., -0.0159, -0.1264, -0.1281], [ 0.0198, 0.0720, -0.1069, ..., -0.1010, -0.0323, 0.0967], [ 0.0530, -0.0989, -0.1023, ..., 0.0024, -0.0470, -0.0629], ..., [ 0.0715, 0.0022, 0.1130, ..., 0.0632, 0.1515, -0.0061], [ 0.0917, -0.1443, -0.1137, ..., -0.1841, -0.0517, 0.1534], [-0.0950, 0.0758, 0.0277, ..., -0.1421, -0.0815, -0.0202]], device='cuda:0'), grad: tensor([[ 1.7509e-07, -1.3132e-07, 1.0245e-07, ..., 2.3842e-07, 2.2631e-07, 8.3819e-09], [ 2.0098e-06, -1.4342e-07, 1.7658e-06, ..., 2.0042e-06, 1.9986e-06, -2.5053e-07], [-2.7627e-05, 1.6764e-08, -1.2085e-05, ..., -2.4825e-05, -1.9103e-05, 5.3085e-08], ..., [ 1.7792e-05, 2.2817e-07, 7.3723e-06, ..., 1.5900e-05, 1.1779e-05, 1.3039e-08], [ 1.6754e-06, 6.7987e-08, 7.9628e-07, ..., 1.4482e-06, 1.1511e-06, -1.1548e-07], [ 1.5646e-07, -1.6764e-07, -5.1875e-07, ..., 2.3656e-07, 1.0338e-07, 1.0990e-07]], device='cuda:0') Epoch 188, bias, value: tensor([-0.0087, -0.0132, -0.0032, -0.0155, -0.0063, -0.0014, 0.0128, 0.0132, 0.0166, -0.0113], device='cuda:0'), grad: tensor([ 3.3528e-07, 5.1670e-06, -5.0992e-05, 8.9854e-06, 5.7835e-07, 7.7486e-07, 1.9185e-07, 3.2216e-05, 2.8964e-06, -1.8533e-07], device='cuda:0') 100 0.0001 changing lr epoch 187, time 214.73, cls_loss 0.0016 cls_loss_mapping 0.0033 cls_loss_causal 0.5142 re_mapping 0.0062 re_causal 0.0185 /// teacc 99.02 lr 0.00010000 Epoch 189, weight, value: tensor([[-0.1995, 0.0780, -0.0671, ..., -0.0158, -0.1276, -0.1292], [ 0.0197, 0.0725, -0.1072, ..., -0.1014, -0.0323, 0.0970], [ 0.0529, -0.0992, -0.1027, ..., 0.0022, -0.0473, -0.0640], ..., [ 0.0718, 0.0021, 0.1136, ..., 0.0636, 0.1518, -0.0059], [ 0.0919, -0.1449, -0.1141, ..., -0.1847, -0.0520, 0.1537], [-0.0960, 0.0757, 0.0277, ..., -0.1439, -0.0823, -0.0204]], device='cuda:0'), grad: tensor([[ 3.7719e-07, -2.2314e-06, 2.2538e-07, ..., 2.6543e-07, 1.2852e-07, 2.2911e-07], [ 3.1572e-07, 1.1940e-06, 2.8461e-06, ..., 8.6427e-07, 5.2992e-07, -1.1716e-06], [ 2.8666e-06, 1.1856e-06, -1.1884e-06, ..., -3.1665e-06, -4.6492e-06, 4.7721e-06], ..., [ 1.6186e-06, 8.6799e-07, 7.0315e-07, ..., 1.4203e-06, 4.8336e-07, 6.5658e-07], [-2.0489e-05, 1.4668e-06, 9.7509e-07, ..., 8.3633e-07, 3.1665e-07, -1.0431e-05], [ 2.7493e-06, -1.0673e-06, -2.6170e-07, ..., 2.2687e-06, 1.5274e-06, 2.5798e-07]], device='cuda:0') Epoch 189, bias, value: tensor([-0.0087, -0.0132, -0.0034, -0.0154, -0.0064, -0.0014, 0.0127, 0.0136, 0.0166, -0.0115], device='cuda:0'), grad: tensor([-2.0470e-06, 6.6832e-06, 9.3728e-06, -5.2452e-06, 2.2240e-06, 2.3559e-05, 1.1623e-05, 7.0184e-06, -5.8681e-05, 5.5283e-06], device='cuda:0') 100 0.0001 changing lr epoch 188, time 214.79, cls_loss 0.0018 cls_loss_mapping 0.0033 cls_loss_causal 0.5119 re_mapping 0.0063 re_causal 0.0184 /// teacc 99.00 lr 0.00010000 Epoch 190, weight, value: tensor([[-0.2001, 0.0783, -0.0692, ..., -0.0174, -0.1284, -0.1301], [ 0.0197, 0.0728, -0.1071, ..., -0.1015, -0.0324, 0.0974], [ 0.0526, -0.0997, -0.1036, ..., 0.0010, -0.0476, -0.0650], ..., [ 0.0720, 0.0021, 0.1140, ..., 0.0640, 0.1521, -0.0063], [ 0.0924, -0.1460, -0.1140, ..., -0.1845, -0.0517, 0.1541], [-0.0971, 0.0754, 0.0276, ..., -0.1466, -0.0834, -0.0212]], device='cuda:0'), grad: tensor([[ 5.0291e-08, 9.6858e-08, 1.8887e-06, ..., -1.4715e-07, 7.4506e-09, 5.1223e-07], [-1.3448e-06, -2.7940e-07, 3.3528e-08, ..., 5.5879e-08, -4.4703e-08, -9.7305e-06], [ 1.2349e-06, 2.7195e-07, 1.7323e-07, ..., 7.8231e-08, -1.6764e-08, 3.6843e-06], ..., [ 4.7125e-07, 3.7067e-06, 7.3798e-06, ..., 4.2282e-07, -2.0489e-08, 2.5947e-06], [-1.5572e-06, 3.9116e-07, 5.7742e-08, ..., 1.6019e-07, 4.0978e-08, 1.2852e-07], [ 8.0094e-08, -4.5560e-06, -1.1086e-05, ..., -5.3458e-07, 9.3132e-09, 6.9290e-07]], device='cuda:0') Epoch 190, bias, value: tensor([-0.0089, -0.0130, -0.0040, -0.0151, -0.0061, -0.0015, 0.0127, 0.0137, 0.0166, -0.0117], device='cuda:0'), grad: tensor([ 1.6868e-05, -1.0937e-05, 4.7944e-06, 1.8403e-06, -1.2517e-06, -1.8626e-09, -1.4797e-05, 1.1250e-05, 1.9874e-06, -9.8050e-06], device='cuda:0') 100 0.0001 changing lr epoch 189, time 214.60, cls_loss 0.0020 cls_loss_mapping 0.0034 cls_loss_causal 0.5276 re_mapping 0.0064 re_causal 0.0179 /// teacc 98.89 lr 0.00010000 Epoch 191, weight, value: tensor([[-0.2010, 0.0795, -0.0714, ..., -0.0180, -0.1298, -0.1305], [ 0.0196, 0.0732, -0.1073, ..., -0.1019, -0.0325, 0.0977], [ 0.0527, -0.1001, -0.1041, ..., 0.0006, -0.0476, -0.0645], ..., [ 0.0722, 0.0017, 0.1145, ..., 0.0644, 0.1523, -0.0065], [ 0.0923, -0.1467, -0.1148, ..., -0.1864, -0.0521, 0.1544], [-0.0977, 0.0755, 0.0279, ..., -0.1480, -0.0840, -0.0214]], device='cuda:0'), grad: tensor([[ 7.0781e-08, -2.0489e-08, 4.4703e-08, ..., 2.0489e-08, 4.8429e-08, 1.8068e-07], [-6.3367e-06, -9.9987e-06, -1.0714e-05, ..., -2.0023e-06, 1.9372e-07, -2.3112e-05], [ 3.0715e-06, 6.3330e-08, 2.1793e-06, ..., 1.6969e-06, 2.7940e-06, 2.3469e-07], ..., [ 1.6205e-06, 8.6054e-06, 6.2250e-06, ..., -6.8918e-07, -3.9451e-06, 2.1994e-05], [-1.4901e-07, 2.6077e-07, 5.8115e-07, ..., 1.9930e-07, 1.5646e-07, 5.2154e-08], [ 5.7369e-07, 8.9966e-07, -1.2107e-07, ..., 2.1793e-07, 1.1921e-07, 1.7788e-06]], device='cuda:0') Epoch 191, bias, value: tensor([-0.0066, -0.0131, -0.0040, -0.0153, -0.0062, -0.0013, 0.0117, 0.0138, 0.0165, -0.0117], device='cuda:0'), grad: tensor([ 1.3616e-06, -1.2733e-05, 4.6566e-06, 8.9779e-07, -1.8440e-06, 2.1145e-05, -4.8161e-05, 2.4199e-05, 4.6343e-06, 5.7295e-06], device='cuda:0') 100 0.0001 changing lr epoch 190, time 214.71, cls_loss 0.0016 cls_loss_mapping 0.0029 cls_loss_causal 0.5355 re_mapping 0.0067 re_causal 0.0199 /// teacc 99.02 lr 0.00010000 Epoch 192, weight, value: tensor([[-0.2017, 0.0795, -0.0731, ..., -0.0177, -0.1301, -0.1322], [ 0.0193, 0.0729, -0.1077, ..., -0.1025, -0.0326, 0.0977], [ 0.0526, -0.1005, -0.1044, ..., 0.0004, -0.0477, -0.0650], ..., [ 0.0726, 0.0019, 0.1150, ..., 0.0650, 0.1526, -0.0065], [ 0.0922, -0.1481, -0.1159, ..., -0.1869, -0.0523, 0.1545], [-0.0983, 0.0760, 0.0283, ..., -0.1496, -0.0853, -0.0211]], device='cuda:0'), grad: tensor([[ 3.3528e-08, 8.3819e-08, 1.1176e-08, ..., 5.5879e-09, 1.8626e-09, 1.1735e-07], [-5.6997e-06, -1.2510e-05, 3.5390e-08, ..., 2.0489e-08, 1.3039e-08, -2.0102e-05], [ 3.1665e-08, 8.3819e-08, 5.9605e-08, ..., 7.4506e-09, -7.4506e-09, 9.6858e-08], ..., [ 2.1793e-07, 8.4750e-07, -7.2643e-08, ..., -2.9802e-08, -4.4703e-08, 9.9279e-07], [ 4.9546e-07, 1.1884e-06, 2.2352e-08, ..., 0.0000e+00, 1.8626e-09, 1.7919e-06], [ 3.5763e-06, 1.7792e-05, -2.0675e-07, ..., 4.0978e-08, 1.6764e-08, 1.2442e-05]], device='cuda:0') Epoch 192, bias, value: tensor([-0.0066, -0.0132, -0.0042, -0.0154, -0.0063, -0.0010, 0.0115, 0.0142, 0.0158, -0.0112], device='cuda:0'), grad: tensor([ 9.3132e-07, -4.1485e-05, 3.0920e-07, 4.2841e-08, -3.0175e-05, 6.4597e-06, -8.4192e-06, 2.8703e-06, 4.0904e-06, 6.5327e-05], device='cuda:0') 100 0.0001 changing lr epoch 191, time 214.72, cls_loss 0.0015 cls_loss_mapping 0.0026 cls_loss_causal 0.5516 re_mapping 0.0066 re_causal 0.0197 /// teacc 98.92 lr 0.00010000 Epoch 193, weight, value: tensor([[-0.2022, 0.0805, -0.0734, ..., -0.0162, -0.1307, -0.1326], [ 0.0191, 0.0734, -0.1080, ..., -0.1028, -0.0328, 0.0982], [ 0.0528, -0.1010, -0.1045, ..., 0.0004, -0.0476, -0.0655], ..., [ 0.0728, 0.0017, 0.1155, ..., 0.0653, 0.1528, -0.0064], [ 0.0923, -0.1484, -0.1164, ..., -0.1878, -0.0526, 0.1546], [-0.0993, 0.0754, 0.0280, ..., -0.1523, -0.0861, -0.0214]], device='cuda:0'), grad: tensor([[ 8.0466e-07, -3.3714e-07, 6.3702e-07, ..., 4.8988e-07, 8.9966e-07, 1.3225e-07], [ 1.8198e-06, 6.0163e-07, 1.9986e-06, ..., 9.8906e-07, 1.9222e-06, 4.5076e-07], [ 1.1325e-05, 1.1604e-06, 1.1526e-05, ..., 6.4559e-06, 1.3165e-05, 7.5251e-07], ..., [-1.6630e-05, 5.0291e-08, -1.7375e-05, ..., -9.3952e-06, -1.9893e-05, 6.5751e-07], [ 1.5926e-06, 6.7614e-07, 1.7267e-06, ..., 8.8476e-07, 1.7285e-06, 5.6252e-07], [ 2.1737e-06, 5.0627e-06, 5.9232e-07, ..., 1.1548e-06, 6.1654e-07, 6.1579e-06]], device='cuda:0') Epoch 193, bias, value: tensor([-0.0062, -0.0132, -0.0039, -0.0157, -0.0064, -0.0009, 0.0116, 0.0145, 0.0158, -0.0118], device='cuda:0'), grad: tensor([ 7.4692e-07, 4.8950e-06, 2.3574e-05, 1.2517e-06, -2.4557e-05, 1.8254e-07, 1.7714e-06, -3.0249e-05, 4.8317e-06, 1.7524e-05], device='cuda:0') 100 0.0001 changing lr epoch 192, time 214.81, cls_loss 0.0016 cls_loss_mapping 0.0029 cls_loss_causal 0.5263 re_mapping 0.0064 re_causal 0.0186 /// teacc 99.05 lr 0.00010000 Epoch 194, weight, value: tensor([[-2.0279e-01, 8.0691e-02, -7.4324e-02, ..., -1.5059e-02, -1.3116e-01, -1.3309e-01], [ 1.8299e-02, 7.4346e-02, -1.0934e-01, ..., -1.0290e-01, -3.3242e-02, 9.8005e-02], [ 5.2739e-02, -1.0213e-01, -1.0478e-01, ..., 1.9529e-04, -4.7642e-02, -6.6802e-02], ..., [ 7.3336e-02, 6.8327e-04, 1.1626e-01, ..., 6.5286e-02, 1.5309e-01, -5.7657e-03], [ 9.2697e-02, -1.4880e-01, -1.1673e-01, ..., -1.8856e-01, -5.2508e-02, 1.5500e-01], [-1.0024e-01, 7.5419e-02, 2.8391e-02, ..., -1.5283e-01, -8.6411e-02, -2.1823e-02]], device='cuda:0'), grad: tensor([[ 9.4995e-08, -1.1034e-05, 9.8720e-08, ..., 7.8231e-08, 7.2643e-08, 5.2154e-08], [-1.2591e-06, -2.3376e-06, 1.1548e-07, ..., 1.0803e-07, 9.1270e-08, -4.9844e-06], [-1.5646e-07, 7.8231e-08, 3.3155e-07, ..., 0.0000e+00, -2.4214e-07, 1.5832e-07], ..., [ 2.5705e-07, 1.0617e-06, 1.4640e-06, ..., 7.1526e-07, -3.5390e-08, 9.6671e-07], [ 2.1607e-07, 8.1770e-07, 2.2724e-07, ..., 2.2352e-07, 1.3784e-07, 6.7428e-07], [ 1.9930e-07, -1.0971e-06, -3.3099e-06, ..., -5.7742e-07, -3.4459e-07, 4.5635e-07]], device='cuda:0') Epoch 194, bias, value: tensor([-0.0061, -0.0135, -0.0045, -0.0159, -0.0063, -0.0004, 0.0114, 0.0148, 0.0159, -0.0120], device='cuda:0'), grad: tensor([-2.6584e-05, -8.4415e-06, -1.7695e-07, 1.3039e-06, 3.8184e-07, 1.0058e-06, 2.9668e-05, 3.7309e-06, 1.9763e-06, -2.9206e-06], device='cuda:0') 100 0.0001 changing lr epoch 193, time 214.93, cls_loss 0.0017 cls_loss_mapping 0.0035 cls_loss_causal 0.5525 re_mapping 0.0063 re_causal 0.0182 /// teacc 99.01 lr 0.00010000 Epoch 195, weight, value: tensor([[-2.0313e-01, 8.0924e-02, -7.4921e-02, ..., -1.5367e-02, -1.3183e-01, -1.3359e-01], [ 1.8617e-02, 7.4323e-02, -1.0978e-01, ..., -1.0316e-01, -3.2954e-02, 9.8162e-02], [ 5.2170e-02, -1.0260e-01, -1.0493e-01, ..., 4.6870e-04, -4.8094e-02, -6.8206e-02], ..., [ 7.3647e-02, 8.3612e-05, 1.1651e-01, ..., 6.5584e-02, 1.5324e-01, -5.9058e-03], [ 9.3081e-02, -1.4930e-01, -1.1758e-01, ..., -1.8936e-01, -5.2513e-02, 1.5587e-01], [-1.0004e-01, 7.5995e-02, 2.9680e-02, ..., -1.5351e-01, -8.5192e-02, -2.1985e-02]], device='cuda:0'), grad: tensor([[ 4.2841e-08, -1.8626e-09, 1.3970e-07, ..., 7.6368e-08, 2.6077e-08, 5.5879e-09], [ 6.8583e-06, -1.4901e-08, 3.3062e-06, ..., 2.8908e-06, 4.7311e-06, 6.1654e-07], [ 2.0608e-05, 4.2841e-07, 8.4192e-06, ..., 9.1568e-06, 1.4156e-05, 2.5406e-06], ..., [-2.8655e-05, 1.1176e-07, -1.1794e-05, ..., -1.1772e-05, -1.9744e-05, -3.3807e-06], [ 3.0603e-06, 3.7253e-07, 5.0254e-06, ..., 5.6401e-06, 4.0606e-07, 0.0000e+00], [ 4.8615e-07, -4.0792e-07, -4.4703e-08, ..., 2.2724e-07, 3.6880e-07, 7.6368e-08]], device='cuda:0') Epoch 195, bias, value: tensor([-0.0060, -0.0132, -0.0050, -0.0161, -0.0065, -0.0006, 0.0113, 0.0148, 0.0163, -0.0113], device='cuda:0'), grad: tensor([ 1.8254e-07, 1.1191e-05, 3.4839e-05, -1.0565e-05, 2.5332e-07, 5.7556e-07, 4.0978e-08, -4.6551e-05, 1.0103e-05, -1.3039e-07], device='cuda:0') 100 0.0001 changing lr epoch 194, time 214.85, cls_loss 0.0021 cls_loss_mapping 0.0038 cls_loss_causal 0.5520 re_mapping 0.0067 re_causal 0.0190 /// teacc 98.99 lr 0.00010000 Epoch 196, weight, value: tensor([[-0.2046, 0.0810, -0.0762, ..., -0.0157, -0.1333, -0.1354], [ 0.0200, 0.0761, -0.1080, ..., -0.1036, -0.0317, 0.1004], [ 0.0517, -0.1034, -0.1056, ..., 0.0004, -0.0486, -0.0699], ..., [ 0.0735, -0.0021, 0.1159, ..., 0.0659, 0.1531, -0.0083], [ 0.0930, -0.1498, -0.1190, ..., -0.1913, -0.0531, 0.1565], [-0.1011, 0.0758, 0.0296, ..., -0.1549, -0.0871, -0.0223]], device='cuda:0'), grad: tensor([[ 9.3319e-07, 4.4703e-08, 2.5518e-07, ..., 4.2655e-07, 6.1467e-07, 3.7253e-08], [ 2.1413e-05, 1.3039e-08, 4.2506e-06, ..., 1.0245e-05, 1.3851e-05, -1.1362e-07], [ 2.8610e-03, 4.8429e-08, 4.0054e-04, ..., 1.3199e-03, 1.8511e-03, 2.3097e-07], ..., [-2.9888e-03, -8.1956e-08, -4.2439e-04, ..., -1.3800e-03, -1.9331e-03, -2.3656e-07], [ 3.8415e-05, 3.4831e-07, 6.1616e-06, ..., 1.8001e-05, 2.6152e-05, -1.7397e-06], [ 7.4692e-07, 3.2801e-06, 7.9535e-07, ..., 3.8370e-07, 4.3213e-07, 1.0669e-05]], device='cuda:0') Epoch 196, bias, value: tensor([-0.0062, -0.0116, -0.0057, -0.0162, -0.0060, -0.0006, 0.0114, 0.0138, 0.0163, -0.0117], device='cuda:0'), grad: tensor([ 1.2442e-06, 2.3648e-05, 3.0632e-03, 6.9737e-05, -3.1888e-05, 4.8429e-07, 1.0673e-06, -3.2024e-03, 4.1693e-05, 2.9773e-05], device='cuda:0') 100 0.0001 changing lr epoch 195, time 214.72, cls_loss 0.0015 cls_loss_mapping 0.0024 cls_loss_causal 0.4992 re_mapping 0.0064 re_causal 0.0176 /// teacc 99.05 lr 0.00010000 Epoch 197, weight, value: tensor([[-0.2055, 0.0811, -0.0771, ..., -0.0161, -0.1351, -0.1357], [ 0.0199, 0.0762, -0.1082, ..., -0.1040, -0.0318, 0.1006], [ 0.0498, -0.1037, -0.1066, ..., -0.0013, -0.0499, -0.0706], ..., [ 0.0752, -0.0023, 0.1167, ..., 0.0678, 0.1542, -0.0084], [ 0.0931, -0.1502, -0.1196, ..., -0.1920, -0.0535, 0.1568], [-0.1021, 0.0761, 0.0300, ..., -0.1560, -0.0876, -0.0226]], device='cuda:0'), grad: tensor([[ 1.8440e-07, -1.1548e-07, 9.3132e-09, ..., 1.0803e-07, 9.3132e-09, 2.7753e-07], [ 1.0923e-05, -3.9116e-07, 5.2154e-08, ..., 1.8440e-07, 2.7940e-08, 1.6868e-05], [ 2.1141e-06, 3.7253e-08, 1.4901e-08, ..., -7.4692e-07, -1.1362e-07, 3.9712e-06], ..., [ 6.5193e-07, 1.4901e-07, -1.5460e-07, ..., 8.1956e-08, -9.4995e-08, 1.3672e-06], [-1.5616e-05, 2.0117e-07, 6.1467e-08, ..., -3.7253e-09, 8.5682e-08, -2.5213e-05], [ 1.4715e-07, 5.4017e-08, -3.1665e-07, ..., 4.6566e-08, 6.7055e-08, 4.1723e-07]], device='cuda:0') Epoch 197, bias, value: tensor([-0.0063, -0.0116, -0.0068, -0.0163, -0.0063, -0.0004, 0.0112, 0.0149, 0.0163, -0.0116], device='cuda:0'), grad: tensor([ 3.9935e-06, 3.0100e-05, 4.1351e-06, 7.3574e-07, -9.6858e-08, 1.7397e-06, -2.2613e-06, 2.4978e-06, -4.1604e-05, 7.0967e-07], device='cuda:0') 100 0.0001 changing lr epoch 196, time 214.94, cls_loss 0.0017 cls_loss_mapping 0.0032 cls_loss_causal 0.5326 re_mapping 0.0062 re_causal 0.0180 /// teacc 98.90 lr 0.00010000 Epoch 198, weight, value: tensor([[-0.2060, 0.0814, -0.0784, ..., -0.0170, -0.1362, -0.1357], [ 0.0194, 0.0766, -0.1083, ..., -0.1051, -0.0327, 0.1010], [ 0.0504, -0.1041, -0.1067, ..., -0.0007, -0.0490, -0.0710], ..., [ 0.0748, -0.0028, 0.1167, ..., 0.0676, 0.1542, -0.0087], [ 0.0932, -0.1506, -0.1200, ..., -0.1927, -0.0537, 0.1570], [-0.1029, 0.0759, 0.0306, ..., -0.1566, -0.0878, -0.0230]], device='cuda:0'), grad: tensor([[ 3.3528e-08, -4.6566e-08, 2.9802e-08, ..., 1.8626e-08, 9.3132e-09, 2.4214e-08], [-2.5332e-07, -3.5018e-07, 1.8068e-07, ..., 1.1921e-07, 6.1467e-08, -8.6240e-07], [ 3.4273e-07, 8.5682e-08, 2.1793e-07, ..., 1.2293e-07, 9.6858e-08, 2.7008e-07], ..., [-1.8813e-07, 9.4995e-08, -3.6694e-07, ..., -2.1420e-07, -3.0175e-07, 1.7509e-07], [-2.6058e-06, 3.5390e-08, -9.2760e-07, ..., -1.0114e-06, -9.8720e-08, -2.8852e-06], [ 2.9802e-07, 2.9057e-07, 1.6019e-07, ..., 1.4342e-07, 9.4995e-08, 4.6007e-07]], device='cuda:0') Epoch 198, bias, value: tensor([-0.0061, -0.0121, -0.0060, -0.0163, -0.0048, -0.0005, 0.0115, 0.0140, 0.0162, -0.0121], device='cuda:0'), grad: tensor([ 6.3330e-08, -1.5069e-06, 8.4378e-07, -7.5996e-06, -2.6077e-08, 1.2666e-05, 4.0978e-08, -1.1176e-07, -5.6997e-06, 1.3094e-06], device='cuda:0') 100 0.0001 changing lr epoch 197, time 214.66, cls_loss 0.0016 cls_loss_mapping 0.0029 cls_loss_causal 0.5187 re_mapping 0.0059 re_causal 0.0184 /// teacc 99.06 lr 0.00010000 Epoch 199, weight, value: tensor([[-0.2064, 0.0817, -0.0788, ..., -0.0176, -0.1369, -0.1363], [ 0.0193, 0.0775, -0.1086, ..., -0.1054, -0.0329, 0.1015], [ 0.0505, -0.1046, -0.1070, ..., -0.0007, -0.0489, -0.0713], ..., [ 0.0750, -0.0028, 0.1172, ..., 0.0677, 0.1544, -0.0088], [ 0.0927, -0.1526, -0.1205, ..., -0.1932, -0.0541, 0.1566], [-0.1036, 0.0758, 0.0307, ..., -0.1576, -0.0883, -0.0235]], device='cuda:0'), grad: tensor([[ 1.0058e-07, -5.9977e-07, 9.4995e-08, ..., 5.9605e-08, 9.1270e-08, 1.3039e-08], [ 1.0617e-07, -1.1995e-06, 5.6624e-07, ..., 2.9802e-07, 4.3586e-07, -2.3153e-06], [-4.0507e-04, 7.0594e-07, -2.0373e-04, ..., -1.0914e-04, -2.5249e-04, 1.1008e-06], ..., [ 4.0245e-04, 2.9616e-07, 2.0146e-04, ..., 1.0771e-04, 2.5105e-04, 2.3842e-07], [ 7.6182e-07, 2.1234e-07, 6.8918e-07, ..., 4.4703e-07, 2.6822e-07, 1.0245e-07], [ 2.7195e-07, -2.4214e-08, -6.6496e-07, ..., 7.6368e-08, 7.6368e-08, 4.1537e-07]], device='cuda:0') Epoch 199, bias, value: tensor([-0.0064, -0.0119, -0.0060, -0.0163, -0.0048, -0.0002, 0.0117, 0.0141, 0.0154, -0.0123], device='cuda:0'), grad: tensor([-7.4692e-07, -3.2205e-06, -4.4203e-04, 1.9185e-06, 7.6927e-07, 9.9652e-07, -4.6194e-07, 4.4060e-04, 2.0135e-06, 2.5146e-07], device='cuda:0') 100 0.0001 changing lr epoch 198, time 214.99, cls_loss 0.0016 cls_loss_mapping 0.0023 cls_loss_causal 0.5206 re_mapping 0.0058 re_causal 0.0182 /// teacc 98.98 lr 0.00010000 Epoch 200, weight, value: tensor([[-0.2071, 0.0817, -0.0803, ..., -0.0177, -0.1377, -0.1373], [ 0.0178, 0.0773, -0.1095, ..., -0.1056, -0.0335, 0.1006], [ 0.0512, -0.1042, -0.1069, ..., -0.0005, -0.0488, -0.0682], ..., [ 0.0755, -0.0029, 0.1180, ..., 0.0677, 0.1549, -0.0084], [ 0.0926, -0.1530, -0.1208, ..., -0.1937, -0.0544, 0.1567], [-0.1054, 0.0758, 0.0305, ..., -0.1580, -0.0897, -0.0242]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 0.0000e+00, 1.8626e-08, ..., 5.5879e-09, 0.0000e+00, 3.1665e-08], [-2.2352e-07, -3.2969e-07, 4.6566e-08, ..., 1.3039e-08, 1.6764e-08, -2.5239e-06], [ 1.4901e-08, 5.4017e-08, 2.6077e-08, ..., 9.3132e-09, 3.7253e-09, 1.4342e-07], ..., [-2.6077e-08, 1.5460e-07, -1.1176e-08, ..., -7.4506e-09, -3.7253e-08, 3.2783e-07], [ 1.6950e-07, 3.2783e-07, 9.8720e-08, ..., 1.4901e-08, 1.8626e-09, 1.5274e-06], [ 2.4214e-08, 2.3134e-06, -1.8254e-07, ..., 1.4901e-08, 7.4506e-09, 1.3076e-06]], device='cuda:0') Epoch 200, bias, value: tensor([-0.0067, -0.0133, -0.0046, -0.0158, -0.0047, -0.0007, 0.0120, 0.0144, 0.0153, -0.0125], device='cuda:0'), grad: tensor([ 1.1548e-07, -3.6173e-06, 3.2596e-07, 7.0296e-06, -5.8860e-06, -1.2249e-05, 3.1535e-06, 7.0035e-07, 2.9225e-06, 7.5325e-06], device='cuda:0') 100 0.0001 changing lr epoch 199, time 214.87, cls_loss 0.0022 cls_loss_mapping 0.0048 cls_loss_causal 0.5339 re_mapping 0.0059 re_causal 0.0176 /// teacc 99.04 lr 0.00010000 Epoch 201, weight, value: tensor([[-0.2092, 0.0791, -0.0835, ..., -0.0194, -0.1401, -0.1396], [ 0.0183, 0.0789, -0.1096, ..., -0.1060, -0.0334, 0.1020], [ 0.0513, -0.1053, -0.1071, ..., -0.0005, -0.0488, -0.0696], ..., [ 0.0756, -0.0034, 0.1183, ..., 0.0678, 0.1550, -0.0086], [ 0.0944, -0.1545, -0.1217, ..., -0.1944, -0.0547, 0.1571], [-0.1064, 0.0776, 0.0308, ..., -0.1591, -0.0903, -0.0248]], device='cuda:0'), grad: tensor([[ 2.4214e-08, -5.7742e-08, 9.8720e-08, ..., 6.1467e-08, 5.5879e-09, 2.6077e-08], [ 1.2107e-07, -1.7323e-07, 2.2165e-07, ..., 1.8068e-07, 8.1956e-08, -3.2037e-07], [ 2.9802e-08, 1.0803e-07, 3.0175e-07, ..., -7.0781e-08, 7.4506e-09, 1.0803e-07], ..., [-2.1048e-07, 1.7136e-07, -4.0978e-08, ..., 1.7509e-07, -2.3469e-07, 1.0990e-07], [-1.6540e-06, 1.6391e-07, 1.8068e-07, ..., 5.1409e-07, 9.3132e-09, -1.7136e-06], [ 1.4398e-06, 7.8231e-08, -1.3039e-08, ..., 3.7812e-07, 2.7940e-08, 1.5385e-06]], device='cuda:0') Epoch 201, bias, value: tensor([-0.0094, -0.0124, -0.0050, -0.0155, -0.0049, -0.0024, 0.0104, 0.0145, 0.0176, -0.0113], device='cuda:0'), grad: tensor([ 2.2724e-07, -9.8720e-08, 2.8498e-07, -2.1774e-06, 1.2480e-07, 7.7859e-07, -3.1106e-07, 6.6869e-07, -2.8312e-06, 3.3155e-06], device='cuda:0') 100 0.0001 changing lr epoch 200, time 215.21, cls_loss 0.0019 cls_loss_mapping 0.0027 cls_loss_causal 0.5164 re_mapping 0.0057 re_causal 0.0174 /// teacc 99.00 lr 0.00010000 Epoch 202, weight, value: tensor([[-0.2102, 0.0793, -0.0847, ..., -0.0197, -0.1416, -0.1403], [ 0.0178, 0.0790, -0.1100, ..., -0.1069, -0.0340, 0.1023], [ 0.0514, -0.1060, -0.1074, ..., -0.0005, -0.0487, -0.0695], ..., [ 0.0759, -0.0034, 0.1187, ..., 0.0682, 0.1554, -0.0087], [ 0.0945, -0.1552, -0.1222, ..., -0.1955, -0.0549, 0.1570], [-0.1071, 0.0774, 0.0316, ..., -0.1603, -0.0908, -0.0250]], device='cuda:0'), grad: tensor([[ 9.3132e-09, -2.6822e-07, 1.3039e-08, ..., 7.4506e-09, 9.3132e-09, 5.5879e-09], [ 4.8429e-08, 5.7742e-08, 8.1956e-08, ..., 1.3039e-08, 6.5193e-08, -5.2154e-08], [-2.9802e-08, 3.1665e-08, 3.3528e-08, ..., -6.8918e-08, -1.3039e-08, 2.0489e-08], ..., [-1.7472e-06, 4.6566e-08, -2.3115e-06, ..., 1.8626e-08, -2.1756e-06, 6.8918e-08], [ 7.4506e-08, 2.6077e-08, 1.2293e-07, ..., 3.7253e-09, 1.0617e-07, -5.5879e-09], [ 1.5181e-06, 1.1921e-07, 1.7062e-06, ..., 1.6764e-08, 1.8682e-06, 4.0978e-08]], device='cuda:0') Epoch 202, bias, value: tensor([-0.0096, -0.0128, -0.0046, -0.0159, -0.0047, -0.0022, 0.0115, 0.0147, 0.0170, -0.0118], device='cuda:0'), grad: tensor([-4.0233e-07, 2.1979e-07, -4.5449e-07, 1.3784e-07, -1.0990e-07, 1.2480e-07, 1.4715e-07, -3.1292e-06, 2.8685e-07, 3.2093e-06], device='cuda:0') 100 0.0001 changing lr epoch 201, time 214.77, cls_loss 0.0027 cls_loss_mapping 0.0036 cls_loss_causal 0.5296 re_mapping 0.0061 re_causal 0.0178 /// teacc 98.90 lr 0.00010000 Epoch 203, weight, value: tensor([[-0.2109, 0.0792, -0.0854, ..., -0.0202, -0.1426, -0.1407], [ 0.0140, 0.0789, -0.1142, ..., -0.1099, -0.0383, 0.0999], [ 0.0509, -0.1065, -0.1086, ..., -0.0010, -0.0490, -0.0699], ..., [ 0.0801, -0.0033, 0.1238, ..., 0.0708, 0.1596, -0.0054], [ 0.0936, -0.1540, -0.1252, ..., -0.1986, -0.0580, 0.1580], [-0.1094, 0.0771, 0.0307, ..., -0.1629, -0.0933, -0.0262]], device='cuda:0'), grad: tensor([[ 9.3691e-07, 3.2615e-06, 3.7253e-09, ..., 0.0000e+00, 3.7253e-09, 3.5558e-06], [-5.4948e-07, -7.9125e-06, 9.3132e-09, ..., 3.7253e-09, -5.7593e-06, -1.4119e-05], [ 1.0580e-06, 2.3544e-06, 1.1176e-08, ..., 1.8626e-09, 5.9232e-07, 4.4741e-06], ..., [ 2.6338e-06, 6.1095e-07, -5.5879e-09, ..., -3.7253e-09, 5.0142e-06, 9.8646e-06], [-6.9402e-06, -8.1137e-06, -1.9372e-07, ..., 1.8626e-09, -1.1176e-08, -1.6153e-05], [ 7.0408e-07, 9.2201e-07, 2.0489e-08, ..., 1.8626e-09, 2.7940e-08, 1.2536e-06]], device='cuda:0') Epoch 203, bias, value: tensor([-0.0102, -0.0172, -0.0047, -0.0168, -0.0039, -0.0022, 0.0114, 0.0199, 0.0168, -0.0127], device='cuda:0'), grad: tensor([ 1.2212e-05, -3.4153e-05, 9.6112e-06, 8.9630e-06, 1.3754e-05, 2.8610e-06, 3.9190e-06, 1.7539e-05, -3.8922e-05, 4.2208e-06], device='cuda:0') 100 0.0001 changing lr epoch 202, time 214.70, cls_loss 0.0013 cls_loss_mapping 0.0028 cls_loss_causal 0.5071 re_mapping 0.0065 re_causal 0.0194 /// teacc 98.99 lr 0.00010000 Epoch 204, weight, value: tensor([[-0.2109, 0.0794, -0.0854, ..., -0.0203, -0.1428, -0.1408], [ 0.0142, 0.0800, -0.1142, ..., -0.1100, -0.0380, 0.1004], [ 0.0511, -0.1069, -0.1087, ..., -0.0009, -0.0489, -0.0700], ..., [ 0.0800, -0.0046, 0.1240, ..., 0.0707, 0.1595, -0.0059], [ 0.0939, -0.1542, -0.1251, ..., -0.1989, -0.0580, 0.1584], [-0.1115, 0.0772, 0.0305, ..., -0.1642, -0.0943, -0.0266]], device='cuda:0'), grad: tensor([[ 3.5390e-08, -2.9299e-06, 2.2352e-08, ..., 2.0489e-08, 1.4901e-08, 9.3132e-09], [ 1.3411e-07, -3.3155e-07, 1.6578e-07, ..., 1.5087e-07, 8.7544e-08, -8.6240e-07], [ 3.2596e-07, 1.3039e-07, 3.2037e-07, ..., 2.1979e-07, 3.6135e-07, 2.2352e-07], ..., [-7.9162e-07, 5.5879e-08, -7.5065e-07, ..., -6.6310e-07, -8.4750e-07, 2.2165e-07], [ 2.2165e-07, 6.4261e-07, 1.7881e-07, ..., 3.7812e-07, 8.5682e-08, 8.5682e-08], [ 6.3330e-08, 1.8068e-07, -1.1362e-07, ..., 1.7323e-07, 7.0781e-08, 1.1176e-07]], device='cuda:0') Epoch 204, bias, value: tensor([-0.0101, -0.0172, -0.0042, -0.0169, -0.0040, -0.0023, 0.0118, 0.0198, 0.0167, -0.0130], device='cuda:0'), grad: tensor([-5.4650e-06, -1.4622e-06, 1.0133e-06, 2.2054e-06, 8.5682e-07, -4.2804e-06, 3.8557e-07, -8.4378e-07, 6.5342e-06, 1.0636e-06], device='cuda:0') 100 0.0001 changing lr epoch 203, time 214.72, cls_loss 0.0040 cls_loss_mapping 0.0045 cls_loss_causal 0.5233 re_mapping 0.0059 re_causal 0.0172 /// teacc 99.03 lr 0.00010000 Epoch 205, weight, value: tensor([[-0.2115, 0.0798, -0.0860, ..., -0.0201, -0.1432, -0.1408], [ 0.0112, 0.0807, -0.1158, ..., -0.1101, -0.0402, 0.0976], [ 0.0510, -0.1077, -0.1090, ..., -0.0009, -0.0490, -0.0713], ..., [ 0.0828, -0.0060, 0.1250, ..., 0.0707, 0.1614, -0.0029], [ 0.0941, -0.1548, -0.1252, ..., -0.1993, -0.0582, 0.1588], [-0.1127, 0.0787, 0.0325, ..., -0.1656, -0.0938, -0.0273]], device='cuda:0'), grad: tensor([[ 8.1956e-08, -1.8906e-06, -1.2070e-06, ..., -2.2054e-06, 5.9605e-08, 3.5204e-07], [ 3.1292e-07, 5.1223e-07, 6.1467e-08, ..., 1.1176e-07, 1.0803e-07, 2.4773e-07], [-3.1218e-06, 3.6322e-07, -5.8673e-07, ..., -3.2634e-06, -4.2953e-06, 9.4436e-07], ..., [ 4.2953e-06, 1.3821e-06, 7.8045e-07, ..., 3.2280e-06, 4.1649e-06, 1.3188e-06], [-4.0568e-06, 4.7311e-07, 1.5087e-07, ..., 1.5274e-07, -6.2212e-07, -4.2953e-06], [ 1.0990e-07, 2.7511e-06, 6.6310e-07, ..., 1.5795e-06, 4.0978e-08, 5.1223e-07]], device='cuda:0') Epoch 205, bias, value: tensor([-0.0097, -0.0202, -0.0045, -0.0167, -0.0048, -0.0024, 0.0114, 0.0228, 0.0167, -0.0119], device='cuda:0'), grad: tensor([-1.0155e-05, 2.5481e-06, -6.6757e-06, 5.3197e-06, -1.3106e-05, 1.5050e-06, 9.5740e-07, 1.5497e-05, -8.5980e-06, 1.2636e-05], device='cuda:0') 100 0.0001 changing lr epoch 204, time 214.82, cls_loss 0.0021 cls_loss_mapping 0.0030 cls_loss_causal 0.4850 re_mapping 0.0061 re_causal 0.0167 /// teacc 98.86 lr 0.00010000 Epoch 206, weight, value: tensor([[-0.2120, 0.0831, -0.0854, ..., -0.0202, -0.1439, -0.1407], [ 0.0113, 0.0813, -0.1149, ..., -0.1090, -0.0404, 0.0979], [ 0.0508, -0.1082, -0.1096, ..., -0.0010, -0.0492, -0.0720], ..., [ 0.0829, -0.0073, 0.1242, ..., 0.0702, 0.1617, -0.0032], [ 0.0943, -0.1554, -0.1252, ..., -0.2000, -0.0583, 0.1590], [-0.1142, 0.0767, 0.0334, ..., -0.1664, -0.0940, -0.0275]], device='cuda:0'), grad: tensor([[-1.8626e-08, -7.5623e-07, 3.7253e-09, ..., 5.5879e-08, 5.0291e-08, 7.2643e-08], [ 9.6858e-08, -1.3411e-07, 1.1176e-08, ..., 1.3784e-07, 1.3784e-07, -4.3586e-07], [-3.2056e-06, 1.0245e-07, -1.4715e-07, ..., -3.0622e-06, -3.0156e-06, 2.9802e-08], ..., [ 1.6894e-06, 1.5832e-07, 7.8231e-08, ..., 1.5888e-06, 1.5534e-06, 2.5705e-07], [ 6.8545e-07, 2.1234e-07, 5.5879e-08, ..., 6.4448e-07, 6.3702e-07, 3.9116e-08], [ 9.6858e-08, 6.6496e-07, -3.7253e-08, ..., 5.0291e-08, 5.7742e-08, 5.9605e-08]], device='cuda:0') Epoch 206, bias, value: tensor([-0.0064, -0.0198, -0.0050, -0.0171, -0.0054, -0.0023, 0.0111, 0.0224, 0.0167, -0.0139], device='cuda:0'), grad: tensor([-1.2182e-06, -1.7323e-07, -8.4043e-06, 2.5332e-06, -1.4398e-06, -2.4401e-06, 1.4156e-06, 5.0887e-06, 2.7269e-06, 1.8999e-06], device='cuda:0') 100 0.0001 changing lr epoch 205, time 214.92, cls_loss 0.0016 cls_loss_mapping 0.0034 cls_loss_causal 0.4859 re_mapping 0.0064 re_causal 0.0187 /// teacc 98.91 lr 0.00010000 Epoch 207, weight, value: tensor([[-0.2121, 0.0811, -0.0857, ..., -0.0202, -0.1441, -0.1409], [ 0.0114, 0.0823, -0.1148, ..., -0.1091, -0.0403, 0.0981], [ 0.0509, -0.1092, -0.1097, ..., -0.0011, -0.0491, -0.0722], ..., [ 0.0828, -0.0084, 0.1241, ..., 0.0702, 0.1616, -0.0033], [ 0.0943, -0.1557, -0.1253, ..., -0.2003, -0.0583, 0.1590], [-0.1145, 0.0784, 0.0338, ..., -0.1672, -0.0943, -0.0279]], device='cuda:0'), grad: tensor([[ 7.4506e-09, -3.5390e-08, 3.7253e-09, ..., 3.7253e-09, 7.4506e-09, 7.4506e-09], [-5.9791e-07, -9.0525e-07, 3.3528e-08, ..., 2.7940e-08, -1.8775e-06, -2.9281e-06], [ 1.5274e-07, 8.5682e-08, 8.1956e-08, ..., 4.8429e-08, 2.4028e-07, 2.7753e-07], ..., [-4.7870e-07, 7.1526e-07, -7.4320e-07, ..., -5.6252e-07, 8.0653e-07, 2.3264e-06], [ 2.0862e-07, 3.9116e-08, 2.0489e-07, ..., 1.6578e-07, 2.1793e-07, -3.3528e-08], [ 5.0291e-08, 8.5682e-08, 1.3039e-08, ..., 1.1176e-08, 1.2480e-07, 1.8626e-07]], device='cuda:0') Epoch 207, bias, value: tensor([-0.0083, -0.0197, -0.0050, -0.0167, -0.0053, -0.0025, 0.0112, 0.0223, 0.0166, -0.0123], device='cuda:0'), grad: tensor([ 6.3330e-08, -7.5549e-06, 8.2888e-07, 7.7672e-07, 1.2480e-07, 2.5332e-07, -3.1851e-07, 4.8205e-06, 4.3586e-07, 5.6438e-07], device='cuda:0') 100 0.0001 changing lr epoch 206, time 214.82, cls_loss 0.0016 cls_loss_mapping 0.0039 cls_loss_causal 0.5377 re_mapping 0.0063 re_causal 0.0178 /// teacc 98.92 lr 0.00010000 Epoch 208, weight, value: tensor([[-0.2128, 0.0815, -0.0863, ..., -0.0232, -0.1447, -0.1403], [ 0.0114, 0.0831, -0.1150, ..., -0.1093, -0.0405, 0.0982], [ 0.0528, -0.1090, -0.1077, ..., 0.0005, -0.0476, -0.0729], ..., [ 0.0824, -0.0092, 0.1236, ..., 0.0691, 0.1612, -0.0033], [ 0.0941, -0.1561, -0.1255, ..., -0.2016, -0.0584, 0.1592], [-0.1164, 0.0781, 0.0337, ..., -0.1701, -0.0952, -0.0288]], device='cuda:0'), grad: tensor([[ 2.0489e-08, 1.3039e-08, 5.7742e-08, ..., 1.4901e-08, 4.2841e-08, 1.6764e-08], [ 1.3597e-07, -1.7509e-07, 2.7195e-07, ..., 7.6368e-08, 1.8440e-07, -4.7684e-07], [ 2.3842e-07, 7.0781e-08, 2.6636e-07, ..., 9.6858e-08, 2.1048e-07, 2.8312e-07], ..., [-8.0690e-06, 1.8999e-07, -1.7390e-05, ..., -3.7346e-06, -1.7703e-05, 2.7195e-07], [-6.8918e-08, -9.5926e-07, 3.2224e-07, ..., 8.0094e-08, 2.3842e-07, -4.7348e-06], [ 7.5437e-06, -1.1303e-05, 1.0550e-05, ..., 3.4980e-06, 8.8662e-06, 3.1162e-06]], device='cuda:0') Epoch 208, bias, value: tensor([-0.0082, -0.0197, -0.0034, -0.0187, -0.0055, -0.0003, 0.0114, 0.0221, 0.0164, -0.0128], device='cuda:0'), grad: tensor([ 1.6764e-07, -1.8254e-07, 1.1064e-06, -1.1817e-05, 4.5300e-05, 1.5467e-05, 2.2091e-06, -4.2379e-05, -1.3463e-05, 3.5949e-06], device='cuda:0') 100 0.0001 changing lr epoch 207, time 214.92, cls_loss 0.0022 cls_loss_mapping 0.0036 cls_loss_causal 0.5284 re_mapping 0.0058 re_causal 0.0175 /// teacc 98.99 lr 0.00010000 Epoch 209, weight, value: tensor([[-0.2131, 0.0816, -0.0868, ..., -0.0233, -0.1448, -0.1406], [ 0.0113, 0.0831, -0.1150, ..., -0.1095, -0.0405, 0.0983], [ 0.0529, -0.1103, -0.1079, ..., 0.0004, -0.0476, -0.0736], ..., [ 0.0821, -0.0110, 0.1221, ..., 0.0662, 0.1609, -0.0034], [ 0.0946, -0.1564, -0.1257, ..., -0.2020, -0.0584, 0.1603], [-0.1160, 0.0793, 0.0367, ..., -0.1706, -0.0929, -0.0282]], device='cuda:0'), grad: tensor([[ 1.6764e-08, -7.4506e-08, 1.3039e-08, ..., 9.3132e-09, 5.5879e-09, 5.5879e-09], [ 1.6391e-07, 1.6950e-07, 7.6368e-08, ..., 4.8429e-08, 5.4017e-08, -1.2293e-07], [ 2.6077e-07, 1.0431e-07, 2.3097e-07, ..., 1.1735e-07, 2.1793e-07, 5.0291e-08], ..., [-9.5740e-07, 1.0245e-07, -7.4878e-07, ..., -4.8429e-07, -7.5996e-07, 8.7544e-08], [ 1.1921e-07, 3.0361e-07, 1.0058e-07, ..., 8.3819e-08, 8.9407e-08, -2.6077e-07], [ 2.9057e-06, 1.4585e-06, -1.3784e-07, ..., 4.2841e-08, 9.4995e-08, 2.1048e-07]], device='cuda:0') Epoch 209, bias, value: tensor([-0.0082, -0.0197, -0.0037, -0.0172, -0.0072, -0.0003, 0.0111, 0.0215, 0.0167, -0.0109], device='cuda:0'), grad: tensor([ 3.7253e-09, 6.5006e-07, 8.0280e-07, 3.8818e-06, 6.2585e-06, 7.5996e-06, 8.0392e-06, -1.2219e-06, 1.3299e-06, -2.7344e-05], device='cuda:0') 100 0.0001 changing lr epoch 208, time 215.02, cls_loss 0.0019 cls_loss_mapping 0.0035 cls_loss_causal 0.5111 re_mapping 0.0059 re_causal 0.0165 /// teacc 98.92 lr 0.00010000 Epoch 210, weight, value: tensor([[-0.2140, 0.0817, -0.0878, ..., -0.0234, -0.1459, -0.1414], [ 0.0113, 0.0857, -0.1151, ..., -0.1098, -0.0404, 0.0987], [ 0.0531, -0.1108, -0.1082, ..., 0.0003, -0.0475, -0.0741], ..., [ 0.0821, -0.0120, 0.1221, ..., 0.0660, 0.1609, -0.0035], [ 0.0953, -0.1569, -0.1259, ..., -0.2022, -0.0585, 0.1617], [-0.1165, 0.0797, 0.0375, ..., -0.1711, -0.0931, -0.0274]], device='cuda:0'), grad: tensor([[ 1.1921e-07, 9.3132e-10, 1.8626e-08, ..., 1.5832e-07, 8.8476e-08, 6.3330e-08], [ 2.7083e-06, 2.4773e-07, 2.1476e-06, ..., 1.8533e-06, 2.2408e-06, -2.2817e-07], [-1.5711e-06, 1.8440e-07, 5.3737e-07, ..., -2.7400e-06, -1.0794e-06, 3.0082e-07], ..., [-4.1127e-06, -1.2834e-06, -4.1500e-06, ..., -2.2538e-06, -3.4645e-06, -1.5618e-06], [ 7.1805e-07, 2.5611e-07, 4.0513e-07, ..., 7.4133e-07, 5.6624e-07, 2.9430e-07], [ 3.6974e-07, 1.7416e-07, 2.4214e-07, ..., 3.1292e-07, 3.1572e-07, 2.4308e-07]], device='cuda:0') Epoch 210, bias, value: tensor([-0.0080, -0.0194, -0.0037, -0.0171, -0.0081, -0.0004, 0.0096, 0.0214, 0.0172, -0.0103], device='cuda:0'), grad: tensor([ 5.0012e-07, 2.2743e-06, -6.1840e-06, 4.1649e-06, 1.4417e-06, -9.7789e-07, 1.3737e-06, -6.2175e-06, 2.3190e-06, 1.2722e-06], device='cuda:0') 100 0.0001 changing lr epoch 209, time 214.96, cls_loss 0.0015 cls_loss_mapping 0.0027 cls_loss_causal 0.5111 re_mapping 0.0061 re_causal 0.0177 /// teacc 99.08 lr 0.00010000 Epoch 211, weight, value: tensor([[-2.1572e-01, 8.1682e-02, -8.8811e-02, ..., -2.4072e-02, -1.4668e-01, -1.4369e-01], [ 1.1355e-02, 8.6367e-02, -1.1516e-01, ..., -1.0995e-01, -4.0405e-02, 9.8854e-02], [ 5.3085e-02, -1.1158e-01, -1.0865e-01, ..., -1.0879e-05, -4.7602e-02, -7.4325e-02], ..., [ 8.2124e-02, -1.2542e-02, 1.2229e-01, ..., 6.6095e-02, 1.6103e-01, -3.5835e-03], [ 9.5953e-02, -1.5709e-01, -1.2617e-01, ..., -2.0381e-01, -5.8390e-02, 1.6284e-01], [-1.1716e-01, 7.9502e-02, 3.7543e-02, ..., -1.7193e-01, -9.3474e-02, -2.7964e-02]], device='cuda:0'), grad: tensor([[ 3.0734e-08, 1.1623e-06, 6.8918e-08, ..., 8.8476e-08, 2.4214e-08, 4.9639e-07], [ 2.8964e-07, 6.5565e-06, 9.2760e-07, ..., 2.7474e-07, 1.4715e-07, 7.7784e-06], [-3.2745e-06, 3.0827e-07, 5.9605e-08, ..., -2.8685e-06, -2.0936e-06, 3.7625e-07], ..., [ 1.1250e-06, 1.2023e-06, 3.4925e-07, ..., 9.9093e-07, 9.0059e-07, 1.5721e-06], [ 1.2433e-06, 1.5765e-05, 1.9409e-06, ..., 1.1893e-06, 7.4320e-07, 1.8016e-05], [ 4.9360e-08, -7.0095e-05, -9.8273e-06, ..., 9.3132e-08, -8.2888e-08, -8.7738e-05]], device='cuda:0') Epoch 211, bias, value: tensor([-0.0081, -0.0193, -0.0037, -0.0167, -0.0080, -0.0003, 0.0089, 0.0214, 0.0176, -0.0105], device='cuda:0'), grad: tensor([ 4.0680e-06, 2.2709e-05, -7.7635e-06, 2.7828e-06, 1.5783e-04, -3.0100e-05, 2.2352e-05, 6.9812e-06, 5.6267e-05, -2.3520e-04], device='cuda:0') 100 0.0001 changing lr epoch 210, time 215.14, cls_loss 0.0028 cls_loss_mapping 0.0043 cls_loss_causal 0.5010 re_mapping 0.0060 re_causal 0.0169 /// teacc 98.94 lr 0.00010000 Epoch 212, weight, value: tensor([[-2.1496e-01, 8.0081e-02, -8.9318e-02, ..., -2.4282e-02, -1.4753e-01, -1.4080e-01], [ 9.3222e-03, 8.7015e-02, -1.1683e-01, ..., -1.1007e-01, -4.2170e-02, 9.6724e-02], [ 5.1880e-02, -1.1267e-01, -1.0964e-01, ..., -7.5343e-05, -4.8630e-02, -7.8786e-02], ..., [ 8.3954e-02, -1.2307e-02, 1.2379e-01, ..., 6.5576e-02, 1.6281e-01, -1.3846e-03], [ 9.8334e-02, -1.5848e-01, -1.2403e-01, ..., -2.0133e-01, -5.5925e-02, 1.6416e-01], [-1.1890e-01, 8.0957e-02, 3.7209e-02, ..., -1.7360e-01, -9.4773e-02, -2.8545e-02]], device='cuda:0'), grad: tensor([[ 2.7940e-09, -1.3269e-05, 1.7695e-08, ..., -2.2445e-06, 1.8626e-09, 5.5879e-09], [-5.4017e-08, -1.1921e-07, 1.4901e-08, ..., 1.3039e-08, 1.8626e-09, -6.2026e-07], [-2.7940e-09, 5.3644e-07, 4.1910e-08, ..., 5.4948e-08, -2.7008e-08, 4.9360e-08], ..., [ 3.8184e-08, 3.0082e-07, -0.0000e+00, ..., 3.0734e-08, -0.0000e+00, 4.9733e-07], [-5.9605e-08, 2.5705e-07, 7.1712e-08, ..., 2.1420e-08, -9.3132e-10, -5.0291e-08], [ 5.5879e-09, 1.1407e-05, -6.1467e-07, ..., 1.9725e-06, 1.8626e-09, 2.6450e-07]], device='cuda:0') Epoch 212, bias, value: tensor([-0.0097, -0.0212, -0.0055, -0.0169, -0.0080, -0.0001, 0.0090, 0.0232, 0.0183, -0.0093], device='cuda:0'), grad: tensor([-1.6922e-06, -5.9884e-07, 1.0878e-06, 1.6335e-06, -1.3970e-07, 2.7120e-06, -2.4840e-05, 9.0618e-07, 7.4320e-07, 2.0117e-05], device='cuda:0') 100 0.0001 changing lr epoch 211, time 214.86, cls_loss 0.0026 cls_loss_mapping 0.0042 cls_loss_causal 0.5133 re_mapping 0.0058 re_causal 0.0168 /// teacc 98.99 lr 0.00010000 Epoch 213, weight, value: tensor([[-0.2155, 0.0803, -0.0910, ..., -0.0237, -0.1500, -0.1407], [ 0.0092, 0.0893, -0.1175, ..., -0.1120, -0.0422, 0.0968], [ 0.0523, -0.1149, -0.1098, ..., 0.0004, -0.0481, -0.0799], ..., [ 0.0839, -0.0161, 0.1233, ..., 0.0644, 0.1624, -0.0014], [ 0.0983, -0.1598, -0.1245, ..., -0.2020, -0.0561, 0.1645], [-0.1207, 0.0812, 0.0392, ..., -0.1767, -0.0933, -0.0291]], device='cuda:0'), grad: tensor([[ 1.7695e-07, 1.3195e-05, 3.2596e-08, ..., 6.5193e-09, 1.3970e-08, 2.0862e-07], [ 1.8869e-06, 3.0547e-07, 1.3411e-07, ..., 2.4401e-07, 8.1025e-08, 1.4734e-06], [-9.1046e-06, 2.5425e-07, 2.3656e-07, ..., -5.8562e-06, -5.7630e-06, 1.5246e-06], ..., [ 9.5293e-06, 5.8021e-07, -8.3167e-07, ..., 5.0627e-06, 4.9174e-06, 1.3327e-06], [-4.8093e-06, 2.9191e-05, 2.2538e-07, ..., 2.6450e-07, 9.1270e-08, -6.2436e-06], [ 8.8476e-08, 1.1645e-05, 7.3574e-08, ..., 9.4064e-08, 4.0978e-08, 2.0023e-07]], device='cuda:0') Epoch 213, bias, value: tensor([-0.0095, -0.0211, -0.0056, -0.0157, -0.0080, -0.0004, 0.0089, 0.0229, 0.0181, -0.0088], device='cuda:0'), grad: tensor([ 1.2267e-04, 7.6741e-06, -1.0386e-05, 5.1737e-05, 5.1409e-07, -7.3004e-04, 3.4094e-04, 1.7092e-05, 1.7202e-04, 2.7552e-05], device='cuda:0') 100 0.0001 changing lr epoch 212, time 214.89, cls_loss 0.0016 cls_loss_mapping 0.0035 cls_loss_causal 0.5215 re_mapping 0.0058 re_causal 0.0178 /// teacc 98.93 lr 0.00010000 Epoch 214, weight, value: tensor([[-2.1595e-01, 8.0279e-02, -9.1560e-02, ..., -2.5181e-02, -1.5106e-01, -1.4096e-01], [ 9.2960e-03, 8.9657e-02, -1.1715e-01, ..., -1.1157e-01, -4.1954e-02, 9.6957e-02], [ 5.1385e-02, -1.1609e-01, -1.1120e-01, ..., -8.3849e-05, -4.9176e-02, -8.0157e-02], ..., [ 8.4175e-02, -1.6419e-02, 1.2342e-01, ..., 6.4589e-02, 1.6275e-01, -1.4949e-03], [ 9.8528e-02, -1.6101e-01, -1.2455e-01, ..., -2.0221e-01, -5.6079e-02, 1.6502e-01], [-1.2168e-01, 8.0904e-02, 3.9426e-02, ..., -1.7768e-01, -9.3383e-02, -2.9649e-02]], device='cuda:0'), grad: tensor([[ 8.3819e-09, -4.9360e-08, 1.8626e-09, ..., 3.7253e-09, 9.3132e-10, 3.7253e-09], [-1.3970e-08, -1.8347e-07, 1.3970e-08, ..., 1.9558e-08, 7.4506e-09, -3.2783e-07], [-4.9639e-07, 1.4901e-08, 5.0291e-08, ..., -3.3248e-07, -3.1386e-07, 2.2352e-08], ..., [ 3.8091e-07, 1.2107e-07, -5.1223e-08, ..., 2.4680e-07, 2.8964e-07, 1.7323e-07], [ 1.3970e-08, 1.5367e-07, 8.0094e-08, ..., 6.8918e-08, 1.0245e-08, 1.2107e-08], [ 8.3819e-09, 9.3132e-08, -7.4506e-08, ..., 7.4506e-09, -1.4901e-08, 7.2643e-08]], device='cuda:0') Epoch 214, bias, value: tensor([-0.0096, -0.0210, -0.0064, -0.0157, -0.0079, 0.0004, 0.0088, 0.0229, 0.0179, -0.0091], device='cuda:0'), grad: tensor([-2.4214e-08, -4.2282e-07, -9.4622e-07, 1.3784e-07, -8.2888e-08, -6.5565e-07, 3.1944e-07, 9.7044e-07, 5.2527e-07, 1.9837e-07], device='cuda:0') 100 0.0001 changing lr epoch 213, time 214.89, cls_loss 0.0016 cls_loss_mapping 0.0033 cls_loss_causal 0.5079 re_mapping 0.0059 re_causal 0.0176 /// teacc 99.04 lr 0.00010000 Epoch 215, weight, value: tensor([[-0.2164, 0.0808, -0.0922, ..., -0.0252, -0.1520, -0.1398], [ 0.0094, 0.0900, -0.1166, ..., -0.1106, -0.0417, 0.0972], [ 0.0512, -0.1165, -0.1117, ..., -0.0002, -0.0494, -0.0805], ..., [ 0.0842, -0.0184, 0.1231, ..., 0.0643, 0.1627, -0.0018], [ 0.0986, -0.1622, -0.1247, ..., -0.2024, -0.0561, 0.1651], [-0.1222, 0.0807, 0.0397, ..., -0.1783, -0.0934, -0.0288]], device='cuda:0'), grad: tensor([[-0.0000e+00, -5.1223e-08, 4.6566e-09, ..., -9.3132e-10, 1.8626e-09, 1.3039e-08], [ 4.5635e-08, -1.5646e-07, 3.7253e-08, ..., 5.2154e-08, -6.5193e-09, -3.2969e-07], [ 4.0047e-08, 8.3819e-09, 1.1455e-07, ..., 1.7881e-07, 2.4214e-08, 1.8626e-08], ..., [ 1.5832e-08, 9.4995e-08, 1.1828e-07, ..., 2.7474e-07, -1.7695e-08, 2.2631e-07], [-2.8126e-07, 9.3132e-08, 7.4506e-08, ..., 1.1269e-07, 1.2107e-08, -4.8336e-07], [ 2.2352e-08, 2.8871e-08, -4.8429e-08, ..., 6.5193e-09, 4.6566e-09, 8.2888e-08]], device='cuda:0') Epoch 215, bias, value: tensor([-9.1212e-03, -2.0824e-02, -6.5486e-03, -1.5617e-02, -7.6305e-03, 9.7456e-05, 8.8742e-03, 2.2635e-02, 1.7739e-02, -9.2100e-03], device='cuda:0'), grad: tensor([-1.4901e-08, -3.1572e-07, 2.5705e-07, 4.2990e-06, -1.0617e-07, -4.5225e-06, -2.2911e-07, 5.9232e-07, -8.3819e-08, 1.2200e-07], device='cuda:0') 100 0.0001 changing lr epoch 214, time 214.86, cls_loss 0.0021 cls_loss_mapping 0.0032 cls_loss_causal 0.5470 re_mapping 0.0061 re_causal 0.0177 /// teacc 98.95 lr 0.00010000 Epoch 216, weight, value: tensor([[-0.2181, 0.0809, -0.0936, ..., -0.0256, -0.1548, -0.1410], [ 0.0104, 0.0903, -0.1167, ..., -0.1098, -0.0400, 0.0973], [ 0.0506, -0.1172, -0.1109, ..., 0.0019, -0.0504, -0.0811], ..., [ 0.0835, -0.0191, 0.1221, ..., 0.0624, 0.1615, -0.0019], [ 0.0995, -0.1614, -0.1250, ..., -0.2030, -0.0562, 0.1671], [-0.1213, 0.0806, 0.0414, ..., -0.1795, -0.0912, -0.0289]], device='cuda:0'), grad: tensor([[ 4.6566e-09, -7.5717e-07, 1.0338e-07, ..., 5.4948e-08, 3.5390e-08, 4.6566e-09], [ 8.5682e-08, 6.3330e-08, 2.5891e-06, ..., 1.4016e-06, 8.5961e-07, -2.3283e-07], [ 2.1420e-08, 1.4901e-08, 6.3330e-08, ..., 5.0291e-08, -5.7090e-07, 6.5193e-09], ..., [ 1.0416e-05, 3.6716e-05, 5.1165e-04, ..., 2.8229e-04, 9.4414e-05, 9.6858e-08], [ 1.3039e-08, 1.4063e-07, 5.3458e-07, ..., 3.0175e-07, 1.0803e-07, 9.1270e-08], [ 2.1700e-07, 1.4026e-06, 2.3581e-06, ..., 1.3243e-06, 4.6287e-07, 5.2527e-07]], device='cuda:0') Epoch 216, bias, value: tensor([-9.1327e-03, -2.0039e-02, -7.6493e-03, -1.5267e-02, -7.6808e-03, -7.8043e-05, 9.0725e-03, 2.2007e-02, 1.8396e-02, -8.6250e-03], device='cuda:0'), grad: tensor([-1.0272e-06, 4.0457e-06, -1.3206e-06, -6.9094e-04, -1.4203e-06, 4.5169e-07, 7.7114e-07, 6.8283e-04, 9.8441e-07, 5.6922e-06], device='cuda:0') 100 0.0001 changing lr epoch 215, time 215.04, cls_loss 0.0017 cls_loss_mapping 0.0025 cls_loss_causal 0.5197 re_mapping 0.0060 re_causal 0.0173 /// teacc 99.01 lr 0.00010000 Epoch 217, weight, value: tensor([[-0.2187, 0.0811, -0.0943, ..., -0.0257, -0.1555, -0.1411], [ 0.0107, 0.0905, -0.1168, ..., -0.1095, -0.0394, 0.0973], [ 0.0502, -0.1181, -0.1108, ..., 0.0024, -0.0509, -0.0818], ..., [ 0.0835, -0.0193, 0.1218, ..., 0.0617, 0.1614, -0.0019], [ 0.1000, -0.1609, -0.1250, ..., -0.2030, -0.0562, 0.1683], [-0.1221, 0.0803, 0.0415, ..., -0.1806, -0.0914, -0.0297]], device='cuda:0'), grad: tensor([[ 4.5355e-07, -5.4426e-06, 4.2841e-08, ..., 1.9278e-07, 2.0675e-07, 8.3819e-09], [ 2.4401e-07, -3.1106e-07, 2.7567e-07, ..., 2.1607e-07, 1.2200e-07, -9.5088e-07], [-9.8199e-06, 7.7300e-08, 3.9022e-07, ..., -4.0382e-06, -4.2915e-06, -7.0781e-08], ..., [ 3.1516e-06, 1.9185e-07, -6.7763e-06, ..., -5.5321e-07, -9.5181e-07, 5.8860e-07], [ 1.8207e-06, 8.2608e-07, 3.2689e-07, ..., 7.1432e-07, 7.5810e-07, 8.4750e-08], [-2.9802e-08, 1.8887e-06, -3.2317e-07, ..., 1.2200e-07, 2.2165e-07, 2.7753e-07]], device='cuda:0') Epoch 217, bias, value: tensor([-9.0201e-03, -1.9812e-02, -8.2540e-03, -1.5311e-02, -7.3845e-03, 8.2756e-05, 9.5756e-03, 2.1864e-02, 1.8853e-02, -8.9927e-03], device='cuda:0'), grad: tensor([-1.1317e-05, -7.4971e-07, -1.6227e-05, 8.4043e-06, 1.4044e-06, -6.6124e-08, 6.8694e-06, 3.4645e-06, 6.4597e-06, 1.7872e-06], device='cuda:0') 100 0.0001 changing lr epoch 216, time 214.69, cls_loss 0.0012 cls_loss_mapping 0.0020 cls_loss_causal 0.5388 re_mapping 0.0058 re_causal 0.0186 /// teacc 98.89 lr 0.00010000 Epoch 218, weight, value: tensor([[-0.2178, 0.0811, -0.0938, ..., -0.0255, -0.1538, -0.1398], [ 0.0107, 0.0907, -0.1168, ..., -0.1094, -0.0393, 0.0973], [ 0.0505, -0.1188, -0.1104, ..., 0.0026, -0.0508, -0.0827], ..., [ 0.0833, -0.0195, 0.1218, ..., 0.0615, 0.1613, -0.0020], [ 0.1002, -0.1616, -0.1250, ..., -0.2032, -0.0563, 0.1686], [-0.1233, 0.0803, 0.0412, ..., -0.1818, -0.0918, -0.0301]], device='cuda:0'), grad: tensor([[ 1.5832e-08, -6.7987e-08, 4.5635e-08, ..., 4.5635e-08, 1.2107e-08, 2.7940e-09], [ 1.8813e-06, -5.5879e-09, 8.6613e-07, ..., 7.0315e-07, 1.6298e-06, 1.2945e-07], [ 4.7218e-07, 2.3283e-07, 1.8924e-06, ..., 1.4808e-06, 1.9185e-07, 8.1956e-08], ..., [-3.0976e-06, 1.3970e-08, -2.2855e-06, ..., -9.7230e-07, -2.3860e-06, -2.0396e-07], [ 8.3819e-09, 4.3772e-08, 2.4866e-07, ..., 2.4494e-07, 6.3330e-08, -8.2888e-08], [ 2.1514e-07, 6.4261e-08, 1.7788e-07, ..., 1.0617e-07, 1.7602e-07, 2.2352e-08]], device='cuda:0') Epoch 218, bias, value: tensor([-0.0089, -0.0197, -0.0082, -0.0152, -0.0074, 0.0010, 0.0086, 0.0218, 0.0187, -0.0092], device='cuda:0'), grad: tensor([ 1.3970e-08, 3.2410e-06, 2.8927e-06, -3.8296e-06, 6.9290e-07, 7.5903e-07, -8.9314e-07, -3.8818e-06, 5.1595e-07, 4.7963e-07], device='cuda:0') 100 0.0001 changing lr epoch 217, time 214.89, cls_loss 0.0023 cls_loss_mapping 0.0034 cls_loss_causal 0.5245 re_mapping 0.0057 re_causal 0.0175 /// teacc 99.00 lr 0.00010000 Epoch 219, weight, value: tensor([[-0.2202, 0.0811, -0.0958, ..., -0.0259, -0.1548, -0.1401], [ 0.0105, 0.0890, -0.1170, ..., -0.1096, -0.0393, 0.0964], [ 0.0511, -0.1165, -0.1110, ..., 0.0025, -0.0508, -0.0803], ..., [ 0.0825, -0.0203, 0.1212, ..., 0.0616, 0.1606, -0.0024], [ 0.1029, -0.1622, -0.1223, ..., -0.2032, -0.0535, 0.1712], [-0.1246, 0.0804, 0.0414, ..., -0.1824, -0.0923, -0.0305]], device='cuda:0'), grad: tensor([[ 2.1420e-08, -4.0699e-07, 3.6322e-08, ..., 4.6566e-09, 1.0245e-08, 1.5832e-08], [-2.7474e-06, -1.6298e-07, 1.2666e-07, ..., 3.9116e-08, 7.2643e-08, -5.9381e-06], [ 5.6904e-07, 1.1176e-08, 5.6531e-07, ..., 1.8068e-07, 5.0850e-07, 2.0396e-07], ..., [-9.3132e-10, 1.7136e-07, -1.0636e-06, ..., -3.7253e-07, -1.0207e-06, 2.0601e-06], [ 1.6820e-06, 4.9174e-07, 4.2841e-07, ..., 5.4017e-08, 1.6205e-07, 3.0492e-06], [ 1.7602e-07, -9.0245e-07, -2.1029e-06, ..., 3.4459e-08, 8.9407e-08, 1.5087e-07]], device='cuda:0') Epoch 219, bias, value: tensor([-0.0090, -0.0215, -0.0057, -0.0151, -0.0074, 0.0007, 0.0085, 0.0214, 0.0202, -0.0092], device='cuda:0'), grad: tensor([-4.4424e-07, -8.1956e-06, 1.0878e-06, 3.5353e-06, 3.3639e-06, -3.8445e-06, -6.7055e-08, 1.5665e-06, 5.8375e-06, -2.8647e-06], device='cuda:0') 100 0.0001 changing lr epoch 218, time 214.88, cls_loss 0.0020 cls_loss_mapping 0.0026 cls_loss_causal 0.5364 re_mapping 0.0055 re_causal 0.0171 /// teacc 98.95 lr 0.00010000 Epoch 220, weight, value: tensor([[-0.2212, 0.0813, -0.0969, ..., -0.0259, -0.1550, -0.1403], [ 0.0093, 0.0888, -0.1192, ..., -0.1098, -0.0408, 0.0952], [ 0.0511, -0.1160, -0.1115, ..., 0.0024, -0.0509, -0.0798], ..., [ 0.0838, -0.0209, 0.1236, ..., 0.0617, 0.1625, -0.0012], [ 0.1029, -0.1630, -0.1225, ..., -0.2033, -0.0536, 0.1712], [-0.1258, 0.0804, 0.0414, ..., -0.1834, -0.0929, -0.0309]], device='cuda:0'), grad: tensor([[ 7.7393e-07, 1.4622e-07, 5.5879e-09, ..., 6.2399e-08, 1.2107e-08, 7.8883e-07], [-3.1125e-06, -8.4471e-07, 2.7008e-08, ..., 1.0245e-08, -2.2110e-06, -3.3006e-06], [ 1.8617e-06, 4.9081e-07, 2.0582e-07, ..., 5.3085e-08, 1.3141e-06, 1.7863e-06], ..., [ 6.9942e-07, 4.5542e-07, -2.8592e-07, ..., -5.4948e-08, 4.8429e-07, 1.2247e-06], [-1.2023e-06, -2.2631e-07, 2.4214e-08, ..., -1.0431e-07, 8.3819e-08, -1.2619e-06], [ 5.7090e-07, 3.7439e-07, 3.2596e-08, ..., 3.7253e-08, 2.3749e-07, 7.3295e-07]], device='cuda:0') Epoch 220, bias, value: tensor([-0.0090, -0.0228, -0.0052, -0.0154, -0.0077, 0.0009, 0.0085, 0.0228, 0.0200, -0.0092], device='cuda:0'), grad: tensor([ 1.8226e-06, -1.2957e-05, 7.2494e-06, 2.0489e-07, -1.4501e-06, 1.4901e-07, 8.3633e-07, 4.2543e-06, -2.5332e-06, 2.3879e-06], device='cuda:0') 100 0.0001 changing lr epoch 219, time 214.85, cls_loss 0.0017 cls_loss_mapping 0.0031 cls_loss_causal 0.5089 re_mapping 0.0056 re_causal 0.0162 /// teacc 99.05 lr 0.00010000 Epoch 221, weight, value: tensor([[-0.2217, 0.0813, -0.0978, ..., -0.0261, -0.1555, -0.1405], [ 0.0099, 0.0891, -0.1175, ..., -0.1078, -0.0401, 0.0954], [ 0.0511, -0.1161, -0.1118, ..., 0.0025, -0.0509, -0.0799], ..., [ 0.0833, -0.0215, 0.1222, ..., 0.0611, 0.1619, -0.0014], [ 0.1032, -0.1637, -0.1223, ..., -0.2037, -0.0534, 0.1718], [-0.1278, 0.0803, 0.0414, ..., -0.1845, -0.0935, -0.0321]], device='cuda:0'), grad: tensor([[ 5.0291e-08, 6.1877e-06, 7.5437e-08, ..., 1.1176e-08, 3.0734e-08, 1.5926e-07], [ 5.0571e-07, 3.4478e-06, 1.6931e-06, ..., 1.8906e-07, 4.3400e-07, 1.2098e-06], [-1.8468e-06, 6.2492e-07, 2.6263e-07, ..., -7.9721e-07, -1.1632e-06, 3.3714e-07], ..., [-1.5842e-06, -1.6298e-07, -4.1239e-06, ..., 7.3016e-07, -3.3379e-06, 4.8708e-07], [ 4.6566e-08, 1.1809e-06, 5.8580e-07, ..., 2.2259e-07, 6.0536e-08, 3.0175e-07], [ 3.8669e-06, 9.3654e-06, 8.2031e-06, ..., 5.1409e-07, 3.7625e-06, 2.4941e-06]], device='cuda:0') Epoch 221, bias, value: tensor([-0.0090, -0.0221, -0.0052, -0.0158, -0.0077, 0.0009, 0.0089, 0.0219, 0.0200, -0.0094], device='cuda:0'), grad: tensor([ 1.6138e-05, 9.0301e-06, -1.9558e-06, -1.6108e-05, 8.7470e-06, -2.1793e-07, -4.4733e-05, -2.6301e-06, 3.4086e-06, 2.8327e-05], device='cuda:0') 100 0.0001 changing lr epoch 220, time 214.93, cls_loss 0.0012 cls_loss_mapping 0.0026 cls_loss_causal 0.4853 re_mapping 0.0055 re_causal 0.0170 /// teacc 99.01 lr 0.00010000 Epoch 222, weight, value: tensor([[-0.2220, 0.0816, -0.0984, ..., -0.0266, -0.1558, -0.1408], [ 0.0100, 0.0894, -0.1175, ..., -0.1079, -0.0402, 0.0955], [ 0.0511, -0.1161, -0.1122, ..., 0.0023, -0.0509, -0.0799], ..., [ 0.0834, -0.0218, 0.1224, ..., 0.0612, 0.1621, -0.0015], [ 0.1032, -0.1645, -0.1225, ..., -0.2040, -0.0535, 0.1718], [-0.1287, 0.0800, 0.0415, ..., -0.1850, -0.0939, -0.0329]], device='cuda:0'), grad: tensor([[ 5.3085e-08, 3.4459e-08, 1.3970e-08, ..., 3.4459e-08, 3.0734e-08, 2.1420e-08], [-5.5879e-09, -9.2667e-07, 2.2817e-07, ..., 1.0338e-07, 1.7602e-07, -2.5686e-06], [ 2.6915e-07, 4.7497e-08, 3.5483e-07, ..., -7.0781e-08, 6.3330e-08, 9.8720e-08], ..., [-1.7174e-06, 3.4925e-07, -1.3728e-06, ..., -5.6066e-07, -1.0319e-06, 7.0781e-07], [ 7.5437e-08, 4.0699e-07, 1.8813e-07, ..., 2.0675e-07, 2.8033e-07, 7.0315e-07], [ 3.1944e-07, 1.7108e-06, 1.4435e-07, ..., 4.2841e-08, 1.7881e-07, 3.2037e-07]], device='cuda:0') Epoch 222, bias, value: tensor([-0.0088, -0.0220, -0.0052, -0.0160, -0.0077, 0.0005, 0.0096, 0.0219, 0.0198, -0.0097], device='cuda:0'), grad: tensor([ 2.6077e-07, -3.1963e-06, -3.5390e-08, 4.7684e-06, -3.6415e-07, -7.0743e-06, 6.5379e-07, -1.3476e-06, 1.7155e-06, 4.6268e-06], device='cuda:0') 100 0.0001 changing lr epoch 221, time 214.61, cls_loss 0.0017 cls_loss_mapping 0.0024 cls_loss_causal 0.5264 re_mapping 0.0056 re_causal 0.0169 /// teacc 98.92 lr 0.00010000 Epoch 223, weight, value: tensor([[-0.2221, 0.0817, -0.0992, ..., -0.0271, -0.1563, -0.1411], [ 0.0098, 0.0905, -0.1177, ..., -0.1080, -0.0405, 0.0956], [ 0.0511, -0.1162, -0.1124, ..., 0.0024, -0.0509, -0.0800], ..., [ 0.0838, -0.0231, 0.1230, ..., 0.0613, 0.1628, -0.0015], [ 0.1031, -0.1654, -0.1229, ..., -0.2046, -0.0536, 0.1718], [-0.1331, 0.0796, 0.0401, ..., -0.1865, -0.0962, -0.0336]], device='cuda:0'), grad: tensor([[ 1.1083e-07, 9.2201e-08, 8.6613e-08, ..., 1.7695e-08, 6.7987e-08, 1.2759e-07], [-2.4045e-04, -2.7275e-04, -3.1978e-05, ..., 3.1665e-08, -1.6689e-04, -2.5487e-04], [ 5.5600e-07, 1.5274e-07, 4.0792e-07, ..., 8.0094e-08, 4.2096e-07, 3.9861e-07], ..., [ 1.5306e-04, 1.7488e-04, 1.9267e-05, ..., -3.2783e-07, 1.0568e-04, 1.6415e-04], [ 3.6675e-06, 3.4068e-06, 1.1148e-06, ..., 1.0245e-07, 1.8626e-06, 4.6790e-06], [ 7.8797e-05, 8.9884e-05, 9.9242e-06, ..., 1.7602e-07, 5.5909e-05, 8.1658e-05]], device='cuda:0') Epoch 223, bias, value: tensor([-0.0087, -0.0220, -0.0053, -0.0166, -0.0073, 0.0009, 0.0098, 0.0221, 0.0195, -0.0105], device='cuda:0'), grad: tensor([ 3.9302e-07, -6.8665e-04, 1.2266e-06, 9.3319e-07, 9.3132e-06, 5.9418e-07, 9.1642e-07, 4.3941e-04, 1.1913e-05, 2.2221e-04], device='cuda:0') 100 0.0001 changing lr epoch 222, time 214.74, cls_loss 0.0017 cls_loss_mapping 0.0031 cls_loss_causal 0.5709 re_mapping 0.0057 re_causal 0.0172 /// teacc 98.98 lr 0.00010000 Epoch 224, weight, value: tensor([[-0.2226, 0.0817, -0.1009, ..., -0.0272, -0.1571, -0.1421], [ 0.0100, 0.0914, -0.1177, ..., -0.1081, -0.0405, 0.0960], [ 0.0511, -0.1163, -0.1128, ..., 0.0022, -0.0510, -0.0800], ..., [ 0.0839, -0.0239, 0.1232, ..., 0.0614, 0.1629, -0.0016], [ 0.1030, -0.1681, -0.1232, ..., -0.2049, -0.0535, 0.1713], [-0.1345, 0.0780, 0.0400, ..., -0.1874, -0.0968, -0.0374]], device='cuda:0'), grad: tensor([[ 8.9407e-08, -2.7101e-06, 2.4214e-08, ..., 2.7940e-09, 1.5832e-08, 2.6636e-07], [-1.0855e-05, -1.9282e-05, 3.1665e-08, ..., 9.3132e-09, -1.3849e-06, -3.2455e-05], [ 7.4599e-07, 9.6858e-07, 1.3690e-07, ..., 9.3132e-10, 2.4308e-07, 2.2333e-06], ..., [ 3.0249e-06, 6.8024e-06, -4.5076e-07, ..., -8.1956e-08, 1.1921e-07, 1.0654e-05], [ 2.5965e-06, 2.2016e-06, 1.8626e-07, ..., 9.3132e-09, 5.1782e-07, 7.5325e-06], [ 2.9616e-06, 7.2569e-06, -9.7696e-07, ..., 4.7497e-08, 2.2165e-07, 7.6592e-06]], device='cuda:0') Epoch 224, bias, value: tensor([-0.0089, -0.0218, -0.0053, -0.0179, -0.0044, 0.0022, 0.0094, 0.0221, 0.0188, -0.0122], device='cuda:0'), grad: tensor([-9.5367e-06, -6.8724e-05, 5.7183e-06, 1.5134e-06, 9.3728e-06, 1.6345e-06, 2.9989e-06, 2.2694e-05, 1.4454e-05, 1.9848e-05], device='cuda:0') 100 0.0001 changing lr epoch 223, time 214.79, cls_loss 0.0015 cls_loss_mapping 0.0033 cls_loss_causal 0.5010 re_mapping 0.0059 re_causal 0.0166 /// teacc 98.99 lr 0.00010000 Epoch 225, weight, value: tensor([[-0.2230, 0.0818, -0.1019, ..., -0.0266, -0.1574, -0.1424], [ 0.0100, 0.0920, -0.1176, ..., -0.1083, -0.0405, 0.0961], [ 0.0511, -0.1163, -0.1131, ..., 0.0023, -0.0509, -0.0801], ..., [ 0.0840, -0.0250, 0.1232, ..., 0.0616, 0.1630, -0.0017], [ 0.1029, -0.1703, -0.1234, ..., -0.2053, -0.0536, 0.1713], [-0.1352, 0.0780, 0.0405, ..., -0.1893, -0.0972, -0.0377]], device='cuda:0'), grad: tensor([[ 8.2888e-08, -2.1048e-07, 1.9930e-07, ..., 2.5146e-08, 7.5437e-08, 2.5146e-08], [ 2.3818e-04, 2.5611e-07, 6.0815e-07, ..., 2.8944e-04, 7.7772e-04, -3.7253e-09], [-2.3901e-04, 9.3132e-08, 8.3074e-07, ..., -2.9159e-04, -7.8106e-04, 4.9360e-08], ..., [-2.9672e-06, 3.3993e-07, -5.3309e-06, ..., 8.2888e-08, -1.4454e-06, 1.1176e-08], [ 9.6951e-07, 3.4347e-06, 4.0084e-06, ..., 2.5891e-07, 6.6590e-07, 6.7241e-07], [ 1.4231e-06, -2.1145e-05, -1.4864e-05, ..., 9.4995e-07, 2.9039e-06, -2.1327e-06]], device='cuda:0') Epoch 225, bias, value: tensor([-0.0087, -0.0217, -0.0053, -0.0179, -0.0046, 0.0028, 0.0096, 0.0220, 0.0179, -0.0122], device='cuda:0'), grad: tensor([ 3.1013e-07, 2.1572e-03, -2.1706e-03, 7.9256e-07, 4.2081e-05, 2.9430e-06, 3.1851e-07, 1.9968e-06, 9.8050e-06, -4.7296e-05], device='cuda:0') 100 0.0001 changing lr epoch 224, time 214.77, cls_loss 0.0015 cls_loss_mapping 0.0032 cls_loss_causal 0.5142 re_mapping 0.0055 re_causal 0.0168 /// teacc 98.95 lr 0.00010000 Epoch 226, weight, value: tensor([[-0.2233, 0.0819, -0.1030, ..., -0.0265, -0.1581, -0.1425], [ 0.0099, 0.0921, -0.1177, ..., -0.1095, -0.0409, 0.0962], [ 0.0513, -0.1163, -0.1131, ..., 0.0031, -0.0503, -0.0801], ..., [ 0.0840, -0.0253, 0.1235, ..., 0.0616, 0.1630, -0.0018], [ 0.1031, -0.1708, -0.1235, ..., -0.2057, -0.0535, 0.1717], [-0.1360, 0.0788, 0.0410, ..., -0.1909, -0.0975, -0.0368]], device='cuda:0'), grad: tensor([[ 2.2352e-08, -6.4708e-06, 2.0582e-07, ..., -8.5682e-08, 1.2107e-08, 8.8476e-08], [ 1.0338e-07, 6.9011e-07, 4.5355e-07, ..., 4.7497e-08, 5.2154e-08, 2.7381e-07], [-9.6671e-07, 9.2387e-07, 1.4435e-07, ..., -1.0766e-06, -9.2480e-07, 1.4249e-07], ..., [ 9.2853e-07, 1.0924e-06, 1.4231e-06, ..., 9.7323e-07, 8.1025e-07, 4.9174e-07], [-5.4017e-07, 2.6431e-06, 3.4850e-06, ..., -3.5390e-08, 4.0978e-08, 1.3225e-07], [-3.1069e-06, -9.4116e-05, -1.1408e-04, ..., 7.0781e-08, -1.0710e-07, -4.8190e-05]], device='cuda:0') Epoch 226, bias, value: tensor([-0.0087, -0.0219, -0.0050, -0.0179, -0.0060, 0.0034, 0.0091, 0.0220, 0.0177, -0.0112], device='cuda:0'), grad: tensor([-1.1533e-05, 2.0973e-06, 9.5740e-06, 1.4370e-06, 3.0971e-04, 1.9297e-06, 8.2105e-06, 6.3777e-06, 8.2031e-06, -3.3593e-04], device='cuda:0') 100 0.0001 changing lr epoch 225, time 214.61, cls_loss 0.0015 cls_loss_mapping 0.0025 cls_loss_causal 0.5101 re_mapping 0.0054 re_causal 0.0163 /// teacc 98.96 lr 0.00010000 Epoch 227, weight, value: tensor([[-0.2237, 0.0820, -0.1037, ..., -0.0266, -0.1585, -0.1427], [ 0.0098, 0.0927, -0.1177, ..., -0.1098, -0.0410, 0.0964], [ 0.0512, -0.1164, -0.1137, ..., 0.0029, -0.0504, -0.0802], ..., [ 0.0842, -0.0266, 0.1237, ..., 0.0619, 0.1634, -0.0019], [ 0.1032, -0.1712, -0.1237, ..., -0.2059, -0.0536, 0.1718], [-0.1364, 0.0794, 0.0426, ..., -0.1915, -0.0977, -0.0354]], device='cuda:0'), grad: tensor([[ 2.7940e-09, -1.2107e-08, 8.0094e-08, ..., 4.7497e-08, 1.8626e-09, 1.8626e-09], [ 3.4459e-08, -4.6566e-08, 9.9652e-08, ..., 1.0151e-07, 3.6322e-08, -1.0431e-07], [-3.7011e-06, 1.8626e-08, -1.0245e-07, ..., -3.8370e-06, -3.5334e-06, 1.4901e-08], ..., [ 3.6526e-06, 5.6811e-08, 5.4203e-07, ..., 4.2170e-06, 3.4589e-06, 8.7544e-08], [-4.5635e-08, 1.8533e-07, 2.5984e-07, ..., 6.7055e-08, -2.7940e-09, -5.5879e-08], [ 7.4506e-09, 5.9605e-08, 5.5879e-09, ..., 2.6636e-07, 6.5193e-09, 1.0245e-08]], device='cuda:0') Epoch 227, bias, value: tensor([-0.0087, -0.0219, -0.0050, -0.0180, -0.0070, 0.0032, 0.0094, 0.0220, 0.0177, -0.0104], device='cuda:0'), grad: tensor([ 5.2154e-08, 4.1910e-08, -7.8008e-06, -2.2631e-07, 2.7940e-07, -2.1495e-06, 4.2841e-07, 8.7321e-06, 6.7987e-07, -6.7987e-08], device='cuda:0') 100 0.0001 changing lr epoch 226, time 214.84, cls_loss 0.0014 cls_loss_mapping 0.0033 cls_loss_causal 0.5069 re_mapping 0.0054 re_causal 0.0168 /// teacc 98.93 lr 0.00010000 Epoch 228, weight, value: tensor([[-0.2241, 0.0820, -0.1051, ..., -0.0268, -0.1590, -0.1431], [ 0.0099, 0.0930, -0.1171, ..., -0.1095, -0.0408, 0.0965], [ 0.0510, -0.1165, -0.1142, ..., 0.0029, -0.0506, -0.0803], ..., [ 0.0842, -0.0269, 0.1234, ..., 0.0619, 0.1634, -0.0020], [ 0.1033, -0.1721, -0.1238, ..., -0.2061, -0.0536, 0.1721], [-0.1373, 0.0793, 0.0427, ..., -0.1929, -0.0982, -0.0357]], device='cuda:0'), grad: tensor([[ 1.0803e-07, -1.8161e-07, -1.3970e-08, ..., -1.7695e-08, 6.7987e-08, 2.0489e-08], [ 4.2655e-07, 1.1837e-06, 2.0489e-08, ..., 2.8871e-08, 2.2352e-08, 1.6699e-06], [-5.2806e-07, 2.6636e-07, -6.1467e-08, ..., -5.5134e-07, -4.7870e-07, 1.0245e-08], ..., [ 5.1036e-07, 7.0501e-07, -7.4506e-09, ..., 3.9395e-07, 3.0547e-07, 8.4657e-07], [ 4.6473e-07, 3.0976e-06, 4.0978e-08, ..., 9.0338e-08, 3.5390e-08, 1.6410e-06], [ 4.8988e-07, -7.4916e-06, -1.3830e-06, ..., 5.6811e-08, 1.3970e-08, 2.0284e-06]], device='cuda:0') Epoch 228, bias, value: tensor([-0.0088, -0.0216, -0.0051, -0.0164, -0.0070, 0.0019, 0.0095, 0.0218, 0.0175, -0.0104], device='cuda:0'), grad: tensor([ 1.1623e-06, 3.7849e-06, -8.7544e-07, 1.9800e-06, 2.1383e-05, -1.4722e-05, 2.9244e-06, 3.3900e-06, 1.0036e-05, -2.9162e-05], device='cuda:0') 100 0.0001 changing lr epoch 227, time 214.94, cls_loss 0.0018 cls_loss_mapping 0.0033 cls_loss_causal 0.5029 re_mapping 0.0056 re_causal 0.0157 /// teacc 98.94 lr 0.00010000 Epoch 229, weight, value: tensor([[-0.2250, 0.0821, -0.1060, ..., -0.0277, -0.1596, -0.1435], [ 0.0100, 0.0937, -0.1172, ..., -0.1097, -0.0409, 0.0968], [ 0.0511, -0.1166, -0.1147, ..., 0.0030, -0.0506, -0.0803], ..., [ 0.0843, -0.0283, 0.1237, ..., 0.0621, 0.1636, -0.0023], [ 0.1031, -0.1733, -0.1240, ..., -0.2068, -0.0537, 0.1723], [-0.1384, 0.0792, 0.0427, ..., -0.1942, -0.0986, -0.0362]], device='cuda:0'), grad: tensor([[ 1.9558e-08, 1.6112e-07, 1.8813e-07, ..., 3.9116e-08, 3.5390e-08, 2.2352e-08], [ 3.4459e-08, 1.5367e-07, 1.4529e-07, ..., 2.7940e-08, 2.0489e-08, 3.6322e-08], [-8.8476e-08, 4.0978e-08, -3.4459e-08, ..., -2.5891e-07, -2.5053e-07, 2.1420e-08], ..., [-2.0489e-08, 3.1851e-07, 1.7975e-07, ..., 6.0536e-08, 5.0291e-08, 7.6368e-08], [-1.9576e-06, -8.7451e-07, 3.4925e-07, ..., 4.0978e-08, 2.8871e-08, -6.1095e-06], [-4.6194e-07, -1.8001e-05, -1.5959e-05, ..., 1.3970e-08, 5.5879e-09, -3.4925e-06]], device='cuda:0') Epoch 229, bias, value: tensor([-0.0089, -0.0215, -0.0051, -0.0162, -0.0069, 0.0019, 0.0096, 0.0218, 0.0170, -0.0106], device='cuda:0'), grad: tensor([ 6.2771e-07, 5.1036e-07, -7.5344e-07, 1.3068e-05, 4.2230e-05, -2.6226e-05, 2.7090e-05, 1.0040e-06, -1.3314e-05, -4.4316e-05], device='cuda:0') 100 0.0001 changing lr epoch 228, time 214.98, cls_loss 0.0015 cls_loss_mapping 0.0020 cls_loss_causal 0.5078 re_mapping 0.0055 re_causal 0.0166 /// teacc 98.99 lr 0.00010000 Epoch 230, weight, value: tensor([[-0.2257, 0.0822, -0.1034, ..., -0.0270, -0.1599, -0.1445], [ 0.0101, 0.0943, -0.1169, ..., -0.1097, -0.0408, 0.0971], [ 0.0512, -0.1168, -0.1153, ..., 0.0028, -0.0506, -0.0804], ..., [ 0.0842, -0.0295, 0.1234, ..., 0.0619, 0.1637, -0.0025], [ 0.1031, -0.1731, -0.1242, ..., -0.2072, -0.0537, 0.1725], [-0.1392, 0.0789, 0.0423, ..., -0.1959, -0.0989, -0.0368]], device='cuda:0'), grad: tensor([[ 4.6566e-09, -4.4517e-07, 2.7940e-09, ..., 1.2107e-08, 1.3970e-09, -0.0000e+00], [ 7.9162e-09, -6.1933e-08, 5.5879e-08, ..., 3.0268e-08, 3.4459e-08, -2.5099e-07], [ 4.0513e-08, 7.7300e-08, 6.9849e-08, ..., -9.8255e-08, 5.1223e-08, 6.5658e-08], ..., [-1.7090e-07, 8.1491e-08, -2.0675e-07, ..., -4.4238e-08, -1.6904e-07, 7.3109e-08], [ 2.7008e-08, 6.0536e-08, 1.9558e-08, ..., 4.7497e-08, 1.1176e-08, 5.2154e-08], [ 5.9139e-08, 2.0443e-07, 4.4703e-08, ..., 4.0978e-08, 5.1223e-08, 3.0268e-08]], device='cuda:0') Epoch 230, bias, value: tensor([-0.0086, -0.0213, -0.0051, -0.0165, -0.0065, 0.0022, 0.0095, 0.0216, 0.0170, -0.0111], device='cuda:0'), grad: tensor([-6.8173e-07, -2.3050e-07, -3.2969e-07, -6.5193e-09, 5.8673e-08, 3.3295e-07, -2.3982e-07, 1.9092e-08, 4.8988e-07, 5.9651e-07], device='cuda:0') 100 0.0001 changing lr epoch 229, time 214.83, cls_loss 0.0013 cls_loss_mapping 0.0026 cls_loss_causal 0.4981 re_mapping 0.0054 re_causal 0.0160 /// teacc 98.94 lr 0.00010000 Epoch 231, weight, value: tensor([[-0.2256, 0.0829, -0.1025, ..., -0.0266, -0.1601, -0.1438], [ 0.0105, 0.0943, -0.1168, ..., -0.1098, -0.0405, 0.0972], [ 0.0506, -0.1168, -0.1165, ..., 0.0030, -0.0512, -0.0804], ..., [ 0.0845, -0.0298, 0.1240, ..., 0.0618, 0.1639, -0.0026], [ 0.1030, -0.1737, -0.1245, ..., -0.2077, -0.0538, 0.1724], [-0.1419, 0.0782, 0.0412, ..., -0.1967, -0.1003, -0.0372]], device='cuda:0'), grad: tensor([[ 1.5832e-07, 2.4214e-07, 6.5193e-09, ..., 1.8626e-09, 0.0000e+00, 3.1572e-07], [-1.8636e-06, -3.4831e-06, 5.1223e-08, ..., 2.3283e-08, 2.4214e-08, -4.4629e-06], [ 1.0990e-07, 1.4156e-07, 2.6077e-08, ..., 1.6764e-08, 2.7940e-09, 2.1234e-07], ..., [ 1.3420e-06, 2.2724e-06, -3.3528e-08, ..., -1.2107e-08, -3.6322e-08, 3.0566e-06], [-3.3248e-07, 2.5239e-07, 1.3970e-08, ..., 5.5879e-09, 0.0000e+00, -1.7509e-07], [ 3.4925e-07, 3.5111e-07, -3.2596e-08, ..., 2.7940e-09, 2.7940e-09, 5.9139e-07]], device='cuda:0') Epoch 231, bias, value: tensor([-0.0073, -0.0209, -0.0057, -0.0165, -0.0062, 0.0020, 0.0092, 0.0217, 0.0167, -0.0119], device='cuda:0'), grad: tensor([ 7.0035e-07, -9.1270e-06, 4.9639e-07, -3.3434e-07, 1.1269e-07, 3.3900e-07, 5.1595e-07, 6.3553e-06, -4.2375e-07, 1.3439e-06], device='cuda:0') 100 0.0001 changing lr epoch 230, time 214.63, cls_loss 0.0013 cls_loss_mapping 0.0030 cls_loss_causal 0.5261 re_mapping 0.0058 re_causal 0.0175 /// teacc 98.91 lr 0.00010000 Epoch 232, weight, value: tensor([[-0.2263, 0.0829, -0.1034, ..., -0.0270, -0.1608, -0.1444], [ 0.0105, 0.0945, -0.1169, ..., -0.1104, -0.0406, 0.0973], [ 0.0508, -0.1169, -0.1168, ..., 0.0032, -0.0511, -0.0803], ..., [ 0.0834, -0.0298, 0.1237, ..., 0.0616, 0.1634, -0.0033], [ 0.1053, -0.1757, -0.1227, ..., -0.2065, -0.0516, 0.1746], [-0.1421, 0.0783, 0.0413, ..., -0.1971, -0.1005, -0.0371]], device='cuda:0'), grad: tensor([[ 7.9162e-08, 3.5375e-05, 6.8009e-05, ..., 1.0245e-08, 7.4506e-09, 1.0896e-07], [-1.9574e-04, -1.8189e-06, -9.6142e-05, ..., 2.0489e-08, -2.0459e-05, -2.2566e-04], [ 4.7963e-07, 7.4133e-07, 1.6689e-06, ..., 2.3749e-07, 5.1223e-08, 6.7614e-07], ..., [ 1.9217e-04, 1.4678e-06, 9.5606e-05, ..., -5.5879e-09, 2.0102e-05, 2.2066e-04], [ 1.2731e-06, 5.6066e-07, 1.3430e-06, ..., 1.6764e-08, 1.2945e-07, 1.5879e-06], [ 5.6438e-07, -3.8326e-05, -7.3671e-05, ..., 1.1176e-08, 5.7742e-08, 7.7952e-07]], device='cuda:0') Epoch 232, bias, value: tensor([-0.0074, -0.0209, -0.0056, -0.0165, -0.0062, 0.0021, 0.0092, 0.0213, 0.0178, -0.0118], device='cuda:0'), grad: tensor([ 1.5020e-04, -6.4516e-04, 5.1931e-06, 2.6822e-06, 3.9451e-06, 1.1371e-06, 2.5909e-06, 6.3324e-04, 6.3814e-06, -1.6069e-04], device='cuda:0') 100 0.0001 changing lr epoch 231, time 214.41, cls_loss 0.0019 cls_loss_mapping 0.0034 cls_loss_causal 0.4781 re_mapping 0.0059 re_causal 0.0159 /// teacc 98.93 lr 0.00010000 Epoch 233, weight, value: tensor([[-0.2269, 0.0828, -0.1055, ..., -0.0270, -0.1614, -0.1449], [ 0.0106, 0.0946, -0.1169, ..., -0.1105, -0.0406, 0.0975], [ 0.0508, -0.1170, -0.1172, ..., 0.0031, -0.0511, -0.0804], ..., [ 0.0835, -0.0296, 0.1242, ..., 0.0616, 0.1637, -0.0034], [ 0.1053, -0.1765, -0.1229, ..., -0.2068, -0.0517, 0.1749], [-0.1438, 0.0782, 0.0409, ..., -0.1979, -0.1020, -0.0381]], device='cuda:0'), grad: tensor([[ 2.9802e-08, 1.3234e-06, 1.5395e-06, ..., -9.3132e-10, 0.0000e+00, 7.0781e-08], [-6.3330e-07, -1.0571e-06, 1.6112e-07, ..., 1.8626e-09, 0.0000e+00, -1.5106e-06], [ 5.6811e-08, 1.6484e-07, 6.9849e-08, ..., 8.3819e-09, 0.0000e+00, 2.0396e-07], ..., [ 8.1956e-08, 4.9639e-07, 2.1979e-07, ..., 0.0000e+00, 0.0000e+00, 4.6100e-07], [-1.0431e-07, 2.9523e-07, 2.7288e-07, ..., 3.7253e-09, 0.0000e+00, -1.6205e-07], [ 5.5879e-09, -3.6415e-06, -4.6305e-06, ..., 0.0000e+00, 0.0000e+00, 7.7300e-08]], device='cuda:0') Epoch 233, bias, value: tensor([-0.0077, -0.0208, -0.0056, -0.0167, -0.0061, 0.0025, 0.0097, 0.0213, 0.0175, -0.0121], device='cuda:0'), grad: tensor([ 3.4198e-06, -4.0755e-06, 6.1467e-07, 1.8971e-06, 4.5486e-06, -2.2296e-06, 3.2093e-06, 1.5181e-06, 8.6240e-07, -9.8124e-06], device='cuda:0') 100 0.0001 changing lr epoch 232, time 214.52, cls_loss 0.0016 cls_loss_mapping 0.0027 cls_loss_causal 0.4863 re_mapping 0.0057 re_causal 0.0161 /// teacc 98.97 lr 0.00010000 Epoch 234, weight, value: tensor([[-0.2279, 0.0828, -0.1063, ..., -0.0272, -0.1626, -0.1461], [ 0.0105, 0.0946, -0.1170, ..., -0.1106, -0.0407, 0.0974], [ 0.0509, -0.1170, -0.1175, ..., 0.0032, -0.0510, -0.0804], ..., [ 0.0837, -0.0296, 0.1246, ..., 0.0618, 0.1640, -0.0034], [ 0.1055, -0.1776, -0.1232, ..., -0.2071, -0.0518, 0.1754], [-0.1449, 0.0782, 0.0406, ..., -0.1982, -0.1030, -0.0384]], device='cuda:0'), grad: tensor([[ 2.7940e-08, 2.8219e-07, 2.0210e-07, ..., 2.0489e-08, 2.8871e-08, 2.7008e-08], [ 1.9059e-05, 2.7362e-06, 2.9817e-05, ..., 3.2596e-08, 1.9953e-05, 1.4193e-05], [ 9.9465e-07, 8.4843e-07, 1.6754e-06, ..., 9.8720e-08, 1.0198e-06, 7.4971e-07], ..., [-2.4766e-05, 1.8161e-07, -3.8534e-05, ..., 8.3819e-09, -2.5928e-05, -1.8403e-05], [ 9.6858e-08, 1.8515e-06, 4.0699e-07, ..., 4.2841e-08, 1.2107e-07, 6.7055e-08], [ 4.3474e-06, -4.1313e-06, 6.2101e-06, ..., 3.4459e-08, 4.5411e-06, 3.4589e-06]], device='cuda:0') Epoch 234, bias, value: tensor([-0.0078, -0.0208, -0.0056, -0.0172, -0.0060, 0.0025, 0.0105, 0.0213, 0.0171, -0.0121], device='cuda:0'), grad: tensor([ 2.7865e-06, 8.7619e-05, 7.2792e-06, -2.1793e-06, 3.2056e-06, 5.0664e-06, -5.4240e-05, -5.2691e-05, 1.1973e-05, -8.7321e-06], device='cuda:0') 100 0.0001 changing lr epoch 233, time 214.82, cls_loss 0.0015 cls_loss_mapping 0.0026 cls_loss_causal 0.5377 re_mapping 0.0057 re_causal 0.0167 /// teacc 99.01 lr 0.00010000 Epoch 235, weight, value: tensor([[-0.2284, 0.0829, -0.1069, ..., -0.0272, -0.1633, -0.1463], [ 0.0105, 0.0946, -0.1170, ..., -0.1107, -0.0407, 0.0975], [ 0.0509, -0.1172, -0.1179, ..., 0.0032, -0.0511, -0.0805], ..., [ 0.0837, -0.0299, 0.1247, ..., 0.0618, 0.1642, -0.0035], [ 0.1056, -0.1779, -0.1233, ..., -0.2073, -0.0518, 0.1756], [-0.1452, 0.0785, 0.0411, ..., -0.1984, -0.1032, -0.0384]], device='cuda:0'), grad: tensor([[ 6.8918e-08, -2.9698e-05, 3.5390e-08, ..., 3.8184e-08, 4.6566e-08, -1.5534e-06], [-1.6764e-08, 2.0787e-05, 7.1712e-08, ..., 4.0978e-08, 6.6124e-08, 1.0040e-06], [-2.3209e-06, 5.9120e-06, -5.7556e-07, ..., -1.4435e-06, -1.9912e-06, 6.8825e-07], ..., [ 9.0618e-07, 3.1199e-06, -6.3423e-07, ..., 1.2387e-06, 9.2853e-07, 1.9483e-06], [ 1.0049e-06, 9.9652e-07, 1.0859e-06, ..., 1.6019e-07, 7.1898e-07, 7.5903e-07], [ 2.4773e-07, 3.5882e-05, 9.8720e-08, ..., 2.3283e-08, 1.7509e-07, 2.3827e-05]], device='cuda:0') Epoch 235, bias, value: tensor([-0.0078, -0.0208, -0.0056, -0.0175, -0.0062, 0.0028, 0.0102, 0.0213, 0.0170, -0.0118], device='cuda:0'), grad: tensor([-6.7234e-05, 4.9978e-05, 9.8273e-06, 2.3842e-06, -2.7800e-04, 7.9628e-07, 3.4459e-06, 2.4274e-05, 7.6033e-06, 2.4700e-04], device='cuda:0') 100 0.0001 changing lr epoch 234, time 214.66, cls_loss 0.0014 cls_loss_mapping 0.0017 cls_loss_causal 0.4843 re_mapping 0.0056 re_causal 0.0162 /// teacc 99.01 lr 0.00010000 Epoch 236, weight, value: tensor([[-0.2291, 0.0831, -0.1075, ..., -0.0274, -0.1639, -0.1470], [ 0.0109, 0.0952, -0.1166, ..., -0.1108, -0.0407, 0.0980], [ 0.0510, -0.1172, -0.1182, ..., 0.0033, -0.0510, -0.0806], ..., [ 0.0833, -0.0317, 0.1244, ..., 0.0617, 0.1641, -0.0039], [ 0.1056, -0.1787, -0.1236, ..., -0.2077, -0.0518, 0.1758], [-0.1449, 0.0788, 0.0417, ..., -0.1990, -0.1033, -0.0388]], device='cuda:0'), grad: tensor([[ 5.5879e-09, -1.0142e-06, -2.7847e-07, ..., -3.9767e-07, 7.4506e-09, -1.3206e-06], [ 4.8429e-08, 5.8860e-07, 1.5553e-07, ..., 1.7509e-07, 8.7544e-08, 4.0513e-07], [ 3.0734e-08, 1.9465e-07, 1.3504e-07, ..., 5.1223e-08, 3.7253e-08, 1.3225e-07], ..., [-1.4529e-07, 4.3679e-07, -1.6671e-07, ..., 4.6566e-09, -2.1607e-07, 3.2969e-07], [ 1.0245e-08, 3.6135e-07, 9.4064e-08, ..., 7.3574e-08, 1.3039e-08, 2.2445e-07], [ 3.2596e-08, 1.6298e-07, -5.9605e-08, ..., 5.4017e-08, 4.4703e-08, 1.4994e-07]], device='cuda:0') Epoch 236, bias, value: tensor([-0.0076, -0.0206, -0.0055, -0.0175, -0.0061, 0.0029, 0.0099, 0.0208, 0.0169, -0.0116], device='cuda:0'), grad: tensor([ 4.2245e-06, 1.9632e-06, 6.8545e-07, 1.1083e-07, 2.1420e-07, 3.6228e-07, -1.0267e-05, 8.6054e-07, 1.3253e-06, 5.0385e-07], device='cuda:0') 100 0.0001 changing lr epoch 235, time 214.93, cls_loss 0.0019 cls_loss_mapping 0.0025 cls_loss_causal 0.5347 re_mapping 0.0055 re_causal 0.0163 /// teacc 99.00 lr 0.00010000 Epoch 237, weight, value: tensor([[-0.2295, 0.0831, -0.1084, ..., -0.0275, -0.1647, -0.1473], [ 0.0110, 0.0962, -0.1163, ..., -0.1095, -0.0407, 0.0989], [ 0.0509, -0.1173, -0.1193, ..., 0.0026, -0.0511, -0.0810], ..., [ 0.0834, -0.0326, 0.1243, ..., 0.0617, 0.1643, -0.0043], [ 0.1057, -0.1791, -0.1239, ..., -0.2081, -0.0519, 0.1762], [-0.1451, 0.0788, 0.0423, ..., -0.1997, -0.1035, -0.0396]], device='cuda:0'), grad: tensor([[ 2.1420e-07, -9.3132e-10, 8.0187e-07, ..., 5.2340e-07, 2.8964e-07, 2.5053e-07], [ 1.2904e-05, 1.0207e-05, 1.7866e-05, ..., 7.9572e-06, 1.7539e-05, 1.1511e-05], [ 3.7625e-07, 7.0129e-07, 2.6207e-06, ..., 1.8310e-06, 6.1188e-07, 6.7987e-07], ..., [-1.5751e-05, -1.2271e-05, -2.1592e-05, ..., -9.4175e-06, -2.1279e-05, -1.3970e-05], [ 2.8033e-07, 2.2631e-07, 5.0105e-07, ..., 2.9802e-07, 2.9616e-07, 9.3132e-08], [ 5.6904e-07, 4.5169e-07, 8.5309e-07, ..., 3.9116e-07, 7.0222e-07, 5.0943e-07]], device='cuda:0') Epoch 237, bias, value: tensor([-0.0075, -0.0200, -0.0057, -0.0190, -0.0064, 0.0025, 0.0106, 0.0205, 0.0167, -0.0111], device='cuda:0'), grad: tensor([ 1.1884e-06, 3.9846e-05, 6.1914e-06, -7.5586e-06, 2.0936e-06, 1.9018e-06, 3.8370e-07, -4.7982e-05, 1.7546e-06, 2.1495e-06], device='cuda:0') 100 0.0001 changing lr epoch 236, time 214.70, cls_loss 0.0016 cls_loss_mapping 0.0025 cls_loss_causal 0.5087 re_mapping 0.0055 re_causal 0.0163 /// teacc 98.97 lr 0.00010000 Epoch 238, weight, value: tensor([[-0.2301, 0.0832, -0.1088, ..., -0.0268, -0.1656, -0.1467], [ 0.0109, 0.0962, -0.1163, ..., -0.1098, -0.0408, 0.0990], [ 0.0510, -0.1174, -0.1196, ..., 0.0027, -0.0511, -0.0811], ..., [ 0.0835, -0.0328, 0.1242, ..., 0.0614, 0.1644, -0.0044], [ 0.1057, -0.1796, -0.1240, ..., -0.2095, -0.0519, 0.1766], [-0.1455, 0.0786, 0.0421, ..., -0.2007, -0.1037, -0.0395]], device='cuda:0'), grad: tensor([[ 7.4506e-09, -3.2131e-07, 2.6077e-08, ..., 2.8871e-08, 2.7940e-09, 7.2643e-08], [ 4.8429e-08, 4.5635e-08, 1.2759e-07, ..., 7.5437e-08, 2.0489e-08, 7.0781e-08], [ 1.3970e-07, 3.5390e-08, 2.8033e-07, ..., 1.9837e-07, 5.6811e-08, -4.2003e-07], ..., [-5.2247e-07, 1.4622e-07, -2.0768e-07, ..., -1.5274e-07, -2.0582e-07, 1.3877e-07], [ 9.4064e-08, 1.0617e-07, 4.2375e-07, ..., 2.3842e-07, 3.5390e-08, -2.8126e-07], [ 5.2154e-08, 6.3982e-07, 2.3376e-07, ..., 4.3772e-08, 1.8626e-08, 6.5472e-07]], device='cuda:0') Epoch 238, bias, value: tensor([-0.0075, -0.0200, -0.0058, -0.0191, -0.0059, 0.0033, 0.0105, 0.0204, 0.0168, -0.0116], device='cuda:0'), grad: tensor([-2.4121e-07, 6.3609e-07, -1.1958e-06, 3.4086e-06, -2.1383e-06, -4.5635e-06, 9.1828e-07, 6.3330e-08, 6.3144e-07, 2.4475e-06], device='cuda:0') 100 0.0001 changing lr epoch 237, time 214.88, cls_loss 0.0013 cls_loss_mapping 0.0030 cls_loss_causal 0.5114 re_mapping 0.0055 re_causal 0.0165 /// teacc 99.00 lr 0.00010000 Epoch 239, weight, value: tensor([[-0.2308, 0.0829, -0.1093, ..., -0.0270, -0.1661, -0.1471], [ 0.0110, 0.0963, -0.1161, ..., -0.1096, -0.0407, 0.0992], [ 0.0509, -0.1175, -0.1202, ..., 0.0025, -0.0511, -0.0813], ..., [ 0.0835, -0.0331, 0.1240, ..., 0.0615, 0.1645, -0.0045], [ 0.1060, -0.1792, -0.1241, ..., -0.2097, -0.0519, 0.1778], [-0.1458, 0.0788, 0.0423, ..., -0.2012, -0.1038, -0.0397]], device='cuda:0'), grad: tensor([[ 2.2352e-07, 1.2107e-08, 3.1665e-08, ..., 4.7497e-08, 6.5193e-09, 1.3039e-08], [ 1.0906e-06, 3.4459e-08, 1.2200e-07, ..., 1.2768e-06, 1.3746e-06, -1.1176e-08], [-1.6950e-07, 1.5832e-08, 3.4645e-07, ..., -3.1330e-06, -1.4137e-06, 1.6298e-07], ..., [-2.6330e-05, 2.0582e-07, -2.5686e-06, ..., 1.2480e-07, -4.6566e-09, 9.2201e-08], [ 2.0061e-06, 2.7008e-08, 2.4214e-08, ..., 1.5851e-06, 9.3132e-09, -3.9395e-07], [ 2.4028e-07, -1.0123e-06, -1.5413e-06, ..., 5.5879e-09, 6.5193e-09, 1.3039e-07]], device='cuda:0') Epoch 239, bias, value: tensor([-0.0079, -0.0198, -0.0059, -0.0182, -0.0058, 0.0023, 0.0094, 0.0202, 0.0178, -0.0115], device='cuda:0'), grad: tensor([ 8.2236e-07, 5.4054e-06, 4.9733e-07, 1.7053e-06, 8.0764e-05, 4.6678e-06, 3.8464e-07, -9.8884e-05, 5.8450e-06, -1.1576e-06], device='cuda:0') 100 0.0001 changing lr epoch 238, time 214.98, cls_loss 0.0013 cls_loss_mapping 0.0029 cls_loss_causal 0.4805 re_mapping 0.0056 re_causal 0.0157 /// teacc 98.89 lr 0.00010000 Epoch 240, weight, value: tensor([[-0.2320, 0.0828, -0.1105, ..., -0.0274, -0.1672, -0.1476], [ 0.0112, 0.0970, -0.1160, ..., -0.1097, -0.0408, 0.0995], [ 0.0508, -0.1176, -0.1210, ..., 0.0024, -0.0512, -0.0814], ..., [ 0.0835, -0.0344, 0.1240, ..., 0.0617, 0.1646, -0.0048], [ 0.1060, -0.1798, -0.1243, ..., -0.2102, -0.0520, 0.1779], [-0.1459, 0.0792, 0.0429, ..., -0.2019, -0.1038, -0.0397]], device='cuda:0'), grad: tensor([[ 1.2107e-08, -1.2871e-06, 1.3970e-08, ..., -6.1840e-07, 6.5193e-09, 1.8626e-09], [ 7.6368e-08, -6.5193e-09, 1.0524e-07, ..., 5.3085e-08, 6.4261e-08, -4.7497e-08], [-1.3039e-08, 9.3412e-07, 6.7055e-08, ..., 4.8522e-07, 0.0000e+00, 1.9558e-08], ..., [-1.5087e-07, 2.7940e-08, -1.8906e-07, ..., -4.4703e-08, -1.2852e-07, 1.6764e-08], [-2.5146e-08, 7.7300e-08, 2.8871e-08, ..., 4.5635e-08, 1.3039e-08, -2.5146e-08], [ 2.7940e-08, 2.6170e-07, 3.0734e-08, ..., 1.2666e-07, 2.1420e-08, 5.0291e-08]], device='cuda:0') Epoch 240, bias, value: tensor([-0.0081, -0.0196, -0.0059, -0.0181, -0.0061, 0.0020, 0.0098, 0.0200, 0.0176, -0.0111], device='cuda:0'), grad: tensor([-5.3793e-06, 1.6578e-07, 3.7998e-06, -2.6077e-07, 5.7742e-08, -8.5589e-07, 1.2703e-06, -1.7975e-07, 3.0175e-07, 1.0766e-06], device='cuda:0') 100 0.0001 changing lr epoch 239, time 214.85, cls_loss 0.0015 cls_loss_mapping 0.0024 cls_loss_causal 0.5140 re_mapping 0.0056 re_causal 0.0163 /// teacc 98.93 lr 0.00010000 Epoch 241, weight, value: tensor([[-0.2328, 0.0829, -0.1110, ..., -0.0276, -0.1682, -0.1482], [ 0.0104, 0.0979, -0.1167, ..., -0.1096, -0.0418, 0.0997], [ 0.0506, -0.1177, -0.1219, ..., 0.0022, -0.0512, -0.0817], ..., [ 0.0845, -0.0358, 0.1249, ..., 0.0619, 0.1659, -0.0046], [ 0.1060, -0.1804, -0.1245, ..., -0.2107, -0.0521, 0.1781], [-0.1466, 0.0794, 0.0431, ..., -0.2027, -0.1041, -0.0400]], device='cuda:0'), grad: tensor([[ 6.0350e-07, -1.9558e-08, 1.2107e-08, ..., 1.3448e-06, 9.6858e-08, 4.6566e-09], [ 1.8626e-07, -9.2201e-08, 1.1828e-07, ..., 1.1176e-07, 2.0675e-07, -2.2911e-07], [-9.0990e-07, 4.0047e-08, 2.7567e-07, ..., -2.0899e-06, -2.1048e-07, 3.2596e-08], ..., [-3.3248e-07, -4.6566e-09, -6.1560e-07, ..., 1.1176e-07, -3.5390e-07, 4.0978e-08], [ 6.2399e-08, 1.3877e-07, 1.9185e-07, ..., 5.6811e-08, 2.9802e-08, 1.4901e-08], [ 7.4506e-08, -2.8126e-07, -3.4273e-07, ..., 1.6764e-08, 6.7055e-08, 2.2352e-08]], device='cuda:0') Epoch 241, bias, value: tensor([-0.0081, -0.0200, -0.0061, -0.0181, -0.0064, 0.0017, 0.0099, 0.0208, 0.0174, -0.0110], device='cuda:0'), grad: tensor([ 5.3160e-06, 2.4680e-07, -7.9423e-06, 1.6233e-06, 6.6683e-07, -1.1921e-07, 4.2282e-07, -2.8126e-07, 6.7241e-07, -6.4541e-07], device='cuda:0') 100 0.0001 changing lr epoch 240, time 214.96, cls_loss 0.0012 cls_loss_mapping 0.0025 cls_loss_causal 0.5177 re_mapping 0.0057 re_causal 0.0163 /// teacc 98.93 lr 0.00010000 Epoch 242, weight, value: tensor([[-0.2341, 0.0829, -0.1117, ..., -0.0279, -0.1692, -0.1494], [ 0.0104, 0.0980, -0.1168, ..., -0.1095, -0.0418, 0.0998], [ 0.0505, -0.1178, -0.1224, ..., 0.0021, -0.0514, -0.0821], ..., [ 0.0846, -0.0361, 0.1250, ..., 0.0620, 0.1660, -0.0046], [ 0.1060, -0.1810, -0.1247, ..., -0.2112, -0.0521, 0.1781], [-0.1467, 0.0795, 0.0435, ..., -0.2034, -0.1041, -0.0402]], device='cuda:0'), grad: tensor([[ 2.8498e-07, 3.2596e-08, 8.3819e-08, ..., 3.0734e-08, 1.8626e-08, 4.2282e-07], [ 7.3574e-08, 7.1712e-08, 9.7789e-08, ..., 5.9605e-08, 5.5879e-08, 4.5635e-08], [ 3.1479e-07, 3.0734e-08, 3.2410e-07, ..., 1.3411e-07, 2.2352e-07, 1.7975e-07], ..., [-3.0454e-07, 2.1234e-07, 8.4564e-07, ..., -7.8231e-08, -3.7253e-07, -1.0245e-08], [-9.4250e-07, 2.5053e-07, 2.9057e-07, ..., 4.9360e-08, 2.7940e-08, -1.4370e-06], [ 2.0862e-07, -2.6356e-07, -1.8608e-06, ..., 2.1420e-08, 7.4506e-09, 3.1665e-07]], device='cuda:0') Epoch 242, bias, value: tensor([-0.0082, -0.0199, -0.0062, -0.0177, -0.0066, 0.0014, 0.0102, 0.0208, 0.0171, -0.0109], device='cuda:0'), grad: tensor([ 1.2070e-06, 7.0501e-07, 8.2701e-07, 2.1353e-05, 6.2399e-07, -2.5377e-05, 1.6773e-06, 1.4324e-06, -2.7474e-07, -2.1569e-06], device='cuda:0') 100 0.0001 changing lr epoch 241, time 215.05, cls_loss 0.0012 cls_loss_mapping 0.0023 cls_loss_causal 0.5149 re_mapping 0.0052 re_causal 0.0161 /// teacc 98.88 lr 0.00010000 Epoch 243, weight, value: tensor([[-0.2347, 0.0829, -0.1120, ..., -0.0283, -0.1700, -0.1495], [ 0.0105, 0.0983, -0.1167, ..., -0.1095, -0.0418, 0.1001], [ 0.0507, -0.1179, -0.1227, ..., 0.0023, -0.0513, -0.0822], ..., [ 0.0845, -0.0366, 0.1251, ..., 0.0621, 0.1661, -0.0048], [ 0.1061, -0.1816, -0.1249, ..., -0.2114, -0.0522, 0.1782], [-0.1470, 0.0789, 0.0433, ..., -0.2040, -0.1043, -0.0402]], device='cuda:0'), grad: tensor([[ 5.9605e-08, -1.4957e-06, 1.4715e-07, ..., 5.4948e-08, 6.4261e-08, 4.6566e-09], [ 7.8976e-06, -2.1607e-07, 1.6280e-06, ..., 4.1053e-06, 4.2170e-06, -2.8126e-07], [-7.6517e-06, 2.3283e-08, 1.3374e-06, ..., -1.8487e-06, -3.6377e-06, 1.2107e-08], ..., [-8.4400e-05, 1.4808e-07, -2.1040e-04, ..., -1.8847e-04, -7.5102e-05, 1.6671e-07], [ 1.8161e-07, 2.3283e-08, 2.9895e-07, ..., 2.7195e-07, 1.3318e-07, 1.6764e-08], [ 3.6322e-08, 1.4901e-06, 3.2596e-08, ..., 1.2480e-07, 5.0291e-08, 4.4703e-08]], device='cuda:0') Epoch 243, bias, value: tensor([-0.0084, -0.0198, -0.0062, -0.0180, -0.0060, 0.0015, 0.0105, 0.0207, 0.0168, -0.0113], device='cuda:0'), grad: tensor([-2.4214e-06, 2.6241e-05, -2.5764e-05, 2.9635e-04, 5.7742e-08, 4.6473e-07, 2.2165e-07, -2.9826e-04, 7.2177e-07, 2.9225e-06], device='cuda:0') 100 0.0001 changing lr epoch 242, time 214.89, cls_loss 0.0013 cls_loss_mapping 0.0029 cls_loss_causal 0.4928 re_mapping 0.0053 re_causal 0.0154 /// teacc 98.92 lr 0.00010000 Epoch 244, weight, value: tensor([[-0.2357, 0.0831, -0.1124, ..., -0.0288, -0.1705, -0.1500], [ 0.0105, 0.0982, -0.1167, ..., -0.1098, -0.0418, 0.1003], [ 0.0507, -0.1182, -0.1230, ..., 0.0024, -0.0513, -0.0825], ..., [ 0.0846, -0.0373, 0.1252, ..., 0.0624, 0.1662, -0.0050], [ 0.1062, -0.1821, -0.1249, ..., -0.2118, -0.0522, 0.1785], [-0.1474, 0.0790, 0.0437, ..., -0.2055, -0.1044, -0.0403]], device='cuda:0'), grad: tensor([[ 1.5832e-08, 3.2689e-07, 5.2247e-07, ..., 3.7253e-09, 9.3132e-10, 4.5635e-08], [-2.8685e-07, -5.5414e-07, 1.9278e-07, ..., 6.5193e-09, 1.8626e-09, -1.2852e-06], [ 8.6613e-08, 4.1351e-07, 2.1420e-07, ..., -7.4506e-09, -3.7253e-09, 2.7660e-07], ..., [ 1.2107e-07, 3.7160e-07, 4.6566e-08, ..., 0.0000e+00, -2.7940e-09, 5.7928e-07], [-5.4296e-07, 2.1514e-07, 2.1793e-07, ..., 2.1420e-08, 1.8626e-09, -3.7719e-07], [ 2.1327e-07, -7.2360e-05, -4.9829e-05, ..., 9.3132e-10, 9.3132e-10, -6.3702e-06]], device='cuda:0') Epoch 244, bias, value: tensor([-0.0083, -0.0198, -0.0063, -0.0183, -0.0056, 0.0013, 0.0112, 0.0206, 0.0169, -0.0113], device='cuda:0'), grad: tensor([ 1.1977e-06, -2.8517e-06, 1.5153e-06, 2.9922e-05, 6.6124e-07, 1.8466e-04, 2.4773e-07, 1.5888e-06, -1.0896e-07, -2.1648e-04], device='cuda:0') 100 0.0001 changing lr epoch 243, time 214.86, cls_loss 0.0014 cls_loss_mapping 0.0022 cls_loss_causal 0.5033 re_mapping 0.0052 re_causal 0.0151 /// teacc 99.04 lr 0.00010000 Epoch 245, weight, value: tensor([[-0.2361, 0.0833, -0.1131, ..., -0.0290, -0.1711, -0.1503], [ 0.0108, 0.0992, -0.1166, ..., -0.1101, -0.0417, 0.1007], [ 0.0506, -0.1184, -0.1235, ..., 0.0024, -0.0514, -0.0828], ..., [ 0.0844, -0.0387, 0.1253, ..., 0.0626, 0.1662, -0.0054], [ 0.1063, -0.1828, -0.1252, ..., -0.2125, -0.0522, 0.1787], [-0.1478, 0.0791, 0.0441, ..., -0.2069, -0.1046, -0.0405]], device='cuda:0'), grad: tensor([[ 2.7940e-08, -1.6391e-07, 7.2643e-08, ..., -1.4901e-08, 0.0000e+00, 4.4703e-08], [ 2.1420e-08, 3.4459e-08, 1.1548e-07, ..., 5.0291e-08, 1.1176e-08, -4.1910e-08], [ 6.5006e-07, 7.0781e-08, 2.1886e-07, ..., 9.4995e-08, 5.5879e-09, 1.1465e-06], ..., [-4.6566e-09, 2.4214e-07, 7.1060e-07, ..., 2.9709e-07, -3.1665e-08, 3.8184e-08], [-7.1805e-07, 1.6764e-07, 1.5460e-07, ..., 8.9407e-08, 3.7253e-09, -1.2843e-06], [ 2.9802e-08, -3.9116e-07, -1.2759e-06, ..., -3.4459e-08, 5.5879e-09, 8.2888e-08]], device='cuda:0') Epoch 245, bias, value: tensor([-0.0081, -0.0195, -0.0065, -0.0180, -0.0056, 0.0008, 0.0114, 0.0203, 0.0165, -0.0111], device='cuda:0'), grad: tensor([-4.0978e-08, 2.7940e-07, 2.7623e-06, -7.9628e-07, 2.5984e-07, 1.1269e-07, -1.8999e-07, 1.6857e-06, -1.7425e-06, -2.3432e-06], device='cuda:0') 100 0.0001 changing lr epoch 244, time 215.04, cls_loss 0.0012 cls_loss_mapping 0.0027 cls_loss_causal 0.4946 re_mapping 0.0053 re_causal 0.0158 /// teacc 99.00 lr 0.00010000 Epoch 246, weight, value: tensor([[-0.2366, 0.0827, -0.1137, ..., -0.0289, -0.1720, -0.1502], [ 0.0106, 0.0994, -0.1168, ..., -0.1104, -0.0418, 0.1009], [ 0.0506, -0.1185, -0.1239, ..., 0.0024, -0.0514, -0.0830], ..., [ 0.0847, -0.0391, 0.1256, ..., 0.0629, 0.1665, -0.0054], [ 0.1063, -0.1836, -0.1254, ..., -0.2132, -0.0522, 0.1788], [-0.1482, 0.0797, 0.0442, ..., -0.2084, -0.1049, -0.0406]], device='cuda:0'), grad: tensor([[ 1.1176e-08, 1.3970e-08, 4.3772e-08, ..., -9.3132e-10, 9.3132e-09, 1.7323e-07], [-7.1712e-08, -4.0978e-08, 6.6403e-07, ..., 3.3621e-07, 6.5193e-09, -2.7940e-07], [ 2.6077e-08, 3.2317e-07, 3.6322e-08, ..., 1.7695e-08, 7.4506e-09, 4.3865e-07], ..., [-3.8184e-08, 1.5404e-06, 1.9684e-05, ..., 9.8944e-06, -5.2154e-08, 2.6543e-07], [ 9.3132e-10, 4.3865e-07, 3.8184e-07, ..., 9.4064e-08, 9.3132e-10, 1.7975e-07], [ 2.2352e-08, 1.9949e-06, 3.9302e-07, ..., 4.0885e-07, 1.8626e-08, 3.3695e-06]], device='cuda:0') Epoch 246, bias, value: tensor([-0.0087, -0.0195, -0.0065, -0.0182, -0.0058, 0.0005, 0.0120, 0.0204, 0.0162, -0.0108], device='cuda:0'), grad: tensor([ 3.6508e-07, 5.7649e-07, 1.2573e-06, -1.8939e-05, -1.1653e-05, -1.7703e-05, 9.0431e-07, 3.4600e-05, 1.6810e-06, 8.8811e-06], device='cuda:0') 100 0.0001 changing lr epoch 245, time 215.00, cls_loss 0.0016 cls_loss_mapping 0.0027 cls_loss_causal 0.4881 re_mapping 0.0055 re_causal 0.0152 /// teacc 98.97 lr 0.00010000 Epoch 247, weight, value: tensor([[-2.3726e-01, 8.2839e-02, -1.1469e-01, ..., -2.9012e-02, -1.7283e-01, -1.5050e-01], [ 1.0028e-02, 9.9441e-02, -1.1733e-01, ..., -1.1276e-01, -4.2461e-02, 1.0092e-01], [ 4.8989e-02, -1.1864e-01, -1.2695e-01, ..., 1.2988e-04, -5.3391e-02, -8.3037e-02], ..., [ 8.6123e-02, -3.9210e-02, 1.2717e-01, ..., 6.5527e-02, 1.6855e-01, -5.4687e-03], [ 1.0614e-01, -1.8428e-01, -1.2573e-01, ..., -2.1414e-01, -5.2315e-02, 1.7866e-01], [-1.4929e-01, 7.9187e-02, 4.3832e-02, ..., -2.1112e-01, -1.0573e-01, -4.1084e-02]], device='cuda:0'), grad: tensor([[ 4.0047e-08, -5.7742e-08, 5.3085e-08, ..., 7.4506e-09, 5.4948e-08, 3.7253e-09], [ 1.5367e-07, -8.0094e-08, 3.6694e-07, ..., 4.9360e-08, 2.3842e-07, -1.9651e-07], [ 2.8871e-08, 1.6764e-08, 5.4017e-08, ..., 1.8626e-09, 4.0047e-08, 1.3039e-08], ..., [-3.9488e-06, -1.2293e-07, -7.7039e-06, ..., -8.1863e-07, -4.8541e-06, 1.3318e-07], [ 4.0792e-07, 3.6322e-08, 8.4378e-07, ..., 8.2888e-08, 4.8708e-07, 9.3132e-09], [ 3.0063e-06, 7.1712e-08, 5.7817e-06, ..., 6.1188e-07, 3.6545e-06, 5.5879e-09]], device='cuda:0') Epoch 247, bias, value: tensor([-0.0088, -0.0198, -0.0071, -0.0180, -0.0054, 0.0036, 0.0091, 0.0213, 0.0149, -0.0112], device='cuda:0'), grad: tensor([ 4.4703e-08, 9.7789e-08, 1.0151e-07, 2.5146e-07, 6.9104e-07, -8.3353e-07, 8.0746e-07, -1.0453e-05, 1.2591e-06, 8.0168e-06], device='cuda:0') 100 0.0001 changing lr epoch 246, time 214.92, cls_loss 0.0015 cls_loss_mapping 0.0023 cls_loss_causal 0.5513 re_mapping 0.0050 re_causal 0.0155 /// teacc 99.00 lr 0.00010000 Epoch 248, weight, value: tensor([[-0.2386, 0.0829, -0.1154, ..., -0.0297, -0.1737, -0.1515], [ 0.0107, 0.0992, -0.1163, ..., -0.1125, -0.0425, 0.1014], [ 0.0491, -0.1187, -0.1270, ..., 0.0003, -0.0533, -0.0831], ..., [ 0.0855, -0.0394, 0.1263, ..., 0.0653, 0.1686, -0.0060], [ 0.1064, -0.1846, -0.1259, ..., -0.2145, -0.0523, 0.1792], [-0.1498, 0.0791, 0.0442, ..., -0.2120, -0.1061, -0.0414]], device='cuda:0'), grad: tensor([[ 1.3039e-08, -1.5043e-05, 7.8231e-08, ..., -4.8429e-08, 1.2107e-08, -1.1874e-06], [ 1.3560e-06, 1.4976e-05, 5.8636e-06, ..., 1.2852e-07, 1.3690e-07, 8.1807e-06], [ 1.8626e-08, 1.1902e-06, 2.1700e-07, ..., 4.1910e-08, 5.5879e-09, 2.3656e-07], ..., [ 2.8312e-07, 2.5313e-06, 1.8226e-06, ..., -4.5635e-08, -2.3283e-07, 2.7865e-06], [ 7.6368e-08, 8.2608e-07, 3.0547e-07, ..., 3.9116e-08, 4.6566e-09, 4.3958e-07], [-5.3011e-06, -2.9922e-05, -2.4080e-05, ..., 8.8476e-08, 3.9116e-08, -3.2455e-05]], device='cuda:0') Epoch 248, bias, value: tensor([-0.0089, -0.0194, -0.0071, -0.0181, -0.0052, 0.0038, 0.0090, 0.0206, 0.0148, -0.0113], device='cuda:0'), grad: tensor([-2.4974e-05, 3.3349e-05, 2.2948e-06, -8.5589e-07, 5.5373e-05, -9.4622e-07, 8.7619e-06, 6.9067e-06, 2.1420e-06, -8.2076e-05], device='cuda:0') 100 0.0001 changing lr epoch 247, time 215.16, cls_loss 0.0013 cls_loss_mapping 0.0023 cls_loss_causal 0.4831 re_mapping 0.0051 re_causal 0.0152 /// teacc 99.06 lr 0.00010000 Epoch 249, weight, value: tensor([[-0.2400, 0.0831, -0.1157, ..., -0.0302, -0.1743, -0.1519], [ 0.0110, 0.0998, -0.1162, ..., -0.1127, -0.0423, 0.1017], [ 0.0490, -0.1189, -0.1273, ..., 0.0004, -0.0534, -0.0834], ..., [ 0.0854, -0.0406, 0.1263, ..., 0.0657, 0.1686, -0.0063], [ 0.1065, -0.1852, -0.1261, ..., -0.2155, -0.0524, 0.1794], [-0.1501, 0.0791, 0.0447, ..., -0.2130, -0.1062, -0.0412]], device='cuda:0'), grad: tensor([[ 1.7136e-07, -1.5140e-05, 2.4773e-07, ..., 2.0489e-08, 1.3225e-07, 6.4261e-08], [-2.3283e-08, -2.0415e-06, 8.4937e-07, ..., 6.7055e-08, 4.7591e-07, -1.9725e-06], [-8.6613e-08, 1.2107e-07, 6.3330e-08, ..., -1.2293e-07, -2.2352e-08, 1.4901e-08], ..., [-2.7828e-06, 3.2503e-07, -4.6566e-06, ..., -2.7381e-07, -2.6468e-06, 2.3562e-07], [ 6.8638e-07, 1.1418e-06, 7.8324e-07, ..., 1.1269e-07, 4.5076e-07, 6.2026e-07], [ 1.0822e-06, 3.1665e-07, 1.4426e-06, ..., 9.5926e-08, 9.6764e-07, 1.2573e-07]], device='cuda:0') Epoch 249, bias, value: tensor([-0.0090, -0.0192, -0.0071, -0.0184, -0.0054, 0.0056, 0.0073, 0.0205, 0.0145, -0.0112], device='cuda:0'), grad: tensor([-2.5705e-05, -3.3304e-06, 4.6380e-07, 8.6799e-07, 7.6555e-07, 5.7556e-06, 1.5259e-05, -5.9940e-06, 8.7470e-06, 3.1814e-06], device='cuda:0') 100 0.0001 changing lr epoch 248, time 215.16, cls_loss 0.0014 cls_loss_mapping 0.0025 cls_loss_causal 0.4550 re_mapping 0.0054 re_causal 0.0148 /// teacc 98.96 lr 0.00010000 Epoch 250, weight, value: tensor([[-0.2409, 0.0832, -0.1162, ..., -0.0302, -0.1757, -0.1521], [ 0.0103, 0.1000, -0.1170, ..., -0.1135, -0.0430, 0.1017], [ 0.0489, -0.1191, -0.1276, ..., 0.0004, -0.0535, -0.0836], ..., [ 0.0862, -0.0409, 0.1274, ..., 0.0662, 0.1695, -0.0062], [ 0.1065, -0.1863, -0.1267, ..., -0.2168, -0.0527, 0.1801], [-0.1509, 0.0791, 0.0448, ..., -0.2152, -0.1066, -0.0414]], device='cuda:0'), grad: tensor([[ 2.7940e-09, -3.7253e-08, 8.3819e-09, ..., 2.1420e-08, 1.8626e-09, -1.8626e-09], [ 3.6322e-08, 1.3039e-08, 5.5879e-08, ..., 7.5437e-08, 3.4459e-08, -2.7008e-08], [-1.0151e-07, 2.7008e-08, 3.2596e-08, ..., -2.3078e-06, 1.3970e-08, 5.5879e-09], ..., [-9.9652e-08, 1.8626e-08, -1.2480e-07, ..., -4.3772e-08, -9.0338e-08, 1.6764e-08], [ 1.0990e-07, 1.0803e-07, 1.7695e-08, ..., 2.1569e-06, 7.4506e-09, -3.7253e-09], [ 2.1420e-08, 1.0617e-07, 1.5832e-08, ..., 2.3283e-08, 1.6764e-08, 5.7742e-08]], device='cuda:0') Epoch 250, bias, value: tensor([-0.0091, -0.0196, -0.0072, -0.0188, -0.0055, 0.0056, 0.0076, 0.0210, 0.0143, -0.0113], device='cuda:0'), grad: tensor([ 1.5832e-08, 2.4866e-07, -8.7023e-06, -1.3523e-05, -6.0536e-08, 1.2204e-05, 8.1770e-07, -8.1025e-08, 8.7395e-06, 3.4273e-07], device='cuda:0') 100 0.0001 changing lr epoch 249, time 215.02, cls_loss 0.0013 cls_loss_mapping 0.0022 cls_loss_causal 0.5108 re_mapping 0.0050 re_causal 0.0152 /// teacc 98.96 lr 0.00010000 Epoch 251, weight, value: tensor([[-0.2420, 0.0833, -0.1173, ..., -0.0305, -0.1769, -0.1525], [ 0.0107, 0.1001, -0.1149, ..., -0.1106, -0.0440, 0.1015], [ 0.0491, -0.1192, -0.1278, ..., 0.0009, -0.0534, -0.0837], ..., [ 0.0860, -0.0410, 0.1254, ..., 0.0634, 0.1704, -0.0060], [ 0.1067, -0.1871, -0.1268, ..., -0.2175, -0.0528, 0.1806], [-0.1518, 0.0792, 0.0450, ..., -0.2166, -0.1069, -0.0417]], device='cuda:0'), grad: tensor([[ 1.3504e-08, 8.8988e-07, 5.2759e-07, ..., 8.3959e-07, 1.8626e-09, 8.8941e-08], [ 6.9803e-07, -7.1712e-08, 9.4296e-07, ..., 6.2026e-07, 5.5181e-07, -1.2107e-08], [-6.5193e-09, 8.6613e-08, 1.3970e-07, ..., 1.3271e-07, 8.8476e-09, 3.5390e-08], ..., [-1.3653e-06, 9.9186e-08, -1.3905e-06, ..., -6.9663e-07, -1.0515e-06, -1.8394e-07], [-1.8207e-07, 8.4657e-07, 6.8685e-07, ..., 9.4669e-07, 1.7323e-07, -3.7719e-07], [ 3.9861e-07, 2.9663e-07, 3.1479e-07, ..., 3.2736e-07, 5.3085e-08, 3.8883e-07]], device='cuda:0') Epoch 251, bias, value: tensor([-0.0091, -0.0183, -0.0070, -0.0186, -0.0056, 0.0056, 0.0076, 0.0196, 0.0143, -0.0113], device='cuda:0'), grad: tensor([ 2.7549e-06, 1.7229e-06, 3.8370e-07, -7.3537e-06, 1.1362e-07, 1.0133e-06, 7.4506e-09, -2.5444e-06, 1.7332e-06, 2.1756e-06], device='cuda:0') 100 0.0001 changing lr epoch 250, time 214.84, cls_loss 0.0012 cls_loss_mapping 0.0019 cls_loss_causal 0.4859 re_mapping 0.0050 re_causal 0.0145 /// teacc 99.00 lr 0.00010000 Epoch 252, weight, value: tensor([[-0.2422, 0.0837, -0.1177, ..., -0.0307, -0.1782, -0.1524], [ 0.0106, 0.1002, -0.1150, ..., -0.1106, -0.0442, 0.1018], [ 0.0492, -0.1193, -0.1280, ..., 0.0010, -0.0533, -0.0839], ..., [ 0.0860, -0.0414, 0.1256, ..., 0.0635, 0.1706, -0.0062], [ 0.1068, -0.1875, -0.1269, ..., -0.2177, -0.0529, 0.1808], [-0.1525, 0.0791, 0.0451, ..., -0.2176, -0.1072, -0.0420]], device='cuda:0'), grad: tensor([[ 3.2596e-09, 3.3528e-08, 1.8626e-09, ..., 1.8626e-09, 9.3132e-10, 1.8626e-09], [-4.1444e-08, -1.2154e-07, 2.1886e-08, ..., 5.5879e-09, 1.7695e-08, -2.4727e-07], [-2.0955e-08, 7.4506e-09, 2.3283e-09, ..., -4.5635e-08, -1.4901e-08, 1.2107e-08], ..., [ 9.3132e-10, 1.1036e-07, -5.9139e-08, ..., 6.0536e-09, -3.8650e-08, 1.9232e-07], [-4.6566e-09, 2.9057e-07, 3.8184e-08, ..., 9.3132e-09, 6.0536e-09, -1.2107e-08], [ 3.1199e-08, -5.0291e-08, -4.3772e-08, ..., 3.2596e-09, 2.2817e-08, 1.5367e-08]], device='cuda:0') Epoch 252, bias, value: tensor([-0.0091, -0.0183, -0.0071, -0.0186, -0.0054, 0.0055, 0.0077, 0.0196, 0.0143, -0.0116], device='cuda:0'), grad: tensor([ 5.6578e-07, -3.3295e-07, 1.8626e-09, 1.7649e-07, 4.3958e-06, -6.6590e-07, -6.0238e-06, 3.3155e-07, 1.5991e-06, -6.5658e-08], device='cuda:0') 100 0.0001 changing lr epoch 251, time 214.84, cls_loss 0.0011 cls_loss_mapping 0.0021 cls_loss_causal 0.5119 re_mapping 0.0053 re_causal 0.0161 /// teacc 98.99 lr 0.00010000 Epoch 253, weight, value: tensor([[-0.2419, 0.0830, -0.1185, ..., -0.0310, -0.1787, -0.1522], [ 0.0114, 0.1002, -0.1149, ..., -0.1107, -0.0437, 0.1025], [ 0.0493, -0.1194, -0.1281, ..., 0.0012, -0.0532, -0.0840], ..., [ 0.0854, -0.0418, 0.1256, ..., 0.0635, 0.1704, -0.0069], [ 0.1070, -0.1878, -0.1270, ..., -0.2180, -0.0529, 0.1810], [-0.1537, 0.0787, 0.0442, ..., -0.2211, -0.1083, -0.0426]], device='cuda:0'), grad: tensor([[ 6.5193e-08, -8.5635e-07, 1.8626e-09, ..., 3.7719e-08, 5.2154e-08, 8.8476e-09], [ 1.3877e-07, -5.0571e-07, 4.1910e-09, ..., 1.1642e-07, 1.6345e-07, -1.1120e-06], [-1.1232e-06, 4.0978e-08, 7.4506e-09, ..., -6.5705e-07, -9.0944e-07, 6.2864e-08], ..., [ 4.5123e-07, 1.6950e-07, -1.7229e-08, ..., 2.5611e-07, 3.4971e-07, 3.1898e-07], [ 1.6764e-07, 1.3737e-07, 1.3970e-08, ..., 1.2387e-07, 1.7043e-07, -9.3132e-10], [ 3.7253e-08, 4.0513e-08, -4.8894e-08, ..., 1.9558e-08, 2.7474e-08, 7.2177e-08]], device='cuda:0') Epoch 253, bias, value: tensor([-0.0098, -0.0179, -0.0070, -0.0187, -0.0043, 0.0054, 0.0079, 0.0191, 0.0143, -0.0124], device='cuda:0'), grad: tensor([-5.1595e-07, -1.4920e-06, -3.0696e-06, 3.1106e-07, 7.1712e-07, 2.5779e-06, -2.4065e-06, 2.0899e-06, 1.5628e-06, 2.0815e-07], device='cuda:0') 100 0.0001 changing lr epoch 252, time 215.00, cls_loss 0.0011 cls_loss_mapping 0.0023 cls_loss_causal 0.5275 re_mapping 0.0053 re_causal 0.0157 /// teacc 99.02 lr 0.00010000 Epoch 254, weight, value: tensor([[-0.2421, 0.0831, -0.1189, ..., -0.0314, -0.1803, -0.1519], [ 0.0116, 0.1003, -0.1149, ..., -0.1107, -0.0436, 0.1027], [ 0.0494, -0.1194, -0.1280, ..., 0.0015, -0.0531, -0.0839], ..., [ 0.0853, -0.0421, 0.1256, ..., 0.0634, 0.1704, -0.0071], [ 0.1070, -0.1886, -0.1271, ..., -0.2185, -0.0530, 0.1812], [-0.1543, 0.0784, 0.0440, ..., -0.2220, -0.1086, -0.0430]], device='cuda:0'), grad: tensor([[ 2.3469e-07, -1.6764e-08, 1.1176e-07, ..., 6.7055e-08, 2.0489e-07, 5.5879e-08], [ 9.0338e-07, -6.8918e-08, 4.8243e-07, ..., 2.7567e-07, 8.4937e-07, -1.3039e-08], [ 3.5167e-06, 3.7253e-08, 3.3975e-06, ..., -1.3281e-06, 4.8913e-06, 1.6503e-06], ..., [-8.3372e-06, 3.7253e-08, -5.2974e-06, ..., -2.4028e-07, -8.9034e-06, -2.2966e-06], [ 1.1418e-06, 5.0291e-08, 4.9919e-07, ..., 4.1537e-07, 9.3877e-07, 2.0117e-07], [ 1.0245e-06, -1.0561e-06, -3.8743e-07, ..., 2.7567e-07, 9.3319e-07, 2.2911e-07]], device='cuda:0') Epoch 254, bias, value: tensor([-0.0098, -0.0177, -0.0069, -0.0187, -0.0040, 0.0055, 0.0078, 0.0190, 0.0140, -0.0128], device='cuda:0'), grad: tensor([ 5.1409e-07, 1.7826e-06, 8.9332e-06, 6.0536e-07, 2.7213e-06, 1.3504e-06, 3.2969e-07, -1.8716e-05, 2.6114e-06, -1.6950e-07], device='cuda:0') 100 0.0001 changing lr epoch 253, time 215.09, cls_loss 0.0013 cls_loss_mapping 0.0022 cls_loss_causal 0.5052 re_mapping 0.0050 re_causal 0.0150 /// teacc 98.99 lr 0.00010000 Epoch 255, weight, value: tensor([[-0.2419, 0.0834, -0.1195, ..., -0.0318, -0.1823, -0.1517], [ 0.0143, 0.1003, -0.1124, ..., -0.1107, -0.0428, 0.1040], [ 0.0496, -0.1195, -0.1285, ..., 0.0017, -0.0531, -0.0840], ..., [ 0.0828, -0.0424, 0.1233, ..., 0.0635, 0.1698, -0.0086], [ 0.1071, -0.1883, -0.1275, ..., -0.2192, -0.0533, 0.1818], [-0.1555, 0.0785, 0.0440, ..., -0.2232, -0.1094, -0.0432]], device='cuda:0'), grad: tensor([[ 5.7742e-08, -2.5891e-07, 1.4901e-08, ..., 1.8626e-09, 1.8626e-09, 1.4156e-07], [ 7.8231e-08, 7.4506e-09, 5.9605e-08, ..., 2.6077e-08, 5.7742e-08, 4.6566e-08], [ 8.5682e-08, 1.8626e-09, 5.0291e-08, ..., -1.8626e-09, 3.1665e-08, 1.4901e-07], ..., [-1.1362e-07, 3.7253e-09, -1.1548e-07, ..., -4.0978e-08, -1.2293e-07, 2.0489e-08], [-2.5332e-07, 1.5460e-07, -5.2154e-08, ..., 5.5879e-09, 9.3132e-09, -6.7055e-07], [ 2.9802e-08, 2.1793e-07, -0.0000e+00, ..., 1.8626e-09, 7.4506e-09, 3.9116e-08]], device='cuda:0') Epoch 255, bias, value: tensor([-0.0097, -0.0157, -0.0069, -0.0188, -0.0041, 0.0054, 0.0080, 0.0168, 0.0143, -0.0130], device='cuda:0'), grad: tensor([-7.7672e-07, 2.5518e-07, 4.0233e-07, 3.4794e-06, 6.1467e-08, -4.1127e-06, 1.0971e-06, -1.2666e-07, -1.1064e-06, 8.1211e-07], device='cuda:0') 100 0.0001 changing lr epoch 254, time 215.17, cls_loss 0.0011 cls_loss_mapping 0.0020 cls_loss_causal 0.4893 re_mapping 0.0052 re_causal 0.0149 /// teacc 98.92 lr 0.00010000 Epoch 256, weight, value: tensor([[-0.2431, 0.0835, -0.1203, ..., -0.0323, -0.1831, -0.1519], [ 0.0138, 0.1001, -0.1128, ..., -0.1108, -0.0435, 0.1038], [ 0.0497, -0.1196, -0.1287, ..., 0.0019, -0.0531, -0.0840], ..., [ 0.0831, -0.0423, 0.1237, ..., 0.0634, 0.1705, -0.0084], [ 0.1076, -0.1888, -0.1275, ..., -0.2195, -0.0530, 0.1826], [-0.1561, 0.0787, 0.0444, ..., -0.2240, -0.1096, -0.0435]], device='cuda:0'), grad: tensor([[ 4.4703e-08, -1.1735e-07, 3.7253e-08, ..., 1.8626e-09, 1.3039e-08, 1.4901e-08], [ 1.3411e-07, 5.5879e-09, 1.8068e-07, ..., 7.6368e-08, 1.1176e-08, 1.2666e-07], [ 8.9407e-08, 1.8626e-08, 3.5390e-08, ..., 0.0000e+00, 9.3132e-09, 1.0990e-07], ..., [-5.1260e-06, 1.1921e-07, -3.6880e-06, ..., 3.2783e-07, -1.7062e-06, 1.0431e-07], [-8.7544e-07, 6.5193e-08, 5.0291e-08, ..., 5.5879e-09, 1.6764e-08, -1.4715e-06], [ 1.3597e-07, 3.5018e-07, 3.9116e-07, ..., 1.3970e-07, 4.4703e-08, 1.9744e-07]], device='cuda:0') Epoch 256, bias, value: tensor([-0.0097, -0.0160, -0.0068, -0.0188, -0.0044, 0.0053, 0.0081, 0.0171, 0.0145, -0.0126], device='cuda:0'), grad: tensor([-1.5087e-07, 6.1467e-07, 3.4086e-07, -2.9430e-07, -1.2778e-06, 9.0078e-06, 1.7881e-07, -7.0259e-06, -3.0436e-06, 1.6373e-06], device='cuda:0') 100 0.0001 changing lr epoch 255, time 214.98, cls_loss 0.0010 cls_loss_mapping 0.0024 cls_loss_causal 0.5034 re_mapping 0.0053 re_causal 0.0157 /// teacc 98.92 lr 0.00010000 Epoch 257, weight, value: tensor([[-0.2435, 0.0839, -0.1208, ..., -0.0324, -0.1838, -0.1515], [ 0.0134, 0.1002, -0.1131, ..., -0.1109, -0.0446, 0.1034], [ 0.0498, -0.1196, -0.1289, ..., 0.0020, -0.0531, -0.0841], ..., [ 0.0836, -0.0424, 0.1241, ..., 0.0634, 0.1717, -0.0080], [ 0.1080, -0.1892, -0.1276, ..., -0.2197, -0.0530, 0.1832], [-0.1574, 0.0787, 0.0441, ..., -0.2247, -0.1110, -0.0438]], device='cuda:0'), grad: tensor([[ 2.0489e-08, -2.9616e-07, 2.7940e-08, ..., 1.6764e-08, 1.8626e-08, 7.4506e-09], [ 3.0361e-07, -3.1479e-07, 2.7195e-07, ..., 2.0675e-07, 2.2724e-07, -9.7416e-07], [-2.6822e-07, 2.4214e-08, 3.3528e-08, ..., -3.4831e-07, -7.8231e-08, 3.9116e-08], ..., [-6.3144e-07, -6.8918e-08, -1.1120e-06, ..., 3.5390e-08, -7.3947e-07, 8.1956e-08], [ 6.3330e-08, 7.8231e-08, 1.0431e-07, ..., 1.4901e-07, 4.4703e-08, 5.0291e-08], [ 4.2468e-07, 1.7881e-07, 6.5938e-07, ..., 2.9802e-08, 4.6194e-07, 2.9802e-08]], device='cuda:0') Epoch 257, bias, value: tensor([-0.0091, -0.0163, -0.0068, -0.0185, -0.0044, 0.0050, 0.0080, 0.0175, 0.0149, -0.0128], device='cuda:0'), grad: tensor([-6.4261e-07, -9.9652e-07, -9.5367e-07, -4.1164e-07, -9.3132e-09, 4.4703e-07, 1.7751e-06, -7.9535e-07, 5.7183e-07, 9.9652e-07], device='cuda:0') 100 0.0001 changing lr epoch 256, time 215.12, cls_loss 0.0012 cls_loss_mapping 0.0023 cls_loss_causal 0.5248 re_mapping 0.0052 re_causal 0.0152 /// teacc 99.01 lr 0.00010000 Epoch 258, weight, value: tensor([[-0.2447, 0.0843, -0.1217, ..., -0.0365, -0.1850, -0.1525], [ 0.0132, 0.1002, -0.1133, ..., -0.1109, -0.0452, 0.1035], [ 0.0498, -0.1197, -0.1292, ..., 0.0020, -0.0531, -0.0847], ..., [ 0.0839, -0.0426, 0.1244, ..., 0.0634, 0.1723, -0.0079], [ 0.1081, -0.1900, -0.1278, ..., -0.2203, -0.0530, 0.1833], [-0.1582, 0.0784, 0.0442, ..., -0.2254, -0.1116, -0.0439]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.0864e-06, 5.5879e-09, ..., 6.2101e-06, 0.0000e+00, 1.8626e-09], [ 3.7253e-09, 3.7253e-08, 2.6077e-08, ..., 3.3528e-08, 3.7253e-09, -1.0058e-07], [ 5.5879e-09, 9.8720e-08, 5.5879e-09, ..., 1.9558e-07, 1.8626e-09, 3.1665e-08], ..., [-7.4506e-09, 2.9802e-08, 1.1176e-08, ..., 3.7253e-09, -7.4506e-09, 5.4017e-08], [-6.8918e-08, 8.7544e-08, 2.6077e-08, ..., 1.3225e-07, 1.8626e-09, -1.3411e-07], [ 2.0489e-08, 5.5879e-09, -6.3330e-07, ..., 5.5879e-09, 0.0000e+00, 6.7241e-07]], device='cuda:0') Epoch 258, bias, value: tensor([-0.0094, -0.0163, -0.0069, -0.0176, -0.0042, 0.0050, 0.0080, 0.0176, 0.0146, -0.0132], device='cuda:0'), grad: tensor([ 2.5183e-05, 1.4156e-07, 8.6613e-07, -2.9266e-05, -5.2154e-08, 1.8068e-06, 3.0920e-07, 1.5646e-07, 3.8929e-07, 4.8056e-07], device='cuda:0') 100 0.0001 changing lr epoch 257, time 214.96, cls_loss 0.0008 cls_loss_mapping 0.0018 cls_loss_causal 0.4852 re_mapping 0.0050 re_causal 0.0154 /// teacc 98.90 lr 0.00010000 Epoch 259, weight, value: tensor([[-0.2451, 0.0847, -0.1220, ..., -0.0369, -0.1855, -0.1528], [ 0.0132, 0.1001, -0.1134, ..., -0.1109, -0.0454, 0.1036], [ 0.0501, -0.1197, -0.1292, ..., 0.0021, -0.0527, -0.0848], ..., [ 0.0839, -0.0428, 0.1244, ..., 0.0634, 0.1724, -0.0080], [ 0.1082, -0.1906, -0.1281, ..., -0.2205, -0.0530, 0.1837], [-0.1586, 0.0784, 0.0446, ..., -0.2256, -0.1118, -0.0442]], device='cuda:0'), grad: tensor([[-1.4901e-08, 5.2899e-07, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 7.4506e-09], [-5.5879e-09, 6.8918e-08, 3.7253e-09, ..., 1.8626e-09, 3.7253e-09, -8.5682e-08], [ 0.0000e+00, 3.5577e-07, 0.0000e+00, ..., -3.7253e-09, -1.8626e-09, 4.8429e-08], ..., [-5.5879e-09, 5.9605e-08, -5.5879e-09, ..., -1.8626e-09, -5.5879e-09, 3.7253e-08], [-3.9116e-08, 3.5390e-08, 1.8626e-09, ..., 1.8626e-09, 0.0000e+00, -4.8429e-08], [ 2.4214e-08, 2.0675e-07, -1.1176e-08, ..., 0.0000e+00, 0.0000e+00, 2.9802e-08]], device='cuda:0') Epoch 259, bias, value: tensor([-0.0090, -0.0164, -0.0067, -0.0171, -0.0041, 0.0049, 0.0078, 0.0175, 0.0146, -0.0133], device='cuda:0'), grad: tensor([ 3.1069e-06, 6.5006e-07, 1.6298e-06, 5.2154e-08, -7.8008e-06, 3.5390e-08, 1.2759e-06, 1.8440e-07, 1.6764e-08, 8.3447e-07], device='cuda:0') 100 0.0001 changing lr epoch 258, time 214.62, cls_loss 0.0011 cls_loss_mapping 0.0022 cls_loss_causal 0.5221 re_mapping 0.0050 re_causal 0.0159 /// teacc 99.01 lr 0.00010000 Epoch 260, weight, value: tensor([[-0.2457, 0.0848, -0.1224, ..., -0.0369, -0.1862, -0.1535], [ 0.0132, 0.0996, -0.1134, ..., -0.1110, -0.0454, 0.1039], [ 0.0496, -0.1199, -0.1301, ..., 0.0019, -0.0531, -0.0850], ..., [ 0.0840, -0.0429, 0.1246, ..., 0.0636, 0.1727, -0.0081], [ 0.1082, -0.1912, -0.1284, ..., -0.2207, -0.0532, 0.1838], [-0.1571, 0.0811, 0.0483, ..., -0.2260, -0.1120, -0.0412]], device='cuda:0'), grad: tensor([[ 1.3039e-08, -6.9477e-07, 3.7253e-09, ..., 1.8626e-09, 1.8626e-09, -2.8685e-07], [-1.5832e-07, -4.8988e-07, 1.1176e-08, ..., 0.0000e+00, -1.8626e-09, -6.5751e-07], [ 3.7253e-08, 2.9802e-07, 3.7253e-09, ..., -5.5879e-09, -1.8626e-09, 2.7940e-07], ..., [ 3.1665e-08, 1.9744e-07, 7.4506e-09, ..., 0.0000e+00, -7.4506e-09, 2.6636e-07], [-1.6764e-08, 5.0105e-07, 4.2841e-08, ..., 1.8626e-09, 1.8626e-09, 2.0862e-07], [ 3.3528e-08, 1.9550e-05, 3.0827e-06, ..., 0.0000e+00, 3.7253e-09, 9.1046e-06]], device='cuda:0') Epoch 260, bias, value: tensor([-0.0089, -0.0163, -0.0069, -0.0176, -0.0069, 0.0051, 0.0078, 0.0175, 0.0144, -0.0105], device='cuda:0'), grad: tensor([ 2.7753e-07, -8.9779e-07, 1.3839e-06, 5.3458e-07, -4.9055e-05, 1.1437e-06, -1.1787e-05, 5.9605e-07, 8.2403e-06, 4.9531e-05], device='cuda:0') 100 0.0001 changing lr epoch 259, time 214.75, cls_loss 0.0016 cls_loss_mapping 0.0021 cls_loss_causal 0.4975 re_mapping 0.0051 re_causal 0.0146 /// teacc 98.92 lr 0.00010000 Epoch 261, weight, value: tensor([[-0.2455, 0.0874, -0.1217, ..., -0.0370, -0.1879, -0.1514], [ 0.0144, 0.1018, -0.1129, ..., -0.1111, -0.0441, 0.1055], [ 0.0494, -0.1202, -0.1311, ..., 0.0018, -0.0534, -0.0853], ..., [ 0.0830, -0.0454, 0.1242, ..., 0.0637, 0.1718, -0.0096], [ 0.1081, -0.1929, -0.1288, ..., -0.2212, -0.0533, 0.1838], [-0.1579, 0.0818, 0.0505, ..., -0.2265, -0.1126, -0.0397]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -3.7253e-09, 5.5879e-09, ..., 1.8626e-09, 1.8626e-09, 1.1176e-08], [ 8.0094e-08, 1.3039e-08, 1.4901e-08, ..., 8.5682e-08, 7.8231e-08, 2.4214e-08], [-8.9407e-08, 1.4901e-08, 7.4506e-09, ..., -1.0058e-07, -8.7544e-08, 1.4901e-08], ..., [-3.7253e-09, 9.8720e-08, 3.7253e-09, ..., 5.5879e-09, -7.4506e-09, 8.9407e-08], [-1.3039e-08, 1.4342e-07, 2.2911e-07, ..., 1.8626e-09, 1.8626e-09, 6.3330e-08], [ 9.3132e-09, 3.5949e-07, -6.5193e-07, ..., 3.7253e-09, 9.3132e-09, 4.1537e-07]], device='cuda:0') Epoch 261, bias, value: tensor([-0.0062, -0.0156, -0.0070, -0.0175, -0.0089, 0.0049, 0.0078, 0.0169, 0.0138, -0.0093], device='cuda:0'), grad: tensor([ 3.5390e-08, 5.0105e-07, -3.8929e-07, 3.2410e-07, -1.8664e-06, -1.8291e-06, 1.1008e-06, 4.0047e-07, 1.2387e-06, 4.8056e-07], device='cuda:0') 100 0.0001 changing lr epoch 260, time 214.89, cls_loss 0.0018 cls_loss_mapping 0.0029 cls_loss_causal 0.5069 re_mapping 0.0052 re_causal 0.0148 /// teacc 99.00 lr 0.00010000 Epoch 262, weight, value: tensor([[-0.2473, 0.0877, -0.1237, ..., -0.0374, -0.1915, -0.1524], [ 0.0154, 0.1030, -0.1123, ..., -0.1111, -0.0429, 0.1066], [ 0.0495, -0.1200, -0.1314, ..., 0.0020, -0.0533, -0.0854], ..., [ 0.0824, -0.0467, 0.1239, ..., 0.0641, 0.1713, -0.0104], [ 0.1082, -0.1944, -0.1292, ..., -0.2219, -0.0536, 0.1839], [-0.1606, 0.0816, 0.0506, ..., -0.2274, -0.1145, -0.0401]], device='cuda:0'), grad: tensor([[ 9.6485e-07, 2.7195e-07, 5.5879e-09, ..., 4.7870e-07, 4.9360e-07, 3.2969e-07], [-3.3714e-07, -1.4827e-06, 9.3132e-09, ..., 1.5460e-07, 3.1665e-08, -1.3616e-06], [-1.3784e-06, 5.0291e-08, 3.7253e-09, ..., -1.1101e-06, -7.9907e-07, 4.6566e-08], ..., [ 1.1362e-07, 9.8720e-08, -1.4901e-08, ..., 6.7055e-08, 4.2841e-08, 1.0431e-07], [ 3.8929e-07, 3.1292e-07, 3.7253e-09, ..., 1.7695e-07, 1.8068e-07, 2.6822e-07], [ 3.1665e-08, 1.5087e-07, -2.4214e-08, ..., 5.5879e-09, 5.5879e-09, 8.1956e-08]], device='cuda:0') Epoch 262, bias, value: tensor([-0.0061, -0.0147, -0.0067, -0.0197, -0.0088, 0.0052, 0.0075, 0.0162, 0.0133, -0.0096], device='cuda:0'), grad: tensor([ 3.8221e-06, -2.8405e-06, -7.5772e-06, 1.4342e-07, 6.5938e-07, 3.2037e-07, 2.2147e-06, 7.4878e-07, 1.9968e-06, 4.4890e-07], device='cuda:0') 100 0.0001 changing lr epoch 261, time 214.94, cls_loss 0.0010 cls_loss_mapping 0.0019 cls_loss_causal 0.4980 re_mapping 0.0051 re_causal 0.0154 /// teacc 99.03 lr 0.00010000 Epoch 263, weight, value: tensor([[-0.2485, 0.0876, -0.1245, ..., -0.0379, -0.1936, -0.1534], [ 0.0156, 0.1036, -0.1119, ..., -0.1107, -0.0428, 0.1069], [ 0.0495, -0.1203, -0.1316, ..., 0.0020, -0.0533, -0.0857], ..., [ 0.0822, -0.0468, 0.1236, ..., 0.0638, 0.1714, -0.0105], [ 0.1082, -0.1957, -0.1295, ..., -0.2225, -0.0537, 0.1839], [-0.1615, 0.0813, 0.0504, ..., -0.2288, -0.1153, -0.0407]], device='cuda:0'), grad: tensor([[ 1.4901e-08, 1.8626e-09, 1.8626e-09, ..., 5.5879e-09, 7.4506e-09, 5.5879e-09], [ 8.2701e-07, -1.0431e-07, 5.4017e-08, ..., 1.6764e-08, 3.0361e-07, -4.5635e-07], [ 1.3225e-07, 3.9116e-08, 2.4214e-08, ..., -2.2352e-08, 5.0291e-08, 2.1607e-07], ..., [-1.5065e-05, 7.2643e-08, -7.8045e-07, ..., 1.4901e-08, -7.1973e-06, -7.7561e-06], [ 4.5821e-07, 8.0094e-08, 2.6077e-08, ..., 5.5879e-09, 2.2538e-07, 3.4831e-07], [ 2.9802e-08, 5.5879e-09, 5.5879e-09, ..., 1.8626e-09, 1.6764e-08, 2.2352e-08]], device='cuda:0') Epoch 263, bias, value: tensor([-0.0064, -0.0142, -0.0068, -0.0195, -0.0086, 0.0051, 0.0074, 0.0159, 0.0129, -0.0099], device='cuda:0'), grad: tensor([ 3.5763e-07, -5.0478e-07, 6.2585e-07, 1.4827e-06, 2.2441e-05, -5.9307e-06, -4.7088e-06, -1.8701e-05, 4.8429e-06, 8.0094e-08], device='cuda:0') 100 0.0001 changing lr epoch 262, time 215.14, cls_loss 0.0012 cls_loss_mapping 0.0021 cls_loss_causal 0.5000 re_mapping 0.0051 re_causal 0.0147 /// teacc 98.99 lr 0.00010000 Epoch 264, weight, value: tensor([[-2.4934e-01, 8.7733e-02, -1.2530e-01, ..., -3.8183e-02, -1.9506e-01, -1.5312e-01], [ 1.4994e-02, 1.0367e-01, -1.1237e-01, ..., -1.1099e-01, -4.4027e-02, 1.0671e-01], [ 4.9086e-02, -1.2046e-01, -1.3335e-01, ..., -5.3777e-06, -5.3792e-02, -8.5791e-02], ..., [ 8.3102e-02, -4.6851e-02, 1.2423e-01, ..., 6.4068e-02, 1.7311e-01, -1.0195e-02], [ 1.0853e-01, -1.9584e-01, -1.3002e-01, ..., -2.2325e-01, -5.3950e-02, 1.8454e-01], [-1.6298e-01, 8.1286e-02, 5.0317e-02, ..., -2.3228e-01, -1.1637e-01, -4.0816e-02]], device='cuda:0'), grad: tensor([[ 1.3039e-08, 5.7742e-08, 3.3528e-08, ..., 0.0000e+00, 1.8626e-09, 1.4901e-08], [ 3.3528e-07, 2.4028e-06, 8.1956e-08, ..., 3.7253e-09, 1.4901e-08, 1.5050e-06], [ 9.4995e-08, 1.8626e-08, 5.5879e-09, ..., 0.0000e+00, 1.8626e-09, 9.4995e-08], ..., [ 3.7253e-09, 2.9989e-07, 4.4890e-07, ..., -7.4506e-09, -1.6764e-08, 5.5879e-08], [-1.4901e-08, 2.1048e-07, 2.4959e-07, ..., 0.0000e+00, 7.4506e-09, -8.0094e-08], [-5.4017e-08, -1.2666e-07, -1.1176e-06, ..., 1.8626e-09, -1.3039e-08, 2.1979e-07]], device='cuda:0') Epoch 264, bias, value: tensor([-0.0064, -0.0145, -0.0077, -0.0170, -0.0086, 0.0046, 0.0074, 0.0165, 0.0133, -0.0100], device='cuda:0'), grad: tensor([ 1.2666e-06, 6.0350e-06, 4.9733e-07, 2.6636e-07, -8.5086e-06, -4.9360e-07, 1.0990e-07, 8.8476e-07, 7.1712e-07, -8.1770e-07], device='cuda:0') 100 0.0001 changing lr epoch 263, time 215.09, cls_loss 0.0010 cls_loss_mapping 0.0018 cls_loss_causal 0.5175 re_mapping 0.0050 re_causal 0.0152 /// teacc 98.96 lr 0.00010000 Epoch 265, weight, value: tensor([[-2.5100e-01, 8.7694e-02, -1.2624e-01, ..., -3.8266e-02, -1.9723e-01, -1.5306e-01], [ 1.4832e-02, 1.0374e-01, -1.1252e-01, ..., -1.1112e-01, -4.4380e-02, 1.0675e-01], [ 4.9198e-02, -1.2054e-01, -1.3353e-01, ..., 1.3101e-04, -5.3759e-02, -8.5866e-02], ..., [ 8.3332e-02, -4.6919e-02, 1.2442e-01, ..., 6.4142e-02, 1.7354e-01, -1.0191e-02], [ 1.0858e-01, -1.9649e-01, -1.3041e-01, ..., -2.2369e-01, -5.4105e-02, 1.8475e-01], [-1.6355e-01, 8.1277e-02, 5.0412e-02, ..., -2.3367e-01, -1.1674e-01, -4.0880e-02]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 4.2841e-08, 3.1665e-08, ..., 5.5879e-09, 0.0000e+00, 2.6077e-08], [ 1.6764e-08, 8.3819e-06, 7.3165e-06, ..., 8.5682e-08, 1.6764e-08, 4.0159e-06], [ 0.0000e+00, 4.4703e-08, 4.4703e-08, ..., 9.3132e-09, 0.0000e+00, 1.8626e-08], ..., [-5.2154e-08, 1.3132e-06, 1.1921e-06, ..., 1.0915e-06, -3.9116e-08, 2.7567e-07], [ 5.5879e-09, 8.6613e-07, 7.6555e-07, ..., 2.4959e-07, 5.5879e-09, 3.3900e-07], [ 7.4506e-09, 6.2287e-06, 5.2415e-06, ..., 3.5390e-08, 1.8626e-09, 3.0864e-06]], device='cuda:0') Epoch 265, bias, value: tensor([-0.0066, -0.0146, -0.0077, -0.0169, -0.0086, 0.0046, 0.0073, 0.0166, 0.0131, -0.0100], device='cuda:0'), grad: tensor([ 1.6019e-07, 1.8045e-05, 9.6858e-08, -9.3877e-05, 2.1979e-07, 5.5939e-05, -5.5879e-09, 3.8669e-06, 2.1420e-06, 1.3404e-05], device='cuda:0') 100 0.0001 changing lr epoch 264, time 214.84, cls_loss 0.0012 cls_loss_mapping 0.0015 cls_loss_causal 0.4862 re_mapping 0.0052 re_causal 0.0149 /// teacc 98.88 lr 0.00010000 Epoch 266, weight, value: tensor([[-0.2520, 0.0872, -0.1269, ..., -0.0384, -0.1986, -0.1532], [ 0.0145, 0.1036, -0.1128, ..., -0.1113, -0.0450, 0.1067], [ 0.0496, -0.1206, -0.1340, ..., 0.0007, -0.0535, -0.0859], ..., [ 0.0837, -0.0470, 0.1249, ..., 0.0643, 0.1743, -0.0102], [ 0.1086, -0.1972, -0.1310, ..., -0.2245, -0.0543, 0.1848], [-0.1635, 0.0827, 0.0515, ..., -0.2349, -0.1170, -0.0399]], device='cuda:0'), grad: tensor([[ 3.1665e-08, -5.5321e-07, 5.4017e-08, ..., 1.1176e-08, 2.2352e-08, 1.8626e-09], [ 7.6182e-07, 1.7509e-07, 9.4809e-07, ..., 1.5832e-07, 5.7928e-07, -5.5879e-08], [ 1.6578e-07, 7.6368e-08, 2.5332e-07, ..., 9.6858e-08, 1.3970e-07, 1.8626e-08], ..., [-2.3022e-06, -2.4028e-07, -2.6785e-06, ..., -3.6880e-07, -1.7378e-06, 3.5390e-08], [ 5.7742e-08, 2.1793e-07, 1.6019e-07, ..., 7.0781e-08, 4.0978e-08, 2.2352e-08], [ 1.0934e-06, 4.9546e-07, 1.3355e-06, ..., 2.1793e-07, 8.0466e-07, 8.3819e-08]], device='cuda:0') Epoch 266, bias, value: tensor([-0.0072, -0.0148, -0.0075, -0.0174, -0.0099, 0.0047, 0.0076, 0.0168, 0.0126, -0.0088], device='cuda:0'), grad: tensor([-1.0394e-06, 1.7900e-06, 6.1095e-07, -9.3132e-07, -8.8289e-07, 6.5193e-07, 1.9185e-07, -4.7423e-06, 7.9535e-07, 3.5372e-06], device='cuda:0') 100 0.0001 changing lr epoch 265, time 214.71, cls_loss 0.0017 cls_loss_mapping 0.0029 cls_loss_causal 0.4964 re_mapping 0.0052 re_causal 0.0148 /// teacc 98.91 lr 0.00010000 Epoch 267, weight, value: tensor([[-0.2531, 0.0860, -0.1278, ..., -0.0385, -0.1997, -0.1537], [ 0.0143, 0.1019, -0.1126, ..., -0.1109, -0.0452, 0.1064], [ 0.0497, -0.1208, -0.1344, ..., 0.0007, -0.0534, -0.0863], ..., [ 0.0837, -0.0472, 0.1246, ..., 0.0640, 0.1746, -0.0106], [ 0.1129, -0.1957, -0.1281, ..., -0.2253, -0.0544, 0.1896], [-0.1667, 0.0835, 0.0494, ..., -0.2360, -0.1171, -0.0424]], device='cuda:0'), grad: tensor([[ 7.4506e-09, -4.4703e-08, 1.1176e-08, ..., 1.6764e-08, 1.1176e-08, 2.9802e-08], [ 5.6140e-06, -1.5087e-07, 5.1335e-06, ..., 1.3970e-07, 5.0850e-06, 2.4661e-06], [ 1.8626e-08, 2.9802e-08, 6.5193e-08, ..., -7.8231e-08, -6.8918e-08, 1.0990e-07], ..., [-5.7854e-06, 1.7509e-07, -5.3197e-06, ..., -9.3132e-08, -5.2750e-06, -2.8424e-06], [-2.2352e-08, 6.3330e-08, 4.2841e-08, ..., 1.3039e-08, 3.1665e-08, 9.3132e-08], [ 5.3830e-07, 3.6508e-07, 3.1665e-08, ..., 1.1176e-08, 1.2107e-07, 1.4268e-06]], device='cuda:0') Epoch 267, bias, value: tensor([-0.0089, -0.0151, -0.0076, -0.0179, -0.0098, 0.0047, 0.0076, 0.0164, 0.0173, -0.0109], device='cuda:0'), grad: tensor([-2.9802e-08, 1.3232e-05, 9.3132e-08, -9.3132e-09, -4.1947e-06, 4.5896e-06, -9.5963e-06, -9.1121e-06, 1.4305e-06, 3.5465e-06], device='cuda:0') 100 0.0001 changing lr epoch 266, time 214.96, cls_loss 0.0015 cls_loss_mapping 0.0025 cls_loss_causal 0.5036 re_mapping 0.0051 re_causal 0.0143 /// teacc 98.97 lr 0.00010000 Epoch 268, weight, value: tensor([[-0.2537, 0.0860, -0.1284, ..., -0.0402, -0.2009, -0.1540], [ 0.0145, 0.1013, -0.1121, ..., -0.1101, -0.0456, 0.1070], [ 0.0527, -0.1194, -0.1341, ..., 0.0037, -0.0507, -0.0864], ..., [ 0.0821, -0.0474, 0.1241, ..., 0.0609, 0.1732, -0.0111], [ 0.1129, -0.1959, -0.1281, ..., -0.2260, -0.0546, 0.1898], [-0.1667, 0.0833, 0.0495, ..., -0.2368, -0.1177, -0.0426]], device='cuda:0'), grad: tensor([[ 5.5879e-08, 6.3330e-08, 1.4901e-08, ..., 6.8918e-08, 3.3528e-08, 1.0617e-07], [ 8.1956e-08, -3.4831e-07, 1.9744e-07, ..., 2.4028e-07, 2.0675e-07, -7.4692e-07], [-6.8918e-07, 5.7742e-08, 1.6764e-07, ..., -7.3947e-07, -6.0163e-07, 1.2666e-07], ..., [ 2.6636e-07, 6.8918e-08, -3.7067e-07, ..., 3.4831e-07, 1.8254e-07, 1.4901e-07], [ 5.4017e-08, 1.6391e-07, 2.2352e-08, ..., 8.3819e-08, 6.1467e-08, 9.4995e-08], [ 1.0058e-07, 4.6194e-07, 1.2107e-07, ..., 9.8720e-08, 5.0291e-08, 7.8976e-07]], device='cuda:0') Epoch 268, bias, value: tensor([-0.0092, -0.0149, -0.0045, -0.0180, -0.0094, 0.0047, 0.0076, 0.0147, 0.0173, -0.0111], device='cuda:0'), grad: tensor([ 5.3085e-07, -8.7544e-07, -1.7248e-06, -8.2888e-07, -2.4047e-06, 9.8906e-07, -1.0058e-07, 1.1623e-06, 6.7428e-07, 2.5705e-06], device='cuda:0') 100 0.0001 changing lr epoch 267, time 214.90, cls_loss 0.0012 cls_loss_mapping 0.0019 cls_loss_causal 0.4941 re_mapping 0.0052 re_causal 0.0149 /// teacc 98.99 lr 0.00010000 Epoch 269, weight, value: tensor([[-0.2542, 0.0861, -0.1289, ..., -0.0405, -0.2015, -0.1550], [ 0.0145, 0.1012, -0.1119, ..., -0.1100, -0.0459, 0.1070], [ 0.0526, -0.1195, -0.1344, ..., 0.0036, -0.0508, -0.0865], ..., [ 0.0823, -0.0473, 0.1240, ..., 0.0610, 0.1738, -0.0111], [ 0.1129, -0.1960, -0.1282, ..., -0.2270, -0.0548, 0.1899], [-0.1668, 0.0832, 0.0495, ..., -0.2379, -0.1187, -0.0426]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 1.8626e-07, -1.8626e-09, 1.7136e-07, ..., 7.6368e-08, 1.7136e-07, -1.6764e-08], [ 5.2154e-08, 0.0000e+00, 4.0978e-08, ..., 3.5390e-08, 5.0291e-08, 1.8626e-09], ..., [-4.5635e-07, 0.0000e+00, -4.1164e-07, ..., -2.0303e-07, -4.2282e-07, 5.5879e-09], [ 4.4703e-08, 1.8626e-09, 4.8429e-08, ..., 4.6566e-08, 5.9605e-08, -1.4901e-08], [ 1.2852e-07, 1.8626e-09, 1.2666e-07, ..., 2.2352e-08, 1.0990e-07, 5.5879e-09]], device='cuda:0') Epoch 269, bias, value: tensor([-0.0097, -0.0147, -0.0047, -0.0161, -0.0091, 0.0033, 0.0084, 0.0146, 0.0172, -0.0112], device='cuda:0'), grad: tensor([ 9.3132e-09, 2.9616e-07, 9.1270e-08, 1.0617e-07, 1.6764e-08, -1.3039e-08, -2.3097e-07, -7.9721e-07, 2.8685e-07, 2.3842e-07], device='cuda:0') 100 0.0001 changing lr epoch 268, time 215.00, cls_loss 0.0010 cls_loss_mapping 0.0016 cls_loss_causal 0.5021 re_mapping 0.0050 re_causal 0.0148 /// teacc 98.97 lr 0.00010000 Epoch 270, weight, value: tensor([[-0.2546, 0.0867, -0.1291, ..., -0.0408, -0.2024, -0.1549], [ 0.0144, 0.1009, -0.1120, ..., -0.1101, -0.0461, 0.1071], [ 0.0527, -0.1195, -0.1347, ..., 0.0038, -0.0507, -0.0868], ..., [ 0.0823, -0.0474, 0.1242, ..., 0.0610, 0.1740, -0.0111], [ 0.1129, -0.1960, -0.1282, ..., -0.2277, -0.0549, 0.1900], [-0.1668, 0.0831, 0.0495, ..., -0.2389, -0.1192, -0.0427]], device='cuda:0'), grad: tensor([[ 5.5879e-09, -8.4005e-07, 1.8626e-09, ..., -3.5390e-08, 0.0000e+00, 7.4506e-09], [ 6.3330e-08, -6.1467e-08, 3.3528e-08, ..., 3.3528e-08, 3.1665e-08, -3.5577e-07], [ 2.8685e-07, 1.0803e-07, 7.4506e-09, ..., -4.6566e-08, 1.2480e-07, 4.4703e-08], ..., [-7.0594e-07, 3.9116e-08, -5.5879e-08, ..., -1.7136e-07, -2.5518e-07, 5.5879e-08], [-5.4017e-08, 3.7253e-08, 5.5879e-09, ..., 2.0489e-08, 3.7253e-09, -6.3330e-08], [ 1.1176e-08, 5.4203e-07, -2.2352e-08, ..., 3.3528e-08, 7.4506e-09, 1.1176e-08]], device='cuda:0') Epoch 270, bias, value: tensor([-0.0093, -0.0148, -0.0046, -0.0162, -0.0089, 0.0032, 0.0083, 0.0146, 0.0172, -0.0113], device='cuda:0'), grad: tensor([-1.8571e-06, -6.2399e-07, 5.7928e-07, 3.3341e-07, 5.5879e-07, 2.1234e-07, 6.9290e-07, -1.0990e-06, -1.0245e-07, 1.2610e-06], device='cuda:0') 100 0.0001 changing lr epoch 269, time 214.94, cls_loss 0.0011 cls_loss_mapping 0.0023 cls_loss_causal 0.5123 re_mapping 0.0049 re_causal 0.0148 /// teacc 98.82 lr 0.00010000 Epoch 271, weight, value: tensor([[-0.2546, 0.0865, -0.1294, ..., -0.0410, -0.2032, -0.1550], [ 0.0143, 0.1011, -0.1122, ..., -0.1103, -0.0463, 0.1073], [ 0.0525, -0.1197, -0.1355, ..., 0.0037, -0.0510, -0.0872], ..., [ 0.0825, -0.0475, 0.1245, ..., 0.0613, 0.1744, -0.0113], [ 0.1130, -0.1962, -0.1282, ..., -0.2278, -0.0548, 0.1901], [-0.1669, 0.0830, 0.0495, ..., -0.2396, -0.1196, -0.0428]], device='cuda:0'), grad: tensor([[ 9.3132e-09, -3.5577e-07, 1.1176e-08, ..., 5.5879e-09, 5.5879e-09, -7.0781e-08], [-7.2084e-07, 4.4703e-08, -6.7055e-08, ..., 2.6077e-08, -9.3132e-08, -1.1455e-06], [ 3.1665e-08, 1.6764e-08, 1.1362e-07, ..., -6.5193e-08, 1.8626e-08, 1.4901e-08], ..., [ 5.3458e-07, 1.8626e-08, -1.6391e-07, ..., -2.4214e-08, -1.8626e-08, 1.1306e-06], [ 3.7253e-09, 2.5518e-07, 7.6368e-08, ..., 7.4506e-09, 9.3132e-09, 1.8626e-08], [ 5.5879e-09, 3.7253e-08, -1.2666e-07, ..., 0.0000e+00, 1.8626e-09, 2.2352e-08]], device='cuda:0') Epoch 271, bias, value: tensor([-0.0095, -0.0147, -0.0045, -0.0162, -0.0089, 0.0032, 0.0080, 0.0147, 0.0173, -0.0115], device='cuda:0'), grad: tensor([-8.0280e-07, -3.1032e-06, -2.6077e-08, 3.0175e-07, -1.5646e-07, -7.6368e-08, 2.0117e-07, 2.9989e-06, 7.1153e-07, -5.2154e-08], device='cuda:0') 100 0.0001 changing lr epoch 270, time 214.94, cls_loss 0.0010 cls_loss_mapping 0.0020 cls_loss_causal 0.4814 re_mapping 0.0051 re_causal 0.0149 /// teacc 98.90 lr 0.00010000 Epoch 272, weight, value: tensor([[-0.2558, 0.0839, -0.1325, ..., -0.0412, -0.2042, -0.1563], [ 0.0140, 0.1011, -0.1124, ..., -0.1105, -0.0468, 0.1073], [ 0.0525, -0.1203, -0.1361, ..., 0.0036, -0.0511, -0.0872], ..., [ 0.0829, -0.0474, 0.1249, ..., 0.0614, 0.1750, -0.0111], [ 0.1130, -0.1964, -0.1283, ..., -0.2287, -0.0551, 0.1901], [-0.1669, 0.0842, 0.0497, ..., -0.2402, -0.1201, -0.0429]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 7.0781e-08, 1.0803e-07, ..., 7.6368e-08, 5.5879e-09, 1.8626e-09], [-1.5274e-07, -2.4028e-07, 1.2293e-07, ..., 4.8429e-08, 8.0094e-08, -1.2759e-06], [-1.1940e-06, 1.4342e-07, 2.1048e-07, ..., -3.7812e-07, -7.9162e-07, 2.0489e-08], ..., [ 9.5181e-07, 4.2841e-08, -8.5682e-08, ..., 4.3400e-07, 6.0163e-07, 7.4506e-08], [ 3.1292e-07, 2.7195e-07, 1.0226e-06, ..., 3.9116e-08, 4.2841e-08, 1.1288e-06], [ 7.4506e-09, -2.5891e-07, -1.3076e-06, ..., 1.4901e-08, 2.9802e-08, 1.4901e-08]], device='cuda:0') Epoch 272, bias, value: tensor([-0.0125, -0.0149, -0.0046, -0.0162, -0.0090, 0.0031, 0.0082, 0.0151, 0.0171, -0.0109], device='cuda:0'), grad: tensor([ 3.7998e-07, -1.7695e-06, -1.5758e-06, -1.4994e-06, 6.5193e-07, 3.4459e-07, 6.1467e-08, 2.1327e-06, 4.3549e-06, -3.0864e-06], device='cuda:0') 100 0.0001 changing lr epoch 271, time 214.88, cls_loss 0.0012 cls_loss_mapping 0.0019 cls_loss_causal 0.4997 re_mapping 0.0049 re_causal 0.0144 /// teacc 98.97 lr 0.00010000 Epoch 273, weight, value: tensor([[-0.2577, 0.0841, -0.1330, ..., -0.0415, -0.2063, -0.1579], [ 0.0135, 0.1000, -0.1127, ..., -0.1108, -0.0475, 0.1068], [ 0.0525, -0.1205, -0.1368, ..., 0.0037, -0.0510, -0.0874], ..., [ 0.0833, -0.0474, 0.1253, ..., 0.0615, 0.1755, -0.0110], [ 0.1132, -0.1966, -0.1282, ..., -0.2296, -0.0546, 0.1904], [-0.1670, 0.0843, 0.0497, ..., -0.2410, -0.1210, -0.0430]], device='cuda:0'), grad: tensor([[ 3.7253e-09, -1.0617e-07, 2.6077e-08, ..., 1.3039e-08, 0.0000e+00, 1.8626e-09], [ 5.0291e-08, -7.6182e-07, 1.8813e-07, ..., 1.1176e-07, 4.0978e-08, -1.8626e-08], [-7.2643e-08, 8.3819e-08, 1.5460e-07, ..., 2.7940e-08, -1.7323e-07, 3.7253e-09], ..., [ 9.1270e-08, 2.1979e-07, 1.9930e-07, ..., 1.5274e-07, 1.2666e-07, 1.3039e-08], [ 6.3330e-08, 7.7486e-07, 5.5693e-07, ..., 2.9244e-07, 1.8626e-09, -2.2352e-08], [ 1.1176e-08, 4.0606e-07, -2.9802e-08, ..., 9.3132e-09, 1.8626e-09, 1.3039e-08]], device='cuda:0') Epoch 273, bias, value: tensor([-0.0126, -0.0157, -0.0045, -0.0164, -0.0087, 0.0031, 0.0098, 0.0153, 0.0172, -0.0110], device='cuda:0'), grad: tensor([-2.4959e-07, -1.6894e-06, 3.8929e-07, 1.6298e-06, 1.5274e-07, -1.3143e-05, 4.2841e-06, 1.3895e-06, 5.9158e-06, 1.3169e-06], device='cuda:0') 100 0.0001 changing lr epoch 272, time 214.84, cls_loss 0.0012 cls_loss_mapping 0.0020 cls_loss_causal 0.4672 re_mapping 0.0056 re_causal 0.0147 /// teacc 98.90 lr 0.00010000 Epoch 274, weight, value: tensor([[-0.2585, 0.0841, -0.1332, ..., -0.0417, -0.2074, -0.1579], [ 0.0131, 0.1000, -0.1129, ..., -0.1110, -0.0481, 0.1067], [ 0.0523, -0.1206, -0.1369, ..., 0.0041, -0.0513, -0.0875], ..., [ 0.0840, -0.0481, 0.1256, ..., 0.0618, 0.1764, -0.0109], [ 0.1132, -0.1967, -0.1283, ..., -0.2309, -0.0548, 0.1906], [-0.1669, 0.0846, 0.0500, ..., -0.2416, -0.1214, -0.0430]], device='cuda:0'), grad: tensor([[ 4.9360e-08, 2.1420e-08, 1.2107e-08, ..., 1.5832e-08, 4.6566e-09, 7.5437e-08], [-2.6505e-06, -2.4457e-06, 7.5437e-08, ..., 1.3970e-08, 5.4017e-08, -1.1802e-05], [ 7.0222e-07, 3.5483e-07, 6.4261e-08, ..., -3.8277e-07, 3.1665e-08, 6.6273e-06], ..., [-8.0466e-07, -1.6857e-07, -4.5635e-06, ..., -7.4506e-09, -2.0303e-06, 1.4910e-06], [ 1.6019e-07, 2.9057e-07, 1.6857e-07, ..., 6.0536e-08, 8.3819e-09, 4.1910e-07], [ 9.5926e-08, -8.9407e-08, -7.5437e-08, ..., 2.7940e-09, 4.6566e-08, 1.5367e-07]], device='cuda:0') Epoch 274, bias, value: tensor([-0.0125, -0.0160, -0.0044, -0.0181, -0.0089, 0.0032, 0.0099, 0.0156, 0.0172, -0.0108], device='cuda:0'), grad: tensor([ 3.6508e-07, -3.3110e-05, 1.5631e-05, 1.6373e-06, 3.0339e-05, 1.5097e-06, 1.5423e-05, -3.4541e-05, 2.1011e-06, 5.9232e-07], device='cuda:0') 100 0.0001 changing lr epoch 273, time 214.73, cls_loss 0.0012 cls_loss_mapping 0.0023 cls_loss_causal 0.5334 re_mapping 0.0056 re_causal 0.0157 /// teacc 99.00 lr 0.00010000 Epoch 275, weight, value: tensor([[-0.2595, 0.0841, -0.1333, ..., -0.0420, -0.2093, -0.1583], [ 0.0122, 0.0995, -0.1133, ..., -0.1114, -0.0494, 0.1065], [ 0.0521, -0.1207, -0.1376, ..., 0.0041, -0.0514, -0.0884], ..., [ 0.0852, -0.0469, 0.1262, ..., 0.0621, 0.1777, -0.0100], [ 0.1130, -0.1970, -0.1284, ..., -0.2328, -0.0560, 0.1904], [-0.1671, 0.0846, 0.0499, ..., -0.2436, -0.1226, -0.0431]], device='cuda:0'), grad: tensor([[ 3.2596e-08, -5.2154e-08, 3.7253e-08, ..., 2.5146e-08, 1.8626e-08, 9.3132e-10], [ 1.5739e-07, 8.3819e-09, 9.4995e-08, ..., 6.4261e-08, 8.8476e-08, -3.5390e-08], [ 1.4342e-07, 2.9802e-08, 9.8720e-08, ..., 2.1420e-08, 5.6811e-08, 5.5879e-09], ..., [-6.6962e-07, 1.5832e-08, -3.2783e-07, ..., -1.7881e-07, -3.5763e-07, 1.4901e-08], [ 3.2596e-07, 5.2154e-08, 2.2352e-07, ..., 1.7975e-07, 1.6391e-07, 1.3039e-08], [ 3.7253e-09, 5.5879e-09, -1.3970e-08, ..., 1.2107e-08, 1.8626e-09, 1.7695e-08]], device='cuda:0') Epoch 275, bias, value: tensor([-0.0126, -0.0163, -0.0046, -0.0183, -0.0089, 0.0029, 0.0101, 0.0165, 0.0171, -0.0109], device='cuda:0'), grad: tensor([-1.0245e-07, 1.3039e-06, -4.2282e-06, -1.1930e-06, 2.8405e-07, 8.5495e-07, 2.8722e-06, -6.2399e-07, 7.2084e-07, 1.2759e-07], device='cuda:0') 100 0.0001 changing lr epoch 274, time 214.95, cls_loss 0.0011 cls_loss_mapping 0.0017 cls_loss_causal 0.4583 re_mapping 0.0052 re_causal 0.0144 /// teacc 98.92 lr 0.00010000 Epoch 276, weight, value: tensor([[-0.2603, 0.0838, -0.1335, ..., -0.0422, -0.2100, -0.1589], [ 0.0121, 0.0991, -0.1134, ..., -0.1115, -0.0495, 0.1068], [ 0.0521, -0.1209, -0.1381, ..., 0.0041, -0.0515, -0.0892], ..., [ 0.0853, -0.0470, 0.1263, ..., 0.0622, 0.1779, -0.0101], [ 0.1131, -0.1973, -0.1285, ..., -0.2334, -0.0561, 0.1905], [-0.1671, 0.0847, 0.0500, ..., -0.2442, -0.1230, -0.0431]], device='cuda:0'), grad: tensor([[ 5.5879e-09, -4.1910e-08, 2.7940e-09, ..., 9.3132e-10, 3.7253e-09, 9.3132e-10], [ 5.4389e-07, 3.1758e-07, 5.3365e-07, ..., 9.3132e-09, 4.1258e-07, -3.9767e-07], [-2.4840e-05, 7.4506e-09, -4.1686e-06, ..., -5.4501e-06, -1.4797e-05, 4.6566e-09], ..., [ 2.3857e-05, -5.9512e-07, 3.1609e-06, ..., 5.4426e-06, 1.4037e-05, 2.0489e-08], [ 7.4506e-09, 1.7695e-08, 5.5879e-09, ..., 1.8626e-09, 5.5879e-09, 9.7789e-08], [ 4.5635e-07, 5.1409e-07, 4.6752e-07, ..., 2.7940e-09, 3.5111e-07, 1.1176e-08]], device='cuda:0') Epoch 276, bias, value: tensor([-0.0130, -0.0163, -0.0047, -0.0185, -0.0087, 0.0033, 0.0100, 0.0165, 0.0170, -0.0109], device='cuda:0'), grad: tensor([-6.9849e-08, 5.6811e-07, -2.9609e-05, 1.9372e-07, -9.8720e-08, -9.5926e-07, 4.5355e-07, 2.7552e-05, 2.1234e-07, 1.7537e-06], device='cuda:0') 100 0.0001 changing lr epoch 275, time 215.19, cls_loss 0.0012 cls_loss_mapping 0.0021 cls_loss_causal 0.4908 re_mapping 0.0052 re_causal 0.0149 /// teacc 98.97 lr 0.00010000 Epoch 277, weight, value: tensor([[-0.2610, 0.0834, -0.1337, ..., -0.0423, -0.2104, -0.1600], [ 0.0112, 0.0991, -0.1140, ..., -0.1116, -0.0511, 0.1063], [ 0.0522, -0.1210, -0.1385, ..., 0.0041, -0.0515, -0.0893], ..., [ 0.0860, -0.0471, 0.1267, ..., 0.0621, 0.1791, -0.0096], [ 0.1135, -0.1975, -0.1285, ..., -0.2342, -0.0560, 0.1920], [-0.1670, 0.0845, 0.0502, ..., -0.2447, -0.1222, -0.0433]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -1.3970e-08, 2.4214e-08, ..., 9.3132e-09, 1.8626e-09, -0.0000e+00], [ 3.7253e-08, 1.4435e-07, 3.6880e-07, ..., 7.8231e-08, 3.4459e-08, 2.7940e-09], [-7.4506e-09, 8.7544e-08, 2.8778e-07, ..., 1.8626e-07, -4.6566e-09, 1.8626e-09], ..., [-5.3085e-08, 1.6484e-07, 4.4052e-07, ..., 7.8231e-08, -5.4017e-08, 0.0000e+00], [-7.4506e-09, 1.2293e-07, 3.7625e-07, ..., 7.5437e-08, 9.3132e-09, -2.1420e-08], [ 1.0245e-08, -2.1476e-06, -3.0641e-06, ..., 7.4506e-08, 5.5879e-09, 6.5193e-09]], device='cuda:0') Epoch 277, bias, value: tensor([-0.0133, -0.0169, -0.0048, -0.0181, -0.0085, 0.0031, 0.0090, 0.0169, 0.0176, -0.0109], device='cuda:0'), grad: tensor([ 2.4214e-08, 9.2015e-07, 3.8277e-07, -1.8328e-06, 5.8953e-07, 7.6517e-06, -2.3283e-08, 9.7230e-07, 8.3540e-07, -9.5516e-06], device='cuda:0') 100 0.0001 changing lr epoch 276, time 215.09, cls_loss 0.0011 cls_loss_mapping 0.0019 cls_loss_causal 0.4571 re_mapping 0.0053 re_causal 0.0140 /// teacc 98.90 lr 0.00010000 Epoch 278, weight, value: tensor([[-0.2621, 0.0835, -0.1341, ..., -0.0424, -0.2117, -0.1604], [ 0.0104, 0.0990, -0.1147, ..., -0.1117, -0.0523, 0.1060], [ 0.0522, -0.1211, -0.1392, ..., 0.0040, -0.0517, -0.0894], ..., [ 0.0869, -0.0472, 0.1277, ..., 0.0623, 0.1805, -0.0093], [ 0.1135, -0.1977, -0.1286, ..., -0.2360, -0.0563, 0.1921], [-0.1673, 0.0845, 0.0500, ..., -0.2462, -0.1235, -0.0434]], device='cuda:0'), grad: tensor([[ 3.7067e-07, 7.4506e-09, 5.8673e-08, ..., 2.9709e-07, 1.8161e-07, 1.6764e-08], [ 2.5705e-07, 3.0734e-08, 4.6380e-07, ..., 4.0792e-07, 8.3819e-08, -1.9558e-08], [-1.9595e-06, 2.6077e-08, 1.2293e-07, ..., -1.5739e-06, -1.0161e-06, 2.2352e-08], ..., [ 9.5889e-06, 5.0664e-07, 2.5123e-05, ..., 2.3052e-05, 3.2596e-07, 2.1793e-07], [ 2.0601e-06, 3.6880e-07, 4.2766e-06, ..., 4.1537e-06, 3.5856e-07, -1.1642e-07], [-1.9558e-08, -2.0508e-06, -1.5963e-06, ..., 1.0524e-07, 1.3039e-08, 5.6811e-08]], device='cuda:0') Epoch 278, bias, value: tensor([-0.0132, -0.0175, -0.0047, -0.0180, -0.0083, 0.0029, 0.0088, 0.0178, 0.0175, -0.0111], device='cuda:0'), grad: tensor([ 1.4696e-06, 1.0477e-06, -6.4857e-06, -4.7654e-05, 2.4457e-06, 1.3690e-06, 2.5891e-07, 4.7415e-05, 1.0625e-05, -1.0520e-05], device='cuda:0') 100 0.0001 changing lr epoch 277, time 215.03, cls_loss 0.0010 cls_loss_mapping 0.0020 cls_loss_causal 0.4985 re_mapping 0.0050 re_causal 0.0147 /// teacc 98.91 lr 0.00010000 Epoch 279, weight, value: tensor([[-0.2635, 0.0837, -0.1347, ..., -0.0428, -0.2139, -0.1610], [ 0.0105, 0.0990, -0.1146, ..., -0.1117, -0.0523, 0.1069], [ 0.0522, -0.1213, -0.1398, ..., 0.0041, -0.0516, -0.0914], ..., [ 0.0869, -0.0474, 0.1277, ..., 0.0622, 0.1805, -0.0096], [ 0.1135, -0.1979, -0.1286, ..., -0.2373, -0.0565, 0.1922], [-0.1673, 0.0844, 0.0501, ..., -0.2467, -0.1237, -0.0434]], device='cuda:0'), grad: tensor([[ 3.7253e-09, -2.3190e-07, 3.7253e-09, ..., -2.8871e-08, 9.3132e-10, 2.3283e-08], [-9.2201e-08, -8.1025e-08, 2.9802e-08, ..., 1.0245e-08, 2.4214e-08, -2.5611e-07], [ 5.5879e-09, 1.3970e-08, 6.5193e-09, ..., -1.8626e-09, -3.7253e-09, 1.7695e-08], ..., [ 2.1420e-08, 6.4261e-08, -5.5879e-08, ..., -1.6764e-08, -4.8429e-08, 1.7881e-07], [ 4.6566e-09, 6.5193e-08, 1.0245e-08, ..., 2.7940e-09, 2.7940e-09, 1.2880e-06], [ 4.2841e-08, 1.0338e-07, -2.7008e-08, ..., 2.1420e-08, 2.0489e-08, 4.1910e-08]], device='cuda:0') Epoch 279, bias, value: tensor([-0.0132, -0.0172, -0.0050, -0.0181, -0.0079, 0.0027, 0.0088, 0.0176, 0.0175, -0.0111], device='cuda:0'), grad: tensor([-2.6356e-07, -6.1467e-07, 6.6124e-08, 3.0082e-07, 1.5181e-07, 1.3644e-06, -1.2308e-05, 4.2189e-07, 1.0476e-05, 3.7905e-07], device='cuda:0') 100 0.0001 changing lr epoch 278, time 214.96, cls_loss 0.0012 cls_loss_mapping 0.0019 cls_loss_causal 0.4749 re_mapping 0.0049 re_causal 0.0135 /// teacc 99.09 lr 0.00010000 Epoch 280, weight, value: tensor([[-0.2646, 0.0839, -0.1349, ..., -0.0446, -0.2146, -0.1628], [ 0.0090, 0.0990, -0.1155, ..., -0.1118, -0.0542, 0.1064], [ 0.0521, -0.1214, -0.1403, ..., 0.0041, -0.0519, -0.0918], ..., [ 0.0883, -0.0475, 0.1286, ..., 0.0622, 0.1824, -0.0092], [ 0.1136, -0.1980, -0.1287, ..., -0.2381, -0.0566, 0.1926], [-0.1673, 0.0844, 0.0502, ..., -0.2473, -0.1240, -0.0435]], device='cuda:0'), grad: tensor([[ 3.4459e-08, -2.0210e-07, 9.3132e-10, ..., 2.7008e-08, 5.1223e-08, 2.7940e-09], [ 3.4459e-08, 9.6858e-08, 7.4506e-08, ..., 6.2399e-08, 4.0047e-08, -2.2072e-07], [-3.9116e-07, 3.5390e-08, 2.8871e-08, ..., -3.5670e-07, -6.0908e-07, 2.4214e-08], ..., [ 3.0920e-07, 3.4459e-08, 3.4459e-08, ..., 3.4552e-07, 4.6007e-07, 1.8440e-07], [-3.9116e-08, 3.5390e-08, 7.3574e-08, ..., 6.5193e-08, 3.0734e-08, -1.9558e-08], [ 5.7742e-08, 7.6462e-07, 4.9081e-07, ..., 9.4995e-08, 1.2107e-08, 2.0210e-07]], device='cuda:0') Epoch 280, bias, value: tensor([-0.0134, -0.0186, -0.0052, -0.0179, -0.0077, 0.0025, 0.0091, 0.0190, 0.0176, -0.0112], device='cuda:0'), grad: tensor([-2.8498e-07, 1.0990e-07, -1.3607e-06, -1.9707e-06, -2.0787e-06, 5.0850e-07, 6.3423e-07, 1.5935e-06, 2.4121e-07, 2.6096e-06], device='cuda:0') 100 0.0001 changing lr epoch 279, time 214.88, cls_loss 0.0010 cls_loss_mapping 0.0018 cls_loss_causal 0.5234 re_mapping 0.0045 re_causal 0.0139 /// teacc 98.89 lr 0.00010000 Epoch 281, weight, value: tensor([[-0.2652, 0.0843, -0.1350, ..., -0.0446, -0.2157, -0.1630], [ 0.0089, 0.0989, -0.1155, ..., -0.1119, -0.0542, 0.1066], [ 0.0521, -0.1216, -0.1410, ..., 0.0041, -0.0519, -0.0921], ..., [ 0.0884, -0.0477, 0.1287, ..., 0.0623, 0.1825, -0.0093], [ 0.1136, -0.1982, -0.1287, ..., -0.2402, -0.0568, 0.1927], [-0.1674, 0.0842, 0.0502, ..., -0.2483, -0.1245, -0.0436]], device='cuda:0'), grad: tensor([[ 9.3132e-09, -1.4715e-07, 9.3132e-10, ..., 4.6566e-09, 4.6566e-09, 0.0000e+00], [ 1.3970e-07, 2.1420e-08, 6.0536e-08, ..., 6.2399e-08, 7.5437e-08, -4.2841e-08], [-5.6997e-07, 1.4901e-08, 8.3819e-09, ..., -3.0454e-07, -3.0641e-07, 8.3819e-09], ..., [-3.3900e-07, 1.3970e-08, -3.4273e-07, ..., -1.1455e-07, -1.9465e-07, 2.5146e-08], [ 6.2212e-07, 4.1537e-07, 2.7474e-07, ..., 2.4680e-07, 2.9709e-07, -1.7695e-08], [ 1.1828e-07, 1.1194e-06, 5.0291e-08, ..., 1.1176e-08, 1.3039e-08, 7.4506e-09]], device='cuda:0') Epoch 281, bias, value: tensor([-0.0129, -0.0186, -0.0051, -0.0181, -0.0076, 0.0025, 0.0090, 0.0190, 0.0175, -0.0113], device='cuda:0'), grad: tensor([-2.4401e-07, 4.1071e-07, -2.4252e-06, 1.2279e-05, 2.2631e-07, -1.7405e-05, 6.2678e-07, -2.3097e-07, 3.0808e-06, 3.6638e-06], device='cuda:0') 100 0.0001 changing lr epoch 280, time 215.05, cls_loss 0.0014 cls_loss_mapping 0.0029 cls_loss_causal 0.4772 re_mapping 0.0049 re_causal 0.0141 /// teacc 98.94 lr 0.00010000 Epoch 282, weight, value: tensor([[-0.2666, 0.0845, -0.1351, ..., -0.0447, -0.2163, -0.1638], [ 0.0095, 0.0989, -0.1156, ..., -0.1120, -0.0538, 0.1077], [ 0.0520, -0.1217, -0.1417, ..., 0.0040, -0.0521, -0.0924], ..., [ 0.0878, -0.0479, 0.1283, ..., 0.0625, 0.1822, -0.0104], [ 0.1137, -0.1982, -0.1288, ..., -0.2410, -0.0575, 0.1934], [-0.1673, 0.0841, 0.0513, ..., -0.2496, -0.1231, -0.0437]], device='cuda:0'), grad: tensor([[ 3.7253e-09, -6.8508e-06, -1.0245e-08, ..., 1.8626e-09, 3.7253e-09, 9.3132e-09], [ 5.0291e-08, -6.3330e-08, 4.1910e-08, ..., 6.5193e-08, 1.6671e-07, -1.4808e-07], [-7.0781e-08, 2.8871e-08, -2.9802e-08, ..., -8.1956e-08, -2.2165e-07, -1.2945e-07], ..., [ 3.7253e-09, 9.2201e-08, -3.0734e-08, ..., -6.5193e-09, 2.7940e-09, 1.9930e-07], [-2.1420e-08, 4.7497e-08, 4.6566e-09, ..., 3.7253e-09, 8.3819e-09, -3.9116e-08], [ 1.2107e-08, 2.3097e-07, 9.3132e-09, ..., 5.5879e-09, 1.2107e-08, 5.1223e-08]], device='cuda:0') Epoch 282, bias, value: tensor([-0.0129, -0.0179, -0.0053, -0.0185, -0.0074, 0.0023, 0.0073, 0.0176, 0.0184, -0.0105], device='cuda:0'), grad: tensor([-1.5527e-05, -7.4506e-09, -1.2759e-06, 2.6356e-06, 5.4017e-08, 1.0483e-05, 1.8002e-06, 9.6392e-07, 1.7975e-07, 7.0222e-07], device='cuda:0') 100 0.0001 changing lr epoch 281, time 214.69, cls_loss 0.0013 cls_loss_mapping 0.0018 cls_loss_causal 0.5125 re_mapping 0.0049 re_causal 0.0141 /// teacc 98.98 lr 0.00010000 Epoch 283, weight, value: tensor([[-0.2689, 0.0844, -0.1355, ..., -0.0449, -0.2190, -0.1659], [ 0.0089, 0.0989, -0.1162, ..., -0.1129, -0.0550, 0.1078], [ 0.0518, -0.1219, -0.1429, ..., 0.0039, -0.0524, -0.0926], ..., [ 0.0888, -0.0478, 0.1295, ..., 0.0640, 0.1843, -0.0105], [ 0.1137, -0.1987, -0.1289, ..., -0.2438, -0.0587, 0.1936], [-0.1676, 0.0842, 0.0507, ..., -0.2585, -0.1279, -0.0439]], device='cuda:0'), grad: tensor([[ 2.7940e-09, -1.8626e-09, 1.0245e-08, ..., 5.5879e-09, 2.7940e-09, 1.8626e-09], [-1.1921e-07, 9.3132e-09, 6.5193e-08, ..., 4.3772e-08, 5.4948e-08, -7.1619e-07], [ 4.2096e-07, 1.5832e-08, 2.4494e-07, ..., 2.3749e-07, 3.5018e-07, 8.1025e-08], ..., [-5.8208e-07, 9.3132e-09, -3.8929e-07, ..., -3.5018e-07, -5.2713e-07, 7.0781e-08], [ 1.7695e-07, 4.6566e-08, 8.9407e-08, ..., 4.2841e-08, 5.5879e-08, 4.6287e-07], [ 7.1712e-08, -7.5903e-07, -2.8033e-07, ..., 4.1910e-08, 5.5879e-08, 2.4214e-08]], device='cuda:0') Epoch 283, bias, value: tensor([-0.0133, -0.0185, -0.0054, -0.0162, -0.0073, 0.0004, 0.0075, 0.0186, 0.0183, -0.0109], device='cuda:0'), grad: tensor([ 8.2888e-08, -1.5749e-06, 8.9314e-07, -4.7404e-07, 1.6000e-06, 1.0347e-06, 2.1979e-07, -7.8045e-07, 1.5115e-06, -2.5053e-06], device='cuda:0') 100 0.0001 changing lr epoch 282, time 214.74, cls_loss 0.0011 cls_loss_mapping 0.0018 cls_loss_causal 0.5120 re_mapping 0.0049 re_causal 0.0144 /// teacc 98.98 lr 0.00010000 Epoch 284, weight, value: tensor([[-0.2696, 0.0839, -0.1358, ..., -0.0451, -0.2201, -0.1664], [ 0.0089, 0.0982, -0.1164, ..., -0.1131, -0.0551, 0.1083], [ 0.0518, -0.1220, -0.1433, ..., 0.0039, -0.0524, -0.0929], ..., [ 0.0890, -0.0480, 0.1299, ..., 0.0644, 0.1847, -0.0109], [ 0.1136, -0.1989, -0.1290, ..., -0.2451, -0.0595, 0.1937], [-0.1678, 0.0849, 0.0507, ..., -0.2595, -0.1289, -0.0439]], device='cuda:0'), grad: tensor([[ 5.5879e-09, -3.7253e-09, 9.3132e-09, ..., 6.5193e-09, 4.6566e-09, 1.8626e-09], [ 1.0338e-07, -1.8626e-09, 1.4529e-07, ..., 9.5926e-08, 9.0338e-08, -4.0047e-08], [ 9.3132e-10, 2.7940e-09, 5.4017e-08, ..., -1.6764e-08, 4.5635e-08, 5.5879e-09], ..., [-6.7428e-07, 6.5193e-09, -9.7696e-07, ..., -4.9081e-07, -6.6962e-07, 1.9558e-08], [ 2.3656e-07, 1.3970e-08, 2.8498e-07, ..., 1.3411e-07, 1.8254e-07, 5.5879e-09], [ 4.0978e-08, 9.3225e-07, 3.3528e-08, ..., 4.0047e-08, 3.9116e-08, 7.8510e-07]], device='cuda:0') Epoch 284, bias, value: tensor([-0.0139, -0.0183, -0.0055, -0.0166, -0.0072, 0.0004, 0.0074, 0.0186, 0.0182, -0.0106], device='cuda:0'), grad: tensor([ 1.2293e-07, 2.7288e-07, 2.4214e-08, 6.0070e-07, -3.4682e-06, 1.9092e-07, -7.5810e-07, -1.4128e-06, 6.0536e-07, 3.8110e-06], device='cuda:0') 100 0.0001 changing lr epoch 283, time 214.94, cls_loss 0.0014 cls_loss_mapping 0.0024 cls_loss_causal 0.4898 re_mapping 0.0051 re_causal 0.0136 /// teacc 99.02 lr 0.00010000 Epoch 285, weight, value: tensor([[-0.2702, 0.0839, -0.1367, ..., -0.0458, -0.2222, -0.1670], [ 0.0088, 0.0979, -0.1166, ..., -0.1133, -0.0552, 0.1087], [ 0.0520, -0.1222, -0.1438, ..., 0.0040, -0.0523, -0.0931], ..., [ 0.0892, -0.0481, 0.1303, ..., 0.0645, 0.1851, -0.0110], [ 0.1136, -0.1998, -0.1292, ..., -0.2475, -0.0603, 0.1935], [-0.1679, 0.0852, 0.0508, ..., -0.2600, -0.1297, -0.0436]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -2.7940e-09, 2.7940e-09, ..., 9.3132e-10, 1.8626e-09, 1.8626e-09], [ 1.1362e-07, -3.2596e-08, 1.1176e-07, ..., 6.8918e-08, 1.4901e-07, -9.4157e-07], [-1.7695e-07, 1.5832e-08, 1.6578e-07, ..., 1.7229e-07, -3.3341e-07, 5.7835e-07], ..., [-1.3970e-07, 4.3586e-07, 1.8552e-06, ..., -5.5879e-09, 1.5460e-07, 1.8626e-08], [ 1.4808e-07, 5.5879e-09, 1.2480e-07, ..., 3.7253e-08, 1.9372e-07, 1.4901e-08], [ 3.1665e-08, -4.6566e-07, -2.3153e-06, ..., 5.5879e-09, -2.1048e-07, 4.0978e-08]], device='cuda:0') Epoch 285, bias, value: tensor([-0.0142, -0.0184, -0.0056, -0.0165, -0.0074, 0.0009, 0.0072, 0.0188, 0.0179, -0.0105], device='cuda:0'), grad: tensor([ 6.0536e-08, -7.2680e-06, 4.7013e-06, -5.4855e-07, 2.8405e-07, 3.1665e-08, 2.2762e-06, 2.5686e-06, 4.9639e-07, -2.6412e-06], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 284---------------------------------------------------- epoch 284, time 231.45, cls_loss 0.0010 cls_loss_mapping 0.0016 cls_loss_causal 0.4645 re_mapping 0.0048 re_causal 0.0137 /// teacc 99.17 lr 0.00010000 Epoch 286, weight, value: tensor([[-0.2708, 0.0840, -0.1369, ..., -0.0461, -0.2231, -0.1672], [ 0.0088, 0.0980, -0.1160, ..., -0.1134, -0.0553, 0.1088], [ 0.0521, -0.1222, -0.1442, ..., 0.0041, -0.0522, -0.0932], ..., [ 0.0893, -0.0483, 0.1300, ..., 0.0646, 0.1853, -0.0110], [ 0.1136, -0.2000, -0.1294, ..., -0.2492, -0.0611, 0.1935], [-0.1681, 0.0848, 0.0506, ..., -0.2604, -0.1303, -0.0439]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -4.0419e-07, 9.3132e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 4.6566e-09, 5.1223e-08, 6.5193e-09, ..., 4.6566e-09, 4.6566e-09, -7.4506e-09], [-5.8673e-08, 1.9558e-08, 4.6566e-09, ..., -4.1910e-08, -5.0291e-08, 2.7940e-09], ..., [ 1.4901e-08, 2.6077e-08, -7.4506e-09, ..., 1.1176e-08, 1.1176e-08, 5.5879e-09], [ 2.2352e-08, 8.1956e-08, 1.8626e-09, ..., 1.8626e-08, 2.2352e-08, -9.3132e-10], [ 3.7253e-09, -1.8626e-09, -7.9162e-08, ..., 1.8626e-09, 2.7940e-09, 3.2596e-08]], device='cuda:0') Epoch 286, bias, value: tensor([-0.0140, -0.0178, -0.0055, -0.0165, -0.0070, 0.0008, 0.0073, 0.0184, 0.0177, -0.0108], device='cuda:0'), grad: tensor([-1.0822e-06, 1.5087e-07, -2.1420e-07, 6.3330e-08, 1.1828e-07, 2.9430e-07, 1.3970e-07, 1.5274e-07, 3.3341e-07, 6.2399e-08], device='cuda:0') 100 0.0001 changing lr epoch 285, time 214.98, cls_loss 0.0009 cls_loss_mapping 0.0019 cls_loss_causal 0.4962 re_mapping 0.0049 re_causal 0.0145 /// teacc 99.04 lr 0.00010000 Epoch 287, weight, value: tensor([[-0.2714, 0.0844, -0.1373, ..., -0.0462, -0.2240, -0.1673], [ 0.0088, 0.0981, -0.1158, ..., -0.1134, -0.0553, 0.1090], [ 0.0528, -0.1223, -0.1441, ..., 0.0047, -0.0512, -0.0933], ..., [ 0.0891, -0.0485, 0.1299, ..., 0.0642, 0.1849, -0.0111], [ 0.1136, -0.2006, -0.1294, ..., -0.2500, -0.0613, 0.1936], [-0.1681, 0.0850, 0.0507, ..., -0.2606, -0.1306, -0.0439]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 3.7253e-09, -7.4506e-09, 1.9558e-08, ..., 1.9558e-08, 8.3819e-09, -3.2596e-08], [-1.8626e-08, 9.3132e-10, 7.6648e-07, ..., 1.1437e-06, -1.7695e-08, 3.7253e-09], ..., [ 6.5193e-09, 3.7253e-09, -2.1420e-08, ..., 3.7253e-09, 2.7940e-09, 7.3574e-08], [ 1.8626e-09, 1.8626e-09, 5.5879e-09, ..., 3.7253e-09, 9.3132e-10, 7.4506e-09], [ 2.7940e-09, -4.6566e-09, -1.8626e-09, ..., 1.8626e-09, 3.7253e-09, 9.3132e-09]], device='cuda:0') Epoch 287, bias, value: tensor([-0.0144, -0.0176, -0.0050, -0.0171, -0.0073, 0.0011, 0.0076, 0.0181, 0.0177, -0.0107], device='cuda:0'), grad: tensor([ 2.0489e-08, 0.0000e+00, 2.5053e-06, -2.6897e-06, -2.9430e-07, 1.2480e-07, -1.4901e-08, 2.7195e-07, 3.8184e-08, 3.1665e-08], device='cuda:0') 100 0.0001 changing lr epoch 286, time 215.06, cls_loss 0.0012 cls_loss_mapping 0.0025 cls_loss_causal 0.4806 re_mapping 0.0050 re_causal 0.0139 /// teacc 98.98 lr 0.00010000 Epoch 288, weight, value: tensor([[-0.2723, 0.0847, -0.1374, ..., -0.0462, -0.2245, -0.1676], [ 0.0088, 0.0980, -0.1159, ..., -0.1136, -0.0553, 0.1091], [ 0.0524, -0.1225, -0.1458, ..., 0.0043, -0.0516, -0.0935], ..., [ 0.0893, -0.0486, 0.1302, ..., 0.0647, 0.1852, -0.0111], [ 0.1138, -0.2008, -0.1294, ..., -0.2506, -0.0610, 0.1940], [-0.1682, 0.0843, 0.0508, ..., -0.2610, -0.1308, -0.0443]], device='cuda:0'), grad: tensor([[ 4.6566e-09, 1.1176e-08, 0.0000e+00, ..., 2.7940e-09, 9.3132e-10, 9.3132e-10], [ 4.6566e-09, 3.9078e-06, 1.8626e-09, ..., 9.3132e-10, 1.8626e-09, -1.2107e-08], [-9.3132e-09, 3.8184e-08, 8.3819e-09, ..., -2.1420e-08, -1.8626e-09, 4.6566e-09], ..., [-4.6566e-09, 1.8626e-08, -1.4901e-08, ..., 8.3819e-09, -1.0245e-08, 1.0245e-08], [ 1.3039e-08, 1.4696e-06, 1.8626e-09, ..., 2.7940e-09, 1.8626e-09, 3.7253e-09], [ 6.7055e-08, 2.1830e-06, -6.5193e-09, ..., 9.3132e-10, 1.8626e-09, 1.1362e-07]], device='cuda:0') Epoch 288, bias, value: tensor([-0.0147, -0.0176, -0.0055, -0.0170, -0.0072, 0.0013, 0.0076, 0.0182, 0.0178, -0.0109], device='cuda:0'), grad: tensor([ 1.1828e-07, 4.8697e-05, 4.9546e-07, 4.4525e-05, -3.6322e-08, -1.1933e-04, 1.3039e-07, 1.0617e-07, 1.7866e-05, 7.4692e-06], device='cuda:0') 100 0.0001 changing lr epoch 287, time 214.77, cls_loss 0.0011 cls_loss_mapping 0.0021 cls_loss_causal 0.4833 re_mapping 0.0049 re_causal 0.0142 /// teacc 99.03 lr 0.00010000 Epoch 289, weight, value: tensor([[-0.2750, 0.0857, -0.1379, ..., -0.0468, -0.2275, -0.1658], [ 0.0085, 0.0978, -0.1161, ..., -0.1136, -0.0559, 0.1094], [ 0.0525, -0.1227, -0.1466, ..., 0.0045, -0.0515, -0.0948], ..., [ 0.0896, -0.0488, 0.1305, ..., 0.0647, 0.1857, -0.0110], [ 0.1138, -0.2012, -0.1295, ..., -0.2515, -0.0612, 0.1941], [-0.1677, 0.0848, 0.0516, ..., -0.2612, -0.1309, -0.0445]], device='cuda:0'), grad: tensor([[ 4.6566e-09, 4.6566e-09, 6.5193e-09, ..., 5.5879e-09, 3.7253e-09, 0.0000e+00], [ 5.5879e-08, -8.3819e-09, 9.2201e-08, ..., 2.5146e-08, 9.0338e-08, -3.4459e-08], [-1.1176e-08, 3.7253e-09, -9.4064e-08, ..., -1.0245e-08, -7.4506e-08, 2.7940e-09], ..., [-9.4064e-08, 1.3970e-08, -7.5437e-08, ..., -2.3283e-08, -1.0058e-07, 2.4214e-08], [ 5.6811e-08, 4.5635e-08, 1.3039e-08, ..., 7.4506e-09, 1.0245e-08, 9.3132e-10], [ 3.7253e-08, -2.7940e-09, 4.9360e-08, ..., 1.0245e-08, 5.0291e-08, 1.8626e-09]], device='cuda:0') Epoch 289, bias, value: tensor([-0.0136, -0.0178, -0.0055, -0.0171, -0.0082, 0.0019, 0.0074, 0.0183, 0.0177, -0.0105], device='cuda:0'), grad: tensor([ 8.2888e-08, 3.1572e-07, -5.2806e-07, 5.5879e-09, 6.6124e-08, -2.2873e-06, 1.1595e-06, 4.4703e-08, 9.4902e-07, 1.9372e-07], device='cuda:0') 100 0.0001 changing lr epoch 288, time 215.26, cls_loss 0.0013 cls_loss_mapping 0.0014 cls_loss_causal 0.4624 re_mapping 0.0048 re_causal 0.0139 /// teacc 98.97 lr 0.00010000 Epoch 290, weight, value: tensor([[-0.2767, 0.0827, -0.1383, ..., -0.0471, -0.2295, -0.1660], [ 0.0083, 0.0979, -0.1162, ..., -0.1138, -0.0560, 0.1096], [ 0.0523, -0.1229, -0.1477, ..., 0.0044, -0.0518, -0.0950], ..., [ 0.0901, -0.0489, 0.1308, ..., 0.0650, 0.1862, -0.0110], [ 0.1138, -0.2015, -0.1296, ..., -0.2538, -0.0616, 0.1941], [-0.1677, 0.0877, 0.0516, ..., -0.2614, -0.1311, -0.0447]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -1.2387e-07, 2.7940e-09, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09], [ 8.3819e-09, 3.9116e-08, 5.5879e-08, ..., 3.7253e-09, 8.3819e-09, -8.5682e-08], [ 3.8184e-08, 1.8626e-09, 2.7008e-08, ..., 1.8626e-08, 4.2841e-08, 1.3039e-08], ..., [-4.2841e-08, 6.4261e-08, -2.7940e-08, ..., -2.5146e-08, -5.7742e-08, 9.1270e-08], [-1.9558e-08, 1.8626e-08, 1.1176e-08, ..., 9.3132e-10, 1.8626e-09, -2.4214e-08], [ 3.7253e-09, -1.5283e-06, -1.0794e-06, ..., 9.3132e-10, 3.7253e-09, 1.2107e-08]], device='cuda:0') Epoch 290, bias, value: tensor([-0.0165, -0.0178, -0.0057, -0.0171, -0.0086, 0.0021, 0.0071, 0.0188, 0.0176, -0.0080], device='cuda:0'), grad: tensor([-2.3004e-07, 9.1270e-08, 9.8720e-08, 1.7695e-08, 4.0792e-06, 9.2201e-08, -6.9942e-07, 2.0489e-07, 2.1420e-08, -3.6657e-06], device='cuda:0') 100 0.0001 changing lr epoch 289, time 215.28, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4918 re_mapping 0.0049 re_causal 0.0147 /// teacc 98.99 lr 0.00010000 Epoch 291, weight, value: tensor([[-0.2769, 0.0828, -0.1387, ..., -0.0473, -0.2300, -0.1661], [ 0.0083, 0.0979, -0.1162, ..., -0.1139, -0.0561, 0.1098], [ 0.0524, -0.1230, -0.1481, ..., 0.0044, -0.0518, -0.0950], ..., [ 0.0901, -0.0490, 0.1309, ..., 0.0650, 0.1863, -0.0111], [ 0.1138, -0.2018, -0.1296, ..., -0.2546, -0.0618, 0.1942], [-0.1677, 0.0875, 0.0516, ..., -0.2618, -0.1312, -0.0448]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 4.4703e-08, 1.1176e-08, ..., 1.8626e-09, 4.6566e-09, 1.8626e-09], [ 4.7497e-08, 7.5437e-08, 9.7789e-08, ..., 2.5146e-08, 6.8918e-08, -1.0524e-07], [ 2.9802e-08, 3.4459e-08, 4.7497e-08, ..., 1.3970e-08, 3.2596e-08, 6.0536e-08], ..., [-2.5146e-07, 4.0978e-08, -5.3085e-07, ..., -1.3597e-07, -3.8836e-07, 1.1455e-07], [ 8.3819e-09, 2.7213e-06, 2.7940e-08, ..., 4.6566e-09, 1.1176e-08, 3.7253e-08], [ 1.6671e-07, 2.6077e-08, 2.4121e-07, ..., 7.5437e-08, 2.4773e-07, 9.0338e-08]], device='cuda:0') Epoch 291, bias, value: tensor([-0.0165, -0.0178, -0.0057, -0.0171, -0.0084, 0.0015, 0.0079, 0.0188, 0.0176, -0.0082], device='cuda:0'), grad: tensor([ 2.0675e-07, 2.2445e-07, 2.0396e-07, 2.2501e-06, -2.3283e-07, -4.7833e-05, 3.3319e-05, -2.8312e-07, 1.1653e-05, 4.4610e-07], device='cuda:0') 100 0.0001 changing lr epoch 290, time 215.08, cls_loss 0.0012 cls_loss_mapping 0.0020 cls_loss_causal 0.5037 re_mapping 0.0049 re_causal 0.0141 /// teacc 98.97 lr 0.00010000 Epoch 292, weight, value: tensor([[-0.2767, 0.0829, -0.1396, ..., -0.0481, -0.2307, -0.1658], [ 0.0083, 0.0978, -0.1157, ..., -0.1137, -0.0561, 0.1103], [ 0.0527, -0.1231, -0.1483, ..., 0.0046, -0.0514, -0.0951], ..., [ 0.0901, -0.0493, 0.1305, ..., 0.0648, 0.1862, -0.0115], [ 0.1138, -0.2040, -0.1297, ..., -0.2565, -0.0621, 0.1940], [-0.1678, 0.0874, 0.0516, ..., -0.2622, -0.1314, -0.0451]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -3.5670e-07, 3.7253e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 5.5879e-09, 1.6764e-08, 2.0489e-08, ..., 1.3970e-08, 4.6566e-09, -1.3039e-08], [ 2.7940e-08, 1.0245e-08, 2.9802e-08, ..., 2.9802e-08, 3.2596e-08, 1.2107e-08], ..., [-3.3528e-08, 3.8184e-08, -4.6566e-09, ..., -1.3039e-08, -4.0978e-08, 2.8871e-08], [-5.0291e-08, 2.2352e-08, 1.6764e-08, ..., 4.6566e-09, 9.3132e-10, -1.1828e-07], [ 4.0047e-08, 1.9558e-07, -7.9162e-08, ..., 1.8626e-09, 9.3132e-10, 2.5611e-07]], device='cuda:0') Epoch 292, bias, value: tensor([-0.0164, -0.0172, -0.0056, -0.0174, -0.0082, 0.0021, 0.0082, 0.0182, 0.0173, -0.0083], device='cuda:0'), grad: tensor([-7.0222e-07, 9.0338e-08, 1.0524e-07, -6.2399e-08, -4.7870e-07, 4.1537e-07, -9.5926e-08, 1.1176e-07, -1.5367e-07, 7.7393e-07], device='cuda:0') 100 0.0001 changing lr epoch 291, time 215.22, cls_loss 0.0012 cls_loss_mapping 0.0014 cls_loss_causal 0.4478 re_mapping 0.0048 re_causal 0.0134 /// teacc 99.02 lr 0.00010000 Epoch 293, weight, value: tensor([[-0.2779, 0.0829, -0.1411, ..., -0.0484, -0.2320, -0.1669], [ 0.0082, 0.0976, -0.1158, ..., -0.1139, -0.0563, 0.1107], [ 0.0531, -0.1233, -0.1492, ..., 0.0051, -0.0510, -0.0956], ..., [ 0.0901, -0.0495, 0.1307, ..., 0.0645, 0.1864, -0.0116], [ 0.1138, -0.2044, -0.1298, ..., -0.2578, -0.0625, 0.1942], [-0.1678, 0.0880, 0.0523, ..., -0.2626, -0.1315, -0.0438]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -4.9360e-08, 9.3132e-10, ..., 2.7940e-09, 0.0000e+00, 9.3132e-10], [ 4.2841e-08, -6.5193e-09, 5.7742e-08, ..., 2.7940e-08, 3.5390e-08, -1.2107e-08], [ 9.3132e-09, 9.3132e-09, 2.8871e-08, ..., -2.3283e-08, 1.3970e-08, 6.5193e-09], ..., [-6.7987e-08, 8.3819e-09, -1.0431e-07, ..., -3.7253e-08, -6.8918e-08, 2.2352e-08], [-4.0047e-08, 1.1176e-08, 4.6566e-09, ..., 8.3819e-09, 2.7940e-09, -1.3504e-07], [ 1.4901e-08, 2.5146e-08, -1.4901e-08, ..., 1.0245e-08, 1.1176e-08, 2.7940e-08]], device='cuda:0') Epoch 293, bias, value: tensor([-0.0164, -0.0172, -0.0057, -0.0171, -0.0099, 0.0020, 0.0085, 0.0181, 0.0172, -0.0075], device='cuda:0'), grad: tensor([-4.1910e-08, 1.3132e-07, 4.6566e-09, 2.3283e-08, 2.0768e-07, 2.1514e-07, -2.5146e-06, -8.4750e-08, 1.9800e-06, 8.8476e-08], device='cuda:0') 100 0.0001 changing lr epoch 292, time 214.97, cls_loss 0.0011 cls_loss_mapping 0.0016 cls_loss_causal 0.4983 re_mapping 0.0048 re_causal 0.0142 /// teacc 99.02 lr 0.00010000 Epoch 294, weight, value: tensor([[-0.2811, 0.0829, -0.1424, ..., -0.0491, -0.2358, -0.1695], [ 0.0080, 0.0983, -0.1154, ..., -0.1138, -0.0570, 0.1112], [ 0.0532, -0.1235, -0.1500, ..., 0.0054, -0.0509, -0.0954], ..., [ 0.0906, -0.0495, 0.1312, ..., 0.0655, 0.1877, -0.0117], [ 0.1138, -0.2049, -0.1299, ..., -0.2602, -0.0637, 0.1941], [-0.1680, 0.0880, 0.0524, ..., -0.2632, -0.1320, -0.0437]], device='cuda:0'), grad: tensor([[ 3.4925e-08, 4.6566e-10, 6.5193e-08, ..., 5.1223e-09, 4.7497e-08, 4.6566e-10], [ 3.4273e-07, -3.3528e-08, 4.8848e-07, ..., 7.0781e-08, 3.5670e-07, 1.9092e-08], [ 1.5274e-07, 9.3132e-10, 2.8778e-07, ..., -2.9337e-08, 2.0955e-07, 7.9162e-09], ..., [-3.4943e-06, 1.7695e-08, -6.2138e-06, ..., -1.7462e-07, -4.5821e-06, -2.2119e-07], [ 1.9418e-07, 2.3749e-08, 2.8033e-07, ..., 5.9139e-08, 2.0536e-07, 8.9407e-08], [ 2.0247e-06, 1.0664e-07, 3.7551e-06, ..., 4.5635e-08, 2.7716e-06, 6.8452e-08]], device='cuda:0') Epoch 294, bias, value: tensor([-0.0165, -0.0168, -0.0052, -0.0191, -0.0101, 0.0019, 0.0085, 0.0185, 0.0171, -0.0075], device='cuda:0'), grad: tensor([ 1.4575e-07, 5.5134e-07, 3.0547e-07, 7.6089e-07, 1.5320e-07, 1.2163e-06, -1.9278e-07, -1.0133e-05, 6.7940e-07, 6.5342e-06], device='cuda:0') 100 0.0001 changing lr epoch 293, time 215.12, cls_loss 0.0011 cls_loss_mapping 0.0016 cls_loss_causal 0.4852 re_mapping 0.0050 re_causal 0.0142 /// teacc 99.02 lr 0.00010000 Epoch 295, weight, value: tensor([[-0.2817, 0.0830, -0.1428, ..., -0.0494, -0.2363, -0.1689], [ 0.0078, 0.0985, -0.1157, ..., -0.1141, -0.0574, 0.1113], [ 0.0532, -0.1238, -0.1507, ..., 0.0054, -0.0509, -0.0957], ..., [ 0.0910, -0.0497, 0.1318, ..., 0.0659, 0.1883, -0.0116], [ 0.1137, -0.2053, -0.1300, ..., -0.2627, -0.0643, 0.1941], [-0.1681, 0.0879, 0.0523, ..., -0.2643, -0.1333, -0.0438]], device='cuda:0'), grad: tensor([[ 3.2596e-09, -5.6811e-08, 3.7253e-09, ..., 1.8626e-09, 3.7253e-09, 1.3970e-09], [ 4.8894e-08, 4.9360e-08, 5.7742e-08, ..., 3.0734e-08, 4.7963e-08, 1.9092e-08], [ 1.0291e-07, 1.0245e-08, 1.5320e-07, ..., 7.3109e-08, 8.8010e-08, 6.0536e-09], ..., [-1.1753e-06, 4.1444e-08, -1.6969e-06, ..., -6.8964e-07, -1.2554e-06, 3.7719e-08], [ 4.7032e-08, 3.3062e-08, 8.4285e-08, ..., 4.4703e-08, 5.5414e-08, -2.7008e-08], [ 9.3970e-07, 8.2701e-06, 1.3616e-06, ..., 5.2527e-07, 1.0319e-06, 4.8727e-06]], device='cuda:0') Epoch 295, bias, value: tensor([-0.0165, -0.0169, -0.0051, -0.0190, -0.0099, 0.0018, 0.0085, 0.0188, 0.0169, -0.0077], device='cuda:0'), grad: tensor([-1.1735e-07, 3.1060e-07, 2.3469e-07, 8.8476e-08, -3.3677e-05, -4.7032e-08, 7.1246e-08, -2.4699e-06, 1.6391e-07, 3.5495e-05], device='cuda:0') 100 0.0001 changing lr epoch 294, time 215.31, cls_loss 0.0010 cls_loss_mapping 0.0015 cls_loss_causal 0.4753 re_mapping 0.0050 re_causal 0.0141 /// teacc 99.07 lr 0.00010000 Epoch 296, weight, value: tensor([[-0.2823, 0.0830, -0.1432, ..., -0.0495, -0.2365, -0.1692], [ 0.0078, 0.0991, -0.1157, ..., -0.1142, -0.0574, 0.1125], [ 0.0538, -0.1241, -0.1514, ..., 0.0058, -0.0507, -0.0962], ..., [ 0.0910, -0.0501, 0.1323, ..., 0.0661, 0.1887, -0.0120], [ 0.1137, -0.2057, -0.1301, ..., -0.2639, -0.0646, 0.1938], [-0.1682, 0.0876, 0.0521, ..., -0.2649, -0.1341, -0.0440]], device='cuda:0'), grad: tensor([[ 9.3132e-09, -4.0652e-07, 5.1223e-09, ..., 4.6566e-10, 4.6566e-10, 3.5856e-08], [ 1.8086e-06, 2.8405e-08, 3.7253e-09, ..., 8.8476e-09, 9.7789e-09, 6.8843e-06], [-3.7253e-09, 9.3132e-09, 7.4506e-09, ..., -1.3504e-08, -2.7474e-08, 1.5879e-07], ..., [ 9.4064e-08, 8.9407e-08, 2.5611e-08, ..., 1.1176e-08, 1.4435e-08, 3.9022e-07], [-2.0601e-06, 1.2992e-07, 1.3970e-08, ..., 1.8626e-09, 9.3132e-10, -7.8529e-06], [ 2.0955e-08, 1.8686e-05, -8.4750e-08, ..., 4.6566e-10, 4.6566e-10, 1.6466e-05]], device='cuda:0') Epoch 296, bias, value: tensor([-0.0165, -0.0162, -0.0050, -0.0197, -0.0090, 0.0018, 0.0084, 0.0186, 0.0167, -0.0081], device='cuda:0'), grad: tensor([-7.6601e-07, 8.2403e-06, 1.3504e-07, 6.7055e-08, -6.8903e-05, -1.8161e-07, 5.8860e-07, 8.0653e-07, -8.8662e-06, 6.8903e-05], device='cuda:0') 100 0.0001 changing lr epoch 295, time 214.83, cls_loss 0.0012 cls_loss_mapping 0.0021 cls_loss_causal 0.5254 re_mapping 0.0050 re_causal 0.0146 /// teacc 99.02 lr 0.00010000 Epoch 297, weight, value: tensor([[-0.2825, 0.0830, -0.1443, ..., -0.0500, -0.2395, -0.1692], [ 0.0077, 0.0990, -0.1158, ..., -0.1145, -0.0576, 0.1126], [ 0.0546, -0.1245, -0.1519, ..., 0.0063, -0.0504, -0.0964], ..., [ 0.0909, -0.0495, 0.1328, ..., 0.0660, 0.1891, -0.0121], [ 0.1138, -0.2061, -0.1302, ..., -0.2634, -0.0650, 0.1942], [-0.1685, 0.0874, 0.0518, ..., -0.2661, -0.1353, -0.0443]], device='cuda:0'), grad: tensor([[ 1.9558e-08, 1.1176e-08, 3.5390e-08, ..., 1.1176e-08, 3.4459e-08, 0.0000e+00], [ 1.3597e-07, -5.1223e-08, 2.9150e-07, ..., 9.3132e-08, 2.7474e-07, -2.4773e-07], [-1.4901e-08, 9.3132e-09, 2.6077e-08, ..., -9.3132e-10, -1.9558e-08, 1.8626e-09], ..., [-2.4959e-07, -1.3690e-07, -5.0012e-07, ..., -1.5646e-07, -4.4797e-07, 6.6124e-08], [ 1.3039e-08, 2.2352e-08, 1.1176e-08, ..., 8.3819e-09, 2.5146e-08, 1.2107e-08], [ 6.7987e-08, 1.8440e-07, -5.3085e-08, ..., 3.0734e-08, 8.6613e-08, 1.8254e-07]], device='cuda:0') Epoch 297, bias, value: tensor([-0.0165, -0.0162, -0.0047, -0.0199, -0.0087, 0.0016, 0.0086, 0.0189, 0.0167, -0.0085], device='cuda:0'), grad: tensor([ 9.4995e-08, 1.7043e-07, -7.3574e-08, 2.2352e-08, 1.5739e-07, -5.3085e-07, 1.0617e-07, -1.0049e-06, 1.3411e-07, 9.2853e-07], device='cuda:0') 100 0.0001 changing lr epoch 296, time 214.87, cls_loss 0.0013 cls_loss_mapping 0.0022 cls_loss_causal 0.4706 re_mapping 0.0051 re_causal 0.0138 /// teacc 98.97 lr 0.00010000 Epoch 298, weight, value: tensor([[-0.2838, 0.0829, -0.1449, ..., -0.0510, -0.2402, -0.1694], [ 0.0075, 0.0991, -0.1160, ..., -0.1148, -0.0578, 0.1128], [ 0.0546, -0.1246, -0.1526, ..., 0.0065, -0.0503, -0.0965], ..., [ 0.0911, -0.0494, 0.1330, ..., 0.0661, 0.1894, -0.0123], [ 0.1142, -0.2064, -0.1302, ..., -0.2644, -0.0652, 0.1948], [-0.1687, 0.0874, 0.0518, ..., -0.2667, -0.1357, -0.0445]], device='cuda:0'), grad: tensor([[ 1.3970e-08, 2.0489e-08, 2.6077e-08, ..., 6.5193e-09, 1.1176e-08, 7.4506e-09], [ 1.7881e-07, 6.5286e-07, 7.4413e-07, ..., 5.4948e-08, 1.9744e-07, 1.9465e-07], [ 4.6253e-05, 4.4703e-08, 1.5453e-05, ..., 1.7300e-05, 3.3557e-05, 8.7172e-07], ..., [-4.6670e-05, 7.4506e-08, -1.5542e-05, ..., -1.7464e-05, -3.3945e-05, -8.4471e-07], [ 1.1269e-07, 8.9407e-08, 7.6368e-08, ..., 5.2154e-08, 1.0245e-07, -3.4459e-08], [ 2.1420e-08, 1.4342e-07, -4.3772e-08, ..., 1.1176e-08, 2.4214e-08, 1.6764e-08]], device='cuda:0') Epoch 298, bias, value: tensor([-0.0169, -0.0162, -0.0045, -0.0176, -0.0088, -0.0002, 0.0094, 0.0190, 0.0170, -0.0086], device='cuda:0'), grad: tensor([ 9.4995e-08, 2.2501e-06, 5.6505e-05, -6.8210e-06, -1.0645e-06, 4.0047e-06, 3.2317e-07, -5.6624e-05, 5.0105e-07, 9.1922e-07], device='cuda:0') 100 0.0001 changing lr epoch 297, time 214.93, cls_loss 0.0012 cls_loss_mapping 0.0018 cls_loss_causal 0.4977 re_mapping 0.0048 re_causal 0.0134 /// teacc 99.03 lr 0.00010000 Epoch 299, weight, value: tensor([[-0.2853, 0.0829, -0.1461, ..., -0.0520, -0.2417, -0.1697], [ 0.0074, 0.0985, -0.1162, ..., -0.1151, -0.0581, 0.1128], [ 0.0542, -0.1249, -0.1537, ..., 0.0063, -0.0507, -0.0969], ..., [ 0.0915, -0.0498, 0.1334, ..., 0.0665, 0.1900, -0.0123], [ 0.1143, -0.2076, -0.1303, ..., -0.2654, -0.0653, 0.1949], [-0.1688, 0.0871, 0.0517, ..., -0.2675, -0.1365, -0.0446]], device='cuda:0'), grad: tensor([[ 9.3132e-09, 2.7940e-08, 9.3132e-09, ..., 3.7253e-09, 9.3132e-10, 3.7253e-08], [-4.7032e-07, -2.5649e-06, -4.4424e-07, ..., 1.1176e-08, 5.5879e-09, -2.3413e-06], [ 3.7253e-09, 5.8673e-08, 1.3970e-08, ..., -6.3330e-08, -6.5193e-09, 5.2154e-08], ..., [ 3.6694e-07, 2.0564e-06, 3.4366e-07, ..., -4.6566e-09, -9.3132e-09, 1.8682e-06], [-2.8871e-08, 1.5274e-07, 2.5146e-08, ..., 3.9116e-08, 1.8626e-09, 3.3528e-08], [ 4.0978e-08, 3.9302e-07, 3.5390e-08, ..., 2.1420e-08, 5.5879e-09, 1.6391e-07]], device='cuda:0') Epoch 299, bias, value: tensor([-0.0171, -0.0164, -0.0047, -0.0167, -0.0073, -0.0011, 0.0102, 0.0191, 0.0167, -0.0092], device='cuda:0'), grad: tensor([ 1.8068e-07, -9.8422e-06, -1.1735e-06, -9.3132e-10, 1.2107e-08, -5.3272e-07, 3.3434e-07, 7.9125e-06, 1.3225e-06, 1.7472e-06], device='cuda:0') 100 0.0001 changing lr epoch 298, time 214.67, cls_loss 0.0011 cls_loss_mapping 0.0018 cls_loss_causal 0.4835 re_mapping 0.0048 re_causal 0.0136 /// teacc 99.01 lr 0.00010000 Epoch 300, weight, value: tensor([[-0.2867, 0.0830, -0.1467, ..., -0.0524, -0.2421, -0.1713], [ 0.0076, 0.1003, -0.1155, ..., -0.1153, -0.0582, 0.1138], [ 0.0541, -0.1254, -0.1541, ..., 0.0063, -0.0506, -0.0978], ..., [ 0.0916, -0.0513, 0.1333, ..., 0.0666, 0.1902, -0.0126], [ 0.1146, -0.2081, -0.1304, ..., -0.2661, -0.0655, 0.1953], [-0.1693, 0.0867, 0.0512, ..., -0.2690, -0.1377, -0.0453]], device='cuda:0'), grad: tensor([[ 5.7742e-08, -1.5888e-06, 7.4506e-09, ..., 3.7253e-09, 1.8626e-09, 2.0396e-07], [ 7.7300e-08, -1.0245e-08, 9.0338e-08, ..., 6.7987e-08, 4.0047e-08, -3.6322e-08], [ 5.4017e-08, 1.1176e-08, 8.1025e-08, ..., 4.9360e-08, 2.5146e-08, 2.7940e-09], ..., [-2.9895e-07, 3.4459e-08, -3.1479e-07, ..., -1.9744e-07, -1.3970e-07, 3.5390e-08], [-1.2666e-07, 3.9116e-08, 5.0291e-08, ..., 2.9802e-08, 8.3819e-09, -5.3644e-07], [ 1.9837e-07, 2.8405e-07, -8.9407e-08, ..., 6.2399e-08, 4.7497e-08, 2.8498e-07]], device='cuda:0') Epoch 300, bias, value: tensor([-0.0170, -0.0159, -0.0046, -0.0157, -0.0068, -0.0019, 0.0102, 0.0188, 0.0168, -0.0097], device='cuda:0'), grad: tensor([-5.6922e-06, 2.3060e-06, 1.1735e-07, -3.4459e-08, 5.8115e-07, 1.9222e-06, -1.0058e-07, -3.5111e-07, -8.7824e-07, 2.1085e-06], device='cuda:0') 100 0.0001 changing lr epoch 299, time 214.43, cls_loss 0.0010 cls_loss_mapping 0.0023 cls_loss_causal 0.4911 re_mapping 0.0047 re_causal 0.0139 /// teacc 99.00 lr 0.00010000 Epoch 301, weight, value: tensor([[-0.2877, 0.0832, -0.1471, ..., -0.0534, -0.2424, -0.1715], [ 0.0070, 0.1012, -0.1161, ..., -0.1160, -0.0594, 0.1144], [ 0.0540, -0.1256, -0.1557, ..., 0.0067, -0.0510, -0.0980], ..., [ 0.0922, -0.0521, 0.1341, ..., 0.0668, 0.1914, -0.0129], [ 0.1147, -0.2083, -0.1304, ..., -0.2668, -0.0657, 0.1955], [-0.1695, 0.0865, 0.0513, ..., -0.2693, -0.1378, -0.0457]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -2.3469e-07, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [-9.3132e-10, -1.6764e-08, -2.3283e-08, ..., 9.3132e-10, -1.1176e-08, -4.1816e-07], [ 2.4214e-08, 1.0245e-08, 1.3039e-08, ..., 9.3132e-10, 5.5879e-09, 2.0675e-07], ..., [ 8.3819e-09, 1.1176e-08, 1.3970e-08, ..., 0.0000e+00, 5.5879e-09, 2.1793e-07], [-1.8626e-07, 3.7253e-09, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, -1.2573e-07], [ 6.5193e-09, 2.1886e-07, -4.6566e-09, ..., 0.0000e+00, 0.0000e+00, 6.5193e-09]], device='cuda:0') Epoch 301, bias, value: tensor([-0.0169, -0.0161, -0.0044, -0.0156, -0.0069, -0.0017, 0.0099, 0.0191, 0.0168, -0.0099], device='cuda:0'), grad: tensor([-4.8708e-07, -5.2992e-07, 3.0454e-07, 7.3574e-08, 7.4506e-09, 1.1269e-07, 1.6112e-07, 2.9150e-07, -3.9674e-07, 4.7032e-07], device='cuda:0') 100 0.0001 changing lr epoch 300, time 214.54, cls_loss 0.0009 cls_loss_mapping 0.0019 cls_loss_causal 0.4817 re_mapping 0.0046 re_causal 0.0142 /// teacc 99.00 lr 0.00010000 Epoch 302, weight, value: tensor([[-0.2878, 0.0834, -0.1478, ..., -0.0536, -0.2425, -0.1712], [ 0.0067, 0.1013, -0.1165, ..., -0.1165, -0.0602, 0.1147], [ 0.0544, -0.1259, -0.1560, ..., 0.0070, -0.0507, -0.0985], ..., [ 0.0924, -0.0530, 0.1345, ..., 0.0668, 0.1919, -0.0132], [ 0.1147, -0.2086, -0.1305, ..., -0.2671, -0.0657, 0.1956], [-0.1695, 0.0860, 0.0511, ..., -0.2694, -0.1405, -0.0459]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -1.4342e-07, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09], [ 2.5146e-08, -1.3039e-08, 1.3039e-08, ..., 3.7253e-09, 9.3132e-09, -7.4506e-09], [ 2.2352e-08, 5.5879e-09, 9.3132e-10, ..., -9.3132e-10, 0.0000e+00, 3.1665e-08], ..., [-1.0245e-08, 1.6764e-08, -1.3039e-08, ..., -9.3132e-10, -1.0245e-08, 2.6077e-08], [-1.4063e-07, 1.3039e-08, 1.2107e-08, ..., 4.6566e-09, 1.8626e-09, -1.9930e-07], [ 5.5879e-09, 2.9337e-07, 9.3132e-10, ..., 9.3132e-10, 1.8626e-09, 1.4249e-07]], device='cuda:0') Epoch 302, bias, value: tensor([-0.0167, -0.0162, -0.0044, -0.0159, -0.0054, -0.0015, 0.0098, 0.0191, 0.0168, -0.0107], device='cuda:0'), grad: tensor([-2.7008e-07, 1.5832e-08, 6.2399e-08, -1.5832e-08, -1.0999e-06, 2.0582e-07, 1.3318e-07, 4.9360e-08, -2.9523e-07, 1.2359e-06], device='cuda:0') 100 0.0001 changing lr epoch 301, time 214.96, cls_loss 0.0010 cls_loss_mapping 0.0017 cls_loss_causal 0.5097 re_mapping 0.0044 re_causal 0.0134 /// teacc 99.03 lr 0.00010000 Epoch 303, weight, value: tensor([[-0.2886, 0.0834, -0.1486, ..., -0.0537, -0.2427, -0.1717], [ 0.0036, 0.1016, -0.1194, ..., -0.1166, -0.0633, 0.1141], [ 0.0542, -0.1263, -0.1566, ..., 0.0070, -0.0508, -0.0990], ..., [ 0.0953, -0.0560, 0.1368, ..., 0.0668, 0.1950, -0.0124], [ 0.1148, -0.2090, -0.1305, ..., -0.2674, -0.0660, 0.1957], [-0.1694, 0.0860, 0.0520, ..., -0.2688, -0.1418, -0.0460]], device='cuda:0'), grad: tensor([[ 1.3039e-08, -1.6298e-07, 1.7695e-08, ..., 7.4506e-09, 8.3819e-09, 1.8626e-09], [-8.1062e-06, -1.3784e-07, 4.6566e-08, ..., 1.9558e-08, -7.3649e-06, -2.6356e-07], [ 7.6592e-06, 4.6566e-09, 2.1420e-08, ..., 1.0245e-08, 6.9365e-06, 1.4901e-08], ..., [-6.8918e-08, 8.2888e-08, -7.9069e-07, ..., -3.3714e-07, 4.9360e-08, 1.5181e-07], [-2.2911e-07, 1.4901e-08, 2.2352e-08, ..., 9.3132e-09, 1.3039e-08, -3.0920e-07], [ 5.0757e-07, 2.5146e-08, 4.9360e-07, ..., 2.1793e-07, 2.5425e-07, 2.1793e-07]], device='cuda:0') Epoch 303, bias, value: tensor([-0.0167, -0.0189, -0.0046, -0.0164, -0.0045, -0.0012, 0.0099, 0.0214, 0.0168, -0.0109], device='cuda:0'), grad: tensor([-2.7940e-07, -3.6627e-05, 3.4064e-05, 1.0524e-07, 4.0699e-07, 2.2817e-07, 3.8184e-07, 9.2201e-07, -7.1526e-07, 1.5236e-06], device='cuda:0') 100 0.0001 changing lr epoch 302, time 215.26, cls_loss 0.0012 cls_loss_mapping 0.0017 cls_loss_causal 0.4741 re_mapping 0.0044 re_causal 0.0128 /// teacc 98.97 lr 0.00010000 Epoch 304, weight, value: tensor([[-0.2911, 0.0836, -0.1492, ..., -0.0557, -0.2445, -0.1718], [ 0.0026, 0.1020, -0.1203, ..., -0.1174, -0.0644, 0.1142], [ 0.0546, -0.1268, -0.1580, ..., 0.0073, -0.0512, -0.0993], ..., [ 0.0963, -0.0563, 0.1377, ..., 0.0672, 0.1961, -0.0122], [ 0.1148, -0.2096, -0.1306, ..., -0.2689, -0.0665, 0.1958], [-0.1695, 0.0859, 0.0521, ..., -0.2693, -0.1420, -0.0463]], device='cuda:0'), grad: tensor([[ 2.0489e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 6.0536e-08], [ 5.2154e-08, -2.7940e-09, 4.4703e-08, ..., 4.7497e-08, 2.7940e-08, 7.4506e-09], [ 1.0058e-07, 0.0000e+00, 3.6322e-08, ..., -4.5635e-08, 2.6077e-08, 2.3097e-07], ..., [-9.6858e-08, 5.5879e-09, -1.0617e-07, ..., -4.8429e-08, -7.0781e-08, 2.4214e-08], [-1.5739e-07, 2.7940e-09, 4.6566e-09, ..., 2.7940e-09, 1.8626e-09, -4.5728e-07], [ 1.5832e-08, -6.2399e-08, -5.9605e-08, ..., 6.5193e-09, -2.0489e-08, 1.5832e-08]], device='cuda:0') Epoch 304, bias, value: tensor([-0.0165, -0.0198, -0.0038, -0.0166, -0.0044, -0.0020, 0.0111, 0.0223, 0.0166, -0.0111], device='cuda:0'), grad: tensor([ 8.0094e-08, 1.8161e-07, 1.2666e-07, 1.7695e-08, 2.1048e-07, 1.2107e-08, 2.1886e-07, -1.3411e-07, -5.7928e-07, -1.3132e-07], device='cuda:0') 100 0.0001 changing lr epoch 303, time 215.10, cls_loss 0.0013 cls_loss_mapping 0.0016 cls_loss_causal 0.4839 re_mapping 0.0046 re_causal 0.0133 /// teacc 98.98 lr 0.00010000 Epoch 305, weight, value: tensor([[-0.2898, 0.0838, -0.1501, ..., -0.0561, -0.2453, -0.1686], [ 0.0005, 0.1032, -0.1229, ..., -0.1178, -0.0672, 0.1129], [ 0.0547, -0.1274, -0.1601, ..., 0.0076, -0.0515, -0.0995], ..., [ 0.0984, -0.0578, 0.1405, ..., 0.0673, 0.1990, -0.0107], [ 0.1148, -0.2103, -0.1308, ..., -0.2696, -0.0670, 0.1957], [-0.1697, 0.0857, 0.0522, ..., -0.2697, -0.1422, -0.0465]], device='cuda:0'), grad: tensor([[ 4.6566e-09, -2.5239e-07, 4.6566e-09, ..., 1.8626e-09, 1.8626e-09, 3.7253e-09], [ 5.1223e-08, 2.0489e-08, 1.0896e-07, ..., 3.0734e-08, 3.0734e-08, 1.8626e-09], [-1.1176e-08, 2.7940e-09, 5.5879e-09, ..., -1.3039e-08, -0.0000e+00, 2.0489e-08], ..., [-4.1910e-08, 9.3132e-10, 2.7940e-08, ..., 4.0047e-08, -4.2841e-08, 2.7008e-08], [-5.2154e-08, 1.3039e-08, 4.6566e-09, ..., 1.1176e-08, 9.3132e-10, -1.0710e-07], [ 3.2596e-08, 1.8626e-08, 3.8184e-08, ..., 1.7695e-08, 7.4506e-09, 4.0047e-08]], device='cuda:0') Epoch 305, bias, value: tensor([-0.0162, -0.0225, -0.0037, -0.0169, -0.0043, 0.0009, 0.0081, 0.0250, 0.0164, -0.0113], device='cuda:0'), grad: tensor([-5.4762e-07, 2.1979e-07, -3.3528e-08, 3.4086e-07, 4.5635e-08, -4.0699e-07, 5.3085e-08, 1.1455e-07, 3.0734e-08, 1.8533e-07], device='cuda:0') 100 0.0001 changing lr epoch 304, time 215.05, cls_loss 0.0013 cls_loss_mapping 0.0024 cls_loss_causal 0.4494 re_mapping 0.0049 re_causal 0.0126 /// teacc 98.96 lr 0.00010000 Epoch 306, weight, value: tensor([[-0.2901, 0.0838, -0.1509, ..., -0.0567, -0.2456, -0.1688], [ 0.0005, 0.1040, -0.1228, ..., -0.1181, -0.0673, 0.1137], [ 0.0548, -0.1279, -0.1607, ..., 0.0076, -0.0513, -0.0999], ..., [ 0.0984, -0.0598, 0.1403, ..., 0.0673, 0.1991, -0.0117], [ 0.1148, -0.2119, -0.1311, ..., -0.2705, -0.0672, 0.1959], [-0.1695, 0.0864, 0.0541, ..., -0.2699, -0.1423, -0.0454]], device='cuda:0'), grad: tensor([[ 1.0245e-08, 6.5193e-09, 1.6764e-08, ..., 7.4506e-09, 7.4506e-09, 6.5193e-09], [ 5.8673e-07, 1.3728e-06, 2.9616e-06, ..., 4.3306e-07, 3.9674e-07, 1.0896e-06], [ 1.8859e-06, 2.7940e-08, 1.4203e-06, ..., 1.3756e-06, 1.3253e-06, 2.7940e-08], ..., [-3.5986e-06, 1.1381e-06, -9.2853e-07, ..., -2.6468e-06, -2.4755e-06, 9.5926e-07], [ 7.4506e-08, 1.5181e-07, 2.8592e-07, ..., 5.5879e-08, 5.1223e-08, 8.1956e-08], [ 9.2201e-08, -2.8089e-06, -5.5246e-06, ..., 6.1467e-08, 5.8673e-08, -1.8142e-06]], device='cuda:0') Epoch 306, bias, value: tensor([-0.0162, -0.0224, -0.0040, -0.0174, -0.0060, 0.0012, 0.0083, 0.0248, 0.0159, -0.0099], device='cuda:0'), grad: tensor([ 5.5879e-08, 6.7428e-06, 4.0643e-06, 2.0824e-06, -5.0943e-07, -2.8312e-07, 2.0210e-07, -2.8946e-06, 9.0059e-07, -1.0334e-05], device='cuda:0') 100 0.0001 changing lr epoch 305, time 214.82, cls_loss 0.0011 cls_loss_mapping 0.0018 cls_loss_causal 0.5117 re_mapping 0.0047 re_causal 0.0133 /// teacc 99.01 lr 0.00010000 Epoch 307, weight, value: tensor([[-0.2910, 0.0839, -0.1518, ..., -0.0573, -0.2464, -0.1694], [ 0.0004, 0.1042, -0.1231, ..., -0.1184, -0.0674, 0.1135], [ 0.0560, -0.1282, -0.1612, ..., 0.0087, -0.0508, -0.1004], ..., [ 0.0984, -0.0602, 0.1407, ..., 0.0669, 0.1992, -0.0114], [ 0.1148, -0.2123, -0.1312, ..., -0.2730, -0.0675, 0.1963], [-0.1700, 0.0863, 0.0539, ..., -0.2708, -0.1434, -0.0459]], device='cuda:0'), grad: tensor([[ 2.2445e-07, 5.5879e-09, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 3.0454e-07], [ 2.0489e-08, -0.0000e+00, 5.4017e-08, ..., 9.3132e-09, 4.6566e-09, -8.0094e-08], [ 1.8626e-09, 1.0245e-08, 2.3283e-08, ..., 3.7253e-09, -8.3819e-09, 6.6124e-08], ..., [ 4.2841e-08, 9.1270e-08, 2.4214e-08, ..., 1.1176e-08, -9.3132e-10, 6.7055e-08], [-3.2224e-07, 6.7055e-08, 8.4750e-08, ..., 7.4506e-09, 1.8626e-09, -5.1409e-07], [-2.7940e-08, -2.6543e-07, -3.6228e-07, ..., 9.3132e-10, 1.8626e-09, 1.3411e-07]], device='cuda:0') Epoch 307, bias, value: tensor([-0.0163, -0.0225, -0.0029, -0.0182, -0.0056, 0.0015, 0.0083, 0.0248, 0.0158, -0.0101], device='cuda:0'), grad: tensor([ 4.8801e-07, -1.8626e-09, 7.9162e-08, -2.6077e-08, 5.4948e-08, 6.4261e-08, 6.9849e-08, 3.7625e-07, -3.9581e-07, -7.0408e-07], device='cuda:0') 100 0.0001 changing lr epoch 306, time 214.93, cls_loss 0.0011 cls_loss_mapping 0.0017 cls_loss_causal 0.4907 re_mapping 0.0048 re_causal 0.0134 /// teacc 98.98 lr 0.00010000 Epoch 308, weight, value: tensor([[-0.2916, 0.0839, -0.1525, ..., -0.0576, -0.2471, -0.1696], [ 0.0004, 0.1046, -0.1232, ..., -0.1189, -0.0675, 0.1139], [ 0.0560, -0.1286, -0.1620, ..., 0.0089, -0.0508, -0.1016], ..., [ 0.0985, -0.0606, 0.1408, ..., 0.0670, 0.1993, -0.0116], [ 0.1152, -0.2127, -0.1313, ..., -0.2737, -0.0676, 0.1968], [-0.1702, 0.0862, 0.0540, ..., -0.2712, -0.1436, -0.0461]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 9.3132e-10], [ 2.0675e-07, -1.9558e-08, 1.7136e-07, ..., 3.0734e-08, 7.3574e-08, 1.6764e-08], [-3.7253e-09, 2.7940e-09, 3.7253e-09, ..., -5.5879e-09, -4.6566e-09, 4.6566e-09], ..., [-3.4180e-07, 2.2352e-08, -2.8498e-07, ..., -4.7497e-08, -1.1735e-07, -4.7497e-08], [ 7.8231e-08, 1.1176e-08, 6.7055e-08, ..., 1.3039e-08, 2.7008e-08, 2.7940e-08], [ 4.1910e-08, 1.1437e-06, 2.9802e-08, ..., 6.5193e-09, 1.4901e-08, 1.3039e-06]], device='cuda:0') Epoch 308, bias, value: tensor([-0.0162, -0.0224, -0.0032, -0.0187, -0.0053, 0.0016, 0.0084, 0.0247, 0.0159, -0.0103], device='cuda:0'), grad: tensor([ 1.8626e-09, 2.9150e-07, -4.6566e-09, 2.0489e-08, -5.8301e-06, 1.0245e-08, 2.7101e-07, -5.0757e-07, 1.9278e-07, 5.5321e-06], device='cuda:0') 100 0.0001 changing lr epoch 307, time 215.13, cls_loss 0.0012 cls_loss_mapping 0.0017 cls_loss_causal 0.4949 re_mapping 0.0048 re_causal 0.0137 /// teacc 99.01 lr 0.00010000 Epoch 309, weight, value: tensor([[-0.2922, 0.0839, -0.1535, ..., -0.0584, -0.2473, -0.1698], [ 0.0005, 0.1084, -0.1226, ..., -0.1191, -0.0675, 0.1175], [ 0.0543, -0.1321, -0.1657, ..., 0.0077, -0.0520, -0.1052], ..., [ 0.0987, -0.0630, 0.1408, ..., 0.0680, 0.1995, -0.0132], [ 0.1153, -0.2132, -0.1313, ..., -0.2751, -0.0678, 0.1971], [-0.1705, 0.0863, 0.0541, ..., -0.2723, -0.1439, -0.0467]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -3.7253e-09, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 9.3132e-10], [-1.5646e-07, -3.9022e-07, -1.0245e-08, ..., 5.5879e-09, 1.8626e-09, -6.7987e-07], [-1.2293e-07, 1.1176e-08, 2.7940e-09, ..., -8.7544e-08, -3.6322e-08, 6.5193e-09], ..., [ 9.7789e-08, 1.0896e-07, 3.3528e-08, ..., 6.7987e-08, 3.1665e-08, 6.4261e-08], [ 1.3970e-08, 7.4506e-09, 4.6566e-09, ..., 1.1176e-08, 9.3132e-10, 7.4506e-09], [ 3.7253e-09, 3.2596e-08, -5.8673e-08, ..., 1.8626e-09, 1.8626e-09, 2.3283e-08]], device='cuda:0') Epoch 309, bias, value: tensor([-0.0162, -0.0209, -0.0070, -0.0192, -0.0056, 0.0018, 0.0085, 0.0246, 0.0160, -0.0103], device='cuda:0'), grad: tensor([-9.3132e-10, -1.7025e-06, -1.9372e-07, 4.3772e-08, -1.1176e-07, 2.1420e-08, 1.2880e-06, 5.7090e-07, 6.1467e-08, 4.5635e-08], device='cuda:0') 100 0.0001 changing lr epoch 308, time 215.30, cls_loss 0.0009 cls_loss_mapping 0.0017 cls_loss_causal 0.4658 re_mapping 0.0049 re_causal 0.0138 /// teacc 99.03 lr 0.00010000 Epoch 310, weight, value: tensor([[-0.2925, 0.0840, -0.1539, ..., -0.0586, -0.2474, -0.1700], [ 0.0005, 0.1087, -0.1224, ..., -0.1191, -0.0675, 0.1179], [ 0.0540, -0.1322, -0.1664, ..., 0.0075, -0.0529, -0.1052], ..., [ 0.0987, -0.0633, 0.1407, ..., 0.0682, 0.1996, -0.0137], [ 0.1153, -0.2138, -0.1315, ..., -0.2759, -0.0679, 0.1971], [-0.1705, 0.0871, 0.0567, ..., -0.2724, -0.1439, -0.0460]], device='cuda:0'), grad: tensor([[ 3.5390e-08, 1.4901e-08, 9.5926e-08, ..., 1.9558e-08, 2.7940e-09, 1.9930e-07], [-6.1244e-06, -5.5246e-06, -2.1562e-05, ..., -3.4738e-06, -1.3597e-07, -4.8161e-05], [ 1.1530e-06, 8.9593e-07, 3.9302e-06, ..., 5.7090e-07, 1.1083e-07, 8.0317e-06], ..., [ 2.6245e-06, 2.9244e-06, 1.0878e-05, ..., 1.6522e-06, -1.3597e-07, 2.5541e-05], [ 1.4622e-07, 1.0431e-07, 3.4459e-07, ..., 8.5682e-08, 1.4901e-08, 8.8289e-07], [ 1.5553e-07, 2.6077e-08, 1.2014e-07, ..., 6.0536e-08, 4.1910e-08, 1.2759e-07]], device='cuda:0') Epoch 310, bias, value: tensor([-0.0162, -0.0207, -0.0071, -0.0192, -0.0069, 0.0019, 0.0083, 0.0244, 0.0158, -0.0094], device='cuda:0'), grad: tensor([ 3.5856e-07, -8.4579e-05, 1.4186e-05, 2.2903e-05, 2.6822e-07, 6.0070e-07, 4.0047e-07, 4.3571e-05, 1.8459e-06, 5.6252e-07], device='cuda:0') 100 0.0001 changing lr epoch 309, time 214.99, cls_loss 0.0013 cls_loss_mapping 0.0025 cls_loss_causal 0.4816 re_mapping 0.0049 re_causal 0.0130 /// teacc 99.06 lr 0.00010000 Epoch 311, weight, value: tensor([[-0.2933, 0.0841, -0.1548, ..., -0.0590, -0.2477, -0.1706], [ 0.0008, 0.1090, -0.1223, ..., -0.1197, -0.0673, 0.1190], [ 0.0533, -0.1324, -0.1674, ..., 0.0075, -0.0541, -0.1057], ..., [ 0.0988, -0.0638, 0.1401, ..., 0.0688, 0.1997, -0.0146], [ 0.1148, -0.2144, -0.1317, ..., -0.2774, -0.0683, 0.1967], [-0.1710, 0.0871, 0.0594, ..., -0.2731, -0.1456, -0.0462]], device='cuda:0'), grad: tensor([[ 1.9558e-08, -2.9430e-07, 4.6566e-09, ..., 9.3132e-10, 9.3132e-10, 3.5390e-08], [ 2.7940e-08, 1.4901e-08, 7.4506e-08, ..., 1.8626e-08, 3.0734e-08, -3.2596e-08], [ 2.1420e-08, 7.4506e-09, 2.7940e-08, ..., 5.5879e-09, 9.3132e-09, 1.1176e-08], ..., [-1.3504e-07, 2.5146e-08, -4.2934e-07, ..., -7.3574e-08, -1.9465e-07, 3.7253e-08], [-9.1270e-08, 1.1269e-07, 5.0291e-08, ..., 1.8626e-09, 2.7940e-09, -1.8161e-07], [ 1.4529e-07, 1.9129e-06, 2.1141e-07, ..., 4.7497e-08, 1.4063e-07, 1.3411e-07]], device='cuda:0') Epoch 311, bias, value: tensor([-0.0161, -0.0201, -0.0079, -0.0198, -0.0068, 0.0016, 0.0081, 0.0239, 0.0150, -0.0086], device='cuda:0'), grad: tensor([-3.7998e-07, 1.3970e-07, 7.3574e-08, 6.8638e-07, -2.9802e-08, -8.7395e-06, 5.6252e-07, -4.5355e-07, 1.9930e-07, 7.9572e-06], device='cuda:0') 100 0.0001 changing lr epoch 310, time 215.22, cls_loss 0.0017 cls_loss_mapping 0.0017 cls_loss_causal 0.4823 re_mapping 0.0050 re_causal 0.0131 /// teacc 98.96 lr 0.00010000 Epoch 312, weight, value: tensor([[-0.2936, 0.0809, -0.1561, ..., -0.0616, -0.2482, -0.1701], [ 0.0008, 0.1091, -0.1225, ..., -0.1201, -0.0670, 0.1192], [ 0.0529, -0.1325, -0.1681, ..., 0.0072, -0.0552, -0.1058], ..., [ 0.0991, -0.0641, 0.1406, ..., 0.0699, 0.1996, -0.0149], [ 0.1148, -0.2151, -0.1319, ..., -0.2790, -0.0688, 0.1967], [-0.1713, 0.0868, 0.0593, ..., -0.2737, -0.1459, -0.0466]], device='cuda:0'), grad: tensor([[-8.4657e-07, -1.5683e-06, 1.3039e-08, ..., 9.3132e-09, 8.3819e-09, -1.9185e-07], [ 2.5332e-07, 7.7300e-08, 3.8184e-07, ..., 6.0536e-08, 1.9465e-07, -2.6077e-08], [ 1.1828e-07, 1.9744e-07, 1.2293e-07, ..., -1.8626e-09, 3.0734e-08, 4.6566e-08], ..., [-2.1700e-07, 5.1409e-07, -1.0766e-06, ..., -1.8068e-07, -5.4482e-07, 9.7789e-08], [ 7.1712e-08, 7.8231e-08, 6.9849e-08, ..., 3.0734e-08, 3.9116e-08, -9.3132e-10], [ 4.6194e-07, 4.6846e-07, 4.3772e-07, ..., 7.9162e-08, 2.3469e-07, 6.7987e-08]], device='cuda:0') Epoch 312, bias, value: tensor([-0.0195, -0.0196, -0.0083, -0.0210, -0.0061, 0.0036, 0.0103, 0.0237, 0.0147, -0.0091], device='cuda:0'), grad: tensor([-4.9919e-06, 8.4471e-07, 6.3889e-07, -4.0047e-08, 9.4995e-08, 2.8219e-07, 4.8894e-07, 2.0582e-07, 3.5018e-07, 2.1476e-06], device='cuda:0') 100 0.0001 changing lr epoch 311, time 215.77, cls_loss 0.0011 cls_loss_mapping 0.0016 cls_loss_causal 0.4722 re_mapping 0.0049 re_causal 0.0133 /// teacc 98.99 lr 0.00010000 Epoch 313, weight, value: tensor([[-0.2941, 0.0809, -0.1571, ..., -0.0622, -0.2487, -0.1696], [ 0.0008, 0.1090, -0.1225, ..., -0.1195, -0.0669, 0.1192], [ 0.0530, -0.1326, -0.1683, ..., 0.0071, -0.0553, -0.1058], ..., [ 0.0991, -0.0645, 0.1406, ..., 0.0696, 0.1995, -0.0151], [ 0.1152, -0.2156, -0.1321, ..., -0.2801, -0.0690, 0.1977], [-0.1714, 0.0870, 0.0595, ..., -0.2745, -0.1462, -0.0468]], device='cuda:0'), grad: tensor([[ 5.3085e-08, -1.5367e-07, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 7.4506e-08], [ 1.5832e-08, -5.5134e-07, 1.0245e-08, ..., 7.4506e-09, 4.6566e-09, -4.8149e-07], [ 3.9116e-08, 4.8429e-08, 2.6077e-08, ..., 1.8626e-08, 1.3039e-08, 5.9605e-08], ..., [-3.9116e-08, 1.4156e-07, -3.4459e-08, ..., -2.0489e-08, -2.4214e-08, 1.3132e-07], [-8.2627e-06, 5.6811e-08, -3.0305e-06, ..., -7.2923e-07, 9.3132e-10, -8.2627e-06], [ 7.2643e-08, 4.0047e-07, 1.8626e-09, ..., 9.3132e-10, 9.3132e-10, 3.6694e-07]], device='cuda:0') Epoch 313, bias, value: tensor([-0.0195, -0.0196, -0.0083, -0.0210, -0.0059, 0.0034, 0.0103, 0.0236, 0.0149, -0.0091], device='cuda:0'), grad: tensor([-1.0990e-07, -1.7229e-06, 2.4680e-07, 2.0206e-05, 6.7987e-08, 2.1141e-07, 1.4063e-07, 3.9581e-07, -2.0802e-05, 1.3700e-06], device='cuda:0') 100 0.0001 changing lr epoch 312, time 217.41, cls_loss 0.0010 cls_loss_mapping 0.0018 cls_loss_causal 0.4671 re_mapping 0.0048 re_causal 0.0134 /// teacc 98.99 lr 0.00010000 Epoch 314, weight, value: tensor([[-0.2946, 0.0810, -0.1576, ..., -0.0627, -0.2490, -0.1698], [ 0.0009, 0.1095, -0.1224, ..., -0.1195, -0.0669, 0.1196], [ 0.0532, -0.1326, -0.1683, ..., 0.0076, -0.0552, -0.1059], ..., [ 0.0990, -0.0653, 0.1407, ..., 0.0695, 0.1995, -0.0154], [ 0.1154, -0.2162, -0.1321, ..., -0.2812, -0.0691, 0.1981], [-0.1717, 0.0866, 0.0595, ..., -0.2749, -0.1463, -0.0475]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 3.3528e-08, 9.3132e-10, 4.6566e-09, ..., 9.3132e-09, 3.7253e-09, 0.0000e+00], [-3.3248e-07, 0.0000e+00, 1.2107e-08, ..., -8.1956e-08, 1.2107e-08, 0.0000e+00], ..., [-1.0245e-08, 3.7253e-09, -4.6566e-09, ..., -6.5193e-09, -1.8626e-08, 0.0000e+00], [ 1.8254e-07, 2.7940e-09, 6.5193e-09, ..., 4.6566e-08, 1.8626e-09, 0.0000e+00], [ 6.5193e-09, -1.0245e-08, -2.8871e-08, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 314, bias, value: tensor([-0.0195, -0.0194, -0.0082, -0.0219, -0.0053, 0.0036, 0.0103, 0.0235, 0.0149, -0.0095], device='cuda:0'), grad: tensor([ 3.1665e-08, 1.2293e-07, -1.3206e-06, 1.5181e-07, 2.3656e-07, 2.4214e-08, 2.7008e-08, 5.6811e-08, 7.1246e-07, -4.0047e-08], device='cuda:0') 100 0.0001 changing lr epoch 313, time 219.60, cls_loss 0.0012 cls_loss_mapping 0.0020 cls_loss_causal 0.4936 re_mapping 0.0045 re_causal 0.0131 /// teacc 98.98 lr 0.00010000 Epoch 315, weight, value: tensor([[-0.2968, 0.0809, -0.1602, ..., -0.0633, -0.2495, -0.1715], [ 0.0012, 0.1098, -0.1225, ..., -0.1198, -0.0669, 0.1204], [ 0.0522, -0.1327, -0.1688, ..., 0.0073, -0.0556, -0.1059], ..., [ 0.0992, -0.0661, 0.1408, ..., 0.0700, 0.1996, -0.0158], [ 0.1146, -0.2193, -0.1324, ..., -0.2821, -0.0693, 0.1972], [-0.1715, 0.0863, 0.0596, ..., -0.2754, -0.1466, -0.0473]], device='cuda:0'), grad: tensor([[ 1.6764e-08, 9.3132e-09, 3.5390e-08, ..., 2.7940e-09, 9.3132e-10, 2.6077e-08], [ 2.7940e-08, 2.2352e-08, 5.1223e-08, ..., 6.5193e-09, 9.3132e-09, -3.6322e-08], [-2.7940e-09, 3.7253e-09, 2.7940e-09, ..., -1.7695e-08, -4.6566e-09, 1.1176e-08], ..., [-2.3283e-08, 1.2200e-07, 8.1025e-08, ..., -9.3132e-10, -1.6764e-08, 6.0536e-08], [-1.1912e-06, -2.3562e-07, 4.6566e-08, ..., 1.8626e-09, 1.8626e-09, -2.5053e-06], [ 1.8626e-09, -1.8813e-07, -2.8405e-07, ..., 2.7940e-09, 6.5193e-09, 1.7695e-08]], device='cuda:0') Epoch 315, bias, value: tensor([-0.0195, -0.0191, -0.0086, -0.0221, -0.0043, 0.0041, 0.0103, 0.0235, 0.0135, -0.0097], device='cuda:0'), grad: tensor([ 8.2888e-08, 5.3085e-08, -2.2352e-08, 1.3039e-07, -2.9523e-07, 3.5334e-06, 1.1548e-07, 3.4925e-07, -3.6247e-06, -3.2317e-07], device='cuda:0') 100 0.0001 changing lr epoch 314, time 215.04, cls_loss 0.0013 cls_loss_mapping 0.0020 cls_loss_causal 0.4892 re_mapping 0.0043 re_causal 0.0126 /// teacc 99.01 lr 0.00010000 Epoch 316, weight, value: tensor([[-0.2976, 0.0807, -0.1625, ..., -0.0637, -0.2498, -0.1718], [ 0.0014, 0.1094, -0.1224, ..., -0.1184, -0.0663, 0.1205], [ 0.0506, -0.1329, -0.1696, ..., 0.0064, -0.0572, -0.1060], ..., [ 0.0994, -0.0663, 0.1409, ..., 0.0700, 0.1993, -0.0159], [ 0.1148, -0.2206, -0.1326, ..., -0.2844, -0.0696, 0.1975], [-0.1718, 0.0871, 0.0597, ..., -0.2767, -0.1468, -0.0477]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 2.9802e-06, 0.0000e+00, ..., 2.7940e-09, 9.3132e-10, 9.3132e-10], [ 2.7940e-09, -1.1176e-08, 2.7940e-09, ..., 1.2107e-08, 6.5193e-09, -3.8184e-08], [-7.3574e-08, 2.4214e-08, 9.3132e-10, ..., -7.6368e-08, -3.2596e-08, 1.8626e-09], ..., [ 5.9605e-08, 1.4901e-08, -2.7940e-09, ..., 5.0291e-08, 2.3283e-08, 4.0978e-08], [ 7.4506e-09, 1.7695e-08, 9.3132e-10, ..., 6.5193e-09, 2.7940e-09, 1.8626e-09], [ 9.3132e-10, -3.2224e-06, -1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 7.4506e-09]], device='cuda:0') Epoch 316, bias, value: tensor([-0.0196, -0.0187, -0.0092, -0.0236, -0.0040, 0.0057, 0.0102, 0.0232, 0.0133, -0.0094], device='cuda:0'), grad: tensor([ 4.9174e-06, -2.3283e-08, -2.0768e-07, 1.2107e-08, 2.6077e-07, 7.4506e-09, 6.5193e-09, 2.3562e-07, 6.1467e-08, -5.2825e-06], device='cuda:0') 100 0.0001 changing lr epoch 315, time 215.95, cls_loss 0.0009 cls_loss_mapping 0.0015 cls_loss_causal 0.4695 re_mapping 0.0047 re_causal 0.0135 /// teacc 99.03 lr 0.00010000 Epoch 317, weight, value: tensor([[-0.2980, 0.0808, -0.1628, ..., -0.0640, -0.2500, -0.1718], [ 0.0013, 0.1094, -0.1225, ..., -0.1187, -0.0663, 0.1205], [ 0.0504, -0.1330, -0.1698, ..., 0.0065, -0.0574, -0.1060], ..., [ 0.0995, -0.0664, 0.1410, ..., 0.0702, 0.1994, -0.0159], [ 0.1148, -0.2210, -0.1327, ..., -0.2859, -0.0699, 0.1976], [-0.1721, 0.0867, 0.0596, ..., -0.2769, -0.1473, -0.0480]], device='cuda:0'), grad: tensor([[ 4.6566e-10, -1.7695e-08, 1.3970e-09, ..., 3.7253e-09, 4.6566e-10, 9.3132e-10], [ 4.7963e-08, -1.1176e-08, 7.3574e-08, ..., 4.4238e-08, 2.0489e-08, -1.1222e-07], [ 1.1176e-08, 1.6298e-08, 2.0489e-08, ..., 3.3993e-08, -2.7940e-09, 1.3039e-08], ..., [-1.9651e-07, 8.1956e-08, -2.5379e-07, ..., -1.0058e-07, -5.9605e-08, 5.7742e-08], [-9.6858e-08, 5.1223e-09, 1.0710e-08, ..., 2.3283e-09, 1.3970e-09, -2.1234e-07], [ 2.3283e-08, 1.0850e-07, 1.5367e-08, ..., 1.1642e-08, 6.9849e-09, 3.7253e-09]], device='cuda:0') Epoch 317, bias, value: tensor([-0.0195, -0.0187, -0.0091, -0.0224, -0.0038, 0.0054, 0.0101, 0.0233, 0.0130, -0.0098], device='cuda:0'), grad: tensor([-2.9337e-08, 2.7008e-08, 1.1502e-07, 1.8626e-08, -4.5169e-07, 1.2526e-07, 1.7276e-07, -1.4296e-07, -2.0349e-07, 3.7672e-07], device='cuda:0') 100 0.0001 changing lr epoch 316, time 214.79, cls_loss 0.0012 cls_loss_mapping 0.0021 cls_loss_causal 0.4800 re_mapping 0.0046 re_causal 0.0126 /// teacc 99.01 lr 0.00010000 Epoch 318, weight, value: tensor([[-0.2982, 0.0796, -0.1641, ..., -0.0643, -0.2504, -0.1720], [ 0.0012, 0.1095, -0.1226, ..., -0.1190, -0.0664, 0.1207], [ 0.0504, -0.1332, -0.1703, ..., 0.0059, -0.0575, -0.1062], ..., [ 0.0996, -0.0670, 0.1412, ..., 0.0702, 0.1996, -0.0161], [ 0.1151, -0.2214, -0.1327, ..., -0.2868, -0.0695, 0.1978], [-0.1725, 0.0894, 0.0595, ..., -0.2773, -0.1484, -0.0484]], device='cuda:0'), grad: tensor([[ 3.2596e-09, -2.0955e-08, 2.7940e-09, ..., 2.3283e-09, 4.6566e-10, 9.3132e-10], [ 8.7079e-08, -6.5193e-09, 7.4506e-08, ..., 7.0315e-08, 4.0513e-08, -3.0734e-08], [-5.6345e-08, 1.8626e-09, 1.2107e-08, ..., -6.4261e-08, -8.8476e-09, 2.3283e-09], ..., [-1.9139e-07, 1.6298e-08, -1.6345e-07, ..., -1.1688e-07, -1.0384e-07, 2.5146e-08], [-1.5832e-08, 9.7789e-09, 1.5367e-08, ..., 2.5611e-08, 1.0710e-08, -5.0757e-08], [ 1.3039e-08, -3.9395e-07, -3.7905e-07, ..., 1.5367e-08, 3.7253e-09, 3.6322e-08]], device='cuda:0') Epoch 318, bias, value: tensor([-0.0205, -0.0188, -0.0093, -0.0226, -0.0037, 0.0057, 0.0102, 0.0233, 0.0128, -0.0073], device='cuda:0'), grad: tensor([-2.8871e-08, 2.0629e-07, -2.8918e-07, 7.2177e-08, 1.0841e-06, 7.6834e-08, -4.0978e-08, -2.5705e-07, 4.7963e-08, -8.7125e-07], device='cuda:0') 100 0.0001 changing lr epoch 317, time 215.15, cls_loss 0.0010 cls_loss_mapping 0.0017 cls_loss_causal 0.4611 re_mapping 0.0043 re_causal 0.0124 /// teacc 99.00 lr 0.00010000 Epoch 319, weight, value: tensor([[-0.2989, 0.0796, -0.1644, ..., -0.0648, -0.2506, -0.1724], [ 0.0012, 0.1095, -0.1227, ..., -0.1196, -0.0665, 0.1207], [ 0.0508, -0.1333, -0.1704, ..., 0.0064, -0.0572, -0.1061], ..., [ 0.0997, -0.0673, 0.1413, ..., 0.0703, 0.1996, -0.0163], [ 0.1153, -0.2213, -0.1328, ..., -0.2893, -0.0698, 0.1987], [-0.1727, 0.0892, 0.0597, ..., -0.2777, -0.1483, -0.0487]], device='cuda:0'), grad: tensor([[ 6.9849e-09, -4.0513e-08, 7.4506e-09, ..., 2.2817e-08, 0.0000e+00, 6.5193e-09], [ 2.3004e-07, 1.1176e-08, 2.9337e-08, ..., 2.1420e-08, 1.2573e-08, 1.8347e-07], [ 2.0787e-06, 4.6566e-09, 2.3283e-08, ..., 2.2817e-08, 1.9558e-08, 1.7509e-06], ..., [-6.2399e-08, 4.1910e-09, -5.3085e-08, ..., -4.4238e-08, -5.4482e-08, 6.5193e-09], [-2.9989e-06, 5.9139e-08, 7.2643e-08, ..., 2.0023e-08, 9.3132e-09, -2.6226e-06], [ 1.3178e-07, -2.9802e-08, -9.4064e-08, ..., 2.7940e-09, 2.3283e-09, 1.3551e-07]], device='cuda:0') Epoch 319, bias, value: tensor([-0.0205, -0.0188, -0.0092, -0.0235, -0.0031, 0.0062, 0.0102, 0.0233, 0.0130, -0.0075], device='cuda:0'), grad: tensor([-1.0664e-07, 7.7626e-07, 6.7316e-06, 2.3283e-09, 1.1595e-07, 1.4007e-06, 3.8464e-07, 1.4435e-08, -9.5218e-06, 2.0675e-07], device='cuda:0') 100 0.0001 changing lr epoch 318, time 215.02, cls_loss 0.0007 cls_loss_mapping 0.0013 cls_loss_causal 0.4717 re_mapping 0.0045 re_causal 0.0131 /// teacc 98.95 lr 0.00010000 Epoch 320, weight, value: tensor([[-0.2989, 0.0796, -0.1647, ..., -0.0650, -0.2509, -0.1723], [ 0.0012, 0.1094, -0.1227, ..., -0.1197, -0.0665, 0.1208], [ 0.0507, -0.1333, -0.1706, ..., 0.0064, -0.0572, -0.1062], ..., [ 0.0996, -0.0677, 0.1413, ..., 0.0702, 0.1996, -0.0168], [ 0.1156, -0.2216, -0.1329, ..., -0.2898, -0.0699, 0.1993], [-0.1729, 0.0887, 0.0594, ..., -0.2779, -0.1483, -0.0492]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.2731e-06, 9.3132e-09, ..., 7.4506e-09, 9.3132e-10, 1.8626e-09], [ 3.8184e-08, 2.7940e-09, 6.3330e-08, ..., 5.2154e-08, 2.7940e-08, -4.2841e-08], [-5.6438e-07, 1.4901e-08, -3.4180e-07, ..., -3.4925e-07, -5.8860e-07, 9.3132e-09], ..., [ 5.4110e-07, 1.6764e-08, 3.7625e-07, ..., 3.7253e-07, 5.5227e-07, 2.4214e-08], [-1.5832e-07, 1.6764e-08, 7.1712e-08, ..., 4.7497e-08, 1.8626e-09, -3.4086e-07], [ 9.3132e-10, 5.4017e-08, -1.2107e-08, ..., 3.7253e-09, 9.3132e-10, 3.2596e-08]], device='cuda:0') Epoch 320, bias, value: tensor([-0.0205, -0.0188, -0.0092, -0.0235, -0.0017, 0.0062, 0.0102, 0.0232, 0.0132, -0.0081], device='cuda:0'), grad: tensor([ 2.7940e-06, 9.7789e-08, -1.2489e-06, -4.4610e-07, -1.3504e-07, 1.5646e-07, -2.0862e-06, 1.3504e-06, -6.1281e-07, 1.3225e-07], device='cuda:0') 100 0.0001 changing lr epoch 319, time 214.63, cls_loss 0.0011 cls_loss_mapping 0.0015 cls_loss_causal 0.4845 re_mapping 0.0045 re_causal 0.0126 /// teacc 99.09 lr 0.00010000 Epoch 321, weight, value: tensor([[-0.2993, 0.0799, -0.1656, ..., -0.0656, -0.2519, -0.1707], [ 0.0012, 0.1095, -0.1228, ..., -0.1201, -0.0666, 0.1211], [ 0.0504, -0.1334, -0.1711, ..., 0.0063, -0.0574, -0.1062], ..., [ 0.0997, -0.0682, 0.1413, ..., 0.0705, 0.1997, -0.0174], [ 0.1161, -0.2220, -0.1331, ..., -0.2908, -0.0705, 0.2008], [-0.1728, 0.0890, 0.0609, ..., -0.2785, -0.1482, -0.0494]], device='cuda:0'), grad: tensor([[ 4.6566e-09, 8.3819e-09, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, -0.0000e+00], [ 3.7532e-07, -3.7253e-08, 8.3819e-09, ..., 3.7253e-09, 8.0094e-08, -8.2888e-08], [-6.5751e-07, 1.8626e-09, 2.7940e-09, ..., -9.3132e-10, -1.3690e-07, 2.7940e-09], ..., [ 4.3772e-08, 2.9802e-08, -1.5832e-08, ..., -4.6566e-09, 1.8626e-09, 4.3772e-08], [ 7.2643e-08, 1.8813e-07, 1.0245e-08, ..., 0.0000e+00, 1.4901e-08, 2.7940e-09], [ 1.3970e-08, 1.0245e-08, -7.4506e-09, ..., 2.7940e-09, 6.5193e-09, 1.8626e-08]], device='cuda:0') Epoch 321, bias, value: tensor([-0.0203, -0.0187, -0.0095, -0.0237, -0.0025, 0.0063, 0.0099, 0.0231, 0.0138, -0.0077], device='cuda:0'), grad: tensor([ 8.9407e-08, 1.0571e-06, -2.1290e-06, 2.2352e-08, 2.0862e-07, -4.0904e-06, 3.4533e-06, 2.7753e-07, 1.0347e-06, 8.7544e-08], device='cuda:0') 100 0.0001 changing lr epoch 320, time 214.85, cls_loss 0.0010 cls_loss_mapping 0.0015 cls_loss_causal 0.4677 re_mapping 0.0046 re_causal 0.0132 /// teacc 99.00 lr 0.00010000 Epoch 322, weight, value: tensor([[-0.3000, 0.0800, -0.1664, ..., -0.0660, -0.2523, -0.1708], [ 0.0009, 0.1096, -0.1229, ..., -0.1206, -0.0669, 0.1212], [ 0.0511, -0.1334, -0.1708, ..., 0.0067, -0.0568, -0.1059], ..., [ 0.0999, -0.0685, 0.1414, ..., 0.0706, 0.2000, -0.0181], [ 0.1165, -0.2223, -0.1333, ..., -0.2927, -0.0710, 0.2014], [-0.1731, 0.0887, 0.0609, ..., -0.2790, -0.1484, -0.0500]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -8.1211e-07, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 1.0245e-08, -4.5635e-08, 6.5193e-09, ..., 2.7940e-09, 6.5193e-09, -8.6613e-08], [-1.2107e-08, 1.5832e-08, 9.3132e-10, ..., -3.7253e-09, -6.5193e-09, 2.7940e-09], ..., [ 1.8626e-09, 2.4214e-08, -6.5193e-09, ..., 9.3132e-10, 0.0000e+00, 4.1910e-08], [ 0.0000e+00, 7.4506e-09, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 7.4506e-09], [ 1.8626e-09, 8.7265e-07, -1.8626e-09, ..., 0.0000e+00, 1.8626e-09, 8.1956e-08]], device='cuda:0') Epoch 322, bias, value: tensor([-0.0203, -0.0188, -0.0088, -0.0238, -0.0021, 0.0061, 0.0099, 0.0231, 0.0140, -0.0080], device='cuda:0'), grad: tensor([-2.0433e-06, -1.7881e-07, 1.6764e-08, 2.3283e-08, -2.8033e-07, 1.8626e-09, -8.8476e-08, 1.2200e-07, 9.2201e-08, 2.3283e-06], device='cuda:0') 100 0.0001 changing lr epoch 321, time 214.79, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.5133 re_mapping 0.0047 re_causal 0.0139 /// teacc 99.06 lr 0.00010000 Epoch 323, weight, value: tensor([[-0.3004, 0.0800, -0.1669, ..., -0.0662, -0.2527, -0.1711], [ 0.0005, 0.1094, -0.1234, ..., -0.1207, -0.0674, 0.1210], [ 0.0523, -0.1335, -0.1707, ..., 0.0078, -0.0555, -0.1059], ..., [ 0.1001, -0.0687, 0.1419, ..., 0.0699, 0.2004, -0.0176], [ 0.1164, -0.2226, -0.1336, ..., -0.2941, -0.0717, 0.2015], [-0.1733, 0.0887, 0.0609, ..., -0.2792, -0.1488, -0.0504]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 2.7940e-09, 5.5879e-09, ..., 9.3132e-10, 9.3132e-10, 9.3132e-10], [ 2.8871e-08, -8.3819e-09, 5.7742e-08, ..., 4.6566e-09, 3.1665e-08, -3.3528e-08], [ 4.6566e-09, 9.3132e-10, 1.7695e-08, ..., -5.5879e-09, 8.3819e-09, -2.6077e-08], ..., [-1.9930e-07, 7.4506e-09, -3.6974e-07, ..., 1.7695e-08, -2.1607e-07, 2.4214e-08], [ 6.5193e-09, 4.7497e-08, 7.5437e-08, ..., 9.3132e-10, 3.7253e-09, 2.0489e-08], [ 1.5087e-07, -2.3283e-08, 1.7881e-07, ..., 0.0000e+00, 1.5926e-07, -3.7253e-09]], device='cuda:0') Epoch 323, bias, value: tensor([-0.0202, -0.0190, -0.0085, -0.0238, -0.0020, 0.0058, 0.0100, 0.0232, 0.0138, -0.0081], device='cuda:0'), grad: tensor([ 3.4459e-08, 7.2643e-08, -2.7940e-06, 1.6764e-08, 2.2631e-07, -1.8720e-07, -9.9652e-08, -5.4762e-07, 5.0571e-07, 2.7698e-06], device='cuda:0') 100 0.0001 changing lr epoch 322, time 215.01, cls_loss 0.0009 cls_loss_mapping 0.0017 cls_loss_causal 0.4965 re_mapping 0.0046 re_causal 0.0137 /// teacc 99.07 lr 0.00010000 Epoch 324, weight, value: tensor([[-0.3010, 0.0801, -0.1684, ..., -0.0664, -0.2530, -0.1717], [ 0.0006, 0.1094, -0.1231, ..., -0.1211, -0.0674, 0.1214], [ 0.0527, -0.1335, -0.1706, ..., 0.0081, -0.0553, -0.1059], ..., [ 0.1000, -0.0689, 0.1418, ..., 0.0700, 0.2004, -0.0184], [ 0.1169, -0.2236, -0.1346, ..., -0.2947, -0.0722, 0.2029], [-0.1740, 0.0887, 0.0613, ..., -0.2804, -0.1495, -0.0517]], device='cuda:0'), grad: tensor([[ 5.3085e-08, 1.3420e-06, 0.0000e+00, ..., 9.3132e-09, 9.3132e-10, 0.0000e+00], [ 7.2643e-08, 9.3132e-09, 0.0000e+00, ..., 1.1176e-08, 5.5879e-09, 4.2841e-08], [-8.8662e-07, 1.8626e-08, 0.0000e+00, ..., -1.4622e-07, -1.2107e-08, 1.8626e-09], ..., [ 2.0489e-08, 5.5879e-09, 0.0000e+00, ..., 3.7253e-09, 1.8626e-09, 7.4506e-09], [ 3.5949e-07, 2.1420e-08, 0.0000e+00, ..., 6.4261e-08, 2.7940e-09, -6.7055e-08], [ 5.5879e-09, 3.5390e-08, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 324, bias, value: tensor([-0.0202, -0.0190, -0.0082, -0.0237, -0.0019, 0.0056, 0.0100, 0.0230, 0.0139, -0.0082], device='cuda:0'), grad: tensor([ 5.0440e-06, 2.2352e-07, -2.5127e-06, 8.6427e-07, 3.9116e-08, 9.4995e-08, -5.1148e-06, 7.9162e-08, 1.1539e-06, 1.1269e-07], device='cuda:0') 100 0.0001 changing lr epoch 323, time 214.72, cls_loss 0.0011 cls_loss_mapping 0.0016 cls_loss_causal 0.4979 re_mapping 0.0045 re_causal 0.0128 /// teacc 98.97 lr 0.00010000 Epoch 325, weight, value: tensor([[-0.3014, 0.0801, -0.1697, ..., -0.0665, -0.2531, -0.1718], [ 0.0006, 0.1095, -0.1232, ..., -0.1214, -0.0674, 0.1216], [ 0.0528, -0.1335, -0.1706, ..., 0.0083, -0.0552, -0.1060], ..., [ 0.1000, -0.0695, 0.1418, ..., 0.0699, 0.2005, -0.0188], [ 0.1169, -0.2242, -0.1347, ..., -0.2956, -0.0724, 0.2032], [-0.1743, 0.0900, 0.0636, ..., -0.2809, -0.1497, -0.0522]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.9558e-08, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 1.9278e-07, 8.1956e-08, 2.6822e-07, ..., 4.2841e-08, 7.3574e-08, -2.5146e-08], [-4.3772e-08, 1.3039e-08, 1.0245e-08, ..., -3.4459e-08, 1.8626e-09, 1.8626e-09], ..., [-1.5181e-07, 1.6764e-07, 1.5367e-07, ..., -7.4506e-09, -6.0536e-08, 2.7008e-08], [-1.6205e-07, 5.5879e-08, 3.6322e-08, ..., 0.0000e+00, 0.0000e+00, -3.9954e-07], [ 1.8626e-09, 4.1239e-06, -5.8208e-07, ..., 0.0000e+00, -1.5832e-08, 1.2945e-07]], device='cuda:0') Epoch 325, bias, value: tensor([-0.0202, -0.0190, -0.0081, -0.0266, -0.0043, 0.0079, 0.0100, 0.0230, 0.0135, -0.0070], device='cuda:0'), grad: tensor([-3.4459e-08, 1.0366e-06, -1.3597e-07, 3.2596e-07, -2.1279e-05, 2.7381e-07, 9.9652e-07, 7.7859e-07, -7.6089e-07, 1.8835e-05], device='cuda:0') 100 0.0001 changing lr epoch 324, time 214.49, cls_loss 0.0009 cls_loss_mapping 0.0019 cls_loss_causal 0.4498 re_mapping 0.0046 re_causal 0.0127 /// teacc 99.04 lr 0.00010000 Epoch 326, weight, value: tensor([[-0.3020, 0.0801, -0.1703, ..., -0.0670, -0.2538, -0.1718], [ 0.0007, 0.1095, -0.1230, ..., -0.1218, -0.0675, 0.1217], [ 0.0528, -0.1336, -0.1709, ..., 0.0083, -0.0553, -0.1058], ..., [ 0.1000, -0.0698, 0.1420, ..., 0.0700, 0.2007, -0.0193], [ 0.1171, -0.2245, -0.1348, ..., -0.2962, -0.0726, 0.2037], [-0.1752, 0.0899, 0.0630, ..., -0.2815, -0.1506, -0.0525]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -7.4506e-09, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 1.7695e-08, 1.8626e-09, 1.7695e-08, ..., 1.2107e-08, 1.4901e-08, 0.0000e+00], [ 3.4459e-08, 0.0000e+00, 2.7008e-08, ..., 1.8626e-08, 3.1665e-08, 9.3132e-09], ..., [-9.3132e-08, 1.8626e-09, -4.6566e-08, ..., -3.0734e-08, -8.7544e-08, -1.3039e-08], [ 2.7940e-09, -0.0000e+00, 1.3970e-08, ..., 1.0245e-08, 1.6764e-08, -1.3970e-08], [ 1.1176e-08, 2.7940e-09, 3.7253e-09, ..., 4.6566e-09, 8.3819e-09, 5.5879e-09]], device='cuda:0') Epoch 326, bias, value: tensor([-0.0202, -0.0191, -0.0070, -0.0267, -0.0041, 0.0079, 0.0101, 0.0229, 0.0134, -0.0073], device='cuda:0'), grad: tensor([-1.2107e-08, 4.8429e-08, 9.1270e-08, -6.8918e-08, 3.2596e-08, 5.5879e-08, -1.5832e-08, -1.7229e-07, 9.3132e-09, 3.3528e-08], device='cuda:0') 100 0.0001 changing lr epoch 325, time 214.54, cls_loss 0.0009 cls_loss_mapping 0.0018 cls_loss_causal 0.4603 re_mapping 0.0045 re_causal 0.0131 /// teacc 99.08 lr 0.00010000 Epoch 327, weight, value: tensor([[-0.3023, 0.0801, -0.1710, ..., -0.0674, -0.2540, -0.1721], [ 0.0006, 0.1095, -0.1231, ..., -0.1220, -0.0676, 0.1219], [ 0.0526, -0.1337, -0.1712, ..., 0.0082, -0.0556, -0.1059], ..., [ 0.1001, -0.0701, 0.1420, ..., 0.0693, 0.2008, -0.0196], [ 0.1176, -0.2248, -0.1349, ..., -0.2968, -0.0729, 0.2040], [-0.1754, 0.0898, 0.0631, ..., -0.2822, -0.1509, -0.0527]], device='cuda:0'), grad: tensor([[ 6.5193e-09, -1.9185e-07, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, -4.6566e-09], [ 7.4506e-09, 6.5193e-09, 9.3132e-09, ..., 9.3132e-10, 4.6566e-09, -7.4506e-09], [ 1.8626e-09, 4.6566e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 7.4506e-09], ..., [-2.5146e-08, 7.4506e-09, -2.4214e-08, ..., -3.7253e-09, -1.4901e-08, 1.2107e-08], [-5.4017e-08, 1.6764e-08, 1.3970e-08, ..., 9.3132e-10, 3.7253e-09, -2.4773e-07], [ 5.4017e-08, 8.3819e-09, -4.7497e-08, ..., 0.0000e+00, 2.7940e-09, 2.1420e-07]], device='cuda:0') Epoch 327, bias, value: tensor([-0.0202, -0.0194, -0.0063, -0.0265, -0.0042, 0.0074, 0.0101, 0.0228, 0.0138, -0.0074], device='cuda:0'), grad: tensor([-5.6811e-07, 8.9407e-08, -2.5146e-08, 2.7940e-08, 1.1921e-07, 4.1910e-08, 3.4645e-07, 3.7253e-09, -6.8452e-07, 6.4541e-07], device='cuda:0') 100 0.0001 changing lr epoch 326, time 214.75, cls_loss 0.0010 cls_loss_mapping 0.0021 cls_loss_causal 0.4937 re_mapping 0.0045 re_causal 0.0130 /// teacc 98.94 lr 0.00010000 Epoch 328, weight, value: tensor([[-0.3026, 0.0802, -0.1721, ..., -0.0682, -0.2550, -0.1729], [ 0.0006, 0.1095, -0.1232, ..., -0.1226, -0.0677, 0.1220], [ 0.0540, -0.1337, -0.1705, ..., 0.0098, -0.0533, -0.1058], ..., [ 0.0999, -0.0706, 0.1419, ..., 0.0687, 0.2008, -0.0199], [ 0.1178, -0.2255, -0.1351, ..., -0.2983, -0.0734, 0.2044], [-0.1758, 0.0897, 0.0632, ..., -0.2831, -0.1514, -0.0531]], device='cuda:0'), grad: tensor([[-3.8184e-08, 4.5635e-08, 8.4750e-08, ..., 0.0000e+00, 2.7940e-09, 1.8626e-09], [ 3.7253e-08, 6.1188e-07, 1.9651e-07, ..., 9.3132e-10, 4.3772e-08, -5.5879e-09], [ 1.3039e-08, 3.5204e-07, 5.4017e-08, ..., -2.7940e-09, 9.3132e-09, 6.5193e-09], ..., [-5.2154e-08, 2.4103e-06, 5.5600e-07, ..., 9.3132e-10, -6.9849e-08, -7.4506e-09], [ 7.4506e-09, 2.7381e-07, 7.5437e-08, ..., 0.0000e+00, 4.6566e-09, 3.7253e-09], [ 1.7695e-08, 1.4353e-04, 3.6299e-05, ..., 0.0000e+00, 3.7253e-09, 3.6322e-08]], device='cuda:0') Epoch 328, bias, value: tensor([-0.0202, -0.0195, -0.0050, -0.0275, -0.0038, 0.0073, 0.0102, 0.0226, 0.0137, -0.0076], device='cuda:0'), grad: tensor([ 2.7940e-09, 1.6848e-06, 9.5181e-07, 4.7032e-07, -3.8767e-04, 1.1865e-06, -8.4285e-07, 6.2361e-06, 2.2445e-06, 3.7575e-04], device='cuda:0') 100 0.0001 changing lr epoch 327, time 214.67, cls_loss 0.0012 cls_loss_mapping 0.0017 cls_loss_causal 0.4706 re_mapping 0.0046 re_causal 0.0127 /// teacc 98.95 lr 0.00010000 Epoch 329, weight, value: tensor([[-0.3021, 0.0803, -0.1733, ..., -0.0686, -0.2559, -0.1720], [ 0.0006, 0.1096, -0.1232, ..., -0.1229, -0.0677, 0.1224], [ 0.0543, -0.1340, -0.1723, ..., 0.0078, -0.0532, -0.1059], ..., [ 0.0999, -0.0719, 0.1419, ..., 0.0686, 0.2008, -0.0208], [ 0.1182, -0.2261, -0.1353, ..., -0.2994, -0.0736, 0.2053], [-0.1764, 0.0892, 0.0629, ..., -0.2835, -0.1513, -0.0536]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -2.7940e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 6.5193e-09, -1.3970e-08, 1.8626e-09, ..., 3.7253e-09, 1.8626e-09, -5.8673e-08], [ 0.0000e+00, 9.3132e-10, -0.0000e+00, ..., -5.5879e-09, -5.5879e-09, 1.0245e-08], ..., [ 9.3132e-09, 1.2107e-08, 3.7253e-09, ..., 2.7940e-09, 2.7940e-09, 5.7742e-08], [-1.1548e-07, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -1.3784e-07], [ 8.2888e-08, -1.8626e-09, -1.0245e-08, ..., 0.0000e+00, 9.3132e-10, 1.0338e-07]], device='cuda:0') Epoch 329, bias, value: tensor([-0.0201, -0.0197, -0.0063, -0.0256, -0.0028, 0.0072, 0.0102, 0.0224, 0.0139, -0.0082], device='cuda:0'), grad: tensor([-3.7253e-09, -8.2888e-08, -1.8626e-09, -2.7940e-09, 1.9558e-08, 3.6322e-08, 1.3970e-08, 1.2293e-07, -4.0140e-07, 2.9989e-07], device='cuda:0') 100 0.0001 changing lr epoch 328, time 214.55, cls_loss 0.0012 cls_loss_mapping 0.0017 cls_loss_causal 0.4845 re_mapping 0.0045 re_causal 0.0130 /// teacc 98.99 lr 0.00010000 Epoch 330, weight, value: tensor([[-0.3029, 0.0803, -0.1746, ..., -0.0691, -0.2565, -0.1723], [ 0.0006, 0.1095, -0.1233, ..., -0.1231, -0.0677, 0.1225], [ 0.0552, -0.1341, -0.1733, ..., 0.0067, -0.0530, -0.1058], ..., [ 0.0998, -0.0723, 0.1419, ..., 0.0685, 0.2008, -0.0213], [ 0.1187, -0.2265, -0.1351, ..., -0.3001, -0.0742, 0.2063], [-0.1766, 0.0900, 0.0647, ..., -0.2839, -0.1511, -0.0540]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.1176e-08, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 5.5879e-09, -1.3039e-08, 5.5879e-09, ..., 9.3132e-10, 2.7940e-09, -2.3283e-08], [ 2.6077e-08, 6.5193e-09, 2.0489e-08, ..., 1.1176e-08, 1.5832e-08, 6.5193e-09], ..., [-3.2596e-08, 1.6764e-08, -1.5832e-08, ..., -1.3970e-08, -1.9558e-08, 1.4901e-08], [-7.4506e-09, 2.7940e-09, 5.5879e-09, ..., 9.3132e-10, 9.3132e-10, -9.3132e-09], [ 5.5879e-09, -8.6613e-08, -1.5367e-07, ..., 0.0000e+00, -1.3039e-08, 5.5879e-09]], device='cuda:0') Epoch 330, bias, value: tensor([-0.0201, -0.0198, -0.0071, -0.0245, -0.0044, 0.0070, 0.0102, 0.0222, 0.0141, -0.0072], device='cuda:0'), grad: tensor([-2.3283e-08, -4.0978e-08, 6.0536e-08, 7.4506e-09, 2.9616e-07, 8.3819e-09, -2.2352e-08, 7.4506e-09, -5.5879e-09, -2.6915e-07], device='cuda:0') 100 0.0001 changing lr epoch 329, time 215.25, cls_loss 0.0010 cls_loss_mapping 0.0013 cls_loss_causal 0.4635 re_mapping 0.0044 re_causal 0.0127 /// teacc 99.04 lr 0.00010000 Epoch 331, weight, value: tensor([[-0.3033, 0.0806, -0.1751, ..., -0.0692, -0.2568, -0.1725], [ 0.0007, 0.1099, -0.1232, ..., -0.1235, -0.0677, 0.1232], [ 0.0553, -0.1343, -0.1734, ..., 0.0067, -0.0531, -0.1058], ..., [ 0.0996, -0.0745, 0.1417, ..., 0.0683, 0.2008, -0.0226], [ 0.1190, -0.2273, -0.1354, ..., -0.3011, -0.0745, 0.2068], [-0.1769, 0.0896, 0.0649, ..., -0.2845, -0.1515, -0.0544]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -1.4249e-07, 3.7253e-09, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 4.4703e-08, 4.6566e-09, 3.8184e-08, ..., 2.3283e-08, 2.4214e-08, -8.3819e-09], [-1.8626e-08, 9.3132e-10, 9.1270e-08, ..., -5.5879e-08, -4.8429e-08, 9.3132e-09], ..., [-4.0978e-08, 5.5879e-09, -1.2107e-07, ..., 1.1176e-08, -1.2107e-08, 6.5193e-09], [-2.9802e-08, 1.3970e-08, 1.7695e-08, ..., 1.8626e-09, 2.7940e-09, -4.0978e-08], [ 6.5193e-09, 5.9605e-08, -3.3528e-08, ..., 2.7940e-09, 2.7940e-09, 0.0000e+00]], device='cuda:0') Epoch 331, bias, value: tensor([-0.0198, -0.0197, -0.0071, -0.0244, -0.0042, 0.0072, 0.0101, 0.0219, 0.0140, -0.0078], device='cuda:0'), grad: tensor([-3.3434e-07, 1.4529e-07, -3.0827e-07, 6.7987e-08, 9.0338e-08, -3.8464e-07, 4.5728e-07, 1.4994e-07, -4.6566e-09, 1.3132e-07], device='cuda:0') 100 0.0001 changing lr epoch 330, time 215.04, cls_loss 0.0009 cls_loss_mapping 0.0014 cls_loss_causal 0.4651 re_mapping 0.0043 re_causal 0.0126 /// teacc 98.96 lr 0.00010000 Epoch 332, weight, value: tensor([[-0.3035, 0.0808, -0.1763, ..., -0.0694, -0.2570, -0.1723], [ 0.0006, 0.1100, -0.1233, ..., -0.1241, -0.0679, 0.1232], [ 0.0549, -0.1344, -0.1737, ..., 0.0067, -0.0536, -0.1059], ..., [ 0.0998, -0.0750, 0.1421, ..., 0.0687, 0.2011, -0.0228], [ 0.1198, -0.2276, -0.1357, ..., -0.3020, -0.0751, 0.2082], [-0.1774, 0.0893, 0.0649, ..., -0.2856, -0.1522, -0.0546]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 0.0000e+00, 7.4506e-09, ..., 0.0000e+00, 2.7940e-09, 9.3132e-10], [ 3.3528e-08, 3.5390e-08, 7.0781e-08, ..., 1.0245e-08, 2.0489e-08, 2.9802e-08], [ 1.2107e-08, 1.7695e-08, 1.8626e-08, ..., 1.8626e-09, 6.5193e-09, 1.0245e-08], ..., [-9.5926e-08, 6.7055e-08, 5.1223e-08, ..., -3.7253e-09, -7.4506e-08, 7.7300e-08], [-3.2596e-08, 1.9558e-08, 4.8429e-08, ..., 2.0489e-08, 2.7940e-09, -7.9162e-08], [ 2.9802e-08, 1.5460e-07, -1.1828e-07, ..., 5.5879e-09, 2.1420e-08, 1.9372e-07]], device='cuda:0') Epoch 332, bias, value: tensor([-0.0197, -0.0199, -0.0072, -0.0244, -0.0042, 0.0072, 0.0101, 0.0221, 0.0149, -0.0081], device='cuda:0'), grad: tensor([ 1.3039e-08, 3.2410e-07, 9.8720e-08, 2.2911e-07, -8.5961e-07, -5.7835e-07, 1.1548e-07, 2.6636e-07, -2.2445e-07, 6.2399e-07], device='cuda:0') 100 0.0001 changing lr epoch 331, time 215.17, cls_loss 0.0009 cls_loss_mapping 0.0016 cls_loss_causal 0.4850 re_mapping 0.0045 re_causal 0.0133 /// teacc 99.04 lr 0.00010000 Epoch 333, weight, value: tensor([[-0.3040, 0.0810, -0.1767, ..., -0.0695, -0.2574, -0.1725], [ 0.0003, 0.1099, -0.1237, ..., -0.1255, -0.0683, 0.1233], [ 0.0546, -0.1345, -0.1741, ..., 0.0067, -0.0541, -0.1060], ..., [ 0.1002, -0.0755, 0.1426, ..., 0.0694, 0.2016, -0.0230], [ 0.1203, -0.2282, -0.1358, ..., -0.3027, -0.0755, 0.2089], [-0.1778, 0.0894, 0.0657, ..., -0.2859, -0.1522, -0.0548]], device='cuda:0'), grad: tensor([[ 1.7695e-08, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 3.7253e-09, 2.0489e-08], [-3.7253e-09, -1.6764e-08, 2.7940e-09, ..., 2.7940e-09, 2.7940e-09, -8.1025e-08], [-5.0291e-08, 9.3132e-10, 0.0000e+00, ..., -3.3528e-08, -4.5635e-08, 2.0489e-08], ..., [ 4.0978e-08, 1.0990e-07, -1.2107e-08, ..., 2.2352e-08, 9.3132e-09, 1.6112e-07], [-2.5146e-08, 1.8626e-09, 2.7940e-09, ..., 3.7253e-09, 4.6566e-09, -6.3330e-08], [ 7.4506e-09, 1.0245e-08, -2.7940e-09, ..., 0.0000e+00, 9.3132e-10, 2.7008e-08]], device='cuda:0') Epoch 333, bias, value: tensor([-0.0195, -0.0200, -0.0072, -0.0244, -0.0046, 0.0070, 0.0100, 0.0222, 0.0151, -0.0080], device='cuda:0'), grad: tensor([ 6.4261e-08, -1.0990e-07, -1.5274e-07, 6.5193e-09, -2.4028e-07, 1.2107e-08, 8.3819e-09, 4.5169e-07, -9.5926e-08, 6.1467e-08], device='cuda:0') 100 0.0001 changing lr epoch 332, time 214.92, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.5080 re_mapping 0.0043 re_causal 0.0132 /// teacc 99.00 lr 0.00010000 Epoch 334, weight, value: tensor([[-0.3015, 0.0813, -0.1762, ..., -0.0698, -0.2578, -0.1703], [ 0.0011, 0.1100, -0.1220, ..., -0.1259, -0.0683, 0.1242], [ 0.0546, -0.1346, -0.1742, ..., 0.0067, -0.0543, -0.1060], ..., [ 0.0995, -0.0761, 0.1409, ..., 0.0696, 0.2017, -0.0244], [ 0.1209, -0.2287, -0.1361, ..., -0.3033, -0.0759, 0.2096], [-0.1778, 0.0891, 0.0658, ..., -0.2862, -0.1522, -0.0548]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -9.3132e-09, 1.8626e-09, ..., 1.8626e-09, 0.0000e+00, 9.3132e-10], [ 1.2107e-08, -1.5832e-08, 1.3039e-08, ..., 1.1176e-08, 5.5879e-09, -8.1025e-08], [-3.5949e-07, 3.7253e-09, 1.8626e-08, ..., -1.4529e-07, -2.7101e-07, 1.3039e-08], ..., [ 3.5390e-07, 1.2107e-08, 8.3819e-09, ..., 1.6019e-07, 2.6636e-07, 3.1665e-08], [-2.0489e-08, 1.3039e-08, 4.6566e-09, ..., 4.6566e-09, 2.7940e-09, -1.4901e-08], [ 9.3132e-10, 2.7008e-08, 3.7253e-09, ..., 5.5879e-09, 0.0000e+00, 1.1176e-08]], device='cuda:0') Epoch 334, bias, value: tensor([-0.0190, -0.0190, -0.0071, -0.0244, -0.0043, 0.0067, 0.0095, 0.0210, 0.0155, -0.0083], device='cuda:0'), grad: tensor([-1.3039e-08, -1.5274e-07, -7.9256e-07, -1.7323e-07, -8.6613e-08, 2.8871e-08, 6.7055e-08, 9.7137e-07, 3.0734e-08, 1.2200e-07], device='cuda:0') 100 0.0001 changing lr epoch 333, time 215.04, cls_loss 0.0010 cls_loss_mapping 0.0016 cls_loss_causal 0.4911 re_mapping 0.0046 re_causal 0.0132 /// teacc 99.03 lr 0.00010000 Epoch 335, weight, value: tensor([[-0.3022, 0.0815, -0.1766, ..., -0.0701, -0.2583, -0.1709], [ 0.0008, 0.1097, -0.1222, ..., -0.1274, -0.0687, 0.1242], [ 0.0542, -0.1347, -0.1750, ..., 0.0066, -0.0558, -0.1059], ..., [ 0.0999, -0.0765, 0.1413, ..., 0.0705, 0.2024, -0.0247], [ 0.1217, -0.2291, -0.1365, ..., -0.3065, -0.0774, 0.2105], [-0.1782, 0.0888, 0.0657, ..., -0.2868, -0.1528, -0.0550]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -5.5879e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 1.8626e-09, ..., 9.3132e-10, 0.0000e+00, -1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09], [-1.8626e-09, 1.8626e-09, 9.3132e-09, ..., 9.3132e-10, 0.0000e+00, -1.8626e-09], [ 9.3132e-10, 2.7940e-08, -5.5879e-09, ..., 0.0000e+00, 9.3132e-10, 9.3132e-10]], device='cuda:0') Epoch 335, bias, value: tensor([-0.0188, -0.0192, -0.0072, -0.0242, -0.0042, 0.0061, 0.0094, 0.0212, 0.0163, -0.0087], device='cuda:0'), grad: tensor([ 7.4506e-09, 1.5832e-08, -3.4459e-08, -1.1176e-08, -1.2759e-07, 1.9558e-08, -2.3283e-08, 1.9558e-08, 2.6077e-08, 1.1735e-07], device='cuda:0') 100 0.0001 changing lr epoch 334, time 214.90, cls_loss 0.0011 cls_loss_mapping 0.0019 cls_loss_causal 0.4990 re_mapping 0.0043 re_causal 0.0128 /// teacc 99.00 lr 0.00010000 Epoch 336, weight, value: tensor([[-0.3025, 0.0816, -0.1773, ..., -0.0708, -0.2587, -0.1715], [-0.0004, 0.1102, -0.1218, ..., -0.1278, -0.0690, 0.1236], [ 0.0540, -0.1348, -0.1751, ..., 0.0066, -0.0560, -0.1063], ..., [ 0.1001, -0.0804, 0.1415, ..., 0.0707, 0.2030, -0.0271], [ 0.1248, -0.2262, -0.1368, ..., -0.3076, -0.0781, 0.2138], [-0.1798, 0.0885, 0.0648, ..., -0.2878, -0.1557, -0.0570]], device='cuda:0'), grad: tensor([[ 3.7253e-09, -5.9605e-08, -9.3132e-10, ..., 2.7940e-09, 1.8626e-09, 4.6566e-09], [-1.8626e-09, 4.7497e-08, -1.0990e-07, ..., 4.6566e-09, 2.2352e-08, -2.1048e-07], [-9.3132e-09, 1.5832e-08, 8.3819e-09, ..., -1.6764e-08, -8.3819e-09, 1.4901e-08], ..., [-1.0058e-07, 4.6566e-08, -7.3574e-08, ..., -1.8626e-09, -1.0058e-07, 1.5553e-07], [-9.3132e-10, 1.6764e-08, 1.5832e-08, ..., 2.7940e-09, 2.7940e-09, 1.3039e-08], [ 1.0151e-07, 1.3318e-06, 1.5926e-07, ..., 7.4506e-09, 8.0094e-08, 6.9849e-08]], device='cuda:0') Epoch 336, bias, value: tensor([-0.0188, -0.0200, -0.0074, -0.0242, -0.0042, 0.0060, 0.0093, 0.0210, 0.0194, -0.0092], device='cuda:0'), grad: tensor([-1.5553e-07, -8.3353e-07, -2.0489e-08, 6.9849e-08, -5.1558e-06, 4.3772e-08, 1.1269e-07, 6.9477e-07, 1.4901e-07, 5.0738e-06], device='cuda:0') 100 0.0001 changing lr epoch 335, time 214.92, cls_loss 0.0010 cls_loss_mapping 0.0015 cls_loss_causal 0.4852 re_mapping 0.0047 re_causal 0.0131 /// teacc 99.04 lr 0.00010000 Epoch 337, weight, value: tensor([[-0.3025, 0.0816, -0.1779, ..., -0.0719, -0.2591, -0.1711], [-0.0009, 0.1101, -0.1224, ..., -0.1285, -0.0696, 0.1235], [ 0.0539, -0.1351, -0.1755, ..., 0.0065, -0.0566, -0.1065], ..., [ 0.1009, -0.0805, 0.1424, ..., 0.0714, 0.2037, -0.0266], [ 0.1247, -0.2265, -0.1374, ..., -0.3088, -0.0790, 0.2138], [-0.1808, 0.0882, 0.0643, ..., -0.2893, -0.1568, -0.0579]], device='cuda:0'), grad: tensor([[ 1.2107e-08, 1.5832e-08, 1.7695e-08, ..., 2.7940e-09, 9.3132e-10, 2.7008e-08], [ 2.4214e-08, 5.6811e-08, 4.1910e-08, ..., 1.2107e-08, 4.6566e-09, 6.5193e-09], [ 1.1828e-07, 4.6566e-09, 1.9558e-08, ..., 5.5879e-09, 8.3819e-09, 2.7474e-07], ..., [-1.8626e-08, 6.5193e-09, -1.3970e-08, ..., -4.6566e-09, -1.6764e-08, 3.4459e-08], [-2.6990e-06, 3.5390e-08, 2.0489e-08, ..., 4.6566e-09, 9.3132e-10, -8.3596e-06], [ 2.0303e-06, -1.2573e-07, -1.3225e-07, ..., 1.0245e-08, 1.8626e-09, 6.4112e-06]], device='cuda:0') Epoch 337, bias, value: tensor([-0.0187, -0.0202, -0.0074, -0.0245, -0.0034, 0.0061, 0.0095, 0.0214, 0.0192, -0.0098], device='cuda:0'), grad: tensor([ 1.5460e-07, 3.0734e-07, 9.3412e-07, 2.2259e-07, 4.7497e-08, -4.0978e-07, 5.2899e-06, 7.2643e-08, -2.7180e-05, 2.0608e-05], device='cuda:0') 100 0.0001 changing lr epoch 336, time 214.83, cls_loss 0.0010 cls_loss_mapping 0.0013 cls_loss_causal 0.4924 re_mapping 0.0044 re_causal 0.0128 /// teacc 99.08 lr 0.00010000 Epoch 338, weight, value: tensor([[-0.3025, 0.0817, -0.1785, ..., -0.0726, -0.2593, -0.1709], [-0.0014, 0.1100, -0.1231, ..., -0.1287, -0.0702, 0.1230], [ 0.0543, -0.1352, -0.1757, ..., 0.0065, -0.0565, -0.1066], ..., [ 0.1015, -0.0806, 0.1431, ..., 0.0713, 0.2044, -0.0253], [ 0.1247, -0.2267, -0.1380, ..., -0.3107, -0.0793, 0.2139], [-0.1812, 0.0882, 0.0645, ..., -0.2898, -0.1570, -0.0588]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 2.4214e-08, 8.1025e-08, ..., 1.8626e-09, 1.8626e-09, 9.3132e-10], [-4.6566e-09, -2.7940e-09, 1.1269e-07, ..., 3.7253e-09, 8.3819e-09, -3.5390e-08], [ 9.3132e-09, 7.4506e-09, 2.9802e-08, ..., -3.8184e-08, -1.3039e-08, 7.4506e-09], ..., [-3.1665e-08, 8.9686e-07, 3.1702e-06, ..., 1.8626e-09, -3.2596e-08, 2.9802e-08], [-7.4506e-09, 1.4622e-07, 5.3737e-07, ..., 1.8626e-09, 9.3132e-10, -8.3819e-09], [ 2.6077e-08, -1.8431e-06, -6.8024e-06, ..., 1.8626e-09, 2.6077e-08, 2.7940e-09]], device='cuda:0') Epoch 338, bias, value: tensor([-0.0187, -0.0206, -0.0073, -0.0249, -0.0032, 0.0074, 0.0096, 0.0218, 0.0190, -0.0101], device='cuda:0'), grad: tensor([ 2.7288e-07, 2.5146e-07, -3.7253e-09, 3.5018e-07, 7.3649e-06, 6.3609e-07, 7.6368e-07, 1.0349e-05, 1.7053e-06, -2.1681e-05], device='cuda:0') 100 0.0001 changing lr epoch 337, time 214.96, cls_loss 0.0009 cls_loss_mapping 0.0014 cls_loss_causal 0.4844 re_mapping 0.0043 re_causal 0.0127 /// teacc 99.06 lr 0.00010000 Epoch 339, weight, value: tensor([[-0.3026, 0.0817, -0.1791, ..., -0.0730, -0.2596, -0.1711], [-0.0014, 0.1103, -0.1232, ..., -0.1290, -0.0703, 0.1237], [ 0.0546, -0.1353, -0.1759, ..., 0.0065, -0.0564, -0.1067], ..., [ 0.1016, -0.0809, 0.1433, ..., 0.0715, 0.2045, -0.0256], [ 0.1245, -0.2272, -0.1384, ..., -0.3119, -0.0799, 0.2137], [-0.1814, 0.0882, 0.0649, ..., -0.2901, -0.1573, -0.0596]], device='cuda:0'), grad: tensor([[ 1.7695e-08, -1.8626e-09, 9.3132e-10, ..., 0.0000e+00, 1.8626e-09, 3.1665e-08], [ 8.4750e-08, -8.4750e-08, 8.1025e-08, ..., 2.5146e-08, 8.2888e-08, -1.8720e-07], [ 1.1455e-07, 2.2352e-08, 1.0245e-07, ..., 3.1665e-08, 1.1083e-07, 4.9360e-08], ..., [-2.6822e-07, 5.5879e-08, -3.1199e-07, ..., -9.5926e-08, -2.7195e-07, 1.2759e-07], [-4.8429e-08, 1.8626e-09, 1.5832e-08, ..., 4.6566e-09, 1.2107e-08, -1.0058e-07], [ 2.7008e-08, 1.8626e-09, 1.0245e-08, ..., 2.7940e-09, 8.3819e-09, 3.9116e-08]], device='cuda:0') Epoch 339, bias, value: tensor([-0.0187, -0.0205, -0.0073, -0.0248, -0.0034, 0.0075, 0.0096, 0.0217, 0.0188, -0.0101], device='cuda:0'), grad: tensor([ 8.6613e-08, -3.2317e-07, 3.6042e-07, 1.6112e-07, -9.3132e-09, 2.0489e-08, 6.8918e-08, -2.3562e-07, -2.5984e-07, 1.2759e-07], device='cuda:0') 100 0.0001 changing lr epoch 338, time 215.11, cls_loss 0.0009 cls_loss_mapping 0.0013 cls_loss_causal 0.4963 re_mapping 0.0046 re_causal 0.0135 /// teacc 98.93 lr 0.00010000 Epoch 340, weight, value: tensor([[-0.3030, 0.0817, -0.1797, ..., -0.0739, -0.2608, -0.1713], [-0.0017, 0.1106, -0.1236, ..., -0.1293, -0.0706, 0.1239], [ 0.0549, -0.1353, -0.1760, ..., 0.0065, -0.0560, -0.1067], ..., [ 0.1020, -0.0813, 0.1438, ..., 0.0714, 0.2049, -0.0256], [ 0.1247, -0.2273, -0.1387, ..., -0.3128, -0.0815, 0.2141], [-0.1818, 0.0882, 0.0649, ..., -0.2906, -0.1581, -0.0601]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.9465e-07, 3.4459e-08, ..., 1.0245e-08, 0.0000e+00, 4.6566e-09], [ 4.9360e-08, -1.8626e-08, 4.8429e-08, ..., 1.7695e-08, 3.7253e-09, -8.3819e-08], [ 9.3132e-09, 2.1420e-08, 3.5390e-08, ..., 1.3970e-08, 9.3132e-10, 2.9802e-08], ..., [-1.2107e-08, 4.0047e-08, 1.8626e-09, ..., 1.8626e-09, -7.4506e-09, 1.3039e-07], [-2.1048e-07, 3.8184e-08, 9.9652e-08, ..., 3.3528e-08, 0.0000e+00, -3.9861e-07], [ 6.5193e-09, 1.3690e-07, 2.2165e-07, ..., 7.4506e-08, 9.3132e-10, 9.3132e-08]], device='cuda:0') Epoch 340, bias, value: tensor([-0.0187, -0.0206, -0.0072, -0.0248, -0.0035, 0.0076, 0.0095, 0.0219, 0.0190, -0.0102], device='cuda:0'), grad: tensor([ 4.9174e-07, 2.1420e-08, 1.5926e-07, -1.0515e-06, -2.1327e-07, -4.2468e-07, 8.5402e-07, 2.8033e-07, -8.7637e-07, 7.4692e-07], device='cuda:0') 100 0.0001 changing lr epoch 339, time 215.06, cls_loss 0.0010 cls_loss_mapping 0.0014 cls_loss_causal 0.4661 re_mapping 0.0047 re_causal 0.0127 /// teacc 98.99 lr 0.00010000 Epoch 341, weight, value: tensor([[-0.3029, 0.0818, -0.1802, ..., -0.0748, -0.2618, -0.1713], [-0.0019, 0.1107, -0.1238, ..., -0.1300, -0.0708, 0.1241], [ 0.0588, -0.1354, -0.1731, ..., 0.0081, -0.0523, -0.1069], ..., [ 0.1000, -0.0817, 0.1427, ..., 0.0685, 0.2035, -0.0261], [ 0.1246, -0.2274, -0.1390, ..., -0.3175, -0.0840, 0.2142], [-0.1839, 0.0881, 0.0635, ..., -0.2922, -0.1611, -0.0613]], device='cuda:0'), grad: tensor([[ 3.6322e-08, -4.6566e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 8.7544e-08], [-9.6858e-08, -5.5879e-09, 9.3132e-10, ..., 0.0000e+00, -2.7940e-09, -4.4331e-07], [ 4.6566e-08, 2.7940e-09, 9.3132e-10, ..., 0.0000e+00, 9.3132e-10, 1.7788e-07], ..., [ 1.0245e-08, 5.5879e-09, 0.0000e+00, ..., 0.0000e+00, 2.7940e-09, 4.2841e-08], [-4.8429e-08, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 3.3528e-08, 1.0245e-08, 9.3132e-10, ..., 0.0000e+00, 9.3132e-10, 7.7300e-08]], device='cuda:0') Epoch 341, bias, value: tensor([-0.0186, -0.0207, -0.0049, -0.0254, -0.0030, 0.0073, 0.0095, 0.0203, 0.0189, -0.0109], device='cuda:0'), grad: tensor([ 1.2945e-07, -1.1204e-06, 4.2375e-07, 3.7253e-09, -2.0489e-07, 2.2352e-08, 1.6391e-07, 1.4715e-07, 1.5739e-07, 2.7474e-07], device='cuda:0') 100 0.0001 changing lr epoch 340, time 214.94, cls_loss 0.0010 cls_loss_mapping 0.0026 cls_loss_causal 0.5094 re_mapping 0.0045 re_causal 0.0127 /// teacc 99.03 lr 0.00010000 Epoch 342, weight, value: tensor([[-0.3030, 0.0819, -0.1812, ..., -0.0759, -0.2623, -0.1718], [-0.0020, 0.1105, -0.1240, ..., -0.1303, -0.0709, 0.1240], [ 0.0587, -0.1358, -0.1732, ..., 0.0080, -0.0524, -0.1070], ..., [ 0.1002, -0.0819, 0.1428, ..., 0.0685, 0.2037, -0.0261], [ 0.1246, -0.2277, -0.1393, ..., -0.3180, -0.0843, 0.2143], [-0.1844, 0.0880, 0.0634, ..., -0.2928, -0.1618, -0.0618]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -5.5879e-09, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09], [-1.1083e-07, -3.5763e-07, -1.9930e-07, ..., 1.8626e-09, 0.0000e+00, -3.7905e-07], [-9.3132e-10, 3.7253e-09, 1.8626e-09, ..., -1.7695e-08, 0.0000e+00, 5.5879e-09], ..., [ 9.1270e-08, 2.9709e-07, 1.6671e-07, ..., 9.3132e-10, 0.0000e+00, 3.0361e-07], [ 2.7940e-09, 4.6566e-09, 2.7940e-09, ..., 1.2107e-08, 0.0000e+00, 5.5879e-09], [ 1.3039e-08, 4.5635e-08, 1.8626e-08, ..., 0.0000e+00, 0.0000e+00, 4.0047e-08]], device='cuda:0') Epoch 342, bias, value: tensor([-0.0186, -0.0209, -0.0050, -0.0247, -0.0029, 0.0058, 0.0097, 0.0204, 0.0187, -0.0111], device='cuda:0'), grad: tensor([ 0.0000e+00, -1.3290e-06, -3.2783e-07, 2.2352e-08, 8.3819e-08, -1.5832e-08, 2.7940e-08, 1.1222e-06, 2.6263e-07, 1.5739e-07], device='cuda:0') 100 0.0001 changing lr epoch 341, time 215.07, cls_loss 0.0010 cls_loss_mapping 0.0023 cls_loss_causal 0.4748 re_mapping 0.0043 re_causal 0.0121 /// teacc 98.93 lr 0.00010000 Epoch 343, weight, value: tensor([[-0.3034, 0.0820, -0.1817, ..., -0.0752, -0.2630, -0.1721], [-0.0021, 0.1105, -0.1242, ..., -0.1309, -0.0710, 0.1241], [ 0.0587, -0.1360, -0.1733, ..., 0.0080, -0.0524, -0.1070], ..., [ 0.1004, -0.0820, 0.1409, ..., 0.0686, 0.2027, -0.0264], [ 0.1249, -0.2278, -0.1396, ..., -0.3194, -0.0842, 0.2145], [-0.1857, 0.0880, 0.0654, ..., -0.2952, -0.1623, -0.0622]], device='cuda:0'), grad: tensor([[ 4.6566e-09, 1.8626e-09, 4.6566e-09, ..., 9.3132e-10, 9.3132e-10, 7.4506e-09], [ 1.8254e-07, 1.4901e-08, 1.8626e-08, ..., 4.6566e-09, 8.3819e-09, 2.2538e-07], [ 8.4843e-07, 1.8626e-09, 1.0245e-08, ..., -6.5193e-09, -9.3132e-09, 9.0245e-07], ..., [ 3.9116e-08, 1.1176e-08, -4.3772e-08, ..., -1.3039e-08, -2.4214e-08, 9.4995e-08], [-1.2135e-06, 4.8429e-08, 8.9407e-08, ..., 2.7940e-09, 4.6566e-09, -1.2703e-06], [ 2.3283e-08, -4.4703e-08, -1.2852e-07, ..., 4.6566e-09, 1.0245e-08, 1.7602e-07]], device='cuda:0') Epoch 343, bias, value: tensor([-0.0185, -0.0210, -0.0050, -0.0247, -0.0028, 0.0056, 0.0097, 0.0188, 0.0188, -0.0094], device='cuda:0'), grad: tensor([ 2.5518e-07, 7.7952e-07, 2.4177e-06, 6.0070e-07, -3.3062e-07, 1.7928e-06, -4.7088e-06, 2.1327e-07, -1.2983e-06, 2.8126e-07], device='cuda:0') 100 0.0001 changing lr epoch 342, time 215.18, cls_loss 0.0008 cls_loss_mapping 0.0015 cls_loss_causal 0.4569 re_mapping 0.0043 re_causal 0.0125 /// teacc 99.03 lr 0.00010000 Epoch 344, weight, value: tensor([[-0.3026, 0.0820, -0.1826, ..., -0.0754, -0.2618, -0.1708], [-0.0021, 0.1106, -0.1243, ..., -0.1314, -0.0712, 0.1245], [ 0.0587, -0.1363, -0.1734, ..., 0.0080, -0.0525, -0.1077], ..., [ 0.1005, -0.0824, 0.1411, ..., 0.0689, 0.2029, -0.0267], [ 0.1251, -0.2280, -0.1401, ..., -0.3208, -0.0851, 0.2147], [-0.1861, 0.0879, 0.0653, ..., -0.2965, -0.1625, -0.0625]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -1.5832e-07, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 4.6566e-09, -9.3132e-10, 9.3132e-10, ..., 3.7253e-09, 5.5879e-09, -4.6566e-09], [-3.6322e-08, 9.3132e-10, -1.1176e-08, ..., -6.1467e-08, -8.1956e-08, 9.3132e-10], ..., [ 3.1665e-08, 9.3132e-10, 1.0245e-08, ..., 5.0291e-08, 6.7055e-08, 3.7253e-09], [-1.6764e-08, 1.8626e-09, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, -2.4214e-08], [ 8.3819e-09, 3.2596e-08, -9.3132e-09, ..., 1.8626e-09, 2.7940e-09, 1.2107e-08]], device='cuda:0') Epoch 344, bias, value: tensor([-0.0184, -0.0210, -0.0052, -0.0247, -0.0027, 0.0055, 0.0097, 0.0189, 0.0188, -0.0095], device='cuda:0'), grad: tensor([-5.7276e-07, 5.5879e-09, -1.6391e-07, 6.5193e-09, 2.7940e-08, 2.1420e-08, 4.3306e-07, 1.4715e-07, -4.4703e-08, 1.4156e-07], device='cuda:0') 100 0.0001 changing lr epoch 343, time 214.90, cls_loss 0.0009 cls_loss_mapping 0.0014 cls_loss_causal 0.4526 re_mapping 0.0045 re_causal 0.0130 /// teacc 99.02 lr 0.00010000 Epoch 345, weight, value: tensor([[-0.3028, 0.0820, -0.1840, ..., -0.0758, -0.2623, -0.1709], [-0.0027, 0.1067, -0.1260, ..., -0.1316, -0.0721, 0.1243], [ 0.0585, -0.1364, -0.1735, ..., 0.0079, -0.0527, -0.1079], ..., [ 0.1013, -0.0827, 0.1414, ..., 0.0691, 0.2038, -0.0256], [ 0.1247, -0.2283, -0.1412, ..., -0.3220, -0.0884, 0.2145], [-0.1865, 0.0904, 0.0660, ..., -0.2969, -0.1626, -0.0628]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.4065e-06, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.3039e-08], [ 3.7253e-09, 3.1665e-08, 3.7253e-09, ..., 1.8626e-09, 4.6566e-09, -2.0489e-08], [-2.7940e-08, 8.7544e-08, 9.3132e-10, ..., -1.5832e-08, -2.8871e-08, 1.8626e-09], ..., [ 2.3283e-08, 4.0047e-08, -9.3132e-09, ..., 1.3039e-08, 2.0489e-08, 2.1420e-08], [ 0.0000e+00, 7.4506e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.3772e-08], [ 3.7253e-09, -2.7288e-06, 6.5193e-09, ..., 1.8626e-09, 5.5879e-09, 4.6566e-09]], device='cuda:0') Epoch 345, bias, value: tensor([-0.0185, -0.0233, -0.0053, -0.0247, -0.0028, 0.0061, 0.0097, 0.0194, 0.0184, -0.0076], device='cuda:0'), grad: tensor([ 6.2771e-06, 6.0536e-08, 1.8161e-07, 4.7497e-08, 3.3434e-07, -3.1851e-07, -3.7625e-07, 3.6135e-07, 3.0734e-07, -6.8992e-06], device='cuda:0') 100 0.0001 changing lr epoch 344, time 214.74, cls_loss 0.0009 cls_loss_mapping 0.0011 cls_loss_causal 0.4513 re_mapping 0.0046 re_causal 0.0131 /// teacc 98.95 lr 0.00010000 Epoch 346, weight, value: tensor([[-0.3031, 0.0821, -0.1854, ..., -0.0763, -0.2631, -0.1710], [-0.0026, 0.1060, -0.1264, ..., -0.1324, -0.0723, 0.1251], [ 0.0585, -0.1365, -0.1736, ..., 0.0079, -0.0526, -0.1085], ..., [ 0.1014, -0.0828, 0.1416, ..., 0.0694, 0.2040, -0.0261], [ 0.1244, -0.2285, -0.1422, ..., -0.3237, -0.0901, 0.2144], [-0.1868, 0.0909, 0.0662, ..., -0.2979, -0.1625, -0.0638]], device='cuda:0'), grad: tensor([[ 1.3597e-07, -7.4506e-09, 1.3504e-07, ..., 9.3132e-10, 1.3225e-07, 2.9802e-08], [ 6.2466e-05, 1.1176e-08, 5.9694e-05, ..., 9.3132e-10, 6.0618e-05, 1.2323e-05], [ 5.6438e-07, 6.5193e-09, 5.4762e-07, ..., -3.7253e-09, 5.4762e-07, 1.1455e-07], ..., [-6.6161e-05, 4.8429e-08, -6.3181e-05, ..., -1.8626e-09, -6.4254e-05, -1.3031e-05], [ 1.9912e-06, 1.2107e-08, 1.9129e-06, ..., 9.3132e-10, 1.9334e-06, 3.9767e-07], [ 4.7218e-07, 1.4603e-05, 7.9796e-06, ..., 9.3132e-10, 4.5821e-07, 5.4985e-06]], device='cuda:0') Epoch 346, bias, value: tensor([-0.0184, -0.0234, -0.0053, -0.0229, -0.0025, 0.0031, 0.0097, 0.0193, 0.0180, -0.0072], device='cuda:0'), grad: tensor([ 9.7007e-06, 1.3101e-04, 1.2638e-06, 8.3726e-07, -6.3658e-05, 2.3842e-07, -1.0438e-05, -1.3816e-04, 4.8093e-06, 6.4254e-05], device='cuda:0') 100 0.0001 changing lr epoch 345, time 215.02, cls_loss 0.0009 cls_loss_mapping 0.0018 cls_loss_causal 0.4655 re_mapping 0.0044 re_causal 0.0126 /// teacc 99.07 lr 0.00010000 Epoch 347, weight, value: tensor([[-0.3037, 0.0821, -0.1861, ..., -0.0781, -0.2648, -0.1711], [-0.0035, 0.1060, -0.1273, ..., -0.1336, -0.0732, 0.1247], [ 0.0586, -0.1366, -0.1737, ..., 0.0079, -0.0525, -0.1079], ..., [ 0.1022, -0.0825, 0.1422, ..., 0.0699, 0.2048, -0.0257], [ 0.1244, -0.2287, -0.1428, ..., -0.3245, -0.0919, 0.2146], [-0.1876, 0.0908, 0.0661, ..., -0.3014, -0.1627, -0.0644]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -4.3027e-07, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, -2.7940e-09], [ 9.3132e-10, 6.5193e-09, 1.2107e-08, ..., 3.7253e-09, 1.8626e-09, -6.5193e-09], [ 4.6566e-09, 9.3132e-09, 3.7253e-09, ..., 3.7253e-09, 9.3132e-10, 8.3819e-09], ..., [-4.6566e-09, 1.1176e-08, 6.5193e-08, ..., -9.3132e-10, -1.8626e-09, 5.5879e-09], [-4.6566e-09, 2.4214e-08, 1.8626e-08, ..., 9.3132e-10, 0.0000e+00, -9.3132e-09], [ 4.6566e-09, 3.3900e-07, -1.5181e-07, ..., 1.8626e-09, -1.8626e-09, 2.7940e-09]], device='cuda:0') Epoch 347, bias, value: tensor([-0.0185, -0.0239, -0.0052, -0.0229, -0.0022, 0.0029, 0.0098, 0.0198, 0.0180, -0.0074], device='cuda:0'), grad: tensor([-1.2936e-06, 3.2596e-08, -1.8813e-07, 8.2888e-08, 3.6042e-07, 2.2352e-08, -8.6613e-08, 1.0803e-07, 1.1083e-07, 8.6054e-07], device='cuda:0') 100 0.0001 changing lr epoch 346, time 215.04, cls_loss 0.0009 cls_loss_mapping 0.0013 cls_loss_causal 0.4804 re_mapping 0.0043 re_causal 0.0127 /// teacc 99.02 lr 0.00010000 Epoch 348, weight, value: tensor([[-0.3043, 0.0822, -0.1865, ..., -0.0792, -0.2662, -0.1718], [-0.0033, 0.1060, -0.1275, ..., -0.1339, -0.0732, 0.1253], [ 0.0582, -0.1366, -0.1739, ..., 0.0078, -0.0531, -0.1082], ..., [ 0.1025, -0.0829, 0.1423, ..., 0.0700, 0.2052, -0.0258], [ 0.1242, -0.2289, -0.1431, ..., -0.3251, -0.0922, 0.2143], [-0.1881, 0.0909, 0.0663, ..., -0.3022, -0.1628, -0.0648]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.1176e-08, 2.7940e-09, ..., 2.7940e-09, 0.0000e+00, 9.3132e-10], [ 1.2107e-08, -2.7940e-09, 4.0978e-08, ..., 2.7940e-08, 1.0245e-08, -7.3574e-08], [ 1.7695e-08, 9.3132e-10, 1.2293e-07, ..., 1.3039e-07, 2.7940e-09, 3.7253e-09], ..., [-3.4459e-08, 4.6566e-09, -1.7695e-08, ..., 2.1420e-08, -3.6322e-08, 5.6811e-08], [ 3.2596e-08, 1.2107e-08, 2.5518e-07, ..., 2.3562e-07, 9.3132e-10, 4.6566e-09], [ 2.3283e-08, -1.1176e-08, -4.2841e-08, ..., 1.2107e-08, 1.8626e-08, 1.8626e-09]], device='cuda:0') Epoch 348, bias, value: tensor([-0.0188, -0.0236, -0.0054, -0.0229, -0.0028, 0.0006, 0.0114, 0.0198, 0.0174, -0.0073], device='cuda:0'), grad: tensor([-1.2107e-08, -2.5146e-08, 3.5018e-07, -1.1353e-06, 4.5635e-08, 3.7253e-08, 1.4901e-08, 9.6858e-08, 7.4413e-07, -1.1735e-07], device='cuda:0') 100 0.0001 changing lr epoch 347, time 215.15, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4756 re_mapping 0.0042 re_causal 0.0123 /// teacc 98.97 lr 0.00010000 Epoch 349, weight, value: tensor([[-0.3049, 0.0823, -0.1872, ..., -0.0808, -0.2679, -0.1719], [-0.0033, 0.1061, -0.1275, ..., -0.1341, -0.0733, 0.1255], [ 0.0582, -0.1368, -0.1739, ..., 0.0079, -0.0530, -0.1084], ..., [ 0.1025, -0.0832, 0.1424, ..., 0.0700, 0.2053, -0.0261], [ 0.1244, -0.2290, -0.1433, ..., -0.3259, -0.0925, 0.2145], [-0.1884, 0.0909, 0.0663, ..., -0.3028, -0.1629, -0.0654]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -1.8626e-09, 9.3132e-10, ..., 1.8626e-09, 0.0000e+00, 1.8626e-09], [ 2.4214e-08, -2.2352e-08, 2.8871e-08, ..., 7.4506e-09, 1.9558e-08, -1.3039e-08], [ 2.7940e-09, 9.3132e-10, 6.5193e-09, ..., -5.5879e-09, 5.5879e-09, 3.7253e-09], ..., [-3.6322e-08, 3.3528e-08, -7.0781e-08, ..., -1.4901e-08, -6.0536e-08, 6.1467e-08], [-6.7055e-08, 6.5193e-09, 7.4506e-09, ..., 9.3132e-10, 1.8626e-09, -5.8115e-07], [ 2.7940e-08, 8.1304e-07, 4.3306e-07, ..., 6.5193e-09, 2.8871e-08, 7.2271e-07]], device='cuda:0') Epoch 349, bias, value: tensor([-0.0188, -0.0235, -0.0055, -0.0229, -0.0028, 0.0006, 0.0113, 0.0197, 0.0175, -0.0074], device='cuda:0'), grad: tensor([ 2.8871e-08, 7.7300e-08, -3.7253e-09, 1.4994e-07, -4.2021e-06, -1.6764e-07, 8.2329e-07, 5.5879e-09, -9.6951e-07, 4.2580e-06], device='cuda:0') 100 0.0001 changing lr epoch 348, time 214.86, cls_loss 0.0009 cls_loss_mapping 0.0012 cls_loss_causal 0.4790 re_mapping 0.0041 re_causal 0.0118 /// teacc 99.03 lr 0.00010000 Epoch 350, weight, value: tensor([[-0.3061, 0.0823, -0.1880, ..., -0.0815, -0.2687, -0.1731], [-0.0033, 0.1060, -0.1276, ..., -0.1334, -0.0729, 0.1256], [ 0.0583, -0.1369, -0.1741, ..., 0.0077, -0.0530, -0.1085], ..., [ 0.1026, -0.0835, 0.1424, ..., 0.0694, 0.2051, -0.0262], [ 0.1246, -0.2292, -0.1436, ..., -0.3269, -0.0930, 0.2149], [-0.1888, 0.0908, 0.0664, ..., -0.3045, -0.1629, -0.0660]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -2.7940e-09, 1.8626e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 8.3819e-09, 1.8626e-09, 7.9162e-08, ..., 7.3574e-08, 1.5832e-08, 1.8626e-08], [ 3.3528e-08, 9.3132e-10, 1.7323e-07, ..., 1.6391e-07, 3.5390e-08, 3.9116e-08], ..., [-3.7253e-09, 0.0000e+00, 8.1956e-08, ..., 7.9162e-08, 1.3970e-08, 3.7253e-09], [-6.9849e-08, 1.0245e-08, 6.5193e-09, ..., 3.7253e-09, 1.8626e-09, -1.1176e-07], [ 7.4506e-09, 0.0000e+00, -5.5879e-09, ..., 3.7253e-09, 9.3132e-10, 2.2352e-08]], device='cuda:0') Epoch 350, bias, value: tensor([-0.0188, -0.0233, -0.0056, -0.0232, -0.0026, 0.0011, 0.0113, 0.0195, 0.0176, -0.0075], device='cuda:0'), grad: tensor([ 9.3132e-09, 2.4680e-07, 5.6252e-07, -8.9034e-07, -2.7940e-09, -1.7229e-07, 1.6671e-07, 2.1979e-07, -2.0489e-07, 5.8673e-08], device='cuda:0') 100 0.0001 changing lr epoch 349, time 214.96, cls_loss 0.0009 cls_loss_mapping 0.0016 cls_loss_causal 0.4694 re_mapping 0.0041 re_causal 0.0123 /// teacc 98.93 lr 0.00010000 Epoch 351, weight, value: tensor([[-0.3063, 0.0824, -0.1886, ..., -0.0823, -0.2689, -0.1735], [-0.0034, 0.1061, -0.1277, ..., -0.1316, -0.0722, 0.1258], [ 0.0583, -0.1369, -0.1746, ..., 0.0076, -0.0532, -0.1086], ..., [ 0.1027, -0.0838, 0.1426, ..., 0.0684, 0.2047, -0.0262], [ 0.1248, -0.2295, -0.1439, ..., -0.3278, -0.0934, 0.2151], [-0.1891, 0.0908, 0.0665, ..., -0.3053, -0.1630, -0.0663]], device='cuda:0'), grad: tensor([[ 3.6322e-08, -1.2107e-08, 6.5193e-09, ..., 0.0000e+00, 9.3132e-10, 2.8778e-07], [-2.4159e-06, -9.8720e-08, -2.9150e-07, ..., 2.7940e-09, 4.6566e-09, -1.9997e-05], [ 1.5637e-06, 1.8626e-09, 2.0768e-07, ..., -1.6764e-08, -7.4506e-09, 1.2740e-05], ..., [ 7.0781e-07, 1.9558e-08, 3.1479e-07, ..., 1.3039e-08, 4.0047e-08, 5.6550e-06], [ 2.2352e-08, 1.3039e-08, 2.6077e-08, ..., 1.8626e-09, 3.7253e-09, 2.3842e-07], [ 8.3819e-09, -6.5193e-08, -3.6508e-07, ..., 9.3132e-10, -5.3085e-08, 5.4948e-08]], device='cuda:0') Epoch 351, bias, value: tensor([-0.0188, -0.0229, -0.0055, -0.0232, -0.0029, 0.0013, 0.0112, 0.0192, 0.0176, -0.0075], device='cuda:0'), grad: tensor([ 5.5227e-07, -4.0442e-05, 2.5645e-05, 1.6950e-07, 1.1250e-06, -1.0803e-07, 1.3076e-06, 1.1928e-05, 5.7835e-07, -7.9721e-07], device='cuda:0') 100 0.0001 changing lr epoch 350, time 215.09, cls_loss 0.0009 cls_loss_mapping 0.0014 cls_loss_causal 0.4769 re_mapping 0.0041 re_causal 0.0123 /// teacc 98.98 lr 0.00010000 Epoch 352, weight, value: tensor([[-0.3065, 0.0824, -0.1892, ..., -0.0835, -0.2694, -0.1737], [-0.0026, 0.1065, -0.1274, ..., -0.1296, -0.0707, 0.1274], [ 0.0580, -0.1370, -0.1751, ..., 0.0073, -0.0538, -0.1092], ..., [ 0.1024, -0.0851, 0.1427, ..., 0.0672, 0.2040, -0.0277], [ 0.1249, -0.2304, -0.1443, ..., -0.3292, -0.0937, 0.2147], [-0.1894, 0.0908, 0.0665, ..., -0.3061, -0.1630, -0.0667]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -1.2107e-08, 1.2107e-08, ..., 1.4901e-08, 2.7940e-09, -1.8626e-09], [ 1.0338e-07, 3.7253e-09, 1.3597e-07, ..., 7.3574e-08, 1.0710e-07, 6.2399e-08], [ 8.6613e-08, 9.3132e-10, 7.3574e-08, ..., 5.0291e-08, 4.1910e-08, 8.5682e-08], ..., [-3.6787e-07, 4.6566e-09, -8.9500e-07, ..., -6.3051e-07, -5.8953e-07, 1.8626e-08], [-2.7101e-07, 6.5193e-09, 9.3132e-09, ..., 8.3819e-09, 3.7253e-09, -3.7532e-07], [ 4.5635e-08, 9.1270e-08, 8.0094e-08, ..., 3.7253e-09, 9.2201e-08, 3.4459e-08]], device='cuda:0') Epoch 352, bias, value: tensor([-0.0187, -0.0213, -0.0058, -0.0232, -0.0029, 0.0009, 0.0113, 0.0182, 0.0172, -0.0076], device='cuda:0'), grad: tensor([ 0.0000e+00, 3.9674e-07, 3.2596e-07, 8.8289e-07, -2.1607e-07, 1.7695e-07, 2.9057e-07, -1.2862e-06, -9.4436e-07, 3.7905e-07], device='cuda:0') 100 0.0001 changing lr epoch 351, time 214.49, cls_loss 0.0009 cls_loss_mapping 0.0011 cls_loss_causal 0.4473 re_mapping 0.0042 re_causal 0.0121 /// teacc 98.98 lr 0.00010000 Epoch 353, weight, value: tensor([[-0.3067, 0.0825, -0.1899, ..., -0.0836, -0.2698, -0.1735], [-0.0024, 0.1065, -0.1276, ..., -0.1290, -0.0702, 0.1278], [ 0.0580, -0.1375, -0.1752, ..., 0.0072, -0.0540, -0.1099], ..., [ 0.1024, -0.0853, 0.1428, ..., 0.0665, 0.2038, -0.0278], [ 0.1249, -0.2308, -0.1447, ..., -0.3302, -0.0941, 0.2148], [-0.1898, 0.0908, 0.0666, ..., -0.3075, -0.1631, -0.0671]], device='cuda:0'), grad: tensor([[-9.3132e-10, -9.9186e-07, 7.4506e-09, ..., 2.0489e-08, 0.0000e+00, -1.8626e-08], [ 4.5635e-08, 2.7940e-08, 6.7055e-08, ..., 7.3574e-08, 5.6811e-08, 1.0245e-08], [ 1.8626e-09, 2.6356e-07, 2.0768e-07, ..., 5.8673e-07, 1.2107e-08, 1.8626e-09], ..., [-5.1223e-08, 3.7253e-09, -4.3772e-08, ..., 9.3132e-10, -6.2399e-08, -1.8626e-08], [ 0.0000e+00, 9.3132e-09, 1.8626e-09, ..., 2.7940e-09, 0.0000e+00, 2.7940e-09], [ 2.7940e-09, 9.8255e-07, 4.6566e-09, ..., 1.8626e-09, 3.7253e-09, 2.2352e-08]], device='cuda:0') Epoch 353, bias, value: tensor([-0.0187, -0.0207, -0.0061, -0.0234, -0.0031, 0.0013, 0.0114, 0.0180, 0.0170, -0.0076], device='cuda:0'), grad: tensor([-2.3842e-06, 2.4308e-07, 1.3784e-06, -2.3395e-06, -1.3970e-08, 7.4320e-07, 1.5832e-08, -8.8476e-08, 3.1665e-08, 2.4103e-06], device='cuda:0') 100 0.0001 changing lr epoch 352, time 214.99, cls_loss 0.0008 cls_loss_mapping 0.0014 cls_loss_causal 0.4644 re_mapping 0.0042 re_causal 0.0122 /// teacc 98.99 lr 0.00010000 Epoch 354, weight, value: tensor([[-0.3069, 0.0824, -0.1913, ..., -0.0851, -0.2714, -0.1731], [-0.0011, 0.1066, -0.1251, ..., -0.1285, -0.0705, 0.1288], [ 0.0580, -0.1377, -0.1754, ..., 0.0072, -0.0540, -0.1101], ..., [ 0.1015, -0.0850, 0.1412, ..., 0.0662, 0.2041, -0.0293], [ 0.1249, -0.2313, -0.1451, ..., -0.3310, -0.0943, 0.2147], [-0.1902, 0.0908, 0.0666, ..., -0.3094, -0.1633, -0.0681]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.1176e-08, 1.3411e-07, ..., 9.3132e-10, 1.8626e-08, 9.3132e-10], [ 5.0291e-08, 1.4901e-08, 1.0803e-07, ..., 3.4459e-08, 1.1083e-07, -9.3132e-10], [ 3.7253e-09, 1.8626e-09, 5.1223e-08, ..., -1.9558e-08, 4.3772e-08, 9.3132e-10], ..., [-8.6613e-08, 2.7940e-09, 6.0536e-08, ..., -4.8429e-08, -1.7509e-07, 2.7940e-09], [ 5.5879e-09, 1.1176e-08, 3.5390e-08, ..., 1.1176e-08, 4.6566e-09, 0.0000e+00], [ 7.4506e-09, -2.7940e-08, -4.9360e-07, ..., 1.2107e-08, -4.2841e-08, 1.6764e-08]], device='cuda:0') Epoch 354, bias, value: tensor([-0.0187, -0.0190, -0.0061, -0.0235, -0.0030, 0.0013, 0.0113, 0.0166, 0.0168, -0.0078], device='cuda:0'), grad: tensor([ 6.5658e-07, 3.2503e-07, 1.5739e-07, 1.5181e-07, 3.2969e-07, 1.8347e-07, -4.7125e-07, 7.0315e-07, 2.5425e-07, -2.3041e-06], device='cuda:0') 100 0.0001 changing lr epoch 353, time 214.93, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.4767 re_mapping 0.0042 re_causal 0.0124 /// teacc 99.00 lr 0.00010000 Epoch 355, weight, value: tensor([[-0.3066, 0.0826, -0.1917, ..., -0.0841, -0.2717, -0.1729], [-0.0014, 0.1066, -0.1253, ..., -0.1288, -0.0710, 0.1286], [ 0.0580, -0.1381, -0.1754, ..., 0.0072, -0.0539, -0.1096], ..., [ 0.1018, -0.0852, 0.1415, ..., 0.0663, 0.2047, -0.0291], [ 0.1250, -0.2317, -0.1455, ..., -0.3326, -0.0948, 0.2148], [-0.1909, 0.0904, 0.0665, ..., -0.3107, -0.1636, -0.0685]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.5193e-08, 1.8626e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 9.5926e-08, -1.8626e-09, 1.1269e-07, ..., 1.5274e-07, 1.8626e-09, -8.3819e-09], [ 5.1223e-08, 0.0000e+00, 6.0536e-08, ..., 8.3819e-08, -9.3132e-10, -0.0000e+00], ..., [ 5.0385e-07, 9.3132e-10, 5.8301e-07, ..., 8.1863e-07, -1.8626e-09, 9.3132e-10], [ 8.0094e-08, 9.3132e-10, 9.5926e-08, ..., 1.2945e-07, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, -2.7940e-09, -1.6764e-08, ..., 1.8626e-09, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 355, bias, value: tensor([-0.0186, -0.0192, -0.0060, -0.0235, -0.0021, 0.0013, 0.0114, 0.0168, 0.0166, -0.0083], device='cuda:0'), grad: tensor([ 6.7614e-07, 3.8464e-07, 2.1327e-07, -3.0287e-06, 1.4901e-08, 5.1223e-08, -6.9756e-07, 2.0843e-06, 3.4086e-07, -2.7940e-08], device='cuda:0') 100 0.0001 changing lr epoch 354, time 214.89, cls_loss 0.0008 cls_loss_mapping 0.0011 cls_loss_causal 0.4647 re_mapping 0.0041 re_causal 0.0125 /// teacc 99.00 lr 0.00010000 Epoch 356, weight, value: tensor([[-0.3073, 0.0819, -0.1929, ..., -0.0851, -0.2719, -0.1749], [-0.0017, 0.1066, -0.1258, ..., -0.1286, -0.0715, 0.1282], [ 0.0581, -0.1382, -0.1755, ..., 0.0072, -0.0540, -0.1097], ..., [ 0.1022, -0.0855, 0.1419, ..., 0.0662, 0.2053, -0.0282], [ 0.1249, -0.2323, -0.1463, ..., -0.3348, -0.0950, 0.2148], [-0.1923, 0.0906, 0.0662, ..., -0.3115, -0.1640, -0.0694]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 9.3132e-10], [ 6.5193e-09, 4.6566e-09, 1.8626e-09, ..., 2.7940e-09, 9.3132e-10, 2.7940e-09], [-1.0803e-07, 9.3132e-10, 0.0000e+00, ..., -3.2596e-08, -2.1420e-08, 1.8626e-09], ..., [ 2.7940e-09, 4.6566e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 7.4506e-09], [ 1.1176e-08, 7.4506e-09, 5.5879e-09, ..., 2.7940e-08, 1.9558e-08, -1.9558e-07], [ 9.3132e-10, 8.3819e-09, -7.4506e-09, ..., 9.3132e-10, 0.0000e+00, 1.2107e-08]], device='cuda:0') Epoch 356, bias, value: tensor([-0.0192, -0.0194, -0.0059, -0.0236, -0.0008, 0.0013, 0.0117, 0.0170, 0.0163, -0.0085], device='cuda:0'), grad: tensor([ 1.2107e-08, 4.5635e-08, -2.8498e-07, 6.7707e-07, -5.6811e-08, -7.4878e-07, 5.7742e-07, 3.4459e-08, -2.8778e-07, 4.1910e-08], device='cuda:0') 100 0.0001 changing lr epoch 355, time 214.51, cls_loss 0.0007 cls_loss_mapping 0.0016 cls_loss_causal 0.4435 re_mapping 0.0044 re_causal 0.0123 /// teacc 98.95 lr 0.00010000 Epoch 357, weight, value: tensor([[-0.3076, 0.0821, -0.1934, ..., -0.0854, -0.2720, -0.1750], [-0.0021, 0.1066, -0.1261, ..., -0.1286, -0.0721, 0.1278], [ 0.0581, -0.1384, -0.1756, ..., 0.0072, -0.0540, -0.1099], ..., [ 0.1026, -0.0860, 0.1423, ..., 0.0663, 0.2059, -0.0276], [ 0.1251, -0.2325, -0.1465, ..., -0.3349, -0.0952, 0.2152], [-0.1927, 0.0909, 0.0663, ..., -0.3122, -0.1642, -0.0686]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.8207e-06, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-1.9558e-08, 9.6112e-07, 1.0245e-08, ..., 5.5879e-09, 9.3132e-10, -2.8871e-08], [ 3.7253e-09, 6.1467e-08, 4.6566e-09, ..., 2.7940e-09, 9.3132e-10, 3.7253e-09], ..., [ 1.4901e-08, 2.2352e-08, -6.5193e-09, ..., -2.7940e-09, -5.5879e-09, 2.8871e-08], [-3.1665e-08, 1.1176e-08, 8.3819e-09, ..., 9.3132e-10, 0.0000e+00, -4.5635e-08], [ 6.5193e-09, 4.9360e-08, -1.9558e-08, ..., 2.7940e-09, 1.8626e-09, 8.3819e-09]], device='cuda:0') Epoch 357, bias, value: tensor([-0.0191, -0.0196, -0.0059, -0.0236, -0.0014, 0.0011, 0.0117, 0.0172, 0.0163, -0.0083], device='cuda:0'), grad: tensor([-5.0068e-06, 2.6226e-06, 1.8626e-07, -1.2405e-06, 2.9802e-08, 1.3830e-06, 1.8626e-06, 1.0524e-07, -7.8231e-08, 1.2107e-07], device='cuda:0') 100 0.0001 changing lr epoch 356, time 214.85, cls_loss 0.0008 cls_loss_mapping 0.0014 cls_loss_causal 0.4755 re_mapping 0.0041 re_causal 0.0122 /// teacc 99.01 lr 0.00010000 Epoch 358, weight, value: tensor([[-0.3080, 0.0822, -0.1941, ..., -0.0856, -0.2726, -0.1751], [-0.0024, 0.1064, -0.1261, ..., -0.1291, -0.0726, 0.1283], [ 0.0581, -0.1391, -0.1757, ..., 0.0072, -0.0541, -0.1113], ..., [ 0.1029, -0.0840, 0.1424, ..., 0.0667, 0.2065, -0.0272], [ 0.1253, -0.2328, -0.1465, ..., -0.3353, -0.0951, 0.2154], [-0.1933, 0.0907, 0.0663, ..., -0.3133, -0.1644, -0.0696]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -8.6613e-08, 2.4214e-08, ..., 1.3970e-08, 9.3132e-10, -3.1665e-08], [ 5.6811e-08, 2.2352e-08, 5.6811e-08, ..., 5.2154e-08, 1.5832e-08, 0.0000e+00], [-2.0396e-07, 2.8871e-08, 6.5193e-09, ..., -4.4703e-08, -2.7940e-08, -2.1420e-08], ..., [ 1.3039e-07, 4.6566e-09, 1.4901e-08, ..., 5.4948e-08, 2.7940e-09, 3.1665e-08], [-1.7695e-08, 1.5832e-08, 8.3819e-09, ..., 8.3819e-09, 1.8626e-09, -1.1642e-07], [ 1.8626e-08, 6.2399e-08, 9.3132e-09, ..., 4.6566e-09, 4.6566e-09, 8.9407e-08]], device='cuda:0') Epoch 358, bias, value: tensor([-0.0191, -0.0195, -0.0061, -0.0235, -0.0013, 0.0009, 0.0117, 0.0174, 0.0164, -0.0085], device='cuda:0'), grad: tensor([-2.6356e-07, 2.1979e-07, -3.0734e-07, -3.2317e-07, -2.4214e-08, 9.3132e-08, 1.0058e-07, 3.1665e-07, -1.5832e-07, 3.4552e-07], device='cuda:0') 100 0.0001 changing lr epoch 357, time 214.83, cls_loss 0.0007 cls_loss_mapping 0.0014 cls_loss_causal 0.4556 re_mapping 0.0042 re_causal 0.0121 /// teacc 98.97 lr 0.00010000 Epoch 359, weight, value: tensor([[-0.3080, 0.0825, -0.1945, ..., -0.0861, -0.2729, -0.1748], [-0.0031, 0.1065, -0.1267, ..., -0.1296, -0.0730, 0.1288], [ 0.0581, -0.1375, -0.1755, ..., 0.0074, -0.0541, -0.1114], ..., [ 0.1035, -0.0847, 0.1429, ..., 0.0672, 0.2069, -0.0279], [ 0.1256, -0.2332, -0.1466, ..., -0.3354, -0.0941, 0.2156], [-0.1938, 0.0903, 0.0662, ..., -0.3162, -0.1645, -0.0698]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.5832e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, -2.7940e-09, 3.7253e-09, ..., 3.7253e-09, 1.8626e-09, -9.3132e-09], [ 1.8626e-09, 0.0000e+00, 4.6566e-09, ..., -9.3132e-10, -5.5879e-09, 0.0000e+00], ..., [-3.7253e-09, 1.8626e-09, -4.6566e-09, ..., -9.3132e-10, -3.7253e-09, 6.5193e-09], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 1.7695e-08, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 359, bias, value: tensor([-0.0189, -0.0198, -0.0057, -0.0237, -0.0015, 0.0009, 0.0117, 0.0177, 0.0165, -0.0090], device='cuda:0'), grad: tensor([-3.5390e-08, -1.0245e-08, -2.5146e-08, -1.7695e-08, 2.7008e-08, 5.5879e-09, -7.4506e-09, 9.3132e-09, 8.3819e-09, 4.2841e-08], device='cuda:0') 100 0.0001 changing lr epoch 358, time 214.71, cls_loss 0.0010 cls_loss_mapping 0.0014 cls_loss_causal 0.5017 re_mapping 0.0044 re_causal 0.0123 /// teacc 99.13 lr 0.00010000 Epoch 360, weight, value: tensor([[-0.3086, 0.0827, -0.1951, ..., -0.0868, -0.2734, -0.1747], [-0.0006, 0.1066, -0.1240, ..., -0.1298, -0.0731, 0.1302], [ 0.0582, -0.1375, -0.1756, ..., 0.0074, -0.0540, -0.1115], ..., [ 0.1013, -0.0851, 0.1404, ..., 0.0670, 0.2071, -0.0297], [ 0.1257, -0.2337, -0.1469, ..., -0.3365, -0.0944, 0.2157], [-0.1944, 0.0902, 0.0662, ..., -0.3177, -0.1646, -0.0698]], device='cuda:0'), grad: tensor([[-9.3132e-10, -4.2841e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [-1.8626e-09, -3.7253e-09, 9.3132e-10, ..., 1.8626e-09, 3.7253e-09, -1.2107e-07], [ 2.0955e-08, 7.9162e-09, 0.0000e+00, ..., -2.3283e-09, -5.1223e-09, 4.9360e-08], ..., [ 4.6566e-10, 8.3819e-09, 4.6566e-10, ..., 0.0000e+00, 4.6566e-10, 1.5832e-08], [-2.2352e-08, 5.5879e-09, 1.3970e-09, ..., 0.0000e+00, 0.0000e+00, -2.0955e-08], [ 9.3132e-10, 2.1886e-08, -7.4506e-09, ..., 0.0000e+00, 0.0000e+00, 2.0023e-08]], device='cuda:0') Epoch 360, bias, value: tensor([-0.0188, -0.0174, -0.0054, -0.0237, -0.0016, 0.0001, 0.0122, 0.0153, 0.0161, -0.0091], device='cuda:0'), grad: tensor([-1.0664e-07, -3.9907e-07, 1.6112e-07, 1.4435e-08, -9.5461e-08, -2.9802e-08, 2.9989e-07, 7.7765e-08, -3.2596e-08, 1.2247e-07], device='cuda:0') 100 0.0001 changing lr epoch 359, time 214.75, cls_loss 0.0009 cls_loss_mapping 0.0016 cls_loss_causal 0.5007 re_mapping 0.0041 re_causal 0.0121 /// teacc 99.06 lr 0.00010000 Epoch 361, weight, value: tensor([[-0.3092, 0.0828, -0.1967, ..., -0.0876, -0.2742, -0.1746], [-0.0013, 0.1067, -0.1244, ..., -0.1304, -0.0747, 0.1299], [ 0.0578, -0.1378, -0.1763, ..., 0.0071, -0.0545, -0.1115], ..., [ 0.1022, -0.0855, 0.1410, ..., 0.0681, 0.2088, -0.0293], [ 0.1259, -0.2341, -0.1474, ..., -0.3374, -0.0948, 0.2160], [-0.1947, 0.0901, 0.0663, ..., -0.3184, -0.1647, -0.0703]], device='cuda:0'), grad: tensor([[ 3.2596e-09, -1.1642e-08, 2.3283e-09, ..., 4.1910e-09, 0.0000e+00, 0.0000e+00], [ 3.7253e-09, 1.8626e-09, 5.1223e-09, ..., 3.7253e-09, 1.8626e-09, -1.3970e-09], [-2.4680e-08, 2.7940e-09, -8.8476e-09, ..., -2.5611e-08, 1.3970e-09, 4.6566e-10], ..., [-9.3132e-10, 1.3970e-09, -9.3132e-10, ..., 9.3132e-10, -1.3970e-09, 1.3970e-09], [ 1.8626e-09, 2.3283e-09, 9.3132e-10, ..., 2.3283e-09, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 4.6566e-09, -4.1910e-09, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 361, bias, value: tensor([-0.0187, -0.0178, -0.0056, -0.0237, -0.0019, 0.0001, 0.0122, 0.0158, 0.0159, -0.0091], device='cuda:0'), grad: tensor([-1.1642e-08, 1.5367e-08, -1.2014e-07, 6.7521e-08, 2.0023e-08, 1.8626e-08, -1.4901e-08, 8.8476e-09, 1.9092e-08, 1.0710e-08], device='cuda:0') 100 0.0001 changing lr epoch 360, time 214.98, cls_loss 0.0011 cls_loss_mapping 0.0020 cls_loss_causal 0.4641 re_mapping 0.0043 re_causal 0.0115 /// teacc 99.03 lr 0.00010000 Epoch 362, weight, value: tensor([[-0.3101, 0.0829, -0.1988, ..., -0.0890, -0.2754, -0.1747], [-0.0030, 0.1066, -0.1254, ..., -0.1308, -0.0768, 0.1299], [ 0.0558, -0.1385, -0.1787, ..., 0.0060, -0.0562, -0.1119], ..., [ 0.1047, -0.0858, 0.1424, ..., 0.0700, 0.2114, -0.0294], [ 0.1261, -0.2345, -0.1483, ..., -0.3388, -0.0953, 0.2163], [-0.1951, 0.0899, 0.0665, ..., -0.3188, -0.1646, -0.0713]], device='cuda:0'), grad: tensor([[-9.3132e-10, -4.2841e-08, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 4.6566e-09, 9.3132e-10, 4.1910e-09, ..., 9.3132e-10, 9.3132e-10, -2.7940e-09], [ 2.3283e-09, 3.7253e-09, 2.7940e-09, ..., 2.3283e-09, 1.3970e-09, 1.3970e-09], ..., [ 1.8626e-09, 3.7253e-09, 2.7940e-09, ..., 0.0000e+00, 0.0000e+00, 4.6566e-09], [-5.5879e-09, 6.4727e-08, 5.5414e-08, ..., 4.6566e-10, 0.0000e+00, -6.5193e-09], [ 1.8626e-09, -5.4482e-08, -8.5216e-08, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09]], device='cuda:0') Epoch 362, bias, value: tensor([-1.8620e-02, -1.8831e-02, -6.9481e-03, -2.3424e-02, -1.2679e-03, -4.5993e-06, 1.2204e-02, 1.7227e-02, 1.5779e-02, -9.2256e-03], device='cuda:0'), grad: tensor([-5.5879e-08, 1.1176e-08, 1.7695e-08, 2.7940e-09, 9.1735e-08, -1.2573e-08, -2.9337e-08, 2.1886e-08, 2.5937e-07, -2.9989e-07], device='cuda:0') 100 0.0001 changing lr epoch 361, time 214.90, cls_loss 0.0009 cls_loss_mapping 0.0014 cls_loss_causal 0.4836 re_mapping 0.0041 re_causal 0.0119 /// teacc 99.10 lr 0.00010000 Epoch 363, weight, value: tensor([[-0.3110, 0.0831, -0.2001, ..., -0.0903, -0.2764, -0.1747], [-0.0033, 0.1066, -0.1255, ..., -0.1315, -0.0773, 0.1299], [ 0.0549, -0.1386, -0.1798, ..., 0.0058, -0.0577, -0.1120], ..., [ 0.1054, -0.0856, 0.1427, ..., 0.0712, 0.2126, -0.0294], [ 0.1264, -0.2349, -0.1490, ..., -0.3399, -0.0961, 0.2166], [-0.1953, 0.0900, 0.0666, ..., -0.3191, -0.1647, -0.0716]], device='cuda:0'), grad: tensor([[ 4.6566e-09, -1.8626e-09, 2.7940e-09, ..., 9.3132e-10, 2.7940e-09, 7.4506e-09], [ 9.3132e-09, 3.7253e-09, 2.1420e-08, ..., 3.7253e-09, 1.2107e-08, 2.7940e-09], [-3.7253e-09, 2.7940e-09, 1.5832e-08, ..., -8.3819e-09, -1.4901e-08, 2.6077e-08], ..., [-1.1176e-08, 3.9116e-08, 2.0955e-07, ..., 3.7253e-09, 1.0058e-07, 3.7253e-09], [-9.6858e-08, 1.8626e-09, 5.5879e-09, ..., 0.0000e+00, 4.6566e-09, -1.8068e-07], [ 6.7987e-08, -7.0781e-08, -3.0175e-07, ..., 0.0000e+00, -1.2293e-07, 1.1921e-07]], device='cuda:0') Epoch 363, bias, value: tensor([-0.0185, -0.0190, -0.0072, -0.0234, -0.0016, 0.0003, 0.0120, 0.0175, 0.0156, -0.0092], device='cuda:0'), grad: tensor([ 1.0524e-07, 1.3877e-07, -1.1455e-07, -3.0734e-08, 5.9232e-07, 2.6263e-07, -7.3109e-07, 4.6473e-07, -5.4482e-07, -1.4994e-07], device='cuda:0') 100 0.0001 changing lr epoch 362, time 214.86, cls_loss 0.0007 cls_loss_mapping 0.0013 cls_loss_causal 0.4754 re_mapping 0.0043 re_causal 0.0125 /// teacc 99.00 lr 0.00010000 Epoch 364, weight, value: tensor([[-0.3117, 0.0829, -0.2034, ..., -0.0910, -0.2796, -0.1748], [-0.0045, 0.1066, -0.1261, ..., -0.1320, -0.0786, 0.1292], [ 0.0549, -0.1387, -0.1800, ..., 0.0058, -0.0578, -0.1121], ..., [ 0.1066, -0.0873, 0.1432, ..., 0.0717, 0.2139, -0.0285], [ 0.1265, -0.2351, -0.1497, ..., -0.3412, -0.0975, 0.2174], [-0.1955, 0.0905, 0.0674, ..., -0.3194, -0.1644, -0.0718]], device='cuda:0'), grad: tensor([[ 2.7940e-09, -1.8626e-09, 2.7940e-09, ..., 1.8626e-09, 2.7940e-09, 0.0000e+00], [ 3.3528e-08, 1.3970e-08, 3.1665e-08, ..., 1.7695e-08, 3.0734e-08, 3.7253e-09], [ 8.2888e-08, 9.3132e-10, 8.5682e-08, ..., 4.4703e-08, 8.4750e-08, 1.8626e-09], ..., [-1.5739e-07, 1.8626e-08, -1.4901e-07, ..., -8.3819e-08, -1.5460e-07, 9.3132e-10], [ 3.7253e-09, 3.2596e-08, 1.7695e-08, ..., 3.7253e-09, 6.5193e-09, -6.5193e-09], [ 2.7940e-08, 2.8871e-08, -8.3819e-09, ..., 1.3970e-08, 2.5146e-08, 1.9558e-08]], device='cuda:0') Epoch 364, bias, value: tensor([-0.0186, -0.0197, -0.0071, -0.0235, -0.0018, 0.0008, 0.0116, 0.0182, 0.0163, -0.0088], device='cuda:0'), grad: tensor([ 4.6566e-09, 1.2293e-07, 1.8720e-07, 5.4017e-08, -3.2783e-07, -1.8626e-07, 3.4459e-08, -2.5518e-07, 1.0058e-07, 2.6450e-07], device='cuda:0') 100 0.0001 changing lr epoch 363, time 214.76, cls_loss 0.0011 cls_loss_mapping 0.0011 cls_loss_causal 0.4675 re_mapping 0.0043 re_causal 0.0119 /// teacc 98.95 lr 0.00010000 Epoch 365, weight, value: tensor([[-0.3124, 0.0829, -0.2035, ..., -0.0918, -0.2798, -0.1751], [-0.0039, 0.1065, -0.1254, ..., -0.1326, -0.0793, 0.1297], [ 0.0550, -0.1388, -0.1808, ..., 0.0047, -0.0581, -0.1119], ..., [ 0.1063, -0.0876, 0.1427, ..., 0.0723, 0.2147, -0.0292], [ 0.1263, -0.2356, -0.1513, ..., -0.3458, -0.0997, 0.2180], [-0.1969, 0.0906, 0.0673, ..., -0.3210, -0.1646, -0.0731]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -8.1956e-08, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 4.6566e-09, 4.6566e-09, 2.7940e-09, ..., 3.7253e-09, 1.8626e-09, -1.8626e-09], [-7.4506e-09, 2.7940e-09, 1.8626e-09, ..., -5.5879e-09, 9.3132e-10, 0.0000e+00], ..., [-3.7253e-09, 2.7940e-09, -8.3819e-09, ..., -0.0000e+00, -3.7253e-09, 1.8626e-09], [ 1.8626e-09, 2.7940e-09, 2.7940e-09, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 2.7940e-09, 9.8720e-08, -9.3132e-10, ..., 9.3132e-10, 1.8626e-09, 9.3132e-10]], device='cuda:0') Epoch 365, bias, value: tensor([-0.0187, -0.0191, -0.0074, -0.0228, -0.0016, 0.0003, 0.0116, 0.0176, 0.0160, -0.0090], device='cuda:0'), grad: tensor([-1.0431e-07, 3.9116e-08, -2.3283e-08, 9.3132e-09, -3.0920e-07, 9.3132e-09, 1.5646e-07, 2.7940e-09, 2.3283e-08, 2.1048e-07], device='cuda:0') 100 0.0001 changing lr epoch 364, time 214.85, cls_loss 0.0008 cls_loss_mapping 0.0011 cls_loss_causal 0.4580 re_mapping 0.0041 re_causal 0.0116 /// teacc 99.00 lr 0.00010000 Epoch 366, weight, value: tensor([[-0.3128, 0.0829, -0.2036, ..., -0.0930, -0.2800, -0.1754], [-0.0040, 0.1066, -0.1255, ..., -0.1330, -0.0795, 0.1299], [ 0.0550, -0.1390, -0.1809, ..., 0.0048, -0.0581, -0.1122], ..., [ 0.1065, -0.0878, 0.1428, ..., 0.0725, 0.2150, -0.0294], [ 0.1270, -0.2360, -0.1522, ..., -0.3466, -0.1006, 0.2188], [-0.1973, 0.0898, 0.0674, ..., -0.3223, -0.1647, -0.0745]], device='cuda:0'), grad: tensor([[ 4.6566e-09, -6.4261e-08, 1.6764e-08, ..., 1.8626e-09, 4.6566e-09, -3.7253e-09], [ 7.4506e-09, -1.1921e-07, 2.4214e-08, ..., 2.7940e-09, 6.5193e-09, -1.0878e-06], [ 2.7940e-09, 2.0489e-08, 3.7253e-09, ..., 9.3132e-10, 9.3132e-10, 9.5926e-08], ..., [-3.4459e-08, 1.8347e-07, -8.5682e-08, ..., -1.4901e-08, -3.3528e-08, 1.0077e-06], [-2.7940e-09, 2.4214e-08, 1.5832e-08, ..., 1.8626e-09, 3.7253e-09, 6.2399e-08], [ 1.3039e-08, -6.0443e-07, -9.9838e-07, ..., 5.5879e-09, 1.2107e-08, 8.8476e-08]], device='cuda:0') Epoch 366, bias, value: tensor([-1.8616e-02, -1.9143e-02, -7.3684e-03, -2.2830e-02, -5.0967e-06, 4.5515e-04, 1.1458e-02, 1.7689e-02, 1.6315e-02, -9.9327e-03], device='cuda:0'), grad: tensor([-1.5646e-07, -1.3178e-06, 1.5181e-07, 2.3283e-08, 2.1011e-06, 4.2841e-08, 5.9605e-08, 1.2713e-06, 1.3504e-07, -2.3171e-06], device='cuda:0') 100 0.0001 changing lr epoch 365, time 214.76, cls_loss 0.0011 cls_loss_mapping 0.0020 cls_loss_causal 0.4673 re_mapping 0.0041 re_causal 0.0115 /// teacc 98.96 lr 0.00010000 Epoch 367, weight, value: tensor([[-0.3131, 0.0830, -0.2037, ..., -0.0956, -0.2801, -0.1759], [-0.0042, 0.1065, -0.1256, ..., -0.1336, -0.0796, 0.1306], [ 0.0555, -0.1397, -0.1811, ..., 0.0048, -0.0581, -0.1124], ..., [ 0.1064, -0.0897, 0.1429, ..., 0.0725, 0.2151, -0.0305], [ 0.1275, -0.2371, -0.1535, ..., -0.3486, -0.1009, 0.2196], [-0.2003, 0.0888, 0.0673, ..., -0.3236, -0.1649, -0.0782]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.2396e-06, 4.6566e-09, ..., 9.3132e-10, 0.0000e+00, 5.5879e-09], [-8.3819e-09, 1.1735e-07, 2.7008e-08, ..., 1.8626e-09, 1.8626e-09, 1.3877e-07], [-1.3039e-08, 4.6566e-09, 2.7940e-09, ..., -8.3819e-09, 0.0000e+00, 3.7253e-09], ..., [-2.7940e-09, 4.0047e-08, 1.4901e-08, ..., -9.3132e-10, -8.3819e-09, 2.7008e-08], [ 9.3132e-10, 7.8231e-08, 1.6764e-08, ..., 7.4506e-09, 0.0000e+00, 3.1665e-08], [ 7.4506e-09, -1.2014e-07, -2.1327e-07, ..., 9.3132e-10, 1.8626e-09, 8.0094e-08]], device='cuda:0') Epoch 367, bias, value: tensor([-0.0188, -0.0194, -0.0067, -0.0208, 0.0019, -0.0016, 0.0116, 0.0175, 0.0165, -0.0110], device='cuda:0'), grad: tensor([ 8.0019e-06, 5.4482e-07, -1.6764e-08, -1.8626e-09, 7.0129e-07, 1.0151e-07, -9.4920e-06, 1.0617e-07, 3.2224e-07, -2.4121e-07], device='cuda:0') 100 0.0001 changing lr epoch 366, time 215.24, cls_loss 0.0011 cls_loss_mapping 0.0013 cls_loss_causal 0.4770 re_mapping 0.0038 re_causal 0.0113 /// teacc 99.04 lr 0.00010000 Epoch 368, weight, value: tensor([[-0.3134, 0.0830, -0.2038, ..., -0.0961, -0.2803, -0.1764], [-0.0038, 0.1064, -0.1253, ..., -0.1317, -0.0786, 0.1304], [ 0.0552, -0.1401, -0.1817, ..., 0.0046, -0.0586, -0.1126], ..., [ 0.1064, -0.0886, 0.1431, ..., 0.0717, 0.2148, -0.0299], [ 0.1277, -0.2381, -0.1553, ..., -0.3496, -0.1026, 0.2200], [-0.2026, 0.0896, 0.0673, ..., -0.3262, -0.1654, -0.0785]], device='cuda:0'), grad: tensor([[-0.0000e+00, -1.0617e-07, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, -4.5635e-08], [-9.3132e-10, 5.4948e-08, -9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 1.0245e-08], [ 0.0000e+00, 2.7940e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], ..., [ 0.0000e+00, 2.5146e-08, 1.3970e-08, ..., 0.0000e+00, 0.0000e+00, 1.3970e-08], [ 0.0000e+00, 9.3132e-09, 2.7940e-09, ..., 9.3132e-10, 0.0000e+00, 7.4506e-09], [-0.0000e+00, 3.1665e-08, -2.6077e-08, ..., 0.0000e+00, 0.0000e+00, 1.3970e-08]], device='cuda:0') Epoch 368, bias, value: tensor([-0.0187, -0.0184, -0.0073, -0.0212, 0.0011, -0.0022, 0.0119, 0.0171, 0.0165, -0.0107], device='cuda:0'), grad: tensor([-2.1048e-07, 8.6613e-08, 6.5193e-09, 1.2107e-08, 1.3970e-08, -3.9488e-07, 3.3434e-07, 5.9605e-08, 2.7940e-08, 7.4506e-08], device='cuda:0') 100 0.0001 changing lr epoch 367, time 214.96, cls_loss 0.0007 cls_loss_mapping 0.0011 cls_loss_causal 0.4542 re_mapping 0.0042 re_causal 0.0120 /// teacc 99.02 lr 0.00010000 Epoch 369, weight, value: tensor([[-0.3143, 0.0831, -0.2041, ..., -0.0987, -0.2812, -0.1771], [-0.0038, 0.1065, -0.1253, ..., -0.1317, -0.0787, 0.1306], [ 0.0553, -0.1406, -0.1819, ..., 0.0047, -0.0585, -0.1127], ..., [ 0.1065, -0.0889, 0.1430, ..., 0.0713, 0.2149, -0.0299], [ 0.1275, -0.2394, -0.1565, ..., -0.3507, -0.1038, 0.2199], [-0.2028, 0.0895, 0.0674, ..., -0.3266, -0.1654, -0.0786]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -6.8918e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -1.3039e-08], [ 7.4506e-09, -1.4529e-07, 9.3132e-09, ..., 4.6566e-09, 5.5879e-09, -6.0070e-07], [-1.2107e-08, 4.6566e-09, 3.7253e-09, ..., -4.6566e-09, -1.0245e-08, 1.3970e-08], ..., [-0.0000e+00, 8.3819e-09, -1.3039e-08, ..., 9.3132e-10, 1.8626e-09, 1.5832e-08], [ 0.0000e+00, 7.4506e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1176e-08], [ 9.3132e-10, 3.1665e-08, -9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 3.5390e-08]], device='cuda:0') Epoch 369, bias, value: tensor([-0.0187, -0.0184, -0.0072, -0.0209, 0.0011, -0.0024, 0.0121, 0.0170, 0.0159, -0.0108], device='cuda:0'), grad: tensor([-1.5181e-07, -9.4622e-07, 1.0245e-08, 1.1362e-07, -1.2107e-07, -1.1269e-07, 1.0524e-06, 3.1665e-08, 2.9802e-08, 1.0245e-07], device='cuda:0') 100 0.0001 changing lr epoch 368, time 214.79, cls_loss 0.0007 cls_loss_mapping 0.0010 cls_loss_causal 0.4729 re_mapping 0.0040 re_causal 0.0117 /// teacc 99.00 lr 0.00010000 Epoch 370, weight, value: tensor([[-0.3145, 0.0826, -0.2048, ..., -0.0993, -0.2814, -0.1769], [-0.0040, 0.1067, -0.1254, ..., -0.1318, -0.0790, 0.1304], [ 0.0554, -0.1407, -0.1819, ..., 0.0048, -0.0585, -0.1128], ..., [ 0.1066, -0.0895, 0.1431, ..., 0.0712, 0.2152, -0.0295], [ 0.1279, -0.2413, -0.1579, ..., -0.3512, -0.1039, 0.2200], [-0.2029, 0.0904, 0.0676, ..., -0.3268, -0.1654, -0.0786]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 4.0978e-08, 1.8626e-09, ..., 3.7253e-09, 1.8626e-09, 3.3528e-08], [ 1.1176e-08, -3.4086e-07, 1.1176e-08, ..., 1.8626e-08, 7.4506e-09, -1.5460e-07], [-7.8231e-08, 1.1176e-08, -8.1956e-08, ..., -1.3784e-07, -5.5879e-08, 3.7253e-09], ..., [ 3.9116e-08, 7.4506e-09, 4.2841e-08, ..., 7.0781e-08, 2.9802e-08, 5.5879e-09], [ 3.7253e-09, 7.4506e-09, 7.4506e-09, ..., 1.3039e-08, 5.5879e-09, -1.4901e-08], [ 7.4506e-09, -3.3528e-08, -4.8429e-08, ..., 5.5879e-09, 1.8626e-09, 1.4901e-08]], device='cuda:0') Epoch 370, bias, value: tensor([-0.0189, -0.0185, -0.0071, -0.0210, 0.0011, -0.0022, 0.0120, 0.0171, 0.0157, -0.0104], device='cuda:0'), grad: tensor([ 1.4901e-07, -8.9221e-07, -4.7870e-07, 5.0291e-08, 1.3411e-07, 4.8429e-08, 6.5193e-07, 2.8871e-07, 3.5390e-08, 1.3039e-08], device='cuda:0') 100 0.0001 changing lr epoch 369, time 215.19, cls_loss 0.0013 cls_loss_mapping 0.0015 cls_loss_causal 0.4985 re_mapping 0.0040 re_causal 0.0118 /// teacc 98.99 lr 0.00010000 Epoch 371, weight, value: tensor([[-0.3150, 0.0810, -0.2057, ..., -0.1003, -0.2818, -0.1770], [-0.0042, 0.1063, -0.1255, ..., -0.1321, -0.0791, 0.1304], [ 0.0495, -0.1408, -0.1853, ..., -0.0012, -0.0646, -0.1182], ..., [ 0.1119, -0.0896, 0.1440, ..., 0.0773, 0.2205, -0.0249], [ 0.1280, -0.2417, -0.1586, ..., -0.3537, -0.1059, 0.2204], [-0.2040, 0.0920, 0.0678, ..., -0.3280, -0.1656, -0.0797]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -4.1537e-07, 1.8626e-09, ..., -3.9116e-08, 0.0000e+00, 0.0000e+00], [ 7.4506e-09, 1.8626e-09, 3.3528e-08, ..., 1.1176e-08, 5.5879e-09, -1.8626e-09], [ 9.3132e-09, 1.1176e-08, 1.1176e-08, ..., 5.5879e-09, 9.3132e-09, 0.0000e+00], ..., [-3.1665e-08, 3.7253e-09, 2.0489e-08, ..., 7.4506e-09, -2.7940e-08, 1.8626e-09], [-3.7253e-09, 5.5879e-09, 2.7940e-08, ..., 7.4506e-09, 1.8626e-09, -1.4901e-08], [ 7.4506e-09, 2.6822e-07, 7.4506e-09, ..., 4.6566e-08, 5.5879e-09, 5.5879e-09]], device='cuda:0') Epoch 371, bias, value: tensor([-0.0199, -0.0187, -0.0128, -0.0203, 0.0020, -0.0022, 0.0117, 0.0213, 0.0158, -0.0096], device='cuda:0'), grad: tensor([-9.0152e-07, 6.8918e-08, 6.3330e-08, -1.2852e-07, 2.8685e-07, 6.8918e-08, -1.0990e-07, -1.8626e-09, 2.6077e-08, 6.3144e-07], device='cuda:0') 100 0.0001 changing lr epoch 370, time 214.95, cls_loss 0.0009 cls_loss_mapping 0.0012 cls_loss_causal 0.4548 re_mapping 0.0039 re_causal 0.0114 /// teacc 98.94 lr 0.00010000 Epoch 372, weight, value: tensor([[-0.3151, 0.0813, -0.2058, ..., -0.1003, -0.2820, -0.1776], [-0.0040, 0.1062, -0.1253, ..., -0.1322, -0.0792, 0.1311], [ 0.0494, -0.1408, -0.1854, ..., -0.0012, -0.0646, -0.1182], ..., [ 0.1119, -0.0900, 0.1436, ..., 0.0773, 0.2206, -0.0253], [ 0.1281, -0.2421, -0.1591, ..., -0.3542, -0.1062, 0.2207], [-0.2046, 0.0921, 0.0688, ..., -0.3289, -0.1657, -0.0802]], device='cuda:0'), grad: tensor([[ 5.5879e-09, -6.1467e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 5.5879e-09], [ 5.1223e-07, 9.8720e-08, 5.5879e-09, ..., 0.0000e+00, 1.8626e-09, 5.9605e-07], [ 5.5879e-09, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 7.4506e-09], ..., [ 5.5879e-09, 3.7253e-09, -1.3039e-08, ..., -0.0000e+00, -1.1176e-08, 1.6764e-08], [-6.2771e-07, -1.1735e-07, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -7.3388e-07], [ 1.3039e-08, 4.6566e-08, -9.3132e-09, ..., 0.0000e+00, 7.4506e-09, 9.3132e-09]], device='cuda:0') Epoch 372, bias, value: tensor([-0.0197, -0.0184, -0.0128, -0.0203, 0.0012, -0.0022, 0.0117, 0.0211, 0.0158, -0.0091], device='cuda:0'), grad: tensor([-1.3225e-07, 1.8440e-06, 2.2352e-08, 3.3528e-08, 3.3528e-08, 2.0489e-07, 5.0291e-08, 3.1665e-08, -2.2538e-06, 1.4901e-07], device='cuda:0') 100 0.0001 changing lr epoch 371, time 214.97, cls_loss 0.0011 cls_loss_mapping 0.0017 cls_loss_causal 0.4748 re_mapping 0.0042 re_causal 0.0120 /// teacc 98.97 lr 0.00010000 Epoch 373, weight, value: tensor([[-0.3145, 0.0816, -0.2061, ..., -0.1008, -0.2821, -0.1770], [-0.0071, 0.1064, -0.1282, ..., -0.1322, -0.0823, 0.1281], [ 0.0494, -0.1412, -0.1856, ..., -0.0012, -0.0647, -0.1183], ..., [ 0.1145, -0.0908, 0.1464, ..., 0.0773, 0.2234, -0.0223], [ 0.1288, -0.2429, -0.1597, ..., -0.3551, -0.1066, 0.2215], [-0.2053, 0.0919, 0.0693, ..., -0.3301, -0.1655, -0.0811]], device='cuda:0'), grad: tensor([[-3.7253e-09, -4.8429e-08, 3.5390e-08, ..., 2.9802e-08, 0.0000e+00, 0.0000e+00], [ 3.7253e-09, 8.3819e-08, 2.0489e-08, ..., 1.4901e-08, 0.0000e+00, 9.3132e-09], [-1.6764e-08, 6.3330e-08, 2.0489e-08, ..., 5.5879e-09, 0.0000e+00, 1.8626e-09], ..., [ 1.4901e-08, 1.4715e-07, 1.1176e-08, ..., 2.2352e-08, -1.8626e-09, 2.0489e-08], [ 1.8626e-09, 1.7881e-07, 2.7940e-08, ..., 1.3039e-08, 0.0000e+00, 3.7253e-09], [ 1.8626e-09, 1.9260e-06, -1.4342e-07, ..., -5.5879e-09, 0.0000e+00, 3.2410e-07]], device='cuda:0') Epoch 373, bias, value: tensor([-0.0195, -0.0213, -0.0128, -0.0193, 0.0014, -0.0034, 0.0118, 0.0236, 0.0158, -0.0090], device='cuda:0'), grad: tensor([-2.9802e-08, 3.4645e-07, 1.2293e-07, -9.6858e-08, -1.0461e-05, 2.3469e-07, 6.3330e-08, 6.7241e-07, 4.7870e-07, 8.6427e-06], device='cuda:0') 100 0.0001 changing lr epoch 372, time 215.27, cls_loss 0.0009 cls_loss_mapping 0.0021 cls_loss_causal 0.5016 re_mapping 0.0041 re_causal 0.0118 /// teacc 98.97 lr 0.00010000 Epoch 374, weight, value: tensor([[-0.3150, 0.0819, -0.2061, ..., -0.1016, -0.2823, -0.1767], [-0.0072, 0.1065, -0.1280, ..., -0.1322, -0.0822, 0.1283], [ 0.0497, -0.1412, -0.1854, ..., -0.0011, -0.0645, -0.1183], ..., [ 0.1144, -0.0916, 0.1462, ..., 0.0772, 0.2233, -0.0225], [ 0.1291, -0.2438, -0.1604, ..., -0.3555, -0.1070, 0.2218], [-0.2063, 0.0916, 0.0693, ..., -0.3308, -0.1656, -0.0814]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -2.0489e-08, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-1.3441e-05, -1.8626e-09, -2.2538e-07, ..., 1.8626e-09, -5.5879e-06, -5.4203e-07], [ 1.2010e-05, 3.7253e-09, 2.0862e-07, ..., -0.0000e+00, 4.9919e-06, 4.7497e-07], ..., [ 1.4119e-06, 5.5879e-09, 2.2352e-08, ..., -1.8626e-09, 5.8487e-07, 6.1467e-08], [ 5.5879e-09, 1.8626e-09, 1.8626e-09, ..., 0.0000e+00, 1.8626e-09, 1.8626e-09], [ 7.4506e-09, 9.3132e-09, -9.3132e-09, ..., 1.8626e-09, 3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 374, bias, value: tensor([-0.0194, -0.0213, -0.0122, -0.0195, 0.0013, -0.0029, 0.0118, 0.0233, 0.0156, -0.0093], device='cuda:0'), grad: tensor([-5.2154e-08, -5.7995e-05, 5.1796e-05, 1.8626e-09, 9.3132e-09, 3.3528e-08, -1.0617e-07, 6.1244e-06, 1.4156e-07, 4.8429e-08], device='cuda:0') 100 0.0001 changing lr epoch 373, time 215.18, cls_loss 0.0009 cls_loss_mapping 0.0015 cls_loss_causal 0.4820 re_mapping 0.0043 re_causal 0.0119 /// teacc 99.04 lr 0.00010000 Epoch 375, weight, value: tensor([[-0.3171, 0.0818, -0.2062, ..., -0.1025, -0.2828, -0.1785], [-0.0071, 0.1068, -0.1280, ..., -0.1323, -0.0822, 0.1284], [ 0.0497, -0.1414, -0.1855, ..., -0.0011, -0.0645, -0.1184], ..., [ 0.1144, -0.0927, 0.1458, ..., 0.0768, 0.2232, -0.0226], [ 0.1291, -0.2448, -0.1616, ..., -0.3564, -0.1074, 0.2219], [-0.2068, 0.0921, 0.0701, ..., -0.3313, -0.1656, -0.0813]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 1.8626e-09, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [-2.8517e-06, 6.1467e-08, -1.5333e-05, ..., -5.1036e-07, -2.4229e-05, 1.8626e-09], [ 3.7253e-08, -3.9116e-08, 1.0729e-06, ..., 2.6077e-08, 1.6689e-06, 1.1176e-08], ..., [ 2.7698e-06, 3.3528e-08, 1.4298e-05, ..., 4.8615e-07, 2.2486e-05, 1.6764e-08], [-1.8626e-08, 1.1176e-08, 1.3039e-08, ..., 1.8626e-09, 1.8626e-09, -2.6077e-08], [ 7.4506e-09, -1.0245e-06, -1.2238e-06, ..., 0.0000e+00, 3.7253e-09, -2.4587e-07]], device='cuda:0') Epoch 375, bias, value: tensor([-0.0195, -0.0212, -0.0123, -0.0167, 0.0006, -0.0045, 0.0118, 0.0231, 0.0151, -0.0087], device='cuda:0'), grad: tensor([ 3.9116e-08, -8.6129e-05, 5.1670e-06, 8.3819e-08, 3.1553e-06, 4.4703e-08, 1.2666e-07, 8.0705e-05, 2.4214e-08, -3.3565e-06], device='cuda:0') 100 0.0001 changing lr epoch 374, time 215.09, cls_loss 0.0008 cls_loss_mapping 0.0021 cls_loss_causal 0.4786 re_mapping 0.0041 re_causal 0.0117 /// teacc 99.03 lr 0.00010000 Epoch 376, weight, value: tensor([[-0.3171, 0.0850, -0.2063, ..., -0.1032, -0.2830, -0.1756], [-0.0070, 0.1067, -0.1276, ..., -0.1323, -0.0821, 0.1287], [ 0.0498, -0.1416, -0.1856, ..., -0.0010, -0.0644, -0.1184], ..., [ 0.1142, -0.0940, 0.1454, ..., 0.0768, 0.2232, -0.0229], [ 0.1294, -0.2450, -0.1618, ..., -0.3566, -0.1076, 0.2223], [-0.2073, 0.0922, 0.0708, ..., -0.3316, -0.1655, -0.0818]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 3.7253e-09, 7.4506e-09, ..., 3.7253e-09, 0.0000e+00, 1.8626e-09], [-2.5451e-05, 1.8626e-08, -3.8475e-05, ..., 1.4901e-08, 5.5879e-09, -2.5600e-05], [ 9.8720e-08, 1.3970e-07, 2.1979e-07, ..., 1.5460e-07, 0.0000e+00, 1.4901e-08], ..., [ 2.5243e-05, 3.7253e-09, 3.8207e-05, ..., 5.5879e-09, -1.3039e-08, 2.5392e-05], [-1.8626e-09, 1.3039e-08, 3.1665e-08, ..., 0.0000e+00, 0.0000e+00, -3.7253e-09], [ 5.0291e-08, -8.1956e-08, -1.0245e-07, ..., 0.0000e+00, 5.5879e-09, 4.0978e-08]], device='cuda:0') Epoch 376, bias, value: tensor([-0.0165, -0.0210, -0.0122, -0.0168, 0.0008, -0.0045, 0.0091, 0.0227, 0.0153, -0.0086], device='cuda:0'), grad: tensor([ 1.6764e-08, -7.6413e-05, 4.8988e-07, -2.3469e-07, 2.6077e-07, 0.0000e+00, 7.4506e-09, 7.5936e-05, 6.5193e-08, -2.4773e-07], device='cuda:0') 100 0.0001 changing lr epoch 375, time 215.07, cls_loss 0.0007 cls_loss_mapping 0.0011 cls_loss_causal 0.4505 re_mapping 0.0041 re_causal 0.0114 /// teacc 98.97 lr 0.00010000 Epoch 377, weight, value: tensor([[-0.3177, 0.0850, -0.2063, ..., -0.1035, -0.2834, -0.1755], [-0.0070, 0.1067, -0.1275, ..., -0.1324, -0.0821, 0.1289], [ 0.0498, -0.1419, -0.1858, ..., -0.0010, -0.0644, -0.1184], ..., [ 0.1142, -0.0942, 0.1453, ..., 0.0768, 0.2232, -0.0231], [ 0.1300, -0.2454, -0.1623, ..., -0.3568, -0.1069, 0.2228], [-0.2066, 0.0924, 0.0715, ..., -0.3318, -0.1653, -0.0820]], device='cuda:0'), grad: tensor([[-7.4506e-09, -2.2352e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -9.3132e-09], [ 3.7253e-09, -1.8626e-09, 1.4901e-08, ..., 3.7253e-09, 5.5879e-09, -3.1665e-08], [ 3.7253e-09, 5.5879e-09, 5.5879e-09, ..., 3.7253e-09, 0.0000e+00, 1.8626e-09], ..., [-5.2154e-08, 3.7253e-09, -3.3528e-08, ..., -1.8626e-09, -1.6764e-08, 5.5879e-09], [ 3.7253e-09, 7.4506e-09, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09], [ 2.9802e-08, -1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 7.4506e-09, 1.1176e-08]], device='cuda:0') Epoch 377, bias, value: tensor([-0.0164, -0.0209, -0.0122, -0.0169, 0.0004, -0.0043, 0.0090, 0.0226, 0.0153, -0.0080], device='cuda:0'), grad: tensor([-8.1956e-08, -1.8626e-08, 3.1665e-08, -9.3132e-09, 2.4214e-08, 3.3528e-08, 5.5879e-09, -7.2643e-08, 2.7940e-08, 5.0291e-08], device='cuda:0') 100 0.0001 changing lr epoch 376, time 215.18, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.4847 re_mapping 0.0038 re_causal 0.0114 /// teacc 98.98 lr 0.00010000 Epoch 378, weight, value: tensor([[-0.3207, 0.0851, -0.2067, ..., -0.1067, -0.2866, -0.1755], [-0.0070, 0.1062, -0.1276, ..., -0.1326, -0.0822, 0.1289], [ 0.0498, -0.1422, -0.1859, ..., -0.0009, -0.0643, -0.1185], ..., [ 0.1142, -0.0949, 0.1453, ..., 0.0768, 0.2232, -0.0231], [ 0.1312, -0.2464, -0.1628, ..., -0.3569, -0.1072, 0.2238], [-0.2069, 0.0928, 0.0720, ..., -0.3322, -0.1653, -0.0823]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.0552e-06, 2.9057e-07, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 1.8626e-09, 1.8626e-09, ..., 1.8626e-09, 0.0000e+00, -5.5879e-09], [-0.0000e+00, 4.2841e-08, 1.8626e-09, ..., -1.8626e-09, -1.8626e-09, 0.0000e+00], ..., [ 1.8626e-09, 1.6764e-08, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 9.3132e-09], [-5.5879e-09, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -7.4506e-09], [ 1.8626e-09, -5.1931e-06, -2.9989e-07, ..., 0.0000e+00, 0.0000e+00, 1.8626e-08]], device='cuda:0') Epoch 378, bias, value: tensor([-0.0166, -0.0210, -0.0120, -0.0171, 0.0002, -0.0040, 0.0090, 0.0226, 0.0157, -0.0075], device='cuda:0'), grad: tensor([ 1.0908e-05, 5.5879e-09, 8.9407e-08, -1.8626e-09, 1.2293e-07, 4.4703e-08, -3.5390e-08, 4.8429e-08, 0.0000e+00, -1.1176e-05], device='cuda:0') 100 0.0001 changing lr epoch 377, time 215.12, cls_loss 0.0008 cls_loss_mapping 0.0010 cls_loss_causal 0.4715 re_mapping 0.0040 re_causal 0.0117 /// teacc 98.96 lr 0.00010000 Epoch 379, weight, value: tensor([[-3.2090e-01, 8.5054e-02, -2.0690e-01, ..., -1.0692e-01, -2.8684e-01, -1.7552e-01], [-7.0227e-03, 1.0596e-01, -1.2756e-01, ..., -1.3267e-01, -8.2172e-02, 1.2905e-01], [ 5.0169e-02, -1.4331e-01, -1.8508e-01, ..., -3.3537e-04, -6.3194e-02, -1.1866e-01], ..., [ 1.1397e-01, -9.5941e-02, 1.4515e-01, ..., 7.6159e-02, 2.2270e-01, -2.3162e-02], [ 1.3167e-01, -2.4664e-01, -1.6301e-01, ..., -3.5707e-01, -1.0748e-01, 2.2432e-01], [-2.0639e-01, 9.3577e-02, 7.3160e-02, ..., -3.3247e-01, -1.6465e-01, -8.1888e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -7.4506e-09, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 7.4506e-09, 9.3132e-09, 4.0978e-08, ..., 2.2352e-08, 1.8626e-09, -7.4506e-09], [-0.0000e+00, 5.5879e-09, 3.7253e-09, ..., 1.8626e-09, -0.0000e+00, 7.4506e-09], ..., [ 3.3528e-08, 9.4995e-08, 1.6391e-07, ..., -0.0000e+00, -1.8626e-09, 1.4901e-08], [-5.5879e-08, 3.7253e-09, 5.5879e-09, ..., 0.0000e+00, 0.0000e+00, -1.3039e-07], [-8.0094e-08, -2.3097e-07, -4.3027e-07, ..., 0.0000e+00, 0.0000e+00, 9.3132e-09]], device='cuda:0') Epoch 379, bias, value: tensor([-0.0166, -0.0210, -0.0116, -0.0174, -0.0009, -0.0038, 0.0090, 0.0223, 0.0158, -0.0063], device='cuda:0'), grad: tensor([ 3.7253e-09, 9.1270e-08, 5.0291e-08, -7.0781e-08, 3.7439e-07, 5.2154e-08, 3.5390e-08, 3.2969e-07, -1.3411e-07, -7.4133e-07], device='cuda:0') 100 0.0001 changing lr epoch 378, time 214.97, cls_loss 0.0009 cls_loss_mapping 0.0017 cls_loss_causal 0.4912 re_mapping 0.0040 re_causal 0.0114 /// teacc 99.00 lr 0.00010000 Epoch 380, weight, value: tensor([[-3.2104e-01, 8.5091e-02, -2.0715e-01, ..., -1.0707e-01, -2.8699e-01, -1.7549e-01], [-7.0416e-03, 1.0574e-01, -1.2761e-01, ..., -1.3292e-01, -8.2193e-02, 1.2908e-01], [ 5.0165e-02, -1.4372e-01, -1.8527e-01, ..., -3.0770e-04, -6.3153e-02, -1.1873e-01], ..., [ 1.1402e-01, -9.6256e-02, 1.4525e-01, ..., 7.6224e-02, 2.2275e-01, -2.3183e-02], [ 1.3200e-01, -2.4709e-01, -1.6363e-01, ..., -3.5737e-01, -1.0792e-01, 2.2467e-01], [-2.0747e-01, 9.3518e-02, 7.3351e-02, ..., -3.3430e-01, -1.6489e-01, -8.3039e-02]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -4.0978e-08, 3.7253e-09, ..., 5.5879e-09, 1.8626e-09, -5.5879e-09], [ 1.1176e-08, 2.2352e-08, 1.4901e-08, ..., 1.3039e-08, 5.5879e-09, 4.0978e-08], [ 1.4901e-08, 5.5879e-09, 0.0000e+00, ..., -2.0489e-08, -7.4506e-09, 5.5879e-09], ..., [-1.6764e-08, 1.1176e-08, -1.8626e-09, ..., -1.1176e-08, -1.6764e-08, 5.5879e-09], [ 2.0489e-08, 4.6566e-08, 1.0058e-07, ..., 3.7253e-09, 0.0000e+00, -1.4901e-08], [-3.1665e-08, -0.0000e+00, -1.2852e-07, ..., 0.0000e+00, 0.0000e+00, 4.2841e-08]], device='cuda:0') Epoch 380, bias, value: tensor([-0.0166, -0.0210, -0.0116, -0.0168, -0.0005, -0.0049, 0.0091, 0.0224, 0.0157, -0.0064], device='cuda:0'), grad: tensor([-6.3330e-08, 1.5087e-07, -1.5087e-07, 7.6368e-08, -1.5087e-07, -1.3039e-07, 1.6578e-07, 2.9802e-08, 3.3155e-07, -2.6077e-07], device='cuda:0') 100 0.0001 changing lr epoch 379, time 215.54, cls_loss 0.0009 cls_loss_mapping 0.0020 cls_loss_causal 0.4862 re_mapping 0.0040 re_causal 0.0116 /// teacc 99.02 lr 0.00010000 Epoch 381, weight, value: tensor([[-3.2150e-01, 8.5052e-02, -2.0748e-01, ..., -1.0733e-01, -2.8716e-01, -1.7559e-01], [-7.0471e-03, 1.0629e-01, -1.2764e-01, ..., -1.3314e-01, -8.2227e-02, 1.2934e-01], [ 5.0154e-02, -1.4411e-01, -1.8540e-01, ..., -3.1780e-04, -6.3162e-02, -1.1933e-01], ..., [ 1.1404e-01, -9.6640e-02, 1.4526e-01, ..., 7.6249e-02, 2.2279e-01, -2.3219e-02], [ 1.3233e-01, -2.4958e-01, -1.6541e-01, ..., -3.5779e-01, -1.0825e-01, 2.2400e-01], [-2.0801e-01, 9.3332e-02, 7.3749e-02, ..., -3.3552e-01, -1.6478e-01, -8.3964e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -1.8813e-07, -5.4017e-08, ..., 0.0000e+00, 0.0000e+00, -1.1399e-06], [-1.8626e-09, 1.5646e-07, 4.6566e-08, ..., -1.8626e-09, -1.8626e-09, 9.5740e-07], ..., [-1.8626e-09, 2.6077e-08, 3.7253e-09, ..., 1.8626e-09, -3.7253e-09, 1.5274e-07], [-3.7253e-09, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, 0.0000e+00, -5.5879e-09], [ 3.7253e-09, -1.8626e-09, 1.8626e-09, ..., 0.0000e+00, 5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 381, bias, value: tensor([-1.6681e-02, -2.0937e-02, -1.1697e-02, -1.5873e-02, 3.4713e-05, -6.1113e-03, 9.2421e-03, 2.2333e-02, 1.4960e-02, -6.4731e-03], device='cuda:0'), grad: tensor([ 0.0000e+00, -1.7956e-06, 1.5032e-06, 1.6950e-07, 1.6764e-08, -1.3411e-07, 5.5879e-09, 2.3842e-07, -5.5879e-09, 5.5879e-09], device='cuda:0') 100 0.0001 changing lr epoch 380, time 215.48, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.4610 re_mapping 0.0040 re_causal 0.0117 /// teacc 99.05 lr 0.00010000 Epoch 382, weight, value: tensor([[-3.2173e-01, 8.5094e-02, -2.0763e-01, ..., -1.0748e-01, -2.8725e-01, -1.7561e-01], [-7.0837e-03, 1.0636e-01, -1.2772e-01, ..., -1.3344e-01, -8.2278e-02, 1.2936e-01], [ 5.0151e-02, -1.4434e-01, -1.8566e-01, ..., -2.9652e-04, -6.3185e-02, -1.1940e-01], ..., [ 1.1434e-01, -9.6628e-02, 1.4596e-01, ..., 7.6293e-02, 2.2355e-01, -2.3219e-02], [ 1.3276e-01, -2.5015e-01, -1.6626e-01, ..., -3.5841e-01, -1.0843e-01, 2.2462e-01], [-2.1128e-01, 9.3384e-02, 7.1495e-02, ..., -3.3676e-01, -1.6789e-01, -8.4024e-02]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -1.3039e-08, 1.8626e-09, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, -1.8626e-09, 3.7253e-09, ..., 5.5879e-09, 3.7253e-09, -2.2352e-08], [-1.6764e-08, 1.3039e-08, 1.8626e-08, ..., -7.4506e-09, -9.3132e-09, 5.5879e-09], ..., [-1.8626e-09, 5.5879e-09, 1.8626e-09, ..., 1.8626e-09, 1.8626e-09, 9.3132e-09], [-1.8626e-09, 3.7253e-09, 1.8626e-09, ..., 1.8626e-09, 0.0000e+00, -5.5879e-09], [ 1.8626e-09, 8.0094e-08, -9.3132e-09, ..., 3.7253e-09, 0.0000e+00, 1.2480e-07]], device='cuda:0') Epoch 382, bias, value: tensor([-1.6638e-02, -2.0981e-02, -1.1682e-02, -1.5964e-02, 7.3005e-05, -6.2297e-03, 9.2144e-03, 2.2756e-02, 1.5060e-02, -8.4564e-03], device='cuda:0'), grad: tensor([-3.3528e-08, -1.1176e-08, -2.9802e-08, -2.0489e-08, -3.7998e-07, 1.3039e-08, 1.8626e-08, 2.9802e-08, 9.3132e-09, 3.7625e-07], device='cuda:0') 100 0.0001 changing lr epoch 381, time 215.31, cls_loss 0.0010 cls_loss_mapping 0.0018 cls_loss_causal 0.4774 re_mapping 0.0038 re_causal 0.0110 /// teacc 98.98 lr 0.00010000 Epoch 383, weight, value: tensor([[-3.2185e-01, 8.5087e-02, -2.0790e-01, ..., -1.0767e-01, -2.8729e-01, -1.7566e-01], [-7.0202e-03, 1.0707e-01, -1.2753e-01, ..., -1.3364e-01, -8.2255e-02, 1.2986e-01], [ 5.0204e-02, -1.4579e-01, -1.8580e-01, ..., -2.2696e-04, -6.3064e-02, -1.1952e-01], ..., [ 1.1427e-01, -9.7734e-02, 1.4579e-01, ..., 7.6208e-02, 2.2350e-01, -2.3501e-02], [ 1.3301e-01, -2.5067e-01, -1.6688e-01, ..., -3.5915e-01, -1.0884e-01, 2.2499e-01], [-2.1138e-01, 9.3380e-02, 7.1447e-02, ..., -3.3756e-01, -1.6792e-01, -8.5555e-02]], device='cuda:0'), grad: tensor([[ 1.4901e-08, -9.4995e-08, 5.5879e-09, ..., 0.0000e+00, 0.0000e+00, 2.0489e-08], [-8.6613e-07, -1.3895e-06, -2.2165e-07, ..., 0.0000e+00, 0.0000e+00, -1.2070e-06], [ 3.7253e-09, 2.0489e-08, 1.8626e-09, ..., -1.8626e-09, -1.8626e-09, 1.4901e-08], ..., [ 6.6310e-07, 1.0692e-06, 1.7136e-07, ..., 0.0000e+00, -1.8626e-09, 9.2573e-07], [ 9.3132e-09, 1.8626e-08, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 1.3039e-08], [ 1.1735e-07, 2.4959e-07, -7.8231e-08, ..., 0.0000e+00, 0.0000e+00, 1.9185e-07]], device='cuda:0') Epoch 383, bias, value: tensor([-0.0167, -0.0207, -0.0116, -0.0161, 0.0008, -0.0059, 0.0091, 0.0225, 0.0149, -0.0088], device='cuda:0'), grad: tensor([-8.9407e-08, -5.5693e-06, 6.8918e-08, 1.0990e-07, 2.2165e-07, 1.3784e-07, -1.4529e-07, 4.2878e-06, 8.7544e-08, 8.6986e-07], device='cuda:0') 100 0.0001 changing lr epoch 382, time 215.16, cls_loss 0.0009 cls_loss_mapping 0.0012 cls_loss_causal 0.4698 re_mapping 0.0041 re_causal 0.0116 /// teacc 99.08 lr 0.00010000 Epoch 384, weight, value: tensor([[-3.2198e-01, 8.5156e-02, -2.0801e-01, ..., -1.0773e-01, -2.8733e-01, -1.7554e-01], [-7.0267e-03, 1.0723e-01, -1.2756e-01, ..., -1.3396e-01, -8.2289e-02, 1.2996e-01], [ 5.0197e-02, -1.4623e-01, -1.8598e-01, ..., -2.0350e-04, -6.3064e-02, -1.1967e-01], ..., [ 1.1428e-01, -9.8728e-02, 1.4582e-01, ..., 7.6209e-02, 2.2354e-01, -2.3558e-02], [ 1.3367e-01, -2.5118e-01, -1.6694e-01, ..., -3.5866e-01, -1.0940e-01, 2.2573e-01], [-2.1146e-01, 9.2691e-02, 7.1362e-02, ..., -3.3812e-01, -1.6794e-01, -8.7762e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -5.2527e-07, 5.5879e-09, ..., 0.0000e+00, 0.0000e+00, -9.3132e-09], [ 0.0000e+00, 1.3039e-08, 1.6764e-08, ..., 7.4506e-09, 0.0000e+00, -7.4506e-09], [ 0.0000e+00, 2.0489e-08, 2.6077e-08, ..., 1.8626e-08, 0.0000e+00, 5.5879e-09], ..., [ 0.0000e+00, 9.3132e-09, 1.3039e-08, ..., 7.4506e-09, 0.0000e+00, 5.5879e-09], [ 0.0000e+00, 6.8918e-08, 2.7940e-08, ..., 5.5879e-09, 0.0000e+00, 5.5879e-09], [ 0.0000e+00, -4.4517e-07, -2.1607e-07, ..., 1.8626e-09, 0.0000e+00, -6.1467e-08]], device='cuda:0') Epoch 384, bias, value: tensor([-0.0166, -0.0207, -0.0116, -0.0161, 0.0020, -0.0061, 0.0092, 0.0225, 0.0150, -0.0094], device='cuda:0'), grad: tensor([-1.6969e-06, 5.4017e-08, 1.2107e-07, -1.1548e-07, 1.6056e-06, 6.3330e-08, 9.0897e-07, 5.0291e-08, 2.2352e-07, -1.2219e-06], device='cuda:0') 100 0.0001 changing lr epoch 383, time 215.27, cls_loss 0.0008 cls_loss_mapping 0.0013 cls_loss_causal 0.4602 re_mapping 0.0042 re_causal 0.0117 /// teacc 99.02 lr 0.00010000 Epoch 385, weight, value: tensor([[-3.2223e-01, 8.5241e-02, -2.0814e-01, ..., -1.0794e-01, -2.8739e-01, -1.7551e-01], [-6.8556e-03, 1.0541e-01, -1.2719e-01, ..., -1.3416e-01, -8.2304e-02, 1.3054e-01], [ 5.0199e-02, -1.4680e-01, -1.8611e-01, ..., -1.9148e-04, -6.3064e-02, -1.1973e-01], ..., [ 1.1415e-01, -9.9216e-02, 1.4543e-01, ..., 7.6219e-02, 2.2356e-01, -2.4042e-02], [ 1.3386e-01, -2.5168e-01, -1.6753e-01, ..., -3.5927e-01, -1.0972e-01, 2.2571e-01], [-2.1157e-01, 9.3685e-02, 7.1602e-02, ..., -3.3855e-01, -1.6795e-01, -8.9241e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -9.3132e-09, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, -1.8626e-09], [ 1.8626e-08, 7.0781e-08, 8.3819e-08, ..., 1.1176e-08, 3.7253e-09, 5.5879e-09], [ 1.8626e-09, 1.8626e-09, 7.4506e-09, ..., -3.7253e-09, 1.8626e-09, 0.0000e+00], ..., [-4.8429e-08, 3.7253e-09, -3.3528e-08, ..., -1.8626e-09, -2.0489e-08, 0.0000e+00], [-0.0000e+00, 1.4901e-08, 1.8626e-08, ..., 3.7253e-09, 0.0000e+00, -1.8626e-09], [ 1.8626e-08, -1.2293e-07, -1.0617e-07, ..., 3.7253e-09, 9.3132e-09, -5.5879e-09]], device='cuda:0') Epoch 385, bias, value: tensor([-0.0165, -0.0206, -0.0116, -0.0162, 0.0029, -0.0061, 0.0091, 0.0222, 0.0147, -0.0089], device='cuda:0'), grad: tensor([-3.5390e-08, 4.2468e-07, -5.5879e-09, -5.7742e-08, 6.8918e-08, 2.0489e-08, 9.4995e-08, -5.7742e-08, 7.8231e-08, -5.4576e-07], device='cuda:0') 100 0.0001 changing lr epoch 384, time 215.02, cls_loss 0.0007 cls_loss_mapping 0.0014 cls_loss_causal 0.4278 re_mapping 0.0038 re_causal 0.0109 /// teacc 99.06 lr 0.00010000 Epoch 386, weight, value: tensor([[-3.2140e-01, 8.5284e-02, -2.0821e-01, ..., -1.0810e-01, -2.8746e-01, -1.7543e-01], [-6.8479e-03, 1.0463e-01, -1.2720e-01, ..., -1.3442e-01, -8.2278e-02, 1.3073e-01], [ 5.0274e-02, -1.4742e-01, -1.8612e-01, ..., -1.0763e-04, -6.2974e-02, -1.1990e-01], ..., [ 1.1411e-01, -9.9288e-02, 1.4542e-01, ..., 7.6160e-02, 2.2351e-01, -2.4199e-02], [ 1.3429e-01, -2.5230e-01, -1.6791e-01, ..., -3.5979e-01, -1.0993e-01, 2.2599e-01], [-2.1176e-01, 9.3991e-02, 7.1691e-02, ..., -3.3900e-01, -1.6797e-01, -9.0394e-02]], device='cuda:0'), grad: tensor([[ 3.7253e-09, -1.8626e-09, 3.7253e-09, ..., 3.7253e-09, 0.0000e+00, 1.8626e-09], [-1.9930e-07, 0.0000e+00, -6.9104e-07, ..., 1.3039e-08, -9.5740e-07, -1.6484e-06], [-1.8626e-08, 5.5879e-09, 7.4506e-08, ..., 4.8429e-08, 7.4506e-09, 1.1176e-08], ..., [ 2.3097e-07, 1.8626e-09, 7.0222e-07, ..., 5.5879e-09, 9.4622e-07, 1.6857e-06], [-3.5390e-08, 1.8626e-09, 2.0489e-08, ..., 3.1665e-08, 1.8626e-09, -1.3225e-07], [ 2.2352e-08, 1.8626e-09, 5.5879e-09, ..., 3.7253e-09, 3.7253e-09, 3.3528e-08]], device='cuda:0') Epoch 386, bias, value: tensor([-0.0164, -0.0206, -0.0114, -0.0162, 0.0035, -0.0061, 0.0090, 0.0221, 0.0141, -0.0087], device='cuda:0'), grad: tensor([ 1.6764e-08, -2.8275e-06, 1.0431e-07, -3.0175e-07, -7.4506e-09, 1.1921e-07, -2.0489e-08, 2.9597e-06, -1.2107e-07, 8.1956e-08], device='cuda:0') 100 0.0001 changing lr epoch 385, time 215.10, cls_loss 0.0007 cls_loss_mapping 0.0008 cls_loss_causal 0.4783 re_mapping 0.0039 re_causal 0.0124 /// teacc 99.03 lr 0.00010000 Epoch 387, weight, value: tensor([[-3.2151e-01, 8.5298e-02, -2.0825e-01, ..., -1.0823e-01, -2.8753e-01, -1.7545e-01], [-6.9461e-03, 1.0449e-01, -1.2734e-01, ..., -1.3466e-01, -8.2411e-02, 1.3080e-01], [ 5.0275e-02, -1.4783e-01, -1.8648e-01, ..., -1.9148e-04, -6.2983e-02, -1.1996e-01], ..., [ 1.1419e-01, -9.9868e-02, 1.4543e-01, ..., 7.5978e-02, 2.2364e-01, -2.4203e-02], [ 1.3453e-01, -2.5296e-01, -1.6823e-01, ..., -3.6009e-01, -1.1003e-01, 2.2595e-01], [-2.1183e-01, 9.4231e-02, 7.1800e-02, ..., -3.3933e-01, -1.6798e-01, -9.0938e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 5.5879e-09, 3.1665e-08, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-7.4506e-09, 7.4506e-09, 1.1176e-08, ..., 0.0000e+00, 0.0000e+00, -1.6764e-08], [ 7.4506e-09, -1.0990e-07, -1.8440e-07, ..., 0.0000e+00, 0.0000e+00, 1.4901e-08]], device='cuda:0') Epoch 387, bias, value: tensor([-0.0164, -0.0207, -0.0115, -0.0157, 0.0038, -0.0061, 0.0090, 0.0221, 0.0139, -0.0087], device='cuda:0'), grad: tensor([ 5.5879e-09, 9.3132e-09, 5.5879e-09, 9.3132e-09, 2.6822e-07, -1.8626e-08, 1.6764e-08, 6.8918e-08, -9.3132e-09, -3.5204e-07], device='cuda:0') 100 0.0001 changing lr epoch 386, time 214.84, cls_loss 0.0007 cls_loss_mapping 0.0009 cls_loss_causal 0.4506 re_mapping 0.0040 re_causal 0.0115 /// teacc 99.07 lr 0.00010000 Epoch 388, weight, value: tensor([[-3.2127e-01, 8.5353e-02, -2.0822e-01, ..., -1.0829e-01, -2.8754e-01, -1.7547e-01], [-6.9585e-03, 1.0458e-01, -1.2733e-01, ..., -1.3500e-01, -8.2464e-02, 1.3088e-01], [ 5.0291e-02, -1.4857e-01, -1.8662e-01, ..., -1.7818e-04, -6.2985e-02, -1.2002e-01], ..., [ 1.1420e-01, -9.9980e-02, 1.4543e-01, ..., 7.5999e-02, 2.2369e-01, -2.4268e-02], [ 1.3485e-01, -2.5335e-01, -1.6865e-01, ..., -3.6163e-01, -1.1037e-01, 2.2667e-01], [-2.1204e-01, 9.4008e-02, 7.1792e-02, ..., -3.4015e-01, -1.6800e-01, -9.1826e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 3.7253e-09, ..., 1.8626e-09, 0.0000e+00, 1.8626e-09], [ 5.9605e-08, -1.6764e-08, 8.7544e-08, ..., 4.4703e-08, 4.0978e-08, -6.3330e-08], [ 3.1665e-08, 1.8626e-09, 4.2841e-08, ..., 2.0489e-08, 2.0489e-08, 5.5879e-09], ..., [-1.0990e-07, 2.0489e-08, -1.4715e-07, ..., -6.7055e-08, -7.6368e-08, 4.2841e-08], [-1.8626e-09, 1.1176e-08, 1.6764e-08, ..., 3.7253e-09, 1.8626e-09, -7.4506e-09], [ 9.3132e-09, -2.0489e-08, -1.3039e-08, ..., 5.5879e-09, 7.4506e-09, 9.3132e-09]], device='cuda:0') Epoch 388, bias, value: tensor([-0.0164, -0.0207, -0.0115, -0.0158, 0.0040, -0.0058, 0.0090, 0.0221, 0.0140, -0.0090], device='cuda:0'), grad: tensor([ 9.3132e-09, 5.2154e-08, 9.1270e-08, -1.1921e-07, 5.5879e-08, 1.3225e-07, 1.8626e-09, -1.9185e-07, 4.4703e-08, -6.3330e-08], device='cuda:0') 100 0.0001 changing lr epoch 387, time 214.84, cls_loss 0.0012 cls_loss_mapping 0.0020 cls_loss_causal 0.4720 re_mapping 0.0040 re_causal 0.0112 /// teacc 99.05 lr 0.00010000 Epoch 389, weight, value: tensor([[-3.1951e-01, 8.5384e-02, -2.0859e-01, ..., -1.0855e-01, -2.8762e-01, -1.7579e-01], [-7.0103e-03, 1.0452e-01, -1.2755e-01, ..., -1.3572e-01, -8.2519e-02, 1.3087e-01], [ 5.0256e-02, -1.4904e-01, -1.8703e-01, ..., -1.7403e-04, -6.2967e-02, -1.2028e-01], ..., [ 1.1423e-01, -1.0038e-01, 1.4543e-01, ..., 7.6065e-02, 2.2373e-01, -2.4355e-02], [ 1.3716e-01, -2.5459e-01, -1.6936e-01, ..., -3.6219e-01, -1.0988e-01, 2.2842e-01], [-2.1236e-01, 9.3834e-02, 7.3113e-02, ..., -3.4131e-01, -1.6792e-01, -9.3531e-02]], device='cuda:0'), grad: tensor([[ 3.7253e-08, 0.0000e+00, 1.1176e-08, ..., 5.2154e-08, 2.9802e-08, 7.4506e-09], [ 7.0781e-08, 1.3039e-08, 4.2841e-08, ..., 1.0617e-07, 5.5879e-08, -3.3267e-06], [-2.8871e-07, 9.3132e-09, 2.9802e-08, ..., -3.2596e-07, -2.3469e-07, 2.8927e-06], ..., [ 1.0803e-07, 9.3132e-09, 3.7253e-09, ..., 1.5274e-07, 9.1270e-08, 3.2037e-07], [ 1.8626e-08, 9.3132e-09, 2.6077e-08, ..., 5.0291e-08, 2.0489e-08, 1.3039e-08], [ 7.4506e-09, 1.6764e-08, 1.1176e-08, ..., 2.2352e-08, 5.5879e-09, 1.8626e-09]], device='cuda:0') Epoch 389, bias, value: tensor([-0.0167, -0.0209, -0.0115, -0.0158, 0.0050, -0.0061, 0.0096, 0.0220, 0.0144, -0.0087], device='cuda:0'), grad: tensor([ 2.1607e-07, -6.7353e-06, 4.8950e-06, -2.5146e-07, 9.4995e-08, -4.0978e-08, 1.8068e-07, 1.2927e-06, 2.1793e-07, 1.0431e-07], device='cuda:0') 100 0.0001 changing lr epoch 388, time 214.88, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.4599 re_mapping 0.0041 re_causal 0.0115 /// teacc 99.01 lr 0.00010000 Epoch 390, weight, value: tensor([[-3.1966e-01, 8.5417e-02, -2.0882e-01, ..., -1.0883e-01, -2.8767e-01, -1.7586e-01], [-7.1106e-03, 1.0464e-01, -1.2756e-01, ..., -1.3652e-01, -8.2541e-02, 1.3091e-01], [ 5.0391e-02, -1.4952e-01, -1.8761e-01, ..., -2.2423e-04, -6.2958e-02, -1.2013e-01], ..., [ 1.1427e-01, -1.0022e-01, 1.4548e-01, ..., 7.6097e-02, 2.2379e-01, -2.4361e-02], [ 1.3762e-01, -2.5477e-01, -1.7044e-01, ..., -3.6356e-01, -1.1054e-01, 2.2908e-01], [-2.1252e-01, 9.3816e-02, 7.3209e-02, ..., -3.4229e-01, -1.6796e-01, -9.4384e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.8626e-09, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 4.6566e-09, 2.3283e-07, 1.1176e-08, ..., 2.7940e-09, 3.7253e-09, 9.4995e-08], [ 1.0245e-08, 9.3132e-10, 9.3132e-10, ..., -9.3132e-10, -0.0000e+00, 1.7695e-08], ..., [-7.4506e-09, 1.3039e-08, -1.3970e-08, ..., -7.4506e-09, -8.3819e-09, 1.1176e-08], [-2.9802e-08, 2.7940e-09, 5.5879e-09, ..., 0.0000e+00, 0.0000e+00, -4.4703e-08], [ 9.3132e-10, -9.3132e-10, -2.1420e-08, ..., 9.3132e-10, 9.3132e-10, 3.7253e-09]], device='cuda:0') Epoch 390, bias, value: tensor([-0.0167, -0.0212, -0.0109, -0.0156, 0.0048, -0.0061, 0.0096, 0.0220, 0.0144, -0.0089], device='cuda:0'), grad: tensor([-2.7940e-09, 6.0536e-07, 2.0489e-08, 8.3819e-09, -6.6776e-07, 1.9558e-08, 7.2643e-08, 1.7695e-08, -3.5390e-08, -3.5390e-08], device='cuda:0') 100 0.0001 changing lr epoch 389, time 214.91, cls_loss 0.0009 cls_loss_mapping 0.0010 cls_loss_causal 0.4698 re_mapping 0.0041 re_causal 0.0113 /// teacc 99.06 lr 0.00010000 Epoch 391, weight, value: tensor([[-3.1827e-01, 8.5570e-02, -2.0935e-01, ..., -1.0915e-01, -2.8779e-01, -1.7577e-01], [-7.1549e-03, 1.0479e-01, -1.2751e-01, ..., -1.3688e-01, -8.2548e-02, 1.3108e-01], [ 5.0438e-02, -1.5051e-01, -1.8783e-01, ..., -2.1166e-04, -6.2938e-02, -1.2037e-01], ..., [ 1.1429e-01, -1.0098e-01, 1.4552e-01, ..., 7.6134e-02, 2.2383e-01, -2.4480e-02], [ 1.3830e-01, -2.5533e-01, -1.7153e-01, ..., -3.6347e-01, -1.1119e-01, 2.2973e-01], [-2.1300e-01, 9.3569e-02, 7.3345e-02, ..., -3.4356e-01, -1.6803e-01, -9.6333e-02]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -2.7940e-09, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 1.0245e-08, 1.8626e-09, 1.8626e-09, ..., 1.8626e-09, 1.8626e-09, 1.6764e-08], [-6.5193e-09, 9.3132e-10, 0.0000e+00, ..., -9.3132e-09, -6.5193e-09, 1.8626e-09], ..., [ 7.4506e-09, 2.7940e-09, -0.0000e+00, ..., 3.7253e-09, 1.8626e-09, 1.3039e-08], [-1.2107e-07, 9.3132e-10, -1.3039e-08, ..., 1.8626e-09, 9.3132e-10, -2.1514e-07], [ 9.9652e-08, 1.8347e-07, -4.6566e-09, ..., 9.3132e-10, 9.3132e-10, 2.0955e-07]], device='cuda:0') Epoch 391, bias, value: tensor([-0.0165, -0.0214, -0.0105, -0.0156, 0.0049, -0.0060, 0.0096, 0.0220, 0.0144, -0.0094], device='cuda:0'), grad: tensor([ 2.5146e-08, 3.5390e-08, -3.2596e-08, 9.3132e-09, -8.5123e-07, 2.7940e-09, -2.1420e-08, 4.0978e-08, -3.8557e-07, 1.1800e-06], device='cuda:0') 100 0.0001 changing lr epoch 390, time 214.75, cls_loss 0.0008 cls_loss_mapping 0.0011 cls_loss_causal 0.4912 re_mapping 0.0039 re_causal 0.0116 /// teacc 99.01 lr 0.00010000 Epoch 392, weight, value: tensor([[-3.1837e-01, 8.5508e-02, -2.0998e-01, ..., -1.0934e-01, -2.8786e-01, -1.7579e-01], [-7.1612e-03, 1.0487e-01, -1.2753e-01, ..., -1.3714e-01, -8.2639e-02, 1.3131e-01], [ 5.0486e-02, -1.5131e-01, -1.8801e-01, ..., -1.1163e-04, -6.2933e-02, -1.2063e-01], ..., [ 1.1431e-01, -1.0183e-01, 1.4556e-01, ..., 7.6137e-02, 2.2393e-01, -2.4623e-02], [ 1.3811e-01, -2.5578e-01, -1.7251e-01, ..., -3.6563e-01, -1.1174e-01, 2.3035e-01], [-2.1317e-01, 9.3884e-02, 7.3522e-02, ..., -3.4447e-01, -1.6805e-01, -9.6865e-02]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 1.8626e-09, 2.7940e-09, ..., 9.3132e-10, 0.0000e+00, 4.6566e-09], [ 9.2201e-08, -2.7940e-08, 1.3411e-07, ..., 2.4214e-08, 1.2014e-07, -3.5390e-08], [ 1.3039e-08, 5.5879e-09, 1.5832e-08, ..., 1.1176e-08, 1.1176e-08, 1.3039e-08], ..., [-1.1548e-07, 1.3039e-08, -1.6578e-07, ..., -3.5390e-08, -1.4994e-07, -3.7253e-09], [-1.8626e-08, 6.5193e-09, 9.3132e-09, ..., 2.7940e-09, 0.0000e+00, -1.7695e-08], [ 2.5146e-08, -2.2352e-08, -1.6764e-08, ..., 3.7253e-09, 1.5832e-08, 2.1420e-08]], device='cuda:0') Epoch 392, bias, value: tensor([-0.0166, -0.0213, -0.0104, -0.0159, 0.0046, -0.0059, 0.0097, 0.0220, 0.0143, -0.0092], device='cuda:0'), grad: tensor([ 1.6764e-08, 5.2154e-08, 6.6124e-08, 4.4703e-08, 2.5146e-08, 7.4506e-09, 1.8626e-08, -1.9185e-07, -7.4506e-09, -3.6322e-08], device='cuda:0') 100 0.0001 changing lr epoch 391, time 214.82, cls_loss 0.0009 cls_loss_mapping 0.0020 cls_loss_causal 0.4509 re_mapping 0.0039 re_causal 0.0114 /// teacc 98.96 lr 0.00010000 Epoch 393, weight, value: tensor([[-3.1842e-01, 8.5310e-02, -2.1026e-01, ..., -1.0985e-01, -2.8795e-01, -1.7623e-01], [-7.7054e-03, 1.0587e-01, -1.2821e-01, ..., -1.3764e-01, -8.3463e-02, 1.3148e-01], [ 5.0429e-02, -1.5453e-01, -1.8836e-01, ..., -1.9264e-04, -6.2962e-02, -1.2185e-01], ..., [ 1.1483e-01, -1.0171e-01, 1.4627e-01, ..., 7.6206e-02, 2.2472e-01, -2.4285e-02], [ 1.3800e-01, -2.5657e-01, -1.7511e-01, ..., -3.6661e-01, -1.1415e-01, 2.3035e-01], [-2.1343e-01, 9.5009e-02, 7.4804e-02, ..., -3.4556e-01, -1.6811e-01, -9.7190e-02]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -4.0885e-07, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09], [ 4.6566e-09, 1.7695e-08, 6.5193e-09, ..., 9.3132e-10, 2.7940e-09, 4.8429e-08], [ 2.7940e-09, 8.3819e-09, 1.8626e-09, ..., 0.0000e+00, 1.8626e-09, 3.7253e-09], ..., [-1.0990e-07, 1.5832e-08, -1.6578e-07, ..., -9.3132e-10, -1.7043e-07, 2.0489e-08], [ 9.3132e-10, 1.0245e-08, 2.7940e-09, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 7.4506e-09, 3.8743e-07, -1.8626e-08, ..., 0.0000e+00, 6.5193e-09, 2.0210e-07]], device='cuda:0') Epoch 393, bias, value: tensor([-0.0169, -0.0215, -0.0109, -0.0153, 0.0018, -0.0063, 0.0100, 0.0225, 0.0138, -0.0084], device='cuda:0'), grad: tensor([-8.7451e-07, 1.8720e-07, 2.8871e-08, 1.0245e-08, -4.7591e-07, 4.6566e-09, 2.0489e-08, -3.8277e-07, 3.9116e-08, 1.4268e-06], device='cuda:0') 100 0.0001 changing lr epoch 392, time 214.79, cls_loss 0.0010 cls_loss_mapping 0.0010 cls_loss_causal 0.4538 re_mapping 0.0041 re_causal 0.0112 /// teacc 99.04 lr 0.00010000 Epoch 394, weight, value: tensor([[-3.1859e-01, 8.5213e-02, -2.1132e-01, ..., -1.1034e-01, -2.8824e-01, -1.7630e-01], [-7.8000e-03, 1.0551e-01, -1.2843e-01, ..., -1.3819e-01, -8.3611e-02, 1.3137e-01], [ 5.0432e-02, -1.5506e-01, -1.8851e-01, ..., -1.6036e-04, -6.2941e-02, -1.2204e-01], ..., [ 1.1493e-01, -1.0225e-01, 1.4650e-01, ..., 7.6275e-02, 2.2489e-01, -2.4219e-02], [ 1.3902e-01, -2.5729e-01, -1.7533e-01, ..., -3.6667e-01, -1.1460e-01, 2.3135e-01], [-2.1368e-01, 9.5764e-02, 7.5636e-02, ..., -3.4778e-01, -1.6819e-01, -1.0165e-01]], device='cuda:0'), grad: tensor([[ 6.5193e-09, -1.7695e-08, 4.6566e-09, ..., 7.4506e-09, 9.3132e-10, 0.0000e+00], [ 1.7695e-08, 7.4506e-09, 1.6764e-08, ..., 4.1910e-08, 1.1176e-08, -1.8626e-09], [-7.1712e-08, 2.7940e-09, 4.6566e-09, ..., -5.2154e-08, -1.8626e-08, -0.0000e+00], ..., [ 4.6566e-09, 4.6566e-09, 8.3819e-09, ..., 1.3039e-08, 1.8626e-09, 9.3132e-10], [ 2.7940e-08, 5.5879e-09, 6.5193e-09, ..., 1.6764e-08, 1.8626e-09, 9.3132e-10], [ 1.8626e-09, 8.3819e-09, 5.5879e-09, ..., 6.5193e-09, 0.0000e+00, 2.7940e-09]], device='cuda:0') Epoch 394, bias, value: tensor([-0.0171, -0.0217, -0.0109, -0.0159, 0.0025, -0.0058, 0.0100, 0.0226, 0.0142, -0.0081], device='cuda:0'), grad: tensor([-4.4703e-08, 1.6391e-07, -4.7963e-07, -3.6508e-07, 3.7253e-09, 3.2317e-07, 1.1269e-07, 4.9360e-08, 2.0396e-07, 4.0978e-08], device='cuda:0') 100 0.0001 changing lr epoch 393, time 215.01, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.4731 re_mapping 0.0040 re_causal 0.0114 /// teacc 99.05 lr 0.00010000 Epoch 395, weight, value: tensor([[-3.1851e-01, 8.5286e-02, -2.1150e-01, ..., -1.1049e-01, -2.8841e-01, -1.7626e-01], [-7.9529e-03, 1.0539e-01, -1.2859e-01, ..., -1.3847e-01, -8.3835e-02, 1.3134e-01], [ 5.0488e-02, -1.5568e-01, -1.8866e-01, ..., -1.5392e-04, -6.2885e-02, -1.2225e-01], ..., [ 1.1505e-01, -1.0263e-01, 1.4671e-01, ..., 7.6320e-02, 2.2511e-01, -2.4177e-02], [ 1.3889e-01, -2.5836e-01, -1.7716e-01, ..., -3.6749e-01, -1.1583e-01, 2.3117e-01], [-2.1397e-01, 9.5773e-02, 7.5640e-02, ..., -3.5120e-01, -1.6831e-01, -1.0190e-01]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 9.3132e-10, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, 1.8626e-09], [ 2.1420e-08, -6.5193e-08, 2.2352e-08, ..., 1.1176e-08, 1.9558e-08, -5.4017e-08], [-4.6566e-09, 1.9558e-08, 6.5193e-09, ..., -9.3132e-09, -5.5879e-09, 1.2107e-08], ..., [-5.0291e-08, 3.6322e-08, -5.4017e-08, ..., -1.6764e-08, -4.4703e-08, 3.0734e-08], [ 4.6566e-09, 7.4506e-09, 2.7940e-09, ..., 3.7253e-09, 4.6566e-09, 3.7253e-09], [ 1.3970e-08, 5.5879e-09, 1.4901e-08, ..., 6.5193e-09, 1.2107e-08, 5.5879e-09]], device='cuda:0') Epoch 395, bias, value: tensor([-0.0170, -0.0218, -0.0108, -0.0159, 0.0026, -0.0056, 0.0100, 0.0227, 0.0133, -0.0082], device='cuda:0'), grad: tensor([ 1.1176e-08, -1.2945e-07, -4.7497e-08, 2.8871e-08, 9.3132e-09, -5.0291e-08, 2.3283e-08, 4.7497e-08, 5.3085e-08, 5.9605e-08], device='cuda:0') 100 0.0001 changing lr epoch 394, time 215.10, cls_loss 0.0008 cls_loss_mapping 0.0013 cls_loss_causal 0.4515 re_mapping 0.0039 re_causal 0.0111 /// teacc 98.99 lr 0.00010000 Epoch 396, weight, value: tensor([[-3.1988e-01, 8.5339e-02, -2.1177e-01, ..., -1.1277e-01, -2.9076e-01, -1.7627e-01], [-7.9469e-03, 1.0576e-01, -1.2862e-01, ..., -1.3876e-01, -8.3839e-02, 1.3153e-01], [ 5.0514e-02, -1.5625e-01, -1.8890e-01, ..., -6.2472e-05, -6.2731e-02, -1.2250e-01], ..., [ 1.1513e-01, -1.0362e-01, 1.4694e-01, ..., 7.6377e-02, 2.2526e-01, -2.4261e-02], [ 1.3886e-01, -2.5922e-01, -1.7826e-01, ..., -3.6872e-01, -1.1758e-01, 2.3123e-01], [-2.1476e-01, 9.5815e-02, 7.5207e-02, ..., -3.5395e-01, -1.6886e-01, -1.0220e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -5.5879e-09, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 1.2107e-08, 1.8626e-09, 1.3039e-08, ..., 1.0245e-08, 1.0245e-08, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 3.7253e-09, ..., -0.0000e+00, -9.3132e-10, 0.0000e+00], ..., [-1.7695e-08, -2.7940e-09, -2.0489e-08, ..., -1.3970e-08, -1.3039e-08, 9.3132e-10], [-0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -1.8626e-09], [ 9.3132e-10, 4.6566e-09, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 9.3132e-10]], device='cuda:0') Epoch 396, bias, value: tensor([-0.0171, -0.0218, -0.0107, -0.0159, 0.0023, -0.0050, 0.0097, 0.0227, 0.0126, -0.0085], device='cuda:0'), grad: tensor([-8.3819e-09, 3.2596e-08, -7.4506e-09, 3.6322e-08, 9.3132e-10, -2.6077e-08, -2.7940e-09, -3.9116e-08, 2.7940e-09, 1.4901e-08], device='cuda:0') 100 0.0001 changing lr epoch 395, time 214.79, cls_loss 0.0009 cls_loss_mapping 0.0012 cls_loss_causal 0.4820 re_mapping 0.0038 re_causal 0.0115 /// teacc 99.04 lr 0.00010000 Epoch 397, weight, value: tensor([[-0.3201, 0.0854, -0.2120, ..., -0.1135, -0.2909, -0.1763], [-0.0081, 0.1058, -0.1288, ..., -0.1398, -0.0840, 0.1315], [ 0.0503, -0.1556, -0.1908, ..., -0.0004, -0.0631, -0.1226], ..., [ 0.1154, -0.1037, 0.1474, ..., 0.0768, 0.2256, -0.0243], [ 0.1399, -0.2599, -0.1789, ..., -0.3695, -0.1184, 0.2324], [-0.2150, 0.0957, 0.0754, ..., -0.3557, -0.1690, -0.1026]], device='cuda:0'), grad: tensor([[ 4.6566e-09, -9.3132e-09, 1.8626e-09, ..., 3.7253e-09, 0.0000e+00, 4.6566e-09], [ 3.0734e-08, 6.5193e-09, 2.3283e-08, ..., 2.3283e-08, 1.2107e-08, -3.1665e-08], [-2.7940e-07, 7.4506e-09, 6.5193e-09, ..., 6.5193e-09, 1.8626e-09, -3.5390e-08], ..., [ 3.7253e-08, 5.0291e-08, -9.3132e-09, ..., 6.5193e-09, -1.7695e-08, 7.5437e-08], [ 1.0245e-07, 4.6566e-09, 4.6566e-09, ..., 5.5879e-09, 9.3132e-10, 1.9558e-08], [ 3.7253e-09, 1.8626e-08, 2.7940e-09, ..., 9.3132e-10, 1.8626e-09, 1.6764e-08]], device='cuda:0') Epoch 397, bias, value: tensor([-0.0171, -0.0219, -0.0109, -0.0161, 0.0019, -0.0036, 0.0094, 0.0230, 0.0133, -0.0087], device='cuda:0'), grad: tensor([ 1.8626e-09, 6.7055e-08, -5.0012e-07, -2.0582e-07, -1.3784e-07, 8.0094e-08, 1.1176e-07, 2.7660e-07, 2.0862e-07, 7.5437e-08], device='cuda:0') 100 0.0001 changing lr epoch 396, time 214.92, cls_loss 0.0009 cls_loss_mapping 0.0012 cls_loss_causal 0.4638 re_mapping 0.0041 re_causal 0.0116 /// teacc 99.08 lr 0.00010000 Epoch 398, weight, value: tensor([[-0.3209, 0.0854, -0.2121, ..., -0.1138, -0.2910, -0.1766], [-0.0081, 0.1058, -0.1288, ..., -0.1403, -0.0841, 0.1316], [ 0.0503, -0.1559, -0.1910, ..., -0.0004, -0.0631, -0.1227], ..., [ 0.1155, -0.1049, 0.1475, ..., 0.0769, 0.2257, -0.0243], [ 0.1412, -0.2612, -0.1798, ..., -0.3706, -0.1199, 0.2340], [-0.2156, 0.0956, 0.0754, ..., -0.3575, -0.1691, -0.1043]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -7.8231e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [-3.7253e-09, 1.8626e-09, -1.8626e-09, ..., -1.8626e-09, -3.7253e-09, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 3.1665e-08, -3.7253e-08, ..., 0.0000e+00, 1.8626e-09, -0.0000e+00]], device='cuda:0') Epoch 398, bias, value: tensor([-0.0173, -0.0219, -0.0109, -0.0161, 0.0025, -0.0036, 0.0097, 0.0230, 0.0136, -0.0091], device='cuda:0'), grad: tensor([-1.4529e-07, 1.8626e-09, 1.8626e-09, 1.8626e-09, 1.1921e-07, 1.1176e-08, 1.8626e-09, 0.0000e+00, 1.3039e-08, -7.4506e-09], device='cuda:0') 100 0.0001 changing lr epoch 397, time 214.71, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.4579 re_mapping 0.0040 re_causal 0.0119 /// teacc 99.08 lr 0.00010000 Epoch 399, weight, value: tensor([[-0.3205, 0.0864, -0.2120, ..., -0.1133, -0.2910, -0.1765], [-0.0081, 0.1055, -0.1289, ..., -0.1407, -0.0841, 0.1316], [ 0.0503, -0.1562, -0.1911, ..., -0.0004, -0.0631, -0.1227], ..., [ 0.1155, -0.1053, 0.1475, ..., 0.0770, 0.2258, -0.0243], [ 0.1414, -0.2626, -0.1806, ..., -0.3718, -0.1204, 0.2341], [-0.2157, 0.0953, 0.0755, ..., -0.3585, -0.1691, -0.1046]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -9.3132e-09, -1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -3.7253e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-09, 9.3132e-09, ..., 0.0000e+00, 1.8626e-09, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09], [-0.0000e+00, -5.5879e-09, -1.6764e-08, ..., 0.0000e+00, -3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 399, bias, value: tensor([-0.0155, -0.0220, -0.0109, -0.0182, 0.0024, -0.0018, 0.0082, 0.0230, 0.0128, -0.0092], device='cuda:0'), grad: tensor([ 1.6764e-08, 7.4506e-09, 2.2352e-08, 7.4506e-09, 3.7998e-07, 1.1176e-07, -5.9232e-07, 3.1665e-08, 4.0978e-08, -2.2352e-08], device='cuda:0') 100 0.0001 changing lr epoch 398, time 214.96, cls_loss 0.0007 cls_loss_mapping 0.0013 cls_loss_causal 0.4737 re_mapping 0.0039 re_causal 0.0116 /// teacc 99.02 lr 0.00010000 Epoch 400, weight, value: tensor([[-0.3203, 0.0867, -0.2122, ..., -0.1133, -0.2912, -0.1762], [-0.0082, 0.1055, -0.1289, ..., -0.1408, -0.0842, 0.1316], [ 0.0503, -0.1564, -0.1911, ..., -0.0004, -0.0630, -0.1225], ..., [ 0.1155, -0.1061, 0.1476, ..., 0.0769, 0.2258, -0.0244], [ 0.1415, -0.2638, -0.1821, ..., -0.3727, -0.1219, 0.2342], [-0.2158, 0.0948, 0.0757, ..., -0.3588, -0.1691, -0.1057]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 1.1176e-08, -1.8626e-09, 1.1176e-08, ..., 1.8626e-09, 1.3039e-08, -5.5879e-09], [-1.8626e-09, 0.0000e+00, 5.5879e-09, ..., -3.7253e-09, 3.7253e-09, 0.0000e+00], ..., [-6.1467e-08, -0.0000e+00, -5.2154e-08, ..., -3.7253e-09, -5.2154e-08, 3.7253e-09], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 3.3528e-08, 1.4901e-08, 2.4214e-08, ..., 1.8626e-09, 2.4214e-08, 0.0000e+00]], device='cuda:0') Epoch 400, bias, value: tensor([-0.0152, -0.0220, -0.0108, -0.0185, 0.0025, -0.0016, 0.0082, 0.0230, 0.0123, -0.0097], device='cuda:0'), grad: tensor([ 2.4214e-08, 1.8626e-08, -9.3132e-09, 1.3039e-08, 1.8626e-08, -4.4703e-08, -1.1176e-08, -1.2852e-07, 1.1176e-08, 1.0617e-07], device='cuda:0') 100 0.0001 changing lr epoch 399, time 215.07, cls_loss 0.0008 cls_loss_mapping 0.0009 cls_loss_causal 0.4474 re_mapping 0.0039 re_causal 0.0114 /// teacc 98.96 lr 0.00001000 Epoch 401, weight, value: tensor([[-0.3205, 0.0867, -0.2124, ..., -0.1135, -0.2912, -0.1762], [-0.0082, 0.1056, -0.1290, ..., -0.1412, -0.0842, 0.1317], [ 0.0503, -0.1567, -0.1912, ..., -0.0004, -0.0630, -0.1228], ..., [ 0.1155, -0.1073, 0.1476, ..., 0.0769, 0.2259, -0.0245], [ 0.1434, -0.2645, -0.1818, ..., -0.3725, -0.1221, 0.2359], [-0.2167, 0.0942, 0.0754, ..., -0.3595, -0.1692, -0.1081]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 7.4506e-09, -1.4901e-08, 1.1176e-08, ..., 3.7253e-09, 1.1176e-08, -3.7253e-09], [-3.7253e-09, 0.0000e+00, 0.0000e+00, ..., -3.7253e-09, -3.7253e-09, 0.0000e+00], ..., [-1.1176e-08, 1.1176e-08, -1.4901e-08, ..., -3.7253e-09, -1.4901e-08, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.0978e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 5.2154e-08]], device='cuda:0') Epoch 401, bias, value: tensor([-0.0152, -0.0220, -0.0109, -0.0193, 0.0037, -0.0009, 0.0084, 0.0230, 0.0133, -0.0106], device='cuda:0'), grad: tensor([ 0.0000e+00, -2.6077e-08, -7.4506e-09, -3.7253e-09, -2.0117e-07, -5.5879e-08, 1.8626e-08, 1.4901e-08, 1.1176e-08, 2.1979e-07], device='cuda:0') 100 1e-05 changing lr epoch 400, time 215.12, cls_loss 0.0010 cls_loss_mapping 0.0014 cls_loss_causal 0.4432 re_mapping 0.0038 re_causal 0.0108 /// teacc 98.96 lr 0.00001000 Epoch 402, weight, value: tensor([[-0.3206, 0.0867, -0.2124, ..., -0.1135, -0.2912, -0.1762], [-0.0082, 0.1056, -0.1290, ..., -0.1413, -0.0842, 0.1317], [ 0.0503, -0.1568, -0.1912, ..., -0.0004, -0.0629, -0.1229], ..., [ 0.1155, -0.1075, 0.1476, ..., 0.0769, 0.2259, -0.0245], [ 0.1436, -0.2646, -0.1818, ..., -0.3724, -0.1221, 0.2361], [-0.2170, 0.0941, 0.0754, ..., -0.3596, -0.1692, -0.1084]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.3528e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -7.4506e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, -0.0000e+00, 3.7253e-09], ..., [-0.0000e+00, 0.0000e+00, -0.0000e+00, ..., -0.0000e+00, -3.7253e-09, 3.7253e-09], [-3.7253e-09, 0.0000e+00, -0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -7.4506e-09], [ 0.0000e+00, -0.0000e+00, -0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09]], device='cuda:0') Epoch 402, bias, value: tensor([-0.0152, -0.0221, -0.0108, -0.0195, 0.0039, -0.0006, 0.0084, 0.0230, 0.0133, -0.0108], device='cuda:0'), grad: tensor([ 4.8056e-07, 2.6077e-08, 1.1176e-08, 7.4506e-09, 3.3900e-07, 1.2293e-07, -1.0133e-06, 7.4506e-09, 0.0000e+00, 7.4506e-09], device='cuda:0') 100 1e-05 changing lr epoch 401, time 215.15, cls_loss 0.0008 cls_loss_mapping 0.0009 cls_loss_causal 0.4554 re_mapping 0.0036 re_causal 0.0108 /// teacc 99.03 lr 0.00001000 Epoch 403, weight, value: tensor([[-3.2057e-01, 8.6725e-02, -2.1239e-01, ..., -1.1348e-01, -2.9122e-01, -1.7622e-01], [-8.2493e-03, 1.0558e-01, -1.2905e-01, ..., -1.4141e-01, -8.4256e-02, 1.3166e-01], [ 5.0345e-02, -1.5681e-01, -1.9119e-01, ..., -3.6259e-04, -6.2901e-02, -1.2279e-01], ..., [ 1.1555e-01, -1.0753e-01, 1.4768e-01, ..., 7.6932e-02, 2.2590e-01, -2.4496e-02], [ 1.4362e-01, -2.6465e-01, -1.8183e-01, ..., -3.7246e-01, -1.2211e-01, 2.3614e-01], [-2.1698e-01, 9.4074e-02, 7.5402e-02, ..., -3.5971e-01, -1.6922e-01, -1.0844e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -8.5682e-08, 4.4703e-08, ..., 3.7253e-08, 0.0000e+00, 0.0000e+00], [ 7.4506e-09, 0.0000e+00, 1.1176e-08, ..., 1.4901e-08, 3.7253e-09, -2.2352e-08], [-0.0000e+00, 4.8429e-08, 7.0781e-08, ..., 8.5682e-08, 3.7253e-09, 0.0000e+00], ..., [-1.1176e-08, 1.1176e-08, -0.0000e+00, ..., 7.4506e-09, -1.1176e-08, 1.8626e-08], [-1.8626e-08, 1.1176e-08, 0.0000e+00, ..., 3.7253e-09, 0.0000e+00, -2.2352e-08], [ 3.7253e-09, 3.7253e-09, 0.0000e+00, ..., 3.7253e-09, 0.0000e+00, 7.4506e-09]], device='cuda:0') Epoch 403, bias, value: tensor([-0.0152, -0.0221, -0.0108, -0.0195, 0.0039, -0.0005, 0.0084, 0.0230, 0.0132, -0.0108], device='cuda:0'), grad: tensor([-1.2293e-07, -7.4506e-09, 2.2724e-07, -4.4703e-07, -1.4901e-08, 2.4959e-07, 2.2352e-08, 5.5879e-08, -2.2352e-08, 3.3528e-08], device='cuda:0') 100 1e-05 changing lr epoch 402, time 215.04, cls_loss 0.0008 cls_loss_mapping 0.0007 cls_loss_causal 0.4496 re_mapping 0.0036 re_causal 0.0108 /// teacc 99.05 lr 0.00001000 Epoch 404, weight, value: tensor([[-3.2058e-01, 8.6713e-02, -2.1241e-01, ..., -1.1349e-01, -2.9123e-01, -1.7622e-01], [-8.2560e-03, 1.0557e-01, -1.2906e-01, ..., -1.4148e-01, -8.4264e-02, 1.3166e-01], [ 5.0353e-02, -1.5683e-01, -1.9120e-01, ..., -3.5259e-04, -6.2889e-02, -1.2278e-01], ..., [ 1.1555e-01, -1.0760e-01, 1.4768e-01, ..., 7.6927e-02, 2.2590e-01, -2.4508e-02], [ 1.4368e-01, -2.6470e-01, -1.8185e-01, ..., -3.7249e-01, -1.2214e-01, 2.3623e-01], [-2.1702e-01, 9.4105e-02, 7.5404e-02, ..., -3.5983e-01, -1.6923e-01, -1.0851e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 3.7253e-09, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00], [ 1.1176e-08, 3.7253e-09, 7.4506e-09, ..., 1.1176e-08, 0.0000e+00, 1.1176e-08], ..., [ 0.0000e+00, 3.7253e-09, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09], [-1.4901e-08, 3.7253e-09, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, -1.1176e-08], [-3.7253e-09, -0.0000e+00, -1.4901e-08, ..., 0.0000e+00, -0.0000e+00, 3.7253e-09]], device='cuda:0') Epoch 404, bias, value: tensor([-0.0152, -0.0221, -0.0108, -0.0195, 0.0040, -0.0005, 0.0085, 0.0230, 0.0132, -0.0108], device='cuda:0'), grad: tensor([ 3.7253e-09, 2.9802e-08, 5.2154e-08, -4.0978e-08, -4.4703e-08, 3.7253e-09, -1.4901e-08, 1.4901e-08, -2.2352e-08, 0.0000e+00], device='cuda:0') 100 1e-05 changing lr epoch 403, time 215.01, cls_loss 0.0007 cls_loss_mapping 0.0005 cls_loss_causal 0.4597 re_mapping 0.0035 re_causal 0.0109 /// teacc 99.04 lr 0.00001000 Epoch 405, weight, value: tensor([[-3.2060e-01, 8.6720e-02, -2.1242e-01, ..., -1.1350e-01, -2.9124e-01, -1.7623e-01], [-8.2655e-03, 1.0557e-01, -1.2907e-01, ..., -1.4154e-01, -8.4278e-02, 1.3166e-01], [ 5.0358e-02, -1.5685e-01, -1.9120e-01, ..., -3.4654e-04, -6.2879e-02, -1.2277e-01], ..., [ 1.1556e-01, -1.0765e-01, 1.4770e-01, ..., 7.6925e-02, 2.2591e-01, -2.4510e-02], [ 1.4375e-01, -2.6474e-01, -1.8187e-01, ..., -3.7253e-01, -1.2217e-01, 2.3630e-01], [-2.1705e-01, 9.4092e-02, 7.5402e-02, ..., -3.5993e-01, -1.6924e-01, -1.0857e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.7253e-09, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -7.4506e-09, 3.7253e-09, ..., 3.7253e-09, 0.0000e+00, -4.0978e-08], [ 0.0000e+00, 7.4506e-09, 1.1176e-08, ..., 7.4506e-09, 0.0000e+00, 3.7253e-09], ..., [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 7.4506e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-0.0000e+00, -0.0000e+00, -3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 405, bias, value: tensor([-0.0152, -0.0221, -0.0108, -0.0195, 0.0040, -0.0005, 0.0085, 0.0230, 0.0133, -0.0109], device='cuda:0'), grad: tensor([ 7.4506e-09, -5.9605e-08, 2.9802e-08, -5.9605e-08, 4.0978e-08, 7.4506e-09, 1.1176e-08, 1.4901e-08, 3.7253e-09, -7.4506e-09], device='cuda:0') 100 1e-05 changing lr epoch 404, time 215.25, cls_loss 0.0007 cls_loss_mapping 0.0006 cls_loss_causal 0.4557 re_mapping 0.0035 re_causal 0.0109 /// teacc 99.12 lr 0.00001000 Epoch 406, weight, value: tensor([[-3.2062e-01, 8.6722e-02, -2.1244e-01, ..., -1.1352e-01, -2.9124e-01, -1.7623e-01], [-8.2704e-03, 1.0557e-01, -1.2908e-01, ..., -1.4162e-01, -8.4286e-02, 1.3167e-01], [ 5.0355e-02, -1.5688e-01, -1.9125e-01, ..., -3.4780e-04, -6.2885e-02, -1.2277e-01], ..., [ 1.1557e-01, -1.0768e-01, 1.4771e-01, ..., 7.6936e-02, 2.2593e-01, -2.4521e-02], [ 1.4380e-01, -2.6478e-01, -1.8190e-01, ..., -3.7257e-01, -1.2217e-01, 2.3635e-01], [-2.1709e-01, 9.4078e-02, 7.5398e-02, ..., -3.6005e-01, -1.6924e-01, -1.0863e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.8626e-08, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, -0.0000e+00], [ 0.0000e+00, -1.4901e-08, 1.4901e-08, ..., 7.4506e-09, 1.1176e-08, -5.9605e-08], [ 7.4506e-09, 0.0000e+00, 7.4506e-09, ..., 3.7253e-09, 7.4506e-09, 0.0000e+00], ..., [-2.6077e-08, 1.4901e-08, -3.7253e-08, ..., -1.4901e-08, -3.3528e-08, 4.4703e-08], [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.0978e-08, 1.1176e-08, ..., 7.4506e-09, 3.7253e-09, 4.8429e-08]], device='cuda:0') Epoch 406, bias, value: tensor([-0.0152, -0.0221, -0.0108, -0.0196, 0.0040, -0.0005, 0.0085, 0.0230, 0.0133, -0.0109], device='cuda:0'), grad: tensor([-4.8429e-08, -9.6858e-08, 1.8626e-08, 2.7940e-07, -2.3097e-07, -2.9802e-07, 3.7253e-09, 3.3528e-08, 7.4506e-09, 3.0175e-07], device='cuda:0') 100 1e-05 changing lr epoch 405, time 214.97, cls_loss 0.0006 cls_loss_mapping 0.0005 cls_loss_causal 0.4370 re_mapping 0.0034 re_causal 0.0108 /// teacc 99.11 lr 0.00001000 Epoch 407, weight, value: tensor([[-3.2062e-01, 8.6729e-02, -2.1243e-01, ..., -1.1352e-01, -2.9125e-01, -1.7623e-01], [-8.2816e-03, 1.0557e-01, -1.2911e-01, ..., -1.4168e-01, -8.4307e-02, 1.3167e-01], [ 5.0355e-02, -1.5695e-01, -1.9128e-01, ..., -3.4631e-04, -6.2885e-02, -1.2276e-01], ..., [ 1.1558e-01, -1.0776e-01, 1.4775e-01, ..., 7.6940e-02, 2.2595e-01, -2.4523e-02], [ 1.4381e-01, -2.6482e-01, -1.8192e-01, ..., -3.7260e-01, -1.2219e-01, 2.3637e-01], [-2.1711e-01, 9.4084e-02, 7.5395e-02, ..., -3.6013e-01, -1.6925e-01, -1.0865e-01]], device='cuda:0'), grad: tensor([[ 3.7253e-09, -2.1234e-07, 0.0000e+00, ..., -3.7253e-09, 0.0000e+00, 3.7253e-09], [-4.0978e-07, -2.9802e-08, 0.0000e+00, ..., 3.7253e-09, -5.5879e-08, -5.4762e-07], [ 2.6077e-08, 3.7253e-09, 0.0000e+00, ..., -3.7253e-09, 3.7253e-09, 3.3528e-08], ..., [ 1.8254e-07, 1.8626e-08, 0.0000e+00, ..., 0.0000e+00, 2.6077e-08, 2.4214e-07], [ 1.7881e-07, 1.4901e-08, 0.0000e+00, ..., 0.0000e+00, 2.6077e-08, 2.3469e-07], [ 7.4506e-09, 1.1176e-08, -0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 7.4506e-09]], device='cuda:0') Epoch 407, bias, value: tensor([-0.0152, -0.0222, -0.0108, -0.0196, 0.0040, -0.0005, 0.0085, 0.0230, 0.0132, -0.0109], device='cuda:0'), grad: tensor([-7.2643e-07, -1.1735e-06, 7.4506e-08, 2.2352e-08, 0.0000e+00, 9.6858e-08, 5.8487e-07, 5.3644e-07, 5.2154e-07, 5.2154e-08], device='cuda:0') 100 1e-05 changing lr epoch 406, time 214.90, cls_loss 0.0006 cls_loss_mapping 0.0005 cls_loss_causal 0.4339 re_mapping 0.0034 re_causal 0.0108 /// teacc 99.11 lr 0.00001000 Epoch 408, weight, value: tensor([[-3.2064e-01, 8.6738e-02, -2.1243e-01, ..., -1.1354e-01, -2.9126e-01, -1.7624e-01], [-8.3011e-03, 1.0554e-01, -1.2913e-01, ..., -1.4169e-01, -8.4333e-02, 1.3167e-01], [ 5.0354e-02, -1.5705e-01, -1.9130e-01, ..., -3.4760e-04, -6.2888e-02, -1.2277e-01], ..., [ 1.1560e-01, -1.0780e-01, 1.4777e-01, ..., 7.6940e-02, 2.2598e-01, -2.4516e-02], [ 1.4384e-01, -2.6486e-01, -1.8196e-01, ..., -3.7261e-01, -1.2222e-01, 2.3641e-01], [-2.1713e-01, 9.4096e-02, 7.5403e-02, ..., -3.6017e-01, -1.6926e-01, -1.0868e-01]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09], [ 1.4901e-08, 3.7253e-09, 1.4901e-08, ..., 7.4506e-09, 1.1176e-08, 3.7253e-09], [-1.1176e-08, 0.0000e+00, 7.4506e-09, ..., 7.4506e-09, -3.7253e-09, -2.2352e-08], ..., [-1.4901e-08, 0.0000e+00, -2.6077e-08, ..., -1.4901e-08, -1.4901e-08, 1.1176e-08], [ 7.4506e-09, 3.7253e-09, 0.0000e+00, ..., 0.0000e+00, 3.7253e-09, 3.7253e-09], [ 3.7253e-09, -3.7253e-09, -3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 408, bias, value: tensor([-0.0152, -0.0222, -0.0108, -0.0196, 0.0040, -0.0005, 0.0085, 0.0230, 0.0132, -0.0109], device='cuda:0'), grad: tensor([ 1.1176e-08, 4.8429e-08, -3.7253e-08, -5.2154e-08, 7.4506e-09, -1.4901e-08, 1.8626e-08, -2.2352e-08, 4.0978e-08, -7.4506e-09], device='cuda:0') 100 1e-05 changing lr epoch 407, time 214.73, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4486 re_mapping 0.0034 re_causal 0.0111 /// teacc 99.14 lr 0.00001000 Epoch 409, weight, value: tensor([[-3.2066e-01, 8.6728e-02, -2.1245e-01, ..., -1.1355e-01, -2.9126e-01, -1.7624e-01], [-8.3043e-03, 1.0555e-01, -1.2914e-01, ..., -1.4173e-01, -8.4336e-02, 1.3168e-01], [ 5.0353e-02, -1.5708e-01, -1.9133e-01, ..., -3.4869e-04, -6.2891e-02, -1.2278e-01], ..., [ 1.1560e-01, -1.0787e-01, 1.4778e-01, ..., 7.6942e-02, 2.2598e-01, -2.4524e-02], [ 1.4386e-01, -2.6489e-01, -1.8198e-01, ..., -3.7263e-01, -1.2223e-01, 2.3643e-01], [-2.1715e-01, 9.4121e-02, 7.5403e-02, ..., -3.6023e-01, -1.6926e-01, -1.0870e-01]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 7.8231e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.6858e-08], [-3.7253e-09, -2.1234e-07, 2.6077e-08, ..., 1.8626e-08, 1.1176e-08, -2.6822e-07], [ 0.0000e+00, 7.4506e-09, 1.1176e-08, ..., 3.7253e-09, 3.7253e-09, 7.4506e-09], ..., [ 2.9802e-08, 6.7055e-08, 1.3411e-07, ..., 9.3132e-08, 5.2154e-08, 8.9407e-08], [ 0.0000e+00, 2.2352e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.9802e-08], [ 0.0000e+00, 7.4506e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 7.4506e-09]], device='cuda:0') Epoch 409, bias, value: tensor([-0.0152, -0.0222, -0.0108, -0.0195, 0.0040, -0.0005, 0.0085, 0.0230, 0.0132, -0.0109], device='cuda:0'), grad: tensor([ 3.1665e-07, -7.7486e-07, 4.0978e-08, -5.2527e-07, 7.4506e-09, 5.5879e-08, 5.9605e-08, 6.8918e-07, 9.6858e-08, 2.9802e-08], device='cuda:0') 100 1e-05 changing lr epoch 408, time 214.89, cls_loss 0.0006 cls_loss_mapping 0.0005 cls_loss_causal 0.4354 re_mapping 0.0033 re_causal 0.0107 /// teacc 99.10 lr 0.00001000 Epoch 410, weight, value: tensor([[-3.2067e-01, 8.6732e-02, -2.1246e-01, ..., -1.1356e-01, -2.9127e-01, -1.7624e-01], [-8.3042e-03, 1.0558e-01, -1.2915e-01, ..., -1.4174e-01, -8.4342e-02, 1.3170e-01], [ 5.0354e-02, -1.5710e-01, -1.9135e-01, ..., -3.4723e-04, -6.2892e-02, -1.2278e-01], ..., [ 1.1560e-01, -1.0800e-01, 1.4779e-01, ..., 7.6936e-02, 2.2599e-01, -2.4530e-02], [ 1.4386e-01, -2.6492e-01, -1.8201e-01, ..., -3.7267e-01, -1.2226e-01, 2.3644e-01], [-2.1717e-01, 9.4117e-02, 7.5414e-02, ..., -3.6025e-01, -1.6927e-01, -1.0875e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.8626e-08, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.4901e-08, 7.4506e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.9802e-08], [-7.0781e-08, 0.0000e+00, 0.0000e+00, ..., -4.4703e-08, -5.9605e-08, 0.0000e+00], ..., [ 6.7055e-08, 3.7253e-09, 0.0000e+00, ..., 4.0978e-08, 5.5879e-08, 0.0000e+00], [-2.6077e-08, 7.4506e-09, 7.4506e-09, ..., 0.0000e+00, 0.0000e+00, -4.4703e-08], [ 0.0000e+00, 7.0781e-08, -1.8626e-08, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09]], device='cuda:0') Epoch 410, bias, value: tensor([-0.0152, -0.0222, -0.0108, -0.0195, 0.0040, -0.0005, 0.0085, 0.0230, 0.0132, -0.0109], device='cuda:0'), grad: tensor([-3.7253e-08, 1.8626e-07, -1.5274e-07, 4.8429e-08, -4.7050e-06, -5.9605e-08, 7.8231e-08, 3.9488e-07, 2.2352e-08, 4.1984e-06], device='cuda:0') 100 1e-05 changing lr epoch 409, time 214.76, cls_loss 0.0006 cls_loss_mapping 0.0006 cls_loss_causal 0.4290 re_mapping 0.0032 re_causal 0.0105 /// teacc 99.12 lr 0.00001000 Epoch 411, weight, value: tensor([[-3.2068e-01, 8.6729e-02, -2.1247e-01, ..., -1.1356e-01, -2.9127e-01, -1.7624e-01], [-8.3142e-03, 1.0557e-01, -1.2917e-01, ..., -1.4180e-01, -8.4357e-02, 1.3170e-01], [ 5.0351e-02, -1.5714e-01, -1.9141e-01, ..., -3.4868e-04, -6.2895e-02, -1.2278e-01], ..., [ 1.1562e-01, -1.0803e-01, 1.4782e-01, ..., 7.6948e-02, 2.2601e-01, -2.4528e-02], [ 1.4388e-01, -2.6495e-01, -1.8205e-01, ..., -3.7271e-01, -1.2230e-01, 2.3647e-01], [-2.1720e-01, 9.4139e-02, 7.5410e-02, ..., -3.6035e-01, -1.6928e-01, -1.0879e-01]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 6.8545e-07, 6.3330e-08, 2.5332e-07, ..., 3.7253e-09, 3.5763e-07, 2.3842e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [-8.5309e-07, -4.4703e-08, -3.1665e-07, ..., 0.0000e+00, -4.4703e-07, -2.7940e-07], [ 5.9605e-08, 1.4901e-08, 2.2352e-08, ..., 3.7253e-09, 3.3528e-08, 2.2352e-08], [ 8.9407e-08, 1.3411e-07, 2.6077e-08, ..., 0.0000e+00, 4.4703e-08, 1.3411e-07]], device='cuda:0') Epoch 411, bias, value: tensor([-0.0152, -0.0222, -0.0108, -0.0195, 0.0040, -0.0005, 0.0085, 0.0230, 0.0132, -0.0109], device='cuda:0'), grad: tensor([ 1.1176e-08, 1.3150e-06, 0.0000e+00, 4.0978e-07, -6.5193e-07, -5.9977e-07, 1.1921e-07, -1.4901e-06, 1.6391e-07, 7.1526e-07], device='cuda:0') 100 1e-05 changing lr epoch 410, time 214.44, cls_loss 0.0006 cls_loss_mapping 0.0005 cls_loss_causal 0.4290 re_mapping 0.0032 re_causal 0.0106 /// teacc 99.09 lr 0.00001000 Epoch 412, weight, value: tensor([[-3.2070e-01, 8.6724e-02, -2.1249e-01, ..., -1.1359e-01, -2.9128e-01, -1.7624e-01], [-8.3191e-03, 1.0558e-01, -1.2917e-01, ..., -1.4187e-01, -8.4360e-02, 1.3171e-01], [ 5.0344e-02, -1.5715e-01, -1.9147e-01, ..., -3.5128e-04, -6.2906e-02, -1.2279e-01], ..., [ 1.1563e-01, -1.0803e-01, 1.4784e-01, ..., 7.6962e-02, 2.2602e-01, -2.4539e-02], [ 1.4392e-01, -2.6499e-01, -1.8207e-01, ..., -3.7277e-01, -1.2234e-01, 2.3653e-01], [-2.1723e-01, 9.4159e-02, 7.5406e-02, ..., -3.6042e-01, -1.6930e-01, -1.0882e-01]], device='cuda:0'), grad: tensor([[ 1.4901e-08, -1.8626e-08, 0.0000e+00, ..., 7.4506e-09, 3.7253e-09, 0.0000e+00], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09], [-4.8429e-08, 0.0000e+00, 0.0000e+00, ..., -2.6077e-08, -2.2352e-08, 0.0000e+00], ..., [ 2.6077e-08, 3.7253e-09, 7.4506e-09, ..., 1.1176e-08, 1.8626e-08, 7.4506e-09], [-5.5879e-08, 3.7253e-09, -1.4901e-08, ..., -7.4506e-09, 0.0000e+00, -7.8231e-08], [ 7.4506e-09, -5.2154e-08, -7.0781e-08, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09]], device='cuda:0') Epoch 412, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0195, 0.0040, -0.0005, 0.0085, 0.0230, 0.0132, -0.0109], device='cuda:0'), grad: tensor([ 0.0000e+00, 1.8626e-08, -1.6764e-07, 1.6391e-07, 1.4156e-07, 7.4506e-09, 3.3528e-08, 7.8231e-08, -1.4156e-07, -1.4156e-07], device='cuda:0') 100 1e-05 changing lr epoch 411, time 214.91, cls_loss 0.0006 cls_loss_mapping 0.0005 cls_loss_causal 0.4431 re_mapping 0.0031 re_causal 0.0107 /// teacc 99.10 lr 0.00001000 Epoch 413, weight, value: tensor([[-3.2070e-01, 8.6737e-02, -2.1249e-01, ..., -1.1360e-01, -2.9129e-01, -1.7624e-01], [-8.3373e-03, 1.0560e-01, -1.2920e-01, ..., -1.4191e-01, -8.4386e-02, 1.3172e-01], [ 5.0347e-02, -1.5717e-01, -1.9150e-01, ..., -3.4639e-04, -6.2902e-02, -1.2278e-01], ..., [ 1.1565e-01, -1.0811e-01, 1.4787e-01, ..., 7.6968e-02, 2.2605e-01, -2.4544e-02], [ 1.4393e-01, -2.6503e-01, -1.8211e-01, ..., -3.7280e-01, -1.2238e-01, 2.3655e-01], [-2.1727e-01, 9.4138e-02, 7.5413e-02, ..., -3.6051e-01, -1.6931e-01, -1.0887e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 2.3097e-07, -3.7253e-09, 1.6019e-07, ..., 1.6391e-07, 1.2293e-07, -1.8626e-08], [ 2.6077e-08, 0.0000e+00, 1.8626e-08, ..., 1.8626e-08, 1.4901e-08, 0.0000e+00], ..., [-2.7195e-07, 7.4506e-09, -1.6019e-07, ..., -1.7509e-07, -1.4156e-07, 3.7253e-09], [-3.7253e-09, 0.0000e+00, -0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -3.7253e-09], [ 3.7253e-09, 3.7253e-09, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09]], device='cuda:0') Epoch 413, bias, value: tensor([-0.0152, -0.0222, -0.0108, -0.0195, 0.0040, -0.0005, 0.0085, 0.0230, 0.0132, -0.0110], device='cuda:0'), grad: tensor([ 0.0000e+00, 4.7684e-07, 6.3330e-08, -2.6077e-08, 7.4506e-09, 3.7253e-09, 3.7253e-09, -5.3272e-07, -1.1176e-08, 1.1176e-08], device='cuda:0') 100 1e-05 changing lr epoch 412, time 214.69, cls_loss 0.0006 cls_loss_mapping 0.0005 cls_loss_causal 0.4021 re_mapping 0.0031 re_causal 0.0103 /// teacc 99.13 lr 0.00001000 Epoch 414, weight, value: tensor([[-3.2073e-01, 8.6743e-02, -2.1251e-01, ..., -1.1362e-01, -2.9130e-01, -1.7624e-01], [-8.3455e-03, 1.0561e-01, -1.2921e-01, ..., -1.4199e-01, -8.4398e-02, 1.3173e-01], [ 5.0349e-02, -1.5719e-01, -1.9153e-01, ..., -3.4137e-04, -6.2902e-02, -1.2279e-01], ..., [ 1.1566e-01, -1.0815e-01, 1.4789e-01, ..., 7.6978e-02, 2.2607e-01, -2.4550e-02], [ 1.4393e-01, -2.6509e-01, -1.8214e-01, ..., -3.7284e-01, -1.2242e-01, 2.3655e-01], [-2.1729e-01, 9.4130e-02, 7.5421e-02, ..., -3.6057e-01, -1.6932e-01, -1.0890e-01]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 2.2352e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.4901e-08], [ 3.7253e-09, -0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 3.7253e-09, -7.4506e-09], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [-2.6077e-08, 7.4506e-09, -1.8626e-08, ..., -0.0000e+00, -1.4901e-08, 7.4506e-09], [ 0.0000e+00, 2.9802e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.4901e-08], [ 1.4901e-08, 3.7253e-09, 1.1176e-08, ..., 0.0000e+00, 7.4506e-09, 3.7253e-09]], device='cuda:0') Epoch 414, bias, value: tensor([-0.0152, -0.0222, -0.0108, -0.0195, 0.0040, -0.0005, 0.0085, 0.0230, 0.0132, -0.0110], device='cuda:0'), grad: tensor([ 1.6019e-07, 3.7253e-09, 3.7253e-09, 1.2293e-07, -2.9802e-08, -1.0356e-06, 5.2154e-07, -1.4901e-08, 2.0862e-07, 5.2154e-08], device='cuda:0') 100 1e-05 changing lr epoch 413, time 214.86, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4399 re_mapping 0.0031 re_causal 0.0105 /// teacc 99.12 lr 0.00001000 Epoch 415, weight, value: tensor([[-3.2074e-01, 8.6748e-02, -2.1252e-01, ..., -1.1363e-01, -2.9131e-01, -1.7625e-01], [-8.3957e-03, 1.0568e-01, -1.2928e-01, ..., -1.4206e-01, -8.4466e-02, 1.3174e-01], [ 5.0347e-02, -1.5720e-01, -1.9156e-01, ..., -3.3974e-04, -6.2907e-02, -1.2279e-01], ..., [ 1.1571e-01, -1.0830e-01, 1.4797e-01, ..., 7.6988e-02, 2.2614e-01, -2.4552e-02], [ 1.4393e-01, -2.6514e-01, -1.8220e-01, ..., -3.7290e-01, -1.2250e-01, 2.3656e-01], [-2.1735e-01, 9.4113e-02, 7.5407e-02, ..., -3.6063e-01, -1.6934e-01, -1.0898e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 3.7253e-09, -0.0000e+00, 7.4506e-09, ..., 0.0000e+00, 7.4506e-09, -0.0000e+00], [-7.4506e-09, 0.0000e+00, 0.0000e+00, ..., -3.7253e-09, -7.4506e-09, 0.0000e+00], ..., [-0.0000e+00, 0.0000e+00, -1.1176e-08, ..., 3.7253e-09, -7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 415, bias, value: tensor([-0.0152, -0.0222, -0.0108, -0.0195, 0.0040, -0.0005, 0.0085, 0.0231, 0.0131, -0.0110], device='cuda:0'), grad: tensor([ 3.7253e-09, 7.4506e-09, -1.4901e-08, 1.4901e-08, 0.0000e+00, -1.8626e-08, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00], device='cuda:0') 100 1e-05 changing lr epoch 414, time 214.95, cls_loss 0.0006 cls_loss_mapping 0.0005 cls_loss_causal 0.4755 re_mapping 0.0032 re_causal 0.0109 /// teacc 99.11 lr 0.00001000 Epoch 416, weight, value: tensor([[-3.2076e-01, 8.6729e-02, -2.1252e-01, ..., -1.1364e-01, -2.9132e-01, -1.7625e-01], [-8.4033e-03, 1.0569e-01, -1.2929e-01, ..., -1.4215e-01, -8.4476e-02, 1.3175e-01], [ 5.0345e-02, -1.5724e-01, -1.9161e-01, ..., -3.3854e-04, -6.2914e-02, -1.2281e-01], ..., [ 1.1572e-01, -1.0833e-01, 1.4799e-01, ..., 7.7000e-02, 2.2616e-01, -2.4555e-02], [ 1.4396e-01, -2.6520e-01, -1.8226e-01, ..., -3.7293e-01, -1.2253e-01, 2.3659e-01], [-2.1739e-01, 9.4172e-02, 7.5405e-02, ..., -3.6069e-01, -1.6936e-01, -1.0899e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -4.0978e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -3.7253e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09], [-3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -3.7253e-09], [ 0.0000e+00, 7.4506e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 5.9605e-08]], device='cuda:0') Epoch 416, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0195, 0.0040, -0.0005, 0.0085, 0.0231, 0.0131, -0.0110], device='cuda:0'), grad: tensor([-8.9407e-08, 0.0000e+00, 3.7253e-09, -4.4703e-08, -2.5705e-07, -4.4703e-08, 7.0781e-08, 3.7253e-09, -3.7253e-09, 3.5763e-07], device='cuda:0') 100 1e-05 changing lr epoch 415, time 215.00, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4405 re_mapping 0.0032 re_causal 0.0105 /// teacc 99.12 lr 0.00001000 Epoch 417, weight, value: tensor([[-3.2078e-01, 8.6707e-02, -2.1255e-01, ..., -1.1364e-01, -2.9132e-01, -1.7626e-01], [-8.4017e-03, 1.0569e-01, -1.2928e-01, ..., -1.4220e-01, -8.4477e-02, 1.3177e-01], [ 5.0339e-02, -1.5737e-01, -1.9167e-01, ..., -3.4518e-04, -6.2923e-02, -1.2282e-01], ..., [ 1.1572e-01, -1.0845e-01, 1.4799e-01, ..., 7.7016e-02, 2.2617e-01, -2.4567e-02], [ 1.4400e-01, -2.6525e-01, -1.8230e-01, ..., -3.7297e-01, -1.2257e-01, 2.3664e-01], [-2.1740e-01, 9.4253e-02, 7.5414e-02, ..., -3.6076e-01, -1.6937e-01, -1.0902e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09], [ 3.5018e-07, -5.9605e-08, 1.8254e-07, ..., 7.4506e-09, 2.9430e-07, 4.4703e-08], [ 1.8626e-08, 0.0000e+00, 1.1176e-08, ..., 0.0000e+00, 1.4901e-08, 7.4506e-09], ..., [-4.1723e-07, 4.8429e-08, -2.1607e-07, ..., -7.4506e-09, -3.4645e-07, -9.6858e-08], [ 2.6077e-08, 3.7253e-09, 1.1176e-08, ..., 0.0000e+00, 1.8626e-08, 1.4901e-08], [ 1.8626e-08, 1.8626e-08, 7.4506e-09, ..., 0.0000e+00, 1.1176e-08, 2.2352e-08]], device='cuda:0') Epoch 417, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0195, 0.0040, -0.0005, 0.0085, 0.0231, 0.0131, -0.0109], device='cuda:0'), grad: tensor([ 3.7253e-09, 3.7253e-07, 3.7253e-08, 0.0000e+00, 0.0000e+00, -5.9605e-08, 5.5879e-08, -5.5879e-07, 5.5879e-08, 9.3132e-08], device='cuda:0') 100 1e-05 changing lr epoch 416, time 214.81, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4385 re_mapping 0.0031 re_causal 0.0106 /// teacc 99.11 lr 0.00001000 Epoch 418, weight, value: tensor([[-3.2078e-01, 8.6695e-02, -2.1264e-01, ..., -1.1365e-01, -2.9133e-01, -1.7627e-01], [-8.3893e-03, 1.0577e-01, -1.2927e-01, ..., -1.4224e-01, -8.4467e-02, 1.3180e-01], [ 5.0345e-02, -1.5741e-01, -1.9169e-01, ..., -3.3714e-04, -6.2917e-02, -1.2284e-01], ..., [ 1.1572e-01, -1.0875e-01, 1.4799e-01, ..., 7.7015e-02, 2.2616e-01, -2.4597e-02], [ 1.4403e-01, -2.6530e-01, -1.8235e-01, ..., -3.7302e-01, -1.2260e-01, 2.3669e-01], [-2.1744e-01, 9.4288e-02, 7.5427e-02, ..., -3.6081e-01, -1.6938e-01, -1.0905e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 7.4506e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09], ..., [-3.7253e-09, 1.1176e-08, 7.4506e-09, ..., -0.0000e+00, -3.7253e-09, 0.0000e+00], [-1.1176e-08, 3.7253e-09, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, -7.4506e-09], [ 0.0000e+00, -7.0781e-08, -7.0781e-08, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 418, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0196, 0.0040, -0.0005, 0.0086, 0.0231, 0.0131, -0.0109], device='cuda:0'), grad: tensor([ 0.0000e+00, 1.8626e-08, 3.3528e-08, 5.2154e-08, 1.0803e-07, 2.6077e-08, 7.4506e-09, 4.4703e-08, -2.6077e-08, -2.7567e-07], device='cuda:0') 100 1e-05 changing lr epoch 417, time 214.85, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4392 re_mapping 0.0031 re_causal 0.0106 /// teacc 99.12 lr 0.00001000 Epoch 419, weight, value: tensor([[-3.2080e-01, 8.6699e-02, -2.1265e-01, ..., -1.1366e-01, -2.9134e-01, -1.7627e-01], [-8.4144e-03, 1.0578e-01, -1.2931e-01, ..., -1.4227e-01, -8.4508e-02, 1.3181e-01], [ 5.0342e-02, -1.5747e-01, -1.9172e-01, ..., -3.3734e-04, -6.2921e-02, -1.2285e-01], ..., [ 1.1574e-01, -1.0889e-01, 1.4804e-01, ..., 7.7026e-02, 2.2621e-01, -2.4595e-02], [ 1.4406e-01, -2.6534e-01, -1.8239e-01, ..., -3.7306e-01, -1.2263e-01, 2.3671e-01], [-2.1746e-01, 9.4303e-02, 7.5425e-02, ..., -3.6087e-01, -1.6940e-01, -1.0908e-01]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.7253e-09], [ 3.3155e-07, 1.0803e-07, 2.9802e-07, ..., 1.5646e-07, 2.5705e-07, 5.5879e-08], [ 2.2352e-08, 0.0000e+00, 3.7253e-08, ..., -1.1176e-08, 2.2352e-08, 3.7253e-09], ..., [-4.0606e-07, 1.4901e-08, -3.7253e-07, ..., -1.7509e-07, -3.2410e-07, 1.1176e-08], [ 0.0000e+00, 3.7253e-09, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-3.7253e-09, -2.0117e-07, -2.4214e-07, ..., 3.7253e-09, 7.4506e-09, 1.1176e-08]], device='cuda:0') Epoch 419, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0196, 0.0040, -0.0005, 0.0086, 0.0231, 0.0131, -0.0109], device='cuda:0'), grad: tensor([ 7.0781e-08, 1.0543e-06, 3.7253e-09, 6.3330e-08, 3.4273e-07, 6.3330e-08, -4.4703e-07, -7.2643e-07, 3.3528e-07, -7.7859e-07], device='cuda:0') 100 1e-05 changing lr epoch 418, time 214.62, cls_loss 0.0005 cls_loss_mapping 0.0005 cls_loss_causal 0.4388 re_mapping 0.0031 re_causal 0.0104 /// teacc 99.12 lr 0.00001000 Epoch 420, weight, value: tensor([[-3.2081e-01, 8.6719e-02, -2.1268e-01, ..., -1.1366e-01, -2.9135e-01, -1.7627e-01], [-8.4120e-03, 1.0579e-01, -1.2931e-01, ..., -1.4230e-01, -8.4502e-02, 1.3183e-01], [ 5.0348e-02, -1.5752e-01, -1.9173e-01, ..., -3.2662e-04, -6.2908e-02, -1.2280e-01], ..., [ 1.1575e-01, -1.0903e-01, 1.4804e-01, ..., 7.7027e-02, 2.2621e-01, -2.4614e-02], [ 1.4405e-01, -2.6540e-01, -1.8243e-01, ..., -3.7313e-01, -1.2272e-01, 2.3672e-01], [-2.1751e-01, 9.4296e-02, 7.5434e-02, ..., -3.6098e-01, -1.6941e-01, -1.0915e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.4901e-08, -0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 7.4506e-09, -3.7253e-09], [-1.4901e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -7.4506e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 420, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0196, 0.0040, -0.0005, 0.0086, 0.0231, 0.0131, -0.0109], device='cuda:0'), grad: tensor([ 7.4506e-09, 7.8231e-08, -7.4506e-08, 1.8626e-08, 1.8626e-08, 4.8429e-08, -1.0803e-07, 3.7253e-09, 1.1176e-08, 0.0000e+00], device='cuda:0') 100 1e-05 changing lr epoch 419, time 214.63, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4420 re_mapping 0.0031 re_causal 0.0102 /// teacc 99.13 lr 0.00001000 Epoch 421, weight, value: tensor([[-3.2082e-01, 8.6735e-02, -2.1268e-01, ..., -1.1366e-01, -2.9136e-01, -1.7627e-01], [-8.4147e-03, 1.0578e-01, -1.2931e-01, ..., -1.4234e-01, -8.4505e-02, 1.3184e-01], [ 5.0346e-02, -1.5757e-01, -1.9177e-01, ..., -3.2358e-04, -6.2912e-02, -1.2281e-01], ..., [ 1.1575e-01, -1.0911e-01, 1.4805e-01, ..., 7.7031e-02, 2.2621e-01, -2.4626e-02], [ 1.4407e-01, -2.6547e-01, -1.8246e-01, ..., -3.7319e-01, -1.2276e-01, 2.3675e-01], [-2.1754e-01, 9.4306e-02, 7.5447e-02, ..., -3.6101e-01, -1.6942e-01, -1.0919e-01]], device='cuda:0'), grad: tensor([[ 5.5879e-09, -1.8626e-09, 0.0000e+00, ..., 5.5879e-09, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 2.0489e-08, 1.8626e-09, ..., 0.0000e+00, 1.8626e-09, 4.0978e-08], [ 5.0291e-08, 0.0000e+00, 0.0000e+00, ..., -1.8626e-09, -1.8626e-09, 5.7742e-08], ..., [ 3.7253e-09, 9.3132e-09, 1.8626e-09, ..., 1.8626e-09, 0.0000e+00, 2.0489e-08], [-6.7055e-08, 0.0000e+00, 0.0000e+00, ..., -9.3132e-09, 0.0000e+00, -6.7055e-08], [-0.0000e+00, 1.6764e-08, -1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 3.7253e-08]], device='cuda:0') Epoch 421, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0196, 0.0040, -0.0005, 0.0086, 0.0231, 0.0130, -0.0109], device='cuda:0'), grad: tensor([ 3.7253e-08, 7.8231e-08, 7.6368e-08, 1.6764e-08, -1.7509e-07, 1.8626e-09, 5.5879e-09, 4.4703e-08, -1.5460e-07, 6.3330e-08], device='cuda:0') 100 1e-05 changing lr epoch 420, time 214.44, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4524 re_mapping 0.0030 re_causal 0.0105 /// teacc 99.12 lr 0.00001000 Epoch 422, weight, value: tensor([[-3.2083e-01, 8.6741e-02, -2.1268e-01, ..., -1.1365e-01, -2.9136e-01, -1.7628e-01], [-8.4174e-03, 1.0579e-01, -1.2932e-01, ..., -1.4241e-01, -8.4512e-02, 1.3186e-01], [ 5.0342e-02, -1.5760e-01, -1.9181e-01, ..., -3.2418e-04, -6.2921e-02, -1.2283e-01], ..., [ 1.1576e-01, -1.0917e-01, 1.4807e-01, ..., 7.7040e-02, 2.2623e-01, -2.4634e-02], [ 1.4403e-01, -2.6553e-01, -1.8251e-01, ..., -3.7322e-01, -1.2282e-01, 2.3673e-01], [-2.1758e-01, 9.4310e-02, 7.5461e-02, ..., -3.6107e-01, -1.6944e-01, -1.0925e-01]], device='cuda:0'), grad: tensor([[ 5.5879e-09, -3.7253e-09, 1.8626e-09, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 2.6077e-08, 7.4506e-09, 3.5390e-08, ..., 1.8626e-08, 2.4214e-08, -1.8626e-09], [ 9.3132e-09, 0.0000e+00, 5.5879e-09, ..., 1.8626e-09, 5.5879e-09, 1.8626e-09], ..., [-8.9407e-08, -1.8626e-08, -1.0245e-07, ..., -4.8429e-08, -8.1956e-08, 3.7253e-09], [-3.7253e-09, 0.0000e+00, -0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -3.7253e-09], [ 2.2352e-08, -3.5390e-08, -3.1665e-08, ..., 9.3132e-09, 1.6764e-08, 5.5879e-09]], device='cuda:0') Epoch 422, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0196, 0.0040, -0.0005, 0.0086, 0.0231, 0.0130, -0.0109], device='cuda:0'), grad: tensor([ 3.7253e-09, 6.7055e-08, 2.2352e-08, 6.8918e-08, 9.1270e-08, 1.8626e-08, 1.8626e-09, -2.2165e-07, -7.4506e-09, -5.7742e-08], device='cuda:0') 100 1e-05 changing lr epoch 421, time 214.76, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4378 re_mapping 0.0030 re_causal 0.0103 /// teacc 99.12 lr 0.00001000 Epoch 423, weight, value: tensor([[-3.2085e-01, 8.6736e-02, -2.1270e-01, ..., -1.1366e-01, -2.9137e-01, -1.7628e-01], [-8.4228e-03, 1.0579e-01, -1.2934e-01, ..., -1.4247e-01, -8.4525e-02, 1.3187e-01], [ 5.0342e-02, -1.5765e-01, -1.9183e-01, ..., -3.1882e-04, -6.2918e-02, -1.2286e-01], ..., [ 1.1577e-01, -1.0917e-01, 1.4809e-01, ..., 7.7043e-02, 2.2624e-01, -2.4641e-02], [ 1.4403e-01, -2.6558e-01, -1.8258e-01, ..., -3.7329e-01, -1.2287e-01, 2.3675e-01], [-2.1760e-01, 9.4353e-02, 7.5482e-02, ..., -3.6111e-01, -1.6944e-01, -1.0928e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 2.9802e-07, 7.4506e-09, 1.2480e-07, ..., 5.5879e-09, 1.7136e-07, 1.1362e-07], [-2.6822e-07, 1.8626e-09, 1.1176e-08, ..., -4.0978e-08, -2.1793e-07, 1.8626e-09], ..., [-1.2480e-07, 2.4214e-08, -1.2666e-07, ..., 3.1665e-08, -1.3039e-08, -1.4342e-07], [ 3.3528e-08, 1.8626e-09, 1.6764e-08, ..., 0.0000e+00, 2.2352e-08, 9.3132e-09], [ 4.0978e-08, -1.8626e-07, -2.7195e-07, ..., 9.3132e-09, 2.4214e-08, 1.4901e-08]], device='cuda:0') Epoch 423, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0196, 0.0040, -0.0005, 0.0086, 0.0231, 0.0129, -0.0109], device='cuda:0'), grad: tensor([ 1.3039e-08, 5.7556e-07, -3.1665e-07, -2.2352e-08, 5.8673e-07, 2.0489e-08, -3.9116e-08, -2.9244e-07, 7.4506e-08, -6.1654e-07], device='cuda:0') 100 1e-05 changing lr epoch 422, time 214.75, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4124 re_mapping 0.0030 re_causal 0.0099 /// teacc 99.13 lr 0.00001000 Epoch 424, weight, value: tensor([[-3.2086e-01, 8.6750e-02, -2.1271e-01, ..., -1.1367e-01, -2.9138e-01, -1.7628e-01], [-8.4125e-03, 1.0585e-01, -1.2933e-01, ..., -1.4250e-01, -8.4512e-02, 1.3194e-01], [ 5.0329e-02, -1.5769e-01, -1.9193e-01, ..., -3.2987e-04, -6.2943e-02, -1.2287e-01], ..., [ 1.1578e-01, -1.0944e-01, 1.4811e-01, ..., 7.7074e-02, 2.2626e-01, -2.4696e-02], [ 1.4403e-01, -2.6565e-01, -1.8265e-01, ..., -3.7332e-01, -1.2292e-01, 2.3676e-01], [-2.1764e-01, 9.4354e-02, 7.5492e-02, ..., -3.6119e-01, -1.6947e-01, -1.0934e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 1.8626e-09, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 4.0978e-08, 7.4506e-09, 4.0978e-08, ..., 1.3039e-08, 5.0291e-08, 2.9802e-08], [ 3.7253e-08, 1.8626e-09, 3.9116e-08, ..., 1.8626e-08, 5.0291e-08, 7.4506e-09], ..., [-2.1607e-07, 3.7253e-09, -2.3097e-07, ..., -1.1176e-07, -2.9616e-07, -3.1665e-08], [ 9.3132e-09, 3.7253e-09, 9.3132e-09, ..., 1.8626e-09, 1.1176e-08, 5.5879e-09], [ 9.3132e-09, 1.8626e-08, 1.1176e-08, ..., 3.7253e-09, 1.1176e-08, 4.2841e-08]], device='cuda:0') Epoch 424, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0196, 0.0039, -0.0005, 0.0086, 0.0231, 0.0129, -0.0109], device='cuda:0'), grad: tensor([ 5.5879e-09, 1.3039e-07, 9.1270e-08, 3.9861e-07, -1.7136e-07, -2.7753e-07, 1.3784e-07, -4.8243e-07, 3.1665e-08, 1.3784e-07], device='cuda:0') 100 1e-05 changing lr epoch 423, time 214.86, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4554 re_mapping 0.0029 re_causal 0.0106 /// teacc 99.12 lr 0.00001000 Epoch 425, weight, value: tensor([[-3.2087e-01, 8.6757e-02, -2.1272e-01, ..., -1.1368e-01, -2.9139e-01, -1.7628e-01], [-8.4127e-03, 1.0587e-01, -1.2933e-01, ..., -1.4256e-01, -8.4511e-02, 1.3197e-01], [ 5.0326e-02, -1.5772e-01, -1.9197e-01, ..., -3.3272e-04, -6.2950e-02, -1.2287e-01], ..., [ 1.1579e-01, -1.0953e-01, 1.4812e-01, ..., 7.7099e-02, 2.2627e-01, -2.4719e-02], [ 1.4404e-01, -2.6575e-01, -1.8273e-01, ..., -3.7338e-01, -1.2297e-01, 2.3678e-01], [-2.1766e-01, 9.4357e-02, 7.5512e-02, ..., -3.6126e-01, -1.6947e-01, -1.0938e-01]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -5.5879e-09, 0.0000e+00, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 3.7253e-09, 1.8626e-09, 1.8626e-09, ..., 3.7253e-09, 3.7253e-09, 1.8626e-09], [ 1.4901e-08, 1.8626e-09, 2.0489e-08, ..., -5.5879e-09, 6.3330e-08, 0.0000e+00], ..., [-4.6566e-08, 1.8626e-09, -2.0489e-08, ..., -1.1176e-08, -8.3819e-08, 0.0000e+00], [ 1.1176e-08, 0.0000e+00, 1.8626e-09, ..., 7.4506e-09, 1.8626e-09, 1.8626e-09], [ 1.8626e-09, 5.5879e-09, -3.1665e-08, ..., 0.0000e+00, 1.8626e-09, 1.8626e-08]], device='cuda:0') Epoch 425, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0197, 0.0039, -0.0005, 0.0086, 0.0231, 0.0128, -0.0109], device='cuda:0'), grad: tensor([-9.3132e-09, 1.8626e-08, -2.4214e-08, 2.4214e-08, -1.4529e-07, -3.7253e-09, 1.1176e-08, -4.0978e-08, 3.5390e-08, 1.2852e-07], device='cuda:0') 100 1e-05 changing lr epoch 424, time 214.87, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.3997 re_mapping 0.0030 re_causal 0.0098 /// teacc 99.12 lr 0.00001000 Epoch 426, weight, value: tensor([[-3.2089e-01, 8.6754e-02, -2.1279e-01, ..., -1.1369e-01, -2.9140e-01, -1.7630e-01], [-8.4204e-03, 1.0584e-01, -1.2934e-01, ..., -1.4266e-01, -8.4519e-02, 1.3198e-01], [ 5.0313e-02, -1.5785e-01, -1.9208e-01, ..., -3.4245e-04, -6.2970e-02, -1.2291e-01], ..., [ 1.1581e-01, -1.0957e-01, 1.4815e-01, ..., 7.7127e-02, 2.2630e-01, -2.4730e-02], [ 1.4406e-01, -2.6583e-01, -1.8277e-01, ..., -3.7344e-01, -1.2301e-01, 2.3683e-01], [-2.1774e-01, 9.4384e-02, 7.5529e-02, ..., -3.6132e-01, -1.6948e-01, -1.0952e-01]], device='cuda:0'), grad: tensor([[ 1.8626e-08, 1.8626e-09, 1.8626e-09, ..., 3.7253e-09, 1.8626e-09, 2.4214e-08], [ 1.8626e-08, 0.0000e+00, 7.4506e-09, ..., 7.4506e-09, 1.3039e-08, -3.7253e-09], [ 5.9605e-08, 1.8626e-09, 5.5879e-09, ..., -1.4901e-08, -1.1176e-08, 1.1362e-07], ..., [-5.5879e-09, 3.7253e-09, -9.3132e-09, ..., 1.3039e-08, -7.4506e-09, 3.7253e-09], [-1.1921e-07, 2.2352e-08, -1.8626e-09, ..., -1.8626e-09, 0.0000e+00, -1.7695e-07], [ 1.8626e-09, 2.7940e-08, 5.5879e-09, ..., 1.3039e-08, 1.8626e-09, 1.8626e-09]], device='cuda:0') Epoch 426, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0197, 0.0040, -0.0004, 0.0086, 0.0231, 0.0128, -0.0109], device='cuda:0'), grad: tensor([ 7.2643e-08, 4.8429e-08, 2.1048e-07, 4.2394e-06, 1.8626e-09, -4.6566e-06, 1.5460e-07, 2.9802e-08, -2.8312e-07, 1.8254e-07], device='cuda:0') 100 1e-05 changing lr epoch 425, time 214.51, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4471 re_mapping 0.0030 re_causal 0.0103 /// teacc 99.14 lr 0.00001000 Epoch 427, weight, value: tensor([[-3.2092e-01, 8.6757e-02, -2.1297e-01, ..., -1.1370e-01, -2.9141e-01, -1.7631e-01], [-8.4149e-03, 1.0589e-01, -1.2934e-01, ..., -1.4270e-01, -8.4519e-02, 1.3202e-01], [ 5.0312e-02, -1.5791e-01, -1.9211e-01, ..., -3.3915e-04, -6.2971e-02, -1.2293e-01], ..., [ 1.1581e-01, -1.0969e-01, 1.4816e-01, ..., 7.7133e-02, 2.2630e-01, -2.4753e-02], [ 1.4402e-01, -2.6593e-01, -1.8283e-01, ..., -3.7348e-01, -1.2309e-01, 2.3680e-01], [-2.1778e-01, 9.4414e-02, 7.5567e-02, ..., -3.6141e-01, -1.6950e-01, -1.0956e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.5390e-08, 9.3132e-09, ..., 0.0000e+00, 1.8626e-09, -1.1176e-08], [ 2.4214e-08, 1.8626e-08, 2.2352e-08, ..., 7.4506e-09, 1.4901e-08, -3.7253e-09], [-0.0000e+00, 1.8626e-09, 1.8626e-09, ..., -1.8626e-09, -1.8626e-09, 0.0000e+00], ..., [-3.1665e-08, 9.3132e-09, -2.7940e-08, ..., -7.4506e-09, -1.8626e-08, 9.3132e-09], [ 0.0000e+00, 7.4506e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 1.6764e-08, -1.3039e-08, ..., 0.0000e+00, -0.0000e+00, 1.8626e-09]], device='cuda:0') Epoch 427, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0198, 0.0040, -0.0004, 0.0086, 0.0231, 0.0127, -0.0109], device='cuda:0'), grad: tensor([ 4.1723e-07, 9.1270e-08, -1.8626e-09, 5.5879e-09, 3.5949e-07, 1.5087e-07, -1.2573e-06, -1.8626e-08, 5.4017e-08, 1.8626e-07], device='cuda:0') 100 1e-05 changing lr epoch 426, time 214.30, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4207 re_mapping 0.0030 re_causal 0.0101 /// teacc 99.15 lr 0.00001000 Epoch 428, weight, value: tensor([[-3.2095e-01, 8.6758e-02, -2.1298e-01, ..., -1.1372e-01, -2.9142e-01, -1.7631e-01], [-8.4134e-03, 1.0588e-01, -1.2935e-01, ..., -1.4274e-01, -8.4519e-02, 1.3204e-01], [ 5.0317e-02, -1.5798e-01, -1.9214e-01, ..., -3.3103e-04, -6.2964e-02, -1.2293e-01], ..., [ 1.1581e-01, -1.0978e-01, 1.4817e-01, ..., 7.7135e-02, 2.2630e-01, -2.4768e-02], [ 1.4400e-01, -2.6600e-01, -1.8288e-01, ..., -3.7354e-01, -1.2316e-01, 2.3680e-01], [-2.1780e-01, 9.4457e-02, 7.5581e-02, ..., -3.6148e-01, -1.6951e-01, -1.0960e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, -3.7253e-09, 0.0000e+00, ..., 0.0000e+00, -0.0000e+00, -3.7253e-09], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [-5.0291e-08, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [-3.7253e-09, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -3.7253e-09], [ 0.0000e+00, -9.3132e-09, -9.3132e-09, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09]], device='cuda:0') Epoch 428, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0198, 0.0040, -0.0004, 0.0086, 0.0231, 0.0127, -0.0109], device='cuda:0'), grad: tensor([ 5.5879e-09, -5.5879e-09, 1.6764e-08, 7.4506e-09, 1.7136e-07, 1.1176e-08, 1.8626e-09, -1.7695e-07, -7.4506e-09, -2.6077e-08], device='cuda:0') 100 1e-05 changing lr epoch 427, time 214.58, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4539 re_mapping 0.0030 re_causal 0.0103 /// teacc 99.15 lr 0.00001000 Epoch 429, weight, value: tensor([[-3.2096e-01, 8.6777e-02, -2.1298e-01, ..., -1.1373e-01, -2.9143e-01, -1.7632e-01], [-8.4292e-03, 1.0591e-01, -1.2938e-01, ..., -1.4277e-01, -8.4549e-02, 1.3207e-01], [ 5.0317e-02, -1.5802e-01, -1.9218e-01, ..., -3.2489e-04, -6.2966e-02, -1.2295e-01], ..., [ 1.1583e-01, -1.0990e-01, 1.4820e-01, ..., 7.7134e-02, 2.2634e-01, -2.4780e-02], [ 1.4393e-01, -2.6611e-01, -1.8295e-01, ..., -3.7361e-01, -1.2322e-01, 2.3674e-01], [-2.1785e-01, 9.4441e-02, 7.5592e-02, ..., -3.6155e-01, -1.6952e-01, -1.0968e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 1.8626e-09, 1.8626e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [-1.8626e-09, 0.0000e+00, 5.5879e-09, ..., -1.8626e-09, 3.7253e-09, 0.0000e+00], ..., [-5.5879e-09, 1.8626e-09, -9.3132e-09, ..., -1.8626e-09, -7.4506e-09, 0.0000e+00], [ 0.0000e+00, 1.6764e-08, 0.0000e+00, ..., 3.7253e-09, 0.0000e+00, -1.8626e-09], [ 0.0000e+00, 7.4506e-09, -9.3132e-09, ..., 7.4506e-09, 3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 429, bias, value: tensor([-0.0153, -0.0222, -0.0108, -0.0198, 0.0041, -0.0004, 0.0086, 0.0231, 0.0125, -0.0109], device='cuda:0'), grad: tensor([ 9.3132e-09, 1.8626e-08, -1.8626e-09, 1.9018e-06, 3.3528e-08, -2.2631e-06, 1.2107e-07, -3.7253e-09, 8.3819e-08, 1.0990e-07], device='cuda:0') 100 1e-05 changing lr epoch 428, time 214.80, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4492 re_mapping 0.0028 re_causal 0.0101 /// teacc 99.11 lr 0.00001000 Epoch 430, weight, value: tensor([[-3.2098e-01, 8.6787e-02, -2.1299e-01, ..., -1.1374e-01, -2.9144e-01, -1.7632e-01], [-8.4141e-03, 1.0597e-01, -1.2937e-01, ..., -1.4278e-01, -8.4529e-02, 1.3213e-01], [ 5.0311e-02, -1.5804e-01, -1.9224e-01, ..., -3.2547e-04, -6.2976e-02, -1.2296e-01], ..., [ 1.1583e-01, -1.1010e-01, 1.4822e-01, ..., 7.7158e-02, 2.2634e-01, -2.4825e-02], [ 1.4396e-01, -2.6618e-01, -1.8299e-01, ..., -3.7369e-01, -1.2326e-01, 2.3678e-01], [-2.1787e-01, 9.4426e-02, 7.5609e-02, ..., -3.6162e-01, -1.6953e-01, -1.0974e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 8.5682e-08, -5.5879e-09, 5.0291e-08, ..., 2.4214e-08, 6.5193e-08, 1.8626e-09], [ 5.5879e-09, 0.0000e+00, 3.7253e-09, ..., 1.8626e-09, 3.7253e-09, 1.8626e-09], ..., [-1.0245e-07, 1.8626e-09, -6.5193e-08, ..., -2.7940e-08, -8.3819e-08, -9.3132e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 7.4506e-09, 0.0000e+00, 7.4506e-09, ..., 1.8626e-09, 1.1176e-08, 1.8626e-09]], device='cuda:0') Epoch 430, bias, value: tensor([-0.0153, -0.0221, -0.0108, -0.0198, 0.0041, -0.0005, 0.0086, 0.0231, 0.0125, -0.0110], device='cuda:0'), grad: tensor([ 5.0291e-08, 1.6391e-07, 1.4901e-08, 0.0000e+00, 3.7253e-09, 1.1176e-08, -9.1270e-08, -2.2165e-07, 2.9802e-08, 2.9802e-08], device='cuda:0') 100 1e-05 changing lr epoch 429, time 214.88, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4358 re_mapping 0.0029 re_causal 0.0102 /// teacc 99.11 lr 0.00001000 Epoch 431, weight, value: tensor([[-3.2102e-01, 8.6801e-02, -2.1305e-01, ..., -1.1376e-01, -2.9146e-01, -1.7633e-01], [-8.4268e-03, 1.0599e-01, -1.2938e-01, ..., -1.4287e-01, -8.4548e-02, 1.3215e-01], [ 5.0307e-02, -1.5812e-01, -1.9231e-01, ..., -3.2491e-04, -6.2986e-02, -1.2298e-01], ..., [ 1.1585e-01, -1.1020e-01, 1.4825e-01, ..., 7.7182e-02, 2.2637e-01, -2.4840e-02], [ 1.4399e-01, -2.6625e-01, -1.8305e-01, ..., -3.7375e-01, -1.2330e-01, 2.3681e-01], [-2.1793e-01, 9.4417e-02, 7.5613e-02, ..., -3.6177e-01, -1.6955e-01, -1.0983e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 431, bias, value: tensor([-0.0152, -0.0222, -0.0108, -0.0198, 0.0041, -0.0005, 0.0086, 0.0231, 0.0125, -0.0110], device='cuda:0'), grad: tensor([-3.7253e-09, 0.0000e+00, 0.0000e+00, 1.8626e-09, -3.7253e-09, -3.1665e-08, 1.4901e-08, 1.8626e-09, 1.8626e-09, 1.4901e-08], device='cuda:0') 100 1e-05 changing lr epoch 430, time 214.80, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4436 re_mapping 0.0029 re_causal 0.0104 /// teacc 99.11 lr 0.00001000 Epoch 432, weight, value: tensor([[-3.2105e-01, 8.6822e-02, -2.1306e-01, ..., -1.1379e-01, -2.9147e-01, -1.7633e-01], [-8.4455e-03, 1.0598e-01, -1.2941e-01, ..., -1.4278e-01, -8.4576e-02, 1.3216e-01], [ 5.0307e-02, -1.5822e-01, -1.9236e-01, ..., -3.1784e-04, -6.2990e-02, -1.2300e-01], ..., [ 1.1588e-01, -1.1026e-01, 1.4829e-01, ..., 7.7173e-02, 2.2641e-01, -2.4840e-02], [ 1.4401e-01, -2.6632e-01, -1.8314e-01, ..., -3.7384e-01, -1.2338e-01, 2.3689e-01], [-2.1799e-01, 9.4395e-02, 7.5620e-02, ..., -3.6187e-01, -1.6957e-01, -1.0995e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -5.5879e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -4.0978e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -3.9116e-08], [ 1.8626e-09, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], ..., [-1.8626e-09, 2.2352e-08, -1.8626e-09, ..., -0.0000e+00, -1.8626e-09, 1.4901e-08], [ 0.0000e+00, 3.7253e-09, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 0.0000e+00, 2.2352e-08, -1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 2.4214e-08]], device='cuda:0') Epoch 432, bias, value: tensor([-0.0152, -0.0222, -0.0108, -0.0199, 0.0042, -0.0005, 0.0086, 0.0231, 0.0125, -0.0111], device='cuda:0'), grad: tensor([-9.3132e-09, -1.6764e-07, 9.3132e-09, 6.1467e-08, -2.6077e-08, -9.4995e-08, 2.6077e-08, 8.3819e-08, 1.3039e-08, 8.7544e-08], device='cuda:0') 100 1e-05 changing lr epoch 431, time 214.49, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4271 re_mapping 0.0029 re_causal 0.0099 /// teacc 99.13 lr 0.00001000 Epoch 433, weight, value: tensor([[-3.2105e-01, 8.6849e-02, -2.1308e-01, ..., -1.1381e-01, -2.9148e-01, -1.7633e-01], [-8.4702e-03, 1.0600e-01, -1.2946e-01, ..., -1.4287e-01, -8.4614e-02, 1.3216e-01], [ 5.0311e-02, -1.5827e-01, -1.9249e-01, ..., -3.0153e-04, -6.2998e-02, -1.2301e-01], ..., [ 1.1590e-01, -1.1034e-01, 1.4836e-01, ..., 7.7177e-02, 2.2646e-01, -2.4838e-02], [ 1.4402e-01, -2.6639e-01, -1.8321e-01, ..., -3.7394e-01, -1.2344e-01, 2.3694e-01], [-2.1804e-01, 9.4350e-02, 7.5619e-02, ..., -3.6197e-01, -1.6960e-01, -1.1002e-01]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 1.8626e-09, 1.8626e-09, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 3.9116e-08, -5.5879e-09, 3.3528e-08, ..., 3.7253e-09, 4.0978e-08, 7.4506e-09], [ 2.0489e-08, 1.8626e-09, 1.4901e-08, ..., 1.8626e-09, 1.8626e-08, 9.3132e-09], ..., [-1.1548e-07, -1.8626e-09, -8.7544e-08, ..., -1.3039e-08, -1.0617e-07, -3.9116e-08], [ 1.3039e-08, 7.4506e-09, 1.1176e-08, ..., 0.0000e+00, 1.4901e-08, 9.3132e-09], [ 2.4214e-08, 1.8626e-09, 1.4901e-08, ..., 3.7253e-09, 1.8626e-08, 7.4506e-09]], device='cuda:0') Epoch 433, bias, value: tensor([-0.0152, -0.0222, -0.0108, -0.0199, 0.0042, -0.0005, 0.0087, 0.0231, 0.0124, -0.0111], device='cuda:0'), grad: tensor([ 2.9802e-08, 8.1956e-08, 5.2154e-08, 9.8720e-08, 1.6764e-08, -8.1956e-07, 4.8615e-07, -1.8254e-07, 1.7695e-07, 5.2154e-08], device='cuda:0') 100 1e-05 changing lr epoch 432, time 214.70, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4637 re_mapping 0.0029 re_causal 0.0100 /// teacc 99.11 lr 0.00001000 Epoch 434, weight, value: tensor([[-3.2109e-01, 8.6872e-02, -2.1308e-01, ..., -1.1383e-01, -2.9149e-01, -1.7633e-01], [-8.5038e-03, 1.0603e-01, -1.2951e-01, ..., -1.4300e-01, -8.4663e-02, 1.3217e-01], [ 5.0309e-02, -1.5834e-01, -1.9262e-01, ..., -2.9112e-04, -6.3007e-02, -1.2303e-01], ..., [ 1.1594e-01, -1.1041e-01, 1.4843e-01, ..., 7.7194e-02, 2.2652e-01, -2.4838e-02], [ 1.4404e-01, -2.6652e-01, -1.8329e-01, ..., -3.7403e-01, -1.2354e-01, 2.3701e-01], [-2.1813e-01, 9.4302e-02, 7.5609e-02, ..., -3.6215e-01, -1.6963e-01, -1.1013e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.7253e-09, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 3.7253e-09, 1.8626e-09, 5.5879e-09, ..., 0.0000e+00, 3.7253e-09, -1.8626e-09], [-0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8626e-09, -0.0000e+00, 0.0000e+00], ..., [-1.1176e-08, 1.8626e-09, -1.1176e-08, ..., -0.0000e+00, -7.4506e-09, 1.8626e-09], [ 0.0000e+00, 7.4506e-09, 5.5879e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 7.4506e-09, -1.4901e-07, -8.3819e-08, ..., 0.0000e+00, 5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 434, bias, value: tensor([-0.0152, -0.0222, -0.0108, -0.0200, 0.0042, -0.0004, 0.0086, 0.0231, 0.0124, -0.0112], device='cuda:0'), grad: tensor([ 7.4506e-09, 1.1176e-08, -1.8626e-09, 2.0489e-08, 3.5763e-07, 1.1176e-08, 0.0000e+00, -1.4901e-08, 2.4214e-08, -4.2282e-07], device='cuda:0') 100 1e-05 changing lr epoch 433, time 214.44, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4153 re_mapping 0.0028 re_causal 0.0098 /// teacc 99.10 lr 0.00001000 Epoch 435, weight, value: tensor([[-3.2112e-01, 8.6913e-02, -2.1309e-01, ..., -1.1383e-01, -2.9150e-01, -1.7634e-01], [-8.5018e-03, 1.0608e-01, -1.2951e-01, ..., -1.4308e-01, -8.4675e-02, 1.3222e-01], [ 5.0300e-02, -1.5839e-01, -1.9272e-01, ..., -2.8777e-04, -6.3024e-02, -1.2305e-01], ..., [ 1.1595e-01, -1.1059e-01, 1.4845e-01, ..., 7.7212e-02, 2.2655e-01, -2.4865e-02], [ 1.4403e-01, -2.6666e-01, -1.8337e-01, ..., -3.7413e-01, -1.2362e-01, 2.3702e-01], [-2.1817e-01, 9.4274e-02, 7.5640e-02, ..., -3.6227e-01, -1.6965e-01, -1.1019e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -3.7253e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, -3.5390e-08, 0.0000e+00, ..., 1.8626e-09, 1.8626e-09, -6.7055e-08], [-8.3819e-08, 7.4506e-09, -9.3132e-09, ..., -2.7940e-08, -6.1467e-08, 1.3039e-08], ..., [ 7.2643e-08, 2.4214e-08, 7.4506e-09, ..., 2.4214e-08, 5.7742e-08, 4.2841e-08], [ 7.4506e-09, 5.5879e-09, 0.0000e+00, ..., 1.8626e-09, 1.8626e-09, 1.8626e-09], [ 0.0000e+00, 1.1176e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 5.5879e-09]], device='cuda:0') Epoch 435, bias, value: tensor([-0.0151, -0.0222, -0.0108, -0.0200, 0.0042, -0.0004, 0.0086, 0.0231, 0.0123, -0.0113], device='cuda:0'), grad: tensor([-9.3132e-09, -2.7753e-07, -1.2666e-07, 1.2480e-07, 1.8626e-09, -1.6019e-07, 2.0489e-08, 3.2410e-07, 6.1467e-08, 4.8429e-08], device='cuda:0') 100 1e-05 changing lr epoch 434, time 214.85, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4355 re_mapping 0.0029 re_causal 0.0100 /// teacc 99.08 lr 0.00001000 Epoch 436, weight, value: tensor([[-3.2114e-01, 8.6931e-02, -2.1318e-01, ..., -1.1385e-01, -2.9151e-01, -1.7634e-01], [-8.5252e-03, 1.0610e-01, -1.2955e-01, ..., -1.4320e-01, -8.4710e-02, 1.3224e-01], [ 5.0292e-02, -1.5845e-01, -1.9280e-01, ..., -2.8520e-04, -6.3042e-02, -1.2307e-01], ..., [ 1.1598e-01, -1.1070e-01, 1.4850e-01, ..., 7.7231e-02, 2.2660e-01, -2.4866e-02], [ 1.4409e-01, -2.6676e-01, -1.8344e-01, ..., -3.7420e-01, -1.2366e-01, 2.3707e-01], [-2.1824e-01, 9.4278e-02, 7.5658e-02, ..., -3.6241e-01, -1.6968e-01, -1.1031e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -5.5879e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 1.8626e-09, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [-3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -5.5879e-09], [ 0.0000e+00, 3.9116e-08, 3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 2.7940e-08]], device='cuda:0') Epoch 436, bias, value: tensor([-0.0151, -0.0222, -0.0108, -0.0200, 0.0042, -0.0004, 0.0086, 0.0232, 0.0123, -0.0113], device='cuda:0'), grad: tensor([-1.1176e-08, 7.4506e-09, 3.7253e-09, 5.4017e-08, -1.3784e-07, -6.1467e-08, 5.5879e-09, 3.7253e-09, -9.3132e-09, 1.4901e-07], device='cuda:0') 100 1e-05 changing lr epoch 435, time 215.04, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4310 re_mapping 0.0029 re_causal 0.0099 /// teacc 99.11 lr 0.00001000 Epoch 437, weight, value: tensor([[-3.2116e-01, 8.6951e-02, -2.1320e-01, ..., -1.1387e-01, -2.9152e-01, -1.7635e-01], [-8.5403e-03, 1.0609e-01, -1.2958e-01, ..., -1.4331e-01, -8.4730e-02, 1.3225e-01], [ 5.0301e-02, -1.5850e-01, -1.9283e-01, ..., -2.6481e-04, -6.3025e-02, -1.2307e-01], ..., [ 1.1600e-01, -1.1081e-01, 1.4854e-01, ..., 7.7236e-02, 2.2662e-01, -2.4871e-02], [ 1.4412e-01, -2.6686e-01, -1.8352e-01, ..., -3.7433e-01, -1.2375e-01, 2.3714e-01], [-2.1827e-01, 9.4284e-02, 7.5693e-02, ..., -3.6249e-01, -1.6969e-01, -1.1038e-01]], device='cuda:0'), grad: tensor([[ 3.7253e-09, -2.0489e-08, 1.8626e-09, ..., 0.0000e+00, 1.8626e-09, 1.8626e-09], [ 5.5879e-08, -1.8626e-09, 6.3330e-08, ..., 2.4214e-08, 5.2154e-08, -7.4506e-09], [ 2.7753e-07, 0.0000e+00, 3.2783e-07, ..., 1.2293e-07, 2.6450e-07, 1.1176e-08], ..., [-3.7067e-07, 1.8626e-09, -4.6194e-07, ..., -1.7509e-07, -3.7439e-07, 1.3039e-08], [-5.0291e-08, 1.8626e-09, 5.5879e-09, ..., 1.8626e-09, 3.7253e-09, -5.9605e-08], [ 5.4017e-08, -7.4506e-09, 3.7253e-08, ..., 2.2352e-08, 4.8429e-08, 7.4506e-09]], device='cuda:0') Epoch 437, bias, value: tensor([-0.0151, -0.0222, -0.0108, -0.0200, 0.0042, -0.0004, 0.0086, 0.0232, 0.0122, -0.0113], device='cuda:0'), grad: tensor([-4.2841e-08, 1.1362e-07, 6.9290e-07, 4.6566e-08, 4.0978e-08, 3.3528e-08, 4.2841e-08, -8.9966e-07, -1.3597e-07, 1.0803e-07], device='cuda:0') 100 1e-05 changing lr epoch 436, time 215.19, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4881 re_mapping 0.0029 re_causal 0.0105 /// teacc 99.10 lr 0.00001000 Epoch 438, weight, value: tensor([[-3.2120e-01, 8.6944e-02, -2.1322e-01, ..., -1.1390e-01, -2.9153e-01, -1.7636e-01], [-8.5154e-03, 1.0622e-01, -1.2956e-01, ..., -1.4337e-01, -8.4708e-02, 1.3233e-01], [ 5.0314e-02, -1.5860e-01, -1.9291e-01, ..., -2.2675e-04, -6.3017e-02, -1.2310e-01], ..., [ 1.1598e-01, -1.1109e-01, 1.4854e-01, ..., 7.7212e-02, 2.2661e-01, -2.4935e-02], [ 1.4412e-01, -2.6699e-01, -1.8360e-01, ..., -3.7443e-01, -1.2381e-01, 2.3715e-01], [-2.1834e-01, 9.4309e-02, 7.5709e-02, ..., -3.6261e-01, -1.6970e-01, -1.1054e-01]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -1.8626e-09, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 2.4959e-07, 1.6764e-08, 3.1851e-07, ..., 3.7253e-09, 1.8626e-07, 4.4703e-08], [-5.5879e-09, 1.8626e-09, 1.8626e-09, ..., -9.3132e-09, -1.8626e-09, 0.0000e+00], ..., [-2.5518e-07, -1.3039e-08, -3.1851e-07, ..., 9.3132e-09, -1.8999e-07, -3.9116e-08], [-1.4901e-08, 0.0000e+00, 3.7253e-09, ..., 1.8626e-09, 1.8626e-09, -3.1665e-08], [ 1.3039e-08, 1.8626e-09, 3.7253e-09, ..., 0.0000e+00, 1.8626e-09, 1.4901e-08]], device='cuda:0') Epoch 438, bias, value: tensor([-0.0151, -0.0222, -0.0107, -0.0200, 0.0042, -0.0004, 0.0086, 0.0231, 0.0121, -0.0113], device='cuda:0'), grad: tensor([ 1.4901e-08, 5.6624e-07, -2.7940e-08, -7.4506e-09, 3.5390e-08, 1.0990e-07, -1.6391e-07, -5.3085e-07, -4.8429e-08, 4.6566e-08], device='cuda:0') 100 1e-05 changing lr epoch 437, time 215.00, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4585 re_mapping 0.0028 re_causal 0.0101 /// teacc 99.08 lr 0.00001000 Epoch 439, weight, value: tensor([[-3.2123e-01, 8.6959e-02, -2.1324e-01, ..., -1.1392e-01, -2.9155e-01, -1.7637e-01], [-8.4822e-03, 1.0631e-01, -1.2954e-01, ..., -1.4347e-01, -8.4682e-02, 1.3241e-01], [ 5.0318e-02, -1.5866e-01, -1.9296e-01, ..., -2.2024e-04, -6.3013e-02, -1.2310e-01], ..., [ 1.1596e-01, -1.1132e-01, 1.4854e-01, ..., 7.7233e-02, 2.2659e-01, -2.5004e-02], [ 1.4411e-01, -2.6708e-01, -1.8369e-01, ..., -3.7454e-01, -1.2388e-01, 2.3715e-01], [-2.1843e-01, 9.4271e-02, 7.5718e-02, ..., -3.6276e-01, -1.6973e-01, -1.1073e-01]], device='cuda:0'), grad: tensor([[ 3.7253e-09, -0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 5.5879e-09, 0.0000e+00], [ 5.5879e-09, -1.8626e-09, 1.8626e-09, ..., 3.7253e-09, 3.7253e-09, 5.5879e-09], [-1.1176e-08, 0.0000e+00, -3.7253e-09, ..., -1.6764e-08, -2.0489e-08, 1.8626e-09], ..., [ 3.7253e-09, 1.8626e-09, 0.0000e+00, ..., 5.5879e-09, 9.3132e-09, 1.8626e-09], [-7.4506e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -3.5390e-08], [ 5.5879e-09, -3.7253e-09, -5.5879e-09, ..., 0.0000e+00, 0.0000e+00, 2.4214e-08]], device='cuda:0') Epoch 439, bias, value: tensor([-0.0151, -0.0221, -0.0107, -0.0200, 0.0042, -0.0004, 0.0087, 0.0231, 0.0120, -0.0114], device='cuda:0'), grad: tensor([ 1.1176e-08, 1.6764e-08, -6.5193e-08, 1.8626e-09, 9.3132e-09, 1.8626e-09, 1.8626e-09, 3.1665e-08, -5.5879e-08, 2.9802e-08], device='cuda:0') 100 1e-05 changing lr epoch 438, time 215.09, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4169 re_mapping 0.0029 re_causal 0.0097 /// teacc 99.12 lr 0.00001000 Epoch 440, weight, value: tensor([[-3.2126e-01, 8.6995e-02, -2.1324e-01, ..., -1.1392e-01, -2.9156e-01, -1.7637e-01], [-8.4846e-03, 1.0633e-01, -1.2955e-01, ..., -1.4359e-01, -8.4692e-02, 1.3246e-01], [ 5.0342e-02, -1.5872e-01, -1.9303e-01, ..., -1.8446e-04, -6.3002e-02, -1.2312e-01], ..., [ 1.1596e-01, -1.1147e-01, 1.4857e-01, ..., 7.7222e-02, 2.2661e-01, -2.5057e-02], [ 1.4413e-01, -2.6717e-01, -1.8377e-01, ..., -3.7465e-01, -1.2393e-01, 2.3723e-01], [-2.1850e-01, 9.4170e-02, 7.5685e-02, ..., -3.6282e-01, -1.6976e-01, -1.1085e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -3.3528e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 9.3132e-09, -1.6764e-08, 9.3132e-09, ..., 7.4506e-09, 0.0000e+00, -5.9605e-08], [-0.0000e+00, 1.8626e-09, 0.0000e+00, ..., -1.8626e-09, -0.0000e+00, 3.7253e-09], ..., [-9.3132e-09, 1.8626e-08, -1.3039e-08, ..., -9.3132e-09, -1.8626e-09, 4.4703e-08], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 0.0000e+00, 1.5832e-07, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 7.4506e-09]], device='cuda:0') Epoch 440, bias, value: tensor([-0.0151, -0.0221, -0.0107, -0.0200, 0.0044, -0.0005, 0.0087, 0.0231, 0.0120, -0.0116], device='cuda:0'), grad: tensor([-3.7253e-09, 7.2643e-08, 1.3039e-07, 7.4506e-09, -3.3919e-06, 5.5879e-09, -9.5554e-07, 2.2911e-07, 6.5193e-08, 3.8520e-06], device='cuda:0') 100 1e-05 changing lr epoch 439, time 215.00, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4557 re_mapping 0.0028 re_causal 0.0101 /// teacc 99.11 lr 0.00001000 Epoch 441, weight, value: tensor([[-3.2128e-01, 8.7023e-02, -2.1334e-01, ..., -1.1395e-01, -2.9158e-01, -1.7637e-01], [-8.5103e-03, 1.0631e-01, -1.2957e-01, ..., -1.4371e-01, -8.4718e-02, 1.3248e-01], [ 5.0348e-02, -1.5880e-01, -1.9312e-01, ..., -1.6774e-04, -6.2995e-02, -1.2310e-01], ..., [ 1.1598e-01, -1.1157e-01, 1.4861e-01, ..., 7.7229e-02, 2.2664e-01, -2.5085e-02], [ 1.4416e-01, -2.6724e-01, -1.8384e-01, ..., -3.7477e-01, -1.2405e-01, 2.3728e-01], [-2.1856e-01, 9.4148e-02, 7.5694e-02, ..., -3.6289e-01, -1.6978e-01, -1.1091e-01]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -2.7940e-09, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 9.3132e-10], [ 1.8626e-09, 6.5193e-09, 9.3132e-09, ..., 7.4506e-09, 9.3132e-10, 3.7253e-09], [-9.3132e-10, 0.0000e+00, 0.0000e+00, ..., -2.7940e-09, -1.8626e-09, 0.0000e+00], ..., [ 1.8626e-09, 2.7940e-09, 9.3132e-10, ..., 1.8626e-09, 9.3132e-10, 4.6566e-09], [-1.5739e-07, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -1.2759e-07], [ 0.0000e+00, 2.7940e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 441, bias, value: tensor([-0.0150, -0.0221, -0.0107, -0.0200, 0.0045, -0.0005, 0.0087, 0.0231, 0.0120, -0.0116], device='cuda:0'), grad: tensor([-2.7940e-09, 2.7008e-08, -5.5879e-09, -1.3039e-08, -2.1420e-08, 2.4401e-07, 1.2852e-07, 1.6764e-08, -3.7253e-07, 8.3819e-09], device='cuda:0') 100 1e-05 changing lr epoch 440, time 214.97, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4085 re_mapping 0.0028 re_causal 0.0095 /// teacc 99.11 lr 0.00001000 Epoch 442, weight, value: tensor([[-3.2132e-01, 8.7040e-02, -2.1338e-01, ..., -1.1401e-01, -2.9160e-01, -1.7638e-01], [-8.5171e-03, 1.0634e-01, -1.2959e-01, ..., -1.4384e-01, -8.4727e-02, 1.3253e-01], [ 5.0392e-02, -1.5889e-01, -1.9317e-01, ..., -9.0360e-05, -6.2929e-02, -1.2314e-01], ..., [ 1.1597e-01, -1.1166e-01, 1.4864e-01, ..., 7.7187e-02, 2.2664e-01, -2.5119e-02], [ 1.4422e-01, -2.6740e-01, -1.8397e-01, ..., -3.7487e-01, -1.2410e-01, 2.3732e-01], [-2.1861e-01, 9.4118e-02, 7.5716e-02, ..., -3.6302e-01, -1.6980e-01, -1.1103e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.5832e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 1.8626e-09, 2.7940e-09, ..., 1.8626e-09, 9.3132e-10, 0.0000e+00], [-9.3132e-10, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, -1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [-1.8626e-09, 9.3132e-10, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, -3.7253e-09], [ 9.3132e-10, 4.6566e-09, -4.6566e-09, ..., 9.3132e-10, 0.0000e+00, 1.8626e-09]], device='cuda:0') Epoch 442, bias, value: tensor([-0.0150, -0.0221, -0.0106, -0.0200, 0.0045, -0.0005, 0.0087, 0.0230, 0.0119, -0.0117], device='cuda:0'), grad: tensor([-3.9116e-08, 9.3132e-09, 0.0000e+00, -1.4901e-08, 1.8626e-08, 0.0000e+00, 3.7253e-09, 4.6566e-09, -2.7940e-09, 1.7695e-08], device='cuda:0') 100 1e-05 changing lr epoch 441, time 215.02, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4399 re_mapping 0.0028 re_causal 0.0099 /// teacc 99.12 lr 0.00001000 Epoch 443, weight, value: tensor([[-3.2135e-01, 8.7073e-02, -2.1341e-01, ..., -1.1405e-01, -2.9162e-01, -1.7637e-01], [-8.5098e-03, 1.0639e-01, -1.2956e-01, ..., -1.4392e-01, -8.4727e-02, 1.3257e-01], [ 5.0422e-02, -1.5897e-01, -1.9324e-01, ..., -5.0995e-05, -6.2897e-02, -1.2316e-01], ..., [ 1.1595e-01, -1.1178e-01, 1.4863e-01, ..., 7.7149e-02, 2.2663e-01, -2.5162e-02], [ 1.4425e-01, -2.6749e-01, -1.8403e-01, ..., -3.7495e-01, -1.2415e-01, 2.3740e-01], [-2.1866e-01, 9.4076e-02, 7.5737e-02, ..., -3.6310e-01, -1.6980e-01, -1.1113e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.5832e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 3.7253e-09, 1.8626e-09, ..., 0.0000e+00, 9.3132e-10, -2.7940e-09], [ 9.3132e-10, 9.3132e-10, 9.3132e-10, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], ..., [-5.5879e-09, 4.6566e-09, -6.5193e-09, ..., -1.8626e-09, -4.6566e-09, 8.3819e-09], [-4.6566e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -7.4506e-09], [ 1.8626e-09, 1.0245e-08, 1.8626e-09, ..., 0.0000e+00, 9.3132e-10, 8.3819e-09]], device='cuda:0') Epoch 443, bias, value: tensor([-0.0150, -0.0221, -0.0106, -0.0200, 0.0045, -0.0005, 0.0087, 0.0230, 0.0119, -0.0118], device='cuda:0'), grad: tensor([-4.0978e-08, 1.2107e-08, 4.6566e-09, 5.5879e-09, -4.6566e-08, 1.2107e-08, 2.7008e-08, 8.3819e-09, -1.8626e-08, 4.0978e-08], device='cuda:0') 100 1e-05 changing lr epoch 442, time 215.04, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4153 re_mapping 0.0029 re_causal 0.0098 /// teacc 99.12 lr 0.00001000 Epoch 444, weight, value: tensor([[-3.2142e-01, 8.7069e-02, -2.1351e-01, ..., -1.1409e-01, -2.9167e-01, -1.7639e-01], [-8.5239e-03, 1.0641e-01, -1.2958e-01, ..., -1.4411e-01, -8.4745e-02, 1.3260e-01], [ 5.0447e-02, -1.5901e-01, -1.9330e-01, ..., -1.0292e-05, -6.2870e-02, -1.2312e-01], ..., [ 1.1596e-01, -1.1183e-01, 1.4866e-01, ..., 7.7153e-02, 2.2665e-01, -2.5185e-02], [ 1.4430e-01, -2.6762e-01, -1.8410e-01, ..., -3.7512e-01, -1.2427e-01, 2.3748e-01], [-2.1873e-01, 9.4106e-02, 7.5769e-02, ..., -3.6324e-01, -1.6982e-01, -1.1122e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -8.6613e-08, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 5.0291e-08, 1.6764e-08, 6.5193e-08, ..., 6.5193e-09, 4.0978e-08, -2.7940e-09], [ 5.5879e-09, 9.3132e-09, 1.4901e-08, ..., 1.1176e-08, 1.8626e-09, 9.3132e-10], ..., [-1.1362e-07, -2.6077e-08, -1.3132e-07, ..., 9.3132e-10, -9.7789e-08, 4.6566e-09], [ 1.8626e-09, 1.8626e-09, 3.7253e-09, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 4.7497e-08, 5.8673e-08, 5.2154e-08, ..., 0.0000e+00, 4.0978e-08, 1.3970e-08]], device='cuda:0') Epoch 444, bias, value: tensor([-0.0150, -0.0221, -0.0105, -0.0200, 0.0046, -0.0005, 0.0087, 0.0230, 0.0118, -0.0118], device='cuda:0'), grad: tensor([-1.7788e-07, 1.3504e-07, 3.6322e-08, -8.8476e-08, -9.8720e-08, 6.2399e-08, 1.0617e-07, -2.6356e-07, 9.3132e-09, 2.9150e-07], device='cuda:0') 100 1e-05 changing lr epoch 443, time 214.68, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4373 re_mapping 0.0028 re_causal 0.0099 /// teacc 99.10 lr 0.00001000 Epoch 445, weight, value: tensor([[-3.2143e-01, 8.7107e-02, -2.1358e-01, ..., -1.1409e-01, -2.9170e-01, -1.7639e-01], [-8.4982e-03, 1.0651e-01, -1.2955e-01, ..., -1.4422e-01, -8.4736e-02, 1.3267e-01], [ 5.0517e-02, -1.5909e-01, -1.9340e-01, ..., 8.7677e-05, -6.2789e-02, -1.2309e-01], ..., [ 1.1590e-01, -1.1202e-01, 1.4864e-01, ..., 7.7074e-02, 2.2661e-01, -2.5257e-02], [ 1.4436e-01, -2.6775e-01, -1.8416e-01, ..., -3.7520e-01, -1.2431e-01, 2.3752e-01], [-2.1876e-01, 9.4091e-02, 7.5816e-02, ..., -3.6331e-01, -1.6984e-01, -1.1125e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 5.5879e-09, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, -5.5879e-09], [ 0.0000e+00, 9.3132e-10, 9.3132e-10, ..., -0.0000e+00, 0.0000e+00, 1.8626e-09], ..., [-9.3132e-10, 4.6566e-09, -1.8626e-09, ..., -0.0000e+00, -9.3132e-10, 6.5193e-09], [-4.6566e-08, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -6.4261e-08], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 445, bias, value: tensor([-0.0150, -0.0220, -0.0104, -0.0200, 0.0046, -0.0005, 0.0087, 0.0229, 0.0118, -0.0118], device='cuda:0'), grad: tensor([ 6.5193e-09, -9.3132e-10, 2.7940e-09, -9.3132e-09, -1.7695e-08, -3.7253e-09, 8.6613e-08, 1.6764e-08, -8.3819e-08, 3.7253e-09], device='cuda:0') 100 1e-05 changing lr epoch 444, time 214.72, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4291 re_mapping 0.0028 re_causal 0.0098 /// teacc 99.10 lr 0.00001000 Epoch 446, weight, value: tensor([[-3.2147e-01, 8.7118e-02, -2.1366e-01, ..., -1.1412e-01, -2.9171e-01, -1.7640e-01], [-8.4646e-03, 1.0663e-01, -1.2951e-01, ..., -1.4440e-01, -8.4726e-02, 1.3277e-01], [ 5.0585e-02, -1.5913e-01, -1.9348e-01, ..., 1.8265e-04, -6.2715e-02, -1.2306e-01], ..., [ 1.1585e-01, -1.1227e-01, 1.4863e-01, ..., 7.7017e-02, 2.2659e-01, -2.5363e-02], [ 1.4439e-01, -2.6784e-01, -1.8423e-01, ..., -3.7530e-01, -1.2439e-01, 2.3759e-01], [-2.1881e-01, 9.4123e-02, 7.5860e-02, ..., -3.6338e-01, -1.6986e-01, -1.1131e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.1176e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 9.3132e-10, 1.8626e-09, ..., 9.3132e-10, 9.3132e-10, 9.3132e-10], [ 4.6566e-09, 1.8626e-09, 2.7940e-09, ..., 1.8626e-09, 3.7253e-09, 9.3132e-10], ..., [-4.6566e-09, 0.0000e+00, -1.8626e-09, ..., -1.8626e-09, -3.7253e-09, 0.0000e+00], [-3.7253e-09, 2.0489e-08, 9.3132e-10, ..., 0.0000e+00, 9.3132e-10, 9.3132e-10], [ 3.7253e-09, -3.4459e-08, -1.1176e-08, ..., 0.0000e+00, 0.0000e+00, -3.7253e-09]], device='cuda:0') Epoch 446, bias, value: tensor([-0.0150, -0.0220, -0.0103, -0.0201, 0.0045, -0.0005, 0.0087, 0.0228, 0.0117, -0.0118], device='cuda:0'), grad: tensor([-3.0734e-08, 9.3132e-09, 1.4901e-08, 1.7695e-08, 1.6764e-08, 1.6764e-08, 1.1176e-08, -5.5879e-09, 6.0536e-08, -1.0990e-07], device='cuda:0') 100 1e-05 changing lr epoch 445, time 214.81, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4334 re_mapping 0.0028 re_causal 0.0101 /// teacc 99.08 lr 0.00001000 Epoch 447, weight, value: tensor([[-3.2152e-01, 8.7139e-02, -2.1369e-01, ..., -1.1419e-01, -2.9175e-01, -1.7642e-01], [-8.4503e-03, 1.0674e-01, -1.2949e-01, ..., -1.4453e-01, -8.4714e-02, 1.3286e-01], [ 5.0608e-02, -1.5921e-01, -1.9357e-01, ..., 2.1627e-04, -6.2683e-02, -1.2304e-01], ..., [ 1.1583e-01, -1.1246e-01, 1.4865e-01, ..., 7.7019e-02, 2.2658e-01, -2.5442e-02], [ 1.4438e-01, -2.6795e-01, -1.8429e-01, ..., -3.7538e-01, -1.2447e-01, 2.3762e-01], [-2.1887e-01, 9.4146e-02, 7.5888e-02, ..., -3.6346e-01, -1.6989e-01, -1.1138e-01]], device='cuda:0'), grad: tensor([[ 3.7253e-09, -1.8626e-09, 0.0000e+00, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [-8.3819e-09, 0.0000e+00, 0.0000e+00, ..., -8.3819e-09, 0.0000e+00, 0.0000e+00], ..., [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 447, bias, value: tensor([-0.0150, -0.0219, -0.0103, -0.0201, 0.0045, -0.0005, 0.0087, 0.0228, 0.0116, -0.0118], device='cuda:0'), grad: tensor([ 1.1176e-08, 1.0245e-08, -4.4703e-08, 8.3819e-09, 1.8626e-09, -2.7940e-09, 1.3039e-08, 4.6566e-09, 3.7253e-09, 9.3132e-10], device='cuda:0') 100 1e-05 changing lr epoch 446, time 214.66, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4485 re_mapping 0.0029 re_causal 0.0102 /// teacc 99.11 lr 0.00001000 Epoch 448, weight, value: tensor([[-3.2156e-01, 8.7184e-02, -2.1371e-01, ..., -1.1422e-01, -2.9177e-01, -1.7642e-01], [-8.4533e-03, 1.0676e-01, -1.2948e-01, ..., -1.4471e-01, -8.4728e-02, 1.3291e-01], [ 5.0642e-02, -1.5927e-01, -1.9366e-01, ..., 2.5576e-04, -6.2654e-02, -1.2298e-01], ..., [ 1.1583e-01, -1.1257e-01, 1.4864e-01, ..., 7.7020e-02, 2.2659e-01, -2.5492e-02], [ 1.4437e-01, -2.6807e-01, -1.8441e-01, ..., -3.7546e-01, -1.2454e-01, 2.3766e-01], [-2.1890e-01, 9.4147e-02, 7.5973e-02, ..., -3.6352e-01, -1.6991e-01, -1.1143e-01]], device='cuda:0'), grad: tensor([[ 1.1176e-08, 0.0000e+00, 0.0000e+00, ..., 7.4506e-09, 2.7940e-09, 1.8626e-09], [ 3.7253e-09, -1.8626e-09, 1.8626e-09, ..., 9.3132e-10, 1.8626e-09, -9.3132e-10], [-1.9558e-08, 0.0000e+00, 9.3132e-10, ..., -1.5832e-08, -4.6566e-09, 1.8626e-09], ..., [-2.7940e-09, 9.3132e-10, -1.8626e-09, ..., -1.8626e-09, -4.6566e-09, 1.8626e-09], [-9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 8.3819e-09, 2.7940e-09, -1.6764e-08], [ 2.7940e-09, -1.8626e-09, -3.7253e-09, ..., 0.0000e+00, 9.3132e-10, 3.7253e-09]], device='cuda:0') Epoch 448, bias, value: tensor([-0.0149, -0.0219, -0.0102, -0.0201, 0.0045, -0.0005, 0.0087, 0.0227, 0.0115, -0.0118], device='cuda:0'), grad: tensor([ 5.2154e-08, 2.7940e-09, -1.0617e-07, 4.2841e-08, 1.3039e-08, -4.3772e-08, 9.3132e-09, 1.8626e-09, 2.7940e-08, 0.0000e+00], device='cuda:0') 100 1e-05 changing lr epoch 447, time 214.70, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4007 re_mapping 0.0028 re_causal 0.0096 /// teacc 99.12 lr 0.00001000 Epoch 449, weight, value: tensor([[-3.2163e-01, 8.7166e-02, -2.1381e-01, ..., -1.1426e-01, -2.9181e-01, -1.7644e-01], [-8.4302e-03, 1.0681e-01, -1.2947e-01, ..., -1.4482e-01, -8.4709e-02, 1.3299e-01], [ 5.0651e-02, -1.5932e-01, -1.9373e-01, ..., 2.8155e-04, -6.2646e-02, -1.2299e-01], ..., [ 1.1582e-01, -1.1272e-01, 1.4864e-01, ..., 7.7034e-02, 2.2659e-01, -2.5557e-02], [ 1.4429e-01, -2.6820e-01, -1.8456e-01, ..., -3.7560e-01, -1.2467e-01, 2.3763e-01], [-2.1892e-01, 9.4267e-02, 7.6056e-02, ..., -3.6358e-01, -1.6993e-01, -1.1145e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-09, 5.5879e-09, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-09, -4.6566e-09, 8.3819e-09, ..., 4.6566e-09, 5.5879e-09, -7.1712e-08], [ 1.3970e-08, 1.8626e-09, 1.3039e-08, ..., 8.3819e-09, 1.0245e-08, 8.3819e-09], ..., [-2.7008e-08, 4.6566e-09, -2.1420e-08, ..., -1.2107e-08, -2.2352e-08, 5.1223e-08], [ 0.0000e+00, 9.3132e-10, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 5.5879e-09], [ 2.7940e-09, 1.1176e-08, 1.5832e-08, ..., 7.4506e-09, 1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 449, bias, value: tensor([-0.0150, -0.0219, -0.0102, -0.0201, 0.0045, -0.0005, 0.0087, 0.0227, 0.0114, -0.0117], device='cuda:0'), grad: tensor([ 1.2107e-08, -7.6368e-08, 3.8184e-08, -9.4064e-08, 7.4506e-09, 5.0291e-08, 9.3132e-10, 1.7695e-08, 9.3132e-09, 3.4459e-08], device='cuda:0') 100 1e-05 changing lr epoch 448, time 214.83, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4423 re_mapping 0.0028 re_causal 0.0100 /// teacc 99.11 lr 0.00001000 Epoch 450, weight, value: tensor([[-3.2167e-01, 8.7179e-02, -2.1386e-01, ..., -1.1428e-01, -2.9183e-01, -1.7645e-01], [-8.4034e-03, 1.0692e-01, -1.2941e-01, ..., -1.4490e-01, -8.4705e-02, 1.3310e-01], [ 5.0666e-02, -1.5939e-01, -1.9379e-01, ..., 2.9645e-04, -6.2621e-02, -1.2302e-01], ..., [ 1.1580e-01, -1.1292e-01, 1.4860e-01, ..., 7.7028e-02, 2.2658e-01, -2.5653e-02], [ 1.4433e-01, -2.6828e-01, -1.8463e-01, ..., -3.7566e-01, -1.2472e-01, 2.3772e-01], [-2.1900e-01, 9.4276e-02, 7.6106e-02, ..., -3.6365e-01, -1.6996e-01, -1.1158e-01]], device='cuda:0'), grad: tensor([[ 5.5879e-09, -1.8626e-09, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 2.4214e-08], [-2.5146e-08, -3.1665e-08, 3.7253e-09, ..., 1.8626e-09, 9.3132e-10, -1.4063e-07], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., -9.3132e-10, -9.3132e-10, 5.5879e-09], ..., [ 2.5146e-08, 3.2596e-08, 0.0000e+00, ..., 0.0000e+00, -0.0000e+00, 1.3132e-07], [-1.6764e-08, -3.7253e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -7.3574e-08], [-9.3132e-10, -2.1420e-08, -3.2596e-08, ..., 0.0000e+00, 0.0000e+00, 2.8871e-08]], device='cuda:0') Epoch 450, bias, value: tensor([-0.0150, -0.0218, -0.0101, -0.0201, 0.0045, -0.0005, 0.0087, 0.0226, 0.0114, -0.0118], device='cuda:0'), grad: tensor([ 5.7742e-08, -3.3155e-07, 1.2107e-08, 1.4901e-08, 6.1467e-08, 1.3039e-08, 4.3772e-08, 3.2876e-07, -2.0303e-07, 4.6566e-09], device='cuda:0') 100 1e-05 changing lr epoch 449, time 214.87, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4176 re_mapping 0.0028 re_causal 0.0098 /// teacc 99.10 lr 0.00001000 Epoch 451, weight, value: tensor([[-3.2173e-01, 8.7190e-02, -2.1395e-01, ..., -1.1432e-01, -2.9185e-01, -1.7647e-01], [-8.3951e-03, 1.0708e-01, -1.2943e-01, ..., -1.4502e-01, -8.4713e-02, 1.3321e-01], [ 5.0699e-02, -1.5950e-01, -1.9387e-01, ..., 3.4302e-04, -6.2581e-02, -1.2301e-01], ..., [ 1.1578e-01, -1.1314e-01, 1.4862e-01, ..., 7.7003e-02, 2.2659e-01, -2.5740e-02], [ 1.4433e-01, -2.6843e-01, -1.8478e-01, ..., -3.7580e-01, -1.2485e-01, 2.3778e-01], [-2.1906e-01, 9.4314e-02, 7.6228e-02, ..., -3.6374e-01, -1.6998e-01, -1.1168e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [-1.8626e-09, 0.0000e+00, 0.0000e+00, ..., -2.7940e-09, -9.3132e-10, 9.3132e-10], ..., [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -3.1665e-08, -3.8184e-08, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 451, bias, value: tensor([-0.0150, -0.0218, -0.0101, -0.0201, 0.0045, -0.0006, 0.0087, 0.0226, 0.0113, -0.0118], device='cuda:0'), grad: tensor([ 9.3132e-10, 3.7253e-09, -1.0245e-08, 1.8626e-09, 6.9849e-08, 9.3132e-10, 0.0000e+00, 2.7940e-09, 3.7253e-09, -6.8918e-08], device='cuda:0') 100 1e-05 changing lr epoch 450, time 215.30, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4185 re_mapping 0.0028 re_causal 0.0096 /// teacc 99.12 lr 0.00001000 Epoch 452, weight, value: tensor([[-0.3218, 0.0872, -0.2141, ..., -0.1144, -0.2919, -0.1765], [-0.0084, 0.1072, -0.1294, ..., -0.1452, -0.0847, 0.1333], [ 0.0507, -0.1596, -0.1939, ..., 0.0004, -0.0625, -0.1230], ..., [ 0.1158, -0.1133, 0.1486, ..., 0.0770, 0.2266, -0.0258], [ 0.1444, -0.2685, -0.1849, ..., -0.3760, -0.1250, 0.2379], [-0.2191, 0.0943, 0.0763, ..., -0.3638, -0.1700, -0.1117]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.3970e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 2.7940e-09, -3.7253e-09, 3.7253e-09, ..., 9.3132e-10, 2.7940e-09, -1.5832e-08], [ 3.7253e-09, 9.3132e-10, 2.7940e-09, ..., 1.8626e-09, 2.7940e-09, 0.0000e+00], ..., [-8.3819e-09, 3.7253e-09, -8.3819e-09, ..., -3.7253e-09, -7.4506e-09, 1.2107e-08], [-0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -9.3132e-10], [ 1.3970e-08, 3.8184e-08, 2.9802e-08, ..., 0.0000e+00, 0.0000e+00, 1.8626e-08]], device='cuda:0') Epoch 452, bias, value: tensor([-0.0150, -0.0218, -0.0100, -0.0201, 0.0044, -0.0007, 0.0087, 0.0225, 0.0112, -0.0118], device='cuda:0'), grad: tensor([-3.2596e-08, -2.4214e-08, 1.0245e-08, 5.5879e-09, -1.5646e-07, 1.8626e-09, 1.3970e-08, 1.3970e-08, 0.0000e+00, 1.6578e-07], device='cuda:0') 100 1e-05 changing lr epoch 451, time 214.98, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4444 re_mapping 0.0027 re_causal 0.0099 /// teacc 99.16 lr 0.00001000 Epoch 453, weight, value: tensor([[-0.3219, 0.0872, -0.2143, ..., -0.1145, -0.2919, -0.1765], [-0.0084, 0.1073, -0.1294, ..., -0.1454, -0.0847, 0.1334], [ 0.0507, -0.1596, -0.1940, ..., 0.0004, -0.0625, -0.1230], ..., [ 0.1158, -0.1135, 0.1487, ..., 0.0770, 0.2266, -0.0259], [ 0.1444, -0.2686, -0.1850, ..., -0.3761, -0.1251, 0.2380], [-0.2191, 0.0944, 0.0765, ..., -0.3639, -0.1700, -0.1117]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -9.3132e-10, 3.7253e-09, ..., 2.7940e-09, 9.3132e-10, -7.4506e-09], [-9.3132e-10, 9.3132e-10, 1.8626e-09, ..., -0.0000e+00, -0.0000e+00, 0.0000e+00], ..., [-1.8626e-09, 1.8626e-09, -1.8626e-09, ..., 0.0000e+00, -9.3132e-10, 4.6566e-09], [ 0.0000e+00, 9.3132e-10, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 9.3132e-10, -9.3132e-10, -3.7253e-09, ..., 9.3132e-10, 9.3132e-10, 9.3132e-10]], device='cuda:0') Epoch 453, bias, value: tensor([-0.0150, -0.0217, -0.0100, -0.0201, 0.0044, -0.0007, 0.0087, 0.0225, 0.0112, -0.0117], device='cuda:0'), grad: tensor([ 9.3132e-10, -4.6566e-09, -1.8626e-09, -8.0094e-08, 9.3132e-09, 6.0536e-08, 9.3132e-09, 6.5193e-09, 6.5193e-09, -4.6566e-09], device='cuda:0') 100 1e-05 changing lr epoch 452, time 214.83, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4656 re_mapping 0.0027 re_causal 0.0099 /// teacc 99.12 lr 0.00001000 Epoch 454, weight, value: tensor([[-0.3219, 0.0872, -0.2143, ..., -0.1145, -0.2920, -0.1765], [-0.0084, 0.1073, -0.1295, ..., -0.1457, -0.0848, 0.1334], [ 0.0507, -0.1597, -0.1941, ..., 0.0004, -0.0625, -0.1231], ..., [ 0.1158, -0.1135, 0.1487, ..., 0.0771, 0.2266, -0.0259], [ 0.1445, -0.2688, -0.1851, ..., -0.3763, -0.1252, 0.2381], [-0.2193, 0.0944, 0.0765, ..., -0.3640, -0.1701, -0.1119]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -2.1514e-07, 0.0000e+00, ..., -6.4261e-08, 0.0000e+00, 0.0000e+00], [ 9.3132e-08, 2.7940e-09, 0.0000e+00, ..., 9.0338e-08, 1.4901e-08, 4.7497e-08], [-1.0710e-07, 2.7940e-09, 0.0000e+00, ..., -1.0058e-07, -1.7695e-08, -5.5879e-08], ..., [ 1.0245e-08, 1.8626e-09, 9.3132e-10, ..., 1.0245e-08, 1.8626e-09, 7.4506e-09], [ 0.0000e+00, 1.4901e-08, 9.3132e-10, ..., 4.6566e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, -5.5879e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 454, bias, value: tensor([-0.0149, -0.0217, -0.0100, -0.0201, 0.0044, -0.0007, 0.0087, 0.0225, 0.0113, -0.0118], device='cuda:0'), grad: tensor([-5.0850e-07, 5.2433e-07, -5.8115e-07, 4.5169e-07, 7.4506e-09, 3.7253e-09, 7.4506e-09, 6.1467e-08, 3.7253e-08, -9.3132e-10], device='cuda:0') 100 1e-05 changing lr epoch 453, time 214.61, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4228 re_mapping 0.0027 re_causal 0.0098 /// teacc 99.14 lr 0.00001000 Epoch 455, weight, value: tensor([[-0.3220, 0.0873, -0.2143, ..., -0.1145, -0.2920, -0.1765], [-0.0084, 0.1073, -0.1295, ..., -0.1459, -0.0848, 0.1334], [ 0.0507, -0.1599, -0.1942, ..., 0.0005, -0.0625, -0.1231], ..., [ 0.1159, -0.1135, 0.1488, ..., 0.0771, 0.2268, -0.0259], [ 0.1445, -0.2689, -0.1852, ..., -0.3764, -0.1253, 0.2382], [-0.2194, 0.0944, 0.0765, ..., -0.3642, -0.1701, -0.1120]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 9.3132e-10, 1.3039e-08, 1.2107e-08, ..., 4.6566e-09, 2.7940e-09, 2.7940e-09], [ 0.0000e+00, 9.3132e-10, -5.5879e-09, ..., -5.5879e-09, -4.6566e-09, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-09, 5.5879e-09, ..., 1.8626e-09, 9.3132e-10, 4.6566e-09], [ 0.0000e+00, 1.8626e-09, 1.8626e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.5635e-08, 1.1176e-08, ..., 9.3132e-10, 1.8626e-09, 2.0489e-08]], device='cuda:0') Epoch 455, bias, value: tensor([-0.0149, -0.0218, -0.0100, -0.0202, 0.0045, -0.0008, 0.0087, 0.0226, 0.0112, -0.0119], device='cuda:0'), grad: tensor([ 1.0245e-08, 7.0781e-08, -7.9162e-08, 0.0000e+00, -1.8999e-07, -1.9558e-08, 1.8626e-08, 3.9116e-08, 7.4506e-09, 1.5367e-07], device='cuda:0') 100 1e-05 changing lr epoch 454, time 214.78, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4406 re_mapping 0.0027 re_causal 0.0099 /// teacc 99.14 lr 0.00001000 Epoch 456, weight, value: tensor([[-0.3220, 0.0873, -0.2143, ..., -0.1146, -0.2920, -0.1765], [-0.0085, 0.1073, -0.1296, ..., -0.1460, -0.0849, 0.1335], [ 0.0507, -0.1600, -0.1943, ..., 0.0005, -0.0625, -0.1232], ..., [ 0.1159, -0.1136, 0.1489, ..., 0.0771, 0.2268, -0.0260], [ 0.1445, -0.2690, -0.1853, ..., -0.3765, -0.1253, 0.2383], [-0.2194, 0.0944, 0.0765, ..., -0.3643, -0.1702, -0.1121]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -6.7055e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -0.0000e+00], [-8.3819e-09, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -7.7300e-08], [ 2.7940e-09, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.9558e-08], ..., [ 1.8626e-09, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.4901e-08], [ 9.3132e-10, -9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.2596e-08], [ 0.0000e+00, 2.7008e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 456, bias, value: tensor([-0.0149, -0.0218, -0.0100, -0.0202, 0.0045, -0.0009, 0.0088, 0.0226, 0.0112, -0.0119], device='cuda:0'), grad: tensor([-1.3784e-07, -9.4064e-08, 2.7008e-08, 2.7940e-09, 2.7940e-09, 1.0245e-08, 8.3819e-08, 1.9558e-08, 3.7253e-08, 5.8673e-08], device='cuda:0') 100 1e-05 changing lr epoch 455, time 214.57, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4282 re_mapping 0.0027 re_causal 0.0097 /// teacc 99.12 lr 0.00001000 Epoch 457, weight, value: tensor([[-0.3220, 0.0874, -0.2144, ..., -0.1146, -0.2920, -0.1766], [-0.0085, 0.1075, -0.1296, ..., -0.1462, -0.0849, 0.1336], [ 0.0508, -0.1602, -0.1944, ..., 0.0005, -0.0625, -0.1232], ..., [ 0.1159, -0.1138, 0.1490, ..., 0.0772, 0.2269, -0.0260], [ 0.1446, -0.2691, -0.1854, ..., -0.3766, -0.1254, 0.2384], [-0.2196, 0.0944, 0.0766, ..., -0.3644, -0.1702, -0.1123]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 2.6077e-08, 9.3132e-10, 2.0489e-08, ..., 2.7940e-08, 2.4214e-08, 0.0000e+00], [-3.2596e-08, 0.0000e+00, 1.0245e-08, ..., -3.5390e-08, -2.7940e-09, 9.3132e-10], ..., [-1.5832e-08, 0.0000e+00, -4.6566e-08, ..., -6.5193e-09, -4.3772e-08, 9.3132e-10], [ 2.7940e-09, 9.3132e-10, 5.5879e-09, ..., 2.7940e-09, 6.5193e-09, -2.7940e-09], [ 3.7253e-09, -0.0000e+00, 2.7940e-09, ..., 1.8626e-09, 4.6566e-09, 0.0000e+00]], device='cuda:0') Epoch 457, bias, value: tensor([-0.0148, -0.0218, -0.0100, -0.0201, 0.0044, -0.0009, 0.0088, 0.0226, 0.0112, -0.0120], device='cuda:0'), grad: tensor([ 6.5193e-09, 1.6578e-07, -2.1420e-07, 8.1956e-08, 2.5146e-08, -5.0291e-08, 5.5879e-09, -3.5390e-08, 7.4506e-09, 9.3132e-09], device='cuda:0') 100 1e-05 changing lr epoch 456, time 215.10, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4305 re_mapping 0.0027 re_causal 0.0096 /// teacc 99.13 lr 0.00001000 Epoch 458, weight, value: tensor([[-0.3221, 0.0874, -0.2144, ..., -0.1146, -0.2921, -0.1766], [-0.0085, 0.1075, -0.1296, ..., -0.1463, -0.0849, 0.1336], [ 0.0508, -0.1603, -0.1945, ..., 0.0006, -0.0624, -0.1232], ..., [ 0.1159, -0.1138, 0.1490, ..., 0.0771, 0.2268, -0.0261], [ 0.1447, -0.2692, -0.1855, ..., -0.3767, -0.1255, 0.2385], [-0.2196, 0.0943, 0.0767, ..., -0.3645, -0.1702, -0.1124]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.1176e-08, -1.8626e-09, 6.5193e-09, ..., 2.7940e-09, 3.7253e-09, -7.4506e-09], [-5.5879e-08, 9.3132e-10, 9.3132e-10, ..., -3.7253e-09, 9.3132e-10, 9.3132e-10], ..., [-4.6566e-09, 3.7253e-09, -6.5193e-09, ..., -1.8626e-09, -4.6566e-09, 5.5879e-09], [ 3.3528e-08, 0.0000e+00, 9.3132e-10, ..., 3.7253e-09, 0.0000e+00, -1.5832e-08], [ 3.7253e-09, 5.5879e-09, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 458, bias, value: tensor([-0.0148, -0.0218, -0.0099, -0.0201, 0.0044, -0.0009, 0.0088, 0.0225, 0.0112, -0.0121], device='cuda:0'), grad: tensor([ 0.0000e+00, 2.1420e-08, -1.7881e-07, -1.3970e-08, -1.7695e-08, 2.1420e-08, 3.4459e-08, 1.0245e-08, 1.0617e-07, 2.4214e-08], device='cuda:0') 100 1e-05 changing lr epoch 457, time 214.68, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4558 re_mapping 0.0027 re_causal 0.0099 /// teacc 99.10 lr 0.00001000 Epoch 459, weight, value: tensor([[-0.3221, 0.0875, -0.2145, ..., -0.1147, -0.2921, -0.1766], [-0.0085, 0.1075, -0.1297, ..., -0.1464, -0.0849, 0.1337], [ 0.0508, -0.1604, -0.1948, ..., 0.0006, -0.0625, -0.1233], ..., [ 0.1159, -0.1139, 0.1491, ..., 0.0772, 0.2269, -0.0261], [ 0.1448, -0.2693, -0.1856, ..., -0.3768, -0.1255, 0.2387], [-0.2197, 0.0943, 0.0768, ..., -0.3646, -0.1702, -0.1125]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 5.5879e-09, 7.4506e-09, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 9.3132e-09, 3.7253e-09, 9.3132e-10, ..., 0.0000e+00, 9.3132e-10, 1.8626e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], ..., [-1.8626e-09, 9.3132e-10, -9.3132e-10, ..., -0.0000e+00, -1.8626e-09, 1.8626e-09], [-1.5926e-07, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, -2.1607e-07], [ 9.3132e-10, -1.4901e-08, -1.9558e-08, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 459, bias, value: tensor([-0.0147, -0.0218, -0.0100, -0.0202, 0.0045, -0.0009, 0.0088, 0.0226, 0.0111, -0.0121], device='cuda:0'), grad: tensor([ 1.7695e-08, 6.2399e-08, 5.5879e-09, 4.2841e-08, 7.4506e-09, -9.3132e-10, 7.3854e-07, 1.8626e-09, -8.3540e-07, -3.5390e-08], device='cuda:0') 100 1e-05 changing lr epoch 458, time 215.19, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4086 re_mapping 0.0026 re_causal 0.0094 /// teacc 99.09 lr 0.00001000 Epoch 460, weight, value: tensor([[-0.3221, 0.0875, -0.2145, ..., -0.1147, -0.2921, -0.1766], [-0.0086, 0.1075, -0.1297, ..., -0.1466, -0.0850, 0.1337], [ 0.0508, -0.1605, -0.1949, ..., 0.0006, -0.0625, -0.1233], ..., [ 0.1160, -0.1140, 0.1492, ..., 0.0772, 0.2270, -0.0261], [ 0.1449, -0.2695, -0.1857, ..., -0.3769, -0.1256, 0.2388], [-0.2198, 0.0943, 0.0768, ..., -0.3647, -0.1703, -0.1126]], device='cuda:0'), grad: tensor([[-1.8626e-09, -8.3819e-09, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, -2.7940e-09], [ 9.3132e-10, -0.0000e+00, 1.8626e-08, ..., 1.3970e-08, 1.8626e-09, -1.8626e-08], [-8.3819e-09, 1.8626e-09, 4.6566e-09, ..., -3.7253e-09, -4.6566e-09, 9.3132e-10], ..., [ 4.6566e-09, 5.5879e-09, 9.3132e-10, ..., 6.5193e-09, 9.3132e-10, 9.3132e-09], [ 1.8626e-09, 4.6566e-09, 2.7940e-09, ..., 1.8626e-09, 0.0000e+00, 8.3819e-09], [ 9.3132e-10, -9.3132e-10, -4.6566e-09, ..., 9.3132e-10, 0.0000e+00, 2.7940e-09]], device='cuda:0') Epoch 460, bias, value: tensor([-0.0147, -0.0218, -0.0099, -0.0203, 0.0045, -0.0009, 0.0088, 0.0226, 0.0112, -0.0122], device='cuda:0'), grad: tensor([-1.4901e-08, 1.4901e-08, -1.3039e-08, -6.7987e-08, 8.3819e-09, 7.4506e-09, -1.8626e-09, 4.3772e-08, 2.6077e-08, 9.3132e-10], device='cuda:0') 100 1e-05 changing lr epoch 459, time 215.51, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4545 re_mapping 0.0026 re_causal 0.0100 /// teacc 99.12 lr 0.00001000 Epoch 461, weight, value: tensor([[-0.3222, 0.0875, -0.2145, ..., -0.1148, -0.2922, -0.1766], [-0.0086, 0.1076, -0.1298, ..., -0.1466, -0.0851, 0.1338], [ 0.0508, -0.1606, -0.1951, ..., 0.0006, -0.0625, -0.1233], ..., [ 0.1160, -0.1141, 0.1493, ..., 0.0772, 0.2271, -0.0261], [ 0.1449, -0.2696, -0.1858, ..., -0.3769, -0.1257, 0.2389], [-0.2200, 0.0944, 0.0768, ..., -0.3648, -0.1704, -0.1128]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 4.0978e-08, 0.0000e+00, ..., 2.7940e-09, 3.7253e-09, 6.2399e-08], [ 2.5146e-08, -9.0804e-08, 1.2107e-08, ..., 1.6764e-08, 2.5611e-08, -1.3644e-07], [-5.8673e-08, 2.3283e-09, 1.4435e-08, ..., -4.6566e-08, -5.2154e-08, 5.5879e-09], ..., [ 1.2573e-08, 2.1886e-08, -3.3062e-08, ..., 1.4901e-08, 5.1223e-09, 2.7940e-08], [ 4.1910e-09, 9.7789e-09, 0.0000e+00, ..., 2.7940e-09, 4.1910e-09, 1.4901e-08], [ 3.2596e-09, 3.2596e-09, 1.8626e-09, ..., 1.8626e-09, 3.2596e-09, 5.1223e-09]], device='cuda:0') Epoch 461, bias, value: tensor([-0.0147, -0.0218, -0.0099, -0.0203, 0.0046, -0.0008, 0.0087, 0.0226, 0.0111, -0.0122], device='cuda:0'), grad: tensor([ 1.8487e-07, -2.9150e-07, -2.5425e-07, 1.6298e-08, 1.4901e-08, 2.8871e-08, 4.5169e-08, 1.9278e-07, 5.5414e-08, 2.3283e-08], device='cuda:0') 100 1e-05 changing lr epoch 460, time 215.11, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4339 re_mapping 0.0027 re_causal 0.0100 /// teacc 99.14 lr 0.00001000 Epoch 462, weight, value: tensor([[-0.3223, 0.0875, -0.2146, ..., -0.1148, -0.2922, -0.1766], [-0.0085, 0.1078, -0.1298, ..., -0.1467, -0.0850, 0.1339], [ 0.0508, -0.1607, -0.1952, ..., 0.0007, -0.0625, -0.1234], ..., [ 0.1160, -0.1143, 0.1493, ..., 0.0772, 0.2270, -0.0262], [ 0.1448, -0.2698, -0.1859, ..., -0.3770, -0.1258, 0.2388], [-0.2201, 0.0944, 0.0769, ..., -0.3649, -0.1704, -0.1129]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 9.3132e-10, 1.3970e-09, ..., 0.0000e+00, 4.6566e-10, 0.0000e+00], [ 1.8626e-09, -8.3819e-09, -4.6566e-10, ..., 1.3970e-09, 1.8626e-09, -2.0489e-08], [-0.0000e+00, 4.6566e-10, 2.3283e-09, ..., -2.7940e-09, -2.7940e-09, 4.1910e-09], ..., [ 2.7940e-09, 1.1642e-08, 2.3283e-09, ..., 2.7940e-09, 4.6566e-10, 2.1420e-08], [-3.3528e-08, 1.8626e-09, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, -6.1933e-08], [-1.3970e-09, -2.3283e-09, -1.4435e-08, ..., 0.0000e+00, -0.0000e+00, 5.1223e-09]], device='cuda:0') Epoch 462, bias, value: tensor([-0.0147, -0.0217, -0.0099, -0.0203, 0.0046, -0.0008, 0.0087, 0.0225, 0.0109, -0.0123], device='cuda:0'), grad: tensor([ 4.6566e-09, -3.6787e-08, 0.0000e+00, 6.9384e-08, -1.2573e-08, -6.5193e-08, 7.0781e-08, 6.1467e-08, -8.2888e-08, 4.6566e-10], device='cuda:0') 100 1e-05 changing lr epoch 461, time 214.54, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4392 re_mapping 0.0027 re_causal 0.0097 /// teacc 99.14 lr 0.00001000 Epoch 463, weight, value: tensor([[-0.3223, 0.0876, -0.2146, ..., -0.1149, -0.2922, -0.1767], [-0.0086, 0.1078, -0.1299, ..., -0.1469, -0.0851, 0.1339], [ 0.0508, -0.1608, -0.1953, ..., 0.0007, -0.0624, -0.1234], ..., [ 0.1161, -0.1143, 0.1494, ..., 0.0772, 0.2271, -0.0262], [ 0.1448, -0.2698, -0.1860, ..., -0.3771, -0.1259, 0.2388], [-0.2201, 0.0943, 0.0769, ..., -0.3651, -0.1705, -0.1130]], device='cuda:0'), grad: tensor([[-0.0000e+00, -9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 1.3970e-09, ..., 9.3132e-10, 0.0000e+00, -1.3970e-09], [ 4.6566e-09, 4.6566e-10, 9.3132e-10, ..., 2.7940e-09, 0.0000e+00, 3.7253e-09], ..., [ 0.0000e+00, 1.8626e-09, 9.3132e-10, ..., 4.6566e-10, 0.0000e+00, 2.3283e-09], [-6.5193e-09, 4.6566e-10, 4.6566e-10, ..., -2.7940e-09, 0.0000e+00, -4.6566e-09], [ 0.0000e+00, 1.1642e-08, 3.2596e-09, ..., 0.0000e+00, -0.0000e+00, 3.2596e-09]], device='cuda:0') Epoch 463, bias, value: tensor([-0.0147, -0.0217, -0.0098, -0.0202, 0.0046, -0.0009, 0.0087, 0.0226, 0.0109, -0.0124], device='cuda:0'), grad: tensor([-3.2596e-09, 4.6566e-09, 1.4901e-08, -3.2596e-09, -3.5390e-08, 1.5832e-08, -5.1223e-08, 7.9162e-09, 2.4680e-08, 3.4459e-08], device='cuda:0') 100 1e-05 changing lr epoch 462, time 214.91, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4106 re_mapping 0.0027 re_causal 0.0095 /// teacc 99.16 lr 0.00001000 Epoch 464, weight, value: tensor([[-0.3223, 0.0876, -0.2146, ..., -0.1149, -0.2922, -0.1767], [-0.0086, 0.1080, -0.1300, ..., -0.1471, -0.0852, 0.1341], [ 0.0509, -0.1609, -0.1954, ..., 0.0008, -0.0624, -0.1234], ..., [ 0.1161, -0.1145, 0.1495, ..., 0.0771, 0.2272, -0.0263], [ 0.1448, -0.2700, -0.1862, ..., -0.3772, -0.1260, 0.2389], [-0.2203, 0.0943, 0.0770, ..., -0.3652, -0.1705, -0.1132]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 3.0734e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 7.9162e-08], [ 8.3819e-08, -1.0151e-07, 3.9116e-08, ..., 9.3132e-09, 5.1223e-08, -1.2759e-07], [-9.3132e-10, 2.7940e-09, 0.0000e+00, ..., -9.3132e-10, -9.3132e-10, 5.5879e-09], ..., [-9.8720e-08, 2.5146e-08, -4.6566e-08, ..., -1.0245e-08, -5.9605e-08, -9.3132e-10], [ 4.6566e-09, 7.4506e-09, 2.7940e-09, ..., 9.3132e-10, 3.7253e-09, 8.3819e-09], [ 8.3819e-09, 3.0734e-08, 5.5879e-09, ..., 9.3132e-10, 4.6566e-09, 1.5832e-08]], device='cuda:0') Epoch 464, bias, value: tensor([-0.0147, -0.0217, -0.0098, -0.0203, 0.0047, -0.0009, 0.0087, 0.0225, 0.0108, -0.0125], device='cuda:0'), grad: tensor([ 1.7416e-07, -2.8312e-07, 1.3039e-08, 3.7253e-09, -1.9558e-08, 2.7940e-09, 3.1665e-08, -7.1712e-08, 4.7497e-08, 9.9652e-08], device='cuda:0') 100 1e-05 changing lr epoch 463, time 214.89, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4100 re_mapping 0.0027 re_causal 0.0098 /// teacc 99.13 lr 0.00001000 Epoch 465, weight, value: tensor([[-0.3223, 0.0876, -0.2147, ..., -0.1149, -0.2923, -0.1767], [-0.0087, 0.1081, -0.1300, ..., -0.1471, -0.0852, 0.1342], [ 0.0509, -0.1610, -0.1954, ..., 0.0008, -0.0624, -0.1235], ..., [ 0.1161, -0.1146, 0.1496, ..., 0.0771, 0.2272, -0.0263], [ 0.1447, -0.2702, -0.1863, ..., -0.3773, -0.1261, 0.2389], [-0.2204, 0.0943, 0.0770, ..., -0.3653, -0.1706, -0.1133]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -6.5193e-09, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [-3.3528e-08, -1.3970e-08, 5.5879e-09, ..., 2.7940e-09, 5.5879e-09, -8.1025e-08], [ 9.3132e-09, 3.7253e-09, 3.7253e-09, ..., 2.7940e-09, 9.3132e-10, 1.8626e-08], ..., [-2.7940e-09, 2.7940e-09, -7.4506e-09, ..., -2.7940e-09, -9.3132e-09, 1.3970e-08], [ 1.4901e-08, 9.3132e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 3.5390e-08], [ 1.8626e-09, 1.8626e-09, -2.7940e-09, ..., 9.3132e-10, 1.8626e-09, 1.8626e-09]], device='cuda:0') Epoch 465, bias, value: tensor([-0.0147, -0.0217, -0.0098, -0.0203, 0.0047, -0.0009, 0.0087, 0.0225, 0.0106, -0.0125], device='cuda:0'), grad: tensor([-1.3039e-08, -2.2724e-07, 5.9605e-08, -9.3132e-09, 6.5193e-09, 9.3132e-09, 3.2596e-08, 2.6077e-08, 1.1269e-07, 1.0245e-08], device='cuda:0') 100 1e-05 changing lr epoch 464, time 214.63, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4240 re_mapping 0.0027 re_causal 0.0097 /// teacc 99.13 lr 0.00001000 Epoch 466, weight, value: tensor([[-0.3223, 0.0877, -0.2147, ..., -0.1150, -0.2923, -0.1767], [-0.0087, 0.1081, -0.1301, ..., -0.1472, -0.0853, 0.1342], [ 0.0509, -0.1612, -0.1955, ..., 0.0008, -0.0624, -0.1235], ..., [ 0.1162, -0.1147, 0.1497, ..., 0.0772, 0.2273, -0.0263], [ 0.1448, -0.2703, -0.1864, ..., -0.3774, -0.1262, 0.2389], [-0.2205, 0.0943, 0.0771, ..., -0.3653, -0.1706, -0.1134]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -3.7253e-09, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 5.5879e-09, -1.6764e-08, 7.4506e-09, ..., 3.7253e-09, 5.5879e-09, -2.7940e-09], [-1.8626e-08, 9.3132e-10, 4.6566e-09, ..., -1.2107e-08, -1.3039e-08, 0.0000e+00], ..., [-3.7253e-09, 1.2107e-08, -3.3528e-08, ..., -9.3132e-09, -4.6566e-09, 2.7940e-09], [ 9.3132e-10, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 9.3132e-10, 0.0000e+00], [ 1.8626e-09, -2.7940e-09, -1.2107e-08, ..., 1.8626e-09, 1.8626e-09, 9.3132e-10]], device='cuda:0') Epoch 466, bias, value: tensor([-0.0146, -0.0217, -0.0098, -0.0203, 0.0047, -0.0009, 0.0087, 0.0226, 0.0106, -0.0126], device='cuda:0'), grad: tensor([-3.7253e-09, -5.1223e-08, -3.7253e-08, 3.7253e-08, 3.6322e-08, 6.9849e-08, -7.0781e-08, 2.4214e-08, 6.5193e-09, -9.3132e-09], device='cuda:0') 100 1e-05 changing lr epoch 465, time 214.65, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4345 re_mapping 0.0027 re_causal 0.0098 /// teacc 99.17 lr 0.00001000 Epoch 467, weight, value: tensor([[-0.3224, 0.0876, -0.2149, ..., -0.1150, -0.2923, -0.1767], [-0.0087, 0.1082, -0.1301, ..., -0.1473, -0.0853, 0.1342], [ 0.0510, -0.1613, -0.1956, ..., 0.0009, -0.0623, -0.1235], ..., [ 0.1162, -0.1148, 0.1497, ..., 0.0771, 0.2274, -0.0264], [ 0.1448, -0.2705, -0.1865, ..., -0.3774, -0.1262, 0.2390], [-0.2206, 0.0944, 0.0771, ..., -0.3654, -0.1707, -0.1135]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 3.7253e-09, -9.3132e-10, 5.5879e-09, ..., 3.7253e-09, 5.5879e-09, -5.5879e-09], [ 1.0245e-08, 0.0000e+00, 1.4901e-08, ..., 3.7253e-09, 1.3039e-08, 9.3132e-10], ..., [-1.8626e-08, 6.5193e-09, -1.7695e-08, ..., -1.2107e-08, -2.3283e-08, 7.4506e-09], [ 9.3132e-10, 3.7253e-09, 6.5193e-09, ..., 9.3132e-10, 9.3132e-10, 9.3132e-10], [ 0.0000e+00, -5.1223e-08, -7.9162e-08, ..., 0.0000e+00, 0.0000e+00, -9.3132e-09]], device='cuda:0') Epoch 467, bias, value: tensor([-0.0146, -0.0217, -0.0097, -0.0203, 0.0047, -0.0009, 0.0087, 0.0226, 0.0105, -0.0126], device='cuda:0'), grad: tensor([ 2.7940e-09, 5.5879e-09, 1.0245e-08, 1.2107e-08, 1.7043e-07, 2.7940e-09, -3.7253e-09, -5.5879e-09, 2.0489e-08, -2.1141e-07], device='cuda:0') 100 1e-05 changing lr epoch 466, time 215.11, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4220 re_mapping 0.0026 re_causal 0.0096 /// teacc 99.12 lr 0.00001000 Epoch 468, weight, value: tensor([[-0.3224, 0.0877, -0.2150, ..., -0.1150, -0.2923, -0.1767], [-0.0086, 0.1084, -0.1300, ..., -0.1475, -0.0852, 0.1345], [ 0.0510, -0.1613, -0.1957, ..., 0.0010, -0.0623, -0.1235], ..., [ 0.1161, -0.1151, 0.1496, ..., 0.0772, 0.2273, -0.0267], [ 0.1449, -0.2706, -0.1866, ..., -0.3775, -0.1263, 0.2391], [-0.2207, 0.0944, 0.0772, ..., -0.3655, -0.1708, -0.1135]], device='cuda:0'), grad: tensor([[ 2.7940e-09, -9.3132e-10, 9.3132e-10, ..., 3.7253e-09, 2.7940e-09, 0.0000e+00], [ 1.5832e-08, 9.3132e-10, 1.3039e-08, ..., 1.1176e-08, 1.5832e-08, 9.3132e-10], [ 2.3283e-08, 0.0000e+00, 4.7497e-08, ..., -1.4901e-08, 2.5146e-08, 9.3132e-10], ..., [-5.8673e-08, 2.7940e-09, -6.7987e-08, ..., -1.5832e-08, -6.1467e-08, 1.8626e-09], [-9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 1.8626e-09, -2.7940e-09], [ 8.3819e-09, 6.5193e-09, 3.7253e-09, ..., 4.6566e-09, 9.3132e-09, 3.7253e-09]], device='cuda:0') Epoch 468, bias, value: tensor([-0.0146, -0.0215, -0.0096, -0.0203, 0.0047, -0.0009, 0.0087, 0.0224, 0.0105, -0.0127], device='cuda:0'), grad: tensor([ 9.3132e-09, 4.4703e-08, 5.5879e-09, 2.1420e-08, -2.7940e-08, 9.3132e-10, 2.7940e-09, -9.3132e-08, -9.3132e-10, 4.1910e-08], device='cuda:0') 100 1e-05 changing lr epoch 467, time 215.20, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4285 re_mapping 0.0026 re_causal 0.0098 /// teacc 99.13 lr 0.00001000 Epoch 469, weight, value: tensor([[-0.3224, 0.0877, -0.2150, ..., -0.1150, -0.2924, -0.1767], [-0.0085, 0.1085, -0.1299, ..., -0.1477, -0.0851, 0.1347], [ 0.0510, -0.1614, -0.1960, ..., 0.0010, -0.0623, -0.1235], ..., [ 0.1161, -0.1153, 0.1497, ..., 0.0773, 0.2273, -0.0268], [ 0.1450, -0.2707, -0.1867, ..., -0.3776, -0.1263, 0.2392], [-0.2209, 0.0944, 0.0772, ..., -0.3657, -0.1709, -0.1136]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -1.3970e-08, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 4.6566e-09, -3.7253e-09, 4.6566e-09, ..., 2.7940e-09, 4.6566e-09, -1.7695e-08], [-3.7253e-09, 0.0000e+00, 7.4506e-09, ..., -1.4901e-08, -1.2107e-08, 9.3132e-10], ..., [-1.2107e-08, 1.8626e-09, -2.8871e-08, ..., 3.7253e-09, -4.6566e-09, 1.0245e-08], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 6.5193e-09, 1.8626e-09, 7.4506e-09, ..., 2.7940e-09, 6.5193e-09, 0.0000e+00]], device='cuda:0') Epoch 469, bias, value: tensor([-0.0146, -0.0214, -0.0096, -0.0205, 0.0047, -0.0008, 0.0086, 0.0223, 0.0105, -0.0127], device='cuda:0'), grad: tensor([-3.1665e-08, -2.4214e-08, -2.8871e-08, 1.1176e-08, 1.4901e-08, 1.6764e-08, -1.8626e-09, 1.1176e-08, 8.3819e-09, 1.8626e-08], device='cuda:0') 100 1e-05 changing lr epoch 468, time 215.25, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4227 re_mapping 0.0026 re_causal 0.0096 /// teacc 99.11 lr 0.00001000 Epoch 470, weight, value: tensor([[-0.3225, 0.0878, -0.2150, ..., -0.1151, -0.2924, -0.1767], [-0.0086, 0.1085, -0.1299, ..., -0.1479, -0.0852, 0.1347], [ 0.0510, -0.1615, -0.1961, ..., 0.0010, -0.0623, -0.1235], ..., [ 0.1161, -0.1154, 0.1497, ..., 0.0774, 0.2274, -0.0269], [ 0.1451, -0.2708, -0.1868, ..., -0.3777, -0.1264, 0.2393], [-0.2210, 0.0944, 0.0773, ..., -0.3658, -0.1710, -0.1137]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 2.7940e-09, 0.0000e+00, 8.3819e-09, ..., 3.7253e-09, 3.7253e-09, -3.7253e-09], [ 2.7940e-09, 0.0000e+00, 5.5879e-09, ..., 2.7940e-09, 3.7253e-09, 0.0000e+00], ..., [-9.3132e-09, 1.8626e-09, -1.7695e-08, ..., -9.3132e-09, -1.2107e-08, 2.7940e-09], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, -5.5879e-09, -6.5193e-09, ..., 2.7940e-09, 1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 470, bias, value: tensor([-0.0146, -0.0214, -0.0096, -0.0206, 0.0047, -0.0009, 0.0086, 0.0223, 0.0105, -0.0127], device='cuda:0'), grad: tensor([ 2.7940e-09, 6.5193e-09, 9.3132e-09, -1.4901e-08, 2.4214e-08, 3.7253e-09, 9.3132e-10, -2.1420e-08, 1.8626e-09, -1.6764e-08], device='cuda:0') 100 1e-05 changing lr epoch 469, time 214.93, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4278 re_mapping 0.0027 re_causal 0.0096 /// teacc 99.11 lr 0.00001000 Epoch 471, weight, value: tensor([[-0.3225, 0.0878, -0.2150, ..., -0.1151, -0.2924, -0.1768], [-0.0086, 0.1085, -0.1300, ..., -0.1481, -0.0852, 0.1348], [ 0.0510, -0.1615, -0.1962, ..., 0.0010, -0.0623, -0.1236], ..., [ 0.1162, -0.1154, 0.1498, ..., 0.0774, 0.2275, -0.0269], [ 0.1453, -0.2708, -0.1868, ..., -0.3777, -0.1263, 0.2396], [-0.2211, 0.0944, 0.0772, ..., -0.3659, -0.1711, -0.1137]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -2.7940e-09, 9.3132e-10, ..., -0.0000e+00, 9.3132e-10, 0.0000e+00], [ 2.7940e-09, -2.7940e-09, 9.3132e-10, ..., 0.0000e+00, 1.8626e-09, -5.5879e-09], [-9.3132e-09, 0.0000e+00, -9.3132e-10, ..., -9.3132e-10, -5.5879e-09, -7.4506e-09], ..., [ 4.6566e-09, 1.3970e-08, 2.0489e-08, ..., 0.0000e+00, 5.5879e-09, 8.3819e-09], [ 5.5879e-09, 7.4506e-09, 9.3132e-10, ..., 0.0000e+00, 9.3132e-10, 1.8626e-09], [-0.0000e+00, -9.3132e-09, -4.4703e-08, ..., 0.0000e+00, -5.5879e-09, 3.7253e-09]], device='cuda:0') Epoch 471, bias, value: tensor([-0.0146, -0.0215, -0.0095, -0.0206, 0.0047, -0.0010, 0.0087, 0.0224, 0.0107, -0.0127], device='cuda:0'), grad: tensor([-1.8626e-09, -5.5879e-09, -2.6077e-08, 8.3819e-09, 6.1467e-08, -8.9407e-08, 1.1176e-08, 7.4506e-08, 4.5635e-08, -6.7987e-08], device='cuda:0') 100 1e-05 changing lr epoch 470, time 215.20, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4095 re_mapping 0.0027 re_causal 0.0096 /// teacc 99.10 lr 0.00001000 Epoch 472, weight, value: tensor([[-0.3226, 0.0878, -0.2150, ..., -0.1151, -0.2925, -0.1768], [-0.0086, 0.1086, -0.1300, ..., -0.1483, -0.0852, 0.1349], [ 0.0511, -0.1617, -0.1962, ..., 0.0013, -0.0621, -0.1237], ..., [ 0.1161, -0.1155, 0.1499, ..., 0.0772, 0.2274, -0.0270], [ 0.1456, -0.2709, -0.1868, ..., -0.3778, -0.1263, 0.2399], [-0.2212, 0.0945, 0.0773, ..., -0.3660, -0.1711, -0.1138]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00], [ 5.1223e-08, 5.5879e-09, 6.3330e-08, ..., 2.7940e-09, 5.6811e-08, 1.3970e-08], [-4.6566e-09, 0.0000e+00, 0.0000e+00, ..., -9.3132e-09, -9.3132e-10, 0.0000e+00], ..., [-6.7055e-08, 1.8626e-09, -8.2888e-08, ..., -0.0000e+00, -7.4506e-08, -1.0245e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.3970e-08, -1.8626e-09, 1.3039e-08, ..., 0.0000e+00, 1.6764e-08, 4.6566e-09]], device='cuda:0') Epoch 472, bias, value: tensor([-0.0146, -0.0215, -0.0094, -0.0206, 0.0046, -0.0010, 0.0087, 0.0223, 0.0109, -0.0127], device='cuda:0'), grad: tensor([ 2.8871e-08, 1.2200e-07, -5.5879e-08, 1.6764e-08, -1.5832e-08, 0.0000e+00, 9.3132e-10, -1.2759e-07, 9.3132e-10, 2.4214e-08], device='cuda:0') 100 1e-05 changing lr epoch 471, time 214.85, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4433 re_mapping 0.0027 re_causal 0.0098 /// teacc 99.13 lr 0.00001000 Epoch 473, weight, value: tensor([[-0.3226, 0.0878, -0.2151, ..., -0.1151, -0.2925, -0.1768], [-0.0086, 0.1086, -0.1301, ..., -0.1484, -0.0853, 0.1350], [ 0.0511, -0.1618, -0.1964, ..., 0.0013, -0.0621, -0.1238], ..., [ 0.1162, -0.1156, 0.1500, ..., 0.0772, 0.2274, -0.0271], [ 0.1458, -0.2711, -0.1869, ..., -0.3779, -0.1263, 0.2400], [-0.2213, 0.0946, 0.0774, ..., -0.3661, -0.1712, -0.1138]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -2.2352e-08, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 9.3132e-10, 4.6566e-09, ..., 1.8626e-09, 1.8626e-09, -1.8626e-09], [-2.7008e-08, 0.0000e+00, 2.7940e-09, ..., -1.3970e-08, -2.1420e-08, 0.0000e+00], ..., [ 2.4214e-08, 9.3132e-10, -2.7940e-09, ..., 1.3970e-08, 1.8626e-08, 1.8626e-09], [-9.3132e-10, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, -1.8626e-09], [ 9.3132e-10, 2.4214e-08, 1.8626e-09, ..., 9.3132e-10, 0.0000e+00, 1.8626e-09]], device='cuda:0') Epoch 473, bias, value: tensor([-0.0146, -0.0215, -0.0094, -0.0206, 0.0045, -0.0010, 0.0088, 0.0223, 0.0110, -0.0127], device='cuda:0'), grad: tensor([-4.7497e-08, 5.5879e-09, -4.3772e-08, -1.3970e-08, -3.7253e-09, 4.6566e-09, 4.6566e-09, 4.5635e-08, -2.7940e-09, 5.5879e-08], device='cuda:0') 100 1e-05 changing lr epoch 472, time 214.89, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4316 re_mapping 0.0027 re_causal 0.0097 /// teacc 99.12 lr 0.00001000 Epoch 474, weight, value: tensor([[-0.3226, 0.0878, -0.2152, ..., -0.1152, -0.2925, -0.1768], [-0.0087, 0.1087, -0.1302, ..., -0.1486, -0.0854, 0.1350], [ 0.0512, -0.1619, -0.1964, ..., 0.0014, -0.0620, -0.1238], ..., [ 0.1162, -0.1157, 0.1501, ..., 0.0771, 0.2275, -0.0271], [ 0.1458, -0.2712, -0.1870, ..., -0.3780, -0.1264, 0.2402], [-0.2215, 0.0946, 0.0773, ..., -0.3661, -0.1714, -0.1139]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -5.5879e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -0.0000e+00], [ 2.7940e-09, 0.0000e+00, 1.0245e-08, ..., 1.3039e-08, 9.3132e-10, 9.3132e-10], [ 9.3132e-10, 0.0000e+00, 5.5879e-09, ..., 5.5879e-09, 0.0000e+00, 0.0000e+00], ..., [ 9.3132e-10, 0.0000e+00, 7.4506e-09, ..., 9.3132e-09, 0.0000e+00, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 5.5879e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 474, bias, value: tensor([-0.0146, -0.0215, -0.0093, -0.0205, 0.0046, -0.0011, 0.0088, 0.0223, 0.0110, -0.0128], device='cuda:0'), grad: tensor([-1.4901e-08, 2.6077e-08, 8.3819e-09, -5.4948e-08, 1.8626e-09, 0.0000e+00, 6.5193e-09, 1.9558e-08, 6.5193e-09, 1.4901e-08], device='cuda:0') 100 1e-05 changing lr epoch 473, time 215.25, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4442 re_mapping 0.0026 re_causal 0.0096 /// teacc 99.12 lr 0.00001000 Epoch 475, weight, value: tensor([[-0.3227, 0.0879, -0.2153, ..., -0.1153, -0.2926, -0.1769], [-0.0087, 0.1087, -0.1301, ..., -0.1487, -0.0854, 0.1351], [ 0.0513, -0.1619, -0.1964, ..., 0.0016, -0.0618, -0.1239], ..., [ 0.1162, -0.1158, 0.1501, ..., 0.0770, 0.2275, -0.0272], [ 0.1459, -0.2713, -0.1872, ..., -0.3781, -0.1265, 0.2402], [-0.2215, 0.0945, 0.0774, ..., -0.3662, -0.1714, -0.1141]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -2.0489e-08, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [-1.8626e-09, 0.0000e+00, -9.3132e-10, ..., -1.8626e-09, -1.8626e-09, 0.0000e+00], ..., [ 9.3132e-10, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.6077e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 475, bias, value: tensor([-0.0145, -0.0215, -0.0091, -0.0205, 0.0046, -0.0012, 0.0088, 0.0222, 0.0109, -0.0129], device='cuda:0'), grad: tensor([-3.4459e-08, 3.7253e-09, -7.4506e-09, 6.7987e-08, -2.3283e-08, -7.3574e-08, 7.4506e-09, 5.5879e-09, 1.8626e-09, 5.4948e-08], device='cuda:0') 100 1e-05 changing lr epoch 474, time 215.01, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4096 re_mapping 0.0026 re_causal 0.0093 /// teacc 99.11 lr 0.00001000 Epoch 476, weight, value: tensor([[-0.3228, 0.0880, -0.2153, ..., -0.1154, -0.2927, -0.1769], [-0.0086, 0.1088, -0.1300, ..., -0.1489, -0.0854, 0.1353], [ 0.0513, -0.1621, -0.1965, ..., 0.0016, -0.0618, -0.1239], ..., [ 0.1161, -0.1159, 0.1500, ..., 0.0770, 0.2274, -0.0274], [ 0.1460, -0.2714, -0.1873, ..., -0.3783, -0.1265, 0.2404], [-0.2216, 0.0945, 0.0775, ..., -0.3663, -0.1714, -0.1143]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.8626e-09, 2.7940e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 9.3132e-10, ..., 1.8626e-09, 9.3132e-10, 0.0000e+00], [-1.8626e-09, 0.0000e+00, 9.3132e-10, ..., -1.8626e-09, -1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, -7.4506e-09, -9.3132e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 476, bias, value: tensor([-0.0145, -0.0214, -0.0090, -0.0205, 0.0046, -0.0012, 0.0088, 0.0220, 0.0109, -0.0129], device='cuda:0'), grad: tensor([ 1.2107e-08, 6.5193e-09, -1.2107e-08, 2.7940e-09, 1.4901e-08, 9.3132e-10, -4.6566e-09, 3.7253e-09, 4.6566e-09, -1.9558e-08], device='cuda:0') 100 1e-05 changing lr epoch 475, time 214.76, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4095 re_mapping 0.0025 re_causal 0.0092 /// teacc 99.13 lr 0.00001000 Epoch 477, weight, value: tensor([[-0.3229, 0.0881, -0.2153, ..., -0.1155, -0.2928, -0.1769], [-0.0087, 0.1088, -0.1301, ..., -0.1491, -0.0854, 0.1354], [ 0.0513, -0.1621, -0.1966, ..., 0.0017, -0.0618, -0.1240], ..., [ 0.1161, -0.1160, 0.1501, ..., 0.0770, 0.2275, -0.0274], [ 0.1460, -0.2716, -0.1874, ..., -0.3785, -0.1267, 0.2405], [-0.2218, 0.0945, 0.0775, ..., -0.3665, -0.1715, -0.1145]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.3039e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -9.3132e-10], [ 4.1723e-07, -5.5879e-09, 3.7346e-07, ..., 9.3132e-10, 4.3586e-07, 2.3935e-07], [ 2.7940e-09, 1.8626e-09, 1.8626e-09, ..., 0.0000e+00, 2.7940e-09, 2.7940e-09], ..., [-4.4145e-07, 4.6566e-09, -3.9954e-07, ..., 0.0000e+00, -4.6846e-07, -2.6636e-07], [-7.4506e-09, 4.6566e-09, 1.8626e-09, ..., 0.0000e+00, 9.3132e-10, -1.3970e-08], [ 2.3283e-08, 6.7987e-08, 1.3039e-08, ..., 0.0000e+00, 2.4214e-08, 1.8626e-08]], device='cuda:0') Epoch 477, bias, value: tensor([-0.0144, -0.0214, -0.0089, -0.0206, 0.0046, -0.0011, 0.0087, 0.0220, 0.0108, -0.0130], device='cuda:0'), grad: tensor([-4.0047e-08, 7.2923e-07, 2.1420e-08, -9.3132e-10, -1.3607e-06, 6.5193e-09, 9.4064e-08, -7.1991e-07, 2.7940e-09, 1.2834e-06], device='cuda:0') 100 1e-05 changing lr epoch 476, time 214.92, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4290 re_mapping 0.0025 re_causal 0.0094 /// teacc 99.13 lr 0.00001000 Epoch 478, weight, value: tensor([[-0.3229, 0.0881, -0.2155, ..., -0.1156, -0.2929, -0.1769], [-0.0087, 0.1090, -0.1302, ..., -0.1493, -0.0855, 0.1355], [ 0.0514, -0.1623, -0.1967, ..., 0.0018, -0.0617, -0.1240], ..., [ 0.1162, -0.1162, 0.1503, ..., 0.0770, 0.2276, -0.0275], [ 0.1460, -0.2717, -0.1875, ..., -0.3785, -0.1268, 0.2405], [-0.2220, 0.0945, 0.0775, ..., -0.3666, -0.1717, -0.1145]], device='cuda:0'), grad: tensor([[ 2.7940e-09, -1.8626e-09, -0.0000e+00, ..., 2.7940e-09, 1.8626e-09, 0.0000e+00], [ 1.0245e-08, -4.6566e-09, 0.0000e+00, ..., 9.3132e-09, 7.4506e-09, -4.2841e-08], [-1.3039e-07, 0.0000e+00, -0.0000e+00, ..., -9.1270e-08, -1.0058e-07, 0.0000e+00], ..., [ 1.1455e-07, 9.3132e-10, 9.3132e-10, ..., 7.5437e-08, 8.9407e-08, 1.8626e-09], [ 1.8626e-09, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 478, bias, value: tensor([-0.0144, -0.0214, -0.0088, -0.0207, 0.0046, -0.0011, 0.0087, 0.0221, 0.0107, -0.0131], device='cuda:0'), grad: tensor([ 6.5193e-09, -4.4703e-08, -3.4273e-07, 2.7940e-09, 6.5193e-09, -1.8626e-08, 9.4995e-08, 2.8498e-07, 8.3819e-09, 3.7253e-09], device='cuda:0') 100 1e-05 changing lr epoch 477, time 214.97, cls_loss 0.0006 cls_loss_mapping 0.0003 cls_loss_causal 0.4339 re_mapping 0.0026 re_causal 0.0094 /// teacc 99.13 lr 0.00001000 Epoch 479, weight, value: tensor([[-0.3230, 0.0882, -0.2155, ..., -0.1156, -0.2929, -0.1770], [-0.0088, 0.1092, -0.1303, ..., -0.1494, -0.0856, 0.1356], [ 0.0514, -0.1624, -0.1968, ..., 0.0019, -0.0617, -0.1241], ..., [ 0.1163, -0.1164, 0.1505, ..., 0.0771, 0.2278, -0.0276], [ 0.1460, -0.2719, -0.1877, ..., -0.3787, -0.1268, 0.2406], [-0.2221, 0.0945, 0.0776, ..., -0.3667, -0.1718, -0.1147]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 6.5193e-09, 2.7940e-08, 2.7940e-09, ..., 9.3132e-10, 9.3132e-10, 3.8184e-08], [ 9.3132e-10, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 9.3132e-10, 9.3132e-10], ..., [ 3.0734e-08, 3.7253e-09, 1.2107e-08, ..., 0.0000e+00, 0.0000e+00, 3.5390e-08], [-5.7742e-08, -9.3132e-10, -1.8626e-08, ..., -9.3132e-10, 0.0000e+00, -6.6124e-08], [ 9.3132e-10, 1.8626e-09, -1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09]], device='cuda:0') Epoch 479, bias, value: tensor([-0.0144, -0.0213, -0.0088, -0.0207, 0.0046, -0.0011, 0.0086, 0.0220, 0.0106, -0.0132], device='cuda:0'), grad: tensor([ 4.6566e-09, 1.0896e-07, 3.7253e-09, 3.7253e-09, -9.1270e-08, 5.5879e-08, 1.0245e-08, 1.1642e-07, -2.0303e-07, 8.3819e-09], device='cuda:0') 100 1e-05 changing lr epoch 478, time 214.79, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.3908 re_mapping 0.0026 re_causal 0.0092 /// teacc 99.12 lr 0.00001000 Epoch 480, weight, value: tensor([[-0.3231, 0.0882, -0.2156, ..., -0.1157, -0.2929, -0.1770], [-0.0088, 0.1092, -0.1304, ..., -0.1495, -0.0857, 0.1357], [ 0.0515, -0.1624, -0.1970, ..., 0.0020, -0.0616, -0.1242], ..., [ 0.1163, -0.1165, 0.1506, ..., 0.0770, 0.2278, -0.0276], [ 0.1461, -0.2722, -0.1878, ..., -0.3788, -0.1269, 0.2407], [-0.2223, 0.0945, 0.0777, ..., -0.3668, -0.1718, -0.1148]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -2.7940e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-1.2498e-06, -3.8743e-07, -2.3842e-06, ..., 0.0000e+00, -2.2724e-07, -1.4836e-06], [ 9.3132e-10, 1.8626e-09, 2.7940e-09, ..., 0.0000e+00, 9.3132e-10, 9.3132e-10], ..., [ 1.2275e-06, 3.8277e-07, 2.3395e-06, ..., -2.7940e-09, 2.1979e-07, 1.4585e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 8.3819e-09, 5.5879e-09, 1.0245e-08, ..., 9.3132e-10, 2.7940e-09, 1.1176e-08]], device='cuda:0') Epoch 480, bias, value: tensor([-0.0144, -0.0213, -0.0087, -0.0207, 0.0047, -0.0011, 0.0086, 0.0220, 0.0105, -0.0133], device='cuda:0'), grad: tensor([-6.5193e-09, -6.1207e-06, 1.0245e-08, 8.1956e-08, 1.2107e-08, -3.3528e-08, 7.4506e-09, 6.0126e-06, 9.3132e-10, 4.5635e-08], device='cuda:0') 100 1e-05 changing lr epoch 479, time 214.79, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4137 re_mapping 0.0026 re_causal 0.0094 /// teacc 99.13 lr 0.00001000 Epoch 481, weight, value: tensor([[-0.3231, 0.0883, -0.2156, ..., -0.1157, -0.2930, -0.1769], [-0.0088, 0.1093, -0.1304, ..., -0.1497, -0.0857, 0.1358], [ 0.0515, -0.1626, -0.1971, ..., 0.0020, -0.0616, -0.1242], ..., [ 0.1163, -0.1166, 0.1506, ..., 0.0770, 0.2278, -0.0277], [ 0.1461, -0.2724, -0.1880, ..., -0.3789, -0.1270, 0.2407], [-0.2224, 0.0945, 0.0778, ..., -0.3670, -0.1719, -0.1149]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.0245e-08, 2.7940e-09, 5.5879e-09, ..., 3.7253e-09, 5.5879e-09, 3.7253e-09], [ 2.1420e-08, 0.0000e+00, 9.3132e-10, ..., -9.3132e-10, 0.0000e+00, 3.0734e-08], ..., [-3.7253e-09, 9.3132e-10, -6.5193e-09, ..., -3.7253e-09, -6.5193e-09, 5.5879e-09], [-4.0047e-08, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, -5.5879e-08], [ 0.0000e+00, -2.7940e-09, -3.7253e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 481, bias, value: tensor([-0.0143, -0.0212, -0.0087, -0.0207, 0.0046, -0.0011, 0.0085, 0.0219, 0.0104, -0.0133], device='cuda:0'), grad: tensor([ 0.0000e+00, 2.9802e-08, 2.7008e-08, 4.6566e-09, 2.7940e-09, -5.5879e-09, 1.6764e-08, -5.5879e-09, -6.1467e-08, -6.5193e-09], device='cuda:0') 100 1e-05 changing lr epoch 480, time 215.00, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4177 re_mapping 0.0026 re_causal 0.0094 /// teacc 99.12 lr 0.00001000 Epoch 482, weight, value: tensor([[-0.3232, 0.0884, -0.2157, ..., -0.1158, -0.2930, -0.1770], [-0.0088, 0.1093, -0.1305, ..., -0.1500, -0.0858, 0.1358], [ 0.0515, -0.1627, -0.1973, ..., 0.0020, -0.0616, -0.1243], ..., [ 0.1164, -0.1166, 0.1507, ..., 0.0771, 0.2279, -0.0277], [ 0.1461, -0.2725, -0.1881, ..., -0.3790, -0.1270, 0.2408], [-0.2225, 0.0946, 0.0779, ..., -0.3670, -0.1719, -0.1150]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, -0.0000e+00], [ 9.3132e-10, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [-6.0536e-08, 0.0000e+00, 0.0000e+00, ..., -1.8626e-08, -2.7940e-08, -5.5879e-09], ..., [ 2.3283e-08, 0.0000e+00, 0.0000e+00, ..., 8.3819e-09, 1.1176e-08, 2.7940e-09], [ 2.2352e-08, 9.3132e-10, 0.0000e+00, ..., 6.5193e-09, 9.3132e-09, 1.8626e-09], [ 1.3039e-08, 9.3132e-10, 0.0000e+00, ..., 3.7253e-09, 5.5879e-09, 9.3132e-10]], device='cuda:0') Epoch 482, bias, value: tensor([-0.0143, -0.0212, -0.0087, -0.0206, 0.0045, -0.0009, 0.0084, 0.0220, 0.0102, -0.0133], device='cuda:0'), grad: tensor([ 2.5146e-08, 5.5879e-09, -1.3504e-07, -4.6566e-09, -6.2399e-08, -1.7695e-08, 2.0489e-08, 5.8673e-08, 5.8673e-08, 4.8429e-08], device='cuda:0') 100 1e-05 changing lr epoch 481, time 214.78, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4061 re_mapping 0.0026 re_causal 0.0093 /// teacc 99.12 lr 0.00001000 Epoch 483, weight, value: tensor([[-0.3233, 0.0884, -0.2157, ..., -0.1158, -0.2931, -0.1770], [-0.0090, 0.1094, -0.1308, ..., -0.1501, -0.0861, 0.1357], [ 0.0515, -0.1628, -0.1974, ..., 0.0021, -0.0616, -0.1244], ..., [ 0.1166, -0.1167, 0.1510, ..., 0.0771, 0.2283, -0.0276], [ 0.1460, -0.2726, -0.1882, ..., -0.3791, -0.1271, 0.2408], [-0.2227, 0.0947, 0.0780, ..., -0.3671, -0.1721, -0.1151]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.8626e-09, 2.7940e-09, ..., 1.8626e-09, 9.3132e-10, 0.0000e+00], [ 9.3132e-10, 1.8626e-09, 4.6566e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], ..., [-8.3819e-09, -4.6566e-09, -1.7695e-08, ..., -1.1176e-08, -8.3819e-09, 9.3132e-10], [-0.0000e+00, 1.8626e-09, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, -1.8626e-09], [ 2.7940e-09, -3.7253e-09, -1.8626e-09, ..., 3.7253e-09, 2.7940e-09, 9.3132e-10]], device='cuda:0') Epoch 483, bias, value: tensor([-0.0143, -0.0214, -0.0086, -0.0206, 0.0044, -0.0010, 0.0085, 0.0222, 0.0101, -0.0132], device='cuda:0'), grad: tensor([ 8.3819e-09, 8.3819e-09, 1.8626e-09, 1.8626e-09, 1.9558e-08, -1.6764e-08, 1.3970e-08, -2.7940e-08, 3.7253e-09, -3.7253e-09], device='cuda:0') 100 1e-05 changing lr epoch 482, time 214.82, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4221 re_mapping 0.0026 re_causal 0.0094 /// teacc 99.10 lr 0.00001000 Epoch 484, weight, value: tensor([[-0.3233, 0.0885, -0.2158, ..., -0.1159, -0.2931, -0.1770], [-0.0090, 0.1095, -0.1308, ..., -0.1502, -0.0861, 0.1359], [ 0.0515, -0.1630, -0.1975, ..., 0.0021, -0.0616, -0.1245], ..., [ 0.1166, -0.1169, 0.1511, ..., 0.0771, 0.2283, -0.0277], [ 0.1460, -0.2728, -0.1885, ..., -0.3792, -0.1272, 0.2408], [-0.2228, 0.0947, 0.0781, ..., -0.3672, -0.1722, -0.1152]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 4.6566e-09, ..., 0.0000e+00, 0.0000e+00, -0.0000e+00], [ 0.0000e+00, -0.0000e+00, 6.0536e-09, ..., 4.6566e-10, 0.0000e+00, -1.6764e-08], [ 4.6566e-10, 9.3132e-10, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 2.3283e-09], ..., [ 4.6566e-10, 1.4901e-08, 1.6298e-08, ..., 0.0000e+00, 0.0000e+00, 1.0710e-08], [-1.3690e-07, 6.0536e-09, 7.4506e-09, ..., 0.0000e+00, 0.0000e+00, -1.4715e-07], [ 0.0000e+00, -6.9849e-08, -8.7544e-08, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 484, bias, value: tensor([-0.0142, -0.0214, -0.0086, -0.0205, 0.0043, -0.0010, 0.0085, 0.0221, 0.0100, -0.0133], device='cuda:0'), grad: tensor([ 7.9162e-09, -1.2107e-08, 7.4506e-09, 4.3772e-08, 6.5658e-08, 2.7893e-07, 1.3504e-07, 7.4506e-08, -3.1665e-07, -2.6124e-07], device='cuda:0') 100 1e-05 changing lr epoch 483, time 214.71, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4298 re_mapping 0.0026 re_causal 0.0095 /// teacc 99.07 lr 0.00001000 Epoch 485, weight, value: tensor([[-0.3234, 0.0886, -0.2159, ..., -0.1160, -0.2931, -0.1771], [-0.0091, 0.1095, -0.1308, ..., -0.1505, -0.0862, 0.1360], [ 0.0515, -0.1631, -0.1977, ..., 0.0021, -0.0616, -0.1245], ..., [ 0.1167, -0.1170, 0.1512, ..., 0.0771, 0.2284, -0.0278], [ 0.1461, -0.2729, -0.1886, ..., -0.3793, -0.1272, 0.2409], [-0.2230, 0.0948, 0.0782, ..., -0.3673, -0.1723, -0.1155]], device='cuda:0'), grad: tensor([[ 2.3283e-09, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 3.2596e-09, 2.6543e-08, 1.8626e-09, ..., 1.8626e-09, 1.3970e-09, 9.3132e-10], [-1.3039e-08, 0.0000e+00, 5.1223e-09, ..., -6.0536e-09, 4.1910e-09, 0.0000e+00], ..., [-3.7253e-09, 1.3970e-09, -8.8476e-09, ..., -2.3283e-09, -7.4506e-09, 0.0000e+00], [ 2.3283e-09, 4.6566e-10, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -1.3504e-08, -1.3039e-08, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 485, bias, value: tensor([-0.0142, -0.0214, -0.0086, -0.0206, 0.0044, -0.0011, 0.0086, 0.0221, 0.0100, -0.0133], device='cuda:0'), grad: tensor([ 6.5193e-09, 7.9628e-08, -3.9116e-08, 2.2352e-08, -5.2620e-08, -7.4506e-09, 2.2817e-08, 5.1223e-09, 7.4506e-09, -3.6322e-08], device='cuda:0') 100 1e-05 changing lr epoch 484, time 214.53, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4032 re_mapping 0.0026 re_causal 0.0093 /// teacc 99.10 lr 0.00001000 Epoch 486, weight, value: tensor([[-0.3234, 0.0886, -0.2159, ..., -0.1160, -0.2932, -0.1771], [-0.0091, 0.1096, -0.1308, ..., -0.1507, -0.0862, 0.1361], [ 0.0515, -0.1632, -0.1978, ..., 0.0022, -0.0615, -0.1245], ..., [ 0.1167, -0.1171, 0.1512, ..., 0.0772, 0.2284, -0.0279], [ 0.1463, -0.2730, -0.1887, ..., -0.3794, -0.1273, 0.2412], [-0.2232, 0.0948, 0.0781, ..., -0.3674, -0.1725, -0.1156]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -7.4506e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 1.3970e-09, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09], [ 9.3132e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], ..., [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09], [-9.3132e-09, 9.3132e-10, -0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -1.9558e-08], [ 5.1223e-09, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.1176e-08]], device='cuda:0') Epoch 486, bias, value: tensor([-0.0142, -0.0213, -0.0086, -0.0206, 0.0045, -0.0011, 0.0086, 0.0220, 0.0101, -0.0134], device='cuda:0'), grad: tensor([-2.0955e-08, 6.9849e-09, 4.6566e-09, 1.3970e-08, -1.8626e-09, 3.7253e-09, 1.0245e-08, 6.0536e-09, -3.3993e-08, 2.5611e-08], device='cuda:0') 100 1e-05 changing lr epoch 485, time 214.70, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4503 re_mapping 0.0026 re_causal 0.0097 /// teacc 99.13 lr 0.00001000 Epoch 487, weight, value: tensor([[-0.3234, 0.0884, -0.2160, ..., -0.1160, -0.2932, -0.1771], [-0.0090, 0.1097, -0.1308, ..., -0.1508, -0.0862, 0.1362], [ 0.0515, -0.1633, -0.1979, ..., 0.0022, -0.0616, -0.1246], ..., [ 0.1167, -0.1173, 0.1512, ..., 0.0772, 0.2285, -0.0280], [ 0.1464, -0.2732, -0.1888, ..., -0.3794, -0.1273, 0.2413], [-0.2233, 0.0951, 0.0782, ..., -0.3675, -0.1725, -0.1157]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -3.2596e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [-1.9092e-08, -5.1223e-09, -3.3528e-08, ..., 2.3283e-09, -1.2107e-08, -5.9605e-08], [-1.0710e-08, 9.3132e-10, -3.2596e-09, ..., -1.3504e-08, -7.9162e-09, 2.3283e-09], ..., [ 2.7474e-08, 1.6764e-08, 3.1199e-08, ..., 1.2573e-08, 1.6764e-08, 6.6590e-08], [ 2.3283e-09, 1.4435e-08, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, -1.3970e-09], [ 5.1223e-09, 3.7253e-09, 7.4506e-09, ..., 4.6566e-10, 3.2596e-09, 7.9162e-09]], device='cuda:0') Epoch 487, bias, value: tensor([-0.0144, -0.0212, -0.0086, -0.0207, 0.0045, -0.0011, 0.0086, 0.0219, 0.0101, -0.0132], device='cuda:0'), grad: tensor([-5.5879e-09, -1.3178e-07, -2.6077e-08, -4.6566e-10, -3.7253e-08, -3.9767e-07, 2.7101e-07, 2.0117e-07, 1.1176e-07, 3.2596e-08], device='cuda:0') 100 1e-05 changing lr epoch 486, time 214.82, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4232 re_mapping 0.0026 re_causal 0.0095 /// teacc 99.10 lr 0.00001000 Epoch 488, weight, value: tensor([[-0.3235, 0.0885, -0.2160, ..., -0.1160, -0.2932, -0.1771], [-0.0091, 0.1097, -0.1309, ..., -0.1511, -0.0863, 0.1362], [ 0.0515, -0.1634, -0.1981, ..., 0.0022, -0.0616, -0.1247], ..., [ 0.1168, -0.1173, 0.1513, ..., 0.0773, 0.2286, -0.0280], [ 0.1465, -0.2733, -0.1889, ..., -0.3795, -0.1274, 0.2415], [-0.2235, 0.0951, 0.0782, ..., -0.3676, -0.1727, -0.1157]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.0431e-07, 1.8626e-09, ..., 4.6566e-10, 0.0000e+00, -8.7544e-08], [ 2.3283e-09, 8.8476e-09, 7.4506e-09, ..., 5.1223e-09, 7.4506e-09, 1.8626e-09], [-3.2596e-09, 5.1223e-09, 5.1223e-09, ..., -4.1910e-09, -1.0710e-08, 0.0000e+00], ..., [-4.6566e-10, 2.7940e-09, 6.5193e-09, ..., 9.3132e-10, 1.8626e-09, 9.3132e-10], [-9.3132e-10, 2.7940e-09, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, -1.8626e-09], [ 4.6566e-10, -2.4680e-08, -3.2596e-08, ..., 0.0000e+00, -1.8626e-09, 3.7253e-09]], device='cuda:0') Epoch 488, bias, value: tensor([-0.0144, -0.0213, -0.0085, -0.0208, 0.0046, -0.0012, 0.0086, 0.0220, 0.0100, -0.0133], device='cuda:0'), grad: tensor([-3.8836e-07, 4.5635e-08, -1.4435e-08, 6.0536e-09, 8.8476e-09, 6.6124e-08, 3.2037e-07, 2.1886e-08, 4.1910e-09, -6.6124e-08], device='cuda:0') 100 1e-05 changing lr epoch 487, time 214.74, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4124 re_mapping 0.0026 re_causal 0.0094 /// teacc 99.11 lr 0.00001000 Epoch 489, weight, value: tensor([[-0.3235, 0.0886, -0.2161, ..., -0.1161, -0.2933, -0.1771], [-0.0094, 0.1098, -0.1313, ..., -0.1512, -0.0867, 0.1360], [ 0.0515, -0.1635, -0.1983, ..., 0.0023, -0.0616, -0.1248], ..., [ 0.1171, -0.1174, 0.1518, ..., 0.0773, 0.2290, -0.0278], [ 0.1466, -0.2734, -0.1889, ..., -0.3796, -0.1274, 0.2416], [-0.2237, 0.0950, 0.0781, ..., -0.3677, -0.1730, -0.1160]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 0.0000e+00, 4.6566e-10, ..., 1.3970e-09, 1.3970e-09, 1.3970e-09], [ 1.7136e-07, -1.3970e-09, 3.5856e-08, ..., 1.2061e-07, 1.5367e-07, 1.8813e-07], [-2.1141e-07, 0.0000e+00, -3.5390e-08, ..., -1.5460e-07, -1.8254e-07, -2.5844e-07], ..., [ 2.4680e-08, 2.3283e-09, -3.2596e-09, ..., 2.2817e-08, 1.4435e-08, 5.8208e-08], [ 2.3283e-09, 4.6566e-10, 4.6566e-10, ..., 1.8626e-09, 1.8626e-09, 2.3283e-09], [ 7.4506e-09, 2.3283e-09, -0.0000e+00, ..., 5.1223e-09, 6.5193e-09, 8.3819e-09]], device='cuda:0') Epoch 489, bias, value: tensor([-0.0143, -0.0215, -0.0086, -0.0208, 0.0045, -0.0012, 0.0086, 0.0223, 0.0101, -0.0134], device='cuda:0'), grad: tensor([ 7.4506e-09, 6.2538e-07, -8.1724e-07, 6.5193e-09, -4.6566e-10, -1.4901e-08, 9.3132e-09, 1.4110e-07, 1.0245e-08, 3.2131e-08], device='cuda:0') 100 1e-05 changing lr epoch 488, time 215.28, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4284 re_mapping 0.0025 re_causal 0.0094 /// teacc 99.09 lr 0.00001000 Epoch 490, weight, value: tensor([[-0.3236, 0.0887, -0.2161, ..., -0.1161, -0.2933, -0.1771], [-0.0094, 0.1099, -0.1312, ..., -0.1513, -0.0867, 0.1361], [ 0.0516, -0.1636, -0.1984, ..., 0.0024, -0.0615, -0.1248], ..., [ 0.1171, -0.1175, 0.1518, ..., 0.0773, 0.2290, -0.0278], [ 0.1467, -0.2736, -0.1890, ..., -0.3797, -0.1275, 0.2418], [-0.2239, 0.0950, 0.0780, ..., -0.3678, -0.1731, -0.1161]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -6.9849e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.5367e-08, -1.3970e-09, 1.9558e-08, ..., 0.0000e+00, 1.5367e-08, -3.2596e-09], [ 4.6566e-10, 4.6566e-10, 9.3132e-10, ..., 0.0000e+00, 4.6566e-10, 4.6566e-10], ..., [-1.9092e-08, 1.8626e-09, -2.4214e-08, ..., 0.0000e+00, -2.0023e-08, 2.3283e-09], [ 4.6566e-10, 9.3132e-10, 9.3132e-10, ..., 0.0000e+00, 4.6566e-10, 0.0000e+00], [ 3.7253e-09, 2.3283e-09, 2.3283e-09, ..., 0.0000e+00, 3.2596e-09, 9.3132e-10]], device='cuda:0') Epoch 490, bias, value: tensor([-0.0142, -0.0214, -0.0084, -0.0208, 0.0044, -0.0012, 0.0086, 0.0222, 0.0101, -0.0135], device='cuda:0'), grad: tensor([-2.9011e-07, 2.7008e-08, 4.1910e-09, 1.0245e-08, 3.2596e-09, 4.1444e-08, 2.2026e-07, -3.2596e-08, 5.5879e-09, 1.8626e-08], device='cuda:0') 100 1e-05 changing lr epoch 489, time 215.10, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4150 re_mapping 0.0025 re_causal 0.0092 /// teacc 99.12 lr 0.00001000 Epoch 491, weight, value: tensor([[-0.3236, 0.0888, -0.2162, ..., -0.1161, -0.2934, -0.1772], [-0.0094, 0.1098, -0.1313, ..., -0.1515, -0.0868, 0.1362], [ 0.0515, -0.1638, -0.1987, ..., 0.0023, -0.0616, -0.1249], ..., [ 0.1172, -0.1175, 0.1519, ..., 0.0775, 0.2291, -0.0279], [ 0.1468, -0.2738, -0.1891, ..., -0.3798, -0.1275, 0.2420], [-0.2241, 0.0951, 0.0782, ..., -0.3679, -0.1732, -0.1163]], device='cuda:0'), grad: tensor([[ 9.3132e-10, -9.3132e-09, 0.0000e+00, ..., 4.6566e-10, 9.3132e-10, -0.0000e+00], [ 4.6566e-09, 4.6566e-10, 1.3970e-09, ..., 2.7940e-09, 3.7253e-09, 0.0000e+00], [-4.3306e-08, 0.0000e+00, 0.0000e+00, ..., -2.5146e-08, -3.3993e-08, 0.0000e+00], ..., [ 3.7719e-08, 9.3132e-10, 9.3132e-10, ..., 2.1886e-08, 2.9802e-08, 4.6566e-10], [ 9.3132e-10, 1.3970e-09, 1.3970e-09, ..., 4.6566e-10, 4.6566e-10, 4.6566e-10], [ 9.3132e-10, 2.3283e-09, -2.7940e-09, ..., 0.0000e+00, 4.6566e-10, 0.0000e+00]], device='cuda:0') Epoch 491, bias, value: tensor([-0.0142, -0.0215, -0.0086, -0.0210, 0.0043, -0.0012, 0.0087, 0.0223, 0.0101, -0.0136], device='cuda:0'), grad: tensor([-1.9092e-08, 1.3504e-08, -1.0431e-07, 6.9849e-09, 4.6566e-09, -1.0245e-08, 1.2107e-08, 9.4529e-08, 8.8476e-09, 9.3132e-10], device='cuda:0') 100 1e-05 changing lr epoch 490, time 214.97, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4210 re_mapping 0.0026 re_causal 0.0094 /// teacc 99.09 lr 0.00001000 Epoch 492, weight, value: tensor([[-0.3237, 0.0888, -0.2163, ..., -0.1161, -0.2934, -0.1772], [-0.0094, 0.1098, -0.1313, ..., -0.1516, -0.0868, 0.1362], [ 0.0514, -0.1639, -0.1990, ..., 0.0023, -0.0617, -0.1250], ..., [ 0.1172, -0.1175, 0.1520, ..., 0.0775, 0.2292, -0.0279], [ 0.1469, -0.2739, -0.1893, ..., -0.3798, -0.1276, 0.2421], [-0.2242, 0.0952, 0.0783, ..., -0.3680, -0.1733, -0.1164]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -2.7940e-09, 4.6566e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 3.2596e-09, 8.1025e-08, 1.2806e-07, ..., 7.1246e-08, 3.7253e-09, 1.3970e-09], [ 4.6566e-10, 3.2596e-09, 5.1223e-09, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00], ..., [-2.7940e-09, 1.4901e-08, 1.0245e-08, ..., 3.7253e-09, -4.1910e-09, 1.2107e-08], [ 0.0000e+00, 2.3283e-09, 1.3970e-09, ..., 9.3132e-10, 0.0000e+00, 4.6566e-10], [ 1.8626e-09, 3.7253e-08, 4.9360e-08, ..., 3.2596e-08, 1.8626e-09, 1.3970e-09]], device='cuda:0') Epoch 492, bias, value: tensor([-0.0142, -0.0214, -0.0086, -0.0211, 0.0043, -0.0013, 0.0086, 0.0223, 0.0101, -0.0135], device='cuda:0'), grad: tensor([-4.6566e-09, 2.3842e-07, 1.0245e-08, -2.6869e-07, -2.5611e-08, -9.2201e-08, -2.3283e-09, 4.1444e-08, 7.9162e-09, 1.0803e-07], device='cuda:0') 100 1e-05 changing lr epoch 491, time 214.86, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4149 re_mapping 0.0026 re_causal 0.0093 /// teacc 99.07 lr 0.00001000 Epoch 493, weight, value: tensor([[-0.3238, 0.0889, -0.2164, ..., -0.1162, -0.2934, -0.1772], [-0.0095, 0.1098, -0.1314, ..., -0.1518, -0.0868, 0.1363], [ 0.0514, -0.1640, -0.1992, ..., 0.0023, -0.0617, -0.1251], ..., [ 0.1173, -0.1176, 0.1521, ..., 0.0776, 0.2293, -0.0279], [ 0.1470, -0.2741, -0.1894, ..., -0.3799, -0.1276, 0.2422], [-0.2243, 0.0952, 0.0783, ..., -0.3681, -0.1734, -0.1165]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 2.3283e-09, -2.3283e-09, 4.6566e-10, ..., 4.6566e-10, 4.6566e-10, -3.7253e-09], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., -9.3132e-10, -9.3132e-10, 2.7940e-09], ..., [ 9.3132e-10, 1.3970e-09, 0.0000e+00, ..., 4.6566e-10, 4.6566e-10, 2.3283e-09], [-1.2107e-08, 4.6566e-10, -4.6566e-10, ..., 0.0000e+00, 0.0000e+00, -2.5146e-08], [ 6.5193e-09, 9.3132e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 1.3504e-08]], device='cuda:0') Epoch 493, bias, value: tensor([-0.0141, -0.0215, -0.0087, -0.0211, 0.0043, -0.0013, 0.0086, 0.0223, 0.0100, -0.0135], device='cuda:0'), grad: tensor([ 2.7940e-09, -2.3283e-09, 2.3283e-09, 6.9849e-09, 2.3283e-09, -2.5146e-08, 1.9092e-08, 1.0710e-08, -4.4703e-08, 2.8405e-08], device='cuda:0') 100 1e-05 changing lr epoch 492, time 214.88, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4163 re_mapping 0.0026 re_causal 0.0094 /// teacc 99.08 lr 0.00001000 Epoch 494, weight, value: tensor([[-0.3238, 0.0890, -0.2164, ..., -0.1162, -0.2935, -0.1772], [-0.0095, 0.1099, -0.1314, ..., -0.1521, -0.0869, 0.1364], [ 0.0514, -0.1642, -0.1993, ..., 0.0023, -0.0618, -0.1251], ..., [ 0.1174, -0.1178, 0.1521, ..., 0.0777, 0.2294, -0.0281], [ 0.1472, -0.2741, -0.1895, ..., -0.3800, -0.1277, 0.2424], [-0.2244, 0.0952, 0.0784, ..., -0.3682, -0.1735, -0.1167]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10], [ 0.0000e+00, -1.8626e-09, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, -3.7253e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -1.0245e-08, -1.0245e-08, ..., 0.0000e+00, 0.0000e+00, 9.3132e-10]], device='cuda:0') Epoch 494, bias, value: tensor([-0.0140, -0.0215, -0.0087, -0.0212, 0.0042, -0.0013, 0.0085, 0.0223, 0.0101, -0.0136], device='cuda:0'), grad: tensor([ 3.7253e-09, -8.8476e-09, 9.3132e-10, 0.0000e+00, 2.6543e-08, 2.7940e-09, 1.8626e-09, 5.1223e-09, 9.3132e-10, -2.6077e-08], device='cuda:0') 100 1e-05 changing lr epoch 493, time 214.86, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4189 re_mapping 0.0026 re_causal 0.0093 /// teacc 99.10 lr 0.00001000 Epoch 495, weight, value: tensor([[-0.3239, 0.0891, -0.2164, ..., -0.1163, -0.2935, -0.1772], [-0.0096, 0.1098, -0.1315, ..., -0.1522, -0.0870, 0.1365], [ 0.0514, -0.1643, -0.1994, ..., 0.0023, -0.0618, -0.1252], ..., [ 0.1175, -0.1178, 0.1522, ..., 0.0778, 0.2295, -0.0281], [ 0.1473, -0.2743, -0.1896, ..., -0.3800, -0.1277, 0.2425], [-0.2245, 0.0953, 0.0786, ..., -0.3683, -0.1735, -0.1168]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -2.7940e-09, 1.3970e-09, ..., 0.0000e+00, 0.0000e+00, -9.3598e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 4.6566e-09], ..., [ 0.0000e+00, 4.1910e-09, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 7.9162e-08], [-5.5879e-09, 4.6566e-10, -1.3970e-09, ..., 0.0000e+00, 0.0000e+00, -1.0710e-08], [ 4.6566e-09, -7.9162e-09, -7.4506e-09, ..., 0.0000e+00, 0.0000e+00, 1.0245e-08]], device='cuda:0') Epoch 495, bias, value: tensor([-0.0140, -0.0215, -0.0087, -0.0211, 0.0041, -0.0015, 0.0086, 0.0223, 0.0100, -0.0135], device='cuda:0'), grad: tensor([ 1.3970e-09, -1.2992e-07, 6.9849e-09, 1.3970e-09, 3.2131e-08, 1.8626e-09, 2.7940e-09, 1.1502e-07, -2.5146e-08, 3.2596e-09], device='cuda:0') 100 1e-05 changing lr epoch 494, time 214.56, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4229 re_mapping 0.0026 re_causal 0.0094 /// teacc 99.10 lr 0.00001000 Epoch 496, weight, value: tensor([[-0.3239, 0.0892, -0.2165, ..., -0.1163, -0.2935, -0.1772], [-0.0095, 0.1099, -0.1314, ..., -0.1523, -0.0870, 0.1366], [ 0.0513, -0.1644, -0.1996, ..., 0.0023, -0.0618, -0.1253], ..., [ 0.1175, -0.1180, 0.1522, ..., 0.0778, 0.2295, -0.0282], [ 0.1474, -0.2745, -0.1898, ..., -0.3801, -0.1278, 0.2426], [-0.2246, 0.0952, 0.0787, ..., -0.3684, -0.1736, -0.1169]], device='cuda:0'), grad: tensor([[ 1.3970e-09, -0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 9.3132e-10, 0.0000e+00], [ 2.2352e-08, 4.6566e-10, 6.5193e-09, ..., 9.7789e-09, 1.2107e-08, 4.6566e-10], [ 5.5972e-07, 1.8626e-09, 1.8487e-07, ..., 2.4308e-07, 3.0361e-07, 4.6566e-09], ..., [-6.3283e-07, 0.0000e+00, -1.9977e-07, ..., -2.7195e-07, -3.4133e-07, -6.9849e-09], [ 2.0955e-08, 2.7940e-09, 2.7940e-09, ..., 7.9162e-09, 1.0245e-08, 1.3970e-09], [ 9.7789e-09, 4.6566e-10, 0.0000e+00, ..., 3.2596e-09, 4.1910e-09, 9.3132e-10]], device='cuda:0') Epoch 496, bias, value: tensor([-0.0139, -0.0214, -0.0087, -0.0209, 0.0040, -0.0016, 0.0087, 0.0223, 0.0100, -0.0135], device='cuda:0'), grad: tensor([ 1.7695e-08, 3.5390e-08, 6.8359e-07, 2.8871e-08, 4.4703e-08, -9.9186e-08, 0.0000e+00, -7.9861e-07, 6.9849e-08, 2.7008e-08], device='cuda:0') 100 1e-05 changing lr epoch 495, time 214.77, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4213 re_mapping 0.0025 re_causal 0.0091 /// teacc 99.10 lr 0.00001000 Epoch 497, weight, value: tensor([[-0.3239, 0.0892, -0.2166, ..., -0.1164, -0.2935, -0.1771], [-0.0096, 0.1099, -0.1315, ..., -0.1525, -0.0870, 0.1367], [ 0.0512, -0.1645, -0.1999, ..., 0.0023, -0.0620, -0.1254], ..., [ 0.1176, -0.1181, 0.1523, ..., 0.0780, 0.2296, -0.0283], [ 0.1474, -0.2746, -0.1899, ..., -0.3802, -0.1278, 0.2428], [-0.2248, 0.0954, 0.0788, ..., -0.3685, -0.1737, -0.1170]], device='cuda:0'), grad: tensor([[ 4.6566e-10, -4.6566e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 4.6566e-10], [ 4.6566e-10, -4.6566e-10, 1.3970e-09, ..., 9.3132e-10, 0.0000e+00, -1.8626e-09], [-9.3132e-10, 9.3132e-10, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 9.3132e-10], ..., [ 0.0000e+00, 1.8626e-09, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 1.8626e-09], [-9.3132e-10, 4.6566e-10, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, -1.8626e-09], [ 4.6566e-10, -4.6566e-09, -7.9162e-09, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09]], device='cuda:0') Epoch 497, bias, value: tensor([-0.0139, -0.0215, -0.0089, -0.0210, 0.0038, -0.0016, 0.0087, 0.0223, 0.0100, -0.0135], device='cuda:0'), grad: tensor([ 4.6566e-10, -2.7940e-09, -2.7940e-09, -4.1910e-09, 1.4901e-08, 4.6566e-09, 0.0000e+00, 7.4506e-09, 0.0000e+00, -1.2573e-08], device='cuda:0') 100 1e-05 changing lr epoch 496, time 214.80, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4248 re_mapping 0.0025 re_causal 0.0093 /// teacc 99.09 lr 0.00001000 Epoch 498, weight, value: tensor([[-0.3240, 0.0892, -0.2166, ..., -0.1164, -0.2936, -0.1771], [-0.0097, 0.1098, -0.1316, ..., -0.1528, -0.0871, 0.1367], [ 0.0512, -0.1647, -0.2000, ..., 0.0023, -0.0620, -0.1254], ..., [ 0.1177, -0.1181, 0.1525, ..., 0.0781, 0.2298, -0.0283], [ 0.1476, -0.2747, -0.1900, ..., -0.3803, -0.1279, 0.2430], [-0.2249, 0.0955, 0.0789, ..., -0.3686, -0.1738, -0.1171]], device='cuda:0'), grad: tensor([[ 4.6566e-10, -1.7695e-08, 2.3283e-09, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], [ 8.8476e-09, 1.3970e-09, 1.0245e-08, ..., 6.0536e-09, 6.5193e-09, 1.3970e-09], [ 5.5879e-09, 9.3132e-10, 1.2573e-08, ..., 1.2107e-08, 2.3283e-09, 9.3132e-10], ..., [-9.7789e-09, 9.3132e-10, 4.6566e-10, ..., 9.3132e-09, -1.0245e-08, -1.8626e-09], [ 2.3283e-09, 1.3970e-09, 5.1223e-09, ..., 4.1910e-09, 1.3970e-09, 4.6566e-10], [ 4.6566e-10, 9.7789e-09, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 498, bias, value: tensor([-0.0139, -0.0216, -0.0088, -0.0210, 0.0037, -0.0016, 0.0087, 0.0224, 0.0100, -0.0133], device='cuda:0'), grad: tensor([-3.5856e-08, 2.9802e-08, 4.3306e-08, -7.1712e-08, 0.0000e+00, -3.5390e-08, 1.0245e-08, 1.9558e-08, 1.8161e-08, 2.5611e-08], device='cuda:0') 100 1e-05 changing lr epoch 497, time 214.81, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.3965 re_mapping 0.0026 re_causal 0.0091 /// teacc 99.08 lr 0.00001000 Epoch 499, weight, value: tensor([[-0.3242, 0.0893, -0.2167, ..., -0.1165, -0.2937, -0.1770], [-0.0099, 0.1098, -0.1319, ..., -0.1531, -0.0874, 0.1366], [ 0.0512, -0.1648, -0.2002, ..., 0.0023, -0.0620, -0.1253], ..., [ 0.1180, -0.1181, 0.1528, ..., 0.0783, 0.2300, -0.0282], [ 0.1476, -0.2749, -0.1901, ..., -0.3804, -0.1280, 0.2430], [-0.2251, 0.0956, 0.0789, ..., -0.3688, -0.1740, -0.1173]], device='cuda:0'), grad: tensor([[ 4.6566e-10, -9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], [-1.1642e-08, -3.6322e-08, 5.5879e-09, ..., 3.2596e-09, 7.9162e-09, -7.8231e-08], [-9.3132e-10, 9.3132e-10, 4.6566e-10, ..., -2.3283e-09, -9.3132e-10, 2.3283e-09], ..., [ 7.9162e-09, 2.8871e-08, -7.4506e-09, ..., -2.7940e-09, -8.8476e-09, 6.3330e-08], [-4.6566e-10, 1.8626e-09, 0.0000e+00, ..., 4.6566e-10, 4.6566e-10, 9.3132e-10], [ 1.8626e-09, 4.6566e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 6.9849e-09]], device='cuda:0') Epoch 499, bias, value: tensor([-0.0138, -0.0218, -0.0087, -0.0211, 0.0036, -0.0016, 0.0086, 0.0226, 0.0099, -0.0134], device='cuda:0'), grad: tensor([ 4.6566e-10, -1.8906e-07, -3.7253e-09, 9.3132e-10, 4.6566e-09, -2.6543e-08, 3.4925e-08, 1.5181e-07, 7.9162e-09, 2.1886e-08], device='cuda:0') 100 1e-05 changing lr epoch 498, time 214.82, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4403 re_mapping 0.0025 re_causal 0.0096 /// teacc 99.10 lr 0.00001000 Epoch 500, weight, value: tensor([[-0.3242, 0.0894, -0.2167, ..., -0.1165, -0.2938, -0.1770], [-0.0100, 0.1099, -0.1319, ..., -0.1533, -0.0874, 0.1367], [ 0.0511, -0.1649, -0.2006, ..., 0.0023, -0.0621, -0.1255], ..., [ 0.1181, -0.1182, 0.1529, ..., 0.0785, 0.2302, -0.0283], [ 0.1477, -0.2750, -0.1902, ..., -0.3805, -0.1280, 0.2432], [-0.2254, 0.0956, 0.0790, ..., -0.3689, -0.1741, -0.1175]], device='cuda:0'), grad: tensor([[-2.7940e-09, -1.1176e-08, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, -3.2596e-09], [ 3.2596e-09, 3.2596e-09, 1.3970e-09, ..., 2.3283e-09, 9.3132e-10, 4.6566e-09], [-9.3132e-10, 4.6566e-10, 0.0000e+00, ..., -3.7253e-09, -1.3970e-09, 2.7940e-09], ..., [ 1.8626e-09, 1.8626e-09, 9.3132e-10, ..., 1.3970e-09, 4.6566e-10, 1.3970e-09], [-7.9162e-09, 1.8626e-09, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, -1.2573e-08], [ 4.1910e-09, -3.2596e-09, -1.4901e-08, ..., 4.6566e-10, 0.0000e+00, 1.3504e-08]], device='cuda:0') Epoch 500, bias, value: tensor([-0.0138, -0.0218, -0.0089, -0.0211, 0.0034, -0.0016, 0.0087, 0.0227, 0.0099, -0.0135], device='cuda:0'), grad: tensor([-2.4680e-08, 2.0955e-08, -9.3132e-09, 4.6566e-10, 1.2107e-08, 1.8626e-09, 4.1910e-09, 1.0710e-08, -1.6298e-08, 8.8476e-09], device='cuda:0') 100 1e-05 changing lr epoch 499, time 214.93, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4227 re_mapping 0.0025 re_causal 0.0095 /// teacc 99.05 lr 0.00001000 ---------------------saving last model at epoch 499---------------------------------------------------- /home/yuqian_fu here1 here2 {'gpu': '0', 'data': 'mnist', 'ntr': None, 'translate': None, 'autoaug': 'CA_multiple', 'n': 3, 'stride': 3, 'factor_num': 14, 'epochs': 500, 'nbatch': 100, 'batchsize': 32, 'lr': 0.0001, 'lr_scheduler': 'Step', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep500_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1', 'clsadapt': True, 'lambda_causal': 1.0, 'lambda_re': 1.0, 'randm': True, 'randn': True, 'network': 'resnet18'} stride: 3 --------------------------CA_multiple-------------------------- ---------------------------14 factors----------------- randm: True randn: True n: 3 randm: False Epoch 1, weight, value: tensor([[-0.0100, -0.0114, -0.0008, ..., -0.0300, 0.0253, 0.0120], [ 0.0002, 0.0172, -0.0180, ..., 0.0283, 0.0296, -0.0199], [ 0.0140, 0.0142, 0.0171, ..., 0.0041, 0.0127, 0.0219], ..., [-0.0155, -0.0106, 0.0301, ..., -0.0249, -0.0211, -0.0042], [-0.0038, 0.0173, -0.0014, ..., -0.0064, -0.0125, -0.0106], [ 0.0016, 0.0114, 0.0042, ..., -0.0289, -0.0007, 0.0232]], device='cuda:0'), grad: None Epoch 1, bias, value: tensor([ 0.0146, -0.0216, -0.0005, 0.0296, -0.0112, 0.0282, 0.0049, 0.0115, 0.0094, -0.0021], device='cuda:0'), grad: None 100 0.0001 changing lr ---------------------saving model at epoch 0---------------------------------------------------- epoch 0, time 231.85, cls_loss 1.3938 cls_loss_mapping 1.8860 cls_loss_causal 2.2245 re_mapping 0.1345 re_causal 0.1397 /// teacc 85.45 lr 0.00010000 Epoch 2, weight, value: tensor([[-0.0093, -0.0118, -0.0030, ..., -0.0260, 0.0210, 0.0120], [ 0.0008, 0.0176, -0.0176, ..., 0.0259, 0.0305, -0.0199], [ 0.0131, 0.0155, 0.0158, ..., 0.0021, 0.0173, 0.0219], ..., [-0.0163, -0.0116, 0.0276, ..., -0.0288, -0.0281, -0.0042], [-0.0041, 0.0172, -0.0042, ..., -0.0102, -0.0097, -0.0106], [ 0.0007, 0.0102, 0.0018, ..., -0.0340, -0.0060, 0.0232]], device='cuda:0'), grad: tensor([[ 0.0000, 0.0000, 0.0000, ..., 0.0107, 0.0344, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0045, -0.0017, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0224, -0.0208, 0.0000], ..., [ 0.0000, 0.0000, 0.0000, ..., 0.0038, 0.0081, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0157, 0.0263, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0050, 0.0150, 0.0000]], device='cuda:0') Epoch 2, bias, value: tensor([ 0.0124, -0.0185, -0.0008, 0.0294, -0.0118, 0.0289, 0.0053, 0.0118, 0.0082, -0.0029], device='cuda:0'), grad: tensor([ 0.0261, 0.0023, -0.0052, -0.0114, 0.0083, -0.0551, -0.0673, -0.0037, 0.0613, 0.0446], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 1---------------------------------------------------- epoch 1, time 231.40, cls_loss 0.4578 cls_loss_mapping 0.7867 cls_loss_causal 1.9281 re_mapping 0.2059 re_causal 0.2627 /// teacc 92.16 lr 0.00010000 Epoch 3, weight, value: tensor([[-9.2938e-03, -1.1764e-02, -3.0247e-03, ..., -2.3329e-02, 1.9973e-02, 1.1960e-02], [ 8.7744e-04, 1.7649e-02, -1.7532e-02, ..., 2.5183e-02, 2.9312e-02, -1.9872e-02], [ 1.3047e-02, 1.5507e-02, 1.5782e-02, ..., -1.7187e-05, 1.9124e-02, 2.1870e-02], ..., [-1.6276e-02, -1.1638e-02, 2.7572e-02, ..., -3.1240e-02, -2.9613e-02, -4.2206e-03], [-4.3214e-03, 1.7225e-02, -4.1866e-03, ..., -1.3714e-02, -7.7291e-03, -1.0556e-02], [ 7.1773e-04, 1.0208e-02, 1.7482e-03, ..., -3.5895e-02, -9.1795e-03, 2.3166e-02]], device='cuda:0'), grad: tensor([[ 0.0000, 0.0000, 0.0000, ..., 0.0271, 0.0068, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., -0.0011, 0.0105, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0045, 0.0060, 0.0000], ..., [ 0.0000, 0.0000, 0.0000, ..., 0.0009, 0.0037, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0041, -0.0287, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0044, 0.0039, 0.0000]], device='cuda:0') Epoch 3, bias, value: tensor([ 0.0119, -0.0182, -0.0007, 0.0295, -0.0117, 0.0299, 0.0050, 0.0113, 0.0077, -0.0028], device='cuda:0'), grad: tensor([ 0.0381, 0.0062, 0.0194, 0.0093, -0.0139, -0.0243, -0.0286, 0.0022, -0.0200, 0.0116], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 2---------------------------------------------------- epoch 2, time 231.11, cls_loss 0.2856 cls_loss_mapping 0.4688 cls_loss_causal 1.7267 re_mapping 0.1550 re_causal 0.2505 /// teacc 93.81 lr 0.00010000 Epoch 4, weight, value: tensor([[-0.0093, -0.0128, -0.0030, ..., -0.0220, 0.0196, 0.0120], [ 0.0009, 0.0173, -0.0175, ..., 0.0256, 0.0289, -0.0199], [ 0.0130, 0.0186, 0.0158, ..., -0.0018, 0.0201, 0.0219], ..., [-0.0163, -0.0126, 0.0276, ..., -0.0327, -0.0304, -0.0042], [-0.0043, 0.0149, -0.0042, ..., -0.0160, -0.0058, -0.0106], [ 0.0007, 0.0098, 0.0017, ..., -0.0372, -0.0114, 0.0232]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.2532e-05, 0.0000e+00, ..., 5.3024e-04, 1.9407e-03, 0.0000e+00], [ 0.0000e+00, 1.2681e-05, 0.0000e+00, ..., 6.7711e-04, 1.2560e-03, 0.0000e+00], [ 0.0000e+00, 7.0953e-03, 0.0000e+00, ..., 7.2708e-03, 5.3497e-02, 0.0000e+00], ..., [ 0.0000e+00, -7.3242e-03, 0.0000e+00, ..., -4.5929e-03, -4.4159e-02, 0.0000e+00], [ 0.0000e+00, 2.4602e-05, 0.0000e+00, ..., 1.2825e-02, 3.9795e-02, 0.0000e+00], [ 0.0000e+00, 2.2147e-06, 0.0000e+00, ..., 5.0497e-04, -1.8873e-03, 0.0000e+00]], device='cuda:0') Epoch 4, bias, value: tensor([ 0.0117, -0.0178, -0.0004, 0.0294, -0.0120, 0.0301, 0.0047, 0.0113, 0.0080, -0.0031], device='cuda:0'), grad: tensor([ 0.0018, -0.0016, 0.0365, -0.0023, 0.0026, -0.0017, -0.0288, -0.0201, 0.0322, -0.0188], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 3---------------------------------------------------- epoch 3, time 231.39, cls_loss 0.2267 cls_loss_mapping 0.3438 cls_loss_causal 1.5339 re_mapping 0.1193 re_causal 0.2169 /// teacc 95.53 lr 0.00010000 Epoch 5, weight, value: tensor([[-0.0093, -0.0134, -0.0030, ..., -0.0209, 0.0196, 0.0120], [ 0.0009, 0.0170, -0.0175, ..., 0.0259, 0.0284, -0.0199], [ 0.0130, 0.0178, 0.0158, ..., -0.0030, 0.0208, 0.0219], ..., [-0.0163, -0.0112, 0.0276, ..., -0.0341, -0.0316, -0.0042], [-0.0043, 0.0129, -0.0042, ..., -0.0181, -0.0045, -0.0106], [ 0.0007, 0.0095, 0.0017, ..., -0.0379, -0.0128, 0.0232]], device='cuda:0'), grad: tensor([[ 0.0000, 0.0000, 0.0000, ..., -0.0019, 0.0194, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0052, 0.0103, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0043, -0.0147, 0.0000], ..., [ 0.0000, 0.0000, 0.0000, ..., 0.0006, 0.0041, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0022, 0.0119, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0009, -0.0220, 0.0000]], device='cuda:0') Epoch 5, bias, value: tensor([ 0.0121, -0.0173, -0.0003, 0.0291, -0.0116, 0.0298, 0.0046, 0.0111, 0.0078, -0.0031], device='cuda:0'), grad: tensor([ 0.0193, 0.0073, -0.0134, 0.0260, -0.0079, -0.0156, -0.0226, 0.0017, 0.0151, -0.0099], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 4---------------------------------------------------- epoch 4, time 231.05, cls_loss 0.1704 cls_loss_mapping 0.2512 cls_loss_causal 1.4027 re_mapping 0.1000 re_causal 0.2003 /// teacc 95.83 lr 0.00010000 Epoch 6, weight, value: tensor([[-0.0093, -0.0134, -0.0030, ..., -0.0199, 0.0197, 0.0120], [ 0.0009, 0.0170, -0.0175, ..., 0.0268, 0.0285, -0.0199], [ 0.0130, 0.0178, 0.0158, ..., -0.0044, 0.0214, 0.0219], ..., [-0.0163, -0.0112, 0.0276, ..., -0.0352, -0.0331, -0.0042], [-0.0043, 0.0129, -0.0042, ..., -0.0194, -0.0029, -0.0106], [ 0.0007, 0.0095, 0.0017, ..., -0.0394, -0.0143, 0.0232]], device='cuda:0'), grad: tensor([[ 0.0000, 0.0000, 0.0000, ..., -0.0091, -0.0091, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0023, 0.0042, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., -0.0006, -0.0194, 0.0000], ..., [ 0.0000, 0.0000, 0.0000, ..., 0.0006, 0.0017, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0075, 0.0224, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0022, 0.0031, 0.0000]], device='cuda:0') Epoch 6, bias, value: tensor([ 0.0123, -0.0167, -0.0004, 0.0290, -0.0117, 0.0294, 0.0044, 0.0109, 0.0080, -0.0031], device='cuda:0'), grad: tensor([-0.0095, 0.0058, -0.0125, -0.0062, 0.0070, -0.0328, 0.0072, 0.0090, 0.0296, 0.0024], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 5---------------------------------------------------- epoch 5, time 232.19, cls_loss 0.1403 cls_loss_mapping 0.2009 cls_loss_causal 1.2733 re_mapping 0.0831 re_causal 0.1784 /// teacc 96.87 lr 0.00010000 Epoch 7, weight, value: tensor([[-0.0093, -0.0136, -0.0030, ..., -0.0197, 0.0192, 0.0120], [ 0.0009, 0.0170, -0.0175, ..., 0.0262, 0.0275, -0.0199], [ 0.0130, 0.0179, 0.0158, ..., -0.0053, 0.0223, 0.0219], ..., [-0.0163, -0.0112, 0.0276, ..., -0.0362, -0.0343, -0.0042], [-0.0042, 0.0127, -0.0042, ..., -0.0206, -0.0015, -0.0106], [ 0.0007, 0.0094, 0.0017, ..., -0.0400, -0.0157, 0.0232]], device='cuda:0'), grad: tensor([[ 0.0000, 0.0000, 0.0000, ..., 0.0017, -0.0036, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0003, 0.0010, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0015, 0.0025, 0.0000], ..., [ 0.0000, 0.0000, 0.0000, ..., 0.0009, 0.0027, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0030, -0.0009, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0012, 0.0028, 0.0000]], device='cuda:0') Epoch 7, bias, value: tensor([ 0.0123, -0.0168, -0.0003, 0.0288, -0.0116, 0.0292, 0.0043, 0.0110, 0.0082, -0.0030], device='cuda:0'), grad: tensor([-0.0120, 0.0004, 0.0056, -0.0004, -0.0185, 0.0036, 0.0042, 0.0066, 0.0020, 0.0084], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 6---------------------------------------------------- epoch 6, time 230.80, cls_loss 0.1203 cls_loss_mapping 0.1726 cls_loss_causal 1.2307 re_mapping 0.0705 re_causal 0.1611 /// teacc 97.13 lr 0.00010000 Epoch 8, weight, value: tensor([[-0.0096, -0.0137, -0.0030, ..., -0.0192, 0.0188, 0.0120], [ 0.0008, 0.0168, -0.0175, ..., 0.0262, 0.0267, -0.0199], [ 0.0128, 0.0175, 0.0158, ..., -0.0064, 0.0228, 0.0219], ..., [-0.0163, -0.0113, 0.0276, ..., -0.0370, -0.0348, -0.0042], [-0.0058, 0.0124, -0.0042, ..., -0.0213, -0.0007, -0.0106], [ 0.0004, 0.0092, 0.0017, ..., -0.0414, -0.0169, 0.0232]], device='cuda:0'), grad: tensor([[ 0.0000, 0.0000, 0.0000, ..., 0.0011, 0.0014, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0022, 0.0009, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0007, -0.0015, 0.0000], ..., [ 0.0000, 0.0000, 0.0000, ..., 0.0002, 0.0015, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0027, 0.0021, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0005, 0.0008, 0.0000]], device='cuda:0') Epoch 8, bias, value: tensor([ 0.0124, -0.0168, -0.0003, 0.0288, -0.0113, 0.0291, 0.0041, 0.0110, 0.0081, -0.0030], device='cuda:0'), grad: tensor([ 0.0021, 0.0057, 0.0009, 0.0005, -0.0181, -0.0112, 0.0013, 0.0044, 0.0089, 0.0056], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 7---------------------------------------------------- epoch 7, time 230.55, cls_loss 0.1114 cls_loss_mapping 0.1632 cls_loss_causal 1.2534 re_mapping 0.0612 re_causal 0.1490 /// teacc 97.19 lr 0.00010000 Epoch 9, weight, value: tensor([[-0.0149, -0.0137, -0.0030, ..., -0.0183, 0.0184, 0.0170], [ 0.0049, 0.0168, -0.0175, ..., 0.0262, 0.0259, -0.0219], [ 0.0073, 0.0175, 0.0158, ..., -0.0075, 0.0233, 0.0159], ..., [-0.0237, -0.0113, 0.0276, ..., -0.0374, -0.0353, -0.0039], [-0.0072, 0.0124, -0.0042, ..., -0.0228, 0.0006, -0.0187], [-0.0051, 0.0092, 0.0017, ..., -0.0423, -0.0177, 0.0145]], device='cuda:0'), grad: tensor([[ 0.0000, 0.0000, 0.0000, ..., 0.0011, 0.0022, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0055, 0.0025, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0028, 0.0036, 0.0000], ..., [ 0.0000, 0.0000, 0.0000, ..., 0.0003, 0.0018, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0016, -0.0039, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0005, 0.0033, 0.0000]], device='cuda:0') Epoch 9, bias, value: tensor([ 1.2328e-02, -1.6829e-02, -5.0162e-05, 2.8744e-02, -1.1248e-02, 2.8744e-02, 4.0316e-03, 1.1183e-02, 8.4184e-03, -3.2370e-03], device='cuda:0'), grad: tensor([ 0.0042, -0.0015, 0.0082, 0.0193, -0.0048, -0.0291, -0.0100, 0.0054, -0.0002, 0.0085], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 8---------------------------------------------------- epoch 8, time 232.00, cls_loss 0.1004 cls_loss_mapping 0.1419 cls_loss_causal 1.1903 re_mapping 0.0560 re_causal 0.1379 /// teacc 97.23 lr 0.00010000 Epoch 10, weight, value: tensor([[-0.0149, -0.0137, -0.0030, ..., -0.0175, 0.0180, 0.0174], [ 0.0049, 0.0168, -0.0175, ..., 0.0260, 0.0251, -0.0219], [ 0.0073, 0.0175, 0.0158, ..., -0.0083, 0.0239, 0.0155], ..., [-0.0237, -0.0113, 0.0276, ..., -0.0383, -0.0365, -0.0040], [-0.0072, 0.0124, -0.0042, ..., -0.0232, 0.0016, -0.0194], [-0.0051, 0.0092, 0.0017, ..., -0.0439, -0.0188, 0.0140]], device='cuda:0'), grad: tensor([[ 0.0000, 0.0000, 0.0000, ..., 0.0096, 0.0079, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., -0.0002, 0.0002, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0007, -0.0005, 0.0000], ..., [ 0.0000, 0.0000, 0.0000, ..., 0.0004, 0.0006, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., 0.0006, -0.0003, 0.0000], [ 0.0000, 0.0000, 0.0000, ..., -0.0100, -0.0075, 0.0000]], device='cuda:0') Epoch 10, bias, value: tensor([ 0.0123, -0.0170, 0.0004, 0.0291, -0.0111, 0.0287, 0.0038, 0.0108, 0.0085, -0.0033], device='cuda:0'), grad: tensor([ 0.0214, -0.0003, -0.0020, 0.0076, 0.0012, -0.0104, 0.0014, 0.0019, 0.0011, -0.0219], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 9---------------------------------------------------- epoch 9, time 230.65, cls_loss 0.0951 cls_loss_mapping 0.1368 cls_loss_causal 1.1031 re_mapping 0.0531 re_causal 0.1253 /// teacc 97.40 lr 0.00010000 Epoch 11, weight, value: tensor([[-0.0150, -0.0140, -0.0030, ..., -0.0168, 0.0176, 0.0131], [ 0.0049, 0.0166, -0.0175, ..., 0.0265, 0.0244, -0.0321], [ 0.0073, 0.0177, 0.0158, ..., -0.0095, 0.0243, 0.0079], ..., [-0.0237, -0.0113, 0.0276, ..., -0.0391, -0.0372, 0.0032], [-0.0072, 0.0123, -0.0042, ..., -0.0241, 0.0027, -0.0333], [-0.0051, 0.0092, 0.0017, ..., -0.0450, -0.0197, 0.0091]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8158e-03, -7.8964e-04, -3.3259e-04], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -8.2731e-05, 2.9397e-04, 8.9929e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.5735e-04, 4.5586e-04, 6.1631e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6463e-04, 8.9347e-05, 7.0147e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.2043e-03, 2.1801e-03, 2.3991e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.0640e-04, 3.2663e-04, 3.4243e-05]], device='cuda:0') Epoch 11, bias, value: tensor([ 0.0121, -0.0168, 0.0003, 0.0294, -0.0109, 0.0285, 0.0034, 0.0108, 0.0088, -0.0036], device='cuda:0'), grad: tensor([-0.0008, 0.0261, -0.0030, 0.0094, 0.0074, 0.0049, -0.0079, -0.0114, 0.0078, -0.0325], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 10---------------------------------------------------- epoch 10, time 230.64, cls_loss 0.0825 cls_loss_mapping 0.1230 cls_loss_causal 1.0997 re_mapping 0.0469 re_causal 0.1231 /// teacc 97.57 lr 0.00010000 Epoch 12, weight, value: tensor([[-0.0152, -0.0143, -0.0030, ..., -0.0165, 0.0168, 0.0107], [ 0.0049, 0.0165, -0.0175, ..., 0.0258, 0.0235, -0.0470], [ 0.0072, 0.0178, 0.0158, ..., -0.0103, 0.0246, 0.0043], ..., [-0.0225, -0.0113, 0.0276, ..., -0.0395, -0.0379, 0.0064], [-0.0073, 0.0121, -0.0042, ..., -0.0248, 0.0034, -0.0384], [-0.0058, 0.0091, 0.0017, ..., -0.0466, -0.0204, 0.0024]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8204e-02, -5.3787e-03, 1.9092e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1319e-04, 1.3006e-04, 6.3516e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7861e-04, -6.9857e-04, 3.1712e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7202e-04, 3.9220e-04, 7.2550e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1387e-03, 4.3607e-04, 1.2433e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.2908e-04, 3.3855e-04, 4.4703e-07]], device='cuda:0') Epoch 12, bias, value: tensor([ 0.0121, -0.0174, 0.0004, 0.0294, -0.0111, 0.0285, 0.0036, 0.0109, 0.0090, -0.0036], device='cuda:0'), grad: tensor([-0.0137, 0.0009, 0.0013, 0.0034, -0.0055, 0.0028, 0.0064, 0.0062, 0.0020, -0.0038], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 11---------------------------------------------------- epoch 11, time 230.26, cls_loss 0.0615 cls_loss_mapping 0.0927 cls_loss_causal 1.0239 re_mapping 0.0452 re_causal 0.1190 /// teacc 97.78 lr 0.00010000 Epoch 13, weight, value: tensor([[-0.0145, -0.0146, -0.0030, ..., -0.0158, 0.0164, 0.0100], [ 0.0051, 0.0164, -0.0175, ..., 0.0255, 0.0227, -0.0515], [ 0.0070, 0.0180, 0.0158, ..., -0.0108, 0.0252, 0.0054], ..., [-0.0224, -0.0114, 0.0276, ..., -0.0398, -0.0386, 0.0070], [-0.0074, 0.0117, -0.0042, ..., -0.0254, 0.0043, -0.0394], [-0.0060, 0.0089, 0.0017, ..., -0.0476, -0.0213, -0.0005]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -8.7261e-04, -4.3154e-04, 9.2201e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.9414e-04, -8.1539e-04, 2.4568e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0366e-03, -2.7485e-03, 1.7695e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.1016e-05, -9.9659e-04, 2.3078e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7800e-04, 3.0537e-03, 3.8603e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.6560e-05, 2.4188e-04, 4.7721e-06]], device='cuda:0') Epoch 13, bias, value: tensor([ 0.0121, -0.0175, 0.0008, 0.0293, -0.0113, 0.0284, 0.0036, 0.0110, 0.0093, -0.0038], device='cuda:0'), grad: tensor([-0.0006, -0.0059, 0.0042, -0.0104, -0.0002, 0.0028, 0.0028, -0.0044, 0.0102, 0.0017], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 12---------------------------------------------------- epoch 12, time 230.74, cls_loss 0.0686 cls_loss_mapping 0.1031 cls_loss_causal 1.0073 re_mapping 0.0424 re_causal 0.1082 /// teacc 97.85 lr 0.00010000 Epoch 14, weight, value: tensor([[-0.0148, -0.0146, -0.0030, ..., -0.0152, 0.0160, 0.0095], [ 0.0059, 0.0163, -0.0175, ..., 0.0255, 0.0225, -0.0550], [ 0.0038, 0.0181, 0.0158, ..., -0.0114, 0.0256, 0.0038], ..., [-0.0152, -0.0114, 0.0276, ..., -0.0403, -0.0393, 0.0106], [-0.0099, 0.0117, -0.0042, ..., -0.0262, 0.0050, -0.0397], [-0.0131, 0.0089, 0.0017, ..., -0.0484, -0.0215, -0.0025]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.0826e-02, -7.7782e-03, 1.4298e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.6454e-06, 6.1512e-04, 5.3346e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.4925e-04, -3.9577e-04, 1.0669e-04], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.8079e-04, 6.5708e-04, 7.2212e-03], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.0926e-03, 4.6844e-03, 4.2796e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.0265e-04, 6.3801e-04, 3.1209e-04]], device='cuda:0') Epoch 14, bias, value: tensor([ 0.0119, -0.0175, 0.0007, 0.0294, -0.0112, 0.0282, 0.0034, 0.0113, 0.0090, -0.0036], device='cuda:0'), grad: tensor([-1.3512e-02, -1.9684e-03, 9.2602e-04, 3.2139e-03, -1.0506e-02, 3.3170e-05, 4.4942e-04, 1.2093e-02, 6.9580e-03, 2.3136e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 13---------------------------------------------------- epoch 13, time 230.57, cls_loss 0.0578 cls_loss_mapping 0.0852 cls_loss_causal 1.0313 re_mapping 0.0390 re_causal 0.1042 /// teacc 98.07 lr 0.00010000 Epoch 15, weight, value: tensor([[-0.0148, -0.0146, -0.0030, ..., -0.0142, 0.0158, 0.0082], [ 0.0059, 0.0163, -0.0175, ..., 0.0256, 0.0219, -0.0609], [ 0.0038, 0.0181, 0.0158, ..., -0.0122, 0.0260, 0.0017], ..., [-0.0152, -0.0114, 0.0276, ..., -0.0409, -0.0401, 0.0116], [-0.0099, 0.0116, -0.0042, ..., -0.0270, 0.0054, -0.0407], [-0.0132, 0.0089, 0.0017, ..., -0.0493, -0.0218, -0.0043]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.4402e-04, 4.0054e-04, 3.0470e-04], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0452e-03, 2.4738e-03, 1.8096e-04], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.4169e-04, -1.4760e-05, 6.0052e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.9778e-05, 2.2471e-04, -8.1158e-04], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.6060e-03, -3.9101e-03, 9.1732e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7297e-04, 2.4331e-04, 7.6389e-04]], device='cuda:0') Epoch 15, bias, value: tensor([ 0.0119, -0.0175, 0.0006, 0.0296, -0.0110, 0.0285, 0.0035, 0.0113, 0.0089, -0.0039], device='cuda:0'), grad: tensor([ 0.0025, 0.0056, 0.0004, 0.0017, 0.0025, 0.0006, 0.0002, -0.0039, -0.0037, -0.0058], device='cuda:0') 100 0.0001 changing lr epoch 14, time 214.62, cls_loss 0.0541 cls_loss_mapping 0.0854 cls_loss_causal 0.9871 re_mapping 0.0361 re_causal 0.1006 /// teacc 97.99 lr 0.00010000 Epoch 16, weight, value: tensor([[-1.4800e-02, -1.4654e-02, -2.5723e-03, ..., -1.3739e-02, 1.5383e-02, 7.1815e-03], [ 5.8514e-03, 1.6344e-02, -1.7533e-02, ..., 2.5465e-02, 2.1403e-02, -6.8740e-02], [ 3.7659e-03, 1.8055e-02, 1.5764e-02, ..., -1.2773e-02, 2.6297e-02, 1.5657e-05], ..., [-1.5164e-02, -1.1371e-02, 2.7566e-02, ..., -4.1242e-02, -4.0861e-02, 1.2808e-02], [-9.8966e-03, 1.1638e-02, -4.2019e-03, ..., -2.7897e-02, 6.1218e-03, -4.2522e-02], [-1.3214e-02, 8.9298e-03, 1.7273e-03, ..., -5.0079e-02, -2.2540e-02, -7.0787e-03]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -6.8130e-03, -1.4706e-03, 4.6417e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.4923e-05, 3.3569e-04, 2.9221e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.2602e-04, 7.8058e-04, 1.1578e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9898e-04, 4.7731e-04, -1.6183e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.3590e-04, -1.1940e-03, 2.1651e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.8920e-04, 1.8609e-04, 1.7262e-04]], device='cuda:0') Epoch 16, bias, value: tensor([ 0.0116, -0.0173, 0.0004, 0.0295, -0.0107, 0.0280, 0.0037, 0.0114, 0.0093, -0.0041], device='cuda:0'), grad: tensor([-8.1329e-03, -7.8201e-05, 5.3253e-03, -5.2376e-03, 8.2397e-04, 2.2011e-03, 3.3875e-03, -6.5088e-04, 1.2894e-03, 1.0595e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 15---------------------------------------------------- epoch 15, time 230.66, cls_loss 0.0487 cls_loss_mapping 0.0756 cls_loss_causal 0.9825 re_mapping 0.0351 re_causal 0.0983 /// teacc 98.23 lr 0.00010000 Epoch 17, weight, value: tensor([[-0.0148, -0.0147, -0.0026, ..., -0.0132, 0.0151, 0.0056], [ 0.0059, 0.0163, -0.0175, ..., 0.0253, 0.0210, -0.0787], [ 0.0038, 0.0181, 0.0158, ..., -0.0132, 0.0268, -0.0010], ..., [-0.0152, -0.0114, 0.0276, ..., -0.0416, -0.0417, 0.0141], [-0.0099, 0.0116, -0.0042, ..., -0.0283, 0.0066, -0.0446], [-0.0132, 0.0089, 0.0017, ..., -0.0508, -0.0226, -0.0104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -3.3408e-05, 1.0014e-04, 3.0845e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.0134e-04, -6.1214e-05, 1.3518e-04], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3065e-04, -4.8733e-04, 4.7565e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8433e-05, 1.0643e-03, -5.9754e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.5882e-04, 8.5163e-04, 1.2374e-04], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9594e-05, 2.7633e-04, 1.4906e-03]], device='cuda:0') Epoch 17, bias, value: tensor([ 0.0117, -0.0176, 0.0009, 0.0301, -0.0109, 0.0279, 0.0036, 0.0113, 0.0093, -0.0045], device='cuda:0'), grad: tensor([ 0.0003, 0.0006, -0.0201, -0.0054, -0.0043, 0.0015, -0.0002, 0.0190, 0.0055, 0.0030], device='cuda:0') 100 0.0001 changing lr epoch 16, time 214.19, cls_loss 0.0481 cls_loss_mapping 0.0727 cls_loss_causal 0.9366 re_mapping 0.0328 re_causal 0.0910 /// teacc 98.08 lr 0.00010000 Epoch 18, weight, value: tensor([[-0.0153, -0.0147, -0.0026, ..., -0.0126, 0.0149, 0.0041], [ 0.0057, 0.0163, -0.0175, ..., 0.0250, 0.0204, -0.0875], [ 0.0032, 0.0181, 0.0158, ..., -0.0136, 0.0272, -0.0025], ..., [-0.0153, -0.0114, 0.0276, ..., -0.0420, -0.0422, 0.0147], [-0.0105, 0.0116, -0.0042, ..., -0.0291, 0.0074, -0.0473], [-0.0141, 0.0089, 0.0017, ..., -0.0510, -0.0231, -0.0134]], device='cuda:0'), grad: tensor([[ 1.2228e-06, 0.0000e+00, 0.0000e+00, ..., 8.4114e-04, 7.8201e-04, 8.5905e-06], [ 2.6394e-06, 0.0000e+00, 0.0000e+00, ..., -2.0429e-05, 1.6928e-04, 5.3167e-05], [ 1.0274e-05, 0.0000e+00, 0.0000e+00, ..., 2.4414e-04, -7.9155e-04, 4.0442e-05], ..., [ 1.0006e-05, 0.0000e+00, 0.0000e+00, ..., 8.4758e-05, 1.9610e-04, 7.1406e-05], [ 3.0845e-06, 0.0000e+00, 0.0000e+00, ..., 4.5776e-03, 5.8937e-03, 5.5522e-05], [ 4.4741e-06, 0.0000e+00, 0.0000e+00, ..., 5.0402e-04, 6.7949e-04, -5.5462e-05]], device='cuda:0') Epoch 18, bias, value: tensor([ 0.0118, -0.0179, 0.0010, 0.0300, -0.0110, 0.0282, 0.0034, 0.0113, 0.0093, -0.0043], device='cuda:0'), grad: tensor([ 0.0003, 0.0031, 0.0005, -0.0010, 0.0003, -0.0134, 0.0054, -0.0045, 0.0085, 0.0008], device='cuda:0') 100 0.0001 changing lr epoch 17, time 214.40, cls_loss 0.0520 cls_loss_mapping 0.0774 cls_loss_causal 0.9198 re_mapping 0.0313 re_causal 0.0860 /// teacc 98.12 lr 0.00010000 Epoch 19, weight, value: tensor([[-0.0181, -0.0147, -0.0026, ..., -0.0120, 0.0143, 0.0019], [ 0.0057, 0.0163, -0.0175, ..., 0.0252, 0.0199, -0.0981], [ 0.0009, 0.0181, 0.0158, ..., -0.0142, 0.0278, -0.0041], ..., [-0.0155, -0.0114, 0.0276, ..., -0.0423, -0.0431, 0.0135], [-0.0083, 0.0116, -0.0042, ..., -0.0297, 0.0080, -0.0503], [-0.0158, 0.0089, 0.0017, ..., -0.0518, -0.0236, -0.0152]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.2798e-03, 6.4707e-04, 2.9281e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8919e-04, 2.7776e-04, 3.0041e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.8729e-04, -9.0256e-03, 3.8557e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6372e-05, 9.8705e-04, 9.8765e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.1212e-04, 3.9177e-03, 3.7074e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.3479e-05, 4.3035e-04, -8.4019e-04]], device='cuda:0') Epoch 19, bias, value: tensor([ 0.0114, -0.0181, 0.0014, 0.0301, -0.0108, 0.0282, 0.0033, 0.0113, 0.0095, -0.0046], device='cuda:0'), grad: tensor([ 0.0026, 0.0003, -0.0143, 0.0010, 0.0064, 0.0028, 0.0012, 0.0022, 0.0096, -0.0118], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 18---------------------------------------------------- epoch 18, time 230.67, cls_loss 0.0371 cls_loss_mapping 0.0563 cls_loss_causal 0.8916 re_mapping 0.0310 re_causal 0.0903 /// teacc 98.28 lr 0.00010000 Epoch 20, weight, value: tensor([[-0.0181, -0.0147, -0.0026, ..., -0.0115, 0.0141, 0.0022], [ 0.0059, 0.0163, -0.0175, ..., 0.0256, 0.0196, -0.1037], [-0.0004, 0.0181, 0.0158, ..., -0.0148, 0.0281, -0.0025], ..., [-0.0144, -0.0114, 0.0276, ..., -0.0426, -0.0435, 0.0127], [-0.0085, 0.0116, -0.0042, ..., -0.0302, 0.0087, -0.0530], [-0.0168, 0.0089, 0.0017, ..., -0.0521, -0.0238, -0.0153]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.8944e-04, 5.4073e-04, 2.0042e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.2953e-06, 8.1897e-05, 2.0519e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.2525e-05, 2.0885e-04, 1.7583e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5661e-05, 6.3956e-05, 1.9819e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.8253e-04, -5.5504e-04, 1.0565e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.5267e-05, 1.6761e-04, 1.5736e-05]], device='cuda:0') Epoch 20, bias, value: tensor([ 0.0117, -0.0179, 0.0013, 0.0300, -0.0109, 0.0280, 0.0031, 0.0115, 0.0095, -0.0046], device='cuda:0'), grad: tensor([ 6.0558e-04, -4.3839e-05, 2.4462e-04, 3.6192e-04, 3.1567e-04, 5.4312e-04, -1.0729e-03, 2.1935e-04, -6.9237e-04, -4.8065e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 19---------------------------------------------------- epoch 19, time 230.99, cls_loss 0.0412 cls_loss_mapping 0.0590 cls_loss_causal 0.9101 re_mapping 0.0303 re_causal 0.0833 /// teacc 98.36 lr 0.00010000 Epoch 21, weight, value: tensor([[-0.0180, -0.0147, -0.0026, ..., -0.0108, 0.0138, 0.0023], [ 0.0059, 0.0163, -0.0175, ..., 0.0256, 0.0192, -0.1072], [-0.0013, 0.0181, 0.0158, ..., -0.0154, 0.0284, -0.0055], ..., [-0.0126, -0.0114, 0.0276, ..., -0.0429, -0.0437, 0.0133], [-0.0086, 0.0116, -0.0042, ..., -0.0309, 0.0090, -0.0558], [-0.0172, 0.0089, 0.0017, ..., -0.0527, -0.0240, -0.0173]], device='cuda:0'), grad: tensor([[ 8.8662e-07, 0.0000e+00, 0.0000e+00, ..., -1.8525e-04, 8.6355e-04, 1.0170e-05], [-4.0978e-08, 0.0000e+00, 0.0000e+00, ..., -8.8692e-04, 3.9488e-05, 2.8417e-05], [ 4.4219e-06, 0.0000e+00, 0.0000e+00, ..., 2.1553e-04, 6.7770e-05, 3.2037e-05], ..., [-2.0236e-05, 0.0000e+00, 0.0000e+00, ..., 8.9705e-05, 5.6028e-05, 4.8220e-05], [ 3.2261e-06, 0.0000e+00, 0.0000e+00, ..., -8.3847e-03, -2.0218e-02, 1.1645e-05], [ 6.2622e-06, 0.0000e+00, 0.0000e+00, ..., 4.5872e-04, 7.5293e-04, 3.5667e-04]], device='cuda:0') Epoch 21, bias, value: tensor([ 0.0121, -0.0179, 0.0012, 0.0302, -0.0111, 0.0281, 0.0031, 0.0117, 0.0091, -0.0047], device='cuda:0'), grad: tensor([ 7.8726e-04, -2.0885e-03, 9.2077e-04, 1.6678e-02, 9.8038e-04, 5.8708e-03, 9.3174e-04, 6.3658e-05, -2.6581e-02, 2.4242e-03], device='cuda:0') 100 0.0001 changing lr epoch 20, time 214.38, cls_loss 0.0477 cls_loss_mapping 0.0689 cls_loss_causal 0.9220 re_mapping 0.0285 re_causal 0.0796 /// teacc 98.32 lr 0.00010000 Epoch 22, weight, value: tensor([[-0.0177, -0.0147, -0.0026, ..., -0.0108, 0.0133, 0.0011], [ 0.0062, 0.0163, -0.0175, ..., 0.0258, 0.0187, -0.1115], [-0.0020, 0.0181, 0.0158, ..., -0.0158, 0.0288, -0.0074], ..., [-0.0125, -0.0114, 0.0276, ..., -0.0432, -0.0446, 0.0142], [-0.0086, 0.0116, -0.0042, ..., -0.0312, 0.0097, -0.0591], [-0.0178, 0.0089, 0.0017, ..., -0.0522, -0.0246, -0.0192]], device='cuda:0'), grad: tensor([[ 9.9000e-07, 0.0000e+00, 0.0000e+00, ..., 2.9030e-03, 2.4738e-03, 1.5028e-05], [-2.8126e-06, 0.0000e+00, 0.0000e+00, ..., 1.2457e-04, 5.1022e-04, 4.9382e-05], [ 1.6419e-06, 0.0000e+00, 0.0000e+00, ..., 7.2098e-04, 2.2221e-03, 9.8586e-05], ..., [-6.4857e-06, 0.0000e+00, 0.0000e+00, ..., 7.7128e-05, 6.8426e-04, 7.7772e-04], [ 9.9186e-07, 0.0000e+00, 0.0000e+00, ..., 1.5700e-04, -7.1526e-03, 2.1785e-05], [ 2.4047e-06, 0.0000e+00, 0.0000e+00, ..., 5.9891e-04, 1.4353e-03, 3.0327e-04]], device='cuda:0') Epoch 22, bias, value: tensor([ 0.0113, -0.0178, 0.0014, 0.0302, -0.0112, 0.0282, 0.0031, 0.0116, 0.0094, -0.0046], device='cuda:0'), grad: tensor([ 0.0087, 0.0009, 0.0059, -0.0170, -0.0002, 0.0022, 0.0070, 0.0010, -0.0114, 0.0029], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 21---------------------------------------------------- epoch 21, time 230.78, cls_loss 0.0349 cls_loss_mapping 0.0525 cls_loss_causal 0.8437 re_mapping 0.0280 re_causal 0.0774 /// teacc 98.43 lr 0.00010000 Epoch 23, weight, value: tensor([[-0.0177, -0.0147, -0.0026, ..., -0.0104, 0.0128, 0.0008], [ 0.0064, 0.0163, -0.0175, ..., 0.0258, 0.0184, -0.1137], [-0.0012, 0.0181, 0.0158, ..., -0.0163, 0.0291, -0.0078], ..., [-0.0124, -0.0114, 0.0276, ..., -0.0434, -0.0455, 0.0141], [-0.0087, 0.0116, -0.0042, ..., -0.0321, 0.0102, -0.0600], [-0.0180, 0.0089, 0.0017, ..., -0.0528, -0.0255, -0.0204]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.6839e-04, -1.9157e-04, 1.8962e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3708e-05, 1.1772e-04, 4.4219e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7834e-04, -3.9597e-03, -2.7537e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4909e-05, 8.5640e-04, 7.3276e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.0933e-04, 2.2087e-03, 2.6673e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.4094e-05, 6.5982e-05, 7.2360e-05]], device='cuda:0') Epoch 23, bias, value: tensor([ 0.0109, -0.0178, 0.0013, 0.0303, -0.0107, 0.0283, 0.0033, 0.0117, 0.0092, -0.0049], device='cuda:0'), grad: tensor([-6.4039e-04, 9.0742e-04, -4.2763e-03, 2.2522e-02, 5.5838e-04, -2.7508e-05, -4.0627e-04, -2.5940e-02, 4.3526e-03, 2.9659e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 22---------------------------------------------------- epoch 22, time 231.05, cls_loss 0.0335 cls_loss_mapping 0.0572 cls_loss_causal 0.8764 re_mapping 0.0265 re_causal 0.0784 /// teacc 98.45 lr 0.00010000 Epoch 24, weight, value: tensor([[-1.7729e-02, -1.4681e-02, -2.5723e-03, ..., -9.6231e-03, 1.2939e-02, 1.8176e-03], [ 6.1973e-03, 1.6330e-02, -1.7533e-02, ..., 2.5636e-02, 1.7671e-02, -1.1878e-01], [-1.1816e-04, 1.8066e-02, 1.5764e-02, ..., -1.6775e-02, 2.9279e-02, -8.5139e-03], ..., [-1.2394e-02, -1.1373e-02, 2.7566e-02, ..., -4.3673e-02, -4.6214e-02, 1.3608e-02], [-8.7420e-03, 1.1618e-02, -4.2019e-03, ..., -3.2935e-02, 1.0413e-02, -6.1694e-02], [-1.8145e-02, 8.9221e-03, 1.7273e-03, ..., -5.3003e-02, -2.5866e-02, -2.2892e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -3.8290e-04, -1.4572e-03, 7.3109e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -3.5632e-06, 7.9513e-05, 4.1164e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7059e-04, -1.6654e-04, 7.1973e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.9945e-06, 2.1958e-04, 5.9381e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6654e-04, 1.0519e-03, 1.4855e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.9561e-05, -1.5879e-03, 1.6168e-05]], device='cuda:0') Epoch 24, bias, value: tensor([ 0.0114, -0.0182, 0.0012, 0.0301, -0.0105, 0.0288, 0.0031, 0.0118, 0.0091, -0.0051], device='cuda:0'), grad: tensor([-1.8387e-03, 6.2346e-05, -3.7265e-04, 6.0177e-04, 2.2554e-04, 3.4237e-03, -1.8239e-05, 4.1008e-04, 2.1820e-03, -4.6768e-03], device='cuda:0') 100 0.0001 changing lr epoch 23, time 214.54, cls_loss 0.0327 cls_loss_mapping 0.0481 cls_loss_causal 0.8450 re_mapping 0.0254 re_causal 0.0711 /// teacc 98.33 lr 0.00010000 Epoch 25, weight, value: tensor([[-0.0181, -0.0147, -0.0026, ..., -0.0093, 0.0125, 0.0003], [ 0.0065, 0.0163, -0.0175, ..., 0.0258, 0.0173, -0.1198], [-0.0007, 0.0181, 0.0158, ..., -0.0172, 0.0295, -0.0094], ..., [-0.0119, -0.0114, 0.0276, ..., -0.0438, -0.0470, 0.0152], [-0.0089, 0.0116, -0.0042, ..., -0.0333, 0.0110, -0.0635], [-0.0188, 0.0089, 0.0017, ..., -0.0532, -0.0260, -0.0247]], device='cuda:0'), grad: tensor([[ 2.0862e-07, 0.0000e+00, 0.0000e+00, ..., -2.6464e-04, 7.4911e-04, 9.2208e-05], [-6.6422e-06, 0.0000e+00, 0.0000e+00, ..., -2.0489e-05, 4.2868e-04, 8.4519e-05], [ 4.6194e-07, 0.0000e+00, 0.0000e+00, ..., 3.9518e-05, -3.8929e-03, 1.9503e-04], ..., [ 2.2091e-06, 0.0000e+00, 0.0000e+00, ..., 1.6168e-05, 1.8244e-03, 1.9407e-04], [ 1.3560e-06, 0.0000e+00, 0.0000e+00, ..., 7.4625e-05, 8.6367e-05, 8.1837e-05], [ 5.8208e-07, 0.0000e+00, 0.0000e+00, ..., 5.0992e-05, 1.6654e-04, 1.0500e-03]], device='cuda:0') Epoch 25, bias, value: tensor([ 0.0112, -0.0180, 0.0008, 0.0300, -0.0108, 0.0287, 0.0034, 0.0122, 0.0090, -0.0047], device='cuda:0'), grad: tensor([ 0.0009, 0.0008, -0.0030, 0.0014, -0.0045, 0.0008, 0.0021, -0.0019, 0.0006, 0.0028], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 24---------------------------------------------------- epoch 24, time 230.84, cls_loss 0.0232 cls_loss_mapping 0.0399 cls_loss_causal 0.8480 re_mapping 0.0257 re_causal 0.0771 /// teacc 98.48 lr 0.00010000 Epoch 26, weight, value: tensor([[-0.0193, -0.0147, -0.0026, ..., -0.0090, 0.0120, -0.0004], [ 0.0080, 0.0163, -0.0175, ..., 0.0265, 0.0172, -0.1201], [-0.0026, 0.0181, 0.0158, ..., -0.0179, 0.0298, -0.0093], ..., [-0.0118, -0.0114, 0.0276, ..., -0.0439, -0.0479, 0.0160], [-0.0073, 0.0116, -0.0042, ..., -0.0340, 0.0113, -0.0654], [-0.0191, 0.0089, 0.0017, ..., -0.0539, -0.0265, -0.0269]], device='cuda:0'), grad: tensor([[-5.6118e-05, 0.0000e+00, 0.0000e+00, ..., 4.7035e-03, 4.8294e-03, 8.7246e-06], [-4.7445e-05, 0.0000e+00, 0.0000e+00, ..., 9.3520e-05, 5.4777e-05, 2.2620e-05], [ 5.2482e-05, 0.0000e+00, 0.0000e+00, ..., 2.9802e-04, 3.5095e-04, 1.9625e-05], ..., [ 9.5516e-06, 0.0000e+00, 0.0000e+00, ..., 6.6519e-05, 5.8711e-05, 5.9247e-05], [ 2.8223e-05, 0.0000e+00, 0.0000e+00, ..., 2.9516e-04, -2.7871e-04, 4.1366e-05], [ 2.7809e-06, 0.0000e+00, 0.0000e+00, ..., 1.2505e-04, 7.7009e-05, 4.1693e-05]], device='cuda:0') Epoch 26, bias, value: tensor([ 0.0113, -0.0178, 0.0010, 0.0301, -0.0107, 0.0286, 0.0032, 0.0127, 0.0087, -0.0053], device='cuda:0'), grad: tensor([ 6.4087e-03, 2.5535e-04, 1.0805e-03, 2.1725e-03, -2.3675e-04, -1.0967e-03, -8.3923e-03, 7.6592e-05, 8.3733e-04, -1.1005e-03], device='cuda:0') 100 0.0001 changing lr epoch 25, time 214.53, cls_loss 0.0337 cls_loss_mapping 0.0495 cls_loss_causal 0.8459 re_mapping 0.0244 re_causal 0.0703 /// teacc 98.22 lr 0.00010000 Epoch 27, weight, value: tensor([[-0.0182, -0.0147, -0.0026, ..., -0.0085, 0.0117, 0.0016], [ 0.0107, 0.0163, -0.0175, ..., 0.0264, 0.0167, -0.1221], [-0.0045, 0.0181, 0.0158, ..., -0.0183, 0.0306, -0.0108], ..., [-0.0114, -0.0114, 0.0276, ..., -0.0442, -0.0490, 0.0152], [-0.0080, 0.0116, -0.0042, ..., -0.0348, 0.0114, -0.0674], [-0.0225, 0.0089, 0.0017, ..., -0.0539, -0.0271, -0.0282]], device='cuda:0'), grad: tensor([[ 1.6764e-08, 0.0000e+00, 0.0000e+00, ..., -1.3142e-03, -6.8712e-04, 9.2462e-06], [-1.1977e-06, 0.0000e+00, 0.0000e+00, ..., -5.3376e-05, 1.5008e-04, 2.9504e-05], [ 8.8476e-08, 0.0000e+00, 0.0000e+00, ..., 1.0347e-04, 5.8413e-04, 4.9829e-05], ..., [ 2.2445e-07, 0.0000e+00, 0.0000e+00, ..., 2.6256e-05, -4.1500e-06, 1.7035e-04], [ 2.2072e-07, 0.0000e+00, 0.0000e+00, ..., 1.7190e-04, -2.0351e-03, 3.1859e-05], [ 4.0047e-08, 0.0000e+00, 0.0000e+00, ..., 8.9598e-04, 7.5769e-04, 5.0211e-04]], device='cuda:0') Epoch 27, bias, value: tensor([ 0.0115, -0.0180, 0.0010, 0.0311, -0.0104, 0.0284, 0.0031, 0.0122, 0.0083, -0.0054], device='cuda:0'), grad: tensor([-2.1648e-03, 9.5129e-05, 3.2234e-03, 1.1438e-04, -2.7800e-04, 1.6861e-03, -7.7724e-05, -5.2986e-03, -1.6890e-03, 4.3907e-03], device='cuda:0') 100 0.0001 changing lr epoch 26, time 214.57, cls_loss 0.0309 cls_loss_mapping 0.0514 cls_loss_causal 0.8547 re_mapping 0.0235 re_causal 0.0698 /// teacc 98.46 lr 0.00010000 Epoch 28, weight, value: tensor([[-0.0175, -0.0147, -0.0026, ..., -0.0081, 0.0110, 0.0016], [ 0.0116, 0.0163, -0.0175, ..., 0.0269, 0.0161, -0.1246], [-0.0078, 0.0181, 0.0158, ..., -0.0188, 0.0314, -0.0114], ..., [-0.0097, -0.0114, 0.0276, ..., -0.0446, -0.0495, 0.0148], [-0.0085, 0.0116, -0.0042, ..., -0.0354, 0.0118, -0.0691], [-0.0245, 0.0089, 0.0017, ..., -0.0542, -0.0268, -0.0294]], device='cuda:0'), grad: tensor([[-7.8738e-05, 0.0000e+00, 0.0000e+00, ..., -3.7789e-04, 2.0313e-04, 8.2981e-07], [ 5.1856e-06, 0.0000e+00, 0.0000e+00, ..., 4.3005e-05, 1.2922e-04, 3.7123e-06], [ 7.1190e-06, 0.0000e+00, 0.0000e+00, ..., 8.2254e-05, -7.0000e-04, 2.2389e-06], ..., [ 3.1628e-06, 0.0000e+00, 0.0000e+00, ..., 1.9833e-05, 1.5318e-04, -3.0175e-05], [ 4.2543e-06, 0.0000e+00, 0.0000e+00, ..., 5.0724e-05, -2.3575e-03, 9.7789e-07], [ 3.0145e-05, 0.0000e+00, 0.0000e+00, ..., 1.5533e-04, 1.5421e-03, 1.2390e-05]], device='cuda:0') Epoch 28, bias, value: tensor([ 0.0111, -0.0173, 0.0013, 0.0311, -0.0105, 0.0283, 0.0030, 0.0122, 0.0082, -0.0055], device='cuda:0'), grad: tensor([-6.8712e-04, 3.4833e-04, -9.3603e-04, 7.9393e-04, 2.2125e-04, 4.5228e-04, 1.9240e-04, -4.6909e-05, -2.8343e-03, 2.4967e-03], device='cuda:0') 100 0.0001 changing lr epoch 27, time 214.47, cls_loss 0.0317 cls_loss_mapping 0.0504 cls_loss_causal 0.8206 re_mapping 0.0229 re_causal 0.0671 /// teacc 98.47 lr 0.00010000 Epoch 29, weight, value: tensor([[-0.0174, -0.0147, -0.0026, ..., -0.0077, 0.0109, 0.0006], [ 0.0119, 0.0163, -0.0175, ..., 0.0270, 0.0155, -0.1283], [-0.0107, 0.0181, 0.0158, ..., -0.0193, 0.0316, -0.0125], ..., [-0.0061, -0.0114, 0.0276, ..., -0.0448, -0.0499, 0.0159], [-0.0087, 0.0116, -0.0042, ..., -0.0359, 0.0126, -0.0708], [-0.0253, 0.0089, 0.0017, ..., -0.0546, -0.0276, -0.0297]], device='cuda:0'), grad: tensor([[ 1.1856e-06, 0.0000e+00, 0.0000e+00, ..., 1.3262e-05, 5.5885e-04, 1.4365e-05], [-2.3663e-05, 0.0000e+00, 0.0000e+00, ..., 2.8625e-05, 2.3329e-04, 3.4422e-05], [ 3.4142e-06, 0.0000e+00, 0.0000e+00, ..., 2.2560e-05, -5.3215e-03, 5.4985e-05], ..., [ 5.8562e-06, 0.0000e+00, 0.0000e+00, ..., 1.7345e-05, 5.8556e-04, 7.1168e-05], [ 3.7383e-06, 0.0000e+00, 0.0000e+00, ..., 1.8120e-04, -1.5812e-03, -6.0081e-04], [ 1.4910e-06, 0.0000e+00, 0.0000e+00, ..., 2.2694e-05, 9.1887e-04, 2.8777e-04]], device='cuda:0') Epoch 29, bias, value: tensor([ 0.0113, -0.0174, 0.0010, 0.0309, -0.0106, 0.0284, 0.0028, 0.0127, 0.0086, -0.0058], device='cuda:0'), grad: tensor([ 0.0009, 0.0003, -0.0076, 0.0050, 0.0013, 0.0037, -0.0008, 0.0017, -0.0067, 0.0021], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 28---------------------------------------------------- epoch 28, time 230.56, cls_loss 0.0244 cls_loss_mapping 0.0386 cls_loss_causal 0.8062 re_mapping 0.0232 re_causal 0.0684 /// teacc 98.62 lr 0.00010000 Epoch 30, weight, value: tensor([[-0.0169, -0.0147, -0.0026, ..., -0.0072, 0.0108, -0.0005], [ 0.0133, 0.0163, -0.0175, ..., 0.0268, 0.0152, -0.1302], [-0.0119, 0.0181, 0.0158, ..., -0.0196, 0.0320, -0.0131], ..., [-0.0059, -0.0114, 0.0276, ..., -0.0450, -0.0508, 0.0167], [-0.0091, 0.0116, -0.0042, ..., -0.0363, 0.0130, -0.0732], [-0.0262, 0.0089, 0.0017, ..., -0.0549, -0.0278, -0.0312]], device='cuda:0'), grad: tensor([[ 7.0408e-07, 0.0000e+00, 0.0000e+00, ..., 1.0878e-04, 1.6844e-04, 1.5542e-05], [-7.3165e-06, 0.0000e+00, 0.0000e+00, ..., -1.5807e-04, 6.6161e-05, 7.4387e-05], [ 4.2189e-07, 0.0000e+00, 0.0000e+00, ..., 1.2210e-06, -4.3607e-04, 1.5453e-05], ..., [ 1.4408e-06, 0.0000e+00, 0.0000e+00, ..., 4.9025e-05, 4.7177e-05, 2.6524e-06], [ 8.5775e-07, 0.0000e+00, 0.0000e+00, ..., 1.8263e-04, 3.3474e-04, 3.5465e-05], [ 5.6997e-07, 0.0000e+00, 0.0000e+00, ..., 8.6784e-05, 8.8513e-05, 7.2956e-04]], device='cuda:0') Epoch 30, bias, value: tensor([ 0.0114, -0.0178, 0.0008, 0.0313, -0.0105, 0.0285, 0.0025, 0.0125, 0.0085, -0.0055], device='cuda:0'), grad: tensor([ 0.0003, -0.0003, -0.0004, 0.0003, -0.0015, -0.0008, 0.0005, -0.0001, 0.0007, 0.0012], device='cuda:0') 100 0.0001 changing lr epoch 29, time 214.80, cls_loss 0.0233 cls_loss_mapping 0.0403 cls_loss_causal 0.8009 re_mapping 0.0230 re_causal 0.0651 /// teacc 98.52 lr 0.00010000 Epoch 31, weight, value: tensor([[-0.0168, -0.0147, -0.0026, ..., -0.0070, 0.0105, -0.0004], [ 0.0147, 0.0163, -0.0175, ..., 0.0264, 0.0144, -0.1324], [-0.0139, 0.0181, 0.0158, ..., -0.0199, 0.0322, -0.0137], ..., [-0.0051, -0.0114, 0.0276, ..., -0.0452, -0.0511, 0.0173], [-0.0093, 0.0116, -0.0042, ..., -0.0364, 0.0137, -0.0744], [-0.0270, 0.0089, 0.0017, ..., -0.0553, -0.0283, -0.0324]], device='cuda:0'), grad: tensor([[-5.9791e-07, 0.0000e+00, 0.0000e+00, ..., 2.1141e-06, 1.1855e-04, 4.6752e-06], [ 1.1176e-08, 0.0000e+00, 0.0000e+00, ..., -2.6032e-05, 1.6975e-04, 9.5516e-06], [ 5.1223e-08, 0.0000e+00, 0.0000e+00, ..., 1.1034e-05, -1.5229e-05, 2.1383e-05], ..., [ 1.2107e-08, 0.0000e+00, 0.0000e+00, ..., 1.4886e-05, 2.5826e-03, 1.8641e-05], [ 3.9116e-08, 0.0000e+00, 0.0000e+00, ..., 1.2720e-04, -4.2839e-03, 1.3441e-05], [ 3.1665e-08, 0.0000e+00, 0.0000e+00, ..., 1.6361e-05, 3.3140e-04, 6.1893e-04]], device='cuda:0') Epoch 31, bias, value: tensor([ 0.0113, -0.0181, 0.0010, 0.0311, -0.0103, 0.0281, 0.0030, 0.0130, 0.0085, -0.0058], device='cuda:0'), grad: tensor([ 0.0004, 0.0002, 0.0003, 0.0100, -0.0010, 0.0023, 0.0006, 0.0103, -0.0162, -0.0069], device='cuda:0') 100 0.0001 changing lr epoch 30, time 215.20, cls_loss 0.0236 cls_loss_mapping 0.0428 cls_loss_causal 0.8644 re_mapping 0.0202 re_causal 0.0655 /// teacc 98.48 lr 0.00010000 Epoch 32, weight, value: tensor([[-0.0170, -0.0147, -0.0026, ..., -0.0068, 0.0101, -0.0016], [ 0.0140, 0.0163, -0.0175, ..., 0.0265, 0.0139, -0.1338], [-0.0121, 0.0181, 0.0158, ..., -0.0204, 0.0328, -0.0148], ..., [-0.0052, -0.0114, 0.0276, ..., -0.0454, -0.0517, 0.0167], [-0.0092, 0.0116, -0.0042, ..., -0.0365, 0.0143, -0.0764], [-0.0272, 0.0089, 0.0017, ..., -0.0557, -0.0290, -0.0325]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.4891e-04, 4.5991e-04, 1.9714e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.9011e-05, 1.5485e-04, 9.2089e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.8323e-05, -1.5526e-03, 3.9548e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.6345e-05, 1.0452e-03, 1.0097e-04], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9014e-04, -9.6035e-04, 5.7995e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.8862e-05, 5.8174e-04, 9.0182e-05]], device='cuda:0') Epoch 32, bias, value: tensor([ 0.0111, -0.0183, 0.0013, 0.0306, -0.0100, 0.0285, 0.0024, 0.0129, 0.0087, -0.0057], device='cuda:0'), grad: tensor([ 0.0009, 0.0004, -0.0029, 0.0004, -0.0004, 0.0005, -0.0009, 0.0029, 0.0006, -0.0016], device='cuda:0') 100 0.0001 changing lr epoch 31, time 214.77, cls_loss 0.0225 cls_loss_mapping 0.0373 cls_loss_causal 0.7768 re_mapping 0.0213 re_causal 0.0622 /// teacc 98.59 lr 0.00010000 Epoch 33, weight, value: tensor([[-0.0167, -0.0147, -0.0026, ..., -0.0062, 0.0100, -0.0010], [ 0.0140, 0.0163, -0.0175, ..., 0.0264, 0.0132, -0.1350], [-0.0111, 0.0181, 0.0158, ..., -0.0208, 0.0333, -0.0155], ..., [-0.0052, -0.0114, 0.0276, ..., -0.0457, -0.0518, 0.0161], [-0.0095, 0.0116, -0.0042, ..., -0.0368, 0.0147, -0.0782], [-0.0280, 0.0089, 0.0017, ..., -0.0558, -0.0295, -0.0344]], device='cuda:0'), grad: tensor([[ 5.1409e-07, 0.0000e+00, 0.0000e+00, ..., 1.8823e-04, 2.1338e-04, 1.9655e-05], [-1.7509e-05, 0.0000e+00, 0.0000e+00, ..., 8.9109e-06, 5.0366e-05, 2.2531e-05], [ 1.8692e-06, 0.0000e+00, 0.0000e+00, ..., 2.2978e-05, -4.0382e-05, 1.8924e-05], ..., [ 2.5369e-06, 0.0000e+00, 0.0000e+00, ..., 7.4469e-06, 6.5207e-05, 7.1339e-06], [ 7.4692e-06, 0.0000e+00, 0.0000e+00, ..., 1.0586e-04, -6.6221e-05, 7.7784e-06], [ 4.3120e-07, 0.0000e+00, 0.0000e+00, ..., 3.0130e-05, 6.3598e-05, 5.1916e-05]], device='cuda:0') Epoch 33, bias, value: tensor([ 0.0111, -0.0187, 0.0014, 0.0306, -0.0096, 0.0285, 0.0025, 0.0130, 0.0084, -0.0056], device='cuda:0'), grad: tensor([ 3.2163e-04, 6.1512e-05, 2.7156e-04, -6.8951e-04, -4.6563e-04, 2.7418e-04, -4.9067e-04, 3.4094e-04, 1.0163e-04, 2.7394e-04], device='cuda:0') 100 0.0001 changing lr epoch 32, time 214.66, cls_loss 0.0201 cls_loss_mapping 0.0359 cls_loss_causal 0.7668 re_mapping 0.0206 re_causal 0.0623 /// teacc 98.57 lr 0.00010000 Epoch 34, weight, value: tensor([[-0.0167, -0.0147, -0.0026, ..., -0.0063, 0.0094, -0.0013], [ 0.0143, 0.0163, -0.0175, ..., 0.0266, 0.0129, -0.1367], [-0.0111, 0.0181, 0.0158, ..., -0.0212, 0.0334, -0.0156], ..., [-0.0052, -0.0114, 0.0276, ..., -0.0459, -0.0524, 0.0159], [-0.0096, 0.0116, -0.0042, ..., -0.0373, 0.0148, -0.0792], [-0.0284, 0.0089, 0.0017, ..., -0.0556, -0.0300, -0.0342]], device='cuda:0'), grad: tensor([[-3.6005e-06, 0.0000e+00, 0.0000e+00, ..., -2.2411e-03, -1.1320e-03, 6.2212e-06], [-5.0478e-06, 0.0000e+00, 0.0000e+00, ..., 1.0198e-04, 8.6963e-05, 8.4579e-05], [ 3.3416e-06, 0.0000e+00, 0.0000e+00, ..., 3.2991e-05, 1.1158e-04, 3.8177e-05], ..., [ 7.3481e-07, 0.0000e+00, 0.0000e+00, ..., 8.7395e-06, 2.5541e-05, 5.6595e-05], [ 1.8170e-06, 0.0000e+00, 0.0000e+00, ..., 1.7011e-04, -9.3162e-05, 5.6684e-05], [ 9.9931e-07, 0.0000e+00, 0.0000e+00, ..., 4.0025e-05, 2.7728e-04, 3.7026e-04]], device='cuda:0') Epoch 34, bias, value: tensor([ 0.0104, -0.0185, 0.0017, 0.0308, -0.0099, 0.0287, 0.0030, 0.0126, 0.0082, -0.0054], device='cuda:0'), grad: tensor([-2.0905e-03, 4.2939e-04, 2.8014e-04, -3.1624e-03, -1.2884e-03, 2.4319e-03, 2.1572e-03, 9.9540e-05, 1.2600e-04, 1.0176e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 33---------------------------------------------------- epoch 33, time 230.82, cls_loss 0.0192 cls_loss_mapping 0.0362 cls_loss_causal 0.8033 re_mapping 0.0204 re_causal 0.0595 /// teacc 98.71 lr 0.00010000 Epoch 35, weight, value: tensor([[-0.0158, -0.0147, -0.0026, ..., -0.0060, 0.0091, -0.0002], [ 0.0154, 0.0163, -0.0175, ..., 0.0270, 0.0123, -0.1384], [-0.0127, 0.0181, 0.0158, ..., -0.0218, 0.0338, -0.0175], ..., [-0.0048, -0.0114, 0.0276, ..., -0.0461, -0.0527, 0.0156], [-0.0102, 0.0116, -0.0042, ..., -0.0381, 0.0151, -0.0793], [-0.0302, 0.0089, 0.0017, ..., -0.0563, -0.0303, -0.0345]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.0545e-04, -6.0654e-04, 3.0082e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.4052e-07, 4.8041e-05, 4.9949e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0204e-04, 2.8276e-04, 1.3351e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.0756e-05, 5.5403e-05, 4.4972e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.2331e-03, -1.8978e-03, 2.1741e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9683e-04, 3.9816e-04, 1.3196e-04]], device='cuda:0') Epoch 35, bias, value: tensor([ 0.0107, -0.0188, 0.0016, 0.0309, -0.0096, 0.0286, 0.0028, 0.0129, 0.0082, -0.0056], device='cuda:0'), grad: tensor([-3.2444e-03, -5.7364e-04, 1.1730e-03, 1.9655e-05, -5.4479e-05, 1.7986e-03, 6.2084e-04, 3.9244e-04, -2.1267e-03, 1.9913e-03], device='cuda:0') 100 0.0001 changing lr epoch 34, time 214.78, cls_loss 0.0223 cls_loss_mapping 0.0392 cls_loss_causal 0.7438 re_mapping 0.0190 re_causal 0.0526 /// teacc 98.68 lr 0.00010000 Epoch 36, weight, value: tensor([[-0.0158, -0.0147, -0.0026, ..., -0.0055, 0.0088, -0.0007], [ 0.0156, 0.0163, -0.0175, ..., 0.0268, 0.0116, -0.1388], [-0.0127, 0.0181, 0.0158, ..., -0.0224, 0.0344, -0.0184], ..., [-0.0048, -0.0114, 0.0276, ..., -0.0463, -0.0536, 0.0157], [-0.0100, 0.0116, -0.0042, ..., -0.0377, 0.0162, -0.0799], [-0.0304, 0.0089, 0.0017, ..., -0.0568, -0.0309, -0.0358]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -9.9838e-05, 9.5129e-05, 4.2561e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -7.2978e-06, -4.2076e-03, 3.0268e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.3826e-05, 3.2520e-03, 2.3097e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.2436e-06, 2.6584e-04, 1.0170e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.2395e-05, -1.9991e-04, 1.5572e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4633e-05, -3.3498e-05, 2.7269e-05]], device='cuda:0') Epoch 36, bias, value: tensor([ 0.0107, -0.0190, 0.0016, 0.0309, -0.0097, 0.0280, 0.0025, 0.0133, 0.0090, -0.0059], device='cuda:0'), grad: tensor([ 0.0001, -0.0078, 0.0061, 0.0006, 0.0003, 0.0004, 0.0003, 0.0006, 0.0004, -0.0011], device='cuda:0') 100 0.0001 changing lr epoch 35, time 215.24, cls_loss 0.0198 cls_loss_mapping 0.0357 cls_loss_causal 0.7668 re_mapping 0.0182 re_causal 0.0573 /// teacc 98.65 lr 0.00010000 Epoch 37, weight, value: tensor([[-0.0166, -0.0147, -0.0026, ..., -0.0055, 0.0084, -0.0009], [ 0.0163, 0.0163, -0.0175, ..., 0.0269, 0.0115, -0.1397], [-0.0139, 0.0181, 0.0158, ..., -0.0232, 0.0343, -0.0183], ..., [-0.0043, -0.0114, 0.0276, ..., -0.0464, -0.0544, 0.0156], [-0.0094, 0.0116, -0.0042, ..., -0.0381, 0.0168, -0.0805], [-0.0311, 0.0089, 0.0017, ..., -0.0568, -0.0314, -0.0365]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., -3.9029e-04, -1.7226e-04, 3.8184e-06], [ 9.2201e-08, 0.0000e+00, 0.0000e+00, ..., 1.7321e-04, 1.5295e-04, 2.3693e-05], [ 2.6077e-08, 0.0000e+00, 0.0000e+00, ..., 7.5459e-05, -6.8069e-05, 1.4327e-05], ..., [-3.0641e-07, 0.0000e+00, 0.0000e+00, ..., 1.2420e-05, 3.9667e-05, 1.8477e-05], [ 3.6322e-08, 0.0000e+00, 0.0000e+00, ..., 7.3957e-04, 6.2847e-04, 9.2834e-06], [ 6.2399e-08, 0.0000e+00, 0.0000e+00, ..., 2.3305e-04, 1.0568e-04, 1.0335e-04]], device='cuda:0') Epoch 37, bias, value: tensor([ 0.0100, -0.0186, 0.0015, 0.0306, -0.0092, 0.0283, 0.0027, 0.0132, 0.0089, -0.0059], device='cuda:0'), grad: tensor([-6.4993e-04, 3.8958e-04, 6.8247e-05, -3.3903e-04, -1.4031e-04, 7.3957e-04, -1.7347e-03, -1.3566e-04, 1.2007e-03, 6.0225e-04], device='cuda:0') 100 0.0001 changing lr epoch 36, time 214.82, cls_loss 0.0172 cls_loss_mapping 0.0316 cls_loss_causal 0.7487 re_mapping 0.0187 re_causal 0.0568 /// teacc 98.54 lr 0.00010000 Epoch 38, weight, value: tensor([[-0.0167, -0.0147, -0.0026, ..., -0.0049, 0.0084, -0.0024], [ 0.0164, 0.0163, -0.0175, ..., 0.0269, 0.0108, -0.1410], [-0.0144, 0.0181, 0.0158, ..., -0.0237, 0.0350, -0.0187], ..., [-0.0040, -0.0114, 0.0276, ..., -0.0469, -0.0548, 0.0156], [-0.0094, 0.0116, -0.0042, ..., -0.0386, 0.0170, -0.0813], [-0.0319, 0.0089, 0.0017, ..., -0.0573, -0.0322, -0.0376]], device='cuda:0'), grad: tensor([[-1.1712e-05, 0.0000e+00, 0.0000e+00, ..., -2.3823e-03, 8.2672e-05, 5.8636e-06], [ 3.1386e-06, 0.0000e+00, 0.0000e+00, ..., 6.8092e-04, 1.0138e-03, 2.0936e-05], [-7.7295e-04, 0.0000e+00, 0.0000e+00, ..., 3.3259e-04, -2.0103e-03, 5.9843e-05], ..., [ 7.5436e-04, 0.0000e+00, 0.0000e+00, ..., 1.4968e-05, 2.4815e-03, 6.3255e-06], [ 6.8210e-06, 0.0000e+00, 0.0000e+00, ..., -2.1315e-04, -4.0779e-03, 5.7407e-06], [ 1.2498e-06, 0.0000e+00, 0.0000e+00, ..., 1.1301e-04, 8.6880e-04, 1.6615e-05]], device='cuda:0') Epoch 38, bias, value: tensor([ 0.0103, -0.0186, 0.0015, 0.0309, -0.0093, 0.0282, 0.0028, 0.0134, 0.0085, -0.0061], device='cuda:0'), grad: tensor([-0.0022, 0.0022, -0.0013, 0.0014, 0.0021, 0.0006, 0.0012, 0.0026, -0.0078, 0.0012], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 37---------------------------------------------------- epoch 37, time 230.79, cls_loss 0.0154 cls_loss_mapping 0.0268 cls_loss_causal 0.7396 re_mapping 0.0177 re_causal 0.0550 /// teacc 98.78 lr 0.00010000 Epoch 39, weight, value: tensor([[-0.0170, -0.0147, -0.0026, ..., -0.0045, 0.0081, -0.0027], [ 0.0162, 0.0163, -0.0175, ..., 0.0272, 0.0100, -0.1436], [-0.0143, 0.0181, 0.0158, ..., -0.0241, 0.0359, -0.0191], ..., [-0.0035, -0.0114, 0.0276, ..., -0.0471, -0.0555, 0.0156], [-0.0095, 0.0116, -0.0042, ..., -0.0386, 0.0174, -0.0834], [-0.0334, 0.0089, 0.0017, ..., -0.0577, -0.0327, -0.0377]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.3256e-04, 2.7597e-05, 2.4065e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.4107e-06, 8.5905e-06, 8.1003e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.1336e-05, 1.5244e-05, 8.6784e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3314e-05, 7.5102e-06, 1.1647e-04], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.0171e-05, 2.7090e-05, 1.4752e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.9439e-05, 3.4899e-05, 2.1386e-04]], device='cuda:0') Epoch 39, bias, value: tensor([ 0.0104, -0.0189, 0.0019, 0.0307, -0.0090, 0.0287, 0.0024, 0.0135, 0.0084, -0.0065], device='cuda:0'), grad: tensor([-3.9983e-04, 8.6486e-05, 3.2544e-04, -6.9714e-04, -9.5987e-04, 4.1103e-04, -1.0848e-04, 3.8695e-04, 2.3746e-04, 7.1859e-04], device='cuda:0') 100 0.0001 changing lr epoch 38, time 214.28, cls_loss 0.0195 cls_loss_mapping 0.0328 cls_loss_causal 0.7374 re_mapping 0.0171 re_causal 0.0493 /// teacc 98.66 lr 0.00010000 Epoch 40, weight, value: tensor([[-0.0171, -0.0147, -0.0026, ..., -0.0042, 0.0076, -0.0035], [ 0.0160, 0.0163, -0.0175, ..., 0.0273, 0.0095, -0.1455], [-0.0148, 0.0181, 0.0158, ..., -0.0246, 0.0368, -0.0196], ..., [-0.0030, -0.0114, 0.0276, ..., -0.0472, -0.0563, 0.0144], [-0.0096, 0.0116, -0.0042, ..., -0.0389, 0.0175, -0.0850], [-0.0340, 0.0089, 0.0017, ..., -0.0580, -0.0334, -0.0380]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -3.6359e-05, 2.7820e-05, 8.7544e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.9744e-05, 2.0400e-05, 5.8953e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.4393e-05, -4.5449e-05, 1.0040e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.4964e-06, 1.7658e-05, 6.7428e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.4481e-05, 4.8786e-05, 6.7055e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.2457e-05, -9.4235e-05, 4.7032e-07]], device='cuda:0') Epoch 40, bias, value: tensor([ 0.0103, -0.0187, 0.0022, 0.0308, -0.0090, 0.0288, 0.0023, 0.0134, 0.0080, -0.0065], device='cuda:0'), grad: tensor([ 1.2755e-04, 8.7404e-04, 8.3160e-04, 4.3130e-04, 2.4009e-04, 9.4295e-05, 2.3320e-05, 2.6108e-02, 6.1560e-04, -2.9343e-02], device='cuda:0') 100 0.0001 changing lr epoch 39, time 214.56, cls_loss 0.0145 cls_loss_mapping 0.0238 cls_loss_causal 0.7078 re_mapping 0.0176 re_causal 0.0496 /// teacc 98.60 lr 0.00010000 Epoch 41, weight, value: tensor([[-0.0169, -0.0147, -0.0026, ..., -0.0040, 0.0074, -0.0039], [ 0.0172, 0.0163, -0.0175, ..., 0.0274, 0.0091, -0.1470], [-0.0153, 0.0181, 0.0158, ..., -0.0249, 0.0372, -0.0200], ..., [-0.0033, -0.0114, 0.0276, ..., -0.0472, -0.0566, 0.0144], [-0.0098, 0.0116, -0.0042, ..., -0.0398, 0.0178, -0.0853], [-0.0356, 0.0089, 0.0017, ..., -0.0584, -0.0340, -0.0387]], device='cuda:0'), grad: tensor([[ 6.5379e-07, 0.0000e+00, 0.0000e+00, ..., -4.7274e-06, 1.3791e-05, 9.7137e-07], [-4.0412e-05, 0.0000e+00, 0.0000e+00, ..., -4.4182e-06, 5.6416e-05, 5.7928e-07], [ 6.1579e-06, 0.0000e+00, 0.0000e+00, ..., 3.5185e-06, -9.0718e-05, 4.6268e-06], ..., [ 1.1519e-05, 0.0000e+00, 0.0000e+00, ..., 1.2526e-06, 4.0650e-05, 8.0019e-06], [ 4.6566e-06, 0.0000e+00, 0.0000e+00, ..., 4.1351e-06, -3.5405e-05, 3.1684e-06], [ 4.3735e-06, 0.0000e+00, 0.0000e+00, ..., 2.5071e-06, 1.6376e-05, 1.9923e-05]], device='cuda:0') Epoch 41, bias, value: tensor([ 0.0100, -0.0189, 0.0017, 0.0309, -0.0089, 0.0286, 0.0029, 0.0138, 0.0078, -0.0064], device='cuda:0'), grad: tensor([ 3.9726e-05, 5.6171e-04, 5.0020e-04, -1.2960e-03, 7.1287e-04, 1.1581e-04, 2.7373e-05, 7.7486e-04, 1.2070e-04, -1.5574e-03], device='cuda:0') 100 0.0001 changing lr epoch 40, time 214.33, cls_loss 0.0160 cls_loss_mapping 0.0296 cls_loss_causal 0.7057 re_mapping 0.0170 re_causal 0.0513 /// teacc 98.52 lr 0.00010000 Epoch 42, weight, value: tensor([[-0.0168, -0.0147, -0.0026, ..., -0.0032, 0.0077, -0.0044], [ 0.0173, 0.0163, -0.0175, ..., 0.0273, 0.0089, -0.1488], [-0.0154, 0.0181, 0.0158, ..., -0.0253, 0.0376, -0.0208], ..., [-0.0028, -0.0114, 0.0276, ..., -0.0476, -0.0575, 0.0153], [-0.0099, 0.0116, -0.0042, ..., -0.0401, 0.0182, -0.0871], [-0.0359, 0.0089, 0.0017, ..., -0.0586, -0.0345, -0.0390]], device='cuda:0'), grad: tensor([[ 6.5658e-08, 0.0000e+00, 0.0000e+00, ..., -4.4078e-05, 3.3379e-05, 3.2280e-06], [ 4.6939e-07, 0.0000e+00, 0.0000e+00, ..., -9.2760e-06, 3.5558e-06, 1.3612e-05], [ 8.8383e-07, 0.0000e+00, 0.0000e+00, ..., 1.0952e-05, -1.7738e-04, 1.2934e-05], ..., [ 9.4809e-07, 0.0000e+00, 0.0000e+00, ..., 8.1286e-06, 9.3102e-05, 9.5487e-05], [ 7.5903e-08, 0.0000e+00, 0.0000e+00, ..., 5.8979e-05, 1.2207e-04, 9.3505e-06], [ 4.3772e-07, 0.0000e+00, 0.0000e+00, ..., 3.4690e-05, 2.1845e-05, 8.1658e-06]], device='cuda:0') Epoch 42, bias, value: tensor([ 0.0104, -0.0187, 0.0013, 0.0312, -0.0095, 0.0284, 0.0027, 0.0142, 0.0078, -0.0064], device='cuda:0'), grad: tensor([-7.9870e-06, -3.6287e-04, 8.4713e-06, 1.6677e-04, -1.6797e-04, -7.2908e-04, 2.6631e-04, 7.0763e-04, 3.0470e-04, -1.8537e-04], device='cuda:0') 100 0.0001 changing lr epoch 41, time 214.66, cls_loss 0.0161 cls_loss_mapping 0.0258 cls_loss_causal 0.7043 re_mapping 0.0172 re_causal 0.0500 /// teacc 98.62 lr 0.00010000 Epoch 43, weight, value: tensor([[-0.0167, -0.0147, -0.0026, ..., -0.0029, 0.0074, -0.0048], [ 0.0180, 0.0163, -0.0175, ..., 0.0273, 0.0086, -0.1496], [-0.0154, 0.0181, 0.0158, ..., -0.0257, 0.0376, -0.0215], ..., [-0.0018, -0.0114, 0.0276, ..., -0.0478, -0.0579, 0.0150], [-0.0103, 0.0116, -0.0042, ..., -0.0404, 0.0190, -0.0890], [-0.0373, 0.0089, 0.0017, ..., -0.0592, -0.0355, -0.0408]], device='cuda:0'), grad: tensor([[ 1.9232e-07, 0.0000e+00, 0.0000e+00, ..., 5.3525e-05, 1.5569e-04, 2.1365e-06], [ 4.2329e-07, 0.0000e+00, 0.0000e+00, ..., 3.6713e-06, 2.2128e-05, 9.3803e-06], [ 4.2003e-07, 0.0000e+00, 0.0000e+00, ..., 2.6487e-06, -4.1890e-04, 7.9945e-06], ..., [ 1.0058e-07, 0.0000e+00, 0.0000e+00, ..., 5.2527e-06, 5.7787e-05, 1.9088e-05], [ 2.6496e-07, 0.0000e+00, 0.0000e+00, ..., 5.0396e-05, 2.0492e-04, 3.5558e-06], [ 1.6112e-06, 0.0000e+00, 0.0000e+00, ..., -2.7633e-04, -2.6107e-04, 2.4259e-05]], device='cuda:0') Epoch 43, bias, value: tensor([ 0.0104, -0.0184, 0.0009, 0.0310, -0.0094, 0.0288, 0.0026, 0.0141, 0.0079, -0.0064], device='cuda:0'), grad: tensor([ 0.0003, 0.0001, -0.0004, -0.0011, 0.0004, 0.0008, 0.0001, -0.0003, 0.0006, -0.0005], device='cuda:0') 100 0.0001 changing lr epoch 42, time 214.53, cls_loss 0.0172 cls_loss_mapping 0.0278 cls_loss_causal 0.7190 re_mapping 0.0171 re_causal 0.0510 /// teacc 98.50 lr 0.00010000 Epoch 44, weight, value: tensor([[-0.0169, -0.0147, -0.0026, ..., -0.0025, 0.0071, -0.0056], [ 0.0186, 0.0163, -0.0175, ..., 0.0272, 0.0079, -0.1500], [-0.0146, 0.0181, 0.0158, ..., -0.0260, 0.0384, -0.0227], ..., [-0.0012, -0.0114, 0.0276, ..., -0.0480, -0.0581, 0.0169], [-0.0106, 0.0116, -0.0042, ..., -0.0411, 0.0186, -0.0899], [-0.0382, 0.0089, 0.0017, ..., -0.0596, -0.0358, -0.0421]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3030e-04, 4.3535e-04, 1.7788e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.3006e-06, 8.1778e-05, 1.2405e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.9754e-06, -1.2159e-03, 7.1013e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4650e-06, 1.3857e-03, 3.3025e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8907e-04, -1.1396e-03, 4.1397e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2338e-05, 2.2161e-04, 9.1195e-06]], device='cuda:0') Epoch 44, bias, value: tensor([ 0.0104, -0.0178, 0.0017, 0.0303, -0.0095, 0.0295, 0.0027, 0.0142, 0.0072, -0.0068], device='cuda:0'), grad: tensor([ 5.4455e-04, -1.4648e-05, -2.8763e-03, 6.4313e-05, 1.1981e-04, 1.5032e-04, 7.7903e-05, 3.2520e-03, -1.6375e-03, 3.1662e-04], device='cuda:0') 100 0.0001 changing lr epoch 43, time 214.61, cls_loss 0.0167 cls_loss_mapping 0.0320 cls_loss_causal 0.6996 re_mapping 0.0165 re_causal 0.0486 /// teacc 98.67 lr 0.00010000 Epoch 45, weight, value: tensor([[-0.0158, -0.0147, -0.0026, ..., -0.0026, 0.0060, -0.0045], [ 0.0186, 0.0163, -0.0175, ..., 0.0270, 0.0073, -0.1509], [-0.0130, 0.0181, 0.0158, ..., -0.0264, 0.0388, -0.0238], ..., [-0.0019, -0.0114, 0.0276, ..., -0.0483, -0.0588, 0.0159], [-0.0109, 0.0116, -0.0042, ..., -0.0417, 0.0186, -0.0910], [-0.0396, 0.0089, 0.0017, ..., -0.0598, -0.0360, -0.0420]], device='cuda:0'), grad: tensor([[ 2.0172e-06, 0.0000e+00, 0.0000e+00, ..., -1.0735e-02, -4.7989e-03, 2.8079e-07], [ 8.1286e-06, 0.0000e+00, 0.0000e+00, ..., 5.1260e-06, 2.1160e-05, 1.2433e-07], [ 5.8934e-06, 0.0000e+00, 0.0000e+00, ..., 7.5661e-06, -2.5344e-04, 1.7975e-07], ..., [-6.2108e-05, 0.0000e+00, 0.0000e+00, ..., 1.3299e-06, 6.2346e-05, 7.8231e-08], [ 1.1459e-05, 0.0000e+00, 0.0000e+00, ..., 2.5705e-05, 1.4377e-04, 5.2620e-08], [ 2.1905e-05, 0.0000e+00, 0.0000e+00, ..., 1.4208e-05, 8.7693e-06, 3.3341e-07]], device='cuda:0') Epoch 45, bias, value: tensor([ 0.0100, -0.0181, 0.0014, 0.0313, -0.0093, 0.0293, 0.0033, 0.0137, 0.0067, -0.0065], device='cuda:0'), grad: tensor([-9.7809e-03, 1.0949e-04, -2.4986e-04, 1.3940e-05, 1.0097e-04, 1.4269e-04, 9.6588e-03, -5.0926e-04, 3.0828e-04, 2.0289e-04], device='cuda:0') 100 0.0001 changing lr epoch 44, time 214.71, cls_loss 0.0136 cls_loss_mapping 0.0248 cls_loss_causal 0.7131 re_mapping 0.0156 re_causal 0.0492 /// teacc 98.69 lr 0.00010000 Epoch 46, weight, value: tensor([[-0.0150, -0.0147, -0.0026, ..., -0.0022, 0.0056, -0.0047], [ 0.0183, 0.0163, -0.0175, ..., 0.0275, 0.0068, -0.1516], [-0.0132, 0.0181, 0.0158, ..., -0.0269, 0.0395, -0.0240], ..., [-0.0013, -0.0114, 0.0276, ..., -0.0486, -0.0591, 0.0160], [-0.0112, 0.0116, -0.0042, ..., -0.0423, 0.0188, -0.0914], [-0.0408, 0.0089, 0.0017, ..., -0.0598, -0.0362, -0.0430]], device='cuda:0'), grad: tensor([[-5.0990e-07, 0.0000e+00, 0.0000e+00, ..., -1.6734e-05, 8.8615e-07, -1.6298e-03], [ 6.1933e-08, 0.0000e+00, 0.0000e+00, ..., -1.9163e-05, 6.5006e-06, 1.0714e-05], [ 5.0897e-07, 0.0000e+00, 0.0000e+00, ..., 7.8142e-05, 1.4532e-04, 2.0027e-04], ..., [-5.2620e-07, 0.0000e+00, 0.0000e+00, ..., 3.5409e-06, 5.5209e-06, 1.8001e-05], [ 3.5390e-08, 0.0000e+00, 0.0000e+00, ..., -6.3300e-05, -3.3450e-04, 3.1620e-05], [ 5.9605e-08, 0.0000e+00, 0.0000e+00, ..., 5.7109e-06, 1.2323e-05, 2.1732e-04]], device='cuda:0') Epoch 46, bias, value: tensor([ 0.0102, -0.0180, 0.0014, 0.0308, -0.0090, 0.0299, 0.0024, 0.0138, 0.0068, -0.0067], device='cuda:0'), grad: tensor([-4.2915e-03, -7.2598e-05, 6.8855e-04, 3.2234e-04, 2.2144e-03, 2.6846e-04, 2.8944e-04, 2.1011e-05, -3.4600e-05, 5.9652e-04], device='cuda:0') 100 0.0001 changing lr epoch 45, time 214.77, cls_loss 0.0131 cls_loss_mapping 0.0234 cls_loss_causal 0.7104 re_mapping 0.0164 re_causal 0.0479 /// teacc 98.74 lr 0.00010000 Epoch 47, weight, value: tensor([[-0.0150, -0.0147, -0.0026, ..., -0.0022, 0.0055, -0.0045], [ 0.0179, 0.0163, -0.0175, ..., 0.0275, 0.0063, -0.1531], [-0.0129, 0.0181, 0.0158, ..., -0.0273, 0.0400, -0.0246], ..., [-0.0008, -0.0114, 0.0276, ..., -0.0488, -0.0597, 0.0170], [-0.0112, 0.0116, -0.0042, ..., -0.0426, 0.0191, -0.0926], [-0.0412, 0.0089, 0.0017, ..., -0.0597, -0.0368, -0.0433]], device='cuda:0'), grad: tensor([[ 6.8452e-08, 0.0000e+00, 0.0000e+00, ..., -1.2898e-04, -2.5809e-05, 2.0452e-06], [ 1.7509e-07, 0.0000e+00, 0.0000e+00, ..., 4.0159e-06, 3.1638e-04, 3.8370e-06], [ 8.9407e-08, 0.0000e+00, 0.0000e+00, ..., 1.0923e-05, -1.4400e-03, -3.9563e-06], ..., [-1.8608e-06, 0.0000e+00, 0.0000e+00, ..., 3.2745e-06, 3.6788e-04, 7.5763e-07], [ 2.7008e-07, 0.0000e+00, 0.0000e+00, ..., 1.1995e-05, 3.6657e-05, 2.1998e-06], [ 9.0711e-07, 0.0000e+00, 0.0000e+00, ..., 5.2154e-05, 2.7955e-05, 2.0230e-04]], device='cuda:0') Epoch 47, bias, value: tensor([ 0.0101, -0.0178, 0.0010, 0.0307, -0.0093, 0.0297, 0.0025, 0.0144, 0.0067, -0.0065], device='cuda:0'), grad: tensor([-0.0004, 0.0006, -0.0026, 0.0009, -0.0005, 0.0003, 0.0002, 0.0005, 0.0001, 0.0008], device='cuda:0') 100 0.0001 changing lr epoch 46, time 214.44, cls_loss 0.0165 cls_loss_mapping 0.0267 cls_loss_causal 0.6946 re_mapping 0.0150 re_causal 0.0441 /// teacc 98.55 lr 0.00010000 Epoch 48, weight, value: tensor([[-0.0160, -0.0147, -0.0026, ..., -0.0018, 0.0056, -0.0048], [ 0.0192, 0.0163, -0.0175, ..., 0.0275, 0.0063, -0.1545], [-0.0131, 0.0181, 0.0158, ..., -0.0276, 0.0402, -0.0250], ..., [-0.0009, -0.0114, 0.0276, ..., -0.0489, -0.0603, 0.0170], [-0.0106, 0.0116, -0.0042, ..., -0.0435, 0.0190, -0.0926], [-0.0423, 0.0089, 0.0017, ..., -0.0600, -0.0378, -0.0440]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., -2.0313e-04, 1.1355e-04, 8.9547e-07], [ 4.5169e-08, 0.0000e+00, 0.0000e+00, ..., 6.1750e-05, 8.4639e-05, 6.2212e-06], [ 3.3993e-08, 0.0000e+00, 0.0000e+00, ..., 3.5465e-05, 1.3006e-04, 3.4347e-06], ..., [ 1.6810e-07, 0.0000e+00, 0.0000e+00, ..., 1.2606e-05, 9.4116e-05, 4.4554e-05], [ 4.6566e-09, 0.0000e+00, 0.0000e+00, ..., 5.5313e-05, -9.3222e-04, 1.8906e-06], [ 2.0023e-08, 0.0000e+00, 0.0000e+00, ..., 4.2059e-06, 2.7871e-04, 2.2173e-05]], device='cuda:0') Epoch 48, bias, value: tensor([ 0.0102, -0.0175, 0.0007, 0.0311, -0.0085, 0.0291, 0.0029, 0.0146, 0.0064, -0.0071], device='cuda:0'), grad: tensor([-0.0001, 0.0003, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0012, -0.0015, -0.0007], device='cuda:0') 100 0.0001 changing lr epoch 47, time 214.85, cls_loss 0.0145 cls_loss_mapping 0.0280 cls_loss_causal 0.6932 re_mapping 0.0152 re_causal 0.0441 /// teacc 98.76 lr 0.00010000 Epoch 49, weight, value: tensor([[-0.0161, -0.0146, -0.0026, ..., -0.0013, 0.0048, -0.0051], [ 0.0199, 0.0163, -0.0175, ..., 0.0278, 0.0058, -0.1559], [-0.0125, 0.0181, 0.0158, ..., -0.0280, 0.0406, -0.0254], ..., [-0.0011, -0.0114, 0.0276, ..., -0.0491, -0.0610, 0.0164], [-0.0109, 0.0116, -0.0042, ..., -0.0439, 0.0196, -0.0932], [-0.0431, 0.0088, 0.0017, ..., -0.0602, -0.0380, -0.0441]], device='cuda:0'), grad: tensor([[-1.1269e-07, 0.0000e+00, 0.0000e+00, ..., 1.0407e-04, 1.4997e-04, 1.7639e-06], [ 7.3835e-06, 0.0000e+00, 0.0000e+00, ..., 6.7353e-05, 1.4365e-04, 2.4185e-05], [-9.7156e-06, 0.0000e+00, 0.0000e+00, ..., 1.8513e-04, 5.1886e-05, 1.4409e-05], ..., [ 2.0750e-06, 0.0000e+00, 0.0000e+00, ..., 7.1973e-06, 5.1677e-05, 7.2829e-06], [ 1.0906e-06, 0.0000e+00, 0.0000e+00, ..., 1.4961e-04, 2.1768e-04, 2.6021e-06], [ 4.2934e-07, 0.0000e+00, 0.0000e+00, ..., 2.3052e-05, 3.3110e-05, 1.4544e-05]], device='cuda:0') Epoch 49, bias, value: tensor([ 0.0102, -0.0180, 0.0007, 0.0314, -0.0084, 0.0288, 0.0028, 0.0144, 0.0065, -0.0068], device='cuda:0'), grad: tensor([ 0.0002, 0.0002, 0.0001, -0.0002, 0.0001, -0.0004, -0.0009, 0.0001, 0.0005, 0.0003], device='cuda:0') 100 0.0001 changing lr epoch 48, time 214.93, cls_loss 0.0119 cls_loss_mapping 0.0219 cls_loss_causal 0.6532 re_mapping 0.0153 re_causal 0.0476 /// teacc 98.75 lr 0.00010000 Epoch 50, weight, value: tensor([[-0.0153, 0.0003, -0.0026, ..., -0.0012, 0.0041, -0.0054], [ 0.0198, 0.0156, -0.0175, ..., 0.0275, 0.0051, -0.1569], [-0.0124, 0.0180, 0.0159, ..., -0.0281, 0.0409, -0.0252], ..., [-0.0004, -0.0114, 0.0275, ..., -0.0492, -0.0618, 0.0161], [-0.0112, 0.0104, -0.0042, ..., -0.0442, 0.0202, -0.0933], [-0.0434, -0.0008, 0.0017, ..., -0.0604, -0.0380, -0.0451]], device='cuda:0'), grad: tensor([[ 1.2107e-07, -9.9465e-07, 0.0000e+00, ..., -1.7826e-06, 1.6704e-05, 2.1514e-07], [ 1.0710e-07, 1.2107e-08, 0.0000e+00, ..., 2.1979e-06, 1.4864e-05, 6.8359e-07], [-1.6317e-06, 8.7544e-08, 0.0000e+00, ..., 2.7381e-06, -4.6313e-05, 4.7963e-07], ..., [-1.3690e-07, 2.4214e-08, 0.0000e+00, ..., 3.6880e-07, 1.4201e-05, -4.0606e-07], [ 3.6974e-07, 3.0734e-08, 0.0000e+00, ..., 3.3583e-06, -5.2571e-05, 4.9360e-07], [ 1.5926e-07, 4.9733e-07, 0.0000e+00, ..., 5.0440e-06, 1.2361e-05, 6.3360e-05]], device='cuda:0') Epoch 50, bias, value: tensor([ 0.0099, -0.0184, 0.0005, 0.0319, -0.0080, 0.0285, 0.0027, 0.0144, 0.0069, -0.0068], device='cuda:0'), grad: tensor([ 3.8505e-05, 2.0579e-05, 4.5508e-05, -1.3423e-04, -3.8266e-05, 4.3720e-05, 3.2298e-06, -6.6638e-05, -4.4465e-05, 1.3185e-04], device='cuda:0') 100 0.0001 changing lr epoch 49, time 214.82, cls_loss 0.0128 cls_loss_mapping 0.0227 cls_loss_causal 0.7097 re_mapping 0.0144 re_causal 0.0449 /// teacc 98.74 lr 0.00010000 Epoch 51, weight, value: tensor([[-0.0156, -0.0019, -0.0026, ..., -0.0010, 0.0037, -0.0058], [ 0.0203, 0.0153, -0.0175, ..., 0.0268, 0.0044, -0.1563], [-0.0100, 0.0179, 0.0159, ..., -0.0283, 0.0413, -0.0259], ..., [-0.0019, -0.0115, 0.0275, ..., -0.0494, -0.0622, 0.0160], [-0.0120, 0.0101, -0.0042, ..., -0.0442, 0.0206, -0.0951], [-0.0459, 0.0004, 0.0017, ..., -0.0603, -0.0380, -0.0463]], device='cuda:0'), grad: tensor([[ 3.3528e-08, 0.0000e+00, 0.0000e+00, ..., -1.3292e-04, -2.4110e-05, 8.1863e-07], [-6.6124e-07, 0.0000e+00, 0.0000e+00, ..., 6.1616e-06, 5.4240e-06, 6.2644e-05], [ 2.4214e-07, 0.0000e+00, 0.0000e+00, ..., 1.9148e-05, 8.7768e-06, 2.4587e-06], ..., [ 1.5739e-07, 0.0000e+00, 0.0000e+00, ..., 2.8517e-06, 2.8722e-06, 9.6112e-06], [ 2.2445e-07, 0.0000e+00, 0.0000e+00, ..., 3.2157e-05, 2.7880e-05, 1.1072e-05], [ 1.3225e-07, 0.0000e+00, 0.0000e+00, ..., 9.8050e-05, 3.1382e-05, 5.8985e-04]], device='cuda:0') Epoch 51, bias, value: tensor([ 0.0092, -0.0179, 0.0008, 0.0319, -0.0078, 0.0283, 0.0027, 0.0139, 0.0067, -0.0065], device='cuda:0'), grad: tensor([-2.0766e-04, 1.3101e-04, 3.7849e-05, 7.9274e-06, -1.4982e-03, 3.0294e-05, -1.1522e-04, 1.8075e-05, 9.1374e-05, 1.5039e-03], device='cuda:0') 100 0.0001 changing lr epoch 50, time 214.73, cls_loss 0.0102 cls_loss_mapping 0.0173 cls_loss_causal 0.6816 re_mapping 0.0142 re_causal 0.0440 /// teacc 98.65 lr 0.00010000 Epoch 52, weight, value: tensor([[-1.5604e-02, -1.7682e-03, -2.6036e-03, ..., -1.3666e-04, 3.8299e-03, -5.8003e-03], [ 2.0632e-02, 1.5280e-02, -1.7541e-02, ..., 2.6685e-02, 3.7885e-03, -1.5824e-01], [-8.6888e-03, 1.7895e-02, 1.5938e-02, ..., -2.8979e-02, 4.1930e-02, -2.5700e-02], ..., [-2.5019e-03, -1.1479e-02, 2.7466e-02, ..., -4.9577e-02, -6.2743e-02, 1.6270e-02], [-1.2240e-02, 1.0001e-02, -4.2081e-03, ..., -4.4389e-02, 2.0932e-02, -9.4686e-02], [-4.6414e-02, 2.6877e-04, 1.7246e-03, ..., -6.0675e-02, -3.8472e-02, -4.6642e-02]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., -3.9840e-04, -7.3731e-05, -1.6198e-05], [ 2.9802e-08, 0.0000e+00, 0.0000e+00, ..., 1.1744e-06, 2.8670e-05, 3.1423e-06], [ 5.8673e-08, 0.0000e+00, 0.0000e+00, ..., 4.8429e-05, 5.2422e-05, 1.9014e-05], ..., [ 1.0338e-07, 0.0000e+00, 0.0000e+00, ..., 3.4682e-06, 1.5235e-04, 5.2191e-06], [ 1.5832e-08, 0.0000e+00, 0.0000e+00, ..., 5.6684e-05, -2.9206e-04, 1.1370e-05], [ 9.3132e-09, 0.0000e+00, 0.0000e+00, ..., 2.1982e-04, 8.8811e-05, 5.1670e-06]], device='cuda:0') Epoch 52, bias, value: tensor([ 0.0099, -0.0178, 0.0011, 0.0315, -0.0086, 0.0286, 0.0024, 0.0140, 0.0066, -0.0064], device='cuda:0'), grad: tensor([-5.2166e-04, 1.7822e-04, 8.4496e-04, -1.1120e-03, -7.9334e-05, 4.4316e-05, 1.1754e-04, 7.2145e-04, -7.5531e-04, 5.6171e-04], device='cuda:0') 100 0.0001 changing lr epoch 51, time 214.91, cls_loss 0.0123 cls_loss_mapping 0.0209 cls_loss_causal 0.6756 re_mapping 0.0142 re_causal 0.0441 /// teacc 98.76 lr 0.00010000 Epoch 53, weight, value: tensor([[-1.5591e-02, -1.8932e-03, -2.6036e-03, ..., 5.4966e-05, 3.7565e-03, -6.2338e-03], [ 2.0548e-02, 1.5252e-02, -1.7541e-02, ..., 2.6717e-02, 3.6637e-03, -1.6061e-01], [-8.2082e-03, 1.7889e-02, 1.5938e-02, ..., -2.9483e-02, 4.3062e-02, -2.6643e-02], ..., [-1.9734e-03, -1.1480e-02, 2.7466e-02, ..., -4.9709e-02, -6.3530e-02, 1.6824e-02], [-1.2314e-02, 8.5633e-03, -4.2081e-03, ..., -4.4756e-02, 2.0514e-02, -9.6491e-02], [-4.6583e-02, 2.6242e-04, 1.7246e-03, ..., -6.0903e-02, -3.8885e-02, -4.8476e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.1601e-04, -3.9577e-04, 9.1922e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.3772e-07, 3.7760e-05, 1.0297e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7688e-05, 2.3961e-04, 9.7811e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3066e-06, 3.7044e-05, 9.2909e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6499e-04, 9.9540e-05, 4.2357e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3648e-05, 1.0341e-04, 3.1255e-06]], device='cuda:0') Epoch 53, bias, value: tensor([ 0.0096, -0.0181, 0.0012, 0.0315, -0.0069, 0.0290, 0.0022, 0.0144, 0.0060, -0.0075], device='cuda:0'), grad: tensor([-5.3453e-04, 1.5545e-04, 4.5943e-04, -7.9203e-04, -3.3855e-04, 5.0455e-05, 6.4731e-05, 7.3051e-04, 1.3804e-04, 6.5982e-05], device='cuda:0') 100 0.0001 changing lr epoch 52, time 214.86, cls_loss 0.0113 cls_loss_mapping 0.0197 cls_loss_causal 0.6856 re_mapping 0.0140 re_causal 0.0426 /// teacc 98.77 lr 0.00010000 Epoch 54, weight, value: tensor([[-0.0157, -0.0019, -0.0026, ..., 0.0002, 0.0032, -0.0066], [ 0.0205, 0.0152, -0.0175, ..., 0.0267, 0.0034, -0.1622], [-0.0084, 0.0179, 0.0159, ..., -0.0300, 0.0435, -0.0274], ..., [-0.0017, -0.0115, 0.0275, ..., -0.0499, -0.0648, 0.0166], [-0.0124, 0.0085, -0.0042, ..., -0.0451, 0.0209, -0.0975], [-0.0470, 0.0003, 0.0017, ..., -0.0611, -0.0393, -0.0489]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -3.7253e-08, 1.1727e-05, 7.6368e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.1548e-06, 2.1681e-05, 2.9616e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2755e-05, 7.5474e-06, -1.5637e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5646e-06, 5.5641e-05, 3.4552e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8522e-05, -7.8440e-04, 2.9057e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.2867e-06, 6.5088e-04, 6.1691e-06]], device='cuda:0') Epoch 54, bias, value: tensor([ 0.0094, -0.0178, 0.0009, 0.0309, -0.0073, 0.0299, 0.0023, 0.0142, 0.0060, -0.0070], device='cuda:0'), grad: tensor([ 1.9699e-05, 9.0778e-05, 1.2529e-04, -9.0933e-04, 3.2640e-04, 1.3685e-03, -3.6860e-04, 3.5810e-04, -4.5166e-03, 3.5076e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 53---------------------------------------------------- epoch 53, time 230.77, cls_loss 0.0094 cls_loss_mapping 0.0181 cls_loss_causal 0.6865 re_mapping 0.0135 re_causal 0.0416 /// teacc 98.81 lr 0.00010000 Epoch 55, weight, value: tensor([[-0.0158, -0.0019, -0.0026, ..., 0.0007, 0.0029, -0.0066], [ 0.0208, 0.0152, -0.0175, ..., 0.0266, 0.0026, -0.1636], [-0.0084, 0.0179, 0.0159, ..., -0.0304, 0.0440, -0.0280], ..., [-0.0016, -0.0115, 0.0275, ..., -0.0499, -0.0654, 0.0165], [-0.0125, 0.0085, -0.0042, ..., -0.0455, 0.0214, -0.0984], [-0.0479, 0.0003, 0.0017, ..., -0.0618, -0.0398, -0.0492]], device='cuda:0'), grad: tensor([[ 4.6566e-09, 0.0000e+00, 0.0000e+00, ..., 3.4459e-07, 3.3408e-05, 5.6773e-06], [-8.1956e-08, 0.0000e+00, 0.0000e+00, ..., -3.0361e-06, 1.0170e-05, 9.5144e-06], [ 1.9558e-08, 0.0000e+00, 0.0000e+00, ..., 1.5333e-05, 2.1592e-05, 1.3411e-05], ..., [ 6.5193e-09, 0.0000e+00, 0.0000e+00, ..., 6.5900e-06, 2.6584e-05, 1.5557e-05], [ 1.6764e-08, 0.0000e+00, 0.0000e+00, ..., 1.6555e-05, -1.3404e-05, 1.3016e-05], [ 4.6566e-09, 0.0000e+00, 0.0000e+00, ..., 2.4259e-05, 3.5286e-05, 3.3557e-05]], device='cuda:0') Epoch 55, bias, value: tensor([ 0.0095, -0.0176, 0.0012, 0.0308, -0.0078, 0.0299, 0.0024, 0.0140, 0.0060, -0.0069], device='cuda:0'), grad: tensor([ 4.7326e-05, -6.0499e-05, 1.6332e-04, -2.3508e-04, -1.1063e-04, -7.4923e-05, 1.0645e-04, 1.0455e-04, 1.0890e-04, -4.9621e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 54---------------------------------------------------- epoch 54, time 230.29, cls_loss 0.0092 cls_loss_mapping 0.0180 cls_loss_causal 0.6663 re_mapping 0.0132 re_causal 0.0411 /// teacc 98.82 lr 0.00010000 Epoch 56, weight, value: tensor([[-0.0156, -0.0019, -0.0026, ..., 0.0014, 0.0025, -0.0068], [ 0.0208, 0.0152, -0.0175, ..., 0.0266, 0.0023, -0.1638], [-0.0087, 0.0179, 0.0159, ..., -0.0307, 0.0444, -0.0280], ..., [-0.0008, -0.0115, 0.0275, ..., -0.0501, -0.0656, 0.0178], [-0.0129, 0.0085, -0.0042, ..., -0.0460, 0.0216, -0.0996], [-0.0483, 0.0003, 0.0017, ..., -0.0627, -0.0402, -0.0497]], device='cuda:0'), grad: tensor([[-1.8962e-06, 0.0000e+00, 0.0000e+00, ..., -2.9609e-05, 4.7505e-05, -2.3283e-08], [ 1.0245e-07, 0.0000e+00, 0.0000e+00, ..., -2.9244e-07, 3.9554e-04, 1.9774e-05], [ 1.6764e-07, 0.0000e+00, 0.0000e+00, ..., 7.9647e-06, -1.3094e-03, 1.2545e-06], ..., [ 8.9407e-08, 0.0000e+00, 0.0000e+00, ..., 3.4217e-06, 3.7503e-04, 6.9141e-05], [ 9.9652e-08, 0.0000e+00, 0.0000e+00, ..., 1.9461e-05, 3.8648e-04, -1.5485e-04], [ 2.7381e-07, 0.0000e+00, 0.0000e+00, ..., 8.6129e-06, 2.4676e-05, 3.1918e-05]], device='cuda:0') Epoch 56, bias, value: tensor([ 0.0100, -0.0179, 0.0010, 0.0308, -0.0083, 0.0299, 0.0024, 0.0148, 0.0059, -0.0071], device='cuda:0'), grad: tensor([ 1.4037e-05, 8.4352e-04, -1.9970e-03, 1.0860e-04, 2.8300e-04, -7.2360e-05, 1.3161e-04, 1.1282e-03, -6.0797e-04, 1.6749e-04], device='cuda:0') 100 0.0001 changing lr epoch 55, time 214.82, cls_loss 0.0106 cls_loss_mapping 0.0208 cls_loss_causal 0.6701 re_mapping 0.0135 re_causal 0.0410 /// teacc 98.77 lr 0.00010000 Epoch 57, weight, value: tensor([[-0.0156, -0.0019, -0.0026, ..., 0.0014, 0.0018, -0.0069], [ 0.0210, 0.0152, -0.0175, ..., 0.0270, 0.0020, -0.1660], [-0.0087, 0.0179, 0.0159, ..., -0.0313, 0.0447, -0.0281], ..., [-0.0008, -0.0115, 0.0275, ..., -0.0504, -0.0657, 0.0175], [-0.0130, 0.0085, -0.0042, ..., -0.0463, 0.0221, -0.1011], [-0.0485, 0.0003, 0.0017, ..., -0.0630, -0.0406, -0.0501]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3202e-05, 3.5375e-05, 1.6326e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.1791e-06, 4.2677e-05, 2.5053e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0975e-05, -1.3900e-04, 1.2945e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.1202e-06, 3.5197e-05, 1.1101e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3344e-05, 2.9832e-05, 2.1718e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.7784e-06, -4.6730e-05, -2.7359e-05]], device='cuda:0') Epoch 57, bias, value: tensor([ 0.0093, -0.0177, 0.0004, 0.0307, -0.0086, 0.0300, 0.0027, 0.0160, 0.0058, -0.0075], device='cuda:0'), grad: tensor([ 5.0627e-06, 4.1276e-05, -9.1553e-05, -3.9077e-04, 7.6354e-05, -5.7268e-04, 8.6451e-04, 1.5783e-04, 5.5790e-04, -6.4850e-04], device='cuda:0') 100 0.0001 changing lr epoch 56, time 214.20, cls_loss 0.0112 cls_loss_mapping 0.0199 cls_loss_causal 0.6509 re_mapping 0.0140 re_causal 0.0403 /// teacc 98.53 lr 0.00010000 Epoch 58, weight, value: tensor([[-0.0155, -0.0019, -0.0026, ..., 0.0020, 0.0010, -0.0058], [ 0.0210, 0.0152, -0.0175, ..., 0.0267, 0.0014, -0.1669], [-0.0088, 0.0179, 0.0159, ..., -0.0317, 0.0453, -0.0283], ..., [-0.0008, -0.0115, 0.0275, ..., -0.0506, -0.0666, 0.0171], [-0.0130, 0.0080, -0.0042, ..., -0.0459, 0.0232, -0.1024], [-0.0487, 0.0003, 0.0017, ..., -0.0635, -0.0409, -0.0508]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.4597e-06, 2.2367e-05, 2.5276e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.0641e-06, 1.0237e-05, 3.9898e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3786e-06, -1.8752e-04, -4.1574e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.5554e-07, 5.6252e-06, -3.5409e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3135e-05, 2.1875e-05, 2.7381e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.8573e-06, 5.5805e-06, 1.4156e-05]], device='cuda:0') Epoch 58, bias, value: tensor([ 0.0096, -0.0177, 0.0004, 0.0304, -0.0081, 0.0298, 0.0022, 0.0162, 0.0062, -0.0079], device='cuda:0'), grad: tensor([ 4.4852e-05, -2.3305e-05, -5.4646e-04, 4.7350e-04, 4.6074e-05, 8.9884e-05, -1.7118e-04, -8.7768e-06, 5.3704e-05, 4.1246e-05], device='cuda:0') 100 0.0001 changing lr epoch 57, time 214.31, cls_loss 0.0078 cls_loss_mapping 0.0157 cls_loss_causal 0.6950 re_mapping 0.0130 re_causal 0.0409 /// teacc 98.75 lr 0.00010000 Epoch 59, weight, value: tensor([[-0.0156, -0.0019, -0.0026, ..., 0.0024, 0.0010, -0.0063], [ 0.0210, 0.0152, -0.0175, ..., 0.0264, 0.0004, -0.1685], [-0.0088, 0.0179, 0.0159, ..., -0.0321, 0.0461, -0.0285], ..., [-0.0007, -0.0115, 0.0275, ..., -0.0507, -0.0674, 0.0174], [-0.0131, 0.0080, -0.0042, ..., -0.0463, 0.0232, -0.1038], [-0.0488, 0.0003, 0.0017, ..., -0.0639, -0.0414, -0.0513]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.0647e-05, 4.1425e-06, 4.7609e-06], [ 7.4506e-09, 0.0000e+00, 0.0000e+00, ..., -1.2554e-06, 5.8532e-05, 3.2168e-06], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 3.5744e-06, 1.4573e-05, 3.4682e-06], ..., [-2.1420e-08, 0.0000e+00, 0.0000e+00, ..., 1.4314e-06, 1.3202e-05, 6.5677e-06], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., -5.2899e-06, -3.2473e-04, 3.2559e-06], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 2.1961e-06, 1.1611e-04, 1.5211e-04]], device='cuda:0') Epoch 59, bias, value: tensor([ 0.0101, -0.0179, 0.0007, 0.0306, -0.0084, 0.0297, 0.0027, 0.0162, 0.0057, -0.0079], device='cuda:0'), grad: tensor([ 3.8370e-06, 6.4254e-05, 7.6532e-05, -7.6108e-06, -2.6703e-04, 1.1069e-04, 3.3379e-05, 7.8022e-05, -5.3024e-04, 4.3797e-04], device='cuda:0') 100 0.0001 changing lr epoch 58, time 214.38, cls_loss 0.0085 cls_loss_mapping 0.0153 cls_loss_causal 0.6456 re_mapping 0.0130 re_causal 0.0394 /// teacc 98.79 lr 0.00010000 Epoch 60, weight, value: tensor([[-1.5569e-02, -1.9140e-03, -1.1108e-02, ..., 2.6590e-03, 6.1448e-04, -6.8986e-03], [ 2.1066e-02, 1.5187e-02, -1.8161e-02, ..., 2.6644e-02, -1.0456e-04, -1.7079e-01], [-8.7496e-03, 1.7886e-02, 1.8631e-02, ..., -3.2501e-02, 4.7178e-02, -2.8823e-02], ..., [-7.1947e-04, -1.1485e-02, 1.9661e-02, ..., -5.0877e-02, -6.8379e-02, 1.8281e-02], [-1.3076e-02, 8.0391e-03, -8.1184e-03, ..., -4.6442e-02, 2.3376e-02, -1.0460e-01], [-4.8779e-02, 2.5053e-04, 1.6376e-03, ..., -6.4115e-02, -4.1854e-02, -5.1713e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9856e-06, 8.6874e-06, 2.4214e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3197e-06, 1.3027e-03, 8.3167e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.8722e-06, -1.5697e-03, 1.0720e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9523e-07, 8.2433e-05, 1.5246e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.3474e-06, 3.5614e-05, 5.3830e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.8953e-07, 1.3731e-05, 3.9637e-06]], device='cuda:0') Epoch 60, bias, value: tensor([ 0.0101, -0.0180, 0.0014, 0.0309, -0.0086, 0.0295, 0.0025, 0.0160, 0.0055, -0.0079], device='cuda:0'), grad: tensor([ 1.2331e-05, 2.1782e-03, -2.6588e-03, 1.5354e-04, 8.0645e-05, 8.2374e-05, -8.3089e-05, 1.3280e-04, 8.2076e-05, 1.8418e-05], device='cuda:0') 100 0.0001 changing lr epoch 59, time 214.23, cls_loss 0.0082 cls_loss_mapping 0.0154 cls_loss_causal 0.6605 re_mapping 0.0127 re_causal 0.0379 /// teacc 98.66 lr 0.00010000 Epoch 61, weight, value: tensor([[-0.0164, -0.0019, -0.0131, ..., 0.0028, 0.0004, -0.0073], [ 0.0210, 0.0152, -0.0186, ..., 0.0267, -0.0007, -0.1718], [-0.0088, 0.0179, 0.0183, ..., -0.0328, 0.0478, -0.0310], ..., [-0.0007, -0.0115, 0.0213, ..., -0.0510, -0.0694, 0.0183], [-0.0120, 0.0080, -0.0082, ..., -0.0468, 0.0236, -0.1054], [-0.0490, 0.0002, 0.0014, ..., -0.0644, -0.0421, -0.0527]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.1897e-05, 8.4996e-05, -2.3190e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.3048e-06, 4.9844e-06, 1.7583e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.5607e-06, 1.2651e-05, 2.8387e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.6152e-06, 2.0433e-06, 1.0096e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7761e-05, 4.3474e-06, -2.9486e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.0988e-06, 5.1670e-06, 7.6145e-06]], device='cuda:0') Epoch 61, bias, value: tensor([ 0.0097, -0.0179, 0.0009, 0.0311, -0.0084, 0.0295, 0.0026, 0.0165, 0.0052, -0.0079], device='cuda:0'), grad: tensor([ 6.3777e-05, 3.5942e-05, 3.1233e-05, 3.9577e-04, 1.1787e-05, -1.2374e-04, -1.1510e-04, -6.9189e-04, 8.1182e-05, 3.1114e-04], device='cuda:0') 100 0.0001 changing lr epoch 60, time 214.22, cls_loss 0.0107 cls_loss_mapping 0.0181 cls_loss_causal 0.6738 re_mapping 0.0129 re_causal 0.0364 /// teacc 98.80 lr 0.00010000 Epoch 62, weight, value: tensor([[-1.5928e-02, -1.9056e-03, -1.3076e-02, ..., 3.0189e-03, 7.8417e-05, -4.5376e-03], [ 2.1049e-02, 1.5181e-02, -1.8643e-02, ..., 2.6420e-02, -1.8745e-03, -1.7288e-01], [-8.7342e-03, 1.7886e-02, 1.8267e-02, ..., -3.3239e-02, 4.8803e-02, -3.2231e-02], ..., [-1.4462e-05, -1.1486e-02, 2.1325e-02, ..., -5.1294e-02, -7.0027e-02, 1.9072e-02], [-1.2270e-02, 8.0165e-03, -8.2102e-03, ..., -4.7296e-02, 2.3842e-02, -1.0756e-01], [-5.0247e-02, 2.4880e-04, 1.4279e-03, ..., -6.4406e-02, -4.2770e-02, -5.3086e-02]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., -1.9372e-05, 1.1683e-05, -2.9355e-06], [ 2.7008e-08, 0.0000e+00, 0.0000e+00, ..., 1.1614e-06, 8.2031e-06, 6.4969e-06], [ 1.1176e-08, 0.0000e+00, 0.0000e+00, ..., 7.6592e-06, -1.7810e-04, -8.7172e-06], ..., [-1.3225e-07, 0.0000e+00, 0.0000e+00, ..., 7.3574e-07, 2.1964e-05, 1.8716e-05], [ 7.4506e-09, 0.0000e+00, 0.0000e+00, ..., -1.0216e-04, -2.3890e-04, 2.7522e-05], [ 5.0291e-08, 0.0000e+00, 0.0000e+00, ..., 8.6352e-06, 2.1279e-05, 5.8651e-05]], device='cuda:0') Epoch 62, bias, value: tensor([ 0.0100, -0.0184, 0.0013, 0.0310, -0.0092, 0.0302, 0.0024, 0.0158, 0.0052, -0.0071], device='cuda:0'), grad: tensor([-1.4767e-05, 1.3849e-06, -2.7394e-04, 2.1458e-04, -7.7188e-05, -8.3160e-04, 1.1168e-03, 6.3419e-05, -1.3614e-04, -6.3002e-05], device='cuda:0') 100 0.0001 changing lr epoch 61, time 214.42, cls_loss 0.0069 cls_loss_mapping 0.0137 cls_loss_causal 0.6144 re_mapping 0.0126 re_causal 0.0386 /// teacc 98.82 lr 0.00010000 Epoch 63, weight, value: tensor([[-1.5847e-02, -1.9056e-03, -1.3076e-02, ..., 3.0183e-03, -5.2993e-04, -4.4439e-03], [ 2.1025e-02, 1.5181e-02, -1.8643e-02, ..., 2.6407e-02, -2.6354e-03, -1.7319e-01], [-8.5288e-03, 1.7886e-02, 1.8267e-02, ..., -3.3514e-02, 4.9340e-02, -3.2394e-02], ..., [-7.6745e-05, -1.1486e-02, 2.1325e-02, ..., -5.1409e-02, -7.0417e-02, 1.9167e-02], [-1.1997e-02, 8.0165e-03, -8.2102e-03, ..., -4.7746e-02, 2.4225e-02, -1.0810e-01], [-5.1168e-02, 2.4880e-04, 1.4279e-03, ..., -6.4660e-02, -4.3769e-02, -5.4183e-02]], device='cuda:0'), grad: tensor([[ 1.7695e-08, 0.0000e+00, 0.0000e+00, ..., -8.7595e-04, -1.1140e-04, 1.2424e-06], [ 1.8626e-08, 0.0000e+00, 0.0000e+00, ..., 3.6620e-06, 3.4664e-06, 1.3784e-06], [ 1.3970e-08, 0.0000e+00, 0.0000e+00, ..., 1.9222e-05, 2.2855e-06, 1.2117e-06], ..., [-2.3562e-07, 0.0000e+00, 0.0000e+00, ..., 5.3383e-06, 5.3570e-06, 4.9882e-06], [ 1.3970e-08, 0.0000e+00, 0.0000e+00, ..., 1.4715e-05, -1.2182e-05, -3.2336e-06], [ 9.4064e-08, 0.0000e+00, 0.0000e+00, ..., 6.3419e-05, 1.8239e-05, 6.5863e-05]], device='cuda:0') Epoch 63, bias, value: tensor([ 0.0097, -0.0182, 0.0010, 0.0311, -0.0091, 0.0303, 0.0027, 0.0156, 0.0053, -0.0072], device='cuda:0'), grad: tensor([-5.2948e-03, -9.4175e-05, 1.3673e-04, 7.2598e-05, 7.8142e-05, 3.5992e-03, 8.4496e-04, 3.9369e-05, 8.3327e-05, 5.3358e-04], device='cuda:0') 100 0.0001 changing lr epoch 62, time 214.98, cls_loss 0.0067 cls_loss_mapping 0.0132 cls_loss_causal 0.6132 re_mapping 0.0120 re_causal 0.0386 /// teacc 98.78 lr 0.00010000 Epoch 64, weight, value: tensor([[-0.0166, -0.0019, -0.0131, ..., 0.0034, -0.0010, -0.0046], [ 0.0210, 0.0152, -0.0186, ..., 0.0260, -0.0028, -0.1742], [-0.0088, 0.0179, 0.0183, ..., -0.0338, 0.0499, -0.0317], ..., [ 0.0007, -0.0115, 0.0213, ..., -0.0515, -0.0712, 0.0191], [-0.0117, 0.0080, -0.0082, ..., -0.0479, 0.0245, -0.1084], [-0.0516, 0.0002, 0.0014, ..., -0.0649, -0.0443, -0.0556]], device='cuda:0'), grad: tensor([[ 4.0978e-08, 0.0000e+00, 0.0000e+00, ..., 3.0577e-05, 5.3555e-05, 4.1351e-06], [ 1.1362e-07, 0.0000e+00, 0.0000e+00, ..., 6.0350e-07, 3.7942e-06, 1.0796e-05], [ 6.0536e-08, 0.0000e+00, 0.0000e+00, ..., 2.6654e-06, -3.7670e-05, 8.8066e-06], ..., [-5.7928e-07, 0.0000e+00, 0.0000e+00, ..., 3.6694e-07, 2.6017e-05, 1.2666e-05], [ 9.4995e-08, 0.0000e+00, 0.0000e+00, ..., 5.4352e-06, 1.1569e-04, 1.0557e-05], [ 1.0058e-07, 0.0000e+00, 0.0000e+00, ..., 2.8238e-06, 1.0091e-04, -1.1724e-04]], device='cuda:0') Epoch 64, bias, value: tensor([ 0.0098, -0.0178, 0.0008, 0.0312, -0.0083, 0.0300, 0.0029, 0.0156, 0.0055, -0.0081], device='cuda:0'), grad: tensor([ 7.2837e-05, 3.1739e-05, -3.0503e-05, 1.1945e-04, 4.2224e-04, -5.8794e-04, -3.3021e-05, 1.9300e-04, 4.5753e-04, -6.4564e-04], device='cuda:0') 100 0.0001 changing lr epoch 63, time 214.81, cls_loss 0.0087 cls_loss_mapping 0.0161 cls_loss_causal 0.6450 re_mapping 0.0121 re_causal 0.0380 /// teacc 98.64 lr 0.00010000 Epoch 65, weight, value: tensor([[-0.0166, -0.0019, -0.0131, ..., 0.0037, -0.0011, -0.0040], [ 0.0210, 0.0152, -0.0186, ..., 0.0259, -0.0035, -0.1759], [-0.0088, 0.0179, 0.0183, ..., -0.0340, 0.0506, -0.0317], ..., [ 0.0009, -0.0115, 0.0213, ..., -0.0516, -0.0719, 0.0190], [-0.0118, 0.0080, -0.0082, ..., -0.0481, 0.0247, -0.1086], [-0.0517, 0.0002, 0.0014, ..., -0.0647, -0.0445, -0.0557]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 8.7544e-06, 1.1779e-05, 6.2678e-07], [ 6.1467e-08, 0.0000e+00, 0.0000e+00, ..., 1.2862e-06, 5.6103e-06, 8.0839e-07], [ 7.4506e-09, 0.0000e+00, 0.0000e+00, ..., 2.5984e-06, -6.4485e-06, 1.1288e-06], ..., [-1.4249e-07, 0.0000e+00, 0.0000e+00, ..., 1.3486e-06, 6.5155e-06, 3.7458e-06], [ 7.4506e-09, 0.0000e+00, 0.0000e+00, ..., 8.2925e-06, 1.2740e-05, 5.5041e-07], [ 3.4459e-08, 0.0000e+00, 0.0000e+00, ..., 2.7455e-06, 8.8662e-06, 1.7136e-05]], device='cuda:0') Epoch 65, bias, value: tensor([ 0.0097, -0.0175, 0.0011, 0.0313, -0.0079, 0.0301, 0.0028, 0.0153, 0.0052, -0.0084], device='cuda:0'), grad: tensor([ 2.2292e-05, -1.6078e-05, -1.3625e-06, 3.4869e-05, -3.6955e-05, -1.2565e-04, 3.2276e-05, 5.4725e-06, 3.0637e-05, 5.4181e-05], device='cuda:0') 100 0.0001 changing lr epoch 64, time 214.33, cls_loss 0.0097 cls_loss_mapping 0.0168 cls_loss_causal 0.6550 re_mapping 0.0125 re_causal 0.0360 /// teacc 98.76 lr 0.00010000 Epoch 66, weight, value: tensor([[-0.0169, -0.0019, -0.0131, ..., 0.0036, -0.0015, -0.0039], [ 0.0210, 0.0152, -0.0186, ..., 0.0258, -0.0043, -0.1765], [-0.0087, 0.0179, 0.0183, ..., -0.0345, 0.0510, -0.0325], ..., [ 0.0009, -0.0115, 0.0213, ..., -0.0518, -0.0731, 0.0183], [-0.0116, 0.0080, -0.0082, ..., -0.0483, 0.0254, -0.1089], [-0.0519, 0.0002, 0.0014, ..., -0.0650, -0.0453, -0.0566]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1906e-05, 2.3589e-05, 3.3211e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8720e-07, 2.9691e-06, -8.3804e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.6392e-07, 4.3400e-06, 2.6897e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.5914e-07, 4.8354e-06, 4.3720e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.4373e-06, -1.6320e-04, 1.7285e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7772e-06, 4.6045e-05, -2.9787e-05]], device='cuda:0') Epoch 66, bias, value: tensor([ 0.0096, -0.0179, 0.0008, 0.0317, -0.0076, 0.0295, 0.0037, 0.0155, 0.0052, -0.0085], device='cuda:0'), grad: tensor([ 6.4075e-05, 6.1417e-03, 2.1124e-04, 3.9482e-04, 2.1291e-04, -3.6389e-05, -7.7367e-05, -6.7406e-03, -6.6161e-05, -1.0014e-04], device='cuda:0') 100 0.0001 changing lr epoch 65, time 214.52, cls_loss 0.0085 cls_loss_mapping 0.0138 cls_loss_causal 0.6618 re_mapping 0.0119 re_causal 0.0374 /// teacc 98.66 lr 0.00010000 Epoch 67, weight, value: tensor([[-0.0170, -0.0019, -0.0132, ..., 0.0043, -0.0016, -0.0036], [ 0.0221, 0.0152, -0.0186, ..., 0.0260, -0.0040, -0.1767], [-0.0087, 0.0179, 0.0183, ..., -0.0348, 0.0511, -0.0319], ..., [ 0.0003, -0.0115, 0.0213, ..., -0.0520, -0.0742, 0.0184], [-0.0118, 0.0080, -0.0082, ..., -0.0490, 0.0256, -0.1107], [-0.0549, 0.0002, 0.0014, ..., -0.0654, -0.0459, -0.0563]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0973e-04, 1.4089e-05, 5.7276e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.2627e-06, 5.9567e-06, 1.2880e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7896e-05, 4.9397e-06, 8.0559e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.3882e-06, 3.1758e-06, -7.2159e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7819e-05, -1.6421e-05, 5.3458e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0759e-05, 8.7768e-06, 5.7161e-05]], device='cuda:0') Epoch 67, bias, value: tensor([ 0.0097, -0.0166, 0.0001, 0.0318, -0.0082, 0.0292, 0.0040, 0.0157, 0.0048, -0.0087], device='cuda:0'), grad: tensor([ 5.8126e-04, 4.6402e-05, 8.3923e-05, -9.0599e-04, -2.1413e-05, -1.2740e-05, 2.4691e-05, 2.1383e-05, 1.6344e-04, 2.0131e-05], device='cuda:0') 100 0.0001 changing lr epoch 66, time 214.37, cls_loss 0.0113 cls_loss_mapping 0.0179 cls_loss_causal 0.6203 re_mapping 0.0124 re_causal 0.0347 /// teacc 98.64 lr 0.00010000 Epoch 68, weight, value: tensor([[-0.0173, -0.0019, -0.0138, ..., 0.0040, -0.0025, -0.0039], [ 0.0232, 0.0152, -0.0189, ..., 0.0257, -0.0047, -0.1768], [-0.0094, 0.0179, 0.0183, ..., -0.0353, 0.0517, -0.0322], ..., [ 0.0010, -0.0115, 0.0208, ..., -0.0523, -0.0755, 0.0191], [-0.0128, 0.0080, -0.0084, ..., -0.0494, 0.0258, -0.1123], [-0.0565, 0.0002, 0.0014, ..., -0.0648, -0.0466, -0.0583]], device='cuda:0'), grad: tensor([[ 5.2154e-08, 0.0000e+00, 0.0000e+00, ..., -6.9812e-06, 1.9297e-05, 1.0833e-05], [ 2.3097e-06, 0.0000e+00, 0.0000e+00, ..., 2.0284e-06, 3.1501e-05, 3.0369e-05], [ 1.9278e-07, 0.0000e+00, 0.0000e+00, ..., 1.2219e-06, 1.5073e-05, 1.9521e-05], ..., [-3.9898e-06, 0.0000e+00, 0.0000e+00, ..., 9.3132e-07, 2.1532e-05, 4.3623e-06], [ 5.6438e-07, 0.0000e+00, 0.0000e+00, ..., 4.5188e-06, 3.5465e-05, 2.6405e-05], [ 2.4121e-07, 0.0000e+00, 0.0000e+00, ..., 1.5702e-06, 7.1764e-04, 5.2071e-04]], device='cuda:0') Epoch 68, bias, value: tensor([ 0.0085, -0.0171, 0.0002, 0.0326, -0.0077, 0.0302, 0.0033, 0.0159, 0.0047, -0.0092], device='cuda:0'), grad: tensor([ 1.1826e-04, 2.5702e-04, 2.3556e-04, -7.1526e-03, 3.8290e-04, 3.7491e-05, 1.9312e-05, -7.7859e-06, 3.3593e-04, 5.7716e-03], device='cuda:0') 100 0.0001 changing lr epoch 67, time 214.15, cls_loss 0.0079 cls_loss_mapping 0.0164 cls_loss_causal 0.6224 re_mapping 0.0115 re_causal 0.0349 /// teacc 98.71 lr 0.00010000 Epoch 69, weight, value: tensor([[-1.7238e-02, -1.9006e-03, -1.4748e-02, ..., 4.7513e-03, -2.5865e-03, -3.9697e-03], [ 2.4412e-02, 1.5174e-02, -1.8955e-02, ..., 2.5404e-02, -5.6061e-03, -1.7799e-01], [-9.5482e-03, 1.7884e-02, 1.8164e-02, ..., -3.6062e-02, 5.2127e-02, -3.2323e-02], ..., [-1.1510e-04, -1.1487e-02, 1.9757e-02, ..., -5.2650e-02, -7.6401e-02, 2.0071e-02], [-1.2333e-02, 8.0033e-03, -8.5257e-03, ..., -4.9144e-02, 2.6646e-02, -1.1298e-01], [-5.7329e-02, 2.4841e-04, 1.3499e-03, ..., -6.4649e-02, -4.6971e-02, -5.8469e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.2861e-04, 4.7016e-04, 9.5926e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.8077e-06, 8.3447e-06, 1.8859e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9087e-05, 1.1668e-05, 3.9674e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2936e-06, 4.5076e-06, -2.4159e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.1322e-05, 2.5883e-05, 1.6484e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.5049e-05, 1.7300e-05, 2.4326e-06]], device='cuda:0') Epoch 69, bias, value: tensor([ 0.0086, -0.0168, 0.0002, 0.0327, -0.0079, 0.0305, 0.0025, 0.0153, 0.0055, -0.0092], device='cuda:0'), grad: tensor([ 7.4959e-04, 6.4075e-05, 4.2707e-05, -8.2314e-05, 5.7489e-05, 2.1458e-03, -3.0479e-03, -7.9393e-05, 9.0659e-05, 5.9545e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 68---------------------------------------------------- epoch 68, time 230.60, cls_loss 0.0070 cls_loss_mapping 0.0136 cls_loss_causal 0.6409 re_mapping 0.0110 re_causal 0.0334 /// teacc 98.83 lr 0.00010000 Epoch 70, weight, value: tensor([[-1.7206e-02, -1.9006e-03, -1.4760e-02, ..., 5.4426e-03, -3.0568e-03, -3.9407e-03], [ 2.5376e-02, 1.5174e-02, -1.8956e-02, ..., 2.5440e-02, -5.9514e-03, -1.7985e-01], [-9.5825e-03, 1.7884e-02, 1.8163e-02, ..., -3.6424e-02, 5.2919e-02, -3.2722e-02], ..., [-1.7789e-04, -1.1487e-02, 1.9754e-02, ..., -5.2725e-02, -7.7182e-02, 1.9831e-02], [-1.2731e-02, 8.0033e-03, -8.5261e-03, ..., -4.9374e-02, 2.6793e-02, -1.1333e-01], [-5.8321e-02, 2.4841e-04, 1.3470e-03, ..., -6.5284e-02, -4.7594e-02, -5.8522e-02]], device='cuda:0'), grad: tensor([[ 1.7574e-06, 0.0000e+00, 0.0000e+00, ..., -4.3884e-06, 2.6494e-05, 9.6858e-08], [-3.9339e-05, 0.0000e+00, 0.0000e+00, ..., -7.4469e-06, 3.4928e-04, 3.9814e-07], [-2.4855e-05, 0.0000e+00, 0.0000e+00, ..., 1.1045e-06, -3.3498e-04, 3.5670e-07], ..., [ 1.8299e-05, 0.0000e+00, 0.0000e+00, ..., 1.6419e-06, 5.5462e-05, 3.3434e-07], [ 5.3905e-06, 0.0000e+00, 0.0000e+00, ..., 7.6666e-06, -3.3617e-04, 3.0035e-07], [ 2.8498e-06, 0.0000e+00, 0.0000e+00, ..., 2.4773e-06, 4.8786e-05, 1.4938e-06]], device='cuda:0') Epoch 70, bias, value: tensor([ 0.0091, -0.0169, 0.0005, 0.0322, -0.0080, 0.0307, 0.0026, 0.0153, 0.0055, -0.0092], device='cuda:0'), grad: tensor([ 5.4955e-05, -4.9710e-05, -3.9625e-04, 3.2377e-04, 3.1734e-04, 7.3910e-05, 5.0366e-05, 1.7250e-04, -6.4516e-04, 9.9421e-05], device='cuda:0') 100 0.0001 changing lr epoch 69, time 214.62, cls_loss 0.0067 cls_loss_mapping 0.0120 cls_loss_causal 0.6414 re_mapping 0.0110 re_causal 0.0353 /// teacc 98.79 lr 0.00010000 Epoch 71, weight, value: tensor([[-0.0187, -0.0019, -0.0154, ..., 0.0064, -0.0040, -0.0042], [ 0.0257, 0.0152, -0.0194, ..., 0.0253, -0.0063, -0.1812], [-0.0098, 0.0179, 0.0179, ..., -0.0366, 0.0535, -0.0331], ..., [ 0.0007, -0.0115, 0.0196, ..., -0.0529, -0.0779, 0.0201], [-0.0127, 0.0080, -0.0076, ..., -0.0496, 0.0273, -0.1131], [-0.0589, 0.0002, 0.0013, ..., -0.0656, -0.0479, -0.0591]], device='cuda:0'), grad: tensor([[ 1.1176e-08, 0.0000e+00, 0.0000e+00, ..., -5.3123e-06, 2.2560e-05, 1.4175e-06], [-3.9395e-07, 0.0000e+00, 0.0000e+00, ..., 1.2228e-06, -1.7300e-05, 1.1781e-06], [ 1.8626e-08, 0.0000e+00, 0.0000e+00, ..., 6.7763e-06, -8.1599e-05, 3.0100e-06], ..., [ 5.4948e-08, 0.0000e+00, 0.0000e+00, ..., 1.1586e-06, 6.8486e-05, 2.7213e-06], [ 1.1921e-07, 0.0000e+00, 0.0000e+00, ..., 5.8003e-06, 2.6479e-05, 6.3218e-06], [ 5.2154e-08, 0.0000e+00, 0.0000e+00, ..., 7.2680e-06, 1.6376e-05, -1.0920e-04]], device='cuda:0') Epoch 71, bias, value: tensor([ 0.0093, -0.0167, 0.0005, 0.0318, -0.0076, 0.0304, 0.0022, 0.0155, 0.0055, -0.0093], device='cuda:0'), grad: tensor([ 8.6486e-05, -3.0971e-04, -5.1796e-05, 3.5357e-04, 4.9829e-04, -2.9826e-04, 9.1612e-05, 5.0217e-05, 2.4152e-04, -6.6280e-04], device='cuda:0') 100 0.0001 changing lr epoch 70, time 214.50, cls_loss 0.0079 cls_loss_mapping 0.0144 cls_loss_causal 0.6221 re_mapping 0.0113 re_causal 0.0331 /// teacc 98.65 lr 0.00010000 Epoch 72, weight, value: tensor([[-0.0173, -0.0019, -0.0156, ..., 0.0063, -0.0054, -0.0041], [ 0.0270, 0.0152, -0.0186, ..., 0.0249, -0.0069, -0.1826], [-0.0103, 0.0179, 0.0178, ..., -0.0374, 0.0536, -0.0335], ..., [ 0.0004, -0.0115, 0.0194, ..., -0.0532, -0.0786, 0.0209], [-0.0133, 0.0080, -0.0076, ..., -0.0497, 0.0284, -0.1141], [-0.0596, 0.0002, 0.0013, ..., -0.0662, -0.0490, -0.0585]], device='cuda:0'), grad: tensor([[ 5.4482e-07, 0.0000e+00, 0.0000e+00, ..., -4.2230e-05, 6.3851e-06, 5.9418e-06], [-5.3085e-06, 0.0000e+00, 0.0000e+00, ..., -4.5709e-06, 1.2562e-05, 1.3754e-05], [ 7.8045e-07, 0.0000e+00, 0.0000e+00, ..., 5.9307e-06, -2.1458e-05, 4.4480e-06], ..., [ 1.2331e-06, 0.0000e+00, 0.0000e+00, ..., 5.5544e-06, 1.5102e-05, -1.2018e-05], [ 1.0431e-06, 0.0000e+00, 0.0000e+00, ..., 1.4871e-05, -8.2180e-06, 3.0119e-06], [ 1.7043e-07, 0.0000e+00, 0.0000e+00, ..., 6.5006e-06, 5.5134e-06, 3.4779e-05]], device='cuda:0') Epoch 72, bias, value: tensor([ 0.0082, -0.0170, 0.0002, 0.0322, -0.0079, 0.0305, 0.0030, 0.0153, 0.0062, -0.0092], device='cuda:0'), grad: tensor([-4.7863e-05, 1.4153e-03, 1.9908e-04, 1.8072e-04, -1.3018e-04, -1.7341e-06, 4.1693e-05, -2.5635e-03, 1.0681e-04, 7.9918e-04], device='cuda:0') 100 0.0001 changing lr epoch 71, time 214.60, cls_loss 0.0065 cls_loss_mapping 0.0113 cls_loss_causal 0.6243 re_mapping 0.0111 re_causal 0.0334 /// teacc 98.80 lr 0.00010000 Epoch 73, weight, value: tensor([[-0.0187, -0.0019, -0.0163, ..., 0.0064, -0.0060, -0.0041], [ 0.0274, 0.0152, -0.0193, ..., 0.0246, -0.0080, -0.1835], [-0.0106, 0.0179, 0.0177, ..., -0.0379, 0.0539, -0.0336], ..., [ 0.0008, -0.0115, 0.0192, ..., -0.0534, -0.0791, 0.0210], [-0.0125, 0.0080, -0.0067, ..., -0.0499, 0.0290, -0.1140], [-0.0604, 0.0002, 0.0012, ..., -0.0665, -0.0497, -0.0587]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.6418e-05, 3.3230e-05, 4.6985e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.5241e-06, 4.5039e-06, 1.1390e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.6981e-06, -3.9965e-05, 2.4345e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.4005e-07, 1.5348e-05, 4.7088e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.5526e-05, 1.9446e-05, 1.5888e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.4755e-06, 7.5884e-06, -4.2282e-06]], device='cuda:0') Epoch 73, bias, value: tensor([ 0.0081, -0.0177, 0.0002, 0.0321, -0.0078, 0.0304, 0.0031, 0.0149, 0.0068, -0.0088], device='cuda:0'), grad: tensor([ 7.2479e-05, 1.4782e-05, -4.5508e-05, 5.0992e-05, 2.9057e-05, 2.0206e-04, -1.8179e-04, -1.6546e-04, 1.4806e-04, -1.2469e-04], device='cuda:0') 100 0.0001 changing lr epoch 72, time 214.54, cls_loss 0.0077 cls_loss_mapping 0.0128 cls_loss_causal 0.6603 re_mapping 0.0108 re_causal 0.0328 /// teacc 98.82 lr 0.00010000 Epoch 74, weight, value: tensor([[-0.0191, -0.0019, -0.0165, ..., 0.0067, -0.0062, -0.0026], [ 0.0295, 0.0152, -0.0194, ..., 0.0253, -0.0088, -0.1845], [-0.0113, 0.0179, 0.0178, ..., -0.0381, 0.0551, -0.0341], ..., [ 0.0005, -0.0115, 0.0192, ..., -0.0536, -0.0789, 0.0203], [-0.0136, 0.0080, -0.0067, ..., -0.0504, 0.0287, -0.1143], [-0.0614, 0.0002, 0.0012, ..., -0.0670, -0.0501, -0.0588]], device='cuda:0'), grad: tensor([[-2.2817e-08, 0.0000e+00, 0.0000e+00, ..., -8.9705e-06, 4.0941e-06, 5.7137e-07], [-2.7893e-07, 0.0000e+00, 0.0000e+00, ..., -1.2871e-06, 8.8476e-07, 2.4438e-06], [ 2.8405e-08, 0.0000e+00, 0.0000e+00, ..., 2.6580e-06, 1.7760e-06, 3.1432e-07], ..., [ 3.7719e-08, 0.0000e+00, 0.0000e+00, ..., 6.7195e-07, 1.2703e-06, 3.1721e-06], [ 1.3877e-07, 0.0000e+00, 0.0000e+00, ..., 4.1425e-06, -2.8824e-07, 2.1681e-06], [ 2.5146e-08, 0.0000e+00, 0.0000e+00, ..., 3.2056e-06, -4.2766e-06, -3.0342e-06]], device='cuda:0') Epoch 74, bias, value: tensor([ 0.0088, -0.0173, 0.0006, 0.0323, -0.0084, 0.0297, 0.0039, 0.0154, 0.0053, -0.0087], device='cuda:0'), grad: tensor([ 1.6302e-05, -1.5814e-06, 2.2352e-05, 9.6083e-05, 8.4877e-05, -2.6658e-05, 5.5701e-05, 1.0394e-05, 2.6584e-04, -5.2357e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 73---------------------------------------------------- epoch 73, time 231.86, cls_loss 0.0098 cls_loss_mapping 0.0185 cls_loss_causal 0.6327 re_mapping 0.0107 re_causal 0.0314 /// teacc 98.92 lr 0.00010000 Epoch 75, weight, value: tensor([[-0.0202, -0.0019, -0.0172, ..., 0.0063, -0.0060, -0.0027], [ 0.0326, 0.0152, -0.0176, ..., 0.0250, -0.0092, -0.1850], [-0.0116, 0.0179, 0.0178, ..., -0.0386, 0.0557, -0.0345], ..., [-0.0007, -0.0115, 0.0186, ..., -0.0540, -0.0792, 0.0200], [-0.0162, 0.0080, -0.0068, ..., -0.0510, 0.0284, -0.1149], [-0.0652, 0.0002, 0.0012, ..., -0.0675, -0.0506, -0.0594]], device='cuda:0'), grad: tensor([[-2.9569e-07, 0.0000e+00, 0.0000e+00, ..., -3.3919e-06, 5.0843e-05, 9.3319e-07], [ 4.4927e-06, 0.0000e+00, 0.0000e+00, ..., 7.3612e-06, 1.0389e-04, 2.9504e-06], [ 1.6782e-06, 0.0000e+00, 0.0000e+00, ..., 3.6024e-06, 2.5630e-06, 1.4110e-06], ..., [ 7.9395e-07, 0.0000e+00, 0.0000e+00, ..., 1.3821e-06, 2.5913e-05, 3.1646e-06], [ 4.2235e-07, 0.0000e+00, 0.0000e+00, ..., -4.4179e-04, -4.4937e-03, 1.3299e-05], [ 1.3849e-06, 0.0000e+00, 0.0000e+00, ..., 3.7774e-06, 4.4674e-05, 2.7752e-04]], device='cuda:0') Epoch 75, bias, value: tensor([ 0.0079, -0.0177, 0.0002, 0.0327, -0.0087, 0.0295, 0.0050, 0.0162, 0.0042, -0.0083], device='cuda:0'), grad: tensor([ 0.0001, 0.0004, -0.0004, 0.0013, -0.0007, 0.0007, 0.0049, 0.0002, -0.0072, 0.0006], device='cuda:0') 100 0.0001 changing lr epoch 74, time 214.23, cls_loss 0.0096 cls_loss_mapping 0.0169 cls_loss_causal 0.6218 re_mapping 0.0111 re_causal 0.0320 /// teacc 98.68 lr 0.00010000 Epoch 76, weight, value: tensor([[-0.0207, -0.0019, -0.0172, ..., 0.0070, -0.0059, -0.0028], [ 0.0312, 0.0152, -0.0176, ..., 0.0252, -0.0105, -0.1831], [-0.0099, 0.0179, 0.0178, ..., -0.0395, 0.0564, -0.0362], ..., [-0.0009, -0.0115, 0.0185, ..., -0.0542, -0.0800, 0.0201], [-0.0167, 0.0080, -0.0068, ..., -0.0512, 0.0295, -0.1147], [-0.0668, 0.0002, 0.0012, ..., -0.0665, -0.0511, -0.0602]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5870e-05, 2.9102e-05, 5.6982e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.2750e-05, 5.6028e-04, 2.1175e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.9651e-05, 2.6792e-05, 8.9169e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.0446e-04, 1.9515e-04, 6.8665e-04], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -9.1940e-06, -7.6723e-04, 9.0659e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.5936e-05, 8.0347e-05, 1.4997e-04]], device='cuda:0') Epoch 76, bias, value: tensor([ 0.0079, -0.0172, 0.0003, 0.0322, -0.0077, 0.0298, 0.0038, 0.0159, 0.0048, -0.0086], device='cuda:0'), grad: tensor([ 0.0005, 0.0015, 0.0008, 0.0013, 0.0008, -0.0123, 0.0003, 0.0065, -0.0009, 0.0014], device='cuda:0') 100 0.0001 changing lr epoch 75, time 214.30, cls_loss 0.0062 cls_loss_mapping 0.0110 cls_loss_causal 0.6040 re_mapping 0.0106 re_causal 0.0333 /// teacc 98.76 lr 0.00010000 Epoch 77, weight, value: tensor([[-0.0214, -0.0019, -0.0173, ..., 0.0070, -0.0065, -0.0030], [ 0.0314, 0.0152, -0.0177, ..., 0.0247, -0.0113, -0.1842], [-0.0100, 0.0179, 0.0178, ..., -0.0398, 0.0565, -0.0366], ..., [-0.0009, -0.0115, 0.0185, ..., -0.0547, -0.0806, 0.0213], [-0.0160, 0.0080, -0.0068, ..., -0.0516, 0.0300, -0.1163], [-0.0673, 0.0002, 0.0012, ..., -0.0667, -0.0515, -0.0610]], device='cuda:0'), grad: tensor([[ 5.0291e-08, 0.0000e+00, 0.0000e+00, ..., -3.5930e-06, 8.2776e-06, 2.7800e-07], [-8.4331e-07, 0.0000e+00, 0.0000e+00, ..., 5.8673e-08, 2.1428e-05, 5.0012e-07], [ 1.7323e-07, 0.0000e+00, 0.0000e+00, ..., 6.0117e-07, -1.2536e-06, 1.1034e-05], ..., [ 1.2293e-07, 0.0000e+00, 0.0000e+00, ..., 1.3132e-07, 2.2680e-05, 3.1991e-07], [ 1.8487e-07, 0.0000e+00, 0.0000e+00, ..., 5.4343e-07, -1.5926e-04, -1.6198e-05], [ 5.2154e-08, 0.0000e+00, 0.0000e+00, ..., 6.5984e-07, 1.4015e-05, 1.5637e-06]], device='cuda:0') Epoch 77, bias, value: tensor([ 0.0076, -0.0174, 0.0001, 0.0319, -0.0077, 0.0309, 0.0042, 0.0159, 0.0048, -0.0091], device='cuda:0'), grad: tensor([ 7.4625e-05, 3.5465e-05, 2.0111e-04, 2.8276e-04, 2.3112e-05, 2.4438e-05, 5.5790e-05, -5.0592e-04, -2.5320e-04, 6.2466e-05], device='cuda:0') 100 0.0001 changing lr epoch 76, time 214.29, cls_loss 0.0068 cls_loss_mapping 0.0114 cls_loss_causal 0.5680 re_mapping 0.0109 re_causal 0.0317 /// teacc 98.76 lr 0.00010000 Epoch 78, weight, value: tensor([[-0.0215, -0.0019, -0.0180, ..., 0.0082, -0.0069, -0.0031], [ 0.0313, 0.0152, -0.0185, ..., 0.0237, -0.0122, -0.1852], [-0.0103, 0.0179, 0.0176, ..., -0.0405, 0.0567, -0.0364], ..., [ 0.0004, -0.0115, 0.0181, ..., -0.0544, -0.0809, 0.0209], [-0.0167, 0.0080, -0.0056, ..., -0.0516, 0.0306, -0.1165], [-0.0698, 0.0002, 0.0011, ..., -0.0669, -0.0519, -0.0605]], device='cuda:0'), grad: tensor([[ 3.4459e-08, 0.0000e+00, 0.0000e+00, ..., 1.4715e-05, 9.7230e-06, 5.5367e-07], [-6.4913e-07, 0.0000e+00, 0.0000e+00, ..., -2.0593e-05, 2.5667e-06, 3.4226e-07], [ 2.5192e-07, 0.0000e+00, 0.0000e+00, ..., 7.2382e-06, 1.3895e-06, 3.6042e-07], ..., [ 1.1129e-07, 0.0000e+00, 0.0000e+00, ..., 2.3544e-06, 4.8382e-07, 1.0347e-06], [ 1.2014e-07, 0.0000e+00, 0.0000e+00, ..., 1.7017e-05, -6.7241e-06, 6.2259e-07], [ 4.3306e-08, 0.0000e+00, 0.0000e+00, ..., 3.2242e-06, 1.5935e-06, 3.4213e-05]], device='cuda:0') Epoch 78, bias, value: tensor([ 0.0084, -0.0181, -0.0002, 0.0318, -0.0077, 0.0306, 0.0037, 0.0165, 0.0049, -0.0088], device='cuda:0'), grad: tensor([ 5.8591e-05, -1.5962e-04, 3.9369e-05, 3.4750e-05, 2.2575e-06, -1.2413e-05, -2.6315e-05, 1.3947e-05, 3.8862e-05, 1.0744e-05], device='cuda:0') 100 0.0001 changing lr epoch 77, time 214.63, cls_loss 0.0065 cls_loss_mapping 0.0098 cls_loss_causal 0.6058 re_mapping 0.0105 re_causal 0.0323 /// teacc 98.83 lr 0.00010000 Epoch 79, weight, value: tensor([[-0.0226, -0.0019, -0.0189, ..., 0.0081, -0.0075, -0.0033], [ 0.0315, 0.0152, -0.0188, ..., 0.0238, -0.0131, -0.1856], [-0.0098, 0.0179, 0.0178, ..., -0.0410, 0.0576, -0.0366], ..., [-0.0010, -0.0115, 0.0177, ..., -0.0547, -0.0819, 0.0203], [-0.0173, 0.0080, -0.0050, ..., -0.0523, 0.0308, -0.1177], [-0.0718, 0.0002, 0.0011, ..., -0.0672, -0.0524, -0.0603]], device='cuda:0'), grad: tensor([[ 2.1413e-05, 0.0000e+00, 0.0000e+00, ..., -1.4551e-05, 3.7014e-05, 3.6180e-05], [ 9.4920e-06, 0.0000e+00, 0.0000e+00, ..., -1.6754e-06, 8.6129e-06, 1.8910e-05], [ 6.0983e-06, 0.0000e+00, 0.0000e+00, ..., -5.4687e-06, -2.0194e-04, 6.4634e-06], ..., [-5.5462e-05, 0.0000e+00, 0.0000e+00, ..., -2.8498e-06, 3.8326e-05, -9.9599e-05], [ 2.4270e-06, 0.0000e+00, 0.0000e+00, ..., 6.8955e-06, 6.4731e-05, 6.8396e-06], [ 3.5651e-06, 0.0000e+00, 0.0000e+00, ..., 6.0052e-06, 2.4084e-06, 7.8306e-06]], device='cuda:0') Epoch 79, bias, value: tensor([ 7.5956e-03, -1.7898e-02, -8.5355e-05, 3.1815e-02, -7.6049e-03, 3.0820e-02, 3.8487e-03, 1.5939e-02, 4.2507e-03, -7.9647e-03], device='cuda:0'), grad: tensor([ 1.3864e-04, 5.8383e-05, -2.2185e-04, 1.5140e-04, 6.2525e-05, 3.6538e-05, 4.2707e-05, -4.9639e-04, 1.1688e-04, 1.1140e-04], device='cuda:0') 100 0.0001 changing lr epoch 78, time 214.13, cls_loss 0.0071 cls_loss_mapping 0.0143 cls_loss_causal 0.6171 re_mapping 0.0113 re_causal 0.0327 /// teacc 98.74 lr 0.00010000 Epoch 80, weight, value: tensor([[-0.0231, -0.0019, -0.0250, ..., 0.0081, -0.0079, -0.0031], [ 0.0321, 0.0152, -0.0194, ..., 0.0260, -0.0134, -0.1861], [-0.0098, 0.0179, 0.0162, ..., -0.0416, 0.0583, -0.0368], ..., [-0.0013, -0.0115, 0.0123, ..., -0.0551, -0.0831, 0.0203], [-0.0182, 0.0080, -0.0077, ..., -0.0539, 0.0307, -0.1187], [-0.0731, 0.0002, 0.0023, ..., -0.0678, -0.0531, -0.0605]], device='cuda:0'), grad: tensor([[-5.6252e-07, 0.0000e+00, 0.0000e+00, ..., -8.2403e-06, 4.8757e-05, 2.5630e-05], [ 6.4727e-08, 0.0000e+00, 0.0000e+00, ..., 1.2862e-06, 1.0881e-03, 9.3207e-06], [ 9.7323e-08, 0.0000e+00, 0.0000e+00, ..., 2.2165e-06, -3.2520e-04, -3.1972e-04], ..., [ 2.6543e-08, 0.0000e+00, 0.0000e+00, ..., 3.9600e-06, 1.3411e-04, 1.1015e-04], [ 2.1420e-08, 0.0000e+00, 0.0000e+00, ..., 4.9733e-06, -1.9817e-03, 1.6376e-05], [ 6.3330e-08, 0.0000e+00, 0.0000e+00, ..., 5.5023e-06, 7.1955e-04, 8.7991e-06]], device='cuda:0') Epoch 80, bias, value: tensor([ 0.0074, -0.0175, 0.0005, 0.0316, -0.0071, 0.0314, 0.0033, 0.0157, 0.0036, -0.0083], device='cuda:0'), grad: tensor([ 1.2815e-04, 4.7760e-03, -6.1750e-04, 2.6077e-05, 3.0684e-04, 1.8227e-04, 2.1207e-04, 9.0361e-05, -7.1449e-03, 2.0409e-03], device='cuda:0') 100 0.0001 changing lr epoch 79, time 214.27, cls_loss 0.0061 cls_loss_mapping 0.0119 cls_loss_causal 0.6139 re_mapping 0.0106 re_causal 0.0329 /// teacc 98.81 lr 0.00010000 Epoch 81, weight, value: tensor([[-0.0232, -0.0019, -0.0266, ..., 0.0083, -0.0083, -0.0032], [ 0.0326, 0.0152, -0.0219, ..., 0.0255, -0.0142, -0.1862], [-0.0098, 0.0179, 0.0153, ..., -0.0419, 0.0590, -0.0368], ..., [-0.0014, -0.0115, 0.0096, ..., -0.0554, -0.0843, 0.0203], [-0.0188, 0.0079, -0.0086, ..., -0.0545, 0.0323, -0.1210], [-0.0747, 0.0002, 0.0020, ..., -0.0684, -0.0539, -0.0606]], device='cuda:0'), grad: tensor([[-6.6590e-08, 0.0000e+00, 0.0000e+00, ..., -3.0845e-06, 4.1313e-06, 1.8952e-07], [-2.7474e-08, 0.0000e+00, 0.0000e+00, ..., 1.2377e-06, 3.6228e-06, 6.4261e-08], [ 2.0489e-08, 0.0000e+00, 0.0000e+00, ..., 1.1241e-06, -2.1055e-05, -1.3812e-06], ..., [ 1.2107e-08, 0.0000e+00, 0.0000e+00, ..., 1.5693e-07, 1.3774e-06, 2.7288e-07], [ 1.2107e-08, 0.0000e+00, 0.0000e+00, ..., 2.0087e-05, 6.1989e-05, 4.3064e-06], [ 6.9849e-09, 0.0000e+00, 0.0000e+00, ..., 9.7509e-07, 2.7455e-06, -9.0199e-07]], device='cuda:0') Epoch 81, bias, value: tensor([ 0.0072, -0.0171, 0.0008, 0.0316, -0.0075, 0.0315, 0.0036, 0.0152, 0.0044, -0.0086], device='cuda:0'), grad: tensor([ 2.6934e-06, 1.5289e-05, -9.3430e-06, 8.7738e-05, 1.7211e-05, -3.6180e-05, -8.6308e-05, -5.8293e-05, 1.1563e-04, -4.8637e-05], device='cuda:0') 100 0.0001 changing lr epoch 80, time 214.52, cls_loss 0.0065 cls_loss_mapping 0.0126 cls_loss_causal 0.5807 re_mapping 0.0107 re_causal 0.0314 /// teacc 98.74 lr 0.00010000 Epoch 82, weight, value: tensor([[-0.0232, -0.0019, -0.0285, ..., 0.0085, -0.0087, -0.0029], [ 0.0326, 0.0152, -0.0248, ..., 0.0262, -0.0130, -0.1866], [-0.0098, 0.0179, 0.0146, ..., -0.0427, 0.0595, -0.0368], ..., [-0.0013, -0.0115, 0.0085, ..., -0.0557, -0.0863, 0.0203], [-0.0188, 0.0079, -0.0078, ..., -0.0547, 0.0323, -0.1214], [-0.0749, 0.0002, 0.0020, ..., -0.0686, -0.0547, -0.0604]], device='cuda:0'), grad: tensor([[ 2.3283e-09, 0.0000e+00, 0.0000e+00, ..., -1.4566e-05, -2.5079e-05, 5.8673e-08], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., -2.0266e-05, -1.8075e-05, 1.7835e-07], [ 9.3132e-09, 0.0000e+00, 0.0000e+00, ..., 1.7256e-05, 2.7731e-05, 1.4110e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6876e-06, 2.3004e-06, 1.4435e-07], [ 1.0245e-08, 0.0000e+00, 0.0000e+00, ..., 4.3549e-06, 6.3144e-06, 5.9605e-08], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 2.4047e-06, 3.4571e-06, 2.4065e-06]], device='cuda:0') Epoch 82, bias, value: tensor([ 0.0073, -0.0162, 0.0010, 0.0315, -0.0072, 0.0311, 0.0041, 0.0152, 0.0034, -0.0090], device='cuda:0'), grad: tensor([-1.1420e-04, -2.1100e-04, 8.2874e-04, 6.2132e-04, 1.3518e-04, 3.2365e-05, 6.1989e-05, -1.4362e-03, 7.0572e-05, 1.1928e-05], device='cuda:0') 100 0.0001 changing lr epoch 81, time 214.36, cls_loss 0.0067 cls_loss_mapping 0.0123 cls_loss_causal 0.6098 re_mapping 0.0106 re_causal 0.0306 /// teacc 98.84 lr 0.00010000 Epoch 83, weight, value: tensor([[-0.0237, -0.0019, -0.0314, ..., 0.0088, -0.0088, -0.0031], [ 0.0325, 0.0151, -0.0251, ..., 0.0262, -0.0135, -0.1881], [-0.0100, 0.0179, 0.0141, ..., -0.0431, 0.0600, -0.0371], ..., [-0.0014, -0.0115, 0.0070, ..., -0.0561, -0.0872, 0.0209], [-0.0174, 0.0079, -0.0086, ..., -0.0556, 0.0322, -0.1211], [-0.0751, 0.0002, 0.0013, ..., -0.0686, -0.0553, -0.0607]], device='cuda:0'), grad: tensor([[ 9.7323e-08, 0.0000e+00, 4.6566e-09, ..., -9.5665e-05, 9.8813e-07, 7.2690e-07], [ 2.4354e-07, 0.0000e+00, 1.5832e-08, ..., 1.4603e-04, 5.0592e-04, 2.3618e-06], [ 2.3376e-06, 0.0000e+00, 4.1910e-09, ..., -4.2245e-06, -1.0449e-04, -1.3113e-06], ..., [ 3.1246e-07, 0.0000e+00, 5.4482e-08, ..., 5.3383e-06, 1.6943e-05, 1.9781e-06], [ 9.2201e-08, 0.0000e+00, 7.3109e-08, ..., 2.5272e-03, 8.8120e-03, 1.9064e-06], [ 1.4435e-07, 0.0000e+00, -2.6310e-07, ..., 6.3002e-05, 4.1544e-05, 8.3566e-05]], device='cuda:0') Epoch 83, bias, value: tensor([ 0.0072, -0.0165, 0.0007, 0.0322, -0.0076, 0.0311, 0.0041, 0.0158, 0.0028, -0.0089], device='cuda:0'), grad: tensor([-1.8215e-04, 7.2670e-04, -1.6189e-04, 6.3062e-05, -2.5794e-05, -1.5282e-02, 1.8253e-03, -1.0467e-04, 1.2741e-02, 3.9482e-04], device='cuda:0') 100 0.0001 changing lr epoch 82, time 214.46, cls_loss 0.0065 cls_loss_mapping 0.0105 cls_loss_causal 0.5919 re_mapping 0.0096 re_causal 0.0287 /// teacc 98.91 lr 0.00010000 Epoch 84, weight, value: tensor([[-0.0238, -0.0019, -0.0336, ..., 0.0099, -0.0094, -0.0032], [ 0.0325, 0.0151, -0.0227, ..., 0.0244, -0.0138, -0.1888], [-0.0103, 0.0179, 0.0137, ..., -0.0435, 0.0606, -0.0366], ..., [-0.0009, -0.0115, 0.0052, ..., -0.0564, -0.0875, 0.0223], [-0.0176, 0.0079, -0.0094, ..., -0.0563, 0.0318, -0.1217], [-0.0754, 0.0002, 0.0018, ..., -0.0690, -0.0560, -0.0617]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.8818e-06, 7.2755e-06, 2.8089e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.3726e-07, 4.1649e-06, 5.6485e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0356e-06, 2.7925e-05, 2.1601e-04], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.7323e-08, 6.9797e-05, 4.6846e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9840e-06, 1.0356e-05, 1.7118e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.6822e-07, 9.1344e-06, 1.2867e-05]], device='cuda:0') Epoch 84, bias, value: tensor([ 0.0079, -0.0168, 0.0007, 0.0320, -0.0067, 0.0315, 0.0041, 0.0155, 0.0026, -0.0096], device='cuda:0'), grad: tensor([ 2.4438e-05, -1.2410e-04, 2.7037e-04, 4.6921e-04, -5.5695e-04, -4.5395e-04, 1.7717e-05, 2.7752e-04, 9.5129e-05, -1.8597e-05], device='cuda:0') 100 0.0001 changing lr epoch 83, time 214.30, cls_loss 0.0053 cls_loss_mapping 0.0099 cls_loss_causal 0.5955 re_mapping 0.0099 re_causal 0.0298 /// teacc 98.90 lr 0.00010000 Epoch 85, weight, value: tensor([[-0.0240, -0.0018, -0.0365, ..., 0.0104, -0.0098, -0.0026], [ 0.0325, 0.0151, -0.0188, ..., 0.0242, -0.0140, -0.1892], [-0.0103, 0.0179, 0.0135, ..., -0.0439, 0.0614, -0.0371], ..., [-0.0009, -0.0115, 0.0024, ..., -0.0566, -0.0885, 0.0230], [-0.0176, 0.0076, -0.0117, ..., -0.0561, 0.0319, -0.1230], [-0.0756, 0.0002, 0.0018, ..., -0.0694, -0.0565, -0.0617]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.7907e-06, -3.9399e-05, -3.3116e-04], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.6310e-07, 2.7660e-06, 2.1718e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.8406e-07, 1.1519e-05, 3.7640e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.3272e-07, 3.5409e-06, 1.3638e-04], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.1383e-06, -2.3317e-04, 4.9204e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5274e-06, 2.2471e-04, -9.0897e-05]], device='cuda:0') Epoch 85, bias, value: tensor([ 0.0083, -0.0175, 0.0008, 0.0326, -0.0071, 0.0313, 0.0038, 0.0159, 0.0025, -0.0094], device='cuda:0'), grad: tensor([-6.7472e-04, 1.3679e-05, 1.1408e-04, 6.3360e-05, 2.2423e-04, 4.0382e-05, 1.8847e-04, 1.3132e-03, -6.9571e-04, -5.8746e-04], device='cuda:0') 100 0.0001 changing lr epoch 84, time 214.44, cls_loss 0.0045 cls_loss_mapping 0.0092 cls_loss_causal 0.5685 re_mapping 0.0098 re_causal 0.0296 /// teacc 98.85 lr 0.00010000 Epoch 86, weight, value: tensor([[-0.0240, -0.0018, -0.0380, ..., 0.0104, -0.0101, -0.0020], [ 0.0325, 0.0150, -0.0179, ..., 0.0241, -0.0143, -0.1895], [-0.0104, 0.0179, 0.0143, ..., -0.0444, 0.0618, -0.0376], ..., [-0.0004, -0.0115, 0.0016, ..., -0.0568, -0.0890, 0.0230], [-0.0177, 0.0075, -0.0125, ..., -0.0565, 0.0321, -0.1237], [-0.0756, 0.0002, 0.0015, ..., -0.0697, -0.0574, -0.0621]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.0505e-05, 9.5833e-07, 1.4901e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2526e-07, 9.4436e-07, 1.1269e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.7707e-07, 2.0210e-06, 9.8720e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6391e-07, 7.1526e-07, 2.2864e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.0199e-07, -1.0329e-04, 3.6787e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3581e-06, 1.2495e-05, 6.5845e-07]], device='cuda:0') Epoch 86, bias, value: tensor([ 0.0083, -0.0176, 0.0010, 0.0327, -0.0072, 0.0315, 0.0041, 0.0158, 0.0025, -0.0097], device='cuda:0'), grad: tensor([-2.1607e-05, 6.1691e-06, 6.4559e-06, 1.2600e-04, 1.5117e-05, 1.2815e-05, 3.8177e-05, -8.3596e-06, -1.6272e-04, -1.1981e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 85---------------------------------------------------- epoch 85, time 230.99, cls_loss 0.0048 cls_loss_mapping 0.0106 cls_loss_causal 0.6130 re_mapping 0.0100 re_causal 0.0307 /// teacc 98.98 lr 0.00010000 Epoch 87, weight, value: tensor([[-0.0240, -0.0018, -0.0381, ..., 0.0106, -0.0102, -0.0021], [ 0.0325, 0.0150, -0.0175, ..., 0.0239, -0.0149, -0.1899], [-0.0104, 0.0179, 0.0142, ..., -0.0448, 0.0623, -0.0377], ..., [-0.0005, -0.0115, 0.0015, ..., -0.0570, -0.0896, 0.0229], [-0.0177, 0.0074, -0.0127, ..., -0.0563, 0.0324, -0.1240], [-0.0756, 0.0002, 0.0015, ..., -0.0699, -0.0580, -0.0625]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 6.5193e-09, ..., -2.3339e-06, 2.8059e-05, 1.7695e-07], [ 0.0000e+00, 0.0000e+00, 1.8161e-08, ..., 1.1727e-05, 1.9193e-05, 3.5297e-07], [ 0.0000e+00, 0.0000e+00, 5.1223e-09, ..., 5.6922e-06, 1.9714e-05, 4.0559e-07], ..., [ 0.0000e+00, 0.0000e+00, 7.7952e-07, ..., 5.5274e-07, 2.8405e-06, 2.5192e-07], [ 0.0000e+00, 0.0000e+00, 5.1223e-08, ..., 3.8356e-05, -7.3254e-05, 2.0908e-07], [ 0.0000e+00, 0.0000e+00, -9.7603e-07, ..., 4.8541e-06, 1.0990e-05, 9.7416e-07]], device='cuda:0') Epoch 87, bias, value: tensor([ 0.0083, -0.0182, 0.0010, 0.0325, -0.0070, 0.0318, 0.0043, 0.0162, 0.0026, -0.0100], device='cuda:0'), grad: tensor([ 2.6494e-05, 9.2447e-05, 1.0347e-04, -8.8215e-04, -1.4998e-05, 2.1482e-04, -1.7250e-04, 5.1498e-04, 1.8224e-05, 9.8765e-05], device='cuda:0') 100 0.0001 changing lr epoch 86, time 214.63, cls_loss 0.0050 cls_loss_mapping 0.0096 cls_loss_causal 0.5969 re_mapping 0.0094 re_causal 0.0293 /// teacc 98.93 lr 0.00010000 Epoch 88, weight, value: tensor([[-2.4045e-02, 6.0286e-05, -3.8161e-02, ..., 1.0878e-02, -1.0894e-02, -2.1233e-03], [ 3.2484e-02, 1.3429e-02, -1.7501e-02, ..., 2.4416e-02, -1.5517e-02, -1.9029e-01], [-1.0455e-02, 1.7681e-02, 1.4187e-02, ..., -4.5611e-02, 6.2936e-02, -3.6689e-02], ..., [-1.0260e-04, -1.1694e-02, 1.4155e-03, ..., -5.7610e-02, -9.0238e-02, 2.2545e-02], [-1.7547e-02, 4.5833e-03, -1.2814e-02, ..., -5.6170e-02, 3.3264e-02, -1.2426e-01], [-7.5678e-02, -6.5520e-04, 1.5733e-03, ..., -7.0434e-02, -5.8796e-02, -6.2800e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.0736e-05, 2.7381e-06, 1.3504e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -7.5949e-07, 6.6683e-07, 1.5507e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1194e-06, 1.0274e-05, 8.9873e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.0932e-07, 1.2822e-05, 1.8254e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.9384e-07, -2.8223e-05, 1.3504e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.1090e-06, 1.0226e-06, 1.9046e-06]], device='cuda:0') Epoch 88, bias, value: tensor([ 0.0083, -0.0192, 0.0009, 0.0325, -0.0072, 0.0319, 0.0041, 0.0167, 0.0032, -0.0098], device='cuda:0'), grad: tensor([-1.0781e-05, -9.4902e-07, 1.9267e-05, 3.5893e-06, 2.0489e-06, -3.1088e-06, 1.0513e-05, 2.3603e-05, -3.1352e-05, -1.2837e-05], device='cuda:0') 100 0.0001 changing lr epoch 87, time 214.50, cls_loss 0.0069 cls_loss_mapping 0.0123 cls_loss_causal 0.5881 re_mapping 0.0095 re_causal 0.0287 /// teacc 98.80 lr 0.00010000 Epoch 89, weight, value: tensor([[-2.6043e-02, 3.7143e-04, -3.8220e-02, ..., 1.1324e-02, -1.1646e-02, -2.1896e-03], [ 3.2594e-02, 1.2937e-02, -1.7353e-02, ..., 2.4447e-02, -1.6228e-02, -1.9058e-01], [-1.0277e-02, 1.7776e-02, 1.4158e-02, ..., -4.6358e-02, 6.3708e-02, -3.6303e-02], ..., [-1.6251e-04, -1.1774e-02, 1.3448e-03, ..., -5.7997e-02, -9.0856e-02, 2.2416e-02], [-1.5837e-02, 3.1783e-03, -1.2859e-02, ..., -5.6959e-02, 3.4906e-02, -1.2475e-01], [-7.5843e-02, -7.7885e-04, 1.5771e-03, ..., -7.1726e-02, -6.0967e-02, -6.2492e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4982e-03, 9.5558e-04, 1.0272e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.8587e-06, 2.9907e-05, 6.5519e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.2724e-05, -2.2367e-05, 6.6683e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.9488e-06, 1.3532e-06, -2.0657e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.1439e-05, 2.8964e-06, 2.3679e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5187e-04, 6.7830e-05, 2.0936e-06]], device='cuda:0') Epoch 89, bias, value: tensor([ 0.0085, -0.0187, 0.0014, 0.0325, -0.0077, 0.0307, 0.0044, 0.0161, 0.0055, -0.0107], device='cuda:0'), grad: tensor([ 2.7504e-03, -4.0150e-04, 1.6916e-04, 2.9206e-04, 2.2197e-04, 8.1778e-05, -3.4676e-03, -2.9191e-05, 6.8843e-05, 3.1090e-04], device='cuda:0') 100 0.0001 changing lr epoch 88, time 214.34, cls_loss 0.0048 cls_loss_mapping 0.0110 cls_loss_causal 0.5940 re_mapping 0.0098 re_causal 0.0306 /// teacc 98.91 lr 0.00010000 Epoch 90, weight, value: tensor([[-2.5820e-02, 3.8353e-04, -3.8229e-02, ..., 1.2556e-02, -1.2130e-02, -1.8754e-03], [ 3.2646e-02, 1.2911e-02, -1.7361e-02, ..., 2.6358e-02, -1.5865e-02, -1.9105e-01], [-1.0334e-02, 1.7818e-02, 1.4152e-02, ..., -4.6718e-02, 6.4342e-02, -3.6447e-02], ..., [-8.8909e-05, -1.1808e-02, 1.3427e-03, ..., -5.8261e-02, -9.1204e-02, 2.2884e-02], [-1.5873e-02, 3.0592e-03, -1.2856e-02, ..., -5.8163e-02, 3.4494e-02, -1.2520e-01], [-7.5976e-02, -7.8098e-04, 1.5904e-03, ..., -7.2031e-02, -6.1460e-02, -6.2914e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.2806e-07, ..., -5.1633e-06, 4.5657e-05, 1.1059e-07], [ 0.0000e+00, 0.0000e+00, 2.5146e-06, ..., 1.4454e-06, 1.7896e-05, 5.4948e-06], [ 0.0000e+00, 0.0000e+00, 2.1998e-06, ..., 1.6978e-06, -3.9196e-04, 2.4564e-07], ..., [ 0.0000e+00, 0.0000e+00, 4.2678e-07, ..., 4.6287e-07, 1.2793e-05, 3.9376e-06], [ 0.0000e+00, 0.0000e+00, -1.2167e-05, ..., 1.3493e-05, 6.6340e-05, 4.3996e-06], [ 0.0000e+00, 0.0000e+00, 6.0163e-07, ..., 1.8328e-06, 6.7987e-06, 4.1537e-06]], device='cuda:0') Epoch 90, bias, value: tensor([ 0.0094, -0.0179, 0.0016, 0.0322, -0.0077, 0.0311, 0.0033, 0.0163, 0.0046, -0.0108], device='cuda:0'), grad: tensor([ 7.2539e-05, 6.0976e-05, -7.7248e-04, 4.1509e-04, -1.0175e-04, 1.1718e-04, -4.6045e-05, 5.6684e-05, 1.7238e-04, 2.4617e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 89---------------------------------------------------- epoch 89, time 230.47, cls_loss 0.0040 cls_loss_mapping 0.0085 cls_loss_causal 0.6025 re_mapping 0.0089 re_causal 0.0291 /// teacc 98.99 lr 0.00010000 Epoch 91, weight, value: tensor([[-0.0256, 0.0004, -0.0384, ..., 0.0127, -0.0126, -0.0008], [ 0.0327, 0.0129, -0.0175, ..., 0.0261, -0.0169, -0.1915], [-0.0104, 0.0178, 0.0140, ..., -0.0473, 0.0647, -0.0363], ..., [ 0.0003, -0.0118, 0.0013, ..., -0.0583, -0.0916, 0.0227], [-0.0160, 0.0030, -0.0124, ..., -0.0583, 0.0349, -0.1262], [-0.0764, -0.0008, 0.0015, ..., -0.0722, -0.0617, -0.0628]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.0841e-06, 1.4268e-06, 4.5612e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5111e-07, -2.4751e-05, 2.2158e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.3889e-07, -3.0082e-06, 1.6401e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.0094e-08, 3.6284e-06, 6.6720e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.9381e-06, 5.0306e-05, 7.5512e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.9116e-07, 2.8238e-06, 1.9193e-05]], device='cuda:0') Epoch 91, bias, value: tensor([ 0.0096, -0.0178, 0.0013, 0.0319, -0.0077, 0.0314, 0.0026, 0.0162, 0.0046, -0.0102], device='cuda:0'), grad: tensor([ 1.1049e-05, -9.5591e-06, 3.5167e-05, 3.4451e-04, -2.4486e-04, -3.7044e-05, 1.0476e-05, -8.2827e-04, 2.0218e-04, 5.1641e-04], device='cuda:0') 100 0.0001 changing lr epoch 90, time 214.47, cls_loss 0.0048 cls_loss_mapping 0.0091 cls_loss_causal 0.5793 re_mapping 0.0088 re_causal 0.0258 /// teacc 98.99 lr 0.00010000 Epoch 92, weight, value: tensor([[-0.0256, 0.0004, -0.0384, ..., 0.0126, -0.0132, -0.0009], [ 0.0327, 0.0128, -0.0175, ..., 0.0259, -0.0174, -0.1919], [-0.0104, 0.0179, 0.0140, ..., -0.0483, 0.0652, -0.0368], ..., [ 0.0002, -0.0118, 0.0013, ..., -0.0587, -0.0923, 0.0234], [-0.0160, 0.0029, -0.0124, ..., -0.0587, 0.0351, -0.1267], [-0.0765, -0.0008, 0.0015, ..., -0.0725, -0.0621, -0.0633]], device='cuda:0'), grad: tensor([[ 3.6554e-08, 0.0000e+00, 0.0000e+00, ..., 2.4080e-05, 1.9386e-05, 2.2585e-08], [-7.1293e-07, 0.0000e+00, 0.0000e+00, ..., 4.9779e-07, 5.0012e-07, 1.0966e-07], [ 1.5250e-07, 0.0000e+00, 0.0000e+00, ..., 5.8766e-07, -1.4435e-07, 5.7276e-08], ..., [ 2.3260e-07, 0.0000e+00, 0.0000e+00, ..., 1.4110e-07, 1.4529e-07, 2.1351e-07], [ 1.0082e-07, 0.0000e+00, 0.0000e+00, ..., 1.4104e-05, 1.2830e-05, 5.2853e-08], [ 3.0501e-08, 0.0000e+00, 0.0000e+00, ..., 1.4147e-06, 1.0561e-06, 5.4063e-07]], device='cuda:0') Epoch 92, bias, value: tensor([ 0.0094, -0.0179, 0.0013, 0.0319, -0.0076, 0.0317, 0.0027, 0.0166, 0.0041, -0.0105], device='cuda:0'), grad: tensor([ 3.1084e-05, -4.2111e-05, 1.1362e-05, -3.1982e-06, 1.4842e-05, 1.5676e-05, -6.9559e-05, -4.5672e-06, 4.7535e-05, -1.1884e-06], device='cuda:0') 100 0.0001 changing lr epoch 91, time 214.10, cls_loss 0.0044 cls_loss_mapping 0.0092 cls_loss_causal 0.5948 re_mapping 0.0092 re_causal 0.0285 /// teacc 98.85 lr 0.00010000 Epoch 93, weight, value: tensor([[-0.0254, 0.0004, -0.0388, ..., 0.0127, -0.0142, -0.0010], [ 0.0326, 0.0128, -0.0175, ..., 0.0255, -0.0179, -0.1923], [-0.0100, 0.0179, 0.0139, ..., -0.0490, 0.0657, -0.0364], ..., [-0.0002, -0.0118, 0.0009, ..., -0.0591, -0.0945, 0.0233], [-0.0165, 0.0029, -0.0131, ..., -0.0598, 0.0352, -0.1276], [-0.0771, -0.0008, 0.0014, ..., -0.0728, -0.0625, -0.0652]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3458e-06, 4.1910e-06, 4.0559e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.6287e-07, 2.9765e-06, 2.9872e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.2305e-07, 8.5458e-06, 9.5554e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.1491e-09, 2.6859e-06, -2.8163e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4557e-06, -6.2585e-06, 3.6391e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.3796e-08, 1.5497e-06, 1.0347e-06]], device='cuda:0') Epoch 93, bias, value: tensor([ 0.0094, -0.0179, 0.0012, 0.0317, -0.0067, 0.0315, 0.0041, 0.0157, 0.0041, -0.0109], device='cuda:0'), grad: tensor([ 1.6779e-05, 1.9774e-05, 6.0856e-05, -4.5061e-04, 2.0996e-05, 3.4451e-04, -1.6894e-06, -2.7969e-05, 9.8944e-06, 6.9141e-06], device='cuda:0') 100 0.0001 changing lr epoch 92, time 214.49, cls_loss 0.0046 cls_loss_mapping 0.0083 cls_loss_causal 0.5993 re_mapping 0.0088 re_causal 0.0282 /// teacc 98.88 lr 0.00010000 Epoch 94, weight, value: tensor([[-2.5531e-02, 3.7669e-04, -3.8857e-02, ..., 1.2643e-02, -1.4657e-02, -1.1499e-03], [ 3.2577e-02, 1.2690e-02, -1.7246e-02, ..., 2.5422e-02, -1.8753e-02, -1.9236e-01], [-1.0200e-02, 1.7993e-02, 1.3831e-02, ..., -4.9359e-02, 6.6858e-02, -3.6859e-02], ..., [ 5.0469e-05, -1.1882e-02, 7.6131e-04, ..., -5.9456e-02, -9.4878e-02, 2.5345e-02], [-1.6533e-02, 2.7472e-03, -1.3100e-02, ..., -6.0227e-02, 3.5041e-02, -1.2816e-01], [-7.7844e-02, -7.8318e-04, 1.4251e-03, ..., -7.1927e-02, -6.2247e-02, -6.5308e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.4661e-05, 3.2708e-06, 1.2228e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.3889e-07, 3.5763e-06, 5.6438e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.5560e-06, -7.3433e-05, 5.8254e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.0117e-06, 7.1600e-06, 4.9034e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.8855e-06, 5.0180e-06, 2.0470e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0997e-05, 7.0810e-05, 3.6597e-04]], device='cuda:0') Epoch 94, bias, value: tensor([ 0.0094, -0.0178, 0.0009, 0.0315, -0.0075, 0.0316, 0.0041, 0.0167, 0.0037, -0.0106], device='cuda:0'), grad: tensor([-4.1544e-05, 1.4879e-05, -1.2082e-04, 3.8177e-05, -1.5125e-03, 2.5943e-05, 1.3649e-05, -1.0997e-04, 4.6253e-05, 1.6451e-03], device='cuda:0') 100 0.0001 changing lr epoch 93, time 214.43, cls_loss 0.0055 cls_loss_mapping 0.0096 cls_loss_causal 0.5908 re_mapping 0.0085 re_causal 0.0256 /// teacc 98.97 lr 0.00010000 Epoch 95, weight, value: tensor([[-0.0266, 0.0004, -0.0389, ..., 0.0127, -0.0153, -0.0010], [ 0.0327, 0.0122, -0.0172, ..., 0.0258, -0.0189, -0.1928], [-0.0105, 0.0181, 0.0138, ..., -0.0499, 0.0675, -0.0376], ..., [-0.0003, -0.0120, 0.0008, ..., -0.0596, -0.0954, 0.0256], [-0.0154, 0.0013, -0.0131, ..., -0.0610, 0.0345, -0.1295], [-0.0786, -0.0008, 0.0014, ..., -0.0721, -0.0626, -0.0662]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1064e-06, 1.0818e-05, 5.3905e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0775e-06, 1.0245e-05, 2.0102e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3066e-06, 5.0187e-05, 2.8729e-05], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.6846e-08, 2.6152e-06, -7.1466e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.0536e-06, -1.4031e-04, 3.6005e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.1863e-07, 1.5795e-05, 4.1723e-05]], device='cuda:0') Epoch 95, bias, value: tensor([ 0.0089, -0.0173, 0.0008, 0.0318, -0.0072, 0.0316, 0.0043, 0.0165, 0.0031, -0.0107], device='cuda:0'), grad: tensor([ 6.2525e-05, 1.5593e-04, 2.2459e-04, 7.9095e-05, -6.2466e-05, 2.1720e-04, -2.2322e-05, -5.0020e-04, -2.6083e-04, 1.0705e-04], device='cuda:0') 100 0.0001 changing lr epoch 94, time 214.54, cls_loss 0.0041 cls_loss_mapping 0.0084 cls_loss_causal 0.6257 re_mapping 0.0089 re_causal 0.0278 /// teacc 98.91 lr 0.00010000 Epoch 96, weight, value: tensor([[-0.0263, 0.0004, -0.0389, ..., 0.0129, -0.0151, -0.0008], [ 0.0327, 0.0120, -0.0172, ..., 0.0256, -0.0193, -0.1927], [-0.0105, 0.0184, 0.0138, ..., -0.0502, 0.0683, -0.0381], ..., [-0.0004, -0.0121, 0.0008, ..., -0.0599, -0.0958, 0.0259], [-0.0154, 0.0004, -0.0131, ..., -0.0613, 0.0344, -0.1301], [-0.0787, -0.0008, 0.0014, ..., -0.0724, -0.0639, -0.0664]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.9912e-06, 4.1313e-06, 2.3306e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.4389e-07, 2.8554e-06, 3.9139e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -3.0012e-07, -5.7340e-05, 3.2783e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8603e-07, 3.3498e-05, 1.3029e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3916e-06, 8.0541e-06, 9.3831e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.1945e-07, 3.7570e-06, 2.8294e-06]], device='cuda:0') Epoch 96, bias, value: tensor([ 0.0093, -0.0170, 0.0009, 0.0321, -0.0074, 0.0314, 0.0047, 0.0164, 0.0026, -0.0107], device='cuda:0'), grad: tensor([ 3.4180e-06, 1.3765e-06, -5.4449e-05, -4.7207e-05, -4.3772e-06, 1.7971e-05, 1.4482e-06, 3.3379e-05, 1.8477e-05, 2.9966e-05], device='cuda:0') 100 0.0001 changing lr epoch 95, time 214.28, cls_loss 0.0048 cls_loss_mapping 0.0107 cls_loss_causal 0.6157 re_mapping 0.0088 re_causal 0.0275 /// teacc 98.96 lr 0.00010000 Epoch 97, weight, value: tensor([[-2.6379e-02, 3.7092e-04, -3.8925e-02, ..., 1.2854e-02, -1.5872e-02, -1.1195e-03], [ 3.2735e-02, 1.2013e-02, -1.7248e-02, ..., 2.5669e-02, -1.9368e-02, -1.9303e-01], [-1.0594e-02, 1.8418e-02, 1.3818e-02, ..., -5.0999e-02, 6.8655e-02, -3.8432e-02], ..., [-1.1431e-04, -1.2071e-02, 7.5122e-04, ..., -6.0458e-02, -9.5947e-02, 2.5837e-02], [-1.5513e-02, 3.2639e-04, -1.3118e-02, ..., -6.1755e-02, 3.4323e-02, -1.3221e-01], [-7.8796e-02, -7.9271e-04, 1.3922e-03, ..., -7.2737e-02, -6.4374e-02, -6.4827e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.0924e-06, 2.0210e-06, 2.8638e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.3232e-05, 3.1870e-06, 2.1420e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.4273e-06, -1.1468e-04, 6.7521e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.4680e-07, 9.0480e-05, 8.8476e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0375e-06, -2.3004e-06, 1.7462e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.8836e-07, 1.1986e-06, -7.3062e-07]], device='cuda:0') Epoch 97, bias, value: tensor([ 0.0087, -0.0167, 0.0002, 0.0314, -0.0080, 0.0322, 0.0052, 0.0164, 0.0017, -0.0097], device='cuda:0'), grad: tensor([ 8.2552e-06, -2.0847e-05, -2.1350e-04, 4.6641e-05, 1.3486e-05, 6.2399e-06, 1.3329e-05, 1.3590e-04, -3.9265e-06, 1.4082e-05], device='cuda:0') 100 0.0001 changing lr epoch 96, time 214.44, cls_loss 0.0043 cls_loss_mapping 0.0077 cls_loss_causal 0.5913 re_mapping 0.0087 re_causal 0.0264 /// teacc 98.91 lr 0.00010000 Epoch 98, weight, value: tensor([[-2.6397e-02, 3.8426e-04, -3.8951e-02, ..., 1.2849e-02, -1.6458e-02, -1.2790e-03], [ 3.2766e-02, 1.1911e-02, -1.7105e-02, ..., 2.5664e-02, -2.0407e-02, -1.9286e-01], [-1.0606e-02, 1.8494e-02, 1.3802e-02, ..., -5.1515e-02, 6.8232e-02, -4.0259e-02], ..., [-1.0498e-04, -1.2086e-02, 6.9432e-04, ..., -6.1278e-02, -9.6316e-02, 2.5541e-02], [-1.5532e-02, 1.4026e-04, -1.3128e-02, ..., -6.1373e-02, 3.5862e-02, -1.3175e-01], [-7.8917e-02, -7.9638e-04, 1.3444e-03, ..., -7.2830e-02, -6.4986e-02, -6.5550e-02]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., -5.4240e-06, 9.4846e-06, 4.2585e-07], [-3.3062e-08, 0.0000e+00, 0.0000e+00, ..., 3.8631e-06, 1.3143e-05, 3.3807e-06], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 2.2817e-06, -3.5763e-05, 4.0978e-06], ..., [ 7.6834e-09, 0.0000e+00, 0.0000e+00, ..., 6.6496e-07, 7.5102e-06, -2.5511e-05], [ 6.7521e-09, 0.0000e+00, 0.0000e+00, ..., -4.3549e-06, -3.0205e-05, -5.5768e-06], [ 1.1642e-09, 0.0000e+00, 0.0000e+00, ..., 2.1830e-06, 3.3993e-06, 9.7230e-06]], device='cuda:0') Epoch 98, bias, value: tensor([ 0.0079, -0.0160, -0.0007, 0.0313, -0.0074, 0.0322, 0.0049, 0.0153, 0.0031, -0.0097], device='cuda:0'), grad: tensor([ 3.9600e-06, 4.5270e-05, 3.3408e-05, -5.2881e-04, 1.9610e-05, 5.4419e-05, -1.7090e-06, 3.3975e-04, -8.9109e-06, 4.2677e-05], device='cuda:0') 100 0.0001 changing lr epoch 97, time 214.49, cls_loss 0.0041 cls_loss_mapping 0.0080 cls_loss_causal 0.5886 re_mapping 0.0084 re_causal 0.0267 /// teacc 98.91 lr 0.00010000 Epoch 99, weight, value: tensor([[-2.6402e-02, 4.0008e-04, -4.0351e-02, ..., 1.3207e-02, -1.6888e-02, -1.3616e-03], [ 3.2800e-02, 1.1888e-02, -1.7244e-02, ..., 2.5518e-02, -2.0885e-02, -1.9345e-01], [-1.0613e-02, 1.8486e-02, 1.3137e-02, ..., -5.2154e-02, 6.8454e-02, -4.0836e-02], ..., [-1.0977e-04, -1.2088e-02, 2.3358e-04, ..., -6.1502e-02, -9.6831e-02, 2.5934e-02], [-1.5576e-02, 9.5116e-05, -1.3085e-02, ..., -6.1595e-02, 3.6393e-02, -1.3070e-01], [-7.8929e-02, -7.9806e-04, 1.1398e-03, ..., -7.3033e-02, -6.4873e-02, -6.6700e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.3919e-06, 1.0446e-05, 2.8638e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.4249e-07, -1.5080e-05, 3.5623e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.9314e-07, 7.8976e-06, 1.3504e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.9884e-08, 2.1160e-06, -1.0943e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.1686e-06, 6.1020e-06, 5.0757e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7604e-06, 2.5891e-06, -1.6969e-06]], device='cuda:0') Epoch 99, bias, value: tensor([ 0.0084, -0.0156, -0.0009, 0.0313, -0.0067, 0.0317, 0.0048, 0.0157, 0.0030, -0.0105], device='cuda:0'), grad: tensor([ 9.3520e-05, -8.3733e-04, 5.2023e-04, 1.9383e-04, 7.6950e-05, -9.2387e-05, 3.7670e-05, 8.7857e-05, 8.7619e-05, -1.6892e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 98---------------------------------------------------- epoch 98, time 230.54, cls_loss 0.0047 cls_loss_mapping 0.0097 cls_loss_causal 0.5927 re_mapping 0.0083 re_causal 0.0254 /// teacc 99.01 lr 0.00010000 Epoch 100, weight, value: tensor([[-2.6388e-02, 4.2707e-04, -4.0352e-02, ..., 1.3332e-02, -1.7394e-02, -1.1085e-03], [ 3.2792e-02, 1.1559e-02, -1.7243e-02, ..., 2.5655e-02, -2.1047e-02, -1.9408e-01], [-1.0621e-02, 1.8473e-02, 1.3136e-02, ..., -5.3293e-02, 6.8738e-02, -4.0730e-02], ..., [-1.3178e-04, -1.2143e-02, 2.3299e-04, ..., -6.1695e-02, -9.7114e-02, 2.5818e-02], [-1.5587e-02, 2.0205e-03, -1.3082e-02, ..., -6.2121e-02, 3.6567e-02, -1.3099e-01], [-7.9046e-02, -8.6052e-04, 1.1395e-03, ..., -7.3249e-02, -6.5678e-02, -6.7092e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.8348e-06, 3.9130e-05, 7.4739e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8124e-06, 4.0841e-04, 2.5146e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.4482e-07, -3.2455e-05, 2.0815e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3108e-07, -4.2820e-04, 1.0142e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9208e-05, -1.1161e-05, 2.2654e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0990e-06, 3.1501e-05, -2.6766e-06]], device='cuda:0') Epoch 100, bias, value: tensor([ 0.0083, -0.0148, -0.0017, 0.0308, -0.0060, 0.0318, 0.0041, 0.0159, 0.0030, -0.0107], device='cuda:0'), grad: tensor([ 6.0827e-05, 8.1024e-03, 1.5712e-04, 1.0347e-04, 2.1949e-05, 2.3007e-04, -2.5153e-04, -8.5831e-03, 1.0532e-04, 4.7922e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 99---------------------------------------------------- epoch 99, time 230.73, cls_loss 0.0043 cls_loss_mapping 0.0096 cls_loss_causal 0.5791 re_mapping 0.0089 re_causal 0.0260 /// teacc 99.02 lr 0.00010000 Epoch 101, weight, value: tensor([[-0.0263, 0.0004, -0.0404, ..., 0.0135, -0.0179, -0.0008], [ 0.0331, 0.0115, -0.0172, ..., 0.0257, -0.0216, -0.1945], [-0.0107, 0.0185, 0.0131, ..., -0.0534, 0.0697, -0.0408], ..., [-0.0003, -0.0122, 0.0002, ..., -0.0619, -0.0980, 0.0256], [-0.0158, 0.0024, -0.0131, ..., -0.0627, 0.0365, -0.1311], [-0.0794, -0.0009, 0.0011, ..., -0.0723, -0.0656, -0.0673]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.0863e-05, 5.9372e-07, 3.4599e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5637e-06, 4.1872e-06, 2.0638e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2582e-06, -3.1367e-06, 5.1036e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6973e-07, 1.2470e-06, 2.2322e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.6412e-06, -7.0669e-06, 1.2675e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4445e-06, 1.6680e-06, -5.1081e-05]], device='cuda:0') Epoch 101, bias, value: tensor([ 0.0085, -0.0159, -0.0011, 0.0308, -0.0063, 0.0317, 0.0035, 0.0168, 0.0026, -0.0100], device='cuda:0'), grad: tensor([-3.3975e-05, 3.3259e-05, 1.2346e-05, 3.5584e-05, 1.8597e-04, 1.4879e-05, -5.1893e-06, 1.5211e-04, -2.7329e-05, -3.6740e-04], device='cuda:0') 100 0.0001 changing lr epoch 100, time 214.82, cls_loss 0.0041 cls_loss_mapping 0.0085 cls_loss_causal 0.5466 re_mapping 0.0093 re_causal 0.0260 /// teacc 98.84 lr 0.00010000 Epoch 102, weight, value: tensor([[-2.6309e-02, 4.2359e-04, -4.0785e-02, ..., 1.3267e-02, -1.9116e-02, -8.9251e-04], [ 3.3348e-02, 1.1521e-02, -1.7255e-02, ..., 2.5214e-02, -2.2571e-02, -1.9499e-01], [-1.0648e-02, 1.8491e-02, 1.3106e-02, ..., -5.3552e-02, 7.1194e-02, -4.0439e-02], ..., [-2.6704e-04, -1.2165e-02, -5.4076e-05, ..., -6.2110e-02, -9.8864e-02, 2.5528e-02], [-1.6081e-02, 2.4466e-03, -1.3277e-02, ..., -6.3302e-02, 3.6582e-02, -1.3212e-01], [-7.9658e-02, -8.7668e-04, 9.5353e-04, ..., -7.2517e-02, -6.7374e-02, -6.7209e-02]], device='cuda:0'), grad: tensor([[ 5.7742e-08, 0.0000e+00, 0.0000e+00, ..., -1.0123e-06, 6.2548e-06, 7.3528e-07], [-1.3681e-06, 0.0000e+00, 0.0000e+00, ..., 3.4692e-08, 3.4980e-06, -2.1560e-07], [ 4.6124e-07, 0.0000e+00, 0.0000e+00, ..., 1.1483e-06, -1.7369e-04, 1.2713e-06], ..., [ 1.5087e-07, 0.0000e+00, 0.0000e+00, ..., 2.6869e-07, 3.6597e-04, -1.7677e-06], [ 5.5600e-07, 0.0000e+00, 0.0000e+00, ..., 1.2508e-06, 1.0677e-05, 2.3525e-06], [ 8.6147e-09, 0.0000e+00, 0.0000e+00, ..., 8.2422e-07, 2.8178e-05, 1.7434e-05]], device='cuda:0') Epoch 102, bias, value: tensor([ 0.0080, -0.0159, -0.0003, 0.0313, -0.0064, 0.0315, 0.0038, 0.0169, 0.0023, -0.0102], device='cuda:0'), grad: tensor([ 4.9882e-06, -4.7870e-06, 1.2779e-03, -2.2182e-03, -3.8534e-05, -8.9645e-05, 1.0066e-05, 9.1743e-04, 5.0068e-05, 9.1076e-05], device='cuda:0') 100 0.0001 changing lr epoch 101, time 214.67, cls_loss 0.0043 cls_loss_mapping 0.0092 cls_loss_causal 0.5705 re_mapping 0.0089 re_causal 0.0268 /// teacc 98.92 lr 0.00010000 Epoch 103, weight, value: tensor([[-2.5175e-02, 4.2347e-04, -4.0793e-02, ..., 1.3201e-02, -1.9744e-02, -9.5481e-04], [ 3.3280e-02, 1.1520e-02, -1.7255e-02, ..., 2.5228e-02, -2.3098e-02, -1.9377e-01], [-1.0462e-02, 1.8493e-02, 1.3105e-02, ..., -5.3766e-02, 7.2100e-02, -4.0374e-02], ..., [-7.1211e-04, -1.2165e-02, -5.9020e-05, ..., -6.2323e-02, -1.0024e-01, 2.5066e-02], [-1.5839e-02, 2.4413e-03, -1.3280e-02, ..., -6.3879e-02, 3.6644e-02, -1.3338e-01], [-8.0247e-02, -8.7671e-04, 9.5004e-04, ..., -7.2652e-02, -6.8075e-02, -7.0035e-02]], device='cuda:0'), grad: tensor([[ 1.9115e-07, 0.0000e+00, 0.0000e+00, ..., -2.4363e-06, -8.6753e-07, 1.0780e-07], [ 1.5013e-05, 0.0000e+00, 0.0000e+00, ..., 4.2375e-08, 2.2620e-05, 1.1344e-06], [ 2.0936e-06, 0.0000e+00, 0.0000e+00, ..., 8.4518e-08, -1.6615e-05, -1.1228e-05], ..., [ 1.0198e-06, 0.0000e+00, 0.0000e+00, ..., 2.5146e-08, 1.8664e-06, 1.9986e-06], [-2.6688e-05, 0.0000e+00, 0.0000e+00, ..., 3.9977e-07, -3.5733e-05, 1.8720e-07], [ 6.1886e-07, 0.0000e+00, 0.0000e+00, ..., 4.8755e-07, 3.5334e-06, 1.1260e-06]], device='cuda:0') Epoch 103, bias, value: tensor([ 0.0074, -0.0152, -0.0001, 0.0314, -0.0049, 0.0320, 0.0038, 0.0165, 0.0019, -0.0120], device='cuda:0'), grad: tensor([-4.2021e-06, 5.6028e-05, -1.0006e-05, 2.9281e-06, 1.4558e-05, 8.5756e-06, 1.5944e-05, 1.1146e-05, -1.0180e-04, 6.7838e-06], device='cuda:0') 100 0.0001 changing lr epoch 102, time 214.77, cls_loss 0.0047 cls_loss_mapping 0.0083 cls_loss_causal 0.5855 re_mapping 0.0082 re_causal 0.0251 /// teacc 98.92 lr 0.00010000 Epoch 104, weight, value: tensor([[-0.0252, 0.0004, -0.0412, ..., 0.0129, -0.0209, -0.0011], [ 0.0336, 0.0115, -0.0173, ..., 0.0250, -0.0237, -0.1949], [-0.0105, 0.0186, 0.0127, ..., -0.0542, 0.0726, -0.0397], ..., [-0.0008, -0.0122, -0.0011, ..., -0.0625, -0.1011, 0.0231], [-0.0160, 0.0021, -0.0139, ..., -0.0638, 0.0376, -0.1332], [-0.0804, -0.0009, 0.0008, ..., -0.0729, -0.0685, -0.0694]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.5523e-06, 1.0379e-05, 6.1328e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.9652e-07, 3.5409e-06, 1.1986e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-06, 3.7272e-06, 1.1502e-06], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.2888e-08, 1.1094e-05, 9.0897e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.8275e-06, -2.2039e-05, 2.6654e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.0606e-07, 1.1124e-05, -2.4557e-05]], device='cuda:0') Epoch 104, bias, value: tensor([ 0.0066, -0.0158, -0.0008, 0.0313, -0.0052, 0.0318, 0.0040, 0.0162, 0.0025, -0.0108], device='cuda:0'), grad: tensor([ 2.8074e-05, 2.0325e-05, 8.0109e-05, -8.2433e-05, 1.0937e-04, 3.0965e-05, -9.6858e-05, 1.7032e-05, -1.1003e-04, 3.5800e-06], device='cuda:0') 100 0.0001 changing lr epoch 103, time 214.61, cls_loss 0.0041 cls_loss_mapping 0.0082 cls_loss_causal 0.5757 re_mapping 0.0090 re_causal 0.0262 /// teacc 98.93 lr 0.00010000 Epoch 105, weight, value: tensor([[-2.5252e-02, 4.4206e-04, -4.1214e-02, ..., 1.3731e-02, -2.1497e-02, -1.5622e-03], [ 3.3872e-02, 1.1480e-02, -1.7279e-02, ..., 2.4055e-02, -2.4620e-02, -1.9610e-01], [-1.0746e-02, 1.8579e-02, 1.2720e-02, ..., -5.4612e-02, 7.3308e-02, -4.1202e-02], ..., [-9.7954e-05, -1.2181e-02, -1.0788e-03, ..., -6.2654e-02, -1.0159e-01, 2.3523e-02], [-1.6004e-02, 2.0461e-03, -1.3936e-02, ..., -6.4151e-02, 3.7887e-02, -1.3433e-01], [-8.0779e-02, -8.8479e-04, 8.1276e-04, ..., -7.3101e-02, -6.9073e-02, -6.9520e-02]], device='cuda:0'), grad: tensor([[ 3.5856e-08, 0.0000e+00, 0.0000e+00, ..., -5.5507e-07, 3.0756e-05, 8.8662e-06], [-8.8988e-07, 0.0000e+00, 0.0000e+00, ..., -2.6785e-06, 2.3562e-06, 1.4175e-06], [ 7.9628e-08, 0.0000e+00, 0.0000e+00, ..., 5.2573e-07, -1.1659e-04, -2.5973e-05], ..., [ 9.1735e-08, 0.0000e+00, 0.0000e+00, ..., 3.9488e-07, 6.3293e-06, 1.9241e-06], [ 1.3877e-07, 0.0000e+00, 0.0000e+00, ..., 2.1905e-06, 5.8323e-05, 1.4611e-05], [ 1.8161e-08, 0.0000e+00, 0.0000e+00, ..., 7.9814e-07, 1.4521e-05, 1.5318e-05]], device='cuda:0') Epoch 105, bias, value: tensor([ 0.0077, -0.0159, -0.0009, 0.0311, -0.0050, 0.0317, 0.0041, 0.0167, 0.0022, -0.0113], device='cuda:0'), grad: tensor([ 4.7565e-05, 2.4766e-05, -1.7190e-04, 2.4930e-05, -2.2143e-05, -2.3991e-05, 2.2277e-05, -1.0693e-04, 9.6619e-05, 1.0860e-04], device='cuda:0') 100 0.0001 changing lr epoch 104, time 214.31, cls_loss 0.0040 cls_loss_mapping 0.0064 cls_loss_causal 0.5788 re_mapping 0.0090 re_causal 0.0268 /// teacc 98.64 lr 0.00010000 Epoch 106, weight, value: tensor([[-2.5079e-02, 5.5103e-04, -4.2571e-02, ..., 1.4015e-02, -2.2079e-02, -1.5906e-03], [ 3.3907e-02, 1.1277e-02, -1.7400e-02, ..., 2.3865e-02, -2.5469e-02, -1.9664e-01], [-1.0604e-02, 1.8600e-02, 1.2314e-02, ..., -5.5951e-02, 7.3833e-02, -4.1283e-02], ..., [-1.4211e-04, -1.2197e-02, -1.2024e-03, ..., -6.2836e-02, -1.0222e-01, 2.4059e-02], [-1.5984e-02, 1.5456e-03, -1.5684e-02, ..., -6.4458e-02, 3.8411e-02, -1.3534e-01], [-8.1021e-02, -9.0084e-04, 4.4046e-03, ..., -7.3310e-02, -6.9933e-02, -6.9465e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -8.3819e-09, 0.0000e+00, ..., -8.4713e-06, 3.5223e-06, 1.9651e-07], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 4.5635e-08, 8.5160e-06, 4.7637e-07], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 3.1991e-07, 4.1699e-04, 1.1943e-05], ..., [-1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, 6.0908e-06, 3.2037e-07], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 3.3015e-07, 3.4451e-04, -5.9865e-06], [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 1.1874e-07, 1.0878e-05, -4.9509e-06]], device='cuda:0') Epoch 106, bias, value: tensor([ 0.0079, -0.0162, -0.0014, 0.0307, -0.0056, 0.0318, 0.0041, 0.0172, 0.0024, -0.0110], device='cuda:0'), grad: tensor([ 3.6210e-05, 2.6166e-05, 1.3266e-03, -1.2665e-03, 3.3522e-04, -7.8011e-03, 7.4005e-03, 2.6822e-05, 4.2844e-04, -5.0879e-04], device='cuda:0') 100 0.0001 changing lr epoch 105, time 214.52, cls_loss 0.0054 cls_loss_mapping 0.0087 cls_loss_causal 0.5805 re_mapping 0.0090 re_causal 0.0266 /// teacc 98.89 lr 0.00010000 Epoch 107, weight, value: tensor([[-0.0251, 0.0007, -0.0448, ..., 0.0135, -0.0251, -0.0017], [ 0.0342, 0.0110, -0.0185, ..., 0.0220, -0.0252, -0.1974], [-0.0107, 0.0169, 0.0103, ..., -0.0566, 0.0742, -0.0426], ..., [-0.0003, -0.0123, -0.0020, ..., -0.0628, -0.1038, 0.0245], [-0.0161, 0.0005, -0.0149, ..., -0.0646, 0.0386, -0.1360], [-0.0812, -0.0009, 0.0041, ..., -0.0735, -0.0708, -0.0695]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.3270e-04, 1.7798e-06, 5.4017e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.2519e-06, 2.0236e-05, 2.6040e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0682e-06, -4.5180e-05, 3.3528e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.1269e-07, 9.6038e-06, 1.2383e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6550e-06, -1.9118e-05, -1.4400e-04], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.5640e-06, 2.1040e-05, 1.3161e-04]], device='cuda:0') Epoch 107, bias, value: tensor([ 0.0072, -0.0160, -0.0023, 0.0327, -0.0057, 0.0331, 0.0030, 0.0168, 0.0016, -0.0106], device='cuda:0'), grad: tensor([-4.9448e-04, 3.3438e-05, -6.7294e-05, 3.5703e-05, 6.5565e-06, 7.7263e-06, 4.6253e-04, 5.6535e-05, -5.5933e-04, 5.1785e-04], device='cuda:0') 100 0.0001 changing lr epoch 106, time 214.70, cls_loss 0.0033 cls_loss_mapping 0.0068 cls_loss_causal 0.5594 re_mapping 0.0088 re_causal 0.0267 /// teacc 98.87 lr 0.00010000 Epoch 108, weight, value: tensor([[-2.5069e-02, 6.7492e-04, -4.4982e-02, ..., 1.2651e-02, -2.7238e-02, -1.8463e-03], [ 3.3570e-02, 1.0880e-02, -1.8550e-02, ..., 2.1666e-02, -2.5546e-02, -1.9796e-01], [-1.0710e-02, 1.7071e-02, 1.0493e-02, ..., -5.6933e-02, 7.4878e-02, -4.2716e-02], ..., [-4.2557e-04, -1.2446e-02, -2.2267e-03, ..., -6.2856e-02, -1.0518e-01, 2.4449e-02], [-1.4771e-02, 1.0695e-04, -1.4874e-02, ..., -6.5176e-02, 3.8106e-02, -1.3612e-01], [-8.1334e-02, -9.2197e-04, 4.0446e-03, ..., -7.3927e-02, -7.1795e-02, -6.9576e-02]], device='cuda:0'), grad: tensor([[ 5.1223e-09, 0.0000e+00, 1.8626e-09, ..., -7.9572e-06, -3.5530e-07, 4.5635e-08], [-1.2061e-07, 0.0000e+00, 2.3283e-09, ..., 1.0431e-07, 1.3877e-06, -2.2352e-08], [ 1.6764e-08, 0.0000e+00, -5.2620e-08, ..., 3.4180e-07, -1.0289e-05, 1.0058e-07], ..., [ 1.1176e-08, 0.0000e+00, 8.8476e-09, ..., 1.0896e-07, 5.8524e-06, 5.3458e-07], [ 2.0489e-08, 0.0000e+00, 9.3132e-09, ..., 1.3132e-06, 3.3081e-05, 5.3179e-07], [ 4.6566e-09, 0.0000e+00, 4.6566e-10, ..., 6.5565e-07, -3.8624e-05, 1.5795e-06]], device='cuda:0') Epoch 108, bias, value: tensor([ 0.0069, -0.0162, -0.0018, 0.0315, -0.0056, 0.0343, 0.0035, 0.0168, 0.0013, -0.0112], device='cuda:0'), grad: tensor([-8.1360e-06, -6.9812e-06, -1.0073e-05, 2.4617e-05, 4.1425e-06, 9.6262e-06, 9.7528e-06, -6.5006e-06, 1.6391e-04, -1.8013e-04], device='cuda:0') 100 0.0001 changing lr epoch 107, time 214.63, cls_loss 0.0040 cls_loss_mapping 0.0090 cls_loss_causal 0.5833 re_mapping 0.0082 re_causal 0.0254 /// teacc 98.91 lr 0.00010000 Epoch 109, weight, value: tensor([[-2.5073e-02, 7.4129e-04, -4.5403e-02, ..., 1.3788e-02, -2.7527e-02, -1.8825e-03], [ 3.3599e-02, 1.0822e-02, -1.7457e-02, ..., 2.1449e-02, -2.5714e-02, -1.9908e-01], [-1.0718e-02, 1.7069e-02, 1.0269e-02, ..., -5.7128e-02, 7.5361e-02, -4.2705e-02], ..., [-4.4699e-04, -1.2452e-02, -2.4399e-03, ..., -6.3208e-02, -1.0603e-01, 2.6023e-02], [-1.4842e-02, 1.1402e-05, -1.4924e-02, ..., -6.6743e-02, 3.7627e-02, -1.3680e-01], [-8.1438e-02, -9.2831e-04, 3.6368e-03, ..., -7.5008e-02, -7.2738e-02, -6.9386e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -7.0892e-06, -3.7719e-06, 6.3796e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.1204e-06, 8.1398e-07, -6.7018e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.1898e-07, -3.1322e-05, 8.3912e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4948e-07, 7.9796e-06, 5.4110e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.7696e-07, 1.1154e-05, 6.5658e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.1409e-06, 4.1798e-06, 2.2352e-07]], device='cuda:0') Epoch 109, bias, value: tensor([ 0.0077, -0.0156, -0.0016, 0.0307, -0.0062, 0.0342, 0.0040, 0.0168, 0.0003, -0.0108], device='cuda:0'), grad: tensor([-1.6332e-05, -7.3254e-05, -2.8506e-05, 1.7822e-05, 5.1588e-05, 5.5619e-06, 7.8008e-06, -1.0133e-05, 2.6941e-05, 1.8537e-05], device='cuda:0') 100 0.0001 changing lr epoch 108, time 214.66, cls_loss 0.0044 cls_loss_mapping 0.0092 cls_loss_causal 0.5710 re_mapping 0.0083 re_causal 0.0249 /// teacc 98.90 lr 0.00010000 Epoch 110, weight, value: tensor([[-2.5049e-02, 8.3833e-04, -4.5989e-02, ..., 1.3796e-02, -2.8239e-02, -2.1544e-03], [ 3.3710e-02, 1.0774e-02, -1.8599e-02, ..., 2.1232e-02, -2.6346e-02, -2.0078e-01], [-1.0807e-02, 1.7056e-02, 1.2040e-02, ..., -5.7559e-02, 7.5872e-02, -4.2708e-02], ..., [-5.2193e-04, -1.2455e-02, -2.6422e-03, ..., -6.3510e-02, -1.0695e-01, 2.5565e-02], [-1.4706e-02, -1.2762e-04, -1.5102e-02, ..., -6.7118e-02, 3.8047e-02, -1.3674e-01], [-8.1574e-02, -9.3937e-04, 3.9050e-03, ..., -7.5589e-02, -7.3453e-02, -6.9350e-02]], device='cuda:0'), grad: tensor([[ 2.5192e-07, 0.0000e+00, 0.0000e+00, ..., -1.9949e-06, 5.9279e-07, 5.0478e-07], [-4.1388e-06, 0.0000e+00, 0.0000e+00, ..., 6.4727e-08, 7.6042e-07, -7.1675e-06], [ 2.2501e-06, 0.0000e+00, 0.0000e+00, ..., 2.1048e-07, -4.0829e-06, 3.9935e-06], ..., [ 5.4995e-07, 0.0000e+00, 0.0000e+00, ..., 2.7474e-08, 2.7996e-06, 1.0645e-06], [ 3.0966e-07, 0.0000e+00, 0.0000e+00, ..., 3.7020e-07, -1.0565e-05, 7.0687e-07], [ 1.5367e-08, 0.0000e+00, 0.0000e+00, ..., 5.1688e-07, 4.2953e-06, 5.9092e-07]], device='cuda:0') Epoch 110, bias, value: tensor([ 0.0069, -0.0164, -0.0016, 0.0300, -0.0058, 0.0344, 0.0040, 0.0174, 0.0004, -0.0106], device='cuda:0'), grad: tensor([-1.0459e-06, -2.4796e-05, 7.4394e-06, 5.2787e-06, 1.3195e-05, 4.0829e-06, 8.0168e-06, 4.7684e-06, -2.0519e-05, 3.5875e-06], device='cuda:0') 100 0.0001 changing lr epoch 109, time 214.86, cls_loss 0.0034 cls_loss_mapping 0.0062 cls_loss_causal 0.5652 re_mapping 0.0082 re_causal 0.0245 /// teacc 98.85 lr 0.00010000 Epoch 111, weight, value: tensor([[-0.0251, 0.0012, -0.0464, ..., 0.0139, -0.0290, -0.0023], [ 0.0337, 0.0106, -0.0189, ..., 0.0211, -0.0266, -0.2010], [-0.0108, 0.0171, 0.0118, ..., -0.0579, 0.0770, -0.0420], ..., [-0.0006, -0.0125, -0.0029, ..., -0.0636, -0.1091, 0.0252], [-0.0147, -0.0007, -0.0141, ..., -0.0671, 0.0383, -0.1371], [-0.0816, -0.0010, 0.0038, ..., -0.0760, -0.0739, -0.0696]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8375e-06, ..., -1.3880e-05, -2.8573e-06, -2.3749e-08], [ 0.0000e+00, 0.0000e+00, 3.8743e-06, ..., 1.1493e-06, 1.4655e-05, 3.9535e-07], [ 0.0000e+00, 0.0000e+00, 4.1090e-06, ..., 7.1973e-06, -2.8059e-05, 1.8766e-07], ..., [ 0.0000e+00, 0.0000e+00, 1.6570e-05, ..., 3.7961e-06, 6.0707e-05, 3.6415e-07], [ 0.0000e+00, 0.0000e+00, 1.3113e-05, ..., 3.9302e-06, 3.4243e-05, 5.6811e-08], [ 0.0000e+00, 0.0000e+00, 9.0944e-07, ..., 2.9579e-06, 1.0118e-05, 9.9279e-07]], device='cuda:0') Epoch 111, bias, value: tensor([ 0.0068, -0.0162, -0.0009, 0.0299, -0.0053, 0.0338, 0.0042, 0.0166, 0.0003, -0.0107], device='cuda:0'), grad: tensor([-1.0699e-04, 3.8534e-05, -3.9577e-05, -1.8158e-03, 1.4029e-05, 1.5745e-03, 5.3167e-05, 1.4198e-04, 7.8559e-05, 6.1005e-05], device='cuda:0') 100 0.0001 changing lr epoch 110, time 214.54, cls_loss 0.0046 cls_loss_mapping 0.0073 cls_loss_causal 0.5659 re_mapping 0.0081 re_causal 0.0246 /// teacc 99.01 lr 0.00010000 Epoch 112, weight, value: tensor([[-0.0243, 0.0016, -0.0517, ..., 0.0144, -0.0296, -0.0022], [ 0.0342, 0.0103, -0.0185, ..., 0.0210, -0.0284, -0.2016], [-0.0109, 0.0170, 0.0103, ..., -0.0585, 0.0784, -0.0424], ..., [-0.0011, -0.0128, -0.0055, ..., -0.0641, -0.1099, 0.0252], [-0.0149, -0.0021, -0.0156, ..., -0.0676, 0.0386, -0.1373], [-0.0822, -0.0010, 0.0027, ..., -0.0764, -0.0746, -0.0697]], device='cuda:0'), grad: tensor([[ 5.5879e-09, -6.1002e-08, 0.0000e+00, ..., 5.8394e-07, 3.1851e-06, 1.3970e-09], [-1.3551e-07, 1.3970e-09, 0.0000e+00, ..., 5.8208e-07, 2.4494e-06, 3.7253e-09], [ 1.3970e-08, 1.0245e-08, 0.0000e+00, ..., 3.7011e-06, -2.1998e-06, 5.5879e-09], ..., [ 2.6077e-08, 4.6566e-10, 0.0000e+00, ..., 6.3796e-08, 2.5425e-06, 3.7253e-08], [ 4.2375e-08, 7.9162e-09, 0.0000e+00, ..., 1.2368e-06, 9.8813e-07, 4.1910e-09], [ 3.2596e-09, 8.8476e-09, 0.0000e+00, ..., 3.9348e-07, 1.2014e-06, 2.3749e-08]], device='cuda:0') Epoch 112, bias, value: tensor([ 0.0067, -0.0163, -0.0004, 0.0302, -0.0053, 0.0339, 0.0039, 0.0160, 0.0014, -0.0110], device='cuda:0'), grad: tensor([ 4.3893e-04, -1.2379e-03, 1.1230e-04, 8.2552e-05, 8.8334e-05, 1.9324e-04, 6.1035e-05, 1.1814e-04, 5.4598e-04, -4.0197e-04], device='cuda:0') 100 0.0001 changing lr epoch 111, time 214.56, cls_loss 0.0034 cls_loss_mapping 0.0054 cls_loss_causal 0.5623 re_mapping 0.0080 re_causal 0.0245 /// teacc 98.92 lr 0.00010000 Epoch 113, weight, value: tensor([[-0.0256, 0.0034, -0.0529, ..., 0.0142, -0.0306, -0.0023], [ 0.0357, 0.0086, -0.0199, ..., 0.0209, -0.0286, -0.2018], [-0.0111, 0.0177, 0.0115, ..., -0.0592, 0.0787, -0.0423], ..., [-0.0021, -0.0141, -0.0057, ..., -0.0646, -0.1107, 0.0251], [-0.0137, -0.0071, -0.0148, ..., -0.0679, 0.0391, -0.1378], [-0.0839, -0.0014, 0.0020, ..., -0.0766, -0.0757, -0.0697]], device='cuda:0'), grad: tensor([[ 1.7602e-07, 0.0000e+00, 9.3132e-10, ..., 1.3290e-06, 2.2911e-06, 3.4459e-08], [ 1.9044e-05, 0.0000e+00, 4.6566e-10, ..., 3.6806e-06, 9.3728e-06, 2.5705e-07], [ 9.7975e-07, 0.0000e+00, 0.0000e+00, ..., 2.7776e-05, 3.2306e-05, 6.0536e-08], ..., [-2.5526e-05, 0.0000e+00, 4.6566e-10, ..., 1.2619e-07, 5.6252e-06, 7.9907e-07], [ 2.1067e-06, 0.0000e+00, 2.7940e-09, ..., 4.6119e-06, 4.8727e-06, 1.0617e-07], [ 1.3318e-06, 0.0000e+00, 2.3283e-09, ..., 6.0024e-07, 1.3411e-06, 9.6783e-06]], device='cuda:0') Epoch 113, bias, value: tensor([ 0.0062, -0.0161, -0.0007, 0.0311, -0.0053, 0.0332, 0.0039, 0.0161, 0.0014, -0.0110], device='cuda:0'), grad: tensor([ 9.0376e-06, 3.5620e-04, 6.4909e-05, 2.8461e-05, 8.8215e-05, 1.3661e-04, -3.0351e-04, -4.1509e-04, 4.9293e-05, -1.3962e-05], device='cuda:0') 100 0.0001 changing lr epoch 112, time 214.32, cls_loss 0.0030 cls_loss_mapping 0.0065 cls_loss_causal 0.5427 re_mapping 0.0081 re_causal 0.0240 /// teacc 98.89 lr 0.00010000 Epoch 114, weight, value: tensor([[-0.0257, 0.0041, -0.0554, ..., 0.0142, -0.0311, -0.0023], [ 0.0360, 0.0079, -0.0197, ..., 0.0207, -0.0286, -0.2020], [-0.0111, 0.0180, 0.0108, ..., -0.0598, 0.0789, -0.0427], ..., [-0.0019, -0.0143, -0.0091, ..., -0.0647, -0.1115, 0.0255], [-0.0138, -0.0085, -0.0167, ..., -0.0683, 0.0391, -0.1379], [-0.0851, -0.0016, 0.0009, ..., -0.0767, -0.0762, -0.0697]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 0.0000e+00, 0.0000e+00, ..., -1.6168e-06, 7.5717e-07, 3.9581e-08], [ 6.6590e-08, 0.0000e+00, 0.0000e+00, ..., 8.7637e-07, 1.7602e-06, 2.9849e-07], [ 3.5390e-08, 0.0000e+00, 0.0000e+00, ..., 2.6263e-07, -1.0394e-06, 1.1176e-07], ..., [ 7.6834e-08, 0.0000e+00, 0.0000e+00, ..., 4.9826e-08, 4.3400e-07, 2.5379e-07], [ 2.8405e-08, 0.0000e+00, 0.0000e+00, ..., 2.9784e-06, -2.5164e-06, 1.6345e-07], [ 9.5461e-08, 0.0000e+00, 0.0000e+00, ..., 1.1809e-06, 2.7157e-06, 1.1409e-06]], device='cuda:0') Epoch 114, bias, value: tensor([ 0.0059, -0.0156, -0.0010, 0.0316, -0.0055, 0.0332, 0.0040, 0.0161, 0.0010, -0.0110], device='cuda:0'), grad: tensor([ 1.5152e-04, 3.7923e-06, 2.4930e-05, 3.1255e-06, 6.1810e-05, 7.1712e-07, -3.3230e-06, 1.0699e-05, -3.0585e-06, -2.5010e-04], device='cuda:0') 100 0.0001 changing lr epoch 113, time 214.38, cls_loss 0.0036 cls_loss_mapping 0.0075 cls_loss_causal 0.5540 re_mapping 0.0073 re_causal 0.0225 /// teacc 98.84 lr 0.00010000 Epoch 115, weight, value: tensor([[-2.5900e-02, 4.0682e-03, -5.6205e-02, ..., 1.4995e-02, -3.0818e-02, -2.4174e-03], [ 3.7785e-02, 7.2656e-03, -1.9819e-02, ..., 2.0514e-02, -2.9193e-02, -2.0231e-01], [-1.1750e-02, 1.9344e-02, 1.0618e-02, ..., -6.0270e-02, 7.9521e-02, -4.2877e-02], ..., [-1.5224e-03, -1.5982e-02, -9.0951e-03, ..., -6.6428e-02, -1.1237e-01, 2.5568e-02], [-1.5179e-02, -9.6954e-03, -1.7284e-02, ..., -6.8071e-02, 3.9919e-02, -1.3824e-01], [-8.7812e-02, -1.6461e-03, -8.8172e-05, ..., -7.6994e-02, -7.6684e-02, -6.9680e-02]], device='cuda:0'), grad: tensor([[ 4.0978e-08, 6.1048e-07, 1.5832e-08, ..., -1.0714e-05, 1.0483e-05, 8.8941e-08], [-1.5553e-06, 6.2864e-08, 1.0245e-08, ..., 5.6205e-07, 1.9282e-05, 1.6997e-07], [ 2.0070e-07, -4.1015e-06, 3.7253e-09, ..., 6.6450e-07, -3.4523e-04, 4.7497e-08], ..., [ 3.1106e-07, 2.6673e-06, 1.3504e-08, ..., 3.5390e-07, 2.6202e-04, 5.3737e-07], [ 2.4121e-07, 3.4878e-07, 3.3528e-08, ..., 9.1316e-07, 1.0572e-05, 1.2852e-07], [ 1.1222e-07, 3.1199e-08, 6.2399e-08, ..., 3.2503e-06, 8.4713e-06, -2.6617e-06]], device='cuda:0') Epoch 115, bias, value: tensor([ 0.0065, -0.0161, -0.0010, 0.0310, -0.0056, 0.0327, 0.0040, 0.0166, 0.0015, -0.0108], device='cuda:0'), grad: tensor([ 1.1064e-05, 4.6134e-05, -1.0090e-03, 6.3479e-05, 3.6299e-05, 9.3654e-06, 1.4737e-05, 7.7963e-04, 3.0577e-05, 1.6555e-05], device='cuda:0') 100 0.0001 changing lr epoch 114, time 214.27, cls_loss 0.0029 cls_loss_mapping 0.0052 cls_loss_causal 0.5182 re_mapping 0.0073 re_causal 0.0226 /// teacc 98.85 lr 0.00010000 Epoch 116, weight, value: tensor([[-0.0257, 0.0047, -0.0569, ..., 0.0169, -0.0309, -0.0026], [ 0.0389, 0.0049, -0.0195, ..., 0.0200, -0.0295, -0.2026], [-0.0118, 0.0172, 0.0105, ..., -0.0609, 0.0798, -0.0433], ..., [-0.0020, -0.0201, -0.0091, ..., -0.0669, -0.1132, 0.0254], [-0.0153, -0.0134, -0.0149, ..., -0.0682, 0.0403, -0.1389], [-0.0899, -0.0018, -0.0008, ..., -0.0779, -0.0771, -0.0699]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 7.2690e-07, 3.6173e-06, 1.5646e-07], [-1.9092e-08, 0.0000e+00, 0.0000e+00, ..., 5.1828e-07, 1.4557e-06, 8.1351e-07], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 7.7439e-07, -5.2080e-06, 6.3609e-07], ..., [ 5.5879e-09, 0.0000e+00, 0.0000e+00, ..., 1.4761e-07, 6.2250e-06, 7.6042e-07], [ 6.0536e-09, 0.0000e+00, 0.0000e+00, ..., 1.0774e-05, 1.9461e-05, 2.7288e-07], [ 9.3132e-10, 0.0000e+00, 4.6566e-10, ..., 3.5111e-06, 7.3984e-06, 3.2764e-06]], device='cuda:0') Epoch 116, bias, value: tensor([ 0.0081, -0.0160, -0.0015, 0.0317, -0.0056, 0.0325, 0.0035, 0.0164, 0.0015, -0.0109], device='cuda:0'), grad: tensor([ 5.7444e-06, 5.5917e-06, -3.5446e-06, 2.8467e-04, -5.8487e-06, -1.6093e-04, -1.7917e-04, -4.2878e-06, 3.9428e-05, 1.7807e-05], device='cuda:0') 100 0.0001 changing lr epoch 115, time 214.36, cls_loss 0.0041 cls_loss_mapping 0.0063 cls_loss_causal 0.5766 re_mapping 0.0074 re_causal 0.0238 /// teacc 99.02 lr 0.00010000 Epoch 117, weight, value: tensor([[-0.0259, 0.0048, -0.0573, ..., 0.0167, -0.0321, -0.0028], [ 0.0395, 0.0048, -0.0193, ..., 0.0198, -0.0299, -0.2051], [-0.0121, 0.0171, 0.0105, ..., -0.0622, 0.0804, -0.0436], ..., [-0.0025, -0.0201, -0.0091, ..., -0.0671, -0.1144, 0.0256], [-0.0152, -0.0136, -0.0150, ..., -0.0683, 0.0407, -0.1410], [-0.0919, -0.0018, -0.0015, ..., -0.0781, -0.0763, -0.0699]], device='cuda:0'), grad: tensor([[ 1.1176e-08, 0.0000e+00, 0.0000e+00, ..., -3.9442e-07, 8.1677e-07, 1.2945e-07], [-1.1129e-07, 0.0000e+00, 0.0000e+00, ..., -9.3132e-09, 1.7900e-06, 5.0524e-07], [ 2.5611e-08, 0.0000e+00, 0.0000e+00, ..., 6.3330e-08, -5.7518e-06, 8.7079e-08], ..., [ 2.6077e-08, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, 1.2768e-06, 8.0420e-07], [ 1.7229e-08, 0.0000e+00, 0.0000e+00, ..., 1.0198e-07, 5.0999e-06, 2.0582e-07], [ 2.4214e-08, 0.0000e+00, 0.0000e+00, ..., 9.2201e-08, 1.4216e-05, -1.5691e-05]], device='cuda:0') Epoch 117, bias, value: tensor([ 0.0072, -0.0165, -0.0003, 0.0300, -0.0055, 0.0323, 0.0035, 0.0162, 0.0006, -0.0093], device='cuda:0'), grad: tensor([ 2.0750e-06, 2.0728e-05, -3.9302e-07, 3.1263e-05, 9.9957e-05, -6.7770e-05, 1.0423e-05, -2.8640e-05, 1.2375e-05, -7.9930e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 116---------------------------------------------------- epoch 116, time 230.55, cls_loss 0.0025 cls_loss_mapping 0.0053 cls_loss_causal 0.5874 re_mapping 0.0077 re_causal 0.0243 /// teacc 99.07 lr 0.00010000 Epoch 118, weight, value: tensor([[-0.0260, 0.0048, -0.0576, ..., 0.0171, -0.0322, -0.0025], [ 0.0391, 0.0048, -0.0200, ..., 0.0197, -0.0301, -0.2060], [-0.0109, 0.0171, 0.0101, ..., -0.0626, 0.0812, -0.0438], ..., [-0.0028, -0.0201, -0.0092, ..., -0.0673, -0.1153, 0.0255], [-0.0156, -0.0136, -0.0143, ..., -0.0682, 0.0411, -0.1421], [-0.0939, -0.0018, -0.0020, ..., -0.0783, -0.0770, -0.0703]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., -5.2661e-05, -1.6168e-05, 1.8626e-08], [-9.3132e-09, 0.0000e+00, 0.0000e+00, ..., 1.1595e-07, 1.6345e-07, 3.8650e-08], [ 4.6566e-10, -1.3970e-09, 0.0000e+00, ..., 1.6004e-05, 2.5239e-06, 1.7695e-08], ..., [ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 1.1167e-06, 2.8824e-07, 7.7300e-08], [ 2.3283e-09, 0.0000e+00, 0.0000e+00, ..., 2.8815e-06, 7.1060e-07, 2.1886e-08], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 6.8918e-06, 2.0787e-06, 6.1840e-07]], device='cuda:0') Epoch 118, bias, value: tensor([ 0.0074, -0.0171, -0.0002, 0.0301, -0.0049, 0.0319, 0.0035, 0.0170, 0.0007, -0.0099], device='cuda:0'), grad: tensor([-1.0800e-04, -5.0664e-07, 3.6120e-05, 8.5160e-06, 1.1511e-06, 1.5989e-05, 3.2127e-05, -7.6056e-05, 8.5458e-06, 8.2076e-05], device='cuda:0') 100 0.0001 changing lr epoch 117, time 214.11, cls_loss 0.0029 cls_loss_mapping 0.0061 cls_loss_causal 0.5667 re_mapping 0.0073 re_causal 0.0234 /// teacc 98.97 lr 0.00010000 Epoch 119, weight, value: tensor([[-0.0261, 0.0048, -0.0577, ..., 0.0171, -0.0327, -0.0026], [ 0.0388, 0.0047, -0.0198, ..., 0.0194, -0.0308, -0.2065], [-0.0108, 0.0171, 0.0100, ..., -0.0631, 0.0819, -0.0437], ..., [-0.0029, -0.0201, -0.0092, ..., -0.0676, -0.1154, 0.0255], [-0.0152, -0.0137, -0.0143, ..., -0.0685, 0.0412, -0.1423], [-0.0951, -0.0018, -0.0022, ..., -0.0785, -0.0773, -0.0705]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., -4.9695e-06, 5.6587e-06, 3.2652e-06], [-2.0955e-08, 0.0000e+00, 0.0000e+00, ..., 1.6866e-06, 1.0477e-06, 7.7114e-06], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 1.3802e-06, -4.0494e-06, 8.6194e-07], ..., [ 4.6566e-09, 0.0000e+00, 0.0000e+00, ..., 5.9139e-07, 2.8834e-06, -2.1338e-05], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 3.2596e-06, 7.6890e-06, 2.0536e-07], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 6.2445e-07, 1.1183e-05, 1.4059e-05]], device='cuda:0') Epoch 119, bias, value: tensor([ 0.0072, -0.0170, 0.0001, 0.0297, -0.0048, 0.0317, 0.0038, 0.0171, 0.0005, -0.0102], device='cuda:0'), grad: tensor([ 3.2693e-05, 1.1086e-04, 8.1807e-06, -2.8819e-05, -5.1200e-05, 5.2750e-05, -1.2562e-05, -2.9111e-04, 5.9277e-05, 1.1992e-04], device='cuda:0') 100 0.0001 changing lr epoch 118, time 214.49, cls_loss 0.0029 cls_loss_mapping 0.0055 cls_loss_causal 0.5789 re_mapping 0.0077 re_causal 0.0238 /// teacc 98.92 lr 0.00010000 Epoch 120, weight, value: tensor([[-0.0262, 0.0048, -0.0579, ..., 0.0180, -0.0326, -0.0027], [ 0.0389, 0.0047, -0.0196, ..., 0.0188, -0.0313, -0.2071], [-0.0108, 0.0171, 0.0098, ..., -0.0642, 0.0822, -0.0436], ..., [-0.0031, -0.0202, -0.0091, ..., -0.0680, -0.1160, 0.0255], [-0.0154, -0.0139, -0.0145, ..., -0.0694, 0.0413, -0.1428], [-0.0952, -0.0019, -0.0016, ..., -0.0789, -0.0782, -0.0713]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.3283e-08, ..., 4.7207e-04, 3.3951e-04, 2.1094e-07], [ 0.0000e+00, 0.0000e+00, 5.1223e-09, ..., -6.3609e-07, 3.2336e-06, 1.6792e-06], [ 0.0000e+00, 0.0000e+00, 5.1223e-09, ..., 8.1807e-06, 3.0864e-06, 2.4959e-07], ..., [ 0.0000e+00, 0.0000e+00, 1.9558e-08, ..., 7.8231e-07, 1.7043e-06, 2.1961e-06], [ 0.0000e+00, 0.0000e+00, 3.3528e-08, ..., 1.0595e-05, 8.0317e-06, 2.5146e-07], [ 0.0000e+00, 0.0000e+00, 8.9873e-08, ..., 3.7793e-06, 3.3546e-06, 1.4141e-05]], device='cuda:0') Epoch 120, bias, value: tensor([ 0.0080, -0.0171, 0.0003, 0.0291, -0.0035, 0.0319, 0.0033, 0.0173, 0.0002, -0.0111], device='cuda:0'), grad: tensor([ 5.2214e-04, -3.6210e-06, 1.3359e-05, -9.3997e-05, 5.8919e-05, 4.0627e-04, -1.0681e-03, 2.5332e-05, 2.8938e-05, 1.0985e-04], device='cuda:0') 100 0.0001 changing lr epoch 119, time 214.38, cls_loss 0.0032 cls_loss_mapping 0.0072 cls_loss_causal 0.5833 re_mapping 0.0077 re_causal 0.0234 /// teacc 98.82 lr 0.00010000 Epoch 121, weight, value: tensor([[-0.0260, 0.0048, -0.0580, ..., 0.0173, -0.0341, -0.0030], [ 0.0392, 0.0046, -0.0196, ..., 0.0186, -0.0316, -0.2076], [-0.0108, 0.0172, 0.0097, ..., -0.0652, 0.0826, -0.0437], ..., [-0.0032, -0.0202, -0.0091, ..., -0.0684, -0.1165, 0.0253], [-0.0155, -0.0139, -0.0145, ..., -0.0701, 0.0417, -0.1432], [-0.0955, -0.0019, -0.0021, ..., -0.0791, -0.0788, -0.0714]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.8673e-08, 0.0000e+00, ..., 7.1190e-06, 7.1004e-06, 3.6322e-08], [ 0.0000e+00, 1.0105e-07, 0.0000e+00, ..., 5.8860e-07, 1.1595e-06, 1.8906e-07], [ 0.0000e+00, -9.6206e-07, 0.0000e+00, ..., 1.6410e-06, -3.3733e-06, 6.9384e-08], ..., [ 0.0000e+00, 2.9849e-07, 0.0000e+00, ..., 1.1735e-07, 1.6736e-06, 3.0594e-07], [ 0.0000e+00, 1.2340e-07, 0.0000e+00, ..., 1.2945e-06, 2.2762e-06, 9.9186e-08], [ 0.0000e+00, 2.6543e-08, 0.0000e+00, ..., 6.7009e-07, 1.5777e-06, 8.7544e-07]], device='cuda:0') Epoch 121, bias, value: tensor([ 0.0074, -0.0172, 0.0003, 0.0292, -0.0035, 0.0316, 0.0039, 0.0172, 0.0003, -0.0111], device='cuda:0'), grad: tensor([ 1.0736e-05, -2.5444e-06, -6.3442e-06, 1.2629e-05, 8.6203e-06, -2.1696e-05, -1.8954e-05, 5.1670e-06, 7.2978e-06, 5.0366e-06], device='cuda:0') 100 0.0001 changing lr epoch 120, time 214.22, cls_loss 0.0031 cls_loss_mapping 0.0060 cls_loss_causal 0.5597 re_mapping 0.0079 re_causal 0.0238 /// teacc 99.06 lr 0.00010000 Epoch 122, weight, value: tensor([[-0.0258, 0.0049, -0.0582, ..., 0.0167, -0.0349, -0.0032], [ 0.0392, 0.0043, -0.0197, ..., 0.0186, -0.0320, -0.2080], [-0.0107, 0.0172, 0.0095, ..., -0.0651, 0.0832, -0.0433], ..., [-0.0033, -0.0202, -0.0093, ..., -0.0684, -0.1168, 0.0254], [-0.0155, -0.0140, -0.0150, ..., -0.0705, 0.0426, -0.1432], [-0.0957, -0.0019, -0.0020, ..., -0.0796, -0.0806, -0.0714]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.7032e-08, 0.0000e+00, ..., 1.2553e-04, 3.8743e-05, 5.0142e-06], [ 0.0000e+00, 2.7940e-09, 0.0000e+00, ..., 7.8045e-07, 1.2917e-06, 4.5635e-08], [ 0.0000e+00, -7.5437e-08, 0.0000e+00, ..., 2.3004e-07, -3.7774e-06, -1.7323e-07], ..., [ 0.0000e+00, 6.0536e-09, 0.0000e+00, ..., 6.0536e-08, 1.1371e-06, 1.2433e-07], [ 0.0000e+00, 1.1642e-08, 0.0000e+00, ..., 3.5223e-06, 5.6028e-06, 8.0559e-08], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 1.5385e-06, 2.0973e-06, 1.8720e-07]], device='cuda:0') Epoch 122, bias, value: tensor([ 0.0065, -0.0167, 0.0002, 0.0299, -0.0042, 0.0317, 0.0043, 0.0167, 0.0011, -0.0114], device='cuda:0'), grad: tensor([ 2.3592e-04, 4.2915e-06, -6.6869e-06, 4.1947e-06, 5.2392e-05, -8.1658e-05, -2.4045e-04, 2.0880e-06, 2.9489e-05, 2.2585e-07], device='cuda:0') 100 0.0001 changing lr epoch 121, time 214.15, cls_loss 0.0036 cls_loss_mapping 0.0063 cls_loss_causal 0.5443 re_mapping 0.0079 re_causal 0.0224 /// teacc 99.03 lr 0.00010000 Epoch 123, weight, value: tensor([[-0.0259, 0.0039, -0.0583, ..., 0.0163, -0.0362, -0.0032], [ 0.0395, 0.0035, -0.0197, ..., 0.0184, -0.0324, -0.2089], [-0.0099, 0.0173, 0.0095, ..., -0.0644, 0.0825, -0.0430], ..., [-0.0035, -0.0203, -0.0093, ..., -0.0693, -0.1177, 0.0253], [-0.0158, -0.0158, -0.0150, ..., -0.0727, 0.0437, -0.1437], [-0.0959, -0.0019, -0.0023, ..., -0.0796, -0.0813, -0.0715]], device='cuda:0'), grad: tensor([[ 2.2212e-07, 0.0000e+00, 0.0000e+00, ..., 1.3433e-05, 1.6436e-05, 4.0792e-06], [-2.3395e-05, 0.0000e+00, 0.0000e+00, ..., 8.0019e-06, 3.2112e-06, -6.2995e-06], [ 1.2383e-05, 0.0000e+00, 0.0000e+00, ..., 1.4149e-05, 5.6356e-05, 9.4324e-06], ..., [ 3.8743e-07, 0.0000e+00, 0.0000e+00, ..., 1.4566e-06, 4.9062e-06, 1.5507e-07], [ 6.9439e-06, 0.0000e+00, 0.0000e+00, ..., -1.1408e-04, -3.7432e-04, -3.4332e-05], [ 4.9826e-08, 0.0000e+00, 0.0000e+00, ..., 2.1197e-06, 3.1125e-06, 1.1548e-06]], device='cuda:0') Epoch 123, bias, value: tensor([ 0.0056, -0.0160, -0.0009, 0.0295, -0.0042, 0.0324, 0.0049, 0.0170, 0.0010, -0.0116], device='cuda:0'), grad: tensor([ 3.3647e-05, -1.5843e-04, 1.8728e-04, 4.9062e-06, 6.3539e-05, 3.5286e-04, 6.2466e-05, 2.9523e-07, -5.5885e-04, 1.1683e-05], device='cuda:0') 100 0.0001 changing lr epoch 122, time 214.19, cls_loss 0.0049 cls_loss_mapping 0.0085 cls_loss_causal 0.5761 re_mapping 0.0083 re_causal 0.0234 /// teacc 98.88 lr 0.00010000 Epoch 124, weight, value: tensor([[-0.0258, 0.0039, -0.0587, ..., 0.0169, -0.0342, -0.0028], [ 0.0403, 0.0032, -0.0197, ..., 0.0181, -0.0326, -0.2102], [-0.0095, 0.0174, 0.0093, ..., -0.0666, 0.0833, -0.0437], ..., [-0.0042, -0.0204, -0.0094, ..., -0.0701, -0.1198, 0.0269], [-0.0162, -0.0160, -0.0145, ..., -0.0737, 0.0435, -0.1439], [-0.0973, -0.0019, -0.0026, ..., -0.0774, -0.0824, -0.0715]], device='cuda:0'), grad: tensor([[ 8.7311e-08, 2.3283e-09, 2.3283e-08, ..., -1.8150e-05, 1.0058e-06, -1.6242e-05], [-6.4773e-07, 5.5879e-09, 5.0990e-08, ..., 1.1437e-06, 1.4631e-06, 6.6422e-06], [ 7.0548e-08, -3.3062e-08, 2.3982e-08, ..., 1.2629e-06, 4.3819e-07, 2.3488e-06], ..., [ 2.9523e-07, 3.9581e-09, 6.9849e-09, ..., 3.0338e-07, 2.9965e-07, 2.8647e-06], [ 1.6601e-07, 7.2177e-09, 3.6485e-07, ..., 9.1866e-06, 6.2399e-06, 1.0133e-05], [ 1.8487e-07, 6.9849e-10, 4.8894e-08, ..., 5.3979e-06, 2.4997e-06, 3.3647e-05]], device='cuda:0') Epoch 124, bias, value: tensor([ 0.0054, -0.0162, -0.0007, 0.0299, -0.0051, 0.0323, 0.0041, 0.0176, 0.0002, -0.0104], device='cuda:0'), grad: tensor([-6.5744e-05, 2.6494e-05, 1.4573e-05, 2.4170e-05, -9.9421e-05, 1.8775e-05, -4.1187e-05, -4.1306e-05, 5.3287e-05, 1.1009e-04], device='cuda:0') 100 0.0001 changing lr epoch 123, time 214.18, cls_loss 0.0048 cls_loss_mapping 0.0080 cls_loss_causal 0.5402 re_mapping 0.0083 re_causal 0.0226 /// teacc 98.92 lr 0.00010000 Epoch 125, weight, value: tensor([[-0.0261, 0.0035, -0.0590, ..., 0.0168, -0.0352, -0.0029], [ 0.0420, 0.0003, -0.0200, ..., 0.0168, -0.0330, -0.2115], [-0.0089, 0.0159, 0.0089, ..., -0.0669, 0.0842, -0.0411], ..., [-0.0022, -0.0208, -0.0095, ..., -0.0715, -0.1214, 0.0270], [-0.0175, -0.0182, -0.0142, ..., -0.0746, 0.0435, -0.1444], [-0.1003, -0.0020, -0.0029, ..., -0.0776, -0.0837, -0.0710]], device='cuda:0'), grad: tensor([[ 3.6089e-08, 0.0000e+00, 0.0000e+00, ..., 1.4454e-05, 1.3024e-05, 1.1665e-07], [-1.6959e-06, 0.0000e+00, 0.0000e+00, ..., -4.7572e-06, 6.0424e-06, -5.5097e-06], [ 7.1479e-08, 0.0000e+00, 0.0000e+00, ..., 1.2917e-06, -1.9109e-04, 1.4743e-06], ..., [ 5.7183e-07, 0.0000e+00, 0.0000e+00, ..., 2.3888e-07, 2.6450e-06, 1.6997e-07], [ 6.3330e-08, 0.0000e+00, 0.0000e+00, ..., 5.3365e-07, 1.7452e-04, 1.1688e-07], [ 1.7346e-07, 0.0000e+00, 0.0000e+00, ..., 4.3027e-06, 3.3388e-07, 4.7055e-07]], device='cuda:0') Epoch 125, bias, value: tensor([ 0.0055, -0.0157, -0.0004, 0.0305, -0.0061, 0.0321, 0.0048, 0.0173, -0.0004, -0.0110], device='cuda:0'), grad: tensor([ 1.4938e-05, -5.9694e-05, -2.6798e-04, 1.0811e-05, 2.2218e-05, 6.5714e-06, -5.8822e-06, 3.9265e-06, 2.6274e-04, 1.2212e-05], device='cuda:0') 100 0.0001 changing lr epoch 124, time 214.21, cls_loss 0.0033 cls_loss_mapping 0.0057 cls_loss_causal 0.5346 re_mapping 0.0076 re_causal 0.0232 /// teacc 99.02 lr 0.00010000 Epoch 126, weight, value: tensor([[-0.0260, 0.0032, -0.0598, ..., 0.0170, -0.0354, -0.0030], [ 0.0428, -0.0014, -0.0198, ..., 0.0169, -0.0331, -0.2134], [-0.0090, 0.0163, 0.0083, ..., -0.0667, 0.0845, -0.0410], ..., [-0.0022, -0.0217, -0.0097, ..., -0.0722, -0.1223, 0.0265], [-0.0169, -0.0212, -0.0142, ..., -0.0748, 0.0440, -0.1444], [-0.1021, -0.0034, -0.0036, ..., -0.0778, -0.0850, -0.0714]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.6196e-06, -7.8231e-08, 2.3283e-09], [-2.3283e-09, 0.0000e+00, 0.0000e+00, ..., 5.3085e-08, 5.8673e-08, 3.0268e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.5181e-07, 7.4971e-08, 5.5879e-09], ..., [ 2.3283e-09, 0.0000e+00, 0.0000e+00, ..., 3.3528e-08, 7.1712e-08, 2.3749e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.2480e-07, 2.6869e-07, 3.0268e-08], [ 4.1910e-09, 0.0000e+00, 0.0000e+00, ..., 6.8313e-07, 1.0431e-07, 8.7544e-08]], device='cuda:0') Epoch 126, bias, value: tensor([ 0.0055, -0.0147, -0.0013, 0.0301, -0.0056, 0.0319, 0.0048, 0.0170, -0.0002, -0.0112], device='cuda:0'), grad: tensor([-1.1455e-06, 5.2489e-06, 4.3847e-06, -3.4213e-05, 9.4716e-07, 3.0369e-05, 8.0932e-07, -2.0131e-05, 7.6443e-06, 6.0610e-06], device='cuda:0') 100 0.0001 changing lr epoch 125, time 214.50, cls_loss 0.0044 cls_loss_mapping 0.0067 cls_loss_causal 0.5498 re_mapping 0.0074 re_causal 0.0210 /// teacc 98.99 lr 0.00010000 Epoch 127, weight, value: tensor([[-0.0263, 0.0032, -0.0613, ..., 0.0172, -0.0360, -0.0033], [ 0.0415, -0.0016, -0.0205, ..., 0.0164, -0.0335, -0.2144], [-0.0090, 0.0163, 0.0072, ..., -0.0672, 0.0849, -0.0411], ..., [-0.0025, -0.0219, -0.0098, ..., -0.0725, -0.1234, 0.0262], [-0.0147, -0.0216, -0.0132, ..., -0.0749, 0.0443, -0.1448], [-0.1044, -0.0035, -0.0033, ..., -0.0780, -0.0835, -0.0711]], device='cuda:0'), grad: tensor([[ 6.0536e-09, 1.3039e-08, 3.7253e-08, ..., -4.7795e-06, 1.9427e-06, -1.0975e-05], [-9.3132e-08, 1.1642e-08, -3.1050e-06, ..., 1.9372e-07, 4.3735e-06, 5.8813e-07], [ 1.6764e-08, -1.7090e-07, 1.5972e-07, ..., 3.7765e-07, -5.2415e-06, -1.3746e-05], ..., [ 4.0513e-08, 4.4238e-08, 9.3412e-07, ..., 1.3039e-07, 1.4499e-05, 9.0748e-06], [ 1.2573e-08, 2.9337e-08, 1.3784e-07, ..., 3.3583e-06, -3.6597e-05, 1.3802e-06], [ 1.0245e-08, 3.7253e-09, 7.6834e-07, ..., 7.8827e-06, 2.0355e-05, 6.1989e-06]], device='cuda:0') Epoch 127, bias, value: tensor([ 0.0053, -0.0147, -0.0013, 0.0301, -0.0066, 0.0320, 0.0036, 0.0171, -0.0006, -0.0093], device='cuda:0'), grad: tensor([-2.5138e-05, -6.7912e-06, -3.6713e-06, 1.2234e-05, 1.0461e-05, -7.2181e-05, 5.4657e-05, 3.4511e-05, -6.7174e-05, 6.2943e-05], device='cuda:0') 100 0.0001 changing lr epoch 126, time 214.37, cls_loss 0.0026 cls_loss_mapping 0.0053 cls_loss_causal 0.5637 re_mapping 0.0075 re_causal 0.0235 /// teacc 98.92 lr 0.00010000 Epoch 128, weight, value: tensor([[-0.0264, 0.0028, -0.0622, ..., 0.0173, -0.0366, -0.0032], [ 0.0424, -0.0020, -0.0204, ..., 0.0162, -0.0338, -0.2150], [-0.0090, 0.0164, 0.0071, ..., -0.0675, 0.0851, -0.0415], ..., [-0.0036, -0.0220, -0.0099, ..., -0.0727, -0.1239, 0.0263], [-0.0147, -0.0221, -0.0137, ..., -0.0752, 0.0444, -0.1451], [-0.1071, -0.0035, -0.0036, ..., -0.0782, -0.0840, -0.0723]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 8.1882e-06, 1.2785e-05, 5.6550e-06], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 1.6764e-07, 8.9174e-07, 8.4750e-07], [ 0.0000e+00, -6.0536e-09, 0.0000e+00, ..., 2.0117e-06, -1.4111e-05, 1.3094e-06], ..., [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 9.7789e-09, 1.6121e-06, -3.9581e-08], [ 0.0000e+00, 4.6566e-10, 4.6566e-10, ..., 2.0508e-06, 1.6004e-05, 1.5618e-06], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 3.3388e-07, 5.3411e-07, -5.9092e-07]], device='cuda:0') Epoch 128, bias, value: tensor([ 0.0053, -0.0145, -0.0012, 0.0303, -0.0047, 0.0318, 0.0038, 0.0168, -0.0007, -0.0109], device='cuda:0'), grad: tensor([ 2.5272e-05, 1.3757e-04, -1.8492e-05, -5.7854e-06, 2.6762e-05, 7.6219e-06, -4.7714e-05, 4.8876e-06, 5.2929e-05, -1.8299e-04], device='cuda:0') 100 0.0001 changing lr epoch 127, time 214.65, cls_loss 0.0038 cls_loss_mapping 0.0070 cls_loss_causal 0.5325 re_mapping 0.0074 re_causal 0.0209 /// teacc 98.99 lr 0.00010000 Epoch 129, weight, value: tensor([[-0.0273, 0.0028, -0.0660, ..., 0.0175, -0.0371, -0.0034], [ 0.0424, -0.0025, -0.0179, ..., 0.0161, -0.0341, -0.2164], [-0.0094, 0.0165, 0.0047, ..., -0.0683, 0.0851, -0.0437], ..., [-0.0016, -0.0221, -0.0103, ..., -0.0729, -0.1244, 0.0267], [-0.0144, -0.0222, -0.0157, ..., -0.0756, 0.0446, -0.1458], [-0.1093, -0.0035, -0.0069, ..., -0.0784, -0.0844, -0.0724]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.2864e-08, 2.1886e-08, ..., -5.8822e-06, 1.0412e-06, 4.6566e-09], [ 0.0000e+00, 9.0804e-08, 2.4904e-06, ..., 5.3085e-08, 9.4622e-06, 3.5390e-08], [ 0.0000e+00, -6.0014e-06, -2.9709e-06, ..., 1.5041e-07, -3.3408e-05, 1.4435e-08], ..., [ 4.6566e-10, 1.0617e-07, 1.7416e-07, ..., 4.7497e-08, 1.4817e-06, -1.0813e-06], [ 0.0000e+00, 5.7276e-08, 3.3062e-08, ..., 2.5984e-07, -2.7288e-07, 8.8476e-09], [ 4.6566e-10, 8.4285e-08, 5.8673e-08, ..., 4.2841e-06, 4.8243e-07, 1.3364e-07]], device='cuda:0') Epoch 129, bias, value: tensor([ 0.0055, -0.0146, -0.0018, 0.0300, -0.0038, 0.0320, 0.0041, 0.0178, -0.0008, -0.0119], device='cuda:0'), grad: tensor([-8.3670e-06, 3.0145e-05, -1.0550e-04, 6.6996e-05, 5.0440e-06, 9.8161e-07, 1.5795e-06, -1.8673e-07, -2.3888e-07, 9.6411e-06], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 128---------------------------------------------------- epoch 128, time 230.51, cls_loss 0.0021 cls_loss_mapping 0.0039 cls_loss_causal 0.5381 re_mapping 0.0075 re_causal 0.0224 /// teacc 99.10 lr 0.00010000 Epoch 130, weight, value: tensor([[-0.0273, 0.0025, -0.0672, ..., 0.0175, -0.0377, -0.0036], [ 0.0424, -0.0029, -0.0179, ..., 0.0160, -0.0342, -0.2168], [-0.0094, 0.0166, 0.0049, ..., -0.0686, 0.0856, -0.0434], ..., [-0.0015, -0.0222, -0.0101, ..., -0.0730, -0.1256, 0.0266], [-0.0143, -0.0229, -0.0164, ..., -0.0759, 0.0444, -0.1462], [-0.1094, -0.0035, -0.0071, ..., -0.0785, -0.0847, -0.0723]], device='cuda:0'), grad: tensor([[ 1.1642e-08, -4.3772e-08, 1.8626e-09, ..., 3.0408e-07, 3.3043e-06, 6.1002e-08], [-4.0606e-07, 4.6566e-10, 4.6566e-10, ..., 2.4363e-06, 5.0254e-06, 1.7509e-07], [-5.8673e-08, 3.7253e-09, 1.3970e-09, ..., 4.9531e-05, 8.0705e-05, 1.0896e-07], ..., [ 2.4587e-07, 9.3132e-10, 0.0000e+00, ..., 7.5437e-08, 6.4122e-07, 5.5367e-07], [ 4.3306e-08, 9.3132e-10, 9.3132e-10, ..., 6.2995e-06, -1.1869e-05, 4.5169e-08], [ 1.2573e-08, 2.7008e-08, 0.0000e+00, ..., 5.9605e-07, 2.6599e-06, 4.4517e-06]], device='cuda:0') Epoch 130, bias, value: tensor([ 0.0054, -0.0149, -0.0016, 0.0300, -0.0041, 0.0320, 0.0043, 0.0182, -0.0007, -0.0120], device='cuda:0'), grad: tensor([ 3.6843e-06, 8.0094e-06, 1.8907e-04, 1.3791e-05, 9.7826e-06, 6.5416e-06, -1.3936e-04, -8.9228e-05, -2.1756e-05, 1.9416e-05], device='cuda:0') 100 0.0001 changing lr epoch 129, time 214.32, cls_loss 0.0028 cls_loss_mapping 0.0068 cls_loss_causal 0.5137 re_mapping 0.0073 re_causal 0.0209 /// teacc 99.04 lr 0.00010000 Epoch 131, weight, value: tensor([[-0.0274, 0.0030, -0.0683, ..., 0.0197, -0.0365, -0.0037], [ 0.0416, -0.0035, -0.0181, ..., 0.0155, -0.0343, -0.2175], [-0.0096, 0.0166, 0.0062, ..., -0.0704, 0.0858, -0.0435], ..., [-0.0016, -0.0221, -0.0101, ..., -0.0735, -0.1269, 0.0256], [-0.0129, -0.0233, -0.0172, ..., -0.0763, 0.0431, -0.1437], [-0.1100, -0.0040, -0.0075, ..., -0.0793, -0.0853, -0.0724]], device='cuda:0'), grad: tensor([[ 6.5193e-09, 0.0000e+00, 1.3970e-08, ..., -1.5525e-06, 5.9791e-07, -6.4261e-08], [-9.5926e-08, 1.8626e-09, 1.8626e-09, ..., 2.6450e-07, 1.8533e-07, 1.3411e-07], [ 1.2107e-08, -4.6566e-09, 9.3132e-10, ..., 1.8161e-07, -3.6135e-07, 3.3528e-08], ..., [ 4.1910e-08, 9.3132e-10, 0.0000e+00, ..., 8.1025e-08, 1.7881e-07, 3.6415e-07], [ 9.3132e-09, 0.0000e+00, 8.3819e-09, ..., 2.5705e-07, 2.1048e-07, 7.1712e-08], [ 2.7940e-09, 0.0000e+00, 9.3132e-10, ..., 1.0440e-06, 1.4063e-07, -4.4703e-07]], device='cuda:0') Epoch 131, bias, value: tensor([ 0.0071, -0.0146, -0.0018, 0.0301, -0.0039, 0.0320, 0.0045, 0.0179, -0.0020, -0.0124], device='cuda:0'), grad: tensor([-6.1058e-06, 9.9838e-07, 2.1514e-07, 1.6233e-06, 7.6741e-07, 1.1632e-06, -1.6196e-06, -1.2498e-06, 3.1106e-06, 1.0654e-06], device='cuda:0') 100 0.0001 changing lr epoch 130, time 214.29, cls_loss 0.0027 cls_loss_mapping 0.0046 cls_loss_causal 0.5347 re_mapping 0.0073 re_causal 0.0220 /// teacc 98.93 lr 0.00010000 Epoch 132, weight, value: tensor([[-0.0274, 0.0030, -0.0686, ..., 0.0198, -0.0366, -0.0036], [ 0.0420, -0.0037, -0.0179, ..., 0.0152, -0.0346, -0.2181], [-0.0098, 0.0165, 0.0057, ..., -0.0707, 0.0866, -0.0431], ..., [-0.0019, -0.0222, -0.0099, ..., -0.0737, -0.1291, 0.0258], [-0.0129, -0.0235, -0.0175, ..., -0.0767, 0.0432, -0.1439], [-0.1103, -0.0040, -0.0076, ..., -0.0791, -0.0859, -0.0725]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.0489e-08, 0.0000e+00, ..., 1.9595e-06, 1.2275e-06, 3.4273e-07], [-1.8626e-09, 6.5193e-09, 0.0000e+00, ..., 7.9162e-08, 5.0385e-07, 2.7567e-07], [ 9.3132e-10, -4.4610e-07, 0.0000e+00, ..., 2.8871e-08, -4.8988e-06, -3.6322e-08], ..., [-5.5879e-09, 1.2852e-07, 0.0000e+00, ..., 4.6566e-09, 2.0731e-06, -1.0960e-05], [ 1.8626e-09, 1.2852e-07, 0.0000e+00, ..., 1.3728e-06, 4.9584e-06, 1.2666e-07], [ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 2.1141e-07, 3.8650e-07, 3.4515e-06]], device='cuda:0') Epoch 132, bias, value: tensor([ 0.0069, -0.0146, -0.0014, 0.0300, -0.0057, 0.0319, 0.0046, 0.0177, -0.0021, -0.0109], device='cuda:0'), grad: tensor([ 6.3851e-06, 1.6108e-05, -1.6261e-06, 2.8536e-05, 2.4382e-06, 1.3694e-05, -9.8124e-06, -1.3399e-04, 2.0698e-05, 5.7578e-05], device='cuda:0') 100 0.0001 changing lr epoch 131, time 214.43, cls_loss 0.0033 cls_loss_mapping 0.0065 cls_loss_causal 0.5632 re_mapping 0.0070 re_causal 0.0209 /// teacc 98.96 lr 0.00010000 Epoch 133, weight, value: tensor([[-0.0277, 0.0030, -0.0692, ..., 0.0205, -0.0372, -0.0038], [ 0.0419, -0.0046, -0.0179, ..., 0.0122, -0.0346, -0.2187], [-0.0098, 0.0166, 0.0051, ..., -0.0715, 0.0882, -0.0419], ..., [-0.0016, -0.0222, -0.0101, ..., -0.0740, -0.1302, 0.0258], [-0.0128, -0.0248, -0.0177, ..., -0.0770, 0.0420, -0.1460], [-0.1108, -0.0043, -0.0076, ..., -0.0792, -0.0866, -0.0733]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 6.6310e-07, 1.3327e-06, 1.3970e-08], [-1.4901e-08, 0.0000e+00, 0.0000e+00, ..., 3.6322e-08, 2.8033e-07, 3.3528e-08], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 1.4622e-07, -1.8608e-06, 7.4506e-09], ..., [ 5.5879e-09, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, 5.3365e-07, 6.6124e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., -4.9453e-07, -3.2838e-06, 3.4459e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.1420e-07, 1.6633e-06, -1.2778e-05]], device='cuda:0') Epoch 133, bias, value: tensor([ 0.0080, -0.0148, -0.0010, 0.0308, -0.0054, 0.0316, 0.0046, 0.0178, -0.0034, -0.0116], device='cuda:0'), grad: tensor([ 4.9561e-05, -6.5193e-07, 2.1964e-05, 1.8761e-05, 1.1161e-05, 7.3493e-05, 1.1697e-06, 7.4953e-06, -1.0625e-05, -1.7262e-04], device='cuda:0') 100 0.0001 changing lr epoch 132, time 214.05, cls_loss 0.0026 cls_loss_mapping 0.0055 cls_loss_causal 0.5338 re_mapping 0.0074 re_causal 0.0216 /// teacc 98.81 lr 0.00010000 Epoch 134, weight, value: tensor([[-0.0278, 0.0031, -0.0697, ..., 0.0202, -0.0382, -0.0040], [ 0.0420, -0.0048, -0.0179, ..., 0.0116, -0.0348, -0.2194], [-0.0097, 0.0166, 0.0050, ..., -0.0731, 0.0884, -0.0416], ..., [-0.0019, -0.0222, -0.0104, ..., -0.0746, -0.1313, 0.0250], [-0.0127, -0.0249, -0.0179, ..., -0.0766, 0.0425, -0.1458], [-0.1110, -0.0043, -0.0077, ..., -0.0792, -0.0867, -0.0734]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 7.2643e-08, 4.3679e-07, 1.0757e-06], [ 9.3132e-10, 7.4506e-09, 0.0000e+00, ..., 2.6077e-08, 1.6578e-07, 6.5658e-07], [ 9.3132e-10, -1.5832e-08, 0.0000e+00, ..., -1.8440e-07, -3.2410e-06, 3.5856e-07], ..., [ 9.3132e-10, 2.7940e-09, 0.0000e+00, ..., 1.7881e-07, 2.4419e-06, 4.3865e-07], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 7.9162e-08, 3.5204e-07, 2.1793e-07], [ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 1.0990e-07, 1.7788e-07, 7.0371e-06]], device='cuda:0') Epoch 134, bias, value: tensor([ 0.0075, -0.0146, -0.0010, 0.0294, -0.0048, 0.0318, 0.0045, 0.0183, -0.0034, -0.0115], device='cuda:0'), grad: tensor([ 2.8461e-06, 2.8480e-06, -5.4389e-06, -1.7956e-06, -1.9938e-05, -1.4193e-06, 2.2855e-06, 7.7039e-06, 3.9488e-06, 8.9183e-06], device='cuda:0') 100 0.0001 changing lr epoch 133, time 214.16, cls_loss 0.0026 cls_loss_mapping 0.0048 cls_loss_causal 0.5719 re_mapping 0.0068 re_causal 0.0214 /// teacc 98.89 lr 0.00010000 Epoch 135, weight, value: tensor([[-0.0279, 0.0027, -0.0704, ..., 0.0203, -0.0391, -0.0045], [ 0.0418, -0.0060, -0.0179, ..., 0.0112, -0.0349, -0.2228], [-0.0098, 0.0168, 0.0046, ..., -0.0736, 0.0887, -0.0415], ..., [-0.0019, -0.0224, -0.0106, ..., -0.0753, -0.1323, 0.0253], [-0.0134, -0.0257, -0.0176, ..., -0.0774, 0.0427, -0.1448], [-0.1113, -0.0045, -0.0078, ..., -0.0797, -0.0876, -0.0733]], device='cuda:0'), grad: tensor([[ 1.8626e-09, -8.5682e-08, 0.0000e+00, ..., 3.8091e-07, 2.2165e-06, 2.6263e-07], [ 2.2352e-08, 9.3132e-10, 0.0000e+00, ..., -1.9744e-07, 2.4959e-06, 2.7288e-07], [ 5.5879e-09, 2.7940e-09, 0.0000e+00, ..., 5.4762e-07, -3.6806e-05, 9.8720e-08], ..., [ 8.3819e-09, 9.3132e-10, 0.0000e+00, ..., 1.1548e-07, 2.1346e-06, 2.4401e-07], [ 9.3132e-10, 8.3819e-09, 0.0000e+00, ..., 2.8647e-06, -1.1265e-05, -2.7101e-06], [ 1.4901e-08, 1.0245e-08, 0.0000e+00, ..., 1.9372e-07, 3.6776e-05, 1.9670e-06]], device='cuda:0') Epoch 135, bias, value: tensor([ 0.0073, -0.0147, -0.0010, 0.0295, -0.0048, 0.0318, 0.0049, 0.0181, -0.0034, -0.0115], device='cuda:0'), grad: tensor([ 1.0096e-05, 1.0133e-05, -9.7811e-05, 2.8446e-05, 1.2726e-05, 1.9774e-05, -2.4706e-05, -5.6326e-05, -1.4670e-05, 1.1241e-04], device='cuda:0') 100 0.0001 changing lr epoch 134, time 214.53, cls_loss 0.0029 cls_loss_mapping 0.0058 cls_loss_causal 0.5158 re_mapping 0.0070 re_causal 0.0212 /// teacc 98.99 lr 0.00010000 Epoch 136, weight, value: tensor([[-0.0283, 0.0023, -0.0708, ..., 0.0203, -0.0400, -0.0064], [ 0.0439, -0.0065, -0.0180, ..., 0.0111, -0.0353, -0.2219], [-0.0100, 0.0169, 0.0044, ..., -0.0741, 0.0890, -0.0416], ..., [-0.0013, -0.0225, -0.0107, ..., -0.0752, -0.1317, 0.0262], [-0.0158, -0.0267, -0.0173, ..., -0.0779, 0.0427, -0.1458], [-0.1117, -0.0046, -0.0078, ..., -0.0792, -0.0870, -0.0740]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., -8.5980e-06, 1.6205e-07, 1.5832e-08], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 4.5635e-08, 2.2445e-06, 1.1548e-07], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 4.0978e-08, -2.5909e-06, -1.0803e-07], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-09, ..., 7.4506e-09, 4.1630e-07, 7.5717e-07], [ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 6.1467e-08, 7.9162e-08, 2.7940e-08], [ 0.0000e+00, 0.0000e+00, 1.4901e-08, ..., 1.6298e-07, 2.1886e-07, 9.6858e-08]], device='cuda:0') Epoch 136, bias, value: tensor([ 0.0068, -0.0149, -0.0011, 0.0296, -0.0047, 0.0313, 0.0051, 0.0189, -0.0043, -0.0112], device='cuda:0'), grad: tensor([-1.7315e-05, 1.0139e-04, 2.1718e-06, 7.8306e-06, -2.0806e-06, -3.0994e-06, 1.7166e-05, -1.1039e-04, 1.8133e-06, 2.3730e-06], device='cuda:0') 100 0.0001 changing lr epoch 135, time 214.22, cls_loss 0.0030 cls_loss_mapping 0.0065 cls_loss_causal 0.5390 re_mapping 0.0071 re_causal 0.0210 /// teacc 98.89 lr 0.00010000 Epoch 137, weight, value: tensor([[-0.0284, 0.0024, -0.0716, ..., 0.0215, -0.0394, -0.0080], [ 0.0442, -0.0069, -0.0186, ..., 0.0108, -0.0355, -0.2222], [-0.0102, 0.0169, 0.0030, ..., -0.0742, 0.0895, -0.0418], ..., [-0.0014, -0.0226, -0.0109, ..., -0.0762, -0.1325, 0.0260], [-0.0158, -0.0278, -0.0149, ..., -0.0781, 0.0433, -0.1460], [-0.1123, -0.0047, -0.0080, ..., -0.0801, -0.0868, -0.0744]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 6.5751e-06, 5.0515e-06, 5.8766e-07], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 1.6578e-07, 3.6415e-07, 2.6915e-07], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 8.9873e-07, -1.6848e-06, 1.9185e-07], ..., [-9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 2.7940e-08, 1.5777e-06, 7.7300e-08], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 4.2189e-07, -1.7695e-08, 3.9209e-07], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 3.2410e-07, 9.5926e-07, 3.6545e-06]], device='cuda:0') Epoch 137, bias, value: tensor([ 0.0078, -0.0155, -0.0009, 0.0302, -0.0060, 0.0303, 0.0047, 0.0196, -0.0043, -0.0101], device='cuda:0'), grad: tensor([ 1.5825e-05, 1.0030e-06, 1.7677e-06, -5.0336e-05, 1.7703e-05, 8.4750e-08, -3.0935e-05, -3.6545e-06, 2.5965e-06, 4.5925e-05], device='cuda:0') 100 0.0001 changing lr epoch 136, time 214.38, cls_loss 0.0029 cls_loss_mapping 0.0060 cls_loss_causal 0.5327 re_mapping 0.0072 re_causal 0.0204 /// teacc 98.80 lr 0.00010000 Epoch 138, weight, value: tensor([[-0.0286, 0.0022, -0.0719, ..., 0.0216, -0.0397, -0.0083], [ 0.0439, -0.0071, -0.0187, ..., 0.0101, -0.0359, -0.2228], [-0.0106, 0.0170, 0.0027, ..., -0.0749, 0.0901, -0.0417], ..., [-0.0014, -0.0227, -0.0117, ..., -0.0763, -0.1338, 0.0260], [-0.0161, -0.0284, -0.0147, ..., -0.0787, 0.0436, -0.1460], [-0.1131, -0.0047, -0.0080, ..., -0.0802, -0.0875, -0.0750]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -7.1526e-07, 1.3106e-05, 4.9286e-06], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.0654e-05, 8.1539e-05, 2.4810e-05], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.3736e-05, 2.5773e-04, 7.7486e-05], ..., [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 2.2929e-06, 1.9073e-05, 6.2250e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.5938e-04, -1.2264e-03, -3.6502e-04], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 1.8859e-06, 7.4580e-06, -2.4676e-05]], device='cuda:0') Epoch 138, bias, value: tensor([ 0.0080, -0.0158, -0.0005, 0.0300, -0.0061, 0.0306, 0.0048, 0.0209, -0.0041, -0.0113], device='cuda:0'), grad: tensor([ 2.0891e-05, 1.4663e-04, 4.5872e-04, 5.2929e-05, 1.8597e-04, 1.0643e-03, 3.6335e-04, 4.9204e-05, -2.1420e-03, -2.0027e-04], device='cuda:0') 100 0.0001 changing lr epoch 137, time 214.76, cls_loss 0.0029 cls_loss_mapping 0.0052 cls_loss_causal 0.5549 re_mapping 0.0066 re_causal 0.0204 /// teacc 98.93 lr 0.00010000 Epoch 139, weight, value: tensor([[-0.0288, 0.0020, -0.0731, ..., 0.0213, -0.0411, -0.0087], [ 0.0442, -0.0077, -0.0188, ..., 0.0096, -0.0362, -0.2236], [-0.0107, 0.0172, 0.0029, ..., -0.0751, 0.0904, -0.0421], ..., [-0.0016, -0.0231, -0.0128, ..., -0.0770, -0.1344, 0.0258], [-0.0171, -0.0288, -0.0148, ..., -0.0765, 0.0470, -0.1449], [-0.1168, -0.0048, -0.0083, ..., -0.0811, -0.0889, -0.0739]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.1064e-04, 5.4955e-05, 1.6764e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.2201e-07, -1.5274e-07, 3.1944e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8664e-06, 3.7253e-09, 2.6077e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2666e-07, 1.8533e-07, 4.4703e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.6054e-07, -5.7407e-06, 5.1223e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.0489e-06, 7.2345e-06, 4.9993e-06]], device='cuda:0') Epoch 139, bias, value: tensor([ 0.0074, -0.0160, -0.0004, 0.0299, -0.0066, 0.0286, 0.0047, 0.0206, -0.0012, -0.0111], device='cuda:0'), grad: tensor([ 2.5630e-04, -1.9863e-05, 9.4622e-06, 7.1861e-06, -1.7554e-05, -9.1270e-06, -2.7132e-04, -4.4443e-06, 2.9281e-06, 4.6462e-05], device='cuda:0') 100 0.0001 changing lr epoch 138, time 214.25, cls_loss 0.0022 cls_loss_mapping 0.0045 cls_loss_causal 0.5412 re_mapping 0.0067 re_causal 0.0206 /// teacc 98.84 lr 0.00010000 Epoch 140, weight, value: tensor([[-0.0289, -0.0006, -0.0732, ..., 0.0211, -0.0421, -0.0089], [ 0.0442, -0.0110, -0.0188, ..., 0.0095, -0.0363, -0.2238], [-0.0091, 0.0179, 0.0029, ..., -0.0757, 0.0908, -0.0418], ..., [-0.0020, -0.0243, -0.0129, ..., -0.0773, -0.1366, 0.0258], [-0.0177, -0.0312, -0.0147, ..., -0.0767, 0.0470, -0.1451], [-0.1179, -0.0050, -0.0084, ..., -0.0814, -0.0894, -0.0747]], device='cuda:0'), grad: tensor([[ 6.5193e-09, 0.0000e+00, 0.0000e+00, ..., -1.2897e-05, 4.1444e-07, 7.8231e-08], [ 8.3819e-09, 0.0000e+00, 0.0000e+00, ..., 2.4680e-07, 6.7987e-07, 2.2631e-07], [ 3.3528e-08, 0.0000e+00, 0.0000e+00, ..., 4.0140e-07, 2.2631e-07, 1.2200e-07], ..., [ 2.7940e-08, 0.0000e+00, 0.0000e+00, ..., 2.1420e-07, 1.2852e-07, 7.4599e-07], [ 1.9558e-08, 0.0000e+00, 0.0000e+00, ..., 2.4289e-06, -1.1727e-05, 1.2387e-07], [ 5.0291e-08, 0.0000e+00, 0.0000e+00, ..., 5.2005e-06, -8.7917e-07, 3.7532e-06]], device='cuda:0') Epoch 140, bias, value: tensor([ 0.0071, -0.0158, -0.0002, 0.0301, -0.0062, 0.0289, 0.0048, 0.0200, -0.0012, -0.0115], device='cuda:0'), grad: tensor([-2.1607e-05, 7.5810e-07, 2.5816e-06, 5.7258e-06, 3.7160e-07, 4.2021e-05, -4.5300e-06, 4.6343e-06, -2.1413e-05, -8.6054e-06], device='cuda:0') 100 0.0001 changing lr epoch 139, time 214.20, cls_loss 0.0024 cls_loss_mapping 0.0055 cls_loss_causal 0.5458 re_mapping 0.0066 re_causal 0.0202 /// teacc 98.90 lr 0.00010000 Epoch 141, weight, value: tensor([[-0.0293, -0.0008, -0.0732, ..., 0.0212, -0.0425, -0.0089], [ 0.0445, -0.0118, -0.0188, ..., 0.0091, -0.0365, -0.2240], [-0.0078, 0.0181, 0.0030, ..., -0.0761, 0.0911, -0.0417], ..., [-0.0025, -0.0245, -0.0129, ..., -0.0775, -0.1374, 0.0256], [-0.0185, -0.0319, -0.0147, ..., -0.0762, 0.0478, -0.1456], [-0.1200, -0.0053, -0.0084, ..., -0.0816, -0.0899, -0.0745]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 2.7940e-09, 0.0000e+00, ..., 7.5996e-07, 1.8794e-06, 2.2054e-06], [ 4.2841e-08, 6.5193e-09, 0.0000e+00, ..., 2.1979e-07, 1.0971e-06, 9.3412e-07], [ 1.8626e-08, -1.3039e-08, 0.0000e+00, ..., -1.2582e-06, -4.2468e-06, 6.9104e-07], ..., [ 1.9558e-08, 7.4506e-08, 0.0000e+00, ..., 2.6915e-07, 1.0086e-06, 6.2678e-07], [ 4.6566e-09, 2.3283e-08, 0.0000e+00, ..., 8.4750e-07, -5.2527e-06, 1.5553e-07], [ 1.3970e-08, 1.6764e-08, 0.0000e+00, ..., 1.9185e-07, 1.2098e-06, 2.8927e-06]], device='cuda:0') Epoch 141, bias, value: tensor([ 0.0070, -0.0157, -0.0002, 0.0301, -0.0066, 0.0283, 0.0048, 0.0196, -0.0007, -0.0110], device='cuda:0'), grad: tensor([ 1.1824e-05, 5.7109e-06, -2.9877e-06, -2.4125e-05, -2.0951e-05, 1.4290e-05, -2.8238e-06, 2.2091e-06, -9.0972e-06, 2.5943e-05], device='cuda:0') 100 0.0001 changing lr epoch 140, time 214.46, cls_loss 0.0022 cls_loss_mapping 0.0039 cls_loss_causal 0.5299 re_mapping 0.0065 re_causal 0.0202 /// teacc 99.02 lr 0.00010000 Epoch 142, weight, value: tensor([[-0.0296, -0.0018, -0.0732, ..., 0.0210, -0.0433, -0.0090], [ 0.0443, -0.0139, -0.0188, ..., 0.0088, -0.0375, -0.2243], [-0.0076, 0.0185, 0.0030, ..., -0.0762, 0.0923, -0.0416], ..., [-0.0021, -0.0252, -0.0129, ..., -0.0779, -0.1383, 0.0254], [-0.0184, -0.0337, -0.0147, ..., -0.0765, 0.0479, -0.1459], [-0.1214, -0.0055, -0.0084, ..., -0.0816, -0.0907, -0.0746]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.5991e-06, -3.7253e-08, 1.0245e-08], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 4.8149e-07, 2.1532e-06, 2.7940e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 7.3947e-07, -1.1601e-05, 2.4214e-08], ..., [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 1.5460e-07, 5.8375e-06, 3.6322e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.6054e-07, 5.2489e-06, 1.5832e-08], [ 7.4506e-09, 0.0000e+00, 0.0000e+00, ..., 1.7639e-06, 1.1381e-06, 3.3528e-07]], device='cuda:0') Epoch 142, bias, value: tensor([ 0.0068, -0.0163, 0.0004, 0.0298, -0.0064, 0.0285, 0.0047, 0.0198, -0.0006, -0.0113], device='cuda:0'), grad: tensor([-7.1898e-06, 6.2771e-06, -3.2693e-05, 7.2122e-06, 6.0409e-05, -5.9232e-06, -6.5386e-05, 1.6823e-05, 1.5102e-05, 5.2713e-06], device='cuda:0') 100 0.0001 changing lr epoch 141, time 214.13, cls_loss 0.0027 cls_loss_mapping 0.0051 cls_loss_causal 0.5361 re_mapping 0.0068 re_causal 0.0206 /// teacc 98.97 lr 0.00010000 Epoch 143, weight, value: tensor([[-0.0297, -0.0022, -0.0732, ..., 0.0210, -0.0440, -0.0101], [ 0.0444, -0.0172, -0.0188, ..., 0.0086, -0.0378, -0.2245], [-0.0076, 0.0192, 0.0029, ..., -0.0767, 0.0925, -0.0417], ..., [-0.0022, -0.0272, -0.0129, ..., -0.0783, -0.1390, 0.0250], [-0.0184, -0.0368, -0.0147, ..., -0.0781, 0.0478, -0.1456], [-0.1221, -0.0059, -0.0084, ..., -0.0817, -0.0913, -0.0748]], device='cuda:0'), grad: tensor([[ 1.3970e-08, 0.0000e+00, 0.0000e+00, ..., -3.0771e-06, 1.5376e-06, -6.2212e-07], [ 5.6066e-07, 0.0000e+00, 0.0000e+00, ..., 1.3877e-07, 4.9919e-07, 1.2191e-06], [ 3.1665e-08, 0.0000e+00, 0.0000e+00, ..., 6.0163e-07, 1.1101e-06, 9.3598e-07], ..., [ 8.6613e-08, 0.0000e+00, 0.0000e+00, ..., 2.3283e-08, 7.5158e-07, 1.0394e-06], [ 3.7253e-08, 0.0000e+00, 0.0000e+00, ..., 2.9802e-06, 4.1015e-06, 5.5693e-07], [ 8.7544e-08, 0.0000e+00, 0.0000e+00, ..., 1.8068e-07, 2.7455e-06, 5.0753e-05]], device='cuda:0') Epoch 143, bias, value: tensor([ 0.0064, -0.0161, 0.0003, 0.0298, -0.0061, 0.0289, 0.0048, 0.0196, -0.0007, -0.0116], device='cuda:0'), grad: tensor([ 3.0659e-06, 1.9908e-05, 1.1988e-05, 3.0965e-05, -2.0325e-04, -9.2387e-06, -2.0228e-06, 1.9684e-05, 7.2420e-05, 5.6416e-05], device='cuda:0') 100 0.0001 changing lr epoch 142, time 214.24, cls_loss 0.0019 cls_loss_mapping 0.0036 cls_loss_causal 0.5601 re_mapping 0.0066 re_causal 0.0213 /// teacc 99.01 lr 0.00010000 Epoch 144, weight, value: tensor([[-0.0297, -0.0021, -0.0732, ..., 0.0210, -0.0448, -0.0097], [ 0.0444, -0.0182, -0.0188, ..., 0.0085, -0.0380, -0.2248], [-0.0073, 0.0195, 0.0030, ..., -0.0766, 0.0934, -0.0416], ..., [-0.0023, -0.0277, -0.0129, ..., -0.0789, -0.1412, 0.0251], [-0.0184, -0.0379, -0.0147, ..., -0.0787, 0.0477, -0.1456], [-0.1223, -0.0062, -0.0084, ..., -0.0818, -0.0921, -0.0749]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.9116e-08, 0.0000e+00, ..., -4.7795e-06, 8.7544e-07, 7.9721e-07], [-0.0000e+00, 2.2352e-08, 0.0000e+00, ..., 1.3411e-07, 2.2873e-06, 1.1399e-05], [ 0.0000e+00, -1.5013e-06, 0.0000e+00, ..., 4.6380e-07, -3.1888e-05, 1.3970e-07], ..., [ 0.0000e+00, 1.3281e-06, 0.0000e+00, ..., 7.2643e-08, 2.0221e-05, 2.7232e-06], [ 0.0000e+00, 3.7253e-08, 0.0000e+00, ..., 1.8626e-07, 6.9365e-06, 1.4137e-06], [ 1.8626e-09, 1.8626e-09, 0.0000e+00, ..., 8.1398e-07, 2.8685e-07, 1.0125e-05]], device='cuda:0') Epoch 144, bias, value: tensor([ 0.0063, -0.0163, 0.0009, 0.0299, -0.0062, 0.0288, 0.0052, 0.0195, -0.0006, -0.0119], device='cuda:0'), grad: tensor([-6.3255e-06, 2.8819e-05, -6.2764e-05, 1.7554e-05, -5.1767e-05, -1.1437e-05, 1.1384e-05, 3.1054e-05, 2.0280e-05, 2.3246e-05], device='cuda:0') 100 0.0001 changing lr epoch 143, time 214.21, cls_loss 0.0023 cls_loss_mapping 0.0059 cls_loss_causal 0.5612 re_mapping 0.0064 re_causal 0.0206 /// teacc 98.87 lr 0.00010000 Epoch 145, weight, value: tensor([[-0.0298, -0.0019, -0.0732, ..., 0.0215, -0.0452, -0.0100], [ 0.0439, -0.0188, -0.0188, ..., 0.0083, -0.0385, -0.2253], [-0.0068, 0.0196, 0.0030, ..., -0.0772, 0.0941, -0.0415], ..., [-0.0020, -0.0283, -0.0129, ..., -0.0793, -0.1430, 0.0251], [-0.0185, -0.0381, -0.0147, ..., -0.0787, 0.0481, -0.1458], [-0.1228, -0.0063, -0.0084, ..., -0.0822, -0.0930, -0.0751]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 5.0291e-08, 0.0000e+00, ..., 7.4506e-09, 5.0105e-07, 7.2829e-07], [ 3.7253e-09, 6.1467e-08, 0.0000e+00, ..., 1.3039e-08, 5.0291e-07, 3.3900e-07], [ 5.5879e-09, -1.2238e-06, 0.0000e+00, ..., 4.8429e-08, -9.2089e-06, -1.1846e-06], ..., [ 1.8626e-09, 3.7812e-07, 0.0000e+00, ..., 1.8626e-09, 2.8796e-06, 6.9477e-07], [ 0.0000e+00, 2.0303e-07, 0.0000e+00, ..., 1.8999e-07, 2.5537e-06, 4.6007e-07], [ 3.7253e-09, 4.8429e-08, 0.0000e+00, ..., 1.8626e-08, 4.9174e-07, 2.4214e-07]], device='cuda:0') Epoch 145, bias, value: tensor([ 0.0066, -0.0168, 0.0012, 0.0287, -0.0053, 0.0294, 0.0051, 0.0186, -0.0001, -0.0124], device='cuda:0'), grad: tensor([ 6.4038e-06, 2.1774e-06, -2.3171e-05, 4.8205e-06, -2.3082e-05, -2.8871e-07, 1.8761e-05, 7.2718e-06, 9.3430e-06, -2.2389e-06], device='cuda:0') 100 0.0001 changing lr epoch 144, time 214.04, cls_loss 0.0024 cls_loss_mapping 0.0051 cls_loss_causal 0.5355 re_mapping 0.0065 re_causal 0.0200 /// teacc 98.91 lr 0.00010000 Epoch 146, weight, value: tensor([[-2.9972e-02, -1.6859e-03, -7.3313e-02, ..., 2.2002e-02, -4.5504e-02, -1.0078e-02], [ 4.3854e-02, -1.9340e-02, -1.8838e-02, ..., 8.2824e-03, -3.8671e-02, -2.2606e-01], [-8.2891e-03, 1.9488e-02, 2.9622e-03, ..., -7.7830e-02, 9.4235e-02, -4.1875e-02], ..., [ 1.7593e-04, -2.9214e-02, -1.3004e-02, ..., -7.9702e-02, -1.4291e-01, 2.5086e-02], [-1.8966e-02, -3.8517e-02, -1.4766e-02, ..., -7.9359e-02, 4.8101e-02, -1.4603e-01], [-1.2302e-01, -6.4641e-03, -8.3773e-03, ..., -8.2751e-02, -9.3163e-02, -7.5578e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.1234e-07, 4.7125e-07, 5.8115e-07], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 1.3039e-08, 3.0734e-07, 7.7859e-07], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 9.6858e-08, -6.3702e-06, 1.2740e-06], ..., [ 1.4901e-08, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 5.6066e-07, -1.3459e-04], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 2.3283e-07, -1.1645e-05, 1.1250e-06], [ 5.5879e-09, 0.0000e+00, 0.0000e+00, ..., 3.9116e-08, 1.4275e-05, 3.5223e-06]], device='cuda:0') Epoch 146, bias, value: tensor([ 0.0071, -0.0168, 0.0003, 0.0288, -0.0049, 0.0297, 0.0052, 0.0194, -0.0006, -0.0127], device='cuda:0'), grad: tensor([ 7.1302e-06, 1.6093e-05, -7.4863e-05, 1.3721e-04, 1.9705e-04, 1.5366e-04, -9.3132e-09, -6.3801e-04, -2.7806e-05, 2.2888e-04], device='cuda:0') 100 0.0001 changing lr epoch 145, time 214.30, cls_loss 0.0019 cls_loss_mapping 0.0039 cls_loss_causal 0.5207 re_mapping 0.0065 re_causal 0.0200 /// teacc 98.99 lr 0.00010000 Epoch 147, weight, value: tensor([[-0.0300, -0.0016, -0.0733, ..., 0.0226, -0.0451, -0.0106], [ 0.0439, -0.0199, -0.0188, ..., 0.0081, -0.0386, -0.2264], [-0.0085, 0.0196, 0.0030, ..., -0.0782, 0.0948, -0.0419], ..., [ 0.0007, -0.0294, -0.0130, ..., -0.0801, -0.1447, 0.0255], [-0.0193, -0.0390, -0.0148, ..., -0.0796, 0.0481, -0.1463], [-0.1232, -0.0066, -0.0084, ..., -0.0828, -0.0934, -0.0752]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1418e-06, 1.6652e-06, 7.5065e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5087e-07, 2.7008e-07, 9.1270e-08], [ 0.0000e+00, -1.8626e-09, 0.0000e+00, ..., 4.8243e-07, 2.7008e-07, 2.9057e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-08, 4.3958e-07, 2.8778e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9057e-07, 6.3330e-08, 1.5460e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.6858e-08, 1.3411e-07, 3.1665e-08]], device='cuda:0') Epoch 147, bias, value: tensor([ 0.0075, -0.0164, 0.0002, 0.0289, -0.0054, 0.0296, 0.0053, 0.0194, -0.0008, -0.0125], device='cuda:0'), grad: tensor([ 4.8280e-06, 3.7942e-06, 4.4197e-05, -6.1333e-05, 6.5342e-06, 4.9993e-06, -2.0459e-05, 5.2750e-06, 1.1206e-05, 9.4064e-07], device='cuda:0') 100 0.0001 changing lr epoch 146, time 214.31, cls_loss 0.0027 cls_loss_mapping 0.0065 cls_loss_causal 0.5587 re_mapping 0.0066 re_causal 0.0206 /// teacc 98.99 lr 0.00010000 Epoch 148, weight, value: tensor([[-0.0301, -0.0020, -0.0733, ..., 0.0225, -0.0460, -0.0111], [ 0.0439, -0.0226, -0.0188, ..., 0.0080, -0.0390, -0.2269], [-0.0085, 0.0207, 0.0030, ..., -0.0785, 0.0954, -0.0414], ..., [ 0.0008, -0.0317, -0.0130, ..., -0.0805, -0.1453, 0.0249], [-0.0192, -0.0416, -0.0148, ..., -0.0800, 0.0479, -0.1467], [-0.1236, -0.0078, -0.0084, ..., -0.0829, -0.0939, -0.0766]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., -5.4017e-08, 5.2527e-07, 4.4703e-08], [ 3.9116e-08, 0.0000e+00, 0.0000e+00, ..., 1.2107e-07, 5.0105e-07, 4.3027e-07], [ 7.4506e-09, -0.0000e+00, 0.0000e+00, ..., 1.0245e-07, -6.7614e-07, 8.7544e-08], ..., [-2.4028e-07, 0.0000e+00, 0.0000e+00, ..., 1.4901e-08, 2.6636e-07, -2.4457e-06], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., -1.8477e-06, -2.8133e-05, 9.1270e-08], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 2.6263e-07, 4.6194e-07, -5.4762e-07]], device='cuda:0') Epoch 148, bias, value: tensor([ 0.0069, -0.0161, 0.0007, 0.0291, -0.0045, 0.0296, 0.0058, 0.0187, -0.0012, -0.0131], device='cuda:0'), grad: tensor([ 9.6858e-07, 1.0274e-05, -6.3330e-08, 1.3828e-05, 1.8775e-05, 7.8231e-06, 3.2514e-05, -2.8849e-05, -4.9740e-05, -5.6103e-06], device='cuda:0') 100 0.0001 changing lr epoch 147, time 214.61, cls_loss 0.0028 cls_loss_mapping 0.0053 cls_loss_causal 0.5317 re_mapping 0.0067 re_causal 0.0204 /// teacc 98.92 lr 0.00010000 Epoch 149, weight, value: tensor([[-0.0311, -0.0041, -0.0733, ..., 0.0233, -0.0467, -0.0107], [ 0.0436, -0.0243, -0.0189, ..., 0.0061, -0.0392, -0.2272], [-0.0085, 0.0221, 0.0029, ..., -0.0800, 0.0962, -0.0412], ..., [ 0.0008, -0.0336, -0.0130, ..., -0.0811, -0.1468, 0.0247], [-0.0191, -0.0460, -0.0147, ..., -0.0803, 0.0479, -0.1470], [-0.1248, -0.0080, -0.0084, ..., -0.0823, -0.0948, -0.0760]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.3388e-07, 9.4064e-07, 8.5682e-08], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 6.7055e-08, -1.4529e-06, 3.6135e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.0781e-08, 3.9302e-07, 3.1479e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.5879e-09, 3.5577e-07, 1.3784e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9744e-07, -5.0336e-05, 1.6950e-07], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 5.5879e-08, 5.1171e-05, 1.3579e-06]], device='cuda:0') Epoch 149, bias, value: tensor([ 0.0079, -0.0165, 0.0010, 0.0295, -0.0056, 0.0295, 0.0051, 0.0180, -0.0013, -0.0117], device='cuda:0'), grad: tensor([ 2.8163e-06, -6.2063e-06, 5.0217e-06, -7.2271e-07, -6.3702e-06, -1.6373e-06, 1.6298e-06, -3.7625e-07, -3.9983e-04, 4.0579e-04], device='cuda:0') 100 0.0001 changing lr epoch 148, time 214.19, cls_loss 0.0019 cls_loss_mapping 0.0050 cls_loss_causal 0.5470 re_mapping 0.0066 re_causal 0.0208 /// teacc 99.00 lr 0.00010000 Epoch 150, weight, value: tensor([[-0.0313, -0.0050, -0.0733, ..., 0.0226, -0.0485, -0.0118], [ 0.0431, -0.0250, -0.0189, ..., 0.0059, -0.0399, -0.2276], [-0.0083, 0.0222, 0.0029, ..., -0.0808, 0.0964, -0.0412], ..., [ 0.0008, -0.0335, -0.0130, ..., -0.0815, -0.1470, 0.0247], [-0.0194, -0.0463, -0.0146, ..., -0.0807, 0.0480, -0.1473], [-0.1264, -0.0061, -0.0084, ..., -0.0824, -0.0961, -0.0766]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.3681e-06, -1.6484e-06, 2.2352e-08], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 1.6950e-07, 1.6950e-07, 1.6578e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.0361e-07, 2.4214e-07, 3.7253e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.0489e-08, 2.0489e-08, 9.8720e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.5491e-06, 1.3642e-05, 2.3656e-07], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 9.8720e-08, -5.8860e-07, -1.4603e-06]], device='cuda:0') Epoch 150, bias, value: tensor([ 0.0073, -0.0166, 0.0007, 0.0294, -0.0052, 0.0296, 0.0062, 0.0182, -0.0012, -0.0123], device='cuda:0'), grad: tensor([-1.1437e-05, 3.0193e-06, 3.7588e-06, 3.9265e-06, 2.6137e-05, 4.9919e-05, -6.4015e-05, -5.0440e-06, 2.9504e-05, -3.5852e-05], device='cuda:0') 100 0.0001 changing lr epoch 149, time 214.29, cls_loss 0.0024 cls_loss_mapping 0.0044 cls_loss_causal 0.5414 re_mapping 0.0067 re_causal 0.0205 /// teacc 98.95 lr 0.00010000 Epoch 151, weight, value: tensor([[-0.0315, -0.0051, -0.0733, ..., 0.0228, -0.0489, -0.0123], [ 0.0431, -0.0279, -0.0189, ..., 0.0061, -0.0399, -0.2280], [-0.0058, 0.0225, 0.0029, ..., -0.0810, 0.0974, -0.0397], ..., [ 0.0006, -0.0326, -0.0130, ..., -0.0817, -0.1480, 0.0251], [-0.0220, -0.0468, -0.0146, ..., -0.0807, 0.0478, -0.1495], [-0.1267, -0.0062, -0.0084, ..., -0.0826, -0.0967, -0.0768]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.1176e-08, 0.0000e+00, ..., -3.1628e-06, 1.3933e-06, 7.4506e-09], [ 1.8626e-09, 7.4506e-09, 0.0000e+00, ..., 1.2107e-07, 6.5044e-06, 1.6764e-08], [ 0.0000e+00, -1.6950e-07, 0.0000e+00, ..., 1.0245e-07, 1.2770e-05, 1.1176e-08], ..., [ 1.8626e-09, 7.4506e-09, 0.0000e+00, ..., 1.6764e-08, 1.1958e-06, 1.1176e-08], [ 0.0000e+00, 6.7055e-08, 0.0000e+00, ..., 1.3225e-07, -1.1110e-04, 1.1176e-08], [ 1.8626e-09, 7.4506e-09, 0.0000e+00, ..., 1.3225e-07, 1.4082e-06, 2.0489e-07]], device='cuda:0') Epoch 151, bias, value: tensor([ 0.0073, -0.0167, 0.0007, 0.0310, -0.0051, 0.0296, 0.0057, 0.0173, -0.0015, -0.0123], device='cuda:0'), grad: tensor([-4.0680e-06, 7.9423e-06, 2.5377e-05, 7.9274e-06, 2.1756e-06, 1.2589e-04, 3.0726e-05, 3.1441e-06, -2.0468e-04, 5.3756e-06], device='cuda:0') 100 0.0001 changing lr epoch 150, time 214.27, cls_loss 0.0025 cls_loss_mapping 0.0051 cls_loss_causal 0.5644 re_mapping 0.0063 re_causal 0.0195 /// teacc 98.93 lr 0.00010000 Epoch 152, weight, value: tensor([[-0.0316, -0.0058, -0.0734, ..., 0.0224, -0.0503, -0.0125], [ 0.0431, -0.0288, -0.0190, ..., 0.0059, -0.0400, -0.2282], [-0.0055, 0.0229, 0.0030, ..., -0.0818, 0.0984, -0.0395], ..., [ 0.0007, -0.0329, -0.0131, ..., -0.0818, -0.1486, 0.0256], [-0.0223, -0.0481, -0.0143, ..., -0.0809, 0.0475, -0.1499], [-0.1280, -0.0062, -0.0084, ..., -0.0822, -0.0971, -0.0772]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 3.9302e-07, 3.1665e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.3958e-07, 5.0850e-07], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, -7.0147e-06, 8.3819e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.6782e-06, 1.5087e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, 3.3118e-06, 5.1409e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 3.6694e-07, 1.1630e-05]], device='cuda:0') Epoch 152, bias, value: tensor([ 0.0062, -0.0162, 0.0011, 0.0311, -0.0049, 0.0300, 0.0054, 0.0171, -0.0023, -0.0124], device='cuda:0'), grad: tensor([ 1.9614e-06, 3.8557e-06, -5.5879e-08, 1.0216e-04, -3.4481e-05, 8.9779e-07, 1.5516e-06, -1.2851e-04, 1.3530e-05, 3.8803e-05], device='cuda:0') 100 0.0001 changing lr epoch 151, time 214.28, cls_loss 0.0019 cls_loss_mapping 0.0041 cls_loss_causal 0.5370 re_mapping 0.0060 re_causal 0.0197 /// teacc 98.98 lr 0.00010000 Epoch 153, weight, value: tensor([[-0.0316, -0.0058, -0.0740, ..., 0.0218, -0.0513, -0.0120], [ 0.0432, -0.0292, -0.0189, ..., 0.0058, -0.0402, -0.2283], [-0.0056, 0.0230, 0.0027, ..., -0.0824, 0.0986, -0.0395], ..., [ 0.0007, -0.0330, -0.0136, ..., -0.0822, -0.1488, 0.0256], [-0.0223, -0.0482, -0.0144, ..., -0.0811, 0.0475, -0.1500], [-0.1289, -0.0063, -0.0085, ..., -0.0817, -0.0967, -0.0774]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.2852e-07, 0.0000e+00, ..., -2.6971e-05, -7.8306e-06, 2.9802e-08], [ 0.0000e+00, 2.0489e-08, 0.0000e+00, ..., 1.4883e-06, 1.4212e-06, 2.0452e-06], [-1.8626e-09, -9.3132e-09, 0.0000e+00, ..., 6.9365e-06, 8.6427e-07, 1.6578e-07], ..., [ 0.0000e+00, 2.2352e-08, 0.0000e+00, ..., 9.6858e-07, 2.2445e-06, 1.8440e-07], [ 0.0000e+00, 3.5390e-08, 0.0000e+00, ..., 4.6253e-05, 5.2214e-05, 1.0990e-07], [ 0.0000e+00, 3.5390e-08, 0.0000e+00, ..., 1.0528e-05, 6.3963e-06, 2.4233e-06]], device='cuda:0') Epoch 153, bias, value: tensor([ 0.0055, -0.0153, 0.0009, 0.0308, -0.0045, 0.0299, 0.0049, 0.0166, -0.0024, -0.0120], device='cuda:0'), grad: tensor([-8.2016e-05, 1.9938e-05, 2.6599e-05, 1.7598e-05, -2.2829e-05, 4.9448e-04, -6.0034e-04, -9.7156e-06, 1.1307e-04, 4.2826e-05], device='cuda:0') 100 0.0001 changing lr epoch 152, time 214.72, cls_loss 0.0021 cls_loss_mapping 0.0038 cls_loss_causal 0.5333 re_mapping 0.0060 re_causal 0.0188 /// teacc 98.99 lr 0.00010000 Epoch 154, weight, value: tensor([[-0.0319, -0.0022, -0.0741, ..., 0.0226, -0.0515, -0.0119], [ 0.0431, -0.0370, -0.0189, ..., 0.0051, -0.0405, -0.2287], [-0.0056, 0.0237, 0.0026, ..., -0.0831, 0.0990, -0.0397], ..., [ 0.0009, -0.0346, -0.0136, ..., -0.0825, -0.1492, 0.0244], [-0.0223, -0.0490, -0.0144, ..., -0.0816, 0.0476, -0.1500], [-0.1299, -0.0074, -0.0085, ..., -0.0822, -0.0976, -0.0775]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -0.0000e+00, 0.0000e+00, ..., 4.9733e-07, 6.7987e-07, 5.9605e-08], [ 0.0000e+00, 7.4506e-09, 0.0000e+00, ..., 2.6077e-08, 1.2666e-07, 3.5018e-07], [ 0.0000e+00, -2.6077e-08, 0.0000e+00, ..., 7.0781e-08, 8.8662e-07, 1.3597e-07], ..., [ 0.0000e+00, -8.1956e-08, 0.0000e+00, ..., 0.0000e+00, 1.1735e-07, 1.4342e-07], [ 0.0000e+00, 9.3132e-09, 0.0000e+00, ..., 2.5891e-07, 1.3672e-06, 3.7067e-07], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 2.4214e-08, 4.4331e-07, 2.6785e-06]], device='cuda:0') Epoch 154, bias, value: tensor([ 0.0064, -0.0154, 0.0007, 0.0310, -0.0044, 0.0294, 0.0053, 0.0164, -0.0025, -0.0118], device='cuda:0'), grad: tensor([ 1.4808e-06, 9.5740e-07, 2.3171e-06, 7.3761e-06, -1.5795e-05, -1.3538e-05, 1.8049e-06, -1.1176e-07, 6.0201e-06, 9.4846e-06], device='cuda:0') 100 0.0001 changing lr epoch 153, time 214.79, cls_loss 0.0022 cls_loss_mapping 0.0043 cls_loss_causal 0.5464 re_mapping 0.0063 re_causal 0.0206 /// teacc 99.00 lr 0.00010000 Epoch 155, weight, value: tensor([[-0.0322, -0.0021, -0.0745, ..., 0.0224, -0.0517, -0.0126], [ 0.0432, -0.0390, -0.0190, ..., 0.0046, -0.0406, -0.2290], [-0.0056, 0.0239, 0.0019, ..., -0.0838, 0.0993, -0.0397], ..., [ 0.0032, -0.0342, -0.0139, ..., -0.0827, -0.1495, 0.0264], [-0.0223, -0.0493, -0.0139, ..., -0.0830, 0.0474, -0.1501], [-0.1319, -0.0077, -0.0086, ..., -0.0824, -0.0983, -0.0775]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.4640e-06, 7.5437e-07, 9.3132e-09], [ 1.1176e-08, 3.7253e-09, 0.0000e+00, ..., 1.6764e-08, 1.0818e-05, 3.5390e-08], [ 3.7253e-09, -1.1176e-08, 0.0000e+00, ..., 1.4901e-08, 1.1265e-04, 5.5879e-08], ..., [ 4.4703e-08, -1.1176e-08, 0.0000e+00, ..., 9.3132e-09, 3.3081e-06, 9.4995e-08], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., -1.0058e-07, -2.9707e-04, 1.5087e-07], [ 1.8626e-09, 3.7253e-09, 0.0000e+00, ..., 6.5193e-08, 8.2254e-06, 2.7753e-07]], device='cuda:0') Epoch 155, bias, value: tensor([ 0.0062, -0.0157, 0.0008, 0.0301, -0.0039, 0.0301, 0.0052, 0.0167, -0.0031, -0.0118], device='cuda:0'), grad: tensor([ 4.6901e-06, 1.7196e-05, 1.6797e-04, 6.0081e-04, 2.2396e-05, -4.1747e-04, 4.2677e-05, 1.5855e-05, -5.0402e-04, 5.0724e-05], device='cuda:0') 100 0.0001 changing lr epoch 154, time 214.62, cls_loss 0.0022 cls_loss_mapping 0.0046 cls_loss_causal 0.5244 re_mapping 0.0063 re_causal 0.0191 /// teacc 98.99 lr 0.00010000 Epoch 156, weight, value: tensor([[-0.0327, -0.0023, -0.0746, ..., 0.0224, -0.0527, -0.0128], [ 0.0462, -0.0447, -0.0188, ..., 0.0039, -0.0411, -0.2292], [-0.0072, 0.0244, 0.0016, ..., -0.0865, 0.0994, -0.0397], ..., [ 0.0030, -0.0345, -0.0140, ..., -0.0830, -0.1500, 0.0264], [-0.0226, -0.0499, -0.0140, ..., -0.0850, 0.0459, -0.1501], [-0.1334, -0.0067, -0.0086, ..., -0.0822, -0.0987, -0.0775]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., -1.6242e-05, 3.1665e-08, 2.0489e-08], [ 9.3132e-08, 0.0000e+00, 0.0000e+00, ..., 5.2154e-07, 1.0803e-07, 5.7742e-08], [ 5.9605e-08, 3.7253e-09, 0.0000e+00, ..., 5.0664e-07, 7.6368e-08, 2.6077e-08], ..., [ 4.2841e-08, 0.0000e+00, 0.0000e+00, ..., 3.9861e-07, 5.2154e-08, 3.1665e-08], [ 3.7253e-09, -1.6764e-08, 0.0000e+00, ..., 1.2740e-06, -2.7940e-07, 2.7940e-08], [ 2.4214e-08, 0.0000e+00, 0.0000e+00, ..., 4.7609e-06, 2.2538e-07, 9.1344e-06]], device='cuda:0') Epoch 156, bias, value: tensor([ 0.0059, -0.0160, 0.0004, 0.0298, -0.0032, 0.0317, 0.0034, 0.0168, -0.0042, -0.0115], device='cuda:0'), grad: tensor([-3.0696e-05, -1.8626e-09, 3.4627e-06, -7.2084e-06, -1.2434e-04, 2.8554e-06, 2.1800e-05, 2.3358e-06, 4.9956e-06, 1.2672e-04], device='cuda:0') 100 0.0001 changing lr epoch 155, time 214.90, cls_loss 0.0025 cls_loss_mapping 0.0041 cls_loss_causal 0.5232 re_mapping 0.0065 re_causal 0.0190 /// teacc 98.90 lr 0.00010000 Epoch 157, weight, value: tensor([[-0.0330, -0.0022, -0.0753, ..., 0.0207, -0.0534, -0.0121], [ 0.0467, -0.0497, -0.0188, ..., 0.0033, -0.0412, -0.2293], [-0.0069, 0.0262, 0.0018, ..., -0.0867, 0.1005, -0.0393], ..., [ 0.0030, -0.0374, -0.0142, ..., -0.0837, -0.1507, 0.0263], [-0.0226, -0.0508, -0.0140, ..., -0.0851, 0.0460, -0.1502], [-0.1344, -0.0075, -0.0087, ..., -0.0830, -0.0992, -0.0777]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -6.2585e-07, 4.9174e-07, -1.1921e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.0781e-08, 9.1270e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 9.3691e-07, 1.0505e-06, -1.8626e-09], ..., [-0.0000e+00, -3.7253e-09, 0.0000e+00, ..., 1.8626e-08, 4.2841e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.0547e-07, 3.5204e-07, 5.5879e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.5949e-07, 1.1548e-07, 3.5390e-08]], device='cuda:0') Epoch 157, bias, value: tensor([ 0.0043, -0.0164, 0.0010, 0.0293, -0.0030, 0.0316, 0.0052, 0.0178, -0.0042, -0.0124], device='cuda:0'), grad: tensor([-5.3085e-07, 8.2701e-06, 3.7014e-05, -2.6971e-06, -5.0724e-05, 1.1809e-06, -4.8354e-06, 8.8960e-06, 4.4703e-06, -1.0375e-06], device='cuda:0') 100 0.0001 changing lr epoch 156, time 214.54, cls_loss 0.0024 cls_loss_mapping 0.0040 cls_loss_causal 0.5499 re_mapping 0.0062 re_causal 0.0196 /// teacc 98.93 lr 0.00010000 Epoch 158, weight, value: tensor([[-0.0333, -0.0024, -0.0756, ..., 0.0208, -0.0536, -0.0122], [ 0.0469, -0.0537, -0.0189, ..., 0.0033, -0.0415, -0.2294], [-0.0070, 0.0287, 0.0039, ..., -0.0875, 0.1007, -0.0393], ..., [ 0.0032, -0.0374, -0.0147, ..., -0.0845, -0.1526, 0.0264], [-0.0227, -0.0512, -0.0140, ..., -0.0853, 0.0464, -0.1502], [-0.1352, -0.0079, -0.0087, ..., -0.0832, -0.0997, -0.0778]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., -1.5702e-06, -5.4948e-07, 9.3132e-09], [ 0.0000e+00, 1.8626e-09, -6.3330e-07, ..., 1.3039e-08, 2.4214e-08, 3.3528e-08], [ 0.0000e+00, -1.3597e-07, 3.0734e-07, ..., 9.3132e-09, -9.4995e-07, -1.1362e-07], ..., [ 0.0000e+00, 3.1665e-08, 1.8254e-07, ..., 9.3132e-09, 2.2352e-07, 1.0990e-07], [ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., 1.0058e-07, 1.8254e-07, 3.7253e-08], [ 0.0000e+00, -1.8626e-09, 8.0094e-08, ..., 2.1979e-07, 1.1176e-07, 2.5705e-07]], device='cuda:0') Epoch 158, bias, value: tensor([ 0.0043, -0.0169, 0.0007, 0.0291, -0.0009, 0.0314, 0.0057, 0.0182, -0.0036, -0.0144], device='cuda:0'), grad: tensor([-2.3134e-06, 2.1148e-04, 3.9816e-05, 5.1618e-05, -2.2531e-05, 1.3299e-06, 5.0589e-06, -2.9898e-04, 3.5260e-06, 1.0565e-05], device='cuda:0') 100 0.0001 changing lr epoch 157, time 214.63, cls_loss 0.0018 cls_loss_mapping 0.0052 cls_loss_causal 0.5475 re_mapping 0.0063 re_causal 0.0192 /// teacc 99.09 lr 0.00010000 Epoch 159, weight, value: tensor([[-0.0340, -0.0023, -0.0766, ..., 0.0212, -0.0542, -0.0122], [ 0.0468, -0.0545, -0.0188, ..., 0.0030, -0.0417, -0.2296], [-0.0068, 0.0299, 0.0011, ..., -0.0884, 0.1014, -0.0392], ..., [ 0.0035, -0.0379, -0.0137, ..., -0.0851, -0.1532, 0.0264], [-0.0226, -0.0515, -0.0126, ..., -0.0857, 0.0465, -0.1503], [-0.1359, -0.0083, -0.0085, ..., -0.0827, -0.0999, -0.0779]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.3730e-06, 1.3039e-07, -2.4643e-06], [ 1.6764e-08, 0.0000e+00, 0.0000e+00, ..., -1.0118e-05, -4.0494e-06, 5.7742e-08], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 5.8487e-07, -1.2144e-06, 1.3039e-07], ..., [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 1.4156e-07, 4.2841e-07, 6.1467e-08], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 2.5779e-06, -2.2724e-07, 2.6450e-07], [ 1.4901e-08, 0.0000e+00, 0.0000e+00, ..., 9.1642e-07, 2.0862e-07, 5.4762e-07]], device='cuda:0') Epoch 159, bias, value: tensor([ 0.0042, -0.0169, 0.0010, 0.0288, -0.0010, 0.0313, 0.0054, 0.0184, -0.0034, -0.0144], device='cuda:0'), grad: tensor([-2.2680e-05, -1.3137e-04, 2.8554e-06, 2.5276e-06, 8.7768e-06, 7.8306e-06, 9.0659e-05, 2.2203e-06, 2.9266e-05, 9.8944e-06], device='cuda:0') 100 0.0001 changing lr epoch 158, time 214.79, cls_loss 0.0019 cls_loss_mapping 0.0036 cls_loss_causal 0.5177 re_mapping 0.0057 re_causal 0.0183 /// teacc 98.91 lr 0.00010000 Epoch 160, weight, value: tensor([[-0.0349, -0.0023, -0.0767, ..., 0.0216, -0.0543, -0.0116], [ 0.0466, -0.0552, -0.0188, ..., 0.0028, -0.0419, -0.2299], [-0.0071, 0.0303, 0.0012, ..., -0.0896, 0.1015, -0.0394], ..., [ 0.0041, -0.0379, -0.0138, ..., -0.0858, -0.1536, 0.0271], [-0.0211, -0.0517, -0.0126, ..., -0.0858, 0.0467, -0.1499], [-0.1389, -0.0086, -0.0085, ..., -0.0828, -0.1003, -0.0777]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -5.5879e-09, 0.0000e+00, ..., -0.0000e+00, 7.4990e-06, 3.9116e-08], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 2.2352e-08, 1.9241e-06, 1.5274e-07], [-1.5646e-07, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, 6.7800e-06, -3.6694e-07], ..., [-3.7253e-09, -0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 1.6019e-07, 1.5087e-07], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 3.8557e-07, -2.6479e-05, 2.7381e-07], [ 1.8626e-09, 1.8626e-09, 0.0000e+00, ..., 8.9407e-08, 1.9781e-06, -7.2159e-06]], device='cuda:0') Epoch 160, bias, value: tensor([ 0.0046, -0.0169, 0.0007, 0.0290, -0.0019, 0.0312, 0.0064, 0.0186, -0.0032, -0.0142], device='cuda:0'), grad: tensor([ 1.4320e-05, 4.9639e-04, 5.1320e-05, 1.6123e-05, 1.1164e-04, 4.5449e-06, 1.3426e-05, -6.4325e-04, -1.7837e-05, -4.6283e-05], device='cuda:0') 100 0.0001 changing lr epoch 159, time 214.83, cls_loss 0.0019 cls_loss_mapping 0.0030 cls_loss_causal 0.5154 re_mapping 0.0057 re_causal 0.0190 /// teacc 98.91 lr 0.00010000 Epoch 161, weight, value: tensor([[-0.0355, -0.0024, -0.0768, ..., 0.0217, -0.0545, -0.0114], [ 0.0466, -0.0562, -0.0188, ..., 0.0026, -0.0421, -0.2304], [-0.0074, 0.0305, 0.0013, ..., -0.0898, 0.1015, -0.0395], ..., [ 0.0050, -0.0371, -0.0139, ..., -0.0864, -0.1553, 0.0269], [-0.0210, -0.0518, -0.0126, ..., -0.0858, 0.0469, -0.1501], [-0.1412, -0.0098, -0.0085, ..., -0.0826, -0.1010, -0.0763]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.8626e-09, 0.0000e+00, ..., -1.3597e-07, 3.2596e-07, 2.7940e-08], [ 0.0000e+00, 5.5879e-09, 0.0000e+00, ..., 1.8626e-08, 1.9316e-06, 6.3330e-08], [ 0.0000e+00, 1.4342e-07, 0.0000e+00, ..., 2.6077e-08, -7.6666e-06, -4.7311e-07], ..., [ 0.0000e+00, -1.6205e-07, 0.0000e+00, ..., 0.0000e+00, 8.0839e-07, 1.1548e-07], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 6.8918e-08, 7.7672e-07, 6.5193e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.0489e-08, 1.1548e-07, -1.1362e-07]], device='cuda:0') Epoch 161, bias, value: tensor([ 0.0045, -0.0169, -0.0003, 0.0294, -0.0028, 0.0320, 0.0060, 0.0185, -0.0031, -0.0134], device='cuda:0'), grad: tensor([ 2.9430e-06, 2.9862e-05, 8.9183e-06, 8.7321e-06, 8.0243e-06, 1.8924e-06, 2.1327e-06, -1.1617e-04, 6.1542e-06, 4.7714e-05], device='cuda:0') 100 0.0001 changing lr epoch 160, time 214.69, cls_loss 0.0018 cls_loss_mapping 0.0028 cls_loss_causal 0.5208 re_mapping 0.0065 re_causal 0.0188 /// teacc 98.91 lr 0.00010000 Epoch 162, weight, value: tensor([[-0.0359, -0.0038, -0.0769, ..., 0.0217, -0.0547, -0.0116], [ 0.0466, -0.0590, -0.0183, ..., 0.0021, -0.0421, -0.2309], [-0.0077, 0.0317, 0.0012, ..., -0.0900, 0.1020, -0.0396], ..., [ 0.0058, -0.0342, -0.0142, ..., -0.0870, -0.1558, 0.0270], [-0.0210, -0.0548, -0.0126, ..., -0.0861, 0.0468, -0.1502], [-0.1431, -0.0102, -0.0087, ..., -0.0828, -0.1016, -0.0766]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.4506e-08, 4.1537e-07, 5.5879e-09], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 1.0990e-07, 5.9418e-07, 2.6077e-08], [ 5.5879e-09, -1.8626e-09, 0.0000e+00, ..., 5.2154e-08, -1.2498e-06, 3.7253e-08], ..., [ 2.2352e-08, 0.0000e+00, 0.0000e+00, ..., 9.3132e-09, 1.2740e-06, 1.1362e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4715e-07, -1.5013e-05, 7.4506e-09], [ 5.5879e-09, 0.0000e+00, 0.0000e+00, ..., 4.6566e-08, 1.1638e-05, -1.6764e-08]], device='cuda:0') Epoch 162, bias, value: tensor([ 0.0045, -0.0167, -0.0004, 0.0290, -0.0025, 0.0318, 0.0063, 0.0190, -0.0033, -0.0139], device='cuda:0'), grad: tensor([ 1.0282e-06, 1.2647e-06, -1.9185e-06, 1.1791e-06, 5.1819e-06, 2.9020e-06, -2.3674e-06, 3.6545e-06, -3.3081e-05, 2.2098e-05], device='cuda:0') 100 0.0001 changing lr epoch 161, time 214.48, cls_loss 0.0018 cls_loss_mapping 0.0029 cls_loss_causal 0.5143 re_mapping 0.0056 re_causal 0.0187 /// teacc 98.96 lr 0.00010000 Epoch 163, weight, value: tensor([[-3.6175e-02, -4.2249e-03, -7.7493e-02, ..., 2.1844e-02, -5.5218e-02, -1.2040e-02], [ 4.5909e-02, -5.9068e-02, -1.8211e-02, ..., -3.1705e-04, -4.3087e-02, -2.3168e-01], [-7.7440e-03, 3.2182e-02, 8.4007e-05, ..., -8.7916e-02, 1.0384e-01, -3.9890e-02], ..., [ 5.8335e-03, -3.4449e-02, -1.6251e-02, ..., -8.7635e-02, -1.5711e-01, 2.6894e-02], [-2.0657e-02, -5.5914e-02, -1.2688e-02, ..., -8.6360e-02, 4.7059e-02, -1.5017e-01], [-1.4551e-01, -1.0856e-02, -8.7226e-03, ..., -8.2605e-02, -1.0214e-01, -7.6828e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.4587e-07, 3.1423e-06, 7.8231e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.2841e-08, 1.6857e-06, 4.0047e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3597e-07, -1.0319e-06, 6.4448e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.5879e-09, 2.7195e-06, 7.4506e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.0361e-07, 1.7357e-04, 3.2410e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-08, 4.4331e-06, 8.6799e-07]], device='cuda:0') Epoch 163, bias, value: tensor([ 0.0045, -0.0171, 0.0005, 0.0293, -0.0023, 0.0317, 0.0058, 0.0185, -0.0029, -0.0140], device='cuda:0'), grad: tensor([ 8.4937e-06, 4.8764e-06, 5.5805e-06, 1.7095e-04, -5.5492e-05, -6.8140e-04, 5.6833e-05, 1.4909e-05, 4.5347e-04, 2.1085e-05], device='cuda:0') 100 0.0001 changing lr epoch 162, time 214.38, cls_loss 0.0021 cls_loss_mapping 0.0033 cls_loss_causal 0.5394 re_mapping 0.0059 re_causal 0.0187 /// teacc 98.97 lr 0.00010000 Epoch 164, weight, value: tensor([[-3.6239e-02, -4.2950e-03, -7.7639e-02, ..., 2.1355e-02, -5.6479e-02, -1.2102e-02], [ 4.6074e-02, -5.9127e-02, -1.8193e-02, ..., -7.4355e-04, -4.3711e-02, -2.3206e-01], [-7.8192e-03, 3.2255e-02, -2.2185e-04, ..., -8.8292e-02, 1.0390e-01, -4.0020e-02], ..., [ 5.7885e-03, -3.4404e-02, -1.6731e-02, ..., -8.8033e-02, -1.5773e-01, 2.6760e-02], [-2.0701e-02, -5.6200e-02, -1.2584e-02, ..., -8.6329e-02, 4.8093e-02, -1.5018e-01], [-1.4594e-01, -1.0719e-02, -8.7326e-03, ..., -8.2350e-02, -1.0287e-01, -7.7015e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.4703e-07, 6.6683e-07, 8.3819e-09], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 1.2480e-07, 1.5181e-06, 2.6077e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.8440e-07, 2.5332e-07, -3.9116e-08], ..., [-3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 8.3819e-09, 1.1548e-07, 2.8871e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 2.8592e-07, 8.2925e-06, 7.4506e-09], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 6.0536e-08, 8.6520e-07, 8.1956e-08]], device='cuda:0') Epoch 164, bias, value: tensor([ 3.2093e-03, -1.6165e-02, -2.8765e-05, 2.9432e-02, -2.1408e-03, 3.1646e-02, 5.7994e-03, 1.7964e-02, -2.8449e-03, -1.3877e-02], device='cuda:0'), grad: tensor([ 1.6848e-06, -3.2663e-05, 1.0151e-06, 1.3053e-05, 6.6683e-06, -4.2737e-05, 7.5251e-06, 1.1288e-06, 4.1485e-05, 2.8349e-06], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 163---------------------------------------------------- epoch 163, time 230.81, cls_loss 0.0023 cls_loss_mapping 0.0043 cls_loss_causal 0.5119 re_mapping 0.0058 re_causal 0.0177 /// teacc 99.13 lr 0.00010000 Epoch 165, weight, value: tensor([[-0.0377, -0.0043, -0.0777, ..., 0.0225, -0.0562, -0.0123], [ 0.0461, -0.0593, -0.0181, ..., -0.0018, -0.0442, -0.2327], [-0.0079, 0.0323, -0.0004, ..., -0.0910, 0.1039, -0.0403], ..., [ 0.0060, -0.0342, -0.0169, ..., -0.0881, -0.1577, 0.0262], [-0.0207, -0.0566, -0.0126, ..., -0.0868, 0.0481, -0.1502], [-0.1474, -0.0109, -0.0087, ..., -0.0833, -0.1036, -0.0773]], device='cuda:0'), grad: tensor([[ 1.0524e-07, 9.3132e-10, 0.0000e+00, ..., -4.4424e-07, 1.8040e-06, 1.4808e-07], [ 1.1455e-07, 9.3132e-09, 0.0000e+00, ..., 3.3528e-08, 1.4091e-06, 1.6671e-07], [-1.8522e-05, 5.5879e-09, 0.0000e+00, ..., 5.7742e-08, -3.9250e-05, -2.5287e-05], ..., [ 1.4836e-06, -6.7055e-08, 0.0000e+00, ..., 4.6566e-09, 3.7607e-06, 2.0936e-06], [ 1.2547e-05, 3.7253e-09, 0.0000e+00, ..., 9.4064e-08, 4.7421e-04, 1.7151e-05], [ 2.7940e-09, 3.8184e-08, 0.0000e+00, ..., 3.0454e-07, -5.7173e-04, 1.0245e-08]], device='cuda:0') Epoch 165, bias, value: tensor([ 0.0040, -0.0154, -0.0006, 0.0292, -0.0023, 0.0321, 0.0061, 0.0151, -0.0008, -0.0138], device='cuda:0'), grad: tensor([ 6.1840e-06, 5.4464e-06, -2.8834e-05, 4.0196e-06, 3.9911e-04, 4.3225e-04, 7.9274e-06, 6.0052e-06, 1.8330e-03, -2.6665e-03], device='cuda:0') 100 0.0001 changing lr epoch 164, time 214.59, cls_loss 0.0017 cls_loss_mapping 0.0041 cls_loss_causal 0.5276 re_mapping 0.0059 re_causal 0.0179 /// teacc 99.10 lr 0.00010000 Epoch 166, weight, value: tensor([[-0.0380, -0.0043, -0.0782, ..., 0.0234, -0.0563, -0.0123], [ 0.0461, -0.0594, -0.0178, ..., -0.0023, -0.0447, -0.2332], [-0.0075, 0.0324, -0.0004, ..., -0.0914, 0.1043, -0.0401], ..., [ 0.0061, -0.0342, -0.0171, ..., -0.0880, -0.1580, 0.0262], [-0.0211, -0.0571, -0.0129, ..., -0.0870, 0.0479, -0.1505], [-0.1478, -0.0110, -0.0088, ..., -0.0836, -0.1022, -0.0778]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.8394e-07, 7.8231e-08, 1.8626e-09], [-3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 1.4901e-08, 2.7008e-06, 4.6566e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.0489e-08, -2.8964e-06, 2.7940e-09], ..., [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, 1.0803e-07, 3.7253e-09], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 2.5146e-08, 6.6124e-07, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.8918e-08, -6.7241e-07, 7.2643e-08]], device='cuda:0') Epoch 166, bias, value: tensor([ 0.0045, -0.0147, -0.0007, 0.0290, -0.0028, 0.0321, 0.0061, 0.0148, -0.0014, -0.0127], device='cuda:0'), grad: tensor([-6.9849e-08, 3.6508e-06, 2.0787e-05, -3.0175e-05, 1.5981e-06, 2.3730e-06, 5.8208e-07, 5.1185e-06, 9.9614e-06, -1.3925e-05], device='cuda:0') 100 0.0001 changing lr epoch 165, time 214.44, cls_loss 0.0019 cls_loss_mapping 0.0044 cls_loss_causal 0.5231 re_mapping 0.0061 re_causal 0.0183 /// teacc 99.08 lr 0.00010000 Epoch 167, weight, value: tensor([[-0.0384, -0.0043, -0.0782, ..., 0.0235, -0.0569, -0.0127], [ 0.0461, -0.0594, -0.0173, ..., -0.0033, -0.0451, -0.2339], [-0.0075, 0.0324, -0.0006, ..., -0.0915, 0.1049, -0.0404], ..., [ 0.0062, -0.0341, -0.0171, ..., -0.0882, -0.1586, 0.0259], [-0.0210, -0.0571, -0.0129, ..., -0.0871, 0.0480, -0.1505], [-0.1486, -0.0110, -0.0089, ..., -0.0833, -0.1027, -0.0785]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.7265e-07, 2.7958e-06, 2.7940e-09], [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 3.9116e-08, 1.1455e-07, 3.7253e-09], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., -1.6820e-06, -5.1782e-06, 1.7695e-08], ..., [ 0.0000e+00, -2.6077e-08, 0.0000e+00, ..., 1.3970e-08, 3.3155e-07, 1.8626e-08], [ 0.0000e+00, 1.6764e-08, 0.0000e+00, ..., 5.9605e-08, 2.5798e-07, 6.5193e-09], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 4.6566e-08, 7.8231e-08, -4.5635e-08]], device='cuda:0') Epoch 167, bias, value: tensor([ 0.0043, -0.0154, -0.0006, 0.0290, -0.0022, 0.0319, 0.0062, 0.0148, -0.0015, -0.0123], device='cuda:0'), grad: tensor([ 7.7188e-06, -4.2655e-07, -1.4022e-05, 7.8306e-06, 1.3737e-06, -3.7737e-06, 7.5251e-07, 6.3609e-07, 1.2172e-06, -1.3104e-06], device='cuda:0') 100 0.0001 changing lr epoch 166, time 214.54, cls_loss 0.0019 cls_loss_mapping 0.0037 cls_loss_causal 0.4984 re_mapping 0.0058 re_causal 0.0181 /// teacc 99.08 lr 0.00010000 Epoch 168, weight, value: tensor([[-0.0385, -0.0043, -0.0793, ..., 0.0250, -0.0562, -0.0122], [ 0.0462, -0.0594, -0.0168, ..., -0.0038, -0.0453, -0.2347], [-0.0074, 0.0324, -0.0011, ..., -0.0924, 0.1053, -0.0408], ..., [ 0.0063, -0.0342, -0.0176, ..., -0.0895, -0.1601, 0.0258], [-0.0208, -0.0573, -0.0145, ..., -0.0864, 0.0487, -0.1499], [-0.1493, -0.0110, -0.0091, ..., -0.0841, -0.1032, -0.0785]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.4100e-06, 2.9802e-08, 1.8626e-09], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 2.1420e-08, 1.2573e-07, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.8871e-08, -1.2396e-06, -1.6764e-08], ..., [-6.5193e-09, 0.0000e+00, 0.0000e+00, ..., 9.3132e-09, 5.2713e-07, 9.3132e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.2072e-07, -2.4401e-07, 6.5193e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.0094e-08, 3.2969e-07, -3.8184e-08]], device='cuda:0') Epoch 168, bias, value: tensor([ 0.0054, -0.0150, -0.0006, 0.0288, -0.0023, 0.0318, 0.0055, 0.0144, -0.0014, -0.0122], device='cuda:0'), grad: tensor([-1.3886e-06, 4.4145e-07, -1.8841e-06, -3.2634e-05, 8.6706e-07, 5.1558e-06, 2.1532e-06, 1.4165e-06, 2.2441e-05, 3.3937e-06], device='cuda:0') 100 0.0001 changing lr epoch 167, time 214.60, cls_loss 0.0024 cls_loss_mapping 0.0045 cls_loss_causal 0.5148 re_mapping 0.0054 re_causal 0.0168 /// teacc 98.89 lr 0.00010000 Epoch 169, weight, value: tensor([[-0.0389, -0.0043, -0.0824, ..., 0.0271, -0.0568, -0.0119], [ 0.0460, -0.0594, -0.0189, ..., -0.0044, -0.0461, -0.2377], [-0.0075, 0.0325, -0.0042, ..., -0.0930, 0.1062, -0.0407], ..., [ 0.0070, -0.0342, -0.0147, ..., -0.0902, -0.1613, 0.0258], [-0.0209, -0.0574, -0.0175, ..., -0.0869, 0.0487, -0.1499], [-0.1505, -0.0110, -0.0106, ..., -0.0882, -0.1046, -0.0780]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 9.3132e-10, 0.0000e+00, ..., 4.5262e-07, 1.1427e-06, 1.9744e-07], [ 1.3970e-08, 9.3132e-10, 0.0000e+00, ..., 5.0887e-06, 1.3195e-05, 2.8778e-07], [ 6.5193e-09, 1.8626e-09, 0.0000e+00, ..., -6.6042e-05, -1.7333e-04, 2.8871e-07], ..., [ 1.2107e-08, -8.3819e-09, 0.0000e+00, ..., 9.8720e-08, 2.1271e-06, 1.4994e-07], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 3.9209e-07, 9.3319e-07, 7.2177e-07], [ 2.7008e-08, 9.3132e-10, 0.0000e+00, ..., 8.4750e-08, 1.8720e-07, 1.4889e-04]], device='cuda:0') Epoch 169, bias, value: tensor([ 0.0072, -0.0157, -0.0004, 0.0283, -0.0029, 0.0322, 0.0056, 0.0152, -0.0014, -0.0126], device='cuda:0'), grad: tensor([ 2.6524e-06, 1.9863e-05, -3.2854e-04, 1.1921e-06, -4.0936e-04, -9.6858e-08, 2.9421e-04, 4.2468e-06, 4.4294e-06, 4.1032e-04], device='cuda:0') 100 0.0001 changing lr epoch 168, time 214.68, cls_loss 0.0018 cls_loss_mapping 0.0036 cls_loss_causal 0.5260 re_mapping 0.0058 re_causal 0.0181 /// teacc 99.05 lr 0.00010000 Epoch 170, weight, value: tensor([[-0.0391, -0.0045, -0.0825, ..., 0.0255, -0.0596, -0.0120], [ 0.0462, -0.0594, -0.0188, ..., -0.0039, -0.0466, -0.2388], [-0.0075, 0.0325, -0.0041, ..., -0.0926, 0.1072, -0.0407], ..., [ 0.0071, -0.0341, -0.0147, ..., -0.0905, -0.1625, 0.0276], [-0.0210, -0.0575, -0.0178, ..., -0.0879, 0.0486, -0.1500], [-0.1515, -0.0111, -0.0107, ..., -0.0885, -0.1051, -0.0782]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.4261e-08, 1.8161e-07, 1.3039e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.2154e-08, 8.7544e-08, 1.6764e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 3.1665e-08, 8.3819e-09, 2.8871e-08], ..., [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 4.0978e-08, 2.3283e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.2399e-07, -3.1386e-07, 6.5193e-09], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 2.7008e-08, 1.7788e-07, 9.5926e-08]], device='cuda:0') Epoch 170, bias, value: tensor([ 0.0054, -0.0152, 0.0003, 0.0283, -0.0026, 0.0318, 0.0057, 0.0155, -0.0020, -0.0127], device='cuda:0'), grad: tensor([ 9.8441e-07, 1.1019e-05, 4.0680e-06, 1.7256e-05, 6.2212e-07, 4.0978e-06, -2.3413e-06, -7.3671e-05, 1.6823e-05, 2.1085e-05], device='cuda:0') 100 0.0001 changing lr epoch 169, time 214.61, cls_loss 0.0023 cls_loss_mapping 0.0046 cls_loss_causal 0.5360 re_mapping 0.0055 re_causal 0.0168 /// teacc 99.05 lr 0.00010000 Epoch 171, weight, value: tensor([[-0.0392, -0.0047, -0.0825, ..., 0.0251, -0.0607, -0.0116], [ 0.0459, -0.0595, -0.0188, ..., -0.0051, -0.0474, -0.2392], [-0.0075, 0.0331, -0.0042, ..., -0.0927, 0.1080, -0.0409], ..., [ 0.0076, -0.0344, -0.0146, ..., -0.0909, -0.1628, 0.0276], [-0.0211, -0.0593, -0.0178, ..., -0.0861, 0.0491, -0.1502], [-0.1531, -0.0112, -0.0107, ..., -0.0890, -0.1064, -0.0788]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., -8.0094e-08, 4.5635e-08, 2.7940e-09], [ 1.3132e-07, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 2.6822e-07, 1.9558e-07], [ 2.6077e-08, 0.0000e+00, 0.0000e+00, ..., 7.4506e-09, -1.3039e-07, 4.0978e-08], ..., [ 8.3819e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 1.2014e-07, 1.3970e-08], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 6.5193e-09, -2.5574e-06, -2.8871e-08], [ 1.0245e-08, 0.0000e+00, 0.0000e+00, ..., 2.4214e-08, 1.0012e-06, 4.0978e-08]], device='cuda:0') Epoch 171, bias, value: tensor([ 0.0048, -0.0149, 0.0002, 0.0285, -0.0025, 0.0318, 0.0060, 0.0161, -0.0023, -0.0134], device='cuda:0'), grad: tensor([ 6.3330e-08, 1.2647e-06, 1.4137e-06, -2.9318e-06, -1.6512e-06, 1.0449e-06, 8.5030e-07, 1.7546e-06, -4.4145e-06, 2.5891e-06], device='cuda:0') 100 0.0001 changing lr epoch 170, time 214.54, cls_loss 0.0022 cls_loss_mapping 0.0040 cls_loss_causal 0.5321 re_mapping 0.0054 re_causal 0.0175 /// teacc 99.02 lr 0.00010000 Epoch 172, weight, value: tensor([[-0.0397, -0.0048, -0.0827, ..., 0.0251, -0.0614, -0.0111], [ 0.0465, -0.0596, -0.0188, ..., -0.0081, -0.0484, -0.2398], [-0.0077, 0.0333, -0.0023, ..., -0.0941, 0.1091, -0.0413], ..., [ 0.0076, -0.0345, -0.0154, ..., -0.0921, -0.1639, 0.0276], [-0.0214, -0.0599, -0.0179, ..., -0.0862, 0.0489, -0.1503], [-0.1542, -0.0112, -0.0107, ..., -0.0891, -0.1055, -0.0793]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 1.5832e-07, 9.7416e-07, 1.8626e-09], [-3.0268e-07, 0.0000e+00, 0.0000e+00, ..., 5.7649e-07, 4.2580e-06, 1.1083e-07], [ 5.5879e-09, 0.0000e+00, 0.0000e+00, ..., 2.4568e-06, 9.7528e-06, 1.0245e-08], ..., [ 1.0245e-08, 0.0000e+00, 0.0000e+00, ..., 2.8312e-07, 1.2387e-06, 1.2107e-08], [ 3.3528e-08, 0.0000e+00, 0.0000e+00, ..., -2.9951e-05, -1.3101e-04, 6.5193e-09], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 1.1548e-07, 2.6543e-07, 5.2154e-08]], device='cuda:0') Epoch 172, bias, value: tensor([ 0.0046, -0.0147, 0.0006, 0.0295, -0.0023, 0.0300, 0.0069, 0.0158, -0.0032, -0.0124], device='cuda:0'), grad: tensor([ 1.5758e-06, 4.2543e-06, 1.8179e-05, -6.3777e-06, 3.0454e-06, 1.7893e-04, 2.7508e-05, 5.4836e-06, -2.3234e-04, -2.3749e-07], device='cuda:0') 100 0.0001 changing lr epoch 171, time 214.83, cls_loss 0.0020 cls_loss_mapping 0.0030 cls_loss_causal 0.5325 re_mapping 0.0054 re_causal 0.0180 /// teacc 98.92 lr 0.00010000 Epoch 173, weight, value: tensor([[-0.0400, -0.0048, -0.0827, ..., 0.0244, -0.0622, -0.0098], [ 0.0465, -0.0596, -0.0188, ..., -0.0085, -0.0489, -0.2400], [-0.0077, 0.0333, -0.0024, ..., -0.0942, 0.1069, -0.0407], ..., [ 0.0077, -0.0345, -0.0153, ..., -0.0925, -0.1614, 0.0276], [-0.0214, -0.0600, -0.0179, ..., -0.0858, 0.0490, -0.1504], [-0.1548, -0.0112, -0.0108, ..., -0.0894, -0.1057, -0.0795]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 5.5879e-09, 8.7544e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 2.2352e-08, 1.3690e-07, 2.3283e-08], [ 0.0000e+00, 0.0000e+00, 2.5146e-08, ..., 2.1420e-08, -3.6787e-07, 4.6566e-09], ..., [ 0.0000e+00, 0.0000e+00, 1.2107e-08, ..., 9.3132e-10, 1.5739e-07, 1.7695e-08], [ 0.0000e+00, 0.0000e+00, 3.1665e-08, ..., 3.9116e-08, 6.1095e-07, 4.6566e-09], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 1.1176e-08, 5.5600e-07, 2.0489e-07]], device='cuda:0') Epoch 173, bias, value: tensor([ 0.0039, -0.0149, -0.0019, 0.0293, -0.0025, 0.0301, 0.0073, 0.0181, -0.0033, -0.0120], device='cuda:0'), grad: tensor([ 3.2876e-07, -9.5516e-06, 2.0638e-06, 3.8445e-06, -1.8165e-05, -3.6377e-06, 1.2852e-07, 1.2301e-05, 2.0321e-06, 1.0610e-05], device='cuda:0') 100 0.0001 changing lr epoch 172, time 214.55, cls_loss 0.0032 cls_loss_mapping 0.0049 cls_loss_causal 0.5043 re_mapping 0.0057 re_causal 0.0163 /// teacc 98.98 lr 0.00010000 Epoch 174, weight, value: tensor([[-0.0425, -0.0048, -0.0828, ..., 0.0247, -0.0624, -0.0100], [ 0.0461, -0.0597, -0.0188, ..., -0.0110, -0.0519, -0.2403], [-0.0059, 0.0334, -0.0040, ..., -0.0943, 0.1080, -0.0402], ..., [ 0.0077, -0.0345, -0.0143, ..., -0.0937, -0.1615, 0.0276], [-0.0215, -0.0601, -0.0179, ..., -0.0844, 0.0505, -0.1505], [-0.1591, -0.0113, -0.0108, ..., -0.0890, -0.1060, -0.0797]], device='cuda:0'), grad: tensor([[ 9.8720e-08, 0.0000e+00, 0.0000e+00, ..., 1.2536e-06, 2.1793e-06, 3.7253e-09], [ 4.6566e-09, 0.0000e+00, 0.0000e+00, ..., 6.7987e-08, 2.1327e-07, 2.0489e-08], [ 1.8626e-08, 0.0000e+00, 0.0000e+00, ..., 1.4435e-07, -4.9360e-07, -2.3283e-08], ..., [ 1.6764e-08, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, 3.7160e-07, 1.6764e-08], [ 7.4506e-09, 0.0000e+00, 0.0000e+00, ..., 5.0291e-08, -4.1761e-06, 2.7940e-09], [ 1.0058e-07, 0.0000e+00, 0.0000e+00, ..., 9.2201e-08, 3.5837e-06, 9.2201e-08]], device='cuda:0') Epoch 174, bias, value: tensor([ 0.0039, -0.0168, -0.0011, 0.0308, -0.0048, 0.0276, 0.0057, 0.0181, -0.0022, -0.0091], device='cuda:0'), grad: tensor([ 6.4336e-06, 5.1502e-07, -4.9919e-07, 1.2684e-04, 1.9744e-06, -1.2982e-04, -1.1072e-05, 1.6494e-06, -8.7172e-06, 1.2584e-05], device='cuda:0') 100 0.0001 changing lr epoch 173, time 214.64, cls_loss 0.0026 cls_loss_mapping 0.0051 cls_loss_causal 0.4920 re_mapping 0.0058 re_causal 0.0167 /// teacc 98.89 lr 0.00010000 Epoch 175, weight, value: tensor([[-0.0440, -0.0049, -0.0829, ..., 0.0242, -0.0635, -0.0098], [ 0.0459, -0.0597, -0.0189, ..., -0.0112, -0.0533, -0.2406], [-0.0056, 0.0334, -0.0059, ..., -0.0945, 0.1089, -0.0399], ..., [ 0.0076, -0.0344, -0.0130, ..., -0.0954, -0.1616, 0.0274], [-0.0215, -0.0603, -0.0181, ..., -0.0847, 0.0506, -0.1508], [-0.1616, -0.0116, -0.0108, ..., -0.0898, -0.1068, -0.0828]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., -5.5581e-05, -3.2127e-05, 1.5832e-08], [-7.4506e-09, 2.7940e-09, 0.0000e+00, ..., 7.5437e-08, 1.1735e-07, 1.7881e-07], [ 1.8626e-09, -4.3027e-07, 0.0000e+00, ..., 2.9895e-07, -1.3057e-06, 8.1025e-08], ..., [ 9.3132e-10, 3.4738e-07, 0.0000e+00, ..., 5.8673e-08, 1.2834e-06, 3.1106e-07], [ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 1.1763e-06, 6.7521e-07, 3.6322e-08], [ 8.3819e-09, 0.0000e+00, 0.0000e+00, ..., 6.6496e-07, 3.9395e-07, 9.7509e-07]], device='cuda:0') Epoch 175, bias, value: tensor([ 0.0032, -0.0177, -0.0005, 0.0310, -0.0007, 0.0280, 0.0059, 0.0180, -0.0025, -0.0129], device='cuda:0'), grad: tensor([-9.9480e-05, 3.9302e-06, -4.1053e-06, -4.0233e-06, 1.7196e-05, 5.1081e-05, 9.0897e-05, -1.5432e-06, 8.5980e-06, -6.2704e-05], device='cuda:0') 100 0.0001 changing lr epoch 174, time 214.61, cls_loss 0.0026 cls_loss_mapping 0.0047 cls_loss_causal 0.5372 re_mapping 0.0057 re_causal 0.0180 /// teacc 99.05 lr 0.00010000 Epoch 176, weight, value: tensor([[-0.0437, -0.0049, -0.0829, ..., 0.0243, -0.0640, -0.0099], [ 0.0461, -0.0597, -0.0187, ..., -0.0114, -0.0537, -0.2407], [-0.0057, 0.0335, -0.0058, ..., -0.0946, 0.1093, -0.0400], ..., [ 0.0075, -0.0344, -0.0131, ..., -0.0963, -0.1617, 0.0274], [-0.0215, -0.0603, -0.0180, ..., -0.0852, 0.0506, -0.1509], [-0.1625, -0.0116, -0.0108, ..., -0.0900, -0.1073, -0.0822]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.7695e-08, 1.3439e-06, 1.8626e-09], [ 4.6566e-09, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 1.3132e-07, 2.3283e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, -8.1025e-08, 6.5193e-09], ..., [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.6042e-07, 1.6764e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 4.5635e-08, -5.2154e-08, 1.2107e-08], [ 6.5193e-09, 0.0000e+00, 0.0000e+00, ..., 8.3819e-09, 9.3691e-07, 4.1910e-08]], device='cuda:0') Epoch 176, bias, value: tensor([ 0.0030, -0.0180, -0.0012, 0.0313, -0.0022, 0.0281, 0.0052, 0.0184, -0.0027, -0.0108], device='cuda:0'), grad: tensor([ 8.7470e-06, 6.2305e-07, 9.3691e-07, 1.7628e-05, 9.6187e-06, -2.4647e-05, 2.5742e-06, 4.5687e-05, 2.6412e-06, -6.3717e-05], device='cuda:0') 100 0.0001 changing lr epoch 175, time 214.15, cls_loss 0.0022 cls_loss_mapping 0.0037 cls_loss_causal 0.5475 re_mapping 0.0052 re_causal 0.0172 /// teacc 98.93 lr 0.00010000 Epoch 177, weight, value: tensor([[-0.0460, -0.0050, -0.0829, ..., 0.0245, -0.0646, -0.0104], [ 0.0457, -0.0602, -0.0188, ..., -0.0119, -0.0539, -0.2412], [-0.0057, 0.0334, -0.0062, ..., -0.0950, 0.1095, -0.0401], ..., [ 0.0109, -0.0338, -0.0130, ..., -0.0964, -0.1618, 0.0278], [-0.0216, -0.0605, -0.0180, ..., -0.0856, 0.0506, -0.1510], [-0.1660, -0.0117, -0.0109, ..., -0.0902, -0.1080, -0.0825]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.0757e-07, 6.0536e-08, 1.8626e-09], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.0245e-08, 2.3842e-07, 8.3819e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9558e-08, -5.5544e-06, -2.9802e-08], ..., [ 0.0000e+00, -2.7940e-09, 0.0000e+00, ..., 3.7253e-09, 5.1148e-06, 1.7695e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.8673e-08, 1.5274e-07, 7.4506e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.2154e-08, 1.1548e-07, -4.8429e-08]], device='cuda:0') Epoch 177, bias, value: tensor([ 0.0029, -0.0183, -0.0013, 0.0315, -0.0023, 0.0281, 0.0051, 0.0195, -0.0028, -0.0115], device='cuda:0'), grad: tensor([-1.4501e-06, 1.9148e-06, -2.6733e-05, 1.2862e-06, 1.6280e-06, -1.7677e-06, 4.5355e-07, 2.5243e-05, 7.6182e-07, -1.3085e-06], device='cuda:0') 100 0.0001 changing lr epoch 176, time 214.34, cls_loss 0.0027 cls_loss_mapping 0.0037 cls_loss_causal 0.5293 re_mapping 0.0054 re_causal 0.0158 /// teacc 99.00 lr 0.00010000 Epoch 178, weight, value: tensor([[-0.0486, -0.0050, -0.0830, ..., 0.0247, -0.0663, -0.0106], [ 0.0449, -0.0602, -0.0188, ..., -0.0124, -0.0543, -0.2421], [-0.0043, 0.0335, -0.0071, ..., -0.0957, 0.1100, -0.0390], ..., [ 0.0129, -0.0337, -0.0133, ..., -0.0970, -0.1622, 0.0291], [-0.0214, -0.0608, -0.0181, ..., -0.0862, 0.0511, -0.1512], [-0.1689, -0.0120, -0.0109, ..., -0.0904, -0.1089, -0.0830]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.7229e-07, 1.7323e-07, 3.3528e-08], [ 1.5832e-08, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 1.4529e-07, 3.5297e-07], [ 8.3819e-09, 0.0000e+00, 0.0000e+00, ..., 1.2107e-07, 5.2154e-08, 4.3772e-08], ..., [ 9.4995e-08, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 1.2573e-07, 4.7125e-07], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., -1.5367e-07, -5.2247e-07, 1.0245e-07], [ 1.1176e-08, 0.0000e+00, 0.0000e+00, ..., 6.5193e-09, 6.3237e-07, -9.5740e-07]], device='cuda:0') Epoch 178, bias, value: tensor([ 0.0025, -0.0181, -0.0008, 0.0310, -0.0026, 0.0283, 0.0055, 0.0195, -0.0025, -0.0117], device='cuda:0'), grad: tensor([ 2.6021e-06, 1.1884e-05, 2.2594e-06, -5.2974e-06, 3.9116e-06, -5.0776e-06, 6.8434e-06, -9.4026e-06, 1.7807e-06, -9.5814e-06], device='cuda:0') 100 0.0001 changing lr epoch 177, time 214.21, cls_loss 0.0019 cls_loss_mapping 0.0026 cls_loss_causal 0.5177 re_mapping 0.0056 re_causal 0.0179 /// teacc 99.03 lr 0.00010000 Epoch 179, weight, value: tensor([[-0.0495, -0.0052, -0.0832, ..., 0.0246, -0.0670, -0.0113], [ 0.0448, -0.0612, -0.0187, ..., -0.0127, -0.0545, -0.2427], [-0.0044, 0.0336, -0.0062, ..., -0.0959, 0.1103, -0.0389], ..., [ 0.0129, -0.0332, -0.0136, ..., -0.0972, -0.1623, 0.0290], [-0.0214, -0.0612, -0.0182, ..., -0.0866, 0.0511, -0.1512], [-0.1695, -0.0113, -0.0109, ..., -0.0902, -0.1093, -0.0832]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0431e-07, 1.6484e-07, 3.8184e-08], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.4064e-08, 1.3225e-07, 1.1791e-06], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9802e-08, 1.4901e-08, 8.4750e-08], ..., [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 1.2107e-08, 3.8370e-07], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7975e-07, 2.3376e-07, 8.2888e-08], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-08, 3.5390e-08, 3.5577e-07]], device='cuda:0') Epoch 179, bias, value: tensor([ 0.0017, -0.0189, -0.0008, 0.0308, -0.0027, 0.0289, 0.0054, 0.0203, -0.0030, -0.0117], device='cuda:0'), grad: tensor([ 4.4703e-07, 5.6364e-06, 5.6252e-07, 1.8068e-07, -1.1139e-05, 1.8105e-06, -1.9968e-06, 1.2172e-06, 9.3132e-07, 2.3469e-06], device='cuda:0') 100 0.0001 changing lr epoch 178, time 214.37, cls_loss 0.0021 cls_loss_mapping 0.0037 cls_loss_causal 0.5079 re_mapping 0.0054 re_causal 0.0165 /// teacc 98.91 lr 0.00010000 Epoch 180, weight, value: tensor([[-0.0495, -0.0052, -0.0833, ..., 0.0245, -0.0678, -0.0115], [ 0.0449, -0.0614, -0.0187, ..., -0.0131, -0.0547, -0.2432], [-0.0044, 0.0336, -0.0062, ..., -0.0961, 0.1129, -0.0391], ..., [ 0.0129, -0.0327, -0.0139, ..., -0.0980, -0.1650, 0.0289], [-0.0214, -0.0613, -0.0182, ..., -0.0864, 0.0513, -0.1513], [-0.1697, -0.0114, -0.0110, ..., -0.0904, -0.1097, -0.0837]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.9092e-07, 6.3330e-08, 1.8626e-09], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 5.5879e-09, 3.2596e-08, 2.2352e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, -1.2480e-07, 3.7253e-09], ..., [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 6.5193e-08, 1.3039e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6764e-08, 2.5425e-07, 1.1176e-08], [ 2.3283e-08, 0.0000e+00, 0.0000e+00, ..., 5.6811e-08, -1.0459e-06, 1.3039e-08]], device='cuda:0') Epoch 180, bias, value: tensor([ 0.0012, -0.0187, 0.0013, 0.0310, -0.0027, 0.0289, 0.0050, 0.0184, -0.0028, -0.0116], device='cuda:0'), grad: tensor([ 1.4622e-06, -1.3523e-06, 1.0496e-06, -1.7695e-08, 5.2378e-06, 1.7166e-05, 7.3388e-07, 1.2172e-06, 8.9854e-06, -3.4511e-05], device='cuda:0') 100 0.0001 changing lr epoch 179, time 214.02, cls_loss 0.0023 cls_loss_mapping 0.0034 cls_loss_causal 0.5163 re_mapping 0.0055 re_causal 0.0167 /// teacc 99.01 lr 0.00010000 Epoch 181, weight, value: tensor([[-0.0496, -0.0053, -0.0834, ..., 0.0246, -0.0681, -0.0117], [ 0.0447, -0.0615, -0.0187, ..., -0.0136, -0.0552, -0.2466], [-0.0044, 0.0345, -0.0060, ..., -0.0960, 0.1133, -0.0393], ..., [ 0.0129, -0.0333, -0.0136, ..., -0.0985, -0.1650, 0.0302], [-0.0215, -0.0616, -0.0182, ..., -0.0865, 0.0515, -0.1516], [-0.1709, -0.0115, -0.0110, ..., -0.0905, -0.1120, -0.0838]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.6077e-08, 8.3223e-06, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.4214e-08, 2.1979e-07, 1.6764e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-08, 3.1199e-07, 6.5193e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.3819e-09, 9.0338e-08, 1.3039e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.9325e-07, -1.6004e-05, 5.5879e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3842e-07, 5.3830e-06, 3.3528e-08]], device='cuda:0') Epoch 181, bias, value: tensor([ 0.0011, -0.0189, 0.0014, 0.0311, -0.0025, 0.0294, 0.0049, 0.0192, -0.0024, -0.0132], device='cuda:0'), grad: tensor([ 2.6837e-05, -8.0243e-06, 1.9297e-06, 4.0047e-07, -6.4634e-06, 2.9281e-06, 4.2766e-06, 2.0992e-06, -4.3780e-05, 1.9833e-05], device='cuda:0') 100 0.0001 changing lr epoch 180, time 214.11, cls_loss 0.0018 cls_loss_mapping 0.0032 cls_loss_causal 0.5222 re_mapping 0.0053 re_causal 0.0163 /// teacc 98.98 lr 0.00010000 Epoch 182, weight, value: tensor([[-0.0496, -0.0055, -0.0834, ..., 0.0247, -0.0688, -0.0114], [ 0.0445, -0.0615, -0.0187, ..., -0.0148, -0.0558, -0.2468], [-0.0043, 0.0347, -0.0061, ..., -0.0963, 0.1134, -0.0394], ..., [ 0.0129, -0.0335, -0.0136, ..., -0.0989, -0.1651, 0.0302], [-0.0215, -0.0618, -0.0182, ..., -0.0869, 0.0519, -0.1516], [-0.1716, -0.0116, -0.0110, ..., -0.0909, -0.1125, -0.0843]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.0245e-08, 2.0489e-07, 1.8626e-09], [ 1.4901e-08, 0.0000e+00, 0.0000e+00, ..., 1.0245e-08, 2.9802e-07, 2.0489e-08], [ 4.6566e-09, -9.3132e-10, 0.0000e+00, ..., 1.6764e-08, 3.6322e-08, 4.6566e-09], ..., [ 6.5193e-09, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.2536e-06, 8.3819e-09], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 5.3085e-08, 2.8592e-07, 1.8626e-09], [ 6.4261e-08, 0.0000e+00, 0.0000e+00, ..., 1.3039e-08, 2.0210e-07, 7.8231e-08]], device='cuda:0') Epoch 182, bias, value: tensor([ 0.0010, -0.0191, 0.0015, 0.0311, -0.0024, 0.0293, 0.0054, 0.0193, -0.0023, -0.0134], device='cuda:0'), grad: tensor([ 2.7493e-06, 7.4022e-06, 1.1414e-05, -4.6670e-05, -7.5245e-04, 1.9833e-05, 2.2352e-06, 9.1314e-05, 1.1474e-05, 6.5184e-04], device='cuda:0') 100 0.0001 changing lr epoch 181, time 214.73, cls_loss 0.0017 cls_loss_mapping 0.0029 cls_loss_causal 0.4966 re_mapping 0.0055 re_causal 0.0162 /// teacc 98.92 lr 0.00010000 Epoch 183, weight, value: tensor([[-0.0497, -0.0055, -0.0839, ..., 0.0249, -0.0696, -0.0114], [ 0.0445, -0.0616, -0.0188, ..., -0.0153, -0.0563, -0.2469], [-0.0050, 0.0347, -0.0061, ..., -0.0963, 0.1139, -0.0393], ..., [ 0.0137, -0.0335, -0.0147, ..., -0.0989, -0.1653, 0.0301], [-0.0216, -0.0618, -0.0184, ..., -0.0877, 0.0523, -0.1518], [-0.1722, -0.0116, -0.0111, ..., -0.0912, -0.1133, -0.0844]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-08, 1.5181e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.1420e-08, 2.1420e-08, 2.7940e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.0978e-08, 6.1467e-08, 1.8626e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, 8.3819e-09, 1.3039e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.9162e-08, 1.4249e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.1025e-08, -1.5646e-07, 1.3970e-08]], device='cuda:0') Epoch 183, bias, value: tensor([ 0.0005, -0.0192, 0.0017, 0.0312, -0.0023, 0.0288, 0.0060, 0.0194, -0.0022, -0.0137], device='cuda:0'), grad: tensor([ 5.6066e-06, -4.8995e-05, 1.2726e-05, 4.7609e-06, 3.8815e-04, 1.6674e-05, 1.9982e-05, 1.0386e-05, 7.7665e-05, -4.8661e-04], device='cuda:0') 100 0.0001 changing lr epoch 182, time 214.21, cls_loss 0.0017 cls_loss_mapping 0.0040 cls_loss_causal 0.5260 re_mapping 0.0054 re_causal 0.0172 /// teacc 98.93 lr 0.00010000 Epoch 184, weight, value: tensor([[-0.0498, -0.0055, -0.0839, ..., 0.0249, -0.0702, -0.0104], [ 0.0445, -0.0616, -0.0189, ..., -0.0154, -0.0555, -0.2470], [-0.0050, 0.0349, -0.0062, ..., -0.0964, 0.1137, -0.0393], ..., [ 0.0137, -0.0336, -0.0147, ..., -0.0992, -0.1653, 0.0300], [-0.0216, -0.0619, -0.0182, ..., -0.0878, 0.0526, -0.1519], [-0.1723, -0.0116, -0.0111, ..., -0.0917, -0.1147, -0.0849]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 8.1025e-08, 0.0000e+00, ..., -2.4736e-06, -1.1614e-06, 0.0000e+00], [ 0.0000e+00, 1.1502e-07, 0.0000e+00, ..., 6.1467e-08, 3.0920e-07, 4.6566e-10], [ 0.0000e+00, -5.6997e-06, 0.0000e+00, ..., 3.6601e-07, -8.7023e-06, 4.6566e-10], ..., [ 0.0000e+00, 2.6263e-06, 0.0000e+00, ..., 1.1176e-07, 4.0904e-06, 9.3132e-10], [ 0.0000e+00, 1.0850e-07, 0.0000e+00, ..., 1.5907e-06, 5.7295e-06, 4.6566e-10], [ 0.0000e+00, 1.5367e-08, 0.0000e+00, ..., 1.6019e-07, 3.3621e-07, 4.6566e-10]], device='cuda:0') Epoch 184, bias, value: tensor([ 0.0004, -0.0184, 0.0012, 0.0321, -0.0021, 0.0289, 0.0059, 0.0193, -0.0023, -0.0140], device='cuda:0'), grad: tensor([-7.1377e-06, 7.4459e-07, -2.8968e-05, 1.8790e-05, 7.7719e-07, -1.3612e-05, 1.5516e-06, 1.3568e-05, 1.3806e-05, 5.0757e-07], device='cuda:0') 100 0.0001 changing lr epoch 183, time 214.68, cls_loss 0.0019 cls_loss_mapping 0.0034 cls_loss_causal 0.5428 re_mapping 0.0052 re_causal 0.0168 /// teacc 98.99 lr 0.00010000 Epoch 185, weight, value: tensor([[-0.0499, -0.0055, -0.0840, ..., 0.0245, -0.0711, -0.0101], [ 0.0434, -0.0616, -0.0189, ..., -0.0157, -0.0558, -0.2471], [-0.0056, 0.0352, -0.0063, ..., -0.0972, 0.1137, -0.0394], ..., [ 0.0141, -0.0338, -0.0148, ..., -0.0997, -0.1653, 0.0300], [-0.0204, -0.0620, -0.0182, ..., -0.0899, 0.0522, -0.1520], [-0.1727, -0.0116, -0.0111, ..., -0.0917, -0.1152, -0.0848]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.3446e-07, 5.1688e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9558e-08, 3.1991e-07, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.2399e-08, 5.1223e-09, 9.3132e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, 1.2852e-07, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.6077e-08, -1.5264e-06, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2154e-07, 1.4296e-07, 8.8476e-09]], device='cuda:0') Epoch 185, bias, value: tensor([ 5.3367e-05, -1.9743e-02, 9.6193e-04, 3.1588e-02, -2.1789e-03, 2.9116e-02, 7.1058e-03, 2.0214e-02, -2.5643e-03, -1.3988e-02], device='cuda:0'), grad: tensor([-3.1153e-07, 3.6247e-06, 5.5647e-07, 8.0839e-06, 3.6135e-07, 2.7157e-06, 1.0002e-06, 1.5553e-06, -1.9670e-05, 2.0768e-06], device='cuda:0') 100 0.0001 changing lr epoch 184, time 214.42, cls_loss 0.0017 cls_loss_mapping 0.0038 cls_loss_causal 0.5424 re_mapping 0.0055 re_causal 0.0167 /// teacc 98.97 lr 0.00010000 Epoch 186, weight, value: tensor([[-0.0499, -0.0055, -0.0840, ..., 0.0250, -0.0714, -0.0102], [ 0.0432, -0.0616, -0.0189, ..., -0.0156, -0.0559, -0.2473], [-0.0051, 0.0353, -0.0063, ..., -0.0973, 0.1139, -0.0389], ..., [ 0.0140, -0.0339, -0.0149, ..., -0.1002, -0.1655, 0.0294], [-0.0206, -0.0621, -0.0183, ..., -0.0900, 0.0519, -0.1527], [-0.1730, -0.0116, -0.0111, ..., -0.0919, -0.1147, -0.0850]], device='cuda:0'), grad: tensor([[ 1.3085e-07, 0.0000e+00, 0.0000e+00, ..., -3.9954e-07, 9.3132e-08, 4.5262e-07], [ 1.2293e-07, 0.0000e+00, 0.0000e+00, ..., 3.5390e-08, 5.3830e-07, 4.9360e-07], [ 2.9337e-07, 0.0000e+00, 0.0000e+00, ..., 2.4680e-08, -6.1877e-06, 4.6939e-07], ..., [ 1.2107e-07, 0.0000e+00, 0.0000e+00, ..., 1.4901e-08, 4.5970e-06, 8.4145e-07], [ 4.3306e-08, 0.0000e+00, 0.0000e+00, ..., 1.0477e-07, 2.4401e-07, 2.2771e-07], [ 3.2131e-08, 0.0000e+00, 0.0000e+00, ..., 1.0105e-07, 7.6834e-08, 4.8289e-07]], device='cuda:0') Epoch 186, bias, value: tensor([ 0.0002, -0.0193, 0.0010, 0.0314, -0.0020, 0.0294, 0.0071, 0.0200, -0.0033, -0.0138], device='cuda:0'), grad: tensor([ 5.8394e-07, -2.8964e-06, -2.4050e-05, -1.0297e-05, -1.6555e-05, 4.3809e-06, 1.5683e-06, 2.7940e-05, 6.9290e-06, 1.2331e-05], device='cuda:0') 100 0.0001 changing lr epoch 185, time 214.33, cls_loss 0.0018 cls_loss_mapping 0.0032 cls_loss_causal 0.5018 re_mapping 0.0056 re_causal 0.0164 /// teacc 98.95 lr 0.00010000 Epoch 187, weight, value: tensor([[-0.0502, -0.0055, -0.0840, ..., 0.0254, -0.0717, -0.0073], [ 0.0439, -0.0617, -0.0189, ..., -0.0161, -0.0563, -0.2475], [-0.0052, 0.0353, -0.0063, ..., -0.0972, 0.1141, -0.0395], ..., [ 0.0138, -0.0339, -0.0149, ..., -0.1002, -0.1656, 0.0298], [-0.0206, -0.0622, -0.0183, ..., -0.0918, 0.0512, -0.1525], [-0.1735, -0.0116, -0.0111, ..., -0.0919, -0.1152, -0.0854]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.8394e-07, -4.2841e-08, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0245e-08, 6.6124e-08, 3.2596e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.9849e-09, -5.8534e-07, 9.3132e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 7.4971e-08, 3.2596e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.0268e-08, -8.9360e-07, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5553e-07, 1.1874e-06, 1.0757e-07]], device='cuda:0') Epoch 187, bias, value: tensor([ 0.0005, -0.0194, 0.0011, 0.0314, -0.0019, 0.0291, 0.0075, 0.0196, -0.0032, -0.0136], device='cuda:0'), grad: tensor([-1.1763e-06, 3.2503e-07, -8.9826e-07, 4.8289e-07, 8.6008e-07, 4.0559e-07, 4.9500e-07, -3.3304e-06, -1.2498e-06, 4.0792e-06], device='cuda:0') 100 0.0001 changing lr epoch 186, time 214.29, cls_loss 0.0016 cls_loss_mapping 0.0031 cls_loss_causal 0.5159 re_mapping 0.0055 re_causal 0.0167 /// teacc 98.90 lr 0.00010000 Epoch 188, weight, value: tensor([[-0.0502, -0.0055, -0.0840, ..., 0.0255, -0.0725, -0.0074], [ 0.0442, -0.0617, -0.0189, ..., -0.0165, -0.0573, -0.2477], [-0.0052, 0.0354, -0.0063, ..., -0.0977, 0.1148, -0.0394], ..., [ 0.0137, -0.0339, -0.0149, ..., -0.1013, -0.1657, 0.0299], [-0.0206, -0.0622, -0.0183, ..., -0.0914, 0.0519, -0.1525], [-0.1738, -0.0116, -0.0111, ..., -0.0931, -0.1163, -0.0855]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.3160e-06, -3.6061e-06, 1.3039e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3877e-07, 2.3982e-07, 4.9826e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.1572e-07, 1.1353e-06, 2.4214e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.8429e-08, 1.0710e-07, 5.4017e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.1609e-06, 3.1352e-05, 9.6392e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9281e-06, 2.8200e-06, -7.4506e-09]], device='cuda:0') Epoch 188, bias, value: tensor([ 0.0003, -0.0198, 0.0016, 0.0313, -0.0018, 0.0292, 0.0067, 0.0195, -0.0026, -0.0138], device='cuda:0'), grad: tensor([-1.1230e-04, 2.6762e-05, 5.9828e-06, 2.2560e-05, -6.9916e-05, -4.4316e-05, -1.9357e-05, 8.3596e-06, 9.6500e-05, 8.5652e-05], device='cuda:0') 100 0.0001 changing lr epoch 187, time 214.09, cls_loss 0.0020 cls_loss_mapping 0.0033 cls_loss_causal 0.5275 re_mapping 0.0057 re_causal 0.0169 /// teacc 98.85 lr 0.00010000 Epoch 189, weight, value: tensor([[-0.0503, -0.0056, -0.0840, ..., 0.0242, -0.0751, -0.0074], [ 0.0442, -0.0617, -0.0189, ..., -0.0172, -0.0577, -0.2478], [-0.0050, 0.0356, -0.0063, ..., -0.0982, 0.1149, -0.0389], ..., [ 0.0137, -0.0340, -0.0149, ..., -0.1017, -0.1658, 0.0306], [-0.0209, -0.0625, -0.0183, ..., -0.0919, 0.0515, -0.1532], [-0.1739, -0.0120, -0.0111, ..., -0.0930, -0.1169, -0.0862]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -6.8452e-08, 1.8487e-07, 1.5832e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 1.2919e-05, 6.3796e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, -5.1945e-05, 8.0559e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 3.3993e-07, 2.3749e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.0536e-09, 3.1805e-07, 1.2573e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.8476e-09, 6.7055e-08, 4.0978e-08]], device='cuda:0') Epoch 189, bias, value: tensor([-0.0016, -0.0199, 0.0016, 0.0322, -0.0012, 0.0294, 0.0069, 0.0197, -0.0031, -0.0144], device='cuda:0'), grad: tensor([ 2.5565e-07, 2.0206e-05, -7.1406e-05, -1.6332e-05, 4.3176e-06, 8.3372e-06, 5.3525e-05, 1.9446e-06, 9.3179e-07, -1.6708e-06], device='cuda:0') 100 0.0001 changing lr epoch 188, time 214.19, cls_loss 0.0017 cls_loss_mapping 0.0028 cls_loss_causal 0.5340 re_mapping 0.0056 re_causal 0.0171 /// teacc 99.07 lr 0.00010000 Epoch 190, weight, value: tensor([[-0.0503, -0.0057, -0.0844, ..., 0.0250, -0.0749, -0.0080], [ 0.0443, -0.0617, -0.0190, ..., -0.0176, -0.0584, -0.2480], [-0.0050, 0.0356, -0.0070, ..., -0.0983, 0.1155, -0.0390], ..., [ 0.0136, -0.0339, -0.0161, ..., -0.1023, -0.1662, 0.0283], [-0.0209, -0.0626, -0.0193, ..., -0.0920, 0.0514, -0.1533], [-0.1740, -0.0120, -0.0113, ..., -0.0932, -0.1173, -0.0864]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.7807e-06, -4.8429e-08, 2.3283e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.5996e-07, -2.3376e-07, 6.0536e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1548e-07, 6.0769e-07, 3.3993e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.3819e-09, -3.9814e-07, 2.7940e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1781e-07, -5.5227e-07, 5.1223e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.0559e-07, 3.1153e-07, 3.2596e-09]], device='cuda:0') Epoch 190, bias, value: tensor([-0.0015, -0.0197, 0.0019, 0.0326, -0.0010, 0.0298, 0.0065, 0.0189, -0.0034, -0.0142], device='cuda:0'), grad: tensor([-4.8205e-06, 1.7229e-07, 7.1637e-06, -2.2221e-06, 6.7288e-07, 1.6540e-06, 1.1064e-06, -6.5565e-06, 5.8999e-07, 2.2240e-06], device='cuda:0') 100 0.0001 changing lr epoch 189, time 214.16, cls_loss 0.0015 cls_loss_mapping 0.0030 cls_loss_causal 0.5309 re_mapping 0.0055 re_causal 0.0172 /// teacc 98.94 lr 0.00010000 Epoch 191, weight, value: tensor([[-0.0505, -0.0057, -0.0846, ..., 0.0243, -0.0760, -0.0084], [ 0.0445, -0.0618, -0.0190, ..., -0.0177, -0.0585, -0.2487], [-0.0050, 0.0355, -0.0073, ..., -0.0986, 0.1156, -0.0392], ..., [ 0.0136, -0.0335, -0.0165, ..., -0.1025, -0.1662, 0.0283], [-0.0210, -0.0629, -0.0196, ..., -0.0920, 0.0516, -0.1534], [-0.1741, -0.0124, -0.0114, ..., -0.0934, -0.1181, -0.0863]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -7.8697e-08, 5.6811e-08, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.3819e-09, 4.1444e-08, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.1223e-09, -1.4603e-06, 4.1910e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.2596e-09, 5.7928e-07, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9337e-08, 7.0548e-07, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4435e-08, 3.0873e-07, 2.4680e-08]], device='cuda:0') Epoch 191, bias, value: tensor([-0.0021, -0.0196, 0.0019, 0.0323, -0.0011, 0.0299, 0.0068, 0.0199, -0.0033, -0.0151], device='cuda:0'), grad: tensor([-5.2620e-08, -8.6380e-07, -4.6119e-06, 1.3597e-06, 3.4971e-07, -1.6969e-06, 6.3144e-07, 2.0564e-06, 1.4892e-06, 1.3299e-06], device='cuda:0') 100 0.0001 changing lr epoch 190, time 214.21, cls_loss 0.0016 cls_loss_mapping 0.0035 cls_loss_causal 0.5182 re_mapping 0.0051 re_causal 0.0162 /// teacc 99.08 lr 0.00010000 Epoch 192, weight, value: tensor([[-0.0509, -0.0056, -0.0846, ..., 0.0255, -0.0758, -0.0084], [ 0.0445, -0.0618, -0.0191, ..., -0.0176, -0.0587, -0.2489], [-0.0050, 0.0355, -0.0068, ..., -0.0988, 0.1158, -0.0398], ..., [ 0.0137, -0.0336, -0.0140, ..., -0.1032, -0.1662, 0.0286], [-0.0210, -0.0630, -0.0195, ..., -0.0930, 0.0509, -0.1536], [-0.1743, -0.0126, -0.0114, ..., -0.0939, -0.1194, -0.0865]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 3.4925e-08, -9.4855e-07, 6.3051e-07], [ 0.0000e+00, 0.0000e+00, 2.1420e-08, ..., 2.0722e-07, 1.8161e-07, 2.1048e-07], [ 0.0000e+00, 0.0000e+00, -4.6100e-08, ..., 8.8476e-08, -5.4250e-07, 4.8382e-07], ..., [ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 4.0513e-08, 2.2259e-07, 2.0210e-07], [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 6.4541e-07, 5.8766e-07, 4.1910e-08], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 2.8079e-07, 1.6158e-07, 4.4098e-07]], device='cuda:0') Epoch 192, bias, value: tensor([-0.0012, -0.0187, 0.0018, 0.0316, -0.0008, 0.0299, 0.0074, 0.0198, -0.0039, -0.0157], device='cuda:0'), grad: tensor([ 1.1697e-06, 1.1493e-06, 3.9376e-06, 1.0170e-06, -1.5609e-06, 8.0094e-08, -5.7817e-06, -4.2915e-06, 2.7604e-06, 1.4752e-06], device='cuda:0') 100 0.0001 changing lr epoch 191, time 214.45, cls_loss 0.0014 cls_loss_mapping 0.0026 cls_loss_causal 0.4971 re_mapping 0.0050 re_causal 0.0157 /// teacc 98.92 lr 0.00010000 Epoch 193, weight, value: tensor([[-0.0510, -0.0056, -0.0846, ..., 0.0255, -0.0758, -0.0086], [ 0.0446, -0.0618, -0.0191, ..., -0.0175, -0.0586, -0.2491], [-0.0050, 0.0355, -0.0068, ..., -0.0990, 0.1159, -0.0412], ..., [ 0.0137, -0.0336, -0.0139, ..., -0.1036, -0.1663, 0.0285], [-0.0210, -0.0630, -0.0195, ..., -0.0933, 0.0509, -0.1537], [-0.1745, -0.0126, -0.0114, ..., -0.0929, -0.1199, -0.0868]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.3271e-07, -1.9558e-08, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, -7.1153e-07, 2.3283e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.3819e-09, 2.6915e-07, 2.7940e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 3.7253e-08, 4.1910e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7695e-08, 1.6950e-07, 6.0536e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-09, 9.3598e-08, -4.5169e-08]], device='cuda:0') Epoch 193, bias, value: tensor([-0.0019, -0.0180, 0.0017, 0.0315, -0.0007, 0.0301, 0.0072, 0.0197, -0.0041, -0.0157], device='cuda:0'), grad: tensor([-9.4529e-08, -5.2676e-06, 4.8354e-06, 1.8459e-06, 6.4820e-06, -6.5193e-07, 1.7388e-06, 1.2569e-05, 1.1839e-05, -3.3289e-05], device='cuda:0') 100 0.0001 changing lr epoch 192, time 214.55, cls_loss 0.0016 cls_loss_mapping 0.0035 cls_loss_causal 0.5178 re_mapping 0.0051 re_causal 0.0162 /// teacc 99.12 lr 0.00010000 Epoch 194, weight, value: tensor([[-0.0510, -0.0057, -0.0846, ..., 0.0253, -0.0764, -0.0088], [ 0.0447, -0.0618, -0.0189, ..., -0.0176, -0.0587, -0.2492], [-0.0051, 0.0355, -0.0068, ..., -0.0992, 0.1161, -0.0414], ..., [ 0.0137, -0.0336, -0.0139, ..., -0.1039, -0.1664, 0.0289], [-0.0210, -0.0631, -0.0194, ..., -0.0936, 0.0509, -0.1536], [-0.1745, -0.0126, -0.0114, ..., -0.0929, -0.1209, -0.0865]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.6699e-06, 5.5879e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6298e-07, 1.1036e-07, 4.6566e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1036e-07, 1.5832e-08, 1.3970e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9092e-08, 7.1246e-08, -1.0710e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2340e-07, -2.6543e-07, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8720e-07, 8.0327e-07, 5.6345e-08]], device='cuda:0') Epoch 194, bias, value: tensor([-0.0022, -0.0177, 0.0015, 0.0334, -0.0015, 0.0301, 0.0074, 0.0190, -0.0044, -0.0150], device='cuda:0'), grad: tensor([-5.7369e-06, 2.3078e-06, 4.2357e-06, 1.6138e-05, 8.4285e-07, -3.8743e-06, 3.4887e-06, -2.2978e-05, 5.1223e-07, 5.0366e-06], device='cuda:0') 100 0.0001 changing lr epoch 193, time 214.23, cls_loss 0.0024 cls_loss_mapping 0.0040 cls_loss_causal 0.5253 re_mapping 0.0059 re_causal 0.0162 /// teacc 99.01 lr 0.00010000 Epoch 195, weight, value: tensor([[-0.0512, -0.0057, -0.0847, ..., 0.0255, -0.0770, -0.0094], [ 0.0446, -0.0619, -0.0189, ..., -0.0179, -0.0588, -0.2494], [-0.0052, 0.0355, -0.0069, ..., -0.1008, 0.1168, -0.0408], ..., [ 0.0137, -0.0336, -0.0139, ..., -0.1044, -0.1667, 0.0304], [-0.0210, -0.0631, -0.0195, ..., -0.0941, 0.0507, -0.1537], [-0.1750, -0.0126, -0.0115, ..., -0.0932, -0.1216, -0.0874]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.4680e-08, 3.6322e-08, 2.3283e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.9849e-09, 9.0338e-08, 1.4901e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6298e-08, -1.1791e-06, 1.2573e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 1.2117e-06, 1.6764e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.0781e-08, 2.6263e-07, 6.5193e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.9162e-09, 1.1176e-07, 2.7940e-07]], device='cuda:0') Epoch 195, bias, value: tensor([-0.0024, -0.0158, 0.0038, 0.0327, -0.0017, 0.0297, 0.0084, 0.0166, -0.0030, -0.0162], device='cuda:0'), grad: tensor([ 1.0151e-07, 7.6322e-07, -2.5295e-06, -1.2696e-05, 8.5756e-06, -4.2096e-06, -5.3411e-07, 1.0230e-05, 1.0163e-05, -9.8422e-06], device='cuda:0') 100 0.0001 changing lr epoch 194, time 214.26, cls_loss 0.0016 cls_loss_mapping 0.0027 cls_loss_causal 0.4979 re_mapping 0.0056 re_causal 0.0163 /// teacc 98.98 lr 0.00010000 Epoch 196, weight, value: tensor([[-0.0512, -0.0057, -0.0847, ..., 0.0254, -0.0776, -0.0095], [ 0.0447, -0.0619, -0.0189, ..., -0.0180, -0.0597, -0.2498], [-0.0053, 0.0355, -0.0070, ..., -0.1013, 0.1175, -0.0414], ..., [ 0.0137, -0.0336, -0.0140, ..., -0.1065, -0.1669, 0.0304], [-0.0211, -0.0631, -0.0195, ..., -0.0943, 0.0505, -0.1538], [-0.1751, -0.0126, -0.0115, ..., -0.0931, -0.1223, -0.0873]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., -5.6438e-06, -1.8859e-07, 3.4459e-08], [ 2.8405e-08, 0.0000e+00, 0.0000e+00, ..., 1.9139e-07, 4.2329e-07, 2.0023e-08], [ 6.2399e-08, 0.0000e+00, 0.0000e+00, ..., 3.3202e-07, -6.2631e-07, 7.8231e-08], ..., [-1.7136e-07, 0.0000e+00, 0.0000e+00, ..., 2.2817e-08, 2.2817e-07, 9.7789e-09], [ 2.5611e-08, 0.0000e+00, 0.0000e+00, ..., 1.5311e-06, 2.8610e-05, 9.3132e-09], [ 1.3504e-08, 0.0000e+00, 0.0000e+00, ..., 1.7229e-07, 1.5264e-06, 2.7241e-07]], device='cuda:0') Epoch 196, bias, value: tensor([-0.0028, -0.0162, 0.0039, 0.0327, -0.0020, 0.0299, 0.0088, 0.0167, -0.0028, -0.0161], device='cuda:0'), grad: tensor([-2.2531e-05, 8.2562e-07, 5.9456e-06, -1.3039e-07, 2.7940e-09, -1.3387e-04, 9.4295e-05, -7.4226e-07, 5.1260e-05, 4.7088e-06], device='cuda:0') 100 0.0001 changing lr epoch 195, time 214.49, cls_loss 0.0020 cls_loss_mapping 0.0030 cls_loss_causal 0.5079 re_mapping 0.0052 re_causal 0.0158 /// teacc 99.02 lr 0.00010000 Epoch 197, weight, value: tensor([[-0.0515, -0.0057, -0.0847, ..., 0.0260, -0.0773, -0.0097], [ 0.0446, -0.0619, -0.0189, ..., -0.0183, -0.0566, -0.2505], [-0.0054, 0.0355, -0.0070, ..., -0.1016, 0.1172, -0.0420], ..., [ 0.0139, -0.0336, -0.0140, ..., -0.1069, -0.1673, 0.0303], [-0.0212, -0.0631, -0.0195, ..., -0.0955, 0.0472, -0.1552], [-0.1757, -0.0126, -0.0115, ..., -0.0935, -0.1230, -0.0870]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8161e-08, 4.1910e-09, 6.5193e-09], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 1.0757e-07, 1.9558e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 1.5274e-07, 1.9558e-08], ..., [-1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-08, 3.2596e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, -2.1309e-06, 1.8626e-09], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, 1.3057e-06, 4.2608e-07]], device='cuda:0') Epoch 197, bias, value: tensor([-0.0022, -0.0134, 0.0031, 0.0321, -0.0022, 0.0301, 0.0093, 0.0165, -0.0053, -0.0158], device='cuda:0'), grad: tensor([ 2.8405e-08, 6.5565e-07, 7.3295e-07, 1.5898e-06, -1.8580e-06, 2.0256e-07, 4.2329e-07, -1.6652e-06, -6.2101e-06, 6.1020e-06], device='cuda:0') 100 0.0001 changing lr epoch 196, time 214.14, cls_loss 0.0017 cls_loss_mapping 0.0033 cls_loss_causal 0.5284 re_mapping 0.0052 re_causal 0.0158 /// teacc 99.00 lr 0.00010000 Epoch 198, weight, value: tensor([[-0.0518, -0.0057, -0.0848, ..., 0.0269, -0.0764, -0.0097], [ 0.0446, -0.0619, -0.0189, ..., -0.0184, -0.0570, -0.2511], [-0.0069, 0.0355, -0.0079, ..., -0.1020, 0.1178, -0.0423], ..., [ 0.0154, -0.0336, -0.0141, ..., -0.1076, -0.1674, 0.0298], [-0.0215, -0.0632, -0.0200, ..., -0.0969, 0.0461, -0.1554], [-0.1764, -0.0126, -0.0116, ..., -0.0935, -0.1237, -0.0871]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.4296e-07, 9.5461e-07, 0.0000e+00], [ 4.1910e-09, 0.0000e+00, 0.0000e+00, ..., 1.2107e-08, 1.9521e-06, 5.1223e-09], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 8.8476e-09, 6.8694e-06, 1.3970e-09], ..., [-1.7695e-08, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 4.7311e-06, 9.3132e-09], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 4.7963e-08, 2.7999e-05, 2.7940e-09], [ 8.3819e-09, 0.0000e+00, 0.0000e+00, ..., 2.5146e-08, 1.6475e-06, 1.4855e-07]], device='cuda:0') Epoch 198, bias, value: tensor([-0.0016, -0.0136, 0.0033, 0.0319, -0.0020, 0.0307, 0.0094, 0.0164, -0.0063, -0.0157], device='cuda:0'), grad: tensor([ 7.3351e-06, 1.5587e-05, 6.6102e-05, -4.1747e-04, 1.6298e-06, 4.2975e-05, -7.4506e-08, 3.6418e-05, 2.3377e-04, 1.3731e-05], device='cuda:0') 100 0.0001 changing lr epoch 197, time 214.23, cls_loss 0.0013 cls_loss_mapping 0.0022 cls_loss_causal 0.5238 re_mapping 0.0050 re_causal 0.0161 /// teacc 99.06 lr 0.00010000 Epoch 199, weight, value: tensor([[-0.0521, -0.0057, -0.0851, ..., 0.0275, -0.0762, -0.0097], [ 0.0437, -0.0619, -0.0190, ..., -0.0187, -0.0573, -0.2513], [-0.0073, 0.0355, -0.0090, ..., -0.1024, 0.1182, -0.0425], ..., [ 0.0158, -0.0336, -0.0139, ..., -0.1084, -0.1676, 0.0298], [-0.0210, -0.0632, -0.0205, ..., -0.0971, 0.0459, -0.1555], [-0.1786, -0.0126, -0.0117, ..., -0.0927, -0.1241, -0.0875]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., -3.5856e-08, 5.2713e-07, 1.8626e-09], [ 3.3993e-08, 0.0000e+00, 0.0000e+00, ..., 8.3353e-08, -5.1968e-06, 6.5193e-09], [ 6.0536e-09, 0.0000e+00, 0.0000e+00, ..., 3.8184e-08, 6.5602e-06, 2.1420e-08], ..., [-5.4948e-08, 0.0000e+00, 0.0000e+00, ..., 1.5832e-08, 1.3765e-06, 6.0536e-09], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., -2.3171e-06, -1.8179e-05, 2.3283e-09], [ 1.8626e-08, 0.0000e+00, 0.0000e+00, ..., 3.4412e-07, 9.1344e-06, 1.7229e-08]], device='cuda:0') Epoch 199, bias, value: tensor([-0.0012, -0.0135, 0.0036, 0.0321, -0.0017, 0.0308, 0.0089, 0.0161, -0.0067, -0.0154], device='cuda:0'), grad: tensor([ 1.5311e-06, -5.4538e-05, 4.8608e-05, 3.6787e-06, 1.4231e-06, 2.5958e-05, -9.2015e-06, -9.1910e-05, -4.8369e-05, 1.2290e-04], device='cuda:0') 100 0.0001 changing lr epoch 198, time 214.39, cls_loss 0.0017 cls_loss_mapping 0.0024 cls_loss_causal 0.5140 re_mapping 0.0051 re_causal 0.0154 /// teacc 99.02 lr 0.00010000 Epoch 200, weight, value: tensor([[-0.0521, -0.0057, -0.0851, ..., 0.0261, -0.0767, -0.0097], [ 0.0437, -0.0619, -0.0190, ..., -0.0191, -0.0574, -0.2514], [-0.0072, 0.0356, -0.0090, ..., -0.1023, 0.1185, -0.0426], ..., [ 0.0158, -0.0336, -0.0139, ..., -0.1093, -0.1681, 0.0299], [-0.0210, -0.0632, -0.0206, ..., -0.0972, 0.0463, -0.1555], [-0.1791, -0.0127, -0.0117, ..., -0.0904, -0.1253, -0.0877]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -3.5837e-06, -2.1309e-06, 4.6566e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, 4.1444e-08, 1.0710e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.8476e-09, -5.6345e-08, -2.4214e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 3.1199e-08, 2.8871e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9558e-08, -1.4249e-07, 9.7789e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.9581e-08, 2.1001e-07, -2.5705e-07]], device='cuda:0') Epoch 200, bias, value: tensor([-0.0031, -0.0135, 0.0037, 0.0323, -0.0016, 0.0303, 0.0092, 0.0159, -0.0066, -0.0152], device='cuda:0'), grad: tensor([-7.3612e-06, 1.2014e-06, 2.4447e-07, 1.2629e-05, 3.6955e-06, -7.4431e-06, 7.6666e-06, 9.7603e-07, 8.2841e-07, -1.2450e-05], device='cuda:0') 100 0.0001 changing lr epoch 199, time 214.33, cls_loss 0.0019 cls_loss_mapping 0.0029 cls_loss_causal 0.5327 re_mapping 0.0053 re_causal 0.0158 /// teacc 98.92 lr 0.00010000 Epoch 201, weight, value: tensor([[-0.0522, -0.0057, -0.0853, ..., 0.0266, -0.0767, -0.0098], [ 0.0416, -0.0639, -0.0193, ..., -0.0193, -0.0575, -0.2526], [-0.0074, 0.0347, -0.0104, ..., -0.1031, 0.1189, -0.0431], ..., [ 0.0173, -0.0310, -0.0131, ..., -0.1097, -0.1686, 0.0306], [-0.0210, -0.0640, -0.0212, ..., -0.0973, 0.0462, -0.1561], [-0.1801, -0.0129, -0.0119, ..., -0.0905, -0.1258, -0.0863]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.0338e-07, 2.5611e-08, 1.3970e-09], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 2.2678e-07, 1.1921e-07], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-09, -2.2240e-06, 7.9162e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 5.0850e-07, 2.6077e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8161e-08, 9.5367e-07, 1.4435e-08], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 1.4901e-08, 1.0263e-06]], device='cuda:0') Epoch 201, bias, value: tensor([-0.0025, -0.0137, 0.0038, 0.0317, -0.0025, 0.0300, 0.0085, 0.0162, -0.0065, -0.0142], device='cuda:0'), grad: tensor([ 2.5379e-07, -2.6375e-06, 1.0990e-06, 2.2464e-06, -4.5113e-06, 3.0790e-06, 4.8522e-07, -1.0572e-05, 2.9672e-06, 7.5698e-06], device='cuda:0') 100 0.0001 changing lr epoch 200, time 214.29, cls_loss 0.0016 cls_loss_mapping 0.0028 cls_loss_causal 0.4973 re_mapping 0.0048 re_causal 0.0151 /// teacc 98.99 lr 0.00010000 Epoch 202, weight, value: tensor([[-0.0522, -0.0057, -0.0856, ..., 0.0265, -0.0778, -0.0099], [ 0.0415, -0.0646, -0.0194, ..., -0.0194, -0.0575, -0.2531], [-0.0073, 0.0345, -0.0131, ..., -0.1032, 0.1190, -0.0431], ..., [ 0.0173, -0.0303, -0.0103, ..., -0.1100, -0.1687, 0.0305], [-0.0211, -0.0645, -0.0214, ..., -0.0976, 0.0462, -0.1564], [-0.1803, -0.0128, -0.0120, ..., -0.0905, -0.1262, -0.0864]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.9977e-06, -2.4736e-06, 4.1910e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4901e-07, 2.0936e-06, 6.3796e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7136e-07, -2.1517e-05, 1.6298e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1316e-07, 1.4171e-05, 6.5193e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.9525e-06, 2.4773e-06, 8.3819e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9663e-07, 5.9977e-07, 6.3004e-07]], device='cuda:0') Epoch 202, bias, value: tensor([-0.0029, -0.0137, 0.0034, 0.0317, -0.0022, 0.0296, 0.0086, 0.0168, -0.0066, -0.0144], device='cuda:0'), grad: tensor([-2.5094e-05, 1.0140e-05, -7.2598e-05, 1.3486e-05, -1.0459e-06, 1.7062e-06, 4.7311e-06, 3.8832e-05, 2.0131e-05, 9.7454e-06], device='cuda:0') 100 0.0001 changing lr epoch 201, time 214.21, cls_loss 0.0018 cls_loss_mapping 0.0044 cls_loss_causal 0.5354 re_mapping 0.0052 re_causal 0.0164 /// teacc 98.98 lr 0.00010000 Epoch 203, weight, value: tensor([[-0.0525, -0.0058, -0.0865, ..., 0.0268, -0.0778, -0.0099], [ 0.0436, -0.0646, -0.0197, ..., -0.0196, -0.0574, -0.2534], [-0.0068, 0.0346, -0.0141, ..., -0.1031, 0.1190, -0.0436], ..., [ 0.0161, -0.0303, -0.0103, ..., -0.1110, -0.1689, 0.0322], [-0.0216, -0.0647, -0.0230, ..., -0.0978, 0.0468, -0.1554], [-0.1821, -0.0127, -0.0125, ..., -0.0906, -0.1266, -0.0862]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 2.2352e-08, ..., -5.5879e-09, 7.3481e-07, 2.4214e-08], [ 3.7253e-09, 0.0000e+00, 8.6613e-08, ..., 2.7940e-09, -7.7579e-07, 1.0617e-07], [ 6.5193e-09, 0.0000e+00, 5.4948e-08, ..., 3.7253e-09, -1.7239e-06, 3.1665e-08], ..., [ 9.3132e-10, 0.0000e+00, -1.4249e-06, ..., 0.0000e+00, 2.1234e-07, -1.2051e-06], [ 4.6566e-09, 0.0000e+00, 3.0734e-08, ..., 4.6566e-09, 1.1036e-06, 2.9802e-08], [ 9.3132e-09, 0.0000e+00, 8.8755e-07, ..., 3.7253e-09, 1.8906e-07, 7.8045e-07]], device='cuda:0') Epoch 203, bias, value: tensor([-0.0025, -0.0132, 0.0031, 0.0308, -0.0027, 0.0303, 0.0079, 0.0169, -0.0065, -0.0140], device='cuda:0'), grad: tensor([ 1.6103e-06, -2.3153e-06, -2.9895e-07, 1.0375e-06, 3.6471e-06, 1.9297e-06, 1.5646e-07, -1.4707e-05, 2.8126e-06, 6.0722e-06], device='cuda:0') 100 0.0001 changing lr epoch 202, time 214.48, cls_loss 0.0019 cls_loss_mapping 0.0033 cls_loss_causal 0.5163 re_mapping 0.0053 re_causal 0.0159 /// teacc 99.07 lr 0.00010000 Epoch 204, weight, value: tensor([[-0.0533, -0.0063, -0.0866, ..., 0.0246, -0.0781, -0.0101], [ 0.0400, -0.0647, -0.0198, ..., -0.0220, -0.0587, -0.2568], [-0.0075, 0.0351, -0.0142, ..., -0.1037, 0.1197, -0.0412], ..., [ 0.0186, -0.0304, -0.0105, ..., -0.1122, -0.1691, 0.0322], [-0.0238, -0.0651, -0.0230, ..., -0.0980, 0.0468, -0.1562], [-0.1862, -0.0127, -0.0129, ..., -0.0878, -0.1265, -0.0857]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., -5.5879e-09, 1.0245e-08, 9.3132e-10], [-1.9558e-08, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.3039e-08, 5.5879e-09], [-6.2399e-08, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 4.1816e-07, 4.6566e-09], ..., [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -6.1002e-07, 2.7940e-09], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-08, -6.9849e-08, 0.0000e+00], [ 4.6566e-09, 0.0000e+00, 0.0000e+00, ..., 6.5193e-09, 8.6613e-08, 6.7987e-07]], device='cuda:0') Epoch 204, bias, value: tensor([-0.0050, -0.0143, 0.0036, 0.0303, -0.0030, 0.0320, 0.0066, 0.0172, -0.0068, -0.0128], device='cuda:0'), grad: tensor([ 5.7742e-08, 2.2259e-07, 4.0203e-05, 1.1921e-07, -9.8255e-07, 1.4622e-07, 9.0338e-08, -4.2737e-05, 1.0915e-06, 1.8431e-06], device='cuda:0') 100 0.0001 changing lr epoch 203, time 214.48, cls_loss 0.0018 cls_loss_mapping 0.0024 cls_loss_causal 0.4798 re_mapping 0.0053 re_causal 0.0150 /// teacc 99.09 lr 0.00010000 Epoch 205, weight, value: tensor([[-0.0536, -0.0082, -0.0872, ..., 0.0250, -0.0782, -0.0098], [ 0.0400, -0.0648, -0.0198, ..., -0.0226, -0.0587, -0.2571], [-0.0082, 0.0363, -0.0146, ..., -0.1041, 0.1198, -0.0416], ..., [ 0.0195, -0.0304, -0.0106, ..., -0.1125, -0.1694, 0.0326], [-0.0236, -0.0662, -0.0235, ..., -0.0981, 0.0471, -0.1567], [-0.1898, -0.0156, -0.0130, ..., -0.0881, -0.1275, -0.0881]], device='cuda:0'), grad: tensor([[ 1.7788e-07, 0.0000e+00, 0.0000e+00, ..., -1.2107e-08, 4.3772e-08, 1.0710e-07], [ 1.9651e-07, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 5.3085e-08, 1.4808e-07], [ 7.7859e-06, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0291e-06, 4.5076e-06], ..., [-1.6317e-05, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -2.1961e-06, -9.0003e-06], [ 2.0955e-07, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 2.0117e-07, 8.8383e-07], [ 1.2107e-07, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 1.0803e-07, -1.2247e-06]], device='cuda:0') Epoch 205, bias, value: tensor([-0.0046, -0.0131, 0.0035, 0.0305, -0.0007, 0.0318, 0.0063, 0.0167, -0.0067, -0.0151], device='cuda:0'), grad: tensor([ 8.0001e-07, 7.1004e-06, 3.5912e-05, 6.0573e-06, 8.4043e-06, 2.1860e-05, 7.5065e-07, -8.1062e-05, 9.9987e-06, -9.9763e-06], device='cuda:0') 100 0.0001 changing lr epoch 204, time 214.02, cls_loss 0.0018 cls_loss_mapping 0.0030 cls_loss_causal 0.5108 re_mapping 0.0050 re_causal 0.0149 /// teacc 99.03 lr 0.00010000 Epoch 206, weight, value: tensor([[-0.0545, -0.0084, -0.0872, ..., 0.0249, -0.0793, -0.0100], [ 0.0407, -0.0648, -0.0198, ..., -0.0239, -0.0590, -0.2572], [-0.0077, 0.0365, -0.0146, ..., -0.1021, 0.1202, -0.0418], ..., [ 0.0194, -0.0300, -0.0106, ..., -0.1130, -0.1699, 0.0327], [-0.0239, -0.0664, -0.0235, ..., -0.0986, 0.0471, -0.1569], [-0.1913, -0.0171, -0.0130, ..., -0.0881, -0.1284, -0.0882]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., -2.7940e-09, 2.1514e-07, 6.1467e-08], [-1.1083e-07, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 2.2817e-07, 4.5635e-08], [ 1.6764e-08, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, -1.3513e-06, 9.8720e-08], ..., [ 1.0245e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.5763e-07, 5.7742e-08], [ 3.7253e-08, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 4.6380e-07, 7.9162e-08], [ 5.5879e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 4.0978e-08, 5.4725e-06]], device='cuda:0') Epoch 206, bias, value: tensor([-0.0047, -0.0132, 0.0040, 0.0300, -0.0007, 0.0323, 0.0062, 0.0167, -0.0069, -0.0153], device='cuda:0'), grad: tensor([ 5.5879e-07, -1.0431e-07, -8.8848e-07, -6.8471e-06, -1.0327e-05, -3.6322e-08, 7.9349e-07, 3.1125e-06, 3.8818e-06, 9.8050e-06], device='cuda:0') 100 0.0001 changing lr epoch 205, time 214.35, cls_loss 0.0013 cls_loss_mapping 0.0017 cls_loss_causal 0.4974 re_mapping 0.0049 re_causal 0.0156 /// teacc 99.10 lr 0.00010000 Epoch 207, weight, value: tensor([[-0.0566, -0.0084, -0.0874, ..., 0.0250, -0.0796, -0.0101], [ 0.0412, -0.0648, -0.0200, ..., -0.0242, -0.0591, -0.2573], [-0.0081, 0.0366, -0.0147, ..., -0.1022, 0.1202, -0.0420], ..., [ 0.0192, -0.0299, -0.0106, ..., -0.1131, -0.1699, 0.0331], [-0.0237, -0.0665, -0.0233, ..., -0.0987, 0.0469, -0.1565], [-0.1935, -0.0175, -0.0131, ..., -0.0882, -0.1288, -0.0882]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.3132e-06, 1.7695e-08, -1.4016e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-08, 2.0489e-08, 2.3283e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.6811e-08, -2.7940e-09, 1.7229e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0245e-08, 2.7940e-09, 9.3132e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.7497e-08, -6.4261e-08, 1.2945e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.8348e-07, 2.8871e-08, 3.5856e-07]], device='cuda:0') Epoch 207, bias, value: tensor([-0.0048, -0.0134, 0.0034, 0.0305, -0.0005, 0.0327, 0.0053, 0.0174, -0.0072, -0.0154], device='cuda:0'), grad: tensor([-9.8497e-06, 2.0072e-05, 3.8184e-06, 2.1365e-06, 1.4920e-06, 4.1537e-07, 1.1595e-06, -2.4557e-05, 6.4448e-07, 4.6268e-06], device='cuda:0') 100 0.0001 changing lr epoch 206, time 214.18, cls_loss 0.0017 cls_loss_mapping 0.0034 cls_loss_causal 0.5093 re_mapping 0.0050 re_causal 0.0150 /// teacc 98.99 lr 0.00010000 Epoch 208, weight, value: tensor([[-0.0569, -0.0086, -0.0874, ..., 0.0248, -0.0806, -0.0101], [ 0.0417, -0.0649, -0.0200, ..., -0.0266, -0.0598, -0.2590], [-0.0082, 0.0370, -0.0148, ..., -0.1022, 0.1206, -0.0403], ..., [ 0.0192, -0.0301, -0.0107, ..., -0.1144, -0.1700, 0.0331], [-0.0238, -0.0666, -0.0233, ..., -0.0978, 0.0489, -0.1567], [-0.1940, -0.0175, -0.0131, ..., -0.0882, -0.1293, -0.0883]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2107e-08, 2.9802e-08, 1.8626e-09], [-9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 9.3132e-09, 9.4995e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0245e-08, 5.2154e-08, 5.5879e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-09, 1.1176e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0245e-08, -1.7136e-07, 8.3819e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 1.4156e-07, 3.3714e-07]], device='cuda:0') Epoch 208, bias, value: tensor([-0.0052, -0.0142, 0.0038, 0.0305, -0.0009, 0.0317, 0.0059, 0.0173, -0.0062, -0.0149], device='cuda:0'), grad: tensor([ 1.4529e-07, -1.8897e-06, 6.6217e-07, -1.9539e-06, -7.7672e-07, -9.1270e-07, 3.6880e-07, 4.4145e-07, 2.1979e-06, 1.7174e-06], device='cuda:0') 100 0.0001 changing lr epoch 207, time 214.49, cls_loss 0.0017 cls_loss_mapping 0.0036 cls_loss_causal 0.4927 re_mapping 0.0051 re_causal 0.0159 /// teacc 98.86 lr 0.00010000 Epoch 209, weight, value: tensor([[-0.0570, -0.0087, -0.0875, ..., 0.0249, -0.0820, -0.0103], [ 0.0421, -0.0650, -0.0203, ..., -0.0268, -0.0597, -0.2591], [-0.0083, 0.0378, -0.0150, ..., -0.1025, 0.1207, -0.0403], ..., [ 0.0191, -0.0305, -0.0108, ..., -0.1146, -0.1699, 0.0328], [-0.0241, -0.0663, -0.0227, ..., -0.0983, 0.0485, -0.1573], [-0.1943, -0.0181, -0.0132, ..., -0.0885, -0.1302, -0.0880]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.3283e-07, 6.9849e-08, 2.4214e-08], [-0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.4214e-08, 1.1176e-07, 2.9150e-07], [ 0.0000e+00, -1.8626e-09, 0.0000e+00, ..., 1.6764e-08, 4.9639e-07, 6.5193e-08], ..., [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 4.6566e-09, 2.8964e-07, 1.2387e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.2596e-08, 1.1034e-05, 5.9884e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.8184e-08, 1.5367e-07, 7.8976e-07]], device='cuda:0') Epoch 209, bias, value: tensor([-0.0057, -0.0139, 0.0037, 0.0300, -0.0012, 0.0331, 0.0066, 0.0171, -0.0069, -0.0147], device='cuda:0'), grad: tensor([-2.9989e-07, 3.0011e-05, 2.9523e-06, -9.2936e-04, -6.2734e-06, 8.6784e-04, 2.0377e-06, -2.9996e-05, 6.7651e-05, -3.8520e-06], device='cuda:0') 100 0.0001 changing lr epoch 208, time 214.07, cls_loss 0.0013 cls_loss_mapping 0.0026 cls_loss_causal 0.5079 re_mapping 0.0052 re_causal 0.0158 /// teacc 99.01 lr 0.00010000 Epoch 210, weight, value: tensor([[-0.0571, -0.0088, -0.0875, ..., 0.0249, -0.0824, -0.0109], [ 0.0423, -0.0650, -0.0203, ..., -0.0269, -0.0608, -0.2596], [-0.0082, 0.0381, -0.0150, ..., -0.1028, 0.1210, -0.0393], ..., [ 0.0191, -0.0307, -0.0108, ..., -0.1149, -0.1700, 0.0327], [-0.0243, -0.0665, -0.0228, ..., -0.0985, 0.0495, -0.1579], [-0.1945, -0.0182, -0.0132, ..., -0.0886, -0.1308, -0.0889]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., -8.8476e-08, 3.7253e-09, 2.6077e-08], [-1.0151e-07, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, -4.1910e-08, 1.5087e-07], [ 1.7695e-08, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, -2.1048e-07, 4.0978e-08], ..., [ 5.5879e-09, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.3039e-07, 4.4517e-07], [ 1.3970e-08, 0.0000e+00, 0.0000e+00, ..., 1.7695e-08, 5.0291e-08, 1.5460e-07], [ 1.2107e-08, 0.0000e+00, 0.0000e+00, ..., 5.5879e-09, 1.3039e-08, 2.4792e-06]], device='cuda:0') Epoch 210, bias, value: tensor([-0.0058, -0.0145, 0.0038, 0.0306, -0.0007, 0.0326, 0.0066, 0.0170, -0.0062, -0.0152], device='cuda:0'), grad: tensor([-1.1083e-07, -6.9197e-07, 1.7788e-07, -7.1712e-08, -8.9332e-06, 4.2375e-07, 4.9546e-07, 1.7565e-06, 9.1270e-07, 6.0722e-06], device='cuda:0') 100 0.0001 changing lr epoch 209, time 214.60, cls_loss 0.0016 cls_loss_mapping 0.0028 cls_loss_causal 0.5093 re_mapping 0.0047 re_causal 0.0149 /// teacc 99.00 lr 0.00010000 Epoch 211, weight, value: tensor([[-0.0576, -0.0088, -0.0875, ..., 0.0248, -0.0830, -0.0110], [ 0.0424, -0.0650, -0.0203, ..., -0.0271, -0.0609, -0.2600], [-0.0079, 0.0383, -0.0150, ..., -0.1029, 0.1212, -0.0397], ..., [ 0.0193, -0.0308, -0.0108, ..., -0.1152, -0.1702, 0.0337], [-0.0243, -0.0666, -0.0228, ..., -0.0988, 0.0494, -0.1583], [-0.1955, -0.0182, -0.0132, ..., -0.0887, -0.1316, -0.0895]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.2107e-08, 0.0000e+00, ..., -1.3765e-06, 3.2596e-08, 2.7940e-09], [ 0.0000e+00, 1.4901e-08, 0.0000e+00, ..., 5.7742e-08, 5.5879e-08, 3.7253e-08], [ 0.0000e+00, -5.1316e-07, 0.0000e+00, ..., 3.0454e-07, -3.2663e-05, -5.8264e-06], ..., [ 0.0000e+00, 3.4273e-07, 0.0000e+00, ..., 1.6764e-08, 3.2127e-05, 5.8711e-06], [ 0.0000e+00, 2.8871e-08, 0.0000e+00, ..., 1.3225e-07, 5.9605e-08, 6.5193e-09], [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 5.7276e-07, 1.5832e-08, 9.8720e-08]], device='cuda:0') Epoch 211, bias, value: tensor([-0.0062, -0.0146, 0.0039, 0.0303, -0.0001, 0.0329, 0.0068, 0.0176, -0.0063, -0.0160], device='cuda:0'), grad: tensor([-1.1280e-05, 4.5039e-06, -1.2827e-04, 1.8775e-06, 7.2643e-07, 4.1444e-07, 1.2824e-06, 1.2136e-04, 2.4959e-06, 7.0147e-06], device='cuda:0') 100 0.0001 changing lr epoch 210, time 214.22, cls_loss 0.0018 cls_loss_mapping 0.0036 cls_loss_causal 0.5147 re_mapping 0.0051 re_causal 0.0156 /// teacc 98.84 lr 0.00010000 Epoch 212, weight, value: tensor([[-0.0580, -0.0078, -0.0875, ..., 0.0259, -0.0826, -0.0111], [ 0.0430, -0.0654, -0.0203, ..., -0.0288, -0.0610, -0.2601], [-0.0079, 0.0387, -0.0150, ..., -0.1031, 0.1241, -0.0399], ..., [ 0.0192, -0.0309, -0.0108, ..., -0.1158, -0.1705, 0.0351], [-0.0246, -0.0669, -0.0228, ..., -0.0990, 0.0469, -0.1559], [-0.1959, -0.0183, -0.0132, ..., -0.0890, -0.1322, -0.0895]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 0.0000e+00, 0.0000e+00, ..., -9.3132e-10, 1.0431e-07, 3.7253e-09], [ 1.8813e-07, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, -7.0967e-06, 1.0245e-08], [ 1.4901e-07, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 6.3051e-07, -2.5146e-08], ..., [ 9.3132e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.4645e-07, 1.3039e-08], [ 4.6566e-08, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, 6.0350e-07, 2.7940e-09], [ 6.5193e-09, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 9.5926e-08, 1.2107e-08]], device='cuda:0') Epoch 212, bias, value: tensor([-0.0053, -0.0156, 0.0065, 0.0301, -0.0008, 0.0330, 0.0059, 0.0191, -0.0087, -0.0160], device='cuda:0'), grad: tensor([ 9.2573e-07, -8.2552e-05, 7.5623e-06, -2.2631e-07, 1.6272e-05, 1.1893e-06, 5.8144e-05, 7.4804e-05, 6.3814e-06, -8.2374e-05], device='cuda:0') 100 0.0001 changing lr epoch 211, time 214.17, cls_loss 0.0017 cls_loss_mapping 0.0031 cls_loss_causal 0.5163 re_mapping 0.0053 re_causal 0.0153 /// teacc 98.99 lr 0.00010000 Epoch 213, weight, value: tensor([[-0.0580, -0.0077, -0.0878, ..., 0.0266, -0.0833, -0.0104], [ 0.0429, -0.0664, -0.0222, ..., -0.0291, -0.0612, -0.2602], [-0.0075, 0.0392, -0.0161, ..., -0.1031, 0.1233, -0.0393], ..., [ 0.0191, -0.0313, -0.0111, ..., -0.1176, -0.1707, 0.0352], [-0.0251, -0.0643, -0.0204, ..., -0.0989, 0.0479, -0.1570], [-0.1960, -0.0185, -0.0134, ..., -0.0891, -0.1342, -0.0895]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.8673e-08, 8.0094e-08, 1.9651e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5832e-08, 6.7614e-07, 2.4121e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.8871e-08, -6.9849e-07, 1.1455e-07], ..., [ 0.0000e+00, -9.3132e-10, 0.0000e+00, ..., 2.7940e-09, 1.1362e-07, 2.9523e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, -6.3051e-07, 4.5449e-06], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 8.8476e-08, 4.0885e-07, -3.3975e-05]], device='cuda:0') Epoch 213, bias, value: tensor([-0.0052, -0.0159, 0.0059, 0.0301, -0.0009, 0.0334, 0.0050, 0.0193, -0.0079, -0.0161], device='cuda:0'), grad: tensor([ 1.0310e-06, 2.1346e-06, -4.7125e-07, 1.6242e-06, 1.4949e-04, 2.5481e-06, 1.5907e-06, 1.8803e-06, 2.2948e-05, -1.8251e-04], device='cuda:0') 100 0.0001 changing lr epoch 212, time 214.40, cls_loss 0.0014 cls_loss_mapping 0.0030 cls_loss_causal 0.4856 re_mapping 0.0051 re_causal 0.0149 /// teacc 98.97 lr 0.00010000 Epoch 214, weight, value: tensor([[-0.0581, -0.0077, -0.0888, ..., 0.0261, -0.0844, -0.0104], [ 0.0430, -0.0668, -0.0222, ..., -0.0294, -0.0620, -0.2603], [-0.0073, 0.0399, -0.0164, ..., -0.1034, 0.1233, -0.0394], ..., [ 0.0191, -0.0312, -0.0114, ..., -0.1181, -0.1709, 0.0351], [-0.0251, -0.0649, -0.0208, ..., -0.1003, 0.0479, -0.1579], [-0.1963, -0.0174, -0.0136, ..., -0.0891, -0.1354, -0.0896]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.7404e-07, 5.5879e-09, 0.0000e+00], [-9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 1.8626e-08, 2.6077e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.2201e-08, 3.2596e-08, 1.8626e-09], ..., [ 9.3132e-10, -5.5879e-09, 0.0000e+00, ..., 6.5193e-09, 2.7940e-09, 1.8626e-09], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 5.8673e-08, -4.3772e-08, 9.3132e-10], [ 0.0000e+00, -9.3132e-10, 0.0000e+00, ..., 2.2072e-07, 2.2352e-08, 1.4901e-08]], device='cuda:0') Epoch 214, bias, value: tensor([-0.0056, -0.0161, 0.0059, 0.0304, -0.0010, 0.0333, 0.0061, 0.0193, -0.0078, -0.0162], device='cuda:0'), grad: tensor([-1.5646e-06, 4.7311e-07, 8.0094e-07, -9.7416e-07, 5.4017e-08, 9.6858e-08, -7.6368e-08, -2.0899e-06, 9.9000e-07, 2.2724e-06], device='cuda:0') 100 0.0001 changing lr epoch 213, time 214.19, cls_loss 0.0012 cls_loss_mapping 0.0026 cls_loss_causal 0.4950 re_mapping 0.0049 re_causal 0.0152 /// teacc 99.03 lr 0.00010000 Epoch 215, weight, value: tensor([[-0.0582, -0.0098, -0.0889, ..., 0.0267, -0.0853, -0.0104], [ 0.0443, -0.0670, -0.0222, ..., -0.0295, -0.0620, -0.2604], [-0.0073, 0.0442, -0.0154, ..., -0.1040, 0.1233, -0.0393], ..., [ 0.0190, -0.0335, -0.0116, ..., -0.1216, -0.1711, 0.0351], [-0.0264, -0.0658, -0.0208, ..., -0.0996, 0.0480, -0.1583], [-0.1967, -0.0169, -0.0136, ..., -0.0892, -0.1363, -0.0896]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.7940e-08, 5.8673e-08, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 3.0734e-08, 9.1270e-08, 9.3132e-10], [ 9.3132e-10, -9.3132e-10, 0.0000e+00, ..., 2.7940e-08, 1.7695e-08, 9.3132e-10], ..., [ 1.8626e-09, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 5.3085e-08, 2.7940e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.0210e-07, 1.8813e-07, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 3.2596e-08, 9.6858e-08, 0.0000e+00]], device='cuda:0') Epoch 215, bias, value: tensor([-0.0053, -0.0160, 0.0059, 0.0302, -0.0013, 0.0331, 0.0062, 0.0193, -0.0078, -0.0161], device='cuda:0'), grad: tensor([ 4.6566e-09, 9.5740e-07, 1.6019e-07, 7.1712e-08, 2.1279e-05, 2.0266e-06, -3.0212e-06, -2.2113e-05, 3.4552e-07, 3.3993e-07], device='cuda:0') 100 0.0001 changing lr epoch 214, time 214.37, cls_loss 0.0016 cls_loss_mapping 0.0036 cls_loss_causal 0.5251 re_mapping 0.0053 re_causal 0.0154 /// teacc 99.01 lr 0.00010000 Epoch 216, weight, value: tensor([[-0.0583, -0.0099, -0.0907, ..., 0.0252, -0.0883, -0.0103], [ 0.0443, -0.0671, -0.0222, ..., -0.0297, -0.0619, -0.2605], [-0.0073, 0.0457, -0.0176, ..., -0.1017, 0.1234, -0.0394], ..., [ 0.0189, -0.0348, -0.0122, ..., -0.1224, -0.1717, 0.0346], [-0.0265, -0.0660, -0.0228, ..., -0.0998, 0.0481, -0.1584], [-0.1970, -0.0170, -0.0137, ..., -0.0893, -0.1391, -0.0898]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.0489e-08, 2.3283e-08, 1.6764e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.6077e-08, 1.4901e-08, 1.2666e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.0862e-07, 0.0000e+00, 7.0781e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8533e-07, 7.0781e-08, 1.4938e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0803e-07, -9.8720e-08, 9.1270e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-08, 1.0710e-07, 3.9525e-06]], device='cuda:0') Epoch 216, bias, value: tensor([-0.0078, -0.0158, 0.0059, 0.0320, -0.0014, 0.0331, 0.0063, 0.0190, -0.0075, -0.0164], device='cuda:0'), grad: tensor([ 2.9895e-07, 9.8906e-07, 2.5537e-06, -6.0238e-06, -3.3587e-05, 1.2107e-06, 1.9092e-07, 1.0699e-05, 1.2536e-06, 2.2441e-05], device='cuda:0') 100 0.0001 changing lr epoch 215, time 214.60, cls_loss 0.0014 cls_loss_mapping 0.0028 cls_loss_causal 0.5039 re_mapping 0.0051 re_causal 0.0150 /// teacc 98.96 lr 0.00010000 Epoch 217, weight, value: tensor([[-0.0585, -0.0099, -0.0908, ..., 0.0258, -0.0884, -0.0103], [ 0.0440, -0.0672, -0.0223, ..., -0.0310, -0.0622, -0.2607], [-0.0070, 0.0471, -0.0177, ..., -0.1019, 0.1235, -0.0396], ..., [ 0.0188, -0.0360, -0.0124, ..., -0.1233, -0.1723, 0.0344], [-0.0258, -0.0663, -0.0229, ..., -0.0992, 0.0482, -0.1587], [-0.1972, -0.0170, -0.0137, ..., -0.0894, -0.1408, -0.0899]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -0.0000e+00, 0.0000e+00, ..., -5.1223e-08, 1.0990e-07, 5.5879e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-08, 7.0874e-07, 1.1362e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0151e-07, -5.6624e-07, -1.5553e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 2.1886e-07, 1.3970e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.0536e-08, 9.6019e-07, 2.7940e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.0489e-08, 2.5872e-06, -6.5193e-09]], device='cuda:0') Epoch 217, bias, value: tensor([-0.0073, -0.0158, 0.0058, 0.0320, -0.0013, 0.0330, 0.0055, 0.0191, -0.0074, -0.0166], device='cuda:0'), grad: tensor([ 2.5984e-07, -1.1581e-04, -1.4435e-07, 3.1203e-05, 7.2867e-06, -3.6895e-05, -6.2473e-06, 1.0562e-04, 3.1963e-06, 1.1712e-05], device='cuda:0') 100 0.0001 changing lr epoch 216, time 214.17, cls_loss 0.0016 cls_loss_mapping 0.0023 cls_loss_causal 0.4589 re_mapping 0.0050 re_causal 0.0138 /// teacc 98.91 lr 0.00010000 Epoch 218, weight, value: tensor([[-0.0588, -0.0130, -0.0909, ..., 0.0263, -0.0887, -0.0101], [ 0.0441, -0.0674, -0.0224, ..., -0.0314, -0.0621, -0.2614], [-0.0071, 0.0504, -0.0177, ..., -0.1005, 0.1235, -0.0399], ..., [ 0.0187, -0.0380, -0.0125, ..., -0.1266, -0.1728, 0.0337], [-0.0256, -0.0668, -0.0229, ..., -0.0997, 0.0483, -0.1565], [-0.1975, -0.0175, -0.0138, ..., -0.0896, -0.1427, -0.0899]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.7008e-08, 0.0000e+00, ..., -5.4948e-08, 2.7101e-07, 1.2107e-08], [ 0.0000e+00, 1.1176e-08, 0.0000e+00, ..., 1.1176e-08, 3.7905e-07, 3.7253e-08], [ 0.0000e+00, -4.2934e-07, 0.0000e+00, ..., 1.4901e-08, -2.8927e-06, 6.6124e-08], ..., [ 0.0000e+00, 3.1851e-07, 0.0000e+00, ..., 9.3132e-10, 2.7008e-06, 8.9407e-08], [ 0.0000e+00, 4.8429e-08, 0.0000e+00, ..., 8.1025e-08, -4.9800e-05, 8.4750e-08], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 5.4948e-08, 3.5405e-05, 3.4757e-06]], device='cuda:0') Epoch 218, bias, value: tensor([-0.0070, -0.0155, 0.0056, 0.0325, -0.0011, 0.0329, 0.0054, 0.0190, -0.0075, -0.0165], device='cuda:0'), grad: tensor([ 5.2899e-07, 6.8452e-07, -6.1318e-06, 2.4214e-06, -1.1556e-05, 1.9997e-05, 8.5607e-06, 6.3144e-06, -1.0818e-04, 8.7321e-05], device='cuda:0') 100 0.0001 changing lr epoch 217, time 214.40, cls_loss 0.0015 cls_loss_mapping 0.0031 cls_loss_causal 0.5078 re_mapping 0.0047 re_causal 0.0145 /// teacc 98.96 lr 0.00010000 Epoch 219, weight, value: tensor([[-0.0591, -0.0131, -0.0912, ..., 0.0266, -0.0894, -0.0102], [ 0.0436, -0.0678, -0.0224, ..., -0.0317, -0.0629, -0.2639], [-0.0072, 0.0508, -0.0180, ..., -0.1010, 0.1237, -0.0377], ..., [ 0.0189, -0.0370, -0.0136, ..., -0.1269, -0.1734, 0.0339], [-0.0246, -0.0688, -0.0233, ..., -0.1001, 0.0484, -0.1566], [-0.1982, -0.0176, -0.0138, ..., -0.0900, -0.1444, -0.0902]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.8312e-07, 0.0000e+00, ..., -5.5879e-09, 8.3819e-09, 9.3132e-10], [ 0.0000e+00, 6.0536e-08, 0.0000e+00, ..., 3.7253e-09, 3.5390e-08, 1.0245e-08], [ 0.0000e+00, 3.5297e-07, 0.0000e+00, ..., 4.6566e-09, -7.8231e-08, 8.3819e-09], ..., [ 0.0000e+00, -1.6652e-06, 0.0000e+00, ..., 9.3132e-10, 2.4214e-08, 1.5832e-08], [ 0.0000e+00, 1.5087e-07, 0.0000e+00, ..., -1.5832e-08, -3.0920e-07, 1.3970e-08], [ 0.0000e+00, 4.0606e-07, 0.0000e+00, ..., 3.7253e-09, 1.1176e-08, 9.0338e-08]], device='cuda:0') Epoch 219, bias, value: tensor([-0.0068, -0.0156, 0.0056, 0.0324, -0.0009, 0.0325, 0.0057, 0.0188, -0.0073, -0.0169], device='cuda:0'), grad: tensor([ 5.1185e-06, -2.8554e-06, 6.5416e-06, 3.2894e-06, 4.9733e-07, 3.6620e-06, 4.3493e-07, -2.6584e-05, 2.1458e-06, 7.7188e-06], device='cuda:0') 100 0.0001 changing lr epoch 218, time 214.44, cls_loss 0.0017 cls_loss_mapping 0.0029 cls_loss_causal 0.4754 re_mapping 0.0050 re_causal 0.0145 /// teacc 99.00 lr 0.00010000 Epoch 220, weight, value: tensor([[-0.0596, -0.0131, -0.0915, ..., 0.0268, -0.0899, -0.0100], [ 0.0439, -0.0672, -0.0215, ..., -0.0318, -0.0626, -0.2639], [-0.0068, 0.0509, -0.0184, ..., -0.1013, 0.1251, -0.0378], ..., [ 0.0188, -0.0364, -0.0144, ..., -0.1258, -0.1764, 0.0338], [-0.0246, -0.0697, -0.0239, ..., -0.0996, 0.0485, -0.1567], [-0.1995, -0.0178, -0.0139, ..., -0.0902, -0.1475, -0.0903]], device='cuda:0'), grad: tensor([[ 1.0245e-07, 0.0000e+00, 0.0000e+00, ..., -1.8701e-06, -2.0675e-07, 0.0000e+00], [ 7.6368e-08, 0.0000e+00, 0.0000e+00, ..., 4.0326e-07, 2.7008e-08, 1.8626e-09], [ 1.3784e-07, 0.0000e+00, 0.0000e+00, ..., 4.0792e-07, 2.8871e-08, 9.3132e-10], ..., [-4.7963e-07, 0.0000e+00, 0.0000e+00, ..., 2.0489e-08, 2.0489e-08, 1.8626e-09], [ 4.4703e-08, 0.0000e+00, 0.0000e+00, ..., 2.9057e-07, 2.9337e-07, 2.8871e-08], [ 4.7497e-08, 0.0000e+00, 0.0000e+00, ..., 4.0885e-07, 5.0571e-07, -1.5106e-06]], device='cuda:0') Epoch 220, bias, value: tensor([-0.0069, -0.0137, 0.0069, 0.0321, -0.0008, 0.0327, 0.0052, 0.0164, -0.0073, -0.0184], device='cuda:0'), grad: tensor([-5.7518e-06, 2.6748e-06, 3.4999e-06, 1.1157e-06, 1.5840e-05, -3.7812e-06, 3.3937e-06, -6.4783e-06, 2.4736e-06, -1.3039e-05], device='cuda:0') 100 0.0001 changing lr epoch 219, time 214.29, cls_loss 0.0015 cls_loss_mapping 0.0025 cls_loss_causal 0.4923 re_mapping 0.0052 re_causal 0.0149 /// teacc 99.00 lr 0.00010000 Epoch 221, weight, value: tensor([[-0.0598, -0.0131, -0.0915, ..., 0.0279, -0.0901, -0.0100], [ 0.0431, -0.0683, -0.0217, ..., -0.0327, -0.0634, -0.2640], [-0.0069, 0.0508, -0.0185, ..., -0.1018, 0.1252, -0.0378], ..., [ 0.0189, -0.0356, -0.0144, ..., -0.1249, -0.1764, 0.0337], [-0.0236, -0.0700, -0.0237, ..., -0.1004, 0.0485, -0.1568], [-0.2000, -0.0183, -0.0139, ..., -0.0910, -0.1480, -0.0906]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.7940e-07, 1.0617e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6019e-07, 2.2445e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.6322e-08, -4.6473e-07, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 1.3597e-07, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8440e-07, 3.0454e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9092e-07, 1.1176e-08, 3.7253e-09]], device='cuda:0') Epoch 221, bias, value: tensor([-0.0055, -0.0140, 0.0069, 0.0321, -0.0005, 0.0323, 0.0054, 0.0166, -0.0073, -0.0188], device='cuda:0'), grad: tensor([-5.6438e-07, 1.7464e-05, 2.3860e-06, 5.8860e-07, 2.1532e-06, 1.1483e-06, -1.1250e-06, 2.0117e-05, 2.3246e-06, -4.4465e-05], device='cuda:0') 100 0.0001 changing lr epoch 220, time 214.47, cls_loss 0.0012 cls_loss_mapping 0.0024 cls_loss_causal 0.4978 re_mapping 0.0044 re_causal 0.0143 /// teacc 98.92 lr 0.00010000 Epoch 222, weight, value: tensor([[-0.0599, -0.0130, -0.0915, ..., 0.0283, -0.0901, -0.0099], [ 0.0430, -0.0683, -0.0218, ..., -0.0333, -0.0639, -0.2646], [-0.0069, 0.0508, -0.0185, ..., -0.1019, 0.1252, -0.0375], ..., [ 0.0190, -0.0354, -0.0144, ..., -0.1251, -0.1765, 0.0336], [-0.0234, -0.0701, -0.0236, ..., -0.1035, 0.0483, -0.1568], [-0.2003, -0.0185, -0.0139, ..., -0.0912, -0.1483, -0.0906]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.6764e-07, 0.0000e+00, ..., 2.5537e-06, 3.1628e-06, 0.0000e+00], [-2.1420e-08, 2.7940e-09, 0.0000e+00, ..., 8.1025e-08, 1.0524e-07, 9.3132e-10], [ 9.3132e-09, -4.3772e-07, 0.0000e+00, ..., -6.5230e-06, -7.9796e-06, 1.8626e-09], ..., [ 2.7940e-09, 1.0151e-07, 0.0000e+00, ..., 1.5227e-06, 1.8841e-06, -1.1176e-08], [ 1.8626e-09, 3.4459e-08, 0.0000e+00, ..., 1.2238e-06, 1.5227e-06, 9.3132e-10], [ 1.8626e-09, 1.3039e-08, 0.0000e+00, ..., 2.1048e-07, 2.6636e-07, 5.5879e-09]], device='cuda:0') Epoch 222, bias, value: tensor([-0.0052, -0.0140, 0.0069, 0.0319, -0.0006, 0.0324, 0.0073, 0.0165, -0.0077, -0.0186], device='cuda:0'), grad: tensor([ 8.2925e-06, 3.7588e-06, -1.9133e-05, 3.4980e-06, 2.4319e-05, 2.3060e-06, -1.4156e-06, 1.8636e-06, 3.0145e-05, -5.3674e-05], device='cuda:0') 100 0.0001 changing lr epoch 221, time 214.52, cls_loss 0.0013 cls_loss_mapping 0.0026 cls_loss_causal 0.5200 re_mapping 0.0047 re_causal 0.0151 /// teacc 98.98 lr 0.00010000 Epoch 223, weight, value: tensor([[-0.0605, -0.0130, -0.0915, ..., 0.0292, -0.0898, -0.0103], [ 0.0428, -0.0684, -0.0218, ..., -0.0354, -0.0640, -0.2648], [-0.0074, 0.0509, -0.0185, ..., -0.1020, 0.1253, -0.0377], ..., [ 0.0190, -0.0354, -0.0144, ..., -0.1252, -0.1766, 0.0334], [-0.0234, -0.0701, -0.0236, ..., -0.1036, 0.0483, -0.1567], [-0.2006, -0.0186, -0.0139, ..., -0.0914, -0.1487, -0.0907]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -0.0000e+00, 0.0000e+00, ..., -6.0536e-09, 1.1642e-08, 0.0000e+00], [-5.1223e-09, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 8.1025e-08, 1.7695e-08], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-09, -2.4447e-07, 9.3132e-10], ..., [ 5.1223e-09, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 7.8697e-08, 9.3132e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-09, 7.9628e-08, 7.4506e-09], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 5.0757e-08, 1.2573e-08]], device='cuda:0') Epoch 223, bias, value: tensor([-0.0047, -0.0143, 0.0069, 0.0333, 0.0001, 0.0312, 0.0071, 0.0167, -0.0078, -0.0190], device='cuda:0'), grad: tensor([ 2.5146e-08, 6.3982e-07, -1.9930e-07, 5.8534e-07, -1.9800e-06, -7.7346e-07, 2.7847e-07, 2.3935e-07, 9.2108e-07, 2.8266e-07], device='cuda:0') 100 0.0001 changing lr epoch 222, time 214.26, cls_loss 0.0018 cls_loss_mapping 0.0029 cls_loss_causal 0.5053 re_mapping 0.0047 re_causal 0.0137 /// teacc 99.03 lr 0.00010000 Epoch 224, weight, value: tensor([[-0.0610, -0.0131, -0.0915, ..., 0.0272, -0.0901, -0.0108], [ 0.0434, -0.0685, -0.0218, ..., -0.0364, -0.0647, -0.2650], [-0.0090, 0.0510, -0.0185, ..., -0.1027, 0.1253, -0.0384], ..., [ 0.0187, -0.0353, -0.0144, ..., -0.1256, -0.1765, 0.0341], [-0.0215, -0.0703, -0.0236, ..., -0.1046, 0.0483, -0.1565], [-0.2012, -0.0190, -0.0139, ..., -0.0888, -0.1492, -0.0909]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., -1.0245e-08, 4.2375e-08, 1.8626e-09], [-2.2352e-08, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 8.5682e-08, 7.2177e-08], [ 1.3504e-08, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, -2.7148e-07, 1.3039e-08], ..., [ 6.0536e-09, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 1.1781e-07, 3.2131e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 3.2596e-09, 1.5739e-07, 2.6543e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.3819e-09, 1.5134e-07, 9.8255e-08]], device='cuda:0') Epoch 224, bias, value: tensor([-0.0070, -0.0141, 0.0065, 0.0330, 0.0001, 0.0305, 0.0083, 0.0171, -0.0079, -0.0178], device='cuda:0'), grad: tensor([ 3.9209e-07, 5.2759e-07, 7.1302e-06, 3.1609e-06, -5.7593e-06, 1.0077e-06, 1.1753e-06, -1.2860e-05, 3.0566e-06, 2.1663e-06], device='cuda:0') 100 0.0001 changing lr epoch 223, time 214.69, cls_loss 0.0015 cls_loss_mapping 0.0025 cls_loss_causal 0.5359 re_mapping 0.0045 re_causal 0.0141 /// teacc 98.99 lr 0.00010000 Epoch 225, weight, value: tensor([[-0.0624, -0.0131, -0.0915, ..., 0.0272, -0.0904, -0.0110], [ 0.0431, -0.0686, -0.0218, ..., -0.0380, -0.0653, -0.2652], [-0.0100, 0.0511, -0.0185, ..., -0.1029, 0.1254, -0.0386], ..., [ 0.0188, -0.0353, -0.0144, ..., -0.1265, -0.1766, 0.0341], [-0.0210, -0.0703, -0.0236, ..., -0.1017, 0.0500, -0.1558], [-0.2028, -0.0191, -0.0139, ..., -0.0888, -0.1499, -0.0910]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.3097e-06, 2.9337e-08, 0.0000e+00], [ 6.0536e-09, 0.0000e+00, 0.0000e+00, ..., 9.3132e-08, 2.8405e-08, 9.3132e-10], [ 9.3132e-10, -0.0000e+00, 0.0000e+00, ..., 1.1409e-07, -7.6788e-07, 4.6566e-10], ..., [-1.2107e-08, 0.0000e+00, 0.0000e+00, ..., 5.5740e-07, 1.1781e-07, 2.3283e-09], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 3.3667e-07, 1.1595e-07, 2.3283e-09], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 5.5833e-07, 1.5832e-08, -1.6391e-07]], device='cuda:0') Epoch 225, bias, value: tensor([-0.0070, -0.0142, 0.0064, 0.0331, 0.0001, 0.0295, 0.0054, 0.0172, -0.0062, -0.0179], device='cuda:0'), grad: tensor([-5.3644e-06, 9.5926e-08, -9.0664e-07, 2.1979e-06, 1.5879e-06, 6.0583e-07, -2.4633e-07, 1.5376e-06, 1.0617e-06, -5.6205e-07], device='cuda:0') 100 0.0001 changing lr epoch 224, time 214.29, cls_loss 0.0014 cls_loss_mapping 0.0033 cls_loss_causal 0.5375 re_mapping 0.0045 re_causal 0.0145 /// teacc 98.94 lr 0.00010000 Epoch 226, weight, value: tensor([[-0.0627, -0.0131, -0.0915, ..., 0.0274, -0.0906, -0.0111], [ 0.0429, -0.0686, -0.0218, ..., -0.0398, -0.0661, -0.2654], [-0.0104, 0.0511, -0.0185, ..., -0.1068, 0.1249, -0.0386], ..., [ 0.0189, -0.0353, -0.0144, ..., -0.1271, -0.1767, 0.0341], [-0.0200, -0.0704, -0.0236, ..., -0.1010, 0.0506, -0.1556], [-0.2036, -0.0190, -0.0139, ..., -0.0889, -0.1509, -0.0911]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.2247e-06, -2.5192e-07, -2.3935e-07], [-9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.0245e-08, 8.5682e-08, 2.3283e-09], [ 0.0000e+00, -1.1176e-08, 0.0000e+00, ..., 1.4575e-07, -5.0254e-06, -1.0710e-08], ..., [ 4.6566e-10, 9.3132e-09, 0.0000e+00, ..., 6.9849e-09, 5.9120e-06, 3.1199e-08], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 2.8545e-07, -1.1809e-06, 5.7276e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.0210e-07, 4.8708e-07, 1.1548e-07]], device='cuda:0') Epoch 226, bias, value: tensor([-0.0070, -0.0141, 0.0059, 0.0322, -0.0001, 0.0296, 0.0056, 0.0173, -0.0056, -0.0179], device='cuda:0'), grad: tensor([-7.1228e-06, 2.3935e-07, -8.9332e-06, 5.1828e-07, 3.8557e-07, -6.6636e-07, 9.3458e-07, 1.1496e-05, -1.0096e-06, 4.1388e-06], device='cuda:0') 100 0.0001 changing lr epoch 225, time 214.32, cls_loss 0.0013 cls_loss_mapping 0.0023 cls_loss_causal 0.5035 re_mapping 0.0047 re_causal 0.0141 /// teacc 99.05 lr 0.00010000 Epoch 227, weight, value: tensor([[-0.0628, -0.0149, -0.0923, ..., 0.0274, -0.0910, -0.0112], [ 0.0428, -0.0693, -0.0218, ..., -0.0421, -0.0668, -0.2654], [-0.0107, 0.0513, -0.0201, ..., -0.1070, 0.1249, -0.0387], ..., [ 0.0192, -0.0347, -0.0153, ..., -0.1277, -0.1767, 0.0341], [-0.0200, -0.0710, -0.0245, ..., -0.1011, 0.0506, -0.1557], [-0.2041, -0.0164, -0.0140, ..., -0.0886, -0.1518, -0.0911]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., -1.5730e-06, -4.9779e-07, 3.0268e-08], [-3.3854e-07, 9.3132e-10, 0.0000e+00, ..., -8.8708e-07, -5.4808e-07, 1.6624e-07], [ 9.5461e-08, 9.3132e-10, 0.0000e+00, ..., 1.1511e-06, 6.7893e-07, 5.1968e-07], ..., [ 1.0710e-08, -8.8476e-09, 0.0000e+00, ..., 3.5390e-08, 9.7789e-08, 2.8405e-08], [ 2.2817e-08, 9.3132e-10, 0.0000e+00, ..., 1.5367e-07, 3.5763e-07, 1.2945e-07], [ 2.3283e-09, 1.8626e-09, 0.0000e+00, ..., 1.1921e-07, 1.2293e-07, 1.2899e-07]], device='cuda:0') Epoch 227, bias, value: tensor([-7.2020e-03, -1.4230e-02, 5.3840e-03, 3.2020e-02, -6.8047e-05, 3.1221e-02, 5.3288e-03, 1.7816e-02, -5.6898e-03, -1.7882e-02], device='cuda:0'), grad: tensor([-3.0622e-06, -2.1365e-06, 5.3085e-06, -6.8769e-06, 5.6811e-08, 1.2955e-06, 2.5444e-06, 4.5169e-07, 1.5683e-06, 8.5216e-07], device='cuda:0') 100 0.0001 changing lr epoch 226, time 214.41, cls_loss 0.0012 cls_loss_mapping 0.0022 cls_loss_causal 0.5043 re_mapping 0.0046 re_causal 0.0146 /// teacc 98.98 lr 0.00010000 Epoch 228, weight, value: tensor([[-0.0629, -0.0201, -0.0923, ..., 0.0273, -0.0909, -0.0088], [ 0.0426, -0.0718, -0.0218, ..., -0.0419, -0.0667, -0.2656], [-0.0110, 0.0547, -0.0201, ..., -0.1071, 0.1249, -0.0393], ..., [ 0.0192, -0.0373, -0.0153, ..., -0.1280, -0.1767, 0.0332], [-0.0193, -0.0732, -0.0245, ..., -0.1012, 0.0507, -0.1558], [-0.2043, -0.0115, -0.0140, ..., -0.0882, -0.1545, -0.0912]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.0245e-08, 0.0000e+00, ..., -2.7716e-06, -6.3889e-07, 0.0000e+00], [-9.3132e-10, 6.0536e-09, 0.0000e+00, ..., 1.2433e-07, 4.5635e-08, 9.3132e-10], [ 0.0000e+00, 5.1223e-09, 0.0000e+00, ..., 3.9116e-07, 9.3598e-08, 4.6566e-10], ..., [ 4.6566e-10, 6.3330e-08, 0.0000e+00, ..., 4.7963e-08, 6.3796e-08, 1.8626e-09], [ 0.0000e+00, 1.7695e-08, 0.0000e+00, ..., 1.1129e-07, -4.4703e-07, 4.6566e-10], [ 0.0000e+00, -1.5832e-07, 0.0000e+00, ..., 1.2713e-06, 5.6904e-07, -1.1176e-08]], device='cuda:0') Epoch 228, bias, value: tensor([-7.2335e-03, -1.4186e-02, 5.3442e-03, 2.9328e-02, 1.8841e-05, 3.3840e-02, 5.3064e-03, 1.7720e-02, -5.5352e-03, -1.8148e-02], device='cuda:0'), grad: tensor([-7.7337e-06, 5.5647e-07, 1.7490e-06, 3.0966e-07, 3.5554e-05, 1.0021e-06, 8.6986e-07, 5.4576e-07, -2.9989e-07, -3.2544e-05], device='cuda:0') 100 0.0001 changing lr epoch 227, time 214.25, cls_loss 0.0015 cls_loss_mapping 0.0020 cls_loss_causal 0.5340 re_mapping 0.0045 re_causal 0.0143 /// teacc 99.05 lr 0.00010000 Epoch 229, weight, value: tensor([[-0.0632, -0.0204, -0.0951, ..., 0.0273, -0.0911, -0.0091], [ 0.0430, -0.0752, -0.0222, ..., -0.0421, -0.0669, -0.2658], [-0.0115, 0.0550, -0.0223, ..., -0.1071, 0.1249, -0.0387], ..., [ 0.0193, -0.0359, -0.0177, ..., -0.1282, -0.1768, 0.0331], [-0.0193, -0.0741, -0.0274, ..., -0.1012, 0.0507, -0.1560], [-0.2052, -0.0110, -0.0148, ..., -0.0882, -0.1551, -0.0917]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.5832e-08, -3.7253e-09, 1.8626e-09], [ 1.3970e-09, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 5.1223e-09, 1.3039e-08], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 2.8871e-08, 5.1223e-09], ..., [ 2.7940e-09, 1.3970e-09, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 2.4214e-08], [ 0.0000e+00, 2.7940e-09, 0.0000e+00, ..., 1.3970e-09, -4.3772e-08, 4.1910e-09], [ 9.3132e-10, -1.2107e-08, 0.0000e+00, ..., 5.5879e-09, 6.0536e-09, -1.1642e-08]], device='cuda:0') Epoch 229, bias, value: tensor([-0.0072, -0.0143, 0.0053, 0.0307, 0.0006, 0.0333, 0.0053, 0.0174, -0.0055, -0.0180], device='cuda:0'), grad: tensor([ 1.6764e-08, 4.2059e-06, 1.2055e-05, -9.9186e-07, 8.9966e-07, 4.2235e-07, 5.7276e-08, -1.8254e-05, 3.1255e-06, -1.5246e-06], device='cuda:0') 100 0.0001 changing lr epoch 228, time 214.35, cls_loss 0.0011 cls_loss_mapping 0.0014 cls_loss_causal 0.5010 re_mapping 0.0045 re_causal 0.0137 /// teacc 99.02 lr 0.00010000 Epoch 230, weight, value: tensor([[-0.0632, -0.0214, -0.0952, ..., 0.0274, -0.0913, -0.0082], [ 0.0430, -0.0768, -0.0224, ..., -0.0418, -0.0671, -0.2662], [-0.0116, 0.0549, -0.0226, ..., -0.1073, 0.1250, -0.0388], ..., [ 0.0193, -0.0331, -0.0182, ..., -0.1283, -0.1768, 0.0331], [-0.0194, -0.0756, -0.0278, ..., -0.1011, 0.0509, -0.1560], [-0.2053, -0.0115, -0.0149, ..., -0.0883, -0.1559, -0.0919]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -6.5193e-08, 0.0000e+00, ..., -1.8440e-07, 6.3796e-08, 1.3504e-08], [-3.1199e-08, 1.8626e-09, 0.0000e+00, ..., 7.8231e-08, 9.6392e-08, 1.3039e-07], [ 1.8161e-08, 5.5879e-09, 0.0000e+00, ..., 6.3190e-07, 6.8359e-07, 2.9383e-07], ..., [ 6.5193e-09, 4.6566e-10, 0.0000e+00, ..., 2.0489e-08, 3.8184e-08, 6.4727e-08], [ 4.6566e-09, 1.8626e-09, 0.0000e+00, ..., 1.0338e-07, 3.0734e-08, 2.9337e-07], [ 0.0000e+00, 4.2375e-08, 0.0000e+00, ..., 1.8859e-07, 5.3551e-08, 5.8673e-07]], device='cuda:0') Epoch 230, bias, value: tensor([-0.0072, -0.0143, 0.0052, 0.0308, 0.0009, 0.0326, 0.0053, 0.0175, -0.0055, -0.0182], device='cuda:0'), grad: tensor([-4.6156e-06, 7.9861e-07, 4.3698e-06, 5.4482e-07, 8.8066e-06, 1.3448e-06, -1.9789e-05, -1.6578e-07, 2.2743e-06, 6.3814e-06], device='cuda:0') 100 0.0001 changing lr epoch 229, time 214.39, cls_loss 0.0016 cls_loss_mapping 0.0026 cls_loss_causal 0.5202 re_mapping 0.0045 re_causal 0.0140 /// teacc 98.92 lr 0.00010000 Epoch 231, weight, value: tensor([[-0.0647, -0.0215, -0.0952, ..., 0.0276, -0.0913, -0.0084], [ 0.0436, -0.0773, -0.0224, ..., -0.0427, -0.0673, -0.2664], [-0.0111, 0.0568, -0.0226, ..., -0.1073, 0.1251, -0.0387], ..., [ 0.0189, -0.0342, -0.0182, ..., -0.1286, -0.1770, 0.0330], [-0.0198, -0.0773, -0.0278, ..., -0.1012, 0.0509, -0.1562], [-0.2062, -0.0112, -0.0149, ..., -0.0884, -0.1569, -0.0920]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.4435e-08, 0.0000e+00, ..., 2.2026e-07, 2.9523e-07, 2.3283e-09], [-2.0303e-07, 4.1910e-09, 0.0000e+00, ..., -4.0047e-08, 4.5588e-07, 4.6566e-10], [ 4.3306e-08, 1.3970e-09, 0.0000e+00, ..., 1.4668e-07, 1.6391e-06, 0.0000e+00], ..., [ 1.5832e-08, -8.3819e-09, 0.0000e+00, ..., 2.0489e-08, 1.9427e-06, 1.3970e-09], [ 5.6811e-08, 6.9849e-09, 0.0000e+00, ..., 1.2573e-07, -5.5730e-06, 4.6566e-10], [ 8.8476e-09, -3.5856e-08, 0.0000e+00, ..., 4.6566e-09, 2.0768e-07, -2.7940e-09]], device='cuda:0') Epoch 231, bias, value: tensor([-0.0072, -0.0135, 0.0053, 0.0307, 0.0007, 0.0321, 0.0055, 0.0169, -0.0055, -0.0183], device='cuda:0'), grad: tensor([ 9.6671e-07, 1.8198e-06, 5.2229e-06, 3.2410e-06, 1.7891e-06, 8.8215e-06, -7.7412e-06, 1.3418e-05, -2.6926e-05, -6.3283e-07], device='cuda:0') 100 0.0001 changing lr epoch 230, time 214.17, cls_loss 0.0019 cls_loss_mapping 0.0037 cls_loss_causal 0.5263 re_mapping 0.0047 re_causal 0.0142 /// teacc 98.94 lr 0.00010000 Epoch 232, weight, value: tensor([[-0.0655, -0.0216, -0.0952, ..., 0.0255, -0.0945, -0.0087], [ 0.0442, -0.0775, -0.0224, ..., -0.0440, -0.0699, -0.2666], [-0.0117, 0.0571, -0.0230, ..., -0.1074, 0.1252, -0.0388], ..., [ 0.0198, -0.0342, -0.0180, ..., -0.1287, -0.1772, 0.0331], [-0.0201, -0.0779, -0.0279, ..., -0.1013, 0.0513, -0.1562], [-0.2120, -0.0113, -0.0149, ..., -0.0884, -0.1581, -0.0920]], device='cuda:0'), grad: tensor([[ 7.3109e-08, 0.0000e+00, 0.0000e+00, ..., -1.0729e-06, -2.7940e-09, 4.1910e-09], [ 1.2573e-08, 0.0000e+00, 0.0000e+00, ..., 7.9162e-09, 1.7416e-07, 1.3039e-08], [ 8.0559e-08, 0.0000e+00, 0.0000e+00, ..., 1.0896e-07, 2.3186e-05, 4.6566e-09], ..., [ 1.8626e-08, 0.0000e+00, 0.0000e+00, ..., 5.5879e-09, 1.4808e-07, 7.9162e-09], [-2.6682e-07, 0.0000e+00, 0.0000e+00, ..., 8.5216e-08, -2.5034e-05, 1.1176e-08], [ 3.2596e-09, 0.0000e+00, 0.0000e+00, ..., 4.9919e-07, 5.3504e-07, -6.9384e-08]], device='cuda:0') Epoch 232, bias, value: tensor([-0.0087, -0.0126, 0.0052, 0.0306, -0.0009, 0.0311, 0.0068, 0.0159, -0.0051, -0.0176], device='cuda:0'), grad: tensor([-2.7642e-06, 6.9756e-07, 5.6207e-05, 1.3653e-06, 2.1160e-06, 8.2515e-07, 1.7267e-06, 5.7276e-07, -5.9485e-05, -1.1930e-06], device='cuda:0') 100 0.0001 changing lr epoch 231, time 214.17, cls_loss 0.0011 cls_loss_mapping 0.0021 cls_loss_causal 0.4880 re_mapping 0.0046 re_causal 0.0146 /// teacc 98.98 lr 0.00010000 Epoch 233, weight, value: tensor([[-0.0660, -0.0216, -0.0952, ..., 0.0255, -0.0947, -0.0088], [ 0.0447, -0.0780, -0.0225, ..., -0.0448, -0.0699, -0.2667], [-0.0121, 0.0571, -0.0230, ..., -0.1076, 0.1252, -0.0389], ..., [ 0.0199, -0.0337, -0.0180, ..., -0.1294, -0.1772, 0.0331], [-0.0203, -0.0780, -0.0279, ..., -0.1013, 0.0513, -0.1563], [-0.2134, -0.0117, -0.0151, ..., -0.0882, -0.1589, -0.0925]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.3085e-08, 1.2573e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-09, 1.9092e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, -9.6392e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.2596e-09, 1.0012e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.0536e-09, -3.7299e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2573e-08, 1.6438e-07, 3.2596e-09]], device='cuda:0') Epoch 233, bias, value: tensor([-0.0087, -0.0127, 0.0051, 0.0304, -0.0004, 0.0313, 0.0068, 0.0161, -0.0050, -0.0183], device='cuda:0'), grad: tensor([-1.0105e-07, 1.4668e-07, -3.2131e-08, 1.5413e-07, 4.4238e-08, 1.8068e-07, 2.0349e-07, -2.1886e-07, -8.7637e-07, 4.8429e-07], device='cuda:0') 100 0.0001 changing lr epoch 232, time 214.31, cls_loss 0.0017 cls_loss_mapping 0.0039 cls_loss_causal 0.4971 re_mapping 0.0047 re_causal 0.0143 /// teacc 98.93 lr 0.00010000 Epoch 234, weight, value: tensor([[-0.0660, -0.0216, -0.0967, ..., 0.0258, -0.0948, -0.0091], [ 0.0449, -0.0805, -0.0230, ..., -0.0449, -0.0701, -0.2669], [-0.0124, 0.0572, -0.0243, ..., -0.1077, 0.1252, -0.0387], ..., [ 0.0199, -0.0329, -0.0182, ..., -0.1297, -0.1773, 0.0335], [-0.0203, -0.0782, -0.0283, ..., -0.0999, 0.0529, -0.1565], [-0.2138, -0.0117, -0.0160, ..., -0.0883, -0.1600, -0.0926]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., -9.3132e-10, 8.6147e-08, 4.6566e-10], [-9.7789e-09, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 4.0047e-08, 9.7789e-09], [ 5.5879e-09, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, -2.8824e-07, 2.4680e-08], ..., [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.2352e-08, 2.7940e-09], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 4.7032e-08, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 1.8161e-08, 1.4435e-08]], device='cuda:0') Epoch 234, bias, value: tensor([-0.0086, -0.0127, 0.0050, 0.0312, -0.0005, 0.0304, 0.0047, 0.0163, -0.0036, -0.0183], device='cuda:0'), grad: tensor([ 1.8859e-07, 1.7518e-06, 1.3877e-07, 5.4240e-06, -2.0443e-07, -5.2620e-07, 1.1781e-07, -8.9034e-06, 1.6093e-06, 3.7719e-07], device='cuda:0') 100 0.0001 changing lr epoch 233, time 214.22, cls_loss 0.0013 cls_loss_mapping 0.0023 cls_loss_causal 0.5276 re_mapping 0.0044 re_causal 0.0140 /// teacc 99.01 lr 0.00010000 Epoch 235, weight, value: tensor([[-0.0655, -0.0216, -0.0968, ..., 0.0259, -0.0950, -0.0069], [ 0.0449, -0.0808, -0.0230, ..., -0.0450, -0.0702, -0.2669], [-0.0122, 0.0572, -0.0243, ..., -0.1078, 0.1258, -0.0386], ..., [ 0.0207, -0.0327, -0.0182, ..., -0.1303, -0.1784, 0.0335], [-0.0207, -0.0783, -0.0283, ..., -0.1004, 0.0527, -0.1566], [-0.2144, -0.0119, -0.0161, ..., -0.0884, -0.1607, -0.0927]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -7.1246e-08, 2.7381e-07, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 2.1420e-08, 5.6345e-08, 4.6566e-10], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 4.0047e-08, 7.7486e-07, 0.0000e+00], ..., [-1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 2.3283e-09, 2.5611e-08, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., -1.3411e-07, -1.0328e-06, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 3.7719e-08, 7.5437e-08, 2.3283e-09]], device='cuda:0') Epoch 235, bias, value: tensor([-0.0083, -0.0129, 0.0057, 0.0312, -0.0006, 0.0302, 0.0051, 0.0160, -0.0039, -0.0186], device='cuda:0'), grad: tensor([-4.7171e-07, 1.8850e-06, 2.5909e-06, -1.0431e-06, -6.5088e-05, 2.0526e-06, 1.3188e-06, 4.0270e-06, -2.5369e-06, 5.7250e-05], device='cuda:0') 100 0.0001 changing lr epoch 234, time 214.17, cls_loss 0.0010 cls_loss_mapping 0.0022 cls_loss_causal 0.5052 re_mapping 0.0046 re_causal 0.0146 /// teacc 98.88 lr 0.00010000 Epoch 236, weight, value: tensor([[-0.0681, -0.0216, -0.0968, ..., 0.0262, -0.0949, -0.0079], [ 0.0457, -0.0808, -0.0231, ..., -0.0451, -0.0704, -0.2671], [-0.0138, 0.0572, -0.0244, ..., -0.1078, 0.1259, -0.0386], ..., [ 0.0217, -0.0327, -0.0183, ..., -0.1305, -0.1785, 0.0336], [-0.0212, -0.0783, -0.0283, ..., -0.1006, 0.0527, -0.1567], [-0.2157, -0.0119, -0.0161, ..., -0.0883, -0.1612, -0.0928]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., -3.7719e-08, 8.3819e-09, 0.0000e+00], [-1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 6.9849e-09, 4.7963e-08, 0.0000e+00], [-2.3749e-08, 0.0000e+00, 0.0000e+00, ..., 5.1223e-09, -4.0280e-07, 0.0000e+00], ..., [ 7.4506e-09, 0.0000e+00, 0.0000e+00, ..., 6.5658e-08, 1.9278e-07, 0.0000e+00], [ 8.8476e-09, 0.0000e+00, 0.0000e+00, ..., 3.4459e-07, 1.1176e-08, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., -4.7171e-07, 1.7229e-08, 0.0000e+00]], device='cuda:0') Epoch 236, bias, value: tensor([-0.0081, -0.0130, 0.0057, 0.0314, -0.0002, 0.0302, 0.0050, 0.0160, -0.0040, -0.0190], device='cuda:0'), grad: tensor([ 6.4867e-07, 6.8592e-07, -4.9965e-07, 1.9763e-06, -1.5013e-06, 3.1060e-07, 1.3690e-07, 2.4904e-06, 1.0669e-05, -1.4916e-05], device='cuda:0') 100 0.0001 changing lr epoch 235, time 214.22, cls_loss 0.0017 cls_loss_mapping 0.0021 cls_loss_causal 0.4881 re_mapping 0.0046 re_causal 0.0134 /// teacc 99.05 lr 0.00010000 Epoch 237, weight, value: tensor([[-0.0673, -0.0216, -0.0968, ..., 0.0275, -0.0937, -0.0083], [ 0.0405, -0.0808, -0.0232, ..., -0.0455, -0.0715, -0.2674], [-0.0131, 0.0572, -0.0245, ..., -0.1094, 0.1258, -0.0390], ..., [ 0.0238, -0.0327, -0.0184, ..., -0.1310, -0.1786, 0.0334], [-0.0197, -0.0783, -0.0286, ..., -0.1006, 0.0527, -0.1561], [-0.2192, -0.0118, -0.0161, ..., -0.0885, -0.1617, -0.0928]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.3411e-07, -1.1595e-07, 9.3132e-10], [ 4.6566e-09, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 4.7032e-08, 6.0536e-09], [ 2.3283e-09, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, -7.3016e-07, 1.8626e-09], ..., [-1.2107e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 7.2177e-08, 6.5193e-09], [ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 4.9360e-08, 5.3551e-07, 9.3132e-10], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 1.1642e-08, 2.0955e-08]], device='cuda:0') Epoch 237, bias, value: tensor([-0.0066, -0.0133, 0.0053, 0.0316, -0.0011, 0.0302, 0.0049, 0.0162, -0.0040, -0.0180], device='cuda:0'), grad: tensor([-2.7847e-07, -1.2629e-06, -6.9384e-07, 3.5809e-07, 9.3132e-10, -2.3330e-07, 2.3283e-07, 2.0443e-07, 9.4064e-07, 7.3202e-07], device='cuda:0') 100 0.0001 changing lr epoch 236, time 214.27, cls_loss 0.0013 cls_loss_mapping 0.0018 cls_loss_causal 0.4963 re_mapping 0.0045 re_causal 0.0138 /// teacc 99.07 lr 0.00010000 Epoch 238, weight, value: tensor([[-0.0668, -0.0216, -0.0968, ..., 0.0283, -0.0937, -0.0085], [ 0.0408, -0.0813, -0.0232, ..., -0.0460, -0.0709, -0.2676], [-0.0138, 0.0571, -0.0245, ..., -0.1096, 0.1258, -0.0412], ..., [ 0.0244, -0.0322, -0.0184, ..., -0.1313, -0.1787, 0.0327], [-0.0198, -0.0783, -0.0286, ..., -0.1007, 0.0527, -0.1564], [-0.2201, -0.0123, -0.0161, ..., -0.0893, -0.1630, -0.0929]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.6186e-06, -1.3085e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7206e-07, 3.3211e-06, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.4482e-08, -3.7644e-06, -1.5367e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.1246e-08, 1.7742e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.1479e-07, 3.6461e-07, 4.6566e-10], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.2387e-07, 4.9826e-08, 2.3283e-09]], device='cuda:0') Epoch 238, bias, value: tensor([-0.0056, -0.0125, 0.0051, 0.0316, -0.0006, 0.0302, 0.0049, 0.0163, -0.0048, -0.0185], device='cuda:0'), grad: tensor([-5.3048e-06, 1.7002e-05, -5.8971e-06, -6.9803e-07, 1.2359e-06, 1.6615e-06, -3.4133e-07, -1.0572e-05, 3.0231e-06, -1.0477e-07], device='cuda:0') 100 0.0001 changing lr epoch 237, time 214.24, cls_loss 0.0012 cls_loss_mapping 0.0020 cls_loss_causal 0.5074 re_mapping 0.0044 re_causal 0.0137 /// teacc 98.92 lr 0.00010000 Epoch 239, weight, value: tensor([[-0.0670, -0.0217, -0.0969, ..., 0.0292, -0.0934, -0.0097], [ 0.0407, -0.0815, -0.0233, ..., -0.0462, -0.0711, -0.2678], [-0.0140, 0.0571, -0.0246, ..., -0.1097, 0.1258, -0.0417], ..., [ 0.0246, -0.0320, -0.0187, ..., -0.1323, -0.1786, 0.0327], [-0.0197, -0.0783, -0.0289, ..., -0.1006, 0.0528, -0.1562], [-0.2216, -0.0124, -0.0162, ..., -0.0895, -0.1650, -0.0935]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., -2.4680e-08, 6.1933e-08, 0.0000e+00], [ 2.6543e-08, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, 9.7323e-08, 1.8626e-09], [ 8.8476e-09, 0.0000e+00, 0.0000e+00, ..., -5.7276e-08, -8.8802e-07, 0.0000e+00], ..., [-8.4285e-08, 0.0000e+00, 0.0000e+00, ..., 1.3039e-08, 3.6089e-07, 0.0000e+00], [ 1.3504e-08, 0.0000e+00, 0.0000e+00, ..., 1.3504e-08, -1.3411e-06, 0.0000e+00], [ 2.0955e-08, 0.0000e+00, 0.0000e+00, ..., 2.2817e-08, 1.4212e-06, 9.3132e-10]], device='cuda:0') Epoch 239, bias, value: tensor([-0.0049, -0.0126, 0.0047, 0.0315, 0.0002, 0.0303, 0.0045, 0.0167, -0.0046, -0.0191], device='cuda:0'), grad: tensor([ 8.8941e-08, 4.1584e-07, -1.0021e-06, 1.0338e-06, 4.5588e-07, -1.3234e-06, 6.7055e-07, 7.0967e-07, -3.0622e-06, 2.0023e-06], device='cuda:0') 100 0.0001 changing lr epoch 238, time 214.50, cls_loss 0.0013 cls_loss_mapping 0.0016 cls_loss_causal 0.4912 re_mapping 0.0044 re_causal 0.0136 /// teacc 98.98 lr 0.00010000 Epoch 240, weight, value: tensor([[-0.0672, -0.0217, -0.0969, ..., 0.0298, -0.0933, -0.0093], [ 0.0395, -0.0815, -0.0233, ..., -0.0464, -0.0703, -0.2679], [-0.0124, 0.0571, -0.0246, ..., -0.1097, 0.1256, -0.0417], ..., [ 0.0247, -0.0320, -0.0187, ..., -0.1334, -0.1787, 0.0326], [-0.0197, -0.0783, -0.0289, ..., -0.1007, 0.0529, -0.1562], [-0.2227, -0.0124, -0.0162, ..., -0.0906, -0.1666, -0.0936]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., -3.8277e-07, 1.7313e-06, 4.6566e-10], [ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 4.3400e-07, 1.3066e-06, 9.3132e-10], [-1.5832e-08, 0.0000e+00, 0.0000e+00, ..., 4.4657e-07, -1.9697e-07, -7.4506e-09], ..., [ 6.0536e-09, 0.0000e+00, 0.0000e+00, ..., 1.0198e-07, 4.3446e-07, 5.1223e-09], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., -3.0939e-06, -8.3521e-06, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.7218e-07, 1.2852e-06, 2.3283e-09]], device='cuda:0') Epoch 240, bias, value: tensor([-4.4404e-03, -1.2073e-02, 4.1750e-03, 3.1554e-02, 4.0330e-05, 3.0267e-02, 4.5086e-03, 1.6711e-02, -4.4796e-03, -1.9461e-02], device='cuda:0'), grad: tensor([ 2.5928e-06, 3.7104e-06, 2.7791e-06, 3.0249e-06, 1.0552e-06, 4.7423e-06, 3.3490e-06, -2.7958e-06, -2.3648e-05, 5.1931e-06], device='cuda:0') 100 0.0001 changing lr epoch 239, time 214.48, cls_loss 0.0012 cls_loss_mapping 0.0019 cls_loss_causal 0.4738 re_mapping 0.0044 re_causal 0.0135 /// teacc 99.00 lr 0.00010000 Epoch 241, weight, value: tensor([[-0.0668, -0.0217, -0.0969, ..., 0.0301, -0.0934, -0.0093], [ 0.0378, -0.0815, -0.0233, ..., -0.0441, -0.0706, -0.2681], [-0.0147, 0.0571, -0.0246, ..., -0.1098, 0.1256, -0.0416], ..., [ 0.0272, -0.0320, -0.0187, ..., -0.1343, -0.1787, 0.0325], [-0.0172, -0.0783, -0.0289, ..., -0.1006, 0.0530, -0.1563], [-0.2237, -0.0124, -0.0162, ..., -0.0910, -0.1675, -0.0936]], device='cuda:0'), grad: tensor([[ 8.8476e-09, 0.0000e+00, 0.0000e+00, ..., -5.8208e-08, -1.7229e-08, 2.7940e-09], [ 6.7055e-08, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 3.4925e-08, 2.1420e-08], [ 2.9802e-08, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, -1.7788e-07, 1.8626e-08], ..., [ 4.2375e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.0070e-08, 2.0955e-08], [ 1.1176e-08, 0.0000e+00, 0.0000e+00, ..., 6.9849e-09, 1.7695e-08, 3.7253e-09], [ 1.2200e-07, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 3.2596e-09, 4.4703e-08]], device='cuda:0') Epoch 241, bias, value: tensor([-0.0043, -0.0119, 0.0038, 0.0317, -0.0008, 0.0302, 0.0042, 0.0170, -0.0044, -0.0187], device='cuda:0'), grad: tensor([-6.7521e-08, 1.4529e-06, 2.2464e-06, 1.1437e-06, -5.3551e-07, 5.4482e-08, 3.7486e-07, -5.3607e-06, 1.3364e-07, 5.4995e-07], device='cuda:0') 100 0.0001 changing lr epoch 240, time 214.83, cls_loss 0.0018 cls_loss_mapping 0.0024 cls_loss_causal 0.5257 re_mapping 0.0044 re_causal 0.0134 /// teacc 99.00 lr 0.00010000 Epoch 242, weight, value: tensor([[-0.0672, -0.0217, -0.0969, ..., 0.0302, -0.0937, -0.0093], [ 0.0379, -0.0815, -0.0233, ..., -0.0442, -0.0709, -0.2682], [-0.0150, 0.0571, -0.0246, ..., -0.1099, 0.1257, -0.0419], ..., [ 0.0275, -0.0320, -0.0187, ..., -0.1342, -0.1788, 0.0322], [-0.0172, -0.0783, -0.0289, ..., -0.1007, 0.0531, -0.1558], [-0.2256, -0.0124, -0.0163, ..., -0.0914, -0.1689, -0.0936]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -3.0641e-07, 1.7323e-07, 2.7008e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9802e-08, 1.2247e-07, 2.1886e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.8184e-08, -4.0559e-07, -1.4994e-07], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7229e-08, 3.1479e-07, 4.8894e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0803e-07, -4.9267e-07, -5.5879e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4948e-07, -2.0117e-07, 1.8626e-09]], device='cuda:0') Epoch 242, bias, value: tensor([-0.0040, -0.0123, 0.0036, 0.0333, -0.0007, 0.0284, 0.0042, 0.0177, -0.0044, -0.0192], device='cuda:0'), grad: tensor([-5.4622e-07, 2.8033e-07, -6.0583e-07, 5.1558e-06, 5.3970e-07, 2.4270e-06, -8.3260e-07, 9.0711e-07, -1.8906e-07, -7.1637e-06], device='cuda:0') 100 0.0001 changing lr epoch 241, time 214.65, cls_loss 0.0011 cls_loss_mapping 0.0023 cls_loss_causal 0.5003 re_mapping 0.0043 re_causal 0.0138 /// teacc 99.03 lr 0.00010000 Epoch 243, weight, value: tensor([[-0.0674, -0.0217, -0.0969, ..., 0.0307, -0.0941, -0.0095], [ 0.0378, -0.0815, -0.0233, ..., -0.0449, -0.0710, -0.2687], [-0.0150, 0.0574, -0.0246, ..., -0.1102, 0.1258, -0.0427], ..., [ 0.0280, -0.0322, -0.0187, ..., -0.1345, -0.1789, 0.0316], [-0.0173, -0.0785, -0.0289, ..., -0.1007, 0.0531, -0.1561], [-0.2284, -0.0124, -0.0163, ..., -0.0914, -0.1696, -0.0937]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.5367e-08, 8.8476e-09, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 7.9162e-09, 2.3283e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, -9.3132e-09, 2.7940e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 1.6298e-08, 2.3283e-09], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., -1.1548e-07, -1.4408e-06, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-09, 2.8871e-08, 1.7229e-08]], device='cuda:0') Epoch 243, bias, value: tensor([-0.0035, -0.0124, 0.0036, 0.0346, 0.0001, 0.0271, 0.0042, 0.0176, -0.0044, -0.0196], device='cuda:0'), grad: tensor([ 1.4994e-07, -4.4294e-06, 3.5875e-06, 2.0750e-06, -2.1607e-07, 8.3586e-07, 2.6487e-06, -2.0005e-06, -2.8983e-06, 2.2026e-07], device='cuda:0') 100 0.0001 changing lr epoch 242, time 214.88, cls_loss 0.0014 cls_loss_mapping 0.0023 cls_loss_causal 0.4919 re_mapping 0.0045 re_causal 0.0136 /// teacc 99.02 lr 0.00010000 Epoch 244, weight, value: tensor([[-0.0676, -0.0217, -0.0969, ..., 0.0308, -0.0942, -0.0096], [ 0.0375, -0.0815, -0.0233, ..., -0.0450, -0.0711, -0.2688], [-0.0146, 0.0574, -0.0247, ..., -0.1102, 0.1259, -0.0427], ..., [ 0.0282, -0.0322, -0.0189, ..., -0.1346, -0.1791, 0.0315], [-0.0174, -0.0786, -0.0289, ..., -0.1008, 0.0531, -0.1562], [-0.2302, -0.0124, -0.0163, ..., -0.0915, -0.1704, -0.0939]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -6.9384e-08, 9.7789e-09, 1.3970e-09], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.2573e-08, 9.0804e-08, 2.1420e-08], [ 1.2107e-08, 0.0000e+00, 0.0000e+00, ..., 2.6077e-07, 2.2706e-06, 2.1420e-08], ..., [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, 2.4214e-08, 6.9849e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.5239e-07, -2.4755e-06, 4.6566e-09], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.3504e-08, 1.3504e-08, 2.3283e-08]], device='cuda:0') Epoch 244, bias, value: tensor([-0.0035, -0.0127, 0.0034, 0.0344, 0.0005, 0.0272, 0.0042, 0.0176, -0.0040, -0.0195], device='cuda:0'), grad: tensor([-1.7835e-07, 1.0664e-07, 4.2208e-06, 1.2619e-07, -2.4214e-07, -1.2619e-07, 1.2247e-07, 2.4680e-08, -4.2282e-06, 1.9092e-07], device='cuda:0') 100 0.0001 changing lr epoch 243, time 214.88, cls_loss 0.0012 cls_loss_mapping 0.0018 cls_loss_causal 0.4654 re_mapping 0.0046 re_causal 0.0140 /// teacc 98.94 lr 0.00010000 Epoch 245, weight, value: tensor([[-0.0714, -0.0217, -0.0969, ..., 0.0304, -0.0961, -0.0097], [ 0.0375, -0.0816, -0.0233, ..., -0.0452, -0.0717, -0.2689], [-0.0147, 0.0589, -0.0247, ..., -0.1102, 0.1261, -0.0428], ..., [ 0.0288, -0.0337, -0.0189, ..., -0.1348, -0.1793, 0.0315], [-0.0176, -0.0790, -0.0289, ..., -0.1008, 0.0534, -0.1562], [-0.2338, -0.0124, -0.0163, ..., -0.0918, -0.1722, -0.0940]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0245e-08, 5.8208e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 8.3819e-09, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.2573e-08, -4.0047e-07, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-09, 2.8871e-07, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3749e-08, 9.2201e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 2.7940e-09, 2.7940e-09]], device='cuda:0') Epoch 245, bias, value: tensor([-0.0050, -0.0128, 0.0035, 0.0344, 0.0005, 0.0273, 0.0042, 0.0177, -0.0034, -0.0201], device='cuda:0'), grad: tensor([ 1.8347e-07, -4.1723e-06, 8.0559e-08, 3.4040e-07, 3.3062e-07, -1.2573e-08, -1.0664e-07, 3.0231e-06, 2.8405e-07, 5.6811e-08], device='cuda:0') 100 0.0001 changing lr epoch 244, time 214.62, cls_loss 0.0014 cls_loss_mapping 0.0026 cls_loss_causal 0.4923 re_mapping 0.0048 re_causal 0.0138 /// teacc 98.97 lr 0.00010000 Epoch 246, weight, value: tensor([[-0.0716, -0.0217, -0.0970, ..., 0.0300, -0.0967, -0.0098], [ 0.0375, -0.0816, -0.0250, ..., -0.0453, -0.0734, -0.2692], [-0.0136, 0.0593, -0.0266, ..., -0.1104, 0.1265, -0.0421], ..., [ 0.0282, -0.0340, -0.0190, ..., -0.1350, -0.1795, 0.0306], [-0.0178, -0.0791, -0.0270, ..., -0.1009, 0.0534, -0.1572], [-0.2341, -0.0124, -0.0167, ..., -0.0920, -0.1729, -0.0942]], device='cuda:0'), grad: tensor([[6.0536e-09, 0.0000e+00, 0.0000e+00, ..., 2.2817e-08, 1.7509e-07, 2.7940e-09], [9.9186e-08, 0.0000e+00, 0.0000e+00, ..., 9.7323e-08, 2.3236e-07, 4.9360e-08], [9.3132e-09, 0.0000e+00, 0.0000e+00, ..., 6.2399e-08, 1.7229e-07, 4.6566e-09], ..., [1.7509e-07, 0.0000e+00, 0.0000e+00, ..., 1.2107e-08, 1.3364e-07, 8.7079e-08], [1.0710e-08, 0.0000e+00, 0.0000e+00, ..., 2.6356e-07, 2.5313e-06, 5.5879e-09], [8.1025e-08, 0.0000e+00, 0.0000e+00, ..., 3.4925e-08, 3.7393e-07, 4.0047e-08]], device='cuda:0') Epoch 246, bias, value: tensor([-0.0051, -0.0131, 0.0038, 0.0343, 0.0012, 0.0273, 0.0043, 0.0177, -0.0036, -0.0199], device='cuda:0'), grad: tensor([ 6.5332e-07, 2.8275e-06, 1.0859e-06, 2.7772e-06, -1.1306e-06, -1.3970e-05, 3.1032e-06, -2.2575e-06, 8.0615e-06, -1.1381e-06], device='cuda:0') 100 0.0001 changing lr epoch 245, time 214.82, cls_loss 0.0012 cls_loss_mapping 0.0024 cls_loss_causal 0.4712 re_mapping 0.0047 re_causal 0.0139 /// teacc 98.97 lr 0.00010000 Epoch 247, weight, value: tensor([[-0.0716, -0.0217, -0.0970, ..., 0.0310, -0.0960, -0.0095], [ 0.0375, -0.0818, -0.0251, ..., -0.0454, -0.0736, -0.2695], [-0.0132, 0.0592, -0.0262, ..., -0.1106, 0.1266, -0.0417], ..., [ 0.0280, -0.0338, -0.0193, ..., -0.1353, -0.1796, 0.0305], [-0.0179, -0.0794, -0.0270, ..., -0.1010, 0.0534, -0.1578], [-0.2349, -0.0124, -0.0169, ..., -0.0932, -0.1754, -0.0943]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.2387e-07, 7.4506e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3039e-08, 2.7791e-06, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.8836e-07, 2.3246e-06, -6.5193e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2107e-08, 1.1083e-07, 2.7940e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.3306e-08, -3.6918e-06, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.0664e-07, 4.6706e-07, 0.0000e+00]], device='cuda:0') Epoch 247, bias, value: tensor([-0.0038, -0.0132, 0.0038, 0.0342, 0.0011, 0.0274, 0.0043, 0.0178, -0.0038, -0.0203], device='cuda:0'), grad: tensor([-3.4645e-07, 1.2681e-05, 6.5304e-06, 2.0787e-06, 1.0626e-06, 7.5763e-07, -5.6811e-06, -4.3362e-05, 2.3350e-05, 2.8424e-06], device='cuda:0') 100 0.0001 changing lr epoch 246, time 214.69, cls_loss 0.0017 cls_loss_mapping 0.0021 cls_loss_causal 0.5021 re_mapping 0.0044 re_causal 0.0134 /// teacc 98.97 lr 0.00010000 Epoch 248, weight, value: tensor([[-0.0718, -0.0217, -0.0970, ..., 0.0313, -0.0963, -0.0096], [ 0.0376, -0.0818, -0.0252, ..., -0.0458, -0.0737, -0.2697], [-0.0134, 0.0592, -0.0262, ..., -0.1108, 0.1267, -0.0407], ..., [ 0.0279, -0.0338, -0.0194, ..., -0.1355, -0.1799, 0.0297], [-0.0177, -0.0794, -0.0270, ..., -0.1010, 0.0534, -0.1578], [-0.2355, -0.0124, -0.0169, ..., -0.0930, -0.1764, -0.0949]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.6834e-08, 1.0943e-07, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7008e-08, 7.6368e-08, 1.5832e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9558e-08, -1.5879e-07, -8.6147e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 4.1444e-08, 2.3283e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.5681e-07, 5.8301e-07, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.9849e-09, 1.0571e-07, 9.2853e-07]], device='cuda:0') Epoch 248, bias, value: tensor([-0.0036, -0.0152, 0.0029, 0.0341, 0.0017, 0.0275, 0.0042, 0.0205, -0.0041, -0.0211], device='cuda:0'), grad: tensor([ 3.3062e-07, -7.4226e-07, -4.6566e-08, 1.5721e-06, -2.0713e-06, -1.1455e-06, -2.7288e-06, 1.4305e-06, 1.5832e-06, 1.8179e-06], device='cuda:0') 100 0.0001 changing lr epoch 247, time 214.73, cls_loss 0.0011 cls_loss_mapping 0.0019 cls_loss_causal 0.4905 re_mapping 0.0041 re_causal 0.0133 /// teacc 98.99 lr 0.00010000 Epoch 249, weight, value: tensor([[-0.0718, -0.0217, -0.0970, ..., 0.0317, -0.0963, -0.0078], [ 0.0376, -0.0819, -0.0252, ..., -0.0460, -0.0743, -0.2709], [-0.0103, 0.0609, -0.0262, ..., -0.1106, 0.1274, -0.0401], ..., [ 0.0248, -0.0356, -0.0194, ..., -0.1372, -0.1811, 0.0297], [-0.0177, -0.0796, -0.0270, ..., -0.1011, 0.0533, -0.1581], [-0.2356, -0.0123, -0.0169, ..., -0.0931, -0.1771, -0.0957]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., -1.9558e-08, 1.3970e-09, 1.8626e-09], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 2.7940e-09, 4.6566e-09], [-1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, -7.9162e-09, 9.3132e-10], ..., [-4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 5.5879e-09, 1.1176e-08], [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 8.8476e-09, 1.5367e-08, 5.5879e-09], [ 0.0000e+00, -4.6566e-09, 0.0000e+00, ..., 1.3970e-09, 1.3970e-09, -6.0070e-08]], device='cuda:0') Epoch 249, bias, value: tensor([-0.0032, -0.0152, 0.0038, 0.0341, 0.0020, 0.0275, 0.0042, 0.0200, -0.0042, -0.0213], device='cuda:0'), grad: tensor([-7.4506e-09, -1.2226e-05, 4.0121e-06, 4.3167e-07, 5.2853e-07, 6.8452e-08, 1.8626e-08, 7.6964e-06, 1.9418e-07, -7.3062e-07], device='cuda:0') 100 0.0001 changing lr epoch 248, time 214.44, cls_loss 0.0011 cls_loss_mapping 0.0016 cls_loss_causal 0.4999 re_mapping 0.0045 re_causal 0.0138 /// teacc 98.94 lr 0.00010000 Epoch 250, weight, value: tensor([[-0.0722, -0.0217, -0.0970, ..., 0.0317, -0.0964, -0.0079], [ 0.0376, -0.0819, -0.0252, ..., -0.0460, -0.0745, -0.2719], [-0.0103, 0.0609, -0.0262, ..., -0.1106, 0.1276, -0.0396], ..., [ 0.0249, -0.0356, -0.0194, ..., -0.1382, -0.1812, 0.0296], [-0.0177, -0.0797, -0.0271, ..., -0.1012, 0.0533, -0.1584], [-0.2364, -0.0122, -0.0170, ..., -0.0928, -0.1779, -0.0957]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 8.3819e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 5.6205e-07, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 4.8429e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.1886e-08, 6.0536e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.9849e-09, -7.1200e-07, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 1.1176e-08, -6.5193e-09]], device='cuda:0') Epoch 250, bias, value: tensor([-0.0034, -0.0150, 0.0038, 0.0341, 0.0017, 0.0276, 0.0041, 0.0199, -0.0043, -0.0210], device='cuda:0'), grad: tensor([ 5.8208e-08, 7.3649e-06, 8.2282e-07, 3.0641e-07, 3.7812e-07, 8.3819e-09, 1.9418e-07, -5.6103e-06, -2.0172e-06, -1.4827e-06], device='cuda:0') 100 0.0001 changing lr epoch 249, time 214.49, cls_loss 0.0011 cls_loss_mapping 0.0018 cls_loss_causal 0.5072 re_mapping 0.0043 re_causal 0.0136 /// teacc 98.99 lr 0.00010000 Epoch 251, weight, value: tensor([[-0.0724, -0.0217, -0.0970, ..., 0.0313, -0.0969, -0.0080], [ 0.0374, -0.0819, -0.0252, ..., -0.0465, -0.0750, -0.2720], [-0.0103, 0.0609, -0.0263, ..., -0.1105, 0.1276, -0.0396], ..., [ 0.0249, -0.0356, -0.0194, ..., -0.1402, -0.1813, 0.0296], [-0.0174, -0.0797, -0.0271, ..., -0.1013, 0.0533, -0.1585], [-0.2366, -0.0122, -0.0170, ..., -0.0927, -0.1788, -0.0959]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 7.9162e-09, 1.5367e-08, 0.0000e+00], [ 0.0000e+00, 7.9162e-09, 0.0000e+00, ..., 4.6566e-10, 4.6566e-09, 2.3283e-09], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.3970e-09, -7.7300e-08, -9.3132e-10], ..., [ 0.0000e+00, 1.0896e-07, 0.0000e+00, ..., 0.0000e+00, 2.0023e-08, 1.3970e-09], [ 0.0000e+00, 1.2573e-08, 0.0000e+00, ..., 1.8161e-08, 2.3749e-08, 1.8626e-09], [ 0.0000e+00, -1.7090e-07, 0.0000e+00, ..., 9.3132e-10, 1.3970e-09, 1.1176e-08]], device='cuda:0') Epoch 251, bias, value: tensor([-0.0036, -0.0152, 0.0036, 0.0340, 0.0019, 0.0277, 0.0043, 0.0199, -0.0042, -0.0209], device='cuda:0'), grad: tensor([ 1.1222e-07, 1.4491e-06, 1.4529e-05, -1.2247e-07, 5.3504e-07, 1.6345e-07, -3.3062e-08, -1.6257e-05, 2.0284e-06, -2.4065e-06], device='cuda:0') 100 0.0001 changing lr epoch 250, time 214.41, cls_loss 0.0019 cls_loss_mapping 0.0031 cls_loss_causal 0.5016 re_mapping 0.0046 re_causal 0.0134 /// teacc 98.99 lr 0.00010000 Epoch 252, weight, value: tensor([[-0.0726, -0.0218, -0.0972, ..., 0.0314, -0.0976, -0.0077], [ 0.0371, -0.0826, -0.0259, ..., -0.0474, -0.0783, -0.2751], [-0.0104, 0.0612, -0.0293, ..., -0.1091, 0.1279, -0.0398], ..., [ 0.0252, -0.0355, -0.0166, ..., -0.1418, -0.1815, 0.0313], [-0.0175, -0.0798, -0.0275, ..., -0.1015, 0.0563, -0.1556], [-0.2370, -0.0123, -0.0174, ..., -0.0927, -0.1798, -0.0970]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.2061e-07, -1.1874e-07, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 2.9337e-08, 1.9558e-08, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.1665e-08, -9.1502e-07, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.4506e-09, 9.1689e-07, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.3958e-07, 3.0082e-07, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.9139e-07, 9.4064e-08, 3.2596e-09]], device='cuda:0') Epoch 252, bias, value: tensor([-0.0039, -0.0182, 0.0042, 0.0339, 0.0023, 0.0278, 0.0036, 0.0198, -0.0011, -0.0210], device='cuda:0'), grad: tensor([-1.6941e-06, -3.2317e-07, -1.5125e-06, 1.9092e-08, 9.3132e-09, -8.4471e-07, 4.6706e-07, 1.7844e-06, 1.4445e-06, 6.5332e-07], device='cuda:0') 100 0.0001 changing lr epoch 251, time 214.18, cls_loss 0.0017 cls_loss_mapping 0.0028 cls_loss_causal 0.5023 re_mapping 0.0047 re_causal 0.0140 /// teacc 98.99 lr 0.00010000 Epoch 253, weight, value: tensor([[-0.0727, -0.0218, -0.0973, ..., 0.0298, -0.1003, -0.0073], [ 0.0372, -0.0828, -0.0261, ..., -0.0496, -0.0784, -0.2754], [-0.0103, 0.0620, -0.0294, ..., -0.1095, 0.1281, -0.0396], ..., [ 0.0251, -0.0362, -0.0165, ..., -0.1422, -0.1820, 0.0308], [-0.0180, -0.0802, -0.0275, ..., -0.1017, 0.0563, -0.1556], [-0.2372, -0.0122, -0.0177, ..., -0.0929, -0.1810, -0.0964]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 0.0000e+00, 1.3970e-09, ..., -5.5879e-09, 6.1002e-08, 0.0000e+00], [-1.8999e-07, 0.0000e+00, 3.7253e-09, ..., 1.3970e-09, 1.1642e-08, 0.0000e+00], [ 2.0489e-08, 0.0000e+00, 1.2806e-07, ..., 8.8476e-09, 7.0781e-08, -4.1910e-09], ..., [ 1.8626e-08, 0.0000e+00, 2.4680e-08, ..., 4.6566e-10, 1.3970e-09, 0.0000e+00], [ 6.7055e-08, 0.0000e+00, 2.9756e-07, ..., 2.6543e-08, -3.1479e-07, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 4.6566e-10, ..., 2.3283e-09, 2.5146e-08, -1.3970e-09]], device='cuda:0') Epoch 253, bias, value: tensor([-0.0066, -0.0183, 0.0046, 0.0339, 0.0025, 0.0281, 0.0041, 0.0192, -0.0011, -0.0195], device='cuda:0'), grad: tensor([ 2.1141e-07, -3.8091e-06, 1.3076e-06, -1.5507e-06, 1.4938e-06, 3.6834e-07, 5.3272e-07, -9.6858e-07, 2.2072e-06, 1.9232e-07], device='cuda:0') 100 0.0001 changing lr epoch 252, time 214.23, cls_loss 0.0013 cls_loss_mapping 0.0015 cls_loss_causal 0.4952 re_mapping 0.0043 re_causal 0.0132 /// teacc 99.04 lr 0.00010000 Epoch 254, weight, value: tensor([[-0.0728, -0.0218, -0.0974, ..., 0.0304, -0.1000, -0.0064], [ 0.0377, -0.0828, -0.0261, ..., -0.0507, -0.0784, -0.2754], [-0.0103, 0.0620, -0.0294, ..., -0.1122, 0.1272, -0.0394], ..., [ 0.0251, -0.0362, -0.0165, ..., -0.1425, -0.1821, 0.0308], [-0.0181, -0.0802, -0.0279, ..., -0.1019, 0.0563, -0.1557], [-0.2377, -0.0122, -0.0178, ..., -0.0935, -0.1814, -0.0966]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.6345e-07, 4.4703e-08, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 8.3819e-09, 1.7416e-07, 4.6566e-10], [-1.0710e-08, 0.0000e+00, 0.0000e+00, ..., 8.3819e-09, -1.6093e-06, 0.0000e+00], ..., [ 9.3132e-09, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 8.0327e-07, 4.6566e-10], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 4.2375e-08, 1.3271e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7695e-08, 3.7253e-08, -8.3819e-09]], device='cuda:0') Epoch 254, bias, value: tensor([-0.0057, -0.0183, 0.0037, 0.0339, 0.0018, 0.0280, 0.0061, 0.0191, -0.0011, -0.0196], device='cuda:0'), grad: tensor([ 1.5367e-08, -6.5342e-06, -2.9914e-06, 6.5472e-07, 1.1623e-06, 5.9651e-07, 5.8673e-07, -1.0468e-06, 3.1460e-06, 4.4145e-06], device='cuda:0') 100 0.0001 changing lr epoch 253, time 214.32, cls_loss 0.0015 cls_loss_mapping 0.0025 cls_loss_causal 0.4795 re_mapping 0.0043 re_causal 0.0129 /// teacc 99.03 lr 0.00010000 Epoch 255, weight, value: tensor([[-0.0728, -0.0221, -0.0975, ..., 0.0308, -0.1002, -0.0065], [ 0.0378, -0.0828, -0.0262, ..., -0.0508, -0.0784, -0.2757], [-0.0103, 0.0639, -0.0295, ..., -0.1123, 0.1274, -0.0384], ..., [ 0.0251, -0.0375, -0.0166, ..., -0.1432, -0.1824, 0.0305], [-0.0182, -0.0812, -0.0281, ..., -0.1020, 0.0563, -0.1558], [-0.2379, -0.0129, -0.0178, ..., -0.0939, -0.1824, -0.0966]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 8.3819e-09, 4.4703e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 3.9116e-08, ..., 3.7253e-09, 1.3830e-07, 7.9162e-09], [-3.7253e-09, -7.9162e-09, 8.9873e-08, ..., 6.0536e-09, -5.8021e-07, 7.9162e-09], ..., [ 2.7940e-09, 5.5879e-09, 2.6077e-08, ..., 0.0000e+00, 1.1735e-07, -2.3283e-09], [ 0.0000e+00, 4.6566e-10, -3.1060e-07, ..., 3.4459e-08, -3.0594e-07, 6.5193e-09], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 1.3970e-09, 1.8161e-08, -4.6566e-09]], device='cuda:0') Epoch 255, bias, value: tensor([-0.0055, -0.0180, 0.0035, 0.0349, 0.0011, 0.0271, 0.0061, 0.0188, -0.0011, -0.0197], device='cuda:0'), grad: tensor([ 1.3039e-07, -3.5390e-08, -8.6147e-07, 2.2128e-06, 5.0291e-08, -1.0505e-06, -1.3039e-08, 2.3982e-07, -6.5658e-07, -2.7008e-08], device='cuda:0') 100 0.0001 changing lr epoch 254, time 214.47, cls_loss 0.0011 cls_loss_mapping 0.0022 cls_loss_causal 0.5109 re_mapping 0.0044 re_causal 0.0137 /// teacc 99.07 lr 0.00010000 Epoch 256, weight, value: tensor([[-0.0728, -0.0221, -0.0975, ..., 0.0309, -0.1004, -0.0067], [ 0.0382, -0.0829, -0.0263, ..., -0.0504, -0.0784, -0.2758], [-0.0103, 0.0641, -0.0296, ..., -0.1122, 0.1277, -0.0384], ..., [ 0.0250, -0.0376, -0.0166, ..., -0.1436, -0.1828, 0.0303], [-0.0183, -0.0814, -0.0280, ..., -0.1023, 0.0563, -0.1559], [-0.2380, -0.0129, -0.0179, ..., -0.0939, -0.1833, -0.0972]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.3819e-08, 8.8476e-09, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.1223e-09, 1.5367e-08, 2.7940e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.5193e-09, -4.9360e-08, 4.6566e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 5.3085e-08, 3.2596e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1828e-07, 1.9465e-07, 5.5879e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.0617e-07, 2.3283e-09, 1.8626e-08]], device='cuda:0') Epoch 256, bias, value: tensor([-0.0053, -0.0177, 0.0036, 0.0350, 0.0008, 0.0271, 0.0061, 0.0184, -0.0012, -0.0204], device='cuda:0'), grad: tensor([ 9.3412e-07, -8.2999e-06, 6.1840e-07, -1.6600e-05, 1.8114e-07, 2.4345e-06, -5.8860e-07, -3.7579e-07, 1.0796e-05, 1.0923e-05], device='cuda:0') 100 0.0001 changing lr epoch 255, time 214.41, cls_loss 0.0012 cls_loss_mapping 0.0021 cls_loss_causal 0.4939 re_mapping 0.0044 re_causal 0.0135 /// teacc 99.00 lr 0.00010000 Epoch 257, weight, value: tensor([[-0.0728, -0.0222, -0.0975, ..., 0.0313, -0.1005, -0.0052], [ 0.0384, -0.0832, -0.0263, ..., -0.0505, -0.0784, -0.2759], [-0.0103, 0.0647, -0.0296, ..., -0.1122, 0.1279, -0.0383], ..., [ 0.0250, -0.0381, -0.0166, ..., -0.1438, -0.1830, 0.0316], [-0.0183, -0.0816, -0.0280, ..., -0.1024, 0.0563, -0.1561], [-0.2380, -0.0129, -0.0179, ..., -0.0943, -0.1844, -0.0965]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.1223e-09, 1.3504e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 4.7032e-08, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, -1.2033e-06, 4.6566e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0403e-06, 3.8650e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, -1.4063e-07, 3.0734e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 1.0710e-08, -8.7079e-08]], device='cuda:0') Epoch 257, bias, value: tensor([-0.0048, -0.0177, 0.0037, 0.0350, -0.0008, 0.0272, 0.0064, 0.0183, -0.0013, -0.0201], device='cuda:0'), grad: tensor([ 5.8208e-08, -3.6554e-07, -3.0845e-06, 1.5693e-07, 7.7533e-07, 2.8871e-08, -2.3143e-07, 2.9560e-06, 2.7940e-09, -2.7288e-07], device='cuda:0') 100 0.0001 changing lr epoch 256, time 214.26, cls_loss 0.0010 cls_loss_mapping 0.0021 cls_loss_causal 0.4914 re_mapping 0.0045 re_causal 0.0138 /// teacc 99.00 lr 0.00010000 Epoch 258, weight, value: tensor([[-0.0728, -0.0228, -0.0976, ..., 0.0318, -0.1008, -0.0053], [ 0.0383, -0.0848, -0.0264, ..., -0.0510, -0.0784, -0.2759], [-0.0103, 0.0655, -0.0296, ..., -0.1122, 0.1280, -0.0380], ..., [ 0.0251, -0.0379, -0.0166, ..., -0.1439, -0.1830, 0.0328], [-0.0182, -0.0829, -0.0282, ..., -0.1025, 0.0562, -0.1561], [-0.2381, -0.0130, -0.0179, ..., -0.0948, -0.1829, -0.0967]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -8.8476e-09, 1.0757e-07, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.3656e-07, -2.8741e-06, 1.7695e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, -1.4240e-06, 3.7253e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 9.0804e-07, 2.3749e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3516e-07, 3.0864e-06, 2.3283e-09], [ 0.0000e+00, -4.6566e-10, 0.0000e+00, ..., 5.1223e-09, 3.9581e-08, 3.7719e-08]], device='cuda:0') Epoch 258, bias, value: tensor([-0.0046, -0.0176, 0.0036, 0.0349, -0.0020, 0.0272, 0.0068, 0.0186, -0.0014, -0.0200], device='cuda:0'), grad: tensor([ 6.1467e-07, -2.7135e-05, -3.6582e-06, 5.9279e-07, -1.0431e-07, -3.3528e-08, 1.3830e-07, 2.8424e-06, 2.6792e-05, -7.1712e-08], device='cuda:0') 100 0.0001 changing lr epoch 257, time 214.39, cls_loss 0.0012 cls_loss_mapping 0.0021 cls_loss_causal 0.4607 re_mapping 0.0043 re_causal 0.0128 /// teacc 99.08 lr 0.00010000 Epoch 259, weight, value: tensor([[-0.0728, -0.0229, -0.0976, ..., 0.0320, -0.1010, -0.0063], [ 0.0383, -0.0850, -0.0264, ..., -0.0514, -0.0784, -0.2761], [-0.0102, 0.0655, -0.0296, ..., -0.1122, 0.1281, -0.0381], ..., [ 0.0251, -0.0381, -0.0166, ..., -0.1442, -0.1831, 0.0315], [-0.0183, -0.0823, -0.0282, ..., -0.1026, 0.0563, -0.1562], [-0.2381, -0.0126, -0.0179, ..., -0.0947, -0.1834, -0.0937]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.5879e-09, 5.0291e-08, 3.2596e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 3.3062e-08, 3.2596e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, -6.5705e-07, 1.3039e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.2480e-07, -4.4703e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5832e-08, 3.3714e-07, 5.1223e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 5.1223e-09, 1.6717e-07]], device='cuda:0') Epoch 259, bias, value: tensor([-0.0041, -0.0176, 0.0035, 0.0349, -0.0043, 0.0272, 0.0065, 0.0186, -0.0014, -0.0175], device='cuda:0'), grad: tensor([ 1.2806e-07, 1.9353e-06, -4.7125e-07, 3.2410e-07, -1.6391e-07, 1.7136e-07, 5.0291e-08, -3.8967e-06, 8.3493e-07, 1.0785e-06], device='cuda:0') 100 0.0001 changing lr epoch 258, time 214.54, cls_loss 0.0010 cls_loss_mapping 0.0020 cls_loss_causal 0.4899 re_mapping 0.0041 re_causal 0.0131 /// teacc 99.05 lr 0.00010000 Epoch 260, weight, value: tensor([[-0.0728, -0.0229, -0.0976, ..., 0.0319, -0.1021, -0.0077], [ 0.0387, -0.0850, -0.0264, ..., -0.0515, -0.0785, -0.2763], [-0.0102, 0.0655, -0.0296, ..., -0.1123, 0.1284, -0.0383], ..., [ 0.0250, -0.0381, -0.0167, ..., -0.1444, -0.1833, 0.0318], [-0.0184, -0.0822, -0.0283, ..., -0.1027, 0.0562, -0.1563], [-0.2383, -0.0125, -0.0179, ..., -0.0947, -0.1843, -0.0934]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.3283e-09, 5.4017e-08, 4.1910e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.2596e-09, 1.4901e-08, 2.6543e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.2596e-09, 5.5879e-09, 6.9849e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-09, 7.4506e-09, 3.5390e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.5157e-07, 5.5274e-07, 1.6764e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2573e-07, 2.1374e-07, 1.3383e-06]], device='cuda:0') Epoch 260, bias, value: tensor([-0.0047, -0.0177, 0.0033, 0.0349, -0.0046, 0.0273, 0.0065, 0.0190, -0.0015, -0.0172], device='cuda:0'), grad: tensor([ 2.9709e-07, -1.3039e-05, 6.8499e-07, 7.9256e-07, -4.2617e-05, -5.2303e-06, 2.9616e-06, 8.4192e-06, 2.1048e-06, 4.5717e-05], device='cuda:0') 100 0.0001 changing lr epoch 259, time 214.47, cls_loss 0.0012 cls_loss_mapping 0.0018 cls_loss_causal 0.4920 re_mapping 0.0042 re_causal 0.0127 /// teacc 99.08 lr 0.00010000 Epoch 261, weight, value: tensor([[-0.0728, -0.0229, -0.0976, ..., 0.0321, -0.1023, -0.0076], [ 0.0387, -0.0851, -0.0264, ..., -0.0516, -0.0785, -0.2763], [-0.0102, 0.0655, -0.0296, ..., -0.1123, 0.1284, -0.0382], ..., [ 0.0250, -0.0381, -0.0167, ..., -0.1460, -0.1837, 0.0318], [-0.0184, -0.0822, -0.0282, ..., -0.1024, 0.0565, -0.1564], [-0.2384, -0.0125, -0.0179, ..., -0.0949, -0.1868, -0.0935]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9558e-08, 3.7532e-07, 1.7229e-08], [-9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 3.6787e-08, 7.6834e-07, 1.0757e-07], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., -9.7789e-08, -2.1774e-06, 4.6566e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1642e-08, 1.9092e-07, 2.3749e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.5635e-08, -1.2526e-07, 5.2620e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.7765e-08, 9.7789e-08, 9.7044e-07]], device='cuda:0') Epoch 261, bias, value: tensor([-0.0046, -0.0175, 0.0031, 0.0348, -0.0045, 0.0272, 0.0066, 0.0188, -0.0013, -0.0176], device='cuda:0'), grad: tensor([ 6.6124e-07, 2.0117e-06, -3.4403e-06, 9.2015e-07, -9.0525e-06, 3.1292e-07, 6.8080e-07, 1.4994e-06, -1.0198e-07, 6.4820e-06], device='cuda:0') 100 0.0001 changing lr epoch 260, time 214.41, cls_loss 0.0010 cls_loss_mapping 0.0016 cls_loss_causal 0.4759 re_mapping 0.0043 re_causal 0.0132 /// teacc 99.07 lr 0.00010000 Epoch 262, weight, value: tensor([[-0.0729, -0.0229, -0.0976, ..., 0.0323, -0.1027, -0.0077], [ 0.0385, -0.0851, -0.0264, ..., -0.0525, -0.0785, -0.2764], [-0.0102, 0.0656, -0.0296, ..., -0.1123, 0.1286, -0.0381], ..., [ 0.0250, -0.0381, -0.0167, ..., -0.1464, -0.1839, 0.0319], [-0.0186, -0.0823, -0.0283, ..., -0.1027, 0.0564, -0.1564], [-0.2384, -0.0126, -0.0179, ..., -0.0953, -0.1883, -0.0935]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.0023e-08, 2.4214e-08, 6.9849e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 2.7940e-09, 4.6566e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-09, -3.7253e-09, 9.3132e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 2.7940e-09, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.3528e-08, 1.4435e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 2.7940e-09, 8.8941e-08]], device='cuda:0') Epoch 262, bias, value: tensor([-0.0047, -0.0175, 0.0031, 0.0351, -0.0046, 0.0270, 0.0067, 0.0189, -0.0013, -0.0178], device='cuda:0'), grad: tensor([ 1.2806e-07, 1.4994e-07, 2.6729e-07, -7.1619e-07, -9.8068e-07, 4.8429e-08, 2.0023e-08, 1.1176e-08, 1.7602e-07, 8.9314e-07], device='cuda:0') 100 0.0001 changing lr epoch 261, time 214.30, cls_loss 0.0011 cls_loss_mapping 0.0025 cls_loss_causal 0.5034 re_mapping 0.0040 re_causal 0.0130 /// teacc 99.00 lr 0.00010000 Epoch 263, weight, value: tensor([[-0.0729, -0.0230, -0.0977, ..., 0.0321, -0.1034, -0.0079], [ 0.0387, -0.0852, -0.0264, ..., -0.0535, -0.0785, -0.2770], [-0.0102, 0.0656, -0.0296, ..., -0.1124, 0.1287, -0.0380], ..., [ 0.0250, -0.0381, -0.0167, ..., -0.1466, -0.1839, 0.0320], [-0.0187, -0.0823, -0.0283, ..., -0.1031, 0.0564, -0.1565], [-0.2385, -0.0124, -0.0180, ..., -0.0954, -0.1893, -0.0962]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.0245e-08, 3.4925e-09, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 3.7253e-09, 4.6566e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, -1.5367e-08, 6.9849e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 4.4238e-09, 3.9581e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3271e-08, -2.6310e-08, 1.6298e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6298e-09, 3.0268e-09, 1.5064e-07]], device='cuda:0') Epoch 263, bias, value: tensor([-0.0052, -0.0176, 0.0030, 0.0352, -0.0020, 0.0270, 0.0070, 0.0190, -0.0014, -0.0203], device='cuda:0'), grad: tensor([ 3.2131e-08, -2.4572e-05, 1.8394e-07, 1.3225e-07, 1.5181e-07, 2.3097e-07, 6.2631e-08, 2.1622e-05, 7.2364e-07, 1.4603e-06], device='cuda:0') 100 0.0001 changing lr epoch 262, time 214.56, cls_loss 0.0015 cls_loss_mapping 0.0035 cls_loss_causal 0.5160 re_mapping 0.0041 re_causal 0.0124 /// teacc 99.08 lr 0.00010000 Epoch 264, weight, value: tensor([[-0.0729, -0.0230, -0.0977, ..., 0.0291, -0.1057, -0.0081], [ 0.0389, -0.0856, -0.0264, ..., -0.0535, -0.0785, -0.2772], [-0.0102, 0.0659, -0.0296, ..., -0.1125, 0.1289, -0.0383], ..., [ 0.0250, -0.0379, -0.0167, ..., -0.1472, -0.1843, 0.0317], [-0.0189, -0.0839, -0.0283, ..., -0.1032, 0.0564, -0.1567], [-0.2386, -0.0113, -0.0180, ..., -0.0931, -0.1897, -0.0952]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 1.2340e-08, 3.3993e-08, 9.7789e-09], [-2.3283e-10, 3.0268e-09, 0.0000e+00, ..., 6.5193e-09, 1.7695e-08, 1.2200e-07], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 7.4506e-09, 1.8789e-07, 3.3528e-08], ..., [ 0.0000e+00, -1.1874e-08, 0.0000e+00, ..., 2.3283e-10, 3.6089e-08, -7.9721e-07], [ 0.0000e+00, 3.4925e-09, 0.0000e+00, ..., 1.0454e-07, -1.8231e-07, 2.4913e-08], [ 0.0000e+00, 1.1642e-09, 0.0000e+00, ..., 1.1642e-09, 5.6112e-08, 4.3656e-07]], device='cuda:0') Epoch 264, bias, value: tensor([-0.0072, -0.0173, 0.0030, 0.0354, -0.0034, 0.0267, 0.0071, 0.0189, -0.0015, -0.0188], device='cuda:0'), grad: tensor([ 1.6857e-07, 1.5041e-06, 9.6671e-07, 9.7603e-07, 1.6252e-06, 5.2620e-08, -2.1793e-07, -8.6203e-06, -9.7323e-08, 3.6359e-06], device='cuda:0') 100 0.0001 changing lr epoch 263, time 214.77, cls_loss 0.0013 cls_loss_mapping 0.0019 cls_loss_causal 0.4683 re_mapping 0.0042 re_causal 0.0124 /// teacc 99.11 lr 0.00010000 Epoch 265, weight, value: tensor([[-0.0730, -0.0246, -0.0977, ..., 0.0279, -0.1063, -0.0083], [ 0.0402, -0.0886, -0.0265, ..., -0.0538, -0.0785, -0.2773], [-0.0102, 0.0667, -0.0296, ..., -0.1124, 0.1293, -0.0391], ..., [ 0.0249, -0.0364, -0.0167, ..., -0.1472, -0.1845, 0.0319], [-0.0193, -0.0861, -0.0284, ..., -0.1033, 0.0563, -0.1567], [-0.2396, -0.0115, -0.0180, ..., -0.0921, -0.1908, -0.0952]], device='cuda:0'), grad: tensor([[ 1.6298e-08, 0.0000e+00, 0.0000e+00, ..., 1.9092e-08, 4.9360e-08, 1.6298e-09], [-7.6788e-07, 1.1642e-09, 0.0000e+00, ..., 6.2864e-09, 6.8685e-08, 3.4692e-08], [ 4.1211e-08, -2.3283e-09, 0.0000e+00, ..., -1.6089e-07, -2.2002e-07, 2.0023e-08], ..., [ 2.6310e-08, 9.3132e-10, 0.0000e+00, ..., 8.1724e-08, 1.4692e-07, -8.4052e-08], [ 2.0489e-08, 0.0000e+00, 0.0000e+00, ..., 4.6100e-08, -1.5497e-06, 4.4238e-09], [ 1.2573e-08, 0.0000e+00, 0.0000e+00, ..., 1.2573e-08, 3.4226e-08, 1.5832e-08]], device='cuda:0') Epoch 265, bias, value: tensor([-0.0088, -0.0170, 0.0033, 0.0354, -0.0035, 0.0270, 0.0072, 0.0182, -0.0016, -0.0186], device='cuda:0'), grad: tensor([ 2.5122e-07, -4.1835e-06, -1.6787e-07, 2.0470e-06, 1.6596e-06, 3.6787e-06, 1.6987e-06, -2.6054e-07, -5.1185e-06, 3.7951e-07], device='cuda:0') 100 0.0001 changing lr epoch 264, time 214.76, cls_loss 0.0016 cls_loss_mapping 0.0022 cls_loss_causal 0.5048 re_mapping 0.0044 re_causal 0.0132 /// teacc 99.06 lr 0.00010000 Epoch 266, weight, value: tensor([[-0.0731, -0.0248, -0.0977, ..., 0.0290, -0.1082, -0.0084], [ 0.0427, -0.0893, -0.0265, ..., -0.0539, -0.0786, -0.2774], [-0.0103, 0.0670, -0.0296, ..., -0.1117, 0.1296, -0.0389], ..., [ 0.0248, -0.0361, -0.0167, ..., -0.1508, -0.1848, 0.0316], [-0.0204, -0.0870, -0.0284, ..., -0.1039, 0.0575, -0.1568], [-0.2416, -0.0117, -0.0180, ..., -0.0927, -0.1916, -0.0952]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.8173e-08, 7.4506e-09, 4.6566e-10], [ 0.0000e+00, 7.4506e-09, 0.0000e+00, ..., 1.6065e-08, 1.9325e-08, 7.4506e-09], [ 0.0000e+00, 1.1642e-09, 0.0000e+00, ..., -2.1886e-08, -5.0524e-08, 5.5879e-09], ..., [ 0.0000e+00, -1.0245e-08, 0.0000e+00, ..., 6.7521e-09, 1.5367e-08, 7.2177e-09], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 1.1642e-08, 6.3330e-08, 2.0955e-09], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 9.0804e-09, 3.7253e-09, 8.3819e-09]], device='cuda:0') Epoch 266, bias, value: tensor([-0.0108, -0.0166, 0.0034, 0.0351, -0.0037, 0.0265, 0.0085, 0.0177, -0.0011, -0.0186], device='cuda:0'), grad: tensor([-5.7509e-08, 1.0873e-07, 5.7742e-08, 9.9186e-08, -8.4192e-07, -2.0047e-07, 7.6601e-08, -2.2422e-07, 2.6356e-07, 7.2038e-07], device='cuda:0') 100 0.0001 changing lr epoch 265, time 214.56, cls_loss 0.0013 cls_loss_mapping 0.0022 cls_loss_causal 0.4663 re_mapping 0.0044 re_causal 0.0134 /// teacc 98.94 lr 0.00010000 Epoch 267, weight, value: tensor([[-0.0731, -0.0248, -0.0977, ..., 0.0294, -0.1087, -0.0088], [ 0.0428, -0.0894, -0.0265, ..., -0.0546, -0.0786, -0.2775], [-0.0103, 0.0671, -0.0296, ..., -0.1117, 0.1301, -0.0389], ..., [ 0.0248, -0.0360, -0.0167, ..., -0.1509, -0.1852, 0.0315], [-0.0204, -0.0871, -0.0284, ..., -0.1040, 0.0574, -0.1569], [-0.2417, -0.0117, -0.0180, ..., -0.0927, -0.1919, -0.0953]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.9849e-10, 0.0000e+00, ..., 4.3772e-08, 7.2876e-08, 4.4238e-09], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 6.1933e-08, 6.5891e-08, 9.8906e-07], [ 0.0000e+00, -1.7928e-08, 0.0000e+00, ..., 1.2806e-08, -1.5367e-08, 2.4680e-08], ..., [ 0.0000e+00, 4.6566e-09, 0.0000e+00, ..., -4.8894e-09, 3.4226e-08, 2.5006e-07], [ 0.0000e+00, 8.8476e-09, 0.0000e+00, ..., 6.7241e-06, 5.3979e-06, 1.9092e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.0978e-08, 5.8673e-08, -1.4435e-06]], device='cuda:0') Epoch 267, bias, value: tensor([-0.0115, -0.0138, 0.0036, 0.0347, -0.0038, 0.0270, 0.0083, 0.0147, -0.0012, -0.0185], device='cuda:0'), grad: tensor([ 2.1025e-07, 8.2850e-06, 4.1747e-07, 4.7777e-07, 1.2927e-06, -1.0245e-07, -1.9372e-05, 1.4603e-06, 1.8716e-05, -1.1377e-05], device='cuda:0') 100 0.0001 changing lr epoch 266, time 214.52, cls_loss 0.0011 cls_loss_mapping 0.0020 cls_loss_causal 0.4889 re_mapping 0.0041 re_causal 0.0129 /// teacc 99.00 lr 0.00010000 Epoch 268, weight, value: tensor([[-0.0731, -0.0249, -0.0977, ..., 0.0292, -0.1089, -0.0090], [ 0.0428, -0.0926, -0.0265, ..., -0.0544, -0.0786, -0.2776], [-0.0102, 0.0666, -0.0296, ..., -0.1119, 0.1307, -0.0388], ..., [ 0.0248, -0.0329, -0.0167, ..., -0.1510, -0.1853, 0.0320], [-0.0205, -0.0874, -0.0284, ..., -0.1033, 0.0575, -0.1571], [-0.2418, -0.0147, -0.0180, ..., -0.0927, -0.1934, -0.0953]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -8.0559e-08, -3.2596e-09, 1.8626e-09], [ 0.0000e+00, 3.2596e-09, 0.0000e+00, ..., 3.7253e-09, 1.0431e-07, 3.3993e-08], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 1.5832e-08, 7.1013e-07, 4.6566e-10], ..., [ 0.0000e+00, -6.5193e-09, 0.0000e+00, ..., 2.3283e-09, -1.0300e-06, 9.3132e-10], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.8894e-08, 1.5786e-07, 9.3132e-10], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 6.0536e-09, 4.8894e-08, -4.8894e-08]], device='cuda:0') Epoch 268, bias, value: tensor([-0.0117, -0.0138, 0.0038, 0.0346, -0.0038, 0.0272, 0.0074, 0.0149, -0.0011, -0.0189], device='cuda:0'), grad: tensor([ 1.4100e-06, 8.5652e-05, 1.0973e-04, 1.3091e-05, 4.8056e-07, 8.5589e-07, -1.4668e-07, -2.3675e-04, 5.8115e-06, 1.9968e-05], device='cuda:0') 100 0.0001 changing lr epoch 267, time 214.19, cls_loss 0.0013 cls_loss_mapping 0.0026 cls_loss_causal 0.5017 re_mapping 0.0043 re_causal 0.0128 /// teacc 99.06 lr 0.00010000 Epoch 269, weight, value: tensor([[-0.0731, -0.0251, -0.0977, ..., 0.0291, -0.1093, -0.0093], [ 0.0430, -0.0930, -0.0265, ..., -0.0545, -0.0787, -0.2778], [-0.0102, 0.0686, -0.0297, ..., -0.1120, 0.1309, -0.0420], ..., [ 0.0248, -0.0334, -0.0168, ..., -0.1512, -0.1855, 0.0318], [-0.0203, -0.0901, -0.0285, ..., -0.1035, 0.0574, -0.1574], [-0.2419, -0.0147, -0.0180, ..., -0.0929, -0.1956, -0.0953]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-09, 0.0000e+00, ..., 1.1520e-06, 1.2051e-06, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 1.6764e-08, 3.7253e-08, 3.2596e-09], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 1.2107e-08, -1.1781e-07, 4.6566e-10], ..., [ 0.0000e+00, -2.2817e-08, 0.0000e+00, ..., 1.3970e-09, 6.7055e-08, 9.3132e-10], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 6.0815e-07, 7.9582e-07, 9.3132e-10], [ 0.0000e+00, 2.7940e-09, 0.0000e+00, ..., 7.6834e-08, 8.1956e-08, 1.7276e-07]], device='cuda:0') Epoch 269, bias, value: tensor([-0.0118, -0.0141, 0.0035, 0.0345, -0.0037, 0.0275, 0.0076, 0.0153, -0.0012, -0.0196], device='cuda:0'), grad: tensor([ 2.8089e-06, -9.7789e-09, -7.7300e-08, -1.1548e-07, -5.6485e-07, 1.6857e-06, -6.1169e-06, -3.0734e-08, 1.6056e-06, 8.1398e-07], device='cuda:0') 100 0.0001 changing lr epoch 268, time 214.53, cls_loss 0.0009 cls_loss_mapping 0.0015 cls_loss_causal 0.4621 re_mapping 0.0041 re_causal 0.0129 /// teacc 99.05 lr 0.00010000 Epoch 270, weight, value: tensor([[-0.0731, -0.0252, -0.0977, ..., 0.0291, -0.1097, -0.0096], [ 0.0430, -0.0934, -0.0265, ..., -0.0545, -0.0787, -0.2779], [-0.0102, 0.0694, -0.0297, ..., -0.1120, 0.1310, -0.0424], ..., [ 0.0248, -0.0334, -0.0168, ..., -0.1513, -0.1858, 0.0320], [-0.0202, -0.0910, -0.0285, ..., -0.1033, 0.0575, -0.1577], [-0.2419, -0.0146, -0.0181, ..., -0.0932, -0.1966, -0.0953]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.3993e-08, 1.4296e-07, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.3993e-08, -2.5049e-05, 1.4901e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.7789e-09, 3.5077e-05, 3.7253e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 1.0850e-06, 7.9628e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.9858e-06, -2.0057e-05, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 1.3597e-06, 7.9628e-08]], device='cuda:0') Epoch 270, bias, value: tensor([-0.0119, -0.0141, 0.0034, 0.0344, -0.0038, 0.0276, 0.0075, 0.0153, -0.0011, -0.0196], device='cuda:0'), grad: tensor([ 1.3504e-07, -1.4710e-04, 1.6761e-04, 7.2978e-06, -6.8452e-08, 7.1675e-06, 2.2575e-06, 4.7944e-06, -4.5329e-05, 3.4422e-06], device='cuda:0') 100 0.0001 changing lr epoch 269, time 214.43, cls_loss 0.0010 cls_loss_mapping 0.0015 cls_loss_causal 0.4958 re_mapping 0.0039 re_causal 0.0131 /// teacc 99.10 lr 0.00010000 Epoch 271, weight, value: tensor([[-0.0731, -0.0252, -0.0977, ..., 0.0289, -0.1102, -0.0091], [ 0.0434, -0.0935, -0.0266, ..., -0.0547, -0.0787, -0.2780], [-0.0104, 0.0699, -0.0297, ..., -0.1122, 0.1312, -0.0427], ..., [ 0.0248, -0.0335, -0.0168, ..., -0.1519, -0.1862, 0.0351], [-0.0202, -0.0912, -0.0285, ..., -0.1038, 0.0576, -0.1579], [-0.2420, -0.0146, -0.0181, ..., -0.0931, -0.1979, -0.0954]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 8.8476e-09, 5.8208e-08, 6.5193e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6764e-08, 3.5390e-08, 6.0536e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, -2.9244e-07, -2.9337e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.6764e-08, 2.8871e-08, 6.5193e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.2667e-08, 2.6729e-07, 2.2352e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-09, 4.6566e-09, 2.1886e-08]], device='cuda:0') Epoch 271, bias, value: tensor([-0.0122, -0.0141, 0.0033, 0.0344, -0.0060, 0.0275, 0.0078, 0.0158, -0.0011, -0.0197], device='cuda:0'), grad: tensor([ 1.2480e-07, 5.9092e-07, -4.5029e-07, 1.0896e-07, -2.7753e-07, 1.0896e-07, -2.2305e-07, -5.9325e-07, 5.8115e-07, 3.9116e-08], device='cuda:0') 100 0.0001 changing lr epoch 270, time 214.43, cls_loss 0.0010 cls_loss_mapping 0.0019 cls_loss_causal 0.5114 re_mapping 0.0040 re_causal 0.0132 /// teacc 99.04 lr 0.00010000 Epoch 272, weight, value: tensor([[-0.0731, -0.0253, -0.0978, ..., 0.0289, -0.1103, -0.0096], [ 0.0434, -0.0935, -0.0266, ..., -0.0547, -0.0787, -0.2781], [-0.0104, 0.0700, -0.0298, ..., -0.1122, 0.1313, -0.0442], ..., [ 0.0249, -0.0335, -0.0168, ..., -0.1522, -0.1864, 0.0351], [-0.0202, -0.0914, -0.0285, ..., -0.1039, 0.0576, -0.1579], [-0.2422, -0.0146, -0.0181, ..., -0.0931, -0.1982, -0.0954]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 9.3132e-08, 1.9511e-07, 3.1712e-07], [-1.5367e-08, 0.0000e+00, 0.0000e+00, ..., 1.3039e-08, 3.5856e-08, 3.7719e-08], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 9.4064e-08, 2.5705e-07, 2.1607e-07], ..., [ 6.5193e-09, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 8.8476e-09, 6.0536e-09], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 5.8208e-08, 1.1036e-07, 9.0804e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 6.0536e-09, 4.3772e-08, 3.6787e-08]], device='cuda:0') Epoch 272, bias, value: tensor([-0.0124, -0.0143, 0.0031, 0.0344, -0.0059, 0.0276, 0.0079, 0.0159, -0.0011, -0.0193], device='cuda:0'), grad: tensor([ 1.0906e-06, 1.3672e-06, 9.9987e-06, -1.0766e-05, 3.5930e-06, 1.6242e-06, -7.3835e-06, -1.4063e-06, 1.4342e-06, 4.6100e-07], device='cuda:0') 100 0.0001 changing lr epoch 271, time 214.37, cls_loss 0.0013 cls_loss_mapping 0.0024 cls_loss_causal 0.4933 re_mapping 0.0041 re_causal 0.0122 /// teacc 99.06 lr 0.00010000 Epoch 273, weight, value: tensor([[-0.0732, -0.0254, -0.0978, ..., 0.0287, -0.1106, -0.0104], [ 0.0433, -0.0935, -0.0266, ..., -0.0548, -0.0787, -0.2782], [-0.0106, 0.0703, -0.0298, ..., -0.1122, 0.1319, -0.0425], ..., [ 0.0249, -0.0335, -0.0168, ..., -0.1526, -0.1866, 0.0354], [-0.0203, -0.0920, -0.0285, ..., -0.1043, 0.0575, -0.1580], [-0.2424, -0.0146, -0.0181, ..., -0.0925, -0.1986, -0.0955]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.6566e-09, 1.0245e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 7.4506e-09, 4.6566e-10], [ 0.0000e+00, -1.3970e-09, 0.0000e+00, ..., 9.3132e-10, -8.3819e-08, 4.6566e-10], ..., [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 1.1269e-07, 2.7940e-09], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 5.5879e-09, -8.8476e-08, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 6.0536e-09, 4.6566e-10]], device='cuda:0') Epoch 273, bias, value: tensor([-0.0131, -0.0143, 0.0032, 0.0341, -0.0060, 0.0281, 0.0078, 0.0160, -0.0013, -0.0191], device='cuda:0'), grad: tensor([ 4.6100e-08, -1.8582e-05, 9.2201e-08, 7.4096e-06, 1.8766e-07, 3.0361e-07, 5.5879e-09, 1.7881e-05, 3.5157e-07, -7.6517e-06], device='cuda:0') 100 0.0001 changing lr epoch 272, time 214.55, cls_loss 0.0013 cls_loss_mapping 0.0023 cls_loss_causal 0.4684 re_mapping 0.0043 re_causal 0.0122 /// teacc 99.11 lr 0.00010000 Epoch 274, weight, value: tensor([[-0.0733, -0.0257, -0.0978, ..., 0.0294, -0.1107, -0.0106], [ 0.0461, -0.0936, -0.0266, ..., -0.0553, -0.0788, -0.2787], [-0.0106, 0.0704, -0.0298, ..., -0.1124, 0.1325, -0.0397], ..., [ 0.0248, -0.0335, -0.0168, ..., -0.1529, -0.1868, 0.0361], [-0.0232, -0.0902, -0.0285, ..., -0.1047, 0.0575, -0.1601], [-0.2427, -0.0150, -0.0181, ..., -0.0923, -0.1996, -0.0955]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9558e-08, 4.8894e-08, 6.5193e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.6298e-08, 2.7008e-08, 1.5413e-07], [ 0.0000e+00, -0.0000e+00, 0.0000e+00, ..., 3.2596e-08, -1.2899e-07, -7.4040e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 1.2200e-07, 5.4529e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.1130e-07, 4.5262e-07, 9.7789e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1642e-08, 9.7789e-09, -7.4739e-07]], device='cuda:0') Epoch 274, bias, value: tensor([-0.0129, -0.0143, 0.0035, 0.0339, -0.0061, 0.0282, 0.0075, 0.0161, -0.0015, -0.0192], device='cuda:0'), grad: tensor([ 4.4703e-08, 1.0449e-06, -3.2596e-09, 2.5192e-07, 1.3560e-06, 3.7393e-07, -1.4696e-06, 6.2063e-06, 1.1120e-06, -8.9407e-06], device='cuda:0') 100 0.0001 changing lr epoch 273, time 214.48, cls_loss 0.0012 cls_loss_mapping 0.0019 cls_loss_causal 0.4863 re_mapping 0.0043 re_causal 0.0128 /// teacc 99.00 lr 0.00010000 Epoch 275, weight, value: tensor([[-0.0734, -0.0265, -0.0978, ..., 0.0292, -0.1110, -0.0107], [ 0.0482, -0.0950, -0.0266, ..., -0.0554, -0.0788, -0.2790], [-0.0113, 0.0708, -0.0298, ..., -0.1124, 0.1327, -0.0403], ..., [ 0.0253, -0.0325, -0.0168, ..., -0.1541, -0.1870, 0.0361], [-0.0252, -0.0902, -0.0285, ..., -0.1054, 0.0573, -0.1602], [-0.2430, -0.0152, -0.0181, ..., -0.0920, -0.2003, -0.0953]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.7940e-09, 1.5367e-08, 9.3132e-10], [ 6.5193e-09, 0.0000e+00, 0.0000e+00, ..., 4.8894e-08, 2.6962e-07, 1.5832e-08], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 2.7474e-08, 1.3737e-07, 6.9849e-09], ..., [ 1.3970e-09, -4.6566e-10, 0.0000e+00, ..., 1.0710e-08, 6.4727e-08, -8.6147e-08], [-1.6764e-08, 0.0000e+00, 0.0000e+00, ..., -9.4995e-08, -3.4971e-07, 6.9849e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.2596e-09, 2.7474e-08, 3.0268e-08]], device='cuda:0') Epoch 275, bias, value: tensor([-0.0133, -0.0143, 0.0034, 0.0338, -0.0070, 0.0284, 0.0076, 0.0162, -0.0018, -0.0177], device='cuda:0'), grad: tensor([-1.0245e-08, 9.7882e-07, 5.2340e-07, 7.8464e-07, 3.6322e-08, -1.2787e-06, 4.5775e-07, -6.9151e-07, -1.1977e-06, 4.0000e-07], device='cuda:0') 100 0.0001 changing lr epoch 274, time 214.54, cls_loss 0.0012 cls_loss_mapping 0.0018 cls_loss_causal 0.4849 re_mapping 0.0041 re_causal 0.0126 /// teacc 99.08 lr 0.00010000 Epoch 276, weight, value: tensor([[-0.0734, -0.0268, -0.0978, ..., 0.0288, -0.1114, -0.0107], [ 0.0490, -0.0974, -0.0266, ..., -0.0555, -0.0788, -0.2792], [-0.0114, 0.0705, -0.0298, ..., -0.1126, 0.1318, -0.0403], ..., [ 0.0252, -0.0305, -0.0168, ..., -0.1547, -0.1875, 0.0362], [-0.0258, -0.0895, -0.0285, ..., -0.1056, 0.0577, -0.1604], [-0.2431, -0.0159, -0.0181, ..., -0.0925, -0.2018, -0.0953]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 6.9849e-09, 4.1910e-09], [-1.3970e-09, 1.8626e-09, 0.0000e+00, ..., 4.6566e-10, 1.8626e-08, 2.8405e-08], [ 0.0000e+00, -7.4506e-09, 0.0000e+00, ..., 0.0000e+00, -6.1933e-08, 6.5193e-09], ..., [ 1.3970e-09, 4.1910e-09, 0.0000e+00, ..., 4.6566e-10, 6.5193e-08, 2.2352e-08], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 9.3132e-09, 2.5472e-07, 4.1910e-09], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 1.1781e-07, 1.3821e-06]], device='cuda:0') Epoch 276, bias, value: tensor([-0.0138, -0.0143, 0.0018, 0.0340, -0.0070, 0.0282, 0.0082, 0.0163, -0.0015, -0.0178], device='cuda:0'), grad: tensor([ 3.1665e-08, 1.9325e-07, 5.5879e-09, 4.3679e-07, -4.7386e-06, -1.6037e-06, 1.8440e-07, 7.8231e-08, 6.3889e-07, 4.7721e-06], device='cuda:0') 100 0.0001 changing lr epoch 275, time 214.52, cls_loss 0.0013 cls_loss_mapping 0.0020 cls_loss_causal 0.4610 re_mapping 0.0040 re_causal 0.0118 /// teacc 99.11 lr 0.00010000 Epoch 277, weight, value: tensor([[-0.0735, -0.0271, -0.0978, ..., 0.0276, -0.1130, -0.0109], [ 0.0491, -0.0975, -0.0267, ..., -0.0552, -0.0797, -0.2818], [-0.0114, 0.0706, -0.0298, ..., -0.1126, 0.1341, -0.0377], ..., [ 0.0252, -0.0301, -0.0168, ..., -0.1558, -0.1880, 0.0362], [-0.0258, -0.0892, -0.0285, ..., -0.1058, 0.0577, -0.1605], [-0.2442, -0.0167, -0.0181, ..., -0.0926, -0.2026, -0.0954]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.3504e-08, 2.0117e-07, 4.6566e-10], [-2.1886e-08, 4.5169e-08, 0.0000e+00, ..., -1.9278e-07, 2.8592e-07, 1.5832e-08], [ 4.6566e-09, 4.6566e-10, 0.0000e+00, ..., 6.0536e-09, -5.0180e-06, 3.2596e-09], ..., [ 1.0710e-08, -6.0070e-08, 0.0000e+00, ..., 1.4901e-08, 2.5779e-06, 4.1910e-09], [ 6.0536e-09, 2.7940e-09, 0.0000e+00, ..., 1.0943e-07, 9.4622e-07, 2.3283e-09], [ 2.7940e-09, 7.9162e-09, 0.0000e+00, ..., 4.1910e-09, 1.1129e-07, 5.0291e-08]], device='cuda:0') Epoch 277, bias, value: tensor([-0.0145, -0.0147, 0.0045, 0.0338, -0.0071, 0.0284, 0.0085, 0.0162, -0.0016, -0.0179], device='cuda:0'), grad: tensor([ 8.7731e-07, -3.0994e-05, -1.4082e-05, -7.2494e-06, 1.7527e-06, 5.8301e-06, 8.8429e-07, 2.2486e-05, 6.5751e-06, 1.3947e-05], device='cuda:0') 100 0.0001 changing lr epoch 276, time 214.45, cls_loss 0.0016 cls_loss_mapping 0.0020 cls_loss_causal 0.4938 re_mapping 0.0039 re_causal 0.0118 /// teacc 99.08 lr 0.00010000 Epoch 278, weight, value: tensor([[-0.0736, -0.0272, -0.0978, ..., 0.0304, -0.1132, -0.0077], [ 0.0489, -0.0976, -0.0267, ..., -0.0541, -0.0797, -0.2820], [-0.0112, 0.0706, -0.0298, ..., -0.1125, 0.1349, -0.0381], ..., [ 0.0253, -0.0296, -0.0168, ..., -0.1564, -0.1886, 0.0362], [-0.0261, -0.0917, -0.0285, ..., -0.1064, 0.0575, -0.1606], [-0.2452, -0.0168, -0.0181, ..., -0.0955, -0.2061, -0.0956]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., -1.3458e-07, 1.9092e-08, -6.4708e-06], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, 1.4547e-06, 1.2107e-08], [-4.3772e-08, 0.0000e+00, 0.0000e+00, ..., 1.2107e-08, -1.8002e-06, 9.2713e-07], ..., [ 2.8405e-08, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 2.0536e-07, 8.3819e-09], [ 6.0536e-09, 0.0000e+00, 0.0000e+00, ..., 1.5041e-07, 1.5227e-07, 2.9663e-07], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 6.5193e-09, 5.5879e-09, 4.3819e-07]], device='cuda:0') Epoch 278, bias, value: tensor([-0.0127, -0.0140, 0.0047, 0.0337, -0.0077, 0.0286, 0.0075, 0.0158, -0.0019, -0.0189], device='cuda:0'), grad: tensor([-1.1206e-05, 3.0939e-06, -1.7425e-06, 1.0477e-06, 6.1281e-06, 4.3772e-08, 5.8673e-07, -1.5926e-07, 1.4333e-06, 7.7533e-07], device='cuda:0') 100 0.0001 changing lr epoch 277, time 214.39, cls_loss 0.0011 cls_loss_mapping 0.0017 cls_loss_causal 0.4793 re_mapping 0.0042 re_causal 0.0124 /// teacc 99.08 lr 0.00010000 Epoch 279, weight, value: tensor([[-0.0737, -0.0272, -0.0978, ..., 0.0315, -0.1130, -0.0067], [ 0.0476, -0.0981, -0.0267, ..., -0.0542, -0.0798, -0.2821], [-0.0107, 0.0727, -0.0298, ..., -0.1127, 0.1357, -0.0381], ..., [ 0.0257, -0.0287, -0.0168, ..., -0.1566, -0.1904, 0.0358], [-0.0262, -0.0952, -0.0285, ..., -0.1069, 0.0574, -0.1606], [-0.2484, -0.0169, -0.0181, ..., -0.0956, -0.2080, -0.0958]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2945e-07, 1.2219e-06, 4.6566e-09], [4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.5832e-08, 1.3318e-07, 2.8871e-08], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7951e-07, 2.9318e-06, 3.7253e-09], ..., [4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 2.6077e-08, 4.4052e-07, 5.2154e-08], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5646e-07, 4.0652e-07, 2.3283e-09], [4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.8161e-08, 3.1246e-07, 9.5554e-07]], device='cuda:0') Epoch 279, bias, value: tensor([-0.0121, -0.0140, 0.0055, 0.0337, -0.0075, 0.0289, 0.0068, 0.0157, -0.0021, -0.0193], device='cuda:0'), grad: tensor([ 2.6245e-06, -2.8405e-07, 6.4895e-06, 1.9893e-06, -2.7120e-06, -1.7047e-05, 3.7812e-06, 1.2908e-06, 5.4203e-07, 3.3230e-06], device='cuda:0') 100 0.0001 changing lr epoch 278, time 214.38, cls_loss 0.0012 cls_loss_mapping 0.0020 cls_loss_causal 0.4491 re_mapping 0.0042 re_causal 0.0121 /// teacc 99.11 lr 0.00010000 Epoch 280, weight, value: tensor([[-0.0738, -0.0273, -0.0978, ..., 0.0302, -0.1133, -0.0070], [ 0.0472, -0.0986, -0.0267, ..., -0.0542, -0.0798, -0.2822], [-0.0110, 0.0719, -0.0298, ..., -0.1129, 0.1355, -0.0381], ..., [ 0.0264, -0.0278, -0.0168, ..., -0.1574, -0.1905, 0.0357], [-0.0262, -0.0953, -0.0285, ..., -0.1071, 0.0576, -0.1607], [-0.2493, -0.0169, -0.0181, ..., -0.0936, -0.2088, -0.0956]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -3.5856e-08, 3.7253e-09, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 6.9849e-09, 2.3283e-09], [-1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, -7.4506e-09, 9.3132e-10], ..., [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 5.1223e-09, 4.6566e-10], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.2107e-08, -2.8871e-08, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1642e-08, 2.0955e-08, 1.4808e-07]], device='cuda:0') Epoch 280, bias, value: tensor([-0.0139, -0.0141, 0.0049, 0.0338, -0.0080, 0.0288, 0.0066, 0.0159, -0.0020, -0.0178], device='cuda:0'), grad: tensor([-1.5227e-07, 1.9977e-07, 3.4459e-08, 1.5181e-07, -3.4552e-07, -7.5437e-08, 7.5903e-08, -4.4471e-07, 1.5832e-08, 5.4622e-07], device='cuda:0') 100 0.0001 changing lr epoch 279, time 214.38, cls_loss 0.0011 cls_loss_mapping 0.0017 cls_loss_causal 0.4847 re_mapping 0.0038 re_causal 0.0120 /// teacc 99.04 lr 0.00010000 Epoch 281, weight, value: tensor([[-0.0738, -0.0273, -0.0978, ..., 0.0297, -0.1135, -0.0073], [ 0.0476, -0.0987, -0.0268, ..., -0.0544, -0.0799, -0.2824], [-0.0109, 0.0719, -0.0298, ..., -0.1129, 0.1351, -0.0379], ..., [ 0.0262, -0.0278, -0.0172, ..., -0.1580, -0.1908, 0.0358], [-0.0261, -0.0952, -0.0286, ..., -0.1072, 0.0580, -0.1611], [-0.2499, -0.0170, -0.0182, ..., -0.0929, -0.2103, -0.0957]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8161e-08, 1.8626e-08, 0.0000e+00], [-4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 3.5856e-08, 2.3283e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 3.2596e-08, 1.8626e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 3.7253e-09, -5.5879e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7229e-08, -7.2177e-08, 9.3132e-10], [ 0.0000e+00, -9.3132e-10, 0.0000e+00, ..., 1.8626e-09, 3.2596e-09, 2.3283e-08]], device='cuda:0') Epoch 281, bias, value: tensor([-0.0145, -0.0142, 0.0045, 0.0337, -0.0079, 0.0289, 0.0064, 0.0160, -0.0017, -0.0178], device='cuda:0'), grad: tensor([ 5.6345e-08, 2.7232e-06, 1.3877e-07, -2.7474e-08, -1.7881e-07, -1.5786e-07, 3.4459e-08, -3.1423e-06, -7.2177e-08, 6.2445e-07], device='cuda:0') 100 0.0001 changing lr epoch 280, time 214.65, cls_loss 0.0009 cls_loss_mapping 0.0018 cls_loss_causal 0.4904 re_mapping 0.0039 re_causal 0.0125 /// teacc 99.07 lr 0.00010000 Epoch 282, weight, value: tensor([[-0.0739, -0.0273, -0.0978, ..., 0.0299, -0.1136, -0.0074], [ 0.0475, -0.0991, -0.0268, ..., -0.0544, -0.0799, -0.2826], [-0.0109, 0.0719, -0.0298, ..., -0.1129, 0.1353, -0.0375], ..., [ 0.0262, -0.0275, -0.0173, ..., -0.1585, -0.1912, 0.0360], [-0.0255, -0.0953, -0.0287, ..., -0.1073, 0.0581, -0.1611], [-0.2500, -0.0170, -0.0182, ..., -0.0929, -0.2107, -0.0958]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.0955e-08, 3.1665e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 9.9652e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 2.3562e-06, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 4.5635e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.9849e-09, -2.4457e-06, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.9849e-09, 8.9873e-08, 4.6566e-10]], device='cuda:0') Epoch 282, bias, value: tensor([-0.0144, -0.0141, 0.0042, 0.0336, -0.0078, 0.0290, 0.0063, 0.0161, -0.0017, -0.0181], device='cuda:0'), grad: tensor([-1.1642e-08, 2.6496e-07, 5.7407e-06, 1.2862e-06, 4.6566e-08, -2.2948e-06, 3.9628e-07, 8.6147e-08, -5.8413e-06, 3.0454e-07], device='cuda:0') 100 0.0001 changing lr epoch 281, time 214.56, cls_loss 0.0012 cls_loss_mapping 0.0024 cls_loss_causal 0.4950 re_mapping 0.0040 re_causal 0.0125 /// teacc 99.05 lr 0.00010000 Epoch 283, weight, value: tensor([[-0.0739, -0.0274, -0.0978, ..., 0.0298, -0.1139, -0.0073], [ 0.0479, -0.0991, -0.0272, ..., -0.0545, -0.0829, -0.2826], [-0.0113, 0.0721, -0.0300, ..., -0.1130, 0.1391, -0.0370], ..., [ 0.0263, -0.0276, -0.0176, ..., -0.1589, -0.1916, 0.0362], [-0.0254, -0.0954, -0.0283, ..., -0.1074, 0.0580, -0.1612], [-0.2500, -0.0170, -0.0183, ..., -0.0929, -0.2113, -0.0960]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.6298e-08, 5.1223e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8161e-08, 4.3772e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.5879e-09, 1.2573e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, 1.1176e-08, -4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.1548e-07, -2.8871e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 283, bias, value: tensor([-0.0145, -0.0160, 0.0075, 0.0340, -0.0077, 0.0287, 0.0062, 0.0163, -0.0017, -0.0185], device='cuda:0'), grad: tensor([-1.4901e-08, 2.3702e-07, 3.8324e-07, 1.2219e-06, 7.8697e-08, -1.2526e-07, 4.4238e-08, -6.2166e-07, -1.1511e-06, -7.4040e-08], device='cuda:0') 100 0.0001 changing lr epoch 282, time 214.68, cls_loss 0.0011 cls_loss_mapping 0.0015 cls_loss_causal 0.5055 re_mapping 0.0038 re_causal 0.0123 /// teacc 98.94 lr 0.00010000 Epoch 284, weight, value: tensor([[-0.0740, -0.0274, -0.0978, ..., 0.0299, -0.1144, -0.0073], [ 0.0481, -0.0991, -0.0274, ..., -0.0547, -0.0829, -0.2828], [-0.0115, 0.0721, -0.0300, ..., -0.1131, 0.1391, -0.0366], ..., [ 0.0262, -0.0275, -0.0181, ..., -0.1592, -0.1919, 0.0362], [-0.0254, -0.0954, -0.0285, ..., -0.1077, 0.0580, -0.1614], [-0.2501, -0.0170, -0.0185, ..., -0.0929, -0.2117, -0.0972]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -9.3132e-09, 1.1176e-08, 0.0000e+00], [-4.6566e-09, 0.0000e+00, 0.0000e+00, ..., -1.8626e-09, 1.9558e-08, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-08, -7.9162e-08, -4.6566e-09], ..., [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 7.2643e-08, 8.3819e-09], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 6.2399e-08, 9.8720e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 4.6566e-09, 7.4506e-09]], device='cuda:0') Epoch 284, bias, value: tensor([-0.0145, -0.0160, 0.0073, 0.0337, -0.0045, 0.0288, 0.0064, 0.0165, -0.0018, -0.0222], device='cuda:0'), grad: tensor([-2.7940e-09, 3.9581e-07, 2.5146e-08, 1.4514e-05, -4.4703e-08, 9.4995e-08, -3.2224e-07, -1.5378e-05, 4.7125e-07, 2.2072e-07], device='cuda:0') 100 0.0001 changing lr epoch 283, time 214.73, cls_loss 0.0012 cls_loss_mapping 0.0016 cls_loss_causal 0.4791 re_mapping 0.0040 re_causal 0.0126 /// teacc 99.01 lr 0.00010000 Epoch 285, weight, value: tensor([[-0.0741, -0.0274, -0.0979, ..., 0.0298, -0.1147, -0.0075], [ 0.0499, -0.0992, -0.0309, ..., -0.0549, -0.0830, -0.2830], [-0.0117, 0.0721, -0.0297, ..., -0.1135, 0.1392, -0.0387], ..., [ 0.0259, -0.0275, -0.0179, ..., -0.1611, -0.1929, 0.0361], [-0.0261, -0.0954, -0.0286, ..., -0.1082, 0.0580, -0.1618], [-0.2502, -0.0167, -0.0186, ..., -0.0929, -0.2112, -0.0980]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, 6.5193e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 8.3819e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, -2.5239e-07, 0.0000e+00], ..., [-0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.2573e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -9.3132e-10, 2.2352e-08, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, 0.0000e+00, ..., -8.3819e-09, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 285, bias, value: tensor([-0.0146, -0.0162, 0.0072, 0.0336, -0.0028, 0.0290, 0.0062, 0.0167, -0.0021, -0.0231], device='cuda:0'), grad: tensor([ 1.9558e-08, 1.4901e-07, -6.2305e-07, 1.2759e-07, 6.0536e-08, -1.1362e-07, 1.6764e-08, 3.2503e-07, 9.8720e-08, -5.8673e-08], device='cuda:0') 100 0.0001 changing lr epoch 284, time 214.93, cls_loss 0.0008 cls_loss_mapping 0.0014 cls_loss_causal 0.4970 re_mapping 0.0040 re_causal 0.0132 /// teacc 98.92 lr 0.00010000 Epoch 286, weight, value: tensor([[-0.0741, -0.0274, -0.0979, ..., 0.0299, -0.1150, -0.0075], [ 0.0499, -0.0992, -0.0301, ..., -0.0548, -0.0830, -0.2831], [-0.0116, 0.0722, -0.0297, ..., -0.1134, 0.1394, -0.0390], ..., [ 0.0258, -0.0275, -0.0184, ..., -0.1622, -0.1934, 0.0364], [-0.0262, -0.0955, -0.0292, ..., -0.1086, 0.0578, -0.1618], [-0.2502, -0.0166, -0.0188, ..., -0.0929, -0.2120, -0.0981]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.4506e-09, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 6.5193e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, -1.2387e-07, 9.3132e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.1362e-07, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.9116e-08, 2.9802e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 2.7940e-09, 1.8626e-09]], device='cuda:0') Epoch 286, bias, value: tensor([-0.0147, -0.0161, 0.0073, 0.0335, -0.0029, 0.0292, 0.0062, 0.0166, -0.0023, -0.0233], device='cuda:0'), grad: tensor([ 2.7008e-08, 1.1409e-06, 9.0338e-08, -3.5390e-08, 6.6031e-07, 2.7195e-07, -2.7195e-07, -2.7400e-06, 5.6811e-07, 2.7101e-07], device='cuda:0') 100 0.0001 changing lr epoch 285, time 214.74, cls_loss 0.0013 cls_loss_mapping 0.0017 cls_loss_causal 0.4736 re_mapping 0.0040 re_causal 0.0120 /// teacc 99.03 lr 0.00010000 Epoch 287, weight, value: tensor([[-0.0741, -0.0274, -0.0980, ..., 0.0302, -0.1152, -0.0068], [ 0.0498, -0.0992, -0.0299, ..., -0.0565, -0.0831, -0.2832], [-0.0117, 0.0722, -0.0297, ..., -0.1139, 0.1395, -0.0398], ..., [ 0.0259, -0.0275, -0.0190, ..., -0.1627, -0.1938, 0.0365], [-0.0255, -0.0955, -0.0294, ..., -0.1088, 0.0580, -0.1619], [-0.2504, -0.0166, -0.0193, ..., -0.0930, -0.2137, -0.0981]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9092e-07, 1.2293e-07, 0.0000e+00], [-0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-08, 4.9081e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7008e-08, -9.2667e-07, -1.8626e-09], ..., [-9.3132e-10, -9.3132e-10, 0.0000e+00, ..., 2.7940e-09, 2.1141e-07, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.6601e-07, 1.6298e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.5367e-07, 6.5193e-08, 1.0245e-08]], device='cuda:0') Epoch 287, bias, value: tensor([-0.0146, -0.0163, 0.0072, 0.0334, -0.0030, 0.0292, 0.0064, 0.0168, -0.0021, -0.0231], device='cuda:0'), grad: tensor([ 5.6252e-07, 2.6748e-06, -4.7311e-06, 9.4064e-07, 6.8732e-07, 2.8592e-07, -2.5220e-06, 7.9162e-07, 8.9314e-07, 3.9581e-07], device='cuda:0') 100 0.0001 changing lr epoch 286, time 214.56, cls_loss 0.0015 cls_loss_mapping 0.0020 cls_loss_causal 0.4974 re_mapping 0.0041 re_causal 0.0124 /// teacc 98.99 lr 0.00010000 Epoch 288, weight, value: tensor([[-0.0742, -0.0275, -0.0980, ..., 0.0349, -0.1155, -0.0064], [ 0.0499, -0.0995, -0.0332, ..., -0.0543, -0.0831, -0.2833], [-0.0117, 0.0725, -0.0295, ..., -0.1145, 0.1397, -0.0393], ..., [ 0.0261, -0.0273, -0.0183, ..., -0.1636, -0.1945, 0.0366], [-0.0256, -0.0956, -0.0294, ..., -0.1095, 0.0578, -0.1624], [-0.2522, -0.0168, -0.0214, ..., -0.0941, -0.2146, -0.0989]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.7940e-08, 7.4506e-09, 4.6566e-09], [ 0.0000e+00, 0.0000e+00, 8.3819e-09, ..., 9.3132e-10, 9.3132e-10, 4.6566e-09], [ 0.0000e+00, 0.0000e+00, 6.5193e-09, ..., 1.8626e-09, -1.3039e-08, 7.4506e-09], ..., [ 0.0000e+00, 0.0000e+00, 3.6508e-07, ..., 0.0000e+00, 3.7253e-09, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, 4.6566e-09, ..., 4.6566e-09, 1.3039e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 1.8626e-08, 0.0000e+00, 4.0978e-08]], device='cuda:0') Epoch 288, bias, value: tensor([-0.0101, -0.0165, 0.0073, 0.0337, -0.0026, 0.0290, 0.0032, 0.0171, -0.0023, -0.0254], device='cuda:0'), grad: tensor([-4.1910e-08, 1.5926e-07, 1.8161e-07, -7.5325e-06, -1.7416e-07, 6.0536e-08, 1.7695e-08, 6.9290e-06, 1.6298e-07, 2.3097e-07], device='cuda:0') 100 0.0001 changing lr epoch 287, time 214.87, cls_loss 0.0013 cls_loss_mapping 0.0018 cls_loss_causal 0.4850 re_mapping 0.0042 re_causal 0.0125 /// teacc 99.06 lr 0.00010000 Epoch 289, weight, value: tensor([[-0.0742, -0.0275, -0.0980, ..., 0.0354, -0.1157, -0.0072], [ 0.0502, -0.0995, -0.0328, ..., -0.0542, -0.0830, -0.2834], [-0.0117, 0.0725, -0.0295, ..., -0.1147, 0.1396, -0.0391], ..., [ 0.0260, -0.0273, -0.0185, ..., -0.1644, -0.1957, 0.0367], [-0.0257, -0.0957, -0.0294, ..., -0.1100, 0.0579, -0.1625], [-0.2523, -0.0168, -0.0217, ..., -0.0945, -0.2155, -0.0991]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.7940e-09, 8.2888e-08, 6.6124e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.6368e-08, 1.5367e-07, 2.5146e-06], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.1420e-07, 2.4587e-07, 1.2107e-08], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.2945e-07, 1.0245e-08], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 8.9407e-08, 3.3248e-07, 2.6077e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 1.3039e-08, -7.0669e-06]], device='cuda:0') Epoch 289, bias, value: tensor([-0.0096, -0.0165, 0.0069, 0.0335, -0.0018, 0.0295, 0.0027, 0.0174, -0.0023, -0.0263], device='cuda:0'), grad: tensor([ 5.5134e-07, 1.4186e-05, 5.2713e-07, 9.3784e-07, 2.6256e-05, -6.2957e-07, -4.1761e-06, 3.9302e-07, 8.8383e-07, -3.8981e-05], device='cuda:0') 100 0.0001 changing lr epoch 288, time 214.55, cls_loss 0.0009 cls_loss_mapping 0.0017 cls_loss_causal 0.4786 re_mapping 0.0040 re_causal 0.0125 /// teacc 99.06 lr 0.00010000 Epoch 290, weight, value: tensor([[-0.0743, -0.0275, -0.0980, ..., 0.0356, -0.1155, -0.0075], [ 0.0503, -0.0996, -0.0331, ..., -0.0548, -0.0830, -0.2836], [-0.0119, 0.0725, -0.0295, ..., -0.1161, 0.1396, -0.0392], ..., [ 0.0261, -0.0273, -0.0184, ..., -0.1655, -0.1962, 0.0367], [-0.0256, -0.0957, -0.0294, ..., -0.1115, 0.0577, -0.1625], [-0.2524, -0.0168, -0.0218, ..., -0.0946, -0.2160, -0.0992]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.1176e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.7295e-06, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.4506e-09, -2.6822e-06, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.4809e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.0978e-08, -1.4994e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.8626e-09, 1.8626e-09]], device='cuda:0') Epoch 290, bias, value: tensor([-0.0095, -0.0164, 0.0067, 0.0336, -0.0017, 0.0296, 0.0030, 0.0174, -0.0026, -0.0263], device='cuda:0'), grad: tensor([ 2.3283e-08, 1.1325e-06, -5.8971e-06, 9.1270e-08, 8.8476e-08, 2.5891e-07, 2.7940e-09, 4.4852e-06, -2.8312e-07, 6.5193e-08], device='cuda:0') 100 0.0001 changing lr epoch 289, time 214.22, cls_loss 0.0009 cls_loss_mapping 0.0016 cls_loss_causal 0.4615 re_mapping 0.0038 re_causal 0.0116 /// teacc 99.06 lr 0.00010000 Epoch 291, weight, value: tensor([[-0.0743, -0.0275, -0.0983, ..., 0.0356, -0.1156, -0.0079], [ 0.0503, -0.0996, -0.0343, ..., -0.0549, -0.0830, -0.2837], [-0.0119, 0.0725, -0.0294, ..., -0.1165, 0.1395, -0.0393], ..., [ 0.0261, -0.0273, -0.0188, ..., -0.1657, -0.1971, 0.0367], [-0.0255, -0.0957, -0.0296, ..., -0.1123, 0.0580, -0.1625], [-0.2525, -0.0168, -0.0224, ..., -0.0946, -0.2167, -0.0992]], device='cuda:0'), grad: tensor([[ 1.7695e-08, 0.0000e+00, 0.0000e+00, ..., 2.2445e-07, 3.5949e-07, 6.6124e-08], [-1.4994e-07, 0.0000e+00, 0.0000e+00, ..., 2.5146e-08, 6.5193e-08, 0.0000e+00], [ 7.7300e-08, 0.0000e+00, 0.0000e+00, ..., 1.3970e-08, -5.5879e-09, 1.5832e-08], ..., [ 8.3819e-09, 0.0000e+00, -0.0000e+00, ..., 9.3132e-10, 2.8871e-08, 0.0000e+00], [ 4.0047e-08, 0.0000e+00, 0.0000e+00, ..., 1.2675e-06, 1.6084e-06, 2.7940e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-08, 3.8184e-08, 9.3132e-10]], device='cuda:0') Epoch 291, bias, value: tensor([-0.0094, -0.0164, 0.0065, 0.0337, -0.0018, 0.0295, 0.0030, 0.0174, -0.0024, -0.0262], device='cuda:0'), grad: tensor([ 7.9349e-07, 1.6633e-06, 2.9895e-07, 3.1292e-07, 1.0431e-07, -8.7172e-07, -3.8184e-06, -2.4512e-06, 3.5688e-06, 4.1071e-07], device='cuda:0') 100 0.0001 changing lr epoch 290, time 214.49, cls_loss 0.0008 cls_loss_mapping 0.0014 cls_loss_causal 0.4817 re_mapping 0.0038 re_causal 0.0122 /// teacc 99.08 lr 0.00010000 Epoch 292, weight, value: tensor([[-0.0745, -0.0275, -0.0984, ..., 0.0356, -0.1160, -0.0082], [ 0.0508, -0.0996, -0.0381, ..., -0.0549, -0.0832, -0.2839], [-0.0120, 0.0725, -0.0288, ..., -0.1168, 0.1397, -0.0393], ..., [ 0.0261, -0.0273, -0.0181, ..., -0.1660, -0.1973, 0.0363], [-0.0259, -0.0957, -0.0272, ..., -0.1129, 0.0579, -0.1625], [-0.2528, -0.0168, -0.0250, ..., -0.0945, -0.2171, -0.0990]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -8.3819e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, 1.0245e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, -5.1223e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7695e-08, 1.2107e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.5635e-08, 0.0000e+00]], device='cuda:0') Epoch 292, bias, value: tensor([-0.0095, -0.0164, 0.0065, 0.0337, -0.0018, 0.0296, 0.0031, 0.0173, -0.0024, -0.0259], device='cuda:0'), grad: tensor([-1.5832e-08, -5.5227e-07, -8.5682e-08, 0.0000e+00, 1.1176e-07, 8.1956e-08, -1.7695e-07, 5.4948e-08, 4.6194e-07, 1.2014e-07], device='cuda:0') 100 0.0001 changing lr epoch 291, time 214.94, cls_loss 0.0009 cls_loss_mapping 0.0019 cls_loss_causal 0.4692 re_mapping 0.0037 re_causal 0.0120 /// teacc 99.02 lr 0.00010000 Epoch 293, weight, value: tensor([[-0.0745, -0.0275, -0.0984, ..., 0.0356, -0.1162, -0.0083], [ 0.0505, -0.0997, -0.0413, ..., -0.0556, -0.0832, -0.2839], [-0.0120, 0.0724, -0.0282, ..., -0.1167, 0.1398, -0.0400], ..., [ 0.0264, -0.0271, -0.0169, ..., -0.1665, -0.1976, 0.0363], [-0.0260, -0.0957, -0.0271, ..., -0.1134, 0.0578, -0.1626], [-0.2530, -0.0168, -0.0254, ..., -0.0945, -0.2173, -0.0990]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.1420e-08, 3.7253e-09, 0.0000e+00], [ 5.5879e-08, 0.0000e+00, 0.0000e+00, ..., -1.1455e-07, 2.6356e-07, 0.0000e+00], [ 3.2596e-08, 0.0000e+00, 0.0000e+00, ..., 2.3283e-08, 5.4017e-08, 0.0000e+00], ..., [-1.0990e-07, 0.0000e+00, 0.0000e+00, ..., 7.4506e-09, 7.4506e-09, 2.7940e-09], [ 5.5879e-09, 0.0000e+00, 0.0000e+00, ..., 3.8184e-08, -1.8999e-07, 0.0000e+00], [ 8.3819e-09, 0.0000e+00, 0.0000e+00, ..., 4.5635e-08, 9.3132e-09, -8.3819e-09]], device='cuda:0') Epoch 293, bias, value: tensor([-0.0095, -0.0165, 0.0064, 0.0334, -0.0018, 0.0302, 0.0027, 0.0176, -0.0026, -0.0261], device='cuda:0'), grad: tensor([ 2.9616e-07, 3.6545e-06, 2.8741e-06, 2.9430e-07, 5.4482e-07, -2.6263e-07, 9.3132e-10, -8.5458e-06, -5.0291e-08, 1.2042e-06], device='cuda:0') 100 0.0001 changing lr epoch 292, time 214.96, cls_loss 0.0009 cls_loss_mapping 0.0017 cls_loss_causal 0.4835 re_mapping 0.0038 re_causal 0.0122 /// teacc 99.01 lr 0.00010000 Epoch 294, weight, value: tensor([[-0.0745, -0.0275, -0.0984, ..., 0.0356, -0.1169, -0.0083], [ 0.0506, -0.0997, -0.0412, ..., -0.0560, -0.0832, -0.2841], [-0.0118, 0.0725, -0.0284, ..., -0.1162, 0.1399, -0.0398], ..., [ 0.0262, -0.0271, -0.0168, ..., -0.1679, -0.1985, 0.0364], [-0.0261, -0.0957, -0.0270, ..., -0.1135, 0.0580, -0.1626], [-0.2532, -0.0168, -0.0256, ..., -0.0945, -0.2178, -0.0991]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -9.3132e-10, 4.6566e-09, 9.3132e-10], [ 0.0000e+00, 9.3132e-10, -1.2387e-07, ..., 1.8626e-09, 4.6566e-09, 1.2107e-08], [ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., 2.7940e-09, -5.5879e-09, 8.3819e-09], ..., [ 0.0000e+00, 3.7253e-09, 8.1025e-08, ..., 0.0000e+00, 1.1176e-08, -5.6811e-08], [ 0.0000e+00, 9.3132e-10, 9.3132e-10, ..., 1.8626e-08, 1.0803e-07, 2.3283e-08], [ 0.0000e+00, -1.0245e-08, 5.5879e-09, ..., 1.8626e-09, 7.4506e-09, -1.8626e-09]], device='cuda:0') Epoch 294, bias, value: tensor([-0.0096, -0.0165, 0.0064, 0.0333, -0.0018, 0.0303, 0.0028, 0.0176, -0.0024, -0.0261], device='cuda:0'), grad: tensor([ 2.2352e-08, -9.6112e-07, 2.0955e-07, -1.6950e-07, 1.7881e-07, -2.4121e-07, 4.1537e-07, 1.9558e-08, 5.4576e-07, -2.8871e-08], device='cuda:0') 100 0.0001 changing lr epoch 293, time 214.91, cls_loss 0.0010 cls_loss_mapping 0.0019 cls_loss_causal 0.4873 re_mapping 0.0037 re_causal 0.0123 /// teacc 99.11 lr 0.00010000 Epoch 295, weight, value: tensor([[-0.0746, -0.0275, -0.0984, ..., 0.0357, -0.1174, -0.0084], [ 0.0507, -0.0998, -0.0411, ..., -0.0564, -0.0832, -0.2843], [-0.0120, 0.0724, -0.0285, ..., -0.1162, 0.1395, -0.0397], ..., [ 0.0262, -0.0271, -0.0169, ..., -0.1689, -0.1991, 0.0363], [-0.0261, -0.0958, -0.0270, ..., -0.1143, 0.0588, -0.1626], [-0.2535, -0.0167, -0.0258, ..., -0.0947, -0.2194, -0.0993]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9744e-07, 9.3225e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.4342e-07, 6.7893e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.9162e-08, 3.6880e-07, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.2107e-08, 6.3330e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -6.2678e-07, -2.9653e-06, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7695e-08, 8.5682e-08, -0.0000e+00]], device='cuda:0') Epoch 295, bias, value: tensor([-0.0093, -0.0165, 0.0061, 0.0330, -0.0017, 0.0305, 0.0031, 0.0175, -0.0017, -0.0266], device='cuda:0'), grad: tensor([ 3.6284e-06, -4.3988e-05, 1.5190e-06, 1.7807e-06, 4.4703e-08, 4.6846e-07, 1.0021e-06, 4.6492e-05, -1.1414e-05, 4.4797e-07], device='cuda:0') 100 0.0001 changing lr epoch 294, time 214.67, cls_loss 0.0011 cls_loss_mapping 0.0016 cls_loss_causal 0.4773 re_mapping 0.0038 re_causal 0.0119 /// teacc 99.11 lr 0.00010000