/home/yuqian_fu here1 here2 {'gpu': '0', 'data': 'mnist', 'ntr': None, 'translate': None, 'autoaug': 'CA_multiple', 'n': 3, 'stride': 3, 'factor_num': 14, 'epochs': 500, 'nbatch': 100, 'batchsize': 32, 'lr': 0.0001, 'lr_scheduler': 'Step', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep500_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1_adam', 'clsadapt': True, 'lambda_causal': 1.0, 'lambda_re': 1.0, 'randm': True, 'randn': True, 'network': 'resnet18'} stride: 3 --------------------------CA_multiple-------------------------- ---------------------------14 factors----------------- randm: True randn: True n: 3 randm: False Epoch 1, weight, value: tensor([[-0.0229, -0.0198, 0.0145, ..., -0.0150, -0.0229, 0.0150], [ 0.0067, -0.0086, -0.0215, ..., 0.0307, 0.0172, 0.0031], [-0.0304, -0.0185, -0.0098, ..., 0.0100, -0.0173, 0.0222], ..., [ 0.0125, -0.0113, -0.0201, ..., -0.0116, 0.0090, 0.0017], [ 0.0270, 0.0120, -0.0142, ..., -0.0211, -0.0112, 0.0308], [-0.0156, 0.0101, 0.0250, ..., -0.0249, 0.0107, -0.0086]], device='cuda:0'), grad: None Epoch 1, bias, value: tensor([-0.0058, -0.0305, 0.0100, -0.0208, 0.0150, 0.0014, 0.0218, -0.0126, -0.0232, 0.0025], device='cuda:0'), grad: None 100 0.0001 changing lr ---------------------saving model at epoch 0---------------------------------------------------- epoch 0, time 280.32, cls_loss 1.5221 cls_loss_mapping 1.9297 cls_loss_causal 2.2244 re_mapping 0.1031 re_causal 0.1042 /// teacc 82.82 lr 0.00010000 Epoch 2, weight, value: tensor([[-0.0187, -0.0223, 0.0113, ..., -0.0233, -0.0257, 0.0088], [ 0.0056, -0.0064, -0.0187, ..., 0.0356, 0.0174, 0.0065], [-0.0322, -0.0189, -0.0155, ..., 0.0165, -0.0119, 0.0174], ..., [ 0.0079, -0.0115, -0.0274, ..., -0.0111, 0.0017, 0.0076], [ 0.0233, 0.0096, -0.0100, ..., -0.0189, -0.0133, 0.0258], [-0.0200, 0.0071, 0.0202, ..., -0.0310, 0.0091, -0.0075]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -3.2368e-03, ..., 5.0354e-03, -9.0866e-03, 4.6310e-07], [ 0.0000e+00, 0.0000e+00, 9.5673e-03, ..., 1.8482e-03, 1.3781e-03, -2.4796e-04], [ 0.0000e+00, 0.0000e+00, -1.5747e-02, ..., -5.8258e-02, -1.5022e-02, 2.7195e-05], ..., [ 0.0000e+00, 0.0000e+00, 1.5656e-02, ..., 1.6800e-02, 2.0004e-02, 2.4736e-05], [ 0.0000e+00, 0.0000e+00, -2.1423e-02, ..., 6.3515e-04, 4.4823e-03, 4.9353e-05], [ 0.0000e+00, 0.0000e+00, 2.0767e-02, ..., 1.0090e-03, -1.9806e-02, 4.1455e-05]], device='cuda:0') Epoch 2, bias, value: tensor([-0.0077, -0.0288, 0.0089, -0.0207, 0.0140, 0.0016, 0.0217, -0.0119, -0.0237, 0.0025], device='cuda:0'), grad: tensor([-0.0174, 0.0233, -0.0216, 0.0623, 0.0347, -0.1466, 0.0386, 0.0470, -0.0019, -0.0184], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 1---------------------------------------------------- epoch 1, time 279.51, cls_loss 0.5193 cls_loss_mapping 0.8489 cls_loss_causal 1.9181 re_mapping 0.2062 re_causal 0.2506 /// teacc 90.62 lr 0.00010000 Epoch 3, weight, value: tensor([[-0.0131, -0.0223, 0.0066, ..., -0.0265, -0.0276, 0.0070], [ 0.0052, -0.0064, -0.0181, ..., 0.0379, 0.0188, 0.0087], [-0.0315, -0.0189, -0.0192, ..., 0.0190, -0.0104, 0.0200], ..., [ 0.0002, -0.0115, -0.0320, ..., -0.0099, -0.0008, 0.0109], [ 0.0202, 0.0096, -0.0075, ..., -0.0198, -0.0153, 0.0207], [-0.0269, 0.0071, 0.0190, ..., -0.0360, 0.0083, -0.0096]], device='cuda:0'), grad: tensor([[-2.5997e-03, 0.0000e+00, 2.1439e-03, ..., 2.4738e-03, 1.2684e-04, 5.6148e-05], [ 5.8365e-04, 0.0000e+00, -1.1978e-02, ..., -3.0258e-02, -1.8454e-03, -4.2992e-03], [-2.1706e-03, 0.0000e+00, 6.4659e-04, ..., -4.2664e-02, -1.0214e-03, -9.6273e-04], ..., [ 1.2836e-03, 0.0000e+00, 3.8624e-03, ..., 1.3199e-02, 2.3232e-03, 4.7660e-04], [-1.1740e-03, 0.0000e+00, -4.2915e-03, ..., 1.0429e-02, 1.4200e-03, 1.0481e-03], [ 4.7398e-04, 0.0000e+00, 7.9727e-03, ..., 1.1276e-02, -5.7564e-03, -4.7922e-04]], device='cuda:0') Epoch 3, bias, value: tensor([-0.0081, -0.0287, 0.0083, -0.0207, 0.0138, 0.0029, 0.0213, -0.0123, -0.0239, 0.0030], device='cuda:0'), grad: tensor([-0.0053, -0.0143, -0.0182, 0.0160, -0.0222, 0.0097, 0.0101, 0.0087, -0.0011, 0.0167], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 2---------------------------------------------------- epoch 2, time 278.47, cls_loss 0.3500 cls_loss_mapping 0.4936 cls_loss_causal 1.6778 re_mapping 0.1573 re_causal 0.2369 /// teacc 93.59 lr 0.00010000 Epoch 4, weight, value: tensor([[-0.0103, -0.0223, 0.0036, ..., -0.0287, -0.0287, 0.0006], [ 0.0057, -0.0064, -0.0182, ..., 0.0386, 0.0202, 0.0099], [-0.0292, -0.0189, -0.0216, ..., 0.0207, -0.0094, 0.0201], ..., [-0.0062, -0.0115, -0.0360, ..., -0.0080, -0.0019, 0.0151], [ 0.0193, 0.0096, -0.0069, ..., -0.0198, -0.0175, 0.0158], [-0.0297, 0.0071, 0.0201, ..., -0.0389, 0.0078, -0.0117]], device='cuda:0'), grad: tensor([[ 4.4179e-04, 0.0000e+00, 4.5180e-04, ..., 7.4625e-04, 1.1253e-02, 3.5310e-04], [-1.3762e-03, 0.0000e+00, 1.5144e-02, ..., 1.3113e-03, 9.5978e-03, 6.7520e-03], [ 5.4502e-04, 0.0000e+00, 2.3842e-03, ..., -6.1378e-03, 2.9488e-03, -4.8709e-04], ..., [ 9.7215e-05, 0.0000e+00, 6.0959e-03, ..., -2.2542e-04, 4.5700e-03, -6.2799e-04], [ 1.1873e-03, 0.0000e+00, 6.7101e-03, ..., 1.1421e-02, 5.5656e-03, -1.2693e-03], [ 8.5890e-05, 0.0000e+00, -3.5004e-02, ..., -4.2381e-03, 8.7357e-03, 1.7843e-03]], device='cuda:0') Epoch 4, bias, value: tensor([-0.0087, -0.0287, 0.0084, -0.0209, 0.0139, 0.0037, 0.0211, -0.0121, -0.0242, 0.0031], device='cuda:0'), grad: tensor([ 0.0173, 0.0151, 0.0085, 0.0242, -0.0677, 0.0196, -0.0236, 0.0063, 0.0038, -0.0034], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 3---------------------------------------------------- epoch 3, time 278.78, cls_loss 0.2505 cls_loss_mapping 0.3509 cls_loss_causal 1.5292 re_mapping 0.1212 re_causal 0.2169 /// teacc 94.58 lr 0.00010000 Epoch 5, weight, value: tensor([[-0.0085, -0.0223, 0.0007, ..., -0.0307, -0.0284, -0.0034], [ 0.0055, -0.0064, -0.0171, ..., 0.0400, 0.0222, 0.0099], [-0.0293, -0.0189, -0.0246, ..., 0.0221, -0.0094, 0.0214], ..., [-0.0140, -0.0115, -0.0392, ..., -0.0075, -0.0040, 0.0179], [ 0.0244, 0.0096, -0.0061, ..., -0.0199, -0.0191, 0.0140], [-0.0356, 0.0071, 0.0215, ..., -0.0412, 0.0072, -0.0145]], device='cuda:0'), grad: tensor([[ 6.2287e-05, 0.0000e+00, 4.5681e-04, ..., 1.0548e-03, -1.6918e-03, 8.2031e-06], [-9.3384e-03, 0.0000e+00, -1.2779e-02, ..., -7.9224e-02, -1.2383e-02, 1.2174e-05], [ 7.0686e-03, 0.0000e+00, 1.0117e-02, ..., 4.0863e-02, 3.9062e-03, -4.0102e-04], ..., [ 1.6212e-04, 0.0000e+00, 1.8187e-03, ..., -1.3103e-03, 1.1091e-03, 6.6042e-05], [ 2.4652e-04, 0.0000e+00, 2.5749e-03, ..., 3.2120e-03, 2.2488e-03, 4.2975e-05], [ 1.7071e-04, 0.0000e+00, -8.1787e-03, ..., 1.1187e-03, -5.7755e-03, 1.0431e-05]], device='cuda:0') Epoch 5, bias, value: tensor([-0.0084, -0.0282, 0.0084, -0.0209, 0.0141, 0.0034, 0.0211, -0.0122, -0.0246, 0.0030], device='cuda:0'), grad: tensor([-0.0055, -0.0304, 0.0171, 0.0146, 0.0064, 0.0022, 0.0062, 0.0020, 0.0054, -0.0179], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 4---------------------------------------------------- epoch 4, time 280.63, cls_loss 0.2013 cls_loss_mapping 0.2692 cls_loss_causal 1.3562 re_mapping 0.1003 re_causal 0.1970 /// teacc 95.39 lr 0.00010000 Epoch 6, weight, value: tensor([[-0.0058, -0.0223, -0.0011, ..., -0.0329, -0.0288, -0.0075], [ 0.0085, -0.0064, -0.0168, ..., 0.0413, 0.0242, 0.0100], [-0.0316, -0.0189, -0.0277, ..., 0.0234, -0.0081, 0.0215], ..., [-0.0200, -0.0115, -0.0408, ..., -0.0075, -0.0056, 0.0205], [ 0.0268, 0.0096, -0.0055, ..., -0.0193, -0.0204, 0.0121], [-0.0418, 0.0071, 0.0222, ..., -0.0437, 0.0067, -0.0172]], device='cuda:0'), grad: tensor([[ 5.8651e-04, 0.0000e+00, 9.8896e-04, ..., 7.0429e-04, 2.5864e-03, 4.5180e-05], [-2.1286e-03, 0.0000e+00, -1.4162e-03, ..., -4.3983e-03, -5.0850e-03, -5.0592e-04], [ 8.8739e-04, 0.0000e+00, 2.5845e-03, ..., 8.9979e-04, 5.0201e-03, 2.9182e-04], ..., [ 2.7180e-04, 0.0000e+00, 1.0967e-03, ..., -2.4915e-04, -5.6076e-04, -3.5405e-04], [ 2.2066e-04, 0.0000e+00, -5.7259e-03, ..., -6.0310e-03, -9.4681e-03, 7.8058e-04], [ 9.2649e-04, 0.0000e+00, -3.8929e-03, ..., 1.0090e-03, -1.4992e-03, 2.9826e-04]], device='cuda:0') Epoch 6, bias, value: tensor([-0.0082, -0.0279, 0.0085, -0.0209, 0.0142, 0.0030, 0.0207, -0.0125, -0.0243, 0.0032], device='cuda:0'), grad: tensor([ 0.0021, -0.0036, 0.0055, -0.0034, -0.0021, 0.0094, 0.0069, -0.0034, -0.0055, -0.0059], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 5---------------------------------------------------- epoch 5, time 278.72, cls_loss 0.1822 cls_loss_mapping 0.2325 cls_loss_causal 1.3252 re_mapping 0.0856 re_causal 0.1840 /// teacc 96.41 lr 0.00010000 Epoch 7, weight, value: tensor([[-0.0048, -0.0223, -0.0029, ..., -0.0350, -0.0300, -0.0132], [ 0.0102, -0.0064, -0.0165, ..., 0.0420, 0.0263, 0.0095], [-0.0334, -0.0189, -0.0305, ..., 0.0243, -0.0072, 0.0208], ..., [-0.0229, -0.0115, -0.0416, ..., -0.0071, -0.0070, 0.0219], [ 0.0299, 0.0096, -0.0046, ..., -0.0194, -0.0223, 0.0103], [-0.0454, 0.0071, 0.0220, ..., -0.0465, 0.0057, -0.0200]], device='cuda:0'), grad: tensor([[-0.0008, 0.0000, 0.0014, ..., 0.0017, -0.0031, 0.0002], [ 0.0029, 0.0000, 0.0070, ..., 0.0149, -0.0004, -0.0002], [ 0.0014, 0.0000, 0.0031, ..., -0.0083, 0.0003, -0.0008], ..., [ 0.0008, 0.0000, 0.0013, ..., 0.0039, 0.0040, 0.0002], [-0.0004, 0.0000, -0.0041, ..., -0.0001, 0.0012, -0.0005], [ 0.0008, 0.0000, 0.0015, ..., 0.0028, -0.0038, -0.0012]], device='cuda:0') Epoch 7, bias, value: tensor([-0.0082, -0.0278, 0.0082, -0.0209, 0.0145, 0.0031, 0.0202, -0.0122, -0.0238, 0.0027], device='cuda:0'), grad: tensor([-0.0098, 0.0083, -0.0008, -0.0146, 0.0015, 0.0237, -0.0142, 0.0126, 0.0012, -0.0079], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 6---------------------------------------------------- epoch 6, time 278.42, cls_loss 0.1492 cls_loss_mapping 0.1952 cls_loss_causal 1.2539 re_mapping 0.0763 re_causal 0.1698 /// teacc 96.50 lr 0.00010000 Epoch 8, weight, value: tensor([[-0.0034, -0.0223, -0.0051, ..., -0.0376, -0.0306, -0.0179], [ 0.0120, -0.0064, -0.0159, ..., 0.0430, 0.0283, 0.0086], [-0.0338, -0.0189, -0.0320, ..., 0.0253, -0.0059, 0.0204], ..., [-0.0255, -0.0115, -0.0432, ..., -0.0073, -0.0088, 0.0229], [ 0.0313, 0.0096, -0.0039, ..., -0.0195, -0.0235, 0.0087], [-0.0490, 0.0071, 0.0221, ..., -0.0479, 0.0052, -0.0221]], device='cuda:0'), grad: tensor([[-6.9618e-04, 0.0000e+00, 4.6611e-04, ..., 3.6454e-04, -8.1873e-04, 2.3916e-05], [-1.0653e-03, 0.0000e+00, -8.9502e-04, ..., -1.1177e-03, -3.5324e-03, -2.8357e-05], [ 5.7268e-04, 0.0000e+00, 3.9649e-04, ..., -5.2299e-03, -1.8473e-03, 1.1764e-05], ..., [ 1.6406e-05, 0.0000e+00, 1.4200e-03, ..., 3.4833e-04, 6.3705e-04, -4.7743e-05], [ 1.0405e-03, 0.0000e+00, 9.7809e-03, ..., 5.3101e-03, 2.9888e-03, 4.3011e-04], [ 1.7223e-03, 0.0000e+00, 1.4048e-03, ..., 2.6779e-03, -2.5272e-03, -1.1355e-04]], device='cuda:0') Epoch 8, bias, value: tensor([-0.0083, -0.0276, 0.0084, -0.0207, 0.0145, 0.0026, 0.0200, -0.0122, -0.0235, 0.0026], device='cuda:0'), grad: tensor([-0.0018, -0.0020, -0.0042, -0.0106, 0.0061, 0.0014, 0.0023, -0.0003, 0.0121, -0.0030], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 7---------------------------------------------------- epoch 7, time 279.28, cls_loss 0.1329 cls_loss_mapping 0.1677 cls_loss_causal 1.2467 re_mapping 0.0697 re_causal 0.1641 /// teacc 96.76 lr 0.00010000 Epoch 9, weight, value: tensor([[-0.0019, -0.0223, -0.0066, ..., -0.0392, -0.0305, -0.0224], [ 0.0127, -0.0064, -0.0156, ..., 0.0437, 0.0305, 0.0089], [-0.0338, -0.0189, -0.0334, ..., 0.0263, -0.0056, 0.0197], ..., [-0.0276, -0.0115, -0.0439, ..., -0.0075, -0.0106, 0.0241], [ 0.0321, 0.0096, -0.0035, ..., -0.0196, -0.0245, 0.0077], [-0.0504, 0.0071, 0.0220, ..., -0.0490, 0.0053, -0.0235]], device='cuda:0'), grad: tensor([[ 1.9860e-04, 0.0000e+00, 8.2159e-04, ..., 4.7159e-04, 6.7472e-05, 2.5824e-05], [-1.2474e-03, 0.0000e+00, -1.9409e-02, ..., -5.4688e-02, -2.6627e-02, -8.7023e-04], [ 1.3523e-03, 0.0000e+00, 2.0142e-02, ..., 7.3608e-02, 2.9358e-02, 3.0255e-04], ..., [ 2.2793e-04, 0.0000e+00, 1.4668e-03, ..., -2.4048e-02, -3.1796e-03, 6.5565e-06], [ 3.5501e-04, 0.0000e+00, 6.9475e-04, ..., 1.3647e-03, 2.1133e-03, -1.2353e-05], [-3.2687e-04, 0.0000e+00, -5.8670e-03, ..., -2.5215e-03, -6.8779e-03, 1.9753e-04]], device='cuda:0') Epoch 9, bias, value: tensor([-0.0079, -0.0274, 0.0085, -0.0209, 0.0146, 0.0023, 0.0198, -0.0121, -0.0236, 0.0026], device='cuda:0'), grad: tensor([ 0.0006, -0.0255, 0.0376, 0.0006, 0.0075, 0.0014, -0.0009, -0.0115, 0.0020, -0.0118], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 8---------------------------------------------------- epoch 8, time 278.95, cls_loss 0.0998 cls_loss_mapping 0.1346 cls_loss_causal 1.1738 re_mapping 0.0622 re_causal 0.1539 /// teacc 97.20 lr 0.00010000 Epoch 10, weight, value: tensor([[-0.0009, -0.0223, -0.0080, ..., -0.0408, -0.0304, -0.0248], [ 0.0123, -0.0064, -0.0155, ..., 0.0436, 0.0313, 0.0084], [-0.0347, -0.0189, -0.0349, ..., 0.0273, -0.0048, 0.0195], ..., [-0.0305, -0.0115, -0.0448, ..., -0.0073, -0.0118, 0.0247], [ 0.0348, 0.0096, -0.0033, ..., -0.0192, -0.0253, 0.0073], [-0.0519, 0.0071, 0.0219, ..., -0.0504, 0.0056, -0.0244]], device='cuda:0'), grad: tensor([[ 7.9155e-05, 0.0000e+00, 8.6403e-04, ..., 3.7122e-04, 1.4389e-04, 6.3598e-05], [ 2.0611e-04, 0.0000e+00, 1.6289e-03, ..., 3.4332e-03, 3.0112e-04, 1.1339e-03], [-4.8685e-04, 0.0000e+00, -2.1553e-03, ..., -1.5991e-02, 1.0443e-04, -8.9188e-03], ..., [ 1.5247e-04, 0.0000e+00, 3.0479e-03, ..., 6.2904e-03, 1.4896e-03, 2.5101e-03], [ 8.8644e-04, 0.0000e+00, 4.4441e-03, ..., 1.6006e-02, 1.7605e-03, 8.9111e-03], [ 3.7456e-04, 0.0000e+00, 8.9979e-04, ..., 1.7252e-03, -3.5167e-05, 3.4451e-04]], device='cuda:0') Epoch 10, bias, value: tensor([-0.0077, -0.0277, 0.0088, -0.0210, 0.0145, 0.0019, 0.0198, -0.0119, -0.0232, 0.0025], device='cuda:0'), grad: tensor([-0.0019, 0.0027, -0.0081, -0.0084, 0.0010, -0.0204, 0.0108, 0.0074, 0.0175, -0.0006], device='cuda:0') 100 0.0001 changing lr epoch 9, time 262.35, cls_loss 0.0925 cls_loss_mapping 0.1251 cls_loss_causal 1.1859 re_mapping 0.0567 re_causal 0.1430 /// teacc 96.75 lr 0.00010000 Epoch 11, weight, value: tensor([[-0.0003, -0.0223, -0.0092, ..., -0.0426, -0.0312, -0.0276], [ 0.0129, -0.0064, -0.0153, ..., 0.0437, 0.0323, 0.0080], [-0.0362, -0.0189, -0.0364, ..., 0.0277, -0.0046, 0.0186], ..., [-0.0329, -0.0115, -0.0454, ..., -0.0068, -0.0127, 0.0263], [ 0.0364, 0.0096, -0.0025, ..., -0.0187, -0.0264, 0.0070], [-0.0548, 0.0071, 0.0215, ..., -0.0524, 0.0055, -0.0256]], device='cuda:0'), grad: tensor([[-1.4095e-03, 0.0000e+00, 1.8847e-04, ..., 2.6107e-04, -7.4148e-04, 2.7716e-05], [-4.8447e-04, 0.0000e+00, -1.3294e-03, ..., -6.9475e-04, -2.6093e-03, 4.7684e-05], [ 3.9846e-05, 0.0000e+00, 8.2374e-05, ..., -7.2670e-03, -1.0414e-03, -7.2145e-04], ..., [ 3.7956e-04, 0.0000e+00, 5.7030e-04, ..., 3.3417e-03, 1.4067e-03, -2.6727e-04], [ 9.4748e-04, 0.0000e+00, 1.0208e-02, ..., 6.8512e-03, 6.8016e-03, 1.8370e-04], [ 5.3501e-04, 0.0000e+00, -1.0048e-02, ..., -4.3411e-03, -6.8398e-03, 1.4925e-04]], device='cuda:0') Epoch 11, bias, value: tensor([-0.0079, -0.0279, 0.0083, -0.0210, 0.0147, 0.0018, 0.0197, -0.0114, -0.0227, 0.0021], device='cuda:0'), grad: tensor([-0.0051, -0.0022, -0.0031, 0.0017, 0.0056, 0.0016, 0.0007, 0.0033, 0.0233, -0.0259], device='cuda:0') 100 0.0001 changing lr epoch 10, time 262.38, cls_loss 0.0920 cls_loss_mapping 0.1155 cls_loss_causal 1.0707 re_mapping 0.0534 re_causal 0.1240 /// teacc 97.04 lr 0.00010000 Epoch 12, weight, value: tensor([[ 0.0005, -0.0223, -0.0104, ..., -0.0439, -0.0319, -0.0300], [ 0.0130, -0.0064, -0.0156, ..., 0.0435, 0.0334, 0.0079], [-0.0364, -0.0189, -0.0375, ..., 0.0288, -0.0039, 0.0184], ..., [-0.0349, -0.0115, -0.0465, ..., -0.0066, -0.0142, 0.0273], [ 0.0375, 0.0096, -0.0020, ..., -0.0186, -0.0277, 0.0065], [-0.0560, 0.0071, 0.0215, ..., -0.0536, 0.0056, -0.0266]], device='cuda:0'), grad: tensor([[ 5.4464e-06, 0.0000e+00, 8.2159e-04, ..., 7.2432e-04, 6.7568e-04, 9.8407e-05], [-8.0645e-05, 0.0000e+00, -2.9278e-04, ..., -7.1764e-04, -7.9393e-04, 3.4928e-04], [ 1.6302e-05, 0.0000e+00, 2.3232e-03, ..., 7.1001e-04, 2.0008e-03, 4.8423e-04], ..., [ 1.3165e-05, 0.0000e+00, 3.1376e-03, ..., 2.7447e-03, 7.1383e-04, 9.5320e-04], [ 1.8328e-05, 0.0000e+00, -3.6831e-03, ..., -2.0638e-03, 7.7915e-04, 4.6539e-04], [ 1.5402e-04, 0.0000e+00, 2.3041e-03, ..., 4.4899e-03, 1.2684e-03, 1.8892e-03]], device='cuda:0') Epoch 12, bias, value: tensor([-0.0078, -0.0280, 0.0086, -0.0208, 0.0145, 0.0016, 0.0198, -0.0114, -0.0227, 0.0021], device='cuda:0'), grad: tensor([-0.0015, 0.0003, 0.0051, -0.0063, -0.0071, 0.0043, -0.0005, 0.0052, -0.0061, 0.0068], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 11---------------------------------------------------- epoch 11, time 278.61, cls_loss 0.0944 cls_loss_mapping 0.1198 cls_loss_causal 1.1229 re_mapping 0.0485 re_causal 0.1233 /// teacc 97.52 lr 0.00010000 Epoch 13, weight, value: tensor([[ 0.0016, -0.0223, -0.0118, ..., -0.0459, -0.0326, -0.0320], [ 0.0126, -0.0064, -0.0154, ..., 0.0441, 0.0350, 0.0068], [-0.0375, -0.0189, -0.0393, ..., 0.0288, -0.0043, 0.0181], ..., [-0.0347, -0.0115, -0.0469, ..., -0.0063, -0.0150, 0.0286], [ 0.0388, 0.0096, -0.0013, ..., -0.0180, -0.0285, 0.0056], [-0.0565, 0.0071, 0.0211, ..., -0.0551, 0.0055, -0.0278]], device='cuda:0'), grad: tensor([[ 3.1829e-04, 0.0000e+00, 6.0844e-04, ..., 1.4009e-03, 1.6296e-04, 2.5168e-05], [ 5.7906e-05, 0.0000e+00, -3.0255e-04, ..., -3.4022e-04, -6.2799e-04, 9.5785e-05], [ 1.5342e-04, 0.0000e+00, 5.0735e-04, ..., -1.8632e-04, -5.5361e-04, 2.5010e-04], ..., [-1.2743e-04, 0.0000e+00, 2.8539e-04, ..., -5.7173e-04, 3.7432e-04, -8.7738e-04], [-3.8576e-04, 0.0000e+00, 2.7714e-03, ..., 3.3331e-04, 1.9331e-03, 6.0201e-05], [ 2.4652e-04, 0.0000e+00, 6.7902e-04, ..., 1.0383e-04, -2.0370e-03, 1.2255e-04]], device='cuda:0') Epoch 13, bias, value: tensor([-0.0077, -0.0278, 0.0083, -0.0209, 0.0146, 0.0016, 0.0194, -0.0113, -0.0224, 0.0020], device='cuda:0'), grad: tensor([ 0.0030, -0.0003, 0.0006, 0.0024, 0.0013, -0.0071, 0.0020, -0.0005, -0.0008, -0.0007], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 12---------------------------------------------------- epoch 12, time 278.72, cls_loss 0.0749 cls_loss_mapping 0.0972 cls_loss_causal 1.0655 re_mapping 0.0466 re_causal 0.1147 /// teacc 97.66 lr 0.00010000 Epoch 14, weight, value: tensor([[ 0.0017, -0.0223, -0.0130, ..., -0.0473, -0.0332, -0.0341], [ 0.0123, -0.0064, -0.0153, ..., 0.0444, 0.0360, 0.0067], [-0.0381, -0.0189, -0.0408, ..., 0.0292, -0.0036, 0.0173], ..., [-0.0349, -0.0115, -0.0479, ..., -0.0064, -0.0161, 0.0294], [ 0.0398, 0.0096, -0.0009, ..., -0.0178, -0.0291, 0.0050], [-0.0576, 0.0071, 0.0210, ..., -0.0562, 0.0057, -0.0281]], device='cuda:0'), grad: tensor([[-5.3883e-04, 0.0000e+00, 3.7980e-04, ..., 3.5071e-04, 1.2740e-05, 1.8582e-05], [ 1.4801e-03, 0.0000e+00, 1.8578e-03, ..., 2.1744e-03, -9.0790e-04, 6.2823e-05], [ 6.2227e-04, 0.0000e+00, 1.5860e-03, ..., 1.8559e-03, 2.6464e-04, 9.5725e-05], ..., [-7.6175e-05, 0.0000e+00, 4.1986e-04, ..., -9.6607e-04, 3.4904e-04, -6.8617e-04], [-3.9825e-03, 0.0000e+00, -1.0170e-02, ..., -1.1017e-02, -7.7069e-05, 2.8148e-05], [ 4.4441e-04, 0.0000e+00, 4.9324e-03, ..., 4.3259e-03, -1.1665e-04, 1.7023e-04]], device='cuda:0') Epoch 14, bias, value: tensor([-0.0076, -0.0279, 0.0084, -0.0205, 0.0142, 0.0019, 0.0192, -0.0114, -0.0225, 0.0019], device='cuda:0'), grad: tensor([-0.0008, 0.0027, 0.0023, 0.0038, 0.0008, -0.0018, 0.0017, -0.0005, -0.0165, 0.0084], device='cuda:0') 100 0.0001 changing lr epoch 13, time 262.33, cls_loss 0.0878 cls_loss_mapping 0.1078 cls_loss_causal 1.0380 re_mapping 0.0432 re_causal 0.1055 /// teacc 97.58 lr 0.00010000 Epoch 15, weight, value: tensor([[ 0.0021, -0.0223, -0.0141, ..., -0.0488, -0.0337, -0.0358], [ 0.0128, -0.0064, -0.0155, ..., 0.0445, 0.0369, 0.0066], [-0.0392, -0.0189, -0.0420, ..., 0.0300, -0.0025, 0.0170], ..., [-0.0363, -0.0115, -0.0480, ..., -0.0063, -0.0173, 0.0304], [ 0.0420, 0.0096, -0.0006, ..., -0.0175, -0.0301, 0.0042], [-0.0595, 0.0071, 0.0209, ..., -0.0570, 0.0052, -0.0286]], device='cuda:0'), grad: tensor([[ 3.3379e-04, 0.0000e+00, 5.1403e-04, ..., 1.1263e-03, 1.1034e-03, 7.6694e-07], [ 3.5357e-04, 0.0000e+00, 4.1509e-04, ..., 7.5293e-04, 1.0228e-04, 1.3253e-06], [ 1.1539e-03, 0.0000e+00, 1.4572e-03, ..., -8.4686e-04, 1.0157e-03, -1.7062e-05], ..., [ 2.4259e-04, 0.0000e+00, 1.1501e-03, ..., 1.5516e-03, 6.8617e-04, 4.2133e-06], [-2.0580e-03, 0.0000e+00, -7.8249e-04, ..., -2.8801e-03, 6.5422e-04, 1.0729e-06], [ 4.6277e-04, 0.0000e+00, 9.4843e-04, ..., 1.1625e-03, 9.8705e-04, 5.4948e-08]], device='cuda:0') Epoch 15, bias, value: tensor([-0.0079, -0.0278, 0.0088, -0.0205, 0.0141, 0.0021, 0.0191, -0.0118, -0.0225, 0.0020], device='cuda:0'), grad: tensor([ 0.0028, 0.0012, -0.0005, 0.0091, -0.0082, -0.0145, 0.0031, 0.0064, 0.0008, -0.0001], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 14---------------------------------------------------- epoch 14, time 279.63, cls_loss 0.0727 cls_loss_mapping 0.0954 cls_loss_causal 1.0262 re_mapping 0.0412 re_causal 0.1041 /// teacc 97.71 lr 0.00010000 Epoch 16, weight, value: tensor([[ 2.9271e-03, -2.2335e-02, -1.5212e-02, ..., -5.0395e-02, -3.3793e-02, -3.7773e-02], [ 1.3338e-02, -6.4199e-03, -1.5480e-02, ..., 4.4424e-02, 3.7605e-02, 5.9012e-03], [-3.9176e-02, -1.8948e-02, -4.3374e-02, ..., 3.0824e-02, -1.9028e-03, 1.6749e-02], ..., [-3.7710e-02, -1.1540e-02, -4.7206e-02, ..., -6.4257e-03, -1.7850e-02, 3.0798e-02], [ 4.2785e-02, 9.5671e-03, 2.2658e-05, ..., -1.7307e-02, -3.1190e-02, 3.9399e-03], [-6.0666e-02, 7.1317e-03, 2.1243e-02, ..., -5.7402e-02, 5.3516e-03, -2.9453e-02]], device='cuda:0'), grad: tensor([[-1.5755e-03, 0.0000e+00, 2.9588e-04, ..., 1.2455e-03, 6.3038e-04, 8.5175e-05], [ 7.9918e-04, 0.0000e+00, 9.3126e-04, ..., 1.8797e-03, 1.7083e-04, 2.4402e-04], [ 1.3390e-03, 0.0000e+00, 2.1172e-03, ..., -1.4549e-02, 6.3848e-04, 9.4700e-04], ..., [ 1.4095e-03, 0.0000e+00, 7.7200e-04, ..., 2.4548e-03, 2.5387e-03, -1.6832e-04], [-7.5436e-04, 0.0000e+00, -2.9163e-03, ..., -1.5612e-03, -8.7309e-04, 1.3649e-04], [-9.8801e-04, 0.0000e+00, 2.8687e-03, ..., 3.0670e-03, -4.0207e-03, 1.7416e-04]], device='cuda:0') Epoch 16, bias, value: tensor([-0.0072, -0.0280, 0.0091, -0.0206, 0.0140, 0.0018, 0.0189, -0.0114, -0.0225, 0.0017], device='cuda:0'), grad: tensor([ 0.0021, 0.0030, -0.0113, 0.0034, 0.0012, 0.0042, 0.0033, 0.0085, -0.0066, -0.0078], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 15---------------------------------------------------- epoch 15, time 279.05, cls_loss 0.0821 cls_loss_mapping 0.0971 cls_loss_causal 1.0063 re_mapping 0.0395 re_causal 0.0973 /// teacc 97.84 lr 0.00010000 Epoch 17, weight, value: tensor([[ 0.0032, -0.0233, -0.0162, ..., -0.0521, -0.0342, -0.0400], [ 0.0140, -0.0086, -0.0153, ..., 0.0444, 0.0381, 0.0053], [-0.0397, -0.0121, -0.0448, ..., 0.0314, -0.0007, 0.0163], ..., [-0.0393, -0.0133, -0.0470, ..., -0.0061, -0.0187, 0.0314], [ 0.0435, 0.0094, 0.0005, ..., -0.0168, -0.0320, 0.0038], [-0.0627, 0.0065, 0.0208, ..., -0.0589, 0.0054, -0.0306]], device='cuda:0'), grad: tensor([[ 1.0622e-04, 1.5929e-05, 1.6284e-04, ..., 2.5058e-04, 3.2735e-04, 1.3387e-04], [ 3.7861e-04, 2.4121e-06, 1.8702e-03, ..., 3.3283e-03, 4.3845e-04, 1.3885e-03], [-4.7874e-04, 1.0086e-06, -1.1322e-02, ..., -2.0889e-02, -2.7523e-03, -9.0256e-03], ..., [ 4.2796e-04, 3.1125e-06, 7.3433e-03, ..., 1.3321e-02, 1.6584e-03, 5.1079e-03], [-2.1863e-04, 1.1399e-06, 7.9989e-05, ..., 1.4839e-03, -1.3840e-04, 5.3883e-04], [ 1.2350e-04, 3.2643e-07, 1.6427e-04, ..., 3.6788e-04, 7.4804e-05, 1.7965e-04]], device='cuda:0') Epoch 17, bias, value: tensor([-0.0073, -0.0280, 0.0093, -0.0206, 0.0142, 0.0024, 0.0182, -0.0114, -0.0223, 0.0011], device='cuda:0'), grad: tensor([ 4.0221e-04, 3.1872e-03, -1.6006e-02, 1.6384e-03, 1.2045e-03, 4.5133e-04, -3.0065e-04, 8.5068e-03, -3.6389e-05, 9.5892e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 16---------------------------------------------------- epoch 16, time 278.68, cls_loss 0.0692 cls_loss_mapping 0.0852 cls_loss_causal 0.9558 re_mapping 0.0389 re_causal 0.0921 /// teacc 98.14 lr 0.00010000 Epoch 18, weight, value: tensor([[ 0.0034, -0.0281, -0.0170, ..., -0.0536, -0.0348, -0.0413], [ 0.0140, -0.0052, -0.0149, ..., 0.0448, 0.0391, 0.0046], [-0.0405, -0.0111, -0.0463, ..., 0.0316, -0.0004, 0.0161], ..., [-0.0408, -0.0174, -0.0472, ..., -0.0060, -0.0194, 0.0320], [ 0.0445, 0.0047, 0.0012, ..., -0.0166, -0.0328, 0.0041], [-0.0629, 0.0023, 0.0203, ..., -0.0601, 0.0056, -0.0315]], device='cuda:0'), grad: tensor([[ 8.4415e-06, 2.2352e-08, 1.1241e-04, ..., 1.4734e-04, 8.1360e-05, 8.0764e-06], [-3.4750e-05, -1.6764e-06, 7.6532e-04, ..., 6.4421e-04, -4.0412e-04, 9.5546e-05], [ 3.6031e-05, 1.5926e-07, 3.7193e-04, ..., 4.8876e-04, 3.2473e-04, 1.8850e-05], ..., [ 2.4915e-05, 4.4238e-08, -4.7760e-03, ..., -4.2572e-03, 2.5272e-04, -6.4135e-04], [ 2.6301e-05, 6.6031e-07, 2.6665e-03, ..., 2.2373e-03, 3.5906e-04, 2.3258e-04], [ 1.7807e-05, 3.4459e-08, 2.6560e-04, ..., 5.9795e-04, -7.1526e-05, 8.8155e-05]], device='cuda:0') Epoch 18, bias, value: tensor([-0.0075, -0.0280, 0.0088, -0.0202, 0.0146, 0.0022, 0.0181, -0.0114, -0.0223, 0.0011], device='cuda:0'), grad: tensor([ 2.2817e-04, 8.4925e-04, 7.6199e-04, 3.8929e-03, 3.3975e-04, -3.6201e-03, 9.8571e-06, -6.5651e-03, 3.4313e-03, 6.6566e-04], device='cuda:0') 100 0.0001 changing lr epoch 17, time 261.99, cls_loss 0.0606 cls_loss_mapping 0.0783 cls_loss_causal 0.9925 re_mapping 0.0366 re_causal 0.0930 /// teacc 97.85 lr 0.00010000 Epoch 19, weight, value: tensor([[ 0.0039, -0.0298, -0.0182, ..., -0.0551, -0.0355, -0.0428], [ 0.0135, -0.0060, -0.0148, ..., 0.0447, 0.0393, 0.0043], [-0.0403, -0.0082, -0.0468, ..., 0.0325, 0.0010, 0.0160], ..., [-0.0414, -0.0205, -0.0479, ..., -0.0059, -0.0206, 0.0334], [ 0.0451, 0.0031, 0.0014, ..., -0.0169, -0.0339, 0.0032], [-0.0639, 0.0037, 0.0204, ..., -0.0610, 0.0057, -0.0326]], device='cuda:0'), grad: tensor([[ 6.4790e-05, 1.8207e-06, 6.0409e-05, ..., 2.4939e-04, 5.0831e-04, 2.7999e-05], [ 2.0294e-03, 2.2640e-03, 3.9458e-04, ..., 1.1299e-02, 1.1597e-02, 1.4111e-05], [-2.2202e-03, -2.3098e-03, 2.1970e-04, ..., -1.2459e-02, -1.2932e-02, -1.2361e-05], ..., [-4.3678e-04, 2.6345e-05, -1.0614e-03, ..., -2.4581e-04, -1.3900e-04, -1.4937e-04], [ 3.7193e-05, 2.0918e-06, 1.0371e-04, ..., 1.7118e-04, 2.4045e-04, 9.0227e-06], [ 2.3639e-04, 2.4028e-07, 6.1178e-04, ..., 6.7377e-04, 3.6430e-04, 8.0585e-05]], device='cuda:0') Epoch 19, bias, value: tensor([-0.0074, -0.0280, 0.0096, -0.0204, 0.0143, 0.0028, 0.0179, -0.0112, -0.0228, 0.0008], device='cuda:0'), grad: tensor([ 0.0012, 0.0116, -0.0111, 0.0005, 0.0013, -0.0052, 0.0026, -0.0038, 0.0006, 0.0023], device='cuda:0') 100 0.0001 changing lr epoch 18, time 261.97, cls_loss 0.0531 cls_loss_mapping 0.0700 cls_loss_causal 0.9371 re_mapping 0.0353 re_causal 0.0912 /// teacc 97.92 lr 0.00010000 Epoch 20, weight, value: tensor([[ 0.0046, -0.0391, -0.0189, ..., -0.0558, -0.0368, -0.0454], [ 0.0135, -0.0071, -0.0155, ..., 0.0443, 0.0394, 0.0037], [-0.0399, -0.0019, -0.0468, ..., 0.0332, 0.0020, 0.0156], ..., [-0.0415, -0.0305, -0.0483, ..., -0.0059, -0.0220, 0.0345], [ 0.0452, -0.0007, 0.0018, ..., -0.0168, -0.0347, 0.0032], [-0.0652, 0.0076, 0.0206, ..., -0.0618, 0.0067, -0.0331]], device='cuda:0'), grad: tensor([[ 1.3411e-04, 3.3863e-06, 3.6597e-04, ..., 5.0163e-04, 2.6798e-04, 1.0937e-04], [ 1.9729e-04, -7.6666e-06, 2.8419e-04, ..., 2.8658e-04, -6.9737e-05, 1.1289e-04], [ 1.9722e-03, -4.5300e-05, 8.6403e-04, ..., 1.6129e-02, 8.5373e-03, 3.0875e-04], ..., [-9.7036e-04, 1.5944e-05, 9.2030e-05, ..., -3.8767e-04, 1.9848e-04, -1.6155e-03], [-1.4839e-03, 5.3644e-06, -5.2605e-03, ..., -3.6888e-03, -2.2984e-03, 8.5652e-05], [ 5.6362e-04, 7.1600e-06, 1.6813e-03, ..., 1.0338e-03, 7.2670e-03, 6.2132e-04]], device='cuda:0') Epoch 20, bias, value: tensor([-0.0077, -0.0286, 0.0101, -0.0202, 0.0140, 0.0022, 0.0184, -0.0113, -0.0227, 0.0013], device='cuda:0'), grad: tensor([ 0.0011, 0.0005, 0.0157, -0.0131, -0.0097, 0.0027, 0.0008, -0.0020, -0.0128, 0.0168], device='cuda:0') 100 0.0001 changing lr epoch 19, time 262.13, cls_loss 0.0633 cls_loss_mapping 0.0785 cls_loss_causal 0.9255 re_mapping 0.0324 re_causal 0.0829 /// teacc 98.14 lr 0.00010000 Epoch 21, weight, value: tensor([[ 5.1018e-03, -4.2474e-02, -1.9843e-02, ..., -5.7534e-02, -3.7260e-02, -4.7040e-02], [ 1.3454e-02, -8.9705e-03, -1.5220e-02, ..., 4.4096e-02, 3.9606e-02, 3.6152e-03], [-4.1687e-02, 3.9066e-05, -4.7625e-02, ..., 3.3249e-02, 2.6951e-03, 1.4517e-02], ..., [-4.2111e-02, -2.8523e-02, -4.8739e-02, ..., -5.7513e-03, -2.2157e-02, 3.5070e-02], [ 4.6602e-02, -1.8864e-03, 2.1780e-03, ..., -1.6189e-02, -3.5679e-02, 3.4759e-03], [-6.5758e-02, 5.3955e-03, 2.0645e-02, ..., -6.2398e-02, 6.6634e-03, -3.3677e-02]], device='cuda:0'), grad: tensor([[-7.1287e-05, 7.7188e-06, 1.0215e-05, ..., 4.0352e-05, -3.4750e-05, 5.3234e-06], [ 8.9034e-06, 1.1832e-05, -1.3657e-05, ..., 3.9965e-05, 2.0787e-05, 1.4573e-05], [ 5.5850e-05, -1.1498e-04, 1.1164e-04, ..., -1.4198e-04, -1.8001e-04, -1.4074e-05], ..., [ 6.8098e-06, 2.7835e-05, 1.4506e-05, ..., -1.9848e-04, 1.5986e-04, -2.0206e-04], [-4.5933e-06, 2.1145e-05, -1.2958e-04, ..., -6.1035e-05, 1.1992e-04, 1.0863e-05], [ 3.8296e-05, 8.9183e-06, 5.2500e-04, ..., 7.3004e-04, 2.9430e-03, 8.1658e-05]], device='cuda:0') Epoch 21, bias, value: tensor([-0.0077, -0.0288, 0.0096, -0.0203, 0.0145, 0.0025, 0.0179, -0.0111, -0.0225, 0.0012], device='cuda:0'), grad: tensor([-1.5593e-04, 9.9361e-05, 2.9251e-05, 5.6839e-04, -7.6141e-03, -4.2701e-04, -2.4843e-04, -1.4412e-04, 1.3328e-04, 7.7629e-03], device='cuda:0') 100 0.0001 changing lr epoch 20, time 261.91, cls_loss 0.0411 cls_loss_mapping 0.0577 cls_loss_causal 0.8953 re_mapping 0.0320 re_causal 0.0870 /// teacc 97.95 lr 0.00010000 Epoch 22, weight, value: tensor([[ 5.5057e-03, -4.8713e-02, -2.0457e-02, ..., -5.8301e-02, -3.7313e-02, -4.7785e-02], [ 1.3723e-02, -6.9754e-03, -1.4377e-02, ..., 4.4885e-02, 4.0666e-02, 3.3136e-03], [-4.1866e-02, 6.5748e-05, -4.9471e-02, ..., 3.3447e-02, 2.2865e-03, 1.4338e-02], ..., [-4.3079e-02, -3.0418e-02, -4.8271e-02, ..., -5.4875e-03, -2.2548e-02, 3.6107e-02], [ 4.7456e-02, 2.1532e-04, 2.2991e-03, ..., -1.5973e-02, -3.6440e-02, 2.9919e-03], [-6.6437e-02, 5.4790e-03, 2.1240e-02, ..., -6.3581e-02, 6.8969e-03, -3.4409e-02]], device='cuda:0'), grad: tensor([[-3.7074e-04, -1.8394e-04, 3.3569e-04, ..., 4.8280e-04, 2.7716e-05, 3.2037e-06], [ 6.5088e-05, -8.5950e-05, -3.3236e-04, ..., -2.5463e-04, -6.9427e-04, 1.1601e-05], [ 3.6144e-04, -1.4710e-04, 2.9802e-04, ..., 4.2319e-04, 3.7336e-04, 1.4520e-04], ..., [ 1.5080e-04, 2.4170e-05, 1.6344e-04, ..., -6.8140e-04, -4.5419e-04, -3.6979e-04], [ 5.7840e-04, 1.4257e-04, 7.3862e-04, ..., 9.1553e-04, 3.9601e-04, 3.0011e-05], [ 8.3876e-04, 2.7561e-04, 1.4067e-03, ..., 1.2503e-03, 1.8978e-03, 9.8109e-05]], device='cuda:0') Epoch 22, bias, value: tensor([-0.0075, -0.0285, 0.0093, -0.0210, 0.0143, 0.0027, 0.0182, -0.0107, -0.0224, 0.0011], device='cuda:0'), grad: tensor([ 3.9577e-05, -6.1321e-04, 1.9302e-03, -8.4457e-03, -2.4300e-03, 5.8250e-03, 6.4659e-04, -1.3876e-03, 1.8063e-03, 2.6245e-03], device='cuda:0') 100 0.0001 changing lr epoch 21, time 262.43, cls_loss 0.0422 cls_loss_mapping 0.0582 cls_loss_causal 0.8725 re_mapping 0.0304 re_causal 0.0806 /// teacc 98.14 lr 0.00010000 Epoch 23, weight, value: tensor([[ 0.0056, -0.0546, -0.0213, ..., -0.0602, -0.0383, -0.0509], [ 0.0135, -0.0058, -0.0137, ..., 0.0453, 0.0419, 0.0030], [-0.0428, 0.0011, -0.0510, ..., 0.0338, 0.0027, 0.0138], ..., [-0.0433, -0.0316, -0.0490, ..., -0.0057, -0.0237, 0.0371], [ 0.0478, 0.0023, 0.0023, ..., -0.0162, -0.0374, 0.0024], [-0.0668, 0.0062, 0.0216, ..., -0.0639, 0.0071, -0.0356]], device='cuda:0'), grad: tensor([[ 2.8208e-05, -7.1973e-06, 1.0729e-04, ..., 7.6354e-05, -1.6713e-04, 1.5542e-05], [ 8.7380e-05, -2.8208e-05, -1.6456e-06, ..., -6.6340e-05, -4.2510e-04, 9.8288e-05], [ 8.0287e-05, 7.0184e-06, 2.4557e-04, ..., 2.9325e-04, 1.1563e-04, 9.6977e-05], ..., [ 2.3842e-04, -1.8969e-05, 1.2436e-03, ..., 7.2050e-04, 2.2709e-04, 1.4019e-04], [-1.2657e-06, 2.2035e-06, 9.6560e-05, ..., -1.1545e-04, 1.5140e-04, 1.9073e-05], [ 3.2258e-04, 1.3649e-05, 6.4468e-04, ..., 9.7656e-04, -1.5962e-04, 2.4176e-04]], device='cuda:0') Epoch 23, bias, value: tensor([-0.0078, -0.0282, 0.0095, -0.0205, 0.0148, 0.0022, 0.0180, -0.0111, -0.0227, 0.0013], device='cuda:0'), grad: tensor([-2.8419e-04, -3.5375e-05, 5.3072e-04, -4.1809e-03, -8.9049e-05, 9.5844e-04, 3.3402e-04, 1.4086e-03, 3.9458e-04, 9.6416e-04], device='cuda:0') 100 0.0001 changing lr epoch 22, time 262.18, cls_loss 0.0496 cls_loss_mapping 0.0644 cls_loss_causal 0.8851 re_mapping 0.0295 re_causal 0.0766 /// teacc 98.10 lr 0.00010000 Epoch 24, weight, value: tensor([[ 0.0060, -0.0574, -0.0220, ..., -0.0614, -0.0390, -0.0522], [ 0.0129, -0.0070, -0.0138, ..., 0.0448, 0.0420, 0.0025], [-0.0430, 0.0027, -0.0520, ..., 0.0343, 0.0036, 0.0135], ..., [-0.0440, -0.0316, -0.0493, ..., -0.0058, -0.0252, 0.0378], [ 0.0490, 0.0015, 0.0030, ..., -0.0157, -0.0382, 0.0022], [-0.0681, 0.0049, 0.0211, ..., -0.0650, 0.0081, -0.0365]], device='cuda:0'), grad: tensor([[ 5.5462e-05, 4.6945e-04, 6.0940e-04, ..., 7.9584e-04, 4.6682e-04, 2.5094e-05], [ 1.4091e-04, 6.9678e-05, 6.8092e-04, ..., 1.0576e-03, 2.3210e-04, 7.4089e-05], [ 3.2735e-04, -1.2803e-04, 9.3603e-04, ..., -7.0453e-05, 8.9049e-05, 3.3402e-04], ..., [-1.8225e-03, -3.1114e-04, 1.1665e-04, ..., -3.3054e-03, 3.4165e-04, -1.5354e-03], [ 8.0919e-04, 9.3231e-03, 1.1253e-02, ..., 1.2581e-02, 4.5280e-03, 3.6144e-04], [ 2.0623e-04, -7.6408e-03, -8.3313e-03, ..., -7.4196e-03, 5.2490e-03, 2.2781e-04]], device='cuda:0') Epoch 24, bias, value: tensor([-0.0079, -0.0288, 0.0097, -0.0204, 0.0144, 0.0025, 0.0181, -0.0112, -0.0226, 0.0015], device='cuda:0'), grad: tensor([ 0.0008, 0.0013, 0.0019, -0.0071, -0.0111, 0.0046, 0.0007, -0.0074, 0.0321, -0.0157], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 23---------------------------------------------------- epoch 23, time 279.15, cls_loss 0.0410 cls_loss_mapping 0.0548 cls_loss_causal 0.8936 re_mapping 0.0282 re_causal 0.0758 /// teacc 98.32 lr 0.00010000 Epoch 25, weight, value: tensor([[ 0.0063, -0.0587, -0.0228, ..., -0.0625, -0.0391, -0.0550], [ 0.0126, -0.0070, -0.0131, ..., 0.0450, 0.0429, 0.0022], [-0.0438, 0.0048, -0.0533, ..., 0.0348, 0.0042, 0.0131], ..., [-0.0444, -0.0334, -0.0496, ..., -0.0062, -0.0264, 0.0388], [ 0.0494, 0.0002, 0.0033, ..., -0.0155, -0.0398, 0.0020], [-0.0689, 0.0077, 0.0208, ..., -0.0659, 0.0091, -0.0370]], device='cuda:0'), grad: tensor([[ 7.8902e-06, 8.9183e-06, 6.7174e-05, ..., 7.5810e-06, -7.7009e-05, 3.0905e-05], [ 2.8878e-05, 2.6226e-03, 1.8680e-04, ..., 4.3106e-03, 6.3248e-03, 8.9526e-05], [-2.5654e-03, -3.1776e-03, -6.0158e-03, ..., -1.5808e-02, -1.2970e-02, 1.2207e-04], ..., [-2.4110e-05, 5.2422e-05, -1.8799e-04, ..., -7.5626e-04, 1.1152e-04, -1.1377e-03], [ 2.3270e-03, 2.4211e-04, 4.7760e-03, ..., 1.0147e-02, 5.4893e-03, 2.9624e-05], [ 1.0449e-04, 5.9381e-06, 5.1594e-04, ..., 6.6471e-04, -5.1260e-05, 5.6887e-04]], device='cuda:0') Epoch 25, bias, value: tensor([-0.0078, -0.0286, 0.0097, -0.0203, 0.0139, 0.0024, 0.0184, -0.0116, -0.0229, 0.0021], device='cuda:0'), grad: tensor([-0.0047, 0.0050, -0.0126, 0.0013, 0.0005, 0.0008, 0.0031, -0.0027, 0.0069, 0.0025], device='cuda:0') 100 0.0001 changing lr epoch 24, time 262.36, cls_loss 0.0401 cls_loss_mapping 0.0559 cls_loss_causal 0.8670 re_mapping 0.0277 re_causal 0.0744 /// teacc 98.13 lr 0.00010000 Epoch 26, weight, value: tensor([[ 6.3072e-03, -5.9249e-02, -2.3751e-02, ..., -6.3550e-02, -3.9737e-02, -5.6704e-02], [ 1.1861e-02, -8.1189e-03, -1.2558e-02, ..., 4.4919e-02, 4.2930e-02, 1.7998e-03], [-4.3228e-02, 6.8351e-03, -5.4552e-02, ..., 3.5124e-02, 5.1489e-03, 1.2409e-02], ..., [-4.4607e-02, -3.4165e-02, -4.9596e-02, ..., -5.7494e-03, -2.6601e-02, 3.9994e-02], [ 4.9796e-02, 4.4689e-05, 3.6707e-03, ..., -1.5601e-02, -4.1024e-02, 1.8276e-03], [-7.0225e-02, 7.0874e-03, 2.0145e-02, ..., -6.7072e-02, 9.3765e-03, -3.8391e-02]], device='cuda:0'), grad: tensor([[ 3.9153e-06, 7.9036e-05, 5.5611e-05, ..., 1.0777e-04, 2.2459e-04, 2.3246e-06], [ 1.2565e-04, 1.0920e-03, 2.6226e-03, ..., 5.5466e-03, 3.1834e-03, 2.4867e-04], [ 1.8597e-04, 1.3466e-03, 8.6260e-04, ..., 1.7099e-03, 4.8752e-03, 2.6539e-05], ..., [-6.9797e-05, -1.6317e-03, -4.9133e-03, ..., -9.8801e-03, -6.5575e-03, -3.4547e-04], [-6.6423e-04, 1.3161e-04, -6.6662e-04, ..., -1.2054e-03, 4.3416e-04, 3.2578e-06], [ 2.7156e-04, -2.3346e-03, 1.4753e-03, ..., 3.1528e-03, -4.2458e-03, 2.9072e-05]], device='cuda:0') Epoch 26, bias, value: tensor([-0.0078, -0.0288, 0.0096, -0.0200, 0.0138, 0.0025, 0.0183, -0.0106, -0.0231, 0.0015], device='cuda:0'), grad: tensor([ 0.0004, 0.0080, 0.0077, 0.0021, 0.0023, 0.0009, 0.0004, -0.0130, -0.0004, -0.0083], device='cuda:0') 100 0.0001 changing lr epoch 25, time 261.99, cls_loss 0.0354 cls_loss_mapping 0.0457 cls_loss_causal 0.8578 re_mapping 0.0265 re_causal 0.0736 /// teacc 98.17 lr 0.00010000 Epoch 27, weight, value: tensor([[ 0.0072, -0.0601, -0.0244, ..., -0.0639, -0.0394, -0.0577], [ 0.0110, -0.0093, -0.0122, ..., 0.0448, 0.0434, 0.0014], [-0.0435, 0.0074, -0.0557, ..., 0.0356, 0.0051, 0.0124], ..., [-0.0460, -0.0327, -0.0496, ..., -0.0055, -0.0269, 0.0404], [ 0.0509, 0.0010, 0.0044, ..., -0.0151, -0.0414, 0.0018], [-0.0713, 0.0069, 0.0196, ..., -0.0682, 0.0091, -0.0389]], device='cuda:0'), grad: tensor([[ 1.7405e-05, 2.4308e-06, 5.7310e-05, ..., 7.9930e-05, 3.8713e-05, 5.6326e-06], [-7.8917e-04, -4.1890e-04, -2.3136e-03, ..., -2.4567e-03, -2.5787e-03, 1.7852e-05], [-7.4971e-07, 4.0054e-05, 1.1196e-03, ..., 5.7125e-04, 4.1842e-04, 1.7524e-05], ..., [ 4.4346e-05, 4.0054e-05, 2.5249e-04, ..., -4.6223e-05, 1.8716e-04, -5.6148e-05], [ 6.3515e-04, 2.2507e-04, 4.6468e-04, ..., 1.4544e-03, 1.0576e-03, 5.5507e-06], [ 4.5627e-05, 3.1684e-06, 2.6837e-05, ..., 2.2328e-04, 6.1572e-05, -9.6440e-05]], device='cuda:0') Epoch 27, bias, value: tensor([-0.0073, -0.0290, 0.0097, -0.0202, 0.0142, 0.0026, 0.0177, -0.0106, -0.0228, 0.0011], device='cuda:0'), grad: tensor([-2.9534e-05, -3.7022e-03, 1.5240e-03, -1.8013e-04, 1.2007e-03, 2.3675e-04, 1.5044e-04, -6.3002e-05, 9.8515e-04, -1.2600e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 26---------------------------------------------------- epoch 26, time 278.81, cls_loss 0.0345 cls_loss_mapping 0.0466 cls_loss_causal 0.8280 re_mapping 0.0265 re_causal 0.0713 /// teacc 98.44 lr 0.00010000 Epoch 28, weight, value: tensor([[ 0.0076, -0.0609, -0.0251, ..., -0.0647, -0.0402, -0.0584], [ 0.0109, -0.0092, -0.0126, ..., 0.0445, 0.0441, 0.0009], [-0.0448, 0.0078, -0.0560, ..., 0.0356, 0.0053, 0.0120], ..., [-0.0470, -0.0335, -0.0492, ..., -0.0051, -0.0270, 0.0410], [ 0.0511, 0.0010, 0.0047, ..., -0.0151, -0.0421, 0.0018], [-0.0720, 0.0072, 0.0197, ..., -0.0686, 0.0097, -0.0392]], device='cuda:0'), grad: tensor([[ 2.8033e-06, 7.7039e-06, 1.2152e-05, ..., 2.9683e-05, -1.5751e-05, 1.2340e-06], [ 3.5129e-06, -2.5351e-06, -5.7161e-05, ..., -2.6003e-05, -8.7678e-05, 1.0088e-05], [ 2.1517e-05, -3.1859e-05, 3.0458e-05, ..., 9.7677e-06, -1.0155e-05, 7.3433e-05], ..., [-3.6418e-05, -5.4166e-06, 5.8126e-04, ..., -5.5730e-05, 7.8630e-04, -1.5759e-04], [ 8.7637e-07, 8.6948e-06, -4.3184e-05, ..., 4.7088e-06, 5.0306e-05, 8.6278e-06], [ 4.3474e-06, 2.1011e-06, -5.8603e-04, ..., -1.1277e-04, -8.6594e-04, 8.7321e-06]], device='cuda:0') Epoch 28, bias, value: tensor([-0.0079, -0.0293, 0.0094, -0.0199, 0.0140, 0.0027, 0.0179, -0.0104, -0.0230, 0.0015], device='cuda:0'), grad: tensor([-2.7823e-04, -2.9668e-05, 6.9141e-05, 5.9557e-04, 3.7718e-04, -3.8505e-04, -4.6670e-05, 3.9043e-03, 9.2864e-05, -4.2992e-03], device='cuda:0') 100 0.0001 changing lr epoch 27, time 262.28, cls_loss 0.0383 cls_loss_mapping 0.0468 cls_loss_causal 0.8278 re_mapping 0.0245 re_causal 0.0644 /// teacc 98.26 lr 0.00010000 Epoch 29, weight, value: tensor([[ 0.0077, -0.0621, -0.0258, ..., -0.0655, -0.0403, -0.0597], [ 0.0105, -0.0097, -0.0119, ..., 0.0449, 0.0447, 0.0007], [-0.0445, 0.0095, -0.0573, ..., 0.0359, 0.0061, 0.0120], ..., [-0.0465, -0.0334, -0.0502, ..., -0.0054, -0.0281, 0.0412], [ 0.0514, 0.0007, 0.0052, ..., -0.0147, -0.0430, 0.0022], [-0.0735, 0.0068, 0.0192, ..., -0.0696, 0.0093, -0.0402]], device='cuda:0'), grad: tensor([[ 5.0753e-05, 1.5438e-05, 7.3016e-05, ..., 6.7353e-05, 3.7456e-04, 6.2697e-06], [ 4.6074e-05, -1.2830e-05, -2.3210e-04, ..., -4.2737e-05, -3.2449e-04, 1.2159e-05], [ 1.1760e-04, -2.8706e-04, 3.4595e-04, ..., 1.4961e-04, 2.7084e-04, 1.9062e-04], ..., [-2.6107e-05, 1.6499e-04, 8.6248e-05, ..., -2.3484e-04, 1.4055e-04, -2.9159e-04], [ 2.0170e-04, 5.0068e-05, 3.0470e-04, ..., 4.0078e-04, 2.1315e-04, 5.5254e-05], [ 8.4519e-05, -3.2969e-06, 2.5368e-04, ..., 1.0896e-04, -8.6784e-04, 1.3426e-05]], device='cuda:0') Epoch 29, bias, value: tensor([-0.0072, -0.0291, 0.0098, -0.0198, 0.0134, 0.0028, 0.0178, -0.0105, -0.0230, 0.0010], device='cuda:0'), grad: tensor([ 8.9979e-04, -6.0976e-05, 9.1028e-04, 5.4312e-04, 4.7636e-04, -1.2932e-03, -5.2357e-04, 3.8326e-05, 8.7404e-04, -1.8663e-03], device='cuda:0') 100 0.0001 changing lr epoch 28, time 262.22, cls_loss 0.0367 cls_loss_mapping 0.0467 cls_loss_causal 0.8258 re_mapping 0.0253 re_causal 0.0682 /// teacc 98.23 lr 0.00010000 Epoch 30, weight, value: tensor([[ 0.0082, -0.0637, -0.0264, ..., -0.0661, -0.0404, -0.0606], [ 0.0104, -0.0104, -0.0115, ..., 0.0448, 0.0447, 0.0003], [-0.0451, 0.0112, -0.0575, ..., 0.0364, 0.0069, 0.0113], ..., [-0.0466, -0.0327, -0.0508, ..., -0.0054, -0.0290, 0.0418], [ 0.0519, 0.0002, 0.0058, ..., -0.0145, -0.0435, 0.0028], [-0.0740, 0.0071, 0.0187, ..., -0.0706, 0.0094, -0.0407]], device='cuda:0'), grad: tensor([[ 7.0827e-07, 1.2897e-05, 3.8266e-05, ..., 1.7449e-05, -3.7879e-05, 1.9129e-06], [-1.4435e-06, 4.4912e-05, 1.4591e-04, ..., -1.2383e-05, -9.3520e-05, 2.6315e-05], [ 5.8524e-06, 4.9210e-04, 3.1161e-04, ..., 6.3848e-04, 4.3440e-04, 7.0989e-05], ..., [ 1.4044e-06, 1.1496e-05, -5.3585e-05, ..., -4.8327e-04, 9.6142e-05, -2.8396e-04], [ 2.6077e-06, -3.9554e-04, 1.4246e-04, ..., -3.8123e-04, -1.4150e-04, 1.3754e-05], [ 6.5751e-06, -1.7178e-04, -4.7398e-04, ..., 1.8203e-04, -3.9530e-04, 4.7207e-05]], device='cuda:0') Epoch 30, bias, value: tensor([-0.0074, -0.0290, 0.0100, -0.0199, 0.0136, 0.0031, 0.0177, -0.0105, -0.0230, 0.0007], device='cuda:0'), grad: tensor([-3.4857e-04, 3.2330e-04, 1.5545e-03, 3.2973e-04, 3.1137e-04, -1.9073e-04, 7.8201e-05, -1.0452e-03, -4.4727e-04, -5.6458e-04], device='cuda:0') 100 0.0001 changing lr epoch 29, time 262.24, cls_loss 0.0331 cls_loss_mapping 0.0445 cls_loss_causal 0.8056 re_mapping 0.0250 re_causal 0.0670 /// teacc 98.40 lr 0.00010000 Epoch 31, weight, value: tensor([[ 0.0088, -0.0646, -0.0272, ..., -0.0668, -0.0416, -0.0622], [ 0.0096, -0.0116, -0.0114, ..., 0.0445, 0.0450, -0.0004], [-0.0447, 0.0114, -0.0584, ..., 0.0366, 0.0069, 0.0109], ..., [-0.0473, -0.0319, -0.0506, ..., -0.0050, -0.0294, 0.0428], [ 0.0524, 0.0012, 0.0063, ..., -0.0143, -0.0436, 0.0023], [-0.0744, 0.0071, 0.0184, ..., -0.0711, 0.0102, -0.0411]], device='cuda:0'), grad: tensor([[ 1.2994e-05, 3.0443e-05, 2.2054e-04, ..., 1.0896e-04, 1.1665e-04, 3.0220e-05], [ 1.7703e-04, -3.3474e-03, -4.2610e-03, ..., -2.8458e-03, -7.6065e-03, 3.1567e-04], [ 2.9159e-04, 2.4274e-05, 8.0681e-04, ..., 3.7937e-03, 8.8882e-04, 5.5981e-04], ..., [-7.6437e-04, 9.0003e-05, -2.7142e-03, ..., -2.1420e-03, 4.6802e-04, -1.6193e-03], [ 9.7081e-06, 2.4643e-03, 3.1338e-03, ..., 2.5692e-03, 5.7411e-03, 5.5939e-05], [ 3.1024e-05, 1.5469e-03, 3.7527e-04, ..., 1.8823e-04, 9.6130e-03, 2.0719e-04]], device='cuda:0') Epoch 31, bias, value: tensor([-0.0079, -0.0294, 0.0096, -0.0199, 0.0134, 0.0030, 0.0183, -0.0099, -0.0228, 0.0007], device='cuda:0'), grad: tensor([ 0.0010, -0.0120, 0.0035, -0.0008, -0.0104, 0.0089, 0.0026, -0.0158, 0.0102, 0.0129], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 30---------------------------------------------------- epoch 30, time 278.28, cls_loss 0.0268 cls_loss_mapping 0.0358 cls_loss_causal 0.7893 re_mapping 0.0241 re_causal 0.0665 /// teacc 98.54 lr 0.00010000 Epoch 32, weight, value: tensor([[ 0.0089, -0.0642, -0.0278, ..., -0.0676, -0.0421, -0.0639], [ 0.0100, -0.0096, -0.0109, ..., 0.0448, 0.0461, -0.0009], [-0.0450, 0.0117, -0.0594, ..., 0.0367, 0.0071, 0.0107], ..., [-0.0479, -0.0321, -0.0504, ..., -0.0046, -0.0309, 0.0442], [ 0.0526, 0.0011, 0.0063, ..., -0.0145, -0.0447, 0.0019], [-0.0750, 0.0055, 0.0182, ..., -0.0719, 0.0105, -0.0416]], device='cuda:0'), grad: tensor([[ 3.1710e-05, 6.2227e-05, 3.4362e-05, ..., 8.5473e-05, 2.5535e-04, 9.8869e-06], [ 1.2189e-04, 2.7132e-04, -1.4365e-04, ..., 1.3411e-05, 6.4898e-04, 1.2301e-05], [ 2.4581e-04, 1.5688e-04, 2.7442e-04, ..., 6.8903e-04, 6.4182e-04, 2.9624e-05], ..., [ 5.7779e-06, 8.6799e-06, 1.5271e-04, ..., -3.4285e-04, 5.5790e-04, -1.3733e-04], [-2.0361e-04, 5.6803e-05, -3.9861e-06, ..., -6.6328e-04, 2.5392e-04, 3.3379e-05], [ 1.9848e-05, 3.7588e-06, -1.3697e-04, ..., 2.0957e-04, -8.0061e-04, -1.7309e-04]], device='cuda:0') Epoch 32, bias, value: tensor([-0.0077, -0.0289, 0.0093, -0.0202, 0.0138, 0.0028, 0.0183, -0.0096, -0.0230, 0.0005], device='cuda:0'), grad: tensor([ 3.4022e-04, 7.1859e-04, 1.4277e-03, 4.6468e-04, -1.0977e-03, -2.9707e-04, -1.9054e-03, 2.0909e-04, -3.5644e-05, 1.7536e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 31---------------------------------------------------- epoch 31, time 278.37, cls_loss 0.0342 cls_loss_mapping 0.0480 cls_loss_causal 0.8259 re_mapping 0.0226 re_causal 0.0620 /// teacc 98.56 lr 0.00010000 Epoch 33, weight, value: tensor([[ 0.0093, -0.0653, -0.0286, ..., -0.0685, -0.0423, -0.0648], [ 0.0094, -0.0098, -0.0108, ..., 0.0451, 0.0469, -0.0016], [-0.0445, 0.0128, -0.0598, ..., 0.0368, 0.0075, 0.0105], ..., [-0.0483, -0.0320, -0.0504, ..., -0.0045, -0.0317, 0.0454], [ 0.0531, 0.0002, 0.0070, ..., -0.0143, -0.0457, 0.0015], [-0.0759, 0.0050, 0.0179, ..., -0.0723, 0.0105, -0.0427]], device='cuda:0'), grad: tensor([[ 2.3991e-06, 2.2694e-05, 2.6196e-05, ..., 1.5259e-04, 6.4611e-05, -2.2376e-04], [-6.0886e-05, -8.6334e-07, -3.3927e-04, ..., -2.0015e-04, -6.5708e-04, 1.7250e-04], [ 9.0823e-06, -3.3808e-04, -1.1998e-04, ..., -2.0428e-03, -8.6880e-04, 4.3154e-04], ..., [ 1.0148e-05, -1.7524e-04, -1.4186e-04, ..., -1.5879e-03, 1.0423e-05, -1.4668e-03], [-1.0923e-05, 2.7394e-04, 5.3978e-04, ..., 1.8740e-03, 7.9632e-04, 8.8871e-05], [ 9.2685e-06, 2.5868e-05, 6.9201e-05, ..., 1.8370e-04, 2.3276e-05, 7.6294e-05]], device='cuda:0') Epoch 33, bias, value: tensor([-0.0072, -0.0291, 0.0094, -0.0198, 0.0136, 0.0021, 0.0184, -0.0092, -0.0229, 0.0001], device='cuda:0'), grad: tensor([-0.0007, -0.0005, -0.0021, 0.0017, 0.0006, -0.0019, 0.0008, -0.0013, 0.0031, 0.0002], device='cuda:0') 100 0.0001 changing lr epoch 32, time 262.58, cls_loss 0.0348 cls_loss_mapping 0.0443 cls_loss_causal 0.7970 re_mapping 0.0225 re_causal 0.0618 /// teacc 98.35 lr 0.00010000 Epoch 34, weight, value: tensor([[ 0.0092, -0.0656, -0.0295, ..., -0.0692, -0.0425, -0.0667], [ 0.0090, -0.0097, -0.0104, ..., 0.0457, 0.0479, -0.0020], [-0.0446, 0.0140, -0.0600, ..., 0.0370, 0.0076, 0.0105], ..., [-0.0483, -0.0324, -0.0509, ..., -0.0050, -0.0332, 0.0457], [ 0.0531, -0.0007, 0.0070, ..., -0.0143, -0.0466, 0.0012], [-0.0763, 0.0058, 0.0177, ..., -0.0728, 0.0116, -0.0436]], device='cuda:0'), grad: tensor([[ 1.4409e-05, 4.8578e-06, 1.2450e-05, ..., 4.8310e-05, 2.1696e-05, 2.4661e-05], [ 1.9744e-05, 1.9193e-05, -1.8156e-04, ..., 2.3258e-04, -1.3638e-04, 1.3924e-04], [ 5.7310e-05, -1.5363e-05, 5.3853e-05, ..., -1.5354e-03, -1.2226e-03, -2.8825e-04], ..., [-2.1851e-04, -5.9873e-05, -3.5316e-05, ..., 2.1899e-04, 9.8705e-04, -4.1461e-04], [ 1.6764e-05, 6.4038e-06, 9.2238e-06, ..., 5.0932e-05, 1.1241e-04, 4.0740e-05], [ 3.8087e-05, 1.2472e-05, 1.2383e-05, ..., 1.3828e-04, -4.6104e-05, 1.0234e-04]], device='cuda:0') Epoch 34, bias, value: tensor([-0.0072, -0.0288, 0.0095, -0.0195, 0.0136, 0.0024, 0.0181, -0.0094, -0.0234, 0.0001], device='cuda:0'), grad: tensor([-6.0177e-04, 2.6250e-04, -1.0777e-03, 9.2506e-04, -8.2779e-03, 7.2765e-04, 6.9737e-05, 7.5493e-03, 3.1543e-04, 1.0788e-04], device='cuda:0') 100 0.0001 changing lr epoch 33, time 262.07, cls_loss 0.0319 cls_loss_mapping 0.0387 cls_loss_causal 0.7821 re_mapping 0.0226 re_causal 0.0613 /// teacc 98.25 lr 0.00010000 Epoch 35, weight, value: tensor([[ 0.0099, -0.0654, -0.0302, ..., -0.0696, -0.0431, -0.0682], [ 0.0087, -0.0097, -0.0097, ..., 0.0462, 0.0488, -0.0025], [-0.0445, 0.0148, -0.0608, ..., 0.0376, 0.0083, 0.0105], ..., [-0.0492, -0.0325, -0.0512, ..., -0.0053, -0.0346, 0.0464], [ 0.0536, -0.0010, 0.0075, ..., -0.0140, -0.0470, 0.0014], [-0.0770, 0.0055, 0.0172, ..., -0.0739, 0.0119, -0.0449]], device='cuda:0'), grad: tensor([[ 5.0634e-05, -1.6019e-05, 3.9697e-04, ..., 3.9792e-04, 2.9221e-05, 3.7044e-05], [ 1.8343e-05, 1.1399e-05, 1.0377e-04, ..., 2.0337e-04, -2.9132e-05, 5.1349e-05], [ 1.4675e-04, -2.6315e-05, 4.2629e-04, ..., 8.5926e-04, 2.0519e-05, 2.8920e-04], ..., [-7.0892e-06, 1.5765e-05, -4.0144e-05, ..., -1.3714e-03, 1.9717e-04, -7.1144e-04], [-4.9686e-04, 5.1856e-06, -8.9264e-04, ..., -1.3390e-03, 4.8488e-05, 2.8342e-05], [ 4.3094e-05, 8.1724e-07, -6.8188e-05, ..., 3.9124e-04, -1.0192e-04, 5.9426e-05]], device='cuda:0') Epoch 35, bias, value: tensor([-0.0064, -0.0285, 0.0098, -0.0199, 0.0135, 0.0022, 0.0178, -0.0096, -0.0231, -0.0002], device='cuda:0'), grad: tensor([ 0.0004, 0.0003, 0.0012, 0.0241, 0.0001, -0.0286, 0.0041, -0.0007, -0.0008, -0.0003], device='cuda:0') 100 0.0001 changing lr epoch 34, time 262.65, cls_loss 0.0290 cls_loss_mapping 0.0428 cls_loss_causal 0.7885 re_mapping 0.0223 re_causal 0.0620 /// teacc 98.52 lr 0.00010000 Epoch 36, weight, value: tensor([[ 0.0098, -0.0660, -0.0314, ..., -0.0710, -0.0437, -0.0699], [ 0.0086, -0.0104, -0.0098, ..., 0.0463, 0.0490, -0.0029], [-0.0450, 0.0182, -0.0611, ..., 0.0386, 0.0092, 0.0098], ..., [-0.0489, -0.0332, -0.0522, ..., -0.0057, -0.0356, 0.0472], [ 0.0544, -0.0027, 0.0086, ..., -0.0135, -0.0482, 0.0015], [-0.0782, 0.0055, 0.0172, ..., -0.0744, 0.0130, -0.0451]], device='cuda:0'), grad: tensor([[ 1.2212e-05, 2.7064e-06, 3.5554e-05, ..., 2.9817e-05, 2.9638e-05, -7.8557e-07], [-2.2113e-05, 5.4948e-06, -6.7770e-05, ..., -4.3541e-05, -3.3712e-04, 6.1952e-06], [ 6.2704e-05, -2.4050e-05, 1.8704e-04, ..., 1.9526e-04, 2.2441e-05, 1.2450e-05], ..., [ 2.4974e-05, 1.3664e-05, -7.3671e-05, ..., -1.2960e-03, 1.4031e-04, -3.1590e-05], [-6.3717e-05, -6.8210e-06, -3.0947e-04, ..., -3.1757e-04, 2.3293e-04, 1.1787e-05], [ 4.7147e-05, 1.3532e-06, -2.9135e-04, ..., 6.8665e-04, -1.4172e-03, 2.6356e-06]], device='cuda:0') Epoch 36, bias, value: tensor([-0.0067, -0.0288, 0.0105, -0.0203, 0.0131, 0.0023, 0.0175, -0.0095, -0.0228, 0.0001], device='cuda:0'), grad: tensor([ 3.8683e-05, -2.7990e-04, 2.4390e-04, 1.0376e-03, 1.7805e-03, -6.6519e-05, 1.9515e-04, -1.6413e-03, -2.2161e-04, -1.0853e-03], device='cuda:0') 100 0.0001 changing lr epoch 35, time 262.24, cls_loss 0.0240 cls_loss_mapping 0.0323 cls_loss_causal 0.7610 re_mapping 0.0206 re_causal 0.0588 /// teacc 98.37 lr 0.00010000 Epoch 37, weight, value: tensor([[ 0.0099, -0.0665, -0.0321, ..., -0.0721, -0.0440, -0.0702], [ 0.0085, -0.0100, -0.0093, ..., 0.0467, 0.0499, -0.0035], [-0.0454, 0.0186, -0.0623, ..., 0.0384, 0.0091, 0.0095], ..., [-0.0491, -0.0341, -0.0523, ..., -0.0054, -0.0364, 0.0484], [ 0.0549, -0.0017, 0.0089, ..., -0.0133, -0.0492, 0.0011], [-0.0792, 0.0053, 0.0167, ..., -0.0755, 0.0132, -0.0466]], device='cuda:0'), grad: tensor([[ 1.4231e-05, 1.5303e-05, 3.1590e-05, ..., 4.5598e-05, 2.9311e-05, 7.2308e-06], [ 5.8293e-05, 5.8800e-05, 2.5439e-04, ..., 3.4070e-04, 3.1471e-05, 3.5197e-05], [ 7.8678e-05, -1.7178e-04, 1.0556e-04, ..., -9.6917e-05, -1.4770e-04, 8.4341e-05], ..., [-2.5257e-05, -8.6352e-06, 6.6662e-04, ..., 2.9469e-04, 5.0843e-05, -2.7680e-04], [ 8.8394e-05, 3.6776e-05, 3.0994e-04, ..., 2.0039e-04, 1.6713e-04, 9.8422e-06], [ 3.9983e-04, 7.9796e-06, 1.0519e-03, ..., 9.3889e-04, -7.8976e-05, 7.2777e-05]], device='cuda:0') Epoch 37, bias, value: tensor([-0.0066, -0.0283, 0.0101, -0.0199, 0.0132, 0.0020, 0.0180, -0.0096, -0.0230, -0.0004], device='cuda:0'), grad: tensor([ 7.6115e-05, 3.9768e-04, -1.3657e-05, -2.3880e-03, 1.7309e-04, 7.3075e-05, 1.1069e-04, 3.1304e-04, 5.6458e-04, 6.9332e-04], device='cuda:0') 100 0.0001 changing lr epoch 36, time 262.20, cls_loss 0.0231 cls_loss_mapping 0.0312 cls_loss_causal 0.7677 re_mapping 0.0210 re_causal 0.0595 /// teacc 98.45 lr 0.00010000 Epoch 38, weight, value: tensor([[ 0.0095, -0.0680, -0.0331, ..., -0.0728, -0.0446, -0.0709], [ 0.0083, -0.0119, -0.0095, ..., 0.0464, 0.0499, -0.0045], [-0.0457, 0.0208, -0.0623, ..., 0.0389, 0.0102, 0.0093], ..., [-0.0487, -0.0345, -0.0519, ..., -0.0049, -0.0374, 0.0498], [ 0.0551, -0.0012, 0.0091, ..., -0.0134, -0.0499, 0.0007], [-0.0792, 0.0056, 0.0166, ..., -0.0762, 0.0134, -0.0478]], device='cuda:0'), grad: tensor([[ 3.6448e-05, -5.7109e-06, 6.2644e-05, ..., 1.8167e-04, 1.1545e-04, 4.5747e-06], [ 2.9683e-05, -1.9062e-04, -1.1778e-03, ..., -1.3657e-03, -1.8549e-03, -9.0837e-05], [-9.1028e-04, 4.2349e-05, -6.3133e-04, ..., -6.1378e-03, -1.4839e-03, -8.8274e-05], ..., [ 9.1612e-05, 2.1040e-05, 1.5059e-03, ..., 1.7366e-03, 1.8206e-03, 7.5996e-05], [-8.4925e-04, -3.4839e-05, -4.9829e-04, ..., -1.1988e-03, -7.5758e-05, 2.6181e-05], [ 4.1223e-04, 5.2273e-05, -6.0749e-04, ..., 4.0621e-05, -7.7581e-04, 1.8510e-07]], device='cuda:0') Epoch 38, bias, value: tensor([-0.0070, -0.0287, 0.0104, -0.0202, 0.0131, 0.0023, 0.0180, -0.0090, -0.0231, -0.0004], device='cuda:0'), grad: tensor([ 0.0003, -0.0018, -0.0051, 0.0065, 0.0007, 0.0007, -0.0002, 0.0035, -0.0044, -0.0002], device='cuda:0') 100 0.0001 changing lr epoch 37, time 262.27, cls_loss 0.0274 cls_loss_mapping 0.0372 cls_loss_causal 0.7886 re_mapping 0.0199 re_causal 0.0571 /// teacc 98.49 lr 0.00010000 Epoch 39, weight, value: tensor([[ 0.0094, -0.0696, -0.0339, ..., -0.0738, -0.0451, -0.0716], [ 0.0091, -0.0125, -0.0087, ..., 0.0466, 0.0505, -0.0046], [-0.0467, 0.0223, -0.0632, ..., 0.0390, 0.0104, 0.0085], ..., [-0.0491, -0.0350, -0.0521, ..., -0.0052, -0.0382, 0.0506], [ 0.0554, -0.0015, 0.0100, ..., -0.0127, -0.0507, 0.0015], [-0.0799, 0.0066, 0.0160, ..., -0.0767, 0.0144, -0.0494]], device='cuda:0'), grad: tensor([[ 2.2557e-06, 5.9843e-05, 2.9683e-05, ..., 7.0035e-05, 1.8787e-04, 3.2634e-06], [ 1.3523e-05, 1.4174e-04, 1.0777e-03, ..., 1.8930e-04, 2.3060e-03, 1.4909e-05], [ 2.8864e-05, -9.0456e-04, 1.5628e-04, ..., -5.6744e-04, -2.4872e-03, 2.5854e-05], ..., [-8.0347e-04, -1.0788e-05, -1.7281e-03, ..., -1.3304e-03, -3.0403e-03, -1.4114e-04], [ 5.3078e-05, -2.1911e-04, -9.1553e-04, ..., -5.5122e-04, -1.6594e-04, 5.4479e-05], [ 7.2527e-04, 2.1505e-04, 1.4029e-03, ..., 1.9321e-03, 8.1301e-04, 2.0459e-05]], device='cuda:0') Epoch 39, bias, value: tensor([-0.0073, -0.0286, 0.0102, -0.0201, 0.0125, 0.0022, 0.0176, -0.0085, -0.0226, -0.0003], device='cuda:0'), grad: tensor([ 0.0002, 0.0026, -0.0014, -0.0003, 0.0010, 0.0001, 0.0009, -0.0234, -0.0015, 0.0218], device='cuda:0') 100 0.0001 changing lr epoch 38, time 262.34, cls_loss 0.0265 cls_loss_mapping 0.0396 cls_loss_causal 0.7396 re_mapping 0.0201 re_causal 0.0566 /// teacc 98.49 lr 0.00010000 Epoch 40, weight, value: tensor([[ 0.0107, -0.0706, -0.0341, ..., -0.0740, -0.0451, -0.0723], [ 0.0082, -0.0125, -0.0084, ..., 0.0465, 0.0503, -0.0049], [-0.0465, 0.0235, -0.0643, ..., 0.0391, 0.0109, 0.0080], ..., [-0.0491, -0.0358, -0.0527, ..., -0.0053, -0.0391, 0.0512], [ 0.0557, -0.0024, 0.0102, ..., -0.0127, -0.0516, 0.0012], [-0.0799, 0.0064, 0.0166, ..., -0.0771, 0.0153, -0.0502]], device='cuda:0'), grad: tensor([[ 3.7309e-06, 2.4736e-05, 4.0293e-05, ..., 8.5890e-05, 8.2135e-05, 1.2256e-06], [ 1.7017e-05, -1.1330e-03, 2.4834e-03, ..., -5.4970e-03, -3.9444e-03, 1.7971e-05], [ 7.0296e-06, 9.4271e-04, 5.0694e-05, ..., 5.7106e-03, 4.6005e-03, 7.7188e-06], ..., [-1.3448e-05, 4.3124e-05, 1.3202e-05, ..., -5.6654e-05, 2.0361e-04, -1.9789e-04], [ 1.8194e-05, 5.9813e-05, 8.6498e-04, ..., 5.4169e-04, 5.6887e-04, 7.8797e-05], [ 1.1362e-05, 2.0340e-05, 3.5644e-04, ..., 3.5572e-04, 3.7098e-04, 5.2452e-05]], device='cuda:0') Epoch 40, bias, value: tensor([-0.0071, -0.0289, 0.0101, -0.0200, 0.0125, 0.0019, 0.0176, -0.0088, -0.0229, 0.0006], device='cuda:0'), grad: tensor([ 0.0001, -0.0032, 0.0062, 0.0032, -0.0003, -0.0085, 0.0003, -0.0002, 0.0021, 0.0003], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 39---------------------------------------------------- epoch 39, time 280.55, cls_loss 0.0220 cls_loss_mapping 0.0311 cls_loss_causal 0.7567 re_mapping 0.0193 re_causal 0.0572 /// teacc 98.60 lr 0.00010000 Epoch 41, weight, value: tensor([[ 0.0106, -0.0714, -0.0349, ..., -0.0747, -0.0455, -0.0736], [ 0.0092, -0.0129, -0.0090, ..., 0.0459, 0.0505, -0.0060], [-0.0467, 0.0251, -0.0649, ..., 0.0395, 0.0114, 0.0078], ..., [-0.0507, -0.0368, -0.0520, ..., -0.0050, -0.0396, 0.0518], [ 0.0559, -0.0025, 0.0107, ..., -0.0125, -0.0518, 0.0008], [-0.0806, 0.0060, 0.0162, ..., -0.0778, 0.0154, -0.0511]], device='cuda:0'), grad: tensor([[-3.0208e-04, 2.4885e-05, 1.2803e-04, ..., 1.1152e-04, -3.7842e-03, 6.1765e-06], [ 7.2896e-05, 9.0301e-06, 5.2929e-04, ..., 1.4830e-04, 1.4162e-04, 2.2411e-04], [ 7.7248e-05, -6.9916e-05, 9.9480e-05, ..., -1.0335e-04, -4.4256e-05, 1.5751e-05], ..., [ 1.2927e-06, 1.7717e-05, -3.1700e-03, ..., -5.0497e-04, 8.3208e-05, -1.4772e-03], [-5.2452e-04, -5.7638e-05, -6.6948e-04, ..., -5.5647e-04, 1.2302e-04, 1.4886e-05], [ 2.0564e-05, 5.4874e-06, 2.2049e-03, ..., 3.6383e-04, -1.4484e-04, 1.0357e-03]], device='cuda:0') Epoch 41, bias, value: tensor([-7.2204e-03, -2.8990e-02, 1.0248e-02, -1.9564e-02, 1.3015e-02, 2.1794e-03, 1.6977e-02, -8.7730e-03, -2.2834e-02, -2.4634e-05], device='cuda:0'), grad: tensor([-8.8577e-03, 1.5793e-03, 9.6083e-05, 8.4448e-04, 1.1358e-03, 2.7204e-04, 7.7362e-03, -7.5455e-03, -8.0299e-04, 5.5504e-03], device='cuda:0') 100 0.0001 changing lr epoch 40, time 262.33, cls_loss 0.0192 cls_loss_mapping 0.0284 cls_loss_causal 0.7486 re_mapping 0.0181 re_causal 0.0551 /// teacc 98.58 lr 0.00010000 Epoch 42, weight, value: tensor([[ 0.0109, -0.0719, -0.0355, ..., -0.0752, -0.0460, -0.0745], [ 0.0096, -0.0134, -0.0086, ..., 0.0461, 0.0507, -0.0064], [-0.0471, 0.0260, -0.0658, ..., 0.0394, 0.0113, 0.0071], ..., [-0.0507, -0.0371, -0.0518, ..., -0.0048, -0.0403, 0.0529], [ 0.0559, -0.0036, 0.0108, ..., -0.0127, -0.0528, 0.0001], [-0.0812, 0.0045, 0.0162, ..., -0.0787, 0.0161, -0.0520]], device='cuda:0'), grad: tensor([[ 6.3442e-06, 2.3663e-05, 3.5554e-05, ..., 3.7640e-05, -2.8357e-05, -5.3309e-06], [-7.6145e-06, 4.4727e-04, -4.4131e-04, ..., 4.6301e-04, -9.4116e-05, 1.3806e-05], [ 3.5584e-05, -2.3556e-03, 6.8188e-05, ..., -1.5030e-03, -3.0651e-03, 3.5856e-06], ..., [ 7.6229e-07, 7.2956e-05, -2.8658e-04, ..., 1.8120e-05, 3.1185e-04, -4.8923e-04], [ 2.8059e-05, 7.7844e-05, 2.6798e-04, ..., 1.8501e-04, 5.5313e-04, 9.8720e-06], [ 2.2277e-05, 5.2378e-06, 3.1757e-04, ..., 1.6916e-04, -3.2991e-05, 3.2783e-04]], device='cuda:0') Epoch 42, bias, value: tensor([-7.5196e-03, -2.9081e-02, 9.6837e-03, -1.9521e-02, 1.2586e-02, 2.6258e-03, 1.7738e-02, -8.3705e-03, -2.3273e-02, 8.4240e-05], device='cuda:0'), grad: tensor([-3.2210e-04, -1.6034e-04, -2.6398e-03, 3.9071e-05, 5.3835e-04, 1.2708e-04, 1.5869e-03, -4.8971e-04, 7.4387e-04, 5.7745e-04], device='cuda:0') 100 0.0001 changing lr epoch 41, time 262.33, cls_loss 0.0250 cls_loss_mapping 0.0346 cls_loss_causal 0.7460 re_mapping 0.0184 re_causal 0.0515 /// teacc 98.38 lr 0.00010000 Epoch 43, weight, value: tensor([[ 0.0109, -0.0737, -0.0363, ..., -0.0760, -0.0459, -0.0754], [ 0.0100, -0.0132, -0.0083, ..., 0.0462, 0.0515, -0.0072], [-0.0483, 0.0280, -0.0668, ..., 0.0400, 0.0116, 0.0072], ..., [-0.0508, -0.0386, -0.0517, ..., -0.0052, -0.0410, 0.0528], [ 0.0561, -0.0037, 0.0105, ..., -0.0131, -0.0541, -0.0007], [-0.0826, 0.0034, 0.0157, ..., -0.0794, 0.0161, -0.0528]], device='cuda:0'), grad: tensor([[ 2.1994e-05, 8.9407e-06, 7.6652e-05, ..., 3.0935e-05, 5.9456e-06, 4.9733e-07], [ 7.2792e-06, 1.6198e-05, 1.8328e-05, ..., 3.8892e-05, -6.5453e-06, 9.6485e-06], [ 1.4879e-05, -6.5148e-05, 7.0810e-05, ..., 1.5423e-05, -5.7638e-05, 6.0871e-06], ..., [ 4.4435e-05, 9.6560e-06, 2.8591e-03, ..., 4.0512e-03, 3.9160e-05, 8.4496e-04], [ 7.1049e-05, -3.7178e-06, -3.3112e-03, ..., -4.5319e-03, 5.9187e-05, -9.2936e-04], [ 8.4758e-05, 6.7912e-06, -1.9878e-05, ..., 8.5533e-05, 8.7976e-05, 8.4490e-06]], device='cuda:0') Epoch 43, bias, value: tensor([-0.0076, -0.0290, 0.0101, -0.0188, 0.0130, 0.0026, 0.0180, -0.0086, -0.0239, -0.0007], device='cuda:0'), grad: tensor([ 3.5226e-05, 9.3937e-05, 5.2750e-05, 5.7220e-04, -4.0340e-04, -5.3596e-04, 2.3985e-04, 3.8757e-03, -4.1275e-03, 1.9801e-04], device='cuda:0') 100 0.0001 changing lr epoch 42, time 262.40, cls_loss 0.0207 cls_loss_mapping 0.0305 cls_loss_causal 0.7319 re_mapping 0.0183 re_causal 0.0514 /// teacc 98.59 lr 0.00010000 Epoch 44, weight, value: tensor([[ 0.0108, -0.0745, -0.0370, ..., -0.0763, -0.0464, -0.0758], [ 0.0103, -0.0132, -0.0077, ..., 0.0466, 0.0524, -0.0074], [-0.0483, 0.0288, -0.0676, ..., 0.0401, 0.0116, 0.0068], ..., [-0.0519, -0.0377, -0.0517, ..., -0.0047, -0.0414, 0.0536], [ 0.0563, -0.0046, 0.0111, ..., -0.0129, -0.0547, -0.0008], [-0.0839, 0.0027, 0.0155, ..., -0.0806, 0.0163, -0.0534]], device='cuda:0'), grad: tensor([[ 9.9614e-06, 9.0105e-08, 9.7603e-06, ..., 7.8902e-06, -4.5657e-05, 7.3388e-06], [ 1.0401e-05, 1.6298e-09, 1.0081e-05, ..., 2.9653e-05, -3.2433e-07, 2.5809e-05], [ 2.2233e-05, 1.6298e-09, 6.5207e-05, ..., 1.3077e-04, 2.7582e-05, 6.2346e-05], ..., [-4.3005e-05, 2.3283e-10, 2.2292e-05, ..., -1.8024e-04, 1.8775e-04, -1.5295e-04], [ 1.7345e-05, 1.6997e-08, 6.1572e-05, ..., 5.6893e-05, 5.1737e-05, 1.9416e-05], [ 8.6129e-06, 6.9849e-10, -2.9564e-04, ..., -7.9691e-05, -6.2943e-04, -4.0203e-05]], device='cuda:0') Epoch 44, bias, value: tensor([-0.0073, -0.0289, 0.0097, -0.0189, 0.0131, 0.0025, 0.0182, -0.0084, -0.0239, -0.0008], device='cuda:0'), grad: tensor([-1.6129e-04, 6.2704e-05, 1.9932e-04, 1.6177e-04, 7.3338e-04, 6.5625e-05, 8.3372e-06, 2.8586e-04, 2.1064e-04, -1.5669e-03], device='cuda:0') 100 0.0001 changing lr epoch 43, time 262.19, cls_loss 0.0196 cls_loss_mapping 0.0292 cls_loss_causal 0.7307 re_mapping 0.0180 re_causal 0.0509 /// teacc 98.56 lr 0.00010000 Epoch 45, weight, value: tensor([[ 0.0108, -0.0752, -0.0374, ..., -0.0768, -0.0469, -0.0763], [ 0.0103, -0.0147, -0.0073, ..., 0.0466, 0.0527, -0.0069], [-0.0484, 0.0310, -0.0679, ..., 0.0407, 0.0122, 0.0064], ..., [-0.0516, -0.0380, -0.0526, ..., -0.0048, -0.0429, 0.0540], [ 0.0560, -0.0049, 0.0112, ..., -0.0131, -0.0555, -0.0011], [-0.0842, 0.0015, 0.0157, ..., -0.0810, 0.0163, -0.0539]], device='cuda:0'), grad: tensor([[ 6.9290e-06, 4.6268e-06, 1.6898e-05, ..., 2.0012e-05, 4.5031e-05, 8.9407e-08], [ 3.7383e-06, 3.5733e-05, -1.9416e-05, ..., 8.4937e-05, 4.6283e-05, -1.1288e-06], [ 6.9201e-05, -2.1470e-04, 4.5180e-05, ..., -5.0259e-04, -4.6229e-04, -4.1444e-07], ..., [ 5.8748e-06, 5.4687e-05, 2.6017e-05, ..., 9.6381e-05, 1.0943e-04, -1.0822e-06], [ 7.1943e-05, 9.6798e-05, 2.0492e-04, ..., 5.0449e-04, 3.5739e-04, 2.7521e-07], [ 2.6703e-05, 2.0228e-06, 3.7432e-05, ..., 5.4359e-05, 1.5065e-05, 1.2657e-06]], device='cuda:0') Epoch 45, bias, value: tensor([-0.0076, -0.0289, 0.0101, -0.0186, 0.0136, 0.0022, 0.0181, -0.0088, -0.0243, -0.0005], device='cuda:0'), grad: tensor([ 9.2447e-05, 6.9082e-05, -4.7660e-04, -2.7442e-04, 6.7592e-05, 6.9094e-04, -1.1997e-03, 1.5330e-04, 8.0442e-04, 7.2241e-05], device='cuda:0') 100 0.0001 changing lr epoch 44, time 261.28, cls_loss 0.0216 cls_loss_mapping 0.0293 cls_loss_causal 0.7246 re_mapping 0.0179 re_causal 0.0507 /// teacc 98.52 lr 0.00010000 Epoch 46, weight, value: tensor([[ 0.0108, -0.0758, -0.0372, ..., -0.0769, -0.0469, -0.0785], [ 0.0099, -0.0152, -0.0070, ..., 0.0466, 0.0531, -0.0069], [-0.0487, 0.0319, -0.0686, ..., 0.0405, 0.0121, 0.0058], ..., [-0.0522, -0.0375, -0.0530, ..., -0.0046, -0.0431, 0.0547], [ 0.0563, -0.0053, 0.0117, ..., -0.0128, -0.0561, -0.0011], [-0.0845, 0.0012, 0.0156, ..., -0.0816, 0.0168, -0.0544]], device='cuda:0'), grad: tensor([[ 1.1045e-06, 2.7884e-06, 4.7758e-06, ..., 1.0207e-05, -1.6764e-05, -1.6287e-05], [ 2.9430e-06, 7.4327e-05, 1.3685e-04, ..., 3.1447e-04, 9.7454e-05, 1.8036e-04], [ 5.9344e-06, -4.0197e-04, -1.1042e-05, ..., -7.1001e-04, -6.0654e-04, 2.1458e-05], ..., [ 5.0515e-06, 3.1114e-05, -2.0552e-04, ..., -2.3675e-04, 5.0485e-05, -3.3522e-04], [ 6.0722e-06, 2.5892e-04, 1.6257e-05, ..., 4.7827e-04, 4.0579e-04, 3.0696e-05], [ 4.1053e-06, 3.5930e-06, 2.4706e-05, ..., 3.2246e-05, 2.7016e-05, 3.1322e-05]], device='cuda:0') Epoch 46, bias, value: tensor([-0.0076, -0.0289, 0.0096, -0.0185, 0.0135, 0.0025, 0.0181, -0.0085, -0.0245, -0.0004], device='cuda:0'), grad: tensor([-3.9172e-04, 5.6648e-04, -8.8072e-04, 7.6532e-05, 1.9640e-05, 1.2946e-04, 1.2779e-04, -5.7411e-04, 7.1859e-04, 2.0754e-04], device='cuda:0') 100 0.0001 changing lr epoch 45, time 256.89, cls_loss 0.0170 cls_loss_mapping 0.0233 cls_loss_causal 0.7125 re_mapping 0.0177 re_causal 0.0495 /// teacc 98.60 lr 0.00010000 Epoch 47, weight, value: tensor([[ 0.0107, -0.0769, -0.0376, ..., -0.0774, -0.0477, -0.0796], [ 0.0099, -0.0162, -0.0064, ..., 0.0468, 0.0531, -0.0077], [-0.0489, 0.0333, -0.0691, ..., 0.0407, 0.0127, 0.0054], ..., [-0.0521, -0.0381, -0.0534, ..., -0.0046, -0.0436, 0.0557], [ 0.0565, -0.0055, 0.0119, ..., -0.0127, -0.0566, -0.0014], [-0.0846, 0.0007, 0.0152, ..., -0.0823, 0.0174, -0.0550]], device='cuda:0'), grad: tensor([[ 1.6391e-04, 2.9936e-05, 3.3307e-04, ..., 2.2149e-04, 2.3711e-04, 1.0759e-05], [-1.6940e-04, 2.3112e-05, -9.0265e-04, ..., -6.1464e-04, -1.3409e-03, 6.5267e-05], [ 3.4750e-05, -6.5684e-05, 1.2600e-04, ..., 4.5395e-04, 4.6134e-04, 1.4668e-03], ..., [ 2.1502e-05, 9.3430e-06, 1.9228e-04, ..., -5.4741e-04, -3.4571e-04, -2.3041e-03], [-5.2414e-03, -6.5506e-05, -8.6060e-03, ..., -5.3596e-03, 1.0157e-04, 2.3529e-05], [ 8.5711e-05, 3.0175e-05, 2.1398e-04, ..., 1.8251e-04, 1.2124e-04, 8.1658e-05]], device='cuda:0') Epoch 47, bias, value: tensor([-0.0079, -0.0290, 0.0096, -0.0185, 0.0137, 0.0023, 0.0184, -0.0085, -0.0245, -0.0004], device='cuda:0'), grad: tensor([ 0.0009, -0.0022, 0.0029, 0.0013, 0.0011, 0.0232, 0.0009, -0.0036, -0.0253, 0.0008], device='cuda:0') 100 0.0001 changing lr epoch 46, time 261.47, cls_loss 0.0195 cls_loss_mapping 0.0279 cls_loss_causal 0.7324 re_mapping 0.0171 re_causal 0.0502 /// teacc 98.50 lr 0.00010000 Epoch 48, weight, value: tensor([[ 0.0106, -0.0781, -0.0389, ..., -0.0783, -0.0485, -0.0804], [ 0.0096, -0.0177, -0.0065, ..., 0.0465, 0.0531, -0.0084], [-0.0487, 0.0352, -0.0700, ..., 0.0410, 0.0135, 0.0054], ..., [-0.0523, -0.0399, -0.0535, ..., -0.0049, -0.0440, 0.0564], [ 0.0572, -0.0049, 0.0127, ..., -0.0121, -0.0573, -0.0018], [-0.0842, -0.0002, 0.0152, ..., -0.0830, 0.0174, -0.0564]], device='cuda:0'), grad: tensor([[ 2.1482e-04, 1.9968e-05, 1.9856e-06, ..., 1.9324e-04, 3.4273e-05, 7.5717e-07], [ 1.2741e-03, 4.5300e-04, 4.2319e-06, ..., 1.3456e-03, 1.9944e-04, 6.3963e-06], [-7.7858e-03, -2.3537e-03, -1.7449e-05, ..., -7.1335e-03, -4.7493e-04, 9.2611e-06], ..., [ 4.5700e-03, 1.2999e-03, 6.2771e-06, ..., 3.9101e-03, 2.3782e-04, -4.0382e-05], [ 3.9744e-04, 1.3685e-04, -1.3098e-05, ..., 3.8767e-04, 6.3837e-05, 2.7493e-06], [ 2.2089e-04, 7.5579e-05, -2.3786e-06, ..., 2.1291e-04, -1.2648e-04, 1.6987e-05]], device='cuda:0') Epoch 48, bias, value: tensor([-0.0084, -0.0294, 0.0094, -0.0182, 0.0141, 0.0021, 0.0187, -0.0084, -0.0240, -0.0007], device='cuda:0'), grad: tensor([-5.1439e-05, 2.0809e-03, -1.1047e-02, 1.4257e-03, 2.7680e-04, 2.4772e-04, 6.9737e-05, 6.0272e-03, 6.7139e-04, 3.0017e-04], device='cuda:0') 100 0.0001 changing lr epoch 47, time 262.21, cls_loss 0.0222 cls_loss_mapping 0.0301 cls_loss_causal 0.7183 re_mapping 0.0173 re_causal 0.0468 /// teacc 98.50 lr 0.00010000 Epoch 49, weight, value: tensor([[ 0.0108, -0.0788, -0.0389, ..., -0.0788, -0.0484, -0.0815], [ 0.0099, -0.0185, -0.0064, ..., 0.0461, 0.0530, -0.0092], [-0.0474, 0.0367, -0.0703, ..., 0.0417, 0.0145, 0.0052], ..., [-0.0536, -0.0403, -0.0534, ..., -0.0047, -0.0451, 0.0574], [ 0.0572, -0.0054, 0.0127, ..., -0.0123, -0.0588, -0.0021], [-0.0844, -0.0008, 0.0152, ..., -0.0833, 0.0178, -0.0580]], device='cuda:0'), grad: tensor([[ 1.7881e-07, 5.4836e-06, 1.1927e-04, ..., 6.9141e-05, 2.0111e-04, 1.3754e-05], [ 3.9674e-07, 4.2051e-05, -5.8441e-03, ..., -2.9831e-03, -6.8016e-03, -1.7214e-03], [ 4.8522e-07, 1.6749e-05, 1.9014e-04, ..., 1.4555e-04, 2.2388e-04, 2.0370e-05], ..., [ 3.7672e-07, -9.4920e-06, 2.3899e-03, ..., 1.6575e-03, 2.8362e-03, 1.0176e-03], [ 6.8499e-07, -2.5082e-04, -5.6314e-04, ..., -1.7529e-03, 1.3199e-03, 2.1219e-05], [ 1.8971e-06, 3.6117e-06, 1.5993e-03, ..., 1.0014e-03, 1.1826e-03, 3.6311e-04]], device='cuda:0') Epoch 49, bias, value: tensor([-0.0078, -0.0300, 0.0103, -0.0184, 0.0138, 0.0027, 0.0182, -0.0083, -0.0248, -0.0006], device='cuda:0'), grad: tensor([ 0.0004, -0.0130, 0.0005, 0.0007, 0.0015, 0.0019, 0.0002, 0.0047, -0.0002, 0.0034], device='cuda:0') 100 0.0001 changing lr epoch 48, time 262.24, cls_loss 0.0158 cls_loss_mapping 0.0204 cls_loss_causal 0.7318 re_mapping 0.0166 re_causal 0.0477 /// teacc 98.56 lr 0.00010000 Epoch 50, weight, value: tensor([[ 0.0108, -0.0800, -0.0396, ..., -0.0796, -0.0481, -0.0821], [ 0.0097, -0.0182, -0.0061, ..., 0.0464, 0.0538, -0.0096], [-0.0481, 0.0369, -0.0716, ..., 0.0414, 0.0145, 0.0042], ..., [-0.0536, -0.0400, -0.0537, ..., -0.0045, -0.0457, 0.0589], [ 0.0573, -0.0049, 0.0132, ..., -0.0118, -0.0594, -0.0019], [-0.0847, -0.0014, 0.0148, ..., -0.0843, 0.0179, -0.0595]], device='cuda:0'), grad: tensor([[ 7.0333e-04, 1.6546e-04, 7.5865e-04, ..., 1.7595e-04, 1.8632e-04, 2.0061e-06], [ 1.1630e-05, 1.1683e-05, -4.7445e-05, ..., 2.2426e-05, -7.1943e-05, 1.8567e-05], [ 3.0056e-05, -4.4441e-04, 6.3717e-05, ..., -5.3215e-04, -5.8126e-04, 1.4380e-05], ..., [ 5.6595e-05, 6.4194e-05, 1.0365e-04, ..., 5.2720e-05, 2.8276e-04, 3.3051e-05], [-7.5197e-04, 1.1855e-04, -1.2426e-03, ..., -9.6369e-04, 4.7117e-05, 1.9390e-06], [ 3.8886e-04, 2.1741e-05, 4.6515e-04, ..., 1.2577e-04, -2.9042e-05, 8.2180e-06]], device='cuda:0') Epoch 50, bias, value: tensor([-0.0078, -0.0298, 0.0097, -0.0179, 0.0138, 0.0028, 0.0182, -0.0078, -0.0248, -0.0013], device='cuda:0'), grad: tensor([ 3.5591e-03, 2.1160e-05, -1.7643e-03, -1.2230e-02, -2.0117e-05, 1.0986e-02, 4.9067e-04, 7.8869e-04, -3.5458e-03, 1.7128e-03], device='cuda:0') 100 0.0001 changing lr epoch 49, time 262.53, cls_loss 0.0181 cls_loss_mapping 0.0254 cls_loss_causal 0.6882 re_mapping 0.0166 re_causal 0.0454 /// teacc 98.53 lr 0.00010000 Epoch 51, weight, value: tensor([[ 0.0106, -0.0825, -0.0405, ..., -0.0803, -0.0487, -0.0829], [ 0.0100, -0.0191, -0.0052, ..., 0.0462, 0.0544, -0.0105], [-0.0483, 0.0382, -0.0722, ..., 0.0417, 0.0150, 0.0039], ..., [-0.0537, -0.0402, -0.0540, ..., -0.0041, -0.0462, 0.0599], [ 0.0571, -0.0051, 0.0133, ..., -0.0118, -0.0605, -0.0022], [-0.0851, -0.0013, 0.0143, ..., -0.0848, 0.0182, -0.0601]], device='cuda:0'), grad: tensor([[ 1.2830e-05, 2.0470e-06, 1.5453e-05, ..., 2.3067e-05, 2.0534e-05, 3.6918e-06], [ 9.0301e-06, 3.1516e-06, 7.5400e-05, ..., 1.0133e-04, 8.1778e-05, 1.0991e-04], [-2.0102e-05, -1.3702e-05, 3.4750e-05, ..., -7.4208e-05, -2.7448e-05, -1.1161e-05], ..., [ 1.1303e-05, 5.0738e-06, -1.4663e-04, ..., -2.4390e-04, -7.4983e-05, -3.2210e-04], [ 5.8979e-05, -4.3213e-06, 5.9485e-05, ..., 2.5913e-05, 3.3975e-05, 7.5400e-06], [ 1.6272e-05, 1.1902e-06, 6.0081e-05, ..., 1.7416e-04, -2.4962e-04, 1.7798e-04]], device='cuda:0') Epoch 51, bias, value: tensor([-0.0084, -0.0294, 0.0097, -0.0183, 0.0133, 0.0030, 0.0191, -0.0072, -0.0252, -0.0015], device='cuda:0'), grad: tensor([ 7.7069e-05, 3.1400e-04, -6.1691e-05, 1.2236e-03, 3.1042e-04, -2.0657e-03, 4.5061e-04, -5.1069e-04, 2.3937e-04, 2.4036e-05], device='cuda:0') 100 0.0001 changing lr epoch 50, time 262.67, cls_loss 0.0166 cls_loss_mapping 0.0233 cls_loss_causal 0.7195 re_mapping 0.0158 re_causal 0.0451 /// teacc 98.57 lr 0.00010000 Epoch 52, weight, value: tensor([[ 0.0108, -0.0839, -0.0405, ..., -0.0808, -0.0490, -0.0832], [ 0.0098, -0.0198, -0.0049, ..., 0.0461, 0.0544, -0.0102], [-0.0488, 0.0379, -0.0730, ..., 0.0417, 0.0154, 0.0033], ..., [-0.0537, -0.0408, -0.0543, ..., -0.0041, -0.0465, 0.0605], [ 0.0579, -0.0040, 0.0142, ..., -0.0111, -0.0610, -0.0018], [-0.0853, -0.0020, 0.0139, ..., -0.0860, 0.0183, -0.0611]], device='cuda:0'), grad: tensor([[ 1.5041e-07, 6.3777e-06, 3.4440e-06, ..., 1.4111e-05, 2.1905e-06, -7.5474e-06], [ 1.0692e-06, 7.7412e-06, -4.2319e-05, ..., -2.3678e-05, -5.4002e-05, 2.7269e-06], [ 7.2131e-07, -9.7930e-05, -1.0803e-06, ..., -2.3687e-04, -1.3363e-04, -1.4096e-05], ..., [ 8.7311e-07, 9.2313e-06, 1.4976e-05, ..., 1.6913e-05, 3.9876e-05, -9.3728e-06], [ 8.2422e-07, 4.4733e-05, 4.1991e-05, ..., 1.5521e-04, 1.0604e-04, 1.2852e-05], [ 9.4902e-07, 1.7826e-06, -3.1471e-05, ..., 9.7975e-06, -5.3078e-05, 3.1870e-06]], device='cuda:0') Epoch 52, bias, value: tensor([-0.0081, -0.0294, 0.0091, -0.0184, 0.0134, 0.0034, 0.0190, -0.0074, -0.0243, -0.0020], device='cuda:0'), grad: tensor([-1.6379e-04, -3.4153e-05, -2.3627e-04, 1.3351e-04, 3.0845e-05, -6.5088e-05, 4.9800e-05, 1.4138e-04, 2.7728e-04, -1.3280e-04], device='cuda:0') 100 0.0001 changing lr epoch 51, time 262.65, cls_loss 0.0142 cls_loss_mapping 0.0218 cls_loss_causal 0.6826 re_mapping 0.0164 re_causal 0.0465 /// teacc 98.58 lr 0.00010000 Epoch 53, weight, value: tensor([[ 0.0105, -0.0841, -0.0408, ..., -0.0811, -0.0486, -0.0840], [ 0.0098, -0.0201, -0.0048, ..., 0.0460, 0.0544, -0.0106], [-0.0490, 0.0384, -0.0736, ..., 0.0422, 0.0163, 0.0023], ..., [-0.0537, -0.0404, -0.0545, ..., -0.0040, -0.0475, 0.0617], [ 0.0580, -0.0043, 0.0142, ..., -0.0114, -0.0620, -0.0022], [-0.0853, -0.0023, 0.0139, ..., -0.0860, 0.0191, -0.0614]], device='cuda:0'), grad: tensor([[ 1.1481e-05, 3.5614e-06, 9.9242e-06, ..., 3.8773e-05, 5.1826e-05, 1.3530e-05], [ 6.7540e-06, 6.6776e-07, -3.4332e-04, ..., -1.1702e-03, -1.6327e-03, 8.9705e-06], [ 4.1544e-05, -1.9044e-05, 3.0112e-04, ..., 1.0099e-03, 1.2445e-03, 2.8685e-05], ..., [-1.0073e-04, 3.4757e-06, 3.7909e-05, ..., -8.0943e-05, 1.0842e-04, -1.4734e-04], [ 1.5467e-05, 1.5730e-06, 9.9421e-05, ..., 1.4305e-04, 5.8562e-05, 2.5496e-05], [ 1.3836e-05, 4.9779e-07, 3.4958e-05, ..., 5.4032e-05, 4.4912e-05, 2.0519e-05]], device='cuda:0') Epoch 53, bias, value: tensor([-0.0077, -0.0296, 0.0093, -0.0180, 0.0137, 0.0032, 0.0182, -0.0074, -0.0250, -0.0016], device='cuda:0'), grad: tensor([ 8.5771e-05, -1.7910e-03, 1.5020e-03, -3.1322e-05, 5.2713e-06, 3.0790e-06, -2.7269e-05, -1.1945e-04, 2.3866e-04, 1.3220e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 52---------------------------------------------------- epoch 52, time 279.35, cls_loss 0.0159 cls_loss_mapping 0.0222 cls_loss_causal 0.6894 re_mapping 0.0159 re_causal 0.0455 /// teacc 98.65 lr 0.00010000 Epoch 54, weight, value: tensor([[ 0.0105, -0.0848, -0.0412, ..., -0.0818, -0.0489, -0.0846], [ 0.0100, -0.0208, -0.0044, ..., 0.0463, 0.0552, -0.0112], [-0.0489, 0.0393, -0.0745, ..., 0.0421, 0.0165, 0.0017], ..., [-0.0538, -0.0399, -0.0551, ..., -0.0040, -0.0478, 0.0622], [ 0.0580, -0.0047, 0.0152, ..., -0.0106, -0.0632, -0.0011], [-0.0858, -0.0026, 0.0136, ..., -0.0867, 0.0194, -0.0609]], device='cuda:0'), grad: tensor([[ 1.0924e-06, 3.7299e-07, 5.2489e-06, ..., 8.1733e-06, 7.3090e-06, 2.8852e-06], [ 4.4741e-06, 8.5905e-06, -2.8238e-05, ..., -1.0328e-06, -2.7195e-05, 7.7635e-06], [-3.1237e-06, -5.4032e-05, 1.1575e-04, ..., 9.4295e-05, -4.2677e-05, 1.1839e-05], ..., [ 2.8074e-05, 4.2140e-05, 8.9586e-05, ..., 1.3387e-04, 2.8276e-04, -3.9369e-05], [ 1.6481e-05, 1.0701e-06, 2.3293e-04, ..., 3.0875e-04, 9.5725e-05, 4.6007e-06], [ 1.3568e-05, 1.2014e-07, 2.9042e-05, ..., -7.6175e-05, -1.6108e-03, 5.4985e-06]], device='cuda:0') Epoch 54, bias, value: tensor([-0.0076, -0.0293, 0.0089, -0.0184, 0.0138, 0.0029, 0.0182, -0.0070, -0.0247, -0.0017], device='cuda:0'), grad: tensor([-1.1966e-05, 2.8927e-06, 1.1712e-04, -5.0497e-04, 2.9488e-03, 1.0681e-04, 2.0146e-05, 6.1560e-04, 4.0579e-04, -3.7003e-03], device='cuda:0') 100 0.0001 changing lr epoch 53, time 262.87, cls_loss 0.0154 cls_loss_mapping 0.0214 cls_loss_causal 0.7139 re_mapping 0.0151 re_causal 0.0454 /// teacc 98.59 lr 0.00010000 Epoch 55, weight, value: tensor([[ 0.0104, -0.0851, -0.0417, ..., -0.0825, -0.0503, -0.0860], [ 0.0096, -0.0215, -0.0039, ..., 0.0466, 0.0558, -0.0111], [-0.0487, 0.0404, -0.0749, ..., 0.0422, 0.0172, 0.0017], ..., [-0.0543, -0.0405, -0.0556, ..., -0.0041, -0.0492, 0.0625], [ 0.0579, -0.0048, 0.0153, ..., -0.0107, -0.0647, -0.0004], [-0.0863, -0.0029, 0.0133, ..., -0.0869, 0.0202, -0.0619]], device='cuda:0'), grad: tensor([[ 1.6671e-07, 7.2867e-06, 9.5442e-06, ..., 3.4481e-05, 2.8774e-05, 1.3098e-05], [ 1.3988e-06, 1.0304e-05, 2.6211e-05, ..., 4.7654e-05, 3.0637e-05, 2.9936e-05], [ 3.3788e-06, -1.0357e-03, -1.0446e-05, ..., -4.3755e-03, -5.8441e-03, 8.6486e-05], ..., [ 2.4289e-06, -3.5971e-05, -2.5678e-04, ..., -2.3413e-04, -4.5657e-05, -4.8637e-04], [ 1.0431e-05, 1.5378e-05, 1.9419e-04, ..., 2.8157e-04, 1.3161e-04, 9.3162e-05], [ 7.5670e-07, 9.8896e-04, 1.1551e-04, ..., 4.3182e-03, 5.4970e-03, 1.1885e-04]], device='cuda:0') Epoch 55, bias, value: tensor([-0.0082, -0.0288, 0.0091, -0.0185, 0.0141, 0.0032, 0.0186, -0.0080, -0.0244, -0.0016], device='cuda:0'), grad: tensor([ 5.7578e-05, 1.2743e-04, -5.2032e-03, 4.2534e-04, 1.0949e-04, -7.3075e-05, -1.6212e-05, -1.1253e-03, 5.1355e-04, 5.1804e-03], device='cuda:0') 100 0.0001 changing lr epoch 54, time 262.44, cls_loss 0.0153 cls_loss_mapping 0.0200 cls_loss_causal 0.6926 re_mapping 0.0151 re_causal 0.0447 /// teacc 98.57 lr 0.00010000 Epoch 56, weight, value: tensor([[ 0.0102, -0.0857, -0.0417, ..., -0.0832, -0.0493, -0.0870], [ 0.0097, -0.0219, -0.0038, ..., 0.0466, 0.0559, -0.0125], [-0.0492, 0.0415, -0.0756, ..., 0.0425, 0.0175, 0.0015], ..., [-0.0542, -0.0413, -0.0556, ..., -0.0042, -0.0497, 0.0636], [ 0.0581, -0.0055, 0.0153, ..., -0.0110, -0.0656, -0.0010], [-0.0865, -0.0035, 0.0131, ..., -0.0879, 0.0199, -0.0632]], device='cuda:0'), grad: tensor([[ 6.1840e-06, 1.6689e-06, 1.5163e-04, ..., 4.4465e-05, 8.1837e-05, -1.2424e-06], [-3.9488e-05, 2.2858e-05, -3.4833e-04, ..., -5.8085e-05, -2.4748e-04, 1.3679e-05], [ 4.3698e-06, -6.5506e-05, 3.6180e-05, ..., -6.7234e-05, -7.0453e-05, 5.6438e-06], ..., [ 6.9290e-06, 3.3945e-05, 1.9044e-05, ..., 1.3418e-05, 6.7174e-05, -1.3602e-04], [ 7.8678e-06, 2.4866e-07, -3.7146e-04, ..., -2.5988e-04, 6.9380e-05, 9.8646e-06], [ 4.6492e-06, 6.0536e-07, 9.9778e-05, ..., 1.5068e-04, -3.5548e-04, 5.2303e-05]], device='cuda:0') Epoch 56, bias, value: tensor([-0.0071, -0.0293, 0.0092, -0.0182, 0.0138, 0.0037, 0.0181, -0.0078, -0.0249, -0.0020], device='cuda:0'), grad: tensor([ 1.7715e-04, -6.1464e-04, -3.8147e-05, 2.4354e-04, 1.2255e-03, 1.8167e-04, 1.5891e-04, -1.3149e-04, -5.2214e-04, -6.7902e-04], device='cuda:0') 100 0.0001 changing lr epoch 55, time 262.18, cls_loss 0.0170 cls_loss_mapping 0.0240 cls_loss_causal 0.7047 re_mapping 0.0150 re_causal 0.0422 /// teacc 98.51 lr 0.00010000 Epoch 57, weight, value: tensor([[ 0.0100, -0.0868, -0.0422, ..., -0.0839, -0.0499, -0.0877], [ 0.0100, -0.0229, -0.0038, ..., 0.0457, 0.0558, -0.0126], [-0.0501, 0.0428, -0.0758, ..., 0.0430, 0.0183, 0.0010], ..., [-0.0544, -0.0417, -0.0554, ..., -0.0036, -0.0498, 0.0644], [ 0.0590, -0.0051, 0.0157, ..., -0.0103, -0.0665, -0.0006], [-0.0867, -0.0037, 0.0127, ..., -0.0888, 0.0204, -0.0641]], device='cuda:0'), grad: tensor([[ 2.5220e-06, 7.8261e-05, 2.6282e-06, ..., 1.4007e-04, 8.9109e-05, 5.2750e-06], [-4.5776e-05, 9.1717e-06, -1.0747e-04, ..., -5.7667e-05, -8.4400e-05, 4.7296e-05], [-5.3123e-06, -3.3569e-04, 1.8671e-05, ..., -5.5933e-04, -3.3069e-04, 2.4676e-05], ..., [ 6.6534e-06, 1.1259e-04, -1.6287e-05, ..., 6.0350e-05, 7.7152e-04, -2.6083e-04], [ 3.0994e-05, 6.5088e-05, 5.4389e-05, ..., 2.0456e-04, 2.0874e-04, 1.9237e-05], [ 3.7253e-06, 1.1966e-05, 7.7710e-06, ..., 3.7849e-05, -7.6830e-05, 2.2545e-05]], device='cuda:0') Epoch 57, bias, value: tensor([-0.0075, -0.0296, 0.0096, -0.0187, 0.0138, 0.0039, 0.0183, -0.0075, -0.0249, -0.0019], device='cuda:0'), grad: tensor([ 3.0756e-04, 8.1122e-05, -1.0672e-03, 2.1482e-04, -3.1872e-03, 1.1480e-04, 1.3673e-04, 2.9964e-03, 4.8685e-04, -8.3864e-05], device='cuda:0') 100 0.0001 changing lr epoch 56, time 262.51, cls_loss 0.0146 cls_loss_mapping 0.0217 cls_loss_causal 0.6835 re_mapping 0.0154 re_causal 0.0444 /// teacc 98.61 lr 0.00010000 Epoch 58, weight, value: tensor([[ 0.0111, -0.0873, -0.0425, ..., -0.0846, -0.0500, -0.0886], [ 0.0101, -0.0245, -0.0036, ..., 0.0456, 0.0559, -0.0137], [-0.0494, 0.0446, -0.0758, ..., 0.0438, 0.0193, 0.0008], ..., [-0.0546, -0.0420, -0.0555, ..., -0.0035, -0.0504, 0.0652], [ 0.0591, -0.0058, 0.0158, ..., -0.0106, -0.0678, -0.0011], [-0.0869, -0.0039, 0.0123, ..., -0.0895, 0.0203, -0.0652]], device='cuda:0'), grad: tensor([[-1.1384e-05, 1.1958e-05, 4.9174e-05, ..., 6.0469e-05, -7.8380e-05, 4.7684e-07], [-4.6790e-05, 2.6226e-05, -1.3676e-03, ..., -2.0580e-03, -1.7090e-03, -8.0538e-04], [ 9.7692e-05, 2.5082e-04, 3.0136e-04, ..., 5.3501e-04, 5.3704e-05, 3.3248e-06], ..., [ 5.9992e-05, 6.1750e-04, 1.2770e-03, ..., 2.2488e-03, 1.3323e-03, 6.9904e-04], [-2.0385e-05, -1.0452e-03, -8.6486e-05, ..., -6.3944e-04, 4.0799e-05, 6.0573e-06], [ 1.9848e-05, 1.7866e-05, 1.9521e-05, ..., 1.2767e-04, -1.2243e-04, 3.9011e-05]], device='cuda:0') Epoch 58, bias, value: tensor([-0.0070, -0.0297, 0.0104, -0.0187, 0.0144, 0.0038, 0.0183, -0.0075, -0.0255, -0.0028], device='cuda:0'), grad: tensor([-4.6420e-04, -5.2643e-03, 1.1234e-03, -8.3494e-04, 7.1955e-04, 9.7692e-05, 6.1560e-04, 5.7220e-03, -1.6613e-03, -4.7356e-05], device='cuda:0') 100 0.0001 changing lr epoch 57, time 262.73, cls_loss 0.0167 cls_loss_mapping 0.0249 cls_loss_causal 0.7140 re_mapping 0.0150 re_causal 0.0427 /// teacc 98.61 lr 0.00010000 Epoch 59, weight, value: tensor([[ 0.0106, -0.0879, -0.0439, ..., -0.0858, -0.0507, -0.0899], [ 0.0099, -0.0234, -0.0031, ..., 0.0462, 0.0571, -0.0150], [-0.0489, 0.0452, -0.0771, ..., 0.0436, 0.0190, 0.0007], ..., [-0.0552, -0.0433, -0.0552, ..., -0.0035, -0.0513, 0.0669], [ 0.0593, -0.0059, 0.0162, ..., -0.0105, -0.0692, -0.0017], [-0.0874, -0.0041, 0.0117, ..., -0.0900, 0.0206, -0.0662]], device='cuda:0'), grad: tensor([[ 8.2050e-07, 8.7637e-07, 2.1681e-06, ..., 3.4980e-06, -2.0824e-06, 1.8813e-06], [ 5.0198e-07, 3.5334e-06, -2.8059e-05, ..., -4.8056e-06, -3.0756e-05, 8.7097e-06], [ 6.1747e-07, -6.1452e-05, -2.8551e-05, ..., -1.6797e-04, -9.7394e-05, -3.3200e-05], ..., [ 6.3423e-07, 7.8157e-06, 2.1994e-05, ..., 1.4909e-05, 3.8266e-04, 4.8727e-05], [-3.0082e-06, 4.5478e-05, 7.8753e-06, ..., 1.3161e-04, 1.1057e-04, 4.0114e-05], [ 1.5637e-06, 1.6298e-07, 8.0317e-06, ..., 1.0744e-05, -3.9887e-04, -7.9513e-05]], device='cuda:0') Epoch 59, bias, value: tensor([-0.0076, -0.0294, 0.0103, -0.0185, 0.0147, 0.0040, 0.0177, -0.0073, -0.0255, -0.0030], device='cuda:0'), grad: tensor([-1.4819e-05, -2.1711e-05, -1.9157e-04, 7.9572e-05, 6.8128e-05, -5.9187e-05, 3.3565e-06, 6.9714e-04, 1.9217e-04, -7.5436e-04], device='cuda:0') 100 0.0001 changing lr epoch 58, time 262.25, cls_loss 0.0136 cls_loss_mapping 0.0200 cls_loss_causal 0.6408 re_mapping 0.0147 re_causal 0.0406 /// teacc 98.57 lr 0.00010000 Epoch 60, weight, value: tensor([[ 0.0103, -0.0880, -0.0446, ..., -0.0864, -0.0518, -0.0911], [ 0.0122, -0.0243, -0.0016, ..., 0.0468, 0.0579, -0.0139], [-0.0493, 0.0463, -0.0781, ..., 0.0438, 0.0197, 0.0003], ..., [-0.0566, -0.0439, -0.0560, ..., -0.0039, -0.0533, 0.0671], [ 0.0592, -0.0063, 0.0166, ..., -0.0107, -0.0701, -0.0025], [-0.0878, -0.0043, 0.0110, ..., -0.0909, 0.0220, -0.0660]], device='cuda:0'), grad: tensor([[ 4.7721e-06, 1.2629e-05, 3.2224e-06, ..., 2.6096e-06, 2.1085e-05, 1.3243e-06], [ 1.3607e-06, 9.5814e-06, 2.7250e-06, ..., 1.6183e-05, 4.5687e-05, 6.6161e-06], [ 1.7500e-06, -2.1130e-05, 9.3281e-06, ..., -2.0966e-05, -2.2963e-05, 5.0738e-06], ..., [-7.4580e-06, 3.5539e-06, -1.9878e-05, ..., -4.2200e-05, 7.0095e-05, -6.4492e-05], [ 2.8592e-06, 4.7386e-06, -1.1683e-05, ..., -5.9605e-08, 2.3413e-04, 2.0377e-06], [ 4.8093e-06, 5.9698e-07, -1.0394e-05, ..., 1.6004e-05, -2.1057e-03, 1.3955e-05]], device='cuda:0') Epoch 60, bias, value: tensor([-0.0084, -0.0284, 0.0102, -0.0178, 0.0143, 0.0038, 0.0180, -0.0081, -0.0259, -0.0024], device='cuda:0'), grad: tensor([ 5.3972e-05, 1.2982e-04, 1.3456e-05, 1.1611e-04, 5.4741e-03, 1.6129e-04, 4.6879e-05, 1.1122e-04, 7.5006e-04, -6.8550e-03], device='cuda:0') 100 0.0001 changing lr epoch 59, time 262.20, cls_loss 0.0112 cls_loss_mapping 0.0182 cls_loss_causal 0.6588 re_mapping 0.0145 re_causal 0.0426 /// teacc 98.65 lr 0.00010000 Epoch 61, weight, value: tensor([[ 0.0102, -0.0887, -0.0450, ..., -0.0868, -0.0519, -0.0918], [ 0.0119, -0.0248, -0.0020, ..., 0.0464, 0.0577, -0.0146], [-0.0490, 0.0474, -0.0776, ..., 0.0444, 0.0205, 0.0001], ..., [-0.0565, -0.0447, -0.0563, ..., -0.0041, -0.0541, 0.0678], [ 0.0596, -0.0064, 0.0172, ..., -0.0105, -0.0703, -0.0029], [-0.0888, -0.0046, 0.0106, ..., -0.0919, 0.0220, -0.0667]], device='cuda:0'), grad: tensor([[-5.8152e-06, 4.3474e-06, 1.1986e-06, ..., 6.5789e-06, 1.7500e-04, 3.7998e-06], [ 1.2539e-05, 3.1710e-04, 1.7239e-06, ..., 2.3496e-04, 3.7384e-04, 3.9846e-05], [-9.7156e-06, -4.8304e-04, -1.0923e-05, ..., -3.8886e-04, -5.9271e-04, -4.8459e-05], ..., [ 2.6748e-06, 1.3955e-05, -5.9605e-06, ..., -1.5453e-05, 5.8502e-05, -3.5554e-05], [-1.0826e-05, 9.4771e-06, -3.2425e-05, ..., -1.1407e-05, 3.7313e-05, 3.1367e-06], [ 9.3225e-07, 1.7257e-06, 8.6427e-06, ..., -2.9489e-05, -2.5392e-04, -9.9480e-05]], device='cuda:0') Epoch 61, bias, value: tensor([-0.0083, -0.0290, 0.0108, -0.0176, 0.0144, 0.0040, 0.0176, -0.0081, -0.0258, -0.0029], device='cuda:0'), grad: tensor([ 4.4417e-04, 3.9005e-04, -5.1022e-04, 1.3304e-04, 1.0071e-03, 7.4506e-05, -9.6178e-04, 6.7890e-05, 1.3880e-05, -6.5899e-04], device='cuda:0') 100 0.0001 changing lr epoch 60, time 261.89, cls_loss 0.0127 cls_loss_mapping 0.0201 cls_loss_causal 0.6618 re_mapping 0.0142 re_causal 0.0416 /// teacc 98.49 lr 0.00010000 Epoch 62, weight, value: tensor([[ 1.0399e-02, -8.9173e-02, -4.5177e-02, ..., -8.7403e-02, -5.2336e-02, -9.2642e-02], [ 1.2629e-02, -2.4592e-02, -1.0421e-03, ..., 4.6919e-02, 5.8763e-02, -1.3845e-02], [-4.9594e-02, 4.7889e-02, -7.8553e-02, ..., 4.4304e-02, 2.0586e-02, -1.3442e-05], ..., [-5.6891e-02, -4.5617e-02, -5.6746e-02, ..., -4.1491e-03, -5.5773e-02, 6.8108e-02], [ 5.9460e-02, -6.3847e-03, 1.7492e-02, ..., -1.0285e-02, -7.0937e-02, -3.3056e-03], [-8.9216e-02, -4.6511e-03, 1.0163e-02, ..., -9.2741e-02, 2.3265e-02, -6.6444e-02]], device='cuda:0'), grad: tensor([[ 4.8913e-06, 7.8604e-06, 3.7700e-05, ..., 2.1636e-05, 6.6161e-06, 1.9688e-06], [ 3.6824e-06, 5.3495e-06, -7.7337e-06, ..., 3.7886e-06, -7.5400e-06, 5.3085e-06], [-1.8835e-05, -9.7632e-05, 2.8312e-05, ..., -1.6403e-04, -1.4114e-04, -2.1487e-05], ..., [ 1.9133e-05, 8.0884e-05, 1.7539e-05, ..., 1.3697e-04, 1.5664e-04, 1.0014e-05], [ 6.7838e-06, -4.7326e-05, -5.8055e-05, ..., -1.4715e-05, -1.3638e-04, 1.0291e-06], [ 6.0946e-06, 3.5703e-05, 8.5056e-05, ..., 2.8193e-05, 1.0610e-04, 1.6494e-06]], device='cuda:0') Epoch 62, bias, value: tensor([-0.0083, -0.0283, 0.0106, -0.0170, 0.0133, 0.0034, 0.0180, -0.0085, -0.0258, -0.0021], device='cuda:0'), grad: tensor([ 7.3314e-05, 1.4186e-05, -1.2386e-04, 1.5450e-04, -6.1929e-05, -4.4632e-04, 8.1718e-05, 2.4056e-04, -3.0255e-04, 3.6931e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 61---------------------------------------------------- epoch 61, time 279.38, cls_loss 0.0122 cls_loss_mapping 0.0188 cls_loss_causal 0.6811 re_mapping 0.0144 re_causal 0.0435 /// teacc 98.67 lr 0.00010000 Epoch 63, weight, value: tensor([[ 0.0104, -0.0912, -0.0465, ..., -0.0878, -0.0523, -0.0936], [ 0.0123, -0.0251, -0.0010, ..., 0.0467, 0.0586, -0.0144], [-0.0494, 0.0486, -0.0790, ..., 0.0446, 0.0212, -0.0003], ..., [-0.0570, -0.0460, -0.0571, ..., -0.0044, -0.0565, 0.0686], [ 0.0597, -0.0058, 0.0178, ..., -0.0101, -0.0715, -0.0036], [-0.0895, -0.0050, 0.0100, ..., -0.0932, 0.0238, -0.0668]], device='cuda:0'), grad: tensor([[ 2.2259e-07, 5.1688e-07, 1.5618e-06, ..., 3.2447e-06, -2.4706e-05, 4.5486e-06], [ 3.7998e-07, -2.2575e-06, -3.6340e-06, ..., 1.0412e-06, -2.0396e-06, 3.4958e-05], [ 7.7933e-06, 1.2787e-06, 3.6031e-05, ..., 5.9992e-05, 1.5110e-05, 1.4395e-05], ..., [ 1.2526e-06, 1.1548e-06, 5.9903e-06, ..., -2.0728e-05, -5.9813e-05, -1.3483e-04], [ 4.2990e-06, -4.4703e-06, -1.0423e-05, ..., -3.7216e-06, 2.1547e-05, 1.9968e-06], [ 3.5204e-07, 3.6787e-07, 2.9597e-06, ..., 7.6890e-06, 7.8201e-05, 4.8876e-05]], device='cuda:0') Epoch 63, bias, value: tensor([-0.0084, -0.0287, 0.0106, -0.0169, 0.0128, 0.0041, 0.0179, -0.0087, -0.0259, -0.0019], device='cuda:0'), grad: tensor([-9.0003e-05, 5.3763e-05, 9.7692e-05, -3.5018e-05, -2.1785e-05, 6.1572e-05, -8.1137e-06, -2.3770e-04, 3.1829e-05, 1.4770e-04], device='cuda:0') 100 0.0001 changing lr epoch 62, time 262.46, cls_loss 0.0157 cls_loss_mapping 0.0189 cls_loss_causal 0.6852 re_mapping 0.0139 re_causal 0.0398 /// teacc 98.65 lr 0.00010000 Epoch 64, weight, value: tensor([[ 0.0104, -0.0930, -0.0472, ..., -0.0885, -0.0524, -0.0944], [ 0.0121, -0.0256, -0.0010, ..., 0.0464, 0.0584, -0.0152], [-0.0487, 0.0494, -0.0796, ..., 0.0446, 0.0210, -0.0002], ..., [-0.0578, -0.0455, -0.0571, ..., -0.0037, -0.0557, 0.0695], [ 0.0599, -0.0058, 0.0183, ..., -0.0099, -0.0725, -0.0038], [-0.0898, -0.0034, 0.0099, ..., -0.0936, 0.0241, -0.0675]], device='cuda:0'), grad: tensor([[ 8.9407e-07, 9.1076e-05, 2.7940e-06, ..., 5.3346e-06, 4.7177e-05, 2.3488e-06], [ 1.8151e-06, 1.2964e-06, 6.6042e-05, ..., 9.2804e-05, -3.7402e-06, 5.1260e-05], [ 6.7018e-06, -8.9854e-06, 2.6718e-05, ..., 1.9014e-05, -1.4290e-05, 2.2337e-05], ..., [-1.5944e-05, -7.2457e-06, 1.2489e-02, ..., 1.6602e-02, 1.8716e-05, 8.6746e-03], [ 2.0135e-06, 8.7246e-06, -1.2840e-02, ..., -1.7090e-02, 2.6211e-05, -8.9417e-03], [ 2.9821e-06, 3.1684e-06, 1.0235e-06, ..., 7.3969e-05, -5.5820e-05, 4.0263e-05]], device='cuda:0') Epoch 64, bias, value: tensor([-0.0081, -0.0295, 0.0104, -0.0175, 0.0132, 0.0041, 0.0184, -0.0079, -0.0260, -0.0021], device='cuda:0'), grad: tensor([ 1.8907e-04, 1.0526e-04, 3.7253e-05, 3.3069e-04, 1.9646e-04, -8.5413e-05, -2.3770e-04, 1.7593e-02, -1.8036e-02, -9.5069e-05], device='cuda:0') 100 0.0001 changing lr epoch 63, time 262.45, cls_loss 0.0127 cls_loss_mapping 0.0163 cls_loss_causal 0.6758 re_mapping 0.0138 re_causal 0.0396 /// teacc 98.58 lr 0.00010000 Epoch 65, weight, value: tensor([[ 0.0105, -0.0939, -0.0479, ..., -0.0891, -0.0526, -0.0954], [ 0.0130, -0.0258, -0.0006, ..., 0.0464, 0.0586, -0.0160], [-0.0489, 0.0504, -0.0803, ..., 0.0447, 0.0217, -0.0013], ..., [-0.0578, -0.0454, -0.0578, ..., -0.0041, -0.0565, 0.0704], [ 0.0597, -0.0066, 0.0193, ..., -0.0092, -0.0739, -0.0027], [-0.0900, -0.0036, 0.0098, ..., -0.0941, 0.0249, -0.0683]], device='cuda:0'), grad: tensor([[ 1.6112e-07, 9.2909e-06, 4.9286e-06, ..., 1.2256e-05, 1.3545e-05, 5.8208e-07], [ 3.2689e-07, -1.2350e-04, -3.3998e-04, ..., -3.5143e-04, -4.7803e-04, 4.4741e-06], [ 3.3341e-07, -2.7537e-04, -4.4376e-05, ..., -3.2043e-04, 3.1304e-04, 1.9759e-05], ..., [ 4.4797e-07, 2.7299e-05, 1.1861e-05, ..., 1.3351e-05, 3.2336e-05, -2.4572e-05], [ 6.9663e-07, 3.2187e-04, 3.4094e-04, ..., 5.4455e-04, 6.3753e-04, 3.1646e-06], [ 1.4612e-06, 4.5039e-06, 5.3905e-06, ..., 1.8671e-05, 5.8770e-05, 1.2688e-05]], device='cuda:0') Epoch 65, bias, value: tensor([-0.0082, -0.0295, 0.0101, -0.0179, 0.0135, 0.0043, 0.0185, -0.0079, -0.0257, -0.0020], device='cuda:0'), grad: tensor([-8.1301e-05, -6.7091e-04, 1.4007e-04, 1.3554e-04, -7.1383e-04, -6.2585e-05, 1.4508e-04, -1.7375e-05, 9.9277e-04, 1.3435e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 64---------------------------------------------------- epoch 64, time 278.49, cls_loss 0.0112 cls_loss_mapping 0.0147 cls_loss_causal 0.6958 re_mapping 0.0139 re_causal 0.0408 /// teacc 98.69 lr 0.00010000 Epoch 66, weight, value: tensor([[ 1.0757e-02, -9.5198e-02, -4.8139e-02, ..., -8.9725e-02, -5.3294e-02, -9.5830e-02], [ 1.2627e-02, -2.6135e-02, -6.7352e-05, ..., 4.6324e-02, 5.8831e-02, -1.6305e-02], [-4.8842e-02, 5.1630e-02, -8.0917e-02, ..., 4.4990e-02, 2.2209e-02, -1.8782e-03], ..., [-5.7479e-02, -4.5611e-02, -5.7908e-02, ..., -3.8529e-03, -5.7171e-02, 7.1724e-02], [ 5.9682e-02, -6.6708e-03, 1.9776e-02, ..., -9.1764e-03, -7.4682e-02, -3.2358e-03], [-8.9935e-02, -4.0307e-03, 9.5205e-03, ..., -9.4765e-02, 2.5303e-02, -6.9225e-02]], device='cuda:0'), grad: tensor([[ 8.1025e-08, 5.6714e-05, 1.8971e-06, ..., 4.8727e-06, 1.1778e-04, 8.4378e-07], [ 7.0035e-07, 4.2766e-06, -1.5453e-05, ..., 6.0908e-07, -1.7300e-05, 1.4573e-05], [ 2.4773e-07, 8.0943e-05, 3.5763e-05, ..., 5.8830e-05, 1.6057e-04, 2.2739e-05], ..., [ 3.1386e-07, -2.7549e-06, 9.4902e-07, ..., -8.7142e-05, 8.2180e-06, -9.2328e-05], [ 2.5146e-07, 4.2260e-05, -3.6359e-05, ..., -5.5194e-05, 2.0528e-04, 5.7220e-06], [ 2.8685e-07, -9.3520e-05, 8.5831e-06, ..., 3.6091e-05, -1.4887e-03, 1.9222e-05]], device='cuda:0') Epoch 66, bias, value: tensor([-0.0088, -0.0294, 0.0103, -0.0181, 0.0140, 0.0039, 0.0186, -0.0078, -0.0258, -0.0019], device='cuda:0'), grad: tensor([ 2.2221e-04, 2.4438e-05, 4.0460e-04, 7.1108e-05, 1.6813e-03, 1.3056e-03, -2.6054e-03, -1.0604e-04, 3.6645e-04, -1.3628e-03], device='cuda:0') 100 0.0001 changing lr epoch 65, time 262.47, cls_loss 0.0121 cls_loss_mapping 0.0172 cls_loss_causal 0.6641 re_mapping 0.0143 re_causal 0.0395 /// teacc 98.53 lr 0.00010000 Epoch 67, weight, value: tensor([[ 0.0107, -0.0969, -0.0484, ..., -0.0902, -0.0537, -0.0965], [ 0.0126, -0.0259, 0.0002, ..., 0.0463, 0.0591, -0.0169], [-0.0484, 0.0546, -0.0798, ..., 0.0461, 0.0230, -0.0023], ..., [-0.0579, -0.0458, -0.0579, ..., -0.0038, -0.0574, 0.0723], [ 0.0595, -0.0099, 0.0192, ..., -0.0098, -0.0763, -0.0034], [-0.0901, -0.0053, 0.0091, ..., -0.0962, 0.0252, -0.0689]], device='cuda:0'), grad: tensor([[ 1.2732e-04, 9.8169e-05, 6.8545e-05, ..., 9.2328e-05, 5.7459e-05, 3.2306e-05], [ 4.0680e-05, 6.1417e-04, 2.5196e-03, ..., 1.2722e-03, 2.9354e-03, 9.9659e-05], [-9.3699e-05, 8.1003e-05, 1.0276e-04, ..., -8.2207e-04, -1.9205e-04, -4.5371e-04], ..., [ 2.8655e-05, 4.9137e-06, 4.9770e-05, ..., 1.8942e-04, 1.2994e-04, 8.4758e-05], [-2.2006e-04, -5.0515e-05, 5.2643e-04, ..., 4.3535e-04, 7.2098e-04, 1.1325e-04], [ 1.2010e-05, 7.8157e-06, -8.6948e-06, ..., 1.7792e-05, -1.0711e-04, 6.0461e-06]], device='cuda:0') Epoch 67, bias, value: tensor([-0.0087, -0.0294, 0.0117, -0.0187, 0.0148, 0.0043, 0.0180, -0.0080, -0.0270, -0.0021], device='cuda:0'), grad: tensor([ 0.0007, 0.0055, -0.0009, 0.0005, 0.0004, 0.0008, -0.0076, 0.0004, 0.0006, -0.0003], device='cuda:0') 100 0.0001 changing lr epoch 66, time 262.55, cls_loss 0.0117 cls_loss_mapping 0.0152 cls_loss_causal 0.6478 re_mapping 0.0134 re_causal 0.0368 /// teacc 98.63 lr 0.00010000 Epoch 68, weight, value: tensor([[ 0.0106, -0.0979, -0.0488, ..., -0.0909, -0.0540, -0.0967], [ 0.0129, -0.0257, 0.0005, ..., 0.0466, 0.0600, -0.0167], [-0.0490, 0.0546, -0.0807, ..., 0.0457, 0.0228, -0.0029], ..., [-0.0577, -0.0460, -0.0582, ..., -0.0040, -0.0591, 0.0730], [ 0.0603, -0.0097, 0.0196, ..., -0.0092, -0.0771, -0.0037], [-0.0907, -0.0056, 0.0086, ..., -0.0964, 0.0261, -0.0694]], device='cuda:0'), grad: tensor([[ 1.0850e-06, 5.0105e-06, 1.0580e-05, ..., 1.2979e-05, 1.9535e-05, 1.5181e-06], [ 8.0764e-06, 2.8476e-05, 1.9088e-05, ..., 6.2406e-05, 1.8731e-05, 1.8448e-05], [ 5.7742e-06, 4.2655e-06, 4.1366e-05, ..., 4.2081e-05, 2.6643e-05, 1.3135e-05], ..., [-6.1691e-05, -2.0134e-04, -2.0313e-04, ..., -4.1270e-04, -2.0719e-04, -1.2201e-04], [ 1.9912e-06, 3.6918e-06, -3.2723e-05, ..., -2.1935e-05, 6.6102e-05, 3.8855e-06], [ 3.9279e-05, 1.2577e-04, 1.8251e-04, ..., 2.8229e-04, -1.3943e-03, 6.3598e-05]], device='cuda:0') Epoch 68, bias, value: tensor([-0.0085, -0.0291, 0.0110, -0.0184, 0.0145, 0.0044, 0.0179, -0.0083, -0.0269, -0.0018], device='cuda:0'), grad: tensor([ 8.3327e-05, 2.9778e-04, 1.9491e-04, 2.2471e-04, 4.4785e-03, -1.6558e-04, 5.5134e-05, -1.6289e-03, 2.6941e-04, -3.8052e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 67---------------------------------------------------- epoch 67, time 279.17, cls_loss 0.0115 cls_loss_mapping 0.0146 cls_loss_causal 0.6184 re_mapping 0.0131 re_causal 0.0375 /// teacc 98.72 lr 0.00010000 Epoch 69, weight, value: tensor([[ 0.0105, -0.0987, -0.0499, ..., -0.0917, -0.0549, -0.0978], [ 0.0132, -0.0263, 0.0008, ..., 0.0462, 0.0605, -0.0172], [-0.0488, 0.0548, -0.0813, ..., 0.0457, 0.0232, -0.0031], ..., [-0.0582, -0.0465, -0.0581, ..., -0.0033, -0.0595, 0.0744], [ 0.0605, -0.0086, 0.0200, ..., -0.0088, -0.0776, -0.0037], [-0.0913, -0.0062, 0.0078, ..., -0.0982, 0.0260, -0.0713]], device='cuda:0'), grad: tensor([[ 9.2834e-06, 1.3255e-05, 4.3493e-07, ..., 3.8855e-06, 4.1008e-05, 2.2165e-07], [ 4.9710e-05, 7.0572e-05, 2.3115e-06, ..., 2.1741e-05, 2.1899e-04, 2.7455e-06], [ 5.0277e-05, 7.0512e-05, 4.1574e-06, ..., 2.1264e-05, 2.2328e-04, 1.6782e-06], ..., [-1.4435e-07, 1.8626e-07, -1.5460e-07, ..., -7.6890e-06, 1.3011e-06, -1.4283e-05], [ 8.3297e-06, 1.1675e-05, -2.5406e-06, ..., 2.7120e-06, 4.0859e-05, 1.3420e-06], [ 1.1167e-06, 4.5914e-07, 2.3246e-06, ..., 5.1446e-06, 2.0508e-06, 4.7274e-06]], device='cuda:0') Epoch 69, bias, value: tensor([-0.0091, -0.0291, 0.0106, -0.0184, 0.0141, 0.0046, 0.0189, -0.0075, -0.0267, -0.0025], device='cuda:0'), grad: tensor([ 5.0694e-05, 3.0231e-04, 3.0732e-04, 9.9316e-06, 6.3062e-05, 8.2105e-06, -7.9536e-04, -2.0072e-05, 5.8085e-05, 1.6108e-05], device='cuda:0') 100 0.0001 changing lr epoch 68, time 262.79, cls_loss 0.0095 cls_loss_mapping 0.0138 cls_loss_causal 0.6330 re_mapping 0.0128 re_causal 0.0384 /// teacc 98.72 lr 0.00010000 Epoch 70, weight, value: tensor([[ 0.0106, -0.0991, -0.0501, ..., -0.0921, -0.0563, -0.0980], [ 0.0131, -0.0265, 0.0007, ..., 0.0457, 0.0605, -0.0170], [-0.0494, 0.0559, -0.0814, ..., 0.0465, 0.0233, -0.0028], ..., [-0.0584, -0.0468, -0.0580, ..., -0.0030, -0.0596, 0.0745], [ 0.0613, -0.0094, 0.0203, ..., -0.0093, -0.0780, -0.0045], [-0.0918, -0.0065, 0.0070, ..., -0.0991, 0.0266, -0.0716]], device='cuda:0'), grad: tensor([[ 2.1458e-06, 5.7649e-07, 6.2920e-06, ..., 7.2084e-06, 6.8061e-06, 1.3970e-07], [ 2.1793e-07, -1.3839e-06, 2.1923e-04, ..., 1.3769e-04, 5.0068e-04, 8.1304e-07], [ 8.0019e-06, 3.1255e-06, 4.6730e-05, ..., 1.6004e-05, -2.8014e-05, 1.0412e-06], ..., [ 2.5518e-07, 3.1479e-07, 1.8448e-05, ..., 1.0669e-05, 4.6760e-05, -2.5835e-06], [-1.6272e-05, -1.0682e-06, -6.7353e-05, ..., -8.8453e-05, 3.1620e-05, 2.1886e-07], [ 2.9597e-06, 1.3970e-07, 1.0604e-04, ..., 9.7871e-05, 2.0182e-04, 2.0005e-06]], device='cuda:0') Epoch 70, bias, value: tensor([-0.0097, -0.0292, 0.0109, -0.0177, 0.0145, 0.0040, 0.0189, -0.0077, -0.0268, -0.0024], device='cuda:0'), grad: tensor([ 2.0742e-05, 7.8773e-04, 4.0293e-05, 1.0264e-04, -1.1959e-03, 2.0817e-05, 4.0047e-08, 7.2718e-05, -1.1039e-04, 2.5940e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 69---------------------------------------------------- epoch 69, time 279.29, cls_loss 0.0092 cls_loss_mapping 0.0142 cls_loss_causal 0.6208 re_mapping 0.0126 re_causal 0.0373 /// teacc 98.74 lr 0.00010000 Epoch 71, weight, value: tensor([[ 0.0107, -0.0996, -0.0504, ..., -0.0926, -0.0570, -0.0981], [ 0.0130, -0.0270, 0.0007, ..., 0.0454, 0.0609, -0.0174], [-0.0497, 0.0566, -0.0814, ..., 0.0468, 0.0244, -0.0030], ..., [-0.0587, -0.0472, -0.0579, ..., -0.0030, -0.0603, 0.0754], [ 0.0615, -0.0091, 0.0207, ..., -0.0090, -0.0785, -0.0046], [-0.0919, -0.0068, 0.0068, ..., -0.0994, 0.0265, -0.0723]], device='cuda:0'), grad: tensor([[ 1.2172e-06, 1.3746e-05, 2.4159e-06, ..., 3.0264e-05, -4.7565e-05, 2.4047e-06], [ 8.2981e-07, 1.2117e-06, -1.3411e-05, ..., -1.4948e-06, -1.3120e-05, 3.2187e-06], [ 3.8743e-06, -1.2970e-04, 9.6038e-06, ..., -2.4700e-04, -3.6573e-04, 8.1584e-06], ..., [-1.9684e-05, 1.3866e-05, -1.0148e-05, ..., -1.1355e-05, 4.9442e-05, -4.8995e-05], [ 2.5649e-06, 8.5458e-06, -2.4289e-05, ..., -1.9312e-05, 3.6597e-05, 2.7120e-06], [-5.4054e-06, 3.9041e-05, -3.6001e-05, ..., 1.0544e-04, 1.0449e-04, 2.3276e-05]], device='cuda:0') Epoch 71, bias, value: tensor([-0.0098, -0.0292, 0.0113, -0.0175, 0.0138, 0.0041, 0.0189, -0.0076, -0.0264, -0.0026], device='cuda:0'), grad: tensor([-1.2362e-04, 5.9381e-06, -8.1539e-04, 2.6727e-04, 8.8990e-05, 1.5008e-04, 1.2672e-04, -8.9526e-05, 1.9476e-05, 3.7003e-04], device='cuda:0') 100 0.0001 changing lr epoch 70, time 262.26, cls_loss 0.0097 cls_loss_mapping 0.0152 cls_loss_causal 0.6165 re_mapping 0.0123 re_causal 0.0365 /// teacc 98.65 lr 0.00010000 Epoch 72, weight, value: tensor([[ 0.0108, -0.1001, -0.0508, ..., -0.0934, -0.0572, -0.0985], [ 0.0137, -0.0269, 0.0015, ..., 0.0458, 0.0620, -0.0170], [-0.0503, 0.0566, -0.0816, ..., 0.0470, 0.0240, -0.0037], ..., [-0.0589, -0.0472, -0.0578, ..., -0.0026, -0.0603, 0.0764], [ 0.0613, -0.0090, 0.0211, ..., -0.0087, -0.0792, -0.0049], [-0.0919, -0.0070, 0.0061, ..., -0.1005, 0.0263, -0.0733]], device='cuda:0'), grad: tensor([[ 9.1270e-06, 1.6578e-07, 1.2159e-05, ..., 1.9595e-05, 5.2340e-06, 3.1479e-07], [ 2.3082e-05, 2.2843e-05, 6.9141e-06, ..., 3.6478e-05, 5.3197e-05, 1.3381e-05], [ 1.0765e-04, -2.3484e-05, 8.6188e-05, ..., 1.8549e-04, -1.0066e-05, 2.6971e-06], ..., [ 1.8358e-05, 1.5199e-06, 2.2173e-05, ..., 3.4779e-05, 3.1441e-05, 3.4012e-06], [ 2.8089e-05, 4.6752e-07, 6.8963e-05, ..., 6.9916e-05, 4.0174e-05, 1.4296e-06], [ 1.1075e-04, 1.8347e-07, 3.0398e-04, ..., 2.8706e-04, 2.3127e-04, 1.2582e-06]], device='cuda:0') Epoch 72, bias, value: tensor([-0.0096, -0.0286, 0.0112, -0.0188, 0.0138, 0.0042, 0.0189, -0.0069, -0.0264, -0.0032], device='cuda:0'), grad: tensor([ 3.0667e-05, 1.0592e-04, 2.4772e-04, -1.2627e-03, -3.6025e-04, 2.7752e-04, 1.8999e-05, 7.6234e-05, 1.4126e-04, 7.2527e-04], device='cuda:0') 100 0.0001 changing lr epoch 71, time 262.52, cls_loss 0.0098 cls_loss_mapping 0.0141 cls_loss_causal 0.6240 re_mapping 0.0128 re_causal 0.0356 /// teacc 98.70 lr 0.00010000 Epoch 73, weight, value: tensor([[ 0.0108, -0.1010, -0.0522, ..., -0.0946, -0.0570, -0.0990], [ 0.0138, -0.0269, 0.0020, ..., 0.0462, 0.0628, -0.0179], [-0.0520, 0.0563, -0.0828, ..., 0.0464, 0.0238, -0.0040], ..., [-0.0589, -0.0474, -0.0580, ..., -0.0027, -0.0609, 0.0765], [ 0.0627, -0.0082, 0.0216, ..., -0.0081, -0.0802, -0.0055], [-0.0924, -0.0069, 0.0055, ..., -0.1016, 0.0266, -0.0738]], device='cuda:0'), grad: tensor([[ 1.4342e-05, 7.4133e-06, 4.5925e-05, ..., 4.3422e-05, 1.0085e-04, 5.0180e-06], [ 1.0235e-06, -8.6904e-05, -7.0274e-05, ..., -4.8065e-04, -5.3310e-04, 2.8815e-06], [ 6.0439e-05, 6.6102e-05, 1.8537e-04, ..., 4.2748e-04, 3.4285e-04, 1.8552e-05], ..., [ 3.7253e-07, 2.1666e-05, 1.1221e-05, ..., 1.0121e-04, 1.0884e-04, -4.2133e-06], [-1.0204e-04, -3.1024e-05, -3.8481e-04, ..., -2.9778e-04, -1.2666e-07, -3.3140e-05], [ 4.1053e-06, 3.1162e-06, 7.9155e-05, ..., 7.1406e-05, -4.6283e-05, 5.2154e-06]], device='cuda:0') Epoch 73, bias, value: tensor([-0.0093, -0.0285, 0.0105, -0.0184, 0.0140, 0.0043, 0.0185, -0.0070, -0.0260, -0.0032], device='cuda:0'), grad: tensor([ 2.5344e-04, -6.4945e-04, 6.6996e-04, 3.0923e-04, 7.8380e-05, -1.3542e-04, -1.0651e-04, 1.4830e-04, -6.8092e-04, 1.1259e-04], device='cuda:0') 100 0.0001 changing lr epoch 72, time 262.54, cls_loss 0.0092 cls_loss_mapping 0.0138 cls_loss_causal 0.6373 re_mapping 0.0124 re_causal 0.0368 /// teacc 98.73 lr 0.00010000 Epoch 74, weight, value: tensor([[ 0.0108, -0.1016, -0.0526, ..., -0.0954, -0.0569, -0.0998], [ 0.0139, -0.0278, 0.0030, ..., 0.0460, 0.0633, -0.0181], [-0.0522, 0.0570, -0.0835, ..., 0.0464, 0.0241, -0.0042], ..., [-0.0590, -0.0469, -0.0587, ..., -0.0022, -0.0614, 0.0773], [ 0.0628, -0.0081, 0.0216, ..., -0.0081, -0.0815, -0.0059], [-0.0926, -0.0076, 0.0046, ..., -0.1024, 0.0267, -0.0746]], device='cuda:0'), grad: tensor([[ 3.3788e-06, 4.5672e-06, 1.4178e-05, ..., 2.0131e-05, 2.2650e-05, 1.0803e-07], [ 9.3412e-07, 1.2536e-06, -1.0729e-06, ..., 3.3546e-06, 1.0759e-05, 6.2026e-07], [ 3.5893e-06, -1.4808e-06, 2.3067e-05, ..., 1.0684e-05, -4.1686e-06, -8.3819e-08], ..., [ 1.8114e-06, 3.7365e-06, 2.8357e-05, ..., 3.2812e-05, 1.3733e-04, -4.2208e-06], [ 1.8757e-06, -1.5154e-05, -1.1092e-04, ..., -1.6105e-04, 1.8284e-05, 2.8312e-07], [ 8.0466e-06, 1.1427e-06, 8.1003e-05, ..., 9.7394e-05, 1.0085e-04, 1.8943e-06]], device='cuda:0') Epoch 74, bias, value: tensor([-0.0090, -0.0285, 0.0101, -0.0183, 0.0141, 0.0046, 0.0187, -0.0066, -0.0266, -0.0035], device='cuda:0'), grad: tensor([ 8.9824e-05, 7.4089e-05, 7.0751e-05, -6.3610e-04, -8.4257e-04, 1.0699e-04, 4.1175e-04, 5.6791e-04, -4.4131e-04, 5.9891e-04], device='cuda:0') 100 0.0001 changing lr epoch 73, time 262.44, cls_loss 0.0101 cls_loss_mapping 0.0180 cls_loss_causal 0.6429 re_mapping 0.0122 re_causal 0.0352 /// teacc 98.70 lr 0.00010000 Epoch 75, weight, value: tensor([[ 0.0119, -0.1025, -0.0519, ..., -0.0960, -0.0561, -0.1006], [ 0.0137, -0.0285, 0.0035, ..., 0.0462, 0.0633, -0.0182], [-0.0524, 0.0575, -0.0838, ..., 0.0464, 0.0247, -0.0047], ..., [-0.0586, -0.0464, -0.0592, ..., -0.0019, -0.0623, 0.0782], [ 0.0631, -0.0082, 0.0226, ..., -0.0076, -0.0815, -0.0062], [-0.0927, -0.0083, 0.0043, ..., -0.1030, 0.0271, -0.0753]], device='cuda:0'), grad: tensor([[ 1.0245e-07, 6.8061e-06, 1.3905e-06, ..., 5.8711e-06, 2.4036e-05, 4.6566e-06], [ 2.5053e-07, 1.3843e-05, -9.4324e-06, ..., 2.7921e-06, -1.3180e-05, 1.0304e-05], [ 3.1572e-07, 3.3081e-05, 9.1642e-06, ..., 3.1680e-05, 4.8608e-05, 2.5287e-05], ..., [ 3.8464e-07, -8.7202e-05, 5.8785e-06, ..., -6.8903e-05, 7.3791e-05, -2.4274e-05], [ 9.1456e-07, -3.4850e-06, -6.2063e-06, ..., -9.2387e-06, 1.0557e-05, 4.1723e-06], [ 2.9150e-07, 6.7279e-06, 5.0776e-06, ..., 1.9267e-05, -2.2244e-04, -4.1038e-05]], device='cuda:0') Epoch 75, bias, value: tensor([-0.0082, -0.0287, 0.0098, -0.0191, 0.0142, 0.0045, 0.0172, -0.0061, -0.0253, -0.0037], device='cuda:0'), grad: tensor([ 5.0426e-05, 2.2814e-05, 1.5867e-04, 4.7326e-05, 9.7156e-05, 3.9369e-05, -2.6412e-06, -4.5121e-05, 1.6183e-05, -3.8433e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 74---------------------------------------------------- epoch 74, time 279.07, cls_loss 0.0071 cls_loss_mapping 0.0107 cls_loss_causal 0.6133 re_mapping 0.0120 re_causal 0.0369 /// teacc 98.82 lr 0.00010000 Epoch 76, weight, value: tensor([[ 0.0119, -0.1031, -0.0528, ..., -0.0968, -0.0564, -0.1009], [ 0.0140, -0.0292, 0.0039, ..., 0.0458, 0.0634, -0.0187], [-0.0524, 0.0583, -0.0845, ..., 0.0465, 0.0248, -0.0052], ..., [-0.0588, -0.0467, -0.0593, ..., -0.0015, -0.0620, 0.0789], [ 0.0628, -0.0085, 0.0226, ..., -0.0077, -0.0827, -0.0064], [-0.0929, -0.0086, 0.0040, ..., -0.1035, 0.0273, -0.0757]], device='cuda:0'), grad: tensor([[ 1.5832e-07, 4.1425e-06, 3.6582e-06, ..., 1.1332e-05, 4.7684e-06, 8.4657e-07], [-1.4625e-05, -8.0606e-07, -1.8275e-04, ..., -3.1567e-04, -3.2496e-04, 9.1270e-06], [ 1.0744e-05, -1.9014e-05, 1.1963e-04, ..., 2.0993e-04, 1.7595e-04, 3.8922e-05], ..., [ 2.8983e-06, -2.6021e-06, 4.3392e-05, ..., 1.7673e-05, 9.8646e-05, -6.9857e-05], [ 6.8033e-07, 3.9488e-06, 1.7703e-05, ..., 2.3007e-05, 1.5870e-05, 4.6343e-06], [ 2.8545e-07, 1.3523e-06, 1.2763e-05, ..., -9.4548e-06, -4.7505e-05, 6.4000e-06]], device='cuda:0') Epoch 76, bias, value: tensor([-0.0084, -0.0290, 0.0097, -0.0192, 0.0142, 0.0051, 0.0177, -0.0057, -0.0259, -0.0039], device='cuda:0'), grad: tensor([-1.3657e-05, -4.1151e-04, 2.9802e-04, 1.9395e-04, 4.3631e-05, -2.6393e-04, 2.8282e-05, 8.2433e-05, 9.0659e-05, -4.8131e-05], device='cuda:0') 100 0.0001 changing lr epoch 75, time 262.72, cls_loss 0.0100 cls_loss_mapping 0.0139 cls_loss_causal 0.6361 re_mapping 0.0124 re_causal 0.0357 /// teacc 98.76 lr 0.00010000 Epoch 77, weight, value: tensor([[ 0.0123, -0.1058, -0.0536, ..., -0.0981, -0.0569, -0.1014], [ 0.0146, -0.0299, 0.0039, ..., 0.0456, 0.0636, -0.0195], [-0.0523, 0.0588, -0.0851, ..., 0.0463, 0.0251, -0.0057], ..., [-0.0597, -0.0473, -0.0592, ..., -0.0012, -0.0635, 0.0800], [ 0.0628, -0.0081, 0.0236, ..., -0.0073, -0.0831, -0.0066], [-0.0935, -0.0068, 0.0036, ..., -0.1029, 0.0283, -0.0762]], device='cuda:0'), grad: tensor([[-6.0759e-06, 5.7459e-05, 2.0508e-06, ..., 3.2689e-06, 5.9664e-05, 2.2888e-05], [ 3.3546e-06, 8.0392e-06, -1.4044e-05, ..., -1.1422e-05, -3.2961e-05, 2.6032e-05], [ 1.1837e-06, -1.9264e-04, 6.2473e-06, ..., 3.7309e-06, -2.1994e-04, 9.6321e-05], ..., [ 1.1008e-06, 9.1255e-05, 1.6615e-05, ..., -1.2703e-05, -8.1658e-05, -3.0003e-03], [ 2.2892e-06, 2.2024e-05, -1.0252e-05, ..., -9.6858e-06, 3.7402e-05, 1.1019e-05], [ 9.1642e-07, 5.0366e-06, 7.3276e-06, ..., 1.3471e-05, 1.1466e-05, 2.8059e-05]], device='cuda:0') Epoch 77, bias, value: tensor([-0.0089, -0.0292, 0.0093, -0.0193, 0.0146, 0.0049, 0.0176, -0.0058, -0.0254, -0.0033], device='cuda:0'), grad: tensor([ 1.7786e-04, 3.9279e-05, -2.7776e-04, 6.8784e-05, 6.2370e-03, 4.9472e-05, 2.3171e-05, -6.5079e-03, 9.7871e-05, 9.3102e-05], device='cuda:0') 100 0.0001 changing lr epoch 76, time 262.28, cls_loss 0.0080 cls_loss_mapping 0.0124 cls_loss_causal 0.5894 re_mapping 0.0125 re_causal 0.0347 /// teacc 98.81 lr 0.00010000 Epoch 78, weight, value: tensor([[ 0.0125, -0.1061, -0.0541, ..., -0.0987, -0.0572, -0.1017], [ 0.0152, -0.0304, 0.0041, ..., 0.0457, 0.0639, -0.0198], [-0.0527, 0.0599, -0.0849, ..., 0.0469, 0.0258, -0.0058], ..., [-0.0601, -0.0475, -0.0591, ..., -0.0014, -0.0647, 0.0809], [ 0.0629, -0.0090, 0.0232, ..., -0.0082, -0.0846, -0.0068], [-0.0938, -0.0062, 0.0030, ..., -0.1029, 0.0288, -0.0765]], device='cuda:0'), grad: tensor([[-1.0073e-05, 2.3004e-06, -2.0750e-06, ..., 2.1085e-05, 4.5300e-06, 6.2166e-07], [ 1.2582e-06, 1.3340e-04, 2.3603e-04, ..., 1.2188e-03, 5.6362e-04, 1.0721e-05], [-4.3288e-06, 1.6439e-04, 3.3355e-04, ..., 1.7185e-03, 7.4768e-04, 2.3484e-05], ..., [ 1.3383e-06, -3.6740e-04, -6.6328e-04, ..., -3.4561e-03, -1.5421e-03, -8.5413e-05], [ 5.9232e-06, 1.9863e-05, -6.7391e-06, ..., 2.7850e-05, 3.9548e-05, 2.3674e-06], [ 2.7642e-06, 2.9299e-06, 1.4968e-05, ..., 4.8608e-05, 1.0177e-05, 3.2216e-05]], device='cuda:0') Epoch 78, bias, value: tensor([-0.0088, -0.0291, 0.0098, -0.0195, 0.0141, 0.0056, 0.0178, -0.0059, -0.0265, -0.0030], device='cuda:0'), grad: tensor([-1.0580e-05, 2.1267e-03, 2.9736e-03, 6.2323e-04, 3.8087e-05, 4.8369e-05, 2.3127e-05, -5.9700e-03, 4.9591e-05, 9.3818e-05], device='cuda:0') 100 0.0001 changing lr epoch 77, time 262.48, cls_loss 0.0071 cls_loss_mapping 0.0116 cls_loss_causal 0.6216 re_mapping 0.0121 re_causal 0.0357 /// teacc 98.61 lr 0.00010000 Epoch 79, weight, value: tensor([[ 0.0125, -0.1066, -0.0539, ..., -0.0993, -0.0567, -0.1023], [ 0.0153, -0.0313, 0.0039, ..., 0.0454, 0.0638, -0.0201], [-0.0527, 0.0613, -0.0851, ..., 0.0478, 0.0271, -0.0062], ..., [-0.0602, -0.0481, -0.0588, ..., -0.0012, -0.0651, 0.0817], [ 0.0630, -0.0093, 0.0236, ..., -0.0080, -0.0857, -0.0066], [-0.0939, -0.0065, 0.0028, ..., -0.1037, 0.0288, -0.0778]], device='cuda:0'), grad: tensor([[ 1.4175e-06, 9.5367e-07, 7.3761e-06, ..., 7.9349e-06, 1.3337e-05, 2.0675e-07], [ 1.6978e-06, 4.5896e-06, -3.3319e-05, ..., -2.1398e-05, -6.0707e-05, 6.2212e-07], [ 1.2435e-05, -2.3693e-05, 4.3690e-05, ..., 1.0943e-06, -1.3679e-05, 1.7984e-06], ..., [ 5.2750e-06, 1.7926e-05, 6.6347e-06, ..., 2.6673e-05, 2.8118e-05, -1.8645e-06], [ 3.1888e-05, 1.3001e-06, 8.2254e-05, ..., 7.7963e-05, 4.6581e-05, 4.3912e-07], [ 2.6114e-06, 1.0990e-06, 6.7428e-06, ..., 7.5847e-06, 5.2713e-06, 1.9819e-06]], device='cuda:0') Epoch 79, bias, value: tensor([-0.0075, -0.0298, 0.0107, -0.0198, 0.0133, 0.0052, 0.0180, -0.0053, -0.0268, -0.0035], device='cuda:0'), grad: tensor([ 2.8133e-05, -7.5042e-05, 4.3750e-05, -2.0206e-04, -2.8163e-05, -2.9597e-06, -1.5423e-05, 4.6462e-05, 1.8322e-04, 2.2054e-05], device='cuda:0') 100 0.0001 changing lr epoch 78, time 262.52, cls_loss 0.0088 cls_loss_mapping 0.0117 cls_loss_causal 0.6554 re_mapping 0.0115 re_causal 0.0344 /// teacc 98.68 lr 0.00010000 Epoch 80, weight, value: tensor([[ 0.0126, -0.1073, -0.0544, ..., -0.1000, -0.0569, -0.1037], [ 0.0152, -0.0317, 0.0043, ..., 0.0458, 0.0637, -0.0206], [-0.0529, 0.0622, -0.0857, ..., 0.0479, 0.0277, -0.0063], ..., [-0.0602, -0.0487, -0.0594, ..., -0.0020, -0.0661, 0.0817], [ 0.0629, -0.0095, 0.0236, ..., -0.0081, -0.0871, -0.0067], [-0.0942, -0.0065, 0.0032, ..., -0.1038, 0.0297, -0.0787]], device='cuda:0'), grad: tensor([[ 8.2422e-08, 4.0047e-06, 6.5863e-06, ..., 1.5989e-05, 4.4316e-05, 2.1514e-06], [ 2.2165e-07, 2.6934e-06, -4.2558e-05, ..., -4.2319e-05, -1.1659e-04, 5.0897e-07], [-5.0431e-07, -2.1771e-05, 8.1956e-06, ..., -1.9029e-05, -7.0557e-06, 1.9139e-07], ..., [ 5.4762e-07, 5.4017e-06, 8.8811e-06, ..., 1.5780e-05, 3.4094e-05, 1.1828e-06], [ 6.4261e-07, -1.7090e-07, -1.4015e-05, ..., -2.2918e-05, 3.4511e-05, 3.7765e-07], [ 2.5611e-07, 2.1271e-06, 3.1348e-06, ..., -4.6305e-06, -7.3433e-05, 9.9000e-07]], device='cuda:0') Epoch 80, bias, value: tensor([-0.0071, -0.0301, 0.0109, -0.0193, 0.0130, 0.0052, 0.0179, -0.0058, -0.0273, -0.0029], device='cuda:0'), grad: tensor([ 1.2743e-04, -1.4830e-04, -1.0557e-05, 2.3842e-05, 1.0782e-04, 1.2852e-05, -4.4554e-05, 1.2141e-04, 8.7321e-06, -1.9872e-04], device='cuda:0') 100 0.0001 changing lr epoch 79, time 262.30, cls_loss 0.0083 cls_loss_mapping 0.0117 cls_loss_causal 0.6296 re_mapping 0.0118 re_causal 0.0347 /// teacc 98.65 lr 0.00010000 Epoch 81, weight, value: tensor([[ 0.0138, -0.1080, -0.0549, ..., -0.1005, -0.0571, -0.1046], [ 0.0152, -0.0300, 0.0050, ..., 0.0465, 0.0649, -0.0208], [-0.0522, 0.0620, -0.0869, ..., 0.0477, 0.0274, -0.0073], ..., [-0.0605, -0.0490, -0.0596, ..., -0.0019, -0.0667, 0.0832], [ 0.0629, -0.0093, 0.0239, ..., -0.0077, -0.0877, -0.0070], [-0.0946, -0.0065, 0.0033, ..., -0.1045, 0.0296, -0.0790]], device='cuda:0'), grad: tensor([[-1.2778e-05, 7.4180e-07, -2.9624e-05, ..., 3.2373e-06, -1.4915e-03, 6.6590e-08], [ 1.4044e-06, 1.2126e-06, 1.1422e-05, ..., 1.6138e-05, 9.0003e-06, 7.0175e-07], [ 1.9874e-06, -3.0309e-05, 1.5303e-05, ..., -8.1241e-05, -8.7023e-05, 3.0734e-07], ..., [ 4.3167e-07, 1.4277e-06, 1.9863e-05, ..., 1.6898e-05, 9.4846e-06, -2.9542e-06], [ 8.9109e-06, 4.3884e-06, 2.4885e-05, ..., 2.2635e-05, 7.9036e-05, 3.5949e-07], [ 1.2144e-06, 2.8703e-06, 6.1616e-06, ..., 7.6056e-05, -3.8967e-06, 3.5316e-06]], device='cuda:0') Epoch 81, bias, value: tensor([-0.0066, -0.0295, 0.0105, -0.0194, 0.0128, 0.0044, 0.0183, -0.0056, -0.0272, -0.0031], device='cuda:0'), grad: tensor([-2.9602e-03, 3.1263e-05, -3.5197e-05, 1.0693e-04, 9.2328e-05, -5.4836e-04, 2.6894e-03, 4.3720e-05, 2.3437e-04, 3.4356e-04], device='cuda:0') 100 0.0001 changing lr epoch 80, time 262.42, cls_loss 0.0070 cls_loss_mapping 0.0097 cls_loss_causal 0.5968 re_mapping 0.0113 re_causal 0.0345 /// teacc 98.80 lr 0.00010000 Epoch 82, weight, value: tensor([[ 0.0140, -0.1082, -0.0550, ..., -0.1011, -0.0564, -0.1050], [ 0.0152, -0.0301, 0.0052, ..., 0.0466, 0.0647, -0.0212], [-0.0519, 0.0624, -0.0872, ..., 0.0480, 0.0278, -0.0072], ..., [-0.0607, -0.0492, -0.0600, ..., -0.0019, -0.0673, 0.0838], [ 0.0629, -0.0096, 0.0240, ..., -0.0079, -0.0884, -0.0074], [-0.0948, -0.0064, 0.0035, ..., -0.1048, 0.0302, -0.0794]], device='cuda:0'), grad: tensor([[ 1.3830e-07, 2.0247e-06, 5.0291e-07, ..., 5.6922e-06, 2.8647e-06, 1.9614e-06], [ 9.6858e-08, 7.6294e-06, -9.4809e-07, ..., 1.9416e-05, 2.8722e-06, 6.6943e-06], [ 6.8452e-08, -1.3781e-04, 2.4457e-06, ..., -3.5167e-04, -5.6624e-05, -9.0182e-05], ..., [ 3.0966e-07, 3.0071e-05, 1.6922e-06, ..., 5.9694e-05, 1.5646e-05, -5.9493e-06], [ 6.0163e-07, 7.6473e-05, 3.2224e-06, ..., 2.0492e-04, 3.3319e-05, 5.4121e-05], [ 6.1654e-07, 4.6194e-07, 3.3937e-06, ..., 1.3441e-05, 1.9029e-05, 1.3165e-05]], device='cuda:0') Epoch 82, bias, value: tensor([-0.0057, -0.0298, 0.0107, -0.0200, 0.0125, 0.0050, 0.0181, -0.0059, -0.0275, -0.0027], device='cuda:0'), grad: tensor([ 1.1876e-05, 2.6450e-05, -4.0627e-04, -1.5423e-05, -1.2435e-05, -7.0632e-05, 5.0217e-05, 4.6581e-05, 2.8062e-04, 8.8990e-05], device='cuda:0') 100 0.0001 changing lr epoch 81, time 262.41, cls_loss 0.0073 cls_loss_mapping 0.0104 cls_loss_causal 0.5684 re_mapping 0.0114 re_causal 0.0318 /// teacc 98.79 lr 0.00010000 Epoch 83, weight, value: tensor([[ 0.0139, -0.1092, -0.0557, ..., -0.1021, -0.0566, -0.1057], [ 0.0161, -0.0300, 0.0056, ..., 0.0468, 0.0655, -0.0210], [-0.0520, 0.0629, -0.0876, ..., 0.0478, 0.0280, -0.0073], ..., [-0.0618, -0.0494, -0.0604, ..., -0.0016, -0.0679, 0.0841], [ 0.0632, -0.0094, 0.0246, ..., -0.0077, -0.0891, -0.0075], [-0.0952, -0.0064, 0.0032, ..., -0.1054, 0.0307, -0.0797]], device='cuda:0'), grad: tensor([[-2.6554e-05, -2.9169e-06, 1.5218e-06, ..., 2.1886e-06, 1.3560e-05, 1.0012e-07], [ 6.2622e-06, -1.3568e-05, 2.7269e-06, ..., 1.8347e-07, 2.2620e-05, 1.6525e-05], [ 2.5872e-06, 9.4920e-06, 1.2919e-05, ..., 1.8597e-05, 4.7088e-05, 7.1060e-07], ..., [ 7.2597e-07, 6.2492e-07, -5.5730e-06, ..., -3.3766e-05, 1.5289e-05, -2.3350e-05], [ 2.4121e-06, 7.0874e-07, 2.2352e-06, ..., 1.8375e-06, 2.1085e-05, 4.9826e-07], [ 2.1040e-05, 9.9745e-07, 8.2478e-06, ..., 1.4044e-05, 4.5925e-05, 1.8338e-06]], device='cuda:0') Epoch 83, bias, value: tensor([-0.0053, -0.0294, 0.0102, -0.0197, 0.0120, 0.0050, 0.0178, -0.0059, -0.0273, -0.0026], device='cuda:0'), grad: tensor([-3.8266e-04, 1.7059e-04, 8.6010e-05, 9.5546e-05, -6.1083e-04, 1.9252e-05, 3.4642e-04, -2.0012e-05, -7.0286e-04, 9.9850e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 82---------------------------------------------------- epoch 82, time 279.62, cls_loss 0.0091 cls_loss_mapping 0.0129 cls_loss_causal 0.6147 re_mapping 0.0113 re_causal 0.0320 /// teacc 98.83 lr 0.00010000 Epoch 84, weight, value: tensor([[ 0.0141, -0.1082, -0.0561, ..., -0.1028, -0.0569, -0.1061], [ 0.0160, -0.0301, 0.0052, ..., 0.0461, 0.0651, -0.0223], [-0.0520, 0.0634, -0.0880, ..., 0.0481, 0.0284, -0.0071], ..., [-0.0621, -0.0499, -0.0601, ..., -0.0019, -0.0673, 0.0843], [ 0.0634, -0.0089, 0.0254, ..., -0.0064, -0.0899, -0.0061], [-0.0955, -0.0072, 0.0029, ..., -0.1062, 0.0292, -0.0800]], device='cuda:0'), grad: tensor([[ 4.2375e-08, 1.9027e-06, 4.0382e-06, ..., 8.6576e-06, 3.2872e-05, 1.2852e-07], [ 4.3306e-08, 1.2973e-06, -2.8610e-05, ..., -2.5705e-05, -4.1068e-05, 1.2945e-07], [ 1.8673e-07, -3.5375e-05, 1.1645e-05, ..., -4.1872e-05, -3.7700e-05, 3.0780e-07], ..., [ 5.1688e-08, 2.3663e-05, 7.7561e-06, ..., 3.8892e-05, 3.8207e-05, -5.0552e-06], [-1.2010e-05, 2.9672e-06, -9.8228e-05, ..., -1.0014e-04, 2.8595e-05, 4.1630e-07], [ 5.8208e-08, 4.3120e-07, 1.0008e-04, ..., 1.4246e-04, 1.4997e-04, 2.9113e-06]], device='cuda:0') Epoch 84, bias, value: tensor([-0.0045, -0.0304, 0.0102, -0.0193, 0.0131, 0.0051, 0.0176, -0.0058, -0.0265, -0.0039], device='cuda:0'), grad: tensor([ 3.1447e-04, -4.8578e-05, -7.2956e-05, 2.3711e-04, -1.7002e-05, -6.6566e-04, -3.2568e-04, 7.6056e-05, -8.7440e-05, 5.8889e-04], device='cuda:0') 100 0.0001 changing lr epoch 83, time 262.22, cls_loss 0.0067 cls_loss_mapping 0.0091 cls_loss_causal 0.6275 re_mapping 0.0108 re_causal 0.0332 /// teacc 98.66 lr 0.00010000 Epoch 85, weight, value: tensor([[ 0.0141, -0.1085, -0.0564, ..., -0.1033, -0.0574, -0.1064], [ 0.0160, -0.0301, 0.0062, ..., 0.0471, 0.0659, -0.0224], [-0.0517, 0.0623, -0.0887, ..., 0.0469, 0.0279, -0.0098], ..., [-0.0625, -0.0509, -0.0609, ..., -0.0027, -0.0684, 0.0850], [ 0.0633, -0.0089, 0.0251, ..., -0.0069, -0.0920, -0.0062], [-0.0958, -0.0074, 0.0026, ..., -0.1070, 0.0294, -0.0803]], device='cuda:0'), grad: tensor([[-6.3516e-07, -5.3272e-07, 2.4997e-06, ..., 5.4352e-06, 9.7081e-06, 5.3719e-06], [ 2.2054e-06, 8.4471e-07, 6.0678e-05, ..., 1.2517e-04, 3.2663e-05, 1.3888e-04], [ 2.3730e-06, -1.6978e-06, 1.4059e-05, ..., 2.3335e-05, 5.7854e-06, 2.1055e-05], ..., [ 4.5309e-07, 8.5356e-07, -4.3344e-04, ..., -8.8930e-04, -2.2531e-04, -9.9659e-04], [ 3.7104e-06, 2.6189e-06, 1.1966e-05, ..., 2.6777e-05, 1.3143e-05, 3.1292e-05], [ 2.2911e-06, 1.3076e-06, 2.4962e-04, ..., 5.1117e-04, 1.1313e-04, 5.6934e-04]], device='cuda:0') Epoch 85, bias, value: tensor([-0.0048, -0.0296, 0.0092, -0.0173, 0.0129, 0.0044, 0.0192, -0.0062, -0.0278, -0.0039], device='cuda:0'), grad: tensor([-1.1855e-04, 3.9911e-04, 7.5519e-05, 5.9223e-04, 6.8843e-05, 2.5138e-05, 2.3982e-07, -2.7485e-03, 1.0973e-04, 1.5984e-03], device='cuda:0') 100 0.0001 changing lr epoch 84, time 262.17, cls_loss 0.0080 cls_loss_mapping 0.0114 cls_loss_causal 0.6499 re_mapping 0.0106 re_causal 0.0328 /// teacc 98.79 lr 0.00010000 Epoch 86, weight, value: tensor([[ 0.0139, -0.1088, -0.0565, ..., -0.1037, -0.0572, -0.1071], [ 0.0162, -0.0304, 0.0070, ..., 0.0482, 0.0663, -0.0206], [-0.0519, 0.0628, -0.0893, ..., 0.0470, 0.0284, -0.0098], ..., [-0.0624, -0.0514, -0.0615, ..., -0.0036, -0.0694, 0.0850], [ 0.0632, -0.0084, 0.0253, ..., -0.0065, -0.0925, -0.0064], [-0.0965, -0.0080, 0.0017, ..., -0.1086, 0.0288, -0.0816]], device='cuda:0'), grad: tensor([[ 3.4366e-07, 9.9912e-06, 1.1899e-05, ..., 2.3305e-05, 1.8388e-05, 7.3016e-07], [ 1.2899e-07, 4.6223e-05, 9.3877e-06, ..., 7.3910e-05, 8.0824e-05, 3.3379e-06], [ 4.4936e-07, -1.2088e-04, 2.6852e-05, ..., -1.5187e-04, -1.9193e-04, 2.7213e-06], ..., [ 1.5646e-07, 4.2431e-06, 6.3442e-06, ..., -1.4290e-05, 1.2688e-05, -1.9774e-05], [ 2.6338e-06, 3.2157e-05, -2.3380e-05, ..., 2.5615e-05, 4.7445e-05, 2.4326e-06], [ 1.1493e-06, 7.9870e-06, 2.0790e-04, ..., 1.5152e-04, 4.8965e-05, 6.1765e-06]], device='cuda:0') Epoch 86, bias, value: tensor([-0.0046, -0.0288, 0.0091, -0.0172, 0.0134, 0.0049, 0.0188, -0.0067, -0.0277, -0.0049], device='cuda:0'), grad: tensor([ 5.1230e-05, 1.2624e-04, -2.4486e-04, -4.8494e-04, 1.4469e-05, 7.5400e-05, -7.7859e-06, -2.2855e-06, 6.0141e-05, 4.1246e-04], device='cuda:0') 100 0.0001 changing lr epoch 85, time 262.03, cls_loss 0.0086 cls_loss_mapping 0.0112 cls_loss_causal 0.6205 re_mapping 0.0108 re_causal 0.0326 /// teacc 98.74 lr 0.00010000 Epoch 87, weight, value: tensor([[ 0.0136, -0.1099, -0.0573, ..., -0.1048, -0.0577, -0.1085], [ 0.0169, -0.0307, 0.0051, ..., 0.0459, 0.0660, -0.0214], [-0.0521, 0.0631, -0.0896, ..., 0.0471, 0.0291, -0.0104], ..., [-0.0625, -0.0507, -0.0597, ..., -0.0016, -0.0694, 0.0856], [ 0.0628, -0.0086, 0.0260, ..., -0.0052, -0.0936, -0.0050], [-0.0971, -0.0081, 0.0011, ..., -0.1092, 0.0292, -0.0822]], device='cuda:0'), grad: tensor([[ 5.0897e-07, 3.3993e-07, 1.2759e-06, ..., 2.3022e-06, 3.7514e-06, 1.3039e-08], [-1.0990e-07, 3.5297e-07, -6.5714e-06, ..., -1.0133e-05, -8.8736e-06, 3.2596e-08], [ 4.7125e-07, -1.8366e-06, 3.2932e-06, ..., 3.1199e-06, 2.5555e-06, 1.6298e-08], ..., [ 1.0645e-06, 4.3912e-07, 2.5090e-06, ..., 3.5800e-06, 5.8226e-06, -4.2003e-07], [ 5.2946e-07, -1.1409e-06, -6.4299e-06, ..., -1.1079e-05, 1.9353e-06, 3.8184e-08], [ 4.2422e-07, 9.7416e-07, 4.7907e-06, ..., 8.8066e-06, 1.4961e-05, 1.6112e-07]], device='cuda:0') Epoch 87, bias, value: tensor([-0.0048, -0.0304, 0.0092, -0.0175, 0.0133, 0.0049, 0.0193, -0.0054, -0.0274, -0.0047], device='cuda:0'), grad: tensor([ 1.1571e-05, -1.5646e-05, 1.2144e-05, 2.7224e-05, -7.9930e-05, -3.2336e-05, 1.8671e-05, 1.7181e-05, -1.1966e-05, 5.3018e-05], device='cuda:0') 100 0.0001 changing lr epoch 86, time 262.79, cls_loss 0.0083 cls_loss_mapping 0.0106 cls_loss_causal 0.6494 re_mapping 0.0109 re_causal 0.0333 /// teacc 98.79 lr 0.00010000 Epoch 88, weight, value: tensor([[ 0.0133, -0.1102, -0.0578, ..., -0.1063, -0.0592, -0.1114], [ 0.0181, -0.0307, 0.0053, ..., 0.0460, 0.0664, -0.0221], [-0.0527, 0.0634, -0.0903, ..., 0.0469, 0.0292, -0.0109], ..., [-0.0625, -0.0508, -0.0593, ..., -0.0009, -0.0694, 0.0872], [ 0.0628, -0.0087, 0.0264, ..., -0.0054, -0.0946, -0.0053], [-0.0976, -0.0082, 0.0004, ..., -0.1103, 0.0296, -0.0837]], device='cuda:0'), grad: tensor([[-1.3271e-07, 1.4063e-07, 1.3607e-06, ..., 5.2080e-06, -5.1785e-04, -1.8203e-04], [ 1.6689e-06, 5.9092e-07, -1.3217e-05, ..., -2.4766e-05, -3.3081e-05, 4.7721e-06], [ 9.3598e-07, -1.8775e-06, 6.0424e-06, ..., 1.0028e-05, 1.0943e-04, 3.3587e-05], ..., [ 1.6391e-07, 3.8743e-07, 9.6560e-06, ..., 1.5944e-05, 3.2037e-05, -8.2795e-07], [-6.2585e-06, -4.1258e-07, -1.8224e-05, ..., -3.5018e-06, 4.0770e-05, 9.0897e-06], [ 9.5461e-08, 6.9384e-08, 6.3218e-06, ..., 1.2383e-05, 2.5168e-05, 8.3670e-06]], device='cuda:0') Epoch 88, bias, value: tensor([-0.0058, -0.0304, 0.0088, -0.0169, 0.0133, 0.0048, 0.0189, -0.0046, -0.0277, -0.0047], device='cuda:0'), grad: tensor([-1.0881e-03, -1.7077e-05, 2.3150e-04, 1.0437e-04, 2.0480e-04, -2.1625e-04, 5.1403e-04, 7.4983e-05, 1.1945e-04, 7.1883e-05], device='cuda:0') 100 0.0001 changing lr epoch 87, time 262.57, cls_loss 0.0082 cls_loss_mapping 0.0140 cls_loss_causal 0.6003 re_mapping 0.0112 re_causal 0.0323 /// teacc 98.70 lr 0.00010000 Epoch 89, weight, value: tensor([[ 0.0136, -0.1100, -0.0582, ..., -0.1081, -0.0603, -0.1122], [ 0.0185, -0.0300, 0.0056, ..., 0.0470, 0.0678, -0.0221], [-0.0532, 0.0637, -0.0909, ..., 0.0467, 0.0290, -0.0110], ..., [-0.0627, -0.0516, -0.0597, ..., -0.0014, -0.0711, 0.0875], [ 0.0630, -0.0088, 0.0268, ..., -0.0053, -0.0955, -0.0055], [-0.0985, -0.0083, -0.0005, ..., -0.1110, 0.0310, -0.0835]], device='cuda:0'), grad: tensor([[ 1.7323e-07, 9.5461e-08, 2.6776e-07, ..., 5.9931e-07, 4.4750e-07, -1.2107e-07], [ 8.3819e-09, 1.3066e-06, -7.7114e-07, ..., 4.8578e-06, 2.6897e-06, 2.2929e-06], [ 2.4820e-07, -2.4904e-06, 5.9092e-07, ..., -3.5018e-06, -2.5127e-06, -1.2852e-07], ..., [ 3.3295e-07, 2.2864e-07, 8.7032e-07, ..., -1.4938e-05, 2.0508e-06, -1.1377e-05], [ 1.4734e-06, 1.6345e-07, 2.6058e-06, ..., 6.3889e-06, 9.3132e-06, 2.3507e-06], [ 1.6401e-06, 5.9139e-08, 2.1998e-06, ..., 3.1255e-06, 2.8208e-05, 1.0524e-06]], device='cuda:0') Epoch 89, bias, value: tensor([-0.0063, -0.0297, 0.0086, -0.0172, 0.0125, 0.0054, 0.0183, -0.0054, -0.0281, -0.0033], device='cuda:0'), grad: tensor([-4.1723e-06, 9.0450e-06, 2.7418e-06, 3.0659e-06, -1.1492e-04, -5.5373e-05, 2.2173e-05, -1.1079e-05, 3.2753e-05, 1.1557e-04], device='cuda:0') 100 0.0001 changing lr epoch 88, time 262.58, cls_loss 0.0069 cls_loss_mapping 0.0109 cls_loss_causal 0.5865 re_mapping 0.0110 re_causal 0.0312 /// teacc 98.77 lr 0.00010000 Epoch 90, weight, value: tensor([[ 0.0137, -0.1104, -0.0586, ..., -0.1088, -0.0613, -0.1154], [ 0.0184, -0.0301, 0.0059, ..., 0.0466, 0.0677, -0.0225], [-0.0530, 0.0646, -0.0913, ..., 0.0474, 0.0304, -0.0112], ..., [-0.0627, -0.0520, -0.0598, ..., -0.0009, -0.0717, 0.0886], [ 0.0631, -0.0089, 0.0273, ..., -0.0051, -0.0962, -0.0059], [-0.0988, -0.0098, -0.0011, ..., -0.1125, 0.0314, -0.0826]], device='cuda:0'), grad: tensor([[ 1.5236e-06, 1.1092e-06, 2.5705e-06, ..., 8.2776e-06, 6.5088e-05, 1.2137e-05], [ 1.9204e-06, 1.5711e-06, -7.3537e-06, ..., 3.1851e-06, -5.9158e-06, 1.1772e-05], [ 1.3085e-06, -2.4447e-07, 3.4813e-06, ..., 5.9530e-06, 4.4107e-06, 6.6981e-06], ..., [-8.8140e-06, -6.2101e-06, 1.7975e-06, ..., -5.5969e-05, 1.5637e-06, -1.3137e-04], [ 2.7940e-07, 6.1607e-07, -8.2180e-06, ..., -1.2238e-06, 6.0089e-06, 8.1956e-06], [ 1.4696e-06, 1.0263e-06, 4.7274e-06, ..., 2.5392e-05, -7.5281e-05, 6.9439e-05]], device='cuda:0') Epoch 90, bias, value: tensor([-0.0068, -0.0302, 0.0093, -0.0178, 0.0125, 0.0058, 0.0181, -0.0050, -0.0281, -0.0030], device='cuda:0'), grad: tensor([ 1.5235e-04, 2.0996e-05, 3.0100e-05, 2.6718e-05, 9.3400e-05, 3.9905e-05, -1.0192e-04, -3.6311e-04, 1.4238e-05, 8.7023e-05], device='cuda:0') 100 0.0001 changing lr epoch 89, time 262.22, cls_loss 0.0053 cls_loss_mapping 0.0085 cls_loss_causal 0.5950 re_mapping 0.0106 re_causal 0.0315 /// teacc 98.71 lr 0.00010000 Epoch 91, weight, value: tensor([[ 0.0139, -0.1102, -0.0595, ..., -0.1095, -0.0612, -0.1155], [ 0.0192, -0.0301, 0.0065, ..., 0.0469, 0.0681, -0.0234], [-0.0533, 0.0652, -0.0918, ..., 0.0476, 0.0303, -0.0115], ..., [-0.0629, -0.0519, -0.0600, ..., -0.0004, -0.0714, 0.0901], [ 0.0630, -0.0096, 0.0280, ..., -0.0056, -0.0972, -0.0063], [-0.0991, -0.0101, -0.0020, ..., -0.1132, 0.0310, -0.0835]], device='cuda:0'), grad: tensor([[ 6.7055e-08, 3.3397e-06, 9.9372e-07, ..., 4.8801e-06, 4.2617e-06, 1.9092e-08], [ 1.1129e-07, 1.3663e-06, -2.9132e-05, ..., -2.5898e-05, -4.7594e-05, 1.3737e-07], [-3.6880e-07, -4.0084e-05, 3.1907e-06, ..., -4.4078e-05, -3.7193e-05, 4.5169e-08], ..., [ 3.9674e-07, 5.4985e-06, 3.0156e-06, ..., 8.8438e-06, 1.1273e-05, -1.8906e-07], [ 2.5146e-08, 3.3230e-06, -4.3437e-06, ..., 8.1956e-07, 2.0638e-05, 2.8405e-08], [ 8.0559e-08, 2.1746e-07, 5.6857e-07, ..., 7.7253e-07, 1.1045e-06, 3.2550e-07]], device='cuda:0') Epoch 91, bias, value: tensor([-0.0064, -0.0301, 0.0092, -0.0185, 0.0126, 0.0061, 0.0186, -0.0042, -0.0286, -0.0038], device='cuda:0'), grad: tensor([-2.0370e-05, -8.6546e-05, -9.4354e-05, 4.8906e-05, 2.0280e-05, 5.6684e-05, 7.9647e-06, 2.3142e-05, 3.6120e-05, 8.1509e-06], device='cuda:0') 100 0.0001 changing lr epoch 90, time 262.12, cls_loss 0.0069 cls_loss_mapping 0.0105 cls_loss_causal 0.6339 re_mapping 0.0104 re_causal 0.0320 /// teacc 98.83 lr 0.00010000 Epoch 92, weight, value: tensor([[ 1.3766e-02, -1.1022e-01, -6.0017e-02, ..., -1.1029e-01, -6.1421e-02, -1.1578e-01], [ 2.0084e-02, -3.0368e-02, 6.6888e-03, ..., 4.6752e-02, 6.8061e-02, -2.4182e-02], [-5.3488e-02, 6.5497e-02, -9.2620e-02, ..., 4.7774e-02, 3.1722e-02, -1.1159e-02], ..., [-6.3126e-02, -5.1962e-02, -5.9789e-02, ..., -1.1232e-04, -7.1858e-02, 9.1025e-02], [ 6.3028e-02, -9.6692e-03, 2.8991e-02, ..., -5.3622e-03, -9.8100e-02, -6.8899e-03], [-9.9297e-02, -1.0562e-02, -2.4371e-03, ..., -1.1417e-01, 3.0349e-02, -8.4403e-02]], device='cuda:0'), grad: tensor([[ 1.3504e-08, 6.7567e-07, 8.2189e-07, ..., 1.7779e-06, 2.2799e-06, 3.5111e-07], [ 4.1444e-08, 5.0059e-07, -1.0401e-05, ..., -9.4399e-06, -1.7196e-05, -5.7481e-06], [ 5.3085e-08, -4.4238e-07, 2.7604e-06, ..., 3.4757e-06, 5.8934e-06, 2.9914e-06], ..., [ 1.2340e-07, 8.6799e-07, 6.0759e-06, ..., 1.1604e-06, 1.6913e-05, -2.1718e-06], [ 1.2713e-07, 2.3618e-06, -9.7789e-08, ..., 3.8892e-06, 8.7172e-06, 7.9582e-07], [ 2.6543e-08, 1.1548e-06, -6.8285e-06, ..., 4.3921e-06, -3.8952e-05, 1.6605e-06]], device='cuda:0') Epoch 92, bias, value: tensor([-0.0061, -0.0303, 0.0093, -0.0183, 0.0130, 0.0059, 0.0182, -0.0040, -0.0280, -0.0046], device='cuda:0'), grad: tensor([ 3.9265e-06, -2.1070e-05, 1.7047e-05, 2.6643e-05, 7.7724e-05, -6.3360e-05, 1.4916e-05, 3.4332e-05, 2.6971e-05, -1.1688e-04], device='cuda:0') 100 0.0001 changing lr epoch 91, time 262.70, cls_loss 0.0057 cls_loss_mapping 0.0074 cls_loss_causal 0.5860 re_mapping 0.0107 re_causal 0.0320 /// teacc 98.72 lr 0.00010000 Epoch 93, weight, value: tensor([[ 0.0134, -0.1111, -0.0601, ..., -0.1113, -0.0613, -0.1160], [ 0.0201, -0.0309, 0.0075, ..., 0.0470, 0.0683, -0.0240], [-0.0551, 0.0651, -0.0935, ..., 0.0474, 0.0321, -0.0119], ..., [-0.0630, -0.0518, -0.0603, ..., -0.0004, -0.0721, 0.0912], [ 0.0631, -0.0098, 0.0292, ..., -0.0055, -0.0990, -0.0075], [-0.0997, -0.0107, -0.0032, ..., -0.1152, 0.0307, -0.0858]], device='cuda:0'), grad: tensor([[ 2.4494e-07, 5.0664e-06, 7.0082e-07, ..., 1.0632e-05, 7.5400e-06, 4.1211e-07], [-3.4086e-07, 2.1756e-05, -1.4745e-05, ..., 2.1815e-05, 1.0923e-05, -2.9691e-06], [-3.0864e-06, -2.5272e-04, 1.3141e-06, ..., -4.2772e-04, -2.5821e-04, -3.8594e-06], ..., [ 1.6056e-06, 8.2776e-06, 6.7130e-06, ..., 2.3693e-05, 1.4566e-05, 1.9465e-06], [ 2.3115e-06, 1.7536e-04, 3.2280e-06, ..., 2.8515e-04, 1.7810e-04, 1.1306e-06], [ 3.5390e-07, 1.8049e-06, 1.7332e-06, ..., 4.9211e-06, 4.0121e-06, 5.7742e-07]], device='cuda:0') Epoch 93, bias, value: tensor([-0.0060, -0.0301, 0.0085, -0.0170, 0.0125, 0.0054, 0.0188, -0.0041, -0.0283, -0.0046], device='cuda:0'), grad: tensor([ 1.8582e-05, 2.4289e-05, -5.9175e-04, 9.6083e-05, 3.0342e-06, 3.3200e-05, -3.4451e-05, 3.5465e-05, 4.0483e-04, 1.1049e-05], device='cuda:0') 100 0.0001 changing lr epoch 92, time 262.36, cls_loss 0.0054 cls_loss_mapping 0.0082 cls_loss_causal 0.5731 re_mapping 0.0108 re_causal 0.0299 /// teacc 98.69 lr 0.00010000 Epoch 94, weight, value: tensor([[ 0.0132, -0.1121, -0.0602, ..., -0.1124, -0.0613, -0.1162], [ 0.0201, -0.0309, 0.0081, ..., 0.0475, 0.0690, -0.0242], [-0.0549, 0.0659, -0.0944, ..., 0.0473, 0.0324, -0.0132], ..., [-0.0631, -0.0522, -0.0603, ..., -0.0002, -0.0726, 0.0924], [ 0.0632, -0.0098, 0.0298, ..., -0.0051, -0.0995, -0.0074], [-0.1000, -0.0124, -0.0037, ..., -0.1167, 0.0302, -0.0867]], device='cuda:0'), grad: tensor([[ 9.8255e-08, 2.6554e-05, 2.8461e-05, ..., 4.1366e-05, 3.1203e-05, 0.0000e+00], [ 9.1270e-08, 4.1677e-07, 1.3809e-03, ..., 1.1950e-03, 6.5613e-04, 0.0000e+00], [-3.8091e-07, -6.2943e-05, 2.9624e-05, ..., -1.1779e-05, -2.8580e-05, 0.0000e+00], ..., [ 5.2946e-07, 8.9034e-06, 1.1951e-05, ..., 1.9446e-05, 2.5019e-05, 0.0000e+00], [ 2.5239e-07, -9.2909e-06, -1.6518e-03, ..., -1.4362e-03, -6.7329e-04, 0.0000e+00], [ 2.8824e-07, 7.8529e-06, 2.8923e-05, ..., 3.1263e-05, -6.7472e-05, 0.0000e+00]], device='cuda:0') Epoch 94, bias, value: tensor([-0.0057, -0.0298, 0.0085, -0.0174, 0.0128, 0.0056, 0.0184, -0.0039, -0.0280, -0.0053], device='cuda:0'), grad: tensor([ 6.3479e-05, 2.5673e-03, -5.8115e-05, 1.7297e-04, -4.9144e-05, 1.5974e-04, -8.0541e-06, 7.6771e-05, -2.7752e-03, -1.5271e-04], device='cuda:0') 100 0.0001 changing lr epoch 93, time 262.37, cls_loss 0.0072 cls_loss_mapping 0.0113 cls_loss_causal 0.6233 re_mapping 0.0103 re_causal 0.0313 /// teacc 98.79 lr 0.00010000 Epoch 95, weight, value: tensor([[ 0.0131, -0.1121, -0.0609, ..., -0.1138, -0.0614, -0.1163], [ 0.0210, -0.0310, 0.0075, ..., 0.0466, 0.0700, -0.0239], [-0.0548, 0.0665, -0.0950, ..., 0.0477, 0.0328, -0.0133], ..., [-0.0640, -0.0526, -0.0594, ..., 0.0006, -0.0744, 0.0924], [ 0.0635, -0.0099, 0.0320, ..., -0.0042, -0.0989, -0.0076], [-0.1008, -0.0125, -0.0059, ..., -0.1170, 0.0299, -0.0863]], device='cuda:0'), grad: tensor([[ 4.9360e-08, 1.3970e-08, 5.6718e-07, ..., 1.0310e-06, -3.6266e-06, 1.4203e-07], [ 2.5332e-07, 2.7940e-08, -4.0457e-06, ..., -4.5188e-06, -1.0960e-05, 1.8580e-07], [ 1.1036e-07, -5.4436e-07, 1.6112e-06, ..., 1.3690e-06, 3.2317e-06, 8.9873e-08], ..., [ 6.3051e-07, 9.4529e-08, 3.6936e-06, ..., 6.8136e-06, 6.5416e-06, -6.2818e-07], [ 4.4843e-07, 1.6298e-07, -4.2528e-05, ..., -9.2089e-05, 3.3751e-06, 5.6345e-08], [-8.5821e-07, 1.5367e-08, 2.6841e-06, ..., 6.1393e-06, -4.7497e-06, 2.4354e-07]], device='cuda:0') Epoch 95, bias, value: tensor([-0.0064, -0.0298, 0.0085, -0.0179, 0.0137, 0.0057, 0.0184, -0.0041, -0.0267, -0.0059], device='cuda:0'), grad: tensor([-1.7270e-05, -8.4639e-06, 7.9647e-06, 3.3319e-05, 1.0626e-06, 1.4853e-04, 2.0593e-05, 2.0981e-05, -2.1672e-04, 9.9391e-06], device='cuda:0') 100 0.0001 changing lr epoch 94, time 262.11, cls_loss 0.0061 cls_loss_mapping 0.0105 cls_loss_causal 0.6150 re_mapping 0.0102 re_causal 0.0301 /// teacc 98.77 lr 0.00010000 Epoch 96, weight, value: tensor([[ 0.0128, -0.1122, -0.0613, ..., -0.1157, -0.0623, -0.1165], [ 0.0216, -0.0310, 0.0082, ..., 0.0466, 0.0702, -0.0244], [-0.0551, 0.0670, -0.0955, ..., 0.0485, 0.0339, -0.0133], ..., [-0.0638, -0.0530, -0.0597, ..., 0.0006, -0.0751, 0.0930], [ 0.0632, -0.0102, 0.0322, ..., -0.0045, -0.1002, -0.0078], [-0.1013, -0.0126, -0.0067, ..., -0.1179, 0.0304, -0.0866]], device='cuda:0'), grad: tensor([[-5.3585e-05, 3.0268e-08, 1.9029e-05, ..., 1.0833e-05, -5.3421e-06, 3.8650e-08], [ 8.8587e-06, 4.7497e-08, -1.1055e-06, ..., -1.3225e-06, -4.3064e-06, 1.1176e-07], [ 9.8497e-06, -7.8045e-07, 2.4363e-05, ..., 1.2554e-05, 1.9688e-06, 3.7393e-07], ..., [ 3.9823e-06, 1.0990e-07, 8.2180e-06, ..., 4.3362e-06, 3.4329e-06, -1.3020e-06], [ 8.4937e-06, 4.6473e-07, 2.7716e-05, ..., 1.8448e-05, 8.9332e-06, 7.9628e-08], [ 2.4363e-05, 1.6298e-08, 8.6606e-05, ..., 5.0068e-05, -2.1875e-05, 4.1537e-07]], device='cuda:0') Epoch 96, bias, value: tensor([-0.0070, -0.0297, 0.0095, -0.0181, 0.0138, 0.0062, 0.0180, -0.0042, -0.0274, -0.0057], device='cuda:0'), grad: tensor([-6.1393e-05, 1.7881e-05, 7.7546e-05, -4.5371e-04, 8.6188e-05, 4.9770e-05, -2.9817e-05, 2.6450e-05, 9.9719e-05, 1.8740e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 95---------------------------------------------------- epoch 95, time 278.80, cls_loss 0.0082 cls_loss_mapping 0.0111 cls_loss_causal 0.5944 re_mapping 0.0103 re_causal 0.0289 /// teacc 98.86 lr 0.00010000 Epoch 97, weight, value: tensor([[ 0.0128, -0.1125, -0.0623, ..., -0.1171, -0.0625, -0.1167], [ 0.0220, -0.0311, 0.0079, ..., 0.0464, 0.0705, -0.0247], [-0.0557, 0.0675, -0.0962, ..., 0.0487, 0.0343, -0.0135], ..., [-0.0639, -0.0532, -0.0597, ..., 0.0002, -0.0757, 0.0933], [ 0.0641, -0.0105, 0.0332, ..., -0.0034, -0.1011, -0.0058], [-0.1015, -0.0127, -0.0072, ..., -0.1183, 0.0305, -0.0866]], device='cuda:0'), grad: tensor([[ 5.8711e-05, 1.1288e-06, 7.5102e-05, ..., 4.9584e-06, 3.2540e-06, 6.7009e-07], [ 1.4402e-05, 2.2560e-05, 4.1753e-05, ..., 6.9141e-05, 7.3433e-05, 1.0312e-05], [ 5.2124e-05, -4.4137e-05, 3.4690e-05, ..., -1.0401e-04, -1.3053e-04, -1.5825e-05], ..., [ 4.1080e-04, 3.6843e-06, 2.8706e-04, ..., 8.6352e-06, 1.5467e-05, -9.8348e-06], [ 7.1704e-05, 1.4612e-06, -1.9386e-05, ..., -1.0842e-04, -2.8163e-05, 1.6754e-06], [ 9.5844e-05, 2.9290e-07, 5.9700e-04, ..., 4.1053e-06, 5.5313e-05, 5.7258e-06]], device='cuda:0') Epoch 97, bias, value: tensor([-0.0070, -0.0301, 0.0094, -0.0167, 0.0138, 0.0050, 0.0180, -0.0036, -0.0270, -0.0064], device='cuda:0'), grad: tensor([ 0.0004, 0.0002, 0.0002, -0.0012, 0.0006, -0.0052, 0.0003, 0.0026, 0.0004, 0.0016], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 96---------------------------------------------------- epoch 96, time 278.41, cls_loss 0.0054 cls_loss_mapping 0.0084 cls_loss_causal 0.6091 re_mapping 0.0099 re_causal 0.0298 /// teacc 98.92 lr 0.00010000 Epoch 98, weight, value: tensor([[ 0.0133, -0.1128, -0.0632, ..., -0.1181, -0.0627, -0.1168], [ 0.0229, -0.0312, 0.0087, ..., 0.0467, 0.0714, -0.0247], [-0.0554, 0.0680, -0.0967, ..., 0.0492, 0.0347, -0.0136], ..., [-0.0646, -0.0533, -0.0600, ..., 0.0003, -0.0766, 0.0936], [ 0.0640, -0.0110, 0.0330, ..., -0.0044, -0.1036, -0.0058], [-0.1011, -0.0127, -0.0068, ..., -0.1181, 0.0308, -0.0868]], device='cuda:0'), grad: tensor([[ 6.6590e-08, 5.3551e-08, 3.4180e-07, ..., 6.3237e-07, 5.3458e-06, 2.6869e-07], [ 4.0978e-08, 2.7707e-07, 7.1712e-07, ..., 5.0813e-06, 8.4564e-06, 3.6769e-06], [-5.7742e-08, -2.5965e-06, 2.0728e-05, ..., 2.1547e-05, 3.9116e-06, 1.7583e-06], ..., [ 8.6613e-08, 3.9814e-07, 3.4850e-06, ..., 2.3507e-06, 1.3024e-05, -4.3772e-06], [ 8.0047e-07, 5.7323e-07, -2.7031e-05, ..., -2.7537e-05, -2.5816e-06, 3.3230e-06], [ 2.9011e-07, 1.0245e-08, 1.5600e-06, ..., 4.0680e-06, 1.0222e-05, 2.7400e-06]], device='cuda:0') Epoch 98, bias, value: tensor([-0.0068, -0.0296, 0.0098, -0.0174, 0.0137, 0.0057, 0.0180, -0.0037, -0.0285, -0.0061], device='cuda:0'), grad: tensor([ 5.0455e-05, 3.2932e-05, 4.7594e-05, 4.8392e-06, -1.1551e-04, -3.0667e-05, -5.9605e-05, 3.1561e-05, -2.7448e-05, 6.5744e-05], device='cuda:0') 100 0.0001 changing lr epoch 97, time 262.30, cls_loss 0.0060 cls_loss_mapping 0.0083 cls_loss_causal 0.5627 re_mapping 0.0097 re_causal 0.0281 /// teacc 98.78 lr 0.00010000 Epoch 99, weight, value: tensor([[ 0.0142, -0.1129, -0.0637, ..., -0.1188, -0.0645, -0.1168], [ 0.0233, -0.0313, 0.0094, ..., 0.0473, 0.0721, -0.0246], [-0.0555, 0.0682, -0.0974, ..., 0.0492, 0.0349, -0.0142], ..., [-0.0648, -0.0533, -0.0603, ..., 0.0004, -0.0773, 0.0942], [ 0.0636, -0.0112, 0.0331, ..., -0.0047, -0.1045, -0.0058], [-0.1019, -0.0129, -0.0076, ..., -0.1188, 0.0322, -0.0873]], device='cuda:0'), grad: tensor([[ 6.4261e-08, 6.1002e-08, 2.3469e-07, ..., 4.0559e-07, 6.2538e-07, 8.1025e-08], [ 1.5274e-07, 1.5041e-07, -3.5502e-06, ..., -2.2203e-06, -6.0573e-06, 4.2096e-07], [-4.0699e-07, -6.6124e-07, 5.5274e-07, ..., -4.9360e-07, -1.8161e-06, 4.0513e-07], ..., [-3.5856e-08, -9.5461e-08, 1.5711e-06, ..., -1.6624e-06, 2.4084e-06, -2.5202e-06], [ 7.0315e-08, 1.5367e-07, -1.9129e-06, ..., -1.1204e-06, 3.0976e-06, 3.1199e-07], [ 1.3318e-07, 3.4925e-08, 2.2864e-07, ..., 5.7742e-07, 1.0254e-06, 4.6985e-07]], device='cuda:0') Epoch 99, bias, value: tensor([-0.0079, -0.0292, 0.0096, -0.0176, 0.0133, 0.0056, 0.0185, -0.0038, -0.0291, -0.0049], device='cuda:0'), grad: tensor([ 2.3982e-07, -6.1542e-06, -5.1968e-07, 4.4629e-06, -2.0694e-06, 7.2345e-06, -5.2452e-06, -2.3767e-06, 3.1888e-06, 1.2368e-06], device='cuda:0') 100 0.0001 changing lr epoch 98, time 262.07, cls_loss 0.0076 cls_loss_mapping 0.0099 cls_loss_causal 0.5878 re_mapping 0.0100 re_causal 0.0290 /// teacc 98.83 lr 0.00010000 Epoch 100, weight, value: tensor([[ 0.0156, -0.1134, -0.0643, ..., -0.1197, -0.0651, -0.1169], [ 0.0239, -0.0316, 0.0099, ..., 0.0475, 0.0727, -0.0247], [-0.0560, 0.0688, -0.0983, ..., 0.0490, 0.0353, -0.0148], ..., [-0.0640, -0.0535, -0.0609, ..., -0.0011, -0.0780, 0.0932], [ 0.0635, -0.0115, 0.0335, ..., -0.0045, -0.1056, -0.0056], [-0.1036, -0.0133, -0.0071, ..., -0.1181, 0.0318, -0.0858]], device='cuda:0'), grad: tensor([[ 3.6787e-07, 7.7765e-08, 2.5099e-07, ..., 5.2154e-07, -1.2136e-04, 8.4285e-08], [ 2.2165e-07, 1.1222e-07, -4.9314e-07, ..., 5.7090e-07, -4.0606e-07, 6.2538e-07], [ 3.0687e-07, -8.2003e-07, 9.5926e-07, ..., 5.2806e-07, 3.0501e-07, 3.3155e-07], ..., [-3.2457e-07, 8.9873e-08, 9.2667e-07, ..., -7.6182e-06, 1.6205e-06, -1.2897e-05], [ 2.1569e-06, 1.7229e-08, 6.4187e-06, ..., 1.2264e-05, 4.9770e-06, 6.2725e-07], [ 5.0897e-07, 8.1025e-08, 4.9174e-06, ..., 1.3314e-05, 1.1629e-04, 9.9167e-06]], device='cuda:0') Epoch 100, bias, value: tensor([-0.0082, -0.0289, 0.0092, -0.0168, 0.0138, 0.0048, 0.0185, -0.0042, -0.0297, -0.0044], device='cuda:0'), grad: tensor([-4.2701e-04, 5.5730e-06, 5.8338e-06, 5.4955e-05, 4.2021e-06, -1.7154e-04, 2.6494e-05, -2.2143e-05, 6.4850e-05, 4.5872e-04], device='cuda:0') 100 0.0001 changing lr epoch 99, time 262.23, cls_loss 0.0053 cls_loss_mapping 0.0078 cls_loss_causal 0.5646 re_mapping 0.0100 re_causal 0.0285 /// teacc 98.91 lr 0.00010000 Epoch 101, weight, value: tensor([[ 0.0158, -0.1135, -0.0644, ..., -0.1203, -0.0650, -0.1170], [ 0.0237, -0.0330, 0.0101, ..., 0.0477, 0.0727, -0.0251], [-0.0554, 0.0703, -0.0986, ..., 0.0502, 0.0367, -0.0150], ..., [-0.0641, -0.0539, -0.0611, ..., -0.0012, -0.0784, 0.0937], [ 0.0632, -0.0113, 0.0335, ..., -0.0044, -0.1051, -0.0057], [-0.1042, -0.0138, -0.0077, ..., -0.1189, 0.0313, -0.0861]], device='cuda:0'), grad: tensor([[ 1.5832e-08, 4.0047e-08, 1.0328e-06, ..., 7.9069e-07, 1.6931e-06, 2.8871e-08], [ 5.9605e-08, -3.4738e-07, -1.7866e-05, ..., -9.0972e-06, -3.7849e-05, 2.8051e-06], [ 1.9837e-07, -2.2259e-07, 4.0494e-06, ..., 3.2932e-06, 4.0308e-06, 2.8871e-07], ..., [ 1.1642e-07, 3.6415e-07, 1.9446e-06, ..., -2.5257e-06, 3.1330e-06, -4.1947e-06], [ 7.3202e-07, 4.0047e-08, 1.8761e-05, ..., 1.4551e-05, 1.7866e-05, 2.3749e-07], [ 9.4995e-08, 1.3970e-08, 1.5441e-06, ..., 1.5004e-06, -7.4971e-07, 1.6298e-07]], device='cuda:0') Epoch 101, bias, value: tensor([-0.0076, -0.0291, 0.0104, -0.0173, 0.0142, 0.0053, 0.0178, -0.0044, -0.0295, -0.0051], device='cuda:0'), grad: tensor([ 1.4221e-06, -4.3303e-05, 9.3132e-06, -1.6257e-05, 4.4368e-06, -2.8275e-06, 1.1414e-05, -3.2652e-06, 3.8534e-05, 5.4389e-07], device='cuda:0') 100 0.0001 changing lr epoch 100, time 262.57, cls_loss 0.0054 cls_loss_mapping 0.0087 cls_loss_causal 0.5706 re_mapping 0.0097 re_causal 0.0289 /// teacc 98.84 lr 0.00010000 Epoch 102, weight, value: tensor([[ 0.0157, -0.1140, -0.0653, ..., -0.1212, -0.0650, -0.1171], [ 0.0260, -0.0332, 0.0101, ..., 0.0473, 0.0745, -0.0245], [-0.0555, 0.0705, -0.0997, ..., 0.0500, 0.0365, -0.0156], ..., [-0.0662, -0.0541, -0.0611, ..., -0.0008, -0.0809, 0.0939], [ 0.0629, -0.0116, 0.0340, ..., -0.0045, -0.1054, -0.0056], [-0.1053, -0.0139, -0.0088, ..., -0.1194, 0.0313, -0.0865]], device='cuda:0'), grad: tensor([[-1.2387e-07, 3.4273e-07, 1.4696e-06, ..., 1.4026e-06, 1.3616e-06, 2.0489e-08], [-7.1414e-06, -4.9621e-05, -1.4818e-04, ..., -1.5938e-04, -1.6308e-04, 2.2911e-07], [ 5.0701e-06, 3.2246e-05, 9.4593e-05, ..., 1.0419e-04, 1.0371e-04, 1.2461e-06], ..., [ 1.6522e-06, 1.0356e-05, 3.1888e-05, ..., 3.1292e-05, 3.4958e-05, -2.5909e-06], [ 1.8999e-07, 4.1723e-07, 1.6363e-06, ..., 1.5944e-06, 3.3397e-06, 1.5832e-07], [ 5.3272e-07, 1.2480e-07, 1.4342e-06, ..., 1.8477e-06, -5.6159e-07, 6.1188e-07]], device='cuda:0') Epoch 102, bias, value: tensor([-0.0077, -0.0281, 0.0098, -0.0168, 0.0143, 0.0055, 0.0181, -0.0051, -0.0293, -0.0056], device='cuda:0'), grad: tensor([ 1.6596e-06, -2.9230e-04, 1.9205e-04, 2.2411e-05, 2.1420e-08, 8.7619e-06, 4.3325e-06, 5.8204e-05, 6.8583e-06, -1.9297e-06], device='cuda:0') 100 0.0001 changing lr epoch 101, time 262.01, cls_loss 0.0051 cls_loss_mapping 0.0071 cls_loss_causal 0.5731 re_mapping 0.0097 re_causal 0.0279 /// teacc 98.82 lr 0.00010000 Epoch 103, weight, value: tensor([[ 0.0157, -0.1148, -0.0662, ..., -0.1221, -0.0650, -0.1173], [ 0.0262, -0.0333, 0.0102, ..., 0.0473, 0.0744, -0.0249], [-0.0552, 0.0705, -0.1006, ..., 0.0500, 0.0366, -0.0160], ..., [-0.0662, -0.0541, -0.0610, ..., -0.0004, -0.0815, 0.0948], [ 0.0631, -0.0113, 0.0347, ..., -0.0047, -0.1062, -0.0067], [-0.1053, -0.0138, -0.0089, ..., -0.1193, 0.0319, -0.0865]], device='cuda:0'), grad: tensor([[ 2.4587e-07, 6.4727e-07, 4.3213e-07, ..., 1.2461e-06, 1.5182e-03, 5.9698e-07], [ 1.9036e-06, 1.3309e-06, 1.5080e-05, ..., 1.6401e-06, 6.6280e-05, 7.2181e-05], [ 3.0175e-07, -1.9878e-05, 5.7090e-07, ..., -2.6330e-05, -1.6078e-05, 5.7928e-07], ..., [-2.6710e-06, 4.6864e-06, 7.0930e-06, ..., 4.8205e-06, 4.5896e-05, 1.6868e-05], [ 2.3376e-07, 1.0803e-05, 1.4352e-06, ..., 1.5408e-05, 3.1918e-05, 1.8571e-06], [ 1.4519e-06, 2.2072e-07, 9.9540e-06, ..., 3.5483e-06, -1.5888e-03, 2.3618e-05]], device='cuda:0') Epoch 103, bias, value: tensor([-0.0077, -0.0286, 0.0093, -0.0173, 0.0150, 0.0057, 0.0182, -0.0050, -0.0294, -0.0052], device='cuda:0'), grad: tensor([ 2.2202e-03, 2.3687e-04, -3.5346e-05, -1.0282e-05, -2.7061e-04, 5.5999e-05, -5.5730e-05, 1.1176e-04, 9.5904e-05, -2.3479e-03], device='cuda:0') 100 0.0001 changing lr epoch 102, time 262.52, cls_loss 0.0058 cls_loss_mapping 0.0080 cls_loss_causal 0.6027 re_mapping 0.0095 re_causal 0.0274 /// teacc 98.78 lr 0.00010000 Epoch 104, weight, value: tensor([[ 0.0156, -0.1153, -0.0674, ..., -0.1236, -0.0654, -0.1174], [ 0.0263, -0.0346, 0.0102, ..., 0.0471, 0.0742, -0.0250], [-0.0555, 0.0727, -0.1009, ..., 0.0509, 0.0381, -0.0152], ..., [-0.0658, -0.0563, -0.0611, ..., -0.0008, -0.0828, 0.0947], [ 0.0630, -0.0113, 0.0349, ..., -0.0048, -0.1068, -0.0070], [-0.1055, -0.0142, -0.0089, ..., -0.1196, 0.0313, -0.0866]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 1.0058e-07, 1.7015e-06, ..., 1.9502e-06, 1.7388e-06, 2.7940e-09], [ 4.6566e-09, -6.3777e-05, -1.9538e-04, ..., -3.7909e-04, -4.5013e-04, 1.4901e-08], [ 7.4506e-09, 5.6773e-05, 1.2755e-04, ..., 2.8229e-04, 3.5739e-04, 8.3819e-09], ..., [-9.3132e-10, 5.2638e-06, 5.8860e-05, ..., 8.1956e-05, 7.3075e-05, -7.2643e-08], [ 2.7008e-08, 5.0291e-07, 7.7784e-06, ..., 9.2015e-06, 1.2204e-05, 4.6566e-09], [ 8.3819e-09, 1.2387e-07, 8.9779e-06, ..., 8.0243e-06, -1.2957e-05, 2.5146e-08]], device='cuda:0') Epoch 104, bias, value: tensor([-0.0081, -0.0288, 0.0104, -0.0174, 0.0166, 0.0062, 0.0186, -0.0055, -0.0301, -0.0060], device='cuda:0'), grad: tensor([ 7.8231e-07, -5.5599e-04, 4.2558e-04, -8.4817e-05, 3.4243e-05, 6.3062e-05, -1.3456e-05, 1.1736e-04, 4.2468e-05, -2.8491e-05], device='cuda:0') 100 0.0001 changing lr epoch 103, time 262.80, cls_loss 0.0045 cls_loss_mapping 0.0078 cls_loss_causal 0.5787 re_mapping 0.0103 re_causal 0.0280 /// teacc 98.88 lr 0.00010000 Epoch 105, weight, value: tensor([[ 0.0155, -0.1154, -0.0686, ..., -0.1252, -0.0657, -0.1175], [ 0.0264, -0.0351, 0.0106, ..., 0.0471, 0.0744, -0.0251], [-0.0559, 0.0736, -0.1015, ..., 0.0516, 0.0395, -0.0148], ..., [-0.0655, -0.0563, -0.0611, ..., -0.0008, -0.0834, 0.0954], [ 0.0631, -0.0117, 0.0351, ..., -0.0052, -0.1077, -0.0073], [-0.1060, -0.0146, -0.0092, ..., -0.1201, 0.0316, -0.0867]], device='cuda:0'), grad: tensor([[ 2.8405e-07, 4.1351e-07, 1.5264e-06, ..., 1.7220e-06, -5.2247e-07, 7.9162e-08], [ 4.0699e-07, 3.0268e-07, 4.0047e-06, ..., 3.4105e-06, 2.4550e-06, 1.8161e-07], [-1.1036e-06, -8.1509e-06, 1.1407e-05, ..., -2.0210e-06, -5.4240e-06, 9.9558e-07], ..., [ 1.2387e-06, 4.5784e-06, 1.9342e-05, ..., 1.9461e-05, 1.8805e-05, 1.3374e-06], [ 6.8638e-07, 1.2992e-06, 5.4799e-06, ..., 5.4613e-06, 4.0904e-06, 3.2503e-07], [ 9.8255e-07, 1.2387e-07, -3.5930e-06, ..., 7.0855e-06, -4.5925e-05, 4.6846e-07]], device='cuda:0') Epoch 105, bias, value: tensor([-0.0083, -0.0288, 0.0115, -0.0176, 0.0158, 0.0067, 0.0183, -0.0052, -0.0307, -0.0061], device='cuda:0'), grad: tensor([-9.5144e-06, 9.0003e-06, -3.7905e-06, -5.1069e-04, 8.8334e-05, 4.4894e-04, 2.6077e-06, 4.6164e-05, 1.3910e-05, -8.4400e-05], device='cuda:0') 100 0.0001 changing lr epoch 104, time 262.77, cls_loss 0.0051 cls_loss_mapping 0.0077 cls_loss_causal 0.5943 re_mapping 0.0092 re_causal 0.0275 /// teacc 98.90 lr 0.00010000 Epoch 106, weight, value: tensor([[ 0.0156, -0.1157, -0.0696, ..., -0.1258, -0.0674, -0.1176], [ 0.0265, -0.0353, 0.0107, ..., 0.0473, 0.0744, -0.0253], [-0.0561, 0.0740, -0.1020, ..., 0.0517, 0.0397, -0.0151], ..., [-0.0654, -0.0564, -0.0612, ..., -0.0007, -0.0840, 0.0960], [ 0.0631, -0.0119, 0.0357, ..., -0.0052, -0.1085, -0.0075], [-0.1062, -0.0148, -0.0092, ..., -0.1200, 0.0331, -0.0870]], device='cuda:0'), grad: tensor([[ 1.3318e-07, -5.4911e-06, 5.5972e-07, ..., 1.3448e-06, -2.6580e-06, 2.8312e-07], [ 2.8126e-07, 1.6084e-06, 3.9022e-07, ..., 4.1053e-06, 4.8243e-06, 1.3700e-06], [ 1.2433e-06, -1.1265e-05, -3.3844e-06, ..., -1.6898e-05, -3.1739e-05, 9.7416e-07], ..., [ 2.1234e-07, 1.0766e-06, -2.3007e-05, ..., -9.6440e-05, -5.8413e-05, -8.9705e-05], [ 3.0175e-07, 6.8434e-06, 6.6347e-06, ..., 1.9565e-05, 2.1651e-05, 9.1046e-06], [ 9.4250e-07, 3.0994e-06, 1.8314e-05, ..., 7.0810e-05, 4.8846e-05, 6.3896e-05]], device='cuda:0') Epoch 106, bias, value: tensor([-0.0096, -0.0290, 0.0113, -0.0179, 0.0158, 0.0066, 0.0187, -0.0051, -0.0306, -0.0052], device='cuda:0'), grad: tensor([-6.5982e-05, 1.7166e-05, -3.2902e-05, 2.5421e-05, 5.3346e-05, 3.7663e-06, 6.2957e-06, -5.0449e-04, 8.5652e-05, 4.1103e-04], device='cuda:0') 100 0.0001 changing lr epoch 105, time 262.68, cls_loss 0.0054 cls_loss_mapping 0.0108 cls_loss_causal 0.6164 re_mapping 0.0099 re_causal 0.0289 /// teacc 98.90 lr 0.00010000 Epoch 107, weight, value: tensor([[ 0.0155, -0.1162, -0.0711, ..., -0.1262, -0.0675, -0.1178], [ 0.0266, -0.0354, 0.0123, ..., 0.0477, 0.0755, -0.0257], [-0.0555, 0.0742, -0.1026, ..., 0.0516, 0.0395, -0.0153], ..., [-0.0652, -0.0565, -0.0611, ..., -0.0002, -0.0839, 0.0965], [ 0.0623, -0.0116, 0.0361, ..., -0.0049, -0.1101, -0.0076], [-0.1066, -0.0148, -0.0116, ..., -0.1220, 0.0328, -0.0873]], device='cuda:0'), grad: tensor([[ 5.7742e-08, -6.6217e-07, 4.3623e-06, ..., 4.0345e-06, 3.2634e-06, 3.1665e-07], [ 8.2515e-07, 7.9162e-08, -8.7595e-04, ..., -7.4911e-04, -8.5926e-04, 1.9558e-07], [ 5.0329e-06, -9.5647e-07, 1.5509e-04, ..., 1.2326e-04, 5.4240e-05, 2.5984e-07], ..., [ 2.1514e-07, 1.0803e-07, 5.2184e-05, ..., 4.0174e-05, 4.4078e-05, -2.3972e-06], [ 3.3155e-06, 7.5903e-07, 8.6451e-04, ..., 7.3004e-04, 7.7868e-04, -3.5483e-07], [ 1.1921e-07, 4.6566e-08, 1.1809e-05, ..., 1.0878e-05, 8.5458e-06, 8.8941e-07]], device='cuda:0') Epoch 107, bias, value: tensor([-0.0096, -0.0281, 0.0111, -0.0179, 0.0159, 0.0061, 0.0189, -0.0047, -0.0309, -0.0058], device='cuda:0'), grad: tensor([-6.0678e-05, -1.7900e-03, 2.7537e-04, -3.5977e-04, 9.2834e-06, -6.0908e-06, 2.5406e-06, 9.4712e-05, 1.8005e-03, 3.3706e-05], device='cuda:0') 100 0.0001 changing lr epoch 106, time 262.70, cls_loss 0.0067 cls_loss_mapping 0.0102 cls_loss_causal 0.6286 re_mapping 0.0094 re_causal 0.0274 /// teacc 98.82 lr 0.00010000 Epoch 108, weight, value: tensor([[ 0.0158, -0.1164, -0.0719, ..., -0.1271, -0.0678, -0.1180], [ 0.0266, -0.0356, 0.0134, ..., 0.0482, 0.0762, -0.0255], [-0.0559, 0.0744, -0.1036, ..., 0.0513, 0.0396, -0.0162], ..., [-0.0652, -0.0566, -0.0618, ..., -0.0003, -0.0849, 0.0967], [ 0.0623, -0.0118, 0.0360, ..., -0.0051, -0.1114, -0.0077], [-0.1069, -0.0149, -0.0113, ..., -0.1211, 0.0340, -0.0869]], device='cuda:0'), grad: tensor([[ 4.6566e-09, 9.9652e-08, 1.3039e-07, ..., 2.4401e-07, -2.1141e-06, 2.2352e-08], [ 7.0781e-08, 1.9558e-08, -9.5833e-07, ..., -4.3772e-07, -1.0263e-06, 1.5832e-07], [ 3.8184e-08, -1.4752e-06, 6.0163e-07, ..., -4.1313e-06, -4.4443e-06, 1.5181e-07], ..., [ 2.8871e-08, 2.0582e-07, 1.5143e-06, ..., 4.7032e-07, 3.0957e-06, -4.1071e-07], [-9.8906e-07, 6.4261e-08, -1.5758e-06, ..., -5.4017e-08, 1.7826e-06, 3.6135e-07], [ 1.4901e-08, 2.0489e-08, 6.3963e-06, ..., 1.1642e-06, 1.8597e-05, 2.5164e-06]], device='cuda:0') Epoch 108, bias, value: tensor([-0.0092, -0.0278, 0.0103, -0.0183, 0.0146, 0.0057, 0.0199, -0.0053, -0.0317, -0.0038], device='cuda:0'), grad: tensor([-6.8605e-05, 3.8333e-06, -4.4405e-06, 5.8450e-06, -6.2406e-05, -2.8864e-05, 1.3947e-05, 8.8438e-06, 6.6832e-06, 1.2505e-04], device='cuda:0') 100 0.0001 changing lr epoch 107, time 262.67, cls_loss 0.0049 cls_loss_mapping 0.0073 cls_loss_causal 0.5841 re_mapping 0.0093 re_causal 0.0265 /// teacc 98.90 lr 0.00010000 Epoch 109, weight, value: tensor([[ 0.0159, -0.1167, -0.0724, ..., -0.1278, -0.0672, -0.1182], [ 0.0265, -0.0363, 0.0132, ..., 0.0477, 0.0757, -0.0266], [-0.0556, 0.0753, -0.1035, ..., 0.0522, 0.0411, -0.0161], ..., [-0.0656, -0.0570, -0.0616, ..., -0.0002, -0.0853, 0.0975], [ 0.0624, -0.0118, 0.0367, ..., -0.0049, -0.1122, -0.0077], [-0.1072, -0.0151, -0.0117, ..., -0.1213, 0.0333, -0.0869]], device='cuda:0'), grad: tensor([[ 9.3132e-09, -4.4331e-07, 1.1092e-06, ..., 1.5702e-06, 2.5332e-06, -3.4831e-07], [ 3.8184e-08, 6.2659e-06, -6.0946e-06, ..., 2.6584e-05, 2.5466e-05, 8.8941e-07], [ 1.1455e-07, -9.0227e-06, 1.1511e-05, ..., -1.4730e-05, -3.5673e-05, 2.1048e-06], ..., [-2.0582e-07, 1.7881e-06, -3.8091e-07, ..., -2.6003e-05, 1.7896e-05, -2.5928e-05], [ 1.3597e-07, 6.4168e-07, 8.9332e-06, ..., 8.3148e-06, 1.1615e-05, 3.4459e-06], [ 3.7253e-08, 7.1712e-08, 6.6981e-06, ..., 1.4462e-05, 6.5118e-06, 1.3582e-05]], device='cuda:0') Epoch 109, bias, value: tensor([-0.0076, -0.0284, 0.0112, -0.0185, 0.0150, 0.0058, 0.0194, -0.0052, -0.0316, -0.0048], device='cuda:0'), grad: tensor([-8.4490e-06, 4.4197e-05, 1.5214e-05, 8.2105e-06, 6.8247e-05, -4.1246e-04, 1.2708e-04, -4.8995e-05, 1.1569e-04, 9.1493e-05], device='cuda:0') 100 0.0001 changing lr epoch 108, time 262.62, cls_loss 0.0044 cls_loss_mapping 0.0070 cls_loss_causal 0.5666 re_mapping 0.0092 re_causal 0.0273 /// teacc 98.76 lr 0.00010000 Epoch 110, weight, value: tensor([[ 1.6203e-02, -1.1709e-01, -7.1949e-02, ..., -1.2855e-01, -6.7447e-02, -1.1835e-01], [ 2.6430e-02, -3.6436e-02, 1.3321e-02, ..., 4.7838e-02, 7.5649e-02, -2.6720e-02], [-5.6039e-02, 7.5361e-02, -1.0440e-01, ..., 5.2119e-02, 4.1277e-02, -1.6652e-02], ..., [-6.5382e-02, -5.6908e-02, -6.1856e-02, ..., -7.2966e-05, -8.5671e-02, 9.8264e-02], [ 6.2353e-02, -1.1939e-02, 3.6987e-02, ..., -4.7849e-03, -1.1255e-01, -7.7472e-03], [-1.0759e-01, -1.5421e-02, -1.2209e-02, ..., -1.2193e-01, 3.3372e-02, -8.7601e-02]], device='cuda:0'), grad: tensor([[ 2.2817e-06, 3.0082e-07, 1.2159e-05, ..., 9.3579e-06, 7.7933e-06, 9.3132e-09], [ 1.5721e-06, 4.2282e-07, 6.0238e-06, ..., 8.1435e-06, 8.1733e-06, 8.2888e-08], [ 2.4531e-06, -2.6319e-06, 8.3223e-06, ..., 1.0416e-05, 9.2909e-06, 3.6322e-08], ..., [ 9.4436e-07, 5.2527e-07, 2.8312e-06, ..., 4.5858e-06, 6.5751e-06, -5.2527e-07], [-3.2783e-07, 1.8068e-07, -3.3319e-05, ..., -1.9506e-05, 3.5204e-06, 2.2352e-08], [-2.0504e-05, 6.5286e-07, -3.6567e-05, ..., -8.7500e-05, -1.1933e-04, 2.3190e-07]], device='cuda:0') Epoch 110, bias, value: tensor([-0.0073, -0.0286, 0.0107, -0.0199, 0.0149, 0.0076, 0.0198, -0.0049, -0.0317, -0.0052], device='cuda:0'), grad: tensor([ 5.3614e-05, 3.5703e-05, 4.8667e-05, 2.0266e-04, 2.4930e-05, 1.0289e-05, 6.2466e-05, 2.0862e-05, -7.1287e-05, -3.8815e-04], device='cuda:0') 100 0.0001 changing lr epoch 109, time 262.82, cls_loss 0.0042 cls_loss_mapping 0.0062 cls_loss_causal 0.5934 re_mapping 0.0091 re_causal 0.0271 /// teacc 98.85 lr 0.00010000 Epoch 111, weight, value: tensor([[ 0.0161, -0.1179, -0.0728, ..., -0.1308, -0.0674, -0.1184], [ 0.0269, -0.0371, 0.0126, ..., 0.0469, 0.0753, -0.0283], [-0.0559, 0.0764, -0.1055, ..., 0.0525, 0.0420, -0.0167], ..., [-0.0660, -0.0574, -0.0610, ..., 0.0005, -0.0857, 0.0995], [ 0.0624, -0.0121, 0.0374, ..., -0.0044, -0.1131, -0.0078], [-0.1076, -0.0157, -0.0126, ..., -0.1220, 0.0331, -0.0881]], device='cuda:0'), grad: tensor([[ 4.9360e-07, 2.9486e-06, 2.7940e-05, ..., 1.2517e-05, 2.2709e-05, 4.3772e-08], [ 1.0030e-06, 9.1735e-07, -1.2910e-04, ..., -4.7237e-05, -9.8944e-05, 1.7602e-07], [ 2.6450e-07, -3.2043e-04, -2.2364e-04, ..., -4.1199e-04, -3.6478e-04, 5.4669e-07], ..., [ 5.0291e-07, 2.5518e-07, 1.3351e-05, ..., 6.9141e-06, 1.1772e-05, -1.8468e-06], [ 6.5193e-07, 3.0851e-04, 2.9325e-04, ..., 4.2462e-04, 4.0865e-04, 9.9652e-08], [ 8.2608e-07, 3.4180e-06, 4.5747e-06, ..., 5.8785e-06, 5.1409e-06, 2.7381e-07]], device='cuda:0') Epoch 111, bias, value: tensor([-0.0072, -0.0296, 0.0111, -0.0198, 0.0151, 0.0075, 0.0202, -0.0044, -0.0314, -0.0056], device='cuda:0'), grad: tensor([ 1.8721e-03, -3.3951e-04, -9.1743e-04, 1.7989e-04, 2.7016e-05, -2.6360e-03, 1.9395e-04, 1.5152e-04, 1.1816e-03, 2.9016e-04], device='cuda:0') 100 0.0001 changing lr epoch 110, time 262.55, cls_loss 0.0039 cls_loss_mapping 0.0060 cls_loss_causal 0.5753 re_mapping 0.0090 re_causal 0.0269 /// teacc 98.90 lr 0.00010000 Epoch 112, weight, value: tensor([[ 0.0161, -0.1184, -0.0747, ..., -0.1317, -0.0675, -0.1185], [ 0.0269, -0.0373, 0.0134, ..., 0.0474, 0.0759, -0.0285], [-0.0553, 0.0767, -0.1060, ..., 0.0522, 0.0420, -0.0191], ..., [-0.0671, -0.0570, -0.0619, ..., 0.0006, -0.0860, 0.1011], [ 0.0626, -0.0123, 0.0375, ..., -0.0045, -0.1140, -0.0078], [-0.1077, -0.0161, -0.0127, ..., -0.1221, 0.0332, -0.0883]], device='cuda:0'), grad: tensor([[-4.9919e-07, 4.4703e-08, 6.3796e-07, ..., 7.8976e-07, -3.9116e-06, 4.8615e-07], [ 2.6077e-08, 2.2352e-07, 5.0396e-05, ..., 5.4806e-05, 1.5154e-05, 3.6150e-05], [ 3.6322e-08, -5.5879e-08, 8.2478e-06, ..., 8.9929e-06, 2.1756e-06, 6.8918e-06], ..., [ 8.9407e-08, -1.4920e-06, -2.2221e-04, ..., -2.4176e-04, -6.6221e-05, -1.6284e-04], [ 2.1886e-07, 2.9802e-07, 8.3297e-06, ..., 1.0274e-05, 3.3304e-06, 7.7412e-06], [ 9.1363e-07, 3.7067e-07, 9.3341e-05, ..., 9.9003e-05, 2.8789e-05, 6.4731e-05]], device='cuda:0') Epoch 112, bias, value: tensor([-0.0076, -0.0290, 0.0101, -0.0197, 0.0152, 0.0077, 0.0200, -0.0041, -0.0316, -0.0056], device='cuda:0'), grad: tensor([-1.3523e-05, 1.4579e-04, 2.7403e-05, 2.1982e-04, 1.4022e-05, -5.6088e-05, 2.7083e-06, -6.4373e-04, 2.9325e-05, 2.7418e-04], device='cuda:0') 100 0.0001 changing lr epoch 111, time 262.24, cls_loss 0.0051 cls_loss_mapping 0.0068 cls_loss_causal 0.5767 re_mapping 0.0093 re_causal 0.0271 /// teacc 98.78 lr 0.00010000 Epoch 113, weight, value: tensor([[ 0.0164, -0.1189, -0.0747, ..., -0.1327, -0.0669, -0.1191], [ 0.0264, -0.0388, 0.0147, ..., 0.0488, 0.0760, -0.0263], [-0.0550, 0.0776, -0.1052, ..., 0.0523, 0.0433, -0.0193], ..., [-0.0674, -0.0569, -0.0641, ..., -0.0007, -0.0878, 0.0998], [ 0.0627, -0.0127, 0.0380, ..., -0.0044, -0.1151, -0.0072], [-0.1084, -0.0166, -0.0131, ..., -0.1226, 0.0335, -0.0888]], device='cuda:0'), grad: tensor([[-2.0303e-07, 9.9652e-07, 4.3735e-06, ..., 2.6580e-06, 5.2564e-06, 1.5087e-07], [ 2.0489e-08, 1.3346e-06, -8.4102e-05, ..., -1.0163e-04, -1.2791e-04, 5.3737e-07], [ 5.4017e-08, -8.7172e-06, 3.9250e-05, ..., 3.1710e-05, 4.1366e-05, 4.6846e-07], ..., [ 1.3970e-08, 2.1011e-06, 6.4299e-06, ..., 9.0525e-06, 1.5065e-05, -2.5630e-06], [ 2.9802e-08, 9.3598e-07, 9.7007e-06, ..., 1.0282e-05, 1.4991e-05, 3.6787e-07], [ 6.0536e-08, 3.7067e-07, 8.1137e-06, ..., 1.0297e-05, 9.3132e-06, 5.1968e-07]], device='cuda:0') Epoch 113, bias, value: tensor([-0.0067, -0.0283, 0.0104, -0.0196, 0.0150, 0.0078, 0.0194, -0.0052, -0.0316, -0.0056], device='cuda:0'), grad: tensor([ 1.9707e-06, -1.6892e-04, 5.2303e-05, 3.8743e-05, 1.5028e-05, -1.0878e-05, 9.1642e-06, 1.6659e-05, 3.3915e-05, 1.2383e-05], device='cuda:0') 100 0.0001 changing lr epoch 112, time 262.63, cls_loss 0.0060 cls_loss_mapping 0.0089 cls_loss_causal 0.5642 re_mapping 0.0092 re_causal 0.0263 /// teacc 98.68 lr 0.00010000 Epoch 114, weight, value: tensor([[ 0.0165, -0.1191, -0.0750, ..., -0.1336, -0.0671, -0.1194], [ 0.0265, -0.0389, 0.0151, ..., 0.0488, 0.0759, -0.0261], [-0.0546, 0.0782, -0.1060, ..., 0.0521, 0.0436, -0.0200], ..., [-0.0666, -0.0573, -0.0643, ..., -0.0008, -0.0885, 0.1004], [ 0.0627, -0.0130, 0.0377, ..., -0.0048, -0.1171, -0.0073], [-0.1096, -0.0168, -0.0137, ..., -0.1219, 0.0343, -0.0885]], device='cuda:0'), grad: tensor([[ 5.9605e-08, 8.6613e-08, 5.5507e-07, ..., 7.4040e-07, 8.1882e-06, 4.1910e-08], [ 8.1956e-08, 3.5390e-07, 8.4098e-07, ..., 2.0880e-06, 1.4706e-06, 4.6846e-07], [ 3.0547e-07, -3.4738e-06, 1.6578e-06, ..., -1.9539e-06, 1.0267e-05, 2.2911e-07], ..., [ 2.6077e-07, 3.0082e-07, 2.0713e-06, ..., -1.6885e-06, 2.8573e-06, -4.3362e-06], [ 5.5600e-07, 1.8133e-06, 2.8014e-06, ..., 6.3218e-06, 5.3346e-06, 1.6764e-07], [ 1.6019e-07, 2.8871e-08, 1.5274e-06, ..., 3.1423e-06, -1.3828e-05, 1.4901e-06]], device='cuda:0') Epoch 114, bias, value: tensor([-0.0066, -0.0286, 0.0101, -0.0195, 0.0134, 0.0082, 0.0201, -0.0048, -0.0328, -0.0046], device='cuda:0'), grad: tensor([ 2.7139e-06, 8.0690e-06, 1.7956e-05, -7.5139e-06, -5.3421e-06, -2.6867e-05, 1.1183e-05, -5.2620e-07, 2.2873e-05, -2.2620e-05], device='cuda:0') 100 0.0001 changing lr epoch 113, time 262.32, cls_loss 0.0069 cls_loss_mapping 0.0081 cls_loss_causal 0.6039 re_mapping 0.0093 re_causal 0.0267 /// teacc 98.89 lr 0.00010000 Epoch 115, weight, value: tensor([[ 1.6279e-02, -1.2023e-01, -7.5680e-02, ..., -1.3473e-01, -6.7798e-02, -1.1952e-01], [ 2.6654e-02, -3.8665e-02, 1.4047e-02, ..., 4.7737e-02, 7.5665e-02, -2.6870e-02], [-5.4584e-02, 7.8155e-02, -1.0695e-01, ..., 5.2231e-02, 4.3406e-02, -1.9961e-02], ..., [-6.6596e-02, -5.7833e-02, -6.3101e-02, ..., 1.1827e-04, -8.8103e-02, 1.0358e-01], [ 6.2843e-02, -1.2553e-02, 3.7690e-02, ..., -4.8990e-03, -1.1811e-01, -7.4039e-03], [-1.0964e-01, -1.6972e-02, -1.3936e-02, ..., -1.2239e-01, 3.5119e-02, -8.9203e-02]], device='cuda:0'), grad: tensor([[ 2.0582e-07, 5.2620e-07, 6.0350e-07, ..., 1.0477e-06, 2.2408e-06, 2.3283e-08], [-9.2268e-05, -4.4048e-05, -8.0407e-05, ..., -2.1684e-04, -2.4915e-04, 1.8347e-07], [ 6.0588e-05, 2.1726e-05, 5.2392e-05, ..., 1.3506e-04, 1.5140e-04, 3.9861e-07], ..., [ 2.5034e-05, 1.2569e-05, 2.3782e-05, ..., 5.8889e-05, 6.8545e-05, -1.8319e-06], [ 1.5646e-06, 4.2729e-06, -8.5402e-07, ..., 7.9423e-06, 1.0446e-05, 3.8557e-07], [ 2.1141e-07, 6.4541e-07, 1.9316e-06, ..., 2.2035e-06, 7.2643e-07, 5.4389e-07]], device='cuda:0') Epoch 115, bias, value: tensor([-0.0064, -0.0295, 0.0095, -0.0189, 0.0115, 0.0083, 0.0193, -0.0028, -0.0328, -0.0043], device='cuda:0'), grad: tensor([ 1.9027e-06, -2.9969e-04, 1.8430e-04, 1.0198e-06, -1.3625e-06, 1.0036e-05, 4.4554e-06, 8.2016e-05, 1.1921e-05, 5.3495e-06], device='cuda:0') 100 0.0001 changing lr epoch 114, time 262.76, cls_loss 0.0050 cls_loss_mapping 0.0067 cls_loss_causal 0.5725 re_mapping 0.0087 re_causal 0.0267 /// teacc 98.75 lr 0.00010000 Epoch 116, weight, value: tensor([[ 0.0165, -0.1233, -0.0736, ..., -0.1355, -0.0673, -0.1198], [ 0.0267, -0.0389, 0.0140, ..., 0.0474, 0.0754, -0.0271], [-0.0547, 0.0783, -0.1073, ..., 0.0522, 0.0435, -0.0204], ..., [-0.0666, -0.0579, -0.0633, ..., -0.0002, -0.0885, 0.1018], [ 0.0629, -0.0112, 0.0385, ..., -0.0045, -0.1182, -0.0078], [-0.1098, -0.0173, -0.0144, ..., -0.1215, 0.0357, -0.0867]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 2.0117e-07, ..., 4.5262e-07, 6.6102e-05, 3.0547e-07], [ 2.6077e-08, 0.0000e+00, -2.8256e-06, ..., -8.2795e-07, -3.4925e-06, 2.4457e-06], [ 1.7695e-08, 9.3132e-10, 7.4785e-07, ..., 4.2021e-06, 4.7356e-05, 2.5425e-06], ..., [ 1.2107e-08, 0.0000e+00, 1.0841e-06, ..., 2.0545e-06, 2.5585e-05, 4.8578e-05], [ 1.2107e-08, -9.3132e-10, -4.5542e-07, ..., 4.0885e-07, 4.6778e-04, 6.5845e-07], [ 1.0245e-08, 0.0000e+00, 7.1805e-07, ..., -9.9614e-06, -8.1491e-04, -6.0171e-05]], device='cuda:0') Epoch 116, bias, value: tensor([-0.0057, -0.0300, 0.0092, -0.0186, 0.0113, 0.0082, 0.0190, -0.0038, -0.0324, -0.0031], device='cuda:0'), grad: tensor([ 1.7643e-04, 5.4538e-06, 1.3351e-04, 2.6032e-05, 1.0848e-04, 4.0221e-04, 4.3035e-05, 2.1958e-04, 1.2589e-03, -2.3727e-03], device='cuda:0') 100 0.0001 changing lr epoch 115, time 262.67, cls_loss 0.0041 cls_loss_mapping 0.0084 cls_loss_causal 0.5411 re_mapping 0.0093 re_causal 0.0263 /// teacc 98.86 lr 0.00010000 Epoch 117, weight, value: tensor([[ 1.5501e-02, -1.2369e-01, -7.4265e-02, ..., -1.3655e-01, -6.7484e-02, -1.2011e-01], [ 2.7600e-02, -3.8894e-02, 1.4475e-02, ..., 4.7533e-02, 7.6454e-02, -2.6830e-02], [-5.4890e-02, 7.9197e-02, -1.0808e-01, ..., 5.2533e-02, 4.3578e-02, -2.0176e-02], ..., [-6.7356e-02, -5.8163e-02, -6.3525e-02, ..., -6.7993e-05, -8.9330e-02, 1.0205e-01], [ 6.4459e-02, -1.2048e-02, 3.9543e-02, ..., -4.5599e-03, -1.1873e-01, -8.5964e-03], [-1.1071e-01, -1.7562e-02, -1.5174e-02, ..., -1.2203e-01, 3.5619e-02, -8.6844e-02]], device='cuda:0'), grad: tensor([[-4.3400e-07, 2.8163e-06, -3.2522e-06, ..., 5.5321e-06, 1.9018e-06, 3.0641e-07], [ 4.3176e-06, 7.3425e-06, 3.3882e-06, ..., 2.1324e-05, 1.3120e-05, 2.2113e-05], [-1.1474e-04, -1.4007e-04, -1.1456e-04, ..., -4.8542e-04, -2.1672e-04, 4.4890e-07], ..., [ 4.1164e-07, 3.8184e-06, -5.4110e-07, ..., -1.3150e-05, 7.2867e-06, -2.3261e-05], [ 1.0735e-04, 1.0407e-04, 1.0937e-04, ..., 4.2748e-04, 1.6594e-04, 4.6194e-07], [ 1.0133e-06, 9.9614e-06, 1.4491e-06, ..., 2.4617e-05, 1.1943e-05, -4.2431e-06]], device='cuda:0') Epoch 117, bias, value: tensor([-0.0064, -0.0293, 0.0091, -0.0190, 0.0118, 0.0079, 0.0194, -0.0039, -0.0316, -0.0034], device='cuda:0'), grad: tensor([-1.1832e-05, 7.6413e-05, -1.0090e-03, 5.0902e-05, -3.0696e-06, -1.4491e-05, 1.0915e-05, 2.9802e-06, 8.9979e-04, -1.5302e-06], device='cuda:0') 100 0.0001 changing lr epoch 116, time 262.56, cls_loss 0.0043 cls_loss_mapping 0.0065 cls_loss_causal 0.5482 re_mapping 0.0088 re_causal 0.0264 /// teacc 98.74 lr 0.00010000 Epoch 118, weight, value: tensor([[ 1.5297e-02, -1.2388e-01, -7.4985e-02, ..., -1.3730e-01, -6.7543e-02, -1.2019e-01], [ 2.7546e-02, -3.8949e-02, 1.4371e-02, ..., 4.7277e-02, 7.6496e-02, -2.7339e-02], [-5.3680e-02, 7.9598e-02, -1.0889e-01, ..., 5.2872e-02, 4.3715e-02, -2.0307e-02], ..., [-6.7722e-02, -5.8542e-02, -6.3357e-02, ..., 5.5455e-05, -8.9460e-02, 1.0228e-01], [ 6.4626e-02, -1.1799e-02, 4.0416e-02, ..., -4.3290e-03, -1.1950e-01, -8.8830e-03], [-1.1105e-01, -1.7790e-02, -1.5079e-02, ..., -1.2232e-01, 3.4556e-02, -8.7034e-02]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-09, 1.6671e-07, ..., 1.6484e-07, 2.6263e-07, 9.3132e-10], [ 0.0000e+00, 1.4901e-08, -4.8727e-06, ..., -3.4645e-06, -8.3968e-06, 4.9360e-08], [ 3.7253e-09, -9.1270e-08, 5.9977e-07, ..., 4.6100e-07, 2.0675e-07, 3.2596e-08], ..., [ 9.3132e-10, 5.9605e-08, 7.0967e-06, ..., 5.5768e-06, 8.2403e-06, -8.2329e-07], [ 9.3132e-10, 3.7253e-09, -5.3421e-06, ..., -5.5470e-06, 1.5832e-06, 2.3283e-08], [ 9.3132e-10, 9.3132e-10, 2.5630e-06, ..., 3.0380e-06, 3.8445e-05, 6.2492e-07]], device='cuda:0') Epoch 118, bias, value: tensor([-0.0064, -0.0298, 0.0091, -0.0192, 0.0133, 0.0073, 0.0198, -0.0038, -0.0312, -0.0043], device='cuda:0'), grad: tensor([ 5.1782e-07, -1.2487e-05, 1.6596e-06, -5.1968e-07, -1.4269e-04, 5.9698e-07, 6.2678e-07, 2.4781e-05, -1.5661e-05, 1.4329e-04], device='cuda:0') 100 0.0001 changing lr epoch 117, time 262.86, cls_loss 0.0043 cls_loss_mapping 0.0057 cls_loss_causal 0.5525 re_mapping 0.0090 re_causal 0.0266 /// teacc 98.87 lr 0.00010000 Epoch 119, weight, value: tensor([[ 0.0153, -0.1240, -0.0753, ..., -0.1385, -0.0678, -0.1209], [ 0.0276, -0.0393, 0.0146, ..., 0.0469, 0.0764, -0.0279], [-0.0537, 0.0799, -0.1095, ..., 0.0521, 0.0433, -0.0206], ..., [-0.0677, -0.0585, -0.0636, ..., 0.0008, -0.0892, 0.1026], [ 0.0647, -0.0119, 0.0406, ..., -0.0044, -0.1200, -0.0092], [-0.1110, -0.0179, -0.0153, ..., -0.1220, 0.0349, -0.0870]], device='cuda:0'), grad: tensor([[ 8.0094e-08, 6.6124e-08, 1.8254e-07, ..., 1.0049e-06, 3.5483e-07, 2.1718e-06], [ 2.8498e-07, 2.3935e-06, -3.2261e-06, ..., 5.8934e-06, 8.9873e-07, 8.3968e-06], [ 4.8429e-08, -5.5730e-05, 8.2515e-07, ..., -2.2352e-04, -1.4579e-04, -6.8665e-05], ..., [-1.1194e-06, 5.2840e-05, 1.2992e-06, ..., 1.8287e-04, 1.4400e-04, -2.3961e-05], [ 6.5193e-08, 1.0710e-07, 1.3784e-06, ..., 3.1423e-06, 1.0747e-06, 2.7344e-06], [ 6.7428e-07, 4.0047e-08, 2.0117e-06, ..., 5.5730e-06, 4.8615e-07, 9.4101e-06]], device='cuda:0') Epoch 119, bias, value: tensor([-0.0065, -0.0302, 0.0084, -0.0192, 0.0131, 0.0076, 0.0199, -0.0032, -0.0314, -0.0040], device='cuda:0'), grad: tensor([ 5.1111e-06, 1.1943e-05, -2.8992e-04, 4.1962e-05, 9.3281e-05, -2.7269e-05, 2.4587e-07, 1.2636e-04, 1.2904e-05, 2.5585e-05], device='cuda:0') 100 0.0001 changing lr epoch 118, time 262.88, cls_loss 0.0039 cls_loss_mapping 0.0067 cls_loss_causal 0.5704 re_mapping 0.0091 re_causal 0.0260 /// teacc 98.83 lr 0.00010000 Epoch 120, weight, value: tensor([[ 0.0152, -0.1247, -0.0749, ..., -0.1390, -0.0677, -0.1212], [ 0.0280, -0.0393, 0.0148, ..., 0.0466, 0.0766, -0.0285], [-0.0539, 0.0801, -0.1098, ..., 0.0520, 0.0435, -0.0208], ..., [-0.0680, -0.0584, -0.0637, ..., 0.0012, -0.0896, 0.1033], [ 0.0647, -0.0119, 0.0410, ..., -0.0043, -0.1206, -0.0093], [-0.1112, -0.0180, -0.0158, ..., -0.1227, 0.0349, -0.0874]], device='cuda:0'), grad: tensor([[ 1.8626e-08, 4.7497e-08, 5.0440e-06, ..., 2.3767e-06, 3.2410e-07, 2.4121e-07], [ 2.2724e-07, 4.9360e-08, -3.9116e-08, ..., 4.1813e-05, 1.1876e-05, 2.2203e-05], [ 6.7987e-08, -3.8333e-06, 2.1886e-06, ..., 2.1338e-04, 6.9499e-05, 1.1361e-04], ..., [ 4.6566e-07, 1.9465e-06, 3.4645e-07, ..., -2.7823e-04, -7.0512e-05, -1.4997e-04], [ 1.3039e-08, 7.7300e-08, -2.6613e-05, ..., -1.0528e-05, 4.0606e-07, 6.8638e-07], [ 1.1548e-07, 8.1956e-08, 5.9418e-07, ..., 3.1218e-06, 2.1622e-05, 2.0340e-06]], device='cuda:0') Epoch 120, bias, value: tensor([-0.0063, -0.0305, 0.0081, -0.0191, 0.0131, 0.0072, 0.0201, -0.0027, -0.0316, -0.0042], device='cuda:0'), grad: tensor([-9.4250e-06, 7.3433e-05, 3.6597e-04, 4.1336e-05, -1.6761e-04, -7.7039e-06, 2.1219e-05, -4.4012e-04, -5.8353e-05, 1.8096e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 119---------------------------------------------------- epoch 119, time 279.51, cls_loss 0.0038 cls_loss_mapping 0.0056 cls_loss_causal 0.5365 re_mapping 0.0088 re_causal 0.0250 /// teacc 98.98 lr 0.00010000 Epoch 121, weight, value: tensor([[ 0.0152, -0.1248, -0.0753, ..., -0.1399, -0.0679, -0.1213], [ 0.0279, -0.0394, 0.0150, ..., 0.0466, 0.0768, -0.0287], [-0.0538, 0.0801, -0.1104, ..., 0.0520, 0.0442, -0.0219], ..., [-0.0681, -0.0580, -0.0636, ..., 0.0018, -0.0900, 0.1042], [ 0.0646, -0.0121, 0.0412, ..., -0.0041, -0.1215, -0.0094], [-0.1113, -0.0182, -0.0162, ..., -0.1234, 0.0349, -0.0877]], device='cuda:0'), grad: tensor([[ 6.5193e-09, 2.7940e-08, 6.2026e-07, ..., 4.7497e-07, 1.2303e-06, 5.2620e-08], [ 1.2107e-08, 9.4529e-08, -4.3586e-06, ..., -1.4529e-06, -6.8694e-06, 1.7555e-07], [ 1.4901e-08, -8.3260e-07, 7.8045e-07, ..., -2.2678e-07, -4.5123e-07, 6.4261e-08], ..., [ 2.4680e-08, 4.1630e-07, 2.4326e-06, ..., 1.7025e-06, 6.8592e-07, -1.6661e-06], [ 2.7707e-07, 7.0781e-08, -9.9614e-06, ..., -2.0750e-06, 5.2452e-06, 7.0781e-08], [ 2.4680e-08, 1.5367e-08, 1.4016e-06, ..., 1.6419e-06, -6.7521e-08, 1.1139e-06]], device='cuda:0') Epoch 121, bias, value: tensor([-0.0065, -0.0305, 0.0081, -0.0200, 0.0138, 0.0074, 0.0201, -0.0020, -0.0320, -0.0048], device='cuda:0'), grad: tensor([-3.7146e-04, -7.7263e-06, 3.6415e-06, -8.4281e-05, 2.6003e-06, 1.0717e-04, 6.1154e-05, 4.0159e-06, 9.1968e-07, 2.8396e-04], device='cuda:0') 100 0.0001 changing lr epoch 120, time 262.94, cls_loss 0.0037 cls_loss_mapping 0.0086 cls_loss_causal 0.5859 re_mapping 0.0085 re_causal 0.0252 /// teacc 98.82 lr 0.00010000 Epoch 122, weight, value: tensor([[ 0.0152, -0.1253, -0.0756, ..., -0.1406, -0.0681, -0.1215], [ 0.0278, -0.0395, 0.0155, ..., 0.0466, 0.0770, -0.0290], [-0.0539, 0.0805, -0.1111, ..., 0.0519, 0.0446, -0.0221], ..., [-0.0680, -0.0581, -0.0641, ..., 0.0020, -0.0904, 0.1046], [ 0.0651, -0.0123, 0.0411, ..., -0.0042, -0.1222, -0.0094], [-0.1115, -0.0184, -0.0167, ..., -0.1239, 0.0351, -0.0878]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.9232e-07, 4.6194e-06, ..., 6.0126e-06, 1.0595e-05, 4.1910e-09], [ 0.0000e+00, -1.4961e-05, -3.9196e-04, ..., -5.1212e-04, -9.1648e-04, 5.4948e-08], [ 0.0000e+00, 3.4235e-06, 1.0359e-04, ..., 1.3423e-04, 2.3806e-04, 1.2573e-08], ..., [-0.0000e+00, 1.0379e-05, 2.6608e-04, ..., 3.4738e-04, 6.1893e-04, -1.8487e-07], [ 0.0000e+00, 1.2200e-07, 1.2629e-06, ..., 1.6112e-06, 5.6848e-06, 8.3819e-09], [ 0.0000e+00, 3.4692e-07, 9.4473e-06, ..., 1.1854e-05, 2.2918e-05, 7.9162e-08]], device='cuda:0') Epoch 122, bias, value: tensor([-0.0064, -0.0305, 0.0076, -0.0197, 0.0140, 0.0082, 0.0194, -0.0017, -0.0326, -0.0049], device='cuda:0'), grad: tensor([-5.3257e-05, -2.1286e-03, 5.5838e-04, 1.4320e-05, 3.7134e-05, -2.5138e-05, 7.0035e-05, 1.4410e-03, 2.0355e-05, 6.7770e-05], device='cuda:0') 100 0.0001 changing lr epoch 121, time 262.65, cls_loss 0.0039 cls_loss_mapping 0.0061 cls_loss_causal 0.5597 re_mapping 0.0080 re_causal 0.0246 /// teacc 98.89 lr 0.00010000 Epoch 123, weight, value: tensor([[ 0.0152, -0.1253, -0.0756, ..., -0.1409, -0.0682, -0.1215], [ 0.0277, -0.0396, 0.0134, ..., 0.0451, 0.0758, -0.0292], [-0.0539, 0.0809, -0.1117, ..., 0.0519, 0.0448, -0.0223], ..., [-0.0680, -0.0578, -0.0627, ..., 0.0030, -0.0888, 0.1045], [ 0.0651, -0.0125, 0.0425, ..., -0.0029, -0.1226, -0.0073], [-0.1115, -0.0195, -0.0171, ..., -0.1243, 0.0350, -0.0881]], device='cuda:0'), grad: tensor([[ 7.9162e-09, -1.5367e-08, 5.8115e-06, ..., 1.7146e-06, 8.4341e-06, 5.3085e-08], [ 8.3819e-09, 1.4389e-07, -2.0817e-05, ..., -3.2261e-06, -2.5943e-05, 2.4419e-06], [ 1.5832e-08, -3.3788e-06, 3.9861e-06, ..., -3.9674e-06, 2.1141e-06, -3.7253e-09], ..., [ 1.8626e-09, 3.0715e-06, -2.8033e-06, ..., 1.2107e-06, 8.8140e-06, -6.6385e-06], [ 1.0710e-08, 7.2643e-08, -7.4878e-07, ..., 1.3337e-06, 1.4760e-05, 2.2259e-07], [ 2.7940e-09, 2.3749e-08, -1.8999e-06, ..., -5.8375e-06, -2.6271e-05, 3.4980e-06]], device='cuda:0') Epoch 123, bias, value: tensor([-0.0064, -0.0325, 0.0075, -0.0198, 0.0138, 0.0079, 0.0196, -0.0001, -0.0315, -0.0052], device='cuda:0'), grad: tensor([ 1.4268e-05, -3.3289e-05, 6.3777e-06, 1.2860e-05, 8.2105e-06, 2.7493e-06, 6.4820e-06, -5.3719e-06, 6.9216e-06, -1.9088e-05], device='cuda:0') 100 0.0001 changing lr epoch 122, time 262.75, cls_loss 0.0042 cls_loss_mapping 0.0079 cls_loss_causal 0.5629 re_mapping 0.0084 re_causal 0.0239 /// teacc 98.83 lr 0.00010000 Epoch 124, weight, value: tensor([[ 0.0149, -0.1258, -0.0764, ..., -0.1417, -0.0684, -0.1216], [ 0.0285, -0.0397, 0.0139, ..., 0.0453, 0.0763, -0.0293], [-0.0562, 0.0811, -0.1113, ..., 0.0534, 0.0445, -0.0199], ..., [-0.0681, -0.0580, -0.0628, ..., 0.0030, -0.0893, 0.1048], [ 0.0653, -0.0146, 0.0412, ..., -0.0054, -0.1231, -0.0098], [-0.1117, -0.0199, -0.0181, ..., -0.1247, 0.0342, -0.0883]], device='cuda:0'), grad: tensor([[ 6.0536e-09, 8.3167e-07, 2.4606e-06, ..., 3.2634e-06, 1.8822e-06, 1.0012e-07], [ 3.7253e-09, 6.2631e-07, 8.0287e-05, ..., 1.0532e-04, -3.1918e-05, 1.3553e-05], [ 6.8918e-08, -7.4804e-05, -1.6332e-05, ..., -9.3341e-05, -4.8548e-05, 3.8045e-07], ..., [ 4.2375e-08, 7.5297e-07, -1.1450e-04, ..., -1.4174e-04, 1.1183e-05, -1.6630e-05], [ 3.6322e-08, 7.0691e-05, 4.3690e-05, ..., 1.1563e-04, 6.1691e-05, 1.0310e-06], [ 2.7940e-09, 1.3690e-07, 3.6377e-06, ..., 3.3863e-06, -1.8626e-08, 9.1502e-07]], device='cuda:0') Epoch 124, bias, value: tensor([-0.0059, -0.0322, 0.0080, -0.0190, 0.0150, 0.0079, 0.0194, -0.0003, -0.0330, -0.0064], device='cuda:0'), grad: tensor([ 6.5081e-06, 2.1958e-04, -1.5879e-04, -6.5148e-05, 6.5491e-06, 7.1764e-05, 3.1590e-06, -2.9874e-04, 2.0647e-04, 8.4043e-06], device='cuda:0') 100 0.0001 changing lr epoch 123, time 262.75, cls_loss 0.0037 cls_loss_mapping 0.0068 cls_loss_causal 0.5826 re_mapping 0.0083 re_causal 0.0266 /// teacc 98.84 lr 0.00010000 Epoch 125, weight, value: tensor([[ 0.0149, -0.1260, -0.0769, ..., -0.1426, -0.0686, -0.1218], [ 0.0286, -0.0398, 0.0151, ..., 0.0460, 0.0769, -0.0295], [-0.0563, 0.0817, -0.1123, ..., 0.0531, 0.0446, -0.0200], ..., [-0.0689, -0.0585, -0.0643, ..., 0.0025, -0.0904, 0.1052], [ 0.0653, -0.0144, 0.0416, ..., -0.0054, -0.1243, -0.0099], [-0.1121, -0.0213, -0.0191, ..., -0.1247, 0.0344, -0.0879]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 7.4040e-08, 2.5006e-07, ..., 2.3656e-07, 8.9034e-07, 9.7789e-09], [ 4.6566e-10, 6.5006e-07, -1.9651e-07, ..., 9.6578e-07, 3.7700e-06, 3.4366e-07], [ 9.3132e-10, -9.9167e-06, 3.9162e-07, ..., -1.9446e-05, -1.7896e-05, 1.3504e-08], ..., [ 1.8626e-09, 5.2620e-07, 1.0822e-06, ..., 1.2470e-06, 1.3476e-06, -4.6566e-09], [ 9.3132e-10, 1.3970e-07, -7.0184e-06, ..., -2.3209e-06, 1.3737e-06, 7.6368e-08], [ 9.3132e-10, 1.5367e-08, 5.2080e-06, ..., 1.1073e-06, 7.6108e-06, 2.1085e-06]], device='cuda:0') Epoch 125, bias, value: tensor([-0.0059, -0.0314, 0.0075, -0.0187, 0.0148, 0.0082, 0.0201, -0.0013, -0.0332, -0.0060], device='cuda:0'), grad: tensor([ 1.2107e-06, 7.5959e-06, -1.9804e-05, 9.3877e-07, -4.6670e-05, 1.2498e-06, 3.3706e-05, 4.0680e-06, -1.0453e-05, 2.8104e-05], device='cuda:0') 100 0.0001 changing lr epoch 124, time 262.92, cls_loss 0.0038 cls_loss_mapping 0.0056 cls_loss_causal 0.5601 re_mapping 0.0077 re_causal 0.0240 /// teacc 98.85 lr 0.00010000 Epoch 126, weight, value: tensor([[ 0.0146, -0.1266, -0.0774, ..., -0.1439, -0.0701, -0.1223], [ 0.0287, -0.0401, 0.0152, ..., 0.0460, 0.0769, -0.0297], [-0.0572, 0.0812, -0.1128, ..., 0.0527, 0.0446, -0.0202], ..., [-0.0689, -0.0580, -0.0644, ..., 0.0027, -0.0905, 0.1056], [ 0.0655, -0.0145, 0.0420, ..., -0.0054, -0.1249, -0.0100], [-0.1123, -0.0220, -0.0196, ..., -0.1252, 0.0357, -0.0881]], device='cuda:0'), grad: tensor([[ 1.7602e-07, 3.1013e-07, 1.0272e-06, ..., 8.0746e-07, 2.7306e-06, 5.0291e-08], [ 2.7940e-07, 2.8070e-06, 3.1322e-05, ..., 2.6003e-05, 1.4760e-05, 4.4517e-06], [ 2.9206e-06, 5.3830e-07, 1.7777e-05, ..., 7.3202e-06, -7.6741e-06, 9.9372e-07], ..., [ 1.1586e-06, 5.1968e-07, 6.7204e-06, ..., 4.2394e-06, 5.6587e-06, -1.0431e-06], [ 1.0664e-06, -2.9162e-05, -1.0455e-04, ..., -6.7055e-05, -1.0557e-05, -6.1244e-06], [ 3.7439e-07, 2.9895e-07, 2.9113e-06, ..., 2.6021e-06, -1.0490e-05, 1.1967e-06]], device='cuda:0') Epoch 126, bias, value: tensor([-0.0065, -0.0316, 0.0067, -0.0182, 0.0147, 0.0087, 0.0195, -0.0010, -0.0331, -0.0057], device='cuda:0'), grad: tensor([ 4.6417e-06, 7.8857e-05, 3.5077e-05, -2.8342e-05, 9.5069e-06, 1.4091e-04, 3.0510e-06, 1.8656e-05, -2.5129e-04, -1.1116e-05], device='cuda:0') 100 0.0001 changing lr epoch 125, time 262.43, cls_loss 0.0044 cls_loss_mapping 0.0060 cls_loss_causal 0.5895 re_mapping 0.0083 re_causal 0.0249 /// teacc 98.86 lr 0.00010000 Epoch 127, weight, value: tensor([[ 0.0147, -0.1269, -0.0781, ..., -0.1447, -0.0706, -0.1230], [ 0.0287, -0.0403, 0.0152, ..., 0.0458, 0.0768, -0.0302], [-0.0590, 0.0816, -0.1131, ..., 0.0526, 0.0456, -0.0203], ..., [-0.0696, -0.0584, -0.0646, ..., 0.0027, -0.0905, 0.1060], [ 0.0677, -0.0137, 0.0423, ..., -0.0049, -0.1262, -0.0101], [-0.1131, -0.0238, -0.0202, ..., -0.1262, 0.0352, -0.0884]], device='cuda:0'), grad: tensor([[-1.1556e-05, 2.7940e-09, -2.8256e-06, ..., 1.4715e-07, -4.4733e-05, 1.1176e-08], [ 2.9746e-06, 2.7940e-09, -9.3691e-07, ..., -1.3253e-06, 8.7246e-06, 3.5390e-08], [ 1.2480e-07, -3.3528e-08, 1.9073e-06, ..., 1.1642e-06, 3.1665e-07, 1.7695e-08], ..., [ 1.4063e-07, 1.8626e-08, 8.2608e-07, ..., 5.4762e-07, 1.1306e-06, -7.8231e-08], [ 6.5193e-07, 2.7940e-09, -1.6447e-06, ..., -6.3330e-07, 2.9691e-06, 4.3306e-07], [ 4.6380e-07, 9.3132e-10, 1.4799e-06, ..., 1.1986e-06, -8.1509e-06, 2.9989e-07]], device='cuda:0') Epoch 127, bias, value: tensor([-0.0067, -0.0319, 0.0065, -0.0190, 0.0142, 0.0098, 0.0201, -0.0009, -0.0327, -0.0055], device='cuda:0'), grad: tensor([-1.7989e-04, 4.0650e-05, 4.6343e-06, -4.9621e-06, 2.1219e-05, 1.2629e-05, 1.0788e-04, 3.0641e-06, 8.2180e-06, -1.3731e-05], device='cuda:0') 100 0.0001 changing lr epoch 126, time 262.54, cls_loss 0.0038 cls_loss_mapping 0.0060 cls_loss_causal 0.5580 re_mapping 0.0085 re_causal 0.0245 /// teacc 98.87 lr 0.00010000 Epoch 128, weight, value: tensor([[ 0.0146, -0.1274, -0.0781, ..., -0.1462, -0.0706, -0.1231], [ 0.0299, -0.0406, 0.0151, ..., 0.0459, 0.0769, -0.0303], [-0.0594, 0.0827, -0.1137, ..., 0.0535, 0.0465, -0.0203], ..., [-0.0693, -0.0591, -0.0645, ..., 0.0025, -0.0913, 0.1062], [ 0.0677, -0.0142, 0.0426, ..., -0.0052, -0.1270, -0.0103], [-0.1135, -0.0240, -0.0195, ..., -0.1264, 0.0358, -0.0883]], device='cuda:0'), grad: tensor([[-7.8510e-07, 1.2107e-08, 2.3190e-07, ..., 2.1048e-07, -7.1805e-07, 8.3819e-09], [ 7.7300e-08, 4.6566e-08, -4.6164e-05, ..., -2.8953e-05, -5.2363e-05, 2.8498e-07], [ 2.0582e-07, -1.0785e-06, 7.4413e-07, ..., -4.3772e-07, -4.2934e-07, 4.6566e-08], ..., [ 3.4459e-08, 5.1688e-07, 4.0263e-05, ..., 2.4840e-05, 4.6700e-05, -1.2470e-06], [ 6.9849e-08, 1.5832e-08, -1.5274e-07, ..., 1.8962e-06, 1.9372e-06, 5.2154e-08], [ 7.5437e-08, 8.3819e-09, 3.8408e-06, ..., 3.3267e-06, -5.9232e-06, 4.7404e-07]], device='cuda:0') Epoch 128, bias, value: tensor([-0.0061, -0.0321, 0.0076, -0.0191, 0.0138, 0.0089, 0.0200, -0.0009, -0.0332, -0.0051], device='cuda:0'), grad: tensor([-9.7156e-06, -9.2208e-05, 3.1292e-06, 1.6928e-05, 2.9340e-05, -6.8009e-05, 1.9550e-05, 8.2791e-05, 3.7104e-05, -1.8746e-05], device='cuda:0') 100 0.0001 changing lr epoch 127, time 262.06, cls_loss 0.0039 cls_loss_mapping 0.0057 cls_loss_causal 0.5335 re_mapping 0.0084 re_causal 0.0237 /// teacc 98.91 lr 0.00010000 Epoch 129, weight, value: tensor([[ 0.0149, -0.1277, -0.0786, ..., -0.1468, -0.0720, -0.1233], [ 0.0300, -0.0410, 0.0157, ..., 0.0465, 0.0777, -0.0299], [-0.0596, 0.0832, -0.1142, ..., 0.0536, 0.0467, -0.0205], ..., [-0.0694, -0.0592, -0.0650, ..., 0.0020, -0.0922, 0.1061], [ 0.0673, -0.0144, 0.0430, ..., -0.0055, -0.1275, -0.0104], [-0.1140, -0.0242, -0.0203, ..., -0.1269, 0.0365, -0.0885]], device='cuda:0'), grad: tensor([[ 9.1270e-08, 6.5193e-08, 2.1607e-07, ..., 3.0361e-07, 4.7404e-07, 1.5460e-07], [ 2.0675e-06, 2.6822e-07, -1.1492e-04, ..., -8.7142e-05, -1.9991e-04, -3.4243e-05], [-4.5169e-07, -1.6289e-06, 7.1526e-07, ..., -1.4016e-06, -1.1874e-06, 5.6718e-07], ..., [-8.5682e-06, 7.2736e-07, 5.1767e-05, ..., 3.7521e-05, 9.1195e-05, 8.8587e-06], [ 9.7696e-07, 3.4180e-07, 1.7453e-06, ..., 1.7844e-06, 4.9621e-06, 1.4091e-06], [ 2.4997e-06, 1.3039e-08, 5.0306e-05, ..., 3.9607e-05, 8.6784e-05, 1.7703e-05]], device='cuda:0') Epoch 129, bias, value: tensor([-0.0069, -0.0316, 0.0075, -0.0189, 0.0141, 0.0091, 0.0200, -0.0011, -0.0334, -0.0050], device='cuda:0'), grad: tensor([-5.9128e-05, -3.4523e-04, 4.4890e-07, 7.9200e-06, 3.8326e-05, 5.3458e-06, 9.0301e-06, 1.3351e-04, 1.6257e-05, 1.9348e-04], device='cuda:0') 100 0.0001 changing lr epoch 128, time 257.48, cls_loss 0.0024 cls_loss_mapping 0.0046 cls_loss_causal 0.5360 re_mapping 0.0085 re_causal 0.0249 /// teacc 98.93 lr 0.00010000 Epoch 130, weight, value: tensor([[ 0.0146, -0.1278, -0.0788, ..., -0.1475, -0.0730, -0.1237], [ 0.0300, -0.0409, 0.0160, ..., 0.0467, 0.0780, -0.0300], [-0.0597, 0.0834, -0.1144, ..., 0.0540, 0.0470, -0.0203], ..., [-0.0691, -0.0593, -0.0652, ..., 0.0018, -0.0926, 0.1063], [ 0.0677, -0.0146, 0.0435, ..., -0.0056, -0.1279, -0.0106], [-0.1146, -0.0243, -0.0208, ..., -0.1272, 0.0369, -0.0886]], device='cuda:0'), grad: tensor([[ 1.2843e-06, 3.4180e-07, 6.9439e-06, ..., 4.0755e-06, 1.0341e-05, 1.2107e-08], [-4.4703e-05, 5.4017e-08, -2.3305e-04, ..., -1.1259e-04, -1.8966e-04, 1.1735e-07], [ 2.3469e-07, -2.2072e-06, 2.2911e-06, ..., -3.6042e-06, -3.6750e-06, 1.3970e-08], ..., [ 1.5181e-07, 8.6334e-07, 1.1530e-06, ..., 2.3730e-06, 4.2431e-06, -5.6345e-07], [ 2.8536e-05, 1.3597e-07, 1.4174e-04, ..., 6.6280e-05, 1.2565e-04, 1.8626e-08], [ 1.8347e-07, 7.6368e-08, 6.9812e-06, ..., 4.2915e-06, 6.7018e-06, 2.9150e-07]], device='cuda:0') Epoch 130, bias, value: tensor([-0.0073, -0.0314, 0.0079, -0.0191, 0.0146, 0.0091, 0.0200, -0.0013, -0.0334, -0.0051], device='cuda:0'), grad: tensor([ 2.7090e-05, -4.4799e-04, 2.2948e-06, 7.6368e-06, 9.1672e-05, 1.3137e-04, -1.4877e-04, 8.0764e-06, 2.9230e-04, 3.5822e-05], device='cuda:0') 100 0.0001 changing lr epoch 129, time 261.35, cls_loss 0.0042 cls_loss_mapping 0.0062 cls_loss_causal 0.5495 re_mapping 0.0081 re_causal 0.0240 /// teacc 98.82 lr 0.00010000 Epoch 131, weight, value: tensor([[ 0.0142, -0.1284, -0.0798, ..., -0.1485, -0.0731, -0.1240], [ 0.0301, -0.0414, 0.0162, ..., 0.0467, 0.0779, -0.0306], [-0.0594, 0.0840, -0.1148, ..., 0.0547, 0.0479, -0.0200], ..., [-0.0691, -0.0593, -0.0655, ..., 0.0014, -0.0930, 0.1063], [ 0.0674, -0.0151, 0.0441, ..., -0.0058, -0.1285, -0.0106], [-0.1152, -0.0244, -0.0213, ..., -0.1277, 0.0369, -0.0890]], device='cuda:0'), grad: tensor([[ 7.4040e-07, 4.9360e-08, 7.2680e-06, ..., 3.0417e-06, 6.2771e-06, 4.0978e-08], [-2.4904e-06, 2.7940e-08, -2.6003e-05, ..., -1.2584e-05, -2.3827e-05, -9.0431e-07], [-1.2647e-06, -5.4501e-06, 3.2987e-06, ..., -1.4871e-05, -8.9034e-07, -1.6019e-07], ..., [ 4.7125e-07, 2.3562e-07, 7.4580e-06, ..., 4.9882e-06, 6.2063e-06, 1.9651e-07], [ 2.3656e-06, 5.0701e-06, 2.1309e-05, ..., 2.6047e-05, 1.0610e-05, 5.6531e-07], [ 9.6858e-08, 2.7940e-09, 1.6512e-06, ..., 8.5216e-07, 9.2760e-07, 3.1851e-07]], device='cuda:0') Epoch 131, bias, value: tensor([-0.0076, -0.0316, 0.0082, -0.0189, 0.0144, 0.0094, 0.0203, -0.0012, -0.0335, -0.0053], device='cuda:0'), grad: tensor([ 1.4633e-05, -4.9263e-05, -1.1452e-05, -1.9327e-05, -2.5425e-07, -1.7628e-05, -8.9873e-07, 1.5989e-05, 6.3479e-05, 4.6901e-06], device='cuda:0') 100 0.0001 changing lr epoch 130, time 262.81, cls_loss 0.0033 cls_loss_mapping 0.0065 cls_loss_causal 0.5523 re_mapping 0.0080 re_causal 0.0249 /// teacc 98.85 lr 0.00010000 Epoch 132, weight, value: tensor([[ 0.0141, -0.1288, -0.0804, ..., -0.1492, -0.0732, -0.1241], [ 0.0305, -0.0414, 0.0172, ..., 0.0475, 0.0786, -0.0295], [-0.0596, 0.0844, -0.1157, ..., 0.0546, 0.0482, -0.0203], ..., [-0.0691, -0.0594, -0.0663, ..., 0.0009, -0.0938, 0.1062], [ 0.0676, -0.0152, 0.0446, ..., -0.0057, -0.1292, -0.0107], [-0.1157, -0.0247, -0.0222, ..., -0.1281, 0.0357, -0.0892]], device='cuda:0'), grad: tensor([[ 3.4459e-08, 1.1828e-07, 1.7881e-07, ..., 4.5542e-07, 1.8533e-06, 3.1665e-08], [ 4.6566e-08, 5.4017e-08, -5.8301e-06, ..., -1.1539e-06, -4.4294e-06, 9.6764e-07], [ 2.3283e-08, -1.3541e-06, 2.9150e-07, ..., -1.1139e-05, -1.4238e-05, 1.0245e-07], ..., [ 4.3772e-08, 4.8243e-07, 4.7497e-07, ..., 2.2948e-06, 8.4490e-06, -3.8035e-06], [ 9.6858e-08, 7.4506e-08, 3.1758e-06, ..., 1.4389e-06, 5.5097e-06, 2.9337e-07], [ 2.1979e-07, 8.8476e-08, -7.9069e-07, ..., 8.7637e-07, -3.5781e-06, 2.6729e-07]], device='cuda:0') Epoch 132, bias, value: tensor([-0.0079, -0.0308, 0.0080, -0.0192, 0.0156, 0.0091, 0.0209, -0.0018, -0.0334, -0.0061], device='cuda:0'), grad: tensor([ 3.0436e-06, -6.1132e-06, -2.4229e-05, -2.3365e-05, 7.2159e-06, 4.0561e-05, -8.2701e-06, 8.3745e-06, 1.1414e-05, -8.7470e-06], device='cuda:0') 100 0.0001 changing lr epoch 131, time 262.47, cls_loss 0.0056 cls_loss_mapping 0.0095 cls_loss_causal 0.6045 re_mapping 0.0077 re_causal 0.0241 /// teacc 98.84 lr 0.00010000 Epoch 133, weight, value: tensor([[ 0.0139, -0.1291, -0.0806, ..., -0.1500, -0.0752, -0.1269], [ 0.0322, -0.0417, 0.0175, ..., 0.0474, 0.0792, -0.0297], [-0.0595, 0.0851, -0.1162, ..., 0.0551, 0.0497, -0.0202], ..., [-0.0697, -0.0599, -0.0665, ..., 0.0010, -0.0950, 0.1067], [ 0.0676, -0.0153, 0.0461, ..., -0.0054, -0.1294, -0.0105], [-0.1171, -0.0254, -0.0230, ..., -0.1295, 0.0378, -0.0887]], device='cuda:0'), grad: tensor([[ 2.0489e-08, 1.5181e-07, 4.3586e-07, ..., 5.2433e-07, -4.1008e-05, 1.7416e-07], [ 1.5832e-08, 1.8217e-06, 9.7826e-06, ..., 1.1675e-05, 1.4296e-06, -7.2829e-06], [ 8.3819e-08, -1.3515e-05, -2.6263e-07, ..., -1.9699e-05, 1.8328e-05, 3.7998e-07], ..., [ 4.8429e-08, 2.9430e-06, 2.1812e-06, ..., 5.7220e-06, 1.0215e-05, 3.5204e-07], [ 5.0105e-07, 6.8210e-06, -3.4839e-05, ..., -1.8626e-05, -2.6785e-06, -3.9302e-07], [ 1.6205e-07, 2.4959e-07, 2.3656e-06, ..., 2.1383e-06, 1.0234e-04, 6.0257e-07]], device='cuda:0') Epoch 133, bias, value: tensor([-0.0089, -0.0307, 0.0088, -0.0183, 0.0148, 0.0073, 0.0207, -0.0020, -0.0330, -0.0045], device='cuda:0'), grad: tensor([-1.1772e-04, 2.8506e-05, 1.3426e-05, 4.5709e-06, -1.8716e-04, 1.7315e-05, 4.3005e-05, 1.9714e-05, -3.8981e-05, 2.1744e-04], device='cuda:0') 100 0.0001 changing lr epoch 132, time 262.58, cls_loss 0.0037 cls_loss_mapping 0.0062 cls_loss_causal 0.5709 re_mapping 0.0083 re_causal 0.0248 /// teacc 98.82 lr 0.00010000 Epoch 134, weight, value: tensor([[ 0.0137, -0.1291, -0.0807, ..., -0.1512, -0.0746, -0.1273], [ 0.0333, -0.0422, 0.0180, ..., 0.0473, 0.0791, -0.0302], [-0.0591, 0.0861, -0.1167, ..., 0.0560, 0.0507, -0.0204], ..., [-0.0693, -0.0609, -0.0664, ..., 0.0012, -0.0956, 0.1075], [ 0.0667, -0.0154, 0.0462, ..., -0.0057, -0.1307, -0.0108], [-0.1174, -0.0256, -0.0239, ..., -0.1303, 0.0372, -0.0889]], device='cuda:0'), grad: tensor([[ 6.7055e-08, 2.5146e-08, 3.7160e-07, ..., 4.5542e-07, -3.0920e-07, 2.8871e-08], [ 9.4716e-07, 1.3970e-08, 4.6007e-07, ..., 8.1360e-06, 1.8664e-06, 3.4105e-06], [ 6.2995e-06, -3.3528e-07, 3.2037e-05, ..., 3.4213e-05, 4.8336e-07, 9.8906e-07], ..., [-1.0198e-06, 1.9744e-07, 1.3150e-06, ..., -9.4175e-06, -8.0839e-07, -5.9679e-06], [-7.1302e-06, 1.0245e-08, -3.0637e-05, ..., -2.8715e-05, 9.8906e-07, 3.6415e-07], [ 1.0524e-07, 2.4214e-08, 3.2131e-07, ..., 1.0198e-06, 3.8370e-07, 3.5390e-07]], device='cuda:0') Epoch 134, bias, value: tensor([-0.0079, -0.0309, 0.0095, -0.0185, 0.0159, 0.0072, 0.0194, -0.0019, -0.0336, -0.0053], device='cuda:0'), grad: tensor([-3.9279e-05, 2.1279e-05, 8.9884e-05, -7.1302e-06, 5.3123e-06, 2.3067e-05, 9.8124e-06, -3.6597e-05, -8.2672e-05, 1.6287e-05], device='cuda:0') 100 0.0001 changing lr epoch 133, time 262.32, cls_loss 0.0037 cls_loss_mapping 0.0054 cls_loss_causal 0.5429 re_mapping 0.0085 re_causal 0.0246 /// teacc 98.80 lr 0.00010000 Epoch 135, weight, value: tensor([[ 0.0134, -0.1298, -0.0810, ..., -0.1519, -0.0747, -0.1275], [ 0.0346, -0.0438, 0.0180, ..., 0.0466, 0.0781, -0.0306], [-0.0590, 0.0870, -0.1155, ..., 0.0572, 0.0533, -0.0207], ..., [-0.0693, -0.0611, -0.0666, ..., 0.0011, -0.0962, 0.1080], [ 0.0662, -0.0154, 0.0461, ..., -0.0057, -0.1315, -0.0108], [-0.1178, -0.0257, -0.0243, ..., -0.1311, 0.0371, -0.0891]], device='cuda:0'), grad: tensor([[ 2.7940e-08, 2.7940e-08, 1.7416e-07, ..., 5.2899e-07, 3.3993e-07, 4.9453e-07], [ 4.2003e-07, 1.1921e-07, 1.6801e-06, ..., 4.7609e-06, 4.6529e-06, 4.2170e-06], [ 5.3737e-07, -4.2561e-07, 2.1011e-06, ..., 1.1541e-05, 1.2070e-05, 1.1690e-05], ..., [ 2.7008e-08, 1.3877e-07, 5.6624e-07, ..., -2.7716e-05, -3.0205e-05, -3.0637e-05], [-2.1271e-06, -3.6508e-07, 5.9530e-06, ..., 7.1898e-07, 3.5614e-06, 2.7008e-06], [ 2.3469e-07, 2.6077e-08, 2.9169e-06, ..., 5.5246e-06, 5.0478e-06, 5.2787e-06]], device='cuda:0') Epoch 135, bias, value: tensor([-0.0078, -0.0314, 0.0110, -0.0192, 0.0158, 0.0077, 0.0197, -0.0019, -0.0341, -0.0054], device='cuda:0'), grad: tensor([ 9.9279e-07, 2.0266e-05, 4.7535e-05, 1.4853e-04, 3.9749e-06, -2.1172e-04, 1.4491e-05, -1.0723e-04, 5.0455e-05, 3.2634e-05], device='cuda:0') 100 0.0001 changing lr epoch 134, time 262.72, cls_loss 0.0042 cls_loss_mapping 0.0074 cls_loss_causal 0.5633 re_mapping 0.0084 re_causal 0.0238 /// teacc 98.89 lr 0.00010000 Epoch 136, weight, value: tensor([[ 0.0135, -0.1301, -0.0817, ..., -0.1526, -0.0746, -0.1276], [ 0.0350, -0.0440, 0.0190, ..., 0.0473, 0.0790, -0.0299], [-0.0592, 0.0874, -0.1165, ..., 0.0569, 0.0532, -0.0210], ..., [-0.0696, -0.0608, -0.0671, ..., 0.0012, -0.0968, 0.1087], [ 0.0669, -0.0157, 0.0468, ..., -0.0058, -0.1323, -0.0109], [-0.1184, -0.0261, -0.0258, ..., -0.1321, 0.0368, -0.0899]], device='cuda:0'), grad: tensor([[ 2.4214e-08, 2.7008e-08, 5.0291e-07, ..., 1.9930e-07, 3.9954e-07, 2.0489e-08], [ 5.9605e-08, 1.0664e-06, -3.7737e-06, ..., -2.6133e-06, -6.5751e-07, 2.7847e-07], [ 2.5425e-07, -1.1735e-07, 4.3437e-06, ..., 1.8757e-06, 3.1032e-06, 5.8673e-08], ..., [ 9.3132e-08, 1.0710e-07, 1.5823e-06, ..., -1.0328e-06, 1.7276e-06, -1.9092e-06], [-1.1111e-06, 2.5146e-08, -1.1533e-05, ..., -1.9688e-06, 7.8790e-07, 4.0978e-07], [ 3.6694e-07, 2.3283e-08, 5.3123e-06, ..., 1.7360e-06, -9.9745e-07, 6.2957e-07]], device='cuda:0') Epoch 136, bias, value: tensor([-0.0071, -0.0307, 0.0106, -0.0212, 0.0159, 0.0091, 0.0198, -0.0020, -0.0333, -0.0063], device='cuda:0'), grad: tensor([ 5.6699e-06, 5.6662e-06, 1.0975e-05, 1.7762e-05, 3.4153e-05, -5.2595e-04, 2.5439e-04, -2.2817e-07, 1.9395e-04, 3.4403e-06], device='cuda:0') 100 0.0001 changing lr epoch 135, time 262.38, cls_loss 0.0032 cls_loss_mapping 0.0053 cls_loss_causal 0.5619 re_mapping 0.0081 re_causal 0.0248 /// teacc 98.97 lr 0.00010000 Epoch 137, weight, value: tensor([[ 0.0134, -0.1307, -0.0825, ..., -0.1533, -0.0747, -0.1276], [ 0.0352, -0.0435, 0.0194, ..., 0.0472, 0.0793, -0.0310], [-0.0593, 0.0874, -0.1177, ..., 0.0562, 0.0527, -0.0213], ..., [-0.0696, -0.0609, -0.0671, ..., 0.0019, -0.0964, 0.1097], [ 0.0669, -0.0155, 0.0485, ..., -0.0054, -0.1319, -0.0110], [-0.1189, -0.0263, -0.0273, ..., -0.1331, 0.0369, -0.0901]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 1.2359e-06, 1.0338e-07, ..., 6.3237e-07, 4.8578e-06, 2.2352e-08], [ 5.5879e-09, 3.7074e-05, -6.6031e-07, ..., 1.2860e-05, 1.2779e-04, 3.8650e-07], [ 1.1269e-07, -4.1313e-06, 5.8953e-07, ..., -5.4687e-05, -4.4644e-05, -5.9791e-07], ..., [ 6.0536e-08, 5.6438e-06, 4.0699e-07, ..., 3.4988e-05, 3.8952e-05, -8.1025e-08], [ 1.3970e-08, 1.1921e-06, 2.3935e-07, ..., 3.3919e-06, 6.2138e-06, 2.2165e-07], [ 3.7253e-09, 3.7253e-07, -2.7940e-07, ..., 1.9874e-06, 1.6643e-06, -2.3935e-07]], device='cuda:0') Epoch 137, bias, value: tensor([-0.0071, -0.0307, 0.0096, -0.0207, 0.0160, 0.0087, 0.0189, -0.0013, -0.0325, -0.0066], device='cuda:0'), grad: tensor([ 5.9679e-06, 2.2578e-04, -8.8811e-05, 4.6846e-07, 4.9584e-06, 8.8196e-07, -2.4033e-04, 7.4804e-05, 1.4342e-05, 2.4177e-06], device='cuda:0') 100 0.0001 changing lr epoch 136, time 262.85, cls_loss 0.0032 cls_loss_mapping 0.0053 cls_loss_causal 0.5493 re_mapping 0.0080 re_causal 0.0241 /// teacc 98.86 lr 0.00010000 Epoch 138, weight, value: tensor([[ 0.0137, -0.1309, -0.0833, ..., -0.1538, -0.0747, -0.1276], [ 0.0356, -0.0433, 0.0198, ..., 0.0474, 0.0799, -0.0314], [-0.0597, 0.0878, -0.1185, ..., 0.0558, 0.0525, -0.0215], ..., [-0.0696, -0.0613, -0.0671, ..., 0.0023, -0.0968, 0.1103], [ 0.0670, -0.0155, 0.0485, ..., -0.0054, -0.1327, -0.0111], [-0.1198, -0.0264, -0.0277, ..., -0.1335, 0.0373, -0.0904]], device='cuda:0'), grad: tensor([[ 9.6858e-08, 6.0536e-08, 2.2911e-07, ..., 2.2911e-07, 2.2396e-05, 5.6811e-08], [ 1.0850e-06, 6.8545e-07, -1.2619e-06, ..., 1.7304e-06, 1.1157e-06, 9.4622e-07], [ 1.4760e-05, 8.2552e-06, 6.5658e-07, ..., 3.0443e-05, 3.7253e-08, 1.1526e-05], ..., [-2.9102e-05, -1.7062e-05, 8.9686e-07, ..., -6.0946e-05, 3.8482e-06, -2.2084e-05], [ 7.9796e-06, 4.7535e-06, -2.4121e-07, ..., 1.6600e-05, 2.4978e-06, 6.1691e-06], [ 1.2387e-07, 3.1665e-08, 2.8312e-07, ..., 2.5984e-07, 1.7462e-06, 3.7160e-07]], device='cuda:0') Epoch 138, bias, value: tensor([-0.0071, -0.0305, 0.0092, -0.0209, 0.0152, 0.0089, 0.0189, -0.0010, -0.0329, -0.0060], device='cuda:0'), grad: tensor([ 8.1301e-05, 8.3223e-06, 4.4435e-05, 3.2987e-06, 4.7386e-05, 1.5244e-05, -1.6594e-04, -7.3433e-05, 3.1978e-05, 7.4282e-06], device='cuda:0') 100 0.0001 changing lr epoch 137, time 262.28, cls_loss 0.0026 cls_loss_mapping 0.0046 cls_loss_causal 0.5232 re_mapping 0.0080 re_causal 0.0233 /// teacc 98.93 lr 0.00010000 Epoch 139, weight, value: tensor([[ 0.0137, -0.1315, -0.0843, ..., -0.1546, -0.0747, -0.1277], [ 0.0357, -0.0431, 0.0203, ..., 0.0474, 0.0802, -0.0320], [-0.0600, 0.0878, -0.1191, ..., 0.0555, 0.0525, -0.0218], ..., [-0.0692, -0.0611, -0.0674, ..., 0.0026, -0.0971, 0.1118], [ 0.0671, -0.0154, 0.0488, ..., -0.0053, -0.1329, -0.0112], [-0.1202, -0.0270, -0.0282, ..., -0.1342, 0.0370, -0.0907]], device='cuda:0'), grad: tensor([[ 2.4900e-05, 9.9652e-08, 5.7742e-08, ..., 2.1141e-07, 7.2457e-06, 5.5879e-09], [ 8.5309e-07, 1.3411e-07, -4.7497e-08, ..., 2.7660e-07, 2.1514e-07, 1.1269e-07], [ 1.5125e-06, -1.2284e-06, 1.1269e-07, ..., -1.6252e-06, -2.0824e-06, -8.3819e-09], ..., [ 2.6170e-07, 6.0722e-07, 1.1828e-07, ..., 4.7497e-07, 1.3048e-06, -4.0419e-07], [ 8.1807e-06, 1.1735e-07, 1.0412e-06, ..., 1.2452e-06, 2.4103e-06, 2.4214e-08], [ 1.4724e-06, 2.2352e-08, 2.1979e-07, ..., 3.3714e-07, 4.1816e-07, 7.0781e-08]], device='cuda:0') Epoch 139, bias, value: tensor([-0.0071, -0.0305, 0.0087, -0.0206, 0.0153, 0.0088, 0.0187, -0.0002, -0.0328, -0.0065], device='cuda:0'), grad: tensor([ 8.3387e-05, 3.2671e-06, 1.4296e-06, 3.9190e-05, 1.0125e-05, -2.3991e-05, -1.5378e-04, 2.5034e-06, 3.1650e-05, 6.0536e-06], device='cuda:0') 100 0.0001 changing lr epoch 138, time 262.23, cls_loss 0.0035 cls_loss_mapping 0.0069 cls_loss_causal 0.5560 re_mapping 0.0080 re_causal 0.0235 /// teacc 98.81 lr 0.00010000 Epoch 140, weight, value: tensor([[ 0.0133, -0.1317, -0.0849, ..., -0.1555, -0.0754, -0.1289], [ 0.0355, -0.0436, 0.0212, ..., 0.0481, 0.0807, -0.0311], [-0.0602, 0.0884, -0.1196, ..., 0.0559, 0.0529, -0.0214], ..., [-0.0691, -0.0618, -0.0685, ..., 0.0016, -0.0982, 0.1108], [ 0.0673, -0.0153, 0.0495, ..., -0.0048, -0.1335, -0.0101], [-0.1217, -0.0272, -0.0293, ..., -0.1349, 0.0378, -0.0901]], device='cuda:0'), grad: tensor([[ 5.0943e-07, 3.7253e-09, 7.0818e-06, ..., 2.1942e-06, 6.1095e-07, 5.5879e-09], [ 1.2666e-07, 8.3819e-09, 5.7407e-06, ..., 4.2170e-06, -8.6240e-07, 6.1467e-08], [ 4.6287e-07, -1.0058e-07, 6.5230e-06, ..., 3.1032e-06, 1.1642e-07, 5.5879e-08], ..., [ 6.6031e-07, 5.5879e-09, 3.0454e-06, ..., 1.3411e-06, 2.1905e-06, -1.8347e-07], [ 2.3432e-06, 1.4901e-08, -7.4469e-06, ..., -1.4931e-05, 9.1456e-07, 6.1467e-08], [ 2.5965e-06, 0.0000e+00, 4.8041e-05, ..., 1.9431e-05, 2.4494e-07, -9.6858e-08]], device='cuda:0') Epoch 140, bias, value: tensor([-0.0078, -0.0301, 0.0090, -0.0204, 0.0149, 0.0089, 0.0190, -0.0010, -0.0327, -0.0061], device='cuda:0'), grad: tensor([ 9.6783e-06, 1.5780e-05, 1.4447e-05, -1.2058e-04, -1.6531e-06, 2.7806e-05, -4.6566e-07, 1.0915e-05, -4.4912e-05, 8.8930e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 139---------------------------------------------------- epoch 139, time 279.28, cls_loss 0.0034 cls_loss_mapping 0.0054 cls_loss_causal 0.5502 re_mapping 0.0077 re_causal 0.0229 /// teacc 99.05 lr 0.00010000 Epoch 141, weight, value: tensor([[ 0.0132, -0.1332, -0.0850, ..., -0.1564, -0.0756, -0.1290], [ 0.0353, -0.0437, 0.0216, ..., 0.0484, 0.0810, -0.0308], [-0.0603, 0.0889, -0.1200, ..., 0.0560, 0.0531, -0.0215], ..., [-0.0692, -0.0619, -0.0689, ..., 0.0011, -0.0989, 0.1108], [ 0.0670, -0.0151, 0.0501, ..., -0.0048, -0.1342, -0.0100], [-0.1227, -0.0277, -0.0302, ..., -0.1356, 0.0370, -0.0905]], device='cuda:0'), grad: tensor([[-1.6484e-07, -6.4634e-07, 2.1327e-07, ..., 2.3283e-07, 1.8533e-07, 4.2189e-07], [ 4.3772e-08, 1.3970e-08, 1.3672e-06, ..., 2.5202e-06, 3.4459e-07, 2.8703e-06], [ 4.1910e-08, 9.2201e-08, 3.8743e-07, ..., 5.1688e-07, 1.6764e-08, 8.0187e-07], ..., [ 5.5879e-09, 4.3772e-08, 2.2259e-07, ..., -3.8929e-06, 2.1048e-07, -2.2829e-05], [-6.5193e-08, 2.8871e-08, 5.2303e-05, ..., 3.1888e-05, 1.5691e-05, 3.5577e-07], [ 9.2201e-08, 3.5577e-07, 7.7579e-07, ..., 1.7453e-06, -6.1374e-07, 1.4842e-05]], device='cuda:0') Epoch 141, bias, value: tensor([-0.0081, -0.0300, 0.0089, -0.0199, 0.0161, 0.0089, 0.0186, -0.0012, -0.0323, -0.0070], device='cuda:0'), grad: tensor([-1.4946e-05, 1.1973e-05, 5.5172e-06, 8.5950e-05, 5.6103e-06, -3.5739e-04, 2.8476e-05, -6.0767e-05, 2.4414e-04, 5.1558e-05], device='cuda:0') 100 0.0001 changing lr epoch 140, time 262.24, cls_loss 0.0035 cls_loss_mapping 0.0054 cls_loss_causal 0.5304 re_mapping 0.0076 re_causal 0.0227 /// teacc 98.93 lr 0.00010000 Epoch 142, weight, value: tensor([[ 0.0127, -0.1335, -0.0848, ..., -0.1576, -0.0749, -0.1291], [ 0.0354, -0.0446, 0.0217, ..., 0.0481, 0.0807, -0.0312], [-0.0604, 0.0908, -0.1203, ..., 0.0562, 0.0546, -0.0216], ..., [-0.0693, -0.0627, -0.0690, ..., 0.0016, -0.0994, 0.1113], [ 0.0684, -0.0152, 0.0507, ..., -0.0050, -0.1337, -0.0101], [-0.1231, -0.0282, -0.0306, ..., -0.1364, 0.0370, -0.0908]], device='cuda:0'), grad: tensor([[ 2.2259e-07, 6.8266e-07, 9.0152e-07, ..., 2.1476e-06, 9.7416e-07, 7.4971e-07], [ 2.1700e-07, 6.6962e-07, 1.0617e-05, ..., 1.7762e-05, 7.0930e-06, 1.0721e-05], [-3.5077e-05, -1.1516e-04, 3.1650e-05, ..., -9.9838e-05, -8.8513e-06, 3.1501e-05], ..., [ 3.4511e-05, 1.1301e-04, -4.8429e-05, ..., 6.9261e-05, -1.8021e-06, -4.9382e-05], [ 5.4948e-08, 1.2480e-07, -8.5216e-07, ..., 2.0582e-07, 1.5097e-06, 3.1758e-07], [ 5.0291e-08, 3.5390e-08, 2.3562e-06, ..., 2.4885e-06, -6.2883e-06, 1.2266e-06]], device='cuda:0') Epoch 142, bias, value: tensor([-0.0070, -0.0307, 0.0095, -0.0200, 0.0158, 0.0089, 0.0166, -0.0008, -0.0307, -0.0072], device='cuda:0'), grad: tensor([ 5.5879e-06, 4.9204e-05, -1.0353e-04, 1.7226e-05, 1.1444e-05, 2.5705e-06, -1.8012e-06, 2.4587e-05, 4.0643e-06, -9.9540e-06], device='cuda:0') 100 0.0001 changing lr epoch 141, time 260.17, cls_loss 0.0027 cls_loss_mapping 0.0052 cls_loss_causal 0.5522 re_mapping 0.0077 re_causal 0.0232 /// teacc 98.95 lr 0.00010000 Epoch 143, weight, value: tensor([[ 0.0122, -0.1361, -0.0850, ..., -0.1582, -0.0750, -0.1291], [ 0.0354, -0.0451, 0.0219, ..., 0.0481, 0.0809, -0.0314], [-0.0604, 0.0916, -0.1209, ..., 0.0564, 0.0548, -0.0216], ..., [-0.0695, -0.0631, -0.0691, ..., 0.0016, -0.0997, 0.1118], [ 0.0685, -0.0155, 0.0510, ..., -0.0049, -0.1347, -0.0101], [-0.1232, -0.0289, -0.0318, ..., -0.1371, 0.0369, -0.0909]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -9.8720e-08, 2.7101e-07, ..., 2.8778e-07, -1.4501e-06, 1.0245e-08], [ 0.0000e+00, -1.1906e-05, -3.2932e-05, ..., -3.4243e-05, -5.1677e-05, -2.0768e-07], [ 1.8626e-09, 1.0833e-05, 2.6390e-05, ..., 2.8923e-05, 4.1842e-05, 5.2154e-08], ..., [ 9.3132e-10, 6.9290e-07, 6.3255e-06, ..., 4.7795e-06, 8.4713e-06, -2.0675e-07], [ 9.3132e-10, 7.1712e-08, -1.8878e-06, ..., -1.8599e-06, 5.1595e-07, 2.3283e-08], [ 0.0000e+00, 2.2352e-07, 7.7393e-07, ..., 8.6147e-07, 1.9595e-06, 1.1176e-07]], device='cuda:0') Epoch 143, bias, value: tensor([-0.0072, -0.0307, 0.0094, -0.0198, 0.0150, 0.0085, 0.0182, -0.0001, -0.0309, -0.0077], device='cuda:0'), grad: tensor([-4.1962e-05, -6.8843e-05, 6.0886e-05, 1.4678e-05, -2.8592e-06, -1.0870e-05, 1.2055e-05, 1.4715e-05, 6.5938e-07, 2.1502e-05], device='cuda:0') 100 0.0001 changing lr epoch 142, time 259.68, cls_loss 0.0035 cls_loss_mapping 0.0055 cls_loss_causal 0.5485 re_mapping 0.0074 re_causal 0.0217 /// teacc 98.90 lr 0.00010000 Epoch 144, weight, value: tensor([[ 0.0122, -0.1363, -0.0856, ..., -0.1591, -0.0753, -0.1293], [ 0.0354, -0.0465, 0.0222, ..., 0.0481, 0.0808, -0.0316], [-0.0601, 0.0939, -0.1214, ..., 0.0568, 0.0557, -0.0216], ..., [-0.0701, -0.0646, -0.0692, ..., 0.0016, -0.1004, 0.1123], [ 0.0687, -0.0165, 0.0516, ..., -0.0050, -0.1358, -0.0103], [-0.1233, -0.0308, -0.0325, ..., -0.1380, 0.0365, -0.0913]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.2399e-07, 2.8312e-07, ..., 8.4378e-07, 6.5230e-06, 6.3330e-08], [ 0.0000e+00, 2.6263e-07, -7.4506e-08, ..., 1.4007e-06, 1.1157e-06, 8.4564e-07], [ 0.0000e+00, -3.8669e-06, 1.1008e-06, ..., -6.5565e-06, -2.9188e-06, 4.5821e-07], ..., [ 0.0000e+00, 1.7080e-06, 2.1979e-07, ..., 1.1902e-06, 3.4925e-06, -2.1514e-06], [ 0.0000e+00, 2.7753e-07, -2.6990e-06, ..., 9.5554e-07, 2.9430e-05, -2.3283e-07], [ 0.0000e+00, 1.3784e-07, 5.5879e-07, ..., 5.6997e-07, -5.1931e-06, 2.5891e-07]], device='cuda:0') Epoch 144, bias, value: tensor([-6.8900e-03, -3.0989e-02, 1.0346e-02, -1.9950e-02, 1.5448e-02, 8.5766e-03, 1.8889e-02, -2.1528e-05, -3.1354e-02, -8.7578e-03], device='cuda:0'), grad: tensor([ 1.0245e-05, 4.6715e-06, -7.0632e-06, 4.2021e-06, 4.9658e-06, 1.3500e-05, -6.9261e-05, 4.8876e-06, 4.2766e-05, -8.9258e-06], device='cuda:0') 100 0.0001 changing lr epoch 143, time 259.23, cls_loss 0.0023 cls_loss_mapping 0.0038 cls_loss_causal 0.5416 re_mapping 0.0075 re_causal 0.0230 /// teacc 98.98 lr 0.00010000 Epoch 145, weight, value: tensor([[ 0.0122, -0.1364, -0.0860, ..., -0.1598, -0.0754, -0.1295], [ 0.0353, -0.0466, 0.0225, ..., 0.0480, 0.0809, -0.0317], [-0.0596, 0.0941, -0.1216, ..., 0.0569, 0.0558, -0.0216], ..., [-0.0704, -0.0646, -0.0696, ..., 0.0016, -0.1009, 0.1125], [ 0.0685, -0.0167, 0.0514, ..., -0.0053, -0.1364, -0.0105], [-0.1234, -0.0309, -0.0329, ..., -0.1381, 0.0366, -0.0913]], device='cuda:0'), grad: tensor([[ 1.1176e-08, 0.0000e+00, 2.3656e-07, ..., 3.7253e-08, 3.7439e-07, 9.3132e-09], [ 2.6077e-08, 0.0000e+00, -7.4506e-07, ..., -1.1548e-07, -1.2666e-06, 3.1665e-08], [ 6.8918e-08, 0.0000e+00, 4.1910e-07, ..., 1.3411e-07, 8.0466e-07, 2.0489e-08], ..., [ 2.7940e-08, 0.0000e+00, 3.5018e-07, ..., 7.0781e-08, 3.5577e-07, -2.0862e-07], [-1.1120e-06, 0.0000e+00, -6.4448e-06, ..., -2.3376e-06, 2.0303e-07, 1.6764e-08], [ 2.0489e-08, 0.0000e+00, 2.5332e-07, ..., 1.0431e-07, 1.9558e-07, 9.3132e-08]], device='cuda:0') Epoch 145, bias, value: tensor([-6.8792e-03, -3.1122e-02, 1.0350e-02, -1.9841e-02, 1.5689e-02, 9.0812e-03, 1.8215e-02, 9.5557e-05, -3.2026e-02, -8.7079e-03], device='cuda:0'), grad: tensor([ 1.0766e-06, -1.5292e-06, 1.6876e-06, 5.9493e-06, -6.8359e-07, 3.2205e-06, -1.0468e-06, 4.2841e-07, -9.8124e-06, 6.9290e-07], device='cuda:0') 100 0.0001 changing lr epoch 144, time 261.48, cls_loss 0.0033 cls_loss_mapping 0.0056 cls_loss_causal 0.5869 re_mapping 0.0074 re_causal 0.0234 /// teacc 98.97 lr 0.00010000 Epoch 146, weight, value: tensor([[ 0.0122, -0.1366, -0.0868, ..., -0.1602, -0.0756, -0.1296], [ 0.0355, -0.0467, 0.0240, ..., 0.0486, 0.0819, -0.0297], [-0.0597, 0.0943, -0.1221, ..., 0.0567, 0.0558, -0.0221], ..., [-0.0707, -0.0645, -0.0714, ..., 0.0009, -0.1019, 0.1110], [ 0.0683, -0.0167, 0.0520, ..., -0.0049, -0.1374, -0.0093], [-0.1235, -0.0310, -0.0332, ..., -0.1383, 0.0367, -0.0917]], device='cuda:0'), grad: tensor([[ 4.5821e-07, 1.3001e-06, 3.5390e-08, ..., 2.6282e-06, 5.3085e-06, 1.8626e-08], [ 5.7742e-08, 2.1994e-04, 8.7544e-08, ..., 5.4550e-04, 8.5545e-04, 5.7742e-08], [ 3.7253e-08, -2.2948e-04, -1.1921e-07, ..., -5.6696e-04, -8.8787e-04, 8.3819e-08], ..., [ 9.3132e-09, 2.7046e-06, 1.0915e-05, ..., 1.8120e-05, 9.6858e-06, 9.4324e-06], [ 4.9733e-07, 2.7176e-06, 1.2927e-06, ..., 7.7263e-06, 1.1042e-05, 1.0952e-06], [ 2.2352e-08, 1.5087e-07, 1.1362e-07, ..., 3.7439e-07, 3.9488e-07, 8.9407e-08]], device='cuda:0') Epoch 146, bias, value: tensor([-0.0071, -0.0296, 0.0099, -0.0197, 0.0156, 0.0104, 0.0169, -0.0014, -0.0320, -0.0085], device='cuda:0'), grad: tensor([ 2.1942e-06, 1.4658e-03, -1.5230e-03, -1.3635e-05, 7.4022e-06, 2.0061e-06, -2.3693e-06, 3.4213e-05, 2.3752e-05, 3.9339e-06], device='cuda:0') 100 0.0001 changing lr epoch 145, time 261.94, cls_loss 0.0036 cls_loss_mapping 0.0055 cls_loss_causal 0.5715 re_mapping 0.0072 re_causal 0.0226 /// teacc 98.85 lr 0.00010000 Epoch 147, weight, value: tensor([[ 0.0117, -0.1369, -0.0873, ..., -0.1610, -0.0758, -0.1297], [ 0.0358, -0.0473, 0.0228, ..., 0.0475, 0.0818, -0.0312], [-0.0599, 0.0931, -0.1228, ..., 0.0565, 0.0558, -0.0235], ..., [-0.0708, -0.0627, -0.0699, ..., 0.0019, -0.1025, 0.1125], [ 0.0687, -0.0166, 0.0543, ..., -0.0040, -0.1360, -0.0095], [-0.1239, -0.0313, -0.0337, ..., -0.1386, 0.0368, -0.0918]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 7.7486e-07, 5.9605e-08, ..., 9.0711e-07, 3.4682e-06, -6.6422e-06], [ 0.0000e+00, 5.8822e-06, 5.9269e-06, ..., 8.6352e-06, 1.6764e-05, 9.9279e-07], [ 0.0000e+00, -1.4283e-05, 3.8370e-07, ..., -1.5102e-05, -2.5719e-05, 5.9046e-07], ..., [ 0.0000e+00, 5.4948e-07, 1.0487e-06, ..., -6.7614e-07, 3.5111e-06, -2.6170e-06], [ 0.0000e+00, 8.4750e-07, -7.5959e-06, ..., -1.1381e-06, 3.0026e-06, 7.2643e-07], [ 0.0000e+00, 2.5518e-07, 1.3150e-06, ..., 1.2554e-06, 9.8944e-06, 1.0058e-06]], device='cuda:0') Epoch 147, bias, value: tensor([-0.0074, -0.0311, 0.0089, -0.0193, 0.0155, 0.0103, 0.0161, -0.0001, -0.0302, -0.0086], device='cuda:0'), grad: tensor([-1.5795e-04, 4.2975e-05, -4.0531e-05, 2.9430e-06, -1.0051e-05, 2.2352e-05, 7.8022e-05, 4.3586e-06, 1.7226e-05, 4.0472e-05], device='cuda:0') 100 0.0001 changing lr epoch 146, time 262.12, cls_loss 0.0029 cls_loss_mapping 0.0047 cls_loss_causal 0.5570 re_mapping 0.0077 re_causal 0.0225 /// teacc 98.88 lr 0.00010000 Epoch 148, weight, value: tensor([[ 0.0118, -0.1377, -0.0878, ..., -0.1619, -0.0757, -0.1297], [ 0.0360, -0.0476, 0.0230, ..., 0.0474, 0.0824, -0.0317], [-0.0601, 0.0937, -0.1236, ..., 0.0565, 0.0558, -0.0240], ..., [-0.0710, -0.0638, -0.0700, ..., 0.0018, -0.1032, 0.1138], [ 0.0687, -0.0163, 0.0537, ..., -0.0043, -0.1369, -0.0098], [-0.1240, -0.0316, -0.0337, ..., -0.1389, 0.0370, -0.0926]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 2.0489e-08, 2.6077e-08, ..., 5.7742e-08, 3.5278e-06, 2.4214e-08], [ 1.8626e-09, 2.9616e-07, -1.9576e-06, ..., 2.2352e-07, -1.5944e-06, 5.1409e-07], [-6.1467e-08, -7.3761e-07, 2.7940e-08, ..., -9.0711e-07, -1.1511e-06, 1.0617e-07], ..., [ 5.0291e-08, 2.2538e-07, 6.6869e-07, ..., -5.4576e-07, 9.3132e-07, -1.6205e-06], [ 4.6566e-08, 1.5460e-07, 6.5193e-08, ..., 4.2282e-07, 3.0577e-05, 1.8440e-07], [ 1.8626e-09, 7.4506e-09, 1.2107e-07, ..., 3.6694e-07, 2.4978e-06, 3.8184e-07]], device='cuda:0') Epoch 148, bias, value: tensor([-0.0074, -0.0311, 0.0087, -0.0191, 0.0152, 0.0110, 0.0167, 0.0001, -0.0316, -0.0087], device='cuda:0'), grad: tensor([ 7.0073e-06, -1.6186e-06, -1.3709e-06, 2.8741e-06, 3.2093e-06, 3.4666e-04, -4.2248e-04, 2.3507e-06, 6.1005e-05, 1.8980e-06], device='cuda:0') 100 0.0001 changing lr epoch 147, time 262.20, cls_loss 0.0035 cls_loss_mapping 0.0057 cls_loss_causal 0.5805 re_mapping 0.0073 re_causal 0.0221 /// teacc 98.96 lr 0.00010000 Epoch 149, weight, value: tensor([[ 0.0122, -0.1381, -0.0880, ..., -0.1625, -0.0759, -0.1299], [ 0.0359, -0.0478, 0.0230, ..., 0.0472, 0.0822, -0.0318], [-0.0602, 0.0935, -0.1239, ..., 0.0568, 0.0564, -0.0243], ..., [-0.0710, -0.0638, -0.0702, ..., 0.0020, -0.1037, 0.1142], [ 0.0688, -0.0156, 0.0542, ..., -0.0039, -0.1369, -0.0100], [-0.1240, -0.0307, -0.0345, ..., -0.1394, 0.0365, -0.0926]], device='cuda:0'), grad: tensor([[ 1.3039e-08, 4.7684e-07, 6.7055e-08, ..., 7.3016e-07, 1.5981e-06, 1.1735e-07], [ 1.8626e-09, 1.2852e-06, -1.0170e-06, ..., 1.3020e-06, 1.2312e-06, 1.3970e-07], [-2.4214e-08, -3.0383e-05, 3.8929e-07, ..., -3.6597e-05, -3.6329e-05, -5.1148e-06], ..., [ 1.6764e-08, 1.3225e-06, 3.0920e-07, ..., 2.1365e-06, 3.9935e-06, -7.2457e-07], [-2.4214e-08, 2.5123e-05, -5.6066e-07, ..., 2.8700e-05, 3.1233e-05, 5.1223e-06], [ 1.8626e-09, 3.5577e-07, 7.0781e-08, ..., 5.8301e-07, -5.6297e-05, 2.9802e-07]], device='cuda:0') Epoch 149, bias, value: tensor([-6.8160e-03, -3.1481e-02, 8.5887e-03, -1.8734e-02, 1.6498e-02, 1.0562e-02, 1.6535e-02, -5.8479e-05, -3.1007e-02, -9.7955e-03], device='cuda:0'), grad: tensor([ 1.6354e-06, 4.8950e-06, -6.9380e-05, 9.1419e-06, 1.3995e-04, 2.5742e-06, 3.4124e-06, 1.0177e-05, 6.1989e-05, -1.6427e-04], device='cuda:0') 100 0.0001 changing lr epoch 148, time 262.13, cls_loss 0.0025 cls_loss_mapping 0.0041 cls_loss_causal 0.5323 re_mapping 0.0075 re_causal 0.0218 /// teacc 98.96 lr 0.00010000 Epoch 150, weight, value: tensor([[ 0.0119, -0.1379, -0.0884, ..., -0.1635, -0.0759, -0.1299], [ 0.0362, -0.0480, 0.0233, ..., 0.0476, 0.0824, -0.0317], [-0.0603, 0.0938, -0.1243, ..., 0.0571, 0.0574, -0.0247], ..., [-0.0711, -0.0631, -0.0704, ..., 0.0018, -0.1046, 0.1147], [ 0.0687, -0.0160, 0.0541, ..., -0.0041, -0.1378, -0.0102], [-0.1243, -0.0317, -0.0348, ..., -0.1399, 0.0366, -0.0929]], device='cuda:0'), grad: tensor([[ 1.8865e-05, 2.1998e-06, 8.3074e-06, ..., 1.6615e-05, 1.3307e-05, 5.5879e-09], [ 2.9057e-06, 3.3062e-06, 1.6764e-07, ..., 1.8895e-05, 1.0587e-05, 9.3132e-09], [-2.2918e-05, -4.2945e-05, -3.3583e-06, ..., -1.9729e-04, -1.1426e-04, -3.7253e-09], ..., [ 3.2913e-06, 1.0200e-05, 1.6391e-06, ..., 3.3170e-05, 2.0683e-05, 1.3970e-07], [ 1.5855e-05, 9.6783e-06, 9.4771e-06, ..., 1.4700e-05, 1.4625e-05, -4.8429e-08], [ 1.2573e-06, 7.9535e-07, 4.9919e-07, ..., 5.0142e-06, 9.6485e-07, -2.0117e-07]], device='cuda:0') Epoch 150, bias, value: tensor([-0.0065, -0.0312, 0.0089, -0.0186, 0.0163, 0.0100, 0.0169, -0.0002, -0.0314, -0.0098], device='cuda:0'), grad: tensor([-6.6233e-04, 3.6657e-05, -6.5327e-05, 2.2352e-04, 1.8060e-04, 8.6427e-05, -6.5684e-05, 6.7711e-05, 1.6117e-04, 3.8803e-05], device='cuda:0') 100 0.0001 changing lr epoch 149, time 261.49, cls_loss 0.0028 cls_loss_mapping 0.0043 cls_loss_causal 0.5381 re_mapping 0.0073 re_causal 0.0212 /// teacc 98.93 lr 0.00010000 Epoch 151, weight, value: tensor([[ 0.0114, -0.1382, -0.0885, ..., -0.1649, -0.0765, -0.1300], [ 0.0365, -0.0482, 0.0236, ..., 0.0479, 0.0830, -0.0319], [-0.0605, 0.0946, -0.1249, ..., 0.0571, 0.0575, -0.0245], ..., [-0.0713, -0.0640, -0.0705, ..., 0.0017, -0.1054, 0.1150], [ 0.0687, -0.0157, 0.0542, ..., -0.0040, -0.1385, -0.0102], [-0.1249, -0.0328, -0.0353, ..., -0.1404, 0.0371, -0.0930]], device='cuda:0'), grad: tensor([[-1.1921e-07, 3.1665e-08, 2.0489e-08, ..., 2.9989e-07, -8.0094e-08, 9.3132e-09], [ 6.7055e-08, 6.5565e-07, -2.9802e-08, ..., 2.1067e-06, 1.3113e-06, 6.5565e-07], [-2.4773e-07, -1.0915e-06, 8.3819e-08, ..., -7.7635e-06, -3.2485e-06, 1.8068e-07], ..., [ 2.1048e-07, 2.1420e-07, 6.2399e-07, ..., 3.1572e-06, 1.3225e-06, -1.0133e-06], [ 3.3528e-08, 5.5879e-08, -4.3772e-07, ..., 2.8871e-07, 5.1968e-07, 3.5390e-08], [ 5.2154e-08, 1.6764e-08, -1.7919e-06, ..., 1.1362e-07, -7.0781e-08, 4.2841e-08]], device='cuda:0') Epoch 151, bias, value: tensor([-0.0068, -0.0311, 0.0088, -0.0175, 0.0160, 0.0086, 0.0174, -0.0003, -0.0316, -0.0092], device='cuda:0'), grad: tensor([-1.5255e-06, 3.4403e-06, -1.0356e-05, 2.3264e-06, 3.1710e-05, 9.7416e-07, -1.4734e-06, 1.4901e-05, 1.2461e-06, -4.1306e-05], device='cuda:0') 100 0.0001 changing lr epoch 150, time 260.62, cls_loss 0.0036 cls_loss_mapping 0.0053 cls_loss_causal 0.5569 re_mapping 0.0073 re_causal 0.0212 /// teacc 98.93 lr 0.00010000 Epoch 152, weight, value: tensor([[ 0.0151, -0.1384, -0.0888, ..., -0.1664, -0.0765, -0.1300], [ 0.0365, -0.0486, 0.0237, ..., 0.0479, 0.0832, -0.0319], [-0.0613, 0.0953, -0.1252, ..., 0.0572, 0.0576, -0.0247], ..., [-0.0714, -0.0638, -0.0716, ..., 0.0006, -0.1060, 0.1136], [ 0.0686, -0.0156, 0.0552, ..., -0.0024, -0.1391, -0.0086], [-0.1260, -0.0352, -0.0368, ..., -0.1421, 0.0384, -0.0932]], device='cuda:0'), grad: tensor([[ 1.0580e-06, 7.4506e-09, 4.2841e-08, ..., 5.7481e-06, 2.5518e-07, 1.0304e-05], [ 9.1642e-07, 1.8626e-09, -2.0489e-07, ..., 5.5768e-06, 1.7136e-07, 9.4548e-06], [ 7.3388e-07, -5.0291e-08, 7.2643e-08, ..., 6.5938e-06, 7.8976e-07, 9.0003e-06], ..., [-3.9712e-06, 1.1176e-08, 6.1467e-08, ..., -4.2260e-05, 2.9430e-07, -5.4806e-05], [ 2.2724e-07, 1.4901e-08, 1.2852e-07, ..., 6.5751e-06, 1.6466e-06, 6.2324e-06], [ 4.3586e-07, 0.0000e+00, 9.5926e-07, ..., 2.4829e-06, 7.3835e-06, 4.3176e-06]], device='cuda:0') Epoch 152, bias, value: tensor([-0.0056, -0.0312, 0.0084, -0.0167, 0.0148, 0.0084, 0.0174, -0.0014, -0.0306, -0.0088], device='cuda:0'), grad: tensor([ 3.0324e-05, 2.7210e-05, 2.5466e-05, 3.2365e-05, -3.4690e-05, 1.4119e-06, 2.9989e-07, -1.4138e-04, 1.8120e-05, 4.0859e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 151---------------------------------------------------- epoch 151, time 276.40, cls_loss 0.0032 cls_loss_mapping 0.0052 cls_loss_causal 0.5551 re_mapping 0.0074 re_causal 0.0213 /// teacc 99.06 lr 0.00010000 Epoch 153, weight, value: tensor([[ 1.5786e-02, -1.3908e-01, -8.8877e-02, ..., -1.6768e-01, -7.6487e-02, -1.3008e-01], [ 3.8513e-02, -4.9525e-02, 2.4310e-02, ..., 4.8156e-02, 8.2941e-02, -3.2473e-02], [-6.1386e-02, 9.6836e-02, -1.2500e-01, ..., 5.8504e-02, 5.9178e-02, -2.4917e-02], ..., [-7.1543e-02, -6.3689e-02, -7.2168e-02, ..., -4.9144e-05, -1.0638e-01, 1.1298e-01], [ 6.6605e-02, -1.6713e-02, 5.5319e-02, ..., -1.9923e-03, -1.4126e-01, -7.3466e-03], [-1.2637e-01, -3.5514e-02, -3.7453e-02, ..., -1.4300e-01, 3.8315e-02, -9.3554e-02]], device='cuda:0'), grad: tensor([[-4.3027e-07, 3.9116e-08, 1.0803e-07, ..., 2.4959e-07, 3.6694e-07, 1.0058e-07], [ 2.2165e-07, 1.4901e-08, -2.6077e-08, ..., 2.2016e-06, -9.8720e-08, 2.4103e-06], [ 1.2107e-06, -3.6694e-07, 9.1456e-07, ..., 5.6624e-07, -2.7753e-07, 1.4529e-07], ..., [ 4.2841e-08, 5.4017e-08, -1.6481e-05, ..., -1.5700e-04, 3.0547e-07, -1.1820e-04], [-1.4976e-06, 1.7509e-07, -1.0058e-06, ..., 2.0955e-06, 5.0478e-07, 2.4643e-06], [ 5.4017e-08, 1.1176e-08, 5.2527e-07, ..., 4.6380e-06, 1.8440e-07, 4.1686e-06]], device='cuda:0') Epoch 153, bias, value: tensor([-0.0054, -0.0313, 0.0098, -0.0168, 0.0150, 0.0081, 0.0173, -0.0017, -0.0307, -0.0089], device='cuda:0'), grad: tensor([-1.2740e-06, 1.1489e-05, 4.4554e-06, 4.7851e-04, -2.9430e-07, 2.2873e-06, -3.4887e-06, -5.1546e-04, 6.6012e-06, 1.7866e-05], device='cuda:0') 100 0.0001 changing lr epoch 152, time 258.60, cls_loss 0.0030 cls_loss_mapping 0.0050 cls_loss_causal 0.5384 re_mapping 0.0079 re_causal 0.0220 /// teacc 98.96 lr 0.00010000 Epoch 154, weight, value: tensor([[ 0.0160, -0.1394, -0.0895, ..., -0.1694, -0.0769, -0.1301], [ 0.0385, -0.0487, 0.0246, ..., 0.0485, 0.0843, -0.0339], [-0.0613, 0.0975, -0.1268, ..., 0.0576, 0.0584, -0.0253], ..., [-0.0717, -0.0635, -0.0714, ..., 0.0019, -0.1066, 0.1146], [ 0.0645, -0.0183, 0.0538, ..., -0.0035, -0.1420, -0.0077], [-0.1266, -0.0353, -0.0379, ..., -0.1435, 0.0380, -0.0937]], device='cuda:0'), grad: tensor([[ 1.8626e-08, 2.7940e-08, 1.7695e-07, ..., 3.7998e-07, 3.8557e-07, 2.2352e-08], [ 1.6764e-08, 3.1665e-08, -2.5518e-07, ..., 5.5879e-08, -4.5821e-07, 4.4703e-08], [ 3.0547e-07, -1.8068e-07, 2.4792e-06, ..., 5.3458e-06, 5.7742e-08, 9.3132e-09], ..., [ 2.2352e-08, 6.7055e-08, 3.3341e-07, ..., 4.4331e-07, 1.7509e-07, -2.9244e-07], [ 6.3330e-08, 2.4214e-08, 8.9779e-07, ..., 1.2536e-06, 1.3206e-06, 3.1665e-08], [ 5.5879e-09, 1.8626e-09, 1.1921e-07, ..., 1.3411e-07, -2.9802e-07, 1.0431e-07]], device='cuda:0') Epoch 154, bias, value: tensor([-0.0056, -0.0312, 0.0090, -0.0164, 0.0154, 0.0080, 0.0172, -0.0003, -0.0320, -0.0092], device='cuda:0'), grad: tensor([ 1.4473e-06, 2.2724e-07, 1.1653e-05, -8.5175e-05, 7.7300e-07, 6.9678e-05, -4.2543e-06, 3.7439e-07, 6.3255e-06, -1.1344e-06], device='cuda:0') 100 0.0001 changing lr epoch 153, time 259.12, cls_loss 0.0033 cls_loss_mapping 0.0061 cls_loss_causal 0.5324 re_mapping 0.0072 re_causal 0.0211 /// teacc 98.94 lr 0.00010000 Epoch 155, weight, value: tensor([[ 0.0157, -0.1398, -0.0901, ..., -0.1711, -0.0770, -0.1302], [ 0.0384, -0.0489, 0.0249, ..., 0.0491, 0.0850, -0.0348], [-0.0616, 0.0982, -0.1279, ..., 0.0570, 0.0581, -0.0255], ..., [-0.0717, -0.0640, -0.0713, ..., 0.0022, -0.1074, 0.1153], [ 0.0648, -0.0184, 0.0537, ..., -0.0037, -0.1425, -0.0077], [-0.1270, -0.0349, -0.0388, ..., -0.1440, 0.0382, -0.0949]], device='cuda:0'), grad: tensor([[ 4.6752e-07, 3.7812e-07, 9.6858e-08, ..., 7.6555e-07, 9.3132e-09, 1.8626e-09], [ 3.2820e-06, 1.0971e-06, -3.1106e-07, ..., 2.7306e-06, -9.2387e-07, 4.8429e-08], [ 6.3255e-06, 8.0466e-07, 7.8045e-07, ..., 3.9749e-06, -2.1607e-06, 9.3132e-09], ..., [ 4.3586e-07, 4.7870e-07, 2.5146e-07, ..., 6.0722e-07, 1.2442e-06, -3.3528e-07], [-2.4334e-05, -7.6815e-06, -2.6766e-06, ..., -2.1532e-05, 4.2841e-07, 1.6950e-07], [ 1.4715e-07, 2.8126e-07, 2.2538e-07, ..., 6.1095e-07, 3.7812e-07, 1.3039e-08]], device='cuda:0') Epoch 155, bias, value: tensor([-0.0058, -0.0313, 0.0086, -0.0166, 0.0154, 0.0083, 0.0173, 0.0013, -0.0323, -0.0104], device='cuda:0'), grad: tensor([ 1.3430e-06, 1.4499e-05, 2.6450e-05, 7.6443e-06, 4.1053e-06, 3.0816e-05, 1.9863e-05, 4.6007e-06, -1.1301e-04, 3.8818e-06], device='cuda:0') 100 0.0001 changing lr epoch 154, time 259.47, cls_loss 0.0032 cls_loss_mapping 0.0045 cls_loss_causal 0.5273 re_mapping 0.0074 re_causal 0.0212 /// teacc 98.96 lr 0.00010000 Epoch 156, weight, value: tensor([[ 0.0158, -0.1404, -0.0892, ..., -0.1712, -0.0766, -0.1302], [ 0.0384, -0.0494, 0.0257, ..., 0.0492, 0.0851, -0.0350], [-0.0617, 0.1002, -0.1281, ..., 0.0578, 0.0586, -0.0254], ..., [-0.0718, -0.0645, -0.0720, ..., 0.0018, -0.1085, 0.1153], [ 0.0643, -0.0204, 0.0534, ..., -0.0045, -0.1428, -0.0077], [-0.1277, -0.0357, -0.0395, ..., -0.1448, 0.0382, -0.0944]], device='cuda:0'), grad: tensor([[ 2.6077e-08, 5.5879e-09, 1.5274e-07, ..., 5.7742e-08, 1.1250e-06, 5.2154e-08], [ 7.6368e-08, 7.4506e-09, 2.3097e-07, ..., 1.3784e-07, 1.3673e-04, 3.1665e-07], [ 5.6997e-07, -6.7055e-08, 2.9411e-06, ..., 7.4133e-07, 4.5002e-06, 6.1467e-08], ..., [ 3.0175e-07, 3.3528e-08, 1.5479e-06, ..., 3.3528e-08, 8.0824e-05, -9.6112e-07], [ 4.8801e-07, 9.3132e-09, 2.3991e-06, ..., 5.8487e-07, 4.3809e-06, 7.0781e-08], [ 1.1921e-07, 1.8626e-09, 6.0536e-07, ..., 2.3842e-07, -2.4366e-04, 4.0606e-07]], device='cuda:0') Epoch 156, bias, value: tensor([-0.0050, -0.0311, 0.0091, -0.0165, 0.0155, 0.0087, 0.0167, 0.0008, -0.0328, -0.0103], device='cuda:0'), grad: tensor([ 3.5409e-06, 2.8348e-04, 1.5087e-05, -9.9391e-06, 2.5034e-05, 3.9116e-06, 1.3039e-07, 1.4687e-04, 1.4730e-05, -4.8327e-04], device='cuda:0') 100 0.0001 changing lr epoch 155, time 259.40, cls_loss 0.0030 cls_loss_mapping 0.0035 cls_loss_causal 0.5644 re_mapping 0.0067 re_causal 0.0212 /// teacc 98.97 lr 0.00010000 Epoch 157, weight, value: tensor([[ 0.0158, -0.1405, -0.0896, ..., -0.1734, -0.0769, -0.1302], [ 0.0383, -0.0501, 0.0254, ..., 0.0484, 0.0844, -0.0351], [-0.0618, 0.1009, -0.1277, ..., 0.0587, 0.0596, -0.0256], ..., [-0.0719, -0.0653, -0.0722, ..., 0.0020, -0.1095, 0.1154], [ 0.0642, -0.0201, 0.0538, ..., -0.0042, -0.1430, -0.0078], [-0.1279, -0.0358, -0.0401, ..., -0.1454, 0.0384, -0.0945]], device='cuda:0'), grad: tensor([[ 9.3132e-09, 1.8626e-09, -1.2666e-07, ..., 5.2154e-08, 1.0803e-07, 1.1176e-08], [ 1.4529e-07, 1.8626e-09, 5.6252e-07, ..., 5.4762e-07, 2.2538e-07, 1.5087e-07], [ 2.7940e-07, -1.4901e-08, 1.2051e-06, ..., 8.1398e-07, 7.7672e-07, 5.9605e-08], ..., [ 1.5832e-07, 7.4506e-09, 8.9966e-07, ..., 4.4703e-08, 5.3644e-07, -5.1782e-07], [ 3.7253e-08, 0.0000e+00, -3.4571e-06, ..., -2.7996e-06, 3.4776e-06, 3.1665e-08], [ 2.0489e-08, 0.0000e+00, 2.0433e-06, ..., 1.7043e-06, 2.9989e-07, 9.8720e-08]], device='cuda:0') Epoch 157, bias, value: tensor([-0.0052, -0.0321, 0.0099, -0.0167, 0.0153, 0.0079, 0.0182, 0.0009, -0.0325, -0.0102], device='cuda:0'), grad: tensor([-3.5297e-06, 2.2482e-06, 3.7495e-06, -2.0489e-07, 6.1691e-06, 3.0786e-05, -4.9949e-05, 1.6596e-06, 5.9605e-08, 8.9854e-06], device='cuda:0') 100 0.0001 changing lr epoch 156, time 259.57, cls_loss 0.0028 cls_loss_mapping 0.0048 cls_loss_causal 0.5464 re_mapping 0.0070 re_causal 0.0205 /// teacc 98.99 lr 0.00010000 Epoch 158, weight, value: tensor([[ 0.0159, -0.1407, -0.0903, ..., -0.1743, -0.0769, -0.1303], [ 0.0383, -0.0506, 0.0243, ..., 0.0472, 0.0841, -0.0359], [-0.0617, 0.1013, -0.1280, ..., 0.0588, 0.0599, -0.0257], ..., [-0.0725, -0.0656, -0.0712, ..., 0.0034, -0.1095, 0.1159], [ 0.0641, -0.0198, 0.0542, ..., -0.0040, -0.1427, -0.0078], [-0.1281, -0.0358, -0.0406, ..., -0.1468, 0.0387, -0.0950]], device='cuda:0'), grad: tensor([[ 2.6077e-08, 3.7253e-09, 2.6077e-08, ..., 3.5390e-08, 2.1793e-07, 9.3132e-09], [ 5.5879e-09, 2.9244e-07, 1.4156e-07, ..., 1.7453e-06, 6.6496e-07, 6.2585e-07], [ 5.5879e-09, -4.4145e-07, 2.8685e-07, ..., -7.5437e-07, -1.3132e-06, 3.7439e-07], ..., [ 2.6077e-08, 8.3819e-08, -7.0222e-07, ..., -3.7309e-06, 4.8056e-07, -3.0510e-06], [ 4.2841e-08, 2.6077e-08, -4.0978e-08, ..., 2.6077e-07, 7.5810e-07, 1.6764e-07], [ 3.7253e-09, 3.7253e-09, 1.5274e-07, ..., 3.7812e-07, -2.7940e-08, 2.7381e-07]], device='cuda:0') Epoch 158, bias, value: tensor([-0.0050, -0.0333, 0.0099, -0.0169, 0.0153, 0.0081, 0.0179, 0.0016, -0.0321, -0.0104], device='cuda:0'), grad: tensor([-5.1782e-07, 3.5986e-06, 3.1292e-07, 4.4107e-06, 2.7195e-07, 1.0543e-06, -3.7309e-06, -9.9689e-06, 3.2745e-06, 1.2629e-06], device='cuda:0') 100 0.0001 changing lr epoch 157, time 259.83, cls_loss 0.0024 cls_loss_mapping 0.0036 cls_loss_causal 0.5456 re_mapping 0.0071 re_causal 0.0216 /// teacc 98.90 lr 0.00010000 Epoch 159, weight, value: tensor([[ 0.0163, -0.1408, -0.0910, ..., -0.1752, -0.0769, -0.1303], [ 0.0383, -0.0508, 0.0246, ..., 0.0473, 0.0845, -0.0363], [-0.0616, 0.1016, -0.1284, ..., 0.0588, 0.0602, -0.0260], ..., [-0.0726, -0.0657, -0.0712, ..., 0.0030, -0.1111, 0.1155], [ 0.0641, -0.0198, 0.0533, ..., -0.0046, -0.1436, -0.0078], [-0.1282, -0.0362, -0.0413, ..., -0.1452, 0.0389, -0.0929]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3877e-07, 2.6077e-07, ..., 2.1234e-07, 5.0142e-06, 3.7253e-09], [ 3.7253e-09, 5.4389e-07, 2.1979e-07, ..., 1.3374e-06, 1.0058e-06, 2.2352e-08], [ 0.0000e+00, -6.2212e-07, 1.0431e-06, ..., -3.5763e-07, -6.0350e-07, 1.4901e-08], ..., [ 3.7253e-09, 1.7509e-07, 5.5134e-07, ..., 6.7800e-07, 3.8259e-06, -1.4901e-07], [-1.8626e-08, 3.2037e-07, -2.0973e-06, ..., -1.6093e-06, 1.1250e-06, 1.8626e-08], [ 3.7253e-09, 3.7253e-08, 7.9721e-07, ..., 5.2527e-07, -8.4862e-06, 8.1956e-08]], device='cuda:0') Epoch 159, bias, value: tensor([-0.0046, -0.0332, 0.0098, -0.0167, 0.0153, 0.0083, 0.0182, 0.0007, -0.0329, -0.0096], device='cuda:0'), grad: tensor([ 1.9446e-05, 4.8518e-05, 4.1313e-06, 2.2471e-05, 4.0054e-05, -1.2502e-05, -3.4541e-05, -8.0585e-04, 6.6943e-06, 7.1192e-04], device='cuda:0') 100 0.0001 changing lr epoch 158, time 259.74, cls_loss 0.0022 cls_loss_mapping 0.0037 cls_loss_causal 0.5588 re_mapping 0.0070 re_causal 0.0216 /// teacc 98.98 lr 0.00010000 Epoch 160, weight, value: tensor([[ 0.0162, -0.1408, -0.0913, ..., -0.1771, -0.0770, -0.1303], [ 0.0383, -0.0512, 0.0249, ..., 0.0472, 0.0844, -0.0364], [-0.0616, 0.1022, -0.1288, ..., 0.0593, 0.0607, -0.0262], ..., [-0.0725, -0.0660, -0.0713, ..., 0.0030, -0.1118, 0.1156], [ 0.0640, -0.0202, 0.0533, ..., -0.0050, -0.1442, -0.0079], [-0.1283, -0.0363, -0.0418, ..., -0.1452, 0.0390, -0.0925]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.4820e-07, 3.0547e-07, ..., 1.4938e-06, 1.5646e-06, 1.0803e-07], [ 0.0000e+00, 4.5076e-07, -1.8999e-05, ..., -1.4119e-05, -1.9029e-05, 8.0466e-07], [ 0.0000e+00, -5.3525e-05, 1.7397e-06, ..., -2.6360e-05, -4.9531e-05, 2.9191e-05], ..., [ 0.0000e+00, 4.8056e-07, 1.1623e-05, ..., -7.1108e-05, 1.2904e-05, -3.2932e-05], [ 3.7253e-09, 5.0247e-05, 4.6790e-06, ..., 1.0079e-04, 5.2989e-05, 9.5740e-07], [ 0.0000e+00, 2.2352e-07, 3.1814e-06, ..., 2.7865e-06, 2.3730e-06, 4.0233e-07]], device='cuda:0') Epoch 160, bias, value: tensor([-0.0042, -0.0332, 0.0101, -0.0169, 0.0153, 0.0085, 0.0182, 0.0004, -0.0334, -0.0092], device='cuda:0'), grad: tensor([ 4.2394e-06, -4.0442e-05, -4.6432e-05, 1.2457e-05, 1.1474e-06, -2.4199e-05, -5.9158e-06, -8.5235e-05, 1.6856e-04, 1.5602e-05], device='cuda:0') 100 0.0001 changing lr epoch 159, time 259.59, cls_loss 0.0026 cls_loss_mapping 0.0040 cls_loss_causal 0.5550 re_mapping 0.0067 re_causal 0.0209 /// teacc 98.89 lr 0.00010000 Epoch 161, weight, value: tensor([[ 0.0163, -0.1410, -0.0917, ..., -0.1788, -0.0792, -0.1304], [ 0.0384, -0.0527, 0.0261, ..., 0.0473, 0.0843, -0.0365], [-0.0618, 0.1030, -0.1293, ..., 0.0597, 0.0618, -0.0267], ..., [-0.0720, -0.0662, -0.0715, ..., 0.0031, -0.1125, 0.1158], [ 0.0639, -0.0205, 0.0525, ..., -0.0056, -0.1461, -0.0080], [-0.1285, -0.0362, -0.0423, ..., -0.1459, 0.0402, -0.0927]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 5.5879e-08, ..., 1.8626e-08, -1.1027e-06, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, -1.1548e-07, ..., -6.3330e-08, 2.0117e-07, 4.0978e-08], [ 0.0000e+00, -7.4506e-09, 1.3784e-07, ..., 7.0781e-08, 2.8685e-07, 1.1176e-08], ..., [ 0.0000e+00, 0.0000e+00, 1.6019e-07, ..., 2.9802e-08, 6.6683e-07, -1.1176e-07], [ 0.0000e+00, 0.0000e+00, -1.6019e-07, ..., -1.1176e-07, 1.4603e-06, 1.1176e-08], [ 0.0000e+00, 0.0000e+00, 2.8685e-07, ..., 1.1176e-07, -2.2613e-06, 2.2352e-08]], device='cuda:0') Epoch 161, bias, value: tensor([-0.0056, -0.0330, 0.0105, -0.0167, 0.0152, 0.0082, 0.0182, 0.0005, -0.0340, -0.0084], device='cuda:0'), grad: tensor([ 5.2750e-05, 2.5705e-06, 3.2075e-06, 2.1458e-05, 5.9493e-06, 7.6666e-06, -1.1367e-04, 3.8408e-06, 2.2113e-05, -5.8413e-06], device='cuda:0') 100 0.0001 changing lr epoch 160, time 259.55, cls_loss 0.0030 cls_loss_mapping 0.0051 cls_loss_causal 0.5415 re_mapping 0.0071 re_causal 0.0208 /// teacc 98.91 lr 0.00010000 Epoch 162, weight, value: tensor([[ 0.0161, -0.1412, -0.0925, ..., -0.1802, -0.0794, -0.1304], [ 0.0384, -0.0528, 0.0270, ..., 0.0485, 0.0854, -0.0355], [-0.0624, 0.1034, -0.1302, ..., 0.0593, 0.0622, -0.0288], ..., [-0.0720, -0.0665, -0.0721, ..., 0.0024, -0.1149, 0.1159], [ 0.0639, -0.0205, 0.0528, ..., -0.0054, -0.1464, -0.0080], [-0.1287, -0.0363, -0.0429, ..., -0.1465, 0.0409, -0.0928]], device='cuda:0'), grad: tensor([[-7.4506e-09, 3.2410e-07, 9.9465e-07, ..., 1.2517e-06, 1.4268e-06, 2.6077e-08], [ 0.0000e+00, 2.6077e-08, -7.6532e-05, ..., -7.6950e-05, -1.8024e-04, 1.4156e-07], [ 0.0000e+00, -1.7099e-06, 7.4983e-05, ..., 7.3493e-05, 1.7118e-04, 7.8231e-08], ..., [ 0.0000e+00, 4.5449e-07, 2.1420e-06, ..., 2.0526e-06, 3.3379e-06, -6.3702e-07], [ 0.0000e+00, 1.6019e-07, -2.2836e-06, ..., -1.8068e-06, 1.6205e-06, 7.4506e-08], [ 3.7253e-09, 3.7253e-08, 4.9248e-06, ..., 2.8722e-06, 8.5682e-08, 1.3039e-07]], device='cuda:0') Epoch 162, bias, value: tensor([-6.1018e-03, -3.1724e-02, 9.8041e-03, -1.7036e-02, 1.3636e-02, 8.5496e-03, 1.9008e-02, -8.1377e-05, -3.4139e-02, -7.2731e-03], device='cuda:0'), grad: tensor([-1.8537e-05, -2.3723e-04, 2.2566e-04, -1.6987e-05, 1.0729e-06, 4.2543e-06, 2.1338e-05, 6.3628e-06, 1.7621e-06, 1.2405e-05], device='cuda:0') 100 0.0001 changing lr epoch 161, time 259.22, cls_loss 0.0024 cls_loss_mapping 0.0046 cls_loss_causal 0.5087 re_mapping 0.0067 re_causal 0.0202 /// teacc 98.87 lr 0.00010000 Epoch 163, weight, value: tensor([[ 0.0160, -0.1414, -0.0940, ..., -0.1823, -0.0801, -0.1305], [ 0.0385, -0.0524, 0.0276, ..., 0.0490, 0.0865, -0.0357], [-0.0627, 0.1046, -0.1312, ..., 0.0606, 0.0626, -0.0289], ..., [-0.0722, -0.0681, -0.0721, ..., 0.0010, -0.1168, 0.1160], [ 0.0642, -0.0210, 0.0531, ..., -0.0053, -0.1480, -0.0081], [-0.1290, -0.0364, -0.0437, ..., -0.1470, 0.0411, -0.0928]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.7253e-08, 4.0978e-08, ..., 2.7567e-07, 1.7881e-07, 0.0000e+00], [ 0.0000e+00, 2.6822e-07, -1.2740e-06, ..., 7.3388e-07, -9.6485e-07, 1.4901e-08], [ 0.0000e+00, 2.3097e-06, 1.0580e-06, ..., 1.5043e-05, 6.6906e-06, 2.9802e-08], ..., [ 0.0000e+00, -1.7628e-05, 2.1607e-07, ..., -1.0777e-04, -4.0919e-05, -5.9605e-08], [ 0.0000e+00, 1.3411e-07, 1.0431e-07, ..., 9.8348e-07, 1.2405e-06, 3.7253e-09], [ 0.0000e+00, 1.3165e-05, 7.8231e-08, ..., 8.0764e-05, 2.9743e-05, 3.7253e-09]], device='cuda:0') Epoch 163, bias, value: tensor([-0.0071, -0.0313, 0.0108, -0.0172, 0.0136, 0.0084, 0.0198, -0.0010, -0.0344, -0.0069], device='cuda:0'), grad: tensor([ 8.8662e-07, 2.0824e-06, 3.2604e-05, 3.6359e-06, 2.0325e-05, -5.3011e-06, 4.9546e-07, -2.3234e-04, 4.7497e-06, 1.7250e-04], device='cuda:0') 100 0.0001 changing lr epoch 162, time 259.38, cls_loss 0.0029 cls_loss_mapping 0.0034 cls_loss_causal 0.5456 re_mapping 0.0069 re_causal 0.0198 /// teacc 98.93 lr 0.00010000 Epoch 164, weight, value: tensor([[ 0.0160, -0.1416, -0.0962, ..., -0.1848, -0.0805, -0.1305], [ 0.0385, -0.0524, 0.0281, ..., 0.0495, 0.0872, -0.0359], [-0.0628, 0.1053, -0.1322, ..., 0.0610, 0.0626, -0.0291], ..., [-0.0720, -0.0689, -0.0723, ..., 0.0003, -0.1176, 0.1163], [ 0.0642, -0.0213, 0.0537, ..., -0.0050, -0.1487, -0.0081], [-0.1292, -0.0366, -0.0451, ..., -0.1481, 0.0414, -0.0930]], device='cuda:0'), grad: tensor([[ 2.6077e-08, 7.0781e-08, 1.1176e-06, ..., 3.7625e-07, 1.3001e-06, 0.0000e+00], [ 3.7253e-09, 3.7253e-09, 1.6764e-07, ..., 1.4901e-07, 4.2841e-07, 2.2352e-08], [ 3.7253e-09, -1.8254e-07, 7.6741e-07, ..., 3.4273e-07, 1.1176e-08, 9.3132e-08], ..., [ 0.0000e+00, 1.8626e-08, 3.3900e-07, ..., -3.0547e-07, 3.4645e-07, -1.6391e-07], [ 5.9605e-08, 1.1176e-08, 7.7188e-06, ..., 7.2643e-07, 6.1542e-06, 1.4901e-08], [ 0.0000e+00, 7.4506e-09, -1.0908e-05, ..., 1.4529e-07, -8.1024e-03, 7.4506e-09]], device='cuda:0') Epoch 164, bias, value: tensor([-0.0073, -0.0312, 0.0110, -0.0172, 0.0134, 0.0088, 0.0196, -0.0011, -0.0343, -0.0069], device='cuda:0'), grad: tensor([ 6.7614e-06, 1.4380e-06, 2.9355e-06, 4.3511e-06, 1.3863e-02, 2.2799e-06, -2.1942e-06, 3.6880e-07, 4.6223e-05, -1.3924e-02], device='cuda:0') 100 0.0001 changing lr epoch 163, time 259.35, cls_loss 0.0025 cls_loss_mapping 0.0036 cls_loss_causal 0.5339 re_mapping 0.0066 re_causal 0.0204 /// teacc 98.90 lr 0.00010000 Epoch 165, weight, value: tensor([[ 0.0158, -0.1417, -0.0970, ..., -0.1860, -0.0814, -0.1306], [ 0.0384, -0.0526, 0.0283, ..., 0.0492, 0.0874, -0.0360], [-0.0632, 0.1058, -0.1329, ..., 0.0608, 0.0627, -0.0292], ..., [-0.0719, -0.0691, -0.0720, ..., 0.0010, -0.1176, 0.1164], [ 0.0646, -0.0215, 0.0538, ..., -0.0049, -0.1494, -0.0081], [-0.1294, -0.0372, -0.0461, ..., -0.1492, 0.0415, -0.0932]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.0978e-08, 6.7055e-07, ..., 6.4075e-07, 5.6997e-07, 0.0000e+00], [ 0.0000e+00, 1.0803e-07, -6.0350e-07, ..., -7.4506e-08, 1.1921e-07, 2.9802e-08], [ 0.0000e+00, -2.5369e-06, 1.0744e-05, ..., 9.4771e-06, -1.7993e-06, 4.0978e-08], ..., [ 0.0000e+00, 1.9409e-06, 2.7008e-06, ..., 3.6806e-06, 3.3714e-06, -1.6019e-07], [ 0.0000e+00, 2.1234e-07, 4.3549e-06, ..., 5.1670e-06, 2.1495e-06, 3.7253e-08], [ 0.0000e+00, 3.7253e-08, 1.0729e-06, ..., 7.2271e-07, 6.4708e-06, 2.2352e-08]], device='cuda:0') Epoch 165, bias, value: tensor([-0.0078, -0.0315, 0.0107, -0.0175, 0.0131, 0.0086, 0.0203, -0.0004, -0.0342, -0.0069], device='cuda:0'), grad: tensor([ 6.5677e-06, 1.5944e-06, 2.3916e-05, -5.3346e-05, -2.5988e-05, 5.4277e-06, -1.2755e-05, 1.3381e-05, 1.7464e-05, 2.3723e-05], device='cuda:0') 100 0.0001 changing lr epoch 164, time 259.44, cls_loss 0.0024 cls_loss_mapping 0.0039 cls_loss_causal 0.5173 re_mapping 0.0070 re_causal 0.0209 /// teacc 98.87 lr 0.00010000 Epoch 166, weight, value: tensor([[ 0.0168, -0.1418, -0.0975, ..., -0.1867, -0.0814, -0.1306], [ 0.0384, -0.0526, 0.0291, ..., 0.0494, 0.0874, -0.0360], [-0.0635, 0.1059, -0.1335, ..., 0.0608, 0.0631, -0.0296], ..., [-0.0719, -0.0691, -0.0723, ..., 0.0010, -0.1178, 0.1167], [ 0.0646, -0.0216, 0.0538, ..., -0.0048, -0.1501, -0.0081], [-0.1297, -0.0374, -0.0471, ..., -0.1504, 0.0414, -0.0935]], device='cuda:0'), grad: tensor([[ 2.2352e-08, 1.4156e-07, 7.8231e-08, ..., 2.1979e-07, 2.3842e-07, 2.9802e-08], [ 7.4506e-09, 7.4506e-09, 1.3001e-06, ..., 4.5113e-06, -1.6391e-07, 7.1265e-06], [-1.8626e-08, -2.6450e-07, 1.1548e-07, ..., -1.1548e-07, -3.6508e-07, 5.9605e-08], ..., [ 1.1176e-08, 4.0978e-08, -1.3635e-06, ..., -4.7572e-06, 1.7136e-07, -7.7263e-06], [ 1.1176e-08, 1.4901e-08, 6.7055e-08, ..., 9.6858e-08, 2.6450e-07, 2.9802e-08], [ 7.4506e-09, 2.2352e-08, 2.0117e-07, ..., 3.6880e-07, 2.8685e-07, 4.0978e-07]], device='cuda:0') Epoch 166, bias, value: tensor([-0.0067, -0.0313, 0.0105, -0.0176, 0.0130, 0.0082, 0.0210, -0.0002, -0.0341, -0.0075], device='cuda:0'), grad: tensor([-4.0904e-06, 1.4752e-05, 1.0803e-06, -2.1495e-06, 2.6822e-07, 1.2144e-06, -1.0356e-06, -1.5289e-05, 9.3877e-07, 4.2543e-06], device='cuda:0') 100 0.0001 changing lr epoch 165, time 259.25, cls_loss 0.0027 cls_loss_mapping 0.0048 cls_loss_causal 0.5357 re_mapping 0.0066 re_causal 0.0202 /// teacc 99.02 lr 0.00010000 Epoch 167, weight, value: tensor([[ 0.0166, -0.1420, -0.0987, ..., -0.1880, -0.0815, -0.1306], [ 0.0389, -0.0528, 0.0272, ..., 0.0479, 0.0876, -0.0362], [-0.0637, 0.1060, -0.1338, ..., 0.0607, 0.0633, -0.0297], ..., [-0.0723, -0.0689, -0.0704, ..., 0.0027, -0.1181, 0.1169], [ 0.0648, -0.0216, 0.0540, ..., -0.0047, -0.1511, -0.0081], [-0.1325, -0.0375, -0.0499, ..., -0.1533, 0.0411, -0.0937]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 3.7253e-09, 1.0058e-07, ..., 5.9605e-08, 1.5274e-07, 7.4506e-09], [ 3.7253e-09, 2.6077e-08, -1.9744e-07, ..., 2.1979e-07, -2.2352e-07, 1.1921e-07], [ 1.1548e-07, 7.4506e-09, 2.7977e-06, ..., 1.8589e-06, 2.1607e-07, 2.2352e-08], ..., [ 7.4506e-09, -5.5879e-08, 1.9744e-07, ..., -5.4389e-07, 1.7136e-07, -3.3155e-07], [-3.3528e-08, 3.7253e-09, 3.0547e-07, ..., 2.3097e-07, 9.7975e-07, 7.4506e-08], [ 1.1176e-08, 7.4506e-09, 1.1548e-07, ..., 1.1176e-07, -1.0282e-06, 4.8429e-08]], device='cuda:0') Epoch 167, bias, value: tensor([-0.0070, -0.0331, 0.0103, -0.0167, 0.0134, 0.0091, 0.0211, 0.0016, -0.0344, -0.0086], device='cuda:0'), grad: tensor([ 4.9174e-07, 8.1956e-07, 5.1558e-06, -4.6082e-06, 2.0005e-06, 5.2154e-08, -3.1888e-06, -1.4417e-06, 2.7120e-06, -2.0415e-06], device='cuda:0') 100 0.0001 changing lr epoch 166, time 259.50, cls_loss 0.0020 cls_loss_mapping 0.0032 cls_loss_causal 0.5391 re_mapping 0.0065 re_causal 0.0202 /// teacc 98.92 lr 0.00010000 Epoch 168, weight, value: tensor([[ 0.0156, -0.1421, -0.0998, ..., -0.1893, -0.0815, -0.1307], [ 0.0395, -0.0531, 0.0279, ..., 0.0478, 0.0880, -0.0363], [-0.0624, 0.1078, -0.1331, ..., 0.0615, 0.0644, -0.0298], ..., [-0.0733, -0.0693, -0.0707, ..., 0.0028, -0.1186, 0.1170], [ 0.0636, -0.0239, 0.0537, ..., -0.0057, -0.1536, -0.0082], [-0.1337, -0.0377, -0.0503, ..., -0.1537, 0.0408, -0.0937]], device='cuda:0'), grad: tensor([[-8.1956e-08, 4.4703e-08, -1.2293e-07, ..., 2.3097e-07, -7.1526e-07, 1.0058e-07], [ 1.4901e-08, 5.2154e-08, 6.3144e-06, ..., 1.0572e-05, 3.1292e-07, 2.9542e-06], [ 3.3528e-08, -4.0606e-07, 8.4564e-07, ..., 3.1367e-06, 1.5646e-07, 2.3581e-06], ..., [ 1.8626e-08, 1.6019e-07, -7.0184e-06, ..., -1.7792e-05, -6.0350e-07, -8.1137e-06], [ 7.4506e-09, 4.0978e-08, 1.8999e-07, ..., 1.2405e-06, 3.7998e-07, 7.8604e-07], [ 7.4506e-09, 7.4506e-09, 3.9116e-07, ..., 1.6801e-06, -4.4331e-07, 9.0897e-07]], device='cuda:0') Epoch 168, bias, value: tensor([-0.0069, -0.0329, 0.0112, -0.0169, 0.0138, 0.0096, 0.0209, 0.0015, -0.0355, -0.0091], device='cuda:0'), grad: tensor([-4.6268e-06, 2.5183e-05, 1.2159e-05, 1.4231e-06, 3.4384e-06, 8.7842e-06, -6.8508e-06, -4.8846e-05, 4.9248e-06, 4.3176e-06], device='cuda:0') 100 0.0001 changing lr epoch 167, time 259.18, cls_loss 0.0025 cls_loss_mapping 0.0040 cls_loss_causal 0.4920 re_mapping 0.0067 re_causal 0.0189 /// teacc 98.98 lr 0.00010000 Epoch 169, weight, value: tensor([[ 0.0169, -0.1425, -0.1029, ..., -0.1932, -0.0816, -0.1307], [ 0.0394, -0.0535, 0.0278, ..., 0.0477, 0.0880, -0.0364], [-0.0621, 0.1088, -0.1337, ..., 0.0616, 0.0648, -0.0301], ..., [-0.0739, -0.0696, -0.0709, ..., 0.0029, -0.1191, 0.1172], [ 0.0632, -0.0244, 0.0545, ..., -0.0058, -0.1537, -0.0082], [-0.1344, -0.0380, -0.0511, ..., -0.1541, 0.0409, -0.0937]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.7509e-07, 1.0431e-07, ..., 2.3097e-07, 5.4762e-07, 0.0000e+00], [ 0.0000e+00, 1.9819e-06, 1.1511e-06, ..., 2.9132e-06, 3.4794e-06, 1.8626e-08], [ 0.0000e+00, 3.7551e-06, 1.3746e-06, ..., 4.2468e-06, 2.9802e-07, 7.4506e-09], ..., [ 0.0000e+00, -1.4067e-05, -3.0920e-06, ..., -1.4327e-05, 1.0692e-06, -1.0431e-07], [ 0.0000e+00, 3.3341e-06, 4.1910e-06, ..., 4.7684e-06, 1.1586e-06, 1.1176e-08], [ 0.0000e+00, 4.2468e-07, 2.7195e-07, ..., 6.4075e-07, 5.3756e-06, 7.4506e-09]], device='cuda:0') Epoch 169, bias, value: tensor([-0.0073, -0.0331, 0.0113, -0.0167, 0.0136, 0.0103, 0.0207, 0.0016, -0.0356, -0.0091], device='cuda:0'), grad: tensor([ 1.8328e-06, 1.7971e-05, 1.2122e-05, 1.1146e-05, -5.5909e-05, -1.6347e-05, 1.7732e-05, -3.3319e-05, 2.0564e-05, 2.4125e-05], device='cuda:0') 100 0.0001 changing lr epoch 168, time 259.53, cls_loss 0.0020 cls_loss_mapping 0.0033 cls_loss_causal 0.4963 re_mapping 0.0073 re_causal 0.0196 /// teacc 98.92 lr 0.00010000 Epoch 170, weight, value: tensor([[ 0.0181, -0.1442, -0.1036, ..., -0.1941, -0.0818, -0.1307], [ 0.0394, -0.0537, 0.0277, ..., 0.0476, 0.0879, -0.0367], [-0.0627, 0.1095, -0.1343, ..., 0.0617, 0.0652, -0.0298], ..., [-0.0740, -0.0706, -0.0711, ..., 0.0028, -0.1197, 0.1173], [ 0.0636, -0.0240, 0.0549, ..., -0.0058, -0.1541, -0.0082], [-0.1351, -0.0383, -0.0513, ..., -0.1544, 0.0409, -0.0939]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.6858e-08, 7.0781e-08, ..., 3.8743e-07, 2.1607e-07, 2.6077e-08], [ 0.0000e+00, 9.3132e-08, -2.3954e-06, ..., -5.9977e-07, -1.4305e-06, 1.7136e-07], [ 0.0000e+00, -3.0175e-07, 2.1607e-07, ..., 2.1607e-07, -8.7172e-07, 6.4448e-07], ..., [ 0.0000e+00, -3.2037e-07, 1.2890e-06, ..., -2.9691e-06, 1.1735e-06, -1.9707e-06], [-1.4901e-08, 8.1956e-08, -9.4622e-07, ..., -3.7998e-07, 3.6508e-07, 4.0978e-08], [ 3.7253e-09, 2.9802e-08, 8.9407e-08, ..., 4.2468e-07, -3.0175e-07, 1.4529e-07]], device='cuda:0') Epoch 170, bias, value: tensor([-0.0068, -0.0335, 0.0115, -0.0168, 0.0135, 0.0109, 0.0204, 0.0013, -0.0356, -0.0090], device='cuda:0'), grad: tensor([-2.2724e-07, -2.1495e-06, 2.8647e-06, 7.8604e-06, 2.3469e-06, -9.6485e-07, 8.2329e-07, -9.1046e-06, -4.6194e-07, -1.0058e-06], device='cuda:0') 100 0.0001 changing lr epoch 169, time 259.59, cls_loss 0.0026 cls_loss_mapping 0.0048 cls_loss_causal 0.5769 re_mapping 0.0068 re_causal 0.0208 /// teacc 98.96 lr 0.00010000 Epoch 171, weight, value: tensor([[ 0.0182, -0.1447, -0.1044, ..., -0.1965, -0.0820, -0.1308], [ 0.0393, -0.0545, 0.0281, ..., 0.0475, 0.0881, -0.0370], [-0.0631, 0.1077, -0.1357, ..., 0.0607, 0.0647, -0.0323], ..., [-0.0737, -0.0678, -0.0710, ..., 0.0034, -0.1203, 0.1182], [ 0.0639, -0.0245, 0.0550, ..., -0.0059, -0.1551, -0.0083], [-0.1355, -0.0385, -0.0516, ..., -0.1548, 0.0410, -0.0940]], device='cuda:0'), grad: tensor([[-1.0952e-06, 1.0103e-05, 2.8349e-06, ..., 3.3155e-07, 8.7917e-07, 6.7055e-08], [ 7.4506e-09, 1.8626e-07, -3.1292e-07, ..., 2.3469e-07, 8.4117e-06, 3.7625e-07], [ 2.2352e-08, -1.8813e-06, 1.5795e-06, ..., -1.1697e-06, -1.1176e-06, 1.5832e-06], ..., [ 0.0000e+00, 2.4475e-06, 2.1309e-06, ..., -1.3486e-06, 4.4703e-06, -1.0893e-05], [ 1.8626e-08, -1.4067e-05, 2.3097e-06, ..., 6.4708e-06, 1.2442e-06, 1.7397e-06], [ 1.7881e-07, 1.0096e-06, 8.6054e-07, ..., 5.3272e-07, 6.7830e-05, 1.4156e-07]], device='cuda:0') Epoch 171, bias, value: tensor([-0.0069, -0.0335, 0.0097, -0.0175, 0.0136, 0.0118, 0.0214, 0.0020, -0.0360, -0.0088], device='cuda:0'), grad: tensor([ 2.7508e-05, 2.0057e-05, 5.1744e-06, -2.3752e-05, -2.5749e-04, 7.2896e-05, 5.8234e-05, -1.1533e-05, -2.6345e-05, 1.3518e-04], device='cuda:0') 100 0.0001 changing lr epoch 170, time 259.64, cls_loss 0.0021 cls_loss_mapping 0.0035 cls_loss_causal 0.5362 re_mapping 0.0068 re_causal 0.0209 /// teacc 98.96 lr 0.00010000 Epoch 172, weight, value: tensor([[ 0.0189, -0.1451, -0.1049, ..., -0.1977, -0.0820, -0.1297], [ 0.0403, -0.0547, 0.0285, ..., 0.0477, 0.0884, -0.0373], [-0.0636, 0.1083, -0.1362, ..., 0.0608, 0.0646, -0.0323], ..., [-0.0742, -0.0683, -0.0715, ..., 0.0026, -0.1209, 0.1176], [ 0.0640, -0.0245, 0.0549, ..., -0.0062, -0.1559, -0.0083], [-0.1359, -0.0387, -0.0519, ..., -0.1550, 0.0410, -0.0942]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 7.4506e-09, 8.9407e-08, ..., 7.0781e-08, 3.7253e-07, 1.1176e-08], [ 0.0000e+00, 2.6077e-08, -1.2703e-06, ..., -5.2154e-07, -1.5050e-06, -1.9744e-07], [ 0.0000e+00, -3.1292e-07, 1.2293e-07, ..., -7.0781e-07, -3.5390e-07, 7.4506e-09], ..., [ 0.0000e+00, 1.8626e-07, 5.1036e-07, ..., 7.1898e-07, 1.0356e-06, -1.2293e-07], [ 0.0000e+00, 5.2154e-08, -4.6045e-06, ..., -2.4214e-06, 2.5705e-06, 3.3528e-08], [ 0.0000e+00, 3.7253e-09, 2.4214e-07, ..., 1.6764e-07, -4.9546e-07, 1.4529e-07]], device='cuda:0') Epoch 172, bias, value: tensor([-0.0067, -0.0333, 0.0096, -0.0177, 0.0139, 0.0125, 0.0214, 0.0014, -0.0363, -0.0089], device='cuda:0'), grad: tensor([ 1.1250e-06, -2.7269e-06, -1.1958e-06, 4.7162e-06, 1.1064e-06, 8.5533e-06, -1.2018e-05, 2.3320e-06, -2.0377e-06, 1.0431e-07], device='cuda:0') 100 0.0001 changing lr epoch 171, time 258.83, cls_loss 0.0020 cls_loss_mapping 0.0029 cls_loss_causal 0.5241 re_mapping 0.0069 re_causal 0.0210 /// teacc 98.82 lr 0.00010000 Epoch 173, weight, value: tensor([[ 0.0189, -0.1453, -0.1056, ..., -0.1995, -0.0822, -0.1297], [ 0.0403, -0.0551, 0.0287, ..., 0.0477, 0.0885, -0.0377], [-0.0638, 0.1085, -0.1366, ..., 0.0608, 0.0649, -0.0323], ..., [-0.0744, -0.0685, -0.0716, ..., 0.0021, -0.1223, 0.1168], [ 0.0641, -0.0242, 0.0562, ..., -0.0053, -0.1561, -0.0083], [-0.1363, -0.0388, -0.0522, ..., -0.1531, 0.0411, -0.0918]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 0.0000e+00, 1.9744e-07, ..., 3.3528e-08, -5.3272e-07, 0.0000e+00], [ 7.4506e-09, 0.0000e+00, -2.3469e-07, ..., -1.3411e-07, -2.7195e-07, -2.6077e-08], [ 7.4506e-09, 0.0000e+00, 7.8231e-08, ..., 2.6077e-08, 3.2410e-07, 0.0000e+00], ..., [ 2.6077e-08, 0.0000e+00, 2.1979e-07, ..., 3.3528e-08, 3.5763e-07, -1.1176e-08], [ 7.4506e-09, 0.0000e+00, 1.5482e-05, ..., 2.2352e-06, 1.1124e-05, 3.7253e-09], [ 2.9802e-08, 0.0000e+00, -1.7256e-05, ..., -2.4736e-06, -1.0982e-05, 2.2352e-08]], device='cuda:0') Epoch 173, bias, value: tensor([-0.0067, -0.0334, 0.0096, -0.0176, 0.0143, 0.0105, 0.0212, 0.0007, -0.0342, -0.0084], device='cuda:0'), grad: tensor([-3.8072e-06, -2.0862e-07, 1.5572e-06, 6.0350e-06, -1.4529e-06, -5.2229e-06, 3.5763e-06, 2.2724e-07, 5.3048e-05, -5.3823e-05], device='cuda:0') 100 0.0001 changing lr epoch 172, time 259.04, cls_loss 0.0021 cls_loss_mapping 0.0023 cls_loss_causal 0.5276 re_mapping 0.0069 re_causal 0.0209 /// teacc 98.83 lr 0.00010000 Epoch 174, weight, value: tensor([[ 0.0192, -0.1455, -0.1062, ..., -0.2012, -0.0823, -0.1297], [ 0.0401, -0.0552, 0.0287, ..., 0.0475, 0.0880, -0.0377], [-0.0639, 0.1088, -0.1366, ..., 0.0614, 0.0658, -0.0323], ..., [-0.0749, -0.0686, -0.0717, ..., 0.0020, -0.1226, 0.1169], [ 0.0643, -0.0242, 0.0567, ..., -0.0052, -0.1561, -0.0083], [-0.1368, -0.0388, -0.0523, ..., -0.1534, 0.0406, -0.0919]], device='cuda:0'), grad: tensor([[ 1.4901e-08, 3.7253e-09, 4.5449e-07, ..., 1.8254e-07, 7.5623e-07, 7.4506e-09], [ 5.9605e-08, 7.4506e-09, 2.3022e-06, ..., 9.1270e-07, 5.5879e-07, 2.9802e-08], [ 1.0431e-07, -7.0781e-08, 1.1884e-06, ..., 6.7055e-07, 3.0175e-07, 4.8429e-08], ..., [ 3.7253e-08, 2.9802e-08, 1.2293e-06, ..., 4.0978e-07, 6.2436e-06, -8.5682e-08], [-4.9174e-07, 1.1176e-08, -2.1420e-06, ..., -1.7397e-06, 2.3767e-06, -2.2724e-07], [ 2.2352e-08, 3.7253e-09, 2.1867e-06, ..., 8.0466e-07, -3.6120e-05, 4.0978e-08]], device='cuda:0') Epoch 174, bias, value: tensor([-0.0065, -0.0338, 0.0103, -0.0177, 0.0147, 0.0105, 0.0208, 0.0006, -0.0340, -0.0089], device='cuda:0'), grad: tensor([-2.5332e-07, 5.5656e-06, 3.3453e-06, -5.0291e-06, 6.0171e-05, 7.1004e-06, 3.1702e-06, 1.8746e-05, 1.5236e-06, -9.4354e-05], device='cuda:0') 100 0.0001 changing lr epoch 173, time 258.99, cls_loss 0.0022 cls_loss_mapping 0.0048 cls_loss_causal 0.5648 re_mapping 0.0068 re_causal 0.0207 /// teacc 98.95 lr 0.00010000 Epoch 175, weight, value: tensor([[ 0.0200, -0.1457, -0.1065, ..., -0.2020, -0.0822, -0.1295], [ 0.0400, -0.0558, 0.0307, ..., 0.0492, 0.0879, -0.0350], [-0.0640, 0.1092, -0.1369, ..., 0.0616, 0.0663, -0.0325], ..., [-0.0752, -0.0685, -0.0738, ..., 0.0004, -0.1230, 0.1155], [ 0.0643, -0.0244, 0.0574, ..., -0.0049, -0.1562, -0.0084], [-0.1372, -0.0390, -0.0524, ..., -0.1537, 0.0405, -0.0921]], device='cuda:0'), grad: tensor([[-3.7253e-09, 0.0000e+00, 8.5682e-08, ..., 7.4506e-09, 1.4901e-07, 1.1176e-08], [ 0.0000e+00, 0.0000e+00, 3.9488e-07, ..., 2.9802e-08, 8.2329e-07, 1.4529e-07], [ 0.0000e+00, -2.9802e-08, 7.0781e-08, ..., -5.9605e-08, 6.3330e-08, 2.2352e-08], ..., [ 0.0000e+00, 1.1176e-08, 7.5251e-07, ..., -1.0431e-07, 1.4082e-06, -5.8487e-07], [ 0.0000e+00, 3.7253e-09, 3.9898e-06, ..., 1.1176e-08, 2.4177e-06, 1.9372e-07], [ 0.0000e+00, 0.0000e+00, -2.1905e-06, ..., 6.7055e-08, -3.5092e-06, 3.1292e-07]], device='cuda:0') Epoch 175, bias, value: tensor([-0.0059, -0.0321, 0.0104, -0.0169, 0.0149, 0.0099, 0.0203, -0.0008, -0.0339, -0.0092], device='cuda:0'), grad: tensor([-4.1723e-07, 5.0180e-06, 9.2015e-07, 1.3724e-05, 1.0088e-05, -8.0347e-05, 1.3649e-05, 2.3060e-06, 4.7922e-05, -1.2867e-05], device='cuda:0') 100 0.0001 changing lr epoch 174, time 259.53, cls_loss 0.0021 cls_loss_mapping 0.0043 cls_loss_causal 0.5473 re_mapping 0.0066 re_causal 0.0201 /// teacc 98.86 lr 0.00010000 Epoch 176, weight, value: tensor([[ 0.0212, -0.1464, -0.1068, ..., -0.2032, -0.0824, -0.1296], [ 0.0398, -0.0559, 0.0296, ..., 0.0483, 0.0878, -0.0353], [-0.0641, 0.1096, -0.1373, ..., 0.0616, 0.0664, -0.0328], ..., [-0.0765, -0.0685, -0.0730, ..., 0.0013, -0.1234, 0.1158], [ 0.0643, -0.0246, 0.0577, ..., -0.0048, -0.1565, -0.0084], [-0.1381, -0.0401, -0.0527, ..., -0.1541, 0.0404, -0.0923]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 8.1956e-08, 2.9802e-08, ..., 1.0803e-07, 3.3528e-07, 3.7253e-09], [ 1.4901e-08, 1.1176e-08, -2.3209e-06, ..., -1.7136e-07, 3.1702e-06, 1.8626e-07], [ 7.4506e-09, -3.9861e-07, 4.0978e-08, ..., -3.2037e-07, -4.7684e-07, 1.8626e-08], ..., [ 2.7567e-07, 1.3784e-07, 2.6487e-06, ..., 1.1288e-06, 3.7923e-06, -3.3528e-08], [ 3.7253e-09, 1.1176e-08, 8.1956e-08, ..., 3.7625e-07, 2.8312e-07, 1.1176e-08], [ 1.4901e-08, 2.6077e-08, 6.3330e-08, ..., 1.2293e-07, 5.9605e-07, 4.0978e-08]], device='cuda:0') Epoch 176, bias, value: tensor([-0.0059, -0.0333, 0.0103, -0.0169, 0.0152, 0.0105, 0.0205, -0.0001, -0.0341, -0.0095], device='cuda:0'), grad: tensor([-5.5134e-07, 3.2745e-06, -2.0862e-07, 1.2890e-06, -2.8417e-05, -2.7828e-06, 1.1489e-05, 1.0177e-05, 2.0191e-06, 3.6918e-06], device='cuda:0') 100 0.0001 changing lr epoch 175, time 260.02, cls_loss 0.0026 cls_loss_mapping 0.0041 cls_loss_causal 0.5126 re_mapping 0.0069 re_causal 0.0190 /// teacc 98.80 lr 0.00010000 Epoch 177, weight, value: tensor([[ 0.0213, -0.1476, -0.1078, ..., -0.2050, -0.0827, -0.1297], [ 0.0399, -0.0560, 0.0302, ..., 0.0483, 0.0889, -0.0354], [-0.0642, 0.1102, -0.1382, ..., 0.0616, 0.0668, -0.0333], ..., [-0.0767, -0.0685, -0.0727, ..., 0.0017, -0.1241, 0.1162], [ 0.0643, -0.0249, 0.0573, ..., -0.0052, -0.1588, -0.0085], [-0.1384, -0.0416, -0.0530, ..., -0.1542, 0.0405, -0.0926]], device='cuda:0'), grad: tensor([[-2.5444e-06, -1.3784e-07, 3.5018e-07, ..., 7.0408e-07, 7.8231e-08, 4.0978e-08], [ 7.2271e-07, 1.8626e-08, 2.3395e-06, ..., 1.0766e-05, -3.3826e-06, 5.5134e-07], [ 2.6077e-08, 3.0175e-07, 2.5466e-05, ..., 5.0962e-05, 2.6077e-08, 1.5870e-06], ..., [ 1.1176e-08, -4.8429e-08, -3.4392e-05, ..., -7.5400e-05, 3.3379e-06, -2.8275e-06], [ 1.3039e-07, -2.7940e-07, 9.3132e-08, ..., 5.9232e-07, 1.1548e-07, 7.8231e-08], [ 3.7253e-08, 3.7253e-08, 8.4192e-07, ..., 1.7174e-06, 7.0781e-08, 1.7881e-07]], device='cuda:0') Epoch 177, bias, value: tensor([-0.0057, -0.0331, 0.0102, -0.0180, 0.0150, 0.0111, 0.0205, 0.0003, -0.0347, -0.0093], device='cuda:0'), grad: tensor([ 3.3170e-05, 2.7597e-05, 1.1897e-04, 2.4214e-05, -2.6077e-08, 1.7472e-06, -3.8892e-05, -1.7452e-04, 2.7455e-06, 5.1335e-06], device='cuda:0') 100 0.0001 changing lr epoch 176, time 259.00, cls_loss 0.0017 cls_loss_mapping 0.0030 cls_loss_causal 0.5356 re_mapping 0.0064 re_causal 0.0197 /// teacc 98.96 lr 0.00010000 Epoch 178, weight, value: tensor([[ 0.0212, -0.1478, -0.1084, ..., -0.2062, -0.0829, -0.1297], [ 0.0401, -0.0564, 0.0309, ..., 0.0493, 0.0893, -0.0347], [-0.0641, 0.1105, -0.1386, ..., 0.0616, 0.0669, -0.0334], ..., [-0.0768, -0.0685, -0.0734, ..., 0.0010, -0.1249, 0.1159], [ 0.0642, -0.0251, 0.0579, ..., -0.0048, -0.1586, -0.0085], [-0.1385, -0.0420, -0.0532, ..., -0.1545, 0.0406, -0.0929]], device='cuda:0'), grad: tensor([[ 1.1176e-08, 0.0000e+00, 5.5879e-08, ..., 8.1956e-08, 2.9802e-07, 5.2154e-08], [ 2.2352e-08, 0.0000e+00, -3.5763e-07, ..., 1.1288e-06, 3.5390e-06, 1.1511e-06], [ 4.4703e-08, 2.7195e-07, 2.8312e-07, ..., 8.5682e-07, 7.4133e-07, 5.4389e-07], ..., [ 2.2352e-08, -2.7567e-07, 3.3155e-07, ..., 1.0394e-05, 3.4541e-05, 9.2313e-06], [ 3.3900e-07, 0.0000e+00, 1.3001e-06, ..., 7.4878e-07, 1.4380e-06, 3.0175e-07], [ 1.4901e-08, 0.0000e+00, 1.2293e-07, ..., -1.6853e-05, -5.4359e-05, -1.5073e-05]], device='cuda:0') Epoch 178, bias, value: tensor([-5.7890e-03, -3.2410e-02, 1.0068e-02, -1.8432e-02, 1.5083e-02, 1.1284e-02, 2.0358e-02, -9.5728e-05, -3.4629e-02, -9.4174e-03], device='cuda:0'), grad: tensor([ 9.2387e-07, 2.1249e-05, 4.7125e-06, -8.4564e-07, 7.1406e-05, 2.3097e-06, 3.1665e-07, 1.8418e-04, 1.1019e-05, -2.9564e-04], device='cuda:0') 100 0.0001 changing lr epoch 177, time 259.13, cls_loss 0.0022 cls_loss_mapping 0.0043 cls_loss_causal 0.5291 re_mapping 0.0062 re_causal 0.0191 /// teacc 98.93 lr 0.00010000 Epoch 179, weight, value: tensor([[ 0.0216, -0.1481, -0.1092, ..., -0.2078, -0.0829, -0.1294], [ 0.0408, -0.0566, 0.0303, ..., 0.0490, 0.0910, -0.0363], [-0.0644, 0.1107, -0.1414, ..., 0.0595, 0.0645, -0.0336], ..., [-0.0769, -0.0686, -0.0719, ..., 0.0028, -0.1238, 0.1171], [ 0.0641, -0.0250, 0.0578, ..., -0.0049, -0.1598, -0.0086], [-0.1389, -0.0423, -0.0535, ..., -0.1554, 0.0404, -0.0928]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 1.8626e-09, 1.9558e-07, ..., 8.1398e-07, 1.0114e-06, 9.3132e-08], [-1.1548e-07, 3.7253e-09, 7.5027e-06, ..., 1.0930e-05, 1.8895e-05, 1.5207e-05], [ 3.7253e-09, -1.2480e-07, 2.0191e-06, ..., 2.5705e-07, 2.1085e-06, 3.4403e-06], ..., [ 3.7253e-09, 1.0990e-07, -1.3940e-05, ..., -2.5406e-05, -2.3097e-05, -4.1425e-05], [ 3.7253e-09, 0.0000e+00, 1.7837e-05, ..., 2.4274e-05, 3.0786e-05, 5.7966e-06], [ 1.8626e-09, 0.0000e+00, 5.6997e-07, ..., 8.2254e-06, 1.2033e-05, 1.4469e-05]], device='cuda:0') Epoch 179, bias, value: tensor([-0.0055, -0.0331, 0.0080, -0.0183, 0.0155, 0.0115, 0.0204, 0.0017, -0.0350, -0.0098], device='cuda:0'), grad: tensor([ 2.5537e-06, 5.8472e-05, 8.2403e-06, 3.3937e-06, -3.2544e-05, -1.7858e-04, 3.2812e-05, -1.8406e-04, 1.6129e-04, 1.2815e-04], device='cuda:0') 100 0.0001 changing lr epoch 178, time 259.47, cls_loss 0.0025 cls_loss_mapping 0.0036 cls_loss_causal 0.5339 re_mapping 0.0061 re_causal 0.0190 /// teacc 98.85 lr 0.00010000 Epoch 180, weight, value: tensor([[ 0.0217, -0.1482, -0.1083, ..., -0.2091, -0.0856, -0.1292], [ 0.0407, -0.0574, 0.0323, ..., 0.0514, 0.0923, -0.0335], [-0.0645, 0.1109, -0.1420, ..., 0.0594, 0.0646, -0.0339], ..., [-0.0771, -0.0686, -0.0737, ..., 0.0005, -0.1261, 0.1147], [ 0.0641, -0.0251, 0.0578, ..., -0.0049, -0.1608, -0.0086], [-0.1390, -0.0424, -0.0539, ..., -0.1556, 0.0419, -0.0929]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 7.4506e-09, 5.5879e-09, ..., 1.3039e-08, 8.0094e-07, 1.8626e-09], [ 1.8626e-09, 7.4506e-09, -1.0990e-07, ..., -9.3132e-09, -5.9605e-08, 2.9802e-08], [ 5.5879e-09, -1.5274e-07, 4.4703e-08, ..., -8.3819e-08, 1.1176e-08, 3.7253e-09], ..., [ 0.0000e+00, 1.1548e-07, 6.5193e-08, ..., 8.1956e-08, 2.0675e-07, -7.0781e-08], [ 6.7055e-08, 7.4506e-09, 4.0978e-08, ..., -8.1956e-08, 1.5628e-06, -1.8626e-09], [ 0.0000e+00, 3.7253e-09, 7.4506e-09, ..., 1.6764e-08, 1.0617e-07, 2.0489e-08]], device='cuda:0') Epoch 180, bias, value: tensor([-0.0073, -0.0310, 0.0079, -0.0184, 0.0154, 0.0117, 0.0204, -0.0004, -0.0354, -0.0085], device='cuda:0'), grad: tensor([ 5.1185e-06, 2.1607e-07, 7.0781e-08, 3.2037e-07, -2.6077e-08, 1.5646e-05, -3.1263e-05, 2.2911e-07, 9.0078e-06, 7.2643e-07], device='cuda:0') 100 0.0001 changing lr epoch 179, time 259.08, cls_loss 0.0018 cls_loss_mapping 0.0044 cls_loss_causal 0.5158 re_mapping 0.0063 re_causal 0.0193 /// teacc 98.93 lr 0.00010000 Epoch 181, weight, value: tensor([[ 0.0217, -0.1478, -0.1103, ..., -0.2113, -0.0856, -0.1293], [ 0.0408, -0.0578, 0.0324, ..., 0.0514, 0.0924, -0.0335], [-0.0645, 0.1112, -0.1418, ..., 0.0598, 0.0650, -0.0341], ..., [-0.0775, -0.0687, -0.0739, ..., 0.0005, -0.1268, 0.1147], [ 0.0640, -0.0252, 0.0579, ..., -0.0050, -0.1614, -0.0085], [-0.1394, -0.0428, -0.0544, ..., -0.1556, 0.0420, -0.0925]], device='cuda:0'), grad: tensor([[-1.1176e-08, -3.3528e-08, 1.0058e-07, ..., 2.9802e-08, 2.0303e-07, 5.5879e-09], [ 1.8626e-09, 5.5879e-09, -6.1654e-07, ..., 4.4703e-08, -7.2457e-07, 2.6450e-07], [ 1.8626e-09, 1.8626e-09, 5.0105e-07, ..., 3.8743e-07, 2.2165e-07, 2.4214e-08], ..., [ 0.0000e+00, 1.8626e-09, 7.0781e-08, ..., -1.9614e-06, 1.9185e-07, -1.2945e-06], [ 0.0000e+00, -9.3132e-09, 2.6785e-06, ..., 1.5777e-06, 3.5893e-06, 9.1270e-08], [ 3.7253e-09, 3.1665e-08, -1.8068e-07, ..., 5.1223e-07, -2.5518e-07, 3.5390e-07]], device='cuda:0') Epoch 181, bias, value: tensor([-0.0071, -0.0310, 0.0083, -0.0186, 0.0154, 0.0118, 0.0204, -0.0007, -0.0355, -0.0083], device='cuda:0'), grad: tensor([-3.1721e-06, -1.7881e-07, 1.2685e-06, -2.9802e-07, 1.1735e-07, 1.5646e-06, -7.4878e-06, -4.2915e-06, 9.2387e-06, 3.2224e-06], device='cuda:0') 100 0.0001 changing lr epoch 180, time 259.12, cls_loss 0.0018 cls_loss_mapping 0.0041 cls_loss_causal 0.5276 re_mapping 0.0064 re_causal 0.0190 /// teacc 98.80 lr 0.00010000 Epoch 182, weight, value: tensor([[ 0.0217, -0.1479, -0.1107, ..., -0.2126, -0.0857, -0.1293], [ 0.0409, -0.0583, 0.0324, ..., 0.0512, 0.0922, -0.0336], [-0.0653, 0.1112, -0.1423, ..., 0.0598, 0.0652, -0.0337], ..., [-0.0777, -0.0689, -0.0739, ..., 0.0005, -0.1271, 0.1148], [ 0.0647, -0.0244, 0.0582, ..., -0.0046, -0.1624, -0.0086], [-0.1397, -0.0430, -0.0542, ..., -0.1551, 0.0421, -0.0925]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 0.0000e+00, 4.2841e-08, ..., 8.9407e-08, 3.0175e-07, 6.1467e-08], [ 2.0489e-08, 1.8626e-09, -3.5949e-07, ..., 1.8254e-07, -7.9907e-07, 4.1537e-07], [ 1.1176e-08, -1.6764e-08, 2.7753e-07, ..., 4.2841e-08, 8.3819e-08, 1.2480e-07], ..., [-6.3330e-08, 9.3132e-09, -1.5646e-07, ..., -1.5777e-06, 3.9861e-07, -1.6782e-06], [ 1.8626e-09, 3.7253e-09, 9.1270e-08, ..., 2.0675e-07, 6.5006e-07, 1.5087e-07], [ 1.4901e-08, 0.0000e+00, 9.4995e-08, ..., 8.1584e-07, -1.0245e-07, 8.0653e-07]], device='cuda:0') Epoch 182, bias, value: tensor([-0.0071, -0.0312, 0.0082, -0.0186, 0.0151, 0.0117, 0.0208, -0.0007, -0.0354, -0.0079], device='cuda:0'), grad: tensor([ 9.5740e-07, 7.6368e-07, 5.7556e-07, 6.7055e-07, 1.6224e-06, -2.9244e-07, -2.1104e-06, -9.0674e-06, 2.5388e-06, 4.3213e-06], device='cuda:0') 100 0.0001 changing lr epoch 181, time 259.28, cls_loss 0.0018 cls_loss_mapping 0.0044 cls_loss_causal 0.5218 re_mapping 0.0062 re_causal 0.0194 /// teacc 98.82 lr 0.00010000 Epoch 183, weight, value: tensor([[ 0.0220, -0.1479, -0.1110, ..., -0.2138, -0.0857, -0.1293], [ 0.0409, -0.0584, 0.0324, ..., 0.0511, 0.0922, -0.0336], [-0.0654, 0.1112, -0.1426, ..., 0.0594, 0.0655, -0.0338], ..., [-0.0778, -0.0689, -0.0739, ..., 0.0008, -0.1272, 0.1149], [ 0.0648, -0.0242, 0.0584, ..., -0.0044, -0.1629, -0.0086], [-0.1399, -0.0430, -0.0547, ..., -0.1556, 0.0411, -0.0926]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 1.8626e-09, 2.9560e-06, ..., 2.7940e-08, 8.0094e-08, 0.0000e+00], [ 0.0000e+00, 7.4506e-09, -3.6228e-06, ..., -3.3230e-06, -4.4741e-06, 1.4901e-08], [ 0.0000e+00, -4.2282e-07, 1.5777e-06, ..., 6.3330e-07, 1.1306e-06, 1.8626e-09], ..., [ 0.0000e+00, 2.4214e-08, 1.2200e-06, ..., 1.0449e-06, 1.3579e-06, -4.2841e-08], [ 3.7253e-09, 3.7439e-07, 1.9073e-05, ..., 8.2143e-07, 1.0412e-06, 5.5879e-09], [ 0.0000e+00, 1.8626e-09, 1.2293e-06, ..., 5.9977e-07, 4.5076e-07, 1.1176e-08]], device='cuda:0') Epoch 183, bias, value: tensor([-0.0069, -0.0314, 0.0079, -0.0186, 0.0164, 0.0119, 0.0206, -0.0004, -0.0355, -0.0091], device='cuda:0'), grad: tensor([ 1.2495e-05, -5.1372e-06, 1.9763e-06, 4.0978e-07, -1.6205e-07, -7.6914e-04, 6.5899e-04, 2.0899e-06, 9.4533e-05, 4.2319e-06], device='cuda:0') 100 0.0001 changing lr epoch 182, time 259.44, cls_loss 0.0018 cls_loss_mapping 0.0035 cls_loss_causal 0.5203 re_mapping 0.0064 re_causal 0.0189 /// teacc 98.82 lr 0.00010000 Epoch 184, weight, value: tensor([[ 0.0220, -0.1479, -0.1128, ..., -0.2164, -0.0858, -0.1294], [ 0.0409, -0.0586, 0.0324, ..., 0.0510, 0.0921, -0.0338], [-0.0655, 0.1109, -0.1431, ..., 0.0594, 0.0656, -0.0338], ..., [-0.0778, -0.0689, -0.0739, ..., 0.0008, -0.1281, 0.1151], [ 0.0649, -0.0234, 0.0577, ..., -0.0046, -0.1633, -0.0087], [-0.1400, -0.0430, -0.0551, ..., -0.1561, 0.0412, -0.0928]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 8.9481e-06, ..., 2.5574e-06, 1.9558e-07, 7.4506e-09], [ 1.4901e-08, 0.0000e+00, 1.7509e-07, ..., 1.1735e-07, 1.1362e-07, 1.8626e-08], [ 5.5879e-09, -3.7253e-09, 2.9244e-07, ..., 1.0058e-07, 1.7509e-07, 3.7253e-09], ..., [ 2.0489e-08, 0.0000e+00, 3.7998e-07, ..., 1.0990e-07, 8.7544e-08, -8.5682e-08], [-6.3330e-08, 1.8626e-09, -8.3447e-07, ..., -5.2899e-07, 3.6694e-07, 9.3132e-09], [ 1.8626e-09, 0.0000e+00, 2.2165e-07, ..., 6.3330e-08, 5.8115e-07, 5.4017e-08]], device='cuda:0') Epoch 184, bias, value: tensor([-0.0069, -0.0315, 0.0076, -0.0183, 0.0174, 0.0126, 0.0197, -0.0006, -0.0359, -0.0091], device='cuda:0'), grad: tensor([ 1.6466e-05, 9.4064e-07, 1.6484e-06, -1.3098e-05, -4.6976e-06, -6.7568e-04, 6.6710e-04, 9.2387e-07, 3.0436e-06, 3.7756e-06], device='cuda:0') 100 0.0001 changing lr epoch 183, time 259.27, cls_loss 0.0019 cls_loss_mapping 0.0032 cls_loss_causal 0.5229 re_mapping 0.0063 re_causal 0.0180 /// teacc 98.93 lr 0.00010000 Epoch 185, weight, value: tensor([[ 0.0229, -0.1480, -0.1130, ..., -0.2173, -0.0861, -0.1294], [ 0.0409, -0.0587, 0.0327, ..., 0.0511, 0.0924, -0.0338], [-0.0646, 0.1112, -0.1435, ..., 0.0596, 0.0658, -0.0339], ..., [-0.0779, -0.0690, -0.0741, ..., 0.0007, -0.1284, 0.1152], [ 0.0650, -0.0237, 0.0581, ..., -0.0044, -0.1641, -0.0087], [-0.1403, -0.0432, -0.0562, ..., -0.1567, 0.0411, -0.0929]], device='cuda:0'), grad: tensor([[ 3.1665e-08, 0.0000e+00, 7.2643e-08, ..., 2.7940e-08, -1.2107e-07, 9.3132e-09], [ 3.7253e-09, 0.0000e+00, -3.4831e-07, ..., 2.2352e-07, -2.9616e-07, 1.6391e-07], [ 2.0489e-08, -3.7253e-09, 1.3411e-07, ..., 1.3597e-07, 5.8115e-07, 6.5193e-08], ..., [ 0.0000e+00, 1.8626e-09, 3.5390e-08, ..., -1.9222e-06, 9.6858e-08, -9.8348e-07], [ 3.7253e-09, 0.0000e+00, 3.1106e-07, ..., 1.0617e-07, 5.6066e-07, 4.8429e-08], [ 3.7253e-09, 0.0000e+00, 2.4214e-08, ..., 1.0580e-06, 2.4773e-07, 4.8615e-07]], device='cuda:0') Epoch 185, bias, value: tensor([-0.0069, -0.0314, 0.0079, -0.0182, 0.0173, 0.0125, 0.0195, -0.0006, -0.0358, -0.0094], device='cuda:0'), grad: tensor([-6.8061e-06, 7.8231e-07, 3.2075e-06, 2.1756e-06, -5.3793e-06, -8.4378e-07, 2.5574e-06, -5.5656e-06, 4.0457e-06, 5.8077e-06], device='cuda:0') 100 0.0001 changing lr epoch 184, time 259.21, cls_loss 0.0020 cls_loss_mapping 0.0033 cls_loss_causal 0.5133 re_mapping 0.0066 re_causal 0.0186 /// teacc 98.92 lr 0.00010000 Epoch 186, weight, value: tensor([[ 0.0232, -0.1474, -0.1136, ..., -0.2174, -0.0866, -0.1294], [ 0.0410, -0.0589, 0.0328, ..., 0.0509, 0.0926, -0.0339], [-0.0644, 0.1114, -0.1439, ..., 0.0593, 0.0659, -0.0340], ..., [-0.0780, -0.0691, -0.0741, ..., 0.0011, -0.1285, 0.1153], [ 0.0651, -0.0238, 0.0588, ..., -0.0042, -0.1643, -0.0087], [-0.1406, -0.0433, -0.0574, ..., -0.1571, 0.0420, -0.0929]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 3.7253e-09, 2.4214e-08, ..., 2.6077e-08, 7.2643e-08, 2.2352e-08], [ 1.8626e-09, 0.0000e+00, 3.4362e-05, ..., 3.9309e-05, -4.6566e-08, 3.9279e-05], [ 1.8626e-09, -2.2352e-08, 1.0990e-07, ..., 3.5577e-07, 2.6077e-08, 1.8626e-07], ..., [ 5.5879e-09, 1.1176e-08, -3.6359e-05, ..., -4.1842e-05, 9.8720e-08, -4.1664e-05], [ 1.8626e-09, 3.7253e-09, 1.2349e-06, ..., 1.4398e-06, 5.5879e-08, 1.4026e-06], [ 1.8626e-09, 0.0000e+00, 1.0058e-07, ..., 6.8918e-08, -7.1526e-07, 1.1548e-07]], device='cuda:0') Epoch 186, bias, value: tensor([-0.0067, -0.0315, 0.0075, -0.0182, 0.0164, 0.0126, 0.0194, -0.0003, -0.0355, -0.0090], device='cuda:0'), grad: tensor([-4.4703e-07, 1.6320e-04, 1.0375e-06, 2.5127e-06, 2.4103e-06, 8.8662e-07, 1.2480e-07, -1.7273e-04, 6.4149e-06, -3.2596e-06], device='cuda:0') 100 0.0001 changing lr epoch 185, time 259.06, cls_loss 0.0025 cls_loss_mapping 0.0032 cls_loss_causal 0.5355 re_mapping 0.0070 re_causal 0.0197 /// teacc 98.98 lr 0.00010000 Epoch 187, weight, value: tensor([[ 0.0231, -0.1476, -0.1143, ..., -0.2190, -0.0868, -0.1295], [ 0.0410, -0.0591, 0.0318, ..., 0.0500, 0.0933, -0.0347], [-0.0644, 0.1119, -0.1447, ..., 0.0596, 0.0660, -0.0342], ..., [-0.0781, -0.0699, -0.0729, ..., 0.0016, -0.1298, 0.1160], [ 0.0654, -0.0233, 0.0595, ..., -0.0038, -0.1648, -0.0088], [-0.1407, -0.0435, -0.0583, ..., -0.1556, 0.0422, -0.0920]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.3528e-08, 4.4145e-07, ..., 4.9919e-07, 9.7416e-07, 0.0000e+00], [ 0.0000e+00, 7.4506e-09, -2.1243e-04, ..., -1.7107e-04, -1.2660e-04, 1.1176e-08], [ 0.0000e+00, -4.0978e-08, 3.6154e-06, ..., 4.5933e-06, 1.4529e-07, 9.3132e-09], ..., [ 0.0000e+00, 2.4959e-07, 2.0516e-04, ..., 1.6248e-04, 1.2290e-04, -9.3132e-08], [-1.8626e-09, 5.5879e-09, 1.3504e-06, ..., 2.7753e-06, 2.5313e-06, 3.7253e-09], [ 0.0000e+00, 7.4506e-09, 2.5611e-06, ..., 2.5518e-06, 1.2070e-06, 5.9605e-08]], device='cuda:0') Epoch 187, bias, value: tensor([-6.6125e-03, -3.2429e-02, 7.3708e-03, -1.7977e-02, 1.6774e-02, 1.2034e-02, 1.9091e-02, 8.3569e-05, -3.5019e-02, -8.5028e-03], device='cuda:0'), grad: tensor([ 4.2915e-06, -5.3453e-04, 1.5005e-05, -1.5259e-05, 5.8301e-07, 1.9416e-05, -1.2219e-05, 5.0640e-04, 5.4613e-06, 1.1928e-05], device='cuda:0') 100 0.0001 changing lr epoch 186, time 259.11, cls_loss 0.0022 cls_loss_mapping 0.0054 cls_loss_causal 0.5129 re_mapping 0.0064 re_causal 0.0194 /// teacc 98.94 lr 0.00010000 Epoch 188, weight, value: tensor([[ 0.0230, -0.1478, -0.1150, ..., -0.2208, -0.0865, -0.1296], [ 0.0407, -0.0586, 0.0329, ..., 0.0514, 0.0960, -0.0348], [-0.0644, 0.1123, -0.1474, ..., 0.0578, 0.0642, -0.0349], ..., [-0.0782, -0.0703, -0.0735, ..., 0.0009, -0.1311, 0.1157], [ 0.0663, -0.0234, 0.0613, ..., -0.0017, -0.1648, -0.0075], [-0.1408, -0.0440, -0.0587, ..., -0.1562, 0.0417, -0.0922]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.5390e-08, 3.7998e-07, ..., 1.5665e-06, 7.2643e-07, 1.8626e-08], [ 0.0000e+00, 3.7253e-09, -4.0010e-06, ..., -6.5006e-07, -5.4501e-06, 8.0094e-08], [ 0.0000e+00, 4.2841e-08, 3.1535e-06, ..., 8.1733e-06, 3.8557e-06, 4.6790e-06], ..., [ 0.0000e+00, 3.7253e-09, 6.7241e-07, ..., -1.8403e-05, 9.6112e-07, -5.0403e-06], [ 0.0000e+00, -2.0117e-07, -1.6168e-06, ..., 7.3574e-07, 2.5749e-05, 5.0291e-08], [ 0.0000e+00, 1.4901e-08, 2.8685e-07, ..., 9.1456e-07, -2.6077e-08, 7.4506e-08]], device='cuda:0') Epoch 188, bias, value: tensor([-0.0059, -0.0314, 0.0057, -0.0181, 0.0174, 0.0110, 0.0195, -0.0005, -0.0335, -0.0092], device='cuda:0'), grad: tensor([ 7.6890e-06, 2.6748e-05, 2.0906e-05, 2.8208e-05, 3.2913e-06, 7.6443e-06, -6.6400e-05, -1.5712e-04, 6.8724e-05, 6.0290e-05], device='cuda:0') 100 0.0001 changing lr epoch 187, time 258.87, cls_loss 0.0018 cls_loss_mapping 0.0045 cls_loss_causal 0.5228 re_mapping 0.0062 re_causal 0.0198 /// teacc 98.99 lr 0.00010000 Epoch 189, weight, value: tensor([[ 0.0226, -0.1480, -0.1171, ..., -0.2237, -0.0862, -0.1296], [ 0.0409, -0.0587, 0.0328, ..., 0.0510, 0.0961, -0.0351], [-0.0644, 0.1126, -0.1479, ..., 0.0578, 0.0641, -0.0353], ..., [-0.0783, -0.0704, -0.0731, ..., 0.0015, -0.1305, 0.1160], [ 0.0666, -0.0234, 0.0612, ..., -0.0019, -0.1655, -0.0075], [-0.1408, -0.0441, -0.0591, ..., -0.1565, 0.0420, -0.0922]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.9360e-07, ..., 2.7381e-07, 6.1840e-07, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 2.7031e-05, ..., 2.0742e-05, 1.2051e-06, 9.3132e-09], [ 0.0000e+00, -5.5879e-09, 1.0896e-06, ..., 7.7300e-07, 2.8498e-07, 5.5879e-09], ..., [ 0.0000e+00, 1.8626e-09, -5.1379e-05, ..., -4.0084e-05, 2.3656e-07, -3.5390e-08], [ 0.0000e+00, 0.0000e+00, 7.2159e-06, ..., 7.2978e-06, -5.6624e-06, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, 1.5553e-06, ..., 1.1791e-06, 5.9605e-08, 9.3132e-09]], device='cuda:0') Epoch 189, bias, value: tensor([-4.1264e-03, -3.1662e-02, 5.4680e-03, -1.8104e-02, 1.7119e-02, 9.1500e-03, 2.0023e-02, 5.7972e-05, -3.3936e-02, -8.9590e-03], device='cuda:0'), grad: tensor([ 3.6489e-06, 1.0389e-04, 3.0901e-06, 9.2089e-06, 1.3195e-05, 6.4597e-06, -8.6501e-06, -2.1636e-04, 7.9274e-05, 5.8189e-06], device='cuda:0') 100 0.0001 changing lr epoch 188, time 258.95, cls_loss 0.0015 cls_loss_mapping 0.0037 cls_loss_causal 0.5149 re_mapping 0.0064 re_causal 0.0190 /// teacc 99.06 lr 0.00010000 Epoch 190, weight, value: tensor([[ 0.0223, -0.1482, -0.1188, ..., -0.2263, -0.0863, -0.1296], [ 0.0409, -0.0590, 0.0331, ..., 0.0511, 0.0963, -0.0352], [-0.0644, 0.1129, -0.1480, ..., 0.0578, 0.0643, -0.0354], ..., [-0.0783, -0.0707, -0.0733, ..., 0.0013, -0.1309, 0.1160], [ 0.0665, -0.0235, 0.0609, ..., -0.0022, -0.1660, -0.0075], [-0.1409, -0.0442, -0.0593, ..., -0.1566, 0.0420, -0.0922]], device='cuda:0'), grad: tensor([[ 1.8626e-08, 5.5879e-09, 2.9802e-07, ..., 1.1548e-07, 8.5682e-08, 5.5879e-09], [ 4.2841e-08, 1.8626e-09, -1.0673e-06, ..., -4.2841e-08, -1.4082e-06, 8.7544e-08], [ 9.3132e-09, -1.6764e-08, 2.3842e-07, ..., 1.2293e-07, 2.4214e-08, 1.1176e-08], ..., [ 2.6077e-08, 5.5879e-09, 3.7812e-07, ..., -7.6182e-07, 1.0990e-07, -4.8988e-07], [ 2.3842e-07, 1.8626e-09, 2.1160e-06, ..., 7.0594e-07, 3.5949e-07, 4.2841e-08], [ 5.5879e-08, 1.8626e-09, 5.3458e-07, ..., 2.9616e-07, 1.6764e-08, 5.7742e-08]], device='cuda:0') Epoch 190, bias, value: tensor([-4.3177e-03, -3.1596e-02, 5.5106e-03, -1.8221e-02, 1.7231e-02, 9.6759e-03, 2.0172e-02, 4.5741e-06, -3.4630e-02, -8.9778e-03], device='cuda:0'), grad: tensor([ 1.5050e-06, -1.3411e-06, 5.1409e-07, -2.5779e-05, 1.3672e-06, 2.4036e-05, 1.6950e-06, -1.1232e-06, 6.0201e-06, -6.9812e-06], device='cuda:0') 100 0.0001 changing lr epoch 189, time 258.74, cls_loss 0.0017 cls_loss_mapping 0.0034 cls_loss_causal 0.5449 re_mapping 0.0062 re_causal 0.0194 /// teacc 98.99 lr 0.00010000 Epoch 191, weight, value: tensor([[ 0.0248, -0.1483, -0.1191, ..., -0.2285, -0.0864, -0.1296], [ 0.0408, -0.0593, 0.0334, ..., 0.0512, 0.0963, -0.0351], [-0.0645, 0.1130, -0.1483, ..., 0.0581, 0.0649, -0.0357], ..., [-0.0784, -0.0708, -0.0735, ..., 0.0011, -0.1316, 0.1160], [ 0.0665, -0.0233, 0.0611, ..., -0.0022, -0.1666, -0.0075], [-0.1415, -0.0445, -0.0592, ..., -0.1568, 0.0421, -0.0921]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 9.3132e-09, ..., -3.4831e-07, 3.9116e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -4.6380e-07, ..., -2.7940e-08, -3.5390e-07, 1.8626e-09], [ 0.0000e+00, -5.5879e-09, 1.1362e-07, ..., 1.1548e-07, 1.6764e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 9.8720e-08, ..., 6.5193e-08, 9.8720e-08, -9.3132e-09], [ 0.0000e+00, 0.0000e+00, 2.2724e-07, ..., 1.0431e-07, 1.2293e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.0489e-08, ..., 2.5146e-07, -3.2224e-07, 3.7253e-09]], device='cuda:0') Epoch 191, bias, value: tensor([-0.0041, -0.0315, 0.0059, -0.0188, 0.0167, 0.0098, 0.0199, -0.0001, -0.0348, -0.0084], device='cuda:0'), grad: tensor([-3.5781e-06, -8.9407e-07, 2.9057e-07, 8.7544e-08, 1.0822e-06, 4.9919e-07, 8.1956e-08, 2.8498e-07, 5.4576e-07, 1.6075e-06], device='cuda:0') 100 0.0001 changing lr epoch 190, time 258.78, cls_loss 0.0016 cls_loss_mapping 0.0031 cls_loss_causal 0.5125 re_mapping 0.0062 re_causal 0.0186 /// teacc 98.97 lr 0.00010000 Epoch 192, weight, value: tensor([[ 0.0248, -0.1489, -0.1196, ..., -0.2293, -0.0865, -0.1296], [ 0.0409, -0.0593, 0.0335, ..., 0.0512, 0.0964, -0.0352], [-0.0646, 0.1133, -0.1484, ..., 0.0584, 0.0653, -0.0358], ..., [-0.0784, -0.0710, -0.0736, ..., 0.0008, -0.1326, 0.1160], [ 0.0666, -0.0233, 0.0613, ..., -0.0021, -0.1669, -0.0075], [-0.1417, -0.0446, -0.0594, ..., -0.1564, 0.0422, -0.0923]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.7881e-07, 1.6764e-07, ..., 4.4703e-07, 5.9046e-07, 3.7253e-09], [ 0.0000e+00, 3.3528e-08, -1.0997e-05, ..., -6.5118e-06, -8.8364e-06, 5.7742e-08], [ 0.0000e+00, -4.7497e-07, 2.5667e-06, ..., 1.7770e-06, 8.1584e-07, 1.1176e-08], ..., [ 3.7253e-09, 9.8720e-08, 3.0920e-06, ..., -7.1600e-06, 2.4065e-06, -2.3656e-07], [-0.0000e+00, 1.6764e-08, 1.8440e-06, ..., 1.2983e-06, 1.3653e-06, 3.7253e-08], [ 0.0000e+00, 4.0978e-08, 2.3656e-07, ..., 1.3318e-06, -1.1120e-06, 5.0291e-08]], device='cuda:0') Epoch 192, bias, value: tensor([-0.0042, -0.0317, 0.0062, -0.0187, 0.0166, 0.0100, 0.0200, -0.0005, -0.0348, -0.0081], device='cuda:0'), grad: tensor([-7.2904e-06, -2.1070e-05, 5.3123e-06, 1.1235e-05, 1.0148e-05, 7.4431e-06, 1.6559e-06, -1.3798e-05, 4.8503e-06, 1.4734e-06], device='cuda:0') 100 0.0001 changing lr epoch 191, time 259.28, cls_loss 0.0020 cls_loss_mapping 0.0039 cls_loss_causal 0.5306 re_mapping 0.0062 re_causal 0.0184 /// teacc 98.88 lr 0.00010000 Epoch 193, weight, value: tensor([[ 0.0249, -0.1493, -0.1206, ..., -0.2304, -0.0867, -0.1297], [ 0.0406, -0.0604, 0.0335, ..., 0.0508, 0.0956, -0.0352], [-0.0647, 0.1140, -0.1479, ..., 0.0585, 0.0662, -0.0359], ..., [-0.0785, -0.0712, -0.0737, ..., 0.0013, -0.1327, 0.1161], [ 0.0669, -0.0233, 0.0622, ..., -0.0020, -0.1673, -0.0076], [-0.1418, -0.0453, -0.0598, ..., -0.1566, 0.0425, -0.0924]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 9.3132e-09, 3.9265e-06, ..., 2.2352e-07, 2.1420e-07, 1.4901e-08], [ 1.8626e-09, 0.0000e+00, -2.2680e-05, ..., -1.4670e-05, -1.5572e-05, -1.1362e-06], [-1.8626e-08, -6.8918e-08, 3.1777e-06, ..., 1.7677e-06, 1.6801e-06, 2.0303e-07], ..., [ 7.4506e-09, 5.5879e-09, 8.0615e-06, ..., 4.1835e-06, 5.0999e-06, -1.0431e-06], [ 2.4214e-08, 5.5879e-09, -5.1260e-06, ..., 2.8443e-06, 1.8924e-06, 5.2154e-07], [ 1.8626e-09, 9.3132e-09, 2.3339e-06, ..., 2.2911e-06, 1.7695e-07, 1.3970e-07]], device='cuda:0') Epoch 193, bias, value: tensor([-0.0048, -0.0324, 0.0068, -0.0192, 0.0163, 0.0099, 0.0203, -0.0001, -0.0342, -0.0077], device='cuda:0'), grad: tensor([ 8.5682e-06, -4.9800e-05, 5.9083e-06, 7.7546e-05, 1.4544e-05, -8.2731e-05, 4.9807e-06, 1.5542e-05, -6.7763e-06, 1.2189e-05], device='cuda:0') 100 0.0001 changing lr epoch 192, time 259.50, cls_loss 0.0021 cls_loss_mapping 0.0045 cls_loss_causal 0.5395 re_mapping 0.0063 re_causal 0.0180 /// teacc 99.04 lr 0.00010000 Epoch 194, weight, value: tensor([[ 0.0250, -0.1494, -0.1207, ..., -0.2320, -0.0869, -0.1297], [ 0.0405, -0.0606, 0.0337, ..., 0.0507, 0.0958, -0.0353], [-0.0647, 0.1141, -0.1484, ..., 0.0584, 0.0662, -0.0363], ..., [-0.0785, -0.0712, -0.0736, ..., 0.0017, -0.1335, 0.1167], [ 0.0670, -0.0230, 0.0622, ..., -0.0020, -0.1685, -0.0077], [-0.1418, -0.0456, -0.0617, ..., -0.1575, 0.0426, -0.0931]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.5879e-09, 6.1467e-08, ..., 4.6566e-08, 3.5390e-07, 2.9802e-08], [-2.6077e-08, 5.5879e-09, -1.4491e-06, ..., 1.0170e-06, -1.4286e-06, 1.0300e-06], [ 1.8626e-09, -6.1467e-08, 8.5682e-08, ..., 1.8999e-07, -3.5390e-08, 2.4214e-07], ..., [ 1.8626e-09, 2.7940e-08, 2.4214e-07, ..., -6.5751e-06, 4.4145e-07, -6.1579e-06], [-3.7253e-09, 7.4506e-09, 2.5891e-07, ..., 9.6858e-08, 5.6438e-07, 3.9116e-08], [ 3.7253e-09, 1.8626e-09, 1.2852e-07, ..., 5.1335e-06, 1.1995e-06, 4.6715e-06]], device='cuda:0') Epoch 194, bias, value: tensor([-0.0046, -0.0325, 0.0063, -0.0186, 0.0164, 0.0093, 0.0207, 0.0004, -0.0344, -0.0080], device='cuda:0'), grad: tensor([ 4.7684e-07, 2.3358e-06, 1.0971e-06, 1.6950e-07, -2.1961e-06, 1.8310e-06, -1.5013e-06, -2.8744e-05, 1.4771e-06, 2.5049e-05], device='cuda:0') 100 0.0001 changing lr epoch 193, time 259.52, cls_loss 0.0023 cls_loss_mapping 0.0040 cls_loss_causal 0.5461 re_mapping 0.0061 re_causal 0.0184 /// teacc 98.99 lr 0.00010000 Epoch 195, weight, value: tensor([[ 0.0249, -0.1507, -0.1219, ..., -0.2343, -0.0882, -0.1299], [ 0.0404, -0.0609, 0.0340, ..., 0.0507, 0.0959, -0.0354], [-0.0647, 0.1148, -0.1488, ..., 0.0586, 0.0665, -0.0364], ..., [-0.0785, -0.0717, -0.0737, ..., 0.0017, -0.1340, 0.1170], [ 0.0670, -0.0227, 0.0625, ..., -0.0019, -0.1693, -0.0078], [-0.1419, -0.0468, -0.0624, ..., -0.1582, 0.0433, -0.0937]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.1665e-08, 4.0419e-07, ..., 7.2457e-07, 1.6261e-06, 3.7253e-09], [ 0.0000e+00, 7.4506e-09, 1.4052e-05, ..., 2.5839e-05, 5.3018e-05, -5.0291e-08], [ 0.0000e+00, -1.7323e-07, -2.0504e-05, ..., -3.6865e-05, -7.6711e-05, 2.6077e-08], ..., [ 0.0000e+00, 2.2352e-08, 4.2468e-07, ..., 4.5635e-07, 1.2163e-06, -1.8254e-07], [ 0.0000e+00, 6.5193e-08, 2.3097e-07, ..., 4.4331e-07, 8.9221e-07, 1.8626e-08], [ 0.0000e+00, 1.3039e-08, 1.1362e-06, ..., 2.1402e-06, 3.8855e-06, 8.7544e-08]], device='cuda:0') Epoch 195, bias, value: tensor([-0.0053, -0.0326, 0.0067, -0.0200, 0.0162, 0.0099, 0.0206, 0.0006, -0.0342, -0.0076], device='cuda:0'), grad: tensor([ 2.8349e-06, 9.0539e-05, -1.3006e-04, 3.1710e-05, 4.4107e-06, -1.7226e-05, 7.6592e-06, 1.6354e-06, 2.3991e-06, 5.9977e-06], device='cuda:0') 100 0.0001 changing lr epoch 194, time 259.17, cls_loss 0.0018 cls_loss_mapping 0.0032 cls_loss_causal 0.5419 re_mapping 0.0060 re_causal 0.0183 /// teacc 98.97 lr 0.00010000 Epoch 196, weight, value: tensor([[ 0.0249, -0.1510, -0.1221, ..., -0.2356, -0.0883, -0.1299], [ 0.0404, -0.0612, 0.0337, ..., 0.0501, 0.0951, -0.0358], [-0.0648, 0.1153, -0.1482, ..., 0.0592, 0.0675, -0.0366], ..., [-0.0786, -0.0734, -0.0735, ..., 0.0025, -0.1339, 0.1177], [ 0.0670, -0.0207, 0.0627, ..., -0.0017, -0.1699, -0.0078], [-0.1419, -0.0475, -0.0627, ..., -0.1589, 0.0435, -0.0940]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.5390e-07, 1.0803e-07, ..., 5.1968e-07, 8.8476e-07, 0.0000e+00], [ 0.0000e+00, 5.3085e-07, -4.0978e-07, ..., 1.2405e-06, 1.5926e-06, 0.0000e+00], [ 0.0000e+00, -1.9893e-06, 2.7567e-07, ..., -1.9222e-06, -5.4687e-06, 0.0000e+00], ..., [ 0.0000e+00, 2.4587e-07, 3.8370e-07, ..., -2.6431e-06, 1.0002e-06, 0.0000e+00], [ 0.0000e+00, 3.5390e-07, 1.2107e-07, ..., 1.5125e-06, 9.4064e-07, 0.0000e+00], [ 0.0000e+00, 1.5274e-07, 1.3970e-07, ..., 4.5262e-07, -5.0664e-07, 0.0000e+00]], device='cuda:0') Epoch 196, bias, value: tensor([-0.0053, -0.0334, 0.0075, -0.0213, 0.0159, 0.0104, 0.0205, 0.0013, -0.0340, -0.0077], device='cuda:0'), grad: tensor([ 1.9893e-06, 3.0417e-06, -6.1244e-06, 9.4250e-07, 7.7859e-07, 4.5858e-06, 5.5879e-08, -8.9854e-06, 5.5209e-06, -1.8738e-06], device='cuda:0') 100 0.0001 changing lr epoch 195, time 259.59, cls_loss 0.0020 cls_loss_mapping 0.0039 cls_loss_causal 0.5290 re_mapping 0.0065 re_causal 0.0182 /// teacc 98.89 lr 0.00010000 Epoch 197, weight, value: tensor([[ 0.0249, -0.1512, -0.1224, ..., -0.2365, -0.0884, -0.1299], [ 0.0405, -0.0617, 0.0339, ..., 0.0498, 0.0952, -0.0361], [-0.0649, 0.1203, -0.1483, ..., 0.0617, 0.0687, -0.0367], ..., [-0.0786, -0.0776, -0.0735, ..., 0.0019, -0.1357, 0.1183], [ 0.0669, -0.0218, 0.0628, ..., -0.0021, -0.1708, -0.0078], [-0.1421, -0.0522, -0.0635, ..., -0.1602, 0.0437, -0.0942]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.7055e-08, 8.5682e-08, ..., 2.3469e-07, 3.6880e-07, 7.4506e-09], [ 1.8626e-09, -9.8050e-06, 5.1409e-07, ..., -1.6585e-05, -4.8041e-05, 6.3330e-08], [ 1.8626e-09, 1.1474e-06, 3.4142e-06, ..., 1.2502e-05, 2.7612e-05, 1.1735e-07], ..., [ 3.7253e-09, 6.6832e-06, -1.1347e-05, ..., -1.1094e-05, 1.0811e-05, -3.8743e-07], [ 1.1176e-08, 5.5879e-08, 3.4813e-06, ..., 4.6007e-06, 3.2596e-07, 4.2841e-08], [ 0.0000e+00, 2.4959e-07, 3.0696e-06, ..., 5.9716e-06, 1.3690e-06, 1.2852e-07]], device='cuda:0') Epoch 197, bias, value: tensor([-0.0053, -0.0336, 0.0103, -0.0251, 0.0156, 0.0126, 0.0205, 0.0005, -0.0344, -0.0073], device='cuda:0'), grad: tensor([ 8.2701e-07, -4.8578e-05, 4.1723e-05, 2.4691e-05, 1.5888e-06, 1.3411e-07, -1.3269e-05, -4.3452e-05, 1.5959e-05, 2.0355e-05], device='cuda:0') 100 0.0001 changing lr epoch 196, time 259.47, cls_loss 0.0022 cls_loss_mapping 0.0038 cls_loss_causal 0.5298 re_mapping 0.0062 re_causal 0.0184 /// teacc 98.93 lr 0.00010000 Epoch 198, weight, value: tensor([[ 0.0248, -0.1513, -0.1221, ..., -0.2379, -0.0885, -0.1300], [ 0.0405, -0.0614, 0.0332, ..., 0.0491, 0.0948, -0.0363], [-0.0646, 0.1226, -0.1484, ..., 0.0626, 0.0700, -0.0369], ..., [-0.0780, -0.0787, -0.0727, ..., 0.0025, -0.1357, 0.1186], [ 0.0664, -0.0242, 0.0622, ..., -0.0035, -0.1737, -0.0080], [-0.1424, -0.0565, -0.0657, ..., -0.1617, 0.0434, -0.0944]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.0617e-07, ..., 8.5682e-08, 1.2666e-07, 0.0000e+00], [-1.0245e-07, 1.4901e-08, -5.2601e-05, ..., -8.1956e-05, -6.4850e-05, 5.5879e-09], [ 4.6566e-08, 1.4715e-07, 2.2247e-05, ..., 3.5316e-05, 2.7165e-05, 4.0419e-07], ..., [ 5.5879e-08, -1.6764e-07, 2.8744e-05, ..., 4.3303e-05, 3.4988e-05, -4.2096e-07], [ 3.7253e-09, 1.8626e-09, 1.6838e-06, ..., 1.9185e-07, 3.2224e-06, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, 3.5204e-07, ..., 3.5949e-07, -5.6624e-07, 3.7253e-09]], device='cuda:0') Epoch 198, bias, value: tensor([-0.0043, -0.0345, 0.0116, -0.0244, 0.0160, 0.0124, 0.0200, 0.0010, -0.0361, -0.0078], device='cuda:0'), grad: tensor([ 1.4715e-07, -1.4603e-04, 6.3419e-05, -2.7940e-08, 2.6450e-06, 2.4401e-06, -6.5118e-06, 7.8022e-05, 6.1542e-06, -2.8126e-07], device='cuda:0') 100 0.0001 changing lr epoch 197, time 259.81, cls_loss 0.0025 cls_loss_mapping 0.0040 cls_loss_causal 0.5273 re_mapping 0.0059 re_causal 0.0173 /// teacc 99.02 lr 0.00010000 Epoch 199, weight, value: tensor([[ 0.0249, -0.1513, -0.1221, ..., -0.2391, -0.0890, -0.1300], [ 0.0406, -0.0616, 0.0325, ..., 0.0483, 0.0949, -0.0364], [-0.0647, 0.1228, -0.1491, ..., 0.0623, 0.0700, -0.0372], ..., [-0.0781, -0.0788, -0.0717, ..., 0.0035, -0.1353, 0.1188], [ 0.0665, -0.0242, 0.0623, ..., -0.0037, -0.1745, -0.0081], [-0.1425, -0.0569, -0.0682, ..., -0.1626, 0.0441, -0.0948]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.7497e-08, 1.8999e-07, ..., 4.9546e-07, 4.0419e-07, 7.4506e-09], [ 0.0000e+00, 4.2841e-08, -3.1665e-08, ..., 3.9116e-07, 1.3690e-07, 1.4994e-07], [ 0.0000e+00, -5.1875e-07, 1.6550e-06, ..., -2.8498e-06, -2.5518e-06, 4.7497e-08], ..., [ 0.0000e+00, 2.0768e-07, 1.2107e-07, ..., -2.5146e-08, 6.3144e-07, -4.6659e-07], [ 0.0000e+00, 1.0524e-07, -2.8461e-06, ..., 5.2061e-07, 1.6857e-07, 4.9360e-08], [ 0.0000e+00, 1.2107e-08, 1.1083e-07, ..., 2.5332e-07, 1.8068e-07, 1.6112e-07]], device='cuda:0') Epoch 199, bias, value: tensor([-0.0046, -0.0352, 0.0112, -0.0214, 0.0147, 0.0098, 0.0200, 0.0018, -0.0366, -0.0069], device='cuda:0'), grad: tensor([ 1.4361e-06, 1.2908e-06, -3.5688e-06, 2.0862e-06, 2.1793e-07, -1.5181e-06, 2.7046e-06, -4.5914e-07, -2.7362e-06, 4.7591e-07], device='cuda:0') 100 0.0001 changing lr epoch 198, time 259.66, cls_loss 0.0018 cls_loss_mapping 0.0028 cls_loss_causal 0.4975 re_mapping 0.0058 re_causal 0.0169 /// teacc 98.90 lr 0.00010000 Epoch 200, weight, value: tensor([[ 0.0249, -0.1515, -0.1227, ..., -0.2402, -0.0921, -0.1300], [ 0.0406, -0.0618, 0.0329, ..., 0.0488, 0.0955, -0.0363], [-0.0652, 0.1227, -0.1497, ..., 0.0621, 0.0700, -0.0374], ..., [-0.0781, -0.0789, -0.0721, ..., 0.0029, -0.1361, 0.1187], [ 0.0668, -0.0239, 0.0630, ..., -0.0032, -0.1750, -0.0081], [-0.1425, -0.0571, -0.0688, ..., -0.1630, 0.0455, -0.0950]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.7008e-08, ..., 8.6706e-07, 1.5413e-06, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.3935e-07, ..., 4.2468e-07, 4.6659e-07, 7.4506e-09], [ 0.0000e+00, -4.0978e-08, 9.3225e-07, ..., -7.6711e-05, -1.1706e-04, 1.5832e-08], ..., [ 0.0000e+00, 3.5390e-08, 3.4645e-07, ..., 2.2873e-06, 3.3602e-06, -2.8871e-08], [ 0.0000e+00, 0.0000e+00, -8.4192e-07, ..., -8.5682e-08, 8.1025e-07, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 4.2003e-07, ..., 2.0768e-07, 1.1222e-06, 1.8626e-09]], device='cuda:0') Epoch 200, bias, value: tensor([-0.0073, -0.0347, 0.0108, -0.0213, 0.0147, 0.0098, 0.0208, 0.0014, -0.0363, -0.0055], device='cuda:0'), grad: tensor([-4.5672e-06, 1.5246e-06, -1.9717e-04, 1.8859e-04, -4.2841e-06, 5.3085e-07, 1.0999e-06, 6.5118e-06, -1.4622e-07, 7.6182e-06], device='cuda:0') 100 0.0001 changing lr epoch 199, time 256.34, cls_loss 0.0022 cls_loss_mapping 0.0037 cls_loss_causal 0.5360 re_mapping 0.0058 re_causal 0.0177 /// teacc 98.95 lr 0.00010000 Epoch 201, weight, value: tensor([[ 0.0253, -0.1518, -0.1234, ..., -0.2424, -0.0924, -0.1300], [ 0.0407, -0.0622, 0.0332, ..., 0.0488, 0.0957, -0.0364], [-0.0657, 0.1230, -0.1511, ..., 0.0628, 0.0702, -0.0375], ..., [-0.0773, -0.0794, -0.0722, ..., 0.0029, -0.1373, 0.1187], [ 0.0668, -0.0229, 0.0637, ..., -0.0027, -0.1748, -0.0081], [-0.1431, -0.0583, -0.0690, ..., -0.1640, 0.0463, -0.0951]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.1327e-07, 1.3039e-07, ..., 4.0885e-07, 2.8368e-06, 0.0000e+00], [ 0.0000e+00, 1.4901e-08, 1.5832e-08, ..., 9.5926e-08, -1.2359e-06, 6.5193e-09], [ 0.0000e+00, -6.7521e-07, 1.2172e-06, ..., -3.0641e-07, -1.2126e-06, 2.7940e-09], ..., [ 0.0000e+00, 1.0245e-07, 1.5339e-06, ..., 1.1278e-06, 1.7034e-06, -1.7695e-08], [ 0.0000e+00, 1.4249e-07, 7.3574e-08, ..., 3.9395e-07, 1.0151e-06, 9.3132e-10], [ 0.0000e+00, 1.2107e-08, 6.5006e-07, ..., 4.5914e-07, -1.8915e-06, 5.5879e-09]], device='cuda:0') Epoch 201, bias, value: tensor([-0.0074, -0.0346, 0.0109, -0.0216, 0.0145, 0.0097, 0.0212, 0.0011, -0.0358, -0.0046], device='cuda:0'), grad: tensor([ 6.6124e-06, -7.9162e-07, 9.9838e-07, -1.4104e-05, 6.6049e-06, 3.7216e-06, -9.1270e-06, 5.9903e-06, 2.2873e-06, -2.2091e-06], device='cuda:0') 100 0.0001 changing lr epoch 200, time 253.65, cls_loss 0.0025 cls_loss_mapping 0.0035 cls_loss_causal 0.5354 re_mapping 0.0059 re_causal 0.0181 /// teacc 98.97 lr 0.00010000 Epoch 202, weight, value: tensor([[ 0.0256, -0.1517, -0.1248, ..., -0.2433, -0.0955, -0.1300], [ 0.0408, -0.0623, 0.0336, ..., 0.0491, 0.0962, -0.0364], [-0.0659, 0.1230, -0.1517, ..., 0.0626, 0.0701, -0.0378], ..., [-0.0771, -0.0795, -0.0724, ..., 0.0027, -0.1380, 0.1188], [ 0.0668, -0.0223, 0.0641, ..., -0.0025, -0.1749, -0.0081], [-0.1438, -0.0585, -0.0696, ..., -0.1643, 0.0484, -0.0952]], device='cuda:0'), grad: tensor([[ 2.1420e-08, 1.3039e-08, -2.8200e-06, ..., 3.4180e-07, 2.9709e-07, 9.3132e-10], [-9.3132e-10, 3.7253e-09, 1.6028e-06, ..., 9.5274e-07, 3.9488e-07, 1.3411e-07], [ 3.4831e-07, -1.0245e-07, 1.5043e-05, ..., 5.8301e-06, 4.1313e-06, 1.7695e-08], ..., [ 8.0094e-08, 3.2596e-08, 3.4682e-06, ..., 1.2862e-06, 1.5106e-06, -2.6356e-07], [ 9.3132e-08, 3.4459e-08, 3.6489e-06, ..., 1.5637e-06, 1.3849e-06, 4.8429e-08], [ 1.2107e-08, 6.5193e-09, 1.2908e-06, ..., 3.3807e-07, -2.6524e-05, 2.7008e-08]], device='cuda:0') Epoch 202, bias, value: tensor([-0.0101, -0.0344, 0.0106, -0.0217, 0.0170, 0.0099, 0.0212, 0.0009, -0.0356, -0.0028], device='cuda:0'), grad: tensor([-3.3945e-05, 5.1968e-06, 3.9399e-05, -1.2422e-04, 9.8050e-05, 5.9277e-05, 2.2858e-05, 9.4622e-06, 1.0803e-05, -8.6725e-05], device='cuda:0') 100 0.0001 changing lr epoch 201, time 253.69, cls_loss 0.0016 cls_loss_mapping 0.0024 cls_loss_causal 0.5475 re_mapping 0.0057 re_causal 0.0178 /// teacc 99.04 lr 0.00010000 Epoch 203, weight, value: tensor([[ 0.0256, -0.1498, -0.1251, ..., -0.2430, -0.0955, -0.1301], [ 0.0410, -0.0624, 0.0336, ..., 0.0490, 0.0960, -0.0365], [-0.0659, 0.1232, -0.1519, ..., 0.0629, 0.0705, -0.0379], ..., [-0.0771, -0.0797, -0.0725, ..., 0.0027, -0.1384, 0.1190], [ 0.0668, -0.0225, 0.0645, ..., -0.0025, -0.1753, -0.0082], [-0.1440, -0.0591, -0.0700, ..., -0.1647, 0.0484, -0.0954]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 1.5274e-07, 9.3132e-09, ..., 1.2573e-07, 1.3504e-07, 9.3132e-10], [-2.1420e-08, 1.0151e-07, -6.5193e-09, ..., 3.1106e-07, 2.7567e-07, 2.0489e-08], [-1.5832e-08, -4.9919e-07, 4.2841e-08, ..., -1.1241e-06, -1.2303e-06, 6.5193e-09], ..., [ 1.4901e-08, 1.4342e-07, 7.1712e-08, ..., 2.6356e-07, 3.2224e-07, -1.7323e-07], [-8.3819e-09, 1.8626e-08, -2.8592e-07, ..., -1.1735e-07, 1.1176e-08, 5.5879e-09], [ 2.7940e-09, 7.4506e-09, 8.1956e-08, ..., 5.8673e-08, 2.6077e-08, 7.5437e-08]], device='cuda:0') Epoch 203, bias, value: tensor([-0.0099, -0.0346, 0.0108, -0.0215, 0.0168, 0.0097, 0.0212, 0.0010, -0.0354, -0.0029], device='cuda:0'), grad: tensor([ 2.2650e-06, 7.4599e-07, -2.9206e-06, 7.5437e-07, 3.1013e-07, 2.6356e-07, -2.1458e-06, 3.0175e-07, -4.6194e-07, 8.8289e-07], device='cuda:0') 100 0.0001 changing lr epoch 202, time 253.96, cls_loss 0.0014 cls_loss_mapping 0.0024 cls_loss_causal 0.4946 re_mapping 0.0059 re_causal 0.0184 /// teacc 98.92 lr 0.00010000 Epoch 204, weight, value: tensor([[ 0.0259, -0.1494, -0.1253, ..., -0.2434, -0.0955, -0.1301], [ 0.0411, -0.0630, 0.0338, ..., 0.0491, 0.0962, -0.0365], [-0.0660, 0.1235, -0.1523, ..., 0.0630, 0.0709, -0.0384], ..., [-0.0771, -0.0800, -0.0727, ..., 0.0022, -0.1392, 0.1187], [ 0.0668, -0.0228, 0.0647, ..., -0.0026, -0.1757, -0.0082], [-0.1443, -0.0592, -0.0705, ..., -0.1649, 0.0484, -0.0955]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.3039e-08, ..., 3.7253e-09, 4.0978e-08, 0.0000e+00], [-4.6566e-09, 0.0000e+00, -3.5856e-07, ..., -4.8429e-08, -3.5856e-07, 1.9558e-08], [ 0.0000e+00, -9.3132e-10, 2.6077e-08, ..., 2.5146e-08, 4.8708e-07, 6.5193e-09], ..., [ 9.3132e-10, 9.3132e-10, 1.3877e-07, ..., -5.1223e-08, 1.0496e-06, -5.6811e-08], [ 0.0000e+00, 0.0000e+00, 1.8626e-08, ..., 1.7695e-08, 1.0617e-07, 3.7253e-09], [ 2.7940e-09, 0.0000e+00, 1.4715e-07, ..., 5.5879e-08, 8.4471e-07, 1.8626e-08]], device='cuda:0') Epoch 204, bias, value: tensor([-0.0097, -0.0345, 0.0108, -0.0212, 0.0169, 0.0097, 0.0210, 0.0006, -0.0355, -0.0031], device='cuda:0'), grad: tensor([ 1.1269e-07, -7.1619e-07, 1.5404e-06, -6.5193e-09, -7.4767e-06, 1.1735e-07, 1.0990e-07, 3.0473e-06, 4.0233e-07, 2.8610e-06], device='cuda:0') 100 0.0001 changing lr epoch 203, time 253.63, cls_loss 0.0017 cls_loss_mapping 0.0033 cls_loss_causal 0.5163 re_mapping 0.0055 re_causal 0.0173 /// teacc 98.96 lr 0.00010000 Epoch 205, weight, value: tensor([[ 0.0259, -0.1495, -0.1251, ..., -0.2439, -0.0954, -0.1301], [ 0.0413, -0.0632, 0.0336, ..., 0.0488, 0.0951, -0.0366], [-0.0660, 0.1237, -0.1508, ..., 0.0639, 0.0730, -0.0385], ..., [-0.0771, -0.0801, -0.0730, ..., 0.0020, -0.1402, 0.1189], [ 0.0668, -0.0229, 0.0653, ..., -0.0026, -0.1761, -0.0082], [-0.1445, -0.0594, -0.0715, ..., -0.1654, 0.0484, -0.0956]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.1176e-07, 4.5635e-08, ..., 1.1642e-07, 2.6170e-07, 9.3132e-10], [ 0.0000e+00, 9.3132e-10, -2.1793e-07, ..., -5.4017e-08, -2.4587e-07, 1.3970e-08], [ 0.0000e+00, -1.4808e-07, 8.8476e-08, ..., -3.2596e-08, -2.4494e-07, 2.4214e-08], ..., [ 0.0000e+00, 1.9558e-08, 1.0617e-07, ..., -8.6613e-08, 1.4435e-07, -7.6368e-08], [ 0.0000e+00, 4.6566e-09, 6.0536e-08, ..., 3.3528e-08, 2.8871e-08, 3.7253e-09], [ 0.0000e+00, 2.7940e-09, 5.9605e-08, ..., 6.8918e-08, 1.1921e-07, 2.7940e-08]], device='cuda:0') Epoch 205, bias, value: tensor([-0.0094, -0.0350, 0.0124, -0.0214, 0.0166, 0.0099, 0.0204, 0.0004, -0.0355, -0.0032], device='cuda:0'), grad: tensor([ 3.2131e-07, -2.0582e-07, -8.5682e-08, -1.0589e-06, -2.9523e-07, 8.0466e-07, 1.3039e-07, -4.7591e-07, 1.6578e-07, 7.0874e-07], device='cuda:0') 100 0.0001 changing lr epoch 204, time 253.58, cls_loss 0.0012 cls_loss_mapping 0.0029 cls_loss_causal 0.5072 re_mapping 0.0058 re_causal 0.0181 /// teacc 98.95 lr 0.00010000 Epoch 206, weight, value: tensor([[ 0.0261, -0.1495, -0.1240, ..., -0.2443, -0.0954, -0.1301], [ 0.0412, -0.0634, 0.0329, ..., 0.0480, 0.0940, -0.0369], [-0.0664, 0.1239, -0.1506, ..., 0.0641, 0.0737, -0.0387], ..., [-0.0772, -0.0801, -0.0724, ..., 0.0028, -0.1392, 0.1192], [ 0.0671, -0.0230, 0.0654, ..., -0.0024, -0.1767, -0.0082], [-0.1447, -0.0596, -0.0722, ..., -0.1662, 0.0482, -0.0959]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.6077e-08, ..., 2.3283e-08, 5.2899e-07, 5.4948e-08], [ 0.0000e+00, 0.0000e+00, -6.1020e-06, ..., -6.1207e-06, -1.3359e-05, 1.6764e-08], [ 0.0000e+00, 0.0000e+00, 1.3243e-06, ..., 1.2349e-06, 2.1905e-06, 1.5832e-08], ..., [ 0.0000e+00, 0.0000e+00, 5.1558e-06, ..., 4.8876e-06, 1.0632e-05, -4.3772e-08], [ 0.0000e+00, 0.0000e+00, 3.1590e-06, ..., 1.6410e-06, 1.5367e-07, 1.8626e-08], [ 0.0000e+00, 0.0000e+00, 2.3190e-07, ..., 2.4587e-07, 5.9642e-06, 1.0245e-08]], device='cuda:0') Epoch 206, bias, value: tensor([-0.0093, -0.0360, 0.0128, -0.0214, 0.0170, 0.0100, 0.0201, 0.0013, -0.0356, -0.0036], device='cuda:0'), grad: tensor([ 2.0918e-06, -2.7314e-05, 5.2191e-06, -8.4117e-06, -1.2018e-05, 2.6375e-06, -2.7567e-06, 2.1830e-05, 5.3570e-06, 1.3292e-05], device='cuda:0') 100 0.0001 changing lr epoch 205, time 253.65, cls_loss 0.0016 cls_loss_mapping 0.0030 cls_loss_causal 0.5155 re_mapping 0.0058 re_causal 0.0173 /// teacc 98.95 lr 0.00010000 Epoch 207, weight, value: tensor([[ 0.0268, -0.1496, -0.1241, ..., -0.2451, -0.0954, -0.1301], [ 0.0411, -0.0636, 0.0333, ..., 0.0482, 0.0945, -0.0369], [-0.0680, 0.1240, -0.1518, ..., 0.0635, 0.0734, -0.0388], ..., [-0.0776, -0.0801, -0.0726, ..., 0.0027, -0.1394, 0.1190], [ 0.0687, -0.0230, 0.0656, ..., -0.0020, -0.1773, -0.0082], [-0.1455, -0.0598, -0.0720, ..., -0.1665, 0.0481, -0.0960]], device='cuda:0'), grad: tensor([[ 1.3039e-08, 0.0000e+00, 6.2399e-08, ..., 1.7695e-08, 1.0524e-07, 4.6566e-09], [ 1.8347e-07, 2.7940e-09, 4.0326e-07, ..., 1.9558e-07, 1.3597e-07, 4.3772e-08], [ 1.2759e-07, -1.0245e-08, 4.3027e-07, ..., 1.8813e-07, 2.5798e-07, 3.2596e-08], ..., [ 8.1770e-07, 5.5879e-09, 3.9563e-06, ..., 1.8440e-07, 3.8370e-06, -2.8312e-07], [-4.3027e-07, 1.8626e-09, -8.6054e-07, ..., -8.3353e-07, -8.3819e-08, 4.7497e-08], [ 4.2096e-07, 0.0000e+00, 2.1402e-06, ..., 7.0781e-08, 9.0990e-07, 8.7544e-08]], device='cuda:0') Epoch 207, bias, value: tensor([-0.0091, -0.0359, 0.0123, -0.0212, 0.0172, 0.0099, 0.0204, 0.0010, -0.0355, -0.0037], device='cuda:0'), grad: tensor([-2.9150e-07, 2.0713e-06, 1.7416e-06, 2.3916e-06, -2.7180e-05, -8.8569e-07, 1.2591e-06, 1.5631e-05, -1.3839e-06, 6.6161e-06], device='cuda:0') 100 0.0001 changing lr epoch 206, time 253.54, cls_loss 0.0016 cls_loss_mapping 0.0031 cls_loss_causal 0.5150 re_mapping 0.0058 re_causal 0.0176 /// teacc 98.95 lr 0.00010000 Epoch 208, weight, value: tensor([[ 0.0277, -0.1500, -0.1247, ..., -0.2464, -0.0954, -0.1300], [ 0.0410, -0.0638, 0.0340, ..., 0.0483, 0.0958, -0.0369], [-0.0691, 0.1240, -0.1524, ..., 0.0634, 0.0734, -0.0389], ..., [-0.0777, -0.0802, -0.0732, ..., 0.0025, -0.1418, 0.1189], [ 0.0698, -0.0227, 0.0672, ..., -0.0017, -0.1769, -0.0083], [-0.1470, -0.0599, -0.0726, ..., -0.1668, 0.0481, -0.0960]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 0.0000e+00, -4.6819e-05, ..., 2.7940e-08, -7.8261e-05, 7.4506e-09], [ 1.8626e-09, 0.0000e+00, 2.6211e-05, ..., -2.6356e-07, 4.4256e-05, 6.7055e-08], [ 1.1176e-08, -1.8626e-09, 9.5293e-06, ..., 1.0990e-07, 1.5929e-05, 5.4948e-08], ..., [ 6.5193e-09, 0.0000e+00, 7.9069e-07, ..., 9.0338e-08, 7.4320e-07, -1.6093e-06], [ 8.3819e-09, 0.0000e+00, -8.0466e-07, ..., -1.5832e-07, 3.0547e-07, 5.6811e-08], [ 2.7940e-09, 0.0000e+00, 1.1884e-06, ..., 1.9185e-07, 1.6578e-06, 8.8476e-08]], device='cuda:0') Epoch 208, bias, value: tensor([-0.0091, -0.0346, 0.0121, -0.0211, 0.0173, 0.0100, 0.0196, -0.0002, -0.0344, -0.0038], device='cuda:0'), grad: tensor([-2.4748e-04, 1.4210e-04, 5.6684e-05, 5.7071e-06, 7.9125e-06, 3.3323e-06, 3.7611e-05, -1.3642e-05, 3.3341e-07, 7.1228e-06], device='cuda:0') 100 0.0001 changing lr epoch 207, time 253.76, cls_loss 0.0019 cls_loss_mapping 0.0029 cls_loss_causal 0.5128 re_mapping 0.0055 re_causal 0.0169 /// teacc 98.91 lr 0.00010000 Epoch 209, weight, value: tensor([[ 0.0282, -0.1497, -0.1230, ..., -0.2485, -0.0956, -0.1300], [ 0.0378, -0.0638, 0.0341, ..., 0.0478, 0.0948, -0.0370], [-0.0661, 0.1241, -0.1523, ..., 0.0643, 0.0747, -0.0390], ..., [-0.0779, -0.0802, -0.0733, ..., 0.0025, -0.1423, 0.1191], [ 0.0700, -0.0227, 0.0673, ..., -0.0017, -0.1777, -0.0083], [-0.1490, -0.0601, -0.0733, ..., -0.1668, 0.0484, -0.0962]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 3.6322e-08, 6.2399e-08, ..., 7.5437e-08, 1.7192e-06, 0.0000e+00], [-2.1420e-08, 7.4506e-09, -8.7544e-08, ..., 7.4506e-09, -1.9558e-08, 5.5879e-09], [ 6.5193e-09, -1.2573e-07, 1.7416e-07, ..., -1.2200e-07, -1.8813e-07, 1.8626e-09], ..., [ 4.6566e-09, 3.8184e-08, 1.0431e-07, ..., 7.6368e-08, 3.5204e-07, -1.3970e-08], [ 1.8626e-09, -2.7381e-07, -1.4510e-06, ..., -8.2888e-07, 1.2569e-05, 9.3132e-10], [ 3.7253e-09, 2.4494e-07, 3.8650e-07, ..., 2.0675e-07, 2.2128e-06, 5.5879e-09]], device='cuda:0') Epoch 209, bias, value: tensor([-9.1916e-03, -3.5346e-02, 1.3071e-02, -2.1062e-02, 1.6832e-02, 1.0018e-02, 1.9548e-02, -2.7881e-05, -3.4831e-02, -3.4739e-03], device='cuda:0'), grad: tensor([ 3.9265e-06, 4.4703e-07, 5.5972e-07, 9.7230e-07, -5.8636e-06, 3.3714e-07, -3.5644e-05, 4.2003e-07, 2.6599e-05, 8.1882e-06], device='cuda:0') 100 0.0001 changing lr epoch 208, time 253.82, cls_loss 0.0019 cls_loss_mapping 0.0027 cls_loss_causal 0.5115 re_mapping 0.0059 re_causal 0.0167 /// teacc 98.97 lr 0.00010000 Epoch 210, weight, value: tensor([[ 0.0281, -0.1496, -0.1244, ..., -0.2506, -0.0958, -0.1300], [ 0.0381, -0.0639, 0.0346, ..., 0.0480, 0.0952, -0.0370], [-0.0660, 0.1241, -0.1527, ..., 0.0642, 0.0747, -0.0391], ..., [-0.0781, -0.0802, -0.0735, ..., 0.0023, -0.1430, 0.1191], [ 0.0706, -0.0228, 0.0677, ..., -0.0015, -0.1789, -0.0083], [-0.1510, -0.0604, -0.0739, ..., -0.1670, 0.0486, -0.0962]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.5404e-06, ..., 1.7043e-07, 4.4703e-08, 2.7940e-09], [-2.7940e-09, 9.3132e-10, 3.2224e-07, ..., 1.0524e-07, -1.0245e-08, 2.1420e-08], [ 0.0000e+00, -1.0245e-08, 1.3867e-06, ..., 4.7684e-07, -1.0245e-08, 1.3132e-07], ..., [ 2.7940e-09, 7.4506e-09, 4.0326e-07, ..., -7.1712e-07, 5.9605e-08, -2.2538e-07], [ 5.5879e-09, 9.3132e-10, 1.8761e-05, ..., 2.0545e-06, 8.1025e-08, 5.5879e-09], [ 4.6566e-09, 0.0000e+00, 8.8960e-06, ..., 1.0412e-06, -9.0152e-07, 2.5146e-08]], device='cuda:0') Epoch 210, bias, value: tensor([-9.2448e-03, -3.5078e-02, 1.2827e-02, -2.1133e-02, 1.7217e-02, 1.0109e-02, 1.9483e-02, -8.8837e-05, -3.5133e-02, -3.5224e-03], device='cuda:0'), grad: tensor([ 7.3090e-06, 6.8769e-06, 4.2692e-06, -5.5611e-05, 5.4501e-06, 9.9018e-06, 3.0827e-07, -4.2468e-05, 3.4660e-05, 2.9385e-05], device='cuda:0') 100 0.0001 changing lr epoch 209, time 253.45, cls_loss 0.0017 cls_loss_mapping 0.0035 cls_loss_causal 0.5425 re_mapping 0.0058 re_causal 0.0173 /// teacc 99.01 lr 0.00010000 Epoch 211, weight, value: tensor([[ 0.0280, -0.1497, -0.1247, ..., -0.2520, -0.0958, -0.1300], [ 0.0381, -0.0640, 0.0349, ..., 0.0481, 0.0954, -0.0370], [-0.0661, 0.1241, -0.1533, ..., 0.0640, 0.0746, -0.0390], ..., [-0.0781, -0.0803, -0.0736, ..., 0.0027, -0.1431, 0.1192], [ 0.0707, -0.0224, 0.0676, ..., -0.0015, -0.1805, -0.0083], [-0.1515, -0.0604, -0.0745, ..., -0.1688, 0.0483, -0.0963]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 0.0000e+00, 3.6322e-08, ..., 9.3132e-09, 2.0955e-07, 0.0000e+00], [ 1.8626e-09, 3.7253e-09, -3.4086e-07, ..., -1.0896e-07, -4.0233e-07, 0.0000e+00], [ 1.0245e-08, 1.0245e-08, 1.1548e-07, ..., 5.0291e-08, 1.4901e-07, 0.0000e+00], ..., [ 2.4214e-08, 3.3528e-08, 2.0489e-07, ..., -4.6566e-09, 3.5949e-07, -0.0000e+00], [ 3.8184e-08, -5.4017e-08, 1.0151e-07, ..., -1.3318e-07, 1.7509e-06, 0.0000e+00], [ 1.0990e-07, 9.3132e-10, 5.4017e-08, ..., 1.2852e-07, 5.2899e-07, 0.0000e+00]], device='cuda:0') Epoch 211, bias, value: tensor([-0.0092, -0.0351, 0.0126, -0.0212, 0.0180, 0.0104, 0.0195, 0.0018, -0.0360, -0.0050], device='cuda:0'), grad: tensor([-4.8522e-07, -3.6135e-07, 5.9605e-07, 1.9465e-07, -3.7923e-06, -1.2703e-06, -2.1830e-06, 8.1956e-08, 3.3751e-06, 3.8221e-06], device='cuda:0') 100 0.0001 changing lr epoch 210, time 253.09, cls_loss 0.0022 cls_loss_mapping 0.0033 cls_loss_causal 0.5148 re_mapping 0.0057 re_causal 0.0170 /// teacc 99.05 lr 0.00010000 Epoch 212, weight, value: tensor([[ 0.0276, -0.1492, -0.1254, ..., -0.2536, -0.0970, -0.1300], [ 0.0382, -0.0642, 0.0356, ..., 0.0482, 0.0958, -0.0370], [-0.0662, 0.1244, -0.1537, ..., 0.0634, 0.0742, -0.0391], ..., [-0.0777, -0.0805, -0.0741, ..., 0.0029, -0.1433, 0.1192], [ 0.0706, -0.0224, 0.0673, ..., -0.0017, -0.1820, -0.0083], [-0.1523, -0.0610, -0.0750, ..., -0.1694, 0.0492, -0.0963]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-08, 7.6368e-08, ..., 6.2399e-08, 1.6671e-07, 0.0000e+00], [ 0.0000e+00, 4.6566e-09, -6.5193e-09, ..., 3.0734e-08, 1.8720e-07, 4.6566e-09], [ 0.0000e+00, 4.4145e-07, 1.5674e-06, ..., 1.6075e-06, 2.7008e-08, 9.3132e-10], ..., [ 0.0000e+00, 8.3819e-09, 3.5390e-08, ..., 4.3772e-08, 6.7167e-06, -1.4901e-08], [ 0.0000e+00, -6.2212e-07, -2.1588e-06, ..., -2.1625e-06, 7.3574e-08, 1.8626e-09], [ 0.0000e+00, 1.7695e-08, 7.5437e-08, ..., 7.1712e-08, 7.2177e-07, 5.5879e-09]], device='cuda:0') Epoch 212, bias, value: tensor([-0.0100, -0.0349, 0.0114, -0.0209, 0.0155, 0.0101, 0.0232, 0.0021, -0.0367, -0.0045], device='cuda:0'), grad: tensor([ 7.8510e-07, 9.3039e-07, 4.7907e-06, 4.2841e-08, -3.2216e-05, -9.7789e-08, 5.4855e-07, 2.7880e-05, -6.0089e-06, 3.3490e-06], device='cuda:0') 100 0.0001 changing lr epoch 211, time 253.06, cls_loss 0.0023 cls_loss_mapping 0.0038 cls_loss_causal 0.5333 re_mapping 0.0058 re_causal 0.0169 /// teacc 98.96 lr 0.00010000 Epoch 213, weight, value: tensor([[ 0.0280, -0.1492, -0.1266, ..., -0.2554, -0.0970, -0.1302], [ 0.0384, -0.0645, 0.0344, ..., 0.0481, 0.0943, -0.0371], [-0.0663, 0.1237, -0.1542, ..., 0.0625, 0.0742, -0.0392], ..., [-0.0803, -0.0794, -0.0744, ..., 0.0033, -0.1438, 0.1193], [ 0.0698, -0.0228, 0.0668, ..., -0.0025, -0.1834, -0.0083], [-0.1541, -0.0613, -0.0757, ..., -0.1700, 0.0490, -0.0966]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.6764e-08, 9.3132e-09, ..., 4.2841e-08, 1.1791e-06, 3.7253e-09], [ 0.0000e+00, 6.5193e-09, -1.0896e-07, ..., 3.4925e-07, 6.4075e-07, 1.0617e-07], [ 0.0000e+00, -1.8720e-07, 3.0734e-08, ..., -1.1260e-06, -1.6941e-06, -2.6077e-07], ..., [ 0.0000e+00, 9.8720e-08, 2.9802e-08, ..., 5.9605e-08, -1.2536e-06, 1.3039e-08], [-1.8626e-09, 2.1420e-08, -8.8476e-08, ..., -1.1176e-08, 1.3411e-07, 4.6566e-09], [ 0.0000e+00, 1.4901e-08, 1.1176e-08, ..., 1.5832e-07, 1.6941e-06, 2.7940e-09]], device='cuda:0') Epoch 213, bias, value: tensor([-0.0099, -0.0363, 0.0103, -0.0205, 0.0162, 0.0102, 0.0250, 0.0026, -0.0380, -0.0050], device='cuda:0'), grad: tensor([ 7.8138e-07, 2.2911e-06, -2.1048e-06, 8.7731e-07, 7.4040e-07, -6.3851e-06, 3.0827e-07, -8.8587e-06, 1.9893e-06, 1.0341e-05], device='cuda:0') 100 0.0001 changing lr epoch 212, time 253.74, cls_loss 0.0021 cls_loss_mapping 0.0028 cls_loss_causal 0.5151 re_mapping 0.0060 re_causal 0.0173 /// teacc 98.93 lr 0.00010000 Epoch 214, weight, value: tensor([[ 0.0282, -0.1492, -0.1269, ..., -0.2568, -0.0970, -0.1302], [ 0.0385, -0.0654, 0.0352, ..., 0.0501, 0.0966, -0.0371], [-0.0662, 0.1222, -0.1547, ..., 0.0595, 0.0726, -0.0392], ..., [-0.0809, -0.0804, -0.0751, ..., 0.0028, -0.1451, 0.1194], [ 0.0697, -0.0232, 0.0674, ..., -0.0027, -0.1845, -0.0083], [-0.1547, -0.0615, -0.0762, ..., -0.1700, 0.0490, -0.0966]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.0361e-07, 2.7940e-09, ..., 8.2422e-07, 6.9477e-07, 0.0000e+00], [-4.6566e-09, 2.6431e-06, -1.0990e-07, ..., 9.0674e-06, 1.1034e-05, 0.0000e+00], [ 0.0000e+00, -5.0068e-06, 1.3039e-08, ..., -1.6674e-05, -1.9565e-05, 0.0000e+00], ..., [ 1.8626e-09, 3.7067e-07, 4.1910e-08, ..., 9.1828e-07, 7.9535e-07, -9.3132e-10], [-8.8476e-08, 1.2340e-06, -3.7812e-07, ..., 3.2410e-06, 2.8275e-06, 0.0000e+00], [ 9.3132e-10, 3.0734e-08, 1.0245e-08, ..., 8.8755e-07, 2.2743e-06, 0.0000e+00]], device='cuda:0') Epoch 214, bias, value: tensor([-0.0098, -0.0344, 0.0077, -0.0198, 0.0160, 0.0102, 0.0250, 0.0018, -0.0379, -0.0049], device='cuda:0'), grad: tensor([ 2.2445e-06, 2.5034e-05, -4.6551e-05, 1.4035e-06, 7.0687e-07, 1.0338e-06, 2.6338e-06, 2.5015e-06, 8.2627e-06, 2.7586e-06], device='cuda:0') 100 0.0001 changing lr epoch 213, time 253.45, cls_loss 0.0017 cls_loss_mapping 0.0033 cls_loss_causal 0.5323 re_mapping 0.0057 re_causal 0.0170 /// teacc 98.94 lr 0.00010000 Epoch 215, weight, value: tensor([[ 0.0282, -0.1489, -0.1271, ..., -0.2588, -0.0970, -0.1303], [ 0.0386, -0.0660, 0.0355, ..., 0.0509, 0.0975, -0.0372], [-0.0662, 0.1229, -0.1550, ..., 0.0591, 0.0721, -0.0392], ..., [-0.0811, -0.0810, -0.0754, ..., 0.0025, -0.1458, 0.1195], [ 0.0697, -0.0239, 0.0682, ..., -0.0022, -0.1851, -0.0083], [-0.1549, -0.0621, -0.0768, ..., -0.1711, 0.0489, -0.0967]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 5.8673e-08, ..., 5.9605e-08, -1.0975e-05, 1.8626e-09], [ 0.0000e+00, 1.1176e-08, 1.7229e-07, ..., 9.7696e-07, -1.0151e-07, 1.2573e-07], [ 0.0000e+00, 5.2154e-08, 1.3690e-07, ..., 1.2010e-05, -8.2888e-07, 1.6904e-06], ..., [ 0.0000e+00, -8.5682e-08, 3.9395e-07, ..., -1.5453e-05, 2.3562e-07, -2.0918e-06], [ 0.0000e+00, 1.2107e-08, 2.9989e-07, ..., 1.4128e-06, 1.1176e-07, 1.6950e-07], [ 9.3132e-10, 9.3132e-10, 5.7742e-08, ..., 3.2783e-07, 1.1034e-05, 1.7695e-08]], device='cuda:0') Epoch 215, bias, value: tensor([-0.0097, -0.0336, 0.0073, -0.0201, 0.0160, 0.0102, 0.0251, 0.0015, -0.0377, -0.0052], device='cuda:0'), grad: tensor([-5.4240e-05, 1.3195e-05, 2.4050e-05, -4.5933e-06, 4.4592e-06, 7.1377e-06, -3.9395e-07, -7.1347e-05, 3.7886e-06, 7.7903e-05], device='cuda:0') 100 0.0001 changing lr epoch 214, time 251.07, cls_loss 0.0029 cls_loss_mapping 0.0046 cls_loss_causal 0.5478 re_mapping 0.0059 re_causal 0.0169 /// teacc 99.03 lr 0.00010000 Epoch 216, weight, value: tensor([[ 0.0281, -0.1490, -0.1273, ..., -0.2607, -0.0971, -0.1307], [ 0.0387, -0.0678, 0.0328, ..., 0.0488, 0.0948, -0.0373], [-0.0662, 0.1232, -0.1556, ..., 0.0589, 0.0720, -0.0394], ..., [-0.0814, -0.0810, -0.0754, ..., 0.0027, -0.1460, 0.1197], [ 0.0696, -0.0239, 0.0687, ..., -0.0017, -0.1859, -0.0083], [-0.1556, -0.0624, -0.0781, ..., -0.1716, 0.0489, -0.0967]], device='cuda:0'), grad: tensor([[ 6.5193e-09, 2.7940e-09, 3.8780e-06, ..., 1.8887e-06, 8.4750e-08, 7.4506e-09], [-2.6580e-06, 1.8626e-09, -6.9700e-06, ..., -5.3719e-06, -3.2663e-05, 2.0210e-07], [ 1.2945e-07, -6.2399e-08, 1.0161e-06, ..., 1.4864e-06, 1.4771e-06, 2.9895e-07], ..., [ 9.3132e-10, 1.5832e-08, 1.2964e-06, ..., -1.5814e-06, 5.0291e-08, -9.5833e-07], [ 1.0245e-08, 3.0734e-08, 1.3560e-06, ..., 1.2387e-06, 1.9278e-07, 2.6543e-07], [ 9.3132e-10, 1.8626e-09, 2.0228e-06, ..., 9.8441e-07, 8.0094e-08, 5.3085e-08]], device='cuda:0') Epoch 216, bias, value: tensor([-0.0095, -0.0364, 0.0069, -0.0198, 0.0162, 0.0101, 0.0280, 0.0017, -0.0377, -0.0055], device='cuda:0'), grad: tensor([ 9.2387e-06, -4.0948e-05, 6.8434e-06, -2.4185e-05, 3.9116e-08, -9.2015e-06, 5.2214e-05, -6.4746e-06, 6.4895e-06, 5.9344e-06], device='cuda:0') 100 0.0001 changing lr epoch 215, time 248.37, cls_loss 0.0017 cls_loss_mapping 0.0033 cls_loss_causal 0.4911 re_mapping 0.0057 re_causal 0.0167 /// teacc 99.01 lr 0.00010000 Epoch 217, weight, value: tensor([[ 0.0282, -0.1488, -0.1279, ..., -0.2624, -0.0971, -0.1308], [ 0.0389, -0.0680, 0.0330, ..., 0.0487, 0.0950, -0.0374], [-0.0662, 0.1234, -0.1561, ..., 0.0590, 0.0721, -0.0394], ..., [-0.0813, -0.0812, -0.0756, ..., 0.0027, -0.1465, 0.1198], [ 0.0695, -0.0239, 0.0702, ..., -0.0019, -0.1877, -0.0084], [-0.1558, -0.0627, -0.0785, ..., -0.1717, 0.0489, -0.0968]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 5.5414e-07, 5.5879e-09, ..., 6.3144e-07, 8.8289e-07, 3.7253e-09], [ 9.3132e-10, 8.3819e-09, -3.3993e-07, ..., 9.4064e-08, -2.8219e-07, 3.6322e-08], [-6.8918e-08, -8.3540e-07, 1.8626e-08, ..., -1.3271e-06, -1.8943e-06, 6.5193e-09], ..., [ 1.8626e-09, 1.2107e-08, 1.5926e-07, ..., -1.1735e-07, 3.2876e-07, -5.4017e-08], [ 2.7940e-09, 1.3039e-08, 1.1176e-08, ..., 2.8871e-08, 8.3819e-08, 8.3819e-09], [ 1.8626e-09, 3.6322e-08, 9.4064e-08, ..., 8.1956e-08, 2.0117e-07, 1.1176e-08]], device='cuda:0') Epoch 217, bias, value: tensor([-0.0094, -0.0363, 0.0069, -0.0193, 0.0165, 0.0099, 0.0277, 0.0017, -0.0372, -0.0057], device='cuda:0'), grad: tensor([ 1.8487e-06, -1.1735e-07, -4.7348e-06, 1.9018e-06, -4.9826e-07, -3.7067e-07, 2.1234e-07, -1.4622e-07, 3.0827e-07, 1.5954e-06], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 216---------------------------------------------------- epoch 216, time 264.90, cls_loss 0.0017 cls_loss_mapping 0.0027 cls_loss_causal 0.4842 re_mapping 0.0053 re_causal 0.0157 /// teacc 99.10 lr 0.00010000 Epoch 218, weight, value: tensor([[ 0.0281, -0.1492, -0.1261, ..., -0.2637, -0.0967, -0.1289], [ 0.0397, -0.0710, 0.0330, ..., 0.0487, 0.0949, -0.0375], [-0.0662, 0.1243, -0.1560, ..., 0.0591, 0.0725, -0.0387], ..., [-0.0816, -0.0813, -0.0757, ..., 0.0027, -0.1468, 0.1200], [ 0.0676, -0.0242, 0.0696, ..., -0.0019, -0.1895, -0.0085], [-0.1567, -0.0630, -0.0790, ..., -0.1720, 0.0488, -0.0970]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 7.7300e-08, 1.6764e-08, ..., 9.5926e-08, 9.7789e-08, 2.7940e-09], [ 0.0000e+00, 1.6671e-07, -6.0536e-08, ..., 2.1607e-07, -1.3970e-08, 2.6077e-08], [ 9.3132e-10, 4.2934e-07, 1.7323e-07, ..., 5.3365e-07, -7.5437e-08, 2.7940e-09], ..., [ 0.0000e+00, -3.5614e-06, 3.9116e-08, ..., -3.3341e-06, -1.5637e-06, -7.3574e-08], [ 0.0000e+00, 1.3039e-08, -5.9325e-07, ..., -4.9733e-07, 4.1910e-08, 4.6566e-09], [ 9.3132e-10, 1.9558e-06, 1.8161e-07, ..., 1.9204e-06, 6.5565e-07, 2.9802e-08]], device='cuda:0') Epoch 218, bias, value: tensor([-0.0086, -0.0364, 0.0073, -0.0192, 0.0173, 0.0101, 0.0274, 0.0017, -0.0383, -0.0060], device='cuda:0'), grad: tensor([-5.8394e-07, 8.5216e-07, 2.3246e-06, 3.9116e-07, 5.0291e-06, 5.8394e-07, 3.0920e-07, -1.3016e-05, -1.5125e-06, 5.6364e-06], device='cuda:0') 100 0.0001 changing lr epoch 217, time 247.69, cls_loss 0.0013 cls_loss_mapping 0.0024 cls_loss_causal 0.5140 re_mapping 0.0055 re_causal 0.0173 /// teacc 98.97 lr 0.00010000 Epoch 219, weight, value: tensor([[ 0.0281, -0.1465, -0.1256, ..., -0.2647, -0.0967, -0.1291], [ 0.0397, -0.0712, 0.0331, ..., 0.0487, 0.0951, -0.0377], [-0.0662, 0.1244, -0.1563, ..., 0.0592, 0.0725, -0.0387], ..., [-0.0816, -0.0814, -0.0758, ..., 0.0027, -0.1472, 0.1202], [ 0.0675, -0.0241, 0.0697, ..., -0.0020, -0.1901, -0.0086], [-0.1567, -0.0654, -0.0793, ..., -0.1719, 0.0487, -0.0972]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8487e-07, 2.0955e-08, ..., 8.1956e-08, 1.8999e-07, 0.0000e+00], [-2.7940e-09, 6.5193e-08, -1.7034e-06, ..., -8.3400e-07, -3.2447e-06, 4.6566e-09], [ 9.3132e-10, -3.4161e-06, 1.0608e-06, ..., -7.7812e-07, -7.1013e-07, 4.6566e-10], ..., [ 4.6566e-10, 2.8666e-06, 2.3935e-07, ..., 1.1884e-06, 2.7660e-06, -2.9802e-08], [ 0.0000e+00, 1.8161e-08, 6.4261e-08, ..., 8.8941e-08, 1.7509e-07, 0.0000e+00], [ 0.0000e+00, 1.1921e-07, 6.8918e-08, ..., 2.1094e-07, 2.1420e-07, 1.8161e-08]], device='cuda:0') Epoch 219, bias, value: tensor([-0.0078, -0.0363, 0.0073, -0.0192, 0.0171, 0.0095, 0.0273, 0.0016, -0.0385, -0.0062], device='cuda:0'), grad: tensor([-6.0648e-06, -4.2692e-06, -2.9672e-06, 5.3756e-06, 6.7521e-08, -6.2957e-06, 6.0871e-06, 5.3644e-06, 6.2305e-07, 2.0582e-06], device='cuda:0') 100 0.0001 changing lr epoch 218, time 247.39, cls_loss 0.0019 cls_loss_mapping 0.0027 cls_loss_causal 0.4991 re_mapping 0.0054 re_causal 0.0165 /// teacc 98.92 lr 0.00010000 Epoch 220, weight, value: tensor([[ 0.0278, -0.1466, -0.1260, ..., -0.2663, -0.0965, -0.1292], [ 0.0398, -0.0715, 0.0331, ..., 0.0486, 0.0951, -0.0379], [-0.0662, 0.1247, -0.1563, ..., 0.0595, 0.0727, -0.0389], ..., [-0.0816, -0.0819, -0.0758, ..., 0.0026, -0.1478, 0.1206], [ 0.0676, -0.0242, 0.0699, ..., -0.0019, -0.1902, -0.0086], [-0.1568, -0.0656, -0.0798, ..., -0.1728, 0.0485, -0.0981]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-08, ..., 9.3132e-09, 3.1013e-07, -1.5832e-08], [ 9.3132e-10, 0.0000e+00, 6.0536e-08, ..., 4.0047e-08, 1.2107e-08, 1.1176e-08], [ 0.0000e+00, 2.7940e-09, 5.6811e-08, ..., 5.6811e-08, 1.2107e-08, 9.3132e-09], ..., [ 9.3132e-10, 0.0000e+00, 2.9802e-08, ..., -3.4459e-08, 3.5390e-08, -2.0489e-08], [ 0.0000e+00, -3.7253e-09, -9.2480e-07, ..., -3.1851e-07, 2.1420e-08, 5.5879e-09], [ 1.8626e-09, 0.0000e+00, 6.1467e-08, ..., 4.6566e-08, -1.8626e-09, 2.0489e-08]], device='cuda:0') Epoch 220, bias, value: tensor([-0.0073, -0.0363, 0.0077, -0.0194, 0.0171, 0.0104, 0.0271, 0.0019, -0.0387, -0.0071], device='cuda:0'), grad: tensor([-3.5688e-06, 4.3400e-07, 4.6380e-07, 7.3276e-06, 2.6096e-06, 1.0729e-05, 6.2399e-08, 5.5879e-07, -1.2433e-06, -1.7434e-05], device='cuda:0') 100 0.0001 changing lr epoch 219, time 247.62, cls_loss 0.0016 cls_loss_mapping 0.0035 cls_loss_causal 0.5294 re_mapping 0.0056 re_causal 0.0173 /// teacc 98.97 lr 0.00010000 Epoch 221, weight, value: tensor([[ 0.0276, -0.1466, -0.1262, ..., -0.2672, -0.0965, -0.1292], [ 0.0398, -0.0717, 0.0332, ..., 0.0486, 0.0950, -0.0380], [-0.0662, 0.1248, -0.1562, ..., 0.0595, 0.0734, -0.0389], ..., [-0.0817, -0.0819, -0.0760, ..., 0.0026, -0.1482, 0.1207], [ 0.0676, -0.0243, 0.0747, ..., 0.0027, -0.1896, -0.0086], [-0.1570, -0.0656, -0.0805, ..., -0.1732, 0.0484, -0.0983]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 8.3819e-09, 1.9558e-08, ..., 1.0524e-07, 1.3690e-07, 9.3132e-10], [ 0.0000e+00, 1.3970e-08, -1.1753e-06, ..., -5.6531e-07, -9.4902e-07, 5.5879e-09], [ 0.0000e+00, -1.3504e-07, 2.1886e-07, ..., -2.4773e-06, -2.8647e-06, -5.5879e-09], ..., [ 0.0000e+00, 1.0245e-08, 9.4716e-07, ..., 6.5751e-07, 7.0687e-07, -0.0000e+00], [ 0.0000e+00, 8.0094e-08, 3.3248e-07, ..., 2.4829e-06, 2.3674e-06, 9.3132e-10], [ 0.0000e+00, 8.3819e-09, 5.8487e-07, ..., 6.8452e-07, 8.5868e-07, 0.0000e+00]], device='cuda:0') Epoch 221, bias, value: tensor([-0.0070, -0.0364, 0.0082, -0.0192, 0.0173, 0.0099, 0.0261, 0.0019, -0.0344, -0.0073], device='cuda:0'), grad: tensor([ 2.6822e-07, -2.3395e-06, -5.6326e-06, 1.2666e-06, -1.1828e-06, -5.5581e-06, 4.2003e-07, 1.9595e-06, 6.3293e-06, 4.4741e-06], device='cuda:0') 100 0.0001 changing lr epoch 220, time 247.67, cls_loss 0.0015 cls_loss_mapping 0.0029 cls_loss_causal 0.5269 re_mapping 0.0057 re_causal 0.0167 /// teacc 98.97 lr 0.00010000 Epoch 222, weight, value: tensor([[ 0.0276, -0.1466, -0.1264, ..., -0.2684, -0.0965, -0.1289], [ 0.0399, -0.0718, 0.0333, ..., 0.0487, 0.0951, -0.0383], [-0.0662, 0.1250, -0.1566, ..., 0.0595, 0.0734, -0.0390], ..., [-0.0820, -0.0820, -0.0762, ..., 0.0024, -0.1487, 0.1208], [ 0.0676, -0.0244, 0.0747, ..., 0.0027, -0.1897, -0.0087], [-0.1581, -0.0660, -0.0812, ..., -0.1739, 0.0491, -0.0984]], device='cuda:0'), grad: tensor([[-1.6298e-07, 1.8626e-09, 3.7253e-09, ..., 9.3132e-09, 6.0238e-06, 9.3132e-10], [ 2.3283e-08, 8.3819e-09, -7.0781e-08, ..., 4.4703e-08, 2.5425e-07, 2.6077e-08], [ 3.2596e-08, -2.7008e-08, 2.2352e-08, ..., -3.6322e-08, 2.7753e-06, 1.8626e-09], ..., [ 3.7253e-09, 1.0245e-08, 7.4506e-08, ..., -2.8871e-08, 3.4925e-07, -5.5879e-08], [ 1.3039e-08, 9.3132e-10, -9.3132e-08, ..., -3.0734e-08, 2.4028e-07, 7.4506e-09], [ 6.2399e-08, 0.0000e+00, 7.4506e-09, ..., 4.0978e-08, 7.0967e-07, 1.6764e-08]], device='cuda:0') Epoch 222, bias, value: tensor([-0.0069, -0.0364, 0.0082, -0.0194, 0.0163, 0.0101, 0.0260, 0.0019, -0.0345, -0.0067], device='cuda:0'), grad: tensor([ 8.9481e-06, 1.3625e-06, 6.2175e-06, 4.5598e-06, 1.7453e-06, -4.2245e-06, -2.3276e-05, 4.8056e-07, 8.7637e-07, 3.2894e-06], device='cuda:0') 100 0.0001 changing lr epoch 221, time 247.90, cls_loss 0.0020 cls_loss_mapping 0.0029 cls_loss_causal 0.5174 re_mapping 0.0061 re_causal 0.0169 /// teacc 99.03 lr 0.00010000 Epoch 223, weight, value: tensor([[ 0.0274, -0.1468, -0.1271, ..., -0.2694, -0.0966, -0.1289], [ 0.0398, -0.0721, 0.0333, ..., 0.0485, 0.0951, -0.0384], [-0.0662, 0.1248, -0.1568, ..., 0.0589, 0.0730, -0.0390], ..., [-0.0821, -0.0814, -0.0762, ..., 0.0033, -0.1479, 0.1209], [ 0.0678, -0.0248, 0.0749, ..., 0.0028, -0.1899, -0.0087], [-0.1594, -0.0661, -0.0819, ..., -0.1745, 0.0500, -0.0985]], device='cuda:0'), grad: tensor([[ 4.7404e-07, 9.3132e-10, 3.3062e-07, ..., 2.5146e-08, 7.8231e-08, 3.7253e-09], [ 6.6124e-08, 0.0000e+00, 1.6671e-07, ..., 3.6601e-07, -5.1223e-08, 2.2352e-08], [ 6.4541e-07, -7.4506e-09, 1.4240e-06, ..., 1.0710e-06, 5.1968e-07, 1.1176e-08], ..., [ 2.3637e-06, 3.7253e-09, 1.6997e-06, ..., -2.8871e-07, 4.2189e-07, -5.6811e-08], [ 1.0254e-06, 9.3132e-10, -3.9395e-07, ..., -1.1669e-06, -4.5355e-07, 5.5879e-09], [ 1.0217e-06, 0.0000e+00, 6.9011e-07, ..., 6.7055e-08, 1.3690e-07, 6.5193e-09]], device='cuda:0') Epoch 223, bias, value: tensor([-0.0070, -0.0365, 0.0073, -0.0193, 0.0158, 0.0103, 0.0259, 0.0026, -0.0345, -0.0058], device='cuda:0'), grad: tensor([ 1.0245e-05, 2.6599e-06, 1.6510e-05, 1.8969e-05, 2.3037e-05, -1.7011e-04, 6.1877e-06, 5.0038e-05, 1.9729e-05, 2.2829e-05], device='cuda:0') 100 0.0001 changing lr epoch 222, time 247.73, cls_loss 0.0019 cls_loss_mapping 0.0027 cls_loss_causal 0.5130 re_mapping 0.0061 re_causal 0.0175 /// teacc 98.91 lr 0.00010000 Epoch 224, weight, value: tensor([[ 0.0273, -0.1463, -0.1275, ..., -0.2728, -0.0966, -0.1290], [ 0.0400, -0.0732, 0.0335, ..., 0.0486, 0.0952, -0.0381], [-0.0667, 0.1252, -0.1571, ..., 0.0600, 0.0742, -0.0390], ..., [-0.0791, -0.0811, -0.0768, ..., 0.0032, -0.1492, 0.1207], [ 0.0656, -0.0262, 0.0748, ..., 0.0025, -0.1902, -0.0088], [-0.1619, -0.0667, -0.0843, ..., -0.1751, 0.0493, -0.0987]], device='cuda:0'), grad: tensor([[ 4.6566e-09, 3.7253e-09, 3.8184e-08, ..., 5.6811e-08, 4.7497e-08, 2.6077e-08], [ 3.4459e-08, 3.7253e-09, 3.8221e-06, ..., 8.8662e-06, -7.9256e-07, 5.5283e-06], [ 2.7940e-09, -5.8673e-08, 3.1479e-07, ..., 1.8254e-07, -9.9652e-08, 1.4715e-07], ..., [ 2.7381e-07, 1.9558e-08, -6.9924e-06, ..., -1.4499e-05, 1.1111e-06, -9.1866e-06], [ 8.9407e-08, 4.6566e-09, -2.1681e-06, ..., -1.2284e-06, 2.2724e-07, 7.5437e-07], [ 1.6764e-07, 1.8626e-09, 3.0268e-07, ..., 5.3924e-07, 1.5590e-06, 1.6857e-07]], device='cuda:0') Epoch 224, bias, value: tensor([-0.0073, -0.0363, 0.0082, -0.0199, 0.0166, 0.0105, 0.0260, 0.0023, -0.0348, -0.0066], device='cuda:0'), grad: tensor([-3.8370e-07, 4.2349e-05, 1.3914e-06, 2.4125e-05, -7.8231e-06, 4.8578e-06, 6.3423e-07, -6.8069e-05, -2.0303e-07, 3.0696e-06], device='cuda:0') 100 0.0001 changing lr epoch 223, time 247.47, cls_loss 0.0029 cls_loss_mapping 0.0046 cls_loss_causal 0.5051 re_mapping 0.0059 re_causal 0.0167 /// teacc 98.92 lr 0.00010000 Epoch 225, weight, value: tensor([[ 0.0272, -0.1466, -0.1277, ..., -0.2748, -0.0967, -0.1290], [ 0.0401, -0.0740, 0.0336, ..., 0.0487, 0.0953, -0.0381], [-0.0668, 0.1257, -0.1548, ..., 0.0631, 0.0774, -0.0392], ..., [-0.0791, -0.0814, -0.0773, ..., 0.0028, -0.1504, 0.1208], [ 0.0656, -0.0261, 0.0749, ..., 0.0026, -0.1903, -0.0088], [-0.1623, -0.0672, -0.0852, ..., -0.1779, 0.0492, -0.0988]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.0489e-08, ..., 2.7008e-08, 1.0245e-08, 0.0000e+00], [-2.7940e-09, 0.0000e+00, 2.7940e-08, ..., 7.9162e-08, -5.4017e-08, 1.8626e-09], [ 0.0000e+00, -1.8626e-09, 2.5332e-07, ..., 6.1840e-07, 1.1176e-08, 0.0000e+00], ..., [ 1.8626e-09, 1.8626e-09, 5.0385e-07, ..., 5.0757e-07, 4.1910e-08, -3.7253e-09], [ 0.0000e+00, 0.0000e+00, -3.4459e-08, ..., -9.3132e-10, 5.1223e-08, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 8.1956e-08, ..., 1.0245e-07, -3.2783e-07, 9.3132e-10]], device='cuda:0') Epoch 225, bias, value: tensor([-0.0071, -0.0363, 0.0112, -0.0222, 0.0167, 0.0101, 0.0260, 0.0021, -0.0347, -0.0070], device='cuda:0'), grad: tensor([ 1.0710e-07, 1.9930e-07, 1.3197e-06, -4.1462e-06, 8.5495e-07, 5.0850e-07, 2.3190e-07, 1.5562e-06, 1.7416e-07, -8.0280e-07], device='cuda:0') 100 0.0001 changing lr epoch 224, time 248.40, cls_loss 0.0016 cls_loss_mapping 0.0028 cls_loss_causal 0.5237 re_mapping 0.0057 re_causal 0.0166 /// teacc 98.93 lr 0.00010000 Epoch 226, weight, value: tensor([[ 0.0269, -0.1458, -0.1299, ..., -0.2768, -0.0967, -0.1291], [ 0.0405, -0.0744, 0.0341, ..., 0.0496, 0.0958, -0.0382], [-0.0669, 0.1255, -0.1547, ..., 0.0633, 0.0776, -0.0394], ..., [-0.0791, -0.0818, -0.0789, ..., 0.0016, -0.1533, 0.1210], [ 0.0654, -0.0254, 0.0749, ..., 0.0027, -0.1906, -0.0088], [-0.1630, -0.0680, -0.0856, ..., -0.1784, 0.0492, -0.0988]], device='cuda:0'), grad: tensor([[ 7.5437e-08, 4.6566e-09, 1.6503e-06, ..., 9.3132e-08, 2.4606e-06, 9.3132e-10], [-2.0582e-07, 3.5390e-08, -5.4538e-06, ..., -4.4703e-07, -7.8008e-06, 1.8626e-08], [ 1.5832e-08, 6.8732e-07, 4.4238e-07, ..., 5.2340e-07, 6.9011e-07, 8.3819e-09], ..., [ 2.5146e-08, -9.2853e-07, 1.0207e-06, ..., -4.3400e-07, 1.0459e-06, -4.6566e-08], [ 3.9116e-08, 3.2596e-08, 6.3237e-07, ..., 7.0781e-08, 1.2955e-06, 5.5879e-09], [ 6.2399e-08, 3.5390e-08, 2.9150e-07, ..., 1.6484e-07, 6.0070e-07, 5.5879e-09]], device='cuda:0') Epoch 226, bias, value: tensor([-7.2164e-03, -3.5470e-02, 1.1452e-02, -2.2439e-02, 1.6789e-02, 1.0644e-02, 2.5866e-02, 7.7172e-05, -3.5012e-02, -7.2658e-03], device='cuda:0'), grad: tensor([ 5.4911e-06, -1.5825e-05, 3.1665e-06, -1.2815e-06, 6.7614e-07, 2.1327e-06, 1.8505e-06, -1.5972e-06, 2.7716e-06, 2.5779e-06], device='cuda:0') 100 0.0001 changing lr epoch 225, time 248.04, cls_loss 0.0019 cls_loss_mapping 0.0028 cls_loss_causal 0.5010 re_mapping 0.0055 re_causal 0.0161 /// teacc 98.94 lr 0.00010000 Epoch 227, weight, value: tensor([[ 0.0268, -0.1460, -0.1300, ..., -0.2787, -0.0970, -0.1297], [ 0.0406, -0.0731, 0.0343, ..., 0.0502, 0.0962, -0.0387], [-0.0670, 0.1247, -0.1549, ..., 0.0632, 0.0774, -0.0395], ..., [-0.0782, -0.0814, -0.0794, ..., 0.0012, -0.1542, 0.1214], [ 0.0650, -0.0253, 0.0750, ..., 0.0027, -0.1908, -0.0089], [-0.1634, -0.0692, -0.0857, ..., -0.1798, 0.0493, -0.0990]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 3.1665e-08, 5.5879e-08, ..., 1.1921e-07, 1.3039e-07, 0.0000e+00], [-1.8626e-09, -1.8431e-06, -2.0061e-06, ..., -3.8408e-06, -4.8876e-06, -3.7253e-09], [ 5.5879e-09, 1.5376e-06, 1.2135e-06, ..., 3.6396e-06, 5.0738e-06, 9.3132e-10], ..., [ 4.6566e-09, -6.8080e-07, 4.4517e-07, ..., -8.5495e-07, 1.8794e-06, 9.3132e-10], [ 1.8626e-09, 3.1013e-07, 1.9744e-07, ..., 7.2643e-07, 1.3402e-06, 0.0000e+00], [ 1.8626e-09, 2.1514e-07, 4.3772e-08, ..., -8.1211e-07, -7.8455e-06, 0.0000e+00]], device='cuda:0') Epoch 227, bias, value: tensor([-0.0073, -0.0354, 0.0113, -0.0224, 0.0168, 0.0105, 0.0259, 0.0006, -0.0351, -0.0075], device='cuda:0'), grad: tensor([ 4.9546e-07, 3.4552e-07, 1.0885e-05, -6.3702e-07, 7.4506e-06, 2.0787e-06, 1.4994e-07, -3.4235e-06, 3.4608e-06, -2.0847e-05], device='cuda:0') 100 0.0001 changing lr epoch 226, time 248.15, cls_loss 0.0013 cls_loss_mapping 0.0025 cls_loss_causal 0.5182 re_mapping 0.0057 re_causal 0.0172 /// teacc 98.88 lr 0.00010000 Epoch 228, weight, value: tensor([[ 0.0267, -0.1462, -0.1302, ..., -0.2814, -0.0971, -0.1298], [ 0.0409, -0.0713, 0.0346, ..., 0.0507, 0.0966, -0.0388], [-0.0670, 0.1250, -0.1552, ..., 0.0632, 0.0774, -0.0396], ..., [-0.0781, -0.0821, -0.0798, ..., 0.0009, -0.1553, 0.1216], [ 0.0646, -0.0252, 0.0750, ..., 0.0026, -0.1911, -0.0092], [-0.1636, -0.0700, -0.0861, ..., -0.1801, 0.0495, -0.0990]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 1.1176e-08, ..., 8.3819e-09, 1.3970e-08, 0.0000e+00], [-0.0000e+00, 0.0000e+00, -2.6785e-06, ..., -2.2575e-06, -4.2282e-06, -5.5879e-09], [ 0.0000e+00, -1.2107e-08, 8.6706e-07, ..., 7.3668e-07, 1.2992e-06, 0.0000e+00], ..., [ 0.0000e+00, 6.5193e-09, 1.6736e-06, ..., 1.4082e-06, 2.6934e-06, 3.7253e-09], [ 0.0000e+00, 9.3132e-10, -8.1956e-08, ..., -2.5146e-08, 7.0781e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 6.7987e-08, ..., 5.2154e-08, 2.9895e-07, 0.0000e+00]], device='cuda:0') Epoch 228, bias, value: tensor([-0.0073, -0.0351, 0.0112, -0.0224, 0.0165, 0.0110, 0.0257, 0.0003, -0.0353, -0.0072], device='cuda:0'), grad: tensor([-8.6613e-08, -5.9716e-06, 2.0489e-06, -2.3600e-06, -1.2405e-06, 2.0824e-06, 2.2072e-07, 4.1276e-06, 2.0396e-07, 9.4529e-07], device='cuda:0') 100 0.0001 changing lr epoch 227, time 248.30, cls_loss 0.0025 cls_loss_mapping 0.0036 cls_loss_causal 0.5173 re_mapping 0.0060 re_causal 0.0165 /// teacc 98.93 lr 0.00010000 Epoch 229, weight, value: tensor([[ 0.0268, -0.1464, -0.1275, ..., -0.2792, -0.0976, -0.1299], [ 0.0410, -0.0715, 0.0346, ..., 0.0506, 0.0964, -0.0390], [-0.0671, 0.1247, -0.1556, ..., 0.0631, 0.0771, -0.0398], ..., [-0.0782, -0.0818, -0.0800, ..., 0.0010, -0.1563, 0.1217], [ 0.0646, -0.0250, 0.0750, ..., 0.0024, -0.1913, -0.0093], [-0.1639, -0.0698, -0.0886, ..., -0.1793, 0.0487, -0.0991]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 5.5879e-09, ..., 6.5193e-09, 2.5779e-06, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, -5.1223e-08, ..., -9.3132e-10, -4.8429e-08, 0.0000e+00], [ 0.0000e+00, -1.8626e-08, 5.4017e-08, ..., 2.6077e-08, -7.4506e-09, 0.0000e+00], ..., [ 0.0000e+00, 5.5879e-09, 8.8476e-08, ..., 9.7789e-08, 2.2352e-08, 0.0000e+00], [ 0.0000e+00, 8.3819e-09, -4.6566e-09, ..., 2.7940e-09, 7.6368e-08, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 1.3970e-08, ..., 6.5193e-09, 1.1642e-07, 0.0000e+00]], device='cuda:0') Epoch 229, bias, value: tensor([-7.1250e-03, -3.5381e-02, 1.1009e-02, -2.2298e-02, 1.9389e-02, 1.0911e-02, 2.5526e-02, -9.1543e-05, -3.5612e-02, -7.4394e-03], device='cuda:0'), grad: tensor([ 6.6608e-06, -4.4703e-08, 9.5926e-08, 6.8918e-07, 1.5367e-07, -4.5806e-05, 3.4899e-05, 2.7381e-07, 1.2424e-06, 1.8552e-06], device='cuda:0') 100 0.0001 changing lr epoch 228, time 248.12, cls_loss 0.0013 cls_loss_mapping 0.0023 cls_loss_causal 0.5292 re_mapping 0.0058 re_causal 0.0171 /// teacc 98.99 lr 0.00010000 Epoch 230, weight, value: tensor([[ 0.0269, -0.1468, -0.1274, ..., -0.2797, -0.0972, -0.1304], [ 0.0410, -0.0716, 0.0347, ..., 0.0506, 0.0965, -0.0393], [-0.0671, 0.1247, -0.1559, ..., 0.0631, 0.0771, -0.0407], ..., [-0.0782, -0.0819, -0.0801, ..., 0.0013, -0.1564, 0.1222], [ 0.0647, -0.0245, 0.0751, ..., 0.0025, -0.1915, -0.0093], [-0.1641, -0.0700, -0.0891, ..., -0.1801, 0.0488, -0.0994]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 1.1176e-08, ..., 1.2107e-08, 9.5926e-08, 6.5193e-09], [-2.0489e-08, 1.2107e-08, -5.4669e-07, ..., -2.1607e-07, -4.8615e-07, 1.5832e-08], [ 2.7940e-09, -5.5879e-09, 6.8918e-08, ..., 5.4017e-08, 6.4261e-08, 9.3132e-09], ..., [ 1.0245e-08, -4.6566e-09, 2.0117e-07, ..., -1.6578e-07, 2.3749e-07, -8.7544e-08], [ 3.7253e-09, 2.7940e-09, -1.5087e-07, ..., -1.0058e-07, 2.6170e-07, 1.0245e-08], [ 0.0000e+00, 8.3819e-09, 8.8476e-08, ..., 1.6950e-07, 7.1712e-08, 1.0245e-08]], device='cuda:0') Epoch 230, bias, value: tensor([-0.0062, -0.0353, 0.0110, -0.0223, 0.0190, 0.0111, 0.0255, 0.0001, -0.0357, -0.0079], device='cuda:0'), grad: tensor([-6.8098e-06, -6.9942e-07, 4.3679e-07, 2.9430e-07, 1.1735e-07, -2.1048e-07, -7.4692e-07, 4.7833e-06, 6.7707e-07, 2.1495e-06], device='cuda:0') 100 0.0001 changing lr epoch 229, time 248.03, cls_loss 0.0017 cls_loss_mapping 0.0029 cls_loss_causal 0.4792 re_mapping 0.0054 re_causal 0.0157 /// teacc 98.96 lr 0.00010000 Epoch 231, weight, value: tensor([[ 0.0268, -0.1470, -0.1277, ..., -0.2810, -0.0973, -0.1309], [ 0.0410, -0.0694, 0.0356, ..., 0.0523, 0.0980, -0.0399], [-0.0671, 0.1238, -0.1580, ..., 0.0621, 0.0754, -0.0410], ..., [-0.0782, -0.0815, -0.0795, ..., 0.0030, -0.1553, 0.1229], [ 0.0647, -0.0253, 0.0749, ..., 0.0022, -0.1921, -0.0094], [-0.1639, -0.0702, -0.0893, ..., -0.1806, 0.0486, -0.0995]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.7940e-09, 1.5832e-07, ..., 1.9837e-07, -1.8626e-08, 1.8626e-09], [ 9.3132e-10, 2.7940e-09, 2.8405e-07, ..., 4.4703e-07, 4.6007e-07, 9.3132e-10], [ 9.3132e-10, -3.6322e-08, 2.5444e-06, ..., 3.2093e-06, 1.1837e-06, 9.3132e-10], ..., [ 0.0000e+00, 1.7695e-08, 1.1986e-06, ..., 1.3933e-06, 4.2096e-07, -9.3132e-10], [-0.0000e+00, 7.4506e-09, 4.4517e-07, ..., 6.0722e-07, 2.7381e-07, 1.8626e-09], [ 0.0000e+00, 9.3132e-10, 3.8650e-07, ..., 4.9919e-07, 7.3295e-07, 0.0000e+00]], device='cuda:0') Epoch 231, bias, value: tensor([-0.0063, -0.0346, 0.0103, -0.0224, 0.0192, 0.0113, 0.0255, 0.0016, -0.0362, -0.0083], device='cuda:0'), grad: tensor([-3.0827e-07, 2.0675e-06, 9.6709e-06, -2.0385e-05, -8.4564e-07, 3.8296e-06, -3.4291e-06, 4.1872e-06, 2.0433e-06, 3.1367e-06], device='cuda:0') 100 0.0001 changing lr epoch 230, time 248.03, cls_loss 0.0016 cls_loss_mapping 0.0027 cls_loss_causal 0.5028 re_mapping 0.0054 re_causal 0.0156 /// teacc 99.08 lr 0.00010000 Epoch 232, weight, value: tensor([[ 0.0260, -0.1472, -0.1278, ..., -0.2813, -0.0974, -0.1311], [ 0.0409, -0.0711, 0.0359, ..., 0.0521, 0.0979, -0.0400], [-0.0674, 0.1229, -0.1581, ..., 0.0621, 0.0756, -0.0410], ..., [-0.0765, -0.0823, -0.0802, ..., 0.0027, -0.1561, 0.1230], [ 0.0647, -0.0256, 0.0749, ..., 0.0022, -0.1922, -0.0094], [-0.1647, -0.0713, -0.0898, ..., -0.1818, 0.0486, -0.0997]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.1176e-08, 5.8673e-08, ..., 1.5832e-07, 1.0338e-07, 0.0000e+00], [ 9.3132e-10, 1.3039e-08, 2.8685e-07, ..., 9.1270e-07, 2.0210e-07, 0.0000e+00], [ 0.0000e+00, -1.7881e-07, 1.0012e-06, ..., 5.3085e-06, 1.9539e-06, 0.0000e+00], ..., [ 0.0000e+00, 9.6858e-08, -8.8289e-07, ..., -7.9721e-06, -2.9933e-06, -1.8626e-09], [ 0.0000e+00, 1.5832e-08, -2.2985e-06, ..., -1.7490e-06, -6.5472e-07, 0.0000e+00], [ 1.8626e-09, 4.6566e-09, 3.2689e-07, ..., 1.4165e-06, 7.2923e-07, 1.8626e-09]], device='cuda:0') Epoch 232, bias, value: tensor([-0.0062, -0.0346, 0.0102, -0.0221, 0.0193, 0.0107, 0.0255, 0.0012, -0.0363, -0.0084], device='cuda:0'), grad: tensor([ 6.0908e-07, 3.0398e-06, 2.1413e-05, 2.2873e-06, -9.3319e-07, 8.7637e-07, 1.8561e-06, -3.2753e-05, -2.7008e-06, 6.3255e-06], device='cuda:0') 100 0.0001 changing lr epoch 231, time 248.27, cls_loss 0.0012 cls_loss_mapping 0.0022 cls_loss_causal 0.5090 re_mapping 0.0052 re_causal 0.0162 /// teacc 99.02 lr 0.00010000 Epoch 233, weight, value: tensor([[ 0.0261, -0.1485, -0.1280, ..., -0.2819, -0.0974, -0.1311], [ 0.0412, -0.0725, 0.0361, ..., 0.0520, 0.0979, -0.0401], [-0.0675, 0.1241, -0.1582, ..., 0.0621, 0.0758, -0.0410], ..., [-0.0760, -0.0829, -0.0804, ..., 0.0027, -0.1566, 0.1231], [ 0.0642, -0.0255, 0.0750, ..., 0.0023, -0.1924, -0.0095], [-0.1659, -0.0723, -0.0905, ..., -0.1823, 0.0486, -0.0997]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 3.7253e-09, 1.0012e-06, ..., 4.6566e-07, -1.4901e-08, 0.0000e+00], [ 5.5879e-09, 4.0978e-08, -4.1537e-07, ..., -2.6915e-07, -6.9384e-07, 9.3132e-10], [ 9.3132e-10, -1.4622e-07, -1.2852e-07, ..., -5.1595e-07, -1.0263e-06, 0.0000e+00], ..., [ 9.3132e-10, 6.3330e-08, 2.6450e-07, ..., 3.5670e-07, 6.0722e-07, -2.7940e-09], [ 1.2107e-08, 3.7253e-09, -2.4643e-06, ..., -1.0645e-06, 1.0431e-07, 0.0000e+00], [ 1.3970e-08, 1.7695e-08, 2.5705e-07, ..., 2.1048e-07, 5.9232e-07, 9.3132e-10]], device='cuda:0') Epoch 233, bias, value: tensor([-0.0062, -0.0346, 0.0103, -0.0222, 0.0194, 0.0107, 0.0255, 0.0012, -0.0363, -0.0085], device='cuda:0'), grad: tensor([ 1.8785e-06, -6.6310e-07, -2.4475e-06, 7.4320e-07, -2.5146e-07, 1.1176e-08, 1.5721e-06, 1.5749e-06, -5.0776e-06, 2.6487e-06], device='cuda:0') 100 0.0001 changing lr epoch 232, time 248.30, cls_loss 0.0012 cls_loss_mapping 0.0020 cls_loss_causal 0.5144 re_mapping 0.0055 re_causal 0.0172 /// teacc 98.97 lr 0.00010000 Epoch 234, weight, value: tensor([[ 0.0286, -0.1482, -0.1280, ..., -0.2822, -0.0974, -0.1312], [ 0.0410, -0.0700, 0.0367, ..., 0.0525, 0.0982, -0.0399], [-0.0684, 0.1229, -0.1588, ..., 0.0621, 0.0756, -0.0410], ..., [-0.0755, -0.0836, -0.0815, ..., 0.0020, -0.1574, 0.1230], [ 0.0643, -0.0257, 0.0750, ..., 0.0022, -0.1926, -0.0096], [-0.1672, -0.0731, -0.0910, ..., -0.1831, 0.0486, -0.1000]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-08, 2.5146e-08, ..., 4.4703e-08, -3.2596e-08, 1.8626e-09], [ 9.3132e-10, 1.9558e-08, -1.0617e-07, ..., 1.8440e-07, -1.1269e-07, 6.6124e-08], [ 9.3132e-10, -8.7544e-08, 4.1910e-08, ..., -4.2841e-08, -6.3330e-08, 4.6566e-09], ..., [ 9.3132e-10, 1.4901e-08, 1.5832e-08, ..., -9.8068e-07, 4.6566e-08, -5.6252e-07], [ 0.0000e+00, -2.7940e-09, -3.5390e-08, ..., -1.7695e-08, 8.4750e-08, 4.6566e-09], [ 0.0000e+00, 7.4506e-09, 2.4214e-08, ..., 2.1234e-07, 2.5146e-07, 9.4064e-08]], device='cuda:0') Epoch 234, bias, value: tensor([-0.0058, -0.0343, 0.0102, -0.0222, 0.0194, 0.0112, 0.0253, 0.0007, -0.0364, -0.0086], device='cuda:0'), grad: tensor([-4.6566e-07, 4.1816e-07, 2.0117e-07, 2.6748e-06, 7.5437e-07, -6.9011e-07, -1.2154e-06, -3.8296e-06, 3.0175e-07, 1.8599e-06], device='cuda:0') 100 0.0001 changing lr epoch 233, time 247.67, cls_loss 0.0016 cls_loss_mapping 0.0024 cls_loss_causal 0.4887 re_mapping 0.0055 re_causal 0.0156 /// teacc 99.02 lr 0.00010000 Epoch 235, weight, value: tensor([[ 0.0288, -0.1485, -0.1281, ..., -0.2827, -0.0974, -0.1312], [ 0.0439, -0.0708, 0.0371, ..., 0.0521, 0.0986, -0.0407], [-0.0689, 0.1234, -0.1590, ..., 0.0621, 0.0757, -0.0411], ..., [-0.0744, -0.0834, -0.0804, ..., 0.0038, -0.1578, 0.1240], [ 0.0611, -0.0257, 0.0740, ..., 0.0014, -0.1937, -0.0097], [-0.1684, -0.0733, -0.0917, ..., -0.1841, 0.0486, -0.1005]], device='cuda:0'), grad: tensor([[ 1.7695e-08, 4.6566e-09, 1.4808e-07, ..., 1.1362e-07, 1.6578e-07, 9.3132e-10], [ 9.8161e-07, 2.5146e-08, -4.9099e-06, ..., -1.3541e-06, -1.2949e-05, -2.1420e-08], [ 2.0396e-07, -3.7253e-08, 3.1069e-06, ..., 2.2054e-06, 3.2987e-06, 2.7940e-09], ..., [-2.2631e-07, -2.5891e-07, 9.3039e-07, ..., 3.6135e-07, 1.0487e-06, 3.7253e-09], [-1.1027e-06, 1.4342e-07, -5.1484e-06, ..., -5.2489e-06, 1.4342e-07, 1.8626e-09], [ 5.1223e-08, 5.1223e-08, -4.9453e-07, ..., 6.8918e-08, -3.2783e-07, 0.0000e+00]], device='cuda:0') Epoch 235, bias, value: tensor([-0.0062, -0.0341, 0.0102, -0.0221, 0.0193, 0.0119, 0.0254, 0.0018, -0.0382, -0.0088], device='cuda:0'), grad: tensor([ 7.0874e-07, -1.0408e-05, 8.2776e-06, 2.5369e-06, 1.0520e-05, 8.3726e-07, 2.9095e-06, 4.0196e-06, -1.3731e-05, -5.6773e-06], device='cuda:0') 100 0.0001 changing lr epoch 234, time 248.08, cls_loss 0.0018 cls_loss_mapping 0.0032 cls_loss_causal 0.5181 re_mapping 0.0056 re_causal 0.0162 /// teacc 98.99 lr 0.00010000 Epoch 236, weight, value: tensor([[ 0.0286, -0.1491, -0.1284, ..., -0.2834, -0.0977, -0.1314], [ 0.0439, -0.0708, 0.0374, ..., 0.0518, 0.0986, -0.0410], [-0.0690, 0.1238, -0.1592, ..., 0.0621, 0.0758, -0.0412], ..., [-0.0748, -0.0840, -0.0806, ..., 0.0045, -0.1584, 0.1245], [ 0.0615, -0.0250, 0.0740, ..., 0.0013, -0.1941, -0.0098], [-0.1685, -0.0737, -0.0923, ..., -0.1840, 0.0487, -0.1006]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.9802e-08, -1.6391e-07, ..., 1.5832e-08, -1.3132e-07, 0.0000e+00], [ 0.0000e+00, 9.3132e-09, 0.0000e+00, ..., 1.5274e-07, -4.1910e-08, 0.0000e+00], [ 0.0000e+00, 2.4214e-08, 4.3772e-08, ..., -2.0489e-08, -4.0047e-08, 0.0000e+00], ..., [-9.3132e-10, 1.7695e-08, 3.5390e-08, ..., -4.8988e-07, 5.4017e-08, -0.0000e+00], [ 0.0000e+00, -1.9372e-07, -1.7323e-07, ..., -1.3970e-08, 1.3970e-08, 0.0000e+00], [ 0.0000e+00, 6.3330e-08, 1.5087e-07, ..., 1.7229e-07, 1.4063e-07, 0.0000e+00]], device='cuda:0') Epoch 236, bias, value: tensor([-0.0064, -0.0343, 0.0102, -0.0221, 0.0193, 0.0131, 0.0252, 0.0022, -0.0386, -0.0087], device='cuda:0'), grad: tensor([-1.4938e-06, 6.8173e-07, 1.7509e-07, 7.1060e-07, -1.0431e-07, 1.1548e-07, 5.9884e-07, -1.8496e-06, -8.4378e-07, 1.9986e-06], device='cuda:0') 100 0.0001 changing lr epoch 235, time 247.83, cls_loss 0.0014 cls_loss_mapping 0.0027 cls_loss_causal 0.4963 re_mapping 0.0053 re_causal 0.0157 /// teacc 99.01 lr 0.00010000 Epoch 237, weight, value: tensor([[ 0.0282, -0.1496, -0.1286, ..., -0.2842, -0.0978, -0.1314], [ 0.0439, -0.0693, 0.0380, ..., 0.0522, 0.0990, -0.0410], [-0.0693, 0.1230, -0.1601, ..., 0.0620, 0.0756, -0.0412], ..., [-0.0733, -0.0848, -0.0811, ..., 0.0045, -0.1596, 0.1246], [ 0.0617, -0.0252, 0.0742, ..., 0.0015, -0.1942, -0.0098], [-0.1685, -0.0736, -0.0929, ..., -0.1844, 0.0490, -0.1006]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 4.2003e-07, 9.3132e-09, ..., 3.1199e-07, 6.6124e-08, 0.0000e+00], [ 4.6566e-09, 7.4506e-08, -5.2154e-08, ..., 7.4506e-09, -1.8161e-07, 0.0000e+00], [-2.9802e-08, -2.7329e-05, 2.5146e-07, ..., -1.9029e-05, -2.0489e-08, 0.0000e+00], ..., [-7.4506e-09, 2.3171e-05, 3.0734e-08, ..., 1.6242e-05, 3.9116e-08, 0.0000e+00], [ 1.8626e-08, 1.9558e-08, -4.0885e-07, ..., -1.7416e-07, 1.8720e-07, 0.0000e+00], [ 2.7940e-09, 2.7474e-07, 1.2107e-08, ..., 2.0768e-07, 2.5146e-08, 0.0000e+00]], device='cuda:0') Epoch 237, bias, value: tensor([-0.0065, -0.0339, 0.0100, -0.0221, 0.0188, 0.0131, 0.0252, 0.0017, -0.0385, -0.0082], device='cuda:0'), grad: tensor([ 9.2015e-07, 1.0990e-07, -5.2929e-05, 6.5379e-06, 1.5739e-07, -7.5281e-05, 7.4148e-05, 4.5210e-05, 4.2841e-07, 7.8697e-07], device='cuda:0') 100 0.0001 changing lr epoch 236, time 247.54, cls_loss 0.0015 cls_loss_mapping 0.0032 cls_loss_causal 0.4941 re_mapping 0.0050 re_causal 0.0153 /// teacc 99.00 lr 0.00010000 Epoch 238, weight, value: tensor([[ 0.0286, -0.1500, -0.1287, ..., -0.2848, -0.0979, -0.1319], [ 0.0437, -0.0693, 0.0381, ..., 0.0522, 0.0990, -0.0411], [-0.0693, 0.1232, -0.1602, ..., 0.0620, 0.0757, -0.0412], ..., [-0.0732, -0.0852, -0.0815, ..., 0.0042, -0.1605, 0.1246], [ 0.0616, -0.0250, 0.0748, ..., 0.0024, -0.1940, -0.0098], [-0.1692, -0.0738, -0.0932, ..., -0.1835, 0.0493, -0.1007]], device='cuda:0'), grad: tensor([[ 2.9802e-08, 2.8871e-08, 6.0815e-07, ..., 4.4890e-07, 6.8545e-07, 2.7940e-09], [ 1.6298e-07, 8.3819e-09, -1.5823e-06, ..., 1.1176e-07, -1.5469e-06, 1.8626e-09], [ 9.3132e-09, -9.5926e-08, 4.7870e-07, ..., 6.4261e-07, -1.3970e-08, -1.4901e-08], ..., [-3.2783e-07, 2.7940e-09, 1.4901e-07, ..., -1.5842e-06, 9.9652e-08, 0.0000e+00], [ 3.7253e-09, 1.7695e-08, -3.8650e-07, ..., 7.9162e-08, 2.5146e-07, 2.7940e-09], [ 2.6077e-08, 9.3132e-10, 7.5437e-08, ..., 1.7136e-07, 9.5926e-07, 0.0000e+00]], device='cuda:0') Epoch 238, bias, value: tensor([-0.0065, -0.0339, 0.0100, -0.0220, 0.0186, 0.0131, 0.0249, 0.0013, -0.0378, -0.0078], device='cuda:0'), grad: tensor([ 3.0436e-06, -9.8627e-07, 2.1234e-06, -1.8366e-06, -2.2743e-06, 5.1260e-06, -3.1665e-07, -8.5905e-06, 5.7742e-07, 3.1237e-06], device='cuda:0') 100 0.0001 changing lr epoch 237, time 248.08, cls_loss 0.0013 cls_loss_mapping 0.0026 cls_loss_causal 0.5226 re_mapping 0.0052 re_causal 0.0161 /// teacc 99.00 lr 0.00010000 Epoch 239, weight, value: tensor([[ 0.0286, -0.1506, -0.1289, ..., -0.2857, -0.0980, -0.1317], [ 0.0438, -0.0692, 0.0383, ..., 0.0523, 0.0991, -0.0410], [-0.0694, 0.1230, -0.1603, ..., 0.0620, 0.0757, -0.0412], ..., [-0.0731, -0.0857, -0.0820, ..., 0.0041, -0.1611, 0.1247], [ 0.0616, -0.0249, 0.0748, ..., 0.0023, -0.1942, -0.0098], [-0.1697, -0.0741, -0.0935, ..., -0.1849, 0.0494, -0.1007]], device='cuda:0'), grad: tensor([[ 1.5832e-08, 8.3819e-09, 1.2480e-07, ..., 1.5087e-07, 8.8476e-08, 0.0000e+00], [ 6.5193e-09, 1.3970e-08, -6.5193e-09, ..., 4.4703e-08, 1.6578e-07, -9.3132e-10], [ 1.2014e-07, -4.9360e-08, 8.9221e-07, ..., 1.0254e-06, 3.4552e-07, 0.0000e+00], ..., [ 2.1420e-08, 1.0245e-08, 1.8254e-07, ..., 2.0675e-07, 1.3411e-07, -1.8626e-09], [ 9.3132e-10, 6.5193e-09, -2.8871e-08, ..., 4.6566e-09, 5.4948e-08, 9.3132e-10], [ 1.8626e-09, 1.8626e-09, 8.5682e-08, ..., 5.2154e-08, 1.4836e-06, 0.0000e+00]], device='cuda:0') Epoch 239, bias, value: tensor([-0.0063, -0.0337, 0.0100, -0.0221, 0.0187, 0.0136, 0.0249, 0.0012, -0.0380, -0.0081], device='cuda:0'), grad: tensor([-4.3139e-06, 9.5088e-07, 3.4049e-06, -2.3935e-07, -4.6641e-06, -4.7125e-06, 4.6417e-06, 7.9256e-07, 2.0955e-07, 3.9116e-06], device='cuda:0') 100 0.0001 changing lr epoch 238, time 247.90, cls_loss 0.0014 cls_loss_mapping 0.0025 cls_loss_causal 0.4855 re_mapping 0.0053 re_causal 0.0153 /// teacc 99.01 lr 0.00010000 Epoch 240, weight, value: tensor([[ 0.0285, -0.1508, -0.1290, ..., -0.2861, -0.0985, -0.1321], [ 0.0437, -0.0693, 0.0383, ..., 0.0522, 0.0992, -0.0411], [-0.0694, 0.1228, -0.1606, ..., 0.0618, 0.0757, -0.0412], ..., [-0.0731, -0.0857, -0.0820, ..., 0.0043, -0.1613, 0.1250], [ 0.0619, -0.0242, 0.0751, ..., 0.0035, -0.1942, -0.0100], [-0.1705, -0.0748, -0.0942, ..., -0.1856, 0.0498, -0.1009]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.7940e-09, 4.6566e-09, ..., -1.2107e-08, 1.0990e-07, 9.3132e-10], [ 0.0000e+00, 2.7940e-09, 1.8999e-07, ..., 4.5914e-07, 3.3528e-08, 1.8626e-09], [ 0.0000e+00, -2.5146e-08, 2.4214e-08, ..., 2.1420e-08, 2.1420e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.3039e-08, 5.5879e-09, ..., -9.0897e-07, 4.7497e-08, -8.3819e-09], [ 0.0000e+00, 1.8626e-09, 2.0694e-06, ..., 2.2631e-06, 1.2862e-06, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 5.5879e-09, ..., 5.7183e-07, -8.3819e-09, 1.8626e-09]], device='cuda:0') Epoch 240, bias, value: tensor([-0.0065, -0.0338, 0.0097, -0.0220, 0.0180, 0.0136, 0.0249, 0.0014, -0.0370, -0.0077], device='cuda:0'), grad: tensor([-2.3283e-07, 3.1292e-06, 3.7253e-07, 4.7963e-07, -3.3155e-07, -7.2002e-05, 4.4286e-05, -4.0680e-06, 2.5406e-05, 2.8033e-06], device='cuda:0') 100 0.0001 changing lr epoch 239, time 249.65, cls_loss 0.0014 cls_loss_mapping 0.0021 cls_loss_causal 0.5108 re_mapping 0.0050 re_causal 0.0152 /// teacc 99.06 lr 0.00010000 Epoch 241, weight, value: tensor([[ 0.0283, -0.1511, -0.1291, ..., -0.2866, -0.0986, -0.1323], [ 0.0438, -0.0695, 0.0383, ..., 0.0522, 0.0992, -0.0411], [-0.0695, 0.1232, -0.1608, ..., 0.0618, 0.0758, -0.0420], ..., [-0.0730, -0.0867, -0.0821, ..., 0.0042, -0.1619, 0.1252], [ 0.0619, -0.0234, 0.0753, ..., 0.0041, -0.1943, -0.0100], [-0.1708, -0.0750, -0.0949, ..., -0.1866, 0.0496, -0.1010]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 8.3819e-09, 3.9954e-07, ..., 7.4506e-08, 4.2841e-08, 0.0000e+00], [-1.3970e-08, 5.2154e-08, 1.5181e-07, ..., 5.9605e-08, -2.5425e-07, 0.0000e+00], [ 0.0000e+00, -8.4750e-08, 5.6624e-07, ..., 1.9651e-07, 4.0978e-08, 0.0000e+00], ..., [ 0.0000e+00, -1.1921e-07, 1.9539e-06, ..., 1.7695e-08, 9.2201e-08, 0.0000e+00], [ 1.2107e-08, 7.7300e-08, 1.1576e-06, ..., 3.1479e-07, 1.8440e-07, 0.0000e+00], [ 0.0000e+00, 5.5879e-09, 7.9721e-06, ..., 7.8790e-07, -1.5181e-07, 0.0000e+00]], device='cuda:0') Epoch 241, bias, value: tensor([-0.0060, -0.0338, 0.0098, -0.0222, 0.0188, 0.0147, 0.0247, 0.0010, -0.0367, -0.0084], device='cuda:0'), grad: tensor([-1.7416e-07, 1.6699e-06, 1.5227e-06, -5.9456e-05, 2.1178e-06, 2.8029e-05, -4.3772e-07, 3.7700e-06, 3.4086e-06, 1.9550e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 240---------------------------------------------------- epoch 240, time 266.03, cls_loss 0.0017 cls_loss_mapping 0.0032 cls_loss_causal 0.5200 re_mapping 0.0053 re_causal 0.0160 /// teacc 99.14 lr 0.00010000 Epoch 242, weight, value: tensor([[ 0.0283, -0.1516, -0.1292, ..., -0.2870, -0.0985, -0.1326], [ 0.0439, -0.0694, 0.0381, ..., 0.0518, 0.0993, -0.0417], [-0.0696, 0.1237, -0.1610, ..., 0.0618, 0.0759, -0.0421], ..., [-0.0728, -0.0869, -0.0813, ..., 0.0049, -0.1622, 0.1256], [ 0.0619, -0.0239, 0.0753, ..., 0.0041, -0.1945, -0.0101], [-0.1714, -0.0756, -0.0958, ..., -0.1878, 0.0494, -0.1013]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 7.4506e-09, 3.9116e-08, ..., 2.6077e-08, -3.6135e-07, 9.3132e-10], [-6.0536e-08, 6.5193e-09, 8.9966e-07, ..., 5.2247e-07, -2.4121e-07, 0.0000e+00], [ 1.4901e-08, -9.3691e-07, 1.3206e-06, ..., 5.1130e-07, -1.5274e-07, 0.0000e+00], ..., [ 1.3970e-08, 1.1176e-08, 2.0955e-07, ..., 1.1828e-07, 7.3574e-08, 0.0000e+00], [-1.5739e-07, 8.9500e-07, -3.1795e-06, ..., -1.5181e-06, 1.6857e-07, 9.3132e-10], [ 2.7008e-08, 9.3132e-10, 2.4308e-07, ..., 1.1921e-07, -1.3039e-07, 0.0000e+00]], device='cuda:0') Epoch 242, bias, value: tensor([-0.0053, -0.0342, 0.0098, -0.0239, 0.0189, 0.0175, 0.0247, 0.0020, -0.0368, -0.0093], device='cuda:0'), grad: tensor([-9.4473e-06, 1.7779e-06, 8.2236e-07, -6.7130e-06, 1.5460e-06, -3.0510e-06, 1.7405e-05, 6.3889e-07, -4.2841e-06, 1.2629e-06], device='cuda:0') 100 0.0001 changing lr epoch 241, time 248.46, cls_loss 0.0010 cls_loss_mapping 0.0017 cls_loss_causal 0.4842 re_mapping 0.0052 re_causal 0.0158 /// teacc 99.12 lr 0.00010000 Epoch 243, weight, value: tensor([[ 0.0284, -0.1519, -0.1294, ..., -0.2874, -0.0986, -0.1326], [ 0.0439, -0.0692, 0.0382, ..., 0.0520, 0.0994, -0.0417], [-0.0696, 0.1239, -0.1613, ..., 0.0618, 0.0759, -0.0421], ..., [-0.0728, -0.0875, -0.0814, ..., 0.0048, -0.1628, 0.1258], [ 0.0618, -0.0240, 0.0754, ..., 0.0041, -0.1945, -0.0101], [-0.1713, -0.0768, -0.0964, ..., -0.1886, 0.0494, -0.1013]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 8.8476e-08, 5.7742e-08, ..., 2.4959e-07, 3.5111e-07, 0.0000e+00], [ 0.0000e+00, 2.7008e-08, -7.5437e-08, ..., 1.4063e-07, 8.0094e-08, 2.7940e-09], [ 0.0000e+00, -3.9022e-07, 8.9407e-08, ..., -6.7428e-07, -1.3094e-06, 2.7940e-09], ..., [ 0.0000e+00, 2.6077e-08, 3.6322e-08, ..., -4.7870e-07, 5.4948e-08, -8.3819e-09], [ 0.0000e+00, 1.3597e-07, 4.0978e-08, ..., 2.5798e-07, 4.8429e-07, 1.8626e-09], [ 0.0000e+00, 4.2841e-08, 5.5879e-09, ..., 2.6822e-07, 4.6380e-07, 0.0000e+00]], device='cuda:0') Epoch 243, bias, value: tensor([-0.0051, -0.0341, 0.0098, -0.0240, 0.0187, 0.0177, 0.0247, 0.0019, -0.0368, -0.0095], device='cuda:0'), grad: tensor([ 1.0198e-06, 5.9046e-07, -3.6843e-06, -2.3004e-07, -1.2536e-06, 7.7207e-07, 7.0408e-07, -1.4622e-06, 1.6419e-06, 1.8943e-06], device='cuda:0') 100 0.0001 changing lr epoch 242, time 247.77, cls_loss 0.0020 cls_loss_mapping 0.0032 cls_loss_causal 0.5290 re_mapping 0.0049 re_causal 0.0149 /// teacc 99.09 lr 0.00010000 Epoch 244, weight, value: tensor([[ 0.0282, -0.1522, -0.1298, ..., -0.2882, -0.0987, -0.1329], [ 0.0441, -0.0690, 0.0381, ..., 0.0520, 0.0994, -0.0418], [-0.0697, 0.1244, -0.1616, ..., 0.0618, 0.0760, -0.0434], ..., [-0.0728, -0.0878, -0.0818, ..., 0.0045, -0.1631, 0.1262], [ 0.0618, -0.0250, 0.0753, ..., 0.0038, -0.1951, -0.0103], [-0.1722, -0.0772, -0.0966, ..., -0.1897, 0.0494, -0.1014]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 3.2596e-08, ..., 1.2107e-08, 1.0245e-08, 9.3132e-10], [ 0.0000e+00, -3.0734e-08, -5.0850e-07, ..., -2.1420e-07, -3.8836e-07, 0.0000e+00], [ 0.0000e+00, 1.6764e-08, 3.3528e-07, ..., 1.4435e-07, 1.1642e-07, 0.0000e+00], ..., [ 0.0000e+00, 3.7253e-09, 3.1758e-07, ..., 1.4622e-07, 1.7136e-07, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, -1.0384e-06, ..., -2.7101e-07, 2.5146e-08, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 1.6205e-07, ..., 5.4948e-08, 6.5193e-09, 0.0000e+00]], device='cuda:0') Epoch 244, bias, value: tensor([-0.0052, -0.0343, 0.0098, -0.0239, 0.0189, 0.0175, 0.0249, 0.0016, -0.0378, -0.0093], device='cuda:0'), grad: tensor([-1.9837e-07, -9.8161e-07, 7.3854e-07, -1.5460e-07, 5.7742e-08, 7.6089e-07, 5.2527e-07, 6.5658e-07, -1.8878e-06, 4.6939e-07], device='cuda:0') 100 0.0001 changing lr epoch 243, time 247.59, cls_loss 0.0013 cls_loss_mapping 0.0020 cls_loss_causal 0.4788 re_mapping 0.0053 re_causal 0.0155 /// teacc 98.96 lr 0.00010000 Epoch 245, weight, value: tensor([[ 0.0284, -0.1524, -0.1298, ..., -0.2885, -0.0986, -0.1330], [ 0.0442, -0.0673, 0.0374, ..., 0.0515, 0.0997, -0.0420], [-0.0698, 0.1230, -0.1629, ..., 0.0615, 0.0756, -0.0428], ..., [-0.0728, -0.0878, -0.0810, ..., 0.0054, -0.1629, 0.1263], [ 0.0618, -0.0245, 0.0755, ..., 0.0043, -0.1952, -0.0103], [-0.1713, -0.0774, -0.0969, ..., -0.1901, 0.0495, -0.1015]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.0245e-08, ..., 1.3970e-08, 1.2107e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.5739e-07, ..., -2.5705e-07, -6.2957e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.9639e-07, ..., 7.7020e-07, 4.0606e-07, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 3.3621e-07, ..., 5.8580e-07, 3.0361e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -6.8918e-08, ..., -1.8626e-09, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.9360e-08, ..., 6.5193e-08, 3.9116e-08, 0.0000e+00]], device='cuda:0') Epoch 245, bias, value: tensor([-0.0048, -0.0350, 0.0094, -0.0238, 0.0191, 0.0173, 0.0248, 0.0025, -0.0376, -0.0092], device='cuda:0'), grad: tensor([ 4.9360e-08, -6.6217e-07, 2.4624e-06, -4.3735e-06, -9.1270e-08, 9.7603e-07, 3.2783e-07, 1.2843e-06, -2.3656e-07, 2.4959e-07], device='cuda:0') 100 0.0001 changing lr epoch 244, time 247.62, cls_loss 0.0014 cls_loss_mapping 0.0028 cls_loss_causal 0.4438 re_mapping 0.0052 re_causal 0.0147 /// teacc 99.05 lr 0.00010000 Epoch 246, weight, value: tensor([[ 0.0284, -0.1521, -0.1300, ..., -0.2886, -0.0986, -0.1331], [ 0.0442, -0.0673, 0.0373, ..., 0.0511, 0.0998, -0.0440], [-0.0701, 0.1230, -0.1630, ..., 0.0614, 0.0756, -0.0434], ..., [-0.0723, -0.0880, -0.0809, ..., 0.0058, -0.1631, 0.1282], [ 0.0618, -0.0246, 0.0781, ..., 0.0044, -0.1954, -0.0103], [-0.1716, -0.0785, -0.0973, ..., -0.1907, 0.0494, -0.1017]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-08, 1.2107e-08, ..., 1.3039e-08, 4.3772e-08, 0.0000e+00], [ 0.0000e+00, -4.5449e-07, -1.2936e-06, ..., -1.7146e-06, -2.6803e-06, 0.0000e+00], [ 0.0000e+00, 1.7695e-07, 7.3481e-07, ..., 8.9779e-07, 1.3504e-06, 0.0000e+00], ..., [ 0.0000e+00, 2.3749e-07, 3.5390e-07, ..., 5.9698e-07, 8.1398e-07, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, -2.2352e-08, ..., 1.4901e-08, 3.9116e-08, 0.0000e+00], [ 0.0000e+00, 7.4506e-09, 8.3819e-09, ..., 6.5193e-09, -5.0589e-06, 0.0000e+00]], device='cuda:0') Epoch 246, bias, value: tensor([-0.0050, -0.0354, 0.0093, -0.0238, 0.0193, 0.0169, 0.0244, 0.0029, -0.0354, -0.0095], device='cuda:0'), grad: tensor([-4.6007e-06, -5.2266e-06, 2.9169e-06, 7.1712e-08, 9.1493e-06, 1.7975e-07, 1.3448e-06, 1.6689e-06, 3.1777e-06, -8.7246e-06], device='cuda:0') 100 0.0001 changing lr epoch 245, time 247.87, cls_loss 0.0015 cls_loss_mapping 0.0029 cls_loss_causal 0.5163 re_mapping 0.0052 re_causal 0.0154 /// teacc 98.97 lr 0.00010000 Epoch 247, weight, value: tensor([[ 0.0283, -0.1525, -0.1310, ..., -0.2893, -0.0987, -0.1333], [ 0.0446, -0.0677, 0.0373, ..., 0.0510, 0.0998, -0.0440], [-0.0702, 0.1256, -0.1631, ..., 0.0617, 0.0761, -0.0435], ..., [-0.0706, -0.0913, -0.0809, ..., 0.0056, -0.1645, 0.1282], [ 0.0614, -0.0251, 0.0780, ..., 0.0043, -0.1959, -0.0103], [-0.1737, -0.0789, -0.0994, ..., -0.1920, 0.0494, -0.1017]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.5193e-09, 5.2154e-08, ..., 7.0781e-08, 8.1025e-08, 0.0000e+00], [ 0.0000e+00, 2.4214e-08, 2.7940e-08, ..., 5.3085e-08, 2.8871e-08, 0.0000e+00], [ 0.0000e+00, -8.3912e-07, 1.0524e-07, ..., -9.2294e-07, -1.3364e-06, 0.0000e+00], ..., [ 0.0000e+00, 1.4901e-08, 4.8429e-08, ..., 1.7509e-07, 2.2259e-07, 0.0000e+00], [ 0.0000e+00, 7.7207e-07, -1.1176e-08, ..., 5.7742e-07, 8.4843e-07, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 1.3411e-07, ..., 5.1223e-08, 1.2107e-08, 0.0000e+00]], device='cuda:0') Epoch 247, bias, value: tensor([-0.0050, -0.0354, 0.0096, -0.0236, 0.0195, 0.0168, 0.0243, 0.0028, -0.0357, -0.0098], device='cuda:0'), grad: tensor([ 3.7067e-07, 2.1327e-07, -3.2298e-06, -1.7546e-06, 1.8626e-09, 4.2003e-07, 3.0920e-07, 4.8615e-07, 2.6524e-06, 5.3458e-07], device='cuda:0') 100 0.0001 changing lr epoch 246, time 247.66, cls_loss 0.0013 cls_loss_mapping 0.0018 cls_loss_causal 0.4771 re_mapping 0.0054 re_causal 0.0150 /// teacc 98.99 lr 0.00010000 Epoch 248, weight, value: tensor([[ 0.0286, -0.1526, -0.1312, ..., -0.2896, -0.0989, -0.1334], [ 0.0446, -0.0679, 0.0374, ..., 0.0510, 0.0998, -0.0443], [-0.0703, 0.1257, -0.1633, ..., 0.0617, 0.0761, -0.0434], ..., [-0.0705, -0.0912, -0.0809, ..., 0.0057, -0.1645, 0.1284], [ 0.0614, -0.0253, 0.0780, ..., 0.0043, -0.1961, -0.0104], [-0.1739, -0.0790, -0.0997, ..., -0.1924, 0.0495, -0.1018]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 4.6566e-09, 5.5879e-09, ..., 1.7695e-08, 2.9802e-08, 9.3132e-10], [ 9.3132e-10, 2.9709e-07, -7.7561e-06, ..., -1.7762e-05, -2.1294e-05, 0.0000e+00], [-6.7987e-08, -5.0757e-07, 1.7537e-06, ..., 3.6787e-06, 3.8594e-06, 0.0000e+00], ..., [ 7.4506e-09, 1.3039e-08, 5.8077e-06, ..., 1.3448e-05, 1.6436e-05, 0.0000e+00], [ 9.3132e-10, 4.6566e-09, -4.3772e-08, ..., -1.3039e-08, 2.7008e-08, 0.0000e+00], [ 0.0000e+00, 1.3597e-07, 3.3528e-08, ..., 1.0338e-07, 3.3807e-07, 0.0000e+00]], device='cuda:0') Epoch 248, bias, value: tensor([-0.0052, -0.0355, 0.0095, -0.0235, 0.0196, 0.0167, 0.0244, 0.0029, -0.0358, -0.0099], device='cuda:0'), grad: tensor([ 6.3330e-08, -5.2005e-05, 1.0028e-05, 8.7451e-07, 1.5553e-07, -7.1898e-07, 2.6356e-07, 3.9786e-05, 2.8592e-07, 1.2787e-06], device='cuda:0') 100 0.0001 changing lr epoch 247, time 247.79, cls_loss 0.0015 cls_loss_mapping 0.0031 cls_loss_causal 0.5515 re_mapping 0.0053 re_causal 0.0161 /// teacc 99.01 lr 0.00010000 Epoch 249, weight, value: tensor([[ 0.0291, -0.1526, -0.1314, ..., -0.2914, -0.0991, -0.1335], [ 0.0446, -0.0679, 0.0375, ..., 0.0510, 0.1000, -0.0443], [-0.0706, 0.1256, -0.1635, ..., 0.0616, 0.0761, -0.0435], ..., [-0.0701, -0.0910, -0.0810, ..., 0.0057, -0.1652, 0.1284], [ 0.0615, -0.0251, 0.0782, ..., 0.0046, -0.1962, -0.0103], [-0.1760, -0.0803, -0.1014, ..., -0.1938, 0.0495, -0.1018]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.1176e-08, 6.4261e-08, ..., 3.1665e-08, 2.7008e-08, 0.0000e+00], [ 0.0000e+00, 4.1444e-07, -1.0632e-05, ..., -2.0154e-06, -4.8429e-07, 0.0000e+00], [ 0.0000e+00, -3.5483e-07, 2.7288e-07, ..., 3.2596e-08, -8.5961e-07, 0.0000e+00], ..., [ 0.0000e+00, 1.6764e-08, 2.7381e-07, ..., 1.3039e-08, 9.3132e-08, 0.0000e+00], [ 0.0000e+00, -1.1828e-07, 5.0887e-06, ..., 1.1893e-06, 5.3924e-07, 0.0000e+00], [ 0.0000e+00, 4.6566e-09, 2.9802e-07, ..., 1.3690e-07, -8.9407e-08, 0.0000e+00]], device='cuda:0') Epoch 249, bias, value: tensor([-0.0054, -0.0354, 0.0093, -0.0234, 0.0200, 0.0166, 0.0243, 0.0030, -0.0357, -0.0101], device='cuda:0'), grad: tensor([ 2.0489e-07, -2.9474e-05, -2.4587e-07, -5.1484e-06, 8.3912e-07, 1.3739e-05, 5.6997e-06, 5.0385e-07, 1.3284e-05, 5.5693e-07], device='cuda:0') 100 0.0001 changing lr epoch 248, time 247.63, cls_loss 0.0011 cls_loss_mapping 0.0022 cls_loss_causal 0.4958 re_mapping 0.0051 re_causal 0.0159 /// teacc 99.02 lr 0.00010000 Epoch 250, weight, value: tensor([[ 0.0292, -0.1528, -0.1314, ..., -0.2916, -0.0991, -0.1335], [ 0.0447, -0.0679, 0.0375, ..., 0.0510, 0.1001, -0.0443], [-0.0706, 0.1257, -0.1637, ..., 0.0616, 0.0761, -0.0435], ..., [-0.0701, -0.0912, -0.0810, ..., 0.0058, -0.1655, 0.1285], [ 0.0615, -0.0253, 0.0782, ..., 0.0044, -0.1965, -0.0103], [-0.1762, -0.0805, -0.1019, ..., -0.1946, 0.0496, -0.1018]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 8.3819e-09, 1.0245e-08, ..., 1.5832e-08, 8.5682e-08, 0.0000e+00], [ 0.0000e+00, 2.1420e-07, -3.5483e-07, ..., 2.9244e-07, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, -2.7381e-07, 3.3528e-08, ..., -4.5076e-07, -4.4517e-07, 0.0000e+00], ..., [ 0.0000e+00, 3.1665e-08, -2.7008e-08, ..., -3.7253e-08, 7.3574e-08, 0.0000e+00], [ 0.0000e+00, 8.3819e-09, 2.2352e-08, ..., 2.0489e-08, 8.0094e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 4.4703e-08, ..., 8.4750e-08, 5.4855e-07, 0.0000e+00]], device='cuda:0') Epoch 250, bias, value: tensor([-0.0052, -0.0354, 0.0093, -0.0232, 0.0200, 0.0163, 0.0242, 0.0031, -0.0359, -0.0105], device='cuda:0'), grad: tensor([ 2.4028e-07, 2.0396e-07, -1.0524e-06, 4.3213e-07, -2.7157e-06, -5.9083e-06, 7.5772e-06, -4.3660e-06, 7.8417e-07, 4.7982e-06], device='cuda:0') 100 0.0001 changing lr epoch 249, time 247.53, cls_loss 0.0010 cls_loss_mapping 0.0019 cls_loss_causal 0.4918 re_mapping 0.0052 re_causal 0.0158 /// teacc 99.07 lr 0.00010000 Epoch 251, weight, value: tensor([[ 0.0292, -0.1529, -0.1317, ..., -0.2918, -0.0992, -0.1335], [ 0.0449, -0.0678, 0.0375, ..., 0.0509, 0.1005, -0.0443], [-0.0707, 0.1258, -0.1639, ..., 0.0615, 0.0761, -0.0444], ..., [-0.0701, -0.0913, -0.0809, ..., 0.0060, -0.1662, 0.1286], [ 0.0615, -0.0253, 0.0783, ..., 0.0045, -0.1966, -0.0103], [-0.1763, -0.0808, -0.1025, ..., -0.1951, 0.0495, -0.1018]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.7940e-09, 3.7253e-09, ..., 3.7253e-09, 1.3039e-08, 0.0000e+00], [-3.7253e-09, 1.8626e-09, -7.2177e-07, ..., -4.0140e-07, -1.4137e-06, 9.3132e-10], [ 0.0000e+00, -3.4459e-08, 5.2806e-07, ..., 2.9523e-07, 1.0217e-06, 0.0000e+00], ..., [ 0.0000e+00, 2.1420e-08, 4.3772e-08, ..., 2.5146e-08, 1.0990e-07, -9.3132e-10], [ 0.0000e+00, 5.5879e-09, -1.4901e-08, ..., -6.5193e-09, 2.8871e-08, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 1.5832e-08, ..., 1.0245e-08, -7.4506e-08, 0.0000e+00]], device='cuda:0') Epoch 251, bias, value: tensor([-0.0054, -0.0354, 0.0092, -0.0232, 0.0201, 0.0164, 0.0241, 0.0031, -0.0359, -0.0107], device='cuda:0'), grad: tensor([-4.2990e-06, -2.1532e-06, 1.5777e-06, 5.7742e-08, 3.9581e-07, 3.6731e-06, 7.5158e-07, 2.6450e-07, 1.3039e-08, -2.8498e-07], device='cuda:0') 100 0.0001 changing lr epoch 250, time 247.54, cls_loss 0.0010 cls_loss_mapping 0.0015 cls_loss_causal 0.5049 re_mapping 0.0051 re_causal 0.0160 /// teacc 99.02 lr 0.00010000 Epoch 252, weight, value: tensor([[ 0.0290, -0.1535, -0.1328, ..., -0.2924, -0.0993, -0.1335], [ 0.0449, -0.0679, 0.0376, ..., 0.0509, 0.1006, -0.0443], [-0.0706, 0.1260, -0.1639, ..., 0.0615, 0.0761, -0.0444], ..., [-0.0701, -0.0914, -0.0809, ..., 0.0060, -0.1667, 0.1287], [ 0.0615, -0.0255, 0.0784, ..., 0.0044, -0.1966, -0.0103], [-0.1764, -0.0811, -0.1035, ..., -0.1954, 0.0499, -0.1019]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.0489e-08, ..., 1.0245e-08, 2.4382e-06, 9.3132e-10], [-0.0000e+00, 0.0000e+00, 2.2445e-06, ..., 2.1663e-06, -1.7323e-07, 2.4214e-07], [ 0.0000e+00, 0.0000e+00, 3.9116e-08, ..., 2.7940e-08, 1.4901e-08, 1.8626e-09], ..., [ 0.0000e+00, 0.0000e+00, -3.4440e-06, ..., -3.1460e-06, 7.1712e-08, -3.4273e-07], [ 0.0000e+00, 0.0000e+00, 7.0315e-07, ..., 6.2492e-07, 3.4459e-08, 6.7055e-08], [ 0.0000e+00, 0.0000e+00, 1.1921e-07, ..., 1.0245e-07, -2.4885e-06, 1.0245e-08]], device='cuda:0') Epoch 252, bias, value: tensor([-0.0057, -0.0354, 0.0092, -0.0230, 0.0196, 0.0163, 0.0242, 0.0031, -0.0359, -0.0102], device='cuda:0'), grad: tensor([ 5.0105e-06, 1.2785e-05, 1.9185e-07, 7.5623e-07, 3.7719e-07, 5.9605e-08, 2.6636e-07, -1.8418e-05, 3.8333e-06, -4.8354e-06], device='cuda:0') 100 0.0001 changing lr epoch 251, time 247.69, cls_loss 0.0010 cls_loss_mapping 0.0022 cls_loss_causal 0.5013 re_mapping 0.0049 re_causal 0.0151 /// teacc 99.04 lr 0.00010000 Epoch 253, weight, value: tensor([[ 0.0290, -0.1532, -0.1328, ..., -0.2925, -0.0994, -0.1337], [ 0.0449, -0.0673, 0.0377, ..., 0.0510, 0.1009, -0.0443], [-0.0706, 0.1258, -0.1643, ..., 0.0615, 0.0760, -0.0444], ..., [-0.0701, -0.0915, -0.0810, ..., 0.0059, -0.1672, 0.1288], [ 0.0616, -0.0254, 0.0786, ..., 0.0045, -0.1967, -0.0104], [-0.1765, -0.0815, -0.1040, ..., -0.1957, 0.0499, -0.1019]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 3.7253e-09, 9.3132e-09, ..., 9.3132e-09, 3.6322e-08, 1.8626e-09], [ 1.4901e-08, 1.8626e-09, -1.1828e-07, ..., -5.3085e-08, -1.4994e-07, 1.1176e-08], [ 2.7940e-09, -5.0291e-08, 5.5879e-09, ..., -2.7940e-08, 2.7940e-09, 1.8626e-09], ..., [-4.0047e-08, 3.9116e-08, 5.5879e-08, ..., -4.1910e-08, 1.4156e-07, -2.9802e-08], [ 9.3132e-10, 2.7940e-09, -0.0000e+00, ..., 4.0978e-08, 5.4017e-08, 9.3132e-10], [ 1.7695e-08, 9.3132e-10, 1.2107e-08, ..., 2.7008e-08, 8.8383e-07, 1.3039e-08]], device='cuda:0') Epoch 253, bias, value: tensor([-0.0051, -0.0353, 0.0091, -0.0230, 0.0197, 0.0162, 0.0240, 0.0030, -0.0356, -0.0105], device='cuda:0'), grad: tensor([-4.6909e-05, 5.0664e-07, 4.6827e-06, 4.9844e-06, -6.1616e-06, 6.3106e-06, 6.6124e-07, 9.4343e-07, 6.8732e-07, 3.4273e-05], device='cuda:0') 100 0.0001 changing lr epoch 252, time 247.50, cls_loss 0.0013 cls_loss_mapping 0.0016 cls_loss_causal 0.4568 re_mapping 0.0050 re_causal 0.0146 /// teacc 99.00 lr 0.00010000 Epoch 254, weight, value: tensor([[ 0.0289, -0.1536, -0.1329, ..., -0.2928, -0.0997, -0.1338], [ 0.0449, -0.0674, 0.0381, ..., 0.0516, 0.1014, -0.0444], [-0.0706, 0.1260, -0.1644, ..., 0.0615, 0.0761, -0.0445], ..., [-0.0701, -0.0917, -0.0814, ..., 0.0053, -0.1693, 0.1310], [ 0.0619, -0.0256, 0.0790, ..., 0.0046, -0.1968, -0.0104], [-0.1793, -0.0833, -0.1070, ..., -0.1970, 0.0503, -0.1048]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.1106e-07, 5.5879e-09, ..., 3.9488e-07, 4.5635e-08, 0.0000e+00], [ 0.0000e+00, 6.5193e-09, -9.6858e-08, ..., -3.2596e-08, -6.7987e-08, 0.0000e+00], [ 0.0000e+00, -1.0841e-06, 4.0978e-08, ..., -1.3346e-06, -2.9709e-07, 0.0000e+00], ..., [ 0.0000e+00, 4.9081e-07, 6.3330e-08, ..., 6.4354e-07, 1.5460e-07, 0.0000e+00], [ 0.0000e+00, 3.3528e-08, -8.3819e-08, ..., -3.7253e-09, 1.2293e-07, 0.0000e+00], [ 0.0000e+00, 5.6811e-08, 4.6566e-09, ..., 6.2399e-08, 6.9849e-08, 0.0000e+00]], device='cuda:0') Epoch 254, bias, value: tensor([-0.0043, -0.0348, 0.0091, -0.0225, 0.0192, 0.0155, 0.0239, 0.0031, -0.0353, -0.0131], device='cuda:0'), grad: tensor([ 1.1222e-06, -1.0338e-07, -3.7383e-06, 6.7893e-07, -1.2387e-07, 1.7695e-08, -5.0291e-08, 1.8533e-06, 1.0151e-07, 2.4494e-07], device='cuda:0') 100 0.0001 changing lr epoch 253, time 247.61, cls_loss 0.0012 cls_loss_mapping 0.0019 cls_loss_causal 0.4813 re_mapping 0.0050 re_causal 0.0153 /// teacc 98.97 lr 0.00010000 Epoch 255, weight, value: tensor([[ 0.0289, -0.1546, -0.1331, ..., -0.2933, -0.0999, -0.1338], [ 0.0454, -0.0674, 0.0382, ..., 0.0516, 0.1015, -0.0445], [-0.0707, 0.1262, -0.1644, ..., 0.0615, 0.0765, -0.0448], ..., [-0.0702, -0.0916, -0.0815, ..., 0.0052, -0.1698, 0.1311], [ 0.0619, -0.0260, 0.0790, ..., 0.0045, -0.1972, -0.0104], [-0.1794, -0.0840, -0.1076, ..., -0.1982, 0.0504, -0.1048]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 5.8673e-08, ..., 5.7742e-08, 9.4064e-08, 0.0000e+00], [-1.8626e-09, 1.8626e-09, 3.5111e-07, ..., 3.5577e-07, -4.8708e-07, 5.5879e-09], [ 0.0000e+00, -5.4948e-08, 3.5577e-07, ..., 1.0058e-07, -1.4249e-07, 9.3132e-10], ..., [ 9.3132e-10, 7.4506e-09, 1.7695e-07, ..., 5.1223e-08, 1.4808e-07, -1.3039e-08], [ 0.0000e+00, 2.7008e-08, -1.1437e-06, ..., -7.7486e-07, 6.8918e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3970e-08, ..., 2.5146e-08, 2.3283e-08, 4.6566e-09]], device='cuda:0') Epoch 255, bias, value: tensor([-0.0046, -0.0348, 0.0093, -0.0225, 0.0175, 0.0156, 0.0240, 0.0031, -0.0356, -0.0125], device='cuda:0'), grad: tensor([ 2.1234e-07, 2.1886e-06, 8.2236e-07, 1.2293e-07, 9.0338e-08, 9.0338e-08, 3.3900e-07, 2.5798e-07, -4.2766e-06, 1.3504e-07], device='cuda:0') 100 0.0001 changing lr epoch 254, time 247.86, cls_loss 0.0014 cls_loss_mapping 0.0024 cls_loss_causal 0.5293 re_mapping 0.0049 re_causal 0.0152 /// teacc 99.01 lr 0.00010000 Epoch 256, weight, value: tensor([[ 0.0291, -0.1551, -0.1341, ..., -0.2936, -0.1000, -0.1341], [ 0.0456, -0.0673, 0.0385, ..., 0.0521, 0.1020, -0.0447], [-0.0707, 0.1261, -0.1648, ..., 0.0614, 0.0765, -0.0457], ..., [-0.0702, -0.0918, -0.0818, ..., 0.0048, -0.1712, 0.1310], [ 0.0619, -0.0261, 0.0792, ..., 0.0048, -0.1975, -0.0096], [-0.1797, -0.0845, -0.1080, ..., -0.1992, 0.0511, -0.1049]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.5832e-08, 4.6566e-09, ..., 2.3283e-08, 2.9802e-07, 0.0000e+00], [-4.6566e-09, 2.2352e-08, -1.1176e-07, ..., 3.7253e-08, -2.4214e-08, 7.4506e-09], [ 9.3132e-10, -1.6019e-07, 2.4214e-08, ..., -2.0023e-07, -4.1816e-07, 9.3132e-10], ..., [ 1.8626e-09, 2.2352e-08, 3.7253e-08, ..., -2.9802e-08, 9.6858e-08, -1.8626e-08], [ 0.0000e+00, 1.9558e-08, 1.8626e-09, ..., 2.1420e-08, 1.8161e-07, 9.3132e-10], [ 9.3132e-10, 1.1176e-08, 1.1176e-08, ..., 2.1420e-08, -8.4750e-08, 9.3132e-10]], device='cuda:0') Epoch 256, bias, value: tensor([-0.0049, -0.0345, 0.0091, -0.0223, 0.0162, 0.0158, 0.0238, 0.0028, -0.0355, -0.0121], device='cuda:0'), grad: tensor([ 6.2771e-07, 1.0524e-07, -7.0408e-07, 6.3144e-07, 5.4855e-07, 2.9989e-07, -1.6121e-06, 1.2852e-07, 6.1654e-07, -6.2305e-07], device='cuda:0') 100 0.0001 changing lr epoch 255, time 247.95, cls_loss 0.0016 cls_loss_mapping 0.0022 cls_loss_causal 0.4694 re_mapping 0.0052 re_causal 0.0152 /// teacc 98.99 lr 0.00010000 Epoch 257, weight, value: tensor([[ 0.0292, -0.1557, -0.1342, ..., -0.2941, -0.1024, -0.1342], [ 0.0457, -0.0673, 0.0385, ..., 0.0516, 0.1015, -0.0448], [-0.0705, 0.1264, -0.1650, ..., 0.0613, 0.0765, -0.0463], ..., [-0.0703, -0.0919, -0.0818, ..., 0.0055, -0.1708, 0.1312], [ 0.0619, -0.0264, 0.0794, ..., 0.0047, -0.1977, -0.0097], [-0.1799, -0.0850, -0.1081, ..., -0.1997, 0.0531, -0.1049]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.7940e-09, 1.3970e-08, ..., 1.0245e-08, 3.0734e-08, 0.0000e+00], [-2.0489e-08, 3.8184e-08, -1.6391e-07, ..., 6.4261e-08, -1.0803e-07, 4.6566e-09], [ 9.3132e-10, -1.2666e-07, 1.7695e-07, ..., -5.5879e-08, -1.8626e-07, 0.0000e+00], ..., [ 5.5879e-09, 1.0245e-08, 1.3225e-07, ..., 2.6077e-08, 1.0990e-07, -1.2107e-08], [ 9.3132e-10, 5.8673e-08, -1.3970e-08, ..., 7.9162e-08, 1.5926e-07, 9.3132e-10], [ 1.1176e-08, 9.3132e-10, 1.6112e-07, ..., 4.5635e-08, -2.3525e-06, 3.7253e-09]], device='cuda:0') Epoch 257, bias, value: tensor([-0.0072, -0.0348, 0.0090, -0.0223, 0.0164, 0.0157, 0.0241, 0.0032, -0.0356, -0.0103], device='cuda:0'), grad: tensor([-1.5711e-06, -1.4901e-08, 4.2841e-08, 2.4252e-06, 8.5831e-06, -2.6524e-06, -1.5441e-06, 3.6135e-07, 1.1018e-06, -6.7130e-06], device='cuda:0') 100 0.0001 changing lr epoch 256, time 247.97, cls_loss 0.0016 cls_loss_mapping 0.0025 cls_loss_causal 0.4910 re_mapping 0.0053 re_causal 0.0151 /// teacc 99.06 lr 0.00010000 Epoch 258, weight, value: tensor([[ 0.0290, -0.1585, -0.1346, ..., -0.2946, -0.1028, -0.1343], [ 0.0457, -0.0676, 0.0382, ..., 0.0512, 0.1015, -0.0450], [-0.0705, 0.1270, -0.1652, ..., 0.0614, 0.0768, -0.0465], ..., [-0.0704, -0.0925, -0.0814, ..., 0.0059, -0.1715, 0.1314], [ 0.0617, -0.0251, 0.0798, ..., 0.0045, -0.1985, -0.0100], [-0.1800, -0.0856, -0.1086, ..., -0.2001, 0.0531, -0.1049]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.3970e-08, 7.6368e-08, ..., 7.3574e-08, 4.7497e-08, 0.0000e+00], [ 9.3132e-09, 6.5193e-09, -1.9651e-07, ..., 1.7695e-08, -1.5832e-07, 0.0000e+00], [ 1.8626e-09, -2.2259e-07, 2.5798e-07, ..., 1.4901e-08, -6.6124e-07, 0.0000e+00], ..., [ 9.3132e-10, 4.0978e-08, 1.5367e-07, ..., 9.3132e-09, 2.5053e-07, -0.0000e+00], [ 2.7940e-09, -7.3574e-08, -2.4773e-07, ..., 2.3283e-08, -7.9162e-08, 0.0000e+00], [ 1.8626e-09, 7.5437e-08, 2.3842e-07, ..., 4.9360e-08, 1.6298e-07, 0.0000e+00]], device='cuda:0') Epoch 258, bias, value: tensor([-0.0075, -0.0353, 0.0091, -0.0222, 0.0158, 0.0152, 0.0252, 0.0035, -0.0356, -0.0104], device='cuda:0'), grad: tensor([ 2.3749e-07, -1.6578e-07, -1.7425e-06, -8.7358e-07, 5.0571e-07, -3.4459e-08, 1.0561e-06, 6.6590e-07, -7.8604e-07, 1.1306e-06], device='cuda:0') 100 0.0001 changing lr epoch 257, time 247.83, cls_loss 0.0017 cls_loss_mapping 0.0036 cls_loss_causal 0.4669 re_mapping 0.0053 re_causal 0.0149 /// teacc 99.09 lr 0.00010000 Epoch 259, weight, value: tensor([[ 0.0289, -0.1581, -0.1336, ..., -0.2953, -0.1034, -0.1355], [ 0.0462, -0.0677, 0.0382, ..., 0.0512, 0.1015, -0.0452], [-0.0706, 0.1275, -0.1655, ..., 0.0614, 0.0772, -0.0458], ..., [-0.0705, -0.0928, -0.0814, ..., 0.0059, -0.1720, 0.1316], [ 0.0612, -0.0256, 0.0776, ..., 0.0042, -0.2022, -0.0102], [-0.1802, -0.0868, -0.1100, ..., -0.2014, 0.0532, -0.1049]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.7695e-08, -4.2506e-06, ..., 4.1910e-08, -4.9397e-06, 0.0000e+00], [ 8.3819e-09, 1.2852e-07, 9.0897e-07, ..., 1.6019e-07, 1.3076e-06, 0.0000e+00], [ 0.0000e+00, -1.1083e-07, 1.2154e-06, ..., -1.3504e-07, 5.8021e-07, 0.0000e+00], ..., [ 0.0000e+00, 5.5879e-09, 6.7055e-08, ..., 3.2596e-08, 6.6124e-08, -0.0000e+00], [ 0.0000e+00, -1.1828e-07, -1.8664e-06, ..., -2.7567e-07, 5.1223e-08, 0.0000e+00], [ 9.3132e-10, 7.4506e-09, 1.3225e-07, ..., 1.9558e-08, 3.8184e-08, 0.0000e+00]], device='cuda:0') Epoch 259, bias, value: tensor([-0.0076, -0.0353, 0.0091, -0.0219, 0.0157, 0.0147, 0.0268, 0.0035, -0.0385, -0.0103], device='cuda:0'), grad: tensor([-3.5793e-05, 8.3074e-06, 6.6459e-06, 1.3746e-06, -1.4417e-06, 5.8673e-08, 2.3410e-05, 1.7136e-07, -3.1590e-06, 4.2748e-07], device='cuda:0') 100 0.0001 changing lr epoch 258, time 247.53, cls_loss 0.0013 cls_loss_mapping 0.0024 cls_loss_causal 0.5049 re_mapping 0.0050 re_causal 0.0156 /// teacc 99.10 lr 0.00010000 Epoch 260, weight, value: tensor([[ 0.0289, -0.1609, -0.1341, ..., -0.2957, -0.1035, -0.1356], [ 0.0466, -0.0679, 0.0385, ..., 0.0517, 0.1018, -0.0461], [-0.0706, 0.1285, -0.1657, ..., 0.0616, 0.0773, -0.0461], ..., [-0.0707, -0.0941, -0.0817, ..., 0.0052, -0.1728, 0.1322], [ 0.0612, -0.0231, 0.0776, ..., 0.0040, -0.2024, -0.0102], [-0.1804, -0.0883, -0.1106, ..., -0.2026, 0.0529, -0.1050]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.7940e-09, 6.5193e-09, ..., 7.4506e-09, 6.0499e-06, 0.0000e+00], [ 0.0000e+00, 2.0023e-08, 9.3132e-09, ..., 4.5169e-08, 1.6065e-07, 0.0000e+00], [ 4.6566e-10, -1.8943e-06, 9.0804e-08, ..., -3.1739e-06, -2.0415e-06, 0.0000e+00], ..., [ 0.0000e+00, 1.8282e-06, 2.8871e-08, ..., 3.1497e-06, 2.3134e-06, 0.0000e+00], [ 4.6566e-10, 1.8626e-08, 4.9826e-08, ..., 5.1223e-08, 4.2701e-07, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 2.7940e-09, ..., 3.7253e-09, 3.1013e-07, 0.0000e+00]], device='cuda:0') Epoch 260, bias, value: tensor([-0.0074, -0.0350, 0.0092, -0.0218, 0.0161, 0.0148, 0.0267, 0.0032, -0.0387, -0.0107], device='cuda:0'), grad: tensor([ 2.2292e-05, 6.7893e-07, -9.0152e-06, -3.6741e-07, 4.8392e-06, 1.1921e-06, -3.2991e-05, 1.0371e-05, 1.7099e-06, 1.2629e-06], device='cuda:0') 100 0.0001 changing lr epoch 259, time 247.95, cls_loss 0.0011 cls_loss_mapping 0.0022 cls_loss_causal 0.5100 re_mapping 0.0049 re_causal 0.0149 /// teacc 99.04 lr 0.00010000 Epoch 261, weight, value: tensor([[ 0.0294, -0.1612, -0.1341, ..., -0.2969, -0.1035, -0.1360], [ 0.0467, -0.0683, 0.0387, ..., 0.0518, 0.1019, -0.0461], [-0.0710, 0.1289, -0.1662, ..., 0.0616, 0.0775, -0.0463], ..., [-0.0708, -0.0945, -0.0819, ..., 0.0049, -0.1738, 0.1322], [ 0.0612, -0.0232, 0.0777, ..., 0.0039, -0.2026, -0.0103], [-0.1807, -0.0888, -0.1111, ..., -0.2027, 0.0528, -0.1050]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.3970e-09, 1.2293e-07, ..., 8.0559e-08, 1.3225e-07, 0.0000e+00], [ 4.6566e-10, 1.3970e-09, -1.2564e-06, ..., -8.5449e-07, -1.3309e-06, 0.0000e+00], [ 3.7253e-09, -1.7229e-08, 5.4995e-07, ..., 4.2748e-07, 6.2492e-07, 0.0000e+00], ..., [ 9.3132e-10, 1.2107e-08, 2.7521e-07, ..., 9.6858e-08, 2.6869e-07, 0.0000e+00], [ 3.2596e-09, 1.8626e-09, 6.0536e-08, ..., 5.4948e-08, 1.2433e-07, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 5.0291e-08, ..., 4.0047e-08, 4.9360e-08, 0.0000e+00]], device='cuda:0') Epoch 261, bias, value: tensor([-0.0072, -0.0349, 0.0092, -0.0205, 0.0165, 0.0131, 0.0267, 0.0030, -0.0388, -0.0109], device='cuda:0'), grad: tensor([ 3.1386e-07, -2.6729e-06, 1.3392e-06, -5.2620e-08, -1.7695e-08, 3.4180e-07, -3.8464e-07, 4.3120e-07, 4.4284e-07, 2.7148e-07], device='cuda:0') 100 0.0001 changing lr epoch 260, time 247.75, cls_loss 0.0013 cls_loss_mapping 0.0030 cls_loss_causal 0.5037 re_mapping 0.0047 re_causal 0.0141 /// teacc 99.12 lr 0.00010000 Epoch 262, weight, value: tensor([[ 0.0304, -0.1614, -0.1340, ..., -0.2977, -0.1038, -0.1361], [ 0.0466, -0.0685, 0.0389, ..., 0.0520, 0.1021, -0.0461], [-0.0713, 0.1299, -0.1665, ..., 0.0617, 0.0777, -0.0464], ..., [-0.0702, -0.0949, -0.0820, ..., 0.0049, -0.1744, 0.1323], [ 0.0612, -0.0247, 0.0789, ..., 0.0042, -0.2028, -0.0103], [-0.1820, -0.0892, -0.1131, ..., -0.2041, 0.0529, -0.1050]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.5879e-09, 2.8871e-08, ..., 3.8184e-08, 2.6077e-08, 9.3132e-10], [ 0.0000e+00, 1.6764e-08, 3.8836e-07, ..., 1.8347e-07, -4.8894e-07, -7.5437e-08], [ 0.0000e+00, -8.4750e-08, 2.4028e-07, ..., 1.8347e-07, -9.3132e-10, 4.6566e-09], ..., [ 0.0000e+00, 5.5879e-09, 1.0151e-07, ..., -2.5090e-06, 4.0978e-08, 1.8626e-09], [ 0.0000e+00, 4.6566e-09, -4.0196e-06, ..., -2.2911e-06, -5.4389e-07, 9.3132e-10], [ 0.0000e+00, 9.3132e-10, 2.3283e-08, ..., 2.1979e-06, 3.4124e-06, 0.0000e+00]], device='cuda:0') Epoch 262, bias, value: tensor([-0.0069, -0.0348, 0.0093, -0.0201, 0.0169, 0.0125, 0.0263, 0.0028, -0.0379, -0.0113], device='cuda:0'), grad: tensor([ 1.4342e-07, 1.0394e-06, 6.2119e-07, 2.2259e-06, -8.8662e-06, -1.0207e-06, 3.8259e-06, -9.7603e-06, -6.4634e-06, 1.8224e-05], device='cuda:0') 100 0.0001 changing lr epoch 261, time 247.75, cls_loss 0.0014 cls_loss_mapping 0.0024 cls_loss_causal 0.4844 re_mapping 0.0049 re_causal 0.0145 /// teacc 99.00 lr 0.00010000 Epoch 263, weight, value: tensor([[ 0.0303, -0.1613, -0.1343, ..., -0.2987, -0.1038, -0.1367], [ 0.0464, -0.0685, 0.0392, ..., 0.0529, 0.1027, -0.0461], [-0.0714, 0.1301, -0.1674, ..., 0.0613, 0.0775, -0.0464], ..., [-0.0700, -0.0950, -0.0822, ..., 0.0043, -0.1758, 0.1323], [ 0.0612, -0.0251, 0.0791, ..., 0.0043, -0.2030, -0.0103], [-0.1822, -0.0909, -0.1141, ..., -0.2047, 0.0530, -0.1049]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.7253e-09, 6.5193e-09, ..., 4.6566e-08, 4.0978e-08, 0.0000e+00], [ 0.0000e+00, 1.0245e-08, -1.1493e-06, ..., -8.1304e-07, -6.7335e-07, 0.0000e+00], [ 0.0000e+00, 6.2212e-07, 2.6077e-08, ..., 1.9278e-06, -1.7136e-07, 0.0000e+00], ..., [ 0.0000e+00, -8.3074e-07, 1.0813e-06, ..., -1.4659e-06, 6.2771e-07, 0.0000e+00], [ 0.0000e+00, 1.7323e-07, -1.5832e-08, ..., 3.7253e-07, 1.0431e-07, 0.0000e+00], [ 0.0000e+00, 5.5879e-09, 1.7695e-08, ..., -2.4214e-07, -6.7055e-08, 0.0000e+00]], device='cuda:0') Epoch 263, bias, value: tensor([-0.0063, -0.0345, 0.0090, -0.0198, 0.0171, 0.0126, 0.0257, 0.0026, -0.0379, -0.0116], device='cuda:0'), grad: tensor([ 1.6950e-07, -2.3656e-06, 5.0515e-06, 4.7963e-07, 2.4308e-07, -2.7008e-08, 2.1048e-07, -3.4422e-06, 3.5726e-06, -3.9190e-06], device='cuda:0') 100 0.0001 changing lr epoch 262, time 247.56, cls_loss 0.0015 cls_loss_mapping 0.0021 cls_loss_causal 0.4889 re_mapping 0.0051 re_causal 0.0147 /// teacc 99.02 lr 0.00010000 Epoch 264, weight, value: tensor([[ 0.0302, -0.1614, -0.1345, ..., -0.2995, -0.1039, -0.1368], [ 0.0469, -0.0686, 0.0372, ..., 0.0504, 0.1031, -0.0463], [-0.0714, 0.1304, -0.1678, ..., 0.0613, 0.0776, -0.0465], ..., [-0.0700, -0.0951, -0.0801, ..., 0.0069, -0.1763, 0.1326], [ 0.0614, -0.0253, 0.0794, ..., 0.0046, -0.2033, -0.0103], [-0.1825, -0.0913, -0.1146, ..., -0.2060, 0.0537, -0.1050]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 1.1176e-08, ..., 9.3132e-09, 5.7276e-07, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, -4.4238e-07, ..., -5.2061e-07, -2.9802e-08, -9.3132e-10], [ 0.0000e+00, -8.3819e-09, 1.0058e-07, ..., 1.0058e-07, 4.9267e-07, 9.3132e-10], ..., [ 0.0000e+00, 2.7940e-09, 3.4180e-07, ..., 4.1537e-07, 9.8161e-07, -3.7253e-09], [ 0.0000e+00, -9.3132e-10, -9.3132e-10, ..., -0.0000e+00, 2.6077e-08, 9.3132e-10], [ 0.0000e+00, 9.3132e-10, 1.2107e-08, ..., 1.5832e-08, 1.0906e-06, 9.3132e-10]], device='cuda:0') Epoch 264, bias, value: tensor([-0.0064, -0.0367, 0.0089, -0.0198, 0.0152, 0.0124, 0.0256, 0.0049, -0.0379, -0.0107], device='cuda:0'), grad: tensor([ 1.4137e-06, 6.6031e-07, 1.2554e-06, -2.0117e-07, -1.3329e-05, 4.6566e-08, 5.4352e-06, 1.8254e-06, 1.2480e-07, 2.7604e-06], device='cuda:0') 100 0.0001 changing lr epoch 263, time 247.68, cls_loss 0.0014 cls_loss_mapping 0.0028 cls_loss_causal 0.5213 re_mapping 0.0051 re_causal 0.0153 /// teacc 99.11 lr 0.00010000 Epoch 265, weight, value: tensor([[ 0.0301, -0.1618, -0.1347, ..., -0.3004, -0.1041, -0.1386], [ 0.0485, -0.0689, 0.0373, ..., 0.0503, 0.1043, -0.0461], [-0.0714, 0.1311, -0.1680, ..., 0.0615, 0.0780, -0.0447], ..., [-0.0701, -0.0955, -0.0802, ..., 0.0068, -0.1790, 0.1324], [ 0.0613, -0.0252, 0.0803, ..., 0.0060, -0.2035, -0.0104], [-0.1828, -0.0940, -0.1163, ..., -0.2076, 0.0535, -0.1051]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.6391e-07, 4.3772e-08, ..., 1.5274e-07, 5.8394e-07, 1.8626e-09], [-9.3132e-10, 3.7253e-08, 9.4064e-08, ..., 1.4063e-07, 4.5635e-08, 1.8626e-09], [ 0.0000e+00, -1.0217e-06, 3.3434e-07, ..., -4.6752e-07, -9.3970e-07, 9.3132e-10], ..., [ 0.0000e+00, 6.2957e-07, 4.3772e-08, ..., 2.4866e-07, 6.3796e-07, 1.8626e-09], [ 0.0000e+00, 6.0536e-08, -8.6427e-07, ..., -3.2317e-07, 3.6694e-07, 9.3132e-09], [ 0.0000e+00, 3.2596e-08, 4.5635e-08, ..., 1.6298e-07, -1.2014e-07, -3.7253e-09]], device='cuda:0') Epoch 265, bias, value: tensor([-0.0064, -0.0364, 0.0092, -0.0195, 0.0152, 0.0119, 0.0252, 0.0047, -0.0373, -0.0110], device='cuda:0'), grad: tensor([ 4.1015e-06, 6.3796e-07, -2.2948e-06, -7.2829e-07, 1.6950e-07, 2.1651e-05, -2.6315e-05, 1.2731e-06, 1.0785e-06, 4.1258e-07], device='cuda:0') 100 0.0001 changing lr epoch 264, time 247.88, cls_loss 0.0016 cls_loss_mapping 0.0021 cls_loss_causal 0.4720 re_mapping 0.0050 re_causal 0.0142 /// teacc 99.01 lr 0.00010000 Epoch 266, weight, value: tensor([[ 0.0303, -0.1610, -0.1355, ..., -0.3045, -0.1055, -0.1413], [ 0.0486, -0.0692, 0.0373, ..., 0.0503, 0.1045, -0.0463], [-0.0714, 0.1313, -0.1687, ..., 0.0613, 0.0779, -0.0435], ..., [-0.0701, -0.0957, -0.0802, ..., 0.0069, -0.1794, 0.1336], [ 0.0613, -0.0251, 0.0809, ..., 0.0071, -0.2034, -0.0108], [-0.1830, -0.0969, -0.1198, ..., -0.2090, 0.0545, -0.1050]], device='cuda:0'), grad: tensor([[-1.0617e-06, 0.0000e+00, 0.0000e+00, ..., -6.5193e-09, -1.6764e-07, 5.5879e-09], [ 3.7253e-09, 2.7940e-09, -3.0734e-08, ..., 1.4901e-08, -7.4506e-09, 1.8626e-09], [ 7.8231e-08, -3.7253e-09, 1.8626e-09, ..., 2.7940e-09, 3.3528e-08, 1.8626e-09], ..., [ 9.3132e-10, 9.3132e-10, 1.0245e-08, ..., -5.6811e-08, 2.8871e-08, -5.5879e-09], [ 1.7695e-08, 0.0000e+00, 4.6566e-09, ..., 1.8626e-09, 9.3132e-09, 9.3132e-10], [ 2.4028e-07, 0.0000e+00, 2.7940e-09, ..., 2.9802e-08, 4.5635e-08, 1.8626e-09]], device='cuda:0') Epoch 266, bias, value: tensor([-0.0064, -0.0364, 0.0090, -0.0198, 0.0155, 0.0120, 0.0251, 0.0047, -0.0366, -0.0109], device='cuda:0'), grad: tensor([-9.5218e-06, 1.2200e-07, 8.8196e-07, 2.8312e-07, 1.1269e-07, 5.6997e-07, 4.9025e-06, -3.4459e-08, 1.9651e-07, 2.4661e-06], device='cuda:0') 100 0.0001 changing lr epoch 265, time 247.55, cls_loss 0.0014 cls_loss_mapping 0.0024 cls_loss_causal 0.4972 re_mapping 0.0050 re_causal 0.0150 /// teacc 99.00 lr 0.00010000 Epoch 267, weight, value: tensor([[ 0.0307, -0.1611, -0.1375, ..., -0.3055, -0.1054, -0.1417], [ 0.0490, -0.0694, 0.0377, ..., 0.0509, 0.1051, -0.0464], [-0.0715, 0.1319, -0.1691, ..., 0.0613, 0.0780, -0.0434], ..., [-0.0701, -0.0964, -0.0805, ..., 0.0064, -0.1810, 0.1338], [ 0.0613, -0.0248, 0.0810, ..., 0.0060, -0.2037, -0.0110], [-0.1832, -0.0979, -0.1203, ..., -0.2099, 0.0544, -0.1051]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.7253e-09, -6.0908e-07, ..., 2.5146e-08, 2.5146e-08, 6.5193e-09], [ 0.0000e+00, 9.3132e-10, 3.3993e-07, ..., 8.9128e-07, -3.8464e-07, -2.2445e-07], [ 0.0000e+00, -1.2107e-08, 8.3819e-08, ..., 1.4249e-07, -1.9558e-08, 4.6566e-09], ..., [ 0.0000e+00, 9.3132e-10, -4.9826e-07, ..., -1.7183e-06, 4.3772e-08, -5.1223e-08], [ 0.0000e+00, 4.6566e-09, -7.8604e-07, ..., -6.5286e-07, 7.3574e-08, 3.0734e-08], [ 0.0000e+00, 0.0000e+00, 4.0792e-07, ..., 6.1747e-07, 9.3132e-09, 6.7987e-08]], device='cuda:0') Epoch 267, bias, value: tensor([-0.0058, -0.0360, 0.0090, -0.0198, 0.0154, 0.0123, 0.0254, 0.0044, -0.0373, -0.0117], device='cuda:0'), grad: tensor([-2.9877e-05, 1.3020e-06, 8.1304e-07, 2.8815e-06, 9.5926e-08, 5.9139e-07, 1.6123e-05, -3.8743e-06, -1.1371e-06, 1.3024e-05], device='cuda:0') 100 0.0001 changing lr epoch 266, time 247.66, cls_loss 0.0011 cls_loss_mapping 0.0017 cls_loss_causal 0.4969 re_mapping 0.0046 re_causal 0.0144 /// teacc 98.99 lr 0.00010000 Epoch 268, weight, value: tensor([[ 0.0307, -0.1611, -0.1375, ..., -0.3057, -0.1052, -0.1420], [ 0.0487, -0.0695, 0.0378, ..., 0.0509, 0.1052, -0.0467], [-0.0716, 0.1319, -0.1693, ..., 0.0612, 0.0781, -0.0460], ..., [-0.0701, -0.0962, -0.0806, ..., 0.0064, -0.1814, 0.1348], [ 0.0612, -0.0249, 0.0812, ..., 0.0061, -0.2040, -0.0118], [-0.1834, -0.0980, -0.1210, ..., -0.2107, 0.0540, -0.1052]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.4703e-08, ..., 5.6811e-08, 3.0734e-08, 9.3132e-10], [ 0.0000e+00, 9.3132e-10, -1.2014e-07, ..., -7.3574e-08, -1.6578e-07, 9.3132e-10], [ 0.0000e+00, -7.4506e-09, 1.5274e-07, ..., 1.7043e-07, 3.3528e-08, 0.0000e+00], ..., [ 0.0000e+00, 2.7940e-09, 5.9605e-08, ..., -2.7940e-08, 6.9849e-08, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, -2.3656e-07, ..., -3.1106e-07, 1.1176e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.0617e-07, ..., 1.1921e-07, -2.7940e-08, -1.0245e-08]], device='cuda:0') Epoch 268, bias, value: tensor([-0.0052, -0.0361, 0.0089, -0.0198, 0.0161, 0.0124, 0.0255, 0.0044, -0.0372, -0.0127], device='cuda:0'), grad: tensor([ 5.1875e-07, -1.5646e-07, 9.7137e-07, -2.4401e-07, -9.3132e-10, 2.2911e-07, 8.2515e-07, -8.2888e-08, -2.6226e-06, 5.5786e-07], device='cuda:0') 100 0.0001 changing lr epoch 267, time 247.67, cls_loss 0.0018 cls_loss_mapping 0.0026 cls_loss_causal 0.5013 re_mapping 0.0048 re_causal 0.0143 /// teacc 99.01 lr 0.00010000 Epoch 269, weight, value: tensor([[ 0.0307, -0.1612, -0.1380, ..., -0.3065, -0.1056, -0.1424], [ 0.0486, -0.0694, 0.0354, ..., 0.0485, 0.1042, -0.0477], [-0.0716, 0.1325, -0.1703, ..., 0.0612, 0.0781, -0.0460], ..., [-0.0701, -0.0963, -0.0780, ..., 0.0090, -0.1789, 0.1354], [ 0.0612, -0.0259, 0.0811, ..., 0.0047, -0.2055, -0.0115], [-0.1836, -0.0982, -0.1217, ..., -0.2115, 0.0535, -0.1051]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 3.7253e-09, 9.1270e-08, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, -9.6858e-08, ..., 4.6194e-07, 1.4435e-07, 3.6415e-07], [ 0.0000e+00, -3.7253e-09, 3.8184e-08, ..., 3.4459e-08, 9.0338e-08, 4.6566e-09], ..., [ 0.0000e+00, 3.7253e-09, 4.0978e-08, ..., -5.3179e-07, 1.7788e-07, -3.8277e-07], [ 0.0000e+00, 0.0000e+00, -2.2352e-08, ..., -2.0489e-08, 1.7323e-07, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 1.2107e-08, 2.2855e-06, 7.4506e-09]], device='cuda:0') Epoch 269, bias, value: tensor([-0.0051, -0.0381, 0.0089, -0.0198, 0.0173, 0.0122, 0.0243, 0.0067, -0.0377, -0.0136], device='cuda:0'), grad: tensor([ 7.6368e-08, 2.1458e-06, 3.0547e-07, 1.2387e-07, -7.6815e-06, 1.5460e-07, 1.1763e-06, -1.1129e-06, 8.8569e-07, 3.9339e-06], device='cuda:0') 100 0.0001 changing lr epoch 268, time 247.61, cls_loss 0.0011 cls_loss_mapping 0.0014 cls_loss_causal 0.4616 re_mapping 0.0050 re_causal 0.0143 /// teacc 99.05 lr 0.00010000 Epoch 270, weight, value: tensor([[ 0.0309, -0.1613, -0.1381, ..., -0.3068, -0.1057, -0.1444], [ 0.0487, -0.0695, 0.0354, ..., 0.0485, 0.1042, -0.0485], [-0.0716, 0.1326, -0.1706, ..., 0.0613, 0.0785, -0.0457], ..., [-0.0701, -0.0964, -0.0780, ..., 0.0090, -0.1790, 0.1359], [ 0.0612, -0.0259, 0.0813, ..., 0.0051, -0.2060, -0.0119], [-0.1839, -0.0980, -0.1219, ..., -0.2117, 0.0534, -0.1052]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 4.2841e-08, ..., 5.0291e-08, 8.6613e-08, 1.8626e-09], [ 0.0000e+00, 9.3132e-10, -3.3528e-08, ..., -3.9581e-07, -1.8375e-06, 5.5879e-09], [ 0.0000e+00, -3.7253e-09, 4.3586e-07, ..., 5.0105e-07, 8.7637e-07, 9.3132e-10], ..., [ 0.0000e+00, 9.3132e-10, 2.8033e-07, ..., 2.2165e-07, 3.9395e-07, -1.3970e-08], [ 0.0000e+00, 0.0000e+00, -1.6689e-06, ..., -1.2377e-06, -2.7474e-07, 8.3819e-09], [ 0.0000e+00, 0.0000e+00, 2.7940e-08, ..., 4.8429e-08, 9.3132e-09, 3.7253e-09]], device='cuda:0') Epoch 270, bias, value: tensor([-0.0049, -0.0381, 0.0090, -0.0198, 0.0182, 0.0122, 0.0240, 0.0067, -0.0380, -0.0137], device='cuda:0'), grad: tensor([ 1.9930e-07, -1.2396e-06, 1.4463e-06, 5.7556e-07, 1.9465e-07, 7.0501e-07, 7.3109e-07, 5.5414e-07, -3.2820e-06, 1.0524e-07], device='cuda:0') 100 0.0001 changing lr epoch 269, time 247.88, cls_loss 0.0013 cls_loss_mapping 0.0026 cls_loss_causal 0.4982 re_mapping 0.0047 re_causal 0.0148 /// teacc 99.00 lr 0.00010000 Epoch 271, weight, value: tensor([[ 0.0309, -0.1613, -0.1388, ..., -0.3082, -0.1059, -0.1456], [ 0.0488, -0.0696, 0.0354, ..., 0.0485, 0.1046, -0.0496], [-0.0716, 0.1331, -0.1715, ..., 0.0613, 0.0786, -0.0466], ..., [-0.0701, -0.0966, -0.0780, ..., 0.0088, -0.1793, 0.1362], [ 0.0612, -0.0265, 0.0822, ..., 0.0044, -0.2075, -0.0143], [-0.1842, -0.0982, -0.1247, ..., -0.2136, 0.0534, -0.1057]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 9.3132e-09, 5.4110e-07, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, -9.1195e-06, ..., -1.3039e-05, -1.2264e-05, 3.6880e-07], [ 0.0000e+00, -9.3132e-10, 1.1586e-06, ..., 1.6829e-06, 1.6605e-06, 2.4214e-08], ..., [ 0.0000e+00, 9.3132e-10, 7.6219e-06, ..., 1.0870e-05, 1.0565e-05, -4.0792e-07], [ 0.0000e+00, 0.0000e+00, 2.4214e-08, ..., 4.9360e-08, 5.0291e-08, 2.9802e-08], [ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 3.7253e-09, -4.4145e-07, -3.0734e-08]], device='cuda:0') Epoch 271, bias, value: tensor([-0.0049, -0.0381, 0.0091, -0.0192, 0.0182, 0.0120, 0.0243, 0.0066, -0.0380, -0.0143], device='cuda:0'), grad: tensor([ 2.0303e-06, -2.0206e-05, 3.2037e-06, 1.2256e-06, 7.6741e-06, 1.7202e-06, -1.1981e-05, 1.7717e-05, 3.6135e-07, -1.7500e-06], device='cuda:0') 100 0.0001 changing lr epoch 270, time 247.81, cls_loss 0.0010 cls_loss_mapping 0.0017 cls_loss_causal 0.5313 re_mapping 0.0049 re_causal 0.0159 /// teacc 99.02 lr 0.00010000 Epoch 272, weight, value: tensor([[ 0.0307, -0.1614, -0.1397, ..., -0.3086, -0.1061, -0.1457], [ 0.0496, -0.0696, 0.0355, ..., 0.0486, 0.1049, -0.0499], [-0.0717, 0.1333, -0.1723, ..., 0.0612, 0.0786, -0.0467], ..., [-0.0706, -0.0967, -0.0780, ..., 0.0085, -0.1796, 0.1371], [ 0.0603, -0.0265, 0.0820, ..., 0.0039, -0.2084, -0.0143], [-0.1841, -0.0983, -0.1249, ..., -0.2137, 0.0536, -0.1057]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 3.7253e-09, 2.8871e-08, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, -9.5926e-08, ..., -8.4750e-08, -3.8184e-08, -5.5879e-09], [ 0.0000e+00, -3.7253e-09, 2.6077e-08, ..., -8.5682e-08, -1.1828e-07, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 1.3877e-07, ..., 7.9162e-08, 1.4249e-07, 4.6566e-09], [ 0.0000e+00, 0.0000e+00, -1.7583e-06, ..., -4.6007e-07, 5.4017e-08, 0.0000e+00], [ 2.7940e-09, 0.0000e+00, 1.1176e-08, ..., 4.6566e-09, -1.4715e-07, -9.3132e-10]], device='cuda:0') Epoch 272, bias, value: tensor([-0.0052, -0.0380, 0.0090, -0.0187, 0.0183, 0.0119, 0.0241, 0.0065, -0.0382, -0.0141], device='cuda:0'), grad: tensor([ 8.3819e-09, 3.4459e-08, -2.6543e-07, 1.6410e-06, 1.9278e-07, -2.1700e-07, 1.2862e-06, 3.0547e-07, -2.5332e-06, -4.5914e-07], device='cuda:0') 100 0.0001 changing lr epoch 271, time 247.84, cls_loss 0.0013 cls_loss_mapping 0.0017 cls_loss_causal 0.4822 re_mapping 0.0050 re_causal 0.0140 /// teacc 99.01 lr 0.00010000 Epoch 273, weight, value: tensor([[ 0.0305, -0.1615, -0.1401, ..., -0.3092, -0.1064, -0.1462], [ 0.0497, -0.0696, 0.0355, ..., 0.0486, 0.1054, -0.0502], [-0.0718, 0.1333, -0.1732, ..., 0.0610, 0.0783, -0.0474], ..., [-0.0708, -0.0967, -0.0780, ..., 0.0086, -0.1798, 0.1373], [ 0.0604, -0.0265, 0.0833, ..., 0.0050, -0.2088, -0.0144], [-0.1844, -0.0985, -0.1253, ..., -0.2138, 0.0538, -0.1057]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 3.7253e-09, 2.4214e-08, ..., 2.7008e-08, 1.0245e-08, 0.0000e+00], [ 1.8626e-09, 1.8626e-09, 2.7940e-08, ..., 4.4703e-08, -8.3819e-09, 2.7940e-09], [ 9.3132e-10, -5.0291e-08, 3.7793e-06, ..., 4.0159e-06, 6.9849e-07, 2.6077e-08], ..., [ 9.3132e-10, 4.2841e-08, 1.6019e-07, ..., 7.4506e-08, 9.1270e-08, -3.8184e-08], [ 1.4901e-08, 2.7940e-09, 8.6613e-08, ..., 9.1270e-08, 3.0734e-08, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 2.4214e-08, ..., 3.9116e-08, -5.8301e-07, 4.6566e-09]], device='cuda:0') Epoch 273, bias, value: tensor([-0.0069, -0.0380, 0.0088, -0.0188, 0.0180, 0.0124, 0.0249, 0.0065, -0.0374, -0.0141], device='cuda:0'), grad: tensor([-9.6764e-07, 1.6298e-07, 9.0078e-06, -9.3281e-06, 3.3230e-06, -3.3248e-07, 8.8010e-07, 1.8440e-07, 3.5483e-07, -3.3118e-06], device='cuda:0') 100 0.0001 changing lr epoch 272, time 247.38, cls_loss 0.0012 cls_loss_mapping 0.0025 cls_loss_causal 0.4888 re_mapping 0.0048 re_causal 0.0146 /// teacc 98.98 lr 0.00010000 Epoch 274, weight, value: tensor([[ 0.0307, -0.1616, -0.1408, ..., -0.3095, -0.1065, -0.1463], [ 0.0500, -0.0696, 0.0356, ..., 0.0486, 0.1055, -0.0503], [-0.0718, 0.1337, -0.1737, ..., 0.0612, 0.0788, -0.0477], ..., [-0.0710, -0.0971, -0.0781, ..., 0.0085, -0.1801, 0.1374], [ 0.0600, -0.0265, 0.0835, ..., 0.0036, -0.2092, -0.0143], [-0.1848, -0.0986, -0.1255, ..., -0.2140, 0.0537, -0.1058]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 1.8626e-09, 1.1176e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -4.6566e-09, ..., 5.5879e-09, 2.0955e-07, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, 8.3819e-09, ..., 5.3085e-08, 4.6566e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.5832e-08, ..., -4.6566e-08, 3.8184e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 3.7253e-09, 4.2841e-08, 2.7940e-09], [ 9.3132e-10, 0.0000e+00, 1.1176e-08, ..., 1.0245e-08, -4.4238e-07, 0.0000e+00]], device='cuda:0') Epoch 274, bias, value: tensor([-0.0086, -0.0379, 0.0089, -0.0187, 0.0184, 0.0125, 0.0265, 0.0064, -0.0385, -0.0143], device='cuda:0'), grad: tensor([-1.2945e-07, 6.8638e-07, 1.6019e-07, -2.9895e-07, 4.3772e-07, 2.6450e-07, -1.0896e-07, -1.3039e-08, 1.2759e-07, -1.1260e-06], device='cuda:0') 100 0.0001 changing lr epoch 273, time 247.36, cls_loss 0.0014 cls_loss_mapping 0.0025 cls_loss_causal 0.4913 re_mapping 0.0049 re_causal 0.0141 /// teacc 98.91 lr 0.00010000 Epoch 275, weight, value: tensor([[ 0.0307, -0.1615, -0.1403, ..., -0.3097, -0.1063, -0.1464], [ 0.0503, -0.0697, 0.0357, ..., 0.0486, 0.1056, -0.0505], [-0.0719, 0.1339, -0.1740, ..., 0.0613, 0.0788, -0.0481], ..., [-0.0712, -0.0973, -0.0781, ..., 0.0084, -0.1803, 0.1394], [ 0.0600, -0.0265, 0.0833, ..., 0.0034, -0.2094, -0.0145], [-0.1846, -0.0990, -0.1259, ..., -0.2144, 0.0534, -0.1058]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 5.5879e-09, ..., 7.4506e-09, 7.6368e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 1.8626e-08, ..., 1.6764e-08, 1.4901e-08, 3.7253e-09], [ 0.0000e+00, -1.3039e-08, 1.6950e-07, ..., 1.0058e-07, 7.4506e-09, 1.8626e-09], ..., [ 0.0000e+00, 1.8626e-09, 6.9663e-07, ..., 5.9232e-07, 3.1851e-07, -1.6764e-08], [ 0.0000e+00, 1.8626e-09, -1.1176e-07, ..., 5.2154e-08, 2.0489e-08, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, 9.3132e-09, ..., 5.5879e-09, -7.3947e-07, 1.8626e-09]], device='cuda:0') Epoch 275, bias, value: tensor([-0.0084, -0.0379, 0.0089, -0.0186, 0.0180, 0.0127, 0.0267, 0.0065, -0.0389, -0.0146], device='cuda:0'), grad: tensor([ 2.0489e-07, 2.3469e-07, 3.5949e-07, -2.6375e-06, 6.1095e-07, 4.1164e-07, 1.4342e-07, 2.5239e-06, -2.0489e-08, -1.8403e-06], device='cuda:0') 100 0.0001 changing lr epoch 274, time 247.19, cls_loss 0.0011 cls_loss_mapping 0.0026 cls_loss_causal 0.4767 re_mapping 0.0047 re_causal 0.0141 /// teacc 98.98 lr 0.00010000 Epoch 276, weight, value: tensor([[ 0.0306, -0.1616, -0.1407, ..., -0.3099, -0.1064, -0.1465], [ 0.0507, -0.0697, 0.0358, ..., 0.0487, 0.1059, -0.0505], [-0.0721, 0.1343, -0.1747, ..., 0.0613, 0.0790, -0.0481], ..., [-0.0718, -0.0976, -0.0783, ..., 0.0084, -0.1807, 0.1394], [ 0.0599, -0.0265, 0.0836, ..., 0.0037, -0.2097, -0.0145], [-0.1847, -0.0996, -0.1260, ..., -0.2146, 0.0539, -0.1058]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 1.3039e-08, 1.4901e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -6.9104e-07, ..., 3.7253e-09, -7.4133e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 8.3819e-08, ..., 1.7881e-07, 8.7544e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 5.8860e-07, ..., -3.5204e-07, 6.0908e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.9616e-07, ..., -5.7742e-08, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.9116e-08, ..., 3.7253e-08, -3.0175e-07, 0.0000e+00]], device='cuda:0') Epoch 276, bias, value: tensor([-0.0085, -0.0378, 0.0089, -0.0185, 0.0172, 0.0129, 0.0267, 0.0063, -0.0386, -0.0141], device='cuda:0'), grad: tensor([ 7.4506e-09, -5.3458e-07, 5.5321e-07, 4.2096e-07, 1.6019e-07, 5.5879e-07, 2.0675e-07, -1.0803e-07, -4.9546e-07, -8.0653e-07], device='cuda:0') 100 0.0001 changing lr epoch 275, time 247.02, cls_loss 0.0009 cls_loss_mapping 0.0013 cls_loss_causal 0.4892 re_mapping 0.0048 re_causal 0.0141 /// teacc 99.05 lr 0.00010000 Epoch 277, weight, value: tensor([[ 0.0305, -0.1617, -0.1410, ..., -0.3101, -0.1066, -0.1466], [ 0.0507, -0.0698, 0.0360, ..., 0.0488, 0.1062, -0.0505], [-0.0722, 0.1348, -0.1752, ..., 0.0613, 0.0791, -0.0482], ..., [-0.0717, -0.0980, -0.0784, ..., 0.0083, -0.1811, 0.1394], [ 0.0599, -0.0265, 0.0838, ..., 0.0038, -0.2098, -0.0146], [-0.1847, -0.0998, -0.1265, ..., -0.2152, 0.0542, -0.1058]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 5.5879e-09, 1.4901e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.2498e-06, ..., -8.3074e-07, -1.3076e-06, 1.6764e-08], [ 0.0000e+00, 0.0000e+00, 4.5449e-07, ..., 7.0781e-07, 1.4901e-08, 9.8720e-08], ..., [ 0.0000e+00, 0.0000e+00, 5.3458e-07, ..., -2.6450e-07, 1.2387e-06, -1.6391e-07], [ 0.0000e+00, 0.0000e+00, 6.5193e-08, ..., 1.2293e-07, 1.8626e-08, 1.8626e-08], [ 0.0000e+00, 0.0000e+00, 1.8626e-08, ..., 1.8626e-08, -2.2314e-06, 0.0000e+00]], device='cuda:0') Epoch 277, bias, value: tensor([-0.0085, -0.0377, 0.0089, -0.0185, 0.0167, 0.0129, 0.0269, 0.0062, -0.0387, -0.0140], device='cuda:0'), grad: tensor([ 6.3330e-08, -2.2221e-06, 1.4286e-06, 4.4517e-07, 6.4895e-06, 6.1467e-08, -9.4995e-08, 9.4995e-08, 2.7940e-07, -6.5565e-06], device='cuda:0') 100 0.0001 changing lr epoch 276, time 247.08, cls_loss 0.0014 cls_loss_mapping 0.0022 cls_loss_causal 0.5006 re_mapping 0.0046 re_causal 0.0139 /// teacc 99.00 lr 0.00010000 Epoch 278, weight, value: tensor([[ 0.0305, -0.1617, -0.1415, ..., -0.3103, -0.1066, -0.1466], [ 0.0507, -0.0699, 0.0361, ..., 0.0488, 0.1064, -0.0505], [-0.0722, 0.1352, -0.1757, ..., 0.0613, 0.0793, -0.0482], ..., [-0.0717, -0.0982, -0.0786, ..., 0.0082, -0.1814, 0.1394], [ 0.0599, -0.0266, 0.0844, ..., 0.0048, -0.2105, -0.0146], [-0.1851, -0.1013, -0.1268, ..., -0.2169, 0.0555, -0.1058]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.7253e-09, 3.7253e-09, ..., 3.7253e-09, 2.7940e-08, 0.0000e+00], [ 0.0000e+00, 9.3132e-09, -1.8626e-09, ..., 2.9802e-08, 9.1270e-06, 0.0000e+00], [ 0.0000e+00, -6.7055e-08, 3.1665e-08, ..., -2.9802e-08, -5.2154e-08, 0.0000e+00], ..., [ 0.0000e+00, 2.9802e-08, 1.5832e-07, ..., 1.0245e-07, 2.7008e-07, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -2.9430e-07, ..., -4.0978e-08, 1.4901e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 3.9116e-08, ..., 2.4214e-08, 2.6077e-05, 0.0000e+00]], device='cuda:0') Epoch 278, bias, value: tensor([-0.0086, -0.0376, 0.0089, -0.0169, 0.0151, 0.0113, 0.0266, 0.0060, -0.0384, -0.0127], device='cuda:0'), grad: tensor([-1.7863e-06, 2.0921e-05, 1.1362e-07, 1.5087e-07, -7.8917e-05, 3.7067e-07, 1.3020e-06, 7.5810e-07, -3.6880e-07, 5.7399e-05], device='cuda:0') 100 0.0001 changing lr epoch 277, time 247.09, cls_loss 0.0012 cls_loss_mapping 0.0022 cls_loss_causal 0.5450 re_mapping 0.0050 re_causal 0.0151 /// teacc 98.97 lr 0.00010000 Epoch 279, weight, value: tensor([[ 0.0311, -0.1617, -0.1414, ..., -0.3104, -0.1067, -0.1467], [ 0.0508, -0.0703, 0.0361, ..., 0.0488, 0.1065, -0.0508], [-0.0737, 0.1359, -0.1764, ..., 0.0612, 0.0793, -0.0482], ..., [-0.0700, -0.0985, -0.0786, ..., 0.0082, -0.1823, 0.1395], [ 0.0596, -0.0258, 0.0849, ..., 0.0055, -0.2107, -0.0144], [-0.1860, -0.1055, -0.1270, ..., -0.2173, 0.0577, -0.1058]], device='cuda:0'), grad: tensor([[-5.5879e-09, 2.9802e-08, 0.0000e+00, ..., 2.2352e-08, 8.6613e-07, 0.0000e+00], [ 0.0000e+00, 2.7753e-07, -3.5390e-08, ..., 1.2480e-07, 2.7008e-07, 0.0000e+00], [ 0.0000e+00, -8.4750e-07, 2.2352e-08, ..., -4.8243e-07, -8.2888e-07, 0.0000e+00], ..., [ 0.0000e+00, 3.0175e-07, 2.0489e-08, ..., 1.9930e-07, 4.0419e-07, 0.0000e+00], [ 0.0000e+00, 5.4017e-08, 5.5879e-09, ..., 3.7253e-08, 2.4959e-07, 0.0000e+00], [ 1.8626e-09, 1.8626e-08, 0.0000e+00, ..., 1.3039e-08, -3.2783e-07, 0.0000e+00]], device='cuda:0') Epoch 279, bias, value: tensor([-0.0083, -0.0377, 0.0087, -0.0169, 0.0135, 0.0112, 0.0267, 0.0058, -0.0382, -0.0107], device='cuda:0'), grad: tensor([ 5.0142e-06, 8.6240e-07, -2.0061e-06, 2.3469e-07, 1.1548e-06, 4.4890e-07, -7.3388e-06, 1.1604e-06, 1.3877e-06, -9.2201e-07], device='cuda:0') 100 0.0001 changing lr epoch 278, time 247.05, cls_loss 0.0010 cls_loss_mapping 0.0016 cls_loss_causal 0.4781 re_mapping 0.0048 re_causal 0.0140 /// teacc 98.93 lr 0.00010000 Epoch 280, weight, value: tensor([[ 0.0338, -0.1618, -0.1417, ..., -0.3106, -0.1067, -0.1472], [ 0.0514, -0.0704, 0.0363, ..., 0.0491, 0.1071, -0.0508], [-0.0749, 0.1367, -0.1769, ..., 0.0612, 0.0792, -0.0483], ..., [-0.0722, -0.0995, -0.0787, ..., 0.0079, -0.1828, 0.1395], [ 0.0592, -0.0253, 0.0850, ..., 0.0056, -0.2111, -0.0146], [-0.1879, -0.1072, -0.1274, ..., -0.2178, 0.0581, -0.1058]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 3.7253e-09, ..., 3.7253e-09, 4.0978e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 1.5423e-06, ..., 1.1530e-06, -1.8440e-07, 0.0000e+00], [ 0.0000e+00, -1.4901e-08, 2.7940e-08, ..., 1.1176e-08, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 7.4506e-09, -2.0731e-06, ..., -1.5199e-06, 6.1467e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.0431e-07, ..., 5.5879e-08, 1.0058e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.6508e-07, ..., 2.6636e-07, 3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 280, bias, value: tensor([-0.0080, -0.0375, 0.0085, -0.0168, 0.0127, 0.0112, 0.0268, 0.0057, -0.0383, -0.0106], device='cuda:0'), grad: tensor([ 2.0117e-07, 6.9402e-06, 6.5193e-08, 2.0489e-08, 6.3330e-08, -5.5879e-09, -2.4214e-07, -8.8662e-06, 2.6077e-07, 1.5572e-06], device='cuda:0') 100 0.0001 changing lr epoch 279, time 247.31, cls_loss 0.0011 cls_loss_mapping 0.0018 cls_loss_causal 0.4898 re_mapping 0.0045 re_causal 0.0137 /// teacc 98.97 lr 0.00010000 Epoch 281, weight, value: tensor([[ 0.0338, -0.1620, -0.1419, ..., -0.3108, -0.1067, -0.1482], [ 0.0514, -0.0706, 0.0371, ..., 0.0502, 0.1093, -0.0508], [-0.0751, 0.1375, -0.1774, ..., 0.0611, 0.0790, -0.0483], ..., [-0.0726, -0.1000, -0.0795, ..., 0.0068, -0.1853, 0.1395], [ 0.0590, -0.0252, 0.0853, ..., 0.0057, -0.2116, -0.0146], [-0.1885, -0.1076, -0.1276, ..., -0.2181, 0.0597, -0.1058]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -0.0000e+00, 1.8626e-09, ..., 3.7253e-09, 3.7253e-08, 0.0000e+00], [ 1.8626e-09, 1.8626e-09, -3.9116e-08, ..., 1.8626e-09, 1.7695e-07, 0.0000e+00], [ 1.8626e-09, -5.9605e-08, 5.9605e-08, ..., 1.1176e-08, 1.6764e-08, 0.0000e+00], ..., [-1.8626e-09, 1.8626e-09, 2.7940e-08, ..., -1.7881e-07, 1.0431e-07, 0.0000e+00], [ 0.0000e+00, 5.4017e-08, 1.8626e-09, ..., 8.1956e-08, 7.0781e-08, 0.0000e+00], [ 3.9116e-08, 0.0000e+00, 1.8626e-09, ..., 9.4995e-08, 6.3218e-06, 0.0000e+00]], device='cuda:0') Epoch 281, bias, value: tensor([-0.0078, -0.0363, 0.0084, -0.0167, 0.0109, 0.0112, 0.0268, 0.0044, -0.0382, -0.0091], device='cuda:0'), grad: tensor([ 1.7881e-07, 6.8173e-07, 4.8988e-07, -4.6566e-08, 4.8205e-06, 3.5595e-06, 5.4576e-07, -2.5928e-05, 5.8487e-07, 1.5102e-05], device='cuda:0') 100 0.0001 changing lr epoch 280, time 247.47, cls_loss 0.0014 cls_loss_mapping 0.0016 cls_loss_causal 0.5090 re_mapping 0.0048 re_causal 0.0139 /// teacc 98.94 lr 0.00010000 Epoch 282, weight, value: tensor([[ 0.0339, -0.1621, -0.1435, ..., -0.3110, -0.1068, -0.1499], [ 0.0517, -0.0710, 0.0372, ..., 0.0502, 0.1094, -0.0510], [-0.0753, 0.1380, -0.1770, ..., 0.0616, 0.0801, -0.0481], ..., [-0.0729, -0.1001, -0.0796, ..., 0.0067, -0.1857, 0.1395], [ 0.0584, -0.0255, 0.0855, ..., 0.0060, -0.2122, -0.0149], [-0.1892, -0.1074, -0.1279, ..., -0.2187, 0.0603, -0.1058]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.9802e-08, 5.7556e-07, ..., 4.4703e-08, 1.0617e-07, 5.5879e-08], [ 0.0000e+00, 2.7940e-08, 3.1665e-08, ..., -6.5193e-08, -4.8429e-08, 1.3225e-07], [ 0.0000e+00, -7.2643e-07, 5.6066e-07, ..., -7.2829e-07, -3.8184e-07, 8.7544e-08], ..., [ 0.0000e+00, 4.1910e-07, 3.8743e-07, ..., 4.5262e-07, 3.9302e-07, 3.7253e-09], [ 0.0000e+00, 1.5460e-07, 1.5963e-06, ..., 3.3714e-07, 6.4075e-07, 1.2871e-06], [-3.7253e-09, 7.4506e-09, 6.2026e-07, ..., 2.2352e-08, 7.8231e-08, 2.6077e-08]], device='cuda:0') Epoch 282, bias, value: tensor([-0.0076, -0.0363, 0.0090, -0.0166, 0.0101, 0.0111, 0.0266, 0.0043, -0.0383, -0.0086], device='cuda:0'), grad: tensor([ 1.5143e-06, 9.1456e-07, -6.8918e-07, -7.8157e-06, -3.2037e-07, -1.4044e-06, -4.5225e-06, 2.2557e-06, 6.7167e-06, 3.3230e-06], device='cuda:0') 100 0.0001 changing lr epoch 281, time 247.04, cls_loss 0.0016 cls_loss_mapping 0.0030 cls_loss_causal 0.4929 re_mapping 0.0047 re_causal 0.0136 /// teacc 98.97 lr 0.00010000 Epoch 283, weight, value: tensor([[ 0.0339, -0.1623, -0.1422, ..., -0.3112, -0.1069, -0.1508], [ 0.0530, -0.0715, 0.0373, ..., 0.0502, 0.1094, -0.0511], [-0.0756, 0.1388, -0.1781, ..., 0.0618, 0.0804, -0.0483], ..., [-0.0731, -0.1003, -0.0797, ..., 0.0066, -0.1858, 0.1397], [ 0.0578, -0.0259, 0.0862, ..., 0.0056, -0.2132, -0.0153], [-0.1897, -0.1078, -0.1283, ..., -0.2194, 0.0608, -0.1058]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 3.7998e-07, ..., 4.2841e-08, 3.7067e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-06, ..., -3.9302e-07, -1.0524e-06, 0.0000e+00], [ 0.0000e+00, -1.6764e-08, 1.3970e-07, ..., 1.3039e-08, 8.0094e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.3039e-08, 3.4831e-07, ..., 8.5682e-08, 2.2352e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.1921e-07, ..., 3.7253e-08, 1.0990e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-08, ..., 1.1176e-08, -1.1362e-07, 0.0000e+00]], device='cuda:0') Epoch 283, bias, value: tensor([-0.0074, -0.0364, 0.0088, -0.0162, 0.0100, 0.0112, 0.0262, 0.0047, -0.0384, -0.0097], device='cuda:0'), grad: tensor([ 5.3868e-06, -4.5449e-06, 6.5938e-07, 5.9605e-08, -1.0610e-05, 1.0412e-06, 1.0077e-06, 7.0594e-07, 3.3714e-07, 5.9418e-06], device='cuda:0') 100 0.0001 changing lr epoch 282, time 246.91, cls_loss 0.0015 cls_loss_mapping 0.0020 cls_loss_causal 0.4882 re_mapping 0.0048 re_causal 0.0140 /// teacc 99.00 lr 0.00010000 Epoch 284, weight, value: tensor([[ 0.0338, -0.1624, -0.1438, ..., -0.3113, -0.1069, -0.1511], [ 0.0529, -0.0717, 0.0375, ..., 0.0502, 0.1095, -0.0511], [-0.0757, 0.1393, -0.1787, ..., 0.0618, 0.0806, -0.0484], ..., [-0.0731, -0.1006, -0.0798, ..., 0.0065, -0.1889, 0.1397], [ 0.0576, -0.0260, 0.0868, ..., 0.0057, -0.2138, -0.0154], [-0.1922, -0.1079, -0.1289, ..., -0.2196, 0.0645, -0.1059]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., 3.7253e-09, 1.3039e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.5146e-07, ..., -9.6858e-08, -2.1793e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.4703e-08, ..., 2.2352e-08, 5.0291e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 9.8720e-08, ..., 3.5390e-08, 8.7544e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 0.0000e+00, 1.1735e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 8.5495e-07, 0.0000e+00]], device='cuda:0') Epoch 284, bias, value: tensor([-0.0079, -0.0363, 0.0089, -0.0159, 0.0100, 0.0110, 0.0259, 0.0031, -0.0383, -0.0062], device='cuda:0'), grad: tensor([ 2.2352e-08, -3.5949e-07, 1.1921e-07, 3.1050e-06, 1.0245e-07, -2.3201e-05, 4.4703e-07, 1.4342e-07, 1.9334e-06, 1.7658e-05], device='cuda:0') 100 0.0001 changing lr epoch 283, time 246.67, cls_loss 0.0014 cls_loss_mapping 0.0028 cls_loss_causal 0.4903 re_mapping 0.0051 re_causal 0.0147 /// teacc 98.93 lr 0.00010000 Epoch 285, weight, value: tensor([[ 0.0343, -0.1630, -0.1441, ..., -0.3115, -0.1093, -0.1515], [ 0.0524, -0.0717, 0.0376, ..., 0.0503, 0.1100, -0.0511], [-0.0760, 0.1404, -0.1800, ..., 0.0615, 0.0801, -0.0486], ..., [-0.0750, -0.1015, -0.0799, ..., 0.0064, -0.1891, 0.1400], [ 0.0584, -0.0265, 0.0871, ..., 0.0056, -0.2142, -0.0155], [-0.1941, -0.1081, -0.1299, ..., -0.2206, 0.0645, -0.1059]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 2.9802e-08, 5.5879e-09, ..., 2.6077e-08, 2.0489e-07, 0.0000e+00], [ 1.8626e-09, 7.8231e-08, -2.0489e-08, ..., 3.0361e-07, 1.4529e-07, 0.0000e+00], [ 0.0000e+00, -2.6263e-07, -4.6566e-08, ..., -2.0489e-07, -2.8871e-07, 0.0000e+00], ..., [ 1.8626e-09, 3.7253e-09, 1.3039e-08, ..., -2.0042e-06, 6.3330e-08, -0.0000e+00], [ 9.3132e-08, 6.1467e-08, 1.4901e-08, ..., 6.7055e-08, 1.2666e-07, 0.0000e+00], [ 9.3132e-09, 0.0000e+00, 0.0000e+00, ..., 3.5390e-08, 5.7966e-06, 0.0000e+00]], device='cuda:0') Epoch 285, bias, value: tensor([-0.0099, -0.0362, 0.0088, -0.0163, 0.0097, 0.0117, 0.0282, 0.0030, -0.0384, -0.0064], device='cuda:0'), grad: tensor([ 5.8115e-07, 1.1139e-06, -1.1791e-06, 4.5300e-06, -1.2137e-05, -2.7642e-06, 1.3988e-06, -4.5560e-06, 1.1008e-06, 1.1876e-05], device='cuda:0') 100 0.0001 changing lr epoch 284, time 247.06, cls_loss 0.0015 cls_loss_mapping 0.0024 cls_loss_causal 0.4645 re_mapping 0.0049 re_causal 0.0140 /// teacc 99.04 lr 0.00010000 Epoch 286, weight, value: tensor([[ 0.0339, -0.1634, -0.1450, ..., -0.3116, -0.1096, -0.1519], [ 0.0529, -0.0718, 0.0377, ..., 0.0505, 0.1107, -0.0511], [-0.0765, 0.1409, -0.1806, ..., 0.0610, 0.0796, -0.0486], ..., [-0.0713, -0.1019, -0.0800, ..., 0.0064, -0.1894, 0.1400], [ 0.0569, -0.0292, 0.0866, ..., 0.0054, -0.2152, -0.0155], [-0.1961, -0.1055, -0.1278, ..., -0.2213, 0.0646, -0.1059]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 7.4506e-09, 1.4156e-07, ..., 2.7940e-08, 1.9185e-07, 0.0000e+00], [-6.3330e-08, 3.3528e-08, -2.1048e-06, ..., -2.2724e-07, -3.3155e-06, 0.0000e+00], [ 1.8626e-09, -7.8231e-08, 5.5879e-08, ..., -1.4901e-07, -2.1048e-07, 0.0000e+00], ..., [ 2.4214e-08, 1.4901e-08, 1.7248e-06, ..., 2.0862e-07, 2.5891e-06, 0.0000e+00], [ 5.5879e-09, 5.5879e-09, 1.2666e-07, ..., 4.4703e-08, 1.4529e-07, 0.0000e+00], [ 1.4901e-08, 1.8626e-09, 1.9316e-06, ..., 3.5390e-08, 3.8557e-07, 0.0000e+00]], device='cuda:0') Epoch 286, bias, value: tensor([-0.0099, -0.0361, 0.0084, -0.0163, 0.0093, 0.0118, 0.0281, 0.0031, -0.0408, -0.0058], device='cuda:0'), grad: tensor([-1.0710e-06, -6.4075e-06, -4.0792e-07, -5.0813e-06, 4.6566e-08, 1.4286e-06, 1.4231e-06, 5.2452e-06, 4.5449e-07, 4.3139e-06], device='cuda:0') 100 0.0001 changing lr epoch 285, time 247.22, cls_loss 0.0011 cls_loss_mapping 0.0015 cls_loss_causal 0.4891 re_mapping 0.0050 re_causal 0.0145 /// teacc 98.99 lr 0.00010000 Epoch 287, weight, value: tensor([[ 0.0333, -0.1639, -0.1457, ..., -0.3119, -0.1097, -0.1519], [ 0.0544, -0.0721, 0.0373, ..., 0.0498, 0.1104, -0.0511], [-0.0775, 0.1389, -0.1810, ..., 0.0598, 0.0797, -0.0486], ..., [-0.0716, -0.1027, -0.0795, ..., 0.0071, -0.1890, 0.1400], [ 0.0585, -0.0294, 0.0871, ..., 0.0054, -0.2156, -0.0156], [-0.1961, -0.1056, -0.1279, ..., -0.2215, 0.0646, -0.1059]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 7.4506e-09, 0.0000e+00], [ 7.4506e-09, -0.0000e+00, -3.1665e-08, ..., -1.4901e-08, -2.2352e-08, 0.0000e+00], [ 3.7253e-09, -5.2154e-08, 7.4506e-09, ..., -1.0990e-07, -7.6368e-08, 0.0000e+00], ..., [-2.4214e-08, 4.8429e-08, 1.3039e-08, ..., 9.6858e-08, 1.9558e-07, 0.0000e+00], [ 3.7253e-09, 0.0000e+00, -9.3132e-09, ..., -0.0000e+00, 1.1176e-08, 0.0000e+00], [ 9.3132e-09, 0.0000e+00, 0.0000e+00, ..., 5.5879e-09, -2.6077e-07, 0.0000e+00]], device='cuda:0') Epoch 287, bias, value: tensor([-0.0099, -0.0368, 0.0077, -0.0157, 0.0093, 0.0114, 0.0280, 0.0037, -0.0406, -0.0058], device='cuda:0'), grad: tensor([-2.3842e-07, 2.7940e-08, -1.4715e-07, 3.3528e-08, 3.9488e-07, -6.3330e-08, 1.8626e-07, 4.6194e-07, 8.9407e-08, -7.7114e-07], device='cuda:0') 100 0.0001 changing lr epoch 286, time 247.14, cls_loss 0.0011 cls_loss_mapping 0.0018 cls_loss_causal 0.4769 re_mapping 0.0047 re_causal 0.0137 /// teacc 99.04 lr 0.00010000 Epoch 288, weight, value: tensor([[ 0.0332, -0.1641, -0.1458, ..., -0.3121, -0.1097, -0.1519], [ 0.0562, -0.0722, 0.0375, ..., 0.0499, 0.1108, -0.0511], [-0.0774, 0.1382, -0.1816, ..., 0.0584, 0.0798, -0.0486], ..., [-0.0716, -0.1011, -0.0796, ..., 0.0072, -0.1893, 0.1400], [ 0.0583, -0.0301, 0.0872, ..., 0.0051, -0.2170, -0.0156], [-0.1984, -0.1056, -0.1280, ..., -0.2221, 0.0646, -0.1059]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.0617e-07, 1.3597e-07, ..., 5.5879e-08, 4.0047e-07, 0.0000e+00], [ 0.0000e+00, 5.7742e-08, -3.4600e-05, ..., -2.6543e-06, -6.9022e-05, 0.0000e+00], [ 0.0000e+00, -2.0787e-06, 4.2841e-07, ..., -1.8030e-06, -1.4175e-06, 0.0000e+00], ..., [ 0.0000e+00, 4.2841e-08, 4.4703e-07, ..., 5.7742e-08, 9.1456e-07, 0.0000e+00], [-5.5879e-09, 9.8720e-08, 5.5283e-06, ..., 4.5821e-07, 1.1377e-05, 0.0000e+00], [ 0.0000e+00, 1.3039e-08, 1.0245e-07, ..., 1.6764e-08, 2.1793e-07, 0.0000e+00]], device='cuda:0') Epoch 288, bias, value: tensor([-0.0097, -0.0366, 0.0068, -0.0156, 0.0092, 0.0114, 0.0279, 0.0038, -0.0410, -0.0057], device='cuda:0'), grad: tensor([ 1.4622e-06, -2.6011e-04, -3.6526e-06, 5.2489e-06, 1.6999e-04, 3.1143e-06, 3.6836e-05, 3.3863e-06, 4.2677e-05, 8.1770e-07], device='cuda:0') 100 0.0001 changing lr epoch 287, time 247.35, cls_loss 0.0010 cls_loss_mapping 0.0017 cls_loss_causal 0.4911 re_mapping 0.0047 re_causal 0.0143 /// teacc 98.99 lr 0.00010000 Epoch 289, weight, value: tensor([[ 0.0325, -0.1642, -0.1462, ..., -0.3128, -0.1101, -0.1526], [ 0.0577, -0.0724, 0.0376, ..., 0.0499, 0.1114, -0.0511], [-0.0769, 0.1389, -0.1819, ..., 0.0584, 0.0802, -0.0480], ..., [-0.0716, -0.1013, -0.0796, ..., 0.0073, -0.1894, 0.1400], [ 0.0583, -0.0307, 0.0878, ..., 0.0050, -0.2180, -0.0156], [-0.1993, -0.1058, -0.1282, ..., -0.2224, 0.0645, -0.1059]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-6.0350e-07, 0.0000e+00, -1.7229e-06, ..., -1.4380e-06, -2.1812e-06, 0.0000e+00], [ 1.2293e-07, 1.8626e-09, 3.6508e-07, ..., 3.0547e-07, 4.4145e-07, 0.0000e+00], ..., [ 2.7940e-08, 0.0000e+00, 1.0617e-07, ..., 6.3330e-08, 1.1921e-07, 0.0000e+00], [ 2.4401e-07, -3.7253e-09, 6.1281e-07, ..., 5.3085e-07, 9.7044e-07, 0.0000e+00], [ 3.7253e-09, 0.0000e+00, 1.1176e-08, ..., 1.1176e-08, 1.3039e-08, 0.0000e+00]], device='cuda:0') Epoch 289, bias, value: tensor([-0.0098, -0.0364, 0.0070, -0.0157, 0.0089, 0.0120, 0.0266, 0.0038, -0.0410, -0.0059], device='cuda:0'), grad: tensor([-5.9605e-08, -2.8890e-06, 7.0781e-07, 1.1176e-07, 7.6368e-07, 8.0094e-08, -2.1420e-07, 1.4715e-07, 1.2275e-06, 1.2107e-07], device='cuda:0') 100 0.0001 changing lr epoch 288, time 247.12, cls_loss 0.0012 cls_loss_mapping 0.0016 cls_loss_causal 0.5122 re_mapping 0.0045 re_causal 0.0139 /// teacc 99.09 lr 0.00010000 Epoch 290, weight, value: tensor([[ 0.0345, -0.1635, -0.1463, ..., -0.3133, -0.1101, -0.1543], [ 0.0576, -0.0725, 0.0376, ..., 0.0498, 0.1115, -0.0510], [-0.0764, 0.1394, -0.1825, ..., 0.0584, 0.0805, -0.0480], ..., [-0.0719, -0.1019, -0.0795, ..., 0.0074, -0.1896, 0.1401], [ 0.0574, -0.0307, 0.0879, ..., 0.0050, -0.2185, -0.0158], [-0.2012, -0.1061, -0.1284, ..., -0.2231, 0.0646, -0.1059]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 1.8626e-09, 5.5879e-09, ..., 1.4901e-08, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 2.7940e-08, -5.5879e-08, ..., 1.0990e-07, -4.8429e-08, 0.0000e+00], [ 0.0000e+00, 7.4506e-08, 3.1665e-08, ..., 4.5821e-07, -3.9116e-08, 0.0000e+00], ..., [-9.3132e-09, -1.6019e-07, 1.3039e-08, ..., -2.2296e-06, 2.7940e-08, 0.0000e+00], [ 0.0000e+00, 1.6764e-08, -8.7544e-08, ..., 3.5949e-07, 2.4214e-08, 8.7544e-08], [ 5.5879e-09, 5.5879e-09, 2.4214e-08, ..., 1.7136e-07, 5.4017e-08, 0.0000e+00]], device='cuda:0') Epoch 290, bias, value: tensor([-0.0093, -0.0365, 0.0071, -0.0156, 0.0089, 0.0121, 0.0265, 0.0039, -0.0412, -0.0060], device='cuda:0'), grad: tensor([ 6.5193e-08, 3.4831e-07, 1.2126e-06, 3.1963e-06, -2.1420e-07, 2.8498e-07, -1.0617e-07, -6.9477e-06, 1.1139e-06, 1.0189e-06], device='cuda:0') 100 0.0001 changing lr epoch 289, time 246.93, cls_loss 0.0012 cls_loss_mapping 0.0030 cls_loss_causal 0.4815 re_mapping 0.0048 re_causal 0.0140 /// teacc 98.99 lr 0.00010000 Epoch 291, weight, value: tensor([[ 0.0346, -0.1638, -0.1471, ..., -0.3139, -0.1103, -0.1553], [ 0.0576, -0.0728, 0.0378, ..., 0.0499, 0.1117, -0.0510], [-0.0762, 0.1403, -0.1830, ..., 0.0586, 0.0809, -0.0478], ..., [-0.0718, -0.1028, -0.0796, ..., 0.0073, -0.1899, 0.1401], [ 0.0573, -0.0309, 0.0895, ..., 0.0049, -0.2189, -0.0161], [-0.2016, -0.1063, -0.1287, ..., -0.2250, 0.0649, -0.1059]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 1.4901e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -6.8918e-08, ..., -1.8626e-08, -5.4017e-08, 0.0000e+00], [ 0.0000e+00, -7.4506e-09, 5.7742e-08, ..., 3.7253e-08, -5.5879e-09, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 3.5390e-08, ..., -4.6566e-08, 4.6566e-08, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, -9.6858e-08, ..., -7.6368e-08, 2.2352e-08, 0.0000e+00], [ 3.7253e-09, 0.0000e+00, 3.7253e-09, ..., 5.5879e-08, -6.7055e-08, 0.0000e+00]], device='cuda:0') Epoch 291, bias, value: tensor([-0.0094, -0.0364, 0.0077, -0.0158, 0.0088, 0.0117, 0.0261, 0.0039, -0.0398, -0.0060], device='cuda:0'), grad: tensor([ 5.4017e-08, -3.3528e-08, 1.0058e-07, 2.9616e-07, -1.3039e-08, 3.9488e-07, -4.3027e-07, -5.0291e-07, -1.1735e-07, 2.2352e-07], device='cuda:0') 100 0.0001 changing lr epoch 290, time 247.29, cls_loss 0.0013 cls_loss_mapping 0.0025 cls_loss_causal 0.5377 re_mapping 0.0048 re_causal 0.0138 /// teacc 99.10 lr 0.00010000 Epoch 292, weight, value: tensor([[ 0.0345, -0.1639, -0.1473, ..., -0.3141, -0.1103, -0.1556], [ 0.0576, -0.0730, 0.0379, ..., 0.0500, 0.1118, -0.0510], [-0.0764, 0.1406, -0.1837, ..., 0.0586, 0.0809, -0.0479], ..., [-0.0719, -0.1029, -0.0797, ..., 0.0073, -0.1902, 0.1400], [ 0.0568, -0.0310, 0.0897, ..., 0.0047, -0.2198, -0.0166], [-0.2035, -0.1063, -0.1290, ..., -0.2255, 0.0648, -0.1058]], device='cuda:0'), grad: tensor([[-3.7253e-09, 5.5879e-08, 3.1665e-08, ..., 5.9605e-08, -3.1665e-08, 0.0000e+00], [-3.7253e-08, 1.5460e-07, -3.3155e-07, ..., -1.4156e-07, -4.9360e-07, 0.0000e+00], [ 9.3132e-09, -1.4324e-06, 4.2841e-08, ..., -1.2256e-06, -1.4976e-06, 0.0000e+00], ..., [ 1.8626e-09, 3.2410e-07, 9.4995e-08, ..., 3.4831e-07, 5.1409e-07, 0.0000e+00], [ 3.7253e-09, 1.0617e-07, 1.4901e-08, ..., 1.0245e-07, 1.8254e-07, 0.0000e+00], [ 1.8626e-08, 9.3132e-09, 2.0489e-08, ..., 1.8626e-08, 3.7998e-07, 0.0000e+00]], device='cuda:0') Epoch 292, bias, value: tensor([-0.0092, -0.0364, 0.0075, -0.0160, 0.0099, 0.0119, 0.0261, 0.0037, -0.0397, -0.0063], device='cuda:0'), grad: tensor([-6.6459e-06, -4.6007e-07, -3.6061e-06, 2.2203e-06, -1.3672e-06, 2.4214e-07, 3.1218e-06, 1.3411e-06, 6.0350e-07, 4.5374e-06], device='cuda:0') 100 0.0001 changing lr epoch 291, time 246.99, cls_loss 0.0011 cls_loss_mapping 0.0020 cls_loss_causal 0.4871 re_mapping 0.0047 re_causal 0.0135 /// teacc 99.04 lr 0.00010000 Epoch 293, weight, value: tensor([[ 0.0346, -0.1643, -0.1474, ..., -0.3143, -0.1104, -0.1559], [ 0.0578, -0.0734, 0.0382, ..., 0.0505, 0.1126, -0.0513], [-0.0775, 0.1402, -0.1841, ..., 0.0583, 0.0809, -0.0480], ..., [-0.0714, -0.1023, -0.0799, ..., 0.0068, -0.1909, 0.1401], [ 0.0567, -0.0310, 0.0899, ..., 0.0047, -0.2205, -0.0167], [-0.2055, -0.1068, -0.1293, ..., -0.2271, 0.0647, -0.1061]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., 1.8626e-09, 1.4901e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -3.5390e-08, ..., -1.8626e-09, -4.0978e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], ..., [-0.0000e+00, 0.0000e+00, 1.8626e-08, ..., -3.7253e-09, 5.2154e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 1.8626e-09, 1.1176e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 1.8626e-09, -1.8626e-08, -1.8626e-09]], device='cuda:0') Epoch 293, bias, value: tensor([-0.0090, -0.0359, 0.0073, -0.0160, 0.0102, 0.0119, 0.0259, 0.0033, -0.0398, -0.0066], device='cuda:0'), grad: tensor([-1.9930e-07, 2.0489e-08, 3.5390e-08, -3.0808e-06, -4.3400e-07, 3.0566e-06, -5.5879e-08, 4.6752e-07, 8.0094e-08, 1.0058e-07], device='cuda:0') 100 0.0001 changing lr epoch 292, time 246.77, cls_loss 0.0013 cls_loss_mapping 0.0016 cls_loss_causal 0.5066 re_mapping 0.0047 re_causal 0.0136 /// teacc 99.03 lr 0.00010000 Epoch 294, weight, value: tensor([[ 0.0343, -0.1645, -0.1477, ..., -0.3148, -0.1110, -0.1575], [ 0.0585, -0.0735, 0.0383, ..., 0.0506, 0.1128, -0.0515], [-0.0789, 0.1406, -0.1845, ..., 0.0583, 0.0812, -0.0483], ..., [-0.0708, -0.1022, -0.0800, ..., 0.0069, -0.1911, 0.1405], [ 0.0574, -0.0315, 0.0906, ..., 0.0050, -0.2214, -0.0178], [-0.2061, -0.1069, -0.1292, ..., -0.2275, 0.0650, -0.1060]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.5879e-09, 0.0000e+00, ..., 9.3132e-09, 3.1665e-08, 0.0000e+00], [-1.8626e-09, 4.0419e-07, -0.0000e+00, ..., 6.8545e-07, 5.2527e-07, 0.0000e+00], [ 0.0000e+00, -4.7497e-07, 7.4506e-09, ..., -7.8790e-07, -1.1735e-06, 0.0000e+00], ..., [ 0.0000e+00, 4.0978e-08, 4.6566e-08, ..., 9.8720e-08, 6.7055e-08, 0.0000e+00], [ 0.0000e+00, 5.5879e-09, 3.7253e-09, ..., 9.3132e-09, 3.9116e-08, 3.7253e-09], [ 3.7253e-09, 5.5879e-09, 1.6764e-08, ..., 2.4214e-08, 1.0245e-07, 0.0000e+00]], device='cuda:0') Epoch 294, bias, value: tensor([-0.0095, -0.0358, 0.0073, -0.0162, 0.0099, 0.0118, 0.0265, 0.0033, -0.0396, -0.0064], device='cuda:0'), grad: tensor([-1.6112e-06, 1.0170e-06, -2.1849e-06, -3.8743e-07, -3.2783e-07, 1.0207e-06, 1.0021e-06, 2.8871e-07, 2.5891e-07, 9.3877e-07], device='cuda:0') 100 0.0001 changing lr epoch 293, time 246.86, cls_loss 0.0013 cls_loss_mapping 0.0021 cls_loss_causal 0.4621 re_mapping 0.0045 re_causal 0.0131 /// teacc 99.05 lr 0.00010000 Epoch 295, weight, value: tensor([[ 0.0342, -0.1649, -0.1478, ..., -0.3151, -0.1112, -0.1580], [ 0.0588, -0.0739, 0.0382, ..., 0.0505, 0.1121, -0.0521], [-0.0793, 0.1399, -0.1851, ..., 0.0572, 0.0819, -0.0487], ..., [-0.0707, -0.1000, -0.0800, ..., 0.0072, -0.1911, 0.1410], [ 0.0592, -0.0313, 0.0912, ..., 0.0060, -0.2221, -0.0176], [-0.2077, -0.1084, -0.1287, ..., -0.2283, 0.0653, -0.1061]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 3.3807e-07, 1.3039e-08, ..., 8.3819e-09, 1.0747e-06, 0.0000e+00], [-3.9116e-08, 1.8626e-09, -1.0312e-05, ..., -1.0416e-05, -1.5229e-05, 0.0000e+00], [ 0.0000e+00, 2.2352e-08, 4.3772e-08, ..., 3.7253e-08, 1.5181e-07, 0.0000e+00], ..., [ 4.6566e-09, 3.7253e-09, 7.9870e-06, ..., 8.1435e-06, 1.1772e-05, 0.0000e+00], [ 1.1176e-08, 5.1595e-07, 3.4459e-08, ..., 1.5832e-08, 1.6298e-06, 9.3132e-10], [ 1.8626e-09, -8.9966e-07, 2.0415e-06, ..., 2.0880e-06, 2.4866e-07, 0.0000e+00]], device='cuda:0') Epoch 295, bias, value: tensor([-0.0092, -0.0364, 0.0067, -0.0162, 0.0097, 0.0117, 0.0267, 0.0037, -0.0393, -0.0061], device='cuda:0'), grad: tensor([ 4.4517e-06, -6.3181e-05, 5.0385e-07, 8.2888e-08, 6.2771e-07, 1.2852e-07, -1.7136e-07, 4.9442e-05, 6.6943e-06, 1.4333e-06], device='cuda:0') 100 0.0001 changing lr epoch 294, time 246.92, cls_loss 0.0011 cls_loss_mapping 0.0016 cls_loss_causal 0.4742 re_mapping 0.0046 re_causal 0.0138 /// teacc 99.09 lr 0.00010000 Epoch 296, weight, value: tensor([[ 0.0341, -0.1652, -0.1483, ..., -0.3154, -0.1114, -0.1589], [ 0.0595, -0.0741, 0.0379, ..., 0.0501, 0.1123, -0.0524], [-0.0794, 0.1402, -0.1856, ..., 0.0572, 0.0821, -0.0489], ..., [-0.0708, -0.1005, -0.0796, ..., 0.0076, -0.1912, 0.1412], [ 0.0588, -0.0306, 0.0915, ..., 0.0065, -0.2230, -0.0184], [-0.2084, -0.1084, -0.1292, ..., -0.2293, 0.0653, -0.1062]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 4.6566e-09, -2.0489e-08, ..., 3.7253e-09, 6.5193e-09, -1.1176e-07], [ 2.7940e-09, 3.7253e-09, 2.5146e-08, ..., 5.5879e-09, -1.2107e-08, 0.0000e+00], [ 2.7940e-09, -2.2259e-07, 4.0047e-08, ..., -7.2643e-08, -1.7416e-07, 1.3970e-08], ..., [ 1.8626e-09, 6.5193e-09, 2.0489e-08, ..., -1.3039e-08, 1.3039e-08, 0.0000e+00], [ 1.1176e-08, 1.7975e-07, 3.4459e-08, ..., 6.7055e-08, 1.4994e-07, 1.8626e-08], [ 3.3528e-08, 1.8626e-09, 1.9558e-08, ..., 3.7253e-09, 1.9558e-08, 2.3283e-08]], device='cuda:0') Epoch 296, bias, value: tensor([-0.0092, -0.0369, 0.0066, -0.0168, 0.0098, 0.0124, 0.0268, 0.0042, -0.0392, -0.0063], device='cuda:0'), grad: tensor([-1.2415e-06, 1.1455e-07, -3.7905e-07, -2.2538e-07, -3.0827e-07, -1.5926e-07, 8.1398e-07, -1.0245e-08, 9.9558e-07, 4.1537e-07], device='cuda:0') 100 0.0001 changing lr epoch 295, time 246.98, cls_loss 0.0009 cls_loss_mapping 0.0020 cls_loss_causal 0.5013 re_mapping 0.0046 re_causal 0.0141 /// teacc 98.99 lr 0.00010000 Epoch 297, weight, value: tensor([[ 0.0339, -0.1655, -0.1488, ..., -0.3157, -0.1112, -0.1591], [ 0.0605, -0.0739, 0.0380, ..., 0.0501, 0.1126, -0.0524], [-0.0800, 0.1404, -0.1861, ..., 0.0570, 0.0823, -0.0489], ..., [-0.0708, -0.1001, -0.0796, ..., 0.0077, -0.1913, 0.1413], [ 0.0582, -0.0308, 0.0918, ..., 0.0064, -0.2238, -0.0187], [-0.2103, -0.1091, -0.1297, ..., -0.2298, 0.0651, -0.1061]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.0151e-07, 4.0978e-08, ..., 6.1467e-08, 1.3970e-07, 0.0000e+00], [-5.5879e-09, 2.8871e-08, -2.1420e-08, ..., 5.2154e-08, 2.9802e-08, 0.0000e+00], [ 9.3132e-10, -3.3341e-07, -8.4750e-08, ..., -2.6729e-07, -6.9756e-07, 0.0000e+00], ..., [ 1.8626e-09, 1.4901e-08, -3.7253e-09, ..., -2.4214e-08, 2.9802e-08, 0.0000e+00], [ 9.3132e-10, 8.9407e-08, -2.1420e-08, ..., 4.0047e-08, 1.1642e-07, 0.0000e+00], [ 0.0000e+00, 2.5146e-08, 2.2352e-08, ..., 2.5146e-08, 2.7008e-08, -0.0000e+00]], device='cuda:0') Epoch 297, bias, value: tensor([-0.0090, -0.0369, 0.0065, -0.0167, 0.0100, 0.0119, 0.0270, 0.0043, -0.0393, -0.0066], device='cuda:0'), grad: tensor([ 5.4669e-07, 2.4401e-07, -1.9204e-06, 2.2911e-07, 1.2107e-08, -3.5297e-07, 8.5216e-07, -1.5274e-07, 3.7625e-07, 1.6391e-07], device='cuda:0') 100 0.0001 changing lr epoch 296, time 246.83, cls_loss 0.0012 cls_loss_mapping 0.0016 cls_loss_causal 0.5075 re_mapping 0.0047 re_causal 0.0138 /// teacc 99.01 lr 0.00010000 Epoch 298, weight, value: tensor([[ 0.0339, -0.1648, -0.1488, ..., -0.3158, -0.1112, -0.1600], [ 0.0607, -0.0737, 0.0380, ..., 0.0501, 0.1133, -0.0527], [-0.0801, 0.1403, -0.1875, ..., 0.0565, 0.0812, -0.0491], ..., [-0.0752, -0.1001, -0.0795, ..., 0.0076, -0.1917, 0.1423], [ 0.0569, -0.0310, 0.0914, ..., 0.0065, -0.2253, -0.0191], [-0.2108, -0.1096, -0.1298, ..., -0.2306, 0.0652, -0.1058]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.5193e-09, 3.0734e-08, ..., 1.5832e-08, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-09, 4.9360e-08, ..., 2.4214e-08, -5.1223e-08, 0.0000e+00], [ 0.0000e+00, -2.7847e-07, 6.1840e-07, ..., 7.6368e-08, -1.8720e-07, -2.7940e-09], ..., [ 0.0000e+00, 8.0094e-08, 6.1188e-07, ..., 3.2596e-07, 7.3574e-08, 9.3132e-10], [ 0.0000e+00, 6.2399e-08, 1.8906e-07, ..., 1.7323e-07, 2.2352e-08, 9.3132e-10], [ 0.0000e+00, 9.3132e-10, 9.9652e-08, ..., 4.6566e-09, 1.5367e-07, 0.0000e+00]], device='cuda:0') Epoch 298, bias, value: tensor([-0.0085, -0.0368, 0.0055, -0.0163, 0.0110, 0.0127, 0.0261, 0.0042, -0.0404, -0.0067], device='cuda:0'), grad: tensor([ 7.4506e-09, 1.2852e-07, 8.1211e-07, -3.5875e-06, -7.7486e-07, 4.1071e-07, 1.8813e-07, 1.4259e-06, 4.6473e-07, 9.3039e-07], device='cuda:0') 100 0.0001 changing lr epoch 297, time 247.10, cls_loss 0.0011 cls_loss_mapping 0.0019 cls_loss_causal 0.4632 re_mapping 0.0049 re_causal 0.0138 /// teacc 99.06 lr 0.00010000 Epoch 299, weight, value: tensor([[ 0.0338, -0.1652, -0.1492, ..., -0.3160, -0.1114, -0.1602], [ 0.0617, -0.0738, 0.0399, ..., 0.0508, 0.1138, -0.0528], [-0.0803, 0.1403, -0.1885, ..., 0.0562, 0.0814, -0.0491], ..., [-0.0765, -0.1002, -0.0813, ..., 0.0071, -0.1920, 0.1427], [ 0.0552, -0.0313, 0.0914, ..., 0.0065, -0.2264, -0.0192], [-0.2114, -0.1098, -0.1304, ..., -0.2309, 0.0651, -0.1058]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 8.3819e-09, 9.3132e-09, ..., 5.5879e-09, 1.3039e-08, 0.0000e+00], [ 4.6566e-09, 3.7253e-09, 2.7940e-09, ..., 3.0734e-08, -5.5879e-09, 3.7253e-09], [ 0.0000e+00, -4.7497e-08, 1.0245e-08, ..., -2.3283e-08, -5.6811e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.0245e-08, 2.1420e-08, ..., -1.3970e-08, 1.5832e-08, 1.8626e-09], [ 0.0000e+00, 3.7253e-09, -5.0291e-08, ..., 3.7253e-09, 1.0245e-08, 0.0000e+00], [ 9.3132e-10, 3.7253e-09, 2.4214e-08, ..., 1.2107e-08, 3.7253e-09, -1.8626e-09]], device='cuda:0') Epoch 299, bias, value: tensor([-0.0074, -0.0352, 0.0054, -0.0165, 0.0104, 0.0116, 0.0270, 0.0030, -0.0407, -0.0069], device='cuda:0'), grad: tensor([ 5.4948e-08, 1.3225e-07, -9.7789e-08, -3.5297e-07, -4.8429e-08, 3.7160e-07, -3.5390e-08, 0.0000e+00, -7.3574e-08, 7.4506e-08], device='cuda:0') 100 0.0001 changing lr epoch 298, time 247.31, cls_loss 0.0011 cls_loss_mapping 0.0015 cls_loss_causal 0.4625 re_mapping 0.0047 re_causal 0.0134 /// teacc 99.10 lr 0.00010000 Epoch 300, weight, value: tensor([[ 0.0338, -0.1664, -0.1499, ..., -0.3171, -0.1116, -0.1636], [ 0.0617, -0.0748, 0.0399, ..., 0.0507, 0.1137, -0.0569], [-0.0802, 0.1414, -0.1889, ..., 0.0564, 0.0820, -0.0477], ..., [-0.0762, -0.1006, -0.0813, ..., 0.0071, -0.1921, 0.1439], [ 0.0552, -0.0317, 0.0921, ..., 0.0065, -0.2269, -0.0198], [-0.2114, -0.1101, -0.1309, ..., -0.2320, 0.0652, -0.1057]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.4214e-08, 3.7253e-09, ..., 1.1176e-08, 1.8626e-08, 0.0000e+00], [-0.0000e+00, 1.3039e-08, -7.3947e-07, ..., -6.4448e-07, -1.3821e-06, -1.6764e-08], [ 0.0000e+00, -1.9185e-07, 8.1956e-08, ..., 1.6764e-08, 5.2154e-08, 1.8626e-09], ..., [ 0.0000e+00, -2.7940e-08, 5.1595e-07, ..., 3.7253e-07, 1.0114e-06, -9.3132e-09], [ 0.0000e+00, 7.6368e-08, 0.0000e+00, ..., 3.3528e-08, 4.8429e-08, 1.8626e-09], [ 0.0000e+00, 9.3132e-09, 5.5879e-09, ..., 8.5682e-08, 9.4995e-08, 1.8626e-09]], device='cuda:0') Epoch 300, bias, value: tensor([-0.0075, -0.0354, 0.0058, -0.0163, 0.0103, 0.0114, 0.0272, 0.0032, -0.0406, -0.0070], device='cuda:0'), grad: tensor([-1.3728e-06, -2.3898e-06, 3.5390e-08, 3.8929e-07, -1.4491e-06, 1.6950e-07, 1.2964e-06, 9.8348e-07, 3.0361e-07, 2.0359e-06], device='cuda:0') 100 0.0001 changing lr epoch 299, time 247.14, cls_loss 0.0009 cls_loss_mapping 0.0019 cls_loss_causal 0.4949 re_mapping 0.0047 re_causal 0.0139 /// teacc 99.11 lr 0.00010000 Epoch 301, weight, value: tensor([[ 0.0338, -0.1667, -0.1504, ..., -0.3179, -0.1118, -0.1643], [ 0.0619, -0.0754, 0.0400, ..., 0.0506, 0.1140, -0.0576], [-0.0803, 0.1418, -0.1893, ..., 0.0564, 0.0824, -0.0478], ..., [-0.0762, -0.1003, -0.0814, ..., 0.0072, -0.1923, 0.1451], [ 0.0552, -0.0318, 0.0935, ..., 0.0067, -0.2266, -0.0195], [-0.2116, -0.1104, -0.1322, ..., -0.2339, 0.0651, -0.1073]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.2352e-08, 6.3330e-07, ..., 7.4506e-09, 4.6566e-08, 1.8626e-09], [-3.7253e-09, 9.1270e-08, 3.7253e-09, ..., 3.1665e-08, 1.2666e-07, 0.0000e+00], [ 0.0000e+00, -2.0862e-07, 5.9605e-08, ..., -2.0489e-08, -3.6880e-07, 0.0000e+00], ..., [ 0.0000e+00, 1.4901e-08, 3.7253e-08, ..., 2.4214e-08, 2.9802e-08, 0.0000e+00], [ 0.0000e+00, 6.3330e-08, -1.1995e-06, ..., 4.4703e-08, 1.4529e-07, 5.5879e-09], [ 0.0000e+00, 3.7253e-09, 4.2282e-07, ..., 1.8626e-09, 3.4459e-07, 0.0000e+00]], device='cuda:0') Epoch 301, bias, value: tensor([-0.0075, -0.0355, 0.0059, -0.0164, 0.0102, 0.0107, 0.0277, 0.0034, -0.0399, -0.0073], device='cuda:0'), grad: tensor([ 1.2126e-06, 2.6263e-07, -3.9116e-07, -3.1665e-07, -2.6301e-06, 2.2911e-07, 2.0303e-07, 1.7323e-07, -2.2743e-06, 3.5148e-06], device='cuda:0') 100 0.0001 changing lr epoch 300, time 246.97, cls_loss 0.0010 cls_loss_mapping 0.0021 cls_loss_causal 0.4880 re_mapping 0.0046 re_causal 0.0143 /// teacc 99.04 lr 0.00010000 Epoch 302, weight, value: tensor([[ 0.0338, -0.1673, -0.1507, ..., -0.3197, -0.1119, -0.1623], [ 0.0619, -0.0758, 0.0404, ..., 0.0511, 0.1145, -0.0585], [-0.0803, 0.1429, -0.1896, ..., 0.0568, 0.0831, -0.0467], ..., [-0.0762, -0.1004, -0.0818, ..., 0.0068, -0.1929, 0.1453], [ 0.0552, -0.0328, 0.0947, ..., 0.0063, -0.2275, -0.0206], [-0.2117, -0.1107, -0.1324, ..., -0.2357, 0.0652, -0.1093]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 1.1176e-08, ..., 3.7253e-09, 1.8626e-08, 1.8626e-09], [ 0.0000e+00, 3.7253e-08, -3.9116e-08, ..., 3.5390e-08, -5.4017e-08, 0.0000e+00], [ 0.0000e+00, -6.7428e-07, 5.4017e-08, ..., -8.1398e-07, -4.5262e-07, 0.0000e+00], ..., [ 0.0000e+00, 5.9605e-07, 2.7940e-08, ..., 7.2271e-07, 4.4890e-07, -0.0000e+00], [ 0.0000e+00, 9.3132e-09, -2.5705e-07, ..., 1.1176e-08, -1.8626e-09, 1.8626e-09], [ 0.0000e+00, 1.8626e-09, 7.6368e-08, ..., 7.4506e-09, 7.2643e-08, 0.0000e+00]], device='cuda:0') Epoch 302, bias, value: tensor([-0.0071, -0.0347, 0.0066, -0.0166, 0.0102, 0.0115, 0.0264, 0.0026, -0.0396, -0.0076], device='cuda:0'), grad: tensor([ 5.1409e-07, 1.2666e-07, -2.7679e-06, 3.4422e-06, -1.3039e-07, -7.2904e-06, 4.6752e-07, 2.6356e-06, -2.8685e-07, 3.2987e-06], device='cuda:0') 100 0.0001 changing lr epoch 301, time 247.06, cls_loss 0.0008 cls_loss_mapping 0.0015 cls_loss_causal 0.4844 re_mapping 0.0045 re_causal 0.0139 /// teacc 99.08 lr 0.00010000 Epoch 303, weight, value: tensor([[ 0.0337, -0.1678, -0.1513, ..., -0.3198, -0.1120, -0.1624], [ 0.0619, -0.0760, 0.0401, ..., 0.0510, 0.1147, -0.0585], [-0.0803, 0.1439, -0.1903, ..., 0.0568, 0.0831, -0.0469], ..., [-0.0763, -0.1006, -0.0815, ..., 0.0070, -0.1930, 0.1458], [ 0.0552, -0.0338, 0.0949, ..., 0.0060, -0.2279, -0.0207], [-0.2118, -0.1109, -0.1327, ..., -0.2364, 0.0650, -0.1093]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.7253e-09, 1.8626e-09, ..., 1.8626e-09, 7.4506e-09, 1.8626e-09], [ 0.0000e+00, 1.8626e-09, -2.0489e-08, ..., 2.0489e-08, -5.0291e-08, 0.0000e+00], [ 0.0000e+00, -1.4901e-08, 2.3842e-07, ..., 8.3819e-08, -1.1735e-07, -2.9802e-08], ..., [ 0.0000e+00, 3.7253e-09, 6.5193e-08, ..., -2.7940e-08, 2.7940e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 2.0489e-08, ..., 1.3039e-08, 7.4506e-07, 0.0000e+00]], device='cuda:0') Epoch 303, bias, value: tensor([-0.0070, -0.0351, 0.0067, -0.0167, 0.0105, 0.0114, 0.0265, 0.0030, -0.0397, -0.0078], device='cuda:0'), grad: tensor([-1.3039e-08, -2.0489e-08, 3.5577e-07, -6.6124e-07, -3.0734e-06, 3.7253e-08, 1.6019e-07, 3.1665e-08, 1.1176e-08, 3.1553e-06], device='cuda:0') 100 0.0001 changing lr epoch 302, time 246.98, cls_loss 0.0009 cls_loss_mapping 0.0018 cls_loss_causal 0.4887 re_mapping 0.0045 re_causal 0.0139 /// teacc 98.97 lr 0.00010000 Epoch 304, weight, value: tensor([[ 0.0338, -0.1680, -0.1514, ..., -0.3199, -0.1121, -0.1625], [ 0.0620, -0.0757, 0.0401, ..., 0.0507, 0.1149, -0.0586], [-0.0804, 0.1445, -0.1915, ..., 0.0567, 0.0826, -0.0471], ..., [-0.0763, -0.1006, -0.0814, ..., 0.0074, -0.1930, 0.1463], [ 0.0552, -0.0346, 0.0951, ..., 0.0055, -0.2284, -0.0209], [-0.2119, -0.1109, -0.1330, ..., -0.2378, 0.0650, -0.1093]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 2.9802e-08, ..., 1.1176e-08, 3.3528e-08, 0.0000e+00], [ 5.5879e-09, 0.0000e+00, -7.4878e-07, ..., -2.3097e-07, -2.0470e-06, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 1.5460e-07, ..., 5.0291e-08, 2.8498e-07, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 5.5879e-08, ..., 1.3039e-08, 6.1467e-08, -0.0000e+00], [-1.1176e-08, -7.4506e-09, -5.8301e-07, ..., -2.0675e-07, -9.6858e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 5.5879e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 304, bias, value: tensor([-0.0070, -0.0354, 0.0064, -0.0166, 0.0105, 0.0112, 0.0264, 0.0035, -0.0398, -0.0080], device='cuda:0'), grad: tensor([ 6.7055e-08, -3.1833e-06, 5.5507e-07, 3.5204e-07, 2.4959e-07, -2.7195e-07, 3.0268e-06, 1.5460e-07, -1.0785e-06, 1.1548e-07], device='cuda:0') 100 0.0001 changing lr epoch 303, time 246.84, cls_loss 0.0010 cls_loss_mapping 0.0020 cls_loss_causal 0.4568 re_mapping 0.0044 re_causal 0.0127 /// teacc 99.03 lr 0.00010000 Epoch 305, weight, value: tensor([[ 0.0337, -0.1685, -0.1525, ..., -0.3203, -0.1123, -0.1626], [ 0.0621, -0.0762, 0.0407, ..., 0.0508, 0.1154, -0.0586], [-0.0805, 0.1445, -0.1922, ..., 0.0567, 0.0831, -0.0473], ..., [-0.0763, -0.1000, -0.0820, ..., 0.0073, -0.1937, 0.1470], [ 0.0553, -0.0348, 0.0954, ..., 0.0054, -0.2292, -0.0210], [-0.2120, -0.1111, -0.1334, ..., -0.2400, 0.0649, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 3.7998e-07, ..., 2.4587e-07, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 8.0094e-08, ..., 7.0781e-08, -5.4017e-08, 0.0000e+00], [ 0.0000e+00, -1.8626e-08, 1.8254e-07, ..., 8.7544e-08, -1.6764e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 1.0245e-07, ..., 5.7742e-08, 3.7253e-08, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 4.6380e-07, ..., 3.0361e-07, 2.7940e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 6.7055e-08, ..., 4.2841e-08, -2.0675e-07, 0.0000e+00]], device='cuda:0') Epoch 305, bias, value: tensor([-0.0068, -0.0351, 0.0063, -0.0163, 0.0106, 0.0111, 0.0266, 0.0034, -0.0402, -0.0084], device='cuda:0'), grad: tensor([-1.1027e-05, 2.8685e-07, 1.5721e-06, -5.3272e-06, 1.6019e-07, 4.0568e-06, 6.8024e-06, 2.5146e-07, 2.5406e-06, 6.2399e-07], device='cuda:0') 100 0.0001 changing lr epoch 304, time 246.99, cls_loss 0.0010 cls_loss_mapping 0.0011 cls_loss_causal 0.4811 re_mapping 0.0044 re_causal 0.0131 /// teacc 99.02 lr 0.00010000 Epoch 306, weight, value: tensor([[ 0.0335, -0.1687, -0.1532, ..., -0.3206, -0.1124, -0.1626], [ 0.0621, -0.0764, 0.0409, ..., 0.0508, 0.1160, -0.0586], [-0.0807, 0.1447, -0.1928, ..., 0.0567, 0.0829, -0.0474], ..., [-0.0764, -0.1001, -0.0822, ..., 0.0073, -0.1940, 0.1478], [ 0.0553, -0.0349, 0.0962, ..., 0.0059, -0.2297, -0.0211], [-0.2127, -0.1111, -0.1338, ..., -0.2403, 0.0643, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, -7.4506e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 4.1351e-07, 5.5879e-09, ..., 5.1782e-07, 1.8626e-09, 0.0000e+00], ..., [-1.8626e-09, 5.1036e-07, 1.1176e-08, ..., 6.3702e-07, 8.5682e-08, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 7.4506e-09, ..., 7.4506e-09, 4.8429e-08, 1.8626e-09], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 1.8626e-09, -7.8790e-07, 0.0000e+00]], device='cuda:0') Epoch 306, bias, value: tensor([-0.0067, -0.0350, 0.0060, -0.0163, 0.0111, 0.0110, 0.0265, 0.0035, -0.0402, -0.0090], device='cuda:0'), grad: tensor([ 2.4214e-08, 6.5193e-08, 8.1956e-07, -1.8831e-06, 1.8161e-06, 1.1362e-07, -8.7544e-08, 1.1399e-06, 1.1176e-07, -2.1067e-06], device='cuda:0') 100 0.0001 changing lr epoch 305, time 247.88, cls_loss 0.0011 cls_loss_mapping 0.0019 cls_loss_causal 0.4920 re_mapping 0.0044 re_causal 0.0132 /// teacc 99.07 lr 0.00010000 Epoch 307, weight, value: tensor([[ 0.0335, -0.1691, -0.1533, ..., -0.3209, -0.1124, -0.1626], [ 0.0625, -0.0775, 0.0410, ..., 0.0510, 0.1165, -0.0586], [-0.0805, 0.1465, -0.1934, ..., 0.0571, 0.0843, -0.0474], ..., [-0.0764, -0.1021, -0.0824, ..., 0.0070, -0.1948, 0.1477], [ 0.0550, -0.0349, 0.0964, ..., 0.0060, -0.2307, -0.0215], [-0.2129, -0.1116, -0.1340, ..., -0.2420, 0.0638, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, -1.8626e-09, ..., 5.5879e-09, -2.6077e-08, 0.0000e+00], [ 0.0000e+00, -7.4506e-09, 1.6764e-08, ..., 1.8626e-09, -9.3132e-09, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 2.4214e-08, ..., 2.4214e-08, 1.1176e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -4.8429e-08, ..., -1.6764e-08, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 5.5879e-09, 1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 307, bias, value: tensor([-0.0062, -0.0348, 0.0068, -0.0162, 0.0118, 0.0110, 0.0266, 0.0032, -0.0403, -0.0097], device='cuda:0'), grad: tensor([ 3.7253e-09, 1.8626e-08, 9.3132e-09, -5.1893e-06, -7.4506e-09, 5.1111e-06, 2.7940e-08, 7.2643e-08, -8.9407e-08, 2.9802e-08], device='cuda:0') 100 0.0001 changing lr epoch 306, time 247.89, cls_loss 0.0008 cls_loss_mapping 0.0021 cls_loss_causal 0.4841 re_mapping 0.0046 re_causal 0.0139 /// teacc 98.97 lr 0.00010000 Epoch 308, weight, value: tensor([[ 0.0335, -0.1691, -0.1533, ..., -0.3220, -0.1128, -0.1626], [ 0.0625, -0.0772, 0.0411, ..., 0.0510, 0.1166, -0.0586], [-0.0807, 0.1464, -0.1940, ..., 0.0572, 0.0847, -0.0474], ..., [-0.0764, -0.1022, -0.0824, ..., 0.0070, -0.1950, 0.1477], [ 0.0550, -0.0349, 0.0965, ..., 0.0059, -0.2316, -0.0215], [-0.2130, -0.1116, -0.1342, ..., -0.2424, 0.0638, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 7.4506e-09, ..., 7.4506e-09, 6.3330e-08, 0.0000e+00], [ 5.5879e-09, 1.8626e-09, 4.0792e-07, ..., 3.2410e-07, -4.2841e-08, 0.0000e+00], [ 0.0000e+00, -1.6764e-08, 2.2352e-07, ..., 1.4529e-07, 6.8918e-08, 0.0000e+00], ..., [ 0.0000e+00, -9.3132e-09, 3.7253e-08, ..., -2.3283e-07, 1.3970e-07, 0.0000e+00], [-1.4901e-08, 1.1176e-08, -1.1642e-06, ..., -6.3889e-07, -1.2107e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 5.0291e-08, ..., 3.5390e-08, 4.7684e-07, 0.0000e+00]], device='cuda:0') Epoch 308, bias, value: tensor([-0.0067, -0.0348, 0.0070, -0.0161, 0.0117, 0.0109, 0.0273, 0.0032, -0.0404, -0.0098], device='cuda:0'), grad: tensor([ 1.7509e-07, 1.5777e-06, 5.8115e-07, -5.5879e-08, -2.1439e-06, 1.2573e-06, 3.0175e-07, -3.9116e-07, -3.0529e-06, 1.7565e-06], device='cuda:0') 100 0.0001 changing lr epoch 307, time 247.48, cls_loss 0.0014 cls_loss_mapping 0.0022 cls_loss_causal 0.4957 re_mapping 0.0044 re_causal 0.0130 /// teacc 99.04 lr 0.00010000 Epoch 309, weight, value: tensor([[ 0.0333, -0.1693, -0.1541, ..., -0.3229, -0.1160, -0.1629], [ 0.0624, -0.0772, 0.0414, ..., 0.0510, 0.1177, -0.0585], [-0.0809, 0.1466, -0.1963, ..., 0.0566, 0.0834, -0.0476], ..., [-0.0765, -0.1023, -0.0826, ..., 0.0072, -0.1952, 0.1482], [ 0.0549, -0.0349, 0.0972, ..., 0.0061, -0.2318, -0.0216], [-0.2132, -0.1116, -0.1349, ..., -0.2431, 0.0665, -0.1095]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 6.3330e-08, ..., 8.3819e-08, 7.0781e-08, 0.0000e+00], [ 1.8626e-09, -2.2352e-08, -3.3528e-08, ..., 5.0664e-07, 3.8929e-07, 0.0000e+00], [ 5.5879e-09, 1.8626e-09, 1.4156e-07, ..., -2.1346e-06, -2.1551e-06, 0.0000e+00], ..., [ 1.1176e-08, 1.8626e-09, 1.3970e-07, ..., 1.1921e-06, 1.2275e-06, 0.0000e+00], [-2.9802e-08, 0.0000e+00, -1.8254e-07, ..., 9.8720e-08, 1.1176e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.9802e-08, ..., 1.4901e-08, 2.3656e-07, -0.0000e+00]], device='cuda:0') Epoch 309, bias, value: tensor([-0.0097, -0.0348, 0.0064, -0.0159, 0.0117, 0.0105, 0.0269, 0.0034, -0.0399, -0.0074], device='cuda:0'), grad: tensor([ 3.8743e-07, 1.7621e-06, -6.1691e-06, -6.6236e-06, -5.7183e-07, 6.5267e-06, 2.4214e-07, 3.9116e-06, -1.1921e-07, 6.4634e-07], device='cuda:0') 100 0.0001 changing lr epoch 308, time 247.75, cls_loss 0.0011 cls_loss_mapping 0.0019 cls_loss_causal 0.4888 re_mapping 0.0045 re_causal 0.0132 /// teacc 99.05 lr 0.00010000 Epoch 310, weight, value: tensor([[ 0.0331, -0.1693, -0.1543, ..., -0.3231, -0.1169, -0.1629], [ 0.0626, -0.0772, 0.0414, ..., 0.0510, 0.1179, -0.0585], [-0.0812, 0.1466, -0.1969, ..., 0.0564, 0.0835, -0.0476], ..., [-0.0765, -0.1024, -0.0826, ..., 0.0074, -0.1954, 0.1484], [ 0.0542, -0.0349, 0.0980, ..., 0.0070, -0.2319, -0.0218], [-0.2147, -0.1117, -0.1355, ..., -0.2465, 0.0672, -0.1095]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -4.2841e-08, ..., 5.0291e-08, -5.9605e-08, 0.0000e+00], [ 0.0000e+00, -9.3132e-09, 1.3597e-07, ..., -5.5879e-09, -1.6019e-07, 0.0000e+00], ..., [ 0.0000e+00, 7.4506e-09, 1.6764e-08, ..., -6.1467e-08, 4.0978e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -1.1176e-08, ..., 9.3132e-09, 1.6764e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., 9.3132e-09, -3.3528e-08, 0.0000e+00]], device='cuda:0') Epoch 310, bias, value: tensor([-0.0102, -0.0348, 0.0061, -0.0160, 0.0119, 0.0106, 0.0265, 0.0035, -0.0397, -0.0070], device='cuda:0'), grad: tensor([-3.3528e-08, 8.9407e-08, 3.3528e-08, -2.4028e-07, 2.6636e-07, 7.4506e-08, -2.6077e-08, -1.1921e-07, 2.0489e-08, -9.3132e-08], device='cuda:0') 100 0.0001 changing lr epoch 309, time 247.43, cls_loss 0.0011 cls_loss_mapping 0.0017 cls_loss_causal 0.4760 re_mapping 0.0044 re_causal 0.0136 /// teacc 98.99 lr 0.00010000 Epoch 311, weight, value: tensor([[ 0.0326, -0.1694, -0.1548, ..., -0.3232, -0.1183, -0.1629], [ 0.0629, -0.0778, 0.0414, ..., 0.0508, 0.1179, -0.0582], [-0.0813, 0.1467, -0.1979, ..., 0.0556, 0.0834, -0.0476], ..., [-0.0766, -0.1024, -0.0826, ..., 0.0076, -0.1954, 0.1485], [ 0.0541, -0.0354, 0.0982, ..., 0.0069, -0.2324, -0.0219], [-0.2166, -0.1117, -0.1358, ..., -0.2473, 0.0683, -0.1095]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 7.6368e-08, ..., 2.9802e-08, 4.2282e-06, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, -9.9652e-07, ..., -4.1164e-07, -1.2983e-06, 0.0000e+00], [ 0.0000e+00, -1.1176e-08, 4.9174e-07, ..., 1.8999e-07, 6.8769e-06, 0.0000e+00], ..., [ 0.0000e+00, 3.7253e-09, 8.1956e-08, ..., 4.0978e-08, 1.0058e-07, 0.0000e+00], [ 0.0000e+00, 9.3132e-09, 9.1270e-08, ..., 4.4703e-08, 2.2911e-07, -1.8626e-09], [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 3.7253e-09, 1.0245e-07, 0.0000e+00]], device='cuda:0') Epoch 311, bias, value: tensor([-0.0114, -0.0349, 0.0053, -0.0151, 0.0127, 0.0105, 0.0271, 0.0037, -0.0400, -0.0065], device='cuda:0'), grad: tensor([ 2.1532e-05, -2.0266e-06, 3.4511e-05, 4.8243e-07, 5.5134e-06, 4.4890e-07, -6.2585e-05, 2.0489e-07, 6.0722e-07, 1.1604e-06], device='cuda:0') 100 0.0001 changing lr epoch 310, time 247.63, cls_loss 0.0010 cls_loss_mapping 0.0020 cls_loss_causal 0.4844 re_mapping 0.0044 re_causal 0.0129 /// teacc 99.10 lr 0.00010000 Epoch 312, weight, value: tensor([[ 0.0326, -0.1696, -0.1557, ..., -0.3235, -0.1183, -0.1631], [ 0.0619, -0.0779, 0.0417, ..., 0.0509, 0.1185, -0.0581], [-0.0814, 0.1471, -0.1985, ..., 0.0554, 0.0831, -0.0477], ..., [-0.0753, -0.1029, -0.0828, ..., 0.0073, -0.1956, 0.1487], [ 0.0541, -0.0355, 0.0986, ..., 0.0068, -0.2331, -0.0220], [-0.2187, -0.1119, -0.1370, ..., -0.2484, 0.0682, -0.1095]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 5.5879e-08, ..., 6.2212e-07, 1.2759e-06, 1.6764e-08], [ 0.0000e+00, 1.8626e-09, -1.1493e-06, ..., -2.0117e-07, -1.7844e-06, -5.2154e-07], [ 0.0000e+00, -1.1176e-08, 1.1921e-07, ..., -7.2084e-07, -1.5572e-06, -1.1176e-08], ..., [ 0.0000e+00, 5.5879e-09, 6.3330e-08, ..., 1.4901e-08, 4.4703e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, -8.6799e-06, ..., 1.0058e-07, 1.0617e-07, 1.4901e-08], [ 0.0000e+00, 0.0000e+00, 1.8626e-08, ..., 7.4506e-09, -2.2352e-08, 0.0000e+00]], device='cuda:0') Epoch 312, bias, value: tensor([-0.0115, -0.0348, 0.0049, -0.0147, 0.0131, 0.0103, 0.0277, 0.0036, -0.0401, -0.0068], device='cuda:0'), grad: tensor([ 3.7141e-06, -3.6955e-06, -4.2915e-06, -6.3330e-08, 6.7614e-07, 1.9930e-07, 1.7002e-05, 1.2852e-07, -1.3843e-05, 1.4342e-07], device='cuda:0') 100 0.0001 changing lr epoch 311, time 247.52, cls_loss 0.0011 cls_loss_mapping 0.0017 cls_loss_causal 0.4585 re_mapping 0.0043 re_causal 0.0126 /// teacc 99.08 lr 0.00010000 Epoch 313, weight, value: tensor([[ 0.0323, -0.1699, -0.1579, ..., -0.3243, -0.1183, -0.1634], [ 0.0618, -0.0775, 0.0417, ..., 0.0510, 0.1191, -0.0581], [-0.0815, 0.1473, -0.2002, ..., 0.0552, 0.0832, -0.0471], ..., [-0.0752, -0.1031, -0.0828, ..., 0.0072, -0.1962, 0.1489], [ 0.0542, -0.0365, 0.0988, ..., 0.0055, -0.2347, -0.0224], [-0.2188, -0.1119, -0.1376, ..., -0.2492, 0.0682, -0.1095]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.3039e-08, ..., 2.4214e-08, 1.1176e-08, 1.8626e-09], [-0.0000e+00, 1.8626e-09, -9.3132e-09, ..., 2.7120e-06, -5.5879e-08, 1.8626e-09], [ 0.0000e+00, -3.9116e-08, 5.7742e-08, ..., -2.8498e-07, -3.3341e-07, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 3.9116e-08, ..., -3.7681e-06, 7.4506e-09, -1.1735e-07], [ 0.0000e+00, 3.7253e-08, 1.1362e-07, ..., 4.4331e-07, 3.6508e-07, 7.4506e-09], [ 0.0000e+00, 0.0000e+00, 1.8626e-08, ..., 6.7055e-08, -4.6566e-08, 1.0058e-07]], device='cuda:0') Epoch 313, bias, value: tensor([-0.0116, -0.0348, 0.0047, -0.0136, 0.0132, 0.0104, 0.0275, 0.0036, -0.0410, -0.0069], device='cuda:0'), grad: tensor([ 1.1735e-07, 6.6832e-06, -5.0850e-07, 9.1270e-07, 2.4028e-07, 9.1642e-07, -2.1048e-07, -1.0625e-05, 1.2368e-06, 1.2219e-06], device='cuda:0') 100 0.0001 changing lr epoch 312, time 248.01, cls_loss 0.0013 cls_loss_mapping 0.0015 cls_loss_causal 0.5104 re_mapping 0.0044 re_causal 0.0129 /// teacc 99.07 lr 0.00010000 Epoch 314, weight, value: tensor([[ 0.0322, -0.1705, -0.1580, ..., -0.3249, -0.1183, -0.1632], [ 0.0622, -0.0783, 0.0403, ..., 0.0498, 0.1192, -0.0585], [-0.0818, 0.1473, -0.2017, ..., 0.0548, 0.0832, -0.0472], ..., [-0.0753, -0.1014, -0.0815, ..., 0.0085, -0.1968, 0.1492], [ 0.0542, -0.0375, 0.0987, ..., 0.0045, -0.2365, -0.0228], [-0.2189, -0.1120, -0.1382, ..., -0.2504, 0.0685, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.5879e-09, 0.0000e+00, ..., 2.0489e-08, 3.3528e-08, 0.0000e+00], [ 0.0000e+00, 3.5763e-07, 7.4506e-09, ..., 7.7486e-07, 1.4491e-06, 0.0000e+00], [ 0.0000e+00, -5.3085e-07, 2.6077e-08, ..., -1.0375e-06, -1.8850e-06, 0.0000e+00], ..., [ 0.0000e+00, 9.6858e-08, 5.4017e-08, ..., 1.9185e-07, 6.2212e-07, -1.8626e-09], [ 0.0000e+00, 1.3039e-08, 9.3132e-09, ..., 2.6077e-08, 3.9116e-08, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 5.5879e-09, -1.1381e-06, 0.0000e+00]], device='cuda:0') Epoch 314, bias, value: tensor([-0.0115, -0.0363, 0.0041, -0.0132, 0.0128, 0.0100, 0.0287, 0.0048, -0.0424, -0.0066], device='cuda:0'), grad: tensor([ 5.0291e-08, 2.5444e-06, -3.4533e-06, -1.1176e-08, 4.9546e-06, 8.5682e-08, 7.8231e-08, 3.1702e-06, 1.8440e-07, -7.6145e-06], device='cuda:0') 100 0.0001 changing lr epoch 313, time 247.84, cls_loss 0.0010 cls_loss_mapping 0.0019 cls_loss_causal 0.4694 re_mapping 0.0046 re_causal 0.0134 /// teacc 98.97 lr 0.00010000 Epoch 315, weight, value: tensor([[ 0.0318, -0.1703, -0.1576, ..., -0.3253, -0.1184, -0.1632], [ 0.0624, -0.0783, 0.0403, ..., 0.0496, 0.1190, -0.0585], [-0.0798, 0.1475, -0.2013, ..., 0.0563, 0.0849, -0.0472], ..., [-0.0746, -0.1014, -0.0815, ..., 0.0087, -0.1972, 0.1492], [ 0.0541, -0.0378, 0.0990, ..., 0.0042, -0.2372, -0.0228], [-0.2189, -0.1124, -0.1391, ..., -0.2529, 0.0682, -0.1097]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 5.5879e-09, 0.0000e+00], [ 3.7253e-09, 0.0000e+00, -3.1665e-08, ..., -0.0000e+00, -2.6077e-08, 0.0000e+00], [ 1.8626e-09, -1.8626e-09, 3.7253e-09, ..., -0.0000e+00, -3.7253e-09, 0.0000e+00], ..., [ 3.7253e-09, 0.0000e+00, 1.1176e-08, ..., -2.2352e-08, 2.9802e-08, 0.0000e+00], [ 3.7253e-09, 0.0000e+00, -1.8626e-09, ..., 0.0000e+00, 2.0489e-08, 0.0000e+00], [ 1.6764e-08, 0.0000e+00, 1.8626e-09, ..., 3.7253e-09, 1.8626e-08, 0.0000e+00]], device='cuda:0') Epoch 315, bias, value: tensor([-0.0115, -0.0366, 0.0060, -0.0156, 0.0126, 0.0116, 0.0290, 0.0051, -0.0428, -0.0069], device='cuda:0'), grad: tensor([-5.5879e-09, -1.4901e-08, 2.6077e-08, 1.3039e-08, -4.1537e-07, 2.6636e-07, -2.0303e-07, 4.6566e-08, 8.5682e-08, 1.9372e-07], device='cuda:0') 100 0.0001 changing lr epoch 314, time 247.60, cls_loss 0.0012 cls_loss_mapping 0.0016 cls_loss_causal 0.4749 re_mapping 0.0044 re_causal 0.0129 /// teacc 99.13 lr 0.00010000 Epoch 316, weight, value: tensor([[ 0.0313, -0.1705, -0.1582, ..., -0.3256, -0.1184, -0.1632], [ 0.0623, -0.0786, 0.0402, ..., 0.0494, 0.1193, -0.0585], [-0.0799, 0.1480, -0.2022, ..., 0.0565, 0.0854, -0.0472], ..., [-0.0739, -0.1018, -0.0815, ..., 0.0088, -0.1977, 0.1493], [ 0.0544, -0.0378, 0.1017, ..., 0.0060, -0.2361, -0.0228], [-0.2199, -0.1124, -0.1415, ..., -0.2548, 0.0684, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.5879e-09, 0.0000e+00, ..., 5.5879e-09, 1.1176e-08, 0.0000e+00], [ 0.0000e+00, 1.1176e-08, -1.8626e-09, ..., 1.1176e-08, 1.6764e-08, 0.0000e+00], [ 0.0000e+00, -5.5879e-08, 3.7253e-09, ..., -1.3039e-07, -8.7544e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.6764e-08, 1.8626e-09, ..., 3.7253e-08, 3.1665e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -7.4506e-09, ..., -1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 1.8626e-09, -5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 316, bias, value: tensor([-0.0112, -0.0367, 0.0061, -0.0160, 0.0125, 0.0117, 0.0291, 0.0049, -0.0412, -0.0070], device='cuda:0'), grad: tensor([-7.7337e-06, 8.5682e-08, 6.5193e-08, 6.5193e-07, 2.8498e-07, -2.0675e-07, 5.4166e-06, 1.5646e-07, 3.5949e-07, 8.9407e-07], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 315---------------------------------------------------- epoch 315, time 262.52, cls_loss 0.0011 cls_loss_mapping 0.0018 cls_loss_causal 0.4901 re_mapping 0.0043 re_causal 0.0132 /// teacc 99.15 lr 0.00010000 Epoch 317, weight, value: tensor([[ 0.0311, -0.1708, -0.1592, ..., -0.3269, -0.1189, -0.1632], [ 0.0639, -0.0789, 0.0411, ..., 0.0502, 0.1209, -0.0586], [-0.0801, 0.1491, -0.2031, ..., 0.0564, 0.0860, -0.0472], ..., [-0.0741, -0.1035, -0.0823, ..., 0.0081, -0.1994, 0.1496], [ 0.0532, -0.0378, 0.1018, ..., 0.0059, -0.2368, -0.0229], [-0.2202, -0.1127, -0.1421, ..., -0.2574, 0.0688, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 5.9605e-08, ..., 2.4214e-08, -1.4901e-08, 0.0000e+00], [ 0.0000e+00, -3.7253e-09, 3.7253e-08, ..., 9.3132e-09, 2.7940e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 3.3528e-08, ..., 7.4506e-09, 1.4901e-08, -0.0000e+00], [ 0.0000e+00, 0.0000e+00, -4.5635e-07, ..., -1.1735e-07, -3.7253e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.4529e-07, ..., 7.4506e-09, 2.9802e-08, 0.0000e+00]], device='cuda:0') Epoch 317, bias, value: tensor([-0.0117, -0.0357, 0.0062, -0.0156, 0.0125, 0.0116, 0.0286, 0.0042, -0.0415, -0.0068], device='cuda:0'), grad: tensor([-1.0058e-07, 1.8254e-07, 1.9372e-07, -5.7742e-08, -1.7509e-07, 2.7940e-07, 2.2724e-07, 9.1270e-08, -1.1902e-06, 5.2899e-07], device='cuda:0') 100 0.0001 changing lr epoch 316, time 247.74, cls_loss 0.0010 cls_loss_mapping 0.0023 cls_loss_causal 0.4621 re_mapping 0.0044 re_causal 0.0130 /// teacc 99.07 lr 0.00010000 Epoch 318, weight, value: tensor([[ 0.0304, -0.1709, -0.1607, ..., -0.3272, -0.1195, -0.1633], [ 0.0653, -0.0792, 0.0415, ..., 0.0504, 0.1214, -0.0586], [-0.0802, 0.1492, -0.2039, ..., 0.0563, 0.0860, -0.0472], ..., [-0.0742, -0.1037, -0.0826, ..., 0.0080, -0.1999, 0.1497], [ 0.0524, -0.0374, 0.1025, ..., 0.0059, -0.2371, -0.0228], [-0.2210, -0.1128, -0.1428, ..., -0.2589, 0.0692, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 1.8626e-09, 2.4214e-08, 0.0000e+00], [-1.3039e-08, 1.8626e-09, -8.3819e-08, ..., -1.8626e-09, -1.0431e-07, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 8.5682e-08, ..., 7.0781e-08, 1.8626e-08, 0.0000e+00], ..., [ 1.8626e-09, -5.5879e-09, 4.0978e-08, ..., 9.3132e-09, 4.6566e-08, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, 2.0489e-08, 0.0000e+00], [ 1.8626e-09, 1.8626e-09, 7.4506e-09, ..., 7.4506e-09, 5.8450e-06, 0.0000e+00]], device='cuda:0') Epoch 318, bias, value: tensor([-0.0123, -0.0355, 0.0060, -0.0154, 0.0129, 0.0116, 0.0283, 0.0041, -0.0413, -0.0066], device='cuda:0'), grad: tensor([ 2.7940e-08, -1.9930e-07, 3.7812e-07, -7.8045e-07, -1.8775e-05, 2.8312e-07, 1.0617e-07, 1.7695e-07, 8.5682e-08, 1.8686e-05], device='cuda:0') 100 0.0001 changing lr epoch 317, time 247.55, cls_loss 0.0011 cls_loss_mapping 0.0021 cls_loss_causal 0.4965 re_mapping 0.0042 re_causal 0.0127 /// teacc 99.07 lr 0.00010000 Epoch 319, weight, value: tensor([[ 0.0300, -0.1713, -0.1606, ..., -0.3284, -0.1196, -0.1633], [ 0.0662, -0.0795, 0.0415, ..., 0.0503, 0.1212, -0.0586], [-0.0804, 0.1498, -0.2047, ..., 0.0565, 0.0866, -0.0471], ..., [-0.0751, -0.1037, -0.0826, ..., 0.0082, -0.2002, 0.1497], [ 0.0519, -0.0376, 0.1026, ..., 0.0053, -0.2384, -0.0229], [-0.2212, -0.1134, -0.1435, ..., -0.2603, 0.0692, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.7567e-07, 5.5879e-09, ..., 2.8312e-07, 3.4459e-07, 0.0000e+00], [ 0.0000e+00, 2.4214e-08, -2.2911e-07, ..., 1.0058e-07, -1.7136e-07, 0.0000e+00], [ 0.0000e+00, -8.5831e-06, 3.7253e-09, ..., -8.0168e-06, -6.9477e-06, 0.0000e+00], ..., [ 0.0000e+00, 6.1318e-06, -1.3039e-08, ..., 5.8971e-06, 6.2138e-06, 0.0000e+00], [ 0.0000e+00, 5.5879e-09, 1.1176e-08, ..., 1.3039e-08, 9.1456e-07, 0.0000e+00], [ 0.0000e+00, 9.3132e-09, 1.8626e-09, ..., 2.0489e-08, -3.3658e-06, 0.0000e+00]], device='cuda:0') Epoch 319, bias, value: tensor([-0.0123, -0.0358, 0.0064, -0.0153, 0.0134, 0.0113, 0.0290, 0.0043, -0.0415, -0.0067], device='cuda:0'), grad: tensor([ 1.3504e-06, 2.4959e-07, -3.3647e-05, 4.5002e-06, 8.0615e-06, 1.2852e-07, 6.4075e-07, 2.8253e-05, 3.5781e-06, -1.3113e-05], device='cuda:0') 100 0.0001 changing lr epoch 318, time 247.68, cls_loss 0.0010 cls_loss_mapping 0.0016 cls_loss_causal 0.4978 re_mapping 0.0045 re_causal 0.0134 /// teacc 99.14 lr 0.00010000 Epoch 320, weight, value: tensor([[ 0.0304, -0.1716, -0.1609, ..., -0.3288, -0.1196, -0.1634], [ 0.0667, -0.0795, 0.0390, ..., 0.0481, 0.1201, -0.0581], [-0.0821, 0.1531, -0.2047, ..., 0.0579, 0.0881, -0.0467], ..., [-0.0746, -0.1044, -0.0800, ..., 0.0104, -0.1993, 0.1497], [ 0.0511, -0.0418, 0.1029, ..., 0.0021, -0.2418, -0.0229], [-0.2218, -0.1136, -0.1450, ..., -0.2606, 0.0686, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.4622e-07, ..., -5.5879e-09, -1.4994e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.7695e-08, ..., 9.3132e-09, 3.7253e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 4.2841e-08, ..., 1.1176e-08, 3.5390e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., -3.7253e-09, 1.6764e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 5.5879e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 320, bias, value: tensor([-0.0122, -0.0384, 0.0077, -0.0151, 0.0154, 0.0111, 0.0286, 0.0066, -0.0425, -0.0074], device='cuda:0'), grad: tensor([-1.6550e-06, -3.9302e-07, 1.2293e-07, -1.6950e-07, 1.1083e-07, 2.3283e-07, 1.1893e-06, 1.0990e-07, 7.0781e-08, 3.7253e-07], device='cuda:0') 100 0.0001 changing lr epoch 319, time 247.65, cls_loss 0.0011 cls_loss_mapping 0.0020 cls_loss_causal 0.4931 re_mapping 0.0044 re_causal 0.0135 /// teacc 99.12 lr 0.00010000 Epoch 321, weight, value: tensor([[ 3.0378e-02, -1.7106e-01, -1.6109e-01, ..., -3.2938e-01, -1.1967e-01, -1.6352e-01], [ 6.6724e-02, -7.9626e-02, 4.1876e-02, ..., 5.1007e-02, 1.2290e-01, -5.8057e-02], [-8.2117e-02, 1.5526e-01, -2.0492e-01, ..., 5.9575e-02, 9.0294e-02, -4.6785e-02], ..., [-7.4841e-02, -1.0425e-01, -8.2905e-02, ..., 7.3642e-03, -2.0240e-01, 1.4974e-01], [ 5.1117e-02, -4.4042e-02, 1.0306e-01, ..., -2.9579e-04, -2.4396e-01, -2.3038e-02], [-2.2142e-01, -1.1395e-01, -1.4675e-01, ..., -2.6159e-01, 6.8663e-02, -1.0971e-01]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 2.0489e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.6077e-08, ..., 5.5879e-09, -2.6077e-08, 0.0000e+00], [ 0.0000e+00, -1.4529e-07, 2.2352e-08, ..., -1.9744e-07, -2.8312e-07, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 7.4506e-09, ..., -1.4901e-08, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-09, 3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 321, bias, value: tensor([-0.0123, -0.0355, 0.0092, -0.0147, 0.0154, 0.0119, 0.0273, 0.0037, -0.0439, -0.0076], device='cuda:0'), grad: tensor([-9.1828e-07, -1.1176e-08, -5.4017e-07, 5.1968e-07, 2.4214e-08, 1.0245e-07, 3.1292e-07, -1.0058e-07, 3.1665e-08, 5.7183e-07], device='cuda:0') 100 0.0001 changing lr epoch 320, time 247.58, cls_loss 0.0010 cls_loss_mapping 0.0018 cls_loss_causal 0.4941 re_mapping 0.0044 re_causal 0.0133 /// teacc 99.10 lr 0.00010000 Epoch 322, weight, value: tensor([[ 0.0303, -0.1707, -0.1615, ..., -0.3296, -0.1197, -0.1635], [ 0.0668, -0.0798, 0.0397, ..., 0.0495, 0.1212, -0.0581], [-0.0821, 0.1547, -0.2017, ..., 0.0601, 0.0934, -0.0468], ..., [-0.0749, -0.1017, -0.0811, ..., 0.0089, -0.2018, 0.1496], [ 0.0511, -0.0453, 0.1035, ..., -0.0017, -0.2454, -0.0231], [-0.2214, -0.1142, -0.1477, ..., -0.2626, 0.0687, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 8.5682e-08, ..., 3.7253e-09, 5.7742e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 5.5879e-08, -1.6764e-08, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 5.4017e-08, ..., -1.8626e-09, -4.8429e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., -1.0990e-07, 1.6764e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -3.4273e-07, ..., 1.4901e-08, -5.0291e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-08, ..., 1.1176e-08, -0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 322, bias, value: tensor([-0.0121, -0.0371, 0.0104, -0.0149, 0.0151, 0.0117, 0.0272, 0.0052, -0.0447, -0.0077], device='cuda:0'), grad: tensor([ 2.3842e-07, 1.6764e-07, -2.2352e-08, 1.0990e-07, 6.1467e-08, 2.4214e-07, 3.1665e-08, -5.6624e-07, -5.9232e-07, 3.2224e-07], device='cuda:0') 100 0.0001 changing lr epoch 321, time 247.45, cls_loss 0.0010 cls_loss_mapping 0.0021 cls_loss_causal 0.4571 re_mapping 0.0044 re_causal 0.0126 /// teacc 99.02 lr 0.00010000 Epoch 323, weight, value: tensor([[ 0.0305, -0.1709, -0.1618, ..., -0.3300, -0.1198, -0.1639], [ 0.0668, -0.0798, 0.0399, ..., 0.0497, 0.1215, -0.0579], [-0.0822, 0.1548, -0.2023, ..., 0.0594, 0.0930, -0.0468], ..., [-0.0749, -0.1017, -0.0812, ..., 0.0088, -0.2019, 0.1497], [ 0.0511, -0.0453, 0.1039, ..., -0.0019, -0.2455, -0.0231], [-0.2216, -0.1146, -0.1483, ..., -0.2638, 0.0687, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.0489e-08, ..., 2.4214e-08, 3.1665e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, -1.5542e-05, ..., -7.2084e-06, -1.3866e-05, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 1.0952e-05, ..., 5.8115e-06, 9.7528e-06, 1.6764e-08], ..., [ 0.0000e+00, 0.0000e+00, 3.9823e-06, ..., 9.2387e-07, 3.5185e-06, -1.4156e-07], [ 0.0000e+00, 0.0000e+00, -2.5332e-07, ..., -2.1234e-07, 1.8626e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 9.3132e-09, ..., 5.9605e-08, 4.2841e-08, 1.8626e-09]], device='cuda:0') Epoch 323, bias, value: tensor([-0.0122, -0.0369, 0.0100, -0.0149, 0.0150, 0.0116, 0.0275, 0.0051, -0.0449, -0.0078], device='cuda:0'), grad: tensor([ 1.6019e-07, -2.7955e-05, 2.3350e-05, 1.8291e-06, 2.4997e-06, 8.1956e-08, 3.3155e-07, -6.5193e-08, -6.5006e-07, 3.8370e-07], device='cuda:0') 100 0.0001 changing lr epoch 322, time 247.49, cls_loss 0.0009 cls_loss_mapping 0.0013 cls_loss_causal 0.4862 re_mapping 0.0045 re_causal 0.0134 /// teacc 98.97 lr 0.00010000 Epoch 324, weight, value: tensor([[ 0.0301, -0.1711, -0.1623, ..., -0.3303, -0.1198, -0.1643], [ 0.0668, -0.0794, 0.0399, ..., 0.0497, 0.1216, -0.0580], [-0.0834, 0.1548, -0.2025, ..., 0.0593, 0.0929, -0.0466], ..., [-0.0751, -0.1018, -0.0812, ..., 0.0089, -0.2019, 0.1504], [ 0.0497, -0.0453, 0.1045, ..., -0.0019, -0.2456, -0.0233], [-0.2219, -0.1146, -0.1486, ..., -0.2642, 0.0688, -0.1098]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -1.8626e-09, ..., 1.8626e-09, 2.9802e-07, 0.0000e+00], [ 0.0000e+00, -1.6764e-08, 3.7253e-09, ..., -1.8626e-08, -1.3039e-08, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-09, 5.5879e-09, ..., 1.1176e-08, 1.2144e-06, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 1.8626e-09, ..., 1.8626e-09, 1.3039e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, -1.5441e-06, 0.0000e+00]], device='cuda:0') Epoch 324, bias, value: tensor([-0.0122, -0.0370, 0.0098, -0.0149, 0.0130, 0.0117, 0.0277, 0.0054, -0.0450, -0.0077], device='cuda:0'), grad: tensor([-7.4506e-09, 8.6799e-07, -1.4901e-08, -3.7253e-08, 0.0000e+00, 6.1467e-08, 2.0489e-08, 3.4962e-06, 3.7253e-08, -4.4480e-06], device='cuda:0') 100 0.0001 changing lr epoch 323, time 247.87, cls_loss 0.0010 cls_loss_mapping 0.0020 cls_loss_causal 0.4634 re_mapping 0.0042 re_causal 0.0126 /// teacc 99.09 lr 0.00010000 Epoch 325, weight, value: tensor([[ 0.0301, -0.1712, -0.1625, ..., -0.3311, -0.1199, -0.1643], [ 0.0668, -0.0795, 0.0400, ..., 0.0497, 0.1217, -0.0580], [-0.0835, 0.1551, -0.2026, ..., 0.0597, 0.0931, -0.0468], ..., [-0.0753, -0.1017, -0.0813, ..., 0.0089, -0.2021, 0.1505], [ 0.0495, -0.0456, 0.1062, ..., -0.0021, -0.2462, -0.0235], [-0.2219, -0.1150, -0.1502, ..., -0.2641, 0.0690, -0.1098]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -3.7253e-09, ..., 1.8626e-09, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -1.3039e-08, ..., 0.0000e+00, -1.8626e-08, 0.0000e+00], [ 0.0000e+00, -7.4506e-09, 1.3039e-08, ..., -1.1176e-08, -2.6077e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 1.8626e-08, ..., 1.3039e-08, 2.0489e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 2.1234e-07, ..., 1.1176e-08, 3.9116e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-09, ..., 3.7253e-09, 7.4506e-09, 0.0000e+00]], device='cuda:0') Epoch 325, bias, value: tensor([-0.0122, -0.0369, 0.0100, -0.0154, 0.0131, 0.0119, 0.0273, 0.0053, -0.0447, -0.0074], device='cuda:0'), grad: tensor([-4.0978e-08, 1.8626e-08, -4.2841e-08, 1.4026e-06, 2.4214e-08, -1.7621e-06, -5.9791e-07, 4.8429e-08, 8.9407e-07, 5.5879e-08], device='cuda:0') 100 0.0001 changing lr epoch 324, time 247.76, cls_loss 0.0010 cls_loss_mapping 0.0012 cls_loss_causal 0.4645 re_mapping 0.0044 re_causal 0.0130 /// teacc 99.10 lr 0.00010000 Epoch 326, weight, value: tensor([[ 0.0301, -0.1713, -0.1628, ..., -0.3337, -0.1213, -0.1644], [ 0.0668, -0.0793, 0.0400, ..., 0.0498, 0.1222, -0.0577], [-0.0836, 0.1550, -0.2030, ..., 0.0586, 0.0924, -0.0468], ..., [-0.0753, -0.1018, -0.0812, ..., 0.0089, -0.2023, 0.1506], [ 0.0493, -0.0456, 0.1069, ..., -0.0022, -0.2463, -0.0238], [-0.2220, -0.1150, -0.1504, ..., -0.2633, 0.0703, -0.1098]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -5.6811e-06, ..., -1.5736e-05, -1.6168e-05, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 5.2527e-06, ..., 1.4417e-05, 1.4797e-05, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 4.6939e-07, ..., 1.2703e-06, 1.3225e-06, -3.7253e-09], [ 0.0000e+00, 0.0000e+00, 2.6077e-08, ..., 9.3132e-09, 1.8626e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.1176e-08, -5.0291e-08, 0.0000e+00]], device='cuda:0') Epoch 326, bias, value: tensor([-0.0135, -0.0369, 0.0093, -0.0155, 0.0132, 0.0124, 0.0264, 0.0053, -0.0447, -0.0063], device='cuda:0'), grad: tensor([-4.1202e-06, -2.6137e-05, 2.5034e-05, 1.0245e-06, 2.0862e-07, 1.5087e-07, 3.1106e-07, 2.1588e-06, 3.9116e-07, 9.8348e-07], device='cuda:0') 100 0.0001 changing lr epoch 325, time 247.57, cls_loss 0.0009 cls_loss_mapping 0.0018 cls_loss_causal 0.4830 re_mapping 0.0044 re_causal 0.0131 /// teacc 99.10 lr 0.00010000 Epoch 327, weight, value: tensor([[ 0.0301, -0.1714, -0.1630, ..., -0.3341, -0.1214, -0.1647], [ 0.0668, -0.0794, 0.0400, ..., 0.0498, 0.1222, -0.0577], [-0.0839, 0.1550, -0.2032, ..., 0.0587, 0.0925, -0.0469], ..., [-0.0753, -0.1018, -0.0813, ..., 0.0089, -0.2024, 0.1506], [ 0.0493, -0.0456, 0.1079, ..., -0.0018, -0.2464, -0.0240], [-0.2221, -0.1152, -0.1521, ..., -0.2643, 0.0703, -0.1098]], device='cuda:0'), grad: tensor([[-0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, -3.1665e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-08, 4.0978e-08, 0.0000e+00], [ 0.0000e+00, -1.3039e-08, 3.7253e-09, ..., -2.4214e-07, -2.5518e-07, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-09, -7.4506e-09, ..., 1.5274e-07, 2.0303e-07, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -3.7253e-09, ..., 3.3528e-08, 4.6566e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 7.4506e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 327, bias, value: tensor([-0.0132, -0.0369, 0.0093, -0.0153, 0.0132, 0.0120, 0.0269, 0.0053, -0.0445, -0.0064], device='cuda:0'), grad: tensor([-3.1851e-07, 3.2969e-07, -5.9232e-07, 9.4995e-08, -3.5390e-08, -5.9605e-08, 6.3330e-08, 3.8370e-07, 1.3597e-07, -1.8626e-09], device='cuda:0') 100 0.0001 changing lr epoch 326, time 247.67, cls_loss 0.0010 cls_loss_mapping 0.0019 cls_loss_causal 0.4919 re_mapping 0.0044 re_causal 0.0128 /// teacc 99.01 lr 0.00010000 Epoch 328, weight, value: tensor([[ 0.0299, -0.1713, -0.1635, ..., -0.3345, -0.1214, -0.1647], [ 0.0678, -0.0796, 0.0401, ..., 0.0497, 0.1224, -0.0575], [-0.0846, 0.1548, -0.2032, ..., 0.0585, 0.0925, -0.0469], ..., [-0.0750, -0.1014, -0.0813, ..., 0.0089, -0.2024, 0.1506], [ 0.0490, -0.0456, 0.1082, ..., -0.0016, -0.2468, -0.0241], [-0.2221, -0.1156, -0.1530, ..., -0.2667, 0.0704, -0.1098]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 8.5682e-08, 3.7253e-09, ..., 8.9407e-08, 8.1956e-08, 0.0000e+00], [ 0.0000e+00, 4.6380e-07, 3.3528e-08, ..., 5.1595e-07, 3.7253e-07, 0.0000e+00], [ 0.0000e+00, -2.3190e-06, 1.1176e-08, ..., -2.4345e-06, -1.8235e-06, 0.0000e+00], ..., [ 0.0000e+00, 4.8988e-07, 9.3132e-09, ..., 5.0850e-07, 4.0606e-07, 0.0000e+00], [ 0.0000e+00, 1.1679e-06, -7.0781e-08, ..., 1.1530e-06, 1.1232e-06, 0.0000e+00], [ 0.0000e+00, 1.3039e-08, 3.7253e-09, ..., 1.8626e-08, -5.0291e-08, 0.0000e+00]], device='cuda:0') Epoch 328, bias, value: tensor([-0.0130, -0.0369, 0.0091, -0.0151, 0.0130, 0.0115, 0.0264, 0.0055, -0.0446, -0.0070], device='cuda:0'), grad: tensor([ 2.7753e-07, 1.4435e-06, -6.6496e-06, 1.6205e-07, 2.0489e-07, 8.0094e-08, -6.3330e-07, 1.3858e-06, 3.8445e-06, -1.0617e-07], device='cuda:0') 100 0.0001 changing lr epoch 327, time 248.28, cls_loss 0.0009 cls_loss_mapping 0.0021 cls_loss_causal 0.4816 re_mapping 0.0044 re_causal 0.0131 /// teacc 98.96 lr 0.00010000 Epoch 329, weight, value: tensor([[ 0.0298, -0.1717, -0.1640, ..., -0.3348, -0.1214, -0.1647], [ 0.0678, -0.0800, 0.0401, ..., 0.0496, 0.1224, -0.0575], [-0.0846, 0.1559, -0.2033, ..., 0.0590, 0.0928, -0.0471], ..., [-0.0750, -0.1019, -0.0815, ..., 0.0086, -0.2025, 0.1506], [ 0.0486, -0.0465, 0.1087, ..., -0.0024, -0.2478, -0.0242], [-0.2222, -0.1158, -0.1533, ..., -0.2672, 0.0704, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.4214e-08, 0.0000e+00, ..., 1.8626e-08, 1.8626e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -3.5390e-08, ..., -1.4901e-08, -4.0978e-08, 0.0000e+00], [ 0.0000e+00, -1.1548e-07, 7.4506e-09, ..., -8.7544e-08, -8.1956e-08, 0.0000e+00], ..., [ 0.0000e+00, 2.0489e-08, 1.4901e-08, ..., 2.4214e-08, 2.9802e-08, 0.0000e+00], [ 0.0000e+00, 5.9605e-08, 1.1176e-08, ..., 5.0291e-08, 5.2154e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 7.4506e-09, 0.0000e+00]], device='cuda:0') Epoch 329, bias, value: tensor([-0.0129, -0.0370, 0.0093, -0.0123, 0.0129, 0.0115, 0.0262, 0.0052, -0.0452, -0.0069], device='cuda:0'), grad: tensor([ 1.4342e-07, -6.5193e-08, -6.5751e-07, -8.5682e-08, -8.0094e-08, 9.6858e-08, 3.3528e-08, 1.5646e-07, 4.0047e-07, 5.5879e-08], device='cuda:0') 100 0.0001 changing lr epoch 328, time 248.18, cls_loss 0.0009 cls_loss_mapping 0.0012 cls_loss_causal 0.4756 re_mapping 0.0040 re_causal 0.0125 /// teacc 99.05 lr 0.00010000 Epoch 330, weight, value: tensor([[ 0.0298, -0.1719, -0.1645, ..., -0.3352, -0.1214, -0.1647], [ 0.0679, -0.0801, 0.0403, ..., 0.0497, 0.1226, -0.0575], [-0.0854, 0.1559, -0.2035, ..., 0.0591, 0.0930, -0.0471], ..., [-0.0752, -0.1019, -0.0816, ..., 0.0085, -0.2027, 0.1506], [ 0.0490, -0.0465, 0.1093, ..., -0.0023, -0.2480, -0.0242], [-0.2222, -0.1163, -0.1533, ..., -0.2697, 0.0707, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.1176e-08, 4.0978e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.6764e-08, ..., -9.3132e-09, -3.1665e-08, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, 9.3132e-09, ..., -4.0978e-08, -1.0803e-07, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 9.3132e-09, ..., 3.7253e-09, 1.6764e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -6.1467e-08, ..., 7.4506e-09, 8.5682e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 3.7253e-09, 1.3039e-08, 0.0000e+00]], device='cuda:0') Epoch 330, bias, value: tensor([-0.0128, -0.0369, 0.0093, -0.0124, 0.0125, 0.0109, 0.0261, 0.0054, -0.0453, -0.0081], device='cuda:0'), grad: tensor([ 8.1956e-08, -2.9802e-08, -1.3039e-07, 0.0000e+00, -1.4901e-07, 2.2352e-07, -2.6450e-07, 2.6077e-08, 5.4017e-08, 1.8068e-07], device='cuda:0') 100 0.0001 changing lr epoch 329, time 247.70, cls_loss 0.0009 cls_loss_mapping 0.0013 cls_loss_causal 0.4785 re_mapping 0.0043 re_causal 0.0128 /// teacc 99.07 lr 0.00010000 Epoch 331, weight, value: tensor([[ 0.0289, -0.1723, -0.1648, ..., -0.3361, -0.1215, -0.1649], [ 0.0680, -0.0809, 0.0403, ..., 0.0496, 0.1225, -0.0575], [-0.0852, 0.1561, -0.2034, ..., 0.0592, 0.0934, -0.0471], ..., [-0.0754, -0.1019, -0.0817, ..., 0.0086, -0.2027, 0.1506], [ 0.0494, -0.0466, 0.1099, ..., -0.0021, -0.2481, -0.0242], [-0.2226, -0.1167, -0.1533, ..., -0.2703, 0.0707, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.0781e-08, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 2.5369e-06, -1.8626e-09, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 7.4506e-09, ..., 1.1176e-08, 1.8626e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., -3.6657e-06, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 5.5879e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.7055e-08, -5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 331, bias, value: tensor([-0.0129, -0.0370, 0.0098, -0.0124, 0.0126, 0.0109, 0.0262, 0.0055, -0.0452, -0.0082], device='cuda:0'), grad: tensor([ 5.4389e-07, 1.8984e-05, 1.6950e-07, 5.5283e-06, 1.3784e-07, 1.6708e-06, -1.2107e-07, -2.7448e-05, 7.8231e-08, 4.5076e-07], device='cuda:0') 100 0.0001 changing lr epoch 330, time 247.69, cls_loss 0.0009 cls_loss_mapping 0.0017 cls_loss_causal 0.4553 re_mapping 0.0044 re_causal 0.0124 /// teacc 99.02 lr 0.00010000 Epoch 332, weight, value: tensor([[ 0.0288, -0.1724, -0.1652, ..., -0.3365, -0.1217, -0.1649], [ 0.0684, -0.0815, 0.0403, ..., 0.0496, 0.1223, -0.0574], [-0.0856, 0.1565, -0.2034, ..., 0.0594, 0.0937, -0.0472], ..., [-0.0753, -0.1022, -0.0817, ..., 0.0086, -0.2028, 0.1507], [ 0.0494, -0.0466, 0.1101, ..., -0.0021, -0.2490, -0.0243], [-0.2227, -0.1176, -0.1538, ..., -0.2714, 0.0708, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 7.6368e-08, ..., 2.9802e-08, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 7.6368e-08, ..., 5.9605e-08, -3.7253e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-09, 3.7625e-07, ..., 1.0058e-07, -2.7940e-08, 0.0000e+00], ..., [ 0.0000e+00, 7.2643e-08, 2.5369e-06, ..., 9.3691e-07, 3.1665e-08, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 7.0781e-08, ..., 4.0978e-08, 2.4214e-08, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 1.1176e-07, ..., 6.5193e-08, -1.0431e-07, 0.0000e+00]], device='cuda:0') Epoch 332, bias, value: tensor([-0.0133, -0.0371, 0.0101, -0.0125, 0.0121, 0.0113, 0.0279, 0.0055, -0.0459, -0.0082], device='cuda:0'), grad: tensor([ 1.8813e-07, 3.0734e-07, 7.6741e-07, -8.1584e-06, 4.2841e-08, 1.3895e-06, 1.0617e-07, 5.5283e-06, 4.1164e-07, -6.1095e-07], device='cuda:0') 100 0.0001 changing lr epoch 331, time 247.70, cls_loss 0.0008 cls_loss_mapping 0.0013 cls_loss_causal 0.4906 re_mapping 0.0043 re_causal 0.0135 /// teacc 99.07 lr 0.00010000 Epoch 333, weight, value: tensor([[ 0.0287, -0.1727, -0.1651, ..., -0.3369, -0.1218, -0.1650], [ 0.0686, -0.0817, 0.0404, ..., 0.0497, 0.1225, -0.0573], [-0.0863, 0.1565, -0.2035, ..., 0.0594, 0.0937, -0.0472], ..., [-0.0754, -0.1025, -0.0818, ..., 0.0084, -0.2031, 0.1507], [ 0.0494, -0.0466, 0.1103, ..., -0.0020, -0.2495, -0.0243], [-0.2229, -0.1176, -0.1542, ..., -0.2718, 0.0709, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -1.4715e-07, 1.8626e-09, ..., 1.6764e-08, 1.1176e-08, 0.0000e+00], [-0.0000e+00, 1.1176e-08, 2.9244e-07, ..., 1.0431e-06, -2.1793e-07, 0.0000e+00], [ 7.0781e-08, -0.0000e+00, 9.9093e-07, ..., 1.2629e-06, 2.4587e-07, 0.0000e+00], ..., [ 0.0000e+00, 3.7253e-09, -4.2096e-07, ..., -1.1344e-06, 1.1176e-08, 0.0000e+00], [-7.8231e-08, 7.4506e-09, -1.0822e-06, ..., -1.4864e-06, -3.6694e-07, 0.0000e+00], [ 0.0000e+00, 1.0431e-07, 1.8626e-09, ..., 1.4901e-08, 1.1176e-08, 0.0000e+00]], device='cuda:0') Epoch 333, bias, value: tensor([-0.0133, -0.0370, 0.0101, -0.0114, 0.0121, 0.0107, 0.0280, 0.0054, -0.0462, -0.0082], device='cuda:0'), grad: tensor([-1.5367e-06, 3.3341e-06, 2.5984e-06, 5.1409e-07, 3.9116e-08, 9.6858e-08, 4.3400e-07, -3.6657e-06, -3.0231e-06, 1.1958e-06], device='cuda:0') 100 0.0001 changing lr epoch 332, time 247.57, cls_loss 0.0008 cls_loss_mapping 0.0011 cls_loss_causal 0.4700 re_mapping 0.0043 re_causal 0.0125 /// teacc 99.04 lr 0.00010000 Epoch 334, weight, value: tensor([[ 0.0287, -0.1726, -0.1654, ..., -0.3372, -0.1218, -0.1650], [ 0.0687, -0.0825, 0.0404, ..., 0.0497, 0.1227, -0.0568], [-0.0868, 0.1565, -0.2037, ..., 0.0590, 0.0935, -0.0473], ..., [-0.0747, -0.1023, -0.0818, ..., 0.0085, -0.2031, 0.1508], [ 0.0495, -0.0466, 0.1108, ..., -0.0018, -0.2494, -0.0243], [-0.2236, -0.1180, -0.1547, ..., -0.2728, 0.0709, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 5.5879e-09, 9.3132e-09, 0.0000e+00], [-1.8626e-08, 0.0000e+00, -1.9185e-07, ..., -6.3330e-08, -2.1048e-07, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, 2.4214e-08, ..., 1.6764e-08, 1.3039e-08, 0.0000e+00], ..., [ 1.3039e-08, 0.0000e+00, 1.1921e-07, ..., 2.2352e-08, 9.8720e-08, -0.0000e+00], [ 0.0000e+00, -0.0000e+00, -4.6566e-08, ..., -2.4214e-08, -5.5879e-09, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 1.8626e-08, ..., 2.0489e-08, -2.2352e-08, 0.0000e+00]], device='cuda:0') Epoch 334, bias, value: tensor([-0.0133, -0.0370, 0.0098, -0.0113, 0.0124, 0.0108, 0.0275, 0.0055, -0.0459, -0.0086], device='cuda:0'), grad: tensor([-3.7253e-09, -4.3027e-07, 8.1956e-08, -5.2154e-07, 1.0617e-07, 4.8615e-07, 2.2352e-07, 0.0000e+00, -1.3411e-07, 1.6950e-07], device='cuda:0') 100 0.0001 changing lr epoch 333, time 247.68, cls_loss 0.0011 cls_loss_mapping 0.0020 cls_loss_causal 0.4812 re_mapping 0.0043 re_causal 0.0124 /// teacc 99.08 lr 0.00010000 Epoch 335, weight, value: tensor([[ 0.0285, -0.1736, -0.1660, ..., -0.3377, -0.1218, -0.1650], [ 0.0687, -0.0835, 0.0404, ..., 0.0496, 0.1226, -0.0565], [-0.0871, 0.1572, -0.2039, ..., 0.0589, 0.0936, -0.0474], ..., [-0.0750, -0.1025, -0.0818, ..., 0.0086, -0.2032, 0.1509], [ 0.0497, -0.0466, 0.1123, ..., -0.0017, -0.2494, -0.0243], [-0.2250, -0.1178, -0.1556, ..., -0.2735, 0.0707, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 6.3330e-08, ..., 1.3039e-08, -1.3039e-07, 0.0000e+00], [ 0.0000e+00, 2.4214e-08, 1.4901e-07, ..., 2.4214e-08, 5.0291e-08, 0.0000e+00], [ 0.0000e+00, -6.6869e-07, 3.1665e-08, ..., -7.5437e-07, -1.4026e-06, 0.0000e+00], ..., [ 0.0000e+00, 3.7998e-07, 1.9558e-07, ..., 4.2841e-07, 8.0094e-07, 0.0000e+00], [ 0.0000e+00, 1.5832e-07, -7.5623e-07, ..., 1.7509e-07, 3.3900e-07, 0.0000e+00], [ 0.0000e+00, 8.3819e-08, 2.1793e-07, ..., 8.5682e-08, 1.6205e-07, 0.0000e+00]], device='cuda:0') Epoch 335, bias, value: tensor([-0.0129, -0.0371, 0.0099, -0.0109, 0.0130, 0.0094, 0.0267, 0.0056, -0.0453, -0.0090], device='cuda:0'), grad: tensor([-7.5065e-07, 8.9966e-07, -3.6377e-06, 2.3842e-07, 5.4017e-08, 1.9185e-07, 8.5495e-07, 3.0808e-06, -2.6468e-06, 1.7136e-06], device='cuda:0') 100 0.0001 changing lr epoch 334, time 248.19, cls_loss 0.0009 cls_loss_mapping 0.0015 cls_loss_causal 0.4751 re_mapping 0.0042 re_causal 0.0124 /// teacc 99.09 lr 0.00010000 Epoch 336, weight, value: tensor([[ 0.0285, -0.1735, -0.1657, ..., -0.3378, -0.1217, -0.1650], [ 0.0688, -0.0837, 0.0402, ..., 0.0495, 0.1216, -0.0561], [-0.0874, 0.1573, -0.2030, ..., 0.0597, 0.0957, -0.0475], ..., [-0.0751, -0.1026, -0.0817, ..., 0.0086, -0.2032, 0.1509], [ 0.0499, -0.0468, 0.1127, ..., -0.0018, -0.2498, -0.0243], [-0.2261, -0.1178, -0.1560, ..., -0.2738, 0.0712, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 6.5193e-08, ..., 2.0303e-07, -1.3039e-08, 0.0000e+00], [ 3.7253e-09, -9.3132e-09, 1.1176e-08, ..., -1.1176e-08, -1.3039e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, -7.4506e-08, ..., -2.2352e-07, 1.4901e-08, 0.0000e+00], [-3.7253e-09, 5.5879e-09, -1.8626e-09, ..., 1.1176e-08, 2.2352e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 1.1176e-08, -5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 336, bias, value: tensor([-0.0124, -0.0376, 0.0118, -0.0109, 0.0123, 0.0095, 0.0263, 0.0057, -0.0454, -0.0088], device='cuda:0'), grad: tensor([-3.9116e-08, 5.1595e-07, 7.4506e-09, 2.9802e-08, 8.3819e-08, 2.4214e-08, -2.0675e-07, -5.3830e-07, 8.0094e-08, 5.2154e-08], device='cuda:0') 100 0.0001 changing lr epoch 335, time 247.49, cls_loss 0.0008 cls_loss_mapping 0.0014 cls_loss_causal 0.4872 re_mapping 0.0042 re_causal 0.0125 /// teacc 99.01 lr 0.00010000 Epoch 337, weight, value: tensor([[ 0.0283, -0.1739, -0.1665, ..., -0.3381, -0.1217, -0.1651], [ 0.0690, -0.0839, 0.0403, ..., 0.0495, 0.1218, -0.0561], [-0.0879, 0.1572, -0.2034, ..., 0.0594, 0.0956, -0.0476], ..., [-0.0756, -0.1026, -0.0818, ..., 0.0086, -0.2034, 0.1509], [ 0.0499, -0.0468, 0.1133, ..., -0.0017, -0.2501, -0.0244], [-0.2271, -0.1183, -0.1563, ..., -0.2740, 0.0717, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 7.0781e-08, ..., 3.7253e-09, 2.3469e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -4.4890e-07, ..., 1.3597e-07, -1.4994e-06, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.0990e-07, ..., 1.9930e-07, 2.4773e-07, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 2.2352e-08, ..., -3.7625e-07, -5.4017e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3039e-08, ..., 0.0000e+00, 7.6368e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 3.7253e-09, -4.2841e-08, 0.0000e+00]], device='cuda:0') Epoch 337, bias, value: tensor([-0.0123, -0.0375, 0.0115, -0.0109, 0.0117, 0.0095, 0.0260, 0.0057, -0.0452, -0.0085], device='cuda:0'), grad: tensor([ 4.4890e-07, -2.7865e-06, 9.7416e-07, 2.4214e-08, 1.0431e-07, 4.4703e-08, 1.9632e-06, -8.9034e-07, 2.3469e-07, -1.3784e-07], device='cuda:0') 100 0.0001 changing lr epoch 336, time 247.78, cls_loss 0.0010 cls_loss_mapping 0.0012 cls_loss_causal 0.4638 re_mapping 0.0042 re_causal 0.0123 /// teacc 98.98 lr 0.00010000 Epoch 338, weight, value: tensor([[ 0.0282, -0.1741, -0.1666, ..., -0.3389, -0.1218, -0.1651], [ 0.0689, -0.0841, 0.0399, ..., 0.0493, 0.1213, -0.0561], [-0.0879, 0.1573, -0.2036, ..., 0.0595, 0.0957, -0.0471], ..., [-0.0755, -0.1026, -0.0818, ..., 0.0086, -0.2035, 0.1509], [ 0.0498, -0.0469, 0.1166, ..., 0.0012, -0.2472, -0.0244], [-0.2274, -0.1186, -0.1569, ..., -0.2744, 0.0715, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, -1.0841e-05, ..., 3.7253e-09, 2.2724e-07, 3.1665e-08], [ 0.0000e+00, 1.3039e-08, 1.8626e-08, ..., 1.6764e-08, 2.2352e-08, 7.4506e-09], [ 0.0000e+00, -1.4417e-06, 2.6077e-08, ..., -2.2687e-06, -2.5071e-06, 9.3132e-09], ..., [ 0.0000e+00, 2.7567e-07, 1.4901e-08, ..., 5.1782e-07, 4.4331e-07, 0.0000e+00], [ 0.0000e+00, 1.1418e-06, 1.8664e-06, ..., 1.7192e-06, 2.1867e-06, 2.0489e-08], [ 0.0000e+00, 0.0000e+00, 8.0392e-06, ..., 1.8626e-09, -3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 338, bias, value: tensor([-0.0119, -0.0379, 0.0115, -0.0114, 0.0122, 0.0094, 0.0254, 0.0057, -0.0423, -0.0088], device='cuda:0'), grad: tensor([-5.8383e-05, 2.7381e-07, -4.8615e-06, 4.8615e-07, 1.2107e-07, 6.1095e-06, -3.1628e-06, 1.1288e-06, 1.4454e-05, 4.3780e-05], device='cuda:0') 100 0.0001 changing lr epoch 337, time 247.75, cls_loss 0.0012 cls_loss_mapping 0.0022 cls_loss_causal 0.4852 re_mapping 0.0043 re_causal 0.0125 /// teacc 99.02 lr 0.00010000 Epoch 339, weight, value: tensor([[ 0.0288, -0.1755, -0.1649, ..., -0.3397, -0.1218, -0.1652], [ 0.0696, -0.0852, 0.0383, ..., 0.0478, 0.1205, -0.0562], [-0.0898, 0.1575, -0.2040, ..., 0.0595, 0.0956, -0.0471], ..., [-0.0760, -0.1028, -0.0802, ..., 0.0101, -0.2029, 0.1511], [ 0.0494, -0.0469, 0.1166, ..., 0.0012, -0.2474, -0.0246], [-0.2279, -0.1178, -0.1591, ..., -0.2750, 0.0720, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 3.7253e-09, ..., 3.7253e-09, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 7.4506e-09, -7.4506e-09, ..., -1.3039e-08, -2.6077e-08, 0.0000e+00], [ 0.0000e+00, 5.5879e-09, 1.4156e-07, ..., 5.4017e-08, 1.6764e-08, 0.0000e+00], ..., [ 0.0000e+00, -2.8126e-07, 2.9802e-08, ..., -0.0000e+00, 7.4506e-09, 0.0000e+00], [-1.8626e-09, 5.5879e-09, -8.3260e-07, ..., -2.7753e-07, -7.4506e-08, 0.0000e+00], [ 0.0000e+00, 2.4401e-07, 1.6391e-07, ..., 5.7742e-08, -2.2352e-08, 0.0000e+00]], device='cuda:0') Epoch 339, bias, value: tensor([-0.0109, -0.0395, 0.0113, -0.0112, 0.0136, 0.0093, 0.0249, 0.0072, -0.0424, -0.0093], device='cuda:0'), grad: tensor([ 4.2841e-08, 8.9407e-08, 4.5821e-07, 3.4459e-07, 4.8243e-07, 8.5682e-07, 5.9605e-08, -5.7966e-06, -2.0396e-06, 5.4799e-06], device='cuda:0') 100 0.0001 changing lr epoch 338, time 247.59, cls_loss 0.0010 cls_loss_mapping 0.0019 cls_loss_causal 0.4913 re_mapping 0.0045 re_causal 0.0138 /// teacc 99.06 lr 0.00010000 Epoch 340, weight, value: tensor([[ 0.0287, -0.1782, -0.1650, ..., -0.3403, -0.1219, -0.1667], [ 0.0699, -0.0862, 0.0372, ..., 0.0468, 0.1203, -0.0556], [-0.0903, 0.1573, -0.2042, ..., 0.0591, 0.0956, -0.0473], ..., [-0.0763, -0.1021, -0.0790, ..., 0.0112, -0.2027, 0.1511], [ 0.0498, -0.0469, 0.1167, ..., 0.0012, -0.2476, -0.0250], [-0.2281, -0.1174, -0.1601, ..., -0.2761, 0.0727, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -5.5879e-09, ..., 3.7253e-09, -7.4506e-09, 0.0000e+00], [ 0.0000e+00, 6.4075e-07, 0.0000e+00, ..., 1.4231e-06, 1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, -6.5193e-07, 0.0000e+00, ..., -1.4491e-06, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 2.2352e-08, ..., 1.3039e-08, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 340, bias, value: tensor([-0.0111, -0.0406, 0.0109, -0.0127, 0.0133, 0.0113, 0.0252, 0.0083, -0.0425, -0.0092], device='cuda:0'), grad: tensor([ 2.4214e-08, 1.4901e-08, 5.6624e-06, 1.5218e-06, 3.9116e-08, -1.7174e-06, 1.3039e-08, -5.7817e-06, 1.7881e-07, 4.4703e-08], device='cuda:0') 100 0.0001 changing lr epoch 339, time 247.49, cls_loss 0.0009 cls_loss_mapping 0.0014 cls_loss_causal 0.4607 re_mapping 0.0044 re_causal 0.0126 /// teacc 99.12 lr 0.00010000 Epoch 341, weight, value: tensor([[ 0.0275, -0.1787, -0.1656, ..., -0.3419, -0.1220, -0.1667], [ 0.0699, -0.0863, 0.0370, ..., 0.0467, 0.1203, -0.0555], [-0.0921, 0.1575, -0.2045, ..., 0.0592, 0.0958, -0.0473], ..., [-0.0766, -0.1022, -0.0789, ..., 0.0113, -0.2028, 0.1512], [ 0.0519, -0.0469, 0.1171, ..., 0.0016, -0.2476, -0.0257], [-0.2287, -0.1176, -0.1607, ..., -0.2764, 0.0730, -0.1099]], device='cuda:0'), grad: tensor([[ 4.0047e-07, 9.3132e-10, 2.7940e-09, ..., 7.6815e-06, 9.0972e-06, 0.0000e+00], [ 5.5879e-09, 9.3132e-10, -6.5193e-09, ..., 1.1921e-07, 1.2014e-07, 0.0000e+00], [-5.7463e-07, -1.3039e-08, 1.6764e-08, ..., -1.1042e-05, -1.3083e-05, 0.0000e+00], ..., [ 1.8626e-08, 1.0245e-08, 1.1176e-08, ..., 3.5390e-07, 5.0385e-07, 0.0000e+00], [ 1.2387e-07, 0.0000e+00, -5.8673e-08, ..., 2.3749e-06, 2.8145e-06, 0.0000e+00], [ 2.7940e-09, 0.0000e+00, 4.0978e-08, ..., 7.2643e-08, 8.0094e-08, 0.0000e+00]], device='cuda:0') Epoch 341, bias, value: tensor([-0.0112, -0.0407, 0.0108, -0.0134, 0.0123, 0.0118, 0.0255, 0.0085, -0.0422, -0.0089], device='cuda:0'), grad: tensor([ 2.4214e-05, 3.9767e-07, -3.4779e-05, 5.5507e-07, -6.5006e-07, 1.1455e-07, 5.9512e-07, 1.2182e-06, 7.5623e-06, 7.7114e-07], device='cuda:0') 100 0.0001 changing lr epoch 340, time 247.54, cls_loss 0.0010 cls_loss_mapping 0.0022 cls_loss_causal 0.4804 re_mapping 0.0042 re_causal 0.0124 /// teacc 98.99 lr 0.00010000 Epoch 342, weight, value: tensor([[ 0.0272, -0.1797, -0.1650, ..., -0.3447, -0.1222, -0.1672], [ 0.0701, -0.0871, 0.0370, ..., 0.0466, 0.1203, -0.0555], [-0.0923, 0.1576, -0.2049, ..., 0.0592, 0.0960, -0.0475], ..., [-0.0778, -0.1024, -0.0789, ..., 0.0113, -0.2029, 0.1512], [ 0.0520, -0.0468, 0.1173, ..., 0.0019, -0.2477, -0.0256], [-0.2294, -0.1176, -0.1622, ..., -0.2771, 0.0712, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 9.3132e-10, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.7940e-09, ..., -4.6566e-09, -2.8871e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, -9.3132e-10, ..., 9.3132e-10, 1.1176e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.9558e-08, ..., -7.9162e-08, 1.3970e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.4901e-08, ..., 2.7940e-09, -8.3819e-09, 0.0000e+00]], device='cuda:0') Epoch 342, bias, value: tensor([-0.0110, -0.0408, 0.0108, -0.0133, 0.0147, 0.0119, 0.0259, 0.0086, -0.0422, -0.0106], device='cuda:0'), grad: tensor([ 3.8184e-08, 9.1270e-08, 2.1420e-08, -2.5146e-08, 5.9605e-08, 9.0338e-08, 6.0536e-08, -2.8964e-07, 8.9407e-08, -1.4249e-07], device='cuda:0') 100 0.0001 changing lr epoch 341, time 247.67, cls_loss 0.0010 cls_loss_mapping 0.0017 cls_loss_causal 0.4559 re_mapping 0.0044 re_causal 0.0120 /// teacc 98.98 lr 0.00010000 Epoch 343, weight, value: tensor([[ 0.0273, -0.1804, -0.1650, ..., -0.3453, -0.1222, -0.1672], [ 0.0710, -0.0882, 0.0371, ..., 0.0466, 0.1206, -0.0557], [-0.0928, 0.1559, -0.2053, ..., 0.0576, 0.0957, -0.0474], ..., [-0.0778, -0.1000, -0.0789, ..., 0.0116, -0.2030, 0.1515], [ 0.0522, -0.0471, 0.1173, ..., 0.0017, -0.2478, -0.0257], [-0.2295, -0.1178, -0.1627, ..., -0.2784, 0.0717, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 9.3132e-10, ..., 3.7253e-09, 5.9605e-08, 0.0000e+00], [ 1.8626e-09, 7.8231e-08, -2.0675e-07, ..., 3.9116e-08, -2.7008e-08, 0.0000e+00], [ 0.0000e+00, -9.9652e-08, 4.9360e-08, ..., -5.5879e-08, -4.9360e-08, 0.0000e+00], ..., [-3.7253e-09, 1.6764e-08, 1.5926e-07, ..., -6.6124e-08, 1.7323e-07, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 5.5879e-09, ..., 1.0245e-08, 2.0489e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., -0.0000e+00, -4.9360e-07, 0.0000e+00]], device='cuda:0') Epoch 343, bias, value: tensor([-0.0109, -0.0408, 0.0093, -0.0133, 0.0139, 0.0114, 0.0257, 0.0089, -0.0423, -0.0105], device='cuda:0'), grad: tensor([ 1.7975e-07, 2.3283e-08, 1.3504e-07, 2.8219e-07, 1.3132e-07, 7.1339e-07, 2.3283e-08, -1.0338e-07, 8.6613e-08, -1.4836e-06], device='cuda:0') 100 0.0001 changing lr epoch 342, time 247.61, cls_loss 0.0012 cls_loss_mapping 0.0019 cls_loss_causal 0.4983 re_mapping 0.0042 re_causal 0.0127 /// teacc 98.98 lr 0.00010000 Epoch 344, weight, value: tensor([[ 0.0267, -0.1816, -0.1667, ..., -0.3469, -0.1224, -0.1672], [ 0.0716, -0.0902, 0.0372, ..., 0.0467, 0.1210, -0.0551], [-0.0929, 0.1550, -0.2057, ..., 0.0566, 0.0952, -0.0475], ..., [-0.0783, -0.0986, -0.0789, ..., 0.0122, -0.2028, 0.1521], [ 0.0517, -0.0475, 0.1170, ..., 0.0013, -0.2482, -0.0258], [-0.2296, -0.1179, -0.1630, ..., -0.2792, 0.0736, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.7940e-09, 1.8626e-09, ..., 7.4506e-09, 1.0245e-08, 0.0000e+00], [ 0.0000e+00, 1.3970e-08, -4.5635e-07, ..., -3.0175e-07, -3.2969e-07, 0.0000e+00], [ 0.0000e+00, -5.4017e-08, 1.2107e-08, ..., -1.3411e-07, -1.6019e-07, 0.0000e+00], ..., [ 0.0000e+00, 3.0734e-08, 4.2748e-07, ..., 3.8836e-07, 4.1910e-07, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, -5.0291e-08, ..., -4.6566e-09, 2.9802e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 9.3132e-10, ..., 1.1176e-08, -9.3132e-09, 0.0000e+00]], device='cuda:0') Epoch 344, bias, value: tensor([-0.0121, -0.0406, 0.0082, -0.0135, 0.0117, 0.0122, 0.0258, 0.0096, -0.0429, -0.0091], device='cuda:0'), grad: tensor([ 2.9802e-08, -8.4657e-07, -4.2468e-07, 3.6322e-08, 4.7497e-08, 1.0524e-07, 4.0047e-08, 1.1185e-06, 6.5193e-09, -9.9652e-08], device='cuda:0') 100 0.0001 changing lr epoch 343, time 247.60, cls_loss 0.0010 cls_loss_mapping 0.0017 cls_loss_causal 0.4735 re_mapping 0.0043 re_causal 0.0126 /// teacc 98.97 lr 0.00010000 Epoch 345, weight, value: tensor([[ 0.0270, -0.1826, -0.1668, ..., -0.3477, -0.1225, -0.1673], [ 0.0719, -0.0913, 0.0381, ..., 0.0478, 0.1230, -0.0552], [-0.0930, 0.1529, -0.2058, ..., 0.0545, 0.0939, -0.0476], ..., [-0.0783, -0.0964, -0.0798, ..., 0.0127, -0.2036, 0.1529], [ 0.0518, -0.0477, 0.1170, ..., 0.0012, -0.2484, -0.0258], [-0.2298, -0.1180, -0.1636, ..., -0.2794, 0.0736, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -6.5193e-09, ..., 4.6566e-09, -3.7253e-09, 0.0000e+00], [ 0.0000e+00, -1.5832e-08, 1.8626e-09, ..., 4.3772e-08, -1.6764e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.5832e-08, 4.6566e-09, ..., -4.3772e-08, 2.3283e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 1.2107e-08, 0.0000e+00]], device='cuda:0') Epoch 345, bias, value: tensor([-0.0110, -0.0398, 0.0062, -0.0134, 0.0103, 0.0124, 0.0253, 0.0106, -0.0431, -0.0101], device='cuda:0'), grad: tensor([-1.2480e-07, 2.5146e-08, 1.3690e-07, -6.6124e-08, -8.3819e-08, 6.0536e-08, 1.6764e-08, -1.3318e-07, 4.6566e-09, 1.5926e-07], device='cuda:0') 100 0.0001 changing lr epoch 344, time 247.18, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4851 re_mapping 0.0043 re_causal 0.0130 /// teacc 99.02 lr 0.00010000 Epoch 346, weight, value: tensor([[ 0.0273, -0.1828, -0.1671, ..., -0.3479, -0.1221, -0.1673], [ 0.0720, -0.0910, 0.0383, ..., 0.0480, 0.1233, -0.0554], [-0.0943, 0.1530, -0.2063, ..., 0.0545, 0.0939, -0.0476], ..., [-0.0794, -0.0965, -0.0799, ..., 0.0126, -0.2040, 0.1530], [ 0.0514, -0.0479, 0.1172, ..., 0.0010, -0.2485, -0.0261], [-0.2305, -0.1180, -0.1650, ..., -0.2805, 0.0734, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.2200e-07, ..., 1.3039e-08, -7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 4.6566e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -9.3132e-10, 4.1537e-07, ..., 4.5635e-08, -9.3132e-10, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.6764e-08, ..., -7.4506e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8841e-06, ..., -1.9744e-07, -0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.0245e-07, ..., 1.2107e-08, 1.0692e-06, 0.0000e+00]], device='cuda:0') Epoch 346, bias, value: tensor([-0.0093, -0.0397, 0.0062, -0.0132, 0.0104, 0.0119, 0.0258, 0.0105, -0.0430, -0.0114], device='cuda:0'), grad: tensor([ 3.9581e-07, 3.4459e-08, 1.4314e-06, 3.3639e-06, -2.0918e-06, 5.8115e-07, 3.2410e-07, 9.3132e-09, -6.5416e-06, 2.4941e-06], device='cuda:0') 100 0.0001 changing lr epoch 345, time 247.94, cls_loss 0.0009 cls_loss_mapping 0.0012 cls_loss_causal 0.4803 re_mapping 0.0042 re_causal 0.0125 /// teacc 99.01 lr 0.00010000 Epoch 347, weight, value: tensor([[ 0.0265, -0.1829, -0.1682, ..., -0.3503, -0.1223, -0.1673], [ 0.0721, -0.0909, 0.0383, ..., 0.0480, 0.1233, -0.0555], [-0.0937, 0.1530, -0.2065, ..., 0.0546, 0.0941, -0.0476], ..., [-0.0795, -0.0965, -0.0799, ..., 0.0126, -0.2041, 0.1531], [ 0.0511, -0.0480, 0.1173, ..., 0.0008, -0.2489, -0.0262], [-0.2311, -0.1181, -0.1686, ..., -0.2817, 0.0740, -0.1100]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 3.7253e-09, 2.4214e-08, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 1.5832e-08, ..., 5.1223e-08, -1.8626e-09, 0.0000e+00], [ 0.0000e+00, -2.0489e-08, 0.0000e+00, ..., -6.1467e-08, -2.7940e-09, 0.0000e+00], ..., [ 0.0000e+00, 1.1176e-08, -4.0978e-08, ..., -2.7008e-08, 1.6764e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 2.0489e-08, ..., 2.3283e-08, 2.2352e-08, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 9.3132e-10, ..., 3.7253e-09, -6.3330e-08, 0.0000e+00]], device='cuda:0') Epoch 347, bias, value: tensor([-0.0095, -0.0397, 0.0062, -0.0124, 0.0097, 0.0128, 0.0251, 0.0105, -0.0433, -0.0116], device='cuda:0'), grad: tensor([-4.6473e-07, 1.7975e-07, 1.7136e-07, 4.8429e-08, 3.7253e-08, 1.0338e-07, -7.2643e-08, -3.6322e-08, 1.4622e-07, -8.3819e-08], device='cuda:0') 100 0.0001 changing lr epoch 346, time 247.55, cls_loss 0.0009 cls_loss_mapping 0.0013 cls_loss_causal 0.4830 re_mapping 0.0043 re_causal 0.0128 /// teacc 99.06 lr 0.00010000 Epoch 348, weight, value: tensor([[ 0.0265, -0.1830, -0.1689, ..., -0.3521, -0.1225, -0.1673], [ 0.0722, -0.0910, 0.0381, ..., 0.0478, 0.1235, -0.0555], [-0.0938, 0.1530, -0.2068, ..., 0.0546, 0.0941, -0.0475], ..., [-0.0795, -0.0965, -0.0797, ..., 0.0127, -0.2043, 0.1533], [ 0.0512, -0.0481, 0.1178, ..., 0.0012, -0.2493, -0.0263], [-0.2315, -0.1181, -0.1689, ..., -0.2831, 0.0748, -0.1100]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.9360e-08, ..., 1.0617e-07, -5.5879e-09, 0.0000e+00], [ 0.0000e+00, -9.3132e-10, 2.7940e-09, ..., 1.8626e-09, -4.6566e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, -2.4214e-08, ..., -9.5926e-08, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 1.6950e-07, ..., 9.5926e-08, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 348, bias, value: tensor([-0.0096, -0.0399, 0.0062, -0.0123, 0.0092, 0.0128, 0.0251, 0.0106, -0.0433, -0.0110], device='cuda:0'), grad: tensor([ 7.4506e-09, 3.1479e-07, 3.7253e-09, -7.6089e-07, 1.8626e-09, 3.6228e-07, 4.6566e-09, -2.7660e-07, 4.6566e-09, 3.4925e-07], device='cuda:0') 100 0.0001 changing lr epoch 347, time 247.63, cls_loss 0.0010 cls_loss_mapping 0.0013 cls_loss_causal 0.4785 re_mapping 0.0042 re_causal 0.0126 /// teacc 99.09 lr 0.00010000 Epoch 349, weight, value: tensor([[ 0.0264, -0.1830, -0.1689, ..., -0.3527, -0.1224, -0.1674], [ 0.0748, -0.0925, 0.0382, ..., 0.0477, 0.1234, -0.0555], [-0.0939, 0.1530, -0.2070, ..., 0.0547, 0.0946, -0.0476], ..., [-0.0796, -0.0965, -0.0798, ..., 0.0127, -0.2044, 0.1545], [ 0.0511, -0.0481, 0.1181, ..., 0.0015, -0.2494, -0.0262], [-0.2316, -0.1184, -0.1691, ..., -0.2844, 0.0749, -0.1100]], device='cuda:0'), grad: tensor([[-5.5879e-09, 9.3132e-10, 0.0000e+00, ..., 4.6566e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 2.7940e-09, -1.3039e-08, ..., 2.1420e-08, -1.4901e-08, 0.0000e+00], [ 0.0000e+00, -1.3970e-08, 9.3132e-10, ..., -7.6368e-08, -6.7055e-08, 0.0000e+00], ..., [ 1.8626e-09, -2.7940e-09, 2.7940e-09, ..., -6.0536e-08, 1.4901e-08, 0.0000e+00], [ 9.3132e-10, 2.7940e-09, 9.3132e-10, ..., 1.1176e-08, 2.7940e-09, 0.0000e+00], [ 9.3132e-10, 9.3132e-10, 9.3132e-10, ..., 7.4506e-09, 1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 349, bias, value: tensor([-0.0076, -0.0400, 0.0063, -0.0126, 0.0089, 0.0131, 0.0220, 0.0107, -0.0431, -0.0112], device='cuda:0'), grad: tensor([-1.1176e-07, 4.0047e-08, -1.5087e-07, 1.2107e-07, 8.8476e-08, -7.4506e-09, 2.7008e-08, -1.7323e-07, 2.9802e-08, 1.3225e-07], device='cuda:0') 100 0.0001 changing lr epoch 348, time 247.74, cls_loss 0.0009 cls_loss_mapping 0.0018 cls_loss_causal 0.4736 re_mapping 0.0043 re_causal 0.0127 /// teacc 99.07 lr 0.00010000 Epoch 350, weight, value: tensor([[ 0.0262, -0.1831, -0.1695, ..., -0.3545, -0.1225, -0.1674], [ 0.0771, -0.0929, 0.0384, ..., 0.0482, 0.1249, -0.0554], [-0.0962, 0.1530, -0.2073, ..., 0.0544, 0.0935, -0.0475], ..., [-0.0799, -0.0965, -0.0799, ..., 0.0126, -0.2053, 0.1546], [ 0.0509, -0.0481, 0.1181, ..., 0.0012, -0.2497, -0.0264], [-0.2334, -0.1184, -0.1694, ..., -0.2852, 0.0757, -0.1100]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 1.6764e-08, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.7695e-08, ..., 4.4797e-07, 1.5702e-06, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3970e-08, ..., 1.6764e-08, 2.7940e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 2.3283e-08, ..., -9.6299e-07, 1.1362e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.7043e-07, ..., -2.8871e-08, 4.6566e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.6077e-08, ..., 8.4750e-08, 3.9022e-07, 0.0000e+00]], device='cuda:0') Epoch 350, bias, value: tensor([-0.0074, -0.0396, 0.0062, -0.0126, 0.0088, 0.0137, 0.0207, 0.0105, -0.0436, -0.0105], device='cuda:0'), grad: tensor([ 7.1712e-08, 5.4650e-06, 8.6613e-08, 1.2452e-06, -5.2527e-06, 2.6450e-07, 4.9360e-08, -2.8610e-06, -3.9861e-07, 1.3364e-06], device='cuda:0') 100 0.0001 changing lr epoch 349, time 247.37, cls_loss 0.0008 cls_loss_mapping 0.0013 cls_loss_causal 0.5140 re_mapping 0.0042 re_causal 0.0137 /// teacc 99.02 lr 0.00010000 Epoch 351, weight, value: tensor([[ 0.0265, -0.1834, -0.1697, ..., -0.3553, -0.1227, -0.1674], [ 0.0781, -0.0933, 0.0385, ..., 0.0482, 0.1251, -0.0554], [-0.0965, 0.1531, -0.2075, ..., 0.0544, 0.0936, -0.0475], ..., [-0.0792, -0.0965, -0.0800, ..., 0.0126, -0.2055, 0.1546], [ 0.0508, -0.0482, 0.1197, ..., 0.0010, -0.2500, -0.0264], [-0.2340, -0.1185, -0.1695, ..., -0.2854, 0.0761, -0.1100]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.5879e-09, 1.8626e-09, ..., 8.3819e-09, 3.7253e-08, 0.0000e+00], [ 0.0000e+00, 2.1420e-08, 2.9802e-08, ..., 3.4459e-08, 9.5926e-08, 0.0000e+00], [ 0.0000e+00, -1.5926e-07, 1.3970e-08, ..., -1.5832e-07, -2.9057e-07, -2.7940e-09], ..., [-0.0000e+00, 2.7008e-08, 8.3819e-09, ..., 2.5146e-08, 4.9360e-08, 0.0000e+00], [ 0.0000e+00, 7.0781e-08, -2.3562e-07, ..., -4.7497e-08, 1.5739e-07, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 8.6613e-08, ..., 4.0978e-08, 4.6566e-09, 0.0000e+00]], device='cuda:0') Epoch 351, bias, value: tensor([-0.0079, -0.0395, 0.0062, -0.0129, 0.0087, 0.0127, 0.0215, 0.0104, -0.0427, -0.0102], device='cuda:0'), grad: tensor([ 3.1665e-08, 2.8405e-07, -6.6124e-07, 2.3749e-07, 2.4214e-08, 5.2340e-07, -6.1281e-07, 1.1548e-07, -1.8533e-07, 2.5146e-07], device='cuda:0') 100 0.0001 changing lr epoch 350, time 247.48, cls_loss 0.0008 cls_loss_mapping 0.0013 cls_loss_causal 0.4610 re_mapping 0.0043 re_causal 0.0123 /// teacc 99.01 lr 0.00010000 Epoch 352, weight, value: tensor([[ 0.0262, -0.1839, -0.1704, ..., -0.3556, -0.1228, -0.1674], [ 0.0785, -0.0920, 0.0385, ..., 0.0482, 0.1254, -0.0554], [-0.0966, 0.1530, -0.2080, ..., 0.0544, 0.0936, -0.0474], ..., [-0.0792, -0.0965, -0.0801, ..., 0.0126, -0.2057, 0.1547], [ 0.0499, -0.0482, 0.1195, ..., 0.0009, -0.2504, -0.0264], [-0.2347, -0.1187, -0.1697, ..., -0.2857, 0.0761, -0.1100]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, -1.2107e-08, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 2.7940e-09, 4.8429e-08, 4.8429e-08, ..., 9.4995e-08, 4.1910e-08, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 1.8626e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], ..., [ 9.3132e-10, -5.1223e-08, -5.4017e-08, ..., -1.0338e-07, 6.8918e-08, 0.0000e+00], [ 2.1420e-08, 1.8626e-09, 9.3132e-09, ..., 2.7940e-09, 1.1176e-08, 0.0000e+00], [ 6.5193e-09, 0.0000e+00, -9.3132e-10, ..., 9.3132e-10, -1.1269e-07, 0.0000e+00]], device='cuda:0') Epoch 352, bias, value: tensor([-0.0080, -0.0395, 0.0061, -0.0128, 0.0088, 0.0130, 0.0215, 0.0104, -0.0431, -0.0103], device='cuda:0'), grad: tensor([-4.3772e-08, 5.1409e-07, 1.5832e-08, 6.4634e-07, -2.7940e-09, -1.0561e-06, 1.5832e-08, -1.2387e-07, 3.3714e-07, -2.9709e-07], device='cuda:0') 100 0.0001 changing lr epoch 351, time 247.80, cls_loss 0.0007 cls_loss_mapping 0.0014 cls_loss_causal 0.4736 re_mapping 0.0043 re_causal 0.0127 /// teacc 99.03 lr 0.00010000 Epoch 353, weight, value: tensor([[ 0.0258, -0.1842, -0.1707, ..., -0.3559, -0.1228, -0.1674], [ 0.0785, -0.0920, 0.0386, ..., 0.0483, 0.1257, -0.0554], [-0.0991, 0.1531, -0.2087, ..., 0.0544, 0.0937, -0.0474], ..., [-0.0819, -0.0966, -0.0803, ..., 0.0124, -0.2060, 0.1547], [ 0.0530, -0.0485, 0.1208, ..., 0.0029, -0.2507, -0.0264], [-0.2345, -0.1189, -0.1698, ..., -0.2861, 0.0761, -0.1100]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [-1.8626e-09, 0.0000e+00, -7.4506e-09, ..., 5.5879e-09, -2.3283e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 8.3819e-09, ..., 7.4506e-09, 9.3132e-10, 0.0000e+00], ..., [-0.0000e+00, 0.0000e+00, 6.5193e-09, ..., -1.8626e-09, 9.3132e-09, 0.0000e+00], [-0.0000e+00, 0.0000e+00, -8.1025e-08, ..., -6.9849e-08, 2.7940e-09, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 9.3132e-10, ..., 1.8626e-09, -1.0245e-08, 0.0000e+00]], device='cuda:0') Epoch 353, bias, value: tensor([-0.0077, -0.0394, 0.0061, -0.0127, 0.0087, 0.0129, 0.0216, 0.0103, -0.0419, -0.0104], device='cuda:0'), grad: tensor([ 0.0000e+00, -2.1420e-08, 2.5146e-08, 9.4064e-08, 8.7544e-08, 5.8673e-08, 1.8626e-08, 1.3970e-08, -1.8347e-07, -9.1270e-08], device='cuda:0') 100 0.0001 changing lr epoch 352, time 247.33, cls_loss 0.0007 cls_loss_mapping 0.0012 cls_loss_causal 0.4683 re_mapping 0.0041 re_causal 0.0122 /// teacc 99.13 lr 0.00010000 Epoch 354, weight, value: tensor([[ 0.0252, -0.1843, -0.1714, ..., -0.3561, -0.1230, -0.1674], [ 0.0790, -0.0928, 0.0387, ..., 0.0483, 0.1257, -0.0554], [-0.0992, 0.1531, -0.2088, ..., 0.0544, 0.0938, -0.0474], ..., [-0.0819, -0.0966, -0.0803, ..., 0.0124, -0.2063, 0.1548], [ 0.0530, -0.0485, 0.1207, ..., 0.0029, -0.2509, -0.0264], [-0.2353, -0.1163, -0.1698, ..., -0.2868, 0.0768, -0.1100]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.8626e-09, 1.8626e-09, ..., 6.5193e-09, 1.2107e-08, 0.0000e+00], [-1.4901e-08, 7.7300e-08, -7.1712e-08, ..., 2.0489e-07, 4.1910e-08, 0.0000e+00], [ 3.6322e-08, -2.9150e-07, 2.2445e-07, ..., -4.5635e-07, -8.8010e-07, 0.0000e+00], ..., [-7.5344e-07, 1.3411e-07, 1.3970e-07, ..., 5.0664e-07, 5.8208e-07, 0.0000e+00], [-5.3085e-08, 4.1910e-08, -2.3469e-07, ..., -3.1479e-07, 1.3411e-07, 0.0000e+00], [ 3.7253e-09, 3.4459e-08, 3.7253e-09, ..., 9.4064e-08, -1.7695e-08, 0.0000e+00]], device='cuda:0') Epoch 354, bias, value: tensor([-0.0079, -0.0395, 0.0061, -0.0128, 0.0087, 0.0127, 0.0222, 0.0103, -0.0422, -0.0098], device='cuda:0'), grad: tensor([ 1.7695e-08, 4.7870e-07, -1.5935e-06, -1.1083e-07, 3.4980e-06, 4.0513e-07, 1.1176e-08, -1.8356e-06, -5.0291e-07, -3.5856e-07], device='cuda:0') 100 0.0001 changing lr epoch 353, time 248.13, cls_loss 0.0006 cls_loss_mapping 0.0014 cls_loss_causal 0.4637 re_mapping 0.0042 re_causal 0.0129 /// teacc 99.09 lr 0.00010000 Epoch 355, weight, value: tensor([[ 0.0248, -0.1847, -0.1719, ..., -0.3566, -0.1231, -0.1674], [ 0.0794, -0.0928, 0.0388, ..., 0.0484, 0.1260, -0.0554], [-0.0998, 0.1531, -0.2092, ..., 0.0544, 0.0938, -0.0474], ..., [-0.0814, -0.0966, -0.0804, ..., 0.0123, -0.2066, 0.1548], [ 0.0529, -0.0486, 0.1207, ..., 0.0028, -0.2511, -0.0265], [-0.2360, -0.1160, -0.1699, ..., -0.2870, 0.0769, -0.1100]], device='cuda:0'), grad: tensor([[ 8.3819e-09, 3.7253e-09, 1.8626e-09, ..., 1.2107e-08, 1.0245e-08, 0.0000e+00], [ 1.4994e-07, 6.3330e-08, 2.0489e-08, ..., 2.0768e-07, 1.6857e-07, 0.0000e+00], [ 9.0078e-06, -1.1735e-07, 1.1781e-06, ..., 9.3803e-06, -3.2689e-07, 0.0000e+00], ..., [-9.4026e-06, 1.2107e-08, -1.1725e-06, ..., -9.8795e-06, 4.0047e-08, 0.0000e+00], [ 1.9092e-07, 2.9802e-08, 2.8871e-08, ..., 2.4121e-07, 8.8476e-08, 0.0000e+00], [ 9.3132e-10, 2.7940e-09, 0.0000e+00, ..., 9.3132e-09, 4.6566e-09, 0.0000e+00]], device='cuda:0') Epoch 355, bias, value: tensor([-0.0077, -0.0394, 0.0061, -0.0129, 0.0088, 0.0128, 0.0220, 0.0102, -0.0423, -0.0098], device='cuda:0'), grad: tensor([ 4.1910e-08, 6.8825e-07, 2.6852e-05, -5.0291e-08, 2.7008e-08, 1.3970e-07, -1.1642e-07, -2.8387e-05, 7.5251e-07, 3.0734e-08], device='cuda:0') 100 0.0001 changing lr epoch 354, time 247.42, cls_loss 0.0008 cls_loss_mapping 0.0017 cls_loss_causal 0.4957 re_mapping 0.0044 re_causal 0.0130 /// teacc 99.11 lr 0.00010000 Epoch 356, weight, value: tensor([[ 0.0248, -0.1853, -0.1728, ..., -0.3568, -0.1235, -0.1674], [ 0.0794, -0.0940, 0.0389, ..., 0.0483, 0.1265, -0.0555], [-0.1022, 0.1532, -0.2092, ..., 0.0544, 0.0941, -0.0474], ..., [-0.0793, -0.0967, -0.0804, ..., 0.0123, -0.2069, 0.1550], [ 0.0528, -0.0487, 0.1207, ..., 0.0028, -0.2513, -0.0265], [-0.2350, -0.1162, -0.1700, ..., -0.2873, 0.0767, -0.1100]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 0.0000e+00, 4.4703e-08, ..., 9.3132e-10, 6.3330e-08, 0.0000e+00], [-4.0978e-08, 0.0000e+00, -2.3004e-07, ..., 3.7253e-09, -2.2352e-07, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 2.2352e-08, ..., 2.3283e-08, -3.0734e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 3.1665e-08, ..., 6.6124e-08, 1.0245e-08, 0.0000e+00], [ 4.6566e-09, 0.0000e+00, 3.0734e-08, ..., 5.5879e-09, 2.7008e-08, 0.0000e+00], [ 4.6566e-09, 0.0000e+00, 6.5193e-09, ..., 9.3132e-10, 2.4214e-08, 0.0000e+00]], device='cuda:0') Epoch 356, bias, value: tensor([-0.0079, -0.0392, 0.0061, -0.0127, 0.0086, 0.0126, 0.0220, 0.0102, -0.0424, -0.0101], device='cuda:0'), grad: tensor([ 2.0303e-07, -6.4448e-07, -3.4459e-08, -2.3935e-07, -3.7253e-08, 5.2154e-08, 3.9022e-07, -1.4901e-08, 1.0524e-07, 2.1141e-07], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 355---------------------------------------------------- epoch 355, time 263.92, cls_loss 0.0008 cls_loss_mapping 0.0013 cls_loss_causal 0.4485 re_mapping 0.0041 re_causal 0.0119 /// teacc 99.16 lr 0.00010000 Epoch 357, weight, value: tensor([[ 0.0257, -0.1855, -0.1752, ..., -0.3571, -0.1236, -0.1676], [ 0.0795, -0.0932, 0.0391, ..., 0.0485, 0.1270, -0.0569], [-0.1025, 0.1532, -0.2096, ..., 0.0544, 0.0939, -0.0468], ..., [-0.0791, -0.0967, -0.0806, ..., 0.0123, -0.2073, 0.1561], [ 0.0527, -0.0487, 0.1207, ..., 0.0028, -0.2514, -0.0272], [-0.2356, -0.1162, -0.1703, ..., -0.2877, 0.0770, -0.1101]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 3.1665e-08, ..., 1.4901e-08, 6.3702e-07, 9.3132e-10], [ 0.0000e+00, 9.3132e-10, -1.8626e-09, ..., 3.7253e-09, 1.7695e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3039e-08, ..., 8.3819e-09, 1.1660e-06, 0.0000e+00], ..., [ 0.0000e+00, -2.7940e-09, 4.6566e-09, ..., -3.2596e-08, 4.9174e-06, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.2841e-08, 3.5390e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, -2.3469e-07, ..., -1.0617e-07, -9.2909e-06, 0.0000e+00]], device='cuda:0') Epoch 357, bias, value: tensor([-0.0080, -0.0390, 0.0061, -0.0123, 0.0081, 0.0127, 0.0220, 0.0101, -0.0426, -0.0098], device='cuda:0'), grad: tensor([ 2.6990e-06, 1.4063e-07, 3.8743e-06, 2.8908e-06, 8.2180e-06, 3.0641e-07, 4.8429e-08, 1.8016e-05, 1.7071e-06, -3.7879e-05], device='cuda:0') 100 0.0001 changing lr epoch 356, time 247.80, cls_loss 0.0007 cls_loss_mapping 0.0013 cls_loss_causal 0.4717 re_mapping 0.0041 re_causal 0.0123 /// teacc 99.15 lr 0.00010000 Epoch 358, weight, value: tensor([[ 0.0266, -0.1860, -0.1756, ..., -0.3573, -0.1236, -0.1674], [ 0.0799, -0.0934, 0.0391, ..., 0.0485, 0.1271, -0.0570], [-0.1026, 0.1533, -0.2098, ..., 0.0544, 0.0940, -0.0460], ..., [-0.0787, -0.0967, -0.0806, ..., 0.0123, -0.2075, 0.1565], [ 0.0527, -0.0488, 0.1208, ..., 0.0028, -0.2515, -0.0277], [-0.2360, -0.1163, -0.1706, ..., -0.2880, 0.0772, -0.1102]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, -2.7940e-09, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, -9.3132e-10, 2.7940e-08, ..., 2.7008e-08, 4.0978e-08, 0.0000e+00], ..., [ 0.0000e+00, -9.3132e-10, 4.6566e-09, ..., 9.3132e-10, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.0245e-08, ..., 1.2107e-08, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 9.3132e-10, 2.0489e-08, 0.0000e+00]], device='cuda:0') Epoch 358, bias, value: tensor([-0.0071, -0.0390, 0.0061, -0.0122, 0.0079, 0.0127, 0.0216, 0.0101, -0.0427, -0.0100], device='cuda:0'), grad: tensor([ 1.3970e-08, 1.0245e-08, 1.9278e-07, -9.4995e-08, -2.1607e-07, -1.8626e-09, 2.6077e-08, -1.8626e-09, 2.3283e-08, 5.7742e-08], device='cuda:0') 100 0.0001 changing lr epoch 357, time 247.58, cls_loss 0.0007 cls_loss_mapping 0.0012 cls_loss_causal 0.4910 re_mapping 0.0042 re_causal 0.0129 /// teacc 99.11 lr 0.00010000 Epoch 359, weight, value: tensor([[ 0.0270, -0.1873, -0.1759, ..., -0.3578, -0.1236, -0.1676], [ 0.0799, -0.0954, 0.0391, ..., 0.0483, 0.1270, -0.0569], [-0.1035, 0.1536, -0.2099, ..., 0.0546, 0.0945, -0.0453], ..., [-0.0779, -0.0969, -0.0806, ..., 0.0123, -0.2076, 0.1565], [ 0.0526, -0.0496, 0.1207, ..., 0.0026, -0.2519, -0.0285], [-0.2363, -0.1166, -0.1708, ..., -0.2897, 0.0773, -0.1102]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.7253e-09, 1.8626e-09, ..., 3.7253e-09, 1.9558e-08, 0.0000e+00], [-4.6566e-09, -2.7940e-09, -9.8720e-08, ..., -2.5146e-08, -2.6636e-07, 0.0000e+00], [ 1.8626e-09, -5.8953e-07, 1.6764e-08, ..., -6.4354e-07, -1.9558e-08, 0.0000e+00], ..., [ 8.3819e-09, 5.4389e-07, 3.7253e-08, ..., 6.1467e-07, 1.8533e-07, 0.0000e+00], [ 9.3132e-10, 2.8871e-08, 8.3819e-09, ..., 3.1665e-08, 2.4214e-08, 0.0000e+00], [ 9.3132e-10, 1.8626e-09, 1.8626e-09, ..., 1.8626e-09, -6.2399e-08, 0.0000e+00]], device='cuda:0') Epoch 359, bias, value: tensor([-0.0065, -0.0392, 0.0063, -0.0121, 0.0079, 0.0124, 0.0212, 0.0101, -0.0431, -0.0102], device='cuda:0'), grad: tensor([ 4.5635e-08, -3.4645e-07, -2.4065e-06, -9.3132e-10, -1.5832e-07, 3.3528e-08, 1.9558e-07, 2.5816e-06, 1.5181e-07, -9.4064e-08], device='cuda:0') 100 0.0001 changing lr epoch 358, time 247.93, cls_loss 0.0009 cls_loss_mapping 0.0011 cls_loss_causal 0.4405 re_mapping 0.0040 re_causal 0.0115 /// teacc 99.03 lr 0.00010000 Epoch 360, weight, value: tensor([[ 0.0243, -0.1877, -0.1761, ..., -0.3596, -0.1241, -0.1684], [ 0.0799, -0.0961, 0.0393, ..., 0.0485, 0.1274, -0.0551], [-0.1025, 0.1538, -0.2101, ..., 0.0547, 0.0947, -0.0439], ..., [-0.0779, -0.0970, -0.0807, ..., 0.0121, -0.2081, 0.1564], [ 0.0524, -0.0504, 0.1207, ..., 0.0025, -0.2530, -0.0294], [-0.2369, -0.1171, -0.1709, ..., -0.2911, 0.0790, -0.1102]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.5193e-09, 1.8626e-09, ..., 5.5879e-09, 1.3039e-08, 0.0000e+00], [ 0.0000e+00, 1.7695e-08, -1.4901e-08, ..., 1.6764e-08, -9.3132e-10, 4.6566e-09], [ 0.0000e+00, -1.2945e-07, 7.5437e-08, ..., -5.1223e-08, -1.2480e-07, 2.7940e-09], ..., [ 0.0000e+00, 6.6124e-08, 2.7940e-09, ..., 4.2841e-08, 7.5437e-08, -1.3970e-08], [ 0.0000e+00, 2.6077e-08, 9.3132e-09, ..., 2.6077e-08, 3.2596e-08, 2.7940e-09], [ 0.0000e+00, 2.7940e-09, 0.0000e+00, ..., 1.8626e-09, 2.8871e-08, 0.0000e+00]], device='cuda:0') Epoch 360, bias, value: tensor([-0.0067, -0.0390, 0.0063, -0.0119, 0.0064, 0.0122, 0.0216, 0.0099, -0.0439, -0.0088], device='cuda:0'), grad: tensor([ 4.6566e-08, 8.1025e-08, -4.3772e-07, -1.4249e-07, -2.6263e-07, 2.7008e-08, 1.2200e-07, 3.1013e-07, 1.6298e-07, 1.0710e-07], device='cuda:0') 100 0.0001 changing lr epoch 359, time 247.31, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4962 re_mapping 0.0042 re_causal 0.0126 /// teacc 99.10 lr 0.00010000 Epoch 361, weight, value: tensor([[ 0.0242, -0.1881, -0.1764, ..., -0.3601, -0.1245, -0.1686], [ 0.0800, -0.0965, 0.0394, ..., 0.0487, 0.1278, -0.0566], [-0.1029, 0.1537, -0.2104, ..., 0.0547, 0.0948, -0.0439], ..., [-0.0776, -0.0968, -0.0809, ..., 0.0120, -0.2087, 0.1581], [ 0.0523, -0.0505, 0.1209, ..., 0.0028, -0.2535, -0.0318], [-0.2371, -0.1173, -0.1710, ..., -0.2914, 0.0798, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 2.5611e-08, 4.0047e-08, 3.2596e-09], [ 0.0000e+00, -2.4680e-08, 1.3970e-09, ..., -8.8010e-08, -2.7660e-07, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., -2.2817e-08, 2.3283e-09, -6.0536e-09], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 1.8626e-09, 5.5879e-09, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, -6.9849e-09, 4.6566e-10]], device='cuda:0') Epoch 361, bias, value: tensor([-0.0072, -0.0389, 0.0062, -0.0117, 0.0063, 0.0117, 0.0228, 0.0098, -0.0440, -0.0080], device='cuda:0'), grad: tensor([-8.7079e-08, 1.0571e-07, -4.1304e-07, 2.7008e-08, 1.4901e-08, 8.7544e-08, 2.8592e-07, -7.1712e-08, 2.4680e-08, 3.4925e-08], device='cuda:0') 100 0.0001 changing lr epoch 360, time 247.71, cls_loss 0.0011 cls_loss_mapping 0.0014 cls_loss_causal 0.4861 re_mapping 0.0044 re_causal 0.0122 /// teacc 99.15 lr 0.00010000 Epoch 362, weight, value: tensor([[ 0.0255, -0.1894, -0.1766, ..., -0.3612, -0.1249, -0.1687], [ 0.0797, -0.0971, 0.0396, ..., 0.0490, 0.1284, -0.0568], [-0.1029, 0.1539, -0.2108, ..., 0.0548, 0.0952, -0.0436], ..., [-0.0777, -0.0970, -0.0811, ..., 0.0117, -0.2095, 0.1582], [ 0.0522, -0.0504, 0.1226, ..., 0.0029, -0.2535, -0.0322], [-0.2379, -0.1183, -0.1741, ..., -0.2919, 0.0805, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, -4.6566e-10, 6.9849e-09, ..., 1.3970e-09, -3.2596e-09, 0.0000e+00], ..., [ 0.0000e+00, 4.6566e-10, 9.3132e-10, ..., -9.3132e-10, 5.1223e-09, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 2.3283e-09, ..., 2.3283e-09, 6.0536e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 8.8476e-09, 1.8626e-09]], device='cuda:0') Epoch 362, bias, value: tensor([-0.0072, -0.0387, 0.0063, -0.0119, 0.0055, 0.0123, 0.0226, 0.0098, -0.0413, -0.0102], device='cuda:0'), grad: tensor([-5.1148e-06, 1.6298e-08, 1.0896e-07, 1.6876e-06, -3.5390e-08, -1.8431e-06, 1.7416e-07, 1.8626e-08, 9.0338e-08, 4.9211e-06], device='cuda:0') 100 0.0001 changing lr epoch 361, time 247.34, cls_loss 0.0009 cls_loss_mapping 0.0014 cls_loss_causal 0.4475 re_mapping 0.0043 re_causal 0.0120 /// teacc 99.08 lr 0.00010000 Epoch 363, weight, value: tensor([[ 0.0256, -0.1906, -0.1771, ..., -0.3615, -0.1255, -0.1687], [ 0.0796, -0.0968, 0.0400, ..., 0.0494, 0.1300, -0.0568], [-0.1029, 0.1538, -0.2122, ..., 0.0539, 0.0952, -0.0436], ..., [-0.0777, -0.0968, -0.0814, ..., 0.0120, -0.2111, 0.1584], [ 0.0522, -0.0510, 0.1220, ..., 0.0027, -0.2553, -0.0323], [-0.2380, -0.1183, -0.1742, ..., -0.2921, 0.0807, -0.1107]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 4.6566e-10, ..., 1.3970e-09, 9.3132e-10, 0.0000e+00], [ 6.9849e-09, 1.3970e-09, -2.7474e-08, ..., -4.1910e-09, 1.7695e-08, 0.0000e+00], [ 4.6566e-10, -5.1223e-09, 3.2596e-09, ..., -1.8626e-09, -3.7253e-09, 0.0000e+00], ..., [ 1.5972e-07, 4.6566e-10, 7.9162e-09, ..., 2.3283e-09, 1.3132e-07, 0.0000e+00], [ 1.8626e-09, 1.3970e-09, 9.3132e-09, ..., 6.9849e-09, 4.6566e-09, 0.0000e+00], [ 1.8626e-09, 4.6566e-10, 4.6566e-10, ..., 9.3132e-10, 1.0710e-08, 0.0000e+00]], device='cuda:0') Epoch 363, bias, value: tensor([-0.0074, -0.0381, 0.0059, -0.0110, 0.0056, 0.0121, 0.0245, 0.0097, -0.0427, -0.0102], device='cuda:0'), grad: tensor([ 1.0245e-08, 2.0396e-07, 2.7940e-09, -5.4501e-06, -9.3225e-07, 5.1968e-06, 7.8231e-08, 6.2212e-07, 2.0908e-07, 5.8673e-08], device='cuda:0') 100 0.0001 changing lr epoch 362, time 247.69, cls_loss 0.0009 cls_loss_mapping 0.0014 cls_loss_causal 0.4720 re_mapping 0.0041 re_causal 0.0120 /// teacc 99.09 lr 0.00010000 Epoch 364, weight, value: tensor([[ 0.0254, -0.1910, -0.1773, ..., -0.3617, -0.1256, -0.1687], [ 0.0794, -0.0953, 0.0398, ..., 0.0492, 0.1303, -0.0568], [-0.1029, 0.1540, -0.2133, ..., 0.0539, 0.0952, -0.0436], ..., [-0.0794, -0.0971, -0.0813, ..., 0.0121, -0.2114, 0.1584], [ 0.0520, -0.0513, 0.1218, ..., 0.0025, -0.2558, -0.0323], [-0.2405, -0.1184, -0.1744, ..., -0.2925, 0.0829, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., 0.0000e+00, 3.0361e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 3.7253e-09, 5.0291e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-08, ..., 1.8626e-09, 1.4901e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.6764e-08, ..., 5.5879e-09, 1.8626e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.1176e-07, ..., -2.2352e-08, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-09, ..., 1.8626e-09, -1.0878e-06, 0.0000e+00]], device='cuda:0') Epoch 364, bias, value: tensor([-0.0075, -0.0383, 0.0059, -0.0113, 0.0034, 0.0125, 0.0247, 0.0098, -0.0432, -0.0082], device='cuda:0'), grad: tensor([ 1.6801e-06, 1.6019e-07, 1.2107e-07, 1.1176e-07, 1.7900e-06, 2.3469e-07, -1.8254e-07, 7.6368e-08, -4.3958e-07, -3.5539e-06], device='cuda:0') 100 0.0001 changing lr epoch 363, time 247.79, cls_loss 0.0008 cls_loss_mapping 0.0019 cls_loss_causal 0.4819 re_mapping 0.0041 re_causal 0.0119 /// teacc 98.98 lr 0.00010000 Epoch 365, weight, value: tensor([[ 0.0253, -0.1912, -0.1775, ..., -0.3619, -0.1258, -0.1687], [ 0.0796, -0.0947, 0.0400, ..., 0.0492, 0.1313, -0.0577], [-0.1028, 0.1546, -0.2136, ..., 0.0543, 0.0957, -0.0434], ..., [-0.0793, -0.0977, -0.0814, ..., 0.0118, -0.2123, 0.1591], [ 0.0519, -0.0515, 0.1217, ..., 0.0024, -0.2563, -0.0325], [-0.2427, -0.1202, -0.1750, ..., -0.2941, 0.0822, -0.1107]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 3.7253e-09, 0.0000e+00, ..., 1.8626e-09, 7.4506e-09, 0.0000e+00], [ 7.4506e-09, 2.4214e-08, 7.4506e-09, ..., 1.3039e-08, 4.2841e-08, 0.0000e+00], [ 1.8626e-09, -1.4529e-07, 3.7253e-09, ..., -4.4703e-08, -2.5146e-07, 0.0000e+00], ..., [-2.2352e-08, 3.7253e-09, -0.0000e+00, ..., -5.5879e-09, 1.3039e-08, 0.0000e+00], [ 3.7253e-09, 8.9407e-08, -3.7253e-08, ..., 1.3039e-08, 1.5832e-07, 0.0000e+00], [ 1.8626e-09, 1.8626e-09, 1.8626e-08, ..., 9.3132e-09, -1.3039e-08, 0.0000e+00]], device='cuda:0') Epoch 365, bias, value: tensor([-0.0074, -0.0380, 0.0062, -0.0103, 0.0036, 0.0109, 0.0260, 0.0095, -0.0435, -0.0094], device='cuda:0'), grad: tensor([-1.1384e-05, 1.9558e-07, -6.8732e-07, 9.4995e-08, 2.0489e-08, 1.0170e-06, 1.0118e-05, -5.5879e-09, 5.3458e-07, 1.3225e-07], device='cuda:0') 100 0.0001 changing lr epoch 364, time 248.00, cls_loss 0.0009 cls_loss_mapping 0.0016 cls_loss_causal 0.4976 re_mapping 0.0042 re_causal 0.0127 /// teacc 99.04 lr 0.00010000 Epoch 366, weight, value: tensor([[ 0.0252, -0.1885, -0.1750, ..., -0.3622, -0.1254, -0.1688], [ 0.0794, -0.0954, 0.0401, ..., 0.0492, 0.1313, -0.0585], [-0.1028, 0.1547, -0.2140, ..., 0.0544, 0.0960, -0.0431], ..., [-0.0808, -0.0978, -0.0814, ..., 0.0118, -0.2127, 0.1599], [ 0.0517, -0.0518, 0.1216, ..., 0.0022, -0.2569, -0.0327], [-0.2459, -0.1217, -0.1751, ..., -0.2945, 0.0824, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.7253e-09, 2.2352e-08, ..., 2.9802e-08, 4.2841e-08, 0.0000e+00], [ 0.0000e+00, 1.2107e-07, -4.8429e-07, ..., 2.7567e-07, 1.9558e-07, 0.0000e+00], [ 0.0000e+00, -3.9302e-07, 2.6077e-08, ..., -1.4044e-06, -1.7099e-06, 0.0000e+00], ..., [ 0.0000e+00, 2.3097e-07, 7.2643e-08, ..., 7.0408e-07, 8.5123e-07, 0.0000e+00], [ 0.0000e+00, 2.2352e-08, 3.3714e-07, ..., 3.1851e-07, 5.1036e-07, 0.0000e+00], [ 0.0000e+00, 1.1176e-08, 1.6764e-08, ..., 6.3330e-08, 8.3819e-08, 0.0000e+00]], device='cuda:0') Epoch 366, bias, value: tensor([-0.0062, -0.0381, 0.0062, -0.0098, 0.0041, 0.0108, 0.0264, 0.0094, -0.0440, -0.0095], device='cuda:0'), grad: tensor([ 1.0431e-07, -3.1665e-08, -3.5577e-06, 0.0000e+00, -5.5879e-09, 1.6764e-08, 3.3528e-08, 1.8664e-06, 1.3616e-06, 1.9558e-07], device='cuda:0') 100 0.0001 changing lr epoch 365, time 247.84, cls_loss 0.0012 cls_loss_mapping 0.0017 cls_loss_causal 0.5032 re_mapping 0.0040 re_causal 0.0120 /// teacc 99.03 lr 0.00010000 Epoch 367, weight, value: tensor([[ 0.0249, -0.1905, -0.1755, ..., -0.3636, -0.1277, -0.1688], [ 0.0793, -0.0967, 0.0399, ..., 0.0487, 0.1313, -0.0583], [-0.1030, 0.1554, -0.2151, ..., 0.0547, 0.0972, -0.0431], ..., [-0.0807, -0.0981, -0.0813, ..., 0.0120, -0.2129, 0.1599], [ 0.0514, -0.0506, 0.1214, ..., 0.0023, -0.2592, -0.0328], [-0.2469, -0.1227, -0.1743, ..., -0.2955, 0.0839, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.5879e-09, 0.0000e+00, ..., 7.4506e-09, 1.4901e-08, 0.0000e+00], [ 0.0000e+00, 4.2841e-08, -1.4901e-08, ..., 5.4017e-08, 9.8720e-08, 0.0000e+00], [ 0.0000e+00, -1.9856e-06, 0.0000e+00, ..., -2.6245e-06, -5.7556e-06, 0.0000e+00], ..., [ 0.0000e+00, 1.3039e-08, 1.8626e-09, ..., 1.8626e-08, 5.4017e-08, 0.0000e+00], [ 0.0000e+00, 1.8831e-06, 0.0000e+00, ..., 2.4922e-06, 5.4613e-06, 0.0000e+00], [ 0.0000e+00, 1.6764e-08, 0.0000e+00, ..., 2.2352e-08, 4.8429e-08, 0.0000e+00]], device='cuda:0') Epoch 367, bias, value: tensor([-0.0060, -0.0385, 0.0066, -0.0108, 0.0039, 0.0107, 0.0266, 0.0097, -0.0450, -0.0094], device='cuda:0'), grad: tensor([ 2.9802e-08, 1.9372e-07, -1.0870e-05, 1.1176e-08, 3.1665e-08, 2.2352e-08, 6.3330e-08, 7.4506e-08, 1.0327e-05, 9.8720e-08], device='cuda:0') 100 0.0001 changing lr epoch 366, time 247.76, cls_loss 0.0009 cls_loss_mapping 0.0012 cls_loss_causal 0.4834 re_mapping 0.0042 re_causal 0.0123 /// teacc 99.13 lr 0.00010000 Epoch 368, weight, value: tensor([[ 0.0233, -0.1923, -0.1757, ..., -0.3666, -0.1281, -0.1689], [ 0.0791, -0.0947, 0.0400, ..., 0.0493, 0.1327, -0.0580], [-0.1030, 0.1554, -0.2160, ..., 0.0544, 0.0961, -0.0432], ..., [-0.0802, -0.0982, -0.0813, ..., 0.0119, -0.2134, 0.1599], [ 0.0508, -0.0509, 0.1215, ..., 0.0021, -0.2601, -0.0330], [-0.2491, -0.1232, -0.1746, ..., -0.2963, 0.0838, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-08, ..., -3.7253e-09, -1.4901e-08, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 1.8626e-09, ..., -1.8626e-09, -3.7253e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., -1.8626e-09, 2.7940e-08, 0.0000e+00], [-1.8626e-09, 0.0000e+00, -9.3132e-09, ..., 0.0000e+00, -1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 9.3132e-07, 0.0000e+00]], device='cuda:0') Epoch 368, bias, value: tensor([-0.0060, -0.0382, 0.0064, -0.0112, 0.0041, 0.0109, 0.0269, 0.0096, -0.0453, -0.0099], device='cuda:0'), grad: tensor([ 3.7253e-09, -2.2352e-08, 1.8626e-09, 1.8626e-09, -1.4137e-06, 3.7253e-09, 9.3132e-09, 1.6764e-08, -2.6077e-08, 1.4305e-06], device='cuda:0') 100 0.0001 changing lr epoch 367, time 247.71, cls_loss 0.0009 cls_loss_mapping 0.0017 cls_loss_causal 0.4856 re_mapping 0.0040 re_causal 0.0120 /// teacc 99.06 lr 0.00010000 Epoch 369, weight, value: tensor([[ 0.0234, -0.1927, -0.1756, ..., -0.3668, -0.1281, -0.1689], [ 0.0789, -0.0947, 0.0402, ..., 0.0495, 0.1334, -0.0580], [-0.1031, 0.1555, -0.2163, ..., 0.0544, 0.0964, -0.0432], ..., [-0.0801, -0.0982, -0.0815, ..., 0.0118, -0.2141, 0.1599], [ 0.0506, -0.0509, 0.1217, ..., 0.0020, -0.2603, -0.0329], [-0.2506, -0.1233, -0.1751, ..., -0.2984, 0.0837, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 7.4506e-09, 1.8626e-09, ..., 9.3132e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 7.4506e-09, -1.7323e-07, ..., -6.5193e-08, -1.8813e-07, 0.0000e+00], [ 1.8626e-09, -4.3586e-07, 1.1548e-07, ..., -4.0233e-07, -4.0978e-08, 0.0000e+00], ..., [ 0.0000e+00, 3.7439e-07, 4.0978e-08, ..., 3.9488e-07, 1.5274e-07, 0.0000e+00], [-3.7253e-09, 2.9802e-08, -1.3858e-06, ..., -8.2888e-07, 1.6764e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 3.7253e-09, ..., 3.7253e-09, 3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 369, bias, value: tensor([-0.0059, -0.0380, 0.0065, -0.0122, 0.0044, 0.0114, 0.0268, 0.0096, -0.0454, -0.0103], device='cuda:0'), grad: tensor([ 4.8429e-08, -3.6508e-07, -2.2203e-06, 2.9244e-06, -5.2154e-08, 2.9802e-08, 1.8254e-07, 2.1476e-06, -2.7027e-06, -1.8626e-08], device='cuda:0') 100 0.0001 changing lr epoch 368, time 247.53, cls_loss 0.0009 cls_loss_mapping 0.0013 cls_loss_causal 0.4961 re_mapping 0.0040 re_causal 0.0119 /// teacc 99.07 lr 0.00010000 Epoch 370, weight, value: tensor([[ 0.0231, -0.1941, -0.1760, ..., -0.3676, -0.1285, -0.1689], [ 0.0791, -0.0949, 0.0408, ..., 0.0499, 0.1343, -0.0577], [-0.1036, 0.1558, -0.2164, ..., 0.0546, 0.0967, -0.0436], ..., [-0.0807, -0.0985, -0.0820, ..., 0.0114, -0.2154, 0.1600], [ 0.0506, -0.0509, 0.1225, ..., 0.0022, -0.2605, -0.0330], [-0.2508, -0.1234, -0.1757, ..., -0.3000, 0.0836, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 7.4506e-09, 1.8626e-09, ..., 9.3132e-09, 3.9116e-08, 0.0000e+00], [ 0.0000e+00, 5.5879e-09, -2.6077e-08, ..., 5.5879e-09, -5.5879e-09, 0.0000e+00], [ 0.0000e+00, -2.4214e-08, 5.5879e-09, ..., -3.1665e-08, -8.5682e-08, 0.0000e+00], ..., [ 0.0000e+00, 7.4506e-09, 1.8626e-08, ..., 1.6764e-08, 3.1665e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 1.8626e-09, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 370, bias, value: tensor([-0.0065, -0.0375, 0.0067, -0.0116, 0.0044, 0.0095, 0.0285, 0.0091, -0.0452, -0.0104], device='cuda:0'), grad: tensor([ 8.2143e-07, -1.1176e-08, -2.3842e-07, 7.4506e-09, 7.4506e-09, 2.6636e-07, -1.0263e-06, 8.1956e-08, 8.0094e-08, 1.8626e-09], device='cuda:0') 100 0.0001 changing lr epoch 369, time 248.01, cls_loss 0.0009 cls_loss_mapping 0.0015 cls_loss_causal 0.4763 re_mapping 0.0039 re_causal 0.0116 /// teacc 99.06 lr 0.00010000 Epoch 371, weight, value: tensor([[ 0.0230, -0.1949, -0.1765, ..., -0.3679, -0.1286, -0.1689], [ 0.0791, -0.0950, 0.0404, ..., 0.0493, 0.1347, -0.0576], [-0.1040, 0.1558, -0.2168, ..., 0.0547, 0.0970, -0.0442], ..., [-0.0809, -0.0985, -0.0816, ..., 0.0119, -0.2157, 0.1602], [ 0.0505, -0.0509, 0.1228, ..., 0.0023, -0.2607, -0.0331], [-0.2510, -0.1234, -0.1759, ..., -0.3011, 0.0834, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -8.7544e-08, -2.8871e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 9.3132e-09, 6.3330e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 1.4901e-08, 4.4703e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 1.1176e-08, 4.2841e-08, 0.0000e+00], [ 1.1176e-08, 0.0000e+00, -1.8626e-09, ..., -0.0000e+00, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-08, 1.8254e-07, 0.0000e+00]], device='cuda:0') Epoch 371, bias, value: tensor([-0.0068, -0.0379, 0.0068, -0.0111, 0.0048, 0.0087, 0.0287, 0.0094, -0.0452, -0.0108], device='cuda:0'), grad: tensor([-2.0266e-06, 3.2037e-07, 3.0175e-07, 5.4948e-07, -5.7369e-07, -7.4692e-07, 6.4448e-07, 2.9244e-07, 3.2037e-07, 9.2015e-07], device='cuda:0') 100 0.0001 changing lr epoch 370, time 247.77, cls_loss 0.0009 cls_loss_mapping 0.0012 cls_loss_causal 0.4467 re_mapping 0.0040 re_causal 0.0113 /// teacc 99.01 lr 0.00010000 Epoch 372, weight, value: tensor([[ 0.0230, -0.1948, -0.1767, ..., -0.3684, -0.1286, -0.1690], [ 0.0790, -0.0950, 0.0411, ..., 0.0499, 0.1362, -0.0572], [-0.1041, 0.1559, -0.2177, ..., 0.0547, 0.0971, -0.0446], ..., [-0.0809, -0.0986, -0.0823, ..., 0.0115, -0.2172, 0.1606], [ 0.0505, -0.0509, 0.1229, ..., 0.0023, -0.2608, -0.0332], [-0.2511, -0.1238, -0.1764, ..., -0.3022, 0.0833, -0.1107]], device='cuda:0'), grad: tensor([[-1.8626e-09, 2.7940e-08, 1.8626e-09, ..., 1.1176e-08, 6.3330e-08, 0.0000e+00], [ 0.0000e+00, 1.4901e-08, 1.0058e-07, ..., 4.8429e-08, 1.4901e-08, 0.0000e+00], [ 0.0000e+00, -3.1851e-07, 9.3132e-09, ..., -5.5321e-07, -6.4075e-07, 0.0000e+00], ..., [ 0.0000e+00, 1.3225e-07, 1.8626e-09, ..., 2.6077e-07, 2.8312e-07, 0.0000e+00], [ 0.0000e+00, 1.3039e-07, -2.9989e-07, ..., 1.8254e-07, 2.7381e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 1.8626e-09, 3.5390e-08, 0.0000e+00]], device='cuda:0') Epoch 372, bias, value: tensor([-0.0064, -0.0373, 0.0068, -0.0110, 0.0047, 0.0093, 0.0285, 0.0089, -0.0453, -0.0111], device='cuda:0'), grad: tensor([ 1.4529e-07, 3.0920e-07, -1.7062e-06, 1.0245e-07, -5.1968e-07, 7.2643e-08, 4.6194e-07, 7.0781e-07, 2.7195e-07, 1.6019e-07], device='cuda:0') 100 0.0001 changing lr epoch 371, time 247.36, cls_loss 0.0010 cls_loss_mapping 0.0015 cls_loss_causal 0.4779 re_mapping 0.0041 re_causal 0.0115 /// teacc 99.05 lr 0.00010000 Epoch 373, weight, value: tensor([[ 0.0235, -0.1953, -0.1771, ..., -0.3690, -0.1288, -0.1690], [ 0.0790, -0.0957, 0.0415, ..., 0.0498, 0.1370, -0.0562], [-0.1041, 0.1557, -0.2184, ..., 0.0546, 0.0974, -0.0447], ..., [-0.0809, -0.0983, -0.0824, ..., 0.0116, -0.2177, 0.1610], [ 0.0506, -0.0511, 0.1229, ..., 0.0023, -0.2615, -0.0336], [-0.2512, -0.1242, -0.1766, ..., -0.3035, 0.0830, -0.1108]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, -3.7253e-09, -7.2643e-08, ..., -2.6077e-08, -6.3330e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.4901e-08, ..., 3.7253e-09, 1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.3039e-08, ..., -1.8626e-09, 1.8626e-08, 0.0000e+00], [-0.0000e+00, 0.0000e+00, -2.0489e-08, ..., -5.5879e-09, 1.3039e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 3.5390e-08, 0.0000e+00]], device='cuda:0') Epoch 373, bias, value: tensor([-0.0062, -0.0372, 0.0067, -0.0109, 0.0052, 0.0103, 0.0259, 0.0092, -0.0461, -0.0118], device='cuda:0'), grad: tensor([-1.5274e-07, -1.0431e-07, 5.0291e-08, 4.8429e-08, -1.1921e-07, 1.8999e-07, 5.2154e-08, 5.9605e-08, 2.7940e-08, -6.7055e-08], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 372---------------------------------------------------- epoch 372, time 264.53, cls_loss 0.0009 cls_loss_mapping 0.0015 cls_loss_causal 0.4962 re_mapping 0.0040 re_causal 0.0118 /// teacc 99.20 lr 0.00010000 Epoch 374, weight, value: tensor([[ 0.0234, -0.1956, -0.1773, ..., -0.3695, -0.1291, -0.1702], [ 0.0788, -0.0963, 0.0420, ..., 0.0502, 0.1381, -0.0565], [-0.1042, 0.1557, -0.2188, ..., 0.0546, 0.0974, -0.0435], ..., [-0.0816, -0.0983, -0.0829, ..., 0.0114, -0.2186, 0.1616], [ 0.0505, -0.0513, 0.1224, ..., 0.0019, -0.2623, -0.0338], [-0.2518, -0.1243, -0.1768, ..., -0.3045, 0.0827, -0.1109]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.6764e-08, 3.7253e-09, ..., 1.6764e-08, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-08, 1.8626e-09, ..., 5.0291e-08, 2.0489e-08, 0.0000e+00], [ 0.0000e+00, -1.2498e-06, 1.2480e-07, ..., -1.0543e-06, -6.4634e-07, 0.0000e+00], ..., [ 0.0000e+00, 1.0300e-06, 0.0000e+00, ..., 8.1770e-07, 5.0105e-07, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -1.3243e-06, ..., -3.6694e-07, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 3.7253e-09, 5.7742e-08, 0.0000e+00]], device='cuda:0') Epoch 374, bias, value: tensor([-0.0062, -0.0367, 0.0067, -0.0109, 0.0058, 0.0106, 0.0258, 0.0089, -0.0467, -0.0123], device='cuda:0'), grad: tensor([ 1.8626e-09, 2.0303e-07, -3.5781e-06, 9.7416e-07, -2.4773e-07, -3.7253e-09, 1.4640e-06, 2.9057e-06, -1.9502e-06, 2.0117e-07], device='cuda:0') 100 0.0001 changing lr epoch 373, time 247.45, cls_loss 0.0010 cls_loss_mapping 0.0014 cls_loss_causal 0.4440 re_mapping 0.0041 re_causal 0.0114 /// teacc 99.05 lr 0.00010000 Epoch 375, weight, value: tensor([[ 0.0235, -0.1955, -0.1776, ..., -0.3702, -0.1269, -0.1700], [ 0.0823, -0.0963, 0.0427, ..., 0.0496, 0.1381, -0.0566], [-0.1050, 0.1569, -0.2203, ..., 0.0560, 0.0992, -0.0435], ..., [-0.0820, -0.0984, -0.0834, ..., 0.0110, -0.2199, 0.1618], [ 0.0503, -0.0514, 0.1220, ..., 0.0014, -0.2633, -0.0338], [-0.2521, -0.1246, -0.1770, ..., -0.3058, 0.0813, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.5879e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, 1.8626e-09, ..., 7.4506e-09, -5.5879e-09, 0.0000e+00], ..., [-0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 2.5891e-07, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -3.7253e-08, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.6077e-08, ..., 0.0000e+00, -9.3132e-09, 0.0000e+00]], device='cuda:0') Epoch 375, bias, value: tensor([-0.0027, -0.0369, 0.0077, -0.0107, 0.0055, 0.0102, 0.0250, 0.0085, -0.0473, -0.0142], device='cuda:0'), grad: tensor([ 0.0000e+00, 2.7940e-08, 2.0489e-08, -6.6496e-07, 1.8626e-08, 1.3039e-08, -1.3039e-08, 6.2957e-07, -1.0245e-07, 5.9605e-08], device='cuda:0') 100 0.0001 changing lr epoch 374, time 247.21, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.4622 re_mapping 0.0042 re_causal 0.0119 /// teacc 99.06 lr 0.00010000 Epoch 376, weight, value: tensor([[ 0.0234, -0.1957, -0.1780, ..., -0.3708, -0.1272, -0.1702], [ 0.0830, -0.0978, 0.0449, ..., 0.0501, 0.1385, -0.0566], [-0.1051, 0.1579, -0.2217, ..., 0.0571, 0.1006, -0.0434], ..., [-0.0823, -0.0985, -0.0854, ..., 0.0092, -0.2230, 0.1629], [ 0.0504, -0.0514, 0.1223, ..., 0.0016, -0.2633, -0.0339], [-0.2527, -0.1247, -0.1773, ..., -0.3064, 0.0814, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 7.4506e-08, -1.8626e-09, ..., 1.7323e-07, 3.7625e-07, 0.0000e+00], [ 0.0000e+00, -8.5682e-08, 2.4214e-08, ..., -1.9930e-07, -4.1351e-07, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-09, 1.8626e-09, ..., 2.0489e-08, 4.6566e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -3.7253e-08, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 1.8626e-09, -1.1176e-08, 0.0000e+00]], device='cuda:0') Epoch 376, bias, value: tensor([-0.0028, -0.0354, 0.0084, -0.0105, 0.0055, 0.0098, 0.0257, 0.0064, -0.0470, -0.0143], device='cuda:0'), grad: tensor([ 4.6566e-08, 5.3272e-07, -4.1537e-07, 2.9802e-08, -2.4214e-08, 1.4901e-08, 2.2352e-08, 7.6368e-08, -2.4214e-07, -4.4703e-08], device='cuda:0') 100 0.0001 changing lr epoch 375, time 247.16, cls_loss 0.0008 cls_loss_mapping 0.0011 cls_loss_causal 0.4730 re_mapping 0.0041 re_causal 0.0120 /// teacc 99.06 lr 0.00010000 Epoch 377, weight, value: tensor([[ 0.0231, -0.1960, -0.1784, ..., -0.3713, -0.1274, -0.1702], [ 0.0831, -0.0978, 0.0448, ..., 0.0498, 0.1386, -0.0567], [-0.1052, 0.1579, -0.2224, ..., 0.0571, 0.1006, -0.0435], ..., [-0.0824, -0.0985, -0.0852, ..., 0.0096, -0.2233, 0.1630], [ 0.0506, -0.0514, 0.1220, ..., 0.0015, -0.2656, -0.0340], [-0.2528, -0.1248, -0.1762, ..., -0.3069, 0.0837, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 1.8626e-09, 1.0803e-07, 0.0000e+00], [-2.8312e-07, 5.5879e-09, -2.1607e-07, ..., -1.8254e-07, -4.8615e-07, 0.0000e+00], [ 4.6566e-08, -2.9802e-08, 8.3819e-08, ..., 3.5390e-08, 2.0862e-07, 0.0000e+00], ..., [ 1.8626e-09, 2.4214e-08, 1.4901e-08, ..., 7.4506e-09, 8.7544e-08, 0.0000e+00], [ 2.0489e-08, 0.0000e+00, 1.8626e-08, ..., 1.4901e-08, 5.9605e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 1.8626e-09, 2.2352e-07, 0.0000e+00]], device='cuda:0') Epoch 377, bias, value: tensor([-0.0029, -0.0357, 0.0084, -0.0108, 0.0042, 0.0097, 0.0260, 0.0066, -0.0482, -0.0125], device='cuda:0'), grad: tensor([ 2.2911e-07, -2.6636e-07, 3.4459e-07, -4.1537e-07, -1.5870e-06, -3.9116e-08, 8.4564e-07, 2.2911e-07, 8.5682e-08, 5.7369e-07], device='cuda:0') 100 0.0001 changing lr epoch 376, time 247.40, cls_loss 0.0007 cls_loss_mapping 0.0013 cls_loss_causal 0.4671 re_mapping 0.0041 re_causal 0.0121 /// teacc 99.19 lr 0.00010000 Epoch 378, weight, value: tensor([[ 0.0231, -0.1960, -0.1791, ..., -0.3717, -0.1276, -0.1704], [ 0.0834, -0.0978, 0.0447, ..., 0.0497, 0.1387, -0.0568], [-0.1056, 0.1579, -0.2229, ..., 0.0571, 0.1006, -0.0437], ..., [-0.0823, -0.0985, -0.0852, ..., 0.0098, -0.2234, 0.1630], [ 0.0508, -0.0511, 0.1222, ..., 0.0016, -0.2660, -0.0341], [-0.2528, -0.1248, -0.1761, ..., -0.3073, 0.0841, -0.1109]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -2.0489e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.4773e-07, 1.8999e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., -2.4959e-07, -2.0675e-07, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -5.5879e-09, 2.2352e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, -7.4506e-09, 0.0000e+00]], device='cuda:0') Epoch 378, bias, value: tensor([-0.0029, -0.0358, 0.0084, -0.0109, 0.0042, 0.0098, 0.0259, 0.0067, -0.0483, -0.0122], device='cuda:0'), grad: tensor([-1.0058e-07, 4.7311e-07, -4.0606e-07, 2.0489e-08, 1.2480e-07, 0.0000e+00, 7.4506e-09, -1.5087e-07, 3.7253e-09, 2.6077e-08], device='cuda:0') 100 0.0001 changing lr epoch 377, time 247.36, cls_loss 0.0010 cls_loss_mapping 0.0013 cls_loss_causal 0.4977 re_mapping 0.0040 re_causal 0.0117 /// teacc 99.10 lr 0.00010000 Epoch 379, weight, value: tensor([[ 0.0231, -0.1964, -0.1818, ..., -0.3730, -0.1280, -0.1707], [ 0.0835, -0.0978, 0.0450, ..., 0.0497, 0.1387, -0.0578], [-0.1058, 0.1579, -0.2232, ..., 0.0571, 0.1006, -0.0459], ..., [-0.0824, -0.0985, -0.0853, ..., 0.0098, -0.2235, 0.1652], [ 0.0510, -0.0511, 0.1222, ..., 0.0014, -0.2666, -0.0346], [-0.2528, -0.1248, -0.1765, ..., -0.3086, 0.0853, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., 1.8626e-09, 4.0978e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., -0.0000e+00, 4.2841e-08, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 1.8626e-08, ..., 9.3132e-09, -0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 9.3132e-09, ..., -9.3132e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.3448e-06, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.7940e-08, ..., 1.6764e-08, -2.7940e-08, 0.0000e+00]], device='cuda:0') Epoch 379, bias, value: tensor([-0.0027, -0.0358, 0.0083, -0.0109, 0.0034, 0.0099, 0.0254, 0.0068, -0.0485, -0.0118], device='cuda:0'), grad: tensor([ 1.8440e-07, 1.5274e-07, 6.1467e-08, -2.7940e-08, -1.6205e-07, 1.5032e-06, 1.0170e-06, -2.4214e-07, -2.6450e-06, 1.6764e-07], device='cuda:0') 100 0.0001 changing lr epoch 378, time 247.07, cls_loss 0.0009 cls_loss_mapping 0.0018 cls_loss_causal 0.4802 re_mapping 0.0038 re_causal 0.0116 /// teacc 99.12 lr 0.00010000 Epoch 380, weight, value: tensor([[ 0.0231, -0.1968, -0.1826, ..., -0.3735, -0.1282, -0.1711], [ 0.0835, -0.0978, 0.0450, ..., 0.0497, 0.1388, -0.0591], [-0.1058, 0.1579, -0.2240, ..., 0.0571, 0.1006, -0.0459], ..., [-0.0825, -0.0985, -0.0853, ..., 0.0098, -0.2236, 0.1658], [ 0.0509, -0.0511, 0.1228, ..., 0.0015, -0.2667, -0.0327], [-0.2529, -0.1249, -0.1769, ..., -0.3093, 0.0852, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., -0.0000e+00, -0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 380, bias, value: tensor([ 0.0004, -0.0359, 0.0082, -0.0091, 0.0037, 0.0088, 0.0220, 0.0068, -0.0483, -0.0120], device='cuda:0'), grad: tensor([ 1.8626e-09, 1.4901e-08, 9.3132e-09, -9.3132e-09, -7.4506e-09, -3.1665e-08, -2.9802e-08, -2.9989e-07, 2.0489e-08, 3.2410e-07], device='cuda:0') 100 0.0001 changing lr epoch 379, time 247.88, cls_loss 0.0008 cls_loss_mapping 0.0009 cls_loss_causal 0.4619 re_mapping 0.0039 re_causal 0.0118 /// teacc 99.08 lr 0.00010000 Epoch 381, weight, value: tensor([[ 0.0230, -0.1969, -0.1828, ..., -0.3738, -0.1283, -0.1700], [ 0.0835, -0.0978, 0.0450, ..., 0.0494, 0.1386, -0.0607], [-0.1059, 0.1579, -0.2242, ..., 0.0572, 0.1010, -0.0460], ..., [-0.0825, -0.0985, -0.0853, ..., 0.0101, -0.2237, 0.1672], [ 0.0509, -0.0511, 0.1236, ..., 0.0019, -0.2667, -0.0329], [-0.2529, -0.1250, -0.1772, ..., -0.3108, 0.0851, -0.1111]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 0.0000e+00, 2.6077e-08, 0.0000e+00], [ 5.5879e-09, 0.0000e+00, -5.5879e-08, ..., 5.5879e-09, -4.0978e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, -3.7253e-09, 0.0000e+00], ..., [-9.3132e-09, 3.7253e-09, 3.7253e-09, ..., 5.5879e-09, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.4703e-08, ..., 9.3132e-09, 7.4506e-09, 0.0000e+00], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, -1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 381, bias, value: tensor([ 0.0004, -0.0362, 0.0085, -0.0102, 0.0039, 0.0088, 0.0219, 0.0070, -0.0477, -0.0122], device='cuda:0'), grad: tensor([ 3.7253e-08, -9.8720e-08, 3.7253e-09, -1.9372e-07, 1.8626e-09, 1.8440e-07, 1.6764e-08, -5.7742e-08, 8.0094e-08, 2.6077e-08], device='cuda:0') 100 0.0001 changing lr epoch 380, time 247.92, cls_loss 0.0009 cls_loss_mapping 0.0019 cls_loss_causal 0.4815 re_mapping 0.0039 re_causal 0.0116 /// teacc 99.02 lr 0.00010000 Epoch 382, weight, value: tensor([[ 0.0226, -0.1998, -0.1833, ..., -0.3756, -0.1285, -0.1700], [ 0.0834, -0.0978, 0.0457, ..., 0.0500, 0.1390, -0.0618], [-0.1060, 0.1584, -0.2251, ..., 0.0573, 0.1011, -0.0459], ..., [-0.0824, -0.0987, -0.0859, ..., 0.0096, -0.2245, 0.1692], [ 0.0509, -0.0531, 0.1243, ..., 0.0010, -0.2672, -0.0351], [-0.2530, -0.1251, -0.1775, ..., -0.3118, 0.0853, -0.1113]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 4.0978e-08, ..., 2.4214e-08, 3.7253e-08, 0.0000e+00], [ 5.0291e-08, 1.3039e-08, -1.8068e-07, ..., -5.0291e-08, 2.0489e-08, -0.0000e+00], [ 0.0000e+00, 3.3528e-08, 1.4901e-08, ..., 2.0489e-08, -8.0094e-08, 0.0000e+00], ..., [ 5.5879e-09, -6.1467e-08, 2.2352e-08, ..., -8.5682e-08, 3.9116e-08, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 8.3819e-08, ..., 4.8429e-08, 7.2643e-08, 0.0000e+00], [ 1.8626e-09, 1.8626e-09, 3.7253e-09, ..., 5.5879e-09, -0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 382, bias, value: tensor([ 0.0001, -0.0357, 0.0087, -0.0092, 0.0040, 0.0076, 0.0220, 0.0065, -0.0478, -0.0122], device='cuda:0'), grad: tensor([ 1.6578e-07, -2.7567e-07, 1.1921e-07, 1.2480e-07, -2.5518e-07, -4.9919e-07, 4.2096e-07, -2.2165e-07, 3.9861e-07, 2.4214e-08], device='cuda:0') 100 0.0001 changing lr epoch 381, time 247.09, cls_loss 0.0009 cls_loss_mapping 0.0019 cls_loss_causal 0.4733 re_mapping 0.0039 re_causal 0.0113 /// teacc 99.06 lr 0.00010000 Epoch 383, weight, value: tensor([[ 0.0220, -0.2000, -0.1843, ..., -0.3761, -0.1276, -0.1701], [ 0.0840, -0.0980, 0.0433, ..., 0.0478, 0.1391, -0.0629], [-0.1061, 0.1590, -0.2259, ..., 0.0575, 0.1012, -0.0457], ..., [-0.0824, -0.0993, -0.0835, ..., 0.0117, -0.2246, 0.1700], [ 0.0507, -0.0531, 0.1259, ..., 0.0020, -0.2673, -0.0350], [-0.2531, -0.1254, -0.1779, ..., -0.3128, 0.0848, -0.1115]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, -1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 5.4017e-08, -3.7253e-09, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 0.0000e+00, ..., 1.3039e-08, -1.8626e-09, 0.0000e+00], ..., [-0.0000e+00, 0.0000e+00, 1.8626e-09, ..., -1.2107e-07, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.9116e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 383, bias, value: tensor([ 0.0007, -0.0381, 0.0089, -0.0094, 0.0040, 0.0075, 0.0219, 0.0089, -0.0467, -0.0135], device='cuda:0'), grad: tensor([ 3.7253e-08, 2.1420e-07, 5.0291e-08, 2.4214e-08, 0.0000e+00, 3.7253e-09, -2.9802e-08, -4.7125e-07, 1.4342e-07, 2.4214e-08], device='cuda:0') 100 0.0001 changing lr epoch 382, time 247.47, cls_loss 0.0008 cls_loss_mapping 0.0010 cls_loss_causal 0.4627 re_mapping 0.0039 re_causal 0.0115 /// teacc 99.11 lr 0.00010000 Epoch 384, weight, value: tensor([[ 0.0212, -0.2004, -0.1846, ..., -0.3768, -0.1277, -0.1706], [ 0.0847, -0.0979, 0.0433, ..., 0.0478, 0.1392, -0.0630], [-0.1064, 0.1603, -0.2265, ..., 0.0580, 0.1014, -0.0457], ..., [-0.0821, -0.1011, -0.0835, ..., 0.0114, -0.2249, 0.1700], [ 0.0497, -0.0533, 0.1257, ..., 0.0017, -0.2681, -0.0351], [-0.2531, -0.1255, -0.1775, ..., -0.3120, 0.0848, -0.1113]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.5635e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -9.3132e-10, ..., 8.3819e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.5832e-08, ..., 8.3819e-09, 1.3039e-08, 0.0000e+00], ..., [ 1.3039e-08, 0.0000e+00, 3.7253e-09, ..., -6.5193e-09, 4.1910e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -2.5146e-08, 0.0000e+00]], device='cuda:0') Epoch 384, bias, value: tensor([ 0.0007, -0.0382, 0.0096, -0.0094, 0.0033, 0.0075, 0.0219, 0.0086, -0.0475, -0.0130], device='cuda:0'), grad: tensor([ 2.2817e-07, 4.2841e-08, 5.7742e-08, 4.5635e-08, -2.7474e-07, -6.9663e-07, -2.8219e-07, 2.2631e-07, 1.6578e-07, 4.9546e-07], device='cuda:0') 100 0.0001 changing lr epoch 383, time 247.51, cls_loss 0.0009 cls_loss_mapping 0.0016 cls_loss_causal 0.4707 re_mapping 0.0040 re_causal 0.0114 /// teacc 99.12 lr 0.00010000 Epoch 385, weight, value: tensor([[ 0.0211, -0.2003, -0.1852, ..., -0.3771, -0.1284, -0.1731], [ 0.0860, -0.0979, 0.0433, ..., 0.0478, 0.1393, -0.0639], [-0.1068, 0.1607, -0.2274, ..., 0.0582, 0.1013, -0.0454], ..., [-0.0823, -0.1016, -0.0835, ..., 0.0112, -0.2251, 0.1741], [ 0.0495, -0.0531, 0.1265, ..., 0.0020, -0.2686, -0.0352], [-0.2534, -0.1257, -0.1776, ..., -0.3126, 0.0852, -0.1139]], device='cuda:0'), grad: tensor([[-0.0000e+00, 1.4901e-08, 4.3772e-08, ..., 4.4703e-08, 5.7742e-08, 0.0000e+00], [ 0.0000e+00, 1.7695e-08, -1.6820e-06, ..., -9.6578e-07, -1.5656e-06, 0.0000e+00], [ 0.0000e+00, -1.9372e-07, 2.5332e-07, ..., -8.0094e-08, 1.2107e-08, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-08, 1.1753e-06, ..., 7.8604e-07, 1.2126e-06, 0.0000e+00], [ 0.0000e+00, 3.5390e-08, 1.4622e-07, ..., 1.2945e-07, 1.8533e-07, 0.0000e+00], [ 0.0000e+00, 7.4506e-09, 5.4017e-08, ..., 4.0047e-08, 5.7742e-08, -0.0000e+00]], device='cuda:0') Epoch 385, bias, value: tensor([ 0.0005, -0.0381, 0.0098, -0.0091, 0.0032, 0.0068, 0.0220, 0.0085, -0.0472, -0.0127], device='cuda:0'), grad: tensor([ 2.5798e-07, -5.8934e-06, -4.6939e-07, 1.4435e-07, 4.0978e-08, 5.0291e-08, -6.7987e-08, 4.8652e-06, 8.0373e-07, 2.4587e-07], device='cuda:0') 100 0.0001 changing lr epoch 384, time 247.50, cls_loss 0.0009 cls_loss_mapping 0.0016 cls_loss_causal 0.4674 re_mapping 0.0040 re_causal 0.0112 /// teacc 99.07 lr 0.00010000 Epoch 386, weight, value: tensor([[ 0.0211, -0.2006, -0.1854, ..., -0.3776, -0.1280, -0.1732], [ 0.0862, -0.0979, 0.0434, ..., 0.0479, 0.1395, -0.0647], [-0.1069, 0.1606, -0.2279, ..., 0.0580, 0.1013, -0.0457], ..., [-0.0823, -0.1014, -0.0836, ..., 0.0113, -0.2255, 0.1752], [ 0.0494, -0.0533, 0.1269, ..., 0.0020, -0.2688, -0.0354], [-0.2535, -0.1259, -0.1782, ..., -0.3140, 0.0851, -0.1147]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-09, ..., 1.8626e-09, 8.3819e-09, 0.0000e+00], [ 0.0000e+00, 1.1176e-08, -1.6764e-08, ..., 5.5879e-09, -9.3132e-09, -0.0000e+00], [ 0.0000e+00, 3.3528e-08, 2.7940e-09, ..., 3.0734e-08, 2.0489e-08, 0.0000e+00], ..., [ 0.0000e+00, -2.0862e-07, 5.5879e-09, ..., -1.8347e-07, -9.6858e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -4.6566e-09, ..., -1.8626e-09, 8.3819e-09, 9.3132e-10], [ 0.0000e+00, 1.5460e-07, 9.3132e-10, ..., 1.3784e-07, 8.5682e-08, 0.0000e+00]], device='cuda:0') Epoch 386, bias, value: tensor([ 0.0024, -0.0381, 0.0094, -0.0088, 0.0029, 0.0061, 0.0215, 0.0087, -0.0473, -0.0151], device='cuda:0'), grad: tensor([ 5.5879e-09, 1.7695e-08, 1.5274e-07, 9.3132e-09, -2.0489e-08, -9.3132e-10, 3.7253e-09, -8.5775e-07, 1.2107e-08, 6.8918e-07], device='cuda:0') 100 0.0001 changing lr epoch 385, time 247.60, cls_loss 0.0007 cls_loss_mapping 0.0011 cls_loss_causal 0.4823 re_mapping 0.0039 re_causal 0.0118 /// teacc 99.08 lr 0.00010000 Epoch 387, weight, value: tensor([[ 0.0210, -0.2007, -0.1859, ..., -0.3778, -0.1281, -0.1732], [ 0.0860, -0.0980, 0.0440, ..., 0.0482, 0.1400, -0.0656], [-0.1071, 0.1606, -0.2283, ..., 0.0580, 0.1014, -0.0458], ..., [-0.0805, -0.1014, -0.0840, ..., 0.0111, -0.2262, 0.1757], [ 0.0492, -0.0532, 0.1269, ..., 0.0020, -0.2690, -0.0354], [-0.2538, -0.1262, -0.1784, ..., -0.3150, 0.0850, -0.1150]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.4261e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.7940e-09, ..., -0.0000e+00, -2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 2.7940e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 387, bias, value: tensor([ 0.0025, -0.0377, 0.0094, -0.0088, 0.0031, 0.0057, 0.0216, 0.0084, -0.0475, -0.0153], device='cuda:0'), grad: tensor([ 1.8626e-07, 1.8626e-09, 9.3132e-09, 5.5879e-09, -7.4506e-09, 1.3039e-07, -3.3434e-07, -9.3132e-10, 2.7940e-09, 1.1176e-08], device='cuda:0') 100 0.0001 changing lr epoch 386, time 247.40, cls_loss 0.0008 cls_loss_mapping 0.0016 cls_loss_causal 0.4873 re_mapping 0.0038 re_causal 0.0115 /// teacc 99.08 lr 0.00010000 Epoch 388, weight, value: tensor([[ 0.0209, -0.2010, -0.1864, ..., -0.3781, -0.1282, -0.1732], [ 0.0890, -0.0980, 0.0441, ..., 0.0482, 0.1401, -0.0674], [-0.1083, 0.1607, -0.2287, ..., 0.0580, 0.1015, -0.0461], ..., [-0.0808, -0.1014, -0.0842, ..., 0.0111, -0.2265, 0.1754], [ 0.0487, -0.0530, 0.1273, ..., 0.0021, -0.2694, -0.0367], [-0.2553, -0.1263, -0.1786, ..., -0.3159, 0.0833, -0.1119]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-08, ..., 4.1910e-08, 1.8626e-09, 4.6566e-09], [ 0.0000e+00, 0.0000e+00, 6.4261e-08, ..., 3.5390e-08, 1.8626e-09, 9.3132e-10], ..., [ 0.0000e+00, 0.0000e+00, 4.3772e-08, ..., -2.0489e-08, 2.7940e-09, -4.6566e-09], [ 0.0000e+00, 0.0000e+00, 1.3039e-08, ..., 4.6566e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 5.5879e-09, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 388, bias, value: tensor([ 0.0025, -0.0377, 0.0094, -0.0090, 0.0065, 0.0057, 0.0215, 0.0083, -0.0475, -0.0172], device='cuda:0'), grad: tensor([ 1.3039e-08, 1.3318e-07, 1.7416e-07, -7.6089e-07, 0.0000e+00, 3.7998e-07, 1.8626e-09, 3.7253e-09, 3.5390e-08, 2.3283e-08], device='cuda:0') 100 0.0001 changing lr epoch 387, time 247.61, cls_loss 0.0009 cls_loss_mapping 0.0011 cls_loss_causal 0.4902 re_mapping 0.0037 re_causal 0.0115 /// teacc 99.07 lr 0.00010000 Epoch 389, weight, value: tensor([[ 0.0209, -0.2012, -0.1870, ..., -0.3788, -0.1281, -0.1734], [ 0.0891, -0.0980, 0.0454, ..., 0.0491, 0.1414, -0.0641], [-0.1088, 0.1607, -0.2293, ..., 0.0580, 0.1015, -0.0457], ..., [-0.0806, -0.1015, -0.0854, ..., 0.0102, -0.2285, 0.1721], [ 0.0486, -0.0530, 0.1274, ..., 0.0021, -0.2697, -0.0371], [-0.2555, -0.1265, -0.1788, ..., -0.3166, 0.0833, -0.1119]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 6.5193e-09, 1.8626e-09, ..., 1.4901e-08, 1.3039e-07, 0.0000e+00], [ 0.0000e+00, 1.3039e-08, 1.1809e-06, ..., 2.0545e-06, 1.3970e-08, 2.9802e-08], [ 0.0000e+00, -2.7381e-07, 1.6764e-08, ..., -4.2282e-07, -2.5425e-07, 0.0000e+00], ..., [ 0.0000e+00, 2.4494e-07, -1.2247e-06, ..., -1.7080e-06, 2.1979e-07, -3.0734e-08], [ 0.0000e+00, 4.6566e-09, 9.3132e-10, ..., 1.5832e-08, 1.7323e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 4.6566e-09, 2.7940e-09, 0.0000e+00]], device='cuda:0') Epoch 389, bias, value: tensor([ 0.0021, -0.0367, 0.0093, -0.0095, 0.0067, 0.0090, 0.0185, 0.0074, -0.0485, -0.0173], device='cuda:0'), grad: tensor([ 4.9826e-07, 6.5938e-06, -1.4734e-06, 1.7136e-07, 5.3085e-08, -7.4506e-09, -1.1334e-06, -5.3570e-06, 6.4075e-07, 2.7940e-08], device='cuda:0') 100 0.0001 changing lr epoch 388, time 247.18, cls_loss 0.0007 cls_loss_mapping 0.0007 cls_loss_causal 0.4472 re_mapping 0.0037 re_causal 0.0112 /// teacc 99.04 lr 0.00010000 Epoch 390, weight, value: tensor([[ 0.0209, -0.2018, -0.1879, ..., -0.3796, -0.1284, -0.1737], [ 0.0891, -0.0980, 0.0448, ..., 0.0484, 0.1413, -0.0642], [-0.1088, 0.1608, -0.2298, ..., 0.0580, 0.1016, -0.0448], ..., [-0.0807, -0.1015, -0.0848, ..., 0.0108, -0.2298, 0.1718], [ 0.0487, -0.0529, 0.1275, ..., 0.0020, -0.2701, -0.0384], [-0.2563, -0.1269, -0.1791, ..., -0.3150, 0.0861, -0.1095]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 9.3132e-10, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, -6.5193e-09, ..., -2.7940e-09, -6.5193e-09, -0.0000e+00], [ 0.0000e+00, -2.7940e-09, 1.8626e-09, ..., -9.3132e-10, -2.7940e-09, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-10, 2.7940e-09, ..., 1.8626e-09, 6.5193e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -3.2596e-08, ..., -2.7940e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -4.6566e-09, ..., 0.0000e+00, -1.8626e-08, 0.0000e+00]], device='cuda:0') Epoch 390, bias, value: tensor([ 0.0019, -0.0374, 0.0093, -0.0096, 0.0068, 0.0090, 0.0185, 0.0073, -0.0487, -0.0151], device='cuda:0'), grad: tensor([ 9.6858e-08, -9.3132e-09, -7.4506e-09, 7.2643e-08, 4.6566e-09, 8.3819e-09, -2.0489e-08, 1.3039e-08, -6.9849e-08, -7.9162e-08], device='cuda:0') 100 0.0001 changing lr epoch 389, time 247.48, cls_loss 0.0006 cls_loss_mapping 0.0010 cls_loss_causal 0.4463 re_mapping 0.0038 re_causal 0.0113 /// teacc 99.15 lr 0.00010000 Epoch 391, weight, value: tensor([[ 0.0209, -0.2020, -0.1885, ..., -0.3801, -0.1285, -0.1739], [ 0.0891, -0.0980, 0.0449, ..., 0.0484, 0.1415, -0.0642], [-0.1088, 0.1609, -0.2300, ..., 0.0581, 0.1016, -0.0448], ..., [-0.0807, -0.1016, -0.0849, ..., 0.0108, -0.2301, 0.1719], [ 0.0487, -0.0531, 0.1281, ..., 0.0021, -0.2703, -0.0389], [-0.2564, -0.1272, -0.1793, ..., -0.3154, 0.0861, -0.1095]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 1.5832e-08, 0.0000e+00, ..., 5.0291e-08, 2.7008e-08, 0.0000e+00], [ 0.0000e+00, -2.0489e-08, 0.0000e+00, ..., -4.5635e-08, -3.3528e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 9.3132e-10, ..., -6.5193e-09, 6.5193e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, -1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 391, bias, value: tensor([ 0.0019, -0.0374, 0.0094, -0.0097, 0.0069, 0.0091, 0.0185, 0.0073, -0.0486, -0.0152], device='cuda:0'), grad: tensor([-1.0543e-06, 9.5926e-08, 7.8231e-08, 1.9558e-08, 5.0291e-08, 4.9360e-08, 5.2713e-07, -2.4214e-08, 2.7940e-08, 2.3656e-07], device='cuda:0') 100 0.0001 changing lr epoch 390, time 247.55, cls_loss 0.0009 cls_loss_mapping 0.0017 cls_loss_causal 0.4804 re_mapping 0.0039 re_causal 0.0113 /// teacc 99.10 lr 0.00010000 Epoch 392, weight, value: tensor([[ 0.0213, -0.2019, -0.1890, ..., -0.3803, -0.1286, -0.1739], [ 0.0890, -0.0981, 0.0450, ..., 0.0483, 0.1424, -0.0643], [-0.1095, 0.1611, -0.2302, ..., 0.0582, 0.1017, -0.0446], ..., [-0.0803, -0.1017, -0.0851, ..., 0.0109, -0.2316, 0.1720], [ 0.0488, -0.0534, 0.1285, ..., 0.0022, -0.2708, -0.0386], [-0.2566, -0.1277, -0.1797, ..., -0.3159, 0.0857, -0.1095]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -2.7940e-09, 0.0000e+00, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -2.7940e-09, ..., 7.4506e-09, 8.3819e-09, 0.0000e+00], [ 0.0000e+00, -1.1269e-07, 1.8626e-09, ..., -1.1362e-07, -7.0781e-08, -9.3132e-10], ..., [ 0.0000e+00, 6.1467e-08, 2.7940e-09, ..., 5.1223e-08, 4.2841e-08, 0.0000e+00], [ 0.0000e+00, 4.7497e-08, -9.3132e-10, ..., 4.7497e-08, 2.9802e-08, 0.0000e+00], [ 0.0000e+00, 2.7940e-09, 2.7940e-09, ..., 2.7940e-09, 7.5437e-08, 0.0000e+00]], device='cuda:0') Epoch 392, bias, value: tensor([ 0.0022, -0.0373, 0.0095, -0.0097, 0.0083, 0.0091, 0.0185, 0.0070, -0.0487, -0.0156], device='cuda:0'), grad: tensor([-1.5832e-08, 5.7742e-08, -2.5984e-07, 5.5879e-08, -2.2817e-07, -1.0896e-07, -8.3819e-09, 1.0431e-07, 1.6112e-07, 2.4401e-07], device='cuda:0') 100 0.0001 changing lr epoch 391, time 247.49, cls_loss 0.0008 cls_loss_mapping 0.0014 cls_loss_causal 0.4647 re_mapping 0.0038 re_causal 0.0110 /// teacc 99.05 lr 0.00010000 Epoch 393, weight, value: tensor([[ 0.0212, -0.2014, -0.1893, ..., -0.3807, -0.1292, -0.1746], [ 0.0890, -0.0987, 0.0450, ..., 0.0481, 0.1415, -0.0644], [-0.1097, 0.1617, -0.2287, ..., 0.0586, 0.1026, -0.0444], ..., [-0.0802, -0.1020, -0.0851, ..., 0.0109, -0.2319, 0.1721], [ 0.0487, -0.0535, 0.1288, ..., 0.0021, -0.2711, -0.0387], [-0.2567, -0.1291, -0.1794, ..., -0.3167, 0.0871, -0.1096]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -9.3132e-10, ..., 2.7940e-09, -2.7940e-09, 0.0000e+00], [ 0.0000e+00, -6.5193e-09, 9.3132e-10, ..., -1.9558e-08, -7.4506e-09, 0.0000e+00], ..., [ 0.0000e+00, 5.5879e-09, 2.7940e-09, ..., 1.5832e-08, 8.3819e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 393, bias, value: tensor([ 0.0021, -0.0378, 0.0101, -0.0098, 0.0081, 0.0090, 0.0186, 0.0070, -0.0487, -0.0148], device='cuda:0'), grad: tensor([-3.3528e-08, 3.7253e-09, -3.0734e-08, -2.7940e-09, 3.7253e-09, 1.4901e-08, 1.0245e-08, 2.7940e-08, 9.3132e-10, 9.3132e-09], device='cuda:0') 100 0.0001 changing lr epoch 392, time 247.54, cls_loss 0.0011 cls_loss_mapping 0.0014 cls_loss_causal 0.4620 re_mapping 0.0037 re_causal 0.0103 /// teacc 99.07 lr 0.00010000 Epoch 394, weight, value: tensor([[ 0.0211, -0.2013, -0.1906, ..., -0.3830, -0.1303, -0.1753], [ 0.0889, -0.0991, 0.0450, ..., 0.0474, 0.1415, -0.0644], [-0.1096, 0.1627, -0.2282, ..., 0.0596, 0.1031, -0.0438], ..., [-0.0804, -0.1027, -0.0852, ..., 0.0112, -0.2323, 0.1721], [ 0.0485, -0.0549, 0.1296, ..., 0.0020, -0.2714, -0.0389], [-0.2567, -0.1306, -0.1808, ..., -0.3194, 0.0872, -0.1096]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 3.7253e-09, 1.8626e-09, 0.0000e+00], [ 9.3132e-10, 9.3132e-10, -2.8200e-06, ..., -1.0049e-06, -2.1216e-06, 0.0000e+00], [ 0.0000e+00, -1.4901e-08, 1.5832e-08, ..., 3.0734e-08, -6.5193e-09, 0.0000e+00], ..., [ 1.8626e-09, -1.8626e-09, 2.7996e-06, ..., 9.3225e-07, 2.1197e-06, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, -1.1176e-08, ..., -3.7253e-09, 5.5879e-09, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 1.8626e-09, ..., 2.1420e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 394, bias, value: tensor([ 0.0019, -0.0381, 0.0111, -0.0103, 0.0083, 0.0091, 0.0186, 0.0070, -0.0489, -0.0149], device='cuda:0'), grad: tensor([ 1.3970e-08, -5.7071e-06, 5.0291e-08, 6.8918e-08, -1.4901e-08, 1.2107e-08, 8.3819e-09, 5.5060e-06, -4.6566e-09, 7.3574e-08], device='cuda:0') 100 0.0001 changing lr epoch 393, time 247.60, cls_loss 0.0006 cls_loss_mapping 0.0010 cls_loss_causal 0.4395 re_mapping 0.0039 re_causal 0.0114 /// teacc 99.07 lr 0.00010000 Epoch 395, weight, value: tensor([[ 0.0211, -0.2014, -0.1909, ..., -0.3833, -0.1304, -0.1753], [ 0.0890, -0.0986, 0.0451, ..., 0.0475, 0.1419, -0.0644], [-0.1096, 0.1626, -0.2289, ..., 0.0593, 0.1030, -0.0437], ..., [-0.0805, -0.1028, -0.0852, ..., 0.0114, -0.2327, 0.1721], [ 0.0484, -0.0550, 0.1297, ..., 0.0019, -0.2716, -0.0389], [-0.2568, -0.1313, -0.1813, ..., -0.3209, 0.0871, -0.1096]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.1420e-08, 1.1176e-08, ..., 0.0000e+00, 1.3225e-06, 0.0000e+00], [-9.3132e-10, 0.0000e+00, 1.2107e-08, ..., -1.8626e-09, 3.4459e-08, 0.0000e+00], [ 0.0000e+00, -8.3819e-09, 9.3132e-10, ..., -2.2352e-08, -1.6764e-08, 0.0000e+00], ..., [ 0.0000e+00, 7.4506e-09, 2.7940e-09, ..., 2.0489e-08, 1.6764e-08, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 3.1665e-08, ..., 9.3132e-10, 7.0781e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 395, bias, value: tensor([ 0.0019, -0.0380, 0.0109, -0.0103, 0.0083, 0.0090, 0.0186, 0.0071, -0.0490, -0.0150], device='cuda:0'), grad: tensor([ 4.6566e-06, 1.3504e-07, -4.4703e-08, -6.5193e-09, 7.4506e-09, 2.7381e-07, -5.3123e-06, 4.9360e-08, 2.2911e-07, 1.6764e-08], device='cuda:0') 100 0.0001 changing lr epoch 394, time 247.39, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.4676 re_mapping 0.0037 re_causal 0.0111 /// teacc 98.99 lr 0.00010000 Epoch 396, weight, value: tensor([[ 0.0211, -0.2015, -0.1915, ..., -0.3836, -0.1306, -0.1754], [ 0.0890, -0.0985, 0.0451, ..., 0.0474, 0.1419, -0.0649], [-0.1097, 0.1627, -0.2298, ..., 0.0592, 0.1030, -0.0437], ..., [-0.0803, -0.1029, -0.0852, ..., 0.0116, -0.2330, 0.1727], [ 0.0484, -0.0550, 0.1299, ..., 0.0016, -0.2720, -0.0401], [-0.2571, -0.1316, -0.1804, ..., -0.3220, 0.0875, -0.1096]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 8.7544e-08, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 1.3039e-08, ..., 1.6764e-07, 3.5390e-08, 2.4214e-08], [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., -3.9116e-07, -1.9930e-07, -3.7253e-08], ..., [-1.8626e-09, -0.0000e+00, -1.3039e-08, ..., 9.8720e-08, 1.8999e-07, -1.8626e-09], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 8.0094e-08, 1.1735e-07, 7.4506e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, 3.7253e-08, 0.0000e+00]], device='cuda:0') Epoch 396, bias, value: tensor([ 0.0019, -0.0382, 0.0107, -0.0105, 0.0088, 0.0091, 0.0186, 0.0072, -0.0493, -0.0149], device='cuda:0'), grad: tensor([ 3.8184e-07, 5.9977e-07, -5.5321e-07, 0.0000e+00, -6.1654e-07, 9.2760e-07, -1.6037e-06, 1.1176e-07, 4.9733e-07, 2.5146e-07], device='cuda:0') 100 0.0001 changing lr epoch 395, time 247.31, cls_loss 0.0008 cls_loss_mapping 0.0011 cls_loss_causal 0.4620 re_mapping 0.0038 re_causal 0.0110 /// teacc 98.98 lr 0.00010000 Epoch 397, weight, value: tensor([[ 0.0205, -0.2015, -0.1925, ..., -0.3846, -0.1312, -0.1758], [ 0.0888, -0.0984, 0.0451, ..., 0.0474, 0.1422, -0.0652], [-0.1101, 0.1626, -0.2312, ..., 0.0589, 0.1028, -0.0435], ..., [-0.0799, -0.1029, -0.0852, ..., 0.0117, -0.2332, 0.1730], [ 0.0483, -0.0550, 0.1319, ..., 0.0025, -0.2722, -0.0404], [-0.2574, -0.1321, -0.1811, ..., -0.3231, 0.0876, -0.1097]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 1.8626e-09, 2.4214e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -3.3528e-08, 7.4506e-09, ..., -8.3819e-08, -1.1176e-08, 0.0000e+00], ..., [ 0.0000e+00, 2.6077e-08, 8.9407e-08, ..., 9.8720e-08, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.5146e-07, ..., -8.7544e-08, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-08, ..., 1.6764e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 397, bias, value: tensor([ 0.0018, -0.0383, 0.0105, -0.0106, 0.0091, 0.0091, 0.0187, 0.0073, -0.0482, -0.0150], device='cuda:0'), grad: tensor([ 2.5891e-07, 3.1665e-08, -1.2852e-07, 1.8440e-07, 2.9802e-08, 7.0781e-08, -2.9430e-07, 3.4645e-07, -6.0722e-07, 1.1921e-07], device='cuda:0') 100 0.0001 changing lr epoch 396, time 247.50, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.4554 re_mapping 0.0038 re_causal 0.0112 /// teacc 99.02 lr 0.00010000 Epoch 398, weight, value: tensor([[ 0.0205, -0.2016, -0.1934, ..., -0.3854, -0.1314, -0.1759], [ 0.0888, -0.0985, 0.0450, ..., 0.0472, 0.1421, -0.0653], [-0.1101, 0.1627, -0.2312, ..., 0.0590, 0.1028, -0.0418], ..., [-0.0799, -0.1030, -0.0853, ..., 0.0119, -0.2333, 0.1730], [ 0.0483, -0.0551, 0.1345, ..., 0.0043, -0.2719, -0.0405], [-0.2576, -0.1322, -0.1826, ..., -0.3242, 0.0879, -0.1098]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0617e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, 0.0000e+00, ..., -1.8626e-09, 1.1176e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 5.4017e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -2.0117e-07, 0.0000e+00]], device='cuda:0') Epoch 398, bias, value: tensor([ 0.0016, -0.0386, 0.0106, -0.0071, 0.0092, 0.0062, 0.0187, 0.0073, -0.0466, -0.0147], device='cuda:0'), grad: tensor([ 4.6194e-07, 2.0489e-08, 4.0978e-08, 2.0489e-08, 8.3819e-08, 7.4506e-09, -1.1176e-08, 1.8440e-07, 4.2841e-08, -8.5123e-07], device='cuda:0') 100 0.0001 changing lr epoch 397, time 247.44, cls_loss 0.0007 cls_loss_mapping 0.0010 cls_loss_causal 0.4848 re_mapping 0.0036 re_causal 0.0112 /// teacc 99.08 lr 0.00010000 Epoch 399, weight, value: tensor([[ 0.0204, -0.2017, -0.1954, ..., -0.3862, -0.1318, -0.1759], [ 0.0888, -0.0984, 0.0451, ..., 0.0472, 0.1422, -0.0655], [-0.1102, 0.1627, -0.2318, ..., 0.0589, 0.1028, -0.0417], ..., [-0.0800, -0.1030, -0.0854, ..., 0.0119, -0.2335, 0.1732], [ 0.0483, -0.0552, 0.1352, ..., 0.0046, -0.2721, -0.0405], [-0.2580, -0.1323, -0.1829, ..., -0.3254, 0.0880, -0.1099]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -0.0000e+00, ..., 1.6764e-08, -7.4506e-09, 1.8626e-09], [ 0.0000e+00, 1.8626e-09, 6.1467e-08, ..., 4.8056e-07, 0.0000e+00, 3.5390e-08], ..., [ 0.0000e+00, -0.0000e+00, -7.2643e-08, ..., -5.5507e-07, 3.7253e-09, -4.4703e-08], [ 0.0000e+00, -1.8626e-09, -1.8626e-09, ..., 9.3132e-09, 0.0000e+00, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 399, bias, value: tensor([ 0.0015, -0.0386, 0.0104, -0.0071, 0.0092, 0.0062, 0.0187, 0.0074, -0.0464, -0.0147], device='cuda:0'), grad: tensor([-2.2352e-08, 3.5390e-08, 1.1567e-06, 8.1956e-08, -5.3272e-07, 1.4901e-08, 3.7253e-09, -1.0859e-06, 4.4703e-08, 3.0175e-07], device='cuda:0') 100 0.0001 changing lr epoch 398, time 247.42, cls_loss 0.0008 cls_loss_mapping 0.0012 cls_loss_causal 0.4680 re_mapping 0.0037 re_causal 0.0107 /// teacc 99.06 lr 0.00010000 Epoch 400, weight, value: tensor([[ 0.0204, -0.2016, -0.1959, ..., -0.3869, -0.1325, -0.1778], [ 0.0888, -0.0984, 0.0451, ..., 0.0471, 0.1424, -0.0662], [-0.1102, 0.1628, -0.2326, ..., 0.0589, 0.1028, -0.0419], ..., [-0.0800, -0.1031, -0.0854, ..., 0.0120, -0.2336, 0.1741], [ 0.0483, -0.0553, 0.1354, ..., 0.0044, -0.2725, -0.0413], [-0.2581, -0.1324, -0.1830, ..., -0.3268, 0.0881, -0.1105]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., 1.1176e-08, 1.3039e-08, 0.0000e+00], [ 3.1665e-08, 0.0000e+00, -6.5193e-07, ..., -4.7684e-07, -5.5507e-07, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 2.7940e-08, ..., 3.1665e-08, 5.0291e-08, 0.0000e+00], ..., [ 2.4214e-08, 0.0000e+00, 2.7940e-08, ..., -7.6368e-08, 9.3132e-08, -0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.7497e-07, ..., 3.5018e-07, 4.0233e-07, 0.0000e+00], [ 5.5879e-09, 0.0000e+00, 9.3132e-09, ..., 3.9116e-08, 8.7544e-08, 0.0000e+00]], device='cuda:0') Epoch 400, bias, value: tensor([ 0.0010, -0.0387, 0.0104, -0.0070, 0.0089, 0.0064, 0.0184, 0.0076, -0.0470, -0.0147], device='cuda:0'), grad: tensor([ 9.8720e-08, -1.1809e-06, 1.6205e-07, 3.9674e-07, -5.4203e-07, 6.7055e-07, 1.5646e-07, -2.5332e-06, 1.6131e-06, 1.1455e-06], device='cuda:0') 100 0.0001 changing lr epoch 399, time 247.53, cls_loss 0.0009 cls_loss_mapping 0.0009 cls_loss_causal 0.4632 re_mapping 0.0039 re_causal 0.0115 /// teacc 99.05 lr 0.00001000 Epoch 401, weight, value: tensor([[ 0.0202, -0.2017, -0.1966, ..., -0.3876, -0.1330, -0.1780], [ 0.0886, -0.0985, 0.0440, ..., 0.0461, 0.1423, -0.0671], [-0.1109, 0.1629, -0.2332, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0794, -0.1032, -0.0850, ..., 0.0125, -0.2337, 0.1753], [ 0.0482, -0.0553, 0.1390, ..., 0.0082, -0.2701, -0.0413], [-0.2582, -0.1325, -0.1833, ..., -0.3275, 0.0881, -0.1104]], device='cuda:0'), grad: tensor([[-7.4506e-08, 0.0000e+00, 3.3528e-08, ..., 1.8626e-09, 1.7323e-07, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 9.3132e-09, 4.8429e-08, 5.5879e-09], [ 9.3132e-09, 0.0000e+00, 5.5879e-08, ..., 3.3528e-08, 3.7253e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 2.2352e-08, ..., -1.1176e-08, 7.4506e-09, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 3.5204e-07, ..., 1.8626e-09, 2.0117e-06, 0.0000e+00], [ 5.5879e-09, 0.0000e+00, 7.4506e-09, ..., 5.5879e-09, 1.1176e-08, 0.0000e+00]], device='cuda:0') Epoch 401, bias, value: tensor([ 0.0006, -0.0397, 0.0104, -0.0067, 0.0086, 0.0064, 0.0179, 0.0079, -0.0435, -0.0147], device='cuda:0'), grad: tensor([ 2.5705e-07, 1.6391e-07, 2.1048e-07, -2.0303e-07, 3.7253e-09, 1.4499e-05, -2.2262e-05, 0.0000e+00, 7.3314e-06, 2.9802e-08], device='cuda:0') 100 1e-05 changing lr epoch 400, time 247.49, cls_loss 0.0006 cls_loss_mapping 0.0006 cls_loss_causal 0.4681 re_mapping 0.0037 re_causal 0.0115 /// teacc 99.07 lr 0.00001000 Epoch 402, weight, value: tensor([[ 0.0202, -0.2017, -0.1967, ..., -0.3876, -0.1330, -0.1780], [ 0.0886, -0.0985, 0.0440, ..., 0.0461, 0.1423, -0.0672], [-0.1109, 0.1629, -0.2334, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0794, -0.1032, -0.0850, ..., 0.0125, -0.2337, 0.1753], [ 0.0482, -0.0553, 0.1390, ..., 0.0082, -0.2702, -0.0414], [-0.2583, -0.1325, -0.1833, ..., -0.3277, 0.0881, -0.1104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 7.4506e-08, -5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-09, ..., 9.3132e-09, 5.5879e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., -1.1176e-07, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.0489e-08, ..., -1.1176e-08, -0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 3.3528e-08, -3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 402, bias, value: tensor([ 0.0006, -0.0397, 0.0104, -0.0067, 0.0086, 0.0064, 0.0179, 0.0080, -0.0435, -0.0148], device='cuda:0'), grad: tensor([ 5.5879e-09, 5.6252e-07, 3.3528e-08, 7.4506e-09, 9.3132e-09, 4.6566e-08, -3.7253e-09, -8.2888e-07, -5.9605e-08, 2.2165e-07], device='cuda:0') 100 1e-05 changing lr epoch 401, time 247.57, cls_loss 0.0006 cls_loss_mapping 0.0006 cls_loss_causal 0.4178 re_mapping 0.0035 re_causal 0.0106 /// teacc 99.08 lr 0.00001000 Epoch 403, weight, value: tensor([[ 0.0202, -0.2017, -0.1967, ..., -0.3877, -0.1331, -0.1780], [ 0.0886, -0.0985, 0.0440, ..., 0.0461, 0.1423, -0.0672], [-0.1109, 0.1629, -0.2335, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0794, -0.1032, -0.0850, ..., 0.0126, -0.2337, 0.1753], [ 0.0482, -0.0553, 0.1390, ..., 0.0082, -0.2702, -0.0414], [-0.2583, -0.1325, -0.1834, ..., -0.3278, 0.0881, -0.1104]], device='cuda:0'), grad: tensor([[-1.3039e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 3.7253e-09, 0.0000e+00, -1.4156e-06, ..., -6.6869e-07, -1.1977e-06, 0.0000e+00], [ 3.7253e-09, 0.0000e+00, 5.5879e-09, ..., 3.7253e-09, 3.7253e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.3951e-06, ..., 6.5751e-07, 1.1828e-06, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 7.4506e-09, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 9.3132e-09, ..., 5.5879e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 403, bias, value: tensor([ 0.0006, -0.0397, 0.0103, -0.0067, 0.0086, 0.0064, 0.0179, 0.0080, -0.0435, -0.0147], device='cuda:0'), grad: tensor([-4.8429e-08, -2.9001e-06, 3.5390e-08, 9.3132e-09, 7.4506e-09, 2.0489e-08, -4.8429e-08, 2.9039e-06, 3.3528e-08, -1.8626e-08], device='cuda:0') 100 1e-05 changing lr epoch 402, time 247.68, cls_loss 0.0007 cls_loss_mapping 0.0006 cls_loss_causal 0.4467 re_mapping 0.0035 re_causal 0.0109 /// teacc 99.10 lr 0.00001000 Epoch 404, weight, value: tensor([[ 0.0202, -0.2017, -0.1968, ..., -0.3878, -0.1331, -0.1780], [ 0.0886, -0.0985, 0.0440, ..., 0.0461, 0.1424, -0.0672], [-0.1110, 0.1629, -0.2336, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0794, -0.1032, -0.0850, ..., 0.0126, -0.2338, 0.1754], [ 0.0482, -0.0553, 0.1390, ..., 0.0082, -0.2702, -0.0414], [-0.2583, -0.1325, -0.1834, ..., -0.3279, 0.0881, -0.1104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [-0.0000e+00, 0.0000e+00, 1.6764e-08, ..., 1.3039e-08, -4.4703e-08, -1.8626e-09], [ 0.0000e+00, -3.7253e-09, 7.4506e-09, ..., -1.8626e-09, -1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 3.3528e-08, ..., 2.4214e-08, 6.1467e-08, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, -7.8231e-08, ..., -4.6566e-08, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 1.8626e-09, -2.0489e-08, 0.0000e+00]], device='cuda:0') Epoch 404, bias, value: tensor([ 0.0006, -0.0397, 0.0103, -0.0067, 0.0086, 0.0064, 0.0179, 0.0080, -0.0435, -0.0147], device='cuda:0'), grad: tensor([ 5.5879e-09, 8.9407e-08, 0.0000e+00, 1.5832e-07, -3.7253e-08, -1.5832e-07, 3.5390e-08, 1.4342e-07, -1.9185e-07, -5.7742e-08], device='cuda:0') 100 1e-05 changing lr epoch 403, time 247.22, cls_loss 0.0007 cls_loss_mapping 0.0005 cls_loss_causal 0.4539 re_mapping 0.0034 re_causal 0.0106 /// teacc 99.11 lr 0.00001000 Epoch 405, weight, value: tensor([[ 0.0203, -0.2018, -0.1968, ..., -0.3879, -0.1331, -0.1780], [ 0.0886, -0.0984, 0.0440, ..., 0.0460, 0.1424, -0.0672], [-0.1110, 0.1629, -0.2337, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0794, -0.1032, -0.0850, ..., 0.0126, -0.2338, 0.1754], [ 0.0482, -0.0553, 0.1390, ..., 0.0082, -0.2703, -0.0414], [-0.2583, -0.1325, -0.1835, ..., -0.3280, 0.0881, -0.1104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.4506e-09, 1.3039e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -1.3039e-08, ..., 2.6077e-08, -1.3039e-08, 0.0000e+00], [ 0.0000e+00, -2.9802e-08, 1.8626e-09, ..., -4.6939e-07, -8.7544e-08, 0.0000e+00], ..., [ 0.0000e+00, 2.4214e-08, 1.1176e-08, ..., 4.0047e-07, 9.1270e-08, 0.0000e+00], [-3.7253e-09, 1.8626e-09, -1.6764e-08, ..., 2.9802e-08, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 405, bias, value: tensor([ 0.0006, -0.0397, 0.0103, -0.0067, 0.0086, 0.0064, 0.0179, 0.0080, -0.0435, -0.0147], device='cuda:0'), grad: tensor([ 7.4506e-08, 5.5879e-08, -1.2238e-06, 1.6764e-08, 2.4214e-08, -2.4214e-08, -3.5390e-08, 1.0747e-06, 7.4506e-08, -2.2352e-08], device='cuda:0') 100 1e-05 changing lr epoch 404, time 247.12, cls_loss 0.0006 cls_loss_mapping 0.0005 cls_loss_causal 0.4462 re_mapping 0.0034 re_causal 0.0108 /// teacc 99.13 lr 0.00001000 Epoch 406, weight, value: tensor([[ 0.0203, -0.2018, -0.1969, ..., -0.3879, -0.1332, -0.1780], [ 0.0886, -0.0984, 0.0440, ..., 0.0461, 0.1424, -0.0673], [-0.1110, 0.1629, -0.2337, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0793, -0.1033, -0.0850, ..., 0.0126, -0.2339, 0.1755], [ 0.0481, -0.0553, 0.1390, ..., 0.0082, -0.2703, -0.0414], [-0.2583, -0.1326, -0.1835, ..., -0.3280, 0.0881, -0.1104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., 0.0000e+00, 2.6077e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.4214e-08, ..., -1.8626e-09, -3.9116e-08, -0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 1.1176e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 1.8626e-09, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 406, bias, value: tensor([ 0.0006, -0.0397, 0.0103, -0.0067, 0.0086, 0.0064, 0.0179, 0.0080, -0.0435, -0.0147], device='cuda:0'), grad: tensor([ 8.0094e-08, -1.1176e-07, 3.5390e-08, 3.1665e-08, 1.4901e-08, -5.9605e-08, -4.6566e-08, 1.1176e-08, 3.5390e-08, 1.8626e-09], device='cuda:0') 100 1e-05 changing lr epoch 405, time 246.96, cls_loss 0.0006 cls_loss_mapping 0.0005 cls_loss_causal 0.4351 re_mapping 0.0033 re_causal 0.0107 /// teacc 99.14 lr 0.00001000 Epoch 407, weight, value: tensor([[ 0.0203, -0.2018, -0.1969, ..., -0.3880, -0.1332, -0.1780], [ 0.0886, -0.0984, 0.0441, ..., 0.0461, 0.1424, -0.0673], [-0.1110, 0.1629, -0.2338, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0793, -0.1033, -0.0850, ..., 0.0126, -0.2339, 0.1755], [ 0.0481, -0.0553, 0.1390, ..., 0.0082, -0.2703, -0.0415], [-0.2584, -0.1326, -0.1835, ..., -0.3281, 0.0881, -0.1104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, -1.8626e-09, ..., -1.8626e-09, -0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], ..., [ 1.8626e-09, 0.0000e+00, 1.8626e-09, ..., -3.7253e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 7.4506e-09, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 407, bias, value: tensor([ 0.0006, -0.0397, 0.0103, -0.0067, 0.0086, 0.0064, 0.0179, 0.0080, -0.0435, -0.0147], device='cuda:0'), grad: tensor([-4.6566e-08, 9.3132e-09, 1.6764e-08, -1.1176e-08, -8.0094e-08, 2.2352e-08, -5.5879e-09, 2.6077e-08, 2.9802e-08, 3.1665e-08], device='cuda:0') 100 1e-05 changing lr epoch 406, time 247.48, cls_loss 0.0006 cls_loss_mapping 0.0006 cls_loss_causal 0.4322 re_mapping 0.0032 re_causal 0.0104 /// teacc 99.12 lr 0.00001000 Epoch 408, weight, value: tensor([[ 0.0203, -0.2018, -0.1969, ..., -0.3880, -0.1332, -0.1780], [ 0.0886, -0.0984, 0.0441, ..., 0.0461, 0.1425, -0.0673], [-0.1110, 0.1629, -0.2338, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0793, -0.1033, -0.0850, ..., 0.0126, -0.2339, 0.1756], [ 0.0481, -0.0553, 0.1390, ..., 0.0082, -0.2704, -0.0415], [-0.2584, -0.1326, -0.1835, ..., -0.3282, 0.0882, -0.1104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 5.5879e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, 0.0000e+00, ..., -5.5879e-09, -2.4214e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., -7.4506e-09, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 5.5879e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, -1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 408, bias, value: tensor([ 0.0006, -0.0397, 0.0103, -0.0067, 0.0084, 0.0064, 0.0179, 0.0080, -0.0436, -0.0146], device='cuda:0'), grad: tensor([-6.7055e-07, 2.9802e-08, -3.5390e-08, -1.9446e-06, 3.7253e-08, 2.1625e-06, 3.7439e-07, -2.0489e-08, 6.3330e-08, 5.5879e-09], device='cuda:0') 100 1e-05 changing lr epoch 407, time 247.60, cls_loss 0.0006 cls_loss_mapping 0.0005 cls_loss_causal 0.4629 re_mapping 0.0033 re_causal 0.0108 /// teacc 99.13 lr 0.00001000 Epoch 409, weight, value: tensor([[ 0.0204, -0.2018, -0.1970, ..., -0.3881, -0.1332, -0.1780], [ 0.0886, -0.0984, 0.0441, ..., 0.0461, 0.1425, -0.0674], [-0.1110, 0.1629, -0.2340, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0793, -0.1033, -0.0850, ..., 0.0126, -0.2340, 0.1756], [ 0.0481, -0.0553, 0.1391, ..., 0.0082, -0.2704, -0.0414], [-0.2584, -0.1326, -0.1836, ..., -0.3284, 0.0882, -0.1104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.1793e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3039e-08, ..., 9.3132e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, -4.0978e-08, 0.0000e+00, ..., -5.4017e-08, -4.6566e-08, 0.0000e+00], ..., [ 0.0000e+00, 3.5390e-08, 0.0000e+00, ..., 4.6566e-08, 4.0978e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.4901e-08, ..., -1.1176e-08, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.3039e-08, 0.0000e+00]], device='cuda:0') Epoch 409, bias, value: tensor([ 0.0006, -0.0397, 0.0103, -0.0067, 0.0084, 0.0064, 0.0179, 0.0080, -0.0436, -0.0146], device='cuda:0'), grad: tensor([ 8.6613e-07, 4.6566e-08, -1.2293e-07, 1.6764e-08, 5.7742e-08, 8.3819e-08, -9.6112e-07, 1.2293e-07, 3.7253e-08, -1.5087e-07], device='cuda:0') 100 1e-05 changing lr epoch 408, time 247.94, cls_loss 0.0006 cls_loss_mapping 0.0005 cls_loss_causal 0.4565 re_mapping 0.0032 re_causal 0.0106 /// teacc 99.11 lr 0.00001000 Epoch 410, weight, value: tensor([[ 0.0204, -0.2018, -0.1970, ..., -0.3882, -0.1333, -0.1780], [ 0.0886, -0.0984, 0.0440, ..., 0.0460, 0.1425, -0.0675], [-0.1110, 0.1629, -0.2340, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0793, -0.1033, -0.0849, ..., 0.0126, -0.2340, 0.1757], [ 0.0481, -0.0553, 0.1390, ..., 0.0082, -0.2704, -0.0414], [-0.2584, -0.1326, -0.1836, ..., -0.3284, 0.0883, -0.1104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -3.7253e-09, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -7.4506e-09, ..., 2.2352e-08, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, -2.0489e-08, -5.5879e-09, ..., -9.6858e-08, -3.1665e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.6764e-08, 9.3132e-09, ..., 6.1467e-08, 3.3528e-08, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, -0.0000e+00, ..., -0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, -1.5460e-07, 0.0000e+00]], device='cuda:0') Epoch 410, bias, value: tensor([ 0.0006, -0.0398, 0.0104, -0.0067, 0.0084, 0.0064, 0.0179, 0.0081, -0.0436, -0.0146], device='cuda:0'), grad: tensor([-1.3039e-08, 1.0245e-07, -2.0303e-07, 1.1176e-08, 3.3528e-07, 8.0094e-08, -1.8626e-08, 1.0431e-07, 9.3132e-09, -4.3958e-07], device='cuda:0') 100 1e-05 changing lr epoch 409, time 247.42, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4551 re_mapping 0.0031 re_causal 0.0103 /// teacc 99.13 lr 0.00001000 Epoch 411, weight, value: tensor([[ 0.0204, -0.2019, -0.1971, ..., -0.3883, -0.1333, -0.1780], [ 0.0886, -0.0984, 0.0440, ..., 0.0460, 0.1425, -0.0675], [-0.1111, 0.1629, -0.2340, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0793, -0.1033, -0.0850, ..., 0.0126, -0.2340, 0.1757], [ 0.0481, -0.0553, 0.1391, ..., 0.0082, -0.2704, -0.0414], [-0.2584, -0.1326, -0.1836, ..., -0.3285, 0.0883, -0.1104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 3.7253e-09, ..., 5.5879e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 1.1176e-08, 7.4506e-09, ..., 5.9605e-08, 1.6764e-08, 0.0000e+00], [ 0.0000e+00, -7.8231e-08, 2.0489e-08, ..., -1.1176e-07, -9.6858e-08, 0.0000e+00], ..., [ 0.0000e+00, 5.0291e-08, 3.7253e-09, ..., 1.8626e-08, 6.8918e-08, 0.0000e+00], [ 0.0000e+00, 1.1176e-08, 1.8626e-09, ..., 4.2841e-08, 1.4901e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.3039e-08, -8.5309e-07, 0.0000e+00]], device='cuda:0') Epoch 411, bias, value: tensor([ 0.0006, -0.0398, 0.0104, -0.0067, 0.0084, 0.0064, 0.0179, 0.0081, -0.0436, -0.0146], device='cuda:0'), grad: tensor([ 2.2352e-08, 2.0117e-07, -3.4831e-07, 6.8918e-07, 1.8626e-09, 2.8275e-06, -9.3132e-09, 6.7055e-08, 2.9802e-07, -3.7700e-06], device='cuda:0') 100 1e-05 changing lr epoch 410, time 247.35, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4413 re_mapping 0.0031 re_causal 0.0104 /// teacc 99.15 lr 0.00001000 Epoch 412, weight, value: tensor([[ 0.0204, -0.2019, -0.1971, ..., -0.3883, -0.1333, -0.1780], [ 0.0886, -0.0984, 0.0441, ..., 0.0460, 0.1425, -0.0675], [-0.1111, 0.1630, -0.2341, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0793, -0.1033, -0.0850, ..., 0.0126, -0.2340, 0.1758], [ 0.0481, -0.0553, 0.1391, ..., 0.0082, -0.2705, -0.0415], [-0.2584, -0.1326, -0.1837, ..., -0.3286, 0.0883, -0.1104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -9.3132e-09, ..., -5.5879e-09, -1.1176e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 9.3132e-09, ..., 3.7253e-09, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., -1.8626e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -0.0000e+00, ..., 0.0000e+00, -5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 412, bias, value: tensor([ 0.0006, -0.0398, 0.0104, -0.0067, 0.0084, 0.0064, 0.0179, 0.0081, -0.0436, -0.0146], device='cuda:0'), grad: tensor([ 7.4506e-09, -2.4214e-08, 0.0000e+00, 0.0000e+00, 9.3132e-09, 1.1176e-08, -2.7940e-08, 2.0489e-08, 2.0489e-08, -2.2352e-08], device='cuda:0') 100 1e-05 changing lr epoch 411, time 246.97, cls_loss 0.0006 cls_loss_mapping 0.0005 cls_loss_causal 0.4405 re_mapping 0.0031 re_causal 0.0103 /// teacc 99.16 lr 0.00001000 Epoch 413, weight, value: tensor([[ 0.0205, -0.2019, -0.1971, ..., -0.3884, -0.1334, -0.1780], [ 0.0886, -0.0984, 0.0441, ..., 0.0460, 0.1426, -0.0675], [-0.1111, 0.1630, -0.2341, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0793, -0.1033, -0.0850, ..., 0.0127, -0.2341, 0.1759], [ 0.0481, -0.0553, 0.1391, ..., 0.0082, -0.2705, -0.0414], [-0.2585, -0.1326, -0.1837, ..., -0.3287, 0.0883, -0.1104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 1.1176e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.6950e-07, ..., -1.4529e-07, -1.6205e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., -0.0000e+00, -1.1176e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.6205e-07, ..., 1.3784e-07, 1.4342e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.4901e-08, ..., -7.4506e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.1176e-08, ..., 7.4506e-09, 3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 413, bias, value: tensor([ 0.0006, -0.0398, 0.0104, -0.0067, 0.0083, 0.0064, 0.0179, 0.0081, -0.0436, -0.0145], device='cuda:0'), grad: tensor([ 3.3528e-08, -5.0478e-07, 1.8626e-08, 4.8429e-08, -8.7544e-08, -4.6566e-08, 2.7940e-08, 4.8243e-07, -9.3132e-09, 3.9116e-08], device='cuda:0') 100 1e-05 changing lr epoch 412, time 247.43, cls_loss 0.0007 cls_loss_mapping 0.0005 cls_loss_causal 0.4170 re_mapping 0.0030 re_causal 0.0101 /// teacc 99.12 lr 0.00001000 Epoch 414, weight, value: tensor([[ 0.0206, -0.2019, -0.1972, ..., -0.3884, -0.1334, -0.1780], [ 0.0886, -0.0984, 0.0440, ..., 0.0460, 0.1426, -0.0676], [-0.1111, 0.1630, -0.2343, ..., 0.0589, 0.1028, -0.0419], ..., [-0.0793, -0.1033, -0.0849, ..., 0.0127, -0.2341, 0.1759], [ 0.0481, -0.0553, 0.1391, ..., 0.0082, -0.2706, -0.0414], [-0.2585, -0.1326, -0.1837, ..., -0.3288, 0.0883, -0.1104]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 6.3330e-08, ..., 1.1176e-08, 5.2154e-08, 0.0000e+00], [ 0.0000e+00, 5.5879e-09, -1.1362e-07, ..., 1.0803e-07, -7.8231e-08, 0.0000e+00], [ 0.0000e+00, -7.4506e-09, 1.8626e-09, ..., 2.2352e-08, -1.6764e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 7.4506e-09, ..., -1.7136e-07, 9.3132e-09, 0.0000e+00], [-1.8626e-09, 0.0000e+00, 3.1665e-08, ..., 7.4506e-09, 2.6077e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 414, bias, value: tensor([ 0.0006, -0.0398, 0.0104, -0.0067, 0.0083, 0.0064, 0.0179, 0.0081, -0.0436, -0.0145], device='cuda:0'), grad: tensor([ 2.4773e-07, -1.1176e-07, 8.0094e-08, 4.2841e-08, -3.7253e-09, 4.8429e-08, 1.6764e-08, -4.6939e-07, 1.1362e-07, 2.6077e-08], device='cuda:0') 100 1e-05 changing lr epoch 413, time 246.72, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4447 re_mapping 0.0030 re_causal 0.0102 /// teacc 99.15 lr 0.00001000 Epoch 415, weight, value: tensor([[ 0.0207, -0.2020, -0.1972, ..., -0.3885, -0.1334, -0.1781], [ 0.0886, -0.0985, 0.0440, ..., 0.0459, 0.1426, -0.0677], [-0.1112, 0.1630, -0.2343, ..., 0.0589, 0.1029, -0.0419], ..., [-0.0792, -0.1034, -0.0849, ..., 0.0127, -0.2341, 0.1760], [ 0.0481, -0.0553, 0.1391, ..., 0.0082, -0.2706, -0.0414], [-0.2586, -0.1326, -0.1837, ..., -0.3289, 0.0883, -0.1105]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -0.0000e+00, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 2.4214e-08, 2.6077e-08, 0.0000e+00], [ 0.0000e+00, -1.1176e-08, 3.7253e-09, ..., -1.8999e-07, -2.3469e-07, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 3.7253e-09, ..., 7.0781e-08, 9.1270e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -3.7253e-09, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.6077e-08, 3.3528e-08, 0.0000e+00]], device='cuda:0') Epoch 415, bias, value: tensor([ 0.0006, -0.0398, 0.0104, -0.0067, 0.0083, 0.0064, 0.0179, 0.0081, -0.0437, -0.0145], device='cuda:0'), grad: tensor([-7.0781e-08, 7.2643e-08, -5.7369e-07, 1.8626e-07, 2.2352e-08, 1.1176e-08, 1.4901e-08, 2.2724e-07, 5.5879e-09, 1.0803e-07], device='cuda:0') 100 1e-05 changing lr epoch 414, time 247.38, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4374 re_mapping 0.0030 re_causal 0.0103 /// teacc 99.17 lr 0.00001000 Epoch 416, weight, value: tensor([[ 0.0207, -0.2020, -0.1972, ..., -0.3886, -0.1334, -0.1781], [ 0.0886, -0.0985, 0.0440, ..., 0.0459, 0.1426, -0.0678], [-0.1112, 0.1630, -0.2344, ..., 0.0589, 0.1029, -0.0419], ..., [-0.0792, -0.1034, -0.0849, ..., 0.0128, -0.2342, 0.1761], [ 0.0481, -0.0553, 0.1391, ..., 0.0082, -0.2706, -0.0415], [-0.2586, -0.1327, -0.1838, ..., -0.3290, 0.0884, -0.1105]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 4.6566e-09, 0.0000e+00], [ 9.3132e-10, 2.7940e-09, -5.5879e-09, ..., 1.1176e-08, -0.0000e+00, -2.7940e-09], [ 9.3132e-10, -7.5437e-08, 8.3819e-09, ..., -1.0431e-07, -1.4715e-07, 0.0000e+00], ..., [ 2.7940e-09, 7.1712e-08, 5.5879e-09, ..., 8.8476e-08, 1.3690e-07, 1.8626e-09], [ 2.7940e-09, 0.0000e+00, -9.3132e-10, ..., -9.3132e-10, 5.5879e-09, 0.0000e+00], [ 6.5193e-09, 0.0000e+00, 3.7253e-09, ..., 1.8626e-09, 5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 416, bias, value: tensor([ 0.0006, -0.0399, 0.0104, -0.0067, 0.0082, 0.0064, 0.0179, 0.0082, -0.0437, -0.0145], device='cuda:0'), grad: tensor([ 1.9558e-08, 4.1910e-08, -2.9616e-07, -9.3132e-09, -6.4261e-08, 2.8871e-08, -1.9558e-08, 2.4680e-07, 9.3132e-09, 4.0978e-08], device='cuda:0') 100 1e-05 changing lr epoch 415, time 247.19, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4373 re_mapping 0.0030 re_causal 0.0100 /// teacc 99.15 lr 0.00001000 Epoch 417, weight, value: tensor([[ 0.0207, -0.2020, -0.1973, ..., -0.3887, -0.1335, -0.1781], [ 0.0886, -0.0985, 0.0440, ..., 0.0459, 0.1426, -0.0678], [-0.1112, 0.1631, -0.2344, ..., 0.0589, 0.1029, -0.0419], ..., [-0.0792, -0.1034, -0.0849, ..., 0.0128, -0.2342, 0.1762], [ 0.0480, -0.0553, 0.1391, ..., 0.0082, -0.2707, -0.0415], [-0.2586, -0.1327, -0.1838, ..., -0.3291, 0.0884, -0.1105]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 4.6566e-09, ..., 9.3132e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 3.6322e-08, -1.3970e-08, ..., 2.0582e-07, -1.9558e-08, 0.0000e+00], [ 0.0000e+00, 1.2107e-08, 4.6566e-09, ..., 6.9849e-08, 1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, -5.4017e-08, 4.6566e-09, ..., -3.1106e-07, 6.5193e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -9.3132e-09, ..., -6.5193e-09, -1.8626e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 5.5879e-09, ..., 1.2107e-08, 4.6566e-09, 0.0000e+00]], device='cuda:0') Epoch 417, bias, value: tensor([ 0.0006, -0.0399, 0.0104, -0.0067, 0.0082, 0.0064, 0.0179, 0.0082, -0.0437, -0.0145], device='cuda:0'), grad: tensor([ 3.8184e-08, 5.2527e-07, 1.9558e-07, 4.5635e-08, -9.3132e-10, 3.7253e-09, 5.5879e-09, -8.3819e-07, -2.0489e-08, 4.0978e-08], device='cuda:0') 100 1e-05 changing lr epoch 416, time 247.22, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4464 re_mapping 0.0029 re_causal 0.0102 /// teacc 99.15 lr 0.00001000 Epoch 418, weight, value: tensor([[ 0.0207, -0.2021, -0.1973, ..., -0.3889, -0.1335, -0.1781], [ 0.0886, -0.0985, 0.0440, ..., 0.0459, 0.1427, -0.0678], [-0.1112, 0.1631, -0.2345, ..., 0.0589, 0.1029, -0.0419], ..., [-0.0792, -0.1034, -0.0849, ..., 0.0128, -0.2342, 0.1762], [ 0.0480, -0.0554, 0.1391, ..., 0.0082, -0.2707, -0.0415], [-0.2586, -0.1327, -0.1839, ..., -0.3292, 0.0884, -0.1105]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 9.3132e-10, ..., 9.3132e-10, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, -7.4506e-09, ..., -3.7253e-09, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, -3.7253e-09, 1.8626e-09, ..., 0.0000e+00, -9.3132e-10, 0.0000e+00], ..., [-9.3132e-10, 1.8626e-09, 3.7253e-09, ..., 1.8626e-09, 1.2107e-08, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 1.8626e-09, ..., 9.3132e-10, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -0.0000e+00, ..., 9.3132e-10, -3.5390e-08, 0.0000e+00]], device='cuda:0') Epoch 418, bias, value: tensor([ 0.0006, -0.0399, 0.0104, -0.0067, 0.0083, 0.0064, 0.0180, 0.0082, -0.0438, -0.0145], device='cuda:0'), grad: tensor([ 1.1176e-08, 3.5390e-08, -9.3132e-10, -1.8626e-09, 1.2107e-08, 9.3132e-10, 1.8626e-08, 4.0978e-08, 7.0781e-08, -1.8068e-07], device='cuda:0') 100 1e-05 changing lr epoch 417, time 247.08, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4177 re_mapping 0.0030 re_causal 0.0100 /// teacc 99.20 lr 0.00001000 Epoch 419, weight, value: tensor([[ 0.0207, -0.2021, -0.1974, ..., -0.3889, -0.1336, -0.1781], [ 0.0886, -0.0984, 0.0440, ..., 0.0459, 0.1427, -0.0679], [-0.1112, 0.1631, -0.2346, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0792, -0.1034, -0.0849, ..., 0.0128, -0.2343, 0.1762], [ 0.0480, -0.0554, 0.1391, ..., 0.0082, -0.2708, -0.0415], [-0.2586, -0.1327, -0.1839, ..., -0.3293, 0.0884, -0.1105]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.9558e-08, ..., 4.6566e-09, 2.7940e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -4.6100e-07, ..., -7.2643e-08, -6.1188e-07, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 3.7253e-09, ..., -2.7940e-09, 2.7940e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 1.3039e-08, 4.0978e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-08, ..., 9.3132e-09, 6.6124e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3039e-08, ..., 1.0245e-08, 4.8429e-08, 0.0000e+00]], device='cuda:0') Epoch 419, bias, value: tensor([ 0.0005, -0.0399, 0.0104, -0.0067, 0.0082, 0.0064, 0.0180, 0.0082, -0.0438, -0.0145], device='cuda:0'), grad: tensor([ 5.7742e-08, -1.2070e-06, 4.6566e-09, 8.3819e-09, -2.6077e-07, 1.2852e-07, 9.0338e-07, 1.1083e-07, 1.3597e-07, 1.2852e-07], device='cuda:0') 100 1e-05 changing lr epoch 418, time 247.71, cls_loss 0.0006 cls_loss_mapping 0.0003 cls_loss_causal 0.4346 re_mapping 0.0029 re_causal 0.0101 /// teacc 99.18 lr 0.00001000 Epoch 420, weight, value: tensor([[ 0.0207, -0.2021, -0.1974, ..., -0.3890, -0.1336, -0.1781], [ 0.0886, -0.0984, 0.0440, ..., 0.0459, 0.1428, -0.0679], [-0.1113, 0.1631, -0.2348, ..., 0.0589, 0.1028, -0.0420], ..., [-0.0792, -0.1035, -0.0848, ..., 0.0129, -0.2344, 0.1763], [ 0.0480, -0.0554, 0.1391, ..., 0.0082, -0.2708, -0.0415], [-0.2586, -0.1327, -0.1840, ..., -0.3294, 0.0884, -0.1105]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 9.3132e-09, 0.0000e+00, ..., 5.0291e-08, 1.9558e-08, 0.0000e+00], [ 0.0000e+00, -1.6764e-08, 2.7940e-09, ..., -2.3283e-08, -3.5390e-08, 0.0000e+00], ..., [ 0.0000e+00, 3.7253e-09, 2.7940e-09, ..., -4.6566e-08, 1.1176e-08, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 3.7253e-09, ..., 2.7940e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 1.8626e-09, ..., 1.8626e-08, 4.0047e-08, 0.0000e+00]], device='cuda:0') Epoch 420, bias, value: tensor([ 0.0005, -0.0399, 0.0104, -0.0067, 0.0082, 0.0064, 0.0180, 0.0083, -0.0438, -0.0145], device='cuda:0'), grad: tensor([ 4.6566e-09, 1.4901e-07, -7.5437e-08, -2.7940e-08, -1.0338e-07, 4.6566e-09, 1.8626e-09, -1.8999e-07, 1.7695e-08, 2.2259e-07], device='cuda:0') 100 1e-05 changing lr epoch 419, time 248.04, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4358 re_mapping 0.0029 re_causal 0.0101 /// teacc 99.18 lr 0.00001000 Epoch 421, weight, value: tensor([[ 0.0207, -0.2021, -0.1974, ..., -0.3890, -0.1336, -0.1781], [ 0.0885, -0.0984, 0.0440, ..., 0.0459, 0.1428, -0.0679], [-0.1113, 0.1631, -0.2349, ..., 0.0588, 0.1028, -0.0419], ..., [-0.0792, -0.1035, -0.0848, ..., 0.0129, -0.2344, 0.1763], [ 0.0480, -0.0554, 0.1391, ..., 0.0082, -0.2709, -0.0416], [-0.2587, -0.1328, -0.1840, ..., -0.3295, 0.0884, -0.1105]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 8.3819e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-08, 8.3819e-09, 1.8626e-09], [ 0.0000e+00, -9.3132e-10, 0.0000e+00, ..., -9.3132e-10, -9.3132e-10, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-10, -3.7253e-09, ..., -5.6811e-08, 2.7940e-09, -5.5879e-09], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 1.9558e-08, 8.3819e-09, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 2.7940e-09, 5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 421, bias, value: tensor([ 0.0005, -0.0399, 0.0104, -0.0067, 0.0082, 0.0064, 0.0180, 0.0083, -0.0438, -0.0145], device='cuda:0'), grad: tensor([ 1.6764e-08, 5.9605e-08, 3.7253e-09, 4.0047e-08, -1.1176e-08, -4.6566e-09, -5.7742e-08, -1.2759e-07, 6.3330e-08, 2.6077e-08], device='cuda:0') 100 1e-05 changing lr epoch 420, time 247.52, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4156 re_mapping 0.0029 re_causal 0.0099 /// teacc 99.19 lr 0.00001000 Epoch 422, weight, value: tensor([[ 0.0207, -0.2021, -0.1975, ..., -0.3891, -0.1336, -0.1781], [ 0.0885, -0.0984, 0.0440, ..., 0.0459, 0.1429, -0.0680], [-0.1113, 0.1631, -0.2350, ..., 0.0589, 0.1028, -0.0419], ..., [-0.0792, -0.1035, -0.0848, ..., 0.0129, -0.2344, 0.1764], [ 0.0480, -0.0554, 0.1391, ..., 0.0082, -0.2709, -0.0416], [-0.2587, -0.1328, -0.1840, ..., -0.3296, 0.0884, -0.1105]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 2.9802e-08, ..., 0.0000e+00, 9.4995e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.2107e-08, ..., 6.5193e-09, 3.4459e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-09, ..., -8.3819e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.0699e-07, ..., 9.3132e-10, 1.1958e-06, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 4.6566e-09, 9.3132e-09, 0.0000e+00]], device='cuda:0') Epoch 422, bias, value: tensor([ 0.0005, -0.0399, 0.0104, -0.0067, 0.0083, 0.0064, 0.0180, 0.0083, -0.0439, -0.0145], device='cuda:0'), grad: tensor([ 3.6974e-07, 1.4435e-07, 8.3819e-09, 9.3132e-10, 1.0245e-08, 2.8051e-06, -7.8231e-06, -2.7008e-08, 4.4629e-06, 5.2154e-08], device='cuda:0') 100 1e-05 changing lr epoch 421, time 247.34, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4317 re_mapping 0.0029 re_causal 0.0100 /// teacc 99.19 lr 0.00001000 Epoch 423, weight, value: tensor([[ 0.0207, -0.2022, -0.1975, ..., -0.3893, -0.1337, -0.1781], [ 0.0885, -0.0984, 0.0440, ..., 0.0458, 0.1428, -0.0681], [-0.1113, 0.1632, -0.2350, ..., 0.0589, 0.1029, -0.0419], ..., [-0.0792, -0.1035, -0.0848, ..., 0.0129, -0.2344, 0.1765], [ 0.0480, -0.0554, 0.1391, ..., 0.0082, -0.2710, -0.0416], [-0.2587, -0.1328, -0.1841, ..., -0.3297, 0.0884, -0.1105]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 2.7940e-09, ..., 0.0000e+00, 6.5193e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -7.3574e-08, ..., -9.3132e-10, -9.9652e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 9.3132e-10, 9.3132e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 9.3132e-10, 1.3970e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.1420e-08, ..., 0.0000e+00, 5.0291e-08, 0.0000e+00]], device='cuda:0') Epoch 423, bias, value: tensor([ 0.0005, -0.0399, 0.0104, -0.0067, 0.0082, 0.0064, 0.0180, 0.0083, -0.0439, -0.0145], device='cuda:0'), grad: tensor([-4.6566e-09, -1.6484e-07, 2.2352e-08, -7.4506e-09, -7.6368e-08, 2.3283e-08, 1.8626e-09, 4.4703e-08, 1.2107e-08, 1.5460e-07], device='cuda:0') 100 1e-05 changing lr epoch 422, time 247.54, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4206 re_mapping 0.0029 re_causal 0.0101 /// teacc 99.18 lr 0.00001000 Epoch 424, weight, value: tensor([[ 0.0207, -0.2023, -0.1975, ..., -0.3894, -0.1337, -0.1781], [ 0.0885, -0.0984, 0.0440, ..., 0.0458, 0.1428, -0.0682], [-0.1113, 0.1632, -0.2351, ..., 0.0589, 0.1029, -0.0419], ..., [-0.0792, -0.1035, -0.0848, ..., 0.0130, -0.2344, 0.1766], [ 0.0479, -0.0555, 0.1391, ..., 0.0081, -0.2710, -0.0416], [-0.2587, -0.1328, -0.1841, ..., -0.3298, 0.0884, -0.1105]], device='cuda:0'), grad: tensor([[-1.9558e-08, 0.0000e+00, -9.3132e-10, ..., 9.3132e-10, 9.3132e-09, 9.3132e-10], [ 1.1176e-08, 0.0000e+00, -5.5879e-09, ..., -6.5193e-09, -1.5832e-08, -1.1176e-08], [ 0.0000e+00, -9.3132e-10, 1.8626e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], ..., [ 2.7940e-09, 0.0000e+00, 2.7940e-09, ..., -9.3132e-10, 1.8626e-09, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, -1.5832e-08, ..., -1.8626e-09, 1.0245e-08, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 1.0245e-08, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 424, bias, value: tensor([ 0.0005, -0.0400, 0.0104, -0.0067, 0.0083, 0.0064, 0.0180, 0.0083, -0.0439, -0.0146], device='cuda:0'), grad: tensor([-9.5926e-08, 5.2154e-08, 9.3132e-09, 8.3819e-09, 1.6764e-08, 2.6077e-08, -8.5682e-08, 2.1420e-08, 7.4506e-09, 4.5635e-08], device='cuda:0') 100 1e-05 changing lr epoch 423, time 247.56, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4277 re_mapping 0.0029 re_causal 0.0102 /// teacc 99.16 lr 0.00001000 Epoch 425, weight, value: tensor([[ 0.0207, -0.2023, -0.1976, ..., -0.3895, -0.1338, -0.1781], [ 0.0885, -0.0984, 0.0440, ..., 0.0458, 0.1429, -0.0682], [-0.1113, 0.1632, -0.2351, ..., 0.0589, 0.1029, -0.0419], ..., [-0.0792, -0.1036, -0.0848, ..., 0.0130, -0.2345, 0.1766], [ 0.0479, -0.0555, 0.1391, ..., 0.0082, -0.2711, -0.0416], [-0.2587, -0.1329, -0.1842, ..., -0.3299, 0.0884, -0.1106]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 5.5879e-09, -4.6566e-09, ..., 2.6077e-08, 8.3819e-09, 9.3132e-10], [ 0.0000e+00, -1.3039e-08, 9.3132e-10, ..., -3.3528e-08, -3.0734e-08, 0.0000e+00], ..., [ 0.0000e+00, 6.5193e-09, -1.1176e-08, ..., -3.1572e-07, 2.0489e-08, -2.7940e-09], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.3970e-08, 9.3132e-10, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, -6.5193e-09, 0.0000e+00]], device='cuda:0') Epoch 425, bias, value: tensor([ 0.0005, -0.0400, 0.0104, -0.0067, 0.0083, 0.0064, 0.0180, 0.0083, -0.0439, -0.0146], device='cuda:0'), grad: tensor([ 2.7940e-09, 5.9605e-08, -8.2888e-08, 7.5996e-07, 6.5193e-09, 2.7940e-09, 3.7253e-09, -7.8790e-07, 3.4459e-08, -1.8626e-09], device='cuda:0') 100 1e-05 changing lr epoch 424, time 247.31, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4075 re_mapping 0.0029 re_causal 0.0099 /// teacc 99.15 lr 0.00001000 Epoch 426, weight, value: tensor([[ 0.0207, -0.2023, -0.1976, ..., -0.3895, -0.1338, -0.1781], [ 0.0885, -0.0984, 0.0439, ..., 0.0457, 0.1429, -0.0682], [-0.1113, 0.1632, -0.2352, ..., 0.0589, 0.1029, -0.0418], ..., [-0.0792, -0.1036, -0.0848, ..., 0.0130, -0.2345, 0.1766], [ 0.0479, -0.0555, 0.1391, ..., 0.0082, -0.2711, -0.0416], [-0.2587, -0.1329, -0.1842, ..., -0.3300, 0.0884, -0.1106]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, -5.8673e-08, -3.1758e-07, ..., -2.4121e-07, -3.8184e-07, 0.0000e+00], [ 0.0000e+00, 1.0245e-08, 9.2201e-08, ..., 7.8231e-08, 8.1956e-08, 0.0000e+00], ..., [ 0.0000e+00, 4.8429e-08, 2.4773e-07, ..., 1.7695e-07, 2.9802e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -5.1223e-08, ..., -3.9116e-08, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 6.5193e-09, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 426, bias, value: tensor([ 0.0005, -0.0400, 0.0104, -0.0067, 0.0083, 0.0064, 0.0180, 0.0084, -0.0440, -0.0146], device='cuda:0'), grad: tensor([ 7.4506e-09, -5.2620e-07, 2.3656e-07, 1.3039e-08, 8.3819e-09, 1.4435e-07, -6.7987e-08, 3.7160e-07, -1.9744e-07, 2.3283e-08], device='cuda:0') 100 1e-05 changing lr epoch 425, time 247.42, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4208 re_mapping 0.0029 re_causal 0.0101 /// teacc 99.16 lr 0.00001000 Epoch 427, weight, value: tensor([[ 0.0207, -0.2023, -0.1976, ..., -0.3896, -0.1339, -0.1781], [ 0.0885, -0.0984, 0.0439, ..., 0.0457, 0.1429, -0.0682], [-0.1113, 0.1632, -0.2352, ..., 0.0589, 0.1029, -0.0418], ..., [-0.0792, -0.1036, -0.0847, ..., 0.0130, -0.2345, 0.1766], [ 0.0479, -0.0555, 0.1392, ..., 0.0082, -0.2712, -0.0415], [-0.2587, -0.1329, -0.1843, ..., -0.3302, 0.0884, -0.1106]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -9.3132e-10, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., -4.6566e-09, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.7008e-08, ..., 1.8626e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.4901e-08, 0.0000e+00]], device='cuda:0') Epoch 427, bias, value: tensor([ 0.0005, -0.0401, 0.0104, -0.0067, 0.0083, 0.0064, 0.0180, 0.0084, -0.0440, -0.0146], device='cuda:0'), grad: tensor([ 9.3132e-10, 6.5193e-09, 9.3132e-09, 6.5193e-08, -4.0047e-08, -1.9372e-07, 4.4703e-08, -8.3819e-09, 9.3132e-08, 3.7253e-08], device='cuda:0') 100 1e-05 changing lr epoch 426, time 246.54, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4443 re_mapping 0.0028 re_causal 0.0101 /// teacc 99.16 lr 0.00001000 Epoch 428, weight, value: tensor([[ 0.0207, -0.2023, -0.1976, ..., -0.3896, -0.1339, -0.1781], [ 0.0885, -0.0984, 0.0439, ..., 0.0457, 0.1429, -0.0682], [-0.1114, 0.1633, -0.2353, ..., 0.0589, 0.1029, -0.0418], ..., [-0.0792, -0.1036, -0.0847, ..., 0.0131, -0.2346, 0.1766], [ 0.0479, -0.0555, 0.1392, ..., 0.0082, -0.2713, -0.0415], [-0.2587, -0.1329, -0.1844, ..., -0.3303, 0.0884, -0.1106]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, -1.0245e-08, 0.0000e+00, ..., -2.6077e-08, -1.3970e-08, 0.0000e+00], ..., [ 0.0000e+00, 6.5193e-09, 1.8626e-09, ..., 1.7695e-08, 8.3819e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, -7.4506e-09, ..., -3.7253e-09, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 428, bias, value: tensor([ 0.0005, -0.0401, 0.0104, -0.0067, 0.0083, 0.0064, 0.0180, 0.0084, -0.0440, -0.0146], device='cuda:0'), grad: tensor([-1.8626e-09, 4.6566e-09, -7.4506e-08, 3.6322e-08, 9.3132e-10, -6.5193e-09, -5.5879e-09, 4.8429e-08, -2.7940e-09, 4.6566e-09], device='cuda:0') 100 1e-05 changing lr epoch 427, time 247.60, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4204 re_mapping 0.0028 re_causal 0.0102 /// teacc 99.19 lr 0.00001000 Epoch 429, weight, value: tensor([[ 0.0207, -0.2023, -0.1976, ..., -0.3897, -0.1339, -0.1781], [ 0.0885, -0.0984, 0.0439, ..., 0.0456, 0.1429, -0.0683], [-0.1114, 0.1633, -0.2353, ..., 0.0589, 0.1029, -0.0418], ..., [-0.0792, -0.1036, -0.0847, ..., 0.0131, -0.2346, 0.1767], [ 0.0479, -0.0555, 0.1392, ..., 0.0082, -0.2713, -0.0415], [-0.2587, -0.1330, -0.1844, ..., -0.3304, 0.0884, -0.1106]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 9.3132e-10, ..., 2.7940e-09, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-08, -3.4459e-08, ..., 2.5146e-08, 2.5146e-08, 0.0000e+00], [ 0.0000e+00, -4.0047e-08, 9.3132e-10, ..., -5.6811e-08, -9.1270e-08, 0.0000e+00], ..., [ 0.0000e+00, -0.0000e+00, 1.0245e-08, ..., -4.6566e-09, 1.3039e-08, 0.0000e+00], [ 0.0000e+00, 1.7695e-08, 6.5193e-09, ..., 2.4214e-08, 2.9802e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 2.7940e-09, -2.7940e-09, 0.0000e+00]], device='cuda:0') Epoch 429, bias, value: tensor([ 0.0005, -0.0401, 0.0104, -0.0067, 0.0083, 0.0064, 0.0180, 0.0085, -0.0440, -0.0146], device='cuda:0'), grad: tensor([ 2.7940e-08, 0.0000e+00, -1.8720e-07, 5.5879e-09, 2.7008e-08, 6.5193e-09, 2.6077e-08, -5.5879e-09, 9.0338e-08, -9.3132e-10], device='cuda:0') 100 1e-05 changing lr epoch 428, time 247.47, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4292 re_mapping 0.0028 re_causal 0.0100 /// teacc 99.19 lr 0.00001000 Epoch 430, weight, value: tensor([[ 0.0207, -0.2024, -0.1977, ..., -0.3898, -0.1340, -0.1782], [ 0.0885, -0.0984, 0.0439, ..., 0.0456, 0.1429, -0.0683], [-0.1114, 0.1633, -0.2354, ..., 0.0589, 0.1029, -0.0418], ..., [-0.0792, -0.1036, -0.0847, ..., 0.0131, -0.2346, 0.1767], [ 0.0479, -0.0555, 0.1392, ..., 0.0082, -0.2713, -0.0415], [-0.2587, -0.1330, -0.1844, ..., -0.3306, 0.0884, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -8.6613e-08, ..., -1.5460e-07, -3.6135e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.0978e-08, ..., 7.3574e-08, 1.6671e-07, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 4.5635e-08, ..., 5.4017e-08, 1.8533e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.4214e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 430, bias, value: tensor([ 0.0005, -0.0401, 0.0104, -0.0067, 0.0083, 0.0064, 0.0180, 0.0085, -0.0440, -0.0147], device='cuda:0'), grad: tensor([ 2.7940e-09, -6.2212e-07, 2.9616e-07, 1.3039e-08, 2.7940e-09, -5.4017e-08, 7.4506e-09, 2.0675e-07, 3.9116e-08, 1.0617e-07], device='cuda:0') 100 1e-05 changing lr epoch 429, time 247.57, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4407 re_mapping 0.0028 re_causal 0.0102 /// teacc 99.18 lr 0.00001000 Epoch 431, weight, value: tensor([[ 0.0208, -0.2024, -0.1977, ..., -0.3898, -0.1340, -0.1782], [ 0.0885, -0.0984, 0.0439, ..., 0.0456, 0.1430, -0.0683], [-0.1114, 0.1633, -0.2355, ..., 0.0589, 0.1029, -0.0418], ..., [-0.0792, -0.1037, -0.0848, ..., 0.0131, -0.2346, 0.1767], [ 0.0479, -0.0556, 0.1392, ..., 0.0082, -0.2713, -0.0415], [-0.2587, -0.1330, -0.1845, ..., -0.3307, 0.0884, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-09, 8.3819e-09, 4.6566e-09], [ 0.0000e+00, 0.0000e+00, -1.4901e-08, ..., 6.5658e-07, -4.7497e-08, 3.3900e-07], [ 0.0000e+00, 0.0000e+00, 3.3528e-08, ..., 5.6811e-08, 3.1665e-08, 1.3970e-08], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-09, ..., -8.8569e-07, 1.8626e-09, -4.4145e-07], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 6.7055e-08, 1.8626e-09, 3.2596e-08], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 7.7300e-08, 9.3132e-10, 3.8184e-08]], device='cuda:0') Epoch 431, bias, value: tensor([ 0.0005, -0.0401, 0.0104, -0.0067, 0.0083, 0.0064, 0.0181, 0.0085, -0.0440, -0.0147], device='cuda:0'), grad: tensor([-4.6659e-07, 2.7977e-06, 2.5891e-07, 1.2387e-07, 2.0489e-08, 3.9116e-08, 2.1420e-07, -3.7197e-06, 3.0082e-07, 4.3586e-07], device='cuda:0') 100 1e-05 changing lr epoch 430, time 247.02, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4007 re_mapping 0.0028 re_causal 0.0097 /// teacc 99.17 lr 0.00001000 Epoch 432, weight, value: tensor([[ 0.0208, -0.2024, -0.1977, ..., -0.3898, -0.1341, -0.1782], [ 0.0885, -0.0985, 0.0440, ..., 0.0456, 0.1430, -0.0683], [-0.1115, 0.1634, -0.2355, ..., 0.0589, 0.1030, -0.0418], ..., [-0.0791, -0.1037, -0.0848, ..., 0.0132, -0.2347, 0.1767], [ 0.0478, -0.0556, 0.1393, ..., 0.0082, -0.2714, -0.0415], [-0.2588, -0.1331, -0.1846, ..., -0.3309, 0.0884, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 1.8626e-09, 9.3132e-10, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8626e-09, 0.0000e+00, -9.3132e-10], [ 0.0000e+00, 0.0000e+00, -1.4901e-08, ..., 0.0000e+00, 2.7940e-09, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 1.4901e-08, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 432, bias, value: tensor([ 0.0005, -0.0401, 0.0105, -0.0067, 0.0084, 0.0064, 0.0181, 0.0085, -0.0440, -0.0147], device='cuda:0'), grad: tensor([ 1.3039e-08, 1.1176e-08, 9.3132e-10, 2.7940e-09, -7.4506e-09, -1.0245e-08, -1.5832e-08, -3.7253e-09, -5.0291e-08, 6.6124e-08], device='cuda:0') 100 1e-05 changing lr epoch 431, time 247.14, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4138 re_mapping 0.0028 re_causal 0.0097 /// teacc 99.18 lr 0.00001000 Epoch 433, weight, value: tensor([[ 0.0208, -0.2024, -0.1978, ..., -0.3899, -0.1341, -0.1782], [ 0.0885, -0.0985, 0.0440, ..., 0.0456, 0.1429, -0.0684], [-0.1115, 0.1634, -0.2355, ..., 0.0590, 0.1030, -0.0418], ..., [-0.0791, -0.1037, -0.0848, ..., 0.0132, -0.2347, 0.1768], [ 0.0478, -0.0556, 0.1393, ..., 0.0082, -0.2714, -0.0415], [-0.2588, -0.1331, -0.1846, ..., -0.3310, 0.0884, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 7.4506e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -5.5879e-09, -0.0000e+00]], device='cuda:0') Epoch 433, bias, value: tensor([ 0.0005, -0.0402, 0.0105, -0.0067, 0.0084, 0.0064, 0.0181, 0.0085, -0.0441, -0.0147], device='cuda:0'), grad: tensor([-1.1176e-08, 1.1176e-08, 3.5390e-08, 7.4506e-09, 1.2107e-08, 4.6566e-09, -5.8673e-08, 4.6566e-09, 2.4214e-08, -2.2352e-08], device='cuda:0') 100 1e-05 changing lr epoch 432, time 247.94, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4491 re_mapping 0.0028 re_causal 0.0100 /// teacc 99.16 lr 0.00001000 Epoch 434, weight, value: tensor([[ 0.0208, -0.2024, -0.1978, ..., -0.3899, -0.1342, -0.1782], [ 0.0885, -0.0985, 0.0440, ..., 0.0456, 0.1430, -0.0684], [-0.1115, 0.1634, -0.2355, ..., 0.0590, 0.1030, -0.0418], ..., [-0.0791, -0.1037, -0.0848, ..., 0.0132, -0.2347, 0.1768], [ 0.0478, -0.0556, 0.1393, ..., 0.0082, -0.2714, -0.0415], [-0.2588, -0.1331, -0.1847, ..., -0.3311, 0.0884, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.0978e-08, ..., 1.4156e-07, 1.2107e-08, 9.3132e-10], [ 0.0000e+00, -6.5193e-09, 2.7940e-09, ..., -4.9360e-08, -9.3132e-09, 0.0000e+00], ..., [ 0.0000e+00, 6.5193e-09, -4.0978e-08, ..., -1.0245e-07, 1.6764e-08, -9.3132e-10], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 9.3132e-10, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 1.0245e-08, 5.9605e-08, 0.0000e+00]], device='cuda:0') Epoch 434, bias, value: tensor([ 0.0005, -0.0402, 0.0105, -0.0067, 0.0084, 0.0064, 0.0181, 0.0085, -0.0441, -0.0147], device='cuda:0'), grad: tensor([ 2.2352e-08, 5.4203e-07, -2.0582e-07, -3.7253e-09, -2.8498e-07, 1.3039e-08, -8.7544e-08, -2.9989e-07, 6.6124e-08, 2.4214e-07], device='cuda:0') 100 1e-05 changing lr epoch 433, time 247.69, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4407 re_mapping 0.0028 re_causal 0.0100 /// teacc 99.17 lr 0.00001000 Epoch 435, weight, value: tensor([[ 0.0208, -0.2024, -0.1978, ..., -0.3900, -0.1342, -0.1782], [ 0.0884, -0.0985, 0.0440, ..., 0.0456, 0.1430, -0.0685], [-0.1115, 0.1635, -0.2356, ..., 0.0590, 0.1030, -0.0418], ..., [-0.0791, -0.1037, -0.0848, ..., 0.0131, -0.2348, 0.1769], [ 0.0478, -0.0556, 0.1393, ..., 0.0082, -0.2715, -0.0415], [-0.2588, -0.1332, -0.1847, ..., -0.3313, 0.0884, -0.1107]], device='cuda:0'), grad: tensor([[ 6.5193e-09, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -1.3411e-07, ..., -9.9652e-08, -1.2107e-07, 0.0000e+00], [ 0.0000e+00, -3.7253e-09, 1.8626e-09, ..., -6.5193e-09, -1.3970e-08, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-10, 1.2666e-07, ..., 1.0151e-07, 1.2293e-07, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, -9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 435, bias, value: tensor([ 0.0005, -0.0401, 0.0105, -0.0067, 0.0084, 0.0064, 0.0181, 0.0085, -0.0441, -0.0147], device='cuda:0'), grad: tensor([ 2.6356e-07, -3.4552e-07, -2.2352e-08, 3.7253e-09, 2.2352e-08, 1.8626e-09, -3.0454e-07, 3.3621e-07, 4.2841e-08, -9.3132e-10], device='cuda:0') 100 1e-05 changing lr epoch 434, time 246.83, cls_loss 0.0006 cls_loss_mapping 0.0003 cls_loss_causal 0.3939 re_mapping 0.0028 re_causal 0.0096 /// teacc 99.16 lr 0.00001000 Epoch 436, weight, value: tensor([[ 0.0208, -0.2025, -0.1979, ..., -0.3901, -0.1343, -0.1782], [ 0.0884, -0.0986, 0.0440, ..., 0.0456, 0.1430, -0.0685], [-0.1115, 0.1635, -0.2357, ..., 0.0590, 0.1031, -0.0417], ..., [-0.0792, -0.1038, -0.0848, ..., 0.0131, -0.2348, 0.1769], [ 0.0478, -0.0556, 0.1393, ..., 0.0082, -0.2715, -0.0415], [-0.2588, -0.1332, -0.1848, ..., -0.3314, 0.0884, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 1.3970e-09, 0.0000e+00], [ 0.0000e+00, 7.9162e-09, -2.7940e-09, ..., 1.2107e-08, 1.5832e-08, -0.0000e+00], [ 0.0000e+00, -1.3504e-08, 1.3970e-09, ..., -2.0023e-08, -3.3528e-08, 0.0000e+00], ..., [ 0.0000e+00, 3.2596e-09, 1.8626e-09, ..., 5.5879e-09, 9.7789e-09, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, -3.7253e-09, ..., -5.1223e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 1.3970e-09, 0.0000e+00]], device='cuda:0') Epoch 436, bias, value: tensor([ 0.0004, -0.0402, 0.0106, -0.0067, 0.0084, 0.0064, 0.0181, 0.0085, -0.0441, -0.0148], device='cuda:0'), grad: tensor([ 5.1223e-09, 4.2375e-08, -7.5903e-08, 6.0536e-09, 5.5879e-09, 9.3132e-10, 0.0000e+00, 2.0489e-08, -6.0536e-09, 4.6566e-09], device='cuda:0') 100 1e-05 changing lr epoch 435, time 246.99, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4225 re_mapping 0.0027 re_causal 0.0097 /// teacc 99.18 lr 0.00001000 Epoch 437, weight, value: tensor([[ 0.0208, -0.2025, -0.1980, ..., -0.3901, -0.1343, -0.1782], [ 0.0884, -0.0985, 0.0441, ..., 0.0456, 0.1431, -0.0685], [-0.1116, 0.1636, -0.2358, ..., 0.0590, 0.1031, -0.0417], ..., [-0.0791, -0.1038, -0.0849, ..., 0.0131, -0.2349, 0.1770], [ 0.0478, -0.0557, 0.1393, ..., 0.0082, -0.2716, -0.0415], [-0.2589, -0.1332, -0.1848, ..., -0.3316, 0.0884, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 7.4506e-09, 4.6566e-10, ..., 4.1910e-09, 6.9849e-09, 0.0000e+00], [ 2.2352e-08, 3.4925e-08, 3.2596e-09, ..., 3.1199e-08, 8.4750e-08, 0.0000e+00], [ 9.3132e-10, -1.8813e-07, 2.3283e-09, ..., -9.7789e-08, -1.4715e-07, 0.0000e+00], ..., [ 4.6566e-09, 9.3132e-08, -4.6566e-10, ..., 3.1199e-08, 9.5926e-08, 0.0000e+00], [-4.6566e-10, 1.3970e-08, -1.3039e-08, ..., 4.6566e-10, 1.0710e-08, 0.0000e+00], [ 2.3283e-09, 5.1223e-09, 4.6566e-10, ..., 5.5879e-09, 7.4506e-09, 0.0000e+00]], device='cuda:0') Epoch 437, bias, value: tensor([ 0.0004, -0.0401, 0.0106, -0.0067, 0.0084, 0.0064, 0.0181, 0.0085, -0.0442, -0.0148], device='cuda:0'), grad: tensor([ 2.3749e-08, 2.5099e-07, -4.4890e-07, 6.3330e-08, -1.6019e-07, 1.2107e-08, 7.9162e-09, 2.1933e-07, 1.5367e-08, 2.9802e-08], device='cuda:0') 100 1e-05 changing lr ---------------------saving model at epoch 436---------------------------------------------------- epoch 436, time 264.40, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4227 re_mapping 0.0027 re_causal 0.0098 /// teacc 99.21 lr 0.00001000 Epoch 438, weight, value: tensor([[ 0.0208, -0.2025, -0.1980, ..., -0.3902, -0.1344, -0.1782], [ 0.0884, -0.0985, 0.0441, ..., 0.0456, 0.1432, -0.0685], [-0.1116, 0.1636, -0.2360, ..., 0.0590, 0.1031, -0.0417], ..., [-0.0791, -0.1039, -0.0849, ..., 0.0132, -0.2350, 0.1770], [ 0.0478, -0.0557, 0.1393, ..., 0.0082, -0.2716, -0.0415], [-0.2589, -0.1332, -0.1849, ..., -0.3318, 0.0883, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 4.6566e-10, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.3039e-08, ..., -8.8476e-09, -1.9558e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 6.9849e-09, ..., 4.6566e-09, 2.3283e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.2107e-08, ..., 8.3819e-09, 1.2107e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 5.1223e-09, 6.9849e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 4.6566e-10, 4.6566e-10, 0.0000e+00]], device='cuda:0') Epoch 438, bias, value: tensor([ 0.0004, -0.0402, 0.0105, -0.0067, 0.0085, 0.0064, 0.0181, 0.0085, -0.0442, -0.0148], device='cuda:0'), grad: tensor([ 1.0710e-08, -2.9802e-08, 1.0291e-07, -4.5635e-08, 1.0943e-07, 1.0245e-08, -2.1467e-07, 3.1665e-08, 2.6077e-08, 3.2596e-09], device='cuda:0') 100 1e-05 changing lr ---------------------saving model at epoch 437---------------------------------------------------- epoch 437, time 263.05, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4222 re_mapping 0.0028 re_causal 0.0098 /// teacc 99.22 lr 0.00001000 Epoch 439, weight, value: tensor([[ 0.0208, -0.2026, -0.1981, ..., -0.3902, -0.1345, -0.1782], [ 0.0884, -0.0986, 0.0441, ..., 0.0456, 0.1432, -0.0686], [-0.1116, 0.1637, -0.2360, ..., 0.0590, 0.1032, -0.0417], ..., [-0.0791, -0.1039, -0.0849, ..., 0.0131, -0.2351, 0.1770], [ 0.0478, -0.0557, 0.1393, ..., 0.0082, -0.2717, -0.0415], [-0.2589, -0.1333, -0.1850, ..., -0.3319, 0.0883, -0.1107]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 2.5611e-08, 9.3132e-09], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-09, 1.6764e-08, 3.2596e-09], [ 0.0000e+00, 1.8626e-09, 2.0023e-08, ..., 1.9092e-08, 9.7789e-09, 3.2596e-09], ..., [ 0.0000e+00, -4.1910e-09, 2.8871e-08, ..., -2.7940e-09, 4.0513e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 8.3819e-09, ..., 4.1910e-09, 8.1491e-08, 5.1223e-09], [ 0.0000e+00, 9.3132e-10, 1.3970e-09, ..., 6.0536e-09, -1.1884e-06, 0.0000e+00]], device='cuda:0') Epoch 439, bias, value: tensor([ 0.0004, -0.0402, 0.0106, -0.0067, 0.0085, 0.0064, 0.0181, 0.0085, -0.0442, -0.0149], device='cuda:0'), grad: tensor([ 8.9873e-08, 8.1025e-08, 9.7789e-08, -1.2154e-07, 3.9674e-06, 6.9384e-08, -8.6613e-08, 1.4855e-07, 3.1758e-07, -4.5486e-06], device='cuda:0') 100 1e-05 changing lr epoch 438, time 246.89, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4241 re_mapping 0.0028 re_causal 0.0099 /// teacc 99.22 lr 0.00001000 Epoch 440, weight, value: tensor([[ 0.0208, -0.2026, -0.1981, ..., -0.3904, -0.1345, -0.1783], [ 0.0884, -0.0986, 0.0441, ..., 0.0456, 0.1432, -0.0686], [-0.1116, 0.1637, -0.2361, ..., 0.0590, 0.1032, -0.0417], ..., [-0.0791, -0.1039, -0.0849, ..., 0.0132, -0.2351, 0.1771], [ 0.0477, -0.0557, 0.1393, ..., 0.0082, -0.2718, -0.0416], [-0.2589, -0.1333, -0.1851, ..., -0.3320, 0.0883, -0.1108]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.4901e-08, ..., 3.6787e-08, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, -4.6566e-10, 8.3819e-09, ..., 4.6566e-09, 4.6566e-10, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-09, ..., -2.7474e-08, 2.3283e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-09, 2.3283e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 3.2596e-09, 4.6566e-10, 0.0000e+00]], device='cuda:0') Epoch 440, bias, value: tensor([ 0.0004, -0.0402, 0.0106, -0.0067, 0.0085, 0.0064, 0.0181, 0.0085, -0.0443, -0.0149], device='cuda:0'), grad: tensor([ 5.5879e-09, 1.4342e-07, 1.6764e-08, -7.0315e-08, 9.7789e-09, 1.0710e-08, -1.8161e-08, -1.2526e-07, 2.3283e-08, 1.3970e-08], device='cuda:0') 100 1e-05 changing lr epoch 439, time 247.47, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4265 re_mapping 0.0028 re_causal 0.0099 /// teacc 99.19 lr 0.00001000 Epoch 441, weight, value: tensor([[ 0.0208, -0.2026, -0.1981, ..., -0.3904, -0.1346, -0.1783], [ 0.0884, -0.0986, 0.0442, ..., 0.0456, 0.1432, -0.0686], [-0.1116, 0.1637, -0.2361, ..., 0.0590, 0.1032, -0.0417], ..., [-0.0791, -0.1039, -0.0849, ..., 0.0132, -0.2352, 0.1771], [ 0.0477, -0.0558, 0.1393, ..., 0.0081, -0.2718, -0.0416], [-0.2589, -0.1333, -0.1851, ..., -0.3322, 0.0883, -0.1108]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 4.6566e-10, 4.6566e-10, ..., 4.6566e-10, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, -1.8626e-09, ..., -0.0000e+00, -3.2596e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.2596e-09, ..., 1.3970e-09, -4.6566e-10, 0.0000e+00], ..., [ 2.0955e-08, 4.6566e-10, 4.6566e-10, ..., 4.6566e-10, 1.8626e-09, 0.0000e+00], [ 1.1642e-08, -1.3970e-09, -3.2596e-09, ..., -2.3283e-09, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, -6.0536e-09, 0.0000e+00]], device='cuda:0') Epoch 441, bias, value: tensor([ 0.0004, -0.0402, 0.0107, -0.0067, 0.0086, 0.0064, 0.0181, 0.0085, -0.0444, -0.0149], device='cuda:0'), grad: tensor([ 4.6566e-09, -2.7940e-09, 3.7253e-09, 3.2596e-08, 1.2573e-08, -1.1036e-07, 6.5193e-09, 5.8673e-08, 2.2352e-08, -1.3504e-08], device='cuda:0') 100 1e-05 changing lr epoch 440, time 247.15, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4143 re_mapping 0.0027 re_causal 0.0096 /// teacc 99.18 lr 0.00001000 Epoch 442, weight, value: tensor([[ 0.0208, -0.2027, -0.1982, ..., -0.3905, -0.1346, -0.1783], [ 0.0884, -0.0986, 0.0441, ..., 0.0455, 0.1432, -0.0686], [-0.1116, 0.1637, -0.2362, ..., 0.0591, 0.1033, -0.0417], ..., [-0.0791, -0.1040, -0.0849, ..., 0.0132, -0.2352, 0.1771], [ 0.0477, -0.0558, 0.1393, ..., 0.0081, -0.2719, -0.0416], [-0.2589, -0.1334, -0.1851, ..., -0.3324, 0.0883, -0.1108]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 4.6566e-10, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 3.7253e-09, ..., 1.3970e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, -1.3970e-09, 2.1420e-08, ..., -2.7940e-09, -2.7940e-09, 0.0000e+00], ..., [ 0.0000e+00, 1.3970e-09, 6.5193e-09, ..., 2.3283e-09, 2.7008e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.2072e-07, ..., -2.1886e-08, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-09, ..., 4.6566e-10, -2.5146e-08, 0.0000e+00]], device='cuda:0') Epoch 442, bias, value: tensor([ 0.0004, -0.0403, 0.0107, -0.0067, 0.0086, 0.0064, 0.0181, 0.0085, -0.0444, -0.0150], device='cuda:0'), grad: tensor([ 2.7940e-09, 1.2107e-08, 3.9116e-08, 1.0664e-07, 5.0291e-08, 6.9849e-09, 8.9407e-08, 1.1409e-07, -3.2410e-07, -8.2888e-08], device='cuda:0') 100 1e-05 changing lr epoch 441, time 247.82, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4271 re_mapping 0.0027 re_causal 0.0098 /// teacc 99.17 lr 0.00001000 Epoch 443, weight, value: tensor([[ 0.0208, -0.2027, -0.1982, ..., -0.3905, -0.1347, -0.1783], [ 0.0884, -0.0986, 0.0441, ..., 0.0455, 0.1432, -0.0686], [-0.1117, 0.1638, -0.2362, ..., 0.0591, 0.1033, -0.0417], ..., [-0.0791, -0.1040, -0.0849, ..., 0.0132, -0.2353, 0.1771], [ 0.0477, -0.0558, 0.1394, ..., 0.0082, -0.2720, -0.0416], [-0.2589, -0.1334, -0.1852, ..., -0.3325, 0.0883, -0.1108]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.2107e-08, 0.0000e+00], [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 2.3283e-09, 2.3283e-09, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 4.6566e-10, ..., -3.2596e-09, -2.7940e-09, 0.0000e+00], ..., [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 2.3283e-09, 0.0000e+00], [ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, -2.7940e-09, 0.0000e+00]], device='cuda:0') Epoch 443, bias, value: tensor([ 0.0004, -0.0403, 0.0108, -0.0067, 0.0086, 0.0064, 0.0182, 0.0085, -0.0444, -0.0150], device='cuda:0'), grad: tensor([ 4.0047e-08, 1.0710e-08, -6.0536e-09, 2.7940e-09, 7.9162e-09, 1.3970e-08, -7.8231e-08, 1.0245e-08, 1.8626e-08, -1.6298e-08], device='cuda:0') 100 1e-05 changing lr epoch 442, time 247.84, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4329 re_mapping 0.0027 re_causal 0.0098 /// teacc 99.17 lr 0.00001000 Epoch 444, weight, value: tensor([[ 0.0208, -0.2027, -0.1983, ..., -0.3906, -0.1348, -0.1783], [ 0.0884, -0.0986, 0.0441, ..., 0.0455, 0.1432, -0.0686], [-0.1117, 0.1638, -0.2364, ..., 0.0591, 0.1033, -0.0417], ..., [-0.0791, -0.1040, -0.0849, ..., 0.0132, -0.2353, 0.1772], [ 0.0477, -0.0558, 0.1394, ..., 0.0081, -0.2721, -0.0416], [-0.2589, -0.1334, -0.1852, ..., -0.3326, 0.0883, -0.1108]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -0.0000e+00, ..., 0.0000e+00, 3.2596e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 9.3132e-10, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [-0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.3283e-09, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 4.1910e-09, 0.0000e+00]], device='cuda:0') Epoch 444, bias, value: tensor([ 0.0004, -0.0403, 0.0108, -0.0067, 0.0086, 0.0064, 0.0182, 0.0086, -0.0445, -0.0150], device='cuda:0'), grad: tensor([ 4.1910e-09, 6.9849e-09, 3.2596e-09, 7.9162e-09, -1.9558e-08, 3.7253e-09, -1.4435e-08, -9.7789e-09, 4.1910e-09, 1.9558e-08], device='cuda:0') 100 1e-05 changing lr epoch 443, time 246.88, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4280 re_mapping 0.0028 re_causal 0.0099 /// teacc 99.15 lr 0.00001000 Epoch 445, weight, value: tensor([[ 0.0209, -0.2027, -0.1983, ..., -0.3906, -0.1349, -0.1783], [ 0.0884, -0.0988, 0.0441, ..., 0.0455, 0.1432, -0.0687], [-0.1117, 0.1639, -0.2364, ..., 0.0591, 0.1035, -0.0417], ..., [-0.0792, -0.1041, -0.0849, ..., 0.0132, -0.2354, 0.1772], [ 0.0477, -0.0558, 0.1394, ..., 0.0082, -0.2721, -0.0416], [-0.2590, -0.1334, -0.1853, ..., -0.3327, 0.0883, -0.1108]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 5.5879e-09, 2.5611e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.5856e-08, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, -9.3132e-10, 4.6566e-10, ..., -1.3970e-09, -1.8626e-08, 0.0000e+00], ..., [ 0.0000e+00, 4.6566e-10, -4.6566e-10, ..., -7.4040e-08, 1.5367e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.7695e-08, -2.8405e-08, 0.0000e+00]], device='cuda:0') Epoch 445, bias, value: tensor([ 0.0004, -0.0404, 0.0109, -0.0067, 0.0086, 0.0064, 0.0182, 0.0086, -0.0445, -0.0150], device='cuda:0'), grad: tensor([ 1.1502e-07, 1.4668e-07, -7.4506e-09, 1.1129e-07, 1.3504e-08, -3.8650e-08, -2.3283e-09, -2.8964e-07, 1.1642e-08, -5.3085e-08], device='cuda:0') 100 1e-05 changing lr epoch 444, time 247.07, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4224 re_mapping 0.0027 re_causal 0.0094 /// teacc 99.17 lr 0.00001000 Epoch 446, weight, value: tensor([[ 0.0209, -0.2027, -0.1984, ..., -0.3907, -0.1349, -0.1783], [ 0.0884, -0.0988, 0.0441, ..., 0.0455, 0.1432, -0.0687], [-0.1117, 0.1640, -0.2365, ..., 0.0592, 0.1035, -0.0417], ..., [-0.0792, -0.1041, -0.0849, ..., 0.0132, -0.2356, 0.1772], [ 0.0477, -0.0559, 0.1395, ..., 0.0082, -0.2722, -0.0415], [-0.2590, -0.1335, -0.1854, ..., -0.3329, 0.0883, -0.1108]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 8.3819e-09, ..., 4.6566e-09, 1.3970e-09, -2.5379e-07], [ 0.0000e+00, 0.0000e+00, -9.3132e-10, ..., -4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, -1.3970e-09, 0.0000e+00]], device='cuda:0') Epoch 446, bias, value: tensor([ 0.0004, -0.0404, 0.0109, -0.0067, 0.0086, 0.0064, 0.0182, 0.0086, -0.0445, -0.0150], device='cuda:0'), grad: tensor([-2.7940e-09, 6.0536e-09, 5.1223e-09, -4.1444e-07, 6.3842e-07, 3.9348e-07, 1.3970e-09, -6.1886e-07, -1.3970e-09, 4.6566e-09], device='cuda:0') 100 1e-05 changing lr epoch 445, time 247.42, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4477 re_mapping 0.0027 re_causal 0.0099 /// teacc 99.16 lr 0.00001000 Epoch 447, weight, value: tensor([[ 0.0209, -0.2028, -0.1984, ..., -0.3908, -0.1350, -0.1783], [ 0.0884, -0.0988, 0.0442, ..., 0.0455, 0.1433, -0.0687], [-0.1117, 0.1641, -0.2366, ..., 0.0592, 0.1036, -0.0417], ..., [-0.0792, -0.1042, -0.0849, ..., 0.0132, -0.2357, 0.1773], [ 0.0477, -0.0559, 0.1395, ..., 0.0082, -0.2723, -0.0415], [-0.2590, -0.1335, -0.1855, ..., -0.3331, 0.0884, -0.1109]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 4.6566e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, -1.3970e-09, ..., 2.7940e-09, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, -9.7789e-09, 1.8626e-08, ..., -4.3772e-08, -2.0489e-08, 0.0000e+00], ..., [ 0.0000e+00, 7.9162e-09, 9.3132e-09, ..., 4.1444e-08, 2.5611e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 0.0000e+00, 2.3283e-09, 0.0000e+00]], device='cuda:0') Epoch 447, bias, value: tensor([ 0.0004, -0.0404, 0.0110, -0.0068, 0.0085, 0.0064, 0.0182, 0.0085, -0.0445, -0.0149], device='cuda:0'), grad: tensor([ 5.5879e-09, 2.5611e-08, -6.4261e-08, -6.8452e-08, -4.4238e-08, 6.9849e-09, 9.3132e-10, 1.2619e-07, 6.5193e-09, 1.5367e-08], device='cuda:0') 100 1e-05 changing lr epoch 446, time 247.53, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4031 re_mapping 0.0027 re_causal 0.0095 /// teacc 99.16 lr 0.00001000 Epoch 448, weight, value: tensor([[ 0.0209, -0.2028, -0.1985, ..., -0.3908, -0.1351, -0.1783], [ 0.0884, -0.0989, 0.0442, ..., 0.0454, 0.1434, -0.0687], [-0.1117, 0.1642, -0.2366, ..., 0.0592, 0.1036, -0.0417], ..., [-0.0792, -0.1042, -0.0850, ..., 0.0132, -0.2359, 0.1773], [ 0.0477, -0.0559, 0.1395, ..., 0.0082, -0.2724, -0.0415], [-0.2590, -0.1335, -0.1856, ..., -0.3332, 0.0884, -0.1109]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 3.7253e-09, 4.1910e-09, 0.0000e+00], [ 0.0000e+00, -2.7008e-08, 4.6566e-10, ..., -7.9628e-08, -8.6147e-08, 0.0000e+00], ..., [ 0.0000e+00, 2.6077e-08, 9.3132e-10, ..., 7.7765e-08, 8.4750e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 448, bias, value: tensor([ 0.0004, -0.0404, 0.0110, -0.0068, 0.0085, 0.0064, 0.0182, 0.0085, -0.0446, -0.0149], device='cuda:0'), grad: tensor([ 2.3283e-09, 1.3504e-08, -2.6776e-07, 0.0000e+00, -3.7253e-09, 0.0000e+00, -2.7940e-09, 2.6356e-07, 0.0000e+00, 3.2596e-09], device='cuda:0') 100 1e-05 changing lr epoch 447, time 246.83, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4356 re_mapping 0.0028 re_causal 0.0101 /// teacc 99.18 lr 0.00001000 Epoch 449, weight, value: tensor([[ 0.0209, -0.2028, -0.1985, ..., -0.3909, -0.1351, -0.1783], [ 0.0883, -0.0989, 0.0442, ..., 0.0454, 0.1434, -0.0687], [-0.1117, 0.1642, -0.2368, ..., 0.0592, 0.1037, -0.0417], ..., [-0.0792, -0.1043, -0.0849, ..., 0.0133, -0.2359, 0.1773], [ 0.0476, -0.0559, 0.1396, ..., 0.0082, -0.2724, -0.0415], [-0.2591, -0.1335, -0.1857, ..., -0.3333, 0.0885, -0.1109]], device='cuda:0'), grad: tensor([[ 5.5879e-09, -3.7253e-09, 0.0000e+00, ..., 0.0000e+00, 1.6764e-08, 4.6566e-10], [ 4.6566e-10, 0.0000e+00, 4.6566e-10, ..., 3.2596e-09, 1.3970e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 4.6566e-10, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., -5.5879e-09, 4.6566e-10, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 9.3132e-09, 2.3283e-09], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 2.3283e-09, -9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 449, bias, value: tensor([ 0.0004, -0.0404, 0.0110, -0.0068, 0.0085, 0.0064, 0.0182, 0.0085, -0.0446, -0.0149], device='cuda:0'), grad: tensor([ 3.3062e-08, 2.1420e-08, 8.3819e-09, 5.5879e-09, 1.0245e-08, 2.7940e-08, -1.2433e-07, -2.8871e-08, 3.7719e-08, 1.8161e-08], device='cuda:0') 100 1e-05 changing lr epoch 448, time 247.39, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4368 re_mapping 0.0027 re_causal 0.0099 /// teacc 99.18 lr 0.00001000 Epoch 450, weight, value: tensor([[ 0.0210, -0.2028, -0.1986, ..., -0.3909, -0.1352, -0.1783], [ 0.0883, -0.0989, 0.0443, ..., 0.0455, 0.1436, -0.0688], [-0.1117, 0.1642, -0.2369, ..., 0.0592, 0.1037, -0.0417], ..., [-0.0792, -0.1043, -0.0851, ..., 0.0131, -0.2362, 0.1773], [ 0.0476, -0.0559, 0.1396, ..., 0.0082, -0.2725, -0.0415], [-0.2591, -0.1336, -0.1858, ..., -0.3335, 0.0885, -0.1109]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 6.3796e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.3283e-09, ..., 2.3283e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, -4.6566e-10, ..., -7.7300e-08, 4.6566e-10, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 1.1176e-08, -9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 450, bias, value: tensor([ 0.0004, -0.0403, 0.0111, -0.0068, 0.0085, 0.0064, 0.0183, 0.0084, -0.0446, -0.0149], device='cuda:0'), grad: tensor([ 3.2596e-09, 2.3562e-07, 9.7789e-09, 1.5832e-08, 1.3970e-09, -1.3225e-07, 9.2667e-08, -2.8126e-07, 1.6298e-08, 3.9581e-08], device='cuda:0') 100 1e-05 changing lr epoch 449, time 248.01, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4057 re_mapping 0.0027 re_causal 0.0095 /// teacc 99.17 lr 0.00001000 Epoch 451, weight, value: tensor([[ 0.0210, -0.2028, -0.1986, ..., -0.3910, -0.1353, -0.1783], [ 0.0883, -0.0989, 0.0444, ..., 0.0455, 0.1436, -0.0688], [-0.1117, 0.1642, -0.2371, ..., 0.0592, 0.1037, -0.0417], ..., [-0.0792, -0.1044, -0.0851, ..., 0.0132, -0.2362, 0.1774], [ 0.0476, -0.0559, 0.1396, ..., 0.0082, -0.2726, -0.0415], [-0.2591, -0.1336, -0.1858, ..., -0.3337, 0.0885, -0.1109]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 2.3283e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 1.9092e-08, ..., 1.0012e-07, 7.4506e-09, 3.3528e-08], [ 0.0000e+00, -3.0268e-08, 9.3132e-10, ..., -4.8894e-08, -4.5169e-08, 0.0000e+00], ..., [ 0.0000e+00, 2.5611e-08, -1.9092e-08, ..., -8.7079e-08, 3.9116e-08, -3.5856e-08], [ 0.0000e+00, 1.8626e-09, -9.3132e-10, ..., 6.0536e-09, 9.3132e-09, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 2.3749e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 451, bias, value: tensor([ 0.0004, -0.0403, 0.0110, -0.0068, 0.0085, 0.0064, 0.0183, 0.0084, -0.0447, -0.0149], device='cuda:0'), grad: tensor([-2.4820e-07, 4.0932e-07, -1.4342e-07, 2.6077e-08, -8.3819e-09, 5.6811e-08, 5.3085e-08, -3.2922e-07, 4.8894e-08, 1.4389e-07], device='cuda:0') 100 1e-05 changing lr epoch 450, time 247.85, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4301 re_mapping 0.0027 re_causal 0.0097 /// teacc 99.16 lr 0.00001000 Epoch 452, weight, value: tensor([[ 0.0210, -0.2028, -0.1987, ..., -0.3911, -0.1353, -0.1783], [ 0.0883, -0.0988, 0.0444, ..., 0.0455, 0.1437, -0.0689], [-0.1117, 0.1643, -0.2373, ..., 0.0592, 0.1037, -0.0417], ..., [-0.0792, -0.1045, -0.0851, ..., 0.0132, -0.2364, 0.1775], [ 0.0476, -0.0560, 0.1396, ..., 0.0082, -0.2726, -0.0415], [-0.2591, -0.1337, -0.1859, ..., -0.3339, 0.0885, -0.1109]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.3970e-09, 4.6566e-10, ..., 4.6566e-10, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 7.9162e-08, ..., 9.6392e-08, 1.3970e-09, 0.0000e+00], [ 0.0000e+00, -1.3970e-09, 3.7253e-09, ..., 3.2596e-09, -2.7940e-09, 0.0000e+00], ..., [ 0.0000e+00, -4.6566e-10, -8.8941e-08, ..., -1.1083e-07, -1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -6.0536e-09, ..., -4.6566e-10, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 6.9849e-09, ..., 8.3819e-09, -4.6566e-10, 0.0000e+00]], device='cuda:0') Epoch 452, bias, value: tensor([ 0.0004, -0.0404, 0.0110, -0.0068, 0.0086, 0.0064, 0.0183, 0.0085, -0.0447, -0.0150], device='cuda:0'), grad: tensor([ 7.4506e-09, 3.5530e-07, 5.1223e-09, 2.6543e-08, 1.8626e-09, -1.2573e-08, 3.2596e-09, -4.0419e-07, -9.7789e-09, 2.9802e-08], device='cuda:0') 100 1e-05 changing lr epoch 451, time 247.26, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4272 re_mapping 0.0027 re_causal 0.0098 /// teacc 99.14 lr 0.00001000 Epoch 453, weight, value: tensor([[ 0.0210, -0.2028, -0.1987, ..., -0.3911, -0.1354, -0.1783], [ 0.0883, -0.0988, 0.0443, ..., 0.0454, 0.1438, -0.0689], [-0.1118, 0.1643, -0.2374, ..., 0.0592, 0.1037, -0.0417], ..., [-0.0792, -0.1045, -0.0850, ..., 0.0133, -0.2364, 0.1775], [ 0.0476, -0.0560, 0.1397, ..., 0.0082, -0.2727, -0.0415], [-0.2591, -0.1337, -0.1860, ..., -0.3341, 0.0885, -0.1109]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 2.7940e-09, ..., 4.6566e-10, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 1.1176e-08, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., -1.3504e-08, 1.3970e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.1420e-08, ..., 4.6566e-10, 1.3970e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 2.2352e-08, ..., 1.3970e-09, 4.6566e-10, 0.0000e+00]], device='cuda:0') Epoch 453, bias, value: tensor([ 0.0004, -0.0404, 0.0110, -0.0068, 0.0086, 0.0064, 0.0183, 0.0085, -0.0447, -0.0149], device='cuda:0'), grad: tensor([ 1.5367e-08, 5.4482e-08, 8.8476e-09, -1.6298e-08, -2.3283e-09, 6.5193e-08, -8.0559e-08, -4.8894e-08, -8.1956e-08, 9.6858e-08], device='cuda:0') 100 1e-05 changing lr epoch 452, time 247.84, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4112 re_mapping 0.0027 re_causal 0.0096 /// teacc 99.14 lr 0.00001000 Epoch 454, weight, value: tensor([[ 0.0210, -0.2028, -0.1988, ..., -0.3912, -0.1355, -0.1783], [ 0.0882, -0.0988, 0.0444, ..., 0.0454, 0.1438, -0.0689], [-0.1118, 0.1644, -0.2375, ..., 0.0592, 0.1037, -0.0417], ..., [-0.0792, -0.1046, -0.0851, ..., 0.0132, -0.2365, 0.1775], [ 0.0476, -0.0560, 0.1397, ..., 0.0082, -0.2727, -0.0415], [-0.2591, -0.1337, -0.1861, ..., -0.3343, 0.0886, -0.1109]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 1.9558e-08, ..., 6.9849e-09, 2.0023e-08, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, -1.2619e-07, ..., -1.6298e-08, -1.2014e-07, 0.0000e+00], [ 0.0000e+00, -6.0536e-09, 2.2817e-08, ..., -1.3039e-08, 2.7940e-09, 0.0000e+00], ..., [ 0.0000e+00, 4.6566e-09, 6.2864e-08, ..., 7.4506e-09, 7.4506e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3970e-08, ..., 3.7253e-09, 1.2573e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 7.9162e-09, ..., 1.0245e-08, 6.9849e-09, 0.0000e+00]], device='cuda:0') Epoch 454, bias, value: tensor([ 0.0004, -0.0404, 0.0111, -0.0068, 0.0086, 0.0064, 0.0183, 0.0085, -0.0447, -0.0149], device='cuda:0'), grad: tensor([-4.0978e-08, -3.2084e-07, 3.1665e-08, 1.1642e-08, 2.3283e-09, -2.0489e-08, 2.4214e-08, 1.7183e-07, 6.8918e-08, 8.0094e-08], device='cuda:0') 100 1e-05 changing lr epoch 453, time 247.17, cls_loss 0.0006 cls_loss_mapping 0.0004 cls_loss_causal 0.4440 re_mapping 0.0028 re_causal 0.0098 /// teacc 99.17 lr 0.00001000 Epoch 455, weight, value: tensor([[ 0.0211, -0.2028, -0.1990, ..., -0.3913, -0.1357, -0.1783], [ 0.0882, -0.0988, 0.0444, ..., 0.0455, 0.1439, -0.0689], [-0.1119, 0.1644, -0.2377, ..., 0.0592, 0.1037, -0.0417], ..., [-0.0792, -0.1046, -0.0851, ..., 0.0133, -0.2366, 0.1775], [ 0.0475, -0.0560, 0.1398, ..., 0.0082, -0.2729, -0.0415], [-0.2592, -0.1338, -0.1862, ..., -0.3344, 0.0886, -0.1109]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 1.3970e-09, 6.0536e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.4435e-08, ..., 9.3132e-10, -2.7940e-09, 0.0000e+00], ..., [ 0.0000e+00, 4.6566e-10, 3.7253e-09, ..., 1.8626e-09, 6.5193e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 4.6566e-10, 5.1223e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 4.6566e-10, 3.3062e-08, 0.0000e+00]], device='cuda:0') Epoch 455, bias, value: tensor([ 0.0003, -0.0404, 0.0110, -0.0068, 0.0087, 0.0065, 0.0183, 0.0085, -0.0448, -0.0149], device='cuda:0'), grad: tensor([ 1.8626e-09, 3.9116e-08, 2.5611e-08, -5.4482e-08, -3.1758e-07, 2.3283e-08, -3.2131e-08, 3.0268e-08, 2.6077e-08, 2.6589e-07], device='cuda:0') 100 1e-05 changing lr epoch 454, time 247.38, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4366 re_mapping 0.0026 re_causal 0.0098 /// teacc 99.15 lr 0.00001000 Epoch 456, weight, value: tensor([[ 0.0211, -0.2028, -0.1991, ..., -0.3914, -0.1358, -0.1783], [ 0.0882, -0.0988, 0.0445, ..., 0.0455, 0.1441, -0.0689], [-0.1119, 0.1644, -0.2379, ..., 0.0592, 0.1037, -0.0417], ..., [-0.0791, -0.1047, -0.0852, ..., 0.0132, -0.2369, 0.1775], [ 0.0475, -0.0560, 0.1398, ..., 0.0083, -0.2731, -0.0415], [-0.2592, -0.1338, -0.1863, ..., -0.3347, 0.0886, -0.1109]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.1910e-09, ..., 2.0955e-09, 4.8894e-09, 0.0000e+00], [ 0.0000e+00, 7.9162e-08, 2.4820e-07, ..., 2.1067e-06, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 1.7695e-08, ..., 7.3574e-08, -8.1956e-08, 0.0000e+00], ..., [ 0.0000e+00, -8.2888e-08, -2.8219e-07, ..., -2.2389e-06, 3.0035e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.1642e-09, 4.8894e-09, 0.0000e+00], [ 0.0000e+00, 6.9849e-10, 6.9849e-10, ..., 1.2107e-08, -8.4285e-08, 0.0000e+00]], device='cuda:0') Epoch 456, bias, value: tensor([ 0.0003, -0.0403, 0.0110, -0.0068, 0.0088, 0.0065, 0.0184, 0.0084, -0.0449, -0.0150], device='cuda:0'), grad: tensor([ 1.4435e-08, 3.9116e-06, 5.6112e-08, 8.3819e-08, 2.6752e-07, 6.0536e-09, 1.6997e-08, -4.1425e-06, 1.0477e-08, -2.2841e-07], device='cuda:0') 100 1e-05 changing lr epoch 455, time 247.40, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4391 re_mapping 0.0026 re_causal 0.0097 /// teacc 99.15 lr 0.00001000 Epoch 457, weight, value: tensor([[ 0.0211, -0.2028, -0.1992, ..., -0.3914, -0.1359, -0.1784], [ 0.0881, -0.0987, 0.0445, ..., 0.0455, 0.1442, -0.0689], [-0.1119, 0.1645, -0.2381, ..., 0.0592, 0.1037, -0.0417], ..., [-0.0791, -0.1047, -0.0852, ..., 0.0133, -0.2370, 0.1775], [ 0.0475, -0.0560, 0.1399, ..., 0.0083, -0.2731, -0.0415], [-0.2592, -0.1338, -0.1864, ..., -0.3349, 0.0887, -0.1109]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 9.3132e-10, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 4.6566e-08, ..., 1.2480e-07, -1.4435e-08, -4.6566e-10], [ 0.0000e+00, -9.3132e-10, 6.0536e-09, ..., 7.9162e-09, -1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, 4.6566e-10, -8.3353e-08, ..., -1.9791e-07, 1.6298e-08, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 4.6566e-09, ..., 8.8476e-09, 1.3970e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.6298e-08, ..., 3.4925e-08, -4.6566e-10, 0.0000e+00]], device='cuda:0') Epoch 457, bias, value: tensor([ 0.0003, -0.0404, 0.0110, -0.0068, 0.0087, 0.0065, 0.0184, 0.0085, -0.0449, -0.0149], device='cuda:0'), grad: tensor([ 4.1910e-09, 4.6846e-07, 3.0268e-08, 9.4529e-08, 9.3132e-10, -2.7940e-09, -9.3132e-10, -7.6089e-07, 3.7719e-08, 1.4063e-07], device='cuda:0') 100 1e-05 changing lr epoch 456, time 247.29, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4462 re_mapping 0.0026 re_causal 0.0098 /// teacc 99.19 lr 0.00001000 Epoch 458, weight, value: tensor([[ 0.0211, -0.2029, -0.1992, ..., -0.3916, -0.1361, -0.1784], [ 0.0881, -0.0987, 0.0444, ..., 0.0453, 0.1443, -0.0689], [-0.1120, 0.1645, -0.2381, ..., 0.0593, 0.1037, -0.0417], ..., [-0.0790, -0.1048, -0.0850, ..., 0.0134, -0.2371, 0.1776], [ 0.0475, -0.0561, 0.1399, ..., 0.0082, -0.2733, -0.0415], [-0.2592, -0.1339, -0.1866, ..., -0.3352, 0.0887, -0.1109]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 4.6566e-10, 4.6566e-10, ..., -2.7940e-09, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -9.3132e-10, ..., -9.3132e-10, 1.8626e-09, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 458, bias, value: tensor([ 0.0003, -0.0405, 0.0110, -0.0069, 0.0087, 0.0065, 0.0185, 0.0086, -0.0450, -0.0149], device='cuda:0'), grad: tensor([ 5.5879e-08, 1.1642e-08, 6.0536e-09, 8.8476e-09, 1.1642e-08, 3.3528e-08, -4.0513e-08, 4.7963e-08, 9.4529e-08, -2.2724e-07], device='cuda:0') 100 1e-05 changing lr epoch 457, time 247.51, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4454 re_mapping 0.0026 re_causal 0.0095 /// teacc 99.17 lr 0.00001000 Epoch 459, weight, value: tensor([[ 0.0211, -0.2029, -0.1993, ..., -0.3917, -0.1361, -0.1784], [ 0.0881, -0.0987, 0.0444, ..., 0.0453, 0.1444, -0.0689], [-0.1121, 0.1646, -0.2382, ..., 0.0592, 0.1037, -0.0417], ..., [-0.0790, -0.1048, -0.0851, ..., 0.0134, -0.2372, 0.1776], [ 0.0475, -0.0561, 0.1399, ..., 0.0082, -0.2734, -0.0415], [-0.2592, -0.1339, -0.1866, ..., -0.3353, 0.0887, -0.1109]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.7940e-09, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 1.2107e-08, ..., 5.5879e-09, 0.0000e+00, 0.0000e+00], ..., [0.0000e+00, 0.0000e+00, 4.6566e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 459, bias, value: tensor([ 0.0003, -0.0405, 0.0110, -0.0069, 0.0087, 0.0065, 0.0185, 0.0086, -0.0451, -0.0150], device='cuda:0'), grad: tensor([ 4.6566e-09, 4.6566e-09, 2.1420e-08, -3.4459e-08, 0.0000e+00, 1.5832e-08, -2.0489e-08, 5.5879e-09, 9.3132e-10, 2.7940e-09], device='cuda:0') 100 1e-05 changing lr epoch 458, time 247.28, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4235 re_mapping 0.0026 re_causal 0.0095 /// teacc 99.16 lr 0.00001000 Epoch 460, weight, value: tensor([[ 0.0211, -0.2029, -0.1994, ..., -0.3918, -0.1362, -0.1784], [ 0.0881, -0.0987, 0.0445, ..., 0.0453, 0.1444, -0.0690], [-0.1121, 0.1647, -0.2383, ..., 0.0593, 0.1038, -0.0417], ..., [-0.0790, -0.1049, -0.0851, ..., 0.0135, -0.2373, 0.1777], [ 0.0475, -0.0561, 0.1400, ..., 0.0082, -0.2735, -0.0415], [-0.2592, -0.1340, -0.1867, ..., -0.3356, 0.0887, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, 2.0489e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-09, ..., -1.8626e-09, -1.1176e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 1.8626e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, -2.0489e-08, 0.0000e+00]], device='cuda:0') Epoch 460, bias, value: tensor([ 0.0003, -0.0405, 0.0110, -0.0069, 0.0087, 0.0065, 0.0185, 0.0086, -0.0451, -0.0150], device='cuda:0'), grad: tensor([ 6.4261e-08, 5.5879e-09, 2.7940e-09, -2.7940e-09, 3.2596e-08, -2.0489e-08, -3.6322e-08, 2.7008e-08, 1.5832e-08, -8.2888e-08], device='cuda:0') 100 1e-05 changing lr epoch 459, time 246.92, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4179 re_mapping 0.0026 re_causal 0.0097 /// teacc 99.18 lr 0.00001000 Epoch 461, weight, value: tensor([[ 0.0211, -0.2030, -0.1995, ..., -0.3919, -0.1363, -0.1784], [ 0.0881, -0.0987, 0.0445, ..., 0.0453, 0.1444, -0.0691], [-0.1121, 0.1647, -0.2385, ..., 0.0593, 0.1038, -0.0416], ..., [-0.0790, -0.1049, -0.0851, ..., 0.0135, -0.2374, 0.1777], [ 0.0475, -0.0562, 0.1400, ..., 0.0082, -0.2736, -0.0415], [-0.2592, -0.1340, -0.1868, ..., -0.3358, 0.0888, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.7940e-09, ..., 0.0000e+00, -2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 6.5193e-09, ..., 4.6566e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-09, ..., -8.3819e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 461, bias, value: tensor([ 0.0003, -0.0406, 0.0110, -0.0069, 0.0088, 0.0065, 0.0185, 0.0087, -0.0452, -0.0150], device='cuda:0'), grad: tensor([ 9.3132e-10, -2.7940e-09, 1.5832e-08, -3.7253e-09, 0.0000e+00, 2.7940e-09, 0.0000e+00, -1.6764e-08, 4.6566e-09, 1.8626e-09], device='cuda:0') 100 1e-05 changing lr epoch 460, time 247.06, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4392 re_mapping 0.0026 re_causal 0.0096 /// teacc 99.17 lr 0.00001000 Epoch 462, weight, value: tensor([[ 0.0211, -0.2030, -0.1996, ..., -0.3920, -0.1363, -0.1784], [ 0.0881, -0.0987, 0.0445, ..., 0.0453, 0.1446, -0.0691], [-0.1121, 0.1647, -0.2386, ..., 0.0593, 0.1038, -0.0416], ..., [-0.0790, -0.1050, -0.0851, ..., 0.0135, -0.2375, 0.1777], [ 0.0475, -0.0562, 0.1401, ..., 0.0083, -0.2737, -0.0415], [-0.2593, -0.1340, -0.1870, ..., -0.3360, 0.0887, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, -2.0489e-08, ..., -0.0000e+00, -1.3039e-08, 0.0000e+00], [ 0.0000e+00, -2.4214e-08, 1.1176e-08, ..., -4.0978e-08, -1.2107e-08, 0.0000e+00], ..., [ 0.0000e+00, 2.7008e-08, 2.7940e-08, ..., 5.5879e-08, 1.4901e-08, 0.0000e+00], [ 0.0000e+00, -5.5879e-09, -2.0489e-08, ..., -1.1176e-08, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 462, bias, value: tensor([ 0.0003, -0.0406, 0.0110, -0.0069, 0.0088, 0.0065, 0.0185, 0.0087, -0.0452, -0.0150], device='cuda:0'), grad: tensor([ 8.3819e-09, -5.4017e-08, -7.3574e-08, -3.0734e-08, 9.3132e-10, 1.9558e-08, 4.6566e-09, 1.6391e-07, -3.7253e-08, 3.7253e-09], device='cuda:0') 100 1e-05 changing lr epoch 461, time 247.09, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4306 re_mapping 0.0026 re_causal 0.0095 /// teacc 99.18 lr 0.00001000 Epoch 463, weight, value: tensor([[ 0.0211, -0.2030, -0.1997, ..., -0.3920, -0.1365, -0.1784], [ 0.0881, -0.0986, 0.0445, ..., 0.0453, 0.1447, -0.0691], [-0.1121, 0.1647, -0.2389, ..., 0.0592, 0.1037, -0.0416], ..., [-0.0790, -0.1050, -0.0851, ..., 0.0136, -0.2376, 0.1777], [ 0.0475, -0.0562, 0.1402, ..., 0.0083, -0.2739, -0.0415], [-0.2593, -0.1341, -0.1873, ..., -0.3364, 0.0887, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -9.3132e-10, 4.6566e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, -4.6566e-09, 0.0000e+00, ..., -1.1176e-08, 1.4901e-08, 0.0000e+00], ..., [ 0.0000e+00, 2.7940e-09, 1.8626e-09, ..., 8.3819e-09, 8.3819e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, -9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 463, bias, value: tensor([ 0.0003, -0.0405, 0.0109, -0.0069, 0.0087, 0.0065, 0.0185, 0.0087, -0.0453, -0.0151], device='cuda:0'), grad: tensor([ 1.1176e-08, 6.5193e-09, 5.6811e-08, 1.5832e-08, 1.5832e-08, -1.8626e-09, -1.3784e-07, 1.9558e-08, 1.2107e-08, 5.5879e-09], device='cuda:0') 100 1e-05 changing lr epoch 462, time 246.92, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4380 re_mapping 0.0026 re_causal 0.0097 /// teacc 99.15 lr 0.00001000 Epoch 464, weight, value: tensor([[ 0.0211, -0.2030, -0.1998, ..., -0.3921, -0.1367, -0.1784], [ 0.0881, -0.0987, 0.0445, ..., 0.0451, 0.1447, -0.0691], [-0.1121, 0.1649, -0.2389, ..., 0.0592, 0.1038, -0.0416], ..., [-0.0791, -0.1050, -0.0850, ..., 0.0137, -0.2377, 0.1777], [ 0.0474, -0.0562, 0.1402, ..., 0.0083, -0.2741, -0.0415], [-0.2593, -0.1341, -0.1873, ..., -0.3366, 0.0887, -0.1110]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 1.3039e-08, 0.0000e+00], [ 0.0000e+00, -7.4506e-07, -3.0324e-06, ..., -4.1611e-06, -3.8631e-06, 0.0000e+00], [ 0.0000e+00, 4.6566e-09, 7.4506e-08, ..., 1.0151e-07, 7.7300e-08, 0.0000e+00], ..., [ 0.0000e+00, 7.3109e-07, 2.9393e-06, ..., 4.0308e-06, 3.7681e-06, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, -3.7253e-09, ..., -1.8626e-09, 1.3970e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-09, -3.1665e-08, 0.0000e+00]], device='cuda:0') Epoch 464, bias, value: tensor([ 0.0002, -0.0407, 0.0110, -0.0069, 0.0088, 0.0065, 0.0186, 0.0088, -0.0454, -0.0152], device='cuda:0'), grad: tensor([ 4.7497e-08, -8.2180e-06, 2.0955e-07, 3.8184e-08, 4.1910e-08, 1.4901e-08, -5.2154e-08, 7.9498e-06, 3.1665e-08, -6.9849e-08], device='cuda:0') 100 1e-05 changing lr epoch 463, time 246.96, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4226 re_mapping 0.0026 re_causal 0.0095 /// teacc 99.18 lr 0.00001000 Epoch 465, weight, value: tensor([[ 0.0211, -0.2031, -0.1999, ..., -0.3922, -0.1368, -0.1784], [ 0.0881, -0.0985, 0.0446, ..., 0.0453, 0.1449, -0.0691], [-0.1122, 0.1649, -0.2391, ..., 0.0592, 0.1038, -0.0416], ..., [-0.0790, -0.1053, -0.0851, ..., 0.0136, -0.2380, 0.1778], [ 0.0475, -0.0562, 0.1403, ..., 0.0083, -0.2742, -0.0415], [-0.2593, -0.1342, -0.1874, ..., -0.3367, 0.0887, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -9.3132e-10, 9.3132e-10, ..., -9.3132e-10, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 1.5832e-08, 2.7940e-09, ..., 2.0489e-08, 1.4901e-08, 0.0000e+00], [ 0.0000e+00, -2.7008e-08, -1.1176e-08, ..., -4.0047e-08, -3.0734e-08, 0.0000e+00], ..., [ 0.0000e+00, 3.7253e-09, 9.3132e-10, ..., 4.6566e-09, 8.3819e-09, 0.0000e+00], [ 0.0000e+00, 4.6566e-09, 9.3132e-10, ..., 7.4506e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, -4.6566e-09, 0.0000e+00]], device='cuda:0') Epoch 465, bias, value: tensor([ 0.0002, -0.0406, 0.0109, -0.0069, 0.0088, 0.0065, 0.0186, 0.0088, -0.0454, -0.0152], device='cuda:0'), grad: tensor([-3.7253e-08, 6.7987e-08, -1.1176e-07, 2.6077e-08, 5.5879e-09, 0.0000e+00, 2.7940e-09, 4.4703e-08, 2.1420e-08, -1.5832e-08], device='cuda:0') 100 1e-05 changing lr epoch 464, time 247.19, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4305 re_mapping 0.0025 re_causal 0.0095 /// teacc 99.16 lr 0.00001000 Epoch 466, weight, value: tensor([[ 0.0211, -0.2031, -0.2000, ..., -0.3923, -0.1370, -0.1784], [ 0.0881, -0.0984, 0.0446, ..., 0.0452, 0.1451, -0.0691], [-0.1122, 0.1649, -0.2394, ..., 0.0591, 0.1038, -0.0416], ..., [-0.0790, -0.1054, -0.0851, ..., 0.0137, -0.2381, 0.1778], [ 0.0474, -0.0563, 0.1403, ..., 0.0083, -0.2743, -0.0415], [-0.2593, -0.1343, -0.1875, ..., -0.3370, 0.0887, -0.1110]], device='cuda:0'), grad: tensor([[ 4.6566e-09, 0.0000e+00, 4.6566e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-0.0000e+00, -9.3132e-10, -1.5832e-08, ..., -1.1176e-08, -2.0489e-08, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 9.3132e-10, ..., 6.5193e-09, -0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -1.8626e-09, 1.2107e-08, ..., 1.8626e-09, 1.8626e-08, 0.0000e+00], [-5.5879e-09, 0.0000e+00, -5.5879e-09, ..., 9.3132e-10, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 1.8626e-09, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 466, bias, value: tensor([ 0.0001, -0.0406, 0.0109, -0.0069, 0.0088, 0.0065, 0.0186, 0.0088, -0.0455, -0.0152], device='cuda:0'), grad: tensor([ 4.5635e-08, -2.0489e-08, 2.4214e-08, 4.6566e-09, 0.0000e+00, 4.6566e-09, 4.6566e-09, -3.5390e-08, -5.2154e-08, 3.7253e-08], device='cuda:0') 100 1e-05 changing lr epoch 465, time 247.01, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4238 re_mapping 0.0027 re_causal 0.0097 /// teacc 99.16 lr 0.00001000 Epoch 467, weight, value: tensor([[ 0.0211, -0.2031, -0.2001, ..., -0.3924, -0.1372, -0.1784], [ 0.0881, -0.0982, 0.0447, ..., 0.0453, 0.1452, -0.0692], [-0.1122, 0.1649, -0.2395, ..., 0.0591, 0.1038, -0.0416], ..., [-0.0790, -0.1056, -0.0852, ..., 0.0136, -0.2383, 0.1779], [ 0.0474, -0.0563, 0.1405, ..., 0.0084, -0.2744, -0.0415], [-0.2593, -0.1343, -0.1876, ..., -0.3371, 0.0887, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 3.7253e-09, ..., 2.0489e-08, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 9.3132e-10, ..., 2.4214e-08, -2.7940e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., -4.7497e-08, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, 8.3819e-09, 0.0000e+00]], device='cuda:0') Epoch 467, bias, value: tensor([ 7.9802e-05, -4.0509e-02, 1.0798e-02, -6.9628e-03, 8.8327e-03, 6.5426e-03, 1.8661e-02, 8.7374e-03, -4.5480e-02, -1.5290e-02], device='cuda:0'), grad: tensor([ 9.3132e-09, 1.1642e-07, 1.1735e-07, 4.6566e-09, -3.5390e-08, -4.8429e-08, 2.3283e-08, -2.3749e-07, 1.1176e-08, 4.0047e-08], device='cuda:0') 100 1e-05 changing lr epoch 466, time 248.17, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4230 re_mapping 0.0026 re_causal 0.0094 /// teacc 99.17 lr 0.00001000 Epoch 468, weight, value: tensor([[ 0.0212, -0.2031, -0.2001, ..., -0.3925, -0.1373, -0.1784], [ 0.0881, -0.0982, 0.0449, ..., 0.0454, 0.1454, -0.0692], [-0.1122, 0.1650, -0.2397, ..., 0.0591, 0.1038, -0.0415], ..., [-0.0790, -0.1057, -0.0854, ..., 0.0135, -0.2385, 0.1779], [ 0.0474, -0.0563, 0.1406, ..., 0.0084, -0.2745, -0.0415], [-0.2593, -0.1343, -0.1877, ..., -0.3374, 0.0887, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, -5.5879e-09, 9.3132e-10, ..., -1.5832e-08, -1.9558e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 9.3132e-10, ..., 6.5193e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, 2.7940e-09, 0.0000e+00]], device='cuda:0') Epoch 468, bias, value: tensor([ 5.8610e-05, -4.0401e-02, 1.0800e-02, -7.0061e-03, 8.8884e-03, 6.5748e-03, 1.8683e-02, 8.6304e-03, -4.5473e-02, -1.5358e-02], device='cuda:0'), grad: tensor([ 4.6566e-09, 7.4506e-09, -5.3085e-08, 4.6566e-09, 1.8626e-09, 0.0000e+00, 1.8626e-09, 1.6764e-08, 5.5879e-09, 1.3039e-08], device='cuda:0') 100 1e-05 changing lr epoch 467, time 247.74, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4198 re_mapping 0.0026 re_causal 0.0096 /// teacc 99.16 lr 0.00001000 Epoch 469, weight, value: tensor([[ 0.0212, -0.2032, -0.2002, ..., -0.3927, -0.1375, -0.1784], [ 0.0881, -0.0981, 0.0450, ..., 0.0455, 0.1455, -0.0693], [-0.1122, 0.1650, -0.2398, ..., 0.0591, 0.1038, -0.0415], ..., [-0.0790, -0.1058, -0.0854, ..., 0.0135, -0.2386, 0.1779], [ 0.0474, -0.0563, 0.1406, ..., 0.0085, -0.2746, -0.0415], [-0.2593, -0.1344, -0.1879, ..., -0.3377, 0.0887, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.2107e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.1176e-08, 0.0000e+00], [ 0.0000e+00, -3.7253e-09, 1.8626e-09, ..., -3.7253e-09, 6.4261e-08, 0.0000e+00], ..., [ 0.0000e+00, 2.7940e-09, 1.8626e-09, ..., 6.5193e-09, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -9.3132e-10, ..., -9.3132e-10, 1.3970e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 469, bias, value: tensor([ 5.1839e-05, -4.0382e-02, 1.0822e-02, -7.0342e-03, 8.8915e-03, 6.6138e-03, 1.8662e-02, 8.6278e-03, -4.5544e-02, -1.5432e-02], device='cuda:0'), grad: tensor([ 1.6764e-08, 2.8871e-08, 1.4063e-07, -7.4506e-09, 7.6368e-08, 8.3819e-09, -2.9802e-07, 1.7695e-08, 2.9802e-08, -3.7253e-09], device='cuda:0') 100 1e-05 changing lr epoch 468, time 247.51, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4395 re_mapping 0.0027 re_causal 0.0098 /// teacc 99.15 lr 0.00001000 Epoch 470, weight, value: tensor([[ 0.0212, -0.2032, -0.2003, ..., -0.3927, -0.1376, -0.1784], [ 0.0881, -0.0982, 0.0449, ..., 0.0454, 0.1456, -0.0693], [-0.1122, 0.1651, -0.2399, ..., 0.0591, 0.1039, -0.0415], ..., [-0.0790, -0.1058, -0.0854, ..., 0.0136, -0.2387, 0.1780], [ 0.0474, -0.0564, 0.1407, ..., 0.0085, -0.2747, -0.0415], [-0.2593, -0.1344, -0.1881, ..., -0.3379, 0.0887, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.4901e-08, ..., -5.5879e-09, -1.2107e-08, 0.0000e+00], [ 0.0000e+00, -2.7940e-09, 1.8626e-09, ..., -5.5879e-09, -1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 1.1176e-08, ..., 9.3132e-09, 1.1176e-08, 0.0000e+00], [-0.0000e+00, -9.3132e-10, -1.3039e-08, ..., -5.5879e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, -1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 470, bias, value: tensor([ 3.2656e-05, -4.0442e-02, 1.0837e-02, -7.0698e-03, 8.9484e-03, 6.6815e-03, 1.8622e-02, 8.7021e-03, -4.5662e-02, -1.5547e-02], device='cuda:0'), grad: tensor([ 7.4506e-09, -3.0734e-08, -6.5193e-09, 1.4901e-08, 4.6566e-09, 1.4901e-08, -7.4506e-09, 3.4459e-08, -3.0734e-08, -2.7940e-09], device='cuda:0') 100 1e-05 changing lr epoch 469, time 247.42, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4242 re_mapping 0.0026 re_causal 0.0095 /// teacc 99.18 lr 0.00001000 Epoch 471, weight, value: tensor([[ 0.0212, -0.2032, -0.2003, ..., -0.3928, -0.1376, -0.1784], [ 0.0881, -0.0983, 0.0448, ..., 0.0452, 0.1457, -0.0694], [-0.1122, 0.1652, -0.2400, ..., 0.0591, 0.1039, -0.0415], ..., [-0.0790, -0.1058, -0.0853, ..., 0.0138, -0.2389, 0.1782], [ 0.0474, -0.0564, 0.1407, ..., 0.0085, -0.2748, -0.0415], [-0.2594, -0.1345, -0.1882, ..., -0.3382, 0.0886, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 9.3132e-10, ..., 9.3132e-10, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.4901e-08, ..., -5.5879e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 1.8626e-09, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 471, bias, value: tensor([ 6.4606e-05, -4.0588e-02, 1.0845e-02, -7.0959e-03, 8.9410e-03, 6.6904e-03, 1.8641e-02, 8.8803e-03, -4.5703e-02, -1.5671e-02], device='cuda:0'), grad: tensor([ 0.0000e+00, 9.3132e-09, 7.4506e-09, 5.3085e-08, -1.9558e-08, -5.6811e-08, 1.0245e-08, 4.6566e-09, -2.6077e-08, 1.4901e-08], device='cuda:0') 100 1e-05 changing lr epoch 470, time 247.23, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4382 re_mapping 0.0026 re_causal 0.0096 /// teacc 99.18 lr 0.00001000 Epoch 472, weight, value: tensor([[ 0.0212, -0.2033, -0.2004, ..., -0.3930, -0.1378, -0.1785], [ 0.0881, -0.0983, 0.0449, ..., 0.0452, 0.1458, -0.0694], [-0.1123, 0.1653, -0.2401, ..., 0.0592, 0.1040, -0.0415], ..., [-0.0789, -0.1058, -0.0853, ..., 0.0138, -0.2390, 0.1782], [ 0.0474, -0.0564, 0.1408, ..., 0.0085, -0.2750, -0.0415], [-0.2594, -0.1345, -0.1884, ..., -0.3385, 0.0887, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 5.9605e-08, ..., 8.3819e-09, 3.0734e-08, 0.0000e+00], [ 0.0000e+00, 4.6566e-09, -1.0245e-07, ..., 1.9558e-08, -3.5390e-08, 0.0000e+00], [ 0.0000e+00, -6.5193e-09, 7.4506e-09, ..., -4.1910e-08, -2.6077e-08, 0.0000e+00], ..., [ 0.0000e+00, 1.8626e-09, 9.3132e-09, ..., 1.5832e-08, 1.2107e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.6322e-08, ..., 1.0245e-08, 1.8626e-08, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 8.3819e-09, 0.0000e+00]], device='cuda:0') Epoch 472, bias, value: tensor([ 2.1036e-05, -4.0563e-02, 1.0875e-02, -7.1199e-03, 8.9023e-03, 6.7147e-03, 1.8649e-02, 8.8817e-03, -4.5842e-02, -1.5727e-02], device='cuda:0'), grad: tensor([ 1.3877e-07, -2.0955e-07, -5.1223e-08, -3.4459e-08, -3.1665e-08, -6.4261e-08, 6.1467e-08, 4.4703e-08, 1.2387e-07, 3.5390e-08], device='cuda:0') 100 1e-05 changing lr epoch 471, time 247.25, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4239 re_mapping 0.0025 re_causal 0.0094 /// teacc 99.19 lr 0.00001000 Epoch 473, weight, value: tensor([[ 0.0212, -0.2033, -0.2005, ..., -0.3931, -0.1380, -0.1785], [ 0.0881, -0.0983, 0.0450, ..., 0.0453, 0.1459, -0.0694], [-0.1123, 0.1654, -0.2401, ..., 0.0592, 0.1040, -0.0415], ..., [-0.0789, -0.1059, -0.0854, ..., 0.0137, -0.2392, 0.1782], [ 0.0474, -0.0564, 0.1408, ..., 0.0085, -0.2751, -0.0415], [-0.2594, -0.1346, -0.1885, ..., -0.3386, 0.0887, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 1.8626e-09, ..., 7.4506e-09, 1.8626e-09, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 1.0245e-08, ..., 7.4506e-09, 3.7253e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., -1.0245e-08, 2.7940e-09, -3.7253e-09], [ 9.3132e-10, 0.0000e+00, 2.1420e-08, ..., 1.1176e-08, 9.3132e-10, 0.0000e+00], [ 3.7253e-09, 0.0000e+00, 9.3132e-10, ..., 5.5879e-09, -9.3132e-10, 9.3132e-10]], device='cuda:0') Epoch 473, bias, value: tensor([ 3.3948e-05, -4.0497e-02, 1.0873e-02, -7.1374e-03, 8.8745e-03, 6.7238e-03, 1.8661e-02, 8.8219e-03, -4.5903e-02, -1.5720e-02], device='cuda:0'), grad: tensor([-6.5193e-09, 6.5193e-08, 4.0978e-08, -7.1712e-08, -4.0978e-08, 1.7695e-08, 1.8626e-09, -8.5682e-08, 5.4948e-08, 3.0734e-08], device='cuda:0') 100 1e-05 changing lr epoch 472, time 247.47, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4149 re_mapping 0.0026 re_causal 0.0094 /// teacc 99.17 lr 0.00001000 Epoch 474, weight, value: tensor([[ 0.0212, -0.2033, -0.2006, ..., -0.3931, -0.1381, -0.1785], [ 0.0880, -0.0983, 0.0451, ..., 0.0453, 0.1460, -0.0696], [-0.1123, 0.1654, -0.2403, ..., 0.0592, 0.1040, -0.0415], ..., [-0.0789, -0.1059, -0.0855, ..., 0.0137, -0.2393, 0.1784], [ 0.0474, -0.0565, 0.1409, ..., 0.0084, -0.2752, -0.0415], [-0.2594, -0.1346, -0.1886, ..., -0.3389, 0.0888, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 1.1176e-08, -1.8626e-09, ..., 2.2352e-08, 3.5390e-08, 0.0000e+00], [ 0.0000e+00, -1.3970e-08, -9.3132e-10, ..., -2.4214e-08, -5.4017e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., -6.5193e-09, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 3.7253e-09, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 474, bias, value: tensor([ 6.5861e-05, -4.0483e-02, 1.0831e-02, -7.1458e-03, 8.8257e-03, 6.7321e-03, 1.8672e-02, 8.8335e-03, -4.5961e-02, -1.5763e-02], device='cuda:0'), grad: tensor([ 3.7253e-09, 6.7055e-08, -7.4506e-08, 4.6566e-09, 1.0245e-08, -7.4506e-09, -9.3132e-09, -2.2352e-08, 1.6764e-08, 8.3819e-09], device='cuda:0') 100 1e-05 changing lr epoch 473, time 247.48, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4473 re_mapping 0.0026 re_causal 0.0098 /// teacc 99.17 lr 0.00001000 Epoch 475, weight, value: tensor([[ 0.0212, -0.2033, -0.2007, ..., -0.3932, -0.1382, -0.1785], [ 0.0880, -0.0982, 0.0451, ..., 0.0453, 0.1461, -0.0696], [-0.1123, 0.1655, -0.2404, ..., 0.0592, 0.1040, -0.0415], ..., [-0.0789, -0.1060, -0.0855, ..., 0.0137, -0.2394, 0.1784], [ 0.0474, -0.0565, 0.1408, ..., 0.0084, -0.2753, -0.0415], [-0.2594, -0.1346, -0.1886, ..., -0.3391, 0.0888, -0.1110]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.7940e-09, -6.5193e-09, ..., 2.7940e-09, -5.5879e-09, 0.0000e+00], [ 0.0000e+00, -2.6077e-08, 0.0000e+00, ..., -4.3772e-08, -1.0245e-08, 0.0000e+00], ..., [ 0.0000e+00, 2.1420e-08, 6.5193e-09, ..., 3.8184e-08, 1.5832e-08, -0.0000e+00], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 3.7253e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 475, bias, value: tensor([ 5.7248e-05, -4.0466e-02, 1.0816e-02, -7.1474e-03, 8.8152e-03, 6.7775e-03, 1.8677e-02, 8.8251e-03, -4.6215e-02, -1.5813e-02], device='cuda:0'), grad: tensor([-3.7253e-09, 1.8626e-09, -1.2387e-07, 1.8626e-09, -2.7940e-09, 2.7940e-09, -2.7940e-09, 1.1642e-07, 1.2107e-08, 1.8626e-09], device='cuda:0') 100 1e-05 changing lr epoch 474, time 247.54, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4202 re_mapping 0.0026 re_causal 0.0096 /// teacc 99.21 lr 0.00001000 Epoch 476, weight, value: tensor([[ 0.0213, -0.2033, -0.2007, ..., -0.3932, -0.1383, -0.1785], [ 0.0880, -0.0982, 0.0452, ..., 0.0454, 0.1462, -0.0697], [-0.1123, 0.1655, -0.2406, ..., 0.0591, 0.1039, -0.0415], ..., [-0.0789, -0.1061, -0.0855, ..., 0.0137, -0.2395, 0.1785], [ 0.0474, -0.0565, 0.1409, ..., 0.0084, -0.2754, -0.0415], [-0.2594, -0.1347, -0.1887, ..., -0.3393, 0.0888, -0.1111]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.7940e-09, ..., -0.0000e+00, -1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., -3.7253e-09, 4.6566e-09, -9.3132e-10], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., -9.3132e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.9558e-08, 0.0000e+00]], device='cuda:0') Epoch 476, bias, value: tensor([ 3.3043e-05, -4.0450e-02, 1.0697e-02, -7.1433e-03, 8.8149e-03, 6.7709e-03, 1.8719e-02, 8.8479e-03, -4.6248e-02, -1.5884e-02], device='cuda:0'), grad: tensor([ 1.7695e-08, 1.8626e-09, 1.3970e-08, 6.5193e-09, -6.5193e-09, -2.8871e-08, 2.9802e-08, -8.3819e-09, 3.7253e-09, -2.3283e-08], device='cuda:0') 100 1e-05 changing lr epoch 475, time 246.71, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4327 re_mapping 0.0026 re_causal 0.0098 /// teacc 99.17 lr 0.00001000 Epoch 477, weight, value: tensor([[ 0.0213, -0.2034, -0.2008, ..., -0.3933, -0.1384, -0.1785], [ 0.0880, -0.0981, 0.0453, ..., 0.0454, 0.1463, -0.0697], [-0.1123, 0.1656, -0.2408, ..., 0.0591, 0.1040, -0.0415], ..., [-0.0789, -0.1062, -0.0856, ..., 0.0137, -0.2396, 0.1785], [ 0.0474, -0.0565, 0.1410, ..., 0.0084, -0.2755, -0.0415], [-0.2594, -0.1347, -0.1888, ..., -0.3394, 0.0888, -0.1111]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, -3.7253e-09, ..., -9.3132e-10, -1.0245e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 9.3132e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., -2.7940e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, -3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 477, bias, value: tensor([ 5.6391e-06, -4.0398e-02, 1.0677e-02, -7.1491e-03, 8.8128e-03, 6.7670e-03, 1.8741e-02, 8.8157e-03, -4.6258e-02, -1.5942e-02], device='cuda:0'), grad: tensor([ 6.5193e-09, -9.3132e-09, 2.7008e-08, 1.8626e-09, 1.3970e-08, 4.6566e-09, -3.5390e-08, -1.2107e-08, 7.4506e-09, -3.7253e-09], device='cuda:0') 100 1e-05 changing lr epoch 476, time 247.46, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4215 re_mapping 0.0025 re_causal 0.0095 /// teacc 99.14 lr 0.00001000 Epoch 478, weight, value: tensor([[ 0.0213, -0.2034, -0.2008, ..., -0.3934, -0.1385, -0.1785], [ 0.0880, -0.0982, 0.0453, ..., 0.0454, 0.1463, -0.0697], [-0.1123, 0.1656, -0.2408, ..., 0.0591, 0.1040, -0.0415], ..., [-0.0789, -0.1062, -0.0856, ..., 0.0137, -0.2397, 0.1786], [ 0.0474, -0.0566, 0.1410, ..., 0.0084, -0.2756, -0.0415], [-0.2595, -0.1347, -0.1888, ..., -0.3396, 0.0888, -0.1111]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -2.4214e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, 1.8626e-09, ..., 0.0000e+00, -7.4506e-09, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 2.7940e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-10, 2.7940e-09, ..., 2.7940e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 7.4506e-09, -8.3819e-09, ..., -3.7253e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 9.3132e-09, 9.3132e-10, ..., 0.0000e+00, -2.7940e-09, 0.0000e+00]], device='cuda:0') Epoch 478, bias, value: tensor([-3.3555e-05, -4.0387e-02, 1.0684e-02, -7.1559e-03, 8.7509e-03, 6.7723e-03, 1.8780e-02, 8.8098e-03, -4.6307e-02, -1.5952e-02], device='cuda:0'), grad: tensor([-1.4063e-07, 3.7253e-09, 2.7940e-08, -4.6566e-09, 6.5193e-09, -8.3819e-09, 1.3970e-08, 1.4901e-08, 3.8184e-08, 4.3772e-08], device='cuda:0') 100 1e-05 changing lr epoch 477, time 248.42, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4203 re_mapping 0.0026 re_causal 0.0095 /// teacc 99.16 lr 0.00001000 Epoch 479, weight, value: tensor([[ 0.0212, -0.2034, -0.2009, ..., -0.3936, -0.1388, -0.1785], [ 0.0880, -0.0982, 0.0454, ..., 0.0454, 0.1464, -0.0697], [-0.1124, 0.1657, -0.2409, ..., 0.0591, 0.1040, -0.0415], ..., [-0.0789, -0.1063, -0.0857, ..., 0.0137, -0.2398, 0.1786], [ 0.0474, -0.0566, 0.1410, ..., 0.0084, -0.2757, -0.0416], [-0.2595, -0.1348, -0.1890, ..., -0.3399, 0.0888, -0.1111]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.7789e-09, ..., 8.8476e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 2.7940e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.9558e-08, ..., -1.7229e-08, 1.3970e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, -4.1910e-09, 0.0000e+00]], device='cuda:0') Epoch 479, bias, value: tensor([-0.0001, -0.0404, 0.0107, -0.0072, 0.0088, 0.0068, 0.0188, 0.0088, -0.0464, -0.0161], device='cuda:0'), grad: tensor([ 1.8626e-09, 6.0536e-09, 4.0047e-08, 1.8626e-09, 3.2596e-09, 1.8161e-08, 1.0245e-08, 1.2573e-08, -6.7987e-08, -2.0489e-08], device='cuda:0') 100 1e-05 changing lr epoch 478, time 248.71, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4185 re_mapping 0.0025 re_causal 0.0094 /// teacc 99.14 lr 0.00001000 Epoch 480, weight, value: tensor([[ 0.0212, -0.2034, -0.2010, ..., -0.3936, -0.1389, -0.1786], [ 0.0880, -0.0983, 0.0455, ..., 0.0455, 0.1466, -0.0698], [-0.1124, 0.1659, -0.2411, ..., 0.0591, 0.1041, -0.0415], ..., [-0.0789, -0.1063, -0.0858, ..., 0.0136, -0.2400, 0.1788], [ 0.0474, -0.0566, 0.1411, ..., 0.0084, -0.2759, -0.0416], [-0.2595, -0.1348, -0.1891, ..., -0.3401, 0.0888, -0.1111]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 9.3132e-10, 1.3970e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.3283e-09, ..., 4.6566e-10, -1.3970e-09, 0.0000e+00], [ 0.0000e+00, -4.6566e-10, 9.3132e-10, ..., -3.3062e-08, -2.0489e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 2.0955e-08, 1.5367e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, 3.2596e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, -1.3970e-09, 0.0000e+00]], device='cuda:0') Epoch 480, bias, value: tensor([-0.0002, -0.0403, 0.0108, -0.0072, 0.0089, 0.0068, 0.0189, 0.0087, -0.0465, -0.0162], device='cuda:0'), grad: tensor([-7.4506e-09, -4.6566e-10, -5.7742e-08, 1.2573e-08, -2.3283e-09, -3.7253e-09, 5.5879e-09, 3.5856e-08, 1.4435e-08, 3.2596e-09], device='cuda:0') 100 1e-05 changing lr epoch 479, time 248.65, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4422 re_mapping 0.0026 re_causal 0.0095 /// teacc 99.15 lr 0.00001000 Epoch 481, weight, value: tensor([[ 0.0212, -0.2034, -0.2011, ..., -0.3937, -0.1390, -0.1786], [ 0.0880, -0.0982, 0.0456, ..., 0.0455, 0.1466, -0.0700], [-0.1124, 0.1660, -0.2411, ..., 0.0592, 0.1042, -0.0415], ..., [-0.0789, -0.1065, -0.0859, ..., 0.0136, -0.2401, 0.1790], [ 0.0474, -0.0566, 0.1411, ..., 0.0083, -0.2760, -0.0417], [-0.2595, -0.1349, -0.1891, ..., -0.3403, 0.0887, -0.1111]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.2596e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 2.3283e-09, 3.2596e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., 9.3132e-10, 1.7695e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.3970e-09, ..., -1.3970e-09, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 7.9162e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 4.6566e-10, 0.0000e+00]], device='cuda:0') Epoch 481, bias, value: tensor([-0.0002, -0.0403, 0.0108, -0.0072, 0.0090, 0.0068, 0.0189, 0.0087, -0.0466, -0.0164], device='cuda:0'), grad: tensor([ 5.0757e-08, 2.0955e-08, 4.9826e-08, 0.0000e+00, -3.7253e-08, 3.4925e-08, -2.2212e-07, -8.8476e-09, 1.1176e-07, 7.9162e-09], device='cuda:0') 100 1e-05 changing lr epoch 480, time 248.59, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4238 re_mapping 0.0026 re_causal 0.0094 /// teacc 99.13 lr 0.00001000 Epoch 482, weight, value: tensor([[ 0.0212, -0.2035, -0.2012, ..., -0.3939, -0.1392, -0.1786], [ 0.0880, -0.0982, 0.0456, ..., 0.0455, 0.1467, -0.0702], [-0.1124, 0.1660, -0.2413, ..., 0.0593, 0.1042, -0.0415], ..., [-0.0789, -0.1065, -0.0859, ..., 0.0137, -0.2401, 0.1792], [ 0.0474, -0.0566, 0.1411, ..., 0.0083, -0.2762, -0.0417], [-0.2595, -0.1349, -0.1892, ..., -0.3406, 0.0886, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 1.3970e-09, ..., 7.4506e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 4.1444e-08, 2.3283e-09, ..., 1.6484e-07, -1.3970e-09, 0.0000e+00], [ 0.0000e+00, 2.5146e-08, 1.5832e-08, ..., 1.0012e-07, 1.3970e-09, 0.0000e+00], ..., [ 0.0000e+00, -7.0315e-08, 2.3283e-09, ..., -2.8638e-07, 2.7940e-09, -1.3970e-09], [ 0.0000e+00, -5.5879e-09, -3.5856e-08, ..., -2.0955e-08, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 1.3970e-09, ..., 6.0536e-09, 3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 482, bias, value: tensor([-0.0003, -0.0403, 0.0108, -0.0073, 0.0092, 0.0069, 0.0189, 0.0088, -0.0467, -0.0166], device='cuda:0'), grad: tensor([ 2.9802e-08, 5.4995e-07, 2.5472e-07, 4.7032e-08, -8.8476e-09, 2.8871e-08, 9.7789e-09, -9.3272e-07, -8.8010e-08, 1.1362e-07], device='cuda:0') 100 1e-05 changing lr epoch 481, time 249.02, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4256 re_mapping 0.0026 re_causal 0.0095 /// teacc 99.12 lr 0.00001000 Epoch 483, weight, value: tensor([[ 0.0212, -0.2035, -0.2014, ..., -0.3941, -0.1394, -0.1786], [ 0.0880, -0.0984, 0.0456, ..., 0.0453, 0.1466, -0.0702], [-0.1124, 0.1663, -0.2413, ..., 0.0593, 0.1044, -0.0414], ..., [-0.0789, -0.1066, -0.0858, ..., 0.0139, -0.2402, 0.1792], [ 0.0474, -0.0567, 0.1411, ..., 0.0082, -0.2765, -0.0417], [-0.2595, -0.1349, -0.1893, ..., -0.3407, 0.0886, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.3039e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.5879e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.1176e-08, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, -9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 483, bias, value: tensor([-0.0004, -0.0405, 0.0109, -0.0073, 0.0092, 0.0069, 0.0190, 0.0090, -0.0470, -0.0166], device='cuda:0'), grad: tensor([ 6.1467e-08, 3.2596e-08, 2.7940e-09, 3.7253e-09, 1.8626e-09, 8.3819e-09, -7.3574e-08, -5.2154e-08, 1.2107e-08, 3.7253e-09], device='cuda:0') 100 1e-05 changing lr epoch 482, time 249.37, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4217 re_mapping 0.0025 re_causal 0.0094 /// teacc 99.15 lr 0.00001000 Epoch 484, weight, value: tensor([[ 0.0212, -0.2036, -0.2015, ..., -0.3942, -0.1397, -0.1786], [ 0.0880, -0.0984, 0.0456, ..., 0.0452, 0.1467, -0.0704], [-0.1124, 0.1663, -0.2414, ..., 0.0593, 0.1045, -0.0414], ..., [-0.0789, -0.1066, -0.0858, ..., 0.0140, -0.2403, 0.1794], [ 0.0474, -0.0567, 0.1411, ..., 0.0082, -0.2767, -0.0418], [-0.2595, -0.1349, -0.1894, ..., -0.3410, 0.0886, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, -9.3132e-10, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -9.3132e-10, ..., -0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 484, bias, value: tensor([-0.0006, -0.0406, 0.0109, -0.0073, 0.0092, 0.0069, 0.0191, 0.0091, -0.0471, -0.0167], device='cuda:0'), grad: tensor([-2.7940e-09, 6.5193e-09, 3.7253e-09, 0.0000e+00, -2.7940e-09, -9.3132e-10, -1.9558e-08, 3.7253e-09, 7.4506e-09, 3.7253e-09], device='cuda:0') 100 1e-05 changing lr epoch 483, time 248.82, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4317 re_mapping 0.0025 re_causal 0.0095 /// teacc 99.18 lr 0.00001000 Epoch 485, weight, value: tensor([[ 0.0212, -0.2036, -0.2016, ..., -0.3943, -0.1398, -0.1786], [ 0.0880, -0.0986, 0.0456, ..., 0.0451, 0.1467, -0.0704], [-0.1124, 0.1665, -0.2415, ..., 0.0593, 0.1046, -0.0414], ..., [-0.0789, -0.1066, -0.0858, ..., 0.0141, -0.2404, 0.1795], [ 0.0473, -0.0567, 0.1412, ..., 0.0082, -0.2768, -0.0418], [-0.2595, -0.1350, -0.1895, ..., -0.3411, 0.0886, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.5879e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 7.4506e-09, ..., 2.4214e-08, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.0245e-08, 9.3132e-10, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 9.3132e-10, ..., -4.9360e-08, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -8.6613e-08, ..., -4.9360e-08, -1.4901e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.5879e-09, -4.6566e-09, 0.0000e+00]], device='cuda:0') Epoch 485, bias, value: tensor([-0.0007, -0.0407, 0.0110, -0.0074, 0.0092, 0.0069, 0.0191, 0.0092, -0.0472, -0.0168], device='cuda:0'), grad: tensor([ 5.4017e-08, 8.5682e-08, 3.1665e-08, 4.6566e-08, 1.8626e-08, -2.7940e-09, 9.1270e-08, -1.1269e-07, -8.6613e-08, -1.2200e-07], device='cuda:0') 100 1e-05 changing lr epoch 484, time 248.67, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4355 re_mapping 0.0025 re_causal 0.0095 /// teacc 99.15 lr 0.00001000 Epoch 486, weight, value: tensor([[ 0.0213, -0.2036, -0.2017, ..., -0.3943, -0.1399, -0.1786], [ 0.0880, -0.0985, 0.0456, ..., 0.0451, 0.1468, -0.0704], [-0.1124, 0.1666, -0.2416, ..., 0.0593, 0.1047, -0.0414], ..., [-0.0790, -0.1067, -0.0858, ..., 0.0142, -0.2405, 0.1795], [ 0.0473, -0.0567, 0.1413, ..., 0.0082, -0.2770, -0.0418], [-0.2595, -0.1350, -0.1895, ..., -0.3413, 0.0886, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -9.3132e-10, 0.0000e+00], [ 0.0000e+00, 3.0734e-08, 3.7253e-09, ..., 1.1828e-07, 5.1223e-08, 0.0000e+00], [ 0.0000e+00, -3.6322e-08, 9.3132e-10, ..., -8.9407e-08, -6.1467e-08, 0.0000e+00], ..., [ 0.0000e+00, 4.6566e-09, -3.7253e-09, ..., -2.9802e-08, 1.3039e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.7940e-09, ..., -1.8626e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, -1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 486, bias, value: tensor([-0.0007, -0.0408, 0.0110, -0.0074, 0.0092, 0.0070, 0.0192, 0.0092, -0.0473, -0.0168], device='cuda:0'), grad: tensor([-1.3597e-07, 1.7975e-07, -8.2888e-08, 8.3819e-09, 1.8626e-09, 1.6764e-08, 3.1665e-08, -4.6566e-08, 1.0245e-08, 2.3283e-08], device='cuda:0') 100 1e-05 changing lr epoch 485, time 248.89, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4379 re_mapping 0.0025 re_causal 0.0096 /// teacc 99.17 lr 0.00001000 Epoch 487, weight, value: tensor([[ 0.0212, -0.2037, -0.2017, ..., -0.3944, -0.1401, -0.1786], [ 0.0880, -0.0986, 0.0457, ..., 0.0451, 0.1469, -0.0704], [-0.1125, 0.1667, -0.2417, ..., 0.0594, 0.1047, -0.0414], ..., [-0.0789, -0.1067, -0.0859, ..., 0.0141, -0.2408, 0.1795], [ 0.0473, -0.0567, 0.1414, ..., 0.0083, -0.2770, -0.0418], [-0.2595, -0.1351, -0.1896, ..., -0.3414, 0.0886, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.2107e-08, 0.0000e+00]], device='cuda:0') Epoch 487, bias, value: tensor([-0.0007, -0.0407, 0.0110, -0.0074, 0.0093, 0.0070, 0.0192, 0.0092, -0.0473, -0.0169], device='cuda:0'), grad: tensor([-1.8626e-09, 6.5193e-09, 9.3132e-10, 2.7940e-09, 2.3283e-08, -3.7253e-09, 0.0000e+00, 3.7253e-09, 9.3132e-10, -2.5146e-08], device='cuda:0') 100 1e-05 changing lr epoch 486, time 248.67, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4054 re_mapping 0.0025 re_causal 0.0091 /// teacc 99.16 lr 0.00001000 Epoch 488, weight, value: tensor([[ 0.0212, -0.2037, -0.2018, ..., -0.3945, -0.1403, -0.1787], [ 0.0880, -0.0986, 0.0458, ..., 0.0452, 0.1470, -0.0704], [-0.1125, 0.1668, -0.2418, ..., 0.0594, 0.1047, -0.0414], ..., [-0.0789, -0.1069, -0.0860, ..., 0.0141, -0.2409, 0.1795], [ 0.0473, -0.0567, 0.1415, ..., 0.0084, -0.2772, -0.0418], [-0.2595, -0.1351, -0.1897, ..., -0.3415, 0.0887, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.6764e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 3.7253e-09, ..., -0.0000e+00, -1.8626e-09, -0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -0.0000e+00, ..., 1.8626e-09, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 488, bias, value: tensor([-0.0008, -0.0407, 0.0110, -0.0074, 0.0094, 0.0070, 0.0193, 0.0091, -0.0473, -0.0169], device='cuda:0'), grad: tensor([ 6.3330e-08, 7.4506e-09, 5.5879e-09, -4.6566e-08, 1.1176e-08, 3.7253e-08, -1.0803e-07, 0.0000e+00, 9.3132e-09, 1.1176e-08], device='cuda:0') 100 1e-05 changing lr epoch 487, time 248.77, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4316 re_mapping 0.0025 re_causal 0.0093 /// teacc 99.18 lr 0.00001000 Epoch 489, weight, value: tensor([[ 0.0213, -0.2037, -0.2019, ..., -0.3946, -0.1405, -0.1787], [ 0.0880, -0.0986, 0.0459, ..., 0.0452, 0.1471, -0.0705], [-0.1125, 0.1668, -0.2418, ..., 0.0594, 0.1048, -0.0414], ..., [-0.0789, -0.1070, -0.0860, ..., 0.0141, -0.2411, 0.1796], [ 0.0473, -0.0567, 0.1416, ..., 0.0084, -0.2773, -0.0418], [-0.2595, -0.1351, -0.1898, ..., -0.3416, 0.0888, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -5.5879e-09, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 7.4506e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, -1.6764e-08, 0.0000e+00, ..., -4.2841e-08, -3.9116e-08, -1.8626e-09], ..., [ 0.0000e+00, 1.4901e-08, 0.0000e+00, ..., 2.9802e-08, 3.5390e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 1.8626e-09, 1.8626e-09], [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 0.0000e+00, -3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 489, bias, value: tensor([-0.0008, -0.0407, 0.0110, -0.0074, 0.0093, 0.0070, 0.0194, 0.0091, -0.0474, -0.0169], device='cuda:0'), grad: tensor([-2.2352e-08, 2.4214e-08, -1.2293e-07, 4.2841e-08, 0.0000e+00, -7.2643e-08, 1.3039e-08, 9.1270e-08, 1.3039e-08, 1.6764e-08], device='cuda:0') 100 1e-05 changing lr epoch 488, time 249.25, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.3846 re_mapping 0.0025 re_causal 0.0089 /// teacc 99.18 lr 0.00001000 Epoch 490, weight, value: tensor([[ 0.0213, -0.2037, -0.2020, ..., -0.3946, -0.1406, -0.1787], [ 0.0880, -0.0987, 0.0461, ..., 0.0453, 0.1472, -0.0705], [-0.1125, 0.1671, -0.2420, ..., 0.0595, 0.1050, -0.0414], ..., [-0.0789, -0.1071, -0.0863, ..., 0.0139, -0.2413, 0.1796], [ 0.0473, -0.0567, 0.1417, ..., 0.0085, -0.2775, -0.0418], [-0.2596, -0.1352, -0.1899, ..., -0.3417, 0.0888, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 3.5390e-08, 1.0245e-07, ..., 2.5705e-07, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, 3.7253e-09, ..., 1.6764e-08, -2.2352e-08, 0.0000e+00], ..., [ 0.0000e+00, -3.7253e-08, -1.1548e-07, ..., -2.9430e-07, 5.5879e-09, 0.0000e+00], [-1.8626e-09, 1.8626e-09, -3.7253e-09, ..., 1.8626e-09, 1.1176e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 490, bias, value: tensor([-0.0008, -0.0406, 0.0112, -0.0075, 0.0094, 0.0070, 0.0194, 0.0090, -0.0474, -0.0170], device='cuda:0'), grad: tensor([ 1.4901e-08, 7.2829e-07, -3.5390e-08, 5.0291e-08, 0.0000e+00, 7.4506e-09, -3.7253e-09, -8.2329e-07, 4.0978e-08, 2.0489e-08], device='cuda:0') 100 1e-05 changing lr epoch 489, time 249.33, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4278 re_mapping 0.0025 re_causal 0.0092 /// teacc 99.15 lr 0.00001000 Epoch 491, weight, value: tensor([[ 0.0213, -0.2037, -0.2021, ..., -0.3947, -0.1408, -0.1787], [ 0.0880, -0.0986, 0.0462, ..., 0.0453, 0.1473, -0.0705], [-0.1125, 0.1671, -0.2423, ..., 0.0594, 0.1050, -0.0414], ..., [-0.0789, -0.1073, -0.0863, ..., 0.0139, -0.2415, 0.1797], [ 0.0473, -0.0567, 0.1418, ..., 0.0085, -0.2776, -0.0418], [-0.2596, -0.1352, -0.1900, ..., -0.3419, 0.0889, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 3.7253e-09, 1.3039e-08, ..., -9.3132e-09, 1.8626e-09, -1.8626e-09], [ 0.0000e+00, -5.5879e-09, -1.4901e-08, ..., -5.5879e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 491, bias, value: tensor([-0.0008, -0.0406, 0.0112, -0.0075, 0.0093, 0.0070, 0.0195, 0.0090, -0.0474, -0.0170], device='cuda:0'), grad: tensor([ 1.1176e-08, 4.2841e-08, 1.8626e-08, -1.1176e-08, -2.0489e-08, 5.5879e-09, 3.7253e-09, -7.4506e-09, -2.6077e-08, -1.6764e-08], device='cuda:0') 100 1e-05 changing lr epoch 490, time 248.16, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4283 re_mapping 0.0025 re_causal 0.0096 /// teacc 99.13 lr 0.00001000 Epoch 492, weight, value: tensor([[ 0.0214, -0.2037, -0.2022, ..., -0.3947, -0.1410, -0.1787], [ 0.0880, -0.0986, 0.0463, ..., 0.0454, 0.1474, -0.0705], [-0.1126, 0.1671, -0.2425, ..., 0.0593, 0.1050, -0.0414], ..., [-0.0788, -0.1074, -0.0864, ..., 0.0139, -0.2416, 0.1797], [ 0.0473, -0.0567, 0.1419, ..., 0.0086, -0.2776, -0.0417], [-0.2596, -0.1352, -0.1902, ..., -0.3421, 0.0890, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., -0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, 0.0000e+00, ..., -1.8626e-09, -0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 0.0000e+00, -3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 492, bias, value: tensor([-0.0009, -0.0405, 0.0111, -0.0075, 0.0094, 0.0070, 0.0195, 0.0089, -0.0474, -0.0170], device='cuda:0'), grad: tensor([ 0.0000e+00, 7.4506e-09, -1.8626e-09, 2.0489e-08, -2.4214e-08, 1.8626e-09, 1.3039e-08, 3.7253e-09, 1.8626e-09, -2.7940e-08], device='cuda:0') 100 1e-05 changing lr epoch 491, time 247.40, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4269 re_mapping 0.0025 re_causal 0.0094 /// teacc 99.15 lr 0.00001000 Epoch 493, weight, value: tensor([[ 0.0214, -0.2037, -0.2023, ..., -0.3948, -0.1411, -0.1787], [ 0.0880, -0.0986, 0.0464, ..., 0.0454, 0.1476, -0.0705], [-0.1126, 0.1672, -0.2426, ..., 0.0592, 0.1050, -0.0414], ..., [-0.0789, -0.1075, -0.0865, ..., 0.0139, -0.2418, 0.1797], [ 0.0473, -0.0567, 0.1421, ..., 0.0087, -0.2778, -0.0417], [-0.2596, -0.1353, -0.1904, ..., -0.3423, 0.0889, -0.1112]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 0.0000e+00, -1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -0.0000e+00, 1.8626e-09, ..., -1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 493, bias, value: tensor([-0.0009, -0.0404, 0.0110, -0.0075, 0.0095, 0.0070, 0.0195, 0.0089, -0.0474, -0.0172], device='cuda:0'), grad: tensor([ 2.4214e-08, 1.8626e-09, 0.0000e+00, -3.7253e-09, 1.8626e-08, 5.5879e-09, -3.7253e-08, 0.0000e+00, 5.5879e-09, -1.3039e-08], device='cuda:0') 100 1e-05 changing lr epoch 492, time 247.51, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4308 re_mapping 0.0025 re_causal 0.0093 /// teacc 99.13 lr 0.00001000 Epoch 494, weight, value: tensor([[ 0.0214, -0.2037, -0.2024, ..., -0.3948, -0.1412, -0.1787], [ 0.0880, -0.0985, 0.0466, ..., 0.0455, 0.1479, -0.0706], [-0.1127, 0.1673, -0.2427, ..., 0.0593, 0.1050, -0.0414], ..., [-0.0788, -0.1077, -0.0866, ..., 0.0138, -0.2421, 0.1798], [ 0.0472, -0.0567, 0.1421, ..., 0.0087, -0.2779, -0.0418], [-0.2596, -0.1353, -0.1905, ..., -0.3425, 0.0889, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, 2.0489e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.5460e-07, ..., -7.6368e-08, -1.8254e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.4901e-08, ..., 7.4506e-09, 1.6764e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.2852e-07, ..., 6.1467e-08, 1.5087e-07, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, -5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 494, bias, value: tensor([-0.0008, -0.0403, 0.0111, -0.0076, 0.0095, 0.0070, 0.0195, 0.0088, -0.0475, -0.0173], device='cuda:0'), grad: tensor([ 8.0094e-08, -4.0419e-07, 4.2841e-08, 1.2666e-07, 2.2352e-08, -9.6858e-08, -1.1548e-07, 3.2969e-07, 1.3039e-08, 7.4506e-09], device='cuda:0') 100 1e-05 changing lr epoch 493, time 247.37, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4121 re_mapping 0.0025 re_causal 0.0094 /// teacc 99.17 lr 0.00001000 Epoch 495, weight, value: tensor([[ 0.0214, -0.2037, -0.2024, ..., -0.3949, -0.1413, -0.1787], [ 0.0880, -0.0985, 0.0467, ..., 0.0455, 0.1480, -0.0707], [-0.1127, 0.1674, -0.2428, ..., 0.0594, 0.1050, -0.0414], ..., [-0.0788, -0.1079, -0.0867, ..., 0.0138, -0.2423, 0.1799], [ 0.0472, -0.0567, 0.1422, ..., 0.0088, -0.2780, -0.0418], [-0.2597, -0.1354, -0.1907, ..., -0.3426, 0.0890, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8626e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 495, bias, value: tensor([-0.0008, -0.0403, 0.0111, -0.0076, 0.0094, 0.0070, 0.0196, 0.0088, -0.0476, -0.0173], device='cuda:0'), grad: tensor([ 1.8626e-09, 1.8626e-09, 1.8626e-09, 7.4506e-09, 0.0000e+00, -3.7253e-09, -9.3132e-09, -7.4506e-09, 0.0000e+00, 7.4506e-09], device='cuda:0') 100 1e-05 changing lr epoch 494, time 247.07, cls_loss 0.0004 cls_loss_mapping 0.0003 cls_loss_causal 0.4444 re_mapping 0.0025 re_causal 0.0095 /// teacc 99.18 lr 0.00001000 Epoch 496, weight, value: tensor([[ 0.0214, -0.2037, -0.2025, ..., -0.3950, -0.1414, -0.1787], [ 0.0880, -0.0985, 0.0468, ..., 0.0455, 0.1481, -0.0707], [-0.1127, 0.1675, -0.2431, ..., 0.0593, 0.1051, -0.0414], ..., [-0.0788, -0.1080, -0.0867, ..., 0.0138, -0.2425, 0.1800], [ 0.0472, -0.0568, 0.1422, ..., 0.0087, -0.2781, -0.0418], [-0.2597, -0.1354, -0.1908, ..., -0.3427, 0.0892, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 1.8626e-09, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 1.8626e-08, 1.8626e-09, ..., 6.5193e-08, 1.1176e-08, 0.0000e+00], [ 0.0000e+00, -6.3330e-08, 0.0000e+00, ..., -3.1665e-08, -5.9605e-08, 0.0000e+00], ..., [ 0.0000e+00, 2.4214e-08, -1.8626e-09, ..., -4.6566e-08, 3.1665e-08, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 1.8626e-09, 5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 496, bias, value: tensor([-0.0008, -0.0403, 0.0112, -0.0076, 0.0093, 0.0070, 0.0196, 0.0088, -0.0476, -0.0173], device='cuda:0'), grad: tensor([ 1.1176e-08, 2.0117e-07, -1.4901e-07, 6.5193e-08, 1.3039e-08, -3.5390e-08, -3.5390e-08, -1.0617e-07, 3.1665e-08, 9.3132e-09], device='cuda:0') 100 1e-05 changing lr epoch 495, time 247.41, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.3879 re_mapping 0.0024 re_causal 0.0089 /// teacc 99.14 lr 0.00001000 Epoch 497, weight, value: tensor([[ 0.0214, -0.2038, -0.2026, ..., -0.3951, -0.1417, -0.1787], [ 0.0880, -0.0985, 0.0468, ..., 0.0454, 0.1481, -0.0708], [-0.1127, 0.1676, -0.2433, ..., 0.0593, 0.1052, -0.0413], ..., [-0.0788, -0.1082, -0.0867, ..., 0.0139, -0.2426, 0.1801], [ 0.0472, -0.0568, 0.1423, ..., 0.0087, -0.2783, -0.0418], [-0.2597, -0.1355, -0.1909, ..., -0.3429, 0.0892, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-09, 0.0000e+00], [ 0.0000e+00, -1.8626e-09, -3.7253e-09, ..., -1.8626e-09, -1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 7.4506e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -1.8626e-09, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.8626e-09, -0.0000e+00]], device='cuda:0') Epoch 497, bias, value: tensor([-0.0009, -0.0404, 0.0112, -0.0076, 0.0093, 0.0070, 0.0198, 0.0089, -0.0477, -0.0174], device='cuda:0'), grad: tensor([ 3.3528e-08, -5.5879e-09, 2.4214e-08, 0.0000e+00, 5.5879e-09, 0.0000e+00, -6.5193e-08, 7.4506e-09, -3.7253e-09, -1.8626e-09], device='cuda:0') 100 1e-05 changing lr epoch 496, time 247.41, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4189 re_mapping 0.0024 re_causal 0.0093 /// teacc 99.17 lr 0.00001000 Epoch 498, weight, value: tensor([[ 0.0214, -0.2038, -0.2027, ..., -0.3951, -0.1418, -0.1787], [ 0.0879, -0.0985, 0.0469, ..., 0.0454, 0.1482, -0.0708], [-0.1127, 0.1677, -0.2433, ..., 0.0592, 0.1052, -0.0413], ..., [-0.0788, -0.1081, -0.0868, ..., 0.0140, -0.2427, 0.1801], [ 0.0472, -0.0568, 0.1423, ..., 0.0087, -0.2785, -0.0417], [-0.2597, -0.1355, -0.1909, ..., -0.3430, 0.0892, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 5.5879e-09, ..., 3.7253e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., -1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -3.7253e-09, ..., -1.8626e-09, 1.8626e-09, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 498, bias, value: tensor([-0.0009, -0.0404, 0.0112, -0.0076, 0.0094, 0.0070, 0.0199, 0.0089, -0.0479, -0.0175], device='cuda:0'), grad: tensor([ 1.8626e-09, 2.0489e-08, 1.1176e-08, 1.1921e-07, -5.2154e-08, -1.6205e-07, 2.6077e-08, -3.7253e-09, 2.0489e-08, 2.2352e-08], device='cuda:0') 100 1e-05 changing lr epoch 497, time 247.19, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4471 re_mapping 0.0025 re_causal 0.0094 /// teacc 99.18 lr 0.00001000 Epoch 499, weight, value: tensor([[ 0.0214, -0.2038, -0.2027, ..., -0.3952, -0.1419, -0.1787], [ 0.0879, -0.0985, 0.0469, ..., 0.0453, 0.1482, -0.0708], [-0.1127, 0.1678, -0.2435, ..., 0.0592, 0.1054, -0.0413], ..., [-0.0788, -0.1082, -0.0868, ..., 0.0141, -0.2428, 0.1802], [ 0.0472, -0.0568, 0.1425, ..., 0.0088, -0.2785, -0.0417], [-0.2597, -0.1355, -0.1910, ..., -0.3432, 0.0893, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, -7.4506e-09, ..., -3.7253e-09, -5.5879e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -2.3469e-07, ..., -4.0978e-08, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 499, bias, value: tensor([-0.0008, -0.0405, 0.0112, -0.0076, 0.0094, 0.0071, 0.0199, 0.0090, -0.0479, -0.0175], device='cuda:0'), grad: tensor([ 9.3132e-09, -1.8626e-08, 7.4506e-09, 1.1548e-07, 1.8626e-09, 5.5879e-09, 1.6950e-07, 5.5879e-09, -2.9244e-07, 1.8626e-09], device='cuda:0') 100 1e-05 changing lr epoch 498, time 247.22, cls_loss 0.0005 cls_loss_mapping 0.0003 cls_loss_causal 0.4335 re_mapping 0.0024 re_causal 0.0093 /// teacc 99.14 lr 0.00001000 Epoch 500, weight, value: tensor([[ 0.0214, -0.2038, -0.2028, ..., -0.3952, -0.1420, -0.1787], [ 0.0879, -0.0986, 0.0469, ..., 0.0453, 0.1482, -0.0708], [-0.1128, 0.1679, -0.2436, ..., 0.0592, 0.1054, -0.0413], ..., [-0.0788, -0.1082, -0.0868, ..., 0.0141, -0.2429, 0.1802], [ 0.0472, -0.0568, 0.1426, ..., 0.0089, -0.2786, -0.0417], [-0.2597, -0.1355, -0.1911, ..., -0.3433, 0.0893, -0.1112]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.9802e-08, 0.0000e+00], [ 0.0000e+00, -1.1176e-08, -3.9116e-08, ..., -1.6764e-08, -4.2841e-08, 0.0000e+00], [ 0.0000e+00, -3.7253e-08, 2.4214e-08, ..., -2.6077e-08, -1.3039e-08, 0.0000e+00], ..., [ 0.0000e+00, 4.4703e-08, 1.3039e-08, ..., 4.2841e-08, 5.4017e-08, -0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.7253e-09, 0.0000e+00]], device='cuda:0') Epoch 500, bias, value: tensor([-0.0009, -0.0406, 0.0112, -0.0077, 0.0093, 0.0072, 0.0199, 0.0090, -0.0480, -0.0176], device='cuda:0'), grad: tensor([ 1.7323e-07, -7.8231e-08, -3.1665e-08, 5.0291e-08, -7.4506e-09, -2.3097e-07, -1.7509e-07, 2.0117e-07, 2.7940e-08, 7.2643e-08], device='cuda:0') 100 1e-05 changing lr epoch 499, time 247.44, cls_loss 0.0005 cls_loss_mapping 0.0004 cls_loss_causal 0.4273 re_mapping 0.0024 re_causal 0.0091 /// teacc 99.16 lr 0.00001000 ---------------------saving last model at epoch 499---------------------------------------------------- /home/yuqian_fu {'gpu': '0', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep500_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1_adam', 'svpath': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep500_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1_adam/14factor_best.csv', 'channels': 3, 'factor_num': 14, 'stride': 3, 'epoch': 'best', 'eval_mapping': True} loading weight of best randm: False stride: 3 loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best Using downloaded and verified file: /home/yuqian_fu/.pytorch/SVHN/test_32x32.mat mnist mnist_FA ... usps_FA Avg ShearX 99.040001 98.930000 ... 79.62133 68.275971 ShearY 98.769997 98.699997 ... 79.62133 64.653491 AutoContrast 99.199997 99.099998 ... 79.62133 59.755771 Invert 98.860001 98.369995 ... 79.62133 64.192465 Equalize 98.439995 98.229996 ... 79.62133 70.835657 Solarize 98.239998 97.639999 ... 79.62133 59.579589 SolarizeAdd 98.400002 97.779999 ... 79.62133 72.674644 Posterize 98.909996 99.029999 ... 79.62133 72.055723 Contrast 99.159996 99.180000 ... 79.62133 66.427597 Color 99.119995 99.220001 ... 79.62133 59.084085 Brightness 99.119995 99.229996 ... 79.62133 65.520764 Sharpness 99.099998 99.150002 ... 79.62133 69.842453 NoiseSalt 99.099998 99.169998 ... 79.62133 53.793901 NoiseGaussian 99.080002 99.199997 ... 79.62133 55.708333 w/o do (original x) 99.220000 0.000000 ... 0.00000 73.114814 [15 rows x 11 columns] mnist svhn mnist_m syndigit usps Avg do 99.13 69.283958 78.813465 77.546321 83.507723 77.287867