/home/yuqian_fu here1 here2 {'gpu': '0', 'data': 'mnist', 'ntr': None, 'translate': None, 'autoaug': 'CA_multiple', 'n': 3, 'stride': 3, 'factor_num': 14, 'epochs': 500, 'nbatch': 100, 'batchsize': 32, 'lr': 0.0001, 'lr_scheduler': 'Step', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep500_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1_eps2', 'clsadapt': True, 'lambda_causal': 1.0, 'lambda_re': 1.0, 'randm': True, 'randn': True, 'network': 'resnet18'} stride: 3 --------------------------CA_multiple-------------------------- ---------------------------14 factors----------------- randm: True randn: True n: 3 randm: False Epoch 1, weight, value: tensor([[ 0.0094, -0.0300, -0.0202, ..., -0.0133, 0.0254, 0.0255], [-0.0269, -0.0168, 0.0046, ..., -0.0113, -0.0025, -0.0234], [ 0.0059, -0.0206, 0.0299, ..., -0.0036, 0.0285, -0.0219], ..., [-0.0253, -0.0004, 0.0165, ..., 0.0212, 0.0120, -0.0137], [-0.0243, 0.0281, -0.0075, ..., 0.0071, -0.0178, -0.0153], [ 0.0121, 0.0164, -0.0064, ..., 0.0142, -0.0213, 0.0214]], device='cuda:0'), grad: None Epoch 1, bias, value: tensor([-0.0146, -0.0232, -0.0161, -0.0271, -0.0302, -0.0007, 0.0295, -0.0105, 0.0252, 0.0037], device='cuda:0'), grad: None 100 0.0001 changing lr ---------------------saving model at epoch 0---------------------------------------------------- epoch 0, time 282.82, cls_loss 1.3336 cls_loss_mapping 1.8518 cls_loss_causal 2.2299 re_mapping 0.1420 re_causal 0.1531 /// teacc 87.82 lr 0.00010000 Epoch 2, weight, value: tensor([[ 0.0055, -0.0290, -0.0202, ..., -0.0107, 0.0303, 0.0314], [-0.0261, -0.0221, 0.0046, ..., -0.0192, -0.0076, -0.0313], [ 0.0041, -0.0206, 0.0299, ..., -0.0090, 0.0275, -0.0207], ..., [-0.0252, 0.0030, 0.0165, ..., 0.0254, 0.0123, -0.0181], [-0.0266, 0.0263, -0.0075, ..., 0.0078, -0.0190, -0.0177], [ 0.0128, 0.0130, -0.0064, ..., 0.0154, -0.0234, 0.0160]], device='cuda:0'), grad: tensor([[ 4.3144e-03, 1.3351e-03, 0.0000e+00, ..., 4.6462e-05, -5.3711e-03, -9.2926e-03], [ 7.6592e-05, 5.3520e-03, 0.0000e+00, ..., 7.4463e-03, 4.2248e-04, 8.9073e-04], [-2.7275e-03, -1.2817e-02, 0.0000e+00, ..., -9.4910e-03, -9.2411e-04, 1.5173e-03], ..., [ 1.1215e-02, 1.5182e-02, 0.0000e+00, ..., 5.3375e-02, 3.4084e-03, 3.1624e-03], [ 8.5831e-03, 1.8021e-02, 0.0000e+00, ..., 5.5618e-03, -1.3704e-03, 9.5901e-03], [-1.7303e-02, -7.7362e-03, 0.0000e+00, ..., -8.3313e-02, 3.7365e-03, 2.5520e-03]], device='cuda:0') Epoch 2, bias, value: tensor([-0.0159, -0.0208, -0.0162, -0.0281, -0.0302, -0.0004, 0.0285, -0.0104, 0.0253, 0.0029], device='cuda:0'), grad: tensor([ 0.0027, -0.0289, -0.0156, -0.0481, 0.0213, 0.0228, -0.0087, 0.0459, 0.0480, -0.0393], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 1---------------------------------------------------- epoch 1, time 281.67, cls_loss 0.4012 cls_loss_mapping 0.7534 cls_loss_causal 1.9334 re_mapping 0.2077 re_causal 0.2738 /// teacc 93.25 lr 0.00010000 Epoch 3, weight, value: tensor([[ 0.0022, -0.0282, -0.0202, ..., -0.0099, 0.0362, 0.0349], [-0.0259, -0.0246, 0.0046, ..., -0.0213, -0.0107, -0.0325], [ 0.0010, -0.0205, 0.0299, ..., -0.0113, 0.0292, -0.0209], ..., [-0.0242, 0.0055, 0.0165, ..., 0.0269, 0.0072, -0.0198], [-0.0320, 0.0258, -0.0075, ..., 0.0082, -0.0207, -0.0205], [ 0.0134, 0.0127, -0.0064, ..., 0.0164, -0.0222, 0.0148]], device='cuda:0'), grad: tensor([[ 2.1038e-03, 1.9836e-03, 0.0000e+00, ..., 1.3411e-05, -1.7061e-03, -2.1172e-03], [ 2.2125e-03, 5.4207e-03, 0.0000e+00, ..., 1.2369e-03, 1.3618e-03, 2.9087e-03], [ 3.6907e-03, -1.3664e-02, 0.0000e+00, ..., 2.2430e-03, -2.1164e-02, -1.0399e-02], ..., [-3.5238e-04, -4.6272e-03, 0.0000e+00, ..., 1.6642e-03, 4.7264e-03, 3.5000e-03], [ 5.5885e-03, 7.3090e-03, 0.0000e+00, ..., 6.1798e-03, 5.9509e-03, 8.5144e-03], [ 3.1395e-03, 7.3671e-04, 0.0000e+00, ..., 1.4439e-03, 1.4496e-03, 1.5640e-03]], device='cuda:0') Epoch 3, bias, value: tensor([-1.5880e-02, -2.0302e-02, -1.6533e-02, -2.8419e-02, -3.0375e-02, -7.7978e-05, 2.8361e-02, -1.0864e-02, 2.5489e-02, 3.3449e-03], device='cuda:0'), grad: tensor([ 0.0057, 0.0006, -0.0258, 0.0253, -0.0084, -0.0065, -0.0296, 0.0029, 0.0284, 0.0076], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 2---------------------------------------------------- epoch 2, time 282.63, cls_loss 0.2555 cls_loss_mapping 0.4624 cls_loss_causal 1.7128 re_mapping 0.1559 re_causal 0.2582 /// teacc 94.25 lr 0.00010000 Epoch 4, weight, value: tensor([[ 0.0010, -0.0277, -0.0203, ..., -0.0096, 0.0400, 0.0373], [-0.0269, -0.0269, 0.0003, ..., -0.0233, -0.0158, -0.0343], [-0.0026, -0.0202, 0.0260, ..., -0.0129, 0.0329, -0.0202], ..., [-0.0244, 0.0070, 0.0179, ..., 0.0281, 0.0037, -0.0209], [-0.0364, 0.0257, -0.0115, ..., 0.0083, -0.0209, -0.0216], [ 0.0141, 0.0133, -0.0103, ..., 0.0167, -0.0215, 0.0142]], device='cuda:0'), grad: tensor([[-0.0026, -0.0088, 0.0000, ..., -0.0040, -0.0160, -0.0242], [ 0.0007, 0.0026, 0.0000, ..., 0.0031, 0.0004, 0.0006], [ 0.0015, 0.0109, 0.0000, ..., 0.0027, 0.0036, 0.0076], ..., [-0.0056, -0.0101, 0.0000, ..., -0.0206, 0.0016, 0.0023], [-0.0090, -0.0121, 0.0000, ..., -0.0152, 0.0004, 0.0023], [ 0.0074, 0.0182, 0.0000, ..., 0.0283, 0.0013, 0.0036]], device='cuda:0') Epoch 4, bias, value: tensor([-1.5505e-02, -1.9999e-02, -1.6656e-02, -2.8608e-02, -3.0272e-02, -4.8942e-05, 2.8023e-02, -1.1061e-02, 2.5536e-02, 3.4134e-03], device='cuda:0'), grad: tensor([-0.0162, 0.0015, 0.0200, -0.0070, 0.0192, 0.0060, 0.0029, -0.0074, -0.0452, 0.0262], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 3---------------------------------------------------- epoch 3, time 282.60, cls_loss 0.2090 cls_loss_mapping 0.3590 cls_loss_causal 1.5909 re_mapping 0.1200 re_causal 0.2258 /// teacc 94.39 lr 0.00010000 Epoch 5, weight, value: tensor([[ 2.5493e-04, -2.7750e-02, -2.0303e-02, ..., -8.9923e-03, 4.3243e-02, 3.9449e-02], [-2.9210e-02, -2.9602e-02, 3.2619e-04, ..., -2.5070e-02, -1.9404e-02, -3.5510e-02], [-5.9607e-03, -2.0000e-02, 2.5963e-02, ..., -1.4382e-02, 3.6077e-02, -2.0003e-02], ..., [-2.5401e-02, 8.9050e-03, 1.7921e-02, ..., 2.9216e-02, 4.4441e-05, -2.2393e-02], [-3.9703e-02, 2.5462e-02, -1.1513e-02, ..., 8.7589e-03, -2.1776e-02, -2.2173e-02], [ 1.6515e-02, 1.3325e-02, -1.0273e-02, ..., 1.7216e-02, -2.2674e-02, 1.2541e-02]], device='cuda:0'), grad: tensor([[ 0.0027, 0.0031, 0.0000, ..., 0.0023, 0.0094, 0.0074], [ 0.0018, 0.0021, 0.0000, ..., 0.0005, 0.0012, 0.0016], [ 0.0048, 0.0085, 0.0000, ..., -0.0031, 0.0103, -0.0066], ..., [-0.0050, -0.0013, 0.0000, ..., -0.0181, 0.0089, 0.0077], [ 0.0069, -0.0212, 0.0000, ..., 0.0060, -0.0143, -0.0090], [ 0.0175, 0.0194, 0.0000, ..., 0.0287, 0.0017, 0.0018]], device='cuda:0') Epoch 5, bias, value: tensor([-0.0152, -0.0199, -0.0166, -0.0291, -0.0305, -0.0003, 0.0278, -0.0110, 0.0257, 0.0039], device='cuda:0'), grad: tensor([ 0.0084, 0.0034, 0.0093, -0.0302, -0.0279, 0.0112, 0.0061, -0.0065, -0.0066, 0.0328], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 4---------------------------------------------------- epoch 4, time 281.51, cls_loss 0.1658 cls_loss_mapping 0.2742 cls_loss_causal 1.4598 re_mapping 0.1014 re_causal 0.2046 /// teacc 96.27 lr 0.00010000 Epoch 6, weight, value: tensor([[-0.0021, -0.0277, -0.0203, ..., -0.0089, 0.0456, 0.0410], [-0.0318, -0.0321, 0.0003, ..., -0.0259, -0.0237, -0.0371], [-0.0077, -0.0207, 0.0260, ..., -0.0157, 0.0379, -0.0194], ..., [-0.0263, 0.0101, 0.0179, ..., 0.0304, -0.0027, -0.0232], [-0.0431, 0.0261, -0.0115, ..., 0.0088, -0.0217, -0.0225], [ 0.0183, 0.0129, -0.0103, ..., 0.0169, -0.0224, 0.0118]], device='cuda:0'), grad: tensor([[ 0.0012, 0.0040, 0.0000, ..., 0.0147, 0.0457, 0.0405], [ 0.0020, 0.0083, 0.0000, ..., 0.0069, 0.0022, 0.0052], [ 0.0005, -0.0112, 0.0000, ..., 0.0024, -0.0245, -0.0163], ..., [ 0.0003, -0.0018, 0.0000, ..., 0.0006, 0.0016, -0.0008], [ 0.0083, 0.0035, 0.0000, ..., 0.0126, -0.0011, 0.0004], [ 0.0039, 0.0016, 0.0000, ..., -0.0119, -0.0455, -0.0388]], device='cuda:0') Epoch 6, bias, value: tensor([-0.0150, -0.0198, -0.0166, -0.0290, -0.0305, -0.0004, 0.0278, -0.0112, 0.0257, 0.0039], device='cuda:0'), grad: tensor([ 0.0339, 0.0194, -0.0110, 0.0025, -0.0038, 0.0027, -0.0183, -0.0130, 0.0092, -0.0214], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 5---------------------------------------------------- epoch 5, time 282.87, cls_loss 0.1459 cls_loss_mapping 0.2310 cls_loss_causal 1.3923 re_mapping 0.0834 re_causal 0.1812 /// teacc 96.57 lr 0.00010000 Epoch 7, weight, value: tensor([[-0.0028, -0.0283, -0.0208, ..., -0.0088, 0.0478, 0.0425], [-0.0341, -0.0343, -0.0055, ..., -0.0272, -0.0243, -0.0370], [-0.0100, -0.0209, 0.0199, ..., -0.0167, 0.0402, -0.0190], ..., [-0.0270, 0.0116, 0.0221, ..., 0.0316, -0.0060, -0.0246], [-0.0467, 0.0262, -0.0158, ..., 0.0088, -0.0228, -0.0234], [ 0.0199, 0.0127, -0.0133, ..., 0.0170, -0.0218, 0.0120]], device='cuda:0'), grad: tensor([[ 1.8060e-04, 9.7752e-04, 6.4773e-07, ..., 1.9608e-02, 4.5349e-02, 3.0823e-02], [ 1.3971e-04, 2.5082e-03, 4.6566e-06, ..., 1.7109e-03, 3.0689e-03, 1.7014e-03], [ 5.6934e-04, 2.4242e-03, 1.0632e-05, ..., 6.1369e-04, -5.5145e-02, -1.5068e-02], ..., [ 9.0408e-04, 4.4937e-03, -4.3273e-05, ..., 1.0996e-03, 1.3056e-03, 7.0095e-04], [ 2.8095e-03, 2.6550e-03, 2.2054e-06, ..., -2.4994e-02, -2.8015e-02, -3.0899e-02], [-2.0809e-03, -1.1635e-03, 1.4558e-05, ..., -1.0240e-04, 2.0123e-03, 9.8610e-04]], device='cuda:0') Epoch 7, bias, value: tensor([-0.0147, -0.0197, -0.0168, -0.0292, -0.0306, -0.0006, 0.0275, -0.0111, 0.0259, 0.0041], device='cuda:0'), grad: tensor([ 0.0331, 0.0101, -0.0394, -0.0025, 0.0005, 0.0057, 0.0027, 0.0099, -0.0198, -0.0002], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 6---------------------------------------------------- epoch 6, time 281.63, cls_loss 0.1317 cls_loss_mapping 0.1933 cls_loss_causal 1.3203 re_mapping 0.0716 re_causal 0.1593 /// teacc 96.81 lr 0.00010000 Epoch 8, weight, value: tensor([[-0.0048, -0.0286, -0.0268, ..., -0.0093, 0.0498, 0.0438], [-0.0365, -0.0366, -0.0260, ..., -0.0284, -0.0273, -0.0377], [-0.0119, -0.0213, 0.0253, ..., -0.0181, 0.0427, -0.0184], ..., [-0.0277, 0.0125, 0.0295, ..., 0.0327, -0.0083, -0.0260], [-0.0498, 0.0266, -0.0332, ..., 0.0093, -0.0239, -0.0239], [ 0.0209, 0.0119, -0.0229, ..., 0.0168, -0.0217, 0.0116]], device='cuda:0'), grad: tensor([[ 9.6083e-05, 3.7003e-04, 4.8369e-05, ..., -9.7036e-05, -1.8158e-03, -1.6947e-03], [ 8.8692e-05, 5.4398e-03, 1.2779e-04, ..., 1.9608e-03, 6.4909e-05, -1.0681e-04], [ 1.9610e-04, 2.5043e-03, 9.3603e-04, ..., 1.2054e-03, -2.2125e-03, -1.3580e-03], ..., [ 2.0561e-03, 1.9424e-02, 3.8795e-03, ..., 5.5237e-03, 4.4560e-04, 3.5691e-04], [ 8.5545e-04, 1.6947e-03, 7.8738e-05, ..., 1.2903e-03, 1.2207e-03, 1.1091e-03], [-4.1847e-03, -1.0818e-02, -6.6853e-04, ..., -8.7051e-03, 2.7251e-04, 2.3580e-04]], device='cuda:0') Epoch 8, bias, value: tensor([-0.0147, -0.0198, -0.0167, -0.0289, -0.0301, -0.0008, 0.0271, -0.0111, 0.0261, 0.0038], device='cuda:0'), grad: tensor([-0.0005, 0.0265, 0.0027, -0.0231, 0.0046, 0.0014, 0.0007, 0.0280, 0.0056, -0.0459], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 7---------------------------------------------------- epoch 7, time 281.50, cls_loss 0.1119 cls_loss_mapping 0.1637 cls_loss_causal 1.2541 re_mapping 0.0646 re_causal 0.1477 /// teacc 97.21 lr 0.00010000 Epoch 9, weight, value: tensor([[-0.0055, -0.0287, -0.0292, ..., -0.0092, 0.0518, 0.0451], [-0.0375, -0.0384, -0.0342, ..., -0.0291, -0.0285, -0.0377], [-0.0135, -0.0212, 0.0253, ..., -0.0189, 0.0435, -0.0183], ..., [-0.0286, 0.0129, 0.0278, ..., 0.0331, -0.0106, -0.0276], [-0.0521, 0.0269, -0.0451, ..., 0.0096, -0.0245, -0.0247], [ 0.0218, 0.0119, -0.0259, ..., 0.0166, -0.0212, 0.0115]], device='cuda:0'), grad: tensor([[ 3.9673e-04, 1.1339e-03, 2.7013e-04, ..., -4.6670e-05, -2.2182e-03, -1.8110e-03], [ 1.2434e-04, 6.1560e-04, 3.9268e-04, ..., 2.0015e-04, 4.9019e-04, 6.7651e-05], [ 3.3617e-04, 8.3542e-04, -3.4022e-04, ..., 5.9128e-04, -2.4867e-04, 2.5988e-04], ..., [-2.2674e-04, -5.3310e-04, 1.7214e-04, ..., -1.3895e-03, 7.6723e-04, 3.7551e-04], [ 3.4008e-03, 1.4007e-04, 2.4676e-04, ..., 1.8578e-03, 1.0824e-03, 1.4007e-04], [-1.2541e-03, 1.7948e-03, 3.2425e-04, ..., -1.7605e-03, 1.2903e-03, 8.0490e-04]], device='cuda:0') Epoch 9, bias, value: tensor([-0.0142, -0.0195, -0.0172, -0.0290, -0.0302, -0.0011, 0.0270, -0.0117, 0.0265, 0.0043], device='cuda:0'), grad: tensor([-0.0001, -0.0001, 0.0011, -0.0037, 0.0018, 0.0002, -0.0034, 0.0008, 0.0037, -0.0002], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 8---------------------------------------------------- epoch 8, time 281.69, cls_loss 0.0992 cls_loss_mapping 0.1439 cls_loss_causal 1.2044 re_mapping 0.0564 re_causal 0.1344 /// teacc 97.50 lr 0.00010000 Epoch 10, weight, value: tensor([[-0.0057, -0.0294, -0.0289, ..., -0.0092, 0.0536, 0.0461], [-0.0388, -0.0407, -0.0449, ..., -0.0301, -0.0292, -0.0375], [-0.0148, -0.0214, 0.0277, ..., -0.0197, 0.0453, -0.0176], ..., [-0.0293, 0.0140, 0.0274, ..., 0.0341, -0.0124, -0.0287], [-0.0545, 0.0268, -0.0502, ..., 0.0095, -0.0253, -0.0255], [ 0.0226, 0.0111, -0.0275, ..., 0.0163, -0.0221, 0.0109]], device='cuda:0'), grad: tensor([[-0.0026, -0.0006, 0.0001, ..., 0.0009, -0.0057, -0.0052], [ 0.0046, 0.0027, 0.0001, ..., 0.0015, 0.0008, 0.0007], [ 0.0004, 0.0009, 0.0004, ..., 0.0001, -0.0009, -0.0006], ..., [ 0.0015, -0.0027, -0.0001, ..., -0.0011, 0.0005, 0.0003], [ 0.0059, 0.0044, 0.0001, ..., 0.0035, 0.0008, 0.0010], [ 0.0037, 0.0028, 0.0005, ..., 0.0029, 0.0013, 0.0016]], device='cuda:0') Epoch 10, bias, value: tensor([-0.0138, -0.0195, -0.0171, -0.0286, -0.0304, -0.0011, 0.0269, -0.0116, 0.0263, 0.0039], device='cuda:0'), grad: tensor([-0.0038, 0.0088, -0.0031, 0.0168, -0.0009, -0.0331, 0.0011, 0.0007, 0.0072, 0.0062], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 9---------------------------------------------------- epoch 9, time 281.41, cls_loss 0.0994 cls_loss_mapping 0.1390 cls_loss_causal 1.1666 re_mapping 0.0513 re_causal 0.1208 /// teacc 97.74 lr 0.00010000 Epoch 11, weight, value: tensor([[-0.0060, -0.0299, -0.0298, ..., -0.0092, 0.0553, 0.0474], [-0.0403, -0.0424, -0.0480, ..., -0.0307, -0.0322, -0.0386], [-0.0162, -0.0218, 0.0302, ..., -0.0205, 0.0467, -0.0172], ..., [-0.0304, 0.0149, 0.0290, ..., 0.0350, -0.0137, -0.0298], [-0.0559, 0.0270, -0.0550, ..., 0.0100, -0.0257, -0.0255], [ 0.0228, 0.0103, -0.0300, ..., 0.0160, -0.0222, 0.0105]], device='cuda:0'), grad: tensor([[ 1.1911e-03, 1.7395e-03, 6.1607e-04, ..., -7.6008e-04, 2.2304e-04, -1.7090e-03], [ 2.6441e-04, 7.8201e-04, 8.9526e-05, ..., 4.4799e-04, -5.9426e-05, -2.5797e-04], [ 1.3046e-03, -5.6152e-03, -7.3242e-04, ..., 1.2197e-03, -1.3664e-02, 7.8773e-04], ..., [ 3.5739e-04, 5.3215e-04, -5.3942e-05, ..., -9.1982e-04, 1.3084e-03, 3.5310e-04], [ 5.1193e-03, 3.9749e-03, 1.5440e-03, ..., 1.4076e-03, 1.6088e-03, 3.7241e-04], [-2.4612e-02, -4.1509e-04, -7.1716e-03, ..., 1.3113e-03, 2.7800e-04, 3.8576e-04]], device='cuda:0') Epoch 11, bias, value: tensor([-0.0136, -0.0197, -0.0168, -0.0286, -0.0305, -0.0013, 0.0268, -0.0115, 0.0263, 0.0038], device='cuda:0'), grad: tensor([ 0.0028, -0.0005, -0.0145, 0.0050, 0.0034, 0.0058, 0.0110, 0.0021, 0.0101, -0.0251], device='cuda:0') 100 0.0001 changing lr epoch 10, time 265.14, cls_loss 0.0918 cls_loss_mapping 0.1268 cls_loss_causal 1.1186 re_mapping 0.0481 re_causal 0.1114 /// teacc 97.62 lr 0.00010000 Epoch 12, weight, value: tensor([[-0.0072, -0.0308, -0.0315, ..., -0.0098, 0.0561, 0.0478], [-0.0419, -0.0449, -0.0511, ..., -0.0315, -0.0340, -0.0388], [-0.0180, -0.0221, 0.0321, ..., -0.0212, 0.0479, -0.0170], ..., [-0.0307, 0.0159, 0.0284, ..., 0.0360, -0.0144, -0.0304], [-0.0582, 0.0277, -0.0586, ..., 0.0104, -0.0271, -0.0261], [ 0.0233, 0.0086, -0.0323, ..., 0.0156, -0.0222, 0.0103]], device='cuda:0'), grad: tensor([[ 2.7752e-04, 4.7541e-04, 5.3734e-05, ..., 1.3363e-04, -1.8448e-05, 1.7136e-05], [ 1.5879e-04, 8.2111e-04, -1.7142e-04, ..., 4.9204e-05, 2.1189e-05, 1.7691e-04], [ 1.5056e-04, 7.0076e-03, 9.8586e-05, ..., 6.6996e-04, 4.2367e-04, 8.3780e-04], ..., [ 6.7234e-04, 1.0826e-02, 2.0409e-04, ..., 2.4853e-03, 7.4767e-06, 1.6813e-03], [ 8.7023e-04, 2.4834e-03, 1.3018e-04, ..., 8.6641e-04, 1.0425e-04, 3.4237e-04], [-8.8587e-06, -9.2602e-04, -4.1628e-04, ..., -3.2673e-03, 4.3035e-05, 1.3018e-04]], device='cuda:0') Epoch 12, bias, value: tensor([-0.0142, -0.0200, -0.0168, -0.0285, -0.0304, -0.0012, 0.0268, -0.0111, 0.0264, 0.0037], device='cuda:0'), grad: tensor([ 0.0007, -0.0018, 0.0068, -0.0216, 0.0044, 0.0007, -0.0002, 0.0138, 0.0045, -0.0072], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 11---------------------------------------------------- epoch 11, time 281.92, cls_loss 0.0749 cls_loss_mapping 0.1097 cls_loss_causal 1.1203 re_mapping 0.0461 re_causal 0.1116 /// teacc 97.76 lr 0.00010000 Epoch 13, weight, value: tensor([[-0.0080, -0.0312, -0.0324, ..., -0.0103, 0.0576, 0.0488], [-0.0431, -0.0461, -0.0549, ..., -0.0321, -0.0366, -0.0395], [-0.0196, -0.0220, 0.0354, ..., -0.0217, 0.0494, -0.0164], ..., [-0.0308, 0.0164, 0.0267, ..., 0.0368, -0.0167, -0.0316], [-0.0593, 0.0284, -0.0611, ..., 0.0110, -0.0271, -0.0261], [ 0.0238, 0.0077, -0.0325, ..., 0.0152, -0.0227, 0.0099]], device='cuda:0'), grad: tensor([[ 1.4830e-04, -2.0361e-04, 6.0111e-05, ..., -2.1410e-04, -2.2736e-03, -2.0027e-03], [ 8.8654e-03, 4.7684e-03, 1.3847e-03, ..., 3.6073e-04, 2.8286e-03, 1.9140e-03], [ 2.4021e-04, 2.7275e-04, -2.1763e-03, ..., 8.3542e-04, -3.2215e-03, -2.3804e-03], ..., [ 4.5156e-04, -2.9354e-03, -3.3927e-04, ..., -1.9722e-03, 4.6611e-04, 3.6907e-04], [ 1.0786e-03, -4.7684e-03, 1.6797e-04, ..., -2.7618e-03, 3.1066e-04, 3.1757e-04], [-5.4806e-05, 4.3907e-03, -5.7030e-04, ..., 2.4376e-03, 2.5225e-04, 3.1352e-04]], device='cuda:0') Epoch 13, bias, value: tensor([-0.0141, -0.0198, -0.0166, -0.0284, -0.0303, -0.0012, 0.0266, -0.0115, 0.0267, 0.0034], device='cuda:0'), grad: tensor([-0.0012, 0.0144, -0.0036, 0.0174, -0.0041, -0.0256, 0.0018, -0.0023, -0.0058, 0.0091], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 12---------------------------------------------------- epoch 12, time 282.95, cls_loss 0.0668 cls_loss_mapping 0.1031 cls_loss_causal 1.0754 re_mapping 0.0426 re_causal 0.1025 /// teacc 97.88 lr 0.00010000 Epoch 14, weight, value: tensor([[-0.0083, -0.0318, -0.0333, ..., -0.0105, 0.0589, 0.0497], [-0.0443, -0.0477, -0.0565, ..., -0.0329, -0.0368, -0.0391], [-0.0204, -0.0220, 0.0367, ..., -0.0222, 0.0498, -0.0164], ..., [-0.0313, 0.0173, 0.0275, ..., 0.0377, -0.0179, -0.0328], [-0.0612, 0.0295, -0.0643, ..., 0.0112, -0.0270, -0.0261], [ 0.0238, 0.0067, -0.0334, ..., 0.0147, -0.0218, 0.0100]], device='cuda:0'), grad: tensor([[ 1.0672e-03, 8.1778e-04, 2.6298e-04, ..., 1.4839e-03, 1.7238e-04, 1.6046e-04], [ 2.6035e-04, 3.4332e-04, 1.2457e-04, ..., 3.6597e-04, 3.1734e-04, 1.2732e-04], [ 8.8036e-05, 7.3929e-03, 6.3276e-04, ..., 5.8937e-03, 1.5364e-03, 5.2547e-04], ..., [-4.7708e-04, -1.0117e-02, -1.6041e-03, ..., -9.0637e-03, -3.1185e-03, -9.1648e-04], [ 6.3419e-04, -8.1539e-05, 1.8799e-04, ..., -4.4107e-04, 4.7350e-04, -1.5363e-05], [-1.7252e-03, 3.9864e-04, 1.1921e-04, ..., -5.5809e-03, 2.3401e-04, 1.8728e-04]], device='cuda:0') Epoch 14, bias, value: tensor([-0.0142, -0.0196, -0.0165, -0.0288, -0.0305, -0.0015, 0.0266, -0.0113, 0.0271, 0.0035], device='cuda:0'), grad: tensor([ 0.0017, -0.0028, 0.0081, 0.0054, 0.0240, -0.0030, -0.0044, -0.0093, 0.0002, -0.0200], device='cuda:0') 100 0.0001 changing lr epoch 13, time 265.57, cls_loss 0.0709 cls_loss_mapping 0.0945 cls_loss_causal 1.0394 re_mapping 0.0408 re_causal 0.0975 /// teacc 97.69 lr 0.00010000 Epoch 15, weight, value: tensor([[-0.0091, -0.0324, -0.0323, ..., -0.0109, 0.0604, 0.0506], [-0.0450, -0.0492, -0.0588, ..., -0.0338, -0.0384, -0.0395], [-0.0216, -0.0226, 0.0385, ..., -0.0234, 0.0510, -0.0162], ..., [-0.0316, 0.0181, 0.0276, ..., 0.0387, -0.0195, -0.0336], [-0.0627, 0.0293, -0.0656, ..., 0.0120, -0.0271, -0.0262], [ 0.0238, 0.0054, -0.0350, ..., 0.0143, -0.0222, 0.0096]], device='cuda:0'), grad: tensor([[ 3.2127e-05, 7.0453e-05, 3.2693e-05, ..., 1.7524e-05, -1.4992e-03, -1.2417e-03], [ 1.6749e-05, 1.5316e-03, 5.8079e-04, ..., 6.3019e-03, 4.1294e-04, -1.6555e-05], [ 2.8148e-05, -3.1567e-04, -9.8324e-04, ..., 3.6550e-04, -9.9850e-04, 2.0236e-05], ..., [ 1.4067e-04, -4.0283e-03, 3.2353e-04, ..., -1.4124e-03, 4.3273e-04, 1.1021e-04], [ 3.8481e-04, -1.2112e-03, 6.7055e-05, ..., -7.7286e-03, 3.2806e-04, 2.4021e-04], [-1.4853e-04, -5.0402e-04, 2.7776e-04, ..., -5.8317e-04, 3.4952e-04, 2.8372e-04]], device='cuda:0') Epoch 15, bias, value: tensor([-0.0140, -0.0199, -0.0164, -0.0286, -0.0305, -0.0016, 0.0263, -0.0111, 0.0274, 0.0032], device='cuda:0'), grad: tensor([-0.0012, 0.0095, -0.0009, 0.0043, 0.0003, 0.0013, 0.0006, -0.0017, -0.0113, -0.0009], device='cuda:0') 100 0.0001 changing lr epoch 14, time 265.06, cls_loss 0.0640 cls_loss_mapping 0.0867 cls_loss_causal 1.0318 re_mapping 0.0367 re_causal 0.0886 /// teacc 97.86 lr 0.00010000 Epoch 16, weight, value: tensor([[-0.0097, -0.0331, -0.0333, ..., -0.0108, 0.0616, 0.0515], [-0.0463, -0.0507, -0.0607, ..., -0.0348, -0.0401, -0.0395], [-0.0226, -0.0226, 0.0402, ..., -0.0242, 0.0522, -0.0159], ..., [-0.0321, 0.0188, 0.0285, ..., 0.0393, -0.0204, -0.0344], [-0.0651, 0.0297, -0.0685, ..., 0.0120, -0.0280, -0.0273], [ 0.0243, 0.0048, -0.0349, ..., 0.0140, -0.0229, 0.0089]], device='cuda:0'), grad: tensor([[-2.9922e-04, -3.2187e-04, 8.2254e-05, ..., -5.1498e-04, -4.7379e-03, -3.7441e-03], [ 5.4747e-05, 8.0490e-04, 4.3058e-04, ..., 3.7622e-04, 1.1069e-04, -7.5388e-04], [ 6.7234e-05, 8.2550e-03, 3.0479e-03, ..., 3.6669e-04, -9.0265e-04, -1.6117e-04], ..., [ 4.0102e-04, 6.3248e-03, 2.6817e-03, ..., -1.5306e-03, 8.9645e-04, 8.5890e-05], [ 7.3576e-04, 1.7080e-03, 1.7738e-04, ..., 1.0729e-03, 2.7227e-04, 5.3644e-04], [-7.4744e-05, 8.8549e-04, 3.7909e-04, ..., -8.3447e-04, 3.6192e-04, 3.6955e-04]], device='cuda:0') Epoch 16, bias, value: tensor([-0.0141, -0.0198, -0.0164, -0.0287, -0.0308, -0.0009, 0.0264, -0.0111, 0.0270, 0.0032], device='cuda:0'), grad: tensor([-0.0041, -0.0013, 0.0087, -0.0195, 0.0013, 0.0010, 0.0038, 0.0065, 0.0032, 0.0004], device='cuda:0') 100 0.0001 changing lr epoch 15, time 265.54, cls_loss 0.0576 cls_loss_mapping 0.0810 cls_loss_causal 0.9884 re_mapping 0.0355 re_causal 0.0873 /// teacc 97.85 lr 0.00010000 Epoch 17, weight, value: tensor([[-0.0107, -0.0340, -0.0344, ..., -0.0114, 0.0624, 0.0521], [-0.0473, -0.0520, -0.0621, ..., -0.0352, -0.0418, -0.0396], [-0.0239, -0.0223, 0.0421, ..., -0.0247, 0.0536, -0.0153], ..., [-0.0323, 0.0190, 0.0280, ..., 0.0398, -0.0211, -0.0352], [-0.0672, 0.0301, -0.0698, ..., 0.0126, -0.0284, -0.0276], [ 0.0245, 0.0038, -0.0352, ..., 0.0135, -0.0228, 0.0087]], device='cuda:0'), grad: tensor([[ 7.4148e-04, 1.5211e-04, 5.4479e-05, ..., 2.4509e-04, -1.0252e-03, -8.2874e-04], [ 8.4162e-05, 1.1683e-04, 4.3541e-05, ..., 1.3924e-04, 3.9428e-05, 1.4946e-05], [ 1.6582e-04, 5.5313e-04, 3.8218e-04, ..., 6.0225e-04, 3.4833e-04, 3.8362e-04], ..., [-2.2717e-03, -3.6049e-03, -7.1335e-04, ..., -4.6654e-03, -3.4451e-05, -2.2030e-04], [ 2.2686e-04, -4.1544e-05, 7.2837e-05, ..., -2.5034e-04, 2.3580e-04, 3.2783e-05], [-7.3862e-04, 6.4468e-04, 6.2275e-04, ..., 1.6603e-03, 4.7064e-04, 1.9741e-04]], device='cuda:0') Epoch 17, bias, value: tensor([-0.0146, -0.0200, -0.0159, -0.0287, -0.0307, -0.0006, 0.0262, -0.0115, 0.0271, 0.0033], device='cuda:0'), grad: tensor([ 1.5342e-04, -5.4665e-03, 3.0155e-03, 7.4234e-03, -8.9169e-05, -1.4706e-03, 1.3084e-03, -8.3694e-03, 3.7760e-05, 3.4618e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 16---------------------------------------------------- epoch 16, time 282.44, cls_loss 0.0571 cls_loss_mapping 0.0798 cls_loss_causal 0.9809 re_mapping 0.0340 re_causal 0.0860 /// teacc 98.08 lr 0.00010000 Epoch 18, weight, value: tensor([[-0.0110, -0.0343, -0.0353, ..., -0.0113, 0.0640, 0.0532], [-0.0481, -0.0529, -0.0629, ..., -0.0359, -0.0433, -0.0400], [-0.0247, -0.0228, 0.0419, ..., -0.0257, 0.0541, -0.0153], ..., [-0.0325, 0.0199, 0.0305, ..., 0.0408, -0.0220, -0.0357], [-0.0688, 0.0308, -0.0699, ..., 0.0129, -0.0285, -0.0277], [ 0.0247, 0.0030, -0.0367, ..., 0.0130, -0.0229, 0.0085]], device='cuda:0'), grad: tensor([[ 2.3353e-04, -1.3494e-04, 1.1712e-04, ..., 1.3173e-04, -1.4181e-03, -9.3794e-04], [ 1.2863e-04, 3.7813e-04, 7.9513e-05, ..., 1.4830e-04, 2.9588e-04, -4.6968e-05], [ 1.9350e-03, 3.5076e-03, 1.1645e-05, ..., 7.7820e-04, 1.9779e-03, 3.0732e-04], ..., [ 1.3411e-04, -1.3866e-03, -3.0971e-04, ..., -7.6342e-04, 2.0206e-04, 6.9559e-05], [ 2.9230e-04, -9.2077e-04, 8.8096e-05, ..., -6.7043e-04, -7.9060e-04, -6.0618e-05], [ 1.8227e-04, -9.1493e-05, -4.8232e-04, ..., -1.4424e-04, 5.1069e-04, 4.4274e-04]], device='cuda:0') Epoch 18, bias, value: tensor([-0.0139, -0.0200, -0.0165, -0.0290, -0.0306, -0.0008, 0.0257, -0.0112, 0.0278, 0.0030], device='cuda:0'), grad: tensor([-0.0008, -0.0004, 0.0059, -0.0030, 0.0007, -0.0004, 0.0008, -0.0005, -0.0017, -0.0005], device='cuda:0') 100 0.0001 changing lr epoch 17, time 265.80, cls_loss 0.0578 cls_loss_mapping 0.0747 cls_loss_causal 0.9653 re_mapping 0.0321 re_causal 0.0793 /// teacc 97.83 lr 0.00010000 Epoch 19, weight, value: tensor([[-0.0115, -0.0346, -0.0362, ..., -0.0118, 0.0646, 0.0537], [-0.0493, -0.0542, -0.0661, ..., -0.0367, -0.0446, -0.0405], [-0.0255, -0.0233, 0.0436, ..., -0.0266, 0.0558, -0.0149], ..., [-0.0331, 0.0203, 0.0304, ..., 0.0417, -0.0232, -0.0364], [-0.0702, 0.0316, -0.0708, ..., 0.0133, -0.0289, -0.0282], [ 0.0250, 0.0025, -0.0362, ..., 0.0127, -0.0235, 0.0081]], device='cuda:0'), grad: tensor([[-1.0228e-04, 1.2040e-04, 2.4331e-04, ..., 2.0587e-04, -1.6756e-03, -1.8559e-03], [ 5.5164e-05, 4.7708e-04, 2.9135e-04, ..., 8.1491e-04, 1.2362e-04, 3.7521e-05], [-1.3649e-04, -1.3857e-03, -1.9798e-03, ..., -3.0975e-03, -1.0994e-02, -3.7346e-03], ..., [-1.0055e-04, -5.2881e-04, -8.5640e-04, ..., 2.3499e-03, 4.2419e-03, 3.4313e-03], [ 9.1887e-04, 3.6812e-04, -1.4544e-04, ..., -2.7580e-03, 4.5538e-04, 3.5357e-04], [-4.2677e-04, -1.6677e-04, -2.3805e-06, ..., -1.2070e-04, 3.7479e-04, 1.9300e-04]], device='cuda:0') Epoch 19, bias, value: tensor([-0.0140, -0.0204, -0.0158, -0.0290, -0.0310, -0.0008, 0.0260, -0.0112, 0.0281, 0.0027], device='cuda:0'), grad: tensor([-0.0008, 0.0029, -0.0085, 0.0013, 0.0049, 0.0067, 0.0026, 0.0011, -0.0097, -0.0004], device='cuda:0') 100 0.0001 changing lr epoch 18, time 265.19, cls_loss 0.0517 cls_loss_mapping 0.0738 cls_loss_causal 0.9729 re_mapping 0.0319 re_causal 0.0811 /// teacc 97.87 lr 0.00010000 Epoch 20, weight, value: tensor([[-0.0127, -0.0353, -0.0372, ..., -0.0121, 0.0654, 0.0541], [-0.0512, -0.0553, -0.0675, ..., -0.0374, -0.0453, -0.0408], [-0.0273, -0.0239, 0.0439, ..., -0.0278, 0.0565, -0.0148], ..., [-0.0339, 0.0207, 0.0295, ..., 0.0423, -0.0244, -0.0375], [-0.0712, 0.0313, -0.0719, ..., 0.0136, -0.0297, -0.0284], [ 0.0256, 0.0015, -0.0367, ..., 0.0123, -0.0232, 0.0084]], device='cuda:0'), grad: tensor([[-1.0097e-04, 3.1972e-04, 3.2330e-04, ..., 3.2115e-04, 1.9569e-03, 7.4768e-04], [ 5.6833e-05, 6.4754e-04, 2.2784e-05, ..., 2.9445e-04, 1.2469e-04, 9.7454e-05], [ 8.6904e-05, -1.6081e-04, -3.7026e-04, ..., 2.0623e-04, -3.1891e-03, -1.6003e-03], ..., [ 1.1134e-04, -5.0049e-03, 1.0639e-04, ..., -2.5291e-03, 3.4809e-04, 1.1009e-04], [ 4.6825e-04, -8.8751e-05, 9.2208e-05, ..., 2.4843e-04, -1.6284e-04, -3.2926e-04], [-6.8903e-04, -2.9254e-04, -2.4050e-05, ..., -5.8365e-04, 1.9264e-04, 1.8930e-04]], device='cuda:0') Epoch 20, bias, value: tensor([-0.0145, -0.0205, -0.0162, -0.0283, -0.0308, -0.0008, 0.0258, -0.0115, 0.0282, 0.0028], device='cuda:0'), grad: tensor([ 0.0017, 0.0004, -0.0016, 0.0063, 0.0006, 0.0006, -0.0002, -0.0034, 0.0007, -0.0053], device='cuda:0') 100 0.0001 changing lr epoch 19, time 265.07, cls_loss 0.0492 cls_loss_mapping 0.0693 cls_loss_causal 0.9366 re_mapping 0.0311 re_causal 0.0800 /// teacc 98.00 lr 0.00010000 Epoch 21, weight, value: tensor([[-0.0122, -0.0357, -0.0349, ..., -0.0124, 0.0668, 0.0552], [-0.0526, -0.0569, -0.0690, ..., -0.0380, -0.0468, -0.0414], [-0.0284, -0.0242, 0.0451, ..., -0.0287, 0.0573, -0.0146], ..., [-0.0339, 0.0211, 0.0294, ..., 0.0430, -0.0256, -0.0381], [-0.0725, 0.0322, -0.0730, ..., 0.0140, -0.0295, -0.0283], [ 0.0254, 0.0007, -0.0377, ..., 0.0119, -0.0233, 0.0079]], device='cuda:0'), grad: tensor([[-2.9106e-03, -1.7376e-03, 3.3639e-06, ..., 1.6674e-05, -3.6526e-03, -2.6703e-03], [ 5.2786e-04, 1.2608e-03, 3.9190e-06, ..., 6.0034e-04, 3.3170e-05, 1.2316e-05], [ 2.4796e-04, 1.0881e-03, 6.6459e-05, ..., 4.3178e-04, 4.2582e-04, 3.3855e-04], ..., [ 3.0422e-04, -2.1725e-03, -2.0707e-04, ..., -1.1654e-03, 1.4114e-04, 1.1230e-04], [ 6.0749e-04, -2.4319e-03, 4.5411e-06, ..., -1.3628e-03, 2.5415e-04, 2.1744e-04], [-2.3937e-03, 1.9140e-03, 2.5839e-05, ..., 8.4114e-04, 1.4324e-03, 9.8896e-04]], device='cuda:0') Epoch 21, bias, value: tensor([-0.0138, -0.0204, -0.0161, -0.0281, -0.0309, -0.0015, 0.0260, -0.0113, 0.0285, 0.0024], device='cuda:0'), grad: tensor([-0.0040, 0.0057, 0.0019, 0.0030, 0.0070, -0.0004, 0.0004, -0.0007, -0.0080, -0.0048], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 20---------------------------------------------------- epoch 20, time 281.30, cls_loss 0.0417 cls_loss_mapping 0.0594 cls_loss_causal 0.8965 re_mapping 0.0301 re_causal 0.0776 /// teacc 98.20 lr 0.00010000 Epoch 22, weight, value: tensor([[-0.0130, -0.0363, -0.0351, ..., -0.0127, 0.0678, 0.0559], [-0.0537, -0.0582, -0.0701, ..., -0.0384, -0.0479, -0.0414], [-0.0289, -0.0244, 0.0459, ..., -0.0294, 0.0582, -0.0141], ..., [-0.0341, 0.0221, 0.0292, ..., 0.0437, -0.0262, -0.0391], [-0.0737, 0.0329, -0.0729, ..., 0.0142, -0.0297, -0.0284], [ 0.0262, -0.0002, -0.0380, ..., 0.0118, -0.0236, 0.0075]], device='cuda:0'), grad: tensor([[ 9.3889e-04, 8.0645e-05, 3.8505e-05, ..., 7.6890e-05, 2.7275e-04, 3.1447e-04], [ 2.2352e-05, 4.8459e-05, 6.3896e-05, ..., 1.6361e-05, 7.4744e-05, -1.4508e-04], [ 5.1945e-05, -4.6206e-04, -1.5993e-03, ..., 1.2502e-05, -1.0929e-03, 2.0349e-04], ..., [ 2.9027e-05, 6.9797e-05, 5.3358e-04, ..., -7.2062e-05, 4.1699e-04, 2.9266e-05], [ 1.6057e-04, 3.9846e-05, 4.0799e-05, ..., 4.6492e-05, 1.4389e-04, 1.1462e-04], [-6.3002e-05, 4.2081e-05, 3.9428e-05, ..., -5.2750e-05, 1.5533e-04, 9.2089e-05]], device='cuda:0') Epoch 22, bias, value: tensor([-0.0138, -0.0205, -0.0160, -0.0285, -0.0311, -0.0014, 0.0256, -0.0109, 0.0285, 0.0025], device='cuda:0'), grad: tensor([ 5.2500e-04, -2.5806e-03, -1.3514e-03, 1.0576e-03, 1.5154e-03, 4.9973e-04, -1.0567e-03, 9.6035e-04, 4.4799e-04, -2.0161e-05], device='cuda:0') 100 0.0001 changing lr epoch 21, time 265.82, cls_loss 0.0366 cls_loss_mapping 0.0546 cls_loss_causal 0.8627 re_mapping 0.0293 re_causal 0.0739 /// teacc 98.10 lr 0.00010000 Epoch 23, weight, value: tensor([[-0.0133, -0.0367, -0.0352, ..., -0.0130, 0.0688, 0.0566], [-0.0547, -0.0592, -0.0716, ..., -0.0392, -0.0492, -0.0416], [-0.0298, -0.0247, 0.0469, ..., -0.0302, 0.0594, -0.0133], ..., [-0.0347, 0.0225, 0.0289, ..., 0.0447, -0.0272, -0.0402], [-0.0753, 0.0330, -0.0731, ..., 0.0142, -0.0299, -0.0290], [ 0.0267, -0.0007, -0.0379, ..., 0.0115, -0.0239, 0.0069]], device='cuda:0'), grad: tensor([[ 3.0851e-04, 1.6403e-04, 4.5896e-05, ..., 1.1301e-04, -4.5228e-04, -3.0017e-04], [ 7.6413e-05, 6.7472e-05, -1.1957e-04, ..., 3.5644e-05, 3.4034e-05, -3.4869e-05], [ 3.4142e-03, 2.5654e-04, -1.7395e-03, ..., 4.3106e-04, -1.0562e-04, -7.8392e-04], ..., [ 7.8440e-05, 1.1196e-03, 3.4928e-04, ..., 7.2193e-04, 1.5373e-03, 1.3628e-03], [ 3.2406e-03, 3.1829e-04, 6.7759e-04, ..., 3.9291e-03, 3.5048e-04, 9.1136e-05], [-1.2231e-04, 1.0175e-04, 2.1219e-04, ..., -3.9876e-05, 3.4165e-04, 1.0741e-04]], device='cuda:0') Epoch 23, bias, value: tensor([-0.0137, -0.0209, -0.0156, -0.0286, -0.0314, -0.0013, 0.0255, -0.0110, 0.0287, 0.0026], device='cuda:0'), grad: tensor([ 0.0002, -0.0009, 0.0019, -0.0095, -0.0003, 0.0002, 0.0004, 0.0022, 0.0052, 0.0005], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 22---------------------------------------------------- epoch 22, time 282.33, cls_loss 0.0412 cls_loss_mapping 0.0602 cls_loss_causal 0.8785 re_mapping 0.0275 re_causal 0.0709 /// teacc 98.34 lr 0.00010000 Epoch 24, weight, value: tensor([[-0.0137, -0.0374, -0.0356, ..., -0.0132, 0.0696, 0.0573], [-0.0556, -0.0607, -0.0726, ..., -0.0393, -0.0502, -0.0415], [-0.0311, -0.0256, 0.0476, ..., -0.0314, 0.0602, -0.0130], ..., [-0.0358, 0.0234, 0.0304, ..., 0.0455, -0.0282, -0.0408], [-0.0767, 0.0336, -0.0737, ..., 0.0144, -0.0300, -0.0294], [ 0.0271, -0.0016, -0.0381, ..., 0.0111, -0.0233, 0.0071]], device='cuda:0'), grad: tensor([[ 1.2565e-04, 4.5359e-05, 2.7642e-05, ..., 5.3756e-06, 4.8786e-05, 4.8757e-05], [ 1.1563e-04, 2.6584e-04, 7.7724e-04, ..., 1.0306e-04, 3.6001e-04, 1.0133e-04], [ 3.5167e-05, -1.5091e-02, -3.3989e-03, ..., 4.0978e-05, -5.6124e-04, -2.2054e-04], ..., [-2.4605e-03, -5.5075e-04, 2.2221e-04, ..., -2.7828e-03, 7.7665e-05, 3.8087e-05], [ 1.3101e-04, 4.4250e-03, 7.4673e-04, ..., -4.8429e-06, 9.8273e-06, -2.8953e-05], [ 2.2945e-03, 1.0462e-03, -1.8269e-05, ..., 2.0466e-03, 7.0691e-05, 6.5386e-05]], device='cuda:0') Epoch 24, bias, value: tensor([-0.0139, -0.0201, -0.0162, -0.0283, -0.0315, -0.0014, 0.0254, -0.0110, 0.0288, 0.0025], device='cuda:0'), grad: tensor([ 3.6669e-04, 4.7302e-04, -1.6617e-02, 7.6637e-03, 1.4343e-03, 2.6588e-03, 1.0133e-05, -6.0463e-03, 3.9215e-03, 6.1378e-03], device='cuda:0') 100 0.0001 changing lr epoch 23, time 265.34, cls_loss 0.0390 cls_loss_mapping 0.0519 cls_loss_causal 0.8758 re_mapping 0.0267 re_causal 0.0688 /// teacc 98.29 lr 0.00010000 Epoch 25, weight, value: tensor([[-0.0140, -0.0380, -0.0361, ..., -0.0132, 0.0703, 0.0578], [-0.0564, -0.0619, -0.0744, ..., -0.0400, -0.0524, -0.0423], [-0.0318, -0.0255, 0.0495, ..., -0.0321, 0.0614, -0.0122], ..., [-0.0358, 0.0238, 0.0305, ..., 0.0459, -0.0289, -0.0415], [-0.0783, 0.0337, -0.0752, ..., 0.0146, -0.0306, -0.0296], [ 0.0279, -0.0022, -0.0387, ..., 0.0108, -0.0239, 0.0065]], device='cuda:0'), grad: tensor([[ 2.0683e-04, 5.2214e-04, 1.5587e-05, ..., 3.4839e-05, 1.4524e-03, 1.2064e-03], [ 1.7011e-04, 2.3162e-04, 6.7055e-05, ..., 6.6876e-05, 3.8791e-04, 2.4235e-04], [ 5.4598e-04, 1.1005e-03, 5.7489e-05, ..., 2.0778e-04, 5.7840e-04, 4.3511e-04], ..., [ 1.8612e-05, -7.9012e-04, -4.6611e-05, ..., -7.7772e-04, 6.2466e-05, 8.1301e-05], [ 1.2541e-04, -2.3448e-04, 2.9221e-05, ..., -7.2658e-05, -2.1038e-03, -1.7824e-03], [ 8.1444e-04, 9.0599e-04, 1.3041e-04, ..., 2.6965e-04, 1.8954e-04, 1.8978e-04]], device='cuda:0') Epoch 25, bias, value: tensor([-0.0140, -0.0199, -0.0163, -0.0284, -0.0313, -0.0012, 0.0250, -0.0110, 0.0288, 0.0027], device='cuda:0'), grad: tensor([ 0.0036, 0.0008, 0.0023, -0.0024, 0.0004, 0.0007, -0.0018, -0.0008, -0.0045, 0.0016], device='cuda:0') 100 0.0001 changing lr epoch 24, time 265.08, cls_loss 0.0397 cls_loss_mapping 0.0607 cls_loss_causal 0.8547 re_mapping 0.0276 re_causal 0.0704 /// teacc 98.30 lr 0.00010000 Epoch 26, weight, value: tensor([[-0.0142, -0.0382, -0.0359, ..., -0.0133, 0.0714, 0.0586], [-0.0572, -0.0626, -0.0758, ..., -0.0400, -0.0542, -0.0422], [-0.0328, -0.0261, 0.0500, ..., -0.0327, 0.0625, -0.0119], ..., [-0.0361, 0.0240, 0.0302, ..., 0.0469, -0.0300, -0.0427], [-0.0795, 0.0342, -0.0759, ..., 0.0149, -0.0309, -0.0299], [ 0.0280, -0.0030, -0.0394, ..., 0.0102, -0.0242, 0.0060]], device='cuda:0'), grad: tensor([[ 2.7657e-04, -5.1558e-05, 9.2834e-06, ..., 1.2159e-04, 2.9964e-03, 1.7128e-03], [ 6.0380e-05, 1.0687e-04, 1.6674e-05, ..., 7.5042e-05, 3.0541e-04, 1.8775e-04], [ 3.5286e-05, -1.7357e-03, -1.6189e-04, ..., 3.2997e-04, -7.0152e-03, -3.6488e-03], ..., [ 4.4775e-04, -8.9645e-04, 2.2912e-04, ..., -9.5129e-05, 2.2030e-04, -1.0991e-04], [ 6.8760e-04, 1.0710e-03, 1.6466e-05, ..., 5.8079e-04, 1.0767e-03, 6.8378e-04], [ 2.8305e-03, -1.2636e-04, 2.3305e-04, ..., 4.1351e-03, 2.9683e-04, 2.1410e-04]], device='cuda:0') Epoch 26, bias, value: tensor([-0.0137, -0.0201, -0.0158, -0.0282, -0.0315, -0.0015, 0.0251, -0.0109, 0.0288, 0.0022], device='cuda:0'), grad: tensor([ 0.0022, 0.0002, -0.0067, 0.0040, -0.0171, 0.0020, -0.0024, 0.0015, 0.0034, 0.0129], device='cuda:0') 100 0.0001 changing lr epoch 25, time 264.81, cls_loss 0.0334 cls_loss_mapping 0.0535 cls_loss_causal 0.8335 re_mapping 0.0263 re_causal 0.0676 /// teacc 98.14 lr 0.00010000 Epoch 27, weight, value: tensor([[-0.0149, -0.0387, -0.0364, ..., -0.0136, 0.0719, 0.0590], [-0.0579, -0.0646, -0.0763, ..., -0.0412, -0.0545, -0.0419], [-0.0333, -0.0264, 0.0509, ..., -0.0331, 0.0633, -0.0114], ..., [-0.0367, 0.0249, 0.0300, ..., 0.0477, -0.0311, -0.0435], [-0.0811, 0.0343, -0.0764, ..., 0.0151, -0.0313, -0.0304], [ 0.0285, -0.0034, -0.0400, ..., 0.0100, -0.0243, 0.0056]], device='cuda:0'), grad: tensor([[ 1.4558e-05, 4.0829e-05, 4.6998e-05, ..., 4.0144e-05, 1.5080e-04, -3.7074e-05], [ 1.2949e-05, 5.7101e-05, 1.0198e-04, ..., 1.4257e-04, 3.4690e-05, -1.0028e-05], [ 1.8001e-05, 6.5625e-05, 1.0714e-05, ..., 1.1075e-04, 1.2577e-04, -2.1219e-05], ..., [-1.1347e-05, -1.3542e-04, 7.9036e-05, ..., -1.9908e-04, 1.5008e-04, 2.7716e-05], [ 5.8383e-05, -2.8396e-04, 5.3167e-05, ..., -2.9945e-04, -6.1274e-05, -1.2733e-05], [-2.2972e-04, -2.9254e-04, 6.8605e-05, ..., 1.4651e-04, 3.1614e-04, 9.8944e-06]], device='cuda:0') Epoch 27, bias, value: tensor([-0.0139, -0.0202, -0.0154, -0.0284, -0.0311, -0.0016, 0.0253, -0.0108, 0.0284, 0.0023], device='cuda:0'), grad: tensor([ 0.0004, 0.0005, 0.0005, 0.0006, -0.0032, 0.0004, 0.0007, 0.0005, -0.0002, -0.0003], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 26---------------------------------------------------- epoch 26, time 282.30, cls_loss 0.0337 cls_loss_mapping 0.0485 cls_loss_causal 0.8751 re_mapping 0.0241 re_causal 0.0648 /// teacc 98.40 lr 0.00010000 Epoch 28, weight, value: tensor([[-0.0153, -0.0391, -0.0365, ..., -0.0137, 0.0729, 0.0597], [-0.0588, -0.0654, -0.0776, ..., -0.0415, -0.0553, -0.0420], [-0.0339, -0.0275, 0.0512, ..., -0.0344, 0.0642, -0.0114], ..., [-0.0372, 0.0256, 0.0307, ..., 0.0487, -0.0319, -0.0441], [-0.0823, 0.0349, -0.0768, ..., 0.0152, -0.0313, -0.0302], [ 0.0287, -0.0036, -0.0405, ..., 0.0095, -0.0249, 0.0051]], device='cuda:0'), grad: tensor([[-1.0042e-03, -6.8235e-04, 2.0027e-05, ..., -2.8553e-03, -5.8556e-04, -1.3151e-03], [ 3.1680e-05, 6.6936e-05, 1.0423e-05, ..., 5.3495e-05, 2.2098e-05, 1.3724e-05], [ 2.7135e-05, 8.2552e-05, 4.6223e-05, ..., 9.2506e-05, 8.1301e-05, 3.6657e-05], ..., [ 5.8222e-04, -7.0953e-04, 1.4722e-04, ..., 5.4884e-04, 2.8133e-04, 5.4932e-04], [ 8.3923e-05, 6.9618e-05, 1.8850e-05, ..., -8.5890e-05, -7.0632e-05, 7.2896e-05], [-1.8132e-04, -1.9491e-04, -1.2481e-04, ..., -4.5180e-05, 1.1134e-04, 3.5673e-05]], device='cuda:0') Epoch 28, bias, value: tensor([-0.0140, -0.0202, -0.0159, -0.0286, -0.0313, -0.0015, 0.0254, -0.0108, 0.0286, 0.0026], device='cuda:0'), grad: tensor([-2.5196e-03, -4.1223e-04, 2.9564e-04, 8.1158e-04, -1.4865e-04, 1.4057e-03, 2.6494e-05, 1.0033e-03, -8.6278e-06, -4.5371e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 27---------------------------------------------------- epoch 27, time 282.18, cls_loss 0.0329 cls_loss_mapping 0.0475 cls_loss_causal 0.8096 re_mapping 0.0249 re_causal 0.0653 /// teacc 98.48 lr 0.00010000 Epoch 29, weight, value: tensor([[-0.0153, -0.0395, -0.0373, ..., -0.0139, 0.0739, 0.0604], [-0.0593, -0.0665, -0.0777, ..., -0.0422, -0.0566, -0.0420], [-0.0345, -0.0277, 0.0530, ..., -0.0350, 0.0652, -0.0109], ..., [-0.0376, 0.0252, 0.0314, ..., 0.0491, -0.0330, -0.0449], [-0.0827, 0.0351, -0.0782, ..., 0.0157, -0.0319, -0.0306], [ 0.0290, -0.0046, -0.0413, ..., 0.0090, -0.0257, 0.0046]], device='cuda:0'), grad: tensor([[ 6.5625e-05, 7.7605e-05, 7.0930e-05, ..., 4.1366e-05, 4.6849e-04, 2.8062e-04], [ 2.9340e-05, 9.6738e-05, 9.3654e-06, ..., 6.3181e-05, 1.1581e-04, -4.3482e-05], [ 3.2902e-05, 1.1826e-04, -3.9190e-06, ..., 1.5986e-04, -1.2474e-03, -7.5722e-04], ..., [ 6.6161e-05, -9.0075e-04, 3.5405e-05, ..., -7.6342e-04, 8.1897e-05, 5.1588e-05], [ 1.9276e-04, 2.7609e-04, 4.1276e-05, ..., -7.2956e-05, 4.0817e-04, 2.7704e-04], [-1.6117e-04, 9.6202e-05, -8.8453e-05, ..., 1.2815e-04, 2.0325e-04, 6.2466e-05]], device='cuda:0') Epoch 29, bias, value: tensor([-0.0137, -0.0195, -0.0159, -0.0282, -0.0313, -0.0013, 0.0248, -0.0111, 0.0288, 0.0020], device='cuda:0'), grad: tensor([ 0.0007, -0.0008, -0.0006, -0.0001, 0.0004, 0.0004, 0.0004, -0.0008, 0.0007, -0.0002], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 28---------------------------------------------------- epoch 28, time 281.63, cls_loss 0.0341 cls_loss_mapping 0.0507 cls_loss_causal 0.8270 re_mapping 0.0236 re_causal 0.0618 /// teacc 98.52 lr 0.00010000 Epoch 30, weight, value: tensor([[-0.0160, -0.0404, -0.0380, ..., -0.0142, 0.0742, 0.0606], [-0.0602, -0.0682, -0.0791, ..., -0.0433, -0.0571, -0.0420], [-0.0349, -0.0278, 0.0539, ..., -0.0359, 0.0665, -0.0105], ..., [-0.0382, 0.0259, 0.0313, ..., 0.0500, -0.0338, -0.0452], [-0.0836, 0.0351, -0.0792, ..., 0.0160, -0.0325, -0.0309], [ 0.0292, -0.0052, -0.0411, ..., 0.0088, -0.0261, 0.0042]], device='cuda:0'), grad: tensor([[ 2.3997e-04, 4.9591e-05, -5.3978e-04, ..., 8.4102e-05, -2.6817e-03, -1.6956e-03], [ 8.3625e-05, 9.1493e-05, 1.1659e-04, ..., 8.0585e-05, 2.1660e-04, 1.4651e-04], [ 1.2316e-05, 7.2002e-05, 3.7527e-04, ..., 2.9117e-05, 1.1606e-03, 6.8092e-04], ..., [ 2.4116e-04, -7.4744e-05, 9.3102e-05, ..., -7.4029e-05, 1.5163e-04, 8.9824e-05], [ 1.7557e-03, 4.7952e-05, 3.6812e-04, ..., 1.1921e-03, 4.1866e-04, 4.9210e-04], [-5.3024e-04, 2.4700e-03, 3.2578e-03, ..., -1.1110e-04, 8.7976e-05, 5.6714e-05]], device='cuda:0') Epoch 30, bias, value: tensor([-0.0142, -0.0198, -0.0155, -0.0280, -0.0313, -0.0015, 0.0244, -0.0109, 0.0291, 0.0017], device='cuda:0'), grad: tensor([-0.0012, 0.0001, 0.0010, -0.0148, 0.0014, 0.0131, -0.0146, 0.0005, 0.0029, 0.0115], device='cuda:0') 100 0.0001 changing lr epoch 29, time 266.35, cls_loss 0.0318 cls_loss_mapping 0.0457 cls_loss_causal 0.8244 re_mapping 0.0239 re_causal 0.0629 /// teacc 98.47 lr 0.00010000 Epoch 31, weight, value: tensor([[-0.0168, -0.0412, -0.0380, ..., -0.0145, 0.0748, 0.0610], [-0.0611, -0.0696, -0.0803, ..., -0.0443, -0.0587, -0.0429], [-0.0356, -0.0275, 0.0551, ..., -0.0366, 0.0674, -0.0100], ..., [-0.0383, 0.0266, 0.0314, ..., 0.0510, -0.0349, -0.0459], [-0.0844, 0.0354, -0.0795, ..., 0.0164, -0.0326, -0.0307], [ 0.0297, -0.0057, -0.0415, ..., 0.0084, -0.0260, 0.0042]], device='cuda:0'), grad: tensor([[ 4.9472e-05, 3.1531e-05, 1.8096e-06, ..., 3.0756e-05, 1.5363e-05, 1.1697e-05], [ 3.5942e-05, 1.2894e-03, 7.1712e-06, ..., 1.6241e-03, 1.0824e-04, 1.9103e-05], [ 3.3677e-05, 3.7003e-04, -4.9859e-05, ..., 6.6614e-04, -3.5429e-04, -1.5402e-04], ..., [ 4.9263e-05, -2.3785e-03, 3.6601e-06, ..., -3.2692e-03, 1.3876e-04, 1.0002e-04], [-1.3924e-04, -1.7083e-04, 4.0196e-06, ..., 3.8409e-04, 3.1978e-05, 2.5690e-05], [ 7.0333e-04, 1.7369e-04, 2.5570e-05, ..., 2.3234e-04, 8.6101e-07, 7.2550e-07]], device='cuda:0') Epoch 31, bias, value: tensor([-0.0145, -0.0201, -0.0154, -0.0284, -0.0310, -0.0017, 0.0247, -0.0108, 0.0296, 0.0018], device='cuda:0'), grad: tensor([ 0.0001, 0.0024, 0.0008, 0.0015, -0.0014, -0.0013, 0.0006, -0.0042, -0.0002, 0.0017], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 30---------------------------------------------------- epoch 30, time 281.97, cls_loss 0.0308 cls_loss_mapping 0.0474 cls_loss_causal 0.8714 re_mapping 0.0224 re_causal 0.0631 /// teacc 98.54 lr 0.00010000 Epoch 32, weight, value: tensor([[-0.0174, -0.0415, -0.0384, ..., -0.0150, 0.0757, 0.0616], [-0.0623, -0.0710, -0.0813, ..., -0.0452, -0.0595, -0.0431], [-0.0357, -0.0278, 0.0556, ..., -0.0373, 0.0680, -0.0098], ..., [-0.0391, 0.0273, 0.0316, ..., 0.0517, -0.0357, -0.0464], [-0.0852, 0.0357, -0.0796, ..., 0.0164, -0.0327, -0.0308], [ 0.0299, -0.0063, -0.0414, ..., 0.0080, -0.0263, 0.0040]], device='cuda:0'), grad: tensor([[ 9.3997e-05, 5.1439e-05, 5.5172e-06, ..., -8.8871e-05, -3.0351e-04, -3.2258e-04], [ 5.2899e-05, 9.7990e-05, 1.9893e-05, ..., 3.9876e-05, 1.5533e-04, 5.6535e-05], [ 4.0948e-05, 6.6943e-06, -8.3029e-05, ..., 2.5094e-05, -1.6057e-04, -4.2021e-05], ..., [ 2.3156e-05, -3.2806e-04, 1.4700e-05, ..., -2.3806e-04, 9.1612e-05, 9.2685e-05], [ 8.6260e-04, 6.9332e-04, 8.1286e-06, ..., 6.8724e-05, 5.2309e-04, 4.6873e-04], [-1.1384e-04, 3.3915e-05, 7.2271e-06, ..., 1.0300e-04, 1.5974e-05, 4.2140e-05]], device='cuda:0') Epoch 32, bias, value: tensor([-0.0141, -0.0207, -0.0156, -0.0284, -0.0305, -0.0016, 0.0243, -0.0109, 0.0300, 0.0015], device='cuda:0'), grad: tensor([ 3.9250e-05, -6.3956e-05, 3.9816e-04, 4.8971e-04, 7.4005e-04, 1.0376e-03, -6.5918e-03, 9.9063e-05, 4.0779e-03, -2.2602e-04], device='cuda:0') 100 0.0001 changing lr epoch 31, time 265.20, cls_loss 0.0348 cls_loss_mapping 0.0505 cls_loss_causal 0.7935 re_mapping 0.0221 re_causal 0.0547 /// teacc 98.52 lr 0.00010000 Epoch 33, weight, value: tensor([[-0.0179, -0.0424, -0.0384, ..., -0.0151, 0.0766, 0.0625], [-0.0633, -0.0725, -0.0823, ..., -0.0457, -0.0602, -0.0426], [-0.0365, -0.0283, 0.0560, ..., -0.0381, 0.0688, -0.0093], ..., [-0.0396, 0.0282, 0.0337, ..., 0.0527, -0.0363, -0.0470], [-0.0865, 0.0354, -0.0806, ..., 0.0166, -0.0335, -0.0314], [ 0.0300, -0.0068, -0.0430, ..., 0.0074, -0.0268, 0.0030]], device='cuda:0'), grad: tensor([[ 3.0965e-05, 4.7445e-05, 2.0251e-05, ..., 3.3945e-05, 1.8954e-05, 2.4170e-05], [ 1.9565e-05, 2.8181e-04, 9.1076e-05, ..., 8.9765e-05, 6.0856e-05, 8.1122e-05], [ 1.9580e-05, 7.5459e-05, -2.4986e-04, ..., -1.3125e-04, -2.0468e-04, -7.9751e-05], ..., [ 3.2902e-05, 8.1444e-04, 1.1039e-04, ..., -3.4070e-04, 8.8155e-05, 6.7532e-05], [ 1.0926e-04, 1.6165e-04, 2.0647e-04, ..., 4.6670e-05, 3.5644e-05, -1.2457e-04], [ 1.6773e-04, -3.7479e-04, -1.7059e-04, ..., 9.2208e-05, 5.0329e-06, 1.5087e-05]], device='cuda:0') Epoch 33, bias, value: tensor([-0.0143, -0.0204, -0.0157, -0.0286, -0.0306, -0.0012, 0.0244, -0.0106, 0.0296, 0.0016], device='cuda:0'), grad: tensor([ 0.0002, 0.0005, -0.0002, -0.0004, -0.0004, -0.0008, 0.0006, 0.0014, -0.0017, 0.0007], device='cuda:0') 100 0.0001 changing lr epoch 32, time 265.23, cls_loss 0.0299 cls_loss_mapping 0.0459 cls_loss_causal 0.7943 re_mapping 0.0219 re_causal 0.0568 /// teacc 98.50 lr 0.00010000 Epoch 34, weight, value: tensor([[-0.0188, -0.0429, -0.0388, ..., -0.0157, 0.0772, 0.0631], [-0.0643, -0.0739, -0.0834, ..., -0.0461, -0.0612, -0.0427], [-0.0372, -0.0291, 0.0563, ..., -0.0391, 0.0690, -0.0095], ..., [-0.0402, 0.0282, 0.0340, ..., 0.0531, -0.0360, -0.0473], [-0.0872, 0.0368, -0.0817, ..., 0.0173, -0.0336, -0.0315], [ 0.0305, -0.0065, -0.0428, ..., 0.0071, -0.0271, 0.0026]], device='cuda:0'), grad: tensor([[ 1.3962e-05, 2.5004e-05, 5.5507e-06, ..., 9.5293e-06, -8.6844e-05, -7.7724e-05], [ 7.3239e-06, 6.9380e-05, -4.4417e-04, ..., 4.1813e-05, 4.5858e-06, 2.7642e-06], [ 3.1680e-05, 3.6788e-04, 7.2241e-05, ..., 1.2141e-04, 2.5228e-05, 4.1515e-05], ..., [-4.6045e-06, -4.7994e-04, -1.1250e-05, ..., -3.9077e-04, 1.6153e-05, 2.9150e-06], [ 3.7432e-05, 3.5167e-05, 9.6977e-05, ..., 9.1717e-06, 1.6466e-05, 2.0489e-08], [-9.0718e-05, -2.3592e-04, -1.2136e-04, ..., -1.6248e-04, 4.2558e-05, 2.9862e-05]], device='cuda:0') Epoch 34, bias, value: tensor([-0.0142, -0.0206, -0.0159, -0.0286, -0.0308, -0.0015, 0.0245, -0.0107, 0.0301, 0.0017], device='cuda:0'), grad: tensor([-2.1741e-05, -1.2646e-03, 6.4230e-04, 7.8201e-05, 1.2970e-03, 3.6311e-04, 3.2711e-04, -3.7026e-04, 3.2926e-04, -1.3809e-03], device='cuda:0') 100 0.0001 changing lr epoch 33, time 265.40, cls_loss 0.0240 cls_loss_mapping 0.0373 cls_loss_causal 0.7918 re_mapping 0.0217 re_causal 0.0574 /// teacc 98.42 lr 0.00010000 Epoch 35, weight, value: tensor([[-0.0193, -0.0436, -0.0391, ..., -0.0158, 0.0777, 0.0634], [-0.0649, -0.0742, -0.0847, ..., -0.0467, -0.0625, -0.0430], [-0.0379, -0.0294, 0.0572, ..., -0.0398, 0.0702, -0.0090], ..., [-0.0405, 0.0286, 0.0343, ..., 0.0538, -0.0367, -0.0477], [-0.0882, 0.0375, -0.0823, ..., 0.0178, -0.0338, -0.0317], [ 0.0307, -0.0072, -0.0431, ..., 0.0066, -0.0276, 0.0024]], device='cuda:0'), grad: tensor([[ 1.0526e-04, 4.4733e-05, 6.3032e-06, ..., 8.2791e-05, -2.2483e-04, -1.2565e-04], [ 1.6347e-05, 6.2764e-05, 6.4075e-05, ..., 5.2601e-05, 2.9027e-05, 6.0722e-06], [ 1.2264e-05, -9.0778e-05, -3.4499e-04, ..., 5.6058e-05, -1.7846e-04, 3.7458e-06], ..., [-1.6898e-05, -4.4298e-04, -4.7743e-05, ..., -5.7650e-04, 1.8165e-05, 1.0081e-05], [ 2.5320e-04, 1.3316e-04, 2.8443e-04, ..., 5.4389e-05, 2.2817e-04, 1.3161e-04], [ 4.9561e-05, 1.8060e-04, 1.5688e-04, ..., 2.6107e-04, 3.5346e-05, 2.4989e-05]], device='cuda:0') Epoch 35, bias, value: tensor([-0.0143, -0.0207, -0.0155, -0.0283, -0.0309, -0.0022, 0.0249, -0.0106, 0.0302, 0.0016], device='cuda:0'), grad: tensor([-2.0832e-05, 1.3697e-04, -9.7942e-04, 4.3631e-04, -2.7966e-04, -4.7684e-04, -1.0949e-04, -4.3726e-04, 1.2121e-03, 5.2071e-04], device='cuda:0') 100 0.0001 changing lr epoch 34, time 264.97, cls_loss 0.0260 cls_loss_mapping 0.0348 cls_loss_causal 0.7714 re_mapping 0.0205 re_causal 0.0515 /// teacc 98.53 lr 0.00010000 Epoch 36, weight, value: tensor([[-0.0198, -0.0440, -0.0394, ..., -0.0160, 0.0782, 0.0637], [-0.0661, -0.0752, -0.0854, ..., -0.0474, -0.0628, -0.0436], [-0.0387, -0.0300, 0.0578, ..., -0.0402, 0.0712, -0.0084], ..., [-0.0416, 0.0292, 0.0345, ..., 0.0544, -0.0377, -0.0484], [-0.0895, 0.0375, -0.0831, ..., 0.0177, -0.0341, -0.0319], [ 0.0316, -0.0075, -0.0436, ..., 0.0067, -0.0278, 0.0021]], device='cuda:0'), grad: tensor([[ 9.4056e-05, 1.0654e-05, 2.7373e-05, ..., 4.7863e-05, -3.2425e-05, -1.0514e-04], [ 4.0740e-05, 9.2015e-06, 2.2855e-06, ..., 2.2352e-05, 5.9679e-06, -1.2904e-05], [ 3.1114e-05, 3.8445e-05, -2.5500e-06, ..., 2.0206e-05, 4.5151e-05, 2.2292e-05], ..., [ 8.4098e-07, -5.1498e-05, -1.4165e-06, ..., -7.1049e-05, 2.2501e-05, 1.6391e-05], [ 3.7813e-04, -8.8140e-06, -3.6005e-06, ..., 4.9889e-05, 1.4114e-04, 7.6354e-05], [ 4.0919e-05, 4.7982e-05, 1.5825e-05, ..., 1.0145e-04, 8.4162e-05, 4.8369e-05]], device='cuda:0') Epoch 36, bias, value: tensor([-0.0146, -0.0211, -0.0153, -0.0285, -0.0310, -0.0017, 0.0249, -0.0108, 0.0298, 0.0024], device='cuda:0'), grad: tensor([ 1.8716e-04, -5.8317e-04, 3.1543e-04, 3.3200e-05, -9.3222e-04, 7.7286e-03, -7.8735e-03, 2.0301e-04, 6.2799e-04, 2.9135e-04], device='cuda:0') 100 0.0001 changing lr epoch 35, time 265.30, cls_loss 0.0236 cls_loss_mapping 0.0360 cls_loss_causal 0.7585 re_mapping 0.0217 re_causal 0.0573 /// teacc 98.45 lr 0.00010000 Epoch 37, weight, value: tensor([[-0.0200, -0.0444, -0.0395, ..., -0.0161, 0.0792, 0.0645], [-0.0669, -0.0761, -0.0862, ..., -0.0478, -0.0634, -0.0436], [-0.0394, -0.0306, 0.0580, ..., -0.0406, 0.0714, -0.0081], ..., [-0.0420, 0.0295, 0.0349, ..., 0.0546, -0.0374, -0.0487], [-0.0903, 0.0380, -0.0832, ..., 0.0181, -0.0339, -0.0319], [ 0.0315, -0.0081, -0.0434, ..., 0.0061, -0.0278, 0.0020]], device='cuda:0'), grad: tensor([[ 3.0589e-04, 1.1230e-04, 5.1379e-05, ..., 2.3448e-04, 1.0848e-04, 1.5402e-04], [ 4.5031e-05, 1.1122e-04, 2.6608e-04, ..., 3.3051e-05, 5.9700e-04, 3.6788e-04], [ 1.9312e-05, -2.7895e-04, -6.9427e-04, ..., 6.3717e-05, -1.9913e-03, -1.2560e-03], ..., [ 7.4863e-05, 8.3148e-05, 1.6761e-04, ..., 6.9030e-06, 5.4073e-04, 3.6955e-04], [ 2.8348e-04, -2.0194e-04, 1.0628e-04, ..., -9.2864e-05, 2.4629e-04, -7.0594e-06], [ 2.1386e-04, 1.6904e-04, -1.3128e-05, ..., 2.2113e-04, 6.7890e-05, 1.6916e-04]], device='cuda:0') Epoch 37, bias, value: tensor([-0.0148, -0.0208, -0.0160, -0.0284, -0.0307, -0.0018, 0.0246, -0.0106, 0.0300, 0.0022], device='cuda:0'), grad: tensor([ 7.8583e-04, 9.4080e-04, -2.8839e-03, 4.3201e-04, 1.3483e-04, 9.2793e-04, -2.0351e-03, 1.0262e-03, 9.3222e-05, 5.7602e-04], device='cuda:0') 100 0.0001 changing lr epoch 36, time 265.20, cls_loss 0.0250 cls_loss_mapping 0.0348 cls_loss_causal 0.7694 re_mapping 0.0214 re_causal 0.0553 /// teacc 98.33 lr 0.00010000 Epoch 38, weight, value: tensor([[-0.0204, -0.0447, -0.0399, ..., -0.0164, 0.0799, 0.0650], [-0.0679, -0.0771, -0.0865, ..., -0.0486, -0.0652, -0.0443], [-0.0399, -0.0310, 0.0591, ..., -0.0418, 0.0724, -0.0071], ..., [-0.0425, 0.0302, 0.0341, ..., 0.0553, -0.0378, -0.0496], [-0.0914, 0.0373, -0.0850, ..., 0.0179, -0.0347, -0.0327], [ 0.0324, -0.0088, -0.0441, ..., 0.0057, -0.0280, 0.0016]], device='cuda:0'), grad: tensor([[ 7.4729e-06, 8.9347e-05, 1.1712e-04, ..., 5.2080e-06, 1.6317e-05, -3.2216e-05], [ 7.8157e-06, 1.7965e-04, 4.3333e-05, ..., 3.9011e-05, 3.1739e-05, 7.5251e-06], [ 9.3430e-06, -1.6665e-04, -1.6174e-03, ..., 2.9564e-05, -1.0071e-03, -8.2612e-05], ..., [ 1.7747e-05, 1.0118e-03, 4.7994e-04, ..., 8.2910e-05, 4.2415e-04, 5.7697e-05], [ 3.3230e-05, 3.7575e-04, 9.0361e-05, ..., 3.4899e-05, 8.1837e-05, 2.0474e-05], [-5.6833e-05, 1.2951e-03, 3.6812e-04, ..., 2.0289e-04, 1.4544e-04, 6.5938e-06]], device='cuda:0') Epoch 38, bias, value: tensor([-0.0149, -0.0206, -0.0161, -0.0281, -0.0302, -0.0018, 0.0242, -0.0105, 0.0297, 0.0020], device='cuda:0'), grad: tensor([ 0.0004, -0.0017, -0.0026, -0.0050, 0.0005, 0.0007, 0.0004, 0.0033, 0.0009, 0.0031], device='cuda:0') 100 0.0001 changing lr epoch 37, time 265.27, cls_loss 0.0239 cls_loss_mapping 0.0318 cls_loss_causal 0.7645 re_mapping 0.0195 re_causal 0.0513 /// teacc 98.30 lr 0.00010000 Epoch 39, weight, value: tensor([[-0.0205, -0.0450, -0.0402, ..., -0.0165, 0.0807, 0.0656], [-0.0701, -0.0777, -0.0873, ..., -0.0493, -0.0663, -0.0446], [-0.0404, -0.0313, 0.0604, ..., -0.0423, 0.0733, -0.0066], ..., [-0.0432, 0.0301, 0.0342, ..., 0.0558, -0.0390, -0.0507], [-0.0922, 0.0378, -0.0862, ..., 0.0183, -0.0353, -0.0331], [ 0.0329, -0.0101, -0.0443, ..., 0.0053, -0.0285, 0.0012]], device='cuda:0'), grad: tensor([[ 1.0449e-04, 1.4961e-04, 1.1927e-04, ..., 2.5201e-04, -6.5386e-05, 4.2528e-05], [ 2.1207e-04, 5.5790e-05, 8.4102e-05, ..., 2.5320e-04, 8.8096e-05, 1.4111e-05], [ 1.9833e-05, -5.8413e-04, -4.5896e-04, ..., -1.9002e-04, -9.0218e-04, -2.9683e-04], ..., [-4.2534e-03, -7.0286e-04, -1.1044e-03, ..., -5.1727e-03, 3.3283e-04, 1.5533e-04], [ 7.5102e-05, 2.1636e-04, 4.5151e-05, ..., -1.5819e-04, -3.7718e-04, -4.7326e-04], [ 3.7079e-03, 7.0572e-04, 1.1330e-03, ..., 4.5128e-03, 1.4257e-04, 1.5175e-04]], device='cuda:0') Epoch 39, bias, value: tensor([-0.0148, -0.0209, -0.0158, -0.0279, -0.0303, -0.0019, 0.0244, -0.0110, 0.0303, 0.0017], device='cuda:0'), grad: tensor([ 0.0005, 0.0006, -0.0013, 0.0002, 0.0003, 0.0007, 0.0012, -0.0108, -0.0015, 0.0101], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 38---------------------------------------------------- epoch 38, time 283.68, cls_loss 0.0256 cls_loss_mapping 0.0387 cls_loss_causal 0.7818 re_mapping 0.0197 re_causal 0.0518 /// teacc 98.66 lr 0.00010000 Epoch 40, weight, value: tensor([[-0.0206, -0.0453, -0.0405, ..., -0.0168, 0.0813, 0.0661], [-0.0708, -0.0778, -0.0882, ..., -0.0502, -0.0672, -0.0441], [-0.0412, -0.0319, 0.0616, ..., -0.0430, 0.0742, -0.0064], ..., [-0.0426, 0.0309, 0.0349, ..., 0.0574, -0.0405, -0.0516], [-0.0930, 0.0378, -0.0872, ..., 0.0184, -0.0358, -0.0337], [ 0.0332, -0.0103, -0.0448, ..., 0.0045, -0.0288, 0.0009]], device='cuda:0'), grad: tensor([[ 4.9174e-05, 4.4018e-05, 5.6922e-06, ..., 2.8297e-05, -5.4628e-05, -2.3454e-05], [ 1.3009e-05, 1.0109e-03, 7.2300e-05, ..., 3.6502e-04, 4.2245e-06, 3.2689e-06], [ 1.0364e-05, 1.1330e-03, 2.3499e-05, ..., 2.9898e-04, -1.2733e-05, 8.3074e-06], ..., [ 9.1791e-05, 1.3695e-02, 1.5364e-03, ..., 4.7417e-03, 4.0457e-06, 2.0098e-06], [ 9.3162e-05, 3.6407e-04, 6.9082e-05, ..., 1.1909e-04, 4.1544e-05, 3.7968e-05], [-1.0008e-04, -1.5778e-02, -1.7748e-03, ..., -5.7297e-03, 2.4721e-05, 1.4685e-05]], device='cuda:0') Epoch 40, bias, value: tensor([-0.0148, -0.0206, -0.0161, -0.0282, -0.0303, -0.0018, 0.0240, -0.0101, 0.0301, 0.0017], device='cuda:0'), grad: tensor([ 8.9705e-05, 1.6785e-03, 1.7271e-03, 1.3173e-04, 5.9223e-04, -2.3925e-04, -2.2364e-04, 2.5299e-02, 8.4734e-04, -2.9907e-02], device='cuda:0') 100 0.0001 changing lr epoch 39, time 265.21, cls_loss 0.0182 cls_loss_mapping 0.0292 cls_loss_causal 0.7523 re_mapping 0.0198 re_causal 0.0513 /// teacc 98.63 lr 0.00010000 Epoch 41, weight, value: tensor([[-0.0210, -0.0459, -0.0410, ..., -0.0169, 0.0819, 0.0665], [-0.0714, -0.0790, -0.0891, ..., -0.0509, -0.0680, -0.0440], [-0.0418, -0.0324, 0.0628, ..., -0.0438, 0.0754, -0.0058], ..., [-0.0427, 0.0316, 0.0349, ..., 0.0584, -0.0418, -0.0524], [-0.0939, 0.0375, -0.0882, ..., 0.0185, -0.0363, -0.0340], [ 0.0337, -0.0103, -0.0453, ..., 0.0041, -0.0292, 0.0005]], device='cuda:0'), grad: tensor([[-2.2388e-04, 6.4559e-06, 1.2144e-05, ..., -4.7803e-05, -8.6021e-04, -6.6710e-04], [ 5.0999e-06, 9.2506e-05, 3.3593e-04, ..., 6.1095e-05, 2.2864e-04, 1.1742e-04], [ 4.0308e-06, 2.7585e-04, -3.9840e-04, ..., 2.1303e-04, -4.9496e-04, -2.6894e-04], ..., [ 8.4043e-06, -5.6171e-04, -4.3660e-05, ..., -3.5977e-04, 1.5378e-04, 8.6665e-05], [ 8.6844e-05, 9.5248e-05, 6.4552e-05, ..., 1.1641e-04, 5.5313e-05, 3.8266e-05], [ 7.0632e-05, 3.4070e-04, 6.4898e-04, ..., 3.4833e-04, 2.1505e-04, 1.6367e-04]], device='cuda:0') Epoch 41, bias, value: tensor([-0.0149, -0.0209, -0.0162, -0.0278, -0.0304, -0.0022, 0.0241, -0.0099, 0.0300, 0.0018], device='cuda:0'), grad: tensor([-1.0538e-03, 4.2272e-04, -3.1710e-04, 1.6797e-04, -1.1644e-03, 6.1631e-05, 6.5994e-04, -5.7316e-04, 2.8014e-04, 1.5144e-03], device='cuda:0') 100 0.0001 changing lr epoch 40, time 265.16, cls_loss 0.0209 cls_loss_mapping 0.0304 cls_loss_causal 0.7307 re_mapping 0.0194 re_causal 0.0509 /// teacc 98.60 lr 0.00010000 Epoch 42, weight, value: tensor([[-2.1314e-02, -4.6477e-02, -4.1340e-02, ..., -1.6991e-02, 8.3152e-02, 6.7446e-02], [-7.1942e-02, -8.0279e-02, -8.8827e-02, ..., -5.1931e-02, -6.8565e-02, -4.3897e-02], [-4.2324e-02, -3.2757e-02, 6.3314e-02, ..., -4.4658e-02, 7.5998e-02, -5.7002e-03], ..., [-4.2829e-02, 3.1579e-02, 3.4212e-02, ..., 5.8817e-02, -4.2757e-02, -5.3114e-02], [-9.4988e-02, 3.7984e-02, -8.8170e-02, ..., 1.8525e-02, -3.6656e-02, -3.4171e-02], [ 3.3962e-02, -1.0141e-02, -4.5372e-02, ..., 3.8681e-03, -2.9619e-02, 7.8126e-05]], device='cuda:0'), grad: tensor([[ 4.8548e-05, 1.9535e-05, 1.4283e-05, ..., 6.9067e-06, 1.0145e-04, 5.6416e-05], [ 3.2812e-05, 1.0103e-04, 3.7265e-04, ..., 3.4600e-05, 7.2098e-04, 4.8065e-04], [ 4.0323e-05, -5.0926e-04, -1.2369e-03, ..., -1.1313e-04, -2.8515e-03, -1.7376e-03], ..., [ 1.0617e-05, 3.5429e-04, 7.0143e-04, ..., 5.8532e-05, 1.4172e-03, 8.3542e-04], [ 6.5231e-04, 6.0320e-05, 3.3796e-05, ..., 1.8448e-05, 1.4079e-04, 9.6083e-05], [ 2.2173e-05, 2.0623e-05, 5.2862e-06, ..., 1.3977e-05, 9.5844e-05, 5.5552e-05]], device='cuda:0') Epoch 42, bias, value: tensor([-0.0144, -0.0207, -0.0162, -0.0285, -0.0307, -0.0012, 0.0239, -0.0103, 0.0300, 0.0019], device='cuda:0'), grad: tensor([ 0.0003, 0.0015, -0.0056, 0.0004, 0.0005, 0.0026, -0.0036, 0.0028, 0.0009, 0.0002], device='cuda:0') 100 0.0001 changing lr epoch 41, time 265.10, cls_loss 0.0260 cls_loss_mapping 0.0317 cls_loss_causal 0.7500 re_mapping 0.0191 re_causal 0.0476 /// teacc 98.43 lr 0.00010000 Epoch 43, weight, value: tensor([[-0.0223, -0.0473, -0.0405, ..., -0.0173, 0.0839, 0.0677], [-0.0729, -0.0813, -0.0894, ..., -0.0523, -0.0698, -0.0448], [-0.0434, -0.0326, 0.0638, ..., -0.0453, 0.0770, -0.0049], ..., [-0.0432, 0.0319, 0.0361, ..., 0.0598, -0.0433, -0.0541], [-0.0957, 0.0382, -0.0890, ..., 0.0191, -0.0376, -0.0342], [ 0.0343, -0.0108, -0.0454, ..., 0.0032, -0.0290, -0.0001]], device='cuda:0'), grad: tensor([[-9.3555e-04, 2.1253e-06, 1.1034e-05, ..., -1.3285e-03, -6.0415e-04, -1.6546e-03], [ 1.0297e-05, 8.0109e-05, 2.3752e-05, ..., 1.0979e-04, 1.0744e-05, 3.1680e-05], [ 1.9431e-05, 1.2589e-04, 2.1827e-04, ..., 5.1641e-04, 6.5744e-05, 3.0828e-04], ..., [ 6.7949e-05, -1.1320e-03, -2.8934e-03, ..., -2.5215e-03, 4.9591e-05, 1.5581e-04], [ 2.5630e-04, 1.1444e-04, 1.6525e-05, ..., 5.5075e-04, 1.6630e-04, 5.5075e-04], [ 2.4945e-05, 5.7191e-05, 1.1712e-05, ..., 1.3268e-04, 2.6748e-05, 7.4863e-05]], device='cuda:0') Epoch 43, bias, value: tensor([-0.0145, -0.0211, -0.0160, -0.0287, -0.0313, -0.0017, 0.0244, -0.0100, 0.0308, 0.0018], device='cuda:0'), grad: tensor([-0.0018, 0.0002, 0.0020, 0.0027, 0.0036, -0.0052, -0.0001, -0.0045, 0.0027, 0.0006], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 42---------------------------------------------------- epoch 42, time 282.04, cls_loss 0.0204 cls_loss_mapping 0.0293 cls_loss_causal 0.7493 re_mapping 0.0192 re_causal 0.0503 /// teacc 98.77 lr 0.00010000 Epoch 44, weight, value: tensor([[-0.0227, -0.0478, -0.0406, ..., -0.0176, 0.0846, 0.0683], [-0.0742, -0.0830, -0.0902, ..., -0.0529, -0.0715, -0.0456], [-0.0441, -0.0330, 0.0641, ..., -0.0460, 0.0781, -0.0046], ..., [-0.0431, 0.0329, 0.0370, ..., 0.0609, -0.0439, -0.0545], [-0.0970, 0.0384, -0.0894, ..., 0.0194, -0.0376, -0.0340], [ 0.0343, -0.0113, -0.0458, ..., 0.0026, -0.0292, -0.0006]], device='cuda:0'), grad: tensor([[ 9.7826e-06, 2.5600e-05, 3.4627e-06, ..., -3.4403e-06, -5.8532e-05, -1.1772e-04], [ 2.1532e-06, 6.6280e-05, 7.3649e-06, ..., 4.1008e-05, 9.2268e-05, 1.7032e-05], [ 1.2696e-05, -1.3018e-03, -8.5473e-05, ..., 1.6630e-05, -4.7417e-03, -1.6165e-04], ..., [ 5.7966e-06, -1.1736e-04, 1.0890e-04, ..., -3.1757e-04, 7.1859e-04, 1.6284e-04], [ 1.2554e-05, -2.6748e-06, 3.3733e-06, ..., -4.9949e-05, 6.3419e-05, 5.4270e-05], [-3.4243e-05, 1.6415e-04, 2.3484e-05, ..., 1.8907e-04, 6.5804e-05, 1.2033e-05]], device='cuda:0') Epoch 44, bias, value: tensor([-0.0145, -0.0220, -0.0158, -0.0291, -0.0309, -0.0012, 0.0241, -0.0095, 0.0310, 0.0014], device='cuda:0'), grad: tensor([-2.7359e-05, 7.6711e-05, -7.3357e-03, 6.1378e-03, -1.4865e-04, 1.5962e-04, 2.1890e-05, 7.9727e-04, 2.9519e-05, 2.8968e-04], device='cuda:0') 100 0.0001 changing lr epoch 43, time 260.04, cls_loss 0.0178 cls_loss_mapping 0.0247 cls_loss_causal 0.7020 re_mapping 0.0194 re_causal 0.0473 /// teacc 98.70 lr 0.00010000 Epoch 45, weight, value: tensor([[-0.0231, -0.0481, -0.0408, ..., -0.0176, 0.0853, 0.0689], [-0.0748, -0.0841, -0.0903, ..., -0.0538, -0.0720, -0.0459], [-0.0449, -0.0333, 0.0645, ..., -0.0467, 0.0789, -0.0042], ..., [-0.0437, 0.0327, 0.0373, ..., 0.0613, -0.0449, -0.0552], [-0.0971, 0.0385, -0.0897, ..., 0.0202, -0.0376, -0.0337], [ 0.0351, -0.0115, -0.0460, ..., 0.0023, -0.0293, -0.0010]], device='cuda:0'), grad: tensor([[ 1.7002e-05, 1.1194e-04, 5.8860e-07, ..., 3.2634e-05, 1.2517e-04, 2.3410e-05], [ 9.2387e-06, 1.4342e-05, 5.4576e-07, ..., 4.7311e-06, 7.6592e-06, 2.3581e-06], [ 1.1981e-05, 1.0443e-04, -7.6592e-06, ..., 3.0667e-05, 1.3268e-04, 5.1141e-05], ..., [ 1.0729e-05, -4.5776e-05, -2.7753e-07, ..., -4.5806e-05, 1.2167e-05, 4.0792e-06], [ 6.2764e-05, -2.6011e-04, 1.9725e-06, ..., -6.2168e-05, -4.8637e-04, -1.4532e-04], [ 2.2650e-05, 4.4405e-05, 1.5013e-06, ..., 1.3880e-05, 2.3797e-05, 1.2212e-05]], device='cuda:0') Epoch 45, bias, value: tensor([-0.0145, -0.0218, -0.0156, -0.0289, -0.0311, -0.0012, 0.0236, -0.0098, 0.0310, 0.0017], device='cuda:0'), grad: tensor([ 2.9135e-04, -5.2738e-04, 4.2129e-04, 1.1241e-04, -5.7250e-05, 1.0651e-04, 3.5793e-05, 1.1969e-04, -7.1955e-04, 2.1541e-04], device='cuda:0') 100 0.0001 changing lr epoch 44, time 263.58, cls_loss 0.0172 cls_loss_mapping 0.0285 cls_loss_causal 0.7269 re_mapping 0.0181 re_causal 0.0481 /// teacc 98.47 lr 0.00010000 Epoch 46, weight, value: tensor([[-0.0232, -0.0487, -0.0409, ..., -0.0175, 0.0860, 0.0695], [-0.0766, -0.0850, -0.0906, ..., -0.0537, -0.0726, -0.0462], [-0.0456, -0.0338, 0.0650, ..., -0.0475, 0.0797, -0.0040], ..., [-0.0442, 0.0335, 0.0375, ..., 0.0621, -0.0456, -0.0554], [-0.0976, 0.0385, -0.0901, ..., 0.0205, -0.0380, -0.0338], [ 0.0355, -0.0123, -0.0461, ..., 0.0019, -0.0293, -0.0015]], device='cuda:0'), grad: tensor([[ 6.2585e-06, 2.1845e-05, 1.0453e-05, ..., 1.7136e-05, 3.6389e-05, -8.2403e-06], [ 1.4246e-05, 1.7047e-04, 2.4326e-06, ..., 2.2125e-04, -1.4424e-05, -2.2948e-05], [ 2.5690e-05, 2.8205e-04, -6.1631e-05, ..., 1.8418e-04, -2.4271e-04, 1.0043e-05], ..., [ 8.5533e-06, -9.5665e-05, -2.1905e-06, ..., -1.9193e-04, 2.1428e-05, 3.9265e-06], [ 6.2168e-05, -6.0368e-04, 1.9055e-06, ..., -5.4169e-04, -6.6042e-05, 7.0445e-06], [ 1.3739e-05, 1.0091e-04, 3.1106e-07, ..., 8.4877e-05, 9.7603e-06, 3.4422e-06]], device='cuda:0') Epoch 46, bias, value: tensor([-0.0143, -0.0216, -0.0159, -0.0285, -0.0311, -0.0015, 0.0233, -0.0097, 0.0311, 0.0016], device='cuda:0'), grad: tensor([ 1.2338e-04, -1.1482e-03, 1.2074e-03, 4.2486e-04, 1.4699e-04, 3.2115e-04, -1.7679e-04, 6.6496e-06, -1.1892e-03, 2.8348e-04], device='cuda:0') 100 0.0001 changing lr epoch 45, time 265.11, cls_loss 0.0174 cls_loss_mapping 0.0280 cls_loss_causal 0.7351 re_mapping 0.0175 re_causal 0.0477 /// teacc 98.69 lr 0.00010000 Epoch 47, weight, value: tensor([[-0.0239, -0.0492, -0.0408, ..., -0.0176, 0.0870, 0.0701], [-0.0771, -0.0852, -0.0910, ..., -0.0542, -0.0739, -0.0465], [-0.0461, -0.0342, 0.0661, ..., -0.0481, 0.0807, -0.0038], ..., [-0.0447, 0.0337, 0.0371, ..., 0.0626, -0.0465, -0.0558], [-0.0989, 0.0383, -0.0906, ..., 0.0206, -0.0389, -0.0342], [ 0.0361, -0.0124, -0.0460, ..., 0.0016, -0.0295, -0.0019]], device='cuda:0'), grad: tensor([[ 1.3523e-05, 4.0221e-04, 2.0042e-06, ..., 6.1607e-04, 8.2922e-04, 1.2177e-04], [ 1.5222e-05, 6.3360e-05, 1.8459e-06, ..., 3.8952e-05, 2.9415e-05, 8.6799e-06], [ 8.6904e-05, 1.8096e-04, -2.9057e-05, ..., -1.0766e-05, -7.3624e-04, -2.3091e-04], ..., [-2.2918e-05, -2.7800e-04, -1.3880e-05, ..., -9.6798e-04, -4.1318e-04, 1.0478e-04], [ 4.8995e-05, 1.4436e-04, 8.1286e-06, ..., -7.1786e-06, 4.8190e-05, 2.7806e-05], [ 1.1623e-04, 2.2912e-04, 1.8910e-05, ..., 5.2299e-03, 1.3757e-04, 2.2545e-05]], device='cuda:0') Epoch 47, bias, value: tensor([-0.0142, -0.0211, -0.0155, -0.0284, -0.0316, -0.0016, 0.0239, -0.0104, 0.0308, 0.0018], device='cuda:0'), grad: tensor([ 1.6327e-03, 1.2457e-04, -4.6039e-04, -7.7820e-04, -2.8305e-02, 2.3651e-04, 4.7147e-05, -1.2856e-03, 7.5623e-06, 2.8763e-02], device='cuda:0') 100 0.0001 changing lr epoch 46, time 265.52, cls_loss 0.0210 cls_loss_mapping 0.0260 cls_loss_causal 0.7203 re_mapping 0.0169 re_causal 0.0437 /// teacc 98.59 lr 0.00010000 Epoch 48, weight, value: tensor([[-0.0249, -0.0497, -0.0410, ..., -0.0179, 0.0879, 0.0707], [-0.0775, -0.0862, -0.0917, ..., -0.0549, -0.0745, -0.0464], [-0.0469, -0.0344, 0.0668, ..., -0.0484, 0.0817, -0.0033], ..., [-0.0452, 0.0330, 0.0372, ..., 0.0626, -0.0480, -0.0567], [-0.1006, 0.0383, -0.0910, ..., 0.0204, -0.0399, -0.0351], [ 0.0363, -0.0125, -0.0460, ..., 0.0014, -0.0294, -0.0025]], device='cuda:0'), grad: tensor([[ 2.1112e-04, 4.4346e-05, 3.1352e-05, ..., 2.1443e-05, 2.6464e-04, 2.6703e-04], [ 1.3337e-05, 8.6963e-05, 3.2216e-05, ..., 5.9426e-05, 8.0228e-05, 5.9396e-05], [ 2.0057e-05, -5.0402e-04, -4.3321e-04, ..., -9.5814e-06, -1.1053e-03, -7.6151e-04], ..., [ 6.4325e-04, 2.6588e-03, 5.3972e-05, ..., 4.9782e-04, 3.2234e-04, 2.6441e-04], [-5.0688e-04, -8.4352e-04, -1.3041e-04, ..., -2.7885e-03, 2.1183e-04, 1.7905e-04], [ 1.4865e-04, 7.1383e-04, 2.8566e-05, ..., 2.2089e-04, 6.8009e-05, 6.2585e-05]], device='cuda:0') Epoch 48, bias, value: tensor([-0.0143, -0.0209, -0.0156, -0.0278, -0.0320, -0.0017, 0.0241, -0.0112, 0.0306, 0.0025], device='cuda:0'), grad: tensor([ 4.2415e-04, 2.2626e-04, -2.2335e-03, -1.5144e-03, 3.1614e-04, 3.2558e-03, -7.7128e-05, 2.1763e-03, -3.2520e-03, 6.8283e-04], device='cuda:0') 100 0.0001 changing lr epoch 47, time 265.55, cls_loss 0.0164 cls_loss_mapping 0.0261 cls_loss_causal 0.7087 re_mapping 0.0170 re_causal 0.0462 /// teacc 98.74 lr 0.00010000 Epoch 49, weight, value: tensor([[-0.0251, -0.0502, -0.0412, ..., -0.0181, 0.0889, 0.0715], [-0.0783, -0.0868, -0.0919, ..., -0.0558, -0.0750, -0.0466], [-0.0480, -0.0346, 0.0668, ..., -0.0492, 0.0821, -0.0030], ..., [-0.0458, 0.0332, 0.0375, ..., 0.0640, -0.0483, -0.0572], [-0.1015, 0.0383, -0.0912, ..., 0.0206, -0.0398, -0.0353], [ 0.0365, -0.0129, -0.0462, ..., 0.0008, -0.0301, -0.0032]], device='cuda:0'), grad: tensor([[ 7.5221e-05, 6.3181e-05, 9.3803e-06, ..., 1.1384e-04, -6.1333e-05, -4.0770e-05], [ 2.0489e-05, 1.2189e-04, 8.2254e-05, ..., 1.4174e-04, 3.2783e-05, 5.6662e-06], [ 1.9327e-05, 3.6049e-04, 1.8764e-04, ..., 2.8849e-04, -1.2577e-05, 3.4440e-06], ..., [ 8.0490e-04, -2.3460e-04, -4.3678e-04, ..., 4.8923e-04, -1.2469e-04, -1.8284e-05], [ 2.1946e-04, 1.0526e-04, 2.0504e-05, ..., 2.6751e-04, 4.1515e-05, 2.4922e-06], [ 8.5771e-05, -8.1897e-05, 2.4602e-05, ..., 4.2295e-04, 1.0997e-05, 4.0531e-06]], device='cuda:0') Epoch 49, bias, value: tensor([-0.0140, -0.0206, -0.0159, -0.0276, -0.0318, -0.0019, 0.0232, -0.0109, 0.0310, 0.0020], device='cuda:0'), grad: tensor([ 8.0347e-05, 7.2718e-04, 5.2881e-04, 4.1890e-04, -7.9803e-03, -8.6403e-04, 5.7891e-06, 7.5722e-04, 2.4724e-04, 6.0806e-03], device='cuda:0') 100 0.0001 changing lr epoch 48, time 265.56, cls_loss 0.0161 cls_loss_mapping 0.0223 cls_loss_causal 0.7041 re_mapping 0.0165 re_causal 0.0440 /// teacc 98.65 lr 0.00010000 Epoch 50, weight, value: tensor([[-0.0254, -0.0507, -0.0413, ..., -0.0182, 0.0894, 0.0719], [-0.0801, -0.0874, -0.0925, ..., -0.0566, -0.0765, -0.0473], [-0.0475, -0.0350, 0.0676, ..., -0.0501, 0.0831, -0.0024], ..., [-0.0464, 0.0335, 0.0376, ..., 0.0647, -0.0490, -0.0577], [-0.1024, 0.0382, -0.0917, ..., 0.0212, -0.0405, -0.0356], [ 0.0372, -0.0129, -0.0460, ..., 0.0004, -0.0308, -0.0036]], device='cuda:0'), grad: tensor([[-8.2612e-05, -8.3506e-05, 5.6401e-06, ..., -3.4451e-05, -9.3699e-04, -7.7105e-04], [ 6.0126e-06, 1.9324e-04, 7.0967e-06, ..., 1.1557e-04, 4.2409e-05, 1.4096e-05], [ 1.4558e-05, 6.8247e-05, -2.1732e-04, ..., 6.9559e-05, -3.7909e-04, 1.3940e-05], ..., [ 3.4012e-06, -1.0958e-03, 3.8929e-06, ..., -8.5163e-04, 1.2994e-04, 5.9485e-05], [ 4.1515e-05, 2.5600e-05, 4.2617e-06, ..., 3.4660e-05, 5.3883e-05, 4.7237e-05], [-2.5809e-05, 5.0879e-04, 9.7752e-05, ..., 4.6492e-04, 1.8334e-04, 3.1412e-05]], device='cuda:0') Epoch 50, bias, value: tensor([-0.0139, -0.0211, -0.0151, -0.0274, -0.0314, -0.0024, 0.0236, -0.0114, 0.0311, 0.0019], device='cuda:0'), grad: tensor([-1.2026e-03, 2.6727e-04, -7.3528e-04, 7.1096e-04, 2.9635e-04, -1.5393e-05, 9.8419e-04, -1.4954e-03, 1.2910e-04, 1.0624e-03], device='cuda:0') 100 0.0001 changing lr epoch 49, time 265.88, cls_loss 0.0199 cls_loss_mapping 0.0287 cls_loss_causal 0.6855 re_mapping 0.0175 re_causal 0.0426 /// teacc 98.66 lr 0.00010000 Epoch 51, weight, value: tensor([[-0.0259, -0.0510, -0.0415, ..., -0.0184, 0.0900, 0.0723], [-0.0808, -0.0883, -0.0934, ..., -0.0572, -0.0772, -0.0473], [-0.0482, -0.0355, 0.0680, ..., -0.0515, 0.0837, -0.0026], ..., [-0.0474, 0.0336, 0.0377, ..., 0.0655, -0.0494, -0.0580], [-0.1034, 0.0385, -0.0913, ..., 0.0216, -0.0400, -0.0353], [ 0.0375, -0.0136, -0.0458, ..., -0.0002, -0.0313, -0.0040]], device='cuda:0'), grad: tensor([[ 2.4453e-05, 1.2286e-05, 5.9977e-06, ..., 1.1809e-05, 1.9670e-05, 1.4350e-05], [ 9.4697e-06, 8.0943e-05, 3.3259e-05, ..., 6.8843e-05, 4.0948e-05, 2.0593e-05], [ 1.5402e-04, 1.6892e-04, -4.1723e-05, ..., 4.5061e-05, -6.3598e-05, -8.1122e-05], ..., [-1.8895e-04, -1.3275e-03, -4.7493e-04, ..., -1.1206e-03, 3.7491e-05, 1.5222e-05], [ 8.3685e-05, 1.4327e-05, 2.9743e-05, ..., 6.2466e-05, 3.6925e-05, 1.4700e-05], [ 1.3518e-04, 2.8253e-04, 3.6627e-05, ..., 3.2663e-04, 1.1303e-05, 5.0813e-06]], device='cuda:0') Epoch 51, bias, value: tensor([-0.0141, -0.0218, -0.0151, -0.0272, -0.0310, -0.0021, 0.0231, -0.0113, 0.0320, 0.0013], device='cuda:0'), grad: tensor([ 5.9098e-05, 1.4687e-04, 9.7632e-05, -2.2495e-04, 1.2102e-03, -2.7031e-05, 4.1991e-05, -2.0885e-03, 1.6928e-04, 6.1560e-04], device='cuda:0') 100 0.0001 changing lr epoch 50, time 265.37, cls_loss 0.0193 cls_loss_mapping 0.0255 cls_loss_causal 0.6766 re_mapping 0.0170 re_causal 0.0436 /// teacc 98.60 lr 0.00010000 Epoch 52, weight, value: tensor([[-0.0264, -0.0515, -0.0417, ..., -0.0186, 0.0900, 0.0723], [-0.0814, -0.0901, -0.0941, ..., -0.0576, -0.0780, -0.0476], [-0.0487, -0.0361, 0.0685, ..., -0.0521, 0.0854, -0.0017], ..., [-0.0476, 0.0345, 0.0378, ..., 0.0661, -0.0507, -0.0591], [-0.1049, 0.0388, -0.0914, ..., 0.0215, -0.0401, -0.0357], [ 0.0375, -0.0144, -0.0459, ..., -0.0006, -0.0316, -0.0042]], device='cuda:0'), grad: tensor([[ 8.5711e-05, 2.8342e-05, 2.2314e-06, ..., 2.0340e-05, -4.4368e-06, 7.0594e-07], [ 1.4193e-05, 2.1648e-04, 3.8557e-06, ..., 1.4615e-04, 1.4659e-06, 1.6298e-06], [ 2.4393e-05, 1.3566e-04, -3.2075e-06, ..., 8.6129e-05, -7.8678e-06, -4.0419e-07], ..., [ 2.0355e-05, 2.2797e-02, -3.0324e-05, ..., 1.4809e-02, 7.1339e-06, 3.7383e-06], [ 1.1146e-04, -2.5055e-02, 1.1779e-05, ..., -1.6312e-02, 8.2180e-06, 1.0990e-05], [ 9.1910e-05, 3.1638e-04, 2.4680e-06, ..., 2.3460e-04, 2.2519e-06, 5.3383e-06]], device='cuda:0') Epoch 52, bias, value: tensor([-0.0146, -0.0223, -0.0153, -0.0272, -0.0312, -0.0024, 0.0238, -0.0106, 0.0320, 0.0013], device='cuda:0'), grad: tensor([ 1.5092e-04, 1.6272e-04, 1.9479e-04, 3.7079e-03, 3.9667e-05, -2.4738e-03, 5.1349e-05, 2.4918e-02, -2.7191e-02, 4.5109e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 51---------------------------------------------------- epoch 51, time 281.80, cls_loss 0.0164 cls_loss_mapping 0.0217 cls_loss_causal 0.6797 re_mapping 0.0158 re_causal 0.0399 /// teacc 98.79 lr 0.00010000 Epoch 53, weight, value: tensor([[-0.0266, -0.0522, -0.0419, ..., -0.0188, 0.0915, 0.0736], [-0.0819, -0.0912, -0.0936, ..., -0.0584, -0.0795, -0.0484], [-0.0495, -0.0373, 0.0690, ..., -0.0536, 0.0857, -0.0016], ..., [-0.0477, 0.0356, 0.0379, ..., 0.0672, -0.0511, -0.0593], [-0.1055, 0.0395, -0.0919, ..., 0.0223, -0.0408, -0.0362], [ 0.0377, -0.0154, -0.0465, ..., -0.0010, -0.0320, -0.0046]], device='cuda:0'), grad: tensor([[-1.0562e-04, 2.3972e-06, 4.6566e-07, ..., 1.2890e-06, -1.5426e-04, -1.3697e-04], [ 1.0669e-05, 1.1452e-05, 1.3541e-06, ..., 3.4831e-06, 2.3115e-06, 1.3057e-06], [ 5.2229e-06, 1.9759e-05, -3.4086e-06, ..., 3.2578e-06, -2.4125e-05, -6.8508e-06], ..., [ 1.6585e-05, -6.9797e-05, -1.4484e-05, ..., -5.0634e-05, 1.3985e-05, 4.7348e-06], [ 1.2159e-04, 2.0787e-05, 2.0713e-06, ..., 7.6219e-06, 4.7296e-05, 5.5254e-05], [-1.7726e-04, 1.0855e-05, 7.1377e-06, ..., -2.4423e-05, 7.4863e-05, 4.8019e-06]], device='cuda:0') Epoch 53, bias, value: tensor([-0.0137, -0.0217, -0.0156, -0.0277, -0.0313, -0.0026, 0.0233, -0.0101, 0.0321, 0.0009], device='cuda:0'), grad: tensor([-1.3626e-04, -1.8477e-05, -4.1425e-06, 1.5271e-04, -1.0985e-04, -4.9919e-05, 1.0081e-05, -3.4213e-05, 2.7704e-04, -8.6546e-05], device='cuda:0') 100 0.0001 changing lr epoch 52, time 266.08, cls_loss 0.0163 cls_loss_mapping 0.0217 cls_loss_causal 0.6746 re_mapping 0.0155 re_causal 0.0400 /// teacc 98.72 lr 0.00010000 Epoch 54, weight, value: tensor([[-0.0274, -0.0527, -0.0421, ..., -0.0190, 0.0928, 0.0745], [-0.0826, -0.0909, -0.0941, ..., -0.0591, -0.0804, -0.0490], [-0.0506, -0.0375, 0.0694, ..., -0.0543, 0.0863, -0.0012], ..., [-0.0486, 0.0355, 0.0381, ..., 0.0679, -0.0520, -0.0603], [-0.1061, 0.0396, -0.0924, ..., 0.0225, -0.0413, -0.0360], [ 0.0387, -0.0155, -0.0469, ..., -0.0015, -0.0324, -0.0050]], device='cuda:0'), grad: tensor([[ 1.3851e-05, 2.1681e-05, 3.7253e-05, ..., 1.3702e-05, 7.0333e-05, 2.9579e-05], [ 5.8636e-06, 6.2659e-06, 2.2831e-03, ..., 5.5842e-06, 4.3526e-03, 2.8000e-03], [ 4.4703e-06, -1.0006e-05, -2.6588e-03, ..., 7.7486e-06, -5.1041e-03, -3.2463e-03], ..., [-1.1511e-05, -6.2466e-05, 1.0121e-04, ..., -8.0347e-05, 1.8632e-04, 1.1730e-04], [ 6.4313e-05, 2.0877e-05, 7.8440e-05, ..., 1.7166e-05, 1.4794e-04, 8.6725e-05], [-1.3745e-04, 2.4542e-05, -1.5032e-06, ..., 5.4330e-05, 1.4007e-05, 9.3952e-06]], device='cuda:0') Epoch 54, bias, value: tensor([-0.0131, -0.0212, -0.0155, -0.0278, -0.0316, -0.0029, 0.0233, -0.0108, 0.0323, 0.0013], device='cuda:0'), grad: tensor([ 0.0002, 0.0106, -0.0122, 0.0011, 0.0004, -0.0008, 0.0003, 0.0004, 0.0004, -0.0002], device='cuda:0') 100 0.0001 changing lr epoch 53, time 266.08, cls_loss 0.0172 cls_loss_mapping 0.0232 cls_loss_causal 0.7259 re_mapping 0.0152 re_causal 0.0411 /// teacc 98.77 lr 0.00010000 Epoch 55, weight, value: tensor([[-0.0282, -0.0534, -0.0425, ..., -0.0193, 0.0931, 0.0748], [-0.0834, -0.0923, -0.0949, ..., -0.0600, -0.0816, -0.0493], [-0.0513, -0.0374, 0.0702, ..., -0.0553, 0.0874, 0.0001], ..., [-0.0501, 0.0356, 0.0382, ..., 0.0686, -0.0516, -0.0617], [-0.1072, 0.0401, -0.0930, ..., 0.0222, -0.0417, -0.0363], [ 0.0392, -0.0165, -0.0469, ..., -0.0017, -0.0325, -0.0051]], device='cuda:0'), grad: tensor([[ 3.2635e-03, 5.1744e-06, 4.1910e-07, ..., 9.8991e-04, 1.9703e-03, 2.4052e-03], [ 3.3025e-06, 3.0249e-05, -1.2722e-06, ..., 1.7917e-04, 1.6410e-06, -8.6613e-07], [ 9.6112e-06, 4.6194e-05, -2.2594e-06, ..., 6.1274e-05, -1.6261e-06, 4.1537e-06], ..., [-3.2298e-06, -1.9121e-04, 2.5611e-06, ..., -1.2045e-03, 4.5262e-06, 2.6114e-06], [ 3.1888e-05, -6.4731e-05, 1.2517e-06, ..., 2.3794e-04, 1.9014e-05, 2.0936e-05], [ 5.5879e-08, 6.5386e-05, -9.9093e-07, ..., 4.2892e-04, 3.9861e-06, 2.6878e-06]], device='cuda:0') Epoch 55, bias, value: tensor([-0.0138, -0.0217, -0.0151, -0.0283, -0.0319, -0.0022, 0.0230, -0.0104, 0.0324, 0.0014], device='cuda:0'), grad: tensor([ 3.5305e-03, -4.2462e-04, 2.3603e-04, 2.3639e-04, -6.1274e-05, 4.6921e-04, -3.7003e-03, -1.4229e-03, 4.6706e-04, 6.7234e-04], device='cuda:0') 100 0.0001 changing lr epoch 54, time 265.52, cls_loss 0.0172 cls_loss_mapping 0.0235 cls_loss_causal 0.7089 re_mapping 0.0149 re_causal 0.0394 /// teacc 98.70 lr 0.00010000 Epoch 56, weight, value: tensor([[-0.0288, -0.0540, -0.0426, ..., -0.0198, 0.0940, 0.0754], [-0.0854, -0.0928, -0.0951, ..., -0.0607, -0.0822, -0.0488], [-0.0520, -0.0378, 0.0709, ..., -0.0561, 0.0881, 0.0002], ..., [-0.0505, 0.0355, 0.0383, ..., 0.0692, -0.0526, -0.0619], [-0.1084, 0.0413, -0.0934, ..., 0.0232, -0.0418, -0.0366], [ 0.0399, -0.0173, -0.0471, ..., -0.0023, -0.0325, -0.0052]], device='cuda:0'), grad: tensor([[ 1.1787e-05, 3.2723e-05, 2.5239e-06, ..., 3.0845e-05, -2.1076e-04, -1.7679e-04], [ 3.6173e-06, 7.6175e-05, 1.1642e-06, ..., 7.1406e-05, 9.0301e-06, 4.6901e-06], [ 8.5458e-06, 5.3883e-04, 1.0077e-06, ..., 5.0259e-04, 4.9472e-05, 5.2065e-05], ..., [ 8.8960e-06, -1.4877e-03, 9.3281e-06, ..., -1.3924e-03, 3.0443e-05, 1.3448e-05], [ 2.7835e-05, 1.2040e-04, 1.1977e-06, ..., 6.6638e-05, 4.8429e-05, 3.9011e-05], [-5.9795e-04, 3.6329e-05, 4.9919e-05, ..., 5.7071e-05, 4.2617e-05, 3.0518e-05]], device='cuda:0') Epoch 56, bias, value: tensor([-0.0144, -0.0216, -0.0151, -0.0285, -0.0317, -0.0017, 0.0230, -0.0108, 0.0326, 0.0016], device='cuda:0'), grad: tensor([-1.8311e-04, 1.2398e-04, 1.2779e-03, 1.6975e-03, -1.5497e-04, 6.3133e-04, 4.2677e-05, -3.1357e-03, 3.0041e-04, -6.0081e-04], device='cuda:0') 100 0.0001 changing lr epoch 55, time 265.32, cls_loss 0.0178 cls_loss_mapping 0.0219 cls_loss_causal 0.6829 re_mapping 0.0157 re_causal 0.0380 /// teacc 98.72 lr 0.00010000 Epoch 57, weight, value: tensor([[-0.0296, -0.0545, -0.0429, ..., -0.0199, 0.0946, 0.0759], [-0.0863, -0.0937, -0.0953, ..., -0.0614, -0.0839, -0.0493], [-0.0531, -0.0384, 0.0713, ..., -0.0569, 0.0888, 0.0002], ..., [-0.0518, 0.0354, 0.0382, ..., 0.0698, -0.0530, -0.0622], [-0.1097, 0.0415, -0.0939, ..., 0.0232, -0.0426, -0.0369], [ 0.0405, -0.0182, -0.0474, ..., -0.0031, -0.0327, -0.0057]], device='cuda:0'), grad: tensor([[ 3.1721e-06, 8.3223e-06, 1.8291e-06, ..., 4.4405e-06, -9.2015e-06, -5.0850e-06], [ 1.6522e-06, 1.0267e-05, 5.8115e-06, ..., 4.3958e-06, 5.6289e-06, 1.1511e-06], [ 2.8498e-06, -1.1563e-04, -1.3304e-04, ..., -4.7199e-06, -2.2531e-04, -1.0926e-04], ..., [-7.5221e-05, -5.6684e-05, 1.4341e-04, ..., -1.2195e-04, 2.1470e-04, 9.9719e-05], [ 1.3411e-05, -2.1172e-04, 6.9179e-06, ..., -1.2541e-04, 1.1221e-05, 6.1691e-06], [ 5.3942e-05, 1.3876e-04, 1.2910e-04, ..., 8.1956e-05, 2.8253e-05, 2.9430e-06]], device='cuda:0') Epoch 57, bias, value: tensor([-0.0144, -0.0221, -0.0153, -0.0282, -0.0314, -0.0014, 0.0235, -0.0112, 0.0326, 0.0015], device='cuda:0'), grad: tensor([ 1.1526e-05, -2.8219e-06, -3.6502e-04, 2.0409e-04, -2.3308e-03, 5.5730e-05, -2.7567e-07, 3.6979e-04, -1.8990e-04, 2.2469e-03], device='cuda:0') 100 0.0001 changing lr epoch 56, time 265.73, cls_loss 0.0169 cls_loss_mapping 0.0210 cls_loss_causal 0.6573 re_mapping 0.0156 re_causal 0.0372 /// teacc 98.78 lr 0.00010000 Epoch 58, weight, value: tensor([[-0.0305, -0.0552, -0.0436, ..., -0.0202, 0.0948, 0.0761], [-0.0870, -0.0946, -0.0955, ..., -0.0621, -0.0853, -0.0494], [-0.0538, -0.0386, 0.0721, ..., -0.0575, 0.0896, 0.0013], ..., [-0.0522, 0.0356, 0.0381, ..., 0.0706, -0.0532, -0.0636], [-0.1115, 0.0418, -0.0945, ..., 0.0232, -0.0430, -0.0372], [ 0.0406, -0.0197, -0.0475, ..., -0.0038, -0.0332, -0.0062]], device='cuda:0'), grad: tensor([[ 2.5202e-06, 9.0301e-06, 1.9558e-07, ..., -4.9055e-05, -3.1590e-04, -2.5058e-04], [ 1.5885e-05, 5.2959e-05, 1.2852e-06, ..., 2.3201e-05, 3.5077e-05, 2.2799e-05], [ 4.0308e-06, 1.6224e-04, -8.0243e-06, ..., 2.9713e-05, 5.0366e-05, 1.3739e-05], ..., [-5.2787e-06, -7.3195e-04, -4.3772e-07, ..., -4.0197e-04, 4.2498e-05, 2.7090e-05], [ 4.2051e-05, 6.9559e-05, 1.0356e-06, ..., 1.5005e-05, 1.2493e-04, 7.8499e-05], [-6.0797e-05, 1.7130e-04, 1.9800e-06, ..., 1.1557e-04, 3.6597e-05, 2.4036e-05]], device='cuda:0') Epoch 58, bias, value: tensor([-0.0151, -0.0218, -0.0156, -0.0274, -0.0312, -0.0013, 0.0237, -0.0106, 0.0323, 0.0006], device='cuda:0'), grad: tensor([-4.1175e-04, 1.1873e-04, 2.3568e-04, 4.8733e-04, 1.4372e-05, 1.7571e-04, -1.3101e-04, -9.3079e-04, 3.1281e-04, 1.2946e-04], device='cuda:0') 100 0.0001 changing lr epoch 57, time 265.39, cls_loss 0.0164 cls_loss_mapping 0.0201 cls_loss_causal 0.7051 re_mapping 0.0146 re_causal 0.0384 /// teacc 98.74 lr 0.00010000 Epoch 59, weight, value: tensor([[-0.0314, -0.0558, -0.0437, ..., -0.0199, 0.0957, 0.0769], [-0.0878, -0.0955, -0.0959, ..., -0.0627, -0.0867, -0.0500], [-0.0544, -0.0384, 0.0719, ..., -0.0575, 0.0905, 0.0020], ..., [-0.0530, 0.0361, 0.0390, ..., 0.0717, -0.0539, -0.0646], [-0.1127, 0.0416, -0.0945, ..., 0.0229, -0.0436, -0.0377], [ 0.0406, -0.0206, -0.0479, ..., -0.0043, -0.0337, -0.0066]], device='cuda:0'), grad: tensor([[ 3.8408e-06, 7.3493e-05, 1.5393e-05, ..., 2.8200e-06, 1.5116e-04, 4.0829e-05], [-9.5889e-06, 6.6347e-06, 3.1758e-06, ..., 4.4480e-06, 1.2651e-05, -2.4214e-07], [ 1.0543e-06, -1.0514e-04, 1.5140e-04, ..., 1.4675e-04, 1.4052e-05, -7.9036e-05], ..., [ 7.6666e-06, -2.8563e-04, 3.6322e-06, ..., -1.8013e-04, -9.6738e-05, -4.1097e-05], [ 4.3958e-06, 2.1315e-04, 9.4064e-07, ..., 5.8487e-07, 3.6860e-04, 1.4055e-04], [ 3.6489e-06, 9.7156e-06, 1.5795e-05, ..., 6.8136e-06, 8.1658e-06, 1.7863e-06]], device='cuda:0') Epoch 59, bias, value: tensor([-0.0144, -0.0218, -0.0150, -0.0273, -0.0319, -0.0010, 0.0239, -0.0103, 0.0319, -0.0002], device='cuda:0'), grad: tensor([ 2.1160e-04, -2.2840e-04, 3.7265e-04, 1.8775e-04, -2.6932e-03, 7.9691e-05, 1.6966e-03, -2.0850e-04, -7.9930e-05, 6.6137e-04], device='cuda:0') 100 0.0001 changing lr epoch 58, time 265.55, cls_loss 0.0151 cls_loss_mapping 0.0201 cls_loss_causal 0.6425 re_mapping 0.0153 re_causal 0.0380 /// teacc 98.72 lr 0.00010000 Epoch 60, weight, value: tensor([[-0.0313, -0.0564, -0.0439, ..., -0.0198, 0.0963, 0.0775], [-0.0904, -0.0960, -0.0974, ..., -0.0634, -0.0885, -0.0507], [-0.0540, -0.0381, 0.0731, ..., -0.0577, 0.0915, 0.0030], ..., [-0.0533, 0.0364, 0.0389, ..., 0.0724, -0.0554, -0.0661], [-0.1130, 0.0416, -0.0949, ..., 0.0231, -0.0432, -0.0376], [ 0.0409, -0.0209, -0.0478, ..., -0.0050, -0.0340, -0.0069]], device='cuda:0'), grad: tensor([[ 2.2259e-06, 1.5020e-05, 4.8168e-06, ..., 1.9521e-05, -6.5446e-05, -3.9816e-05], [ 1.8720e-06, 7.5400e-05, 1.9580e-05, ..., 1.0425e-04, 1.8552e-05, 4.0121e-06], [ 3.9749e-06, 1.6308e-04, 4.2208e-06, ..., 1.8561e-04, 2.8804e-05, 1.2256e-05], ..., [ 2.9169e-06, -4.2826e-05, -1.4400e-04, ..., -1.6558e-04, 8.1360e-06, 4.1053e-06], [ 7.2159e-06, -9.0456e-04, 5.0291e-07, ..., -1.0576e-03, -1.3554e-04, -4.0710e-05], [-7.6815e-06, 4.7028e-05, 1.0383e-04, ..., 1.4424e-04, 8.2031e-06, 4.6492e-06]], device='cuda:0') Epoch 60, bias, value: tensor([-0.0144, -0.0227, -0.0138, -0.0277, -0.0315, -0.0008, 0.0231, -0.0105, 0.0325, -0.0003], device='cuda:0'), grad: tensor([-7.8045e-07, 1.0937e-04, 5.3120e-04, 6.1321e-04, 1.9109e-04, 1.2064e-03, 2.4056e-04, -3.5286e-04, -2.6760e-03, 1.3494e-04], device='cuda:0') 100 0.0001 changing lr epoch 59, time 265.37, cls_loss 0.0142 cls_loss_mapping 0.0233 cls_loss_causal 0.6752 re_mapping 0.0155 re_causal 0.0393 /// teacc 98.77 lr 0.00010000 Epoch 61, weight, value: tensor([[-0.0316, -0.0568, -0.0441, ..., -0.0200, 0.0972, 0.0781], [-0.0916, -0.0971, -0.0974, ..., -0.0647, -0.0899, -0.0511], [-0.0536, -0.0385, 0.0735, ..., -0.0583, 0.0917, 0.0032], ..., [-0.0540, 0.0361, 0.0388, ..., 0.0723, -0.0554, -0.0668], [-0.1138, 0.0429, -0.0956, ..., 0.0244, -0.0435, -0.0377], [ 0.0417, -0.0212, -0.0482, ..., -0.0050, -0.0346, -0.0073]], device='cuda:0'), grad: tensor([[ 3.1114e-05, 1.5569e-04, -3.4124e-06, ..., 1.6633e-06, 6.7353e-05, -1.9148e-05], [ 5.8673e-07, 3.0547e-05, 4.6305e-06, ..., 2.0891e-05, -3.9101e-04, 4.2878e-06], [ 1.1310e-05, 1.8501e-04, -4.8205e-06, ..., 1.3359e-05, 5.9247e-05, 7.4089e-05], ..., [ 1.2461e-06, -1.0781e-05, -4.2357e-06, ..., -7.0572e-05, 2.8938e-05, 6.6385e-06], [ 2.8536e-05, 3.1531e-05, 6.2771e-06, ..., 2.0817e-05, 3.1590e-04, 3.6597e-05], [-6.4187e-06, 8.6576e-06, -3.1609e-06, ..., 1.2301e-05, 5.9828e-06, 1.8273e-06]], device='cuda:0') Epoch 61, bias, value: tensor([-0.0141, -0.0235, -0.0140, -0.0278, -0.0311, -0.0014, 0.0234, -0.0103, 0.0329, -0.0003], device='cuda:0'), grad: tensor([ 0.0002, -0.0057, 0.0005, -0.0007, -0.0013, 0.0001, 0.0021, 0.0002, 0.0043, 0.0003], device='cuda:0') 100 0.0001 changing lr epoch 60, time 265.16, cls_loss 0.0152 cls_loss_mapping 0.0236 cls_loss_causal 0.6916 re_mapping 0.0143 re_causal 0.0368 /// teacc 98.79 lr 0.00010000 Epoch 62, weight, value: tensor([[-0.0316, -0.0573, -0.0444, ..., -0.0200, 0.0976, 0.0789], [-0.0921, -0.0977, -0.0979, ..., -0.0653, -0.0907, -0.0511], [-0.0543, -0.0380, 0.0741, ..., -0.0591, 0.0925, 0.0034], ..., [-0.0545, 0.0366, 0.0389, ..., 0.0730, -0.0567, -0.0677], [-0.1146, 0.0430, -0.0962, ..., 0.0250, -0.0442, -0.0381], [ 0.0420, -0.0219, -0.0482, ..., -0.0058, -0.0336, -0.0078]], device='cuda:0'), grad: tensor([[ 4.4294e-06, 3.5726e-06, 2.7940e-07, ..., 2.3413e-06, -8.4996e-05, -6.3837e-05], [ 5.3570e-06, 1.1273e-05, 1.5274e-07, ..., 1.6037e-06, 1.1977e-06, 7.1153e-07], [ 7.5176e-06, 7.5340e-05, 3.9898e-06, ..., 1.4335e-05, 1.2621e-05, 3.5055e-06], ..., [ 8.0913e-06, 1.4283e-05, 8.4564e-07, ..., -1.8492e-05, -2.9802e-08, 4.8056e-07], [ 2.4691e-05, 2.6867e-05, -1.6652e-06, ..., -1.0736e-05, -2.6841e-06, 3.4124e-06], [-7.2233e-06, 1.6198e-05, 1.2703e-06, ..., 9.9689e-06, 5.2571e-05, 4.0799e-05]], device='cuda:0') Epoch 62, bias, value: tensor([-0.0147, -0.0229, -0.0140, -0.0283, -0.0312, -0.0010, 0.0229, -0.0106, 0.0328, 0.0005], device='cuda:0'), grad: tensor([-7.4863e-05, -3.1665e-06, 1.1635e-04, -2.3711e-04, 8.4937e-05, 4.4405e-05, 2.5779e-05, 5.5015e-05, 4.6760e-05, -5.8174e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 61---------------------------------------------------- epoch 61, time 282.08, cls_loss 0.0121 cls_loss_mapping 0.0156 cls_loss_causal 0.6431 re_mapping 0.0137 re_causal 0.0341 /// teacc 98.80 lr 0.00010000 Epoch 63, weight, value: tensor([[-0.0324, -0.0579, -0.0446, ..., -0.0209, 0.0980, 0.0792], [-0.0925, -0.0981, -0.0977, ..., -0.0659, -0.0913, -0.0511], [-0.0549, -0.0381, 0.0745, ..., -0.0598, 0.0930, 0.0035], ..., [-0.0554, 0.0370, 0.0389, ..., 0.0737, -0.0574, -0.0682], [-0.1155, 0.0430, -0.0964, ..., 0.0253, -0.0441, -0.0383], [ 0.0418, -0.0225, -0.0485, ..., -0.0066, -0.0339, -0.0083]], device='cuda:0'), grad: tensor([[ 1.2860e-05, 6.2212e-06, 2.4810e-06, ..., 2.0996e-05, 3.5968e-06, -1.2822e-05], [ 4.0419e-06, 2.0728e-05, 1.3471e-05, ..., 7.8321e-05, -6.1207e-06, -9.1344e-06], [ 3.5428e-06, 5.6595e-05, -1.3866e-05, ..., 3.6418e-05, -2.6941e-04, 8.7395e-06], ..., [ 5.2452e-06, -2.0409e-04, -3.0637e-05, ..., -2.7895e-04, -1.7267e-06, 1.3281e-06], [ 2.2218e-05, 1.8671e-05, 8.4564e-07, ..., 7.9945e-06, 4.6603e-06, 6.1467e-08], [-2.9683e-04, -5.3532e-06, 9.2015e-06, ..., -2.5481e-05, -2.5168e-05, 3.1106e-06]], device='cuda:0') Epoch 63, bias, value: tensor([-0.0152, -0.0223, -0.0141, -0.0284, -0.0305, -0.0006, 0.0232, -0.0109, 0.0329, -0.0003], device='cuda:0'), grad: tensor([ 8.5115e-05, 1.4925e-04, -1.7717e-05, 4.1318e-04, 4.7898e-04, 5.2118e-04, 4.2528e-05, -6.6328e-04, 5.8353e-05, -1.0672e-03], device='cuda:0') 100 0.0001 changing lr epoch 62, time 265.67, cls_loss 0.0129 cls_loss_mapping 0.0170 cls_loss_causal 0.6684 re_mapping 0.0140 re_causal 0.0348 /// teacc 98.73 lr 0.00010000 Epoch 64, weight, value: tensor([[-0.0328, -0.0583, -0.0447, ..., -0.0210, 0.0985, 0.0796], [-0.0933, -0.0987, -0.0983, ..., -0.0669, -0.0929, -0.0520], [-0.0551, -0.0383, 0.0750, ..., -0.0607, 0.0944, 0.0048], ..., [-0.0548, 0.0372, 0.0390, ..., 0.0748, -0.0580, -0.0691], [-0.1163, 0.0430, -0.0965, ..., 0.0254, -0.0451, -0.0388], [ 0.0421, -0.0229, -0.0486, ..., -0.0072, -0.0344, -0.0087]], device='cuda:0'), grad: tensor([[ 2.5660e-05, 1.9699e-05, 3.0845e-06, ..., 2.9653e-05, -3.3349e-05, -1.2346e-05], [ 6.8918e-06, 8.8438e-06, 8.1770e-07, ..., 1.7285e-05, 2.5425e-06, 2.3078e-06], [ 5.1484e-06, 7.0520e-06, 5.1409e-07, ..., 9.7305e-06, -5.8375e-06, 1.1362e-07], ..., [-1.6659e-05, -2.8893e-05, -3.9302e-06, ..., -6.3956e-05, 6.2287e-06, 4.2394e-06], [-3.4366e-06, -5.4538e-05, 1.3858e-06, ..., -1.6928e-04, 8.4341e-06, -3.2365e-05], [ 3.6269e-05, 5.9187e-05, -4.4182e-06, ..., 1.9073e-04, 7.9349e-06, 3.2395e-05]], device='cuda:0') Epoch 64, bias, value: tensor([-0.0153, -0.0229, -0.0134, -0.0283, -0.0308, -0.0009, 0.0233, -0.0110, 0.0327, 0.0001], device='cuda:0'), grad: tensor([ 4.9025e-05, 3.7760e-05, 3.7134e-05, 5.4836e-05, -3.6502e-04, 7.9513e-05, -2.6956e-05, 1.5581e-04, -9.9468e-04, 9.7227e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 63---------------------------------------------------- epoch 63, time 281.33, cls_loss 0.0141 cls_loss_mapping 0.0188 cls_loss_causal 0.6417 re_mapping 0.0137 re_causal 0.0341 /// teacc 98.81 lr 0.00010000 Epoch 65, weight, value: tensor([[-0.0334, -0.0586, -0.0449, ..., -0.0213, 0.0991, 0.0804], [-0.0943, -0.0994, -0.0981, ..., -0.0681, -0.0942, -0.0522], [-0.0560, -0.0390, 0.0750, ..., -0.0617, 0.0951, 0.0051], ..., [-0.0550, 0.0378, 0.0393, ..., 0.0761, -0.0587, -0.0695], [-0.1184, 0.0430, -0.0966, ..., 0.0254, -0.0462, -0.0392], [ 0.0427, -0.0235, -0.0488, ..., -0.0079, -0.0339, -0.0094]], device='cuda:0'), grad: tensor([[ 1.7241e-05, 1.1958e-06, 7.6368e-08, ..., 2.8804e-05, -3.6693e-04, -3.1424e-04], [ 3.3733e-06, 4.1090e-06, 2.2165e-07, ..., 8.4937e-05, 1.5414e-04, 8.7142e-05], [ 1.9092e-06, 3.9674e-07, -1.0654e-06, ..., 1.2815e-04, 1.2147e-04, 2.2173e-05], ..., [ 4.0866e-06, 5.2862e-06, -3.2037e-07, ..., 4.8243e-06, 1.4864e-05, 4.3809e-06], [ 2.7224e-05, 1.9297e-05, 3.9674e-07, ..., -7.5579e-04, -3.7670e-04, 1.4675e-04], [ 1.4454e-05, 3.6135e-06, 2.6263e-07, ..., 1.8507e-05, 3.4213e-05, 2.5436e-05]], device='cuda:0') Epoch 65, bias, value: tensor([-0.0157, -0.0226, -0.0137, -0.0281, -0.0307, -0.0015, 0.0239, -0.0105, 0.0321, 0.0002], device='cuda:0'), grad: tensor([-2.7585e-04, 5.7507e-04, 8.1921e-04, 1.6260e-04, 9.7394e-05, 2.2106e-03, 6.7472e-04, 1.3077e-04, -4.4632e-03, 6.9380e-05], device='cuda:0') 100 0.0001 changing lr epoch 64, time 265.17, cls_loss 0.0137 cls_loss_mapping 0.0175 cls_loss_causal 0.6738 re_mapping 0.0138 re_causal 0.0352 /// teacc 98.80 lr 0.00010000 Epoch 66, weight, value: tensor([[-0.0338, -0.0591, -0.0451, ..., -0.0216, 0.1002, 0.0811], [-0.0950, -0.1000, -0.0984, ..., -0.0689, -0.0951, -0.0521], [-0.0568, -0.0393, 0.0757, ..., -0.0624, 0.0956, 0.0055], ..., [-0.0554, 0.0380, 0.0392, ..., 0.0767, -0.0593, -0.0702], [-0.1195, 0.0430, -0.0970, ..., 0.0254, -0.0468, -0.0397], [ 0.0434, -0.0235, -0.0490, ..., -0.0078, -0.0344, -0.0099]], device='cuda:0'), grad: tensor([[ 4.6007e-06, 3.8892e-05, 7.7561e-06, ..., 1.9222e-05, 4.4443e-06, 4.1202e-06], [ 2.9411e-06, 4.8727e-05, 1.2666e-05, ..., 3.9339e-05, 1.7434e-05, 2.0992e-06], [ 2.5928e-05, 4.7708e-04, 1.9026e-04, ..., 4.7827e-04, 2.3854e-04, 2.4699e-06], ..., [-2.8029e-05, -6.7091e-04, -2.6512e-04, ..., -6.9523e-04, -3.1710e-04, 9.7044e-07], [ 8.2329e-06, -1.2326e-04, 9.9167e-06, ..., 2.4229e-05, 3.0756e-05, -6.2644e-05], [ 2.7772e-06, 1.2839e-04, 1.5102e-05, ..., 4.1336e-05, 2.1592e-05, 3.9309e-05]], device='cuda:0') Epoch 66, bias, value: tensor([-0.0155, -0.0226, -0.0134, -0.0279, -0.0304, -0.0024, 0.0245, -0.0111, 0.0320, 0.0006], device='cuda:0'), grad: tensor([ 1.1611e-04, 1.4460e-04, 1.5898e-03, 1.3793e-04, 4.5300e-05, 5.8085e-05, -1.0245e-06, -2.1534e-03, -3.1590e-04, 3.7861e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 65---------------------------------------------------- epoch 65, time 281.46, cls_loss 0.0119 cls_loss_mapping 0.0165 cls_loss_causal 0.6781 re_mapping 0.0136 re_causal 0.0363 /// teacc 98.84 lr 0.00010000 Epoch 67, weight, value: tensor([[-0.0344, -0.0600, -0.0453, ..., -0.0219, 0.1008, 0.0817], [-0.0954, -0.1008, -0.0986, ..., -0.0690, -0.0957, -0.0524], [-0.0574, -0.0397, 0.0759, ..., -0.0637, 0.0963, 0.0056], ..., [-0.0563, 0.0384, 0.0394, ..., 0.0771, -0.0594, -0.0705], [-0.1201, 0.0430, -0.0969, ..., 0.0255, -0.0470, -0.0402], [ 0.0431, -0.0244, -0.0492, ..., -0.0083, -0.0348, -0.0102]], device='cuda:0'), grad: tensor([[ 2.6729e-06, 3.6471e-06, 1.5832e-07, ..., -2.9847e-05, -1.3494e-04, -1.0055e-04], [ 3.7048e-06, 4.3184e-05, 2.9001e-06, ..., 1.5259e-04, 1.0431e-05, 9.1568e-06], [ 4.3735e-06, 1.1438e-04, 5.9828e-06, ..., 1.3304e-04, 1.5870e-05, 1.4454e-06], ..., [ 7.5735e-06, -1.0425e-04, -8.3223e-06, ..., -9.7513e-05, -1.2755e-05, 1.6242e-06], [ 5.2899e-07, -1.0860e-04, 3.5763e-07, ..., -8.2445e-04, -1.3858e-05, -1.3582e-05], [ 6.4559e-06, 1.0818e-05, 6.0536e-07, ..., 1.5974e-05, 1.0528e-05, 5.7444e-06]], device='cuda:0') Epoch 67, bias, value: tensor([-1.5696e-02, -2.1927e-02, -1.3872e-02, -2.7807e-02, -3.0770e-02, -1.5916e-03, 2.4542e-02, -1.0917e-02, 3.1704e-02, 9.6543e-05], device='cuda:0'), grad: tensor([-1.5485e-04, 3.9387e-04, 4.3249e-04, -6.4909e-05, 6.3276e-04, -1.4267e-03, 2.1305e-03, -1.4639e-04, -1.8282e-03, 2.6956e-05], device='cuda:0') 100 0.0001 changing lr epoch 66, time 265.34, cls_loss 0.0129 cls_loss_mapping 0.0197 cls_loss_causal 0.6607 re_mapping 0.0133 re_causal 0.0350 /// teacc 98.83 lr 0.00010000 Epoch 68, weight, value: tensor([[-0.0344, -0.0604, -0.0456, ..., -0.0219, 0.1017, 0.0826], [-0.0974, -0.1012, -0.0994, ..., -0.0702, -0.0973, -0.0533], [-0.0560, -0.0411, 0.0762, ..., -0.0656, 0.0973, 0.0066], ..., [-0.0571, 0.0382, 0.0399, ..., 0.0776, -0.0598, -0.0720], [-0.1202, 0.0432, -0.0968, ..., 0.0258, -0.0472, -0.0405], [ 0.0430, -0.0252, -0.0495, ..., -0.0088, -0.0350, -0.0105]], device='cuda:0'), grad: tensor([[ 2.1592e-05, 3.2149e-06, 2.6263e-07, ..., 3.7253e-06, 3.4031e-06, 5.0552e-06], [ 4.9360e-06, 3.8958e-04, 9.2015e-06, ..., 6.3801e-04, 3.5197e-05, 1.6689e-05], [ 1.9401e-05, 3.3021e-05, -1.2182e-05, ..., 6.8136e-06, -4.6968e-05, -1.7360e-05], ..., [ 5.4613e-06, -5.5027e-04, -9.5554e-07, ..., -8.8406e-04, 4.6007e-06, 2.8089e-06], [ 1.8284e-05, 4.7892e-05, 1.1344e-06, ..., 7.9453e-05, 4.2506e-06, 2.6897e-06], [-4.1574e-05, 7.1287e-05, 1.0524e-06, ..., 8.2076e-05, 1.0841e-06, 1.1083e-06]], device='cuda:0') Epoch 68, bias, value: tensor([-0.0153, -0.0232, -0.0133, -0.0275, -0.0311, -0.0017, 0.0242, -0.0107, 0.0326, -0.0003], device='cuda:0'), grad: tensor([ 3.2544e-05, 1.1053e-03, -1.9237e-05, -6.5982e-05, 1.8966e-04, 7.2896e-05, -1.2957e-05, -1.3857e-03, 1.8251e-04, -1.0067e-04], device='cuda:0') 100 0.0001 changing lr epoch 67, time 265.12, cls_loss 0.0109 cls_loss_mapping 0.0153 cls_loss_causal 0.6328 re_mapping 0.0137 re_causal 0.0351 /// teacc 98.78 lr 0.00010000 Epoch 69, weight, value: tensor([[-0.0347, -0.0609, -0.0457, ..., -0.0224, 0.1028, 0.0831], [-0.0988, -0.1027, -0.0997, ..., -0.0715, -0.0984, -0.0539], [-0.0564, -0.0414, 0.0766, ..., -0.0663, 0.0979, 0.0069], ..., [-0.0578, 0.0388, 0.0402, ..., 0.0786, -0.0601, -0.0724], [-0.1210, 0.0431, -0.0972, ..., 0.0262, -0.0474, -0.0405], [ 0.0433, -0.0259, -0.0498, ..., -0.0094, -0.0358, -0.0109]], device='cuda:0'), grad: tensor([[ 4.6715e-06, 2.6412e-06, 1.9558e-08, ..., 2.6450e-07, -3.0696e-05, -1.1936e-05], [ 4.2934e-07, 2.2091e-06, 1.4249e-07, ..., 1.8319e-06, 2.6878e-06, -2.1076e-04], [ 3.2540e-06, 2.7314e-05, -2.8312e-07, ..., 2.0880e-06, 1.6302e-05, 9.6321e-05], ..., [ 4.4331e-07, -1.7598e-05, 2.7195e-07, ..., -1.0997e-05, 3.6508e-06, 7.2360e-05], [ 2.6040e-06, 1.1750e-05, 7.6368e-08, ..., -3.3733e-06, 8.4266e-06, 1.7956e-05], [-2.5611e-06, 1.5851e-06, 3.5856e-07, ..., 3.5204e-07, 1.1601e-05, 1.5020e-05]], device='cuda:0') Epoch 69, bias, value: tensor([-0.0152, -0.0242, -0.0136, -0.0276, -0.0316, -0.0012, 0.0248, -0.0102, 0.0329, -0.0004], device='cuda:0'), grad: tensor([ 1.6332e-05, -9.7132e-04, 6.1607e-04, -5.9247e-05, -4.5753e-04, 4.5002e-05, 9.5963e-05, 5.4121e-04, 8.3804e-05, 9.0599e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 68---------------------------------------------------- epoch 68, time 281.68, cls_loss 0.0112 cls_loss_mapping 0.0139 cls_loss_causal 0.6863 re_mapping 0.0132 re_causal 0.0341 /// teacc 98.88 lr 0.00010000 Epoch 70, weight, value: tensor([[-0.0359, -0.0612, -0.0459, ..., -0.0227, 0.1039, 0.0837], [-0.0996, -0.1031, -0.1000, ..., -0.0721, -0.0996, -0.0541], [-0.0569, -0.0419, 0.0766, ..., -0.0670, 0.0984, 0.0069], ..., [-0.0581, 0.0395, 0.0407, ..., 0.0794, -0.0607, -0.0726], [-0.1221, 0.0433, -0.0975, ..., 0.0266, -0.0476, -0.0407], [ 0.0437, -0.0264, -0.0500, ..., -0.0098, -0.0360, -0.0114]], device='cuda:0'), grad: tensor([[ 6.5416e-06, 3.4180e-07, 8.7544e-08, ..., -1.7524e-05, -1.2887e-04, -8.5115e-05], [ 1.1949e-06, 1.1489e-05, 5.2154e-08, ..., 6.4038e-06, 6.7428e-07, 2.9244e-07], [ 2.2035e-06, 1.9949e-06, -2.0023e-06, ..., 2.2855e-06, -4.5188e-06, -3.8557e-07], ..., [ 8.0280e-07, -7.6830e-05, 2.4587e-07, ..., -3.7223e-05, 1.1083e-06, 6.2771e-07], [ 1.7181e-05, 2.5462e-06, 1.2033e-06, ..., -6.6310e-06, 9.4399e-06, 9.5144e-06], [-1.3635e-05, 3.5763e-07, 7.4506e-09, ..., 6.1989e-06, 1.3225e-06, 9.7137e-07]], device='cuda:0') Epoch 70, bias, value: tensor([-0.0149, -0.0238, -0.0133, -0.0276, -0.0317, -0.0018, 0.0250, -0.0103, 0.0327, -0.0003], device='cuda:0'), grad: tensor([-1.3340e-04, 3.0637e-04, 1.5676e-05, 9.3877e-05, 5.6553e-04, 3.6955e-05, 1.1021e-04, -5.9158e-05, 1.4222e-04, -1.0767e-03], device='cuda:0') 100 0.0001 changing lr epoch 69, time 265.81, cls_loss 0.0110 cls_loss_mapping 0.0166 cls_loss_causal 0.6154 re_mapping 0.0130 re_causal 0.0328 /// teacc 98.72 lr 0.00010000 Epoch 71, weight, value: tensor([[-0.0361, -0.0616, -0.0460, ..., -0.0228, 0.1036, 0.0838], [-0.0999, -0.1029, -0.1002, ..., -0.0714, -0.1001, -0.0538], [-0.0577, -0.0432, 0.0770, ..., -0.0679, 0.0982, 0.0067], ..., [-0.0584, 0.0395, 0.0406, ..., 0.0794, -0.0610, -0.0733], [-0.1231, 0.0432, -0.0978, ..., 0.0266, -0.0485, -0.0412], [ 0.0437, -0.0263, -0.0501, ..., -0.0098, -0.0354, -0.0119]], device='cuda:0'), grad: tensor([[ 1.6484e-06, -4.3884e-06, 2.0582e-07, ..., -6.5923e-05, -1.0939e-03, -5.4312e-04], [ 7.1432e-07, 6.6459e-05, 1.1921e-07, ..., 1.9908e-05, 8.9481e-06, 4.7907e-06], [ 4.6939e-07, 1.2165e-04, 1.4715e-07, ..., 3.9726e-05, 1.8150e-05, 1.1243e-05], ..., [ 5.4110e-07, -1.0872e-03, 1.9930e-07, ..., -2.6488e-04, 1.6928e-05, 1.0557e-05], [ 4.7125e-06, 7.3649e-06, 4.3306e-07, ..., 3.1054e-05, 1.4472e-04, 1.1861e-04], [-7.3202e-07, 5.4407e-04, -3.0920e-07, ..., 1.2010e-04, 1.5092e-04, 6.6638e-05]], device='cuda:0') Epoch 71, bias, value: tensor([-0.0158, -0.0223, -0.0146, -0.0275, -0.0308, -0.0016, 0.0252, -0.0113, 0.0324, 0.0001], device='cuda:0'), grad: tensor([-1.5755e-03, 1.1009e-04, 2.0659e-04, 3.7217e-04, 2.0528e-04, 6.6698e-05, 1.0099e-03, -1.4505e-03, 1.5497e-04, 8.9931e-04], device='cuda:0') 100 0.0001 changing lr epoch 70, time 265.24, cls_loss 0.0120 cls_loss_mapping 0.0149 cls_loss_causal 0.6525 re_mapping 0.0127 re_causal 0.0332 /// teacc 98.72 lr 0.00010000 Epoch 72, weight, value: tensor([[-0.0365, -0.0619, -0.0462, ..., -0.0228, 0.1047, 0.0846], [-0.1005, -0.1046, -0.1008, ..., -0.0728, -0.1006, -0.0539], [-0.0581, -0.0438, 0.0773, ..., -0.0689, 0.0986, 0.0071], ..., [-0.0588, 0.0402, 0.0407, ..., 0.0805, -0.0615, -0.0738], [-0.1241, 0.0435, -0.0965, ..., 0.0270, -0.0492, -0.0417], [ 0.0437, -0.0270, -0.0503, ..., -0.0103, -0.0357, -0.0125]], device='cuda:0'), grad: tensor([[ 4.6454e-06, 1.1988e-05, 3.9823e-06, ..., -9.5546e-05, -1.1320e-03, -7.8726e-04], [ 2.9299e-06, 3.0816e-05, 6.5342e-06, ..., 1.7643e-05, 1.5891e-04, 1.1069e-04], [ 4.0159e-06, 1.0371e-04, 1.7568e-05, ..., 8.6308e-05, 3.4833e-04, 2.6512e-04], ..., [ 2.7232e-06, -2.3830e-04, -5.6446e-05, ..., -1.3375e-04, 1.3936e-04, 6.4373e-05], [ 7.6257e-06, 3.8683e-05, 6.0536e-06, ..., 2.6181e-05, 6.5207e-05, 4.6849e-05], [ 3.8631e-06, 3.9339e-05, 6.1132e-06, ..., 2.4676e-05, 1.7747e-05, 1.1854e-05]], device='cuda:0') Epoch 72, bias, value: tensor([-0.0153, -0.0237, -0.0142, -0.0274, -0.0311, -0.0018, 0.0252, -0.0103, 0.0329, -0.0002], device='cuda:0'), grad: tensor([-1.7014e-03, 2.7800e-04, 7.3862e-04, 3.5048e-05, 1.0097e-04, 1.0300e-04, 4.4250e-04, -2.6035e-04, 1.9276e-04, 7.1406e-05], device='cuda:0') 100 0.0001 changing lr epoch 71, time 265.35, cls_loss 0.0130 cls_loss_mapping 0.0142 cls_loss_causal 0.6438 re_mapping 0.0132 re_causal 0.0336 /// teacc 98.78 lr 0.00010000 Epoch 73, weight, value: tensor([[-0.0376, -0.0623, -0.0464, ..., -0.0232, 0.1055, 0.0852], [-0.1010, -0.1062, -0.1010, ..., -0.0733, -0.1009, -0.0535], [-0.0586, -0.0447, 0.0778, ..., -0.0699, 0.0996, 0.0069], ..., [-0.0594, 0.0405, 0.0408, ..., 0.0813, -0.0621, -0.0743], [-0.1251, 0.0437, -0.0970, ..., 0.0260, -0.0500, -0.0421], [ 0.0441, -0.0282, -0.0507, ..., -0.0116, -0.0360, -0.0130]], device='cuda:0'), grad: tensor([[ 1.2159e-05, 6.6720e-06, 2.6077e-07, ..., 1.0356e-05, 4.6082e-06, 7.4171e-06], [-2.5272e-05, 1.6809e-05, 2.8592e-07, ..., 7.7933e-06, 9.9465e-07, 7.4785e-07], [ 8.8364e-06, 3.6210e-05, -2.4568e-06, ..., 1.4737e-05, -5.7183e-06, -3.7365e-06], ..., [ 2.6878e-06, -2.3448e-04, 6.7241e-07, ..., -1.1295e-04, 1.0524e-06, 5.3551e-07], [ 1.1012e-05, 1.1530e-06, 7.8324e-07, ..., -4.4815e-06, 5.6475e-06, 5.3383e-06], [ 7.4208e-06, 8.8632e-05, 3.6601e-07, ..., 3.8713e-05, 1.0813e-06, 8.1118e-07]], device='cuda:0') Epoch 73, bias, value: tensor([-0.0151, -0.0232, -0.0144, -0.0272, -0.0307, -0.0009, 0.0251, -0.0102, 0.0318, -0.0011], device='cuda:0'), grad: tensor([ 5.4985e-05, -3.3417e-03, 1.2469e-04, 7.8261e-05, -2.6274e-04, 2.1970e-04, 8.8289e-07, -2.3389e-04, 2.5787e-03, 7.8058e-04], device='cuda:0') 100 0.0001 changing lr epoch 72, time 264.95, cls_loss 0.0130 cls_loss_mapping 0.0179 cls_loss_causal 0.6449 re_mapping 0.0128 re_causal 0.0323 /// teacc 98.85 lr 0.00010000 Epoch 74, weight, value: tensor([[-0.0381, -0.0627, -0.0466, ..., -0.0230, 0.1059, 0.0855], [-0.1015, -0.1078, -0.1010, ..., -0.0743, -0.1015, -0.0524], [-0.0591, -0.0445, 0.0782, ..., -0.0707, 0.1001, 0.0073], ..., [-0.0593, 0.0404, 0.0409, ..., 0.0815, -0.0625, -0.0750], [-0.1247, 0.0450, -0.0979, ..., 0.0283, -0.0494, -0.0424], [ 0.0443, -0.0290, -0.0507, ..., -0.0121, -0.0361, -0.0132]], device='cuda:0'), grad: tensor([[ 2.4401e-06, 7.0632e-05, 3.9302e-07, ..., -2.4959e-06, 8.8751e-05, 6.2764e-05], [ 7.1712e-06, 8.3521e-06, 1.5274e-07, ..., 8.7731e-07, 1.0766e-05, 7.6108e-06], [ 5.8860e-07, -6.8855e-04, -4.2878e-06, ..., 9.4436e-07, -9.2840e-04, -6.8045e-04], ..., [ 5.0962e-06, 4.9305e-04, 2.3991e-06, ..., -3.4012e-06, 6.7759e-04, 4.8876e-04], [ 6.6981e-06, 3.0056e-05, 9.0711e-07, ..., 2.4159e-06, 3.4720e-05, 2.3365e-05], [-1.0949e-04, 5.7697e-05, 2.6226e-06, ..., -4.2841e-06, 1.2493e-04, 7.4029e-05]], device='cuda:0') Epoch 74, bias, value: tensor([-0.0153, -0.0233, -0.0146, -0.0273, -0.0307, -0.0012, 0.0238, -0.0103, 0.0332, -0.0007], device='cuda:0'), grad: tensor([ 2.7442e-04, 1.2040e-05, -2.4643e-03, 7.1764e-05, 1.4467e-03, 7.5698e-05, 8.5592e-05, 1.9588e-03, 1.9836e-04, -1.6613e-03], device='cuda:0') 100 0.0001 changing lr epoch 73, time 265.31, cls_loss 0.0130 cls_loss_mapping 0.0172 cls_loss_causal 0.6246 re_mapping 0.0124 re_causal 0.0307 /// teacc 98.81 lr 0.00010000 Epoch 75, weight, value: tensor([[-0.0392, -0.0634, -0.0466, ..., -0.0235, 0.1063, 0.0858], [-0.1023, -0.1074, -0.1013, ..., -0.0754, -0.1022, -0.0528], [-0.0596, -0.0444, 0.0785, ..., -0.0714, 0.1006, 0.0078], ..., [-0.0591, 0.0405, 0.0408, ..., 0.0828, -0.0636, -0.0760], [-0.1257, 0.0452, -0.0972, ..., 0.0285, -0.0499, -0.0428], [ 0.0450, -0.0298, -0.0508, ..., -0.0132, -0.0364, -0.0142]], device='cuda:0'), grad: tensor([[ 1.9744e-05, 7.4357e-06, 2.1309e-06, ..., 1.0997e-05, -5.3123e-06, -2.7083e-06], [ 2.0087e-05, 2.0340e-05, 4.2021e-06, ..., 2.3156e-05, 1.6123e-05, 6.1058e-06], [ 2.3127e-05, -2.5462e-06, -5.6595e-05, ..., 4.8578e-05, -1.4114e-04, -7.2837e-05], ..., [ 3.4362e-05, -2.6762e-05, 4.1455e-05, ..., -3.0667e-05, 1.0407e-04, 4.7088e-05], [ 7.7188e-05, 1.1559e-03, 8.3447e-07, ..., -3.5930e-06, -1.4409e-05, 2.7958e-06], [ 2.7910e-05, -1.2131e-03, 1.0617e-07, ..., 2.1800e-05, 4.3139e-06, 2.4773e-06]], device='cuda:0') Epoch 75, bias, value: tensor([-0.0156, -0.0228, -0.0144, -0.0276, -0.0302, -0.0009, 0.0238, -0.0104, 0.0332, -0.0013], device='cuda:0'), grad: tensor([ 5.1051e-05, 1.6427e-04, 2.8181e-04, 8.0299e-04, -8.1348e-04, -9.2506e-04, 2.3985e-04, 2.2018e-04, 2.3346e-03, -2.3518e-03], device='cuda:0') 100 0.0001 changing lr epoch 74, time 265.16, cls_loss 0.0114 cls_loss_mapping 0.0146 cls_loss_causal 0.6592 re_mapping 0.0124 re_causal 0.0320 /// teacc 98.68 lr 0.00010000 Epoch 76, weight, value: tensor([[-0.0399, -0.0640, -0.0468, ..., -0.0238, 0.1068, 0.0861], [-0.1028, -0.1070, -0.1019, ..., -0.0755, -0.1030, -0.0531], [-0.0601, -0.0445, 0.0795, ..., -0.0720, 0.1014, 0.0084], ..., [-0.0595, 0.0407, 0.0407, ..., 0.0836, -0.0645, -0.0768], [-0.1264, 0.0453, -0.0978, ..., 0.0283, -0.0503, -0.0426], [ 0.0457, -0.0302, -0.0511, ..., -0.0136, -0.0364, -0.0148]], device='cuda:0'), grad: tensor([[ 3.7611e-05, 4.2841e-07, 5.0329e-06, ..., 1.6883e-05, -7.1786e-06, -2.5723e-06], [ 8.2180e-06, 1.7695e-06, 1.5441e-06, ..., 6.2473e-06, 3.8520e-06, 4.0457e-06], [ 3.4515e-06, 2.9076e-06, -1.0028e-05, ..., 9.2983e-06, -1.8850e-06, 3.7532e-06], ..., [ 5.5879e-06, 3.4645e-07, 8.5682e-06, ..., -5.2899e-07, 9.6187e-06, 4.0345e-06], [ 1.1891e-04, -7.4506e-06, 8.4750e-07, ..., 1.2434e-04, 7.6964e-06, 7.7710e-06], [ 1.4961e-05, 6.7428e-07, 6.2101e-06, ..., 1.1019e-05, 1.8507e-05, 8.0094e-06]], device='cuda:0') Epoch 76, bias, value: tensor([-0.0158, -0.0225, -0.0141, -0.0278, -0.0303, -0.0008, 0.0234, -0.0108, 0.0333, -0.0011], device='cuda:0'), grad: tensor([ 4.6164e-05, 1.4737e-05, 1.5348e-05, 6.1131e-04, 3.0994e-05, -1.1139e-03, 1.1122e-04, 3.2812e-05, 2.0659e-04, 4.4107e-05], device='cuda:0') 100 0.0001 changing lr epoch 75, time 265.37, cls_loss 0.0116 cls_loss_mapping 0.0156 cls_loss_causal 0.6792 re_mapping 0.0127 re_causal 0.0312 /// teacc 98.79 lr 0.00010000 Epoch 77, weight, value: tensor([[-0.0406, -0.0647, -0.0470, ..., -0.0241, 0.1071, 0.0863], [-0.1032, -0.1074, -0.1025, ..., -0.0759, -0.1038, -0.0534], [-0.0605, -0.0448, 0.0798, ..., -0.0728, 0.1022, 0.0086], ..., [-0.0595, 0.0412, 0.0412, ..., 0.0846, -0.0651, -0.0773], [-0.1271, 0.0455, -0.0975, ..., 0.0287, -0.0504, -0.0428], [ 0.0455, -0.0304, -0.0511, ..., -0.0144, -0.0367, -0.0152]], device='cuda:0'), grad: tensor([[ 2.2128e-06, 6.7592e-05, 1.7751e-06, ..., 1.6280e-06, -1.0781e-05, -1.5303e-05], [ 3.4831e-07, 3.9577e-05, 5.8740e-05, ..., 2.7847e-06, 4.6760e-05, 9.3356e-06], [ 4.5449e-07, 4.5013e-04, -5.9187e-05, ..., 7.6443e-06, -2.0370e-05, -1.3106e-05], ..., [ 3.1479e-07, 8.9943e-05, 9.9987e-06, ..., -2.8118e-05, 1.4655e-05, 2.7902e-06], [ 3.2075e-06, 3.1352e-05, 2.8703e-06, ..., 3.6899e-06, 1.3627e-05, 8.1211e-06], [ 8.4750e-07, 4.2647e-05, -1.0408e-05, ..., 8.7693e-06, 2.9039e-06, 3.1982e-06]], device='cuda:0') Epoch 77, bias, value: tensor([-0.0163, -0.0227, -0.0139, -0.0271, -0.0303, -0.0020, 0.0240, -0.0110, 0.0338, -0.0009], device='cuda:0'), grad: tensor([ 1.8418e-04, 3.4142e-04, 9.6321e-04, -1.9569e-03, 1.5378e-05, 4.2796e-05, 1.0110e-05, 3.4118e-04, 1.0854e-04, -4.8429e-05], device='cuda:0') 100 0.0001 changing lr epoch 76, time 265.20, cls_loss 0.0099 cls_loss_mapping 0.0129 cls_loss_causal 0.6261 re_mapping 0.0120 re_causal 0.0312 /// teacc 98.80 lr 0.00010000 Epoch 78, weight, value: tensor([[-0.0409, -0.0653, -0.0473, ..., -0.0246, 0.1067, 0.0863], [-0.1034, -0.1080, -0.1029, ..., -0.0768, -0.1042, -0.0535], [-0.0607, -0.0452, 0.0802, ..., -0.0737, 0.1029, 0.0093], ..., [-0.0595, 0.0419, 0.0414, ..., 0.0857, -0.0658, -0.0782], [-0.1279, 0.0454, -0.0978, ..., 0.0285, -0.0506, -0.0433], [ 0.0453, -0.0310, -0.0511, ..., -0.0147, -0.0358, -0.0150]], device='cuda:0'), grad: tensor([[ 1.3649e-04, 1.2331e-05, 2.1979e-07, ..., 1.6928e-05, 2.2840e-04, 2.4915e-04], [ 2.8126e-07, 9.1046e-06, 1.3225e-07, ..., 1.6075e-06, 1.1921e-07, -7.3791e-05], [ 6.5565e-07, 8.2433e-05, -5.1223e-07, ..., 3.3788e-06, -1.3225e-06, 3.7458e-06], ..., [-4.4703e-06, 3.5226e-05, -1.2554e-06, ..., -6.1274e-05, 7.2457e-07, 3.5781e-06], [ 2.4904e-06, 8.7991e-06, 1.8440e-07, ..., 3.5260e-06, 5.6252e-07, 3.1412e-05], [ 1.5069e-06, 2.2992e-05, 1.2945e-06, ..., 2.0966e-05, 1.0803e-07, 4.0084e-06]], device='cuda:0') Epoch 78, bias, value: tensor([-0.0180, -0.0233, -0.0139, -0.0273, -0.0298, -0.0018, 0.0241, -0.0107, 0.0337, -0.0002], device='cuda:0'), grad: tensor([ 3.6454e-04, -5.0783e-04, 1.5473e-04, -2.6679e-04, 4.1556e-04, 4.8101e-05, -1.8764e-04, 7.4089e-05, 2.3997e-04, -3.3307e-04], device='cuda:0') 100 0.0001 changing lr epoch 77, time 265.59, cls_loss 0.0114 cls_loss_mapping 0.0156 cls_loss_causal 0.6235 re_mapping 0.0122 re_causal 0.0312 /// teacc 98.82 lr 0.00010000 Epoch 79, weight, value: tensor([[-0.0414, -0.0660, -0.0475, ..., -0.0251, 0.1075, 0.0871], [-0.1038, -0.1084, -0.1037, ..., -0.0768, -0.1053, -0.0538], [-0.0611, -0.0456, 0.0807, ..., -0.0743, 0.1036, 0.0097], ..., [-0.0597, 0.0413, 0.0420, ..., 0.0860, -0.0663, -0.0786], [-0.1290, 0.0455, -0.0986, ..., 0.0279, -0.0511, -0.0440], [ 0.0461, -0.0319, -0.0515, ..., -0.0156, -0.0362, -0.0155]], device='cuda:0'), grad: tensor([[ 1.0446e-05, 1.8496e-06, 4.3660e-06, ..., 5.0291e-07, -7.1786e-06, 1.9930e-07], [ 1.1548e-05, 4.2915e-06, 8.8103e-07, ..., 5.5693e-06, 1.5482e-05, 1.0513e-05], [ 4.5337e-06, 1.0997e-05, 3.5018e-07, ..., 4.6752e-06, -1.1303e-05, -8.6203e-06], ..., [ 1.0375e-06, -7.1079e-06, -2.4848e-06, ..., -1.1571e-05, 6.4559e-06, 4.1611e-06], [ 5.2601e-06, 3.7216e-06, 5.6997e-07, ..., -4.4852e-06, 1.7196e-05, 1.1578e-05], [ 1.7714e-06, 6.1095e-06, 6.4634e-07, ..., 1.7416e-06, 5.8599e-06, 3.4142e-06]], device='cuda:0') Epoch 79, bias, value: tensor([-0.0178, -0.0230, -0.0137, -0.0273, -0.0293, -0.0010, 0.0239, -0.0110, 0.0327, -0.0003], device='cuda:0'), grad: tensor([ 1.4096e-05, 5.9605e-05, 7.4320e-07, -3.4273e-05, 4.7803e-04, 6.8188e-05, -5.5933e-04, 9.7975e-07, -4.1500e-06, -2.3589e-05], device='cuda:0') 100 0.0001 changing lr epoch 78, time 265.79, cls_loss 0.0100 cls_loss_mapping 0.0138 cls_loss_causal 0.6601 re_mapping 0.0118 re_causal 0.0313 /// teacc 98.83 lr 0.00010000 Epoch 80, weight, value: tensor([[-0.0425, -0.0664, -0.0477, ..., -0.0255, 0.1084, 0.0878], [-0.1043, -0.1088, -0.1046, ..., -0.0775, -0.1061, -0.0540], [-0.0616, -0.0463, 0.0811, ..., -0.0759, 0.1042, 0.0100], ..., [-0.0602, 0.0417, 0.0428, ..., 0.0871, -0.0669, -0.0791], [-0.1296, 0.0455, -0.0991, ..., 0.0279, -0.0516, -0.0443], [ 0.0459, -0.0322, -0.0519, ..., -0.0161, -0.0364, -0.0158]], device='cuda:0'), grad: tensor([[ 3.6657e-06, 5.9418e-07, 2.9430e-07, ..., -3.4273e-05, -7.8976e-05, -3.2514e-05], [ 2.0359e-06, 2.9039e-06, 3.7812e-07, ..., 1.4529e-06, 3.1665e-06, 1.5572e-06], [ 2.2054e-06, 2.9683e-05, -1.8887e-06, ..., 8.6501e-06, -3.2932e-06, -5.3644e-07], ..., [ 7.0222e-07, 1.5154e-05, -1.6522e-06, ..., -2.5164e-06, 7.7114e-06, 2.5798e-06], [ 6.3896e-05, -9.1121e-06, 4.5821e-07, ..., -2.0921e-05, 6.0238e-06, 3.3855e-05], [ 1.6149e-06, 5.2862e-06, 9.5740e-07, ..., 3.5483e-06, 4.0717e-06, 1.6484e-06]], device='cuda:0') Epoch 80, bias, value: tensor([-0.0174, -0.0238, -0.0140, -0.0277, -0.0291, -0.0005, 0.0240, -0.0106, 0.0328, -0.0005], device='cuda:0'), grad: tensor([-6.0767e-05, 1.2591e-05, 3.4720e-05, -7.4506e-05, 4.5896e-05, 5.5820e-05, -6.6578e-05, 2.5615e-05, 4.8757e-05, -2.1517e-05], device='cuda:0') 100 0.0001 changing lr epoch 79, time 265.73, cls_loss 0.0095 cls_loss_mapping 0.0127 cls_loss_causal 0.6425 re_mapping 0.0121 re_causal 0.0319 /// teacc 98.72 lr 0.00010000 Epoch 81, weight, value: tensor([[-0.0429, -0.0667, -0.0479, ..., -0.0256, 0.1089, 0.0883], [-0.1048, -0.1093, -0.1047, ..., -0.0782, -0.1069, -0.0541], [-0.0622, -0.0470, 0.0812, ..., -0.0779, 0.1049, 0.0101], ..., [-0.0606, 0.0423, 0.0432, ..., 0.0884, -0.0676, -0.0796], [-0.1300, 0.0455, -0.0992, ..., 0.0282, -0.0516, -0.0446], [ 0.0458, -0.0326, -0.0521, ..., -0.0166, -0.0368, -0.0163]], device='cuda:0'), grad: tensor([[ 1.5274e-06, 7.5623e-07, 5.6811e-07, ..., 1.0412e-06, -2.8759e-06, -1.3560e-06], [ 3.3155e-07, 3.9376e-06, 3.3248e-06, ..., 6.0759e-06, 9.5367e-07, 5.4389e-07], [ 5.0850e-07, 1.2398e-05, 7.0184e-06, ..., 1.0684e-05, 2.8580e-05, -4.6007e-06], ..., [ 1.3281e-06, -9.1732e-05, -2.3041e-06, ..., -4.2677e-05, -5.5879e-07, 1.0207e-06], [ 1.5385e-06, 3.7104e-06, 3.0566e-06, ..., 2.4103e-06, 2.8927e-06, 1.7472e-06], [-1.4957e-06, 6.1654e-06, 1.3113e-06, ..., 4.6194e-06, 7.2643e-07, 4.3027e-07]], device='cuda:0') Epoch 81, bias, value: tensor([-0.0174, -0.0232, -0.0138, -0.0281, -0.0297, -0.0004, 0.0233, -0.0102, 0.0327, -0.0005], device='cuda:0'), grad: tensor([ 2.8118e-05, 7.5758e-05, 5.8830e-05, 1.1235e-04, -8.7261e-04, 3.7193e-05, 7.9215e-05, -4.5121e-05, 1.1826e-04, 4.0865e-04], device='cuda:0') 100 0.0001 changing lr epoch 80, time 265.32, cls_loss 0.0094 cls_loss_mapping 0.0133 cls_loss_causal 0.6273 re_mapping 0.0118 re_causal 0.0303 /// teacc 98.81 lr 0.00010000 Epoch 82, weight, value: tensor([[-0.0432, -0.0670, -0.0480, ..., -0.0259, 0.1101, 0.0894], [-0.1051, -0.1097, -0.1048, ..., -0.0786, -0.1080, -0.0544], [-0.0627, -0.0472, 0.0814, ..., -0.0785, 0.1051, 0.0102], ..., [-0.0607, 0.0423, 0.0433, ..., 0.0889, -0.0681, -0.0801], [-0.1308, 0.0453, -0.0994, ..., 0.0283, -0.0524, -0.0452], [ 0.0458, -0.0317, -0.0523, ..., -0.0169, -0.0372, -0.0172]], device='cuda:0'), grad: tensor([[-2.9117e-05, 1.7546e-06, 3.9116e-08, ..., -3.5822e-05, -9.1553e-04, -7.3385e-04], [ 1.3106e-05, 5.3421e-06, 1.3411e-07, ..., 2.1979e-06, 1.4313e-05, 9.9540e-06], [ 1.3523e-06, -1.8954e-05, -9.6671e-07, ..., 2.0675e-07, -2.6468e-06, -3.1460e-06], ..., [ 2.0042e-06, -3.6322e-07, 1.4342e-07, ..., -9.7305e-06, 8.9630e-06, 6.0797e-06], [ 1.2338e-05, 2.0444e-05, 3.1851e-07, ..., 3.8706e-06, 5.0813e-05, 3.5375e-05], [ 9.0972e-06, 7.7039e-06, 7.4506e-09, ..., 9.1419e-06, 7.4983e-05, 4.5300e-05]], device='cuda:0') Epoch 82, bias, value: tensor([-0.0169, -0.0232, -0.0141, -0.0280, -0.0299, -0.0006, 0.0237, -0.0106, 0.0328, -0.0002], device='cuda:0'), grad: tensor([-1.2360e-03, -2.1553e-04, -2.9474e-05, 9.1732e-05, 9.0647e-04, 7.4565e-05, 8.3590e-04, 8.2076e-05, 2.0885e-04, -7.1907e-04], device='cuda:0') 100 0.0001 changing lr epoch 81, time 265.30, cls_loss 0.0116 cls_loss_mapping 0.0137 cls_loss_causal 0.6138 re_mapping 0.0110 re_causal 0.0274 /// teacc 98.81 lr 0.00010000 Epoch 83, weight, value: tensor([[-0.0437, -0.0680, -0.0482, ..., -0.0266, 0.1108, 0.0903], [-0.1055, -0.1101, -0.1048, ..., -0.0791, -0.1103, -0.0547], [-0.0632, -0.0476, 0.0819, ..., -0.0788, 0.1067, 0.0112], ..., [-0.0611, 0.0428, 0.0432, ..., 0.0894, -0.0681, -0.0814], [-0.1319, 0.0454, -0.0994, ..., 0.0284, -0.0528, -0.0455], [ 0.0461, -0.0326, -0.0525, ..., -0.0172, -0.0379, -0.0179]], device='cuda:0'), grad: tensor([[ 1.0021e-06, -1.4588e-05, 1.6391e-07, ..., -6.2436e-06, -1.6880e-04, -9.2447e-05], [ 1.7881e-07, 3.9786e-06, 3.1665e-08, ..., 7.2829e-06, 1.4640e-06, 8.1398e-07], [ 9.4995e-07, 1.3225e-06, 4.3772e-07, ..., 2.5630e-06, 2.1815e-05, 1.2539e-05], ..., [ 5.7556e-07, -8.6427e-06, -6.4820e-07, ..., -1.8910e-05, 6.7614e-06, 3.3416e-06], [ 3.4943e-06, 8.4937e-06, 5.2154e-08, ..., 3.9004e-06, 7.7426e-05, 4.1723e-05], [ 9.8422e-06, 4.4405e-06, 5.2154e-08, ..., 4.3251e-06, 4.8488e-05, 2.2411e-05]], device='cuda:0') Epoch 83, bias, value: tensor([-1.6904e-02, -2.3820e-02, -1.3472e-02, -2.8182e-02, -3.0519e-02, -6.4255e-04, 2.3336e-02, -1.0101e-02, 3.3028e-02, -3.5299e-05], device='cuda:0'), grad: tensor([-2.2829e-04, 1.5110e-05, 3.4243e-05, 6.1840e-06, -7.7486e-05, 1.5959e-05, 2.5079e-05, -4.7162e-06, 1.0866e-04, 1.0508e-04], device='cuda:0') 100 0.0001 changing lr epoch 82, time 265.55, cls_loss 0.0096 cls_loss_mapping 0.0114 cls_loss_causal 0.5904 re_mapping 0.0124 re_causal 0.0290 /// teacc 98.84 lr 0.00010000 Epoch 84, weight, value: tensor([[-0.0442, -0.0686, -0.0483, ..., -0.0269, 0.1124, 0.0913], [-0.1058, -0.1109, -0.1045, ..., -0.0798, -0.1112, -0.0547], [-0.0637, -0.0488, 0.0823, ..., -0.0793, 0.1071, 0.0111], ..., [-0.0612, 0.0428, 0.0431, ..., 0.0900, -0.0688, -0.0819], [-0.1328, 0.0451, -0.0997, ..., 0.0285, -0.0532, -0.0460], [ 0.0467, -0.0320, -0.0526, ..., -0.0168, -0.0384, -0.0184]], device='cuda:0'), grad: tensor([[ 4.7833e-05, 2.9817e-05, 3.1106e-07, ..., 1.8016e-05, 6.6996e-05, 4.1306e-05], [ 1.3057e-06, 1.1489e-05, 2.0489e-08, ..., 5.1707e-05, 1.7151e-05, 6.0536e-07], [ 2.0750e-06, 1.2919e-05, -1.9558e-07, ..., 7.2539e-05, 2.5257e-05, 3.5409e-06], ..., [ 2.9281e-06, -1.4174e-04, 5.0291e-08, ..., -1.1188e-04, 1.2983e-06, 2.5705e-07], [ 1.5050e-05, 1.9312e-05, 2.9802e-08, ..., -8.3780e-04, -3.1662e-04, 3.0771e-06], [ 7.1898e-06, 8.2076e-05, 7.4506e-09, ..., 6.6996e-05, 7.1675e-06, 3.6545e-06]], device='cuda:0') Epoch 84, bias, value: tensor([-0.0161, -0.0235, -0.0135, -0.0278, -0.0316, -0.0011, 0.0230, -0.0107, 0.0328, 0.0012], device='cuda:0'), grad: tensor([ 1.6260e-04, 1.7190e-04, 2.9755e-04, -3.5810e-04, 5.7966e-05, 2.8896e-03, 4.9543e-04, -1.6665e-04, -3.6869e-03, 1.3769e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 83---------------------------------------------------- epoch 83, time 282.41, cls_loss 0.0095 cls_loss_mapping 0.0117 cls_loss_causal 0.6118 re_mapping 0.0116 re_causal 0.0281 /// teacc 98.90 lr 0.00010000 Epoch 85, weight, value: tensor([[-0.0452, -0.0691, -0.0485, ..., -0.0273, 0.1132, 0.0921], [-0.1062, -0.1117, -0.1051, ..., -0.0801, -0.1122, -0.0549], [-0.0641, -0.0496, 0.0831, ..., -0.0799, 0.1079, 0.0113], ..., [-0.0614, 0.0432, 0.0432, ..., 0.0909, -0.0692, -0.0823], [-0.1335, 0.0454, -0.1000, ..., 0.0293, -0.0535, -0.0465], [ 0.0468, -0.0324, -0.0531, ..., -0.0176, -0.0387, -0.0188]], device='cuda:0'), grad: tensor([[ 4.4741e-06, 2.3264e-06, 2.6822e-07, ..., 1.7844e-06, -7.0274e-05, -5.2124e-05], [ 2.9262e-06, 3.7760e-05, 1.5404e-06, ..., 2.4587e-05, 4.3996e-06, 2.6636e-06], [ 2.7046e-06, 1.7434e-05, 3.9116e-07, ..., 1.2822e-05, 3.3267e-06, 4.1239e-06], ..., [ 9.0152e-07, -2.2733e-04, -6.1058e-06, ..., -1.5843e-04, 2.0806e-06, 1.2368e-06], [ 4.7028e-05, 1.9535e-05, 1.1306e-06, ..., -1.4029e-05, 4.0919e-05, 1.7986e-05], [ 4.8392e-06, -8.2329e-06, 1.2033e-06, ..., 6.1505e-06, 8.1658e-06, 5.1633e-06]], device='cuda:0') Epoch 85, bias, value: tensor([-0.0162, -0.0238, -0.0133, -0.0276, -0.0316, -0.0018, 0.0236, -0.0106, 0.0332, 0.0010], device='cuda:0'), grad: tensor([-6.9618e-05, 5.4449e-05, 3.6508e-05, 1.8847e-04, 1.3864e-04, 2.4259e-04, -2.1017e-04, -2.5201e-04, 1.9282e-05, -1.4806e-04], device='cuda:0') 100 0.0001 changing lr epoch 84, time 265.39, cls_loss 0.0088 cls_loss_mapping 0.0118 cls_loss_causal 0.6303 re_mapping 0.0112 re_causal 0.0290 /// teacc 98.82 lr 0.00010000 Epoch 86, weight, value: tensor([[-0.0454, -0.0695, -0.0486, ..., -0.0272, 0.1147, 0.0934], [-0.1068, -0.1118, -0.1053, ..., -0.0814, -0.1131, -0.0552], [-0.0651, -0.0491, 0.0839, ..., -0.0796, 0.1083, 0.0113], ..., [-0.0615, 0.0435, 0.0429, ..., 0.0919, -0.0701, -0.0828], [-0.1344, 0.0457, -0.0999, ..., 0.0290, -0.0539, -0.0468], [ 0.0468, -0.0332, -0.0533, ..., -0.0182, -0.0393, -0.0194]], device='cuda:0'), grad: tensor([[ 6.9104e-07, 5.0105e-06, 7.2643e-08, ..., 4.8615e-07, -1.7807e-05, -2.0023e-06], [ 1.5777e-06, 1.5780e-05, 4.6566e-08, ..., 6.2697e-06, 7.7069e-05, 3.6716e-05], [ 2.7381e-07, -3.0875e-05, -2.5500e-06, ..., 1.2666e-07, -7.1573e-04, -1.3328e-04], ..., [ 1.3243e-06, -1.4216e-05, 1.2480e-06, ..., -1.1697e-05, 4.5359e-05, 2.0206e-05], [ 7.0632e-06, -1.5676e-05, 9.1456e-07, ..., -5.9269e-06, 1.0633e-04, 5.4896e-05], [ 1.8962e-06, 1.1381e-06, 5.5879e-09, ..., 3.6396e-06, 5.5504e-04, 1.9781e-06]], device='cuda:0') Epoch 86, bias, value: tensor([-0.0154, -0.0237, -0.0133, -0.0278, -0.0318, -0.0013, 0.0233, -0.0107, 0.0329, 0.0007], device='cuda:0'), grad: tensor([ 4.3720e-05, -2.5153e-04, -1.6556e-03, 1.0830e-04, -8.1491e-04, 1.3411e-07, 2.8133e-05, 1.2314e-04, 4.8804e-04, 1.9312e-03], device='cuda:0') 100 0.0001 changing lr epoch 85, time 265.42, cls_loss 0.0109 cls_loss_mapping 0.0140 cls_loss_causal 0.6145 re_mapping 0.0112 re_causal 0.0287 /// teacc 98.82 lr 0.00010000 Epoch 87, weight, value: tensor([[-0.0459, -0.0702, -0.0487, ..., -0.0272, 0.1150, 0.0938], [-0.1074, -0.1129, -0.1055, ..., -0.0818, -0.1158, -0.0564], [-0.0657, -0.0497, 0.0842, ..., -0.0805, 0.1095, 0.0122], ..., [-0.0607, 0.0441, 0.0429, ..., 0.0927, -0.0697, -0.0833], [-0.1358, 0.0458, -0.0996, ..., 0.0290, -0.0547, -0.0472], [ 0.0467, -0.0339, -0.0534, ..., -0.0188, -0.0398, -0.0198]], device='cuda:0'), grad: tensor([[ 4.1336e-05, 9.9778e-05, 7.9349e-07, ..., 1.7548e-04, 5.2601e-05, -1.4249e-06], [ 9.6411e-06, 7.4580e-06, 6.6683e-07, ..., -7.4320e-06, 4.7311e-06, 7.4506e-08], [ 1.0557e-05, 1.2055e-05, -4.8690e-06, ..., 1.7464e-05, -2.2352e-08, -2.4308e-06], ..., [-2.2173e-05, -1.9646e-04, -5.1856e-06, ..., -2.7084e-04, -4.8190e-05, 4.2655e-07], [ 9.5591e-06, 1.0625e-05, 1.7416e-06, ..., 1.0267e-05, 8.1882e-06, 6.7241e-07], [ 2.7835e-05, 3.8713e-05, 9.4436e-07, ..., 6.8545e-05, 2.6867e-05, 3.7439e-07]], device='cuda:0') Epoch 87, bias, value: tensor([-0.0159, -0.0247, -0.0136, -0.0269, -0.0320, -0.0015, 0.0235, -0.0102, 0.0329, 0.0013], device='cuda:0'), grad: tensor([ 3.6168e-04, -1.7250e-04, 4.5896e-05, 1.0097e-04, -1.5469e-03, -1.3471e-04, 1.5545e-03, -3.8075e-04, 1.2612e-04, 4.4793e-05], device='cuda:0') 100 0.0001 changing lr epoch 86, time 266.32, cls_loss 0.0095 cls_loss_mapping 0.0110 cls_loss_causal 0.6441 re_mapping 0.0113 re_causal 0.0292 /// teacc 98.74 lr 0.00010000 Epoch 88, weight, value: tensor([[-0.0466, -0.0708, -0.0488, ..., -0.0271, 0.1156, 0.0947], [-0.1077, -0.1136, -0.1056, ..., -0.0825, -0.1165, -0.0567], [-0.0661, -0.0502, 0.0847, ..., -0.0812, 0.1100, 0.0123], ..., [-0.0605, 0.0429, 0.0429, ..., 0.0931, -0.0701, -0.0836], [-0.1367, 0.0460, -0.0999, ..., 0.0290, -0.0549, -0.0474], [ 0.0466, -0.0347, -0.0536, ..., -0.0193, -0.0395, -0.0205]], device='cuda:0'), grad: tensor([[ 1.5780e-05, 1.6764e-07, 1.7881e-07, ..., 3.6880e-07, 5.4568e-05, 3.6687e-05], [ 3.8184e-07, 1.3616e-06, 4.4703e-08, ..., 8.4750e-07, 6.8732e-07, 3.9674e-07], [ 1.4212e-06, 1.1362e-07, -1.1493e-06, ..., 2.1048e-07, 1.5087e-07, 1.2126e-06], ..., [ 6.8173e-07, 3.2019e-06, 5.6997e-07, ..., 8.9779e-06, 2.1942e-06, 8.6427e-07], [ 8.2515e-07, 1.0803e-07, 5.2154e-08, ..., -2.5257e-06, 1.3784e-07, 6.0722e-07], [-2.7940e-08, -1.1019e-05, 1.3039e-08, ..., -1.5512e-05, 2.0172e-06, 9.3132e-07]], device='cuda:0') Epoch 88, bias, value: tensor([-0.0166, -0.0246, -0.0140, -0.0262, -0.0319, -0.0023, 0.0248, -0.0106, 0.0330, 0.0013], device='cuda:0'), grad: tensor([ 7.7009e-05, -4.9472e-06, 3.8296e-06, 9.7454e-06, 4.1819e-04, -1.9386e-05, -1.1283e-04, 3.7003e-04, -7.5921e-06, -7.3385e-04], device='cuda:0') 100 0.0001 changing lr epoch 87, time 265.42, cls_loss 0.0091 cls_loss_mapping 0.0123 cls_loss_causal 0.5913 re_mapping 0.0118 re_causal 0.0286 /// teacc 98.85 lr 0.00010000 Epoch 89, weight, value: tensor([[-0.0472, -0.0711, -0.0490, ..., -0.0273, 0.1162, 0.0953], [-0.1079, -0.1143, -0.1057, ..., -0.0832, -0.1168, -0.0567], [-0.0665, -0.0503, 0.0850, ..., -0.0821, 0.1106, 0.0126], ..., [-0.0614, 0.0435, 0.0430, ..., 0.0941, -0.0710, -0.0844], [-0.1380, 0.0459, -0.0999, ..., 0.0291, -0.0547, -0.0472], [ 0.0472, -0.0353, -0.0537, ..., -0.0200, -0.0394, -0.0211]], device='cuda:0'), grad: tensor([[-4.3005e-05, -3.0734e-06, 2.6394e-06, ..., -5.6267e-05, -2.6798e-04, -1.7524e-04], [ 4.4703e-07, 1.7047e-05, 6.1430e-06, ..., 1.5236e-05, 1.3821e-06, 7.8417e-07], [ 3.2037e-07, 2.9624e-05, 1.4827e-05, ..., 3.3021e-05, -3.6806e-06, -1.4063e-06], ..., [ 1.9968e-06, -1.1170e-04, -5.9247e-05, ..., -1.1647e-04, 5.5358e-06, 3.3248e-06], [ 4.6790e-06, 1.3642e-05, 8.4564e-06, ..., 4.7199e-06, 3.3323e-06, 2.7716e-06], [-2.0191e-06, 1.1310e-05, 4.1910e-06, ..., 1.6078e-05, 1.7494e-05, 1.1526e-05]], device='cuda:0') Epoch 89, bias, value: tensor([-0.0169, -0.0252, -0.0132, -0.0265, -0.0334, -0.0019, 0.0246, -0.0103, 0.0328, 0.0023], device='cuda:0'), grad: tensor([-4.9734e-04, 4.5687e-05, 8.1658e-05, 6.0439e-05, 5.4002e-05, 2.4581e-04, 2.2781e-04, -3.0637e-04, 4.5151e-05, 4.3213e-05], device='cuda:0') 100 0.0001 changing lr epoch 88, time 265.64, cls_loss 0.0098 cls_loss_mapping 0.0124 cls_loss_causal 0.5771 re_mapping 0.0111 re_causal 0.0267 /// teacc 98.81 lr 0.00010000 Epoch 90, weight, value: tensor([[-0.0486, -0.0715, -0.0491, ..., -0.0277, 0.1158, 0.0950], [-0.1083, -0.1148, -0.1065, ..., -0.0840, -0.1183, -0.0578], [-0.0669, -0.0495, 0.0861, ..., -0.0826, 0.1122, 0.0142], ..., [-0.0620, 0.0438, 0.0433, ..., 0.0950, -0.0719, -0.0849], [-0.1391, 0.0452, -0.1012, ..., 0.0284, -0.0555, -0.0483], [ 0.0469, -0.0352, -0.0540, ..., -0.0208, -0.0391, -0.0215]], device='cuda:0'), grad: tensor([[ 2.6965e-04, 7.6368e-07, 3.3714e-07, ..., 9.7036e-05, 2.6859e-06, -9.9279e-07], [ 7.5661e-06, 1.0170e-06, 5.2154e-08, ..., 2.7735e-06, 1.9222e-06, 2.8498e-07], [ 1.8954e-05, 3.2596e-07, -7.8790e-07, ..., 8.5682e-06, 1.1615e-05, -1.4342e-07], ..., [ 1.7524e-05, -3.3528e-08, 6.6124e-07, ..., 4.4219e-06, 1.3541e-06, 2.7567e-07], [ 1.3143e-05, 2.3469e-07, 4.3027e-07, ..., -1.2927e-06, -5.0008e-05, 3.8482e-06], [ 7.0691e-05, 1.4938e-06, 3.2280e-06, ..., 3.2276e-05, 1.4342e-05, 1.3113e-06]], device='cuda:0') Epoch 90, bias, value: tensor([-0.0180, -0.0260, -0.0112, -0.0273, -0.0326, -0.0009, 0.0247, -0.0109, 0.0321, 0.0022], device='cuda:0'), grad: tensor([ 6.5327e-04, 2.8223e-05, 1.0657e-04, 6.7568e-04, -1.4436e-04, -1.6241e-03, 1.8215e-04, 5.8979e-05, -2.7204e-04, 3.3450e-04], device='cuda:0') 100 0.0001 changing lr epoch 89, time 265.48, cls_loss 0.0098 cls_loss_mapping 0.0125 cls_loss_causal 0.6316 re_mapping 0.0106 re_causal 0.0271 /// teacc 98.77 lr 0.00010000 Epoch 91, weight, value: tensor([[-0.0488, -0.0719, -0.0495, ..., -0.0279, 0.1167, 0.0958], [-0.1086, -0.1146, -0.1073, ..., -0.0844, -0.1191, -0.0580], [-0.0674, -0.0497, 0.0872, ..., -0.0835, 0.1130, 0.0146], ..., [-0.0623, 0.0430, 0.0438, ..., 0.0948, -0.0724, -0.0853], [-0.1395, 0.0462, -0.1021, ..., 0.0310, -0.0559, -0.0488], [ 0.0466, -0.0360, -0.0544, ..., -0.0216, -0.0397, -0.0223]], device='cuda:0'), grad: tensor([[ 4.0568e-06, 3.9823e-06, 1.8626e-07, ..., 3.0138e-06, -1.2539e-05, -7.3165e-06], [ 1.0580e-06, 1.5102e-05, 2.3283e-07, ..., 4.4927e-06, 2.8498e-07, 1.2293e-07], [ 1.0729e-06, 1.8969e-05, -4.2357e-06, ..., 1.0796e-05, -1.1683e-05, -1.7211e-06], ..., [ 1.0990e-06, -8.6352e-06, 1.7732e-06, ..., -3.4869e-05, 2.8424e-06, 1.0245e-06], [ 6.6012e-06, 1.0088e-05, 4.0755e-06, ..., -1.0833e-05, 1.0289e-05, 1.6801e-06], [ 7.0453e-05, 2.4125e-05, 2.8070e-06, ..., 8.3745e-05, 8.3297e-06, 4.0792e-06]], device='cuda:0') Epoch 91, bias, value: tensor([-0.0176, -0.0249, -0.0112, -0.0273, -0.0319, -0.0019, 0.0245, -0.0116, 0.0334, 0.0013], device='cuda:0'), grad: tensor([ 4.6939e-07, -8.4788e-06, 4.2796e-05, -1.0890e-04, -5.1588e-05, -8.4460e-05, -1.0140e-05, 2.7239e-05, -2.6673e-06, 1.9562e-04], device='cuda:0') 100 0.0001 changing lr epoch 90, time 265.76, cls_loss 0.0085 cls_loss_mapping 0.0113 cls_loss_causal 0.6149 re_mapping 0.0111 re_causal 0.0275 /// teacc 98.80 lr 0.00010000 Epoch 92, weight, value: tensor([[-0.0488, -0.0725, -0.0497, ..., -0.0281, 0.1181, 0.0973], [-0.1087, -0.1150, -0.1072, ..., -0.0853, -0.1197, -0.0583], [-0.0677, -0.0502, 0.0878, ..., -0.0847, 0.1142, 0.0151], ..., [-0.0624, 0.0435, 0.0439, ..., 0.0960, -0.0732, -0.0860], [-0.1401, 0.0460, -0.1024, ..., 0.0309, -0.0569, -0.0497], [ 0.0460, -0.0365, -0.0549, ..., -0.0227, -0.0408, -0.0237]], device='cuda:0'), grad: tensor([[ 1.8571e-06, 8.3633e-07, 1.3765e-06, ..., 1.0617e-06, -9.3460e-05, -3.9965e-05], [ 1.3001e-06, 2.0619e-06, 1.1706e-04, ..., 2.2221e-06, 1.6546e-04, 1.7043e-06], [ 4.7125e-07, -1.9714e-05, -1.6987e-04, ..., -4.4703e-07, -2.3985e-04, -2.4259e-05], ..., [ 1.1511e-06, 1.7643e-05, 3.8564e-05, ..., -5.3160e-06, 7.8261e-05, 3.0354e-05], [ 8.0094e-06, -4.9695e-06, 4.9658e-06, ..., -8.1770e-07, 5.0634e-05, 1.7986e-05], [ 2.2367e-05, 2.9374e-06, 1.9483e-06, ..., 1.6481e-05, 1.3284e-05, 5.0925e-06]], device='cuda:0') Epoch 92, bias, value: tensor([-0.0165, -0.0247, -0.0109, -0.0266, -0.0310, -0.0024, 0.0240, -0.0114, 0.0328, 0.0002], device='cuda:0'), grad: tensor([-9.4533e-05, 3.5334e-04, -5.5456e-04, 8.2910e-05, 3.5524e-04, -3.9071e-05, -6.2346e-05, 1.9276e-04, 1.1295e-04, -3.4738e-04], device='cuda:0') 100 0.0001 changing lr epoch 91, time 265.15, cls_loss 0.0085 cls_loss_mapping 0.0108 cls_loss_causal 0.6061 re_mapping 0.0110 re_causal 0.0270 /// teacc 98.88 lr 0.00010000 Epoch 93, weight, value: tensor([[-0.0492, -0.0731, -0.0499, ..., -0.0282, 0.1185, 0.0977], [-0.1096, -0.1152, -0.1076, ..., -0.0858, -0.1201, -0.0585], [-0.0681, -0.0503, 0.0884, ..., -0.0851, 0.1149, 0.0155], ..., [-0.0626, 0.0434, 0.0440, ..., 0.0963, -0.0737, -0.0869], [-0.1409, 0.0463, -0.1028, ..., 0.0312, -0.0570, -0.0496], [ 0.0457, -0.0370, -0.0550, ..., -0.0234, -0.0410, -0.0239]], device='cuda:0'), grad: tensor([[ 7.2271e-07, 5.5358e-06, 2.9802e-08, ..., 1.3467e-06, 1.4909e-05, 1.3381e-05], [ 3.5129e-06, 5.9381e-06, 2.6077e-08, ..., 1.4342e-06, 2.1070e-05, 1.6108e-05], [ 3.7774e-06, -3.5256e-05, -3.7253e-09, ..., -7.5996e-06, -1.3602e-04, -1.2189e-04], ..., [ 2.7865e-06, 1.4491e-05, -4.2841e-08, ..., 5.7295e-06, 2.2039e-05, 1.9535e-05], [ 3.4925e-06, 2.0534e-05, 4.2841e-08, ..., 3.4887e-06, 5.4091e-05, 4.4137e-05], [ 8.5309e-06, -3.9972e-06, 3.9116e-08, ..., -1.7464e-05, 1.3299e-05, 1.3322e-05]], device='cuda:0') Epoch 93, bias, value: tensor([-0.0168, -0.0249, -0.0110, -0.0271, -0.0299, -0.0022, 0.0238, -0.0112, 0.0331, -0.0004], device='cuda:0'), grad: tensor([ 5.5492e-05, -1.2445e-03, 7.1621e-04, -7.5459e-05, 5.6922e-05, 1.9145e-04, -1.9744e-05, 1.7917e-04, 1.3220e-04, 7.2531e-06], device='cuda:0') 100 0.0001 changing lr epoch 92, time 265.56, cls_loss 0.0091 cls_loss_mapping 0.0117 cls_loss_causal 0.5977 re_mapping 0.0106 re_causal 0.0272 /// teacc 98.85 lr 0.00010000 Epoch 94, weight, value: tensor([[-0.0496, -0.0735, -0.0500, ..., -0.0283, 0.1191, 0.0984], [-0.1101, -0.1147, -0.1079, ..., -0.0851, -0.1210, -0.0591], [-0.0684, -0.0506, 0.0889, ..., -0.0861, 0.1158, 0.0168], ..., [-0.0629, 0.0439, 0.0440, ..., 0.0974, -0.0741, -0.0876], [-0.1416, 0.0469, -0.1029, ..., 0.0323, -0.0576, -0.0503], [ 0.0454, -0.0393, -0.0553, ..., -0.0261, -0.0413, -0.0243]], device='cuda:0'), grad: tensor([[ 7.7963e-05, 1.1828e-07, 1.4901e-08, ..., 1.2759e-07, 8.6784e-05, 4.9740e-05], [ 1.1586e-06, 2.0452e-06, 3.0734e-08, ..., 1.6801e-06, 2.0526e-06, 1.4240e-06], [ 4.0494e-06, 5.8021e-07, -2.1327e-07, ..., 4.1071e-07, 6.7055e-06, 4.3288e-06], ..., [ 7.2177e-07, -1.3873e-05, 2.7008e-08, ..., -1.1660e-05, 2.4252e-06, 1.4994e-06], [ 2.0951e-05, 1.5786e-06, 3.0734e-08, ..., 1.8245e-06, 1.7136e-05, 1.2979e-05], [ 4.5896e-06, 1.7136e-07, 1.8626e-09, ..., 8.9500e-07, 5.3719e-06, 1.9092e-06]], device='cuda:0') Epoch 94, bias, value: tensor([-0.0168, -0.0253, -0.0095, -0.0284, -0.0299, -0.0018, 0.0243, -0.0111, 0.0343, -0.0017], device='cuda:0'), grad: tensor([ 1.3137e-04, -6.1244e-06, 1.7643e-05, 3.4690e-05, 5.3674e-05, -8.3268e-05, -1.6642e-04, -6.8322e-06, 6.7770e-05, -4.2856e-05], device='cuda:0') 100 0.0001 changing lr epoch 93, time 265.70, cls_loss 0.0084 cls_loss_mapping 0.0099 cls_loss_causal 0.6133 re_mapping 0.0110 re_causal 0.0282 /// teacc 98.86 lr 0.00010000 Epoch 95, weight, value: tensor([[-0.0494, -0.0738, -0.0501, ..., -0.0280, 0.1202, 0.0996], [-0.1105, -0.1153, -0.1080, ..., -0.0855, -0.1214, -0.0591], [-0.0689, -0.0514, 0.0890, ..., -0.0864, 0.1167, 0.0178], ..., [-0.0631, 0.0434, 0.0441, ..., 0.0981, -0.0746, -0.0880], [-0.1427, 0.0469, -0.1029, ..., 0.0320, -0.0591, -0.0522], [ 0.0453, -0.0399, -0.0553, ..., -0.0267, -0.0416, -0.0248]], device='cuda:0'), grad: tensor([[ 8.3819e-08, 4.8429e-07, 3.7253e-09, ..., 2.9150e-07, -3.4068e-06, -2.4643e-06], [ 1.6671e-07, 4.7907e-06, 6.4261e-08, ..., 4.8578e-06, 1.8068e-07, 1.4435e-07], [ 2.1234e-06, 7.5549e-06, -2.2631e-07, ..., 1.5246e-06, 5.5879e-08, -1.0245e-08], ..., [ 2.4308e-07, -2.2426e-05, 1.5832e-08, ..., -2.4453e-05, 3.3248e-07, 2.5332e-07], [ 4.6473e-07, 4.0978e-06, 8.8476e-08, ..., -1.1213e-06, 2.1607e-07, 1.7043e-07], [ 2.2631e-07, 1.3346e-06, 5.5879e-09, ..., 6.2808e-06, 1.0673e-06, 7.7672e-07]], device='cuda:0') Epoch 95, bias, value: tensor([-0.0162, -0.0252, -0.0095, -0.0280, -0.0301, -0.0014, 0.0240, -0.0113, 0.0336, -0.0017], device='cuda:0'), grad: tensor([-2.5928e-06, 1.8045e-05, 1.3247e-05, 8.8736e-06, 7.1287e-05, 1.5488e-06, 2.5127e-06, -2.9966e-05, 1.2636e-05, -9.5785e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 94---------------------------------------------------- epoch 94, time 282.03, cls_loss 0.0072 cls_loss_mapping 0.0079 cls_loss_causal 0.5720 re_mapping 0.0103 re_causal 0.0265 /// teacc 99.00 lr 0.00010000 Epoch 96, weight, value: tensor([[-0.0496, -0.0742, -0.0501, ..., -0.0282, 0.1219, 0.1012], [-0.1108, -0.1156, -0.1081, ..., -0.0859, -0.1217, -0.0594], [-0.0694, -0.0513, 0.0893, ..., -0.0867, 0.1171, 0.0181], ..., [-0.0635, 0.0433, 0.0441, ..., 0.0986, -0.0752, -0.0888], [-0.1431, 0.0469, -0.1031, ..., 0.0322, -0.0595, -0.0527], [ 0.0450, -0.0403, -0.0554, ..., -0.0275, -0.0418, -0.0252]], device='cuda:0'), grad: tensor([[ 1.5926e-07, 1.6205e-07, 3.1665e-08, ..., 2.8778e-07, -3.8832e-05, -2.9862e-05], [ 1.0338e-07, 9.6764e-07, 3.3528e-08, ..., 6.0163e-07, 1.2387e-06, 8.5402e-07], [ 8.0094e-08, 6.2399e-07, -9.6858e-08, ..., 1.1735e-07, 8.6874e-06, 7.2680e-06], ..., [ 4.2468e-07, 1.6671e-07, 1.3411e-07, ..., 1.3663e-06, 1.4082e-06, 9.7696e-07], [ 1.9576e-06, 1.7686e-06, 1.8626e-08, ..., 3.2969e-06, 3.0361e-06, 1.8906e-06], [ 1.9185e-07, -2.1048e-06, 1.7136e-07, ..., -5.7407e-06, 1.0259e-05, 7.6368e-06]], device='cuda:0') Epoch 96, bias, value: tensor([-0.0154, -0.0251, -0.0091, -0.0283, -0.0293, -0.0012, 0.0237, -0.0114, 0.0335, -0.0025], device='cuda:0'), grad: tensor([-5.6624e-05, -1.7118e-06, 1.3165e-05, 4.7348e-06, 1.0282e-05, -1.2554e-06, 1.0282e-05, 1.4961e-05, 1.2122e-05, -5.9418e-06], device='cuda:0') 100 0.0001 changing lr epoch 95, time 265.58, cls_loss 0.0086 cls_loss_mapping 0.0103 cls_loss_causal 0.6327 re_mapping 0.0105 re_causal 0.0261 /// teacc 98.90 lr 0.00010000 Epoch 97, weight, value: tensor([[-0.0501, -0.0750, -0.0505, ..., -0.0287, 0.1227, 0.1020], [-0.1111, -0.1162, -0.1083, ..., -0.0871, -0.1226, -0.0593], [-0.0700, -0.0517, 0.0893, ..., -0.0878, 0.1176, 0.0181], ..., [-0.0635, 0.0441, 0.0442, ..., 0.1004, -0.0755, -0.0892], [-0.1443, 0.0468, -0.1029, ..., 0.0330, -0.0599, -0.0537], [ 0.0449, -0.0407, -0.0554, ..., -0.0283, -0.0420, -0.0258]], device='cuda:0'), grad: tensor([[ 1.5218e-06, 2.1495e-06, 1.6978e-06, ..., 3.3136e-06, -4.5113e-06, -2.0899e-06], [ 1.7416e-07, 8.8289e-06, 9.1493e-06, ..., 1.2107e-05, 5.2061e-07, 1.3029e-06], [ 2.6263e-06, 8.1968e-04, 8.7786e-04, ..., 1.0061e-03, 6.9626e-06, 1.0824e-04], ..., [ 2.6915e-07, -8.4925e-04, -8.9741e-04, ..., -1.0357e-03, 2.0526e-06, -1.0645e-04], [ 3.1888e-06, 3.4049e-06, 1.7472e-06, ..., -8.0094e-07, 1.1791e-06, 8.0559e-07], [ 1.0552e-06, 7.0818e-06, 9.4809e-07, ..., -1.0364e-05, 1.4296e-06, 8.6520e-07]], device='cuda:0') Epoch 97, bias, value: tensor([-0.0152, -0.0254, -0.0094, -0.0294, -0.0295, -0.0010, 0.0240, -0.0105, 0.0337, -0.0028], device='cuda:0'), grad: tensor([ 7.8753e-06, 5.0068e-05, 2.7237e-03, 1.0423e-05, 3.3069e-04, 1.7166e-05, -1.3359e-05, -2.7008e-03, -3.0212e-06, -4.2319e-04], device='cuda:0') 100 0.0001 changing lr epoch 96, time 265.23, cls_loss 0.0089 cls_loss_mapping 0.0115 cls_loss_causal 0.6186 re_mapping 0.0100 re_causal 0.0260 /// teacc 98.79 lr 0.00010000 Epoch 98, weight, value: tensor([[-0.0525, -0.0755, -0.0506, ..., -0.0298, 0.1225, 0.1022], [-0.1115, -0.1168, -0.1084, ..., -0.0869, -0.1237, -0.0597], [-0.0708, -0.0526, 0.0888, ..., -0.0894, 0.1182, 0.0184], ..., [-0.0638, 0.0448, 0.0449, ..., 0.1013, -0.0756, -0.0896], [-0.1452, 0.0470, -0.1031, ..., 0.0331, -0.0606, -0.0540], [ 0.0447, -0.0410, -0.0555, ..., -0.0287, -0.0421, -0.0262]], device='cuda:0'), grad: tensor([[ 1.2470e-06, 2.0918e-06, 1.1176e-08, ..., 2.6617e-06, -1.0318e-04, -9.8050e-05], [ 1.6093e-06, 4.8429e-06, 2.0489e-08, ..., 6.3591e-06, 5.4296e-07, 4.3865e-07], [ 4.8988e-06, 1.1027e-05, -2.1327e-07, ..., 5.0403e-06, 3.9637e-06, 4.2394e-06], ..., [ 2.7381e-06, -1.7002e-05, 2.7008e-08, ..., -1.5423e-05, 7.9814e-07, 6.1281e-07], [ 6.9141e-06, 8.8662e-06, 6.3330e-08, ..., -1.1522e-04, 3.2224e-06, 2.8871e-06], [ 3.5986e-06, 1.1630e-05, 1.8626e-09, ..., 6.8069e-05, 6.0499e-05, 5.7846e-05]], device='cuda:0') Epoch 98, bias, value: tensor([-0.0159, -0.0247, -0.0100, -0.0281, -0.0299, -0.0019, 0.0244, -0.0100, 0.0331, -0.0027], device='cuda:0'), grad: tensor([-1.5819e-04, 4.8697e-05, 3.8236e-05, -1.2279e-05, 3.7819e-05, 2.2805e-04, 5.6863e-05, -8.7172e-06, -1.0262e-03, 7.9536e-04], device='cuda:0') 100 0.0001 changing lr epoch 97, time 265.38, cls_loss 0.0099 cls_loss_mapping 0.0105 cls_loss_causal 0.6036 re_mapping 0.0098 re_causal 0.0251 /// teacc 98.90 lr 0.00010000 Epoch 99, weight, value: tensor([[-0.0533, -0.0761, -0.0510, ..., -0.0299, 0.1233, 0.1032], [-0.1120, -0.1178, -0.1090, ..., -0.0881, -0.1245, -0.0600], [-0.0713, -0.0526, 0.0895, ..., -0.0899, 0.1183, 0.0188], ..., [-0.0642, 0.0450, 0.0447, ..., 0.1021, -0.0760, -0.0909], [-0.1461, 0.0471, -0.1033, ..., 0.0334, -0.0611, -0.0545], [ 0.0447, -0.0412, -0.0555, ..., -0.0293, -0.0425, -0.0268]], device='cuda:0'), grad: tensor([[-5.3868e-06, 8.5775e-07, 4.5635e-08, ..., -1.2553e-04, -2.0456e-04, -1.8442e-04], [ 2.5984e-07, 3.5286e-05, 2.3842e-07, ..., -2.7325e-06, 3.7346e-07, 3.0827e-07], [ 5.0198e-07, 2.0280e-05, 1.4501e-06, ..., 4.9204e-05, 1.7527e-06, 1.5656e-06], ..., [ 8.8196e-07, 4.6659e-04, 5.9418e-07, ..., 1.8759e-03, 5.0999e-06, 4.5635e-06], [ 1.5702e-06, -7.4096e-06, -1.4920e-06, ..., 6.8285e-06, 2.7027e-06, 2.6692e-06], [-3.4962e-06, -5.2357e-04, 8.2888e-07, ..., -1.9798e-03, 9.9652e-07, 8.5030e-07]], device='cuda:0') Epoch 99, bias, value: tensor([-0.0159, -0.0252, -0.0105, -0.0281, -0.0303, -0.0024, 0.0251, -0.0101, 0.0332, -0.0016], device='cuda:0'), grad: tensor([-2.5034e-04, -1.2245e-03, 2.1756e-04, 4.2289e-05, 2.2674e-04, 9.1195e-05, 2.4605e-04, 9.1858e-03, 1.5900e-05, -8.5449e-03], device='cuda:0') 100 0.0001 changing lr epoch 98, time 265.61, cls_loss 0.0072 cls_loss_mapping 0.0104 cls_loss_causal 0.6021 re_mapping 0.0102 re_causal 0.0270 /// teacc 98.89 lr 0.00010000 Epoch 100, weight, value: tensor([[-0.0535, -0.0764, -0.0513, ..., -0.0300, 0.1240, 0.1040], [-0.1123, -0.1194, -0.1091, ..., -0.0890, -0.1247, -0.0603], [-0.0717, -0.0531, 0.0898, ..., -0.0906, 0.1188, 0.0195], ..., [-0.0645, 0.0454, 0.0447, ..., 0.1026, -0.0764, -0.0913], [-0.1468, 0.0471, -0.1035, ..., 0.0336, -0.0614, -0.0550], [ 0.0449, -0.0415, -0.0556, ..., -0.0293, -0.0426, -0.0272]], device='cuda:0'), grad: tensor([[ 1.6857e-07, 4.6007e-07, 3.9116e-08, ..., 1.3225e-06, -3.2306e-05, -3.4183e-05], [ 2.6450e-07, 1.5339e-06, 1.0245e-08, ..., 1.5914e-05, 1.0155e-05, 1.2092e-05], [ 5.8673e-08, 1.2293e-06, 3.9116e-08, ..., 2.5108e-06, 1.9744e-07, 6.0908e-07], ..., [ 1.8440e-07, -7.5579e-05, 1.6764e-08, ..., -3.3706e-05, 1.4585e-06, 1.5106e-06], [ 7.4320e-07, -2.0526e-06, 9.3132e-09, ..., -1.2088e-04, 2.8498e-06, 2.9802e-06], [ 1.0803e-07, 4.0680e-05, 4.0047e-08, ..., 2.2173e-05, 8.4192e-06, 7.6964e-06]], device='cuda:0') Epoch 100, bias, value: tensor([-0.0157, -0.0268, -0.0097, -0.0279, -0.0298, -0.0025, 0.0246, -0.0101, 0.0332, -0.0011], device='cuda:0'), grad: tensor([-5.0902e-05, 5.0515e-05, 7.7263e-06, 5.0366e-05, -5.3644e-06, 1.9193e-04, 4.2409e-05, -1.0055e-04, -2.7609e-04, 8.9943e-05], device='cuda:0') 100 0.0001 changing lr epoch 99, time 266.02, cls_loss 0.0080 cls_loss_mapping 0.0099 cls_loss_causal 0.6017 re_mapping 0.0103 re_causal 0.0263 /// teacc 98.98 lr 0.00010000 Epoch 101, weight, value: tensor([[-0.0542, -0.0770, -0.0514, ..., -0.0304, 0.1246, 0.1045], [-0.1133, -0.1198, -0.1091, ..., -0.0893, -0.1250, -0.0597], [-0.0723, -0.0538, 0.0900, ..., -0.0921, 0.1196, 0.0199], ..., [-0.0647, 0.0458, 0.0447, ..., 0.1033, -0.0773, -0.0927], [-0.1478, 0.0482, -0.1047, ..., 0.0340, -0.0618, -0.0553], [ 0.0445, -0.0433, -0.0556, ..., -0.0309, -0.0429, -0.0279]], device='cuda:0'), grad: tensor([[ 8.4937e-07, -5.6252e-07, 6.1840e-07, ..., 3.1609e-06, -1.6540e-05, -2.0027e-05], [ 2.7530e-06, 1.2748e-05, 4.3288e-06, ..., -6.2305e-07, 1.0356e-05, 1.1520e-06], [ 2.0098e-06, 1.3679e-05, -3.2604e-05, ..., 8.4266e-06, -7.1883e-05, -1.9465e-06], ..., [ 1.5303e-05, -3.5286e-05, 1.3690e-06, ..., -7.1406e-05, 3.7327e-06, 8.7451e-07], [ 1.4536e-05, 6.2704e-05, 6.4727e-07, ..., 2.8223e-05, 7.8231e-06, 7.4059e-06], [ 4.7117e-05, 1.6525e-05, 7.5437e-08, ..., 7.1049e-05, 1.1241e-06, 1.0626e-06]], device='cuda:0') Epoch 101, bias, value: tensor([-0.0158, -0.0261, -0.0104, -0.0279, -0.0296, -0.0016, 0.0243, -0.0101, 0.0338, -0.0022], device='cuda:0'), grad: tensor([-2.3559e-05, -1.2606e-05, -9.7156e-05, -1.4007e-04, 2.2292e-05, -2.4945e-05, 9.8109e-05, -4.7415e-05, 1.0163e-04, 1.2350e-04], device='cuda:0') 100 0.0001 changing lr epoch 100, time 265.60, cls_loss 0.0083 cls_loss_mapping 0.0109 cls_loss_causal 0.5971 re_mapping 0.0099 re_causal 0.0255 /// teacc 98.88 lr 0.00010000 Epoch 102, weight, value: tensor([[-0.0550, -0.0775, -0.0516, ..., -0.0307, 0.1254, 0.1061], [-0.1137, -0.1191, -0.1093, ..., -0.0877, -0.1258, -0.0605], [-0.0729, -0.0539, 0.0902, ..., -0.0938, 0.1205, 0.0205], ..., [-0.0648, 0.0459, 0.0450, ..., 0.1038, -0.0781, -0.0933], [-0.1486, 0.0483, -0.1048, ..., 0.0340, -0.0623, -0.0560], [ 0.0441, -0.0440, -0.0557, ..., -0.0321, -0.0413, -0.0286]], device='cuda:0'), grad: tensor([[-2.2985e-06, -3.7253e-09, 5.1558e-06, ..., -7.5530e-07, -2.4840e-05, -3.7909e-05], [ 4.8522e-07, 1.2182e-06, 1.1576e-06, ..., -6.9477e-06, 4.3996e-06, 1.7509e-06], [ 1.1111e-06, -2.2724e-07, -4.6760e-05, ..., 1.2219e-06, -1.2887e-04, -1.8582e-05], ..., [ 7.6257e-06, 2.3171e-05, 2.1141e-07, ..., -2.2855e-06, 1.5562e-06, 9.7696e-07], [ 6.2659e-06, 4.1164e-07, 1.4678e-05, ..., 9.3505e-06, 4.9949e-05, 1.5870e-05], [ 6.6049e-06, 2.3201e-05, 6.7204e-06, ..., 1.1036e-06, 2.4229e-05, 8.5458e-06]], device='cuda:0') Epoch 102, bias, value: tensor([-0.0162, -0.0245, -0.0107, -0.0282, -0.0310, -0.0015, 0.0244, -0.0105, 0.0336, -0.0016], device='cuda:0'), grad: tensor([-1.6749e-05, -1.0502e-04, -2.5845e-04, -5.7429e-05, 1.0741e-04, -6.0648e-06, 3.3826e-05, 4.2826e-05, 1.7893e-04, 8.0407e-05], device='cuda:0') 100 0.0001 changing lr epoch 101, time 265.63, cls_loss 0.0060 cls_loss_mapping 0.0082 cls_loss_causal 0.5821 re_mapping 0.0100 re_causal 0.0261 /// teacc 98.89 lr 0.00010000 Epoch 103, weight, value: tensor([[-0.0573, -0.0789, -0.0517, ..., -0.0308, 0.1258, 0.1065], [-0.1141, -0.1193, -0.1096, ..., -0.0882, -0.1262, -0.0609], [-0.0733, -0.0538, 0.0906, ..., -0.0940, 0.1212, 0.0210], ..., [-0.0652, 0.0461, 0.0449, ..., 0.1043, -0.0787, -0.0942], [-0.1495, 0.0481, -0.1049, ..., 0.0342, -0.0630, -0.0563], [ 0.0439, -0.0446, -0.0558, ..., -0.0327, -0.0416, -0.0293]], device='cuda:0'), grad: tensor([[ 5.4017e-07, 1.5367e-07, 7.0781e-08, ..., 1.0477e-06, -2.6785e-06, -3.8259e-06], [ 2.2631e-07, 2.5835e-06, 2.1234e-07, ..., 2.2501e-06, 2.6692e-06, 1.3821e-06], [ 6.9384e-07, 1.9372e-05, 1.0058e-06, ..., -1.1539e-04, -2.2376e-04, -1.2422e-04], ..., [ 3.6415e-07, -2.7940e-05, -2.1569e-06, ..., -1.1802e-05, 2.2203e-05, 1.1474e-05], [ 2.8554e-06, 1.1884e-05, 5.0198e-07, ..., 2.8443e-06, 5.4240e-06, 3.1348e-06], [ 1.6578e-07, 2.6915e-07, -2.7101e-07, ..., 1.5181e-06, 2.2929e-06, 1.4352e-06]], device='cuda:0') Epoch 103, bias, value: tensor([-0.0163, -0.0246, -0.0105, -0.0284, -0.0312, -0.0013, 0.0250, -0.0106, 0.0335, -0.0017], device='cuda:0'), grad: tensor([ 6.1207e-06, 8.7246e-06, -6.6996e-04, 8.4996e-05, 1.2517e-05, 5.2834e-04, 3.9041e-06, 3.6918e-06, 1.8179e-05, 3.8259e-06], device='cuda:0') 100 0.0001 changing lr epoch 102, time 265.92, cls_loss 0.0068 cls_loss_mapping 0.0086 cls_loss_causal 0.5737 re_mapping 0.0095 re_causal 0.0248 /// teacc 98.92 lr 0.00010000 Epoch 104, weight, value: tensor([[-0.0577, -0.0795, -0.0518, ..., -0.0311, 0.1265, 0.1073], [-0.1144, -0.1198, -0.1097, ..., -0.0897, -0.1266, -0.0609], [-0.0738, -0.0544, 0.0909, ..., -0.0950, 0.1220, 0.0215], ..., [-0.0654, 0.0455, 0.0449, ..., 0.1051, -0.0792, -0.0947], [-0.1502, 0.0480, -0.1051, ..., 0.0345, -0.0634, -0.0565], [ 0.0443, -0.0449, -0.0558, ..., -0.0332, -0.0418, -0.0302]], device='cuda:0'), grad: tensor([[ 2.5332e-07, 5.6904e-07, 1.9558e-08, ..., 3.5856e-07, -3.9995e-05, -1.9863e-05], [ 1.4156e-07, 1.3523e-06, 1.2107e-08, ..., 9.6299e-07, 4.2468e-07, 1.4715e-07], [ 3.6135e-07, 2.2668e-06, -1.8254e-07, ..., 1.5171e-06, 5.4613e-06, 3.4906e-06], ..., [ 1.6112e-07, -2.7604e-06, 2.3283e-08, ..., -3.3863e-06, 1.6792e-06, 6.3889e-07], [-5.7966e-06, -1.5289e-05, 2.6077e-08, ..., -6.0722e-06, 1.3664e-05, 6.5900e-06], [-2.2352e-08, 5.4576e-07, 1.8626e-09, ..., 1.4780e-06, 7.4171e-06, 3.6582e-06]], device='cuda:0') Epoch 104, bias, value: tensor([-0.0161, -0.0252, -0.0106, -0.0276, -0.0303, -0.0016, 0.0247, -0.0106, 0.0336, -0.0020], device='cuda:0'), grad: tensor([-4.0919e-05, -4.3303e-05, 2.3678e-05, 6.4254e-05, 1.9416e-05, 2.5332e-05, 6.7800e-06, 3.2187e-05, -7.4685e-05, -1.2711e-05], device='cuda:0') 100 0.0001 changing lr epoch 103, time 265.39, cls_loss 0.0101 cls_loss_mapping 0.0119 cls_loss_causal 0.6117 re_mapping 0.0095 re_causal 0.0239 /// teacc 98.91 lr 0.00010000 Epoch 105, weight, value: tensor([[-0.0586, -0.0814, -0.0519, ..., -0.0314, 0.1266, 0.1086], [-0.1151, -0.1207, -0.1097, ..., -0.0908, -0.1273, -0.0618], [-0.0750, -0.0551, 0.0909, ..., -0.0964, 0.1215, 0.0209], ..., [-0.0662, 0.0461, 0.0450, ..., 0.1058, -0.0796, -0.0952], [-0.1512, 0.0483, -0.1052, ..., 0.0351, -0.0641, -0.0567], [ 0.0440, -0.0459, -0.0559, ..., -0.0336, -0.0407, -0.0307]], device='cuda:0'), grad: tensor([[ 7.6275e-07, 1.6456e-06, 2.7940e-09, ..., 1.0682e-06, -1.0592e-04, -2.7940e-05], [ 4.1723e-07, 7.0222e-06, 9.3132e-10, ..., 2.1588e-06, 3.8520e-06, 4.1537e-07], [ 1.1912e-06, 2.6017e-05, -2.1979e-07, ..., 8.6874e-06, -4.8369e-05, 3.5763e-07], ..., [ 2.3469e-07, 8.6501e-06, 1.4249e-07, ..., -2.6319e-06, 4.3154e-05, 2.2054e-06], [ 3.1084e-05, 1.6451e-04, 2.7008e-08, ..., 7.6652e-05, 9.9018e-06, 2.7083e-06], [ 7.2457e-07, 3.5092e-06, 9.3132e-10, ..., -5.2378e-06, 1.2673e-05, 2.9001e-06]], device='cuda:0') Epoch 105, bias, value: tensor([-0.0171, -0.0269, -0.0116, -0.0274, -0.0300, -0.0010, 0.0249, -0.0103, 0.0349, -0.0014], device='cuda:0'), grad: tensor([-1.7977e-04, -2.6250e-04, -1.4296e-06, -5.6362e-04, 7.5042e-05, -2.3097e-05, 1.0091e-04, 2.8992e-04, 5.7125e-04, -7.1265e-06], device='cuda:0') 100 0.0001 changing lr epoch 104, time 265.16, cls_loss 0.0064 cls_loss_mapping 0.0078 cls_loss_causal 0.5711 re_mapping 0.0104 re_causal 0.0260 /// teacc 98.88 lr 0.00010000 Epoch 106, weight, value: tensor([[-0.0590, -0.0820, -0.0520, ..., -0.0315, 0.1271, 0.1091], [-0.1153, -0.1216, -0.1098, ..., -0.0918, -0.1277, -0.0619], [-0.0754, -0.0561, 0.0913, ..., -0.0981, 0.1222, 0.0211], ..., [-0.0665, 0.0467, 0.0449, ..., 0.1070, -0.0803, -0.0956], [-0.1518, 0.0484, -0.1054, ..., 0.0351, -0.0641, -0.0570], [ 0.0449, -0.0462, -0.0559, ..., -0.0339, -0.0410, -0.0311]], device='cuda:0'), grad: tensor([[ 3.9581e-07, 1.3039e-07, 1.9558e-08, ..., 3.7439e-07, -2.0012e-05, -1.2428e-05], [ 2.3749e-07, 7.5623e-07, 2.7940e-09, ..., 3.0268e-07, 1.6112e-07, 7.8231e-08], [ 8.6613e-07, 6.8806e-06, -1.5832e-08, ..., 6.7055e-07, 2.3860e-06, 1.4622e-06], ..., [ 1.5441e-06, -3.9823e-06, 1.8626e-09, ..., -1.3657e-05, 3.0920e-07, 1.4342e-07], [ 3.2876e-06, 7.9442e-07, 7.4506e-09, ..., 4.9025e-06, 1.0636e-06, 6.6124e-07], [ 8.3726e-07, 7.1004e-06, 0.0000e+00, ..., 9.5740e-06, 1.2293e-05, 7.6517e-06]], device='cuda:0') Epoch 106, bias, value: tensor([-0.0171, -0.0271, -0.0120, -0.0281, -0.0297, -0.0007, 0.0247, -0.0097, 0.0354, -0.0016], device='cuda:0'), grad: tensor([ 2.1350e-04, -1.7080e-03, 1.9044e-05, -4.8131e-06, 1.4529e-05, 8.4341e-05, 9.0063e-05, 1.2308e-05, 1.1902e-03, 8.9705e-05], device='cuda:0') 100 0.0001 changing lr epoch 105, time 265.16, cls_loss 0.0077 cls_loss_mapping 0.0103 cls_loss_causal 0.5827 re_mapping 0.0096 re_causal 0.0241 /// teacc 98.84 lr 0.00010000 Epoch 107, weight, value: tensor([[-0.0600, -0.0825, -0.0523, ..., -0.0318, 0.1272, 0.1092], [-0.1156, -0.1219, -0.1098, ..., -0.0921, -0.1278, -0.0619], [-0.0760, -0.0564, 0.0916, ..., -0.0993, 0.1232, 0.0215], ..., [-0.0671, 0.0471, 0.0448, ..., 0.1076, -0.0817, -0.0966], [-0.1528, 0.0488, -0.1053, ..., 0.0364, -0.0646, -0.0574], [ 0.0450, -0.0470, -0.0559, ..., -0.0348, -0.0410, -0.0318]], device='cuda:0'), grad: tensor([[ 2.6464e-04, 1.4715e-07, 3.9116e-08, ..., 5.8937e-04, 1.7679e-04, -9.9361e-05], [ 2.6934e-06, 4.7777e-07, 3.7253e-09, ..., 6.2063e-06, 6.6236e-06, 2.3674e-06], [ 8.1118e-07, 2.6822e-07, 4.6566e-09, ..., 1.7751e-06, 1.0185e-05, 4.3660e-06], ..., [ 1.2785e-05, -2.9802e-07, 3.7253e-09, ..., 2.6882e-05, 1.9655e-05, 3.2634e-06], [ 1.1700e-04, -7.4580e-06, 2.7940e-09, ..., 2.3901e-04, 1.4186e-04, 9.0227e-06], [ 8.3223e-06, 2.4401e-07, 1.8626e-09, ..., 1.8165e-05, 3.7760e-05, 1.9625e-05]], device='cuda:0') Epoch 107, bias, value: tensor([-0.0176, -0.0262, -0.0124, -0.0285, -0.0313, -0.0006, 0.0250, -0.0099, 0.0355, -0.0009], device='cuda:0'), grad: tensor([ 7.2908e-04, 1.6987e-05, 7.5340e-05, 5.2929e-05, 3.5912e-05, -5.5542e-03, 4.1580e-03, 6.1452e-05, 3.1543e-04, 1.0973e-04], device='cuda:0') 100 0.0001 changing lr epoch 106, time 265.26, cls_loss 0.0077 cls_loss_mapping 0.0085 cls_loss_causal 0.5726 re_mapping 0.0093 re_causal 0.0231 /// teacc 98.94 lr 0.00010000 Epoch 108, weight, value: tensor([[-0.0610, -0.0831, -0.0529, ..., -0.0327, 0.1274, 0.1096], [-0.1159, -0.1221, -0.1102, ..., -0.0924, -0.1283, -0.0623], [-0.0763, -0.0568, 0.0927, ..., -0.1001, 0.1240, 0.0222], ..., [-0.0672, 0.0473, 0.0445, ..., 0.1085, -0.0824, -0.0975], [-0.1539, 0.0486, -0.1059, ..., 0.0366, -0.0645, -0.0578], [ 0.0447, -0.0474, -0.0562, ..., -0.0355, -0.0413, -0.0324]], device='cuda:0'), grad: tensor([[ 1.3132e-07, 1.2973e-06, 8.0187e-07, ..., 3.5986e-06, -2.5225e-04, -1.5152e-04], [ 1.1455e-07, 2.5071e-06, 1.8161e-07, ..., 3.6180e-05, 1.1092e-06, 4.9174e-07], [ 1.0803e-07, 1.5004e-06, -2.6077e-06, ..., 3.8743e-06, -4.8839e-06, -1.6745e-06], ..., [ 4.0047e-07, 3.7234e-06, 6.9011e-07, ..., -3.3051e-05, 1.8273e-06, 7.4506e-07], [ 4.0606e-07, -5.6297e-05, 1.0524e-07, ..., -1.7166e-04, 9.0338e-07, 6.2212e-07], [ 3.0454e-07, 5.3421e-06, 9.7509e-07, ..., 1.8403e-05, 5.9940e-06, 1.6429e-06]], device='cuda:0') Epoch 108, bias, value: tensor([-0.0179, -0.0274, -0.0103, -0.0288, -0.0308, -0.0007, 0.0254, -0.0100, 0.0353, -0.0011], device='cuda:0'), grad: tensor([-2.5392e-04, 7.6592e-05, -3.5092e-06, 8.2374e-05, 6.1877e-06, 2.1541e-04, 2.7847e-04, -4.6939e-05, -4.0960e-04, 5.5015e-05], device='cuda:0') 100 0.0001 changing lr epoch 107, time 265.03, cls_loss 0.0086 cls_loss_mapping 0.0097 cls_loss_causal 0.5773 re_mapping 0.0094 re_causal 0.0231 /// teacc 98.88 lr 0.00010000 Epoch 109, weight, value: tensor([[-0.0614, -0.0835, -0.0529, ..., -0.0331, 0.1274, 0.1105], [-0.1171, -0.1224, -0.1090, ..., -0.0928, -0.1306, -0.0646], [-0.0769, -0.0571, 0.0921, ..., -0.1008, 0.1250, 0.0238], ..., [-0.0677, 0.0475, 0.0446, ..., 0.1094, -0.0826, -0.0982], [-0.1549, 0.0487, -0.1058, ..., 0.0365, -0.0652, -0.0583], [ 0.0443, -0.0479, -0.0569, ..., -0.0360, -0.0405, -0.0332]], device='cuda:0'), grad: tensor([[ 1.5404e-06, 2.0433e-06, 1.6885e-06, ..., 8.5682e-07, 2.5462e-06, 8.9221e-07], [ 5.7276e-07, 6.6645e-06, 2.4680e-07, ..., 2.5537e-06, 4.0513e-07, 4.5542e-07], [ 1.5395e-06, 9.8571e-06, -1.0476e-05, ..., 1.1856e-06, -3.0566e-06, 1.9055e-06], ..., [ 2.7549e-06, 2.7046e-05, 4.0829e-06, ..., -1.2428e-05, 4.9695e-06, 4.7535e-06], [ 5.9754e-06, 1.4514e-05, 1.1874e-06, ..., 3.7532e-06, 1.6624e-06, 3.6173e-06], [ 4.1015e-06, -3.1233e-05, 4.7870e-07, ..., 6.6273e-06, 5.6345e-07, 1.1222e-06]], device='cuda:0') Epoch 109, bias, value: tensor([-0.0186, -0.0264, -0.0111, -0.0286, -0.0315, -0.0007, 0.0255, -0.0101, 0.0351, -0.0006], device='cuda:0'), grad: tensor([ 2.0966e-05, 2.5302e-05, -5.3793e-05, 1.0163e-05, -1.2428e-05, -6.8724e-05, -2.8983e-06, 2.2137e-04, 4.5180e-05, -1.8489e-04], device='cuda:0') 100 0.0001 changing lr epoch 108, time 265.12, cls_loss 0.0079 cls_loss_mapping 0.0085 cls_loss_causal 0.6028 re_mapping 0.0098 re_causal 0.0233 /// teacc 98.91 lr 0.00010000 Epoch 110, weight, value: tensor([[-0.0617, -0.0841, -0.0530, ..., -0.0334, 0.1276, 0.1103], [-0.1174, -0.1228, -0.1097, ..., -0.0930, -0.1331, -0.0650], [-0.0774, -0.0569, 0.0926, ..., -0.1011, 0.1262, 0.0244], ..., [-0.0679, 0.0475, 0.0446, ..., 0.1102, -0.0837, -0.0994], [-0.1565, 0.0484, -0.1059, ..., 0.0364, -0.0662, -0.0592], [ 0.0444, -0.0486, -0.0570, ..., -0.0374, -0.0410, -0.0342]], device='cuda:0'), grad: tensor([[ 5.7742e-08, 2.6710e-06, 1.8626e-09, ..., 6.9253e-06, 2.3004e-06, 1.0189e-06], [ 1.2107e-07, 7.3552e-05, 0.0000e+00, ..., 5.9038e-05, 7.1805e-07, 4.2468e-07], [ 1.1642e-07, 1.5509e-04, -1.3039e-08, ..., 1.3173e-04, 7.8883e-07, 8.9686e-07], ..., [ 2.5518e-07, -7.0667e-04, 5.5879e-09, ..., -5.7697e-04, 2.6748e-06, 1.5590e-06], [ 2.6077e-07, 2.3782e-05, 9.3132e-10, ..., 5.2191e-06, -2.5898e-05, -1.5587e-05], [ 3.8184e-08, 3.8836e-07, 0.0000e+00, ..., 3.6675e-06, 3.3733e-06, 2.0191e-06]], device='cuda:0') Epoch 110, bias, value: tensor([-0.0189, -0.0275, -0.0108, -0.0277, -0.0305, -0.0011, 0.0277, -0.0105, 0.0349, -0.0015], device='cuda:0'), grad: tensor([ 3.2544e-05, 1.5080e-04, 3.4738e-04, 1.0128e-03, 7.0184e-06, 4.4912e-05, 1.6049e-05, -1.5764e-03, -1.8761e-05, -1.6958e-05], device='cuda:0') 100 0.0001 changing lr epoch 109, time 265.55, cls_loss 0.0074 cls_loss_mapping 0.0086 cls_loss_causal 0.5927 re_mapping 0.0088 re_causal 0.0226 /// teacc 98.87 lr 0.00010000 Epoch 111, weight, value: tensor([[-0.0624, -0.0849, -0.0532, ..., -0.0336, 0.1290, 0.1112], [-0.1179, -0.1232, -0.1101, ..., -0.0932, -0.1332, -0.0649], [-0.0778, -0.0570, 0.0933, ..., -0.1020, 0.1270, 0.0246], ..., [-0.0676, 0.0479, 0.0445, ..., 0.1111, -0.0848, -0.0999], [-0.1575, 0.0484, -0.1062, ..., 0.0362, -0.0661, -0.0596], [ 0.0439, -0.0494, -0.0571, ..., -0.0375, -0.0417, -0.0351]], device='cuda:0'), grad: tensor([[ 2.9746e-06, 4.0419e-07, 4.6566e-09, ..., 1.7490e-06, -3.9369e-05, -2.3171e-05], [ 1.4454e-06, 1.0980e-06, 0.0000e+00, ..., 1.6307e-06, 7.4320e-07, 4.0885e-07], [ 7.6182e-07, 5.8077e-06, -8.3819e-09, ..., 7.1563e-06, 1.9539e-06, 1.0813e-06], ..., [ 1.0338e-06, -7.2643e-06, 1.8626e-09, ..., -1.0528e-05, 5.6159e-07, 2.9989e-07], [ 2.1160e-05, 1.6112e-06, 9.3132e-10, ..., -2.3559e-05, 9.2760e-06, 7.8455e-06], [ 1.4370e-06, 1.2405e-06, 9.3132e-10, ..., 3.2395e-05, 1.7002e-05, 7.7784e-06]], device='cuda:0') Epoch 111, bias, value: tensor([-0.0185, -0.0271, -0.0104, -0.0279, -0.0309, -0.0016, 0.0281, -0.0110, 0.0349, -0.0016], device='cuda:0'), grad: tensor([-6.1333e-05, 6.8918e-06, 2.6241e-05, 1.4019e-04, 2.2382e-05, -2.4509e-04, 7.3671e-05, -1.1474e-05, -2.0063e-04, 2.4843e-04], device='cuda:0') 100 0.0001 changing lr epoch 110, time 265.84, cls_loss 0.0061 cls_loss_mapping 0.0077 cls_loss_causal 0.5713 re_mapping 0.0092 re_causal 0.0233 /// teacc 98.86 lr 0.00010000 Epoch 112, weight, value: tensor([[-0.0625, -0.0849, -0.0534, ..., -0.0336, 0.1296, 0.1119], [-0.1183, -0.1230, -0.1089, ..., -0.0945, -0.1332, -0.0641], [-0.0783, -0.0573, 0.0938, ..., -0.1026, 0.1274, 0.0245], ..., [-0.0677, 0.0481, 0.0438, ..., 0.1123, -0.0856, -0.1006], [-0.1591, 0.0481, -0.1063, ..., 0.0358, -0.0667, -0.0603], [ 0.0433, -0.0497, -0.0578, ..., -0.0381, -0.0421, -0.0357]], device='cuda:0'), grad: tensor([[ 2.0321e-06, 3.8184e-08, 9.3319e-07, ..., -3.0827e-07, 3.1646e-06, 1.7351e-06], [ 6.2305e-07, 5.4110e-07, 6.6590e-07, ..., 5.1316e-07, 1.4780e-06, 7.9069e-07], [ 3.3993e-07, -2.6245e-06, -7.9051e-06, ..., 9.4064e-08, -7.8306e-06, -1.4501e-06], ..., [ 2.2724e-07, -2.0415e-06, 6.4857e-06, ..., -3.6452e-06, 6.2138e-06, 1.4333e-06], [ 4.6901e-06, 4.4890e-07, 3.9767e-07, ..., 1.5721e-06, 3.8520e-06, 3.1963e-06], [-9.0152e-07, 6.0536e-08, 1.6019e-07, ..., -3.8743e-07, 4.9733e-07, 1.9744e-07]], device='cuda:0') Epoch 112, bias, value: tensor([-0.0182, -0.0257, -0.0114, -0.0279, -0.0313, -0.0010, 0.0276, -0.0111, 0.0343, -0.0018], device='cuda:0'), grad: tensor([ 7.6145e-06, 7.6517e-06, -1.2375e-05, 1.4804e-05, -1.0557e-05, 2.8074e-05, -5.9903e-05, 1.0207e-05, 4.2945e-05, -2.8446e-05], device='cuda:0') 100 0.0001 changing lr epoch 111, time 265.53, cls_loss 0.0070 cls_loss_mapping 0.0071 cls_loss_causal 0.5595 re_mapping 0.0086 re_causal 0.0217 /// teacc 98.93 lr 0.00010000 Epoch 113, weight, value: tensor([[-0.0633, -0.0850, -0.0537, ..., -0.0345, 0.1311, 0.1135], [-0.1193, -0.1228, -0.1095, ..., -0.0948, -0.1340, -0.0642], [-0.0789, -0.0574, 0.0951, ..., -0.1029, 0.1280, 0.0249], ..., [-0.0680, 0.0483, 0.0432, ..., 0.1128, -0.0868, -0.1022], [-0.1610, 0.0478, -0.1065, ..., 0.0352, -0.0675, -0.0607], [ 0.0433, -0.0501, -0.0579, ..., -0.0390, -0.0424, -0.0365]], device='cuda:0'), grad: tensor([[ 3.7178e-06, 1.1027e-06, 4.6566e-09, ..., 2.6356e-07, -5.4091e-06, -3.9488e-06], [ 3.9339e-06, 8.5473e-05, 3.7439e-07, ..., 9.6381e-05, 2.2501e-06, 1.1269e-06], [ 4.1798e-06, 6.1728e-06, -5.0291e-07, ..., 4.4629e-06, 2.5015e-06, 2.2817e-06], ..., [ 2.5220e-06, -9.8586e-05, 3.1665e-08, ..., -1.1450e-04, 8.0932e-07, 2.3656e-07], [ 6.0238e-06, 1.0123e-06, 9.3132e-09, ..., -1.5432e-06, 1.2303e-06, 6.7614e-07], [ 1.4260e-05, 4.3623e-06, 9.3132e-10, ..., 2.3972e-06, 1.3569e-06, 6.5938e-07]], device='cuda:0') Epoch 113, bias, value: tensor([-0.0171, -0.0252, -0.0113, -0.0276, -0.0310, -0.0016, 0.0276, -0.0116, 0.0335, -0.0020], device='cuda:0'), grad: tensor([ 2.0824e-06, 2.9755e-04, 2.3052e-05, 4.3416e-04, 1.9073e-05, -5.2404e-04, 4.2409e-05, -3.2973e-04, 4.6901e-06, 3.0398e-05], device='cuda:0') 100 0.0001 changing lr epoch 112, time 265.61, cls_loss 0.0068 cls_loss_mapping 0.0069 cls_loss_causal 0.5831 re_mapping 0.0088 re_causal 0.0225 /// teacc 98.76 lr 0.00010000 Epoch 114, weight, value: tensor([[-0.0636, -0.0855, -0.0538, ..., -0.0351, 0.1312, 0.1148], [-0.1198, -0.1233, -0.1095, ..., -0.0957, -0.1344, -0.0644], [-0.0792, -0.0585, 0.0952, ..., -0.1050, 0.1280, 0.0250], ..., [-0.0683, 0.0480, 0.0433, ..., 0.1121, -0.0875, -0.1027], [-0.1618, 0.0491, -0.1066, ..., 0.0370, -0.0678, -0.0611], [ 0.0430, -0.0497, -0.0580, ..., -0.0380, -0.0414, -0.0380]], device='cuda:0'), grad: tensor([[ 1.1027e-05, 1.4063e-07, 5.1688e-08, ..., 5.3532e-06, -2.7735e-06, -1.8906e-06], [ 2.0117e-06, 8.6101e-07, 4.4238e-08, ..., 1.5143e-06, 3.0966e-07, 1.2061e-07], [ 2.1178e-06, 1.3635e-06, 1.7695e-08, ..., 1.1353e-06, -3.8967e-06, -7.7765e-07], ..., [ 6.4373e-06, 2.2780e-06, 2.4540e-07, ..., -1.8487e-06, 4.1462e-06, 1.0310e-06], [ 5.2452e-05, 4.8429e-06, 6.8452e-08, ..., 2.2113e-05, 6.4559e-06, 3.1479e-07], [ 2.9996e-05, 1.1353e-06, 9.2201e-08, ..., 1.6868e-05, 1.1567e-06, 6.7381e-07]], device='cuda:0') Epoch 114, bias, value: tensor([-0.0180, -0.0253, -0.0121, -0.0280, -0.0309, -0.0013, 0.0275, -0.0124, 0.0343, -0.0004], device='cuda:0'), grad: tensor([ 2.8715e-05, 4.3750e-05, 1.5959e-05, 4.3368e-04, -1.2684e-04, -6.7759e-04, 9.4846e-06, 3.4899e-05, 1.5461e-04, 8.3804e-05], device='cuda:0') 100 0.0001 changing lr epoch 113, time 265.41, cls_loss 0.0060 cls_loss_mapping 0.0064 cls_loss_causal 0.5533 re_mapping 0.0091 re_causal 0.0226 /// teacc 98.93 lr 0.00010000 Epoch 115, weight, value: tensor([[-0.0640, -0.0859, -0.0542, ..., -0.0353, 0.1319, 0.1158], [-0.1215, -0.1236, -0.1095, ..., -0.0961, -0.1347, -0.0643], [-0.0796, -0.0587, 0.0955, ..., -0.1054, 0.1282, 0.0253], ..., [-0.0686, 0.0485, 0.0432, ..., 0.1130, -0.0880, -0.1035], [-0.1628, 0.0488, -0.1070, ..., 0.0367, -0.0682, -0.0614], [ 0.0429, -0.0506, -0.0582, ..., -0.0389, -0.0417, -0.0387]], device='cuda:0'), grad: tensor([[ 2.3469e-06, 4.5169e-08, 4.0047e-08, ..., 1.1781e-06, -6.7521e-08, 4.0606e-06], [ 2.0117e-06, 1.4203e-07, 2.4334e-05, ..., 1.1455e-06, 2.7925e-05, -1.1303e-05], [ 1.4147e-06, 2.5844e-07, -2.6867e-05, ..., 7.6788e-07, -3.2365e-05, 1.1148e-06], ..., [ 1.9483e-06, -3.8370e-07, 2.1327e-06, ..., 4.4471e-07, 2.7996e-06, 1.2117e-06], [ 8.2329e-06, -5.9139e-07, 7.9628e-08, ..., 4.0159e-06, 5.2806e-07, 1.9372e-06], [ 2.9169e-06, 1.4016e-07, 6.5155e-06, ..., 1.8179e-06, 6.1374e-07, 1.0403e-06]], device='cuda:0') Epoch 115, bias, value: tensor([-0.0177, -0.0250, -0.0125, -0.0288, -0.0305, -0.0006, 0.0275, -0.0122, 0.0338, -0.0008], device='cuda:0'), grad: tensor([ 2.3663e-05, 3.5495e-05, -8.7559e-05, 1.1510e-04, -5.9932e-05, -1.4782e-04, 1.3724e-05, 1.6600e-05, 2.0489e-05, 7.0274e-05], device='cuda:0') 100 0.0001 changing lr epoch 114, time 265.37, cls_loss 0.0055 cls_loss_mapping 0.0065 cls_loss_causal 0.5722 re_mapping 0.0091 re_causal 0.0229 /// teacc 98.86 lr 0.00010000 Epoch 116, weight, value: tensor([[-0.0643, -0.0864, -0.0544, ..., -0.0357, 0.1323, 0.1163], [-0.1220, -0.1240, -0.1098, ..., -0.0965, -0.1351, -0.0646], [-0.0799, -0.0591, 0.0961, ..., -0.1059, 0.1290, 0.0260], ..., [-0.0688, 0.0474, 0.0429, ..., 0.1132, -0.0886, -0.1045], [-0.1633, 0.0491, -0.1072, ..., 0.0370, -0.0686, -0.0617], [ 0.0425, -0.0512, -0.0578, ..., -0.0395, -0.0419, -0.0394]], device='cuda:0'), grad: tensor([[ 7.4506e-08, 4.2981e-07, 4.6566e-10, ..., 3.8417e-07, -1.0310e-06, -1.2424e-06], [ 5.4948e-08, 1.9418e-07, 4.6566e-10, ..., 1.7090e-07, 2.0536e-07, 1.3178e-07], [ 3.1991e-07, 3.6806e-06, -2.3283e-09, ..., 5.0925e-06, 4.9286e-06, 2.9188e-06], ..., [ 5.2620e-08, -7.0874e-07, 2.7940e-09, ..., -8.2934e-07, 4.1910e-07, 2.8266e-07], [ 2.0443e-07, -6.2101e-06, 4.6566e-10, ..., -9.7454e-06, -1.1772e-05, -6.7502e-06], [ 2.2352e-07, 2.5239e-07, 4.6566e-10, ..., 2.0675e-07, 5.8720e-07, 4.8103e-07]], device='cuda:0') Epoch 116, bias, value: tensor([-0.0177, -0.0255, -0.0115, -0.0286, -0.0307, -0.0002, 0.0272, -0.0127, 0.0340, -0.0009], device='cuda:0'), grad: tensor([ 7.7346e-07, 9.8161e-07, 2.4781e-05, 6.6236e-06, 2.6524e-06, 1.7226e-05, 8.4564e-07, 4.5868e-07, -5.3018e-05, -1.3616e-06], device='cuda:0') 100 0.0001 changing lr epoch 115, time 265.41, cls_loss 0.0063 cls_loss_mapping 0.0073 cls_loss_causal 0.5718 re_mapping 0.0092 re_causal 0.0221 /// teacc 98.93 lr 0.00010000 Epoch 117, weight, value: tensor([[-0.0649, -0.0869, -0.0546, ..., -0.0365, 0.1333, 0.1173], [-0.1225, -0.1244, -0.1099, ..., -0.0971, -0.1352, -0.0645], [-0.0804, -0.0603, 0.0967, ..., -0.1062, 0.1298, 0.0265], ..., [-0.0691, 0.0476, 0.0426, ..., 0.1141, -0.0896, -0.1058], [-0.1642, 0.0487, -0.1075, ..., 0.0369, -0.0691, -0.0620], [ 0.0420, -0.0520, -0.0578, ..., -0.0400, -0.0421, -0.0399]], device='cuda:0'), grad: tensor([[ 2.5518e-07, 2.5891e-07, 4.6566e-10, ..., 3.5716e-07, -4.9509e-06, -3.5148e-06], [ 2.4633e-07, 1.1120e-06, -1.9278e-07, ..., 9.4902e-07, 1.1781e-07, 5.4948e-08], [ 1.8440e-07, 7.3295e-07, 1.2387e-07, ..., 6.4215e-07, -1.5246e-06, 4.1723e-07], ..., [ 2.8824e-07, -7.9200e-06, 1.8626e-09, ..., -1.0408e-05, 3.8883e-07, 1.1967e-07], [ 6.0210e-07, 2.0396e-06, 3.2596e-09, ..., 1.7341e-06, 4.6659e-07, 1.3411e-07], [ 2.7046e-06, 3.0734e-06, 9.3132e-10, ..., 3.1069e-06, 2.3376e-06, 1.3886e-06]], device='cuda:0') Epoch 117, bias, value: tensor([-0.0173, -0.0254, -0.0115, -0.0275, -0.0306, -0.0002, 0.0267, -0.0128, 0.0335, -0.0012], device='cuda:0'), grad: tensor([-5.5209e-06, -1.7390e-05, -3.0510e-06, 4.9099e-06, 1.1921e-05, -2.5630e-06, 4.3437e-06, -3.5316e-06, 8.9183e-06, 1.9483e-06], device='cuda:0') 100 0.0001 changing lr epoch 116, time 265.55, cls_loss 0.0073 cls_loss_mapping 0.0079 cls_loss_causal 0.5915 re_mapping 0.0085 re_causal 0.0214 /// teacc 98.93 lr 0.00010000 Epoch 118, weight, value: tensor([[-0.0654, -0.0874, -0.0549, ..., -0.0365, 0.1334, 0.1174], [-0.1229, -0.1268, -0.1098, ..., -0.0965, -0.1353, -0.0646], [-0.0810, -0.0610, 0.0970, ..., -0.1073, 0.1301, 0.0269], ..., [-0.0693, 0.0489, 0.0430, ..., 0.1159, -0.0902, -0.1064], [-0.1648, 0.0486, -0.1080, ..., 0.0370, -0.0692, -0.0622], [ 0.0416, -0.0523, -0.0579, ..., -0.0414, -0.0422, -0.0405]], device='cuda:0'), grad: tensor([[ 5.6392e-07, 2.3562e-07, 9.5926e-08, ..., -1.9204e-06, -1.4119e-05, -8.2180e-06], [ 5.5553e-07, 4.6752e-07, 5.3551e-08, ..., 1.0338e-06, 1.2582e-06, 7.6462e-07], [ 1.5823e-06, 3.3956e-06, 2.5611e-08, ..., 7.1991e-07, -2.5146e-08, -3.1479e-07], ..., [ 4.7265e-07, 1.1437e-06, 2.4680e-08, ..., 1.8878e-06, 2.8834e-06, 2.0470e-06], [ 6.0834e-06, 1.3197e-06, 1.6764e-08, ..., 8.6799e-06, 1.6317e-06, 9.0990e-07], [ 2.3190e-07, 1.3923e-07, 1.3970e-09, ..., 9.4622e-06, 4.3064e-06, 2.3022e-06]], device='cuda:0') Epoch 118, bias, value: tensor([-0.0176, -0.0260, -0.0120, -0.0276, -0.0312, -0.0002, 0.0268, -0.0110, 0.0334, -0.0015], device='cuda:0'), grad: tensor([-1.5162e-05, -1.2264e-03, 2.7806e-05, 5.8591e-05, -2.3067e-04, -1.7807e-05, -6.2166e-07, 1.1492e-03, 3.2365e-05, 2.2197e-04], device='cuda:0') 100 0.0001 changing lr epoch 117, time 265.24, cls_loss 0.0066 cls_loss_mapping 0.0085 cls_loss_causal 0.5924 re_mapping 0.0089 re_causal 0.0223 /// teacc 98.92 lr 0.00010000 Epoch 119, weight, value: tensor([[-0.0656, -0.0880, -0.0551, ..., -0.0360, 0.1342, 0.1181], [-0.1232, -0.1274, -0.1095, ..., -0.0969, -0.1355, -0.0650], [-0.0813, -0.0617, 0.0970, ..., -0.1086, 0.1309, 0.0275], ..., [-0.0698, 0.0482, 0.0429, ..., 0.1157, -0.0909, -0.1071], [-0.1658, 0.0489, -0.1082, ..., 0.0373, -0.0697, -0.0625], [ 0.0420, -0.0533, -0.0579, ..., -0.0414, -0.0429, -0.0411]], device='cuda:0'), grad: tensor([[ 6.4401e-07, 2.5146e-08, 1.3644e-07, ..., 8.5682e-08, 5.5879e-08, -5.2759e-07], [ 2.7474e-07, 1.0291e-07, 1.8161e-08, ..., 1.6158e-07, 1.0943e-06, 3.7579e-07], [ 2.8173e-07, 1.2992e-07, 2.8871e-08, ..., 1.2014e-07, 1.0394e-06, 1.9697e-07], ..., [ 7.2177e-08, 4.6100e-08, 4.6566e-10, ..., 7.9162e-08, 4.0140e-07, 2.3749e-07], [ 1.1353e-06, 2.2491e-07, 2.5146e-08, ..., 3.7486e-07, -5.3644e-07, 7.0035e-07], [ 2.8685e-07, -1.0990e-07, 2.7940e-09, ..., -2.3330e-07, 2.5798e-06, 4.7917e-07]], device='cuda:0') Epoch 119, bias, value: tensor([-0.0172, -0.0255, -0.0123, -0.0271, -0.0309, -0.0002, 0.0265, -0.0121, 0.0342, -0.0016], device='cuda:0'), grad: tensor([ 2.7977e-06, 7.4804e-06, 2.3004e-06, 1.1995e-05, -4.1425e-05, -1.0125e-05, -1.3083e-05, 7.0594e-06, -5.8524e-06, 3.8743e-05], device='cuda:0') 100 0.0001 changing lr epoch 118, time 265.30, cls_loss 0.0066 cls_loss_mapping 0.0076 cls_loss_causal 0.5834 re_mapping 0.0088 re_causal 0.0221 /// teacc 98.91 lr 0.00010000 Epoch 120, weight, value: tensor([[-0.0659, -0.0886, -0.0554, ..., -0.0360, 0.1348, 0.1188], [-0.1240, -0.1281, -0.1100, ..., -0.0975, -0.1362, -0.0655], [-0.0820, -0.0621, 0.0972, ..., -0.1093, 0.1312, 0.0275], ..., [-0.0701, 0.0490, 0.0428, ..., 0.1168, -0.0913, -0.1076], [-0.1657, 0.0490, -0.1081, ..., 0.0385, -0.0673, -0.0629], [ 0.0421, -0.0538, -0.0579, ..., -0.0416, -0.0432, -0.0415]], device='cuda:0'), grad: tensor([[ 7.5437e-08, 1.6298e-07, 1.3970e-08, ..., 1.4110e-06, -5.9232e-07, -3.8277e-07], [ 5.8673e-08, 3.1143e-06, 9.3132e-10, ..., 4.8317e-06, 6.7987e-08, 2.0489e-08], [ 6.3330e-08, 3.4571e-06, 1.8626e-09, ..., 1.1332e-05, -1.1146e-05, 1.0896e-07], ..., [ 6.4261e-08, -1.0543e-05, 0.0000e+00, ..., -5.7101e-05, 5.1595e-07, 1.7695e-08], [ 2.4587e-07, 8.8848e-07, 9.3132e-10, ..., 4.2841e-07, 9.3691e-07, 7.5437e-08], [ 1.4249e-07, 1.0384e-06, 0.0000e+00, ..., 2.2817e-06, 2.3656e-07, 1.2014e-07]], device='cuda:0') Epoch 120, bias, value: tensor([-0.0170, -0.0257, -0.0118, -0.0278, -0.0317, 0.0002, 0.0256, -0.0118, 0.0354, -0.0016], device='cuda:0'), grad: tensor([ 2.2613e-06, 5.0254e-06, -1.3141e-06, 8.0615e-06, 4.7028e-05, 2.7016e-05, 1.3923e-06, -9.7692e-05, 6.0685e-06, 2.0899e-06], device='cuda:0') 100 0.0001 changing lr epoch 119, time 265.39, cls_loss 0.0058 cls_loss_mapping 0.0057 cls_loss_causal 0.5625 re_mapping 0.0087 re_causal 0.0217 /// teacc 98.80 lr 0.00010000 Epoch 121, weight, value: tensor([[-0.0673, -0.0893, -0.0557, ..., -0.0370, 0.1342, 0.1183], [-0.1248, -0.1284, -0.1102, ..., -0.0977, -0.1364, -0.0656], [-0.0825, -0.0628, 0.0976, ..., -0.1104, 0.1317, 0.0275], ..., [-0.0704, 0.0497, 0.0427, ..., 0.1179, -0.0919, -0.1078], [-0.1665, 0.0489, -0.1081, ..., 0.0385, -0.0676, -0.0634], [ 0.0419, -0.0546, -0.0579, ..., -0.0425, -0.0435, -0.0418]], device='cuda:0'), grad: tensor([[ 2.7381e-07, 6.1467e-08, 0.0000e+00, ..., -4.3474e-06, -1.4052e-05, -8.6427e-06], [ 8.3819e-08, 7.2364e-07, 2.0489e-08, ..., 3.8557e-07, 6.4634e-07, 2.5891e-07], [ 8.1025e-08, -2.4885e-06, -5.3551e-07, ..., 6.1002e-07, -4.7497e-06, -5.6811e-07], ..., [ 2.1141e-07, 8.4750e-07, 5.1502e-07, ..., -1.6307e-06, 4.7944e-06, 8.0001e-07], [ 7.1824e-05, 2.7567e-06, 0.0000e+00, ..., 1.6224e-04, 1.2098e-06, 9.8813e-07], [ 2.1700e-07, 4.3306e-07, 0.0000e+00, ..., 1.8338e-06, 3.0138e-06, 1.7975e-06]], device='cuda:0') Epoch 121, bias, value: tensor([-0.0177, -0.0259, -0.0120, -0.0282, -0.0312, 0.0004, 0.0263, -0.0114, 0.0354, -0.0022], device='cuda:0'), grad: tensor([-3.1024e-05, 1.3180e-05, -2.5287e-05, 1.1124e-05, 2.1458e-05, -2.1064e-04, 1.1973e-05, 1.5274e-05, 2.2864e-04, -3.4988e-05], device='cuda:0') 100 0.0001 changing lr epoch 120, time 265.84, cls_loss 0.0067 cls_loss_mapping 0.0070 cls_loss_causal 0.5548 re_mapping 0.0088 re_causal 0.0220 /// teacc 98.98 lr 0.00010000 Epoch 122, weight, value: tensor([[-0.0676, -0.0898, -0.0558, ..., -0.0372, 0.1341, 0.1190], [-0.1254, -0.1290, -0.1109, ..., -0.0984, -0.1372, -0.0663], [-0.0830, -0.0635, 0.0983, ..., -0.1113, 0.1325, 0.0282], ..., [-0.0711, 0.0505, 0.0426, ..., 0.1194, -0.0925, -0.1083], [-0.1669, 0.0487, -0.1074, ..., 0.0384, -0.0679, -0.0638], [ 0.0421, -0.0554, -0.0579, ..., -0.0436, -0.0426, -0.0425]], device='cuda:0'), grad: tensor([[ 3.5763e-07, 3.7439e-07, 9.3132e-10, ..., 1.7229e-07, -3.6322e-08, 3.5670e-07], [ 2.8126e-07, 4.6566e-07, 0.0000e+00, ..., 3.2224e-07, 1.7866e-05, 1.1593e-05], [ 1.7695e-07, 1.1548e-06, 9.3132e-10, ..., 2.0377e-06, -4.4316e-05, -3.3647e-05], ..., [ 5.3365e-07, -7.5437e-07, 0.0000e+00, ..., -1.8561e-06, 2.1607e-06, 1.2936e-06], [ 1.3076e-06, 1.2711e-05, 0.0000e+00, ..., -1.7267e-06, 1.0692e-06, 1.1930e-06], [ 4.9360e-07, -1.1541e-05, 0.0000e+00, ..., 8.0187e-07, 1.4920e-06, 9.2201e-07]], device='cuda:0') Epoch 122, bias, value: tensor([-1.8815e-02, -2.6876e-02, -1.1588e-02, -2.8067e-02, -3.0534e-02, 2.2794e-05, 2.5934e-02, -1.1187e-02, 3.5494e-02, -9.9249e-04], device='cuda:0'), grad: tensor([ 9.5665e-06, -4.2468e-05, -1.1522e-04, -7.1287e-04, -1.0513e-05, 7.2145e-04, 1.0979e-04, 9.1419e-06, 7.0155e-05, -3.9339e-05], device='cuda:0') 100 0.0001 changing lr epoch 121, time 265.15, cls_loss 0.0055 cls_loss_mapping 0.0066 cls_loss_causal 0.5497 re_mapping 0.0087 re_causal 0.0221 /// teacc 98.93 lr 0.00010000 Epoch 123, weight, value: tensor([[-0.0679, -0.0903, -0.0559, ..., -0.0373, 0.1348, 0.1198], [-0.1271, -0.1279, -0.1110, ..., -0.0987, -0.1375, -0.0664], [-0.0836, -0.0635, 0.0984, ..., -0.1117, 0.1330, 0.0286], ..., [-0.0711, 0.0500, 0.0426, ..., 0.1199, -0.0932, -0.1091], [-0.1673, 0.0485, -0.1074, ..., 0.0382, -0.0681, -0.0645], [ 0.0423, -0.0549, -0.0579, ..., -0.0435, -0.0431, -0.0437]], device='cuda:0'), grad: tensor([[ 1.7788e-07, 8.4378e-07, 0.0000e+00, ..., 4.5914e-07, -6.1803e-06, -4.0755e-06], [ 1.5087e-07, 6.7148e-07, 0.0000e+00, ..., 4.2841e-07, 9.4157e-07, 3.1386e-07], [ 3.0454e-07, 2.0433e-06, 0.0000e+00, ..., -2.7623e-06, -9.0748e-06, -2.3454e-05], ..., [ 1.2293e-07, 1.9558e-08, 0.0000e+00, ..., -2.5984e-07, 1.1288e-06, 5.1130e-07], [ 1.0598e-06, -6.4895e-06, 0.0000e+00, ..., -7.3574e-08, 2.3376e-07, 2.0996e-05], [-2.7940e-09, 1.4491e-06, 0.0000e+00, ..., 3.9209e-07, 4.7944e-06, 2.1998e-06]], device='cuda:0') Epoch 123, bias, value: tensor([-0.0184, -0.0260, -0.0115, -0.0280, -0.0306, -0.0006, 0.0261, -0.0127, 0.0354, -0.0003], device='cuda:0'), grad: tensor([ 2.2985e-06, -8.1122e-05, -1.7017e-05, 3.8624e-05, -6.3241e-05, 7.4618e-06, 8.1435e-06, 7.1764e-05, -9.0599e-06, 4.1991e-05], device='cuda:0') 100 0.0001 changing lr epoch 122, time 265.36, cls_loss 0.0069 cls_loss_mapping 0.0074 cls_loss_causal 0.5992 re_mapping 0.0088 re_causal 0.0213 /// teacc 98.90 lr 0.00010000 Epoch 124, weight, value: tensor([[-0.0684, -0.0909, -0.0560, ..., -0.0376, 0.1358, 0.1210], [-0.1283, -0.1284, -0.1121, ..., -0.0996, -0.1379, -0.0664], [-0.0836, -0.0641, 0.0993, ..., -0.1124, 0.1334, 0.0289], ..., [-0.0716, 0.0507, 0.0428, ..., 0.1216, -0.0941, -0.1107], [-0.1676, 0.0480, -0.1070, ..., 0.0385, -0.0673, -0.0656], [ 0.0421, -0.0559, -0.0580, ..., -0.0448, -0.0437, -0.0450]], device='cuda:0'), grad: tensor([[ 6.5379e-07, 4.0978e-07, 2.2352e-08, ..., 2.8592e-07, -6.1467e-06, -3.0473e-06], [ 7.0315e-07, 1.2890e-06, 2.9802e-08, ..., 5.6718e-07, 1.3132e-07, 8.7544e-08], [ 5.3924e-07, 4.5076e-07, -3.5297e-06, ..., 3.2969e-07, 3.1758e-07, -1.1828e-07], ..., [ 2.7400e-06, 2.1420e-06, 1.2061e-06, ..., -1.8617e-06, 7.6648e-07, 5.7835e-07], [ 3.2708e-06, 1.8105e-06, 1.1204e-06, ..., 1.0859e-06, 4.2841e-07, 3.6415e-07], [ 4.3400e-06, 5.3979e-06, 1.8626e-09, ..., 1.4929e-06, 2.0768e-06, 1.1083e-06]], device='cuda:0') Epoch 124, bias, value: tensor([-0.0181, -0.0262, -0.0107, -0.0272, -0.0306, -0.0013, 0.0257, -0.0124, 0.0351, -0.0007], device='cuda:0'), grad: tensor([-4.9397e-06, 2.9616e-07, -6.8367e-05, 7.5674e-04, 1.7866e-05, -7.7915e-04, 2.9914e-06, 3.1024e-05, 2.9266e-05, 1.4521e-05], device='cuda:0') 100 0.0001 changing lr epoch 123, time 265.50, cls_loss 0.0058 cls_loss_mapping 0.0068 cls_loss_causal 0.5907 re_mapping 0.0086 re_causal 0.0222 /// teacc 98.90 lr 0.00010000 Epoch 125, weight, value: tensor([[-0.0690, -0.0912, -0.0564, ..., -0.0384, 0.1364, 0.1215], [-0.1286, -0.1285, -0.1127, ..., -0.0996, -0.1381, -0.0664], [-0.0843, -0.0643, 0.0996, ..., -0.1122, 0.1335, 0.0295], ..., [-0.0716, 0.0510, 0.0427, ..., 0.1221, -0.0946, -0.1113], [-0.1675, 0.0478, -0.1065, ..., 0.0388, -0.0671, -0.0668], [ 0.0417, -0.0562, -0.0580, ..., -0.0453, -0.0441, -0.0459]], device='cuda:0'), grad: tensor([[ 7.9349e-07, 1.0701e-06, 0.0000e+00, ..., -7.3109e-07, -5.3756e-06, -6.4299e-06], [ 1.6484e-07, 1.1828e-07, 0.0000e+00, ..., 2.9616e-07, 1.5730e-06, 1.4845e-06], [ 5.4948e-08, 3.9302e-07, 0.0000e+00, ..., 9.5926e-08, 1.0580e-06, 4.7684e-07], ..., [ 3.2596e-07, 2.9430e-07, 0.0000e+00, ..., -2.3283e-08, 1.7602e-07, 1.5553e-07], [ 8.0280e-07, -6.2734e-06, 0.0000e+00, ..., 6.5751e-07, -7.8082e-06, 1.0990e-06], [ 2.7008e-06, 2.8182e-06, 0.0000e+00, ..., 1.1576e-06, 5.6997e-06, 1.6699e-06]], device='cuda:0') Epoch 125, bias, value: tensor([-0.0181, -0.0257, -0.0107, -0.0289, -0.0295, -0.0012, 0.0259, -0.0126, 0.0358, -0.0013], device='cuda:0'), grad: tensor([-6.9067e-06, 3.4459e-06, 3.8035e-06, 1.4031e-04, -1.4268e-05, -1.4412e-04, 8.2925e-06, 5.4240e-06, -1.6361e-05, 2.0251e-05], device='cuda:0') 100 0.0001 changing lr epoch 124, time 265.47, cls_loss 0.0052 cls_loss_mapping 0.0069 cls_loss_causal 0.5725 re_mapping 0.0085 re_causal 0.0214 /// teacc 98.90 lr 0.00010000 Epoch 126, weight, value: tensor([[-0.0693, -0.0923, -0.0565, ..., -0.0376, 0.1365, 0.1218], [-0.1289, -0.1290, -0.1130, ..., -0.1001, -0.1391, -0.0666], [-0.0847, -0.0646, 0.0999, ..., -0.1125, 0.1325, 0.0295], ..., [-0.0720, 0.0515, 0.0428, ..., 0.1230, -0.0957, -0.1121], [-0.1683, 0.0479, -0.1060, ..., 0.0387, -0.0671, -0.0676], [ 0.0415, -0.0571, -0.0580, ..., -0.0460, -0.0443, -0.0464]], device='cuda:0'), grad: tensor([[ 2.5611e-07, 8.7544e-07, 1.8626e-08, ..., 8.7079e-07, -1.0908e-05, -1.4096e-05], [ 5.4017e-08, 2.2445e-06, 9.3132e-10, ..., 4.5262e-06, 6.8367e-05, 6.5506e-05], [ 2.4214e-08, 9.8720e-08, -8.7544e-08, ..., 8.5682e-08, -7.7486e-05, -7.3075e-05], ..., [ 1.3318e-07, -2.8804e-05, 1.9558e-08, ..., -6.2168e-05, 4.8205e-06, 4.1872e-06], [ 1.5516e-06, 1.8626e-08, 8.3819e-09, ..., 2.0918e-06, 1.0431e-06, 1.0375e-06], [ 1.3318e-06, 2.2769e-05, 2.7940e-09, ..., 4.7892e-05, 4.8522e-07, 3.5297e-07]], device='cuda:0') Epoch 126, bias, value: tensor([-0.0182, -0.0258, -0.0113, -0.0288, -0.0291, -0.0012, 0.0269, -0.0126, 0.0355, -0.0014], device='cuda:0'), grad: tensor([-1.4178e-05, 2.4104e-04, -2.5868e-04, 2.8223e-05, 1.9222e-05, -1.9252e-05, 2.8104e-05, -1.2624e-04, 6.4969e-06, 9.5367e-05], device='cuda:0') 100 0.0001 changing lr epoch 125, time 261.93, cls_loss 0.0061 cls_loss_mapping 0.0086 cls_loss_causal 0.5695 re_mapping 0.0082 re_causal 0.0203 /// teacc 98.89 lr 0.00010000 Epoch 127, weight, value: tensor([[-0.0690, -0.0929, -0.0567, ..., -0.0374, 0.1378, 0.1230], [-0.1293, -0.1299, -0.1127, ..., -0.1003, -0.1395, -0.0670], [-0.0852, -0.0651, 0.1001, ..., -0.1138, 0.1332, 0.0302], ..., [-0.0719, 0.0522, 0.0427, ..., 0.1242, -0.0967, -0.1135], [-0.1693, 0.0473, -0.1065, ..., 0.0383, -0.0676, -0.0680], [ 0.0413, -0.0579, -0.0584, ..., -0.0472, -0.0449, -0.0477]], device='cuda:0'), grad: tensor([[ 3.5856e-07, 1.4063e-07, 1.5832e-08, ..., 4.6156e-06, 2.3395e-06, 4.0531e-06], [ 4.8522e-07, 2.9337e-07, 1.3039e-08, ..., 4.5635e-07, 6.6590e-07, 4.5262e-07], [ 4.7684e-07, 4.5355e-07, -1.7695e-07, ..., 1.2964e-06, -4.5635e-06, -5.3085e-08], ..., [ 4.4052e-07, 1.5553e-07, 3.4459e-08, ..., -1.0617e-07, 1.4156e-06, 7.4599e-07], [ 2.7381e-06, 7.8045e-07, 3.8184e-08, ..., -8.2925e-06, -1.5244e-05, -1.5765e-05], [ 3.7905e-07, 4.3958e-07, 2.4866e-07, ..., 1.4752e-06, 8.4471e-07, 5.8208e-07]], device='cuda:0') Epoch 127, bias, value: tensor([-0.0174, -0.0261, -0.0114, -0.0285, -0.0293, -0.0015, 0.0270, -0.0124, 0.0350, -0.0014], device='cuda:0'), grad: tensor([ 2.7850e-05, 5.7463e-07, -3.1237e-06, 1.5342e-04, 5.4501e-06, -1.2016e-04, 1.2271e-05, 5.7593e-06, -5.4181e-05, -2.8074e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 126---------------------------------------------------- epoch 126, time 278.75, cls_loss 0.0070 cls_loss_mapping 0.0083 cls_loss_causal 0.5859 re_mapping 0.0089 re_causal 0.0215 /// teacc 99.01 lr 0.00010000 Epoch 128, weight, value: tensor([[-0.0692, -0.0935, -0.0571, ..., -0.0378, 0.1388, 0.1243], [-0.1299, -0.1309, -0.1132, ..., -0.1021, -0.1404, -0.0673], [-0.0865, -0.0671, 0.1005, ..., -0.1147, 0.1343, 0.0308], ..., [-0.0721, 0.0527, 0.0426, ..., 0.1259, -0.0978, -0.1150], [-0.1709, 0.0469, -0.1069, ..., 0.0361, -0.0676, -0.0707], [ 0.0406, -0.0589, -0.0587, ..., -0.0480, -0.0449, -0.0469]], device='cuda:0'), grad: tensor([[-1.3225e-07, 2.7101e-07, 1.2107e-08, ..., -2.6561e-06, -1.6034e-05, -1.3024e-05], [ 1.3504e-07, 2.7660e-07, 0.0000e+00, ..., 1.9465e-07, 2.3562e-07, -2.5984e-07], [ 6.2399e-08, 1.3318e-07, -2.5146e-08, ..., 2.7847e-07, 3.1665e-08, 1.6671e-07], ..., [ 8.0094e-08, 3.3639e-06, 0.0000e+00, ..., 4.0643e-06, 7.1805e-07, 7.8045e-07], [ 4.7795e-06, 7.0110e-06, 2.7940e-09, ..., 2.3302e-06, 4.2319e-06, 3.6433e-06], [ 3.2689e-06, -4.9621e-06, 0.0000e+00, ..., -1.1064e-05, 4.6287e-07, 4.3586e-07]], device='cuda:0') Epoch 128, bias, value: tensor([-0.0173, -0.0263, -0.0115, -0.0281, -0.0294, 0.0002, 0.0269, -0.0119, 0.0333, -0.0019], device='cuda:0'), grad: tensor([-1.6406e-05, -2.6658e-05, 1.7613e-05, -4.6968e-05, 2.1681e-05, 4.8906e-05, 9.4697e-06, 4.3005e-05, 2.3589e-05, -7.4267e-05], device='cuda:0') 100 0.0001 changing lr epoch 127, time 265.45, cls_loss 0.0057 cls_loss_mapping 0.0062 cls_loss_causal 0.5984 re_mapping 0.0084 re_causal 0.0213 /// teacc 98.83 lr 0.00010000 Epoch 129, weight, value: tensor([[-0.0702, -0.0941, -0.0573, ..., -0.0390, 0.1393, 0.1251], [-0.1304, -0.1313, -0.1133, ..., -0.1026, -0.1406, -0.0674], [-0.0872, -0.0679, 0.1007, ..., -0.1144, 0.1350, 0.0317], ..., [-0.0726, 0.0531, 0.0430, ..., 0.1269, -0.0984, -0.1159], [-0.1716, 0.0469, -0.1071, ..., 0.0361, -0.0679, -0.0714], [ 0.0404, -0.0597, -0.0594, ..., -0.0491, -0.0452, -0.0473]], device='cuda:0'), grad: tensor([[ 2.0582e-07, 3.6974e-07, 4.1910e-08, ..., 4.5262e-07, -1.7602e-07, -7.5437e-08], [ 3.2596e-08, 8.7079e-07, 1.6764e-08, ..., 7.0222e-07, 1.6391e-07, 8.0094e-08], [ 6.3330e-08, 2.9020e-06, -3.9395e-07, ..., 3.8669e-06, -3.4720e-06, -4.2282e-07], ..., [ 9.3132e-09, -8.8811e-05, 1.3970e-08, ..., -5.9038e-05, 1.5926e-07, 8.2888e-08], [ 2.1420e-07, 7.3202e-07, 3.1665e-08, ..., -2.2411e-05, 2.6450e-07, -9.8255e-07], [ 7.3574e-08, 1.1204e-06, 9.3132e-10, ..., 9.8813e-07, 2.5798e-07, 1.3784e-07]], device='cuda:0') Epoch 129, bias, value: tensor([-0.0175, -0.0260, -0.0107, -0.0275, -0.0295, 0.0005, 0.0260, -0.0119, 0.0331, -0.0023], device='cuda:0'), grad: tensor([ 3.5744e-06, 2.0005e-06, -5.1469e-05, 1.0329e-04, 1.5378e-05, 3.1501e-05, 5.4613e-06, -8.4579e-05, -3.9726e-05, 1.4544e-05], device='cuda:0') 100 0.0001 changing lr epoch 128, time 265.56, cls_loss 0.0057 cls_loss_mapping 0.0069 cls_loss_causal 0.5453 re_mapping 0.0082 re_causal 0.0211 /// teacc 98.90 lr 0.00010000 Epoch 130, weight, value: tensor([[-0.0710, -0.0947, -0.0576, ..., -0.0389, 0.1381, 0.1256], [-0.1309, -0.1319, -0.1111, ..., -0.1029, -0.1408, -0.0676], [-0.0877, -0.0676, 0.1021, ..., -0.1145, 0.1360, 0.0327], ..., [-0.0729, 0.0530, 0.0408, ..., 0.1272, -0.1000, -0.1177], [-0.1721, 0.0471, -0.1075, ..., 0.0364, -0.0681, -0.0719], [ 0.0401, -0.0600, -0.0602, ..., -0.0498, -0.0435, -0.0482]], device='cuda:0'), grad: tensor([[ 1.3225e-07, 1.2759e-07, 1.9185e-07, ..., 4.8336e-07, 1.1213e-06, 1.1064e-06], [ 1.2573e-07, 1.3173e-05, 7.4506e-08, ..., 1.4426e-06, 3.2876e-07, 3.4366e-07], [ 4.8429e-08, 2.2352e-07, -3.1833e-06, ..., 1.8906e-07, -1.8865e-05, -1.8656e-05], ..., [ 3.4459e-08, 2.2575e-05, 2.7400e-06, ..., 2.9802e-06, 1.0483e-05, 1.1310e-05], [ 1.8552e-06, -1.6605e-06, 1.9558e-08, ..., -1.1772e-05, 5.7258e-06, 5.1074e-06], [ 1.1083e-07, -6.1840e-07, 3.7253e-09, ..., -2.0321e-06, 2.5146e-08, 2.4214e-08]], device='cuda:0') Epoch 130, bias, value: tensor([-1.9512e-02, -2.5615e-02, -1.0418e-02, -2.7191e-02, -2.9085e-02, 7.2359e-05, 2.5716e-02, -1.2597e-02, 3.3429e-02, -1.1608e-03], device='cuda:0'), grad: tensor([ 4.0084e-06, 1.9163e-05, -4.0025e-05, -5.7638e-05, -1.2862e-06, 2.3425e-05, -6.2771e-07, 7.2002e-05, -1.5870e-06, -1.7509e-05], device='cuda:0') 100 0.0001 changing lr epoch 129, time 265.23, cls_loss 0.0059 cls_loss_mapping 0.0060 cls_loss_causal 0.5636 re_mapping 0.0082 re_causal 0.0211 /// teacc 98.92 lr 0.00010000 Epoch 131, weight, value: tensor([[-0.0715, -0.0953, -0.0577, ..., -0.0389, 0.1384, 0.1260], [-0.1347, -0.1323, -0.1111, ..., -0.1033, -0.1414, -0.0698], [-0.0848, -0.0678, 0.1027, ..., -0.1144, 0.1367, 0.0349], ..., [-0.0732, 0.0524, 0.0402, ..., 0.1269, -0.1013, -0.1196], [-0.1729, 0.0478, -0.1077, ..., 0.0367, -0.0684, -0.0730], [ 0.0396, -0.0605, -0.0601, ..., -0.0500, -0.0437, -0.0487]], device='cuda:0'), grad: tensor([[ 2.9746e-06, 4.4983e-07, 8.3819e-09, ..., 2.6077e-06, -2.4904e-06, 4.4424e-07], [ 8.7079e-07, 7.0520e-06, 5.4017e-08, ..., 7.0184e-06, 3.2131e-07, 3.3528e-07], [ 5.8021e-07, 2.2147e-06, -4.2841e-08, ..., 2.8536e-06, -4.7777e-07, -5.9977e-07], ..., [ 1.8906e-07, -3.9124e-04, -7.6368e-08, ..., -3.2806e-04, 5.2433e-07, 5.3830e-07], [ 9.2506e-05, 1.4743e-06, 3.7253e-09, ..., 9.7871e-05, 1.7852e-05, 2.6047e-05], [ 8.1025e-07, 1.4313e-05, 1.7695e-08, ..., 1.2875e-05, 1.1679e-06, 8.0653e-07]], device='cuda:0') Epoch 131, bias, value: tensor([-0.0196, -0.0266, -0.0089, -0.0265, -0.0290, -0.0003, 0.0264, -0.0134, 0.0334, -0.0010], device='cuda:0'), grad: tensor([ 6.7651e-06, 1.0377e-04, 6.2324e-06, 6.8998e-04, -1.1033e-04, -2.3341e-04, -1.2696e-04, -7.3147e-04, 3.4618e-04, 4.8310e-05], device='cuda:0') 100 0.0001 changing lr epoch 130, time 265.15, cls_loss 0.0061 cls_loss_mapping 0.0064 cls_loss_causal 0.5444 re_mapping 0.0082 re_causal 0.0212 /// teacc 98.97 lr 0.00010000 Epoch 132, weight, value: tensor([[-0.0721, -0.0964, -0.0578, ..., -0.0393, 0.1387, 0.1262], [-0.1354, -0.1326, -0.1111, ..., -0.1031, -0.1416, -0.0696], [-0.0849, -0.0684, 0.1024, ..., -0.1152, 0.1369, 0.0353], ..., [-0.0731, 0.0532, 0.0413, ..., 0.1282, -0.1017, -0.1207], [-0.1739, 0.0479, -0.1078, ..., 0.0368, -0.0686, -0.0737], [ 0.0401, -0.0610, -0.0605, ..., -0.0503, -0.0438, -0.0492]], device='cuda:0'), grad: tensor([[ 1.3383e-06, 5.4017e-08, 2.4214e-08, ..., 4.2841e-08, -2.3283e-07, 7.6927e-07], [ 9.4995e-08, 1.1548e-07, 1.2107e-08, ..., 6.7055e-08, 5.6811e-08, 6.1467e-08], [ 2.6263e-07, 7.8045e-07, 1.4529e-07, ..., 6.6124e-07, -7.9069e-07, -1.1427e-06], ..., [ 3.4459e-08, -1.1837e-06, -1.7136e-07, ..., -1.0915e-06, 3.0734e-07, 3.9488e-07], [ 3.5111e-07, 2.5705e-07, 5.8673e-08, ..., 1.5926e-07, 4.5914e-07, 5.5041e-07], [ 4.7497e-08, -1.3504e-07, -1.1828e-07, ..., 1.8068e-07, 1.1828e-07, 9.4064e-08]], device='cuda:0') Epoch 132, bias, value: tensor([-0.0198, -0.0264, -0.0091, -0.0247, -0.0302, -0.0023, 0.0268, -0.0125, 0.0331, -0.0006], device='cuda:0'), grad: tensor([ 2.9057e-06, -5.3458e-06, -8.2236e-07, 1.5311e-06, 3.0816e-05, 2.1458e-06, -3.1918e-05, -2.3562e-07, 4.4852e-06, -3.5297e-06], device='cuda:0') 100 0.0001 changing lr epoch 131, time 265.42, cls_loss 0.0058 cls_loss_mapping 0.0062 cls_loss_causal 0.5757 re_mapping 0.0080 re_causal 0.0205 /// teacc 98.94 lr 0.00010000 Epoch 133, weight, value: tensor([[-0.0743, -0.0968, -0.0584, ..., -0.0398, 0.1374, 0.1268], [-0.1356, -0.1330, -0.1114, ..., -0.1039, -0.1417, -0.0697], [-0.0852, -0.0688, 0.1025, ..., -0.1157, 0.1371, 0.0351], ..., [-0.0735, 0.0536, 0.0417, ..., 0.1286, -0.1020, -0.1210], [-0.1752, 0.0486, -0.1082, ..., 0.0372, -0.0689, -0.0746], [ 0.0400, -0.0618, -0.0611, ..., -0.0508, -0.0419, -0.0500]], device='cuda:0'), grad: tensor([[ 1.4435e-07, 7.9162e-08, 1.3970e-08, ..., 1.9651e-07, -1.8254e-05, -7.4767e-06], [ 8.4750e-08, -6.1877e-06, 2.7940e-09, ..., 8.7172e-06, 6.8173e-07, 6.9849e-08], [ 5.5879e-08, 4.2561e-07, 6.5193e-09, ..., 1.7835e-06, 1.3690e-07, 1.5646e-07], ..., [ 1.2759e-07, 5.2191e-06, -1.3039e-08, ..., 2.1141e-07, 1.3225e-07, 3.3528e-08], [ 8.7321e-06, -7.2420e-06, 3.7253e-09, ..., 1.2644e-05, -3.4738e-07, 3.8091e-07], [ 2.3786e-06, 5.3346e-06, 6.5193e-09, ..., 1.4380e-05, 1.5467e-05, 6.1616e-06]], device='cuda:0') Epoch 133, bias, value: tensor([-0.0214, -0.0262, -0.0095, -0.0249, -0.0303, -0.0022, 0.0269, -0.0125, 0.0332, 0.0008], device='cuda:0'), grad: tensor([-2.0459e-05, -7.8011e-04, 6.4850e-04, 2.9266e-05, -9.0361e-05, -2.5094e-05, 1.4849e-05, 1.2887e-04, 1.8561e-06, 9.2268e-05], device='cuda:0') 100 0.0001 changing lr epoch 132, time 264.84, cls_loss 0.0062 cls_loss_mapping 0.0054 cls_loss_causal 0.5834 re_mapping 0.0079 re_causal 0.0199 /// teacc 98.88 lr 0.00010000 Epoch 134, weight, value: tensor([[-0.0746, -0.0984, -0.0590, ..., -0.0400, 0.1378, 0.1284], [-0.1360, -0.1339, -0.1115, ..., -0.1051, -0.1422, -0.0699], [-0.0854, -0.0696, 0.1024, ..., -0.1176, 0.1372, 0.0349], ..., [-0.0736, 0.0539, 0.0419, ..., 0.1296, -0.1024, -0.1214], [-0.1766, 0.0496, -0.1086, ..., 0.0381, -0.0686, -0.0754], [ 0.0392, -0.0626, -0.0610, ..., -0.0519, -0.0418, -0.0521]], device='cuda:0'), grad: tensor([[ 5.0385e-07, 3.4366e-07, 5.0850e-07, ..., 4.7125e-07, -9.8720e-08, -7.4040e-07], [ 1.0990e-07, 3.2037e-07, 7.1712e-08, ..., 2.9616e-07, 4.9267e-07, 8.7544e-08], [ 2.3190e-07, 3.2969e-07, 1.2666e-07, ..., 1.7788e-07, 1.2275e-06, 7.2643e-08], ..., [-2.5053e-07, -3.9786e-06, 1.1176e-07, ..., -5.1521e-06, 6.9570e-07, 8.4750e-08], [ 2.0023e-06, 2.2352e-07, 3.1013e-07, ..., 1.0738e-06, 1.4622e-06, 2.9057e-07], [ 2.5705e-06, 2.4382e-06, 1.7788e-07, ..., 3.4589e-06, 2.7865e-06, 5.3272e-07]], device='cuda:0') Epoch 134, bias, value: tensor([-0.0216, -0.0265, -0.0100, -0.0250, -0.0295, -0.0016, 0.0259, -0.0121, 0.0338, 0.0005], device='cuda:0'), grad: tensor([ 3.2894e-06, 3.1702e-06, 4.2617e-06, 2.0750e-06, -1.3888e-04, -8.3074e-06, 1.9085e-04, -4.2319e-06, 9.3877e-06, -6.1631e-05], device='cuda:0') 100 0.0001 changing lr epoch 133, time 265.03, cls_loss 0.0079 cls_loss_mapping 0.0069 cls_loss_causal 0.5588 re_mapping 0.0080 re_causal 0.0200 /// teacc 98.99 lr 0.00010000 Epoch 135, weight, value: tensor([[-0.0750, -0.0991, -0.0602, ..., -0.0405, 0.1383, 0.1310], [-0.1367, -0.1344, -0.1117, ..., -0.1058, -0.1432, -0.0699], [-0.0857, -0.0701, 0.1031, ..., -0.1186, 0.1370, 0.0341], ..., [-0.0738, 0.0532, 0.0417, ..., 0.1305, -0.1027, -0.1218], [-0.1776, 0.0496, -0.1089, ..., 0.0380, -0.0689, -0.0764], [ 0.0390, -0.0627, -0.0613, ..., -0.0520, -0.0410, -0.0535]], device='cuda:0'), grad: tensor([[ 3.1851e-07, 2.8685e-07, 8.3819e-09, ..., 3.1106e-07, -8.2422e-07, -7.3388e-07], [ 5.2992e-07, 6.1654e-07, 1.6764e-08, ..., 6.0424e-06, 3.1572e-07, 7.0781e-08], [ 1.1735e-07, 3.3993e-07, -3.5111e-07, ..., 4.1798e-06, -1.3234e-06, -8.1025e-08], ..., [ 1.1912e-06, -5.4389e-07, 1.6764e-08, ..., -1.5842e-06, 1.3784e-07, 2.7008e-08], [ 2.3246e-06, -4.0606e-06, 1.2107e-08, ..., -1.1906e-05, -1.4687e-06, 4.4052e-07], [ 1.4603e-06, 2.9821e-06, 9.3132e-10, ..., 1.6605e-06, 5.9512e-07, 3.6042e-07]], device='cuda:0') Epoch 135, bias, value: tensor([-0.0223, -0.0257, -0.0110, -0.0256, -0.0304, -0.0011, 0.0256, -0.0119, 0.0336, 0.0016], device='cuda:0'), grad: tensor([ 5.8860e-07, -4.8423e-04, 3.9673e-04, -8.1491e-04, 2.2426e-05, 8.6403e-04, -2.6803e-06, 6.9141e-05, -6.3479e-05, 1.3307e-05], device='cuda:0') 100 0.0001 changing lr epoch 134, time 265.59, cls_loss 0.0055 cls_loss_mapping 0.0058 cls_loss_causal 0.5690 re_mapping 0.0079 re_causal 0.0201 /// teacc 98.94 lr 0.00010000 Epoch 136, weight, value: tensor([[-0.0783, -0.0999, -0.0614, ..., -0.0413, 0.1361, 0.1287], [-0.1371, -0.1349, -0.1118, ..., -0.1060, -0.1436, -0.0700], [-0.0858, -0.0705, 0.1034, ..., -0.1189, 0.1376, 0.0344], ..., [-0.0741, 0.0541, 0.0419, ..., 0.1317, -0.1032, -0.1222], [-0.1778, 0.0493, -0.1089, ..., 0.0381, -0.0690, -0.0770], [ 0.0388, -0.0633, -0.0615, ..., -0.0526, -0.0411, -0.0546]], device='cuda:0'), grad: tensor([[ 1.9185e-07, 1.2293e-07, 7.4506e-09, ..., 5.0291e-08, -1.4333e-06, -7.1619e-07], [ 6.0536e-07, 1.5842e-06, 0.0000e+00, ..., 8.2888e-07, 1.1642e-07, 5.6811e-08], [ 2.7567e-07, 6.5472e-07, 7.4506e-09, ..., 7.9162e-08, -9.3132e-10, 5.5879e-08], ..., [ 3.2689e-07, -1.1995e-06, 0.0000e+00, ..., -2.1067e-06, 6.6124e-08, 2.7940e-08], [ 3.8967e-06, 5.9605e-06, 9.3132e-10, ..., 2.5891e-07, 2.2817e-07, 1.0338e-07], [ 1.8440e-06, 1.8505e-06, 0.0000e+00, ..., 1.0962e-06, 9.5740e-07, 4.7963e-07]], device='cuda:0') Epoch 136, bias, value: tensor([-0.0242, -0.0256, -0.0110, -0.0265, -0.0302, -0.0005, 0.0276, -0.0116, 0.0335, 0.0015], device='cuda:0'), grad: tensor([-1.1129e-06, -3.3110e-05, 5.6624e-06, -2.8372e-05, -5.6475e-06, 1.5080e-05, 2.0824e-06, -4.0140e-07, 1.5363e-05, 3.0473e-05], device='cuda:0') 100 0.0001 changing lr epoch 135, time 265.17, cls_loss 0.0063 cls_loss_mapping 0.0085 cls_loss_causal 0.5731 re_mapping 0.0081 re_causal 0.0200 /// teacc 98.97 lr 0.00010000 Epoch 137, weight, value: tensor([[-0.0784, -0.1008, -0.0619, ..., -0.0414, 0.1366, 0.1295], [-0.1377, -0.1354, -0.1119, ..., -0.1075, -0.1439, -0.0705], [-0.0861, -0.0712, 0.1034, ..., -0.1201, 0.1382, 0.0346], ..., [-0.0745, 0.0541, 0.0419, ..., 0.1329, -0.1044, -0.1231], [-0.1785, 0.0494, -0.1089, ..., 0.0381, -0.0692, -0.0777], [ 0.0383, -0.0635, -0.0618, ..., -0.0530, -0.0408, -0.0569]], device='cuda:0'), grad: tensor([[ 2.1793e-07, 3.6508e-07, 0.0000e+00, ..., 1.5367e-07, 9.2201e-08, 4.0978e-08], [ 3.0175e-07, 5.1782e-07, 0.0000e+00, ..., 8.8569e-07, 5.7742e-08, 2.2352e-08], [ 8.0839e-07, 2.0657e-06, 0.0000e+00, ..., 1.1427e-06, -8.2888e-08, -1.7881e-07], ..., [ 1.1269e-07, -4.9826e-07, 0.0000e+00, ..., -5.1688e-07, 2.1420e-08, 1.3039e-08], [-1.1317e-05, -2.6971e-05, 0.0000e+00, ..., -9.0301e-06, -4.4927e-06, -1.2740e-06], [ 4.8708e-07, 1.2862e-06, 0.0000e+00, ..., 1.8273e-06, 9.6858e-08, 2.9802e-08]], device='cuda:0') Epoch 137, bias, value: tensor([-0.0242, -0.0255, -0.0111, -0.0273, -0.0303, 0.0005, 0.0270, -0.0119, 0.0333, 0.0022], device='cuda:0'), grad: tensor([ 2.7101e-06, 8.8394e-05, 6.1430e-06, 7.4059e-06, -2.7180e-04, 2.4855e-05, 4.3988e-05, 1.5676e-05, -8.3447e-05, 1.6582e-04], device='cuda:0') 100 0.0001 changing lr epoch 136, time 265.39, cls_loss 0.0054 cls_loss_mapping 0.0055 cls_loss_causal 0.5564 re_mapping 0.0080 re_causal 0.0199 /// teacc 98.94 lr 0.00010000 Epoch 138, weight, value: tensor([[-0.0787, -0.1014, -0.0622, ..., -0.0423, 0.1369, 0.1298], [-0.1380, -0.1360, -0.1119, ..., -0.1080, -0.1443, -0.0707], [-0.0864, -0.0718, 0.1036, ..., -0.1208, 0.1379, 0.0342], ..., [-0.0749, 0.0538, 0.0417, ..., 0.1335, -0.1048, -0.1236], [-0.1791, 0.0493, -0.1091, ..., 0.0381, -0.0694, -0.0787], [ 0.0382, -0.0642, -0.0613, ..., -0.0535, -0.0411, -0.0589]], device='cuda:0'), grad: tensor([[ 1.7229e-06, 5.5879e-09, 3.7253e-09, ..., 1.3597e-07, -9.9421e-05, -6.0499e-05], [ 1.2014e-07, 7.6368e-08, 1.1176e-08, ..., 8.6613e-08, 3.5495e-05, 2.1547e-05], [ 6.4261e-08, 6.7987e-08, 5.3085e-08, ..., 4.1351e-07, 9.1344e-06, 5.5693e-06], ..., [ 1.0058e-07, 6.5193e-09, 9.4064e-08, ..., -6.5193e-09, 8.3260e-07, 5.0850e-07], [ 5.0664e-07, 6.5193e-08, 1.8626e-08, ..., -1.3560e-06, 7.6368e-06, 4.6454e-06], [ 2.6729e-07, -4.9360e-08, -1.4156e-07, ..., 4.8149e-07, 4.3847e-06, 2.6952e-06]], device='cuda:0') Epoch 138, bias, value: tensor([-0.0241, -0.0251, -0.0114, -0.0271, -0.0300, 0.0003, 0.0272, -0.0120, 0.0333, 0.0018], device='cuda:0'), grad: tensor([-1.5879e-04, 5.0187e-05, 1.9193e-05, 4.9025e-06, -1.7295e-06, 6.6943e-06, 5.7906e-05, 4.0904e-06, 5.0440e-06, 1.2323e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 137---------------------------------------------------- epoch 137, time 281.77, cls_loss 0.0054 cls_loss_mapping 0.0053 cls_loss_causal 0.5253 re_mapping 0.0078 re_causal 0.0192 /// teacc 99.02 lr 0.00010000 Epoch 139, weight, value: tensor([[-0.0790, -0.1023, -0.0624, ..., -0.0425, 0.1374, 0.1303], [-0.1394, -0.1363, -0.1120, ..., -0.1083, -0.1449, -0.0720], [-0.0853, -0.0721, 0.1037, ..., -0.1210, 0.1374, 0.0347], ..., [-0.0751, 0.0539, 0.0417, ..., 0.1338, -0.1054, -0.1242], [-0.1797, 0.0501, -0.1093, ..., 0.0388, -0.0690, -0.0795], [ 0.0385, -0.0646, -0.0613, ..., -0.0540, -0.0412, -0.0600]], device='cuda:0'), grad: tensor([[ 1.5181e-07, 8.3819e-08, 0.0000e+00, ..., 4.6566e-08, 5.7183e-07, 1.0803e-07], [ 6.0536e-08, 3.2503e-07, 0.0000e+00, ..., 1.5367e-07, 1.9558e-08, 2.7940e-09], [ 2.0117e-07, 5.6531e-07, 0.0000e+00, ..., 6.9849e-08, -1.4782e-05, -3.6322e-08], ..., [ 7.4506e-08, -1.2927e-06, 9.3132e-10, ..., -1.0096e-06, 1.0626e-06, 1.2107e-08], [ 4.8615e-07, 6.4541e-07, 9.3132e-10, ..., 5.0757e-07, 1.0710e-07, 1.8626e-08], [-2.0172e-06, 2.1700e-07, 9.3132e-10, ..., 2.8778e-07, 2.4214e-08, 1.8626e-09]], device='cuda:0') Epoch 139, bias, value: tensor([-0.0240, -0.0255, -0.0106, -0.0277, -0.0312, 0.0007, 0.0274, -0.0123, 0.0337, 0.0022], device='cuda:0'), grad: tensor([ 1.0617e-06, -1.2415e-06, -1.5259e-05, -3.8184e-08, 7.4565e-05, 1.1874e-06, 1.1660e-06, 9.6764e-07, 2.4363e-06, -6.4850e-05], device='cuda:0') 100 0.0001 changing lr epoch 138, time 263.83, cls_loss 0.0054 cls_loss_mapping 0.0053 cls_loss_causal 0.5331 re_mapping 0.0080 re_causal 0.0194 /// teacc 98.85 lr 0.00010000 Epoch 140, weight, value: tensor([[-0.0792, -0.1029, -0.0625, ..., -0.0428, 0.1377, 0.1305], [-0.1396, -0.1371, -0.1119, ..., -0.1088, -0.1461, -0.0735], [-0.0854, -0.0731, 0.1039, ..., -0.1226, 0.1384, 0.0358], ..., [-0.0754, 0.0543, 0.0418, ..., 0.1344, -0.1059, -0.1248], [-0.1809, 0.0501, -0.1095, ..., 0.0384, -0.0690, -0.0802], [ 0.0390, -0.0651, -0.0617, ..., -0.0543, -0.0411, -0.0608]], device='cuda:0'), grad: tensor([[ 5.6550e-06, 2.5844e-07, 1.3132e-07, ..., -1.3344e-05, -3.1322e-05, -2.1607e-05], [ 2.3562e-07, 6.8033e-07, 3.2596e-07, ..., 7.7719e-07, 4.7358e-07, 2.3004e-07], [ 2.4168e-07, 1.9390e-06, 9.3272e-07, ..., 2.4233e-06, 1.6056e-06, 1.1344e-06], ..., [ 1.2340e-07, -7.1898e-06, -3.2149e-06, ..., 4.6082e-06, 2.9996e-05, 2.0266e-05], [ 7.9442e-07, 1.2023e-06, 6.8033e-07, ..., 1.3625e-06, 6.9803e-07, 3.9814e-07], [ 7.3574e-07, 3.8510e-07, -4.7311e-07, ..., 7.8883e-07, 1.6065e-06, 1.0477e-06]], device='cuda:0') Epoch 140, bias, value: tensor([-0.0240, -0.0258, -0.0110, -0.0276, -0.0312, 0.0012, 0.0270, -0.0116, 0.0332, 0.0023], device='cuda:0'), grad: tensor([-4.2975e-05, -1.8671e-05, 8.2105e-06, 1.3113e-05, 1.9208e-05, 1.0371e-04, -1.1522e-04, 3.1233e-05, 1.2629e-05, -1.1258e-05], device='cuda:0') 100 0.0001 changing lr epoch 139, time 263.27, cls_loss 0.0050 cls_loss_mapping 0.0070 cls_loss_causal 0.5478 re_mapping 0.0083 re_causal 0.0203 /// teacc 98.98 lr 0.00010000 Epoch 141, weight, value: tensor([[-0.0785, -0.1045, -0.0628, ..., -0.0431, 0.1382, 0.1314], [-0.1400, -0.1376, -0.1116, ..., -0.1089, -0.1466, -0.0737], [-0.0858, -0.0735, 0.1039, ..., -0.1228, 0.1398, 0.0380], ..., [-0.0756, 0.0528, 0.0420, ..., 0.1344, -0.1067, -0.1254], [-0.1817, 0.0500, -0.1100, ..., 0.0379, -0.0690, -0.0824], [ 0.0370, -0.0658, -0.0621, ..., -0.0549, -0.0422, -0.0654]], device='cuda:0'), grad: tensor([[ 4.7218e-07, 4.0047e-08, 1.2107e-08, ..., 4.4703e-08, -7.5437e-07, -3.4831e-07], [ 5.6028e-06, 1.4156e-07, 1.6764e-08, ..., 1.3784e-07, 4.8429e-08, 2.2352e-08], [ 1.2089e-06, -7.0035e-07, -2.0023e-07, ..., 9.1270e-08, -2.8387e-06, -1.6624e-06], ..., [ 6.0536e-07, 4.9360e-08, 1.3132e-07, ..., -2.0582e-07, 2.2780e-06, 1.3821e-06], [ 2.9709e-07, -1.4603e-05, 7.7300e-08, ..., -2.7671e-05, 1.7602e-07, 8.4750e-08], [ 9.1456e-07, 1.3597e-07, -3.4925e-07, ..., -1.4063e-07, 6.2864e-07, 2.8778e-07]], device='cuda:0') Epoch 141, bias, value: tensor([-0.0236, -0.0255, -0.0103, -0.0261, -0.0313, 0.0005, 0.0271, -0.0121, 0.0323, 0.0016], device='cuda:0'), grad: tensor([ 2.3246e-06, 2.9668e-05, 3.7216e-06, 3.2112e-06, -1.4794e-04, 3.1263e-05, 1.0329e-04, 1.3068e-05, -3.3408e-05, -5.4501e-06], device='cuda:0') 100 0.0001 changing lr epoch 140, time 262.06, cls_loss 0.0055 cls_loss_mapping 0.0068 cls_loss_causal 0.5637 re_mapping 0.0077 re_causal 0.0199 /// teacc 98.94 lr 0.00010000 Epoch 142, weight, value: tensor([[-0.0787, -0.1056, -0.0632, ..., -0.0436, 0.1386, 0.1317], [-0.1406, -0.1381, -0.1116, ..., -0.1097, -0.1470, -0.0738], [-0.0862, -0.0738, 0.1052, ..., -0.1233, 0.1410, 0.0389], ..., [-0.0757, 0.0532, 0.0422, ..., 0.1354, -0.1071, -0.1261], [-0.1825, 0.0500, -0.1103, ..., 0.0381, -0.0696, -0.0833], [ 0.0358, -0.0669, -0.0630, ..., -0.0580, -0.0427, -0.0666]], device='cuda:0'), grad: tensor([[ 8.3167e-07, 1.6764e-08, 4.8429e-08, ..., 9.1176e-07, -3.0641e-07, -1.2480e-07], [ 7.4133e-06, 1.5367e-07, 2.6543e-07, ..., 9.5069e-06, 2.4680e-07, 3.4925e-07], [ 4.7497e-08, 2.2799e-06, 1.9744e-07, ..., 1.4463e-06, -1.5192e-05, -2.2367e-05], ..., [ 5.2191e-06, -3.0585e-06, 9.7752e-06, ..., 2.4632e-05, 1.5087e-07, 2.0303e-07], [ 3.6567e-05, -2.4214e-07, 5.4017e-08, ..., 3.8058e-05, 1.3217e-05, 2.0444e-05], [ 5.8524e-06, 1.1921e-07, -1.0528e-05, ..., -1.7256e-05, 8.6334e-07, 3.3528e-07]], device='cuda:0') Epoch 142, bias, value: tensor([-0.0234, -0.0255, -0.0101, -0.0266, -0.0293, 0.0006, 0.0273, -0.0117, 0.0323, -0.0001], device='cuda:0'), grad: tensor([ 3.3565e-06, 1.9923e-05, -6.9261e-05, 2.8357e-05, -9.5427e-05, -1.3566e-04, 7.6443e-06, 1.9360e-04, 1.3924e-04, -9.1791e-05], device='cuda:0') 100 0.0001 changing lr epoch 141, time 263.44, cls_loss 0.0054 cls_loss_mapping 0.0048 cls_loss_causal 0.5554 re_mapping 0.0076 re_causal 0.0193 /// teacc 98.93 lr 0.00010000 Epoch 143, weight, value: tensor([[-0.0788, -0.1059, -0.0634, ..., -0.0438, 0.1388, 0.1321], [-0.1418, -0.1388, -0.1116, ..., -0.1103, -0.1472, -0.0740], [-0.0867, -0.0743, 0.1055, ..., -0.1239, 0.1417, 0.0388], ..., [-0.0765, 0.0533, 0.0422, ..., 0.1360, -0.1082, -0.1266], [-0.1832, 0.0500, -0.1102, ..., 0.0382, -0.0698, -0.0835], [ 0.0349, -0.0676, -0.0632, ..., -0.0583, -0.0430, -0.0674]], device='cuda:0'), grad: tensor([[ 1.1194e-06, 2.1420e-08, 3.3528e-08, ..., 9.2201e-08, -7.0315e-07, -3.4086e-07], [ 6.2399e-08, 1.0524e-07, 2.4177e-06, ..., 3.7979e-06, -2.0117e-07, 1.5832e-08], [ 4.5635e-08, -1.1455e-07, 1.5832e-08, ..., 1.9372e-07, -5.3644e-07, -3.6322e-07], ..., [ 4.0978e-08, -3.8091e-07, 7.9535e-07, ..., 7.5065e-07, 1.2107e-07, 5.3085e-08], [ 2.4199e-05, 1.2200e-07, 5.4576e-07, ..., 1.8962e-06, 3.7998e-07, 2.0768e-07], [-3.5435e-05, 1.0617e-07, -4.2617e-06, ..., -8.2031e-06, 4.3679e-07, 2.1327e-07]], device='cuda:0') Epoch 143, bias, value: tensor([-0.0235, -0.0255, -0.0102, -0.0273, -0.0293, 0.0016, 0.0275, -0.0121, 0.0326, -0.0002], device='cuda:0'), grad: tensor([ 4.9360e-06, 6.4194e-05, 3.2391e-06, 2.7895e-05, 1.6242e-05, 1.5110e-05, 2.8238e-06, 2.6271e-05, 1.2279e-04, -2.8348e-04], device='cuda:0') 100 0.0001 changing lr epoch 142, time 265.29, cls_loss 0.0048 cls_loss_mapping 0.0051 cls_loss_causal 0.5091 re_mapping 0.0074 re_causal 0.0185 /// teacc 98.97 lr 0.00010000 Epoch 144, weight, value: tensor([[-0.0791, -0.1051, -0.0635, ..., -0.0441, 0.1397, 0.1330], [-0.1421, -0.1400, -0.1117, ..., -0.1116, -0.1486, -0.0756], [-0.0868, -0.0748, 0.1056, ..., -0.1247, 0.1428, 0.0399], ..., [-0.0769, 0.0539, 0.0422, ..., 0.1372, -0.1088, -0.1271], [-0.1835, 0.0492, -0.1104, ..., 0.0391, -0.0689, -0.0840], [ 0.0347, -0.0681, -0.0630, ..., -0.0586, -0.0439, -0.0692]], device='cuda:0'), grad: tensor([[ 3.2652e-06, 5.2154e-08, 1.8626e-09, ..., 1.7695e-08, 3.2932e-06, 4.2990e-06], [ 3.6974e-07, 1.7509e-07, 9.3132e-10, ..., 1.3039e-08, 3.3062e-07, -1.2359e-06], [ 2.7362e-06, 1.1735e-07, 0.0000e+00, ..., 1.2107e-08, 3.5074e-06, 3.9078e-06], ..., [ 1.5926e-07, 1.6764e-07, 1.8626e-09, ..., -1.2107e-08, 3.6322e-08, 8.2888e-08], [ 2.7493e-05, 5.6438e-07, 9.3132e-10, ..., 2.0862e-07, 3.5048e-05, 3.5971e-05], [ 5.6345e-07, 2.2165e-07, -5.5879e-09, ..., 4.4703e-08, 1.0505e-06, 7.5903e-07]], device='cuda:0') Epoch 144, bias, value: tensor([-0.0228, -0.0258, -0.0097, -0.0274, -0.0297, 0.0017, 0.0268, -0.0120, 0.0330, -0.0006], device='cuda:0'), grad: tensor([ 1.1384e-05, -7.9796e-06, 1.0796e-05, -1.0375e-06, 1.7453e-06, 1.1690e-05, -1.1635e-04, 1.5255e-06, 8.8692e-05, -5.2433e-07], device='cuda:0') 100 0.0001 changing lr epoch 143, time 265.37, cls_loss 0.0056 cls_loss_mapping 0.0065 cls_loss_causal 0.5403 re_mapping 0.0076 re_causal 0.0194 /// teacc 98.91 lr 0.00010000 Epoch 145, weight, value: tensor([[-0.0791, -0.1051, -0.0637, ..., -0.0442, 0.1398, 0.1331], [-0.1422, -0.1379, -0.1118, ..., -0.1094, -0.1492, -0.0760], [-0.0870, -0.0750, 0.1067, ..., -0.1256, 0.1432, 0.0401], ..., [-0.0771, 0.0532, 0.0414, ..., 0.1369, -0.1092, -0.1277], [-0.1853, 0.0496, -0.1105, ..., 0.0386, -0.0692, -0.0845], [ 0.0345, -0.0693, -0.0629, ..., -0.0600, -0.0441, -0.0697]], device='cuda:0'), grad: tensor([[ 1.3309e-06, 5.6811e-08, 6.5193e-09, ..., 3.4459e-08, -5.0701e-06, -3.3099e-06], [ 1.1045e-06, 2.3935e-07, 1.7695e-08, ..., 2.9150e-07, 2.5705e-07, -3.3528e-08], [ 1.4929e-06, 3.2503e-07, 1.8626e-08, ..., 3.4645e-07, 3.2596e-08, 5.1502e-07], ..., [ 2.5611e-06, -1.8235e-06, -7.2643e-08, ..., -1.8906e-06, 3.6694e-07, 2.3097e-07], [ 3.2246e-05, 2.1569e-06, 7.7672e-07, ..., 3.1199e-07, 1.7444e-06, 1.1194e-06], [-2.1744e-04, -1.6801e-06, -8.4471e-07, ..., 5.6159e-07, 7.3854e-07, 4.8056e-07]], device='cuda:0') Epoch 145, bias, value: tensor([-0.0227, -0.0229, -0.0097, -0.0278, -0.0294, 0.0021, 0.0269, -0.0146, 0.0326, -0.0009], device='cuda:0'), grad: tensor([-4.9658e-06, -2.5854e-06, 1.1094e-05, 6.1952e-06, -4.8637e-05, 3.7789e-04, 1.0258e-04, 9.7156e-06, 1.1081e-04, -5.6124e-04], device='cuda:0') 100 0.0001 changing lr epoch 144, time 264.97, cls_loss 0.0058 cls_loss_mapping 0.0062 cls_loss_causal 0.5481 re_mapping 0.0075 re_causal 0.0196 /// teacc 99.02 lr 0.00010000 Epoch 146, weight, value: tensor([[-0.0791, -0.1058, -0.0643, ..., -0.0443, 0.1402, 0.1336], [-0.1445, -0.1402, -0.1119, ..., -0.1122, -0.1496, -0.0755], [-0.0852, -0.0749, 0.1072, ..., -0.1259, 0.1436, 0.0399], ..., [-0.0777, 0.0546, 0.0411, ..., 0.1393, -0.1100, -0.1286], [-0.1866, 0.0493, -0.1108, ..., 0.0382, -0.0695, -0.0851], [ 0.0344, -0.0698, -0.0630, ..., -0.0609, -0.0446, -0.0712]], device='cuda:0'), grad: tensor([[ 2.2538e-07, 6.5193e-09, 0.0000e+00, ..., 4.7591e-07, -3.9414e-06, -1.1176e-06], [ 1.4156e-07, 4.3772e-08, 0.0000e+00, ..., 3.5204e-07, 6.5491e-06, 1.1168e-05], [ 6.7055e-08, 4.1910e-08, 0.0000e+00, ..., 1.3504e-07, -1.4976e-05, -2.4632e-05], ..., [ 6.7335e-07, -5.0291e-08, 0.0000e+00, ..., 3.1404e-06, 3.1050e-06, 4.6864e-06], [ 1.6028e-06, -3.5390e-08, 0.0000e+00, ..., 3.2876e-06, 1.9111e-06, 1.4473e-06], [ 6.1747e-07, 8.8476e-08, 9.3132e-10, ..., -1.0822e-06, 2.2780e-06, 2.1793e-06]], device='cuda:0') Epoch 146, bias, value: tensor([-2.2472e-02, -2.4970e-02, -8.9712e-03, -2.8102e-02, -3.0582e-02, 1.8346e-03, 2.6861e-02, -1.2489e-02, 3.1929e-02, -9.4486e-05], device='cuda:0'), grad: tensor([ 5.2787e-06, 6.8784e-05, -1.8716e-04, 2.3007e-05, 2.3156e-05, -1.1496e-05, 6.8173e-06, 5.8979e-05, 1.4797e-05, -2.1309e-06], device='cuda:0') 100 0.0001 changing lr epoch 145, time 265.71, cls_loss 0.0063 cls_loss_mapping 0.0062 cls_loss_causal 0.5588 re_mapping 0.0075 re_causal 0.0187 /// teacc 98.99 lr 0.00010000 Epoch 147, weight, value: tensor([[-0.0790, -0.1061, -0.0614, ..., -0.0449, 0.1414, 0.1350], [-0.1451, -0.1404, -0.1126, ..., -0.1130, -0.1500, -0.0757], [-0.0855, -0.0754, 0.1072, ..., -0.1266, 0.1433, 0.0395], ..., [-0.0781, 0.0539, 0.0420, ..., 0.1386, -0.1108, -0.1294], [-0.1877, 0.0519, -0.1119, ..., 0.0405, -0.0700, -0.0860], [ 0.0332, -0.0708, -0.0633, ..., -0.0615, -0.0455, -0.0735]], device='cuda:0'), grad: tensor([[ 3.9209e-07, 4.1444e-07, 1.8626e-09, ..., 1.2666e-07, -3.9861e-07, -2.9244e-07], [ 7.0874e-07, 8.5682e-07, 9.3132e-10, ..., 2.3935e-07, 1.4808e-07, 5.7742e-08], [ 1.6829e-06, 2.0582e-06, 0.0000e+00, ..., 5.7649e-07, 8.2236e-07, 1.3225e-07], ..., [ 5.0571e-07, 5.3924e-07, 3.7253e-09, ..., -1.7509e-07, 5.6811e-08, 2.3283e-08], [ 1.4633e-05, 6.9663e-06, 5.4017e-08, ..., 1.0088e-05, 5.7183e-06, 2.0992e-06], [ 6.5006e-07, 9.6112e-07, -7.1712e-08, ..., 1.9465e-07, 3.5204e-07, 2.1048e-07]], device='cuda:0') Epoch 147, bias, value: tensor([-0.0216, -0.0251, -0.0093, -0.0291, -0.0307, 0.0028, 0.0267, -0.0125, 0.0328, -0.0006], device='cuda:0'), grad: tensor([ 5.1409e-07, 1.8897e-06, 5.8599e-06, -2.9176e-05, 1.9558e-07, -8.9407e-05, 6.7174e-05, 1.2247e-06, 3.9548e-05, 2.0340e-06], device='cuda:0') 100 0.0001 changing lr epoch 146, time 265.32, cls_loss 0.0049 cls_loss_mapping 0.0038 cls_loss_causal 0.5518 re_mapping 0.0077 re_causal 0.0193 /// teacc 98.96 lr 0.00010000 Epoch 148, weight, value: tensor([[-0.0795, -0.1072, -0.0614, ..., -0.0455, 0.1417, 0.1353], [-0.1457, -0.1406, -0.1128, ..., -0.1133, -0.1505, -0.0757], [-0.0857, -0.0762, 0.1073, ..., -0.1288, 0.1440, 0.0394], ..., [-0.0798, 0.0543, 0.0423, ..., 0.1397, -0.1108, -0.1298], [-0.1882, 0.0518, -0.1126, ..., 0.0403, -0.0701, -0.0863], [ 0.0335, -0.0713, -0.0626, ..., -0.0620, -0.0458, -0.0744]], device='cuda:0'), grad: tensor([[ 1.1183e-05, 3.4999e-06, 3.7253e-09, ..., 1.5832e-08, 1.7090e-06, 5.8711e-06], [ 1.4873e-06, 5.7630e-06, 9.3132e-10, ..., 1.9558e-08, 1.5553e-07, 4.6194e-07], [ 5.6811e-07, 2.7284e-05, 1.8626e-09, ..., 3.3528e-08, 2.1420e-08, 2.1886e-07], ..., [ 2.9244e-07, 1.3141e-06, 1.8626e-09, ..., 1.6764e-08, 4.0978e-08, 1.0151e-07], [ 3.0212e-06, 1.1034e-05, -1.8626e-09, ..., 5.1316e-07, 4.6659e-07, 1.1595e-06], [ 4.2655e-06, 1.5888e-06, 4.6566e-09, ..., -9.1922e-07, 8.0373e-07, 2.6561e-06]], device='cuda:0') Epoch 148, bias, value: tensor([-0.0215, -0.0250, -0.0098, -0.0292, -0.0310, 0.0028, 0.0270, -0.0121, 0.0320, -0.0005], device='cuda:0'), grad: tensor([ 4.3035e-05, 1.1697e-05, 4.2230e-05, -6.6638e-05, 3.9965e-05, 1.0669e-05, -1.2279e-04, 2.8908e-06, 2.6852e-05, 1.2286e-05], device='cuda:0') 100 0.0001 changing lr epoch 147, time 265.66, cls_loss 0.0044 cls_loss_mapping 0.0048 cls_loss_causal 0.5414 re_mapping 0.0081 re_causal 0.0198 /// teacc 98.90 lr 0.00010000 Epoch 149, weight, value: tensor([[-0.0797, -0.1078, -0.0613, ..., -0.0457, 0.1422, 0.1359], [-0.1462, -0.1407, -0.1118, ..., -0.1136, -0.1508, -0.0760], [-0.0858, -0.0773, 0.1081, ..., -0.1293, 0.1445, 0.0397], ..., [-0.0798, 0.0545, 0.0418, ..., 0.1401, -0.1112, -0.1303], [-0.1888, 0.0525, -0.1128, ..., 0.0401, -0.0703, -0.0867], [ 0.0327, -0.0733, -0.0627, ..., -0.0622, -0.0459, -0.0749]], device='cuda:0'), grad: tensor([[ 4.1090e-06, 6.7428e-07, 0.0000e+00, ..., 6.8638e-07, 2.9758e-05, 2.5779e-05], [ 3.9116e-08, 3.0827e-07, 2.7940e-09, ..., 3.5949e-07, 1.9185e-07, 1.4622e-07], [ 3.1665e-08, 1.3039e-07, 4.6566e-09, ..., 1.3411e-07, 3.5856e-07, 2.4494e-07], ..., [ 8.1398e-07, -3.2559e-06, 9.3132e-09, ..., -5.4874e-06, 3.8184e-08, 2.4214e-08], [ 2.5295e-06, 1.2852e-07, 0.0000e+00, ..., 3.7476e-06, 2.7008e-07, 1.4156e-07], [-2.4959e-06, -1.2293e-06, 2.7940e-09, ..., 2.6990e-06, 2.8741e-06, 1.3411e-06]], device='cuda:0') Epoch 149, bias, value: tensor([-0.0215, -0.0248, -0.0101, -0.0291, -0.0309, 0.0027, 0.0268, -0.0123, 0.0331, -0.0010], device='cuda:0'), grad: tensor([ 6.9439e-05, 4.1574e-06, 1.2526e-06, 7.3537e-06, 1.1757e-05, 4.5933e-06, -7.2062e-05, 9.6392e-07, 4.5970e-06, -3.2187e-05], device='cuda:0') 100 0.0001 changing lr epoch 148, time 265.60, cls_loss 0.0034 cls_loss_mapping 0.0038 cls_loss_causal 0.5251 re_mapping 0.0077 re_causal 0.0195 /// teacc 98.88 lr 0.00010000 Epoch 150, weight, value: tensor([[-0.0799, -0.1082, -0.0613, ..., -0.0460, 0.1423, 0.1360], [-0.1466, -0.1410, -0.1118, ..., -0.1147, -0.1514, -0.0761], [-0.0860, -0.0779, 0.1084, ..., -0.1295, 0.1450, 0.0400], ..., [-0.0802, 0.0547, 0.0417, ..., 0.1410, -0.1119, -0.1310], [-0.1893, 0.0525, -0.1129, ..., 0.0401, -0.0705, -0.0869], [ 0.0324, -0.0737, -0.0627, ..., -0.0626, -0.0460, -0.0751]], device='cuda:0'), grad: tensor([[ 8.0094e-08, 3.9116e-08, 1.8626e-09, ..., 1.5926e-07, -1.1422e-05, -5.3421e-06], [ 8.0094e-08, 9.9465e-07, 9.3132e-10, ..., 4.9509e-06, 2.5872e-06, 1.2945e-06], [ 8.6613e-08, 1.6112e-07, 0.0000e+00, ..., 6.1188e-07, -1.0617e-06, -4.8988e-07], ..., [ 2.8871e-08, -6.0648e-06, 0.0000e+00, ..., -3.1352e-05, 1.0077e-06, 5.1688e-07], [ 2.6822e-07, 2.6841e-06, 2.7940e-09, ..., 1.4454e-05, 4.3809e-06, 1.9204e-06], [ 4.4703e-08, 1.5954e-06, 0.0000e+00, ..., 8.1956e-06, 3.2745e-06, 1.6131e-06]], device='cuda:0') Epoch 150, bias, value: tensor([-0.0215, -0.0253, -0.0102, -0.0290, -0.0307, 0.0028, 0.0266, -0.0118, 0.0331, -0.0012], device='cuda:0'), grad: tensor([-1.7419e-05, 1.2405e-05, -2.9299e-06, 2.7288e-06, 9.1735e-07, 3.2373e-06, -1.0412e-06, -3.7730e-05, 2.3752e-05, 1.6093e-05], device='cuda:0') 100 0.0001 changing lr epoch 149, time 263.72, cls_loss 0.0047 cls_loss_mapping 0.0052 cls_loss_causal 0.5462 re_mapping 0.0073 re_causal 0.0191 /// teacc 98.95 lr 0.00010000 Epoch 151, weight, value: tensor([[-0.0801, -0.1088, -0.0617, ..., -0.0466, 0.1429, 0.1365], [-0.1501, -0.1416, -0.1120, ..., -0.1155, -0.1518, -0.0771], [-0.0836, -0.0789, 0.1085, ..., -0.1310, 0.1453, 0.0403], ..., [-0.0802, 0.0553, 0.0417, ..., 0.1419, -0.1126, -0.1313], [-0.1907, 0.0524, -0.1131, ..., 0.0401, -0.0709, -0.0873], [ 0.0325, -0.0742, -0.0628, ..., -0.0630, -0.0464, -0.0757]], device='cuda:0'), grad: tensor([[ 3.8464e-07, 1.2191e-06, 1.8626e-09, ..., 1.4156e-07, -2.8387e-06, -1.5032e-06], [ 3.5483e-07, 4.2934e-07, 9.3132e-10, ..., 1.4994e-07, 4.5635e-08, 2.3283e-08], [ 1.6857e-07, 8.9407e-07, 0.0000e+00, ..., 1.4249e-07, -2.1048e-07, -4.1910e-08], ..., [ 1.4994e-07, -2.3730e-06, 2.7940e-09, ..., -1.6326e-06, 1.9185e-07, 7.4506e-08], [ 7.0453e-05, 2.1324e-05, 2.7940e-09, ..., 2.5183e-05, 2.6263e-07, 1.3877e-07], [ 1.1595e-06, 1.1660e-06, -1.3039e-08, ..., 4.6007e-07, 2.0545e-06, 1.0710e-06]], device='cuda:0') Epoch 151, bias, value: tensor([-0.0211, -0.0265, -0.0095, -0.0290, -0.0307, 0.0025, 0.0268, -0.0108, 0.0329, -0.0013], device='cuda:0'), grad: tensor([-1.3616e-06, -5.3905e-06, 4.8839e-06, 4.3297e-04, -2.5146e-07, -5.4550e-04, 3.4664e-06, -6.6124e-07, 1.0383e-04, 7.0408e-06], device='cuda:0') 100 0.0001 changing lr epoch 150, time 260.43, cls_loss 0.0048 cls_loss_mapping 0.0055 cls_loss_causal 0.5582 re_mapping 0.0077 re_causal 0.0192 /// teacc 98.95 lr 0.00010000 Epoch 152, weight, value: tensor([[-0.0805, -0.1110, -0.0618, ..., -0.0473, 0.1430, 0.1367], [-0.1508, -0.1417, -0.1121, ..., -0.1160, -0.1525, -0.0775], [-0.0833, -0.0798, 0.1092, ..., -0.1323, 0.1466, 0.0406], ..., [-0.0813, 0.0555, 0.0418, ..., 0.1425, -0.1142, -0.1319], [-0.1921, 0.0523, -0.1132, ..., 0.0401, -0.0713, -0.0878], [ 0.0323, -0.0746, -0.0628, ..., -0.0629, -0.0466, -0.0761]], device='cuda:0'), grad: tensor([[ 1.0096e-05, 2.5146e-08, 1.6019e-07, ..., 1.3039e-07, 3.8017e-06, 6.8955e-06], [ 1.1679e-06, 4.3772e-07, 6.9849e-08, ..., -2.9296e-05, 6.4541e-07, 9.1642e-07], [ 6.4857e-06, 1.5646e-07, 5.9605e-08, ..., 9.5461e-07, 2.9560e-06, 4.9025e-06], ..., [ 1.4529e-07, -1.2293e-06, 1.1642e-07, ..., -2.2855e-06, 1.2666e-07, 1.0896e-07], [ 2.9817e-05, 1.2666e-07, 3.2596e-08, ..., 1.6809e-05, 1.2361e-05, 2.1785e-05], [-4.0904e-06, 4.1816e-07, 1.8626e-07, ..., 1.2992e-06, 3.4180e-07, 4.8708e-07]], device='cuda:0') Epoch 152, bias, value: tensor([-0.0212, -0.0268, -0.0096, -0.0289, -0.0302, 0.0032, 0.0256, -0.0108, 0.0325, -0.0013], device='cuda:0'), grad: tensor([ 2.9132e-05, -1.6201e-04, 2.1517e-05, 2.2203e-06, 1.8785e-06, 2.2721e-04, -2.8348e-04, 2.7880e-05, 1.8239e-04, -4.6670e-05], device='cuda:0') 100 0.0001 changing lr epoch 151, time 256.46, cls_loss 0.0055 cls_loss_mapping 0.0063 cls_loss_causal 0.4956 re_mapping 0.0075 re_causal 0.0180 /// teacc 98.95 lr 0.00010000 Epoch 153, weight, value: tensor([[-0.0809, -0.1117, -0.0620, ..., -0.0487, 0.1433, 0.1371], [-0.1511, -0.1419, -0.1122, ..., -0.1169, -0.1532, -0.0777], [-0.0836, -0.0804, 0.1094, ..., -0.1330, 0.1483, 0.0408], ..., [-0.0814, 0.0546, 0.0417, ..., 0.1420, -0.1159, -0.1329], [-0.1929, 0.0542, -0.1134, ..., 0.0420, -0.0713, -0.0883], [ 0.0322, -0.0757, -0.0636, ..., -0.0627, -0.0470, -0.0769]], device='cuda:0'), grad: tensor([[1.3721e-04, 1.2293e-07, 2.7940e-09, ..., 9.6977e-05, 6.7532e-05, 2.9922e-05], [1.3672e-06, 2.6077e-08, 2.7940e-09, ..., 9.9093e-07, 5.0198e-07, 1.6578e-07], [2.7604e-06, 6.6124e-08, 3.7253e-09, ..., 1.4575e-06, 2.1681e-06, 1.3160e-06], ..., [1.4566e-06, 3.3528e-08, 9.3132e-10, ..., 1.1260e-06, 3.0920e-07, 1.3970e-08], [7.8678e-06, 5.4948e-08, 3.7253e-09, ..., 2.3190e-07, 1.3709e-05, 1.1079e-05], [6.0350e-07, 3.8184e-08, 2.7940e-09, ..., 4.6603e-06, 1.6978e-06, 1.8347e-07]], device='cuda:0') Epoch 153, bias, value: tensor([-0.0212, -0.0269, -0.0097, -0.0289, -0.0297, 0.0025, 0.0270, -0.0117, 0.0344, -0.0015], device='cuda:0'), grad: tensor([ 5.3072e-04, 4.8690e-06, 1.1258e-05, 1.5211e-04, -1.6761e-04, -6.4421e-04, -9.2268e-05, 5.2117e-06, 1.7786e-04, 2.1443e-05], device='cuda:0') 100 0.0001 changing lr epoch 152, time 256.05, cls_loss 0.0060 cls_loss_mapping 0.0060 cls_loss_causal 0.5058 re_mapping 0.0073 re_causal 0.0180 /// teacc 99.01 lr 0.00010000 Epoch 154, weight, value: tensor([[-0.0814, -0.1127, -0.0621, ..., -0.0500, 0.1430, 0.1368], [-0.1533, -0.1430, -0.1122, ..., -0.1173, -0.1543, -0.0780], [-0.0843, -0.0812, 0.1094, ..., -0.1342, 0.1503, 0.0407], ..., [-0.0833, 0.0551, 0.0418, ..., 0.1415, -0.1173, -0.1334], [-0.1908, 0.0552, -0.1134, ..., 0.0433, -0.0703, -0.0883], [ 0.0318, -0.0783, -0.0636, ..., -0.0629, -0.0471, -0.0773]], device='cuda:0'), grad: tensor([[ 3.5577e-07, 1.1176e-07, 0.0000e+00, ..., 5.0291e-08, 1.1679e-06, 8.1956e-08], [ 8.1286e-06, 4.3847e-06, 0.0000e+00, ..., -6.1374e-07, 1.2666e-07, 4.2841e-08], [ 7.5530e-07, 1.0906e-06, 0.0000e+00, ..., 1.8068e-07, -1.0161e-06, -1.9744e-07], ..., [ 5.4855e-07, -5.1051e-05, 0.0000e+00, ..., -4.2319e-05, 2.5798e-07, 4.5635e-08], [ 5.3272e-07, 2.2892e-06, 0.0000e+00, ..., 1.5004e-06, 3.1106e-07, 1.2945e-07], [-7.4618e-06, 2.8372e-05, 0.0000e+00, ..., 2.3976e-05, 1.8999e-07, 2.6077e-08]], device='cuda:0') Epoch 154, bias, value: tensor([-0.0220, -0.0274, -0.0097, -0.0292, -0.0292, 0.0024, 0.0272, -0.0120, 0.0367, -0.0018], device='cuda:0'), grad: tensor([ 3.6526e-06, 6.3717e-05, 1.8226e-06, 2.2203e-05, 1.6931e-06, 5.4613e-06, -1.5926e-06, -7.3493e-05, 6.1356e-06, -2.9609e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 153---------------------------------------------------- epoch 153, time 272.72, cls_loss 0.0050 cls_loss_mapping 0.0048 cls_loss_causal 0.5256 re_mapping 0.0072 re_causal 0.0188 /// teacc 99.03 lr 0.00010000 Epoch 155, weight, value: tensor([[-0.0816, -0.1130, -0.0621, ..., -0.0502, 0.1435, 0.1372], [-0.1541, -0.1438, -0.1122, ..., -0.1168, -0.1550, -0.0782], [-0.0840, -0.0817, 0.1089, ..., -0.1356, 0.1507, 0.0407], ..., [-0.0839, 0.0552, 0.0428, ..., 0.1418, -0.1182, -0.1339], [-0.1917, 0.0552, -0.1142, ..., 0.0432, -0.0707, -0.0888], [ 0.0320, -0.0790, -0.0639, ..., -0.0636, -0.0472, -0.0777]], device='cuda:0'), grad: tensor([[ 1.9576e-06, 1.8468e-06, 0.0000e+00, ..., 2.7195e-06, 1.6727e-06, 1.7127e-06], [ 6.9849e-08, 4.4219e-06, 0.0000e+00, ..., 6.4224e-06, 4.3772e-08, 4.0978e-08], [ 1.4808e-07, 2.7195e-07, 0.0000e+00, ..., 3.4180e-07, 8.3819e-08, 1.0245e-07], ..., [ 2.3283e-08, -1.0826e-05, 0.0000e+00, ..., -1.6212e-05, 1.2107e-08, 9.3132e-09], [ 2.0489e-07, 4.0419e-07, 0.0000e+00, ..., 2.6263e-07, 9.4064e-08, 7.4506e-08], [ 6.6496e-07, 2.5239e-06, 0.0000e+00, ..., 2.0601e-06, 2.2538e-07, 1.6950e-07]], device='cuda:0') Epoch 155, bias, value: tensor([-0.0219, -0.0274, -0.0098, -0.0290, -0.0291, 0.0023, 0.0271, -0.0119, 0.0365, -0.0019], device='cuda:0'), grad: tensor([ 1.6868e-05, 1.3704e-03, 4.3064e-06, 3.8370e-06, -1.6575e-03, 1.2398e-05, -2.4945e-05, -2.5451e-05, 2.7582e-05, 2.7323e-04], device='cuda:0') 100 0.0001 changing lr epoch 154, time 255.78, cls_loss 0.0056 cls_loss_mapping 0.0056 cls_loss_causal 0.5775 re_mapping 0.0071 re_causal 0.0182 /// teacc 98.95 lr 0.00010000 Epoch 156, weight, value: tensor([[-0.0819, -0.1132, -0.0622, ..., -0.0505, 0.1439, 0.1376], [-0.1543, -0.1439, -0.1123, ..., -0.1167, -0.1555, -0.0784], [-0.0847, -0.0844, 0.1091, ..., -0.1368, 0.1521, 0.0410], ..., [-0.0870, 0.0555, 0.0427, ..., 0.1418, -0.1204, -0.1350], [-0.1930, 0.0550, -0.1143, ..., 0.0431, -0.0711, -0.0891], [ 0.0319, -0.0794, -0.0639, ..., -0.0641, -0.0477, -0.0785]], device='cuda:0'), grad: tensor([[ 6.9849e-08, 3.1665e-08, 1.1176e-08, ..., 3.9116e-08, -1.7762e-05, -1.1437e-05], [ 7.2643e-08, 2.9709e-07, 1.8626e-09, ..., 4.7777e-07, 4.9435e-06, 3.1907e-06], [ 2.0489e-08, -2.5146e-08, 9.3132e-10, ..., 1.1083e-07, 1.7229e-06, 1.2247e-06], ..., [ 1.4901e-08, -2.5705e-07, 0.0000e+00, ..., -1.8347e-07, 6.3423e-07, 3.9861e-07], [ 2.2817e-07, -4.1723e-07, 3.7253e-09, ..., -2.3730e-06, 3.1721e-06, 2.0117e-06], [ 6.6124e-08, 1.8347e-07, 0.0000e+00, ..., 3.5018e-07, 1.6531e-06, 1.0058e-06]], device='cuda:0') Epoch 156, bias, value: tensor([-0.0218, -0.0271, -0.0098, -0.0291, -0.0293, 0.0036, 0.0269, -0.0129, 0.0360, -0.0020], device='cuda:0'), grad: tensor([-2.2978e-05, 1.0997e-05, 1.5255e-06, 2.6345e-05, 3.8296e-06, 3.8110e-06, 7.9572e-06, 1.1846e-06, -4.2439e-05, 9.7156e-06], device='cuda:0') 100 0.0001 changing lr epoch 155, time 255.86, cls_loss 0.0056 cls_loss_mapping 0.0051 cls_loss_causal 0.5195 re_mapping 0.0071 re_causal 0.0174 /// teacc 99.01 lr 0.00010000 Epoch 157, weight, value: tensor([[-0.0821, -0.1141, -0.0622, ..., -0.0513, 0.1441, 0.1383], [-0.1548, -0.1439, -0.1126, ..., -0.1168, -0.1560, -0.0784], [-0.0845, -0.0846, 0.1094, ..., -0.1377, 0.1527, 0.0409], ..., [-0.0874, 0.0554, 0.0427, ..., 0.1420, -0.1209, -0.1353], [-0.1939, 0.0548, -0.1140, ..., 0.0431, -0.0715, -0.0898], [ 0.0316, -0.0786, -0.0639, ..., -0.0640, -0.0473, -0.0791]], device='cuda:0'), grad: tensor([[ 3.6974e-07, 4.0419e-07, 0.0000e+00, ..., 5.2154e-08, -1.2442e-06, -8.6706e-07], [ 4.7777e-07, 7.2457e-07, 0.0000e+00, ..., 4.4703e-08, 1.6764e-08, 1.0245e-08], [ 1.6168e-06, 2.4550e-06, 0.0000e+00, ..., 1.8626e-09, 5.1223e-08, 2.6077e-08], ..., [ 1.2852e-07, 9.3132e-08, 0.0000e+00, ..., -5.4948e-08, 5.5879e-09, 2.7940e-09], [ 8.8848e-07, 2.5015e-06, 0.0000e+00, ..., 4.7497e-08, 3.2224e-07, 1.5646e-07], [ 1.4715e-07, 2.4959e-07, 0.0000e+00, ..., 9.9745e-07, 2.4866e-07, 1.6578e-07]], device='cuda:0') Epoch 157, bias, value: tensor([-0.0221, -0.0263, -0.0112, -0.0289, -0.0292, 0.0033, 0.0267, -0.0132, 0.0357, -0.0006], device='cuda:0'), grad: tensor([ 9.8348e-07, 3.6150e-05, 1.4305e-05, -2.6435e-05, -3.7789e-04, 1.2722e-06, 1.8682e-06, 4.8801e-06, 6.0648e-06, 3.3832e-04], device='cuda:0') 100 0.0001 changing lr epoch 156, time 255.52, cls_loss 0.0047 cls_loss_mapping 0.0048 cls_loss_causal 0.5536 re_mapping 0.0071 re_causal 0.0182 /// teacc 98.95 lr 0.00010000 Epoch 158, weight, value: tensor([[-0.0826, -0.1148, -0.0622, ..., -0.0516, 0.1441, 0.1384], [-0.1550, -0.1442, -0.1125, ..., -0.1152, -0.1566, -0.0783], [-0.0846, -0.0867, 0.1094, ..., -0.1411, 0.1531, 0.0411], ..., [-0.0874, 0.0562, 0.0427, ..., 0.1421, -0.1217, -0.1361], [-0.1965, 0.0547, -0.1135, ..., 0.0428, -0.0723, -0.0901], [ 0.0326, -0.0794, -0.0639, ..., -0.0650, -0.0474, -0.0795]], device='cuda:0'), grad: tensor([[ 5.9605e-08, 3.0734e-08, 0.0000e+00, ..., 5.2154e-08, -5.7183e-06, -2.4606e-06], [ 2.6077e-08, 2.7288e-07, 0.0000e+00, ..., 4.3493e-07, 2.3190e-07, 1.1176e-07], [ 5.2154e-08, 3.8184e-08, 0.0000e+00, ..., 7.8231e-08, 2.8405e-07, 4.3772e-08], ..., [ 6.5193e-09, -2.7902e-06, 0.0000e+00, ..., -4.0568e-06, 2.5611e-07, 1.5553e-07], [ 1.3039e-07, 1.1716e-06, 0.0000e+00, ..., 1.2359e-06, 7.1619e-07, 2.7101e-07], [ 1.3318e-07, 9.3877e-07, 0.0000e+00, ..., 1.5814e-06, 1.9167e-06, 8.4192e-07]], device='cuda:0') Epoch 158, bias, value: tensor([-0.0222, -0.0250, -0.0124, -0.0287, -0.0292, 0.0033, 0.0268, -0.0134, 0.0350, -0.0009], device='cuda:0'), grad: tensor([-7.8753e-06, -2.6631e-04, 1.8287e-04, 6.1616e-06, -1.5143e-06, 4.2170e-06, 3.4459e-06, 6.5982e-05, 3.9265e-06, 9.1791e-06], device='cuda:0') 100 0.0001 changing lr epoch 157, time 255.83, cls_loss 0.0058 cls_loss_mapping 0.0050 cls_loss_causal 0.5710 re_mapping 0.0069 re_causal 0.0179 /// teacc 98.98 lr 0.00010000 Epoch 159, weight, value: tensor([[-0.0830, -0.1158, -0.0623, ..., -0.0489, 0.1454, 0.1395], [-0.1553, -0.1447, -0.1126, ..., -0.1156, -0.1576, -0.0786], [-0.0852, -0.0875, 0.1095, ..., -0.1420, 0.1535, 0.0408], ..., [-0.0877, 0.0561, 0.0426, ..., 0.1419, -0.1237, -0.1372], [-0.1993, 0.0547, -0.1134, ..., 0.0423, -0.0737, -0.0921], [ 0.0328, -0.0788, -0.0639, ..., -0.0644, -0.0480, -0.0803]], device='cuda:0'), grad: tensor([[ 7.8231e-08, 1.0245e-08, 3.7253e-09, ..., 5.7742e-08, -2.2724e-05, -1.1079e-05], [ 1.1176e-08, 3.0454e-07, 9.3132e-10, ..., 1.1455e-07, 7.2736e-07, 2.6077e-07], [ 1.0245e-08, 6.6683e-07, 9.3132e-10, ..., 9.9372e-07, 4.5225e-06, 1.1427e-06], ..., [ 3.6322e-08, 5.9046e-06, 0.0000e+00, ..., -3.7812e-07, 6.7893e-07, 3.1106e-07], [ 5.5879e-08, 1.9278e-07, 6.5193e-09, ..., -1.1623e-06, -3.0547e-06, -1.5646e-07], [-1.4622e-07, 6.5193e-08, 0.0000e+00, ..., 2.4773e-07, 1.4424e-05, 6.8322e-06]], device='cuda:0') Epoch 159, bias, value: tensor([-0.0212, -0.0250, -0.0127, -0.0281, -0.0290, 0.0028, 0.0277, -0.0143, 0.0341, -0.0004], device='cuda:0'), grad: tensor([-3.4481e-05, 1.4612e-06, 1.5467e-05, -4.1798e-06, -3.5092e-06, 2.8126e-06, 6.0350e-06, 6.2026e-06, -1.3612e-05, 2.3767e-05], device='cuda:0') 100 0.0001 changing lr epoch 158, time 255.68, cls_loss 0.0061 cls_loss_mapping 0.0055 cls_loss_causal 0.5288 re_mapping 0.0072 re_causal 0.0174 /// teacc 98.93 lr 0.00010000 Epoch 160, weight, value: tensor([[-0.0835, -0.1177, -0.0623, ..., -0.0496, 0.1461, 0.1402], [-0.1579, -0.1449, -0.1128, ..., -0.1158, -0.1588, -0.0790], [-0.0857, -0.0883, 0.1086, ..., -0.1439, 0.1539, 0.0405], ..., [-0.0885, 0.0563, 0.0436, ..., 0.1421, -0.1250, -0.1377], [-0.1992, 0.0550, -0.1131, ..., 0.0424, -0.0742, -0.0931], [ 0.0317, -0.0796, -0.0639, ..., -0.0640, -0.0484, -0.0808]], device='cuda:0'), grad: tensor([[ 4.5635e-08, 3.6322e-08, 0.0000e+00, ..., 2.7940e-08, -2.7847e-07, -1.1921e-07], [ 3.3528e-08, 4.3213e-07, 0.0000e+00, ..., 3.9022e-07, 2.7940e-08, -6.5193e-09], [ 7.3574e-08, 1.6652e-06, 0.0000e+00, ..., 1.6764e-07, -6.4448e-07, -1.6671e-07], ..., [ 2.8871e-08, 1.1465e-06, 0.0000e+00, ..., -7.5717e-07, 8.3819e-08, 1.7695e-08], [ 3.6228e-07, 2.3078e-06, 0.0000e+00, ..., -3.0827e-07, 3.5204e-07, 1.2387e-07], [ 2.9244e-07, 2.7847e-07, 0.0000e+00, ..., 5.3085e-07, 2.4680e-07, 1.0245e-07]], device='cuda:0') Epoch 160, bias, value: tensor([-0.0208, -0.0253, -0.0129, -0.0283, -0.0309, 0.0034, 0.0274, -0.0149, 0.0352, 0.0008], device='cuda:0'), grad: tensor([-7.6368e-08, -2.8685e-07, 1.6475e-06, -6.1393e-06, -7.3135e-05, -3.5353e-06, 1.3961e-06, 2.3171e-06, 2.8275e-06, 7.4804e-05], device='cuda:0') 100 0.0001 changing lr epoch 159, time 255.88, cls_loss 0.0057 cls_loss_mapping 0.0057 cls_loss_causal 0.5416 re_mapping 0.0072 re_causal 0.0175 /// teacc 98.98 lr 0.00010000 Epoch 161, weight, value: tensor([[-0.0839, -0.1187, -0.0629, ..., -0.0500, 0.1464, 0.1404], [-0.1596, -0.1462, -0.1130, ..., -0.1168, -0.1596, -0.0798], [-0.0843, -0.0887, 0.1087, ..., -0.1447, 0.1546, 0.0413], ..., [-0.0888, 0.0568, 0.0437, ..., 0.1428, -0.1262, -0.1386], [-0.1996, 0.0553, -0.1134, ..., 0.0425, -0.0745, -0.0938], [ 0.0305, -0.0805, -0.0639, ..., -0.0648, -0.0486, -0.0812]], device='cuda:0'), grad: tensor([[ 1.5544e-06, 3.1153e-07, 1.4901e-08, ..., 1.3867e-06, -3.5390e-07, -1.8533e-07], [ 1.4687e-06, 1.8501e-04, 1.8626e-09, ..., 1.1772e-04, 3.9581e-08, 6.9849e-09], [ 4.6492e-06, 6.0424e-06, -1.4063e-07, ..., 7.8455e-06, -2.6682e-07, 6.1002e-08], ..., [ 3.5703e-05, -1.9813e-04, 1.3504e-08, ..., -9.3937e-05, 2.9290e-07, 1.3039e-08], [ 7.7039e-06, 5.4725e-06, 1.1455e-07, ..., 9.2164e-06, 4.7125e-06, 1.2051e-06], [ 8.4471e-07, -3.0063e-06, 0.0000e+00, ..., -5.2853e-07, -3.2596e-09, 9.3598e-08]], device='cuda:0') Epoch 161, bias, value: tensor([-0.0209, -0.0262, -0.0129, -0.0288, -0.0307, 0.0037, 0.0279, -0.0140, 0.0355, 0.0002], device='cuda:0'), grad: tensor([ 6.5416e-06, 6.2370e-04, 3.4720e-05, 4.8757e-05, -1.6332e-05, -2.2972e-04, 2.7642e-06, -5.4216e-04, 5.1916e-05, 2.0072e-05], device='cuda:0') 100 0.0001 changing lr epoch 160, time 255.92, cls_loss 0.0057 cls_loss_mapping 0.0050 cls_loss_causal 0.5578 re_mapping 0.0069 re_causal 0.0171 /// teacc 98.96 lr 0.00010000 Epoch 162, weight, value: tensor([[-0.0843, -0.1192, -0.0631, ..., -0.0505, 0.1449, 0.1407], [-0.1603, -0.1468, -0.1131, ..., -0.1180, -0.1600, -0.0800], [-0.0839, -0.0890, 0.1088, ..., -0.1455, 0.1553, 0.0417], ..., [-0.0893, 0.0574, 0.0437, ..., 0.1436, -0.1278, -0.1399], [-0.2000, 0.0551, -0.1135, ..., 0.0425, -0.0752, -0.0947], [ 0.0296, -0.0808, -0.0641, ..., -0.0653, -0.0464, -0.0818]], device='cuda:0'), grad: tensor([[-1.1567e-06, 2.3283e-09, 0.0000e+00, ..., 1.5320e-07, -5.6848e-06, -1.3392e-06], [ 9.3132e-09, 2.0023e-08, 0.0000e+00, ..., 1.7229e-07, 1.2992e-07, -1.2126e-06], [ 1.6298e-08, 9.3132e-09, 0.0000e+00, ..., 8.8755e-07, -3.4552e-06, -9.7323e-07], ..., [ 6.0536e-09, -4.1816e-07, 0.0000e+00, ..., -8.3121e-07, 1.9465e-06, 9.7696e-07], [ 1.0943e-07, 2.6543e-08, 0.0000e+00, ..., -4.2245e-06, 2.2911e-07, 2.8312e-07], [ 3.5809e-07, 3.8231e-07, 0.0000e+00, ..., 2.9504e-06, 2.8815e-06, 1.1837e-06]], device='cuda:0') Epoch 162, bias, value: tensor([-0.0232, -0.0267, -0.0126, -0.0291, -0.0306, 0.0037, 0.0282, -0.0137, 0.0354, 0.0016], device='cuda:0'), grad: tensor([-6.9961e-06, -1.2510e-05, 4.2170e-06, 2.5574e-06, 5.5414e-08, 7.8380e-06, 2.4736e-06, 7.0147e-06, -2.7478e-05, 2.2799e-05], device='cuda:0') 100 0.0001 changing lr epoch 161, time 255.91, cls_loss 0.0042 cls_loss_mapping 0.0045 cls_loss_causal 0.5471 re_mapping 0.0071 re_causal 0.0183 /// teacc 99.00 lr 0.00010000 Epoch 163, weight, value: tensor([[-0.0835, -0.1202, -0.0633, ..., -0.0489, 0.1457, 0.1420], [-0.1605, -0.1468, -0.1131, ..., -0.1174, -0.1628, -0.0815], [-0.0841, -0.0895, 0.1091, ..., -0.1460, 0.1563, 0.0424], ..., [-0.0894, 0.0577, 0.0437, ..., 0.1438, -0.1284, -0.1405], [-0.2006, 0.0555, -0.1141, ..., 0.0425, -0.0759, -0.0958], [ 0.0294, -0.0824, -0.0642, ..., -0.0660, -0.0464, -0.0829]], device='cuda:0'), grad: tensor([[ 8.3819e-09, -1.6764e-07, 0.0000e+00, ..., -3.7067e-07, -2.5630e-06, -2.2110e-06], [ 9.3132e-09, 4.9360e-08, 0.0000e+00, ..., -2.3842e-06, 6.7055e-08, 5.6811e-08], [ 2.9802e-08, 2.2352e-08, -9.3132e-10, ..., 1.3318e-07, 2.0489e-07, 2.0862e-07], ..., [ 3.7253e-09, -1.3970e-07, 0.0000e+00, ..., 1.7965e-06, 3.1106e-07, 2.7288e-07], [ 8.3819e-08, 8.2888e-08, 0.0000e+00, ..., 3.6415e-07, 9.0525e-07, 7.8790e-07], [ 2.7940e-08, 7.8231e-08, 0.0000e+00, ..., 3.3993e-07, 6.6869e-07, 5.4389e-07]], device='cuda:0') Epoch 163, bias, value: tensor([-0.0228, -0.0266, -0.0119, -0.0290, -0.0305, 0.0035, 0.0279, -0.0139, 0.0360, 0.0011], device='cuda:0'), grad: tensor([-5.5805e-06, -2.0057e-05, 1.2843e-06, 8.4471e-07, -1.8001e-05, 1.4435e-07, 3.8650e-07, 1.6779e-05, 5.2527e-06, 1.8895e-05], device='cuda:0') 100 0.0001 changing lr epoch 162, time 255.75, cls_loss 0.0049 cls_loss_mapping 0.0050 cls_loss_causal 0.5682 re_mapping 0.0071 re_causal 0.0178 /// teacc 98.99 lr 0.00010000 Epoch 164, weight, value: tensor([[-0.0845, -0.1210, -0.0634, ..., -0.0505, 0.1466, 0.1432], [-0.1607, -0.1483, -0.1132, ..., -0.1183, -0.1634, -0.0819], [-0.0842, -0.0893, 0.1092, ..., -0.1463, 0.1568, 0.0428], ..., [-0.0898, 0.0584, 0.0437, ..., 0.1445, -0.1294, -0.1414], [-0.2018, 0.0554, -0.1139, ..., 0.0424, -0.0773, -0.0975], [ 0.0291, -0.0829, -0.0639, ..., -0.0665, -0.0470, -0.0854]], device='cuda:0'), grad: tensor([[ 1.7881e-07, 8.2888e-08, 0.0000e+00, ..., -3.9116e-08, -8.5458e-06, -3.8147e-06], [ 2.4587e-07, 2.7288e-07, 0.0000e+00, ..., 1.0245e-08, 7.8231e-08, 1.5832e-08], [ 4.0140e-07, 3.9674e-07, 0.0000e+00, ..., 9.3132e-09, 5.6103e-06, 2.2165e-06], ..., [ 4.1630e-07, 5.1782e-07, 0.0000e+00, ..., -1.8626e-08, 1.0431e-07, 5.4948e-08], [ 3.0361e-07, 3.2317e-07, 0.0000e+00, ..., 5.4948e-08, 3.7346e-07, 2.3842e-07], [ 1.2480e-07, 3.2596e-08, 0.0000e+00, ..., -6.3330e-08, 1.8757e-06, 1.0207e-06]], device='cuda:0') Epoch 164, bias, value: tensor([-0.0223, -0.0273, -0.0117, -0.0283, -0.0295, 0.0030, 0.0278, -0.0131, 0.0355, 0.0001], device='cuda:0'), grad: tensor([-9.8795e-06, 2.3451e-06, 8.3745e-06, -4.4443e-06, -2.6852e-05, 1.7481e-06, 1.3011e-06, 2.5611e-06, 4.1351e-06, 2.0683e-05], device='cuda:0') 100 0.0001 changing lr epoch 163, time 255.80, cls_loss 0.0052 cls_loss_mapping 0.0050 cls_loss_causal 0.5613 re_mapping 0.0070 re_causal 0.0175 /// teacc 98.90 lr 0.00010000 Epoch 165, weight, value: tensor([[-0.0850, -0.1221, -0.0634, ..., -0.0515, 0.1469, 0.1435], [-0.1609, -0.1488, -0.1132, ..., -0.1191, -0.1638, -0.0820], [-0.0845, -0.0897, 0.1092, ..., -0.1467, 0.1571, 0.0428], ..., [-0.0902, 0.0589, 0.0437, ..., 0.1451, -0.1309, -0.1425], [-0.2035, 0.0554, -0.1140, ..., 0.0425, -0.0770, -0.0984], [ 0.0314, -0.0833, -0.0639, ..., -0.0668, -0.0472, -0.0861]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 3.1386e-07, 4.6566e-09, ..., 5.9791e-07, 2.9057e-07, -1.6764e-08], [ 2.3283e-08, 1.6922e-06, 1.8626e-09, ..., 1.2321e-06, 2.5705e-07, 9.3132e-10], [ 7.4506e-09, 4.1537e-07, 0.0000e+00, ..., 6.6217e-07, -1.2256e-06, -5.5879e-09], ..., [ 1.2107e-08, -2.1338e-05, 5.5879e-09, ..., -1.2100e-05, 7.3574e-08, 3.7253e-09], [ 6.5193e-08, 2.3209e-06, 7.4506e-09, ..., 1.1921e-06, -4.4145e-07, 2.7940e-09], [ 1.6764e-08, -8.8662e-06, -3.7253e-08, ..., -1.1213e-05, 2.0396e-07, 9.3132e-09]], device='cuda:0') Epoch 165, bias, value: tensor([-0.0224, -0.0274, -0.0119, -0.0284, -0.0293, 0.0027, 0.0282, -0.0128, 0.0350, 0.0003], device='cuda:0'), grad: tensor([ 3.7178e-06, 2.0504e-05, -1.6885e-06, 2.8998e-05, 4.6939e-05, 3.2634e-06, 5.9530e-06, -2.6524e-06, -3.0279e-05, -7.4744e-05], device='cuda:0') 100 0.0001 changing lr epoch 164, time 255.31, cls_loss 0.0049 cls_loss_mapping 0.0055 cls_loss_causal 0.5246 re_mapping 0.0072 re_causal 0.0173 /// teacc 98.93 lr 0.00010000 Epoch 166, weight, value: tensor([[-0.0863, -0.1224, -0.0634, ..., -0.0522, 0.1464, 0.1428], [-0.1611, -0.1496, -0.1133, ..., -0.1199, -0.1643, -0.0821], [-0.0847, -0.0897, 0.1093, ..., -0.1470, 0.1580, 0.0431], ..., [-0.0905, 0.0593, 0.0440, ..., 0.1459, -0.1325, -0.1435], [-0.2049, 0.0552, -0.1141, ..., 0.0423, -0.0777, -0.0998], [ 0.0314, -0.0836, -0.0644, ..., -0.0673, -0.0473, -0.0865]], device='cuda:0'), grad: tensor([[ 3.1572e-07, 1.8626e-08, 1.8626e-09, ..., -8.2888e-08, -9.4399e-06, -5.4389e-06], [ 5.6345e-07, 7.2643e-08, 2.4214e-08, ..., -1.4104e-05, 6.1654e-07, 2.1514e-07], [ 3.7253e-08, 2.3283e-08, -1.1269e-07, ..., 1.1548e-07, -2.5034e-06, -2.8219e-07], ..., [ 1.7695e-07, 2.5146e-08, 2.7940e-09, ..., 8.5980e-06, 2.3451e-06, 1.2098e-06], [ 1.4797e-05, 4.4778e-06, 2.7940e-09, ..., 7.8604e-06, 1.0449e-06, 4.4797e-07], [ 1.5711e-06, 4.8708e-07, 0.0000e+00, ..., 3.4776e-06, 3.8706e-06, 2.0582e-06]], device='cuda:0') Epoch 166, bias, value: tensor([-2.2879e-02, -2.8045e-02, -1.1545e-02, -2.8122e-02, -2.8689e-02, 2.4492e-03, 2.9381e-02, -1.2313e-02, 3.4289e-02, -4.0254e-05], device='cuda:0'), grad: tensor([-1.5199e-05, -6.5625e-05, -7.2345e-06, 2.0790e-04, 1.7911e-05, -2.4724e-04, 2.2426e-05, 4.4525e-05, 2.2337e-05, 2.0340e-05], device='cuda:0') 100 0.0001 changing lr epoch 165, time 255.64, cls_loss 0.0049 cls_loss_mapping 0.0060 cls_loss_causal 0.5394 re_mapping 0.0069 re_causal 0.0173 /// teacc 98.95 lr 0.00010000 Epoch 167, weight, value: tensor([[-0.0867, -0.1232, -0.0635, ..., -0.0524, 0.1461, 0.1432], [-0.1620, -0.1499, -0.1133, ..., -0.1202, -0.1652, -0.0827], [-0.0844, -0.0895, 0.1093, ..., -0.1483, 0.1586, 0.0432], ..., [-0.0907, 0.0593, 0.0440, ..., 0.1462, -0.1344, -0.1453], [-0.2054, 0.0558, -0.1142, ..., 0.0424, -0.0777, -0.1002], [ 0.0313, -0.0847, -0.0638, ..., -0.0675, -0.0467, -0.0872]], device='cuda:0'), grad: tensor([[ 4.3064e-06, 7.3295e-07, 3.7253e-09, ..., 6.7055e-08, -5.3365e-07, 5.0254e-06], [ 4.1723e-07, 8.3148e-06, 4.6007e-06, ..., 2.3127e-05, 3.9022e-07, -7.6964e-06], [ 9.8627e-07, 4.1537e-07, 1.2945e-07, ..., 6.7335e-07, -1.1958e-06, 2.5574e-06], ..., [ 1.0617e-07, -1.1459e-05, -6.1020e-06, ..., -3.1531e-05, 3.9488e-07, 2.6822e-07], [-1.5333e-05, -2.6189e-06, 9.2201e-08, ..., 1.5832e-07, -3.6377e-06, -1.7270e-05], [ 7.9349e-06, 3.5893e-06, 1.1418e-06, ..., 7.1265e-06, 1.9874e-06, 1.0177e-05]], device='cuda:0') Epoch 167, bias, value: tensor([-0.0236, -0.0279, -0.0124, -0.0284, -0.0289, 0.0027, 0.0296, -0.0124, 0.0354, 0.0002], device='cuda:0'), grad: tensor([ 5.4359e-05, -2.3425e-05, 2.6256e-05, 1.7196e-05, 4.0978e-06, -5.5820e-05, 1.0389e-04, -8.0049e-05, -1.6344e-04, 1.1677e-04], device='cuda:0') 100 0.0001 changing lr epoch 166, time 255.56, cls_loss 0.0047 cls_loss_mapping 0.0044 cls_loss_causal 0.5702 re_mapping 0.0070 re_causal 0.0177 /// teacc 98.94 lr 0.00010000 Epoch 168, weight, value: tensor([[-0.0877, -0.1234, -0.0635, ..., -0.0526, 0.1465, 0.1435], [-0.1622, -0.1501, -0.1135, ..., -0.1200, -0.1661, -0.0833], [-0.0846, -0.0907, 0.1096, ..., -0.1492, 0.1585, 0.0430], ..., [-0.0910, 0.0594, 0.0441, ..., 0.1462, -0.1354, -0.1464], [-0.2056, 0.0559, -0.1143, ..., 0.0424, -0.0785, -0.1007], [ 0.0302, -0.0851, -0.0639, ..., -0.0676, -0.0469, -0.0894]], device='cuda:0'), grad: tensor([[-1.0513e-05, 1.6205e-07, 0.0000e+00, ..., -3.2395e-05, -1.7047e-04, -8.1122e-05], [ 7.0222e-07, 6.7707e-07, -2.3749e-07, ..., 6.2734e-06, 3.9767e-07, 1.9558e-07], [ 1.1278e-06, -5.9567e-06, 6.5193e-08, ..., -4.2394e-06, 2.0787e-06, -9.1456e-07], ..., [ 3.3621e-07, 3.0790e-06, 7.4506e-09, ..., 3.9823e-06, 5.2825e-06, 3.7458e-06], [ 1.0103e-05, 1.8347e-07, 2.2352e-08, ..., -4.4852e-06, 1.1340e-05, 4.5411e-06], [ 2.1812e-06, 4.0047e-07, 9.3132e-10, ..., 2.4028e-07, 1.0602e-05, 4.1686e-06]], device='cuda:0') Epoch 168, bias, value: tensor([-2.3524e-02, -2.7244e-02, -1.3227e-02, -2.8207e-02, -2.9483e-02, 2.8143e-03, 3.0259e-02, -1.2617e-02, 3.5282e-02, 9.3513e-05], device='cuda:0'), grad: tensor([-2.3413e-04, 1.7732e-05, -2.2426e-05, 1.7166e-05, 6.7502e-06, -1.2064e-04, 2.9731e-04, 2.1189e-05, 1.7472e-06, 1.5303e-05], device='cuda:0') 100 0.0001 changing lr epoch 167, time 255.64, cls_loss 0.0051 cls_loss_mapping 0.0053 cls_loss_causal 0.5374 re_mapping 0.0069 re_causal 0.0170 /// teacc 99.01 lr 0.00010000 Epoch 169, weight, value: tensor([[-0.0875, -0.1238, -0.0636, ..., -0.0515, 0.1478, 0.1450], [-0.1638, -0.1498, -0.1114, ..., -0.1199, -0.1671, -0.0839], [-0.0836, -0.0903, 0.1099, ..., -0.1492, 0.1596, 0.0437], ..., [-0.0916, 0.0593, 0.0430, ..., 0.1465, -0.1372, -0.1481], [-0.2061, 0.0559, -0.1144, ..., 0.0423, -0.0791, -0.1012], [ 0.0297, -0.0857, -0.0652, ..., -0.0680, -0.0472, -0.0902]], device='cuda:0'), grad: tensor([[ 1.4901e-08, 2.7344e-06, 9.3132e-10, ..., 1.4435e-07, 6.1803e-06, 2.7437e-06], [ 9.2201e-08, 4.5449e-07, 1.0245e-08, ..., 9.9745e-07, 1.1642e-07, 1.9558e-08], [ 1.9558e-08, -6.9924e-06, 3.7253e-09, ..., 4.6659e-07, -1.6779e-05, -7.6666e-06], ..., [ 1.8626e-08, -3.4198e-06, -8.2888e-08, ..., -1.2457e-05, 5.4650e-06, 2.4363e-06], [ 9.6019e-07, 2.1588e-06, 2.7940e-09, ..., 3.5018e-07, 4.1537e-06, 1.6047e-06], [ 1.2945e-07, 4.9695e-06, 6.2399e-08, ..., 1.0058e-05, 1.5926e-07, 7.1712e-08]], device='cuda:0') Epoch 169, bias, value: tensor([-0.0226, -0.0258, -0.0125, -0.0281, -0.0294, 0.0041, 0.0272, -0.0136, 0.0351, -0.0005], device='cuda:0'), grad: tensor([ 1.6421e-05, 7.0687e-07, -4.2945e-05, -1.7509e-05, 1.2973e-06, 2.2471e-05, -7.1712e-06, -8.9556e-06, 1.6347e-05, 1.9342e-05], device='cuda:0') 100 0.0001 changing lr epoch 168, time 255.63, cls_loss 0.0045 cls_loss_mapping 0.0044 cls_loss_causal 0.5680 re_mapping 0.0069 re_causal 0.0175 /// teacc 98.94 lr 0.00010000 Epoch 170, weight, value: tensor([[-0.0875, -0.1242, -0.0640, ..., -0.0515, 0.1481, 0.1454], [-0.1640, -0.1501, -0.1118, ..., -0.1201, -0.1675, -0.0835], [-0.0838, -0.0909, 0.1098, ..., -0.1499, 0.1597, 0.0428], ..., [-0.0918, 0.0595, 0.0429, ..., 0.1479, -0.1385, -0.1488], [-0.2064, 0.0561, -0.1128, ..., 0.0426, -0.0790, -0.1012], [ 0.0287, -0.0867, -0.0651, ..., -0.0696, -0.0474, -0.0910]], device='cuda:0'), grad: tensor([[ 4.4703e-08, 1.3970e-08, 0.0000e+00, ..., 4.1910e-08, 1.3690e-07, -1.8813e-07], [ 8.1956e-08, 8.8476e-08, 0.0000e+00, ..., 1.2014e-07, 4.2468e-07, 1.5460e-07], [ 8.3819e-09, 3.3993e-07, 0.0000e+00, ..., 5.0012e-07, -3.3304e-06, -1.5274e-06], ..., [ 1.8626e-09, -9.2946e-07, 0.0000e+00, ..., -1.0803e-06, 7.5810e-07, 3.6694e-07], [ 4.8429e-08, 1.3039e-08, 0.0000e+00, ..., -2.1514e-07, 8.0001e-07, 4.8988e-07], [ 7.4506e-09, 6.4261e-08, 0.0000e+00, ..., 8.4750e-08, 7.6834e-07, 4.9360e-07]], device='cuda:0') Epoch 170, bias, value: tensor([-0.0225, -0.0256, -0.0130, -0.0280, -0.0303, 0.0044, 0.0269, -0.0126, 0.0356, -0.0014], device='cuda:0'), grad: tensor([ 8.7358e-07, -1.5618e-06, -3.2857e-06, 1.5786e-06, -3.0547e-07, 5.7090e-07, -6.6310e-07, -6.3889e-07, 1.6028e-06, 1.7798e-06], device='cuda:0') 100 0.0001 changing lr epoch 169, time 255.32, cls_loss 0.0036 cls_loss_mapping 0.0036 cls_loss_causal 0.5091 re_mapping 0.0068 re_causal 0.0176 /// teacc 99.01 lr 0.00010000 Epoch 171, weight, value: tensor([[-0.0877, -0.1244, -0.0640, ..., -0.0534, 0.1483, 0.1456], [-0.1659, -0.1507, -0.1121, ..., -0.1208, -0.1682, -0.0848], [-0.0816, -0.0912, 0.1098, ..., -0.1508, 0.1605, 0.0439], ..., [-0.0920, 0.0599, 0.0433, ..., 0.1485, -0.1382, -0.1495], [-0.2068, 0.0561, -0.1129, ..., 0.0425, -0.0793, -0.1016], [ 0.0285, -0.0868, -0.0651, ..., -0.0701, -0.0475, -0.0913]], device='cuda:0'), grad: tensor([[ 4.0792e-07, 1.4901e-08, 0.0000e+00, ..., 1.0151e-07, -9.0338e-07, -5.5972e-07], [ 4.9360e-08, 5.8673e-08, 0.0000e+00, ..., 8.6613e-08, 3.5390e-08, 1.9558e-08], [ 2.2352e-08, 3.5390e-08, 0.0000e+00, ..., 1.0524e-07, 8.7544e-08, 5.8673e-08], ..., [ 1.3039e-07, -1.7229e-07, 0.0000e+00, ..., -5.9605e-08, 8.3819e-09, 3.7253e-09], [ 5.5600e-07, -2.1420e-08, 0.0000e+00, ..., 5.5879e-08, 4.6007e-07, 2.6077e-07], [ 3.0268e-07, 2.6077e-08, 0.0000e+00, ..., -1.4650e-06, 1.0030e-06, 6.0536e-07]], device='cuda:0') Epoch 171, bias, value: tensor([-0.0225, -0.0267, -0.0115, -0.0280, -0.0299, 0.0045, 0.0267, -0.0122, 0.0354, -0.0016], device='cuda:0'), grad: tensor([ 2.4680e-07, -2.1982e-04, 1.2898e-04, 1.2383e-05, 3.9160e-05, -1.9968e-06, -1.5032e-06, 8.0228e-05, 3.9339e-06, -4.1038e-05], device='cuda:0') 100 0.0001 changing lr epoch 170, time 255.42, cls_loss 0.0040 cls_loss_mapping 0.0048 cls_loss_causal 0.5591 re_mapping 0.0067 re_causal 0.0173 /// teacc 98.95 lr 0.00010000 Epoch 172, weight, value: tensor([[-0.0878, -0.1247, -0.0642, ..., -0.0531, 0.1490, 0.1463], [-0.1661, -0.1519, -0.1112, ..., -0.1220, -0.1685, -0.0853], [-0.0816, -0.0913, 0.1096, ..., -0.1504, 0.1615, 0.0446], ..., [-0.0923, 0.0603, 0.0431, ..., 0.1491, -0.1409, -0.1515], [-0.2075, 0.0560, -0.1123, ..., 0.0424, -0.0795, -0.1022], [ 0.0276, -0.0873, -0.0666, ..., -0.0702, -0.0482, -0.0922]], device='cuda:0'), grad: tensor([[ 3.1386e-07, 6.5193e-09, 9.3132e-10, ..., 8.1956e-08, -1.8179e-06, -3.0492e-06], [ 4.7497e-08, 7.3574e-08, 5.5879e-09, ..., 9.8720e-08, 3.6322e-07, 1.6298e-07], [ 2.4214e-08, 1.9558e-08, 0.0000e+00, ..., 1.5553e-07, -2.8461e-06, 1.6950e-07], ..., [ 7.4506e-09, 1.0245e-08, 1.4901e-08, ..., 1.6158e-06, 2.0582e-07, 1.1455e-07], [ 1.4342e-07, 1.5832e-08, 1.8626e-09, ..., 6.2212e-07, 8.1770e-07, 5.4669e-07], [ 1.3970e-08, -3.6694e-07, -6.4261e-08, ..., -4.9882e-06, 4.2841e-07, 2.8033e-07]], device='cuda:0') Epoch 172, bias, value: tensor([-0.0218, -0.0270, -0.0115, -0.0278, -0.0305, 0.0042, 0.0269, -0.0119, 0.0352, -0.0019], device='cuda:0'), grad: tensor([ 6.2063e-06, 3.7365e-06, 3.2596e-06, 2.2888e-05, 1.1913e-05, 7.4394e-06, -1.4855e-06, 1.0937e-05, -3.6329e-05, -2.8595e-05], device='cuda:0') 100 0.0001 changing lr epoch 171, time 255.39, cls_loss 0.0045 cls_loss_mapping 0.0057 cls_loss_causal 0.5226 re_mapping 0.0067 re_causal 0.0173 /// teacc 98.88 lr 0.00010000 Epoch 173, weight, value: tensor([[-0.0880, -0.1250, -0.0643, ..., -0.0532, 0.1496, 0.1471], [-0.1666, -0.1521, -0.1112, ..., -0.1225, -0.1692, -0.0857], [-0.0819, -0.0918, 0.1096, ..., -0.1511, 0.1615, 0.0444], ..., [-0.0926, 0.0604, 0.0431, ..., 0.1495, -0.1413, -0.1519], [-0.2083, 0.0560, -0.1115, ..., 0.0424, -0.0801, -0.1031], [ 0.0265, -0.0867, -0.0663, ..., -0.0700, -0.0483, -0.0934]], device='cuda:0'), grad: tensor([[ 3.6303e-06, 2.0489e-08, 0.0000e+00, ..., 8.1025e-07, 2.4848e-06, 2.0843e-06], [ 2.3283e-07, 9.9652e-08, 0.0000e+00, ..., 5.1223e-08, 4.2748e-07, 1.0617e-07], [ 2.6822e-07, 2.4866e-07, -0.0000e+00, ..., -6.7428e-07, -8.7544e-07, -8.0187e-07], ..., [ 4.0978e-08, -3.7253e-08, 0.0000e+00, ..., 3.2783e-07, 6.2026e-07, 4.8615e-07], [ 2.6338e-06, 1.2945e-07, 0.0000e+00, ..., 7.4413e-07, 1.3206e-06, 1.4920e-06], [ 1.9111e-06, 2.2631e-07, 0.0000e+00, ..., 4.8708e-07, 1.3411e-07, 8.8476e-08]], device='cuda:0') Epoch 173, bias, value: tensor([-0.0215, -0.0271, -0.0120, -0.0264, -0.0313, 0.0030, 0.0275, -0.0120, 0.0349, -0.0013], device='cuda:0'), grad: tensor([ 8.4639e-06, 4.7944e-06, -6.0908e-07, 2.5228e-05, -9.9614e-06, -1.3523e-05, -2.5526e-05, 1.7723e-06, -5.6718e-07, 9.9391e-06], device='cuda:0') 100 0.0001 changing lr epoch 172, time 255.89, cls_loss 0.0044 cls_loss_mapping 0.0045 cls_loss_causal 0.5408 re_mapping 0.0069 re_causal 0.0171 /// teacc 99.02 lr 0.00010000 Epoch 174, weight, value: tensor([[-0.0882, -0.1256, -0.0646, ..., -0.0530, 0.1499, 0.1474], [-0.1668, -0.1525, -0.1104, ..., -0.1228, -0.1696, -0.0840], [-0.0821, -0.0919, 0.1097, ..., -0.1514, 0.1620, 0.0439], ..., [-0.0928, 0.0605, 0.0440, ..., 0.1500, -0.1418, -0.1541], [-0.2077, 0.0560, -0.1128, ..., 0.0430, -0.0790, -0.1037], [ 0.0262, -0.0868, -0.0682, ..., -0.0704, -0.0484, -0.0941]], device='cuda:0'), grad: tensor([[ 6.7987e-08, 2.9802e-08, 0.0000e+00, ..., 4.6566e-09, -8.9407e-08, -6.2399e-08], [ 4.0047e-08, 3.5390e-08, 0.0000e+00, ..., 1.3970e-08, 2.7940e-08, 1.8626e-09], [ 1.2200e-07, 8.8476e-08, -4.6566e-09, ..., 2.5146e-08, 8.3819e-08, 3.7253e-09], ..., [ 5.7742e-08, -2.7940e-09, 9.3132e-10, ..., -6.6124e-08, 8.3819e-09, 3.7253e-09], [ 6.5304e-06, 3.6974e-07, 4.6566e-09, ..., 8.3819e-09, -2.5891e-07, 9.3132e-10], [-5.6997e-06, -3.3528e-08, -0.0000e+00, ..., 2.7940e-09, 6.6124e-08, 3.6322e-08]], device='cuda:0') Epoch 174, bias, value: tensor([-0.0215, -0.0263, -0.0125, -0.0269, -0.0314, 0.0031, 0.0275, -0.0119, 0.0357, -0.0018], device='cuda:0'), grad: tensor([ 5.6811e-08, -1.0338e-07, 5.0105e-07, -5.4166e-06, 1.6456e-06, 4.0829e-06, 5.0105e-07, 7.6741e-07, 3.1739e-05, -3.3736e-05], device='cuda:0') 100 0.0001 changing lr epoch 173, time 255.60, cls_loss 0.0046 cls_loss_mapping 0.0043 cls_loss_causal 0.5481 re_mapping 0.0067 re_causal 0.0173 /// teacc 98.96 lr 0.00010000 Epoch 175, weight, value: tensor([[-0.0884, -0.1261, -0.0646, ..., -0.0531, 0.1503, 0.1478], [-0.1670, -0.1529, -0.1105, ..., -0.1240, -0.1699, -0.0841], [-0.0824, -0.0921, 0.1100, ..., -0.1514, 0.1629, 0.0446], ..., [-0.0930, 0.0597, 0.0442, ..., 0.1503, -0.1429, -0.1561], [-0.2066, 0.0559, -0.1130, ..., 0.0436, -0.0777, -0.1043], [ 0.0262, -0.0868, -0.0683, ..., -0.0703, -0.0485, -0.0948]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.7416e-07, 1.8626e-09, ..., 2.1700e-07, -3.6418e-05, -2.0429e-05], [ 6.5193e-09, 9.3598e-07, 1.8626e-09, ..., 1.2787e-06, 3.5390e-07, 2.3935e-07], [ 9.3132e-10, 2.2903e-05, -1.1455e-07, ..., 3.0905e-05, -6.2771e-07, -1.0999e-06], ..., [ 5.5879e-09, -3.0175e-05, 9.3132e-09, ..., -4.0919e-05, 4.1910e-07, 6.4727e-07], [ 1.0245e-08, 2.5798e-06, 8.6613e-08, ..., 3.1982e-06, 2.7940e-07, 1.9651e-07], [ 3.7253e-09, 2.1532e-06, 0.0000e+00, ..., 3.1870e-06, 9.4809e-07, 4.5262e-07]], device='cuda:0') Epoch 175, bias, value: tensor([-0.0212, -0.0265, -0.0125, -0.0264, -0.0314, 0.0029, 0.0267, -0.0123, 0.0366, -0.0016], device='cuda:0'), grad: tensor([-5.9098e-05, -1.2025e-05, 6.0588e-05, 3.7402e-06, 1.1977e-06, 4.0866e-06, 5.7340e-05, -7.1824e-05, 6.5491e-06, 9.3728e-06], device='cuda:0') 100 0.0001 changing lr epoch 174, time 255.61, cls_loss 0.0062 cls_loss_mapping 0.0039 cls_loss_causal 0.5438 re_mapping 0.0069 re_causal 0.0169 /// teacc 98.95 lr 0.00010000 Epoch 176, weight, value: tensor([[-0.0892, -0.1270, -0.0638, ..., -0.0536, 0.1508, 0.1484], [-0.1673, -0.1533, -0.1091, ..., -0.1241, -0.1717, -0.0849], [-0.0826, -0.0932, 0.1103, ..., -0.1533, 0.1637, 0.0446], ..., [-0.0938, 0.0568, 0.0434, ..., 0.1497, -0.1436, -0.1565], [-0.2078, 0.0567, -0.1132, ..., 0.0441, -0.0780, -0.1048], [ 0.0255, -0.0844, -0.0689, ..., -0.0690, -0.0487, -0.0956]], device='cuda:0'), grad: tensor([[ 4.7404e-07, 1.7695e-08, 9.3132e-10, ..., 7.4320e-07, -2.3019e-04, -1.1796e-04], [ 8.7544e-07, 1.4175e-06, 9.3132e-10, ..., 2.3153e-06, 1.9725e-06, 1.0133e-06], [ 8.5682e-07, 2.3562e-07, 9.3132e-10, ..., 1.4966e-06, 4.6473e-07, 2.9150e-07], ..., [ 2.7567e-07, -7.8604e-06, 1.8626e-09, ..., -5.6438e-06, 4.6939e-07, 2.3749e-07], [ 5.6297e-05, 1.3718e-06, 9.3132e-10, ..., 8.3029e-05, 1.1753e-06, 8.0001e-07], [ 1.2945e-07, 8.0094e-08, 1.3039e-08, ..., 5.4389e-07, 2.9784e-06, 1.6205e-06]], device='cuda:0') Epoch 176, bias, value: tensor([-0.0211, -0.0265, -0.0130, -0.0257, -0.0318, 0.0038, 0.0261, -0.0149, 0.0370, 0.0001], device='cuda:0'), grad: tensor([-3.0088e-04, -5.4762e-06, 3.9861e-06, 1.2815e-05, 4.5225e-06, -4.0126e-04, 5.1498e-04, -1.3299e-05, 1.8358e-04, 1.6047e-06], device='cuda:0') 100 0.0001 changing lr epoch 175, time 255.66, cls_loss 0.0048 cls_loss_mapping 0.0036 cls_loss_causal 0.5312 re_mapping 0.0067 re_causal 0.0167 /// teacc 98.94 lr 0.00010000 Epoch 177, weight, value: tensor([[-0.0902, -0.1277, -0.0638, ..., -0.0543, 0.1505, 0.1487], [-0.1677, -0.1535, -0.1087, ..., -0.1242, -0.1723, -0.0853], [-0.0827, -0.0934, 0.1109, ..., -0.1534, 0.1649, 0.0455], ..., [-0.0941, 0.0588, 0.0428, ..., 0.1518, -0.1443, -0.1573], [-0.2084, 0.0567, -0.1133, ..., 0.0442, -0.0782, -0.1050], [ 0.0249, -0.0881, -0.0701, ..., -0.0723, -0.0480, -0.0964]], device='cuda:0'), grad: tensor([[ 9.4064e-08, 4.0978e-08, 0.0000e+00, ..., 1.6764e-08, -8.1509e-06, -3.1441e-06], [ 4.2841e-08, 6.4261e-08, 9.3132e-10, ..., 2.9802e-08, 1.8254e-07, 3.9116e-08], [ 1.0245e-07, 2.0489e-08, 0.0000e+00, ..., 1.3039e-08, -3.1851e-07, -2.0675e-07], ..., [ 2.6077e-08, -2.9989e-07, 2.7940e-09, ..., -3.1106e-07, 2.3935e-07, 1.4529e-07], [ 3.3621e-07, 2.7195e-07, 9.3132e-10, ..., 6.7987e-08, 9.7789e-08, 5.6811e-08], [ 9.4064e-08, 6.7987e-08, -0.0000e+00, ..., 6.8918e-08, 7.3910e-06, 2.8070e-06]], device='cuda:0') Epoch 177, bias, value: tensor([-0.0218, -0.0262, -0.0127, -0.0262, -0.0317, 0.0041, 0.0262, -0.0132, 0.0372, -0.0016], device='cuda:0'), grad: tensor([-1.5132e-05, 8.2143e-07, -3.2671e-06, 7.8417e-07, -1.0189e-06, -1.1167e-06, 1.7295e-06, 4.6194e-07, 1.0598e-06, 1.5676e-05], device='cuda:0') 100 0.0001 changing lr epoch 176, time 256.12, cls_loss 0.0052 cls_loss_mapping 0.0054 cls_loss_causal 0.5361 re_mapping 0.0068 re_causal 0.0167 /// teacc 99.00 lr 0.00010000 Epoch 178, weight, value: tensor([[-0.0909, -0.1283, -0.0642, ..., -0.0541, 0.1508, 0.1489], [-0.1680, -0.1537, -0.1088, ..., -0.1240, -0.1744, -0.0859], [-0.0829, -0.0933, 0.1124, ..., -0.1518, 0.1684, 0.0475], ..., [-0.0944, 0.0590, 0.0445, ..., 0.1520, -0.1485, -0.1613], [-0.2066, 0.0566, -0.1138, ..., 0.0455, -0.0763, -0.1063], [ 0.0248, -0.0885, -0.0729, ..., -0.0739, -0.0485, -0.0979]], device='cuda:0'), grad: tensor([[ 7.2643e-08, 1.6764e-08, 0.0000e+00, ..., 2.6077e-08, -9.3132e-08, -4.9360e-08], [ 2.6636e-07, 2.0768e-07, 1.8626e-09, ..., 6.3330e-08, 1.5832e-08, 1.3039e-08], [ 1.0990e-07, 8.0094e-08, -1.8626e-09, ..., 2.4214e-08, -1.0505e-06, -8.5402e-07], ..., [ 9.8161e-07, 1.1539e-06, -2.7940e-09, ..., -2.5146e-08, 1.0030e-06, 8.0653e-07], [ 2.9951e-06, 2.4214e-07, 0.0000e+00, ..., 1.3681e-06, 4.2841e-08, 2.5146e-08], [ 1.6298e-07, 1.5646e-07, 9.3132e-10, ..., 4.8429e-08, 5.3085e-08, 2.7940e-08]], device='cuda:0') Epoch 178, bias, value: tensor([-0.0219, -0.0260, -0.0121, -0.0276, -0.0304, 0.0044, 0.0264, -0.0132, 0.0387, -0.0026], device='cuda:0'), grad: tensor([ 1.2573e-07, 8.4285e-07, -1.9781e-06, -4.1053e-06, -8.4788e-06, -2.2754e-05, 1.8626e-05, 6.0573e-06, 5.9009e-06, 5.7817e-06], device='cuda:0') 100 0.0001 changing lr epoch 177, time 255.46, cls_loss 0.0047 cls_loss_mapping 0.0042 cls_loss_causal 0.5499 re_mapping 0.0068 re_causal 0.0168 /// teacc 99.02 lr 0.00010000 Epoch 179, weight, value: tensor([[-0.0911, -0.1290, -0.0641, ..., -0.0543, 0.1514, 0.1495], [-0.1688, -0.1560, -0.1087, ..., -0.1256, -0.1756, -0.0869], [-0.0822, -0.0944, 0.1112, ..., -0.1524, 0.1699, 0.0489], ..., [-0.0946, 0.0592, 0.0462, ..., 0.1525, -0.1497, -0.1625], [-0.2068, 0.0565, -0.1143, ..., 0.0456, -0.0766, -0.1071], [ 0.0250, -0.0879, -0.0737, ..., -0.0735, -0.0489, -0.0990]], device='cuda:0'), grad: tensor([[ 1.6019e-07, 1.2387e-07, 8.3819e-09, ..., 1.2480e-07, 2.3190e-07, 9.5926e-08], [ 1.3877e-07, 1.2657e-06, 1.8626e-09, ..., 1.8775e-06, -2.7940e-08, 1.0245e-08], [ 1.5646e-07, 3.2131e-07, -2.9709e-07, ..., 3.6508e-07, -2.2538e-07, -2.4214e-07], ..., [ 1.6764e-08, -3.5577e-06, 1.1176e-08, ..., -4.7833e-06, 1.0617e-07, 8.7544e-08], [ 1.7602e-07, 3.1199e-07, 2.6822e-07, ..., 4.4890e-07, 4.1630e-07, 2.5053e-07], [ 9.3132e-09, 1.2442e-06, -0.0000e+00, ..., 1.2415e-06, 4.2841e-08, 1.8626e-08]], device='cuda:0') Epoch 179, bias, value: tensor([-0.0215, -0.0280, -0.0111, -0.0275, -0.0305, 0.0043, 0.0261, -0.0134, 0.0388, -0.0017], device='cuda:0'), grad: tensor([ 9.4809e-07, 2.0694e-06, -5.7742e-07, 2.9709e-07, 9.6671e-07, 6.1654e-07, -1.7369e-06, -8.1211e-06, 3.0287e-06, 2.4959e-06], device='cuda:0') 100 0.0001 changing lr epoch 178, time 256.06, cls_loss 0.0040 cls_loss_mapping 0.0037 cls_loss_causal 0.5314 re_mapping 0.0067 re_causal 0.0169 /// teacc 98.94 lr 0.00010000 Epoch 180, weight, value: tensor([[-0.0914, -0.1296, -0.0642, ..., -0.0546, 0.1516, 0.1497], [-0.1688, -0.1562, -0.1086, ..., -0.1256, -0.1759, -0.0870], [-0.0823, -0.0954, 0.1111, ..., -0.1534, 0.1704, 0.0492], ..., [-0.0947, 0.0596, 0.0463, ..., 0.1528, -0.1506, -0.1631], [-0.2070, 0.0564, -0.1128, ..., 0.0456, -0.0763, -0.1073], [ 0.0251, -0.0880, -0.0736, ..., -0.0736, -0.0491, -0.0996]], device='cuda:0'), grad: tensor([[ 5.1223e-08, 3.8184e-08, 9.3132e-10, ..., 2.4214e-08, -5.6624e-07, -1.4696e-06], [ 1.6764e-08, 1.7397e-06, 1.1176e-08, ..., 1.1390e-06, 9.1046e-06, 1.9781e-06], [ 2.4214e-08, 4.3306e-07, 1.8626e-09, ..., 2.5798e-07, -1.5467e-05, 6.6776e-07], ..., [ 1.1455e-07, -3.5558e-06, -3.3528e-08, ..., -2.4457e-06, 4.9360e-07, 1.7695e-08], [ 6.2399e-08, 9.0338e-08, 0.0000e+00, ..., 1.9558e-08, 4.3213e-07, 1.6205e-07], [-4.7963e-07, 7.9535e-07, 1.4901e-08, ..., 6.3144e-07, 3.0175e-06, 1.2387e-06]], device='cuda:0') Epoch 180, bias, value: tensor([-0.0215, -0.0275, -0.0116, -0.0273, -0.0307, 0.0041, 0.0260, -0.0135, 0.0389, -0.0016], device='cuda:0'), grad: tensor([ 3.2671e-06, 2.6420e-05, -2.9445e-05, 3.6836e-05, 9.6951e-07, 9.3654e-06, -2.4214e-05, -9.7826e-06, 1.1027e-06, -1.4655e-05], device='cuda:0') 100 0.0001 changing lr epoch 179, time 255.98, cls_loss 0.0038 cls_loss_mapping 0.0049 cls_loss_causal 0.5086 re_mapping 0.0067 re_causal 0.0169 /// teacc 98.99 lr 0.00010000 Epoch 181, weight, value: tensor([[-0.0925, -0.1304, -0.0643, ..., -0.0568, 0.1515, 0.1495], [-0.1691, -0.1566, -0.1087, ..., -0.1265, -0.1761, -0.0871], [-0.0824, -0.0958, 0.1111, ..., -0.1538, 0.1707, 0.0492], ..., [-0.0952, 0.0598, 0.0463, ..., 0.1532, -0.1508, -0.1633], [-0.2082, 0.0566, -0.1126, ..., 0.0456, -0.0762, -0.1081], [ 0.0251, -0.0881, -0.0735, ..., -0.0739, -0.0492, -0.1000]], device='cuda:0'), grad: tensor([[ 1.9558e-06, 8.3819e-08, 0.0000e+00, ..., 7.5903e-07, 8.8476e-07, 1.0207e-06], [ 1.2480e-07, 9.7454e-06, 0.0000e+00, ..., 1.2122e-05, 2.2352e-08, 2.0489e-08], [ 1.2200e-07, 1.0967e-05, 0.0000e+00, ..., 1.3471e-05, 2.7940e-09, 1.8626e-09], ..., [ 8.5682e-08, -2.3380e-05, 0.0000e+00, ..., -3.0458e-05, 1.8626e-09, 1.8626e-09], [ 3.2634e-06, -6.5099e-07, 0.0000e+00, ..., 2.0731e-06, 1.3569e-06, 1.9409e-06], [ 1.6779e-05, 1.4016e-06, 0.0000e+00, ..., 7.7263e-06, 2.3283e-08, 2.9802e-08]], device='cuda:0') Epoch 181, bias, value: tensor([-0.0219, -0.0279, -0.0120, -0.0271, -0.0297, 0.0052, 0.0245, -0.0133, 0.0389, -0.0019], device='cuda:0'), grad: tensor([ 5.0142e-06, 4.4912e-05, 5.0187e-05, 1.2830e-05, -4.8243e-06, 1.7062e-05, -5.8055e-05, -1.0926e-04, 4.9062e-06, 3.7253e-05], device='cuda:0') 100 0.0001 changing lr epoch 180, time 255.99, cls_loss 0.0045 cls_loss_mapping 0.0040 cls_loss_causal 0.5369 re_mapping 0.0066 re_causal 0.0166 /// teacc 98.98 lr 0.00010000 Epoch 182, weight, value: tensor([[-0.0938, -0.1317, -0.0645, ..., -0.0573, 0.1515, 0.1496], [-0.1693, -0.1569, -0.1088, ..., -0.1267, -0.1789, -0.0896], [-0.0826, -0.0963, 0.1114, ..., -0.1540, 0.1729, 0.0511], ..., [-0.0954, 0.0599, 0.0462, ..., 0.1534, -0.1516, -0.1641], [-0.2089, 0.0564, -0.1127, ..., 0.0454, -0.0750, -0.1072], [ 0.0247, -0.0882, -0.0735, ..., -0.0740, -0.0497, -0.1027]], device='cuda:0'), grad: tensor([[ 4.6566e-08, 2.4308e-07, 0.0000e+00, ..., 1.3132e-07, -2.6636e-07, -1.8254e-07], [ 1.6205e-07, 1.6689e-06, 0.0000e+00, ..., 3.8464e-07, 1.0245e-08, 5.5879e-09], [ 1.1362e-07, 4.1902e-05, 0.0000e+00, ..., 1.7107e-05, 3.8650e-07, 1.6671e-07], ..., [ 3.4459e-08, -9.4652e-05, 0.0000e+00, ..., -3.7432e-05, 1.0245e-08, 2.7940e-09], [ 8.2143e-07, 2.0117e-07, 0.0000e+00, ..., -2.1886e-07, -6.5938e-07, -2.5705e-07], [ 2.7809e-06, 5.2527e-07, 0.0000e+00, ..., 4.1164e-07, 1.1642e-07, 7.6368e-08]], device='cuda:0') Epoch 182, bias, value: tensor([-0.0223, -0.0289, -0.0106, -0.0270, -0.0295, 0.0054, 0.0245, -0.0135, 0.0390, -0.0020], device='cuda:0'), grad: tensor([ 3.6880e-07, -6.5684e-05, 9.0659e-05, 2.3186e-04, 1.8338e-06, -1.6010e-04, 1.5628e-06, -1.4436e-04, 3.7402e-05, 6.7167e-06], device='cuda:0') 100 0.0001 changing lr epoch 181, time 255.48, cls_loss 0.0033 cls_loss_mapping 0.0034 cls_loss_causal 0.5572 re_mapping 0.0067 re_causal 0.0176 /// teacc 99.01 lr 0.00010000 Epoch 183, weight, value: tensor([[-0.0943, -0.1322, -0.0647, ..., -0.0576, 0.1517, 0.1498], [-0.1693, -0.1559, -0.1089, ..., -0.1262, -0.1785, -0.0897], [-0.0827, -0.0967, 0.1115, ..., -0.1544, 0.1730, 0.0513], ..., [-0.0955, 0.0600, 0.0476, ..., 0.1536, -0.1538, -0.1647], [-0.2092, 0.0564, -0.1129, ..., 0.0454, -0.0747, -0.1075], [ 0.0246, -0.0887, -0.0757, ..., -0.0745, -0.0501, -0.1040]], device='cuda:0'), grad: tensor([[ 4.5933e-06, 1.0477e-07, 2.3283e-09, ..., 9.1270e-08, 4.6892e-07, -4.0047e-07], [ 1.7136e-07, 1.5413e-07, 2.7940e-09, ..., 2.0768e-07, 7.1749e-06, 2.4214e-06], [ 1.2293e-07, 1.9465e-07, 2.8871e-08, ..., 3.4459e-08, 1.8077e-06, 8.9174e-07], ..., [ 2.0023e-08, -6.6981e-06, 3.7253e-09, ..., -9.6932e-06, 2.0443e-07, 1.3830e-07], [ 8.3214e-07, 6.4587e-07, 0.0000e+00, ..., -1.3970e-09, 1.8058e-06, 1.6857e-06], [ 1.1493e-06, 4.9137e-06, 3.2596e-09, ..., 7.1637e-06, 1.7174e-06, 1.3979e-06]], device='cuda:0') Epoch 183, bias, value: tensor([-0.0223, -0.0278, -0.0109, -0.0268, -0.0295, 0.0050, 0.0251, -0.0139, 0.0391, -0.0023], device='cuda:0'), grad: tensor([ 9.2313e-06, 1.3158e-05, 5.4613e-06, -4.5784e-06, 5.9679e-06, 7.9721e-06, -4.6432e-05, -1.8597e-05, 6.0350e-06, 2.1696e-05], device='cuda:0') 100 0.0001 changing lr epoch 182, time 255.90, cls_loss 0.0044 cls_loss_mapping 0.0045 cls_loss_causal 0.5392 re_mapping 0.0066 re_causal 0.0167 /// teacc 98.98 lr 0.00010000 Epoch 184, weight, value: tensor([[-0.0945, -0.1331, -0.0653, ..., -0.0577, 0.1524, 0.1506], [-0.1699, -0.1561, -0.1090, ..., -0.1264, -0.1786, -0.0898], [-0.0829, -0.0978, 0.1117, ..., -0.1555, 0.1733, 0.0516], ..., [-0.0957, 0.0601, 0.0477, ..., 0.1542, -0.1541, -0.1651], [-0.2109, 0.0562, -0.1136, ..., 0.0442, -0.0753, -0.1096], [ 0.0238, -0.0891, -0.0758, ..., -0.0748, -0.0506, -0.1060]], device='cuda:0'), grad: tensor([[ 2.2398e-07, 2.3469e-07, 0.0000e+00, ..., 0.0000e+00, -2.8312e-07, -2.4308e-07], [ 1.2619e-07, 1.2014e-07, 0.0000e+00, ..., 1.8626e-09, 2.3283e-08, 1.6298e-08], [ 1.5926e-07, 1.5153e-06, 0.0000e+00, ..., 9.3132e-10, 6.4261e-08, 2.3749e-08], ..., [ 1.0105e-07, 7.9628e-08, 0.0000e+00, ..., -1.3504e-08, 1.3039e-08, 6.0536e-09], [ 1.9185e-07, -1.4585e-06, 0.0000e+00, ..., 1.0245e-08, 7.0315e-08, 3.9116e-08], [ 4.3539e-07, 3.8231e-07, 0.0000e+00, ..., 9.7789e-09, 1.6205e-07, 9.4064e-08]], device='cuda:0') Epoch 184, bias, value: tensor([-0.0218, -0.0276, -0.0112, -0.0269, -0.0297, 0.0055, 0.0257, -0.0141, 0.0372, -0.0021], device='cuda:0'), grad: tensor([ 3.1758e-07, 2.6962e-07, 7.2494e-06, -7.4148e-05, 3.9581e-08, 7.2122e-05, 1.2666e-07, 2.6124e-07, -7.3761e-06, 1.2089e-06], device='cuda:0') 100 0.0001 changing lr epoch 183, time 256.02, cls_loss 0.0047 cls_loss_mapping 0.0052 cls_loss_causal 0.5267 re_mapping 0.0065 re_causal 0.0155 /// teacc 99.00 lr 0.00010000 Epoch 185, weight, value: tensor([[-0.0948, -0.1335, -0.0655, ..., -0.0581, 0.1536, 0.1523], [-0.1724, -0.1564, -0.1093, ..., -0.1271, -0.1789, -0.0901], [-0.0806, -0.0986, 0.1123, ..., -0.1565, 0.1737, 0.0519], ..., [-0.0964, 0.0604, 0.0479, ..., 0.1548, -0.1545, -0.1659], [-0.2117, 0.0561, -0.1137, ..., 0.0441, -0.0757, -0.1107], [ 0.0233, -0.0892, -0.0760, ..., -0.0751, -0.0508, -0.1067]], device='cuda:0'), grad: tensor([[ 4.2748e-07, 8.8476e-09, 0.0000e+00, ..., 2.3004e-07, -2.3600e-06, -1.2983e-06], [ 1.1129e-07, 2.0955e-08, 0.0000e+00, ..., 4.7032e-08, 1.1828e-07, 8.7079e-08], [ 7.4040e-08, 1.3039e-08, 0.0000e+00, ..., 2.6077e-08, 1.2480e-07, 7.4971e-08], ..., [ 1.0524e-07, 2.1420e-08, 4.6566e-10, ..., 8.3353e-08, 2.3283e-08, 1.5367e-08], [ 2.4848e-06, 8.4983e-07, 0.0000e+00, ..., 1.6466e-06, 7.7300e-08, 4.7963e-08], [ 5.4576e-07, -8.4611e-07, 4.6566e-10, ..., 8.6147e-08, 1.6633e-06, 8.9500e-07]], device='cuda:0') Epoch 185, bias, value: tensor([-0.0210, -0.0293, -0.0100, -0.0267, -0.0298, 0.0055, 0.0253, -0.0139, 0.0368, -0.0016], device='cuda:0'), grad: tensor([-1.7751e-06, 8.7824e-07, 1.0179e-06, 1.7911e-05, -5.2229e-06, -3.0398e-05, 4.5411e-06, 7.9349e-06, 2.0415e-05, -1.5303e-05], device='cuda:0') 100 0.0001 changing lr epoch 184, time 255.91, cls_loss 0.0041 cls_loss_mapping 0.0042 cls_loss_causal 0.5307 re_mapping 0.0066 re_causal 0.0165 /// teacc 99.00 lr 0.00010000 Epoch 186, weight, value: tensor([[-0.0979, -0.1343, -0.0657, ..., -0.0608, 0.1528, 0.1506], [-0.1726, -0.1565, -0.1093, ..., -0.1279, -0.1790, -0.0902], [-0.0812, -0.0986, 0.1127, ..., -0.1569, 0.1737, 0.0520], ..., [-0.0965, 0.0605, 0.0479, ..., 0.1553, -0.1554, -0.1678], [-0.2124, 0.0561, -0.1145, ..., 0.0437, -0.0759, -0.1114], [ 0.0245, -0.0892, -0.0759, ..., -0.0752, -0.0509, -0.1073]], device='cuda:0'), grad: tensor([[ 3.8603e-07, 6.9849e-08, 4.6566e-10, ..., 4.1118e-07, 4.1071e-07, 3.7253e-07], [ 8.9407e-08, 6.8266e-07, 1.3970e-09, ..., 7.6601e-07, 6.6590e-08, 5.2154e-08], [ 3.0780e-07, 7.2755e-06, 9.3132e-10, ..., 1.5810e-05, -2.0303e-06, 4.0606e-07], ..., [ 2.8778e-07, -1.5050e-05, -7.3574e-08, ..., -3.0398e-05, 5.0291e-08, 3.1199e-08], [ 3.3248e-06, 2.9262e-06, 1.3970e-09, ..., -8.3968e-06, 4.2617e-06, 2.1234e-06], [ 2.2352e-07, 9.2685e-06, 1.5926e-07, ..., 1.7866e-05, 6.3330e-08, 5.1688e-08]], device='cuda:0') Epoch 186, bias, value: tensor([-0.0224, -0.0293, -0.0103, -0.0270, -0.0299, 0.0056, 0.0268, -0.0139, 0.0362, -0.0011], device='cuda:0'), grad: tensor([ 3.1013e-06, 2.3916e-06, 2.6837e-05, -4.7609e-06, -6.4895e-06, 3.8058e-05, -4.0889e-05, -7.1883e-05, -3.0100e-06, 5.6475e-05], device='cuda:0') 100 0.0001 changing lr epoch 185, time 255.66, cls_loss 0.0036 cls_loss_mapping 0.0039 cls_loss_causal 0.5679 re_mapping 0.0062 re_causal 0.0165 /// teacc 99.02 lr 0.00010000 Epoch 187, weight, value: tensor([[-0.0980, -0.1349, -0.0655, ..., -0.0607, 0.1531, 0.1509], [-0.1729, -0.1568, -0.1075, ..., -0.1282, -0.1793, -0.0905], [-0.0814, -0.0993, 0.1120, ..., -0.1574, 0.1747, 0.0535], ..., [-0.0967, 0.0610, 0.0490, ..., 0.1561, -0.1560, -0.1688], [-0.2128, 0.0558, -0.1151, ..., 0.0435, -0.0769, -0.1130], [ 0.0248, -0.0894, -0.0789, ..., -0.0759, -0.0511, -0.1079]], device='cuda:0'), grad: tensor([[ 4.4517e-07, 2.4959e-07, 5.5879e-09, ..., 2.4401e-07, 2.5705e-06, 9.7882e-07], [ 7.1246e-08, 1.8813e-07, 8.8476e-09, ..., 1.3085e-07, -2.1793e-06, 3.2131e-08], [ 1.0245e-07, 2.6217e-07, 4.6566e-10, ..., 7.0781e-08, -4.5635e-06, -2.6617e-06], ..., [ 7.3574e-08, -1.7975e-06, 2.1467e-07, ..., -1.8990e-06, 9.1689e-07, 5.3784e-07], [ 5.0431e-07, 7.2876e-07, 4.0047e-08, ..., 7.8836e-07, 2.0079e-06, 8.4890e-07], [ 3.9535e-07, 4.9453e-07, -1.9651e-07, ..., 1.1055e-06, 5.7044e-07, 3.0361e-07]], device='cuda:0') Epoch 187, bias, value: tensor([-0.0223, -0.0292, -0.0097, -0.0270, -0.0298, 0.0055, 0.0270, -0.0136, 0.0351, -0.0015], device='cuda:0'), grad: tensor([ 1.2361e-05, -3.6865e-05, -4.6194e-06, -2.7753e-07, 5.6345e-08, 4.5151e-06, 6.7055e-06, 4.5029e-07, 1.2249e-05, 5.3719e-06], device='cuda:0') 100 0.0001 changing lr epoch 186, time 255.65, cls_loss 0.0039 cls_loss_mapping 0.0043 cls_loss_causal 0.5232 re_mapping 0.0064 re_causal 0.0162 /// teacc 99.00 lr 0.00010000 Epoch 188, weight, value: tensor([[-0.0981, -0.1355, -0.0657, ..., -0.0607, 0.1535, 0.1512], [-0.1734, -0.1569, -0.1082, ..., -0.1287, -0.1797, -0.0909], [-0.0815, -0.1015, 0.1112, ..., -0.1594, 0.1753, 0.0541], ..., [-0.0969, 0.0615, 0.0512, ..., 0.1574, -0.1571, -0.1700], [-0.2131, 0.0557, -0.1152, ..., 0.0435, -0.0770, -0.1131], [ 0.0250, -0.0899, -0.0803, ..., -0.0767, -0.0514, -0.1087]], device='cuda:0'), grad: tensor([[ 1.3830e-07, 1.0245e-08, 7.9628e-08, ..., 5.2154e-08, 6.3190e-07, 5.6811e-08], [ 9.9652e-08, 1.3364e-07, 1.8626e-09, ..., 2.2398e-07, 2.0023e-08, 3.7253e-09], [ 3.5856e-08, 2.6543e-08, 2.3283e-08, ..., 1.7695e-08, -1.1986e-06, -9.3132e-09], ..., [ 2.7474e-08, -7.2364e-07, 7.9162e-09, ..., -1.8785e-06, 6.4261e-08, 3.2596e-09], [ 1.1912e-06, 1.4901e-08, 0.0000e+00, ..., 5.6345e-07, 1.3728e-06, 1.1409e-07], [ 1.3551e-07, 2.1979e-07, 9.7789e-09, ..., 3.3714e-07, 1.0803e-07, 6.9849e-09]], device='cuda:0') Epoch 188, bias, value: tensor([-0.0222, -0.0291, -0.0103, -0.0283, -0.0294, 0.0068, 0.0265, -0.0131, 0.0350, -0.0019], device='cuda:0'), grad: tensor([ 1.4473e-06, -2.9653e-05, -1.2502e-05, 9.9558e-07, -1.8161e-07, -3.9712e-06, 1.1828e-06, 1.5467e-05, 2.5943e-05, 1.3281e-06], device='cuda:0') 100 0.0001 changing lr epoch 187, time 255.55, cls_loss 0.0045 cls_loss_mapping 0.0051 cls_loss_causal 0.5122 re_mapping 0.0061 re_causal 0.0153 /// teacc 98.95 lr 0.00010000 Epoch 189, weight, value: tensor([[-0.0981, -0.1362, -0.0659, ..., -0.0608, 0.1543, 0.1521], [-0.1749, -0.1574, -0.1082, ..., -0.1276, -0.1799, -0.0918], [-0.0803, -0.1034, 0.1112, ..., -0.1616, 0.1756, 0.0555], ..., [-0.0973, 0.0623, 0.0512, ..., 0.1575, -0.1574, -0.1707], [-0.2134, 0.0557, -0.1153, ..., 0.0437, -0.0773, -0.1137], [ 0.0240, -0.0903, -0.0803, ..., -0.0771, -0.0515, -0.1092]], device='cuda:0'), grad: tensor([[ 4.4797e-07, 1.7714e-06, 0.0000e+00, ..., 1.4435e-08, -8.4843e-07, -6.9663e-07], [ 3.3714e-07, 6.8359e-07, 0.0000e+00, ..., 5.4017e-08, 2.7940e-08, 1.7229e-08], [ 2.9569e-07, 7.9069e-07, 0.0000e+00, ..., 2.1933e-07, 4.1444e-08, 4.3772e-08], ..., [ 8.2422e-07, 9.9465e-07, 0.0000e+00, ..., -4.6426e-07, 2.2817e-08, 1.3970e-08], [-1.8537e-05, -8.0705e-05, 0.0000e+00, ..., -4.1723e-07, 9.8255e-08, 8.0094e-08], [ 1.7390e-05, 7.6234e-05, 0.0000e+00, ..., 4.1770e-07, 2.5565e-07, 2.0396e-07]], device='cuda:0') Epoch 189, bias, value: tensor([-0.0217, -0.0298, -0.0098, -0.0270, -0.0301, 0.0057, 0.0260, -0.0132, 0.0353, -0.0013], device='cuda:0'), grad: tensor([ 7.5400e-06, 3.3807e-06, 4.3660e-06, 2.6170e-06, -1.1250e-06, 5.7146e-06, 2.9616e-06, 3.6471e-06, -3.8457e-04, 3.5548e-04], device='cuda:0') 100 0.0001 changing lr epoch 188, time 255.71, cls_loss 0.0051 cls_loss_mapping 0.0055 cls_loss_causal 0.5581 re_mapping 0.0062 re_causal 0.0161 /// teacc 99.02 lr 0.00010000 Epoch 190, weight, value: tensor([[-0.0978, -0.1376, -0.0668, ..., -0.0615, 0.1559, 0.1538], [-0.1757, -0.1577, -0.1082, ..., -0.1274, -0.1822, -0.0948], [-0.0799, -0.1038, 0.1113, ..., -0.1620, 0.1760, 0.0558], ..., [-0.0976, 0.0625, 0.0512, ..., 0.1576, -0.1579, -0.1711], [-0.2134, 0.0557, -0.1154, ..., 0.0439, -0.0772, -0.1141], [ 0.0234, -0.0906, -0.0805, ..., -0.0775, -0.0517, -0.1098]], device='cuda:0'), grad: tensor([[ 2.7940e-08, 1.3039e-08, 1.4435e-08, ..., 9.3132e-10, 5.1688e-08, 2.6077e-08], [ 6.6124e-08, 2.1653e-07, 4.1910e-09, ..., 1.4296e-07, 1.6438e-07, 2.2352e-08], [ 8.1956e-08, 1.3076e-06, -1.7043e-07, ..., 9.3132e-10, -5.0385e-07, -1.6391e-07], ..., [ 2.0955e-08, -8.3819e-09, 1.6764e-08, ..., -1.9418e-07, 7.6834e-08, 1.9558e-08], [ 2.7521e-07, 6.1886e-07, 5.8208e-08, ..., 1.3039e-08, 1.4622e-07, 8.8941e-08], [ 1.6531e-07, 1.6671e-07, 2.3283e-09, ..., 6.2864e-08, 6.0536e-09, 2.7940e-09]], device='cuda:0') Epoch 190, bias, value: tensor([-0.0203, -0.0310, -0.0092, -0.0278, -0.0300, 0.0064, 0.0253, -0.0133, 0.0362, -0.0013], device='cuda:0'), grad: tensor([ 3.6694e-07, -5.6531e-07, -1.1120e-06, -1.1966e-05, -6.0257e-07, 9.6187e-06, 1.8487e-07, 6.5146e-07, 2.2650e-06, 1.1288e-06], device='cuda:0') 100 0.0001 changing lr epoch 189, time 256.13, cls_loss 0.0040 cls_loss_mapping 0.0037 cls_loss_causal 0.5327 re_mapping 0.0062 re_causal 0.0156 /// teacc 98.95 lr 0.00010000 Epoch 191, weight, value: tensor([[-0.0980, -0.1382, -0.0675, ..., -0.0619, 0.1561, 0.1539], [-0.1757, -0.1587, -0.1083, ..., -0.1281, -0.1834, -0.0948], [-0.0800, -0.1040, 0.1117, ..., -0.1625, 0.1773, 0.0559], ..., [-0.0986, 0.0619, 0.0513, ..., 0.1580, -0.1575, -0.1715], [-0.2138, 0.0559, -0.1156, ..., 0.0438, -0.0774, -0.1144], [ 0.0230, -0.0911, -0.0807, ..., -0.0776, -0.0519, -0.1103]], device='cuda:0'), grad: tensor([[ 1.0431e-07, 1.4016e-07, 0.0000e+00, ..., 8.1956e-08, -5.3085e-08, 2.9337e-08], [ 7.7300e-08, 3.5949e-07, 0.0000e+00, ..., 2.7195e-07, 2.4214e-08, 4.0978e-08], [ 2.1420e-08, 5.2527e-06, 0.0000e+00, ..., 3.1758e-06, -3.7253e-09, 3.2596e-09], ..., [ 2.3283e-09, -6.9402e-06, 9.3132e-10, ..., -5.2825e-06, 1.3970e-09, 1.3970e-09], [ 1.1437e-06, 2.3600e-06, 0.0000e+00, ..., -8.4750e-08, 3.0501e-07, 6.2073e-07], [-2.5518e-07, 5.6159e-07, -9.3132e-10, ..., 4.9965e-07, 5.8673e-08, 3.5390e-08]], device='cuda:0') Epoch 191, bias, value: tensor([-0.0203, -0.0314, -0.0096, -0.0279, -0.0302, 0.0067, 0.0255, -0.0129, 0.0366, -0.0015], device='cuda:0'), grad: tensor([ 6.7428e-07, 1.1604e-06, 1.4037e-05, -1.1683e-05, 2.6207e-06, 6.1579e-06, -9.3356e-06, -1.6153e-05, 1.3173e-05, -6.7288e-07], device='cuda:0') 100 0.0001 changing lr epoch 190, time 255.80, cls_loss 0.0035 cls_loss_mapping 0.0037 cls_loss_causal 0.5005 re_mapping 0.0064 re_causal 0.0163 /// teacc 98.93 lr 0.00010000 Epoch 192, weight, value: tensor([[-0.0982, -0.1385, -0.0689, ..., -0.0621, 0.1563, 0.1540], [-0.1759, -0.1591, -0.1082, ..., -0.1286, -0.1835, -0.0948], [-0.0801, -0.1045, 0.1125, ..., -0.1629, 0.1778, 0.0559], ..., [-0.0990, 0.0618, 0.0512, ..., 0.1583, -0.1586, -0.1720], [-0.2140, 0.0559, -0.1162, ..., 0.0437, -0.0775, -0.1146], [ 0.0224, -0.0912, -0.0804, ..., -0.0782, -0.0521, -0.1108]], device='cuda:0'), grad: tensor([[ 3.5390e-08, 2.5146e-08, 3.7253e-09, ..., 2.0489e-08, -1.6764e-08, -5.1223e-09], [ 3.4925e-08, 6.4820e-07, 1.2992e-07, ..., 6.2771e-07, 6.2399e-08, 2.3283e-09], [ 6.0536e-09, 2.3749e-08, -4.9826e-08, ..., 2.7474e-08, -7.8697e-08, 9.3132e-10], ..., [ 2.5146e-08, -8.3633e-07, -9.5461e-08, ..., -1.0524e-06, 6.5193e-09, 9.3132e-10], [ 7.8231e-08, 2.0489e-08, 1.8626e-09, ..., 8.3819e-09, 2.0023e-08, 2.1420e-08], [ 2.2491e-07, 1.5274e-07, -2.2352e-08, ..., 2.3516e-07, 7.9162e-09, 5.5879e-09]], device='cuda:0') Epoch 192, bias, value: tensor([-0.0203, -0.0316, -0.0095, -0.0277, -0.0291, 0.0065, 0.0257, -0.0129, 0.0367, -0.0022], device='cuda:0'), grad: tensor([ 2.2165e-07, 1.7658e-06, 1.6158e-07, -3.3015e-07, 7.9162e-08, 1.0077e-06, -7.1479e-07, -2.2352e-06, 5.4762e-07, -5.2154e-07], device='cuda:0') 100 0.0001 changing lr epoch 191, time 255.98, cls_loss 0.0045 cls_loss_mapping 0.0031 cls_loss_causal 0.5416 re_mapping 0.0063 re_causal 0.0157 /// teacc 98.98 lr 0.00010000 Epoch 193, weight, value: tensor([[-0.0990, -0.1393, -0.0691, ..., -0.0625, 0.1571, 0.1546], [-0.1761, -0.1593, -0.1072, ..., -0.1287, -0.1836, -0.0950], [-0.0802, -0.1074, 0.1129, ..., -0.1660, 0.1778, 0.0557], ..., [-0.0996, 0.0627, 0.0508, ..., 0.1597, -0.1591, -0.1723], [-0.2147, 0.0560, -0.1163, ..., 0.0442, -0.0780, -0.1157], [ 0.0216, -0.0915, -0.0807, ..., -0.0789, -0.0522, -0.1115]], device='cuda:0'), grad: tensor([[-4.0140e-07, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, -2.1935e-05, -1.1869e-05], [ 2.9802e-08, 1.1176e-08, 0.0000e+00, ..., 1.5832e-08, 1.4808e-07, 7.0781e-08], [ 7.6368e-08, 5.5879e-09, -6.5193e-09, ..., 7.4506e-09, -2.1055e-05, 1.8161e-07], ..., [ 2.1420e-08, -4.3772e-08, 0.0000e+00, ..., -6.1467e-08, 4.9360e-08, 2.1420e-08], [ 3.9600e-06, 9.3132e-10, 0.0000e+00, ..., -8.2888e-08, 1.4016e-06, 5.6066e-07], [-3.4552e-07, 1.9558e-08, 0.0000e+00, ..., 3.3528e-08, 4.9546e-07, 2.5611e-07]], device='cuda:0') Epoch 193, bias, value: tensor([-0.0200, -0.0310, -0.0111, -0.0276, -0.0287, 0.0064, 0.0259, -0.0125, 0.0366, -0.0027], device='cuda:0'), grad: tensor([-2.7746e-05, -6.5193e-07, -5.0664e-05, 9.4995e-07, 4.7445e-05, 2.5034e-06, 8.2925e-06, 2.9709e-07, 1.4655e-05, 4.9062e-06], device='cuda:0') 100 0.0001 changing lr epoch 192, time 255.48, cls_loss 0.0043 cls_loss_mapping 0.0058 cls_loss_causal 0.5499 re_mapping 0.0064 re_causal 0.0161 /// teacc 98.92 lr 0.00010000 Epoch 194, weight, value: tensor([[-0.0992, -0.1398, -0.0692, ..., -0.0629, 0.1574, 0.1549], [-0.1763, -0.1577, -0.1072, ..., -0.1294, -0.1837, -0.0953], [-0.0802, -0.1074, 0.1135, ..., -0.1661, 0.1783, 0.0562], ..., [-0.1007, 0.0614, 0.0507, ..., 0.1602, -0.1598, -0.1725], [-0.2152, 0.0560, -0.1172, ..., 0.0441, -0.0782, -0.1160], [ 0.0217, -0.0916, -0.0807, ..., -0.0793, -0.0524, -0.1123]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 4.6566e-09, 0.0000e+00, ..., -3.2596e-07, -2.7325e-06, -2.4736e-06], [ 2.2352e-08, 6.2399e-08, 0.0000e+00, ..., 4.6566e-08, 1.3569e-06, 4.2282e-07], [ 7.4506e-09, 9.3132e-09, 0.0000e+00, ..., 3.0734e-08, -5.2340e-07, -8.0559e-07], ..., [ 1.4901e-08, -4.1630e-07, 0.0000e+00, ..., -3.4645e-07, 5.4110e-07, 3.1665e-07], [ 5.3085e-08, 5.4017e-08, 0.0000e+00, ..., -2.8778e-07, -1.7369e-06, 8.4471e-07], [ 2.0489e-08, 7.8231e-08, 0.0000e+00, ..., 3.5483e-07, 1.0524e-06, 6.9477e-07]], device='cuda:0') Epoch 194, bias, value: tensor([-0.0200, -0.0289, -0.0107, -0.0271, -0.0293, 0.0060, 0.0261, -0.0145, 0.0362, -0.0025], device='cuda:0'), grad: tensor([-6.7391e-06, 1.2908e-06, 4.4741e-06, 2.4047e-06, 2.3916e-06, 1.4612e-06, 2.6990e-06, 1.6252e-06, -1.2174e-05, 2.4885e-06], device='cuda:0') 100 0.0001 changing lr epoch 193, time 255.66, cls_loss 0.0049 cls_loss_mapping 0.0052 cls_loss_causal 0.4969 re_mapping 0.0063 re_causal 0.0154 /// teacc 98.96 lr 0.00010000 Epoch 195, weight, value: tensor([[-0.0998, -0.1408, -0.0696, ..., -0.0632, 0.1573, 0.1548], [-0.1773, -0.1577, -0.1072, ..., -0.1298, -0.1838, -0.0953], [-0.0799, -0.1080, 0.1138, ..., -0.1666, 0.1784, 0.0562], ..., [-0.1013, 0.0602, 0.0507, ..., 0.1595, -0.1603, -0.1730], [-0.2154, 0.0586, -0.1178, ..., 0.0464, -0.0784, -0.1166], [ 0.0215, -0.0919, -0.0806, ..., -0.0796, -0.0524, -0.1127]], device='cuda:0'), grad: tensor([[ 1.8813e-07, 3.3528e-08, 0.0000e+00, ..., 6.5193e-09, -1.2815e-05, -4.0568e-06], [ 1.6764e-08, 6.7055e-08, 0.0000e+00, ..., 5.4017e-08, 1.0775e-06, 9.3132e-10], [ 1.0245e-08, -1.0896e-07, 0.0000e+00, ..., 2.7940e-09, -2.4438e-06, 4.4703e-08], ..., [ 2.1420e-08, -4.2878e-06, 0.0000e+00, ..., -6.5342e-06, 1.1493e-06, 1.8626e-09], [ 3.2596e-08, -2.5146e-08, 0.0000e+00, ..., 1.4901e-08, 1.1086e-05, 3.5521e-06], [ 9.9652e-08, 4.2319e-06, 0.0000e+00, ..., 6.2697e-06, 2.2613e-06, 5.9232e-07]], device='cuda:0') Epoch 195, bias, value: tensor([-0.0205, -0.0288, -0.0103, -0.0260, -0.0293, 0.0046, 0.0266, -0.0150, 0.0383, -0.0027], device='cuda:0'), grad: tensor([-1.9297e-05, 2.6137e-05, -6.9499e-05, 6.4075e-07, 6.6124e-07, -1.6484e-07, -2.6077e-07, 1.5058e-05, 2.3097e-05, 2.3484e-05], device='cuda:0') 100 0.0001 changing lr epoch 194, time 255.81, cls_loss 0.0039 cls_loss_mapping 0.0037 cls_loss_causal 0.5165 re_mapping 0.0060 re_causal 0.0154 /// teacc 98.98 lr 0.00010000 Epoch 196, weight, value: tensor([[-0.1000, -0.1412, -0.0698, ..., -0.0637, 0.1575, 0.1550], [-0.1774, -0.1579, -0.1070, ..., -0.1300, -0.1842, -0.0959], [-0.0800, -0.1086, 0.1138, ..., -0.1673, 0.1788, 0.0570], ..., [-0.1021, 0.0605, 0.0507, ..., 0.1599, -0.1607, -0.1735], [-0.2159, 0.0584, -0.1179, ..., 0.0471, -0.0779, -0.1169], [ 0.0213, -0.0920, -0.0812, ..., -0.0798, -0.0526, -0.1131]], device='cuda:0'), grad: tensor([[-3.9767e-07, 2.0489e-08, 0.0000e+00, ..., 4.1910e-08, -3.3844e-06, -1.8496e-06], [ 1.3039e-08, 8.3819e-08, 0.0000e+00, ..., 1.6484e-07, 3.9116e-08, 1.5832e-08], [ 1.3970e-08, 7.7859e-07, 0.0000e+00, ..., 1.6941e-06, 1.5367e-07, 7.3574e-08], ..., [ 5.2154e-08, -9.1735e-07, 0.0000e+00, ..., -1.9949e-06, 1.8626e-07, 1.2107e-07], [ 1.5832e-08, 1.8626e-08, 0.0000e+00, ..., -5.0291e-08, 1.5181e-07, 6.3330e-08], [ 7.1712e-08, 7.2643e-08, 0.0000e+00, ..., 7.4506e-08, 1.3215e-06, 6.6962e-07]], device='cuda:0') Epoch 196, bias, value: tensor([-0.0205, -0.0289, -0.0103, -0.0261, -0.0293, 0.0046, 0.0261, -0.0147, 0.0385, -0.0029], device='cuda:0'), grad: tensor([-5.0068e-06, 3.0734e-06, 1.0312e-05, -2.4945e-05, -6.2771e-06, 2.6584e-05, 3.5018e-06, -9.8422e-06, 2.9150e-07, 2.3916e-06], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 195---------------------------------------------------- epoch 195, time 272.05, cls_loss 0.0038 cls_loss_mapping 0.0039 cls_loss_causal 0.5316 re_mapping 0.0060 re_causal 0.0156 /// teacc 99.05 lr 0.00010000 Epoch 197, weight, value: tensor([[-0.1003, -0.1416, -0.0703, ..., -0.0644, 0.1576, 0.1549], [-0.1776, -0.1580, -0.1070, ..., -0.1303, -0.1844, -0.0961], [-0.0801, -0.1099, 0.1140, ..., -0.1697, 0.1790, 0.0558], ..., [-0.1024, 0.0609, 0.0506, ..., 0.1606, -0.1610, -0.1739], [-0.2165, 0.0584, -0.1179, ..., 0.0475, -0.0770, -0.1180], [ 0.0214, -0.0922, -0.0812, ..., -0.0801, -0.0529, -0.1139]], device='cuda:0'), grad: tensor([[ 1.7136e-07, 5.5879e-09, 0.0000e+00, ..., -3.1292e-07, -3.6731e-06, -4.5598e-06], [ 3.0175e-07, 2.1420e-08, 0.0000e+00, ..., 6.6124e-08, 2.1420e-08, 1.7695e-08], [ 1.8720e-07, 7.4506e-09, 0.0000e+00, ..., -3.0994e-05, -9.6187e-06, -1.0775e-06], ..., [ 2.3097e-07, -4.7497e-08, 0.0000e+00, ..., 3.0845e-05, 9.6411e-06, 1.1232e-06], [ 5.6997e-06, 2.5146e-08, 0.0000e+00, ..., 2.0396e-07, 2.8871e-08, 3.8184e-08], [ 2.6450e-07, 4.2841e-08, 0.0000e+00, ..., 7.6368e-08, 1.9185e-07, 1.9278e-07]], device='cuda:0') Epoch 197, bias, value: tensor([-0.0208, -0.0289, -0.0115, -0.0259, -0.0299, 0.0047, 0.0260, -0.0145, 0.0389, -0.0025], device='cuda:0'), grad: tensor([-6.8992e-06, -1.0934e-06, -2.0278e-04, 7.3574e-06, 2.6356e-07, -4.6104e-05, 4.1217e-05, 2.0480e-04, 7.1228e-06, -3.8072e-06], device='cuda:0') 100 0.0001 changing lr epoch 196, time 255.76, cls_loss 0.0043 cls_loss_mapping 0.0040 cls_loss_causal 0.5381 re_mapping 0.0060 re_causal 0.0155 /// teacc 98.97 lr 0.00010000 Epoch 198, weight, value: tensor([[-0.1003, -0.1426, -0.0705, ..., -0.0637, 0.1577, 0.1554], [-0.1780, -0.1599, -0.1071, ..., -0.1316, -0.1847, -0.0968], [-0.0802, -0.1101, 0.1141, ..., -0.1698, 0.1800, 0.0572], ..., [-0.1027, 0.0623, 0.0506, ..., 0.1616, -0.1614, -0.1748], [-0.2169, 0.0583, -0.1180, ..., 0.0474, -0.0775, -0.1190], [ 0.0208, -0.0929, -0.0812, ..., -0.0808, -0.0529, -0.1153]], device='cuda:0'), grad: tensor([[ 8.1025e-08, 3.7253e-09, 1.8626e-09, ..., 2.1420e-08, -2.2724e-07, -1.7975e-07], [ 8.5682e-08, 6.4727e-07, 1.8626e-09, ..., 1.8533e-07, 2.2352e-08, 1.1176e-08], [ 2.0489e-08, 3.8557e-07, -8.3819e-09, ..., 5.5134e-07, -6.2399e-08, -1.8626e-08], ..., [ 5.4017e-08, 1.6335e-06, 1.8626e-09, ..., -3.6601e-07, 3.2596e-08, 1.9558e-08], [ 1.9465e-07, -2.6822e-07, 9.3132e-10, ..., -2.2724e-07, 1.9558e-08, 1.2107e-08], [ 2.8592e-07, 1.3039e-08, 0.0000e+00, ..., 7.3574e-08, 7.4506e-08, 5.7742e-08]], device='cuda:0') Epoch 198, bias, value: tensor([-0.0211, -0.0301, -0.0110, -0.0258, -0.0298, 0.0047, 0.0261, -0.0134, 0.0384, -0.0023], device='cuda:0'), grad: tensor([-2.0489e-07, 2.5760e-06, 2.0899e-06, 3.3647e-05, 1.2405e-06, -3.9309e-05, 2.8480e-06, 5.3868e-06, -4.1164e-06, -4.1015e-06], device='cuda:0') 100 0.0001 changing lr epoch 197, time 251.39, cls_loss 0.0046 cls_loss_mapping 0.0046 cls_loss_causal 0.5418 re_mapping 0.0063 re_causal 0.0157 /// teacc 99.02 lr 0.00010000 Epoch 199, weight, value: tensor([[-0.1020, -0.1433, -0.0706, ..., -0.0640, 0.1583, 0.1549], [-0.1783, -0.1600, -0.1071, ..., -0.1319, -0.1848, -0.0973], [-0.0801, -0.1105, 0.1141, ..., -0.1701, 0.1803, 0.0580], ..., [-0.1030, 0.0622, 0.0506, ..., 0.1619, -0.1621, -0.1762], [-0.2178, 0.0585, -0.1181, ..., 0.0454, -0.0780, -0.1212], [ 0.0198, -0.0934, -0.0814, ..., -0.0810, -0.0550, -0.1189]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 2.4214e-08, -3.2317e-07, -2.0303e-07], [ 1.8626e-09, 6.5193e-09, 1.8626e-09, ..., 1.6403e-04, 6.5193e-09, 4.6566e-09], [ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 2.4494e-07, -3.5390e-08, 5.5879e-09], ..., [ 1.8626e-09, -1.8626e-09, 7.4506e-09, ..., -1.6797e-04, 8.3819e-09, 3.7253e-09], [ 1.0710e-07, 1.3039e-08, 9.3132e-10, ..., -5.9791e-07, 1.0803e-07, 6.9849e-08], [ 9.3132e-10, 4.6566e-09, -2.2352e-08, ..., 3.8482e-06, 1.3877e-07, 9.2201e-08]], device='cuda:0') Epoch 199, bias, value: tensor([-0.0209, -0.0302, -0.0106, -0.0250, -0.0300, 0.0054, 0.0267, -0.0134, 0.0364, -0.0030], device='cuda:0'), grad: tensor([-1.0990e-07, 5.2547e-04, 8.3447e-07, 5.0142e-06, 3.7774e-06, 3.9767e-07, -7.3574e-08, -5.3549e-04, 3.4738e-07, -1.2182e-06], device='cuda:0') 100 0.0001 changing lr epoch 198, time 250.75, cls_loss 0.0040 cls_loss_mapping 0.0040 cls_loss_causal 0.5100 re_mapping 0.0061 re_causal 0.0155 /// teacc 98.99 lr 0.00010000 Epoch 200, weight, value: tensor([[-0.1022, -0.1439, -0.0709, ..., -0.0641, 0.1586, 0.1551], [-0.1787, -0.1601, -0.1070, ..., -0.1328, -0.1850, -0.0974], [-0.0802, -0.1109, 0.1141, ..., -0.1706, 0.1805, 0.0579], ..., [-0.1036, 0.0622, 0.0505, ..., 0.1626, -0.1626, -0.1765], [-0.2184, 0.0585, -0.1177, ..., 0.0448, -0.0783, -0.1223], [ 0.0204, -0.0935, -0.0815, ..., -0.0813, -0.0553, -0.1197]], device='cuda:0'), grad: tensor([[ 2.2352e-08, 7.1712e-08, 0.0000e+00, ..., 1.8720e-07, -1.1018e-06, -7.2457e-07], [ 1.5832e-07, 9.2201e-08, 0.0000e+00, ..., 1.7229e-07, 1.0990e-07, 7.7300e-08], [ 1.5832e-08, 1.1111e-06, 0.0000e+00, ..., 1.4976e-06, 1.8626e-08, 4.1910e-08], ..., [ 6.5193e-09, -2.6971e-06, 0.0000e+00, ..., -4.5598e-06, 2.3283e-08, 2.7940e-09], [ 1.0338e-07, 5.8394e-07, 0.0000e+00, ..., 7.7859e-07, 1.1362e-07, 7.4506e-08], [ 1.6764e-08, 1.5181e-07, 0.0000e+00, ..., 3.4273e-07, 9.0338e-08, 5.5879e-08]], device='cuda:0') Epoch 200, bias, value: tensor([-0.0209, -0.0292, -0.0118, -0.0252, -0.0299, 0.0058, 0.0274, -0.0141, 0.0358, -0.0030], device='cuda:0'), grad: tensor([-1.1390e-06, 6.9477e-07, 2.5630e-06, 1.1111e-06, 2.5276e-06, 1.0535e-05, -1.0252e-05, -7.4394e-06, 1.3635e-06, 9.3132e-09], device='cuda:0') 100 0.0001 changing lr epoch 199, time 251.15, cls_loss 0.0036 cls_loss_mapping 0.0031 cls_loss_causal 0.5240 re_mapping 0.0060 re_causal 0.0150 /// teacc 98.99 lr 0.00010000 Epoch 201, weight, value: tensor([[-0.1026, -0.1445, -0.0715, ..., -0.0644, 0.1588, 0.1551], [-0.1792, -0.1601, -0.1070, ..., -0.1333, -0.1852, -0.0976], [-0.0799, -0.1112, 0.1151, ..., -0.1711, 0.1806, 0.0582], ..., [-0.1038, 0.0622, 0.0504, ..., 0.1630, -0.1630, -0.1770], [-0.2181, 0.0584, -0.1183, ..., 0.0454, -0.0777, -0.1226], [ 0.0188, -0.0939, -0.0816, ..., -0.0816, -0.0556, -0.1202]], device='cuda:0'), grad: tensor([[ 2.4159e-06, 9.3132e-10, 9.3132e-10, ..., -0.0000e+00, -4.4964e-06, 1.4370e-06], [ 1.5832e-08, 1.1735e-07, 4.6566e-09, ..., -1.6494e-06, 5.3924e-07, 2.1234e-07], [ 2.1420e-08, 3.7253e-09, 0.0000e+00, ..., 1.6764e-08, -6.8285e-06, -2.8498e-06], ..., [ 5.5879e-09, -1.9185e-07, 1.4901e-08, ..., -2.9057e-07, 2.0582e-06, 8.3260e-07], [ 1.8626e-07, 1.8626e-09, 2.7940e-09, ..., -2.7940e-08, 3.8110e-06, 1.7760e-06], [ 4.5635e-08, 3.3528e-08, -4.0978e-08, ..., 9.9652e-08, 5.1372e-06, 1.1846e-06]], device='cuda:0') Epoch 201, bias, value: tensor([-0.0209, -0.0293, -0.0119, -0.0249, -0.0299, 0.0053, 0.0276, -0.0141, 0.0362, -0.0032], device='cuda:0'), grad: tensor([-1.3337e-06, -1.3933e-05, -1.4499e-05, 2.3600e-06, 1.3039e-05, 1.7798e-06, -1.1519e-05, 4.4033e-06, 9.7007e-06, 9.9838e-06], device='cuda:0') 100 0.0001 changing lr epoch 200, time 250.88, cls_loss 0.0037 cls_loss_mapping 0.0031 cls_loss_causal 0.5298 re_mapping 0.0061 re_causal 0.0153 /// teacc 98.92 lr 0.00010000 Epoch 202, weight, value: tensor([[-0.1027, -0.1448, -0.0689, ..., -0.0644, 0.1598, 0.1561], [-0.1793, -0.1602, -0.1067, ..., -0.1332, -0.1854, -0.0978], [-0.0800, -0.1120, 0.1155, ..., -0.1715, 0.1813, 0.0584], ..., [-0.1040, 0.0624, 0.0499, ..., 0.1633, -0.1643, -0.1779], [-0.2186, 0.0584, -0.1182, ..., 0.0455, -0.0779, -0.1231], [ 0.0183, -0.0940, -0.0827, ..., -0.0819, -0.0567, -0.1229]], device='cuda:0'), grad: tensor([[ 9.3132e-09, 5.5879e-09, 0.0000e+00, ..., 5.3085e-08, 5.9698e-07, 1.8999e-07], [ 1.8626e-08, 1.6578e-07, 0.0000e+00, ..., 6.1467e-06, 1.4165e-06, 4.6659e-07], [ 9.3132e-10, 4.3679e-07, 0.0000e+00, ..., 1.0049e-06, -9.8050e-06, -3.2280e-06], ..., [ 6.3330e-08, -5.0850e-07, 0.0000e+00, ..., -1.6075e-06, 2.5053e-06, 8.2515e-07], [ 3.6322e-08, 1.9465e-07, 0.0000e+00, ..., -6.0536e-06, 2.8722e-06, 9.4622e-07], [ 3.1665e-08, 1.1828e-07, 0.0000e+00, ..., 6.5193e-08, 1.3877e-07, 4.6566e-08]], device='cuda:0') Epoch 202, bias, value: tensor([-0.0202, -0.0291, -0.0121, -0.0251, -0.0298, 0.0051, 0.0280, -0.0141, 0.0361, -0.0032], device='cuda:0'), grad: tensor([ 2.4624e-06, 2.3261e-05, -3.4869e-05, 5.1446e-06, 9.0338e-07, 5.8301e-07, 2.5406e-06, 7.7337e-06, -8.4788e-06, 6.9477e-07], device='cuda:0') 100 0.0001 changing lr epoch 201, time 250.64, cls_loss 0.0037 cls_loss_mapping 0.0033 cls_loss_causal 0.5372 re_mapping 0.0060 re_causal 0.0155 /// teacc 99.01 lr 0.00010000 Epoch 203, weight, value: tensor([[-0.1019, -0.1452, -0.0665, ..., -0.0642, 0.1613, 0.1577], [-0.1794, -0.1612, -0.1064, ..., -0.1365, -0.1855, -0.0979], [-0.0802, -0.1125, 0.1150, ..., -0.1718, 0.1810, 0.0575], ..., [-0.1047, 0.0632, 0.0499, ..., 0.1652, -0.1648, -0.1783], [-0.2189, 0.0583, -0.1190, ..., 0.0455, -0.0781, -0.1234], [ 0.0179, -0.0943, -0.0830, ..., -0.0819, -0.0569, -0.1236]], device='cuda:0'), grad: tensor([[ 2.6263e-07, 2.1886e-07, 1.4901e-08, ..., 1.5832e-08, -1.1651e-06, -9.1270e-07], [ 1.0803e-07, 4.0606e-07, 9.3132e-10, ..., 1.1362e-07, 1.7695e-08, 1.5832e-08], [ 5.3085e-07, 6.2585e-07, -1.9372e-07, ..., 1.0338e-07, -6.4354e-07, -1.1176e-08], ..., [ 2.5146e-07, 1.9416e-05, 1.8720e-07, ..., 6.1058e-06, 6.5938e-07, 5.3085e-08], [ 4.6641e-06, 2.4401e-06, 2.7008e-08, ..., 3.7663e-06, 1.0617e-07, 1.8999e-07], [ 3.7253e-07, 5.4948e-07, 2.7940e-09, ..., 2.6636e-07, 2.0023e-07, 1.5553e-07]], device='cuda:0') Epoch 203, bias, value: tensor([-0.0190, -0.0300, -0.0123, -0.0254, -0.0305, 0.0053, 0.0276, -0.0134, 0.0360, -0.0027], device='cuda:0'), grad: tensor([-1.2759e-06, 8.0746e-07, 1.0533e-06, -3.6240e-05, 1.5926e-07, -3.6433e-06, -3.1851e-06, 3.0696e-05, 1.0252e-05, 1.3830e-06], device='cuda:0') 100 0.0001 changing lr epoch 202, time 250.51, cls_loss 0.0034 cls_loss_mapping 0.0034 cls_loss_causal 0.5381 re_mapping 0.0060 re_causal 0.0153 /// teacc 98.97 lr 0.00010000 Epoch 204, weight, value: tensor([[-0.1021, -0.1462, -0.0663, ..., -0.0643, 0.1615, 0.1580], [-0.1798, -0.1613, -0.1065, ..., -0.1370, -0.1856, -0.0982], [-0.0803, -0.1129, 0.1160, ..., -0.1720, 0.1812, 0.0577], ..., [-0.1054, 0.0633, 0.0497, ..., 0.1656, -0.1658, -0.1791], [-0.2199, 0.0584, -0.1185, ..., 0.0455, -0.0781, -0.1240], [ 0.0162, -0.0945, -0.0830, ..., -0.0819, -0.0570, -0.1243]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 9.3132e-10, 0.0000e+00, ..., -2.7940e-09, -2.3749e-07, -1.9278e-07], [ 1.2107e-08, 1.7695e-08, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 9.3132e-10], [ 3.7253e-09, 4.6566e-09, 0.0000e+00, ..., 1.8626e-09, 1.8626e-09, 2.7940e-09], ..., [ 1.0245e-08, 2.7940e-09, 0.0000e+00, ..., -5.4017e-08, 1.8626e-09, 9.3132e-10], [ 2.4214e-08, 1.4901e-08, 0.0000e+00, ..., -3.5390e-08, 2.9802e-08, 2.4214e-08], [ 5.5879e-09, 1.2107e-08, 0.0000e+00, ..., 6.0536e-08, 1.6019e-07, 1.3132e-07]], device='cuda:0') Epoch 204, bias, value: tensor([-0.0191, -0.0301, -0.0122, -0.0254, -0.0321, 0.0051, 0.0280, -0.0134, 0.0361, -0.0013], device='cuda:0'), grad: tensor([-3.6415e-07, -4.2282e-07, 4.2841e-08, -1.2014e-07, 5.3085e-08, 3.9116e-08, 7.1712e-08, 1.4249e-07, -1.1083e-07, 6.4541e-07], device='cuda:0') 100 0.0001 changing lr epoch 203, time 250.91, cls_loss 0.0036 cls_loss_mapping 0.0028 cls_loss_causal 0.5382 re_mapping 0.0059 re_causal 0.0154 /// teacc 99.01 lr 0.00010000 Epoch 205, weight, value: tensor([[-0.1030, -0.1473, -0.0664, ..., -0.0647, 0.1611, 0.1577], [-0.1800, -0.1614, -0.1065, ..., -0.1376, -0.1857, -0.0984], [-0.0805, -0.1131, 0.1165, ..., -0.1726, 0.1815, 0.0577], ..., [-0.1066, 0.0634, 0.0496, ..., 0.1662, -0.1660, -0.1794], [-0.2203, 0.0583, -0.1187, ..., 0.0456, -0.0782, -0.1241], [ 0.0152, -0.0948, -0.0831, ..., -0.0824, -0.0567, -0.1247]], device='cuda:0'), grad: tensor([[ 7.8231e-08, 1.8626e-09, 0.0000e+00, ..., 7.1712e-08, -1.4184e-06, -7.3947e-07], [ 3.8184e-08, 4.3772e-08, 0.0000e+00, ..., 7.2643e-08, 4.4052e-07, 3.4645e-07], [ 2.7940e-09, 1.2480e-07, 0.0000e+00, ..., 1.1455e-07, -3.9954e-07, -3.3807e-07], ..., [ 2.4214e-08, -7.9256e-07, 0.0000e+00, ..., -8.1304e-07, 6.1467e-08, 3.8184e-08], [ 1.6484e-07, 3.9116e-08, 0.0000e+00, ..., 1.8161e-07, 7.3574e-08, 4.0978e-08], [-1.5832e-08, 1.0151e-07, 0.0000e+00, ..., 1.6298e-07, 7.5530e-07, 3.9488e-07]], device='cuda:0') Epoch 205, bias, value: tensor([-0.0199, -0.0299, -0.0130, -0.0256, -0.0311, 0.0053, 0.0285, -0.0134, 0.0360, -0.0019], device='cuda:0'), grad: tensor([-2.5816e-06, 1.9819e-06, -1.3299e-06, 8.9221e-07, -5.1688e-07, -2.0750e-06, 2.5332e-06, -9.8534e-07, 5.4669e-07, 1.5339e-06], device='cuda:0') 100 0.0001 changing lr epoch 204, time 250.46, cls_loss 0.0045 cls_loss_mapping 0.0043 cls_loss_causal 0.5049 re_mapping 0.0056 re_causal 0.0134 /// teacc 98.98 lr 0.00010000 Epoch 206, weight, value: tensor([[-0.1024, -0.1488, -0.0665, ..., -0.0636, 0.1623, 0.1589], [-0.1802, -0.1615, -0.1051, ..., -0.1382, -0.1859, -0.0987], [-0.0806, -0.1129, 0.1169, ..., -0.1730, 0.1820, 0.0583], ..., [-0.1073, 0.0636, 0.0487, ..., 0.1671, -0.1670, -0.1813], [-0.2210, 0.0582, -0.1189, ..., 0.0455, -0.0795, -0.1264], [ 0.0149, -0.0952, -0.0835, ..., -0.0830, -0.0572, -0.1256]], device='cuda:0'), grad: tensor([[ 1.5832e-08, 1.0431e-07, 9.3132e-10, ..., 1.6205e-07, 7.5623e-07, 2.1793e-07], [ 3.3528e-08, 1.6391e-07, 2.7940e-09, ..., 2.6356e-07, 6.5938e-07, 4.3679e-07], [ 1.0245e-08, -4.9658e-06, 0.0000e+00, ..., -1.8068e-07, -1.0766e-05, -1.6671e-06], ..., [ 5.5879e-09, 5.4948e-07, 1.8626e-09, ..., -8.7824e-07, 1.4482e-06, 3.0547e-07], [ 9.5088e-07, 1.9744e-07, 1.8626e-09, ..., 1.1399e-06, 4.7460e-06, 9.0804e-07], [ 4.6566e-09, 7.1712e-08, -1.6764e-08, ..., 1.0245e-07, 4.9360e-08, 2.2352e-08]], device='cuda:0') Epoch 206, bias, value: tensor([-0.0186, -0.0296, -0.0126, -0.0258, -0.0320, 0.0051, 0.0280, -0.0136, 0.0354, -0.0013], device='cuda:0'), grad: tensor([ 2.2445e-06, 2.3115e-06, -3.6955e-05, 1.6183e-05, -3.2037e-07, 6.7335e-07, -2.0396e-06, 4.4256e-06, 1.4037e-05, -5.2154e-07], device='cuda:0') 100 0.0001 changing lr epoch 205, time 250.74, cls_loss 0.0034 cls_loss_mapping 0.0028 cls_loss_causal 0.4910 re_mapping 0.0060 re_causal 0.0146 /// teacc 98.90 lr 0.00010000 Epoch 207, weight, value: tensor([[-0.1027, -0.1493, -0.0666, ..., -0.0637, 0.1625, 0.1591], [-0.1813, -0.1617, -0.1052, ..., -0.1387, -0.1864, -0.1006], [-0.0797, -0.1128, 0.1173, ..., -0.1726, 0.1837, 0.0621], ..., [-0.1075, 0.0637, 0.0486, ..., 0.1673, -0.1697, -0.1856], [-0.2195, 0.0581, -0.1194, ..., 0.0468, -0.0775, -0.1253], [ 0.0146, -0.0952, -0.0831, ..., -0.0825, -0.0574, -0.1261]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 7.4506e-09, 3.4459e-08, ..., 0.0000e+00, -1.1176e-07, -1.0245e-07], [ 2.7008e-08, 8.9407e-08, 1.8626e-09, ..., 8.3819e-09, 2.7940e-08, 1.8626e-08], [ 2.7008e-08, 1.4715e-07, -1.9558e-07, ..., 5.4017e-08, -4.4983e-07, -1.0990e-07], ..., [ 1.6764e-08, -1.2107e-07, 8.4750e-08, ..., 5.6811e-08, 1.4435e-07, 1.1176e-08], [ 1.3690e-07, 2.2259e-07, 9.3132e-09, ..., 1.5553e-07, 1.1642e-07, 7.3574e-08], [ 7.7300e-08, 4.4703e-08, -6.5193e-09, ..., 9.0338e-08, 9.9652e-08, 6.4261e-08]], device='cuda:0') Epoch 207, bias, value: tensor([-0.0187, -0.0299, -0.0117, -0.0257, -0.0317, 0.0049, 0.0265, -0.0137, 0.0363, -0.0009], device='cuda:0'), grad: tensor([ 6.2399e-08, 1.9744e-07, -1.4175e-06, -5.6997e-07, -1.0589e-06, -2.5425e-07, 2.3469e-07, 1.6363e-06, 7.5623e-07, 4.2655e-07], device='cuda:0') 100 0.0001 changing lr epoch 206, time 250.91, cls_loss 0.0043 cls_loss_mapping 0.0040 cls_loss_causal 0.5214 re_mapping 0.0058 re_causal 0.0146 /// teacc 98.98 lr 0.00010000 Epoch 208, weight, value: tensor([[-0.1032, -0.1508, -0.0664, ..., -0.0639, 0.1626, 0.1598], [-0.1820, -0.1618, -0.1052, ..., -0.1392, -0.1875, -0.1021], [-0.0798, -0.1135, 0.1172, ..., -0.1733, 0.1839, 0.0626], ..., [-0.1078, 0.0642, 0.0485, ..., 0.1682, -0.1708, -0.1867], [-0.2192, 0.0582, -0.1187, ..., 0.0476, -0.0770, -0.1260], [ 0.0146, -0.0967, -0.0829, ..., -0.0840, -0.0564, -0.1268]], device='cuda:0'), grad: tensor([[-4.5635e-06, 3.7253e-09, 1.4901e-08, ..., -4.0740e-05, -5.4479e-05, -5.6982e-05], [ 1.1176e-08, 6.5193e-08, 1.3039e-08, ..., 2.3842e-07, 1.4901e-07, 5.9605e-08], [ 3.7253e-09, 1.4901e-08, -4.7032e-07, ..., 7.1712e-08, -2.0918e-06, 2.7008e-08], ..., [ 1.3039e-08, -1.0710e-07, 3.1665e-07, ..., -1.6764e-08, 1.6177e-06, 1.1269e-07], [ 1.0710e-07, -2.1439e-06, 1.8626e-09, ..., -3.5435e-05, 1.4994e-07, 1.2945e-07], [ 4.8429e-08, 2.0489e-08, 9.3132e-10, ..., 3.3993e-07, 2.4810e-06, 4.8894e-07]], device='cuda:0') Epoch 208, bias, value: tensor([-0.0192, -0.0299, -0.0115, -0.0257, -0.0327, 0.0047, 0.0263, -0.0136, 0.0370, -0.0003], device='cuda:0'), grad: tensor([-1.4341e-04, 8.2236e-07, -4.9211e-06, 3.6061e-06, -3.3438e-05, 1.5271e-04, 4.7296e-05, 4.8652e-06, -5.7936e-05, 3.0145e-05], device='cuda:0') 100 0.0001 changing lr epoch 207, time 250.54, cls_loss 0.0039 cls_loss_mapping 0.0031 cls_loss_causal 0.5036 re_mapping 0.0062 re_causal 0.0148 /// teacc 98.92 lr 0.00010000 Epoch 209, weight, value: tensor([[-0.1037, -0.1520, -0.0665, ..., -0.0634, 0.1629, 0.1601], [-0.1835, -0.1620, -0.1052, ..., -0.1399, -0.1876, -0.1024], [-0.0800, -0.1140, 0.1173, ..., -0.1736, 0.1842, 0.0626], ..., [-0.1083, 0.0646, 0.0485, ..., 0.1685, -0.1712, -0.1868], [-0.2201, 0.0591, -0.1187, ..., 0.0480, -0.0774, -0.1273], [ 0.0141, -0.0974, -0.0830, ..., -0.0847, -0.0566, -0.1274]], device='cuda:0'), grad: tensor([[ 7.1712e-08, 1.1828e-07, 0.0000e+00, ..., 2.7847e-07, 5.9605e-08, 3.3528e-08], [ 1.0245e-08, 3.4459e-08, 0.0000e+00, ..., 6.9849e-08, 6.5193e-09, 3.7253e-09], [ 1.3039e-08, -2.4773e-06, 0.0000e+00, ..., 2.8871e-08, 1.3039e-08, -5.9512e-07], ..., [ 8.3819e-09, -1.0524e-06, 0.0000e+00, ..., -2.7157e-06, 9.3132e-10, 2.5146e-08], [ 4.1444e-07, 2.2352e-06, 0.0000e+00, ..., 1.5367e-07, 5.3085e-08, 5.7276e-07], [ 1.5832e-07, 9.3319e-07, -9.3132e-10, ..., 2.2892e-06, 1.3039e-08, 8.3819e-09]], device='cuda:0') Epoch 209, bias, value: tensor([-0.0192, -0.0300, -0.0116, -0.0258, -0.0328, 0.0043, 0.0278, -0.0136, 0.0372, -0.0004], device='cuda:0'), grad: tensor([ 6.0163e-07, -4.1537e-07, -2.2709e-05, 5.6960e-06, -1.8626e-08, -4.9248e-06, -5.8115e-07, -2.2035e-06, 2.1130e-05, 3.4403e-06], device='cuda:0') 100 0.0001 changing lr epoch 208, time 250.55, cls_loss 0.0046 cls_loss_mapping 0.0037 cls_loss_causal 0.4852 re_mapping 0.0061 re_causal 0.0144 /// teacc 99.00 lr 0.00010000 Epoch 210, weight, value: tensor([[-0.1041, -0.1541, -0.0666, ..., -0.0639, 0.1629, 0.1601], [-0.1853, -0.1633, -0.1036, ..., -0.1427, -0.1887, -0.1038], [-0.0786, -0.1143, 0.1172, ..., -0.1741, 0.1853, 0.0631], ..., [-0.1088, 0.0661, 0.0466, ..., 0.1707, -0.1719, -0.1870], [-0.2210, 0.0592, -0.1185, ..., 0.0480, -0.0778, -0.1288], [ 0.0135, -0.0987, -0.0834, ..., -0.0851, -0.0568, -0.1282]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, -8.0094e-08, -3.3528e-08], [ 5.5879e-09, 3.6322e-08, 0.0000e+00, ..., 1.2293e-07, 9.3132e-10, 0.0000e+00], [ 1.8626e-09, 8.3819e-09, -0.0000e+00, ..., 2.7940e-08, -2.8871e-08, 9.3132e-10], ..., [ 4.6566e-09, -1.3039e-07, 0.0000e+00, ..., -1.2107e-07, 3.7253e-09, 9.3132e-10], [ 3.7253e-09, 1.8626e-09, 0.0000e+00, ..., -2.3004e-07, 5.5879e-09, 2.7940e-09], [ 1.8626e-09, 6.6124e-08, 9.3132e-09, ..., 6.4261e-08, 5.2154e-08, 2.1420e-08]], device='cuda:0') Epoch 210, bias, value: tensor([-0.0197, -0.0312, -0.0115, -0.0260, -0.0325, 0.0041, 0.0292, -0.0126, 0.0369, -0.0003], device='cuda:0'), grad: tensor([-9.2201e-08, -2.7940e-09, 1.1362e-07, -5.6811e-08, -1.5553e-07, 2.3749e-07, 3.0082e-07, -5.7742e-08, -6.8266e-07, 3.9581e-07], device='cuda:0') 100 0.0001 changing lr epoch 209, time 250.31, cls_loss 0.0034 cls_loss_mapping 0.0038 cls_loss_causal 0.5119 re_mapping 0.0058 re_causal 0.0149 /// teacc 98.92 lr 0.00010000 Epoch 211, weight, value: tensor([[-0.1051, -0.1564, -0.0667, ..., -0.0640, 0.1636, 0.1605], [-0.1853, -0.1636, -0.1034, ..., -0.1431, -0.1904, -0.1047], [-0.0788, -0.1149, 0.1194, ..., -0.1745, 0.1867, 0.0633], ..., [-0.1093, 0.0667, 0.0457, ..., 0.1721, -0.1735, -0.1880], [-0.2212, 0.0592, -0.1188, ..., 0.0477, -0.0780, -0.1291], [ 0.0135, -0.0997, -0.0840, ..., -0.0873, -0.0571, -0.1289]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 9.3132e-09, 0.0000e+00, ..., 7.4506e-09, -1.4715e-07, -7.8231e-08], [ 2.7940e-09, 1.9558e-08, 0.0000e+00, ..., 2.7940e-08, 1.0058e-07, 1.9558e-08], [ 1.1176e-08, 7.8976e-07, 0.0000e+00, ..., 5.7090e-07, -1.3355e-06, -3.0454e-07], ..., [ 9.3132e-10, -6.4261e-08, 0.0000e+00, ..., -1.6298e-07, 3.3155e-07, 6.3330e-08], [ 4.0047e-08, -3.2093e-06, 0.0000e+00, ..., -2.2277e-06, -1.2610e-06, 5.2154e-08], [ 1.8626e-09, 2.9802e-08, -9.3132e-10, ..., 7.8231e-08, 1.7323e-07, 6.7055e-08]], device='cuda:0') Epoch 211, bias, value: tensor([-0.0191, -0.0309, -0.0125, -0.0258, -0.0315, 0.0039, 0.0293, -0.0123, 0.0368, -0.0016], device='cuda:0'), grad: tensor([-4.6566e-08, -1.1064e-05, 3.7998e-06, 1.2338e-05, 1.2666e-07, 5.6252e-07, 1.8720e-07, 6.9551e-06, -1.3255e-05, 3.7253e-07], device='cuda:0') 100 0.0001 changing lr epoch 210, time 250.50, cls_loss 0.0039 cls_loss_mapping 0.0039 cls_loss_causal 0.5498 re_mapping 0.0060 re_causal 0.0154 /// teacc 98.99 lr 0.00010000 Epoch 212, weight, value: tensor([[-0.1061, -0.1571, -0.0661, ..., -0.0641, 0.1639, 0.1606], [-0.1855, -0.1638, -0.1031, ..., -0.1437, -0.1909, -0.1050], [-0.0791, -0.1151, 0.1194, ..., -0.1747, 0.1874, 0.0632], ..., [-0.1105, 0.0648, 0.0456, ..., 0.1722, -0.1742, -0.1884], [-0.2215, 0.0590, -0.1194, ..., 0.0478, -0.0785, -0.1296], [ 0.0134, -0.1004, -0.0837, ..., -0.0881, -0.0575, -0.1301]], device='cuda:0'), grad: tensor([[ 3.4459e-08, 9.3132e-09, 0.0000e+00, ..., 3.7253e-09, -1.1241e-06, -8.0559e-07], [ 2.7940e-09, 1.7136e-07, 1.0245e-08, ..., 1.7136e-07, 6.9756e-07, 8.5682e-08], [ 1.2573e-07, 8.3819e-08, 0.0000e+00, ..., 4.6566e-09, -1.7276e-06, -8.1956e-08], ..., [ 0.0000e+00, -7.6741e-07, -1.8626e-08, ..., -7.4971e-07, 5.2061e-07, 8.6613e-08], [ 1.1548e-07, 4.3213e-07, 2.7940e-09, ..., 8.7544e-08, 9.5833e-07, 3.1758e-07], [ 9.3132e-10, 1.1828e-07, 6.6124e-08, ..., 1.4156e-07, 6.3237e-07, 3.5949e-07]], device='cuda:0') Epoch 212, bias, value: tensor([-0.0190, -0.0309, -0.0124, -0.0241, -0.0320, 0.0035, 0.0296, -0.0128, 0.0365, -0.0016], device='cuda:0'), grad: tensor([-9.5274e-07, 3.3788e-06, -4.1723e-06, 8.7358e-07, -6.3814e-06, 3.7346e-07, 8.8103e-07, 1.8440e-07, 3.3714e-06, 2.4475e-06], device='cuda:0') 100 0.0001 changing lr epoch 211, time 250.51, cls_loss 0.0039 cls_loss_mapping 0.0030 cls_loss_causal 0.5251 re_mapping 0.0058 re_causal 0.0148 /// teacc 98.96 lr 0.00010000 Epoch 213, weight, value: tensor([[-0.1071, -0.1584, -0.0661, ..., -0.0645, 0.1635, 0.1609], [-0.1856, -0.1641, -0.1017, ..., -0.1434, -0.1911, -0.1051], [-0.0792, -0.1155, 0.1197, ..., -0.1750, 0.1883, 0.0640], ..., [-0.1108, 0.0645, 0.0445, ..., 0.1724, -0.1756, -0.1896], [-0.2222, 0.0590, -0.1199, ..., 0.0478, -0.0791, -0.1306], [ 0.0131, -0.1005, -0.0846, ..., -0.0888, -0.0571, -0.1321]], device='cuda:0'), grad: tensor([[ 3.7812e-07, 9.2667e-08, 0.0000e+00, ..., 6.5984e-07, -1.2042e-06, -2.1402e-06], [ 4.6520e-07, 1.1288e-06, 0.0000e+00, ..., 3.7439e-06, 3.0873e-07, 1.0431e-07], [ 2.5611e-07, 3.3900e-07, 0.0000e+00, ..., 9.1782e-07, 8.8802e-07, 2.9802e-08], ..., [ 8.7218e-07, -4.6119e-06, 0.0000e+00, ..., -1.5207e-05, 1.6252e-07, 2.4214e-08], [ 2.2817e-07, 3.0315e-07, 0.0000e+00, ..., 7.9954e-07, 2.3190e-07, 1.6112e-07], [ 6.1560e-07, 7.5297e-07, 0.0000e+00, ..., 1.4966e-06, 3.0966e-07, 2.0862e-07]], device='cuda:0') Epoch 213, bias, value: tensor([-0.0201, -0.0311, -0.0134, -0.0242, -0.0316, 0.0035, 0.0304, -0.0120, 0.0362, -0.0017], device='cuda:0'), grad: tensor([-6.1467e-07, 1.1489e-05, 3.7327e-06, 7.5549e-06, 3.0193e-06, 7.8185e-07, 4.1686e-06, 5.2065e-05, 2.0247e-06, -8.4221e-05], device='cuda:0') 100 0.0001 changing lr epoch 212, time 250.47, cls_loss 0.0037 cls_loss_mapping 0.0038 cls_loss_causal 0.5281 re_mapping 0.0059 re_causal 0.0149 /// teacc 98.98 lr 0.00010000 Epoch 214, weight, value: tensor([[-0.1040, -0.1601, -0.0660, ..., -0.0614, 0.1668, 0.1644], [-0.1884, -0.1642, -0.1003, ..., -0.1431, -0.1916, -0.1063], [-0.0777, -0.1163, 0.1198, ..., -0.1752, 0.1894, 0.0655], ..., [-0.1121, 0.0648, 0.0434, ..., 0.1729, -0.1774, -0.1917], [-0.2229, 0.0592, -0.1200, ..., 0.0478, -0.0794, -0.1311], [ 0.0119, -0.1010, -0.0855, ..., -0.0896, -0.0571, -0.1340]], device='cuda:0'), grad: tensor([[ 1.0245e-08, 9.3132e-09, 1.3970e-09, ..., 2.3749e-08, -1.2107e-08, -9.3132e-09], [ 5.6345e-08, 7.9628e-08, 0.0000e+00, ..., 3.3062e-08, 4.6566e-10, 4.6566e-10], [ 4.2375e-08, 5.8208e-08, 0.0000e+00, ..., 2.3283e-09, 1.3970e-09, 1.3970e-09], ..., [ 1.4808e-07, 2.1560e-07, 2.3283e-09, ..., 6.4820e-07, 4.6566e-10, 4.6566e-10], [ 7.7765e-08, -1.0384e-06, 9.4995e-08, ..., -1.2182e-05, 4.1910e-09, 3.2596e-09], [ 1.4808e-07, 1.2573e-06, -1.1036e-07, ..., 1.1526e-05, 3.2596e-09, 2.3283e-09]], device='cuda:0') Epoch 214, bias, value: tensor([-0.0169, -0.0310, -0.0138, -0.0267, -0.0314, 0.0052, 0.0274, -0.0117, 0.0360, -0.0022], device='cuda:0'), grad: tensor([ 1.5553e-07, 3.3379e-06, 2.0582e-07, -1.6307e-06, -6.2622e-06, 1.4575e-07, 3.4366e-07, 3.2336e-06, -2.5600e-05, 2.6107e-05], device='cuda:0') 100 0.0001 changing lr epoch 213, time 250.22, cls_loss 0.0039 cls_loss_mapping 0.0037 cls_loss_causal 0.5058 re_mapping 0.0056 re_causal 0.0145 /// teacc 98.95 lr 0.00010000 Epoch 215, weight, value: tensor([[-0.1041, -0.1620, -0.0661, ..., -0.0614, 0.1668, 0.1644], [-0.1884, -0.1643, -0.1003, ..., -0.1432, -0.1918, -0.1065], [-0.0778, -0.1170, 0.1198, ..., -0.1760, 0.1899, 0.0658], ..., [-0.1132, 0.0650, 0.0435, ..., 0.1734, -0.1782, -0.1925], [-0.2234, 0.0591, -0.1201, ..., 0.0481, -0.0796, -0.1317], [ 0.0115, -0.1014, -0.0856, ..., -0.0934, -0.0572, -0.1344]], device='cuda:0'), grad: tensor([[ 4.2841e-08, 7.4506e-09, 0.0000e+00, ..., 1.0151e-07, -4.6566e-08, -2.7474e-08], [ 5.1223e-09, 1.3784e-06, 0.0000e+00, ..., 2.1439e-06, 1.8161e-08, 2.3283e-09], [ 7.4506e-09, 1.1176e-08, 0.0000e+00, ..., 3.5577e-07, -1.9535e-05, 4.6566e-09], ..., [ 1.3970e-09, -1.4370e-06, 0.0000e+00, ..., -2.2221e-06, 1.9046e-07, 1.3970e-09], [ 5.5879e-09, 2.2817e-08, 0.0000e+00, ..., -1.2415e-06, 1.0710e-08, 6.9849e-09], [ 2.3283e-09, -2.7800e-07, 0.0000e+00, ..., -6.2399e-07, 2.8405e-08, 2.0023e-08]], device='cuda:0') Epoch 215, bias, value: tensor([-0.0170, -0.0305, -0.0140, -0.0269, -0.0277, 0.0054, 0.0275, -0.0120, 0.0361, -0.0055], device='cuda:0'), grad: tensor([ 3.0780e-07, 3.5930e-06, -9.9361e-05, 6.1467e-08, 1.0514e-04, 2.1271e-06, -3.1665e-07, -2.5071e-06, -3.7570e-06, -5.3719e-06], device='cuda:0') 100 0.0001 changing lr epoch 214, time 250.35, cls_loss 0.0046 cls_loss_mapping 0.0041 cls_loss_causal 0.5307 re_mapping 0.0058 re_causal 0.0146 /// teacc 99.02 lr 0.00010000 Epoch 216, weight, value: tensor([[-0.1044, -0.1657, -0.0661, ..., -0.0616, 0.1667, 0.1643], [-0.1888, -0.1647, -0.1003, ..., -0.1445, -0.1918, -0.1070], [-0.0779, -0.1174, 0.1199, ..., -0.1770, 0.1921, 0.0668], ..., [-0.1141, 0.0648, 0.0435, ..., 0.1724, -0.1808, -0.1944], [-0.2239, 0.0603, -0.1202, ..., 0.0494, -0.0798, -0.1314], [ 0.0116, -0.1013, -0.0856, ..., -0.0934, -0.0574, -0.1350]], device='cuda:0'), grad: tensor([[ 1.2144e-06, 2.9337e-08, 1.3970e-09, ..., 8.0094e-08, 2.7735e-06, 1.9260e-06], [ 1.0245e-08, 1.8300e-07, 5.5879e-09, ..., 4.8382e-07, 1.3504e-08, 1.0710e-08], [ 4.6566e-09, 9.3132e-10, 4.6566e-10, ..., 4.1910e-08, 3.2596e-09, 3.2596e-09], ..., [ 4.6566e-10, -6.4028e-07, 4.6566e-10, ..., -1.7509e-06, 2.3283e-09, 1.3970e-09], [ 8.1630e-07, 2.3283e-09, 9.3132e-10, ..., -2.4540e-07, 7.2876e-07, 6.7614e-07], [ 6.0536e-09, 3.3900e-07, -9.3132e-09, ..., 9.6299e-07, 1.1176e-08, 8.3819e-09]], device='cuda:0') Epoch 216, bias, value: tensor([-0.0174, -0.0304, -0.0140, -0.0267, -0.0271, 0.0052, 0.0276, -0.0129, 0.0373, -0.0056], device='cuda:0'), grad: tensor([ 7.5847e-06, -5.0617e-07, 1.8906e-07, 5.8673e-08, 6.7288e-07, 4.2245e-06, -1.3456e-05, -3.0585e-06, 1.7080e-06, 2.5816e-06], device='cuda:0') 100 0.0001 changing lr epoch 215, time 250.61, cls_loss 0.0038 cls_loss_mapping 0.0030 cls_loss_causal 0.5319 re_mapping 0.0056 re_causal 0.0147 /// teacc 99.03 lr 0.00010000 Epoch 217, weight, value: tensor([[-0.1047, -0.1677, -0.0661, ..., -0.0617, 0.1665, 0.1642], [-0.1889, -0.1652, -0.1003, ..., -0.1452, -0.1921, -0.1074], [-0.0780, -0.1185, 0.1204, ..., -0.1775, 0.1930, 0.0673], ..., [-0.1149, 0.0651, 0.0438, ..., 0.1730, -0.1817, -0.1952], [-0.2242, 0.0604, -0.1208, ..., 0.0494, -0.0801, -0.1308], [ 0.0113, -0.1020, -0.0861, ..., -0.0935, -0.0577, -0.1365]], device='cuda:0'), grad: tensor([[ 2.3283e-09, 1.3970e-09, 4.6566e-10, ..., -3.4459e-08, -7.3574e-08, -3.4925e-08], [ 1.3970e-09, 2.3283e-09, 2.7940e-09, ..., 6.0536e-09, 3.1432e-07, 2.3702e-07], [ 1.3970e-09, 1.2573e-08, 4.6566e-10, ..., 1.7229e-08, -8.5169e-07, -6.4867e-07], ..., [ 4.6566e-10, -2.2817e-08, 1.0924e-06, ..., 8.7963e-07, 3.0501e-07, 1.9325e-07], [ 8.3819e-09, 1.8626e-09, 4.6566e-10, ..., 1.4435e-08, 1.0151e-07, 7.4506e-08], [ 9.3132e-10, 3.7253e-09, -1.1036e-06, ..., -9.9652e-07, 2.8871e-08, 1.6298e-08]], device='cuda:0') Epoch 217, bias, value: tensor([-0.0175, -0.0304, -0.0143, -0.0264, -0.0271, 0.0051, 0.0278, -0.0128, 0.0377, -0.0057], device='cuda:0'), grad: tensor([-2.8871e-08, -6.8903e-05, 3.8624e-05, 4.3726e-07, 3.3248e-07, 7.4785e-07, 1.3430e-06, 3.4124e-05, 3.9674e-07, -7.0482e-06], device='cuda:0') 100 0.0001 changing lr epoch 216, time 250.27, cls_loss 0.0037 cls_loss_mapping 0.0029 cls_loss_causal 0.5419 re_mapping 0.0059 re_causal 0.0153 /// teacc 99.05 lr 0.00010000 Epoch 218, weight, value: tensor([[-0.1049, -0.1695, -0.0661, ..., -0.0617, 0.1666, 0.1642], [-0.1891, -0.1655, -0.1003, ..., -0.1457, -0.1925, -0.1078], [-0.0784, -0.1193, 0.1204, ..., -0.1780, 0.1934, 0.0672], ..., [-0.1157, 0.0643, 0.0438, ..., 0.1717, -0.1827, -0.1958], [-0.2245, 0.0623, -0.1208, ..., 0.0510, -0.0804, -0.1306], [ 0.0108, -0.1025, -0.0861, ..., -0.0932, -0.0577, -0.1371]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 1.0245e-08, 0.0000e+00, ..., 6.5193e-09, -3.8184e-08, -2.4680e-08], [ 5.1223e-09, 5.4017e-08, 0.0000e+00, ..., 4.6100e-08, 7.1712e-08, 1.3970e-09], [ 2.3283e-09, 4.0000e-07, 0.0000e+00, ..., 4.2841e-07, -1.2713e-07, 2.3283e-08], ..., [ 2.3749e-08, -4.8056e-07, 0.0000e+00, ..., -6.2212e-07, 5.4482e-08, 3.7253e-09], [ 2.7474e-08, 1.4948e-07, 0.0000e+00, ..., 5.9605e-08, -4.5169e-08, -6.4261e-08], [ 5.5879e-09, 7.6368e-08, 4.6566e-10, ..., 7.3574e-08, 1.2573e-08, 8.3819e-09]], device='cuda:0') Epoch 218, bias, value: tensor([-0.0176, -0.0295, -0.0161, -0.0260, -0.0273, 0.0050, 0.0278, -0.0137, 0.0391, -0.0054], device='cuda:0'), grad: tensor([ 6.4727e-08, 2.8731e-07, 5.0198e-07, -5.8999e-07, -4.5635e-07, 3.3621e-07, 1.5553e-07, -7.4459e-07, 7.4506e-09, 4.5449e-07], device='cuda:0') 100 0.0001 changing lr epoch 217, time 250.37, cls_loss 0.0033 cls_loss_mapping 0.0025 cls_loss_causal 0.5043 re_mapping 0.0059 re_causal 0.0149 /// teacc 98.94 lr 0.00010000 Epoch 219, weight, value: tensor([[-0.1049, -0.1707, -0.0662, ..., -0.0617, 0.1666, 0.1642], [-0.1905, -0.1655, -0.0998, ..., -0.1456, -0.1929, -0.1092], [-0.0772, -0.1196, 0.1203, ..., -0.1782, 0.1943, 0.0688], ..., [-0.1162, 0.0644, 0.0436, ..., 0.1719, -0.1834, -0.1965], [-0.2249, 0.0623, -0.1209, ..., 0.0511, -0.0807, -0.1311], [ 0.0109, -0.1030, -0.0855, ..., -0.0935, -0.0576, -0.1370]], device='cuda:0'), grad: tensor([[ 1.6689e-05, 1.1176e-08, 0.0000e+00, ..., 2.0955e-08, 4.3064e-05, 2.6360e-05], [ 6.0536e-09, 2.2864e-07, 0.0000e+00, ..., 4.0326e-07, 5.8673e-08, 1.5367e-08], [ 4.2375e-08, 5.1111e-06, 0.0000e+00, ..., 9.6783e-06, 1.0515e-06, 8.4285e-08], ..., [ 1.3970e-09, -5.4948e-06, 4.6566e-10, ..., -1.0267e-05, -9.5181e-07, 5.5879e-09], [ 9.0804e-08, 1.6671e-07, 1.3970e-09, ..., -3.0734e-08, 3.5902e-07, 2.2957e-07], [ 5.1223e-08, 6.7987e-08, -3.2596e-09, ..., -1.4622e-07, 5.7789e-07, 3.7719e-07]], device='cuda:0') Epoch 219, bias, value: tensor([-0.0176, -0.0297, -0.0162, -0.0259, -0.0271, 0.0049, 0.0278, -0.0134, 0.0386, -0.0055], device='cuda:0'), grad: tensor([ 7.8917e-05, 9.4483e-07, 2.2382e-05, -2.7847e-07, 6.5472e-07, 6.3377e-07, -8.1599e-05, -2.3261e-05, 6.0257e-07, 8.8522e-07], device='cuda:0') 100 0.0001 changing lr epoch 218, time 250.29, cls_loss 0.0034 cls_loss_mapping 0.0039 cls_loss_causal 0.5202 re_mapping 0.0056 re_causal 0.0142 /// teacc 98.96 lr 0.00010000 Epoch 220, weight, value: tensor([[-0.1052, -0.1720, -0.0657, ..., -0.0617, 0.1665, 0.1642], [-0.1908, -0.1657, -0.0975, ..., -0.1454, -0.1930, -0.1095], [-0.0773, -0.1196, 0.1201, ..., -0.1786, 0.1951, 0.0704], ..., [-0.1164, 0.0649, 0.0428, ..., 0.1729, -0.1843, -0.1977], [-0.2252, 0.0621, -0.1214, ..., 0.0511, -0.0814, -0.1328], [ 0.0102, -0.1044, -0.0879, ..., -0.0945, -0.0577, -0.1376]], device='cuda:0'), grad: tensor([[ 4.6566e-09, 6.5193e-09, 1.0245e-08, ..., 1.3039e-08, -6.5845e-07, -4.1351e-07], [ 4.6566e-09, 1.9372e-07, 3.7253e-09, ..., 5.0291e-07, 1.8161e-07, 1.9744e-07], [ 4.6566e-09, 1.2107e-08, 0.0000e+00, ..., 1.7695e-08, 4.6566e-09, 5.5879e-09], ..., [ 3.7253e-09, -1.0179e-06, 5.7742e-08, ..., -2.7418e-06, 1.1176e-08, 4.6566e-09], [ 8.5682e-08, 6.5193e-08, 1.5832e-08, ..., 1.1455e-07, 4.0978e-08, 4.0047e-08], [ 2.9802e-08, 7.4878e-07, 2.1420e-08, ..., 1.9483e-06, 4.1071e-07, 1.4622e-07]], device='cuda:0') Epoch 220, bias, value: tensor([-0.0177, -0.0293, -0.0160, -0.0260, -0.0272, 0.0049, 0.0279, -0.0132, 0.0384, -0.0059], device='cuda:0'), grad: tensor([-1.0785e-06, 1.6987e-06, 6.2399e-08, 8.1025e-08, -1.0647e-05, 6.3237e-07, 9.7975e-06, -5.2191e-06, 9.4622e-07, 3.6806e-06], device='cuda:0') 100 0.0001 changing lr epoch 219, time 250.41, cls_loss 0.0032 cls_loss_mapping 0.0037 cls_loss_causal 0.5289 re_mapping 0.0055 re_causal 0.0147 /// teacc 99.05 lr 0.00010000 Epoch 221, weight, value: tensor([[-0.1053, -0.1730, -0.0658, ..., -0.0617, 0.1666, 0.1642], [-0.1912, -0.1658, -0.0975, ..., -0.1449, -0.1933, -0.1093], [-0.0773, -0.1197, 0.1205, ..., -0.1788, 0.1963, 0.0716], ..., [-0.1167, 0.0650, 0.0427, ..., 0.1728, -0.1856, -0.1998], [-0.2253, 0.0623, -0.1217, ..., 0.0514, -0.0815, -0.1327], [ 0.0105, -0.1052, -0.0879, ..., -0.0947, -0.0578, -0.1392]], device='cuda:0'), grad: tensor([[-6.2399e-08, 1.0245e-08, 0.0000e+00, ..., 2.0489e-08, -4.8708e-07, -3.1292e-07], [ 5.5879e-09, 9.9652e-08, 0.0000e+00, ..., 7.7672e-07, 1.3039e-08, 1.3039e-08], [ 0.0000e+00, -1.1176e-08, -0.0000e+00, ..., -1.8626e-09, -9.4716e-07, -9.3970e-07], ..., [ 4.6566e-09, -3.2410e-07, 0.0000e+00, ..., -6.1002e-07, 8.8383e-07, 8.8848e-07], [ 8.3819e-08, 3.5390e-08, 0.0000e+00, ..., 2.0117e-07, 2.7008e-08, 1.8626e-08], [ 1.8626e-08, 2.4494e-07, 0.0000e+00, ..., 8.7079e-07, 8.3819e-09, 5.5879e-09]], device='cuda:0') Epoch 221, bias, value: tensor([-0.0178, -0.0295, -0.0162, -0.0258, -0.0274, 0.0047, 0.0278, -0.0127, 0.0389, -0.0060], device='cuda:0'), grad: tensor([-5.6159e-07, 4.5002e-06, -2.3805e-06, -4.3772e-08, -4.8429e-06, -3.0268e-07, 7.9069e-07, 1.1912e-06, 9.7696e-07, 6.5938e-07], device='cuda:0') 100 0.0001 changing lr epoch 220, time 250.50, cls_loss 0.0028 cls_loss_mapping 0.0032 cls_loss_causal 0.5151 re_mapping 0.0059 re_causal 0.0149 /// teacc 99.00 lr 0.00010000 Epoch 222, weight, value: tensor([[-0.1053, -0.1737, -0.0658, ..., -0.0617, 0.1665, 0.1643], [-0.1916, -0.1660, -0.0975, ..., -0.1450, -0.1940, -0.1101], [-0.0774, -0.1203, 0.1206, ..., -0.1793, 0.1984, 0.0731], ..., [-0.1173, 0.0650, 0.0422, ..., 0.1730, -0.1885, -0.2022], [-0.2257, 0.0626, -0.1219, ..., 0.0521, -0.0816, -0.1335], [ 0.0099, -0.1054, -0.0874, ..., -0.0949, -0.0568, -0.1401]], device='cuda:0'), grad: tensor([[ 1.8766e-06, 6.5193e-09, 6.9849e-08, ..., 3.0361e-07, 1.5041e-06, 1.1232e-06], [ 2.7008e-08, 7.0781e-08, 7.8231e-08, ..., 3.5204e-07, 1.0245e-08, 5.5879e-09], [ 4.6566e-08, 1.0151e-07, -5.1372e-06, ..., -2.3201e-05, 4.7497e-08, 2.4214e-08], ..., [ 4.8429e-08, 1.4715e-07, 1.0822e-06, ..., 4.8839e-06, 1.5832e-08, 9.3132e-09], [ 3.5204e-07, 3.4459e-08, 2.2557e-06, ..., 1.0185e-05, 2.9057e-07, 2.1420e-07], [ 8.3819e-09, 1.4901e-08, -0.0000e+00, ..., 3.7253e-09, 8.3819e-09, 5.5879e-09]], device='cuda:0') Epoch 222, bias, value: tensor([-0.0179, -0.0294, -0.0160, -0.0256, -0.0274, 0.0043, 0.0278, -0.0131, 0.0395, -0.0057], device='cuda:0'), grad: tensor([ 5.7817e-06, 1.4259e-06, -1.0556e-04, -2.8778e-07, 2.4259e-05, 8.4266e-06, -4.1388e-06, 2.2650e-05, 4.7475e-05, 6.3330e-08], device='cuda:0') 100 0.0001 changing lr epoch 221, time 250.30, cls_loss 0.0040 cls_loss_mapping 0.0038 cls_loss_causal 0.5513 re_mapping 0.0054 re_causal 0.0140 /// teacc 99.00 lr 0.00010000 Epoch 223, weight, value: tensor([[-0.1053, -0.1761, -0.0658, ..., -0.0618, 0.1667, 0.1643], [-0.1929, -0.1667, -0.0975, ..., -0.1476, -0.1941, -0.1101], [-0.0775, -0.1208, 0.1210, ..., -0.1796, 0.1994, 0.0732], ..., [-0.1192, 0.0653, 0.0421, ..., 0.1754, -0.1894, -0.2028], [-0.2257, 0.0625, -0.1227, ..., 0.0523, -0.0815, -0.1340], [ 0.0097, -0.1059, -0.0874, ..., -0.0946, -0.0573, -0.1411]], device='cuda:0'), grad: tensor([[ 3.7067e-07, 1.2014e-07, 0.0000e+00, ..., 2.7940e-09, 4.5635e-07, 3.9581e-07], [ 1.0040e-06, 2.6971e-06, 0.0000e+00, ..., 1.3970e-08, 1.0710e-07, 8.9407e-08], [ 3.9302e-07, 4.7870e-07, 0.0000e+00, ..., -9.3132e-09, -3.7625e-07, -2.4773e-07], ..., [ 1.0105e-06, 2.8610e-06, 0.0000e+00, ..., 5.0291e-08, 1.9744e-07, 1.4529e-07], [ 2.6241e-05, 7.5530e-07, 0.0000e+00, ..., 1.2480e-07, 2.7165e-05, 2.4363e-05], [ 3.5390e-07, 8.2795e-07, 0.0000e+00, ..., 2.2352e-08, 7.7300e-08, 6.5193e-08]], device='cuda:0') Epoch 223, bias, value: tensor([-0.0179, -0.0308, -0.0160, -0.0259, -0.0272, 0.0045, 0.0277, -0.0121, 0.0399, -0.0057], device='cuda:0'), grad: tensor([ 1.8040e-06, 6.7204e-06, 5.5786e-07, -2.1264e-05, 4.0047e-08, 2.2814e-05, -1.1659e-04, 7.3835e-06, 9.6858e-05, 1.6643e-06], device='cuda:0') 100 0.0001 changing lr epoch 222, time 250.41, cls_loss 0.0030 cls_loss_mapping 0.0029 cls_loss_causal 0.5339 re_mapping 0.0057 re_causal 0.0149 /// teacc 98.96 lr 0.00010000 Epoch 224, weight, value: tensor([[-0.1058, -0.1774, -0.0660, ..., -0.0618, 0.1660, 0.1642], [-0.1934, -0.1669, -0.0974, ..., -0.1476, -0.1944, -0.1103], [-0.0777, -0.1212, 0.1211, ..., -0.1799, 0.2002, 0.0739], ..., [-0.1199, 0.0656, 0.0421, ..., 0.1757, -0.1896, -0.2030], [-0.2261, 0.0624, -0.1228, ..., 0.0522, -0.0826, -0.1357], [ 0.0116, -0.1065, -0.0874, ..., -0.0949, -0.0553, -0.1420]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, -1.3284e-05, -8.0019e-06], [ 0.0000e+00, 6.5193e-09, 0.0000e+00, ..., 1.8626e-08, 1.0151e-07, 3.0734e-08], [ 0.0000e+00, 1.5832e-08, 0.0000e+00, ..., 3.6322e-08, -9.4622e-07, -2.2538e-07], ..., [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 4.7497e-07, 2.4214e-08, 8.3819e-09], [ 2.7940e-09, -7.6368e-08, 0.0000e+00, ..., -1.3318e-07, -2.3283e-08, -6.4261e-08], [ 5.5879e-09, -3.6322e-08, 0.0000e+00, ..., -5.3458e-07, 1.1176e-07, 6.8918e-08]], device='cuda:0') Epoch 224, bias, value: tensor([-0.0190, -0.0306, -0.0159, -0.0261, -0.0279, 0.0042, 0.0283, -0.0121, 0.0395, -0.0044], device='cuda:0'), grad: tensor([-1.6391e-05, 3.1013e-07, -2.0731e-06, 1.3057e-06, -8.1770e-07, -5.9605e-08, 1.6987e-05, 4.3772e-06, -3.5297e-07, -3.2559e-06], device='cuda:0') 100 0.0001 changing lr epoch 223, time 250.44, cls_loss 0.0026 cls_loss_mapping 0.0025 cls_loss_causal 0.5000 re_mapping 0.0056 re_causal 0.0146 /// teacc 99.02 lr 0.00010000 Epoch 225, weight, value: tensor([[-0.1059, -0.1788, -0.0662, ..., -0.0618, 0.1659, 0.1642], [-0.1948, -0.1671, -0.0975, ..., -0.1478, -0.1944, -0.1117], [-0.0763, -0.1215, 0.1211, ..., -0.1807, 0.2003, 0.0751], ..., [-0.1205, 0.0646, 0.0421, ..., 0.1758, -0.1899, -0.2032], [-0.2263, 0.0625, -0.1231, ..., 0.0524, -0.0827, -0.1359], [ 0.0121, -0.1072, -0.0874, ..., -0.0951, -0.0549, -0.1432]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 5.5879e-09, 0.0000e+00, ..., -6.7055e-08, -6.2305e-07, -6.6962e-07], [ 2.7940e-09, 1.4901e-08, 0.0000e+00, ..., 1.1176e-08, 4.0047e-08, 3.8184e-08], [ 2.0489e-08, 5.0291e-08, 0.0000e+00, ..., 3.6322e-08, 1.1362e-07, 4.6566e-08], ..., [ 4.6566e-09, 1.6764e-08, 0.0000e+00, ..., 5.5879e-09, 1.1176e-08, 2.7940e-09], [-3.5390e-08, 1.8626e-09, 0.0000e+00, ..., -2.2314e-06, -1.9651e-07, -7.3574e-08], [ 9.3132e-10, -1.3970e-08, 0.0000e+00, ..., -3.6322e-08, 4.6566e-08, 2.0489e-08]], device='cuda:0') Epoch 225, bias, value: tensor([-0.0192, -0.0306, -0.0159, -0.0247, -0.0280, 0.0031, 0.0284, -0.0122, 0.0398, -0.0043], device='cuda:0'), grad: tensor([-1.4063e-06, 1.1828e-07, 8.3074e-07, 1.1269e-07, 4.1910e-08, 2.4028e-06, 6.2771e-06, 1.3132e-07, -8.0392e-06, -4.7963e-07], device='cuda:0') 100 0.0001 changing lr epoch 224, time 250.53, cls_loss 0.0027 cls_loss_mapping 0.0028 cls_loss_causal 0.4744 re_mapping 0.0058 re_causal 0.0143 /// teacc 98.98 lr 0.00010000 Epoch 226, weight, value: tensor([[-0.1059, -0.1802, -0.0663, ..., -0.0618, 0.1660, 0.1643], [-0.1956, -0.1672, -0.0974, ..., -0.1479, -0.1946, -0.1120], [-0.0762, -0.1221, 0.1209, ..., -0.1812, 0.2005, 0.0751], ..., [-0.1211, 0.0646, 0.0421, ..., 0.1761, -0.1904, -0.2034], [-0.2270, 0.0624, -0.1232, ..., 0.0524, -0.0829, -0.1362], [ 0.0116, -0.1075, -0.0874, ..., -0.0954, -0.0551, -0.1440]], device='cuda:0'), grad: tensor([[ 7.3761e-06, 1.0245e-08, 0.0000e+00, ..., 8.0466e-06, -7.5437e-08, -2.7008e-08], [ 8.5682e-08, 1.6205e-07, 0.0000e+00, ..., 3.3993e-07, 0.0000e+00, 0.0000e+00], [ 9.0338e-08, 2.6356e-07, 0.0000e+00, ..., 5.0478e-07, -1.8626e-09, -9.3132e-10], ..., [ 1.2293e-07, -1.0887e-06, 9.3132e-10, ..., -1.5302e-06, 2.7940e-09, 9.3132e-10], [ 2.7940e-08, 3.7253e-08, 0.0000e+00, ..., 5.8673e-08, 2.7940e-09, 1.8626e-09], [ 3.7625e-07, 5.6438e-07, -2.7940e-09, ..., 1.2740e-06, 6.6124e-08, 2.4214e-08]], device='cuda:0') Epoch 226, bias, value: tensor([-0.0191, -0.0304, -0.0161, -0.0246, -0.0281, 0.0028, 0.0287, -0.0123, 0.0397, -0.0043], device='cuda:0'), grad: tensor([ 2.3037e-05, 9.0525e-07, 1.5497e-06, 8.5309e-06, -4.0978e-08, -3.5048e-05, 1.5767e-06, -4.6566e-06, 1.5926e-07, 4.0308e-06], device='cuda:0') 100 0.0001 changing lr epoch 225, time 250.57, cls_loss 0.0029 cls_loss_mapping 0.0029 cls_loss_causal 0.4978 re_mapping 0.0052 re_causal 0.0139 /// teacc 99.05 lr 0.00010000 Epoch 227, weight, value: tensor([[-0.1061, -0.1810, -0.0664, ..., -0.0618, 0.1661, 0.1643], [-0.1960, -0.1675, -0.0973, ..., -0.1481, -0.1950, -0.1123], [-0.0759, -0.1224, 0.1210, ..., -0.1813, 0.2019, 0.0757], ..., [-0.1218, 0.0643, 0.0420, ..., 0.1761, -0.1921, -0.2040], [-0.2279, 0.0623, -0.1236, ..., 0.0522, -0.0832, -0.1368], [ 0.0098, -0.1081, -0.0874, ..., -0.0954, -0.0553, -0.1450]], device='cuda:0'), grad: tensor([[ 1.4901e-08, 1.5832e-08, 0.0000e+00, ..., 2.1420e-08, -1.9930e-07, -1.3225e-07], [ 4.6566e-08, 1.2666e-07, 1.8626e-09, ..., 2.9989e-07, 8.2888e-08, 9.3132e-10], [ 9.3132e-10, 7.4320e-07, 4.6566e-09, ..., 1.2787e-06, -2.3842e-07, 6.5193e-09], ..., [ 1.2107e-08, -2.5220e-06, 0.0000e+00, ..., -4.1313e-06, 1.4994e-07, 2.7940e-09], [ 6.3144e-07, -2.2724e-07, 0.0000e+00, ..., -1.9744e-07, 1.3970e-08, 7.4506e-09], [ 3.0641e-07, 1.5534e-06, 0.0000e+00, ..., 2.7921e-06, 5.2154e-08, 3.1665e-08]], device='cuda:0') Epoch 227, bias, value: tensor([-0.0191, -0.0306, -0.0156, -0.0241, -0.0278, 0.0032, 0.0286, -0.0125, 0.0395, -0.0045], device='cuda:0'), grad: tensor([-2.7474e-07, 1.6801e-06, 1.6131e-06, 4.8280e-06, 2.9430e-06, -4.9658e-06, 4.7497e-07, -6.0201e-06, -1.1539e-06, 8.4750e-07], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 226---------------------------------------------------- epoch 226, time 267.53, cls_loss 0.0036 cls_loss_mapping 0.0027 cls_loss_causal 0.4922 re_mapping 0.0055 re_causal 0.0137 /// teacc 99.07 lr 0.00010000 Epoch 228, weight, value: tensor([[-0.1067, -0.1835, -0.0664, ..., -0.0618, 0.1659, 0.1641], [-0.1962, -0.1681, -0.0973, ..., -0.1486, -0.1955, -0.1128], [-0.0761, -0.1223, 0.1212, ..., -0.1812, 0.2032, 0.0762], ..., [-0.1211, 0.0648, 0.0420, ..., 0.1770, -0.1934, -0.2051], [-0.2271, 0.0621, -0.1261, ..., 0.0526, -0.0832, -0.1370], [ 0.0075, -0.1097, -0.0874, ..., -0.0960, -0.0554, -0.1458]], device='cuda:0'), grad: tensor([[ 8.2050e-07, 1.0245e-08, 0.0000e+00, ..., 2.3283e-08, 5.6252e-07, 2.4494e-07], [ 1.3039e-08, 1.5832e-08, 0.0000e+00, ..., 1.5274e-07, 6.5193e-09, 6.5193e-09], [ 1.1176e-08, -2.5705e-07, 0.0000e+00, ..., 6.5193e-09, -1.1548e-07, -2.8405e-07], ..., [ 3.9022e-07, 2.7940e-09, 0.0000e+00, ..., 1.6754e-06, 2.1420e-08, 2.8871e-08], [ 3.3714e-07, 1.0710e-07, 0.0000e+00, ..., 4.7404e-07, 1.0524e-07, 8.9407e-08], [-2.6580e-06, 2.9802e-08, -0.0000e+00, ..., -1.3158e-05, 5.5879e-08, 3.6322e-08]], device='cuda:0') Epoch 228, bias, value: tensor([-0.0194, -0.0308, -0.0152, -0.0237, -0.0275, 0.0028, 0.0284, -0.0121, 0.0401, -0.0051], device='cuda:0'), grad: tensor([ 1.2778e-06, 6.5938e-07, -1.2312e-06, 9.7509e-07, 3.2127e-05, 2.2620e-05, -8.7246e-06, 8.0094e-06, 3.1181e-06, -5.8800e-05], device='cuda:0') 100 0.0001 changing lr epoch 227, time 250.63, cls_loss 0.0026 cls_loss_mapping 0.0030 cls_loss_causal 0.5020 re_mapping 0.0058 re_causal 0.0149 /// teacc 99.03 lr 0.00010000 Epoch 229, weight, value: tensor([[-0.1067, -0.1840, -0.0663, ..., -0.0618, 0.1659, 0.1641], [-0.1967, -0.1694, -0.0973, ..., -0.1502, -0.1959, -0.1139], [-0.0759, -0.1226, 0.1212, ..., -0.1819, 0.2036, 0.0772], ..., [-0.1210, 0.0659, 0.0419, ..., 0.1790, -0.1935, -0.2053], [-0.2272, 0.0616, -0.1262, ..., 0.0526, -0.0832, -0.1372], [ 0.0067, -0.1114, -0.0874, ..., -0.0969, -0.0554, -0.1461]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 1.1176e-08, 1.1902e-06, ..., 1.3970e-08, 7.5884e-06, 3.2689e-07], [ 1.5832e-08, 2.0489e-08, 7.4506e-09, ..., 1.7695e-08, 5.1223e-08, 6.5193e-09], [ 7.4506e-09, 6.5193e-09, -1.5264e-06, ..., 1.6764e-08, -9.8348e-06, -4.5821e-07], ..., [ 8.3819e-09, -8.0187e-07, 4.0047e-08, ..., -9.7230e-07, 2.7847e-07, 2.8871e-08], [ 2.2352e-08, 9.4995e-08, 4.3772e-08, ..., 7.2643e-08, 2.7567e-07, 1.7695e-08], [-5.2713e-07, 6.5006e-07, 5.0291e-08, ..., -1.7034e-06, 3.6508e-07, 2.1420e-08]], device='cuda:0') Epoch 229, bias, value: tensor([-0.0194, -0.0313, -0.0147, -0.0252, -0.0275, 0.0040, 0.0284, -0.0113, 0.0397, -0.0055], device='cuda:0'), grad: tensor([ 1.4089e-05, 1.7323e-07, -1.8209e-05, -2.8796e-06, 2.7418e-06, 2.0325e-05, 4.4703e-08, -1.6596e-06, 7.4506e-07, -1.5363e-05], device='cuda:0') 100 0.0001 changing lr epoch 228, time 250.54, cls_loss 0.0029 cls_loss_mapping 0.0032 cls_loss_causal 0.4890 re_mapping 0.0055 re_causal 0.0140 /// teacc 98.99 lr 0.00010000 Epoch 230, weight, value: tensor([[-0.1068, -0.1848, -0.0661, ..., -0.0618, 0.1660, 0.1642], [-0.1974, -0.1695, -0.0973, ..., -0.1505, -0.1970, -0.1148], [-0.0756, -0.1237, 0.1218, ..., -0.1832, 0.2048, 0.0780], ..., [-0.1213, 0.0661, 0.0419, ..., 0.1794, -0.1939, -0.2056], [-0.2286, 0.0615, -0.1264, ..., 0.0524, -0.0835, -0.1377], [ 0.0067, -0.1118, -0.0874, ..., -0.0970, -0.0556, -0.1465]], device='cuda:0'), grad: tensor([[-3.0734e-08, 0.0000e+00, 0.0000e+00, ..., 7.4506e-09, -1.8533e-07, -6.4261e-08], [ 1.7136e-07, 1.3690e-07, 0.0000e+00, ..., 3.5856e-07, 9.7789e-08, 1.2107e-07], [ 1.5832e-08, 3.7253e-09, 0.0000e+00, ..., -4.9658e-06, -1.6868e-05, -1.4544e-05], ..., [ 1.8626e-09, -1.4715e-07, 0.0000e+00, ..., -3.8557e-07, 1.6764e-08, 1.0245e-08], [ 1.4342e-07, 1.8626e-09, 0.0000e+00, ..., 4.9621e-06, 1.6913e-05, 1.4596e-05], [ 1.2107e-08, 5.5879e-09, 0.0000e+00, ..., 1.4901e-08, 5.9605e-08, 2.5146e-08]], device='cuda:0') Epoch 230, bias, value: tensor([-0.0193, -0.0312, -0.0146, -0.0252, -0.0285, 0.0040, 0.0286, -0.0113, 0.0391, -0.0046], device='cuda:0'), grad: tensor([-2.8778e-07, 1.2117e-06, -7.1824e-05, 5.2527e-07, -4.6566e-08, 4.5542e-07, -1.7127e-06, -6.5193e-07, 7.2122e-05, 1.2200e-07], device='cuda:0') 100 0.0001 changing lr epoch 229, time 250.26, cls_loss 0.0025 cls_loss_mapping 0.0027 cls_loss_causal 0.5179 re_mapping 0.0053 re_causal 0.0144 /// teacc 98.97 lr 0.00010000 Epoch 231, weight, value: tensor([[-0.1068, -0.1854, -0.0662, ..., -0.0618, 0.1661, 0.1642], [-0.1995, -0.1696, -0.0962, ..., -0.1507, -0.1973, -0.1150], [-0.0757, -0.1249, 0.1218, ..., -0.1834, 0.2049, 0.0780], ..., [-0.1226, 0.0659, 0.0406, ..., 0.1794, -0.1943, -0.2060], [-0.2297, 0.0620, -0.1264, ..., 0.0520, -0.0841, -0.1400], [ 0.0065, -0.1123, -0.0881, ..., -0.0973, -0.0557, -0.1470]], device='cuda:0'), grad: tensor([[ 1.0338e-07, 0.0000e+00, 0.0000e+00, ..., 7.4506e-09, -6.7987e-08, -5.3085e-08], [ 8.3819e-09, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, 4.6566e-09, 2.7940e-09], [ 6.5193e-09, -9.3132e-10, 0.0000e+00, ..., 1.8626e-09, 5.5879e-09, 5.5879e-09], ..., [ 2.6077e-08, 1.8626e-09, 0.0000e+00, ..., 6.5193e-09, 2.9802e-08, 2.1420e-08], [ 3.8184e-08, 2.7940e-09, 0.0000e+00, ..., -4.6566e-09, 9.3132e-09, 6.5193e-09], [ 3.2037e-07, -1.8626e-09, 0.0000e+00, ..., 6.9849e-08, 6.1467e-08, 4.2841e-08]], device='cuda:0') Epoch 231, bias, value: tensor([-0.0193, -0.0312, -0.0148, -0.0251, -0.0287, 0.0053, 0.0285, -0.0114, 0.0384, -0.0047], device='cuda:0'), grad: tensor([ 3.6322e-08, -2.0295e-05, 2.7455e-06, 3.3051e-05, 8.5682e-08, -2.9787e-05, -1.8626e-08, 8.6278e-06, 1.6950e-07, 5.4426e-06], device='cuda:0') 100 0.0001 changing lr epoch 230, time 250.41, cls_loss 0.0030 cls_loss_mapping 0.0026 cls_loss_causal 0.5130 re_mapping 0.0056 re_causal 0.0139 /// teacc 98.96 lr 0.00010000 Epoch 232, weight, value: tensor([[-0.1069, -0.1884, -0.0662, ..., -0.0619, 0.1664, 0.1643], [-0.2000, -0.1698, -0.0962, ..., -0.1509, -0.1989, -0.1159], [-0.0756, -0.1256, 0.1219, ..., -0.1842, 0.2050, 0.0780], ..., [-0.1243, 0.0664, 0.0407, ..., 0.1799, -0.1947, -0.2062], [-0.2303, 0.0620, -0.1267, ..., 0.0522, -0.0840, -0.1400], [ 0.0064, -0.1137, -0.0882, ..., -0.0983, -0.0558, -0.1478]], device='cuda:0'), grad: tensor([[-3.7253e-09, 2.3283e-08, 0.0000e+00, ..., 1.0151e-07, -2.9709e-07, -1.2945e-07], [ 5.5879e-08, 2.9337e-07, 0.0000e+00, ..., 8.3819e-09, 9.3132e-10, 0.0000e+00], [ 3.0175e-07, 1.5795e-06, 0.0000e+00, ..., 6.4354e-07, 2.2352e-07, 7.0781e-08], ..., [ 1.8626e-08, 7.3574e-08, 0.0000e+00, ..., -9.3132e-09, 3.7253e-09, 1.8626e-09], [ 2.8051e-06, 8.0839e-07, 0.0000e+00, ..., 1.1921e-06, -8.1025e-07, -2.4959e-07], [ 4.9360e-08, 5.4017e-08, 0.0000e+00, ..., 1.3970e-08, 2.2352e-08, 1.0245e-08]], device='cuda:0') Epoch 232, bias, value: tensor([-0.0190, -0.0314, -0.0153, -0.0257, -0.0284, 0.0055, 0.0285, -0.0111, 0.0386, -0.0051], device='cuda:0'), grad: tensor([ 2.2352e-08, 4.8708e-07, 4.4107e-06, 9.6709e-06, 1.2759e-07, -2.2948e-05, 5.0291e-07, 1.7975e-07, 7.5437e-06, -6.5193e-09], device='cuda:0') 100 0.0001 changing lr epoch 231, time 250.38, cls_loss 0.0027 cls_loss_mapping 0.0025 cls_loss_causal 0.5024 re_mapping 0.0056 re_causal 0.0144 /// teacc 98.99 lr 0.00010000 Epoch 233, weight, value: tensor([[-0.1072, -0.1899, -0.0670, ..., -0.0621, 0.1663, 0.1643], [-0.2010, -0.1699, -0.0962, ..., -0.1512, -0.1993, -0.1159], [-0.0756, -0.1264, 0.1226, ..., -0.1849, 0.2061, 0.0784], ..., [-0.1249, 0.0639, 0.0405, ..., 0.1777, -0.1962, -0.2075], [-0.2314, 0.0619, -0.1271, ..., 0.0520, -0.0843, -0.1405], [ 0.0060, -0.1108, -0.0881, ..., -0.0955, -0.0559, -0.1481]], device='cuda:0'), grad: tensor([[ 1.3039e-08, -3.7253e-08, 0.0000e+00, ..., 2.7940e-09, -3.9767e-07, -2.4494e-07], [ 7.4506e-09, 2.3283e-08, 0.0000e+00, ..., -4.2617e-05, 1.0245e-08, 6.5193e-09], [ 7.4506e-09, 3.1665e-08, 0.0000e+00, ..., 2.4214e-08, 2.3562e-07, 1.5367e-07], ..., [ 9.3132e-10, -1.7509e-07, 0.0000e+00, ..., 3.8981e-05, 5.5879e-09, 3.7253e-09], [ 3.7253e-09, 1.7695e-08, 0.0000e+00, ..., 3.6322e-08, 7.2643e-08, 4.6566e-08], [ 9.3132e-10, 3.0734e-08, 0.0000e+00, ..., 3.1143e-06, 1.0151e-07, 6.3330e-08]], device='cuda:0') Epoch 233, bias, value: tensor([-0.0192, -0.0316, -0.0151, -0.0256, -0.0282, 0.0055, 0.0290, -0.0134, 0.0383, -0.0029], device='cuda:0'), grad: tensor([-4.8243e-07, -9.3997e-05, 4.3958e-07, 1.8720e-07, 7.9628e-07, 4.4424e-07, -1.0617e-07, 8.6546e-05, 2.6543e-07, 6.0014e-06], device='cuda:0') 100 0.0001 changing lr epoch 232, time 250.30, cls_loss 0.0035 cls_loss_mapping 0.0028 cls_loss_causal 0.5041 re_mapping 0.0056 re_causal 0.0132 /// teacc 99.01 lr 0.00010000 Epoch 234, weight, value: tensor([[-0.1073, -0.1928, -0.0678, ..., -0.0621, 0.1668, 0.1645], [-0.2013, -0.1705, -0.0956, ..., -0.1518, -0.2006, -0.1166], [-0.0760, -0.1277, 0.1241, ..., -0.1850, 0.2080, 0.0780], ..., [-0.1278, 0.0640, 0.0398, ..., 0.1781, -0.1997, -0.2082], [-0.2318, 0.0616, -0.1276, ..., 0.0517, -0.0848, -0.1410], [ 0.0033, -0.1109, -0.0885, ..., -0.0956, -0.0563, -0.1505]], device='cuda:0'), grad: tensor([[-2.2836e-06, 1.8626e-08, 6.5193e-09, ..., 3.3528e-08, -7.5139e-06, -6.9998e-06], [ 6.5193e-09, 7.4506e-09, 1.8626e-09, ..., 2.3283e-08, 1.4994e-07, 7.6368e-08], [ 2.0023e-07, 5.4948e-08, 0.0000e+00, ..., 7.2643e-08, -1.0375e-06, -2.0489e-07], ..., [ 2.7940e-08, -1.6764e-07, 1.5832e-08, ..., -2.0210e-07, 2.5705e-07, 1.6112e-07], [ 2.1793e-07, 3.3528e-08, 1.8626e-08, ..., -4.7591e-07, 1.1511e-06, 8.3726e-07], [ 4.0326e-07, 3.9116e-08, -5.4948e-08, ..., 4.8056e-07, 1.3923e-06, 1.2573e-06]], device='cuda:0') Epoch 234, bias, value: tensor([-0.0187, -0.0314, -0.0144, -0.0242, -0.0280, 0.0050, 0.0286, -0.0137, 0.0379, -0.0032], device='cuda:0'), grad: tensor([-1.1846e-05, 8.7265e-07, -7.6219e-06, 9.9745e-07, 1.5339e-06, 1.2163e-06, 8.6948e-06, 9.2387e-07, 1.8915e-06, 3.3379e-06], device='cuda:0') 100 0.0001 changing lr epoch 233, time 250.42, cls_loss 0.0025 cls_loss_mapping 0.0021 cls_loss_causal 0.5273 re_mapping 0.0057 re_causal 0.0150 /// teacc 99.01 lr 0.00010000 Epoch 235, weight, value: tensor([[-0.1074, -0.1952, -0.0680, ..., -0.0622, 0.1669, 0.1646], [-0.2013, -0.1707, -0.0954, ..., -0.1534, -0.2009, -0.1167], [-0.0761, -0.1279, 0.1242, ..., -0.1839, 0.2101, 0.0779], ..., [-0.1277, 0.0641, 0.0397, ..., 0.1789, -0.2018, -0.2082], [-0.2322, 0.0616, -0.1276, ..., 0.0518, -0.0847, -0.1410], [ 0.0030, -0.1110, -0.0890, ..., -0.0957, -0.0564, -0.1509]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 7.4506e-09, 0.0000e+00, ..., 9.3132e-09, -2.0489e-08, -1.4901e-08], [ 0.0000e+00, 1.3690e-07, 0.0000e+00, ..., 2.2817e-07, 2.7940e-09, 9.3132e-10], [ 0.0000e+00, 1.4901e-07, 0.0000e+00, ..., 1.6671e-07, -2.9802e-08, -3.7253e-09], ..., [ 9.3132e-10, -9.7230e-07, 0.0000e+00, ..., -1.3439e-06, 2.2352e-08, 3.7253e-09], [ 4.0978e-08, 7.4506e-09, 0.0000e+00, ..., 7.9162e-08, 1.3039e-08, 1.8626e-09], [ 4.6566e-09, 6.5193e-07, 0.0000e+00, ..., 9.2015e-07, -3.7253e-09, 3.7253e-09]], device='cuda:0') Epoch 235, bias, value: tensor([-0.0186, -0.0324, -0.0136, -0.0231, -0.0280, 0.0039, 0.0285, -0.0133, 0.0380, -0.0032], device='cuda:0'), grad: tensor([ 3.7253e-08, 6.4634e-07, 3.9488e-07, 2.0489e-07, -8.8476e-08, -2.6729e-07, 4.7497e-08, -3.4552e-06, 2.4959e-07, 2.2221e-06], device='cuda:0') 100 0.0001 changing lr epoch 234, time 250.56, cls_loss 0.0030 cls_loss_mapping 0.0031 cls_loss_causal 0.5277 re_mapping 0.0053 re_causal 0.0142 /// teacc 99.03 lr 0.00010000 Epoch 236, weight, value: tensor([[-0.1075, -0.1960, -0.0684, ..., -0.0622, 0.1670, 0.1647], [-0.2016, -0.1708, -0.0954, ..., -0.1547, -0.2012, -0.1167], [-0.0762, -0.1280, 0.1260, ..., -0.1840, 0.2108, 0.0780], ..., [-0.1280, 0.0645, 0.0398, ..., 0.1797, -0.2022, -0.2085], [-0.2327, 0.0615, -0.1280, ..., 0.0519, -0.0850, -0.1415], [ 0.0026, -0.1114, -0.0890, ..., -0.0962, -0.0565, -0.1515]], device='cuda:0'), grad: tensor([[ 2.0489e-08, 4.6566e-09, 3.2596e-08, ..., 1.5832e-08, 1.6298e-07, 9.3132e-10], [ 3.7253e-09, 8.3819e-08, 0.0000e+00, ..., 1.1362e-07, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 1.0245e-08, 0.0000e+00, ..., 1.4901e-08, -2.7940e-09, -9.3132e-10], ..., [ 4.6566e-09, -8.1584e-07, 0.0000e+00, ..., -1.1642e-06, 1.8626e-09, 9.3132e-10], [ 1.2107e-08, 1.0990e-07, 4.6566e-09, ..., 1.5926e-07, 1.8626e-09, 0.0000e+00], [ 1.8626e-09, 4.9081e-07, 2.8871e-08, ..., 7.1060e-07, 1.7136e-07, 0.0000e+00]], device='cuda:0') Epoch 236, bias, value: tensor([-0.0186, -0.0338, -0.0137, -0.0249, -0.0271, 0.0054, 0.0285, -0.0121, 0.0381, -0.0038], device='cuda:0'), grad: tensor([ 9.5926e-07, -4.8149e-07, 4.6566e-08, 2.6897e-06, -6.2305e-07, -2.4606e-06, 1.8068e-07, -2.5034e-06, 5.0385e-07, 1.7183e-06], device='cuda:0') 100 0.0001 changing lr epoch 235, time 250.36, cls_loss 0.0026 cls_loss_mapping 0.0033 cls_loss_causal 0.4962 re_mapping 0.0052 re_causal 0.0135 /// teacc 99.00 lr 0.00010000 Epoch 237, weight, value: tensor([[-0.1079, -0.1971, -0.0691, ..., -0.0623, 0.1671, 0.1647], [-0.2016, -0.1709, -0.0956, ..., -0.1548, -0.2016, -0.1169], [-0.0763, -0.1280, 0.1269, ..., -0.1841, 0.2114, 0.0781], ..., [-0.1281, 0.0647, 0.0405, ..., 0.1800, -0.2026, -0.2089], [-0.2331, 0.0614, -0.1285, ..., 0.0519, -0.0853, -0.1417], [ 0.0031, -0.1114, -0.0922, ..., -0.0963, -0.0567, -0.1523]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 1.8626e-09, ..., 2.7940e-09, -4.2375e-07, -2.0210e-07], [ 0.0000e+00, 6.4261e-08, -5.3085e-08, ..., 1.2759e-07, 9.3132e-10, 9.3132e-10], [ 0.0000e+00, 1.0245e-08, 0.0000e+00, ..., 1.7695e-08, 3.2596e-08, 2.3283e-08], ..., [ 0.0000e+00, -2.5518e-07, 1.4901e-08, ..., -5.0478e-07, 4.6566e-09, 3.7253e-09], [ 0.0000e+00, 5.5879e-09, 7.4506e-09, ..., 2.7940e-09, 1.3411e-07, 5.8673e-08], [ 0.0000e+00, 1.6391e-07, -9.8720e-08, ..., 3.4459e-07, 2.3190e-07, 1.0431e-07]], device='cuda:0') Epoch 237, bias, value: tensor([-0.0187, -0.0338, -0.0137, -0.0246, -0.0260, 0.0050, 0.0283, -0.0120, 0.0379, -0.0044], device='cuda:0'), grad: tensor([-6.4634e-07, -1.0151e-06, 1.1083e-07, 3.5390e-08, 1.5125e-06, 1.3970e-08, 2.8871e-08, -1.2992e-06, 2.6543e-07, 9.9000e-07], device='cuda:0') 100 0.0001 changing lr epoch 236, time 250.41, cls_loss 0.0023 cls_loss_mapping 0.0022 cls_loss_causal 0.5031 re_mapping 0.0054 re_causal 0.0142 /// teacc 98.97 lr 0.00010000 Epoch 238, weight, value: tensor([[-0.1081, -0.1988, -0.0697, ..., -0.0625, 0.1671, 0.1647], [-0.2020, -0.1713, -0.0954, ..., -0.1552, -0.2016, -0.1173], [-0.0760, -0.1292, 0.1274, ..., -0.1850, 0.2120, 0.0783], ..., [-0.1284, 0.0650, 0.0408, ..., 0.1805, -0.2034, -0.2091], [-0.2334, 0.0613, -0.1300, ..., 0.0520, -0.0853, -0.1417], [ 0.0030, -0.1116, -0.0913, ..., -0.0964, -0.0568, -0.1531]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 3.7253e-09, ..., 9.3132e-10, -4.4703e-08, -4.5635e-08], [ 1.8626e-09, 4.6566e-09, -9.3132e-10, ..., -3.6322e-08, 7.0781e-08, 2.1420e-08], [ 0.0000e+00, 9.3132e-10, -3.3528e-08, ..., 2.7940e-09, -2.0303e-07, -6.1467e-08], ..., [ 9.3132e-10, -1.5832e-08, 1.3970e-08, ..., -1.4901e-08, 1.6764e-08, 4.6566e-09], [ 9.3132e-10, 1.8626e-09, 1.8626e-09, ..., -6.5193e-09, 7.4506e-09, 2.7940e-09], [ 1.8626e-09, 1.9558e-08, -1.0524e-07, ..., 3.8184e-08, 1.0245e-08, 7.4506e-09]], device='cuda:0') Epoch 238, bias, value: tensor([-0.0188, -0.0340, -0.0137, -0.0235, -0.0265, 0.0038, 0.0285, -0.0117, 0.0380, -0.0042], device='cuda:0'), grad: tensor([-5.4948e-08, -1.6764e-07, -2.8405e-07, -8.3819e-09, 5.1875e-07, 1.4901e-08, 1.8068e-07, 1.0617e-07, -6.5193e-09, -2.9989e-07], device='cuda:0') 100 0.0001 changing lr epoch 237, time 250.57, cls_loss 0.0036 cls_loss_mapping 0.0035 cls_loss_causal 0.5428 re_mapping 0.0052 re_causal 0.0137 /// teacc 99.01 lr 0.00010000 Epoch 239, weight, value: tensor([[-0.1084, -0.2000, -0.0701, ..., -0.0625, 0.1671, 0.1647], [-0.2031, -0.1717, -0.0950, ..., -0.1554, -0.2018, -0.1174], [-0.0761, -0.1299, 0.1276, ..., -0.1862, 0.2125, 0.0785], ..., [-0.1286, 0.0650, 0.0425, ..., 0.1798, -0.2037, -0.2097], [-0.2327, 0.0630, -0.1297, ..., 0.0561, -0.0847, -0.1421], [ 0.0017, -0.1121, -0.0924, ..., -0.0967, -0.0570, -0.1543]], device='cuda:0'), grad: tensor([[ 1.2945e-07, 1.7509e-07, 5.1223e-08, ..., 2.7008e-07, -4.0326e-07, -2.4959e-07], [ 3.7253e-08, 3.0175e-07, 8.5682e-08, ..., 2.9244e-07, 1.8626e-09, 1.8626e-09], [ 7.4506e-09, 9.3132e-09, 2.7940e-09, ..., 1.4901e-08, 4.4703e-08, 2.6077e-08], ..., [ 5.1223e-08, -6.5565e-05, -1.9163e-05, ..., -5.6177e-05, 1.8626e-08, 1.1176e-08], [ 1.6764e-08, 2.1048e-07, 5.9605e-08, ..., 1.9185e-07, 9.2201e-08, 5.7742e-08], [ 3.7514e-06, 6.4611e-05, 1.8880e-05, ..., 5.8383e-05, 1.0896e-07, 6.7987e-08]], device='cuda:0') Epoch 239, bias, value: tensor([-0.0189, -0.0339, -0.0142, -0.0232, -0.0277, 0.0029, 0.0276, -0.0121, 0.0416, -0.0036], device='cuda:0'), grad: tensor([ 3.0361e-07, 8.0653e-07, 7.6368e-08, 3.2689e-06, 1.2703e-06, -1.0841e-05, 2.8592e-07, -2.0337e-04, 9.4157e-07, 2.0742e-04], device='cuda:0') 100 0.0001 changing lr epoch 238, time 250.18, cls_loss 0.0036 cls_loss_mapping 0.0030 cls_loss_causal 0.5197 re_mapping 0.0055 re_causal 0.0139 /// teacc 98.97 lr 0.00010000 Epoch 240, weight, value: tensor([[-0.1085, -0.2013, -0.0703, ..., -0.0626, 0.1651, 0.1649], [-0.2038, -0.1721, -0.0945, ..., -0.1560, -0.2025, -0.1181], [-0.0753, -0.1302, 0.1287, ..., -0.1865, 0.2132, 0.0791], ..., [-0.1294, 0.0658, 0.0439, ..., 0.1807, -0.2041, -0.2101], [-0.2348, 0.0629, -0.1307, ..., 0.0558, -0.0859, -0.1451], [ 0.0014, -0.1128, -0.0949, ..., -0.0973, -0.0541, -0.1565]], device='cuda:0'), grad: tensor([[ 8.8476e-09, 1.3970e-09, 0.0000e+00, ..., 1.2573e-08, -1.1157e-06, -5.4482e-07], [ 9.3132e-10, 2.3283e-09, 0.0000e+00, ..., 3.2596e-09, -9.3132e-10, 1.8626e-09], [ 4.6566e-10, 1.3970e-09, 0.0000e+00, ..., 4.6566e-09, 4.6799e-07, 2.2678e-07], ..., [ 5.5879e-09, -2.4214e-08, 4.6566e-10, ..., -2.2817e-08, 1.1176e-08, 5.1223e-09], [ 5.5879e-08, 9.3132e-09, 2.7940e-09, ..., 4.9826e-08, 1.4435e-08, 6.9849e-09], [ 4.0047e-08, 1.9092e-08, -3.7253e-09, ..., 6.3330e-08, 5.8953e-07, 2.8871e-07]], device='cuda:0') Epoch 240, bias, value: tensor([-0.0215, -0.0343, -0.0149, -0.0233, -0.0272, 0.0029, 0.0278, -0.0111, 0.0410, -0.0021], device='cuda:0'), grad: tensor([-1.9576e-06, -3.8696e-07, 9.1782e-07, 3.9339e-06, 1.6112e-07, -4.0121e-06, 2.2165e-07, 2.4214e-08, 2.5379e-07, 8.4611e-07], device='cuda:0') 100 0.0001 changing lr epoch 239, time 252.06, cls_loss 0.0036 cls_loss_mapping 0.0034 cls_loss_causal 0.5169 re_mapping 0.0052 re_causal 0.0135 /// teacc 99.01 lr 0.00010000 Epoch 241, weight, value: tensor([[-0.1085, -0.2028, -0.0711, ..., -0.0626, 0.1652, 0.1650], [-0.2040, -0.1724, -0.0947, ..., -0.1564, -0.2023, -0.1168], [-0.0754, -0.1306, 0.1297, ..., -0.1870, 0.2130, 0.0775], ..., [-0.1309, 0.0645, 0.0443, ..., 0.1809, -0.2043, -0.2105], [-0.2359, 0.0627, -0.1328, ..., 0.0552, -0.0866, -0.1462], [ 0.0010, -0.1131, -0.0948, ..., -0.0976, -0.0542, -0.1585]], device='cuda:0'), grad: tensor([[ 3.4645e-07, 5.5693e-07, 0.0000e+00, ..., 6.1467e-08, 6.7055e-08, 1.4165e-06], [ 3.5390e-08, 3.5809e-07, 0.0000e+00, ..., 3.9162e-07, 1.6298e-08, 9.2667e-08], [ 3.6787e-08, 6.0070e-08, 0.0000e+00, ..., 1.8859e-07, -2.7381e-07, -1.9418e-07], ..., [ 4.4238e-08, -1.1073e-06, 0.0000e+00, ..., -1.4696e-06, 2.2678e-07, 2.9476e-07], [ 8.8476e-07, 1.3132e-06, 0.0000e+00, ..., -2.1653e-07, 4.6287e-07, 3.5204e-06], [ 1.5832e-06, 3.4608e-06, -0.0000e+00, ..., 1.0002e-06, 4.0978e-08, 1.2293e-07]], device='cuda:0') Epoch 241, bias, value: tensor([-0.0215, -0.0336, -0.0173, -0.0211, -0.0272, 0.0026, 0.0279, -0.0111, 0.0403, -0.0023], device='cuda:0'), grad: tensor([ 8.1062e-06, 1.8766e-07, 8.0140e-07, -5.0291e-06, 5.0813e-06, 6.1989e-06, -3.9846e-05, -1.9409e-06, 1.8939e-05, 7.5139e-06], device='cuda:0') 100 0.0001 changing lr epoch 240, time 252.34, cls_loss 0.0028 cls_loss_mapping 0.0022 cls_loss_causal 0.4799 re_mapping 0.0054 re_causal 0.0136 /// teacc 98.96 lr 0.00010000 Epoch 242, weight, value: tensor([[-0.1086, -0.2040, -0.0713, ..., -0.0627, 0.1653, 0.1651], [-0.2045, -0.1730, -0.0927, ..., -0.1562, -0.2031, -0.1169], [-0.0751, -0.1311, 0.1301, ..., -0.1873, 0.2140, 0.0778], ..., [-0.1310, 0.0645, 0.0419, ..., 0.1812, -0.2049, -0.2109], [-0.2363, 0.0626, -0.1331, ..., 0.0552, -0.0873, -0.1472], [ 0.0010, -0.1134, -0.0951, ..., -0.0979, -0.0542, -0.1608]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 1.0943e-07, ..., 4.0093e-07, -2.7940e-09, -3.7253e-09], [ 1.3970e-09, 7.9162e-09, 9.9745e-07, ..., 2.7288e-07, 6.5230e-06, 0.0000e+00], [ 9.3132e-10, -9.3132e-09, -1.1943e-05, ..., -4.5985e-05, -9.9421e-05, 0.0000e+00], ..., [ 3.2596e-09, -7.4506e-09, 1.1548e-06, ..., 9.8199e-06, 9.2864e-05, 0.0000e+00], [ 1.3970e-08, 1.7695e-08, 7.3090e-06, ..., 2.6777e-05, 2.5611e-08, 0.0000e+00], [ 5.5879e-09, -2.4214e-08, -9.3132e-09, ..., 1.2573e-08, 8.3819e-09, 4.6566e-09]], device='cuda:0') Epoch 242, bias, value: tensor([-0.0214, -0.0339, -0.0166, -0.0200, -0.0272, 0.0019, 0.0277, -0.0110, 0.0401, -0.0026], device='cuda:0'), grad: tensor([ 1.3746e-06, 1.8075e-05, -4.4513e-04, 2.7521e-07, 2.8446e-05, 2.9169e-06, 6.3190e-07, 3.0541e-04, 9.1851e-05, -3.9712e-06], device='cuda:0') 100 0.0001 changing lr epoch 241, time 252.42, cls_loss 0.0027 cls_loss_mapping 0.0024 cls_loss_causal 0.4987 re_mapping 0.0054 re_causal 0.0141 /// teacc 99.03 lr 0.00010000 Epoch 243, weight, value: tensor([[-0.1086, -0.2043, -0.0713, ..., -0.0627, 0.1654, 0.1653], [-0.2053, -0.1731, -0.0918, ..., -0.1566, -0.2034, -0.1170], [-0.0743, -0.1323, 0.1317, ..., -0.1871, 0.2161, 0.0781], ..., [-0.1317, 0.0645, 0.0412, ..., 0.1815, -0.2082, -0.2122], [-0.2365, 0.0624, -0.1351, ..., 0.0552, -0.0875, -0.1474], [ 0.0020, -0.1134, -0.0957, ..., -0.0980, -0.0543, -0.1618]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.1910e-09, 0.0000e+00, ..., 1.1409e-07, -2.9942e-07, -2.0582e-07], [ 9.3132e-10, 7.4506e-09, 0.0000e+00, ..., 1.3970e-08, 2.9802e-08, 1.3970e-09], [ 4.6566e-10, 1.7229e-08, 0.0000e+00, ..., 2.6496e-07, -2.1234e-07, -2.7474e-08], ..., [ 3.2596e-09, -4.8894e-08, 0.0000e+00, ..., -6.2864e-08, 7.9162e-08, 1.3970e-09], [ 2.7940e-09, 2.3283e-09, 0.0000e+00, ..., -1.4184e-06, -3.3295e-07, 4.0047e-08], [ 9.3132e-10, 2.1420e-08, 0.0000e+00, ..., 2.5611e-08, 3.0082e-07, 1.7881e-07]], device='cuda:0') Epoch 243, bias, value: tensor([-0.0214, -0.0344, -0.0146, -0.0192, -0.0274, 0.0009, 0.0279, -0.0111, 0.0400, -0.0026], device='cuda:0'), grad: tensor([-1.9977e-07, -3.7253e-09, 1.0757e-07, 2.6217e-07, 1.9232e-07, 2.3562e-06, 9.1409e-07, 1.7975e-07, -4.4070e-06, 6.1234e-07], device='cuda:0') 100 0.0001 changing lr epoch 242, time 252.48, cls_loss 0.0027 cls_loss_mapping 0.0033 cls_loss_causal 0.5178 re_mapping 0.0052 re_causal 0.0138 /// teacc 98.96 lr 0.00010000 Epoch 244, weight, value: tensor([[-0.1087, -0.2045, -0.0714, ..., -0.0629, 0.1656, 0.1653], [-0.2054, -0.1734, -0.0917, ..., -0.1567, -0.2035, -0.1170], [-0.0744, -0.1342, 0.1321, ..., -0.1884, 0.2168, 0.0781], ..., [-0.1345, 0.0647, 0.0412, ..., 0.1818, -0.2087, -0.2124], [-0.2370, 0.0623, -0.1351, ..., 0.0551, -0.0876, -0.1475], [ 0.0019, -0.1136, -0.0956, ..., -0.0985, -0.0544, -0.1633]], device='cuda:0'), grad: tensor([[ 1.0710e-07, 3.7253e-09, 6.0536e-09, ..., 6.0536e-08, 8.3819e-09, -1.1176e-08], [ 1.0198e-07, 2.0489e-08, 1.3970e-09, ..., 1.4808e-07, 3.7253e-09, 0.0000e+00], [ 2.6077e-08, -3.2596e-09, -2.5891e-07, ..., 2.4214e-08, -9.7137e-07, -1.3970e-09], ..., [ 1.2433e-07, 1.7695e-08, 4.6566e-09, ..., 2.1514e-07, 1.7229e-08, 0.0000e+00], [ 3.4004e-05, 2.1979e-06, 4.6566e-10, ..., 3.3349e-05, 4.6566e-09, 4.6566e-10], [ 1.0449e-06, 2.9802e-08, 0.0000e+00, ..., -3.1982e-06, 9.7789e-09, 6.9849e-09]], device='cuda:0') Epoch 244, bias, value: tensor([-0.0213, -0.0342, -0.0150, -0.0191, -0.0271, 0.0012, 0.0277, -0.0109, 0.0399, -0.0029], device='cuda:0'), grad: tensor([ 2.3702e-07, 3.4459e-07, -9.7789e-07, 4.8089e-04, 1.4514e-05, -5.5122e-04, 3.9563e-06, 1.2424e-06, 6.4135e-05, -1.3404e-05], device='cuda:0') 100 0.0001 changing lr epoch 243, time 250.57, cls_loss 0.0031 cls_loss_mapping 0.0030 cls_loss_causal 0.5225 re_mapping 0.0050 re_causal 0.0138 /// teacc 98.99 lr 0.00010000 Epoch 245, weight, value: tensor([[-0.1087, -0.2054, -0.0702, ..., -0.0629, 0.1659, 0.1655], [-0.2055, -0.1736, -0.0908, ..., -0.1569, -0.2046, -0.1178], [-0.0744, -0.1355, 0.1303, ..., -0.1895, 0.2171, 0.0782], ..., [-0.1358, 0.0632, 0.0409, ..., 0.1795, -0.2090, -0.2128], [-0.2383, 0.0653, -0.1360, ..., 0.0577, -0.0883, -0.1486], [ 0.0010, -0.1138, -0.0955, ..., -0.0987, -0.0544, -0.1648]], device='cuda:0'), grad: tensor([[ 3.3528e-08, 2.1886e-08, 0.0000e+00, ..., 6.5193e-09, -1.8962e-06, -1.5860e-06], [ 8.7544e-08, 1.0105e-07, 0.0000e+00, ..., 3.9116e-08, 1.1176e-08, 8.8476e-09], [ 1.1083e-07, 1.2666e-07, 0.0000e+00, ..., 4.6566e-10, 9.4995e-08, 7.9162e-08], ..., [ 8.3353e-08, 1.2107e-07, 0.0000e+00, ..., 1.2573e-08, 6.9849e-09, 5.5879e-09], [ 1.8626e-07, 2.6310e-07, 0.0000e+00, ..., 4.0047e-08, 1.9185e-07, 1.6019e-07], [ 2.8871e-08, 3.4925e-08, 0.0000e+00, ..., -7.2643e-08, 1.5479e-06, 1.2945e-06]], device='cuda:0') Epoch 245, bias, value: tensor([-0.0211, -0.0331, -0.0170, -0.0199, -0.0277, 0.0024, 0.0275, -0.0124, 0.0421, -0.0028], device='cuda:0'), grad: tensor([-3.8557e-06, 5.6764e-07, 7.0455e-07, 3.5018e-06, 1.7043e-07, -5.5432e-06, -3.9814e-07, 5.6112e-07, 1.4091e-06, 2.8685e-06], device='cuda:0') 100 0.0001 changing lr epoch 244, time 250.29, cls_loss 0.0028 cls_loss_mapping 0.0025 cls_loss_causal 0.4792 re_mapping 0.0052 re_causal 0.0131 /// teacc 99.03 lr 0.00010000 Epoch 246, weight, value: tensor([[-0.1087, -0.2065, -0.0709, ..., -0.0629, 0.1663, 0.1658], [-0.2056, -0.1739, -0.0908, ..., -0.1573, -0.2064, -0.1193], [-0.0745, -0.1357, 0.1306, ..., -0.1900, 0.2177, 0.0783], ..., [-0.1361, 0.0634, 0.0408, ..., 0.1797, -0.2095, -0.2133], [-0.2387, 0.0652, -0.1370, ..., 0.0577, -0.0886, -0.1490], [ 0.0010, -0.1141, -0.0956, ..., -0.0987, -0.0546, -0.1687]], device='cuda:0'), grad: tensor([[ 4.6566e-10, -1.1921e-07, 0.0000e+00, ..., 0.0000e+00, -6.5146e-07, -3.3295e-07], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., -4.6566e-09, 3.9116e-08, 2.7940e-09], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 0.0000e+00, -1.0738e-06, 5.5879e-09], ..., [ 1.3970e-09, 1.9558e-08, 0.0000e+00, ..., 1.8626e-09, 5.4017e-08, 2.7474e-08], [ 4.0978e-08, 2.5611e-08, 0.0000e+00, ..., 2.4214e-08, 4.1910e-08, 2.0955e-08], [ 2.1420e-08, 6.5193e-08, 0.0000e+00, ..., 2.2817e-08, 2.7660e-07, 1.4110e-07]], device='cuda:0') Epoch 246, bias, value: tensor([-0.0208, -0.0334, -0.0172, -0.0204, -0.0279, 0.0026, 0.0278, -0.0123, 0.0420, -0.0028], device='cuda:0'), grad: tensor([-1.5479e-06, -5.2676e-06, 1.5693e-07, 2.0908e-07, 4.7125e-07, 1.7183e-07, 2.3264e-06, 3.3583e-06, 1.8626e-07, -5.7742e-08], device='cuda:0') 100 0.0001 changing lr epoch 245, time 250.25, cls_loss 0.0027 cls_loss_mapping 0.0021 cls_loss_causal 0.5151 re_mapping 0.0053 re_causal 0.0140 /// teacc 98.96 lr 0.00010000 Epoch 247, weight, value: tensor([[-1.0872e-01, -2.0688e-01, -7.0241e-02, ..., -6.2884e-02, 1.6696e-01, 1.6604e-01], [-2.0644e-01, -1.7410e-01, -9.0777e-02, ..., -1.5660e-01, -2.0770e-01, -1.1931e-01], [-7.4634e-02, -1.3569e-01, 1.3060e-01, ..., -1.9005e-01, 2.1911e-01, 7.8251e-02], ..., [-1.3666e-01, 6.3550e-02, 4.0760e-02, ..., 1.7963e-01, -2.0993e-01, -2.1364e-01], [-2.3937e-01, 6.5157e-02, -1.3700e-01, ..., 5.7704e-02, -8.9496e-02, -1.4954e-01], [ 4.4920e-05, -1.1426e-01, -9.5559e-02, ..., -9.8916e-02, -5.4780e-02, -1.7133e-01]], device='cuda:0'), grad: tensor([[ 4.6566e-09, -1.5879e-07, 0.0000e+00, ..., 2.7940e-09, -1.9222e-06, -1.1241e-06], [ 0.0000e+00, 3.5390e-08, 2.0489e-08, ..., 2.5705e-07, 2.1420e-08, 1.2573e-08], [ 0.0000e+00, 2.7008e-08, 2.2817e-08, ..., 2.9197e-07, 8.8010e-08, 4.7963e-08], ..., [ 5.1223e-09, -1.9697e-07, 4.6566e-10, ..., -3.1618e-07, 2.0023e-08, 1.1176e-08], [ 5.0757e-08, -2.1514e-07, -6.3330e-08, ..., -1.2368e-06, 7.2084e-07, 4.3400e-07], [ 9.3132e-10, 1.6904e-07, 0.0000e+00, ..., 2.1188e-07, 3.0827e-07, 1.7928e-07]], device='cuda:0') Epoch 247, bias, value: tensor([-0.0205, -0.0331, -0.0168, -0.0206, -0.0286, 0.0028, 0.0278, -0.0126, 0.0419, -0.0028], device='cuda:0'), grad: tensor([-4.0643e-06, 1.2163e-06, 1.4771e-06, 1.3597e-06, -7.5698e-06, 1.7472e-06, 1.1101e-06, -5.6857e-07, -3.3714e-06, 8.6427e-06], device='cuda:0') 100 0.0001 changing lr epoch 246, time 249.85, cls_loss 0.0035 cls_loss_mapping 0.0025 cls_loss_causal 0.5216 re_mapping 0.0053 re_causal 0.0134 /// teacc 99.07 lr 0.00010000 Epoch 248, weight, value: tensor([[-0.1088, -0.2084, -0.0706, ..., -0.0630, 0.1672, 0.1664], [-0.2067, -0.1747, -0.0908, ..., -0.1563, -0.2080, -0.1194], [-0.0747, -0.1360, 0.1307, ..., -0.1903, 0.2199, 0.0784], ..., [-0.1371, 0.0639, 0.0407, ..., 0.1797, -0.2107, -0.2139], [-0.2397, 0.0654, -0.1370, ..., 0.0577, -0.0886, -0.1497], [-0.0006, -0.1145, -0.0956, ..., -0.0991, -0.0547, -0.1730]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 3.2596e-09, 0.0000e+00, ..., 9.3132e-09, -1.7341e-06, -1.6410e-06], [ 2.7940e-09, 1.0524e-07, 0.0000e+00, ..., 3.5018e-07, 4.9360e-08, 1.7229e-08], [ 9.3132e-10, 2.3283e-09, 0.0000e+00, ..., 4.6566e-09, -1.4855e-07, -5.6345e-08], ..., [ 4.6566e-09, -9.8068e-07, 0.0000e+00, ..., -1.2834e-06, 1.9092e-08, 9.7789e-09], [ 2.0955e-08, 2.7940e-08, 0.0000e+00, ..., 2.2352e-08, 4.6799e-07, 4.0559e-07], [ 1.2107e-08, 6.9384e-08, 0.0000e+00, ..., 1.9465e-07, 1.1781e-06, 1.1129e-06]], device='cuda:0') Epoch 248, bias, value: tensor([-0.0205, -0.0325, -0.0170, -0.0209, -0.0298, 0.0027, 0.0271, -0.0130, 0.0424, -0.0019], device='cuda:0'), grad: tensor([-5.1558e-06, 6.5658e-07, -3.1479e-07, 6.5425e-07, 4.1910e-08, 7.6462e-07, 3.3388e-07, -2.6114e-06, 1.4473e-06, 4.1761e-06], device='cuda:0') 100 0.0001 changing lr epoch 247, time 250.19, cls_loss 0.0028 cls_loss_mapping 0.0028 cls_loss_causal 0.4910 re_mapping 0.0051 re_causal 0.0130 /// teacc 98.99 lr 0.00010000 Epoch 249, weight, value: tensor([[-0.1110, -0.2091, -0.0706, ..., -0.0630, 0.1663, 0.1658], [-0.2069, -0.1753, -0.0908, ..., -0.1577, -0.2091, -0.1195], [-0.0749, -0.1363, 0.1307, ..., -0.1908, 0.2217, 0.0785], ..., [-0.1389, 0.0641, 0.0407, ..., 0.1802, -0.2110, -0.2142], [-0.2400, 0.0654, -0.1371, ..., 0.0577, -0.0887, -0.1500], [-0.0008, -0.1146, -0.0956, ..., -0.0992, -0.0549, -0.1748]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 4.6566e-10, 0.0000e+00, ..., 1.8626e-09, -1.4575e-07, -5.9605e-08], [ 5.1223e-09, 5.1223e-09, 0.0000e+00, ..., 3.2596e-09, -1.6019e-07, 9.3132e-10], [ 1.3970e-09, 2.3283e-09, 0.0000e+00, ..., 5.5879e-09, 4.3306e-08, -1.8626e-09], ..., [ 1.1642e-08, -8.7544e-08, 0.0000e+00, ..., -1.2992e-07, 2.0489e-08, 2.3283e-09], [ 8.0094e-08, 6.5193e-09, 0.0000e+00, ..., 4.7963e-08, 4.5169e-08, 9.3132e-09], [ 8.8476e-09, 5.4017e-08, 0.0000e+00, ..., 9.7323e-08, 1.3877e-07, 5.2620e-08]], device='cuda:0') Epoch 249, bias, value: tensor([-0.0211, -0.0335, -0.0168, -0.0205, -0.0297, 0.0020, 0.0285, -0.0124, 0.0425, -0.0021], device='cuda:0'), grad: tensor([-1.9139e-07, -8.5495e-07, 3.0873e-07, 3.3528e-08, -1.2247e-07, -2.4075e-07, 4.2329e-07, -6.6590e-08, 2.3330e-07, 4.7032e-07], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 248---------------------------------------------------- epoch 248, time 267.07, cls_loss 0.0026 cls_loss_mapping 0.0032 cls_loss_causal 0.5209 re_mapping 0.0051 re_causal 0.0138 /// teacc 99.14 lr 0.00010000 Epoch 250, weight, value: tensor([[-0.1111, -0.2104, -0.0707, ..., -0.0631, 0.1666, 0.1661], [-0.2073, -0.1757, -0.0907, ..., -0.1579, -0.2104, -0.1201], [-0.0744, -0.1365, 0.1307, ..., -0.1911, 0.2236, 0.0791], ..., [-0.1389, 0.0650, 0.0408, ..., 0.1808, -0.2112, -0.2145], [-0.2404, 0.0654, -0.1373, ..., 0.0578, -0.0887, -0.1504], [-0.0008, -0.1159, -0.0956, ..., -0.1003, -0.0551, -0.1771]], device='cuda:0'), grad: tensor([[ 5.7276e-08, 4.3306e-08, 0.0000e+00, ..., 1.3970e-09, 1.3113e-05, -1.3970e-08], [ 1.0245e-08, 1.8626e-08, 0.0000e+00, ..., 1.3504e-08, 1.1642e-08, 4.6566e-10], [ 7.7765e-08, 6.5658e-08, 0.0000e+00, ..., 6.0536e-09, 8.2422e-08, -0.0000e+00], ..., [ 1.8626e-08, -9.4064e-08, 0.0000e+00, ..., -1.3504e-07, 1.3132e-07, 9.3132e-10], [ 1.4901e-08, 6.0536e-09, 0.0000e+00, ..., 7.4506e-09, 5.1223e-09, 9.3132e-10], [ 1.1176e-08, 8.8476e-08, 0.0000e+00, ..., 9.8720e-08, -1.3344e-05, 9.3132e-09]], device='cuda:0') Epoch 250, bias, value: tensor([-0.0209, -0.0344, -0.0152, -0.0204, -0.0301, 0.0014, 0.0282, -0.0119, 0.0428, -0.0025], device='cuda:0'), grad: tensor([ 6.8963e-05, 1.0850e-07, 5.7649e-07, -8.6520e-07, 1.0524e-07, 5.6438e-07, 4.5169e-08, 4.1304e-07, 5.5414e-08, -6.9857e-05], device='cuda:0') 100 0.0001 changing lr epoch 249, time 250.41, cls_loss 0.0033 cls_loss_mapping 0.0029 cls_loss_causal 0.5149 re_mapping 0.0049 re_causal 0.0130 /// teacc 98.99 lr 0.00010000 Epoch 251, weight, value: tensor([[-0.1111, -0.2124, -0.0707, ..., -0.0634, 0.1668, 0.1663], [-0.2079, -0.1761, -0.0906, ..., -0.1583, -0.2106, -0.1202], [-0.0745, -0.1369, 0.1308, ..., -0.1914, 0.2241, 0.0791], ..., [-0.1392, 0.0653, 0.0407, ..., 0.1812, -0.2116, -0.2147], [-0.2409, 0.0654, -0.1383, ..., 0.0577, -0.0889, -0.1507], [-0.0010, -0.1164, -0.0956, ..., -0.1010, -0.0551, -0.1788]], device='cuda:0'), grad: tensor([[ 9.5461e-08, 4.9174e-07, 0.0000e+00, ..., 2.0489e-08, 1.8626e-08, -7.5903e-08], [ 7.0315e-08, 1.0012e-06, 0.0000e+00, ..., 5.5414e-08, 4.9500e-07, 2.3283e-09], [ 1.5926e-07, 4.1090e-06, 4.6566e-10, ..., 1.6950e-07, 2.9653e-06, 0.0000e+00], ..., [ 5.3085e-08, -1.5218e-06, 2.3283e-09, ..., -5.2992e-07, 1.4016e-07, 3.7253e-09], [ 1.9139e-07, -5.3406e-05, 4.6566e-10, ..., 3.9116e-08, -4.5627e-05, 9.3132e-09], [ 1.8161e-07, 1.0151e-06, -4.6566e-09, ..., 1.7136e-07, 1.2061e-07, 4.4703e-08]], device='cuda:0') Epoch 251, bias, value: tensor([-2.0832e-02, -3.6316e-02, -1.5020e-02, -1.8726e-02, -2.9570e-02, 8.0183e-05, 2.7928e-02, -1.1584e-02, 4.2911e-02, -1.6742e-03], device='cuda:0'), grad: tensor([ 2.2277e-06, 5.7891e-06, 2.9624e-05, 7.8753e-06, 5.0571e-07, 2.0843e-06, 3.6812e-04, -2.9393e-06, -4.1604e-04, 3.1721e-06], device='cuda:0') 100 0.0001 changing lr epoch 250, time 250.33, cls_loss 0.0029 cls_loss_mapping 0.0030 cls_loss_causal 0.5065 re_mapping 0.0055 re_causal 0.0132 /// teacc 98.97 lr 0.00010000 Epoch 252, weight, value: tensor([[-0.1111, -0.2119, -0.0713, ..., -0.0635, 0.1673, 0.1667], [-0.2080, -0.1763, -0.0907, ..., -0.1583, -0.2111, -0.1203], [-0.0746, -0.1378, 0.1311, ..., -0.1920, 0.2251, 0.0789], ..., [-0.1395, 0.0656, 0.0407, ..., 0.1815, -0.2127, -0.2152], [-0.2424, 0.0655, -0.1386, ..., 0.0575, -0.0888, -0.1527], [-0.0013, -0.1167, -0.0956, ..., -0.1013, -0.0553, -0.1814]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 1.8626e-09, 2.2352e-08, ..., 0.0000e+00, -2.7567e-07, -2.9569e-07], [ 2.9383e-07, 1.5367e-07, 1.0245e-08, ..., 1.3970e-09, 3.3993e-08, 8.3819e-09], [ 9.3132e-09, 8.3819e-09, -2.0582e-06, ..., 4.6566e-10, 1.6764e-08, 2.5146e-08], ..., [ 2.0489e-08, -4.6566e-08, 1.9511e-07, ..., -5.8673e-08, 3.7253e-09, 1.3970e-09], [ 5.4482e-08, 1.1176e-08, 1.5162e-06, ..., -0.0000e+00, 1.5367e-08, 1.0710e-08], [ 1.8207e-07, 1.4622e-07, 9.3132e-10, ..., 4.0047e-08, 2.5099e-07, 3.1199e-08]], device='cuda:0') Epoch 252, bias, value: tensor([-0.0205, -0.0365, -0.0148, -0.0192, -0.0295, 0.0006, 0.0272, -0.0116, 0.0427, -0.0016], device='cuda:0'), grad: tensor([-1.1735e-07, 1.0757e-06, -1.4283e-05, -7.8883e-07, -1.3802e-06, -5.5041e-07, 7.5158e-07, 1.3374e-06, 1.0692e-05, 3.2708e-06], device='cuda:0') 100 0.0001 changing lr epoch 251, time 250.50, cls_loss 0.0023 cls_loss_mapping 0.0020 cls_loss_causal 0.4796 re_mapping 0.0054 re_causal 0.0132 /// teacc 99.02 lr 0.00010000 Epoch 253, weight, value: tensor([[-0.1111, -0.2125, -0.0713, ..., -0.0636, 0.1674, 0.1668], [-0.2082, -0.1767, -0.0907, ..., -0.1585, -0.2112, -0.1203], [-0.0747, -0.1383, 0.1315, ..., -0.1921, 0.2263, 0.0791], ..., [-0.1404, 0.0665, 0.0411, ..., 0.1818, -0.2142, -0.2160], [-0.2441, 0.0654, -0.1393, ..., 0.0574, -0.0891, -0.1543], [-0.0015, -0.1169, -0.0955, ..., -0.1015, -0.0554, -0.1820]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 2.3283e-09, 0.0000e+00, ..., 2.3283e-09, -6.0536e-09, -5.1223e-09], [ 9.3132e-10, 2.3283e-09, 0.0000e+00, ..., 2.3283e-09, 4.6566e-09, 3.2596e-09], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 2.7940e-09, -6.9849e-09, -2.3283e-09], ..., [ 4.6566e-10, -6.7987e-08, 0.0000e+00, ..., -7.1712e-08, 1.3970e-09, 4.6566e-10], [ 6.0536e-09, 6.5193e-09, 0.0000e+00, ..., -3.7253e-09, 7.9162e-09, 3.2596e-09], [ 4.6566e-10, 4.6100e-08, 0.0000e+00, ..., 4.8894e-08, 1.3970e-09, 9.3132e-10]], device='cuda:0') Epoch 253, bias, value: tensor([-0.0204, -0.0365, -0.0144, -0.0198, -0.0295, 0.0008, 0.0274, -0.0115, 0.0424, -0.0016], device='cuda:0'), grad: tensor([ 2.6543e-08, -2.7940e-08, 1.5367e-08, -2.7474e-08, -1.0766e-06, 8.1491e-08, 2.1420e-08, 7.5437e-08, 1.4435e-08, 9.1502e-07], device='cuda:0') 100 0.0001 changing lr epoch 252, time 250.19, cls_loss 0.0026 cls_loss_mapping 0.0023 cls_loss_causal 0.5083 re_mapping 0.0052 re_causal 0.0130 /// teacc 99.02 lr 0.00010000 Epoch 254, weight, value: tensor([[-0.1112, -0.2147, -0.0714, ..., -0.0636, 0.1674, 0.1669], [-0.2083, -0.1768, -0.0906, ..., -0.1586, -0.2114, -0.1204], [-0.0747, -0.1389, 0.1316, ..., -0.1927, 0.2294, 0.0794], ..., [-0.1406, 0.0666, 0.0410, ..., 0.1819, -0.2175, -0.2167], [-0.2456, 0.0654, -0.1395, ..., 0.0572, -0.0895, -0.1546], [-0.0009, -0.1169, -0.0955, ..., -0.1014, -0.0553, -0.1825]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 1.3970e-09, -1.0151e-07, -5.5414e-08], [ 9.3132e-10, 2.3749e-08, 0.0000e+00, ..., 1.3970e-08, 9.3132e-10, 4.6566e-10], [ 7.4506e-09, 1.3970e-08, 0.0000e+00, ..., 8.3819e-09, 7.9162e-09, 3.7253e-09], ..., [ 5.1223e-09, -3.9628e-07, 0.0000e+00, ..., -2.8685e-07, 3.7253e-09, 2.3283e-09], [ 4.1910e-09, 1.4435e-08, 0.0000e+00, ..., -5.4017e-08, 1.4901e-08, 8.8476e-09], [ 2.3283e-09, 1.0664e-07, -0.0000e+00, ..., 6.7987e-08, 5.7276e-08, 3.0268e-08]], device='cuda:0') Epoch 254, bias, value: tensor([-0.0205, -0.0363, -0.0135, -0.0200, -0.0302, 0.0023, 0.0265, -0.0122, 0.0418, -0.0010], device='cuda:0'), grad: tensor([-1.9139e-07, -1.4808e-07, 8.8476e-08, 4.2375e-07, -5.1223e-08, 1.8300e-07, 2.9802e-08, -5.6205e-07, -9.0804e-08, 3.4738e-07], device='cuda:0') 100 0.0001 changing lr epoch 253, time 250.25, cls_loss 0.0025 cls_loss_mapping 0.0026 cls_loss_causal 0.4969 re_mapping 0.0051 re_causal 0.0128 /// teacc 99.00 lr 0.00010000 Epoch 255, weight, value: tensor([[-0.1112, -0.2158, -0.0715, ..., -0.0636, 0.1681, 0.1673], [-0.2118, -0.1777, -0.0906, ..., -0.1589, -0.2117, -0.1210], [-0.0740, -0.1394, 0.1319, ..., -0.1930, 0.2296, 0.0800], ..., [-0.1405, 0.0669, 0.0410, ..., 0.1821, -0.2176, -0.2169], [-0.2470, 0.0653, -0.1399, ..., 0.0572, -0.0910, -0.1559], [-0.0023, -0.1171, -0.0954, ..., -0.1015, -0.0560, -0.1871]], device='cuda:0'), grad: tensor([[ 1.5832e-08, 2.6077e-08, 0.0000e+00, ..., 4.9826e-08, -5.8860e-06, -5.6736e-06], [ 9.3132e-10, 6.5193e-09, 0.0000e+00, ..., 6.9849e-09, 5.4948e-08, 5.2154e-08], [ 9.3132e-10, 4.2841e-08, 0.0000e+00, ..., 8.1025e-08, 3.2643e-07, 3.1013e-07], ..., [ 4.6566e-10, -2.4261e-07, 0.0000e+00, ..., -2.5332e-07, 1.1502e-07, 1.2992e-07], [ 2.9150e-07, 8.3819e-09, 0.0000e+00, ..., 3.6834e-07, 2.9374e-06, 2.8331e-06], [ 4.6566e-10, 1.4435e-07, 0.0000e+00, ..., 1.5926e-07, 1.1502e-06, 1.1045e-06]], device='cuda:0') Epoch 255, bias, value: tensor([-0.0199, -0.0367, -0.0134, -0.0210, -0.0307, 0.0040, 0.0264, -0.0122, 0.0415, -0.0013], device='cuda:0'), grad: tensor([-2.0012e-05, 2.2352e-07, 1.4110e-06, 4.5868e-07, -4.9826e-07, 1.2796e-06, 2.2054e-06, -1.3085e-07, 1.0245e-05, 4.7982e-06], device='cuda:0') 100 0.0001 changing lr epoch 254, time 250.41, cls_loss 0.0030 cls_loss_mapping 0.0024 cls_loss_causal 0.5248 re_mapping 0.0050 re_causal 0.0130 /// teacc 99.02 lr 0.00010000 Epoch 256, weight, value: tensor([[-0.1113, -0.2163, -0.0716, ..., -0.0639, 0.1684, 0.1675], [-0.2122, -0.1791, -0.0902, ..., -0.1596, -0.2123, -0.1218], [-0.0724, -0.1415, 0.1321, ..., -0.1944, 0.2302, 0.0813], ..., [-0.1413, 0.0666, 0.0410, ..., 0.1815, -0.2178, -0.2177], [-0.2482, 0.0666, -0.1425, ..., 0.0582, -0.0916, -0.1568], [-0.0031, -0.1174, -0.0954, ..., -0.1018, -0.0562, -0.1884]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -6.9626e-06, -2.6282e-06], [ 4.6566e-10, 2.3283e-09, 0.0000e+00, ..., 0.0000e+00, 9.8255e-08, 3.7719e-08], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, -9.1735e-08, -5.7276e-08], ..., [ 4.6566e-10, 1.8626e-09, 0.0000e+00, ..., 4.6566e-10, 2.5285e-07, 9.6392e-08], [ 2.9197e-07, 8.2189e-07, 0.0000e+00, ..., 1.6158e-07, 4.9919e-07, 1.8813e-07], [ 7.4506e-09, 2.0023e-08, 0.0000e+00, ..., 3.2596e-09, 3.4012e-06, 1.3011e-06]], device='cuda:0') Epoch 256, bias, value: tensor([-0.0197, -0.0369, -0.0130, -0.0213, -0.0308, 0.0042, 0.0262, -0.0126, 0.0427, -0.0015], device='cuda:0'), grad: tensor([-1.8612e-05, 2.7101e-07, -2.5844e-07, -9.7509e-07, -1.4948e-07, 2.2538e-06, 4.7423e-06, 6.9523e-07, 2.8498e-06, 9.1717e-06], device='cuda:0') 100 0.0001 changing lr epoch 255, time 250.27, cls_loss 0.0022 cls_loss_mapping 0.0017 cls_loss_causal 0.5129 re_mapping 0.0049 re_causal 0.0131 /// teacc 99.05 lr 0.00010000 Epoch 257, weight, value: tensor([[-0.1115, -0.2171, -0.0717, ..., -0.0640, 0.1685, 0.1676], [-0.2122, -0.1795, -0.0902, ..., -0.1598, -0.2124, -0.1219], [-0.0726, -0.1424, 0.1332, ..., -0.1961, 0.2307, 0.0811], ..., [-0.1427, 0.0668, 0.0409, ..., 0.1816, -0.2179, -0.2182], [-0.2487, 0.0666, -0.1431, ..., 0.0582, -0.0922, -0.1574], [-0.0029, -0.1175, -0.0954, ..., -0.1019, -0.0563, -0.1892]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 2.7940e-09, 0.0000e+00, ..., 0.0000e+00, 6.0536e-08, 3.2596e-09], [ 4.1910e-09, 6.9849e-09, 0.0000e+00, ..., 0.0000e+00, 2.6543e-08, 4.1910e-09], [ 2.7940e-09, 5.5879e-09, 0.0000e+00, ..., 9.3132e-10, -5.5768e-06, -8.1584e-07], ..., [ 7.4506e-09, 1.3970e-08, 0.0000e+00, ..., 1.8626e-09, 5.3570e-06, 7.8417e-07], [ 2.1420e-08, 3.4925e-08, 0.0000e+00, ..., 4.6566e-10, 7.9162e-08, 1.2107e-08], [ 6.9849e-09, 1.3504e-08, 0.0000e+00, ..., -4.6566e-09, 1.8626e-08, 4.1910e-09]], device='cuda:0') Epoch 257, bias, value: tensor([-0.0196, -0.0369, -0.0132, -0.0214, -0.0309, 0.0045, 0.0262, -0.0125, 0.0426, -0.0016], device='cuda:0'), grad: tensor([ 1.5274e-07, -1.1502e-07, -1.2666e-05, -1.9837e-07, 1.0803e-07, 8.1491e-08, 3.3528e-08, 1.2361e-05, 2.8545e-07, -3.5856e-08], device='cuda:0') 100 0.0001 changing lr epoch 256, time 250.44, cls_loss 0.0023 cls_loss_mapping 0.0026 cls_loss_causal 0.4881 re_mapping 0.0048 re_causal 0.0128 /// teacc 99.00 lr 0.00010000 Epoch 258, weight, value: tensor([[-0.1118, -0.2176, -0.0721, ..., -0.0641, 0.1688, 0.1678], [-0.2123, -0.1786, -0.0903, ..., -0.1600, -0.2128, -0.1220], [-0.0728, -0.1429, 0.1333, ..., -0.1971, 0.2310, 0.0811], ..., [-0.1431, 0.0665, 0.0409, ..., 0.1815, -0.2180, -0.2188], [-0.2495, 0.0670, -0.1432, ..., 0.0586, -0.0928, -0.1583], [-0.0030, -0.1176, -0.0954, ..., -0.1021, -0.0564, -0.1902]], device='cuda:0'), grad: tensor([[-3.6787e-08, 4.6566e-10, 0.0000e+00, ..., 2.3283e-08, -5.2713e-07, -2.0815e-07], [ 2.2352e-08, 1.8626e-09, 0.0000e+00, ..., 2.3749e-08, 6.0070e-08, 2.3283e-09], [ 6.0536e-09, 0.0000e+00, 0.0000e+00, ..., 5.1223e-09, -4.5635e-07, 2.4214e-08], ..., [ 5.8673e-08, -9.2667e-08, 0.0000e+00, ..., -2.7940e-08, 4.5355e-07, 2.3283e-09], [ 2.8405e-08, 1.2573e-08, 0.0000e+00, ..., 2.0955e-08, 4.5169e-08, 1.3970e-08], [ 5.4017e-08, 7.4971e-08, 0.0000e+00, ..., 1.2713e-07, 1.6065e-07, 5.6345e-08]], device='cuda:0') Epoch 258, bias, value: tensor([-0.0194, -0.0358, -0.0133, -0.0216, -0.0303, 0.0047, 0.0262, -0.0136, 0.0425, -0.0017], device='cuda:0'), grad: tensor([-6.5332e-07, -1.4994e-07, -1.2508e-06, 2.2491e-07, 5.6904e-07, 7.1712e-08, -3.2736e-07, 1.4473e-06, 3.7253e-08, 4.2375e-08], device='cuda:0') 100 0.0001 changing lr epoch 257, time 250.58, cls_loss 0.0029 cls_loss_mapping 0.0028 cls_loss_causal 0.5121 re_mapping 0.0050 re_causal 0.0130 /// teacc 99.04 lr 0.00010000 Epoch 259, weight, value: tensor([[-0.1119, -0.2187, -0.0721, ..., -0.0641, 0.1690, 0.1679], [-0.2123, -0.1818, -0.0903, ..., -0.1604, -0.2159, -0.1221], [-0.0730, -0.1439, 0.1334, ..., -0.1978, 0.2312, 0.0811], ..., [-0.1441, 0.0685, 0.0409, ..., 0.1817, -0.2152, -0.2189], [-0.2501, 0.0669, -0.1432, ..., 0.0585, -0.0933, -0.1587], [-0.0034, -0.1177, -0.0954, ..., -0.1022, -0.0565, -0.1911]], device='cuda:0'), grad: tensor([[ 2.8405e-08, 4.6566e-10, 0.0000e+00, ..., 4.9360e-08, 1.7555e-07, 1.1967e-07], [ 0.0000e+00, 4.1910e-09, 0.0000e+00, ..., 5.5879e-09, 2.3283e-09, 1.3970e-09], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., -2.7707e-07, -8.4704e-07, -6.0815e-07], ..., [ 0.0000e+00, -1.8161e-08, 0.0000e+00, ..., 2.7940e-09, 7.4971e-08, 5.3551e-08], [ 2.7940e-08, 2.3283e-09, 0.0000e+00, ..., 1.7975e-07, 5.1409e-07, 3.6741e-07], [ 1.3039e-08, 7.4506e-09, 0.0000e+00, ..., 2.0489e-08, 3.7253e-09, 2.3283e-09]], device='cuda:0') Epoch 259, bias, value: tensor([-0.0193, -0.0387, -0.0141, -0.0219, -0.0300, 0.0052, 0.0261, -0.0105, 0.0424, -0.0020], device='cuda:0'), grad: tensor([ 5.6485e-07, 1.7276e-07, -2.7586e-06, 4.2515e-07, -2.7027e-06, -8.5682e-08, -1.1548e-07, 2.2491e-07, 1.7053e-06, 2.5705e-06], device='cuda:0') 100 0.0001 changing lr epoch 258, time 250.64, cls_loss 0.0027 cls_loss_mapping 0.0030 cls_loss_causal 0.4839 re_mapping 0.0050 re_causal 0.0130 /// teacc 99.04 lr 0.00010000 Epoch 260, weight, value: tensor([[-0.1121, -0.2219, -0.0725, ..., -0.0642, 0.1694, 0.1684], [-0.2125, -0.1819, -0.0903, ..., -0.1608, -0.2159, -0.1221], [-0.0740, -0.1463, 0.1334, ..., -0.1998, 0.2307, 0.0811], ..., [-0.1445, 0.0687, 0.0409, ..., 0.1821, -0.2153, -0.2191], [-0.2498, 0.0668, -0.1432, ..., 0.0586, -0.0925, -0.1590], [-0.0035, -0.1179, -0.0954, ..., -0.1023, -0.0566, -0.1915]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 9.3132e-10, 0.0000e+00, ..., 4.1910e-09, -4.6566e-10, -7.9162e-09], [ 9.3132e-10, 1.4435e-08, 0.0000e+00, ..., 2.2817e-08, 1.1176e-08, 2.3283e-09], [ 2.7940e-09, 1.3970e-09, 0.0000e+00, ..., 9.3132e-10, -1.6868e-05, -3.0808e-06], ..., [ 1.3970e-09, -1.1036e-07, 0.0000e+00, ..., -1.2200e-07, 1.6809e-05, 3.0715e-06], [ 6.3330e-08, 4.6566e-09, 0.0000e+00, ..., 3.0268e-08, 2.0955e-08, 6.9849e-09], [ 1.8626e-09, 6.4261e-08, 0.0000e+00, ..., 4.3772e-08, 1.0710e-08, 7.4506e-09]], device='cuda:0') Epoch 260, bias, value: tensor([-0.0190, -0.0386, -0.0148, -0.0223, -0.0304, 0.0054, 0.0254, -0.0105, 0.0428, -0.0020], device='cuda:0'), grad: tensor([ 3.9116e-08, 1.5926e-07, -7.0214e-05, 1.0524e-07, 4.0932e-07, -4.0373e-07, 1.9697e-07, 6.9737e-05, 1.8254e-07, -3.4599e-07], device='cuda:0') 100 0.0001 changing lr epoch 259, time 250.55, cls_loss 0.0023 cls_loss_mapping 0.0016 cls_loss_causal 0.5076 re_mapping 0.0049 re_causal 0.0129 /// teacc 98.96 lr 0.00010000 Epoch 261, weight, value: tensor([[-0.1124, -0.2233, -0.0733, ..., -0.0643, 0.1695, 0.1686], [-0.2125, -0.1820, -0.0903, ..., -0.1610, -0.2159, -0.1222], [-0.0740, -0.1475, 0.1334, ..., -0.2003, 0.2313, 0.0807], ..., [-0.1447, 0.0689, 0.0409, ..., 0.1823, -0.2155, -0.2202], [-0.2512, 0.0667, -0.1436, ..., 0.0585, -0.0933, -0.1596], [-0.0036, -0.1180, -0.0954, ..., -0.1024, -0.0567, -0.1934]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 4.1910e-09, 0.0000e+00, ..., 1.8626e-09, -1.8626e-08, -3.3528e-08], [ 9.3132e-10, 1.8626e-09, 0.0000e+00, ..., 9.3132e-10, 1.0245e-08, 4.6566e-10], [ 9.3132e-10, -6.5193e-09, 0.0000e+00, ..., -9.7789e-09, -4.9500e-07, -5.8208e-08], ..., [ 1.3970e-09, 2.3283e-09, 0.0000e+00, ..., 8.3819e-09, 2.5379e-07, 1.9092e-08], [ 5.5879e-09, -4.0513e-08, 0.0000e+00, ..., -3.2596e-08, 9.6392e-08, 1.7695e-08], [ 2.7940e-09, 2.6543e-08, 0.0000e+00, ..., 1.0245e-08, 1.1083e-07, 4.8429e-08]], device='cuda:0') Epoch 261, bias, value: tensor([-0.0191, -0.0386, -0.0147, -0.0230, -0.0307, 0.0065, 0.0251, -0.0105, 0.0424, -0.0019], device='cuda:0'), grad: tensor([ 1.2154e-07, 4.3772e-08, -1.5050e-06, -8.4750e-08, -6.9514e-06, 5.4203e-07, 5.9139e-08, 8.3726e-07, 2.3283e-08, 6.9514e-06], device='cuda:0') 100 0.0001 changing lr epoch 260, time 250.62, cls_loss 0.0023 cls_loss_mapping 0.0024 cls_loss_causal 0.4879 re_mapping 0.0051 re_causal 0.0132 /// teacc 99.01 lr 0.00010000 Epoch 262, weight, value: tensor([[-0.1125, -0.2249, -0.0733, ..., -0.0644, 0.1699, 0.1689], [-0.2126, -0.1820, -0.0903, ..., -0.1611, -0.2159, -0.1223], [-0.0741, -0.1479, 0.1334, ..., -0.2008, 0.2314, 0.0804], ..., [-0.1459, 0.0686, 0.0409, ..., 0.1821, -0.2156, -0.2206], [-0.2519, 0.0669, -0.1436, ..., 0.0586, -0.0939, -0.1601], [-0.0030, -0.1175, -0.0954, ..., -0.1020, -0.0570, -0.1956]], device='cuda:0'), grad: tensor([[-1.2115e-05, -9.0599e-06, 0.0000e+00, ..., 9.3132e-10, -4.5866e-05, -1.9342e-05], [ 5.0757e-08, 1.0664e-07, 0.0000e+00, ..., 1.8766e-07, 2.0955e-08, 1.2573e-08], [ 3.1991e-07, 2.4028e-07, 0.0000e+00, ..., 1.8626e-09, 1.1949e-06, 5.0385e-07], ..., [ 7.8231e-08, 9.1735e-08, 0.0000e+00, ..., -2.4633e-07, 2.3283e-08, 1.1642e-08], [ 1.8021e-06, 1.3513e-06, 0.0000e+00, ..., 9.7789e-09, 6.7726e-06, 2.8573e-06], [ 3.4105e-06, 2.7269e-06, 0.0000e+00, ..., 4.2375e-08, 1.0811e-05, 4.5560e-06]], device='cuda:0') Epoch 262, bias, value: tensor([-0.0188, -0.0386, -0.0150, -0.0223, -0.0292, 0.0057, 0.0251, -0.0107, 0.0428, -0.0023], device='cuda:0'), grad: tensor([-6.0827e-05, 7.1852e-07, 1.6019e-06, -1.0826e-05, 2.0955e-07, 1.1206e-05, 3.3349e-05, -1.9697e-07, 9.2462e-06, 1.5602e-05], device='cuda:0') 100 0.0001 changing lr epoch 261, time 250.56, cls_loss 0.0022 cls_loss_mapping 0.0019 cls_loss_causal 0.4995 re_mapping 0.0051 re_causal 0.0136 /// teacc 99.04 lr 0.00010000 Epoch 263, weight, value: tensor([[-0.1126, -0.2220, -0.0733, ..., -0.0647, 0.1701, 0.1690], [-0.2128, -0.1821, -0.0903, ..., -0.1619, -0.2159, -0.1223], [-0.0734, -0.1481, 0.1334, ..., -0.2008, 0.2320, 0.0806], ..., [-0.1466, 0.0687, 0.0409, ..., 0.1824, -0.2158, -0.2211], [-0.2531, 0.0668, -0.1436, ..., 0.0585, -0.0947, -0.1610], [-0.0050, -0.1176, -0.0954, ..., -0.1021, -0.0573, -0.1971]], device='cuda:0'), grad: tensor([[ 4.4703e-08, 5.5879e-09, 0.0000e+00, ..., 4.7497e-08, 4.2841e-08, 4.1910e-08], [ 1.2573e-08, 1.8626e-09, 0.0000e+00, ..., 1.8626e-09, 4.1910e-09, 5.5879e-09], [ 2.3283e-09, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 2.3283e-09, 1.3970e-09], ..., [ 3.7253e-09, -1.7323e-07, 0.0000e+00, ..., -1.5805e-06, 3.2596e-09, 0.0000e+00], [ 2.1793e-07, 1.8626e-09, 0.0000e+00, ..., 1.3970e-09, 7.8836e-07, 6.1002e-07], [ 5.5879e-09, 4.6566e-08, 0.0000e+00, ..., 3.3388e-07, 1.8626e-09, 1.3970e-09]], device='cuda:0') Epoch 263, bias, value: tensor([-0.0188, -0.0387, -0.0146, -0.0221, -0.0290, 0.0058, 0.0251, -0.0106, 0.0427, -0.0021], device='cuda:0'), grad: tensor([ 4.7497e-07, 9.7323e-08, 2.4214e-08, -7.4506e-08, 3.4645e-07, 2.2836e-06, -7.3276e-06, -2.2035e-06, 6.7614e-06, -3.8231e-07], device='cuda:0') 100 0.0001 changing lr epoch 262, time 250.55, cls_loss 0.0020 cls_loss_mapping 0.0020 cls_loss_causal 0.4693 re_mapping 0.0048 re_causal 0.0131 /// teacc 98.96 lr 0.00010000 Epoch 264, weight, value: tensor([[-0.1130, -0.2240, -0.0733, ..., -0.0648, 0.1700, 0.1690], [-0.2128, -0.1821, -0.0903, ..., -0.1618, -0.2159, -0.1223], [-0.0735, -0.1484, 0.1334, ..., -0.2013, 0.2322, 0.0808], ..., [-0.1468, 0.0687, 0.0409, ..., 0.1822, -0.2159, -0.2213], [-0.2542, 0.0671, -0.1436, ..., 0.0586, -0.0950, -0.1614], [-0.0050, -0.1177, -0.0954, ..., -0.1022, -0.0573, -0.1974]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 1.8626e-09, 0.0000e+00, ..., 5.5879e-09, -1.6801e-06, -8.1863e-07], [ 2.1420e-08, 2.9802e-08, 0.0000e+00, ..., 3.9395e-07, 4.6566e-09, 1.8626e-09], [ 5.5879e-09, 8.3819e-09, 0.0000e+00, ..., 4.4703e-08, -1.3495e-06, -1.3970e-08], ..., [ 3.4459e-08, 3.3528e-08, 0.0000e+00, ..., -4.4703e-08, 1.3225e-06, 2.7940e-09], [ 1.9558e-08, 2.7008e-08, 0.0000e+00, ..., -2.2557e-06, 9.0338e-08, 4.2841e-08], [ 3.5390e-08, 5.5879e-08, 0.0000e+00, ..., 4.7311e-07, 1.3877e-06, 6.7987e-07]], device='cuda:0') Epoch 264, bias, value: tensor([-0.0189, -0.0386, -0.0147, -0.0218, -0.0305, 0.0060, 0.0251, -0.0107, 0.0427, -0.0015], device='cuda:0'), grad: tensor([-2.8629e-06, 7.0371e-06, -3.5353e-06, 8.6054e-06, 5.8766e-07, 2.5347e-05, 2.3320e-06, 4.5523e-06, -5.2512e-05, 1.0416e-05], device='cuda:0') 100 0.0001 changing lr epoch 263, time 250.15, cls_loss 0.0024 cls_loss_mapping 0.0020 cls_loss_causal 0.4943 re_mapping 0.0048 re_causal 0.0127 /// teacc 98.91 lr 0.00010000 Epoch 265, weight, value: tensor([[-0.1132, -0.2247, -0.0734, ..., -0.0654, 0.1703, 0.1691], [-0.2130, -0.1822, -0.0902, ..., -0.1619, -0.2162, -0.1228], [-0.0737, -0.1490, 0.1334, ..., -0.2019, 0.2323, 0.0807], ..., [-0.1470, 0.0688, 0.0409, ..., 0.1826, -0.2159, -0.2215], [-0.2571, 0.0670, -0.1437, ..., 0.0575, -0.0980, -0.1619], [-0.0054, -0.1178, -0.0954, ..., -0.1026, -0.0574, -0.1981]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, -3.8184e-08, -2.1420e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 6.6124e-08, 1.8626e-09, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 8.3819e-09, 4.6566e-09], ..., [ 1.8626e-09, -0.0000e+00, 0.0000e+00, ..., 9.4064e-08, 9.3132e-10, 0.0000e+00], [ 1.7695e-08, 1.8626e-09, 0.0000e+00, ..., -1.7881e-07, 6.5193e-09, 7.4506e-09], [ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 2.7940e-09, 6.5193e-09, 3.7253e-09]], device='cuda:0') Epoch 265, bias, value: tensor([-0.0188, -0.0386, -0.0148, -0.0218, -0.0309, 0.0070, 0.0255, -0.0107, 0.0412, -0.0016], device='cuda:0'), grad: tensor([-7.1712e-08, 1.1083e-07, 2.6077e-08, 4.4424e-07, 4.6566e-09, -4.3027e-07, 2.4214e-08, 2.2352e-07, -3.2689e-07, -1.2107e-08], device='cuda:0') 100 0.0001 changing lr epoch 264, time 250.62, cls_loss 0.0026 cls_loss_mapping 0.0028 cls_loss_causal 0.4877 re_mapping 0.0049 re_causal 0.0127 /// teacc 99.06 lr 0.00010000 Epoch 266, weight, value: tensor([[-0.1133, -0.2252, -0.0734, ..., -0.0656, 0.1706, 0.1693], [-0.2132, -0.1822, -0.0902, ..., -0.1605, -0.2164, -0.1229], [-0.0738, -0.1493, 0.1334, ..., -0.2011, 0.2334, 0.0811], ..., [-0.1472, 0.0689, 0.0409, ..., 0.1818, -0.2163, -0.2230], [-0.2575, 0.0669, -0.1437, ..., 0.0576, -0.0984, -0.1619], [-0.0055, -0.1177, -0.0954, ..., -0.1022, -0.0575, -0.1990]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 1.6764e-08, 7.4506e-09], [ 2.5146e-08, 1.3970e-08, 0.0000e+00, ..., 6.5193e-09, 1.3039e-08, 6.5193e-09], [ 1.8626e-09, 1.8626e-09, 0.0000e+00, ..., 5.3085e-08, -6.7055e-08, -3.2596e-08], ..., [ 9.3132e-09, -2.0489e-08, 0.0000e+00, ..., -2.5146e-08, 1.0245e-08, 4.6566e-09], [ 8.3819e-08, 4.0978e-08, 0.0000e+00, ..., -7.1712e-08, 2.1420e-08, 1.0245e-08], [ 3.2596e-08, 3.2596e-08, 0.0000e+00, ..., 2.1420e-08, 2.7940e-09, 1.8626e-09]], device='cuda:0') Epoch 266, bias, value: tensor([-0.0186, -0.0381, -0.0143, -0.0222, -0.0304, 0.0073, 0.0255, -0.0111, 0.0411, -0.0021], device='cuda:0'), grad: tensor([ 9.7789e-08, 5.3085e-08, 1.8347e-07, -3.6359e-05, -4.1574e-06, 3.6180e-05, 1.2107e-08, 1.1828e-07, -2.6170e-07, 4.1537e-06], device='cuda:0') 100 0.0001 changing lr epoch 265, time 250.71, cls_loss 0.0028 cls_loss_mapping 0.0034 cls_loss_causal 0.4800 re_mapping 0.0052 re_causal 0.0134 /// teacc 99.01 lr 0.00010000 Epoch 267, weight, value: tensor([[-0.1135, -0.2265, -0.0740, ..., -0.0658, 0.1708, 0.1695], [-0.2148, -0.1824, -0.0901, ..., -0.1611, -0.2170, -0.1240], [-0.0710, -0.1500, 0.1340, ..., -0.2002, 0.2356, 0.0821], ..., [-0.1474, 0.0692, 0.0408, ..., 0.1815, -0.2167, -0.2249], [-0.2586, 0.0670, -0.1440, ..., 0.0578, -0.0996, -0.1625], [-0.0058, -0.1178, -0.0954, ..., -0.1012, -0.0575, -0.1995]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.7695e-08, -9.3132e-09], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-09, 1.8626e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., -1.8626e-09, 1.8626e-09, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 7.4506e-09, 3.7253e-09]], device='cuda:0') Epoch 267, bias, value: tensor([-0.0186, -0.0383, -0.0119, -0.0225, -0.0306, 0.0076, 0.0249, -0.0116, 0.0411, -0.0009], device='cuda:0'), grad: tensor([ 9.1791e-06, -2.7463e-05, 2.8033e-07, -2.5146e-07, 9.3132e-10, 2.3916e-06, 2.7046e-06, 8.1211e-07, 1.2577e-05, -2.3935e-07], device='cuda:0') 100 0.0001 changing lr epoch 266, time 250.48, cls_loss 0.0023 cls_loss_mapping 0.0020 cls_loss_causal 0.5130 re_mapping 0.0048 re_causal 0.0132 /// teacc 99.04 lr 0.00010000 Epoch 268, weight, value: tensor([[-0.1135, -0.2267, -0.0741, ..., -0.0658, 0.1714, 0.1699], [-0.2150, -0.1827, -0.0901, ..., -0.1618, -0.2172, -0.1242], [-0.0710, -0.1507, 0.1355, ..., -0.2002, 0.2369, 0.0827], ..., [-0.1475, 0.0699, 0.0408, ..., 0.1820, -0.2171, -0.2268], [-0.2590, 0.0669, -0.1440, ..., 0.0577, -0.0997, -0.1630], [-0.0059, -0.1180, -0.0954, ..., -0.1014, -0.0576, -0.2002]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.7695e-08, 0.0000e+00, ..., 2.7940e-08, -2.0675e-07, -8.1025e-08], [ 9.3132e-10, 1.8626e-07, 0.0000e+00, ..., 3.2037e-07, 6.5193e-09, 2.7940e-09], [ 0.0000e+00, 4.5635e-08, 0.0000e+00, ..., 5.6997e-07, 1.3970e-07, 5.3085e-08], ..., [ 9.3132e-10, -5.9046e-07, 0.0000e+00, ..., -8.5123e-07, 5.5879e-09, 1.8626e-09], [ 3.7253e-09, 3.9116e-08, 0.0000e+00, ..., -3.2838e-06, 6.5193e-09, 2.7940e-09], [ 4.6566e-09, 1.9930e-07, 0.0000e+00, ..., 3.0454e-07, 3.5390e-08, 1.4901e-08]], device='cuda:0') Epoch 268, bias, value: tensor([-0.0183, -0.0383, -0.0118, -0.0223, -0.0305, 0.0072, 0.0245, -0.0116, 0.0412, -0.0005], device='cuda:0'), grad: tensor([-3.1758e-07, 9.8720e-07, 1.4640e-06, 3.1292e-07, 1.4901e-08, 5.2378e-06, 5.2154e-08, -2.5816e-06, -6.2101e-06, 1.0263e-06], device='cuda:0') 100 0.0001 changing lr epoch 267, time 250.28, cls_loss 0.0021 cls_loss_mapping 0.0024 cls_loss_causal 0.5097 re_mapping 0.0047 re_causal 0.0133 /// teacc 99.09 lr 0.00010000 Epoch 269, weight, value: tensor([[-0.1137, -0.2269, -0.0741, ..., -0.0658, 0.1714, 0.1699], [-0.2150, -0.1829, -0.0901, ..., -0.1626, -0.2173, -0.1243], [-0.0710, -0.1511, 0.1356, ..., -0.2016, 0.2370, 0.0827], ..., [-0.1475, 0.0703, 0.0408, ..., 0.1827, -0.2172, -0.2270], [-0.2593, 0.0668, -0.1440, ..., 0.0577, -0.0999, -0.1630], [-0.0063, -0.1183, -0.0954, ..., -0.1018, -0.0577, -0.2007]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.7940e-09, -2.7940e-09], [ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, -2.2352e-08, 0.0000e+00], [ 9.3132e-10, 1.8626e-09, 0.0000e+00, ..., -0.0000e+00, 6.5193e-09, 0.0000e+00], ..., [ 0.0000e+00, -0.0000e+00, 0.0000e+00, ..., -1.8626e-09, 1.2107e-08, 0.0000e+00], [ 1.7695e-08, 9.3132e-10, 0.0000e+00, ..., 8.3819e-09, 3.7253e-09, 0.0000e+00], [ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 3.7253e-09, 7.4506e-09, 1.8626e-09]], device='cuda:0') Epoch 269, bias, value: tensor([-0.0185, -0.0384, -0.0121, -0.0222, -0.0304, 0.0068, 0.0245, -0.0115, 0.0417, -0.0007], device='cuda:0'), grad: tensor([ 8.1025e-08, -2.4680e-07, 6.2399e-08, 1.1642e-07, 7.4506e-08, -8.6613e-08, 6.0070e-07, 8.5402e-07, 3.4459e-08, -1.4892e-06], device='cuda:0') 100 0.0001 changing lr epoch 268, time 250.35, cls_loss 0.0029 cls_loss_mapping 0.0027 cls_loss_causal 0.5217 re_mapping 0.0048 re_causal 0.0127 /// teacc 99.04 lr 0.00010000 Epoch 270, weight, value: tensor([[-0.1127, -0.2279, -0.0741, ..., -0.0661, 0.1740, 0.1715], [-0.2151, -0.1829, -0.0901, ..., -0.1626, -0.2173, -0.1244], [-0.0711, -0.1517, 0.1357, ..., -0.2018, 0.2372, 0.0827], ..., [-0.1477, 0.0699, 0.0408, ..., 0.1830, -0.2172, -0.2275], [-0.2602, 0.0670, -0.1440, ..., 0.0577, -0.1007, -0.1638], [-0.0061, -0.1187, -0.0954, ..., -0.1019, -0.0595, -0.2045]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 2.7940e-09, 0.0000e+00, ..., 1.0245e-08, -3.7253e-08, -3.0734e-08], [ 1.1176e-08, 8.1956e-08, 0.0000e+00, ..., 1.8906e-07, 1.7695e-08, 0.0000e+00], [ 9.3132e-10, 3.5390e-08, 0.0000e+00, ..., 4.5076e-07, 1.8626e-09, 9.3132e-10], ..., [ 1.8626e-09, -3.0156e-06, 0.0000e+00, ..., -3.2149e-06, 1.8626e-09, 0.0000e+00], [ 8.9407e-08, 2.4214e-08, 0.0000e+00, ..., 9.2201e-08, 9.3132e-10, 9.3132e-10], [ 5.5879e-09, 2.0117e-07, 0.0000e+00, ..., 7.1619e-07, 4.0978e-08, 2.4214e-08]], device='cuda:0') Epoch 270, bias, value: tensor([-0.0154, -0.0384, -0.0129, -0.0216, -0.0306, 0.0067, 0.0231, -0.0114, 0.0417, -0.0020], device='cuda:0'), grad: tensor([-1.8626e-09, 5.5134e-07, 1.0002e-06, 5.9232e-06, -6.5006e-07, -4.7684e-07, 8.1863e-07, -9.0897e-06, 2.4959e-07, 1.6792e-06], device='cuda:0') 100 0.0001 changing lr epoch 269, time 250.44, cls_loss 0.0021 cls_loss_mapping 0.0027 cls_loss_causal 0.5090 re_mapping 0.0051 re_causal 0.0130 /// teacc 99.09 lr 0.00010000 Epoch 271, weight, value: tensor([[-0.1127, -0.2283, -0.0741, ..., -0.0662, 0.1742, 0.1717], [-0.2151, -0.1832, -0.0901, ..., -0.1631, -0.2173, -0.1246], [-0.0711, -0.1548, 0.1357, ..., -0.2047, 0.2375, 0.0830], ..., [-0.1477, 0.0710, 0.0408, ..., 0.1840, -0.2173, -0.2277], [-0.2603, 0.0668, -0.1440, ..., 0.0576, -0.1010, -0.1641], [-0.0065, -0.1191, -0.0954, ..., -0.1023, -0.0596, -0.2047]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 7.4506e-09, 0.0000e+00, ..., 5.5879e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.6822e-07, 0.0000e+00, ..., 2.4121e-07, 6.5193e-09, 0.0000e+00], ..., [ 0.0000e+00, -2.5984e-07, 0.0000e+00, ..., -2.5146e-07, -7.4506e-09, 0.0000e+00], [ 0.0000e+00, 2.5146e-08, 0.0000e+00, ..., 6.5193e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -4.9360e-08, 0.0000e+00, ..., -7.4506e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 271, bias, value: tensor([-0.0153, -0.0384, -0.0133, -0.0219, -0.0302, 0.0070, 0.0224, -0.0113, 0.0414, -0.0023], device='cuda:0'), grad: tensor([ 1.0245e-08, 4.9360e-08, 9.3598e-07, 1.8626e-08, -7.1991e-07, 8.3819e-09, 1.0245e-08, -6.0443e-07, 1.8999e-07, 1.0245e-07], device='cuda:0') 100 0.0001 changing lr epoch 270, time 250.16, cls_loss 0.0018 cls_loss_mapping 0.0020 cls_loss_causal 0.5004 re_mapping 0.0049 re_causal 0.0132 /// teacc 99.05 lr 0.00010000 Epoch 272, weight, value: tensor([[-0.1129, -0.2283, -0.0741, ..., -0.0663, 0.1743, 0.1718], [-0.2151, -0.1834, -0.0901, ..., -0.1632, -0.2174, -0.1244], [-0.0711, -0.1551, 0.1357, ..., -0.2049, 0.2377, 0.0829], ..., [-0.1488, 0.0712, 0.0408, ..., 0.1841, -0.2173, -0.2283], [-0.2608, 0.0667, -0.1440, ..., 0.0576, -0.1013, -0.1648], [-0.0067, -0.1192, -0.0954, ..., -0.1025, -0.0596, -0.2049]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -2.4214e-08, -1.7695e-08], [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 3.7253e-09, 7.4506e-09, 0.0000e+00], [ 0.0000e+00, 1.2107e-08, 0.0000e+00, ..., 1.1176e-08, -1.5926e-07, 0.0000e+00], ..., [ 0.0000e+00, -3.4459e-08, 0.0000e+00, ..., -3.4459e-08, 9.4064e-08, 0.0000e+00], [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 3.7253e-09, 9.3132e-09, 2.7940e-09], [ 9.3132e-10, 5.5879e-09, 0.0000e+00, ..., 6.5193e-09, 1.2107e-08, 6.5193e-09]], device='cuda:0') Epoch 272, bias, value: tensor([-0.0152, -0.0382, -0.0137, -0.0220, -0.0303, 0.0068, 0.0228, -0.0113, 0.0410, -0.0024], device='cuda:0'), grad: tensor([ 3.7253e-08, -2.3283e-07, -2.0340e-06, 3.5390e-08, 6.7614e-07, 2.4214e-08, 4.8429e-08, 1.2834e-06, 6.9849e-08, 9.1270e-08], device='cuda:0') 100 0.0001 changing lr epoch 271, time 250.59, cls_loss 0.0020 cls_loss_mapping 0.0020 cls_loss_causal 0.4910 re_mapping 0.0047 re_causal 0.0129 /// teacc 99.07 lr 0.00010000 Epoch 273, weight, value: tensor([[-0.1152, -0.2285, -0.0743, ..., -0.0663, 0.1737, 0.1713], [-0.2152, -0.1834, -0.0902, ..., -0.1634, -0.2173, -0.1243], [-0.0712, -0.1560, 0.1358, ..., -0.2055, 0.2378, 0.0828], ..., [-0.1491, 0.0715, 0.0408, ..., 0.1845, -0.2174, -0.2285], [-0.2613, 0.0666, -0.1441, ..., 0.0576, -0.1015, -0.1651], [-0.0071, -0.1193, -0.0954, ..., -0.1025, -0.0596, -0.2050]], device='cuda:0'), grad: tensor([[ 1.2266e-06, 9.3132e-09, 0.0000e+00, ..., 2.7940e-09, 8.0746e-07, 7.2364e-07], [ 1.8626e-09, 8.3819e-08, 0.0000e+00, ..., 2.9802e-08, 1.8626e-09, 1.8626e-09], [ 1.8626e-09, 3.7439e-07, 0.0000e+00, ..., 1.2293e-07, -0.0000e+00, 9.3132e-10], ..., [ 9.3132e-10, -5.7928e-07, 0.0000e+00, ..., -2.0768e-07, 9.3132e-10, 0.0000e+00], [ 4.6566e-09, 1.3970e-08, 0.0000e+00, ..., -5.0291e-08, 3.5390e-08, 2.7940e-08], [ 2.4214e-08, 2.8871e-08, 0.0000e+00, ..., 2.7008e-08, 4.7497e-08, 3.8184e-08]], device='cuda:0') Epoch 273, bias, value: tensor([-0.0158, -0.0381, -0.0141, -0.0221, -0.0301, 0.0069, 0.0238, -0.0113, 0.0410, -0.0026], device='cuda:0'), grad: tensor([ 2.2650e-06, 3.8557e-07, 1.2014e-06, 2.1234e-07, -3.8072e-06, 3.0547e-07, -2.3339e-06, -1.6000e-06, 1.0151e-07, 3.2671e-06], device='cuda:0') 100 0.0001 changing lr epoch 272, time 250.35, cls_loss 0.0019 cls_loss_mapping 0.0017 cls_loss_causal 0.5351 re_mapping 0.0047 re_causal 0.0129 /// teacc 99.03 lr 0.00010000 Epoch 274, weight, value: tensor([[-0.1152, -0.2284, -0.0745, ..., -0.0664, 0.1737, 0.1713], [-0.2152, -0.1835, -0.0901, ..., -0.1635, -0.2173, -0.1245], [-0.0712, -0.1563, 0.1361, ..., -0.2058, 0.2382, 0.0838], ..., [-0.1498, 0.0719, 0.0408, ..., 0.1849, -0.2175, -0.2295], [-0.2618, 0.0665, -0.1442, ..., 0.0574, -0.1023, -0.1681], [-0.0083, -0.1197, -0.0954, ..., -0.1028, -0.0597, -0.2051]], device='cuda:0'), grad: tensor([[ 2.8778e-06, 5.5879e-09, 0.0000e+00, ..., 9.3132e-10, 1.0198e-06, 1.6335e-06], [ 7.4506e-09, 4.9360e-08, 0.0000e+00, ..., 4.9360e-08, 9.3132e-10, 1.8626e-09], [ 6.5193e-09, 7.4506e-09, 0.0000e+00, ..., 2.7940e-09, -9.3132e-10, 1.8626e-09], ..., [ 8.3819e-09, -1.2852e-07, 0.0000e+00, ..., -1.4994e-07, 2.7940e-09, 0.0000e+00], [ 1.4529e-07, 1.3039e-08, 0.0000e+00, ..., 4.6566e-09, 5.0291e-08, 7.9162e-08], [ 6.4261e-08, 1.3504e-07, 0.0000e+00, ..., 7.7300e-08, 5.5879e-09, 5.5879e-09]], device='cuda:0') Epoch 274, bias, value: tensor([-0.0158, -0.0380, -0.0141, -0.0222, -0.0297, 0.0068, 0.0242, -0.0113, 0.0405, -0.0028], device='cuda:0'), grad: tensor([ 9.4920e-06, 6.5751e-07, 2.7008e-08, -5.6159e-07, -6.3516e-07, 7.1302e-06, -1.6823e-05, -1.1921e-07, 5.0571e-07, 2.9150e-07], device='cuda:0') 100 0.0001 changing lr epoch 273, time 250.29, cls_loss 0.0032 cls_loss_mapping 0.0023 cls_loss_causal 0.4844 re_mapping 0.0048 re_causal 0.0117 /// teacc 99.00 lr 0.00010000 Epoch 275, weight, value: tensor([[-0.1153, -0.2287, -0.0748, ..., -0.0664, 0.1744, 0.1721], [-0.2153, -0.1836, -0.0898, ..., -0.1637, -0.2175, -0.1247], [-0.0713, -0.1565, 0.1360, ..., -0.2061, 0.2388, 0.0845], ..., [-0.1505, 0.0719, 0.0408, ..., 0.1854, -0.2177, -0.2318], [-0.2621, 0.0664, -0.1445, ..., 0.0575, -0.1026, -0.1689], [-0.0095, -0.1207, -0.0955, ..., -0.1033, -0.0597, -0.2053]], device='cuda:0'), grad: tensor([[ 4.6566e-09, 0.0000e+00, 0.0000e+00, ..., 5.5879e-09, -0.0000e+00, 9.3132e-10], [ 9.3132e-10, 3.7253e-09, 0.0000e+00, ..., 4.6566e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.5832e-08, 0.0000e+00, 0.0000e+00], ..., [ 9.3132e-10, -7.0781e-08, 0.0000e+00, ..., -9.0338e-08, 0.0000e+00, 0.0000e+00], [ 7.4506e-09, 9.3132e-10, 0.0000e+00, ..., -3.3528e-08, 0.0000e+00, 0.0000e+00], [ 2.9802e-08, 6.5193e-08, 0.0000e+00, ..., 1.2014e-07, 9.3132e-10, 9.3132e-10]], device='cuda:0') Epoch 275, bias, value: tensor([-0.0153, -0.0379, -0.0140, -0.0218, -0.0300, 0.0065, 0.0236, -0.0114, 0.0404, -0.0027], device='cuda:0'), grad: tensor([ 3.6322e-08, 7.4506e-09, 1.1176e-07, 3.9116e-07, 3.8184e-08, -4.3586e-07, -2.5146e-08, -2.3656e-07, -2.2259e-07, 3.4645e-07], device='cuda:0') 100 0.0001 changing lr epoch 274, time 250.49, cls_loss 0.0019 cls_loss_mapping 0.0017 cls_loss_causal 0.4741 re_mapping 0.0049 re_causal 0.0124 /// teacc 98.88 lr 0.00010000 Epoch 276, weight, value: tensor([[-0.1157, -0.2296, -0.0768, ..., -0.0664, 0.1742, 0.1720], [-0.2153, -0.1839, -0.0899, ..., -0.1638, -0.2176, -0.1248], [-0.0715, -0.1575, 0.1361, ..., -0.2068, 0.2390, 0.0845], ..., [-0.1510, 0.0725, 0.0408, ..., 0.1860, -0.2178, -0.2323], [-0.2627, 0.0663, -0.1445, ..., 0.0574, -0.1028, -0.1694], [-0.0102, -0.1214, -0.0955, ..., -0.1039, -0.0598, -0.2054]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 2.7940e-09, 0.0000e+00, ..., 7.4506e-09, -5.8487e-07, -2.3935e-07], [ 9.3132e-10, 1.9651e-06, 0.0000e+00, ..., 1.3933e-06, 4.6566e-08, 1.8626e-09], [ 0.0000e+00, 2.1420e-08, 0.0000e+00, ..., 4.4703e-08, 6.2399e-08, -1.7695e-08], ..., [ 0.0000e+00, -2.6356e-06, 0.0000e+00, ..., -2.0657e-06, 6.5193e-09, 9.3132e-10], [ 2.4214e-08, 3.1665e-07, 0.0000e+00, ..., 1.9651e-07, -1.4026e-06, 7.4506e-09], [ 0.0000e+00, 1.1548e-07, 0.0000e+00, ..., 2.8312e-07, 9.1083e-07, 2.2072e-07]], device='cuda:0') Epoch 276, bias, value: tensor([-0.0156, -0.0379, -0.0142, -0.0223, -0.0301, 0.0068, 0.0244, -0.0113, 0.0405, -0.0032], device='cuda:0'), grad: tensor([ 5.4017e-08, 4.3400e-06, 1.9316e-06, 7.0781e-07, 1.2256e-06, 1.3085e-06, 1.5810e-05, -7.1861e-06, -2.6003e-05, 7.8082e-06], device='cuda:0') 100 0.0001 changing lr epoch 275, time 250.35, cls_loss 0.0026 cls_loss_mapping 0.0020 cls_loss_causal 0.4862 re_mapping 0.0049 re_causal 0.0125 /// teacc 99.01 lr 0.00010000 Epoch 277, weight, value: tensor([[-0.1164, -0.2323, -0.0774, ..., -0.0665, 0.1742, 0.1720], [-0.2154, -0.1841, -0.0899, ..., -0.1641, -0.2179, -0.1252], [-0.0716, -0.1571, 0.1369, ..., -0.2063, 0.2405, 0.0861], ..., [-0.1518, 0.0727, 0.0407, ..., 0.1863, -0.2182, -0.2355], [-0.2636, 0.0665, -0.1448, ..., 0.0574, -0.1038, -0.1725], [-0.0104, -0.1219, -0.0955, ..., -0.1042, -0.0598, -0.2056]], device='cuda:0'), grad: tensor([[ 9.5088e-07, 2.7940e-09, 0.0000e+00, ..., 3.6675e-06, -5.5879e-09, -4.6566e-09], [ 2.6077e-08, 1.1176e-08, 0.0000e+00, ..., 8.5682e-08, 9.3132e-10, 9.3132e-10], [ 1.3970e-08, 4.6566e-09, 0.0000e+00, ..., 3.8184e-08, -9.0338e-08, -0.0000e+00], ..., [ 4.2468e-07, 2.4214e-08, 0.0000e+00, ..., 1.5972e-06, 1.8626e-09, 0.0000e+00], [ 1.0394e-06, 1.0151e-07, 0.0000e+00, ..., 3.9637e-06, 1.8626e-08, 1.8626e-09], [ 4.4703e-08, 1.0245e-08, 0.0000e+00, ..., 1.6857e-07, 1.8626e-09, 1.8626e-09]], device='cuda:0') Epoch 277, bias, value: tensor([-0.0158, -0.0376, -0.0135, -0.0240, -0.0304, 0.0072, 0.0258, -0.0116, 0.0407, -0.0030], device='cuda:0'), grad: tensor([ 7.6219e-06, 2.1514e-07, -2.3190e-07, 2.2873e-06, 1.0896e-07, -2.4199e-05, 2.3060e-06, 3.4086e-06, 8.5682e-06, -9.8720e-08], device='cuda:0') 100 0.0001 changing lr epoch 276, time 250.32, cls_loss 0.0027 cls_loss_mapping 0.0030 cls_loss_causal 0.4916 re_mapping 0.0051 re_causal 0.0123 /// teacc 99.03 lr 0.00010000 Epoch 278, weight, value: tensor([[-0.1170, -0.2368, -0.0778, ..., -0.0689, 0.1744, 0.1722], [-0.2155, -0.1842, -0.0898, ..., -0.1649, -0.2180, -0.1253], [-0.0716, -0.1573, 0.1371, ..., -0.2066, 0.2412, 0.0870], ..., [-0.1490, 0.0734, 0.0407, ..., 0.1871, -0.2183, -0.2365], [-0.2645, 0.0664, -0.1448, ..., 0.0573, -0.1044, -0.1743], [-0.0108, -0.1224, -0.0955, ..., -0.1045, -0.0599, -0.2060]], device='cuda:0'), grad: tensor([[ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -8.6520e-07, -5.2713e-07], [ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 2.7940e-09, 9.3132e-10], [ 2.7940e-09, -1.0058e-07, 0.0000e+00, ..., 1.3039e-08, -4.2841e-07, -1.3784e-07], ..., [ 0.0000e+00, 1.0151e-07, 0.0000e+00, ..., -1.8626e-09, 4.7311e-07, 1.6578e-07], [ 9.3132e-09, 1.8626e-09, 0.0000e+00, ..., -1.4901e-08, 2.1420e-08, 1.3039e-08], [ 9.3132e-10, -1.8626e-09, 0.0000e+00, ..., 1.8626e-09, 7.3202e-07, 4.4610e-07]], device='cuda:0') Epoch 278, bias, value: tensor([-0.0162, -0.0372, -0.0135, -0.0243, -0.0293, 0.0066, 0.0269, -0.0119, 0.0404, -0.0036], device='cuda:0'), grad: tensor([-1.3905e-06, 5.4017e-08, -1.0028e-05, 1.9558e-08, 5.8766e-07, 3.4086e-07, 6.4261e-08, 3.3677e-06, 5.8413e-06, 1.1548e-06], device='cuda:0') 100 0.0001 changing lr epoch 277, time 250.40, cls_loss 0.0026 cls_loss_mapping 0.0031 cls_loss_causal 0.4941 re_mapping 0.0048 re_causal 0.0120 /// teacc 99.04 lr 0.00010000 Epoch 279, weight, value: tensor([[-0.1171, -0.2372, -0.0779, ..., -0.0692, 0.1751, 0.1727], [-0.2155, -0.1840, -0.0898, ..., -0.1627, -0.2180, -0.1253], [-0.0718, -0.1573, 0.1377, ..., -0.2080, 0.2416, 0.0871], ..., [-0.1497, 0.0738, 0.0407, ..., 0.1865, -0.2185, -0.2381], [-0.2676, 0.0662, -0.1449, ..., 0.0563, -0.1046, -0.1746], [-0.0115, -0.1231, -0.0955, ..., -0.1054, -0.0603, -0.2070]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 8.3819e-09, 0.0000e+00, ..., 8.3819e-09, 7.4506e-09, 6.5193e-09], [ 5.5879e-09, 3.0734e-08, 0.0000e+00, ..., 4.6566e-08, 2.7940e-09, 9.3132e-10], [ 1.8626e-09, 1.4156e-07, 0.0000e+00, ..., 5.3085e-08, -1.0245e-07, -4.6566e-08], ..., [ 9.3132e-09, -3.8836e-07, 0.0000e+00, ..., -1.7416e-07, 2.6077e-08, 1.2107e-08], [ 6.2585e-07, 9.3132e-09, 0.0000e+00, ..., 3.8221e-06, 7.9162e-08, 3.5390e-08], [ 2.7940e-08, 5.4948e-08, 0.0000e+00, ..., 3.7253e-08, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 279, bias, value: tensor([-0.0160, -0.0363, -0.0137, -0.0243, -0.0293, 0.0080, 0.0263, -0.0126, 0.0389, -0.0041], device='cuda:0'), grad: tensor([ 5.9605e-08, 1.5646e-07, 1.7788e-07, 1.4594e-06, 1.6410e-06, -1.0870e-05, 4.2375e-07, -7.8138e-07, 9.1717e-06, -1.4342e-06], device='cuda:0') 100 0.0001 changing lr epoch 278, time 250.48, cls_loss 0.0023 cls_loss_mapping 0.0021 cls_loss_causal 0.4885 re_mapping 0.0048 re_causal 0.0123 /// teacc 99.02 lr 0.00010000 Epoch 280, weight, value: tensor([[-0.1168, -0.2373, -0.0780, ..., -0.0683, 0.1755, 0.1733], [-0.2156, -0.1842, -0.0896, ..., -0.1629, -0.2181, -0.1254], [-0.0719, -0.1574, 0.1376, ..., -0.2081, 0.2424, 0.0875], ..., [-0.1510, 0.0744, 0.0407, ..., 0.1870, -0.2188, -0.2402], [-0.2690, 0.0661, -0.1451, ..., 0.0559, -0.1049, -0.1747], [-0.0120, -0.1241, -0.0955, ..., -0.1061, -0.0604, -0.2075]], device='cuda:0'), grad: tensor([[ 1.2107e-08, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 5.5879e-09], [ 2.7940e-09, 5.5879e-09, 0.0000e+00, ..., 3.7253e-09, 1.8626e-09, 0.0000e+00], [ 9.3132e-10, 1.8626e-09, 0.0000e+00, ..., 9.3132e-10, -1.2107e-08, 0.0000e+00], ..., [ 9.3132e-10, -5.3085e-08, 0.0000e+00, ..., -8.1956e-08, 1.8626e-09, 0.0000e+00], [ 1.3039e-08, 7.4506e-09, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 3.7253e-09], [ 7.4506e-09, 5.7742e-08, 0.0000e+00, ..., 7.5437e-08, 1.8626e-09, 9.3132e-10]], device='cuda:0') Epoch 280, bias, value: tensor([-0.0158, -0.0362, -0.0139, -0.0237, -0.0290, 0.0079, 0.0262, -0.0125, 0.0380, -0.0046], device='cuda:0'), grad: tensor([ 3.3528e-08, -2.2817e-07, -2.6077e-08, -1.3132e-07, 2.0582e-07, 2.7567e-07, -2.5425e-07, -2.0768e-07, 5.4948e-08, 2.8592e-07], device='cuda:0') 100 0.0001 changing lr epoch 279, time 250.32, cls_loss 0.0022 cls_loss_mapping 0.0026 cls_loss_causal 0.5018 re_mapping 0.0047 re_causal 0.0123 /// teacc 98.97 lr 0.00010000 Epoch 281, weight, value: tensor([[-0.1173, -0.2380, -0.0790, ..., -0.0682, 0.1755, 0.1735], [-0.2158, -0.1843, -0.0896, ..., -0.1631, -0.2183, -0.1257], [-0.0720, -0.1574, 0.1375, ..., -0.2080, 0.2435, 0.0886], ..., [-0.1514, 0.0746, 0.0406, ..., 0.1873, -0.2190, -0.2412], [-0.2691, 0.0660, -0.1452, ..., 0.0559, -0.1052, -0.1752], [-0.0154, -0.1246, -0.0955, ..., -0.1064, -0.0605, -0.2081]], device='cuda:0'), grad: tensor([[ 2.7940e-09, 1.8626e-09, 0.0000e+00, ..., 1.7695e-08, 1.2759e-07, 9.4995e-08], [ 1.8626e-09, 1.5832e-08, 2.3562e-07, ..., 2.7940e-08, 3.9227e-06, 1.1269e-07], [ 9.3132e-10, 3.5390e-08, -2.3935e-07, ..., -1.1269e-07, -4.9695e-06, -8.6706e-07], ..., [ 9.3132e-10, 1.1828e-07, 1.8626e-09, ..., 4.5635e-08, 6.5099e-07, 4.7218e-07], [ 7.4506e-09, 9.3132e-09, 0.0000e+00, ..., 2.9802e-08, 2.1700e-07, 1.6484e-07], [ 2.7940e-09, 1.0245e-08, 0.0000e+00, ..., 6.5193e-09, 1.2107e-08, 9.3132e-09]], device='cuda:0') Epoch 281, bias, value: tensor([-0.0160, -0.0360, -0.0135, -0.0235, -0.0288, 0.0078, 0.0266, -0.0127, 0.0382, -0.0053], device='cuda:0'), grad: tensor([ 4.0885e-07, 6.2101e-06, -9.3505e-06, -3.4086e-07, 2.3283e-08, 3.6135e-07, -2.9244e-07, 2.2184e-06, 7.1619e-07, 4.6566e-08], device='cuda:0') 100 0.0001 changing lr epoch 280, time 250.12, cls_loss 0.0022 cls_loss_mapping 0.0021 cls_loss_causal 0.4760 re_mapping 0.0049 re_causal 0.0122 /// teacc 98.99 lr 0.00010000 Epoch 282, weight, value: tensor([[-0.1175, -0.2381, -0.0792, ..., -0.0690, 0.1757, 0.1736], [-0.2158, -0.1849, -0.0893, ..., -0.1634, -0.2185, -0.1262], [-0.0722, -0.1577, 0.1375, ..., -0.2084, 0.2438, 0.0894], ..., [-0.1521, 0.0752, 0.0405, ..., 0.1877, -0.2191, -0.2418], [-0.2695, 0.0659, -0.1454, ..., 0.0560, -0.1046, -0.1770], [-0.0168, -0.1248, -0.0956, ..., -0.1067, -0.0605, -0.2082]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 9.3132e-10, 9.3132e-10, ..., 0.0000e+00, 8.3819e-09, 3.7253e-09], [ 9.3132e-10, 2.7940e-08, 4.2841e-08, ..., 1.7695e-08, 6.7614e-07, 3.3528e-07], [ 9.3132e-10, 9.3132e-10, -6.2399e-08, ..., 9.3132e-10, -9.8627e-07, -4.9081e-07], ..., [ 3.7253e-09, -1.0896e-07, 4.6566e-09, ..., -8.6613e-08, 7.7300e-08, 3.8184e-08], [ 1.1176e-08, 7.4506e-09, 9.3132e-09, ..., -0.0000e+00, 1.3784e-07, 6.8918e-08], [ 1.8626e-09, 8.2888e-08, 0.0000e+00, ..., 6.5193e-08, 4.6566e-09, 1.8626e-09]], device='cuda:0') Epoch 282, bias, value: tensor([-0.0161, -0.0359, -0.0135, -0.0233, -0.0292, 0.0077, 0.0264, -0.0126, 0.0381, -0.0051], device='cuda:0'), grad: tensor([ 4.2841e-08, 2.8405e-06, -4.0159e-06, -1.2293e-07, 2.6543e-07, 2.0023e-07, 2.1141e-07, 1.7695e-07, 5.8208e-07, -1.8347e-07], device='cuda:0') 100 0.0001 changing lr epoch 281, time 250.82, cls_loss 0.0025 cls_loss_mapping 0.0034 cls_loss_causal 0.5122 re_mapping 0.0048 re_causal 0.0123 /// teacc 99.01 lr 0.00010000 Epoch 283, weight, value: tensor([[-0.1176, -0.2382, -0.0796, ..., -0.0694, 0.1760, 0.1740], [-0.2159, -0.1850, -0.0890, ..., -0.1636, -0.2187, -0.1263], [-0.0728, -0.1597, 0.1380, ..., -0.2089, 0.2447, 0.0898], ..., [-0.1511, 0.0752, 0.0402, ..., 0.1874, -0.2195, -0.2434], [-0.2707, 0.0670, -0.1457, ..., 0.0564, -0.1051, -0.1786], [-0.0174, -0.1252, -0.0951, ..., -0.1069, -0.0607, -0.2087]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -2.7940e-09, 9.3132e-10, ..., 0.0000e+00, -1.0896e-07, -7.7300e-08], [ 2.7940e-09, 5.5879e-09, 2.3283e-08, ..., 0.0000e+00, 5.8673e-08, 1.8626e-09], [ 9.3132e-10, 1.8626e-09, -9.4995e-08, ..., 0.0000e+00, -2.1141e-07, 2.1420e-08], ..., [ 1.5832e-07, 3.7346e-07, 1.3970e-08, ..., 0.0000e+00, 3.7253e-08, 9.3132e-10], [ 2.7940e-09, 5.5879e-09, 0.0000e+00, ..., 0.0000e+00, 1.6764e-08, 1.1176e-08], [ 2.3283e-08, 3.7253e-09, 0.0000e+00, ..., 1.3039e-08, 4.9360e-08, 3.4459e-08]], device='cuda:0') Epoch 283, bias, value: tensor([-0.0160, -0.0360, -0.0137, -0.0235, -0.0292, 0.0082, 0.0265, -0.0126, 0.0381, -0.0053], device='cuda:0'), grad: tensor([-8.4750e-08, -3.7812e-07, -3.3248e-07, -4.5076e-07, 2.0005e-06, -6.5193e-08, 2.3935e-07, 6.5099e-07, 8.1956e-08, -1.6605e-06], device='cuda:0') 100 0.0001 changing lr epoch 282, time 250.51, cls_loss 0.0022 cls_loss_mapping 0.0022 cls_loss_causal 0.4795 re_mapping 0.0047 re_causal 0.0122 /// teacc 99.00 lr 0.00010000 Epoch 284, weight, value: tensor([[-0.1174, -0.2384, -0.0798, ..., -0.0683, 0.1762, 0.1744], [-0.2160, -0.1854, -0.0877, ..., -0.1640, -0.2188, -0.1264], [-0.0729, -0.1610, 0.1383, ..., -0.2121, 0.2420, 0.0901], ..., [-0.1529, 0.0757, 0.0395, ..., 0.1892, -0.2169, -0.2447], [-0.2711, 0.0669, -0.1475, ..., 0.0562, -0.1055, -0.1790], [-0.0177, -0.1253, -0.0955, ..., -0.1072, -0.0607, -0.2092]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, -5.7742e-08, -3.0734e-08], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 7.6368e-08, 2.6543e-08], [ 0.0000e+00, -5.1223e-09, 0.0000e+00, ..., 3.2596e-09, -1.8999e-07, -6.1933e-08], ..., [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 1.0710e-07, 3.5856e-08], [ 1.1176e-08, 1.8626e-09, 0.0000e+00, ..., -4.6566e-10, 1.3504e-08, 6.9849e-09], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 2.9802e-08, 1.5367e-08]], device='cuda:0') Epoch 284, bias, value: tensor([-0.0159, -0.0361, -0.0165, -0.0237, -0.0293, 0.0088, 0.0262, -0.0113, 0.0378, -0.0052], device='cuda:0'), grad: tensor([-1.0384e-07, 1.7555e-07, -4.1863e-07, 1.8161e-08, -2.6962e-07, 1.4482e-07, -1.3597e-07, 3.2037e-07, 1.2340e-07, 1.6298e-07], device='cuda:0') 100 0.0001 changing lr epoch 283, time 250.73, cls_loss 0.0021 cls_loss_mapping 0.0021 cls_loss_causal 0.4700 re_mapping 0.0050 re_causal 0.0126 /// teacc 99.04 lr 0.00010000 Epoch 285, weight, value: tensor([[-0.1175, -0.2385, -0.0799, ..., -0.0683, 0.1764, 0.1746], [-0.2161, -0.1856, -0.0871, ..., -0.1642, -0.2188, -0.1265], [-0.0729, -0.1608, 0.1382, ..., -0.2121, 0.2422, 0.0903], ..., [-0.1530, 0.0758, 0.0393, ..., 0.1897, -0.2170, -0.2452], [-0.2713, 0.0668, -0.1482, ..., 0.0563, -0.1060, -0.1798], [-0.0186, -0.1256, -0.0957, ..., -0.1080, -0.0610, -0.2098]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 2.5611e-08, 4.6566e-10, ..., 9.3132e-09, -1.4901e-08, -9.3132e-09], [ 1.3970e-09, 2.6962e-07, 0.0000e+00, ..., 5.2946e-07, 9.3132e-10, 9.3132e-10], [ 4.6566e-10, 2.7940e-08, 0.0000e+00, ..., 4.5169e-08, 6.0536e-09, 4.1910e-09], ..., [ 1.3970e-09, -1.3215e-06, 0.0000e+00, ..., -1.3579e-06, 4.6566e-10, 0.0000e+00], [ 9.3132e-10, 1.2945e-07, 4.6566e-10, ..., 1.4901e-07, 0.0000e+00, 0.0000e+00], [ 1.3970e-09, 3.6275e-07, -9.3132e-10, ..., 2.9057e-07, 6.0536e-09, 3.7253e-09]], device='cuda:0') Epoch 285, bias, value: tensor([-0.0158, -0.0356, -0.0165, -0.0234, -0.0294, 0.0086, 0.0262, -0.0117, 0.0375, -0.0055], device='cuda:0'), grad: tensor([ 3.6787e-08, 7.8464e-07, 1.8254e-07, 4.4098e-07, 4.7358e-07, 2.3656e-07, 3.5390e-08, -4.1686e-06, 9.1596e-07, 1.0598e-06], device='cuda:0') 100 0.0001 changing lr epoch 284, time 250.32, cls_loss 0.0021 cls_loss_mapping 0.0016 cls_loss_causal 0.4917 re_mapping 0.0050 re_causal 0.0130 /// teacc 99.05 lr 0.00010000 Epoch 286, weight, value: tensor([[-0.1172, -0.2387, -0.0801, ..., -0.0672, 0.1768, 0.1751], [-0.2161, -0.1858, -0.0862, ..., -0.1645, -0.2193, -0.1267], [-0.0731, -0.1643, 0.1380, ..., -0.2128, 0.2424, 0.0901], ..., [-0.1546, 0.0769, 0.0391, ..., 0.1903, -0.2170, -0.2458], [-0.2731, 0.0660, -0.1485, ..., 0.0557, -0.1060, -0.1799], [-0.0189, -0.1262, -0.0960, ..., -0.1086, -0.0611, -0.2104]], device='cuda:0'), grad: tensor([[ 1.1176e-08, 4.6566e-10, 4.6566e-10, ..., 9.3132e-10, -4.0978e-08, -2.7008e-08], [ 9.3132e-10, 4.8662e-07, 0.0000e+00, ..., 9.4529e-07, 1.3970e-09, 9.3132e-10], [ 2.7940e-09, 2.3283e-09, 0.0000e+00, ..., 3.7253e-09, 6.0536e-09, 4.1910e-09], ..., [ 0.0000e+00, -5.3924e-07, 0.0000e+00, ..., -1.0468e-06, 1.8626e-09, 1.3970e-09], [ 3.2596e-09, -0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 9.7789e-09, 6.5193e-09], [ 0.0000e+00, 4.3306e-08, 0.0000e+00, ..., 8.0094e-08, 3.5390e-08, 2.4214e-08]], device='cuda:0') Epoch 286, bias, value: tensor([-0.0156, -0.0355, -0.0167, -0.0216, -0.0296, 0.0072, 0.0260, -0.0116, 0.0372, -0.0057], device='cuda:0'), grad: tensor([-2.7474e-08, 2.8275e-06, 5.4017e-08, 2.9337e-08, 6.2399e-08, 2.0023e-08, -6.7055e-08, -3.1292e-06, -4.3772e-08, 2.6450e-07], device='cuda:0') 100 0.0001 changing lr epoch 285, time 250.27, cls_loss 0.0022 cls_loss_mapping 0.0017 cls_loss_causal 0.4996 re_mapping 0.0047 re_causal 0.0122 /// teacc 99.06 lr 0.00010000 Epoch 287, weight, value: tensor([[-0.1174, -0.2388, -0.0802, ..., -0.0671, 0.1769, 0.1752], [-0.2162, -0.1861, -0.0858, ..., -0.1647, -0.2191, -0.1267], [-0.0758, -0.1644, 0.1378, ..., -0.2128, 0.2413, 0.0883], ..., [-0.1550, 0.0772, 0.0391, ..., 0.1908, -0.2171, -0.2469], [-0.2735, 0.0659, -0.1487, ..., 0.0557, -0.1059, -0.1805], [-0.0206, -0.1268, -0.0960, ..., -0.1091, -0.0612, -0.2106]], device='cuda:0'), grad: tensor([[ 1.5832e-08, 2.5611e-08, 0.0000e+00, ..., 4.6566e-10, -4.2208e-06, -2.7847e-06], [ 3.2596e-09, 6.9849e-09, 0.0000e+00, ..., 2.7940e-09, 2.0489e-08, 2.0955e-08], [ 9.3132e-09, 1.6764e-08, 0.0000e+00, ..., 3.7253e-09, 1.2992e-06, 8.5216e-07], ..., [ 7.4506e-09, 9.3132e-09, 0.0000e+00, ..., -1.8626e-09, 5.4482e-08, 3.6787e-08], [ 1.9558e-08, 2.7474e-08, 0.0000e+00, ..., -3.1199e-08, 1.8114e-07, 1.1828e-07], [ 2.2817e-08, 4.2841e-08, 0.0000e+00, ..., 3.2596e-09, 3.1432e-07, 2.0675e-07]], device='cuda:0') Epoch 287, bias, value: tensor([-0.0156, -0.0353, -0.0177, -0.0215, -0.0298, 0.0063, 0.0280, -0.0115, 0.0383, -0.0060], device='cuda:0'), grad: tensor([-6.3442e-06, -7.9628e-08, 2.1420e-06, -5.0664e-07, 3.6787e-08, 5.3551e-07, 3.4608e-06, 2.0536e-07, -5.5879e-09, 5.6624e-07], device='cuda:0') 100 0.0001 changing lr epoch 286, time 250.32, cls_loss 0.0028 cls_loss_mapping 0.0028 cls_loss_causal 0.4976 re_mapping 0.0047 re_causal 0.0121 /// teacc 99.06 lr 0.00010000 Epoch 288, weight, value: tensor([[-0.1185, -0.2389, -0.0802, ..., -0.0669, 0.1769, 0.1752], [-0.2175, -0.1864, -0.0858, ..., -0.1656, -0.2192, -0.1276], [-0.0757, -0.1648, 0.1378, ..., -0.2130, 0.2414, 0.0893], ..., [-0.1561, 0.0791, 0.0391, ..., 0.1922, -0.2172, -0.2494], [-0.2737, 0.0658, -0.1487, ..., 0.0555, -0.1061, -0.1812], [-0.0208, -0.1296, -0.0960, ..., -0.1112, -0.0615, -0.2119]], device='cuda:0'), grad: tensor([[ 1.6438e-07, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 9.3132e-10, 3.7253e-09], [ 6.6496e-06, 2.3283e-09, 0.0000e+00, ..., 2.3283e-09, 9.3132e-10, 9.3132e-10], [ 7.9162e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -8.3819e-09, -6.0536e-09], ..., [ 7.9162e-09, -5.3551e-08, 0.0000e+00, ..., -6.0536e-08, 8.3819e-09, 6.0536e-09], [ 2.0973e-06, 4.6566e-10, 0.0000e+00, ..., 1.3970e-09, 1.3970e-09, 1.1083e-07], [ 9.3272e-07, 3.7253e-08, 0.0000e+00, ..., 4.2841e-08, 4.6566e-10, 4.6566e-10]], device='cuda:0') Epoch 288, bias, value: tensor([-0.0157, -0.0347, -0.0176, -0.0212, -0.0306, 0.0063, 0.0288, -0.0117, 0.0382, -0.0069], device='cuda:0'), grad: tensor([ 5.0897e-07, 3.7521e-05, 3.9302e-06, 3.4213e-04, 1.0338e-07, -3.7241e-04, -3.5344e-07, -2.1443e-05, 6.5528e-06, 3.1218e-06], device='cuda:0') 100 0.0001 changing lr epoch 287, time 250.30, cls_loss 0.0020 cls_loss_mapping 0.0014 cls_loss_causal 0.4994 re_mapping 0.0051 re_causal 0.0129 /// teacc 99.09 lr 0.00010000 Epoch 289, weight, value: tensor([[-0.1182, -0.2390, -0.0802, ..., -0.0657, 0.1771, 0.1757], [-0.2183, -0.1875, -0.0857, ..., -0.1670, -0.2193, -0.1281], [-0.0757, -0.1649, 0.1378, ..., -0.2129, 0.2417, 0.0902], ..., [-0.1562, 0.0810, 0.0391, ..., 0.1941, -0.2175, -0.2517], [-0.2739, 0.0657, -0.1487, ..., 0.0553, -0.1064, -0.1816], [-0.0213, -0.1316, -0.0960, ..., -0.1130, -0.0616, -0.2122]], device='cuda:0'), grad: tensor([[ 1.1642e-08, 4.6566e-10, 0.0000e+00, ..., 1.8626e-09, 2.3283e-09, 5.5879e-09], [ 4.6566e-09, 2.7940e-09, 0.0000e+00, ..., 4.6566e-09, 4.6566e-10, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 1.8626e-09, -1.1642e-08, 0.0000e+00, ..., -1.3504e-08, 0.0000e+00, 0.0000e+00], [ 1.9558e-07, 9.3132e-10, 0.0000e+00, ..., 7.9162e-09, 5.2620e-08, 1.3877e-07], [ 8.8476e-09, 1.0245e-08, 0.0000e+00, ..., 1.1642e-08, 2.7940e-09, 1.8626e-09]], device='cuda:0') Epoch 289, bias, value: tensor([-0.0155, -0.0347, -0.0174, -0.0216, -0.0308, 0.0067, 0.0282, -0.0112, 0.0377, -0.0080], device='cuda:0'), grad: tensor([ 3.4459e-08, -4.7497e-07, 7.9162e-09, 1.8813e-07, 1.2526e-07, -3.8650e-08, -6.4401e-07, -1.1176e-08, 7.5018e-07, 5.5414e-08], device='cuda:0') 100 0.0001 changing lr epoch 288, time 250.45, cls_loss 0.0020 cls_loss_mapping 0.0021 cls_loss_causal 0.4964 re_mapping 0.0050 re_causal 0.0127 /// teacc 99.06 lr 0.00010000 Epoch 290, weight, value: tensor([[-0.1185, -0.2392, -0.0802, ..., -0.0657, 0.1775, 0.1760], [-0.2187, -0.1878, -0.0854, ..., -0.1672, -0.2195, -0.1285], [-0.0757, -0.1651, 0.1377, ..., -0.2130, 0.2418, 0.0902], ..., [-0.1563, 0.0813, 0.0391, ..., 0.1944, -0.2175, -0.2525], [-0.2741, 0.0655, -0.1488, ..., 0.0553, -0.1068, -0.1824], [-0.0218, -0.1319, -0.0960, ..., -0.1133, -0.0617, -0.2128]], device='cuda:0'), grad: tensor([[-6.9849e-09, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, -3.3528e-08, -2.3283e-08], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 9.3132e-10, 4.6566e-10], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, -1.0571e-07, 9.3132e-10], ..., [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 4.6566e-10], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.3970e-09, 9.3132e-10], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 1.0896e-07, 1.8626e-09]], device='cuda:0') Epoch 290, bias, value: tensor([-0.0154, -0.0347, -0.0175, -0.0231, -0.0308, 0.0076, 0.0290, -0.0112, 0.0376, -0.0081], device='cuda:0'), grad: tensor([ 2.6077e-08, -1.8785e-06, -4.0699e-07, 1.0598e-06, 1.3914e-06, -6.4587e-07, 2.8173e-07, 4.2142e-07, 7.6508e-07, -1.0068e-06], device='cuda:0') 100 0.0001 changing lr epoch 289, time 250.33, cls_loss 0.0025 cls_loss_mapping 0.0022 cls_loss_causal 0.4829 re_mapping 0.0049 re_causal 0.0123 /// teacc 99.04 lr 0.00010000 Epoch 291, weight, value: tensor([[-0.1174, -0.2402, -0.0802, ..., -0.0629, 0.1803, 0.1783], [-0.2190, -0.1881, -0.0854, ..., -0.1677, -0.2200, -0.1304], [-0.0757, -0.1654, 0.1377, ..., -0.2131, 0.2419, 0.0897], ..., [-0.1571, 0.0815, 0.0391, ..., 0.1948, -0.2177, -0.2541], [-0.2743, 0.0658, -0.1488, ..., 0.0553, -0.1076, -0.1833], [-0.0222, -0.1320, -0.0960, ..., -0.1136, -0.0619, -0.2135]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 4.6566e-10, 0.0000e+00, ..., 2.3283e-09, 0.0000e+00, 0.0000e+00], [ 4.1910e-09, 1.3039e-08, 0.0000e+00, ..., 5.2154e-08, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 1.3970e-09, 0.0000e+00, ..., 2.3283e-08, 0.0000e+00, 0.0000e+00], ..., [ 1.3970e-09, -1.7229e-08, 0.0000e+00, ..., -9.1735e-08, 0.0000e+00, 0.0000e+00], [ 4.2841e-08, 6.5193e-09, 0.0000e+00, ..., 1.7695e-08, 0.0000e+00, 0.0000e+00], [ 3.7253e-09, 9.7789e-09, 0.0000e+00, ..., 3.1665e-08, 9.3132e-10, 4.6566e-10]], device='cuda:0') Epoch 291, bias, value: tensor([-0.0132, -0.0346, -0.0176, -0.0234, -0.0315, 0.0076, 0.0262, -0.0112, 0.0386, -0.0080], device='cuda:0'), grad: tensor([ 1.5832e-08, 8.0559e-08, 2.2398e-07, -2.6636e-07, -1.1642e-08, 1.3132e-07, 2.8405e-08, -1.4389e-07, -1.3970e-07, 9.2201e-08], device='cuda:0') 100 0.0001 changing lr epoch 290, time 250.43, cls_loss 0.0021 cls_loss_mapping 0.0020 cls_loss_causal 0.4857 re_mapping 0.0048 re_causal 0.0125 /// teacc 99.09 lr 0.00010000 Epoch 292, weight, value: tensor([[-0.1142, -0.2401, -0.0804, ..., -0.0615, 0.1838, 0.1819], [-0.2191, -0.1883, -0.0854, ..., -0.1679, -0.2201, -0.1312], [-0.0759, -0.1655, 0.1381, ..., -0.2131, 0.2422, 0.0896], ..., [-0.1578, 0.0820, 0.0390, ..., 0.1953, -0.2180, -0.2577], [-0.2748, 0.0658, -0.1491, ..., 0.0550, -0.1086, -0.1861], [-0.0225, -0.1326, -0.0960, ..., -0.1141, -0.0619, -0.2135]], device='cuda:0'), grad: tensor([[ 4.1910e-09, 4.6566e-09, 0.0000e+00, ..., 9.3132e-10, 1.3970e-09, 4.6566e-10], [ 2.7940e-09, 2.8405e-08, 3.2596e-09, ..., 3.7719e-08, 3.6322e-08, 1.0245e-08], [ 1.8626e-09, 4.1910e-09, -7.4506e-09, ..., 1.3970e-09, -8.8941e-08, -2.5611e-08], ..., [ 1.8626e-09, -1.3923e-07, 1.8626e-09, ..., -2.0768e-07, 2.0023e-08, 5.5879e-09], [ 1.6298e-08, 1.4435e-08, 2.3283e-09, ..., 3.2596e-09, 2.7474e-08, 7.9162e-09], [ 2.4214e-08, 1.4203e-07, 0.0000e+00, ..., 1.6764e-07, 9.3132e-10, 4.6566e-10]], device='cuda:0') Epoch 292, bias, value: tensor([-0.0096, -0.0346, -0.0175, -0.0234, -0.0315, 0.0074, 0.0226, -0.0111, 0.0384, -0.0082], device='cuda:0'), grad: tensor([ 2.2817e-08, 1.9418e-07, -1.6438e-07, -2.4633e-07, 1.1176e-08, 7.7300e-08, 3.2131e-08, -6.4960e-07, 7.7300e-08, 6.4960e-07], device='cuda:0') 100 0.0001 changing lr epoch 291, time 250.14, cls_loss 0.0019 cls_loss_mapping 0.0023 cls_loss_causal 0.4776 re_mapping 0.0046 re_causal 0.0120 /// teacc 99.07 lr 0.00010000 Epoch 293, weight, value: tensor([[-0.1142, -0.2403, -0.0822, ..., -0.0616, 0.1839, 0.1820], [-0.2191, -0.1888, -0.0855, ..., -0.1683, -0.2202, -0.1314], [-0.0760, -0.1654, 0.1389, ..., -0.2132, 0.2424, 0.0897], ..., [-0.1578, 0.0828, 0.0390, ..., 0.1960, -0.2181, -0.2583], [-0.2751, 0.0657, -0.1488, ..., 0.0549, -0.1092, -0.1874], [-0.0228, -0.1333, -0.0958, ..., -0.1149, -0.0619, -0.2138]], device='cuda:0'), grad: tensor([[ 7.9162e-09, 4.6566e-10, 0.0000e+00, ..., 3.2596e-09, -6.6170e-07, -3.9674e-07], [ 4.1910e-09, 1.3039e-08, 0.0000e+00, ..., 2.7940e-08, 3.5856e-08, 2.1886e-08], [ 2.7940e-09, 4.6566e-09, 0.0000e+00, ..., 2.8405e-08, -4.4703e-08, 3.7253e-09], ..., [ 1.8626e-09, -2.1886e-08, 0.0000e+00, ..., -2.3749e-08, 4.6566e-09, 1.8626e-09], [ 4.6566e-09, 6.0536e-09, 0.0000e+00, ..., -7.4506e-08, 1.9558e-08, 1.0710e-08], [ 2.3283e-09, 1.7229e-08, -0.0000e+00, ..., 1.8161e-08, 1.8300e-07, 1.0896e-07]], device='cuda:0') Epoch 293, bias, value: tensor([-0.0096, -0.0343, -0.0175, -0.0235, -0.0308, 0.0074, 0.0226, -0.0111, 0.0383, -0.0092], device='cuda:0'), grad: tensor([-1.4864e-06, 2.8452e-07, 2.5611e-08, 9.4576e-07, -2.6077e-08, 1.3411e-07, 5.4948e-08, -1.4435e-08, -4.1537e-07, 5.0664e-07], device='cuda:0') 100 0.0001 changing lr epoch 292, time 250.26, cls_loss 0.0021 cls_loss_mapping 0.0020 cls_loss_causal 0.4935 re_mapping 0.0045 re_causal 0.0118 /// teacc 99.03 lr 0.00010000 Epoch 294, weight, value: tensor([[-0.1142, -0.2409, -0.0824, ..., -0.0616, 0.1840, 0.1821], [-0.2195, -0.1889, -0.0855, ..., -0.1687, -0.2207, -0.1349], [-0.0756, -0.1656, 0.1390, ..., -0.2132, 0.2430, 0.0927], ..., [-0.1583, 0.0829, 0.0389, ..., 0.1962, -0.2184, -0.2614], [-0.2753, 0.0661, -0.1481, ..., 0.0553, -0.1096, -0.1886], [-0.0235, -0.1335, -0.0957, ..., -0.1151, -0.0620, -0.2141]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 1.8626e-09, 1.3970e-09], [ 1.3970e-09, 2.5146e-08, 0.0000e+00, ..., 2.2352e-08, 2.3283e-09, 1.3970e-09], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., -6.0536e-09, -4.0047e-08, -3.2131e-08], ..., [ 1.3970e-09, -4.0513e-08, 0.0000e+00, ..., -4.0978e-08, 3.0268e-08, 2.6077e-08], [ 1.3970e-09, 6.5193e-09, 0.0000e+00, ..., 4.1910e-09, 3.2596e-09, 2.3283e-09], [ 2.3283e-09, 5.1223e-09, 0.0000e+00, ..., 6.9849e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 294, bias, value: tensor([-0.0095, -0.0344, -0.0172, -0.0236, -0.0307, 0.0076, 0.0225, -0.0112, 0.0385, -0.0091], device='cuda:0'), grad: tensor([ 1.0245e-08, 1.3784e-07, -1.1036e-07, 3.3528e-08, 9.0748e-06, 9.3132e-10, 6.0536e-09, -1.5832e-08, 3.1851e-07, -9.4324e-06], device='cuda:0') 100 0.0001 changing lr epoch 293, time 250.54, cls_loss 0.0015 cls_loss_mapping 0.0022 cls_loss_causal 0.4848 re_mapping 0.0046 re_causal 0.0125 /// teacc 98.98 lr 0.00010000 Epoch 295, weight, value: tensor([[-0.1143, -0.2411, -0.0826, ..., -0.0616, 0.1840, 0.1821], [-0.2196, -0.1896, -0.0855, ..., -0.1692, -0.2209, -0.1350], [-0.0757, -0.1657, 0.1390, ..., -0.2132, 0.2434, 0.0933], ..., [-0.1586, 0.0831, 0.0387, ..., 0.1968, -0.2186, -0.2626], [-0.2755, 0.0661, -0.1480, ..., 0.0553, -0.1098, -0.1888], [-0.0241, -0.1330, -0.0949, ..., -0.1156, -0.0618, -0.2147]], device='cuda:0'), grad: tensor([[ 1.0245e-08, 1.3970e-09, 0.0000e+00, ..., -2.0955e-08, -2.6077e-07, -2.7521e-07], [ 9.3132e-10, 1.2387e-07, 0.0000e+00, ..., 1.9278e-07, 2.0489e-08, 2.1886e-08], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 5.1223e-09, 1.3970e-08, 1.4901e-08], ..., [ 0.0000e+00, -4.3865e-07, 0.0000e+00, ..., -6.2259e-07, 1.7229e-08, 1.7695e-08], [ 9.3132e-10, 4.6566e-09, 0.0000e+00, ..., 1.8161e-08, 1.9046e-07, 2.0163e-07], [ 0.0000e+00, 3.0268e-07, 0.0000e+00, ..., 4.1444e-07, 2.0955e-08, 2.0023e-08]], device='cuda:0') Epoch 295, bias, value: tensor([-0.0096, -0.0345, -0.0172, -0.0239, -0.0306, 0.0079, 0.0225, -0.0120, 0.0386, -0.0071], device='cuda:0'), grad: tensor([-6.4075e-07, 7.9675e-07, 4.6566e-08, 1.9558e-08, 1.6298e-08, 6.1933e-08, -5.2620e-08, -1.9316e-06, 4.7917e-07, 1.2126e-06], device='cuda:0') 100 0.0001 changing lr epoch 294, time 250.52, cls_loss 0.0025 cls_loss_mapping 0.0022 cls_loss_causal 0.5022 re_mapping 0.0045 re_causal 0.0115 /// teacc 99.07 lr 0.00010000 Epoch 296, weight, value: tensor([[-0.1143, -0.2416, -0.0826, ..., -0.0611, 0.1840, 0.1822], [-0.2197, -0.1927, -0.0855, ..., -0.1720, -0.2244, -0.1371], [-0.0759, -0.1663, 0.1395, ..., -0.2134, 0.2460, 0.0941], ..., [-0.1594, 0.0856, 0.0386, ..., 0.1987, -0.2187, -0.2639], [-0.2768, 0.0662, -0.1483, ..., 0.0548, -0.1090, -0.1914], [-0.0242, -0.1333, -0.0946, ..., -0.1162, -0.0620, -0.2153]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 1.3970e-09, 1.3970e-09], [ 1.8626e-09, 1.3970e-09, 0.0000e+00, ..., 1.3970e-09, 4.6566e-09, 4.1910e-09], [ 0.0000e+00, -5.9139e-08, 0.0000e+00, ..., -1.9977e-07, -3.3155e-07, -2.6310e-07], ..., [ 4.6566e-10, 5.0757e-08, 0.0000e+00, ..., 1.8300e-07, 3.1991e-07, 2.5472e-07], [ 3.7253e-09, 3.7253e-09, 0.0000e+00, ..., 6.5193e-09, 1.3970e-08, 1.1176e-08], [ 9.3132e-10, 4.6566e-09, 0.0000e+00, ..., 3.7253e-09, 1.3970e-09, 9.3132e-10]], device='cuda:0') Epoch 296, bias, value: tensor([-0.0096, -0.0361, -0.0159, -0.0240, -0.0301, 0.0082, 0.0224, -0.0110, 0.0381, -0.0073], device='cuda:0'), grad: tensor([ 4.0047e-08, 6.6124e-08, -8.8289e-07, 9.3132e-10, 7.6462e-07, 1.0198e-07, -3.9581e-08, 8.9267e-07, 6.5044e-06, -7.4357e-06], device='cuda:0') 100 0.0001 changing lr epoch 295, time 250.28, cls_loss 0.0019 cls_loss_mapping 0.0023 cls_loss_causal 0.4658 re_mapping 0.0048 re_causal 0.0119 /// teacc 99.05 lr 0.00010000 Epoch 297, weight, value: tensor([[-0.1143, -0.2417, -0.0828, ..., -0.0608, 0.1840, 0.1822], [-0.2197, -0.1928, -0.0855, ..., -0.1721, -0.2249, -0.1363], [-0.0759, -0.1667, 0.1396, ..., -0.2135, 0.2467, 0.0946], ..., [-0.1598, 0.0856, 0.0383, ..., 0.1989, -0.2190, -0.2663], [-0.2770, 0.0661, -0.1476, ..., 0.0549, -0.1091, -0.1915], [-0.0242, -0.1335, -0.0927, ..., -0.1166, -0.0618, -0.2161]], device='cuda:0'), grad: tensor([[ 1.5600e-07, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 1.0617e-07, 9.4064e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, -1.5832e-08, 4.6566e-09], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 1.1176e-08, -2.3283e-09], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 5.1223e-09, 9.3132e-10], [ 1.1642e-08, 0.0000e+00, 0.0000e+00, ..., -1.3793e-06, 1.6298e-08, 1.3039e-08], [ 4.6566e-10, 0.0000e+00, -2.3283e-09, ..., 0.0000e+00, 4.0513e-08, 2.9337e-08]], device='cuda:0') Epoch 297, bias, value: tensor([-0.0096, -0.0359, -0.0158, -0.0236, -0.0295, 0.0079, 0.0224, -0.0112, 0.0382, -0.0075], device='cuda:0'), grad: tensor([ 2.8405e-07, -2.0675e-07, 1.6578e-07, 1.3970e-08, -1.1154e-05, 4.4703e-07, 6.9514e-06, 1.0982e-05, -7.6890e-06, 2.2771e-07], device='cuda:0') 100 0.0001 changing lr epoch 296, time 249.90, cls_loss 0.0020 cls_loss_mapping 0.0015 cls_loss_causal 0.4873 re_mapping 0.0047 re_causal 0.0120 /// teacc 99.10 lr 0.00010000 Epoch 298, weight, value: tensor([[-0.1143, -0.2419, -0.0820, ..., -0.0608, 0.1841, 0.1823], [-0.2198, -0.1929, -0.0855, ..., -0.1723, -0.2251, -0.1368], [-0.0759, -0.1675, 0.1394, ..., -0.2135, 0.2472, 0.0956], ..., [-0.1601, 0.0857, 0.0382, ..., 0.1991, -0.2194, -0.2688], [-0.2772, 0.0657, -0.1475, ..., 0.0552, -0.1083, -0.1917], [-0.0244, -0.1337, -0.0922, ..., -0.1168, -0.0620, -0.2167]], device='cuda:0'), grad: tensor([[ 2.1886e-08, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, -1.0943e-07, -4.9360e-08], [ 2.7940e-09, 4.1910e-09, 0.0000e+00, ..., 6.0536e-09, 1.7229e-08, 1.3504e-08], [ 3.6787e-08, 6.0536e-09, 0.0000e+00, ..., 6.0536e-09, 1.3970e-09, 3.2596e-09], ..., [ 4.6566e-10, -3.3528e-08, 0.0000e+00, ..., -5.4482e-08, 2.3283e-09, 9.3132e-10], [ 7.2503e-07, 2.1420e-08, 4.6566e-10, ..., 2.3749e-08, 1.1511e-06, 1.1623e-06], [-3.0873e-07, 8.3819e-09, -1.8626e-09, ..., 1.3039e-08, 4.4238e-08, 2.8405e-08]], device='cuda:0') Epoch 298, bias, value: tensor([-0.0096, -0.0357, -0.0157, -0.0235, -0.0281, 0.0079, 0.0223, -0.0112, 0.0380, -0.0083], device='cuda:0'), grad: tensor([ 4.1444e-08, 1.6624e-07, 2.5751e-07, 5.5414e-08, 3.9488e-06, 6.8732e-07, -8.1137e-06, -1.2852e-07, 9.9167e-06, -6.8322e-06], device='cuda:0') 100 0.0001 changing lr epoch 297, time 250.88, cls_loss 0.0022 cls_loss_mapping 0.0020 cls_loss_causal 0.5134 re_mapping 0.0046 re_causal 0.0122 /// teacc 99.06 lr 0.00010000 Epoch 299, weight, value: tensor([[-0.1145, -0.2421, -0.0798, ..., -0.0608, 0.1842, 0.1824], [-0.2217, -0.1931, -0.0855, ..., -0.1727, -0.2253, -0.1384], [-0.0757, -0.1679, 0.1392, ..., -0.2137, 0.2472, 0.0968], ..., [-0.1603, 0.0866, 0.0382, ..., 0.1999, -0.2196, -0.2698], [-0.2779, 0.0658, -0.1476, ..., 0.0553, -0.1088, -0.1926], [-0.0247, -0.1351, -0.0919, ..., -0.1182, -0.0622, -0.2177]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -9.5926e-08, 0.0000e+00, ..., -3.0175e-07, -6.7567e-07, -6.6776e-07], [ 0.0000e+00, 7.4506e-09, 0.0000e+00, ..., 1.0710e-08, 6.9849e-09, 6.9849e-09], [ 0.0000e+00, 1.6764e-08, 0.0000e+00, ..., 2.9802e-08, 2.5611e-08, 2.8871e-08], ..., [ 0.0000e+00, -9.2667e-08, 0.0000e+00, ..., -1.0198e-07, 4.7032e-08, 4.5169e-08], [ 1.3970e-09, 5.4948e-08, 0.0000e+00, ..., 6.7987e-08, 7.4506e-09, 7.4506e-09], [-2.8871e-08, 7.9162e-09, 0.0000e+00, ..., 1.6764e-08, 2.8871e-08, 2.7940e-08]], device='cuda:0') Epoch 299, bias, value: tensor([-0.0096, -0.0358, -0.0158, -0.0231, -0.0294, 0.0075, 0.0224, -0.0108, 0.0376, -0.0087], device='cuda:0'), grad: tensor([-1.9297e-06, 4.1444e-08, 1.1316e-07, 1.1176e-07, 2.3749e-08, 6.7428e-07, 1.0571e-06, -1.8813e-07, 1.9139e-07, -9.1735e-08], device='cuda:0') 100 0.0001 changing lr epoch 298, time 250.68, cls_loss 0.0017 cls_loss_mapping 0.0015 cls_loss_causal 0.4881 re_mapping 0.0044 re_causal 0.0119 /// teacc 98.94 lr 0.00010000 Epoch 300, weight, value: tensor([[-0.1145, -0.2424, -0.0797, ..., -0.0604, 0.1843, 0.1825], [-0.2223, -0.1932, -0.0855, ..., -0.1728, -0.2254, -0.1387], [-0.0758, -0.1684, 0.1393, ..., -0.2139, 0.2473, 0.0972], ..., [-0.1612, 0.0853, 0.0382, ..., 0.1998, -0.2197, -0.2706], [-0.2781, 0.0665, -0.1477, ..., 0.0558, -0.1089, -0.1930], [-0.0247, -0.1353, -0.0918, ..., -0.1184, -0.0623, -0.2178]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 2.7940e-09, 0.0000e+00, ..., 6.0536e-09, -1.0896e-07, -4.7032e-08], [ 9.3132e-10, 3.0594e-07, 0.0000e+00, ..., 7.3761e-07, 3.2596e-09, 8.8476e-09], [ 2.3283e-09, 2.8312e-07, 0.0000e+00, ..., 6.7148e-07, 8.3819e-08, 4.3306e-08], ..., [ 7.9162e-09, -6.2305e-07, 0.0000e+00, ..., -1.5581e-06, 4.6566e-09, -9.7789e-09], [ 1.2573e-08, 2.2352e-08, 0.0000e+00, ..., -3.6322e-08, 2.3283e-09, 4.6566e-09], [ 1.3970e-09, 1.5367e-08, 0.0000e+00, ..., 3.7253e-08, 1.6764e-08, 8.3819e-09]], device='cuda:0') Epoch 300, bias, value: tensor([-0.0096, -0.0358, -0.0158, -0.0225, -0.0295, 0.0075, 0.0224, -0.0110, 0.0379, -0.0086], device='cuda:0'), grad: tensor([-2.0070e-07, 1.8794e-06, 1.8999e-06, 1.8859e-07, -1.8859e-07, 1.4622e-07, -3.8184e-08, -3.7216e-06, -8.3353e-08, 1.3784e-07], device='cuda:0') 100 0.0001 changing lr epoch 299, time 250.59, cls_loss 0.0025 cls_loss_mapping 0.0021 cls_loss_causal 0.4816 re_mapping 0.0047 re_causal 0.0119 /// teacc 99.06 lr 0.00010000 Epoch 301, weight, value: tensor([[-0.1148, -0.2426, -0.0800, ..., -0.0604, 0.1855, 0.1836], [-0.2229, -0.1933, -0.0856, ..., -0.1731, -0.2256, -0.1397], [-0.0753, -0.1691, 0.1415, ..., -0.2142, 0.2478, 0.0989], ..., [-0.1618, 0.0847, 0.0366, ..., 0.1994, -0.2200, -0.2733], [-0.2787, 0.0680, -0.1478, ..., 0.0570, -0.1076, -0.1914], [-0.0254, -0.1354, -0.0918, ..., -0.1188, -0.0645, -0.2216]], device='cuda:0'), grad: tensor([[ 2.2491e-07, 3.2131e-07, 4.6566e-10, ..., 0.0000e+00, 1.0394e-06, 5.3085e-07], [ 1.3970e-09, 5.5879e-09, 9.3132e-10, ..., 5.1223e-09, 5.1200e-05, 4.1947e-06], [ 1.4435e-08, 2.0955e-08, 0.0000e+00, ..., 1.3970e-09, -5.2840e-05, -4.2766e-06], ..., [ 4.6566e-10, -3.3993e-08, 3.2596e-09, ..., -3.9116e-08, 8.4145e-07, 6.9384e-08], [ 2.1420e-08, 2.5146e-08, 9.3132e-10, ..., 2.4680e-08, 1.6345e-07, -5.1223e-09], [ 1.3970e-09, 4.6566e-09, -6.9849e-09, ..., 5.1223e-09, 6.3796e-08, 5.4017e-08]], device='cuda:0') Epoch 301, bias, value: tensor([-0.0085, -0.0359, -0.0161, -0.0223, -0.0295, 0.0075, 0.0225, -0.0110, 0.0388, -0.0104], device='cuda:0'), grad: tensor([ 3.0845e-06, 1.6296e-04, -1.6797e-04, -1.7006e-06, 7.9162e-07, -7.4832e-07, 6.1607e-07, 2.8666e-06, 4.1770e-07, -2.9802e-07], device='cuda:0') 100 0.0001 changing lr epoch 300, time 250.30, cls_loss 0.0020 cls_loss_mapping 0.0023 cls_loss_causal 0.4767 re_mapping 0.0046 re_causal 0.0118 /// teacc 98.94 lr 0.00010000 Epoch 302, weight, value: tensor([[-0.1151, -0.2428, -0.0800, ..., -0.0605, 0.1857, 0.1839], [-0.2236, -0.1965, -0.0857, ..., -0.1769, -0.2267, -0.1409], [-0.0754, -0.1698, 0.1420, ..., -0.2148, 0.2484, 0.0994], ..., [-0.1621, 0.0873, 0.0361, ..., 0.2022, -0.2201, -0.2743], [-0.2789, 0.0679, -0.1479, ..., 0.0571, -0.1078, -0.1917], [-0.0250, -0.1356, -0.0913, ..., -0.1191, -0.0651, -0.2222]], device='cuda:0'), grad: tensor([[ 6.0536e-09, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, -1.9092e-08, -1.6298e-08], [ 9.3132e-10, 3.0268e-08, 0.0000e+00, ..., 3.5390e-08, 4.6566e-10, 4.6566e-10], [ 0.0000e+00, 1.7229e-08, 0.0000e+00, ..., 3.5390e-08, -4.1910e-08, -1.2107e-08], ..., [ 1.3970e-09, -1.3364e-07, 0.0000e+00, ..., -1.5367e-07, 1.8626e-09, 1.8626e-09], [ 1.3970e-08, 2.3283e-08, 4.6566e-10, ..., -3.7253e-09, 4.6566e-10, 4.6566e-10], [ 4.6566e-10, 3.7719e-08, -9.3132e-10, ..., 4.3772e-08, 2.0023e-08, 2.0489e-08]], device='cuda:0') Epoch 302, bias, value: tensor([-0.0082, -0.0379, -0.0164, -0.0222, -0.0307, 0.0074, 0.0226, -0.0090, 0.0389, -0.0107], device='cuda:0'), grad: tensor([-4.3772e-08, 9.0804e-08, 1.6531e-07, 7.4971e-08, 1.0757e-07, 2.5611e-08, -4.1910e-09, -3.8370e-07, -1.3597e-07, 1.0990e-07], device='cuda:0') 100 0.0001 changing lr epoch 301, time 250.54, cls_loss 0.0024 cls_loss_mapping 0.0026 cls_loss_causal 0.4864 re_mapping 0.0048 re_causal 0.0115 /// teacc 99.09 lr 0.00010000 Epoch 303, weight, value: tensor([[-0.1151, -0.2430, -0.0801, ..., -0.0603, 0.1858, 0.1839], [-0.2236, -0.1971, -0.0855, ..., -0.1778, -0.2271, -0.1410], [-0.0755, -0.1700, 0.1425, ..., -0.2150, 0.2489, 0.0996], ..., [-0.1631, 0.0882, 0.0352, ..., 0.2031, -0.2203, -0.2748], [-0.2794, 0.0681, -0.1486, ..., 0.0572, -0.1078, -0.1923], [-0.0256, -0.1361, -0.0907, ..., -0.1197, -0.0651, -0.2222]], device='cuda:0'), grad: tensor([[ 2.5611e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -2.3283e-09, 4.1910e-09], [ 1.4901e-08, 7.4506e-09, 0.0000e+00, ..., 2.7940e-09, 9.3132e-10, 9.3132e-10], [ 3.7253e-09, 2.7474e-08, 0.0000e+00, ..., 1.8626e-09, 4.6566e-10, 0.0000e+00], ..., [ 1.6764e-08, -2.3283e-09, 0.0000e+00, ..., -7.4506e-09, 0.0000e+00, 0.0000e+00], [ 1.7369e-07, -6.0536e-08, 0.0000e+00, ..., -6.9849e-09, 1.8626e-09, 2.3283e-09], [ 7.8231e-08, 5.4948e-08, 0.0000e+00, ..., 8.8476e-09, 1.3970e-09, 9.3132e-10]], device='cuda:0') Epoch 303, bias, value: tensor([-0.0083, -0.0379, -0.0163, -0.0212, -0.0300, 0.0063, 0.0226, -0.0090, 0.0394, -0.0107], device='cuda:0'), grad: tensor([ 2.9337e-08, 3.0920e-07, 1.1642e-07, 3.7253e-07, -4.9081e-07, -1.1958e-06, 5.7369e-07, 1.0012e-07, -9.5461e-08, 2.9802e-07], device='cuda:0') 100 0.0001 changing lr epoch 302, time 250.18, cls_loss 0.0018 cls_loss_mapping 0.0023 cls_loss_causal 0.4380 re_mapping 0.0047 re_causal 0.0112 /// teacc 99.08 lr 0.00010000 Epoch 304, weight, value: tensor([[-0.1152, -0.2432, -0.0801, ..., -0.0604, 0.1858, 0.1840], [-0.2272, -0.1972, -0.0855, ..., -0.1781, -0.2275, -0.1448], [-0.0720, -0.1703, 0.1425, ..., -0.2151, 0.2491, 0.1027], ..., [-0.1642, 0.0887, 0.0352, ..., 0.2034, -0.2203, -0.2752], [-0.2800, 0.0680, -0.1487, ..., 0.0573, -0.1082, -0.1935], [-0.0265, -0.1364, -0.0905, ..., -0.1200, -0.0651, -0.2222]], device='cuda:0'), grad: tensor([[ 6.9384e-08, 4.4703e-08, 0.0000e+00, ..., 3.4925e-09, 1.5437e-07, 6.7987e-08], [ 1.6298e-09, 1.2573e-08, 0.0000e+00, ..., 2.6077e-08, 2.8871e-08, 2.3283e-10], [ 1.7928e-08, 1.1642e-08, 0.0000e+00, ..., 6.9849e-10, 6.9849e-08, 1.5367e-08], ..., [ 2.5611e-09, -2.5844e-08, 0.0000e+00, ..., -6.3796e-08, 2.3982e-08, 4.6566e-10], [ 4.4238e-09, 3.2596e-09, 0.0000e+00, ..., 2.3283e-10, 2.7940e-09, 1.1642e-09], [ 6.9849e-10, 1.3504e-08, 0.0000e+00, ..., 3.0501e-08, 1.6554e-07, 2.3283e-10]], device='cuda:0') Epoch 304, bias, value: tensor([-0.0083, -0.0385, -0.0147, -0.0213, -0.0299, 0.0060, 0.0227, -0.0089, 0.0395, -0.0110], device='cuda:0'), grad: tensor([ 6.2119e-07, 2.2468e-07, 4.7521e-07, -5.7416e-07, -3.2056e-06, 6.9616e-08, 9.4762e-08, 1.4831e-07, 2.7707e-08, 2.1309e-06], device='cuda:0') 100 0.0001 changing lr epoch 303, time 250.42, cls_loss 0.0018 cls_loss_mapping 0.0019 cls_loss_causal 0.4951 re_mapping 0.0045 re_causal 0.0119 /// teacc 99.05 lr 0.00010000 Epoch 305, weight, value: tensor([[-0.1152, -0.2435, -0.0801, ..., -0.0605, 0.1859, 0.1840], [-0.2274, -0.1974, -0.0855, ..., -0.1785, -0.2276, -0.1447], [-0.0721, -0.1705, 0.1425, ..., -0.2152, 0.2490, 0.1027], ..., [-0.1645, 0.0887, 0.0352, ..., 0.2034, -0.2204, -0.2764], [-0.2804, 0.0685, -0.1486, ..., 0.0580, -0.1086, -0.1940], [-0.0276, -0.1366, -0.0904, ..., -0.1201, -0.0651, -0.2222]], device='cuda:0'), grad: tensor([[ 1.1642e-09, 1.6298e-09, 2.3283e-10, ..., 1.8626e-09, -7.7672e-07, -5.6252e-07], [ 4.1910e-09, 1.0477e-08, 1.1642e-09, ..., 1.1409e-08, 1.9558e-08, 1.2107e-08], [ 6.9849e-10, 2.5611e-09, 0.0000e+00, ..., 2.7940e-09, 1.0012e-08, 6.9849e-09], ..., [ 4.6566e-10, -2.3516e-08, 3.4925e-09, ..., -2.5611e-08, 1.0477e-08, 7.4506e-09], [ 1.3970e-09, 1.8626e-09, 2.3283e-10, ..., 1.8626e-09, 3.9116e-08, 2.9337e-08], [ 4.6566e-10, 5.3551e-09, -9.3132e-09, ..., 5.8208e-09, 3.3434e-07, 2.4191e-07]], device='cuda:0') Epoch 305, bias, value: tensor([-0.0083, -0.0380, -0.0151, -0.0218, -0.0300, 0.0061, 0.0228, -0.0091, 0.0395, -0.0110], device='cuda:0'), grad: tensor([-2.2426e-06, 9.5461e-08, 4.2841e-08, 4.4005e-08, -7.2177e-07, 2.1793e-07, 1.5469e-06, -1.6531e-08, 1.2713e-07, 9.1270e-07], device='cuda:0') 100 0.0001 changing lr epoch 304, time 250.51, cls_loss 0.0018 cls_loss_mapping 0.0023 cls_loss_causal 0.5029 re_mapping 0.0048 re_causal 0.0123 /// teacc 99.11 lr 0.00010000 Epoch 306, weight, value: tensor([[-0.1152, -0.2437, -0.0801, ..., -0.0606, 0.1859, 0.1841], [-0.2274, -0.1976, -0.0855, ..., -0.1784, -0.2276, -0.1448], [-0.0721, -0.1706, 0.1425, ..., -0.2152, 0.2492, 0.1033], ..., [-0.1646, 0.0890, 0.0352, ..., 0.2039, -0.2206, -0.2784], [-0.2809, 0.0684, -0.1487, ..., 0.0581, -0.1091, -0.1950], [-0.0283, -0.1370, -0.0903, ..., -0.1207, -0.0651, -0.2223]], device='cuda:0'), grad: tensor([[ 9.3132e-09, 2.5379e-08, 0.0000e+00, ..., 6.4028e-08, -6.3796e-08, -4.7032e-08], [ 6.0536e-09, 9.0338e-08, 0.0000e+00, ..., 1.5250e-07, 3.9581e-09, 2.3283e-09], [ 1.6298e-09, 1.3015e-07, 0.0000e+00, ..., 1.1059e-07, -3.8417e-08, -1.3504e-08], ..., [ 9.3132e-10, -1.2284e-06, 0.0000e+00, ..., -1.5274e-06, 1.6298e-09, -1.5600e-08], [ 1.4203e-08, 4.7730e-08, 0.0000e+00, ..., 4.0978e-08, 3.0268e-09, 2.7940e-09], [ 1.6764e-08, 1.6973e-07, 0.0000e+00, ..., 4.8103e-07, 5.2620e-08, 3.8184e-08]], device='cuda:0') Epoch 306, bias, value: tensor([-0.0083, -0.0380, -0.0151, -0.0220, -0.0298, 0.0061, 0.0229, -0.0092, 0.0398, -0.0110], device='cuda:0'), grad: tensor([ 1.0873e-07, -3.0398e-06, 2.3795e-07, 3.9153e-06, -1.0524e-06, -2.3190e-06, 8.1258e-07, -8.3493e-07, 1.6624e-07, 2.0079e-06], device='cuda:0') 100 0.0001 changing lr epoch 305, time 251.16, cls_loss 0.0024 cls_loss_mapping 0.0026 cls_loss_causal 0.4824 re_mapping 0.0046 re_causal 0.0110 /// teacc 99.08 lr 0.00010000 Epoch 307, weight, value: tensor([[-0.1153, -0.2441, -0.0804, ..., -0.0606, 0.1860, 0.1841], [-0.2276, -0.1977, -0.0857, ..., -0.1789, -0.2277, -0.1453], [-0.0721, -0.1708, 0.1426, ..., -0.2153, 0.2495, 0.1042], ..., [-0.1658, 0.0894, 0.0333, ..., 0.2051, -0.2209, -0.2794], [-0.2812, 0.0683, -0.1490, ..., 0.0585, -0.1097, -0.1960], [-0.0279, -0.1374, -0.0865, ..., -0.1220, -0.0651, -0.2223]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -5.7509e-08, -3.4925e-08], [ 0.0000e+00, 3.0268e-09, 0.0000e+00, ..., 4.4238e-09, 3.0268e-09, 1.6298e-09], [ 0.0000e+00, 3.3993e-08, 0.0000e+00, ..., 4.9826e-08, -3.0268e-08, -1.7928e-08], ..., [ 0.0000e+00, -1.8300e-07, 0.0000e+00, ..., -2.1793e-07, 6.9849e-09, 3.4925e-09], [ 1.3271e-08, 1.4203e-07, 0.0000e+00, ..., 1.5926e-07, 2.5611e-08, 1.6531e-08], [ 4.6566e-10, 2.3283e-09, 0.0000e+00, ..., 6.5193e-09, 3.5856e-08, 2.0023e-08]], device='cuda:0') Epoch 307, bias, value: tensor([-0.0083, -0.0381, -0.0152, -0.0215, -0.0315, 0.0050, 0.0230, -0.0089, 0.0399, -0.0105], device='cuda:0'), grad: tensor([-1.1781e-07, -1.7928e-08, -9.0804e-09, 1.3039e-08, -6.4960e-08, -5.6578e-08, 4.9593e-08, -3.0617e-07, 3.4692e-07, 1.6228e-07], device='cuda:0') 100 0.0001 changing lr epoch 306, time 250.91, cls_loss 0.0018 cls_loss_mapping 0.0019 cls_loss_causal 0.4890 re_mapping 0.0046 re_causal 0.0118 /// teacc 99.12 lr 0.00010000 Epoch 308, weight, value: tensor([[-0.1154, -0.2448, -0.0804, ..., -0.0607, 0.1860, 0.1842], [-0.2294, -0.1977, -0.0857, ..., -0.1788, -0.2278, -0.1456], [-0.0720, -0.1707, 0.1427, ..., -0.2154, 0.2495, 0.1044], ..., [-0.1676, 0.0893, 0.0332, ..., 0.2050, -0.2209, -0.2800], [-0.2818, 0.0691, -0.1490, ..., 0.0595, -0.1097, -0.1970], [-0.0288, -0.1377, -0.0863, ..., -0.1229, -0.0652, -0.2225]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 2.3283e-10, -2.2096e-07, -2.5402e-07], [ 6.9849e-10, 9.3132e-10, 0.0000e+00, ..., 5.5879e-09, -4.6566e-10, 1.3970e-09], [ 0.0000e+00, -2.3283e-10, 0.0000e+00, ..., 2.3283e-10, -8.7311e-08, -6.0769e-08], ..., [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 1.0710e-08, 5.5879e-09, 4.4238e-09], [ 1.3970e-09, 2.3283e-10, 0.0000e+00, ..., -2.0722e-08, 3.2596e-09, 2.7940e-09], [ 9.3132e-10, 1.1642e-09, 0.0000e+00, ..., 2.3283e-09, 1.9325e-08, 2.1886e-08]], device='cuda:0') Epoch 308, bias, value: tensor([-0.0084, -0.0382, -0.0151, -0.0203, -0.0316, 0.0041, 0.0231, -0.0090, 0.0406, -0.0106], device='cuda:0'), grad: tensor([-4.9360e-07, -1.1409e-08, -1.8999e-07, 2.6543e-08, 2.7474e-07, 1.0920e-07, 3.0617e-07, 8.3121e-08, -4.7963e-08, -4.0047e-08], device='cuda:0') 100 0.0001 changing lr epoch 307, time 250.64, cls_loss 0.0015 cls_loss_mapping 0.0016 cls_loss_causal 0.4640 re_mapping 0.0044 re_causal 0.0114 /// teacc 99.10 lr 0.00010000 Epoch 309, weight, value: tensor([[-0.1154, -0.2460, -0.0805, ..., -0.0606, 0.1861, 0.1843], [-0.2301, -0.1979, -0.0857, ..., -0.1786, -0.2281, -0.1458], [-0.0720, -0.1704, 0.1427, ..., -0.2155, 0.2498, 0.1057], ..., [-0.1688, 0.0893, 0.0330, ..., 0.2050, -0.2213, -0.2814], [-0.2824, 0.0690, -0.1491, ..., 0.0595, -0.1109, -0.1984], [-0.0299, -0.1377, -0.0860, ..., -0.1230, -0.0654, -0.2227]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 2.3283e-10], [-8.8476e-09, 1.6298e-09, 0.0000e+00, ..., 2.0955e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 1.1642e-09, 0.0000e+00, 0.0000e+00], ..., [ 7.2177e-09, -2.7940e-09, 0.0000e+00, ..., -6.5193e-09, 0.0000e+00, 0.0000e+00], [ 5.3551e-09, 3.9581e-09, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 2.0955e-09, 2.3283e-09, 0.0000e+00, ..., 2.0955e-09, 4.6566e-10, 2.3283e-10]], device='cuda:0') Epoch 309, bias, value: tensor([-0.0083, -0.0374, -0.0151, -0.0202, -0.0315, 0.0040, 0.0232, -0.0099, 0.0404, -0.0106], device='cuda:0'), grad: tensor([ 1.6298e-09, -1.3411e-07, 3.7253e-09, -4.4238e-08, 7.6834e-09, 5.7975e-08, 2.3283e-09, 1.2824e-06, 2.1420e-08, -1.1958e-06], device='cuda:0') 100 0.0001 changing lr epoch 308, time 250.31, cls_loss 0.0020 cls_loss_mapping 0.0018 cls_loss_causal 0.4983 re_mapping 0.0043 re_causal 0.0112 /// teacc 99.11 lr 0.00010000 Epoch 310, weight, value: tensor([[-0.1156, -0.2465, -0.0806, ..., -0.0606, 0.1860, 0.1844], [-0.2302, -0.1983, -0.0858, ..., -0.1797, -0.2287, -0.1462], [-0.0721, -0.1703, 0.1428, ..., -0.2157, 0.2501, 0.1068], ..., [-0.1702, 0.0894, 0.0328, ..., 0.2058, -0.2212, -0.2829], [-0.2827, 0.0690, -0.1491, ..., 0.0595, -0.1105, -0.1993], [-0.0311, -0.1380, -0.0855, ..., -0.1232, -0.0651, -0.2228]], device='cuda:0'), grad: tensor([[ 5.3551e-09, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 3.0035e-08, 4.4238e-09], [ 5.1223e-09, 3.0268e-09, 0.0000e+00, ..., 2.7940e-09, 3.2596e-08, 0.0000e+00], [ 2.7940e-09, 5.5879e-09, 0.0000e+00, ..., 8.6147e-09, 1.7928e-08, 2.3283e-10], ..., [ 2.3283e-09, -8.3121e-08, 0.0000e+00, ..., -1.2922e-07, 6.2864e-09, 0.0000e+00], [ 2.7940e-09, 6.1002e-08, 0.0000e+00, ..., 9.0338e-08, 6.9849e-10, 2.3283e-10], [ 2.5611e-09, 5.3551e-09, 0.0000e+00, ..., 9.5461e-09, 6.2864e-09, 6.9849e-10]], device='cuda:0') Epoch 310, bias, value: tensor([-0.0086, -0.0379, -0.0150, -0.0194, -0.0318, 0.0036, 0.0232, -0.0097, 0.0408, -0.0101], device='cuda:0'), grad: tensor([ 9.2201e-08, 1.2224e-07, 7.6368e-08, 1.1409e-08, -1.2070e-06, 2.0256e-08, 7.4133e-07, -1.3621e-07, 1.3551e-07, 1.5902e-07], device='cuda:0') 100 0.0001 changing lr epoch 309, time 250.37, cls_loss 0.0023 cls_loss_mapping 0.0022 cls_loss_causal 0.4856 re_mapping 0.0043 re_causal 0.0115 /// teacc 99.07 lr 0.00010000 Epoch 311, weight, value: tensor([[-0.1151, -0.2467, -0.0807, ..., -0.0606, 0.1860, 0.1851], [-0.2302, -0.1984, -0.0858, ..., -0.1801, -0.2298, -0.1462], [-0.0722, -0.1706, 0.1428, ..., -0.2160, 0.2513, 0.1064], ..., [-0.1707, 0.0894, 0.0328, ..., 0.2057, -0.2222, -0.2836], [-0.2830, 0.0694, -0.1492, ..., 0.0610, -0.1104, -0.1996], [-0.0309, -0.1381, -0.0855, ..., -0.1237, -0.0643, -0.2229]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 4.6566e-10, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 1.0245e-08, 0.0000e+00, ..., 2.0722e-08, 2.0955e-09, 0.0000e+00], [ 0.0000e+00, 2.0955e-09, 0.0000e+00, ..., 4.1910e-09, -2.5379e-08, 0.0000e+00], ..., [ 0.0000e+00, -4.8662e-08, 0.0000e+00, ..., -9.5461e-08, 6.7521e-09, 4.6566e-10], [ 1.3970e-09, 2.0489e-08, 0.0000e+00, ..., 3.7253e-08, 1.0012e-08, 0.0000e+00], [ 4.6566e-10, 2.3283e-09, -0.0000e+00, ..., 3.9581e-09, 9.3132e-10, 2.3283e-10]], device='cuda:0') Epoch 311, bias, value: tensor([-0.0091, -0.0376, -0.0138, -0.0189, -0.0326, 0.0028, 0.0228, -0.0105, 0.0426, -0.0089], device='cuda:0'), grad: tensor([ 5.3551e-09, 9.8255e-08, -6.0536e-08, 8.2422e-08, -2.0373e-07, 2.2817e-08, 1.0943e-08, -1.2713e-07, 7.4506e-08, 1.0594e-07], device='cuda:0') 100 0.0001 changing lr epoch 310, time 250.34, cls_loss 0.0020 cls_loss_mapping 0.0016 cls_loss_causal 0.4781 re_mapping 0.0048 re_causal 0.0119 /// teacc 99.05 lr 0.00010000 Epoch 312, weight, value: tensor([[-0.1158, -0.2469, -0.0807, ..., -0.0613, 0.1859, 0.1851], [-0.2304, -0.1985, -0.0858, ..., -0.1803, -0.2301, -0.1465], [-0.0723, -0.1708, 0.1436, ..., -0.2162, 0.2519, 0.1069], ..., [-0.1712, 0.0895, 0.0328, ..., 0.2059, -0.2226, -0.2850], [-0.2846, 0.0692, -0.1492, ..., 0.0613, -0.1105, -0.2001], [-0.0332, -0.1382, -0.0855, ..., -0.1236, -0.0645, -0.2231]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 3.0268e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-09, 0.0000e+00, ..., 1.1409e-08, 3.7253e-09, 1.8626e-09], [ 0.0000e+00, 1.6298e-09, 0.0000e+00, ..., 7.4506e-09, -9.0804e-09, -4.6566e-09], ..., [ 2.3283e-10, -3.6322e-08, 0.0000e+00, ..., -8.4285e-08, 4.8894e-09, 2.5611e-09], [ 9.3132e-10, 2.3283e-10, 0.0000e+00, ..., 1.1642e-09, 2.3283e-10, 2.3283e-10], [ 4.4238e-09, 2.2817e-08, 0.0000e+00, ..., 5.1921e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 312, bias, value: tensor([-0.0093, -0.0374, -0.0136, -0.0194, -0.0338, 0.0032, 0.0229, -0.0110, 0.0428, -0.0082], device='cuda:0'), grad: tensor([ 8.6147e-09, -1.2526e-07, 1.3039e-08, 4.2375e-08, -6.8918e-08, -2.2817e-08, 2.7940e-09, -4.0978e-08, 5.1223e-09, 1.9115e-07], device='cuda:0') 100 0.0001 changing lr epoch 311, time 250.39, cls_loss 0.0020 cls_loss_mapping 0.0016 cls_loss_causal 0.4989 re_mapping 0.0043 re_causal 0.0120 /// teacc 99.03 lr 0.00010000 Epoch 313, weight, value: tensor([[-0.1161, -0.2470, -0.0807, ..., -0.0613, 0.1857, 0.1850], [-0.2306, -0.1986, -0.0859, ..., -0.1822, -0.2303, -0.1469], [-0.0723, -0.1716, 0.1437, ..., -0.2162, 0.2522, 0.1080], ..., [-0.1714, 0.0897, 0.0328, ..., 0.2073, -0.2230, -0.2873], [-0.2850, 0.0692, -0.1476, ..., 0.0605, -0.1109, -0.2009], [-0.0326, -0.1383, -0.0857, ..., -0.1242, -0.0642, -0.2232]], device='cuda:0'), grad: tensor([[ 1.1642e-09, 0.0000e+00, 0.0000e+00, ..., 6.9849e-10, -3.0268e-09, -1.1642e-08], [ 2.3283e-10, 1.1642e-09, 0.0000e+00, ..., 1.8161e-08, -1.8557e-07, 2.3283e-10], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 7.9162e-09, 1.4668e-08, 4.6566e-10], ..., [ 2.3283e-10, -3.0268e-09, 0.0000e+00, ..., 8.6147e-09, 2.7707e-08, 2.3283e-10], [ 9.7789e-09, 1.1642e-09, 0.0000e+00, ..., -3.7486e-08, 8.1491e-08, 6.9849e-10], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 9.0804e-09, 2.5379e-08, 1.2806e-08]], device='cuda:0') Epoch 313, bias, value: tensor([-0.0098, -0.0373, -0.0135, -0.0174, -0.0339, 0.0017, 0.0229, -0.0109, 0.0421, -0.0079], device='cuda:0'), grad: tensor([ 1.5670e-07, -2.4159e-06, 2.7148e-07, 4.5868e-08, 3.0105e-07, 6.4261e-08, 3.9721e-07, 4.1653e-07, 8.2841e-07, -5.6578e-08], device='cuda:0') 100 0.0001 changing lr epoch 312, time 250.52, cls_loss 0.0021 cls_loss_mapping 0.0028 cls_loss_causal 0.4984 re_mapping 0.0043 re_causal 0.0113 /// teacc 99.10 lr 0.00010000 Epoch 314, weight, value: tensor([[-0.1162, -0.2470, -0.0807, ..., -0.0614, 0.1857, 0.1851], [-0.2307, -0.1986, -0.0859, ..., -0.1828, -0.2303, -0.1469], [-0.0723, -0.1721, 0.1436, ..., -0.2163, 0.2524, 0.1084], ..., [-0.1721, 0.0895, 0.0327, ..., 0.2078, -0.2232, -0.2881], [-0.2852, 0.0691, -0.1477, ..., 0.0608, -0.1096, -0.2023], [-0.0326, -0.1384, -0.0857, ..., -0.1245, -0.0641, -0.2233]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, -1.4435e-08, -8.6147e-09], [ 0.0000e+00, 8.0327e-08, 0.0000e+00, ..., 1.5460e-07, 4.6566e-10, 2.3283e-10], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 4.6566e-09, 4.6566e-10, 2.3283e-10], ..., [ 0.0000e+00, -1.1665e-07, 0.0000e+00, ..., -2.2771e-07, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 6.9849e-10, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 3.1898e-08, -2.3283e-10, ..., 6.4494e-08, 8.3819e-09, 5.1223e-09]], device='cuda:0') Epoch 314, bias, value: tensor([-0.0100, -0.0377, -0.0137, -0.0173, -0.0359, 0.0015, 0.0226, -0.0105, 0.0426, -0.0068], device='cuda:0'), grad: tensor([-9.0804e-09, 7.4878e-07, 2.1188e-08, 5.1223e-09, -9.3691e-07, 1.7695e-08, 3.6531e-07, -5.9418e-07, 1.7928e-08, 3.7695e-07], device='cuda:0') 100 0.0001 changing lr epoch 313, time 250.66, cls_loss 0.0020 cls_loss_mapping 0.0019 cls_loss_causal 0.4778 re_mapping 0.0046 re_causal 0.0115 /// teacc 99.06 lr 0.00010000 Epoch 315, weight, value: tensor([[-0.1163, -0.2472, -0.0808, ..., -0.0613, 0.1858, 0.1852], [-0.2308, -0.1987, -0.0862, ..., -0.1830, -0.2305, -0.1461], [-0.0723, -0.1730, 0.1464, ..., -0.2162, 0.2533, 0.1096], ..., [-0.1725, 0.0896, 0.0327, ..., 0.2081, -0.2240, -0.2908], [-0.2856, 0.0690, -0.1477, ..., 0.0607, -0.1099, -0.2026], [-0.0329, -0.1381, -0.0856, ..., -0.1247, -0.0642, -0.2235]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 2.0955e-09, 0.0000e+00, ..., 2.5611e-09, -4.0233e-06, -3.5595e-06], [ 2.3283e-10, 6.7521e-09, 0.0000e+00, ..., 5.3551e-09, 5.9372e-08, 4.9360e-08], [ 0.0000e+00, 6.4727e-08, 0.0000e+00, ..., 7.5903e-08, 2.3982e-08, 3.1432e-08], ..., [ 4.6566e-10, -5.6811e-08, 0.0000e+00, ..., -3.4226e-08, 3.4692e-08, 2.2817e-08], [ 4.6566e-10, -4.4471e-08, 0.0000e+00, ..., -1.5344e-07, 1.2526e-07, 1.1479e-07], [ 0.0000e+00, 2.1886e-08, 0.0000e+00, ..., 9.8487e-08, 1.2759e-06, 1.1278e-06]], device='cuda:0') Epoch 315, bias, value: tensor([-0.0099, -0.0368, -0.0138, -0.0173, -0.0353, 0.0016, 0.0225, -0.0117, 0.0426, -0.0064], device='cuda:0'), grad: tensor([-1.0304e-05, 1.4459e-07, 2.8731e-07, 1.8184e-07, 3.0268e-08, 3.3015e-07, 5.9418e-06, 2.3004e-07, -6.9849e-07, 3.8594e-06], device='cuda:0') 100 0.0001 changing lr epoch 314, time 250.55, cls_loss 0.0020 cls_loss_mapping 0.0019 cls_loss_causal 0.4872 re_mapping 0.0044 re_causal 0.0114 /// teacc 99.02 lr 0.00010000 Epoch 316, weight, value: tensor([[-0.1163, -0.2474, -0.0808, ..., -0.0611, 0.1861, 0.1857], [-0.2311, -0.2000, -0.0862, ..., -0.1845, -0.2310, -0.1476], [-0.0723, -0.1731, 0.1466, ..., -0.2158, 0.2541, 0.1111], ..., [-0.1728, 0.0907, 0.0327, ..., 0.2089, -0.2250, -0.2933], [-0.2869, 0.0688, -0.1478, ..., 0.0605, -0.1108, -0.2054], [-0.0331, -0.1385, -0.0856, ..., -0.1253, -0.0644, -0.2238]], device='cuda:0'), grad: tensor([[ 1.1874e-08, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, -1.0245e-08, -7.4506e-09], [ 4.6566e-09, 4.1910e-09, 0.0000e+00, ..., 2.3283e-10, -1.6089e-07, 7.9162e-09], [ 2.4191e-07, 1.6298e-09, 0.0000e+00, ..., 0.0000e+00, 1.7998e-07, -1.1642e-08], ..., [ 6.6590e-08, 6.3097e-08, 0.0000e+00, ..., 4.6566e-10, 6.0536e-09, 1.6298e-08], [ 2.0256e-07, 4.6566e-10, 0.0000e+00, ..., 4.8429e-08, 2.6077e-08, 4.6566e-10], [-3.3039e-07, 6.9849e-10, 0.0000e+00, ..., 1.8161e-08, -8.2189e-08, 6.0536e-09]], device='cuda:0') Epoch 316, bias, value: tensor([-0.0097, -0.0375, -0.0135, -0.0174, -0.0353, 0.0014, 0.0227, -0.0108, 0.0423, -0.0070], device='cuda:0'), grad: tensor([-4.6566e-09, -1.0999e-06, 1.6754e-06, -2.3982e-07, -8.1817e-07, -2.0699e-07, 2.9337e-07, 3.0128e-07, 4.7497e-07, -3.8208e-07], device='cuda:0') 100 0.0001 changing lr epoch 315, time 250.12, cls_loss 0.0020 cls_loss_mapping 0.0020 cls_loss_causal 0.5071 re_mapping 0.0043 re_causal 0.0119 /// teacc 99.00 lr 0.00010000 Epoch 317, weight, value: tensor([[-0.1163, -0.2481, -0.0808, ..., -0.0612, 0.1857, 0.1858], [-0.2311, -0.2001, -0.0862, ..., -0.1847, -0.2310, -0.1469], [-0.0724, -0.1730, 0.1468, ..., -0.2161, 0.2544, 0.1115], ..., [-0.1734, 0.0896, 0.0327, ..., 0.2070, -0.2253, -0.2943], [-0.2873, 0.0688, -0.1478, ..., 0.0606, -0.1111, -0.2053], [-0.0335, -0.1363, -0.0855, ..., -0.1226, -0.0637, -0.2239]], device='cuda:0'), grad: tensor([[ 1.4668e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -7.4506e-09, -9.7789e-09], [ 6.9849e-10, 2.3283e-10, 0.0000e+00, ..., 9.3132e-10, 2.3283e-10, -4.6566e-10], [ 1.6298e-09, 0.0000e+00, 0.0000e+00, ..., 6.5193e-09, 6.9849e-10, 6.9849e-10], ..., [ 6.9849e-10, 2.3283e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 6.9849e-10], [ 6.5193e-09, 6.9849e-10, 0.0000e+00, ..., -8.1491e-09, 1.3970e-09, 1.1642e-09], [-2.9802e-08, 2.3283e-10, -0.0000e+00, ..., -4.6566e-10, 2.1188e-08, 1.7695e-08]], device='cuda:0') Epoch 317, bias, value: tensor([-0.0103, -0.0372, -0.0138, -0.0173, -0.0340, 0.0013, 0.0227, -0.0133, 0.0423, -0.0042], device='cuda:0'), grad: tensor([ 2.2352e-08, 1.6531e-08, 1.0594e-07, 3.0105e-07, 4.0280e-08, 4.9593e-08, -2.9104e-08, 4.5402e-08, -1.1642e-08, -5.3458e-07], device='cuda:0') 100 0.0001 changing lr epoch 316, time 250.05, cls_loss 0.0019 cls_loss_mapping 0.0019 cls_loss_causal 0.4770 re_mapping 0.0044 re_causal 0.0113 /// teacc 98.96 lr 0.00010000 Epoch 318, weight, value: tensor([[-0.1164, -0.2485, -0.0808, ..., -0.0612, 0.1858, 0.1858], [-0.2312, -0.2001, -0.0862, ..., -0.1849, -0.2311, -0.1460], [-0.0728, -0.1737, 0.1468, ..., -0.2164, 0.2543, 0.1111], ..., [-0.1739, 0.0891, 0.0326, ..., 0.2048, -0.2253, -0.2947], [-0.2879, 0.0718, -0.1479, ..., 0.0646, -0.1115, -0.2064], [-0.0339, -0.1369, -0.0853, ..., -0.1229, -0.0638, -0.2240]], device='cuda:0'), grad: tensor([[ 5.8208e-09, 3.2596e-09, 0.0000e+00, ..., 0.0000e+00, -3.4925e-09, -3.0268e-09], [ 1.1874e-08, 1.0012e-08, 0.0000e+00, ..., 1.1642e-09, 3.0268e-09, 1.1642e-09], [ 1.8626e-09, 5.3551e-09, 0.0000e+00, ..., 2.5611e-09, -1.0012e-08, -4.8894e-09], ..., [ 1.8394e-08, -1.1642e-09, 0.0000e+00, ..., -1.1874e-08, 6.2864e-09, 3.0268e-09], [ 5.6112e-08, 3.6322e-08, 0.0000e+00, ..., 1.2107e-08, 2.3283e-10, 2.3283e-10], [ 3.3760e-08, 2.8405e-08, 0.0000e+00, ..., 2.5611e-09, 1.8626e-09, 1.8626e-09]], device='cuda:0') Epoch 318, bias, value: tensor([-0.0103, -0.0368, -0.0141, -0.0171, -0.0339, 0.0011, 0.0229, -0.0138, 0.0446, -0.0044], device='cuda:0'), grad: tensor([ 9.3132e-09, 1.1273e-05, -1.4668e-08, -1.7476e-04, -1.1563e-05, 1.7452e-04, 1.5600e-08, 4.6799e-08, 2.3213e-07, 2.3632e-07], device='cuda:0') 100 0.0001 changing lr epoch 317, time 250.44, cls_loss 0.0025 cls_loss_mapping 0.0022 cls_loss_causal 0.4825 re_mapping 0.0041 re_causal 0.0107 /// teacc 99.04 lr 0.00010000 Epoch 319, weight, value: tensor([[-0.1168, -0.2487, -0.0809, ..., -0.0614, 0.1863, 0.1867], [-0.2323, -0.2003, -0.0862, ..., -0.1831, -0.2312, -0.1468], [-0.0717, -0.1743, 0.1468, ..., -0.2167, 0.2547, 0.1119], ..., [-0.1748, 0.0891, 0.0326, ..., 0.2034, -0.2258, -0.2951], [-0.2925, 0.0741, -0.1482, ..., 0.0641, -0.1121, -0.2079], [-0.0353, -0.1379, -0.0852, ..., -0.1235, -0.0639, -0.2245]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, -3.2783e-07, -2.1420e-07], [ 0.0000e+00, 2.1094e-07, 0.0000e+00, ..., 4.3400e-07, 2.6543e-08, 1.8161e-08], [ 0.0000e+00, 3.7253e-08, 0.0000e+00, ..., 7.6368e-08, 9.7789e-09, 6.0536e-09], ..., [ 0.0000e+00, -3.0315e-07, 0.0000e+00, ..., -6.1560e-07, 1.1176e-08, 6.5193e-09], [ 3.2596e-09, 1.2107e-08, 0.0000e+00, ..., 2.4680e-08, 8.3819e-09, 5.5879e-09], [ 0.0000e+00, 1.2573e-08, -4.6566e-10, ..., 1.7229e-08, 1.1735e-07, 8.0094e-08]], device='cuda:0') Epoch 319, bias, value: tensor([-0.0102, -0.0362, -0.0141, -0.0168, -0.0327, 0.0021, 0.0225, -0.0143, 0.0444, -0.0047], device='cuda:0'), grad: tensor([-6.9803e-07, 1.2238e-06, 2.2445e-07, 1.6065e-07, 4.5169e-08, -1.5832e-08, 3.2084e-07, -1.6149e-06, 8.7079e-08, 2.7614e-07], device='cuda:0') 100 0.0001 changing lr epoch 318, time 250.40, cls_loss 0.0016 cls_loss_mapping 0.0011 cls_loss_causal 0.4987 re_mapping 0.0044 re_causal 0.0119 /// teacc 98.99 lr 0.00010000 Epoch 320, weight, value: tensor([[-0.1170, -0.2488, -0.0810, ..., -0.0616, 0.1863, 0.1868], [-0.2329, -0.2004, -0.0862, ..., -0.1835, -0.2315, -0.1476], [-0.0714, -0.1728, 0.1468, ..., -0.2166, 0.2550, 0.1127], ..., [-0.1755, 0.0892, 0.0326, ..., 0.2036, -0.2262, -0.2961], [-0.2927, 0.0746, -0.1484, ..., 0.0644, -0.1110, -0.2077], [-0.0353, -0.1386, -0.0851, ..., -0.1237, -0.0640, -0.2246]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.4110e-07, -1.7695e-08, 6.9849e-09], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.4435e-08, 1.3970e-09, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8636e-06, -5.1223e-09, 2.8824e-07], ..., [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.0245e-08, 4.1910e-09, 5.5879e-09], [ 3.7253e-09, 0.0000e+00, 0.0000e+00, ..., -5.9940e-06, -9.3132e-10, -9.3831e-07], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., 7.9162e-09, 8.8476e-09, 9.7789e-09]], device='cuda:0') Epoch 320, bias, value: tensor([-0.0102, -0.0362, -0.0141, -0.0169, -0.0324, 0.0014, 0.0228, -0.0144, 0.0456, -0.0049], device='cuda:0'), grad: tensor([ 4.2235e-07, -2.3283e-08, 5.8860e-06, 3.9116e-08, 9.7789e-09, 1.0595e-05, 1.8859e-06, 1.4016e-07, -1.8999e-05, 4.5169e-08], device='cuda:0') 100 0.0001 changing lr epoch 319, time 250.35, cls_loss 0.0019 cls_loss_mapping 0.0020 cls_loss_causal 0.4904 re_mapping 0.0044 re_causal 0.0109 /// teacc 99.11 lr 0.00010000 Epoch 321, weight, value: tensor([[-0.1171, -0.2504, -0.0831, ..., -0.0617, 0.1864, 0.1868], [-0.2333, -0.2006, -0.0862, ..., -0.1840, -0.2317, -0.1479], [-0.0714, -0.1730, 0.1468, ..., -0.2171, 0.2555, 0.1132], ..., [-0.1757, 0.0894, 0.0326, ..., 0.2040, -0.2263, -0.2962], [-0.2929, 0.0745, -0.1485, ..., 0.0646, -0.1111, -0.2079], [-0.0358, -0.1388, -0.0850, ..., -0.1236, -0.0640, -0.2247]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 4.6566e-10, 0.0000e+00, ..., 1.3970e-09, -2.6263e-07, -1.4761e-07], [ 9.5461e-08, 5.5879e-09, 0.0000e+00, ..., -3.0873e-07, 4.5635e-08, 2.5611e-08], [ 1.8626e-09, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 5.5879e-09, 4.6566e-09], ..., [ 5.1223e-09, -2.4820e-07, 0.0000e+00, ..., -1.3411e-07, 4.1910e-09, 2.3283e-09], [ 1.8626e-09, 2.7940e-09, 0.0000e+00, ..., 2.7940e-09, 1.4901e-08, 8.3819e-09], [-1.6298e-08, 2.3749e-07, 0.0000e+00, ..., 4.3446e-07, 1.8114e-07, 1.0012e-07]], device='cuda:0') Epoch 321, bias, value: tensor([-0.0102, -0.0347, -0.0140, -0.0169, -0.0330, 0.0011, 0.0229, -0.0156, 0.0458, -0.0046], device='cuda:0'), grad: tensor([-5.5321e-07, -2.5649e-06, 3.0268e-08, 6.5193e-09, -9.2480e-07, 9.8255e-08, 1.1921e-07, 1.7481e-06, 4.2841e-08, 2.0154e-06], device='cuda:0') 100 0.0001 changing lr epoch 320, time 250.19, cls_loss 0.0023 cls_loss_mapping 0.0023 cls_loss_causal 0.4827 re_mapping 0.0041 re_causal 0.0106 /// teacc 99.03 lr 0.00010000 Epoch 322, weight, value: tensor([[-0.1176, -0.2513, -0.0834, ..., -0.0619, 0.1865, 0.1870], [-0.2333, -0.2008, -0.0861, ..., -0.1844, -0.2317, -0.1480], [-0.0717, -0.1733, 0.1470, ..., -0.2191, 0.2557, 0.1135], ..., [-0.1758, 0.0896, 0.0325, ..., 0.2044, -0.2267, -0.2968], [-0.2934, 0.0748, -0.1486, ..., 0.0657, -0.1111, -0.2081], [-0.0377, -0.1397, -0.0852, ..., -0.1243, -0.0643, -0.2251]], device='cuda:0'), grad: tensor([[ 6.5193e-09, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, -8.6613e-08, -6.4261e-08], [ 6.9849e-09, 1.3970e-09, 0.0000e+00, ..., 4.6566e-09, 8.3819e-09, 3.7253e-09], [ 2.3283e-09, 3.2596e-09, 0.0000e+00, ..., 1.3970e-09, -6.7055e-08, 4.6566e-09], ..., [ 8.8476e-09, 7.9162e-09, 0.0000e+00, ..., 5.5879e-09, 6.1467e-08, 4.6566e-10], [ 5.5209e-06, 5.7276e-08, 0.0000e+00, ..., 3.4384e-06, 2.2817e-08, 1.9092e-08], [ 3.7719e-08, 4.6566e-10, 0.0000e+00, ..., 2.3749e-08, 1.3504e-08, 1.0710e-08]], device='cuda:0') Epoch 322, bias, value: tensor([-0.0102, -0.0346, -0.0145, -0.0172, -0.0328, 0.0015, 0.0228, -0.0155, 0.0472, -0.0050], device='cuda:0'), grad: tensor([-8.9407e-08, 5.2363e-05, -1.3923e-07, 2.7522e-05, 3.0454e-07, -3.8564e-05, 9.4529e-08, 1.0945e-05, 1.1168e-05, -6.3658e-05], device='cuda:0') 100 0.0001 changing lr epoch 321, time 250.39, cls_loss 0.0019 cls_loss_mapping 0.0015 cls_loss_causal 0.4700 re_mapping 0.0044 re_causal 0.0114 /// teacc 99.07 lr 0.00010000 Epoch 323, weight, value: tensor([[-0.1176, -0.2516, -0.0847, ..., -0.0620, 0.1866, 0.1871], [-0.2338, -0.2008, -0.0849, ..., -0.1853, -0.2321, -0.1485], [-0.0717, -0.1753, 0.1475, ..., -0.2205, 0.2565, 0.1141], ..., [-0.1754, 0.0900, 0.0316, ..., 0.2054, -0.2269, -0.2972], [-0.2939, 0.0748, -0.1495, ..., 0.0657, -0.1114, -0.2083], [-0.0391, -0.1399, -0.0862, ..., -0.1248, -0.0643, -0.2252]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 4.6566e-10, 0.0000e+00, ..., 1.8626e-09, -2.7008e-08, -1.5367e-08], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 7.6219e-06, 5.5879e-09, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.6077e-08, 5.1223e-09, 1.3970e-09], ..., [ 0.0000e+00, -4.1910e-09, 0.0000e+00, ..., -8.0615e-06, 4.6566e-10, 0.0000e+00], [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 4.9360e-08, 1.3970e-09, 9.3132e-10], [ 0.0000e+00, 2.7940e-09, -4.6566e-10, ..., 3.5577e-07, 2.2817e-08, 1.2107e-08]], device='cuda:0') Epoch 323, bias, value: tensor([-0.0102, -0.0348, -0.0145, -0.0174, -0.0321, 0.0025, 0.0224, -0.0153, 0.0472, -0.0052], device='cuda:0'), grad: tensor([-2.8405e-08, 2.4199e-05, 1.0896e-07, 2.3283e-08, 3.1665e-08, -5.1223e-09, 1.1642e-08, -2.6479e-05, 8.3726e-07, 1.3541e-06], device='cuda:0') 100 0.0001 changing lr epoch 322, time 250.26, cls_loss 0.0022 cls_loss_mapping 0.0020 cls_loss_causal 0.5031 re_mapping 0.0043 re_causal 0.0115 /// teacc 99.10 lr 0.00010000 Epoch 324, weight, value: tensor([[-0.1176, -0.2518, -0.0843, ..., -0.0622, 0.1868, 0.1874], [-0.2350, -0.2010, -0.0834, ..., -0.1859, -0.2332, -0.1491], [-0.0717, -0.1755, 0.1476, ..., -0.2215, 0.2567, 0.1142], ..., [-0.1762, 0.0912, 0.0307, ..., 0.2070, -0.2271, -0.2974], [-0.2944, 0.0745, -0.1511, ..., 0.0656, -0.1107, -0.2098], [-0.0396, -0.1421, -0.0877, ..., -0.1264, -0.0645, -0.2254]], device='cuda:0'), grad: tensor([[-1.2061e-07, 4.6566e-10, 0.0000e+00, ..., 3.6787e-08, -4.2794e-07, -4.8196e-07], [ 1.3970e-09, 8.8476e-09, 0.0000e+00, ..., 1.3970e-08, -1.8626e-09, 9.3132e-10], [ 4.6566e-10, 6.6590e-08, 0.0000e+00, ..., 1.0757e-07, -8.3819e-09, -9.3132e-09], ..., [ 1.2713e-07, -3.7579e-07, 4.6566e-10, ..., -5.7416e-07, 7.4506e-09, 6.9849e-09], [ 6.2399e-08, 2.9523e-07, 4.6566e-10, ..., 4.9965e-07, 9.3132e-10, 4.6566e-10], [ 1.1176e-08, 1.0245e-08, -4.6566e-10, ..., 1.7695e-08, 1.0710e-08, 1.0710e-08]], device='cuda:0') Epoch 324, bias, value: tensor([-0.0101, -0.0328, -0.0149, -0.0178, -0.0316, 0.0035, 0.0219, -0.0165, 0.0473, -0.0060], device='cuda:0'), grad: tensor([-7.0501e-07, 2.5146e-08, 1.4622e-07, 5.1223e-09, 2.8452e-07, -1.2144e-06, 1.6009e-06, -7.1246e-07, 8.5682e-07, -2.7986e-07], device='cuda:0') 100 0.0001 changing lr epoch 323, time 250.21, cls_loss 0.0022 cls_loss_mapping 0.0015 cls_loss_causal 0.4906 re_mapping 0.0042 re_causal 0.0108 /// teacc 99.09 lr 0.00010000 Epoch 325, weight, value: tensor([[-0.1183, -0.2522, -0.0869, ..., -0.0624, 0.1865, 0.1871], [-0.2353, -0.2014, -0.0835, ..., -0.1870, -0.2341, -0.1497], [-0.0719, -0.1759, 0.1476, ..., -0.2218, 0.2572, 0.1147], ..., [-0.1766, 0.0921, 0.0302, ..., 0.2079, -0.2272, -0.2981], [-0.2950, 0.0743, -0.1521, ..., 0.0658, -0.1101, -0.2107], [-0.0400, -0.1431, -0.0873, ..., -0.1268, -0.0645, -0.2254]], device='cuda:0'), grad: tensor([[ 1.3039e-08, 4.6566e-10, 0.0000e+00, ..., 6.7055e-08, -9.7789e-09, -8.8476e-09], [ 2.1420e-08, 5.8208e-08, 0.0000e+00, ..., 4.4517e-07, 1.8626e-09, 1.3970e-09], [ 4.6566e-10, 1.1176e-08, 0.0000e+00, ..., 6.6543e-07, 1.8626e-09, 9.3132e-10], ..., [ 1.3970e-09, -7.4971e-08, 0.0000e+00, ..., -3.0175e-07, 0.0000e+00, 0.0000e+00], [ 1.1548e-07, 0.0000e+00, 0.0000e+00, ..., -8.0792e-07, 3.2596e-09, 2.3283e-09], [ 5.5879e-09, 1.8626e-09, 0.0000e+00, ..., 1.3504e-08, 3.6787e-08, 2.5611e-08]], device='cuda:0') Epoch 325, bias, value: tensor([-0.0104, -0.0331, -0.0146, -0.0154, -0.0315, 0.0015, 0.0222, -0.0156, 0.0473, -0.0069], device='cuda:0'), grad: tensor([ 4.1071e-07, 1.6429e-06, 4.6603e-06, 5.1782e-07, 4.8429e-08, -1.8291e-06, 1.3085e-06, -4.9081e-07, -6.4038e-06, 1.4110e-07], device='cuda:0') 100 0.0001 changing lr epoch 324, time 250.23, cls_loss 0.0019 cls_loss_mapping 0.0017 cls_loss_causal 0.4962 re_mapping 0.0043 re_causal 0.0112 /// teacc 99.00 lr 0.00010000 Epoch 326, weight, value: tensor([[-0.1192, -0.2528, -0.0872, ..., -0.0621, 0.1859, 0.1868], [-0.2357, -0.2020, -0.0836, ..., -0.1877, -0.2343, -0.1498], [-0.0721, -0.1771, 0.1480, ..., -0.2226, 0.2577, 0.1155], ..., [-0.1760, 0.0932, 0.0298, ..., 0.2088, -0.2276, -0.2985], [-0.2954, 0.0742, -0.1537, ..., 0.0658, -0.1113, -0.2130], [-0.0402, -0.1442, -0.0862, ..., -0.1272, -0.0641, -0.2257]], device='cuda:0'), grad: tensor([[ 1.7695e-08, 4.6566e-10, 4.6566e-10, ..., 1.3970e-09, 2.1420e-08, -0.0000e+00], [ 5.8394e-07, 1.8161e-08, 9.3132e-10, ..., 5.4482e-08, 8.8802e-07, 1.7136e-07], [ 2.7940e-09, -2.5146e-08, -1.3970e-08, ..., 2.9802e-08, -2.5611e-08, 9.3132e-10], ..., [ 0.0000e+00, -3.4971e-07, 9.3132e-10, ..., -1.0906e-06, 9.3132e-10, 0.0000e+00], [ 1.1781e-07, 2.2678e-07, 4.1910e-09, ..., 6.9011e-07, 1.8533e-07, 3.4925e-08], [ 0.0000e+00, 9.5461e-08, -4.6566e-10, ..., 2.9895e-07, 7.4506e-09, 6.5193e-09]], device='cuda:0') Epoch 326, bias, value: tensor([-0.0112, -0.0327, -0.0148, -0.0153, -0.0334, 0.0014, 0.0223, -0.0152, 0.0463, -0.0066], device='cuda:0'), grad: tensor([ 1.2852e-07, 7.9162e-09, 1.8738e-06, 1.5413e-07, 1.6810e-07, 4.5933e-06, -9.9167e-06, -1.7183e-07, 2.4866e-06, 6.7521e-07], device='cuda:0') 100 0.0001 changing lr epoch 325, time 250.31, cls_loss 0.0020 cls_loss_mapping 0.0016 cls_loss_causal 0.4430 re_mapping 0.0044 re_causal 0.0110 /// teacc 99.03 lr 0.00010000 Epoch 327, weight, value: tensor([[-0.1196, -0.2521, -0.0873, ..., -0.0620, 0.1861, 0.1870], [-0.2359, -0.2029, -0.0836, ..., -0.1886, -0.2347, -0.1500], [-0.0722, -0.1772, 0.1486, ..., -0.2221, 0.2586, 0.1166], ..., [-0.1763, 0.0939, 0.0297, ..., 0.2092, -0.2279, -0.2987], [-0.2959, 0.0743, -0.1551, ..., 0.0657, -0.1129, -0.2159], [-0.0406, -0.1444, -0.0860, ..., -0.1274, -0.0646, -0.2265]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 2.0508e-06, 1.8170e-06], [ 9.3132e-10, 1.3970e-09, 0.0000e+00, ..., 2.3283e-09, 2.1420e-08, 1.8161e-08], [ 0.0000e+00, 1.3970e-09, 9.3132e-10, ..., 1.8626e-09, -4.8280e-06, -4.2506e-06], ..., [ 4.6566e-10, -6.9849e-09, 1.8626e-09, ..., -8.8476e-09, 4.2375e-07, 3.7299e-07], [ 1.8626e-09, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, 1.0664e-06, 9.3784e-07], [ 0.0000e+00, 3.7253e-09, -2.7940e-09, ..., 4.6566e-09, 1.1548e-07, 9.2667e-08]], device='cuda:0') Epoch 327, bias, value: tensor([-0.0111, -0.0329, -0.0123, -0.0155, -0.0316, 0.0023, 0.0220, -0.0156, 0.0450, -0.0071], device='cuda:0'), grad: tensor([ 4.8466e-06, 1.0245e-08, -1.1273e-05, 2.5742e-06, 4.6678e-06, -5.4017e-08, 2.0489e-07, 1.1362e-06, 4.4927e-06, -6.6087e-06], device='cuda:0') 100 0.0001 changing lr epoch 326, time 250.23, cls_loss 0.0023 cls_loss_mapping 0.0019 cls_loss_causal 0.4618 re_mapping 0.0043 re_causal 0.0107 /// teacc 99.07 lr 0.00010000 Epoch 328, weight, value: tensor([[-0.1200, -0.2524, -0.0875, ..., -0.0620, 0.1863, 0.1872], [-0.2360, -0.2030, -0.0835, ..., -0.1888, -0.2356, -0.1504], [-0.0723, -0.1776, 0.1485, ..., -0.2225, 0.2596, 0.1176], ..., [-0.1766, 0.0945, 0.0270, ..., 0.2097, -0.2281, -0.2990], [-0.2961, 0.0743, -0.1556, ..., 0.0661, -0.1127, -0.2172], [-0.0411, -0.1452, -0.0837, ..., -0.1280, -0.0650, -0.2271]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 1.0245e-08, -4.0932e-07, -4.4098e-07], [ 8.3819e-09, 2.7474e-08, 0.0000e+00, ..., 1.0151e-07, 1.1176e-08, 1.3970e-09], [ 3.4925e-08, 5.5414e-08, 0.0000e+00, ..., 2.2212e-07, 1.6298e-08, 0.0000e+00], ..., [ 1.0384e-07, 1.1967e-07, 0.0000e+00, ..., -3.2643e-07, 4.1910e-09, 4.6566e-10], [ 1.1642e-08, 9.3132e-10, 0.0000e+00, ..., -2.4606e-06, -8.0466e-06, 1.3970e-09], [ 2.8405e-08, 4.6566e-08, 0.0000e+00, ..., 9.3132e-09, 2.3749e-08, 1.3504e-08]], device='cuda:0') Epoch 328, bias, value: tensor([-0.0112, -0.0330, -0.0119, -0.0155, -0.0305, 0.0021, 0.0218, -0.0158, 0.0453, -0.0072], device='cuda:0'), grad: tensor([-6.3702e-07, 3.1758e-07, 6.7474e-07, -8.9174e-07, 9.3132e-10, 2.3674e-06, 3.6895e-05, -2.9057e-07, -3.8743e-05, 2.1234e-07], device='cuda:0') 100 0.0001 changing lr epoch 327, time 250.51, cls_loss 0.0019 cls_loss_mapping 0.0015 cls_loss_causal 0.4779 re_mapping 0.0044 re_causal 0.0112 /// teacc 99.09 lr 0.00010000 Epoch 329, weight, value: tensor([[-0.1205, -0.2528, -0.0878, ..., -0.0620, 0.1863, 0.1872], [-0.2361, -0.2031, -0.0825, ..., -0.1892, -0.2356, -0.1502], [-0.0724, -0.1777, 0.1484, ..., -0.2227, 0.2597, 0.1176], ..., [-0.1795, 0.0931, 0.0257, ..., 0.2091, -0.2284, -0.2997], [-0.2963, 0.0742, -0.1558, ..., 0.0663, -0.1124, -0.2177], [-0.0411, -0.1453, -0.0835, ..., -0.1281, -0.0651, -0.2273]], device='cuda:0'), grad: tensor([[ 8.8476e-09, 4.7032e-08, 0.0000e+00, ..., 1.1548e-07, -1.1642e-08, -8.8476e-09], [ 0.0000e+00, 0.0000e+00, 4.1910e-09, ..., 0.0000e+00, 6.5193e-09, 1.8626e-09], [ 0.0000e+00, 4.6566e-10, -1.6298e-08, ..., 9.3132e-10, -2.5611e-08, -7.9162e-09], ..., [-9.7789e-09, -5.6345e-08, 1.3970e-09, ..., -1.3877e-07, 2.7940e-09, 9.3132e-10], [ 2.3283e-09, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, 4.1910e-09, 3.2596e-09], [ 1.3970e-09, 4.6566e-09, 0.0000e+00, ..., 1.1642e-08, 6.0536e-09, 4.6566e-09]], device='cuda:0') Epoch 329, bias, value: tensor([-0.0112, -0.0330, -0.0121, -0.0149, -0.0307, 0.0022, 0.0218, -0.0168, 0.0459, -0.0061], device='cuda:0'), grad: tensor([ 2.2212e-07, -1.3970e-09, -5.1223e-08, 3.7253e-09, -1.7229e-07, -3.1199e-08, 7.9628e-08, -2.7847e-07, 2.9802e-08, 2.0303e-07], device='cuda:0') 100 0.0001 changing lr epoch 328, time 250.46, cls_loss 0.0016 cls_loss_mapping 0.0022 cls_loss_causal 0.4780 re_mapping 0.0044 re_causal 0.0116 /// teacc 99.01 lr 0.00010000 Epoch 330, weight, value: tensor([[-0.1205, -0.2531, -0.0875, ..., -0.0621, 0.1863, 0.1877], [-0.2361, -0.2031, -0.0825, ..., -0.1889, -0.2361, -0.1507], [-0.0724, -0.1781, 0.1484, ..., -0.2227, 0.2599, 0.1177], ..., [-0.1800, 0.0931, 0.0255, ..., 0.2091, -0.2285, -0.3000], [-0.2968, 0.0740, -0.1571, ..., 0.0663, -0.1128, -0.2185], [-0.0408, -0.1454, -0.0833, ..., -0.1283, -0.0646, -0.2274]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.8405e-08, 4.6566e-10, ..., 6.8452e-08, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -1.1222e-07, -1.7695e-08, ..., -2.2817e-07, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.9558e-08, 0.0000e+00, ..., 4.7497e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 6.3330e-08, 1.7229e-08, ..., 1.0943e-07, 4.6566e-10, 4.6566e-10]], device='cuda:0') Epoch 330, bias, value: tensor([-0.0114, -0.0329, -0.0120, -0.0150, -0.0309, 0.0022, 0.0216, -0.0170, 0.0465, -0.0058], device='cuda:0'), grad: tensor([ 4.6566e-10, 1.2619e-07, 9.3132e-10, 9.3132e-10, -6.0536e-08, 4.6566e-10, 1.3970e-09, -4.4517e-07, 8.6147e-08, 2.8731e-07], device='cuda:0') 100 0.0001 changing lr epoch 329, time 250.83, cls_loss 0.0020 cls_loss_mapping 0.0022 cls_loss_causal 0.5074 re_mapping 0.0046 re_causal 0.0120 /// teacc 99.05 lr 0.00010000 Epoch 331, weight, value: tensor([[-0.1218, -0.2535, -0.0876, ..., -0.0622, 0.1852, 0.1860], [-0.2370, -0.2032, -0.0826, ..., -0.1895, -0.2362, -0.1509], [-0.0726, -0.1790, 0.1487, ..., -0.2234, 0.2600, 0.1176], ..., [-0.1810, 0.0921, 0.0251, ..., 0.2076, -0.2286, -0.3001], [-0.2970, 0.0763, -0.1572, ..., 0.0691, -0.1130, -0.2189], [-0.0413, -0.1459, -0.0832, ..., -0.1292, -0.0646, -0.2275]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 1.3970e-09, -2.1886e-08, -2.4680e-08], [ 1.3970e-09, 3.0268e-08, 0.0000e+00, ..., 6.4261e-08, 7.9162e-09, 0.0000e+00], [ 4.6566e-10, 1.1176e-08, 0.0000e+00, ..., -1.8300e-07, -2.0163e-07, 0.0000e+00], ..., [ 3.2596e-09, -6.8452e-08, 0.0000e+00, ..., -1.4063e-07, -2.3283e-09, 0.0000e+00], [ 6.9849e-09, 7.4506e-09, 0.0000e+00, ..., 1.6764e-08, 6.5193e-09, 0.0000e+00], [ 1.0245e-08, 3.4459e-08, 0.0000e+00, ..., 5.0291e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 331, bias, value: tensor([-0.0128, -0.0329, -0.0122, -0.0153, -0.0310, 0.0028, 0.0233, -0.0178, 0.0489, -0.0055], device='cuda:0'), grad: tensor([-3.8184e-08, 1.3178e-07, -5.4343e-07, -1.6391e-07, 9.3132e-10, 6.2445e-07, 7.5903e-08, -2.6030e-07, 4.1910e-08, 1.4435e-07], device='cuda:0') 100 0.0001 changing lr epoch 330, time 250.89, cls_loss 0.0017 cls_loss_mapping 0.0018 cls_loss_causal 0.5001 re_mapping 0.0043 re_causal 0.0117 /// teacc 99.02 lr 0.00010000 Epoch 332, weight, value: tensor([[-0.1220, -0.2538, -0.0877, ..., -0.0623, 0.1830, 0.1861], [-0.2371, -0.2039, -0.0826, ..., -0.1896, -0.2367, -0.1511], [-0.0726, -0.1797, 0.1493, ..., -0.2237, 0.2602, 0.1177], ..., [-0.1815, 0.0928, 0.0251, ..., 0.2080, -0.2285, -0.3002], [-0.2974, 0.0761, -0.1577, ..., 0.0689, -0.1132, -0.2202], [-0.0418, -0.1461, -0.0831, ..., -0.1294, -0.0617, -0.2276]], device='cuda:0'), grad: tensor([[ 7.9162e-09, 0.0000e+00, 0.0000e+00, ..., 3.2596e-09, -4.6100e-08, -3.9581e-08], [ 1.3970e-09, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 2.8461e-06, 4.6566e-10], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, -2.9299e-06, 9.3132e-10], ..., [ 3.7253e-09, 4.6566e-10, 0.0000e+00, ..., 1.3970e-09, 3.8184e-08, 0.0000e+00], [ 2.3283e-08, 1.3970e-09, 0.0000e+00, ..., 1.2107e-08, 2.3283e-09, 4.6566e-10], [ 1.0710e-08, 0.0000e+00, 0.0000e+00, ..., 4.6566e-09, 2.0955e-08, 1.3970e-08]], device='cuda:0') Epoch 332, bias, value: tensor([-0.0149, -0.0332, -0.0122, -0.0150, -0.0313, 0.0022, 0.0237, -0.0178, 0.0486, -0.0036], device='cuda:0'), grad: tensor([-6.3330e-08, 8.4043e-06, -8.6650e-06, 1.1688e-07, -2.7008e-08, -3.2736e-07, 2.7195e-07, 1.2945e-07, 7.3109e-08, 7.7765e-08], device='cuda:0') 100 0.0001 changing lr epoch 331, time 250.78, cls_loss 0.0016 cls_loss_mapping 0.0025 cls_loss_causal 0.4626 re_mapping 0.0044 re_causal 0.0113 /// teacc 99.06 lr 0.00010000 Epoch 333, weight, value: tensor([[-0.1220, -0.2541, -0.0878, ..., -0.0623, 0.1841, 0.1875], [-0.2373, -0.2043, -0.0827, ..., -0.1910, -0.2375, -0.1513], [-0.0728, -0.1805, 0.1493, ..., -0.2238, 0.2610, 0.1179], ..., [-0.1824, 0.0932, 0.0251, ..., 0.2088, -0.2288, -0.3006], [-0.2979, 0.0759, -0.1575, ..., 0.0689, -0.1137, -0.2214], [-0.0420, -0.1464, -0.0830, ..., -0.1296, -0.0618, -0.2279]], device='cuda:0'), grad: tensor([[ 2.3283e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 2.7940e-09], [ 6.5193e-09, 4.6566e-09, 0.0000e+00, ..., 1.3039e-08, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 4.6566e-10], ..., [ 1.3970e-09, -7.4506e-09, 0.0000e+00, ..., -6.9849e-09, 4.6566e-10, 4.6566e-10], [ 1.0245e-08, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 8.3819e-09, 1.7229e-08], [ 1.1642e-08, 9.3132e-10, 0.0000e+00, ..., 1.4435e-08, 1.8626e-09, 1.3970e-09]], device='cuda:0') Epoch 333, bias, value: tensor([-0.0142, -0.0338, -0.0117, -0.0130, -0.0317, 0.0007, 0.0225, -0.0176, 0.0485, -0.0033], device='cuda:0'), grad: tensor([ 9.3132e-09, 4.5635e-08, 4.1910e-09, 1.3039e-08, 1.8626e-09, -1.0431e-07, -6.6590e-08, -9.3132e-09, 5.2620e-08, 6.1933e-08], device='cuda:0') 100 0.0001 changing lr epoch 332, time 250.56, cls_loss 0.0016 cls_loss_mapping 0.0020 cls_loss_causal 0.4698 re_mapping 0.0045 re_causal 0.0112 /// teacc 99.03 lr 0.00010000 Epoch 334, weight, value: tensor([[-0.1221, -0.2546, -0.0879, ..., -0.0628, 0.1841, 0.1875], [-0.2377, -0.2044, -0.0828, ..., -0.1912, -0.2385, -0.1514], [-0.0729, -0.1812, 0.1500, ..., -0.2242, 0.2618, 0.1182], ..., [-0.1829, 0.0935, 0.0246, ..., 0.2090, -0.2290, -0.3008], [-0.2981, 0.0759, -0.1569, ..., 0.0691, -0.1139, -0.2217], [-0.0422, -0.1465, -0.0829, ..., -0.1299, -0.0618, -0.2279]], device='cuda:0'), grad: tensor([[ 6.0536e-09, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 8.7544e-08, 2.0955e-08], [ 1.3970e-09, 2.3283e-09, 0.0000e+00, ..., 4.6566e-10, 9.3132e-10, 4.6566e-10], [ 2.7940e-09, 3.2596e-09, 4.6566e-10, ..., 0.0000e+00, 2.2165e-07, 1.3970e-09], ..., [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 0.0000e+00], [ 1.1642e-08, 1.8626e-09, 0.0000e+00, ..., -4.6566e-10, 4.6100e-08, 4.6100e-08], [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 9.7789e-09, 1.8626e-09]], device='cuda:0') Epoch 334, bias, value: tensor([-0.0142, -0.0346, -0.0114, -0.0131, -0.0306, 0.0007, 0.0226, -0.0176, 0.0486, -0.0032], device='cuda:0'), grad: tensor([ 2.9849e-07, 1.6298e-08, 7.2550e-07, 2.0489e-08, -1.1390e-06, 1.5832e-08, -2.1840e-07, 2.7008e-08, 2.8312e-07, -1.6764e-08], device='cuda:0') 100 0.0001 changing lr epoch 333, time 250.30, cls_loss 0.0017 cls_loss_mapping 0.0017 cls_loss_causal 0.4903 re_mapping 0.0043 re_causal 0.0114 /// teacc 98.99 lr 0.00010000 Epoch 335, weight, value: tensor([[-0.1222, -0.2549, -0.0882, ..., -0.0629, 0.1841, 0.1875], [-0.2378, -0.2037, -0.0828, ..., -0.1916, -0.2390, -0.1516], [-0.0733, -0.1813, 0.1501, ..., -0.2242, 0.2628, 0.1187], ..., [-0.1833, 0.0932, 0.0246, ..., 0.2099, -0.2296, -0.3014], [-0.2984, 0.0761, -0.1570, ..., 0.0694, -0.1138, -0.2220], [-0.0417, -0.1471, -0.0838, ..., -0.1312, -0.0619, -0.2279]], device='cuda:0'), grad: tensor([[ 7.4506e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.5461e-08, 3.2131e-08], [ 0.0000e+00, 4.1910e-09, 0.0000e+00, ..., 2.8871e-08, 2.3283e-09, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.4715e-07, -4.1910e-08], ..., [ 0.0000e+00, -8.8476e-09, 0.0000e+00, ..., -5.4482e-08, 6.5193e-09, 9.3132e-10], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.9849e-09, 2.3283e-09], [ 0.0000e+00, 4.1910e-09, 0.0000e+00, ..., 2.4214e-08, -6.5193e-09, 4.6566e-10]], device='cuda:0') Epoch 335, bias, value: tensor([-0.0143, -0.0330, -0.0116, -0.0131, -0.0303, 0.0003, 0.0227, -0.0184, 0.0488, -0.0035], device='cuda:0'), grad: tensor([ 3.1339e-07, 1.2666e-07, -3.3714e-07, 1.2014e-07, 7.1246e-08, 8.6613e-08, -1.0990e-07, 8.7079e-08, 1.8626e-08, -3.7719e-07], device='cuda:0') 100 0.0001 changing lr epoch 334, time 250.56, cls_loss 0.0018 cls_loss_mapping 0.0018 cls_loss_causal 0.4799 re_mapping 0.0042 re_causal 0.0111 /// teacc 99.03 lr 0.00010000 Epoch 336, weight, value: tensor([[-0.1222, -0.2555, -0.0882, ..., -0.0631, 0.1841, 0.1876], [-0.2380, -0.2044, -0.0828, ..., -0.1925, -0.2391, -0.1516], [-0.0736, -0.1805, 0.1502, ..., -0.2238, 0.2636, 0.1194], ..., [-0.1837, 0.0936, 0.0245, ..., 0.2104, -0.2307, -0.3026], [-0.2987, 0.0760, -0.1572, ..., 0.0694, -0.1140, -0.2224], [-0.0396, -0.1473, -0.0838, ..., -0.1315, -0.0619, -0.2280]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -7.5903e-08, -3.5390e-08], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.0536e-09, 2.7940e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 7.4506e-09, 3.7253e-09], ..., [ 0.0000e+00, -1.8626e-09, 0.0000e+00, ..., -3.7253e-09, 6.9849e-09, 3.2596e-09], [ 6.5193e-09, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 3.2596e-09, 7.9162e-09], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 3.2596e-09, 3.4925e-08, 1.9558e-08]], device='cuda:0') Epoch 336, bias, value: tensor([-0.0143, -0.0331, -0.0113, -0.0136, -0.0293, 0.0006, 0.0228, -0.0187, 0.0488, -0.0036], device='cuda:0'), grad: tensor([-3.0966e-07, -2.0210e-06, 1.1725e-06, -6.9849e-09, 2.2864e-07, 5.0291e-08, 7.9162e-09, 9.6858e-07, 3.9581e-08, -1.3923e-07], device='cuda:0') 100 0.0001 changing lr epoch 335, time 250.08, cls_loss 0.0019 cls_loss_mapping 0.0018 cls_loss_causal 0.4542 re_mapping 0.0042 re_causal 0.0107 /// teacc 99.10 lr 0.00010000 Epoch 337, weight, value: tensor([[-0.1222, -0.2561, -0.0882, ..., -0.0632, 0.1842, 0.1877], [-0.2381, -0.2048, -0.0826, ..., -0.1915, -0.2393, -0.1518], [-0.0738, -0.1805, 0.1505, ..., -0.2242, 0.2645, 0.1203], ..., [-0.1840, 0.0940, 0.0241, ..., 0.2107, -0.2318, -0.3037], [-0.2991, 0.0759, -0.1574, ..., 0.0694, -0.1142, -0.2227], [-0.0393, -0.1480, -0.0837, ..., -0.1328, -0.0619, -0.2281]], device='cuda:0'), grad: tensor([[ 1.4901e-08, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 4.3772e-08, 2.5146e-08], [ 0.0000e+00, 6.5193e-09, 0.0000e+00, ..., 5.7742e-08, 4.6566e-09, 3.7253e-09], [ 0.0000e+00, 3.0827e-07, 0.0000e+00, ..., 2.3469e-07, -2.6077e-07, -2.2352e-07], ..., [ 0.0000e+00, -3.3528e-07, 0.0000e+00, ..., -2.5518e-07, 2.2072e-07, 1.9837e-07], [ 9.3132e-10, 1.1176e-08, 9.3132e-10, ..., -1.6019e-07, 1.2107e-08, 6.5193e-09], [ 0.0000e+00, 3.7253e-09, -1.8626e-09, ..., 4.6566e-09, 1.8626e-09, 9.3132e-10]], device='cuda:0') Epoch 337, bias, value: tensor([-0.0142, -0.0322, -0.0111, -0.0137, -0.0303, -0.0002, 0.0237, -0.0192, 0.0487, -0.0035], device='cuda:0'), grad: tensor([ 1.2759e-07, -2.4587e-07, -2.0489e-07, 1.3970e-08, 5.5879e-09, 2.5239e-07, -4.5635e-08, 4.3400e-07, -3.2037e-07, -2.6077e-08], device='cuda:0') 100 0.0001 changing lr epoch 336, time 250.70, cls_loss 0.0018 cls_loss_mapping 0.0027 cls_loss_causal 0.4963 re_mapping 0.0041 re_causal 0.0113 /// teacc 99.04 lr 0.00010000 Epoch 338, weight, value: tensor([[-0.1223, -0.2564, -0.0888, ..., -0.0633, 0.1849, 0.1889], [-0.2390, -0.2058, -0.0826, ..., -0.1928, -0.2402, -0.1519], [-0.0734, -0.1808, 0.1520, ..., -0.2246, 0.2657, 0.1205], ..., [-0.1843, 0.0949, 0.0235, ..., 0.2118, -0.2323, -0.3040], [-0.2997, 0.0756, -0.1577, ..., 0.0690, -0.1143, -0.2230], [-0.0400, -0.1483, -0.0838, ..., -0.1332, -0.0620, -0.2284]], device='cuda:0'), grad: tensor([[ 3.3528e-08, 1.0151e-07, 0.0000e+00, ..., 2.1327e-07, 1.8626e-09, -0.0000e+00], [ 0.0000e+00, 1.9558e-08, 0.0000e+00, ..., 3.0734e-08, 9.3132e-10, 9.3132e-10], [ 0.0000e+00, 6.5193e-09, 0.0000e+00, ..., 1.1176e-08, -2.7940e-09, -1.8626e-09], ..., [-4.8429e-08, -3.0175e-07, 0.0000e+00, ..., -5.5786e-07, 9.3132e-10, 0.0000e+00], [ 1.6764e-08, 1.8626e-08, 0.0000e+00, ..., 4.0978e-08, -4.3772e-08, -1.5832e-08], [ 8.3819e-09, 6.8918e-08, 0.0000e+00, ..., 1.2200e-07, 4.6566e-09, 2.7940e-09]], device='cuda:0') Epoch 338, bias, value: tensor([-0.0138, -0.0328, -0.0095, -0.0120, -0.0304, -0.0016, 0.0230, -0.0190, 0.0481, -0.0036], device='cuda:0'), grad: tensor([ 5.9325e-07, 1.1269e-07, 4.8429e-08, 4.6100e-07, 1.1241e-06, -3.3528e-08, 3.4925e-07, -1.4240e-06, -3.4925e-07, -8.8476e-07], device='cuda:0') 100 0.0001 changing lr epoch 337, time 250.45, cls_loss 0.0019 cls_loss_mapping 0.0017 cls_loss_causal 0.4810 re_mapping 0.0041 re_causal 0.0109 /// teacc 99.04 lr 0.00010000 Epoch 339, weight, value: tensor([[-0.1240, -0.2569, -0.0889, ..., -0.0633, 0.1844, 0.1883], [-0.2392, -0.2074, -0.0826, ..., -0.1944, -0.2404, -0.1522], [-0.0735, -0.1813, 0.1521, ..., -0.2259, 0.2662, 0.1212], ..., [-0.1846, 0.0963, 0.0235, ..., 0.2134, -0.2328, -0.3049], [-0.3004, 0.0753, -0.1577, ..., 0.0691, -0.1147, -0.2234], [-0.0414, -0.1487, -0.0837, ..., -0.1339, -0.0620, -0.2289]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, -1.8626e-09, -9.3132e-10], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.9558e-08, 0.0000e+00, 0.0000e+00, ..., 1.5832e-08, 0.0000e+00, 0.0000e+00], [ 6.8918e-08, 0.0000e+00, 0.0000e+00, ..., 9.3132e-09, 2.7940e-09, 1.8626e-09]], device='cuda:0') Epoch 339, bias, value: tensor([-0.0146, -0.0333, -0.0095, -0.0114, -0.0298, -0.0024, 0.0245, -0.0184, 0.0479, -0.0040], device='cuda:0'), grad: tensor([ 2.7940e-09, 6.5193e-09, 0.0000e+00, 1.4901e-07, 6.7987e-08, -5.5134e-07, 8.1956e-08, 1.2107e-08, 6.8918e-08, 1.6671e-07], device='cuda:0') 100 0.0001 changing lr epoch 338, time 250.39, cls_loss 0.0018 cls_loss_mapping 0.0017 cls_loss_causal 0.4754 re_mapping 0.0043 re_causal 0.0112 /// teacc 99.12 lr 0.00010000 Epoch 340, weight, value: tensor([[-0.1246, -0.2575, -0.0879, ..., -0.0634, 0.1844, 0.1882], [-0.2394, -0.2076, -0.0828, ..., -0.1948, -0.2407, -0.1524], [-0.0735, -0.1812, 0.1526, ..., -0.2263, 0.2668, 0.1217], ..., [-0.1848, 0.0965, 0.0233, ..., 0.2141, -0.2336, -0.3060], [-0.3007, 0.0752, -0.1586, ..., 0.0695, -0.1150, -0.2245], [-0.0418, -0.1490, -0.0836, ..., -0.1342, -0.0621, -0.2291]], device='cuda:0'), grad: tensor([[-4.0885e-07, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, -5.4315e-06, -4.4666e-06], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 2.4214e-08, 3.2596e-08, 0.0000e+00, ..., 4.7497e-08, 3.1944e-07, 2.6263e-07], ..., [ 0.0000e+00, -4.4703e-08, 0.0000e+00, ..., -6.6124e-08, 9.3132e-10, 9.3132e-10], [ 1.3039e-08, 4.6566e-09, 0.0000e+00, ..., 5.0291e-08, 2.0489e-08, 1.6764e-08], [ 1.0524e-07, 5.5879e-09, -9.3132e-10, ..., -0.0000e+00, 1.3951e-06, 1.1474e-06]], device='cuda:0') Epoch 340, bias, value: tensor([-0.0148, -0.0341, -0.0094, -0.0114, -0.0314, -0.0027, 0.0254, -0.0185, 0.0484, -0.0029], device='cuda:0'), grad: tensor([-7.8231e-06, 3.7253e-09, 5.4017e-07, 4.6566e-09, 5.4948e-08, -1.2014e-07, 5.3234e-06, -9.9652e-08, 3.1572e-07, 1.7993e-06], device='cuda:0') 100 0.0001 changing lr epoch 339, time 250.32, cls_loss 0.0021 cls_loss_mapping 0.0021 cls_loss_causal 0.4624 re_mapping 0.0042 re_causal 0.0108 /// teacc 99.10 lr 0.00010000 Epoch 341, weight, value: tensor([[-0.1255, -0.2583, -0.0883, ..., -0.0634, 0.1832, 0.1874], [-0.2394, -0.2084, -0.0823, ..., -0.1956, -0.2408, -0.1534], [-0.0737, -0.1816, 0.1528, ..., -0.2265, 0.2671, 0.1222], ..., [-0.1849, 0.0972, 0.0230, ..., 0.2141, -0.2340, -0.3066], [-0.3010, 0.0753, -0.1591, ..., 0.0694, -0.1152, -0.2244], [-0.0423, -0.1494, -0.0843, ..., -0.1331, -0.0614, -0.2293]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.4141e-05, -1.8813e-07], [ 9.3132e-10, 8.3819e-09, 0.0000e+00, ..., 1.7695e-08, 3.7253e-09, 0.0000e+00], [ 0.0000e+00, 1.5832e-08, 0.0000e+00, ..., 3.8184e-08, -4.6566e-09, -5.5879e-09], ..., [ 9.3132e-10, -3.2596e-08, 0.0000e+00, ..., -8.1025e-08, 2.6077e-08, 2.7940e-09], [ 1.3970e-08, 3.3528e-08, 0.0000e+00, ..., -1.6764e-08, 2.7940e-09, 9.3132e-10], [ 4.6566e-09, 1.4901e-08, 0.0000e+00, ..., 2.2352e-08, -1.4201e-05, 1.8813e-07]], device='cuda:0') Epoch 341, bias, value: tensor([-0.0161, -0.0344, -0.0097, -0.0112, -0.0319, -0.0027, 0.0264, -0.0194, 0.0485, -0.0014], device='cuda:0'), grad: tensor([ 4.1544e-05, 4.1910e-08, 5.9605e-08, -1.0710e-07, 4.8429e-08, 1.5832e-08, 9.3132e-10, -6.0536e-08, -4.8429e-08, -4.1574e-05], device='cuda:0') 100 0.0001 changing lr epoch 340, time 250.12, cls_loss 0.0016 cls_loss_mapping 0.0016 cls_loss_causal 0.4616 re_mapping 0.0041 re_causal 0.0109 /// teacc 99.08 lr 0.00010000 Epoch 342, weight, value: tensor([[-0.1256, -0.2595, -0.0883, ..., -0.0632, 0.1834, 0.1877], [-0.2395, -0.2093, -0.0825, ..., -0.1968, -0.2411, -0.1548], [-0.0737, -0.1822, 0.1533, ..., -0.2268, 0.2676, 0.1223], ..., [-0.1851, 0.0981, 0.0229, ..., 0.2150, -0.2343, -0.3071], [-0.3014, 0.0751, -0.1594, ..., 0.0691, -0.1156, -0.2231], [-0.0430, -0.1500, -0.0842, ..., -0.1332, -0.0614, -0.2298]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 9.3132e-10, 9.3132e-10, ..., 9.3132e-10, 6.5193e-09, 4.6566e-09], [ 0.0000e+00, 2.3842e-07, 0.0000e+00, ..., 3.7998e-07, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 1.2945e-07, 0.0000e+00, ..., 2.7195e-07, -8.3819e-08, -1.3039e-08], ..., [ 0.0000e+00, -1.6876e-06, 0.0000e+00, ..., -2.7493e-06, 6.7055e-08, 1.0245e-08], [ 0.0000e+00, 1.2834e-06, 0.0000e+00, ..., 2.0508e-06, 2.7940e-09, 9.3132e-10], [ 0.0000e+00, 9.3132e-09, 0.0000e+00, ..., 1.4901e-08, 1.8626e-09, 9.3132e-10]], device='cuda:0') Epoch 342, bias, value: tensor([-0.0161, -0.0351, -0.0096, -0.0112, -0.0326, -0.0025, 0.0263, -0.0191, 0.0495, -0.0011], device='cuda:0'), grad: tensor([ 2.1420e-08, -4.3139e-06, 3.4086e-06, 1.0524e-07, 6.2399e-08, 4.0047e-08, -1.3970e-08, -3.1386e-06, 3.7588e-06, 4.0978e-08], device='cuda:0') 100 0.0001 changing lr epoch 341, time 250.32, cls_loss 0.0019 cls_loss_mapping 0.0019 cls_loss_causal 0.4969 re_mapping 0.0041 re_causal 0.0111 /// teacc 98.97 lr 0.00010000 Epoch 343, weight, value: tensor([[-0.1257, -0.2601, -0.0884, ..., -0.0639, 0.1823, 0.1878], [-0.2422, -0.2095, -0.0832, ..., -0.1971, -0.2416, -0.1564], [-0.0709, -0.1821, 0.1568, ..., -0.2269, 0.2706, 0.1250], ..., [-0.1853, 0.0984, 0.0229, ..., 0.2154, -0.2362, -0.3076], [-0.3032, 0.0748, -0.1595, ..., 0.0675, -0.1154, -0.2259], [-0.0425, -0.1504, -0.0842, ..., -0.1337, -0.0599, -0.2300]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 3.7253e-09, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 9.3132e-10, 1.2107e-08, 0.0000e+00, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 1.1176e-08, 0.0000e+00, ..., 1.8626e-09, -3.7253e-09, 0.0000e+00], ..., [ 1.3039e-08, 1.9558e-08, 0.0000e+00, ..., -1.8626e-08, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 1.2107e-08, 0.0000e+00, ..., 0.0000e+00, 2.7940e-09, 0.0000e+00], [ 9.3132e-10, 1.1176e-08, 0.0000e+00, ..., 1.8626e-09, 1.8626e-09, 0.0000e+00]], device='cuda:0') Epoch 343, bias, value: tensor([-0.0171, -0.0365, -0.0080, -0.0113, -0.0328, -0.0021, 0.0265, -0.0182, 0.0475, -0.0003], device='cuda:0'), grad: tensor([ 5.4948e-08, -4.4610e-07, 4.0978e-08, -1.8626e-09, 8.0746e-07, 2.2762e-06, 1.7881e-07, 2.0489e-06, 8.3819e-08, -5.0664e-06], device='cuda:0') 100 0.0001 changing lr epoch 342, time 250.40, cls_loss 0.0017 cls_loss_mapping 0.0018 cls_loss_causal 0.4793 re_mapping 0.0040 re_causal 0.0108 /// teacc 99.00 lr 0.00010000 Epoch 344, weight, value: tensor([[-0.1262, -0.2612, -0.0885, ..., -0.0640, 0.1821, 0.1879], [-0.2422, -0.2111, -0.0834, ..., -0.1986, -0.2427, -0.1566], [-0.0709, -0.1827, 0.1591, ..., -0.2272, 0.2734, 0.1269], ..., [-0.1856, 0.0996, 0.0229, ..., 0.2153, -0.2368, -0.3079], [-0.3030, 0.0760, -0.1594, ..., 0.0695, -0.1162, -0.2265], [-0.0427, -0.1518, -0.0842, ..., -0.1345, -0.0597, -0.2301]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -8.3819e-09, -5.5879e-09], [ 9.3132e-10, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], ..., [ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 9.3132e-10], [ 1.8626e-09, 2.7940e-09, 0.0000e+00, ..., 0.0000e+00, 8.3819e-09, 5.5879e-09]], device='cuda:0') Epoch 344, bias, value: tensor([-0.0174, -0.0372, -0.0056, -0.0118, -0.0328, -0.0018, 0.0262, -0.0184, 0.0492, -0.0001], device='cuda:0'), grad: tensor([-1.4901e-08, -2.2158e-05, 3.2261e-06, 5.8673e-08, 1.3039e-08, 3.9116e-08, -3.3528e-08, 1.8850e-05, 4.0978e-08, -2.8871e-08], device='cuda:0') 100 0.0001 changing lr epoch 343, time 250.37, cls_loss 0.0016 cls_loss_mapping 0.0011 cls_loss_causal 0.4720 re_mapping 0.0039 re_causal 0.0107 /// teacc 99.03 lr 0.00010000 Epoch 345, weight, value: tensor([[-0.1262, -0.2613, -0.0887, ..., -0.0641, 0.1823, 0.1881], [-0.2422, -0.2114, -0.0836, ..., -0.1983, -0.2438, -0.1574], [-0.0710, -0.1836, 0.1597, ..., -0.2281, 0.2741, 0.1275], ..., [-0.1860, 0.1004, 0.0229, ..., 0.2155, -0.2371, -0.3082], [-0.3036, 0.0758, -0.1591, ..., 0.0693, -0.1164, -0.2268], [-0.0428, -0.1527, -0.0844, ..., -0.1346, -0.0598, -0.2303]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, -3.0734e-08, -2.1420e-08], [ 9.3132e-10, 9.3132e-10, 9.3132e-10, ..., 9.3132e-10, 9.3132e-10, -2.7940e-09], [ 0.0000e+00, 9.3132e-10, -6.1467e-08, ..., 0.0000e+00, -4.4703e-08, -1.0245e-08], ..., [ 2.7940e-09, 3.7253e-09, 5.7742e-08, ..., 2.7940e-09, 4.1910e-08, 9.3132e-09], [ 1.4901e-08, 1.9558e-08, 0.0000e+00, ..., 1.6764e-08, 0.0000e+00, 1.8626e-09], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 2.7940e-08, 1.7695e-08]], device='cuda:0') Epoch 345, bias, value: tensor([-0.0173, -0.0367, -0.0055, -0.0123, -0.0319, -0.0014, 0.0262, -0.0191, 0.0490, -0.0001], device='cuda:0'), grad: tensor([-3.8184e-08, -1.6205e-07, -5.8487e-07, -3.9116e-08, -6.6031e-07, -8.5682e-08, 6.8266e-07, 6.0629e-07, 1.9092e-07, 7.7300e-08], device='cuda:0') 100 0.0001 changing lr epoch 344, time 250.14, cls_loss 0.0018 cls_loss_mapping 0.0013 cls_loss_causal 0.4554 re_mapping 0.0042 re_causal 0.0107 /// teacc 99.13 lr 0.00010000 Epoch 346, weight, value: tensor([[-0.1263, -0.2612, -0.0888, ..., -0.0659, 0.1826, 0.1885], [-0.2430, -0.2116, -0.0837, ..., -0.1986, -0.2439, -0.1571], [-0.0712, -0.1858, 0.1597, ..., -0.2302, 0.2741, 0.1272], ..., [-0.1859, 0.1006, 0.0227, ..., 0.2155, -0.2371, -0.3083], [-0.3043, 0.0768, -0.1586, ..., 0.0703, -0.1167, -0.2270], [-0.0448, -0.1539, -0.0853, ..., -0.1358, -0.0601, -0.2313]], device='cuda:0'), grad: tensor([[ 3.7253e-09, -3.7253e-09, 0.0000e+00, ..., -4.7497e-08, -2.4308e-06, -1.1157e-06], [ 4.6566e-09, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, 3.7253e-09, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.0489e-08, 9.3132e-09], ..., [ 9.3132e-10, -1.8626e-09, 1.2107e-08, ..., 8.5682e-08, 6.5193e-09, 3.7253e-09], [ 4.0978e-08, 9.3132e-10, 0.0000e+00, ..., 3.1665e-08, 1.1129e-06, 5.1130e-07], [ 3.7253e-09, 4.6566e-09, -1.3970e-08, ..., -9.1270e-08, 1.0859e-06, 5.0012e-07]], device='cuda:0') Epoch 346, bias, value: tensor([-0.0171, -0.0365, -0.0062, -0.0132, -0.0323, -0.0005, 0.0263, -0.0194, 0.0502, -0.0002], device='cuda:0'), grad: tensor([-4.7311e-06, 2.4214e-08, 3.9116e-08, 1.8999e-07, 4.2841e-08, -9.2909e-06, 9.3058e-06, 4.5914e-07, 2.2873e-06, 1.6494e-06], device='cuda:0') 100 0.0001 changing lr epoch 345, time 250.29, cls_loss 0.0015 cls_loss_mapping 0.0020 cls_loss_causal 0.4955 re_mapping 0.0041 re_causal 0.0112 /// teacc 99.06 lr 0.00010000 Epoch 347, weight, value: tensor([[-0.1282, -0.2616, -0.0888, ..., -0.0659, 0.1811, 0.1868], [-0.2431, -0.2116, -0.0837, ..., -0.1956, -0.2443, -0.1571], [-0.0712, -0.1867, 0.1597, ..., -0.2308, 0.2747, 0.1274], ..., [-0.1859, 0.1010, 0.0226, ..., 0.2134, -0.2378, -0.3086], [-0.3045, 0.0768, -0.1589, ..., 0.0703, -0.1177, -0.2277], [-0.0452, -0.1548, -0.0852, ..., -0.1367, -0.0602, -0.2315]], device='cuda:0'), grad: tensor([[ 5.7966e-05, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0842e-04, 6.5684e-05], [ 4.6566e-09, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 8.3819e-09, 5.5879e-09], [ 3.1665e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 5.8673e-08, 3.5390e-08], ..., [ 5.5879e-09, -9.3132e-10, 0.0000e+00, ..., -1.8626e-09, 1.1176e-08, 6.5193e-09], [ 2.6077e-08, 2.7940e-09, 0.0000e+00, ..., 0.0000e+00, 3.8184e-08, 2.2352e-08], [ 1.7695e-08, 1.8626e-09, 0.0000e+00, ..., 9.3132e-10, 3.4459e-08, 2.0489e-08]], device='cuda:0') Epoch 347, bias, value: tensor([-0.0181, -0.0339, -0.0059, -0.0150, -0.0322, 0.0012, 0.0277, -0.0219, 0.0501, -0.0004], device='cuda:0'), grad: tensor([ 2.3305e-04, 2.0489e-08, 1.2666e-07, 0.0000e+00, 4.1910e-08, 6.4541e-07, -2.3448e-04, 2.1420e-08, 9.4064e-08, 7.5437e-08], device='cuda:0') 100 0.0001 changing lr epoch 346, time 250.31, cls_loss 0.0015 cls_loss_mapping 0.0016 cls_loss_causal 0.4503 re_mapping 0.0040 re_causal 0.0105 /// teacc 99.11 lr 0.00010000 Epoch 348, weight, value: tensor([[-0.1295, -0.2624, -0.0888, ..., -0.0661, 0.1806, 0.1863], [-0.2450, -0.2124, -0.0837, ..., -0.1961, -0.2457, -0.1583], [-0.0694, -0.1837, 0.1597, ..., -0.2301, 0.2761, 0.1280], ..., [-0.1864, 0.1014, 0.0226, ..., 0.2141, -0.2398, -0.3088], [-0.3046, 0.0759, -0.1590, ..., 0.0702, -0.1185, -0.2281], [-0.0459, -0.1558, -0.0851, ..., -0.1373, -0.0604, -0.2322]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 2.7940e-08, 0.0000e+00, ..., 2.0489e-08, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 3.5949e-07, 0.0000e+00, ..., 2.8126e-07, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.4459e-08, 0.0000e+00, ..., 3.5390e-08, -1.8626e-09, -9.3132e-10], ..., [ 0.0000e+00, -1.2117e-06, 0.0000e+00, ..., -9.3784e-07, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 1.2480e-07, 0.0000e+00, ..., 8.7544e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.8243e-07, 0.0000e+00, ..., 3.5018e-07, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 348, bias, value: tensor([-0.0186, -0.0341, -0.0044, -0.0150, -0.0330, 0.0015, 0.0283, -0.0218, 0.0499, -0.0005], device='cuda:0'), grad: tensor([ 8.3819e-08, 1.0040e-06, 9.1270e-08, 4.7963e-07, -2.0489e-08, 4.0047e-08, 3.5390e-08, -3.4273e-06, 3.4645e-07, 1.3690e-06], device='cuda:0') 100 0.0001 changing lr epoch 347, time 250.07, cls_loss 0.0016 cls_loss_mapping 0.0013 cls_loss_causal 0.4882 re_mapping 0.0040 re_causal 0.0110 /// teacc 99.04 lr 0.00010000 Epoch 349, weight, value: tensor([[-0.1295, -0.2633, -0.0888, ..., -0.0665, 0.1807, 0.1866], [-0.2455, -0.2126, -0.0837, ..., -0.1963, -0.2468, -0.1586], [-0.0694, -0.1840, 0.1598, ..., -0.2307, 0.2765, 0.1280], ..., [-0.1866, 0.1020, 0.0222, ..., 0.2148, -0.2398, -0.3090], [-0.3051, 0.0758, -0.1590, ..., 0.0702, -0.1194, -0.2289], [-0.0449, -0.1567, -0.0847, ..., -0.1389, -0.0603, -0.2330]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.9849e-08, -1.1176e-08], [ 0.0000e+00, 0.0000e+00, -4.6566e-09, ..., -1.8626e-09, 1.2107e-08, 0.0000e+00], [ 0.0000e+00, -6.5193e-09, 0.0000e+00, ..., -1.8626e-09, -1.6112e-07, -1.8626e-08], ..., [ 0.0000e+00, 9.3132e-10, 2.7940e-09, ..., 1.8626e-09, 6.5193e-09, 3.7253e-09], [ 0.0000e+00, 5.5879e-09, 0.0000e+00, ..., 9.3132e-10, 2.2352e-08, 1.3970e-08], [ 0.0000e+00, 0.0000e+00, 1.8626e-09, ..., 9.3132e-10, 3.7253e-09, 1.8626e-09]], device='cuda:0') Epoch 349, bias, value: tensor([-0.0186, -0.0343, -0.0046, -0.0149, -0.0333, 0.0016, 0.0281, -0.0216, 0.0502, -0.0005], device='cuda:0'), grad: tensor([ 1.2573e-07, -1.5460e-07, -3.0082e-07, 5.9605e-08, 8.3819e-09, 2.7940e-09, 2.2352e-08, 1.1828e-07, 5.8673e-08, 7.1712e-08], device='cuda:0') 100 0.0001 changing lr epoch 348, time 250.39, cls_loss 0.0014 cls_loss_mapping 0.0011 cls_loss_causal 0.4805 re_mapping 0.0041 re_causal 0.0111 /// teacc 99.09 lr 0.00010000 Epoch 350, weight, value: tensor([[-0.1296, -0.2645, -0.0888, ..., -0.0674, 0.1808, 0.1867], [-0.2456, -0.2159, -0.0836, ..., -0.1983, -0.2475, -0.1578], [-0.0695, -0.1844, 0.1598, ..., -0.2312, 0.2767, 0.1275], ..., [-0.1869, 0.1052, 0.0224, ..., 0.2167, -0.2400, -0.3095], [-0.3062, 0.0760, -0.1593, ..., 0.0711, -0.1196, -0.2296], [-0.0443, -0.1574, -0.0846, ..., -0.1396, -0.0605, -0.2334]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -4.7497e-08, -2.8871e-08], [ 4.6566e-09, 1.2107e-08, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 4.6566e-09, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 9.3132e-10], ..., [ 2.7940e-09, 2.7940e-09, 0.0000e+00, ..., -4.6566e-09, 0.0000e+00, 0.0000e+00], [ 1.0245e-08, 2.6077e-08, 0.0000e+00, ..., 9.3132e-10, 3.7253e-09, 1.8626e-09], [ 1.8626e-09, 4.6566e-09, 0.0000e+00, ..., 9.3132e-10, 2.2352e-08, 1.4901e-08]], device='cuda:0') Epoch 350, bias, value: tensor([-0.0185, -0.0356, -0.0051, -0.0148, -0.0328, 0.0011, 0.0280, -0.0204, 0.0526, -0.0008], device='cuda:0'), grad: tensor([-6.5193e-08, -4.3772e-08, 1.1176e-08, -1.4342e-07, 1.8626e-09, 3.9116e-08, 3.5390e-08, 8.3819e-09, 1.0431e-07, 5.5879e-08], device='cuda:0') 100 0.0001 changing lr epoch 349, time 250.15, cls_loss 0.0017 cls_loss_mapping 0.0021 cls_loss_causal 0.4706 re_mapping 0.0042 re_causal 0.0108 /// teacc 99.09 lr 0.00010000 Epoch 351, weight, value: tensor([[-0.1299, -0.2653, -0.0891, ..., -0.0677, 0.1808, 0.1867], [-0.2460, -0.2162, -0.0854, ..., -0.1987, -0.2480, -0.1578], [-0.0696, -0.1853, 0.1598, ..., -0.2327, 0.2768, 0.1274], ..., [-0.1873, 0.1082, 0.0263, ..., 0.2199, -0.2402, -0.3096], [-0.3076, 0.0757, -0.1595, ..., 0.0713, -0.1182, -0.2303], [-0.0442, -0.1612, -0.0882, ..., -0.1435, -0.0607, -0.2339]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -3.9116e-08, -1.7695e-08], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 3.7253e-09, 9.3132e-10], [ 0.0000e+00, -2.6077e-08, 0.0000e+00, ..., 0.0000e+00, -6.4261e-08, -1.4901e-08], ..., [ 0.0000e+00, 2.1420e-08, 0.0000e+00, ..., 0.0000e+00, 5.8673e-08, 1.3970e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -8.3819e-09, 3.5390e-08, 1.3970e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 5.5879e-09, 2.7940e-09]], device='cuda:0') Epoch 351, bias, value: tensor([-0.0185, -0.0356, -0.0056, -0.0148, -0.0335, 0.0011, 0.0278, -0.0185, 0.0530, -0.0031], device='cuda:0'), grad: tensor([-6.4261e-08, 7.4506e-09, -1.7509e-07, 5.7742e-08, 1.8626e-08, 4.6566e-09, 3.7253e-09, 1.5646e-07, -2.1420e-08, 1.9558e-08], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 350---------------------------------------------------- epoch 350, time 268.03, cls_loss 0.0017 cls_loss_mapping 0.0017 cls_loss_causal 0.4852 re_mapping 0.0041 re_causal 0.0108 /// teacc 99.15 lr 0.00010000 Epoch 352, weight, value: tensor([[-0.1301, -0.2672, -0.0890, ..., -0.0681, 0.1812, 0.1872], [-0.2460, -0.2163, -0.0847, ..., -0.1991, -0.2483, -0.1581], [-0.0697, -0.1861, 0.1598, ..., -0.2355, 0.2772, 0.1285], ..., [-0.1876, 0.1085, 0.0263, ..., 0.2208, -0.2404, -0.3096], [-0.3088, 0.0754, -0.1596, ..., 0.0712, -0.1189, -0.2326], [-0.0448, -0.1613, -0.0882, ..., -0.1439, -0.0611, -0.2348]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -9.3132e-10, 0.0000e+00, ..., -5.5879e-09, 0.0000e+00, 0.0000e+00], [ 3.7253e-09, 9.3132e-10, 0.0000e+00, ..., -9.3132e-10, 1.8626e-09, 2.7940e-09], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 2.7940e-09, 3.7253e-09, 1.8626e-09]], device='cuda:0') Epoch 352, bias, value: tensor([-0.0183, -0.0357, -0.0059, -0.0152, -0.0329, 0.0015, 0.0277, -0.0182, 0.0528, -0.0035], device='cuda:0'), grad: tensor([ 9.3132e-10, 2.7940e-09, 2.7940e-09, -1.1176e-08, -3.2596e-08, 4.7125e-07, -4.5542e-07, -6.5193e-09, 1.1176e-08, 1.9558e-08], device='cuda:0') 100 0.0001 changing lr epoch 351, time 250.51, cls_loss 0.0014 cls_loss_mapping 0.0017 cls_loss_causal 0.4895 re_mapping 0.0039 re_causal 0.0109 /// teacc 99.01 lr 0.00010000 Epoch 353, weight, value: tensor([[-0.1301, -0.2667, -0.0890, ..., -0.0682, 0.1818, 0.1877], [-0.2460, -0.2164, -0.0847, ..., -0.1992, -0.2485, -0.1582], [-0.0697, -0.1866, 0.1600, ..., -0.2365, 0.2773, 0.1289], ..., [-0.1886, 0.1092, 0.0263, ..., 0.2219, -0.2405, -0.3099], [-0.3109, 0.0752, -0.1603, ..., 0.0709, -0.1199, -0.2337], [-0.0452, -0.1622, -0.0882, ..., -0.1449, -0.0613, -0.2353]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -6.5193e-09, -2.7940e-09], ..., [ 1.8626e-08, -0.0000e+00, 0.0000e+00, ..., 2.0489e-08, 1.8626e-09, 9.3132e-10], [ 1.5832e-08, 0.0000e+00, 0.0000e+00, ..., 1.1176e-08, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 1.8626e-09, 9.3132e-10]], device='cuda:0') Epoch 353, bias, value: tensor([-0.0179, -0.0357, -0.0060, -0.0150, -0.0329, 0.0006, 0.0282, -0.0178, 0.0520, -0.0039], device='cuda:0'), grad: tensor([ 9.3132e-09, -2.4121e-07, 1.8347e-07, 0.0000e+00, 5.5879e-09, -8.1956e-08, 9.3132e-09, 6.9849e-08, 2.7008e-08, 1.7695e-08], device='cuda:0') 100 0.0001 changing lr epoch 352, time 250.26, cls_loss 0.0020 cls_loss_mapping 0.0020 cls_loss_causal 0.4979 re_mapping 0.0039 re_causal 0.0106 /// teacc 98.93 lr 0.00010000 Epoch 354, weight, value: tensor([[-0.1301, -0.2674, -0.0890, ..., -0.0685, 0.1816, 0.1878], [-0.2460, -0.2165, -0.0848, ..., -0.1990, -0.2486, -0.1583], [-0.0698, -0.1868, 0.1600, ..., -0.2353, 0.2786, 0.1306], ..., [-0.1895, 0.1093, 0.0264, ..., 0.2219, -0.2422, -0.3127], [-0.3133, 0.0750, -0.1602, ..., 0.0701, -0.1210, -0.2345], [-0.0450, -0.1624, -0.0882, ..., -0.1453, -0.0612, -0.2357]], device='cuda:0'), grad: tensor([[ 6.5193e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-09, 2.7940e-09], [ 2.7940e-09, 1.8626e-08, 0.0000e+00, ..., 9.3132e-10, 2.7940e-09, 9.3132e-10], [ 2.7940e-09, 5.4017e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 9.3132e-10, 1.9558e-08, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 7.4506e-09, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 9.3132e-10], [ 0.0000e+00, 9.3132e-10, -9.3132e-10, ..., 9.3132e-10, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 354, bias, value: tensor([-0.0182, -0.0354, -0.0057, -0.0150, -0.0333, 0.0014, 0.0277, -0.0181, 0.0508, -0.0036], device='cuda:0'), grad: tensor([ 3.5390e-08, -6.4075e-07, 1.0058e-07, -1.3784e-07, 6.4820e-07, 5.4017e-08, -7.5437e-08, 4.6566e-08, 3.0734e-08, -5.2154e-08], device='cuda:0') 100 0.0001 changing lr epoch 353, time 250.79, cls_loss 0.0014 cls_loss_mapping 0.0012 cls_loss_causal 0.4762 re_mapping 0.0039 re_causal 0.0108 /// teacc 99.02 lr 0.00010000 Epoch 355, weight, value: tensor([[-0.1302, -0.2681, -0.0890, ..., -0.0694, 0.1818, 0.1880], [-0.2464, -0.2165, -0.0848, ..., -0.1991, -0.2507, -0.1583], [-0.0698, -0.1870, 0.1600, ..., -0.2354, 0.2800, 0.1311], ..., [-0.1896, 0.1093, 0.0263, ..., 0.2223, -0.2426, -0.3131], [-0.3136, 0.0749, -0.1608, ..., 0.0702, -0.1216, -0.2351], [-0.0453, -0.1625, -0.0882, ..., -0.1456, -0.0615, -0.2363]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, -1.8626e-09, -9.3132e-10], [ 0.0000e+00, 4.6566e-09, 0.0000e+00, ..., 1.3039e-08, 4.6566e-09, 0.0000e+00], [ 0.0000e+00, 5.4948e-08, 0.0000e+00, ..., 6.0536e-08, -6.5193e-09, 0.0000e+00], ..., [ 0.0000e+00, -7.0781e-08, 0.0000e+00, ..., -8.7544e-08, 0.0000e+00, 0.0000e+00], [ 3.7253e-09, 8.3819e-09, 0.0000e+00, ..., 1.1176e-08, 1.8626e-09, 0.0000e+00], [ 4.6566e-09, 7.4506e-09, 0.0000e+00, ..., 7.4506e-09, 1.8626e-09, 9.3132e-10]], device='cuda:0') Epoch 355, bias, value: tensor([-0.0182, -0.0354, -0.0050, -0.0149, -0.0332, 0.0012, 0.0280, -0.0181, 0.0506, -0.0037], device='cuda:0'), grad: tensor([ 2.3283e-08, -3.6228e-07, 1.4063e-07, -1.3970e-08, 1.0710e-07, -5.8673e-08, 1.3039e-07, -1.5553e-07, 1.0710e-07, 8.4750e-08], device='cuda:0') 100 0.0001 changing lr epoch 354, time 250.57, cls_loss 0.0015 cls_loss_mapping 0.0015 cls_loss_causal 0.4948 re_mapping 0.0040 re_causal 0.0108 /// teacc 99.08 lr 0.00010000 Epoch 356, weight, value: tensor([[-0.1304, -0.2683, -0.0890, ..., -0.0697, 0.1818, 0.1882], [-0.2476, -0.2166, -0.0849, ..., -0.1993, -0.2508, -0.1591], [-0.0687, -0.1872, 0.1600, ..., -0.2356, 0.2803, 0.1316], ..., [-0.1900, 0.1094, 0.0263, ..., 0.2230, -0.2428, -0.3132], [-0.3156, 0.0748, -0.1608, ..., 0.0693, -0.1227, -0.2356], [-0.0465, -0.1626, -0.0882, ..., -0.1464, -0.0616, -0.2366]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 2.7940e-09, 5.5879e-09, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-09, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], ..., [ 2.7940e-09, 2.7940e-09, -0.0000e+00, ..., -3.7253e-09, 0.0000e+00, 0.0000e+00], [ 1.3970e-08, 1.9558e-08, 0.0000e+00, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], [ 1.6764e-08, 7.4506e-09, 0.0000e+00, ..., 7.4506e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 356, bias, value: tensor([-0.0184, -0.0354, -0.0046, -0.0149, -0.0345, 0.0017, 0.0282, -0.0178, 0.0497, -0.0038], device='cuda:0'), grad: tensor([ 7.4506e-09, 2.0489e-08, 1.3039e-08, -1.0431e-07, 9.3132e-10, -1.8999e-07, 1.3411e-07, 9.3132e-09, -1.1083e-07, 2.2724e-07], device='cuda:0') 100 0.0001 changing lr epoch 355, time 250.49, cls_loss 0.0014 cls_loss_mapping 0.0010 cls_loss_causal 0.4571 re_mapping 0.0039 re_causal 0.0106 /// teacc 99.03 lr 0.00010000 Epoch 357, weight, value: tensor([[-0.1304, -0.2693, -0.0879, ..., -0.0704, 0.1820, 0.1884], [-0.2477, -0.2182, -0.0847, ..., -0.2001, -0.2509, -0.1591], [-0.0687, -0.1872, 0.1600, ..., -0.2358, 0.2804, 0.1318], ..., [-0.1904, 0.1106, 0.0263, ..., 0.2236, -0.2431, -0.3136], [-0.3173, 0.0748, -0.1603, ..., 0.0690, -0.1226, -0.2356], [-0.0466, -0.1627, -0.0882, ..., -0.1465, -0.0616, -0.2367]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.6566e-10, -3.6322e-08, -1.8161e-08], [ 9.3132e-10, 2.3283e-09, 0.0000e+00, ..., 9.3132e-10, 1.3970e-09, 4.6566e-10], [ 4.6566e-10, 2.7940e-09, 0.0000e+00, ..., 2.7940e-09, 1.8626e-09, 9.3132e-10], ..., [ 1.3970e-09, -1.3970e-09, 0.0000e+00, ..., -4.6566e-09, 0.0000e+00, 0.0000e+00], [ 3.2596e-09, 5.5879e-09, 0.0000e+00, ..., 0.0000e+00, 1.3970e-09, 9.3132e-10], [ 4.6566e-10, 1.3970e-09, 0.0000e+00, ..., 1.3970e-09, 2.4214e-08, 1.2573e-08]], device='cuda:0') Epoch 357, bias, value: tensor([-0.0183, -0.0360, -0.0045, -0.0152, -0.0363, 0.0021, 0.0280, -0.0173, 0.0493, -0.0033], device='cuda:0'), grad: tensor([-6.7055e-08, -3.3528e-08, 1.2573e-08, -9.0804e-08, -9.3132e-09, 7.4971e-08, 9.3132e-09, 3.4459e-08, 1.4435e-08, 5.6811e-08], device='cuda:0') 100 0.0001 changing lr epoch 356, time 250.74, cls_loss 0.0014 cls_loss_mapping 0.0016 cls_loss_causal 0.4599 re_mapping 0.0040 re_causal 0.0109 /// teacc 99.07 lr 0.00010000 Epoch 358, weight, value: tensor([[-0.1304, -0.2704, -0.0885, ..., -0.0704, 0.1820, 0.1885], [-0.2497, -0.2182, -0.0846, ..., -0.2002, -0.2512, -0.1592], [-0.0667, -0.1880, 0.1601, ..., -0.2375, 0.2808, 0.1319], ..., [-0.1908, 0.1099, 0.0263, ..., 0.2239, -0.2435, -0.3138], [-0.3176, 0.0745, -0.1590, ..., 0.0689, -0.1225, -0.2354], [-0.0469, -0.1628, -0.0882, ..., -0.1467, -0.0617, -0.2369]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, -6.0536e-09, -3.7253e-09], [ 3.7253e-09, 1.3504e-08, 0.0000e+00, ..., 8.8476e-09, 4.6566e-10, 0.0000e+00], [ 3.2596e-09, 1.4435e-08, 0.0000e+00, ..., 1.0245e-08, 4.6566e-10, 4.6566e-10], ..., [ 2.7940e-09, -3.2131e-08, 0.0000e+00, ..., -4.6566e-08, 0.0000e+00, 0.0000e+00], [ 4.1910e-09, 7.9162e-09, 0.0000e+00, ..., 1.8626e-09, 9.3132e-10, 4.6566e-10], [-9.3132e-10, 6.0536e-09, 0.0000e+00, ..., 2.7940e-09, 4.6566e-09, 3.2596e-09]], device='cuda:0') Epoch 358, bias, value: tensor([-0.0183, -0.0362, -0.0037, -0.0146, -0.0338, 0.0021, 0.0280, -0.0174, 0.0491, -0.0046], device='cuda:0'), grad: tensor([-7.4506e-09, 3.9116e-08, 4.3306e-08, -6.0536e-09, -1.1176e-08, 2.5146e-08, 6.5193e-09, -8.2422e-08, 1.8626e-08, -2.6077e-08], device='cuda:0') 100 0.0001 changing lr epoch 357, time 250.33, cls_loss 0.0018 cls_loss_mapping 0.0020 cls_loss_causal 0.4706 re_mapping 0.0040 re_causal 0.0104 /// teacc 99.13 lr 0.00010000 Epoch 359, weight, value: tensor([[-0.1306, -0.2713, -0.0887, ..., -0.0704, 0.1821, 0.1886], [-0.2499, -0.2183, -0.0853, ..., -0.2003, -0.2516, -0.1594], [-0.0666, -0.1888, 0.1603, ..., -0.2395, 0.2810, 0.1320], ..., [-0.1915, 0.1093, 0.0263, ..., 0.2228, -0.2436, -0.3140], [-0.3179, 0.0771, -0.1584, ..., 0.0718, -0.1229, -0.2359], [-0.0476, -0.1629, -0.0882, ..., -0.1468, -0.0618, -0.2373]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 5.5879e-09, 5.1921e-07, -2.6077e-08], [ 0.0000e+00, 8.8476e-09, 0.0000e+00, ..., 1.0664e-07, 3.9581e-08, 6.0536e-09], [ 0.0000e+00, 5.1223e-09, -0.0000e+00, ..., 6.0536e-08, -8.2795e-07, -9.3598e-08], ..., [ 4.6566e-10, 2.6543e-08, 0.0000e+00, ..., -1.9884e-07, 6.9849e-09, 1.3970e-09], [ 4.6566e-10, 1.3970e-09, 0.0000e+00, ..., 8.3819e-09, 2.3283e-09, 9.3132e-10], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 1.9558e-08, 8.3819e-08, 4.0978e-08]], device='cuda:0') Epoch 359, bias, value: tensor([-0.0183, -0.0361, -0.0041, -0.0136, -0.0319, 0.0009, 0.0282, -0.0179, 0.0515, -0.0056], device='cuda:0'), grad: tensor([ 9.7416e-07, 3.7951e-07, -1.3150e-06, 6.4727e-08, 7.8697e-08, 1.2647e-06, 1.8114e-07, -5.0105e-07, 3.1665e-08, -1.1306e-06], device='cuda:0') 100 0.0001 changing lr epoch 358, time 250.50, cls_loss 0.0019 cls_loss_mapping 0.0020 cls_loss_causal 0.4925 re_mapping 0.0039 re_causal 0.0105 /// teacc 99.15 lr 0.00010000 Epoch 360, weight, value: tensor([[-0.1306, -0.2718, -0.0888, ..., -0.0703, 0.1831, 0.1897], [-0.2500, -0.2185, -0.0825, ..., -0.2003, -0.2547, -0.1598], [-0.0669, -0.1906, 0.1604, ..., -0.2399, 0.2840, 0.1323], ..., [-0.1920, 0.1096, 0.0261, ..., 0.2231, -0.2442, -0.3145], [-0.3185, 0.0766, -0.1587, ..., 0.0714, -0.1237, -0.2368], [-0.0478, -0.1630, -0.0882, ..., -0.1469, -0.0621, -0.2382]], device='cuda:0'), grad: tensor([[ 1.4435e-08, 5.5879e-09, 0.0000e+00, ..., -2.6636e-07, -1.1437e-06, -7.0501e-07], [ 4.6566e-10, 5.4948e-08, 0.0000e+00, ..., 4.4238e-08, 9.3132e-10, 9.3132e-10], [ 4.6566e-10, 2.7986e-07, 0.0000e+00, ..., 1.3504e-08, -1.0245e-08, 4.6566e-10], ..., [ 0.0000e+00, -1.2480e-07, 0.0000e+00, ..., -4.4471e-07, 1.8626e-09, 1.3970e-09], [ 5.8673e-08, 1.5367e-07, 0.0000e+00, ..., 3.3993e-08, 3.8650e-08, 7.4971e-08], [ 4.6566e-10, 1.9697e-07, 0.0000e+00, ..., 2.7195e-07, 1.9558e-08, 1.2107e-08]], device='cuda:0') Epoch 360, bias, value: tensor([-0.0176, -0.0357, -0.0016, -0.0135, -0.0319, 0.0013, 0.0272, -0.0192, 0.0509, -0.0057], device='cuda:0'), grad: tensor([-2.1737e-06, 2.9150e-07, 4.5961e-07, -8.7824e-07, 1.7229e-08, 3.3807e-06, -1.1995e-06, -1.5795e-06, 5.1875e-07, 1.1828e-06], device='cuda:0') 100 0.0001 changing lr epoch 359, time 250.73, cls_loss 0.0014 cls_loss_mapping 0.0018 cls_loss_causal 0.4487 re_mapping 0.0039 re_causal 0.0104 /// teacc 99.00 lr 0.00010000 Epoch 361, weight, value: tensor([[-0.1307, -0.2721, -0.0888, ..., -0.0701, 0.1839, 0.1906], [-0.2501, -0.2186, -0.0829, ..., -0.2006, -0.2547, -0.1585], [-0.0674, -0.1922, 0.1604, ..., -0.2406, 0.2839, 0.1302], ..., [-0.1929, 0.1100, 0.0262, ..., 0.2236, -0.2457, -0.3149], [-0.3192, 0.0761, -0.1588, ..., 0.0711, -0.1280, -0.2406], [-0.0464, -0.1631, -0.0882, ..., -0.1470, -0.0623, -0.2390]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -9.8255e-08, -8.4285e-08], [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -9.3132e-10, ..., 0.0000e+00, -3.2596e-09, -4.6566e-10], ..., [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.3970e-09, 2.3283e-09, 0.0000e+00, ..., 0.0000e+00, -0.0000e+00, -2.7940e-09], [ 1.3970e-09, 2.3283e-09, 0.0000e+00, ..., 0.0000e+00, 1.2573e-08, 1.0710e-08]], device='cuda:0') Epoch 361, bias, value: tensor([-0.0170, -0.0355, -0.0024, -0.0134, -0.0322, 0.0015, 0.0272, -0.0190, 0.0498, -0.0056], device='cuda:0'), grad: tensor([-1.4808e-07, 8.5682e-08, -5.5879e-09, -1.6438e-07, 7.5549e-06, 1.6764e-07, 1.5274e-07, 5.3272e-07, 1.2619e-07, -8.2925e-06], device='cuda:0') 100 0.0001 changing lr epoch 360, time 250.61, cls_loss 0.0015 cls_loss_mapping 0.0014 cls_loss_causal 0.4910 re_mapping 0.0038 re_causal 0.0104 /// teacc 99.10 lr 0.00010000 Epoch 362, weight, value: tensor([[-0.1308, -0.2733, -0.0888, ..., -0.0706, 0.1841, 0.1909], [-0.2508, -0.2188, -0.0837, ..., -0.2008, -0.2547, -0.1583], [-0.0670, -0.1922, 0.1606, ..., -0.2407, 0.2840, 0.1300], ..., [-0.1930, 0.1106, 0.0262, ..., 0.2241, -0.2459, -0.3152], [-0.3203, 0.0760, -0.1595, ..., 0.0709, -0.1283, -0.2409], [-0.0465, -0.1641, -0.0882, ..., -0.1475, -0.0624, -0.2392]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 9.3132e-10], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -4.6566e-10, 0.0000e+00, ..., 0.0000e+00, -3.7253e-09, -2.3283e-09], ..., [ 0.0000e+00, -1.3970e-09, 0.0000e+00, ..., -1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 1.3970e-09, 9.3132e-10], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 1.3970e-09, 4.6566e-10, 0.0000e+00]], device='cuda:0') Epoch 362, bias, value: tensor([-0.0171, -0.0352, -0.0028, -0.0134, -0.0321, 0.0015, 0.0272, -0.0189, 0.0492, -0.0059], device='cuda:0'), grad: tensor([ 6.5193e-09, -2.8405e-08, -9.7789e-09, 3.7253e-09, 6.5193e-09, 1.8626e-09, 6.5193e-09, 2.3283e-09, 5.5879e-09, 1.8161e-08], device='cuda:0') 100 0.0001 changing lr epoch 361, time 250.51, cls_loss 0.0016 cls_loss_mapping 0.0021 cls_loss_causal 0.4686 re_mapping 0.0038 re_causal 0.0100 /// teacc 99.09 lr 0.00010000 Epoch 363, weight, value: tensor([[-0.1308, -0.2761, -0.0888, ..., -0.0717, 0.1846, 0.1915], [-0.2510, -0.2190, -0.0837, ..., -0.2010, -0.2548, -0.1585], [-0.0671, -0.1924, 0.1607, ..., -0.2408, 0.2841, 0.1301], ..., [-0.1936, 0.1117, 0.0262, ..., 0.2253, -0.2461, -0.3154], [-0.3207, 0.0759, -0.1595, ..., 0.0708, -0.1281, -0.2404], [-0.0464, -0.1655, -0.0882, ..., -0.1490, -0.0624, -0.2395]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 9.3132e-10], [ 0.0000e+00, 7.8697e-08, 0.0000e+00, ..., 1.4855e-07, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.5367e-08, 0.0000e+00, ..., 2.9802e-08, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -1.6578e-07, 0.0000e+00, ..., -3.1525e-07, 0.0000e+00, 0.0000e+00], [ 4.1910e-09, 1.3970e-09, 0.0000e+00, ..., 3.2596e-09, 4.1910e-09, 4.6566e-09], [ 0.0000e+00, 6.8452e-08, 0.0000e+00, ..., 1.3085e-07, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 363, bias, value: tensor([-0.0168, -0.0353, -0.0028, -0.0133, -0.0321, 0.0011, 0.0271, -0.0183, 0.0493, -0.0065], device='cuda:0'), grad: tensor([ 6.3796e-08, -9.9186e-08, 8.5216e-08, 1.3039e-08, 1.6205e-07, 7.9162e-09, -2.0955e-08, -6.9337e-07, 3.8184e-08, 4.5123e-07], device='cuda:0') 100 0.0001 changing lr epoch 362, time 250.65, cls_loss 0.0016 cls_loss_mapping 0.0019 cls_loss_causal 0.4497 re_mapping 0.0040 re_causal 0.0104 /// teacc 99.04 lr 0.00010000 Epoch 364, weight, value: tensor([[-0.1309, -0.2780, -0.0886, ..., -0.0727, 0.1846, 0.1915], [-0.2531, -0.2192, -0.0839, ..., -0.2013, -0.2548, -0.1593], [-0.0650, -0.1928, 0.1607, ..., -0.2409, 0.2841, 0.1307], ..., [-0.1942, 0.1119, 0.0262, ..., 0.2257, -0.2461, -0.3155], [-0.3212, 0.0759, -0.1596, ..., 0.0711, -0.1273, -0.2394], [-0.0469, -0.1657, -0.0882, ..., -0.1493, -0.0625, -0.2396]], device='cuda:0'), grad: tensor([[ 3.2596e-09, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, -6.6124e-08, -4.9360e-08], [ 5.1223e-09, 1.4575e-07, 0.0000e+00, ..., 4.0606e-07, 3.9116e-08, 2.3749e-08], [ 6.9849e-09, 1.3970e-08, 0.0000e+00, ..., 1.0710e-08, 9.3132e-09, 5.1223e-09], ..., [ 4.7497e-08, -6.4261e-08, 0.0000e+00, ..., -4.0093e-07, 0.0000e+00, 0.0000e+00], [ 7.5903e-08, 1.2200e-07, 0.0000e+00, ..., 1.8626e-08, -1.4435e-08, 2.9337e-08], [ 2.7474e-08, 4.6566e-08, 0.0000e+00, ..., 1.0710e-08, 3.2596e-09, -2.7940e-08]], device='cuda:0') Epoch 364, bias, value: tensor([-0.0169, -0.0357, -0.0020, -0.0133, -0.0324, 0.0042, 0.0238, -0.0182, 0.0501, -0.0065], device='cuda:0'), grad: tensor([ 1.8626e-07, 1.1101e-06, 8.6613e-08, -5.0701e-06, 1.1176e-08, 4.5672e-06, 4.1444e-08, -8.8196e-07, 5.6718e-07, -6.2119e-07], device='cuda:0') 100 0.0001 changing lr epoch 363, time 250.85, cls_loss 0.0016 cls_loss_mapping 0.0016 cls_loss_causal 0.4321 re_mapping 0.0041 re_causal 0.0102 /// teacc 99.05 lr 0.00010000 Epoch 365, weight, value: tensor([[-0.1310, -0.2816, -0.0886, ..., -0.0741, 0.1846, 0.1914], [-0.2534, -0.2195, -0.0839, ..., -0.2016, -0.2549, -0.1594], [-0.0649, -0.1931, 0.1608, ..., -0.2418, 0.2845, 0.1319], ..., [-0.1966, 0.1124, 0.0262, ..., 0.2262, -0.2466, -0.3160], [-0.3217, 0.0755, -0.1601, ..., 0.0711, -0.1279, -0.2402], [-0.0484, -0.1659, -0.0882, ..., -0.1494, -0.0628, -0.2407]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -4.6566e-10, 0.0000e+00, ..., -5.2154e-08, -6.4261e-08, -4.8429e-08], [ 0.0000e+00, 9.3132e-10, -0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -4.6566e-10, 0.0000e+00, ..., -4.6566e-10, -3.7253e-09, -1.3970e-09], ..., [ 0.0000e+00, -1.3970e-09, 0.0000e+00, ..., -2.3283e-09, 1.8626e-09, 9.3132e-10], [ 3.5390e-08, -1.3970e-09, 0.0000e+00, ..., 1.2759e-07, 1.3970e-09, 4.6566e-10], [ 0.0000e+00, 9.3132e-10, -0.0000e+00, ..., 1.8626e-09, 1.3970e-09, 9.3132e-10]], device='cuda:0') Epoch 365, bias, value: tensor([-0.0172, -0.0358, -0.0019, -0.0132, -0.0320, 0.0042, 0.0237, -0.0181, 0.0497, -0.0066], device='cuda:0'), grad: tensor([-1.4994e-07, -4.6566e-10, -9.7789e-09, 9.3132e-09, 3.2596e-09, -8.8476e-08, -1.3970e-09, -9.3132e-10, 2.3376e-07, 3.2596e-09], device='cuda:0') 100 0.0001 changing lr epoch 364, time 250.81, cls_loss 0.0013 cls_loss_mapping 0.0012 cls_loss_causal 0.4638 re_mapping 0.0039 re_causal 0.0104 /// teacc 99.02 lr 0.00010000 Epoch 366, weight, value: tensor([[-0.1310, -0.2819, -0.0887, ..., -0.0741, 0.1848, 0.1917], [-0.2534, -0.2199, -0.0841, ..., -0.2024, -0.2550, -0.1595], [-0.0650, -0.1934, 0.1608, ..., -0.2421, 0.2847, 0.1320], ..., [-0.1972, 0.1127, 0.0262, ..., 0.2267, -0.2471, -0.3163], [-0.3221, 0.0754, -0.1603, ..., 0.0712, -0.1278, -0.2404], [-0.0478, -0.1659, -0.0882, ..., -0.1495, -0.0629, -0.2409]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 2.7940e-09, 0.0000e+00, ..., -5.3085e-08, -3.6368e-07, -2.1746e-07], [ 9.3132e-10, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, -0.0000e+00, 4.6566e-10], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 2.3283e-09, 9.3132e-10], ..., [ 9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 9.3132e-10, 0.0000e+00], [ 1.3970e-08, 1.4435e-08, 4.6566e-10, ..., 8.8476e-09, 3.2596e-09, 3.2596e-09], [ 4.1910e-09, -3.3528e-08, -1.3970e-09, ..., 3.2596e-09, -3.2596e-09, -5.1223e-09]], device='cuda:0') Epoch 366, bias, value: tensor([-0.0171, -0.0362, -0.0017, -0.0132, -0.0322, 0.0041, 0.0237, -0.0179, 0.0500, -0.0064], device='cuda:0'), grad: tensor([-6.8173e-07, -2.1886e-08, 1.0245e-08, 3.1292e-07, 1.2107e-08, 4.7032e-08, 4.4936e-07, 2.4680e-08, 1.7229e-07, -3.1246e-07], device='cuda:0') 100 0.0001 changing lr epoch 365, time 250.80, cls_loss 0.0012 cls_loss_mapping 0.0012 cls_loss_causal 0.4917 re_mapping 0.0039 re_causal 0.0109 /// teacc 99.08 lr 0.00010000 Epoch 367, weight, value: tensor([[-0.1311, -0.2821, -0.0887, ..., -0.0741, 0.1848, 0.1919], [-0.2534, -0.2200, -0.0841, ..., -0.2025, -0.2550, -0.1595], [-0.0651, -0.1944, 0.1608, ..., -0.2434, 0.2847, 0.1321], ..., [-0.1974, 0.1130, 0.0262, ..., 0.2269, -0.2477, -0.3172], [-0.3226, 0.0750, -0.1603, ..., 0.0709, -0.1281, -0.2408], [-0.0480, -0.1660, -0.0882, ..., -0.1493, -0.0630, -0.2414]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 3.7253e-09, -4.6566e-09, -1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.0268e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -4.6566e-10, 0.0000e+00, ..., 8.0280e-07, -1.3970e-09, -4.6566e-10], ..., [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 3.2596e-09, 1.3970e-09, 4.6566e-10], [ 3.7253e-09, 4.6566e-10, 0.0000e+00, ..., -4.6119e-06, 1.3970e-09, 9.3132e-10], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 1.8626e-09, 6.0536e-09, 3.2596e-09]], device='cuda:0') Epoch 367, bias, value: tensor([-0.0171, -0.0362, -0.0018, -0.0131, -0.0324, 0.0040, 0.0239, -0.0183, 0.0496, -0.0056], device='cuda:0'), grad: tensor([ 2.7940e-09, -3.8650e-08, 2.3097e-06, 6.5193e-09, 2.6543e-08, 9.6187e-06, 8.5123e-07, 5.4948e-08, -1.2830e-05, 9.3132e-09], device='cuda:0') 100 0.0001 changing lr epoch 366, time 250.82, cls_loss 0.0017 cls_loss_mapping 0.0013 cls_loss_causal 0.4533 re_mapping 0.0037 re_causal 0.0097 /// teacc 98.97 lr 0.00010000 Epoch 368, weight, value: tensor([[-0.1312, -0.2824, -0.0887, ..., -0.0741, 0.1848, 0.1919], [-0.2562, -0.2202, -0.0842, ..., -0.2026, -0.2551, -0.1597], [-0.0637, -0.1944, 0.1608, ..., -0.2433, 0.2852, 0.1326], ..., [-0.1982, 0.1132, 0.0262, ..., 0.2270, -0.2486, -0.3183], [-0.3233, 0.0749, -0.1604, ..., 0.0709, -0.1286, -0.2414], [-0.0489, -0.1663, -0.0882, ..., -0.1494, -0.0631, -0.2420]], device='cuda:0'), grad: tensor([[ 1.4901e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.7183e-07, 9.7323e-08], [ 0.0000e+00, 4.1910e-09, 0.0000e+00, ..., 6.5193e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 1.3970e-09, 2.7940e-09, 1.3970e-09], ..., [ 0.0000e+00, -8.8476e-09, 0.0000e+00, ..., -1.4901e-08, 1.3970e-09, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., -4.6566e-10, 9.3132e-10, 4.6566e-10], [ 0.0000e+00, 3.7253e-09, 0.0000e+00, ..., 6.5193e-09, 9.3132e-10, 4.6566e-10]], device='cuda:0') Epoch 368, bias, value: tensor([-0.0172, -0.0360, -0.0011, -0.0132, -0.0324, 0.0045, 0.0236, -0.0189, 0.0494, -0.0055], device='cuda:0'), grad: tensor([ 3.3295e-07, -3.0734e-08, 8.8476e-09, 5.1223e-09, -2.2352e-08, 2.7940e-09, -3.0408e-07, -1.2107e-08, 1.3970e-09, 2.4680e-08], device='cuda:0') 100 0.0001 changing lr epoch 367, time 250.95, cls_loss 0.0017 cls_loss_mapping 0.0016 cls_loss_causal 0.4845 re_mapping 0.0036 re_causal 0.0098 /// teacc 99.00 lr 0.00010000 Epoch 369, weight, value: tensor([[-0.1313, -0.2835, -0.0887, ..., -0.0751, 0.1842, 0.1921], [-0.2565, -0.2203, -0.0842, ..., -0.2026, -0.2552, -0.1603], [-0.0635, -0.1943, 0.1608, ..., -0.2434, 0.2858, 0.1333], ..., [-0.1988, 0.1136, 0.0262, ..., 0.2273, -0.2496, -0.3192], [-0.3237, 0.0749, -0.1604, ..., 0.0725, -0.1238, -0.2386], [-0.0491, -0.1668, -0.0882, ..., -0.1497, -0.0624, -0.2424]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, -1.3970e-09, -9.3132e-10], [ 0.0000e+00, 1.3392e-06, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.4249e-07, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 1.8626e-09, -1.5963e-06, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 4.6566e-10, 0.0000e+00, ..., -3.3528e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.2596e-09, 0.0000e+00, ..., 6.0536e-09, 1.8626e-09, 9.3132e-10]], device='cuda:0') Epoch 369, bias, value: tensor([-0.0187, -0.0360, -0.0007, -0.0133, -0.0323, 0.0041, 0.0230, -0.0188, 0.0530, -0.0054], device='cuda:0'), grad: tensor([ 1.1642e-08, 1.2740e-05, 1.3625e-06, 1.0543e-06, 3.2596e-09, 6.9384e-08, 1.4901e-08, -1.5177e-05, -2.8778e-07, 1.9837e-07], device='cuda:0') 100 0.0001 changing lr epoch 368, time 250.81, cls_loss 0.0017 cls_loss_mapping 0.0022 cls_loss_causal 0.4749 re_mapping 0.0038 re_causal 0.0101 /// teacc 98.95 lr 0.00010000 Epoch 370, weight, value: tensor([[-0.1313, -0.2851, -0.0887, ..., -0.0761, 0.1848, 0.1929], [-0.2567, -0.2224, -0.0798, ..., -0.2016, -0.2553, -0.1604], [-0.0636, -0.1952, 0.1608, ..., -0.2442, 0.2857, 0.1329], ..., [-0.1987, 0.1180, 0.0258, ..., 0.2302, -0.2498, -0.3194], [-0.3259, 0.0748, -0.1604, ..., 0.0723, -0.1240, -0.2389], [-0.0494, -0.1706, -0.0883, ..., -0.1535, -0.0628, -0.2438]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -4.6566e-10, 0.0000e+00, ..., 0.0000e+00, -5.7276e-08, -4.1444e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.9849e-09, 4.6566e-09], ..., [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 3.2596e-09, 4.6566e-10, 0.0000e+00, ..., 6.5193e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 5.0757e-08, 3.7253e-08]], device='cuda:0') Epoch 370, bias, value: tensor([-0.0184, -0.0353, -0.0009, -0.0135, -0.0324, 0.0036, 0.0235, -0.0163, 0.0526, -0.0088], device='cuda:0'), grad: tensor([-1.1828e-07, -5.0943e-07, 3.6787e-08, 4.6566e-10, -5.5414e-08, -1.3504e-08, 6.9849e-09, 2.2026e-07, 1.2759e-07, 3.1758e-07], device='cuda:0') 100 0.0001 changing lr epoch 369, time 250.51, cls_loss 0.0012 cls_loss_mapping 0.0013 cls_loss_causal 0.4410 re_mapping 0.0038 re_causal 0.0103 /// teacc 99.06 lr 0.00010000 Epoch 371, weight, value: tensor([[-0.1314, -0.2858, -0.0887, ..., -0.0770, 0.1851, 0.1932], [-0.2567, -0.2239, -0.0804, ..., -0.2032, -0.2553, -0.1605], [-0.0636, -0.1954, 0.1608, ..., -0.2447, 0.2858, 0.1330], ..., [-0.1989, 0.1193, 0.0259, ..., 0.2314, -0.2503, -0.3200], [-0.3274, 0.0748, -0.1605, ..., 0.0726, -0.1238, -0.2395], [-0.0498, -0.1711, -0.0883, ..., -0.1540, -0.0629, -0.2443]], device='cuda:0'), grad: tensor([[ 4.6566e-10, -4.6566e-10, 0.0000e+00, ..., 9.3132e-10, -3.2596e-08, -2.6077e-08], [ 1.8626e-09, 4.6566e-10, 0.0000e+00, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], [ 1.3970e-09, 4.6566e-10, 0.0000e+00, ..., 2.3283e-09, 5.1223e-09, 3.7253e-09], ..., [ 3.7253e-09, 4.6566e-10, 0.0000e+00, ..., 6.0536e-09, 0.0000e+00, 0.0000e+00], [-2.6748e-06, -1.9744e-07, 0.0000e+00, ..., -4.3772e-06, 7.9162e-09, 6.5193e-09], [ 9.3132e-10, 4.6566e-10, 0.0000e+00, ..., 1.3970e-09, 6.0536e-09, 4.6566e-09]], device='cuda:0') Epoch 371, bias, value: tensor([-0.0183, -0.0358, -0.0009, -0.0137, -0.0323, 0.0035, 0.0236, -0.0156, 0.0528, -0.0091], device='cuda:0'), grad: tensor([-6.1467e-08, -2.3805e-06, 1.3327e-06, 1.2247e-07, -1.4901e-07, 5.2378e-06, 9.6858e-06, 1.1409e-06, -1.4968e-05, 1.9558e-08], device='cuda:0') 100 0.0001 changing lr epoch 370, time 250.28, cls_loss 0.0015 cls_loss_mapping 0.0015 cls_loss_causal 0.4550 re_mapping 0.0040 re_causal 0.0102 /// teacc 99.03 lr 0.00010000 Epoch 372, weight, value: tensor([[-0.1314, -0.2856, -0.0887, ..., -0.0773, 0.1862, 0.1946], [-0.2567, -0.2239, -0.0804, ..., -0.2033, -0.2554, -0.1607], [-0.0636, -0.1965, 0.1608, ..., -0.2457, 0.2859, 0.1329], ..., [-0.1997, 0.1195, 0.0259, ..., 0.2317, -0.2505, -0.3202], [-0.3273, 0.0745, -0.1629, ..., 0.0727, -0.1238, -0.2396], [-0.0508, -0.1712, -0.0882, ..., -0.1542, -0.0631, -0.2449]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.7229e-08, -1.5367e-08], [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 4.6566e-10], ..., [ 4.6566e-09, 1.3970e-08, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 2.3283e-09, 4.6566e-10, 0.0000e+00, ..., 4.6566e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.7695e-08, 1.5367e-08]], device='cuda:0') Epoch 372, bias, value: tensor([-0.0175, -0.0366, -0.0010, -0.0136, -0.0324, 0.0032, 0.0235, -0.0155, 0.0528, -0.0084], device='cuda:0'), grad: tensor([-4.0978e-08, 9.3132e-10, 2.7940e-09, -2.7940e-08, -2.7940e-09, -8.8476e-09, 2.7940e-09, 1.8161e-08, 8.8476e-09, 4.4703e-08], device='cuda:0') 100 0.0001 changing lr epoch 371, time 250.45, cls_loss 0.0017 cls_loss_mapping 0.0016 cls_loss_causal 0.4816 re_mapping 0.0039 re_causal 0.0100 /// teacc 98.95 lr 0.00010000 Epoch 373, weight, value: tensor([[-0.1317, -0.2862, -0.0887, ..., -0.0780, 0.1865, 0.1949], [-0.2575, -0.2241, -0.0804, ..., -0.2034, -0.2556, -0.1617], [-0.0637, -0.1993, 0.1608, ..., -0.2493, 0.2856, 0.1321], ..., [-0.2004, 0.1196, 0.0259, ..., 0.2323, -0.2507, -0.3193], [-0.3284, 0.0744, -0.1629, ..., 0.0729, -0.1239, -0.2398], [-0.0509, -0.1713, -0.0882, ..., -0.1544, -0.0632, -0.2452]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.1642e-08, -7.9162e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-09, 1.3970e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 4.6566e-10, 4.6566e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 0.0000e+00], [ 2.3283e-09, 4.6566e-10, 0.0000e+00, ..., 2.3283e-09, 4.6566e-09, 3.2596e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.7940e-09, 1.8626e-09]], device='cuda:0') Epoch 373, bias, value: tensor([-0.0174, -0.0364, -0.0024, -0.0131, -0.0329, 0.0030, 0.0237, -0.0155, 0.0529, -0.0082], device='cuda:0'), grad: tensor([-2.0489e-08, 1.0245e-08, 1.8626e-09, 4.1910e-09, -6.6124e-08, 7.0315e-08, -6.6590e-08, 2.3283e-09, 1.1642e-08, 5.6345e-08], device='cuda:0') 100 0.0001 changing lr epoch 372, time 250.31, cls_loss 0.0014 cls_loss_mapping 0.0017 cls_loss_causal 0.4776 re_mapping 0.0038 re_causal 0.0102 /// teacc 99.02 lr 0.00010000 Epoch 374, weight, value: tensor([[-0.1322, -0.2844, -0.0887, ..., -0.0782, 0.1866, 0.1953], [-0.2580, -0.2242, -0.0804, ..., -0.2036, -0.2557, -0.1623], [-0.0634, -0.1994, 0.1608, ..., -0.2495, 0.2857, 0.1321], ..., [-0.2006, 0.1197, 0.0259, ..., 0.2326, -0.2516, -0.3195], [-0.3292, 0.0739, -0.1629, ..., 0.0728, -0.1235, -0.2397], [-0.0489, -0.1713, -0.0882, ..., -0.1545, -0.0633, -0.2454]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -3.6322e-08, -2.4214e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.6566e-10, -1.3970e-09, -4.6566e-10], ..., [ 0.0000e+00, -1.8626e-09, 0.0000e+00, ..., -1.3970e-09, 1.3970e-09, 4.6566e-10], [ 6.9849e-09, 9.3132e-10, 0.0000e+00, ..., -3.1665e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-09, 1.3970e-09]], device='cuda:0') Epoch 374, bias, value: tensor([-0.0174, -0.0365, -0.0022, -0.0134, -0.0324, 0.0030, 0.0238, -0.0156, 0.0533, -0.0084], device='cuda:0'), grad: tensor([-6.2399e-08, 5.8673e-08, 3.6322e-08, 1.3504e-08, -2.0070e-07, 4.6566e-08, 6.8918e-08, 9.3132e-09, -4.9826e-08, 9.3132e-08], device='cuda:0') 100 0.0001 changing lr epoch 373, time 250.19, cls_loss 0.0018 cls_loss_mapping 0.0014 cls_loss_causal 0.4658 re_mapping 0.0036 re_causal 0.0095 /// teacc 99.09 lr 0.00010000 Epoch 375, weight, value: tensor([[-0.1323, -0.2841, -0.0887, ..., -0.0772, 0.1868, 0.1955], [-0.2594, -0.2243, -0.0804, ..., -0.2039, -0.2558, -0.1624], [-0.0637, -0.2012, 0.1608, ..., -0.2517, 0.2862, 0.1330], ..., [-0.2027, 0.1201, 0.0259, ..., 0.2334, -0.2525, -0.3206], [-0.3273, 0.0729, -0.1629, ..., 0.0732, -0.1234, -0.2399], [-0.0494, -0.1714, -0.0882, ..., -0.1546, -0.0634, -0.2458]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.7940e-09, 0.0000e+00, ..., 4.1910e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], ..., [ 4.6566e-10, -5.5879e-09, 0.0000e+00, ..., -1.2107e-08, 0.0000e+00, 0.0000e+00], [ 2.3283e-09, 7.4506e-09, 0.0000e+00, ..., 2.3283e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 3.7253e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 375, bias, value: tensor([-0.0173, -0.0363, -0.0029, -0.0124, -0.0325, 0.0022, 0.0237, -0.0158, 0.0554, -0.0085], device='cuda:0'), grad: tensor([ 1.3970e-09, 1.1176e-08, 1.8626e-09, -5.1223e-09, 4.1910e-09, 1.0710e-08, 4.6566e-10, -2.5611e-08, -1.2107e-08, 2.0023e-08], device='cuda:0') 100 0.0001 changing lr epoch 374, time 250.26, cls_loss 0.0016 cls_loss_mapping 0.0013 cls_loss_causal 0.4651 re_mapping 0.0037 re_causal 0.0101 /// teacc 99.00 lr 0.00010000 Epoch 376, weight, value: tensor([[-0.1326, -0.2842, -0.0887, ..., -0.0770, 0.1895, 0.1983], [-0.2595, -0.2246, -0.0784, ..., -0.2041, -0.2560, -0.1625], [-0.0638, -0.2017, 0.1609, ..., -0.2525, 0.2864, 0.1330], ..., [-0.2033, 0.1204, 0.0252, ..., 0.2338, -0.2531, -0.3209], [-0.3280, 0.0723, -0.1630, ..., 0.0729, -0.1236, -0.2401], [-0.0513, -0.1714, -0.0882, ..., -0.1547, -0.0666, -0.2506]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 7.4506e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 6.2399e-08, 0.0000e+00, ..., 2.0862e-07, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 1.9912e-06, 0.0000e+00, ..., 6.6012e-06, 9.7789e-09, 4.6566e-09], ..., [ 0.0000e+00, -2.0564e-06, 0.0000e+00, ..., -6.8285e-06, -1.0245e-08, -5.1223e-09], [ 4.6566e-10, 3.7253e-09, 0.0000e+00, ..., 6.5193e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, -4.6566e-10, ..., 6.0536e-09, 4.6566e-10, 4.6566e-10]], device='cuda:0') Epoch 376, bias, value: tensor([-0.0151, -0.0364, -0.0028, -0.0125, -0.0328, 0.0022, 0.0240, -0.0157, 0.0551, -0.0094], device='cuda:0'), grad: tensor([ 1.6298e-08, 4.2003e-07, 1.3255e-05, -4.6566e-09, -2.7940e-08, 1.3970e-09, 9.3132e-10, -1.3687e-05, 1.7229e-08, 2.7474e-08], device='cuda:0') 100 0.0001 changing lr epoch 375, time 250.73, cls_loss 0.0015 cls_loss_mapping 0.0016 cls_loss_causal 0.4958 re_mapping 0.0038 re_causal 0.0104 /// teacc 98.99 lr 0.00010000 Epoch 377, weight, value: tensor([[-0.1341, -0.2843, -0.0888, ..., -0.0771, 0.1897, 0.1984], [-0.2597, -0.2248, -0.0776, ..., -0.2045, -0.2561, -0.1623], [-0.0655, -0.2036, 0.1608, ..., -0.2540, 0.2863, 0.1323], ..., [-0.2039, 0.1213, 0.0249, ..., 0.2349, -0.2537, -0.3212], [-0.3279, 0.0699, -0.1630, ..., 0.0721, -0.1233, -0.2401], [-0.0513, -0.1714, -0.0882, ..., -0.1548, -0.0671, -0.2511]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 4.6566e-10, 0.0000e+00, ..., 4.1910e-09, -4.6566e-10, -1.8626e-09], [ 5.3551e-09, 2.5844e-08, 0.0000e+00, ..., 1.2107e-07, 2.8638e-08, 0.0000e+00], [ 2.3283e-10, 1.0477e-08, 0.0000e+00, ..., 1.6065e-08, -7.9162e-09, -3.9581e-09], ..., [ 6.9849e-10, -4.4005e-08, 0.0000e+00, ..., -9.1735e-08, 1.1642e-09, 2.3283e-10], [ 6.2166e-08, 2.5611e-09, 0.0000e+00, ..., -1.4016e-07, -7.3342e-08, 4.1910e-09], [ 2.0955e-09, 6.9849e-09, 0.0000e+00, ..., 1.3271e-08, 4.8894e-09, 3.2596e-09]], device='cuda:0') Epoch 377, bias, value: tensor([-0.0151, -0.0364, -0.0034, -0.0127, -0.0349, 0.0020, 0.0244, -0.0154, 0.0546, -0.0086], device='cuda:0'), grad: tensor([ 2.1420e-08, 5.2061e-07, 2.9337e-08, 4.1281e-07, -3.7486e-08, 3.5856e-08, 7.6601e-08, -2.1257e-07, -9.3039e-07, 9.4064e-08], device='cuda:0') 100 0.0001 changing lr epoch 376, time 250.65, cls_loss 0.0013 cls_loss_mapping 0.0013 cls_loss_causal 0.4490 re_mapping 0.0036 re_causal 0.0098 /// teacc 99.00 lr 0.00010000 Epoch 378, weight, value: tensor([[-0.1341, -0.2845, -0.0888, ..., -0.0775, 0.1897, 0.1985], [-0.2601, -0.2262, -0.0774, ..., -0.2057, -0.2566, -0.1625], [-0.0656, -0.2045, 0.1608, ..., -0.2570, 0.2866, 0.1322], ..., [-0.2041, 0.1221, 0.0249, ..., 0.2357, -0.2521, -0.3210], [-0.3299, 0.0698, -0.1630, ..., 0.0721, -0.1237, -0.2406], [-0.0513, -0.1715, -0.0882, ..., -0.1549, -0.0670, -0.2511]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 3.2596e-09, 1.5600e-08, 5.1223e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.0745e-08, 1.0012e-08, -9.3132e-10], [ 0.0000e+00, 3.2596e-09, 0.0000e+00, ..., -8.6147e-09, -1.8440e-07, -1.1292e-07], ..., [ 0.0000e+00, -4.1910e-09, 0.0000e+00, ..., 1.6997e-08, 1.9092e-08, 1.5832e-08], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 6.0536e-09, 5.4948e-08, 3.3993e-08], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 1.7462e-08, 9.7789e-09, 9.7789e-09]], device='cuda:0') Epoch 378, bias, value: tensor([-0.0152, -0.0371, -0.0036, -0.0131, -0.0345, 0.0020, 0.0248, -0.0149, 0.0546, -0.0088], device='cuda:0'), grad: tensor([ 8.8708e-08, -3.3295e-07, -7.2457e-07, 6.5193e-09, 2.3446e-07, 3.1199e-08, 6.2631e-08, 2.8801e-07, 2.2841e-07, 1.2922e-07], device='cuda:0') 100 0.0001 changing lr epoch 377, time 250.60, cls_loss 0.0014 cls_loss_mapping 0.0010 cls_loss_causal 0.4627 re_mapping 0.0036 re_causal 0.0099 /// teacc 99.03 lr 0.00010000 Epoch 379, weight, value: tensor([[-0.1343, -0.2848, -0.0888, ..., -0.0775, 0.1902, 0.1992], [-0.2602, -0.2263, -0.0769, ..., -0.2049, -0.2566, -0.1627], [-0.0656, -0.2047, 0.1608, ..., -0.2574, 0.2865, 0.1318], ..., [-0.2047, 0.1223, 0.0248, ..., 0.2358, -0.2526, -0.3217], [-0.3306, 0.0696, -0.1630, ..., 0.0727, -0.1240, -0.2408], [-0.0515, -0.1716, -0.0883, ..., -0.1553, -0.0672, -0.2514]], device='cuda:0'), grad: tensor([[ 1.1642e-09, 6.9849e-10, 0.0000e+00, ..., 6.9849e-10, -9.0804e-09, -5.8208e-09], [ 9.3132e-10, 2.6310e-08, 0.0000e+00, ..., 7.3807e-08, 1.1642e-09, 4.6566e-09], [-4.6566e-10, 5.3318e-08, 0.0000e+00, ..., 1.5623e-07, -3.4925e-09, 8.3819e-09], ..., [ 3.7253e-09, -7.6601e-08, 0.0000e+00, ..., -2.3423e-07, 3.2596e-09, -1.2806e-08], [ 6.9849e-10, 9.3132e-10, 0.0000e+00, ..., -0.0000e+00, -6.9849e-10, 2.3283e-10], [ 2.0955e-09, 3.2596e-09, 0.0000e+00, ..., 2.0955e-09, 5.3551e-09, 3.2596e-09]], device='cuda:0') Epoch 379, bias, value: tensor([-0.0149, -0.0362, -0.0037, -0.0132, -0.0344, 0.0017, 0.0249, -0.0154, 0.0549, -0.0090], device='cuda:0'), grad: tensor([ 1.1874e-08, 1.6205e-07, 3.2200e-07, -2.0955e-08, 1.0221e-07, 2.7940e-09, 6.2864e-09, -4.6892e-07, 2.5146e-08, -1.3434e-07], device='cuda:0') 100 0.0001 changing lr epoch 378, time 250.53, cls_loss 0.0013 cls_loss_mapping 0.0019 cls_loss_causal 0.4871 re_mapping 0.0039 re_causal 0.0103 /// teacc 99.10 lr 0.00010000 Epoch 380, weight, value: tensor([[-0.1354, -0.2849, -0.0888, ..., -0.0777, 0.1903, 0.1992], [-0.2604, -0.2267, -0.0767, ..., -0.2065, -0.2567, -0.1627], [-0.0657, -0.2049, 0.1608, ..., -0.2577, 0.2868, 0.1324], ..., [-0.2053, 0.1226, 0.0247, ..., 0.2368, -0.2535, -0.3239], [-0.3342, 0.0694, -0.1630, ..., 0.0710, -0.1255, -0.2413], [-0.0515, -0.1716, -0.0883, ..., -0.1554, -0.0676, -0.2517]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -3.8417e-08, -2.4913e-08], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 2.3283e-10, 6.9849e-10, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 2.3283e-10, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 4.1910e-09, 2.3283e-10, 0.0000e+00, ..., 1.4901e-08, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 7.6834e-09, ..., 8.6147e-09, 2.0955e-09, 1.3970e-09]], device='cuda:0') Epoch 380, bias, value: tensor([-0.0148, -0.0366, -0.0039, -0.0129, -0.0344, 0.0027, 0.0245, -0.0151, 0.0533, -0.0090], device='cuda:0'), grad: tensor([-5.5879e-08, 2.0955e-09, 4.6566e-10, 4.6566e-10, -5.6345e-08, -6.9616e-08, 9.9884e-08, 3.0268e-09, 2.5844e-08, 5.7509e-08], device='cuda:0') 100 0.0001 changing lr epoch 379, time 250.61, cls_loss 0.0014 cls_loss_mapping 0.0010 cls_loss_causal 0.4825 re_mapping 0.0038 re_causal 0.0102 /// teacc 99.04 lr 0.00010000 Epoch 381, weight, value: tensor([[-0.1329, -0.2851, -0.0888, ..., -0.0780, 0.1923, 0.2018], [-0.2605, -0.2269, -0.0767, ..., -0.2067, -0.2568, -0.1633], [-0.0658, -0.2081, 0.1608, ..., -0.2612, 0.2867, 0.1314], ..., [-0.2060, 0.1233, 0.0247, ..., 0.2382, -0.2528, -0.3219], [-0.3351, 0.0682, -0.1630, ..., 0.0700, -0.1259, -0.2416], [-0.0514, -0.1716, -0.0883, ..., -0.1556, -0.0672, -0.2518]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 6.9849e-10, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -3.7253e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 381, bias, value: tensor([-0.0133, -0.0368, -0.0049, -0.0129, -0.0346, 0.0031, 0.0229, -0.0145, 0.0524, -0.0087], device='cuda:0'), grad: tensor([ 4.6566e-10, 1.8626e-09, 4.6566e-10, 2.5611e-09, 4.4238e-09, 4.6566e-09, 6.9849e-10, 1.2573e-08, -6.2864e-09, -1.3737e-08], device='cuda:0') 100 0.0001 changing lr epoch 380, time 250.41, cls_loss 0.0014 cls_loss_mapping 0.0013 cls_loss_causal 0.4781 re_mapping 0.0039 re_causal 0.0104 /// teacc 99.00 lr 0.00010000 Epoch 382, weight, value: tensor([[-0.1331, -0.2857, -0.0888, ..., -0.0785, 0.1928, 0.2026], [-0.2605, -0.2270, -0.0767, ..., -0.2068, -0.2569, -0.1640], [-0.0659, -0.2082, 0.1608, ..., -0.2617, 0.2868, 0.1311], ..., [-0.2063, 0.1233, 0.0247, ..., 0.2386, -0.2537, -0.3227], [-0.3356, 0.0680, -0.1630, ..., 0.0699, -0.1263, -0.2422], [-0.0523, -0.1718, -0.0883, ..., -0.1559, -0.0674, -0.2522]], device='cuda:0'), grad: tensor([[-9.0804e-09, 6.9849e-10, 0.0000e+00, ..., 5.1223e-09, -1.3085e-07, -8.6613e-08], [ 2.3283e-10, 1.6065e-08, 0.0000e+00, ..., 1.1735e-07, 4.6566e-10, 4.6566e-10], [ 0.0000e+00, 4.6333e-08, 0.0000e+00, ..., 3.4133e-07, 2.3283e-10, 2.3283e-10], ..., [ 2.3283e-10, -1.8021e-07, 0.0000e+00, ..., -1.3141e-06, 0.0000e+00, 0.0000e+00], [ 1.6298e-09, 6.0536e-09, 0.0000e+00, ..., 2.8405e-08, 4.6566e-10, 6.9849e-10], [ 2.3283e-10, 1.1479e-07, 0.0000e+00, ..., 8.2050e-07, 9.3132e-10, 6.9849e-10]], device='cuda:0') Epoch 382, bias, value: tensor([-0.0128, -0.0368, -0.0050, -0.0135, -0.0343, 0.0034, 0.0227, -0.0145, 0.0522, -0.0090], device='cuda:0'), grad: tensor([-1.6345e-07, 2.8289e-07, 8.1584e-07, -1.4203e-08, 8.3819e-09, 4.8894e-08, 1.1595e-07, -3.1237e-06, 7.7765e-08, 1.9558e-06], device='cuda:0') 100 0.0001 changing lr epoch 381, time 250.39, cls_loss 0.0014 cls_loss_mapping 0.0017 cls_loss_causal 0.4738 re_mapping 0.0038 re_causal 0.0102 /// teacc 98.96 lr 0.00010000 Epoch 383, weight, value: tensor([[-0.1332, -0.2858, -0.0888, ..., -0.0786, 0.1928, 0.2026], [-0.2606, -0.2270, -0.0767, ..., -0.2070, -0.2569, -0.1641], [-0.0662, -0.2090, 0.1608, ..., -0.2622, 0.2869, 0.1312], ..., [-0.2044, 0.1235, 0.0247, ..., 0.2394, -0.2538, -0.3228], [-0.3364, 0.0680, -0.1631, ..., 0.0697, -0.1264, -0.2423], [-0.0517, -0.1720, -0.0883, ..., -0.1564, -0.0674, -0.2523]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, -1.6764e-08, -1.0245e-08], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 2.5611e-09, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.3970e-09, 1.1642e-09, 6.9849e-10], ..., [ 0.0000e+00, 1.6103e-06, 0.0000e+00, ..., 3.6806e-06, 2.3283e-10, 2.3283e-10], [ 2.3283e-10, -1.6727e-06, 0.0000e+00, ..., -3.8296e-06, 2.3283e-10, 2.3283e-10], [ 0.0000e+00, 6.9849e-10, 0.0000e+00, ..., 1.8626e-09, 1.1176e-08, 6.7521e-09]], device='cuda:0') Epoch 383, bias, value: tensor([-0.0129, -0.0369, -0.0051, -0.0134, -0.0349, 0.0009, 0.0252, -0.0143, 0.0521, -0.0089], device='cuda:0'), grad: tensor([-2.9104e-08, 6.0536e-09, 4.8894e-09, 1.9628e-07, -1.0384e-07, 1.7928e-08, -2.0955e-09, 5.9642e-06, -6.1654e-06, 9.5693e-08], device='cuda:0') 100 0.0001 changing lr epoch 382, time 250.37, cls_loss 0.0022 cls_loss_mapping 0.0018 cls_loss_causal 0.4499 re_mapping 0.0039 re_causal 0.0096 /// teacc 98.97 lr 0.00010000 Epoch 384, weight, value: tensor([[-0.1334, -0.2866, -0.0888, ..., -0.0799, 0.1929, 0.2027], [-0.2609, -0.2283, -0.0757, ..., -0.2074, -0.2573, -0.1645], [-0.0663, -0.2091, 0.1610, ..., -0.2622, 0.2875, 0.1316], ..., [-0.2050, 0.1242, 0.0247, ..., 0.2402, -0.2548, -0.3232], [-0.3370, 0.0696, -0.1631, ..., 0.0701, -0.1266, -0.2405], [-0.0522, -0.1728, -0.0887, ..., -0.1571, -0.0677, -0.2535]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, -1.1409e-08, -6.7521e-09], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 4.6566e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 3.9581e-09, 2.3283e-10, 0.0000e+00], ..., [ 2.3283e-10, 1.6298e-09, 0.0000e+00, ..., 2.3283e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 2.3283e-10, 0.0000e+00, ..., -5.4017e-07, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.0804e-07, 9.7789e-09, 5.8208e-09]], device='cuda:0') Epoch 384, bias, value: tensor([-0.0131, -0.0377, -0.0048, -0.0128, -0.0346, 0.0006, 0.0254, -0.0136, 0.0552, -0.0112], device='cuda:0'), grad: tensor([-1.3737e-08, 3.0501e-08, 2.1420e-08, 6.5193e-09, 1.0245e-08, 5.7509e-08, 2.1886e-08, 1.7695e-08, -3.7737e-06, 3.6303e-06], device='cuda:0') 100 0.0001 changing lr epoch 383, time 250.59, cls_loss 0.0017 cls_loss_mapping 0.0018 cls_loss_causal 0.4628 re_mapping 0.0038 re_causal 0.0099 /// teacc 98.99 lr 0.00010000 Epoch 385, weight, value: tensor([[-0.1336, -0.2868, -0.0888, ..., -0.0831, 0.1935, 0.2034], [-0.2610, -0.2284, -0.0725, ..., -0.2047, -0.2574, -0.1647], [-0.0663, -0.2092, 0.1617, ..., -0.2624, 0.2884, 0.1324], ..., [-0.2057, 0.1243, 0.0215, ..., 0.2376, -0.2551, -0.3237], [-0.3412, 0.0695, -0.1632, ..., 0.0715, -0.1248, -0.2425], [-0.0531, -0.1728, -0.0887, ..., -0.1574, -0.0687, -0.2548]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -7.4971e-08, -4.9593e-08], [ 6.9849e-10, 1.5832e-08, 0.0000e+00, ..., 1.5064e-07, 4.4238e-09, 2.7940e-09], [ 0.0000e+00, -2.3283e-10, 0.0000e+00, ..., -5.1223e-09, -5.3551e-09, -2.3283e-09], ..., [ 6.9849e-10, -1.7695e-08, 0.0000e+00, ..., -1.7346e-07, 1.6298e-09, 9.3132e-10], [ 3.5623e-08, 2.3283e-10, 0.0000e+00, ..., 4.1677e-08, 9.3132e-09, 5.3551e-09], [-9.1270e-08, 1.6298e-09, -2.3283e-10, ..., 1.6531e-08, 4.7497e-08, 3.1665e-08]], device='cuda:0') Epoch 385, bias, value: tensor([-0.0127, -0.0353, -0.0044, -0.0131, -0.0339, 0.0007, 0.0250, -0.0158, 0.0556, -0.0118], device='cuda:0'), grad: tensor([-2.2235e-07, 1.1805e-07, -1.6997e-08, 4.0443e-07, 7.4506e-09, 1.2573e-08, 4.6333e-08, -2.8359e-07, 3.0361e-07, -3.6648e-07], device='cuda:0') 100 0.0001 changing lr epoch 384, time 251.15, cls_loss 0.0017 cls_loss_mapping 0.0014 cls_loss_causal 0.4993 re_mapping 0.0037 re_causal 0.0100 /// teacc 99.00 lr 0.00010000 Epoch 386, weight, value: tensor([[-0.1364, -0.2870, -0.0889, ..., -0.0861, 0.1908, 0.2009], [-0.2620, -0.2286, -0.0724, ..., -0.2048, -0.2583, -0.1655], [-0.0658, -0.2089, 0.1618, ..., -0.2622, 0.2894, 0.1333], ..., [-0.2059, 0.1245, 0.0213, ..., 0.2376, -0.2552, -0.3239], [-0.3429, 0.0693, -0.1634, ..., 0.0704, -0.1251, -0.2438], [-0.0533, -0.1729, -0.0878, ..., -0.1573, -0.0687, -0.2549]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8394e-08, -6.7288e-08, -3.1665e-08], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 3.0268e-09, 6.9849e-09, 3.0268e-09], [ 0.0000e+00, -1.6298e-09, 0.0000e+00, ..., -1.0245e-08, -2.3516e-08, -1.0943e-08], ..., [ 2.3283e-10, 1.6298e-09, 0.0000e+00, ..., 1.3737e-08, 3.3062e-08, 1.3737e-08], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., -1.0943e-08, 1.3970e-09, 4.6566e-10], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 6.9849e-09, 2.0256e-08, 8.1491e-09]], device='cuda:0') Epoch 386, bias, value: tensor([-0.0154, -0.0355, -0.0034, -0.0134, -0.0345, -0.0029, 0.0307, -0.0159, 0.0548, -0.0114], device='cuda:0'), grad: tensor([-1.4808e-07, 3.4226e-08, -8.5915e-08, 1.8510e-07, -3.0734e-08, -1.2177e-07, 5.8440e-08, 1.0966e-07, -3.6089e-08, 4.4238e-08], device='cuda:0') 100 0.0001 changing lr epoch 385, time 250.69, cls_loss 0.0011 cls_loss_mapping 0.0011 cls_loss_causal 0.4491 re_mapping 0.0038 re_causal 0.0102 /// teacc 99.02 lr 0.00010000 Epoch 387, weight, value: tensor([[-0.1367, -0.2872, -0.0890, ..., -0.0863, 0.1906, 0.2007], [-0.2629, -0.2292, -0.0725, ..., -0.2050, -0.2583, -0.1662], [-0.0655, -0.2114, 0.1618, ..., -0.2639, 0.2895, 0.1322], ..., [-0.2061, 0.1254, 0.0213, ..., 0.2382, -0.2551, -0.3218], [-0.3431, 0.0691, -0.1634, ..., 0.0703, -0.1252, -0.2438], [-0.0541, -0.1731, -0.0878, ..., -0.1577, -0.0688, -0.2549]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 2.7940e-09, 0.0000e+00, ..., 0.0000e+00, 1.2573e-08, 9.3132e-10], [ 2.3283e-10, 9.3132e-09, 0.0000e+00, ..., 9.3132e-10, 8.6147e-09, 6.9849e-10], [ 0.0000e+00, -8.7963e-07, 0.0000e+00, ..., 0.0000e+00, -1.0971e-06, -1.0873e-07], ..., [ 0.0000e+00, 7.1572e-07, 0.0000e+00, ..., 4.6566e-10, 6.3749e-07, 5.9605e-08], [ 4.6566e-10, 8.6846e-08, 0.0000e+00, ..., -6.9849e-10, 8.2189e-08, 8.1491e-09], [ 0.0000e+00, 6.9849e-10, 0.0000e+00, ..., 4.6566e-10, 1.1642e-09, 4.6566e-10]], device='cuda:0') Epoch 387, bias, value: tensor([-0.0156, -0.0356, -0.0037, -0.0137, -0.0343, -0.0028, 0.0308, -0.0155, 0.0546, -0.0116], device='cuda:0'), grad: tensor([ 3.1665e-08, 3.4692e-08, -4.8093e-06, 2.9197e-07, -1.7020e-07, 1.1642e-09, 5.1223e-07, 3.4999e-06, 4.2818e-07, 1.8440e-07], device='cuda:0') 100 0.0001 changing lr epoch 386, time 250.30, cls_loss 0.0014 cls_loss_mapping 0.0012 cls_loss_causal 0.4691 re_mapping 0.0036 re_causal 0.0101 /// teacc 99.08 lr 0.00010000 Epoch 388, weight, value: tensor([[-0.1367, -0.2874, -0.0890, ..., -0.0865, 0.1907, 0.2008], [-0.2635, -0.2319, -0.0724, ..., -0.2053, -0.2597, -0.1679], [-0.0655, -0.2118, 0.1628, ..., -0.2644, 0.2916, 0.1337], ..., [-0.2066, 0.1271, 0.0213, ..., 0.2385, -0.2566, -0.3236], [-0.3438, 0.0689, -0.1634, ..., 0.0698, -0.1256, -0.2440], [-0.0544, -0.1731, -0.0879, ..., -0.1577, -0.0689, -0.2550]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 9.7789e-09, 6.9849e-10, 1.6298e-09], [ 4.6566e-10, 2.0955e-09, 0.0000e+00, ..., 1.5600e-08, 2.7940e-09, -5.3551e-08], [ 6.9849e-10, 4.6566e-10, 0.0000e+00, ..., 3.3993e-08, 7.4506e-09, 2.7008e-08], ..., [ 4.6566e-10, -5.5879e-09, 0.0000e+00, ..., -8.6147e-09, 6.9849e-10, 4.0280e-08], [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., -7.2829e-06, -1.5404e-06, -2.0787e-06], [ 9.3132e-09, 1.9791e-08, 0.0000e+00, ..., 1.0477e-08, 3.0268e-09, 2.7940e-09]], device='cuda:0') Epoch 388, bias, value: tensor([-0.0156, -0.0362, -0.0027, -0.0163, -0.0339, -0.0012, 0.0302, -0.0149, 0.0541, -0.0117], device='cuda:0'), grad: tensor([ 4.3306e-08, -2.2892e-06, 8.2515e-07, -2.6543e-08, 1.3039e-08, 2.7213e-06, 2.6658e-05, 1.5935e-06, -2.9609e-05, 9.9884e-08], device='cuda:0') 100 0.0001 changing lr epoch 387, time 250.48, cls_loss 0.0018 cls_loss_mapping 0.0017 cls_loss_causal 0.4650 re_mapping 0.0038 re_causal 0.0097 /// teacc 99.04 lr 0.00010000 Epoch 389, weight, value: tensor([[-0.1377, -0.2871, -0.0890, ..., -0.0892, 0.1888, 0.1991], [-0.2644, -0.2325, -0.0724, ..., -0.2059, -0.2599, -0.1677], [-0.0656, -0.2116, 0.1628, ..., -0.2652, 0.2922, 0.1344], ..., [-0.2071, 0.1285, 0.0213, ..., 0.2401, -0.2583, -0.3248], [-0.3440, 0.0685, -0.1634, ..., 0.0696, -0.1256, -0.2441], [-0.0548, -0.1748, -0.0879, ..., -0.1604, -0.0690, -0.2551]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 3.7253e-09, 2.7940e-09], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -1.5832e-08, 0.0000e+00, ..., -6.0536e-09, -8.3819e-09, -4.6566e-10], ..., [ 0.0000e+00, 5.1223e-09, 0.0000e+00, ..., 3.2596e-09, 3.7253e-09, 4.6566e-10], [ 4.6566e-10, 2.7940e-09, 0.0000e+00, ..., 2.3283e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 9.3132e-10]], device='cuda:0') Epoch 389, bias, value: tensor([-0.0175, -0.0363, -0.0024, -0.0159, -0.0334, -0.0013, 0.0316, -0.0141, 0.0538, -0.0128], device='cuda:0'), grad: tensor([ 1.1176e-08, 6.9849e-09, -9.8255e-08, 6.2399e-08, -1.0431e-07, 9.3132e-09, 1.1176e-08, 2.9337e-08, -2.1886e-08, 1.0431e-07], device='cuda:0') 100 0.0001 changing lr epoch 388, time 250.34, cls_loss 0.0013 cls_loss_mapping 0.0011 cls_loss_causal 0.4639 re_mapping 0.0037 re_causal 0.0101 /// teacc 99.11 lr 0.00010000 Epoch 390, weight, value: tensor([[-0.1377, -0.2872, -0.0892, ..., -0.0891, 0.1890, 0.1993], [-0.2647, -0.2339, -0.0724, ..., -0.2060, -0.2614, -0.1701], [-0.0658, -0.2119, 0.1643, ..., -0.2653, 0.2929, 0.1349], ..., [-0.2072, 0.1295, 0.0213, ..., 0.2404, -0.2587, -0.3250], [-0.3443, 0.0684, -0.1639, ..., 0.0696, -0.1257, -0.2441], [-0.0549, -0.1750, -0.0879, ..., -0.1606, -0.0690, -0.2551]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 2.3283e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -1.3504e-08, 0.0000e+00, ..., -1.8626e-08, 0.0000e+00, 0.0000e+00], [ 4.6566e-09, 2.7940e-09, 0.0000e+00, ..., 8.3819e-09, 9.3132e-10, 4.6566e-10], [ 0.0000e+00, 7.9162e-09, 0.0000e+00, ..., 1.3039e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 390, bias, value: tensor([-0.0172, -0.0370, -0.0020, -0.0162, -0.0333, -0.0013, 0.0316, -0.0137, 0.0537, -0.0129], device='cuda:0'), grad: tensor([ 4.1910e-09, 5.1223e-09, 1.8626e-09, -6.5193e-09, -1.3039e-08, -7.4506e-09, -6.9849e-09, -2.7940e-08, 2.7474e-08, 3.2596e-08], device='cuda:0') 100 0.0001 changing lr epoch 389, time 250.07, cls_loss 0.0012 cls_loss_mapping 0.0012 cls_loss_causal 0.4674 re_mapping 0.0038 re_causal 0.0101 /// teacc 99.08 lr 0.00010000 Epoch 391, weight, value: tensor([[-0.1377, -0.2883, -0.0892, ..., -0.0891, 0.1890, 0.1994], [-0.2648, -0.2338, -0.0724, ..., -0.2060, -0.2615, -0.1703], [-0.0659, -0.2119, 0.1644, ..., -0.2654, 0.2929, 0.1349], ..., [-0.2073, 0.1297, 0.0213, ..., 0.2406, -0.2592, -0.3254], [-0.3444, 0.0684, -0.1639, ..., 0.0696, -0.1259, -0.2441], [-0.0550, -0.1752, -0.0879, ..., -0.1609, -0.0691, -0.2553]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -3.5390e-08, -2.0955e-08], [ 9.3132e-10, 4.6566e-09, 0.0000e+00, ..., 3.7253e-09, 4.6566e-10, 0.0000e+00], [ 6.9849e-09, 1.6298e-08, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 4.6566e-10], ..., [ 4.6566e-10, -1.8626e-09, -0.0000e+00, ..., -3.7253e-09, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 9.3132e-10, 4.6566e-10], [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 9.7789e-09, 6.5193e-09]], device='cuda:0') Epoch 391, bias, value: tensor([-0.0172, -0.0344, -0.0023, -0.0169, -0.0335, -0.0012, 0.0316, -0.0157, 0.0537, -0.0132], device='cuda:0'), grad: tensor([-6.5193e-08, 4.6566e-09, 3.6322e-08, -3.6787e-08, -2.3283e-09, 1.3970e-09, 3.9116e-08, -3.2596e-09, 5.1223e-09, 2.7474e-08], device='cuda:0') 100 0.0001 changing lr epoch 390, time 250.44, cls_loss 0.0012 cls_loss_mapping 0.0012 cls_loss_causal 0.4607 re_mapping 0.0037 re_causal 0.0102 /// teacc 99.07 lr 0.00010000 Epoch 392, weight, value: tensor([[-0.1379, -0.2883, -0.0892, ..., -0.0887, 0.1893, 0.1998], [-0.2649, -0.2370, -0.0724, ..., -0.2067, -0.2616, -0.1704], [-0.0659, -0.2132, 0.1644, ..., -0.2665, 0.2930, 0.1349], ..., [-0.2083, 0.1329, 0.0213, ..., 0.2414, -0.2595, -0.3254], [-0.3447, 0.0681, -0.1639, ..., 0.0696, -0.1255, -0.2440], [-0.0549, -0.1753, -0.0879, ..., -0.1610, -0.0692, -0.2554]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -6.4727e-08, -4.4238e-08], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.6345e-07, 1.3970e-09], [ 6.0536e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.4808e-07, 9.3132e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.0536e-09, 3.2596e-09], [ 8.3819e-09, 1.2107e-08, 0.0000e+00, ..., 0.0000e+00, 3.3528e-08, 2.2817e-08], [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 1.3039e-08, 8.8476e-09]], device='cuda:0') Epoch 392, bias, value: tensor([-0.0170, -0.0367, -0.0028, -0.0168, -0.0336, -0.0012, 0.0314, -0.0133, 0.0537, -0.0132], device='cuda:0'), grad: tensor([-1.3737e-07, 4.6566e-07, -4.2515e-07, -2.8871e-08, 1.0710e-07, 5.1223e-09, -1.2806e-07, 1.5367e-08, 1.0151e-07, 3.2131e-08], device='cuda:0') 100 0.0001 changing lr epoch 391, time 250.09, cls_loss 0.0014 cls_loss_mapping 0.0012 cls_loss_causal 0.4496 re_mapping 0.0038 re_causal 0.0098 /// teacc 99.06 lr 0.00010000 Epoch 393, weight, value: tensor([[-0.1380, -0.2884, -0.0892, ..., -0.0887, 0.1894, 0.1998], [-0.2651, -0.2371, -0.0724, ..., -0.2068, -0.2617, -0.1704], [-0.0658, -0.2150, 0.1644, ..., -0.2678, 0.2933, 0.1355], ..., [-0.2085, 0.1331, 0.0212, ..., 0.2417, -0.2605, -0.3260], [-0.3468, 0.0677, -0.1640, ..., 0.0683, -0.1264, -0.2466], [-0.0550, -0.1754, -0.0879, ..., -0.1615, -0.0693, -0.2555]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.7695e-08, -1.3504e-08], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -9.3132e-10, 0.0000e+00, ..., 6.5193e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 9.7789e-09, 0.0000e+00, -4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 393, bias, value: tensor([-0.0170, -0.0368, -0.0036, -0.0168, -0.0336, -0.0011, 0.0321, -0.0132, 0.0521, -0.0132], device='cuda:0'), grad: tensor([-3.2131e-08, 4.1910e-09, 4.6566e-10, 5.5879e-09, 1.1642e-08, -4.0047e-08, 3.2131e-08, 1.4901e-07, 1.8626e-09, -1.2852e-07], device='cuda:0') 100 0.0001 changing lr epoch 392, time 250.17, cls_loss 0.0012 cls_loss_mapping 0.0020 cls_loss_causal 0.4686 re_mapping 0.0037 re_causal 0.0099 /// teacc 99.05 lr 0.00010000 Epoch 394, weight, value: tensor([[-0.1380, -0.2888, -0.0899, ..., -0.0887, 0.1893, 0.1999], [-0.2651, -0.2371, -0.0724, ..., -0.2069, -0.2617, -0.1705], [-0.0658, -0.2150, 0.1653, ..., -0.2678, 0.2936, 0.1364], ..., [-0.2090, 0.1314, 0.0188, ..., 0.2393, -0.2611, -0.3262], [-0.3470, 0.0676, -0.1661, ..., 0.0682, -0.1266, -0.2467], [-0.0556, -0.1724, -0.0848, ..., -0.1585, -0.0691, -0.2555]], device='cuda:0'), grad: tensor([[ 2.3283e-09, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 4.6566e-09, 6.9849e-09], [ 0.0000e+00, 1.3690e-07, 0.0000e+00, ..., 7.5437e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 6.1002e-08, 0.0000e+00, ..., 3.3993e-08, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -4.5355e-07, 0.0000e+00, ..., -2.5146e-07, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 4.1910e-09, 0.0000e+00, ..., 5.1223e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.4156e-07, -4.6566e-10, ..., 7.5437e-08, 9.3132e-10, 4.6566e-10]], device='cuda:0') Epoch 394, bias, value: tensor([-0.0171, -0.0368, -0.0033, -0.0162, -0.0364, -0.0012, 0.0321, -0.0150, 0.0520, -0.0096], device='cuda:0'), grad: tensor([ 3.3062e-08, 9.0804e-07, 4.0000e-07, 6.1654e-07, 8.2888e-08, 4.6566e-10, -3.2131e-08, -2.9616e-06, 3.7719e-08, 9.1316e-07], device='cuda:0') 100 0.0001 changing lr epoch 393, time 250.55, cls_loss 0.0016 cls_loss_mapping 0.0014 cls_loss_causal 0.4581 re_mapping 0.0037 re_causal 0.0096 /// teacc 99.01 lr 0.00010000 Epoch 395, weight, value: tensor([[-0.1380, -0.2894, -0.0901, ..., -0.0888, 0.1893, 0.1999], [-0.2683, -0.2372, -0.0724, ..., -0.2070, -0.2618, -0.1707], [-0.0660, -0.2150, 0.1655, ..., -0.2680, 0.2939, 0.1369], ..., [-0.2094, 0.1314, 0.0188, ..., 0.2394, -0.2615, -0.3266], [-0.3469, 0.0686, -0.1664, ..., 0.0687, -0.1267, -0.2468], [-0.0585, -0.1726, -0.0848, ..., -0.1588, -0.0690, -0.2556]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, -1.4016e-07, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 6.0536e-09, 4.6566e-10, ..., 9.3132e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -6.9849e-09, 1.1921e-07, ..., -1.1176e-08, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 9.3132e-10, 5.5879e-09, ..., 1.8626e-09, 4.6566e-10, 4.6566e-10]], device='cuda:0') Epoch 395, bias, value: tensor([-0.0172, -0.0366, -0.0070, -0.0164, -0.0372, -0.0006, 0.0321, -0.0150, 0.0525, -0.0094], device='cuda:0'), grad: tensor([ 6.9849e-09, -1.5497e-06, 2.9802e-08, 2.3749e-08, 1.5330e-06, -1.6764e-08, 2.7940e-09, 1.3169e-06, 3.2596e-09, -1.3560e-06], device='cuda:0') 100 0.0001 changing lr epoch 394, time 250.92, cls_loss 0.0010 cls_loss_mapping 0.0012 cls_loss_causal 0.4421 re_mapping 0.0038 re_causal 0.0101 /// teacc 99.07 lr 0.00010000 Epoch 396, weight, value: tensor([[-0.1380, -0.2897, -0.0901, ..., -0.0888, 0.1893, 0.1999], [-0.2682, -0.2372, -0.0724, ..., -0.2070, -0.2618, -0.1707], [-0.0664, -0.2177, 0.1655, ..., -0.2683, 0.2946, 0.1379], ..., [-0.2097, 0.1317, 0.0188, ..., 0.2395, -0.2631, -0.3271], [-0.3470, 0.0684, -0.1665, ..., 0.0687, -0.1268, -0.2468], [-0.0588, -0.1726, -0.0848, ..., -0.1588, -0.0689, -0.2556]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -3.7253e-09, -1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 1.3970e-09, 9.3132e-10]], device='cuda:0') Epoch 396, bias, value: tensor([-0.0172, -0.0364, -0.0089, -0.0166, -0.0375, -0.0005, 0.0320, -0.0148, 0.0524, -0.0092], device='cuda:0'), grad: tensor([-7.4506e-09, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.7253e-09, 1.3970e-09, 2.3283e-09, 1.8626e-09, 1.3970e-09, 4.6566e-09], device='cuda:0') 100 0.0001 changing lr epoch 395, time 250.94, cls_loss 0.0011 cls_loss_mapping 0.0014 cls_loss_causal 0.4844 re_mapping 0.0039 re_causal 0.0103 /// teacc 99.08 lr 0.00010000 Epoch 397, weight, value: tensor([[-0.1380, -0.2898, -0.0901, ..., -0.0888, 0.1893, 0.2000], [-0.2683, -0.2372, -0.0724, ..., -0.2071, -0.2618, -0.1707], [-0.0664, -0.2179, 0.1655, ..., -0.2686, 0.2947, 0.1380], ..., [-0.2094, 0.1318, 0.0188, ..., 0.2396, -0.2632, -0.3271], [-0.3470, 0.0682, -0.1665, ..., 0.0687, -0.1269, -0.2469], [-0.0596, -0.1727, -0.0848, ..., -0.1589, -0.0689, -0.2556]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., -4.6566e-10, -1.3970e-09, -1.8626e-09], [ 1.3970e-09, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 2.7940e-09, 0.0000e+00, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], ..., [ 4.6566e-10, -5.5879e-09, 0.0000e+00, ..., -8.3819e-09, 0.0000e+00, -0.0000e+00], [ 4.6566e-09, 5.1223e-09, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 3.7253e-09, 9.3132e-10, 1.3970e-09]], device='cuda:0') Epoch 397, bias, value: tensor([-0.0172, -0.0363, -0.0090, -0.0166, -0.0373, -0.0006, 0.0321, -0.0148, 0.0526, -0.0093], device='cuda:0'), grad: tensor([-2.3283e-09, 4.1910e-09, 8.8476e-09, 2.3749e-08, 0.0000e+00, -3.9116e-08, 6.0536e-09, -1.6298e-08, 1.0710e-08, 8.8476e-09], device='cuda:0') 100 0.0001 changing lr epoch 396, time 250.64, cls_loss 0.0014 cls_loss_mapping 0.0016 cls_loss_causal 0.4638 re_mapping 0.0035 re_causal 0.0095 /// teacc 99.07 lr 0.00010000 Epoch 398, weight, value: tensor([[-0.1382, -0.2898, -0.0899, ..., -0.0889, 0.1893, 0.2000], [-0.2683, -0.2373, -0.0724, ..., -0.2082, -0.2619, -0.1708], [-0.0665, -0.2179, 0.1655, ..., -0.2686, 0.2951, 0.1387], ..., [-0.2098, 0.1314, 0.0188, ..., 0.2403, -0.2640, -0.3277], [-0.3471, 0.0679, -0.1671, ..., 0.0687, -0.1270, -0.2470], [-0.0597, -0.1728, -0.0848, ..., -0.1590, -0.0690, -0.2557]], device='cuda:0'), grad: tensor([[ 1.2573e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.3504e-08, 1.3039e-08], [ 0.0000e+00, 7.9162e-09, 0.0000e+00, ..., 6.0536e-09, 9.3132e-10, 0.0000e+00], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, -8.3819e-09, 4.6566e-10], ..., [ 0.0000e+00, -3.7253e-08, 0.0000e+00, ..., -2.9337e-08, 1.3970e-09, 0.0000e+00], [ 3.2596e-09, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 6.5193e-09, 2.3283e-09], [ 0.0000e+00, 2.8871e-08, 0.0000e+00, ..., 2.2817e-08, 1.3970e-09, 9.3132e-10]], device='cuda:0') Epoch 398, bias, value: tensor([-0.0173, -0.0368, -0.0090, -0.0151, -0.0375, -0.0005, 0.0321, -0.0146, 0.0525, -0.0094], device='cuda:0'), grad: tensor([ 4.9826e-08, 1.3132e-07, -2.4214e-08, 5.1223e-09, 1.8161e-08, 3.7486e-07, -4.3819e-07, 3.4459e-08, 2.5146e-08, -1.7462e-07], device='cuda:0') 100 0.0001 changing lr epoch 397, time 250.74, cls_loss 0.0016 cls_loss_mapping 0.0010 cls_loss_causal 0.4310 re_mapping 0.0036 re_causal 0.0096 /// teacc 99.03 lr 0.00010000 Epoch 399, weight, value: tensor([[-0.1385, -0.2898, -0.0899, ..., -0.0889, 0.1896, 0.2004], [-0.2684, -0.2374, -0.0708, ..., -0.2072, -0.2620, -0.1709], [-0.0665, -0.2181, 0.1656, ..., -0.2694, 0.2954, 0.1393], ..., [-0.2102, 0.1315, 0.0177, ..., 0.2400, -0.2645, -0.3280], [-0.3472, 0.0678, -0.1672, ..., 0.0685, -0.1270, -0.2470], [-0.0601, -0.1729, -0.0848, ..., -0.1593, -0.0691, -0.2559]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 5.6811e-08, 0.0000e+00, ..., 0.0000e+00, -9.3132e-10, 2.4214e-08], [ 0.0000e+00, 4.6566e-09, 0.0000e+00, ..., 9.3132e-10, 4.6566e-10, 2.3283e-09], [ 0.0000e+00, 5.5879e-09, -4.6566e-10, ..., 0.0000e+00, -1.8626e-09, 2.3283e-09], ..., [ 0.0000e+00, 2.7940e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 1.3970e-09], [ 4.6566e-10, -9.9186e-07, 0.0000e+00, ..., -5.1223e-09, 0.0000e+00, -4.4517e-07], [ 0.0000e+00, 8.6660e-07, 0.0000e+00, ..., 9.3132e-10, 4.6566e-10, 3.8929e-07]], device='cuda:0') Epoch 399, bias, value: tensor([-0.0171, -0.0358, -0.0102, -0.0153, -0.0364, -0.0003, 0.0319, -0.0151, 0.0523, -0.0098], device='cuda:0'), grad: tensor([ 6.8219e-07, 4.0513e-08, 6.3330e-08, 2.5006e-07, 3.0734e-08, 3.4133e-07, 1.0105e-07, 3.4925e-08, -1.2033e-05, 1.0513e-05], device='cuda:0') 100 0.0001 changing lr epoch 398, time 250.70, cls_loss 0.0015 cls_loss_mapping 0.0011 cls_loss_causal 0.4728 re_mapping 0.0036 re_causal 0.0096 /// teacc 99.09 lr 0.00010000 Epoch 400, weight, value: tensor([[-0.1386, -0.2893, -0.0899, ..., -0.0888, 0.1901, 0.2008], [-0.2684, -0.2375, -0.0708, ..., -0.2074, -0.2620, -0.1710], [-0.0667, -0.2184, 0.1657, ..., -0.2695, 0.2969, 0.1395], ..., [-0.2104, 0.1320, 0.0177, ..., 0.2407, -0.2667, -0.3291], [-0.3472, 0.0678, -0.1672, ..., 0.0681, -0.1278, -0.2470], [-0.0608, -0.1736, -0.0848, ..., -0.1598, -0.0708, -0.2570]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.5390e-08, 0.0000e+00, ..., 1.3039e-08, 1.8626e-08, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 1.5367e-08, 0.0000e+00, ..., 9.7789e-09, 9.3132e-09, 9.3132e-10], ..., [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 4.6566e-10, 0.0000e+00], [ 4.6566e-10, -1.6112e-07, 0.0000e+00, ..., -3.5297e-07, -1.7602e-07, -7.1712e-08], [ 0.0000e+00, 1.1642e-08, 0.0000e+00, ..., 4.1910e-09, 6.0536e-09, 0.0000e+00]], device='cuda:0') Epoch 400, bias, value: tensor([-0.0165, -0.0358, -0.0097, -0.0152, -0.0367, -0.0001, 0.0319, -0.0147, 0.0518, -0.0106], device='cuda:0'), grad: tensor([ 2.6403e-07, 1.5832e-08, 1.0803e-07, 3.0594e-07, 4.6566e-10, 1.0664e-07, 1.6401e-06, 5.1223e-09, -2.5239e-06, 6.3330e-08], device='cuda:0') 100 0.0001 changing lr epoch 399, time 250.24, cls_loss 0.0013 cls_loss_mapping 0.0017 cls_loss_causal 0.4713 re_mapping 0.0037 re_causal 0.0103 /// teacc 98.98 lr 0.00001000 Epoch 401, weight, value: tensor([[-0.1386, -0.2896, -0.0899, ..., -0.0887, 0.1903, 0.2011], [-0.2687, -0.2375, -0.0708, ..., -0.2075, -0.2627, -0.1726], [-0.0672, -0.2186, 0.1657, ..., -0.2700, 0.2969, 0.1392], ..., [-0.2108, 0.1315, 0.0177, ..., 0.2400, -0.2668, -0.3279], [-0.3473, 0.0707, -0.1672, ..., 0.0701, -0.1278, -0.2470], [-0.0612, -0.1736, -0.0848, ..., -0.1600, -0.0713, -0.2575]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.6566e-10, -9.3132e-09, -1.0245e-08], [ 0.0000e+00, 1.3504e-08, 0.0000e+00, ..., 1.3504e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 4.6566e-10, 4.6566e-10], ..., [ 0.0000e+00, -2.0023e-08, 0.0000e+00, ..., -2.0023e-08, 4.6566e-10, 4.6566e-10], [ 0.0000e+00, 2.7940e-09, 0.0000e+00, ..., 3.2596e-09, 9.3132e-10, 9.3132e-10], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 2.3283e-09, 8.3819e-09, 7.9162e-09]], device='cuda:0') Epoch 401, bias, value: tensor([-0.0161, -0.0358, -0.0099, -0.0135, -0.0369, -0.0008, 0.0318, -0.0150, 0.0531, -0.0110], device='cuda:0'), grad: tensor([-3.3062e-08, 4.4238e-08, 3.2596e-09, 1.3970e-09, 9.0897e-07, 1.3970e-09, 5.5879e-09, -1.4901e-08, 1.2107e-08, -9.2341e-07], device='cuda:0') 100 1e-05 changing lr epoch 400, time 250.40, cls_loss 0.0013 cls_loss_mapping 0.0013 cls_loss_causal 0.4725 re_mapping 0.0035 re_causal 0.0103 /// teacc 99.06 lr 0.00001000 Epoch 402, weight, value: tensor([[-0.1386, -0.2896, -0.0899, ..., -0.0887, 0.1904, 0.2011], [-0.2687, -0.2376, -0.0705, ..., -0.2073, -0.2627, -0.1726], [-0.0672, -0.2186, 0.1657, ..., -0.2700, 0.2969, 0.1392], ..., [-0.2108, 0.1315, 0.0175, ..., 0.2399, -0.2668, -0.3280], [-0.3474, 0.0707, -0.1673, ..., 0.0701, -0.1278, -0.2470], [-0.0614, -0.1736, -0.0848, ..., -0.1600, -0.0714, -0.2576]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, -6.5193e-08, -4.2375e-08], [ 4.6566e-10, 8.8476e-09, 0.0000e+00, ..., 1.3225e-06, 4.6566e-10, 4.6566e-10], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.1642e-08, 1.8626e-09, 4.6566e-10], ..., [ 9.3132e-10, -2.0955e-08, 0.0000e+00, ..., -1.4743e-06, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 3.7253e-09, 0.0000e+00, ..., 4.6566e-09, 0.0000e+00, 0.0000e+00], [-1.8626e-09, 7.9162e-09, 0.0000e+00, ..., 9.8255e-08, 7.4506e-09, 5.1223e-09]], device='cuda:0') Epoch 402, bias, value: tensor([-0.0161, -0.0357, -0.0099, -0.0135, -0.0367, -0.0008, 0.0318, -0.0151, 0.0531, -0.0110], device='cuda:0'), grad: tensor([-8.2888e-08, 9.9689e-06, 1.1548e-07, 2.9337e-07, 1.0710e-08, 8.8476e-09, 8.2888e-08, -1.2241e-05, 9.4110e-07, 9.0059e-07], device='cuda:0') 100 1e-05 changing lr epoch 401, time 250.64, cls_loss 0.0013 cls_loss_mapping 0.0009 cls_loss_causal 0.4504 re_mapping 0.0035 re_causal 0.0098 /// teacc 99.07 lr 0.00001000 Epoch 403, weight, value: tensor([[-0.1386, -0.2896, -0.0899, ..., -0.0887, 0.1904, 0.2011], [-0.2687, -0.2376, -0.0703, ..., -0.2071, -0.2628, -0.1726], [-0.0672, -0.2186, 0.1657, ..., -0.2700, 0.2970, 0.1393], ..., [-0.2108, 0.1314, 0.0173, ..., 0.2397, -0.2670, -0.3280], [-0.3474, 0.0707, -0.1673, ..., 0.0701, -0.1278, -0.2470], [-0.0615, -0.1736, -0.0848, ..., -0.1600, -0.0714, -0.2576]], device='cuda:0'), grad: tensor([[ 1.1688e-07, 5.9605e-08, 0.0000e+00, ..., 1.3271e-07, 4.6566e-10, 4.6566e-10], [ 2.3283e-09, 2.3283e-09, 0.0000e+00, ..., 1.8626e-09, -2.7940e-09, 0.0000e+00], [ 3.2596e-09, 3.7253e-09, 0.0000e+00, ..., 2.3283e-08, 4.6566e-10, 4.6566e-10], ..., [ 3.1199e-08, 1.3970e-08, 0.0000e+00, ..., 3.1665e-08, 0.0000e+00, 0.0000e+00], [ 5.7742e-08, 4.0047e-08, 0.0000e+00, ..., 1.4761e-07, 0.0000e+00, 0.0000e+00], [ 1.2107e-07, 6.6124e-08, 0.0000e+00, ..., 1.2992e-07, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 403, bias, value: tensor([-0.0161, -0.0355, -0.0099, -0.0135, -0.0366, -0.0008, 0.0318, -0.0153, 0.0531, -0.0110], device='cuda:0'), grad: tensor([ 3.8324e-07, -3.0641e-07, 5.9605e-08, 2.5742e-06, 1.5367e-08, -3.8520e-06, 2.6496e-07, 9.6858e-08, 3.4738e-07, 4.0093e-07], device='cuda:0') 100 1e-05 changing lr epoch 402, time 250.50, cls_loss 0.0011 cls_loss_mapping 0.0007 cls_loss_causal 0.4296 re_mapping 0.0033 re_causal 0.0093 /// teacc 99.08 lr 0.00001000 Epoch 404, weight, value: tensor([[-0.1386, -0.2896, -0.0899, ..., -0.0887, 0.1904, 0.2011], [-0.2687, -0.2376, -0.0703, ..., -0.2071, -0.2628, -0.1726], [-0.0672, -0.2185, 0.1659, ..., -0.2700, 0.2973, 0.1393], ..., [-0.2109, 0.1315, 0.0173, ..., 0.2397, -0.2672, -0.3280], [-0.3474, 0.0707, -0.1673, ..., 0.0701, -0.1278, -0.2470], [-0.0615, -0.1737, -0.0848, ..., -0.1601, -0.0714, -0.2576]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 4.6566e-10, 0.0000e+00, ..., 3.2596e-09, -2.3283e-09, -2.3283e-09], [ 9.3132e-10, 1.8626e-09, 0.0000e+00, ..., 2.3283e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], ..., [ 2.7474e-08, 1.2107e-08, 0.0000e+00, ..., 6.5193e-08, 0.0000e+00, 0.0000e+00], [ 1.0245e-08, 5.1223e-09, 0.0000e+00, ..., 2.3749e-08, 0.0000e+00, 0.0000e+00], [ 1.0245e-08, 4.1910e-09, 0.0000e+00, ..., 2.4214e-08, 9.3132e-10, 9.3132e-10]], device='cuda:0') Epoch 404, bias, value: tensor([-0.0161, -0.0355, -0.0098, -0.0135, -0.0366, -0.0007, 0.0317, -0.0152, 0.0531, -0.0111], device='cuda:0'), grad: tensor([-9.3132e-10, 5.5879e-09, 3.2596e-09, 1.0990e-07, 1.8626e-09, -3.8184e-07, 3.0734e-08, 1.3318e-07, 4.7032e-08, 5.3551e-08], device='cuda:0') 100 1e-05 changing lr epoch 403, time 250.80, cls_loss 0.0012 cls_loss_mapping 0.0007 cls_loss_causal 0.4800 re_mapping 0.0033 re_causal 0.0098 /// teacc 99.09 lr 0.00001000 Epoch 405, weight, value: tensor([[-0.1386, -0.2897, -0.0899, ..., -0.0887, 0.1904, 0.2012], [-0.2687, -0.2376, -0.0703, ..., -0.2071, -0.2628, -0.1726], [-0.0672, -0.2184, 0.1659, ..., -0.2701, 0.2973, 0.1392], ..., [-0.2109, 0.1315, 0.0173, ..., 0.2397, -0.2673, -0.3280], [-0.3474, 0.0707, -0.1673, ..., 0.0701, -0.1279, -0.2470], [-0.0615, -0.1737, -0.0848, ..., -0.1601, -0.0714, -0.2577]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 1.3970e-09, 0.0000e+00, ..., 2.7940e-09, 5.5879e-09, 0.0000e+00], [ 1.0710e-08, 3.7253e-09, 0.0000e+00, ..., 6.9849e-09, 2.6077e-08, 9.3132e-10], [ 2.6543e-08, -9.3132e-10, 0.0000e+00, ..., 6.9849e-09, -4.5635e-08, -3.1199e-08], ..., [ 0.0000e+00, -1.3039e-08, 0.0000e+00, ..., -3.4459e-08, 5.8208e-08, 3.0268e-08], [ 2.3283e-09, 1.3970e-09, 0.0000e+00, ..., 1.3970e-09, 6.0536e-09, 0.0000e+00], [ 4.6566e-10, 7.4506e-09, 0.0000e+00, ..., 1.5367e-08, 9.3132e-10, 0.0000e+00]], device='cuda:0') Epoch 405, bias, value: tensor([-0.0161, -0.0355, -0.0097, -0.0135, -0.0366, -0.0006, 0.0316, -0.0152, 0.0531, -0.0111], device='cuda:0'), grad: tensor([ 2.5146e-08, 1.4249e-07, -1.3132e-07, 4.1910e-09, 2.1886e-08, 2.3283e-09, -2.0256e-07, 8.2422e-08, 1.7695e-08, 4.5169e-08], device='cuda:0') 100 1e-05 changing lr epoch 404, time 250.41, cls_loss 0.0010 cls_loss_mapping 0.0006 cls_loss_causal 0.4403 re_mapping 0.0032 re_causal 0.0093 /// teacc 99.10 lr 0.00001000 Epoch 406, weight, value: tensor([[-0.1386, -0.2897, -0.0899, ..., -0.0887, 0.1904, 0.2012], [-0.2687, -0.2376, -0.0703, ..., -0.2071, -0.2628, -0.1726], [-0.0672, -0.2185, 0.1659, ..., -0.2701, 0.2974, 0.1392], ..., [-0.2109, 0.1315, 0.0173, ..., 0.2397, -0.2673, -0.3281], [-0.3474, 0.0707, -0.1673, ..., 0.0701, -0.1279, -0.2471], [-0.0616, -0.1737, -0.0848, ..., -0.1601, -0.0714, -0.2577]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8626e-09, -6.5193e-09, -2.3283e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -6.5193e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.3970e-09, -9.3132e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.5193e-09, 0.0000e+00, 0.0000e+00], [ 1.5367e-08, 0.0000e+00, 0.0000e+00, ..., 2.6077e-08, 4.6566e-10, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 9.3132e-10, 9.3132e-10]], device='cuda:0') Epoch 406, bias, value: tensor([-0.0161, -0.0355, -0.0097, -0.0135, -0.0365, -0.0006, 0.0316, -0.0152, 0.0531, -0.0111], device='cuda:0'), grad: tensor([-1.8626e-09, -1.4668e-07, -2.7940e-09, 3.7253e-09, 1.9418e-07, -4.5635e-08, -1.8952e-07, 1.1828e-07, 4.5635e-08, 2.1886e-08], device='cuda:0') 100 1e-05 changing lr epoch 405, time 250.44, cls_loss 0.0009 cls_loss_mapping 0.0005 cls_loss_causal 0.4012 re_mapping 0.0032 re_causal 0.0091 /// teacc 99.12 lr 0.00001000 Epoch 407, weight, value: tensor([[-0.1386, -0.2897, -0.0899, ..., -0.0887, 0.1904, 0.2012], [-0.2687, -0.2376, -0.0703, ..., -0.2071, -0.2628, -0.1726], [-0.0672, -0.2185, 0.1659, ..., -0.2701, 0.2974, 0.1392], ..., [-0.2110, 0.1315, 0.0173, ..., 0.2397, -0.2673, -0.3281], [-0.3474, 0.0707, -0.1673, ..., 0.0701, -0.1279, -0.2471], [-0.0616, -0.1737, -0.0848, ..., -0.1601, -0.0714, -0.2577]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], ..., [ 0.0000e+00, -2.7940e-09, 0.0000e+00, ..., -3.2596e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 2.3283e-09, 4.6566e-10, 4.6566e-10]], device='cuda:0') Epoch 407, bias, value: tensor([-0.0161, -0.0355, -0.0097, -0.0135, -0.0365, -0.0006, 0.0316, -0.0152, 0.0531, -0.0111], device='cuda:0'), grad: tensor([ 9.3132e-10, -2.7940e-09, 1.0245e-08, 9.3132e-10, 2.7157e-06, 9.3132e-10, 1.3970e-09, 1.2154e-07, 4.6566e-10, -2.8424e-06], device='cuda:0') 100 1e-05 changing lr epoch 406, time 250.35, cls_loss 0.0013 cls_loss_mapping 0.0005 cls_loss_causal 0.4361 re_mapping 0.0032 re_causal 0.0092 /// teacc 99.09 lr 0.00001000 Epoch 408, weight, value: tensor([[-0.1386, -0.2897, -0.0899, ..., -0.0886, 0.1904, 0.2012], [-0.2687, -0.2376, -0.0701, ..., -0.2069, -0.2628, -0.1726], [-0.0672, -0.2185, 0.1660, ..., -0.2701, 0.2974, 0.1392], ..., [-0.2110, 0.1315, 0.0171, ..., 0.2396, -0.2674, -0.3281], [-0.3474, 0.0707, -0.1673, ..., 0.0701, -0.1279, -0.2471], [-0.0617, -0.1737, -0.0848, ..., -0.1602, -0.0714, -0.2577]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 4.6566e-10]], device='cuda:0') Epoch 408, bias, value: tensor([-0.0160, -0.0355, -0.0097, -0.0134, -0.0365, -0.0006, 0.0316, -0.0153, 0.0531, -0.0111], device='cuda:0'), grad: tensor([ 4.6566e-10, 4.6566e-10, 0.0000e+00, 1.3504e-08, 9.3132e-10, -2.2352e-08, 4.6566e-09, 2.3283e-09, 2.3283e-09, 9.3132e-10], device='cuda:0') 100 1e-05 changing lr epoch 407, time 250.25, cls_loss 0.0012 cls_loss_mapping 0.0006 cls_loss_causal 0.4462 re_mapping 0.0032 re_causal 0.0093 /// teacc 99.10 lr 0.00001000 Epoch 409, weight, value: tensor([[-0.1386, -0.2897, -0.0899, ..., -0.0886, 0.1905, 0.2012], [-0.2687, -0.2378, -0.0701, ..., -0.2070, -0.2628, -0.1727], [-0.0672, -0.2185, 0.1660, ..., -0.2701, 0.2974, 0.1393], ..., [-0.2111, 0.1315, 0.0171, ..., 0.2397, -0.2674, -0.3281], [-0.3474, 0.0706, -0.1673, ..., 0.0701, -0.1279, -0.2471], [-0.0618, -0.1737, -0.0848, ..., -0.1602, -0.0714, -0.2577]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, -2.3283e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.7940e-09, 0.0000e+00, 0.0000e+00], [-4.6566e-10, -0.0000e+00, 0.0000e+00, ..., -2.3283e-10, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.9791e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 409, bias, value: tensor([-0.0160, -0.0355, -0.0097, -0.0132, -0.0364, -0.0007, 0.0316, -0.0153, 0.0531, -0.0111], device='cuda:0'), grad: tensor([ 5.1223e-09, -6.9849e-09, 1.1642e-08, 4.4238e-09, -6.5938e-07, 1.8626e-09, 7.6834e-09, 8.3819e-08, 4.6566e-10, 5.5879e-07], device='cuda:0') 100 1e-05 changing lr epoch 408, time 250.37, cls_loss 0.0010 cls_loss_mapping 0.0005 cls_loss_causal 0.4277 re_mapping 0.0031 re_causal 0.0091 /// teacc 99.11 lr 0.00001000 Epoch 410, weight, value: tensor([[-0.1386, -0.2898, -0.0899, ..., -0.0886, 0.1905, 0.2012], [-0.2687, -0.2380, -0.0701, ..., -0.2071, -0.2628, -0.1726], [-0.0672, -0.2185, 0.1660, ..., -0.2701, 0.2975, 0.1392], ..., [-0.2111, 0.1317, 0.0171, ..., 0.2397, -0.2674, -0.3281], [-0.3474, 0.0706, -0.1673, ..., 0.0701, -0.1279, -0.2471], [-0.0618, -0.1737, -0.0848, ..., -0.1602, -0.0714, -0.2577]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 4.6566e-10, 0.0000e+00, ..., 6.9849e-10, 0.0000e+00, 0.0000e+00], [ 6.9849e-10, 4.1910e-09, 0.0000e+00, ..., 5.3551e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 7.4506e-09, 0.0000e+00, ..., 1.4203e-08, 0.0000e+00, 0.0000e+00], ..., [ 4.6566e-10, -2.6077e-08, 0.0000e+00, ..., -4.4471e-08, 0.0000e+00, 0.0000e+00], [ 1.2340e-08, 1.4435e-08, 0.0000e+00, ..., 3.3528e-08, 0.0000e+00, 0.0000e+00], [ 4.1910e-09, 4.6566e-09, 0.0000e+00, ..., 6.0536e-09, 2.3283e-10, 0.0000e+00]], device='cuda:0') Epoch 410, bias, value: tensor([-0.0160, -0.0357, -0.0098, -0.0131, -0.0364, -0.0007, 0.0316, -0.0151, 0.0531, -0.0111], device='cuda:0'), grad: tensor([ 1.8626e-09, 1.6531e-08, 3.3295e-08, -1.1176e-08, 8.3819e-09, -6.8918e-08, 2.4214e-08, -9.9884e-08, 8.1724e-08, 1.6298e-08], device='cuda:0') 100 1e-05 changing lr epoch 409, time 250.48, cls_loss 0.0011 cls_loss_mapping 0.0005 cls_loss_causal 0.4435 re_mapping 0.0031 re_causal 0.0092 /// teacc 99.09 lr 0.00001000 Epoch 411, weight, value: tensor([[-0.1386, -0.2898, -0.0899, ..., -0.0886, 0.1905, 0.2012], [-0.2687, -0.2380, -0.0701, ..., -0.2071, -0.2629, -0.1726], [-0.0672, -0.2186, 0.1660, ..., -0.2702, 0.2975, 0.1393], ..., [-0.2112, 0.1317, 0.0171, ..., 0.2398, -0.2674, -0.3282], [-0.3475, 0.0706, -0.1673, ..., 0.0701, -0.1279, -0.2471], [-0.0619, -0.1737, -0.0848, ..., -0.1602, -0.0714, -0.2577]], device='cuda:0'), grad: tensor([[ 3.9581e-09, 1.3970e-09, 0.0000e+00, ..., -5.8347e-07, -4.2059e-06, -3.0324e-06], [ 0.0000e+00, 1.6298e-09, 0.0000e+00, ..., 1.8626e-09, 2.3283e-10, 2.3283e-10], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 4.1910e-09, 7.6834e-09, 5.5879e-09], ..., [ 0.0000e+00, -8.8476e-09, 0.0000e+00, ..., -1.0710e-08, 4.6566e-10, 2.3283e-10], [ 5.1223e-09, 9.3132e-10, 0.0000e+00, ..., 2.3283e-09, 1.1642e-09, 1.8626e-09], [ 0.0000e+00, 3.4925e-09, 0.0000e+00, ..., 5.5879e-09, 1.0710e-08, 7.6834e-09]], device='cuda:0') Epoch 411, bias, value: tensor([-0.0160, -0.0357, -0.0098, -0.0131, -0.0364, -0.0006, 0.0316, -0.0151, 0.0531, -0.0111], device='cuda:0'), grad: tensor([-7.2643e-06, -1.6820e-06, 1.2945e-06, 2.1188e-08, 3.2596e-08, -1.5367e-08, 7.2382e-06, 3.7905e-07, 2.4680e-08, -2.6310e-08], device='cuda:0') 100 1e-05 changing lr epoch 410, time 250.19, cls_loss 0.0010 cls_loss_mapping 0.0005 cls_loss_causal 0.4328 re_mapping 0.0030 re_causal 0.0092 /// teacc 99.11 lr 0.00001000 Epoch 412, weight, value: tensor([[-0.1386, -0.2899, -0.0899, ..., -0.0886, 0.1905, 0.2013], [-0.2688, -0.2381, -0.0701, ..., -0.2072, -0.2629, -0.1726], [-0.0672, -0.2186, 0.1660, ..., -0.2702, 0.2975, 0.1393], ..., [-0.2112, 0.1317, 0.0171, ..., 0.2398, -0.2675, -0.3282], [-0.3475, 0.0706, -0.1673, ..., 0.0701, -0.1279, -0.2471], [-0.0619, -0.1737, -0.0848, ..., -0.1602, -0.0715, -0.2578]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.8894e-09, 0.0000e+00, ..., 2.3283e-09, -2.9569e-08, -2.3516e-08], [ 0.0000e+00, 6.2166e-08, 0.0000e+00, ..., 3.0966e-08, -0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.0571e-07, 0.0000e+00, ..., 5.2620e-08, -1.0012e-08, 0.0000e+00], ..., [ 0.0000e+00, -1.8999e-07, 0.0000e+00, ..., -9.4762e-08, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 1.5600e-08, 0.0000e+00, ..., 7.9162e-09, 2.3516e-08, 1.8626e-08]], device='cuda:0') Epoch 412, bias, value: tensor([-0.0160, -0.0357, -0.0099, -0.0131, -0.0364, -0.0006, 0.0316, -0.0151, 0.0531, -0.0111], device='cuda:0'), grad: tensor([-6.5425e-08, 2.9244e-07, 4.8988e-07, 1.1874e-08, 3.7253e-09, 5.5879e-09, 2.1188e-08, -9.0059e-07, 2.5611e-09, 1.4203e-07], device='cuda:0') 100 1e-05 changing lr epoch 411, time 250.40, cls_loss 0.0009 cls_loss_mapping 0.0005 cls_loss_causal 0.4309 re_mapping 0.0030 re_causal 0.0091 /// teacc 99.13 lr 0.00001000 Epoch 413, weight, value: tensor([[-0.1386, -0.2899, -0.0899, ..., -0.0886, 0.1905, 0.2013], [-0.2688, -0.2381, -0.0701, ..., -0.2072, -0.2629, -0.1726], [-0.0672, -0.2187, 0.1660, ..., -0.2702, 0.2975, 0.1393], ..., [-0.2112, 0.1318, 0.0171, ..., 0.2398, -0.2675, -0.3282], [-0.3475, 0.0706, -0.1673, ..., 0.0701, -0.1280, -0.2471], [-0.0619, -0.1738, -0.0848, ..., -0.1602, -0.0715, -0.2578]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 2.3283e-10, -1.0608e-06, -5.8580e-07], [ 0.0000e+00, 5.3551e-09, 0.0000e+00, ..., 6.2864e-09, 6.7521e-09, 3.7253e-09], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 1.1642e-09, 2.5425e-07, 1.4319e-07], ..., [ 0.0000e+00, -2.8871e-08, 0.0000e+00, ..., -3.1665e-08, 4.8894e-09, 2.5611e-09], [ 2.3283e-10, 3.9581e-09, 0.0000e+00, ..., 4.1910e-09, 2.8405e-08, 1.8394e-08], [ 0.0000e+00, 1.3271e-08, 0.0000e+00, ..., 1.5832e-08, 1.0361e-07, 6.3097e-08]], device='cuda:0') Epoch 413, bias, value: tensor([-0.0160, -0.0358, -0.0099, -0.0129, -0.0364, -0.0007, 0.0316, -0.0150, 0.0531, -0.0111], device='cuda:0'), grad: tensor([-1.9632e-06, 2.9802e-08, 4.7870e-07, 2.3749e-08, 8.1491e-09, 5.1223e-09, 1.1846e-06, -7.6601e-08, 6.6357e-08, 2.4354e-07], device='cuda:0') 100 1e-05 changing lr epoch 412, time 249.93, cls_loss 0.0010 cls_loss_mapping 0.0005 cls_loss_causal 0.4508 re_mapping 0.0029 re_causal 0.0090 /// teacc 99.10 lr 0.00001000 Epoch 414, weight, value: tensor([[-0.1386, -0.2899, -0.0899, ..., -0.0886, 0.1905, 0.2013], [-0.2688, -0.2381, -0.0701, ..., -0.2072, -0.2629, -0.1727], [-0.0672, -0.2187, 0.1661, ..., -0.2702, 0.2975, 0.1393], ..., [-0.2113, 0.1318, 0.0171, ..., 0.2399, -0.2675, -0.3282], [-0.3475, 0.0706, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0620, -0.1738, -0.0848, ..., -0.1602, -0.0715, -0.2578]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 2.3283e-10, 2.3283e-10], [ 3.2596e-09, 4.6566e-10, 0.0000e+00, ..., 2.7940e-09, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, -2.3283e-10, 0.0000e+00, ..., -1.1642e-09, -1.8626e-09, -1.3970e-09], ..., [ 2.0955e-09, 1.1642e-09, 0.0000e+00, ..., 2.5611e-09, 1.3970e-09, 1.1642e-09], [ 5.7044e-08, 3.2596e-09, 0.0000e+00, ..., 4.5868e-08, 1.3970e-09, 1.3970e-09], [ 3.1898e-08, 3.7253e-08, 0.0000e+00, ..., 2.0256e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 414, bias, value: tensor([-0.0160, -0.0358, -0.0099, -0.0129, -0.0364, -0.0006, 0.0315, -0.0150, 0.0531, -0.0111], device='cuda:0'), grad: tensor([ 1.3970e-09, 1.4435e-08, -3.4925e-09, 3.4552e-07, -7.2177e-09, -6.7893e-07, -4.6566e-10, 1.3504e-08, 1.4110e-07, 1.8254e-07], device='cuda:0') 100 1e-05 changing lr epoch 413, time 250.24, cls_loss 0.0010 cls_loss_mapping 0.0004 cls_loss_causal 0.4607 re_mapping 0.0029 re_causal 0.0093 /// teacc 99.10 lr 0.00001000 Epoch 415, weight, value: tensor([[-0.1386, -0.2900, -0.0899, ..., -0.0886, 0.1905, 0.2013], [-0.2688, -0.2381, -0.0701, ..., -0.2072, -0.2629, -0.1727], [-0.0672, -0.2188, 0.1661, ..., -0.2702, 0.2976, 0.1393], ..., [-0.2113, 0.1318, 0.0171, ..., 0.2399, -0.2676, -0.3282], [-0.3475, 0.0706, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0620, -0.1738, -0.0848, ..., -0.1603, -0.0715, -0.2578]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, -3.2596e-09, -3.4925e-09], [ 2.3283e-10, 4.1910e-09, 0.0000e+00, ..., 4.6566e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 1.1642e-09, -5.1223e-09, 0.0000e+00, ..., -6.2864e-09, 0.0000e+00, 2.3283e-10], [ 2.0955e-09, 8.8476e-09, 0.0000e+00, ..., -2.3283e-10, 0.0000e+00, 9.3132e-10], [ 2.3283e-10, 1.3970e-09, 0.0000e+00, ..., 1.3970e-09, 2.0955e-09, 2.3283e-09]], device='cuda:0') Epoch 415, bias, value: tensor([-0.0160, -0.0358, -0.0099, -0.0129, -0.0363, -0.0006, 0.0315, -0.0150, 0.0531, -0.0112], device='cuda:0'), grad: tensor([-1.0477e-08, 1.2107e-08, 2.5611e-09, -2.6077e-08, 0.0000e+00, -1.3970e-09, 3.4925e-09, -1.2806e-08, 1.7229e-08, 1.2107e-08], device='cuda:0') 100 1e-05 changing lr epoch 414, time 250.17, cls_loss 0.0009 cls_loss_mapping 0.0005 cls_loss_causal 0.4290 re_mapping 0.0029 re_causal 0.0091 /// teacc 99.11 lr 0.00001000 Epoch 416, weight, value: tensor([[-0.1386, -0.2900, -0.0899, ..., -0.0886, 0.1905, 0.2013], [-0.2688, -0.2381, -0.0701, ..., -0.2072, -0.2629, -0.1727], [-0.0672, -0.2188, 0.1661, ..., -0.2702, 0.2976, 0.1393], ..., [-0.2113, 0.1318, 0.0171, ..., 0.2399, -0.2676, -0.3283], [-0.3475, 0.0706, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0621, -0.1738, -0.0848, ..., -0.1603, -0.0715, -0.2578]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -1.3970e-09, 0.0000e+00, ..., -4.4238e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -9.3132e-10, 2.3283e-10, 2.3283e-10], [ 0.0000e+00, 1.1642e-09, 0.0000e+00, ..., 3.9581e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 416, bias, value: tensor([-0.0160, -0.0358, -0.0099, -0.0129, -0.0363, -0.0006, 0.0315, -0.0150, 0.0530, -0.0112], device='cuda:0'), grad: tensor([ 9.3132e-10, 9.3132e-10, 2.3283e-10, 1.3970e-09, 5.8208e-09, 1.8626e-09, 2.3283e-10, -6.0536e-09, 3.8883e-08, -4.4936e-08], device='cuda:0') 100 1e-05 changing lr epoch 415, time 250.23, cls_loss 0.0009 cls_loss_mapping 0.0005 cls_loss_causal 0.4450 re_mapping 0.0029 re_causal 0.0093 /// teacc 99.12 lr 0.00001000 Epoch 417, weight, value: tensor([[-0.1386, -0.2901, -0.0899, ..., -0.0886, 0.1905, 0.2013], [-0.2688, -0.2381, -0.0701, ..., -0.2072, -0.2629, -0.1727], [-0.0672, -0.2188, 0.1661, ..., -0.2702, 0.2977, 0.1394], ..., [-0.2114, 0.1318, 0.0171, ..., 0.2399, -0.2676, -0.3283], [-0.3476, 0.0706, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0621, -0.1738, -0.0848, ..., -0.1603, -0.0715, -0.2578]], device='cuda:0'), grad: tensor([[ 6.9849e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 6.9849e-10, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -6.8825e-07, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 2.3283e-10, 0.0000e+00, ..., 9.3132e-10, 4.6566e-10, 2.3283e-10]], device='cuda:0') Epoch 417, bias, value: tensor([-0.0160, -0.0358, -0.0099, -0.0129, -0.0363, -0.0005, 0.0314, -0.0150, 0.0530, -0.0112], device='cuda:0'), grad: tensor([ 3.9581e-09, 3.9581e-09, 0.0000e+00, 3.7253e-09, 8.0978e-07, 2.7311e-07, -1.0710e-08, -9.9279e-07, 5.1223e-09, -9.2201e-08], device='cuda:0') 100 1e-05 changing lr epoch 416, time 250.38, cls_loss 0.0010 cls_loss_mapping 0.0005 cls_loss_causal 0.4438 re_mapping 0.0030 re_causal 0.0091 /// teacc 99.15 lr 0.00001000 Epoch 418, weight, value: tensor([[-0.1386, -0.2901, -0.0899, ..., -0.0886, 0.1905, 0.2013], [-0.2688, -0.2382, -0.0700, ..., -0.2072, -0.2630, -0.1727], [-0.0672, -0.2188, 0.1663, ..., -0.2703, 0.2978, 0.1394], ..., [-0.2114, 0.1319, 0.0170, ..., 0.2399, -0.2676, -0.3283], [-0.3476, 0.0706, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0621, -0.1738, -0.0848, ..., -0.1603, -0.0715, -0.2578]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 2.3283e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -4.8894e-09, 0.0000e+00, ..., -2.7940e-09, -1.3970e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.6298e-09, 1.1642e-09]], device='cuda:0') Epoch 418, bias, value: tensor([-0.0160, -0.0358, -0.0099, -0.0128, -0.0363, -0.0005, 0.0314, -0.0150, 0.0530, -0.0112], device='cuda:0'), grad: tensor([ 2.3283e-10, -1.8626e-09, 2.0955e-09, 4.6566e-09, 9.3132e-10, 1.1642e-09, 4.1211e-08, 1.6298e-09, -4.4238e-08, 3.9581e-09], device='cuda:0') 100 1e-05 changing lr ---------------------saving model at epoch 417---------------------------------------------------- epoch 417, time 267.27, cls_loss 0.0010 cls_loss_mapping 0.0006 cls_loss_causal 0.4379 re_mapping 0.0029 re_causal 0.0090 /// teacc 99.17 lr 0.00001000 Epoch 419, weight, value: tensor([[-0.1386, -0.2902, -0.0899, ..., -0.0887, 0.1905, 0.2014], [-0.2688, -0.2383, -0.0700, ..., -0.2072, -0.2630, -0.1727], [-0.0673, -0.2187, 0.1663, ..., -0.2702, 0.2979, 0.1394], ..., [-0.2115, 0.1320, 0.0170, ..., 0.2399, -0.2679, -0.3284], [-0.3476, 0.0706, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0621, -0.1738, -0.0848, ..., -0.1603, -0.0715, -0.2578]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.3504e-08, -1.0710e-08], [ 4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.2107e-08, 4.6566e-10], ..., [ 4.6566e-10, -4.6566e-10, 0.0000e+00, ..., -1.3970e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.3039e-08, 0.0000e+00], [ 6.9849e-09, 1.3970e-09, 0.0000e+00, ..., 6.5193e-09, 2.7940e-09, 1.8626e-09]], device='cuda:0') Epoch 419, bias, value: tensor([-0.0160, -0.0359, -0.0098, -0.0127, -0.0363, -0.0005, 0.0314, -0.0149, 0.0530, -0.0112], device='cuda:0'), grad: tensor([-3.4925e-08, -4.1444e-08, -7.6834e-08, 9.3132e-10, 2.5611e-08, -2.3283e-08, 2.7474e-08, 1.6298e-08, 1.1316e-07, -1.8626e-09], device='cuda:0') 100 1e-05 changing lr epoch 418, time 250.41, cls_loss 0.0011 cls_loss_mapping 0.0005 cls_loss_causal 0.4268 re_mapping 0.0029 re_causal 0.0091 /// teacc 99.15 lr 0.00001000 Epoch 420, weight, value: tensor([[-0.1386, -0.2902, -0.0899, ..., -0.0887, 0.1906, 0.2014], [-0.2688, -0.2383, -0.0699, ..., -0.2071, -0.2630, -0.1727], [-0.0673, -0.2187, 0.1663, ..., -0.2702, 0.2979, 0.1394], ..., [-0.2115, 0.1320, 0.0169, ..., 0.2399, -0.2679, -0.3284], [-0.3476, 0.0706, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0621, -0.1739, -0.0848, ..., -0.1603, -0.0715, -0.2578]], device='cuda:0'), grad: tensor([[4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [1.8626e-09, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [9.3132e-10, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [1.8626e-09, 1.8626e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [1.3970e-09, 1.3970e-09, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [4.6566e-10, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 420, bias, value: tensor([-0.0160, -0.0358, -0.0098, -0.0126, -0.0363, -0.0006, 0.0314, -0.0150, 0.0530, -0.0112], device='cuda:0'), grad: tensor([ 1.3970e-09, -4.9826e-08, 2.7940e-09, -2.9337e-08, 5.3085e-08, -1.0245e-08, 1.3504e-08, 6.9849e-09, 4.1910e-09, 3.7253e-09], device='cuda:0') 100 1e-05 changing lr epoch 419, time 250.29, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4327 re_mapping 0.0029 re_causal 0.0090 /// teacc 99.12 lr 0.00001000 Epoch 421, weight, value: tensor([[-0.1386, -0.2903, -0.0899, ..., -0.0887, 0.1906, 0.2014], [-0.2688, -0.2383, -0.0699, ..., -0.2071, -0.2630, -0.1728], [-0.0673, -0.2187, 0.1663, ..., -0.2702, 0.2980, 0.1395], ..., [-0.2115, 0.1320, 0.0169, ..., 0.2399, -0.2679, -0.3284], [-0.3476, 0.0706, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0621, -0.1739, -0.0848, ..., -0.1603, -0.0715, -0.2579]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -3.7253e-08, 0.0000e+00, ..., -9.5926e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.7940e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 3.6787e-08, 0.0000e+00, ..., 9.4529e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 421, bias, value: tensor([-0.0160, -0.0358, -0.0098, -0.0126, -0.0363, -0.0006, 0.0314, -0.0150, 0.0530, -0.0112], device='cuda:0'), grad: tensor([ 6.0536e-09, -4.6566e-10, 1.7695e-08, 6.5193e-09, 1.3970e-09, 3.2596e-09, 4.1910e-09, -1.7462e-07, -4.0047e-08, 1.8161e-07], device='cuda:0') 100 1e-05 changing lr epoch 420, time 250.61, cls_loss 0.0009 cls_loss_mapping 0.0005 cls_loss_causal 0.3993 re_mapping 0.0028 re_causal 0.0084 /// teacc 99.09 lr 0.00001000 Epoch 422, weight, value: tensor([[-0.1386, -0.2903, -0.0899, ..., -0.0887, 0.1906, 0.2014], [-0.2688, -0.2384, -0.0699, ..., -0.2071, -0.2630, -0.1728], [-0.0673, -0.2187, 0.1663, ..., -0.2702, 0.2980, 0.1395], ..., [-0.2116, 0.1320, 0.0169, ..., 0.2399, -0.2680, -0.3284], [-0.3476, 0.0706, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0622, -0.1739, -0.0848, ..., -0.1604, -0.0715, -0.2579]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -4.6566e-10, -0.0000e+00], ..., [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 6.5193e-09, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 3.2596e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 422, bias, value: tensor([-0.0160, -0.0358, -0.0097, -0.0126, -0.0363, -0.0006, 0.0314, -0.0150, 0.0530, -0.0112], device='cuda:0'), grad: tensor([ 1.3970e-09, 9.3132e-10, -3.7253e-09, 5.1223e-09, 1.8626e-09, -8.2888e-08, 5.4948e-08, 2.7940e-09, 1.0245e-08, 1.3970e-08], device='cuda:0') 100 1e-05 changing lr epoch 421, time 250.81, cls_loss 0.0009 cls_loss_mapping 0.0005 cls_loss_causal 0.4276 re_mapping 0.0027 re_causal 0.0088 /// teacc 99.09 lr 0.00001000 Epoch 423, weight, value: tensor([[-0.1386, -0.2904, -0.0899, ..., -0.0887, 0.1906, 0.2014], [-0.2689, -0.2384, -0.0699, ..., -0.2072, -0.2630, -0.1728], [-0.0673, -0.2187, 0.1663, ..., -0.2703, 0.2980, 0.1395], ..., [-0.2116, 0.1320, 0.0169, ..., 0.2399, -0.2680, -0.3285], [-0.3476, 0.0706, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0622, -0.1739, -0.0848, ..., -0.1604, -0.0715, -0.2579]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -2.7940e-09, -2.3283e-09], [ 4.6566e-10, 1.3970e-09, 0.0000e+00, ..., 1.3970e-09, 9.3132e-10, 4.6566e-10], [ 0.0000e+00, 7.4506e-08, 0.0000e+00, ..., 1.0571e-07, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -8.5682e-08, 0.0000e+00, ..., -1.2200e-07, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 1.2573e-08, 0.0000e+00, ..., 1.4901e-08, 4.6566e-10, 4.6566e-10], [ 2.3283e-09, 2.3283e-09, 0.0000e+00, ..., -0.0000e+00, 9.3132e-10, 4.6566e-10]], device='cuda:0') Epoch 423, bias, value: tensor([-0.0160, -0.0358, -0.0097, -0.0125, -0.0363, -0.0006, 0.0314, -0.0150, 0.0530, -0.0112], device='cuda:0'), grad: tensor([-6.5193e-09, 5.5879e-09, 1.7742e-07, -1.9092e-08, 9.3132e-10, 9.3132e-09, 4.1910e-09, -1.9930e-07, 3.1665e-08, 6.9849e-09], device='cuda:0') 100 1e-05 changing lr epoch 422, time 250.69, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4694 re_mapping 0.0027 re_causal 0.0091 /// teacc 99.10 lr 0.00001000 Epoch 424, weight, value: tensor([[-0.1386, -0.2904, -0.0899, ..., -0.0887, 0.1906, 0.2014], [-0.2689, -0.2384, -0.0699, ..., -0.2072, -0.2630, -0.1728], [-0.0673, -0.2187, 0.1664, ..., -0.2703, 0.2981, 0.1396], ..., [-0.2117, 0.1321, 0.0169, ..., 0.2399, -0.2681, -0.3285], [-0.3477, 0.0706, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0623, -0.1739, -0.0848, ..., -0.1604, -0.0715, -0.2579]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 9.3132e-09, 0.0000e+00, 1.3970e-09], ..., [ 0.0000e+00, -6.5193e-09, 0.0000e+00, ..., -9.1270e-08, 0.0000e+00, -1.1642e-08], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, -0.0000e+00, -0.0000e+00], [ 0.0000e+00, 5.5879e-09, 0.0000e+00, ..., 7.4506e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 424, bias, value: tensor([-0.0160, -0.0358, -0.0097, -0.0124, -0.0363, -0.0007, 0.0314, -0.0150, 0.0530, -0.0112], device='cuda:0'), grad: tensor([ 1.8626e-09, 2.3283e-09, 1.6764e-08, 9.3132e-10, 2.7940e-08, 9.8255e-08, 9.3132e-10, -1.5926e-07, 1.4901e-08, -4.6566e-09], device='cuda:0') 100 1e-05 changing lr epoch 423, time 250.91, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4292 re_mapping 0.0028 re_causal 0.0087 /// teacc 99.12 lr 0.00001000 Epoch 425, weight, value: tensor([[-0.1386, -0.2904, -0.0899, ..., -0.0887, 0.1906, 0.2014], [-0.2689, -0.2385, -0.0699, ..., -0.2072, -0.2630, -0.1728], [-0.0673, -0.2187, 0.1664, ..., -0.2703, 0.2981, 0.1396], ..., [-0.2117, 0.1321, 0.0169, ..., 0.2399, -0.2681, -0.3286], [-0.3477, 0.0706, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0623, -0.1739, -0.0848, ..., -0.1604, -0.0715, -0.2579]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -2.3283e-09, -1.3970e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 3.2596e-09, -0.0000e+00, 0.0000e+00, ..., -3.2596e-09, 9.3132e-10, 4.6566e-10], [ 2.7940e-09, 4.6566e-10, 0.0000e+00, ..., 4.1910e-09, 4.6566e-10, 4.6566e-10]], device='cuda:0') Epoch 425, bias, value: tensor([-0.0160, -0.0358, -0.0097, -0.0123, -0.0362, -0.0007, 0.0314, -0.0150, 0.0530, -0.0112], device='cuda:0'), grad: tensor([-3.7253e-09, -3.5716e-07, 3.0128e-07, 2.2352e-08, 3.2596e-09, -2.3283e-08, 6.5193e-09, 2.6543e-08, -6.9849e-09, 3.9581e-08], device='cuda:0') 100 1e-05 changing lr epoch 424, time 250.69, cls_loss 0.0010 cls_loss_mapping 0.0004 cls_loss_causal 0.4186 re_mapping 0.0028 re_causal 0.0087 /// teacc 99.10 lr 0.00001000 Epoch 426, weight, value: tensor([[-0.1386, -0.2904, -0.0899, ..., -0.0887, 0.1906, 0.2014], [-0.2689, -0.2385, -0.0699, ..., -0.2072, -0.2630, -0.1728], [-0.0673, -0.2187, 0.1664, ..., -0.2703, 0.2982, 0.1398], ..., [-0.2117, 0.1321, 0.0169, ..., 0.2399, -0.2683, -0.3287], [-0.3477, 0.0705, -0.1673, ..., 0.0700, -0.1280, -0.2471], [-0.0623, -0.1739, -0.0848, ..., -0.1604, -0.0715, -0.2579]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -4.1910e-09, -3.2596e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0245e-08, 6.0536e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -2.6077e-08, -1.5832e-08], ..., [ 0.0000e+00, -1.3970e-09, 0.0000e+00, ..., -1.8626e-09, 1.4901e-08, 8.8476e-09], [-3.7253e-09, -4.6566e-09, 0.0000e+00, ..., -2.3749e-08, 4.6566e-10, 4.6566e-10], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, 4.1910e-09, 3.2596e-09]], device='cuda:0') Epoch 426, bias, value: tensor([-0.0160, -0.0358, -0.0096, -0.0123, -0.0362, -0.0007, 0.0314, -0.0150, 0.0530, -0.0112], device='cuda:0'), grad: tensor([ 3.7253e-09, 1.4398e-06, -1.3364e-07, 7.8231e-08, -3.2596e-09, 1.3364e-07, 4.0093e-07, 9.0804e-08, -2.0433e-06, 3.1665e-08], device='cuda:0') 100 1e-05 changing lr epoch 425, time 250.71, cls_loss 0.0010 cls_loss_mapping 0.0004 cls_loss_causal 0.4549 re_mapping 0.0028 re_causal 0.0088 /// teacc 99.10 lr 0.00001000 Epoch 427, weight, value: tensor([[-0.1387, -0.2904, -0.0899, ..., -0.0887, 0.1906, 0.2014], [-0.2689, -0.2385, -0.0698, ..., -0.2071, -0.2630, -0.1728], [-0.0673, -0.2187, 0.1664, ..., -0.2703, 0.2982, 0.1398], ..., [-0.2118, 0.1321, 0.0168, ..., 0.2399, -0.2683, -0.3287], [-0.3477, 0.0705, -0.1674, ..., 0.0701, -0.1280, -0.2471], [-0.0623, -0.1739, -0.0848, ..., -0.1605, -0.0714, -0.2579]], device='cuda:0'), grad: tensor([[4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 4.6566e-10], [0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 4.6566e-10], ..., [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 427, bias, value: tensor([-0.0161, -0.0358, -0.0097, -0.0122, -0.0361, -0.0007, 0.0314, -0.0150, 0.0530, -0.0113], device='cuda:0'), grad: tensor([ 2.3283e-09, 2.3283e-09, 1.3970e-09, 1.8626e-09, 7.9162e-09, 1.4435e-08, -2.3749e-08, 1.8626e-09, 4.6566e-10, -6.0536e-09], device='cuda:0') 100 1e-05 changing lr epoch 426, time 250.25, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4291 re_mapping 0.0027 re_causal 0.0087 /// teacc 99.12 lr 0.00001000 Epoch 428, weight, value: tensor([[-0.1387, -0.2905, -0.0899, ..., -0.0886, 0.1906, 0.2015], [-0.2689, -0.2385, -0.0698, ..., -0.2071, -0.2630, -0.1728], [-0.0673, -0.2187, 0.1664, ..., -0.2704, 0.2982, 0.1398], ..., [-0.2118, 0.1321, 0.0168, ..., 0.2399, -0.2683, -0.3288], [-0.3477, 0.0705, -0.1674, ..., 0.0700, -0.1281, -0.2472], [-0.0624, -0.1739, -0.0848, ..., -0.1605, -0.0714, -0.2579]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 2.0489e-08, 0.0000e+00, 0.0000e+00], [9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-09, 0.0000e+00, 0.0000e+00], [1.3504e-08, 0.0000e+00, 0.0000e+00, ..., 7.4971e-08, 4.6566e-10, 4.6566e-10]], device='cuda:0') Epoch 428, bias, value: tensor([-0.0161, -0.0358, -0.0097, -0.0122, -0.0362, -0.0008, 0.0314, -0.0150, 0.0530, -0.0113], device='cuda:0'), grad: tensor([ 5.5879e-09, 9.3132e-09, 2.7940e-09, 3.2131e-08, -7.9395e-07, -8.9407e-08, 1.5832e-08, 1.9372e-07, 1.7229e-08, 6.1281e-07], device='cuda:0') 100 1e-05 changing lr epoch 427, time 250.32, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4183 re_mapping 0.0027 re_causal 0.0086 /// teacc 99.12 lr 0.00001000 Epoch 429, weight, value: tensor([[-0.1387, -0.2905, -0.0899, ..., -0.0886, 0.1906, 0.2015], [-0.2689, -0.2385, -0.0698, ..., -0.2071, -0.2631, -0.1728], [-0.0673, -0.2187, 0.1664, ..., -0.2704, 0.2982, 0.1398], ..., [-0.2119, 0.1321, 0.0168, ..., 0.2399, -0.2684, -0.3288], [-0.3477, 0.0705, -0.1674, ..., 0.0700, -0.1281, -0.2472], [-0.0624, -0.1739, -0.0848, ..., -0.1605, -0.0715, -0.2579]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -1.8626e-09, 0.0000e+00, ..., -4.1910e-09, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 9.3132e-10, 0.0000e+00, ..., -9.3132e-10, 4.6566e-10, 4.6566e-10], [ 4.6566e-10, 2.7940e-09, 0.0000e+00, ..., 4.1910e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 429, bias, value: tensor([-0.0160, -0.0358, -0.0096, -0.0121, -0.0361, -0.0008, 0.0314, -0.0150, 0.0530, -0.0113], device='cuda:0'), grad: tensor([ 4.6566e-10, -2.0443e-07, 1.3970e-09, -6.0536e-09, 0.0000e+00, 7.9162e-09, -3.7253e-09, 1.6252e-07, 2.3283e-09, 4.3772e-08], device='cuda:0') 100 1e-05 changing lr epoch 428, time 250.72, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4242 re_mapping 0.0027 re_causal 0.0086 /// teacc 99.13 lr 0.00001000 Epoch 430, weight, value: tensor([[-0.1387, -0.2906, -0.0899, ..., -0.0886, 0.1906, 0.2015], [-0.2689, -0.2385, -0.0698, ..., -0.2072, -0.2631, -0.1728], [-0.0673, -0.2187, 0.1665, ..., -0.2704, 0.2982, 0.1399], ..., [-0.2119, 0.1321, 0.0168, ..., 0.2400, -0.2684, -0.3288], [-0.3478, 0.0705, -0.1674, ..., 0.0700, -0.1281, -0.2472], [-0.0624, -0.1740, -0.0848, ..., -0.1605, -0.0715, -0.2580]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, -3.7253e-09, -4.6566e-10, ..., -4.6566e-10, -5.1223e-09, -3.7253e-09], ..., [ 4.6566e-10, 2.7940e-09, 4.6566e-10, ..., -3.6322e-08, 4.6566e-09, 3.2596e-09], [ 8.3819e-09, 9.3132e-10, 0.0000e+00, ..., 7.9162e-09, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 1.3970e-09, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 430, bias, value: tensor([-0.0160, -0.0358, -0.0096, -0.0120, -0.0361, -0.0008, 0.0314, -0.0150, 0.0530, -0.0113], device='cuda:0'), grad: tensor([ 9.3132e-10, -7.4506e-09, -2.2352e-08, 1.3970e-09, 1.1874e-07, -6.2864e-08, 6.2864e-08, -6.2864e-08, 2.9337e-08, -5.1688e-08], device='cuda:0') 100 1e-05 changing lr epoch 429, time 250.20, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4689 re_mapping 0.0027 re_causal 0.0090 /// teacc 99.12 lr 0.00001000 Epoch 431, weight, value: tensor([[-0.1387, -0.2906, -0.0899, ..., -0.0886, 0.1906, 0.2015], [-0.2690, -0.2385, -0.0698, ..., -0.2072, -0.2631, -0.1728], [-0.0673, -0.2187, 0.1665, ..., -0.2705, 0.2983, 0.1399], ..., [-0.2120, 0.1321, 0.0168, ..., 0.2400, -0.2684, -0.3289], [-0.3478, 0.0705, -0.1674, ..., 0.0700, -0.1281, -0.2472], [-0.0624, -0.1740, -0.0848, ..., -0.1606, -0.0714, -0.2580]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, -1.0431e-07, -5.3551e-08], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -9.3132e-10, 0.0000e+00, ..., 4.6566e-10, -2.7940e-09, -3.2596e-09], ..., [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., -1.8626e-09, 3.7253e-09, 3.7253e-09], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., -7.4506e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 1.0477e-07, 5.4017e-08]], device='cuda:0') Epoch 431, bias, value: tensor([-0.0161, -0.0358, -0.0096, -0.0120, -0.0361, -0.0008, 0.0314, -0.0150, 0.0530, -0.0113], device='cuda:0'), grad: tensor([-2.5751e-07, 2.3283e-09, -9.3132e-09, 5.1223e-09, -5.9046e-07, 1.8161e-08, 1.8626e-09, 2.2817e-08, -2.1420e-08, 8.3586e-07], device='cuda:0') 100 1e-05 changing lr epoch 430, time 250.30, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4265 re_mapping 0.0027 re_causal 0.0085 /// teacc 99.13 lr 0.00001000 Epoch 432, weight, value: tensor([[-0.1387, -0.2907, -0.0899, ..., -0.0886, 0.1906, 0.2015], [-0.2690, -0.2385, -0.0698, ..., -0.2072, -0.2631, -0.1728], [-0.0673, -0.2187, 0.1665, ..., -0.2705, 0.2984, 0.1400], ..., [-0.2121, 0.1321, 0.0168, ..., 0.2400, -0.2686, -0.3289], [-0.3478, 0.0705, -0.1674, ..., 0.0701, -0.1281, -0.2472], [-0.0624, -0.1740, -0.0848, ..., -0.1606, -0.0714, -0.2580]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, -3.8557e-07, -3.5483e-07], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-09, 1.8626e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.8626e-09, -3.2596e-09, -2.7940e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 3.7253e-09, 3.7253e-09], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.1910e-09, 3.7253e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 5.4948e-08, 5.1223e-08]], device='cuda:0') Epoch 432, bias, value: tensor([-0.0161, -0.0358, -0.0096, -0.0120, -0.0360, -0.0009, 0.0314, -0.0150, 0.0530, -0.0113], device='cuda:0'), grad: tensor([-9.2713e-07, 7.9162e-09, -2.9802e-08, 1.3970e-09, 4.7171e-07, 6.5193e-09, 7.7626e-07, 5.1223e-08, 2.8871e-08, -3.8324e-07], device='cuda:0') 100 1e-05 changing lr epoch 431, time 250.24, cls_loss 0.0010 cls_loss_mapping 0.0005 cls_loss_causal 0.4369 re_mapping 0.0027 re_causal 0.0087 /// teacc 99.14 lr 0.00001000 Epoch 433, weight, value: tensor([[-0.1388, -0.2907, -0.0899, ..., -0.0886, 0.1906, 0.2015], [-0.2690, -0.2386, -0.0698, ..., -0.2073, -0.2631, -0.1729], [-0.0673, -0.2187, 0.1666, ..., -0.2705, 0.2984, 0.1400], ..., [-0.2121, 0.1322, 0.0168, ..., 0.2401, -0.2686, -0.3290], [-0.3478, 0.0705, -0.1674, ..., 0.0701, -0.1281, -0.2472], [-0.0625, -0.1741, -0.0848, ..., -0.1606, -0.0714, -0.2580]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -4.6566e-10, -4.6566e-10], [ 0.0000e+00, 5.5879e-09, 0.0000e+00, ..., 1.0710e-08, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -6.9849e-09, 0.0000e+00, ..., -1.3039e-08, 0.0000e+00, 0.0000e+00], [ 6.0536e-09, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 4.1910e-09, 3.2596e-09], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 2.3283e-09, 1.3970e-09, 9.3132e-10]], device='cuda:0') Epoch 433, bias, value: tensor([-0.0162, -0.0359, -0.0095, -0.0119, -0.0360, -0.0009, 0.0314, -0.0149, 0.0531, -0.0113], device='cuda:0'), grad: tensor([ 9.3132e-10, 3.3528e-08, 0.0000e+00, 4.6566e-10, 0.0000e+00, 5.7742e-08, -8.0559e-08, -4.0513e-08, 1.9558e-08, 1.0710e-08], device='cuda:0') 100 1e-05 changing lr ---------------------saving model at epoch 432---------------------------------------------------- epoch 432, time 266.64, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4147 re_mapping 0.0027 re_causal 0.0087 /// teacc 99.18 lr 0.00001000 Epoch 434, weight, value: tensor([[-0.1388, -0.2907, -0.0899, ..., -0.0886, 0.1906, 0.2016], [-0.2690, -0.2386, -0.0698, ..., -0.2073, -0.2631, -0.1729], [-0.0673, -0.2187, 0.1666, ..., -0.2706, 0.2984, 0.1400], ..., [-0.2122, 0.1322, 0.0168, ..., 0.2401, -0.2686, -0.3290], [-0.3479, 0.0705, -0.1674, ..., 0.0701, -0.1281, -0.2472], [-0.0625, -0.1741, -0.0848, ..., -0.1607, -0.0714, -0.2581]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.1176e-08, -6.5193e-09], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 5.5879e-09, 1.3970e-09, 0.0000e+00, ..., 0.0000e+00, 8.3819e-09, 7.9162e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 0.0000e+00], [ 2.1886e-08, 7.9162e-09, 0.0000e+00, ..., 4.6566e-10, 2.8405e-08, 2.7940e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 4.1910e-09, 2.7940e-09]], device='cuda:0') Epoch 434, bias, value: tensor([-0.0161, -0.0359, -0.0095, -0.0119, -0.0360, -0.0009, 0.0314, -0.0149, 0.0531, -0.0113], device='cuda:0'), grad: tensor([-2.5611e-08, 3.2596e-09, 3.2131e-08, -6.9849e-09, -9.3132e-10, 1.8626e-09, -1.3271e-07, 3.4925e-08, 1.1967e-07, -2.2352e-08], device='cuda:0') 100 1e-05 changing lr epoch 433, time 250.32, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4450 re_mapping 0.0028 re_causal 0.0089 /// teacc 99.12 lr 0.00001000 Epoch 435, weight, value: tensor([[-0.1388, -0.2907, -0.0899, ..., -0.0886, 0.1907, 0.2016], [-0.2690, -0.2386, -0.0698, ..., -0.2073, -0.2631, -0.1729], [-0.0673, -0.2187, 0.1666, ..., -0.2706, 0.2985, 0.1401], ..., [-0.2122, 0.1322, 0.0168, ..., 0.2401, -0.2687, -0.3290], [-0.3479, 0.0705, -0.1674, ..., 0.0701, -0.1282, -0.2472], [-0.0625, -0.1741, -0.0848, ..., -0.1607, -0.0715, -0.2581]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.8626e-09, -6.7987e-08, -6.7987e-08], [ 4.6566e-10, 1.2573e-08, 0.0000e+00, ..., 2.4214e-08, 9.3132e-10, 4.6566e-10], [ 0.0000e+00, 5.1223e-09, 0.0000e+00, ..., 9.7789e-09, -7.4506e-09, -3.2596e-09], ..., [ 4.6566e-10, -2.6869e-07, 0.0000e+00, ..., -5.0897e-07, 2.3283e-09, 9.3132e-10], [ 4.6566e-09, 2.0023e-08, 0.0000e+00, ..., 1.7229e-08, 9.3132e-10, 4.6566e-10], [ 1.0245e-08, 2.3423e-07, 0.0000e+00, ..., 4.5029e-07, 5.1223e-09, 3.7253e-09]], device='cuda:0') Epoch 435, bias, value: tensor([-0.0161, -0.0359, -0.0094, -0.0119, -0.0360, -0.0009, 0.0314, -0.0149, 0.0530, -0.0114], device='cuda:0'), grad: tensor([-1.2526e-07, 2.0629e-07, -2.7940e-09, 3.7253e-09, -1.7639e-06, -1.5832e-08, 1.4063e-07, -1.3355e-06, 7.5903e-08, 2.8126e-06], device='cuda:0') 100 1e-05 changing lr epoch 434, time 250.70, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4049 re_mapping 0.0027 re_causal 0.0086 /// teacc 99.14 lr 0.00001000 Epoch 436, weight, value: tensor([[-0.1388, -0.2907, -0.0899, ..., -0.0885, 0.1907, 0.2017], [-0.2690, -0.2386, -0.0698, ..., -0.2073, -0.2631, -0.1729], [-0.0674, -0.2186, 0.1666, ..., -0.2706, 0.2986, 0.1401], ..., [-0.2122, 0.1322, 0.0168, ..., 0.2402, -0.2689, -0.3290], [-0.3479, 0.0704, -0.1674, ..., 0.0701, -0.1282, -0.2472], [-0.0625, -0.1741, -0.0848, ..., -0.1607, -0.0715, -0.2582]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 4.6566e-10], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., -7.9162e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -7.4506e-09, 0.0000e+00, ..., -1.3970e-09, 0.0000e+00, -9.3132e-10], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 2.3283e-09, 0.0000e+00, 4.6566e-10]], device='cuda:0') Epoch 436, bias, value: tensor([-0.0161, -0.0359, -0.0094, -0.0119, -0.0360, -0.0009, 0.0314, -0.0149, 0.0531, -0.0114], device='cuda:0'), grad: tensor([ 2.3283e-09, -1.2200e-07, 2.7940e-09, 3.2596e-09, 1.8626e-09, 4.6566e-10, -3.7253e-09, 9.2667e-08, 7.4506e-09, 9.3132e-09], device='cuda:0') 100 1e-05 changing lr epoch 435, time 250.94, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4288 re_mapping 0.0028 re_causal 0.0088 /// teacc 99.12 lr 0.00001000 Epoch 437, weight, value: tensor([[-0.1388, -0.2907, -0.0899, ..., -0.0885, 0.1907, 0.2017], [-0.2691, -0.2386, -0.0698, ..., -0.2073, -0.2632, -0.1729], [-0.0674, -0.2187, 0.1667, ..., -0.2707, 0.2986, 0.1401], ..., [-0.2123, 0.1322, 0.0168, ..., 0.2402, -0.2689, -0.3290], [-0.3480, 0.0704, -0.1674, ..., 0.0701, -0.1282, -0.2472], [-0.0624, -0.1741, -0.0848, ..., -0.1607, -0.0715, -0.2582]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.7323e-07, -1.4203e-07], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 2.3283e-09, 4.6566e-10, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -1.6298e-08, -2.2817e-08, -2.9802e-08], ..., [ 0.0000e+00, -1.2573e-08, 0.0000e+00, ..., 3.2596e-09, 2.1886e-08, 2.8871e-08], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 2.7940e-09, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 4.6566e-09, 0.0000e+00, ..., 5.5879e-09, 2.7940e-09, 2.7940e-09]], device='cuda:0') Epoch 437, bias, value: tensor([-0.0161, -0.0359, -0.0094, -0.0119, -0.0360, -0.0009, 0.0314, -0.0149, 0.0530, -0.0114], device='cuda:0'), grad: tensor([-3.0175e-07, 7.4506e-09, -2.5611e-07, 1.3970e-09, 2.7940e-09, 6.5193e-09, 3.0920e-07, 2.5798e-07, 2.7940e-09, -1.8161e-08], device='cuda:0') 100 1e-05 changing lr epoch 436, time 250.68, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4422 re_mapping 0.0027 re_causal 0.0089 /// teacc 99.13 lr 0.00001000 Epoch 438, weight, value: tensor([[-0.1388, -0.2908, -0.0899, ..., -0.0885, 0.1907, 0.2018], [-0.2691, -0.2387, -0.0698, ..., -0.2073, -0.2632, -0.1729], [-0.0674, -0.2187, 0.1667, ..., -0.2707, 0.2987, 0.1401], ..., [-0.2124, 0.1322, 0.0168, ..., 0.2402, -0.2689, -0.3291], [-0.3480, 0.0704, -0.1674, ..., 0.0700, -0.1282, -0.2472], [-0.0625, -0.1742, -0.0848, ..., -0.1607, -0.0714, -0.2582]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 4.6566e-10, 9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, -4.6566e-10, 0.0000e+00, ..., 9.3132e-10, -9.3132e-10, 0.0000e+00], ..., [ 4.6566e-10, -4.6566e-10, 0.0000e+00, ..., 7.4506e-09, 9.3132e-10, 4.6566e-10], [ 1.0710e-08, 4.6566e-10, 0.0000e+00, ..., 7.9162e-09, -4.6566e-10, -1.8626e-09], [ 2.7940e-09, 4.6566e-10, 0.0000e+00, ..., 1.7229e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 438, bias, value: tensor([-0.0161, -0.0359, -0.0094, -0.0117, -0.0360, -0.0009, 0.0313, -0.0149, 0.0530, -0.0114], device='cuda:0'), grad: tensor([ 7.4506e-09, 1.3504e-08, 4.1910e-09, 1.1642e-08, -2.6403e-07, -4.2375e-08, 5.5879e-09, 5.8673e-08, 1.3039e-08, 1.9697e-07], device='cuda:0') 100 1e-05 changing lr epoch 437, time 250.53, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4196 re_mapping 0.0027 re_causal 0.0087 /// teacc 99.11 lr 0.00001000 Epoch 439, weight, value: tensor([[-0.1389, -0.2909, -0.0899, ..., -0.0886, 0.1907, 0.2018], [-0.2691, -0.2387, -0.0697, ..., -0.2072, -0.2632, -0.1729], [-0.0674, -0.2187, 0.1667, ..., -0.2707, 0.2987, 0.1401], ..., [-0.2125, 0.1322, 0.0167, ..., 0.2401, -0.2689, -0.3291], [-0.3481, 0.0704, -0.1674, ..., 0.0700, -0.1283, -0.2473], [-0.0625, -0.1742, -0.0848, ..., -0.1608, -0.0715, -0.2583]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 3.7253e-09, 1.3970e-09], [ 6.9849e-10, 9.3132e-10, 0.0000e+00, ..., -3.0268e-09, 2.3283e-10, 2.3283e-10], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, -2.0256e-08, 0.0000e+00], ..., [ 0.0000e+00, -3.0268e-09, 0.0000e+00, ..., 1.3970e-09, 1.6298e-09, 0.0000e+00], [ 9.3132e-10, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 2.3283e-10], [ 2.3283e-10, 1.6298e-09, 0.0000e+00, ..., 1.1642e-09, 2.3283e-10, 2.3283e-10]], device='cuda:0') Epoch 439, bias, value: tensor([-0.0161, -0.0359, -0.0094, -0.0117, -0.0360, -0.0008, 0.0313, -0.0150, 0.0529, -0.0114], device='cuda:0'), grad: tensor([ 1.2806e-08, -5.1921e-08, -2.6776e-08, 2.4447e-08, 1.5134e-08, 5.8906e-08, -9.0571e-08, 6.1933e-08, 3.0268e-09, -6.0536e-09], device='cuda:0') 100 1e-05 changing lr epoch 438, time 250.50, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4287 re_mapping 0.0027 re_causal 0.0087 /// teacc 99.13 lr 0.00001000 Epoch 440, weight, value: tensor([[-0.1389, -0.2910, -0.0899, ..., -0.0886, 0.1908, 0.2018], [-0.2691, -0.2387, -0.0697, ..., -0.2073, -0.2632, -0.1730], [-0.0674, -0.2187, 0.1667, ..., -0.2707, 0.2987, 0.1402], ..., [-0.2125, 0.1322, 0.0167, ..., 0.2401, -0.2690, -0.3292], [-0.3481, 0.0704, -0.1674, ..., 0.0700, -0.1283, -0.2473], [-0.0625, -0.1742, -0.0848, ..., -0.1608, -0.0715, -0.2583]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 6.9849e-10, 1.3970e-09, 0.0000e+00, ..., 4.6566e-10, 6.9849e-10, 4.6566e-10], [ 0.0000e+00, 6.7521e-09, 0.0000e+00, ..., 8.3819e-09, 0.0000e+00, 4.6566e-10], ..., [ 4.6566e-10, -1.8161e-08, 0.0000e+00, ..., -2.0489e-08, 0.0000e+00, -6.9849e-10], [ 9.3132e-10, 1.3970e-09, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 2.3283e-09, 1.2573e-08, 0.0000e+00, ..., 8.8476e-09, 4.6566e-10, 2.3283e-10]], device='cuda:0') Epoch 440, bias, value: tensor([-0.0161, -0.0359, -0.0094, -0.0117, -0.0360, -0.0007, 0.0312, -0.0150, 0.0529, -0.0114], device='cuda:0'), grad: tensor([ 2.3283e-10, 5.3551e-09, 2.2352e-08, -1.7649e-07, 4.6566e-10, 1.6764e-07, 6.9849e-10, -4.4005e-08, 9.3132e-10, 2.8638e-08], device='cuda:0') 100 1e-05 changing lr epoch 439, time 250.44, cls_loss 0.0007 cls_loss_mapping 0.0003 cls_loss_causal 0.4035 re_mapping 0.0026 re_causal 0.0086 /// teacc 99.12 lr 0.00001000 Epoch 441, weight, value: tensor([[-0.1389, -0.2910, -0.0899, ..., -0.0885, 0.1909, 0.2019], [-0.2691, -0.2387, -0.0697, ..., -0.2073, -0.2632, -0.1730], [-0.0674, -0.2187, 0.1667, ..., -0.2708, 0.2987, 0.1402], ..., [-0.2126, 0.1322, 0.0167, ..., 0.2402, -0.2690, -0.3292], [-0.3482, 0.0704, -0.1674, ..., 0.0700, -0.1283, -0.2473], [-0.0624, -0.1742, -0.0848, ..., -0.1608, -0.0715, -0.2583]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.9162e-07, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [-2.2002e-07, 1.1642e-09, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 1.1642e-09, 2.3283e-10, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 9.7789e-09, 2.3283e-10, 0.0000e+00, ..., -1.6298e-09, 2.3283e-10, 2.3283e-10], [ 4.1910e-09, 2.5611e-09, 0.0000e+00, ..., -4.6566e-10, 2.3283e-10, 2.3283e-10]], device='cuda:0') Epoch 441, bias, value: tensor([-0.0160, -0.0359, -0.0093, -0.0116, -0.0360, -0.0007, 0.0312, -0.0150, 0.0529, -0.0114], device='cuda:0'), grad: tensor([ 1.2806e-08, 1.4855e-06, -1.7090e-06, -1.5600e-08, 7.5670e-08, 4.2841e-08, 2.3749e-08, 9.0804e-09, 6.1700e-08, 1.2573e-08], device='cuda:0') 100 1e-05 changing lr epoch 440, time 250.95, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4243 re_mapping 0.0027 re_causal 0.0087 /// teacc 99.15 lr 0.00001000 Epoch 442, weight, value: tensor([[-0.1389, -0.2910, -0.0899, ..., -0.0884, 0.1909, 0.2020], [-0.2692, -0.2387, -0.0697, ..., -0.2073, -0.2632, -0.1730], [-0.0674, -0.2188, 0.1667, ..., -0.2708, 0.2988, 0.1403], ..., [-0.2126, 0.1322, 0.0167, ..., 0.2402, -0.2691, -0.3293], [-0.3482, 0.0704, -0.1674, ..., 0.0699, -0.1283, -0.2473], [-0.0625, -0.1742, -0.0848, ..., -0.1608, -0.0715, -0.2584]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.4040e-07, -1.3551e-07], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 1.6298e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-09, 4.1910e-09], ..., [ 4.6566e-10, 2.0955e-09, 0.0000e+00, ..., 6.9849e-10, 1.8626e-09, 1.6298e-09], [ 4.6566e-10, -2.5611e-09, 0.0000e+00, ..., -1.5832e-08, 3.5157e-08, 3.0501e-08], [ 0.0000e+00, 3.0268e-09, 0.0000e+00, ..., 4.8894e-09, 2.3749e-08, 2.2119e-08]], device='cuda:0') Epoch 442, bias, value: tensor([-0.0160, -0.0359, -0.0094, -0.0116, -0.0358, -0.0008, 0.0312, -0.0150, 0.0529, -0.0114], device='cuda:0'), grad: tensor([-2.3795e-07, 7.2177e-09, 1.0710e-08, -2.5611e-09, -2.6310e-08, 2.8173e-08, 9.7556e-08, 1.1642e-08, 4.3772e-08, 7.8930e-08], device='cuda:0') 100 1e-05 changing lr epoch 441, time 250.66, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4282 re_mapping 0.0027 re_causal 0.0088 /// teacc 99.14 lr 0.00001000 Epoch 443, weight, value: tensor([[-0.1389, -0.2910, -0.0899, ..., -0.0884, 0.1909, 0.2020], [-0.2692, -0.2387, -0.0697, ..., -0.2073, -0.2633, -0.1730], [-0.0674, -0.2188, 0.1667, ..., -0.2708, 0.2988, 0.1403], ..., [-0.2127, 0.1322, 0.0167, ..., 0.2402, -0.2691, -0.3293], [-0.3483, 0.0704, -0.1674, ..., 0.0700, -0.1283, -0.2473], [-0.0625, -0.1742, -0.0848, ..., -0.1608, -0.0715, -0.2584]], device='cuda:0'), grad: tensor([[ 6.9849e-09, 4.6566e-10, 0.0000e+00, ..., 1.8626e-09, 2.3283e-09, 1.1642e-09], [ 4.6566e-10, 2.3283e-10, 2.3283e-10, ..., 1.1642e-09, 6.9849e-10, 4.6566e-10], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.9849e-10, 4.6566e-10], ..., [ 2.3283e-10, -8.1491e-09, 4.6566e-10, ..., -1.8859e-08, 0.0000e+00, 0.0000e+00], [ 6.7521e-09, 0.0000e+00, 0.0000e+00, ..., 3.9581e-09, 6.9849e-10, 4.6566e-10], [ 4.6566e-10, 7.9162e-09, 1.6298e-09, ..., 2.6077e-08, 2.3283e-10, 2.3283e-10]], device='cuda:0') Epoch 443, bias, value: tensor([-0.0160, -0.0359, -0.0094, -0.0116, -0.0358, -0.0008, 0.0312, -0.0150, 0.0529, -0.0114], device='cuda:0'), grad: tensor([ 1.5134e-08, 4.8894e-09, 1.8626e-09, 4.2934e-07, -1.9092e-08, -3.9325e-07, -8.0792e-08, -2.3283e-08, 1.5367e-08, 5.1688e-08], device='cuda:0') 100 1e-05 changing lr epoch 442, time 251.06, cls_loss 0.0008 cls_loss_mapping 0.0003 cls_loss_causal 0.4097 re_mapping 0.0026 re_causal 0.0085 /// teacc 99.13 lr 0.00001000 Epoch 444, weight, value: tensor([[-0.1390, -0.2910, -0.0899, ..., -0.0884, 0.1909, 0.2020], [-0.2692, -0.2389, -0.0697, ..., -0.2073, -0.2633, -0.1729], [-0.0674, -0.2188, 0.1667, ..., -0.2709, 0.2989, 0.1402], ..., [-0.2128, 0.1323, 0.0167, ..., 0.2403, -0.2691, -0.3294], [-0.3483, 0.0704, -0.1674, ..., 0.0700, -0.1284, -0.2473], [-0.0625, -0.1742, -0.0848, ..., -0.1609, -0.0715, -0.2584]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.3504e-08, -9.3132e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 1.1642e-09, 0.0000e+00, ..., -1.1642e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 6.9849e-10, 4.6566e-10]], device='cuda:0') Epoch 444, bias, value: tensor([-0.0160, -0.0360, -0.0094, -0.0116, -0.0358, -0.0008, 0.0312, -0.0149, 0.0529, -0.0115], device='cuda:0'), grad: tensor([-2.2119e-08, 1.1642e-09, 2.3283e-10, 3.7253e-09, 1.3970e-09, 1.3970e-09, 2.4680e-08, 1.1642e-09, 9.3132e-10, -4.4238e-09], device='cuda:0') 100 1e-05 changing lr epoch 443, time 250.96, cls_loss 0.0008 cls_loss_mapping 0.0003 cls_loss_causal 0.3950 re_mapping 0.0026 re_causal 0.0083 /// teacc 99.13 lr 0.00001000 Epoch 445, weight, value: tensor([[-0.1390, -0.2910, -0.0899, ..., -0.0883, 0.1911, 0.2023], [-0.2692, -0.2391, -0.0697, ..., -0.2074, -0.2633, -0.1729], [-0.0674, -0.2188, 0.1667, ..., -0.2709, 0.2989, 0.1403], ..., [-0.2129, 0.1325, 0.0167, ..., 0.2404, -0.2692, -0.3294], [-0.3483, 0.0703, -0.1674, ..., 0.0700, -0.1284, -0.2474], [-0.0625, -0.1742, -0.0848, ..., -0.1609, -0.0716, -0.2584]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.1642e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.4238e-09, -8.6147e-09, 0.0000e+00], ..., [ 1.8626e-09, 2.0955e-09, 0.0000e+00, ..., 1.1642e-09, 6.5193e-09, 0.0000e+00], [ 1.1642e-09, 2.5611e-09, 0.0000e+00, ..., -7.4506e-09, 0.0000e+00, 0.0000e+00], [-1.6298e-09, 0.0000e+00, 0.0000e+00, ..., -2.3283e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 445, bias, value: tensor([-0.0158, -0.0361, -0.0094, -0.0114, -0.0357, -0.0008, 0.0311, -0.0148, 0.0529, -0.0115], device='cuda:0'), grad: tensor([ 4.1910e-09, -1.3970e-09, -3.4925e-09, -6.2864e-09, 3.0268e-09, 6.0536e-09, 8.3819e-09, 3.3062e-08, -1.9791e-08, -1.6531e-08], device='cuda:0') 100 1e-05 changing lr epoch 444, time 250.95, cls_loss 0.0007 cls_loss_mapping 0.0003 cls_loss_causal 0.4197 re_mapping 0.0025 re_causal 0.0086 /// teacc 99.13 lr 0.00001000 Epoch 446, weight, value: tensor([[-0.1390, -0.2911, -0.0900, ..., -0.0883, 0.1912, 0.2024], [-0.2693, -0.2391, -0.0697, ..., -0.2075, -0.2633, -0.1729], [-0.0674, -0.2188, 0.1667, ..., -0.2710, 0.2989, 0.1402], ..., [-0.2130, 0.1325, 0.0167, ..., 0.2404, -0.2692, -0.3294], [-0.3484, 0.0703, -0.1674, ..., 0.0700, -0.1284, -0.2474], [-0.0625, -0.1742, -0.0848, ..., -0.1609, -0.0716, -0.2585]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -4.6566e-10, 0.0000e+00, ..., -1.8394e-08, -2.9569e-08, -3.3295e-08], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 2.3283e-10, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 6.9849e-10, 6.9849e-10], ..., [ 2.3283e-10, -4.4238e-09, 0.0000e+00, ..., -2.0955e-09, 6.9849e-10, 6.9849e-10], [ 6.9849e-10, 1.1642e-09, 0.0000e+00, ..., 2.0955e-09, 2.3283e-10, 2.3283e-10], [ 2.3283e-10, 2.3283e-10, 0.0000e+00, ..., 3.0268e-09, 4.4238e-09, 4.8894e-09]], device='cuda:0') Epoch 446, bias, value: tensor([-0.0158, -0.0361, -0.0095, -0.0115, -0.0357, -0.0008, 0.0311, -0.0147, 0.0529, -0.0115], device='cuda:0'), grad: tensor([-9.2201e-08, 1.6298e-09, 1.8626e-09, 4.6566e-09, -2.5611e-09, -1.6298e-09, 7.5903e-08, 3.7486e-08, 6.0536e-09, -2.9569e-08], device='cuda:0') 100 1e-05 changing lr epoch 445, time 250.65, cls_loss 0.0008 cls_loss_mapping 0.0003 cls_loss_causal 0.4285 re_mapping 0.0026 re_causal 0.0086 /// teacc 99.13 lr 0.00001000 Epoch 447, weight, value: tensor([[-0.1390, -0.2911, -0.0900, ..., -0.0883, 0.1912, 0.2024], [-0.2694, -0.2392, -0.0697, ..., -0.2075, -0.2634, -0.1731], [-0.0675, -0.2189, 0.1667, ..., -0.2711, 0.2990, 0.1404], ..., [-0.2130, 0.1326, 0.0167, ..., 0.2404, -0.2693, -0.3295], [-0.3484, 0.0703, -0.1674, ..., 0.0699, -0.1284, -0.2474], [-0.0626, -0.1743, -0.0848, ..., -0.1609, -0.0716, -0.2585]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -0.0000e+00, -6.9849e-10, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1642e-09, 2.3283e-10, 2.3283e-10]], device='cuda:0') Epoch 447, bias, value: tensor([-0.0158, -0.0361, -0.0094, -0.0115, -0.0357, -0.0008, 0.0311, -0.0147, 0.0529, -0.0115], device='cuda:0'), grad: tensor([ 9.3132e-10, 4.6566e-10, -3.7253e-09, 3.2596e-09, -1.5064e-07, -3.4925e-09, 1.6298e-09, 2.3283e-09, 6.9849e-10, 1.5320e-07], device='cuda:0') 100 1e-05 changing lr epoch 446, time 250.35, cls_loss 0.0007 cls_loss_mapping 0.0003 cls_loss_causal 0.4023 re_mapping 0.0026 re_causal 0.0085 /// teacc 99.11 lr 0.00001000 Epoch 448, weight, value: tensor([[-0.1390, -0.2911, -0.0900, ..., -0.0883, 0.1912, 0.2024], [-0.2694, -0.2392, -0.0697, ..., -0.2075, -0.2634, -0.1731], [-0.0675, -0.2189, 0.1668, ..., -0.2712, 0.2990, 0.1405], ..., [-0.2130, 0.1326, 0.0167, ..., 0.2404, -0.2693, -0.3297], [-0.3485, 0.0703, -0.1674, ..., 0.0700, -0.1285, -0.2474], [-0.0626, -0.1743, -0.0848, ..., -0.1610, -0.0716, -0.2585]], device='cuda:0'), grad: tensor([[ 1.6298e-09, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, -3.4925e-09, -2.7940e-09], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 1.6298e-09, 2.3283e-10], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, -6.9849e-10, 2.3283e-10], ..., [ 0.0000e+00, -2.0955e-09, 0.0000e+00, ..., -4.8894e-09, 0.0000e+00, 0.0000e+00], [ 2.0955e-09, 0.0000e+00, 0.0000e+00, ..., -8.3819e-09, 2.5611e-09, 2.0955e-09], [ 0.0000e+00, 1.8626e-09, 0.0000e+00, ..., 5.1223e-09, 2.3283e-10, 2.3283e-10]], device='cuda:0') Epoch 448, bias, value: tensor([-0.0158, -0.0361, -0.0094, -0.0115, -0.0357, -0.0008, 0.0311, -0.0147, 0.0529, -0.0116], device='cuda:0'), grad: tensor([-3.4925e-09, -1.0058e-07, 1.1874e-08, 4.6566e-09, 4.4238e-09, 1.1176e-08, 9.3132e-10, -3.4925e-09, 6.3097e-08, 1.3504e-08], device='cuda:0') 100 1e-05 changing lr epoch 447, time 250.52, cls_loss 0.0007 cls_loss_mapping 0.0003 cls_loss_causal 0.4229 re_mapping 0.0026 re_causal 0.0087 /// teacc 99.10 lr 0.00001000 Epoch 449, weight, value: tensor([[-0.1390, -0.2912, -0.0900, ..., -0.0883, 0.1912, 0.2024], [-0.2694, -0.2392, -0.0697, ..., -0.2075, -0.2635, -0.1732], [-0.0675, -0.2189, 0.1668, ..., -0.2712, 0.2991, 0.1407], ..., [-0.2130, 0.1327, 0.0167, ..., 0.2405, -0.2694, -0.3298], [-0.3485, 0.0703, -0.1674, ..., 0.0698, -0.1285, -0.2475], [-0.0626, -0.1743, -0.0848, ..., -0.1610, -0.0716, -0.2586]], device='cuda:0'), grad: tensor([[6.0536e-09, 0.0000e+00, 0.0000e+00, ..., 4.1910e-09, 4.8894e-09, 4.4238e-09], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [3.4925e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 2.7940e-09, 2.5611e-09], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 449, bias, value: tensor([-0.0157, -0.0362, -0.0094, -0.0114, -0.0357, -0.0007, 0.0310, -0.0147, 0.0528, -0.0116], device='cuda:0'), grad: tensor([ 2.3516e-08, 4.6566e-10, 4.6566e-10, 0.0000e+00, 1.8626e-09, 2.7344e-06, -2.7716e-06, 6.9849e-10, 1.0012e-08, 2.3283e-09], device='cuda:0') 100 1e-05 changing lr epoch 448, time 250.32, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4334 re_mapping 0.0026 re_causal 0.0085 /// teacc 99.10 lr 0.00001000 Epoch 450, weight, value: tensor([[-0.1391, -0.2912, -0.0900, ..., -0.0882, 0.1913, 0.2025], [-0.2694, -0.2392, -0.0697, ..., -0.2075, -0.2635, -0.1732], [-0.0675, -0.2189, 0.1668, ..., -0.2712, 0.2992, 0.1407], ..., [-0.2131, 0.1327, 0.0167, ..., 0.2405, -0.2695, -0.3298], [-0.3486, 0.0702, -0.1674, ..., 0.0698, -0.1286, -0.2475], [-0.0627, -0.1743, -0.0848, ..., -0.1611, -0.0717, -0.2586]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 6.9849e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 2.3283e-10], ..., [ 2.3283e-10, -2.3283e-09, 0.0000e+00, ..., -3.7253e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., -9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 3.0268e-09, 9.3132e-10, 4.6566e-10]], device='cuda:0') Epoch 450, bias, value: tensor([-0.0157, -0.0362, -0.0093, -0.0113, -0.0357, -0.0006, 0.0310, -0.0147, 0.0527, -0.0116], device='cuda:0'), grad: tensor([ 2.3283e-10, 3.0268e-09, 9.3132e-10, -3.0268e-09, 0.0000e+00, 3.7253e-09, 9.3132e-10, -4.6566e-09, -1.3970e-09, 7.9162e-09], device='cuda:0') 100 1e-05 changing lr epoch 449, time 250.43, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4232 re_mapping 0.0026 re_causal 0.0085 /// teacc 99.08 lr 0.00001000 Epoch 451, weight, value: tensor([[-0.1391, -0.2912, -0.0900, ..., -0.0882, 0.1914, 0.2026], [-0.2695, -0.2392, -0.0697, ..., -0.2075, -0.2636, -0.1733], [-0.0675, -0.2189, 0.1668, ..., -0.2713, 0.2993, 0.1408], ..., [-0.2131, 0.1327, 0.0167, ..., 0.2405, -0.2695, -0.3299], [-0.3486, 0.0702, -0.1674, ..., 0.0697, -0.1286, -0.2475], [-0.0627, -0.1744, -0.0848, ..., -0.1611, -0.0717, -0.2586]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -2.7940e-08, -1.6298e-08], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 2.7940e-09, 0.0000e+00, ..., -0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.7474e-08, 1.5832e-08]], device='cuda:0') Epoch 451, bias, value: tensor([-0.0156, -0.0362, -0.0093, -0.0112, -0.0357, -0.0005, 0.0308, -0.0147, 0.0525, -0.0116], device='cuda:0'), grad: tensor([-6.7055e-08, 4.6566e-09, 2.7940e-09, -8.8476e-09, 1.6717e-07, 4.6566e-10, 1.3970e-09, 8.8476e-08, 9.3132e-10, -1.9278e-07], device='cuda:0') 100 1e-05 changing lr epoch 450, time 250.62, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4244 re_mapping 0.0025 re_causal 0.0083 /// teacc 99.10 lr 0.00001000 Epoch 452, weight, value: tensor([[-0.1391, -0.2913, -0.0900, ..., -0.0881, 0.1914, 0.2026], [-0.2696, -0.2393, -0.0697, ..., -0.2076, -0.2637, -0.1734], [-0.0676, -0.2189, 0.1668, ..., -0.2714, 0.2992, 0.1407], ..., [-0.2132, 0.1327, 0.0167, ..., 0.2406, -0.2695, -0.3298], [-0.3487, 0.0702, -0.1674, ..., 0.0696, -0.1286, -0.2475], [-0.0628, -0.1744, -0.0848, ..., -0.1611, -0.0717, -0.2587]], device='cuda:0'), grad: tensor([[ 1.0710e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.0489e-08, 2.2352e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, -3.7253e-09, ..., -4.1910e-09, -4.6566e-10, -3.2596e-09], ..., [ 0.0000e+00, 0.0000e+00, 4.6566e-10, ..., 4.6566e-10, 0.0000e+00, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 2.7940e-09, ..., 3.2596e-09, 9.3132e-10, 3.2596e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.3132e-10, 9.3132e-10]], device='cuda:0') Epoch 452, bias, value: tensor([-0.0156, -0.0363, -0.0093, -0.0112, -0.0357, -0.0005, 0.0308, -0.0146, 0.0525, -0.0116], device='cuda:0'), grad: tensor([ 8.6147e-08, 1.3970e-08, -2.8405e-08, 4.6566e-10, -2.3283e-08, 2.7940e-09, -7.9162e-08, 4.6566e-09, 2.4680e-08, 2.7940e-09], device='cuda:0') 100 1e-05 changing lr epoch 451, time 250.63, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4081 re_mapping 0.0025 re_causal 0.0082 /// teacc 99.09 lr 0.00001000 Epoch 453, weight, value: tensor([[-0.1392, -0.2913, -0.0900, ..., -0.0881, 0.1914, 0.2026], [-0.2696, -0.2393, -0.0696, ..., -0.2076, -0.2637, -0.1734], [-0.0676, -0.2190, 0.1668, ..., -0.2714, 0.2993, 0.1408], ..., [-0.2133, 0.1327, 0.0166, ..., 0.2406, -0.2696, -0.3299], [-0.3488, 0.0702, -0.1674, ..., 0.0696, -0.1287, -0.2475], [-0.0628, -0.1744, -0.0848, ..., -0.1612, -0.0717, -0.2587]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 1.3970e-09, -4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -3.2596e-09, 0.0000e+00, ..., -3.2596e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 453, bias, value: tensor([-0.0157, -0.0362, -0.0093, -0.0112, -0.0357, -0.0005, 0.0309, -0.0147, 0.0525, -0.0116], device='cuda:0'), grad: tensor([ 9.3132e-10, -7.4506e-09, 1.8626e-09, 1.3970e-09, 5.1223e-09, 4.6566e-10, 7.4506e-09, -5.5879e-09, 3.7253e-09, -9.3132e-10], device='cuda:0') 100 1e-05 changing lr epoch 452, time 250.79, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4297 re_mapping 0.0026 re_causal 0.0083 /// teacc 99.09 lr 0.00001000 Epoch 454, weight, value: tensor([[-0.1392, -0.2914, -0.0900, ..., -0.0882, 0.1914, 0.2026], [-0.2696, -0.2394, -0.0696, ..., -0.2077, -0.2637, -0.1733], [-0.0676, -0.2190, 0.1668, ..., -0.2715, 0.2993, 0.1407], ..., [-0.2133, 0.1328, 0.0166, ..., 0.2407, -0.2697, -0.3300], [-0.3489, 0.0702, -0.1674, ..., 0.0696, -0.1287, -0.2476], [-0.0628, -0.1745, -0.0848, ..., -0.1612, -0.0717, -0.2587]], device='cuda:0'), grad: tensor([[ 5.1223e-09, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 2.7940e-09, 1.3970e-09], [ 4.6566e-10, 2.3283e-09, 0.0000e+00, ..., 1.8626e-09, 4.6566e-10, 0.0000e+00], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -2.3283e-09, 0.0000e+00, ..., -2.3283e-09, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 4.6566e-10, 0.0000e+00, ..., -4.6566e-09, 4.6566e-10, 4.6566e-10], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 454, bias, value: tensor([-0.0157, -0.0363, -0.0095, -0.0112, -0.0358, -0.0006, 0.0309, -0.0146, 0.0525, -0.0116], device='cuda:0'), grad: tensor([ 1.1176e-08, 6.9849e-09, 2.3283e-09, 5.5879e-09, 1.2573e-08, 3.8650e-08, -6.3796e-08, -6.0536e-09, -8.8476e-09, 1.8626e-09], device='cuda:0') 100 1e-05 changing lr epoch 453, time 250.50, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4328 re_mapping 0.0025 re_causal 0.0084 /// teacc 99.09 lr 0.00001000 Epoch 455, weight, value: tensor([[-0.1392, -0.2914, -0.0900, ..., -0.0881, 0.1915, 0.2027], [-0.2696, -0.2394, -0.0696, ..., -0.2077, -0.2637, -0.1733], [-0.0676, -0.2190, 0.1668, ..., -0.2715, 0.2994, 0.1407], ..., [-0.2134, 0.1328, 0.0166, ..., 0.2407, -0.2698, -0.3300], [-0.3489, 0.0702, -0.1674, ..., 0.0697, -0.1287, -0.2476], [-0.0628, -0.1745, -0.0848, ..., -0.1613, -0.0718, -0.2588]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 8.3819e-09, 0.0000e+00, ..., 6.9849e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, -9.3132e-10, -0.0000e+00], ..., [ 0.0000e+00, -6.1467e-08, 0.0000e+00, ..., -5.0757e-08, 0.0000e+00, 0.0000e+00], [ 1.3970e-09, 1.3970e-09, 0.0000e+00, ..., 2.3283e-09, 9.3132e-10, 0.0000e+00], [ 0.0000e+00, 3.7719e-08, 0.0000e+00, ..., 3.2596e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 455, bias, value: tensor([-0.0156, -0.0363, -0.0094, -0.0112, -0.0357, -0.0006, 0.0309, -0.0146, 0.0526, -0.0117], device='cuda:0'), grad: tensor([ 2.3283e-09, 3.2596e-08, -2.3283e-09, 5.5879e-08, 1.8626e-09, -6.0536e-09, 3.2596e-09, -2.2864e-07, 4.6566e-09, 1.4529e-07], device='cuda:0') 100 1e-05 changing lr epoch 454, time 250.83, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4313 re_mapping 0.0025 re_causal 0.0085 /// teacc 99.11 lr 0.00001000 Epoch 456, weight, value: tensor([[-0.1392, -0.2914, -0.0900, ..., -0.0881, 0.1916, 0.2028], [-0.2697, -0.2394, -0.0696, ..., -0.2077, -0.2637, -0.1733], [-0.0677, -0.2190, 0.1668, ..., -0.2717, 0.2994, 0.1407], ..., [-0.2134, 0.1329, 0.0166, ..., 0.2408, -0.2699, -0.3301], [-0.3490, 0.0701, -0.1675, ..., 0.0696, -0.1287, -0.2476], [-0.0629, -0.1746, -0.0848, ..., -0.1614, -0.0718, -0.2588]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 5.1223e-09, 1.3970e-09, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, -1.8626e-09, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -4.6566e-10, 0.0000e+00, ..., -1.0245e-08, -1.3970e-09, -1.3970e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., -1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 456, bias, value: tensor([-0.0156, -0.0363, -0.0094, -0.0112, -0.0356, -0.0005, 0.0308, -0.0146, 0.0525, -0.0118], device='cuda:0'), grad: tensor([ 5.5414e-08, -9.9186e-08, 3.7253e-09, 1.3970e-09, -9.5926e-08, 1.4435e-08, 3.5390e-08, -5.5879e-09, 8.8476e-09, 8.2422e-08], device='cuda:0') 100 1e-05 changing lr epoch 455, time 250.91, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4448 re_mapping 0.0025 re_causal 0.0084 /// teacc 99.10 lr 0.00001000 Epoch 457, weight, value: tensor([[-0.1392, -0.2915, -0.0900, ..., -0.0881, 0.1916, 0.2029], [-0.2697, -0.2395, -0.0695, ..., -0.2076, -0.2638, -0.1734], [-0.0677, -0.2191, 0.1668, ..., -0.2717, 0.2995, 0.1410], ..., [-0.2134, 0.1330, 0.0165, ..., 0.2407, -0.2700, -0.3303], [-0.3490, 0.0701, -0.1675, ..., 0.0696, -0.1288, -0.2477], [-0.0629, -0.1746, -0.0848, ..., -0.1615, -0.0718, -0.2589]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -4.6566e-10, 0.0000e+00, ..., -4.6566e-10, -4.6566e-10, 0.0000e+00], ..., [ 0.0000e+00, -6.5193e-09, 0.0000e+00, ..., -9.3132e-09, 0.0000e+00, 0.0000e+00], [ 1.8626e-09, 1.8626e-09, 0.0000e+00, ..., 2.7940e-09, 9.3132e-10, 4.6566e-10], [ 2.3283e-09, 6.0536e-09, 0.0000e+00, ..., 1.3504e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 457, bias, value: tensor([-0.0155, -0.0363, -0.0093, -0.0111, -0.0355, -0.0005, 0.0307, -0.0146, 0.0525, -0.0118], device='cuda:0'), grad: tensor([ 1.8626e-09, 7.9162e-09, -2.7940e-09, 8.4750e-08, -4.8429e-08, -9.0804e-08, -1.8626e-09, -9.3132e-09, 1.3504e-08, 5.4948e-08], device='cuda:0') 100 1e-05 changing lr epoch 456, time 250.29, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4220 re_mapping 0.0025 re_causal 0.0083 /// teacc 99.11 lr 0.00001000 Epoch 458, weight, value: tensor([[-0.1393, -0.2915, -0.0900, ..., -0.0881, 0.1916, 0.2029], [-0.2698, -0.2395, -0.0695, ..., -0.2076, -0.2638, -0.1734], [-0.0677, -0.2190, 0.1668, ..., -0.2718, 0.2996, 0.1410], ..., [-0.2135, 0.1330, 0.0165, ..., 0.2407, -0.2701, -0.3304], [-0.3491, 0.0701, -0.1675, ..., 0.0696, -0.1288, -0.2477], [-0.0629, -0.1747, -0.0848, ..., -0.1615, -0.0719, -0.2590]], device='cuda:0'), grad: tensor([[ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, -1.7229e-08, -1.3970e-08], [ 1.5460e-07, 4.6566e-10, 0.0000e+00, ..., 6.4727e-08, 4.6566e-10, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 4.6566e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 2.8871e-08, 0.0000e+00, 0.0000e+00, ..., 1.3504e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 6.5193e-09, 4.6566e-09]], device='cuda:0') Epoch 458, bias, value: tensor([-0.0156, -0.0363, -0.0092, -0.0111, -0.0355, -0.0005, 0.0308, -0.0146, 0.0525, -0.0118], device='cuda:0'), grad: tensor([-2.9802e-08, 3.4133e-07, 1.3970e-09, 4.6566e-10, 5.1223e-09, 1.0189e-06, -1.4026e-06, 2.7940e-09, 6.4261e-08, 5.5879e-09], device='cuda:0') 100 1e-05 changing lr epoch 457, time 250.44, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4257 re_mapping 0.0026 re_causal 0.0084 /// teacc 99.11 lr 0.00001000 Epoch 459, weight, value: tensor([[-0.1393, -0.2916, -0.0900, ..., -0.0881, 0.1916, 0.2029], [-0.2699, -0.2395, -0.0695, ..., -0.2076, -0.2639, -0.1734], [-0.0677, -0.2191, 0.1669, ..., -0.2719, 0.2997, 0.1410], ..., [-0.2137, 0.1330, 0.0165, ..., 0.2408, -0.2702, -0.3305], [-0.3492, 0.0701, -0.1675, ..., 0.0697, -0.1288, -0.2478], [-0.0629, -0.1748, -0.0848, ..., -0.1616, -0.0719, -0.2591]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 1.3970e-09, 0.0000e+00, ..., 4.6566e-10, 1.8626e-09, 0.0000e+00], [ 0.0000e+00, -4.1910e-09, 0.0000e+00, ..., 0.0000e+00, -5.5879e-09, 0.0000e+00], ..., [ 0.0000e+00, 3.2596e-09, 0.0000e+00, ..., 0.0000e+00, 4.6566e-09, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 459, bias, value: tensor([-0.0156, -0.0363, -0.0093, -0.0110, -0.0354, -0.0005, 0.0308, -0.0146, 0.0525, -0.0119], device='cuda:0'), grad: tensor([ 9.3132e-10, 1.0896e-07, -3.3062e-08, 6.2399e-08, -3.8510e-07, -6.7055e-08, 5.1223e-09, 2.6543e-08, 3.2596e-09, 2.8545e-07], device='cuda:0') 100 1e-05 changing lr epoch 458, time 251.08, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4383 re_mapping 0.0025 re_causal 0.0085 /// teacc 99.06 lr 0.00001000 Epoch 460, weight, value: tensor([[-0.1394, -0.2916, -0.0900, ..., -0.0881, 0.1916, 0.2029], [-0.2699, -0.2396, -0.0695, ..., -0.2076, -0.2639, -0.1734], [-0.0677, -0.2190, 0.1669, ..., -0.2719, 0.2998, 0.1411], ..., [-0.2137, 0.1330, 0.0165, ..., 0.2408, -0.2703, -0.3305], [-0.3493, 0.0701, -0.1675, ..., 0.0697, -0.1289, -0.2478], [-0.0630, -0.1748, -0.0848, ..., -0.1617, -0.0720, -0.2592]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.8626e-09, -9.3132e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 8.8476e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -4.6566e-09, 4.6566e-10], ..., [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, -1.3504e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.9849e-09, 1.8626e-09]], device='cuda:0') Epoch 460, bias, value: tensor([-0.0156, -0.0362, -0.0092, -0.0109, -0.0354, -0.0006, 0.0308, -0.0147, 0.0525, -0.0120], device='cuda:0'), grad: tensor([ 4.6566e-10, 3.4738e-07, -1.3504e-08, 2.1886e-08, 1.0710e-08, 4.4238e-08, 1.1967e-07, 2.3283e-09, -5.5321e-07, 2.7940e-08], device='cuda:0') 100 1e-05 changing lr epoch 459, time 250.52, cls_loss 0.0006 cls_loss_mapping 0.0003 cls_loss_causal 0.4059 re_mapping 0.0024 re_causal 0.0082 /// teacc 99.11 lr 0.00001000 Epoch 461, weight, value: tensor([[-0.1394, -0.2916, -0.0900, ..., -0.0880, 0.1918, 0.2031], [-0.2699, -0.2396, -0.0695, ..., -0.2076, -0.2640, -0.1734], [-0.0677, -0.2190, 0.1669, ..., -0.2719, 0.2998, 0.1410], ..., [-0.2137, 0.1330, 0.0165, ..., 0.2408, -0.2704, -0.3306], [-0.3494, 0.0700, -0.1675, ..., 0.0697, -0.1289, -0.2478], [-0.0630, -0.1748, -0.0848, ..., -0.1617, -0.0720, -0.2592]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -2.7940e-09, 0.0000e+00, ..., -4.1910e-09, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 4.6566e-10, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 3.2596e-09, 4.6566e-10, 0.0000e+00]], device='cuda:0') Epoch 461, bias, value: tensor([-0.0154, -0.0362, -0.0092, -0.0110, -0.0353, -0.0006, 0.0307, -0.0147, 0.0525, -0.0120], device='cuda:0'), grad: tensor([ 0.0000e+00, 2.3283e-09, 4.6566e-10, 6.9849e-09, 9.3132e-10, -1.1176e-08, 3.7253e-09, 1.3970e-09, 3.2596e-09, -4.6566e-10], device='cuda:0') 100 1e-05 changing lr epoch 460, time 251.02, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4459 re_mapping 0.0025 re_causal 0.0086 /// teacc 99.11 lr 0.00001000 Epoch 462, weight, value: tensor([[-0.1395, -0.2917, -0.0900, ..., -0.0880, 0.1918, 0.2031], [-0.2700, -0.2397, -0.0695, ..., -0.2077, -0.2640, -0.1734], [-0.0677, -0.2190, 0.1669, ..., -0.2720, 0.2999, 0.1411], ..., [-0.2138, 0.1331, 0.0165, ..., 0.2409, -0.2705, -0.3307], [-0.3494, 0.0700, -0.1675, ..., 0.0697, -0.1290, -0.2479], [-0.0630, -0.1749, -0.0848, ..., -0.1617, -0.0720, -0.2592]], device='cuda:0'), grad: tensor([[ 1.3970e-09, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 6.5193e-09, 5.1223e-09], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -9.3132e-10, 0.0000e+00, ..., 0.0000e+00, -9.3132e-09, -4.1910e-09], ..., [ 0.0000e+00, -3.7253e-09, 0.0000e+00, ..., -4.1910e-09, 1.3970e-09, 1.3970e-09], [ 8.8476e-09, 9.3132e-10, 0.0000e+00, ..., 0.0000e+00, 2.5146e-08, 1.5367e-08], [ 0.0000e+00, 2.7940e-09, 0.0000e+00, ..., 2.7940e-09, 5.1223e-09, 4.6566e-10]], device='cuda:0') Epoch 462, bias, value: tensor([-0.0154, -0.0363, -0.0091, -0.0109, -0.0353, -0.0006, 0.0307, -0.0146, 0.0525, -0.0121], device='cuda:0'), grad: tensor([ 2.4214e-08, 1.8626e-09, -3.5390e-08, 9.3132e-10, -3.2596e-09, 2.7940e-09, -1.0803e-07, -4.1910e-09, 9.7789e-08, 2.7008e-08], device='cuda:0') 100 1e-05 changing lr epoch 461, time 250.80, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4110 re_mapping 0.0025 re_causal 0.0083 /// teacc 99.09 lr 0.00001000 Epoch 463, weight, value: tensor([[-0.1395, -0.2917, -0.0900, ..., -0.0878, 0.1920, 0.2034], [-0.2700, -0.2397, -0.0695, ..., -0.2077, -0.2640, -0.1734], [-0.0678, -0.2190, 0.1670, ..., -0.2720, 0.3000, 0.1412], ..., [-0.2138, 0.1332, 0.0165, ..., 0.2410, -0.2706, -0.3308], [-0.3495, 0.0700, -0.1675, ..., 0.0697, -0.1290, -0.2479], [-0.0631, -0.1750, -0.0848, ..., -0.1619, -0.0721, -0.2593]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.6298e-08, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, -0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.5832e-08, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 4.6566e-10, 4.6566e-10]], device='cuda:0') Epoch 463, bias, value: tensor([-0.0152, -0.0363, -0.0090, -0.0109, -0.0349, -0.0006, 0.0306, -0.0146, 0.0525, -0.0123], device='cuda:0'), grad: tensor([ 4.6566e-10, 9.3132e-10, -1.0803e-07, 5.1223e-09, 1.5367e-08, -1.8626e-09, 1.8626e-09, 3.7253e-09, 1.1455e-07, -2.7474e-08], device='cuda:0') 100 1e-05 changing lr epoch 462, time 250.68, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4331 re_mapping 0.0025 re_causal 0.0084 /// teacc 99.09 lr 0.00001000 Epoch 464, weight, value: tensor([[-0.1396, -0.2917, -0.0900, ..., -0.0877, 0.1921, 0.2034], [-0.2700, -0.2397, -0.0695, ..., -0.2077, -0.2640, -0.1733], [-0.0678, -0.2190, 0.1670, ..., -0.2721, 0.3001, 0.1411], ..., [-0.2139, 0.1333, 0.0165, ..., 0.2410, -0.2706, -0.3308], [-0.3495, 0.0700, -0.1675, ..., 0.0697, -0.1291, -0.2479], [-0.0631, -0.1752, -0.0848, ..., -0.1621, -0.0721, -0.2593]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 9.3132e-10, 4.6566e-10], ..., [ 0.0000e+00, -1.2107e-08, 0.0000e+00, ..., -1.2573e-08, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 4.6566e-10, 0.0000e+00, ..., -9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 8.8476e-09, 0.0000e+00, ..., 9.3132e-09, 4.6566e-10, 4.6566e-10]], device='cuda:0') Epoch 464, bias, value: tensor([-0.0152, -0.0363, -0.0091, -0.0109, -0.0348, -0.0006, 0.0306, -0.0146, 0.0525, -0.0125], device='cuda:0'), grad: tensor([ 4.6566e-10, 4.6566e-09, 6.9849e-09, 2.3283e-09, 1.8626e-09, 2.3283e-09, -1.4435e-08, -3.3528e-08, -5.1223e-09, 4.0047e-08], device='cuda:0') 100 1e-05 changing lr epoch 463, time 250.68, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4468 re_mapping 0.0025 re_causal 0.0086 /// teacc 99.08 lr 0.00001000 Epoch 465, weight, value: tensor([[-0.1396, -0.2917, -0.0900, ..., -0.0877, 0.1921, 0.2034], [-0.2700, -0.2398, -0.0695, ..., -0.2078, -0.2641, -0.1732], [-0.0678, -0.2190, 0.1670, ..., -0.2721, 0.3003, 0.1411], ..., [-0.2139, 0.1333, 0.0165, ..., 0.2411, -0.2708, -0.3309], [-0.3496, 0.0699, -0.1675, ..., 0.0696, -0.1291, -0.2480], [-0.0631, -0.1752, -0.0848, ..., -0.1622, -0.0719, -0.2594]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., -4.6566e-10, -1.3970e-09, -9.3132e-10], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 2.3283e-09, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -2.8405e-08, 0.0000e+00, ..., -4.1910e-08, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 2.1886e-08, 0.0000e+00, ..., 2.5611e-08, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-09, 0.0000e+00, ..., 1.1176e-08, 4.6566e-10, 4.6566e-10]], device='cuda:0') Epoch 465, bias, value: tensor([-0.0153, -0.0363, -0.0090, -0.0109, -0.0347, -0.0005, 0.0305, -0.0146, 0.0523, -0.0125], device='cuda:0'), grad: tensor([-2.7940e-09, 6.5193e-09, 0.0000e+00, 1.3970e-09, 9.3132e-10, 2.3283e-09, 9.3132e-10, -8.6613e-08, 5.7742e-08, 1.9558e-08], device='cuda:0') 100 1e-05 changing lr epoch 464, time 250.29, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4373 re_mapping 0.0025 re_causal 0.0085 /// teacc 99.11 lr 0.00001000 Epoch 466, weight, value: tensor([[-0.1397, -0.2917, -0.0900, ..., -0.0876, 0.1921, 0.2035], [-0.2701, -0.2398, -0.0695, ..., -0.2078, -0.2641, -0.1732], [-0.0678, -0.2190, 0.1670, ..., -0.2721, 0.3003, 0.1412], ..., [-0.2140, 0.1333, 0.0165, ..., 0.2411, -0.2709, -0.3310], [-0.3498, 0.0699, -0.1675, ..., 0.0695, -0.1292, -0.2480], [-0.0632, -0.1753, -0.0848, ..., -0.1623, -0.0720, -0.2596]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -2.7940e-09, -5.5879e-09, -6.0536e-09], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, -2.4680e-08, 0.0000e+00], ..., [ 0.0000e+00, -3.7253e-09, 0.0000e+00, ..., -3.2596e-09, 6.5193e-09, 4.1910e-09], [ 1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 1.3970e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 3.2596e-09, 1.3970e-09, 1.3970e-09]], device='cuda:0') Epoch 466, bias, value: tensor([-0.0153, -0.0363, -0.0090, -0.0108, -0.0345, -0.0006, 0.0306, -0.0146, 0.0523, -0.0126], device='cuda:0'), grad: tensor([-1.2107e-08, 4.6566e-09, -3.5390e-08, 3.1665e-08, 3.2596e-09, -6.5193e-09, 4.6566e-09, 6.0536e-09, 7.9162e-09, -9.3132e-10], device='cuda:0') 100 1e-05 changing lr epoch 465, time 250.99, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4509 re_mapping 0.0025 re_causal 0.0085 /// teacc 99.11 lr 0.00001000 Epoch 467, weight, value: tensor([[-0.1397, -0.2918, -0.0900, ..., -0.0876, 0.1922, 0.2036], [-0.2701, -0.2399, -0.0695, ..., -0.2077, -0.2641, -0.1733], [-0.0678, -0.2191, 0.1670, ..., -0.2722, 0.3004, 0.1413], ..., [-0.2141, 0.1334, 0.0165, ..., 0.2410, -0.2709, -0.3312], [-0.3499, 0.0698, -0.1675, ..., 0.0695, -0.1293, -0.2481], [-0.0632, -0.1753, -0.0848, ..., -0.1624, -0.0719, -0.2596]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 2.3283e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, -3.7253e-09, -4.6566e-10], ..., [ 0.0000e+00, -3.7253e-08, -9.3132e-10, ..., -4.7963e-08, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 9.3132e-10, 3.7253e-09, 4.6566e-10], [ 0.0000e+00, 3.7253e-08, 9.3132e-10, ..., 6.8452e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 467, bias, value: tensor([-0.0153, -0.0362, -0.0090, -0.0104, -0.0344, -0.0007, 0.0305, -0.0146, 0.0522, -0.0127], device='cuda:0'), grad: tensor([ 1.8626e-09, 1.3039e-08, -2.1420e-08, 9.3132e-10, -1.4482e-07, 9.3132e-10, 4.1910e-09, -8.3819e-08, 2.5146e-08, 2.0955e-07], device='cuda:0') 100 1e-05 changing lr epoch 466, time 250.31, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4187 re_mapping 0.0024 re_causal 0.0081 /// teacc 99.13 lr 0.00001000 Epoch 468, weight, value: tensor([[-0.1397, -0.2918, -0.0900, ..., -0.0875, 0.1922, 0.2036], [-0.2702, -0.2400, -0.0695, ..., -0.2077, -0.2642, -0.1733], [-0.0678, -0.2191, 0.1670, ..., -0.2722, 0.3005, 0.1414], ..., [-0.2142, 0.1335, 0.0165, ..., 0.2411, -0.2710, -0.3312], [-0.3500, 0.0698, -0.1675, ..., 0.0695, -0.1294, -0.2481], [-0.0632, -0.1754, -0.0848, ..., -0.1626, -0.0720, -0.2597]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -1.3970e-09, 0.0000e+00, ..., -1.3970e-09, 0.0000e+00, -0.0000e+00], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., -4.6566e-10, 1.3970e-09, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 468, bias, value: tensor([-0.0152, -0.0363, -0.0089, -0.0104, -0.0342, -0.0007, 0.0305, -0.0146, 0.0522, -0.0129], device='cuda:0'), grad: tensor([ 4.6566e-10, 6.0536e-09, 4.6566e-10, 1.8626e-09, -5.5414e-08, 1.3970e-09, -5.5879e-09, -1.3970e-09, 5.1223e-09, 4.6566e-08], device='cuda:0') 100 1e-05 changing lr epoch 467, time 250.15, cls_loss 0.0009 cls_loss_mapping 0.0004 cls_loss_causal 0.4478 re_mapping 0.0025 re_causal 0.0085 /// teacc 99.10 lr 0.00001000 Epoch 469, weight, value: tensor([[-0.1398, -0.2918, -0.0900, ..., -0.0875, 0.1922, 0.2036], [-0.2702, -0.2400, -0.0695, ..., -0.2078, -0.2642, -0.1733], [-0.0678, -0.2190, 0.1670, ..., -0.2722, 0.3006, 0.1414], ..., [-0.2144, 0.1335, 0.0165, ..., 0.2411, -0.2711, -0.3313], [-0.3500, 0.0697, -0.1675, ..., 0.0695, -0.1294, -0.2482], [-0.0632, -0.1755, -0.0848, ..., -0.1627, -0.0718, -0.2598]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 469, bias, value: tensor([-0.0154, -0.0363, -0.0089, -0.0103, -0.0342, -0.0007, 0.0306, -0.0146, 0.0521, -0.0128], device='cuda:0'), grad: tensor([ 4.6566e-10, 9.3132e-10, 4.6566e-10, 9.3132e-10, 3.2596e-09, 9.3132e-10, 0.0000e+00, 2.4680e-08, -2.7940e-09, -2.5611e-08], device='cuda:0') 100 1e-05 changing lr epoch 468, time 250.51, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4293 re_mapping 0.0024 re_causal 0.0084 /// teacc 99.12 lr 0.00001000 Epoch 470, weight, value: tensor([[-0.1399, -0.2918, -0.0900, ..., -0.0875, 0.1922, 0.2036], [-0.2702, -0.2400, -0.0695, ..., -0.2078, -0.2643, -0.1733], [-0.0679, -0.2190, 0.1670, ..., -0.2722, 0.3007, 0.1414], ..., [-0.2144, 0.1335, 0.0165, ..., 0.2412, -0.2712, -0.3314], [-0.3501, 0.0697, -0.1675, ..., 0.0694, -0.1295, -0.2482], [-0.0633, -0.1755, -0.0848, ..., -0.1627, -0.0719, -0.2599]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 3.1013e-07, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, -2.7940e-08, 0.0000e+00, ..., 0.0000e+00, -2.8405e-08, 0.0000e+00], ..., [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., -3.2829e-07, 9.3132e-10, 0.0000e+00], [ 1.3970e-09, 9.3132e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 2.7940e-09, 1.8626e-09, 0.0000e+00, ..., 1.7229e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 470, bias, value: tensor([-0.0154, -0.0364, -0.0088, -0.0103, -0.0342, -0.0007, 0.0306, -0.0145, 0.0521, -0.0128], device='cuda:0'), grad: tensor([ 9.3132e-10, 8.9081e-07, -2.4401e-07, 2.5891e-07, -8.3819e-09, -3.2596e-08, 9.3132e-10, -9.3039e-07, 6.9849e-09, 6.0070e-08], device='cuda:0') 100 1e-05 changing lr epoch 469, time 250.28, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4113 re_mapping 0.0025 re_causal 0.0083 /// teacc 99.11 lr 0.00001000 Epoch 471, weight, value: tensor([[-0.1399, -0.2919, -0.0900, ..., -0.0873, 0.1924, 0.2039], [-0.2703, -0.2400, -0.0695, ..., -0.2078, -0.2644, -0.1734], [-0.0679, -0.2190, 0.1670, ..., -0.2723, 0.3008, 0.1416], ..., [-0.2145, 0.1335, 0.0165, ..., 0.2413, -0.2713, -0.3315], [-0.3502, 0.0696, -0.1675, ..., 0.0694, -0.1296, -0.2483], [-0.0634, -0.1755, -0.0848, ..., -0.1627, -0.0719, -0.2601]], device='cuda:0'), grad: tensor([[ 1.1409e-08, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 2.8638e-08, 1.0477e-08], [ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 1.1642e-09, 2.3283e-10, 2.3283e-10], [ 2.3283e-09, 0.0000e+00, 0.0000e+00, ..., -2.3283e-09, -1.3970e-09, -9.3132e-10], ..., [ 9.3132e-10, 0.0000e+00, 0.0000e+00, ..., 1.6298e-09, 6.9849e-10, 4.6566e-10], [ 2.7940e-09, 0.0000e+00, 0.0000e+00, ..., -2.3283e-10, 6.2864e-09, 2.3283e-09], [ 1.6298e-08, 2.3283e-10, 0.0000e+00, ..., 7.9162e-09, 2.3283e-10, 2.3283e-10]], device='cuda:0') Epoch 471, bias, value: tensor([-0.0152, -0.0364, -0.0088, -0.0102, -0.0342, -0.0006, 0.0304, -0.0145, 0.0520, -0.0128], device='cuda:0'), grad: tensor([ 5.7509e-08, 6.2864e-09, -9.0804e-09, 4.9826e-07, 2.2817e-08, -5.4250e-07, -7.4040e-08, 1.1874e-08, 2.4913e-08, 2.7940e-09], device='cuda:0') 100 1e-05 changing lr epoch 470, time 250.33, cls_loss 0.0008 cls_loss_mapping 0.0003 cls_loss_causal 0.3858 re_mapping 0.0025 re_causal 0.0081 /// teacc 99.10 lr 0.00001000 Epoch 472, weight, value: tensor([[-0.1400, -0.2919, -0.0900, ..., -0.0873, 0.1924, 0.2039], [-0.2703, -0.2402, -0.0694, ..., -0.2078, -0.2644, -0.1734], [-0.0680, -0.2191, 0.1671, ..., -0.2724, 0.3008, 0.1416], ..., [-0.2145, 0.1337, 0.0164, ..., 0.2413, -0.2713, -0.3316], [-0.3503, 0.0696, -0.1675, ..., 0.0694, -0.1297, -0.2483], [-0.0635, -0.1756, -0.0848, ..., -0.1628, -0.0720, -0.2602]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 3.4925e-09, 0.0000e+00, ..., 6.9849e-10, 2.5611e-09, 2.3283e-10], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 1.1642e-09, 0.0000e+00, ..., 4.6566e-10, 1.1642e-09, 2.3283e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [-1.3970e-09, -2.1886e-08, 0.0000e+00, ..., -4.8894e-09, -1.5134e-08, -9.3132e-10], [ 2.3283e-10, 2.5611e-09, 0.0000e+00, ..., 6.9849e-10, 2.3283e-09, 4.6566e-10]], device='cuda:0') Epoch 472, bias, value: tensor([-0.0152, -0.0364, -0.0088, -0.0103, -0.0342, -0.0006, 0.0304, -0.0145, 0.0520, -0.0128], device='cuda:0'), grad: tensor([ 2.2119e-08, 9.3132e-10, 8.1491e-09, 1.3504e-08, -6.7521e-09, -1.6298e-09, 7.7765e-08, 1.3970e-09, -1.3039e-07, 2.1653e-08], device='cuda:0') 100 1e-05 changing lr epoch 471, time 250.21, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4249 re_mapping 0.0026 re_causal 0.0084 /// teacc 99.11 lr 0.00001000 Epoch 473, weight, value: tensor([[-0.1400, -0.2919, -0.0900, ..., -0.0873, 0.1924, 0.2039], [-0.2704, -0.2403, -0.0694, ..., -0.2078, -0.2644, -0.1734], [-0.0681, -0.2192, 0.1671, ..., -0.2725, 0.3008, 0.1416], ..., [-0.2145, 0.1338, 0.0164, ..., 0.2413, -0.2714, -0.3316], [-0.3504, 0.0695, -0.1675, ..., 0.0694, -0.1297, -0.2483], [-0.0635, -0.1756, -0.0848, ..., -0.1629, -0.0721, -0.2605]], device='cuda:0'), grad: tensor([[ 1.3970e-09, -9.3132e-10, 0.0000e+00, ..., 1.6298e-09, -1.2573e-08, -1.2107e-08], [ 0.0000e+00, 4.6566e-10, -2.0955e-09, ..., 9.3132e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -1.6298e-09, 0.0000e+00, ..., -0.0000e+00, -8.3819e-09, -7.2177e-09], ..., [ 2.3283e-10, 6.9849e-10, 6.9849e-10, ..., 5.3551e-09, 1.3970e-09, 1.6298e-09], [ 5.1223e-09, 4.6566e-10, 0.0000e+00, ..., -9.0804e-09, 4.4238e-09, 4.6566e-09], [ 3.7253e-09, 6.9849e-10, -1.1642e-09, ..., 1.2573e-08, 1.4435e-08, 1.4435e-08]], device='cuda:0') Epoch 473, bias, value: tensor([-0.0152, -0.0364, -0.0089, -0.0103, -0.0341, -0.0006, 0.0304, -0.0145, 0.0520, -0.0130], device='cuda:0'), grad: tensor([-3.6554e-08, -1.7695e-08, -2.9569e-08, 1.5600e-08, 1.2340e-08, -4.4005e-08, 1.7695e-08, 3.6554e-08, 6.7521e-09, 4.0978e-08], device='cuda:0') 100 1e-05 changing lr epoch 472, time 250.58, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4545 re_mapping 0.0024 re_causal 0.0087 /// teacc 99.14 lr 0.00001000 Epoch 474, weight, value: tensor([[-0.1400, -0.2919, -0.0900, ..., -0.0872, 0.1925, 0.2040], [-0.2705, -0.2404, -0.0694, ..., -0.2078, -0.2645, -0.1735], [-0.0682, -0.2191, 0.1670, ..., -0.2726, 0.3009, 0.1416], ..., [-0.2146, 0.1339, 0.0164, ..., 0.2413, -0.2714, -0.3316], [-0.3505, 0.0695, -0.1675, ..., 0.0693, -0.1298, -0.2484], [-0.0635, -0.1756, -0.0848, ..., -0.1630, -0.0722, -0.2606]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 6.9849e-10, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.1223e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., -8.3819e-09, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 474, bias, value: tensor([-0.0152, -0.0364, -0.0088, -0.0103, -0.0340, -0.0005, 0.0303, -0.0145, 0.0519, -0.0130], device='cuda:0'), grad: tensor([ 6.9849e-10, 0.0000e+00, 1.7928e-08, 0.0000e+00, -1.0477e-08, 3.7253e-09, 3.4925e-09, 2.7940e-09, -2.9802e-08, 1.5832e-08], device='cuda:0') 100 1e-05 changing lr epoch 473, time 250.31, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4233 re_mapping 0.0025 re_causal 0.0084 /// teacc 99.13 lr 0.00001000 Epoch 475, weight, value: tensor([[-0.1401, -0.2919, -0.0900, ..., -0.0872, 0.1925, 0.2040], [-0.2705, -0.2405, -0.0694, ..., -0.2079, -0.2646, -0.1736], [-0.0682, -0.2192, 0.1671, ..., -0.2726, 0.3010, 0.1416], ..., [-0.2146, 0.1340, 0.0164, ..., 0.2414, -0.2715, -0.3316], [-0.3507, 0.0694, -0.1675, ..., 0.0693, -0.1299, -0.2485], [-0.0636, -0.1758, -0.0848, ..., -0.1631, -0.0723, -0.2608]], device='cuda:0'), grad: tensor([[ 1.2610e-06, 2.3283e-10, 0.0000e+00, ..., 1.4226e-07, 1.6764e-06, 1.4352e-06], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 2.3283e-10], [ 9.3132e-10, -2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 6.9849e-10, 9.3132e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 1.3970e-09, 4.6566e-10, 0.0000e+00, ..., 2.3283e-10, 1.8626e-09, 1.3970e-09], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.9849e-10, 6.9849e-10]], device='cuda:0') Epoch 475, bias, value: tensor([-0.0152, -0.0365, -0.0088, -0.0100, -0.0340, -0.0007, 0.0304, -0.0144, 0.0519, -0.0132], device='cuda:0'), grad: tensor([ 4.5486e-06, 2.7940e-09, -4.6566e-10, 2.0955e-09, 3.5390e-08, 9.4762e-08, -4.6715e-06, 3.2596e-09, 9.5461e-09, -3.8883e-08], device='cuda:0') 100 1e-05 changing lr epoch 474, time 250.34, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4213 re_mapping 0.0026 re_causal 0.0084 /// teacc 99.14 lr 0.00001000 Epoch 476, weight, value: tensor([[-0.1404, -0.2919, -0.0900, ..., -0.0870, 0.1927, 0.2042], [-0.2705, -0.2405, -0.0694, ..., -0.2079, -0.2646, -0.1736], [-0.0682, -0.2192, 0.1671, ..., -0.2727, 0.3010, 0.1417], ..., [-0.2148, 0.1340, 0.0164, ..., 0.2415, -0.2715, -0.3317], [-0.3508, 0.0694, -0.1675, ..., 0.0693, -0.1300, -0.2485], [-0.0637, -0.1758, -0.0848, ..., -0.1632, -0.0723, -0.2610]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -6.1002e-08, -3.6554e-08], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 1.1642e-09, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 3.4925e-09, 2.0955e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.6566e-10, 2.3283e-10, 2.3283e-10], [ 4.1910e-09, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.4680e-08, 1.4901e-08], [-5.5879e-09, 0.0000e+00, 0.0000e+00, ..., 1.8626e-09, 2.9104e-08, 1.7462e-08]], device='cuda:0') Epoch 476, bias, value: tensor([-0.0151, -0.0365, -0.0088, -0.0100, -0.0340, -0.0007, 0.0303, -0.0144, 0.0518, -0.0132], device='cuda:0'), grad: tensor([-1.2224e-07, 7.4506e-09, 1.3271e-08, 2.3283e-09, -3.1898e-08, 6.5193e-09, -5.1223e-09, 5.3551e-09, 7.1712e-08, 5.0524e-08], device='cuda:0') 100 1e-05 changing lr epoch 475, time 250.30, cls_loss 0.0006 cls_loss_mapping 0.0003 cls_loss_causal 0.3898 re_mapping 0.0025 re_causal 0.0081 /// teacc 99.13 lr 0.00001000 Epoch 477, weight, value: tensor([[-0.1404, -0.2920, -0.0900, ..., -0.0869, 0.1928, 0.2044], [-0.2705, -0.2405, -0.0694, ..., -0.2079, -0.2646, -0.1736], [-0.0682, -0.2192, 0.1671, ..., -0.2728, 0.3011, 0.1417], ..., [-0.2148, 0.1341, 0.0164, ..., 0.2415, -0.2716, -0.3317], [-0.3509, 0.0693, -0.1675, ..., 0.0692, -0.1300, -0.2486], [-0.0638, -0.1759, -0.0848, ..., -0.1632, -0.0725, -0.2612]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 3.4925e-09, 0.0000e+00, ..., 0.0000e+00, 9.3132e-09, 5.1223e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, -3.4925e-09, 0.0000e+00, ..., 0.0000e+00, -9.5461e-09, -5.1223e-09], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 477, bias, value: tensor([-0.0149, -0.0365, -0.0088, -0.0100, -0.0339, -0.0006, 0.0302, -0.0144, 0.0518, -0.0133], device='cuda:0'), grad: tensor([ 2.2585e-08, 2.3283e-10, -2.2352e-08, 1.1642e-09, 1.3970e-09, 0.0000e+00, 0.0000e+00, 1.6298e-09, 1.8626e-09, 9.3132e-10], device='cuda:0') 100 1e-05 changing lr epoch 476, time 250.12, cls_loss 0.0007 cls_loss_mapping 0.0003 cls_loss_causal 0.4434 re_mapping 0.0025 re_causal 0.0086 /// teacc 99.13 lr 0.00001000 Epoch 478, weight, value: tensor([[-0.1404, -0.2920, -0.0900, ..., -0.0869, 0.1928, 0.2044], [-0.2705, -0.2405, -0.0694, ..., -0.2079, -0.2647, -0.1736], [-0.0683, -0.2192, 0.1671, ..., -0.2729, 0.3011, 0.1417], ..., [-0.2148, 0.1341, 0.0163, ..., 0.2415, -0.2716, -0.3317], [-0.3510, 0.0692, -0.1675, ..., 0.0692, -0.1300, -0.2486], [-0.0638, -0.1759, -0.0848, ..., -0.1632, -0.0725, -0.2613]], device='cuda:0'), grad: tensor([[ 4.6566e-10, 2.3283e-10, 0.0000e+00, ..., 6.9849e-10, -2.5611e-09, -1.1642e-09], [ 2.3283e-10, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, 2.3283e-10, 2.3283e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 2.3283e-10, 2.3283e-10], ..., [ 4.4238e-09, 2.2352e-08, 0.0000e+00, ..., -9.0804e-09, 0.0000e+00, 0.0000e+00], [ 3.0268e-09, 1.2573e-08, 0.0000e+00, ..., 7.4506e-09, -1.1642e-09, -6.9849e-10], [ 2.3283e-09, 9.5461e-09, 0.0000e+00, ..., 2.4913e-08, 6.9849e-10, 4.6566e-10]], device='cuda:0') Epoch 478, bias, value: tensor([-0.0149, -0.0364, -0.0087, -0.0100, -0.0339, -0.0006, 0.0302, -0.0145, 0.0517, -0.0133], device='cuda:0'), grad: tensor([-1.6298e-09, 3.0268e-09, 1.1642e-09, -2.7008e-08, 2.3283e-10, -8.6846e-08, 2.0256e-08, 1.1874e-08, 2.0256e-08, 6.2166e-08], device='cuda:0') 100 1e-05 changing lr epoch 477, time 251.66, cls_loss 0.0007 cls_loss_mapping 0.0003 cls_loss_causal 0.4234 re_mapping 0.0025 re_causal 0.0083 /// teacc 99.14 lr 0.00001000 Epoch 479, weight, value: tensor([[-0.1404, -0.2920, -0.0900, ..., -0.0868, 0.1929, 0.2044], [-0.2705, -0.2406, -0.0693, ..., -0.2079, -0.2647, -0.1736], [-0.0683, -0.2193, 0.1671, ..., -0.2730, 0.3012, 0.1418], ..., [-0.2148, 0.1341, 0.0163, ..., 0.2415, -0.2717, -0.3318], [-0.3511, 0.0692, -0.1676, ..., 0.0692, -0.1301, -0.2487], [-0.0638, -0.1759, -0.0848, ..., -0.1633, -0.0725, -0.2614]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., -2.3283e-10, -1.3970e-09, -1.3970e-09], [ 0.0000e+00, 7.9162e-09, 0.0000e+00, ..., 7.9162e-09, 2.3283e-10, 2.3283e-10], [ 0.0000e+00, 6.9849e-10, 0.0000e+00, ..., 4.6566e-10, 4.6566e-10, 2.3283e-10], ..., [ 0.0000e+00, -2.1420e-08, 0.0000e+00, ..., -1.8161e-08, 2.3283e-10, 2.3283e-10], [ 0.0000e+00, 1.3970e-09, 0.0000e+00, ..., 4.6566e-10, -3.2596e-09, -1.3970e-09], [ 2.3283e-10, 1.0943e-08, 0.0000e+00, ..., 9.0804e-09, 9.3132e-10, 6.9849e-10]], device='cuda:0') Epoch 479, bias, value: tensor([-0.0149, -0.0364, -0.0088, -0.0100, -0.0339, -0.0007, 0.0302, -0.0145, 0.0517, -0.0133], device='cuda:0'), grad: tensor([-1.6298e-09, 5.8440e-08, 7.4506e-09, 3.9581e-09, -4.2841e-08, 4.4238e-09, 3.1665e-08, -6.8219e-08, -3.1898e-08, 4.6100e-08], device='cuda:0') 100 1e-05 changing lr epoch 478, time 251.93, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4380 re_mapping 0.0025 re_causal 0.0085 /// teacc 99.14 lr 0.00001000 Epoch 480, weight, value: tensor([[-0.1404, -0.2920, -0.0900, ..., -0.0868, 0.1929, 0.2045], [-0.2706, -0.2407, -0.0693, ..., -0.2079, -0.2647, -0.1736], [-0.0683, -0.2193, 0.1672, ..., -0.2730, 0.3012, 0.1418], ..., [-0.2149, 0.1342, 0.0163, ..., 0.2416, -0.2717, -0.3318], [-0.3513, 0.0692, -0.1676, ..., 0.0692, -0.1301, -0.2487], [-0.0639, -0.1761, -0.0848, ..., -0.1635, -0.0726, -0.2617]], device='cuda:0'), grad: tensor([[ 2.3283e-10, -2.0955e-09, 0.0000e+00, ..., 2.3283e-10, -1.6764e-08, -1.7229e-08], [ 0.0000e+00, 6.9849e-10, 0.0000e+00, ..., 6.9849e-10, 2.3283e-10, 2.3283e-10], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 2.3283e-10, -3.2596e-09, -4.6566e-10], ..., [ 0.0000e+00, -2.0955e-09, 0.0000e+00, ..., -3.0268e-09, 2.3283e-10, 0.0000e+00], [ 1.2340e-08, 1.6298e-09, 0.0000e+00, ..., 1.4435e-08, 4.6566e-10, 0.0000e+00], [ 1.8626e-09, 3.2596e-09, 0.0000e+00, ..., 4.4238e-09, 1.4435e-08, 1.4435e-08]], device='cuda:0') Epoch 480, bias, value: tensor([-0.0149, -0.0364, -0.0088, -0.0100, -0.0339, -0.0006, 0.0301, -0.0144, 0.0517, -0.0134], device='cuda:0'), grad: tensor([-5.5414e-08, 3.0268e-09, -9.0804e-09, 2.6077e-08, 6.0536e-09, -6.8685e-08, 5.3551e-09, -5.5879e-09, 4.6566e-08, 4.5868e-08], device='cuda:0') 100 1e-05 changing lr epoch 479, time 251.87, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4438 re_mapping 0.0024 re_causal 0.0085 /// teacc 99.12 lr 0.00001000 Epoch 481, weight, value: tensor([[-0.1405, -0.2920, -0.0900, ..., -0.0868, 0.1929, 0.2045], [-0.2706, -0.2407, -0.0693, ..., -0.2079, -0.2647, -0.1736], [-0.0684, -0.2194, 0.1671, ..., -0.2732, 0.3012, 0.1417], ..., [-0.2149, 0.1343, 0.0163, ..., 0.2416, -0.2718, -0.3319], [-0.3514, 0.0691, -0.1676, ..., 0.0692, -0.1302, -0.2488], [-0.0639, -0.1761, -0.0848, ..., -0.1635, -0.0727, -0.2618]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 9.3132e-10, 6.9849e-10], [ 2.3283e-10, 1.1642e-09, 0.0000e+00, ..., 6.9849e-10, -1.6298e-09, 0.0000e+00], [ 6.9849e-10, 1.3970e-09, 0.0000e+00, ..., -1.6298e-09, -3.9581e-09, -3.4925e-09], ..., [ 2.3283e-10, -1.3039e-08, 0.0000e+00, ..., -1.6065e-08, 2.5611e-09, 1.8626e-09], [ 1.3970e-09, 2.3283e-10, 0.0000e+00, ..., 1.1642e-09, 4.6566e-10, 2.3283e-10], [ 2.3283e-10, 1.3970e-08, -0.0000e+00, ..., 1.7462e-08, 6.9849e-10, 4.6566e-10]], device='cuda:0') Epoch 481, bias, value: tensor([-0.0149, -0.0364, -0.0089, -0.0100, -0.0339, -0.0006, 0.0302, -0.0144, 0.0518, -0.0134], device='cuda:0'), grad: tensor([ 9.7789e-09, -7.8464e-08, -4.8894e-09, -1.3970e-09, 1.2573e-08, -8.1491e-09, 1.6764e-08, -4.1910e-09, 2.0955e-08, 4.4005e-08], device='cuda:0') 100 1e-05 changing lr epoch 480, time 251.82, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4136 re_mapping 0.0024 re_causal 0.0082 /// teacc 99.13 lr 0.00001000 Epoch 482, weight, value: tensor([[-0.1405, -0.2920, -0.0900, ..., -0.0868, 0.1930, 0.2045], [-0.2707, -0.2407, -0.0693, ..., -0.2079, -0.2648, -0.1736], [-0.0685, -0.2194, 0.1672, ..., -0.2733, 0.3012, 0.1417], ..., [-0.2149, 0.1343, 0.0163, ..., 0.2417, -0.2718, -0.3319], [-0.3514, 0.0691, -0.1676, ..., 0.0692, -0.1302, -0.2488], [-0.0639, -0.1762, -0.0848, ..., -0.1636, -0.0727, -0.2618]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 0.0000e+00]], device='cuda:0') Epoch 482, bias, value: tensor([-0.0149, -0.0364, -0.0089, -0.0100, -0.0339, -0.0006, 0.0302, -0.0145, 0.0518, -0.0134], device='cuda:0'), grad: tensor([0.0000e+00, 2.3283e-10, 0.0000e+00, 0.0000e+00, 0.0000e+00, 6.9849e-10, 1.6298e-09, 0.0000e+00, 4.6566e-10, 2.3283e-10], device='cuda:0') 100 1e-05 changing lr epoch 481, time 251.88, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4248 re_mapping 0.0024 re_causal 0.0082 /// teacc 99.11 lr 0.00001000 Epoch 483, weight, value: tensor([[-0.1407, -0.2921, -0.0900, ..., -0.0868, 0.1929, 0.2045], [-0.2707, -0.2407, -0.0693, ..., -0.2079, -0.2648, -0.1736], [-0.0685, -0.2195, 0.1672, ..., -0.2734, 0.3012, 0.1417], ..., [-0.2149, 0.1344, 0.0163, ..., 0.2417, -0.2719, -0.3320], [-0.3515, 0.0690, -0.1676, ..., 0.0693, -0.1303, -0.2489], [-0.0639, -0.1763, -0.0848, ..., -0.1637, -0.0728, -0.2620]], device='cuda:0'), grad: tensor([[-2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -2.0955e-09, -2.0955e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.5611e-09, 1.6298e-09], [ 0.0000e+00, -1.6298e-09, 0.0000e+00, ..., 0.0000e+00, -1.5832e-08, -6.9849e-09], ..., [ 0.0000e+00, 1.1642e-09, 0.0000e+00, ..., 0.0000e+00, 1.2107e-08, 5.3551e-09], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 4.6566e-10, 2.3283e-10]], device='cuda:0') Epoch 483, bias, value: tensor([-0.0149, -0.0363, -0.0090, -0.0100, -0.0340, -0.0007, 0.0303, -0.0145, 0.0518, -0.0134], device='cuda:0'), grad: tensor([-2.0955e-09, 1.2573e-08, -7.6136e-08, 2.5611e-09, 1.3970e-09, 1.1642e-09, 3.4925e-09, 5.8440e-08, 1.3970e-09, 1.6298e-09], device='cuda:0') 100 1e-05 changing lr epoch 482, time 251.80, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4271 re_mapping 0.0024 re_causal 0.0084 /// teacc 99.13 lr 0.00001000 Epoch 484, weight, value: tensor([[-0.1407, -0.2921, -0.0900, ..., -0.0869, 0.1930, 0.2046], [-0.2708, -0.2408, -0.0693, ..., -0.2079, -0.2648, -0.1736], [-0.0686, -0.2195, 0.1672, ..., -0.2734, 0.3012, 0.1417], ..., [-0.2150, 0.1344, 0.0163, ..., 0.2417, -0.2719, -0.3320], [-0.3517, 0.0690, -0.1676, ..., 0.0692, -0.1304, -0.2489], [-0.0640, -0.1763, -0.0848, ..., -0.1638, -0.0729, -0.2621]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.6997e-08, -1.5367e-08], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 1.1642e-09, 2.3283e-10, 2.3283e-10], [ 0.0000e+00, -0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -1.8626e-09, -0.0000e+00], ..., [ 0.0000e+00, -9.5461e-09, 0.0000e+00, ..., -1.2573e-08, 9.3132e-10, 4.6566e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.9849e-10, 4.6566e-10], [ 0.0000e+00, 9.0804e-09, 0.0000e+00, ..., 1.1642e-08, 4.8894e-09, 3.9581e-09]], device='cuda:0') Epoch 484, bias, value: tensor([-0.0149, -0.0364, -0.0090, -0.0098, -0.0340, -0.0006, 0.0302, -0.0145, 0.0517, -0.0134], device='cuda:0'), grad: tensor([-6.1234e-08, 4.6566e-09, -5.3551e-09, 5.3551e-09, 2.5611e-09, 3.1665e-08, 1.1642e-08, -2.0256e-08, 6.0536e-09, 3.7486e-08], device='cuda:0') 100 1e-05 changing lr epoch 483, time 252.38, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4296 re_mapping 0.0024 re_causal 0.0084 /// teacc 99.13 lr 0.00001000 Epoch 485, weight, value: tensor([[-0.1408, -0.2922, -0.0900, ..., -0.0869, 0.1930, 0.2046], [-0.2709, -0.2413, -0.0693, ..., -0.2083, -0.2650, -0.1736], [-0.0687, -0.2196, 0.1672, ..., -0.2735, 0.3013, 0.1417], ..., [-0.2150, 0.1348, 0.0163, ..., 0.2422, -0.2720, -0.3321], [-0.3518, 0.0690, -0.1676, ..., 0.0692, -0.1304, -0.2489], [-0.0640, -0.1764, -0.0848, ..., -0.1638, -0.0730, -0.2623]], device='cuda:0'), grad: tensor([[ 1.1642e-09, 2.3283e-10, 0.0000e+00, ..., 2.3283e-10, 3.2596e-09, 6.9849e-10], [ 0.0000e+00, 3.5856e-08, 0.0000e+00, ..., 4.2608e-08, 4.6566e-10, 0.0000e+00], [ 0.0000e+00, 4.6566e-09, 0.0000e+00, ..., 5.5879e-09, 0.0000e+00, 0.0000e+00], ..., [ 0.0000e+00, -4.2608e-08, 0.0000e+00, ..., -5.0291e-08, 2.3283e-10, 0.0000e+00], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., -9.3132e-10, 4.6566e-10, 2.3283e-10], [-1.3970e-09, 4.6566e-10, 0.0000e+00, ..., 6.9849e-10, -3.4925e-09, -6.9849e-10]], device='cuda:0') Epoch 485, bias, value: tensor([-0.0149, -0.0368, -0.0092, -0.0096, -0.0341, -0.0007, 0.0302, -0.0140, 0.0517, -0.0135], device='cuda:0'), grad: tensor([ 2.7707e-08, 1.4668e-07, 2.1420e-08, 4.1910e-09, 4.1910e-09, 2.5611e-09, 2.3283e-09, -1.7229e-07, -4.4238e-09, -2.7241e-08], device='cuda:0') 100 1e-05 changing lr epoch 484, time 252.02, cls_loss 0.0007 cls_loss_mapping 0.0003 cls_loss_causal 0.4207 re_mapping 0.0024 re_causal 0.0082 /// teacc 99.13 lr 0.00001000 Epoch 486, weight, value: tensor([[-0.1408, -0.2922, -0.0900, ..., -0.0869, 0.1931, 0.2047], [-0.2709, -0.2414, -0.0693, ..., -0.2085, -0.2650, -0.1737], [-0.0687, -0.2195, 0.1673, ..., -0.2736, 0.3016, 0.1418], ..., [-0.2151, 0.1350, 0.0163, ..., 0.2423, -0.2722, -0.3322], [-0.3519, 0.0689, -0.1676, ..., 0.0692, -0.1305, -0.2490], [-0.0640, -0.1765, -0.0848, ..., -0.1639, -0.0731, -0.2623]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, -1.4761e-07, -1.1991e-07], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 0.0000e+00], ..., [ 2.3283e-09, -1.3271e-08, 0.0000e+00, ..., -1.5832e-08, 0.0000e+00, 0.0000e+00], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 2.3283e-10, 0.0000e+00], [-3.2596e-09, 1.3039e-08, 0.0000e+00, ..., 1.4901e-08, 6.9849e-10, 4.6566e-10]], device='cuda:0') Epoch 486, bias, value: tensor([-0.0149, -0.0369, -0.0090, -0.0094, -0.0340, -0.0008, 0.0302, -0.0139, 0.0517, -0.0135], device='cuda:0'), grad: tensor([-2.4331e-07, 4.6566e-09, 4.6566e-10, 1.1642e-09, -7.0315e-08, 2.5611e-09, 2.5076e-07, -2.0955e-08, 2.0955e-09, 8.7544e-08], device='cuda:0') 100 1e-05 changing lr epoch 485, time 252.11, cls_loss 0.0008 cls_loss_mapping 0.0003 cls_loss_causal 0.4364 re_mapping 0.0025 re_causal 0.0084 /// teacc 99.14 lr 0.00001000 Epoch 487, weight, value: tensor([[-0.1410, -0.2922, -0.0900, ..., -0.0869, 0.1931, 0.2046], [-0.2710, -0.2415, -0.0692, ..., -0.2085, -0.2651, -0.1737], [-0.0687, -0.2196, 0.1673, ..., -0.2737, 0.3017, 0.1419], ..., [-0.2151, 0.1351, 0.0162, ..., 0.2424, -0.2723, -0.3323], [-0.3520, 0.0689, -0.1676, ..., 0.0691, -0.1306, -0.2490], [-0.0641, -0.1766, -0.0848, ..., -0.1640, -0.0732, -0.2625]], device='cuda:0'), grad: tensor([[6.9849e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.8894e-09, 1.3970e-09], [2.3283e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 1.3970e-09, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.9849e-10, 0.0000e+00], ..., [2.3283e-10, 2.3283e-10, 0.0000e+00, ..., 0.0000e+00, 1.1642e-09, 0.0000e+00], [2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 0.0000e+00, 0.0000e+00], [2.3283e-10, 4.6566e-10, 0.0000e+00, ..., 0.0000e+00, 1.3970e-09, 0.0000e+00]], device='cuda:0') Epoch 487, bias, value: tensor([-0.0149, -0.0370, -0.0089, -0.0094, -0.0341, -0.0008, 0.0303, -0.0139, 0.0517, -0.0135], device='cuda:0'), grad: tensor([ 1.9092e-08, 3.4925e-09, 7.2177e-09, -1.1642e-09, -5.4017e-08, 2.7940e-09, 1.3970e-08, 5.8208e-09, 9.3132e-10, 6.7521e-09], device='cuda:0') 100 1e-05 changing lr epoch 486, time 252.29, cls_loss 0.0008 cls_loss_mapping 0.0003 cls_loss_causal 0.4286 re_mapping 0.0025 re_causal 0.0081 /// teacc 99.13 lr 0.00001000 Epoch 488, weight, value: tensor([[-0.1410, -0.2922, -0.0900, ..., -0.0869, 0.1931, 0.2047], [-0.2711, -0.2415, -0.0691, ..., -0.2085, -0.2651, -0.1737], [-0.0687, -0.2197, 0.1674, ..., -0.2737, 0.3019, 0.1420], ..., [-0.2152, 0.1351, 0.0161, ..., 0.2424, -0.2723, -0.3324], [-0.3522, 0.0688, -0.1676, ..., 0.0691, -0.1307, -0.2490], [-0.0641, -0.1767, -0.0848, ..., -0.1642, -0.0732, -0.2626]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -2.3283e-10, 0.0000e+00, ..., 0.0000e+00, -4.6566e-09, -3.4925e-09], [ 0.0000e+00, 1.1642e-09, 0.0000e+00, ..., 1.1642e-09, 2.3283e-10, 2.3283e-10], [ 0.0000e+00, -1.1642e-09, 0.0000e+00, ..., -2.3283e-09, -6.2864e-09, -3.4925e-09], ..., [ 2.3283e-10, -1.6601e-07, 0.0000e+00, ..., -1.5902e-07, 5.3551e-09, 3.2596e-09], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 2.3283e-10, 6.9849e-10, 2.3283e-10], [ 0.0000e+00, 1.6042e-07, 0.0000e+00, ..., 1.5460e-07, 4.6566e-09, 3.4925e-09]], device='cuda:0') Epoch 488, bias, value: tensor([-0.0149, -0.0370, -0.0088, -0.0094, -0.0341, -0.0008, 0.0303, -0.0139, 0.0516, -0.0136], device='cuda:0'), grad: tensor([-1.0710e-08, 4.8894e-09, -2.7474e-08, -5.8208e-09, -3.7253e-09, 2.1886e-08, 1.1642e-09, -4.2585e-07, 3.2596e-09, 4.4378e-07], device='cuda:0') 100 1e-05 changing lr epoch 487, time 252.33, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4353 re_mapping 0.0025 re_causal 0.0083 /// teacc 99.10 lr 0.00001000 Epoch 489, weight, value: tensor([[-0.1411, -0.2922, -0.0900, ..., -0.0868, 0.1932, 0.2048], [-0.2712, -0.2416, -0.0691, ..., -0.2086, -0.2652, -0.1737], [-0.0688, -0.2197, 0.1674, ..., -0.2737, 0.3019, 0.1420], ..., [-0.2152, 0.1352, 0.0161, ..., 0.2425, -0.2724, -0.3325], [-0.3523, 0.0688, -0.1676, ..., 0.0690, -0.1308, -0.2491], [-0.0641, -0.1768, -0.0848, ..., -0.1643, -0.0733, -0.2627]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -2.7940e-09, -2.5611e-09], [ 0.0000e+00, 1.1409e-08, 0.0000e+00, ..., 2.9337e-08, 1.1642e-09, 4.6566e-10], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 9.3132e-10, -4.6566e-09, -1.3970e-09], ..., [ 0.0000e+00, -1.2340e-08, 0.0000e+00, ..., -3.1898e-08, 9.3132e-10, 2.3283e-10], [-4.6566e-10, -1.3970e-09, 0.0000e+00, ..., -1.4901e-08, 4.6566e-10, 2.3283e-10], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 1.1642e-09, 3.7253e-09, 2.7940e-09]], device='cuda:0') Epoch 489, bias, value: tensor([-0.0148, -0.0370, -0.0088, -0.0094, -0.0341, -0.0009, 0.0303, -0.0138, 0.0515, -0.0137], device='cuda:0'), grad: tensor([-3.9581e-09, -3.0571e-07, 5.9139e-08, 1.1409e-08, 1.2759e-07, 6.0536e-09, 4.9826e-08, 6.4261e-08, -4.4238e-08, 3.5390e-08], device='cuda:0') 100 1e-05 changing lr epoch 488, time 252.07, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4126 re_mapping 0.0025 re_causal 0.0081 /// teacc 99.12 lr 0.00001000 Epoch 490, weight, value: tensor([[-0.1411, -0.2922, -0.0900, ..., -0.0867, 0.1933, 0.2049], [-0.2712, -0.2416, -0.0689, ..., -0.2085, -0.2653, -0.1738], [-0.0688, -0.2197, 0.1674, ..., -0.2738, 0.3021, 0.1422], ..., [-0.2152, 0.1352, 0.0159, ..., 0.2424, -0.2725, -0.3326], [-0.3524, 0.0688, -0.1676, ..., 0.0691, -0.1309, -0.2492], [-0.0641, -0.1769, -0.0848, ..., -0.1644, -0.0734, -0.2629]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 6.9849e-10, 0.0000e+00, ..., 2.3283e-10, 1.1642e-09, 4.6566e-10], [ 4.6566e-10, 1.1642e-09, 0.0000e+00, ..., 6.9849e-10, 6.9849e-10, 2.3283e-10], [ 2.3283e-10, -2.5611e-09, 0.0000e+00, ..., 0.0000e+00, -9.0804e-09, -3.0268e-09], ..., [ 4.6566e-10, -2.0722e-08, 0.0000e+00, ..., -5.0524e-08, 2.5611e-09, 6.9849e-10], [ 6.9849e-10, 1.8626e-09, 0.0000e+00, ..., 2.3283e-10, 2.5611e-09, 9.3132e-10], [ 2.7940e-09, 2.5379e-08, 0.0000e+00, ..., 4.8429e-08, 0.0000e+00, 0.0000e+00]], device='cuda:0') Epoch 490, bias, value: tensor([-0.0147, -0.0369, -0.0087, -0.0094, -0.0342, -0.0009, 0.0303, -0.0139, 0.0516, -0.0137], device='cuda:0'), grad: tensor([ 5.3551e-09, 6.9849e-09, -3.0268e-08, -1.6298e-08, 3.2363e-08, 5.1223e-09, -6.9849e-10, -7.1246e-08, 1.1642e-08, 5.8906e-08], device='cuda:0') 100 1e-05 changing lr epoch 489, time 250.27, cls_loss 0.0008 cls_loss_mapping 0.0003 cls_loss_causal 0.3918 re_mapping 0.0025 re_causal 0.0080 /// teacc 99.15 lr 0.00001000 Epoch 491, weight, value: tensor([[-0.1412, -0.2923, -0.0900, ..., -0.0867, 0.1933, 0.2050], [-0.2713, -0.2416, -0.0689, ..., -0.2086, -0.2653, -0.1738], [-0.0688, -0.2197, 0.1674, ..., -0.2738, 0.3021, 0.1422], ..., [-0.2152, 0.1352, 0.0159, ..., 0.2425, -0.2726, -0.3326], [-0.3526, 0.0688, -0.1676, ..., 0.0691, -0.1310, -0.2493], [-0.0641, -0.1769, -0.0848, ..., -0.1646, -0.0733, -0.2629]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, -2.0955e-09, -1.5134e-09], [ 0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 2.2119e-09, 1.1642e-10, 0.0000e+00], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, -2.0955e-09, 4.6566e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 5.2387e-09, 0.0000e+00, 0.0000e+00], [ 1.1642e-10, 1.1642e-10, 0.0000e+00, ..., -2.3167e-08, 1.0477e-09, 6.9849e-10], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 3.4925e-10, 3.2596e-09, 6.9849e-10]], device='cuda:0') Epoch 491, bias, value: tensor([-0.0148, -0.0369, -0.0086, -0.0093, -0.0341, -0.0010, 0.0303, -0.0139, 0.0515, -0.0137], device='cuda:0'), grad: tensor([-2.7940e-09, 5.8208e-09, -2.7823e-08, 4.0745e-09, 3.4925e-10, 2.8987e-08, 4.7730e-09, 1.1292e-08, -3.6671e-08, 2.0256e-08], device='cuda:0') 100 1e-05 changing lr epoch 490, time 250.50, cls_loss 0.0007 cls_loss_mapping 0.0003 cls_loss_causal 0.4094 re_mapping 0.0024 re_causal 0.0082 /// teacc 99.15 lr 0.00001000 Epoch 492, weight, value: tensor([[-0.1416, -0.2923, -0.0900, ..., -0.0867, 0.1932, 0.2048], [-0.2714, -0.2416, -0.0689, ..., -0.2086, -0.2653, -0.1738], [-0.0688, -0.2197, 0.1674, ..., -0.2739, 0.3023, 0.1423], ..., [-0.2152, 0.1352, 0.0159, ..., 0.2426, -0.2727, -0.3328], [-0.3526, 0.0688, -0.1676, ..., 0.0693, -0.1310, -0.2493], [-0.0642, -0.1770, -0.0848, ..., -0.1646, -0.0734, -0.2631]], device='cuda:0'), grad: tensor([[ 0.0000e+00, -9.3132e-10, 0.0000e+00, ..., -1.1642e-09, -3.2713e-08, -2.9686e-08], [ 0.0000e+00, 6.9849e-10, 0.0000e+00, ..., 4.4238e-09, -2.4447e-09, 4.6566e-10], [ 0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 2.3283e-10, 4.3074e-09, 1.9791e-09], ..., [ 0.0000e+00, -4.3074e-09, 0.0000e+00, ..., -2.6776e-09, 1.2806e-09, 8.1491e-10], [ 1.1642e-10, 2.5611e-09, 0.0000e+00, ..., 1.7462e-09, 1.4901e-08, 1.4086e-08], [ 0.0000e+00, 4.6566e-10, 0.0000e+00, ..., 4.6566e-10, 6.7521e-09, 5.9372e-09]], device='cuda:0') Epoch 492, bias, value: tensor([-0.0150, -0.0369, -0.0084, -0.0094, -0.0342, -0.0010, 0.0304, -0.0139, 0.0517, -0.0138], device='cuda:0'), grad: tensor([-8.0210e-08, -1.2806e-09, 7.3807e-08, 7.4506e-09, -8.5915e-08, 7.9162e-09, 2.6543e-08, 1.1409e-08, 4.0629e-08, 1.0477e-08], device='cuda:0') 100 1e-05 changing lr epoch 491, time 250.49, cls_loss 0.0007 cls_loss_mapping 0.0003 cls_loss_causal 0.4578 re_mapping 0.0024 re_causal 0.0085 /// teacc 99.18 lr 0.00001000 Epoch 493, weight, value: tensor([[-0.1416, -0.2923, -0.0900, ..., -0.0865, 0.1935, 0.2051], [-0.2714, -0.2416, -0.0688, ..., -0.2086, -0.2654, -0.1739], [-0.0688, -0.2198, 0.1674, ..., -0.2740, 0.3023, 0.1424], ..., [-0.2153, 0.1352, 0.0158, ..., 0.2426, -0.2728, -0.3329], [-0.3528, 0.0688, -0.1676, ..., 0.0692, -0.1311, -0.2494], [-0.0642, -0.1771, -0.0848, ..., -0.1648, -0.0735, -0.2632]], device='cuda:0'), grad: tensor([[ 2.3283e-10, 8.1491e-10, 0.0000e+00, ..., 1.3970e-09, -2.8056e-08, -2.2817e-08], [ 2.3283e-10, 5.3085e-08, 0.0000e+00, ..., 9.3714e-08, 5.8208e-10, 4.6566e-10], [ 0.0000e+00, 2.3283e-10, 0.0000e+00, ..., 4.6566e-10, -6.9849e-10, 1.1642e-10], ..., [ 2.3283e-10, -6.2049e-08, 0.0000e+00, ..., -1.1094e-07, 9.3132e-10, 1.1642e-10], [ 4.6566e-10, 6.9849e-10, 0.0000e+00, ..., 5.8208e-10, 4.6566e-10, 3.4925e-10], [ 1.1642e-10, 3.7253e-09, 0.0000e+00, ..., 6.4028e-09, 6.4028e-09, 5.2387e-09]], device='cuda:0') Epoch 493, bias, value: tensor([-0.0147, -0.0369, -0.0084, -0.0094, -0.0341, -0.0009, 0.0302, -0.0139, 0.0516, -0.0139], device='cuda:0'), grad: tensor([-6.1002e-08, 2.0408e-07, -2.3283e-10, 1.9558e-08, -2.3283e-09, 9.3132e-10, 4.2375e-08, -2.3213e-07, 3.9581e-09, 3.4110e-08], device='cuda:0') 100 1e-05 changing lr epoch 492, time 250.41, cls_loss 0.0007 cls_loss_mapping 0.0003 cls_loss_causal 0.4151 re_mapping 0.0025 re_causal 0.0082 /// teacc 99.14 lr 0.00001000 Epoch 494, weight, value: tensor([[-0.1416, -0.2923, -0.0900, ..., -0.0866, 0.1935, 0.2052], [-0.2716, -0.2416, -0.0688, ..., -0.2087, -0.2655, -0.1739], [-0.0688, -0.2197, 0.1674, ..., -0.2740, 0.3024, 0.1424], ..., [-0.2153, 0.1353, 0.0158, ..., 0.2427, -0.2729, -0.3330], [-0.3530, 0.0687, -0.1676, ..., 0.0692, -0.1313, -0.2495], [-0.0642, -0.1772, -0.0848, ..., -0.1649, -0.0733, -0.2632]], device='cuda:0'), grad: tensor([[ 1.6298e-09, -1.1642e-10, 0.0000e+00, ..., 5.8208e-10, -9.4296e-09, -7.4506e-09], [ 1.3458e-07, 1.1642e-10, 0.0000e+00, ..., 4.4471e-08, 1.1642e-10, 1.1642e-10], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 1.1642e-10, 4.6566e-10, 3.4925e-10], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.1642e-10, 1.1642e-10], [ 3.2480e-08, 1.1642e-10, 0.0000e+00, ..., 1.0710e-08, 4.6566e-10, 2.3283e-10], [ 0.0000e+00, 3.4925e-10, 0.0000e+00, ..., 5.8208e-10, 7.9162e-09, 6.2864e-09]], device='cuda:0') Epoch 494, bias, value: tensor([-0.0148, -0.0369, -0.0083, -0.0094, -0.0340, -0.0010, 0.0303, -0.0139, 0.0515, -0.0140], device='cuda:0'), grad: tensor([-1.7812e-08, 1.8859e-08, 6.7404e-08, 2.3283e-09, -4.6566e-10, 1.0170e-06, -1.3569e-06, 1.6915e-07, 7.9046e-08, 2.6077e-08], device='cuda:0') 100 1e-05 changing lr epoch 493, time 250.37, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4489 re_mapping 0.0024 re_causal 0.0085 /// teacc 99.13 lr 0.00001000 Epoch 495, weight, value: tensor([[-0.1417, -0.2924, -0.0900, ..., -0.0866, 0.1935, 0.2052], [-0.2717, -0.2416, -0.0688, ..., -0.2087, -0.2655, -0.1740], [-0.0689, -0.2198, 0.1674, ..., -0.2742, 0.3024, 0.1424], ..., [-0.2153, 0.1353, 0.0158, ..., 0.2427, -0.2730, -0.3330], [-0.3531, 0.0687, -0.1676, ..., 0.0693, -0.1314, -0.2495], [-0.0643, -0.1773, -0.0848, ..., -0.1651, -0.0734, -0.2634]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 0.0000e+00, ..., 3.4925e-10, -8.1491e-10, -0.0000e+00], [ 3.4925e-10, 1.1642e-10, -1.1642e-10, ..., 1.1642e-10, 4.6566e-10, 6.9849e-10], [ 3.4925e-10, 2.3283e-10, 1.1642e-10, ..., 3.4925e-10, 3.4925e-10, 5.8208e-10], ..., [ 3.4925e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 4.6566e-10, 5.8208e-10], [-1.3970e-09, 0.0000e+00, 0.0000e+00, ..., -1.8626e-09, 1.1642e-10, -1.6298e-09], [ 2.8522e-08, 4.6566e-10, 0.0000e+00, ..., 1.2806e-09, 3.5390e-08, 5.0175e-08]], device='cuda:0') Epoch 495, bias, value: tensor([-0.0148, -0.0369, -0.0084, -0.0092, -0.0338, -0.0011, 0.0303, -0.0139, 0.0516, -0.0142], device='cuda:0'), grad: tensor([ 2.4098e-08, -4.8894e-09, 1.8394e-08, 2.5611e-09, -5.0291e-07, 1.5134e-09, 4.0047e-08, 5.1223e-09, -1.9674e-08, 4.4215e-07], device='cuda:0') 100 1e-05 changing lr epoch 494, time 250.46, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4220 re_mapping 0.0024 re_causal 0.0082 /// teacc 99.13 lr 0.00001000 Epoch 496, weight, value: tensor([[-0.1417, -0.2924, -0.0900, ..., -0.0865, 0.1936, 0.2053], [-0.2717, -0.2416, -0.0688, ..., -0.2087, -0.2656, -0.1740], [-0.0689, -0.2198, 0.1674, ..., -0.2742, 0.3025, 0.1424], ..., [-0.2154, 0.1353, 0.0158, ..., 0.2427, -0.2730, -0.3331], [-0.3532, 0.0686, -0.1676, ..., 0.0693, -0.1314, -0.2495], [-0.0643, -0.1774, -0.0849, ..., -0.1653, -0.0735, -0.2635]], device='cuda:0'), grad: tensor([[ 0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 4.6566e-10, 0.0000e+00, 1.1642e-10], [ 0.0000e+00, 1.6298e-09, 0.0000e+00, ..., 6.6357e-09, 0.0000e+00, 1.5134e-09], [ 0.0000e+00, 9.3132e-10, 0.0000e+00, ..., 4.3074e-09, 0.0000e+00, 1.1642e-09], ..., [ 0.0000e+00, -3.0268e-09, 0.0000e+00, ..., -1.2689e-08, 0.0000e+00, -3.1432e-09], [ 0.0000e+00, 1.1642e-10, 0.0000e+00, ..., 3.4925e-10, 0.0000e+00, 1.1642e-10], [ 0.0000e+00, 5.8208e-10, 0.0000e+00, ..., 1.8626e-09, 0.0000e+00, 3.4925e-10]], device='cuda:0') Epoch 496, bias, value: tensor([-0.0147, -0.0369, -0.0084, -0.0091, -0.0338, -0.0012, 0.0303, -0.0139, 0.0517, -0.0142], device='cuda:0'), grad: tensor([ 1.0477e-09, 1.6764e-08, 1.1642e-08, 1.3970e-09, 9.3132e-10, 0.0000e+00, 1.1642e-10, -3.0035e-08, 9.3132e-10, 5.2387e-09], device='cuda:0') 100 1e-05 changing lr epoch 495, time 249.88, cls_loss 0.0008 cls_loss_mapping 0.0003 cls_loss_causal 0.4342 re_mapping 0.0024 re_causal 0.0083 /// teacc 99.13 lr 0.00001000 Epoch 497, weight, value: tensor([[-0.1417, -0.2924, -0.0900, ..., -0.0864, 0.1937, 0.2054], [-0.2718, -0.2416, -0.0687, ..., -0.2086, -0.2657, -0.1740], [-0.0690, -0.2199, 0.1675, ..., -0.2744, 0.3026, 0.1424], ..., [-0.2154, 0.1353, 0.0157, ..., 0.2427, -0.2731, -0.3331], [-0.3534, 0.0686, -0.1676, ..., 0.0693, -0.1315, -0.2496], [-0.0643, -0.1775, -0.0849, ..., -0.1654, -0.0736, -0.2637]], device='cuda:0'), grad: tensor([[ 1.1642e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, -9.5461e-09, -7.3342e-09], [ 1.1642e-10, 1.1642e-10, 0.0000e+00, ..., 1.1642e-10, 1.1642e-10, 1.1642e-10], [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.1642e-10, 0.0000e+00], ..., [ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [ 5.8208e-10, 1.1642e-10, 0.0000e+00, ..., -2.0955e-09, 1.1642e-10, 1.1642e-10], [ 3.4925e-10, 0.0000e+00, 0.0000e+00, ..., 5.8208e-10, 1.6298e-09, 1.2806e-09]], device='cuda:0') Epoch 497, bias, value: tensor([-0.0146, -0.0369, -0.0082, -0.0090, -0.0337, -0.0013, 0.0302, -0.0139, 0.0516, -0.0143], device='cuda:0'), grad: tensor([-1.7928e-08, -3.0501e-08, 4.5402e-09, 1.6741e-07, 4.6566e-10, -1.5786e-07, 1.5600e-08, 2.5495e-08, -9.3132e-09, 6.6357e-09], device='cuda:0') 100 1e-05 changing lr epoch 496, time 249.75, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4177 re_mapping 0.0024 re_causal 0.0081 /// teacc 99.14 lr 0.00001000 Epoch 498, weight, value: tensor([[-0.1420, -0.2924, -0.0900, ..., -0.0865, 0.1936, 0.2053], [-0.2719, -0.2416, -0.0687, ..., -0.2087, -0.2657, -0.1739], [-0.0690, -0.2199, 0.1675, ..., -0.2745, 0.3027, 0.1424], ..., [-0.2155, 0.1353, 0.0157, ..., 0.2428, -0.2731, -0.3331], [-0.3537, 0.0686, -0.1676, ..., 0.0691, -0.1316, -0.2497], [-0.0642, -0.1776, -0.0849, ..., -0.1655, -0.0736, -0.2638]], device='cuda:0'), grad: tensor([[ 8.1491e-09, 2.3283e-10, 0.0000e+00, ..., 4.8894e-09, 1.4203e-08, 5.8208e-09], [ 1.2806e-08, 2.5611e-09, 0.0000e+00, ..., 4.6566e-10, 1.6764e-08, 7.9162e-09], [ 1.8626e-09, 1.3970e-09, 0.0000e+00, ..., 4.6566e-10, -6.0536e-09, -4.6566e-10], ..., [ 5.5879e-09, 3.4925e-09, 0.0000e+00, ..., 1.3970e-09, 6.9849e-10, 0.0000e+00], [ 6.7521e-08, 1.1874e-08, 0.0000e+00, ..., 3.2829e-08, -1.6531e-08, -8.1491e-09], [ 2.2585e-08, 4.1910e-09, 0.0000e+00, ..., 7.2177e-09, 3.4925e-09, 1.6298e-09]], device='cuda:0') Epoch 498, bias, value: tensor([-0.0148, -0.0369, -0.0083, -0.0090, -0.0337, -0.0011, 0.0302, -0.0139, 0.0513, -0.0144], device='cuda:0'), grad: tensor([ 8.8708e-08, 7.6834e-08, -7.4506e-09, 6.8452e-07, -7.2177e-09, -1.1483e-06, 1.5204e-07, 2.4680e-08, 6.0536e-08, 8.2189e-08], device='cuda:0') 100 1e-05 changing lr epoch 497, time 250.23, cls_loss 0.0008 cls_loss_mapping 0.0004 cls_loss_causal 0.4486 re_mapping 0.0025 re_causal 0.0085 /// teacc 99.12 lr 0.00001000 Epoch 499, weight, value: tensor([[-0.1420, -0.2925, -0.0900, ..., -0.0865, 0.1937, 0.2053], [-0.2720, -0.2416, -0.0687, ..., -0.2088, -0.2659, -0.1740], [-0.0690, -0.2199, 0.1675, ..., -0.2745, 0.3029, 0.1424], ..., [-0.2156, 0.1353, 0.0157, ..., 0.2429, -0.2732, -0.3332], [-0.3538, 0.0685, -0.1676, ..., 0.0691, -0.1317, -0.2498], [-0.0643, -0.1777, -0.0849, ..., -0.1656, -0.0737, -0.2640]], device='cuda:0'), grad: tensor([[-1.3970e-09, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, -2.8871e-08, -2.0489e-08], [ 4.6566e-10, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 6.9849e-10, 4.6566e-10], [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 6.9849e-10, 4.6566e-10], ..., [ 2.3283e-10, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 0.0000e+00], [ 8.1956e-08, 0.0000e+00, 0.0000e+00, ..., 5.3318e-08, 0.0000e+00, 0.0000e+00], [ 9.3132e-10, 2.3283e-10, 0.0000e+00, ..., 1.1642e-09, 1.8394e-08, 1.1642e-08]], device='cuda:0') Epoch 499, bias, value: tensor([-0.0147, -0.0369, -0.0083, -0.0088, -0.0337, -0.0012, 0.0302, -0.0139, 0.0513, -0.0145], device='cuda:0'), grad: tensor([-8.4983e-08, 3.7253e-09, 2.0955e-09, 2.5937e-07, -1.7462e-08, -6.7800e-07, 2.9942e-07, 1.3970e-09, 1.5483e-07, 7.0781e-08], device='cuda:0') 100 1e-05 changing lr epoch 498, time 249.84, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4336 re_mapping 0.0024 re_causal 0.0082 /// teacc 99.12 lr 0.00001000 Epoch 500, weight, value: tensor([[-0.1422, -0.2925, -0.0900, ..., -0.0865, 0.1936, 0.2053], [-0.2721, -0.2417, -0.0687, ..., -0.2088, -0.2660, -0.1740], [-0.0690, -0.2199, 0.1676, ..., -0.2745, 0.3031, 0.1425], ..., [-0.2157, 0.1353, 0.0157, ..., 0.2430, -0.2734, -0.3333], [-0.3539, 0.0685, -0.1676, ..., 0.0691, -0.1318, -0.2499], [-0.0643, -0.1778, -0.0849, ..., -0.1657, -0.0736, -0.2641]], device='cuda:0'), grad: tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], ..., [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.3283e-10, 2.3283e-10], [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 2.3283e-10, 6.9849e-10, 4.6566e-10]], device='cuda:0') Epoch 500, bias, value: tensor([-0.0149, -0.0369, -0.0083, -0.0088, -0.0336, -0.0012, 0.0302, -0.0139, 0.0514, -0.0145], device='cuda:0'), grad: tensor([ 2.3283e-10, 4.6566e-10, 9.3132e-10, 4.6566e-10, -6.9849e-09, 4.6566e-10, 2.3283e-10, 1.6298e-09, 1.6298e-09, 6.7521e-09], device='cuda:0') 100 1e-05 changing lr epoch 499, time 250.06, cls_loss 0.0007 cls_loss_mapping 0.0004 cls_loss_causal 0.4281 re_mapping 0.0024 re_causal 0.0084 /// teacc 99.11 lr 0.00001000 ---------------------saving last model at epoch 499---------------------------------------------------- /home/yuqian_fu {'gpu': '0', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep500_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1_eps2', 'svpath': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep500_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1_eps2/14factor_best.csv', 'channels': 3, 'factor_num': 14, 'stride': 3, 'epoch': 'best', 'eval_mapping': True} loading weight of best randm: False stride: 3 loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best loading weight of best Using downloaded and verified file: /home/yuqian_fu/.pytorch/SVHN/test_32x32.mat mnist mnist_FA ... usps_FA Avg ShearX 98.940002 98.900002 ... 81.913307 71.451877 ShearY 98.830002 98.769997 ... 81.913307 70.115474 AutoContrast 99.010002 99.110001 ... 81.913307 62.938438 Invert 98.889999 81.290001 ... 81.913307 56.000196 Equalize 98.400002 97.909996 ... 81.913307 71.951622 Solarize 98.379997 96.630005 ... 81.913307 62.680078 SolarizeAdd 98.529999 96.599998 ... 81.913307 68.729369 Posterize 98.959999 99.029999 ... 81.913307 76.495037 Contrast 99.119995 99.129997 ... 81.913307 69.983360 Color 99.059998 99.180000 ... 81.913307 61.149043 Brightness 99.040001 99.139999 ... 81.913307 70.621150 Sharpness 99.029999 99.049995 ... 81.913307 71.066257 NoiseSalt 98.900002 99.139999 ... 81.913307 60.974798 NoiseGaussian 98.959999 99.180000 ... 81.913307 59.648846 w/o do (original x) 99.180000 0.000000 ... 0.000000 65.803656 [15 rows x 11 columns] mnist svhn mnist_m syndigit usps Avg do 99.12 66.318377 78.657927 77.336962 86.596911 77.227544