/home/yuqian_fu here1 here2 {'gpu': '0', 'data': 'mnist', 'ntr': None, 'translate': None, 'autoaug': 'CA_multiple', 'n': 3, 'stride': 3, 'factor_num': 14, 'epochs': 250, 'nbatch': 100, 'batchsize': 32, 'lr': 0.0001, 'lr_scheduler': 'Step', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep250_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1_epoch250', 'clsadapt': True, 'lambda_causal': 1.0, 'lambda_re': 1.0, 'randm': True, 'randn': True, 'network': 'resnet18'} stride: 3 --------------------------CA_multiple-------------------------- ---------------------------14 factors----------------- randm: True randn: True n: 3 randm: False Epoch 1, weight, value: tensor([[-0.0118, 0.0228, -0.0024, ..., -0.0263, 0.0294, 0.0225], [ 0.0117, 0.0125, -0.0312, ..., 0.0006, -0.0133, 0.0132], [ 0.0204, -0.0145, -0.0123, ..., -0.0209, 0.0127, -0.0073], ..., [-0.0015, -0.0013, 0.0121, ..., 0.0043, 0.0199, 0.0109], [ 0.0096, 0.0208, -0.0142, ..., 0.0160, -0.0174, 0.0288], [-0.0055, -0.0161, 0.0242, ..., -0.0260, -0.0230, 0.0003]], device='cuda:0'), grad: None Epoch 1, bias, value: tensor([ 0.0002, 0.0236, 0.0062, 0.0051, 0.0040, -0.0094, 0.0250, -0.0272, 0.0250, 0.0046], device='cuda:0'), grad: None 100 0.0001 changing lr ---------------------saving model at epoch 0---------------------------------------------------- epoch 0, time 221.75, cls_loss 1.6842 cls_loss_mapping 2.0049 cls_loss_causal 2.2496 re_mapping 0.0767 re_causal 0.0774 /// teacc 84.37 lr 0.00010000 Epoch 2, weight, value: tensor([[-0.0126, 0.0228, 0.0004, ..., -0.0319, 0.0314, 0.0219], [ 0.0086, 0.0125, -0.0421, ..., 0.0074, -0.0105, 0.0126], [ 0.0192, -0.0145, -0.0187, ..., -0.0247, 0.0152, -0.0079], ..., [ 0.0011, -0.0013, 0.0107, ..., 0.0045, 0.0185, 0.0114], [ 0.0097, 0.0208, -0.0128, ..., 0.0117, -0.0165, 0.0295], [-0.0047, -0.0161, 0.0288, ..., -0.0276, -0.0288, -0.0004]], device='cuda:0'), grad: tensor([[ 0.0072, 0.0000, 0.0089, ..., 0.0002, 0.0204, 0.0000], [ 0.0124, 0.0000, 0.0125, ..., 0.0015, 0.0347, 0.0000], [-0.0183, 0.0000, 0.0107, ..., 0.0003, -0.0353, 0.0000], ..., [-0.0045, 0.0000, -0.0565, ..., -0.0074, -0.0013, 0.0000], [-0.0344, 0.0000, -0.0058, ..., -0.0060, -0.0693, 0.0000], [ 0.0173, 0.0000, 0.1266, ..., 0.0083, 0.0386, 0.0000]], device='cuda:0') Epoch 2, bias, value: tensor([-0.0037, 0.0263, 0.0055, 0.0059, 0.0041, -0.0085, 0.0241, -0.0262, 0.0237, 0.0050], device='cuda:0'), grad: tensor([ 0.0261, 0.0339, -0.0185, 0.0468, -0.0235, -0.0387, -0.0398, -0.0343, -0.0600, 0.1082], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 1---------------------------------------------------- epoch 1, time 222.81, cls_loss 0.5751 cls_loss_mapping 0.8903 cls_loss_causal 1.9459 re_mapping 0.2095 re_causal 0.2474 /// teacc 90.15 lr 0.00010000 Epoch 3, weight, value: tensor([[-0.0133, 0.0228, -0.0016, ..., -0.0345, 0.0322, 0.0219], [ 0.0073, 0.0125, -0.0446, ..., 0.0099, -0.0113, 0.0126], [ 0.0210, -0.0145, -0.0211, ..., -0.0259, 0.0167, -0.0079], ..., [ 0.0014, -0.0013, 0.0108, ..., 0.0048, 0.0192, 0.0114], [ 0.0087, 0.0208, -0.0127, ..., 0.0093, -0.0153, 0.0295], [-0.0047, -0.0161, 0.0294, ..., -0.0308, -0.0306, -0.0004]], device='cuda:0'), grad: tensor([[ 0.0004, 0.0000, 0.0062, ..., 0.0003, 0.0099, 0.0000], [ 0.0026, 0.0000, 0.0009, ..., 0.0002, 0.0028, 0.0000], [-0.0282, 0.0000, -0.0045, ..., 0.0004, -0.0375, 0.0000], ..., [-0.0017, 0.0000, 0.0044, ..., 0.0007, -0.0014, 0.0000], [ 0.0175, 0.0000, 0.0013, ..., 0.0011, 0.0173, 0.0000], [ 0.0029, 0.0000, 0.0058, ..., 0.0027, 0.0027, 0.0000]], device='cuda:0') Epoch 3, bias, value: tensor([-0.0044, 0.0265, 0.0052, 0.0057, 0.0041, -0.0070, 0.0236, -0.0264, 0.0234, 0.0052], device='cuda:0'), grad: tensor([ 0.0199, 0.0046, -0.0508, -0.0047, 0.0078, -0.0029, -0.0100, 0.0019, 0.0240, 0.0102], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 2---------------------------------------------------- epoch 2, time 220.88, cls_loss 0.3477 cls_loss_mapping 0.5253 cls_loss_causal 1.7447 re_mapping 0.1589 re_causal 0.2344 /// teacc 93.42 lr 0.00010000 Epoch 4, weight, value: tensor([[-0.0136, 0.0228, -0.0035, ..., -0.0363, 0.0330, 0.0219], [ 0.0068, 0.0125, -0.0460, ..., 0.0108, -0.0103, 0.0126], [ 0.0225, -0.0145, -0.0216, ..., -0.0257, 0.0167, -0.0079], ..., [ 0.0025, -0.0013, 0.0104, ..., 0.0053, 0.0195, 0.0114], [ 0.0083, 0.0208, -0.0141, ..., 0.0069, -0.0150, 0.0295], [-0.0057, -0.0161, 0.0304, ..., -0.0329, -0.0310, -0.0004]], device='cuda:0'), grad: tensor([[-0.0008, 0.0000, 0.0006, ..., 0.0000, -0.0026, 0.0000], [ 0.0009, 0.0000, 0.0005, ..., 0.0000, -0.0022, 0.0000], [-0.0275, 0.0000, 0.0009, ..., 0.0000, -0.0058, 0.0000], ..., [ 0.0023, 0.0000, 0.0018, ..., 0.0000, 0.0020, 0.0000], [ 0.0611, 0.0000, 0.0223, ..., 0.0000, 0.0236, 0.0000], [ 0.0045, 0.0000, -0.0014, ..., 0.0000, 0.0022, 0.0000]], device='cuda:0') Epoch 4, bias, value: tensor([-0.0041, 0.0269, 0.0052, 0.0057, 0.0037, -0.0064, 0.0235, -0.0267, 0.0230, 0.0052], device='cuda:0'), grad: tensor([-0.0077, -0.0008, -0.0145, -0.0323, 0.0042, -0.0293, 0.0009, 0.0038, 0.0704, 0.0052], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 3---------------------------------------------------- epoch 3, time 218.72, cls_loss 0.2539 cls_loss_mapping 0.3669 cls_loss_causal 1.5234 re_mapping 0.1287 re_causal 0.2210 /// teacc 94.90 lr 0.00010000 Epoch 5, weight, value: tensor([[-0.0134, 0.0228, -0.0046, ..., -0.0408, 0.0336, 0.0219], [ 0.0065, 0.0125, -0.0470, ..., 0.0115, -0.0087, 0.0126], [ 0.0238, -0.0145, -0.0223, ..., -0.0229, 0.0165, -0.0079], ..., [ 0.0047, -0.0013, 0.0098, ..., 0.0070, 0.0199, 0.0114], [ 0.0074, 0.0208, -0.0146, ..., 0.0038, -0.0147, 0.0295], [-0.0073, -0.0161, 0.0309, ..., -0.0338, -0.0308, -0.0004]], device='cuda:0'), grad: tensor([[ 2.4948e-03, 0.0000e+00, 2.3232e-03, ..., 5.9843e-05, 8.5449e-03, 0.0000e+00], [-3.3360e-03, 0.0000e+00, -6.3744e-03, ..., -1.0071e-02, -2.1877e-03, 0.0000e+00], [ 2.0355e-02, 0.0000e+00, 1.5106e-03, ..., 8.4114e-04, 1.0719e-02, 0.0000e+00], ..., [ 1.0414e-02, 0.0000e+00, 7.0229e-03, ..., -1.8120e-04, 6.8588e-03, 0.0000e+00], [-4.2389e-02, 0.0000e+00, 1.3069e-02, ..., 2.7485e-03, -1.3840e-02, 0.0000e+00], [ 4.2953e-03, 0.0000e+00, -3.7781e-02, ..., 6.1703e-04, -8.6594e-03, 0.0000e+00]], device='cuda:0') Epoch 5, bias, value: tensor([-0.0039, 0.0276, 0.0055, 0.0055, 0.0038, -0.0067, 0.0232, -0.0267, 0.0230, 0.0051], device='cuda:0'), grad: tensor([ 0.0161, -0.0300, 0.0190, 0.0335, -0.0035, -0.0140, -0.0066, 0.0149, -0.0064, -0.0230], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 4---------------------------------------------------- epoch 4, time 219.21, cls_loss 0.2005 cls_loss_mapping 0.2738 cls_loss_causal 1.4344 re_mapping 0.1058 re_causal 0.2003 /// teacc 95.39 lr 0.00010000 Epoch 6, weight, value: tensor([[-0.0138, 0.0228, -0.0060, ..., -0.0444, 0.0334, 0.0219], [ 0.0058, 0.0125, -0.0481, ..., 0.0117, -0.0080, 0.0126], [ 0.0248, -0.0145, -0.0229, ..., -0.0204, 0.0163, -0.0079], ..., [ 0.0057, -0.0013, 0.0094, ..., 0.0076, 0.0200, 0.0114], [ 0.0074, 0.0208, -0.0151, ..., 0.0024, -0.0143, 0.0295], [-0.0078, -0.0161, 0.0315, ..., -0.0345, -0.0306, -0.0004]], device='cuda:0'), grad: tensor([[-2.9697e-03, 0.0000e+00, 6.4507e-03, ..., 2.4348e-05, -9.5177e-04, 0.0000e+00], [ 1.1158e-03, 0.0000e+00, 4.9877e-04, ..., 7.8022e-05, 9.6560e-04, 0.0000e+00], [ 1.1053e-03, 0.0000e+00, 1.8187e-03, ..., 1.6415e-04, -8.0795e-03, 0.0000e+00], ..., [-1.9119e-02, 0.0000e+00, -5.7793e-04, ..., 5.4866e-05, -1.5533e-02, 0.0000e+00], [ 2.9640e-03, 0.0000e+00, -6.6185e-03, ..., 3.2568e-04, 4.6883e-03, 0.0000e+00], [ 1.2794e-02, 0.0000e+00, 8.0338e-03, ..., 2.3575e-03, 1.3672e-02, 0.0000e+00]], device='cuda:0') Epoch 6, bias, value: tensor([-0.0039, 0.0276, 0.0055, 0.0052, 0.0041, -0.0066, 0.0229, -0.0268, 0.0231, 0.0053], device='cuda:0'), grad: tensor([ 0.0018, 0.0014, -0.0032, 0.0058, -0.0147, 0.0065, 0.0040, -0.0266, -0.0003, 0.0253], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 5---------------------------------------------------- epoch 5, time 219.32, cls_loss 0.1731 cls_loss_mapping 0.2329 cls_loss_causal 1.3032 re_mapping 0.0907 re_causal 0.1857 /// teacc 95.98 lr 0.00010000 Epoch 7, weight, value: tensor([[-0.0144, 0.0228, -0.0067, ..., -0.0481, 0.0336, 0.0219], [ 0.0057, 0.0125, -0.0497, ..., 0.0112, -0.0073, 0.0126], [ 0.0258, -0.0145, -0.0238, ..., -0.0200, 0.0160, -0.0079], ..., [ 0.0062, -0.0013, 0.0088, ..., 0.0078, 0.0200, 0.0114], [ 0.0069, 0.0208, -0.0155, ..., 0.0013, -0.0143, 0.0295], [-0.0090, -0.0161, 0.0321, ..., -0.0352, -0.0308, -0.0004]], device='cuda:0'), grad: tensor([[-5.1041e-03, 0.0000e+00, 1.3676e-03, ..., 2.3358e-06, -3.9597e-03, 0.0000e+00], [ 1.2379e-03, 0.0000e+00, 9.1839e-04, ..., 1.4175e-06, 6.3438e-03, 0.0000e+00], [ 5.2166e-04, 0.0000e+00, 2.2125e-03, ..., 9.5833e-07, -5.6763e-03, 0.0000e+00], ..., [ 1.5430e-03, 0.0000e+00, 1.4477e-03, ..., -1.9759e-05, 4.8304e-04, 0.0000e+00], [ 4.6120e-03, 0.0000e+00, 2.7122e-03, ..., 3.3388e-07, 3.3855e-03, 0.0000e+00], [ 1.3089e-04, 0.0000e+00, -1.5572e-02, ..., 1.0930e-05, -2.2449e-03, 0.0000e+00]], device='cuda:0') Epoch 7, bias, value: tensor([-0.0034, 0.0276, 0.0053, 0.0051, 0.0045, -0.0070, 0.0225, -0.0269, 0.0232, 0.0054], device='cuda:0'), grad: tensor([-0.0089, 0.0094, -0.0060, -0.0251, 0.0048, 0.0266, -0.0009, 0.0030, 0.0094, -0.0122], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 6---------------------------------------------------- epoch 6, time 219.21, cls_loss 0.1524 cls_loss_mapping 0.2100 cls_loss_causal 1.2993 re_mapping 0.0762 re_causal 0.1680 /// teacc 96.07 lr 0.00010000 Epoch 8, weight, value: tensor([[-0.0149, 0.0228, -0.0079, ..., -0.0489, 0.0333, 0.0219], [ 0.0052, 0.0125, -0.0507, ..., 0.0111, -0.0067, 0.0126], [ 0.0266, -0.0145, -0.0243, ..., -0.0201, 0.0152, -0.0079], ..., [ 0.0067, -0.0013, 0.0084, ..., 0.0079, 0.0200, 0.0114], [ 0.0065, 0.0208, -0.0156, ..., 0.0013, -0.0144, 0.0295], [-0.0094, -0.0161, 0.0328, ..., -0.0354, -0.0297, -0.0004]], device='cuda:0'), grad: tensor([[-0.0003, 0.0000, 0.0005, ..., 0.0000, -0.0020, 0.0000], [ 0.0006, 0.0000, -0.0210, ..., 0.0000, -0.0089, 0.0000], [-0.0083, 0.0000, 0.0005, ..., 0.0000, -0.0036, 0.0000], ..., [ 0.0036, 0.0000, 0.0120, ..., 0.0000, 0.0046, 0.0000], [ 0.0022, 0.0000, 0.0033, ..., 0.0000, 0.0036, 0.0000], [-0.0108, 0.0000, -0.0057, ..., 0.0000, -0.0091, 0.0000]], device='cuda:0') Epoch 8, bias, value: tensor([-0.0035, 0.0279, 0.0055, 0.0049, 0.0042, -0.0072, 0.0225, -0.0270, 0.0231, 0.0058], device='cuda:0'), grad: tensor([-0.0048, -0.0293, -0.0114, 0.0277, 0.0001, 0.0094, -0.0057, 0.0235, 0.0070, -0.0167], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 7---------------------------------------------------- epoch 7, time 219.46, cls_loss 0.1335 cls_loss_mapping 0.1743 cls_loss_causal 1.2317 re_mapping 0.0690 re_causal 0.1570 /// teacc 96.80 lr 0.00010000 Epoch 9, weight, value: tensor([[-0.0148, 0.0228, -0.0085, ..., -0.0495, 0.0332, 0.0219], [ 0.0047, 0.0125, -0.0512, ..., 0.0108, -0.0057, 0.0126], [ 0.0269, -0.0145, -0.0251, ..., -0.0202, 0.0141, -0.0079], ..., [ 0.0080, -0.0013, 0.0080, ..., 0.0082, 0.0203, 0.0114], [ 0.0060, 0.0208, -0.0156, ..., 0.0017, -0.0143, 0.0295], [-0.0108, -0.0161, 0.0333, ..., -0.0362, -0.0297, -0.0004]], device='cuda:0'), grad: tensor([[ 2.0733e-03, 0.0000e+00, 2.9349e-04, ..., 0.0000e+00, -7.9453e-05, 0.0000e+00], [ 1.6060e-03, 0.0000e+00, 8.5402e-04, ..., 0.0000e+00, 9.1362e-04, 0.0000e+00], [ 5.8055e-05, 0.0000e+00, 3.3522e-04, ..., 0.0000e+00, 9.9277e-04, 0.0000e+00], ..., [ 4.6692e-03, 0.0000e+00, 2.4109e-03, ..., 0.0000e+00, 8.7357e-04, 0.0000e+00], [-1.6518e-03, 0.0000e+00, 1.9779e-03, ..., 0.0000e+00, -1.9665e-03, 0.0000e+00], [ 2.2774e-03, 0.0000e+00, 4.0283e-03, ..., 0.0000e+00, -9.0265e-04, 0.0000e+00]], device='cuda:0') Epoch 9, bias, value: tensor([-0.0035, 0.0284, 0.0051, 0.0052, 0.0044, -0.0077, 0.0223, -0.0268, 0.0235, 0.0056], device='cuda:0'), grad: tensor([ 0.0006, 0.0042, 0.0017, 0.0190, -0.0242, -0.0178, -0.0019, 0.0090, 0.0014, 0.0079], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 8---------------------------------------------------- epoch 8, time 219.68, cls_loss 0.1251 cls_loss_mapping 0.1543 cls_loss_causal 1.1917 re_mapping 0.0646 re_causal 0.1443 /// teacc 97.04 lr 0.00010000 Epoch 10, weight, value: tensor([[-0.0155, 0.0228, -0.0092, ..., -0.0506, 0.0329, 0.0219], [ 0.0038, 0.0125, -0.0519, ..., 0.0105, -0.0049, 0.0126], [ 0.0274, -0.0145, -0.0256, ..., -0.0202, 0.0133, -0.0079], ..., [ 0.0088, -0.0013, 0.0078, ..., 0.0089, 0.0203, 0.0114], [ 0.0060, 0.0208, -0.0159, ..., 0.0020, -0.0140, 0.0295], [-0.0118, -0.0161, 0.0339, ..., -0.0364, -0.0292, -0.0004]], device='cuda:0'), grad: tensor([[ 3.0828e-04, 0.0000e+00, 4.2892e-04, ..., 4.5970e-06, 4.0579e-04, 0.0000e+00], [ 6.8617e-04, 0.0000e+00, 8.0824e-05, ..., 1.7239e-06, -4.3917e-04, 0.0000e+00], [ 1.7424e-03, 0.0000e+00, 5.7983e-04, ..., 1.4137e-06, 2.6398e-03, 0.0000e+00], ..., [ 1.7726e-04, 0.0000e+00, 2.1839e-04, ..., -1.5914e-05, 5.3644e-04, 0.0000e+00], [-4.1962e-03, 0.0000e+00, -5.4979e-04, ..., 1.0096e-06, 2.3484e-05, 0.0000e+00], [ 9.2316e-04, 0.0000e+00, 4.6015e-04, ..., 4.0270e-06, 1.1492e-04, 0.0000e+00]], device='cuda:0') Epoch 10, bias, value: tensor([-0.0037, 0.0280, 0.0050, 0.0049, 0.0042, -0.0077, 0.0222, -0.0264, 0.0240, 0.0057], device='cuda:0'), grad: tensor([ 0.0023, 0.0010, 0.0195, -0.0186, 0.0017, 0.0015, -0.0014, 0.0021, -0.0106, 0.0024], device='cuda:0') 100 0.0001 changing lr epoch 9, time 218.62, cls_loss 0.1113 cls_loss_mapping 0.1363 cls_loss_causal 1.1221 re_mapping 0.0603 re_causal 0.1350 /// teacc 96.99 lr 0.00010000 Epoch 11, weight, value: tensor([[-0.0164, 0.0228, -0.0096, ..., -0.0521, 0.0328, 0.0219], [ 0.0033, 0.0125, -0.0523, ..., 0.0103, -0.0042, 0.0126], [ 0.0280, -0.0145, -0.0261, ..., -0.0204, 0.0129, -0.0079], ..., [ 0.0089, -0.0013, 0.0071, ..., 0.0104, 0.0199, 0.0114], [ 0.0057, 0.0208, -0.0162, ..., 0.0015, -0.0138, 0.0295], [-0.0123, -0.0161, 0.0344, ..., -0.0364, -0.0289, -0.0004]], device='cuda:0'), grad: tensor([[ 1.7178e-04, 0.0000e+00, 1.3912e-04, ..., 2.2531e-05, 3.3593e-04, 0.0000e+00], [ 3.1853e-04, 0.0000e+00, 1.2231e-04, ..., 5.9724e-05, -3.1519e-04, 0.0000e+00], [ 1.0300e-03, 0.0000e+00, 1.1057e-04, ..., 3.7074e-04, 4.4799e-04, 0.0000e+00], ..., [-2.0523e-03, 0.0000e+00, 4.9686e-04, ..., -3.4976e-04, -1.0376e-03, 0.0000e+00], [ 2.0905e-03, 0.0000e+00, 3.2921e-03, ..., 2.2125e-04, 2.3098e-03, 0.0000e+00], [-1.5465e-02, 0.0000e+00, -5.7793e-03, ..., 1.4937e-04, -1.5442e-02, 0.0000e+00]], device='cuda:0') Epoch 11, bias, value: tensor([-0.0038, 0.0280, 0.0052, 0.0048, 0.0044, -0.0077, 0.0219, -0.0265, 0.0243, 0.0056], device='cuda:0'), grad: tensor([ 6.6853e-04, -5.5701e-05, 1.0920e-03, 1.9730e-02, 7.5483e-04, -5.8441e-03, -1.4508e-04, -1.8644e-03, 5.8098e-03, -2.0142e-02], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 10---------------------------------------------------- epoch 10, time 221.32, cls_loss 0.0999 cls_loss_mapping 0.1227 cls_loss_causal 1.1094 re_mapping 0.0549 re_causal 0.1267 /// teacc 97.24 lr 0.00010000 Epoch 12, weight, value: tensor([[-0.0171, 0.0228, -0.0102, ..., -0.0525, 0.0324, 0.0219], [ 0.0031, 0.0125, -0.0536, ..., 0.0116, -0.0032, 0.0126], [ 0.0284, -0.0145, -0.0262, ..., -0.0203, 0.0123, -0.0079], ..., [ 0.0094, -0.0013, 0.0069, ..., 0.0110, 0.0197, 0.0114], [ 0.0054, 0.0208, -0.0166, ..., 0.0004, -0.0138, 0.0295], [-0.0129, -0.0161, 0.0346, ..., -0.0370, -0.0284, -0.0004]], device='cuda:0'), grad: tensor([[ 2.4557e-04, 0.0000e+00, -2.1343e-03, ..., 1.0300e-06, -1.6947e-03, 0.0000e+00], [ 2.1577e-04, 0.0000e+00, 3.1209e-04, ..., 1.6671e-07, -8.0013e-04, 0.0000e+00], [-1.9207e-03, 0.0000e+00, 4.3845e-04, ..., 1.9372e-07, 2.2519e-04, 0.0000e+00], ..., [ 2.8992e-04, 0.0000e+00, 1.5678e-03, ..., 1.7462e-07, 1.4315e-03, 0.0000e+00], [-1.1641e-04, 0.0000e+00, -3.7231e-03, ..., 4.0419e-06, -3.1757e-03, 0.0000e+00], [ 1.1234e-03, 0.0000e+00, 7.3776e-03, ..., 1.6848e-06, 2.5864e-03, 0.0000e+00]], device='cuda:0') Epoch 12, bias, value: tensor([-0.0040, 0.0282, 0.0049, 0.0049, 0.0042, -0.0078, 0.0220, -0.0264, 0.0244, 0.0056], device='cuda:0'), grad: tensor([-1.0231e-02, -1.9920e-04, -1.6049e-05, 4.3144e-03, -4.3869e-03, 3.5877e-03, 1.2624e-04, 5.4092e-03, -1.2894e-02, 1.4290e-02], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 11---------------------------------------------------- epoch 11, time 218.79, cls_loss 0.0856 cls_loss_mapping 0.1040 cls_loss_causal 1.0154 re_mapping 0.0538 re_causal 0.1211 /// teacc 97.46 lr 0.00010000 Epoch 13, weight, value: tensor([[-1.7175e-02, 2.2769e-02, -1.0488e-02, ..., -5.3541e-02, 3.2494e-02, 2.1893e-02], [ 2.9271e-03, 1.2500e-02, -5.4046e-02, ..., 1.1676e-02, -2.6134e-03, 1.2576e-02], [ 2.8715e-02, -1.4480e-02, -2.6587e-02, ..., -2.0364e-02, 1.1720e-02, -7.9110e-03], ..., [ 1.0327e-02, -1.2941e-03, 6.8708e-03, ..., 1.1770e-02, 1.9598e-02, 1.1425e-02], [ 4.7974e-03, 2.0836e-02, -1.6912e-02, ..., 4.7806e-05, -1.3859e-02, 2.9502e-02], [-1.3850e-02, -1.6141e-02, 3.5095e-02, ..., -3.7387e-02, -2.7884e-02, -3.8447e-04]], device='cuda:0'), grad: tensor([[ 4.3821e-04, 0.0000e+00, 6.7174e-05, ..., 5.4501e-06, 9.9778e-05, 0.0000e+00], [ 1.0386e-03, 0.0000e+00, 1.5581e-04, ..., 1.2964e-05, -3.2783e-04, 0.0000e+00], [ 1.9627e-03, 0.0000e+00, 2.3377e-04, ..., 1.3493e-05, 1.2789e-03, 0.0000e+00], ..., [-1.0918e-02, 0.0000e+00, -2.1954e-03, ..., -1.4901e-04, -1.3571e-03, 0.0000e+00], [ 2.9602e-03, 0.0000e+00, 9.1553e-05, ..., 1.5095e-05, -7.1478e-04, 0.0000e+00], [-3.8795e-03, 0.0000e+00, -1.3718e-02, ..., 4.4942e-05, 7.5340e-04, 0.0000e+00]], device='cuda:0') Epoch 13, bias, value: tensor([-0.0035, 0.0282, 0.0049, 0.0050, 0.0041, -0.0079, 0.0217, -0.0261, 0.0245, 0.0054], device='cuda:0'), grad: tensor([-0.0006, 0.0010, -0.0040, 0.0029, 0.0187, 0.0026, -0.0006, -0.0105, 0.0057, -0.0152], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 12---------------------------------------------------- epoch 12, time 219.25, cls_loss 0.0909 cls_loss_mapping 0.1133 cls_loss_causal 1.0595 re_mapping 0.0484 re_causal 0.1174 /// teacc 97.57 lr 0.00010000 Epoch 14, weight, value: tensor([[-0.0174, 0.0223, -0.0110, ..., -0.0539, 0.0324, 0.0219], [ 0.0026, 0.0108, -0.0538, ..., 0.0116, -0.0016, 0.0126], [ 0.0294, -0.0126, -0.0267, ..., -0.0198, 0.0113, -0.0079], ..., [ 0.0107, -0.0024, 0.0064, ..., 0.0118, 0.0192, 0.0114], [ 0.0041, 0.0201, -0.0172, ..., -0.0004, -0.0138, 0.0295], [-0.0141, -0.0164, 0.0356, ..., -0.0377, -0.0274, -0.0004]], device='cuda:0'), grad: tensor([[-1.1490e-02, 0.0000e+00, -4.9531e-05, ..., 0.0000e+00, -1.7609e-02, 0.0000e+00], [ 8.0109e-03, 0.0000e+00, 3.1352e-04, ..., 0.0000e+00, 1.1238e-02, 0.0000e+00], [-5.4665e-03, 0.0000e+00, 1.9860e-04, ..., 0.0000e+00, 4.8375e-04, 0.0000e+00], ..., [ 3.0017e-04, 0.0000e+00, 1.4007e-04, ..., 0.0000e+00, 4.7827e-04, 0.0000e+00], [ 5.2948e-03, 0.0000e+00, 6.5279e-04, ..., 0.0000e+00, 1.3485e-03, 0.0000e+00], [ 3.5019e-03, 0.0000e+00, 1.9302e-03, ..., 0.0000e+00, 2.7733e-03, 0.0000e+00]], device='cuda:0') Epoch 14, bias, value: tensor([-0.0034, 0.0282, 0.0052, 0.0048, 0.0041, -0.0080, 0.0213, -0.0263, 0.0245, 0.0056], device='cuda:0'), grad: tensor([-0.0307, 0.0211, -0.0127, -0.0020, -0.0030, -0.0025, 0.0077, 0.0009, 0.0143, 0.0070], device='cuda:0') 100 0.0001 changing lr epoch 13, time 218.77, cls_loss 0.0798 cls_loss_mapping 0.0973 cls_loss_causal 1.0558 re_mapping 0.0448 re_causal 0.1108 /// teacc 97.50 lr 0.00010000 Epoch 15, weight, value: tensor([[-0.0175, 0.0202, -0.0115, ..., -0.0566, 0.0324, 0.0219], [ 0.0023, 0.0079, -0.0546, ..., 0.0113, -0.0007, 0.0126], [ 0.0293, -0.0095, -0.0275, ..., -0.0198, 0.0108, -0.0079], ..., [ 0.0112, -0.0049, 0.0064, ..., 0.0129, 0.0189, 0.0114], [ 0.0036, 0.0175, -0.0176, ..., -0.0007, -0.0135, 0.0295], [-0.0153, -0.0178, 0.0362, ..., -0.0381, -0.0271, -0.0004]], device='cuda:0'), grad: tensor([[ 1.3185e-04, 0.0000e+00, -1.1963e-04, ..., 1.0490e-05, 2.2864e-04, 0.0000e+00], [-1.4439e-03, 0.0000e+00, 6.2943e-05, ..., 1.4651e-04, -2.2297e-03, 0.0000e+00], [ 2.1629e-03, 0.0000e+00, 7.2241e-05, ..., 7.5197e-04, 1.1454e-03, 0.0000e+00], ..., [ 3.1403e-02, 0.0000e+00, 3.8028e-04, ..., 1.6312e-02, 4.7922e-04, 0.0000e+00], [ 4.9973e-04, 0.0000e+00, 1.9255e-03, ..., 1.0657e-04, 3.2711e-04, 0.0000e+00], [ 1.1492e-03, 0.0000e+00, 1.2932e-03, ..., 5.6684e-05, 3.3975e-04, 0.0000e+00]], device='cuda:0') Epoch 15, bias, value: tensor([-0.0033, 0.0283, 0.0051, 0.0051, 0.0043, -0.0082, 0.0211, -0.0261, 0.0245, 0.0054], device='cuda:0'), grad: tensor([-0.0007, -0.0028, 0.0026, -0.0197, -0.0010, -0.0102, 0.0006, 0.0254, 0.0029, 0.0028], device='cuda:0') 100 0.0001 changing lr epoch 14, time 217.64, cls_loss 0.0826 cls_loss_mapping 0.1031 cls_loss_causal 1.0196 re_mapping 0.0416 re_causal 0.0998 /// teacc 97.44 lr 0.00010000 Epoch 16, weight, value: tensor([[-1.7718e-02, 1.2063e-02, -1.2141e-02, ..., -5.6871e-02, 3.2121e-02, 2.1893e-02], [ 1.5121e-03, -5.6718e-04, -5.4833e-02, ..., 1.1570e-02, -2.3863e-05, 1.2576e-02], [ 2.9192e-02, -1.5483e-03, -2.7864e-02, ..., -2.0202e-02, 1.0512e-02, -7.9110e-03], ..., [ 1.2028e-02, -1.3074e-02, 5.7202e-03, ..., 1.3618e-02, 1.8653e-02, 1.1425e-02], [ 4.3767e-03, 1.0639e-02, -1.7824e-02, ..., -8.9181e-04, -1.3371e-02, 2.9502e-02], [-1.6387e-02, -2.5177e-02, 3.6844e-02, ..., -3.8102e-02, -2.6650e-02, -3.8447e-04]], device='cuda:0'), grad: tensor([[ 5.1260e-05, 0.0000e+00, -8.0913e-06, ..., 3.9674e-07, -1.6284e-04, 0.0000e+00], [ 3.3450e-04, 0.0000e+00, 2.2471e-04, ..., 2.7921e-06, -3.1233e-04, 0.0000e+00], [ 1.9503e-04, 0.0000e+00, 1.0413e-04, ..., 1.9163e-05, 1.1146e-04, 0.0000e+00], ..., [-9.7370e-04, 0.0000e+00, 1.0544e-04, ..., 2.6613e-05, -6.1214e-05, 0.0000e+00], [ 3.0208e-04, 0.0000e+00, 2.2686e-04, ..., 1.3120e-05, 1.8597e-04, 0.0000e+00], [ 2.8443e-04, 0.0000e+00, -5.9414e-04, ..., 8.5756e-06, -4.8470e-04, 0.0000e+00]], device='cuda:0') Epoch 16, bias, value: tensor([-0.0035, 0.0285, 0.0049, 0.0049, 0.0045, -0.0085, 0.0211, -0.0261, 0.0251, 0.0052], device='cuda:0'), grad: tensor([-1.1375e-02, 1.7452e-04, 6.6805e-04, 1.0559e-02, 7.7844e-05, -3.0689e-03, 3.1452e-03, -7.8106e-04, 9.4700e-04, -3.5429e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 15---------------------------------------------------- epoch 15, time 217.96, cls_loss 0.0682 cls_loss_mapping 0.0822 cls_loss_causal 0.9587 re_mapping 0.0406 re_causal 0.0976 /// teacc 97.77 lr 0.00010000 Epoch 17, weight, value: tensor([[-0.0181, 0.0045, -0.0123, ..., -0.0573, 0.0318, 0.0219], [ 0.0012, 0.0009, -0.0553, ..., 0.0113, 0.0007, 0.0126], [ 0.0296, -0.0034, -0.0283, ..., -0.0206, 0.0101, -0.0079], ..., [ 0.0129, -0.0164, 0.0054, ..., 0.0138, 0.0187, 0.0114], [ 0.0039, 0.0084, -0.0182, ..., -0.0005, -0.0135, 0.0295], [-0.0172, -0.0323, 0.0373, ..., -0.0381, -0.0264, -0.0004]], device='cuda:0'), grad: tensor([[ 1.4687e-04, 0.0000e+00, 1.2720e-04, ..., 2.6543e-07, 1.8859e-04, 0.0000e+00], [ 1.4913e-04, 0.0000e+00, 1.7488e-04, ..., 1.1977e-06, -1.1253e-03, 0.0000e+00], [-6.3324e-04, 0.0000e+00, 1.1492e-04, ..., 3.8277e-07, 1.6022e-04, 0.0000e+00], ..., [ 2.5234e-03, 0.0000e+00, 5.4646e-04, ..., -6.7838e-06, 8.2111e-04, 0.0000e+00], [ 3.8171e-04, 0.0000e+00, 1.2569e-03, ..., 3.0734e-07, 7.5245e-04, 0.0000e+00], [ 3.7074e-04, 0.0000e+00, -4.2114e-03, ..., 2.6487e-06, -5.6601e-04, 0.0000e+00]], device='cuda:0') Epoch 17, bias, value: tensor([-0.0036, 0.0284, 0.0050, 0.0047, 0.0045, -0.0085, 0.0211, -0.0256, 0.0248, 0.0053], device='cuda:0'), grad: tensor([ 0.0004, -0.0008, -0.0018, 0.0008, 0.0044, -0.0139, 0.0076, 0.0042, 0.0035, -0.0042], device='cuda:0') 100 0.0001 changing lr epoch 16, time 217.24, cls_loss 0.0656 cls_loss_mapping 0.0820 cls_loss_causal 0.9977 re_mapping 0.0380 re_causal 0.0964 /// teacc 97.29 lr 0.00010000 Epoch 18, weight, value: tensor([[-0.0176, -0.0005, -0.0122, ..., -0.0576, 0.0316, 0.0219], [ 0.0006, 0.0024, -0.0561, ..., 0.0109, 0.0013, 0.0126], [ 0.0301, -0.0043, -0.0290, ..., -0.0205, 0.0099, -0.0079], ..., [ 0.0131, -0.0207, 0.0057, ..., 0.0136, 0.0186, 0.0114], [ 0.0035, 0.0050, -0.0187, ..., -0.0001, -0.0133, 0.0295], [-0.0181, -0.0383, 0.0374, ..., -0.0383, -0.0263, -0.0004]], device='cuda:0'), grad: tensor([[ 8.4162e-05, 0.0000e+00, 6.0856e-05, ..., 2.8871e-07, 5.6237e-05, 0.0000e+00], [-8.5354e-04, 0.0000e+00, 8.1241e-05, ..., 4.4331e-07, -1.0977e-03, 0.0000e+00], [-2.4509e-04, 0.0000e+00, 7.7248e-05, ..., 7.7300e-07, 1.1319e-04, 0.0000e+00], ..., [-6.0678e-05, 0.0000e+00, 8.0490e-04, ..., -5.2229e-06, 2.4199e-04, 0.0000e+00], [ 5.5408e-04, 0.0000e+00, 1.1454e-03, ..., 2.2631e-07, 2.9325e-04, 0.0000e+00], [-8.4305e-04, 0.0000e+00, -4.2801e-03, ..., 1.4668e-06, 7.2896e-05, 0.0000e+00]], device='cuda:0') Epoch 18, bias, value: tensor([-0.0032, 0.0283, 0.0051, 0.0047, 0.0047, -0.0088, 0.0209, -0.0256, 0.0248, 0.0053], device='cuda:0'), grad: tensor([-0.0002, -0.0039, 0.0004, 0.0022, 0.0046, -0.0015, 0.0001, 0.0017, 0.0029, -0.0064], device='cuda:0') 100 0.0001 changing lr epoch 17, time 217.25, cls_loss 0.0763 cls_loss_mapping 0.0965 cls_loss_causal 0.9589 re_mapping 0.0377 re_causal 0.0918 /// teacc 97.72 lr 0.00010000 Epoch 19, weight, value: tensor([[-1.8015e-02, -1.0963e-02, -1.2383e-02, ..., -5.7778e-02, 3.1339e-02, 2.1893e-02], [ 8.5515e-05, 4.4174e-03, -5.6827e-02, ..., 1.0723e-02, 2.0767e-03, 1.2576e-02], [ 3.0601e-02, -3.0445e-03, -2.8312e-02, ..., -2.0565e-02, 9.7862e-03, -7.9110e-03], ..., [ 1.2963e-02, -2.1107e-02, 5.4822e-03, ..., 1.4224e-02, 1.8137e-02, 1.1425e-02], [ 3.4132e-03, -6.8826e-03, -1.9315e-02, ..., -2.9777e-04, -1.3033e-02, 2.9502e-02], [-1.8927e-02, -5.1199e-02, 3.7845e-02, ..., -3.8537e-02, -2.5991e-02, -3.8447e-04]], device='cuda:0'), grad: tensor([[ 3.1853e-04, 2.1532e-06, 1.5855e-04, ..., 2.0117e-07, 1.8072e-04, 0.0000e+00], [ 2.7370e-04, 7.7114e-06, 6.9857e-05, ..., -1.7434e-05, -6.2704e-05, 0.0000e+00], [-6.3896e-04, -2.4572e-05, 1.4186e-04, ..., 1.6643e-06, 3.7342e-05, 0.0000e+00], ..., [-8.5115e-04, 2.7381e-06, 1.6773e-04, ..., 7.1079e-06, -7.5245e-04, 0.0000e+00], [ 3.0422e-04, 3.9712e-06, 5.8651e-04, ..., 1.0869e-06, 8.0168e-05, 0.0000e+00], [ 7.1096e-04, 3.1386e-07, 2.6274e-04, ..., 7.3947e-07, 4.4203e-04, 0.0000e+00]], device='cuda:0') Epoch 19, bias, value: tensor([-0.0031, 0.0282, 0.0061, 0.0049, 0.0044, -0.0090, 0.0207, -0.0260, 0.0248, 0.0051], device='cuda:0'), grad: tensor([ 1.1177e-03, 3.6025e-04, -2.2340e-04, -1.9180e-02, 8.4519e-05, 1.4404e-02, 1.5745e-03, -2.0638e-03, 1.7977e-03, 2.1400e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 18---------------------------------------------------- epoch 18, time 218.21, cls_loss 0.0601 cls_loss_mapping 0.0775 cls_loss_causal 0.9205 re_mapping 0.0346 re_causal 0.0893 /// teacc 97.79 lr 0.00010000 Epoch 20, weight, value: tensor([[-0.0183, -0.0115, -0.0128, ..., -0.0579, 0.0312, 0.0219], [-0.0007, 0.0045, -0.0574, ..., 0.0107, 0.0024, 0.0126], [ 0.0308, -0.0015, -0.0285, ..., -0.0207, 0.0096, -0.0079], ..., [ 0.0134, -0.0245, 0.0053, ..., 0.0145, 0.0179, 0.0114], [ 0.0032, -0.0086, -0.0197, ..., -0.0002, -0.0130, 0.0295], [-0.0196, -0.0530, 0.0381, ..., -0.0390, -0.0255, -0.0004]], device='cuda:0'), grad: tensor([[ 1.0884e-04, 0.0000e+00, 1.6153e-05, ..., 1.6764e-08, 6.9022e-05, 0.0000e+00], [ 1.8191e-04, 0.0000e+00, 2.9355e-05, ..., 1.6857e-07, 1.8477e-04, 0.0000e+00], [-1.3094e-03, 0.0000e+00, -3.0130e-05, ..., 1.5832e-07, 1.6534e-04, 0.0000e+00], ..., [ 1.2052e-04, 0.0000e+00, 2.3365e-05, ..., -1.0924e-06, -1.6861e-03, 0.0000e+00], [-1.8060e-05, 0.0000e+00, 1.0139e-04, ..., 1.0245e-07, 6.1655e-04, 0.0000e+00], [-9.4414e-05, 0.0000e+00, 6.0320e-04, ..., 2.3562e-07, -2.7871e-04, 0.0000e+00]], device='cuda:0') Epoch 20, bias, value: tensor([-0.0034, 0.0280, 0.0061, 0.0053, 0.0048, -0.0089, 0.0204, -0.0259, 0.0248, 0.0049], device='cuda:0'), grad: tensor([ 4.3273e-04, 2.6550e-03, -2.4376e-03, 1.8759e-03, 7.3075e-05, 1.4553e-03, 1.5039e-03, -1.5472e-02, 1.0910e-02, -1.0080e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 19---------------------------------------------------- epoch 19, time 218.04, cls_loss 0.0640 cls_loss_mapping 0.0817 cls_loss_causal 0.8950 re_mapping 0.0349 re_causal 0.0851 /// teacc 98.11 lr 0.00010000 Epoch 21, weight, value: tensor([[-0.0184, -0.0115, -0.0130, ..., -0.0580, 0.0308, 0.0219], [-0.0009, 0.0049, -0.0574, ..., 0.0107, 0.0033, 0.0126], [ 0.0312, -0.0017, -0.0292, ..., -0.0208, 0.0092, -0.0079], ..., [ 0.0143, -0.0247, 0.0050, ..., 0.0148, 0.0179, 0.0114], [ 0.0026, -0.0089, -0.0199, ..., -0.0002, -0.0128, 0.0295], [-0.0204, -0.0534, 0.0383, ..., -0.0392, -0.0253, -0.0004]], device='cuda:0'), grad: tensor([[-5.2547e-04, 1.7667e-06, 5.3942e-05, ..., 5.7742e-08, 4.2856e-05, 0.0000e+00], [ 3.6860e-04, 1.1669e-06, 1.4949e-04, ..., 1.7779e-06, -6.7472e-05, 0.0000e+00], [-7.4244e-04, -1.0341e-05, 3.2067e-05, ..., 3.9022e-07, 2.6047e-05, 0.0000e+00], ..., [-3.2663e-04, 2.5127e-06, 1.7512e-04, ..., -5.9791e-06, 1.5795e-04, 0.0000e+00], [ 2.0874e-04, 1.8450e-06, 1.1164e-04, ..., 2.2445e-07, -2.4092e-04, 0.0000e+00], [-1.0830e-04, 3.3621e-07, 3.2806e-04, ..., 1.4734e-06, -2.7418e-04, 0.0000e+00]], device='cuda:0') Epoch 21, bias, value: tensor([-0.0038, 0.0283, 0.0060, 0.0048, 0.0048, -0.0084, 0.0200, -0.0256, 0.0249, 0.0049], device='cuda:0'), grad: tensor([-0.0028, 0.0007, -0.0006, 0.0019, -0.0031, 0.0019, 0.0008, 0.0007, -0.0004, 0.0009], device='cuda:0') 100 0.0001 changing lr epoch 20, time 217.31, cls_loss 0.0542 cls_loss_mapping 0.0705 cls_loss_causal 0.9315 re_mapping 0.0321 re_causal 0.0833 /// teacc 98.00 lr 0.00010000 Epoch 22, weight, value: tensor([[-0.0184, -0.0117, -0.0132, ..., -0.0585, 0.0304, 0.0219], [-0.0010, 0.0058, -0.0580, ..., 0.0102, 0.0037, 0.0126], [ 0.0317, -0.0022, -0.0298, ..., -0.0210, 0.0090, -0.0079], ..., [ 0.0145, -0.0249, 0.0048, ..., 0.0151, 0.0174, 0.0114], [ 0.0020, -0.0101, -0.0203, ..., -0.0005, -0.0128, 0.0295], [-0.0208, -0.0541, 0.0386, ..., -0.0394, -0.0246, -0.0004]], device='cuda:0'), grad: tensor([[-9.7454e-05, 3.9972e-06, -3.6925e-05, ..., 2.0768e-07, 3.5435e-05, 0.0000e+00], [ 1.3280e-04, 1.9222e-05, 2.7806e-05, ..., 1.4547e-06, -6.2227e-04, 0.0000e+00], [-6.7043e-04, -1.7178e-04, 6.2108e-05, ..., 1.5860e-06, 1.1045e-04, 0.0000e+00], ..., [ 6.2799e-04, 1.4037e-05, 9.3579e-05, ..., -7.2643e-06, 3.0923e-04, 0.0000e+00], [ 7.3671e-04, 8.6010e-05, 4.7803e-04, ..., 5.4855e-07, -2.6751e-04, 0.0000e+00], [ 5.8937e-04, 1.4156e-06, -2.1660e-04, ..., 1.5246e-06, -2.0134e-04, 0.0000e+00]], device='cuda:0') Epoch 22, bias, value: tensor([-0.0040, 0.0282, 0.0062, 0.0045, 0.0049, -0.0084, 0.0202, -0.0260, 0.0250, 0.0051], device='cuda:0'), grad: tensor([-0.0005, -0.0007, -0.0005, 0.0026, 0.0002, -0.0050, 0.0004, 0.0015, 0.0013, 0.0006], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 21---------------------------------------------------- epoch 21, time 218.45, cls_loss 0.0558 cls_loss_mapping 0.0664 cls_loss_causal 0.8972 re_mapping 0.0329 re_causal 0.0796 /// teacc 98.12 lr 0.00010000 Epoch 23, weight, value: tensor([[-0.0186, -0.0118, -0.0133, ..., -0.0587, 0.0301, 0.0219], [-0.0009, 0.0076, -0.0593, ..., 0.0102, 0.0041, 0.0126], [ 0.0318, -0.0036, -0.0302, ..., -0.0211, 0.0086, -0.0079], ..., [ 0.0148, -0.0253, 0.0048, ..., 0.0162, 0.0172, 0.0114], [ 0.0021, -0.0110, -0.0203, ..., -0.0007, -0.0126, 0.0295], [-0.0218, -0.0545, 0.0391, ..., -0.0396, -0.0243, -0.0004]], device='cuda:0'), grad: tensor([[ 3.8981e-04, 0.0000e+00, 1.4699e-04, ..., 2.1756e-06, 2.6536e-04, 0.0000e+00], [ 6.7186e-04, 0.0000e+00, 3.4070e-04, ..., 6.9365e-06, -9.4461e-04, 0.0000e+00], [-3.8490e-03, 0.0000e+00, 8.8835e-04, ..., 6.3591e-06, -6.7091e-04, 0.0000e+00], ..., [-5.2261e-04, 0.0000e+00, 3.4733e-03, ..., 1.2529e-04, 5.5164e-05, 0.0000e+00], [ 3.1433e-03, 0.0000e+00, 1.5602e-03, ..., 4.8876e-05, 1.4668e-03, 0.0000e+00], [ 7.0763e-04, 0.0000e+00, 1.0300e-03, ..., 2.1428e-05, 9.9838e-05, 0.0000e+00]], device='cuda:0') Epoch 23, bias, value: tensor([-0.0037, 0.0282, 0.0060, 0.0042, 0.0050, -0.0085, 0.0200, -0.0257, 0.0254, 0.0050], device='cuda:0'), grad: tensor([ 0.0011, 0.0006, -0.0045, 0.0013, -0.0178, 0.0050, -0.0032, 0.0054, 0.0092, 0.0029], device='cuda:0') 100 0.0001 changing lr epoch 22, time 217.54, cls_loss 0.0515 cls_loss_mapping 0.0723 cls_loss_causal 0.9218 re_mapping 0.0304 re_causal 0.0818 /// teacc 98.03 lr 0.00010000 Epoch 24, weight, value: tensor([[-0.0190, -0.0121, -0.0134, ..., -0.0590, 0.0299, 0.0219], [-0.0009, 0.0083, -0.0598, ..., 0.0101, 0.0048, 0.0126], [ 0.0321, -0.0038, -0.0306, ..., -0.0207, 0.0081, -0.0079], ..., [ 0.0150, -0.0254, 0.0046, ..., 0.0170, 0.0171, 0.0114], [ 0.0021, -0.0132, -0.0203, ..., -0.0009, -0.0127, 0.0295], [-0.0229, -0.0565, 0.0390, ..., -0.0399, -0.0241, -0.0004]], device='cuda:0'), grad: tensor([[ 8.8394e-05, 4.2655e-06, 3.4682e-06, ..., 5.5693e-06, 2.7239e-05, 0.0000e+00], [-1.1295e-04, -7.3290e-04, -4.3720e-05, ..., -3.6359e-05, -5.7220e-04, 0.0000e+00], [ 4.0527e-02, 4.5562e-04, 1.8632e-04, ..., 6.4278e-03, 1.5032e-04, 0.0000e+00], ..., [ 6.4468e-04, 7.9751e-05, 3.1877e-04, ..., 3.8838e-04, 3.8862e-04, 0.0000e+00], [-6.1512e-04, 7.6592e-05, -3.2444e-03, ..., 3.0696e-05, -7.5388e-04, 0.0000e+00], [ 2.3174e-03, 1.2986e-05, 2.5196e-03, ..., 1.5192e-05, 5.9557e-04, 0.0000e+00]], device='cuda:0') Epoch 24, bias, value: tensor([-0.0039, 0.0285, 0.0059, 0.0042, 0.0053, -0.0083, 0.0200, -0.0256, 0.0251, 0.0047], device='cuda:0'), grad: tensor([-0.0012, -0.0017, 0.0298, -0.0299, 0.0003, 0.0004, 0.0009, 0.0002, -0.0079, 0.0092], device='cuda:0') 100 0.0001 changing lr epoch 23, time 217.11, cls_loss 0.0458 cls_loss_mapping 0.0581 cls_loss_causal 0.8614 re_mapping 0.0302 re_causal 0.0783 /// teacc 98.01 lr 0.00010000 Epoch 25, weight, value: tensor([[-0.0196, -0.0126, -0.0135, ..., -0.0592, 0.0296, 0.0219], [-0.0011, 0.0091, -0.0608, ..., 0.0096, 0.0052, 0.0126], [ 0.0319, -0.0037, -0.0306, ..., -0.0209, 0.0079, -0.0079], ..., [ 0.0156, -0.0260, 0.0052, ..., 0.0189, 0.0168, 0.0114], [ 0.0015, -0.0175, -0.0203, ..., -0.0011, -0.0129, 0.0295], [-0.0235, -0.0598, 0.0393, ..., -0.0400, -0.0235, -0.0004]], device='cuda:0'), grad: tensor([[ 4.8935e-05, 0.0000e+00, 2.1607e-05, ..., 1.3225e-07, 3.8505e-05, 0.0000e+00], [ 1.8549e-04, 0.0000e+00, 1.1384e-05, ..., 1.1250e-06, -2.6122e-05, 0.0000e+00], [-8.5258e-04, 0.0000e+00, 8.4117e-06, ..., 1.3690e-06, 5.0306e-05, 0.0000e+00], ..., [ 4.9067e-04, 0.0000e+00, 1.0548e-03, ..., -1.5140e-05, 1.9670e-04, 0.0000e+00], [ 2.7752e-04, 0.0000e+00, 5.5969e-05, ..., 8.2888e-07, 1.3232e-04, 0.0000e+00], [ 1.8609e-04, 0.0000e+00, -1.1787e-03, ..., 6.0052e-06, -9.8109e-05, 0.0000e+00]], device='cuda:0') Epoch 25, bias, value: tensor([-0.0035, 0.0285, 0.0059, 0.0044, 0.0050, -0.0086, 0.0200, -0.0250, 0.0248, 0.0046], device='cuda:0'), grad: tensor([ 8.8573e-05, 2.4557e-04, -1.3609e-03, -4.1275e-03, 2.1636e-04, 3.8681e-03, -4.5490e-04, 2.5425e-03, 5.7983e-04, -1.5926e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 24---------------------------------------------------- epoch 24, time 218.00, cls_loss 0.0482 cls_loss_mapping 0.0591 cls_loss_causal 0.9073 re_mapping 0.0287 re_causal 0.0757 /// teacc 98.21 lr 0.00010000 Epoch 26, weight, value: tensor([[-0.0200, -0.0126, -0.0137, ..., -0.0600, 0.0294, 0.0219], [-0.0016, 0.0096, -0.0609, ..., 0.0104, 0.0054, 0.0126], [ 0.0320, -0.0038, -0.0307, ..., -0.0209, 0.0078, -0.0079], ..., [ 0.0163, -0.0261, 0.0048, ..., 0.0181, 0.0170, 0.0114], [ 0.0009, -0.0183, -0.0208, ..., -0.0012, -0.0128, 0.0295], [-0.0241, -0.0604, 0.0394, ..., -0.0390, -0.0238, -0.0004]], device='cuda:0'), grad: tensor([[ 9.5725e-05, 0.0000e+00, 8.0466e-05, ..., 1.1489e-05, 4.8697e-05, 0.0000e+00], [-2.5034e-04, 0.0000e+00, 1.3340e-04, ..., -2.1994e-04, -6.4373e-04, 0.0000e+00], [ 8.7690e-04, 0.0000e+00, 5.6028e-04, ..., 9.1791e-05, 7.7188e-05, 0.0000e+00], ..., [-3.2005e-03, 0.0000e+00, -2.3712e-02, ..., -4.6501e-03, 3.2640e-04, 0.0000e+00], [-3.3665e-04, 0.0000e+00, -1.1625e-03, ..., 1.3635e-05, 9.6321e-05, 0.0000e+00], [-9.8944e-05, 0.0000e+00, -9.7084e-04, ..., 2.3186e-05, 8.5235e-05, 0.0000e+00]], device='cuda:0') Epoch 26, bias, value: tensor([-0.0037, 0.0280, 0.0060, 0.0041, 0.0053, -0.0081, 0.0202, -0.0248, 0.0245, 0.0045], device='cuda:0'), grad: tensor([-0.0005, -0.0022, 0.0017, 0.0016, 0.0291, 0.0009, -0.0002, -0.0281, -0.0015, -0.0009], device='cuda:0') 100 0.0001 changing lr epoch 25, time 217.30, cls_loss 0.0344 cls_loss_mapping 0.0494 cls_loss_causal 0.8657 re_mapping 0.0287 re_causal 0.0770 /// teacc 98.14 lr 0.00010000 Epoch 27, weight, value: tensor([[-0.0204, -0.0127, -0.0142, ..., -0.0601, 0.0290, 0.0219], [-0.0020, 0.0102, -0.0611, ..., 0.0110, 0.0059, 0.0126], [ 0.0326, -0.0040, -0.0305, ..., -0.0214, 0.0074, -0.0079], ..., [ 0.0168, -0.0271, 0.0049, ..., 0.0186, 0.0166, 0.0114], [ 0.0006, -0.0189, -0.0212, ..., -0.0013, -0.0127, 0.0295], [-0.0250, -0.0607, 0.0398, ..., -0.0388, -0.0233, -0.0004]], device='cuda:0'), grad: tensor([[-1.2600e-04, 0.0000e+00, -5.7258e-06, ..., 2.7940e-08, 8.4005e-07, 0.0000e+00], [-2.2388e-04, 0.0000e+00, 2.0713e-05, ..., -2.2314e-06, -1.1387e-03, 0.0000e+00], [ 9.0837e-05, 0.0000e+00, 1.2839e-04, ..., 1.0990e-07, 8.0287e-05, 0.0000e+00], ..., [ 7.8827e-06, 0.0000e+00, 1.3566e-04, ..., 2.3469e-07, 1.9968e-04, 0.0000e+00], [ 2.4009e-04, 0.0000e+00, -1.1176e-04, ..., 2.1607e-07, 3.5644e-04, 0.0000e+00], [ 1.2875e-04, 0.0000e+00, -2.6059e-04, ..., 4.6007e-07, 1.1408e-04, 0.0000e+00]], device='cuda:0') Epoch 27, bias, value: tensor([-0.0038, 0.0279, 0.0062, 0.0042, 0.0051, -0.0086, 0.0203, -0.0245, 0.0244, 0.0047], device='cuda:0'), grad: tensor([-5.7173e-04, -2.7485e-03, 6.3372e-04, -4.7188e-03, 1.0300e-03, 4.5738e-03, 5.1320e-05, 8.1778e-04, 7.6199e-04, 1.6844e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 26---------------------------------------------------- epoch 26, time 218.42, cls_loss 0.0370 cls_loss_mapping 0.0525 cls_loss_causal 0.8234 re_mapping 0.0278 re_causal 0.0727 /// teacc 98.29 lr 0.00010000 Epoch 28, weight, value: tensor([[-0.0206, -0.0127, -0.0145, ..., -0.0603, 0.0286, 0.0219], [-0.0021, 0.0102, -0.0619, ..., 0.0110, 0.0065, 0.0126], [ 0.0332, -0.0039, -0.0310, ..., -0.0213, 0.0069, -0.0079], ..., [ 0.0168, -0.0272, 0.0049, ..., 0.0193, 0.0162, 0.0114], [ 0.0002, -0.0190, -0.0212, ..., -0.0014, -0.0128, 0.0295], [-0.0255, -0.0607, 0.0398, ..., -0.0389, -0.0230, -0.0004]], device='cuda:0'), grad: tensor([[ 1.9801e-04, 0.0000e+00, 1.0413e-04, ..., 4.6566e-08, 6.3300e-05, 0.0000e+00], [ 3.9062e-03, 0.0000e+00, 1.3137e-04, ..., -2.6841e-06, 4.3106e-04, 0.0000e+00], [-4.9362e-03, 0.0000e+00, -7.3195e-05, ..., 8.4937e-07, -1.2693e-03, 0.0000e+00], ..., [-2.3804e-03, 0.0000e+00, -2.9869e-03, ..., 1.0431e-07, -1.0738e-03, 0.0000e+00], [ 1.0395e-03, 0.0000e+00, 5.9652e-04, ..., 4.0792e-07, 3.7146e-04, 0.0000e+00], [ 2.0847e-03, 0.0000e+00, 3.2883e-03, ..., 2.0489e-08, 8.9359e-04, 0.0000e+00]], device='cuda:0') Epoch 28, bias, value: tensor([-0.0038, 0.0280, 0.0064, 0.0041, 0.0049, -0.0080, 0.0204, -0.0248, 0.0242, 0.0047], device='cuda:0'), grad: tensor([ 0.0017, 0.0159, -0.0332, 0.0026, -0.0008, 0.0006, 0.0009, -0.0072, 0.0081, 0.0114], device='cuda:0') 100 0.0001 changing lr epoch 27, time 217.22, cls_loss 0.0374 cls_loss_mapping 0.0512 cls_loss_causal 0.8379 re_mapping 0.0267 re_causal 0.0732 /// teacc 98.14 lr 0.00010000 Epoch 29, weight, value: tensor([[-0.0209, -0.0128, -0.0149, ..., -0.0606, 0.0283, 0.0219], [-0.0025, 0.0107, -0.0627, ..., 0.0110, 0.0073, 0.0126], [ 0.0328, -0.0042, -0.0315, ..., -0.0215, 0.0066, -0.0079], ..., [ 0.0173, -0.0273, 0.0052, ..., 0.0194, 0.0156, 0.0114], [-0.0002, -0.0196, -0.0214, ..., -0.0016, -0.0127, 0.0295], [-0.0266, -0.0618, 0.0400, ..., -0.0389, -0.0226, -0.0004]], device='cuda:0'), grad: tensor([[ 1.1474e-04, 2.0489e-08, 1.8191e-04, ..., 2.9244e-07, 1.1253e-04, 0.0000e+00], [ 4.4346e-04, 6.4261e-07, 2.1696e-04, ..., 2.5276e-06, 2.4587e-05, 0.0000e+00], [ 2.0778e-04, 4.6194e-07, 8.2016e-04, ..., 1.9707e-06, 9.1195e-05, 0.0000e+00], ..., [ 1.2541e-04, -1.9316e-06, 3.4761e-04, ..., -2.0005e-06, 4.7833e-05, 0.0000e+00], [ 4.6062e-04, 9.8720e-08, 6.0797e-04, ..., 1.9651e-06, -3.4833e-04, 0.0000e+00], [ 3.0637e-04, 2.3097e-07, 8.7786e-04, ..., 2.0489e-06, 1.5116e-04, 0.0000e+00]], device='cuda:0') Epoch 29, bias, value: tensor([-0.0038, 0.0281, 0.0061, 0.0046, 0.0048, -0.0084, 0.0205, -0.0246, 0.0241, 0.0046], device='cuda:0'), grad: tensor([ 5.0974e-04, 1.0605e-03, 1.8349e-03, -2.7809e-03, -2.4204e-03, -1.8835e-03, 1.2693e-03, 7.3528e-04, 8.3804e-05, 1.5898e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 28---------------------------------------------------- epoch 28, time 218.04, cls_loss 0.0366 cls_loss_mapping 0.0434 cls_loss_causal 0.8025 re_mapping 0.0265 re_causal 0.0670 /// teacc 98.47 lr 0.00010000 Epoch 30, weight, value: tensor([[-0.0212, -0.0130, -0.0154, ..., -0.0621, 0.0280, 0.0219], [-0.0031, 0.0109, -0.0637, ..., 0.0105, 0.0080, 0.0126], [ 0.0331, -0.0036, -0.0323, ..., -0.0217, 0.0064, -0.0079], ..., [ 0.0176, -0.0284, 0.0047, ..., 0.0189, 0.0152, 0.0114], [-0.0006, -0.0206, -0.0218, ..., -0.0004, -0.0124, 0.0295], [-0.0269, -0.0667, 0.0405, ..., -0.0392, -0.0221, -0.0004]], device='cuda:0'), grad: tensor([[ 9.5308e-05, 0.0000e+00, 1.9133e-05, ..., 0.0000e+00, 9.1851e-05, 0.0000e+00], [ 1.1027e-04, 0.0000e+00, 1.6570e-05, ..., 0.0000e+00, -6.4299e-06, 0.0000e+00], [ 4.9210e-04, 0.0000e+00, 2.5392e-05, ..., 0.0000e+00, 3.9411e-04, 0.0000e+00], ..., [-1.8203e-04, 0.0000e+00, -7.4729e-06, ..., 0.0000e+00, 2.4661e-05, 0.0000e+00], [ 2.7347e-04, 0.0000e+00, 1.6868e-04, ..., 0.0000e+00, -6.8092e-04, 0.0000e+00], [-1.5056e-04, 0.0000e+00, -3.8624e-04, ..., 0.0000e+00, -5.8889e-05, 0.0000e+00]], device='cuda:0') Epoch 30, bias, value: tensor([-0.0039, 0.0277, 0.0064, 0.0043, 0.0051, -0.0079, 0.0200, -0.0249, 0.0242, 0.0047], device='cuda:0'), grad: tensor([ 2.2161e-04, 1.6427e-04, 1.3714e-03, -5.7745e-04, 8.2445e-04, -5.4550e-04, 4.0054e-05, -2.6464e-04, -6.6137e-04, -5.7173e-04], device='cuda:0') 100 0.0001 changing lr epoch 29, time 217.56, cls_loss 0.0332 cls_loss_mapping 0.0423 cls_loss_causal 0.8212 re_mapping 0.0253 re_causal 0.0703 /// teacc 98.34 lr 0.00010000 Epoch 31, weight, value: tensor([[-0.0215, -0.0131, -0.0149, ..., -0.0626, 0.0278, 0.0219], [-0.0034, 0.0115, -0.0641, ..., 0.0104, 0.0086, 0.0126], [ 0.0336, -0.0041, -0.0330, ..., -0.0219, 0.0061, -0.0079], ..., [ 0.0178, -0.0280, 0.0046, ..., 0.0190, 0.0150, 0.0114], [-0.0012, -0.0212, -0.0222, ..., -0.0007, -0.0124, 0.0295], [-0.0278, -0.0695, 0.0408, ..., -0.0395, -0.0219, -0.0004]], device='cuda:0'), grad: tensor([[-7.0989e-05, 3.7253e-08, -8.7738e-05, ..., 3.7439e-07, 3.6150e-05, 0.0000e+00], [ 2.2411e-04, -1.8403e-06, 1.5962e-04, ..., 1.1474e-06, 9.7752e-05, 0.0000e+00], [-2.5868e-04, 4.3772e-07, 5.0992e-05, ..., 6.6496e-07, 6.1274e-05, 0.0000e+00], ..., [-6.8069e-05, -5.2527e-07, -2.1726e-05, ..., -1.6779e-05, 9.8199e-06, 0.0000e+00], [ 2.5764e-05, 1.5832e-07, 5.1200e-05, ..., 2.1979e-07, 3.4332e-05, 0.0000e+00], [ 7.9811e-05, 3.2037e-07, 5.6028e-04, ..., 1.0386e-05, 6.8806e-06, 0.0000e+00]], device='cuda:0') Epoch 31, bias, value: tensor([-0.0038, 0.0277, 0.0066, 0.0045, 0.0050, -0.0076, 0.0199, -0.0249, 0.0237, 0.0048], device='cuda:0'), grad: tensor([-2.2163e-03, 8.8215e-04, 1.9491e-05, 1.6749e-04, -8.3923e-04, 8.0490e-04, -4.4894e-04, 1.7673e-05, 3.0446e-04, 1.3094e-03], device='cuda:0') 100 0.0001 changing lr epoch 30, time 217.41, cls_loss 0.0350 cls_loss_mapping 0.0488 cls_loss_causal 0.7968 re_mapping 0.0245 re_causal 0.0674 /// teacc 97.97 lr 0.00010000 Epoch 32, weight, value: tensor([[-0.0212, -0.0132, -0.0150, ..., -0.0627, 0.0275, 0.0219], [-0.0046, 0.0114, -0.0647, ..., 0.0103, 0.0087, 0.0126], [ 0.0341, -0.0038, -0.0330, ..., -0.0221, 0.0058, -0.0079], ..., [ 0.0181, -0.0281, 0.0045, ..., 0.0193, 0.0153, 0.0114], [-0.0012, -0.0216, -0.0222, ..., -0.0006, -0.0122, 0.0295], [-0.0295, -0.0710, 0.0408, ..., -0.0396, -0.0222, -0.0004]], device='cuda:0'), grad: tensor([[ 4.1604e-05, 2.1793e-07, 3.0303e-04, ..., 0.0000e+00, 1.8805e-05, 0.0000e+00], [ 6.8367e-05, 1.9558e-06, -6.0940e-04, ..., 0.0000e+00, -3.4356e-04, 0.0000e+00], [-2.9182e-04, 2.5127e-06, 1.1837e-04, ..., 0.0000e+00, -9.7334e-05, 0.0000e+00], ..., [-1.2648e-04, -9.6485e-06, 2.4891e-04, ..., 0.0000e+00, 1.1402e-04, 0.0000e+00], [ 1.9145e-04, 6.1281e-07, 4.1699e-04, ..., 0.0000e+00, 1.5914e-04, 0.0000e+00], [-8.3590e-04, 4.8243e-07, -1.3245e-02, ..., 0.0000e+00, -1.0490e-03, 0.0000e+00]], device='cuda:0') Epoch 32, bias, value: tensor([-0.0040, 0.0270, 0.0071, 0.0045, 0.0047, -0.0075, 0.0204, -0.0248, 0.0241, 0.0044], device='cuda:0'), grad: tensor([ 0.0004, -0.0014, -0.0006, 0.0012, 0.0153, 0.0024, 0.0020, 0.0004, 0.0011, -0.0207], device='cuda:0') 100 0.0001 changing lr epoch 31, time 217.14, cls_loss 0.0289 cls_loss_mapping 0.0428 cls_loss_causal 0.8166 re_mapping 0.0234 re_causal 0.0661 /// teacc 97.91 lr 0.00010000 Epoch 33, weight, value: tensor([[-0.0214, -0.0132, -0.0152, ..., -0.0628, 0.0274, 0.0219], [-0.0048, 0.0114, -0.0644, ..., 0.0109, 0.0093, 0.0126], [ 0.0341, -0.0038, -0.0331, ..., -0.0221, 0.0053, -0.0079], ..., [ 0.0187, -0.0280, 0.0044, ..., 0.0194, 0.0150, 0.0114], [-0.0014, -0.0218, -0.0225, ..., -0.0006, -0.0120, 0.0295], [-0.0298, -0.0713, 0.0406, ..., -0.0400, -0.0220, -0.0004]], device='cuda:0'), grad: tensor([[-6.7186e-04, 0.0000e+00, 6.1020e-06, ..., 0.0000e+00, -2.2709e-04, 0.0000e+00], [-1.3947e-04, 0.0000e+00, 2.1249e-05, ..., 0.0000e+00, -7.4196e-04, 0.0000e+00], [ 6.3801e-04, 0.0000e+00, 1.8165e-05, ..., 0.0000e+00, 3.7980e-04, 0.0000e+00], ..., [-5.5361e-04, 0.0000e+00, 2.7210e-05, ..., 0.0000e+00, 7.0453e-05, 0.0000e+00], [ 6.3300e-05, 0.0000e+00, -5.3346e-05, ..., 0.0000e+00, 1.2684e-04, 0.0000e+00], [ 1.3009e-05, 0.0000e+00, -1.7568e-05, ..., 0.0000e+00, -8.2791e-05, 0.0000e+00]], device='cuda:0') Epoch 33, bias, value: tensor([-0.0037, 0.0273, 0.0070, 0.0044, 0.0052, -0.0077, 0.0198, -0.0251, 0.0244, 0.0042], device='cuda:0'), grad: tensor([-4.0855e-03, -8.3303e-04, 1.0691e-03, 1.0443e-03, 2.8133e-04, -1.4186e-04, 2.7294e-03, -1.8871e-04, 6.5625e-05, 5.9724e-05], device='cuda:0') 100 0.0001 changing lr epoch 32, time 217.28, cls_loss 0.0318 cls_loss_mapping 0.0407 cls_loss_causal 0.7904 re_mapping 0.0228 re_causal 0.0645 /// teacc 98.26 lr 0.00010000 Epoch 34, weight, value: tensor([[-0.0207, -0.0132, -0.0152, ..., -0.0629, 0.0278, 0.0213], [-0.0056, 0.0116, -0.0641, ..., 0.0116, 0.0101, 0.0051], [ 0.0342, -0.0039, -0.0335, ..., -0.0221, 0.0048, -0.0112], ..., [ 0.0189, -0.0280, 0.0043, ..., 0.0194, 0.0145, 0.0135], [-0.0010, -0.0219, -0.0227, ..., -0.0006, -0.0121, 0.0260], [-0.0304, -0.0714, 0.0409, ..., -0.0401, -0.0215, -0.0013]], device='cuda:0'), grad: tensor([[ 1.5482e-05, 0.0000e+00, 3.5241e-06, ..., 0.0000e+00, 1.1146e-05, 0.0000e+00], [ 1.8328e-05, 0.0000e+00, 1.4216e-05, ..., 0.0000e+00, -2.6524e-05, 0.0000e+00], [-1.5581e-04, 0.0000e+00, 6.1542e-06, ..., 0.0000e+00, -5.9605e-08, 0.0000e+00], ..., [ 3.0220e-05, 0.0000e+00, 1.3852e-04, ..., 0.0000e+00, 1.9014e-04, 0.0000e+00], [ 1.1468e-04, 0.0000e+00, 1.1104e-04, ..., 0.0000e+00, 7.5102e-05, 0.0000e+00], [ 5.3495e-05, 0.0000e+00, -2.1905e-05, ..., 0.0000e+00, -2.3496e-04, 0.0000e+00]], device='cuda:0') Epoch 34, bias, value: tensor([-0.0036, 0.0275, 0.0067, 0.0043, 0.0046, -0.0077, 0.0200, -0.0251, 0.0245, 0.0046], device='cuda:0'), grad: tensor([-2.5272e-05, 1.2353e-05, -2.8682e-04, 1.2231e-04, -2.1112e-04, -3.1734e-04, -3.1590e-05, 4.6444e-04, 4.1318e-04, -1.4067e-04], device='cuda:0') 100 0.0001 changing lr epoch 33, time 217.36, cls_loss 0.0300 cls_loss_mapping 0.0402 cls_loss_causal 0.7865 re_mapping 0.0223 re_causal 0.0598 /// teacc 98.47 lr 0.00010000 Epoch 35, weight, value: tensor([[-0.0213, -0.0133, -0.0151, ..., -0.0633, 0.0277, 0.0212], [-0.0064, 0.0115, -0.0647, ..., 0.0116, 0.0104, 0.0050], [ 0.0345, -0.0037, -0.0340, ..., -0.0221, 0.0044, -0.0113], ..., [ 0.0197, -0.0281, 0.0043, ..., 0.0200, 0.0144, 0.0136], [-0.0013, -0.0219, -0.0228, ..., -0.0007, -0.0119, 0.0259], [-0.0314, -0.0715, 0.0409, ..., -0.0402, -0.0214, -0.0013]], device='cuda:0'), grad: tensor([[ 5.9217e-05, 0.0000e+00, 8.9183e-06, ..., 3.8184e-07, 4.2111e-05, 0.0000e+00], [ 6.6757e-05, 0.0000e+00, 1.1727e-05, ..., 3.6694e-07, -6.7115e-05, 0.0000e+00], [ 6.3002e-05, 0.0000e+00, 1.0349e-05, ..., 9.0338e-07, 3.3230e-05, 0.0000e+00], ..., [-3.2973e-04, 0.0000e+00, 1.2197e-05, ..., 1.5423e-06, 8.5086e-06, 0.0000e+00], [ 8.0228e-05, 0.0000e+00, 3.7044e-05, ..., 6.1430e-06, 3.6687e-05, 0.0000e+00], [ 1.8787e-04, 0.0000e+00, 3.0732e-04, ..., 4.8615e-06, -6.1095e-05, 0.0000e+00]], device='cuda:0') Epoch 35, bias, value: tensor([-0.0034, 0.0271, 0.0068, 0.0039, 0.0048, -0.0075, 0.0202, -0.0246, 0.0249, 0.0040], device='cuda:0'), grad: tensor([ 1.2267e-04, -1.3083e-05, 1.1319e-04, -8.4782e-04, -4.7731e-04, 7.5340e-04, -6.8307e-05, -3.9458e-04, 1.8954e-04, 6.2323e-04], device='cuda:0') 100 0.0001 changing lr epoch 34, time 217.43, cls_loss 0.0290 cls_loss_mapping 0.0377 cls_loss_causal 0.7935 re_mapping 0.0221 re_causal 0.0606 /// teacc 98.46 lr 0.00010000 Epoch 36, weight, value: tensor([[-0.0215, -0.0133, -0.0156, ..., -0.0633, 0.0272, 0.0212], [-0.0070, 0.0115, -0.0653, ..., 0.0117, 0.0107, 0.0012], [ 0.0346, -0.0037, -0.0343, ..., -0.0222, 0.0038, -0.0143], ..., [ 0.0203, -0.0281, 0.0042, ..., 0.0201, 0.0145, 0.0167], [-0.0013, -0.0220, -0.0233, ..., -0.0007, -0.0117, 0.0255], [-0.0322, -0.0715, 0.0413, ..., -0.0402, -0.0213, -0.0013]], device='cuda:0'), grad: tensor([[-9.2566e-05, 0.0000e+00, -6.7890e-05, ..., 0.0000e+00, -1.5020e-04, 2.5146e-07], [ 1.9088e-05, 0.0000e+00, 8.3447e-06, ..., 0.0000e+00, -1.4699e-04, 5.5507e-07], [-8.8394e-05, 0.0000e+00, 1.3910e-05, ..., 0.0000e+00, 3.2872e-05, 3.9116e-08], ..., [-5.6922e-06, 0.0000e+00, 3.2634e-05, ..., 0.0000e+00, 1.2732e-04, -3.9786e-06], [ 1.0467e-04, 0.0000e+00, 2.1607e-05, ..., 0.0000e+00, 1.4806e-04, 5.2154e-08], [ 1.1663e-03, 0.0000e+00, 5.0694e-05, ..., 0.0000e+00, 1.3676e-03, 2.2352e-06]], device='cuda:0') Epoch 36, bias, value: tensor([-0.0035, 0.0269, 0.0066, 0.0038, 0.0050, -0.0077, 0.0205, -0.0244, 0.0250, 0.0038], device='cuda:0'), grad: tensor([-7.6962e-04, -1.7488e-04, -9.6112e-06, -2.7161e-03, 1.9491e-05, 3.1209e-04, 2.2793e-04, 2.0015e-04, 3.3307e-04, 2.5768e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 35---------------------------------------------------- epoch 35, time 218.07, cls_loss 0.0257 cls_loss_mapping 0.0350 cls_loss_causal 0.7962 re_mapping 0.0212 re_causal 0.0601 /// teacc 98.51 lr 0.00010000 Epoch 37, weight, value: tensor([[-0.0213, -0.0133, -0.0156, ..., -0.0634, 0.0269, 0.0212], [-0.0077, 0.0115, -0.0658, ..., 0.0117, 0.0118, -0.0022], [ 0.0351, -0.0037, -0.0346, ..., -0.0222, 0.0034, -0.0154], ..., [ 0.0205, -0.0281, 0.0044, ..., 0.0203, 0.0138, 0.0195], [-0.0016, -0.0220, -0.0236, ..., -0.0007, -0.0118, 0.0251], [-0.0328, -0.0715, 0.0414, ..., -0.0403, -0.0207, -0.0014]], device='cuda:0'), grad: tensor([[ 1.1921e-04, 0.0000e+00, 2.6897e-05, ..., 0.0000e+00, 4.2379e-05, 0.0000e+00], [ 1.5955e-03, 0.0000e+00, 5.2512e-05, ..., 0.0000e+00, 4.6206e-04, 0.0000e+00], [-2.1667e-03, 0.0000e+00, 1.1832e-05, ..., 0.0000e+00, -4.3035e-04, 0.0000e+00], ..., [ 2.1863e-04, 0.0000e+00, 5.2023e-04, ..., 0.0000e+00, 3.4404e-04, 0.0000e+00], [ 5.0497e-04, 0.0000e+00, 2.1124e-04, ..., 0.0000e+00, 1.7059e-04, 0.0000e+00], [-2.3887e-05, 0.0000e+00, -9.7942e-04, ..., 0.0000e+00, -7.6103e-04, 0.0000e+00]], device='cuda:0') Epoch 37, bias, value: tensor([-0.0031, 0.0273, 0.0069, 0.0038, 0.0047, -0.0076, 0.0200, -0.0246, 0.0248, 0.0040], device='cuda:0'), grad: tensor([ 0.0002, 0.0050, -0.0057, 0.0034, 0.0006, -0.0041, 0.0001, 0.0013, 0.0011, -0.0020], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 36---------------------------------------------------- epoch 36, time 218.13, cls_loss 0.0268 cls_loss_mapping 0.0352 cls_loss_causal 0.7471 re_mapping 0.0214 re_causal 0.0572 /// teacc 98.63 lr 0.00010000 Epoch 38, weight, value: tensor([[-0.0217, -0.0133, -0.0153, ..., -0.0639, 0.0265, 0.0212], [-0.0086, 0.0113, -0.0664, ..., 0.0117, 0.0120, -0.0027], [ 0.0356, -0.0033, -0.0352, ..., -0.0223, 0.0031, -0.0160], ..., [ 0.0208, -0.0283, 0.0040, ..., 0.0208, 0.0135, 0.0202], [-0.0017, -0.0222, -0.0241, ..., -0.0008, -0.0119, 0.0249], [-0.0337, -0.0716, 0.0412, ..., -0.0407, -0.0203, -0.0014]], device='cuda:0'), grad: tensor([[ 4.2975e-05, 0.0000e+00, 3.8356e-05, ..., 0.0000e+00, 1.5348e-05, 0.0000e+00], [ 1.4806e-04, 0.0000e+00, 4.3094e-05, ..., 0.0000e+00, -3.4600e-05, 0.0000e+00], [ 1.9860e-04, 0.0000e+00, 6.9320e-05, ..., 0.0000e+00, 7.6115e-05, 0.0000e+00], ..., [-9.0742e-04, 0.0000e+00, -1.0812e-04, ..., 0.0000e+00, -7.3731e-05, 0.0000e+00], [ 1.5688e-04, 0.0000e+00, 2.8682e-04, ..., 0.0000e+00, 1.5330e-04, 0.0000e+00], [-4.9829e-05, 0.0000e+00, -3.7217e-04, ..., 0.0000e+00, -2.8253e-04, 0.0000e+00]], device='cuda:0') Epoch 38, bias, value: tensor([-0.0029, 0.0267, 0.0069, 0.0039, 0.0051, -0.0077, 0.0204, -0.0243, 0.0246, 0.0036], device='cuda:0'), grad: tensor([ 5.4747e-05, 2.1505e-04, 3.8195e-04, 6.9141e-04, 2.5535e-04, -5.3883e-04, 2.3174e-04, -1.3437e-03, 6.3276e-04, -5.8079e-04], device='cuda:0') 100 0.0001 changing lr epoch 37, time 217.49, cls_loss 0.0266 cls_loss_mapping 0.0349 cls_loss_causal 0.7600 re_mapping 0.0200 re_causal 0.0567 /// teacc 98.43 lr 0.00010000 Epoch 39, weight, value: tensor([[-0.0225, -0.0134, -0.0157, ..., -0.0642, 0.0258, 0.0212], [-0.0089, 0.0112, -0.0672, ..., 0.0116, 0.0124, -0.0028], [ 0.0359, -0.0031, -0.0360, ..., -0.0225, 0.0028, -0.0160], ..., [ 0.0210, -0.0286, 0.0036, ..., 0.0205, 0.0130, 0.0203], [-0.0019, -0.0223, -0.0243, ..., -0.0009, -0.0117, 0.0249], [-0.0341, -0.0718, 0.0415, ..., -0.0407, -0.0194, -0.0014]], device='cuda:0'), grad: tensor([[-2.5177e-04, 0.0000e+00, -1.1501e-03, ..., 7.4506e-08, -1.1665e-04, 0.0000e+00], [ 1.7837e-05, 0.0000e+00, 6.3360e-05, ..., 1.5087e-07, -1.1736e-04, 0.0000e+00], [-5.0992e-05, 0.0000e+00, 1.1599e-04, ..., 2.6450e-07, 2.2128e-05, 0.0000e+00], ..., [ 7.1466e-05, 0.0000e+00, 2.2244e-04, ..., 7.7859e-07, 1.9407e-04, 0.0000e+00], [ 1.2863e-04, 0.0000e+00, 3.6979e-04, ..., 1.5087e-07, 2.0814e-04, 0.0000e+00], [ 7.9632e-05, 0.0000e+00, 9.6703e-04, ..., 2.8312e-07, -3.3236e-04, 0.0000e+00]], device='cuda:0') Epoch 39, bias, value: tensor([-0.0038, 0.0265, 0.0064, 0.0037, 0.0051, -0.0071, 0.0204, -0.0245, 0.0249, 0.0043], device='cuda:0'), grad: tensor([-2.2202e-03, -3.1859e-05, 1.6952e-04, -2.0468e-04, -1.1339e-03, 4.4107e-04, 2.9254e-04, 6.3801e-04, 8.7500e-04, 1.1740e-03], device='cuda:0') 100 0.0001 changing lr epoch 38, time 217.23, cls_loss 0.0263 cls_loss_mapping 0.0371 cls_loss_causal 0.7553 re_mapping 0.0206 re_causal 0.0557 /// teacc 98.47 lr 0.00010000 Epoch 40, weight, value: tensor([[-0.0214, -0.0135, -0.0156, ..., -0.0643, 0.0260, 0.0212], [-0.0097, 0.0114, -0.0677, ..., 0.0116, 0.0128, -0.0036], [ 0.0359, -0.0032, -0.0363, ..., -0.0226, 0.0023, -0.0161], ..., [ 0.0220, -0.0287, 0.0035, ..., 0.0212, 0.0127, 0.0204], [-0.0022, -0.0224, -0.0251, ..., -0.0010, -0.0117, 0.0249], [-0.0352, -0.0721, 0.0420, ..., -0.0409, -0.0191, -0.0014]], device='cuda:0'), grad: tensor([[ 7.6145e-06, 0.0000e+00, 3.9190e-05, ..., 0.0000e+00, -6.0111e-05, 0.0000e+00], [ 5.1069e-04, 0.0000e+00, 3.2663e-05, ..., 0.0000e+00, 1.2982e-04, 0.0000e+00], [ 5.7936e-04, 0.0000e+00, 1.4983e-05, ..., 0.0000e+00, 4.1693e-05, 0.0000e+00], ..., [-1.4238e-03, 0.0000e+00, 2.0385e-04, ..., 0.0000e+00, 1.4901e-07, 0.0000e+00], [ 1.3757e-04, 0.0000e+00, 1.2898e-04, ..., 0.0000e+00, 2.5320e-04, 0.0000e+00], [ 2.9755e-04, 0.0000e+00, -1.0824e-03, ..., 0.0000e+00, 3.3617e-05, 0.0000e+00]], device='cuda:0') Epoch 40, bias, value: tensor([-0.0032, 0.0264, 0.0060, 0.0038, 0.0051, -0.0071, 0.0203, -0.0242, 0.0246, 0.0043], device='cuda:0'), grad: tensor([-0.0004, 0.0009, 0.0009, -0.0002, 0.0012, 0.0003, -0.0004, -0.0015, 0.0009, -0.0017], device='cuda:0') 100 0.0001 changing lr epoch 39, time 217.27, cls_loss 0.0205 cls_loss_mapping 0.0267 cls_loss_causal 0.7271 re_mapping 0.0203 re_causal 0.0569 /// teacc 98.48 lr 0.00010000 Epoch 41, weight, value: tensor([[-0.0212, -0.0136, -0.0148, ..., -0.0649, 0.0257, 0.0211], [-0.0100, 0.0114, -0.0680, ..., 0.0118, 0.0136, -0.0063], [ 0.0361, -0.0032, -0.0363, ..., -0.0230, 0.0018, -0.0191], ..., [ 0.0217, -0.0287, 0.0034, ..., 0.0215, 0.0125, 0.0233], [-0.0023, -0.0225, -0.0255, ..., -0.0010, -0.0118, 0.0228], [-0.0360, -0.0722, 0.0422, ..., -0.0411, -0.0187, -0.0014]], device='cuda:0'), grad: tensor([[ 7.8857e-05, 0.0000e+00, 2.6524e-05, ..., 0.0000e+00, 7.1645e-05, 7.6927e-07], [ 4.2826e-05, 0.0000e+00, 3.3021e-05, ..., 0.0000e+00, 4.4823e-05, 8.7917e-07], [-2.6917e-04, 0.0000e+00, 2.1899e-04, ..., 0.0000e+00, 6.7353e-05, 1.0021e-05], ..., [-6.6566e-04, 0.0000e+00, -1.1083e-06, ..., 0.0000e+00, -4.2379e-05, -2.4244e-05], [ 1.4257e-04, 0.0000e+00, 6.3479e-06, ..., 0.0000e+00, 2.2042e-04, 6.6869e-07], [ 1.9717e-04, 0.0000e+00, -5.4455e-04, ..., 0.0000e+00, -2.4307e-04, 8.1956e-07]], device='cuda:0') Epoch 41, bias, value: tensor([-0.0029, 0.0266, 0.0060, 0.0037, 0.0049, -0.0067, 0.0204, -0.0242, 0.0243, 0.0041], device='cuda:0'), grad: tensor([ 1.5390e-04, 1.7715e-04, 1.4389e-04, 8.8358e-04, 2.3401e-04, 1.8346e-04, -3.2663e-04, -9.5177e-04, -5.2661e-05, -4.4346e-04], device='cuda:0') 100 0.0001 changing lr epoch 40, time 217.22, cls_loss 0.0278 cls_loss_mapping 0.0355 cls_loss_causal 0.7402 re_mapping 0.0194 re_causal 0.0535 /// teacc 98.50 lr 0.00010000 Epoch 42, weight, value: tensor([[-0.0217, -0.0137, -0.0155, ..., -0.0652, 0.0252, 0.0212], [-0.0104, 0.0124, -0.0687, ..., 0.0118, 0.0139, -0.0082], [ 0.0365, -0.0038, -0.0374, ..., -0.0231, 0.0014, -0.0194], ..., [ 0.0220, -0.0299, 0.0028, ..., 0.0216, 0.0117, 0.0233], [-0.0028, -0.0230, -0.0258, ..., -0.0010, -0.0123, 0.0218], [-0.0360, -0.0725, 0.0420, ..., -0.0412, -0.0172, -0.0015]], device='cuda:0'), grad: tensor([[ 1.2231e-04, 0.0000e+00, 1.9178e-05, ..., 2.5705e-07, 3.7551e-05, 0.0000e+00], [-2.3580e-04, 0.0000e+00, -1.3723e-03, ..., 5.2713e-07, -1.8854e-03, 0.0000e+00], [ 7.8058e-04, 0.0000e+00, 8.6844e-05, ..., 8.9034e-07, 8.6546e-05, 0.0000e+00], ..., [ 2.9421e-04, 0.0000e+00, 1.8239e-04, ..., -4.6901e-06, 1.4985e-04, 0.0000e+00], [ 2.9325e-04, 0.0000e+00, 1.4365e-04, ..., 9.1270e-08, 1.1625e-03, 0.0000e+00], [ 4.1318e-04, 0.0000e+00, 1.2839e-04, ..., 3.1665e-07, 3.6240e-04, 0.0000e+00]], device='cuda:0') Epoch 42, bias, value: tensor([-0.0038, 0.0265, 0.0060, 0.0041, 0.0053, -0.0069, 0.0206, -0.0245, 0.0242, 0.0043], device='cuda:0'), grad: tensor([ 0.0002, -0.0046, 0.0013, -0.0137, 0.0022, 0.0096, 0.0004, 0.0008, 0.0024, 0.0013], device='cuda:0') 100 0.0001 changing lr epoch 41, time 217.42, cls_loss 0.0263 cls_loss_mapping 0.0359 cls_loss_causal 0.7653 re_mapping 0.0191 re_causal 0.0547 /// teacc 98.61 lr 0.00010000 Epoch 43, weight, value: tensor([[-0.0218, -0.0137, -0.0155, ..., -0.0653, 0.0249, 0.0212], [-0.0102, 0.0125, -0.0683, ..., 0.0121, 0.0146, -0.0108], [ 0.0366, -0.0037, -0.0374, ..., -0.0231, 0.0007, -0.0196], ..., [ 0.0223, -0.0300, 0.0030, ..., 0.0220, 0.0110, 0.0240], [-0.0028, -0.0232, -0.0262, ..., -0.0011, -0.0122, 0.0206], [-0.0373, -0.0726, 0.0423, ..., -0.0409, -0.0166, -0.0016]], device='cuda:0'), grad: tensor([[ 1.1790e-04, 0.0000e+00, -1.4424e-04, ..., 7.0035e-07, -5.1409e-05, 1.6764e-08], [ 5.7489e-05, 0.0000e+00, 4.3094e-05, ..., 3.3993e-06, -4.2230e-05, 1.1176e-08], [-2.4452e-03, 0.0000e+00, -6.9332e-04, ..., 2.9132e-06, 2.3738e-05, -4.3213e-07], ..., [ 1.6584e-03, 0.0000e+00, 1.9569e-03, ..., 1.1140e-04, 2.1420e-07, 2.7008e-07], [ 1.0443e-04, 0.0000e+00, 4.3392e-05, ..., 1.0524e-06, 6.2585e-05, 5.9605e-08], [ 1.3113e-04, 0.0000e+00, 5.0783e-04, ..., 5.1931e-06, 3.3259e-05, 1.8626e-09]], device='cuda:0') Epoch 43, bias, value: tensor([-0.0035, 0.0271, 0.0062, 0.0038, 0.0049, -0.0061, 0.0194, -0.0246, 0.0241, 0.0045], device='cuda:0'), grad: tensor([-5.6553e-04, 4.1902e-05, -3.3340e-03, 4.6396e-04, -1.9264e-03, 7.8506e-03, -7.3128e-03, 3.6144e-03, 2.8491e-04, 8.7690e-04], device='cuda:0') 100 0.0001 changing lr epoch 42, time 217.23, cls_loss 0.0251 cls_loss_mapping 0.0325 cls_loss_causal 0.7533 re_mapping 0.0186 re_causal 0.0523 /// teacc 98.16 lr 0.00010000 Epoch 44, weight, value: tensor([[-0.0216, -0.0139, -0.0156, ..., -0.0655, 0.0245, 0.0213], [-0.0109, 0.0137, -0.0686, ..., 0.0123, 0.0155, -0.0119], [ 0.0368, -0.0045, -0.0379, ..., -0.0232, 0.0003, -0.0195], ..., [ 0.0226, -0.0307, 0.0028, ..., 0.0224, 0.0106, 0.0241], [-0.0035, -0.0238, -0.0263, ..., -0.0011, -0.0123, 0.0202], [-0.0377, -0.0730, 0.0421, ..., -0.0409, -0.0160, -0.0016]], device='cuda:0'), grad: tensor([[-1.4746e-04, 0.0000e+00, 1.7494e-05, ..., 1.8626e-09, 1.7494e-05, 1.1362e-07], [ 3.8475e-05, 0.0000e+00, 2.7761e-05, ..., -2.1979e-07, -1.2405e-05, 3.2410e-07], [ 3.8415e-05, 0.0000e+00, 1.3888e-05, ..., 1.8626e-08, 1.3970e-05, 3.3528e-08], ..., [ 5.1558e-05, 0.0000e+00, 9.4235e-05, ..., 1.1735e-07, 1.3721e-04, -5.2080e-06], [-4.7445e-04, 0.0000e+00, -2.2697e-04, ..., 3.3528e-08, -2.0921e-05, 9.4995e-08], [-5.0604e-05, 0.0000e+00, -3.7098e-04, ..., 1.3039e-08, -3.2878e-04, 4.0196e-06]], device='cuda:0') Epoch 44, bias, value: tensor([-0.0039, 0.0273, 0.0063, 0.0042, 0.0051, -0.0066, 0.0193, -0.0245, 0.0241, 0.0044], device='cuda:0'), grad: tensor([-0.0003, 0.0001, 0.0003, 0.0012, 0.0005, -0.0001, 0.0003, 0.0008, -0.0011, -0.0016], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 43---------------------------------------------------- epoch 43, time 218.05, cls_loss 0.0216 cls_loss_mapping 0.0296 cls_loss_causal 0.7488 re_mapping 0.0182 re_causal 0.0537 /// teacc 98.70 lr 0.00010000 Epoch 45, weight, value: tensor([[-2.1662e-02, -1.3942e-02, -1.5540e-02, ..., -6.5847e-02, 2.4035e-02, 2.0561e-02], [-1.1763e-02, 1.3926e-02, -6.9201e-02, ..., 1.3015e-02, 1.5959e-02, -1.9042e-02], [ 3.7467e-02, -4.6319e-03, -3.7659e-02, ..., -2.3430e-02, 3.5998e-05, -1.6238e-02], ..., [ 2.2948e-02, -3.1033e-02, 2.3940e-03, ..., 2.2459e-02, 1.0317e-02, 2.4452e-02], [-3.4721e-03, -2.3909e-02, -2.6404e-02, ..., -1.1909e-03, -1.2683e-02, 1.4079e-02], [-3.8343e-02, -7.3186e-02, 4.2162e-02, ..., -4.1136e-02, -1.5362e-02, -1.8112e-03]], device='cuda:0'), grad: tensor([[ 1.6704e-05, 0.0000e+00, 7.9453e-05, ..., 1.8626e-08, 9.8169e-05, -5.2266e-06], [ 5.4359e-05, 0.0000e+00, 2.2650e-06, ..., -2.9579e-06, -9.3818e-05, 4.8522e-07], [ 7.5674e-04, 0.0000e+00, 3.5539e-06, ..., 6.1467e-08, 2.9892e-05, 1.1362e-06], ..., [ 1.8448e-02, 0.0000e+00, -6.4559e-06, ..., -4.6752e-07, 1.9878e-05, 1.4808e-06], [ 1.7715e-04, 0.0000e+00, 7.3731e-05, ..., 1.0990e-06, 1.0115e-04, 4.2468e-07], [ 1.3864e-04, 0.0000e+00, 4.4018e-05, ..., 1.0347e-06, 2.5824e-05, 1.1446e-06]], device='cuda:0') Epoch 45, bias, value: tensor([-0.0041, 0.0270, 0.0068, 0.0040, 0.0051, -0.0067, 0.0195, -0.0247, 0.0241, 0.0047], device='cuda:0'), grad: tensor([ 3.6454e-04, -7.8022e-05, 7.7581e-04, -2.2430e-02, 1.1027e-04, 8.7309e-04, -1.9178e-03, 2.1240e-02, 7.0715e-04, 3.5405e-04], device='cuda:0') 100 0.0001 changing lr epoch 44, time 217.20, cls_loss 0.0268 cls_loss_mapping 0.0348 cls_loss_causal 0.7402 re_mapping 0.0183 re_causal 0.0492 /// teacc 98.68 lr 0.00010000 Epoch 46, weight, value: tensor([[-0.0223, -0.0140, -0.0157, ..., -0.0668, 0.0235, 0.0201], [-0.0123, 0.0147, -0.0699, ..., 0.0135, 0.0164, -0.0214], [ 0.0382, -0.0053, -0.0381, ..., -0.0237, -0.0004, -0.0166], ..., [ 0.0234, -0.0312, 0.0023, ..., 0.0235, 0.0099, 0.0263], [-0.0044, -0.0240, -0.0268, ..., -0.0015, -0.0125, 0.0133], [-0.0390, -0.0734, 0.0420, ..., -0.0419, -0.0142, -0.0015]], device='cuda:0'), grad: tensor([[ 1.0669e-05, 0.0000e+00, 4.6045e-06, ..., 2.3190e-07, 1.3091e-05, 0.0000e+00], [ 1.4126e-04, 0.0000e+00, 4.9621e-06, ..., 7.6089e-07, 2.8700e-05, 0.0000e+00], [ 3.1638e-04, 0.0000e+00, 1.7406e-06, ..., 1.5926e-07, 3.5882e-05, 0.0000e+00], ..., [-6.7902e-04, 0.0000e+00, 1.1735e-07, ..., -2.6859e-06, -8.9183e-06, 0.0000e+00], [ 2.8312e-05, 0.0000e+00, 1.7777e-05, ..., 8.1025e-08, 2.1660e-04, 0.0000e+00], [ 6.2466e-05, 0.0000e+00, -4.7497e-06, ..., 2.1793e-07, -2.8229e-04, 0.0000e+00]], device='cuda:0') Epoch 46, bias, value: tensor([-0.0042, 0.0267, 0.0071, 0.0033, 0.0050, -0.0061, 0.0196, -0.0244, 0.0241, 0.0045], device='cuda:0'), grad: tensor([-6.1035e-04, 3.5381e-04, 7.6532e-04, 4.0317e-04, 6.7353e-05, 3.9721e-04, 1.4015e-05, -9.4509e-04, -1.1098e-04, -3.3498e-04], device='cuda:0') 100 0.0001 changing lr epoch 45, time 217.33, cls_loss 0.0208 cls_loss_mapping 0.0290 cls_loss_causal 0.7351 re_mapping 0.0180 re_causal 0.0513 /// teacc 98.69 lr 0.00010000 Epoch 47, weight, value: tensor([[-0.0228, -0.0141, -0.0160, ..., -0.0670, 0.0233, 0.0196], [-0.0126, 0.0150, -0.0703, ..., 0.0137, 0.0169, -0.0235], [ 0.0384, -0.0053, -0.0385, ..., -0.0238, -0.0008, -0.0166], ..., [ 0.0239, -0.0319, 0.0024, ..., 0.0238, 0.0096, 0.0267], [-0.0047, -0.0242, -0.0272, ..., -0.0015, -0.0126, 0.0130], [-0.0400, -0.0736, 0.0419, ..., -0.0418, -0.0144, -0.0011]], device='cuda:0'), grad: tensor([[ 8.3447e-06, 1.8626e-08, 1.8269e-05, ..., 1.1269e-07, 1.8924e-05, 1.1893e-06], [ 1.6347e-05, 8.1956e-08, 1.2800e-05, ..., -6.3181e-06, -5.4389e-05, 8.4285e-07], [-1.5616e-05, -6.2399e-07, 4.8965e-05, ..., 1.2694e-06, 4.0561e-05, -6.8657e-06], ..., [-2.3615e-04, 4.6566e-07, 1.6987e-05, ..., 1.6680e-06, -8.1837e-05, -3.3863e-06], [ 3.1978e-05, 1.6764e-08, -4.4346e-05, ..., 5.5321e-07, -6.6936e-05, 4.3400e-06], [ 1.6749e-04, 2.7940e-09, 6.3753e-04, ..., 3.1479e-07, 7.5817e-05, 2.1495e-06]], device='cuda:0') Epoch 47, bias, value: tensor([-0.0041, 0.0264, 0.0074, 0.0033, 0.0052, -0.0056, 0.0195, -0.0239, 0.0235, 0.0040], device='cuda:0'), grad: tensor([-3.1424e-04, 1.0319e-06, 1.4210e-04, 1.0484e-04, -1.3399e-03, 1.2827e-04, 2.6202e-04, -4.3726e-04, -2.3150e-04, 1.6851e-03], device='cuda:0') 100 0.0001 changing lr epoch 46, time 217.28, cls_loss 0.0208 cls_loss_mapping 0.0304 cls_loss_causal 0.7148 re_mapping 0.0173 re_causal 0.0485 /// teacc 98.64 lr 0.00010000 Epoch 48, weight, value: tensor([[-0.0234, -0.0142, -0.0162, ..., -0.0672, 0.0230, 0.0196], [-0.0131, 0.0151, -0.0706, ..., 0.0152, 0.0175, -0.0253], [ 0.0389, -0.0050, -0.0388, ..., -0.0239, -0.0016, -0.0169], ..., [ 0.0238, -0.0329, 0.0023, ..., 0.0241, 0.0090, 0.0277], [-0.0052, -0.0244, -0.0277, ..., -0.0017, -0.0124, 0.0122], [-0.0410, -0.0738, 0.0418, ..., -0.0422, -0.0139, -0.0014]], device='cuda:0'), grad: tensor([[ 1.7062e-05, 0.0000e+00, 2.8804e-05, ..., 2.7940e-09, 3.5763e-05, 0.0000e+00], [ 8.6799e-06, 0.0000e+00, 1.0848e-05, ..., 3.2596e-08, -3.9190e-05, 0.0000e+00], [ 4.5896e-05, 0.0000e+00, 3.2723e-05, ..., 1.3970e-08, 2.1017e-04, 0.0000e+00], ..., [ 1.2845e-05, 0.0000e+00, 8.6963e-05, ..., -1.5739e-07, 3.2812e-05, 0.0000e+00], [ 7.5638e-05, 0.0000e+00, -5.3406e-05, ..., 8.3819e-09, -3.9005e-04, 0.0000e+00], [ 1.3161e-04, 0.0000e+00, -6.5625e-05, ..., 6.7987e-08, -5.2527e-06, 0.0000e+00]], device='cuda:0') Epoch 48, bias, value: tensor([-0.0047, 0.0262, 0.0074, 0.0039, 0.0061, -0.0060, 0.0197, -0.0240, 0.0235, 0.0036], device='cuda:0'), grad: tensor([-2.6345e-04, 1.5981e-06, 9.1696e-04, -2.0218e-03, 6.6423e-04, -1.0996e-03, 3.2692e-03, 3.5763e-04, -2.2240e-03, 4.0007e-04], device='cuda:0') 100 0.0001 changing lr epoch 47, time 217.62, cls_loss 0.0205 cls_loss_mapping 0.0240 cls_loss_causal 0.7104 re_mapping 0.0168 re_causal 0.0484 /// teacc 98.68 lr 0.00010000 Epoch 49, weight, value: tensor([[-0.0235, -0.0142, -0.0164, ..., -0.0674, 0.0224, 0.0165], [-0.0133, 0.0151, -0.0709, ..., 0.0155, 0.0183, -0.0294], [ 0.0385, -0.0049, -0.0396, ..., -0.0241, -0.0021, -0.0166], ..., [ 0.0241, -0.0333, 0.0022, ..., 0.0244, 0.0084, 0.0283], [-0.0050, -0.0247, -0.0281, ..., -0.0018, -0.0123, 0.0110], [-0.0410, -0.0740, 0.0420, ..., -0.0424, -0.0130, 0.0020]], device='cuda:0'), grad: tensor([[ 3.2723e-05, 0.0000e+00, 6.3896e-05, ..., 0.0000e+00, 2.8729e-05, 5.4110e-07], [ 1.4234e-04, 0.0000e+00, 8.8066e-06, ..., 0.0000e+00, -3.2149e-06, 8.9593e-07], [ 4.6778e-04, 0.0000e+00, 5.3532e-06, ..., 0.0000e+00, 1.2696e-04, 1.5814e-06], ..., [ 7.8559e-05, 0.0000e+00, 9.2626e-05, ..., 0.0000e+00, 1.0234e-04, 1.0051e-05], [ 1.8215e-04, 0.0000e+00, 1.2577e-05, ..., 0.0000e+00, 4.3094e-05, 3.9581e-07], [ 9.0778e-05, 0.0000e+00, -2.1911e-04, ..., 0.0000e+00, -1.9205e-04, -1.6764e-05]], device='cuda:0') Epoch 49, bias, value: tensor([-0.0053, 0.0265, 0.0061, 0.0034, 0.0063, -0.0054, 0.0199, -0.0241, 0.0236, 0.0044], device='cuda:0'), grad: tensor([ 0.0003, 0.0001, 0.0006, -0.0014, 0.0006, -0.0008, 0.0002, 0.0004, 0.0003, -0.0004], device='cuda:0') 100 0.0001 changing lr epoch 48, time 217.58, cls_loss 0.0171 cls_loss_mapping 0.0232 cls_loss_causal 0.7122 re_mapping 0.0174 re_causal 0.0492 /// teacc 98.60 lr 0.00010000 Epoch 50, weight, value: tensor([[-0.0229, -0.0144, -0.0163, ..., -0.0677, 0.0216, 0.0158], [-0.0135, 0.0156, -0.0712, ..., 0.0171, 0.0185, -0.0293], [ 0.0386, -0.0052, -0.0398, ..., -0.0242, -0.0027, -0.0153], ..., [ 0.0244, -0.0335, 0.0021, ..., 0.0249, 0.0083, 0.0310], [-0.0054, -0.0249, -0.0284, ..., -0.0019, -0.0120, 0.0078], [-0.0418, -0.0750, 0.0423, ..., -0.0418, -0.0124, 0.0005]], device='cuda:0'), grad: tensor([[ 5.0142e-06, 0.0000e+00, 6.1020e-06, ..., 2.1420e-08, -1.4156e-05, 9.3132e-10], [-6.5379e-07, 0.0000e+00, 4.9174e-06, ..., -2.3823e-06, -2.0295e-05, 9.3132e-10], [ 6.5267e-06, 0.0000e+00, 1.5005e-05, ..., 1.9372e-07, 1.5408e-05, -1.2107e-08], ..., [ 1.2331e-05, 0.0000e+00, 2.1517e-05, ..., 1.3039e-06, 1.7717e-05, 2.7940e-09], [-9.4116e-05, 0.0000e+00, -2.7895e-05, ..., 1.8533e-07, 8.9705e-05, 2.7940e-09], [-2.3156e-05, 0.0000e+00, -5.6237e-05, ..., 2.6729e-07, -1.2684e-04, 0.0000e+00]], device='cuda:0') Epoch 50, bias, value: tensor([-0.0048, 0.0265, 0.0059, 0.0037, 0.0057, -0.0055, 0.0199, -0.0240, 0.0235, 0.0046], device='cuda:0'), grad: tensor([-4.3488e-04, -5.6811e-06, 1.0598e-04, 9.5308e-05, 4.9978e-05, 4.1246e-04, 2.6131e-04, 1.4341e-04, -4.8876e-04, -1.3745e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 49---------------------------------------------------- epoch 49, time 218.07, cls_loss 0.0172 cls_loss_mapping 0.0231 cls_loss_causal 0.7000 re_mapping 0.0167 re_causal 0.0471 /// teacc 98.72 lr 0.00010000 Epoch 51, weight, value: tensor([[-0.0231, -0.0144, -0.0166, ..., -0.0680, 0.0211, 0.0147], [-0.0139, 0.0156, -0.0716, ..., 0.0181, 0.0188, -0.0318], [ 0.0396, -0.0049, -0.0398, ..., -0.0246, -0.0024, -0.0150], ..., [ 0.0245, -0.0342, 0.0018, ..., 0.0246, 0.0076, 0.0297], [-0.0057, -0.0250, -0.0284, ..., -0.0021, -0.0119, 0.0064], [-0.0426, -0.0752, 0.0422, ..., -0.0421, -0.0116, 0.0011]], device='cuda:0'), grad: tensor([[-8.6594e-04, 0.0000e+00, 1.1921e-05, ..., 0.0000e+00, 5.4687e-06, 0.0000e+00], [ 4.1068e-05, 0.0000e+00, 4.1574e-05, ..., 0.0000e+00, -7.4089e-05, 0.0000e+00], [ 1.1700e-04, 0.0000e+00, 6.5446e-05, ..., 0.0000e+00, 1.3456e-05, -2.7940e-09], ..., [-2.2620e-05, 0.0000e+00, 3.8862e-05, ..., 0.0000e+00, 4.2856e-05, 0.0000e+00], [ 7.3814e-04, 0.0000e+00, -8.7082e-05, ..., 0.0000e+00, 2.8610e-05, 9.3132e-10], [-2.6718e-05, 0.0000e+00, 5.5805e-06, ..., 0.0000e+00, -1.3721e-04, 0.0000e+00]], device='cuda:0') Epoch 51, bias, value: tensor([-0.0046, 0.0264, 0.0070, 0.0030, 0.0056, -0.0055, 0.0200, -0.0245, 0.0235, 0.0046], device='cuda:0'), grad: tensor([-5.1727e-03, 7.4387e-05, 6.0892e-04, 3.2425e-03, 3.1328e-04, -3.0842e-03, 1.1027e-04, 3.2401e-04, 3.7804e-03, -1.9515e-04], device='cuda:0') 100 0.0001 changing lr epoch 50, time 217.49, cls_loss 0.0194 cls_loss_mapping 0.0260 cls_loss_causal 0.6824 re_mapping 0.0163 re_causal 0.0452 /// teacc 98.56 lr 0.00010000 Epoch 52, weight, value: tensor([[-0.0237, -0.0151, -0.0168, ..., -0.0683, 0.0208, 0.0146], [-0.0139, 0.0150, -0.0722, ..., 0.0181, 0.0196, -0.0351], [ 0.0401, -0.0038, -0.0403, ..., -0.0247, -0.0028, -0.0152], ..., [ 0.0245, -0.0348, 0.0019, ..., 0.0255, 0.0076, 0.0301], [-0.0063, -0.0263, -0.0286, ..., -0.0021, -0.0122, 0.0049], [-0.0436, -0.0768, 0.0415, ..., -0.0425, -0.0117, 0.0009]], device='cuda:0'), grad: tensor([[-6.1452e-05, 0.0000e+00, -4.2200e-05, ..., 0.0000e+00, 7.9721e-06, 1.6764e-08], [ 4.0352e-05, 0.0000e+00, 2.9981e-05, ..., 0.0000e+00, 8.1509e-06, 3.9116e-08], [ 2.7388e-05, 0.0000e+00, 1.7679e-04, ..., 0.0000e+00, 1.1191e-05, -3.2131e-07], ..., [-3.9756e-05, 0.0000e+00, 3.2693e-05, ..., 0.0000e+00, 5.7667e-05, 5.7742e-08], [ 1.0622e-04, 0.0000e+00, 1.6844e-04, ..., 0.0000e+00, -1.8692e-04, 5.2154e-08], [ 3.5346e-05, 0.0000e+00, 1.6868e-05, ..., 0.0000e+00, -8.7202e-05, 3.7253e-09]], device='cuda:0') Epoch 52, bias, value: tensor([-0.0044, 0.0265, 0.0073, 0.0034, 0.0061, -0.0054, 0.0201, -0.0242, 0.0230, 0.0035], device='cuda:0'), grad: tensor([-2.1553e-04, 1.7560e-04, 1.6892e-04, 1.2817e-03, -1.4615e-04, -3.2043e-03, 1.0386e-03, 1.3685e-04, 6.8235e-04, 8.4579e-05], device='cuda:0') 100 0.0001 changing lr epoch 51, time 217.38, cls_loss 0.0181 cls_loss_mapping 0.0232 cls_loss_causal 0.6852 re_mapping 0.0160 re_causal 0.0452 /// teacc 98.68 lr 0.00010000 Epoch 53, weight, value: tensor([[-0.0240, -0.0152, -0.0172, ..., -0.0684, 0.0205, 0.0145], [-0.0140, 0.0152, -0.0721, ..., 0.0189, 0.0205, -0.0359], [ 0.0398, -0.0039, -0.0409, ..., -0.0248, -0.0036, -0.0153], ..., [ 0.0249, -0.0349, 0.0021, ..., 0.0253, 0.0072, 0.0312], [-0.0060, -0.0263, -0.0290, ..., -0.0022, -0.0128, 0.0040], [-0.0438, -0.0769, 0.0416, ..., -0.0423, -0.0112, 0.0007]], device='cuda:0'), grad: tensor([[ 1.5512e-05, 1.1791e-06, 9.5218e-06, ..., 0.0000e+00, 1.8209e-05, 6.9477e-07], [ 2.7180e-05, 8.4750e-08, 3.7942e-06, ..., 0.0000e+00, -6.8605e-05, 2.7940e-07], [-1.0514e-04, -1.0476e-05, 1.2526e-06, ..., 0.0000e+00, 2.2516e-05, -1.9297e-06], ..., [-8.1122e-05, 8.3633e-07, 4.9248e-06, ..., 0.0000e+00, 1.2346e-05, 4.5076e-07], [ 1.5162e-06, 5.1688e-07, 5.5701e-05, ..., 0.0000e+00, 8.7023e-05, 4.5002e-06], [ 4.0025e-05, 1.7881e-07, 4.8578e-06, ..., 0.0000e+00, 6.1989e-06, 8.6799e-07]], device='cuda:0') Epoch 53, bias, value: tensor([-0.0037, 0.0273, 0.0067, 0.0031, 0.0058, -0.0056, 0.0206, -0.0238, 0.0227, 0.0034], device='cuda:0'), grad: tensor([ 5.4300e-05, -5.0664e-05, -1.0252e-04, 1.6451e-04, 4.5300e-05, -7.2670e-03, 6.9962e-03, -1.2708e-04, 1.9574e-04, 8.8096e-05], device='cuda:0') 100 0.0001 changing lr epoch 52, time 217.56, cls_loss 0.0145 cls_loss_mapping 0.0191 cls_loss_causal 0.7120 re_mapping 0.0158 re_causal 0.0456 /// teacc 98.68 lr 0.00010000 Epoch 54, weight, value: tensor([[-0.0241, -0.0153, -0.0176, ..., -0.0685, 0.0203, 0.0144], [-0.0138, 0.0154, -0.0725, ..., 0.0190, 0.0211, -0.0371], [ 0.0397, -0.0038, -0.0413, ..., -0.0249, -0.0041, -0.0159], ..., [ 0.0248, -0.0359, 0.0021, ..., 0.0253, 0.0066, 0.0319], [-0.0059, -0.0269, -0.0288, ..., -0.0023, -0.0124, 0.0032], [-0.0443, -0.0789, 0.0415, ..., -0.0426, -0.0106, 0.0006]], device='cuda:0'), grad: tensor([[-8.9034e-07, 9.3132e-10, 3.8929e-06, ..., 8.7265e-07, 1.2085e-05, 6.5193e-09], [ 4.7952e-05, -6.5193e-08, 2.8968e-05, ..., 1.3120e-05, -9.6142e-05, 3.0734e-08], [-5.5321e-06, 8.3819e-09, 3.2540e-06, ..., 5.8860e-07, 1.3672e-05, 3.3528e-08], ..., [-8.9288e-05, 4.6566e-09, -2.4661e-05, ..., -3.7491e-05, 3.8385e-05, 7.0781e-08], [ 3.9227e-06, 3.6322e-08, -9.1419e-06, ..., 7.3947e-07, 1.2167e-05, 5.5879e-09], [-1.1146e-04, 9.3132e-10, -1.2898e-04, ..., 1.4231e-05, -9.1851e-05, 6.1467e-08]], device='cuda:0') Epoch 54, bias, value: tensor([-0.0036, 0.0274, 0.0066, 0.0028, 0.0057, -0.0053, 0.0204, -0.0241, 0.0234, 0.0033], device='cuda:0'), grad: tensor([-2.7001e-05, -1.7390e-05, 1.7717e-05, 8.5473e-05, 3.6478e-04, 5.4330e-05, 8.6904e-05, -7.1585e-05, -1.2189e-05, -4.8113e-04], device='cuda:0') 100 0.0001 changing lr epoch 53, time 217.60, cls_loss 0.0164 cls_loss_mapping 0.0233 cls_loss_causal 0.6930 re_mapping 0.0159 re_causal 0.0448 /// teacc 98.62 lr 0.00010000 Epoch 55, weight, value: tensor([[-0.0245, -0.0155, -0.0180, ..., -0.0689, 0.0198, 0.0144], [-0.0141, 0.0152, -0.0729, ..., 0.0195, 0.0218, -0.0375], [ 0.0400, -0.0033, -0.0416, ..., -0.0252, -0.0046, -0.0159], ..., [ 0.0251, -0.0366, 0.0018, ..., 0.0248, 0.0064, 0.0318], [-0.0060, -0.0278, -0.0297, ..., -0.0027, -0.0125, 0.0026], [-0.0445, -0.0798, 0.0418, ..., -0.0417, -0.0100, 0.0015]], device='cuda:0'), grad: tensor([[ 5.4855e-07, 1.8626e-09, 2.0880e-06, ..., 9.3132e-10, 7.0147e-06, 1.8626e-09], [ 1.0625e-05, -3.6322e-08, 4.1723e-05, ..., 2.7008e-08, 1.2495e-05, 2.7940e-09], [-1.1854e-05, 2.3283e-08, 1.8954e-05, ..., 3.7253e-09, 5.8934e-06, -2.9802e-08], ..., [-3.2354e-06, -7.6368e-08, 4.4376e-05, ..., -7.4506e-08, 1.7107e-05, 1.0245e-08], [-4.6849e-05, 1.3039e-08, 7.5586e-06, ..., 1.8626e-09, 2.4036e-05, 6.5193e-09], [ 9.4324e-06, 3.0734e-08, 2.6035e-04, ..., 2.2352e-08, 8.3864e-05, 0.0000e+00]], device='cuda:0') Epoch 55, bias, value: tensor([-0.0039, 0.0272, 0.0065, 0.0021, 0.0061, -0.0050, 0.0206, -0.0239, 0.0228, 0.0038], device='cuda:0'), grad: tensor([-3.8713e-05, 1.0645e-04, 2.7493e-05, 6.2644e-05, -8.4352e-04, 2.0945e-04, -2.6345e-04, 1.0085e-04, -3.8370e-07, 6.3944e-04], device='cuda:0') 100 0.0001 changing lr epoch 54, time 217.37, cls_loss 0.0150 cls_loss_mapping 0.0199 cls_loss_causal 0.7068 re_mapping 0.0156 re_causal 0.0453 /// teacc 98.60 lr 0.00010000 Epoch 56, weight, value: tensor([[-0.0247, -0.0156, -0.0177, ..., -0.0693, 0.0197, 0.0145], [-0.0148, 0.0154, -0.0735, ..., 0.0200, 0.0215, -0.0382], [ 0.0410, -0.0032, -0.0419, ..., -0.0252, -0.0052, -0.0155], ..., [ 0.0249, -0.0371, 0.0017, ..., 0.0249, 0.0060, 0.0317], [-0.0065, -0.0281, -0.0298, ..., -0.0029, -0.0117, 0.0022], [-0.0451, -0.0802, 0.0417, ..., -0.0417, -0.0097, 0.0015]], device='cuda:0'), grad: tensor([[ 1.1310e-05, 0.0000e+00, 1.1489e-05, ..., 2.4214e-07, 8.5384e-06, 2.0489e-07], [ 1.0908e-05, 0.0000e+00, 9.8133e-04, ..., 1.5553e-07, -1.4871e-05, 1.3039e-07], [-6.3062e-05, 0.0000e+00, 1.6928e-05, ..., 8.8476e-08, 1.0617e-05, 7.4506e-08], ..., [ 7.4267e-05, 0.0000e+00, 1.1963e-04, ..., -8.7079e-07, 2.7373e-05, -7.2457e-07], [-7.1704e-05, 0.0000e+00, -1.0040e-06, ..., 1.3970e-08, -5.5641e-05, 1.3970e-08], [-2.6345e-04, 0.0000e+00, -1.4257e-04, ..., 1.2480e-07, -1.0115e-04, 8.6613e-08]], device='cuda:0') Epoch 56, bias, value: tensor([-0.0034, 0.0261, 0.0076, 0.0022, 0.0060, -0.0051, 0.0208, -0.0239, 0.0226, 0.0035], device='cuda:0'), grad: tensor([-6.1512e-05, 1.6642e-03, 1.2144e-05, 6.1369e-04, -2.0256e-03, 1.1718e-04, 2.0981e-04, 3.4213e-04, -2.2316e-04, -6.4898e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 55---------------------------------------------------- epoch 55, time 218.09, cls_loss 0.0158 cls_loss_mapping 0.0216 cls_loss_causal 0.6807 re_mapping 0.0153 re_causal 0.0438 /// teacc 98.75 lr 0.00010000 Epoch 57, weight, value: tensor([[-0.0248, -0.0158, -0.0183, ..., -0.0694, 0.0195, 0.0146], [-0.0153, 0.0157, -0.0749, ..., 0.0201, 0.0222, -0.0391], [ 0.0409, -0.0034, -0.0421, ..., -0.0253, -0.0060, -0.0154], ..., [ 0.0251, -0.0371, 0.0017, ..., 0.0247, 0.0056, 0.0319], [-0.0071, -0.0287, -0.0297, ..., -0.0029, -0.0114, 0.0019], [-0.0456, -0.0806, 0.0416, ..., -0.0414, -0.0094, 0.0013]], device='cuda:0'), grad: tensor([[ 4.9710e-05, 0.0000e+00, 4.1053e-06, ..., 0.0000e+00, 8.2478e-06, 2.0489e-08], [ 6.7830e-05, 0.0000e+00, 8.4657e-07, ..., 0.0000e+00, -6.4932e-06, 3.7253e-09], [ 1.3816e-04, 0.0000e+00, 2.0117e-06, ..., 0.0000e+00, 6.0797e-05, -1.6019e-07], ..., [ 1.2720e-04, 0.0000e+00, 1.6131e-06, ..., 0.0000e+00, 2.3380e-05, 2.4214e-08], [ 3.1528e-03, 0.0000e+00, 1.2845e-05, ..., 0.0000e+00, 5.5122e-04, 4.5635e-08], [ 4.6790e-05, 0.0000e+00, -7.7952e-07, ..., 0.0000e+00, -5.1558e-06, 1.8626e-09]], device='cuda:0') Epoch 57, bias, value: tensor([-0.0044, 0.0262, 0.0070, 0.0028, 0.0058, -0.0049, 0.0213, -0.0238, 0.0228, 0.0034], device='cuda:0'), grad: tensor([-4.8339e-05, 9.0957e-05, 3.6979e-04, -6.9618e-03, 2.9474e-05, 6.2561e-04, -7.4267e-05, 2.2662e-04, 5.6686e-03, 7.0810e-05], device='cuda:0') 100 0.0001 changing lr epoch 56, time 217.63, cls_loss 0.0186 cls_loss_mapping 0.0258 cls_loss_causal 0.7043 re_mapping 0.0158 re_causal 0.0433 /// teacc 98.57 lr 0.00010000 Epoch 58, weight, value: tensor([[-0.0250, -0.0158, -0.0180, ..., -0.0695, 0.0192, 0.0145], [-0.0156, 0.0161, -0.0754, ..., 0.0215, 0.0232, -0.0394], [ 0.0417, -0.0034, -0.0439, ..., -0.0253, -0.0064, -0.0157], ..., [ 0.0254, -0.0379, 0.0012, ..., 0.0248, 0.0045, 0.0322], [-0.0078, -0.0289, -0.0301, ..., -0.0030, -0.0117, 0.0004], [-0.0466, -0.0810, 0.0413, ..., -0.0424, -0.0096, 0.0033]], device='cuda:0'), grad: tensor([[ 1.6138e-05, 0.0000e+00, 3.1888e-05, ..., 4.6566e-09, 4.7117e-05, 0.0000e+00], [-9.5248e-05, 0.0000e+00, 3.1412e-05, ..., 4.0978e-08, -2.5892e-04, 0.0000e+00], [ 2.8133e-04, 0.0000e+00, 4.5395e-04, ..., 8.3819e-09, 2.1851e-04, 0.0000e+00], ..., [-4.9233e-05, 0.0000e+00, 5.6535e-05, ..., -1.4808e-07, 1.3351e-04, 0.0000e+00], [-4.2677e-04, 0.0000e+00, -5.2357e-04, ..., 3.7253e-09, -8.2636e-04, 0.0000e+00], [ 1.4293e-04, 0.0000e+00, 2.3687e-04, ..., 4.3772e-08, 3.1137e-04, 0.0000e+00]], device='cuda:0') Epoch 58, bias, value: tensor([-0.0039, 0.0267, 0.0069, 0.0029, 0.0070, -0.0045, 0.0206, -0.0240, 0.0224, 0.0025], device='cuda:0'), grad: tensor([ 1.6534e-04, -4.2868e-04, 1.7815e-03, 7.1716e-04, -1.0710e-03, 5.3740e-04, 1.2405e-05, 2.7108e-04, -3.0842e-03, 1.1015e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 57---------------------------------------------------- epoch 57, time 218.30, cls_loss 0.0125 cls_loss_mapping 0.0167 cls_loss_causal 0.6787 re_mapping 0.0154 re_causal 0.0436 /// teacc 98.78 lr 0.00010000 Epoch 59, weight, value: tensor([[-0.0252, -0.0158, -0.0180, ..., -0.0697, 0.0189, 0.0145], [-0.0163, 0.0161, -0.0760, ..., 0.0215, 0.0231, -0.0400], [ 0.0415, -0.0034, -0.0442, ..., -0.0254, -0.0066, -0.0156], ..., [ 0.0263, -0.0379, 0.0009, ..., 0.0247, 0.0043, 0.0323], [-0.0084, -0.0290, -0.0303, ..., -0.0030, -0.0117, -0.0010], [-0.0472, -0.0810, 0.0410, ..., -0.0421, -0.0095, 0.0040]], device='cuda:0'), grad: tensor([[ 2.0528e-04, 0.0000e+00, 8.5640e-04, ..., 1.2200e-07, 8.0228e-05, 2.2352e-08], [ 3.8892e-05, 0.0000e+00, 4.8578e-06, ..., 7.1153e-07, -3.3855e-05, 2.9802e-08], [-1.9681e-04, 0.0000e+00, 1.1034e-05, ..., 1.6112e-07, 3.5137e-05, -4.0792e-07], ..., [ 2.4706e-05, 0.0000e+00, 3.2902e-05, ..., -3.7421e-06, 1.9372e-05, 4.4703e-08], [ 6.1870e-05, 0.0000e+00, 1.7241e-05, ..., 3.6322e-08, 3.2216e-05, 1.0710e-07], [ 1.3340e-04, 0.0000e+00, -2.8968e-05, ..., 1.3616e-06, 3.5495e-05, 5.6811e-08]], device='cuda:0') Epoch 59, bias, value: tensor([-0.0037, 0.0260, 0.0064, 0.0036, 0.0073, -0.0051, 0.0210, -0.0234, 0.0222, 0.0023], device='cuda:0'), grad: tensor([ 6.5384e-03, 3.2693e-05, -2.4128e-04, -1.9169e-03, 6.7711e-05, -5.8861e-03, 7.1907e-04, 1.6999e-04, 1.9586e-04, 3.2806e-04], device='cuda:0') 100 0.0001 changing lr epoch 58, time 217.54, cls_loss 0.0172 cls_loss_mapping 0.0211 cls_loss_causal 0.6828 re_mapping 0.0150 re_causal 0.0416 /// teacc 98.75 lr 0.00010000 Epoch 60, weight, value: tensor([[-0.0257, -0.0160, -0.0187, ..., -0.0699, 0.0184, 0.0121], [-0.0173, 0.0160, -0.0761, ..., 0.0232, 0.0238, -0.0407], [ 0.0416, -0.0029, -0.0445, ..., -0.0256, -0.0071, -0.0150], ..., [ 0.0271, -0.0385, 0.0006, ..., 0.0239, 0.0037, 0.0321], [-0.0088, -0.0295, -0.0307, ..., -0.0032, -0.0118, -0.0034], [-0.0476, -0.0819, 0.0410, ..., -0.0422, -0.0087, 0.0062]], device='cuda:0'), grad: tensor([[ 1.1347e-05, 0.0000e+00, 2.9624e-05, ..., 5.4296e-07, 1.6227e-05, 2.1514e-07], [ 2.5064e-05, 0.0000e+00, 3.1590e-05, ..., 2.9374e-06, -8.3625e-05, 4.2841e-08], [ 2.3520e-04, 0.0000e+00, 1.5926e-04, ..., 1.3541e-06, 6.3181e-05, -2.0284e-06], ..., [-7.4530e-04, 0.0000e+00, -2.9373e-04, ..., -1.1764e-05, -1.0717e-04, 5.2806e-07], [ 1.2094e-04, 0.0000e+00, 5.5695e-04, ..., 2.0768e-07, 2.5153e-04, 4.9639e-07], [ 3.8266e-04, 0.0000e+00, 3.5048e-04, ..., 1.9874e-06, 1.6320e-04, 5.0291e-08]], device='cuda:0') Epoch 60, bias, value: tensor([-0.0046, 0.0262, 0.0061, 0.0037, 0.0078, -0.0049, 0.0213, -0.0230, 0.0215, 0.0025], device='cuda:0'), grad: tensor([-7.2300e-05, -8.2433e-05, 6.9761e-04, 3.4237e-03, 1.0049e-04, -6.0387e-03, 7.3135e-05, -1.3237e-03, 1.8244e-03, 1.3971e-03], device='cuda:0') 100 0.0001 changing lr epoch 59, time 217.55, cls_loss 0.0166 cls_loss_mapping 0.0204 cls_loss_causal 0.6826 re_mapping 0.0141 re_causal 0.0387 /// teacc 98.65 lr 0.00010000 Epoch 61, weight, value: tensor([[-0.0254, -0.0160, -0.0193, ..., -0.0705, 0.0180, 0.0111], [-0.0169, 0.0160, -0.0763, ..., 0.0229, 0.0251, -0.0412], [ 0.0423, -0.0028, -0.0438, ..., -0.0259, -0.0079, -0.0139], ..., [ 0.0263, -0.0387, 0.0005, ..., 0.0245, 0.0034, 0.0320], [-0.0092, -0.0297, -0.0309, ..., -0.0033, -0.0117, -0.0040], [-0.0483, -0.0820, 0.0406, ..., -0.0423, -0.0077, 0.0070]], device='cuda:0'), grad: tensor([[-3.3736e-05, 0.0000e+00, 2.2724e-06, ..., 1.7416e-07, 3.0883e-06, 0.0000e+00], [ 1.7837e-05, 0.0000e+00, 2.3052e-05, ..., 3.1590e-06, 2.2829e-05, 0.0000e+00], [-1.8287e-04, 0.0000e+00, 2.1011e-06, ..., 1.8347e-07, 6.0201e-06, 0.0000e+00], ..., [ 1.4937e-04, 0.0000e+00, 9.5516e-06, ..., 1.8999e-07, 4.2841e-06, 0.0000e+00], [ 1.1183e-05, 0.0000e+00, 1.4611e-05, ..., 2.2724e-07, 8.9109e-06, 0.0000e+00], [ 6.5155e-06, 0.0000e+00, 2.1946e-04, ..., 3.0637e-05, 2.4462e-04, 0.0000e+00]], device='cuda:0') Epoch 61, bias, value: tensor([-0.0037, 0.0270, 0.0065, 0.0033, 0.0075, -0.0048, 0.0207, -0.0236, 0.0217, 0.0021], device='cuda:0'), grad: tensor([-1.5318e-04, 1.0294e-04, -2.0874e-04, 4.4703e-05, -2.3991e-05, -3.2872e-05, -7.9489e-04, 2.3258e-04, 7.4685e-05, 7.5865e-04], device='cuda:0') 100 0.0001 changing lr epoch 60, time 217.42, cls_loss 0.0137 cls_loss_mapping 0.0218 cls_loss_causal 0.6739 re_mapping 0.0142 re_causal 0.0422 /// teacc 98.61 lr 0.00010000 Epoch 62, weight, value: tensor([[-0.0256, -0.0160, -0.0195, ..., -0.0706, 0.0177, 0.0111], [-0.0171, 0.0160, -0.0770, ..., 0.0231, 0.0256, -0.0414], [ 0.0417, -0.0027, -0.0439, ..., -0.0260, -0.0082, -0.0138], ..., [ 0.0272, -0.0389, 0.0004, ..., 0.0248, 0.0031, 0.0320], [-0.0095, -0.0297, -0.0307, ..., -0.0034, -0.0116, -0.0042], [-0.0486, -0.0820, 0.0405, ..., -0.0426, -0.0075, 0.0070]], device='cuda:0'), grad: tensor([[ 2.0191e-05, 0.0000e+00, 1.8001e-05, ..., 1.0617e-05, 9.7305e-06, 0.0000e+00], [ 2.0862e-04, 0.0000e+00, 2.8238e-05, ..., -3.7265e-04, -2.3079e-04, 0.0000e+00], [ 2.2903e-05, 0.0000e+00, 6.5118e-06, ..., 1.6198e-05, 4.0196e-06, 0.0000e+00], ..., [-1.2789e-03, 0.0000e+00, 8.1062e-05, ..., -3.9506e-04, 1.4150e-04, 0.0000e+00], [ 1.8105e-05, 0.0000e+00, 9.6202e-05, ..., 1.5512e-05, 1.6421e-05, 0.0000e+00], [ 7.0238e-04, 0.0000e+00, -1.8911e-03, ..., 4.2415e-04, -2.1935e-05, 0.0000e+00]], device='cuda:0') Epoch 62, bias, value: tensor([-0.0035, 0.0276, 0.0059, 0.0031, 0.0076, -0.0045, 0.0199, -0.0230, 0.0217, 0.0017], device='cuda:0'), grad: tensor([ 8.7202e-05, -2.6779e-03, 1.0395e-04, 2.7990e-04, 4.8065e-03, -1.0052e-03, 8.5497e-04, -1.5268e-03, 2.6345e-04, -1.1806e-03], device='cuda:0') 100 0.0001 changing lr epoch 61, time 217.58, cls_loss 0.0129 cls_loss_mapping 0.0180 cls_loss_causal 0.6543 re_mapping 0.0144 re_causal 0.0404 /// teacc 98.73 lr 0.00010000 Epoch 63, weight, value: tensor([[-0.0264, -0.0161, -0.0200, ..., -0.0708, 0.0174, 0.0110], [-0.0186, 0.0160, -0.0776, ..., 0.0244, 0.0262, -0.0416], [ 0.0426, -0.0027, -0.0440, ..., -0.0261, -0.0085, -0.0137], ..., [ 0.0280, -0.0389, 0.0004, ..., 0.0248, 0.0028, 0.0320], [-0.0099, -0.0298, -0.0309, ..., -0.0035, -0.0125, -0.0043], [-0.0492, -0.0821, 0.0408, ..., -0.0434, -0.0069, 0.0070]], device='cuda:0'), grad: tensor([[ 1.1988e-05, 0.0000e+00, 6.2250e-06, ..., 4.4219e-06, 1.3508e-05, 9.5461e-07], [ 2.5082e-04, 0.0000e+00, 1.0999e-06, ..., 1.0228e-04, -3.6192e-04, 3.7812e-07], [ 1.3149e-04, 0.0000e+00, 2.9393e-06, ..., 2.8592e-06, 5.7220e-05, 2.0675e-07], ..., [-2.6727e-04, 0.0000e+00, 9.2462e-06, ..., -4.9973e-04, -3.0804e-04, 6.6962e-07], [ 8.1360e-06, 0.0000e+00, 1.0498e-05, ..., 5.7146e-06, 2.4289e-05, 1.7351e-06], [ 1.4722e-04, 0.0000e+00, 1.9461e-05, ..., 1.4126e-04, 1.3447e-04, 3.5390e-08]], device='cuda:0') Epoch 63, bias, value: tensor([-0.0045, 0.0273, 0.0065, 0.0025, 0.0070, -0.0043, 0.0207, -0.0226, 0.0212, 0.0024], device='cuda:0'), grad: tensor([ 5.1260e-05, -3.9196e-04, 2.0123e-04, -3.1924e-04, 1.1377e-03, -4.2605e-04, 3.1304e-04, -1.0786e-03, 1.2651e-05, 5.0116e-04], device='cuda:0') 100 0.0001 changing lr epoch 62, time 217.42, cls_loss 0.0098 cls_loss_mapping 0.0155 cls_loss_causal 0.6317 re_mapping 0.0139 re_causal 0.0408 /// teacc 98.67 lr 0.00010000 Epoch 64, weight, value: tensor([[-0.0267, -0.0161, -0.0201, ..., -0.0709, 0.0170, 0.0106], [-0.0182, 0.0160, -0.0781, ..., 0.0245, 0.0267, -0.0451], [ 0.0432, -0.0027, -0.0442, ..., -0.0263, -0.0086, -0.0112], ..., [ 0.0278, -0.0389, 0.0005, ..., 0.0252, 0.0024, 0.0331], [-0.0105, -0.0298, -0.0310, ..., -0.0036, -0.0123, -0.0077], [-0.0498, -0.0821, 0.0407, ..., -0.0436, -0.0066, 0.0070]], device='cuda:0'), grad: tensor([[ 6.0536e-07, 0.0000e+00, -3.2447e-06, ..., 2.3562e-07, 2.0694e-06, 0.0000e+00], [ 2.2396e-05, 0.0000e+00, 1.0796e-05, ..., 4.9584e-06, 9.0897e-07, 0.0000e+00], [-2.8551e-05, 0.0000e+00, 3.1125e-06, ..., 6.1560e-07, 3.4608e-06, 0.0000e+00], ..., [-3.1620e-05, 0.0000e+00, 8.2999e-06, ..., -9.4622e-06, 3.0138e-06, 0.0000e+00], [ 5.6699e-06, 0.0000e+00, 6.8732e-06, ..., 1.3225e-07, -5.4669e-07, 0.0000e+00], [ 1.3776e-05, 0.0000e+00, 6.4913e-07, ..., 2.0433e-06, -9.7081e-06, 0.0000e+00]], device='cuda:0') Epoch 64, bias, value: tensor([-0.0043, 0.0275, 0.0069, 0.0023, 0.0069, -0.0045, 0.0206, -0.0225, 0.0213, 0.0020], device='cuda:0'), grad: tensor([-6.9439e-05, 5.7817e-05, -2.4870e-05, 1.9029e-05, -3.3706e-05, 2.5228e-05, 1.1884e-05, -2.7746e-05, 1.3225e-05, 2.8580e-05], device='cuda:0') 100 0.0001 changing lr epoch 63, time 217.33, cls_loss 0.0105 cls_loss_mapping 0.0174 cls_loss_causal 0.6285 re_mapping 0.0139 re_causal 0.0398 /// teacc 98.61 lr 0.00010000 Epoch 65, weight, value: tensor([[-2.7072e-02, -1.6177e-02, -2.0683e-02, ..., -7.1142e-02, 1.6065e-02, 1.0269e-02], [-1.8219e-02, 1.6318e-02, -7.8408e-02, ..., 2.4330e-02, 2.6914e-02, -4.5978e-02], [ 4.3092e-02, -3.2435e-03, -4.4140e-02, ..., -2.6427e-02, -9.2436e-03, -1.0578e-02], ..., [ 2.8469e-02, -3.7164e-02, 9.7241e-05, ..., 2.5689e-02, 2.2677e-03, 3.3124e-02], [-1.1044e-02, -2.8373e-02, -3.1328e-02, ..., -3.7202e-03, -1.2169e-02, -8.0873e-03], [-5.0665e-02, -8.2488e-02, 4.0945e-02, ..., -4.3980e-02, -5.8393e-03, 7.2448e-03]], device='cuda:0'), grad: tensor([[-4.5806e-05, 0.0000e+00, -6.1654e-06, ..., 0.0000e+00, -8.8692e-05, 2.5891e-07], [-2.2078e-04, 0.0000e+00, 3.5428e-06, ..., -7.4506e-09, -1.3590e-04, 8.4750e-08], [ 1.2517e-04, 0.0000e+00, 9.6038e-06, ..., 9.3132e-10, 8.9407e-05, -2.4959e-06], ..., [-1.9395e-04, 0.0000e+00, 1.1474e-05, ..., 2.7940e-09, 2.5004e-05, 2.8219e-07], [ 3.0190e-05, 0.0000e+00, -9.6262e-05, ..., 9.3132e-10, 2.9400e-05, 9.4995e-07], [ 9.4056e-05, 0.0000e+00, 3.0756e-05, ..., 9.3132e-10, 1.1668e-05, 2.9430e-07]], device='cuda:0') Epoch 65, bias, value: tensor([-0.0056, 0.0274, 0.0066, 0.0022, 0.0069, -0.0040, 0.0208, -0.0223, 0.0213, 0.0025], device='cuda:0'), grad: tensor([-6.9189e-04, -4.8566e-04, 6.0892e-04, 4.1437e-04, -1.0639e-05, 3.4142e-04, 4.0531e-05, -2.5105e-04, -3.0065e-04, 3.3569e-04], device='cuda:0') 100 0.0001 changing lr epoch 64, time 217.47, cls_loss 0.0115 cls_loss_mapping 0.0163 cls_loss_causal 0.6585 re_mapping 0.0138 re_causal 0.0398 /// teacc 98.60 lr 0.00010000 Epoch 66, weight, value: tensor([[-0.0272, -0.0163, -0.0210, ..., -0.0715, 0.0157, 0.0102], [-0.0181, 0.0168, -0.0790, ..., 0.0249, 0.0274, -0.0474], [ 0.0430, -0.0037, -0.0443, ..., -0.0267, -0.0095, -0.0101], ..., [ 0.0285, -0.0373, -0.0003, ..., 0.0255, 0.0015, 0.0334], [-0.0114, -0.0279, -0.0316, ..., -0.0040, -0.0121, -0.0090], [-0.0507, -0.0830, 0.0411, ..., -0.0438, -0.0049, 0.0073]], device='cuda:0'), grad: tensor([[ 3.2187e-05, 0.0000e+00, 7.4133e-07, ..., 4.4703e-08, 6.4336e-06, 3.7253e-09], [ 1.0520e-04, 0.0000e+00, -1.7323e-07, ..., 3.7067e-07, -4.6074e-05, 1.8626e-09], [-1.4615e-04, 0.0000e+00, 3.5223e-06, ..., 6.5193e-08, 1.0267e-05, 0.0000e+00], ..., [-1.4150e-04, 0.0000e+00, 3.3677e-06, ..., 3.0547e-07, -3.5204e-07, 1.8626e-09], [ 1.1258e-05, 0.0000e+00, -1.8906e-06, ..., 8.1956e-08, 2.2426e-05, 1.6764e-08], [ 4.5061e-05, 0.0000e+00, 9.7603e-06, ..., 2.1700e-06, 2.8387e-06, 7.4506e-09]], device='cuda:0') Epoch 66, bias, value: tensor([-0.0053, 0.0279, 0.0062, 0.0022, 0.0073, -0.0040, 0.0208, -0.0230, 0.0211, 0.0029], device='cuda:0'), grad: tensor([ 4.7386e-05, 1.1557e-04, -1.2052e-04, 1.2934e-04, 3.9935e-06, -3.7272e-06, 4.7654e-05, -3.1686e-04, 5.4836e-05, 4.2111e-05], device='cuda:0') 100 0.0001 changing lr epoch 65, time 217.73, cls_loss 0.0131 cls_loss_mapping 0.0205 cls_loss_causal 0.6611 re_mapping 0.0136 re_causal 0.0398 /// teacc 98.61 lr 0.00010000 Epoch 67, weight, value: tensor([[-0.0276, -0.0163, -0.0225, ..., -0.0717, 0.0154, 0.0086], [-0.0182, 0.0173, -0.0797, ..., 0.0261, 0.0281, -0.0498], [ 0.0430, -0.0041, -0.0445, ..., -0.0268, -0.0098, -0.0102], ..., [ 0.0291, -0.0373, -0.0002, ..., 0.0245, 0.0011, 0.0325], [-0.0117, -0.0280, -0.0316, ..., -0.0040, -0.0123, -0.0114], [-0.0516, -0.0832, 0.0411, ..., -0.0444, -0.0043, 0.0091]], device='cuda:0'), grad: tensor([[ 2.9281e-06, 0.0000e+00, -4.0010e-06, ..., 2.9802e-08, 9.4622e-07, 0.0000e+00], [ 5.2080e-06, 0.0000e+00, 8.5607e-06, ..., 2.2911e-07, -5.0589e-06, 0.0000e+00], [ 1.6773e-04, 0.0000e+00, 1.8571e-06, ..., 1.4901e-08, 4.1246e-05, 0.0000e+00], ..., [ 4.6015e-04, 0.0000e+00, 4.7415e-05, ..., -8.4005e-07, 1.1665e-04, 0.0000e+00], [ 2.2560e-05, 0.0000e+00, 4.4592e-06, ..., 9.3132e-09, 1.9707e-06, 0.0000e+00], [-4.7588e-04, 0.0000e+00, 9.3818e-05, ..., 4.3772e-07, -1.2022e-04, 0.0000e+00]], device='cuda:0') Epoch 67, bias, value: tensor([-0.0062, 0.0282, 0.0058, 0.0026, 0.0072, -0.0043, 0.0209, -0.0223, 0.0210, 0.0028], device='cuda:0'), grad: tensor([-4.2653e-04, 3.5793e-05, 2.2519e-04, -1.6630e-04, -3.9911e-04, -6.3926e-06, 4.6432e-05, 1.2360e-03, 3.9428e-05, -5.8317e-04], device='cuda:0') 100 0.0001 changing lr epoch 66, time 217.57, cls_loss 0.0108 cls_loss_mapping 0.0163 cls_loss_causal 0.6815 re_mapping 0.0131 re_causal 0.0391 /// teacc 98.44 lr 0.00010000 Epoch 68, weight, value: tensor([[-0.0278, -0.0164, -0.0225, ..., -0.0718, 0.0154, 0.0085], [-0.0188, 0.0172, -0.0803, ..., 0.0262, 0.0280, -0.0518], [ 0.0430, -0.0038, -0.0447, ..., -0.0269, -0.0103, -0.0097], ..., [ 0.0290, -0.0376, -0.0004, ..., 0.0245, 0.0009, 0.0319], [-0.0116, -0.0282, -0.0319, ..., -0.0040, -0.0119, -0.0129], [-0.0521, -0.0835, 0.0407, ..., -0.0446, -0.0044, 0.0092]], device='cuda:0'), grad: tensor([[-4.2558e-05, 0.0000e+00, -1.0401e-04, ..., 1.7323e-06, 1.0937e-05, 0.0000e+00], [ 2.0385e-05, 0.0000e+00, 2.5779e-06, ..., 7.1898e-07, 7.6070e-06, 0.0000e+00], [-4.4405e-06, 0.0000e+00, 2.7329e-05, ..., 2.0117e-07, 1.4424e-05, 0.0000e+00], ..., [ 1.9610e-05, 0.0000e+00, 6.6310e-06, ..., 4.5672e-06, 4.9099e-06, 0.0000e+00], [ 4.4078e-05, 0.0000e+00, 2.2978e-05, ..., 5.1223e-07, 3.1263e-05, 0.0000e+00], [ 1.4924e-05, 0.0000e+00, 8.5980e-06, ..., 1.3504e-06, -5.0217e-05, 0.0000e+00]], device='cuda:0') Epoch 68, bias, value: tensor([-0.0052, 0.0277, 0.0054, 0.0029, 0.0079, -0.0038, 0.0199, -0.0224, 0.0219, 0.0017], device='cuda:0'), grad: tensor([-2.2864e-04, 4.4703e-05, 6.5088e-05, -1.7500e-04, 6.8188e-04, 8.3596e-06, -5.8270e-04, 7.8678e-05, 1.7023e-04, -6.3062e-05], device='cuda:0') 100 0.0001 changing lr epoch 67, time 217.62, cls_loss 0.0130 cls_loss_mapping 0.0167 cls_loss_causal 0.6240 re_mapping 0.0135 re_causal 0.0360 /// teacc 98.68 lr 0.00010000 Epoch 69, weight, value: tensor([[-0.0273, -0.0171, -0.0232, ..., -0.0719, 0.0148, 0.0083], [-0.0195, 0.0175, -0.0807, ..., 0.0266, 0.0282, -0.0539], [ 0.0436, -0.0041, -0.0447, ..., -0.0273, -0.0102, -0.0084], ..., [ 0.0291, -0.0375, -0.0004, ..., 0.0244, 0.0005, 0.0320], [-0.0122, -0.0285, -0.0323, ..., -0.0042, -0.0117, -0.0147], [-0.0524, -0.0865, 0.0411, ..., -0.0446, -0.0034, 0.0093]], device='cuda:0'), grad: tensor([[ 8.6054e-07, 0.0000e+00, 5.7556e-07, ..., 0.0000e+00, 4.5784e-06, 1.4901e-08], [ 8.1062e-06, 0.0000e+00, 7.7114e-07, ..., 0.0000e+00, 2.4028e-06, 3.7253e-09], [-4.3273e-05, 0.0000e+00, 6.4075e-07, ..., 0.0000e+00, -8.1807e-06, -1.8999e-07], ..., [-7.8604e-06, 0.0000e+00, 1.1940e-06, ..., 0.0000e+00, 1.1660e-06, 2.6077e-08], [ 7.7963e-05, 0.0000e+00, -3.0361e-07, ..., 0.0000e+00, -4.7296e-05, 6.7055e-08], [ 7.4506e-06, 0.0000e+00, 2.8443e-06, ..., 0.0000e+00, -2.8815e-06, 1.1176e-08]], device='cuda:0') Epoch 69, bias, value: tensor([-0.0056, 0.0271, 0.0061, 0.0027, 0.0071, -0.0043, 0.0207, -0.0226, 0.0216, 0.0029], device='cuda:0'), grad: tensor([ 4.1723e-06, 3.2455e-05, -3.0112e-04, -6.1929e-05, 3.6150e-05, 2.8145e-06, 5.7995e-05, 2.3823e-06, 2.0814e-04, 1.8746e-05], device='cuda:0') 100 0.0001 changing lr epoch 68, time 217.61, cls_loss 0.0105 cls_loss_mapping 0.0142 cls_loss_causal 0.6285 re_mapping 0.0132 re_causal 0.0378 /// teacc 98.70 lr 0.00010000 Epoch 70, weight, value: tensor([[-0.0276, -0.0177, -0.0233, ..., -0.0720, 0.0147, 0.0083], [-0.0199, 0.0174, -0.0809, ..., 0.0267, 0.0285, -0.0540], [ 0.0452, -0.0029, -0.0452, ..., -0.0276, -0.0104, -0.0083], ..., [ 0.0280, -0.0391, -0.0008, ..., 0.0243, 0.0003, 0.0320], [-0.0127, -0.0294, -0.0326, ..., -0.0043, -0.0117, -0.0150], [-0.0531, -0.0898, 0.0410, ..., -0.0447, -0.0032, 0.0093]], device='cuda:0'), grad: tensor([[-2.1040e-05, 1.4901e-08, 2.5287e-05, ..., 5.8301e-07, 1.3039e-06, 0.0000e+00], [-3.9265e-06, 1.8626e-09, 3.8929e-06, ..., 2.6822e-07, -2.9355e-05, 0.0000e+00], [ 2.5421e-05, -1.9185e-07, 1.0908e-05, ..., 2.4028e-07, 6.5304e-06, 0.0000e+00], ..., [-3.5763e-05, 5.7742e-08, 3.8091e-06, ..., 4.4703e-07, 1.3202e-05, 0.0000e+00], [ 4.1649e-06, 3.5390e-08, 2.1327e-06, ..., 6.3889e-07, 1.5311e-06, 0.0000e+00], [ 9.0748e-06, 1.8626e-09, 2.4605e-04, ..., 9.0152e-07, -4.3884e-06, 0.0000e+00]], device='cuda:0') Epoch 70, bias, value: tensor([-0.0059, 0.0271, 0.0073, 0.0026, 0.0070, -0.0035, 0.0207, -0.0235, 0.0211, 0.0028], device='cuda:0'), grad: tensor([-5.0545e-05, -5.3406e-05, 7.9989e-05, 2.6679e-04, -6.3086e-04, -1.8489e-04, 1.1188e-04, -2.3812e-05, -5.2899e-06, 4.9019e-04], device='cuda:0') 100 0.0001 changing lr epoch 69, time 217.57, cls_loss 0.0124 cls_loss_mapping 0.0166 cls_loss_causal 0.6367 re_mapping 0.0129 re_causal 0.0360 /// teacc 98.55 lr 0.00010000 Epoch 71, weight, value: tensor([[-0.0278, -0.0180, -0.0226, ..., -0.0724, 0.0145, 0.0083], [-0.0206, 0.0190, -0.0813, ..., 0.0269, 0.0288, -0.0541], [ 0.0452, -0.0032, -0.0453, ..., -0.0278, -0.0111, -0.0083], ..., [ 0.0292, -0.0418, -0.0010, ..., 0.0238, 0.0003, 0.0320], [-0.0134, -0.0293, -0.0328, ..., -0.0043, -0.0119, -0.0150], [-0.0540, -0.0909, 0.0407, ..., -0.0438, -0.0030, 0.0093]], device='cuda:0'), grad: tensor([[ 9.9465e-06, 1.1735e-07, 5.9120e-06, ..., 0.0000e+00, 3.9116e-06, 0.0000e+00], [ 5.7316e-04, 3.5204e-07, 1.4305e-05, ..., 0.0000e+00, 1.9228e-04, 0.0000e+00], [ 1.3344e-05, -3.5763e-06, 9.8124e-06, ..., 0.0000e+00, 4.1306e-05, 0.0000e+00], ..., [-6.3467e-04, 2.5537e-06, -1.3620e-05, ..., 0.0000e+00, -2.1434e-04, 0.0000e+00], [ 4.4584e-05, 2.5891e-07, 3.2842e-05, ..., 0.0000e+00, 8.7693e-06, 0.0000e+00], [ 1.7524e-05, 7.6368e-08, 7.8306e-06, ..., 0.0000e+00, -2.5794e-05, 0.0000e+00]], device='cuda:0') Epoch 71, bias, value: tensor([-0.0054, 0.0273, 0.0069, 0.0026, 0.0068, -0.0034, 0.0209, -0.0228, 0.0209, 0.0021], device='cuda:0'), grad: tensor([ 2.7493e-05, 1.0395e-03, -3.6693e-04, 1.1736e-04, 5.2404e-04, -1.9684e-03, 1.3933e-03, -1.0023e-03, 2.4009e-04, -2.1681e-06], device='cuda:0') 100 0.0001 changing lr epoch 70, time 217.65, cls_loss 0.0096 cls_loss_mapping 0.0137 cls_loss_causal 0.6347 re_mapping 0.0125 re_causal 0.0354 /// teacc 98.60 lr 0.00010000 Epoch 72, weight, value: tensor([[-2.8076e-02, -1.8209e-02, -2.2606e-02, ..., -7.2785e-02, 1.3884e-02, 8.2790e-03], [-2.1000e-02, 1.8953e-02, -8.1715e-02, ..., 2.7560e-02, 2.9278e-02, -5.4273e-02], [ 4.5328e-02, -2.9603e-03, -4.5887e-02, ..., -2.8830e-02, -1.1658e-02, -7.9245e-03], ..., [ 2.9706e-02, -4.2213e-02, -5.4599e-05, ..., 2.3662e-02, -8.8294e-05, 3.1973e-02], [-1.3631e-02, -2.9744e-02, -3.3184e-02, ..., -4.5730e-03, -1.1871e-02, -1.5075e-02], [-5.4218e-02, -9.1713e-02, 4.1074e-02, ..., -4.3564e-02, -2.3413e-03, 9.3247e-03]], device='cuda:0'), grad: tensor([[-5.3257e-05, 0.0000e+00, 4.0978e-05, ..., 1.2480e-07, 6.3553e-06, 0.0000e+00], [ 5.7034e-06, 0.0000e+00, 9.4920e-06, ..., -1.6868e-05, -2.9728e-05, 0.0000e+00], [ 8.7470e-06, 0.0000e+00, 1.0051e-05, ..., 1.1176e-06, 5.6326e-06, -9.3132e-09], ..., [ 5.7891e-06, 0.0000e+00, 5.3674e-05, ..., 3.6340e-06, 5.5254e-05, 0.0000e+00], [ 2.6226e-05, 0.0000e+00, 6.0536e-06, ..., 2.4773e-07, -4.4554e-06, 1.8626e-09], [ 6.3255e-06, 0.0000e+00, 6.3837e-05, ..., 9.2015e-07, -4.6134e-05, 0.0000e+00]], device='cuda:0') Epoch 72, bias, value: tensor([-0.0056, 0.0270, 0.0067, 0.0025, 0.0060, -0.0033, 0.0209, -0.0222, 0.0209, 0.0024], device='cuda:0'), grad: tensor([-3.5524e-04, 6.8918e-08, 1.2147e-04, 3.8385e-04, -1.1522e-04, -6.2704e-04, -1.8954e-05, 2.8110e-04, 2.0301e-04, 1.2720e-04], device='cuda:0') 100 0.0001 changing lr epoch 71, time 217.80, cls_loss 0.0099 cls_loss_mapping 0.0126 cls_loss_causal 0.6549 re_mapping 0.0128 re_causal 0.0372 /// teacc 98.77 lr 0.00010000 Epoch 73, weight, value: tensor([[-0.0276, -0.0184, -0.0229, ..., -0.0733, 0.0139, 0.0083], [-0.0213, 0.0189, -0.0816, ..., 0.0293, 0.0302, -0.0545], [ 0.0446, -0.0027, -0.0462, ..., -0.0308, -0.0128, -0.0083], ..., [ 0.0303, -0.0423, -0.0002, ..., 0.0235, -0.0001, 0.0324], [-0.0142, -0.0300, -0.0337, ..., -0.0053, -0.0121, -0.0152], [-0.0547, -0.0924, 0.0411, ..., -0.0437, -0.0021, 0.0093]], device='cuda:0'), grad: tensor([[ 1.6242e-06, 0.0000e+00, -1.7297e-04, ..., 5.3272e-07, 1.2666e-06, 0.0000e+00], [ 5.1230e-05, 0.0000e+00, 5.2564e-06, ..., 4.6402e-05, -1.1206e-05, 0.0000e+00], [-5.2661e-05, 0.0000e+00, 4.1425e-06, ..., 6.0163e-07, 8.9779e-07, 0.0000e+00], ..., [-5.5909e-05, 0.0000e+00, 1.0967e-05, ..., -5.7161e-05, 9.8050e-06, 0.0000e+00], [ 1.1679e-06, 0.0000e+00, 1.2025e-05, ..., 3.9674e-07, 2.4810e-06, 0.0000e+00], [ 4.8950e-06, 0.0000e+00, 1.2124e-04, ..., 6.9030e-06, -1.0848e-05, 0.0000e+00]], device='cuda:0') Epoch 73, bias, value: tensor([-0.0045, 0.0273, 0.0058, 0.0027, 0.0059, -0.0034, 0.0206, -0.0217, 0.0205, 0.0024], device='cuda:0'), grad: tensor([-7.5912e-04, 5.9128e-04, -6.1035e-05, 9.1970e-05, -3.3379e-04, 3.0175e-05, 6.6280e-04, -6.4182e-04, 5.1618e-05, 3.6740e-04], device='cuda:0') 100 0.0001 changing lr epoch 72, time 217.57, cls_loss 0.0129 cls_loss_mapping 0.0171 cls_loss_causal 0.6663 re_mapping 0.0127 re_causal 0.0349 /// teacc 98.62 lr 0.00010000 Epoch 74, weight, value: tensor([[-0.0277, -0.0190, -0.0246, ..., -0.0738, 0.0134, 0.0078], [-0.0223, 0.0185, -0.0821, ..., 0.0293, 0.0309, -0.0553], [ 0.0448, -0.0023, -0.0465, ..., -0.0308, -0.0133, -0.0077], ..., [ 0.0310, -0.0423, -0.0001, ..., 0.0239, -0.0003, 0.0325], [-0.0142, -0.0309, -0.0344, ..., -0.0055, -0.0121, -0.0163], [-0.0555, -0.0943, 0.0415, ..., -0.0436, -0.0013, 0.0097]], device='cuda:0'), grad: tensor([[ 6.0387e-06, 3.1665e-08, 1.0455e-04, ..., 1.9558e-07, 1.3933e-06, 4.6566e-08], [-2.6073e-03, -4.9174e-05, 8.5458e-06, ..., 2.5313e-06, 2.6897e-06, 5.5879e-09], [ 2.2202e-03, 4.1455e-05, 1.2144e-05, ..., 2.7753e-07, 4.5113e-06, -4.3958e-07], ..., [ 2.5654e-04, 5.5470e-06, 1.0699e-05, ..., -1.8671e-05, 2.0545e-06, 2.1048e-07], [ 1.6987e-05, 1.9744e-07, 1.4409e-05, ..., 8.3819e-08, -6.2399e-06, 4.0978e-08], [ 5.2214e-05, 4.0978e-08, 1.0767e-03, ..., 1.3493e-05, -2.0359e-06, 7.4506e-09]], device='cuda:0') Epoch 74, bias, value: tensor([-0.0050, 0.0271, 0.0057, 0.0025, 0.0058, -0.0030, 0.0199, -0.0212, 0.0201, 0.0029], device='cuda:0'), grad: tensor([ 1.8048e-04, -4.6577e-03, 4.0016e-03, 8.1718e-05, 3.7625e-06, -1.9464e-03, 2.7511e-06, 4.8470e-04, 3.1322e-05, 1.8177e-03], device='cuda:0') 100 0.0001 changing lr epoch 73, time 217.83, cls_loss 0.0099 cls_loss_mapping 0.0152 cls_loss_causal 0.6255 re_mapping 0.0126 re_causal 0.0362 /// teacc 98.67 lr 0.00010000 Epoch 75, weight, value: tensor([[-0.0281, -0.0192, -0.0255, ..., -0.0741, 0.0130, 0.0078], [-0.0216, 0.0191, -0.0827, ..., 0.0298, 0.0315, -0.0558], [ 0.0450, -0.0026, -0.0463, ..., -0.0315, -0.0137, -0.0075], ..., [ 0.0308, -0.0425, -0.0002, ..., 0.0239, -0.0010, 0.0326], [-0.0145, -0.0312, -0.0355, ..., -0.0056, -0.0123, -0.0165], [-0.0560, -0.0947, 0.0413, ..., -0.0436, -0.0015, 0.0097]], device='cuda:0'), grad: tensor([[ 4.6119e-06, 0.0000e+00, 6.6981e-06, ..., 6.7055e-08, 3.1859e-05, 2.0489e-07], [ 5.7220e-06, 0.0000e+00, 2.3358e-06, ..., 1.9558e-07, 3.3903e-04, 2.4214e-08], [ 1.8284e-05, 0.0000e+00, 1.1176e-08, ..., 2.5518e-07, 2.1055e-05, -6.8732e-07], ..., [-6.1058e-06, 0.0000e+00, -2.6077e-07, ..., 8.4005e-07, 5.3197e-05, 9.8720e-08], [ 1.2532e-05, 0.0000e+00, 1.9684e-05, ..., 1.2852e-07, -3.5381e-04, 1.0990e-07], [ 1.6481e-05, 0.0000e+00, 4.2245e-06, ..., 3.0231e-06, -7.0453e-05, 6.5193e-08]], device='cuda:0') Epoch 75, bias, value: tensor([-0.0054, 0.0278, 0.0057, 0.0021, 0.0069, -0.0031, 0.0206, -0.0215, 0.0197, 0.0025], device='cuda:0'), grad: tensor([ 1.7381e-04, 8.4591e-04, 6.5923e-05, -6.0111e-05, 8.6784e-05, -4.0359e-03, 3.4924e-03, 1.2183e-04, -4.9400e-04, -1.9872e-04], device='cuda:0') 100 0.0001 changing lr epoch 74, time 217.65, cls_loss 0.0095 cls_loss_mapping 0.0135 cls_loss_causal 0.6296 re_mapping 0.0118 re_causal 0.0349 /// teacc 98.65 lr 0.00010000 Epoch 76, weight, value: tensor([[-2.8399e-02, -1.9514e-02, -2.5600e-02, ..., -7.4149e-02, 1.2587e-02, 7.4560e-03], [-2.1964e-02, 2.1534e-02, -8.3086e-02, ..., 2.9884e-02, 3.1886e-02, -5.7684e-02], [ 4.5360e-02, -4.0404e-03, -4.5648e-02, ..., -3.1736e-02, -1.4179e-02, -6.7006e-03], ..., [ 3.1116e-02, -4.3402e-02, 9.1840e-05, ..., 2.4230e-02, -9.5036e-04, 3.2503e-02], [-1.5044e-02, -3.1580e-02, -3.5740e-02, ..., -5.6757e-03, -1.3008e-02, -1.7315e-02], [-5.6663e-02, -9.5329e-02, 4.0913e-02, ..., -4.3756e-02, -9.6210e-04, 9.7934e-03]], device='cuda:0'), grad: tensor([[ 7.1004e-06, 0.0000e+00, 8.8848e-07, ..., 3.9116e-07, 1.0788e-05, 1.7881e-07], [ 3.9116e-06, 0.0000e+00, 1.3895e-06, ..., -3.4243e-05, -2.4652e-04, 9.8720e-08], [ 2.9707e-04, 0.0000e+00, 1.1399e-06, ..., 5.3085e-07, 4.3124e-05, -2.5798e-06], ..., [ 2.3693e-05, 0.0000e+00, 2.4587e-06, ..., 3.8706e-06, 3.9846e-05, 4.7684e-07], [-1.6236e-04, 0.0000e+00, 9.8720e-07, ..., 2.9802e-07, -2.1088e-04, 7.6741e-07], [ 8.5160e-06, 0.0000e+00, 4.1574e-06, ..., 4.7944e-06, -3.5197e-05, 7.8231e-08]], device='cuda:0') Epoch 76, bias, value: tensor([-0.0049, 0.0282, 0.0057, 0.0022, 0.0063, -0.0026, 0.0197, -0.0212, 0.0190, 0.0024], device='cuda:0'), grad: tensor([-1.1748e-04, -4.1556e-04, 4.7827e-04, -2.1830e-06, 9.2149e-05, -3.1948e-05, 2.3785e-03, 1.1194e-04, -2.4166e-03, -7.4744e-05], device='cuda:0') 100 0.0001 changing lr epoch 75, time 217.53, cls_loss 0.0075 cls_loss_mapping 0.0109 cls_loss_causal 0.6068 re_mapping 0.0120 re_causal 0.0356 /// teacc 98.64 lr 0.00010000 Epoch 77, weight, value: tensor([[-2.8559e-02, -1.9567e-02, -2.5859e-02, ..., -7.4169e-02, 1.2412e-02, 7.1361e-03], [-2.1615e-02, 2.1567e-02, -8.3421e-02, ..., 3.0023e-02, 3.2002e-02, -5.9072e-02], [ 4.5470e-02, -3.6383e-03, -4.6237e-02, ..., -3.1782e-02, -1.4606e-02, -6.3415e-03], ..., [ 3.1047e-02, -4.3923e-02, 1.2460e-05, ..., 2.4196e-02, -1.2116e-03, 3.2991e-02], [-1.5367e-02, -3.1813e-02, -3.5822e-02, ..., -5.6927e-03, -1.2372e-02, -1.8245e-02], [-5.7009e-02, -9.5381e-02, 4.0660e-02, ..., -4.3789e-02, -5.2460e-04, 1.0220e-02]], device='cuda:0'), grad: tensor([[ 2.3916e-06, 0.0000e+00, -1.6969e-06, ..., 0.0000e+00, 1.8533e-06, 1.4156e-07], [ 1.2130e-05, 0.0000e+00, 4.8429e-07, ..., 0.0000e+00, -6.6012e-06, 2.4214e-08], [ 6.9700e-06, 0.0000e+00, 1.1772e-06, ..., 0.0000e+00, 2.9169e-06, 3.1665e-08], ..., [-1.7917e-04, 0.0000e+00, 1.1697e-06, ..., 0.0000e+00, 5.1945e-05, 5.5879e-09], [ 2.0468e-04, 0.0000e+00, 3.3528e-05, ..., 0.0000e+00, 4.0650e-05, 2.4773e-07], [ 2.4766e-05, 0.0000e+00, -1.6280e-06, ..., 0.0000e+00, -6.6042e-05, 2.0489e-08]], device='cuda:0') Epoch 77, bias, value: tensor([-0.0048, 0.0285, 0.0054, 0.0022, 0.0076, -0.0024, 0.0174, -0.0213, 0.0196, 0.0022], device='cuda:0'), grad: tensor([-9.1791e-06, 1.7196e-05, 4.1872e-05, -8.0395e-04, 9.6023e-05, 3.4380e-04, -1.3918e-05, -2.5916e-04, 5.5361e-04, 3.5286e-05], device='cuda:0') 100 0.0001 changing lr epoch 76, time 217.60, cls_loss 0.0090 cls_loss_mapping 0.0118 cls_loss_causal 0.6183 re_mapping 0.0120 re_causal 0.0344 /// teacc 98.77 lr 0.00010000 Epoch 78, weight, value: tensor([[-0.0289, -0.0196, -0.0262, ..., -0.0742, 0.0118, 0.0068], [-0.0222, 0.0217, -0.0851, ..., 0.0300, 0.0325, -0.0612], [ 0.0466, -0.0037, -0.0468, ..., -0.0318, -0.0142, -0.0054], ..., [ 0.0313, -0.0438, 0.0001, ..., 0.0243, -0.0014, 0.0335], [-0.0163, -0.0320, -0.0355, ..., -0.0057, -0.0129, -0.0190], [-0.0577, -0.0956, 0.0403, ..., -0.0438, -0.0003, 0.0103]], device='cuda:0'), grad: tensor([[ 2.5295e-06, 7.6368e-08, 1.7136e-06, ..., 7.4506e-09, 4.4405e-06, 2.4773e-07], [-3.1924e-04, -1.0008e-04, 1.5330e-06, ..., -4.2096e-07, -6.0129e-04, 4.8429e-08], [ 3.0065e-04, 9.2387e-05, 1.8813e-06, ..., 4.0978e-08, 5.3644e-04, 2.7195e-07], ..., [ 1.4678e-05, 4.0904e-06, 2.4028e-06, ..., 1.7695e-07, 3.1292e-05, 1.3039e-08], [ 3.8333e-06, 2.7008e-07, -2.9337e-06, ..., 3.7253e-08, 2.9244e-06, 1.6261e-06], [ 1.4510e-06, 7.8231e-08, 1.1213e-05, ..., 2.2352e-08, 6.5528e-06, 3.5390e-08]], device='cuda:0') Epoch 78, bias, value: tensor([-0.0048, 0.0283, 0.0063, 0.0019, 0.0073, -0.0020, 0.0177, -0.0210, 0.0189, 0.0017], device='cuda:0'), grad: tensor([ 4.7758e-06, -2.2602e-03, 2.0752e-03, -1.2435e-05, 7.8321e-05, 7.6950e-05, -4.9859e-05, 1.1832e-04, -9.5487e-05, 6.1035e-05], device='cuda:0') 100 0.0001 changing lr epoch 77, time 217.53, cls_loss 0.0098 cls_loss_mapping 0.0129 cls_loss_causal 0.6230 re_mapping 0.0113 re_causal 0.0339 /// teacc 98.74 lr 0.00010000 Epoch 79, weight, value: tensor([[-0.0286, -0.0196, -0.0263, ..., -0.0747, 0.0114, 0.0067], [-0.0218, 0.0221, -0.0858, ..., 0.0331, 0.0339, -0.0616], [ 0.0466, -0.0041, -0.0468, ..., -0.0320, -0.0141, -0.0048], ..., [ 0.0310, -0.0438, -0.0003, ..., 0.0228, -0.0034, 0.0334], [-0.0166, -0.0321, -0.0360, ..., -0.0059, -0.0131, -0.0194], [-0.0586, -0.0956, 0.0403, ..., -0.0447, 0.0010, 0.0103]], device='cuda:0'), grad: tensor([[ 9.2328e-05, 0.0000e+00, 1.8060e-05, ..., 1.2480e-07, 1.3292e-04, 3.7253e-09], [ 3.3855e-05, 0.0000e+00, 1.5460e-06, ..., 1.7025e-06, 8.7991e-06, 0.0000e+00], [-4.5121e-05, 0.0000e+00, 6.0461e-06, ..., 8.3633e-07, 6.4932e-06, 0.0000e+00], ..., [-7.1228e-06, 0.0000e+00, -4.9695e-06, ..., 1.5073e-05, 4.0442e-05, 0.0000e+00], [ 2.5570e-05, 0.0000e+00, -4.3809e-06, ..., 6.4634e-07, -3.2812e-05, 1.4901e-08], [-8.4221e-05, 0.0000e+00, -1.4782e-04, ..., 2.6580e-06, -2.7251e-04, 0.0000e+00]], device='cuda:0') Epoch 79, bias, value: tensor([-0.0045, 0.0297, 0.0063, 0.0016, 0.0076, -0.0024, 0.0181, -0.0226, 0.0188, 0.0018], device='cuda:0'), grad: tensor([ 4.7183e-04, 6.1333e-05, -6.2108e-05, -1.4615e-04, 6.9666e-04, 6.6876e-05, 9.8869e-06, 3.1382e-05, -5.8174e-05, -1.0710e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 78---------------------------------------------------- epoch 78, time 218.26, cls_loss 0.0097 cls_loss_mapping 0.0133 cls_loss_causal 0.6282 re_mapping 0.0121 re_causal 0.0346 /// teacc 98.79 lr 0.00010000 Epoch 80, weight, value: tensor([[-0.0289, -0.0198, -0.0275, ..., -0.0754, 0.0105, 0.0049], [-0.0217, 0.0221, -0.0861, ..., 0.0336, 0.0338, -0.0620], [ 0.0463, -0.0040, -0.0467, ..., -0.0322, -0.0146, -0.0046], ..., [ 0.0314, -0.0439, -0.0003, ..., 0.0226, -0.0035, 0.0333], [-0.0166, -0.0324, -0.0359, ..., -0.0063, -0.0126, -0.0197], [-0.0595, -0.0957, 0.0409, ..., -0.0447, 0.0018, 0.0122]], device='cuda:0'), grad: tensor([[ 7.0930e-05, 1.9558e-07, -5.0217e-06, ..., 1.3813e-05, 3.9786e-06, 3.7253e-09], [ 1.4639e-04, -1.7136e-06, 3.3509e-06, ..., 1.9018e-06, -8.3148e-05, 2.4214e-08], [-7.3957e-04, -7.9274e-06, 1.2778e-06, ..., 2.4438e-06, 4.7535e-05, 2.7940e-08], ..., [ 7.0858e-04, 6.4857e-06, 1.8418e-05, ..., 3.2043e-04, 6.5744e-05, -1.3970e-07], [ 7.5758e-05, 1.1027e-06, 2.8443e-06, ..., 1.7229e-06, -7.4692e-07, 5.5879e-09], [-5.4598e-04, 1.9372e-07, -3.1471e-05, ..., -3.9339e-04, -8.2970e-05, 1.4901e-08]], device='cuda:0') Epoch 80, bias, value: tensor([-0.0055, 0.0298, 0.0057, 0.0021, 0.0071, -0.0028, 0.0181, -0.0223, 0.0192, 0.0028], device='cuda:0'), grad: tensor([ 9.1434e-05, -9.4593e-05, -1.1358e-03, 4.4203e-04, 1.4544e-04, 1.0180e-04, 4.5955e-05, 1.7052e-03, 1.6081e-04, -1.4639e-03], device='cuda:0') 100 0.0001 changing lr epoch 79, time 217.32, cls_loss 0.0091 cls_loss_mapping 0.0132 cls_loss_causal 0.6258 re_mapping 0.0123 re_causal 0.0346 /// teacc 98.70 lr 0.00010000 Epoch 81, weight, value: tensor([[-0.0290, -0.0202, -0.0279, ..., -0.0756, 0.0100, 0.0050], [-0.0223, 0.0217, -0.0868, ..., 0.0338, 0.0339, -0.0622], [ 0.0470, -0.0040, -0.0466, ..., -0.0323, -0.0145, -0.0045], ..., [ 0.0315, -0.0419, -0.0005, ..., 0.0224, -0.0039, 0.0332], [-0.0165, -0.0328, -0.0365, ..., -0.0064, -0.0124, -0.0198], [-0.0605, -0.0962, 0.0407, ..., -0.0441, 0.0022, 0.0122]], device='cuda:0'), grad: tensor([[ 2.7776e-05, 8.9407e-08, -2.6405e-05, ..., 0.0000e+00, 6.1616e-06, 0.0000e+00], [ 1.4842e-04, 3.5390e-08, 6.4820e-06, ..., 0.0000e+00, 6.1333e-05, 0.0000e+00], [-1.3828e-04, 1.0803e-07, 4.7684e-06, ..., 0.0000e+00, 1.9372e-05, 0.0000e+00], ..., [-1.0990e-07, 1.4342e-07, 5.3495e-06, ..., 0.0000e+00, 4.9882e-06, 0.0000e+00], [-5.6219e-04, 1.0617e-07, 1.2136e-04, ..., 0.0000e+00, -2.1923e-04, 0.0000e+00], [-4.3064e-05, 3.5949e-07, 6.5923e-05, ..., 0.0000e+00, -1.8394e-04, 0.0000e+00]], device='cuda:0') Epoch 81, bias, value: tensor([-0.0056, 0.0293, 0.0064, 0.0021, 0.0071, -0.0025, 0.0179, -0.0224, 0.0191, 0.0028], device='cuda:0'), grad: tensor([-1.6165e-04, 3.9744e-04, -1.3697e-04, 1.2474e-03, 2.3350e-05, -1.8339e-03, 8.6641e-04, 6.3539e-05, -4.9400e-04, 2.9072e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 80---------------------------------------------------- epoch 80, time 218.07, cls_loss 0.0083 cls_loss_mapping 0.0122 cls_loss_causal 0.5981 re_mapping 0.0119 re_causal 0.0344 /// teacc 98.88 lr 0.00010000 Epoch 82, weight, value: tensor([[-0.0295, -0.0203, -0.0289, ..., -0.0757, 0.0097, 0.0042], [-0.0221, 0.0221, -0.0875, ..., 0.0342, 0.0343, -0.0623], [ 0.0472, -0.0043, -0.0468, ..., -0.0335, -0.0148, -0.0045], ..., [ 0.0320, -0.0418, -0.0002, ..., 0.0227, -0.0041, 0.0332], [-0.0169, -0.0341, -0.0370, ..., -0.0065, -0.0128, -0.0199], [-0.0610, -0.0966, 0.0413, ..., -0.0442, 0.0028, 0.0130]], device='cuda:0'), grad: tensor([[ 5.7220e-06, 1.7695e-07, -3.6713e-06, ..., 2.3805e-06, 1.2200e-06, 3.7253e-09], [ 4.9382e-05, 9.1456e-07, 2.1867e-06, ..., 1.8761e-05, -5.6028e-06, 3.7253e-09], [ 1.0651e-04, 4.0755e-06, 1.1295e-05, ..., 5.3078e-05, 1.7025e-06, -6.5193e-08], ..., [ 1.6308e-04, 2.2724e-05, 6.4746e-06, ..., 3.0184e-04, 2.1812e-06, 3.7253e-09], [ 3.4869e-05, 1.2256e-06, 7.9334e-05, ..., 1.4484e-05, 3.1199e-06, 2.0489e-08], [ 1.7846e-04, 1.9930e-07, -4.9400e-03, ..., 3.4031e-06, -1.0654e-06, 9.3132e-09]], device='cuda:0') Epoch 82, bias, value: tensor([-0.0060, 0.0293, 0.0065, 0.0020, 0.0067, -0.0030, 0.0182, -0.0220, 0.0185, 0.0036], device='cuda:0'), grad: tensor([-1.7142e-04, 7.7486e-05, 1.7238e-04, -8.3065e-04, 6.1455e-03, -1.7178e-04, 4.5568e-05, -2.8342e-05, 2.2936e-04, -5.4665e-03], device='cuda:0') 100 0.0001 changing lr epoch 81, time 217.48, cls_loss 0.0085 cls_loss_mapping 0.0113 cls_loss_causal 0.6304 re_mapping 0.0117 re_causal 0.0343 /// teacc 98.83 lr 0.00010000 Epoch 83, weight, value: tensor([[-0.0297, -0.0208, -0.0294, ..., -0.0767, 0.0092, 0.0033], [-0.0226, 0.0220, -0.0880, ..., 0.0343, 0.0345, -0.0626], [ 0.0475, -0.0028, -0.0478, ..., -0.0341, -0.0153, -0.0042], ..., [ 0.0323, -0.0433, -0.0006, ..., 0.0221, -0.0041, 0.0332], [-0.0169, -0.0308, -0.0373, ..., -0.0069, -0.0139, -0.0200], [-0.0618, -0.0982, 0.0415, ..., -0.0443, 0.0032, 0.0138]], device='cuda:0'), grad: tensor([[ 1.3448e-06, 3.7253e-08, -6.2063e-06, ..., 3.7253e-09, -1.6429e-06, 7.8231e-08], [ 4.7348e-06, 5.5879e-09, 1.9372e-07, ..., 1.8626e-09, 7.8604e-07, 3.1665e-08], [-5.2378e-06, -8.8662e-07, 2.8312e-07, ..., 0.0000e+00, 4.0755e-06, -9.8720e-07], ..., [-9.8944e-06, 8.0094e-08, 4.0792e-07, ..., 3.7253e-09, 8.3074e-07, 1.9744e-07], [ 1.0118e-05, 4.9919e-07, 2.5332e-07, ..., 5.5879e-09, -7.5065e-06, 5.2713e-07], [ 1.1526e-05, 1.1176e-08, 3.8259e-06, ..., 1.1176e-08, 4.9993e-06, 1.1176e-08]], device='cuda:0') Epoch 83, bias, value: tensor([-0.0063, 0.0290, 0.0062, 0.0021, 0.0065, -0.0021, 0.0192, -0.0217, 0.0179, 0.0033], device='cuda:0'), grad: tensor([-9.0420e-05, 1.2666e-05, 2.2035e-06, -5.9414e-04, 3.4347e-06, 5.9795e-04, 2.0131e-05, -1.6093e-05, -5.9605e-06, 6.9916e-05], device='cuda:0') 100 0.0001 changing lr epoch 82, time 217.31, cls_loss 0.0095 cls_loss_mapping 0.0135 cls_loss_causal 0.6320 re_mapping 0.0115 re_causal 0.0318 /// teacc 98.85 lr 0.00010000 Epoch 84, weight, value: tensor([[-0.0307, -0.0219, -0.0295, ..., -0.0770, 0.0084, 0.0033], [-0.0229, 0.0219, -0.0883, ..., 0.0352, 0.0349, -0.0627], [ 0.0477, -0.0026, -0.0476, ..., -0.0344, -0.0156, -0.0041], ..., [ 0.0325, -0.0433, -0.0002, ..., 0.0227, -0.0041, 0.0332], [-0.0162, -0.0286, -0.0376, ..., -0.0074, -0.0139, -0.0201], [-0.0632, -0.0995, 0.0413, ..., -0.0446, 0.0031, 0.0140]], device='cuda:0'), grad: tensor([[ 7.9051e-06, 0.0000e+00, 5.6066e-07, ..., 1.1027e-06, 1.5385e-06, 1.8626e-09], [ 3.9554e-04, 0.0000e+00, 7.1339e-07, ..., 9.8348e-05, -3.7253e-07, 1.8626e-09], [-3.3236e-04, 0.0000e+00, 2.4214e-06, ..., -1.3435e-04, 7.2829e-07, -3.3528e-08], ..., [-4.0102e-04, 0.0000e+00, -4.8429e-06, ..., 8.5086e-06, 1.6112e-06, 3.7253e-09], [ 1.6546e-04, 0.0000e+00, 5.8860e-06, ..., 2.3358e-06, 3.5595e-06, 1.8626e-09], [ 1.1906e-05, 0.0000e+00, 1.7118e-06, ..., 6.2957e-07, -3.7514e-06, 0.0000e+00]], device='cuda:0') Epoch 84, bias, value: tensor([-0.0060, 0.0289, 0.0062, 0.0016, 0.0061, -0.0018, 0.0198, -0.0207, 0.0182, 0.0022], device='cuda:0'), grad: tensor([ 6.0052e-06, 7.1144e-04, -5.6505e-04, 2.3878e-04, 3.9935e-05, -3.9458e-05, 2.0295e-05, -7.6151e-04, 3.2234e-04, 2.7657e-05], device='cuda:0') 100 0.0001 changing lr epoch 83, time 217.22, cls_loss 0.0081 cls_loss_mapping 0.0107 cls_loss_causal 0.6117 re_mapping 0.0115 re_causal 0.0324 /// teacc 98.82 lr 0.00010000 Epoch 85, weight, value: tensor([[-3.1685e-02, -2.2533e-02, -2.9514e-02, ..., -7.7276e-02, 7.8431e-03, 8.4042e-04], [-2.2637e-02, 2.1747e-02, -8.8562e-02, ..., 3.5310e-02, 3.5274e-02, -6.3473e-02], [ 4.9551e-02, -4.4498e-04, -4.7656e-02, ..., -3.4386e-02, -1.5733e-02, -3.9914e-03], ..., [ 3.0931e-02, -4.5407e-02, -4.3666e-05, ..., 2.3128e-02, -4.6640e-03, 3.3222e-02], [-1.6874e-02, -2.8735e-02, -3.7578e-02, ..., -7.5971e-03, -1.3717e-02, -2.0330e-02], [-6.3224e-02, -1.0058e-01, 4.1021e-02, ..., -4.4998e-02, 3.1767e-03, 1.6374e-02]], device='cuda:0'), grad: tensor([[-2.7150e-05, 3.7253e-09, 1.3225e-07, ..., 1.3039e-08, 6.4224e-06, 3.7253e-09], [ 5.2787e-06, 3.7253e-09, 1.2293e-07, ..., -2.7940e-08, 5.6601e-04, 1.8626e-09], [-2.3559e-05, -7.0781e-08, 2.4214e-08, ..., 3.5390e-08, 1.1571e-05, -3.9116e-08], ..., [ 7.3686e-06, 1.8626e-08, 2.0117e-07, ..., 3.3528e-08, 3.8669e-06, 7.4506e-09], [ 2.4468e-05, 2.0489e-08, 3.0156e-06, ..., 4.2841e-08, -7.7391e-04, 1.3039e-08], [ 4.1910e-06, 0.0000e+00, 1.2629e-06, ..., 3.5390e-08, 6.4783e-06, 0.0000e+00]], device='cuda:0') Epoch 85, bias, value: tensor([-0.0068, 0.0291, 0.0075, 0.0017, 0.0062, -0.0014, 0.0201, -0.0219, 0.0181, 0.0021], device='cuda:0'), grad: tensor([-7.9930e-05, 1.6823e-03, 1.3895e-05, 4.8667e-05, 2.1145e-05, 8.5354e-05, 3.5524e-04, 2.8700e-05, -2.1915e-03, 3.5554e-05], device='cuda:0') 100 0.0001 changing lr epoch 84, time 217.28, cls_loss 0.0073 cls_loss_mapping 0.0105 cls_loss_causal 0.6257 re_mapping 0.0116 re_causal 0.0334 /// teacc 98.88 lr 0.00010000 Epoch 86, weight, value: tensor([[-3.1825e-02, -2.3469e-02, -2.9618e-02, ..., -7.7369e-02, 7.4945e-03, 5.2248e-04], [-2.2788e-02, 2.1964e-02, -8.8911e-02, ..., 3.5498e-02, 3.5503e-02, -6.3982e-02], [ 4.9456e-02, 4.7805e-05, -4.8111e-02, ..., -3.4768e-02, -1.6050e-02, -3.4699e-03], ..., [ 3.1461e-02, -4.5332e-02, -9.5766e-05, ..., 2.3146e-02, -4.7482e-03, 3.3152e-02], [-1.6918e-02, -2.9669e-02, -3.7855e-02, ..., -7.7859e-03, -1.3847e-02, -2.0583e-02], [-6.3803e-02, -1.0353e-01, 4.1037e-02, ..., -4.5113e-02, 3.5997e-03, 1.6679e-02]], device='cuda:0'), grad: tensor([[ 1.1260e-06, 1.7695e-08, 1.9503e-04, ..., 6.5193e-09, 3.2596e-07, 3.6322e-08], [ 1.2983e-06, 6.2399e-08, 6.6198e-06, ..., -2.1420e-08, -4.6790e-06, 1.2107e-08], [-1.0736e-05, -7.5437e-07, 4.1395e-05, ..., 3.2596e-08, 1.0859e-06, -6.5099e-07], ..., [-2.7418e-06, 2.0489e-08, 1.0639e-05, ..., 4.2841e-08, 2.0191e-06, 2.0489e-08], [ 5.7966e-06, 3.8277e-07, 8.2552e-05, ..., 5.3085e-08, 2.1383e-06, 3.4086e-07], [ 1.7062e-06, 5.5879e-09, -1.0139e-04, ..., 1.1269e-07, -1.9521e-06, 4.0978e-08]], device='cuda:0') Epoch 86, bias, value: tensor([-0.0065, 0.0290, 0.0072, 0.0018, 0.0061, -0.0014, 0.0200, -0.0214, 0.0178, 0.0020], device='cuda:0'), grad: tensor([ 3.5858e-04, 4.9174e-06, 6.3777e-05, 4.7207e-05, -8.9359e-04, 2.5129e-04, 1.4913e-04, 2.2173e-05, 1.9062e-04, -1.9395e-04], device='cuda:0') 100 0.0001 changing lr epoch 85, time 217.40, cls_loss 0.0081 cls_loss_mapping 0.0098 cls_loss_causal 0.5964 re_mapping 0.0113 re_causal 0.0326 /// teacc 98.49 lr 0.00010000 Epoch 87, weight, value: tensor([[-3.1343e-02, -2.4470e-02, -2.9745e-02, ..., -7.7509e-02, 7.6419e-03, 5.0189e-04], [-2.2495e-02, 2.0924e-02, -8.9341e-02, ..., 3.6480e-02, 3.6347e-02, -6.4996e-02], [ 4.9868e-02, 1.4075e-03, -4.8820e-02, ..., -3.4958e-02, -1.5985e-02, -3.3829e-03], ..., [ 3.1334e-02, -4.5443e-02, 5.2919e-05, ..., 2.3273e-02, -5.9362e-03, 3.3070e-02], [-1.6709e-02, -3.0405e-02, -3.7853e-02, ..., -7.9055e-03, -1.3858e-02, -2.2335e-02], [-6.4422e-02, -1.0611e-01, 4.0836e-02, ..., -4.5450e-02, 3.9890e-03, 1.6682e-02]], device='cuda:0'), grad: tensor([[ 4.0978e-07, 9.3132e-10, -1.0170e-05, ..., 0.0000e+00, 5.4296e-07, 7.4506e-09], [ 1.5544e-06, 9.3132e-10, 2.5164e-06, ..., 0.0000e+00, -3.7178e-06, 6.5193e-09], [-1.3791e-05, -1.3970e-08, 5.5507e-07, ..., 0.0000e+00, 8.4005e-07, -9.8720e-08], ..., [-8.5030e-07, 9.3132e-09, 1.6736e-06, ..., 0.0000e+00, 5.3823e-05, 4.9360e-08], [ 7.9200e-06, 9.3132e-10, 6.5506e-05, ..., 0.0000e+00, 9.7379e-06, 1.0245e-08], [ 1.4817e-06, 0.0000e+00, 1.8880e-05, ..., 0.0000e+00, -1.5497e-04, 1.8626e-09]], device='cuda:0') Epoch 87, bias, value: tensor([-0.0061, 0.0293, 0.0075, 0.0013, 0.0064, -0.0016, 0.0196, -0.0218, 0.0183, 0.0017], device='cuda:0'), grad: tensor([-1.3685e-04, 2.7679e-06, -4.1686e-06, 8.9228e-05, 3.0661e-04, -2.4045e-04, 2.9087e-05, 2.0981e-04, 2.5105e-04, -5.0783e-04], device='cuda:0') 100 0.0001 changing lr epoch 86, time 217.68, cls_loss 0.0067 cls_loss_mapping 0.0099 cls_loss_causal 0.5950 re_mapping 0.0114 re_causal 0.0325 /// teacc 98.70 lr 0.00010000 Epoch 88, weight, value: tensor([[-0.0312, -0.0246, -0.0295, ..., -0.0782, 0.0075, 0.0002], [-0.0228, 0.0209, -0.0900, ..., 0.0364, 0.0365, -0.0667], [ 0.0502, 0.0013, -0.0498, ..., -0.0350, -0.0165, -0.0029], ..., [ 0.0315, -0.0452, 0.0002, ..., 0.0235, -0.0060, 0.0331], [-0.0170, -0.0305, -0.0380, ..., -0.0079, -0.0142, -0.0247], [-0.0646, -0.1065, 0.0410, ..., -0.0454, 0.0050, 0.0170]], device='cuda:0'), grad: tensor([[ 4.7907e-06, 0.0000e+00, -5.7649e-07, ..., 0.0000e+00, 1.0449e-06, 6.6496e-07], [ 1.1303e-05, 0.0000e+00, 2.1413e-05, ..., 4.6566e-09, -1.8990e-06, 2.6338e-06], [-3.0100e-05, 0.0000e+00, 4.1813e-05, ..., 1.8626e-09, -6.7204e-06, 4.8764e-06], ..., [ 1.7524e-05, 0.0000e+00, 3.8296e-06, ..., 3.2596e-08, 4.2953e-06, 1.5181e-07], [ 8.2701e-06, 0.0000e+00, -2.0396e-06, ..., 9.3132e-10, -4.3735e-06, 3.8464e-07], [ 2.4214e-06, 0.0000e+00, 6.9737e-05, ..., 8.4843e-07, -4.8727e-06, 1.0524e-07]], device='cuda:0') Epoch 88, bias, value: tensor([-0.0055, 0.0290, 0.0079, 0.0007, 0.0065, -0.0022, 0.0192, -0.0216, 0.0183, 0.0020], device='cuda:0'), grad: tensor([ 1.0412e-06, 7.9870e-05, 1.3269e-05, 2.9147e-05, 1.2331e-03, 7.5459e-05, -1.6127e-03, 5.8949e-05, 8.2776e-06, 1.1331e-04], device='cuda:0') 100 0.0001 changing lr epoch 87, time 217.48, cls_loss 0.0085 cls_loss_mapping 0.0108 cls_loss_causal 0.6280 re_mapping 0.0110 re_causal 0.0322 /// teacc 98.77 lr 0.00010000 Epoch 89, weight, value: tensor([[-0.0314, -0.0251, -0.0293, ..., -0.0785, 0.0071, 0.0002], [-0.0230, 0.0212, -0.0907, ..., 0.0371, 0.0365, -0.0686], [ 0.0491, 0.0004, -0.0504, ..., -0.0356, -0.0174, -0.0023], ..., [ 0.0328, -0.0442, 0.0004, ..., 0.0232, -0.0067, 0.0329], [-0.0175, -0.0311, -0.0381, ..., -0.0081, -0.0145, -0.0269], [-0.0650, -0.1072, 0.0408, ..., -0.0456, 0.0066, 0.0170]], device='cuda:0'), grad: tensor([[ 1.4510e-06, 6.4261e-08, 5.1036e-07, ..., 0.0000e+00, -3.2574e-05, 4.5635e-08], [ 3.4392e-05, 2.6412e-06, 1.1539e-06, ..., 0.0000e+00, -6.3963e-06, 2.1420e-08], [-2.8417e-05, -4.3958e-06, 6.0070e-07, ..., 0.0000e+00, 5.9754e-06, -2.8312e-07], ..., [ 2.1935e-05, 1.0682e-06, 4.6752e-07, ..., 0.0000e+00, 4.7982e-06, 2.9802e-08], [ 1.4707e-05, 6.3330e-08, -4.3139e-06, ..., 0.0000e+00, 1.6503e-06, 3.7253e-08], [ 4.8801e-06, 4.8429e-08, 3.6657e-05, ..., 0.0000e+00, 1.2271e-05, 9.3132e-09]], device='cuda:0') Epoch 89, bias, value: tensor([-0.0057, 0.0289, 0.0068, 0.0005, 0.0063, -0.0010, 0.0189, -0.0212, 0.0179, 0.0025], device='cuda:0'), grad: tensor([-6.7234e-04, 4.2140e-05, -4.7266e-05, -4.3631e-05, -1.5020e-04, 5.5313e-05, 5.9032e-04, 4.2647e-05, 1.5251e-05, 1.6809e-04], device='cuda:0') 100 0.0001 changing lr epoch 88, time 217.44, cls_loss 0.0074 cls_loss_mapping 0.0116 cls_loss_causal 0.5915 re_mapping 0.0114 re_causal 0.0316 /// teacc 98.73 lr 0.00010000 Epoch 90, weight, value: tensor([[-3.1579e-02, -2.6152e-02, -2.9042e-02, ..., -7.8652e-02, 6.8017e-03, 1.4344e-04], [-2.3562e-02, 2.0799e-02, -9.0943e-02, ..., 3.7086e-02, 3.6621e-02, -7.0911e-02], [ 4.7967e-02, -5.5387e-04, -5.0581e-02, ..., -3.5739e-02, -1.8544e-02, -1.5847e-03], ..., [ 3.4704e-02, -4.2498e-02, 4.0799e-05, ..., 2.3258e-02, -6.1374e-03, 3.2744e-02], [-1.8080e-02, -3.3078e-02, -3.8440e-02, ..., -8.2613e-03, -1.4512e-02, -2.7978e-02], [-6.5961e-02, -1.0942e-01, 4.0714e-02, ..., -4.5746e-02, 7.1932e-03, 1.6938e-02]], device='cuda:0'), grad: tensor([[ 9.8124e-06, 1.8347e-07, 2.8126e-07, ..., 6.7707e-07, 3.8259e-06, 7.4506e-09], [ 4.4203e-04, 1.8571e-06, 2.7195e-07, ..., 7.1898e-06, 8.2016e-05, 1.8626e-09], [-1.6975e-04, -7.3671e-05, 8.1956e-08, ..., 3.5483e-07, 2.9877e-05, -5.5879e-09], ..., [-2.3403e-03, 7.0214e-05, 1.0729e-06, ..., -3.7074e-05, -1.5855e-04, 4.6566e-09], [ 8.0824e-05, 6.4634e-07, 8.0653e-07, ..., 4.1258e-07, 2.5138e-05, 2.0489e-08], [ 1.4281e-04, 3.3528e-08, 9.7230e-07, ..., 2.3738e-05, 2.8461e-05, 3.7253e-09]], device='cuda:0') Epoch 90, bias, value: tensor([-0.0057, 0.0286, 0.0057, 0.0009, 0.0060, -0.0014, 0.0198, -0.0198, 0.0174, 0.0025], device='cuda:0'), grad: tensor([-7.1563e-06, 1.0080e-03, -9.0539e-05, 2.0084e-03, 1.7658e-05, 2.0057e-05, -1.9744e-06, -3.4561e-03, 1.9920e-04, 3.0518e-04], device='cuda:0') 100 0.0001 changing lr epoch 89, time 217.67, cls_loss 0.0073 cls_loss_mapping 0.0093 cls_loss_causal 0.5837 re_mapping 0.0107 re_causal 0.0308 /// teacc 98.78 lr 0.00010000 Epoch 91, weight, value: tensor([[-3.1746e-02, -2.7109e-02, -2.9686e-02, ..., -7.8891e-02, 6.5501e-03, -1.6570e-03], [-2.3629e-02, 2.0780e-02, -9.1575e-02, ..., 3.6932e-02, 3.7027e-02, -7.1353e-02], [ 4.8177e-02, 2.5558e-05, -5.0932e-02, ..., -3.5760e-02, -1.8929e-02, -1.5281e-03], ..., [ 3.4894e-02, -4.3069e-02, 1.3605e-04, ..., 2.3350e-02, -6.3779e-03, 3.2673e-02], [-1.8058e-02, -3.6473e-02, -3.9193e-02, ..., -8.4988e-03, -1.4524e-02, -2.8189e-02], [-6.6684e-02, -1.1092e-01, 4.1127e-02, ..., -4.6283e-02, 7.5552e-03, 1.8712e-02]], device='cuda:0'), grad: tensor([[ 1.2424e-06, 2.1420e-08, 2.6003e-06, ..., 0.0000e+00, -2.1964e-05, 2.1607e-07], [ 1.0744e-05, 6.5193e-09, 2.0824e-06, ..., 0.0000e+00, 6.2399e-07, 2.3283e-08], [-2.1346e-06, -2.1234e-07, 1.6196e-06, ..., 0.0000e+00, 4.5374e-06, 3.0361e-07], ..., [-1.4091e-04, 6.5193e-08, 1.2927e-06, ..., 0.0000e+00, -6.7055e-05, 4.4703e-08], [ 5.2713e-06, 8.1025e-08, 1.5963e-06, ..., 0.0000e+00, 5.4426e-06, 3.8184e-08], [ 1.2803e-04, 2.7940e-09, 1.3679e-05, ..., 0.0000e+00, 7.4387e-05, -7.7114e-07]], device='cuda:0') Epoch 91, bias, value: tensor([-0.0062, 0.0285, 0.0056, 0.0005, 0.0059, -0.0013, 0.0196, -0.0199, 0.0176, 0.0032], device='cuda:0'), grad: tensor([-2.4843e-04, 2.7999e-05, 1.0908e-05, 1.1347e-05, -4.0501e-05, 1.3694e-05, 1.4402e-05, -4.5466e-04, 3.1620e-05, 6.3324e-04], device='cuda:0') 100 0.0001 changing lr epoch 90, time 217.48, cls_loss 0.0071 cls_loss_mapping 0.0088 cls_loss_causal 0.6120 re_mapping 0.0111 re_causal 0.0317 /// teacc 98.75 lr 0.00010000 Epoch 92, weight, value: tensor([[-0.0322, -0.0274, -0.0297, ..., -0.0794, 0.0062, -0.0017], [-0.0244, 0.0209, -0.0918, ..., 0.0371, 0.0372, -0.0720], [ 0.0483, 0.0001, -0.0511, ..., -0.0358, -0.0193, -0.0015], ..., [ 0.0347, -0.0432, 0.0004, ..., 0.0243, -0.0064, 0.0330], [-0.0170, -0.0365, -0.0396, ..., -0.0088, -0.0141, -0.0284], [-0.0680, -0.1112, 0.0409, ..., -0.0468, 0.0070, 0.0187]], device='cuda:0'), grad: tensor([[ 1.1742e-05, 0.0000e+00, 1.6717e-06, ..., 0.0000e+00, 2.2665e-05, 4.1723e-07], [ 3.9458e-05, 0.0000e+00, 2.5239e-06, ..., 0.0000e+00, 9.7901e-06, 1.8626e-09], [-3.8058e-05, 0.0000e+00, 9.5833e-07, ..., 0.0000e+00, 3.7700e-06, 2.7940e-09], ..., [ 2.9221e-05, 0.0000e+00, 2.1160e-06, ..., 0.0000e+00, 5.3123e-06, 9.3132e-10], [ 1.7121e-05, 0.0000e+00, 6.3255e-06, ..., 0.0000e+00, -2.2724e-06, 1.1176e-08], [ 3.3647e-05, 0.0000e+00, 4.2655e-07, ..., 0.0000e+00, -2.1141e-06, 1.1176e-08]], device='cuda:0') Epoch 92, bias, value: tensor([-0.0058, 0.0281, 0.0056, 0.0013, 0.0061, -0.0017, 0.0193, -0.0199, 0.0184, 0.0025], device='cuda:0'), grad: tensor([ 2.1122e-06, 1.1557e-04, 6.3404e-06, 1.5147e-05, -7.5400e-05, 2.9594e-05, -2.5630e-04, 6.0707e-05, 4.5806e-05, 5.6356e-05], device='cuda:0') 100 0.0001 changing lr epoch 91, time 217.71, cls_loss 0.0082 cls_loss_mapping 0.0113 cls_loss_causal 0.5898 re_mapping 0.0106 re_causal 0.0300 /// teacc 98.74 lr 0.00010000 Epoch 93, weight, value: tensor([[-0.0327, -0.0276, -0.0300, ..., -0.0796, 0.0058, -0.0017], [-0.0256, 0.0210, -0.0923, ..., 0.0383, 0.0380, -0.0738], [ 0.0484, 0.0001, -0.0511, ..., -0.0359, -0.0198, -0.0015], ..., [ 0.0346, -0.0431, 0.0007, ..., 0.0239, -0.0064, 0.0339], [-0.0152, -0.0369, -0.0400, ..., -0.0089, -0.0150, -0.0288], [-0.0696, -0.1117, 0.0409, ..., -0.0471, 0.0072, 0.0187]], device='cuda:0'), grad: tensor([[ 1.6345e-06, 0.0000e+00, -2.1327e-07, ..., 0.0000e+00, 9.4995e-07, 1.0245e-08], [ 9.8720e-06, 0.0000e+00, 2.3022e-06, ..., 0.0000e+00, 5.2676e-06, 8.3819e-08], [ 1.1444e-05, 0.0000e+00, 3.8836e-07, ..., 0.0000e+00, 2.6487e-06, 1.3970e-08], ..., [-7.4387e-05, 0.0000e+00, -2.9467e-06, ..., 0.0000e+00, -1.2696e-05, 6.3330e-08], [ 7.1935e-06, 0.0000e+00, 2.0880e-06, ..., 0.0000e+00, 2.8312e-05, 6.5193e-09], [ 4.0323e-05, 0.0000e+00, -8.0705e-05, ..., 0.0000e+00, -2.6989e-04, 1.1269e-07]], device='cuda:0') Epoch 93, bias, value: tensor([-0.0055, 0.0282, 0.0057, 0.0009, 0.0062, -0.0014, 0.0195, -0.0204, 0.0192, 0.0019], device='cuda:0'), grad: tensor([-3.8457e-04, 3.9786e-05, 2.4736e-05, -1.7537e-06, 8.3256e-04, 2.4509e-04, 1.2743e-04, -1.9467e-04, 9.7811e-05, -7.8630e-04], device='cuda:0') 100 0.0001 changing lr epoch 92, time 217.78, cls_loss 0.0079 cls_loss_mapping 0.0109 cls_loss_causal 0.6241 re_mapping 0.0104 re_causal 0.0300 /// teacc 98.67 lr 0.00010000 Epoch 94, weight, value: tensor([[-0.0335, -0.0284, -0.0298, ..., -0.0800, 0.0075, -0.0018], [-0.0254, 0.0211, -0.0932, ..., 0.0390, 0.0385, -0.0767], [ 0.0486, 0.0006, -0.0513, ..., -0.0360, -0.0200, -0.0013], ..., [ 0.0347, -0.0436, 0.0005, ..., 0.0237, -0.0067, 0.0343], [-0.0154, -0.0376, -0.0412, ..., -0.0090, -0.0151, -0.0310], [-0.0708, -0.1126, 0.0407, ..., -0.0479, 0.0077, 0.0189]], device='cuda:0'), grad: tensor([[ 3.9227e-06, 0.0000e+00, 1.1586e-05, ..., 1.3132e-07, 4.2245e-06, 0.0000e+00], [ 2.0355e-05, 0.0000e+00, 6.8992e-06, ..., 2.9057e-07, 2.5891e-07, 0.0000e+00], [-2.3365e-05, 0.0000e+00, -2.0117e-06, ..., 6.9849e-08, 6.9384e-07, 0.0000e+00], ..., [-2.1830e-05, 0.0000e+00, 7.8380e-06, ..., 2.4494e-07, 6.2808e-06, 0.0000e+00], [ 5.6356e-05, 0.0000e+00, 1.0610e-05, ..., 5.4203e-07, 1.2092e-05, 0.0000e+00], [-8.9686e-07, 0.0000e+00, 3.0994e-05, ..., 8.9407e-08, -2.6330e-05, 0.0000e+00]], device='cuda:0') Epoch 94, bias, value: tensor([-0.0044, 0.0283, 0.0060, 0.0006, 0.0068, -0.0011, 0.0183, -0.0203, 0.0190, 0.0012], device='cuda:0'), grad: tensor([ 6.1356e-06, 5.4240e-05, -3.8385e-05, 3.0786e-05, 2.3693e-05, -1.9336e-04, 2.2739e-05, -1.4365e-05, 1.3149e-04, -2.3007e-05], device='cuda:0') 100 0.0001 changing lr epoch 93, time 217.53, cls_loss 0.0064 cls_loss_mapping 0.0085 cls_loss_causal 0.5821 re_mapping 0.0112 re_causal 0.0301 /// teacc 98.77 lr 0.00010000 Epoch 95, weight, value: tensor([[-0.0337, -0.0289, -0.0299, ..., -0.0801, 0.0071, -0.0018], [-0.0257, 0.0209, -0.0937, ..., 0.0391, 0.0383, -0.0771], [ 0.0490, -0.0002, -0.0512, ..., -0.0360, -0.0202, -0.0012], ..., [ 0.0351, -0.0427, 0.0004, ..., 0.0236, -0.0069, 0.0343], [-0.0163, -0.0364, -0.0413, ..., -0.0091, -0.0152, -0.0313], [-0.0713, -0.1136, 0.0402, ..., -0.0481, 0.0087, 0.0189]], device='cuda:0'), grad: tensor([[ 2.8685e-06, 0.0000e+00, -4.3027e-07, ..., 0.0000e+00, 4.1053e-06, 0.0000e+00], [ 5.6252e-06, 0.0000e+00, 1.0505e-06, ..., 0.0000e+00, 3.9935e-06, 0.0000e+00], [-2.7657e-04, 0.0000e+00, 1.4780e-06, ..., 0.0000e+00, -5.3763e-05, 0.0000e+00], ..., [ 5.6103e-06, 0.0000e+00, 3.8296e-06, ..., 0.0000e+00, 2.5593e-06, 0.0000e+00], [ 4.9353e-05, 0.0000e+00, 5.8208e-07, ..., 0.0000e+00, -5.4762e-06, 0.0000e+00], [ 1.3048e-06, 0.0000e+00, 9.8441e-07, ..., 0.0000e+00, -5.4687e-06, 0.0000e+00]], device='cuda:0') Epoch 95, bias, value: tensor([-0.0041, 0.0277, 0.0060, 0.0004, 0.0066, -0.0012, 0.0192, -0.0197, 0.0181, 0.0013], device='cuda:0'), grad: tensor([-1.1310e-05, 2.5257e-05, -3.2830e-04, 2.7680e-04, 8.7693e-06, 1.5780e-05, -4.0382e-05, 5.8264e-06, 2.8074e-05, 1.9282e-05], device='cuda:0') 100 0.0001 changing lr epoch 94, time 217.70, cls_loss 0.0068 cls_loss_mapping 0.0090 cls_loss_causal 0.5953 re_mapping 0.0108 re_causal 0.0296 /// teacc 98.76 lr 0.00010000 Epoch 96, weight, value: tensor([[-0.0332, -0.0295, -0.0301, ..., -0.0801, 0.0071, -0.0027], [-0.0264, 0.0209, -0.0953, ..., 0.0391, 0.0382, -0.0777], [ 0.0489, -0.0002, -0.0499, ..., -0.0361, -0.0207, -0.0007], ..., [ 0.0356, -0.0426, 0.0003, ..., 0.0236, -0.0074, 0.0342], [-0.0162, -0.0369, -0.0418, ..., -0.0092, -0.0156, -0.0314], [-0.0722, -0.1155, 0.0398, ..., -0.0483, 0.0097, 0.0198]], device='cuda:0'), grad: tensor([[ 1.5683e-06, 0.0000e+00, 9.8534e-07, ..., 0.0000e+00, 3.6806e-06, 2.4214e-08], [-9.3639e-05, 0.0000e+00, 3.6042e-07, ..., 0.0000e+00, -2.0456e-04, 1.8626e-09], [-2.2817e-06, 0.0000e+00, 1.1735e-07, ..., 0.0000e+00, 6.6198e-06, -9.6858e-08], ..., [-1.4678e-05, 0.0000e+00, 2.6356e-07, ..., 0.0000e+00, 1.8373e-05, 2.0489e-08], [ 7.4446e-05, 0.0000e+00, 1.0848e-05, ..., 0.0000e+00, 1.9288e-04, 1.8626e-08], [ 3.4552e-06, 0.0000e+00, 1.7444e-06, ..., 0.0000e+00, -5.3734e-05, 6.5193e-09]], device='cuda:0') Epoch 96, bias, value: tensor([-3.8285e-03, 2.7058e-02, 5.7196e-03, 4.5091e-05, 6.8383e-03, -1.6129e-03, 1.9928e-02, -1.9388e-02, 1.8173e-02, 1.3574e-03], device='cuda:0'), grad: tensor([ 1.0476e-05, -4.3154e-04, 7.2643e-06, 4.7475e-05, 3.8713e-05, -1.9848e-04, 1.8823e-04, 2.1607e-05, 4.4274e-04, -1.2684e-04], device='cuda:0') 100 0.0001 changing lr epoch 95, time 217.70, cls_loss 0.0067 cls_loss_mapping 0.0095 cls_loss_causal 0.5823 re_mapping 0.0103 re_causal 0.0293 /// teacc 98.85 lr 0.00010000 Epoch 97, weight, value: tensor([[-0.0331, -0.0296, -0.0303, ..., -0.0802, 0.0070, -0.0033], [-0.0267, 0.0209, -0.0961, ..., 0.0392, 0.0389, -0.0779], [ 0.0491, -0.0002, -0.0502, ..., -0.0361, -0.0209, -0.0006], ..., [ 0.0356, -0.0426, 0.0006, ..., 0.0236, -0.0087, 0.0342], [-0.0166, -0.0374, -0.0426, ..., -0.0093, -0.0163, -0.0314], [-0.0717, -0.1157, 0.0396, ..., -0.0484, 0.0107, 0.0204]], device='cuda:0'), grad: tensor([[ 5.3905e-06, 9.3132e-09, -1.1951e-05, ..., 0.0000e+00, 1.0353e-04, 0.0000e+00], [ 3.4533e-06, 2.1420e-08, 5.6438e-07, ..., 0.0000e+00, -4.2629e-04, 0.0000e+00], [-3.0383e-05, -5.0571e-07, 2.6450e-07, ..., 0.0000e+00, 3.0875e-05, 0.0000e+00], ..., [ 6.2659e-06, 3.0547e-07, 1.0934e-06, ..., 0.0000e+00, 1.9431e-05, 0.0000e+00], [ 1.3940e-05, 1.0151e-07, 1.5991e-06, ..., 0.0000e+00, 1.8382e-04, 0.0000e+00], [ 1.6302e-05, 9.3132e-10, 1.0524e-06, ..., 0.0000e+00, 1.2964e-05, 0.0000e+00]], device='cuda:0') Epoch 97, bias, value: tensor([-0.0044, 0.0272, 0.0058, -0.0002, 0.0070, -0.0013, 0.0195, -0.0194, 0.0173, 0.0024], device='cuda:0'), grad: tensor([ 1.3947e-04, -8.3208e-04, 2.7224e-05, 4.8429e-07, 5.4121e-05, 5.3167e-05, 1.0347e-04, 5.0157e-05, 3.6764e-04, 3.7253e-05], device='cuda:0') 100 0.0001 changing lr epoch 96, time 218.00, cls_loss 0.0065 cls_loss_mapping 0.0105 cls_loss_causal 0.6096 re_mapping 0.0104 re_causal 0.0302 /// teacc 98.85 lr 0.00010000 Epoch 98, weight, value: tensor([[-0.0333, -0.0296, -0.0303, ..., -0.0803, 0.0063, -0.0033], [-0.0270, 0.0209, -0.0973, ..., 0.0392, 0.0411, -0.0783], [ 0.0490, -0.0002, -0.0507, ..., -0.0361, -0.0218, -0.0003], ..., [ 0.0359, -0.0426, 0.0009, ..., 0.0240, -0.0096, 0.0342], [-0.0169, -0.0373, -0.0425, ..., -0.0093, -0.0166, -0.0316], [-0.0730, -0.1157, 0.0383, ..., -0.0485, 0.0098, 0.0204]], device='cuda:0'), grad: tensor([[ 9.4157e-07, 0.0000e+00, 2.0452e-06, ..., 0.0000e+00, 1.8045e-05, 5.8934e-06], [ 1.5616e-05, 0.0000e+00, 8.4005e-07, ..., 0.0000e+00, -5.6811e-06, 3.9116e-08], [-2.0695e-03, 0.0000e+00, 5.0385e-07, ..., 0.0000e+00, 2.2054e-06, 4.9360e-08], ..., [ 3.4332e-05, 0.0000e+00, 3.7346e-07, ..., 0.0000e+00, 4.5411e-06, 1.5832e-08], [ 2.0199e-03, 0.0000e+00, 1.4147e-06, ..., 0.0000e+00, 5.1633e-06, 2.9150e-07], [ 4.5076e-06, 0.0000e+00, 7.2829e-06, ..., 0.0000e+00, 5.1744e-06, 2.1141e-07]], device='cuda:0') Epoch 98, bias, value: tensor([-0.0042, 0.0278, 0.0052, -0.0019, 0.0079, 0.0004, 0.0197, -0.0191, 0.0174, 0.0009], device='cuda:0'), grad: tensor([ 7.0572e-05, 6.8724e-05, -1.0391e-02, -1.7971e-05, 8.3745e-06, 2.9594e-05, -9.5189e-05, 1.7679e-04, 1.0132e-02, 2.8670e-05], device='cuda:0') 100 0.0001 changing lr epoch 97, time 219.00, cls_loss 0.0060 cls_loss_mapping 0.0092 cls_loss_causal 0.6126 re_mapping 0.0106 re_causal 0.0308 /// teacc 98.75 lr 0.00010000 Epoch 99, weight, value: tensor([[-0.0339, -0.0298, -0.0312, ..., -0.0804, 0.0058, -0.0038], [-0.0272, 0.0208, -0.0988, ..., 0.0392, 0.0420, -0.0788], [ 0.0487, -0.0002, -0.0513, ..., -0.0361, -0.0223, 0.0007], ..., [ 0.0366, -0.0425, 0.0013, ..., 0.0241, -0.0114, 0.0342], [-0.0174, -0.0375, -0.0420, ..., -0.0093, -0.0159, -0.0318], [-0.0740, -0.1160, 0.0385, ..., -0.0486, 0.0104, 0.0209]], device='cuda:0'), grad: tensor([[ 1.1455e-06, 0.0000e+00, 3.3118e-06, ..., 0.0000e+00, 2.7604e-06, 0.0000e+00], [ 1.1334e-06, 0.0000e+00, 2.2903e-05, ..., 0.0000e+00, 2.7925e-05, 0.0000e+00], [-2.0444e-05, 0.0000e+00, 2.8647e-06, ..., 0.0000e+00, 3.4068e-06, 0.0000e+00], ..., [ 1.0906e-06, 0.0000e+00, 1.6332e-05, ..., 0.0000e+00, 1.2338e-05, 0.0000e+00], [ 9.9763e-06, 0.0000e+00, 2.6286e-05, ..., 0.0000e+00, 3.4183e-05, 0.0000e+00], [-9.5218e-06, 0.0000e+00, -1.2856e-03, ..., 0.0000e+00, -1.5278e-03, 0.0000e+00]], device='cuda:0') Epoch 99, bias, value: tensor([-0.0046, 0.0282, 0.0053, -0.0019, 0.0082, 0.0003, 0.0200, -0.0192, 0.0171, 0.0006], device='cuda:0'), grad: tensor([ 1.3858e-05, 1.1659e-04, -2.6584e-05, 2.7001e-05, 5.9814e-03, 2.6464e-05, 5.9083e-06, 6.7651e-05, 1.5199e-04, -6.3629e-03], device='cuda:0') 100 0.0001 changing lr epoch 98, time 220.62, cls_loss 0.0061 cls_loss_mapping 0.0087 cls_loss_causal 0.5846 re_mapping 0.0107 re_causal 0.0294 /// teacc 98.87 lr 0.00010000 Epoch 100, weight, value: tensor([[-3.3953e-02, -3.0393e-02, -3.1401e-02, ..., -8.0508e-02, 5.6541e-03, -3.9072e-03], [-2.8621e-02, 2.0676e-02, -9.9945e-02, ..., 3.9273e-02, 4.1239e-02, -7.9373e-02], [ 4.9674e-02, 1.0520e-04, -5.1749e-02, ..., -3.6169e-02, -2.1992e-02, 3.4256e-04], ..., [ 3.6689e-02, -4.3312e-02, 6.7844e-04, ..., 2.4578e-02, -1.0763e-02, 3.4235e-02], [-1.7645e-02, -3.4443e-02, -4.2634e-02, ..., -9.3818e-03, -1.5728e-02, -3.1914e-02], [-7.5739e-02, -1.1693e-01, 3.9002e-02, ..., -4.8760e-02, 1.0871e-02, 2.0944e-02]], device='cuda:0'), grad: tensor([[ 4.7684e-06, 0.0000e+00, -7.3668e-07, ..., 0.0000e+00, 3.2447e-06, 0.0000e+00], [ 2.9672e-06, 0.0000e+00, 8.9407e-08, ..., 0.0000e+00, -2.3264e-06, 9.3132e-10], [ 3.5405e-05, 0.0000e+00, 2.9709e-07, ..., 0.0000e+00, 2.1495e-06, 9.3132e-10], ..., [ 4.1038e-05, 0.0000e+00, 2.0862e-07, ..., 0.0000e+00, 2.0117e-06, -3.7253e-09], [ 3.2037e-05, 0.0000e+00, -1.1316e-06, ..., 0.0000e+00, -1.0459e-06, 0.0000e+00], [ 1.9386e-05, 0.0000e+00, 1.7621e-06, ..., 0.0000e+00, -1.4435e-07, 0.0000e+00]], device='cuda:0') Epoch 100, bias, value: tensor([-0.0045, 0.0270, 0.0061, -0.0019, 0.0081, 0.0003, 0.0199, -0.0191, 0.0172, 0.0007], device='cuda:0'), grad: tensor([ 9.5665e-06, 1.8738e-06, 6.9201e-05, -2.7299e-04, 1.2249e-05, 2.7418e-05, -2.0176e-05, 7.9393e-05, 5.6714e-05, 3.6359e-05], device='cuda:0') 100 0.0001 changing lr epoch 99, time 219.76, cls_loss 0.0061 cls_loss_mapping 0.0098 cls_loss_causal 0.5776 re_mapping 0.0097 re_causal 0.0281 /// teacc 98.79 lr 0.00010000 Epoch 101, weight, value: tensor([[-0.0344, -0.0308, -0.0316, ..., -0.0808, 0.0052, -0.0039], [-0.0280, 0.0221, -0.1008, ..., 0.0393, 0.0423, -0.0805], [ 0.0494, -0.0005, -0.0518, ..., -0.0363, -0.0229, 0.0003], ..., [ 0.0369, -0.0435, 0.0007, ..., 0.0246, -0.0113, 0.0342], [-0.0179, -0.0344, -0.0427, ..., -0.0095, -0.0154, -0.0321], [-0.0764, -0.1177, 0.0383, ..., -0.0488, 0.0093, 0.0210]], device='cuda:0'), grad: tensor([[ 8.3353e-07, 0.0000e+00, 6.8918e-08, ..., 9.3132e-10, 2.0228e-06, 0.0000e+00], [ 9.1866e-06, 0.0000e+00, 1.9092e-07, ..., 4.6566e-09, -4.9263e-05, 0.0000e+00], [ 7.2643e-06, 0.0000e+00, 6.3330e-08, ..., 9.3132e-10, 1.4089e-05, 0.0000e+00], ..., [-1.6123e-05, 0.0000e+00, -3.3062e-07, ..., 6.5193e-09, 6.4969e-06, 0.0000e+00], [-7.9162e-07, 0.0000e+00, 6.7055e-08, ..., 0.0000e+00, -1.4335e-05, 0.0000e+00], [ 5.4426e-06, 0.0000e+00, 1.1902e-06, ..., 1.8626e-09, 2.3276e-05, 0.0000e+00]], device='cuda:0') Epoch 101, bias, value: tensor([-0.0043, 0.0280, 0.0055, -0.0019, 0.0094, 0.0004, 0.0195, -0.0191, 0.0174, -0.0006], device='cuda:0'), grad: tensor([ 8.6520e-07, -1.1861e-04, 4.8667e-05, -3.8683e-05, 2.2829e-05, 4.7565e-05, 1.4812e-05, -1.3679e-05, -3.6657e-05, 7.2837e-05], device='cuda:0') 100 0.0001 changing lr epoch 100, time 220.23, cls_loss 0.0049 cls_loss_mapping 0.0077 cls_loss_causal 0.5981 re_mapping 0.0095 re_causal 0.0302 /// teacc 98.85 lr 0.00010000 Epoch 102, weight, value: tensor([[-3.6489e-02, -3.0842e-02, -3.2005e-02, ..., -8.1052e-02, 4.7590e-03, -5.4737e-03], [-2.8206e-02, 2.2080e-02, -1.0224e-01, ..., 3.9480e-02, 4.2716e-02, -8.1424e-02], [ 4.9432e-02, -5.1822e-04, -5.2023e-02, ..., -3.6279e-02, -2.3179e-02, 7.6837e-05], ..., [ 3.7007e-02, -4.3526e-02, 6.6154e-04, ..., 2.4375e-02, -1.1564e-02, 3.4171e-02], [-1.8206e-02, -3.4415e-02, -4.2652e-02, ..., -9.5949e-03, -1.5960e-02, -3.3009e-02], [-7.6831e-02, -1.1773e-01, 3.8692e-02, ..., -4.9303e-02, 9.9592e-03, 2.2451e-02]], device='cuda:0'), grad: tensor([[ 6.8545e-07, 0.0000e+00, 4.5355e-07, ..., 0.0000e+00, -2.2911e-07, 9.3132e-10], [ 6.9514e-06, 0.0000e+00, 3.3323e-06, ..., 0.0000e+00, 6.7391e-06, 0.0000e+00], [ 4.8131e-06, 0.0000e+00, 9.9558e-07, ..., 0.0000e+00, 1.6466e-05, 0.0000e+00], ..., [-2.0906e-05, 0.0000e+00, 4.2111e-05, ..., 0.0000e+00, 1.5534e-06, 9.3132e-10], [ 4.7199e-06, 0.0000e+00, 1.2564e-06, ..., 0.0000e+00, -6.0886e-05, 9.3132e-09], [ 1.3299e-06, 0.0000e+00, 1.7929e-04, ..., 0.0000e+00, -5.2899e-07, 1.8626e-09]], device='cuda:0') Epoch 102, bias, value: tensor([-0.0053, 0.0281, 0.0052, -0.0015, 0.0089, 0.0005, 0.0198, -0.0191, 0.0170, 0.0002], device='cuda:0'), grad: tensor([-7.5847e-06, 5.0664e-05, 5.9903e-05, -2.1353e-05, -3.3379e-04, 1.0902e-04, 2.3350e-05, 3.3051e-05, -1.8108e-04, 2.6751e-04], device='cuda:0') 100 0.0001 changing lr epoch 101, time 219.97, cls_loss 0.0055 cls_loss_mapping 0.0081 cls_loss_causal 0.5910 re_mapping 0.0098 re_causal 0.0284 /// teacc 98.82 lr 0.00010000 Epoch 103, weight, value: tensor([[-3.6805e-02, -3.1012e-02, -3.2107e-02, ..., -8.1267e-02, 4.7487e-03, -5.6299e-03], [-2.8118e-02, 2.2084e-02, -1.0252e-01, ..., 3.9501e-02, 4.2437e-02, -8.1695e-02], [ 4.8978e-02, -2.9018e-04, -5.1800e-02, ..., -3.6337e-02, -2.3783e-02, 9.4734e-05], ..., [ 3.7675e-02, -4.3732e-02, 5.3099e-04, ..., 2.4381e-02, -1.1898e-02, 3.4208e-02], [-1.8497e-02, -3.4439e-02, -4.2968e-02, ..., -9.6682e-03, -1.6326e-02, -3.3269e-02], [-7.7103e-02, -1.1787e-01, 3.8125e-02, ..., -4.9589e-02, 1.1430e-02, 2.2593e-02]], device='cuda:0'), grad: tensor([[ 7.4320e-07, 0.0000e+00, 1.9997e-05, ..., 1.2573e-07, 3.8058e-05, 3.7253e-09], [ 3.3472e-06, 0.0000e+00, 3.8370e-06, ..., 3.5763e-07, 1.8710e-06, 3.7253e-09], [-4.1537e-06, 0.0000e+00, 5.2862e-06, ..., 7.7300e-08, 4.3996e-06, -8.4750e-08], ..., [-1.9163e-05, 0.0000e+00, 2.0787e-06, ..., -4.3698e-06, 3.8818e-06, 1.3039e-08], [ 1.2266e-06, 0.0000e+00, 4.3772e-07, ..., 4.9360e-08, 1.7673e-05, 3.0734e-08], [ 1.4603e-05, 0.0000e+00, -8.8096e-05, ..., 3.9563e-06, -1.3924e-04, 3.7253e-09]], device='cuda:0') Epoch 103, bias, value: tensor([-0.0048, 0.0281, 0.0043, -0.0017, 0.0088, 0.0003, 0.0202, -0.0185, 0.0166, 0.0005], device='cuda:0'), grad: tensor([ 2.3580e-04, 3.7789e-05, 1.9640e-05, 2.2024e-05, 6.1655e-04, 1.6823e-05, 2.2396e-05, -6.7279e-06, 4.7803e-05, -1.0118e-03], device='cuda:0') 100 0.0001 changing lr epoch 102, time 220.05, cls_loss 0.0062 cls_loss_mapping 0.0108 cls_loss_causal 0.5879 re_mapping 0.0097 re_causal 0.0290 /// teacc 98.87 lr 0.00010000 Epoch 104, weight, value: tensor([[-0.0372, -0.0314, -0.0326, ..., -0.0813, 0.0045, -0.0057], [-0.0287, 0.0221, -0.1038, ..., 0.0397, 0.0423, -0.0833], [ 0.0489, -0.0010, -0.0521, ..., -0.0364, -0.0239, 0.0015], ..., [ 0.0382, -0.0427, 0.0004, ..., 0.0244, -0.0124, 0.0340], [-0.0191, -0.0345, -0.0431, ..., -0.0097, -0.0165, -0.0338], [-0.0773, -0.1185, 0.0383, ..., -0.0499, 0.0114, 0.0225]], device='cuda:0'), grad: tensor([[ 4.8801e-06, 0.0000e+00, 3.4366e-07, ..., 0.0000e+00, 1.7863e-06, -6.0629e-07], [ 4.2021e-05, 0.0000e+00, 1.4063e-07, ..., 0.0000e+00, -2.1011e-06, 3.6322e-08], [-5.3585e-05, 0.0000e+00, 3.0827e-07, ..., 0.0000e+00, 8.3167e-07, 1.5367e-07], ..., [-3.4332e-05, 0.0000e+00, 2.3656e-07, ..., 0.0000e+00, 1.2573e-06, 9.2201e-08], [ 1.2048e-05, 0.0000e+00, 1.1548e-07, ..., 0.0000e+00, 2.6934e-06, 9.5926e-08], [ 1.2487e-05, 0.0000e+00, 6.4913e-07, ..., 0.0000e+00, 3.5763e-07, 4.9360e-08]], device='cuda:0') Epoch 104, bias, value: tensor([-0.0036, 0.0277, 0.0041, -0.0008, 0.0091, -0.0004, 0.0206, -0.0183, 0.0162, -0.0001], device='cuda:0'), grad: tensor([-3.6210e-06, 9.2983e-05, -1.3793e-04, -1.1981e-05, 1.0997e-05, 1.8626e-05, 2.5213e-05, -5.8264e-05, 2.3052e-05, 4.0591e-05], device='cuda:0') 100 0.0001 changing lr epoch 103, time 219.73, cls_loss 0.0055 cls_loss_mapping 0.0080 cls_loss_causal 0.5779 re_mapping 0.0097 re_causal 0.0281 /// teacc 98.79 lr 0.00010000 Epoch 105, weight, value: tensor([[-0.0377, -0.0315, -0.0327, ..., -0.0816, 0.0042, -0.0059], [-0.0282, 0.0222, -0.1054, ..., 0.0401, 0.0427, -0.0848], [ 0.0493, -0.0010, -0.0513, ..., -0.0364, -0.0243, 0.0016], ..., [ 0.0382, -0.0427, 0.0005, ..., 0.0247, -0.0134, 0.0341], [-0.0195, -0.0345, -0.0432, ..., -0.0098, -0.0164, -0.0341], [-0.0782, -0.1185, 0.0380, ..., -0.0498, 0.0116, 0.0225]], device='cuda:0'), grad: tensor([[ 9.7603e-07, 0.0000e+00, 6.4075e-07, ..., 0.0000e+00, 3.6448e-05, 2.1420e-08], [-9.6798e-05, 0.0000e+00, 1.9465e-07, ..., 0.0000e+00, -2.4629e-04, 7.4506e-09], [ 7.4357e-06, 0.0000e+00, 3.0641e-07, ..., 0.0000e+00, 1.6451e-05, 3.7253e-09], ..., [ 6.8307e-05, 0.0000e+00, 2.0862e-07, ..., 0.0000e+00, 1.7691e-04, 1.5832e-08], [ 3.5577e-06, 0.0000e+00, 9.3691e-07, ..., 0.0000e+00, 3.7938e-05, 1.9185e-07], [ 8.3745e-06, 0.0000e+00, 2.2054e-06, ..., 0.0000e+00, -4.7654e-05, 1.6764e-07]], device='cuda:0') Epoch 105, bias, value: tensor([-0.0042, 0.0281, 0.0044, -0.0022, 0.0098, 0.0007, 0.0211, -0.0186, 0.0160, -0.0006], device='cuda:0'), grad: tensor([ 3.2902e-04, -7.4434e-04, 1.5211e-04, 9.5963e-05, 1.7428e-04, 9.2313e-06, -8.7991e-06, 5.7507e-04, -1.1958e-05, -5.7125e-04], device='cuda:0') 100 0.0001 changing lr epoch 104, time 219.80, cls_loss 0.0053 cls_loss_mapping 0.0086 cls_loss_causal 0.6177 re_mapping 0.0097 re_causal 0.0289 /// teacc 98.84 lr 0.00010000 Epoch 106, weight, value: tensor([[-0.0375, -0.0318, -0.0335, ..., -0.0818, 0.0043, -0.0078], [-0.0294, 0.0224, -0.1056, ..., 0.0403, 0.0426, -0.0851], [ 0.0495, -0.0010, -0.0514, ..., -0.0367, -0.0229, 0.0016], ..., [ 0.0389, -0.0428, 0.0004, ..., 0.0246, -0.0140, 0.0341], [-0.0201, -0.0335, -0.0428, ..., -0.0099, -0.0162, -0.0342], [-0.0784, -0.1199, 0.0380, ..., -0.0498, 0.0120, 0.0244]], device='cuda:0'), grad: tensor([[ 2.0377e-06, 3.2596e-09, 1.2163e-06, ..., 2.6962e-07, 5.1316e-07, 1.3504e-08], [ 1.1802e-05, 4.7963e-08, 1.4426e-06, ..., 1.4808e-06, -2.5108e-06, 2.2817e-08], [-1.8716e-05, -2.3795e-07, 3.4971e-07, ..., 2.9616e-07, 4.3586e-07, -1.3085e-07], ..., [-8.3983e-05, 1.5227e-07, -1.7285e-05, ..., -1.8656e-05, 2.1756e-06, 2.5611e-08], [ 2.4855e-05, 8.3819e-09, 6.1421e-07, ..., 2.2911e-07, -2.1867e-06, 6.0536e-09], [ 4.7535e-05, 9.3132e-10, 8.0317e-06, ..., 7.7635e-06, 2.5565e-07, 5.1223e-09]], device='cuda:0') Epoch 106, bias, value: tensor([-4.4666e-03, 2.7202e-02, 4.9677e-03, -1.8957e-03, 9.6344e-03, -4.6174e-04, 2.1111e-02, -1.8173e-02, 1.6226e-02, -7.8943e-05], device='cuda:0'), grad: tensor([-8.6927e-04, 3.1650e-05, -3.9786e-06, 3.0935e-05, 3.7134e-05, 7.3686e-06, 1.0520e-05, -2.5725e-04, 1.6838e-05, 9.9659e-04], device='cuda:0') 100 0.0001 changing lr epoch 105, time 219.41, cls_loss 0.0056 cls_loss_mapping 0.0066 cls_loss_causal 0.5660 re_mapping 0.0099 re_causal 0.0279 /// teacc 98.81 lr 0.00010000 Epoch 107, weight, value: tensor([[-0.0380, -0.0338, -0.0338, ..., -0.0819, 0.0038, -0.0078], [-0.0297, 0.0233, -0.1064, ..., 0.0406, 0.0433, -0.0856], [ 0.0496, -0.0015, -0.0515, ..., -0.0367, -0.0233, 0.0018], ..., [ 0.0392, -0.0427, 0.0006, ..., 0.0247, -0.0144, 0.0343], [-0.0206, -0.0329, -0.0432, ..., -0.0100, -0.0165, -0.0344], [-0.0791, -0.1209, 0.0382, ..., -0.0500, 0.0121, 0.0244]], device='cuda:0'), grad: tensor([[ 7.0035e-07, 2.7474e-08, 4.5775e-07, ..., 0.0000e+00, 2.5127e-06, 2.9802e-08], [ 3.5707e-06, 2.2817e-08, 4.5868e-07, ..., 0.0000e+00, -2.4401e-06, 8.5216e-08], [-3.9548e-05, -5.2452e-06, 9.3598e-08, ..., 0.0000e+00, 2.2724e-07, -1.7229e-07], ..., [ 1.9401e-05, 5.1335e-06, 2.0443e-07, ..., 0.0000e+00, 1.3933e-06, -2.0443e-07], [ 3.3937e-06, 3.6322e-08, 2.0474e-05, ..., 0.0000e+00, 9.7603e-06, 5.7742e-08], [ 7.6666e-06, 1.3970e-09, 5.3365e-07, ..., 0.0000e+00, -1.6429e-06, 6.2864e-08]], device='cuda:0') Epoch 107, bias, value: tensor([-0.0046, 0.0273, 0.0049, -0.0016, 0.0094, -0.0007, 0.0190, -0.0179, 0.0183, -0.0001], device='cuda:0'), grad: tensor([-1.5404e-06, 9.8720e-07, -5.4210e-05, 2.7239e-05, 1.2495e-05, -1.6248e-04, 5.5850e-05, 2.2382e-05, 9.7394e-05, 1.6484e-06], device='cuda:0') 100 0.0001 changing lr epoch 106, time 219.79, cls_loss 0.0052 cls_loss_mapping 0.0092 cls_loss_causal 0.5927 re_mapping 0.0095 re_causal 0.0290 /// teacc 98.65 lr 0.00010000 Epoch 108, weight, value: tensor([[-0.0382, -0.0345, -0.0339, ..., -0.0820, 0.0035, -0.0079], [-0.0309, 0.0235, -0.1067, ..., 0.0407, 0.0445, -0.0868], [ 0.0506, -0.0014, -0.0512, ..., -0.0368, -0.0235, 0.0022], ..., [ 0.0392, -0.0429, 0.0004, ..., 0.0250, -0.0150, 0.0342], [-0.0208, -0.0336, -0.0445, ..., -0.0100, -0.0169, -0.0345], [-0.0795, -0.1217, 0.0382, ..., -0.0508, 0.0120, 0.0244]], device='cuda:0'), grad: tensor([[ 1.3961e-06, 0.0000e+00, 4.7497e-08, ..., 0.0000e+00, 2.9981e-05, 1.3292e-05], [ 7.4413e-07, 0.0000e+00, 6.5193e-09, ..., 0.0000e+00, -6.9849e-07, 6.2399e-08], [ 4.8578e-06, 0.0000e+00, 7.4506e-09, ..., 0.0000e+00, 4.6268e-06, 1.6568e-06], ..., [ 1.1353e-06, 0.0000e+00, 3.5390e-08, ..., 0.0000e+00, 1.3700e-06, 9.3132e-09], [ 2.0396e-06, 0.0000e+00, 3.5297e-07, ..., 0.0000e+00, 2.5164e-06, 2.6729e-07], [ 2.6077e-06, 0.0000e+00, 3.6322e-08, ..., 0.0000e+00, -1.6456e-06, 3.5390e-08]], device='cuda:0') Epoch 108, bias, value: tensor([-0.0045, 0.0273, 0.0062, -0.0019, 0.0109, -0.0006, 0.0191, -0.0187, 0.0177, -0.0013], device='cuda:0'), grad: tensor([ 1.6916e-04, 1.0170e-06, 3.1114e-05, -2.7850e-05, 8.5160e-06, 2.6435e-05, -2.2793e-04, 5.0329e-06, 1.3761e-05, 8.7079e-07], device='cuda:0') 100 0.0001 changing lr epoch 107, time 219.86, cls_loss 0.0052 cls_loss_mapping 0.0074 cls_loss_causal 0.5872 re_mapping 0.0096 re_causal 0.0278 /// teacc 98.86 lr 0.00010000 Epoch 109, weight, value: tensor([[-3.8403e-02, -3.5559e-02, -3.4272e-02, ..., -8.2115e-02, 2.9623e-03, -8.0154e-03], [-3.1084e-02, 2.2822e-02, -1.0705e-01, ..., 3.8417e-02, 4.4395e-02, -8.6931e-02], [ 5.0280e-02, -1.7909e-03, -5.1552e-02, ..., -3.6777e-02, -2.4747e-02, 2.1256e-03], ..., [ 3.9916e-02, -4.1949e-02, 1.0697e-04, ..., 2.5264e-02, -1.5121e-02, 3.4228e-02], [-2.1378e-02, -3.5603e-02, -4.5783e-02, ..., -1.0111e-02, -1.7135e-02, -3.4575e-02], [-7.9854e-02, -1.2301e-01, 3.7948e-02, ..., -4.8472e-02, 1.2831e-02, 2.4600e-02]], device='cuda:0'), grad: tensor([[ 5.8301e-07, -2.1532e-06, 7.0594e-07, ..., 0.0000e+00, 1.6959e-06, 0.0000e+00], [ 1.4745e-05, 6.0536e-08, 1.3905e-06, ..., 0.0000e+00, 6.9849e-07, 0.0000e+00], [-2.2218e-05, 8.7544e-08, -9.7789e-08, ..., 0.0000e+00, -6.2399e-07, 0.0000e+00], ..., [-2.6971e-06, 3.1292e-07, 2.0228e-06, ..., 0.0000e+00, 6.8918e-07, 0.0000e+00], [ 2.8759e-06, 1.5274e-07, 6.7167e-06, ..., 0.0000e+00, 2.2873e-06, 0.0000e+00], [ 2.0433e-06, 2.3656e-07, -1.3580e-03, ..., 0.0000e+00, -3.6788e-04, 0.0000e+00]], device='cuda:0') Epoch 109, bias, value: tensor([-0.0042, 0.0271, 0.0052, -0.0021, 0.0114, -0.0006, 0.0191, -0.0176, 0.0169, -0.0011], device='cuda:0'), grad: tensor([-1.9848e-05, 3.2067e-05, -3.6597e-05, 2.2620e-05, 3.3932e-03, 2.1175e-05, -4.8071e-05, 1.7164e-06, 3.0056e-05, -3.3989e-03], device='cuda:0') 100 0.0001 changing lr epoch 108, time 219.60, cls_loss 0.0059 cls_loss_mapping 0.0100 cls_loss_causal 0.5958 re_mapping 0.0092 re_causal 0.0278 /// teacc 98.78 lr 0.00010000 Epoch 110, weight, value: tensor([[-0.0397, -0.0375, -0.0346, ..., -0.0836, 0.0025, -0.0081], [-0.0318, 0.0207, -0.1075, ..., 0.0384, 0.0443, -0.0871], [ 0.0500, -0.0008, -0.0536, ..., -0.0373, -0.0245, 0.0021], ..., [ 0.0403, -0.0417, -0.0009, ..., 0.0246, -0.0151, 0.0343], [-0.0218, -0.0337, -0.0461, ..., -0.0106, -0.0170, -0.0346], [-0.0807, -0.1270, 0.0384, ..., -0.0486, 0.0131, 0.0246]], device='cuda:0'), grad: tensor([[ 7.8231e-08, 2.0489e-08, 3.4254e-06, ..., 4.1258e-07, 2.0452e-06, 0.0000e+00], [ 2.3656e-07, 1.0245e-08, 7.1153e-07, ..., 2.0489e-08, -3.0994e-06, 0.0000e+00], [-8.2608e-07, -7.0874e-07, 5.8208e-07, ..., 7.4506e-09, 3.9581e-07, 0.0000e+00], ..., [-7.5437e-08, 2.5518e-07, 6.0722e-06, ..., 5.5879e-08, 1.3607e-06, 0.0000e+00], [ 7.8231e-07, 1.6112e-07, 1.4920e-06, ..., 9.3132e-08, 1.9316e-06, 0.0000e+00], [ 7.1526e-07, 1.2107e-08, 1.4193e-05, ..., 1.0617e-07, 4.4703e-08, 0.0000e+00]], device='cuda:0') Epoch 110, bias, value: tensor([-0.0047, 0.0257, 0.0052, 0.0003, 0.0111, -0.0010, 0.0188, -0.0171, 0.0169, -0.0009], device='cuda:0'), grad: tensor([-2.3380e-05, -2.4252e-06, 3.3677e-06, 3.6340e-06, -8.4877e-05, -2.2605e-05, 3.2932e-06, 4.2081e-05, 1.0341e-05, 7.0393e-05], device='cuda:0') 100 0.0001 changing lr epoch 109, time 219.83, cls_loss 0.0051 cls_loss_mapping 0.0073 cls_loss_causal 0.6049 re_mapping 0.0095 re_causal 0.0287 /// teacc 98.85 lr 0.00010000 Epoch 111, weight, value: tensor([[-0.0401, -0.0383, -0.0351, ..., -0.0838, 0.0019, -0.0081], [-0.0320, 0.0204, -0.1081, ..., 0.0385, 0.0450, -0.0875], [ 0.0501, -0.0009, -0.0542, ..., -0.0374, -0.0250, 0.0025], ..., [ 0.0405, -0.0414, -0.0002, ..., 0.0248, -0.0157, 0.0342], [-0.0220, -0.0337, -0.0465, ..., -0.0106, -0.0177, -0.0348], [-0.0817, -0.1276, 0.0384, ..., -0.0487, 0.0133, 0.0247]], device='cuda:0'), grad: tensor([[ 2.1830e-06, 1.3504e-07, 3.1628e-06, ..., 0.0000e+00, 7.7300e-07, 4.3772e-07], [ 1.8273e-06, 5.7742e-08, 7.6368e-07, ..., 0.0000e+00, -2.2873e-06, 1.3411e-07], [-1.3113e-06, -1.6158e-06, 5.5656e-06, ..., 0.0000e+00, 2.1569e-06, -8.2795e-07], ..., [ 1.4260e-05, 1.2852e-07, 1.4743e-06, ..., 0.0000e+00, 7.3388e-06, 2.8033e-07], [ 4.4182e-06, 2.0210e-07, 4.3809e-06, ..., 0.0000e+00, 2.0806e-06, 9.9465e-07], [ 5.1379e-05, 1.5832e-08, 5.2825e-06, ..., 0.0000e+00, 1.9237e-05, 3.3900e-07]], device='cuda:0') Epoch 111, bias, value: tensor([-5.3301e-03, 2.5805e-02, 4.9322e-03, 3.8411e-05, 1.1153e-02, -4.7907e-04, 1.9546e-02, -1.6959e-02, 1.6641e-02, -1.1235e-03], device='cuda:0'), grad: tensor([-5.1051e-05, -3.2093e-06, 2.5973e-05, -1.5163e-04, -3.4308e-04, -1.0085e-04, 4.3035e-04, 5.0932e-05, 3.2067e-05, 1.0979e-04], device='cuda:0') 100 0.0001 changing lr epoch 110, time 220.27, cls_loss 0.0051 cls_loss_mapping 0.0084 cls_loss_causal 0.5764 re_mapping 0.0097 re_causal 0.0273 /// teacc 98.81 lr 0.00010000 Epoch 112, weight, value: tensor([[-0.0405, -0.0396, -0.0347, ..., -0.0838, 0.0017, -0.0086], [-0.0311, 0.0213, -0.1087, ..., 0.0385, 0.0462, -0.0880], [ 0.0502, -0.0020, -0.0536, ..., -0.0374, -0.0254, 0.0026], ..., [ 0.0405, -0.0406, 0.0002, ..., 0.0251, -0.0170, 0.0342], [-0.0222, -0.0338, -0.0465, ..., -0.0106, -0.0174, -0.0356], [-0.0823, -0.1288, 0.0387, ..., -0.0487, 0.0137, 0.0247]], device='cuda:0'), grad: tensor([[ 2.3246e-06, 8.3819e-09, 1.0341e-05, ..., 0.0000e+00, 2.5574e-06, 0.0000e+00], [ 8.8010e-07, 1.8626e-09, 3.6843e-06, ..., 0.0000e+00, -2.1979e-07, 0.0000e+00], [-7.9256e-07, 9.3132e-10, 2.4289e-06, ..., 0.0000e+00, 1.6838e-06, 0.0000e+00], ..., [-1.6140e-06, 1.8626e-09, 1.4961e-05, ..., 0.0000e+00, 6.4820e-07, 0.0000e+00], [-1.1418e-06, 5.2154e-08, 4.6864e-06, ..., 0.0000e+00, -1.4819e-05, 0.0000e+00], [ 1.9986e-06, 8.3819e-09, 1.9264e-04, ..., 0.0000e+00, -5.7183e-07, 0.0000e+00]], device='cuda:0') Epoch 112, bias, value: tensor([-0.0049, 0.0272, 0.0048, -0.0007, 0.0104, -0.0015, 0.0205, -0.0174, 0.0169, -0.0010], device='cuda:0'), grad: tensor([ 3.0816e-05, 2.3514e-05, 8.3089e-05, 2.1005e-04, -1.1311e-03, -1.1855e-04, 1.1645e-05, 1.3542e-04, -3.0446e-04, 1.0605e-03], device='cuda:0') 100 0.0001 changing lr epoch 111, time 219.73, cls_loss 0.0060 cls_loss_mapping 0.0070 cls_loss_causal 0.6029 re_mapping 0.0092 re_causal 0.0268 /// teacc 98.88 lr 0.00010000 Epoch 113, weight, value: tensor([[-0.0410, -0.0416, -0.0349, ..., -0.0840, 0.0015, -0.0088], [-0.0308, 0.0216, -0.1101, ..., 0.0388, 0.0468, -0.0901], [ 0.0507, -0.0021, -0.0546, ..., -0.0374, -0.0257, 0.0020], ..., [ 0.0405, -0.0407, 0.0014, ..., 0.0249, -0.0180, 0.0341], [-0.0225, -0.0344, -0.0476, ..., -0.0107, -0.0170, -0.0371], [-0.0834, -0.1302, 0.0389, ..., -0.0488, 0.0139, 0.0247]], device='cuda:0'), grad: tensor([[ 1.9558e-07, 0.0000e+00, 4.5821e-06, ..., 0.0000e+00, 1.8282e-06, 0.0000e+00], [ 2.3190e-07, 0.0000e+00, -1.5914e-05, ..., 1.8626e-09, -3.9071e-05, 0.0000e+00], [-3.0361e-06, 0.0000e+00, 1.4994e-07, ..., 9.3132e-10, 1.5991e-06, 0.0000e+00], ..., [ 9.9279e-07, 0.0000e+00, 3.0920e-06, ..., 8.3819e-09, 5.6997e-06, 0.0000e+00], [-5.7146e-06, 0.0000e+00, 5.6982e-05, ..., 9.3132e-10, 2.2531e-05, 0.0000e+00], [ 3.7812e-07, 0.0000e+00, -4.5347e-04, ..., 1.1176e-08, -2.0897e-04, 0.0000e+00]], device='cuda:0') Epoch 113, bias, value: tensor([-0.0051, 0.0274, 0.0048, -0.0015, 0.0101, -0.0007, 0.0202, -0.0174, 0.0173, -0.0010], device='cuda:0'), grad: tensor([ 1.4007e-06, -1.2732e-04, 1.7891e-06, 7.3624e-04, 1.2189e-04, 3.4404e-04, 4.4614e-05, 2.5123e-05, 9.4354e-05, -1.2417e-03], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 112---------------------------------------------------- epoch 112, time 220.48, cls_loss 0.0040 cls_loss_mapping 0.0059 cls_loss_causal 0.5629 re_mapping 0.0095 re_causal 0.0286 /// teacc 98.89 lr 0.00010000 Epoch 114, weight, value: tensor([[-0.0416, -0.0419, -0.0361, ..., -0.0842, 0.0012, -0.0088], [-0.0308, 0.0217, -0.1108, ..., 0.0389, 0.0475, -0.0909], [ 0.0509, -0.0021, -0.0534, ..., -0.0377, -0.0263, 0.0035], ..., [ 0.0407, -0.0408, 0.0013, ..., 0.0249, -0.0181, 0.0340], [-0.0225, -0.0343, -0.0476, ..., -0.0108, -0.0173, -0.0379], [-0.0839, -0.1303, 0.0393, ..., -0.0489, 0.0144, 0.0246]], device='cuda:0'), grad: tensor([[ 3.4459e-07, 4.6566e-09, 3.3993e-07, ..., 0.0000e+00, 4.8801e-07, 6.3330e-08], [ 4.1574e-06, 9.3132e-09, 2.7064e-06, ..., 0.0000e+00, -8.3968e-06, 2.4214e-08], [-1.7732e-05, -1.5739e-07, 1.5255e-06, ..., 0.0000e+00, 9.2015e-07, 3.5390e-08], ..., [-3.6098e-06, 1.5832e-08, 1.9614e-06, ..., 0.0000e+00, 1.7537e-06, 9.3132e-10], [ 7.2606e-06, 7.0781e-08, 1.9046e-06, ..., 0.0000e+00, 4.4405e-06, 2.9802e-08], [ 1.5832e-06, 1.8626e-09, -8.3260e-07, ..., 0.0000e+00, -5.6550e-06, 1.8626e-09]], device='cuda:0') Epoch 114, bias, value: tensor([-0.0057, 0.0279, 0.0047, -0.0011, 0.0098, -0.0015, 0.0200, -0.0174, 0.0176, -0.0008], device='cuda:0'), grad: tensor([ 2.4587e-06, -5.3048e-06, -2.0295e-05, 8.5086e-06, -1.9427e-06, 9.2015e-07, 2.9840e-06, 1.7984e-06, 2.1785e-05, -1.0937e-05], device='cuda:0') 100 0.0001 changing lr epoch 113, time 219.65, cls_loss 0.0048 cls_loss_mapping 0.0087 cls_loss_causal 0.5868 re_mapping 0.0096 re_causal 0.0280 /// teacc 98.83 lr 0.00010000 Epoch 115, weight, value: tensor([[-0.0419, -0.0422, -0.0371, ..., -0.0843, 0.0009, -0.0088], [-0.0309, 0.0223, -0.1121, ..., 0.0390, 0.0478, -0.0915], [ 0.0512, -0.0020, -0.0527, ..., -0.0381, -0.0266, 0.0045], ..., [ 0.0407, -0.0411, 0.0011, ..., 0.0253, -0.0188, 0.0340], [-0.0232, -0.0344, -0.0470, ..., -0.0108, -0.0183, -0.0381], [-0.0845, -0.1310, 0.0389, ..., -0.0491, 0.0151, 0.0246]], device='cuda:0'), grad: tensor([[ 4.8801e-07, 0.0000e+00, 5.6550e-06, ..., 4.0978e-08, 4.6287e-07, 2.7940e-09], [ 8.4117e-06, 0.0000e+00, 7.6666e-06, ..., 2.4214e-08, 3.3155e-07, 9.3132e-10], [ 1.0967e-05, 0.0000e+00, 8.8885e-06, ..., 3.7253e-09, 8.9686e-07, -5.4017e-08], ..., [-1.7602e-06, 0.0000e+00, 7.7710e-06, ..., -4.6566e-08, 7.3053e-06, 7.4506e-09], [ 2.0191e-05, 0.0000e+00, 9.5963e-06, ..., 8.3819e-08, -6.0461e-06, 2.4214e-08], [ 8.5868e-07, 0.0000e+00, 1.5557e-05, ..., 4.6566e-08, -5.7220e-06, 9.3132e-10]], device='cuda:0') Epoch 115, bias, value: tensor([-0.0051, 0.0280, 0.0047, -0.0013, 0.0102, -0.0013, 0.0198, -0.0177, 0.0173, -0.0006], device='cuda:0'), grad: tensor([-1.1814e-04, 4.5747e-05, 5.8800e-05, -5.1022e-05, -3.1257e-04, -6.2406e-05, 2.9612e-04, 1.3113e-04, 5.0783e-05, -3.8475e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 114---------------------------------------------------- epoch 114, time 220.54, cls_loss 0.0040 cls_loss_mapping 0.0069 cls_loss_causal 0.5432 re_mapping 0.0096 re_causal 0.0265 /// teacc 98.92 lr 0.00010000 Epoch 116, weight, value: tensor([[-0.0423, -0.0426, -0.0373, ..., -0.0843, 0.0007, -0.0092], [-0.0312, 0.0222, -0.1125, ..., 0.0390, 0.0470, -0.0932], [ 0.0517, -0.0015, -0.0525, ..., -0.0384, -0.0267, 0.0045], ..., [ 0.0407, -0.0413, 0.0012, ..., 0.0253, -0.0177, 0.0339], [-0.0237, -0.0342, -0.0470, ..., -0.0109, -0.0186, -0.0386], [-0.0853, -0.1316, 0.0386, ..., -0.0491, 0.0151, 0.0247]], device='cuda:0'), grad: tensor([[ 4.0606e-07, 1.8626e-09, 3.8370e-07, ..., 9.3132e-10, 3.8557e-07, 0.0000e+00], [ 9.6038e-06, 1.6764e-08, 2.0832e-05, ..., 1.8626e-09, 1.8403e-05, 0.0000e+00], [ 3.4273e-06, -8.5682e-08, 5.6159e-07, ..., 4.6566e-09, 1.6233e-06, 0.0000e+00], ..., [-7.3835e-06, 1.3970e-08, -9.6619e-05, ..., 1.6764e-08, -6.5923e-05, 0.0000e+00], [-9.4891e-05, 3.8184e-08, -4.2260e-05, ..., 1.8626e-09, -8.6606e-05, 0.0000e+00], [ 2.0843e-06, 1.8626e-09, 7.0691e-05, ..., 2.7940e-09, 4.8637e-05, 0.0000e+00]], device='cuda:0') Epoch 116, bias, value: tensor([-0.0052, 0.0264, 0.0050, -0.0018, 0.0099, -0.0006, 0.0199, -0.0164, 0.0176, -0.0011], device='cuda:0'), grad: tensor([ 1.8645e-06, 1.5104e-04, 1.2010e-05, 2.0695e-04, 3.3528e-05, 5.3078e-05, 1.7393e-04, -5.8126e-04, -4.6754e-04, 4.1628e-04], device='cuda:0') 100 0.0001 changing lr epoch 115, time 219.85, cls_loss 0.0056 cls_loss_mapping 0.0099 cls_loss_causal 0.5739 re_mapping 0.0091 re_causal 0.0261 /// teacc 98.77 lr 0.00010000 Epoch 117, weight, value: tensor([[-0.0407, -0.0432, -0.0374, ..., -0.0845, 0.0027, -0.0093], [-0.0325, 0.0221, -0.1119, ..., 0.0402, 0.0472, -0.0946], [ 0.0514, -0.0006, -0.0528, ..., -0.0384, -0.0271, 0.0048], ..., [ 0.0411, -0.0412, 0.0010, ..., 0.0249, -0.0189, 0.0341], [-0.0247, -0.0362, -0.0474, ..., -0.0110, -0.0186, -0.0394], [-0.0860, -0.1327, 0.0384, ..., -0.0494, 0.0161, 0.0249]], device='cuda:0'), grad: tensor([[ 1.3076e-06, 1.2107e-08, 1.0677e-05, ..., 0.0000e+00, 1.1912e-06, 3.4180e-07], [ 1.3467e-06, 1.0245e-08, 2.3469e-07, ..., 0.0000e+00, -6.3404e-06, 6.9849e-08], [-7.3854e-07, -1.8347e-07, 3.0026e-06, ..., 0.0000e+00, 1.0692e-06, -1.3132e-07], ..., [-3.8855e-06, 2.4214e-08, 5.8487e-07, ..., 0.0000e+00, 1.8552e-06, 5.3085e-08], [ 1.3523e-06, 4.4703e-08, 1.0077e-06, ..., 0.0000e+00, 5.0105e-06, 2.4587e-07], [ 1.0654e-05, 2.7940e-09, -2.0608e-05, ..., 0.0000e+00, 1.6108e-05, 1.3970e-08]], device='cuda:0') Epoch 117, bias, value: tensor([-0.0032, 0.0256, 0.0052, -0.0014, 0.0094, -0.0002, 0.0200, -0.0167, 0.0170, -0.0013], device='cuda:0'), grad: tensor([ 4.9114e-05, -9.1717e-06, 1.4983e-05, -4.3780e-05, 1.0595e-05, 1.4469e-05, -6.2212e-06, -1.5805e-06, 1.5303e-05, -4.3660e-05], device='cuda:0') 100 0.0001 changing lr epoch 116, time 219.65, cls_loss 0.0058 cls_loss_mapping 0.0087 cls_loss_causal 0.5783 re_mapping 0.0091 re_causal 0.0268 /// teacc 98.89 lr 0.00010000 Epoch 118, weight, value: tensor([[-0.0412, -0.0444, -0.0362, ..., -0.0850, 0.0034, -0.0121], [-0.0339, 0.0230, -0.1138, ..., 0.0404, 0.0474, -0.0954], [ 0.0513, -0.0005, -0.0519, ..., -0.0386, -0.0276, 0.0050], ..., [ 0.0422, -0.0417, 0.0007, ..., 0.0250, -0.0193, 0.0343], [-0.0249, -0.0364, -0.0479, ..., -0.0112, -0.0190, -0.0396], [-0.0869, -0.1341, 0.0369, ..., -0.0498, 0.0159, 0.0277]], device='cuda:0'), grad: tensor([[ 9.4399e-06, 0.0000e+00, 2.0802e-05, ..., 0.0000e+00, 6.4261e-08, 0.0000e+00], [ 1.6140e-06, 0.0000e+00, 2.6338e-06, ..., 0.0000e+00, -2.1979e-07, 0.0000e+00], [ 4.3437e-06, 0.0000e+00, 7.7337e-06, ..., 0.0000e+00, 2.7101e-07, -1.8626e-09], ..., [ 8.1304e-07, 0.0000e+00, 9.3505e-06, ..., 0.0000e+00, 2.5332e-07, 0.0000e+00], [ 1.5739e-06, 0.0000e+00, 1.3579e-06, ..., 0.0000e+00, 7.8231e-08, 0.0000e+00], [ 5.5805e-06, 0.0000e+00, 1.0952e-05, ..., 0.0000e+00, -9.4995e-07, 0.0000e+00]], device='cuda:0') Epoch 118, bias, value: tensor([-0.0037, 0.0250, 0.0049, -0.0013, 0.0106, -0.0005, 0.0202, -0.0159, 0.0166, -0.0014], device='cuda:0'), grad: tensor([ 4.8786e-05, 8.2701e-06, 2.3931e-05, 1.1943e-05, -1.4257e-04, -1.7434e-05, 3.3349e-05, 6.0797e-06, -1.7202e-06, 2.9370e-05], device='cuda:0') 100 0.0001 changing lr epoch 117, time 219.61, cls_loss 0.0035 cls_loss_mapping 0.0055 cls_loss_causal 0.5805 re_mapping 0.0087 re_causal 0.0272 /// teacc 98.87 lr 0.00010000 Epoch 119, weight, value: tensor([[-0.0414, -0.0448, -0.0364, ..., -0.0850, 0.0033, -0.0122], [-0.0341, 0.0231, -0.1143, ..., 0.0404, 0.0475, -0.0961], [ 0.0523, -0.0004, -0.0503, ..., -0.0386, -0.0279, 0.0050], ..., [ 0.0422, -0.0416, 0.0007, ..., 0.0251, -0.0198, 0.0347], [-0.0262, -0.0368, -0.0491, ..., -0.0112, -0.0194, -0.0397], [-0.0871, -0.1347, 0.0367, ..., -0.0498, 0.0169, 0.0277]], device='cuda:0'), grad: tensor([[ 6.3144e-07, 0.0000e+00, 5.0198e-07, ..., 0.0000e+00, 2.3749e-07, 4.6566e-09], [ 5.6066e-06, 0.0000e+00, 9.0525e-07, ..., 0.0000e+00, -4.2841e-08, 4.6566e-09], [-4.0114e-05, 0.0000e+00, 1.9064e-06, ..., 0.0000e+00, 7.3947e-07, 2.7940e-09], ..., [ 4.6939e-05, 0.0000e+00, 1.7196e-05, ..., 0.0000e+00, 2.2911e-07, 2.2352e-08], [ 7.9349e-06, 0.0000e+00, 1.1995e-06, ..., 0.0000e+00, 9.8720e-07, 2.7940e-09], [ 4.9993e-06, 0.0000e+00, 6.9998e-06, ..., 0.0000e+00, -1.8999e-07, 7.4506e-09]], device='cuda:0') Epoch 119, bias, value: tensor([-0.0032, 0.0248, 0.0056, -0.0015, 0.0106, -0.0004, 0.0204, -0.0163, 0.0156, -0.0013], device='cuda:0'), grad: tensor([ 2.1793e-06, 1.2830e-05, -3.2961e-05, -3.2187e-05, -9.1553e-05, 3.6769e-06, 7.6108e-06, 8.7440e-05, 1.3083e-05, 2.9802e-05], device='cuda:0') 100 0.0001 changing lr epoch 118, time 219.85, cls_loss 0.0037 cls_loss_mapping 0.0055 cls_loss_causal 0.5427 re_mapping 0.0089 re_causal 0.0256 /// teacc 98.83 lr 0.00010000 Epoch 120, weight, value: tensor([[-0.0416, -0.0450, -0.0365, ..., -0.0850, 0.0025, -0.0136], [-0.0344, 0.0231, -0.1147, ..., 0.0404, 0.0476, -0.0969], [ 0.0525, -0.0004, -0.0504, ..., -0.0387, -0.0277, 0.0049], ..., [ 0.0423, -0.0416, 0.0007, ..., 0.0251, -0.0199, 0.0344], [-0.0266, -0.0368, -0.0503, ..., -0.0112, -0.0201, -0.0400], [-0.0880, -0.1349, 0.0374, ..., -0.0498, 0.0172, 0.0292]], device='cuda:0'), grad: tensor([[ 1.9968e-06, 0.0000e+00, 2.7493e-06, ..., 6.7707e-07, 1.9558e-06, 1.8626e-09], [-2.8300e-04, 0.0000e+00, 8.0094e-07, ..., -1.1557e-04, -8.0764e-05, 9.3132e-10], [ 2.4331e-04, 0.0000e+00, 3.8818e-06, ..., 9.9182e-05, 6.3241e-05, -2.5146e-08], ..., [ 2.2314e-06, 0.0000e+00, 8.8196e-07, ..., 8.5160e-06, 5.8040e-06, 3.7253e-09], [ 3.5837e-06, 0.0000e+00, 1.9558e-06, ..., 1.8626e-07, 1.0006e-05, 1.8626e-09], [ 1.0699e-05, 0.0000e+00, 2.6608e-04, ..., 1.7043e-07, 2.2724e-07, 1.8626e-09]], device='cuda:0') Epoch 120, bias, value: tensor([-0.0037, 0.0247, 0.0058, -0.0014, 0.0098, -0.0005, 0.0215, -0.0163, 0.0147, -0.0005], device='cuda:0'), grad: tensor([-2.1830e-06, -8.9788e-04, 7.7772e-04, 2.5436e-05, -1.2054e-03, 2.3901e-05, -9.4324e-06, 2.6017e-05, 4.8608e-05, 1.2121e-03], device='cuda:0') 100 0.0001 changing lr epoch 119, time 219.49, cls_loss 0.0045 cls_loss_mapping 0.0060 cls_loss_causal 0.5151 re_mapping 0.0089 re_causal 0.0245 /// teacc 98.91 lr 0.00010000 Epoch 121, weight, value: tensor([[-4.1891e-02, -4.5554e-02, -3.6993e-02, ..., -8.5104e-02, 2.3789e-03, -1.3753e-02], [-3.4267e-02, 2.3360e-02, -1.1493e-01, ..., 4.0970e-02, 4.8083e-02, -9.7092e-02], [ 5.2753e-02, -7.1994e-06, -5.0606e-02, ..., -3.9629e-02, -2.8378e-02, 5.0982e-03], ..., [ 4.1737e-02, -4.2261e-02, 3.1517e-04, ..., 2.4773e-02, -2.1074e-02, 3.4351e-02], [-2.6481e-02, -3.6210e-02, -5.0717e-02, ..., -1.1237e-02, -1.9154e-02, -4.0000e-02], [-8.8695e-02, -1.3654e-01, 3.6906e-02, ..., -4.9851e-02, 1.7697e-02, 2.9283e-02]], device='cuda:0'), grad: tensor([[ 7.0315e-07, 0.0000e+00, 1.1744e-06, ..., 0.0000e+00, 2.7101e-06, 0.0000e+00], [ 4.6380e-07, 0.0000e+00, 6.3423e-07, ..., 0.0000e+00, -2.8815e-06, 0.0000e+00], [-1.1571e-05, 0.0000e+00, -8.7917e-07, ..., 0.0000e+00, 3.9861e-06, 0.0000e+00], ..., [-4.3958e-07, 0.0000e+00, 1.5646e-06, ..., 0.0000e+00, 1.2740e-06, 0.0000e+00], [ 2.5108e-06, 0.0000e+00, 5.3108e-05, ..., 0.0000e+00, -5.8532e-05, 0.0000e+00], [ 2.4866e-07, 0.0000e+00, -7.8738e-05, ..., 0.0000e+00, 5.6505e-05, 0.0000e+00]], device='cuda:0') Epoch 121, bias, value: tensor([-0.0034, 0.0249, 0.0060, -0.0009, 0.0099, -0.0005, 0.0210, -0.0172, 0.0154, -0.0007], device='cuda:0'), grad: tensor([ 4.8786e-05, -6.2473e-06, 3.3826e-05, 1.0192e-04, 3.7503e-04, -2.4390e-04, 5.5581e-05, 1.4514e-05, 3.8177e-05, -4.1676e-04], device='cuda:0') 100 0.0001 changing lr epoch 120, time 219.65, cls_loss 0.0042 cls_loss_mapping 0.0060 cls_loss_causal 0.5517 re_mapping 0.0085 re_causal 0.0251 /// teacc 98.84 lr 0.00010000 Epoch 122, weight, value: tensor([[-0.0423, -0.0464, -0.0373, ..., -0.0852, 0.0020, -0.0138], [-0.0338, 0.0234, -0.1153, ..., 0.0410, 0.0483, -0.0971], [ 0.0527, 0.0008, -0.0507, ..., -0.0397, -0.0288, 0.0051], ..., [ 0.0415, -0.0431, 0.0005, ..., 0.0248, -0.0214, 0.0344], [-0.0267, -0.0355, -0.0510, ..., -0.0113, -0.0192, -0.0401], [-0.0895, -0.1379, 0.0366, ..., -0.0499, 0.0181, 0.0293]], device='cuda:0'), grad: tensor([[ 3.2224e-07, 0.0000e+00, 2.0862e-07, ..., 0.0000e+00, 3.1851e-07, 0.0000e+00], [ 3.3900e-06, 0.0000e+00, 1.2843e-06, ..., 0.0000e+00, 2.7381e-07, 0.0000e+00], [ 5.3346e-06, 0.0000e+00, 1.3132e-07, ..., 0.0000e+00, 7.0259e-06, 0.0000e+00], ..., [-1.0841e-05, 0.0000e+00, 4.3120e-07, ..., 0.0000e+00, 3.8557e-07, 0.0000e+00], [-1.0632e-05, 0.0000e+00, 7.9256e-07, ..., 0.0000e+00, -2.5615e-05, 0.0000e+00], [ 6.6869e-07, 0.0000e+00, 9.8526e-05, ..., 0.0000e+00, 3.0287e-06, 0.0000e+00]], device='cuda:0') Epoch 122, bias, value: tensor([-0.0036, 0.0253, 0.0059, -0.0012, 0.0097, 0.0006, 0.0206, -0.0174, 0.0154, -0.0008], device='cuda:0'), grad: tensor([-1.7909e-06, 1.2085e-05, 4.6521e-05, 8.1480e-05, -3.6263e-04, 1.0304e-05, 9.7752e-06, -1.3247e-05, -1.5104e-04, 3.6764e-04], device='cuda:0') 100 0.0001 changing lr epoch 121, time 218.27, cls_loss 0.0038 cls_loss_mapping 0.0058 cls_loss_causal 0.5796 re_mapping 0.0081 re_causal 0.0251 /// teacc 98.73 lr 0.00010000 Epoch 123, weight, value: tensor([[-0.0426, -0.0470, -0.0376, ..., -0.0853, 0.0020, -0.0138], [-0.0337, 0.0231, -0.1159, ..., 0.0410, 0.0484, -0.0973], [ 0.0524, 0.0007, -0.0509, ..., -0.0398, -0.0290, 0.0054], ..., [ 0.0417, -0.0429, 0.0005, ..., 0.0249, -0.0216, 0.0344], [-0.0266, -0.0341, -0.0521, ..., -0.0113, -0.0193, -0.0402], [-0.0903, -0.1387, 0.0365, ..., -0.0499, 0.0185, 0.0293]], device='cuda:0'), grad: tensor([[ 6.6124e-08, 0.0000e+00, 1.0338e-07, ..., 0.0000e+00, 1.7602e-07, 6.8918e-08], [ 7.8324e-07, 0.0000e+00, 1.8068e-07, ..., 2.7940e-09, -6.5845e-07, 1.7695e-08], [-1.5525e-06, 0.0000e+00, 1.4622e-07, ..., 9.3132e-10, 2.9244e-07, -4.8429e-08], ..., [-4.4703e-07, 0.0000e+00, 6.1560e-07, ..., 8.3819e-09, 1.0319e-06, 3.6322e-08], [ 2.1048e-07, 0.0000e+00, 1.1146e-05, ..., 0.0000e+00, 5.3924e-07, 4.0978e-08], [ 7.4599e-07, 0.0000e+00, -9.6336e-06, ..., 9.3132e-10, -1.3925e-05, 1.7136e-07]], device='cuda:0') Epoch 123, bias, value: tensor([-0.0035, 0.0253, 0.0053, -0.0015, 0.0101, 0.0009, 0.0206, -0.0171, 0.0153, -0.0009], device='cuda:0'), grad: tensor([-1.8924e-05, 2.5779e-06, 1.6484e-06, 3.5524e-05, 8.7440e-05, -2.3335e-05, 4.7311e-06, 5.7630e-06, 2.2635e-05, -1.1802e-04], device='cuda:0') 100 0.0001 changing lr epoch 122, time 217.89, cls_loss 0.0045 cls_loss_mapping 0.0068 cls_loss_causal 0.5693 re_mapping 0.0082 re_causal 0.0244 /// teacc 98.82 lr 0.00010000 Epoch 124, weight, value: tensor([[-0.0426, -0.0489, -0.0372, ..., -0.0853, 0.0019, -0.0138], [-0.0349, 0.0233, -0.1164, ..., 0.0410, 0.0484, -0.0974], [ 0.0517, 0.0010, -0.0507, ..., -0.0398, -0.0296, 0.0055], ..., [ 0.0434, -0.0433, 0.0004, ..., 0.0249, -0.0216, 0.0345], [-0.0270, -0.0343, -0.0527, ..., -0.0113, -0.0191, -0.0402], [-0.0913, -0.1407, 0.0360, ..., -0.0499, 0.0186, 0.0294]], device='cuda:0'), grad: tensor([[ 6.2678e-07, 5.0291e-08, 3.0190e-05, ..., 0.0000e+00, 4.0555e-04, 1.0395e-03], [ 2.1532e-06, 6.4448e-07, 3.4459e-07, ..., 0.0000e+00, -6.1281e-06, 1.1707e-06], [ 6.0797e-06, 2.2855e-06, 4.6194e-07, ..., 0.0000e+00, 7.1488e-06, 1.3039e-05], ..., [-1.4164e-05, -4.1462e-06, 1.9465e-07, ..., 0.0000e+00, 4.1015e-06, 6.4299e-06], [ 1.7900e-06, 5.0571e-07, 9.4436e-07, ..., 0.0000e+00, 3.4794e-06, 4.2096e-07], [ 2.3730e-06, 2.8871e-08, -3.0905e-05, ..., 0.0000e+00, -4.2057e-04, -1.0691e-03]], device='cuda:0') Epoch 124, bias, value: tensor([-0.0031, 0.0245, 0.0046, -0.0014, 0.0098, 0.0005, 0.0216, -0.0160, 0.0155, -0.0015], device='cuda:0'), grad: tensor([ 3.3512e-03, -6.5193e-06, 7.2837e-05, 1.5944e-05, 2.4661e-05, 9.9763e-06, -6.9775e-06, -3.8624e-05, 2.0161e-05, -3.4447e-03], device='cuda:0') 100 0.0001 changing lr epoch 123, time 217.94, cls_loss 0.0048 cls_loss_mapping 0.0090 cls_loss_causal 0.5585 re_mapping 0.0081 re_causal 0.0244 /// teacc 98.73 lr 0.00010000 Epoch 125, weight, value: tensor([[-4.2627e-02, -5.0596e-02, -3.7549e-02, ..., -8.5329e-02, 1.2055e-03, -1.4830e-02], [-3.6399e-02, 2.5624e-02, -1.1669e-01, ..., 4.1077e-02, 4.8046e-02, -9.7861e-02], [ 5.3142e-02, 5.3213e-05, -5.0821e-02, ..., -3.9793e-02, -2.7590e-02, 5.0983e-03], ..., [ 4.3337e-02, -4.3581e-02, 1.4466e-03, ..., 2.4874e-02, -2.3025e-02, 3.4299e-02], [-2.7399e-02, -3.4579e-02, -5.3095e-02, ..., -1.1379e-02, -1.9393e-02, -4.0246e-02], [-9.2012e-02, -1.4119e-01, 3.4798e-02, ..., -4.9943e-02, 1.9133e-02, 3.0366e-02]], device='cuda:0'), grad: tensor([[ 2.9523e-07, 9.3132e-10, 7.2643e-08, ..., 0.0000e+00, 4.5002e-06, 0.0000e+00], [ 9.2573e-07, 5.5879e-09, 2.2259e-07, ..., 9.3132e-10, -5.0198e-07, 0.0000e+00], [ 1.3959e-04, 1.1176e-08, 6.3330e-08, ..., 0.0000e+00, 5.2482e-05, 0.0000e+00], ..., [ 1.1493e-06, -2.7940e-08, 1.4473e-06, ..., 1.3970e-08, 3.7681e-06, 0.0000e+00], [ 5.0142e-06, -2.7940e-09, 1.2480e-07, ..., 0.0000e+00, 1.8567e-05, 0.0000e+00], [ 1.0822e-06, 9.3132e-10, -2.6263e-07, ..., 1.8626e-09, -1.7047e-05, 0.0000e+00]], device='cuda:0') Epoch 125, bias, value: tensor([-0.0032, 0.0235, 0.0065, -0.0008, 0.0110, -0.0002, 0.0218, -0.0160, 0.0151, -0.0026], device='cuda:0'), grad: tensor([ 9.6262e-06, 1.2014e-06, 2.0218e-04, -2.0838e-04, 2.0593e-05, 1.2290e-04, -1.7917e-04, 1.5825e-05, 1.0389e-04, -8.8751e-05], device='cuda:0') 100 0.0001 changing lr epoch 124, time 218.06, cls_loss 0.0034 cls_loss_mapping 0.0056 cls_loss_causal 0.5950 re_mapping 0.0083 re_causal 0.0250 /// teacc 98.79 lr 0.00010000 Epoch 126, weight, value: tensor([[-0.0428, -0.0509, -0.0376, ..., -0.0866, 0.0010, -0.0148], [-0.0362, 0.0260, -0.1168, ..., 0.0423, 0.0494, -0.0980], [ 0.0527, -0.0002, -0.0503, ..., -0.0405, -0.0284, 0.0052], ..., [ 0.0435, -0.0438, 0.0012, ..., 0.0238, -0.0230, 0.0343], [-0.0281, -0.0339, -0.0535, ..., -0.0116, -0.0212, -0.0403], [-0.0922, -0.1420, 0.0347, ..., -0.0500, 0.0193, 0.0304]], device='cuda:0'), grad: tensor([[ 6.7875e-06, 0.0000e+00, 4.1537e-07, ..., 3.5390e-08, 5.9139e-07, 1.6764e-08], [ 3.1050e-06, 9.3132e-10, 2.1774e-06, ..., 6.9290e-07, -1.7852e-05, 1.8626e-09], [ 1.0759e-05, 9.3132e-10, 4.9081e-07, ..., 3.0734e-08, 1.4435e-07, 1.8626e-09], ..., [ 1.6838e-05, 0.0000e+00, 1.6131e-06, ..., 5.0291e-08, 8.5309e-07, 0.0000e+00], [ 5.2489e-06, -5.5879e-09, 2.3097e-07, ..., 2.7940e-09, 6.3889e-07, 3.6322e-08], [ 6.8024e-06, 0.0000e+00, 3.1479e-06, ..., 1.2666e-07, -5.5879e-09, 1.8626e-09]], device='cuda:0') Epoch 126, bias, value: tensor([-0.0029, 0.0244, 0.0063, -0.0001, 0.0111, -0.0007, 0.0218, -0.0160, 0.0137, -0.0028], device='cuda:0'), grad: tensor([ 3.1926e-06, -8.8871e-05, 2.6375e-05, -7.8917e-05, 7.8261e-05, 6.2212e-06, 3.8929e-06, 2.9132e-05, -3.8520e-06, 2.4319e-05], device='cuda:0') 100 0.0001 changing lr epoch 125, time 217.57, cls_loss 0.0039 cls_loss_mapping 0.0057 cls_loss_causal 0.5592 re_mapping 0.0083 re_causal 0.0252 /// teacc 98.84 lr 0.00010000 Epoch 127, weight, value: tensor([[-0.0426, -0.0521, -0.0376, ..., -0.0870, 0.0008, -0.0148], [-0.0368, 0.0249, -0.1158, ..., 0.0442, 0.0492, -0.0984], [ 0.0525, -0.0003, -0.0502, ..., -0.0409, -0.0288, 0.0055], ..., [ 0.0444, -0.0424, 0.0007, ..., 0.0229, -0.0225, 0.0340], [-0.0284, -0.0344, -0.0547, ..., -0.0118, -0.0204, -0.0404], [-0.0932, -0.1440, 0.0345, ..., -0.0502, 0.0194, 0.0304]], device='cuda:0'), grad: tensor([[ 1.8310e-06, 0.0000e+00, 1.2293e-07, ..., 0.0000e+00, 1.4342e-07, 0.0000e+00], [ 1.1725e-06, 0.0000e+00, 1.0338e-07, ..., 0.0000e+00, -2.3246e-06, 9.3132e-10], [-1.0657e-04, 0.0000e+00, 1.7695e-08, ..., 0.0000e+00, 3.2410e-07, -6.5193e-09], ..., [ 8.5354e-05, 0.0000e+00, 3.0454e-07, ..., 0.0000e+00, 1.3057e-06, 1.8626e-09], [ 4.9137e-06, 0.0000e+00, 5.5693e-07, ..., 0.0000e+00, 1.3243e-06, 1.8626e-09], [ 1.0684e-05, 0.0000e+00, -2.9616e-06, ..., 0.0000e+00, -7.9125e-06, 0.0000e+00]], device='cuda:0') Epoch 127, bias, value: tensor([-2.7319e-03, 2.4492e-02, 6.0260e-03, -1.9379e-05, 1.0846e-02, -5.6764e-04, 2.1557e-02, -1.5500e-02, 1.3877e-02, -3.1854e-03], device='cuda:0'), grad: tensor([-6.3851e-06, -2.0042e-06, -1.6820e-04, 1.6257e-05, 1.7643e-05, -8.6054e-06, 7.4655e-06, 1.4496e-04, 1.2785e-05, -1.4186e-05], device='cuda:0') 100 0.0001 changing lr epoch 126, time 217.37, cls_loss 0.0034 cls_loss_mapping 0.0062 cls_loss_causal 0.5403 re_mapping 0.0081 re_causal 0.0253 /// teacc 98.90 lr 0.00010000 Epoch 128, weight, value: tensor([[-0.0425, -0.0528, -0.0379, ..., -0.0872, 0.0007, -0.0148], [-0.0369, 0.0253, -0.1164, ..., 0.0444, 0.0497, -0.0987], [ 0.0513, -0.0005, -0.0504, ..., -0.0414, -0.0289, 0.0055], ..., [ 0.0457, -0.0424, 0.0005, ..., 0.0229, -0.0225, 0.0340], [-0.0285, -0.0345, -0.0547, ..., -0.0119, -0.0210, -0.0404], [-0.0939, -0.1444, 0.0344, ..., -0.0503, 0.0195, 0.0304]], device='cuda:0'), grad: tensor([[ 1.8422e-06, 0.0000e+00, 1.8338e-06, ..., 4.6566e-10, 1.2480e-07, 2.7940e-09], [ 1.0021e-05, 1.8626e-09, 2.9337e-06, ..., 2.3283e-09, -1.0785e-06, 2.0955e-08], [ 2.4457e-06, 9.3132e-10, 1.3243e-06, ..., 1.8626e-09, 1.7649e-07, 3.4925e-08], ..., [-2.2396e-05, -6.5193e-09, -3.8091e-06, ..., 4.6566e-09, 4.9965e-07, -1.0431e-07], [ 1.6659e-05, 4.6566e-10, 1.1057e-05, ..., 7.9162e-09, -3.0734e-08, 4.1910e-09], [ 2.5872e-06, 4.6566e-10, 8.7731e-07, ..., 1.1642e-08, 4.6100e-08, 2.5611e-08]], device='cuda:0') Epoch 128, bias, value: tensor([-0.0025, 0.0247, 0.0048, -0.0004, 0.0110, -0.0006, 0.0213, -0.0144, 0.0139, -0.0034], device='cuda:0'), grad: tensor([ 1.6913e-05, 3.0816e-05, 1.3649e-05, 3.4094e-05, 1.5363e-05, 2.3520e-04, -3.9649e-04, -5.6863e-05, 9.9123e-05, 8.1956e-06], device='cuda:0') 100 0.0001 changing lr epoch 127, time 217.16, cls_loss 0.0054 cls_loss_mapping 0.0067 cls_loss_causal 0.5697 re_mapping 0.0086 re_causal 0.0245 /// teacc 98.79 lr 0.00010000 Epoch 129, weight, value: tensor([[-0.0431, -0.0539, -0.0384, ..., -0.0876, 0.0008, -0.0148], [-0.0363, 0.0256, -0.1174, ..., 0.0447, 0.0508, -0.0988], [ 0.0513, 0.0008, -0.0493, ..., -0.0422, -0.0292, 0.0056], ..., [ 0.0455, -0.0440, 0.0005, ..., 0.0229, -0.0243, 0.0340], [-0.0289, -0.0348, -0.0554, ..., -0.0121, -0.0216, -0.0404], [-0.0930, -0.1463, 0.0342, ..., -0.0503, 0.0203, 0.0304]], device='cuda:0'), grad: tensor([[ 4.1258e-07, 0.0000e+00, 2.1420e-07, ..., 0.0000e+00, 6.1467e-08, 9.3132e-10], [ 9.4533e-05, 0.0000e+00, 6.4727e-07, ..., 4.6566e-09, -2.1420e-08, 5.5879e-09], [ 5.4762e-07, 0.0000e+00, 3.4459e-07, ..., 9.3132e-10, 1.0896e-07, 4.6566e-09], ..., [-7.2527e-04, 0.0000e+00, 6.6161e-05, ..., 2.7008e-08, 8.5831e-06, -2.6077e-08], [ 1.7481e-06, 0.0000e+00, 5.9828e-06, ..., 0.0000e+00, 1.6633e-06, 3.7253e-09], [ 5.7745e-04, 0.0000e+00, -9.6083e-05, ..., 0.0000e+00, -1.3061e-05, 8.3819e-09]], device='cuda:0') Epoch 129, bias, value: tensor([-0.0022, 0.0255, 0.0050, -0.0011, 0.0105, -0.0012, 0.0227, -0.0155, 0.0130, -0.0025], device='cuda:0'), grad: tensor([ 1.6550e-06, 2.4092e-04, 4.4219e-06, 1.1832e-04, 8.6546e-05, -1.9282e-05, 4.3474e-06, -1.5459e-03, 1.9282e-05, 1.0891e-03], device='cuda:0') 100 0.0001 changing lr epoch 128, time 217.40, cls_loss 0.0045 cls_loss_mapping 0.0068 cls_loss_causal 0.5977 re_mapping 0.0082 re_causal 0.0249 /// teacc 98.87 lr 0.00010000 Epoch 130, weight, value: tensor([[-0.0434, -0.0553, -0.0403, ..., -0.0878, -0.0003, -0.0160], [-0.0365, 0.0258, -0.1181, ..., 0.0450, 0.0512, -0.0995], [ 0.0518, 0.0008, -0.0492, ..., -0.0426, -0.0294, 0.0051], ..., [ 0.0457, -0.0443, 0.0003, ..., 0.0227, -0.0246, 0.0338], [-0.0284, -0.0349, -0.0574, ..., -0.0122, -0.0218, -0.0405], [-0.0943, -0.1489, 0.0365, ..., -0.0505, 0.0213, 0.0316]], device='cuda:0'), grad: tensor([[ 1.4072e-06, 2.8219e-07, 1.3411e-07, ..., 0.0000e+00, 3.3267e-06, 7.4506e-09], [ 1.6105e-04, 3.3975e-05, 1.5926e-07, ..., 0.0000e+00, -1.9222e-06, 9.3132e-10], [-2.0921e-04, -4.4346e-05, 3.9209e-07, ..., 0.0000e+00, 1.2405e-06, -4.0978e-08], ..., [ 8.6427e-06, 2.0191e-06, 1.5460e-07, ..., 0.0000e+00, 4.3306e-07, 1.1176e-08], [ 4.1910e-06, 7.6555e-07, -2.9746e-06, ..., 0.0000e+00, 1.5181e-07, 9.3132e-10], [ 1.1116e-05, 2.2855e-06, 2.7418e-06, ..., 0.0000e+00, 1.0040e-06, 1.4901e-08]], device='cuda:0') Epoch 130, bias, value: tensor([-0.0031, 0.0255, 0.0054, -0.0015, 0.0102, -0.0016, 0.0219, -0.0154, 0.0128, -0.0010], device='cuda:0'), grad: tensor([ 9.9242e-06, 2.6059e-04, -3.4046e-04, 2.0459e-05, 4.7684e-06, 1.5073e-05, -1.1809e-05, 1.6093e-05, -4.4331e-06, 2.9758e-05], device='cuda:0') 100 0.0001 changing lr epoch 129, time 217.34, cls_loss 0.0038 cls_loss_mapping 0.0056 cls_loss_causal 0.5527 re_mapping 0.0082 re_causal 0.0253 /// teacc 98.82 lr 0.00010000 Epoch 131, weight, value: tensor([[-0.0437, -0.0563, -0.0406, ..., -0.0878, -0.0007, -0.0165], [-0.0375, 0.0238, -0.1184, ..., 0.0451, 0.0515, -0.0998], [ 0.0517, 0.0022, -0.0495, ..., -0.0427, -0.0294, 0.0050], ..., [ 0.0467, -0.0440, 0.0016, ..., 0.0227, -0.0247, 0.0336], [-0.0289, -0.0351, -0.0576, ..., -0.0122, -0.0222, -0.0405], [-0.0954, -0.1510, 0.0365, ..., -0.0505, 0.0218, 0.0321]], device='cuda:0'), grad: tensor([[ 7.7300e-07, 1.0151e-07, 1.3877e-07, ..., 0.0000e+00, 3.6974e-07, 0.0000e+00], [ 8.5533e-06, 1.4249e-07, 3.5763e-07, ..., 0.0000e+00, -3.4198e-06, 0.0000e+00], [-5.0664e-06, -2.1160e-06, 1.4063e-07, ..., 0.0000e+00, 8.2608e-07, -5.5879e-09], ..., [ 7.8380e-06, 5.4482e-07, 1.8522e-05, ..., 0.0000e+00, 4.0568e-06, 2.7940e-09], [ 9.5144e-06, 1.1539e-06, 1.5851e-06, ..., 0.0000e+00, 3.0361e-07, 9.3132e-10], [ 7.9036e-05, 8.3819e-09, 1.2070e-04, ..., 0.0000e+00, -4.2245e-06, 0.0000e+00]], device='cuda:0') Epoch 131, bias, value: tensor([-0.0032, 0.0251, 0.0053, -0.0016, 0.0097, -0.0019, 0.0216, -0.0143, 0.0131, -0.0011], device='cuda:0'), grad: tensor([ 2.6338e-06, 9.9242e-06, -4.4778e-06, 1.5542e-05, -4.4227e-04, 3.9674e-06, 9.7975e-07, 7.2122e-05, 2.2560e-05, 3.1805e-04], device='cuda:0') 100 0.0001 changing lr epoch 130, time 217.21, cls_loss 0.0041 cls_loss_mapping 0.0064 cls_loss_causal 0.5552 re_mapping 0.0082 re_causal 0.0238 /// teacc 98.89 lr 0.00010000 Epoch 132, weight, value: tensor([[-0.0466, -0.0579, -0.0408, ..., -0.0879, -0.0012, -0.0166], [-0.0385, 0.0230, -0.1188, ..., 0.0449, 0.0518, -0.1009], [ 0.0519, 0.0021, -0.0489, ..., -0.0428, -0.0295, 0.0092], ..., [ 0.0473, -0.0432, 0.0017, ..., 0.0227, -0.0250, 0.0335], [-0.0294, -0.0348, -0.0575, ..., -0.0123, -0.0221, -0.0409], [-0.0963, -0.1540, 0.0359, ..., -0.0500, 0.0220, 0.0320]], device='cuda:0'), grad: tensor([[ 1.5646e-07, 2.7940e-09, 4.4424e-07, ..., 0.0000e+00, 1.2480e-07, 2.4214e-08], [ 4.7311e-07, 4.0047e-08, 1.7583e-06, ..., 0.0000e+00, -3.4831e-06, 7.4506e-09], [-5.1036e-07, 2.8871e-08, 7.6182e-07, ..., 0.0000e+00, 2.4866e-07, -2.4866e-07], ..., [-3.5688e-06, -1.7975e-07, 6.4913e-07, ..., 0.0000e+00, 1.5320e-06, 1.8068e-07], [ 4.8243e-07, 2.7940e-09, 4.0233e-07, ..., 0.0000e+00, 9.6671e-07, 2.1420e-08], [ 1.5311e-06, 5.2154e-08, 4.7326e-05, ..., 0.0000e+00, -9.8348e-07, 1.8626e-09]], device='cuda:0') Epoch 132, bias, value: tensor([-0.0034, 0.0246, 0.0053, -0.0010, 0.0100, -0.0024, 0.0197, -0.0138, 0.0151, -0.0015], device='cuda:0'), grad: tensor([ 2.4825e-05, -1.5842e-06, 8.5384e-06, 4.1090e-06, -8.9109e-05, 3.8072e-06, -8.0407e-05, 2.0396e-07, 8.9332e-06, 1.2082e-04], device='cuda:0') 100 0.0001 changing lr epoch 131, time 217.19, cls_loss 0.0032 cls_loss_mapping 0.0047 cls_loss_causal 0.5390 re_mapping 0.0083 re_causal 0.0238 /// teacc 98.80 lr 0.00010000 Epoch 133, weight, value: tensor([[-0.0463, -0.0582, -0.0412, ..., -0.0892, -0.0010, -0.0166], [-0.0383, 0.0232, -0.1192, ..., 0.0449, 0.0529, -0.1012], [ 0.0523, 0.0023, -0.0494, ..., -0.0434, -0.0297, 0.0095], ..., [ 0.0471, -0.0435, 0.0015, ..., 0.0225, -0.0259, 0.0332], [-0.0303, -0.0349, -0.0581, ..., -0.0126, -0.0239, -0.0412], [-0.0964, -0.1553, 0.0365, ..., -0.0498, 0.0228, 0.0320]], device='cuda:0'), grad: tensor([[ 4.6566e-07, 9.3132e-10, 8.1956e-08, ..., 2.7195e-07, 6.0629e-07, 0.0000e+00], [-2.2069e-05, 2.4214e-08, 9.7789e-08, ..., -2.5630e-05, -5.0306e-05, 0.0000e+00], [ 1.6153e-05, 4.6566e-09, 2.3376e-07, ..., 2.0161e-05, 3.9101e-05, 0.0000e+00], ..., [ 1.3942e-06, -7.1712e-08, -9.7416e-07, ..., 2.8163e-06, 5.8860e-06, 0.0000e+00], [ 8.4471e-07, 3.7253e-09, 6.8266e-07, ..., 5.1223e-08, 4.0885e-07, 0.0000e+00], [ 7.3668e-07, 1.5832e-08, 2.9337e-07, ..., 1.0990e-07, -6.2492e-07, 0.0000e+00]], device='cuda:0') Epoch 133, bias, value: tensor([-0.0031, 0.0250, 0.0054, -0.0017, 0.0094, -0.0017, 0.0194, -0.0142, 0.0142, -0.0007], device='cuda:0'), grad: tensor([-4.5240e-05, -2.7251e-04, 2.1565e-04, 4.6715e-06, 2.0668e-05, 2.1495e-06, 4.8101e-05, 1.2137e-05, 4.9211e-06, 9.2387e-06], device='cuda:0') 100 0.0001 changing lr epoch 132, time 217.25, cls_loss 0.0039 cls_loss_mapping 0.0056 cls_loss_causal 0.5650 re_mapping 0.0080 re_causal 0.0237 /// teacc 98.77 lr 0.00010000 Epoch 134, weight, value: tensor([[-0.0463, -0.0613, -0.0417, ..., -0.0893, -0.0013, -0.0187], [-0.0384, 0.0235, -0.1194, ..., 0.0455, 0.0533, -0.1029], [ 0.0525, 0.0023, -0.0488, ..., -0.0442, -0.0299, 0.0089], ..., [ 0.0473, -0.0432, 0.0013, ..., 0.0221, -0.0259, 0.0329], [-0.0310, -0.0349, -0.0587, ..., -0.0128, -0.0245, -0.0417], [-0.0979, -0.1640, 0.0360, ..., -0.0499, 0.0225, 0.0341]], device='cuda:0'), grad: tensor([[ 3.4198e-06, 1.8626e-09, 1.8626e-07, ..., 7.4506e-09, 6.5193e-08, 3.3434e-06], [ 1.7323e-07, 1.8626e-09, 1.8394e-06, ..., 3.1665e-07, -1.7062e-06, 4.3772e-08], [-5.4911e-06, -4.1910e-08, 4.7218e-07, ..., 4.2841e-08, 1.4994e-07, -5.1446e-06], ..., [ 7.6462e-07, 3.5390e-08, 9.9931e-07, ..., 6.5193e-08, 7.1619e-07, 6.5845e-07], [ 2.3376e-07, -2.7940e-09, 9.4250e-07, ..., 4.6566e-09, 7.9256e-07, 1.2200e-07], [ 4.0885e-07, 9.3132e-10, 1.3942e-06, ..., 5.5879e-08, -5.8766e-07, 2.5798e-07]], device='cuda:0') Epoch 134, bias, value: tensor([-0.0042, 0.0252, 0.0058, -0.0018, 0.0096, -0.0012, 0.0194, -0.0140, 0.0137, -0.0006], device='cuda:0'), grad: tensor([-3.5260e-06, 2.3991e-06, -1.3016e-05, 1.7034e-06, -1.1824e-05, 1.2685e-06, 8.4788e-06, 6.2846e-06, 3.9376e-06, 4.2804e-06], device='cuda:0') 100 0.0001 changing lr epoch 133, time 217.18, cls_loss 0.0043 cls_loss_mapping 0.0053 cls_loss_causal 0.5260 re_mapping 0.0081 re_causal 0.0236 /// teacc 98.89 lr 0.00010000 Epoch 135, weight, value: tensor([[-0.0463, -0.0633, -0.0420, ..., -0.0893, -0.0027, -0.0206], [-0.0399, 0.0244, -0.1198, ..., 0.0464, 0.0537, -0.1077], [ 0.0537, 0.0029, -0.0491, ..., -0.0443, -0.0300, 0.0102], ..., [ 0.0469, -0.0448, 0.0012, ..., 0.0213, -0.0263, 0.0317], [-0.0315, -0.0351, -0.0591, ..., -0.0130, -0.0244, -0.0432], [-0.0995, -0.1688, 0.0354, ..., -0.0501, 0.0233, 0.0359]], device='cuda:0'), grad: tensor([[ 2.6636e-07, 1.6764e-08, 3.1665e-08, ..., 0.0000e+00, -1.0364e-05, 0.0000e+00], [ 9.0618e-07, 3.4273e-07, 8.3819e-08, ..., 0.0000e+00, -3.4496e-06, 0.0000e+00], [-2.2911e-06, 3.1944e-07, 5.3085e-08, ..., 9.3132e-10, 1.6820e-06, -9.3132e-10], ..., [ 9.4995e-06, -1.3513e-06, 3.7216e-06, ..., 9.3132e-10, 4.5169e-07, 9.3132e-10], [ 1.9260e-06, 6.8918e-08, 1.1735e-07, ..., 9.3132e-10, 3.4153e-05, 0.0000e+00], [-5.7369e-05, 1.4901e-08, -1.3188e-05, ..., 0.0000e+00, 1.6633e-06, 0.0000e+00]], device='cuda:0') Epoch 135, bias, value: tensor([-0.0055, 0.0245, 0.0065, -0.0011, 0.0104, -0.0005, 0.0197, -0.0143, 0.0131, -0.0003], device='cuda:0'), grad: tensor([-4.0740e-05, -4.6194e-06, 3.1628e-06, 8.1360e-06, 3.3259e-04, 5.8270e-04, -6.5899e-04, 1.1539e-04, 1.3030e-04, -4.6730e-04], device='cuda:0') 100 0.0001 changing lr epoch 134, time 217.67, cls_loss 0.0039 cls_loss_mapping 0.0075 cls_loss_causal 0.5445 re_mapping 0.0081 re_causal 0.0231 /// teacc 98.92 lr 0.00010000 Epoch 136, weight, value: tensor([[-0.0446, -0.0650, -0.0424, ..., -0.0896, -0.0028, -0.0208], [-0.0403, 0.0243, -0.1201, ..., 0.0471, 0.0548, -0.1087], [ 0.0539, 0.0028, -0.0515, ..., -0.0444, -0.0302, 0.0104], ..., [ 0.0469, -0.0449, 0.0010, ..., 0.0210, -0.0274, 0.0314], [-0.0317, -0.0344, -0.0599, ..., -0.0131, -0.0246, -0.0438], [-0.1025, -0.1704, 0.0346, ..., -0.0506, 0.0234, 0.0363]], device='cuda:0'), grad: tensor([[ 1.3322e-05, 0.0000e+00, 5.4911e-06, ..., 0.0000e+00, 1.3970e-08, 0.0000e+00], [ 5.3458e-06, 0.0000e+00, 2.2128e-06, ..., 0.0000e+00, -1.4529e-07, 9.3132e-10], [ 1.5602e-05, 0.0000e+00, 6.6981e-06, ..., 0.0000e+00, 5.5879e-08, 4.6566e-09], ..., [ 4.7497e-08, 0.0000e+00, 1.9744e-07, ..., 0.0000e+00, 4.2841e-08, -7.4506e-09], [ 1.3858e-06, 0.0000e+00, 8.2701e-07, ..., 0.0000e+00, 8.3819e-09, 0.0000e+00], [-1.6898e-05, 0.0000e+00, -6.8434e-06, ..., 0.0000e+00, -2.0489e-08, 9.3132e-10]], device='cuda:0') Epoch 136, bias, value: tensor([-4.9428e-03, 2.4967e-02, 6.1505e-03, -4.1212e-04, 1.0852e-02, 9.9237e-05, 1.9116e-02, -1.4833e-02, 1.2888e-02, -9.0948e-04], device='cuda:0'), grad: tensor([ 4.9740e-05, 1.3009e-05, 4.1425e-05, 1.8209e-05, 3.4142e-04, 1.0274e-05, -4.0984e-04, 1.1306e-06, 3.4682e-06, -6.9201e-05], device='cuda:0') 100 0.0001 changing lr epoch 135, time 217.64, cls_loss 0.0030 cls_loss_mapping 0.0049 cls_loss_causal 0.5488 re_mapping 0.0080 re_causal 0.0247 /// teacc 98.76 lr 0.00010000 Epoch 137, weight, value: tensor([[-0.0448, -0.0658, -0.0425, ..., -0.0896, -0.0030, -0.0208], [-0.0404, 0.0244, -0.1203, ..., 0.0471, 0.0550, -0.1090], [ 0.0538, 0.0026, -0.0521, ..., -0.0444, -0.0305, 0.0103], ..., [ 0.0471, -0.0449, 0.0011, ..., 0.0211, -0.0276, 0.0316], [-0.0315, -0.0326, -0.0603, ..., -0.0132, -0.0247, -0.0440], [-0.1029, -0.1710, 0.0333, ..., -0.0515, 0.0235, 0.0363]], device='cuda:0'), grad: tensor([[ 2.5332e-07, 0.0000e+00, 4.4703e-08, ..., 0.0000e+00, 2.0396e-07, 1.5739e-07], [ 1.1683e-05, 0.0000e+00, 1.1083e-07, ..., 0.0000e+00, -2.6245e-06, 1.6810e-06], [ 2.2486e-05, 0.0000e+00, -1.3867e-06, ..., 0.0000e+00, 1.5507e-06, 3.6452e-06], ..., [-5.7071e-05, 0.0000e+00, 4.0978e-08, ..., 0.0000e+00, 6.8173e-07, -8.5905e-06], [ 3.2987e-06, 0.0000e+00, 1.6764e-07, ..., 0.0000e+00, 4.4145e-07, 3.7067e-07], [ 1.4596e-05, 0.0000e+00, 7.7300e-08, ..., 0.0000e+00, -2.3954e-06, 2.1141e-06]], device='cuda:0') Epoch 137, bias, value: tensor([-4.9095e-03, 2.4978e-02, 5.8470e-03, 8.5718e-05, 1.1930e-02, -4.4005e-04, 1.9489e-02, -1.4733e-02, 1.3005e-02, -1.7870e-03], device='cuda:0'), grad: tensor([-1.5870e-05, 3.3855e-05, 9.0301e-05, 1.9558e-06, 1.3597e-05, 7.1041e-06, 3.5763e-06, -2.0278e-04, 1.1377e-05, 5.6952e-05], device='cuda:0') 100 0.0001 changing lr epoch 136, time 217.36, cls_loss 0.0033 cls_loss_mapping 0.0059 cls_loss_causal 0.5427 re_mapping 0.0076 re_causal 0.0228 /// teacc 98.84 lr 0.00010000 Epoch 138, weight, value: tensor([[-0.0449, -0.0669, -0.0431, ..., -0.0898, -0.0024, -0.0211], [-0.0418, 0.0247, -0.1206, ..., 0.0471, 0.0555, -0.1102], [ 0.0541, 0.0025, -0.0514, ..., -0.0444, -0.0306, 0.0113], ..., [ 0.0481, -0.0450, 0.0007, ..., 0.0214, -0.0281, 0.0316], [-0.0318, -0.0334, -0.0608, ..., -0.0134, -0.0248, -0.0449], [-0.1037, -0.1713, 0.0326, ..., -0.0523, 0.0237, 0.0365]], device='cuda:0'), grad: tensor([[-1.4043e-04, 9.3132e-10, 6.2212e-07, ..., 1.8626e-09, -2.4009e-06, 9.8720e-08], [ 3.7961e-06, -4.7497e-08, 3.4086e-07, ..., 3.4459e-08, -3.8370e-06, 5.4948e-08], [-1.2718e-05, 8.3819e-09, -4.4443e-06, ..., 3.7253e-09, -1.1260e-06, -1.3718e-06], ..., [ 4.2692e-06, 1.0245e-08, 4.8708e-07, ..., 1.2107e-08, 6.2026e-07, 5.4017e-08], [ 6.9402e-06, 1.3970e-08, 2.5183e-06, ..., 1.8626e-09, 2.3618e-06, 3.9116e-08], [ 1.2279e-04, 9.3132e-10, 1.3933e-06, ..., 8.3819e-09, 2.7195e-06, 7.5437e-08]], device='cuda:0') Epoch 138, bias, value: tensor([-4.3489e-03, 2.3705e-02, 5.6880e-03, -1.3175e-04, 1.2197e-02, -5.3660e-05, 1.9176e-02, -1.3496e-02, 1.3220e-02, -2.2984e-03], device='cuda:0'), grad: tensor([-7.7009e-04, 1.1409e-06, -1.7881e-05, 5.5611e-05, 6.0759e-06, 5.9903e-06, 9.7975e-06, 7.3388e-06, 3.2365e-05, 6.6900e-04], device='cuda:0') 100 0.0001 changing lr epoch 137, time 217.46, cls_loss 0.0042 cls_loss_mapping 0.0060 cls_loss_causal 0.5544 re_mapping 0.0078 re_causal 0.0232 /// teacc 98.87 lr 0.00010000 Epoch 139, weight, value: tensor([[-0.0448, -0.0679, -0.0437, ..., -0.0912, -0.0026, -0.0211], [-0.0423, 0.0250, -0.1206, ..., 0.0474, 0.0550, -0.1124], [ 0.0543, 0.0024, -0.0517, ..., -0.0447, -0.0295, 0.0122], ..., [ 0.0486, -0.0451, 0.0005, ..., 0.0205, -0.0287, 0.0302], [-0.0324, -0.0338, -0.0610, ..., -0.0138, -0.0253, -0.0470], [-0.1049, -0.1715, 0.0322, ..., -0.0523, 0.0242, 0.0366]], device='cuda:0'), grad: tensor([[ 2.2072e-07, 0.0000e+00, 1.9558e-08, ..., 0.0000e+00, 2.0899e-06, 1.8626e-08], [ 5.0757e-07, 5.5879e-09, 2.4214e-08, ..., 0.0000e+00, 5.5879e-08, 1.3970e-08], [ 7.1526e-06, 9.3132e-10, 4.1910e-08, ..., 0.0000e+00, 1.5870e-06, 3.7253e-09], ..., [ 1.2174e-05, -9.3132e-09, 1.0710e-07, ..., 0.0000e+00, 1.0626e-06, 3.9116e-08], [ 6.2585e-07, 9.3132e-10, 2.1141e-07, ..., 0.0000e+00, 2.4289e-06, 1.2014e-07], [ 6.1095e-07, 9.3132e-10, -6.9197e-07, ..., 0.0000e+00, -6.6385e-06, -3.9302e-07]], device='cuda:0') Epoch 139, bias, value: tensor([-0.0043, 0.0236, 0.0059, 0.0019, 0.0121, -0.0017, 0.0196, -0.0133, 0.0130, -0.0027], device='cuda:0'), grad: tensor([ 4.6156e-06, 1.5106e-06, 1.3456e-05, -2.1562e-05, 2.1249e-05, 3.4962e-06, -1.4141e-05, 2.0027e-05, 7.1302e-06, -3.5793e-05], device='cuda:0') 100 0.0001 changing lr epoch 138, time 217.48, cls_loss 0.0047 cls_loss_mapping 0.0060 cls_loss_causal 0.5589 re_mapping 0.0077 re_causal 0.0233 /// teacc 98.84 lr 0.00010000 Epoch 140, weight, value: tensor([[-0.0451, -0.0694, -0.0438, ..., -0.0915, -0.0029, -0.0212], [-0.0424, 0.0250, -0.1212, ..., 0.0492, 0.0561, -0.1182], [ 0.0536, 0.0024, -0.0518, ..., -0.0449, -0.0290, 0.0139], ..., [ 0.0495, -0.0451, 0.0025, ..., 0.0231, -0.0302, 0.0292], [-0.0327, -0.0336, -0.0619, ..., -0.0139, -0.0265, -0.0526], [-0.1056, -0.1719, 0.0334, ..., -0.0525, 0.0227, 0.0365]], device='cuda:0'), grad: tensor([[-2.9244e-07, 9.3132e-10, 3.9581e-07, ..., 0.0000e+00, 5.7742e-08, 2.1420e-08], [ 1.7285e-06, 3.7253e-09, 2.3823e-06, ..., 1.7788e-07, -7.9162e-08, 1.3132e-07], [-9.8813e-07, 7.4506e-09, 6.0350e-07, ..., 9.3132e-10, 1.6298e-07, 3.4459e-08], ..., [-2.2240e-06, -1.9558e-08, 7.5027e-06, ..., -2.0675e-07, 4.1723e-07, 4.1444e-07], [-7.4413e-07, 9.3132e-10, 6.8359e-07, ..., 9.3132e-10, -3.6769e-06, 2.7008e-08], [ 7.7859e-07, 9.3132e-10, 3.5837e-06, ..., 2.1420e-08, -2.4047e-06, 2.9150e-07]], device='cuda:0') Epoch 140, bias, value: tensor([-0.0044, 0.0243, 0.0056, 0.0009, 0.0124, -0.0006, 0.0198, -0.0127, 0.0120, -0.0035], device='cuda:0'), grad: tensor([ 1.6764e-07, 8.1733e-06, 1.2890e-06, 1.4327e-05, -2.9519e-05, 8.2105e-06, 1.5181e-06, 1.1414e-05, -2.0623e-05, 5.0142e-06], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 139---------------------------------------------------- epoch 139, time 218.25, cls_loss 0.0035 cls_loss_mapping 0.0048 cls_loss_causal 0.5540 re_mapping 0.0076 re_causal 0.0230 /// teacc 98.93 lr 0.00010000 Epoch 141, weight, value: tensor([[-0.0454, -0.0696, -0.0441, ..., -0.0916, -0.0033, -0.0213], [-0.0428, 0.0251, -0.1215, ..., 0.0494, 0.0568, -0.1189], [ 0.0539, 0.0017, -0.0528, ..., -0.0449, -0.0294, 0.0139], ..., [ 0.0491, -0.0452, 0.0023, ..., 0.0229, -0.0309, 0.0282], [-0.0330, -0.0320, -0.0622, ..., -0.0140, -0.0268, -0.0538], [-0.1058, -0.1722, 0.0339, ..., -0.0523, 0.0233, 0.0367]], device='cuda:0'), grad: tensor([[-2.4121e-06, 1.8626e-09, 3.4180e-07, ..., 0.0000e+00, 2.7101e-07, 4.1910e-08], [ 3.8277e-07, 2.7940e-09, 1.4901e-07, ..., 0.0000e+00, -5.5879e-09, 1.0245e-08], [-3.5297e-07, -2.9802e-08, 2.8536e-06, ..., 0.0000e+00, 5.9325e-07, -4.6566e-09], ..., [-5.1036e-07, 9.3132e-09, 1.4435e-07, ..., 0.0000e+00, 3.7625e-07, 7.4506e-09], [ 3.1851e-07, 4.6566e-09, -8.1584e-07, ..., 0.0000e+00, 2.1271e-06, 3.3900e-07], [ 1.5451e-06, 9.3132e-10, 2.3656e-07, ..., 0.0000e+00, -1.5395e-06, 1.8626e-09]], device='cuda:0') Epoch 141, bias, value: tensor([-0.0046, 0.0242, 0.0054, 0.0019, 0.0122, -0.0005, 0.0196, -0.0133, 0.0120, -0.0029], device='cuda:0'), grad: tensor([-5.8636e-06, 1.3923e-06, 1.5125e-05, 3.0808e-06, 3.4496e-06, 7.8797e-05, -1.0431e-04, 1.5749e-06, 8.1062e-06, -1.5376e-06], device='cuda:0') 100 0.0001 changing lr epoch 140, time 217.41, cls_loss 0.0032 cls_loss_mapping 0.0053 cls_loss_causal 0.5313 re_mapping 0.0077 re_causal 0.0235 /// teacc 98.85 lr 0.00010000 Epoch 142, weight, value: tensor([[-0.0447, -0.0701, -0.0445, ..., -0.0917, -0.0038, -0.0215], [-0.0425, 0.0258, -0.1222, ..., 0.0500, 0.0573, -0.1193], [ 0.0538, 0.0013, -0.0520, ..., -0.0469, -0.0297, 0.0162], ..., [ 0.0490, -0.0456, 0.0023, ..., 0.0228, -0.0317, 0.0284], [-0.0329, -0.0321, -0.0626, ..., -0.0140, -0.0270, -0.0592], [-0.1066, -0.1723, 0.0340, ..., -0.0524, 0.0241, 0.0365]], device='cuda:0'), grad: tensor([[-2.8033e-07, 0.0000e+00, 8.9034e-07, ..., 0.0000e+00, 2.0675e-07, 6.5193e-08], [ 1.0421e-06, 0.0000e+00, 1.2983e-06, ..., 0.0000e+00, -1.0375e-06, 9.2201e-08], [ 4.2692e-06, 0.0000e+00, 4.3064e-06, ..., 0.0000e+00, 1.2480e-07, 3.8464e-07], ..., [-1.5497e-06, 0.0000e+00, 2.4289e-06, ..., 0.0000e+00, 4.1258e-07, -8.6706e-07], [ 1.0962e-06, 0.0000e+00, 1.9185e-07, ..., 0.0000e+00, 5.1223e-08, 2.5984e-07], [ 7.5717e-07, 0.0000e+00, 6.8545e-06, ..., 0.0000e+00, 4.9733e-07, 6.8918e-08]], device='cuda:0') Epoch 142, bias, value: tensor([-0.0046, 0.0246, 0.0050, 0.0019, 0.0117, -0.0004, 0.0194, -0.0136, 0.0123, -0.0024], device='cuda:0'), grad: tensor([-4.0792e-06, 1.3718e-06, 1.4804e-05, 2.8163e-06, -3.7611e-05, -7.2550e-07, 5.1558e-06, 1.1157e-06, -5.2247e-07, 1.7643e-05], device='cuda:0') 100 0.0001 changing lr epoch 141, time 217.42, cls_loss 0.0035 cls_loss_mapping 0.0060 cls_loss_causal 0.5447 re_mapping 0.0077 re_causal 0.0229 /// teacc 98.83 lr 0.00010000 Epoch 143, weight, value: tensor([[-0.0448, -0.0706, -0.0453, ..., -0.0917, -0.0041, -0.0215], [-0.0414, 0.0260, -0.1232, ..., 0.0500, 0.0595, -0.1196], [ 0.0543, 0.0015, -0.0515, ..., -0.0469, -0.0298, 0.0175], ..., [ 0.0485, -0.0459, 0.0021, ..., 0.0228, -0.0325, 0.0282], [-0.0333, -0.0323, -0.0634, ..., -0.0140, -0.0274, -0.0595], [-0.1097, -0.1725, 0.0332, ..., -0.0524, 0.0228, 0.0364]], device='cuda:0'), grad: tensor([[ 1.1083e-06, 4.6566e-09, 2.0768e-07, ..., 0.0000e+00, 4.4703e-08, 0.0000e+00], [ 7.3314e-06, 2.0768e-07, 1.2480e-07, ..., 0.0000e+00, 3.3807e-07, 0.0000e+00], [-3.9369e-05, 4.2841e-08, -5.3197e-06, ..., 0.0000e+00, 1.3039e-08, 0.0000e+00], ..., [-9.8944e-06, -5.2340e-07, 1.1297e-06, ..., 0.0000e+00, -1.5935e-06, 0.0000e+00], [ 2.1383e-05, 3.7253e-09, 1.2234e-05, ..., 0.0000e+00, 7.1116e-06, 0.0000e+00], [ 6.4820e-06, 1.1548e-07, 2.1048e-07, ..., 0.0000e+00, 6.9663e-07, 0.0000e+00]], device='cuda:0') Epoch 143, bias, value: tensor([-0.0046, 0.0259, 0.0053, 0.0023, 0.0116, -0.0006, 0.0198, -0.0144, 0.0118, -0.0031], device='cuda:0'), grad: tensor([ 1.6829e-06, 1.2398e-05, -7.4327e-05, 1.2413e-05, 8.0094e-06, 1.1623e-04, -1.9383e-04, -1.9014e-05, 1.2445e-04, 1.1809e-05], device='cuda:0') 100 0.0001 changing lr epoch 142, time 217.58, cls_loss 0.0028 cls_loss_mapping 0.0043 cls_loss_causal 0.5187 re_mapping 0.0077 re_causal 0.0228 /// teacc 98.89 lr 0.00010000 Epoch 144, weight, value: tensor([[-0.0450, -0.0708, -0.0456, ..., -0.0917, -0.0038, -0.0215], [-0.0414, 0.0262, -0.1243, ..., 0.0500, 0.0585, -0.1197], [ 0.0548, 0.0012, -0.0511, ..., -0.0470, -0.0299, 0.0174], ..., [ 0.0485, -0.0460, 0.0019, ..., 0.0228, -0.0327, 0.0282], [-0.0336, -0.0314, -0.0640, ..., -0.0140, -0.0252, -0.0596], [-0.1108, -0.1727, 0.0348, ..., -0.0524, 0.0231, 0.0364]], device='cuda:0'), grad: tensor([[-6.2678e-07, 1.8626e-09, 2.8498e-07, ..., 2.3283e-08, 1.2573e-07, 0.0000e+00], [ 6.0070e-07, -8.2888e-07, 1.0720e-06, ..., 8.8476e-08, -6.0052e-06, 0.0000e+00], [ 6.2361e-06, 5.9884e-07, 5.0385e-07, ..., 3.3528e-08, 4.2655e-06, 0.0000e+00], ..., [ 1.3024e-05, 5.5879e-08, -4.5568e-05, ..., -1.4752e-05, 4.6939e-06, 0.0000e+00], [ 2.4572e-05, 1.3877e-07, -1.7630e-06, ..., 2.9802e-08, 1.2793e-05, 0.0000e+00], [ 1.3085e-06, 1.1176e-08, 3.5018e-06, ..., 1.3504e-07, -2.5518e-07, 0.0000e+00]], device='cuda:0') Epoch 144, bias, value: tensor([-0.0042, 0.0249, 0.0056, 0.0021, 0.0100, -0.0005, 0.0199, -0.0144, 0.0132, -0.0019], device='cuda:0'), grad: tensor([-4.2543e-06, -1.4521e-05, 3.1054e-05, -7.2896e-05, 2.6584e-04, 1.2696e-05, -7.4983e-05, -2.7537e-04, 1.1462e-04, 1.8358e-05], device='cuda:0') 100 0.0001 changing lr epoch 143, time 217.66, cls_loss 0.0027 cls_loss_mapping 0.0044 cls_loss_causal 0.5373 re_mapping 0.0075 re_causal 0.0226 /// teacc 98.84 lr 0.00010000 Epoch 145, weight, value: tensor([[-0.0444, -0.0713, -0.0461, ..., -0.0917, -0.0041, -0.0215], [-0.0419, 0.0264, -0.1247, ..., 0.0500, 0.0575, -0.1198], [ 0.0551, 0.0011, -0.0513, ..., -0.0470, -0.0301, 0.0174], ..., [ 0.0489, -0.0460, 0.0010, ..., 0.0229, -0.0309, 0.0276], [-0.0344, -0.0313, -0.0642, ..., -0.0140, -0.0254, -0.0596], [-0.1114, -0.1728, 0.0348, ..., -0.0524, 0.0231, 0.0364]], device='cuda:0'), grad: tensor([[-4.3306e-07, 2.2352e-08, 1.2293e-07, ..., 0.0000e+00, 1.9372e-07, 0.0000e+00], [ 2.6915e-07, -2.9523e-06, 3.0082e-07, ..., 0.0000e+00, -5.8301e-06, 0.0000e+00], [ 3.0547e-06, 2.6356e-07, 1.5274e-07, ..., 0.0000e+00, 2.2277e-06, 0.0000e+00], ..., [-1.4156e-05, 8.3819e-08, 7.1712e-08, ..., 0.0000e+00, -3.4869e-06, 0.0000e+00], [ 2.6524e-06, 2.1607e-06, 1.0151e-07, ..., 0.0000e+00, 5.1558e-06, 9.3132e-10], [ 7.3574e-06, 1.8626e-08, 2.2724e-06, ..., 0.0000e+00, 9.8627e-07, 9.3132e-10]], device='cuda:0') Epoch 145, bias, value: tensor([-4.0226e-03, 2.4210e-02, 5.7599e-03, 1.7476e-03, 1.0116e-02, 1.7832e-05, 1.9924e-02, -1.3649e-02, 1.2958e-02, -2.1085e-03], device='cuda:0'), grad: tensor([-3.8035e-06, -1.1526e-05, 1.4603e-05, 5.5581e-06, -5.6103e-06, -2.2613e-06, -2.2873e-06, -4.0263e-05, 1.9819e-05, 2.5690e-05], device='cuda:0') 100 0.0001 changing lr epoch 144, time 217.57, cls_loss 0.0024 cls_loss_mapping 0.0042 cls_loss_causal 0.5331 re_mapping 0.0073 re_causal 0.0216 /// teacc 98.89 lr 0.00010000 Epoch 146, weight, value: tensor([[-0.0445, -0.0720, -0.0463, ..., -0.0917, -0.0044, -0.0215], [-0.0419, 0.0265, -0.1249, ..., 0.0500, 0.0578, -0.1200], [ 0.0554, 0.0008, -0.0515, ..., -0.0470, -0.0303, 0.0174], ..., [ 0.0488, -0.0460, 0.0014, ..., 0.0229, -0.0311, 0.0273], [-0.0350, -0.0306, -0.0640, ..., -0.0140, -0.0262, -0.0597], [-0.1121, -0.1734, 0.0342, ..., -0.0524, 0.0235, 0.0364]], device='cuda:0'), grad: tensor([[ 2.8126e-07, 0.0000e+00, 1.6671e-07, ..., 0.0000e+00, 1.1176e-07, 0.0000e+00], [ 9.6112e-07, 0.0000e+00, 7.8324e-07, ..., 0.0000e+00, -5.8673e-08, 0.0000e+00], [-2.6673e-06, 0.0000e+00, 1.0710e-07, ..., 0.0000e+00, 2.9430e-07, 0.0000e+00], ..., [ 9.1866e-06, 0.0000e+00, 1.7554e-05, ..., 0.0000e+00, 4.7535e-06, 0.0000e+00], [ 2.7195e-06, 0.0000e+00, 4.5449e-07, ..., 0.0000e+00, 8.2236e-07, 0.0000e+00], [ 1.2228e-06, 0.0000e+00, 4.8369e-05, ..., 0.0000e+00, 7.8231e-06, 0.0000e+00]], device='cuda:0') Epoch 146, bias, value: tensor([-3.6336e-03, 2.4358e-02, 5.8552e-03, 1.8479e-03, 1.0120e-02, 1.4250e-05, 2.0323e-02, -1.3637e-02, 1.2483e-02, -2.4146e-03], device='cuda:0'), grad: tensor([-2.7902e-06, 3.3528e-06, -4.1872e-06, 3.2540e-06, -1.9431e-04, -1.9789e-05, -1.9651e-06, 6.4611e-05, 8.3670e-06, 1.4329e-04], device='cuda:0') 100 0.0001 changing lr epoch 145, time 217.47, cls_loss 0.0028 cls_loss_mapping 0.0051 cls_loss_causal 0.5687 re_mapping 0.0077 re_causal 0.0228 /// teacc 98.85 lr 0.00010000 Epoch 147, weight, value: tensor([[-0.0445, -0.0724, -0.0466, ..., -0.0919, -0.0047, -0.0215], [-0.0420, 0.0266, -0.1255, ..., 0.0500, 0.0585, -0.1201], [ 0.0554, 0.0007, -0.0519, ..., -0.0470, -0.0305, 0.0174], ..., [ 0.0490, -0.0461, 0.0010, ..., 0.0228, -0.0315, 0.0276], [-0.0355, -0.0295, -0.0640, ..., -0.0141, -0.0263, -0.0598], [-0.1123, -0.1737, 0.0340, ..., -0.0524, 0.0241, 0.0364]], device='cuda:0'), grad: tensor([[ 6.8918e-08, 0.0000e+00, 5.7649e-07, ..., 0.0000e+00, 7.3947e-07, 2.0489e-07], [ 1.0477e-06, -2.8871e-08, 5.6811e-08, ..., 0.0000e+00, 1.5367e-06, 4.5728e-07], [-3.5316e-06, 1.3970e-08, 3.2503e-07, ..., 0.0000e+00, -2.1290e-06, 3.4459e-08], ..., [-7.1339e-07, 5.5879e-09, 4.7497e-08, ..., 0.0000e+00, -9.7826e-06, -3.1497e-06], [ 2.7083e-06, 3.7253e-09, 1.8626e-09, ..., 0.0000e+00, 2.0787e-06, 3.5390e-08], [ 2.0675e-07, 0.0000e+00, 1.0738e-06, ..., 0.0000e+00, 6.0797e-06, 2.2184e-06]], device='cuda:0') Epoch 147, bias, value: tensor([-0.0035, 0.0246, 0.0056, 0.0007, 0.0101, 0.0006, 0.0204, -0.0136, 0.0126, -0.0024], device='cuda:0'), grad: tensor([ 4.9248e-06, 9.6038e-06, -8.0019e-06, 7.2643e-07, -3.5390e-08, 7.0706e-06, -4.2841e-06, -5.3465e-05, 8.5384e-06, 3.4958e-05], device='cuda:0') 100 0.0001 changing lr epoch 146, time 217.52, cls_loss 0.0028 cls_loss_mapping 0.0055 cls_loss_causal 0.5228 re_mapping 0.0077 re_causal 0.0230 /// teacc 98.90 lr 0.00010000 Epoch 148, weight, value: tensor([[-0.0446, -0.0726, -0.0467, ..., -0.0919, -0.0049, -0.0215], [-0.0424, 0.0266, -0.1268, ..., 0.0500, 0.0589, -0.1203], [ 0.0559, 0.0009, -0.0518, ..., -0.0470, -0.0298, 0.0173], ..., [ 0.0491, -0.0462, 0.0009, ..., 0.0228, -0.0323, 0.0277], [-0.0360, -0.0287, -0.0648, ..., -0.0141, -0.0271, -0.0599], [-0.1123, -0.1738, 0.0340, ..., -0.0524, 0.0246, 0.0364]], device='cuda:0'), grad: tensor([[ 6.2399e-08, 0.0000e+00, -9.7096e-05, ..., 0.0000e+00, 1.6484e-07, 3.7253e-09], [-7.2271e-07, 0.0000e+00, 1.0459e-06, ..., 0.0000e+00, -1.3262e-06, 9.3132e-10], [ 1.0934e-06, 0.0000e+00, 5.0180e-06, ..., 0.0000e+00, 1.6801e-06, 9.3132e-10], ..., [-1.6009e-06, 0.0000e+00, 3.1572e-07, ..., 0.0000e+00, -9.3132e-08, 0.0000e+00], [ 1.2759e-07, 0.0000e+00, -4.2439e-05, ..., 0.0000e+00, -2.7016e-05, 2.7940e-09], [ 7.0874e-07, 0.0000e+00, 8.7798e-05, ..., 0.0000e+00, 2.2233e-05, 0.0000e+00]], device='cuda:0') Epoch 148, bias, value: tensor([-0.0034, 0.0245, 0.0062, 0.0007, 0.0101, 0.0008, 0.0203, -0.0139, 0.0123, -0.0022], device='cuda:0'), grad: tensor([-5.8651e-04, -1.1176e-08, 3.8773e-05, 1.9059e-05, 5.6326e-06, 3.4153e-05, 2.1720e-04, -1.6810e-06, -2.0349e-04, 4.7636e-04], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 147---------------------------------------------------- epoch 147, time 218.16, cls_loss 0.0030 cls_loss_mapping 0.0047 cls_loss_causal 0.5137 re_mapping 0.0077 re_causal 0.0221 /// teacc 98.94 lr 0.00010000 Epoch 149, weight, value: tensor([[-0.0451, -0.0729, -0.0466, ..., -0.0920, -0.0053, -0.0216], [-0.0416, 0.0270, -0.1280, ..., 0.0502, 0.0595, -0.1204], [ 0.0555, 0.0004, -0.0523, ..., -0.0478, -0.0306, 0.0174], ..., [ 0.0492, -0.0463, 0.0007, ..., 0.0228, -0.0320, 0.0277], [-0.0368, -0.0287, -0.0642, ..., -0.0141, -0.0265, -0.0599], [-0.1132, -0.1739, 0.0333, ..., -0.0525, 0.0244, 0.0364]], device='cuda:0'), grad: tensor([[ 4.1444e-08, 0.0000e+00, 1.2247e-07, ..., 0.0000e+00, 4.8950e-06, 2.1607e-07], [-6.0573e-06, 0.0000e+00, 1.4761e-07, ..., 0.0000e+00, -7.9051e-06, 1.8626e-09], [ 1.8906e-07, 0.0000e+00, 3.5390e-08, ..., 0.0000e+00, 2.4866e-07, 0.0000e+00], ..., [ 5.6587e-06, 0.0000e+00, 2.9802e-07, ..., 0.0000e+00, 7.3090e-06, 4.6566e-10], [ 2.9337e-07, 0.0000e+00, 5.3644e-07, ..., 0.0000e+00, 1.0747e-06, 3.2596e-08], [ 1.3970e-07, 0.0000e+00, 1.0170e-06, ..., 0.0000e+00, -2.0061e-06, 4.6566e-10]], device='cuda:0') Epoch 149, bias, value: tensor([-0.0035, 0.0253, 0.0051, -0.0010, 0.0101, 0.0020, 0.0202, -0.0137, 0.0128, -0.0027], device='cuda:0'), grad: tensor([ 2.5809e-05, -2.0534e-05, 8.4052e-07, 1.2070e-06, 2.6580e-06, 3.0659e-06, -3.2723e-05, 2.0325e-05, 5.8934e-06, -6.6124e-06], device='cuda:0') 100 0.0001 changing lr epoch 148, time 217.82, cls_loss 0.0030 cls_loss_mapping 0.0062 cls_loss_causal 0.5130 re_mapping 0.0080 re_causal 0.0223 /// teacc 98.84 lr 0.00010000 Epoch 150, weight, value: tensor([[-0.0456, -0.0736, -0.0468, ..., -0.0921, -0.0058, -0.0216], [-0.0415, 0.0272, -0.1290, ..., 0.0505, 0.0600, -0.1207], [ 0.0559, 0.0014, -0.0526, ..., -0.0492, -0.0309, 0.0180], ..., [ 0.0490, -0.0470, 0.0006, ..., 0.0228, -0.0323, 0.0276], [-0.0379, -0.0287, -0.0651, ..., -0.0141, -0.0267, -0.0600], [-0.1134, -0.1741, 0.0333, ..., -0.0525, 0.0251, 0.0365]], device='cuda:0'), grad: tensor([[ 9.3598e-08, 0.0000e+00, 2.8405e-08, ..., 0.0000e+00, 1.2387e-07, 4.6566e-10], [ 1.7229e-08, 0.0000e+00, 6.5658e-08, ..., 0.0000e+00, -2.5313e-06, 4.6566e-10], [ 1.1213e-05, 0.0000e+00, 1.3392e-06, ..., 0.0000e+00, 2.0443e-07, -2.3283e-09], ..., [-1.2457e-05, 0.0000e+00, -1.4557e-06, ..., 0.0000e+00, 1.4324e-06, 4.6566e-10], [ 7.2131e-07, 0.0000e+00, 1.0477e-07, ..., 0.0000e+00, 2.4401e-07, 9.3132e-10], [ 2.0862e-07, 0.0000e+00, 3.4040e-07, ..., 0.0000e+00, 8.8476e-08, 0.0000e+00]], device='cuda:0') Epoch 150, bias, value: tensor([-0.0036, 0.0254, 0.0049, -0.0022, 0.0102, 0.0039, 0.0198, -0.0137, 0.0123, -0.0026], device='cuda:0'), grad: tensor([ 5.7230e-07, -7.2159e-06, 1.9610e-05, -1.2107e-07, 7.1526e-07, 9.0711e-07, -3.8138e-07, -1.6734e-05, 1.9874e-06, 6.5379e-07], device='cuda:0') 100 0.0001 changing lr epoch 149, time 217.74, cls_loss 0.0027 cls_loss_mapping 0.0036 cls_loss_causal 0.5477 re_mapping 0.0077 re_causal 0.0229 /// teacc 98.92 lr 0.00010000 Epoch 151, weight, value: tensor([[-0.0455, -0.0740, -0.0453, ..., -0.0921, -0.0061, -0.0216], [-0.0412, 0.0272, -0.1294, ..., 0.0505, 0.0605, -0.1209], [ 0.0557, 0.0012, -0.0527, ..., -0.0492, -0.0311, 0.0180], ..., [ 0.0488, -0.0470, 0.0019, ..., 0.0228, -0.0343, 0.0275], [-0.0375, -0.0283, -0.0651, ..., -0.0141, -0.0265, -0.0600], [-0.1138, -0.1742, 0.0332, ..., -0.0525, 0.0268, 0.0365]], device='cuda:0'), grad: tensor([[ 1.5320e-07, 0.0000e+00, -1.2154e-07, ..., 0.0000e+00, 8.1956e-08, 0.0000e+00], [ 6.7987e-08, 0.0000e+00, 1.1390e-06, ..., 0.0000e+00, 8.0233e-07, 0.0000e+00], [-1.0170e-06, 0.0000e+00, -4.3772e-08, ..., 0.0000e+00, 1.0058e-07, -9.3132e-10], ..., [ 4.6147e-07, 0.0000e+00, 3.6974e-07, ..., 0.0000e+00, 4.5588e-07, 0.0000e+00], [ 1.5134e-07, 0.0000e+00, 5.9884e-07, ..., 0.0000e+00, 5.1549e-07, 0.0000e+00], [ 3.4226e-07, 0.0000e+00, -3.8072e-06, ..., 0.0000e+00, -5.7556e-06, 0.0000e+00]], device='cuda:0') Epoch 151, bias, value: tensor([-0.0030, 0.0257, 0.0046, -0.0019, 0.0100, 0.0037, 0.0193, -0.0145, 0.0127, -0.0022], device='cuda:0'), grad: tensor([-5.2117e-06, 9.8422e-06, -8.2934e-07, 2.6803e-06, 2.2963e-05, 4.6901e-06, 6.5845e-07, 3.6135e-06, 2.5406e-06, -4.0919e-05], device='cuda:0') 100 0.0001 changing lr epoch 150, time 217.61, cls_loss 0.0034 cls_loss_mapping 0.0058 cls_loss_causal 0.5523 re_mapping 0.0072 re_causal 0.0217 /// teacc 98.87 lr 0.00010000 Epoch 152, weight, value: tensor([[-0.0445, -0.0740, -0.0457, ..., -0.0921, -0.0081, -0.0241], [-0.0425, 0.0272, -0.1311, ..., 0.0505, 0.0605, -0.1214], [ 0.0560, 0.0012, -0.0515, ..., -0.0492, -0.0312, 0.0180], ..., [ 0.0498, -0.0470, 0.0016, ..., 0.0228, -0.0335, 0.0272], [-0.0373, -0.0284, -0.0660, ..., -0.0141, -0.0267, -0.0603], [-0.1154, -0.1743, 0.0332, ..., -0.0525, 0.0274, 0.0390]], device='cuda:0'), grad: tensor([[ 5.5321e-07, 0.0000e+00, 1.2293e-07, ..., 0.0000e+00, 8.2888e-08, 1.8626e-09], [ 1.1642e-06, 0.0000e+00, 1.1735e-07, ..., 0.0000e+00, -6.0163e-07, 1.8626e-09], [ 1.7092e-05, 0.0000e+00, 4.2506e-06, ..., 0.0000e+00, 1.6298e-07, -8.3819e-09], ..., [ 5.6289e-06, 0.0000e+00, 1.4324e-06, ..., 0.0000e+00, 1.8068e-07, 9.3132e-10], [ 4.2468e-06, 0.0000e+00, 6.9663e-06, ..., 0.0000e+00, 2.7977e-06, 0.0000e+00], [ 1.0999e-06, 0.0000e+00, 8.0373e-07, ..., 0.0000e+00, -3.2037e-07, 0.0000e+00]], device='cuda:0') Epoch 152, bias, value: tensor([-0.0042, 0.0249, 0.0047, -0.0017, 0.0101, 0.0030, 0.0206, -0.0134, 0.0126, -0.0022], device='cuda:0'), grad: tensor([-5.5023e-06, 2.7940e-09, 1.6540e-05, -2.8536e-05, -6.6217e-07, -3.9376e-06, -3.1441e-05, 3.6415e-06, 4.4644e-05, 5.1484e-06], device='cuda:0') 100 0.0001 changing lr epoch 151, time 217.86, cls_loss 0.0033 cls_loss_mapping 0.0058 cls_loss_causal 0.5633 re_mapping 0.0074 re_causal 0.0217 /// teacc 98.83 lr 0.00010000 Epoch 153, weight, value: tensor([[-0.0442, -0.0743, -0.0464, ..., -0.0921, -0.0063, -0.0247], [-0.0426, 0.0273, -0.1324, ..., 0.0505, 0.0607, -0.1218], [ 0.0571, 0.0012, -0.0523, ..., -0.0493, -0.0312, 0.0172], ..., [ 0.0493, -0.0470, 0.0018, ..., 0.0228, -0.0339, 0.0266], [-0.0379, -0.0284, -0.0657, ..., -0.0142, -0.0270, -0.0605], [-0.1168, -0.1744, 0.0330, ..., -0.0525, 0.0267, 0.0395]], device='cuda:0'), grad: tensor([[-8.3819e-09, 0.0000e+00, 4.0047e-08, ..., 0.0000e+00, 8.4750e-08, 2.1420e-08], [ 2.3209e-06, 0.0000e+00, 1.2601e-06, ..., 1.8626e-09, -1.3690e-07, 2.1420e-08], [-1.2904e-05, 0.0000e+00, 3.6322e-07, ..., 1.8626e-09, 1.9278e-07, -2.5146e-08], ..., [-4.9453e-07, 0.0000e+00, 7.5437e-07, ..., 1.8626e-09, 7.5996e-07, 1.8813e-07], [-4.2841e-08, 0.0000e+00, 6.7875e-06, ..., 1.8626e-09, 1.3355e-06, 1.1083e-07], [ 3.1386e-07, 0.0000e+00, -4.7591e-07, ..., 9.3132e-10, -4.3288e-06, -7.0035e-07]], device='cuda:0') Epoch 153, bias, value: tensor([-0.0027, 0.0247, 0.0057, -0.0012, 0.0097, 0.0036, 0.0201, -0.0138, 0.0126, -0.0031], device='cuda:0'), grad: tensor([-2.0005e-06, 9.4622e-06, -1.2368e-05, 1.4052e-05, -5.3942e-06, -1.1943e-05, 3.2075e-06, 4.1127e-06, 2.1353e-05, -2.0593e-05], device='cuda:0') 100 0.0001 changing lr epoch 152, time 218.05, cls_loss 0.0026 cls_loss_mapping 0.0047 cls_loss_causal 0.5490 re_mapping 0.0073 re_causal 0.0216 /// teacc 98.87 lr 0.00010000 Epoch 154, weight, value: tensor([[-0.0443, -0.0746, -0.0463, ..., -0.0921, -0.0066, -0.0247], [-0.0428, 0.0278, -0.1330, ..., 0.0505, 0.0608, -0.1222], [ 0.0596, 0.0007, -0.0517, ..., -0.0493, -0.0312, 0.0171], ..., [ 0.0474, -0.0472, 0.0017, ..., 0.0229, -0.0339, 0.0264], [-0.0384, -0.0285, -0.0663, ..., -0.0142, -0.0270, -0.0610], [-0.1173, -0.1745, 0.0331, ..., -0.0526, 0.0269, 0.0395]], device='cuda:0'), grad: tensor([[ 3.0547e-07, 0.0000e+00, 1.3504e-07, ..., 9.3132e-10, 5.8450e-06, 1.5926e-06], [-4.0978e-08, 0.0000e+00, 4.6566e-09, ..., 5.5879e-09, -1.5795e-06, 2.6077e-08], [-7.4506e-09, 0.0000e+00, 5.5879e-09, ..., 2.7940e-09, 9.6764e-07, 5.3085e-08], ..., [-2.9802e-07, 0.0000e+00, 0.0000e+00, ..., -1.3039e-08, 3.2224e-07, 1.8626e-09], [ 4.6473e-07, 0.0000e+00, 5.5879e-09, ..., 0.0000e+00, 3.2224e-07, 5.0291e-08], [ 2.9523e-07, 0.0000e+00, 1.8626e-09, ..., 1.8626e-09, 4.9360e-08, 1.3970e-08]], device='cuda:0') Epoch 154, bias, value: tensor([-0.0023, 0.0246, 0.0080, -0.0016, 0.0094, 0.0042, 0.0197, -0.0155, 0.0126, -0.0033], device='cuda:0'), grad: tensor([ 3.6716e-05, -3.2857e-06, 1.7313e-06, -2.9318e-06, 3.0212e-06, 4.8950e-06, -4.3869e-05, -9.3132e-08, 2.6077e-06, 1.1381e-06], device='cuda:0') 100 0.0001 changing lr epoch 153, time 217.75, cls_loss 0.0029 cls_loss_mapping 0.0044 cls_loss_causal 0.5572 re_mapping 0.0070 re_causal 0.0218 /// teacc 98.85 lr 0.00010000 Epoch 155, weight, value: tensor([[-0.0440, -0.0747, -0.0463, ..., -0.0923, -0.0064, -0.0247], [-0.0425, 0.0278, -0.1318, ..., 0.0505, 0.0614, -0.1225], [ 0.0598, 0.0006, -0.0519, ..., -0.0494, -0.0320, 0.0171], ..., [ 0.0472, -0.0472, 0.0020, ..., 0.0238, -0.0336, 0.0259], [-0.0389, -0.0284, -0.0666, ..., -0.0142, -0.0272, -0.0611], [-0.1190, -0.1746, 0.0330, ..., -0.0527, 0.0269, 0.0397]], device='cuda:0'), grad: tensor([[ 8.6240e-07, 0.0000e+00, 3.3528e-08, ..., 0.0000e+00, 9.9931e-07, 0.0000e+00], [ 4.8988e-07, 1.8626e-09, 1.8440e-07, ..., 0.0000e+00, -1.5181e-07, 2.7940e-09], [ 4.7684e-05, -3.7253e-09, 4.3772e-08, ..., 0.0000e+00, 1.3970e-08, 9.3132e-10], ..., [-4.5076e-06, 1.8626e-09, 5.5321e-07, ..., 0.0000e+00, 7.5437e-08, 8.3819e-09], [-4.6343e-05, 0.0000e+00, 2.0768e-07, ..., 0.0000e+00, 1.2200e-07, 0.0000e+00], [ 2.4773e-07, 0.0000e+00, -1.4901e-07, ..., 0.0000e+00, -5.4482e-07, 2.7940e-09]], device='cuda:0') Epoch 155, bias, value: tensor([-0.0017, 0.0251, 0.0078, -0.0010, 0.0092, 0.0033, 0.0202, -0.0153, 0.0124, -0.0038], device='cuda:0'), grad: tensor([ 6.0201e-06, 1.1977e-06, 1.4019e-04, 4.9621e-06, 3.3155e-07, 8.0187e-07, -4.4703e-06, -3.9376e-06, -1.4472e-04, -4.8988e-07], device='cuda:0') 100 0.0001 changing lr epoch 154, time 217.72, cls_loss 0.0034 cls_loss_mapping 0.0058 cls_loss_causal 0.5530 re_mapping 0.0069 re_causal 0.0213 /// teacc 98.91 lr 0.00010000 Epoch 156, weight, value: tensor([[-0.0441, -0.0755, -0.0469, ..., -0.0925, -0.0068, -0.0247], [-0.0421, 0.0279, -0.1352, ..., 0.0510, 0.0634, -0.1232], [ 0.0586, -0.0022, -0.0530, ..., -0.0507, -0.0325, 0.0169], ..., [ 0.0483, -0.0449, 0.0015, ..., 0.0237, -0.0354, 0.0235], [-0.0393, -0.0283, -0.0660, ..., -0.0143, -0.0255, -0.0614], [-0.1201, -0.1748, 0.0330, ..., -0.0529, 0.0270, 0.0398]], device='cuda:0'), grad: tensor([[ 6.5472e-07, 1.8626e-09, 3.2876e-07, ..., 0.0000e+00, -2.4699e-06, 9.3132e-10], [ 1.1828e-07, 5.5879e-09, 9.9652e-08, ..., 0.0000e+00, -1.4529e-07, 0.0000e+00], [-3.1412e-05, 3.9116e-08, -2.3603e-05, ..., 0.0000e+00, 2.4866e-07, -2.7940e-09], ..., [ 1.5367e-07, 1.7695e-08, 1.7602e-07, ..., 0.0000e+00, 2.9337e-07, 9.3132e-10], [ 2.7269e-05, -1.5646e-07, 2.1785e-05, ..., 0.0000e+00, 5.4725e-06, 9.3132e-10], [ 4.0978e-07, 9.3132e-10, -8.5961e-07, ..., 0.0000e+00, -4.4852e-06, 0.0000e+00]], device='cuda:0') Epoch 156, bias, value: tensor([-0.0023, 0.0256, 0.0065, -0.0002, 0.0093, 0.0019, 0.0202, -0.0149, 0.0141, -0.0037], device='cuda:0'), grad: tensor([-1.4089e-05, 4.7125e-07, -9.9063e-05, 6.6571e-06, 5.4203e-06, 2.8536e-06, -9.4809e-07, 1.4780e-06, 1.1295e-04, -1.5602e-05], device='cuda:0') 100 0.0001 changing lr epoch 155, time 217.80, cls_loss 0.0030 cls_loss_mapping 0.0058 cls_loss_causal 0.5448 re_mapping 0.0072 re_causal 0.0215 /// teacc 98.93 lr 0.00010000 Epoch 157, weight, value: tensor([[-0.0442, -0.0758, -0.0471, ..., -0.0928, -0.0067, -0.0247], [-0.0446, 0.0279, -0.1359, ..., 0.0509, 0.0622, -0.1236], [ 0.0598, -0.0022, -0.0530, ..., -0.0508, -0.0304, 0.0169], ..., [ 0.0487, -0.0449, 0.0009, ..., 0.0237, -0.0353, 0.0232], [-0.0398, -0.0279, -0.0662, ..., -0.0144, -0.0263, -0.0615], [-0.1212, -0.1750, 0.0325, ..., -0.0528, 0.0269, 0.0398]], device='cuda:0'), grad: tensor([[-8.3353e-07, 0.0000e+00, -2.5611e-07, ..., 0.0000e+00, 2.4959e-07, 0.0000e+00], [-2.0601e-06, 0.0000e+00, 1.2815e-06, ..., 0.0000e+00, -7.8827e-06, 0.0000e+00], [ 2.9802e-06, 0.0000e+00, 1.8394e-06, ..., 0.0000e+00, 4.3586e-06, 0.0000e+00], ..., [ 8.6613e-07, 0.0000e+00, 9.4064e-08, ..., 0.0000e+00, 4.1686e-06, 0.0000e+00], [-4.0233e-06, 0.0000e+00, -5.5581e-06, ..., 0.0000e+00, -3.3081e-06, 0.0000e+00], [ 4.6287e-07, 0.0000e+00, 1.1818e-06, ..., 0.0000e+00, 1.7416e-07, 0.0000e+00]], device='cuda:0') Epoch 157, bias, value: tensor([-0.0022, 0.0239, 0.0079, -0.0004, 0.0096, 0.0022, 0.0199, -0.0145, 0.0135, -0.0039], device='cuda:0'), grad: tensor([-8.4639e-06, -2.0847e-05, 3.1441e-05, 1.3344e-05, 1.7360e-06, 1.5423e-05, 2.3842e-06, 1.7628e-05, -6.0380e-05, 7.6443e-06], device='cuda:0') 100 0.0001 changing lr epoch 156, time 217.43, cls_loss 0.0032 cls_loss_mapping 0.0045 cls_loss_causal 0.5750 re_mapping 0.0072 re_causal 0.0213 /// teacc 98.92 lr 0.00010000 Epoch 158, weight, value: tensor([[-0.0445, -0.0761, -0.0478, ..., -0.0931, -0.0068, -0.0247], [-0.0424, 0.0283, -0.1368, ..., 0.0509, 0.0646, -0.1236], [ 0.0591, -0.0025, -0.0529, ..., -0.0508, -0.0307, 0.0169], ..., [ 0.0478, -0.0450, 0.0009, ..., 0.0237, -0.0373, 0.0232], [-0.0399, -0.0279, -0.0667, ..., -0.0145, -0.0266, -0.0615], [-0.1223, -0.1751, 0.0321, ..., -0.0528, 0.0263, 0.0398]], device='cuda:0'), grad: tensor([[ 5.9605e-08, 8.3819e-09, 4.1444e-07, ..., 0.0000e+00, 4.8429e-08, 0.0000e+00], [-9.1735e-07, -1.6214e-06, 1.2435e-05, ..., 0.0000e+00, -2.4028e-07, 0.0000e+00], [ 1.1344e-06, 1.1222e-06, 2.9057e-07, ..., 0.0000e+00, 9.4995e-08, 0.0000e+00], ..., [-9.2294e-07, 4.2841e-08, 8.0187e-07, ..., 0.0000e+00, 1.9837e-07, 0.0000e+00], [-4.9360e-08, 7.9162e-08, -1.8626e-08, ..., 0.0000e+00, -8.0280e-07, 0.0000e+00], [ 3.3807e-07, 1.8626e-09, 2.1353e-05, ..., 0.0000e+00, 5.6997e-07, 0.0000e+00]], device='cuda:0') Epoch 158, bias, value: tensor([-0.0020, 0.0261, 0.0071, -0.0005, 0.0094, 0.0025, 0.0201, -0.0156, 0.0134, -0.0044], device='cuda:0'), grad: tensor([-8.5160e-06, 2.6196e-05, 7.8380e-06, 6.6683e-07, -1.3053e-04, 6.0834e-06, 3.9905e-05, 1.7984e-06, -4.0755e-06, 6.0737e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 157---------------------------------------------------- epoch 157, time 218.21, cls_loss 0.0034 cls_loss_mapping 0.0053 cls_loss_causal 0.5586 re_mapping 0.0067 re_causal 0.0209 /// teacc 98.96 lr 0.00010000 Epoch 159, weight, value: tensor([[-0.0452, -0.0764, -0.0487, ..., -0.0932, -0.0071, -0.0247], [-0.0435, 0.0286, -0.1380, ..., 0.0509, 0.0646, -0.1237], [ 0.0579, -0.0022, -0.0525, ..., -0.0509, -0.0309, 0.0169], ..., [ 0.0498, -0.0454, 0.0009, ..., 0.0237, -0.0373, 0.0232], [-0.0412, -0.0280, -0.0670, ..., -0.0145, -0.0266, -0.0615], [-0.1213, -0.1752, 0.0311, ..., -0.0528, 0.0275, 0.0398]], device='cuda:0'), grad: tensor([[ 1.1735e-07, 0.0000e+00, 8.8476e-08, ..., 0.0000e+00, 2.1327e-07, 0.0000e+00], [ 3.7253e-08, 0.0000e+00, 1.9185e-07, ..., 0.0000e+00, -2.2098e-05, 0.0000e+00], [-7.9162e-08, 0.0000e+00, 3.6322e-08, ..., 0.0000e+00, 1.7295e-06, 0.0000e+00], ..., [-5.5879e-08, 0.0000e+00, 4.5169e-07, ..., 0.0000e+00, 1.1539e-06, 0.0000e+00], [ 3.6508e-06, 0.0000e+00, 1.2517e-05, ..., 0.0000e+00, 1.9237e-05, 0.0000e+00], [ 2.6077e-08, 0.0000e+00, -2.4773e-07, ..., 0.0000e+00, -1.0366e-06, 0.0000e+00]], device='cuda:0') Epoch 159, bias, value: tensor([-0.0024, 0.0252, 0.0058, -0.0021, 0.0101, 0.0026, 0.0212, -0.0139, 0.0133, -0.0044], device='cuda:0'), grad: tensor([ 6.1747e-07, -6.8307e-05, 4.9807e-06, 7.4580e-06, 1.5581e-06, -4.1008e-05, 2.7660e-07, 8.9258e-06, 9.3997e-05, -8.7172e-06], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 158---------------------------------------------------- epoch 158, time 218.38, cls_loss 0.0036 cls_loss_mapping 0.0043 cls_loss_causal 0.5536 re_mapping 0.0071 re_causal 0.0211 /// teacc 98.97 lr 0.00010000 Epoch 160, weight, value: tensor([[-0.0455, -0.0767, -0.0508, ..., -0.0932, -0.0089, -0.0255], [-0.0436, 0.0289, -0.1382, ..., 0.0509, 0.0648, -0.1238], [ 0.0579, -0.0023, -0.0526, ..., -0.0509, -0.0311, 0.0169], ..., [ 0.0478, -0.0455, 0.0009, ..., 0.0237, -0.0372, 0.0231], [-0.0415, -0.0277, -0.0684, ..., -0.0145, -0.0270, -0.0615], [-0.1229, -0.1755, 0.0311, ..., -0.0528, 0.0283, 0.0406]], device='cuda:0'), grad: tensor([[ 5.8077e-06, 0.0000e+00, 2.2762e-06, ..., 0.0000e+00, 8.5216e-07, 0.0000e+00], [ 2.7582e-05, 0.0000e+00, 9.1456e-07, ..., 0.0000e+00, 6.4448e-07, 0.0000e+00], [-9.2015e-06, 0.0000e+00, 3.0734e-08, ..., 0.0000e+00, -2.0396e-07, 0.0000e+00], ..., [-5.7548e-05, 0.0000e+00, 3.1292e-06, ..., 0.0000e+00, 1.1148e-06, 0.0000e+00], [ 6.4597e-06, 0.0000e+00, 2.8014e-06, ..., 0.0000e+00, 3.8520e-06, 0.0000e+00], [ 2.0847e-05, 0.0000e+00, 2.8461e-06, ..., 0.0000e+00, -1.0237e-05, 0.0000e+00]], device='cuda:0') Epoch 160, bias, value: tensor([-0.0036, 0.0253, 0.0057, -0.0008, 0.0101, 0.0041, 0.0214, -0.0156, 0.0128, -0.0043], device='cuda:0'), grad: tensor([-1.9324e-04, 5.5164e-05, 2.5973e-05, 8.0407e-05, 2.9713e-05, 3.0939e-06, 4.7356e-05, -1.2052e-04, 5.6177e-05, 1.5780e-05], device='cuda:0') 100 0.0001 changing lr epoch 159, time 217.57, cls_loss 0.0030 cls_loss_mapping 0.0042 cls_loss_causal 0.5736 re_mapping 0.0069 re_causal 0.0218 /// teacc 98.88 lr 0.00010000 Epoch 161, weight, value: tensor([[-4.5857e-02, -7.6750e-02, -5.1790e-02, ..., -9.3160e-02, -9.3876e-03, -2.5657e-02], [-4.3888e-02, 2.8944e-02, -1.4007e-01, ..., 5.0875e-02, 6.4678e-02, -1.2382e-01], [ 5.7464e-02, -2.3744e-03, -5.2539e-02, ..., -5.0855e-02, -3.1399e-02, 1.6877e-02], ..., [ 4.8401e-02, -4.5495e-02, 1.2720e-04, ..., 2.3744e-02, -3.6993e-02, 2.3039e-02], [-4.1970e-02, -2.7749e-02, -7.1137e-02, ..., -1.4514e-02, -2.7987e-02, -6.1501e-02], [-1.2395e-01, -1.7548e-01, 3.0484e-02, ..., -5.2835e-02, 2.8416e-02, 4.0770e-02]], device='cuda:0'), grad: tensor([[ 1.0151e-06, 7.4506e-09, 9.3132e-08, ..., 0.0000e+00, 2.9393e-06, 0.0000e+00], [-4.5672e-06, -5.4762e-06, 3.1833e-06, ..., 0.0000e+00, -2.2188e-05, 0.0000e+00], [ 2.8685e-06, 4.0717e-06, 1.9930e-07, ..., 0.0000e+00, 7.0706e-06, 0.0000e+00], ..., [ 4.5337e-06, 1.1846e-06, 2.2110e-06, ..., 0.0000e+00, 1.1772e-05, 0.0000e+00], [ 8.3074e-06, 8.3819e-09, 8.1770e-07, ..., 0.0000e+00, 6.1467e-07, 0.0000e+00], [-1.1269e-06, 2.7940e-09, 4.8503e-06, ..., 0.0000e+00, -1.8068e-06, 0.0000e+00]], device='cuda:0') Epoch 161, bias, value: tensor([-0.0040, 0.0249, 0.0052, -0.0010, 0.0104, 0.0048, 0.0224, -0.0150, 0.0115, -0.0046], device='cuda:0'), grad: tensor([ 2.6301e-06, -4.5180e-05, 2.3752e-05, -2.2769e-05, -3.8683e-05, 3.5968e-06, 1.8431e-06, 4.0680e-05, 2.0668e-05, 1.3426e-05], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 160---------------------------------------------------- epoch 160, time 218.11, cls_loss 0.0026 cls_loss_mapping 0.0038 cls_loss_causal 0.5599 re_mapping 0.0070 re_causal 0.0206 /// teacc 98.99 lr 0.00010000 Epoch 162, weight, value: tensor([[-4.5117e-02, -7.6832e-02, -5.1337e-02, ..., -9.3163e-02, -9.3608e-03, -2.5657e-02], [-4.4441e-02, 2.8980e-02, -1.4044e-01, ..., 5.0872e-02, 6.4983e-02, -1.2392e-01], [ 5.7667e-02, -2.3949e-03, -5.3077e-02, ..., -5.0859e-02, -3.1665e-02, 1.6904e-02], ..., [ 4.8568e-02, -4.5502e-02, 1.5208e-04, ..., 2.3794e-02, -3.7037e-02, 2.3019e-02], [-4.2857e-02, -2.7810e-02, -7.1684e-02, ..., -1.4516e-02, -2.8775e-02, -6.1504e-02], [-1.2449e-01, -1.7553e-01, 3.0229e-02, ..., -5.2838e-02, 2.8715e-02, 4.0770e-02]], device='cuda:0'), grad: tensor([[ 1.3709e-06, 0.0000e+00, 7.4506e-09, ..., 0.0000e+00, 6.4168e-07, 0.0000e+00], [ 5.2974e-06, 0.0000e+00, 2.9802e-08, ..., 0.0000e+00, 5.8766e-07, 0.0000e+00], [ 6.5845e-07, 0.0000e+00, 1.3039e-08, ..., 0.0000e+00, 9.8813e-07, 0.0000e+00], ..., [-2.5891e-07, 0.0000e+00, 2.6077e-08, ..., 0.0000e+00, 1.1558e-06, 0.0000e+00], [ 3.4049e-06, 0.0000e+00, 2.4214e-08, ..., 0.0000e+00, -4.3493e-07, 0.0000e+00], [ 9.9838e-06, 0.0000e+00, 2.9802e-08, ..., 0.0000e+00, 3.1516e-06, 0.0000e+00]], device='cuda:0') Epoch 162, bias, value: tensor([-0.0034, 0.0246, 0.0054, -0.0009, 0.0102, 0.0047, 0.0226, -0.0148, 0.0106, -0.0046], device='cuda:0'), grad: tensor([ 4.8019e-06, 1.5199e-05, 5.3905e-06, -7.6711e-05, 5.9754e-06, 1.5259e-05, -6.6385e-06, 1.4622e-07, -1.3849e-06, 3.7879e-05], device='cuda:0') 100 0.0001 changing lr epoch 161, time 217.40, cls_loss 0.0030 cls_loss_mapping 0.0047 cls_loss_causal 0.5517 re_mapping 0.0069 re_causal 0.0209 /// teacc 98.89 lr 0.00010000 Epoch 163, weight, value: tensor([[-4.5946e-02, -7.7002e-02, -5.2057e-02, ..., -9.3185e-02, -9.5410e-03, -2.5657e-02], [-4.3536e-02, 2.9060e-02, -1.4104e-01, ..., 5.0846e-02, 6.6817e-02, -1.2396e-01], [ 5.7733e-02, -2.4102e-03, -5.4186e-02, ..., -5.0873e-02, -3.1812e-02, 1.6908e-02], ..., [ 4.8195e-02, -4.5548e-02, 1.2527e-04, ..., 2.3880e-02, -3.9076e-02, 2.3016e-02], [-4.4463e-02, -2.7671e-02, -7.3438e-02, ..., -1.4531e-02, -2.8740e-02, -6.1509e-02], [-1.2521e-01, -1.7565e-01, 3.0465e-02, ..., -5.2845e-02, 2.8975e-02, 4.0770e-02]], device='cuda:0'), grad: tensor([[ 1.9837e-07, 0.0000e+00, 1.5832e-08, ..., 0.0000e+00, 1.0896e-07, 0.0000e+00], [ 5.7463e-07, 0.0000e+00, 1.0245e-08, ..., 0.0000e+00, -5.2303e-06, 0.0000e+00], [-2.1923e-06, 0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 1.2573e-07, 0.0000e+00], ..., [ 7.8231e-08, 0.0000e+00, 6.5193e-09, ..., 0.0000e+00, 4.8149e-07, 0.0000e+00], [ 5.2713e-07, 4.6566e-09, 1.9185e-07, ..., 0.0000e+00, 4.2878e-06, 0.0000e+00], [ 6.7987e-08, 0.0000e+00, 1.2387e-07, ..., 0.0000e+00, 1.3132e-07, 0.0000e+00]], device='cuda:0') Epoch 163, bias, value: tensor([-0.0037, 0.0260, 0.0052, -0.0010, 0.0106, 0.0054, 0.0222, -0.0158, 0.0100, -0.0045], device='cuda:0'), grad: tensor([ 2.0303e-07, -1.0371e-05, -2.8480e-06, 1.0291e-06, 9.9652e-08, 4.3027e-07, -3.8184e-07, 1.2992e-06, 1.0043e-05, 4.8894e-07], device='cuda:0') 100 0.0001 changing lr epoch 162, time 217.49, cls_loss 0.0033 cls_loss_mapping 0.0041 cls_loss_causal 0.5250 re_mapping 0.0072 re_causal 0.0202 /// teacc 98.97 lr 0.00010000 Epoch 164, weight, value: tensor([[-4.6336e-02, -7.7186e-02, -5.3196e-02, ..., -9.3215e-02, -1.2330e-02, -2.8327e-02], [-4.4323e-02, 2.9254e-02, -1.4132e-01, ..., 5.0783e-02, 6.7174e-02, -1.2457e-01], [ 5.7638e-02, -2.2848e-03, -5.4184e-02, ..., -5.0889e-02, -3.2044e-02, 1.6707e-02], ..., [ 4.8958e-02, -4.5794e-02, -6.4190e-05, ..., 2.3987e-02, -3.9013e-02, 2.2711e-02], [-4.6292e-02, -2.7411e-02, -7.3815e-02, ..., -1.4554e-02, -2.9622e-02, -6.1524e-02], [-1.2614e-01, -1.7583e-01, 3.0799e-02, ..., -5.2888e-02, 3.1135e-02, 4.3435e-02]], device='cuda:0'), grad: tensor([[ 2.2016e-06, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 1.4901e-08, 0.0000e+00], [ 6.7592e-05, 5.5879e-09, 1.4901e-08, ..., 0.0000e+00, -4.9639e-07, 0.0000e+00], [-2.7880e-05, 0.0000e+00, 9.3132e-10, ..., 0.0000e+00, 1.5832e-08, 0.0000e+00], ..., [-5.8919e-05, -8.3819e-09, 2.7008e-08, ..., 0.0000e+00, 1.5460e-07, 0.0000e+00], [ 4.2021e-06, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 2.2911e-07, 0.0000e+00], [ 1.5069e-06, 0.0000e+00, 2.4214e-08, ..., 0.0000e+00, -2.3935e-07, 0.0000e+00]], device='cuda:0') Epoch 164, bias, value: tensor([-0.0054, 0.0258, 0.0051, -0.0011, 0.0105, 0.0055, 0.0230, -0.0152, 0.0087, -0.0037], device='cuda:0'), grad: tensor([-4.3511e-06, 1.3995e-04, -4.7803e-05, 1.2711e-05, 2.4773e-06, 5.5246e-06, 5.1185e-06, -1.2827e-04, 8.6054e-06, 6.0871e-06], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 163---------------------------------------------------- epoch 163, time 218.26, cls_loss 0.0022 cls_loss_mapping 0.0037 cls_loss_causal 0.5264 re_mapping 0.0069 re_causal 0.0208 /// teacc 99.05 lr 0.00010000 Epoch 165, weight, value: tensor([[-0.0463, -0.0772, -0.0532, ..., -0.0934, -0.0123, -0.0283], [-0.0447, 0.0293, -0.1418, ..., 0.0502, 0.0672, -0.1246], [ 0.0577, -0.0023, -0.0545, ..., -0.0509, -0.0323, 0.0167], ..., [ 0.0492, -0.0458, -0.0002, ..., 0.0250, -0.0388, 0.0227], [-0.0464, -0.0274, -0.0735, ..., -0.0146, -0.0297, -0.0615], [-0.1270, -0.1759, 0.0305, ..., -0.0544, 0.0312, 0.0434]], device='cuda:0'), grad: tensor([[ 5.1502e-07, 0.0000e+00, 5.2620e-07, ..., 8.3819e-09, 2.3376e-07, 0.0000e+00], [-8.2999e-06, 0.0000e+00, 4.5635e-06, ..., 4.0978e-08, -1.5542e-05, 0.0000e+00], [ 4.5151e-05, 0.0000e+00, 1.9968e-04, ..., 1.3970e-08, 8.4657e-07, -2.7940e-09], ..., [ 6.4671e-06, 0.0000e+00, 3.3695e-06, ..., 5.5879e-08, 1.2413e-05, 9.3132e-10], [ 1.2284e-06, 0.0000e+00, 2.8443e-06, ..., 2.7940e-09, 2.0675e-07, 0.0000e+00], [ 1.8030e-06, 0.0000e+00, 7.8790e-07, ..., 8.7544e-08, 1.3337e-06, 0.0000e+00]], device='cuda:0') Epoch 165, bias, value: tensor([-0.0048, 0.0256, 0.0050, -0.0013, 0.0105, 0.0055, 0.0232, -0.0149, 0.0089, -0.0041], device='cuda:0'), grad: tensor([-3.7365e-06, -4.0889e-05, 3.4976e-04, -2.4773e-06, -3.7122e-04, 1.1437e-06, 3.6266e-06, 4.4227e-05, 6.7800e-06, 1.3463e-05], device='cuda:0') 100 0.0001 changing lr epoch 164, time 217.47, cls_loss 0.0029 cls_loss_mapping 0.0033 cls_loss_causal 0.5503 re_mapping 0.0070 re_causal 0.0209 /// teacc 98.99 lr 0.00010000 Epoch 166, weight, value: tensor([[-0.0468, -0.0773, -0.0539, ..., -0.0943, -0.0124, -0.0283], [-0.0449, 0.0296, -0.1427, ..., 0.0519, 0.0680, -0.1246], [ 0.0571, -0.0024, -0.0553, ..., -0.0510, -0.0325, 0.0167], ..., [ 0.0497, -0.0459, -0.0006, ..., 0.0241, -0.0395, 0.0227], [-0.0472, -0.0274, -0.0739, ..., -0.0151, -0.0301, -0.0615], [-0.1277, -0.1759, 0.0296, ..., -0.0546, 0.0314, 0.0434]], device='cuda:0'), grad: tensor([[ 1.6671e-07, 0.0000e+00, 6.5472e-07, ..., 9.5926e-08, 3.2596e-07, 0.0000e+00], [ 1.8766e-06, 0.0000e+00, 1.8179e-06, ..., 3.0175e-07, 3.1665e-08, 0.0000e+00], [-1.8999e-06, 0.0000e+00, -2.9244e-07, ..., 6.5193e-08, 7.9162e-08, 0.0000e+00], ..., [-1.3970e-08, 0.0000e+00, 2.4438e-06, ..., 4.0047e-07, 1.9837e-07, 0.0000e+00], [ 7.9721e-07, 0.0000e+00, 4.8727e-06, ..., 5.8953e-07, 3.4153e-05, 0.0000e+00], [ 6.2063e-06, 0.0000e+00, 5.7429e-05, ..., 5.1595e-06, 1.0207e-06, 0.0000e+00]], device='cuda:0') Epoch 166, bias, value: tensor([-0.0045, 0.0259, 0.0043, -0.0012, 0.0108, 0.0054, 0.0237, -0.0147, 0.0084, -0.0046], device='cuda:0'), grad: tensor([ 3.2187e-06, 1.0543e-05, -3.2801e-06, 1.3709e-06, -1.9014e-04, 4.7708e-04, -6.4754e-04, 3.2224e-06, 1.8990e-04, 1.5438e-04], device='cuda:0') 100 0.0001 changing lr epoch 165, time 217.56, cls_loss 0.0024 cls_loss_mapping 0.0047 cls_loss_causal 0.5648 re_mapping 0.0069 re_causal 0.0207 /// teacc 98.99 lr 0.00010000 Epoch 167, weight, value: tensor([[-0.0462, -0.0773, -0.0544, ..., -0.0945, -0.0121, -0.0283], [-0.0451, 0.0296, -0.1436, ..., 0.0521, 0.0683, -0.1247], [ 0.0571, -0.0024, -0.0551, ..., -0.0510, -0.0327, 0.0166], ..., [ 0.0499, -0.0459, -0.0008, ..., 0.0240, -0.0399, 0.0227], [-0.0475, -0.0274, -0.0741, ..., -0.0153, -0.0304, -0.0615], [-0.1284, -0.1759, 0.0292, ..., -0.0548, 0.0316, 0.0434]], device='cuda:0'), grad: tensor([[ 2.4773e-07, 0.0000e+00, -1.5497e-06, ..., 1.1176e-08, 1.8533e-07, 0.0000e+00], [ 4.2375e-07, 0.0000e+00, 5.3179e-07, ..., 4.0978e-08, 7.4506e-09, 0.0000e+00], [-1.9297e-06, 0.0000e+00, 7.7207e-07, ..., 1.1176e-08, 1.4529e-07, 0.0000e+00], ..., [-3.6228e-07, 0.0000e+00, 5.0385e-07, ..., 1.0058e-07, 1.0068e-06, 0.0000e+00], [ 5.1875e-07, 0.0000e+00, 5.8860e-07, ..., 9.3132e-09, 5.0887e-06, 0.0000e+00], [ 3.9581e-07, 0.0000e+00, 2.1905e-06, ..., 1.5553e-07, -1.2159e-05, 0.0000e+00]], device='cuda:0') Epoch 167, bias, value: tensor([-0.0041, 0.0257, 0.0043, -0.0019, 0.0108, 0.0064, 0.0238, -0.0147, 0.0082, -0.0046], device='cuda:0'), grad: tensor([-1.0706e-05, 2.3842e-06, -3.2131e-07, 2.1532e-06, 6.7279e-06, 6.0461e-06, 4.2543e-06, 2.3358e-06, 2.2218e-05, -3.5197e-05], device='cuda:0') 100 0.0001 changing lr epoch 166, time 217.68, cls_loss 0.0028 cls_loss_mapping 0.0046 cls_loss_causal 0.5522 re_mapping 0.0065 re_causal 0.0200 /// teacc 98.98 lr 0.00010000 Epoch 168, weight, value: tensor([[-0.0465, -0.0773, -0.0549, ..., -0.0960, -0.0125, -0.0283], [-0.0452, 0.0297, -0.1446, ..., 0.0515, 0.0685, -0.1249], [ 0.0573, -0.0025, -0.0549, ..., -0.0512, -0.0331, 0.0166], ..., [ 0.0499, -0.0459, -0.0009, ..., 0.0262, -0.0399, 0.0225], [-0.0479, -0.0275, -0.0744, ..., -0.0158, -0.0305, -0.0615], [-0.1296, -0.1759, 0.0293, ..., -0.0575, 0.0319, 0.0435]], device='cuda:0'), grad: tensor([[ 6.1467e-08, 2.1420e-08, 2.9150e-07, ..., 4.6566e-09, 6.6534e-06, 0.0000e+00], [ 1.5553e-07, 6.7987e-08, 1.4249e-07, ..., 2.9802e-08, -8.5402e-07, 0.0000e+00], [ 9.9093e-07, 6.5193e-09, 3.1199e-07, ..., 1.3970e-08, 5.0850e-06, 0.0000e+00], ..., [ 2.1234e-06, 4.6566e-09, 1.7043e-07, ..., 6.0536e-08, 1.1092e-06, 0.0000e+00], [ 1.1921e-07, 4.6566e-09, -1.3048e-06, ..., 2.7940e-09, -3.4750e-05, 0.0000e+00], [ 1.1483e-06, 4.6566e-09, 1.8943e-06, ..., 2.1327e-07, 3.7979e-06, 0.0000e+00]], device='cuda:0') Epoch 168, bias, value: tensor([-0.0040, 0.0256, 0.0044, -0.0017, 0.0106, 0.0061, 0.0245, -0.0145, 0.0083, -0.0051], device='cuda:0'), grad: tensor([ 2.5034e-05, -6.6496e-07, 2.5898e-05, 1.3337e-05, -4.2319e-06, -1.2740e-06, 6.6221e-05, 3.5781e-06, -1.4889e-04, 2.0996e-05], device='cuda:0') 100 0.0001 changing lr epoch 167, time 217.51, cls_loss 0.0034 cls_loss_mapping 0.0045 cls_loss_causal 0.5187 re_mapping 0.0066 re_causal 0.0194 /// teacc 98.93 lr 0.00010000 Epoch 169, weight, value: tensor([[-0.0469, -0.0774, -0.0553, ..., -0.0961, -0.0143, -0.0287], [-0.0476, 0.0297, -0.1454, ..., 0.0514, 0.0674, -0.1250], [ 0.0574, -0.0024, -0.0551, ..., -0.0512, -0.0335, 0.0165], ..., [ 0.0512, -0.0459, -0.0023, ..., 0.0264, -0.0385, 0.0224], [-0.0487, -0.0275, -0.0748, ..., -0.0159, -0.0305, -0.0615], [-0.1304, -0.1759, 0.0293, ..., -0.0578, 0.0336, 0.0438]], device='cuda:0'), grad: tensor([[-2.4233e-06, 0.0000e+00, 1.2107e-08, ..., 0.0000e+00, -1.5125e-05, 1.1176e-08], [ 3.9786e-06, 3.7253e-09, 1.7695e-08, ..., 0.0000e+00, -1.9893e-06, 2.7940e-09], [-2.8834e-05, 9.3132e-10, -1.7695e-08, ..., 0.0000e+00, 2.2445e-06, -1.0431e-07], ..., [ 3.2317e-07, -8.3819e-09, 2.6077e-08, ..., 0.0000e+00, 1.1111e-06, 2.1420e-08], [ 1.3247e-05, 9.3132e-10, 4.4703e-08, ..., 0.0000e+00, 3.7625e-07, 7.4506e-09], [ 4.9546e-06, 0.0000e+00, 1.7509e-07, ..., 0.0000e+00, 1.2144e-05, 9.3132e-09]], device='cuda:0') Epoch 169, bias, value: tensor([-0.0052, 0.0233, 0.0042, -0.0015, 0.0106, 0.0059, 0.0247, -0.0128, 0.0085, -0.0044], device='cuda:0'), grad: tensor([-1.3173e-04, 6.2864e-07, -4.6223e-05, 1.4842e-05, 3.0678e-06, 1.9725e-06, 8.0094e-06, 3.7253e-06, 3.2663e-05, 1.1313e-04], device='cuda:0') 100 0.0001 changing lr epoch 168, time 217.62, cls_loss 0.0027 cls_loss_mapping 0.0058 cls_loss_causal 0.5603 re_mapping 0.0067 re_causal 0.0196 /// teacc 98.95 lr 0.00010000 Epoch 170, weight, value: tensor([[-0.0471, -0.0774, -0.0554, ..., -0.0962, -0.0147, -0.0288], [-0.0479, 0.0298, -0.1472, ..., 0.0513, 0.0672, -0.1251], [ 0.0579, -0.0025, -0.0546, ..., -0.0512, -0.0326, 0.0165], ..., [ 0.0510, -0.0460, -0.0052, ..., 0.0265, -0.0400, 0.0224], [-0.0491, -0.0275, -0.0763, ..., -0.0159, -0.0311, -0.0616], [-0.1292, -0.1760, 0.0293, ..., -0.0579, 0.0357, 0.0440]], device='cuda:0'), grad: tensor([[ 2.8312e-07, 0.0000e+00, 1.1176e-08, ..., 0.0000e+00, -3.1665e-08, 0.0000e+00], [ 4.8801e-06, 0.0000e+00, 2.1793e-07, ..., 0.0000e+00, 1.3746e-06, 0.0000e+00], [-4.7348e-06, 0.0000e+00, -2.1420e-08, ..., 0.0000e+00, 1.7509e-07, 0.0000e+00], ..., [ 6.8955e-06, 0.0000e+00, 7.3574e-08, ..., 0.0000e+00, 2.1793e-06, 0.0000e+00], [ 1.3672e-06, 0.0000e+00, 3.8557e-07, ..., 0.0000e+00, 2.7493e-06, 0.0000e+00], [-1.1928e-05, 0.0000e+00, 4.7870e-06, ..., 0.0000e+00, -5.4687e-06, 0.0000e+00]], device='cuda:0') Epoch 170, bias, value: tensor([-0.0050, 0.0230, 0.0049, -0.0016, 0.0106, 0.0060, 0.0244, -0.0135, 0.0080, -0.0033], device='cuda:0'), grad: tensor([-8.3745e-05, 1.4491e-05, -1.7378e-06, 1.4357e-05, -9.7230e-06, 2.9951e-06, -1.1288e-05, 3.9488e-05, 1.7539e-05, 1.7643e-05], device='cuda:0') 100 0.0001 changing lr epoch 169, time 217.55, cls_loss 0.0025 cls_loss_mapping 0.0033 cls_loss_causal 0.5709 re_mapping 0.0070 re_causal 0.0216 /// teacc 98.99 lr 0.00010000 Epoch 171, weight, value: tensor([[-0.0474, -0.0775, -0.0557, ..., -0.0965, -0.0148, -0.0288], [-0.0480, 0.0298, -0.1475, ..., 0.0513, 0.0675, -0.1251], [ 0.0585, -0.0025, -0.0548, ..., -0.0513, -0.0328, 0.0165], ..., [ 0.0508, -0.0460, -0.0056, ..., 0.0267, -0.0401, 0.0224], [-0.0490, -0.0275, -0.0768, ..., -0.0161, -0.0308, -0.0616], [-0.1308, -0.1760, 0.0286, ..., -0.0579, 0.0356, 0.0440]], device='cuda:0'), grad: tensor([[ 3.4459e-08, 0.0000e+00, 4.3772e-08, ..., 0.0000e+00, 6.8918e-08, 0.0000e+00], [ 3.3528e-08, 0.0000e+00, 3.2224e-07, ..., 0.0000e+00, 3.8091e-07, 0.0000e+00], [-2.6077e-08, 0.0000e+00, 2.0396e-07, ..., 0.0000e+00, 4.0978e-07, 0.0000e+00], ..., [ 2.4214e-08, 0.0000e+00, 3.5856e-07, ..., 0.0000e+00, 3.7905e-07, 0.0000e+00], [ 3.1665e-08, 0.0000e+00, 3.1199e-07, ..., 0.0000e+00, 5.0291e-08, 0.0000e+00], [ 1.8626e-08, 0.0000e+00, 1.3508e-05, ..., 0.0000e+00, -1.8384e-06, 0.0000e+00]], device='cuda:0') Epoch 171, bias, value: tensor([-0.0050, 0.0232, 0.0051, -0.0016, 0.0115, 0.0060, 0.0249, -0.0138, 0.0082, -0.0040], device='cuda:0'), grad: tensor([-5.3085e-08, 2.4959e-06, 1.7369e-06, 3.7998e-07, -3.4571e-05, -3.2876e-07, -2.5034e-06, 1.9297e-06, 1.6764e-08, 3.0965e-05], device='cuda:0') 100 0.0001 changing lr epoch 170, time 217.37, cls_loss 0.0055 cls_loss_mapping 0.0079 cls_loss_causal 0.5374 re_mapping 0.0066 re_causal 0.0194 /// teacc 98.98 lr 0.00010000 Epoch 172, weight, value: tensor([[-0.0486, -0.0777, -0.0559, ..., -0.0975, -0.0149, -0.0288], [-0.0497, 0.0299, -0.1479, ..., 0.0565, 0.0661, -0.1251], [ 0.0581, -0.0025, -0.0548, ..., -0.0515, -0.0343, 0.0165], ..., [ 0.0525, -0.0460, -0.0057, ..., 0.0216, -0.0383, 0.0224], [-0.0497, -0.0268, -0.0770, ..., -0.0170, -0.0318, -0.0616], [-0.1322, -0.1763, 0.0288, ..., -0.0568, 0.0354, 0.0440]], device='cuda:0'), grad: tensor([[ 4.7497e-08, 0.0000e+00, -4.0559e-07, ..., 8.8476e-09, 1.0245e-07, 0.0000e+00], [ 8.3726e-07, 0.0000e+00, 1.1362e-07, ..., 9.7789e-09, -4.4936e-07, 0.0000e+00], [-1.4361e-06, 0.0000e+00, 1.1688e-07, ..., 2.1886e-08, 9.0338e-08, 0.0000e+00], ..., [ 6.9663e-07, 0.0000e+00, 1.1921e-07, ..., 5.4482e-08, 2.4028e-07, 0.0000e+00], [ 1.2852e-07, 0.0000e+00, 1.9511e-07, ..., 1.9092e-08, 3.2363e-07, 0.0000e+00], [ 4.5681e-07, 0.0000e+00, 8.3912e-07, ..., 3.1665e-08, -3.1292e-07, 0.0000e+00]], device='cuda:0') Epoch 172, bias, value: tensor([-0.0048, 0.0222, 0.0039, -0.0015, 0.0110, 0.0055, 0.0248, -0.0124, 0.0078, -0.0041], device='cuda:0'), grad: tensor([-3.6210e-06, 3.9814e-07, -1.5739e-06, 9.3654e-06, -1.7276e-07, -7.1637e-06, -2.6654e-06, 1.4063e-06, 1.7658e-06, 2.2445e-06], device='cuda:0') 100 0.0001 changing lr epoch 171, time 217.23, cls_loss 0.0025 cls_loss_mapping 0.0040 cls_loss_causal 0.5282 re_mapping 0.0066 re_causal 0.0205 /// teacc 98.98 lr 0.00010000 Epoch 173, weight, value: tensor([[-0.0492, -0.0784, -0.0559, ..., -0.0977, -0.0150, -0.0288], [-0.0500, 0.0305, -0.1486, ..., 0.0564, 0.0662, -0.1251], [ 0.0576, -0.0027, -0.0546, ..., -0.0516, -0.0349, 0.0165], ..., [ 0.0531, -0.0461, -0.0057, ..., 0.0218, -0.0383, 0.0224], [-0.0500, -0.0266, -0.0769, ..., -0.0171, -0.0315, -0.0616], [-0.1324, -0.1766, 0.0287, ..., -0.0576, 0.0360, 0.0440]], device='cuda:0'), grad: tensor([[ 3.9581e-08, 0.0000e+00, -1.8207e-07, ..., 0.0000e+00, -4.6846e-07, 0.0000e+00], [ 4.2142e-07, 1.1176e-08, 7.5437e-08, ..., 0.0000e+00, 2.0023e-08, 0.0000e+00], [ 1.2061e-07, 3.9581e-08, 9.2201e-08, ..., 0.0000e+00, 1.6298e-08, 0.0000e+00], ..., [ 2.5146e-06, -5.5879e-08, 6.0070e-08, ..., 0.0000e+00, 1.0878e-06, 0.0000e+00], [ 2.6729e-07, 2.3283e-09, -6.8499e-07, ..., 0.0000e+00, 9.3132e-09, 0.0000e+00], [ 8.3074e-07, 0.0000e+00, 2.3330e-07, ..., 0.0000e+00, 1.4761e-07, 0.0000e+00]], device='cuda:0') Epoch 173, bias, value: tensor([-0.0048, 0.0220, 0.0034, -0.0016, 0.0107, 0.0057, 0.0240, -0.0121, 0.0081, -0.0040], device='cuda:0'), grad: tensor([-6.1929e-05, 9.0478e-07, 5.3085e-07, -4.6268e-06, -1.6671e-07, 7.5735e-06, 5.0366e-05, 3.3751e-06, 2.9197e-07, 3.5688e-06], device='cuda:0') 100 0.0001 changing lr epoch 172, time 217.41, cls_loss 0.0021 cls_loss_mapping 0.0035 cls_loss_causal 0.5382 re_mapping 0.0066 re_causal 0.0206 /// teacc 98.99 lr 0.00010000 Epoch 174, weight, value: tensor([[-0.0498, -0.0789, -0.0559, ..., -0.0978, -0.0150, -0.0288], [-0.0502, 0.0306, -0.1494, ..., 0.0565, 0.0661, -0.1251], [ 0.0578, -0.0027, -0.0547, ..., -0.0516, -0.0348, 0.0165], ..., [ 0.0532, -0.0461, -0.0061, ..., 0.0218, -0.0384, 0.0224], [-0.0506, -0.0267, -0.0774, ..., -0.0171, -0.0316, -0.0616], [-0.1330, -0.1775, 0.0287, ..., -0.0577, 0.0363, 0.0440]], device='cuda:0'), grad: tensor([[ 1.5367e-07, 0.0000e+00, 4.1444e-08, ..., 4.6566e-10, 6.8452e-08, 0.0000e+00], [ 4.4843e-07, 0.0000e+00, 2.9802e-08, ..., 4.6566e-10, -1.2349e-06, 0.0000e+00], [ 4.8103e-07, 0.0000e+00, 6.9849e-09, ..., 0.0000e+00, 1.8347e-07, 0.0000e+00], ..., [-3.6843e-06, 0.0000e+00, 1.9092e-08, ..., 4.6566e-10, 7.2410e-07, 0.0000e+00], [ 4.9826e-07, 0.0000e+00, 1.5786e-07, ..., 2.7940e-09, 1.9558e-07, 0.0000e+00], [ 4.2748e-07, 0.0000e+00, 3.6787e-08, ..., 4.6566e-10, 2.1420e-08, 0.0000e+00]], device='cuda:0') Epoch 174, bias, value: tensor([-0.0048, 0.0219, 0.0035, -0.0017, 0.0113, 0.0054, 0.0251, -0.0121, 0.0079, -0.0041], device='cuda:0'), grad: tensor([-3.2969e-07, -1.4976e-06, 1.2200e-06, 1.3344e-05, 3.1153e-07, -1.0528e-05, -1.2852e-07, -4.8652e-06, 1.2126e-06, 1.2703e-06], device='cuda:0') 100 0.0001 changing lr epoch 173, time 217.46, cls_loss 0.0018 cls_loss_mapping 0.0045 cls_loss_causal 0.5394 re_mapping 0.0066 re_causal 0.0205 /// teacc 98.95 lr 0.00010000 Epoch 175, weight, value: tensor([[-0.0501, -0.0797, -0.0562, ..., -0.0986, -0.0151, -0.0288], [-0.0502, 0.0317, -0.1499, ..., 0.0564, 0.0662, -0.1251], [ 0.0577, -0.0035, -0.0547, ..., -0.0518, -0.0352, 0.0165], ..., [ 0.0534, -0.0463, -0.0064, ..., 0.0218, -0.0384, 0.0223], [-0.0513, -0.0275, -0.0775, ..., -0.0175, -0.0314, -0.0616], [-0.1336, -0.1782, 0.0285, ..., -0.0570, 0.0366, 0.0440]], device='cuda:0'), grad: tensor([[ 1.4901e-07, 0.0000e+00, 1.4435e-07, ..., 9.3132e-10, 2.0117e-07, 0.0000e+00], [ 3.7067e-07, 0.0000e+00, 2.6077e-08, ..., 0.0000e+00, -4.3772e-08, 0.0000e+00], [ 3.8277e-07, 0.0000e+00, 2.7940e-08, ..., 0.0000e+00, 6.6124e-08, 0.0000e+00], ..., [-2.6897e-06, 0.0000e+00, 2.2352e-08, ..., 0.0000e+00, 1.7695e-08, 0.0000e+00], [ 1.3281e-06, 0.0000e+00, 1.5348e-06, ..., 1.8626e-09, 2.2277e-06, 0.0000e+00], [ 1.6894e-06, 0.0000e+00, -2.5593e-06, ..., 3.7253e-09, -4.1239e-06, 0.0000e+00]], device='cuda:0') Epoch 175, bias, value: tensor([-0.0047, 0.0219, 0.0033, -0.0006, 0.0113, 0.0038, 0.0250, -0.0120, 0.0077, -0.0040], device='cuda:0'), grad: tensor([ 1.6699e-06, 6.5565e-07, 1.9521e-06, 4.6343e-06, 1.2545e-06, 3.6787e-07, 1.9372e-07, -4.6156e-06, 1.6019e-05, -2.2128e-05], device='cuda:0') 100 0.0001 changing lr epoch 174, time 217.71, cls_loss 0.0026 cls_loss_mapping 0.0034 cls_loss_causal 0.5116 re_mapping 0.0066 re_causal 0.0198 /// teacc 98.87 lr 0.00010000 Epoch 176, weight, value: tensor([[-0.0506, -0.0815, -0.0566, ..., -0.0989, -0.0152, -0.0288], [-0.0502, 0.0346, -0.1505, ..., 0.0565, 0.0662, -0.1252], [ 0.0579, -0.0043, -0.0549, ..., -0.0530, -0.0354, 0.0164], ..., [ 0.0535, -0.0482, -0.0068, ..., 0.0217, -0.0384, 0.0225], [-0.0517, -0.0277, -0.0783, ..., -0.0177, -0.0320, -0.0616], [-0.1337, -0.1789, 0.0304, ..., -0.0571, 0.0368, 0.0440]], device='cuda:0'), grad: tensor([[ 1.2387e-07, 0.0000e+00, 4.2655e-07, ..., 8.3819e-09, 1.3970e-08, 0.0000e+00], [ 4.2282e-07, 0.0000e+00, 5.6904e-07, ..., 9.3132e-08, -3.9116e-08, 0.0000e+00], [ 3.0641e-07, 0.0000e+00, 4.9546e-07, ..., 6.7987e-08, 9.3132e-09, 0.0000e+00], ..., [ 2.0396e-07, 0.0000e+00, 4.9081e-07, ..., 7.4506e-08, 8.9407e-08, 0.0000e+00], [ 2.8498e-07, 0.0000e+00, 1.1828e-07, ..., 1.8626e-09, 2.5146e-08, 0.0000e+00], [ 2.2072e-07, 0.0000e+00, -1.6734e-05, ..., 2.1420e-08, -1.3132e-07, 0.0000e+00]], device='cuda:0') Epoch 176, bias, value: tensor([-0.0047, 0.0219, 0.0033, -0.0008, 0.0094, 0.0048, 0.0238, -0.0120, 0.0073, -0.0023], device='cuda:0'), grad: tensor([ 8.7637e-07, 1.5022e-06, 1.6857e-06, -4.4107e-05, 2.3648e-05, 4.7028e-05, 1.9073e-06, 1.6000e-06, 5.8115e-07, -3.4839e-05], device='cuda:0') 100 0.0001 changing lr epoch 175, time 217.51, cls_loss 0.0023 cls_loss_mapping 0.0032 cls_loss_causal 0.5346 re_mapping 0.0065 re_causal 0.0200 /// teacc 98.95 lr 0.00010000 Epoch 177, weight, value: tensor([[-0.0510, -0.0844, -0.0569, ..., -0.0993, -0.0153, -0.0289], [-0.0501, 0.0381, -0.1520, ..., 0.0567, 0.0666, -0.1252], [ 0.0577, -0.0067, -0.0551, ..., -0.0537, -0.0363, 0.0163], ..., [ 0.0536, -0.0498, -0.0068, ..., 0.0217, -0.0386, 0.0224], [-0.0519, -0.0299, -0.0786, ..., -0.0178, -0.0324, -0.0616], [-0.1342, -0.1804, 0.0313, ..., -0.0572, 0.0367, 0.0441]], device='cuda:0'), grad: tensor([[ 2.3358e-06, 0.0000e+00, 1.5367e-06, ..., 0.0000e+00, 1.4901e-08, 0.0000e+00], [ 1.4555e-04, 0.0000e+00, 3.4645e-07, ..., 0.0000e+00, -1.2191e-06, 0.0000e+00], [ 1.8477e-05, 0.0000e+00, 5.4576e-07, ..., 0.0000e+00, 7.4506e-09, 0.0000e+00], ..., [-2.3496e-04, 0.0000e+00, 7.2643e-08, ..., 0.0000e+00, 1.5553e-07, 0.0000e+00], [ 8.0049e-05, 0.0000e+00, 1.8710e-06, ..., 0.0000e+00, 5.8580e-07, 0.0000e+00], [ 1.0774e-05, 0.0000e+00, 1.1653e-05, ..., 0.0000e+00, 1.1921e-07, 0.0000e+00]], device='cuda:0') Epoch 177, bias, value: tensor([-0.0047, 0.0223, 0.0025, -0.0010, 0.0084, 0.0045, 0.0237, -0.0121, 0.0073, -0.0015], device='cuda:0'), grad: tensor([ 1.3605e-05, 4.6992e-04, 5.9366e-05, 4.4644e-05, 4.5486e-06, -8.6725e-05, -4.1515e-05, -7.9012e-04, 2.5797e-04, 6.7592e-05], device='cuda:0') 100 0.0001 changing lr epoch 176, time 217.40, cls_loss 0.0024 cls_loss_mapping 0.0037 cls_loss_causal 0.5344 re_mapping 0.0067 re_causal 0.0197 /// teacc 98.92 lr 0.00010000 Epoch 178, weight, value: tensor([[-0.0513, -0.0846, -0.0570, ..., -0.0994, -0.0154, -0.0289], [-0.0502, 0.0382, -0.1522, ..., 0.0568, 0.0666, -0.1252], [ 0.0580, -0.0068, -0.0552, ..., -0.0538, -0.0356, 0.0163], ..., [ 0.0537, -0.0498, -0.0069, ..., 0.0215, -0.0387, 0.0224], [-0.0527, -0.0304, -0.0791, ..., -0.0179, -0.0325, -0.0616], [-0.1346, -0.1809, 0.0311, ..., -0.0572, 0.0369, 0.0441]], device='cuda:0'), grad: tensor([[ 8.1956e-08, 0.0000e+00, 5.5879e-09, ..., 0.0000e+00, 1.5832e-08, 0.0000e+00], [ 1.9008e-06, 0.0000e+00, 1.0524e-07, ..., 0.0000e+00, -5.1316e-07, 0.0000e+00], [ 7.5437e-08, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 1.6764e-08, 0.0000e+00], ..., [ 4.9081e-07, 0.0000e+00, -1.0338e-07, ..., 0.0000e+00, 6.6683e-07, 0.0000e+00], [ 4.0978e-08, 0.0000e+00, 7.6089e-07, ..., 0.0000e+00, -4.0978e-08, 0.0000e+00], [-7.0184e-06, 0.0000e+00, 2.9523e-07, ..., 0.0000e+00, -5.6066e-07, 0.0000e+00]], device='cuda:0') Epoch 178, bias, value: tensor([-0.0048, 0.0223, 0.0030, -0.0011, 0.0106, 0.0045, 0.0239, -0.0121, 0.0068, -0.0036], device='cuda:0'), grad: tensor([ 1.0151e-07, 2.7232e-06, 2.9430e-07, 2.7195e-06, 1.2636e-05, -2.7567e-06, 6.4913e-07, 8.3297e-06, 1.2098e-06, -2.5928e-05], device='cuda:0') 100 0.0001 changing lr epoch 177, time 217.36, cls_loss 0.0020 cls_loss_mapping 0.0032 cls_loss_causal 0.4922 re_mapping 0.0066 re_causal 0.0197 /// teacc 99.00 lr 0.00010000 Epoch 179, weight, value: tensor([[-0.0528, -0.0848, -0.0571, ..., -0.0995, -0.0156, -0.0289], [-0.0502, 0.0383, -0.1555, ..., 0.0569, 0.0663, -0.1252], [ 0.0580, -0.0068, -0.0551, ..., -0.0546, -0.0356, 0.0163], ..., [ 0.0536, -0.0499, -0.0071, ..., 0.0215, -0.0389, 0.0224], [-0.0532, -0.0306, -0.0793, ..., -0.0179, -0.0336, -0.0616], [-0.1353, -0.1813, 0.0312, ..., -0.0567, 0.0378, 0.0441]], device='cuda:0'), grad: tensor([[-1.0338e-06, 0.0000e+00, 1.1176e-08, ..., 9.3132e-10, 1.0077e-06, 0.0000e+00], [ 2.9057e-07, -9.3132e-10, 1.4342e-07, ..., 1.2107e-08, -5.4296e-07, 0.0000e+00], [-7.0669e-06, 0.0000e+00, 2.4214e-08, ..., 9.3132e-10, 1.4687e-06, 0.0000e+00], ..., [ 8.2254e-06, 9.3132e-10, 2.7940e-08, ..., 1.8626e-09, 4.9453e-07, 0.0000e+00], [ 5.4911e-06, 0.0000e+00, 4.0978e-08, ..., 9.3132e-10, 7.5996e-07, 0.0000e+00], [ 2.1681e-06, 0.0000e+00, 1.3784e-07, ..., 1.1176e-08, 1.1455e-07, 0.0000e+00]], device='cuda:0') Epoch 179, bias, value: tensor([-0.0053, 0.0218, 0.0030, -0.0009, 0.0117, 0.0045, 0.0242, -0.0123, 0.0061, -0.0035], device='cuda:0'), grad: tensor([-1.0490e-05, -2.0899e-06, -1.4510e-06, -1.3478e-05, 6.2101e-06, 7.2829e-07, -1.9118e-05, 1.6659e-05, 1.0513e-05, 1.2442e-05], device='cuda:0') 100 0.0001 changing lr epoch 178, time 217.36, cls_loss 0.0021 cls_loss_mapping 0.0038 cls_loss_causal 0.5240 re_mapping 0.0070 re_causal 0.0196 /// teacc 98.83 lr 0.00010000 Epoch 180, weight, value: tensor([[-0.0541, -0.0853, -0.0572, ..., -0.0995, -0.0157, -0.0289], [-0.0502, 0.0384, -0.1557, ..., 0.0569, 0.0664, -0.1253], [ 0.0584, -0.0046, -0.0547, ..., -0.0546, -0.0357, 0.0164], ..., [ 0.0535, -0.0516, -0.0072, ..., 0.0215, -0.0390, 0.0223], [-0.0538, -0.0308, -0.0794, ..., -0.0179, -0.0336, -0.0617], [-0.1358, -0.1820, 0.0310, ..., -0.0568, 0.0379, 0.0441]], device='cuda:0'), grad: tensor([[-9.3132e-09, 0.0000e+00, 3.7253e-08, ..., 0.0000e+00, 7.4863e-05, 0.0000e+00], [ 6.7800e-07, 0.0000e+00, 8.7917e-07, ..., 0.0000e+00, -1.2340e-06, 0.0000e+00], [ 3.1330e-06, 0.0000e+00, 1.5739e-07, ..., 0.0000e+00, 3.6694e-07, 0.0000e+00], ..., [ 4.0978e-06, 0.0000e+00, 2.1141e-07, ..., 0.0000e+00, 2.4959e-06, 0.0000e+00], [-7.8380e-06, 0.0000e+00, 9.4995e-08, ..., 0.0000e+00, 9.9558e-07, 0.0000e+00], [-5.6904e-07, 0.0000e+00, 3.1032e-06, ..., 0.0000e+00, -4.0382e-06, 0.0000e+00]], device='cuda:0') Epoch 180, bias, value: tensor([-0.0050, 0.0217, 0.0031, -0.0003, 0.0118, 0.0042, 0.0246, -0.0124, 0.0060, -0.0036], device='cuda:0'), grad: tensor([ 9.6560e-04, 2.4959e-06, 1.2040e-05, 1.9390e-06, -4.2245e-06, 3.2391e-06, -9.8038e-04, 4.6074e-05, -4.3392e-05, -4.9397e-06], device='cuda:0') 100 0.0001 changing lr epoch 179, time 217.26, cls_loss 0.0032 cls_loss_mapping 0.0034 cls_loss_causal 0.5091 re_mapping 0.0062 re_causal 0.0188 /// teacc 98.98 lr 0.00010000 Epoch 181, weight, value: tensor([[-0.0544, -0.0860, -0.0573, ..., -0.0995, -0.0160, -0.0322], [-0.0497, 0.0399, -0.1558, ..., 0.0569, 0.0667, -0.1261], [ 0.0580, -0.0057, -0.0549, ..., -0.0547, -0.0361, 0.0138], ..., [ 0.0532, -0.0525, -0.0076, ..., 0.0215, -0.0393, 0.0215], [-0.0537, -0.0309, -0.0796, ..., -0.0179, -0.0338, -0.0619], [-0.1372, -0.1827, 0.0305, ..., -0.0568, 0.0384, 0.0474]], device='cuda:0'), grad: tensor([[ 2.2352e-08, 0.0000e+00, 4.9360e-08, ..., 5.5879e-09, 1.6112e-07, 9.3132e-10], [ 7.4226e-07, 0.0000e+00, 8.8476e-08, ..., -1.3569e-06, -1.0379e-05, 9.3132e-10], [ 3.1535e-06, 0.0000e+00, 4.8429e-08, ..., 2.7940e-09, 1.0338e-07, -2.4214e-08], ..., [-5.9530e-06, 0.0000e+00, 3.3155e-07, ..., 9.6392e-07, 8.4266e-06, 2.0489e-08], [ 7.8231e-08, 0.0000e+00, -3.7253e-09, ..., 1.3970e-08, -2.3935e-07, 0.0000e+00], [-2.8908e-06, 0.0000e+00, 6.4075e-07, ..., 2.2165e-07, -1.3754e-05, 0.0000e+00]], device='cuda:0') Epoch 181, bias, value: tensor([-0.0074, 0.0221, 0.0024, -0.0004, 0.0120, 0.0045, 0.0251, -0.0127, 0.0062, -0.0027], device='cuda:0'), grad: tensor([-2.8223e-05, -1.3173e-05, 7.3463e-06, 3.0454e-06, 8.0705e-05, 1.8431e-06, 6.7241e-06, 8.4639e-06, -3.4757e-06, -6.3300e-05], device='cuda:0') 100 0.0001 changing lr epoch 180, time 217.34, cls_loss 0.0033 cls_loss_mapping 0.0051 cls_loss_causal 0.5516 re_mapping 0.0072 re_causal 0.0200 /// teacc 98.91 lr 0.00010000 Epoch 182, weight, value: tensor([[-0.0548, -0.0890, -0.0563, ..., -0.0997, -0.0162, -0.0323], [-0.0496, 0.0410, -0.1559, ..., 0.0568, 0.0668, -0.1264], [ 0.0580, -0.0062, -0.0546, ..., -0.0548, -0.0364, 0.0142], ..., [ 0.0566, -0.0531, -0.0078, ..., 0.0215, -0.0381, 0.0211], [-0.0545, -0.0310, -0.0795, ..., -0.0181, -0.0330, -0.0622], [-0.1408, -0.1838, 0.0300, ..., -0.0565, 0.0356, 0.0475]], device='cuda:0'), grad: tensor([[ 1.7695e-08, 0.0000e+00, 3.2596e-08, ..., 0.0000e+00, 1.5832e-08, 0.0000e+00], [ 2.6077e-08, 0.0000e+00, 2.6077e-08, ..., 0.0000e+00, -1.5832e-07, 0.0000e+00], [-4.2655e-07, 0.0000e+00, 3.0734e-08, ..., 0.0000e+00, 1.8626e-08, 0.0000e+00], ..., [ 4.7404e-07, 0.0000e+00, 3.7253e-08, ..., 0.0000e+00, 1.3318e-07, 0.0000e+00], [ 4.3772e-08, 0.0000e+00, 3.6135e-07, ..., 0.0000e+00, 1.9092e-07, 0.0000e+00], [ 9.0338e-08, 0.0000e+00, 3.1851e-07, ..., 0.0000e+00, -1.0151e-07, 0.0000e+00]], device='cuda:0') Epoch 182, bias, value: tensor([-0.0066, 0.0222, 0.0023, -0.0042, 0.0120, 0.0041, 0.0255, -0.0102, 0.0067, -0.0053], device='cuda:0'), grad: tensor([-3.2317e-07, -2.5425e-07, -3.8464e-07, -4.3213e-07, 1.0058e-07, -1.5080e-05, 1.3612e-05, 9.4157e-07, 1.3318e-06, 4.6473e-07], device='cuda:0') 100 0.0001 changing lr epoch 181, time 217.48, cls_loss 0.0021 cls_loss_mapping 0.0037 cls_loss_causal 0.5057 re_mapping 0.0069 re_causal 0.0202 /// teacc 99.00 lr 0.00010000 Epoch 183, weight, value: tensor([[-0.0550, -0.0909, -0.0564, ..., -0.0999, -0.0163, -0.0323], [-0.0499, 0.0418, -0.1561, ..., 0.0568, 0.0668, -0.1268], [ 0.0580, -0.0065, -0.0556, ..., -0.0548, -0.0367, 0.0136], ..., [ 0.0564, -0.0537, -0.0085, ..., 0.0216, -0.0380, 0.0213], [-0.0557, -0.0313, -0.0794, ..., -0.0182, -0.0336, -0.0627], [-0.1408, -0.1867, 0.0301, ..., -0.0561, 0.0358, 0.0475]], device='cuda:0'), grad: tensor([[ 2.1979e-07, 9.3132e-09, 3.7253e-08, ..., 0.0000e+00, 2.3283e-08, 1.6764e-08], [-1.0796e-05, 6.5193e-09, 1.0245e-08, ..., 0.0000e+00, -1.1310e-05, 3.7253e-09], [-2.8443e-06, -2.1327e-07, 1.3039e-08, ..., 0.0000e+00, 1.7229e-07, 5.5879e-09], ..., [ 1.1265e-05, 2.1700e-07, 5.4948e-08, ..., 0.0000e+00, 1.1042e-05, 2.5146e-08], [ 9.1176e-07, 5.5879e-09, 6.6590e-07, ..., 0.0000e+00, -4.8950e-06, 5.5879e-09], [ 7.1526e-07, 2.2352e-08, 1.1548e-07, ..., 0.0000e+00, 5.3085e-08, 5.1223e-08]], device='cuda:0') Epoch 183, bias, value: tensor([-0.0065, 0.0221, 0.0021, -0.0034, 0.0118, 0.0036, 0.0256, -0.0103, 0.0062, -0.0051], device='cuda:0'), grad: tensor([-8.0094e-07, -2.5913e-05, -3.1013e-06, 2.0325e-05, 3.2689e-07, -1.8045e-05, 1.4335e-05, 2.5362e-05, -1.4544e-05, 2.0210e-06], device='cuda:0') 100 0.0001 changing lr epoch 182, time 217.24, cls_loss 0.0029 cls_loss_mapping 0.0048 cls_loss_causal 0.5477 re_mapping 0.0070 re_causal 0.0204 /// teacc 98.82 lr 0.00010000 Epoch 184, weight, value: tensor([[-0.0544, -0.0920, -0.0566, ..., -0.1026, -0.0160, -0.0323], [-0.0502, 0.0422, -0.1562, ..., 0.0568, 0.0670, -0.1272], [ 0.0581, -0.0055, -0.0555, ..., -0.0556, -0.0371, 0.0135], ..., [ 0.0566, -0.0549, -0.0094, ..., 0.0219, -0.0381, 0.0214], [-0.0569, -0.0312, -0.0802, ..., -0.0186, -0.0344, -0.0627], [-0.1411, -0.1877, 0.0294, ..., -0.0572, 0.0357, 0.0475]], device='cuda:0'), grad: tensor([[ 2.7847e-07, 0.0000e+00, 4.3027e-07, ..., 0.0000e+00, 4.3400e-07, 0.0000e+00], [ 1.3225e-07, 0.0000e+00, 3.3341e-07, ..., 3.7253e-09, -2.3376e-07, 0.0000e+00], [-9.7323e-07, 0.0000e+00, 3.4831e-07, ..., 0.0000e+00, 1.1399e-06, 0.0000e+00], ..., [ 7.5251e-07, 0.0000e+00, 1.2005e-06, ..., -5.5879e-09, 2.8759e-06, 0.0000e+00], [ 1.5711e-06, 0.0000e+00, 8.8010e-07, ..., 0.0000e+00, 2.6412e-06, 0.0000e+00], [-2.1495e-06, 0.0000e+00, -1.5972e-06, ..., 9.3132e-10, -9.2760e-06, 0.0000e+00]], device='cuda:0') Epoch 184, bias, value: tensor([-0.0061, 0.0220, 0.0020, -0.0034, 0.0120, 0.0038, 0.0250, -0.0102, 0.0054, -0.0053], device='cuda:0'), grad: tensor([ 3.7812e-06, 1.0710e-07, 1.7788e-06, 1.1902e-06, 5.0850e-07, -2.4047e-06, 1.5311e-06, 1.0528e-05, 1.1221e-05, -2.8238e-05], device='cuda:0') 100 0.0001 changing lr epoch 183, time 217.79, cls_loss 0.0021 cls_loss_mapping 0.0038 cls_loss_causal 0.5195 re_mapping 0.0068 re_causal 0.0195 /// teacc 98.89 lr 0.00010000 Epoch 185, weight, value: tensor([[-0.0545, -0.0928, -0.0553, ..., -0.1031, -0.0161, -0.0323], [-0.0503, 0.0431, -0.1564, ..., 0.0567, 0.0671, -0.1274], [ 0.0588, -0.0058, -0.0556, ..., -0.0570, -0.0373, 0.0136], ..., [ 0.0566, -0.0556, -0.0097, ..., 0.0219, -0.0382, 0.0213], [-0.0583, -0.0305, -0.0802, ..., -0.0189, -0.0337, -0.0628], [-0.1410, -0.1885, 0.0290, ..., -0.0563, 0.0361, 0.0475]], device='cuda:0'), grad: tensor([[ 1.4994e-07, 0.0000e+00, -4.2841e-08, ..., 6.6124e-08, 8.4750e-08, 0.0000e+00], [ 2.0731e-06, 0.0000e+00, 3.2596e-08, ..., 1.6810e-06, -1.0934e-06, 0.0000e+00], [ 6.6776e-07, 0.0000e+00, 2.1420e-08, ..., 3.0734e-08, 7.6462e-07, 0.0000e+00], ..., [-4.1425e-06, 0.0000e+00, 5.1223e-08, ..., -2.5332e-06, 2.3190e-07, 0.0000e+00], [ 8.0094e-08, 0.0000e+00, 8.2888e-08, ..., 2.9802e-08, 6.8918e-08, 0.0000e+00], [ 5.2620e-07, 0.0000e+00, 8.8196e-07, ..., 4.2468e-07, -2.0862e-07, 0.0000e+00]], device='cuda:0') Epoch 185, bias, value: tensor([-0.0056, 0.0220, 0.0027, -0.0043, 0.0120, 0.0060, 0.0236, -0.0103, 0.0055, -0.0054], device='cuda:0'), grad: tensor([ 1.6671e-07, 5.0887e-06, 2.9076e-06, 1.7481e-06, -1.5274e-07, -2.8033e-07, -1.3877e-06, -1.1422e-05, 4.6659e-07, 2.8554e-06], device='cuda:0') 100 0.0001 changing lr epoch 184, time 217.34, cls_loss 0.0020 cls_loss_mapping 0.0033 cls_loss_causal 0.5266 re_mapping 0.0068 re_causal 0.0198 /// teacc 98.95 lr 0.00010000 Epoch 186, weight, value: tensor([[-0.0548, -0.0931, -0.0579, ..., -0.1046, -0.0160, -0.0323], [-0.0504, 0.0432, -0.1566, ..., 0.0568, 0.0672, -0.1276], [ 0.0590, -0.0058, -0.0560, ..., -0.0575, -0.0375, 0.0136], ..., [ 0.0565, -0.0556, -0.0118, ..., 0.0215, -0.0383, 0.0212], [-0.0586, -0.0308, -0.0785, ..., -0.0201, -0.0335, -0.0629], [-0.1411, -0.1887, 0.0287, ..., -0.0570, 0.0361, 0.0475]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 0.0000e+00, -1.9092e-07, ..., 0.0000e+00, 2.3097e-07, 0.0000e+00], [ 6.1467e-08, 0.0000e+00, 1.2480e-07, ..., 0.0000e+00, -6.1281e-07, 0.0000e+00], [-8.5682e-08, 0.0000e+00, 5.2154e-08, ..., 0.0000e+00, 4.0978e-08, 0.0000e+00], ..., [ 2.1141e-07, 0.0000e+00, 2.0843e-06, ..., 0.0000e+00, 1.1548e-07, 9.3132e-10], [-3.3602e-06, 0.0000e+00, 6.7987e-08, ..., 0.0000e+00, 2.4084e-06, 0.0000e+00], [ 3.0063e-06, 0.0000e+00, 3.0279e-05, ..., 0.0000e+00, -2.7940e-08, -3.7253e-09]], device='cuda:0') Epoch 186, bias, value: tensor([-0.0056, 0.0219, 0.0027, -0.0043, 0.0123, 0.0053, 0.0230, -0.0104, 0.0073, -0.0055], device='cuda:0'), grad: tensor([-1.3756e-06, 5.6140e-06, 1.0943e-06, 7.5251e-07, -5.7906e-05, 2.9095e-06, -1.0684e-05, 6.6906e-06, -2.1085e-05, 7.3910e-05], device='cuda:0') 100 0.0001 changing lr epoch 185, time 217.74, cls_loss 0.0022 cls_loss_mapping 0.0035 cls_loss_causal 0.5201 re_mapping 0.0067 re_causal 0.0185 /// teacc 99.01 lr 0.00010000 Epoch 187, weight, value: tensor([[-0.0551, -0.0933, -0.0580, ..., -0.1058, -0.0160, -0.0323], [-0.0503, 0.0434, -0.1567, ..., 0.0578, 0.0675, -0.1280], [ 0.0588, -0.0061, -0.0567, ..., -0.0578, -0.0380, 0.0134], ..., [ 0.0566, -0.0555, -0.0122, ..., 0.0207, -0.0386, 0.0206], [-0.0596, -0.0312, -0.0787, ..., -0.0206, -0.0336, -0.0637], [-0.1412, -0.1891, 0.0284, ..., -0.0587, 0.0364, 0.0476]], device='cuda:0'), grad: tensor([[ 3.4273e-07, 0.0000e+00, 6.4261e-08, ..., 0.0000e+00, 1.7043e-07, 0.0000e+00], [ 6.6496e-07, 0.0000e+00, 1.2107e-07, ..., 0.0000e+00, -2.0396e-07, 0.0000e+00], [-1.7926e-05, 0.0000e+00, 1.6112e-07, ..., 0.0000e+00, 2.8871e-08, 0.0000e+00], ..., [ 1.2584e-05, 0.0000e+00, 1.0263e-06, ..., 0.0000e+00, 1.8226e-06, 4.8429e-08], [ 4.0829e-06, 0.0000e+00, 1.2945e-07, ..., 0.0000e+00, 7.9535e-07, 0.0000e+00], [ 3.6135e-07, 0.0000e+00, -3.2820e-06, ..., 0.0000e+00, -2.6561e-06, -5.7742e-08]], device='cuda:0') Epoch 187, bias, value: tensor([-0.0055, 0.0221, 0.0024, -0.0041, 0.0121, 0.0050, 0.0241, -0.0106, 0.0072, -0.0053], device='cuda:0'), grad: tensor([ 2.3600e-06, 1.3169e-06, -2.9624e-05, 9.9465e-07, 1.6585e-05, 8.2105e-06, -9.0748e-06, 3.7760e-05, 1.7405e-05, -4.5925e-05], device='cuda:0') 100 0.0001 changing lr epoch 186, time 217.68, cls_loss 0.0023 cls_loss_mapping 0.0040 cls_loss_causal 0.5198 re_mapping 0.0063 re_causal 0.0189 /// teacc 98.92 lr 0.00010000 Epoch 188, weight, value: tensor([[-0.0553, -0.0941, -0.0582, ..., -0.1067, -0.0162, -0.0323], [-0.0504, 0.0444, -0.1573, ..., 0.0578, 0.0676, -0.1281], [ 0.0593, -0.0071, -0.0552, ..., -0.0579, -0.0369, 0.0134], ..., [ 0.0565, -0.0558, -0.0124, ..., 0.0207, -0.0390, 0.0195], [-0.0602, -0.0324, -0.0787, ..., -0.0209, -0.0323, -0.0639], [-0.1407, -0.1899, 0.0281, ..., -0.0607, 0.0369, 0.0476]], device='cuda:0'), grad: tensor([[ 5.5507e-07, 0.0000e+00, 3.4925e-07, ..., 0.0000e+00, 2.2352e-08, 0.0000e+00], [ 7.9349e-07, 0.0000e+00, 6.1467e-08, ..., 0.0000e+00, 5.3085e-08, 0.0000e+00], [-1.7434e-05, 0.0000e+00, 8.5030e-07, ..., 0.0000e+00, 1.3225e-07, 0.0000e+00], ..., [-4.6659e-07, 0.0000e+00, 5.4948e-08, ..., 0.0000e+00, 9.3132e-10, 0.0000e+00], [ 1.7984e-06, 2.7940e-09, 1.5832e-07, ..., 0.0000e+00, 8.5682e-08, 0.0000e+00], [ 2.0638e-06, 0.0000e+00, 5.6997e-06, ..., 0.0000e+00, -2.4773e-07, 0.0000e+00]], device='cuda:0') Epoch 188, bias, value: tensor([-0.0055, 0.0220, 0.0032, -0.0042, 0.0124, 0.0048, 0.0240, -0.0109, 0.0081, -0.0052], device='cuda:0'), grad: tensor([ 1.1846e-06, 1.5805e-06, -4.0084e-05, 3.7253e-05, -1.7092e-05, 2.5891e-07, 1.2200e-07, -7.5344e-07, 3.8780e-06, 1.3664e-05], device='cuda:0') 100 0.0001 changing lr epoch 187, time 217.32, cls_loss 0.0020 cls_loss_mapping 0.0034 cls_loss_causal 0.5193 re_mapping 0.0064 re_causal 0.0192 /// teacc 98.91 lr 0.00010000 Epoch 189, weight, value: tensor([[-0.0554, -0.0953, -0.0581, ..., -0.1068, -0.0164, -0.0323], [-0.0505, 0.0446, -0.1574, ..., 0.0593, 0.0681, -0.1284], [ 0.0592, -0.0063, -0.0565, ..., -0.0582, -0.0371, 0.0127], ..., [ 0.0565, -0.0565, -0.0119, ..., 0.0196, -0.0394, 0.0195], [-0.0607, -0.0328, -0.0795, ..., -0.0212, -0.0324, -0.0647], [-0.1407, -0.1907, 0.0285, ..., -0.0607, 0.0370, 0.0476]], device='cuda:0'), grad: tensor([[ 1.5339e-06, 9.2201e-08, 4.7497e-08, ..., 1.8626e-09, 1.5553e-07, 3.7253e-09], [-5.2340e-07, -3.1143e-06, 3.9861e-07, ..., 4.0978e-08, -5.1856e-06, 2.0489e-08], [-2.5079e-05, 6.6031e-07, 5.6811e-08, ..., 3.7253e-09, 1.0421e-06, 8.3819e-09], ..., [ 3.1106e-06, 1.7853e-06, 7.7952e-07, ..., 3.7253e-09, 2.9616e-06, 2.7940e-08], [ 1.5395e-06, 1.9558e-07, 7.8231e-08, ..., 0.0000e+00, 3.1292e-07, 4.9360e-08], [ 1.0505e-06, 1.3970e-07, 2.2426e-06, ..., 1.1828e-07, 2.2911e-07, 6.5193e-08]], device='cuda:0') Epoch 189, bias, value: tensor([-0.0055, 0.0223, 0.0030, -0.0042, 0.0123, 0.0047, 0.0245, -0.0111, 0.0076, -0.0052], device='cuda:0'), grad: tensor([ 1.1865e-06, -8.8587e-06, -4.2409e-05, -5.9485e-05, -9.2685e-06, 9.1076e-05, 2.1942e-06, 1.2279e-05, 3.5465e-06, 9.6038e-06], device='cuda:0') 100 0.0001 changing lr epoch 188, time 217.87, cls_loss 0.0019 cls_loss_mapping 0.0027 cls_loss_causal 0.5406 re_mapping 0.0064 re_causal 0.0199 /// teacc 98.99 lr 0.00010000 Epoch 190, weight, value: tensor([[-0.0557, -0.0962, -0.0581, ..., -0.1073, -0.0163, -0.0323], [-0.0505, 0.0448, -0.1578, ..., 0.0610, 0.0685, -0.1292], [ 0.0596, -0.0063, -0.0563, ..., -0.0582, -0.0371, 0.0127], ..., [ 0.0565, -0.0566, -0.0121, ..., 0.0180, -0.0397, 0.0198], [-0.0611, -0.0326, -0.0796, ..., -0.0214, -0.0332, -0.0649], [-0.1408, -0.1916, 0.0284, ..., -0.0611, 0.0370, 0.0476]], device='cuda:0'), grad: tensor([[ 4.8429e-08, 3.4273e-07, 1.2852e-07, ..., 2.7940e-09, 1.7975e-07, 0.0000e+00], [-1.0589e-06, -1.9640e-05, 2.3283e-07, ..., 6.5193e-09, -4.0047e-07, 0.0000e+00], [ 2.2165e-07, 1.2331e-05, 1.1455e-07, ..., 3.7253e-09, 3.2783e-07, 0.0000e+00], ..., [ 2.0582e-07, 7.9721e-07, 2.6450e-07, ..., 5.1223e-08, 1.2740e-06, 0.0000e+00], [ 8.1025e-08, 1.8906e-07, 2.5053e-07, ..., 9.3132e-10, -2.5108e-06, 0.0000e+00], [-5.1223e-08, 2.9802e-08, 2.7698e-06, ..., 4.6566e-09, -2.4009e-06, 0.0000e+00]], device='cuda:0') Epoch 190, bias, value: tensor([-0.0052, 0.0227, 0.0035, -0.0040, 0.0123, 0.0033, 0.0253, -0.0114, 0.0076, -0.0052], device='cuda:0'), grad: tensor([ 2.0191e-06, -7.1228e-05, 4.4823e-05, 4.6268e-06, 6.1393e-06, 1.7136e-05, 7.8678e-06, 8.6725e-06, -1.5676e-05, -4.4778e-06], device='cuda:0') 100 0.0001 changing lr epoch 189, time 217.83, cls_loss 0.0023 cls_loss_mapping 0.0035 cls_loss_causal 0.5231 re_mapping 0.0063 re_causal 0.0190 /// teacc 98.96 lr 0.00010000 Epoch 191, weight, value: tensor([[-0.0559, -0.0985, -0.0588, ..., -0.1076, -0.0170, -0.0324], [-0.0501, 0.0477, -0.1580, ..., 0.0612, 0.0687, -0.1306], [ 0.0588, -0.0090, -0.0593, ..., -0.0583, -0.0379, 0.0126], ..., [ 0.0566, -0.0575, -0.0122, ..., 0.0179, -0.0396, 0.0192], [-0.0611, -0.0319, -0.0797, ..., -0.0215, -0.0339, -0.0655], [-0.1412, -0.1933, 0.0307, ..., -0.0612, 0.0371, 0.0478]], device='cuda:0'), grad: tensor([[ 1.9372e-07, 0.0000e+00, 7.4506e-08, ..., 0.0000e+00, 1.6019e-07, 1.6578e-07], [ 4.8056e-07, 0.0000e+00, 4.1071e-07, ..., 0.0000e+00, -8.9221e-07, 2.7940e-09], [ 1.8701e-06, 0.0000e+00, 1.8813e-07, ..., 0.0000e+00, 1.2945e-07, 1.6764e-08], ..., [ 4.8243e-06, 0.0000e+00, 7.8976e-07, ..., 0.0000e+00, 8.7265e-07, 0.0000e+00], [ 7.1060e-07, 0.0000e+00, 3.9302e-07, ..., 0.0000e+00, -4.9919e-07, 1.6484e-07], [ 1.2619e-06, 0.0000e+00, 1.7649e-06, ..., 0.0000e+00, 4.1351e-07, 7.4506e-09]], device='cuda:0') Epoch 191, bias, value: tensor([-0.0056, 0.0231, 0.0017, -0.0038, 0.0125, 0.0014, 0.0254, -0.0113, 0.0074, -0.0046], device='cuda:0'), grad: tensor([ 2.2724e-06, -1.3318e-07, 2.1532e-06, -1.0058e-05, -4.9174e-06, -1.0151e-07, -4.2915e-06, 8.5682e-06, -2.0396e-07, 6.6534e-06], device='cuda:0') 100 0.0001 changing lr ---------------------saving model at epoch 190---------------------------------------------------- epoch 190, time 218.54, cls_loss 0.0026 cls_loss_mapping 0.0034 cls_loss_causal 0.5075 re_mapping 0.0065 re_causal 0.0187 /// teacc 99.13 lr 0.00010000 Epoch 192, weight, value: tensor([[-0.0564, -0.1009, -0.0593, ..., -0.1078, -0.0182, -0.0324], [-0.0500, 0.0478, -0.1583, ..., 0.0612, 0.0689, -0.1308], [ 0.0549, -0.0103, -0.0633, ..., -0.0583, -0.0407, 0.0126], ..., [ 0.0572, -0.0554, -0.0125, ..., 0.0179, -0.0394, 0.0192], [-0.0620, -0.0329, -0.0800, ..., -0.0216, -0.0335, -0.0655], [-0.1417, -0.1968, 0.0305, ..., -0.0612, 0.0378, 0.0478]], device='cuda:0'), grad: tensor([[ 1.4715e-07, 0.0000e+00, 1.6764e-08, ..., 0.0000e+00, 5.8673e-08, 0.0000e+00], [-9.4064e-08, 9.3132e-10, 5.5879e-09, ..., 0.0000e+00, -3.1013e-06, 0.0000e+00], [ 6.9570e-07, 9.3132e-10, 1.8626e-09, ..., 0.0000e+00, 8.5682e-07, 0.0000e+00], ..., [ 2.1327e-07, -1.7695e-08, 7.4506e-09, ..., 0.0000e+00, 1.7565e-06, 0.0000e+00], [-5.7742e-08, 1.8626e-09, 1.6287e-05, ..., 0.0000e+00, 4.7088e-06, 0.0000e+00], [ 8.6613e-08, 1.8626e-09, -1.7077e-05, ..., 0.0000e+00, -5.1893e-06, 0.0000e+00]], device='cuda:0') Epoch 192, bias, value: tensor([-0.0063, 0.0232, -0.0031, -0.0041, 0.0126, 0.0044, 0.0250, -0.0107, 0.0071, -0.0044], device='cuda:0'), grad: tensor([-8.1584e-07, -7.7188e-06, 2.5351e-06, -2.5053e-06, 2.0508e-06, 2.9467e-06, 2.2277e-06, 5.1036e-06, 3.5316e-05, -3.9220e-05], device='cuda:0') 100 0.0001 changing lr epoch 191, time 217.78, cls_loss 0.0013 cls_loss_mapping 0.0023 cls_loss_causal 0.4607 re_mapping 0.0063 re_causal 0.0180 /// teacc 98.80 lr 0.00010000 Epoch 193, weight, value: tensor([[-0.0561, -0.1010, -0.0593, ..., -0.1079, -0.0184, -0.0324], [-0.0504, 0.0478, -0.1585, ..., 0.0612, 0.0689, -0.1308], [ 0.0550, -0.0103, -0.0634, ..., -0.0583, -0.0411, 0.0126], ..., [ 0.0574, -0.0554, -0.0136, ..., 0.0179, -0.0394, 0.0192], [-0.0623, -0.0326, -0.0802, ..., -0.0217, -0.0336, -0.0656], [-0.1419, -0.1969, 0.0305, ..., -0.0612, 0.0380, 0.0478]], device='cuda:0'), grad: tensor([[ 6.5193e-08, 0.0000e+00, 6.2399e-08, ..., 0.0000e+00, 3.9116e-08, 0.0000e+00], [ 1.1455e-07, 0.0000e+00, 1.1548e-07, ..., 9.3132e-10, -2.7940e-07, 0.0000e+00], [-2.2277e-06, 0.0000e+00, 5.7742e-08, ..., 0.0000e+00, 3.0734e-08, 0.0000e+00], ..., [ 4.8578e-06, -9.3132e-10, 1.5274e-07, ..., -3.7253e-09, 2.7101e-07, 0.0000e+00], [ 1.0421e-06, 0.0000e+00, 9.4716e-07, ..., 0.0000e+00, 6.7335e-07, 0.0000e+00], [-2.3469e-06, 0.0000e+00, -8.0541e-06, ..., 9.3132e-10, -6.3926e-06, 0.0000e+00]], device='cuda:0') Epoch 193, bias, value: tensor([-0.0063, 0.0229, -0.0032, -0.0041, 0.0125, 0.0045, 0.0251, -0.0106, 0.0069, -0.0043], device='cuda:0'), grad: tensor([ 1.2601e-06, -1.0058e-07, -2.3302e-06, 4.3511e-05, -9.2667e-07, 1.4929e-06, 2.5891e-07, 7.1712e-06, 5.6811e-06, -5.6058e-05], device='cuda:0') 100 0.0001 changing lr epoch 192, time 217.55, cls_loss 0.0022 cls_loss_mapping 0.0027 cls_loss_causal 0.4963 re_mapping 0.0064 re_causal 0.0185 /// teacc 99.02 lr 0.00010000 Epoch 194, weight, value: tensor([[-0.0559, -0.1018, -0.0595, ..., -0.1079, -0.0187, -0.0324], [-0.0507, 0.0484, -0.1586, ..., 0.0612, 0.0690, -0.1311], [ 0.0552, -0.0105, -0.0633, ..., -0.0583, -0.0413, 0.0129], ..., [ 0.0573, -0.0561, -0.0134, ..., 0.0180, -0.0395, 0.0189], [-0.0629, -0.0313, -0.0803, ..., -0.0219, -0.0337, -0.0657], [-0.1417, -0.1972, 0.0308, ..., -0.0612, 0.0386, 0.0478]], device='cuda:0'), grad: tensor([[ 6.6124e-08, 0.0000e+00, -1.5749e-06, ..., 0.0000e+00, 3.7253e-08, 2.7940e-09], [ 4.4666e-06, 0.0000e+00, 4.6566e-09, ..., 0.0000e+00, 9.4716e-07, 4.6566e-08], [-1.3016e-05, 0.0000e+00, 1.2107e-08, ..., 0.0000e+00, -5.3458e-06, 1.8626e-08], ..., [-8.2105e-06, 0.0000e+00, 9.3132e-09, ..., 0.0000e+00, -1.1446e-06, -2.3935e-07], [ 1.4435e-07, 0.0000e+00, 8.3819e-08, ..., 0.0000e+00, 1.4994e-07, 8.3819e-09], [ 1.2061e-06, 0.0000e+00, 4.7218e-07, ..., 0.0000e+00, -1.2759e-07, 5.5879e-08]], device='cuda:0') Epoch 194, bias, value: tensor([-0.0071, 0.0228, -0.0029, -0.0041, 0.0121, 0.0047, 0.0248, -0.0108, 0.0068, -0.0035], device='cuda:0'), grad: tensor([-8.2403e-06, 1.9744e-05, -8.9824e-05, 6.2846e-06, 4.1910e-06, 1.0291e-06, 9.0480e-05, -2.6152e-05, 2.3749e-07, 2.1402e-06], device='cuda:0') 100 0.0001 changing lr epoch 193, time 217.64, cls_loss 0.0020 cls_loss_mapping 0.0040 cls_loss_causal 0.5150 re_mapping 0.0060 re_causal 0.0186 /// teacc 98.87 lr 0.00010000 Epoch 195, weight, value: tensor([[-0.0560, -0.1023, -0.0593, ..., -0.1092, -0.0187, -0.0324], [-0.0494, 0.0485, -0.1588, ..., 0.0612, 0.0702, -0.1329], [ 0.0555, -0.0100, -0.0633, ..., -0.0587, -0.0413, 0.0128], ..., [ 0.0564, -0.0569, -0.0131, ..., 0.0187, -0.0407, 0.0198], [-0.0635, -0.0286, -0.0805, ..., -0.0237, -0.0343, -0.0658], [-0.1417, -0.1975, 0.0304, ..., -0.0614, 0.0388, 0.0478]], device='cuda:0'), grad: tensor([[ 3.0734e-08, 0.0000e+00, 4.4238e-08, ..., 1.1176e-08, 1.5907e-06, 0.0000e+00], [ 5.3504e-07, 9.3132e-10, 1.5181e-07, ..., 2.6077e-08, -6.4790e-05, 0.0000e+00], [ 1.3132e-07, 0.0000e+00, 4.2375e-08, ..., 1.0710e-08, 3.1441e-05, 0.0000e+00], ..., [-1.4473e-06, -1.8626e-09, 2.0768e-07, ..., 5.3551e-08, 1.1120e-06, 0.0000e+00], [ 9.9652e-08, 0.0000e+00, 5.8208e-08, ..., 5.1223e-09, 2.7835e-05, 0.0000e+00], [ 2.9476e-07, 0.0000e+00, 1.7919e-06, ..., 3.1339e-07, 8.3353e-08, 0.0000e+00]], device='cuda:0') Epoch 195, bias, value: tensor([-0.0069, 0.0242, -0.0026, -0.0042, 0.0122, 0.0045, 0.0248, -0.0119, 0.0071, -0.0037], device='cuda:0'), grad: tensor([ 3.6284e-06, -1.4067e-04, 6.9082e-05, 3.7532e-07, -6.2436e-06, 1.4976e-06, 5.6848e-06, 7.6927e-07, 6.1214e-05, 4.6268e-06], device='cuda:0') 100 0.0001 changing lr epoch 194, time 217.62, cls_loss 0.0022 cls_loss_mapping 0.0035 cls_loss_causal 0.5276 re_mapping 0.0060 re_causal 0.0191 /// teacc 98.90 lr 0.00010000 Epoch 196, weight, value: tensor([[-0.0561, -0.1033, -0.0594, ..., -0.1103, -0.0213, -0.0324], [-0.0495, 0.0485, -0.1590, ..., 0.0612, 0.0703, -0.1334], [ 0.0563, -0.0076, -0.0632, ..., -0.0588, -0.0417, 0.0128], ..., [ 0.0562, -0.0595, -0.0136, ..., 0.0187, -0.0407, 0.0200], [-0.0639, -0.0286, -0.0806, ..., -0.0242, -0.0348, -0.0658], [-0.1419, -0.1980, 0.0303, ..., -0.0613, 0.0401, 0.0478]], device='cuda:0'), grad: tensor([[ 4.9360e-08, 0.0000e+00, 3.4925e-08, ..., 8.3819e-09, 1.4529e-07, 0.0000e+00], [ 3.1758e-07, 0.0000e+00, 9.6858e-08, ..., 1.7695e-08, -3.7719e-08, 0.0000e+00], [-5.3868e-06, 0.0000e+00, -8.4145e-07, ..., 1.8626e-08, 1.7695e-08, 0.0000e+00], ..., [-4.7218e-07, 0.0000e+00, 3.5018e-07, ..., 1.0291e-07, 3.1199e-08, 0.0000e+00], [ 6.6124e-08, 0.0000e+00, -3.3341e-07, ..., 1.8626e-09, -3.5390e-08, 0.0000e+00], [ 5.2154e-08, 0.0000e+00, 5.9092e-07, ..., 1.1642e-08, -2.6450e-07, 0.0000e+00]], device='cuda:0') Epoch 196, bias, value: tensor([-0.0090, 0.0242, -0.0018, -0.0041, 0.0124, 0.0045, 0.0242, -0.0122, 0.0069, -0.0023], device='cuda:0'), grad: tensor([ 8.5980e-06, 1.0505e-06, -6.4299e-06, 1.0617e-06, -1.9446e-06, 8.2701e-06, -2.0280e-05, -5.0105e-07, 8.1286e-06, 2.0191e-06], device='cuda:0') 100 0.0001 changing lr epoch 195, time 218.00, cls_loss 0.0015 cls_loss_mapping 0.0026 cls_loss_causal 0.5352 re_mapping 0.0064 re_causal 0.0190 /// teacc 98.94 lr 0.00010000 Epoch 197, weight, value: tensor([[-0.0559, -0.1038, -0.0593, ..., -0.1106, -0.0215, -0.0324], [-0.0495, 0.0485, -0.1592, ..., 0.0611, 0.0705, -0.1337], [ 0.0567, -0.0072, -0.0629, ..., -0.0589, -0.0419, 0.0133], ..., [ 0.0562, -0.0598, -0.0135, ..., 0.0187, -0.0408, 0.0193], [-0.0646, -0.0285, -0.0807, ..., -0.0245, -0.0362, -0.0660], [-0.1419, -0.1988, 0.0303, ..., -0.0610, 0.0402, 0.0478]], device='cuda:0'), grad: tensor([[-1.1688e-07, 0.0000e+00, 3.8883e-07, ..., 1.8626e-09, 6.7987e-08, 0.0000e+00], [ 3.2177e-07, 0.0000e+00, 2.2305e-07, ..., 9.7789e-09, -8.3353e-08, 0.0000e+00], [ 7.3528e-07, 0.0000e+00, 1.4482e-07, ..., 2.3283e-09, 1.9558e-07, 0.0000e+00], ..., [ 8.3167e-07, 0.0000e+00, 7.4971e-07, ..., 4.7963e-08, 1.1129e-07, 0.0000e+00], [ 5.8115e-07, 0.0000e+00, 1.7490e-06, ..., 4.6566e-10, 5.6811e-08, 0.0000e+00], [ 1.5358e-06, 0.0000e+00, 4.4517e-06, ..., 1.3970e-08, 1.0617e-07, 0.0000e+00]], device='cuda:0') Epoch 197, bias, value: tensor([-0.0087, 0.0243, -0.0016, -0.0041, 0.0123, 0.0043, 0.0238, -0.0123, 0.0063, -0.0023], device='cuda:0'), grad: tensor([-1.9390e-06, 4.6426e-07, 2.5649e-06, 7.7128e-05, 1.9874e-06, -9.3937e-05, -6.4820e-06, 2.4643e-06, 4.3772e-06, 1.3426e-05], device='cuda:0') 100 0.0001 changing lr epoch 196, time 217.69, cls_loss 0.0016 cls_loss_mapping 0.0029 cls_loss_causal 0.5102 re_mapping 0.0061 re_causal 0.0183 /// teacc 98.95 lr 0.00010000 Epoch 198, weight, value: tensor([[-0.0553, -0.1042, -0.0594, ..., -0.1115, -0.0203, -0.0324], [-0.0496, 0.0486, -0.1594, ..., 0.0614, 0.0704, -0.1337], [ 0.0570, -0.0072, -0.0629, ..., -0.0589, -0.0423, 0.0133], ..., [ 0.0562, -0.0599, -0.0138, ..., 0.0184, -0.0408, 0.0193], [-0.0661, -0.0282, -0.0810, ..., -0.0248, -0.0366, -0.0660], [-0.1421, -0.1994, 0.0307, ..., -0.0607, 0.0405, 0.0478]], device='cuda:0'), grad: tensor([[-4.8168e-06, 0.0000e+00, 1.0710e-08, ..., 0.0000e+00, -5.8264e-06, 0.0000e+00], [ 2.8554e-06, 9.3132e-10, 8.2888e-08, ..., 4.6566e-10, 1.5646e-06, 0.0000e+00], [ 3.4273e-06, -4.1910e-09, 1.2573e-08, ..., 0.0000e+00, 6.5286e-07, 0.0000e+00], ..., [-4.3064e-06, 2.7940e-09, 4.4703e-08, ..., 9.3132e-10, 3.6322e-07, 0.0000e+00], [ 4.5309e-07, 0.0000e+00, 1.7229e-08, ..., 0.0000e+00, 1.2247e-07, 4.6566e-10], [ 2.3469e-07, 0.0000e+00, 2.4401e-07, ..., 2.3283e-09, 2.3423e-07, 0.0000e+00]], device='cuda:0') Epoch 198, bias, value: tensor([-0.0077, 0.0241, -0.0015, -0.0044, 0.0123, 0.0042, 0.0239, -0.0124, 0.0050, -0.0023], device='cuda:0'), grad: tensor([-3.3975e-05, 1.3359e-05, 7.4357e-06, 1.5944e-06, 3.2457e-07, -1.4855e-07, 1.3851e-05, -5.1409e-06, 9.0431e-07, 1.8040e-06], device='cuda:0') 100 0.0001 changing lr epoch 197, time 217.76, cls_loss 0.0023 cls_loss_mapping 0.0036 cls_loss_causal 0.5214 re_mapping 0.0064 re_causal 0.0179 /// teacc 98.89 lr 0.00010000 Epoch 199, weight, value: tensor([[-0.0551, -0.1055, -0.0586, ..., -0.1120, -0.0202, -0.0324], [-0.0494, 0.0486, -0.1595, ..., 0.0612, 0.0706, -0.1345], [ 0.0567, -0.0073, -0.0626, ..., -0.0592, -0.0431, 0.0136], ..., [ 0.0563, -0.0597, -0.0141, ..., 0.0190, -0.0409, 0.0195], [-0.0657, -0.0283, -0.0811, ..., -0.0251, -0.0361, -0.0663], [-0.1426, -0.2014, 0.0302, ..., -0.0626, 0.0406, 0.0478]], device='cuda:0'), grad: tensor([[ 7.3668e-07, 9.3132e-10, 1.8813e-07, ..., 0.0000e+00, 2.0210e-07, 1.6764e-08], [ 3.2410e-06, 2.5146e-08, 7.4226e-07, ..., 9.3132e-10, 2.8964e-07, 7.4506e-09], [ 4.4592e-06, 1.7695e-08, 9.4436e-07, ..., 9.3132e-10, 7.5996e-07, 1.1176e-08], ..., [-2.0385e-05, -6.7055e-08, 1.2452e-06, ..., -1.2107e-08, -4.1537e-06, 2.3842e-07], [-1.3702e-05, 9.3132e-09, -6.6943e-06, ..., 0.0000e+00, -1.0058e-05, 9.3132e-09], [ 1.7971e-05, 7.4506e-09, 8.9128e-07, ..., 0.0000e+00, 1.1697e-05, -2.9430e-07]], device='cuda:0') Epoch 199, bias, value: tensor([-0.0066, 0.0242, -0.0018, -0.0045, 0.0124, 0.0046, 0.0243, -0.0122, 0.0054, -0.0036], device='cuda:0'), grad: tensor([ 2.0191e-06, 9.6858e-06, 1.3098e-05, 1.1854e-05, 1.3299e-05, 8.8010e-07, -1.8626e-08, -5.0873e-05, -9.9599e-05, 9.9659e-05], device='cuda:0') 100 0.0001 changing lr epoch 198, time 217.76, cls_loss 0.0025 cls_loss_mapping 0.0038 cls_loss_causal 0.5459 re_mapping 0.0061 re_causal 0.0182 /// teacc 98.90 lr 0.00010000 Epoch 200, weight, value: tensor([[-0.0557, -0.1071, -0.0590, ..., -0.1147, -0.0204, -0.0324], [-0.0494, 0.0491, -0.1600, ..., 0.0614, 0.0709, -0.1355], [ 0.0565, -0.0077, -0.0625, ..., -0.0595, -0.0434, 0.0117], ..., [ 0.0565, -0.0596, -0.0155, ..., 0.0187, -0.0410, 0.0189], [-0.0661, -0.0266, -0.0825, ..., -0.0281, -0.0365, -0.0693], [-0.1430, -0.2037, 0.0285, ..., -0.0653, 0.0397, 0.0478]], device='cuda:0'), grad: tensor([[ 5.6811e-08, 9.3132e-10, 3.4459e-08, ..., 1.4622e-07, 6.3330e-07, 2.7940e-09], [ 6.2399e-07, 9.3132e-09, 1.8626e-08, ..., -4.4346e-05, -5.9754e-05, 2.8871e-08], [-1.0710e-06, 2.7940e-09, 1.8626e-09, ..., 1.0431e-07, 2.7847e-07, 7.4506e-09], ..., [ 5.6531e-07, 7.4506e-09, 2.7940e-08, ..., 3.6180e-05, 4.7743e-05, 2.3283e-08], [ 2.6356e-07, 2.0489e-08, 1.4808e-07, ..., 2.7940e-09, 2.4121e-07, 5.5879e-08], [ 2.1979e-07, 1.6764e-08, 1.3784e-07, ..., 4.5113e-06, 5.9046e-06, 5.1223e-08]], device='cuda:0') Epoch 200, bias, value: tensor([-0.0068, 0.0244, -0.0021, -0.0049, 0.0147, 0.0054, 0.0251, -0.0122, 0.0047, -0.0056], device='cuda:0'), grad: tensor([ 1.7108e-06, -2.0659e-04, -7.8697e-07, -6.5029e-05, 1.6347e-05, 6.3658e-05, 5.0478e-07, 1.6820e-04, 1.1297e-06, 2.1100e-05], device='cuda:0') 100 0.0001 changing lr epoch 199, time 217.80, cls_loss 0.0022 cls_loss_mapping 0.0039 cls_loss_causal 0.5363 re_mapping 0.0064 re_causal 0.0190 /// teacc 98.85 lr 0.00001000 Epoch 201, weight, value: tensor([[-0.0558, -0.1075, -0.0597, ..., -0.1150, -0.0204, -0.0324], [-0.0494, 0.0489, -0.1612, ..., 0.0615, 0.0709, -0.1365], [ 0.0563, -0.0076, -0.0633, ..., -0.0596, -0.0440, 0.0089], ..., [ 0.0566, -0.0597, -0.0167, ..., 0.0187, -0.0411, 0.0182], [-0.0664, -0.0264, -0.0824, ..., -0.0285, -0.0351, -0.0708], [-0.1432, -0.2039, 0.0286, ..., -0.0658, 0.0400, 0.0480]], device='cuda:0'), grad: tensor([[ 1.5739e-07, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 9.0338e-08, 0.0000e+00], [ 4.1984e-06, 0.0000e+00, 3.4180e-07, ..., 0.0000e+00, 1.9372e-06, 0.0000e+00], [ 2.9579e-06, 0.0000e+00, 1.6764e-08, ..., 0.0000e+00, 2.7195e-06, 0.0000e+00], ..., [-4.7237e-05, 0.0000e+00, 3.6992e-06, ..., 0.0000e+00, -2.3842e-05, -9.3132e-10], [ 5.6103e-06, 0.0000e+00, 5.3570e-06, ..., 0.0000e+00, 3.3602e-06, 9.3132e-10], [ 3.3200e-05, 0.0000e+00, -2.1994e-05, ..., 0.0000e+00, 1.4156e-05, 0.0000e+00]], device='cuda:0') Epoch 201, bias, value: tensor([-0.0068, 0.0244, -0.0026, -0.0052, 0.0149, 0.0056, 0.0250, -0.0123, 0.0057, -0.0054], device='cuda:0'), grad: tensor([ 1.4668e-06, 3.3379e-05, 4.2379e-05, 1.5264e-06, 3.7491e-05, 9.4026e-06, 9.5926e-08, -4.4703e-04, 7.0095e-05, 2.5082e-04], device='cuda:0') 100 1e-05 changing lr epoch 200, time 217.27, cls_loss 0.0020 cls_loss_mapping 0.0034 cls_loss_causal 0.5173 re_mapping 0.0064 re_causal 0.0187 /// teacc 98.95 lr 0.00001000 Epoch 202, weight, value: tensor([[-0.0558, -0.1076, -0.0598, ..., -0.1151, -0.0204, -0.0325], [-0.0495, 0.0490, -0.1613, ..., 0.0615, 0.0709, -0.1365], [ 0.0563, -0.0076, -0.0633, ..., -0.0596, -0.0440, 0.0089], ..., [ 0.0566, -0.0597, -0.0167, ..., 0.0187, -0.0411, 0.0181], [-0.0664, -0.0263, -0.0824, ..., -0.0285, -0.0350, -0.0709], [-0.1433, -0.2039, 0.0287, ..., -0.0659, 0.0400, 0.0480]], device='cuda:0'), grad: tensor([[ 7.7300e-08, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.7695e-08, 0.0000e+00], [ 1.8533e-07, 0.0000e+00, 4.6566e-09, ..., 0.0000e+00, -4.2841e-08, 0.0000e+00], [-4.6752e-07, 0.0000e+00, -1.2107e-08, ..., 0.0000e+00, 1.9558e-08, 0.0000e+00], ..., [ 1.5115e-06, 0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 3.6787e-07, 0.0000e+00], [ 7.6368e-08, 0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 4.6566e-08, 0.0000e+00], [ 1.4808e-07, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, -7.7114e-07, 0.0000e+00]], device='cuda:0') Epoch 202, bias, value: tensor([-0.0068, 0.0244, -0.0026, -0.0051, 0.0148, 0.0056, 0.0250, -0.0123, 0.0058, -0.0054], device='cuda:0'), grad: tensor([-2.4475e-06, 6.0629e-07, -1.9027e-06, -1.8049e-06, 1.6913e-06, 2.6543e-07, -1.1642e-07, 2.5313e-06, 5.5693e-07, 6.2771e-07], device='cuda:0') 100 1e-05 changing lr epoch 201, time 217.37, cls_loss 0.0018 cls_loss_mapping 0.0024 cls_loss_causal 0.5187 re_mapping 0.0058 re_causal 0.0183 /// teacc 98.99 lr 0.00001000 Epoch 203, weight, value: tensor([[-0.0558, -0.1076, -0.0598, ..., -0.1151, -0.0204, -0.0325], [-0.0496, 0.0490, -0.1613, ..., 0.0615, 0.0709, -0.1366], [ 0.0563, -0.0076, -0.0633, ..., -0.0596, -0.0441, 0.0089], ..., [ 0.0567, -0.0597, -0.0168, ..., 0.0187, -0.0411, 0.0181], [-0.0664, -0.0263, -0.0824, ..., -0.0285, -0.0350, -0.0709], [-0.1433, -0.2040, 0.0286, ..., -0.0659, 0.0401, 0.0480]], device='cuda:0'), grad: tensor([[ 1.7323e-07, 9.3132e-10, 6.5193e-09, ..., 0.0000e+00, 1.4901e-08, 0.0000e+00], [ 4.9453e-07, -1.0338e-07, 7.1153e-07, ..., 0.0000e+00, -8.5682e-08, 0.0000e+00], [-5.2713e-06, 6.7987e-08, 3.0734e-08, ..., 0.0000e+00, 5.8673e-08, 0.0000e+00], ..., [ 6.1467e-07, 2.0489e-08, 9.0525e-07, ..., 0.0000e+00, 7.7765e-07, 0.0000e+00], [ 3.8445e-06, 9.3132e-10, 1.1176e-08, ..., 0.0000e+00, 3.2596e-08, 0.0000e+00], [-4.9546e-07, 0.0000e+00, -9.7603e-07, ..., 0.0000e+00, -1.0189e-06, 0.0000e+00]], device='cuda:0') Epoch 203, bias, value: tensor([-0.0068, 0.0242, -0.0026, -0.0051, 0.0148, 0.0055, 0.0249, -0.0122, 0.0058, -0.0054], device='cuda:0'), grad: tensor([-4.6939e-07, 1.6829e-06, -1.3433e-05, 6.0815e-07, 1.9409e-06, 5.5321e-07, 5.7556e-07, 1.0334e-05, 9.3132e-06, -1.1131e-05], device='cuda:0') 100 1e-05 changing lr epoch 202, time 217.21, cls_loss 0.0014 cls_loss_mapping 0.0016 cls_loss_causal 0.5039 re_mapping 0.0056 re_causal 0.0177 /// teacc 99.04 lr 0.00001000 Epoch 204, weight, value: tensor([[-0.0558, -0.1077, -0.0599, ..., -0.1151, -0.0204, -0.0325], [-0.0496, 0.0490, -0.1613, ..., 0.0615, 0.0709, -0.1367], [ 0.0564, -0.0076, -0.0633, ..., -0.0596, -0.0441, 0.0089], ..., [ 0.0567, -0.0597, -0.0169, ..., 0.0187, -0.0411, 0.0181], [-0.0665, -0.0264, -0.0824, ..., -0.0286, -0.0350, -0.0709], [-0.1433, -0.2040, 0.0286, ..., -0.0659, 0.0401, 0.0480]], device='cuda:0'), grad: tensor([[ 1.8626e-09, 0.0000e+00, 5.1223e-08, ..., 9.3132e-10, 2.7940e-08, 9.3132e-10], [ 4.9360e-08, 0.0000e+00, 3.1907e-06, ..., 9.3132e-09, 7.2643e-08, 0.0000e+00], [-1.9558e-08, 0.0000e+00, 1.0431e-07, ..., 1.8626e-09, 3.5390e-08, 0.0000e+00], ..., [-1.9558e-08, 0.0000e+00, 2.5444e-06, ..., 4.7497e-08, 2.9523e-07, 0.0000e+00], [ 9.3132e-10, 0.0000e+00, 1.5087e-07, ..., 0.0000e+00, 1.4342e-07, 1.8626e-09], [ 2.7940e-09, 0.0000e+00, 6.3144e-06, ..., 5.5879e-09, 1.6391e-07, 0.0000e+00]], device='cuda:0') Epoch 204, bias, value: tensor([-0.0068, 0.0242, -0.0026, -0.0051, 0.0149, 0.0055, 0.0249, -0.0121, 0.0058, -0.0054], device='cuda:0'), grad: tensor([ 2.1327e-07, 1.0818e-05, 3.7812e-07, 1.4342e-07, -3.7342e-05, 6.9756e-07, -8.4843e-07, 7.5474e-06, 9.1828e-07, 1.7449e-05], device='cuda:0') 100 1e-05 changing lr epoch 203, time 217.32, cls_loss 0.0016 cls_loss_mapping 0.0016 cls_loss_causal 0.4828 re_mapping 0.0055 re_causal 0.0171 /// teacc 99.02 lr 0.00001000 Epoch 205, weight, value: tensor([[-0.0558, -0.1078, -0.0599, ..., -0.1151, -0.0205, -0.0325], [-0.0497, 0.0490, -0.1614, ..., 0.0616, 0.0709, -0.1367], [ 0.0564, -0.0076, -0.0633, ..., -0.0596, -0.0441, 0.0088], ..., [ 0.0568, -0.0597, -0.0169, ..., 0.0186, -0.0412, 0.0181], [-0.0665, -0.0264, -0.0824, ..., -0.0286, -0.0351, -0.0709], [-0.1433, -0.2040, 0.0286, ..., -0.0659, 0.0401, 0.0480]], device='cuda:0'), grad: tensor([[ 1.0338e-07, 0.0000e+00, 1.4994e-07, ..., 0.0000e+00, 5.2154e-08, 0.0000e+00], [ 2.0210e-07, 0.0000e+00, 3.5390e-08, ..., 9.3132e-10, 1.6391e-07, 0.0000e+00], [-5.2713e-07, 0.0000e+00, 3.6322e-08, ..., 0.0000e+00, 1.4994e-07, 0.0000e+00], ..., [ 1.8626e-07, 0.0000e+00, 9.1270e-08, ..., 9.3132e-10, 1.8720e-07, 0.0000e+00], [ 4.9919e-07, 0.0000e+00, 1.4333e-06, ..., 0.0000e+00, 1.2573e-07, 0.0000e+00], [ 2.1979e-07, 0.0000e+00, -1.7742e-06, ..., 3.7253e-09, -1.3188e-06, 0.0000e+00]], device='cuda:0') Epoch 205, bias, value: tensor([-0.0068, 0.0241, -0.0026, -0.0051, 0.0149, 0.0055, 0.0249, -0.0121, 0.0058, -0.0054], device='cuda:0'), grad: tensor([-2.7567e-07, 1.3702e-05, 5.9698e-07, 2.7511e-06, 1.5814e-06, -2.4289e-06, 1.5087e-07, 1.3495e-06, -1.0237e-05, -7.2308e-06], device='cuda:0') 100 1e-05 changing lr epoch 204, time 217.47, cls_loss 0.0019 cls_loss_mapping 0.0018 cls_loss_causal 0.4965 re_mapping 0.0054 re_causal 0.0175 /// teacc 99.00 lr 0.00001000 Epoch 206, weight, value: tensor([[-0.0558, -0.1078, -0.0599, ..., -0.1152, -0.0205, -0.0325], [-0.0499, 0.0491, -0.1614, ..., 0.0618, 0.0710, -0.1369], [ 0.0564, -0.0076, -0.0633, ..., -0.0596, -0.0441, 0.0088], ..., [ 0.0569, -0.0597, -0.0169, ..., 0.0184, -0.0412, 0.0182], [-0.0665, -0.0264, -0.0823, ..., -0.0287, -0.0351, -0.0709], [-0.1433, -0.2040, 0.0285, ..., -0.0660, 0.0401, 0.0480]], device='cuda:0'), grad: tensor([[ 1.4435e-07, 0.0000e+00, -2.6543e-07, ..., 0.0000e+00, 5.4948e-08, 0.0000e+00], [ 2.2724e-07, 0.0000e+00, 8.1025e-08, ..., 0.0000e+00, -5.5879e-08, 0.0000e+00], [-7.2550e-07, 0.0000e+00, 1.7695e-08, ..., 0.0000e+00, 1.7695e-08, 0.0000e+00], ..., [ 3.0734e-08, 0.0000e+00, 3.8184e-08, ..., 0.0000e+00, 4.6566e-08, 0.0000e+00], [ 1.6149e-06, 0.0000e+00, 7.2643e-08, ..., 0.0000e+00, 3.7253e-08, 0.0000e+00], [ 9.3132e-08, 0.0000e+00, 6.8545e-07, ..., 0.0000e+00, -6.2399e-08, 0.0000e+00]], device='cuda:0') Epoch 206, bias, value: tensor([-0.0068, 0.0240, -0.0026, -0.0051, 0.0149, 0.0055, 0.0249, -0.0120, 0.0059, -0.0054], device='cuda:0'), grad: tensor([-8.0373e-07, 6.0908e-07, -6.3889e-07, -2.4177e-06, 5.5283e-06, 5.7369e-07, -7.8827e-06, -1.6950e-07, 3.4273e-06, 1.7742e-06], device='cuda:0') 100 1e-05 changing lr epoch 205, time 217.63, cls_loss 0.0015 cls_loss_mapping 0.0017 cls_loss_causal 0.5079 re_mapping 0.0053 re_causal 0.0177 /// teacc 99.02 lr 0.00001000 Epoch 207, weight, value: tensor([[-0.0558, -0.1079, -0.0599, ..., -0.1153, -0.0205, -0.0325], [-0.0500, 0.0491, -0.1614, ..., 0.0618, 0.0710, -0.1369], [ 0.0564, -0.0076, -0.0634, ..., -0.0596, -0.0441, 0.0088], ..., [ 0.0569, -0.0597, -0.0170, ..., 0.0184, -0.0412, 0.0181], [-0.0665, -0.0264, -0.0823, ..., -0.0288, -0.0351, -0.0709], [-0.1434, -0.2041, 0.0284, ..., -0.0660, 0.0401, 0.0480]], device='cuda:0'), grad: tensor([[ 9.0338e-08, 0.0000e+00, 7.4506e-09, ..., 0.0000e+00, 3.1758e-07, 0.0000e+00], [ 3.8277e-07, 0.0000e+00, 5.4948e-08, ..., 0.0000e+00, -1.8235e-06, 0.0000e+00], [ 9.8720e-07, 0.0000e+00, -7.4506e-09, ..., 0.0000e+00, 1.7416e-07, 0.0000e+00], ..., [-2.5183e-06, 0.0000e+00, 2.4308e-07, ..., 0.0000e+00, 7.6741e-07, 0.0000e+00], [ 8.7544e-08, 0.0000e+00, 7.4506e-09, ..., 0.0000e+00, 1.0710e-07, 0.0000e+00], [ 1.8813e-07, 0.0000e+00, 2.6729e-07, ..., 1.8626e-09, 8.2888e-08, 0.0000e+00]], device='cuda:0') Epoch 207, bias, value: tensor([-0.0068, 0.0239, -0.0026, -0.0051, 0.0149, 0.0055, 0.0249, -0.0119, 0.0059, -0.0054], device='cuda:0'), grad: tensor([-2.4647e-05, -2.9095e-06, 3.4720e-06, 1.2005e-06, -3.2783e-07, 1.5674e-06, 2.0713e-05, -2.5723e-06, 2.7046e-06, 8.3167e-07], device='cuda:0') 100 1e-05 changing lr epoch 206, time 217.72, cls_loss 0.0016 cls_loss_mapping 0.0014 cls_loss_causal 0.5105 re_mapping 0.0053 re_causal 0.0176 /// teacc 99.01 lr 0.00001000 Epoch 208, weight, value: tensor([[-0.0559, -0.1079, -0.0599, ..., -0.1154, -0.0205, -0.0325], [-0.0500, 0.0492, -0.1614, ..., 0.0620, 0.0711, -0.1369], [ 0.0564, -0.0076, -0.0634, ..., -0.0597, -0.0443, 0.0088], ..., [ 0.0569, -0.0597, -0.0170, ..., 0.0183, -0.0412, 0.0181], [-0.0665, -0.0264, -0.0823, ..., -0.0288, -0.0352, -0.0709], [-0.1434, -0.2041, 0.0284, ..., -0.0660, 0.0401, 0.0480]], device='cuda:0'), grad: tensor([[ 2.1420e-08, 0.0000e+00, -1.3039e-08, ..., 2.7940e-09, 5.8673e-08, 0.0000e+00], [ 8.0932e-07, -1.1176e-08, 1.8626e-09, ..., 1.2573e-07, -8.9407e-07, 0.0000e+00], [ 6.7055e-08, 1.8626e-09, 9.3132e-10, ..., 5.5879e-09, 7.6368e-08, 0.0000e+00], ..., [-2.0042e-06, 7.4506e-09, 2.7940e-09, ..., -3.2596e-07, 3.5763e-07, 0.0000e+00], [ 1.3039e-08, 0.0000e+00, 1.7695e-08, ..., 9.3132e-10, 6.9849e-08, 0.0000e+00], [ 9.5740e-07, 0.0000e+00, 4.6566e-09, ..., 1.7416e-07, 1.1176e-08, 0.0000e+00]], device='cuda:0') Epoch 208, bias, value: tensor([-0.0067, 0.0239, -0.0027, -0.0051, 0.0149, 0.0055, 0.0249, -0.0119, 0.0059, -0.0055], device='cuda:0'), grad: tensor([ 4.1630e-07, -6.7987e-07, 1.2973e-06, 3.9116e-07, 1.0571e-06, 3.4366e-07, -2.8312e-07, -3.2913e-06, -1.4212e-06, 2.1700e-06], device='cuda:0') 100 1e-05 changing lr epoch 207, time 217.40, cls_loss 0.0015 cls_loss_mapping 0.0014 cls_loss_causal 0.4776 re_mapping 0.0052 re_causal 0.0165 /// teacc 99.04 lr 0.00001000 Epoch 209, weight, value: tensor([[-0.0559, -0.1080, -0.0599, ..., -0.1154, -0.0205, -0.0325], [-0.0501, 0.0492, -0.1615, ..., 0.0620, 0.0711, -0.1369], [ 0.0564, -0.0076, -0.0634, ..., -0.0597, -0.0443, 0.0088], ..., [ 0.0570, -0.0597, -0.0170, ..., 0.0183, -0.0412, 0.0181], [-0.0665, -0.0265, -0.0823, ..., -0.0289, -0.0352, -0.0709], [-0.1434, -0.2041, 0.0284, ..., -0.0661, 0.0401, 0.0480]], device='cuda:0'), grad: tensor([[ 3.7253e-08, 9.3132e-10, 6.7055e-08, ..., 0.0000e+00, 8.1956e-08, 0.0000e+00], [ 9.4716e-07, -3.9116e-08, 1.1083e-07, ..., 1.1176e-08, -1.5832e-08, 0.0000e+00], [ 2.2072e-07, 1.7695e-08, 2.9802e-08, ..., 9.3132e-10, 1.3690e-07, 0.0000e+00], ..., [-1.1697e-06, 8.3819e-09, 7.5437e-08, ..., 3.7253e-09, 1.8068e-07, 0.0000e+00], [ 6.8918e-08, 0.0000e+00, 2.4680e-07, ..., 9.3132e-10, 6.1560e-07, 0.0000e+00], [ 4.3400e-07, 0.0000e+00, 1.0245e-07, ..., 7.4506e-09, -1.7835e-06, 0.0000e+00]], device='cuda:0') Epoch 209, bias, value: tensor([-0.0068, 0.0239, -0.0027, -0.0051, 0.0149, 0.0055, 0.0249, -0.0119, 0.0060, -0.0055], device='cuda:0'), grad: tensor([ 4.2096e-07, 1.8552e-06, 1.0254e-06, 2.9095e-06, 1.0394e-06, -2.1867e-06, -9.6019e-07, -1.7770e-06, 1.6447e-06, -3.9972e-06], device='cuda:0') 100 1e-05 changing lr epoch 208, time 217.22, cls_loss 0.0015 cls_loss_mapping 0.0013 cls_loss_causal 0.4651 re_mapping 0.0050 re_causal 0.0163 /// teacc 99.05 lr 0.00001000 Epoch 210, weight, value: tensor([[-0.0559, -0.1080, -0.0600, ..., -0.1156, -0.0205, -0.0325], [-0.0501, 0.0492, -0.1615, ..., 0.0621, 0.0711, -0.1370], [ 0.0564, -0.0076, -0.0634, ..., -0.0597, -0.0444, 0.0088], ..., [ 0.0570, -0.0597, -0.0170, ..., 0.0182, -0.0413, 0.0181], [-0.0666, -0.0264, -0.0823, ..., -0.0289, -0.0352, -0.0709], [-0.1434, -0.2041, 0.0284, ..., -0.0660, 0.0402, 0.0480]], device='cuda:0'), grad: tensor([[ 3.1237e-06, 0.0000e+00, 8.3819e-09, ..., 0.0000e+00, 7.4506e-09, 0.0000e+00], [ 6.9197e-07, 0.0000e+00, 5.2154e-08, ..., 0.0000e+00, -3.7253e-08, 0.0000e+00], [-9.0972e-06, 0.0000e+00, 2.0489e-08, ..., 0.0000e+00, 1.8626e-08, 0.0000e+00], ..., [-3.3807e-07, 0.0000e+00, 2.0489e-08, ..., 0.0000e+00, -8.4750e-08, 0.0000e+00], [ 2.5462e-06, 0.0000e+00, -3.8091e-07, ..., 0.0000e+00, 1.1176e-08, 0.0000e+00], [ 1.1455e-07, 0.0000e+00, 2.3376e-07, ..., 0.0000e+00, -1.2107e-08, 0.0000e+00]], device='cuda:0') Epoch 210, bias, value: tensor([-0.0068, 0.0239, -0.0027, -0.0051, 0.0149, 0.0055, 0.0249, -0.0119, 0.0061, -0.0055], device='cuda:0'), grad: tensor([ 1.0259e-05, 1.9968e-06, -3.1561e-05, 6.5044e-06, 1.6205e-07, 7.1432e-07, 4.2766e-06, -6.2864e-07, 6.6943e-06, 1.5358e-06], device='cuda:0') 100 1e-05 changing lr epoch 209, time 217.24, cls_loss 0.0013 cls_loss_mapping 0.0013 cls_loss_causal 0.5117 re_mapping 0.0050 re_causal 0.0171 /// teacc 99.05 lr 0.00001000 Epoch 211, weight, value: tensor([[-0.0559, -0.1080, -0.0600, ..., -0.1156, -0.0205, -0.0325], [-0.0502, 0.0492, -0.1615, ..., 0.0621, 0.0712, -0.1370], [ 0.0564, -0.0076, -0.0634, ..., -0.0597, -0.0444, 0.0088], ..., [ 0.0571, -0.0597, -0.0170, ..., 0.0181, -0.0413, 0.0181], [-0.0666, -0.0265, -0.0823, ..., -0.0290, -0.0353, -0.0709], [-0.1435, -0.2041, 0.0283, ..., -0.0659, 0.0402, 0.0480]], device='cuda:0'), grad: tensor([[ 5.5879e-09, 0.0000e+00, 2.0489e-08, ..., 0.0000e+00, 5.3085e-08, 0.0000e+00], [ 2.4680e-07, 0.0000e+00, 1.0058e-07, ..., 0.0000e+00, -2.2259e-07, 0.0000e+00], [-8.8476e-08, 0.0000e+00, 1.2107e-08, ..., 0.0000e+00, 1.3970e-08, 0.0000e+00], ..., [-3.6787e-07, 0.0000e+00, 6.8918e-08, ..., 0.0000e+00, 1.4622e-07, 0.0000e+00], [ 4.6566e-09, 0.0000e+00, 4.2934e-07, ..., 0.0000e+00, 1.3970e-07, 0.0000e+00], [ 8.4750e-08, 0.0000e+00, 5.0571e-07, ..., 0.0000e+00, 5.5879e-09, 0.0000e+00]], device='cuda:0') Epoch 211, bias, value: tensor([-0.0068, 0.0238, -0.0027, -0.0051, 0.0149, 0.0055, 0.0249, -0.0118, 0.0060, -0.0055], device='cuda:0'), grad: tensor([ 1.2852e-07, 4.2189e-07, -2.7940e-08, 9.0338e-07, -1.6801e-06, -7.8510e-07, -7.4785e-07, -3.5949e-07, 3.5483e-07, 1.7742e-06], device='cuda:0') 100 1e-05 changing lr epoch 210, time 217.42, cls_loss 0.0013 cls_loss_mapping 0.0014 cls_loss_causal 0.4885 re_mapping 0.0050 re_causal 0.0169 /// teacc 99.05 lr 0.00001000 Epoch 212, weight, value: tensor([[-0.0559, -0.1081, -0.0600, ..., -0.1157, -0.0205, -0.0325], [-0.0502, 0.0492, -0.1615, ..., 0.0624, 0.0713, -0.1371], [ 0.0565, -0.0076, -0.0634, ..., -0.0597, -0.0444, 0.0089], ..., [ 0.0571, -0.0597, -0.0170, ..., 0.0179, -0.0414, 0.0180], [-0.0667, -0.0265, -0.0823, ..., -0.0291, -0.0353, -0.0709], [-0.1435, -0.2042, 0.0283, ..., -0.0659, 0.0402, 0.0480]], device='cuda:0'), grad: tensor([[ 1.9092e-07, 0.0000e+00, 2.9150e-07, ..., 0.0000e+00, 1.5646e-07, 5.5879e-09], [ 8.0280e-07, 0.0000e+00, 6.0908e-07, ..., 2.7940e-08, -8.4098e-07, 0.0000e+00], [-1.1511e-06, 0.0000e+00, -1.4622e-07, ..., 9.3132e-10, 1.0803e-07, 0.0000e+00], ..., [ 1.0980e-06, 0.0000e+00, 1.1213e-06, ..., 4.6566e-08, 3.0268e-07, 3.7253e-09], [ 6.6124e-08, 0.0000e+00, 2.2352e-08, ..., 0.0000e+00, 1.0123e-06, 0.0000e+00], [ 4.7591e-07, 0.0000e+00, 2.4363e-06, ..., 2.4214e-08, -1.5665e-06, -2.9802e-08]], device='cuda:0') Epoch 212, bias, value: tensor([-0.0067, 0.0239, -0.0027, -0.0051, 0.0149, 0.0055, 0.0249, -0.0118, 0.0060, -0.0055], device='cuda:0'), grad: tensor([ 1.0114e-06, 7.6648e-07, -1.4771e-06, 8.3260e-07, -3.3956e-06, -4.9137e-06, 5.3737e-07, 3.7849e-06, 3.8408e-06, -9.9372e-07], device='cuda:0') 100 1e-05 changing lr epoch 211, time 217.28, cls_loss 0.0011 cls_loss_mapping 0.0009 cls_loss_causal 0.4808 re_mapping 0.0049 re_causal 0.0169 /// teacc 99.03 lr 0.00001000 Epoch 213, weight, value: tensor([[-0.0559, -0.1081, -0.0600, ..., -0.1158, -0.0205, -0.0325], [-0.0502, 0.0493, -0.1616, ..., 0.0624, 0.0713, -0.1371], [ 0.0565, -0.0076, -0.0634, ..., -0.0597, -0.0444, 0.0089], ..., [ 0.0571, -0.0598, -0.0171, ..., 0.0179, -0.0414, 0.0180], [-0.0667, -0.0265, -0.0823, ..., -0.0292, -0.0354, -0.0709], [-0.1435, -0.2042, 0.0283, ..., -0.0659, 0.0402, 0.0480]], device='cuda:0'), grad: tensor([[ 1.7695e-08, 0.0000e+00, 5.5879e-09, ..., 0.0000e+00, 6.5193e-08, 0.0000e+00], [ 4.4517e-07, -9.3132e-10, 5.3085e-08, ..., 0.0000e+00, -5.6997e-07, 0.0000e+00], [ 1.8161e-07, 0.0000e+00, 3.5390e-08, ..., 0.0000e+00, 8.5682e-08, 0.0000e+00], ..., [-1.8710e-06, 9.3132e-10, -5.5879e-09, ..., 0.0000e+00, 3.1292e-07, 0.0000e+00], [-1.2387e-07, 0.0000e+00, 1.4901e-08, ..., 0.0000e+00, -1.5646e-07, 0.0000e+00], [ 2.3283e-07, 0.0000e+00, -4.0606e-07, ..., 0.0000e+00, -2.2538e-07, 0.0000e+00]], device='cuda:0') Epoch 213, bias, value: tensor([-0.0067, 0.0239, -0.0027, -0.0051, 0.0149, 0.0055, 0.0249, -0.0119, 0.0060, -0.0055], device='cuda:0'), grad: tensor([ 2.2911e-07, -5.0385e-07, 3.4571e-06, 1.9055e-06, 5.1036e-06, 3.6415e-07, 1.2387e-07, -8.7395e-06, -9.7603e-07, -9.7137e-07], device='cuda:0') 100 1e-05 changing lr epoch 212, time 217.45, cls_loss 0.0011 cls_loss_mapping 0.0010 cls_loss_causal 0.5021 re_mapping 0.0049 re_causal 0.0171 /// teacc 99.06 lr 0.00001000 Epoch 214, weight, value: tensor([[-0.0559, -0.1081, -0.0601, ..., -0.1159, -0.0205, -0.0325], [-0.0502, 0.0493, -0.1616, ..., 0.0626, 0.0714, -0.1371], [ 0.0565, -0.0076, -0.0634, ..., -0.0598, -0.0444, 0.0089], ..., [ 0.0571, -0.0598, -0.0171, ..., 0.0177, -0.0415, 0.0180], [-0.0667, -0.0266, -0.0823, ..., -0.0293, -0.0354, -0.0709], [-0.1435, -0.2042, 0.0283, ..., -0.0660, 0.0402, 0.0480]], device='cuda:0'), grad: tensor([[ 2.2352e-08, 0.0000e+00, 2.7008e-08, ..., 9.3132e-09, 3.6322e-08, 0.0000e+00], [ 6.1374e-07, 0.0000e+00, 1.4622e-07, ..., -4.5076e-06, -7.3947e-06, 0.0000e+00], [-3.1106e-07, 0.0000e+00, 5.2154e-08, ..., 3.7253e-09, 2.5146e-08, 0.0000e+00], ..., [-6.5286e-07, 0.0000e+00, 7.6275e-07, ..., 2.5872e-06, 4.2431e-06, 0.0000e+00], [ 4.5635e-08, 0.0000e+00, 3.5390e-08, ..., 1.3039e-08, 5.6811e-08, 0.0000e+00], [ 8.7544e-08, 0.0000e+00, 1.4110e-06, ..., 3.1013e-07, 4.1258e-07, 0.0000e+00]], device='cuda:0') Epoch 214, bias, value: tensor([-0.0067, 0.0239, -0.0026, -0.0051, 0.0149, 0.0055, 0.0249, -0.0119, 0.0060, -0.0055], device='cuda:0'), grad: tensor([-2.6915e-07, -1.7598e-05, -5.1875e-07, 3.6042e-07, -1.0915e-06, -1.1921e-07, 2.6356e-07, 1.2495e-05, 3.7346e-07, 6.0759e-06], device='cuda:0') 100 1e-05 changing lr epoch 213, time 217.46, cls_loss 0.0011 cls_loss_mapping 0.0012 cls_loss_causal 0.4840 re_mapping 0.0048 re_causal 0.0169 /// teacc 99.05 lr 0.00001000 Epoch 215, weight, value: tensor([[-0.0559, -0.1082, -0.0601, ..., -0.1160, -0.0205, -0.0325], [-0.0502, 0.0493, -0.1616, ..., 0.0627, 0.0714, -0.1371], [ 0.0565, -0.0076, -0.0634, ..., -0.0598, -0.0445, 0.0088], ..., [ 0.0571, -0.0598, -0.0171, ..., 0.0176, -0.0415, 0.0180], [-0.0668, -0.0267, -0.0823, ..., -0.0294, -0.0354, -0.0709], [-0.1435, -0.2042, 0.0283, ..., -0.0658, 0.0403, 0.0480]], device='cuda:0'), grad: tensor([[ 1.0245e-08, 0.0000e+00, 2.3935e-07, ..., 0.0000e+00, 6.5193e-09, 0.0000e+00], [ 1.9092e-07, 0.0000e+00, 1.6950e-07, ..., 9.3132e-10, -2.6729e-07, 0.0000e+00], [-5.6811e-08, 0.0000e+00, 4.4703e-08, ..., 0.0000e+00, 2.3283e-08, 0.0000e+00], ..., [-2.8871e-07, 0.0000e+00, 3.1013e-07, ..., 9.3132e-10, 1.8999e-07, 0.0000e+00], [ 8.9407e-08, 0.0000e+00, 1.7202e-06, ..., 0.0000e+00, 2.7940e-08, 0.0000e+00], [ 1.1176e-07, 0.0000e+00, 8.6520e-07, ..., 1.2107e-08, 7.4506e-09, 0.0000e+00]], device='cuda:0') Epoch 215, bias, value: tensor([-0.0067, 0.0239, -0.0026, -0.0051, 0.0149, 0.0055, 0.0248, -0.0119, 0.0060, -0.0055], device='cuda:0'), grad: tensor([ 6.1560e-07, 2.0023e-07, 8.5682e-08, 3.1114e-05, -1.6857e-06, -4.1664e-05, 3.9414e-06, 5.8394e-07, 4.5672e-06, 2.3115e-06], device='cuda:0') 100 1e-05 changing lr epoch 214, time 217.44, cls_loss 0.0013 cls_loss_mapping 0.0011 cls_loss_causal 0.4966 re_mapping 0.0049 re_causal 0.0167 /// teacc 99.03 lr 0.00001000 Epoch 216, weight, value: tensor([[-0.0559, -0.1082, -0.0601, ..., -0.1161, -0.0206, -0.0325], [-0.0502, 0.0493, -0.1616, ..., 0.0627, 0.0715, -0.1372], [ 0.0566, -0.0076, -0.0634, ..., -0.0598, -0.0445, 0.0088], ..., [ 0.0571, -0.0598, -0.0172, ..., 0.0176, -0.0416, 0.0180], [-0.0668, -0.0267, -0.0823, ..., -0.0295, -0.0355, -0.0709], [-0.1435, -0.2042, 0.0282, ..., -0.0659, 0.0403, 0.0480]], device='cuda:0'), grad: tensor([[ 3.5390e-08, 0.0000e+00, -2.5146e-08, ..., 0.0000e+00, 1.0431e-07, 0.0000e+00], [-1.7695e-07, 0.0000e+00, 6.7055e-08, ..., 1.8626e-09, -2.3693e-06, 0.0000e+00], [-8.1025e-08, 0.0000e+00, 1.8626e-08, ..., 0.0000e+00, 9.8627e-07, 0.0000e+00], ..., [ 4.2375e-07, 0.0000e+00, 2.5798e-07, ..., 9.3132e-10, 1.1530e-06, 0.0000e+00], [ 4.1910e-08, 0.0000e+00, 4.6566e-09, ..., 0.0000e+00, 4.5728e-07, 0.0000e+00], [ 4.0978e-08, 0.0000e+00, 1.0338e-07, ..., 4.6566e-09, -1.3132e-07, 0.0000e+00]], device='cuda:0') Epoch 216, bias, value: tensor([-0.0067, 0.0239, -0.0026, -0.0051, 0.0149, 0.0055, 0.0249, -0.0119, 0.0060, -0.0055], device='cuda:0'), grad: tensor([-6.3516e-07, -5.3756e-06, 2.2668e-06, -4.6287e-07, 3.8091e-07, 1.2405e-06, -2.3022e-06, 3.4161e-06, 1.4408e-06, 6.6124e-08], device='cuda:0') 100 1e-05 changing lr epoch 215, time 217.22, cls_loss 0.0013 cls_loss_mapping 0.0011 cls_loss_causal 0.4995 re_mapping 0.0049 re_causal 0.0170 /// teacc 99.06 lr 0.00001000 Epoch 217, weight, value: tensor([[-0.0559, -0.1082, -0.0601, ..., -0.1162, -0.0206, -0.0325], [-0.0502, 0.0494, -0.1617, ..., 0.0628, 0.0715, -0.1372], [ 0.0566, -0.0076, -0.0634, ..., -0.0598, -0.0445, 0.0088], ..., [ 0.0571, -0.0598, -0.0173, ..., 0.0175, -0.0416, 0.0180], [-0.0668, -0.0267, -0.0823, ..., -0.0296, -0.0355, -0.0709], [-0.1436, -0.2042, 0.0281, ..., -0.0659, 0.0403, 0.0480]], device='cuda:0'), grad: tensor([[ 7.7300e-08, 0.0000e+00, 1.4901e-08, ..., 0.0000e+00, 3.2410e-07, 0.0000e+00], [ 5.3085e-08, 0.0000e+00, 2.4214e-08, ..., 0.0000e+00, -1.2927e-06, 0.0000e+00], [-7.5251e-07, 0.0000e+00, 2.7940e-08, ..., 0.0000e+00, 1.0338e-07, 0.0000e+00], ..., [ 4.2375e-07, 0.0000e+00, 3.1665e-08, ..., 0.0000e+00, 4.8615e-07, 0.0000e+00], [ 1.0245e-07, 9.3132e-10, -1.0608e-06, ..., 1.8626e-09, -7.7300e-08, 9.3132e-10], [ 1.3970e-08, 0.0000e+00, 1.1688e-06, ..., 9.3132e-10, 2.5239e-07, 0.0000e+00]], device='cuda:0') Epoch 217, bias, value: tensor([-0.0067, 0.0240, -0.0026, -0.0051, 0.0149, 0.0055, 0.0249, -0.0119, 0.0060, -0.0055], device='cuda:0'), grad: tensor([ 8.7358e-07, -2.3674e-06, -8.9686e-07, 1.2740e-06, 3.4831e-07, 1.4622e-07, 2.3004e-07, 1.9260e-06, -7.6666e-06, 6.1281e-06], device='cuda:0') 100 1e-05 changing lr epoch 216, time 217.54, cls_loss 0.0010 cls_loss_mapping 0.0011 cls_loss_causal 0.4542 re_mapping 0.0048 re_causal 0.0164 /// teacc 99.00 lr 0.00001000 Epoch 218, weight, value: tensor([[-0.0559, -0.1083, -0.0601, ..., -0.1163, -0.0206, -0.0325], [-0.0502, 0.0494, -0.1617, ..., 0.0628, 0.0715, -0.1372], [ 0.0567, -0.0076, -0.0634, ..., -0.0598, -0.0446, 0.0088], ..., [ 0.0571, -0.0598, -0.0173, ..., 0.0175, -0.0416, 0.0180], [-0.0668, -0.0267, -0.0823, ..., -0.0296, -0.0356, -0.0709], [-0.1436, -0.2043, 0.0281, ..., -0.0658, 0.0404, 0.0480]], device='cuda:0'), grad: tensor([[ 1.4678e-06, 0.0000e+00, 9.8720e-08, ..., 4.6566e-09, 2.3283e-08, 0.0000e+00], [ 9.6858e-08, 0.0000e+00, 4.3772e-08, ..., 7.4506e-09, -4.3120e-07, 0.0000e+00], [-2.9597e-06, 0.0000e+00, -1.2666e-07, ..., 9.3132e-10, 3.5390e-08, 0.0000e+00], ..., [ 5.0664e-07, 0.0000e+00, 2.9802e-08, ..., 3.7253e-09, 2.6077e-07, 0.0000e+00], [ 9.0711e-07, 0.0000e+00, 2.5984e-07, ..., 4.1910e-08, 5.1223e-08, 0.0000e+00], [ 1.7136e-07, 0.0000e+00, 3.9395e-07, ..., 1.2107e-08, 4.7497e-08, 0.0000e+00]], device='cuda:0') Epoch 218, bias, value: tensor([-0.0067, 0.0239, -0.0026, -0.0051, 0.0149, 0.0055, 0.0249, -0.0119, 0.0060, -0.0055], device='cuda:0'), grad: tensor([ 9.0003e-06, -4.7963e-07, -1.5974e-05, 7.8753e-06, -3.7253e-07, -8.4788e-06, 2.8033e-07, 1.5674e-06, 5.2229e-06, 1.3541e-06], device='cuda:0') 100 1e-05 changing lr epoch 217, time 217.58, cls_loss 0.0011 cls_loss_mapping 0.0009 cls_loss_causal 0.4670 re_mapping 0.0047 re_causal 0.0166 /// teacc 99.02 lr 0.00001000 Epoch 219, weight, value: tensor([[-0.0559, -0.1083, -0.0601, ..., -0.1163, -0.0206, -0.0325], [-0.0503, 0.0494, -0.1617, ..., 0.0628, 0.0715, -0.1372], [ 0.0567, -0.0076, -0.0633, ..., -0.0598, -0.0446, 0.0088], ..., [ 0.0571, -0.0598, -0.0174, ..., 0.0175, -0.0416, 0.0179], [-0.0669, -0.0267, -0.0823, ..., -0.0297, -0.0356, -0.0709], [-0.1437, -0.2043, 0.0280, ..., -0.0659, 0.0404, 0.0480]], device='cuda:0'), grad: tensor([[ 1.7788e-07, 0.0000e+00, 5.2620e-08, ..., 0.0000e+00, 4.4238e-08, 4.6566e-10], [ 4.7497e-07, 0.0000e+00, 4.1910e-09, ..., -2.3283e-09, -7.1479e-07, 0.0000e+00], [-6.0163e-07, 0.0000e+00, -1.8720e-07, ..., 0.0000e+00, 1.3690e-07, 0.0000e+00], ..., [-1.0598e-06, 0.0000e+00, 2.5611e-08, ..., 4.6566e-10, 6.7661e-07, 0.0000e+00], [ 4.9919e-07, 0.0000e+00, 4.5169e-08, ..., 9.3132e-10, 1.4110e-07, 9.3132e-10], [ 2.2305e-07, 0.0000e+00, -2.7893e-07, ..., 4.6566e-10, -1.4026e-06, 0.0000e+00]], device='cuda:0') Epoch 219, bias, value: tensor([-0.0067, 0.0239, -0.0026, -0.0050, 0.0149, 0.0055, 0.0248, -0.0119, 0.0060, -0.0055], device='cuda:0'), grad: tensor([-6.8732e-07, -2.9057e-07, -5.4715e-07, 5.2294e-07, 8.1211e-06, 2.3982e-07, 1.4668e-07, 1.5218e-06, 1.2591e-06, -1.0327e-05], device='cuda:0') 100 1e-05 changing lr epoch 218, time 217.43, cls_loss 0.0010 cls_loss_mapping 0.0007 cls_loss_causal 0.4783 re_mapping 0.0048 re_causal 0.0166 /// teacc 99.08 lr 0.00001000 Epoch 220, weight, value: tensor([[-0.0560, -0.1083, -0.0601, ..., -0.1164, -0.0206, -0.0325], [-0.0504, 0.0494, -0.1618, ..., 0.0628, 0.0715, -0.1372], [ 0.0568, -0.0076, -0.0633, ..., -0.0598, -0.0446, 0.0088], ..., [ 0.0572, -0.0598, -0.0174, ..., 0.0175, -0.0416, 0.0179], [-0.0669, -0.0267, -0.0823, ..., -0.0298, -0.0356, -0.0710], [-0.1437, -0.2043, 0.0280, ..., -0.0660, 0.0404, 0.0480]], device='cuda:0'), grad: tensor([[ 8.3819e-09, 0.0000e+00, 6.5193e-09, ..., 0.0000e+00, 1.6764e-08, 0.0000e+00], [ 5.2620e-08, 0.0000e+00, 3.1199e-08, ..., 0.0000e+00, -1.4389e-07, 0.0000e+00], [ 3.2131e-08, 0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 2.0955e-08, 0.0000e+00], ..., [-6.4727e-08, 0.0000e+00, 1.7229e-08, ..., 4.6566e-10, 1.2619e-07, 0.0000e+00], [ 2.1886e-08, 0.0000e+00, 3.2131e-08, ..., 4.6566e-10, 4.1444e-08, 0.0000e+00], [ 2.6543e-08, 0.0000e+00, 1.3495e-06, ..., 4.6566e-10, -1.6764e-07, 0.0000e+00]], device='cuda:0') Epoch 220, bias, value: tensor([-0.0067, 0.0239, -0.0025, -0.0050, 0.0149, 0.0055, 0.0249, -0.0119, 0.0060, -0.0056], device='cuda:0'), grad: tensor([-1.6419e-06, -5.6811e-08, 1.2480e-06, 1.0319e-06, -7.6834e-08, -3.3490e-06, -6.5193e-09, 2.6636e-07, 2.9337e-07, 2.3078e-06], device='cuda:0') 100 1e-05 changing lr epoch 219, time 217.35, cls_loss 0.0012 cls_loss_mapping 0.0009 cls_loss_causal 0.4607 re_mapping 0.0048 re_causal 0.0166 /// teacc 99.08 lr 0.00001000 Epoch 221, weight, value: tensor([[-0.0560, -0.1083, -0.0601, ..., -0.1164, -0.0206, -0.0325], [-0.0505, 0.0495, -0.1618, ..., 0.0629, 0.0716, -0.1372], [ 0.0568, -0.0076, -0.0633, ..., -0.0598, -0.0447, 0.0088], ..., [ 0.0573, -0.0598, -0.0175, ..., 0.0174, -0.0416, 0.0179], [-0.0670, -0.0267, -0.0823, ..., -0.0299, -0.0357, -0.0710], [-0.1438, -0.2043, 0.0280, ..., -0.0660, 0.0404, 0.0480]], device='cuda:0'), grad: tensor([[ 2.9802e-08, 0.0000e+00, 2.7940e-09, ..., 3.2596e-09, 1.4901e-08, 3.2596e-09], [ 1.0049e-06, 0.0000e+00, 8.6147e-08, ..., 2.0443e-07, -2.0675e-07, 4.6566e-10], [ 9.4529e-08, 0.0000e+00, 7.9162e-09, ..., 7.4506e-09, 1.7695e-08, 9.3132e-10], ..., [-1.5199e-06, 0.0000e+00, 1.4901e-08, ..., -3.4412e-07, 1.7369e-07, 0.0000e+00], [ 1.4063e-07, 0.0000e+00, -3.4459e-08, ..., 1.8626e-09, 2.4121e-07, 2.3283e-09], [ 5.0943e-07, 0.0000e+00, -3.6694e-07, ..., 1.0151e-07, -5.3551e-07, 0.0000e+00]], device='cuda:0') Epoch 221, bias, value: tensor([-0.0067, 0.0238, -0.0026, -0.0050, 0.0149, 0.0055, 0.0249, -0.0118, 0.0060, -0.0055], device='cuda:0'), grad: tensor([-1.2210e-06, 2.8107e-06, 3.2131e-07, -4.5495e-07, 3.1479e-06, 8.6799e-07, -6.4448e-07, -4.1798e-06, 5.5414e-08, -7.0315e-07], device='cuda:0') 100 1e-05 changing lr epoch 220, time 217.49, cls_loss 0.0009 cls_loss_mapping 0.0009 cls_loss_causal 0.4765 re_mapping 0.0048 re_causal 0.0169 /// teacc 99.09 lr 0.00001000 Epoch 222, weight, value: tensor([[-0.0560, -0.1084, -0.0601, ..., -0.1165, -0.0206, -0.0325], [-0.0505, 0.0495, -0.1618, ..., 0.0629, 0.0716, -0.1372], [ 0.0568, -0.0076, -0.0633, ..., -0.0598, -0.0447, 0.0088], ..., [ 0.0573, -0.0598, -0.0175, ..., 0.0174, -0.0417, 0.0179], [-0.0671, -0.0267, -0.0823, ..., -0.0300, -0.0357, -0.0710], [-0.1438, -0.2043, 0.0280, ..., -0.0659, 0.0404, 0.0480]], device='cuda:0'), grad: tensor([[ 7.9162e-09, 0.0000e+00, 3.2596e-09, ..., 0.0000e+00, 9.7789e-09, 0.0000e+00], [ 2.6124e-07, 0.0000e+00, 2.7940e-09, ..., 0.0000e+00, -7.3574e-08, 0.0000e+00], [ 4.2375e-08, 0.0000e+00, 4.6566e-10, ..., 0.0000e+00, 2.3749e-08, 0.0000e+00], ..., [-1.9651e-07, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 1.0990e-07, 0.0000e+00], [ 2.3283e-08, 0.0000e+00, 6.9384e-08, ..., 0.0000e+00, -1.8021e-07, 0.0000e+00], [ 8.6613e-08, 0.0000e+00, 1.6298e-08, ..., 0.0000e+00, -4.9360e-08, 0.0000e+00]], device='cuda:0') Epoch 222, bias, value: tensor([-0.0066, 0.0238, -0.0025, -0.0050, 0.0149, 0.0055, 0.0249, -0.0118, 0.0060, -0.0056], device='cuda:0'), grad: tensor([-2.9802e-08, 9.0152e-07, 2.3050e-07, -9.6392e-08, 3.3760e-07, 1.4082e-06, -9.2573e-07, -3.0873e-07, -1.6680e-06, 1.7788e-07], device='cuda:0') 100 1e-05 changing lr epoch 221, time 217.43, cls_loss 0.0009 cls_loss_mapping 0.0009 cls_loss_causal 0.4679 re_mapping 0.0048 re_causal 0.0166 /// teacc 99.06 lr 0.00001000 Epoch 223, weight, value: tensor([[-0.0560, -0.1084, -0.0601, ..., -0.1165, -0.0206, -0.0325], [-0.0505, 0.0495, -0.1618, ..., 0.0629, 0.0717, -0.1372], [ 0.0569, -0.0076, -0.0634, ..., -0.0598, -0.0448, 0.0088], ..., [ 0.0572, -0.0599, -0.0175, ..., 0.0174, -0.0417, 0.0179], [-0.0672, -0.0268, -0.0823, ..., -0.0300, -0.0358, -0.0710], [-0.1439, -0.2044, 0.0280, ..., -0.0659, 0.0404, 0.0480]], device='cuda:0'), grad: tensor([[ 7.8697e-08, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, -2.0489e-08, 0.0000e+00], [ 2.1141e-07, 4.6566e-10, 3.9581e-08, ..., 0.0000e+00, -2.4308e-07, 0.0000e+00], [ 1.1809e-06, 4.6566e-10, 4.6566e-09, ..., 0.0000e+00, 5.1223e-08, 0.0000e+00], ..., [ 1.0272e-06, -3.7253e-09, 2.8405e-08, ..., 0.0000e+00, 1.8161e-07, 0.0000e+00], [ 8.6613e-08, 0.0000e+00, 1.3970e-09, ..., 0.0000e+00, 3.0734e-08, 0.0000e+00], [ 2.5798e-07, 0.0000e+00, 1.3504e-07, ..., 0.0000e+00, -1.3178e-07, 0.0000e+00]], device='cuda:0') Epoch 223, bias, value: tensor([-0.0066, 0.0238, -0.0025, -0.0050, 0.0149, 0.0055, 0.0249, -0.0118, 0.0059, -0.0056], device='cuda:0'), grad: tensor([-2.0564e-06, 1.2107e-08, 2.0638e-06, -3.6675e-06, -4.1304e-07, 6.1328e-07, 4.9965e-07, 1.7984e-06, 2.7707e-07, 8.4564e-07], device='cuda:0') 100 1e-05 changing lr epoch 222, time 217.28, cls_loss 0.0011 cls_loss_mapping 0.0008 cls_loss_causal 0.4832 re_mapping 0.0048 re_causal 0.0167 /// teacc 99.09 lr 0.00001000 Epoch 224, weight, value: tensor([[-0.0560, -0.1084, -0.0601, ..., -0.1167, -0.0206, -0.0325], [-0.0505, 0.0496, -0.1619, ..., 0.0629, 0.0717, -0.1372], [ 0.0569, -0.0076, -0.0633, ..., -0.0599, -0.0448, 0.0088], ..., [ 0.0573, -0.0599, -0.0175, ..., 0.0174, -0.0417, 0.0179], [-0.0673, -0.0268, -0.0823, ..., -0.0301, -0.0358, -0.0710], [-0.1439, -0.2044, 0.0279, ..., -0.0658, 0.0404, 0.0480]], device='cuda:0'), grad: tensor([[ 2.8871e-08, 0.0000e+00, 2.3283e-09, ..., 0.0000e+00, 3.8650e-08, 4.6566e-10], [ 5.2154e-08, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, -1.2573e-07, 0.0000e+00], [-4.5029e-07, 0.0000e+00, 3.2596e-09, ..., 0.0000e+00, 2.1886e-08, 0.0000e+00], ..., [-5.6811e-08, 0.0000e+00, 1.7229e-08, ..., 0.0000e+00, 9.7323e-08, 0.0000e+00], [ 3.8091e-07, 0.0000e+00, -4.6566e-09, ..., 0.0000e+00, 2.7474e-08, 5.5879e-09], [ 5.4948e-08, 0.0000e+00, -7.4506e-09, ..., 0.0000e+00, -7.8697e-08, 0.0000e+00]], device='cuda:0') Epoch 224, bias, value: tensor([-0.0066, 0.0238, -0.0025, -0.0051, 0.0149, 0.0055, 0.0249, -0.0118, 0.0059, -0.0056], device='cuda:0'), grad: tensor([-3.6601e-07, 4.7963e-08, -3.9814e-07, 2.4494e-07, 2.1666e-05, 2.1085e-06, -2.4587e-05, 4.5914e-07, 1.1511e-06, -3.2643e-07], device='cuda:0') 100 1e-05 changing lr epoch 223, time 217.27, cls_loss 0.0010 cls_loss_mapping 0.0008 cls_loss_causal 0.4846 re_mapping 0.0047 re_causal 0.0167 /// teacc 99.09 lr 0.00001000 Epoch 225, weight, value: tensor([[-0.0560, -0.1085, -0.0601, ..., -0.1168, -0.0206, -0.0325], [-0.0505, 0.0497, -0.1619, ..., 0.0629, 0.0718, -0.1372], [ 0.0569, -0.0076, -0.0633, ..., -0.0599, -0.0448, 0.0088], ..., [ 0.0573, -0.0600, -0.0175, ..., 0.0174, -0.0418, 0.0179], [-0.0674, -0.0268, -0.0823, ..., -0.0302, -0.0359, -0.0710], [-0.1440, -0.2044, 0.0279, ..., -0.0659, 0.0404, 0.0480]], device='cuda:0'), grad: tensor([[-1.1967e-07, 0.0000e+00, 1.3970e-08, ..., 1.8626e-09, 2.7986e-07, 0.0000e+00], [ 1.5181e-07, 0.0000e+00, 3.9814e-07, ..., 1.0710e-08, -8.0392e-06, 0.0000e+00], [-1.5661e-05, 0.0000e+00, 3.9581e-08, ..., 3.7253e-09, 1.5087e-07, 0.0000e+00], ..., [ 1.5557e-05, 0.0000e+00, 1.5693e-07, ..., 3.4459e-08, 1.1967e-06, 0.0000e+00], [ 2.4354e-07, 0.0000e+00, 3.7719e-08, ..., 2.3283e-09, 4.0270e-06, 0.0000e+00], [ 2.9383e-07, 0.0000e+00, 5.9605e-07, ..., 9.7789e-09, 2.2724e-07, 0.0000e+00]], device='cuda:0') Epoch 225, bias, value: tensor([-0.0066, 0.0239, -0.0025, -0.0051, 0.0149, 0.0055, 0.0249, -0.0118, 0.0059, -0.0056], device='cuda:0'), grad: tensor([-8.4564e-06, -3.4094e-05, -1.8910e-05, -6.5891e-07, 7.0930e-06, 8.5216e-07, 9.5554e-07, 2.4691e-05, 1.8150e-05, 1.0386e-05], device='cuda:0') 100 1e-05 changing lr epoch 224, time 217.37, cls_loss 0.0009 cls_loss_mapping 0.0008 cls_loss_causal 0.4598 re_mapping 0.0046 re_causal 0.0162 /// teacc 99.07 lr 0.00001000 Epoch 226, weight, value: tensor([[-0.0560, -0.1087, -0.0601, ..., -0.1169, -0.0206, -0.0325], [-0.0505, 0.0498, -0.1619, ..., 0.0629, 0.0718, -0.1372], [ 0.0570, -0.0076, -0.0633, ..., -0.0599, -0.0448, 0.0088], ..., [ 0.0573, -0.0600, -0.0176, ..., 0.0174, -0.0418, 0.0179], [-0.0675, -0.0269, -0.0824, ..., -0.0303, -0.0360, -0.0710], [-0.1440, -0.2045, 0.0278, ..., -0.0659, 0.0405, 0.0480]], device='cuda:0'), grad: tensor([[ 1.3970e-07, 7.9162e-09, 8.1025e-08, ..., 0.0000e+00, -1.1036e-07, 0.0000e+00], [ 1.7118e-06, 3.4319e-07, 3.0268e-08, ..., 0.0000e+00, -2.6543e-08, 0.0000e+00], [-3.2084e-07, 9.8441e-07, 8.7079e-08, ..., 0.0000e+00, 1.5832e-08, 0.0000e+00], ..., [ 1.5059e-06, 2.4540e-07, 2.5099e-07, ..., 1.8626e-09, 1.0058e-07, 0.0000e+00], [-9.4846e-06, -4.2617e-06, -1.8366e-06, ..., 0.0000e+00, -3.2596e-09, 4.6566e-10], [ 1.4128e-06, 2.2352e-08, 9.1875e-07, ..., 0.0000e+00, -1.8021e-07, 0.0000e+00]], device='cuda:0') Epoch 226, bias, value: tensor([-0.0066, 0.0239, -0.0025, -0.0051, 0.0148, 0.0055, 0.0249, -0.0118, 0.0058, -0.0056], device='cuda:0'), grad: tensor([-1.4342e-06, 5.1707e-06, 7.7859e-06, 5.4725e-06, 1.0990e-06, 9.1642e-06, 1.3642e-05, 4.9323e-06, -5.1469e-05, 5.7034e-06], device='cuda:0') 100 1e-05 changing lr epoch 225, time 217.26, cls_loss 0.0010 cls_loss_mapping 0.0008 cls_loss_causal 0.4718 re_mapping 0.0046 re_causal 0.0162 /// teacc 99.06 lr 0.00001000 Epoch 227, weight, value: tensor([[-0.0560, -0.1087, -0.0601, ..., -0.1169, -0.0207, -0.0325], [-0.0505, 0.0499, -0.1620, ..., 0.0629, 0.0718, -0.1373], [ 0.0570, -0.0077, -0.0633, ..., -0.0599, -0.0448, 0.0088], ..., [ 0.0573, -0.0600, -0.0176, ..., 0.0174, -0.0418, 0.0178], [-0.0676, -0.0269, -0.0824, ..., -0.0303, -0.0360, -0.0710], [-0.1441, -0.2045, 0.0278, ..., -0.0659, 0.0405, 0.0480]], device='cuda:0'), grad: tensor([[ 1.0887e-06, 0.0000e+00, -4.2282e-07, ..., 4.6566e-10, 1.7229e-08, 0.0000e+00], [ 8.6501e-06, 4.6566e-10, 5.1223e-08, ..., 8.3819e-09, -1.8300e-07, 0.0000e+00], [ 1.9930e-07, -3.0268e-08, 8.7079e-08, ..., 4.6566e-10, 2.9337e-08, 0.0000e+00], ..., [-1.4834e-05, 2.7940e-08, 4.0978e-08, ..., 1.8626e-09, 1.8068e-07, 0.0000e+00], [ 8.0559e-08, 4.6566e-10, -3.1665e-08, ..., 0.0000e+00, 4.4238e-08, 0.0000e+00], [ 7.6182e-07, 0.0000e+00, 2.2445e-07, ..., 5.1223e-09, -2.0023e-07, 0.0000e+00]], device='cuda:0') Epoch 227, bias, value: tensor([-0.0066, 0.0239, -0.0024, -0.0050, 0.0148, 0.0055, 0.0250, -0.0118, 0.0058, -0.0056], device='cuda:0'), grad: tensor([ 5.4482e-08, 1.9237e-05, 1.0636e-06, 3.3602e-06, 4.5709e-06, 2.9020e-06, -2.7474e-08, -3.2455e-05, -4.2375e-08, 1.3420e-06], device='cuda:0') 100 1e-05 changing lr epoch 226, time 217.33, cls_loss 0.0010 cls_loss_mapping 0.0008 cls_loss_causal 0.4768 re_mapping 0.0046 re_causal 0.0162 /// teacc 99.06 lr 0.00001000 Epoch 228, weight, value: tensor([[-0.0560, -0.1089, -0.0601, ..., -0.1170, -0.0207, -0.0325], [-0.0506, 0.0501, -0.1620, ..., 0.0629, 0.0718, -0.1373], [ 0.0571, -0.0077, -0.0633, ..., -0.0599, -0.0449, 0.0088], ..., [ 0.0573, -0.0602, -0.0177, ..., 0.0174, -0.0418, 0.0178], [-0.0677, -0.0269, -0.0824, ..., -0.0304, -0.0360, -0.0710], [-0.1441, -0.2046, 0.0278, ..., -0.0660, 0.0406, 0.0481]], device='cuda:0'), grad: tensor([[-1.5069e-06, 0.0000e+00, 1.3970e-09, ..., 9.3132e-10, 4.1444e-08, 0.0000e+00], [ 2.5202e-06, 0.0000e+00, 1.4435e-08, ..., 1.2424e-06, -2.6915e-07, 0.0000e+00], [-7.4320e-07, 0.0000e+00, -6.9849e-09, ..., 2.3283e-09, 2.0489e-08, 0.0000e+00], ..., [-3.3397e-06, 0.0000e+00, -1.0617e-07, ..., -1.3513e-06, 2.9290e-07, 0.0000e+00], [ 4.2468e-07, 0.0000e+00, 1.5832e-08, ..., 0.0000e+00, 4.4703e-08, 1.3970e-09], [ 1.7658e-06, 0.0000e+00, 6.2399e-08, ..., 8.1956e-08, -2.5518e-07, 0.0000e+00]], device='cuda:0') Epoch 228, bias, value: tensor([-0.0065, 0.0238, -0.0024, -0.0051, 0.0148, 0.0055, 0.0249, -0.0118, 0.0058, -0.0056], device='cuda:0'), grad: tensor([-7.0482e-06, 5.2638e-06, -5.5600e-07, 6.2585e-07, 1.0468e-06, 4.6752e-07, 1.5907e-06, -7.0408e-06, 1.1679e-06, 4.4964e-06], device='cuda:0') 100 1e-05 changing lr epoch 227, time 217.31, cls_loss 0.0011 cls_loss_mapping 0.0008 cls_loss_causal 0.4716 re_mapping 0.0045 re_causal 0.0158 /// teacc 99.06 lr 0.00001000 Epoch 229, weight, value: tensor([[-0.0560, -0.1089, -0.0602, ..., -0.1171, -0.0207, -0.0325], [-0.0506, 0.0501, -0.1620, ..., 0.0629, 0.0719, -0.1373], [ 0.0571, -0.0077, -0.0632, ..., -0.0600, -0.0449, 0.0088], ..., [ 0.0574, -0.0602, -0.0178, ..., 0.0174, -0.0418, 0.0178], [-0.0678, -0.0269, -0.0824, ..., -0.0305, -0.0360, -0.0710], [-0.1442, -0.2046, 0.0277, ..., -0.0660, 0.0406, 0.0481]], device='cuda:0'), grad: tensor([[ 4.0978e-08, 0.0000e+00, -9.5740e-06, ..., 4.6566e-10, 5.0291e-08, 0.0000e+00], [ 4.0382e-06, 0.0000e+00, 7.0781e-08, ..., 0.0000e+00, -1.7462e-07, 0.0000e+00], [ 1.1083e-07, 0.0000e+00, 1.4435e-08, ..., 0.0000e+00, 2.7008e-08, 0.0000e+00], ..., [-1.2569e-05, 0.0000e+00, 1.5600e-07, ..., 4.6566e-10, 1.7276e-07, 0.0000e+00], [ 1.5134e-07, 0.0000e+00, -1.1735e-07, ..., 1.3970e-09, -7.8231e-08, 0.0000e+00], [ 5.5693e-06, 0.0000e+00, 1.9604e-07, ..., 1.3970e-09, -6.3051e-07, 0.0000e+00]], device='cuda:0') Epoch 229, bias, value: tensor([-0.0065, 0.0238, -0.0024, -0.0051, 0.0148, 0.0055, 0.0249, -0.0118, 0.0058, -0.0056], device='cuda:0'), grad: tensor([-7.5281e-05, 1.2934e-05, 9.1121e-06, 9.0823e-06, 3.4235e-06, 1.7181e-05, 4.0114e-05, -4.0084e-05, 1.9614e-06, 2.1502e-05], device='cuda:0') 100 1e-05 changing lr epoch 228, time 217.25, cls_loss 0.0011 cls_loss_mapping 0.0010 cls_loss_causal 0.4693 re_mapping 0.0045 re_causal 0.0162 /// teacc 99.07 lr 0.00001000 Epoch 230, weight, value: tensor([[-0.0560, -0.1090, -0.0601, ..., -0.1172, -0.0207, -0.0325], [-0.0507, 0.0501, -0.1621, ..., 0.0629, 0.0719, -0.1373], [ 0.0573, -0.0076, -0.0632, ..., -0.0600, -0.0449, 0.0088], ..., [ 0.0574, -0.0602, -0.0178, ..., 0.0174, -0.0418, 0.0178], [-0.0680, -0.0269, -0.0824, ..., -0.0306, -0.0361, -0.0710], [-0.1442, -0.2046, 0.0277, ..., -0.0660, 0.0406, 0.0481]], device='cuda:0'), grad: tensor([[ 3.3528e-08, 0.0000e+00, 4.6566e-09, ..., 0.0000e+00, 3.9581e-08, 0.0000e+00], [ 7.5437e-08, 8.3819e-09, 4.2375e-08, ..., 4.6566e-10, -4.6706e-07, 0.0000e+00], [-6.9477e-07, -1.3970e-08, 8.3819e-09, ..., 0.0000e+00, 3.7253e-08, 0.0000e+00], ..., [ 4.6426e-07, 5.1223e-09, 3.2596e-08, ..., 0.0000e+00, 2.7241e-07, 0.0000e+00], [ 7.3574e-08, 0.0000e+00, 4.1910e-09, ..., 0.0000e+00, 8.9873e-08, 0.0000e+00], [ 9.7789e-09, 4.6566e-10, 1.2629e-06, ..., 4.6566e-10, -1.9791e-07, 0.0000e+00]], device='cuda:0') Epoch 230, bias, value: tensor([-0.0065, 0.0238, -0.0023, -0.0051, 0.0148, 0.0055, 0.0249, -0.0118, 0.0057, -0.0056], device='cuda:0'), grad: tensor([-2.1653e-07, -7.5204e-07, -7.1526e-07, 1.3178e-07, -2.4363e-06, 1.5181e-07, -3.0687e-07, 1.3579e-06, 2.9337e-07, 2.4978e-06], device='cuda:0') 100 1e-05 changing lr epoch 229, time 217.25, cls_loss 0.0008 cls_loss_mapping 0.0007 cls_loss_causal 0.4313 re_mapping 0.0045 re_causal 0.0159 /// teacc 99.05 lr 0.00001000 Epoch 231, weight, value: tensor([[-0.0560, -0.1090, -0.0601, ..., -0.1173, -0.0207, -0.0325], [-0.0507, 0.0501, -0.1621, ..., 0.0629, 0.0719, -0.1374], [ 0.0574, -0.0076, -0.0631, ..., -0.0600, -0.0449, 0.0088], ..., [ 0.0574, -0.0602, -0.0180, ..., 0.0174, -0.0419, 0.0177], [-0.0681, -0.0269, -0.0825, ..., -0.0306, -0.0362, -0.0710], [-0.1443, -0.2047, 0.0276, ..., -0.0660, 0.0407, 0.0481]], device='cuda:0'), grad: tensor([[ 7.7020e-07, 0.0000e+00, 7.1852e-07, ..., 9.3132e-10, 3.9022e-07, 0.0000e+00], [ 2.0284e-06, 0.0000e+00, 2.3330e-07, ..., 9.3132e-10, -5.2787e-06, 0.0000e+00], [-4.7907e-06, 0.0000e+00, 2.8610e-06, ..., 0.0000e+00, 5.4017e-07, 0.0000e+00], ..., [ 1.9036e-06, 0.0000e+00, 1.4855e-07, ..., 1.4435e-08, -5.1735e-07, 0.0000e+00], [-2.0768e-06, 0.0000e+00, -5.4352e-06, ..., 2.3283e-09, 1.2089e-06, 0.0000e+00], [ 2.7716e-06, 0.0000e+00, 7.6648e-07, ..., -1.5832e-08, 8.6613e-07, 0.0000e+00]], device='cuda:0') Epoch 231, bias, value: tensor([-0.0065, 0.0237, -0.0022, -0.0051, 0.0148, 0.0055, 0.0248, -0.0118, 0.0056, -0.0057], device='cuda:0'), grad: tensor([ 6.5342e-06, -1.7121e-05, 5.6028e-06, -6.1607e-07, 1.5572e-05, 1.8459e-06, -5.3868e-06, 2.3749e-07, -1.7524e-05, 1.0848e-05], device='cuda:0') 100 1e-05 changing lr epoch 230, time 217.45, cls_loss 0.0009 cls_loss_mapping 0.0008 cls_loss_causal 0.4844 re_mapping 0.0045 re_causal 0.0164 /// teacc 99.04 lr 0.00001000 Epoch 232, weight, value: tensor([[-0.0560, -0.1090, -0.0601, ..., -0.1173, -0.0207, -0.0325], [-0.0507, 0.0501, -0.1622, ..., 0.0629, 0.0720, -0.1374], [ 0.0574, -0.0076, -0.0632, ..., -0.0600, -0.0449, 0.0088], ..., [ 0.0574, -0.0602, -0.0181, ..., 0.0174, -0.0419, 0.0176], [-0.0681, -0.0270, -0.0825, ..., -0.0307, -0.0363, -0.0710], [-0.1443, -0.2047, 0.0276, ..., -0.0660, 0.0407, 0.0481]], device='cuda:0'), grad: tensor([[ 1.2573e-08, 1.8626e-09, 1.2107e-08, ..., 0.0000e+00, 3.9581e-08, 0.0000e+00], [-6.3656e-07, -1.3709e-06, 2.4214e-08, ..., 0.0000e+00, -2.9728e-06, 0.0000e+00], [ 1.9465e-07, 3.1060e-07, 9.3132e-10, ..., 0.0000e+00, 7.3761e-07, -4.6566e-10], ..., [ 3.9767e-07, 9.4064e-07, 6.9849e-09, ..., 0.0000e+00, 1.4007e-06, 4.6566e-10], [-6.9384e-08, 1.3970e-09, 2.7940e-08, ..., 0.0000e+00, 1.1036e-07, 0.0000e+00], [ 1.3504e-07, 9.3132e-10, 1.5311e-06, ..., 0.0000e+00, -6.1467e-08, 0.0000e+00]], device='cuda:0') Epoch 232, bias, value: tensor([-0.0064, 0.0238, -0.0022, -0.0050, 0.0148, 0.0055, 0.0248, -0.0118, 0.0056, -0.0057], device='cuda:0'), grad: tensor([-2.6077e-07, -1.1854e-05, 3.8091e-06, 5.7835e-07, 1.8813e-06, -2.9672e-06, 9.7416e-07, 6.3889e-06, -3.2075e-06, 4.6380e-06], device='cuda:0') 100 1e-05 changing lr epoch 231, time 217.33, cls_loss 0.0009 cls_loss_mapping 0.0008 cls_loss_causal 0.4745 re_mapping 0.0045 re_causal 0.0162 /// teacc 99.05 lr 0.00001000 Epoch 233, weight, value: tensor([[-0.0560, -0.1091, -0.0600, ..., -0.1174, -0.0207, -0.0325], [-0.0508, 0.0502, -0.1622, ..., 0.0629, 0.0720, -0.1374], [ 0.0575, -0.0076, -0.0631, ..., -0.0601, -0.0449, 0.0088], ..., [ 0.0574, -0.0602, -0.0182, ..., 0.0174, -0.0419, 0.0176], [-0.0682, -0.0271, -0.0825, ..., -0.0308, -0.0364, -0.0710], [-0.1444, -0.2047, 0.0276, ..., -0.0660, 0.0407, 0.0481]], device='cuda:0'), grad: tensor([[ 1.4901e-08, 9.3132e-10, 3.2596e-09, ..., 0.0000e+00, 2.4680e-08, 0.0000e+00], [ 1.0934e-06, -2.8266e-07, 1.5832e-08, ..., 4.6566e-10, -2.2259e-07, 0.0000e+00], [-8.6986e-07, 1.7509e-07, 3.7253e-09, ..., 0.0000e+00, 1.9185e-07, 0.0000e+00], ..., [-2.7614e-07, 9.8255e-08, 1.6764e-08, ..., 4.6566e-10, 1.9511e-07, 0.0000e+00], [ 1.4482e-07, 4.6566e-10, 1.3039e-08, ..., 0.0000e+00, 1.3318e-07, 0.0000e+00], [ 1.7695e-07, 0.0000e+00, -6.0536e-09, ..., 4.6566e-10, -2.3143e-07, 0.0000e+00]], device='cuda:0') Epoch 233, bias, value: tensor([-0.0064, 0.0237, -0.0022, -0.0051, 0.0148, 0.0055, 0.0248, -0.0118, 0.0056, -0.0057], device='cuda:0'), grad: tensor([-9.6206e-07, 1.1604e-06, -5.0291e-07, -6.1467e-07, 7.1200e-07, 5.9977e-07, -1.1763e-06, -5.8673e-08, 6.4075e-07, 2.2445e-07], device='cuda:0') 100 1e-05 changing lr epoch 232, time 217.63, cls_loss 0.0010 cls_loss_mapping 0.0008 cls_loss_causal 0.4912 re_mapping 0.0045 re_causal 0.0163 /// teacc 99.05 lr 0.00001000 Epoch 234, weight, value: tensor([[-0.0560, -0.1091, -0.0600, ..., -0.1175, -0.0208, -0.0325], [-0.0508, 0.0502, -0.1623, ..., 0.0629, 0.0720, -0.1374], [ 0.0575, -0.0076, -0.0631, ..., -0.0601, -0.0449, 0.0088], ..., [ 0.0574, -0.0602, -0.0182, ..., 0.0174, -0.0419, 0.0176], [-0.0683, -0.0271, -0.0826, ..., -0.0308, -0.0365, -0.0710], [-0.1445, -0.2047, 0.0275, ..., -0.0660, 0.0408, 0.0481]], device='cuda:0'), grad: tensor([[ 4.4703e-08, 7.4506e-09, 3.9116e-08, ..., 0.0000e+00, 4.6566e-08, 1.1176e-08], [ 1.8841e-06, 3.7998e-07, 2.8126e-07, ..., 0.0000e+00, 2.3842e-07, 1.8626e-09], [ 7.9721e-07, 1.8626e-07, 4.1910e-08, ..., 0.0000e+00, 1.0245e-08, 9.3132e-10], ..., [-3.7961e-06, -8.1025e-07, 1.2480e-06, ..., 0.0000e+00, 1.0896e-07, 0.0000e+00], [ 4.4703e-08, 4.6566e-09, 1.4901e-08, ..., 0.0000e+00, -5.3272e-07, 3.4459e-08], [ 5.1316e-07, 9.3132e-08, 6.3293e-06, ..., 9.3132e-10, 1.9558e-08, 9.3132e-10]], device='cuda:0') Epoch 234, bias, value: tensor([-0.0064, 0.0237, -0.0021, -0.0051, 0.0148, 0.0056, 0.0248, -0.0117, 0.0054, -0.0057], device='cuda:0'), grad: tensor([-1.9632e-06, 7.5102e-06, 2.1011e-06, 1.5208e-06, -1.8805e-05, 1.8440e-06, -2.1793e-07, -5.1335e-06, -3.6955e-06, 1.6794e-05], device='cuda:0') 100 1e-05 changing lr epoch 233, time 217.48, cls_loss 0.0009 cls_loss_mapping 0.0006 cls_loss_causal 0.4746 re_mapping 0.0044 re_causal 0.0160 /// teacc 99.04 lr 0.00001000 Epoch 235, weight, value: tensor([[-0.0561, -0.1092, -0.0600, ..., -0.1175, -0.0208, -0.0325], [-0.0509, 0.0502, -0.1624, ..., 0.0629, 0.0721, -0.1374], [ 0.0577, -0.0076, -0.0630, ..., -0.0601, -0.0449, 0.0088], ..., [ 0.0574, -0.0602, -0.0183, ..., 0.0174, -0.0419, 0.0176], [-0.0684, -0.0271, -0.0826, ..., -0.0308, -0.0366, -0.0710], [-0.1445, -0.2047, 0.0274, ..., -0.0661, 0.0408, 0.0481]], device='cuda:0'), grad: tensor([[ 4.4703e-08, 2.7940e-09, 6.5193e-09, ..., 0.0000e+00, 3.9116e-08, 0.0000e+00], [ 6.7521e-07, 1.0058e-07, 1.8626e-08, ..., 0.0000e+00, -3.9116e-08, 0.0000e+00], [ 8.5589e-07, 2.2538e-07, -8.3819e-09, ..., 0.0000e+00, 1.3970e-08, 0.0000e+00], ..., [-2.7753e-07, 6.8918e-08, 2.5146e-08, ..., 0.0000e+00, 7.7300e-08, 4.6566e-09], [-4.3176e-06, -1.2023e-06, 1.2387e-07, ..., 0.0000e+00, 7.9162e-08, 0.0000e+00], [ 6.6776e-07, 7.4506e-09, 8.1025e-08, ..., 0.0000e+00, -1.7136e-07, -5.5879e-09]], device='cuda:0') Epoch 235, bias, value: tensor([-0.0064, 0.0237, -0.0021, -0.0051, 0.0148, 0.0056, 0.0248, -0.0117, 0.0053, -0.0058], device='cuda:0'), grad: tensor([ 1.6764e-07, 3.3490e-06, 5.7146e-06, 7.9721e-07, 8.7172e-07, 8.1435e-06, 8.4713e-06, -1.3318e-07, -2.8715e-05, 1.3076e-06], device='cuda:0') 100 1e-05 changing lr epoch 234, time 217.43, cls_loss 0.0009 cls_loss_mapping 0.0007 cls_loss_causal 0.4849 re_mapping 0.0044 re_causal 0.0165 /// teacc 99.03 lr 0.00001000 Epoch 236, weight, value: tensor([[-0.0560, -0.1092, -0.0600, ..., -0.1176, -0.0208, -0.0325], [-0.0509, 0.0502, -0.1625, ..., 0.0630, 0.0721, -0.1374], [ 0.0578, -0.0076, -0.0629, ..., -0.0601, -0.0450, 0.0088], ..., [ 0.0574, -0.0602, -0.0184, ..., 0.0173, -0.0420, 0.0176], [-0.0686, -0.0271, -0.0827, ..., -0.0309, -0.0368, -0.0710], [-0.1446, -0.2048, 0.0273, ..., -0.0661, 0.0409, 0.0481]], device='cuda:0'), grad: tensor([[ 2.2352e-08, 9.3132e-10, -2.7940e-09, ..., 0.0000e+00, 1.4901e-08, 0.0000e+00], [-9.7696e-07, -1.7183e-06, 2.7008e-08, ..., 0.0000e+00, -2.5313e-06, 0.0000e+00], [ 6.7800e-07, 9.8720e-07, 6.5193e-09, ..., 0.0000e+00, 1.3402e-06, 0.0000e+00], ..., [-1.2135e-06, 6.8825e-07, -7.0781e-08, ..., 0.0000e+00, 1.0477e-06, 0.0000e+00], [ 4.3493e-07, 9.3132e-10, 9.6858e-08, ..., 0.0000e+00, 3.2596e-08, 0.0000e+00], [ 1.0971e-06, 0.0000e+00, 1.8813e-07, ..., 9.3132e-10, -5.9605e-08, 0.0000e+00]], device='cuda:0') Epoch 236, bias, value: tensor([-0.0063, 0.0237, -0.0020, -0.0051, 0.0148, 0.0056, 0.0248, -0.0118, 0.0052, -0.0058], device='cuda:0'), grad: tensor([-2.3283e-07, -4.9025e-06, 2.6934e-06, -2.4587e-07, 5.1595e-07, 5.3085e-08, 1.3411e-07, -2.5406e-06, 1.4957e-06, 3.0212e-06], device='cuda:0') 100 1e-05 changing lr epoch 235, time 217.60, cls_loss 0.0010 cls_loss_mapping 0.0008 cls_loss_causal 0.4786 re_mapping 0.0045 re_causal 0.0162 /// teacc 99.04 lr 0.00001000 Epoch 237, weight, value: tensor([[-0.0561, -0.1093, -0.0601, ..., -0.1177, -0.0209, -0.0325], [-0.0509, 0.0502, -0.1626, ..., 0.0629, 0.0722, -0.1374], [ 0.0578, -0.0077, -0.0629, ..., -0.0601, -0.0451, 0.0088], ..., [ 0.0574, -0.0602, -0.0185, ..., 0.0173, -0.0420, 0.0175], [-0.0687, -0.0271, -0.0827, ..., -0.0310, -0.0369, -0.0711], [-0.1446, -0.2048, 0.0274, ..., -0.0661, 0.0410, 0.0481]], device='cuda:0'), grad: tensor([[ 2.9802e-08, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 1.5832e-08, 0.0000e+00], [ 1.8720e-07, 0.0000e+00, 1.3970e-08, ..., 4.6566e-09, -2.3376e-07, 0.0000e+00], [-2.7474e-07, 0.0000e+00, 1.8626e-09, ..., 0.0000e+00, 5.1223e-08, 0.0000e+00], ..., [-4.1723e-07, 0.0000e+00, 3.7253e-09, ..., -2.3283e-08, 2.1700e-07, 0.0000e+00], [ 6.7055e-08, 0.0000e+00, 5.5879e-09, ..., 0.0000e+00, 4.0047e-08, 0.0000e+00], [ 7.6368e-08, 0.0000e+00, -1.1176e-08, ..., 3.7253e-09, -1.0272e-06, 0.0000e+00]], device='cuda:0') Epoch 237, bias, value: tensor([-0.0063, 0.0237, -0.0020, -0.0051, 0.0147, 0.0057, 0.0249, -0.0118, 0.0052, -0.0057], device='cuda:0'), grad: tensor([-4.1425e-06, 2.0675e-07, -2.4866e-07, 1.0775e-06, 3.2783e-06, 3.0287e-06, 7.1060e-07, -1.2517e-06, 3.1572e-07, -2.9635e-06], device='cuda:0') 100 1e-05 changing lr epoch 236, time 217.65, cls_loss 0.0009 cls_loss_mapping 0.0007 cls_loss_causal 0.4453 re_mapping 0.0043 re_causal 0.0157 /// teacc 99.08 lr 0.00001000 Epoch 238, weight, value: tensor([[-0.0561, -0.1093, -0.0600, ..., -0.1178, -0.0210, -0.0325], [-0.0509, 0.0502, -0.1626, ..., 0.0629, 0.0722, -0.1375], [ 0.0579, -0.0077, -0.0629, ..., -0.0602, -0.0451, 0.0088], ..., [ 0.0574, -0.0602, -0.0185, ..., 0.0174, -0.0420, 0.0175], [-0.0688, -0.0271, -0.0828, ..., -0.0310, -0.0370, -0.0711], [-0.1447, -0.2048, 0.0273, ..., -0.0661, 0.0411, 0.0481]], device='cuda:0'), grad: tensor([[ 1.4901e-08, 0.0000e+00, 1.6671e-07, ..., 9.3132e-10, 1.0245e-08, 0.0000e+00], [ 1.5553e-07, 0.0000e+00, 9.8720e-08, ..., 1.8626e-09, 3.6322e-08, 0.0000e+00], [-3.7253e-09, 0.0000e+00, 1.5739e-07, ..., 0.0000e+00, 1.3970e-08, 0.0000e+00], ..., [-9.9652e-08, 0.0000e+00, 4.9081e-07, ..., 1.6671e-07, 6.8732e-07, 0.0000e+00], [ 3.7253e-08, 0.0000e+00, 8.0932e-07, ..., 1.8626e-09, -3.3900e-07, 0.0000e+00], [-6.0536e-08, 0.0000e+00, -8.2795e-07, ..., -2.1048e-07, -5.6252e-07, 0.0000e+00]], device='cuda:0') Epoch 238, bias, value: tensor([-0.0063, 0.0237, -0.0020, -0.0051, 0.0147, 0.0057, 0.0249, -0.0118, 0.0051, -0.0057], device='cuda:0'), grad: tensor([-6.0536e-06, 1.3392e-06, 4.1537e-07, 3.9041e-05, 8.0913e-06, -4.2588e-05, 1.8114e-06, 5.6326e-06, 1.7956e-06, -9.5442e-06], device='cuda:0') 100 1e-05 changing lr epoch 237, time 217.36, cls_loss 0.0008 cls_loss_mapping 0.0008 cls_loss_causal 0.4947 re_mapping 0.0045 re_causal 0.0168 /// teacc 99.07 lr 0.00001000 Epoch 239, weight, value: tensor([[-0.0562, -0.1093, -0.0601, ..., -0.1178, -0.0210, -0.0325], [-0.0509, 0.0503, -0.1627, ..., 0.0629, 0.0723, -0.1375], [ 0.0579, -0.0077, -0.0629, ..., -0.0602, -0.0452, 0.0088], ..., [ 0.0574, -0.0602, -0.0186, ..., 0.0174, -0.0421, 0.0175], [-0.0689, -0.0271, -0.0828, ..., -0.0311, -0.0370, -0.0711], [-0.1447, -0.2048, 0.0272, ..., -0.0661, 0.0411, 0.0481]], device='cuda:0'), grad: tensor([[ 7.6368e-08, 0.0000e+00, 8.6613e-08, ..., 0.0000e+00, 3.5390e-08, 0.0000e+00], [ 8.9407e-08, 0.0000e+00, 2.9802e-08, ..., 0.0000e+00, -6.5193e-08, 0.0000e+00], [-2.7940e-08, 0.0000e+00, 1.2107e-08, ..., 0.0000e+00, 7.4506e-09, 0.0000e+00], ..., [-1.8626e-08, 0.0000e+00, 2.5146e-08, ..., 0.0000e+00, 4.1910e-08, 0.0000e+00], [ 1.5832e-08, 0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 1.2107e-08, 0.0000e+00], [ 6.0536e-08, 0.0000e+00, -9.1176e-07, ..., 0.0000e+00, -3.8650e-07, 0.0000e+00]], device='cuda:0') Epoch 239, bias, value: tensor([-0.0063, 0.0237, -0.0020, -0.0051, 0.0147, 0.0057, 0.0249, -0.0118, 0.0051, -0.0057], device='cuda:0'), grad: tensor([ 2.4214e-06, 1.5926e-07, 8.4750e-07, -1.7695e-08, 4.8876e-06, 6.4634e-07, -6.2585e-06, 1.3970e-08, 3.7253e-08, -2.7139e-06], device='cuda:0') 100 1e-05 changing lr epoch 238, time 217.38, cls_loss 0.0010 cls_loss_mapping 0.0008 cls_loss_causal 0.4748 re_mapping 0.0045 re_causal 0.0163 /// teacc 99.08 lr 0.00001000 Epoch 240, weight, value: tensor([[-0.0561, -0.1093, -0.0601, ..., -0.1179, -0.0210, -0.0325], [-0.0509, 0.0503, -0.1627, ..., 0.0632, 0.0725, -0.1375], [ 0.0580, -0.0077, -0.0629, ..., -0.0602, -0.0453, 0.0088], ..., [ 0.0574, -0.0602, -0.0187, ..., 0.0171, -0.0423, 0.0175], [-0.0690, -0.0271, -0.0828, ..., -0.0312, -0.0371, -0.0711], [-0.1448, -0.2048, 0.0271, ..., -0.0661, 0.0412, 0.0481]], device='cuda:0'), grad: tensor([[ 9.2108e-07, 0.0000e+00, 1.4901e-08, ..., 2.0489e-08, 6.0536e-08, 0.0000e+00], [ 1.9372e-07, 0.0000e+00, 1.3690e-07, ..., 1.7229e-07, -9.3132e-10, 0.0000e+00], [-2.3954e-06, 0.0000e+00, 5.9605e-08, ..., 9.6858e-08, 8.0094e-08, 0.0000e+00], ..., [-3.3453e-06, 0.0000e+00, -6.5006e-06, ..., -1.0706e-05, 4.2841e-08, 0.0000e+00], [ 4.1723e-07, 0.0000e+00, 2.4214e-08, ..., 5.5879e-09, 6.9849e-08, 0.0000e+00], [ 2.5146e-07, 0.0000e+00, 6.9197e-07, ..., 2.8126e-07, -6.7987e-08, 0.0000e+00]], device='cuda:0') Epoch 240, bias, value: tensor([-0.0063, 0.0238, -0.0020, -0.0051, 0.0147, 0.0057, 0.0249, -0.0119, 0.0050, -0.0058], device='cuda:0'), grad: tensor([ 2.0154e-06, 8.3912e-07, -5.2452e-06, 2.4885e-06, 1.5751e-05, 4.3623e-06, -1.8626e-07, -2.3693e-05, 1.2135e-06, 2.4140e-06], device='cuda:0') 100 1e-05 changing lr epoch 239, time 217.94, cls_loss 0.0009 cls_loss_mapping 0.0009 cls_loss_causal 0.4928 re_mapping 0.0045 re_causal 0.0165 /// teacc 99.09 lr 0.00001000 Epoch 241, weight, value: tensor([[-0.0561, -0.1094, -0.0601, ..., -0.1179, -0.0210, -0.0325], [-0.0510, 0.0503, -0.1628, ..., 0.0634, 0.0727, -0.1375], [ 0.0581, -0.0077, -0.0628, ..., -0.0603, -0.0453, 0.0088], ..., [ 0.0574, -0.0602, -0.0187, ..., 0.0170, -0.0425, 0.0174], [-0.0691, -0.0271, -0.0828, ..., -0.0312, -0.0372, -0.0711], [-0.1448, -0.2048, 0.0270, ..., -0.0661, 0.0412, 0.0481]], device='cuda:0'), grad: tensor([[ 1.3318e-07, 0.0000e+00, 4.7870e-07, ..., 0.0000e+00, 8.3167e-07, 0.0000e+00], [ 3.0641e-07, 0.0000e+00, 3.7439e-07, ..., 0.0000e+00, 4.0978e-08, 0.0000e+00], [-3.0976e-06, 0.0000e+00, 1.9483e-06, ..., 0.0000e+00, 3.1944e-07, 0.0000e+00], ..., [ 2.5146e-06, 0.0000e+00, 4.3958e-07, ..., 0.0000e+00, 4.5635e-08, 0.0000e+00], [-4.4424e-07, 0.0000e+00, -3.5241e-06, ..., 0.0000e+00, 1.4156e-07, 0.0000e+00], [ 4.0699e-07, 0.0000e+00, 5.8860e-07, ..., 0.0000e+00, -8.3819e-08, 0.0000e+00]], device='cuda:0') Epoch 241, bias, value: tensor([-0.0063, 0.0239, -0.0019, -0.0052, 0.0148, 0.0057, 0.0249, -0.0119, 0.0050, -0.0058], device='cuda:0'), grad: tensor([ 4.8317e-06, 1.8971e-06, -1.1455e-07, 4.8056e-07, -1.3271e-06, 2.4453e-05, -2.7820e-05, 8.4937e-06, -1.3821e-05, 2.8722e-06], device='cuda:0') 100 1e-05 changing lr epoch 240, time 218.12, cls_loss 0.0009 cls_loss_mapping 0.0008 cls_loss_causal 0.4641 re_mapping 0.0044 re_causal 0.0161 /// teacc 99.09 lr 0.00001000 Epoch 242, weight, value: tensor([[-0.0562, -0.1094, -0.0602, ..., -0.1180, -0.0210, -0.0325], [-0.0509, 0.0504, -0.1629, ..., 0.0634, 0.0727, -0.1375], [ 0.0581, -0.0077, -0.0628, ..., -0.0603, -0.0454, 0.0088], ..., [ 0.0574, -0.0603, -0.0189, ..., 0.0170, -0.0425, 0.0174], [-0.0692, -0.0271, -0.0829, ..., -0.0313, -0.0373, -0.0711], [-0.1449, -0.2048, 0.0270, ..., -0.0661, 0.0413, 0.0481]], device='cuda:0'), grad: tensor([[ 1.5832e-08, 0.0000e+00, 5.5879e-09, ..., 0.0000e+00, 1.8626e-08, 0.0000e+00], [ 4.2003e-07, 0.0000e+00, 1.4435e-07, ..., 3.7253e-09, -1.0338e-07, 0.0000e+00], [-7.1246e-07, 0.0000e+00, -2.4214e-08, ..., 9.3132e-10, 1.4901e-08, 0.0000e+00], ..., [-9.2201e-08, 0.0000e+00, -4.0978e-08, ..., -4.0047e-08, 6.3889e-07, 0.0000e+00], [ 4.1910e-08, 0.0000e+00, -2.9802e-08, ..., 0.0000e+00, 7.0781e-08, 0.0000e+00], [ 1.8161e-07, 0.0000e+00, 1.7136e-07, ..., 2.4214e-08, -1.0300e-06, 0.0000e+00]], device='cuda:0') Epoch 242, bias, value: tensor([-0.0063, 0.0239, -0.0019, -0.0051, 0.0147, 0.0057, 0.0248, -0.0120, 0.0049, -0.0057], device='cuda:0'), grad: tensor([ 1.1269e-07, 1.0245e-06, -9.7975e-07, -6.3796e-07, 4.9639e-07, 1.2433e-06, 1.0058e-07, 1.3551e-06, 6.0536e-08, -2.7735e-06], device='cuda:0') 100 1e-05 changing lr epoch 241, time 218.15, cls_loss 0.0009 cls_loss_mapping 0.0007 cls_loss_causal 0.4478 re_mapping 0.0045 re_causal 0.0161 /// teacc 99.10 lr 0.00001000 Epoch 243, weight, value: tensor([[-0.0562, -0.1094, -0.0603, ..., -0.1181, -0.0211, -0.0326], [-0.0509, 0.0505, -0.1630, ..., 0.0634, 0.0728, -0.1376], [ 0.0582, -0.0077, -0.0628, ..., -0.0603, -0.0455, 0.0087], ..., [ 0.0574, -0.0603, -0.0189, ..., 0.0170, -0.0426, 0.0174], [-0.0694, -0.0271, -0.0830, ..., -0.0314, -0.0374, -0.0711], [-0.1449, -0.2049, 0.0270, ..., -0.0661, 0.0415, 0.0481]], device='cuda:0'), grad: tensor([[ 4.8429e-08, 0.0000e+00, 1.4901e-08, ..., 0.0000e+00, 1.2107e-08, 0.0000e+00], [ 9.1922e-07, 0.0000e+00, 6.0536e-08, ..., 0.0000e+00, -5.3085e-08, 0.0000e+00], [-6.2399e-08, 0.0000e+00, 1.4901e-08, ..., 0.0000e+00, -5.5879e-08, 0.0000e+00], ..., [-1.2908e-06, 0.0000e+00, 9.4995e-08, ..., 0.0000e+00, 7.9162e-08, 0.0000e+00], [ 6.7055e-08, 0.0000e+00, -3.7253e-09, ..., 0.0000e+00, 6.5193e-09, 0.0000e+00], [ 4.1910e-08, 0.0000e+00, 2.3022e-05, ..., 0.0000e+00, -5.5879e-08, 0.0000e+00]], device='cuda:0') Epoch 243, bias, value: tensor([-0.0063, 0.0239, -0.0019, -0.0051, 0.0147, 0.0058, 0.0247, -0.0120, 0.0048, -0.0057], device='cuda:0'), grad: tensor([ 1.2945e-07, 1.6727e-06, -3.2783e-07, 6.6124e-07, -5.6118e-05, -1.3923e-06, 1.3961e-06, -1.9539e-06, 6.9849e-08, 5.5909e-05], device='cuda:0') 100 1e-05 changing lr epoch 242, time 217.94, cls_loss 0.0008 cls_loss_mapping 0.0009 cls_loss_causal 0.4683 re_mapping 0.0046 re_causal 0.0163 /// teacc 99.09 lr 0.00001000 Epoch 244, weight, value: tensor([[-0.0562, -0.1095, -0.0603, ..., -0.1181, -0.0211, -0.0326], [-0.0509, 0.0508, -0.1631, ..., 0.0634, 0.0728, -0.1376], [ 0.0583, -0.0077, -0.0628, ..., -0.0603, -0.0455, 0.0087], ..., [ 0.0574, -0.0605, -0.0191, ..., 0.0170, -0.0426, 0.0174], [-0.0695, -0.0271, -0.0830, ..., -0.0314, -0.0374, -0.0711], [-0.1450, -0.2049, 0.0268, ..., -0.0661, 0.0415, 0.0481]], device='cuda:0'), grad: tensor([[ 5.7742e-08, 0.0000e+00, 1.2107e-07, ..., 0.0000e+00, 1.6298e-07, 0.0000e+00], [ 3.0547e-07, 0.0000e+00, 6.8918e-08, ..., 0.0000e+00, -1.2144e-06, 0.0000e+00], [ 1.3877e-07, 0.0000e+00, 6.1933e-07, ..., 0.0000e+00, 1.3905e-06, 0.0000e+00], ..., [-2.1011e-06, 0.0000e+00, 3.8184e-08, ..., 0.0000e+00, 7.4320e-07, 6.5193e-09], [-9.3132e-09, 0.0000e+00, 4.1164e-07, ..., 0.0000e+00, 1.4901e-07, 9.3132e-10], [ 9.5554e-07, 0.0000e+00, 2.2817e-07, ..., 0.0000e+00, -3.0920e-07, -2.8871e-08]], device='cuda:0') Epoch 244, bias, value: tensor([-0.0062, 0.0239, -0.0018, -0.0051, 0.0148, 0.0058, 0.0247, -0.0120, 0.0048, -0.0058], device='cuda:0'), grad: tensor([ 1.6363e-06, -1.4044e-06, 9.6411e-06, 2.4773e-06, 6.1169e-06, -5.4110e-07, -1.8746e-05, -3.8296e-06, 1.9372e-06, 2.7083e-06], device='cuda:0') 100 1e-05 changing lr epoch 243, time 217.97, cls_loss 0.0008 cls_loss_mapping 0.0007 cls_loss_causal 0.4854 re_mapping 0.0045 re_causal 0.0166 /// teacc 99.10 lr 0.00001000 Epoch 245, weight, value: tensor([[-0.0563, -0.1095, -0.0603, ..., -0.1181, -0.0211, -0.0326], [-0.0509, 0.0509, -0.1632, ..., 0.0634, 0.0729, -0.1376], [ 0.0584, -0.0077, -0.0628, ..., -0.0604, -0.0455, 0.0088], ..., [ 0.0574, -0.0606, -0.0191, ..., 0.0170, -0.0426, 0.0174], [-0.0698, -0.0271, -0.0830, ..., -0.0314, -0.0375, -0.0711], [-0.1450, -0.2049, 0.0267, ..., -0.0661, 0.0415, 0.0481]], device='cuda:0'), grad: tensor([[ 2.0582e-07, 0.0000e+00, 8.7544e-08, ..., 0.0000e+00, 1.8720e-07, 0.0000e+00], [ 2.5984e-07, 0.0000e+00, 1.6298e-07, ..., 0.0000e+00, -5.3309e-06, 0.0000e+00], [-8.6427e-06, 0.0000e+00, -4.4517e-07, ..., 0.0000e+00, -3.7253e-09, 0.0000e+00], ..., [ 7.5400e-06, 0.0000e+00, 3.9302e-07, ..., 0.0000e+00, 3.6508e-06, 0.0000e+00], [ 4.4610e-07, 0.0000e+00, 3.0082e-07, ..., 0.0000e+00, 1.5274e-07, 0.0000e+00], [ 7.1712e-08, 0.0000e+00, 8.5495e-07, ..., 0.0000e+00, 9.6578e-07, 0.0000e+00]], device='cuda:0') Epoch 245, bias, value: tensor([-0.0062, 0.0239, -0.0018, -0.0050, 0.0148, 0.0057, 0.0248, -0.0120, 0.0047, -0.0058], device='cuda:0'), grad: tensor([-4.5542e-07, -1.5318e-05, -1.4447e-05, 3.1665e-07, -2.4922e-06, 6.5193e-09, -4.9360e-08, 2.4512e-05, 1.8803e-06, 6.0573e-06], device='cuda:0') 100 1e-05 changing lr epoch 244, time 217.95, cls_loss 0.0008 cls_loss_mapping 0.0008 cls_loss_causal 0.4894 re_mapping 0.0045 re_causal 0.0166 /// teacc 99.10 lr 0.00001000 Epoch 246, weight, value: tensor([[-0.0563, -0.1096, -0.0603, ..., -0.1181, -0.0211, -0.0326], [-0.0509, 0.0511, -0.1632, ..., 0.0634, 0.0730, -0.1376], [ 0.0585, -0.0078, -0.0628, ..., -0.0604, -0.0455, 0.0088], ..., [ 0.0573, -0.0608, -0.0193, ..., 0.0169, -0.0427, 0.0173], [-0.0701, -0.0271, -0.0830, ..., -0.0315, -0.0375, -0.0711], [-0.1451, -0.2049, 0.0266, ..., -0.0662, 0.0416, 0.0481]], device='cuda:0'), grad: tensor([[ 6.9849e-08, 0.0000e+00, -7.1712e-08, ..., 9.3132e-10, 1.4901e-08, 8.3819e-09], [ 1.7975e-07, 0.0000e+00, 6.1616e-06, ..., 9.1735e-07, 1.8338e-06, 4.6566e-09], [ 9.0338e-07, 0.0000e+00, 4.8429e-08, ..., 3.7253e-09, 2.7008e-08, 8.3819e-08], ..., [-4.5914e-07, 0.0000e+00, 2.3935e-07, ..., 6.5193e-09, 1.5926e-07, 2.1420e-08], [ 3.5390e-08, 0.0000e+00, 7.2643e-08, ..., 0.0000e+00, 1.1176e-08, 3.7253e-09], [ 8.1956e-08, 0.0000e+00, 3.5651e-06, ..., 4.9639e-07, 1.1204e-06, 7.4506e-09]], device='cuda:0') Epoch 246, bias, value: tensor([-0.0062, 0.0240, -0.0017, -0.0050, 0.0148, 0.0057, 0.0247, -0.0121, 0.0046, -0.0058], device='cuda:0'), grad: tensor([-3.7961e-06, 1.4223e-05, 1.6093e-06, -1.5646e-06, -2.2992e-05, 1.1595e-06, 1.9930e-06, 2.4866e-07, 2.8033e-07, 8.8140e-06], device='cuda:0') 100 1e-05 changing lr epoch 245, time 217.97, cls_loss 0.0010 cls_loss_mapping 0.0007 cls_loss_causal 0.4716 re_mapping 0.0044 re_causal 0.0160 /// teacc 99.06 lr 0.00001000 Epoch 247, weight, value: tensor([[-0.0564, -0.1097, -0.0602, ..., -0.1182, -0.0212, -0.0326], [-0.0510, 0.0512, -0.1634, ..., 0.0634, 0.0731, -0.1376], [ 0.0586, -0.0077, -0.0628, ..., -0.0604, -0.0456, 0.0087], ..., [ 0.0574, -0.0609, -0.0194, ..., 0.0169, -0.0428, 0.0172], [-0.0703, -0.0271, -0.0831, ..., -0.0315, -0.0377, -0.0711], [-0.1452, -0.2049, 0.0266, ..., -0.0662, 0.0417, 0.0482]], device='cuda:0'), grad: tensor([[ 1.0245e-08, 0.0000e+00, 4.6566e-09, ..., 0.0000e+00, 1.5944e-06, 1.8626e-09], [ 1.2852e-07, 0.0000e+00, 8.3819e-08, ..., 0.0000e+00, -1.0617e-07, 2.7940e-09], [-7.4506e-09, 0.0000e+00, 4.0047e-08, ..., 0.0000e+00, 2.2352e-08, 0.0000e+00], ..., [-6.7055e-08, 0.0000e+00, 7.7300e-08, ..., 5.5879e-09, 7.9442e-07, 1.2293e-07], [ 1.4901e-08, 0.0000e+00, 1.3970e-08, ..., 0.0000e+00, 3.7253e-08, 0.0000e+00], [ 3.4459e-08, 0.0000e+00, 3.1114e-05, ..., -2.2352e-08, -2.0117e-06, -3.1479e-07]], device='cuda:0') Epoch 247, bias, value: tensor([-0.0061, 0.0240, -0.0017, -0.0050, 0.0148, 0.0057, 0.0248, -0.0121, 0.0044, -0.0059], device='cuda:0'), grad: tensor([ 1.9282e-05, 2.9895e-07, 1.5646e-07, -8.5682e-08, -9.7990e-05, 3.8743e-06, -2.3216e-05, 3.5446e-06, 2.1420e-07, 9.3937e-05], device='cuda:0') 100 1e-05 changing lr epoch 246, time 217.96, cls_loss 0.0010 cls_loss_mapping 0.0008 cls_loss_causal 0.4909 re_mapping 0.0044 re_causal 0.0165 /// teacc 99.06 lr 0.00001000 Epoch 248, weight, value: tensor([[-0.0564, -0.1098, -0.0602, ..., -0.1182, -0.0213, -0.0326], [-0.0509, 0.0514, -0.1635, ..., 0.0636, 0.0733, -0.1376], [ 0.0586, -0.0078, -0.0628, ..., -0.0604, -0.0457, 0.0087], ..., [ 0.0573, -0.0610, -0.0195, ..., 0.0168, -0.0430, 0.0171], [-0.0705, -0.0271, -0.0832, ..., -0.0316, -0.0378, -0.0711], [-0.1452, -0.2049, 0.0264, ..., -0.0662, 0.0417, 0.0482]], device='cuda:0'), grad: tensor([[ 7.4506e-08, 6.5193e-09, -9.6858e-08, ..., 0.0000e+00, 1.1455e-07, 0.0000e+00], [-1.0006e-05, -3.0398e-06, 2.2743e-06, ..., 0.0000e+00, 2.7474e-07, 0.0000e+00], [ 8.7768e-06, 2.6524e-06, 4.0047e-08, ..., 0.0000e+00, 2.4717e-06, 0.0000e+00], ..., [ 1.4026e-06, 3.5856e-07, 9.4995e-08, ..., 0.0000e+00, 3.8743e-07, 0.0000e+00], [ 3.7160e-07, 1.8626e-09, 1.4693e-05, ..., 0.0000e+00, 2.1145e-05, 0.0000e+00], [ 9.4995e-08, 0.0000e+00, 4.7907e-06, ..., 0.0000e+00, -6.0536e-08, 0.0000e+00]], device='cuda:0') Epoch 248, bias, value: tensor([-0.0061, 0.0241, -0.0017, -0.0049, 0.0148, 0.0056, 0.0248, -0.0122, 0.0044, -0.0059], device='cuda:0'), grad: tensor([-1.3225e-07, -2.0951e-05, 3.2604e-05, -8.4937e-07, -1.3880e-05, -1.9765e-04, 7.2956e-05, 5.0738e-06, 1.0872e-04, 1.3739e-05], device='cuda:0') 100 1e-05 changing lr epoch 247, time 217.99, cls_loss 0.0012 cls_loss_mapping 0.0010 cls_loss_causal 0.4750 re_mapping 0.0044 re_causal 0.0158 /// teacc 99.03 lr 0.00001000 Epoch 249, weight, value: tensor([[-0.0564, -0.1098, -0.0603, ..., -0.1183, -0.0214, -0.0326], [-0.0510, 0.0514, -0.1637, ..., 0.0636, 0.0735, -0.1377], [ 0.0586, -0.0078, -0.0628, ..., -0.0605, -0.0458, 0.0086], ..., [ 0.0574, -0.0611, -0.0197, ..., 0.0167, -0.0431, 0.0171], [-0.0706, -0.0272, -0.0832, ..., -0.0316, -0.0380, -0.0711], [-0.1453, -0.2050, 0.0264, ..., -0.0664, 0.0419, 0.0482]], device='cuda:0'), grad: tensor([[ 4.7777e-07, 9.3132e-10, 1.1176e-08, ..., 0.0000e+00, 2.7008e-08, 0.0000e+00], [ 3.9767e-07, -4.0699e-07, 1.5832e-08, ..., 0.0000e+00, -8.4192e-07, 0.0000e+00], [-1.7695e-06, 1.0245e-08, -1.1642e-07, ..., 0.0000e+00, 3.4459e-08, 0.0000e+00], ..., [ 3.7067e-07, 3.8184e-07, 2.4214e-08, ..., 0.0000e+00, 7.2829e-07, 0.0000e+00], [ 3.8221e-06, 5.5879e-09, 1.2107e-08, ..., 0.0000e+00, 4.0047e-08, 0.0000e+00], [ 9.4995e-08, 0.0000e+00, 1.5832e-08, ..., 0.0000e+00, -1.0151e-07, 0.0000e+00]], device='cuda:0') Epoch 249, bias, value: tensor([-0.0061, 0.0242, -0.0017, -0.0048, 0.0148, 0.0056, 0.0247, -0.0123, 0.0042, -0.0059], device='cuda:0'), grad: tensor([ 5.9325e-07, -1.1474e-06, -7.4543e-06, -6.0275e-06, 5.3085e-07, 2.7977e-06, -6.1467e-08, 1.8887e-06, 9.1195e-06, -2.4028e-07], device='cuda:0') 100 1e-05 changing lr epoch 248, time 217.94, cls_loss 0.0009 cls_loss_mapping 0.0008 cls_loss_causal 0.4825 re_mapping 0.0043 re_causal 0.0161 /// teacc 99.07 lr 0.00001000 Epoch 250, weight, value: tensor([[-0.0565, -0.1099, -0.0603, ..., -0.1183, -0.0214, -0.0326], [-0.0510, 0.0515, -0.1638, ..., 0.0636, 0.0735, -0.1377], [ 0.0587, -0.0078, -0.0628, ..., -0.0605, -0.0459, 0.0086], ..., [ 0.0574, -0.0611, -0.0198, ..., 0.0167, -0.0432, 0.0171], [-0.0707, -0.0272, -0.0833, ..., -0.0317, -0.0379, -0.0711], [-0.1453, -0.2050, 0.0263, ..., -0.0664, 0.0419, 0.0482]], device='cuda:0'), grad: tensor([[ 1.6857e-07, 0.0000e+00, 1.2107e-08, ..., 0.0000e+00, 8.0094e-08, 0.0000e+00], [ 3.0454e-07, 0.0000e+00, 7.5437e-08, ..., 0.0000e+00, -2.2631e-07, 0.0000e+00], [-6.8098e-06, 0.0000e+00, 7.0781e-08, ..., 0.0000e+00, 3.0734e-08, 0.0000e+00], ..., [ 6.0499e-06, 0.0000e+00, 9.1270e-08, ..., 0.0000e+00, 8.4750e-08, 0.0000e+00], [ 1.1921e-07, 0.0000e+00, 3.7253e-09, ..., 0.0000e+00, 2.7940e-08, 0.0000e+00], [ 8.5682e-08, 0.0000e+00, 4.1910e-08, ..., 0.0000e+00, 6.8918e-08, 0.0000e+00]], device='cuda:0') Epoch 250, bias, value: tensor([-0.0060, 0.0242, -0.0017, -0.0049, 0.0149, 0.0056, 0.0246, -0.0122, 0.0042, -0.0059], device='cuda:0'), grad: tensor([ 4.5355e-07, 3.3993e-07, -9.6262e-06, 1.1735e-07, 3.1199e-07, 3.3528e-07, -9.0897e-07, 8.1509e-06, 2.8033e-07, 5.3365e-07], device='cuda:0') 100 1e-05 changing lr epoch 249, time 217.65, cls_loss 0.0009 cls_loss_mapping 0.0008 cls_loss_causal 0.4359 re_mapping 0.0043 re_causal 0.0156 /// teacc 99.02 lr 0.00001000 ---------------------saving last model at epoch 249---------------------------------------------------- /home/yuqian_fu {'gpu': '0', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep250_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1_epoch250', 'svpath': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep250_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3_WithStyleAttackExp1_epoch250/14factor_last.csv', 'channels': 3, 'factor_num': 14, 'stride': 3, 'epoch': 'last', 'eval_mapping': True} loading weight of last randm: False stride: 3 loading weight of last loading weight of last loading weight of last loading weight of last loading weight of last loading weight of last loading weight of last loading weight of last loading weight of last loading weight of last loading weight of last loading weight of last loading weight of last loading weight of last loading weight of last Using downloaded and verified file: /home/yuqian_fu/.pytorch/SVHN/test_32x32.mat mnist mnist_FA ... usps_FA Avg ShearX 98.879997 98.769997 ... 76.482315 71.892851 ShearY 98.739998 98.629997 ... 76.482315 62.471693 AutoContrast 98.970001 99.000000 ... 76.482315 58.870535 Invert 98.750000 97.680000 ... 76.482315 62.949374 Equalize 98.329994 97.570000 ... 76.482315 69.557647 Solarize 98.070000 97.029999 ... 76.482315 63.104321 SolarizeAdd 98.239998 97.389999 ... 76.482315 69.777911 Posterize 99.010002 98.849998 ... 76.482315 73.372108 Contrast 99.049995 98.970001 ... 76.482315 68.781693 Color 99.089996 99.019997 ... 76.482315 63.887585 Brightness 99.000000 98.979996 ... 76.482315 66.205976 Sharpness 99.019997 98.979996 ... 76.482315 71.537817 NoiseSalt 99.119995 98.970001 ... 76.482315 57.677266 NoiseGaussian 99.110001 99.010002 ... 76.482315 58.686552 w/o do (original x) 99.020000 0.000000 ... 0.000000 72.226558 [15 rows x 11 columns] mnist svhn mnist_m syndigit usps Avg do 99.04 66.122465 78.591268 75.662096 80.568012 75.23596