/home/yuqian_fu {'gpu': '0', 'data': 'art_painting', 'ntr': None, 'translate': None, 'autoaug': 'CA_multiple', 'n': 3, 'stride': 5, 'factor_num': 16, 'epochs': 70, 'nbatch': 100, 'batchsize': 6, 'lr': 0.01, 'lr_scheduler': 'cosine', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-PACS//art_painting/CA_multiple_16fa_v2_ep70_lr0.01_cosine_base0.01_bs6_lamCa_1_lamRe1_adt4_cls1_EW2_70_rmTrue_rnTrue_str5', 'clsadapt': True, 'lambda_causal': 1.0, 'lambda_re': 1.0, 'randm': True, 'randn': True, 'network': 'resnet18'} x.shape: (1840, 227, 227, 3) x_aug train here torch.Size([1840, 3, 227, 227]) /data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/data/PACS/art_painting_train.hdf5 torch.Size([1840, 3, 227, 227]) torch.Size([1840]) --------------------------CA_multiple-------------------------- ---------------------------16 factors----------------- randm: True randn: True n: 3 randm: False x.shape: (208, 227, 227, 3) x_aug test here torch.Size([208, 3, 227, 227]) /data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/data/PACS/art_painting_val.hdf5 torch.Size([208, 3, 227, 227]) torch.Size([208]) -------------------------------------loading pretrain weights---------------------------------- 306 cls_loss: tensor(2.2935, device='cuda:0', grad_fn=) cls_loss: tensor(2.2858, device='cuda:0', grad_fn=) cls_loss: tensor(1.7793, device='cuda:0', grad_fn=) cls_loss: tensor(6.2461, device='cuda:0', grad_fn=) cls_loss: tensor(5.5242, device='cuda:0', grad_fn=) cls_loss: tensor(18.2740, device='cuda:0', grad_fn=) cls_loss: tensor(14.2471, device='cuda:0', grad_fn=) cls_loss: tensor(6.1980, device='cuda:0', grad_fn=) cls_loss: tensor(6.3535, device='cuda:0', grad_fn=) cls_loss: tensor(22.8802, device='cuda:0', grad_fn=) cls_loss: tensor(13.4238, device='cuda:0', grad_fn=) cls_loss: tensor(11.0419, device='cuda:0', grad_fn=) cls_loss: tensor(4.2368, device='cuda:0', grad_fn=) cls_loss: tensor(30.8763, device='cuda:0', grad_fn=) cls_loss: tensor(16.7465, device='cuda:0', grad_fn=) cls_loss: tensor(15.3426, device='cuda:0', grad_fn=) cls_loss: tensor(24.1849, device='cuda:0', grad_fn=) cls_loss: tensor(22.2878, device='cuda:0', grad_fn=) cls_loss: tensor(1.0692, device='cuda:0', grad_fn=) cls_loss: tensor(16.8876, device='cuda:0', grad_fn=) cls_loss: tensor(10.7633, device='cuda:0', grad_fn=) cls_loss: tensor(8.5395, device='cuda:0', grad_fn=) cls_loss: tensor(6.1919, device='cuda:0', grad_fn=) cls_loss: tensor(4.4297, device='cuda:0', grad_fn=) cls_loss: tensor(3.3681, device='cuda:0', grad_fn=) cls_loss: tensor(14.5060, device='cuda:0', grad_fn=) cls_loss: tensor(6.2463, device='cuda:0', grad_fn=) cls_loss: tensor(11.1387, device='cuda:0', grad_fn=) cls_loss: tensor(8.6156, device='cuda:0', grad_fn=) cls_loss: tensor(9.2663, device='cuda:0', grad_fn=) cls_loss: tensor(4.5333, device='cuda:0', grad_fn=) cls_loss: tensor(6.2918, device='cuda:0', grad_fn=) cls_loss: tensor(13.7344, device='cuda:0', grad_fn=) cls_loss: tensor(4.8238, device='cuda:0', grad_fn=) cls_loss: tensor(8.4837, device='cuda:0', grad_fn=) cls_loss: tensor(6.3708, device='cuda:0', grad_fn=) cls_loss: tensor(3.3485, device='cuda:0', grad_fn=) cls_loss: tensor(4.7990, device='cuda:0', grad_fn=) cls_loss: tensor(14.6686, device='cuda:0', grad_fn=) cls_loss: tensor(4.5339, device='cuda:0', grad_fn=) cls_loss: tensor(10.5646, device='cuda:0', grad_fn=) cls_loss: tensor(6.2536, device='cuda:0', grad_fn=) cls_loss: tensor(32.3005, device='cuda:0', grad_fn=) cls_loss: tensor(10.1595, device='cuda:0', grad_fn=) cls_loss: tensor(9.7116, device='cuda:0', grad_fn=) cls_loss: tensor(8.5924, device='cuda:0', grad_fn=) cls_loss: tensor(1.9130, device='cuda:0', grad_fn=) cls_loss: tensor(10.0599, device='cuda:0', grad_fn=) cls_loss: tensor(12.0036, device='cuda:0', grad_fn=) cls_loss: tensor(7.1292, device='cuda:0', grad_fn=) cls_loss: tensor(2.6592, device='cuda:0', grad_fn=) cls_loss: tensor(8.6772, device='cuda:0', grad_fn=) cls_loss: tensor(3.6541, device='cuda:0', grad_fn=) cls_loss: tensor(9.8724, device='cuda:0', grad_fn=) cls_loss: tensor(11.1064, device='cuda:0', grad_fn=) cls_loss: tensor(5.9768, device='cuda:0', grad_fn=) cls_loss: tensor(0.7866, device='cuda:0', grad_fn=) cls_loss: tensor(6.7919, device='cuda:0', grad_fn=) cls_loss: tensor(4.4239, device='cuda:0', grad_fn=) cls_loss: tensor(12.6269, device='cuda:0', grad_fn=) cls_loss: tensor(1.9542, device='cuda:0', grad_fn=) cls_loss: tensor(3.5696, device='cuda:0', grad_fn=) cls_loss: tensor(6.1387, device='cuda:0', grad_fn=) cls_loss: tensor(3.6853, device='cuda:0', grad_fn=) cls_loss: tensor(0.2153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0459, device='cuda:0', grad_fn=) cls_loss: tensor(7.6715, device='cuda:0', grad_fn=) cls_loss: tensor(5.2956, device='cuda:0', grad_fn=) cls_loss: tensor(2.4352, device='cuda:0', grad_fn=) cls_loss: tensor(0.6591, device='cuda:0', grad_fn=) cls_loss: tensor(23.3229, device='cuda:0', grad_fn=) cls_loss: tensor(6.1087, device='cuda:0', grad_fn=) cls_loss: tensor(8.2331, device='cuda:0', grad_fn=) cls_loss: tensor(8.2596, device='cuda:0', grad_fn=) cls_loss: tensor(8.5277, device='cuda:0', grad_fn=) cls_loss: tensor(5.5132, device='cuda:0', grad_fn=) cls_loss: tensor(1.7722, device='cuda:0', grad_fn=) cls_loss: tensor(2.5606, device='cuda:0', grad_fn=) cls_loss: tensor(6.6145, device='cuda:0', grad_fn=) cls_loss: tensor(10.4831, device='cuda:0', grad_fn=) cls_loss: tensor(11.3048, device='cuda:0', grad_fn=) cls_loss: tensor(5.8227, device='cuda:0', grad_fn=) cls_loss: tensor(4.5807, device='cuda:0', grad_fn=) cls_loss: tensor(9.4805, device='cuda:0', grad_fn=) cls_loss: tensor(14.6686, device='cuda:0', grad_fn=) cls_loss: tensor(2.2433, device='cuda:0', grad_fn=) cls_loss: tensor(15.3711, device='cuda:0', grad_fn=) cls_loss: tensor(1.1627, device='cuda:0', grad_fn=) cls_loss: tensor(9.8917, device='cuda:0', grad_fn=) cls_loss: tensor(12.0818, device='cuda:0', grad_fn=) cls_loss: tensor(0.0882, device='cuda:0', grad_fn=) cls_loss: tensor(14.5354, device='cuda:0', grad_fn=) cls_loss: tensor(11.3483, device='cuda:0', grad_fn=) cls_loss: tensor(8.7110, device='cuda:0', grad_fn=) cls_loss: tensor(9.1689, device='cuda:0', grad_fn=) cls_loss: tensor(12.6338, device='cuda:0', grad_fn=) cls_loss: tensor(35.7786, device='cuda:0', grad_fn=) cls_loss: tensor(1.6535, device='cuda:0', grad_fn=) cls_loss: tensor(10.9492, device='cuda:0', grad_fn=) cls_loss: tensor(12.8542, device='cuda:0', grad_fn=) cls_loss: tensor(10.2592, device='cuda:0', grad_fn=) cls_loss: tensor(10.1615, device='cuda:0', grad_fn=) cls_loss: tensor(2.9883, device='cuda:0', grad_fn=) cls_loss: tensor(4.7067, device='cuda:0', grad_fn=) cls_loss: tensor(4.4258, device='cuda:0', grad_fn=) cls_loss: tensor(3.0482, device='cuda:0', grad_fn=) cls_loss: tensor(6.1906, device='cuda:0', grad_fn=) cls_loss: tensor(19.8615, device='cuda:0', grad_fn=) cls_loss: tensor(0.2966, device='cuda:0', grad_fn=) cls_loss: tensor(1.1797, device='cuda:0', grad_fn=) cls_loss: tensor(4.3566, device='cuda:0', grad_fn=) cls_loss: tensor(16.8438, device='cuda:0', grad_fn=) cls_loss: tensor(2.2898, device='cuda:0', grad_fn=) cls_loss: tensor(6.7071, device='cuda:0', grad_fn=) cls_loss: tensor(9.9359, device='cuda:0', grad_fn=) cls_loss: tensor(9.0977, device='cuda:0', grad_fn=) cls_loss: tensor(9.5880, device='cuda:0', grad_fn=) cls_loss: tensor(11.4896, device='cuda:0', grad_fn=) cls_loss: tensor(4.7292, device='cuda:0', grad_fn=) cls_loss: tensor(15.7689, device='cuda:0', grad_fn=) cls_loss: tensor(9.7108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(16.1529, device='cuda:0', grad_fn=) cls_loss: tensor(4.1345, device='cuda:0', grad_fn=) cls_loss: tensor(2.1748, device='cuda:0', grad_fn=) cls_loss: tensor(7.7127, device='cuda:0', grad_fn=) cls_loss: tensor(28.6545, device='cuda:0', grad_fn=) cls_loss: tensor(20.7057, device='cuda:0', grad_fn=) cls_loss: tensor(1.5473, device='cuda:0', grad_fn=) cls_loss: tensor(17.5304, device='cuda:0', grad_fn=) cls_loss: tensor(23.4637, device='cuda:0', grad_fn=) cls_loss: tensor(2.3529, device='cuda:0', grad_fn=) cls_loss: tensor(16.0820, device='cuda:0', grad_fn=) cls_loss: tensor(21.6462, device='cuda:0', grad_fn=) cls_loss: tensor(16.0736, device='cuda:0', grad_fn=) cls_loss: tensor(15.4636, device='cuda:0', grad_fn=) cls_loss: tensor(5.6375, device='cuda:0', grad_fn=) cls_loss: tensor(5.2438, device='cuda:0', grad_fn=) cls_loss: tensor(4.0990, device='cuda:0', grad_fn=) cls_loss: tensor(19.9086, device='cuda:0', grad_fn=) cls_loss: tensor(3.3116, device='cuda:0', grad_fn=) cls_loss: tensor(9.5767, device='cuda:0', grad_fn=) cls_loss: tensor(15.0807, device='cuda:0', grad_fn=) cls_loss: tensor(18.1963, device='cuda:0', grad_fn=) cls_loss: tensor(4.2227, device='cuda:0', grad_fn=) cls_loss: tensor(6.1826, device='cuda:0', grad_fn=) cls_loss: tensor(21.9622, device='cuda:0', grad_fn=) cls_loss: tensor(1.1757, device='cuda:0', grad_fn=) cls_loss: tensor(11.4477, device='cuda:0', grad_fn=) cls_loss: tensor(2.8626, device='cuda:0', grad_fn=) cls_loss: tensor(22.1042, device='cuda:0', grad_fn=) cls_loss: tensor(3.8924, device='cuda:0', grad_fn=) cls_loss: tensor(2.2720, device='cuda:0', grad_fn=) cls_loss: tensor(21.7604, device='cuda:0', grad_fn=) cls_loss: tensor(22.6425, device='cuda:0', grad_fn=) cls_loss: tensor(4.2670, device='cuda:0', grad_fn=) cls_loss: tensor(4.6418, device='cuda:0', grad_fn=) cls_loss: tensor(23.8737, device='cuda:0', grad_fn=) cls_loss: tensor(23.0524, device='cuda:0', grad_fn=) cls_loss: tensor(15.1250, device='cuda:0', grad_fn=) cls_loss: tensor(14.6068, device='cuda:0', grad_fn=) cls_loss: tensor(2.6768, device='cuda:0', grad_fn=) cls_loss: tensor(18.6380, device='cuda:0', grad_fn=) cls_loss: tensor(15.8815, device='cuda:0', grad_fn=) cls_loss: tensor(7.9714, device='cuda:0', grad_fn=) cls_loss: tensor(36.1007, device='cuda:0', grad_fn=) cls_loss: tensor(7.8483, device='cuda:0', grad_fn=) cls_loss: tensor(11.4844, device='cuda:0', grad_fn=) cls_loss: tensor(6.6608, device='cuda:0', grad_fn=) cls_loss: tensor(9.2629, device='cuda:0', grad_fn=) cls_loss: tensor(9.4831, device='cuda:0', grad_fn=) cls_loss: tensor(3.5443, device='cuda:0', grad_fn=) cls_loss: tensor(10.0833, device='cuda:0', grad_fn=) cls_loss: tensor(22.0471, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(7.5573, device='cuda:0', grad_fn=) cls_loss: tensor(22.2188, device='cuda:0', grad_fn=) cls_loss: tensor(21.6570, device='cuda:0', grad_fn=) cls_loss: tensor(5.7983, device='cuda:0', grad_fn=) cls_loss: tensor(5.9805, device='cuda:0', grad_fn=) cls_loss: tensor(16.1500, device='cuda:0', grad_fn=) cls_loss: tensor(7.9316, device='cuda:0', grad_fn=) cls_loss: tensor(11.0088, device='cuda:0', grad_fn=) cls_loss: tensor(13.2359, device='cuda:0', grad_fn=) cls_loss: tensor(18.0378, device='cuda:0', grad_fn=) cls_loss: tensor(7.0029, device='cuda:0', grad_fn=) cls_loss: tensor(8.5645, device='cuda:0', grad_fn=) cls_loss: tensor(12.6550, device='cuda:0', grad_fn=) cls_loss: tensor(7.1960, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.9648, device='cuda:0', grad_fn=) cls_loss: tensor(25.5935, device='cuda:0', grad_fn=) cls_loss: tensor(4.4616, device='cuda:0', grad_fn=) cls_loss: tensor(11.0182, device='cuda:0', grad_fn=) cls_loss: tensor(1.5394, device='cuda:0', grad_fn=) cls_loss: tensor(9.3388, device='cuda:0', grad_fn=) cls_loss: tensor(2.7676, device='cuda:0', grad_fn=) cls_loss: tensor(11.6200, device='cuda:0', grad_fn=) cls_loss: tensor(3.5615, device='cuda:0', grad_fn=) cls_loss: tensor(2.3400, device='cuda:0', grad_fn=) cls_loss: tensor(0.1931, device='cuda:0', grad_fn=) cls_loss: tensor(10.7409, device='cuda:0', grad_fn=) cls_loss: tensor(6.0875, device='cuda:0', grad_fn=) cls_loss: tensor(5.2389, device='cuda:0', grad_fn=) cls_loss: tensor(13.9711, device='cuda:0', grad_fn=) cls_loss: tensor(1.1958, device='cuda:0', grad_fn=) cls_loss: tensor(5.4583, device='cuda:0', grad_fn=) cls_loss: tensor(5.9310, device='cuda:0', grad_fn=) cls_loss: tensor(4.1241, device='cuda:0', grad_fn=) cls_loss: tensor(0.0650, device='cuda:0', grad_fn=) cls_loss: tensor(5.1671, device='cuda:0', grad_fn=) cls_loss: tensor(4.2002, device='cuda:0', grad_fn=) cls_loss: tensor(12.0527, device='cuda:0', grad_fn=) cls_loss: tensor(2.1976, device='cuda:0', grad_fn=) cls_loss: tensor(5.4964, device='cuda:0', grad_fn=) cls_loss: tensor(9.3540, device='cuda:0', grad_fn=) cls_loss: tensor(6.6148, device='cuda:0', grad_fn=) cls_loss: tensor(5.0822, device='cuda:0', grad_fn=) cls_loss: tensor(3.1532, device='cuda:0', grad_fn=) cls_loss: tensor(0.3577, device='cuda:0', grad_fn=) cls_loss: tensor(14.1654, device='cuda:0', grad_fn=) cls_loss: tensor(13.7392, device='cuda:0', grad_fn=) cls_loss: tensor(6.2917, device='cuda:0', grad_fn=) cls_loss: tensor(7.1047, device='cuda:0', grad_fn=) cls_loss: tensor(7.1615, device='cuda:0', grad_fn=) cls_loss: tensor(1.1288, device='cuda:0', grad_fn=) cls_loss: tensor(4.8260, device='cuda:0', grad_fn=) cls_loss: tensor(4.8229, device='cuda:0', grad_fn=) cls_loss: tensor(15.1943, device='cuda:0', grad_fn=) cls_loss: tensor(8.3953, device='cuda:0', grad_fn=) cls_loss: tensor(3.4560, device='cuda:0', grad_fn=) cls_loss: tensor(5.3958, device='cuda:0', grad_fn=) cls_loss: tensor(8.6074, device='cuda:0', grad_fn=) cls_loss: tensor(14.2597, device='cuda:0', grad_fn=) cls_loss: tensor(8.9993, device='cuda:0', grad_fn=) cls_loss: tensor(0.4432, device='cuda:0', grad_fn=) cls_loss: tensor(2.0904, device='cuda:0', grad_fn=) cls_loss: tensor(10.7485, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.5127, device='cuda:0', grad_fn=) cls_loss: tensor(14.2604, device='cuda:0', grad_fn=) cls_loss: tensor(18.8507, device='cuda:0', grad_fn=) cls_loss: tensor(11.9632, device='cuda:0', grad_fn=) cls_loss: tensor(27.1523, device='cuda:0', grad_fn=) cls_loss: tensor(25.3352, device='cuda:0', grad_fn=) cls_loss: tensor(7.7556, device='cuda:0', grad_fn=) cls_loss: tensor(5.9127, device='cuda:0', grad_fn=) cls_loss: tensor(1.0433, device='cuda:0', grad_fn=) cls_loss: tensor(10.8202, device='cuda:0', grad_fn=) cls_loss: tensor(3.8961, device='cuda:0', grad_fn=) cls_loss: tensor(4.0412, device='cuda:0', grad_fn=) cls_loss: tensor(14.1969, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(5.9792, device='cuda:0', grad_fn=) cls_loss: tensor(8.4439, device='cuda:0', grad_fn=) cls_loss: tensor(5.0156, device='cuda:0', grad_fn=) cls_loss: tensor(10.8843, device='cuda:0', grad_fn=) cls_loss: tensor(12.4674, device='cuda:0', grad_fn=) cls_loss: tensor(24.0221, device='cuda:0', grad_fn=) cls_loss: tensor(3.5400, device='cuda:0', grad_fn=) cls_loss: tensor(14.7572, device='cuda:0', grad_fn=) cls_loss: tensor(4.6254, device='cuda:0', grad_fn=) cls_loss: tensor(4.6815, device='cuda:0', grad_fn=) cls_loss: tensor(2.6294, device='cuda:0', grad_fn=) cls_loss: tensor(6.6929, device='cuda:0', grad_fn=) cls_loss: tensor(4.6918, device='cuda:0', grad_fn=) cls_loss: tensor(2.4524, device='cuda:0', grad_fn=) cls_loss: tensor(2.3878, device='cuda:0', grad_fn=) cls_loss: tensor(13.9297, device='cuda:0', grad_fn=) cls_loss: tensor(15.1634, device='cuda:0', grad_fn=) cls_loss: tensor(12.0334, device='cuda:0', grad_fn=) cls_loss: tensor(9.8737, device='cuda:0', grad_fn=) cls_loss: tensor(14.1849, device='cuda:0', grad_fn=) cls_loss: tensor(2.4522, device='cuda:0', grad_fn=) cls_loss: tensor(3.8331, device='cuda:0', grad_fn=) cls_loss: tensor(16.5143, device='cuda:0', grad_fn=) cls_loss: tensor(8.0526, device='cuda:0', grad_fn=) cls_loss: tensor(4.0658, device='cuda:0', grad_fn=) cls_loss: tensor(8.6966, device='cuda:0', grad_fn=) cls_loss: tensor(2.6194, device='cuda:0', grad_fn=) cls_loss: tensor(4.9896, device='cuda:0', grad_fn=) cls_loss: tensor(10.6211, device='cuda:0', grad_fn=) cls_loss: tensor(4.8333, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(1.8257, device='cuda:0', grad_fn=) cls_loss: tensor(8.0924, device='cuda:0', grad_fn=) cls_loss: tensor(16.8203, device='cuda:0', grad_fn=) cls_loss: tensor(0.1178, device='cuda:0', grad_fn=) cls_loss: tensor(12.9343, device='cuda:0', grad_fn=) cls_loss: tensor(5.6281, device='cuda:0', grad_fn=) cls_loss: tensor(8.9135, device='cuda:0', grad_fn=) cls_loss: tensor(12.9583, device='cuda:0', grad_fn=) cls_loss: tensor(6.7833, device='cuda:0', grad_fn=) cls_loss: tensor(1.7396, device='cuda:0', grad_fn=) cls_loss: tensor(20.7474, device='cuda:0', grad_fn=) cls_loss: tensor(12.5770, device='cuda:0', grad_fn=) cls_loss: tensor(15.6935, device='cuda:0', grad_fn=) cls_loss: tensor(13.9870, device='cuda:0', grad_fn=) cls_loss: tensor(5.6599, device='cuda:0', grad_fn=) cls_loss: tensor(1.0031, device='cuda:0', grad_fn=) cls_loss: tensor(12.8792, device='cuda:0', grad_fn=) cls_loss: tensor(9.4656, device='cuda:0', grad_fn=) cls_loss: tensor(5.5304, device='cuda:0', grad_fn=) cls_loss: tensor(7.8819, device='cuda:0', grad_fn=) cls_loss: tensor(7.5363, device='cuda:0', grad_fn=) cls_loss: tensor(2.7025, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 0---------------------------------------------------- epoch 0, time 329.69, cls_loss 9.0417 306 cls_loss: tensor(11.7513, device='cuda:0', grad_fn=) cls_loss: tensor(5.5858, device='cuda:0', grad_fn=) cls_loss: tensor(25.3281, device='cuda:0', grad_fn=) cls_loss: tensor(4.0965, device='cuda:0', grad_fn=) cls_loss: tensor(14.3963, device='cuda:0', grad_fn=) cls_loss: tensor(14.3412, device='cuda:0', grad_fn=) cls_loss: tensor(3.3737, device='cuda:0', grad_fn=) cls_loss: tensor(4.0732, device='cuda:0', grad_fn=) cls_loss: tensor(0.0980, device='cuda:0', grad_fn=) cls_loss: tensor(4.0990, device='cuda:0', grad_fn=) cls_loss: tensor(5.6683, device='cuda:0', grad_fn=) cls_loss: tensor(1., device='cuda:0', grad_fn=) cls_loss: tensor(3.4508, device='cuda:0', grad_fn=) cls_loss: tensor(2.0167, device='cuda:0', grad_fn=) cls_loss: tensor(4.9634, device='cuda:0', grad_fn=) cls_loss: tensor(13.8854, device='cuda:0', grad_fn=) cls_loss: tensor(6.4323, device='cuda:0', grad_fn=) cls_loss: tensor(2.7730, device='cuda:0', grad_fn=) cls_loss: tensor(0.6263, device='cuda:0', grad_fn=) cls_loss: tensor(4.5938, device='cuda:0', grad_fn=) cls_loss: tensor(16.6397, device='cuda:0', grad_fn=) cls_loss: tensor(4.2518, device='cuda:0', grad_fn=) cls_loss: tensor(10.4795, device='cuda:0', grad_fn=) cls_loss: tensor(5.5378, device='cuda:0', grad_fn=) cls_loss: tensor(0.7821, device='cuda:0', grad_fn=) cls_loss: tensor(0.8984, device='cuda:0', grad_fn=) cls_loss: tensor(2.6746, device='cuda:0', grad_fn=) cls_loss: tensor(3.5677, device='cuda:0', grad_fn=) cls_loss: tensor(4.9268, device='cuda:0', grad_fn=) cls_loss: tensor(0.6180, device='cuda:0', grad_fn=) cls_loss: tensor(14.0965, device='cuda:0', grad_fn=) cls_loss: tensor(7.0781, device='cuda:0', grad_fn=) cls_loss: tensor(0.9130, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(1.7717, device='cuda:0', grad_fn=) cls_loss: tensor(2.3349, device='cuda:0', grad_fn=) cls_loss: tensor(7.0651, device='cuda:0', grad_fn=) cls_loss: tensor(5.0882, device='cuda:0', grad_fn=) cls_loss: tensor(12.4378, device='cuda:0', grad_fn=) cls_loss: tensor(0.1223, device='cuda:0', grad_fn=) cls_loss: tensor(0.0863, device='cuda:0', grad_fn=) cls_loss: tensor(7.9181, device='cuda:0', grad_fn=) cls_loss: tensor(0.0877, device='cuda:0', grad_fn=) cls_loss: tensor(0.0966, device='cuda:0', grad_fn=) cls_loss: tensor(3.8467, device='cuda:0', grad_fn=) cls_loss: tensor(4.6442, device='cuda:0', grad_fn=) cls_loss: tensor(4.0148, device='cuda:0', grad_fn=) cls_loss: tensor(3.1183, device='cuda:0', grad_fn=) cls_loss: tensor(11.7009, device='cuda:0', grad_fn=) cls_loss: tensor(5.0365, device='cuda:0', grad_fn=) cls_loss: tensor(11.8664, device='cuda:0', grad_fn=) cls_loss: tensor(1.8697, device='cuda:0', grad_fn=) cls_loss: tensor(2.8673, device='cuda:0', grad_fn=) cls_loss: tensor(2.8930, device='cuda:0', grad_fn=) cls_loss: tensor(1.3060, device='cuda:0', grad_fn=) cls_loss: tensor(5.3919, device='cuda:0', grad_fn=) cls_loss: tensor(3.9326, device='cuda:0', grad_fn=) cls_loss: tensor(1.7624, device='cuda:0', grad_fn=) cls_loss: tensor(15.8626, device='cuda:0', grad_fn=) cls_loss: tensor(7.9971, device='cuda:0', grad_fn=) cls_loss: tensor(10.4440, device='cuda:0', grad_fn=) cls_loss: tensor(5.4298, device='cuda:0', grad_fn=) cls_loss: tensor(14.5384, device='cuda:0', grad_fn=) cls_loss: tensor(5.2487, device='cuda:0', grad_fn=) cls_loss: tensor(7.3147, device='cuda:0', grad_fn=) cls_loss: tensor(3.0799, device='cuda:0', grad_fn=) cls_loss: tensor(0.5954, device='cuda:0', grad_fn=) cls_loss: tensor(6.7924, device='cuda:0', grad_fn=) cls_loss: tensor(3.8438, device='cuda:0', grad_fn=) cls_loss: tensor(3.6395, device='cuda:0', grad_fn=) cls_loss: tensor(18.0589, device='cuda:0', grad_fn=) cls_loss: tensor(3.9702, device='cuda:0', grad_fn=) cls_loss: tensor(4.8503, device='cuda:0', grad_fn=) cls_loss: tensor(2.0457, device='cuda:0', grad_fn=) cls_loss: tensor(5.2031, device='cuda:0', grad_fn=) cls_loss: tensor(10.6228, device='cuda:0', grad_fn=) cls_loss: tensor(11.1535, device='cuda:0', grad_fn=) cls_loss: tensor(8.3112, device='cuda:0', grad_fn=) cls_loss: tensor(4.8034, device='cuda:0', grad_fn=) cls_loss: tensor(4.7113, device='cuda:0', grad_fn=) cls_loss: tensor(5.0274, device='cuda:0', grad_fn=) cls_loss: tensor(7.9110, device='cuda:0', grad_fn=) cls_loss: tensor(1.9527, device='cuda:0', grad_fn=) cls_loss: tensor(0.2743, device='cuda:0', grad_fn=) cls_loss: tensor(4.4714, device='cuda:0', grad_fn=) cls_loss: tensor(11.3882, device='cuda:0', grad_fn=) cls_loss: tensor(4.7279, device='cuda:0', grad_fn=) cls_loss: tensor(0.4451, device='cuda:0', grad_fn=) cls_loss: tensor(2.9799, device='cuda:0', grad_fn=) cls_loss: tensor(3.6413, device='cuda:0', grad_fn=) cls_loss: tensor(4.6640, device='cuda:0', grad_fn=) cls_loss: tensor(5.0997, device='cuda:0', grad_fn=) cls_loss: tensor(5.1613, device='cuda:0', grad_fn=) cls_loss: tensor(0.1941, device='cuda:0', grad_fn=) cls_loss: tensor(4.6510, device='cuda:0', grad_fn=) cls_loss: tensor(2.5975, device='cuda:0', grad_fn=) cls_loss: tensor(3.0609, device='cuda:0', grad_fn=) cls_loss: tensor(1.1476, device='cuda:0', grad_fn=) cls_loss: tensor(1.8712, device='cuda:0', grad_fn=) cls_loss: tensor(10.0620, device='cuda:0', grad_fn=) cls_loss: tensor(2.2478, device='cuda:0', grad_fn=) cls_loss: tensor(7.9433e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2097, device='cuda:0', grad_fn=) cls_loss: tensor(6.9688, device='cuda:0', grad_fn=) cls_loss: tensor(22.7552, device='cuda:0', grad_fn=) cls_loss: tensor(2.1432, device='cuda:0', grad_fn=) cls_loss: tensor(7.8620, device='cuda:0', grad_fn=) cls_loss: tensor(8.6532, device='cuda:0', grad_fn=) cls_loss: tensor(2.4733, device='cuda:0', grad_fn=) cls_loss: tensor(1.4891, device='cuda:0', grad_fn=) cls_loss: tensor(3.8434, device='cuda:0', grad_fn=) cls_loss: tensor(13.4929, device='cuda:0', grad_fn=) cls_loss: tensor(3.4027, device='cuda:0', grad_fn=) cls_loss: tensor(3.2995, device='cuda:0', grad_fn=) cls_loss: tensor(1.7852, device='cuda:0', grad_fn=) cls_loss: tensor(7.8465, device='cuda:0', grad_fn=) cls_loss: tensor(2.4226, device='cuda:0', grad_fn=) cls_loss: tensor(12.3893, device='cuda:0', grad_fn=) cls_loss: tensor(5.6929, device='cuda:0', grad_fn=) cls_loss: tensor(5.6439, device='cuda:0', grad_fn=) cls_loss: tensor(3.2871, device='cuda:0', grad_fn=) cls_loss: tensor(10.2485, device='cuda:0', grad_fn=) cls_loss: tensor(8.2987, device='cuda:0', grad_fn=) cls_loss: tensor(2.0469, device='cuda:0', grad_fn=) cls_loss: tensor(3.7856, device='cuda:0', grad_fn=) cls_loss: tensor(8.4085, device='cuda:0', grad_fn=) cls_loss: tensor(9.7292, device='cuda:0', grad_fn=) cls_loss: tensor(6.2884, device='cuda:0', grad_fn=) cls_loss: tensor(11.3177, device='cuda:0', grad_fn=) cls_loss: tensor(0.9042, device='cuda:0', grad_fn=) cls_loss: tensor(4.7905, device='cuda:0', grad_fn=) cls_loss: tensor(8.0737, device='cuda:0', grad_fn=) cls_loss: tensor(7.4439, device='cuda:0', grad_fn=) cls_loss: tensor(2.5203, device='cuda:0', grad_fn=) cls_loss: tensor(5.2961, device='cuda:0', grad_fn=) cls_loss: tensor(11.1995, device='cuda:0', grad_fn=) cls_loss: tensor(14.6726, device='cuda:0', grad_fn=) cls_loss: tensor(4.3671, device='cuda:0', grad_fn=) cls_loss: tensor(1.5029, device='cuda:0', grad_fn=) cls_loss: tensor(1.2744, device='cuda:0', grad_fn=) cls_loss: tensor(6.8702, device='cuda:0', grad_fn=) cls_loss: tensor(4.8305, device='cuda:0', grad_fn=) cls_loss: tensor(4.2063, device='cuda:0', grad_fn=) cls_loss: tensor(10.1279, device='cuda:0', grad_fn=) cls_loss: tensor(9.2025, device='cuda:0', grad_fn=) cls_loss: tensor(5.7927, device='cuda:0', grad_fn=) cls_loss: tensor(12.6406, device='cuda:0', grad_fn=) cls_loss: tensor(13.1656, device='cuda:0', grad_fn=) cls_loss: tensor(5.3984, device='cuda:0', grad_fn=) cls_loss: tensor(1.4564, device='cuda:0', grad_fn=) cls_loss: tensor(3.5992, device='cuda:0', grad_fn=) cls_loss: tensor(9.2840, device='cuda:0', grad_fn=) cls_loss: tensor(0.9473, device='cuda:0', grad_fn=) cls_loss: tensor(6.7230, device='cuda:0', grad_fn=) cls_loss: tensor(5.1779, device='cuda:0', grad_fn=) cls_loss: tensor(7.9518, device='cuda:0', grad_fn=) cls_loss: tensor(5.5143, device='cuda:0', grad_fn=) cls_loss: tensor(2.3282, device='cuda:0', grad_fn=) cls_loss: tensor(2.5887, device='cuda:0', grad_fn=) cls_loss: tensor(11.0261, device='cuda:0', grad_fn=) cls_loss: tensor(0.9669, device='cuda:0', grad_fn=) cls_loss: tensor(4.1217, device='cuda:0', grad_fn=) cls_loss: tensor(8.5222, device='cuda:0', grad_fn=) cls_loss: tensor(10.9622, device='cuda:0', grad_fn=) cls_loss: tensor(8.9868, device='cuda:0', grad_fn=) cls_loss: tensor(7.0726, device='cuda:0', grad_fn=) cls_loss: tensor(0.3735, device='cuda:0', grad_fn=) cls_loss: tensor(4.0417, device='cuda:0', grad_fn=) cls_loss: tensor(10.1743, device='cuda:0', grad_fn=) cls_loss: tensor(6.4391, device='cuda:0', grad_fn=) cls_loss: tensor(2.5443, device='cuda:0', grad_fn=) cls_loss: tensor(3.8513, device='cuda:0', grad_fn=) cls_loss: tensor(2.5460, device='cuda:0', grad_fn=) cls_loss: tensor(6.7372, device='cuda:0', grad_fn=) cls_loss: tensor(0.3369, device='cuda:0', grad_fn=) cls_loss: tensor(15.1452, device='cuda:0', grad_fn=) cls_loss: tensor(7.8587, device='cuda:0', grad_fn=) cls_loss: tensor(9.3589, device='cuda:0', grad_fn=) cls_loss: tensor(4.9562, device='cuda:0', grad_fn=) cls_loss: tensor(8.6493, device='cuda:0', grad_fn=) cls_loss: tensor(0.3101, device='cuda:0', grad_fn=) cls_loss: tensor(3.9977, device='cuda:0', grad_fn=) cls_loss: tensor(2.8757, device='cuda:0', grad_fn=) cls_loss: tensor(3.1408, device='cuda:0', grad_fn=) cls_loss: tensor(6.4481, device='cuda:0', grad_fn=) cls_loss: tensor(17.1799, device='cuda:0', grad_fn=) cls_loss: tensor(4.1751, device='cuda:0', grad_fn=) cls_loss: tensor(0.7166, device='cuda:0', grad_fn=) cls_loss: tensor(5.1014, device='cuda:0', grad_fn=) cls_loss: tensor(3.0200, device='cuda:0', grad_fn=) cls_loss: tensor(3.7644, device='cuda:0', grad_fn=) cls_loss: tensor(11.4440, device='cuda:0', grad_fn=) cls_loss: tensor(0.3680, device='cuda:0', grad_fn=) cls_loss: tensor(3.3547, device='cuda:0', grad_fn=) cls_loss: tensor(5.2626, device='cuda:0', grad_fn=) cls_loss: tensor(2.8995, device='cuda:0', grad_fn=) cls_loss: tensor(6.7760, device='cuda:0', grad_fn=) cls_loss: tensor(3.8818, device='cuda:0', grad_fn=) cls_loss: tensor(3.8893, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(19.7578, device='cuda:0', grad_fn=) cls_loss: tensor(5.2327, device='cuda:0', grad_fn=) cls_loss: tensor(6.5216, device='cuda:0', grad_fn=) cls_loss: tensor(6.6152, device='cuda:0', grad_fn=) cls_loss: tensor(6.4978, device='cuda:0', grad_fn=) cls_loss: tensor(2.6004, device='cuda:0', grad_fn=) cls_loss: tensor(10.0140, device='cuda:0', grad_fn=) cls_loss: tensor(4.7148, device='cuda:0', grad_fn=) cls_loss: tensor(2.6655, device='cuda:0', grad_fn=) cls_loss: tensor(16.9719, device='cuda:0', grad_fn=) cls_loss: tensor(13.8009, device='cuda:0', grad_fn=) cls_loss: tensor(2.9433, device='cuda:0', grad_fn=) cls_loss: tensor(3.2560, device='cuda:0', grad_fn=) cls_loss: tensor(18.4814, device='cuda:0', grad_fn=) cls_loss: tensor(4.3385, device='cuda:0', grad_fn=) cls_loss: tensor(9.9872, device='cuda:0', grad_fn=) cls_loss: tensor(12.2826, device='cuda:0', grad_fn=) cls_loss: tensor(6.0329, device='cuda:0', grad_fn=) cls_loss: tensor(0.4977, device='cuda:0', grad_fn=) cls_loss: tensor(1.0411, device='cuda:0', grad_fn=) cls_loss: tensor(1.2310, device='cuda:0', grad_fn=) cls_loss: tensor(15.6644, device='cuda:0', grad_fn=) cls_loss: tensor(4.9265, device='cuda:0', grad_fn=) cls_loss: tensor(5.9903e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8037, device='cuda:0', grad_fn=) cls_loss: tensor(7.3372, device='cuda:0', grad_fn=) cls_loss: tensor(14.7178, device='cuda:0', grad_fn=) cls_loss: tensor(0.0873, device='cuda:0', grad_fn=) cls_loss: tensor(2.0349, device='cuda:0', grad_fn=) cls_loss: tensor(2.2205, device='cuda:0', grad_fn=) cls_loss: tensor(0.0343, device='cuda:0', grad_fn=) cls_loss: tensor(7.5757, device='cuda:0', grad_fn=) cls_loss: tensor(0.0179, device='cuda:0', grad_fn=) cls_loss: tensor(1.2789, device='cuda:0', grad_fn=) cls_loss: tensor(2.3239, device='cuda:0', grad_fn=) cls_loss: tensor(6.6081, device='cuda:0', grad_fn=) cls_loss: tensor(5.4523, device='cuda:0', grad_fn=) cls_loss: tensor(5.3825, device='cuda:0', grad_fn=) cls_loss: tensor(2.0482, device='cuda:0', grad_fn=) cls_loss: tensor(0.3672, device='cuda:0', grad_fn=) cls_loss: tensor(1.3594, device='cuda:0', grad_fn=) cls_loss: tensor(10.5799, device='cuda:0', grad_fn=) cls_loss: tensor(1.2898, device='cuda:0', grad_fn=) cls_loss: tensor(3.6335, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(6.1978, device='cuda:0', grad_fn=) cls_loss: tensor(0.2681, device='cuda:0', grad_fn=) cls_loss: tensor(0.7176, device='cuda:0', grad_fn=) cls_loss: tensor(2.4108, device='cuda:0', grad_fn=) cls_loss: tensor(4.3679, device='cuda:0', grad_fn=) cls_loss: tensor(7.1868, device='cuda:0', grad_fn=) cls_loss: tensor(7.1543, device='cuda:0', grad_fn=) cls_loss: tensor(10.5835, device='cuda:0', grad_fn=) cls_loss: tensor(4.9294, device='cuda:0', grad_fn=) cls_loss: tensor(6.8038, device='cuda:0', grad_fn=) cls_loss: tensor(0.5901, device='cuda:0', grad_fn=) cls_loss: tensor(4.3464, device='cuda:0', grad_fn=) cls_loss: tensor(3.8289, device='cuda:0', grad_fn=) cls_loss: tensor(2.7026, device='cuda:0', grad_fn=) cls_loss: tensor(6.4831, device='cuda:0', grad_fn=) cls_loss: tensor(2.5354, device='cuda:0', grad_fn=) cls_loss: tensor(2.6640, device='cuda:0', grad_fn=) cls_loss: tensor(1.7059, device='cuda:0', grad_fn=) cls_loss: tensor(4.7007, device='cuda:0', grad_fn=) cls_loss: tensor(4.7082, device='cuda:0', grad_fn=) cls_loss: tensor(3.1042, device='cuda:0', grad_fn=) cls_loss: tensor(0.5568, device='cuda:0', grad_fn=) cls_loss: tensor(8.4219, device='cuda:0', grad_fn=) cls_loss: tensor(2.1867, device='cuda:0', grad_fn=) cls_loss: tensor(0.4647, device='cuda:0', grad_fn=) cls_loss: tensor(1.7613, device='cuda:0', grad_fn=) cls_loss: tensor(3.3528, device='cuda:0', grad_fn=) cls_loss: tensor(1.2047, device='cuda:0', grad_fn=) cls_loss: tensor(2.3796, device='cuda:0', grad_fn=) cls_loss: tensor(7.9888, device='cuda:0', grad_fn=) cls_loss: tensor(7.1146, device='cuda:0', grad_fn=) cls_loss: tensor(7.0622, device='cuda:0', grad_fn=) cls_loss: tensor(5.7546, device='cuda:0', grad_fn=) cls_loss: tensor(3.4956, device='cuda:0', grad_fn=) cls_loss: tensor(0.1683, device='cuda:0', grad_fn=) cls_loss: tensor(2.3891, device='cuda:0', grad_fn=) cls_loss: tensor(3.2903, device='cuda:0', grad_fn=) cls_loss: tensor(3.5354, device='cuda:0', grad_fn=) cls_loss: tensor(0.5960, device='cuda:0', grad_fn=) cls_loss: tensor(7.3125, device='cuda:0', grad_fn=) cls_loss: tensor(3.6260, device='cuda:0', grad_fn=) cls_loss: tensor(3.7917, device='cuda:0', grad_fn=) cls_loss: tensor(5.4195, device='cuda:0', grad_fn=) cls_loss: tensor(6.6823, device='cuda:0', grad_fn=) cls_loss: tensor(6.5575, device='cuda:0', grad_fn=) cls_loss: tensor(0.6911, device='cuda:0', grad_fn=) cls_loss: tensor(0.4440, device='cuda:0', grad_fn=) cls_loss: tensor(3.2416, device='cuda:0', grad_fn=) cls_loss: tensor(6.9753, device='cuda:0', grad_fn=) cls_loss: tensor(5.3217, device='cuda:0', grad_fn=) cls_loss: tensor(4.5893, device='cuda:0', grad_fn=) cls_loss: tensor(3.6595, device='cuda:0', grad_fn=) cls_loss: tensor(1.1882, device='cuda:0', grad_fn=) cls_loss: tensor(4.3892, device='cuda:0', grad_fn=) cls_loss: tensor(4.0266, device='cuda:0', grad_fn=) cls_loss: tensor(1.9479, device='cuda:0', grad_fn=) cls_loss: tensor(10.7082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.0165, device='cuda:0', grad_fn=) cls_loss: tensor(0.0783, device='cuda:0', grad_fn=) 9.994965332706573e-05 changing lr ---------------------saving model at epoch 1---------------------------------------------------- epoch 1, time 359.68, cls_loss 5.2840 306 cls_loss: tensor(4.0417, device='cuda:0', grad_fn=) cls_loss: tensor(18.1537, device='cuda:0', grad_fn=) cls_loss: tensor(3.9527, device='cuda:0', grad_fn=) cls_loss: tensor(3.7201, device='cuda:0', grad_fn=) cls_loss: tensor(3.9023, device='cuda:0', grad_fn=) cls_loss: tensor(2.9206e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.9018, device='cuda:0', grad_fn=) cls_loss: tensor(7.4564, device='cuda:0', grad_fn=) cls_loss: tensor(2.4642, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.8857, device='cuda:0', grad_fn=) cls_loss: tensor(0.1661, device='cuda:0', grad_fn=) cls_loss: tensor(7.0143, device='cuda:0', grad_fn=) cls_loss: tensor(4.4748, device='cuda:0', grad_fn=) cls_loss: tensor(0.9733, device='cuda:0', grad_fn=) cls_loss: tensor(1.2761, device='cuda:0', grad_fn=) cls_loss: tensor(0.0780, device='cuda:0', grad_fn=) cls_loss: tensor(1.2266, device='cuda:0', grad_fn=) cls_loss: tensor(7.8506, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.3490, device='cuda:0', grad_fn=) cls_loss: tensor(4.7400, device='cuda:0', grad_fn=) cls_loss: tensor(0.0204, device='cuda:0', grad_fn=) cls_loss: tensor(2.1628, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(8.3033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(8.4636, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(9.5068, device='cuda:0', grad_fn=) cls_loss: tensor(0.5198, device='cuda:0', grad_fn=) cls_loss: tensor(0.8075, device='cuda:0', grad_fn=) cls_loss: tensor(8.5371, device='cuda:0', grad_fn=) cls_loss: tensor(4.4890, device='cuda:0', grad_fn=) cls_loss: tensor(2.3892, device='cuda:0', grad_fn=) cls_loss: tensor(2.4336, device='cuda:0', grad_fn=) cls_loss: tensor(0.7513, device='cuda:0', grad_fn=) cls_loss: tensor(2.5883, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(5.2989, device='cuda:0', grad_fn=) cls_loss: tensor(0.9043, device='cuda:0', grad_fn=) cls_loss: tensor(4.3194e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5197, device='cuda:0', grad_fn=) cls_loss: tensor(17.6563, device='cuda:0', grad_fn=) cls_loss: tensor(4.4386, device='cuda:0', grad_fn=) cls_loss: tensor(0.4975, device='cuda:0', grad_fn=) cls_loss: tensor(0.1370, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.4001, device='cuda:0', grad_fn=) cls_loss: tensor(6.9535, device='cuda:0', grad_fn=) cls_loss: tensor(1.5568, device='cuda:0', grad_fn=) cls_loss: tensor(0.9661, device='cuda:0', grad_fn=) cls_loss: tensor(3.5154, device='cuda:0', grad_fn=) cls_loss: tensor(6.3395, device='cuda:0', grad_fn=) cls_loss: tensor(1.4766, device='cuda:0', grad_fn=) cls_loss: tensor(0.7267, device='cuda:0', grad_fn=) cls_loss: tensor(2.4694, device='cuda:0', grad_fn=) cls_loss: tensor(1.0925, device='cuda:0', grad_fn=) cls_loss: tensor(6.8293, device='cuda:0', grad_fn=) cls_loss: tensor(7.2032, device='cuda:0', grad_fn=) cls_loss: tensor(2.0996, device='cuda:0', grad_fn=) cls_loss: tensor(0.8233, device='cuda:0', grad_fn=) cls_loss: tensor(0.4554, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.1699, device='cuda:0', grad_fn=) cls_loss: tensor(1.1419, device='cuda:0', grad_fn=) cls_loss: tensor(0.7233, device='cuda:0', grad_fn=) cls_loss: tensor(1.2210, device='cuda:0', grad_fn=) cls_loss: tensor(0.6647, device='cuda:0', grad_fn=) cls_loss: tensor(2.3291, device='cuda:0', grad_fn=) cls_loss: tensor(4.4698, device='cuda:0', grad_fn=) cls_loss: tensor(3.7439, device='cuda:0', grad_fn=) cls_loss: tensor(4.4531, device='cuda:0', grad_fn=) cls_loss: tensor(4.6198, device='cuda:0', grad_fn=) cls_loss: tensor(0.1414, device='cuda:0', grad_fn=) cls_loss: tensor(4.9974, device='cuda:0', grad_fn=) cls_loss: tensor(2.7253, device='cuda:0', grad_fn=) cls_loss: tensor(6.3464, device='cuda:0', grad_fn=) cls_loss: tensor(6.7917, device='cuda:0', grad_fn=) cls_loss: tensor(1.2457e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5634, device='cuda:0', grad_fn=) cls_loss: tensor(0.9466, device='cuda:0', grad_fn=) cls_loss: tensor(0.2315, device='cuda:0', grad_fn=) cls_loss: tensor(1.3112, device='cuda:0', grad_fn=) cls_loss: tensor(4.4740, device='cuda:0', grad_fn=) cls_loss: tensor(7.9805, device='cuda:0', grad_fn=) cls_loss: tensor(0.5776, device='cuda:0', grad_fn=) cls_loss: tensor(0.6182, device='cuda:0', grad_fn=) cls_loss: tensor(1.6922, device='cuda:0', grad_fn=) cls_loss: tensor(9.3156, device='cuda:0', grad_fn=) cls_loss: tensor(4.2112, device='cuda:0', grad_fn=) cls_loss: tensor(3.7370, device='cuda:0', grad_fn=) cls_loss: tensor(1.1384, device='cuda:0', grad_fn=) cls_loss: tensor(1.4175, device='cuda:0', grad_fn=) cls_loss: tensor(5.6609, device='cuda:0', grad_fn=) cls_loss: tensor(5.7383, device='cuda:0', grad_fn=) cls_loss: tensor(6.1914, device='cuda:0', grad_fn=) cls_loss: tensor(7.4315, device='cuda:0', grad_fn=) cls_loss: tensor(9.4275e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7180, device='cuda:0', grad_fn=) cls_loss: tensor(2.9834, device='cuda:0', grad_fn=) cls_loss: tensor(4.3737, device='cuda:0', grad_fn=) cls_loss: tensor(4.9362, device='cuda:0', grad_fn=) cls_loss: tensor(1.3106, device='cuda:0', grad_fn=) cls_loss: tensor(6.4318, device='cuda:0', grad_fn=) cls_loss: tensor(2.7543, device='cuda:0', grad_fn=) cls_loss: tensor(0.0966, device='cuda:0', grad_fn=) cls_loss: tensor(3.5574, device='cuda:0', grad_fn=) cls_loss: tensor(1.7565, device='cuda:0', grad_fn=) cls_loss: tensor(0.6218, device='cuda:0', grad_fn=) cls_loss: tensor(5.5427, device='cuda:0', grad_fn=) cls_loss: tensor(3.1918, device='cuda:0', grad_fn=) cls_loss: tensor(6.1044, device='cuda:0', grad_fn=) cls_loss: tensor(2.1234, device='cuda:0', grad_fn=) cls_loss: tensor(0.0554, device='cuda:0', grad_fn=) cls_loss: tensor(0.0366, device='cuda:0', grad_fn=) cls_loss: tensor(6.6174, device='cuda:0', grad_fn=) cls_loss: tensor(0.7121, device='cuda:0', grad_fn=) cls_loss: tensor(0.1028, device='cuda:0', grad_fn=) cls_loss: tensor(4.0703, device='cuda:0', grad_fn=) cls_loss: tensor(2.6238, device='cuda:0', grad_fn=) cls_loss: tensor(5.4277, device='cuda:0', grad_fn=) cls_loss: tensor(2.6668, device='cuda:0', grad_fn=) cls_loss: tensor(2.1790, device='cuda:0', grad_fn=) cls_loss: tensor(5.4163, device='cuda:0', grad_fn=) cls_loss: tensor(23.0226, device='cuda:0', grad_fn=) cls_loss: tensor(2.3890, device='cuda:0', grad_fn=) cls_loss: tensor(13.2207, device='cuda:0', grad_fn=) cls_loss: tensor(7.2776, device='cuda:0', grad_fn=) cls_loss: tensor(4.9253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5921, device='cuda:0', grad_fn=) cls_loss: tensor(2.8676, device='cuda:0', grad_fn=) cls_loss: tensor(1.5537, device='cuda:0', grad_fn=) cls_loss: tensor(2.6854, device='cuda:0', grad_fn=) cls_loss: tensor(4.7518, device='cuda:0', grad_fn=) cls_loss: tensor(5.5695, device='cuda:0', grad_fn=) cls_loss: tensor(1.3689e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8334, device='cuda:0', grad_fn=) cls_loss: tensor(4.5117, device='cuda:0', grad_fn=) cls_loss: tensor(1.4036, device='cuda:0', grad_fn=) cls_loss: tensor(5.3021, device='cuda:0', grad_fn=) cls_loss: tensor(0.2672, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(5.8971, device='cuda:0', grad_fn=) cls_loss: tensor(0.3003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5941, device='cuda:0', grad_fn=) cls_loss: tensor(7.2919, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(3.4365, device='cuda:0', grad_fn=) cls_loss: tensor(6.3669, device='cuda:0', grad_fn=) cls_loss: tensor(1.3553, device='cuda:0', grad_fn=) cls_loss: tensor(4.8749, device='cuda:0', grad_fn=) cls_loss: tensor(1.4818, device='cuda:0', grad_fn=) cls_loss: tensor(2.8445, device='cuda:0', grad_fn=) cls_loss: tensor(7.8661, device='cuda:0', grad_fn=) cls_loss: tensor(7.3128, device='cuda:0', grad_fn=) cls_loss: tensor(0.7623, device='cuda:0', grad_fn=) cls_loss: tensor(1.2723, device='cuda:0', grad_fn=) cls_loss: tensor(7.8972, device='cuda:0', grad_fn=) cls_loss: tensor(0.7124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.9333, device='cuda:0', grad_fn=) cls_loss: tensor(1.3979, device='cuda:0', grad_fn=) cls_loss: tensor(2.4835e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2911, device='cuda:0', grad_fn=) cls_loss: tensor(1.9208, device='cuda:0', grad_fn=) cls_loss: tensor(1.3130, device='cuda:0', grad_fn=) cls_loss: tensor(5.8307, device='cuda:0', grad_fn=) cls_loss: tensor(0.6771, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(1.7165, device='cuda:0', grad_fn=) cls_loss: tensor(4.0355, device='cuda:0', grad_fn=) cls_loss: tensor(0.1312, device='cuda:0', grad_fn=) cls_loss: tensor(0.2413, device='cuda:0', grad_fn=) cls_loss: tensor(8.1270, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(3.2241, device='cuda:0', grad_fn=) cls_loss: tensor(0.4688, device='cuda:0', grad_fn=) cls_loss: tensor(2.2890, device='cuda:0', grad_fn=) cls_loss: tensor(1.2088, device='cuda:0', grad_fn=) cls_loss: tensor(2.5903, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.6632, device='cuda:0', grad_fn=) cls_loss: tensor(2.8350, device='cuda:0', grad_fn=) cls_loss: tensor(2.8542, device='cuda:0', grad_fn=) cls_loss: tensor(5.6068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9732, device='cuda:0', grad_fn=) cls_loss: tensor(7.9526, device='cuda:0', grad_fn=) cls_loss: tensor(4.2122, device='cuda:0', grad_fn=) cls_loss: tensor(6.9873, device='cuda:0', grad_fn=) cls_loss: tensor(4.5716, device='cuda:0', grad_fn=) cls_loss: tensor(0.8442, device='cuda:0', grad_fn=) cls_loss: tensor(0.7555, device='cuda:0', grad_fn=) cls_loss: tensor(5.7449, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(4.2920, device='cuda:0', grad_fn=) cls_loss: tensor(6.4479, device='cuda:0', grad_fn=) cls_loss: tensor(1.5839, device='cuda:0', grad_fn=) cls_loss: tensor(2.1429, device='cuda:0', grad_fn=) cls_loss: tensor(0.0737, device='cuda:0', grad_fn=) cls_loss: tensor(1.1784, device='cuda:0', grad_fn=) cls_loss: tensor(3.4570, device='cuda:0', grad_fn=) cls_loss: tensor(0.4355, device='cuda:0', grad_fn=) cls_loss: tensor(4.1011, device='cuda:0', grad_fn=) cls_loss: tensor(0.8014, device='cuda:0', grad_fn=) cls_loss: tensor(6.1322, device='cuda:0', grad_fn=) cls_loss: tensor(1.2233, device='cuda:0', grad_fn=) cls_loss: tensor(4.4658, device='cuda:0', grad_fn=) cls_loss: tensor(2.6539, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9144, device='cuda:0', grad_fn=) cls_loss: tensor(3.6794, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.0560, device='cuda:0', grad_fn=) cls_loss: tensor(1.5426, device='cuda:0', grad_fn=) cls_loss: tensor(2.3563, device='cuda:0', grad_fn=) cls_loss: tensor(7.0827, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(3.2022, device='cuda:0', grad_fn=) cls_loss: tensor(5.7865, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(1.1074, device='cuda:0', grad_fn=) cls_loss: tensor(2.6396, device='cuda:0', grad_fn=) cls_loss: tensor(3.2719, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.3656, device='cuda:0', grad_fn=) cls_loss: tensor(3.4375, device='cuda:0', grad_fn=) cls_loss: tensor(5.6837, device='cuda:0', grad_fn=) cls_loss: tensor(4.4456, device='cuda:0', grad_fn=) cls_loss: tensor(7.8783, device='cuda:0', grad_fn=) cls_loss: tensor(3.8201, device='cuda:0', grad_fn=) cls_loss: tensor(0.3539, device='cuda:0', grad_fn=) cls_loss: tensor(4.3419, device='cuda:0', grad_fn=) cls_loss: tensor(1.3482, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(1.7488, device='cuda:0', grad_fn=) cls_loss: tensor(2.2943, device='cuda:0', grad_fn=) cls_loss: tensor(2.2993, device='cuda:0', grad_fn=) cls_loss: tensor(8.3208, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0551, device='cuda:0', grad_fn=) cls_loss: tensor(0.9503, device='cuda:0', grad_fn=) cls_loss: tensor(5.4697, device='cuda:0', grad_fn=) cls_loss: tensor(2.3474, device='cuda:0', grad_fn=) cls_loss: tensor(2.6390, device='cuda:0', grad_fn=) cls_loss: tensor(1.3291, device='cuda:0', grad_fn=) cls_loss: tensor(0.6359, device='cuda:0', grad_fn=) cls_loss: tensor(0.2009, device='cuda:0', grad_fn=) cls_loss: tensor(1.1280, device='cuda:0', grad_fn=) cls_loss: tensor(2.9421, device='cuda:0', grad_fn=) cls_loss: tensor(5.1961, device='cuda:0', grad_fn=) cls_loss: tensor(0.2569, device='cuda:0', grad_fn=) cls_loss: tensor(3.4505, device='cuda:0', grad_fn=) cls_loss: tensor(1.7370, device='cuda:0', grad_fn=) cls_loss: tensor(2.0645, device='cuda:0', grad_fn=) cls_loss: tensor(2.8340, device='cuda:0', grad_fn=) cls_loss: tensor(2.6375, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.3245, device='cuda:0', grad_fn=) cls_loss: tensor(2.4557, device='cuda:0', grad_fn=) cls_loss: tensor(2.4206, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1156, device='cuda:0', grad_fn=) cls_loss: tensor(2.4520, device='cuda:0', grad_fn=) cls_loss: tensor(0.7426, device='cuda:0', grad_fn=) cls_loss: tensor(5.6003, device='cuda:0', grad_fn=) cls_loss: tensor(4.0549, device='cuda:0', grad_fn=) cls_loss: tensor(2.9577, device='cuda:0', grad_fn=) cls_loss: tensor(2.5373, device='cuda:0', grad_fn=) cls_loss: tensor(3.4797, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.4457, device='cuda:0', grad_fn=) cls_loss: tensor(2.4724, device='cuda:0', grad_fn=) cls_loss: tensor(6.8347e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3330, device='cuda:0', grad_fn=) cls_loss: tensor(1.6822, device='cuda:0', grad_fn=) cls_loss: tensor(2.6419, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(0.0157, device='cuda:0', grad_fn=) cls_loss: tensor(3.4495, device='cuda:0', grad_fn=) cls_loss: tensor(0.0138, device='cuda:0', grad_fn=) cls_loss: tensor(2.6164, device='cuda:0', grad_fn=) cls_loss: tensor(2.7462, device='cuda:0', grad_fn=) cls_loss: tensor(1.2069, device='cuda:0', grad_fn=) cls_loss: tensor(4.6196, device='cuda:0', grad_fn=) cls_loss: tensor(5.1427, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(9.7313, device='cuda:0', grad_fn=) cls_loss: tensor(1.7826, device='cuda:0', grad_fn=) cls_loss: tensor(0.0151, device='cuda:0', grad_fn=) cls_loss: tensor(0.0928, device='cuda:0', grad_fn=) cls_loss: tensor(3.0000, device='cuda:0', grad_fn=) cls_loss: tensor(1.7787, device='cuda:0', grad_fn=) cls_loss: tensor(4.5177, device='cuda:0', grad_fn=) cls_loss: tensor(0.1099, device='cuda:0', grad_fn=) cls_loss: tensor(0.0219, device='cuda:0', grad_fn=) cls_loss: tensor(3.4252, device='cuda:0', grad_fn=) cls_loss: tensor(3.0960, device='cuda:0', grad_fn=) cls_loss: tensor(2.8702, device='cuda:0', grad_fn=) cls_loss: tensor(0.4010, device='cuda:0', grad_fn=) cls_loss: tensor(1.3613, device='cuda:0', grad_fn=) cls_loss: tensor(5.8686, device='cuda:0', grad_fn=) cls_loss: tensor(0.0691, device='cuda:0', grad_fn=) 9.979871469976196e-05 changing lr ---------------------saving model at epoch 2---------------------------------------------------- epoch 2, time 358.51, cls_loss 2.9761 306 cls_loss: tensor(0.0125, device='cuda:0', grad_fn=) cls_loss: tensor(1.6697, device='cuda:0', grad_fn=) cls_loss: tensor(0.0656, device='cuda:0', grad_fn=) cls_loss: tensor(0.3888, device='cuda:0', grad_fn=) cls_loss: tensor(0.6330, device='cuda:0', grad_fn=) cls_loss: tensor(1.1094, device='cuda:0', grad_fn=) cls_loss: tensor(1.4946, device='cuda:0', grad_fn=) cls_loss: tensor(1.3231, device='cuda:0', grad_fn=) cls_loss: tensor(0.9401, device='cuda:0', grad_fn=) cls_loss: tensor(1.0890, device='cuda:0', grad_fn=) cls_loss: tensor(1.7178, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.8036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0182, device='cuda:0', grad_fn=) cls_loss: tensor(0.9136, device='cuda:0', grad_fn=) cls_loss: tensor(0.2471, device='cuda:0', grad_fn=) cls_loss: tensor(0.2111, device='cuda:0', grad_fn=) cls_loss: tensor(3.6096, device='cuda:0', grad_fn=) cls_loss: tensor(0.5165, device='cuda:0', grad_fn=) cls_loss: tensor(0.5088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.2587, device='cuda:0', grad_fn=) cls_loss: tensor(1.8107, device='cuda:0', grad_fn=) cls_loss: tensor(5.2045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.2992, device='cuda:0', grad_fn=) cls_loss: tensor(8.2457, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.6772, device='cuda:0', grad_fn=) cls_loss: tensor(1.6706, device='cuda:0', grad_fn=) cls_loss: tensor(1.4826, device='cuda:0', grad_fn=) cls_loss: tensor(3.5631, device='cuda:0', grad_fn=) cls_loss: tensor(3.8100, device='cuda:0', grad_fn=) cls_loss: tensor(3.9027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0424, device='cuda:0', grad_fn=) cls_loss: tensor(1.0781, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0807, device='cuda:0', grad_fn=) cls_loss: tensor(0.8387, device='cuda:0', grad_fn=) cls_loss: tensor(0.0904, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(4.2121e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.5709, device='cuda:0', grad_fn=) cls_loss: tensor(5.9710, device='cuda:0', grad_fn=) cls_loss: tensor(7.1113, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.3464, device='cuda:0', grad_fn=) cls_loss: tensor(2.2687, device='cuda:0', grad_fn=) cls_loss: tensor(1.1985, device='cuda:0', grad_fn=) cls_loss: tensor(0.3292, device='cuda:0', grad_fn=) cls_loss: tensor(0.0610, device='cuda:0', grad_fn=) cls_loss: tensor(3.7950, device='cuda:0', grad_fn=) cls_loss: tensor(5.4590, device='cuda:0', grad_fn=) cls_loss: tensor(2.7768, device='cuda:0', grad_fn=) cls_loss: tensor(0.8968, device='cuda:0', grad_fn=) cls_loss: tensor(0.4589, device='cuda:0', grad_fn=) cls_loss: tensor(0.7234, device='cuda:0', grad_fn=) cls_loss: tensor(3.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.9325, device='cuda:0', grad_fn=) cls_loss: tensor(0.1624, device='cuda:0', grad_fn=) cls_loss: tensor(2.8542, device='cuda:0', grad_fn=) cls_loss: tensor(3.0979, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(1.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(3.9531, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.8062, device='cuda:0', grad_fn=) cls_loss: tensor(2.8431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331, device='cuda:0', grad_fn=) cls_loss: tensor(2.1749, device='cuda:0', grad_fn=) cls_loss: tensor(0.5356, device='cuda:0', grad_fn=) cls_loss: tensor(1.6357, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(2.6107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(2.5576, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0827, device='cuda:0', grad_fn=) cls_loss: tensor(7.6423, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(2.4742, device='cuda:0', grad_fn=) cls_loss: tensor(4.5028, device='cuda:0', grad_fn=) cls_loss: tensor(3.0217, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(2.6198, device='cuda:0', grad_fn=) cls_loss: tensor(4.1178, device='cuda:0', grad_fn=) cls_loss: tensor(0.4548, device='cuda:0', grad_fn=) cls_loss: tensor(1.6087, device='cuda:0', grad_fn=) cls_loss: tensor(4.6087, device='cuda:0', grad_fn=) cls_loss: tensor(0.2154, device='cuda:0', grad_fn=) cls_loss: tensor(0.4492, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.5248, device='cuda:0', grad_fn=) cls_loss: tensor(1.9650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1849, device='cuda:0', grad_fn=) cls_loss: tensor(7.1419, device='cuda:0', grad_fn=) cls_loss: tensor(0.1948, device='cuda:0', grad_fn=) cls_loss: tensor(0.7891, device='cuda:0', grad_fn=) cls_loss: tensor(1.0679, device='cuda:0', grad_fn=) cls_loss: tensor(1.5293, device='cuda:0', grad_fn=) cls_loss: tensor(0.0188, device='cuda:0', grad_fn=) cls_loss: tensor(6.8943e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0198, device='cuda:0', grad_fn=) cls_loss: tensor(0.6838, device='cuda:0', grad_fn=) cls_loss: tensor(2.6080, device='cuda:0', grad_fn=) cls_loss: tensor(2.1392, device='cuda:0', grad_fn=) cls_loss: tensor(0.7208, device='cuda:0', grad_fn=) cls_loss: tensor(0.7135, device='cuda:0', grad_fn=) cls_loss: tensor(0.5827, device='cuda:0', grad_fn=) cls_loss: tensor(0.3789, device='cuda:0', grad_fn=) cls_loss: tensor(5.0833, device='cuda:0', grad_fn=) cls_loss: tensor(2.2681, device='cuda:0', grad_fn=) cls_loss: tensor(2.3345e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5651, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(1.4466, device='cuda:0', grad_fn=) cls_loss: tensor(3.0518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.8268, device='cuda:0', grad_fn=) cls_loss: tensor(0.1084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(2.5941, device='cuda:0', grad_fn=) cls_loss: tensor(1.2103, device='cuda:0', grad_fn=) cls_loss: tensor(1.3052, device='cuda:0', grad_fn=) cls_loss: tensor(0.4290, device='cuda:0', grad_fn=) cls_loss: tensor(3.9021e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4482, device='cuda:0', grad_fn=) cls_loss: tensor(0.0444, device='cuda:0', grad_fn=) cls_loss: tensor(0.0578, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(4.6660, device='cuda:0', grad_fn=) cls_loss: tensor(0.8666, device='cuda:0', grad_fn=) cls_loss: tensor(1.2978, device='cuda:0', grad_fn=) cls_loss: tensor(2.7318, device='cuda:0', grad_fn=) cls_loss: tensor(2.4821, device='cuda:0', grad_fn=) cls_loss: tensor(1.9340, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.5029, device='cuda:0', grad_fn=) cls_loss: tensor(0.1761, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(2.2703, device='cuda:0', grad_fn=) cls_loss: tensor(1.1376, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(1.1375, device='cuda:0', grad_fn=) cls_loss: tensor(0.2180, device='cuda:0', grad_fn=) cls_loss: tensor(0.9357, device='cuda:0', grad_fn=) cls_loss: tensor(4.0690, device='cuda:0', grad_fn=) cls_loss: tensor(0.7187, device='cuda:0', grad_fn=) cls_loss: tensor(2.3000, device='cuda:0', grad_fn=) cls_loss: tensor(0.7443, device='cuda:0', grad_fn=) cls_loss: tensor(0.1204, device='cuda:0', grad_fn=) cls_loss: tensor(1.0894, device='cuda:0', grad_fn=) cls_loss: tensor(2.8070, device='cuda:0', grad_fn=) cls_loss: tensor(9.9719, device='cuda:0', grad_fn=) cls_loss: tensor(2.1512, device='cuda:0', grad_fn=) cls_loss: tensor(0.8874, device='cuda:0', grad_fn=) cls_loss: tensor(1.5636e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0838, device='cuda:0', grad_fn=) cls_loss: tensor(1.0387, device='cuda:0', grad_fn=) cls_loss: tensor(4.3229, device='cuda:0', grad_fn=) cls_loss: tensor(2.8210, device='cuda:0', grad_fn=) cls_loss: tensor(3.0638, device='cuda:0', grad_fn=) cls_loss: tensor(3.6058, device='cuda:0', grad_fn=) cls_loss: tensor(5.0969, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(2.1453, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(1.3740, device='cuda:0', grad_fn=) cls_loss: tensor(0.4102, device='cuda:0', grad_fn=) cls_loss: tensor(0.5576, device='cuda:0', grad_fn=) cls_loss: tensor(2.5830, device='cuda:0', grad_fn=) cls_loss: tensor(0.0269, device='cuda:0', grad_fn=) cls_loss: tensor(1.1124, device='cuda:0', grad_fn=) cls_loss: tensor(4.0915, device='cuda:0', grad_fn=) cls_loss: tensor(3.3066, device='cuda:0', grad_fn=) cls_loss: tensor(1.8929, device='cuda:0', grad_fn=) cls_loss: tensor(7.5301e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2975e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8066, device='cuda:0', grad_fn=) cls_loss: tensor(3.0078, device='cuda:0', grad_fn=) cls_loss: tensor(6.1413e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0939, device='cuda:0', grad_fn=) cls_loss: tensor(3.1127, device='cuda:0', grad_fn=) cls_loss: tensor(0.6661, device='cuda:0', grad_fn=) cls_loss: tensor(1.0187, device='cuda:0', grad_fn=) cls_loss: tensor(3.3317, device='cuda:0', grad_fn=) cls_loss: tensor(0.5615, device='cuda:0', grad_fn=) cls_loss: tensor(7.3268, device='cuda:0', grad_fn=) cls_loss: tensor(3.5652, device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(5.1145, device='cuda:0', grad_fn=) cls_loss: tensor(1.1729, device='cuda:0', grad_fn=) cls_loss: tensor(2.5007, device='cuda:0', grad_fn=) cls_loss: tensor(6.1641, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.3751, device='cuda:0', grad_fn=) cls_loss: tensor(5.6548, device='cuda:0', grad_fn=) cls_loss: tensor(2.1966, device='cuda:0', grad_fn=) cls_loss: tensor(2.1923, device='cuda:0', grad_fn=) cls_loss: tensor(10.6146, device='cuda:0', grad_fn=) cls_loss: tensor(1.1667, device='cuda:0', grad_fn=) cls_loss: tensor(1.0510, device='cuda:0', grad_fn=) cls_loss: tensor(1.2745, device='cuda:0', grad_fn=) cls_loss: tensor(2.7758, device='cuda:0', grad_fn=) cls_loss: tensor(0.6825, device='cuda:0', grad_fn=) cls_loss: tensor(0.4375, device='cuda:0', grad_fn=) cls_loss: tensor(2.2488, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(1.7504, device='cuda:0', grad_fn=) cls_loss: tensor(1.0656, device='cuda:0', grad_fn=) cls_loss: tensor(3.2285, device='cuda:0', grad_fn=) cls_loss: tensor(2.1042, device='cuda:0', grad_fn=) cls_loss: tensor(1.6211, device='cuda:0', grad_fn=) cls_loss: tensor(5.8164, device='cuda:0', grad_fn=) cls_loss: tensor(1.3884, device='cuda:0', grad_fn=) cls_loss: tensor(5.1302, device='cuda:0', grad_fn=) cls_loss: tensor(1.2260, device='cuda:0', grad_fn=) cls_loss: tensor(0.6213, device='cuda:0', grad_fn=) cls_loss: tensor(1.8013, device='cuda:0', grad_fn=) cls_loss: tensor(0.6790, device='cuda:0', grad_fn=) cls_loss: tensor(0.0341, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(2.6065, device='cuda:0', grad_fn=) cls_loss: tensor(3.5652, device='cuda:0', grad_fn=) cls_loss: tensor(3.7324, device='cuda:0', grad_fn=) cls_loss: tensor(0.3860, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(2.0328, device='cuda:0', grad_fn=) cls_loss: tensor(0.9499, device='cuda:0', grad_fn=) cls_loss: tensor(1.3973, device='cuda:0', grad_fn=) cls_loss: tensor(2.5276, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.7080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(1.6660, device='cuda:0', grad_fn=) cls_loss: tensor(0.0769, device='cuda:0', grad_fn=) cls_loss: tensor(1.0361, device='cuda:0', grad_fn=) cls_loss: tensor(2.4518, device='cuda:0', grad_fn=) cls_loss: tensor(0.0229, device='cuda:0', grad_fn=) cls_loss: tensor(6.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0.7993, device='cuda:0', grad_fn=) cls_loss: tensor(2.8252, device='cuda:0', grad_fn=) cls_loss: tensor(0.0248, device='cuda:0', grad_fn=) cls_loss: tensor(1.9766, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7598, device='cuda:0', grad_fn=) cls_loss: tensor(5.9497, device='cuda:0', grad_fn=) cls_loss: tensor(8.0926, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.2010, device='cuda:0', grad_fn=) cls_loss: tensor(2.5078, device='cuda:0', grad_fn=) cls_loss: tensor(3.8924, device='cuda:0', grad_fn=) cls_loss: tensor(0.0221, device='cuda:0', grad_fn=) cls_loss: tensor(3.3425, device='cuda:0', grad_fn=) cls_loss: tensor(3.0684, device='cuda:0', grad_fn=) cls_loss: tensor(2.6807, device='cuda:0', grad_fn=) cls_loss: tensor(4.0358, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.2244, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.6172, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(1.5688, device='cuda:0', grad_fn=) cls_loss: tensor(1.0560, device='cuda:0', grad_fn=) cls_loss: tensor(1.8223, device='cuda:0', grad_fn=) cls_loss: tensor(0.2384, device='cuda:0', grad_fn=) cls_loss: tensor(0.5646, device='cuda:0', grad_fn=) cls_loss: tensor(0.2547, device='cuda:0', grad_fn=) cls_loss: tensor(5.7031, device='cuda:0', grad_fn=) cls_loss: tensor(1.1094, device='cuda:0', grad_fn=) cls_loss: tensor(2.1328, device='cuda:0', grad_fn=) cls_loss: tensor(3.9466, device='cuda:0', grad_fn=) cls_loss: tensor(5.4871, device='cuda:0', grad_fn=) cls_loss: tensor(0.1286, device='cuda:0', grad_fn=) cls_loss: tensor(2.8753, device='cuda:0', grad_fn=) cls_loss: tensor(0.7414, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(2.9935, device='cuda:0', grad_fn=) cls_loss: tensor(0.1348, device='cuda:0', grad_fn=) cls_loss: tensor(2.8203, device='cuda:0', grad_fn=) cls_loss: tensor(1.0625, device='cuda:0', grad_fn=) cls_loss: tensor(4.0747, device='cuda:0', grad_fn=) cls_loss: tensor(3.7602, device='cuda:0', grad_fn=) cls_loss: tensor(3.7354, device='cuda:0', grad_fn=) cls_loss: tensor(0.5704, device='cuda:0', grad_fn=) cls_loss: tensor(0.0972, device='cuda:0', grad_fn=) cls_loss: tensor(4.1990, device='cuda:0', grad_fn=) cls_loss: tensor(0.7506, device='cuda:0', grad_fn=) cls_loss: tensor(1.8909, device='cuda:0', grad_fn=) cls_loss: tensor(0.5779, device='cuda:0', grad_fn=) cls_loss: tensor(0.2404, device='cuda:0', grad_fn=) cls_loss: tensor(0.0164, device='cuda:0', grad_fn=) cls_loss: tensor(2.9109, device='cuda:0', grad_fn=) cls_loss: tensor(0.0635, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0.4251, device='cuda:0', grad_fn=) cls_loss: tensor(0.2317, device='cuda:0', grad_fn=) cls_loss: tensor(0.3379, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2133, device='cuda:0', grad_fn=) cls_loss: tensor(3.0209, device='cuda:0', grad_fn=) 9.954748808839674e-05 changing lr ---------------------saving model at epoch 3---------------------------------------------------- epoch 3, time 357.65, cls_loss 1.7143 306 cls_loss: tensor(1.0860, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3650, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2860, device='cuda:0', grad_fn=) cls_loss: tensor(0.4709, device='cuda:0', grad_fn=) cls_loss: tensor(0.0723, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0363, device='cuda:0', grad_fn=) cls_loss: tensor(0.9296, device='cuda:0', grad_fn=) cls_loss: tensor(2.7720, device='cuda:0', grad_fn=) cls_loss: tensor(0.2074, device='cuda:0', grad_fn=) cls_loss: tensor(0.3618, device='cuda:0', grad_fn=) cls_loss: tensor(0.1768, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.9838, device='cuda:0', grad_fn=) cls_loss: tensor(1.5425, device='cuda:0', grad_fn=) cls_loss: tensor(1.3653, device='cuda:0', grad_fn=) cls_loss: tensor(0.6294, device='cuda:0', grad_fn=) cls_loss: tensor(1.9434, device='cuda:0', grad_fn=) cls_loss: tensor(0.8800, device='cuda:0', grad_fn=) cls_loss: tensor(0.7494, device='cuda:0', grad_fn=) cls_loss: tensor(2.4331, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0264, device='cuda:0', grad_fn=) cls_loss: tensor(0.9206, device='cuda:0', grad_fn=) cls_loss: tensor(0.5483, device='cuda:0', grad_fn=) cls_loss: tensor(1.1399, device='cuda:0', grad_fn=) cls_loss: tensor(5.9107, device='cuda:0', grad_fn=) cls_loss: tensor(2.8335, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3923, device='cuda:0', grad_fn=) cls_loss: tensor(1.4797, device='cuda:0', grad_fn=) cls_loss: tensor(1.0807, device='cuda:0', grad_fn=) cls_loss: tensor(1.6797, device='cuda:0', grad_fn=) cls_loss: tensor(1.9795, device='cuda:0', grad_fn=) cls_loss: tensor(0.1105, device='cuda:0', grad_fn=) cls_loss: tensor(1.8021, device='cuda:0', grad_fn=) cls_loss: tensor(4.8210, device='cuda:0', grad_fn=) cls_loss: tensor(2.3398, device='cuda:0', grad_fn=) cls_loss: tensor(0.9988, device='cuda:0', grad_fn=) cls_loss: tensor(0.0238, device='cuda:0', grad_fn=) cls_loss: tensor(4.7525e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.4881, device='cuda:0', grad_fn=) cls_loss: tensor(0.5716, device='cuda:0', grad_fn=) cls_loss: tensor(2.1410, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(4.7663, device='cuda:0', grad_fn=) cls_loss: tensor(0.9979, device='cuda:0', grad_fn=) cls_loss: tensor(2.8528, device='cuda:0', grad_fn=) cls_loss: tensor(0.0158, device='cuda:0', grad_fn=) cls_loss: tensor(2.7574, device='cuda:0', grad_fn=) cls_loss: tensor(1.4226, device='cuda:0', grad_fn=) cls_loss: tensor(4.9783, device='cuda:0', grad_fn=) cls_loss: tensor(0.4208, device='cuda:0', grad_fn=) cls_loss: tensor(0.1616, device='cuda:0', grad_fn=) cls_loss: tensor(0.0154, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(5.0441, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.9645, device='cuda:0', grad_fn=) cls_loss: tensor(1.7293, device='cuda:0', grad_fn=) cls_loss: tensor(1.3542, device='cuda:0', grad_fn=) cls_loss: tensor(4.5148, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.1472, device='cuda:0', grad_fn=) cls_loss: tensor(0.0910, device='cuda:0', grad_fn=) cls_loss: tensor(2.9017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3786, device='cuda:0', grad_fn=) cls_loss: tensor(1.8187, device='cuda:0', grad_fn=) cls_loss: tensor(0.1180, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.7422, device='cuda:0', grad_fn=) cls_loss: tensor(1.4010, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4868, device='cuda:0', grad_fn=) cls_loss: tensor(0.6494, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7454, device='cuda:0', grad_fn=) cls_loss: tensor(1.8223, device='cuda:0', grad_fn=) cls_loss: tensor(0.0700, device='cuda:0', grad_fn=) cls_loss: tensor(0.0222, device='cuda:0', grad_fn=) cls_loss: tensor(0.2790, device='cuda:0', grad_fn=) cls_loss: tensor(0.1192, device='cuda:0', grad_fn=) cls_loss: tensor(0.1397, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9661, device='cuda:0', grad_fn=) cls_loss: tensor(0.0201, device='cuda:0', grad_fn=) cls_loss: tensor(1.8947, device='cuda:0', grad_fn=) cls_loss: tensor(0.0594, device='cuda:0', grad_fn=) cls_loss: tensor(0.6803, device='cuda:0', grad_fn=) cls_loss: tensor(1.5431, device='cuda:0', grad_fn=) cls_loss: tensor(0.3965, device='cuda:0', grad_fn=) cls_loss: tensor(1.6837, device='cuda:0', grad_fn=) cls_loss: tensor(0.3758, device='cuda:0', grad_fn=) cls_loss: tensor(6.6864, device='cuda:0', grad_fn=) cls_loss: tensor(2.3972, device='cuda:0', grad_fn=) cls_loss: tensor(1.2873, device='cuda:0', grad_fn=) cls_loss: tensor(4.7644e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1383, device='cuda:0', grad_fn=) cls_loss: tensor(0.9671, device='cuda:0', grad_fn=) cls_loss: tensor(3.8425e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1847, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.7572, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8539, device='cuda:0', grad_fn=) cls_loss: tensor(0.4122, device='cuda:0', grad_fn=) cls_loss: tensor(1.8283, device='cuda:0', grad_fn=) cls_loss: tensor(2.1676, device='cuda:0', grad_fn=) cls_loss: tensor(0.5694, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0344, device='cuda:0', grad_fn=) cls_loss: tensor(1.2445, device='cuda:0', grad_fn=) cls_loss: tensor(0.0385, device='cuda:0', grad_fn=) cls_loss: tensor(0.0684, device='cuda:0', grad_fn=) cls_loss: tensor(0.0159, device='cuda:0', grad_fn=) cls_loss: tensor(11.4479, device='cuda:0', grad_fn=) cls_loss: tensor(2.0174, device='cuda:0', grad_fn=) cls_loss: tensor(0.2630, device='cuda:0', grad_fn=) cls_loss: tensor(2.5968e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0559, device='cuda:0', grad_fn=) cls_loss: tensor(0.8848, device='cuda:0', grad_fn=) cls_loss: tensor(0.4077, device='cuda:0', grad_fn=) cls_loss: tensor(1.8620, device='cuda:0', grad_fn=) cls_loss: tensor(1.1908, device='cuda:0', grad_fn=) cls_loss: tensor(3.3677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1460, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(1.6738, device='cuda:0', grad_fn=) cls_loss: tensor(0.0634, device='cuda:0', grad_fn=) cls_loss: tensor(1.3815, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(0.1569, device='cuda:0', grad_fn=) cls_loss: tensor(0.5140, device='cuda:0', grad_fn=) cls_loss: tensor(1.4099, device='cuda:0', grad_fn=) cls_loss: tensor(4.7266, device='cuda:0', grad_fn=) cls_loss: tensor(2.6804, device='cuda:0', grad_fn=) cls_loss: tensor(1.7546, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.7090, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.2105, device='cuda:0', grad_fn=) cls_loss: tensor(1.4427, device='cuda:0', grad_fn=) cls_loss: tensor(0.0288, device='cuda:0', grad_fn=) cls_loss: tensor(0.0475, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(8.9944, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3236, device='cuda:0', grad_fn=) cls_loss: tensor(1.6352, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3434, device='cuda:0', grad_fn=) cls_loss: tensor(0.5579, device='cuda:0', grad_fn=) cls_loss: tensor(3.2147e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4782, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.5251, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0181, device='cuda:0', grad_fn=) cls_loss: tensor(8.6824e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1148, device='cuda:0', grad_fn=) cls_loss: tensor(5.0469, device='cuda:0', grad_fn=) cls_loss: tensor(1.2498, device='cuda:0', grad_fn=) cls_loss: tensor(1.1367, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.3169, device='cuda:0', grad_fn=) cls_loss: tensor(2.5961, device='cuda:0', grad_fn=) cls_loss: tensor(0.7321, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8577, device='cuda:0', grad_fn=) cls_loss: tensor(1.2509, device='cuda:0', grad_fn=) cls_loss: tensor(0.3986, device='cuda:0', grad_fn=) cls_loss: tensor(0.0242, device='cuda:0', grad_fn=) cls_loss: tensor(0.3290, device='cuda:0', grad_fn=) cls_loss: tensor(1.9726, device='cuda:0', grad_fn=) cls_loss: tensor(3.5095, device='cuda:0', grad_fn=) cls_loss: tensor(1.3620, device='cuda:0', grad_fn=) cls_loss: tensor(0.0649, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3423, device='cuda:0', grad_fn=) cls_loss: tensor(0.4475, device='cuda:0', grad_fn=) cls_loss: tensor(1.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.8335, device='cuda:0', grad_fn=) cls_loss: tensor(0.2647, device='cuda:0', grad_fn=) cls_loss: tensor(1.8483, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1895, device='cuda:0', grad_fn=) cls_loss: tensor(2.8025, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2294, device='cuda:0', grad_fn=) cls_loss: tensor(5.8582, device='cuda:0', grad_fn=) cls_loss: tensor(1.2109, device='cuda:0', grad_fn=) cls_loss: tensor(2.7288, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.1739, device='cuda:0', grad_fn=) cls_loss: tensor(2.4062, device='cuda:0', grad_fn=) cls_loss: tensor(3.1047, device='cuda:0', grad_fn=) cls_loss: tensor(2.6999, device='cuda:0', grad_fn=) cls_loss: tensor(0.3731, device='cuda:0', grad_fn=) cls_loss: tensor(0.1379, device='cuda:0', grad_fn=) cls_loss: tensor(2.7916, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.9006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0798, device='cuda:0', grad_fn=) cls_loss: tensor(2.4844, device='cuda:0', grad_fn=) cls_loss: tensor(0.0372, device='cuda:0', grad_fn=) cls_loss: tensor(3.1974, device='cuda:0', grad_fn=) cls_loss: tensor(3.9298, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(4.1303, device='cuda:0', grad_fn=) cls_loss: tensor(3.7018, device='cuda:0', grad_fn=) cls_loss: tensor(1.5885, device='cuda:0', grad_fn=) cls_loss: tensor(0.0698, device='cuda:0', grad_fn=) cls_loss: tensor(1.5677, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(6.2552, device='cuda:0', grad_fn=) cls_loss: tensor(1.3101, device='cuda:0', grad_fn=) cls_loss: tensor(4.7223, device='cuda:0', grad_fn=) cls_loss: tensor(0.0854, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.2931, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(4.2803, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.1490, device='cuda:0', grad_fn=) cls_loss: tensor(1.9610, device='cuda:0', grad_fn=) cls_loss: tensor(2.0941, device='cuda:0', grad_fn=) cls_loss: tensor(5.8134e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2018, device='cuda:0', grad_fn=) cls_loss: tensor(3.8079, device='cuda:0', grad_fn=) cls_loss: tensor(0.9820, device='cuda:0', grad_fn=) cls_loss: tensor(3.6374, device='cuda:0', grad_fn=) cls_loss: tensor(1.3119, device='cuda:0', grad_fn=) cls_loss: tensor(0.8713, device='cuda:0', grad_fn=) cls_loss: tensor(1.8841, device='cuda:0', grad_fn=) cls_loss: tensor(0.5384, device='cuda:0', grad_fn=) cls_loss: tensor(0.1836, device='cuda:0', grad_fn=) cls_loss: tensor(0.5967, device='cuda:0', grad_fn=) cls_loss: tensor(4.0134e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6152, device='cuda:0', grad_fn=) cls_loss: tensor(0.2445, device='cuda:0', grad_fn=) cls_loss: tensor(0.2498, device='cuda:0', grad_fn=) cls_loss: tensor(1.4596, device='cuda:0', grad_fn=) cls_loss: tensor(1.1643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.7537, device='cuda:0', grad_fn=) cls_loss: tensor(0.0520, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(3.1627, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.6406, device='cuda:0', grad_fn=) cls_loss: tensor(2.4847, device='cuda:0', grad_fn=) cls_loss: tensor(4.2756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5426, device='cuda:0', grad_fn=) cls_loss: tensor(1.4272, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(2.8674, device='cuda:0', grad_fn=) cls_loss: tensor(0.1104, device='cuda:0', grad_fn=) cls_loss: tensor(0.3022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0849, device='cuda:0', grad_fn=) cls_loss: tensor(4.4232, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(2.5642, device='cuda:0', grad_fn=) cls_loss: tensor(1.8281, device='cuda:0', grad_fn=) cls_loss: tensor(2.5892, device='cuda:0', grad_fn=) cls_loss: tensor(0.1689, device='cuda:0', grad_fn=) cls_loss: tensor(1.9924, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9126, device='cuda:0', grad_fn=) cls_loss: tensor(4.8164, device='cuda:0', grad_fn=) cls_loss: tensor(0.2096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0264, device='cuda:0', grad_fn=) cls_loss: tensor(1.1781, device='cuda:0', grad_fn=) cls_loss: tensor(0.0929, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(1.8243, device='cuda:0', grad_fn=) cls_loss: tensor(0.0203, device='cuda:0', grad_fn=) cls_loss: tensor(2.3984, device='cuda:0', grad_fn=) cls_loss: tensor(0.2640, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) 9.919647942993148e-05 changing lr ---------------------saving model at epoch 4---------------------------------------------------- epoch 4, time 360.90, cls_loss 1.1780 306 cls_loss: tensor(2.0266e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8538, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.8854, device='cuda:0', grad_fn=) cls_loss: tensor(1.5283, device='cuda:0', grad_fn=) cls_loss: tensor(0.0237, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.6263, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.1523, device='cuda:0', grad_fn=) cls_loss: tensor(1.0689e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1907, device='cuda:0', grad_fn=) cls_loss: tensor(2.4388, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5086, device='cuda:0', grad_fn=) cls_loss: tensor(1.2795, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.7037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.5977, device='cuda:0', grad_fn=) cls_loss: tensor(1.3729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(1.2676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2910, device='cuda:0', grad_fn=) cls_loss: tensor(0.7585, device='cuda:0', grad_fn=) cls_loss: tensor(1.4850, device='cuda:0', grad_fn=) cls_loss: tensor(3.0652, device='cuda:0', grad_fn=) cls_loss: tensor(0.0450, device='cuda:0', grad_fn=) cls_loss: tensor(0.5827, device='cuda:0', grad_fn=) cls_loss: tensor(3.6538e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7732, device='cuda:0', grad_fn=) cls_loss: tensor(0.0131, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0553, device='cuda:0', grad_fn=) cls_loss: tensor(0.0432, device='cuda:0', grad_fn=) cls_loss: tensor(0.0377, device='cuda:0', grad_fn=) cls_loss: tensor(0.8213, device='cuda:0', grad_fn=) cls_loss: tensor(2.9922e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.6098, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2309, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(3.3180e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.6796, device='cuda:0', grad_fn=) cls_loss: tensor(1.9368, device='cuda:0', grad_fn=) cls_loss: tensor(1.9995, device='cuda:0', grad_fn=) cls_loss: tensor(0.8555, device='cuda:0', grad_fn=) cls_loss: tensor(0.0797, device='cuda:0', grad_fn=) cls_loss: tensor(4.7624, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.2161e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.6728, device='cuda:0', grad_fn=) cls_loss: tensor(2.2431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.2176, device='cuda:0', grad_fn=) cls_loss: tensor(0.7484, device='cuda:0', grad_fn=) cls_loss: tensor(0.0990, device='cuda:0', grad_fn=) cls_loss: tensor(1.2124, device='cuda:0', grad_fn=) cls_loss: tensor(2.0391, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.2891, device='cuda:0', grad_fn=) cls_loss: tensor(3.9121e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1839, device='cuda:0', grad_fn=) cls_loss: tensor(2.1442, device='cuda:0', grad_fn=) cls_loss: tensor(1.1422, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7611, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(2.8844, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(3.3812, device='cuda:0', grad_fn=) cls_loss: tensor(9.2584, device='cuda:0', grad_fn=) cls_loss: tensor(1.4250, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(1.6034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1834, device='cuda:0', grad_fn=) cls_loss: tensor(4.6690e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0504, device='cuda:0', grad_fn=) cls_loss: tensor(1.1302, device='cuda:0', grad_fn=) cls_loss: tensor(5.2141, device='cuda:0', grad_fn=) cls_loss: tensor(0.4482, device='cuda:0', grad_fn=) cls_loss: tensor(3.6178, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.5963e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0147, device='cuda:0', grad_fn=) cls_loss: tensor(1.8845, device='cuda:0', grad_fn=) cls_loss: tensor(2.0524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5156, device='cuda:0', grad_fn=) cls_loss: tensor(0.1121, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.5148, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5542, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0458, device='cuda:0', grad_fn=) cls_loss: tensor(1.3731, device='cuda:0', grad_fn=) cls_loss: tensor(1.9828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0389, device='cuda:0', grad_fn=) cls_loss: tensor(0.7868, device='cuda:0', grad_fn=) cls_loss: tensor(5.2730e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5614, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.3008, device='cuda:0', grad_fn=) cls_loss: tensor(1.3320, device='cuda:0', grad_fn=) cls_loss: tensor(0.3180, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7904, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.7376, device='cuda:0', grad_fn=) cls_loss: tensor(9.6679e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2886, device='cuda:0', grad_fn=) cls_loss: tensor(1.9582, device='cuda:0', grad_fn=) cls_loss: tensor(0.6520, device='cuda:0', grad_fn=) cls_loss: tensor(0.1014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(2.5709e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2795, device='cuda:0', grad_fn=) cls_loss: tensor(7.9493e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1254, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1971, device='cuda:0', grad_fn=) cls_loss: tensor(0.0167, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.6205, device='cuda:0', grad_fn=) cls_loss: tensor(3.7005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0391, device='cuda:0', grad_fn=) cls_loss: tensor(0.8255, device='cuda:0', grad_fn=) cls_loss: tensor(2.1595, device='cuda:0', grad_fn=) cls_loss: tensor(3.7616, device='cuda:0', grad_fn=) cls_loss: tensor(2.2936, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.1180, device='cuda:0', grad_fn=) cls_loss: tensor(0.0420, device='cuda:0', grad_fn=) cls_loss: tensor(0.6377, device='cuda:0', grad_fn=) cls_loss: tensor(2.0171, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8426, device='cuda:0', grad_fn=) cls_loss: tensor(1.3581, device='cuda:0', grad_fn=) cls_loss: tensor(0.4297, device='cuda:0', grad_fn=) cls_loss: tensor(0.0285, device='cuda:0', grad_fn=) cls_loss: tensor(0.0180, device='cuda:0', grad_fn=) cls_loss: tensor(0.5199, device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0712, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.6751, device='cuda:0', grad_fn=) cls_loss: tensor(4.1081, device='cuda:0', grad_fn=) cls_loss: tensor(1.0595, device='cuda:0', grad_fn=) cls_loss: tensor(0.2561, device='cuda:0', grad_fn=) cls_loss: tensor(0.0640, device='cuda:0', grad_fn=) cls_loss: tensor(4.2905, device='cuda:0', grad_fn=) cls_loss: tensor(0.5033, device='cuda:0', grad_fn=) cls_loss: tensor(1.3687, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0363, device='cuda:0', grad_fn=) cls_loss: tensor(0.0246, device='cuda:0', grad_fn=) cls_loss: tensor(0.9437, device='cuda:0', grad_fn=) cls_loss: tensor(1.6642, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.8239, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.1589, device='cuda:0', grad_fn=) cls_loss: tensor(0.4824, device='cuda:0', grad_fn=) cls_loss: tensor(3.5384, device='cuda:0', grad_fn=) cls_loss: tensor(0.9828, device='cuda:0', grad_fn=) cls_loss: tensor(3.6946, device='cuda:0', grad_fn=) cls_loss: tensor(0.1311, device='cuda:0', grad_fn=) cls_loss: tensor(1.1312, device='cuda:0', grad_fn=) cls_loss: tensor(0.4092, device='cuda:0', grad_fn=) cls_loss: tensor(3.4403, device='cuda:0', grad_fn=) cls_loss: tensor(0.0871, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.5780, device='cuda:0', grad_fn=) cls_loss: tensor(0.0209, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.4623, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.9669, device='cuda:0', grad_fn=) cls_loss: tensor(2.5690, device='cuda:0', grad_fn=) cls_loss: tensor(1.2297, device='cuda:0', grad_fn=) cls_loss: tensor(0.4303, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(2.5859, device='cuda:0', grad_fn=) cls_loss: tensor(0.4012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0352, device='cuda:0', grad_fn=) cls_loss: tensor(2.3659, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(2.1876, device='cuda:0', grad_fn=) cls_loss: tensor(0.2048, device='cuda:0', grad_fn=) cls_loss: tensor(1.5291, device='cuda:0', grad_fn=) cls_loss: tensor(0.3783, device='cuda:0', grad_fn=) cls_loss: tensor(9.4056e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7058, device='cuda:0', grad_fn=) cls_loss: tensor(2.9273, device='cuda:0', grad_fn=) cls_loss: tensor(0.9652, device='cuda:0', grad_fn=) cls_loss: tensor(0.7536, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(7.7685e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2561, device='cuda:0', grad_fn=) cls_loss: tensor(0.2661, device='cuda:0', grad_fn=) cls_loss: tensor(1.3854, device='cuda:0', grad_fn=) cls_loss: tensor(0.4721, device='cuda:0', grad_fn=) cls_loss: tensor(1.0881, device='cuda:0', grad_fn=) cls_loss: tensor(0.2441, device='cuda:0', grad_fn=) cls_loss: tensor(0.6373, device='cuda:0', grad_fn=) cls_loss: tensor(2.2726, device='cuda:0', grad_fn=) cls_loss: tensor(0.0566, device='cuda:0', grad_fn=) cls_loss: tensor(2.1349, device='cuda:0', grad_fn=) cls_loss: tensor(0.5433, device='cuda:0', grad_fn=) cls_loss: tensor(1.4841, device='cuda:0', grad_fn=) cls_loss: tensor(3.8623, device='cuda:0', grad_fn=) cls_loss: tensor(0.3249, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.6342, device='cuda:0', grad_fn=) cls_loss: tensor(1.3958, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.8622, device='cuda:0', grad_fn=) cls_loss: tensor(7.8555, device='cuda:0', grad_fn=) cls_loss: tensor(0.0224, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1557, device='cuda:0', grad_fn=) cls_loss: tensor(1.8052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0976, device='cuda:0', grad_fn=) cls_loss: tensor(0.0430, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0751, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.9972, device='cuda:0', grad_fn=) cls_loss: tensor(0.6715, device='cuda:0', grad_fn=) cls_loss: tensor(1.0769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2791, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.9799, device='cuda:0', grad_fn=) cls_loss: tensor(2.2012, device='cuda:0', grad_fn=) cls_loss: tensor(2.2541, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.5841, device='cuda:0', grad_fn=) cls_loss: tensor(4.7325, device='cuda:0', grad_fn=) cls_loss: tensor(0.3667, device='cuda:0', grad_fn=) cls_loss: tensor(0.7741, device='cuda:0', grad_fn=) cls_loss: tensor(0.5241, device='cuda:0', grad_fn=) cls_loss: tensor(0.4893, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0680, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.4466, device='cuda:0', grad_fn=) cls_loss: tensor(1.3468, device='cuda:0', grad_fn=) cls_loss: tensor(0.9471, device='cuda:0', grad_fn=) cls_loss: tensor(0.7156, device='cuda:0', grad_fn=) cls_loss: tensor(0.4038, device='cuda:0', grad_fn=) cls_loss: tensor(4.8260e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.9428, device='cuda:0', grad_fn=) cls_loss: tensor(8.4639e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.4406, device='cuda:0', grad_fn=) cls_loss: tensor(0.1428, device='cuda:0', grad_fn=) cls_loss: tensor(0.0797, device='cuda:0', grad_fn=) cls_loss: tensor(0.0459, device='cuda:0', grad_fn=) cls_loss: tensor(0.0781, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.6823, device='cuda:0', grad_fn=) cls_loss: tensor(2.3274, device='cuda:0', grad_fn=) cls_loss: tensor(0.0269, device='cuda:0', grad_fn=) cls_loss: tensor(1.3689e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3041, device='cuda:0', grad_fn=) cls_loss: tensor(0.3446, device='cuda:0', grad_fn=) cls_loss: tensor(5.6702, device='cuda:0', grad_fn=) cls_loss: tensor(0.0252, device='cuda:0', grad_fn=) cls_loss: tensor(0.9307, device='cuda:0', grad_fn=) cls_loss: tensor(2.5568, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.4830, device='cuda:0', grad_fn=) cls_loss: tensor(1.1688, device='cuda:0', grad_fn=) cls_loss: tensor(0.7195, device='cuda:0', grad_fn=) cls_loss: tensor(6.3181e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(1.5478, device='cuda:0', grad_fn=) cls_loss: tensor(0.4717, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0992, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0154, device='cuda:0', grad_fn=) cls_loss: tensor(0.0257, device='cuda:0', grad_fn=) cls_loss: tensor(1.2592, device='cuda:0', grad_fn=) cls_loss: tensor(0.8451, device='cuda:0', grad_fn=) cls_loss: tensor(9.7812, device='cuda:0', grad_fn=) cls_loss: tensor(1.1311, device='cuda:0', grad_fn=) cls_loss: tensor(0.5583, device='cuda:0', grad_fn=) 9.874639560909117e-05 changing lr epoch 5, time 357.06, cls_loss 0.9013 306 cls_loss: tensor(0.5313, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6253, device='cuda:0', grad_fn=) cls_loss: tensor(0.0722, device='cuda:0', grad_fn=) cls_loss: tensor(2.4964, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8497e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.4755, device='cuda:0', grad_fn=) cls_loss: tensor(2.4720, device='cuda:0', grad_fn=) cls_loss: tensor(0.1097, device='cuda:0', grad_fn=) cls_loss: tensor(0.5009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(1.0630, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.2395, device='cuda:0', grad_fn=) cls_loss: tensor(0.0187, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.5101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(0.7854, device='cuda:0', grad_fn=) cls_loss: tensor(0.1808, device='cuda:0', grad_fn=) cls_loss: tensor(0.3281, device='cuda:0', grad_fn=) cls_loss: tensor(4.4419, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2939, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(3.1292e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8034, device='cuda:0', grad_fn=) cls_loss: tensor(0.4277, device='cuda:0', grad_fn=) cls_loss: tensor(0.0377, device='cuda:0', grad_fn=) cls_loss: tensor(0.3986, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(2.0088, device='cuda:0', grad_fn=) cls_loss: tensor(1.8854, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(2.3294, device='cuda:0', grad_fn=) cls_loss: tensor(0.0152, device='cuda:0', grad_fn=) cls_loss: tensor(0.8151, device='cuda:0', grad_fn=) cls_loss: tensor(3.6320, device='cuda:0', grad_fn=) cls_loss: tensor(1.6133, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9804e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.9097, device='cuda:0', grad_fn=) cls_loss: tensor(1.3757, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4429, device='cuda:0', grad_fn=) cls_loss: tensor(3.2441, device='cuda:0', grad_fn=) cls_loss: tensor(0.7182, device='cuda:0', grad_fn=) cls_loss: tensor(1.2710, device='cuda:0', grad_fn=) cls_loss: tensor(1.8611, device='cuda:0', grad_fn=) cls_loss: tensor(4.9889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0919, device='cuda:0', grad_fn=) cls_loss: tensor(2.6270, device='cuda:0', grad_fn=) cls_loss: tensor(0.8968, device='cuda:0', grad_fn=) cls_loss: tensor(2.2132, device='cuda:0', grad_fn=) cls_loss: tensor(0.0601, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.2124, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4847, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2192, device='cuda:0', grad_fn=) cls_loss: tensor(0.3053, device='cuda:0', grad_fn=) cls_loss: tensor(1.4225, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3096, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0198, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.9903, device='cuda:0', grad_fn=) cls_loss: tensor(2.3250, device='cuda:0', grad_fn=) cls_loss: tensor(0.0152, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9759, device='cuda:0', grad_fn=) cls_loss: tensor(0.3018, device='cuda:0', grad_fn=) cls_loss: tensor(0.5734, device='cuda:0', grad_fn=) cls_loss: tensor(0.2630, device='cuda:0', grad_fn=) cls_loss: tensor(0.5107, device='cuda:0', grad_fn=) cls_loss: tensor(0.1080, device='cuda:0', grad_fn=) cls_loss: tensor(0.1305, device='cuda:0', grad_fn=) cls_loss: tensor(5.6783e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1546, device='cuda:0', grad_fn=) cls_loss: tensor(0.7792, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(1.5961, device='cuda:0', grad_fn=) cls_loss: tensor(0.7050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0983, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0676, device='cuda:0', grad_fn=) cls_loss: tensor(0.8119, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312, device='cuda:0', grad_fn=) cls_loss: tensor(3.1908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.2217, device='cuda:0', grad_fn=) cls_loss: tensor(2.0901, device='cuda:0', grad_fn=) cls_loss: tensor(0.1214, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(7.0532e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(1.6055, device='cuda:0', grad_fn=) cls_loss: tensor(0.9134, device='cuda:0', grad_fn=) cls_loss: tensor(0.2381, device='cuda:0', grad_fn=) cls_loss: tensor(1.1771, device='cuda:0', grad_fn=) cls_loss: tensor(0.0190, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.9525, device='cuda:0', grad_fn=) cls_loss: tensor(0.7416, device='cuda:0', grad_fn=) cls_loss: tensor(2.5124, device='cuda:0', grad_fn=) cls_loss: tensor(0.3609, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1638, device='cuda:0', grad_fn=) cls_loss: tensor(1.7194, device='cuda:0', grad_fn=) cls_loss: tensor(0.5875, device='cuda:0', grad_fn=) cls_loss: tensor(0.5576, device='cuda:0', grad_fn=) cls_loss: tensor(0.7787, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.4020, device='cuda:0', grad_fn=) cls_loss: tensor(0.3702, device='cuda:0', grad_fn=) cls_loss: tensor(0.6810, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(4.6690e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.7402, device='cuda:0', grad_fn=) cls_loss: tensor(1.9206, device='cuda:0', grad_fn=) cls_loss: tensor(0.8112, device='cuda:0', grad_fn=) cls_loss: tensor(1.6833, device='cuda:0', grad_fn=) cls_loss: tensor(2.8834, device='cuda:0', grad_fn=) cls_loss: tensor(0.0497, device='cuda:0', grad_fn=) cls_loss: tensor(1.2819, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(2.9278, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0135, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.2533, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.2005, device='cuda:0', grad_fn=) cls_loss: tensor(2.2175, device='cuda:0', grad_fn=) cls_loss: tensor(3.0150, device='cuda:0', grad_fn=) cls_loss: tensor(1.6211, device='cuda:0', grad_fn=) cls_loss: tensor(5.5417, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.4803, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.3576, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(4.4405e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(3.4193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7355, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2939, device='cuda:0', grad_fn=) cls_loss: tensor(0.1829, device='cuda:0', grad_fn=) cls_loss: tensor(0.3238, device='cuda:0', grad_fn=) cls_loss: tensor(1.3817, device='cuda:0', grad_fn=) cls_loss: tensor(2.0862e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.8399, device='cuda:0', grad_fn=) cls_loss: tensor(2.6609, device='cuda:0', grad_fn=) cls_loss: tensor(1.8756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.9580, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(4.6035e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(9.6043e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2696, device='cuda:0', grad_fn=) cls_loss: tensor(2.4656e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.0896, device='cuda:0', grad_fn=) cls_loss: tensor(3.6399e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7676, device='cuda:0', grad_fn=) cls_loss: tensor(0.2183, device='cuda:0', grad_fn=) cls_loss: tensor(0.1716, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0412, device='cuda:0', grad_fn=) cls_loss: tensor(1.5823, device='cuda:0', grad_fn=) cls_loss: tensor(1.9727, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0264, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(2.4339, device='cuda:0', grad_fn=) cls_loss: tensor(0.3896, device='cuda:0', grad_fn=) cls_loss: tensor(7.5956e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.6599, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.3035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(3.4524, device='cuda:0', grad_fn=) cls_loss: tensor(0.0312, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(1.3926, device='cuda:0', grad_fn=) cls_loss: tensor(0.6378, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(1.6784, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(1.1683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2417, device='cuda:0', grad_fn=) cls_loss: tensor(0.3811, device='cuda:0', grad_fn=) cls_loss: tensor(0.3449, device='cuda:0', grad_fn=) cls_loss: tensor(2.6252, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0316, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(6.6607, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.9596, device='cuda:0', grad_fn=) cls_loss: tensor(1.3173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3341, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(1.1580, device='cuda:0', grad_fn=) cls_loss: tensor(1.6925, device='cuda:0', grad_fn=) cls_loss: tensor(0.3313, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1936, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.0859, device='cuda:0', grad_fn=) cls_loss: tensor(1.0990, device='cuda:0', grad_fn=) cls_loss: tensor(0.9204, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.0767, device='cuda:0', grad_fn=) cls_loss: tensor(3.5961e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8751, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0576, device='cuda:0', grad_fn=) cls_loss: tensor(0.1606, device='cuda:0', grad_fn=) cls_loss: tensor(8.0884e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0.5247, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.3394, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(1.8027, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0259, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0459, device='cuda:0', grad_fn=) cls_loss: tensor(2.3669, device='cuda:0', grad_fn=) cls_loss: tensor(0.9178, device='cuda:0', grad_fn=) cls_loss: tensor(4.1251, device='cuda:0', grad_fn=) cls_loss: tensor(5.7042e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.5435, device='cuda:0', grad_fn=) cls_loss: tensor(0.0133, device='cuda:0', grad_fn=) cls_loss: tensor(0.0779, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(1.2894e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9925, device='cuda:0', grad_fn=) cls_loss: tensor(3.8092, device='cuda:0', grad_fn=) cls_loss: tensor(5.3359, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.2158, device='cuda:0', grad_fn=) cls_loss: tensor(1.3462, device='cuda:0', grad_fn=) cls_loss: tensor(0.2791, device='cuda:0', grad_fn=) cls_loss: tensor(5.9780, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(2.7021e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.9792, device='cuda:0', grad_fn=) cls_loss: tensor(5.5234e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1071, device='cuda:0', grad_fn=) cls_loss: tensor(0.4389, device='cuda:0', grad_fn=) cls_loss: tensor(4.2384, device='cuda:0', grad_fn=) cls_loss: tensor(0.3577, device='cuda:0', grad_fn=) cls_loss: tensor(0.9060, device='cuda:0', grad_fn=) cls_loss: tensor(3.1115, device='cuda:0', grad_fn=) cls_loss: tensor(0.0409, device='cuda:0', grad_fn=) cls_loss: tensor(3.6001e-05, device='cuda:0', grad_fn=) 9.819814303479267e-05 changing lr ---------------------saving model at epoch 6---------------------------------------------------- epoch 6, time 355.43, cls_loss 0.7067 306 cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.5811, device='cuda:0', grad_fn=) cls_loss: tensor(2.1304, device='cuda:0', grad_fn=) cls_loss: tensor(0.3756, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.6207e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7354, device='cuda:0', grad_fn=) cls_loss: tensor(6.8943e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1999, device='cuda:0', grad_fn=) cls_loss: tensor(0.8229, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.3169, device='cuda:0', grad_fn=) cls_loss: tensor(0.9968, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4135e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4349, device='cuda:0', grad_fn=) cls_loss: tensor(1.2219e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0604, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.4723e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7914, device='cuda:0', grad_fn=) cls_loss: tensor(0.0742, device='cuda:0', grad_fn=) cls_loss: tensor(0.6383, device='cuda:0', grad_fn=) cls_loss: tensor(0.2222, device='cuda:0', grad_fn=) cls_loss: tensor(0.7422, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0204, device='cuda:0', grad_fn=) cls_loss: tensor(0.8141, device='cuda:0', grad_fn=) cls_loss: tensor(0.0339, device='cuda:0', grad_fn=) cls_loss: tensor(1.3893, device='cuda:0', grad_fn=) cls_loss: tensor(0.0548, device='cuda:0', grad_fn=) cls_loss: tensor(2.4931, device='cuda:0', grad_fn=) cls_loss: tensor(0.0314, device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(4.9710e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5485, device='cuda:0', grad_fn=) cls_loss: tensor(3.5230, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4983, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0448, device='cuda:0', grad_fn=) cls_loss: tensor(5.4042e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0932, device='cuda:0', grad_fn=) cls_loss: tensor(0.4027, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1516, device='cuda:0', grad_fn=) cls_loss: tensor(1.5724, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.4280e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.5876e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(5.2452e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(3.5624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0625, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1991, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.5904, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.1828, device='cuda:0', grad_fn=) cls_loss: tensor(4.5684, device='cuda:0', grad_fn=) cls_loss: tensor(0.0543, device='cuda:0', grad_fn=) cls_loss: tensor(7.0790e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2646, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0163, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6517, device='cuda:0', grad_fn=) cls_loss: tensor(0.1064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(4.7240, device='cuda:0', grad_fn=) cls_loss: tensor(1.5181, device='cuda:0', grad_fn=) cls_loss: tensor(0.0162, device='cuda:0', grad_fn=) cls_loss: tensor(3.9316, device='cuda:0', grad_fn=) cls_loss: tensor(3.4655, device='cuda:0', grad_fn=) cls_loss: tensor(0.3965, device='cuda:0', grad_fn=) cls_loss: tensor(1.9144, device='cuda:0', grad_fn=) cls_loss: tensor(2.2676, device='cuda:0', grad_fn=) cls_loss: tensor(0.7098, device='cuda:0', grad_fn=) cls_loss: tensor(0.4084, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.3121, device='cuda:0', grad_fn=) cls_loss: tensor(3.0637e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8215e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0309, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.3864, device='cuda:0', grad_fn=) cls_loss: tensor(1.1647, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.9643, device='cuda:0', grad_fn=) cls_loss: tensor(1.5514, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.8281e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1306, device='cuda:0', grad_fn=) cls_loss: tensor(0.1279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0949, device='cuda:0', grad_fn=) cls_loss: tensor(0.2241, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.8086, device='cuda:0', grad_fn=) cls_loss: tensor(0.9069, device='cuda:0', grad_fn=) cls_loss: tensor(0.1350, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9339e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3983, device='cuda:0', grad_fn=) cls_loss: tensor(0.7987, device='cuda:0', grad_fn=) cls_loss: tensor(0.1701, device='cuda:0', grad_fn=) cls_loss: tensor(0.5782, device='cuda:0', grad_fn=) cls_loss: tensor(1.5993, device='cuda:0', grad_fn=) cls_loss: tensor(1.8125, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(4.1681, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.2878, device='cuda:0', grad_fn=) cls_loss: tensor(1.3248, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.0576, device='cuda:0', grad_fn=) cls_loss: tensor(0.0993, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0141, device='cuda:0', grad_fn=) cls_loss: tensor(1.2124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(1.3285, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0682, device='cuda:0', grad_fn=) cls_loss: tensor(0.2199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0794, device='cuda:0', grad_fn=) cls_loss: tensor(0.8236, device='cuda:0', grad_fn=) cls_loss: tensor(1.7714, device='cuda:0', grad_fn=) cls_loss: tensor(0.7494, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.3861, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7643, device='cuda:0', grad_fn=) cls_loss: tensor(3.3140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1505, device='cuda:0', grad_fn=) cls_loss: tensor(2.9060, device='cuda:0', grad_fn=) cls_loss: tensor(2.6668, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.8659, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(1.2497e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.1493, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3317, device='cuda:0', grad_fn=) cls_loss: tensor(0.0483, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(2.1080e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0359, device='cuda:0', grad_fn=) cls_loss: tensor(2.9323, device='cuda:0', grad_fn=) cls_loss: tensor(0.0293, device='cuda:0', grad_fn=) cls_loss: tensor(8.5632e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(1.0431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5846, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0972, device='cuda:0', grad_fn=) cls_loss: tensor(0.7350, device='cuda:0', grad_fn=) cls_loss: tensor(0.3250, device='cuda:0', grad_fn=) cls_loss: tensor(1.9054e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5830e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2129, device='cuda:0', grad_fn=) cls_loss: tensor(1.0084, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1890, device='cuda:0', grad_fn=) cls_loss: tensor(9.3977e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0268, device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(5.4677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(1.0941, device='cuda:0', grad_fn=) cls_loss: tensor(0.0250, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.8413e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.4713, device='cuda:0', grad_fn=) cls_loss: tensor(0.1981, device='cuda:0', grad_fn=) cls_loss: tensor(0.5507, device='cuda:0', grad_fn=) cls_loss: tensor(1.5597e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.8543, device='cuda:0', grad_fn=) cls_loss: tensor(0.0310, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.6758, device='cuda:0', grad_fn=) cls_loss: tensor(0.2152, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0616, device='cuda:0', grad_fn=) cls_loss: tensor(1.4345e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5941, device='cuda:0', grad_fn=) cls_loss: tensor(0.1260, device='cuda:0', grad_fn=) cls_loss: tensor(1.6875, device='cuda:0', grad_fn=) cls_loss: tensor(0.2314, device='cuda:0', grad_fn=) cls_loss: tensor(0.0249, device='cuda:0', grad_fn=) cls_loss: tensor(0.5729, device='cuda:0', grad_fn=) cls_loss: tensor(1.4484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0220, device='cuda:0', grad_fn=) cls_loss: tensor(0.0637, device='cuda:0', grad_fn=) cls_loss: tensor(0.0421, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(1.6603, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4674, device='cuda:0', grad_fn=) cls_loss: tensor(2.8279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.2141, device='cuda:0', grad_fn=) cls_loss: tensor(2.7780, device='cuda:0', grad_fn=) cls_loss: tensor(0.6263, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2699, device='cuda:0', grad_fn=) cls_loss: tensor(2.7279e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1644, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0116, device='cuda:0', grad_fn=) cls_loss: tensor(0.5755, device='cuda:0', grad_fn=) cls_loss: tensor(0.9991, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0205, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0163, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2555, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.3660, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.2874, device='cuda:0', grad_fn=) cls_loss: tensor(7.5479e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.7385, device='cuda:0', grad_fn=) cls_loss: tensor(0.4157, device='cuda:0', grad_fn=) cls_loss: tensor(0.1868, device='cuda:0', grad_fn=) cls_loss: tensor(1.2835e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0586, device='cuda:0', grad_fn=) cls_loss: tensor(0.1364, device='cuda:0', grad_fn=) cls_loss: tensor(0.2181, device='cuda:0', grad_fn=) cls_loss: tensor(0.1053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(1.2162, device='cuda:0', grad_fn=) cls_loss: tensor(0.5688, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.1643, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.2694, device='cuda:0', grad_fn=) cls_loss: tensor(3.5967, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(1.8415, device='cuda:0', grad_fn=) cls_loss: tensor(0.4909, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(2.4095, device='cuda:0', grad_fn=) cls_loss: tensor(0.3284, device='cuda:0', grad_fn=) cls_loss: tensor(0.4248, device='cuda:0', grad_fn=) cls_loss: tensor(0.1006, device='cuda:0', grad_fn=) cls_loss: tensor(0.1635, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3127e-05, device='cuda:0', grad_fn=) 9.755282581475769e-05 changing lr epoch 7, time 354.12, cls_loss 0.4559 306 cls_loss: tensor(0.3040, device='cuda:0', grad_fn=) cls_loss: tensor(0.4326, device='cuda:0', grad_fn=) cls_loss: tensor(0.7005, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.6056, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0179, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1305, device='cuda:0', grad_fn=) cls_loss: tensor(0.1513, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.8073, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1509, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.2106, device='cuda:0', grad_fn=) cls_loss: tensor(0.1742, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.1683, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(7.4029e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2203, device='cuda:0', grad_fn=) cls_loss: tensor(0.0573, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.5234, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0208, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.9361, device='cuda:0', grad_fn=) cls_loss: tensor(0.0491, device='cuda:0', grad_fn=) cls_loss: tensor(4.5160e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7518, device='cuda:0', grad_fn=) cls_loss: tensor(1.0124, device='cuda:0', grad_fn=) cls_loss: tensor(2.2943, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1993, device='cuda:0', grad_fn=) cls_loss: tensor(2.7955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.8585e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0277, device='cuda:0', grad_fn=) cls_loss: tensor(1.3897, device='cuda:0', grad_fn=) cls_loss: tensor(3.1606, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0.3535, device='cuda:0', grad_fn=) cls_loss: tensor(0.5078, device='cuda:0', grad_fn=) cls_loss: tensor(0.1060, device='cuda:0', grad_fn=) cls_loss: tensor(0.4240, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(3.8854, device='cuda:0', grad_fn=) cls_loss: tensor(0.4424, device='cuda:0', grad_fn=) cls_loss: tensor(0.0141, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6628, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(5.1300e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2643, device='cuda:0', grad_fn=) cls_loss: tensor(0.0965, device='cuda:0', grad_fn=) cls_loss: tensor(0.3189, device='cuda:0', grad_fn=) cls_loss: tensor(0.1321, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0339, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.9514, device='cuda:0', grad_fn=) cls_loss: tensor(0.0184, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(0.1636, device='cuda:0', grad_fn=) cls_loss: tensor(1.6038, device='cuda:0', grad_fn=) cls_loss: tensor(1.7620, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8952, device='cuda:0', grad_fn=) cls_loss: tensor(0.2669, device='cuda:0', grad_fn=) cls_loss: tensor(1.8259, device='cuda:0', grad_fn=) cls_loss: tensor(0.8360, device='cuda:0', grad_fn=) cls_loss: tensor(0.2178, device='cuda:0', grad_fn=) cls_loss: tensor(0.0415, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.1843, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.2244, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.1017, device='cuda:0', grad_fn=) cls_loss: tensor(1.8199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2559, device='cuda:0', grad_fn=) cls_loss: tensor(0.5600, device='cuda:0', grad_fn=) cls_loss: tensor(0.1060, device='cuda:0', grad_fn=) cls_loss: tensor(3.2584e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.4195e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.7220e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.8015e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0212e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9997e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.7480, device='cuda:0', grad_fn=) cls_loss: tensor(0.0319, device='cuda:0', grad_fn=) cls_loss: tensor(0.4010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0133, device='cuda:0', grad_fn=) cls_loss: tensor(1.1594, device='cuda:0', grad_fn=) cls_loss: tensor(4.2637e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0258, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2228, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.6268, device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0.2300, device='cuda:0', grad_fn=) cls_loss: tensor(0.1347, device='cuda:0', grad_fn=) cls_loss: tensor(0.0127, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0297, device='cuda:0', grad_fn=) cls_loss: tensor(1.8281, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(3.3593, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(2.0220, device='cuda:0', grad_fn=) cls_loss: tensor(0.4004, device='cuda:0', grad_fn=) cls_loss: tensor(0.8072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0824, device='cuda:0', grad_fn=) cls_loss: tensor(1.7902, device='cuda:0', grad_fn=) cls_loss: tensor(2.8133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1395, device='cuda:0', grad_fn=) cls_loss: tensor(0.9347, device='cuda:0', grad_fn=) cls_loss: tensor(1.4903, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.3610, device='cuda:0', grad_fn=) cls_loss: tensor(1.3392, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.9440, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5167e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.8848, device='cuda:0', grad_fn=) cls_loss: tensor(1.1067, device='cuda:0', grad_fn=) cls_loss: tensor(0.1617, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.2755e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.7402, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(2.2125, device='cuda:0', grad_fn=) cls_loss: tensor(1.4962, device='cuda:0', grad_fn=) cls_loss: tensor(1.3749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0263, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0246, device='cuda:0', grad_fn=) cls_loss: tensor(3.3259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.6494, device='cuda:0', grad_fn=) cls_loss: tensor(0.8220, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.6192, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0346, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0135, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0649, device='cuda:0', grad_fn=) cls_loss: tensor(5.2929e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0200, device='cuda:0', grad_fn=) cls_loss: tensor(0.4132, device='cuda:0', grad_fn=) cls_loss: tensor(0.3240, device='cuda:0', grad_fn=) cls_loss: tensor(0.0419, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.5692e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1196, device='cuda:0', grad_fn=) cls_loss: tensor(1.1720, device='cuda:0', grad_fn=) cls_loss: tensor(1.1322, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.6466, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0281, device='cuda:0', grad_fn=) cls_loss: tensor(9.7434e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.2850, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(9.9540e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0313, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.3636, device='cuda:0', grad_fn=) cls_loss: tensor(0.0547, device='cuda:0', grad_fn=) cls_loss: tensor(9.9937e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.2741, device='cuda:0', grad_fn=) cls_loss: tensor(0.2781, device='cuda:0', grad_fn=) cls_loss: tensor(0.2277, device='cuda:0', grad_fn=) cls_loss: tensor(1.8318e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.5218, device='cuda:0', grad_fn=) cls_loss: tensor(0.3145, device='cuda:0', grad_fn=) cls_loss: tensor(2.7816e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4056, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0131, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2395, device='cuda:0', grad_fn=) cls_loss: tensor(0.1490, device='cuda:0', grad_fn=) cls_loss: tensor(0.4772, device='cuda:0', grad_fn=) cls_loss: tensor(4.2737e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.5702, device='cuda:0', grad_fn=) cls_loss: tensor(0.3942, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.6497, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.1855, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0232, device='cuda:0', grad_fn=) cls_loss: tensor(2.1667, device='cuda:0', grad_fn=) cls_loss: tensor(1.8112, device='cuda:0', grad_fn=) cls_loss: tensor(3.0160e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1790e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0227, device='cuda:0', grad_fn=) cls_loss: tensor(2.0862e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.8346, device='cuda:0', grad_fn=) cls_loss: tensor(0.1186, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0577, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2424, device='cuda:0', grad_fn=) cls_loss: tensor(0.0865, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.8239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3988, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(2.0007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(6.8307e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.1504, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.2311, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.0195, device='cuda:0', grad_fn=) cls_loss: tensor(2.2560, device='cuda:0', grad_fn=) cls_loss: tensor(2.0547, device='cuda:0', grad_fn=) cls_loss: tensor(1.1047, device='cuda:0', grad_fn=) cls_loss: tensor(2.7756e-05, device='cuda:0', grad_fn=) 9.681174353198687e-05 changing lr ---------------------saving model at epoch 8---------------------------------------------------- epoch 8, time 358.80, cls_loss 0.3400 306 cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(2.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0.1340, device='cuda:0', grad_fn=) cls_loss: tensor(2.9465e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0758, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.5115, device='cuda:0', grad_fn=) cls_loss: tensor(0.0745, device='cuda:0', grad_fn=) cls_loss: tensor(0.4342, device='cuda:0', grad_fn=) cls_loss: tensor(1.5421, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0841, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.8490, device='cuda:0', grad_fn=) cls_loss: tensor(1.8020e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5293e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.4703e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1453, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.9971, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0304, device='cuda:0', grad_fn=) cls_loss: tensor(0.0561, device='cuda:0', grad_fn=) cls_loss: tensor(2.2789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.2899, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.9114e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0955, device='cuda:0', grad_fn=) cls_loss: tensor(0.0168, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9124, device='cuda:0', grad_fn=) cls_loss: tensor(0.2478, device='cuda:0', grad_fn=) cls_loss: tensor(0.9535, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(4.7147e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7840, device='cuda:0', grad_fn=) cls_loss: tensor(6.4572e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9082, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0899, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0469, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.9029, device='cuda:0', grad_fn=) cls_loss: tensor(0.8691, device='cuda:0', grad_fn=) cls_loss: tensor(0.0200, device='cuda:0', grad_fn=) cls_loss: tensor(1.5859, device='cuda:0', grad_fn=) cls_loss: tensor(1.3652, device='cuda:0', grad_fn=) cls_loss: tensor(0.8996, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0135, device='cuda:0', grad_fn=) cls_loss: tensor(4.2041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1244, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(1.7451, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0929e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9273e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1497, device='cuda:0', grad_fn=) cls_loss: tensor(0.0261, device='cuda:0', grad_fn=) cls_loss: tensor(1.4047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0266, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(2.7021e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0147, device='cuda:0', grad_fn=) cls_loss: tensor(0.1202, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0134, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0583, device='cuda:0', grad_fn=) cls_loss: tensor(7.8082e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.5902, device='cuda:0', grad_fn=) cls_loss: tensor(0.0116, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.1458, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0799, device='cuda:0', grad_fn=) cls_loss: tensor(3.0200e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(2.3750, device='cuda:0', grad_fn=) cls_loss: tensor(0.5477, device='cuda:0', grad_fn=) cls_loss: tensor(0.3141, device='cuda:0', grad_fn=) cls_loss: tensor(0.8783, device='cuda:0', grad_fn=) cls_loss: tensor(0.3247, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7897e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.5738e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4166e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2247, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4602, device='cuda:0', grad_fn=) cls_loss: tensor(1.0733, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0159, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0308, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0561, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.1461, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.6712, device='cuda:0', grad_fn=) cls_loss: tensor(4.6492e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0634, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.3760, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.4551, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4979, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(2.8650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0368, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0379, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0847, device='cuda:0', grad_fn=) cls_loss: tensor(2.8749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0866, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(5.0386e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9373e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0438, device='cuda:0', grad_fn=) cls_loss: tensor(0.1204, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0305e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3590, device='cuda:0', grad_fn=) cls_loss: tensor(1.2636e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0313, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4275, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.5934, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(9.8348e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.4039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.5882, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0903, device='cuda:0', grad_fn=) cls_loss: tensor(5.9942e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1829, device='cuda:0', grad_fn=) cls_loss: tensor(0.4356, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.8320, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.6859, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(4.2717e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2431, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2838, device='cuda:0', grad_fn=) cls_loss: tensor(0.8573, device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.2813, device='cuda:0', grad_fn=) cls_loss: tensor(5.8889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0313, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.9755, device='cuda:0', grad_fn=) cls_loss: tensor(0.2506, device='cuda:0', grad_fn=) cls_loss: tensor(0.0621, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1804, device='cuda:0', grad_fn=) cls_loss: tensor(0.0236, device='cuda:0', grad_fn=) cls_loss: tensor(0.1183, device='cuda:0', grad_fn=) cls_loss: tensor(0.2868, device='cuda:0', grad_fn=) cls_loss: tensor(1.0874, device='cuda:0', grad_fn=) cls_loss: tensor(0.4108, device='cuda:0', grad_fn=) cls_loss: tensor(0.8520, device='cuda:0', grad_fn=) cls_loss: tensor(9.1195e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0346, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.4161, device='cuda:0', grad_fn=) cls_loss: tensor(0.0163, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6425e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.4043e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3726, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0800, device='cuda:0', grad_fn=) cls_loss: tensor(3.3213, device='cuda:0', grad_fn=) cls_loss: tensor(0.5916, device='cuda:0', grad_fn=) cls_loss: tensor(0.0779, device='cuda:0', grad_fn=) cls_loss: tensor(0.5605, device='cuda:0', grad_fn=) cls_loss: tensor(0.6986, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.5024, device='cuda:0', grad_fn=) cls_loss: tensor(0.8402, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.8028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.8478e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0551, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0858, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.8347e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.2202, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.1276, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3206, device='cuda:0', grad_fn=) cls_loss: tensor(5.4518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1306, device='cuda:0', grad_fn=) cls_loss: tensor(6.0797e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.2611, device='cuda:0', grad_fn=) 9.597638862757255e-05 changing lr epoch 9, time 371.01, cls_loss 0.2145 306 cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.5150, device='cuda:0', grad_fn=) cls_loss: tensor(3.6160e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(4.7882e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0972, device='cuda:0', grad_fn=) cls_loss: tensor(1.6901, device='cuda:0', grad_fn=) cls_loss: tensor(0.0463, device='cuda:0', grad_fn=) cls_loss: tensor(0.4959, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.8120e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0351, device='cuda:0', grad_fn=) cls_loss: tensor(6.1790e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.0730e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0993, device='cuda:0', grad_fn=) cls_loss: tensor(0.3081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(7.7605e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1896, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1968, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(0.1002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0780, device='cuda:0', grad_fn=) cls_loss: tensor(4.6869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(1.6393, device='cuda:0', grad_fn=) cls_loss: tensor(0.0288, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0201, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0176, device='cuda:0', grad_fn=) cls_loss: tensor(1.9093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.1434e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(2.6094, device='cuda:0', grad_fn=) cls_loss: tensor(9.0917e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(4.0332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8857, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5597, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.8030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(4.9638, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1664, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.8117, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4271, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0362, device='cuda:0', grad_fn=) cls_loss: tensor(9.0996e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0800, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7035e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3689, device='cuda:0', grad_fn=) cls_loss: tensor(0.1394, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0147, device='cuda:0', grad_fn=) cls_loss: tensor(1.4762e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(1.1114, device='cuda:0', grad_fn=) cls_loss: tensor(2.8957, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0942, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.0725, device='cuda:0', grad_fn=) cls_loss: tensor(0.3479, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.1421, device='cuda:0', grad_fn=) cls_loss: tensor(5.1260e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.8667, device='cuda:0', grad_fn=) cls_loss: tensor(3.7670e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3048e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.2235, device='cuda:0', grad_fn=) cls_loss: tensor(1.5651, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5355, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.2462, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.1410, device='cuda:0', grad_fn=) cls_loss: tensor(0.4418, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.9786, device='cuda:0', grad_fn=) cls_loss: tensor(3.6418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(1.0716, device='cuda:0', grad_fn=) cls_loss: tensor(0.6086, device='cuda:0', grad_fn=) cls_loss: tensor(2.2332, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(5.2055e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0159, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2128, device='cuda:0', grad_fn=) cls_loss: tensor(2.8650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0327, device='cuda:0', grad_fn=) cls_loss: tensor(0.3224, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(1.5782, device='cuda:0', grad_fn=) cls_loss: tensor(0.1281, device='cuda:0', grad_fn=) cls_loss: tensor(1.8535, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2773, device='cuda:0', grad_fn=) cls_loss: tensor(0.1952, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2795e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.6431, device='cuda:0', grad_fn=) cls_loss: tensor(2.6244, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.7220, device='cuda:0', grad_fn=) cls_loss: tensor(0.5844, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.3288e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.6842, device='cuda:0', grad_fn=) cls_loss: tensor(3.2135, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.7007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(3.1931, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0363, device='cuda:0', grad_fn=) cls_loss: tensor(0.0198, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.4903, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(8.4996e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5474, device='cuda:0', grad_fn=) cls_loss: tensor(0.7956, device='cuda:0', grad_fn=) cls_loss: tensor(1.8372, device='cuda:0', grad_fn=) cls_loss: tensor(0.0383, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5120, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.2954, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.9246, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5156, device='cuda:0', grad_fn=) cls_loss: tensor(2.5938, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4570, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.9235, device='cuda:0', grad_fn=) cls_loss: tensor(1.4433, device='cuda:0', grad_fn=) cls_loss: tensor(7.4108e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.5214, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.2943, device='cuda:0', grad_fn=) cls_loss: tensor(0.8272, device='cuda:0', grad_fn=) cls_loss: tensor(0.9779, device='cuda:0', grad_fn=) cls_loss: tensor(1.7188, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.9956e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.5168e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.2036, device='cuda:0', grad_fn=) cls_loss: tensor(0.3708, device='cuda:0', grad_fn=) cls_loss: tensor(0.1943, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0227, device='cuda:0', grad_fn=) cls_loss: tensor(0.6239, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8718, device='cuda:0', grad_fn=) cls_loss: tensor(0.4562, device='cuda:0', grad_fn=) cls_loss: tensor(1.0476, device='cuda:0', grad_fn=) cls_loss: tensor(1.6650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3157, device='cuda:0', grad_fn=) cls_loss: tensor(0.4823, device='cuda:0', grad_fn=) cls_loss: tensor(4.7349, device='cuda:0', grad_fn=) cls_loss: tensor(0.5908, device='cuda:0', grad_fn=) cls_loss: tensor(0.3685, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.8088, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.5663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.4438, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5301e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5858, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.3753, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0240, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(4.2404, device='cuda:0', grad_fn=) cls_loss: tensor(0.0832, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0351, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.1393e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1315, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) 9.504844339512095e-05 changing lr epoch 10, time 357.40, cls_loss 0.3486 306 cls_loss: tensor(0.1106, device='cuda:0', grad_fn=) cls_loss: tensor(3.0843, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0181, device='cuda:0', grad_fn=) cls_loss: tensor(0.0209, device='cuda:0', grad_fn=) cls_loss: tensor(0.1380, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(0.0309, device='cuda:0', grad_fn=) cls_loss: tensor(0.6922, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(7.7029e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0777, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.3464, device='cuda:0', grad_fn=) cls_loss: tensor(0.0378, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9362, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0313, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0371, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9738e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.1962e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3081, device='cuda:0', grad_fn=) cls_loss: tensor(0.1256, device='cuda:0', grad_fn=) cls_loss: tensor(4.8081e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4430, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(1.5001e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0489, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2150, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(3.1214, device='cuda:0', grad_fn=) cls_loss: tensor(3.3577e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.7881, device='cuda:0', grad_fn=) cls_loss: tensor(1.6061, device='cuda:0', grad_fn=) cls_loss: tensor(1.4563e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.0333e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.8122, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8203, device='cuda:0', grad_fn=) cls_loss: tensor(5.8015e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.1033, device='cuda:0', grad_fn=) cls_loss: tensor(3.0597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.4366e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4188e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4154, device='cuda:0', grad_fn=) cls_loss: tensor(9.2208e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0836, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.9397, device='cuda:0', grad_fn=) cls_loss: tensor(2.4199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0179, device='cuda:0', grad_fn=) cls_loss: tensor(0.0915, device='cuda:0', grad_fn=) cls_loss: tensor(4.6869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1703, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5040e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2435, device='cuda:0', grad_fn=) cls_loss: tensor(0.0229, device='cuda:0', grad_fn=) cls_loss: tensor(0.1128, device='cuda:0', grad_fn=) cls_loss: tensor(2.5233e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0836, device='cuda:0', grad_fn=) cls_loss: tensor(1.0033e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8544e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2558e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0225, device='cuda:0', grad_fn=) cls_loss: tensor(0.9831, device='cuda:0', grad_fn=) cls_loss: tensor(2.3345e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1579, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0972, device='cuda:0', grad_fn=) cls_loss: tensor(0.1882, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(4.6094e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.6391e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4173e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.9356, device='cuda:0', grad_fn=) cls_loss: tensor(3.1193e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.7428, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1580, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0818, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0147, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(3.2187e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.9343, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0346, device='cuda:0', grad_fn=) cls_loss: tensor(1.6848e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2587, device='cuda:0', grad_fn=) cls_loss: tensor(0.4613, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3995e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2693, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0966, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7891, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0726, device='cuda:0', grad_fn=) cls_loss: tensor(1.2501, device='cuda:0', grad_fn=) cls_loss: tensor(1.0307, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.2453e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4180e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5331, device='cuda:0', grad_fn=) cls_loss: tensor(4.3313e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.8009e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.6335, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.7566e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2638e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.6643, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.8659, device='cuda:0', grad_fn=) cls_loss: tensor(0.0653, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.2338, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.9201, device='cuda:0', grad_fn=) cls_loss: tensor(0.5746, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0377, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0653, device='cuda:0', grad_fn=) cls_loss: tensor(0.0872, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.4129, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0462, device='cuda:0', grad_fn=) cls_loss: tensor(1.9310, device='cuda:0', grad_fn=) cls_loss: tensor(2.2681, device='cuda:0', grad_fn=) cls_loss: tensor(7.7883e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.5633, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8031, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1278, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2443, device='cuda:0', grad_fn=) cls_loss: tensor(3.0021e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0258, device='cuda:0', grad_fn=) cls_loss: tensor(0.0169, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.8776e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0592, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2956e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7452e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0130, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0703, device='cuda:0', grad_fn=) cls_loss: tensor(0.4395, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7658e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.3679e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.6732, device='cuda:0', grad_fn=) cls_loss: tensor(0.0231, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.4902e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.3857e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4938, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1182, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0648, device='cuda:0', grad_fn=) cls_loss: tensor(0.3381, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.4105, device='cuda:0', grad_fn=) cls_loss: tensor(1.7524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0242, device='cuda:0', grad_fn=) cls_loss: tensor(0.9694, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(2.2193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1384, device='cuda:0', grad_fn=) cls_loss: tensor(8.9804e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0182, device='cuda:0', grad_fn=) cls_loss: tensor(0.3460, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(1.0430, device='cuda:0', grad_fn=) cls_loss: tensor(4.3114e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(7.6493e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0122, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7226e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3525, device='cuda:0', grad_fn=) cls_loss: tensor(0.0139, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9803e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1418, device='cuda:0', grad_fn=) cls_loss: tensor(0.2630, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(9.2824e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1533, device='cuda:0', grad_fn=) cls_loss: tensor(2.1378e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8112, device='cuda:0', grad_fn=) cls_loss: tensor(6.3976e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1091, device='cuda:0', grad_fn=) cls_loss: tensor(2.8094e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0995e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.6471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0416, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2808, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) 9.40297765928369e-05 changing lr ---------------------saving model at epoch 11---------------------------------------------------- epoch 11, time 358.94, cls_loss 0.1780 306 cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.3231, device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.8888, device='cuda:0', grad_fn=) cls_loss: tensor(0.4202, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(2.6921e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2677, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1021, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1208, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(1.5093, device='cuda:0', grad_fn=) cls_loss: tensor(0.6416, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0241, device='cuda:0', grad_fn=) cls_loss: tensor(1.2438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.3144, device='cuda:0', grad_fn=) cls_loss: tensor(1.3705, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.6556, device='cuda:0', grad_fn=) cls_loss: tensor(0.2053, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0185, device='cuda:0', grad_fn=) cls_loss: tensor(1.1432, device='cuda:0', grad_fn=) cls_loss: tensor(0.4834, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.1274, device='cuda:0', grad_fn=) cls_loss: tensor(0.1365, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2762, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.0637e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0674, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0258, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2622, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1421, device='cuda:0', grad_fn=) cls_loss: tensor(9.8149e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(2.1676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(9.8964e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3545, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.4732, device='cuda:0', grad_fn=) cls_loss: tensor(0.0362, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.4180, device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(0.2759, device='cuda:0', grad_fn=) cls_loss: tensor(1.3173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8421, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0996, device='cuda:0', grad_fn=) cls_loss: tensor(0.8737, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(1.9729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.2297, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2441, device='cuda:0', grad_fn=) cls_loss: tensor(0.7504, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(3.7708, device='cuda:0', grad_fn=) cls_loss: tensor(0.2769, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.4419e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3256, device='cuda:0', grad_fn=) cls_loss: tensor(0.0539, device='cuda:0', grad_fn=) cls_loss: tensor(0.5389, device='cuda:0', grad_fn=) cls_loss: tensor(0.7507, device='cuda:0', grad_fn=) cls_loss: tensor(1.1047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1237, device='cuda:0', grad_fn=) cls_loss: tensor(0.2529, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.5564e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0260, device='cuda:0', grad_fn=) cls_loss: tensor(8.7957e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0258, device='cuda:0', grad_fn=) cls_loss: tensor(0.2510, device='cuda:0', grad_fn=) cls_loss: tensor(0.0241, device='cuda:0', grad_fn=) cls_loss: tensor(1.0462, device='cuda:0', grad_fn=) cls_loss: tensor(0.1696, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4809e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(1.4769, device='cuda:0', grad_fn=) cls_loss: tensor(1.0651, device='cuda:0', grad_fn=) cls_loss: tensor(0.3581, device='cuda:0', grad_fn=) cls_loss: tensor(8.7301e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7402, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6425e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2617, device='cuda:0', grad_fn=) cls_loss: tensor(1.1302, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.4354, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.9362, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.6048e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(2.8021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6335, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5568, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.9095, device='cuda:0', grad_fn=) cls_loss: tensor(5.7499e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6127e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.0579, device='cuda:0', grad_fn=) cls_loss: tensor(2.0444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.4958, device='cuda:0', grad_fn=) cls_loss: tensor(3.7968e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(9.2586e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3406, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3159, device='cuda:0', grad_fn=) cls_loss: tensor(3.1069, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4535, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.6691e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0787, device='cuda:0', grad_fn=) cls_loss: tensor(0.0759, device='cuda:0', grad_fn=) cls_loss: tensor(0.1546, device='cuda:0', grad_fn=) cls_loss: tensor(0.4437, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.4036, device='cuda:0', grad_fn=) cls_loss: tensor(0.3382, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2690, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.1141, device='cuda:0', grad_fn=) cls_loss: tensor(0.1910, device='cuda:0', grad_fn=) cls_loss: tensor(1.1067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2055, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3824, device='cuda:0', grad_fn=) cls_loss: tensor(0.7936, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.1281e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8893, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(1.0490e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6418, device='cuda:0', grad_fn=) cls_loss: tensor(0.6783, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(2.2109, device='cuda:0', grad_fn=) cls_loss: tensor(6.9161e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.4192, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.1756, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.7083, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6400e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0204, device='cuda:0', grad_fn=) cls_loss: tensor(1.9372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0508, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8269, device='cuda:0', grad_fn=) cls_loss: tensor(3.6577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.6992, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0169, device='cuda:0', grad_fn=) cls_loss: tensor(9.9798e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0042e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.3679, device='cuda:0', grad_fn=) cls_loss: tensor(1.1756, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1823e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0202, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4352, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5042, device='cuda:0', grad_fn=) cls_loss: tensor(1.2995, device='cuda:0', grad_fn=) cls_loss: tensor(0.0578, device='cuda:0', grad_fn=) cls_loss: tensor(0.8585, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(1.0212e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.7227, device='cuda:0', grad_fn=) cls_loss: tensor(1.0753, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.3313e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.7082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0246, device='cuda:0', grad_fn=) cls_loss: tensor(2.0324, device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0954, device='cuda:0', grad_fn=) cls_loss: tensor(4.4068e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8829e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.4361, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(7.0691e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3803, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7311, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) 9.292243968009331e-05 changing lr epoch 12, time 356.09, cls_loss 0.2509 306 cls_loss: tensor(8.3804e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9896e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.4285e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6232e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0322, device='cuda:0', grad_fn=) cls_loss: tensor(1.1097, device='cuda:0', grad_fn=) cls_loss: tensor(0.2942, device='cuda:0', grad_fn=) cls_loss: tensor(5.8214e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0162, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0209, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(4.6690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5379, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3296, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.3418, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.1311, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.0744e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3782, device='cuda:0', grad_fn=) cls_loss: tensor(0.0969, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(1.0153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(4.9074e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.5525e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9148e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5584e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0054, device='cuda:0', grad_fn=) cls_loss: tensor(1.7708, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9339e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0592, device='cuda:0', grad_fn=) cls_loss: tensor(4.1127e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.4175e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7453, device='cuda:0', grad_fn=) cls_loss: tensor(5.1498e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1816, device='cuda:0', grad_fn=) cls_loss: tensor(3.2187e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0437, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.6160e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.6963e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.6740, device='cuda:0', grad_fn=) cls_loss: tensor(0.0675, device='cuda:0', grad_fn=) cls_loss: tensor(1.1797, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0702, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.1761, device='cuda:0', grad_fn=) cls_loss: tensor(0.0717, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.2840, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.2994e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.0994e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5453, device='cuda:0', grad_fn=) cls_loss: tensor(3.0597e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7121, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.5646e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5150, device='cuda:0', grad_fn=) cls_loss: tensor(4.0713, device='cuda:0', grad_fn=) cls_loss: tensor(0.2779, device='cuda:0', grad_fn=) cls_loss: tensor(3.7471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.6289, device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0.3975, device='cuda:0', grad_fn=) cls_loss: tensor(5.2790e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1852, device='cuda:0', grad_fn=) cls_loss: tensor(0.8138, device='cuda:0', grad_fn=) cls_loss: tensor(5.5499, device='cuda:0', grad_fn=) cls_loss: tensor(1.5117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.3636, device='cuda:0', grad_fn=) cls_loss: tensor(0.4001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(1.0431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2823e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0880, device='cuda:0', grad_fn=) cls_loss: tensor(0.2926, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0669, device='cuda:0', grad_fn=) cls_loss: tensor(0.3444, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.6398, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0490e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(1.0610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.7754, device='cuda:0', grad_fn=) cls_loss: tensor(1.1563e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0596, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1856, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.6224, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3219, device='cuda:0', grad_fn=) cls_loss: tensor(1.0029, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.6228e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9766, device='cuda:0', grad_fn=) cls_loss: tensor(0.0140, device='cuda:0', grad_fn=) cls_loss: tensor(3.4173e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1025, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0998, device='cuda:0', grad_fn=) cls_loss: tensor(0.6004, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2459, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(8.2572e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(4.4376, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(1.5791, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.6609, device='cuda:0', grad_fn=) cls_loss: tensor(0.0547, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(7.9095e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9074e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.4276, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2391, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(2.2670, device='cuda:0', grad_fn=) cls_loss: tensor(2.1684, device='cuda:0', grad_fn=) cls_loss: tensor(0.0382, device='cuda:0', grad_fn=) cls_loss: tensor(0.0528, device='cuda:0', grad_fn=) cls_loss: tensor(0.1327, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.4126, device='cuda:0', grad_fn=) cls_loss: tensor(3.8763, device='cuda:0', grad_fn=) cls_loss: tensor(0.0702, device='cuda:0', grad_fn=) cls_loss: tensor(0.0618, device='cuda:0', grad_fn=) cls_loss: tensor(1.2116, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.0055e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.1716, device='cuda:0', grad_fn=) cls_loss: tensor(2.7743, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.2869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0874, device='cuda:0', grad_fn=) cls_loss: tensor(9.6917e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0179, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.6418, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0275, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.2656e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7094, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.3228, device='cuda:0', grad_fn=) cls_loss: tensor(6.3638e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.4466, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.1895, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.5273e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2230, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.2310, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.6418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1424e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0860, device='cuda:0', grad_fn=) cls_loss: tensor(0.9940, device='cuda:0', grad_fn=) cls_loss: tensor(3.8422, device='cuda:0', grad_fn=) cls_loss: tensor(0.9880, device='cuda:0', grad_fn=) cls_loss: tensor(0.3317, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0972, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0816, device='cuda:0', grad_fn=) cls_loss: tensor(1.5225, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1879, device='cuda:0', grad_fn=) cls_loss: tensor(3.0758, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.2994e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0931, device='cuda:0', grad_fn=) cls_loss: tensor(0.3092, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0531, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3045, device='cuda:0', grad_fn=) cls_loss: tensor(7.9274e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6425e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0134, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.2969e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.2189e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3041, device='cuda:0', grad_fn=) 9.172866268606513e-05 changing lr epoch 13, time 358.04, cls_loss 0.2684 306 cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.6212e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1243, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0826, device='cuda:0', grad_fn=) cls_loss: tensor(0.0763, device='cuda:0', grad_fn=) cls_loss: tensor(6.3856e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3526e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.0192e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.3180e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3698, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(8.7976e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.5365e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.6300, device='cuda:0', grad_fn=) cls_loss: tensor(1.1702e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.6436, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0367, device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(0.1514, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0382, device='cuda:0', grad_fn=) cls_loss: tensor(0.5581, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1314e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0669, device='cuda:0', grad_fn=) cls_loss: tensor(0.2778, device='cuda:0', grad_fn=) cls_loss: tensor(0.3534, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.6230, device='cuda:0', grad_fn=) cls_loss: tensor(0.0989, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(6.2784e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.8460, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.4492, device='cuda:0', grad_fn=) cls_loss: tensor(0.0480, device='cuda:0', grad_fn=) cls_loss: tensor(0.2150, device='cuda:0', grad_fn=) cls_loss: tensor(0.2503, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0218, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.4769e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9552e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9101e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(1.6789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0432, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(8.7718e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2030, device='cuda:0', grad_fn=) cls_loss: tensor(1.1927, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9804e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2819, device='cuda:0', grad_fn=) cls_loss: tensor(0.0459, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0240, device='cuda:0', grad_fn=) cls_loss: tensor(0.3615, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(3.2286e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9897e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.4584e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0689, device='cuda:0', grad_fn=) cls_loss: tensor(7.0055e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5625, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.8955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(0.2523, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.4215, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.8578, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2336, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.2663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(1.1522, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(5.2651e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.1632e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4988, device='cuda:0', grad_fn=) cls_loss: tensor(1.4881e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0360, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1412e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7619e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1479, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.3546, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2661, device='cuda:0', grad_fn=) cls_loss: tensor(0.4419, device='cuda:0', grad_fn=) cls_loss: tensor(1.3919, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0572, device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1018, device='cuda:0', grad_fn=) cls_loss: tensor(0.1165, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.7553e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2837, device='cuda:0', grad_fn=) cls_loss: tensor(0.0954, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1265, device='cuda:0', grad_fn=) cls_loss: tensor(4.5717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2974e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1857e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.6821, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2849, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0626, device='cuda:0', grad_fn=) cls_loss: tensor(1.1133, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.1165, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.9087, device='cuda:0', grad_fn=) cls_loss: tensor(1.3659, device='cuda:0', grad_fn=) cls_loss: tensor(0.1037, device='cuda:0', grad_fn=) cls_loss: tensor(1.5875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9111, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0995, device='cuda:0', grad_fn=) cls_loss: tensor(0.3273, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.5101e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5974e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2929e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0308, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.3984, device='cuda:0', grad_fn=) cls_loss: tensor(1.5775e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0422, device='cuda:0', grad_fn=) cls_loss: tensor(0.0417, device='cuda:0', grad_fn=) cls_loss: tensor(8.8811e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0298, device='cuda:0', grad_fn=) cls_loss: tensor(0.0346, device='cuda:0', grad_fn=) cls_loss: tensor(0.9128, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4850, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.6087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0180, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(5.3048e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.0391, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(1.2549, device='cuda:0', grad_fn=) cls_loss: tensor(4.0710e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(0.2257, device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0.3455, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.5557, device='cuda:0', grad_fn=) cls_loss: tensor(0.3117, device='cuda:0', grad_fn=) cls_loss: tensor(0.6765, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.2749, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.3328, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4796e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7042e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1892, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0431, device='cuda:0', grad_fn=) cls_loss: tensor(0.4586, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.1104, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0241, device='cuda:0', grad_fn=) cls_loss: tensor(0.9199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0499, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.5231, device='cuda:0', grad_fn=) cls_loss: tensor(0.3386, device='cuda:0', grad_fn=) cls_loss: tensor(0.0269, device='cuda:0', grad_fn=) cls_loss: tensor(0.0425, device='cuda:0', grad_fn=) cls_loss: tensor(0.6827, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.7155e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1881, device='cuda:0', grad_fn=) cls_loss: tensor(8.2135e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 9.045084971874738e-05 changing lr epoch 14, time 357.34, cls_loss 0.1243 306 cls_loss: tensor(1.6669e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.6677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.0849, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5003, device='cuda:0', grad_fn=) cls_loss: tensor(3.4769e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.6771, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0794, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.8691, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.1637, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.5949, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.5815, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.8551e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5469, device='cuda:0', grad_fn=) cls_loss: tensor(0.3273, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.9450, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0551, device='cuda:0', grad_fn=) cls_loss: tensor(2.5054e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3391e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2048, device='cuda:0', grad_fn=) cls_loss: tensor(0.1173, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.1195e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0614, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1292, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.4491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.8379e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.3048e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6293e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0808e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1872, device='cuda:0', grad_fn=) cls_loss: tensor(1.0879, device='cuda:0', grad_fn=) cls_loss: tensor(2.8789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(2.0791, device='cuda:0', grad_fn=) cls_loss: tensor(2.2372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6252e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.8366e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0339, device='cuda:0', grad_fn=) cls_loss: tensor(5.4836e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(2.0723e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0702, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7728, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0265, device='cuda:0', grad_fn=) cls_loss: tensor(1.7742e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5897e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0435, device='cuda:0', grad_fn=) cls_loss: tensor(0.4331, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0193, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2642, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1715, device='cuda:0', grad_fn=) cls_loss: tensor(2.3186e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1174e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8148e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5498e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0841, device='cuda:0', grad_fn=) cls_loss: tensor(0.0161, device='cuda:0', grad_fn=) cls_loss: tensor(0.0240, device='cuda:0', grad_fn=) cls_loss: tensor(1.3987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2378e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0692, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(5.0108e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0329, device='cuda:0', grad_fn=) cls_loss: tensor(6.6400e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2519e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0517, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.4240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0594, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4688, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.0531e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3087e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0699, device='cuda:0', grad_fn=) cls_loss: tensor(0.0222, device='cuda:0', grad_fn=) cls_loss: tensor(0.1559, device='cuda:0', grad_fn=) cls_loss: tensor(2.7552, device='cuda:0', grad_fn=) cls_loss: tensor(3.9140e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0473, device='cuda:0', grad_fn=) cls_loss: tensor(0.5129, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3589, device='cuda:0', grad_fn=) cls_loss: tensor(0.5414, device='cuda:0', grad_fn=) cls_loss: tensor(4.4306e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.8473e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9233, device='cuda:0', grad_fn=) cls_loss: tensor(0.0496, device='cuda:0', grad_fn=) cls_loss: tensor(0.0968, device='cuda:0', grad_fn=) cls_loss: tensor(1.7467, device='cuda:0', grad_fn=) cls_loss: tensor(5.3445e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.3363, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0183, device='cuda:0', grad_fn=) cls_loss: tensor(6.8148e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3104, device='cuda:0', grad_fn=) cls_loss: tensor(3.5114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0855, device='cuda:0', grad_fn=) cls_loss: tensor(6.4850e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5167e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8425e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.5459, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9406e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1697e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.1965, device='cuda:0', grad_fn=) cls_loss: tensor(1.1245e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2113, device='cuda:0', grad_fn=) cls_loss: tensor(0.7492, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1787, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0423, device='cuda:0', grad_fn=) cls_loss: tensor(0.4559, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3451e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0151, device='cuda:0', grad_fn=) cls_loss: tensor(5.7419e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4173e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0217, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0539, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2974e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2049, device='cuda:0', grad_fn=) cls_loss: tensor(1.6543, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(8.9804e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(5.1320e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5498e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3048e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1292, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0318, device='cuda:0', grad_fn=) cls_loss: tensor(0.6190, device='cuda:0', grad_fn=) cls_loss: tensor(0.8022, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3862e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0474, device='cuda:0', grad_fn=) cls_loss: tensor(1.2060e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.5192, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(1.8823, device='cuda:0', grad_fn=) cls_loss: tensor(0.1628, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0152, device='cuda:0', grad_fn=) cls_loss: tensor(1.1027e-05, device='cuda:0', grad_fn=) 8.90915741234015e-05 changing lr epoch 15, time 357.89, cls_loss 0.1362 306 cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.1105, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0310, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6558e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2795e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.6174e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8081e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3231, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.1912, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(3.0994e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.7287e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(1.5378, device='cuda:0', grad_fn=) cls_loss: tensor(6.5605e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.4570, device='cuda:0', grad_fn=) cls_loss: tensor(2.1257, device='cuda:0', grad_fn=) cls_loss: tensor(0.0502, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0127, device='cuda:0', grad_fn=) cls_loss: tensor(2.1636e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0552, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0967, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1802, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0755, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0872, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1263, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7696e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.0597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0655, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1213e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(1.5597e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.8626e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1319, device='cuda:0', grad_fn=) cls_loss: tensor(0.3517, device='cuda:0', grad_fn=) cls_loss: tensor(1.0020, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0671, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4766, device='cuda:0', grad_fn=) cls_loss: tensor(0.8451, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6233e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5542, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0256, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.2201e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0762, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(4.8081e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.8320, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.3213e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0131, device='cuda:0', grad_fn=) cls_loss: tensor(0.1136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.3251, device='cuda:0', grad_fn=) cls_loss: tensor(0.0782, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(3.2604e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.0109e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0864e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0862e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.6491, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9608, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.5294e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1208e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0251, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.6094e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0342, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(2.4557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7737, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0558, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0133, device='cuda:0', grad_fn=) cls_loss: tensor(0.0306, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(6.7393e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5339, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2783e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4541, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.4770e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(4.4485e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(3.2485e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0391e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.2264, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0305e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0304, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.1426e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2777, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.3313e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1941, device='cuda:0', grad_fn=) cls_loss: tensor(0.0240, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(1.4126e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1706, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0365, device='cuda:0', grad_fn=) cls_loss: tensor(4.4326e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0641, device='cuda:0', grad_fn=) cls_loss: tensor(0.0900, device='cuda:0', grad_fn=) cls_loss: tensor(1.7603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0519, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.2100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9381e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3735, device='cuda:0', grad_fn=) cls_loss: tensor(0.0627, device='cuda:0', grad_fn=) cls_loss: tensor(1.4683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1328, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3604, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(8.6188e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0582, device='cuda:0', grad_fn=) cls_loss: tensor(4.0332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.8148e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9326e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.9505, device='cuda:0', grad_fn=) cls_loss: tensor(1.6153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4194, device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(1.1702e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0201, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1552, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5033, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0191, device='cuda:0', grad_fn=) 8.765357330018056e-05 changing lr epoch 16, time 360.08, cls_loss 0.0830 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7619e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3909e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1808, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0915, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.2624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0967e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3477, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2250, device='cuda:0', grad_fn=) cls_loss: tensor(3.6478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.0267e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.0709e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(1.5835e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.4451e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(8.2115e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.4769e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.7910, device='cuda:0', grad_fn=) cls_loss: tensor(4.5220e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0233, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(5.3048e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9672e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.7516, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3398e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(8.7619e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(1.4007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9472e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.9289, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.6191, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.6101e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.1976, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.4140, device='cuda:0', grad_fn=) cls_loss: tensor(0.0318, device='cuda:0', grad_fn=) cls_loss: tensor(6.8267e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.2151, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4226e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0405, device='cuda:0', grad_fn=) cls_loss: tensor(1.8020e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.7350, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(4.0730e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.2969e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3977e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3049e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0617, device='cuda:0', grad_fn=) cls_loss: tensor(4.5975e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0318, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8060e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1543e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5035e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.5531, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0319, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0827, device='cuda:0', grad_fn=) cls_loss: tensor(9.4175e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9458e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0469, device='cuda:0', grad_fn=) cls_loss: tensor(0.0485, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0217, device='cuda:0', grad_fn=) cls_loss: tensor(1.4365e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.2188e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0942e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2718e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1539, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0144, device='cuda:0', grad_fn=) cls_loss: tensor(0.2510, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.0788e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3135, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.0091, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0206, device='cuda:0', grad_fn=) cls_loss: tensor(3.3240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4333e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0407, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.3146, device='cuda:0', grad_fn=) cls_loss: tensor(4.1127e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.8744e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.8545e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7246, device='cuda:0', grad_fn=) cls_loss: tensor(7.6890e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(3.4511e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0239, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.2227, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0623, device='cuda:0', grad_fn=) cls_loss: tensor(5.7220e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5611e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0759, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0476, device='cuda:0', grad_fn=) cls_loss: tensor(3.9935e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(7.9513e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3531e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.6960, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.3255, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0260, device='cuda:0', grad_fn=) cls_loss: tensor(8.9188e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1768, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.2643, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(9.9977e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7150, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8531e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1755, device='cuda:0', grad_fn=) cls_loss: tensor(0.7024, device='cuda:0', grad_fn=) cls_loss: tensor(1.0828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.8546e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7843e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1427, device='cuda:0', grad_fn=) cls_loss: tensor(0.0317, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9028e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.6217, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1500, device='cuda:0', grad_fn=) cls_loss: tensor(1.0456, device='cuda:0', grad_fn=) cls_loss: tensor(3.9140e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1843e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5120, device='cuda:0', grad_fn=) cls_loss: tensor(2.3652, device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3339, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(4.5101e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3426, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0179, device='cuda:0', grad_fn=) cls_loss: tensor(2.4701, device='cuda:0', grad_fn=) cls_loss: tensor(0.3805, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.7760, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0468, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0151, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.5744e-05, device='cuda:0', grad_fn=) 8.613974319136958e-05 changing lr epoch 17, time 358.06, cls_loss 0.1141 306 cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.4043e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5810e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.6094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0920, device='cuda:0', grad_fn=) cls_loss: tensor(0.2100, device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1614, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.2993, device='cuda:0', grad_fn=) cls_loss: tensor(0.0395, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0122, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.6240, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5008e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0798, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(2.5630e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.0670e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4372e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9538e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.2042e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0382, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2999, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(7.7685e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2811, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.5233e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.3818e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.8414e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0457, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(2.9902e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1453e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0379, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.1146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.3247e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.1433, device='cuda:0', grad_fn=) cls_loss: tensor(0.0253, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0510, device='cuda:0', grad_fn=) cls_loss: tensor(4.7147e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0235, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0342, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0185, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1274, device='cuda:0', grad_fn=) cls_loss: tensor(2.7398e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.8041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.0341e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.2373, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.4537e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0465, device='cuda:0', grad_fn=) cls_loss: tensor(1.5676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.0201e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8465e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0760, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.0261, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.5948e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.2539e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7474, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(2.9206e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2994e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1169, device='cuda:0', grad_fn=) cls_loss: tensor(0.3501, device='cuda:0', grad_fn=) cls_loss: tensor(0.0612, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1116, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.1464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0589, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.1239, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2039, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1312, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(3.0637e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7061e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(1.5338e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.6804e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5407e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(1.0212e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1641, device='cuda:0', grad_fn=) cls_loss: tensor(0.2077, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7055e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.5526e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.3262, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.9775, device='cuda:0', grad_fn=) cls_loss: tensor(3.4014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1442, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.6028e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0249, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.2518e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.9918, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(2.7816e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0571, device='cuda:0', grad_fn=) cls_loss: tensor(0.0335, device='cuda:0', grad_fn=) cls_loss: tensor(0.3153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(1.0848e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0238, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.6656, device='cuda:0', grad_fn=) cls_loss: tensor(3.9458e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0556, device='cuda:0', grad_fn=) cls_loss: tensor(0.0342, device='cuda:0', grad_fn=) cls_loss: tensor(0.4377, device='cuda:0', grad_fn=) cls_loss: tensor(1.4881e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1783, device='cuda:0', grad_fn=) cls_loss: tensor(4.4127e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.4798, device='cuda:0', grad_fn=) cls_loss: tensor(0.0275, device='cuda:0', grad_fn=) cls_loss: tensor(2.2352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.5418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0490e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1230, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(8.3129e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) 8.455313244934324e-05 changing lr epoch 18, time 357.52, cls_loss 0.0474 306 cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.8447e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0490, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4052, device='cuda:0', grad_fn=) cls_loss: tensor(2.1418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2988e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0269, device='cuda:0', grad_fn=) cls_loss: tensor(1.1855, device='cuda:0', grad_fn=) cls_loss: tensor(7.8877e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0130, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.8433e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.6518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0483, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0234, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0236, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.3801, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.4733, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(1.0664, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2905, device='cuda:0', grad_fn=) cls_loss: tensor(7.1128e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3705e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.8239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5094e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0202, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0975, device='cuda:0', grad_fn=) cls_loss: tensor(0.2447, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4382, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1834, device='cuda:0', grad_fn=) cls_loss: tensor(0.4662, device='cuda:0', grad_fn=) cls_loss: tensor(0.0218, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.5962e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.5949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0127, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0367, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1804, device='cuda:0', grad_fn=) cls_loss: tensor(2.4120e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1623e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1044, device='cuda:0', grad_fn=) cls_loss: tensor(4.4743e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7433e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5167e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2775e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1630e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8347e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8346e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0417, device='cuda:0', grad_fn=) cls_loss: tensor(6.3777e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3439e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1772, device='cuda:0', grad_fn=) cls_loss: tensor(5.2055e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6889e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.8478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9273e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.6625e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(0.4862, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.7584e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8147e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.5168e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9221e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.0797e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.3863e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0465, device='cuda:0', grad_fn=) cls_loss: tensor(6.1790e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.1438, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(2.3186e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0176, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0159, device='cuda:0', grad_fn=) cls_loss: tensor(7.1764e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.4374e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0723e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7088e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1772, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.1140, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.6889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7551e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3180e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.6432e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0361, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0672, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2544, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(3.3279e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1458e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8555, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0802, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.6549, device='cuda:0', grad_fn=) cls_loss: tensor(0.2298, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9075e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3292e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0444, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0753, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(2.0603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0302, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3359e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.8473, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.8545e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1394, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3995e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.8601, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3089e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0764, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3577e-06, device='cuda:0', grad_fn=) 8.289693629698562e-05 changing lr ---------------------saving model at epoch 19---------------------------------------------------- epoch 19, time 356.57, cls_loss 0.0406 306 cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.5844e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0490, device='cuda:0', grad_fn=) cls_loss: tensor(0.0492, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0405e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.0969e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.3063, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4638e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6359e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3669e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0973, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(6.4174e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.9705e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0354, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.1570, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(1.1405, device='cuda:0', grad_fn=) cls_loss: tensor(0.0558, device='cuda:0', grad_fn=) cls_loss: tensor(0.0990, device='cuda:0', grad_fn=) cls_loss: tensor(1.4166e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(4.9353e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1181, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.0796e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.8850e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0236, device='cuda:0', grad_fn=) cls_loss: tensor(2.3941e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4357, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.1589, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.1740, device='cuda:0', grad_fn=) cls_loss: tensor(3.7988e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0222, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1156, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.2996e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(4.1127e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.0400e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.5990e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7798, device='cuda:0', grad_fn=) cls_loss: tensor(0.0713, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.7293e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0390, device='cuda:0', grad_fn=) cls_loss: tensor(9.6818e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.6560e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.1062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0204, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0179, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(2.0564e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(6.1929e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1136, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8189, device='cuda:0', grad_fn=) cls_loss: tensor(2.7756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(6.2009e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8347e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2266, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.2744e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2900, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.6558e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.2651e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.5195e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0498, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.5684e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5564e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.9087e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(4.6333e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0207, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(7.2519e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.4227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.2158, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(3.0200e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(4.9273e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0413, device='cuda:0', grad_fn=) cls_loss: tensor(5.3664e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(7.5102e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8213e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(1.9610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1384e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3408, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.6857e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0415, device='cuda:0', grad_fn=) cls_loss: tensor(1.3431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0724, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.7968e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(3.9538e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0114e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7798e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0182, device='cuda:0', grad_fn=) cls_loss: tensor(7.4704e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0384, device='cuda:0', grad_fn=) cls_loss: tensor(1.8934e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2494, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) 8.117449009293667e-05 changing lr ---------------------saving model at epoch 20---------------------------------------------------- epoch 20, time 357.61, cls_loss 0.0392 306 cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0867, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.9698e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0543, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.2586e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0351e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2612, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.6903e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.1949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.2850e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2319e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.7818e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9306, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1459e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(5.6227e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2763, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.5432e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0929, device='cuda:0', grad_fn=) cls_loss: tensor(1.0252e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.0929e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.3619e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2731, device='cuda:0', grad_fn=) cls_loss: tensor(0.7876, device='cuda:0', grad_fn=) cls_loss: tensor(0.1740, device='cuda:0', grad_fn=) cls_loss: tensor(5.3406e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.0268e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9101e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(3.2107e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0297, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0576, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0408, device='cuda:0', grad_fn=) cls_loss: tensor(0.0277, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0319, device='cuda:0', grad_fn=) cls_loss: tensor(4.5578e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5967, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0131, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.0114e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0391e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0134e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.5999, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2688, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.3360e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.8162e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(2.5233e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1987, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5121e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.0995e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3571, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0134, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3975e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(5.4836e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1851, device='cuda:0', grad_fn=) cls_loss: tensor(0.2330, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(9.6162e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0638, device='cuda:0', grad_fn=) cls_loss: tensor(0.0162, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(4.5498e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0554, device='cuda:0', grad_fn=) cls_loss: tensor(2.6425e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.8744e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5897e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0130, device='cuda:0', grad_fn=) cls_loss: tensor(1.7623e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.5038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.3332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2638, device='cuda:0', grad_fn=) cls_loss: tensor(0.9180, device='cuda:0', grad_fn=) cls_loss: tensor(8.6228e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0581, device='cuda:0', grad_fn=) cls_loss: tensor(0.9180, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0990, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(1.5809, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.8478e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5234e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0635, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2915e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4319e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2205, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0726, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0192e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0238, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.7088e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7074e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) 7.938926261462365e-05 changing lr epoch 21, time 358.54, cls_loss 0.0412 306 cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.7762e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1640, device='cuda:0', grad_fn=) cls_loss: tensor(1.0153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1458e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3545, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0191, device='cuda:0', grad_fn=) cls_loss: tensor(0.1664, device='cuda:0', grad_fn=) cls_loss: tensor(6.8327e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0337, device='cuda:0', grad_fn=) cls_loss: tensor(0.0140, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1414, device='cuda:0', grad_fn=) cls_loss: tensor(6.0399e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0507, device='cuda:0', grad_fn=) cls_loss: tensor(2.3703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1511e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9068e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.8414e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1506, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.6889e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(8.2652e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0345, device='cuda:0', grad_fn=) cls_loss: tensor(3.4769e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0596, device='cuda:0', grad_fn=) cls_loss: tensor(0.0293, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0728, device='cuda:0', grad_fn=) cls_loss: tensor(3.1253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.2159e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(1.6431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6160e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3604e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0133, device='cuda:0', grad_fn=) cls_loss: tensor(2.9961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1702e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.2081e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(5.5234e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9848e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1458e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.1909e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0500, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1290, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1508, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.0757e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4107e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2702, device='cuda:0', grad_fn=) cls_loss: tensor(3.7193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0161, device='cuda:0', grad_fn=) cls_loss: tensor(0.1676, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1007, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.0599e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1810, device='cuda:0', grad_fn=) cls_loss: tensor(0.0122, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4228e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(6.5962e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.8512e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1245e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7533, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.4935e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0693, device='cuda:0', grad_fn=) cls_loss: tensor(1.4663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7089e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.8413e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.1824, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0302, device='cuda:0', grad_fn=) cls_loss: tensor(1.1027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.1128e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4319e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6160e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5318e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5896e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.4027, device='cuda:0', grad_fn=) cls_loss: tensor(1.2736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(1.2696e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(8.6427e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.0531e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2974e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1751e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0157, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(4.0154e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.3876e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2917, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0414, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0481, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.6530e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.6133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(4.8439e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.4313e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.9737e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3247e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8107e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1299e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7597e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0182, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0425e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) 7.754484907260511e-05 changing lr ---------------------saving model at epoch 22---------------------------------------------------- epoch 22, time 357.59, cls_loss 0.0210 306 cls_loss: tensor(5.3247e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.0400e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3669e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0416, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0275, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(1.6352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0239, device='cuda:0', grad_fn=) cls_loss: tensor(5.5750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(4.7545e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.8280e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.2983e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3530e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(4.3948e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0613, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6293e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9206e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.0135e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3221e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.8935e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3531e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.8545e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7816e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.1795e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.3529, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.6400e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.1835e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0575, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(5.1260e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(2.5749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0213, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1664, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4815e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5983e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5815e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.0915e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(3.0438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1291, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.8592e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(6.6320e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7948e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(7.9751e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2612, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0346, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.4307e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.9232, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6113e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3391e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.3937, device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2532e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1560, device='cuda:0', grad_fn=) cls_loss: tensor(0.3581, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7022e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.3738e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.3906, device='cuda:0', grad_fn=) cls_loss: tensor(0.0639, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(4.6432e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9339e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.4374e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2513, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.2320e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(1.6987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(7.4645e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0151, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0307, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.3142e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(2.1895e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(1.4683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.5014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0746, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0684, device='cuda:0', grad_fn=) cls_loss: tensor(0.0228, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(8.0387e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4020, device='cuda:0', grad_fn=) cls_loss: tensor(1.3232e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1619e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2070, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.6505e-05, device='cuda:0', grad_fn=) 7.56449638702953e-05 changing lr epoch 23, time 360.75, cls_loss 0.0187 306 cls_loss: tensor(3.4332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0569, device='cuda:0', grad_fn=) cls_loss: tensor(0.0739, device='cuda:0', grad_fn=) cls_loss: tensor(0.1756, device='cuda:0', grad_fn=) cls_loss: tensor(4.7088e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.7339e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(7.2857e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2909, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.5831e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.5687, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.6890e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(1.4246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0231, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2391e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(2.0762e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0223, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0122, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(3.1968e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.7950e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3049e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(1.6948e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.5557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2116, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0200, device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(7.3075e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0381, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0338, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1061e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2386e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1265e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1762e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.2432e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6479e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.2224, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0240, device='cuda:0', grad_fn=) cls_loss: tensor(0.9843, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0881e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4219e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.3440e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0239, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.9206e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2604e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.8318e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0901e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7209, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.8810e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.8744e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(8.8414e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2639, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0398e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0752, device='cuda:0', grad_fn=) cls_loss: tensor(3.6756e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(2.2274, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.2319e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.2254e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0359e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.4843e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0436, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.8876e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0384, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4041e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0339, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9375, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0216, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0510, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0254, device='cuda:0', grad_fn=) cls_loss: tensor(9.6957e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9644e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7882e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0506, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2314, device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1565e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3314, device='cuda:0', grad_fn=) cls_loss: tensor(1.6908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4976, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.0202e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2398e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9500e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5300e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.3617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0168, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0436, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(8.4599e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.6227e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(1.7027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0141, device='cuda:0', grad_fn=) cls_loss: tensor(6.8545e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0342, device='cuda:0', grad_fn=) 7.369343312364992e-05 changing lr epoch 24, time 356.58, cls_loss 0.0428 306 cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(4.3352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.3977e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(4.8955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0447, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.6631e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0131, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.8347e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0367, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5167e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0511e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6161e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.3758e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3577e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.5419e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0216, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1299e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0469, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1407, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.8672e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(6.2982e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.8537e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.8134e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0754, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.8075, device='cuda:0', grad_fn=) cls_loss: tensor(2.6286e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.3029, device='cuda:0', grad_fn=) cls_loss: tensor(7.2916e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0490e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3777e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0566, device='cuda:0', grad_fn=) cls_loss: tensor(9.2049e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.1358, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.9405, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.6418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0339, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4452, device='cuda:0', grad_fn=) cls_loss: tensor(0.0416, device='cuda:0', grad_fn=) cls_loss: tensor(6.6559e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3022, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5300e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0659, device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0729, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2139e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.2777e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1390, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(2.5233e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2024, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.6346, device='cuda:0', grad_fn=) cls_loss: tensor(5.2055e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8213e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.4506e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.8280e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1177, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1458e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3640, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0159, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3539e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0510, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.7819, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0419, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.9645, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.5198, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.4486, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3486, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.1559, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9743e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3273e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0350, device='cuda:0', grad_fn=) cls_loss: tensor(0.0261, device='cuda:0', grad_fn=) cls_loss: tensor(2.8286, device='cuda:0', grad_fn=) cls_loss: tensor(0.1825, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.2201e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2285, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.3844e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.0361e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.5062, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.3379e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0524, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.9436, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1212, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1159, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0268, device='cuda:0', grad_fn=) cls_loss: tensor(3.6160e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7419e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.2646, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7266e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6015e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0733, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) 7.169418695587788e-05 changing lr epoch 25, time 365.69, cls_loss 0.0531 306 cls_loss: tensor(7.1824e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7088e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0315, device='cuda:0', grad_fn=) cls_loss: tensor(1.3550e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2127e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5208e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.1877e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(7.7287e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5751e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0302, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.6359e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.4665e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9326e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5029e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.2188e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8213e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.2684e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(7.3711e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.8506e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5657, device='cuda:0', grad_fn=) cls_loss: tensor(4.2260e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.8822, device='cuda:0', grad_fn=) cls_loss: tensor(0.8123, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1517, device='cuda:0', grad_fn=) cls_loss: tensor(0.0358, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4821, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9162e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2684e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0287, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(0.4606, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0331, device='cuda:0', grad_fn=) cls_loss: tensor(1.5994e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5412e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.8438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0664, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(1.0769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4603, device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.1940, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0229, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7126e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6425e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.2915e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0151, device='cuda:0', grad_fn=) cls_loss: tensor(6.5962e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(1.3550e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3234, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4590, device='cuda:0', grad_fn=) cls_loss: tensor(7.5320e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0372, device='cuda:0', grad_fn=) cls_loss: tensor(2.1279e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.4200, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0666, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.0201e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9008e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0179, device='cuda:0', grad_fn=) cls_loss: tensor(0.0267, device='cuda:0', grad_fn=) cls_loss: tensor(2.6902e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2338e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2219e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8902e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.6824e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.8944e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0797, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(1.5140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0291, device='cuda:0', grad_fn=) cls_loss: tensor(1.1245e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.6934, device='cuda:0', grad_fn=) cls_loss: tensor(6.5744e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0830, device='cuda:0', grad_fn=) cls_loss: tensor(9.5526e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.4041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0239, device='cuda:0', grad_fn=) cls_loss: tensor(3.8544e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(5.2849e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2477e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0169, device='cuda:0', grad_fn=) cls_loss: tensor(9.6560e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.8678, device='cuda:0', grad_fn=) cls_loss: tensor(5.2392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.7816e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0225, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3685, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.1404, device='cuda:0', grad_fn=) cls_loss: tensor(0.1812, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(1.8537e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0145, device='cuda:0', grad_fn=) cls_loss: tensor(4.2915e-06, device='cuda:0', grad_fn=) 6.965125158269616e-05 changing lr epoch 26, time 358.04, cls_loss 0.0377 306 cls_loss: tensor(0.7696, device='cuda:0', grad_fn=) cls_loss: tensor(7.0135e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0826, device='cuda:0', grad_fn=) cls_loss: tensor(8.3327e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.2452e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5432e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0313, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.4439e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1535, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(6.7949e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2129, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1160, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0.0198, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(4.6670e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.0069e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.5338e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1152, device='cuda:0', grad_fn=) cls_loss: tensor(3.5544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4421, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7286e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3180e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9472e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.2651e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0116, device='cuda:0', grad_fn=) cls_loss: tensor(4.8796e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0226, device='cuda:0', grad_fn=) cls_loss: tensor(7.9930e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1550, device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5942e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0182, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0337, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.7432e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5114e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0680, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2340, device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1280e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.2512e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.5564e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2292e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2842e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.3577e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.7883e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0842e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0399, device='cuda:0', grad_fn=) cls_loss: tensor(5.0267e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.6560e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0497, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.2319e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.8546e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6306e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(0.4005, device='cuda:0', grad_fn=) cls_loss: tensor(8.2056e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.8015e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.2260e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0269, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(1.7047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0808, device='cuda:0', grad_fn=) cls_loss: tensor(3.1193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.6559e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0848e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0388, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0274, device='cuda:0', grad_fn=) cls_loss: tensor(6.6996e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.4505e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.5844e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4861e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.5630e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4359e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6425e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.2784e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0248, device='cuda:0', grad_fn=) cls_loss: tensor(8.6904e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) 6.756874120406712e-05 changing lr epoch 27, time 357.21, cls_loss 0.0154 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.7485e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4650, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7345e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0373, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1148, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0204, device='cuda:0', grad_fn=) cls_loss: tensor(9.7791e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0396, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1493, device='cuda:0', grad_fn=) cls_loss: tensor(4.7425e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2326e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0505, device='cuda:0', grad_fn=) cls_loss: tensor(9.7950e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.7088e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0225, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0271, device='cuda:0', grad_fn=) cls_loss: tensor(8.9467e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0566, device='cuda:0', grad_fn=) cls_loss: tensor(0.0474, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1175, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5236, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.3182e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(9.8050e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2188e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.0929e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7120e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.9548, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0196, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6359e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.0598e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7325e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.1625, device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(5.1061e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1685e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1393e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3187e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0173, device='cuda:0', grad_fn=) cls_loss: tensor(1.8338e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.9913, device='cuda:0', grad_fn=) cls_loss: tensor(1.3272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0594, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3334e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.0982e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1261e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1028e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.5036e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0224, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.0135e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.4374e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.2584e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.5274, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.4572e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0412e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.6312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(5.1459e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.4154, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.2718e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0251, device='cuda:0', grad_fn=) cls_loss: tensor(1.7424e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1370, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.1194e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9804e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0649e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.0995e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6425e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8544e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.7275e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7022e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.6690e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.4717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3192e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.4285e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.3910e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0494, device='cuda:0', grad_fn=) cls_loss: tensor(3.7789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.5300e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.6188e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7551e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0195, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7797e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0327, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.4671e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5051, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.2790e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.2056e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0225, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1134e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1001e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(3.6001e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8200e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1452, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.3313e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.8903e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3843e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4373e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6538e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(7.9036e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0154, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(6.7155e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.2982e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0339e-05, device='cuda:0', grad_fn=) 6.545084971874736e-05 changing lr epoch 28, time 358.61, cls_loss 0.0293 306 cls_loss: tensor(1.0610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3203, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.1249, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(2.1716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0650, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2347, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7339e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0193, device='cuda:0', grad_fn=) cls_loss: tensor(5.4836e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.8213e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2391e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9822e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7326e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0420, device='cuda:0', grad_fn=) cls_loss: tensor(3.9975e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9056e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(3.6160e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.7088e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1918, device='cuda:0', grad_fn=) cls_loss: tensor(5.8810e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.8414e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7796e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.9987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6227e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7352e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.5169e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.2387e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0133, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.1612e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5159e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0419, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1434, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0300, device='cuda:0', grad_fn=) cls_loss: tensor(3.3657e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.5764e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0228, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0162, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3929e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.0268e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4286e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3577e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9345e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5159e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0551, device='cuda:0', grad_fn=) cls_loss: tensor(4.3114e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.0068e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2233e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0226, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(1.3403, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0167, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0814, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.7220e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(4.2021e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0970, device='cuda:0', grad_fn=) cls_loss: tensor(0.0388, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.0479e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.8677e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.9206e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0429, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1156, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.0293e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0983, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0285, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.4060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(9.1990e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5300e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.0532e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.0868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.5798, device='cuda:0', grad_fn=) cls_loss: tensor(1.9888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5054e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5512e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(1.2954e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.0995e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(2.7299e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.6625e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.0930e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.5301e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(1.0550e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3246e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.9439e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2816, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0672, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.4168e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(9.8725e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6113e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2678, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5234e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3822e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4240e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7817e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8822e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0721, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(3.6776e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0579, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.0888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5387e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.9935e-06, device='cuda:0', grad_fn=) 6.330184227833373e-05 changing lr epoch 29, time 355.96, cls_loss 0.0227 306 cls_loss: tensor(1.2815e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.8789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.6957e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0170, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.6369, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1458e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5564e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0195, device='cuda:0', grad_fn=) cls_loss: tensor(5.6585e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5412e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9154e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7021e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.0002e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4815e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9208e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(1.3570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9074e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.6386e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0119, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0226, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(6.0399e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(2.5630e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.6957e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1097, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0627, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.1844, device='cuda:0', grad_fn=) cls_loss: tensor(8.9208e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5855e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1745, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.3564e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0207, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.0864e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3180e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9512e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.2360e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.8803e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0692, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.5831e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8981e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6359e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(5.2512e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.2970e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.2452e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0461, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3180e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3789, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.6360e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1174, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1583e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.1327e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.0758e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9101e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1851, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.2398e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1981e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(8.0864e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0171, device='cuda:0', grad_fn=) cls_loss: tensor(0.0807, device='cuda:0', grad_fn=) cls_loss: tensor(3.9339e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.3749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.7154e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2159e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(3.5644e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(8.5354e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.4241e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4816e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.0599e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.0731e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(3.5365e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.6361e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0780, device='cuda:0', grad_fn=) cls_loss: tensor(3.7014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0327, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0510, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.0160e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.5518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.0003e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1127e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.2856e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) 6.11260466978157e-05 changing lr epoch 30, time 354.42, cls_loss 0.0082 306 cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(8.3049e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9339e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.9419e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(7.1327e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.8015e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(4.2121e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(1.6928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6558e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0757, device='cuda:0', grad_fn=) cls_loss: tensor(6.0598e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.3393e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.4439e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.8545e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.6155e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7962e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0563, device='cuda:0', grad_fn=) cls_loss: tensor(1.9630e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(1.5199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.4195e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.9957e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.9174e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.8346e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7531e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0584e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1265e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1563e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1498e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3519, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.2730e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7265e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8074e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6293e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3731e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.4505e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5934e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.1658e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(4.1326e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6376, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.4175e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3975e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.2605e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9074e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(1.1543e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.3292e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0357, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7233e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6558e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.4142e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0734, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0531e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.5116, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0073e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1061e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1291, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.2903e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0672, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0113e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0384, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1193e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.5018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0608, device='cuda:0', grad_fn=) cls_loss: tensor(4.8081e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2187e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3445e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2121e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2122e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(3.4412e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9134e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2484, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0349, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.2254e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5656e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.1797, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) 5.892784473993181e-05 changing lr epoch 31, time 357.66, cls_loss 0.0135 306 cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0492, device='cuda:0', grad_fn=) cls_loss: tensor(1.0371e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.4531e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.5003e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.4043e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.4022e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.4557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2220e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.2452e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1040e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4665e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0242, device='cuda:0', grad_fn=) cls_loss: tensor(9.4990e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0510, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0213, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.4822e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1738, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.2785e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7433e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.8016e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4948e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0628, device='cuda:0', grad_fn=) cls_loss: tensor(0.1139, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9379e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.2360e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(1.5577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0545e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9538e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.8308e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(1.0828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3762e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1812, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(2.8561, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1379e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0558, device='cuda:0', grad_fn=) cls_loss: tensor(1.6332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.0478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.1468, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1179, device='cuda:0', grad_fn=) cls_loss: tensor(5.5830e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(3.2584e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8120e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.4081e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(6.7949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5457e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0463, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.1393e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.6492e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.8995e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5961e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5194e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.4241e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3192e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7042e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6756e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(5.7499e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1074e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9910e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.0466e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0447, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0145, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) 5.6711663290882756e-05 changing lr epoch 32, time 359.43, cls_loss 0.0168 306 cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.8002e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.6464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.0864e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9233e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.1565e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0186e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7313e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.4811e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2651e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.4696e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1245e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3843e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(3.8723e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(2.2789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8306e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1794, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5616e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.4041e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.3122e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.5127e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1539e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(9.4275e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6345e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(3.5127e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7783e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5698e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0677, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1968, device='cuda:0', grad_fn=) cls_loss: tensor(4.9869e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0502, device='cuda:0', grad_fn=) cls_loss: tensor(3.2187e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(3.2783e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3060, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2868, device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0127, device='cuda:0', grad_fn=) cls_loss: tensor(2.6464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1384e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(2.4120e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4264, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.7352e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0442, device='cuda:0', grad_fn=) cls_loss: tensor(0.1564, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3532e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.4214e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0163, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(8.5235e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0774, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1311, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.8014e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.4643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3762e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6360e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0302, device='cuda:0', grad_fn=) cls_loss: tensor(5.7220e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0454, device='cuda:0', grad_fn=) cls_loss: tensor(0.0228, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(3.8822e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3206, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.4837e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.4041e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1901e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.1982e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.4830e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.4704e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(1.1702e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7685e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0252e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2497e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0185, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2539, device='cuda:0', grad_fn=) cls_loss: tensor(0.1117, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1857e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0247, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.0393e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6791e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6426e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5630e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.0863e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2624, device='cuda:0', grad_fn=) cls_loss: tensor(5.8214e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2319e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.3036e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7352e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0181, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0852, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(1.1146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1974e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.4639e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6956e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5235e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5554, device='cuda:0', grad_fn=) cls_loss: tensor(9.5427e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.5457e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) 5.448196544517166e-05 changing lr epoch 33, time 358.22, cls_loss 0.0171 306 cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(9.6758e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0629e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3115e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4937, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.5932, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0305, device='cuda:0', grad_fn=) cls_loss: tensor(5.2353e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0581, device='cuda:0', grad_fn=) cls_loss: tensor(2.1696e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0091, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.4041e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0230, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2055e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0141, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2159, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1803, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0983, device='cuda:0', grad_fn=) cls_loss: tensor(0.0969, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.1413e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4439e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1230, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.6108e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9941e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3630e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1202, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6777e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.8239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.6162e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.4896, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0373, device='cuda:0', grad_fn=) cls_loss: tensor(1.0908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0239, device='cuda:0', grad_fn=) cls_loss: tensor(7.1724e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7265e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0168, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1252, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0275, device='cuda:0', grad_fn=) cls_loss: tensor(1.0967e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6550e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4307e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.9888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5630e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1243, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2298e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.1319, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0369, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.2652e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.2187e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0649e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7796e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5551e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9074e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7948e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2298e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0392, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0184, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6935e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0191, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7498e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0352, device='cuda:0', grad_fn=) cls_loss: tensor(1.3789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(9.1553e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1776e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0219, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(4.0730e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1166e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.4506e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2855e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(7.1128e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6160e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1175, device='cuda:0', grad_fn=) cls_loss: tensor(7.4506e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8810e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2902e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3909e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0163, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1969e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1007, device='cuda:0', grad_fn=) cls_loss: tensor(7.8579e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(7.4188e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.3213e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4372e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9585e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) 5.2243241517525733e-05 changing lr epoch 34, time 357.39, cls_loss 0.0185 306 cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(1.3570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.6227e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9254e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.6756e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7293e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5447, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.6849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(3.4372e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.8658e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.2783e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0488, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9406e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2358e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(7.1128e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0431, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0628, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2626, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0929e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.6161e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0269, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0793, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0181, device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1861e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3967e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0790, device='cuda:0', grad_fn=) cls_loss: tensor(5.0863e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5498e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7485e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3948e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5280e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3246e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.6426e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2910, device='cuda:0', grad_fn=) cls_loss: tensor(1.5557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7260e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8108e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0252, device='cuda:0', grad_fn=) cls_loss: tensor(2.5630e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2430, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.2288e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0757e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(4.0770e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.4108e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0503, device='cuda:0', grad_fn=) cls_loss: tensor(2.3246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6862e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1861e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.7352e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.6094e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9803e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8048e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.8843e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(6.0201e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9963e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(4.0730e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.7306e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1954e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.0202e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0689, device='cuda:0', grad_fn=) cls_loss: tensor(0.0140, device='cuda:0', grad_fn=) cls_loss: tensor(0.0335, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(6.6360e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(4.0889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4722e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6359e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8505e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9214e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.4485e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0240, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.1724e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(2.3504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0021e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4722e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0173, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7630e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(1.2795e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3451e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(1.3828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0709e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0558e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2796e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8081e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7049e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.3114e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0713, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.8149e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1466, device='cuda:0', grad_fn=) cls_loss: tensor(0.0562, device='cuda:0', grad_fn=) cls_loss: tensor(1.0550e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) 4.999999999999998e-05 changing lr epoch 35, time 360.38, cls_loss 0.0109 306 cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.7584e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.6094e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9672e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.0228e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0642, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(1.0431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0053e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8378e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0119, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(6.3260e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.7917e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0300, device='cuda:0', grad_fn=) cls_loss: tensor(0.0125, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5233e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9689e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2709e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0355, device='cuda:0', grad_fn=) cls_loss: tensor(5.8671e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.8465e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.2016e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7220e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7551e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6359e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.7254e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.4717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6559e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9883e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1776e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.8811e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(1.1961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.0665e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5366e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3909e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1881e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.4307e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9869e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.8347e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1127e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0281, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.5167e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.6762e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0261, device='cuda:0', grad_fn=) cls_loss: tensor(2.7517e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0298, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9968e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3910e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(8.4639e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2532e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5936e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3212e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6360e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.1299, device='cuda:0', grad_fn=) cls_loss: tensor(0.0365, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2968e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.0731e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.4573e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.5897e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2611e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0238, device='cuda:0', grad_fn=) cls_loss: tensor(3.2147e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4703e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.8121e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9094e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0864e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0181, device='cuda:0', grad_fn=) cls_loss: tensor(0.0296, device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.8526e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(4.6333e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0519, device='cuda:0', grad_fn=) cls_loss: tensor(1.1941e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0423, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0766, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) 4.7756758482474244e-05 changing lr epoch 36, time 356.36, cls_loss 0.0026 306 cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.0796e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2074e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5636e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3909e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.1050, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3530e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.7553e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1571e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.0598e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.8544e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.1048, device='cuda:0', grad_fn=) cls_loss: tensor(1.6431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0068e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7023e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.2718e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.0988e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.6691e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0863, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3379e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.6612e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.0531e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0895e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9890e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7088e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3166e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8923e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2616e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7213e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.0730e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.5169e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1870e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.7285, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.5831e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1165, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7882e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1327e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1782e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1724e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7088e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0723e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(7.7287e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.0665e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6160e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(4.3511e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.8546e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1181e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9803e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1623e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(2.6703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9008e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0456, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0351, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.2453e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.5365e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(4.2280e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.1756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.3987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7359e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.3686, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.8240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.2055e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4041e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.1572e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0252e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7551e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0119, device='cuda:0', grad_fn=) cls_loss: tensor(2.0047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(1.5318e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0097, device='cuda:0', grad_fn=) cls_loss: tensor(0.0444, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.4970e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8346e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8148e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.0643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.2850e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.2187e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1525e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.4163, device='cuda:0', grad_fn=) cls_loss: tensor(4.9392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.8174e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.3241e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1040e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) 4.551803455482832e-05 changing lr epoch 37, time 356.94, cls_loss 0.0074 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6559e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0466, device='cuda:0', grad_fn=) cls_loss: tensor(3.3379e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.4631e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2357, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0522, device='cuda:0', grad_fn=) cls_loss: tensor(4.3670e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7722e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0405e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0210, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7155e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0305e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7220e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0140, device='cuda:0', grad_fn=) cls_loss: tensor(3.9140e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7519e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5961e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0214, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4835e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0537e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.7810e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5498e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(8.0268e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.4108e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3371e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6559e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.8564e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3458e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.6757e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4511e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.0069e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.4043e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3252e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.4968e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(4.0273e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6094e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9432e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7068e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9644e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.4440e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.8810e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2849e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9359e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0167, device='cuda:0', grad_fn=) cls_loss: tensor(2.2093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2060e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.8842e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5775e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0300, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0419, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.6756e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.2651e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8213e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3896e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9836e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.3843e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(4.9849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0994e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0458, device='cuda:0', grad_fn=) cls_loss: tensor(8.4877e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.3645e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6073e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4770e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(3.0080e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0314, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.2174e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4293e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9339e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.4254, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.1061e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.5366e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0250, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7816e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0250, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0510e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1488, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.4140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6028e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0706, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.1182e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.2785e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(8.4043e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.2823e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9339e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3114e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1348, device='cuda:0', grad_fn=) cls_loss: tensor(8.1658e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1301, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.4762e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7293e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4395, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(1.5338e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1106e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.8282, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 4.328833670911722e-05 changing lr epoch 38, time 361.18, cls_loss 0.0128 306 cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0248, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(1.3649e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9937e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7385e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8696e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.2082e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1261e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(8.4440e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(4.9869e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0190, device='cuda:0', grad_fn=) cls_loss: tensor(1.9014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(9.7156e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0558, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1528, device='cuda:0', grad_fn=) cls_loss: tensor(6.5366e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5338e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(1.5622, device='cuda:0', grad_fn=) cls_loss: tensor(7.3711e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0271, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0158, device='cuda:0', grad_fn=) cls_loss: tensor(0.0471, device='cuda:0', grad_fn=) cls_loss: tensor(7.6493e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0763, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(9.4573e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2452e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.1222, device='cuda:0', grad_fn=) cls_loss: tensor(9.8467e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1218, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(7.0135e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4703e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.6425e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.0864e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(8.3864e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0967e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.8750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0689e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4809e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0814, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4583e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0939, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(8.1162e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3808e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8214e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4173e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6359e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.0546e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9916e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6756e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(3.3240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0412e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.2849e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.5199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7862e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5200e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.4373e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.2983e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9306e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4373e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.4593e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.0532e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1378e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3232e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5378e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9804e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.5910e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0526, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0362, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2320e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(1.0570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(2.7061e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) 4.1072155260068164e-05 changing lr ---------------------saving model at epoch 39---------------------------------------------------- epoch 39, time 358.80, cls_loss 0.0086 306 cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3466, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.6062e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.1143, device='cuda:0', grad_fn=) cls_loss: tensor(2.0842e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7286e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3246e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.6843e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.2155e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0487, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(7.7287e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.3294e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.8578e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9869e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.6029e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1566, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.0650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0494, device='cuda:0', grad_fn=) cls_loss: tensor(5.0088e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.0100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.8612e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.8790e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.4703e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.2227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7266e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2646, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7352e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.8247e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0629e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.8678e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5080e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.0135e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0198, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.2180, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(4.0332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.0069e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.0598e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8127e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2745e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0258, device='cuda:0', grad_fn=) cls_loss: tensor(0.0943, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5632e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(1.7623e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(3.4412e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5961e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.1105, device='cuda:0', grad_fn=) cls_loss: tensor(7.6294e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.5446e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0531e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0876, device='cuda:0', grad_fn=) cls_loss: tensor(6.0797e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9140e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9659e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(5.0267e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7751e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.8010e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.6009, device='cuda:0', grad_fn=) cls_loss: tensor(1.2239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0216, device='cuda:0', grad_fn=) cls_loss: tensor(8.2254e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.0200e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0232e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0165, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5233e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5835e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.0399e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9008e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(9.8149e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2518e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4770e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7089e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.4265e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.0069e-06, device='cuda:0', grad_fn=) 3.8873953302184275e-05 changing lr ---------------------saving model at epoch 40---------------------------------------------------- epoch 40, time 361.67, cls_loss 0.0076 306 cls_loss: tensor(0.0823, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(0.0300, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.4452e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2974e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.1683, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.2420e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0336, device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.7008e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4036e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1831, device='cuda:0', grad_fn=) cls_loss: tensor(6.0201e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7075e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.3525e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0220, device='cuda:0', grad_fn=) cls_loss: tensor(7.7784e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4886, device='cuda:0', grad_fn=) cls_loss: tensor(8.9010e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(8.6228e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6756e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9937e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0955, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(2.9008e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.1989e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0322, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5962e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.4982e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2657e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(6.4552e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(5.6227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3379e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4173e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.4638e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.0737e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0180, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1160e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.7883e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0658e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(4.3114e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0564e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1404e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8286e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5962e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7948e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0135, device='cuda:0', grad_fn=) cls_loss: tensor(1.3053e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3366, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.1564e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0631e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6166e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0665e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0232e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1592e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.8235e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.6120e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.4505e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(4.3313e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7166e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.2990e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.6001e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0226e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.8319e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.5169e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.1617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.7346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.6657e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.7553e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(3.4908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.2420e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.3067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.5765e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.0996e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.3975e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.0942e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2578e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7743e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(7.7089e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.6705e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0383, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5340e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2055e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.4624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5775e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0247, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.0202e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4769e-05, device='cuda:0', grad_fn=) 3.669815772166625e-05 changing lr epoch 41, time 357.39, cls_loss 0.0055 306 cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8213e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.4639e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2696e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0320, device='cuda:0', grad_fn=) cls_loss: tensor(8.3546e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(8.6089e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1770e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(2.8352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.4704e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0157, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.0730e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0523, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.6816e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.1133, device='cuda:0', grad_fn=) cls_loss: tensor(3.6120e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1599e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.2189e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.2106, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2630e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.8169e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2055e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.0624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0241, device='cuda:0', grad_fn=) cls_loss: tensor(1.2974e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(5.5830e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.4551, device='cuda:0', grad_fn=) cls_loss: tensor(2.9544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7155e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.5632e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.2254e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.8030e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0633, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9935e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.0472e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.6221e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3901e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7882e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5101e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0371e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7155e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.6141e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0293e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.2784e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.0996e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5326e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.3869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4838, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3279e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.1553e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.8598e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0336, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1512e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4703e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1853, device='cuda:0', grad_fn=) cls_loss: tensor(6.6956e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0166e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.3314e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7842e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(3.4650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(8.0069e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.3466e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0786, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.3975e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5015e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0197, device='cuda:0', grad_fn=) cls_loss: tensor(6.8943e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9935e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.6227e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7929e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.8737e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0550e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1966, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(1.6431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(5.2194e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7287e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8219e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.5214e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.2114e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4445e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(5.6148e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2651e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5961e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) 3.454915028125263e-05 changing lr epoch 42, time 356.34, cls_loss 0.0075 306 cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1378e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1431, device='cuda:0', grad_fn=) cls_loss: tensor(0.0285, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.7784e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3855e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5432e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7352e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.0081e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(5.7062e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.3458e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.1592e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1477, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.4783e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9870e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.5360e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2776e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.7088e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9140e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0119, device='cuda:0', grad_fn=) cls_loss: tensor(6.1631e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.8876e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7063e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.4173e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7155e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.4108e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2616e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.3380e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3180e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3192e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3433e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.3445e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.2533e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0892, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.9703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8135e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.2557, device='cuda:0', grad_fn=) cls_loss: tensor(3.7690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3511e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0708, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0709e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(9.8745e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0214, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1115e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.2300e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4307e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.8127e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0097, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4796e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.4372e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0258, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1579, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0533, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.4583e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8300e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(7.8360e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.8227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(1.1245e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2844e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8314e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0581, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(1.2894e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3777e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2435, device='cuda:0', grad_fn=) cls_loss: tensor(2.1676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.0526e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.8399e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2187e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(1.5537e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.8677e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2060e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5101e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1327e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5532e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0287, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.3544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4021e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(2.4100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.3181e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3995e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.1697e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0164, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0080e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9818e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.8240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9803e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0113e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.6956e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.0332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6371e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1106, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.9904e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.6201e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) 3.2431258795932867e-05 changing lr epoch 43, time 357.10, cls_loss 0.0085 306 cls_loss: tensor(1.6987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5169e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6325e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(4.4107e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0126, device='cuda:0', grad_fn=) cls_loss: tensor(2.2491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9672e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.7253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.2783e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.8916e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.0639e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.0313e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5934e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.9141e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0218, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(4.7127e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0253, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8604e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3246e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.4439e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.4162e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.3976e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4638e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0810, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1266e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0460, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.8770e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0179, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.0868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.0267e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3061e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.7286e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.6625e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.7227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.1367, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(6.8347e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5785e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(1.6073e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0264, device='cuda:0', grad_fn=) cls_loss: tensor(2.9167e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1193e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0273e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.4837e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.0798e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.0398e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.1949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0863e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3967e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3181e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.8963e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5630e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2975e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.4253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1114e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3022e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2219e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.7988e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4967, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(7.0930e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5477e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(3.6438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.8493e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.6936e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.4902e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.7856e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.4259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1458e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1086e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.1088e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1194e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.6028e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9142e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0689e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3947e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) 3.0348748417303827e-05 changing lr epoch 44, time 356.52, cls_loss 0.0033 306 cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1299e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.0915e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5300e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2373e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5167e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.7870e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(9.4175e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5498e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.5963e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.8280e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.5320e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0415, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.3977e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6094e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.9538e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.0016e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5896e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0915e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6458e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.6227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(9.7156e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.4126e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1120e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5008e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.4418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1205, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7022e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.0021e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.6889e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4041e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.1398, device='cuda:0', grad_fn=) cls_loss: tensor(0.0231, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7948e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3061e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4804e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4769e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.4857e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7352e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4776e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2055e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0073e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1327e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.5068, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8318e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.1559e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.3266e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0347, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1592e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3386, device='cuda:0', grad_fn=) cls_loss: tensor(1.0451e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3292e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.0490e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.4367e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.6896e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0794, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6597e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0359, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.7901e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.4036e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(3.0597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7021e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.4319e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0972, device='cuda:0', grad_fn=) cls_loss: tensor(3.4312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.0074e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(9.8507e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.4866, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7286e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5233e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.6293e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1215, device='cuda:0', grad_fn=) cls_loss: tensor(8.6526e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.7691e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.5570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9206e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.8016e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0053e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2219e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4782e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(7.2161e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9400e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.8301e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6294e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1260e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0705, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2894e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) 2.830581304412209e-05 changing lr epoch 45, time 358.60, cls_loss 0.0138 306 cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6426e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7107e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(7.2638e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9061e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7551e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(8.7937e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0720, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3280e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.5505e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3896e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0349, device='cuda:0', grad_fn=) cls_loss: tensor(2.1279e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.4703e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7711e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6107e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.8346e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1194e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4835e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(6.8148e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0207, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0383, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0161, device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0190, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(0.0171, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.7485e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.1856e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(8.5453e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.7286e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.3977e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.4969e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0217, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3862e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.9472e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.2831e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.8545e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2338e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3313e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.4505e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0267, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1414, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0091, device='cuda:0', grad_fn=) cls_loss: tensor(6.1393e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7572e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3380e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.6595, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.8810e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3228e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9936e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.0500e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9010e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.6453e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.0366e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0834, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.0073e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0091, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.1062e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2457e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9391e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0580, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4968e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.4638e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9472e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6359e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.0730e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.2100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.9797e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1841e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0211, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.7379e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9274e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4776e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0649e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(6.0002e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.0479e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2134e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.3181e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7976e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1935e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.8677e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2815e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1288, device='cuda:0', grad_fn=) cls_loss: tensor(3.1034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7120e-05, device='cuda:0', grad_fn=) 2.6306566876350062e-05 changing lr epoch 46, time 355.85, cls_loss 0.0049 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8160e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.2212, device='cuda:0', grad_fn=) cls_loss: tensor(3.1273e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0233e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0951, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5074e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3221e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7049e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8855e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6175e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8114e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3844e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8346e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0087e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.8936e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3909e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3843e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.6890e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2691e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3049e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.2624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0229, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0543, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1969e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4107e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1326e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1623, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.8499e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5358e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4175e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7353e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4319e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3915e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(7.2042e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.8274e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.5432e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4835e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5233e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.6187e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7817e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2398e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.7156e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0356, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.0751e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1194e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1166, device='cuda:0', grad_fn=) cls_loss: tensor(4.1246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.9022e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.9524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1684e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7419e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7147e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.5281e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.9804e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.0902e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9472e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.0069e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7882e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(6.5168e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1790e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7782e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(2.8888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.8877e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(3.3379e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6425e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.8347e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.2320e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.8414e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(4.7485e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.3957e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0251, device='cuda:0', grad_fn=) cls_loss: tensor(2.8789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0420, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(7.2916e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.2584e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.8153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1836e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3180e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4611e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3049e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0171, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6809e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.0068e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4623e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.3591e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(6.9141e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) 2.4355036129704693e-05 changing lr epoch 47, time 358.34, cls_loss 0.0063 306 cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.3778e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.3579e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(6.7155e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.2400e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6492e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8087e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(3.1352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.7818e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.4969e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5616e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.5584e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0382, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6967e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.4108e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.2849e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2087e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0796e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.4771e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3246e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.5572e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.9617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8564e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0863e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.3848e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0361, device='cuda:0', grad_fn=) cls_loss: tensor(1.9928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0222, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.5168e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0033e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(8.0466e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9036e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.4697e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.0597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.0884e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3247e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(9.3063e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(3.0140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3313e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9140e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.4107e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7578e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.3049e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(7.7287e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0250, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0270, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6147e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0411, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(2.1676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.1176, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.8903e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7949e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(7.6870e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.5235e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4915e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.0332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0293e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4618e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.9737e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2694, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.0996e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5551e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9745e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.1724e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0212, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2415, device='cuda:0', grad_fn=) cls_loss: tensor(1.5001e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(3.0478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9935e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.7109e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.1586, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0134e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5035e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6360e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.6956e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.6690e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8187e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.0796e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6451e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.2948e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(4.4505e-06, device='cuda:0', grad_fn=) 2.2455150927394874e-05 changing lr epoch 48, time 354.26, cls_loss 0.0070 306 cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7818e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9937e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0554, device='cuda:0', grad_fn=) cls_loss: tensor(9.3420e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.0400e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.8280e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.1857e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.7603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.2652e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.8532e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.4639e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7186e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.6193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3669e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5235e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0497, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.4968e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0519e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.3200e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(7.4208e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.0995e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8147e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.7948e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3167e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3649e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3212e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6558e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.8666e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7095e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.4307e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.0398e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0742e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(8.3705e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4835e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(8.9010e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6743e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.9770e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9606e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3386e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.5544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.3485e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3234e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.3783e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.6920, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.1146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3975e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0682, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2717e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.1923e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3201e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9465e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1338e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.4307e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.8142e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0479, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3472e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.6387e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5452e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.1393e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0134, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0244, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.6174e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.5637e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.5897e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.4174e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8934e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5830e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0228, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.3114e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.8944e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(2.4835e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7817e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.1128e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.2586e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9366e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5254e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) 2.0610737385376345e-05 changing lr epoch 49, time 358.20, cls_loss 0.0036 306 cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(4.6134e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.2584e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(5.5373e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.5520, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4769e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9108e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0154, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(3.1352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3511e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.8745e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.6175e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0187e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.8281e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0565, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.0202e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.9936e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8048e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2816e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0042e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.5062e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.4434e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.2783e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.6464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.4836e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.5784e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2803e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(1.3928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(5.3167e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1265e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9208e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5656e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7551e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1127e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0201, device='cuda:0', grad_fn=) cls_loss: tensor(1.2875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.8811e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3645e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1367e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9074e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9406e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.4206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.8546e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.6889e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.3590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0275, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.8565e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(1.1881e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0649e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(1.8179e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(6.9479e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1305e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1717, device='cuda:0', grad_fn=) cls_loss: tensor(5.6883e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.0003e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.4704e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.7551e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5080e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.3997e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2213, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.0930e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1168e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.6439e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4285e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.1458e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.0531e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7485e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0152, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.4573e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) 1.8825509907063323e-05 changing lr epoch 50, time 356.73, cls_loss 0.0042 306 cls_loss: tensor(6.2346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.1110, device='cuda:0', grad_fn=) cls_loss: tensor(8.4360e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(4.6432e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.6307e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.9976e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0551, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9738e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.4023e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.3128, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(9.3838e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.8644e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.8969e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.4968e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(2.4041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1604e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4041e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3385e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0645, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0540, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.1989e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.1459e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5001e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8148e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3223, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5637e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1186e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0216, device='cuda:0', grad_fn=) cls_loss: tensor(4.5101e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4835e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.3579e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.7751e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.1595, device='cuda:0', grad_fn=) cls_loss: tensor(4.6690e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(2.7259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4863e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9366e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0700, device='cuda:0', grad_fn=) cls_loss: tensor(3.0498e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2319e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7021e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8195e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0796e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.3445e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1915e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0277, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.2908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.2717e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0658, device='cuda:0', grad_fn=) cls_loss: tensor(3.3379e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.6094e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9909e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0206, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.8863e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6212e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.6492e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8433e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(4.3909e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.5830e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9904e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3577e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(3.4372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(6.1989e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.8789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0510e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5765e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.7419e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.3777e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.0075e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1016e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0139, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.8393e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.2842e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) 1.710306370301437e-05 changing lr epoch 51, time 356.93, cls_loss 0.0081 306 cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7021e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1782e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4623e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7816e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(1.8199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5101e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.0135e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0186, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.6816e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0166e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4710e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(5.8194e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0707, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5279e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3095e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7862e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3247e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4533e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.8352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.0597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.4041e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0267, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(4.4703e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1742e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0265, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2775e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.2254e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0710e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6690e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.2253e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6630e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.6095e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7948e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6075e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0215, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.9049e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0710, device='cuda:0', grad_fn=) cls_loss: tensor(4.7843e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0384, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.3114e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(1.4603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0423, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9339e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4080e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.0294e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(2.1458e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0581, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.7836e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.0731e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2121e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1451e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.2304, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0252e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(4.4902e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.0028e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1285e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(1.1484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2550e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7181e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.1067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0298, device='cuda:0', grad_fn=) cls_loss: tensor(4.4922e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0498e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7803e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.4121e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6371e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.2954e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(1.2279e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.8293e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(5.4836e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.5446e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.8201e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1856e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5564e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.6691e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 1.5446867550656765e-05 changing lr epoch 52, time 357.03, cls_loss 0.0026 306 cls_loss: tensor(2.4100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3228e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(3.2465e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.6559e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7862e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(9.6361e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.7949e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9809e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(7.4506e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.7724e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.2122e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0351e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7101e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6531e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3192e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1201, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(9.9540e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.6108e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6399e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.6227e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0241, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1563, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4988e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.4041e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0371e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0550e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2537e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0536, device='cuda:0', grad_fn=) cls_loss: tensor(9.3063e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.2585e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5771e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1358e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3186e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6325e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(6.7751e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3294e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7949e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9200e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1038, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.6399e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5159e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.6758e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.1791e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2518e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.2579e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.7194e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.6095e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0033e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9160e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9830e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.1989e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7333e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.8917e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2974e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(3.5365e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1652e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.2519e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5498e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.4175e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.6003e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.5658e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.3657, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.8360e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.0202e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.6029e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.8015e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2518e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8014e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.8611e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1975e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2775e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.6418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3967e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(7.7625e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.2153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5365e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) 1.3860256808630425e-05 changing lr epoch 53, time 357.45, cls_loss 0.0030 306 cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(8.5831e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3048e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.4815e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8020e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1525e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.5532e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.4604e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4373e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0407e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.0995e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.4639e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7145, device='cuda:0', grad_fn=) cls_loss: tensor(8.5354e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2616e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7796e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.8147e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.3778e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6359e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4968e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6232e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7565e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(4.9869e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3711e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.0002e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.8479e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4458e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3975e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.0665e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3550e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.3975e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.3976e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.7685e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1923e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.1299e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3181e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2717e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.4174e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1424e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3083e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6279e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9016e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0486, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.3678e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8948e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9606e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0881e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6095e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1193e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.9909e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7286e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.9544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1265e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.3379e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0611, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(1.3987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0591, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(1.6212e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.6758e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2333e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.0930e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4325e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0126, device='cuda:0', grad_fn=) cls_loss: tensor(1.4563e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5585e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.1347e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0131, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0187e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3530e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.2358e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0399e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1589, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.0332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.8944e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0135, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(5.7399e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.4506e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8179e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3313e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5797e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2386e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(8.2850e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.2785e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0875, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0173, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0214, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1769e-05, device='cuda:0', grad_fn=) 1.2346426699819455e-05 changing lr epoch 54, time 356.20, cls_loss 0.0051 306 cls_loss: tensor(9.9719e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.2466e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(4.0929e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9948e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.1428, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0471, device='cuda:0', grad_fn=) cls_loss: tensor(5.3505e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.6426e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.3514, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(2.1756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7154e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.3910e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.2895e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.1442, device='cuda:0', grad_fn=) cls_loss: tensor(1.1802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1022e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.1424e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.7352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.4440e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.4374e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.6339e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1089, device='cuda:0', grad_fn=) cls_loss: tensor(6.8148e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.3777e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0808e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4116e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(8.2652e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1186e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4742e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(4.8280e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.6431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9008e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9010e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0.0290, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.8280e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.2320e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5452e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.1061e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.8611e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.9737e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.1115e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.1787, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1630e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(1.4643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0316, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.4439e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.1326e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.7904, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0345e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8219e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2894, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.0200e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.9008e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1458e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3658e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.8694, device='cuda:0', grad_fn=) cls_loss: tensor(7.2916e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9771e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(2.8908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8213e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0167, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9540e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.7916e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.0943e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.0599e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1015e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.3697e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2320e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(1.9928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(1.7365e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3486e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-05, device='cuda:0', grad_fn=) 1.0908425876598507e-05 changing lr epoch 55, time 356.09, cls_loss 0.0136 306 cls_loss: tensor(3.6001e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.6493e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.2307e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5167e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2081e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1730e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3550e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.0717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.1261e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2187e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2055e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.2850e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(7.5698e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.8213e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0862e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(6.6102e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7790e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0289, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4835e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5035e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9008e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.7285, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6200e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.5644e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7135e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4572e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(2.1895e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6028e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.0267e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(2.0981e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9737e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1193e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0116, device='cuda:0', grad_fn=) cls_loss: tensor(4.3909e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.9976e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6584e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0080e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0317, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(3.0955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0233e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1525e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.2101e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.8546e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8651e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9010e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(6.2982e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5020e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0116, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0531e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2179e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1618e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1285e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2333e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.6738e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.9472e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0351e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.8876e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9822e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.4120e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.9008e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3499e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6941e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.2253e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4704e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4372e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1015, device='cuda:0', grad_fn=) cls_loss: tensor(2.5789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0247, device='cuda:0', grad_fn=) cls_loss: tensor(0.0207, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.8770e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.2585e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0169, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7619e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.6756e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5174e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.2433e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0649e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.5148e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0145, device='cuda:0', grad_fn=) cls_loss: tensor(9.1791e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.0565e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3186e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3049e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(4.3750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6669e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.8040e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2616e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.4704e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.0599e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2835e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.6559e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5896e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.3977e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3108e-05, device='cuda:0', grad_fn=) 9.549150281252631e-06 changing lr epoch 56, time 355.81, cls_loss 0.0036 306 cls_loss: tensor(2.1438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6439e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(6.3380e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0719, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.6360e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.1027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.8082e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8862e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.7300e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.1136e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0863e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7724e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0158, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.3685e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1968, device='cuda:0', grad_fn=) cls_loss: tensor(7.2718e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0073e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.6956e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(3.4471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3584e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2377, device='cuda:0', grad_fn=) cls_loss: tensor(0.2864, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.8016e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0399, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.8029e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0304, device='cuda:0', grad_fn=) cls_loss: tensor(3.5882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(7.1724e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.5233e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.4261e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0730e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4273e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.2519e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1358e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3941e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9294e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(5.2333e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8213e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.8478e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(7.5102e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.0480e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0425, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.5248e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.2068e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(7.1327e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0550e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.1265e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.8122e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0220, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0323, device='cuda:0', grad_fn=) cls_loss: tensor(0.0478, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0140, device='cuda:0', grad_fn=) cls_loss: tensor(1.8199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0446, device='cuda:0', grad_fn=) cls_loss: tensor(2.2391e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(1.1802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2113e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7916e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.8744e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(2.5253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.7089e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5080e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(4.1525e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0306, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(8.7619e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.9782e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2734, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5326e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.9379e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(2.9186e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5815e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.0797e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6757e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.0929e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7500e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.2571e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3379e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.4988e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-05, device='cuda:0', grad_fn=) 8.271337313934865e-06 changing lr epoch 57, time 361.23, cls_loss 0.0051 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.4552e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.2387e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0438, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.0658e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.5566e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0196, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.4969e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0406, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.5352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2497e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.0466e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2737e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0151, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2656e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5080e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.4240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9201e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1062e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0152, device='cuda:0', grad_fn=) cls_loss: tensor(2.4835e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.0704e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8015e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(7.0135e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(6.4174e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(4.6492e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4451e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.3983e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0159, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.2239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.9472e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0629e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(8.7619e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9869e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0192e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5365e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.5836e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4835e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7678e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3314e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.4173e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2696e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7174e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(2.8213e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4265e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.6625e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8074e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.1623e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9252e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7751e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7074e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.8116e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.3379e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1062, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7901e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.2651e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4769e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.0995e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.2637, device='cuda:0', grad_fn=) cls_loss: tensor(3.9935e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.2784e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8763e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.1061e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0968e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4842e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1558e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3730e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1458e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8954e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.0996e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.2326e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0425, device='cuda:0', grad_fn=) cls_loss: tensor(2.7816e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0176, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(3.7452e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(1.1583e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.4187e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.4241e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.1001e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.8825e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2809e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.6804e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0272, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0134e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.2777e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(4.9849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2584e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1791e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.2254e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) 7.077560319906693e-06 changing lr epoch 58, time 364.07, cls_loss 0.0025 306 cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(3.2961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0225, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(1.2616e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3684e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(2.6882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(7.1923e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7751e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.4505e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7247e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.2982e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2994e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.2665e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3367e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1379e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7843e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5831e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.6624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0221, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0862e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0163, device='cuda:0', grad_fn=) cls_loss: tensor(4.3849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(6.3976e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.1047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1261e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8069e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.0532e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.8332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1388, device='cuda:0', grad_fn=) cls_loss: tensor(8.0069e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5815e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3026, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0154, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.8413e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.4664e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.8220e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4970e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.2122e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4373e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0505, device='cuda:0', grad_fn=) cls_loss: tensor(1.4683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0134e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7345e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8120e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5054e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3073e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.6823e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.5287e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.1007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.6029e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9935e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.2121e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.8612e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0935e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.3697e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6345e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7107e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5948e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0337, device='cuda:0', grad_fn=) cls_loss: tensor(9.0599e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.7464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(7.1923e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3181e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(1.0073e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4902e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3313e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0177, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1724e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9584e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0254, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.2915e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.4439e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.3778e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.8254e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5035e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1326, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.1367e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0654, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3981e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0236, device='cuda:0', grad_fn=) cls_loss: tensor(7.5599e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0716, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(8.6010e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1086e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0982e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.6458e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.9169e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) 5.970223407163098e-06 changing lr epoch 59, time 356.49, cls_loss 0.0036 306 cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.2254e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1001e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9803e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7458e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9149e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0911, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9804e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.1233e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5765e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.1414, device='cuda:0', grad_fn=) cls_loss: tensor(4.9472e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.8532e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.0796e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7485e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2596e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.7088e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6895e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(1.1841e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8611e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.8348e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.5075e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.0227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1106e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.0002e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1878, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.8149e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8215e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3313e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.9763e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7226e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.0131, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.4208e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4378e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.0731e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.2518e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.7156e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6756e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0593, device='cuda:0', grad_fn=) cls_loss: tensor(1.1782e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0196, device='cuda:0', grad_fn=) cls_loss: tensor(0.1895, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.1194e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8855e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9109e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9325e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.2916e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.7021e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.4632, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.6426e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.6703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7365e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(5.8413e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1086e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.0994e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4372e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.0863e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(1.1543e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.8214e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7817e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.7551e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0107e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.0930e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.4066, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5796e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0848e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.2055e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.0864e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.2970e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0400, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.0267e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.0439e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7948e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(4.2359e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.6387e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.6034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.6625e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.3976e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) 4.9515566048790464e-06 changing lr epoch 60, time 355.95, cls_loss 0.0089 306 cls_loss: tensor(2.6246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.0400e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3511e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.1260e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2147e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.0473e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0135, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.3182e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.0128e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(7.4168e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5300e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8988e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.5008e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0797e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.8280e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.0864e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2279e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.5036e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.8115e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(4.6889e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.4519e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8160e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.0413e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2341, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.2333e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.5691e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3371e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3577e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(4.9273e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.9745e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5830e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0926, device='cuda:0', grad_fn=) cls_loss: tensor(7.2916e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0626, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.7175e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5579e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.8016e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5698e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9995e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.6484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(4.0352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(6.6360e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1837e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.4704e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1948e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.7862e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7419e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.6253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(6.8108e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3180e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.0531e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.6624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.8346e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(5.4340e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(7.9274e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3180e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0510e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0387e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7909e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4373e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3941e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6848e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(5.7022e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.8888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1982e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3901e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.1133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.2850e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1856e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8968e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5994e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0200e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8040e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4563e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0206, device='cuda:0', grad_fn=) cls_loss: tensor(1.8656e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 4.0236113724274705e-06 changing lr epoch 61, time 354.62, cls_loss 0.0018 306 cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.7080e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.8745e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.8665e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5795e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(2.9842e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0212, device='cuda:0', grad_fn=) cls_loss: tensor(6.9340e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0400, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1896e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5597e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.8811e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6559e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5896e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.8346e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.2189e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2253e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0343, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1782e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3623e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.0267e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8829e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0377, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0629e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.4961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5948e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.4970e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3313e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3645e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0152, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.4769e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7287e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.2122e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3577e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9391e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(6.0399e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8114e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2785e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3445e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8497e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1327e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.8876e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7154e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9168e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1365e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2564e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1497e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2983e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5612e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.4440e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4625e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2339e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.0996e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5234e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.1525e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(9.1871e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.6823e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(2.4478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.9810e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2651e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0227, device='cuda:0', grad_fn=) cls_loss: tensor(5.4638e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.8281e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6558e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5432e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.8413e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.5234e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5432e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1538e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6710e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2497e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4954e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.1770e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.3419e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.2020e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(9.7613e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.5365e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5167e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.2089e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.1525e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.8414e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.0863e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7398e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.4517e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4373e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0788e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(3.5564e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.0743e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.7538e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1413, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1107e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3351e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8850e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0241, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 3.188256468013139e-06 changing lr epoch 62, time 356.37, cls_loss 0.0015 306 cls_loss: tensor(1.2795e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2732, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8016e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2060e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8517e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1591, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.3445e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0490e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.1989e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9540e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(6.7949e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.7134e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.8479e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0214, device='cuda:0', grad_fn=) cls_loss: tensor(0.0256, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9181e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9936e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.0797e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(5.7896e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0834, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4929e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0593, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3073e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.6690e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0197, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4968e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.6401e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9140e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8544e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2585e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4815e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.5578e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.8213e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5200e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2982e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.8485e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9790e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0021e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.2056e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(1.9352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9538e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3351e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0133, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.7346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8505e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(6.1790e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(4.6432e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.2189e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0623, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(7.2161e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3947e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9141e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.0280e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0252, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.2055e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8366e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5924e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.2586e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0300, device='cuda:0', grad_fn=) cls_loss: tensor(0.0151, device='cuda:0', grad_fn=) cls_loss: tensor(9.9738e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2640, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5657e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.6492e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.4472e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8677e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9560e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.6371e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.2751e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7325e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8413e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.0002e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.8081e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.1630e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3947e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1511e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9658e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1653, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.2796e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0130, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.1128e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0848e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1090, device='cuda:0', grad_fn=) 2.4471741852423225e-06 changing lr epoch 63, time 356.08, cls_loss 0.0047 306 cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.8546e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(3.0398e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4623e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0441, device='cuda:0', grad_fn=) cls_loss: tensor(4.0332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9391e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.0532e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9379e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9206e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5816e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(1.3173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.2511e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(1.2438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.1822e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(5.9207e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3823e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1525e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2472e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0300, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5961e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.4380e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.9963e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9008e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.8082e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.4970e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9737e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.6756e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(1.6868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.8830e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0325e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0481, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.6294e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(1.1961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1571e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0525, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4174e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.6260e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6359e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.8014e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9763e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.4372e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(4.2319e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.8604e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9630e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0227, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(7.2122e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.1275e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(5.5830e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.6949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.0930e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2815e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3115e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.1935e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1107e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4373e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.3577e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0144, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(6.3777e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.4227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.3146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0590, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.2983e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3967e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.8492e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(8.1857e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3127e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7419e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.0532e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.0027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0167, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(6.3380e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.8677e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) 1.8018569652073373e-06 changing lr epoch 64, time 360.71, cls_loss 0.0015 306 cls_loss: tensor(0.0135, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(3.3180e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.2207e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3579e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.2471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(2.1060e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.2784e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4173e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.7465e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(5.3247e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(1.5676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7974e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2452e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2008e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7949e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.5487e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3379e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(4.3015e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.2995e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6493e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5385e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(5.4578e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7114e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4981e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(8.8672e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1525e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.8347e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0667, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(4.6094e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.0268e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9141e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.8148e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0131, device='cuda:0', grad_fn=) cls_loss: tensor(5.8413e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.8943e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0268, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.0665e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(9.7553e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(2.9206e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.1992, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2282, device='cuda:0', grad_fn=) cls_loss: tensor(5.3048e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.4968e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8359e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3181e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8954e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.6095e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0272, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(9.8586e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0560, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9207e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.6665e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.8306e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.0352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2870, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9073e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.8392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8968e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7776e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.8817e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2425e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0209, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(3.0617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.7830e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.2717e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8544e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0251, device='cuda:0', grad_fn=) cls_loss: tensor(2.7736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.0678e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2969e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.3774, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4982e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.2546e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6293e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9339e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9273e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) 1.2536043909088185e-06 changing lr epoch 65, time 356.73, cls_loss 0.0081 306 cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9618e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0320, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(5.8810e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.0731e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(9.8745e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(2.7736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.7195e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.3273e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.8944e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4166e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2386e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0245, device='cuda:0', grad_fn=) cls_loss: tensor(2.8491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.5558e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0854, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.4306e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5612e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.7535, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3379e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(4.6750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2387e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6365e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0629e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.2518e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1386e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.7089e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7941e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7883e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7818e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.7156e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7154e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.7902e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7485e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3367e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.4043e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9273e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5233e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.0797e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1195, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3808e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0187, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.2199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0139, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.6691e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.9889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6763e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2074e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0641, device='cuda:0', grad_fn=) cls_loss: tensor(7.2519e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1326e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(7.3691e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2698e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(9.7553e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.2121e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5564e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4829e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(1.0113e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0636, device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1166e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0158, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1055e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.7491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4981e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7088e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6690e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.0399e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6558e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.7480e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.6677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.8346e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0372, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9267e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4054e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8612e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.3319e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3115e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0116, device='cuda:0', grad_fn=) cls_loss: tensor(1.6491e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.1897e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0014e-05, device='cuda:0', grad_fn=) 8.035205700685162e-07 changing lr epoch 66, time 355.86, cls_loss 0.0052 306 cls_loss: tensor(8.6427e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1678e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0398e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.0333e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2121e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0301, device='cuda:0', grad_fn=) cls_loss: tensor(3.5226e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1127e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0550e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0487, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(3.5703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.6180e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3577e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.8876e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.8612e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4226e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.8001e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7882e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7942e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5366e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.0003e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5564e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(6.2982e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3909e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(4.2756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3519e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.2890e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(9.8566e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4983e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8292e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(7.5817e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3778e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7950e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5048e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.3830e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.2055e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.0798e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0265, device='cuda:0', grad_fn=) cls_loss: tensor(1.1047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0611e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8811e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0213, device='cuda:0', grad_fn=) cls_loss: tensor(1.7305e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.0054e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1326e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(9.4970e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1583e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.6426e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7611e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2598e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0863, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0467, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(6.2982e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7453e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6093e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.8964e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(4.0273e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2765e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4742e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1857e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8544e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8014e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7685e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.3228e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7685e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0862e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.8281e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7299e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3890e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.4592e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4970e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3577e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.7419e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(2.2233e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0221, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.7173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(4.0869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.7551e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9314e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(8.4380e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(2.5551e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(5.5035e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(4.5498e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(1.2557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.3828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(3.9776e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.6327e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0158, device='cuda:0', grad_fn=) cls_loss: tensor(6.0201e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) 4.525119116032647e-07 changing lr epoch 67, time 360.93, cls_loss 0.0015 306 cls_loss: tensor(5.0863e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3292e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2783e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(7.1327e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.6877e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2717e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(8.5394e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.2651e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5630e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7722e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2512e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.7419e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.7195e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(3.0796e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7107e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(6.5168e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0855, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.0325e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8081e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7313e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.0381e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0277, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.3379e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1497e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0190, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.0202e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.2857e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.0003e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7863e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3232e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0268e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1234e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5036e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3312e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3049e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1096, device='cuda:0', grad_fn=) cls_loss: tensor(3.8207e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(6.7353e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(4.9472e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0994e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2187e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(9.7950e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.3802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.0520e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8060e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(7.8241e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.8943e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5564e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9870e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2783e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(6.2644e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5564e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7623e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.8062e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.6987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3128e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9406e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0435, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.2504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.2446e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(3.8346e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.0425e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.0724e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9540e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7902e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.8412e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7902e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.7419e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2743e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.0644e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.4657e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.0995e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0202, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3325e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.2122e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) 2.0128530023804648e-07 changing lr epoch 68, time 357.71, cls_loss 0.0022 306 cls_loss: tensor(6.7949e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0704, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1901e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(2.4835e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0171, device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.0054e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1988e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3945, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(1.3530e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.0665e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.3247e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3194e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(1.1146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0360, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5015e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4260e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.7557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.1624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2717e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7663e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0465e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0217, device='cuda:0', grad_fn=) cls_loss: tensor(7.1724e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.0863e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(5.3505e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.0532e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8147e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.0201e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0186e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6624e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0486, device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1623e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.6624e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.0611e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.0995e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0484, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.1286, device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7472e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(9.7950e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9956e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(7.2340e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5035e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6252e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6371e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4458e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.0214e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.7015e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.1643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0398e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.8359e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.4849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0881e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.6360e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0351e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2378e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.4638e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.2056e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5903e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.8215e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9803e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(1.2100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4187e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.0566e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3458e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.0599e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.0167e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0162, device='cuda:0', grad_fn=) cls_loss: tensor(4.0571e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.6963e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6061e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(8.4440e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.0053e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.2188e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6425e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4836e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9386e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6094e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.7287e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.1962e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 5.03466729342705e-08 changing lr epoch 69, time 357.42, cls_loss 0.0029 ---------------------saving last model at epoch 69---------------------------------------------------- /home/yuqian_fu {'gpu': '0', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-PACS//art_painting/CA_multiple_16fa_v2_ep70_lr0.01_cosine_base0.01_bs6_lamCa_1_lamRe1_adt4_cls1_EW2_70_rmTrue_rnTrue_str5', 'source_domain': 'art_painting', 'svpath': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-PACS//art_painting/CA_multiple_16fa_v2_ep70_lr0.01_cosine_base0.01_bs6_lamCa_1_lamRe1_adt4_cls1_EW2_70_rmTrue_rnTrue_str5/art_painting_16factor_last_test_check.csv', 'factor_num': 16, 'epoch': 'last', 'stride': 5, 'eval_mapping': False, 'network': 'resnet18'} -------------------------------------loading pretrain weights---------------------------------- loading weight of last columns: ['art_painting', 'cartoon', 'photo', 'sketch'] x.shape: (2048, 227, 227, 3) x_aug test here torch.Size([2048, 3, 227, 227]) /data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/data/PACS/art_painting_test.hdf5 torch.Size([2048, 3, 227, 227]) torch.Size([2048]) x.shape: (2344, 227, 227, 3) x_aug test here torch.Size([2344, 3, 227, 227]) /data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/data/PACS/cartoon_test.hdf5 torch.Size([2344, 3, 227, 227]) torch.Size([2344]) x.shape: (1670, 227, 227, 3) x_aug test here torch.Size([1670, 3, 227, 227]) /data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/data/PACS/photo_test.hdf5 torch.Size([1670, 3, 227, 227]) torch.Size([1670]) x.shape: (3929, 227, 227, 3) x_aug test here torch.Size([3929, 3, 227, 227]) /data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/data/PACS/sketch_test.hdf5 torch.Size([3929, 3, 227, 227]) torch.Size([3929]) art_painting cartoon photo sketch Avg w/o do (original x) 88.916016 58.020478 81.497006 50.572665 63.363383